Diffstat (limited to 'arch/x86')
75 files changed, 2778 insertions, 1085 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 469f3450bf81..87717f3687d2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
| @@ -138,6 +138,9 @@ config ARCH_HAS_CACHE_LINE_SIZE | |||
| 138 | config HAVE_SETUP_PER_CPU_AREA | 138 | config HAVE_SETUP_PER_CPU_AREA |
| 139 | def_bool y | 139 | def_bool y |
| 140 | 140 | ||
| 141 | config HAVE_DYNAMIC_PER_CPU_AREA | ||
| 142 | def_bool y | ||
| 143 | |||
| 141 | config HAVE_CPUMASK_OF_CPU_MAP | 144 | config HAVE_CPUMASK_OF_CPU_MAP |
| 142 | def_bool X86_64_SMP | 145 | def_bool X86_64_SMP |
| 143 | 146 | ||
| @@ -780,6 +783,11 @@ config X86_MCE_AMD | |||
| 780 | Additional support for AMD specific MCE features such as | 783 | Additional support for AMD specific MCE features such as |
| 781 | the DRAM Error Threshold. | 784 | the DRAM Error Threshold. |
| 782 | 785 | ||
| 786 | config X86_MCE_THRESHOLD | ||
| 787 | depends on X86_MCE_AMD || X86_MCE_INTEL | ||
| 788 | bool | ||
| 789 | default y | ||
| 790 | |||
| 783 | config X86_MCE_NONFATAL | 791 | config X86_MCE_NONFATAL |
| 784 | tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" | 792 | tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" |
| 785 | depends on X86_32 && X86_MCE | 793 | depends on X86_32 && X86_MCE |
| @@ -1125,7 +1133,7 @@ config NODES_SHIFT | |||
| 1125 | Specify the maximum number of NUMA Nodes available on the target | 1133 | Specify the maximum number of NUMA Nodes available on the target |
| 1126 | system. Increases memory reserved to accomodate various tables. | 1134 | system. Increases memory reserved to accomodate various tables. |
| 1127 | 1135 | ||
| 1128 | config HAVE_ARCH_BOOTMEM_NODE | 1136 | config HAVE_ARCH_BOOTMEM |
| 1129 | def_bool y | 1137 | def_bool y |
| 1130 | depends on X86_32 && NUMA | 1138 | depends on X86_32 && NUMA |
| 1131 | 1139 | ||
| @@ -1423,7 +1431,7 @@ config CRASH_DUMP | |||
| 1423 | config KEXEC_JUMP | 1431 | config KEXEC_JUMP |
| 1424 | bool "kexec jump (EXPERIMENTAL)" | 1432 | bool "kexec jump (EXPERIMENTAL)" |
| 1425 | depends on EXPERIMENTAL | 1433 | depends on EXPERIMENTAL |
| 1426 | depends on KEXEC && HIBERNATION && X86_32 | 1434 | depends on KEXEC && HIBERNATION |
| 1427 | ---help--- | 1435 | ---help--- |
| 1428 | Jump between original kernel and kexeced kernel and invoke | 1436 | Jump between original kernel and kexeced kernel and invoke |
| 1429 | code in physical address mode via KEXEC | 1437 | code in physical address mode via KEXEC |
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 4ef949c1972e..394d177d721b 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
| @@ -379,6 +379,7 @@ static inline u32 safe_apic_wait_icr_idle(void) | |||
| 379 | 379 | ||
| 380 | static inline void ack_APIC_irq(void) | 380 | static inline void ack_APIC_irq(void) |
| 381 | { | 381 | { |
| 382 | #ifdef CONFIG_X86_LOCAL_APIC | ||
| 382 | /* | 383 | /* |
| 383 | * ack_APIC_irq() actually gets compiled as a single instruction | 384 | * ack_APIC_irq() actually gets compiled as a single instruction |
| 384 | * ... yummie. | 385 | * ... yummie. |
| @@ -386,6 +387,7 @@ static inline void ack_APIC_irq(void) | |||
| 386 | 387 | ||
| 387 | /* Docs say use 0 for future compatibility */ | 388 | /* Docs say use 0 for future compatibility */ |
| 388 | apic_write(APIC_EOI, 0); | 389 | apic_write(APIC_EOI, 0); |
| 390 | #endif | ||
| 389 | } | 391 | } |
| 390 | 392 | ||
| 391 | static inline unsigned default_get_apic_id(unsigned long x) | 393 | static inline unsigned default_get_apic_id(unsigned long x) |
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 63134e31e8b9..bc9514fb3b13 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
| @@ -53,6 +53,7 @@ | |||
| 53 | #define APIC_ESR_SENDILL 0x00020 | 53 | #define APIC_ESR_SENDILL 0x00020 |
| 54 | #define APIC_ESR_RECVILL 0x00040 | 54 | #define APIC_ESR_RECVILL 0x00040 |
| 55 | #define APIC_ESR_ILLREGA 0x00080 | 55 | #define APIC_ESR_ILLREGA 0x00080 |
| 56 | #define APIC_LVTCMCI 0x2f0 | ||
| 56 | #define APIC_ICR 0x300 | 57 | #define APIC_ICR 0x300 |
| 57 | #define APIC_DEST_SELF 0x40000 | 58 | #define APIC_DEST_SELF 0x40000 |
| 58 | #define APIC_DEST_ALLINC 0x80000 | 59 | #define APIC_DEST_ALLINC 0x80000 |
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 2f8466540fb5..5b301b7ff5f4 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
| @@ -5,24 +5,43 @@ | |||
| 5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
| 6 | 6 | ||
| 7 | /* Caches aren't brain-dead on the intel. */ | 7 | /* Caches aren't brain-dead on the intel. */ |
| 8 | #define flush_cache_all() do { } while (0) | 8 | static inline void flush_cache_all(void) { } |
| 9 | #define flush_cache_mm(mm) do { } while (0) | 9 | static inline void flush_cache_mm(struct mm_struct *mm) { } |
| 10 | #define flush_cache_dup_mm(mm) do { } while (0) | 10 | static inline void flush_cache_dup_mm(struct mm_struct *mm) { } |
| 11 | #define flush_cache_range(vma, start, end) do { } while (0) | 11 | static inline void flush_cache_range(struct vm_area_struct *vma, |
| 12 | #define flush_cache_page(vma, vmaddr, pfn) do { } while (0) | 12 | unsigned long start, unsigned long end) { } |
| 13 | #define flush_dcache_page(page) do { } while (0) | 13 | static inline void flush_cache_page(struct vm_area_struct *vma, |
| 14 | #define flush_dcache_mmap_lock(mapping) do { } while (0) | 14 | unsigned long vmaddr, unsigned long pfn) { } |
| 15 | #define flush_dcache_mmap_unlock(mapping) do { } while (0) | 15 | static inline void flush_dcache_page(struct page *page) { } |
| 16 | #define flush_icache_range(start, end) do { } while (0) | 16 | static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } |
| 17 | #define flush_icache_page(vma, pg) do { } while (0) | 17 | static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { } |
| 18 | #define flush_icache_user_range(vma, pg, adr, len) do { } while (0) | 18 | static inline void flush_icache_range(unsigned long start, |
| 19 | #define flush_cache_vmap(start, end) do { } while (0) | 19 | unsigned long end) { } |
| 20 | #define flush_cache_vunmap(start, end) do { } while (0) | 20 | static inline void flush_icache_page(struct vm_area_struct *vma, |
| 21 | struct page *page) { } | ||
| 22 | static inline void flush_icache_user_range(struct vm_area_struct *vma, | ||
| 23 | struct page *page, | ||
| 24 | unsigned long addr, | ||
| 25 | unsigned long len) { } | ||
| 26 | static inline void flush_cache_vmap(unsigned long start, unsigned long end) { } | ||
| 27 | static inline void flush_cache_vunmap(unsigned long start, | ||
| 28 | unsigned long end) { } | ||
| 21 | 29 | ||
| 22 | #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ | 30 | static inline void copy_to_user_page(struct vm_area_struct *vma, |
| 23 | memcpy((dst), (src), (len)) | 31 | struct page *page, unsigned long vaddr, |
| 24 | #define copy_from_user_page(vma, page, vaddr, dst, src, len) \ | 32 | void *dst, const void *src, |
| 25 | memcpy((dst), (src), (len)) | 33 | unsigned long len) |
| 34 | { | ||
| 35 | memcpy(dst, src, len); | ||
| 36 | } | ||
| 37 | |||
| 38 | static inline void copy_from_user_page(struct vm_area_struct *vma, | ||
| 39 | struct page *page, unsigned long vaddr, | ||
| 40 | void *dst, const void *src, | ||
| 41 | unsigned long len) | ||
| 42 | { | ||
| 43 | memcpy(dst, src, len); | ||
| 44 | } | ||
| 26 | 45 | ||
| 27 | #define PG_non_WB PG_arch_1 | 46 | #define PG_non_WB PG_arch_1 |
| 28 | PAGEFLAG(NonWB, non_WB) | 47 | PAGEFLAG(NonWB, non_WB) |
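The cacheflush.h conversion above swaps empty do-while(0) macros for empty static inline stubs; the generated code is unchanged, but callers now get argument type checking. A minimal standalone sketch of the difference (hypothetical caller, not part of the patch):

/*
 * Minimal sketch (hypothetical, not from the patch): an empty macro
 * silently accepts any arguments, while an empty static inline stub
 * still type-checks them and compiles away to nothing.
 */
#define flush_icache_range_old(start, end) do { } while (0)

static inline void flush_icache_range_new(unsigned long start,
					  unsigned long end) { }

void caller(const char *buf)
{
	flush_icache_range_old(buf, "end");	/* accepted, nothing checked */
	flush_icache_range_new(0x1000, 0x2000);	/* checked, no code emitted */
	/* flush_icache_range_new(buf, "end") would draw a compiler warning */
}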
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index ca5ffb2856b6..edc90f23e708 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
| @@ -37,8 +37,6 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); | |||
| 37 | 37 | ||
| 38 | #else /* !CONFIG_X86_32 */ | 38 | #else /* !CONFIG_X86_32 */ |
| 39 | 39 | ||
| 40 | #define MAX_EFI_IO_PAGES 100 | ||
| 41 | |||
| 42 | extern u64 efi_call0(void *fp); | 40 | extern u64 efi_call0(void *fp); |
| 43 | extern u64 efi_call1(void *fp, u64 arg1); | 41 | extern u64 efi_call1(void *fp, u64 arg1); |
| 44 | extern u64 efi_call2(void *fp, u64 arg1, u64 arg2); | 42 | extern u64 efi_call2(void *fp, u64 arg1, u64 arg2); |
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 854d538ae857..c2e6bedaf258 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
| @@ -33,6 +33,8 @@ BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7, | |||
| 33 | smp_invalidate_interrupt) | 33 | smp_invalidate_interrupt) |
| 34 | #endif | 34 | #endif |
| 35 | 35 | ||
| 36 | BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR) | ||
| 37 | |||
| 36 | /* | 38 | /* |
| 37 | * every pentium local APIC has two 'local interrupts', with a | 39 | * every pentium local APIC has two 'local interrupts', with a |
| 38 | * soft-definable vector attached to both interrupts, one of | 40 | * soft-definable vector attached to both interrupts, one of |
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index dca8f03da5b2..63a79c77d220 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
| @@ -24,9 +24,6 @@ | |||
| 24 | #include <asm/kmap_types.h> | 24 | #include <asm/kmap_types.h> |
| 25 | #else | 25 | #else |
| 26 | #include <asm/vsyscall.h> | 26 | #include <asm/vsyscall.h> |
| 27 | #ifdef CONFIG_EFI | ||
| 28 | #include <asm/efi.h> | ||
| 29 | #endif | ||
| 30 | #endif | 27 | #endif |
| 31 | 28 | ||
| 32 | /* | 29 | /* |
| @@ -92,13 +89,6 @@ enum fixed_addresses { | |||
| 92 | FIX_IO_APIC_BASE_0, | 89 | FIX_IO_APIC_BASE_0, |
| 93 | FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, | 90 | FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, |
| 94 | #endif | 91 | #endif |
| 95 | #ifdef CONFIG_X86_64 | ||
| 96 | #ifdef CONFIG_EFI | ||
| 97 | FIX_EFI_IO_MAP_LAST_PAGE, | ||
| 98 | FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE | ||
| 99 | + MAX_EFI_IO_PAGES - 1, | ||
| 100 | #endif | ||
| 101 | #endif | ||
| 102 | #ifdef CONFIG_X86_VISWS_APIC | 92 | #ifdef CONFIG_X86_VISWS_APIC |
| 103 | FIX_CO_CPU, /* Cobalt timer */ | 93 | FIX_CO_CPU, /* Cobalt timer */ |
| 104 | FIX_CO_APIC, /* Cobalt APIC Redirection Table */ | 94 | FIX_CO_APIC, /* Cobalt APIC Redirection Table */ |
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 176f058e7159..039db6aa8e02 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
| @@ -12,6 +12,7 @@ typedef struct { | |||
| 12 | unsigned int apic_timer_irqs; /* arch dependent */ | 12 | unsigned int apic_timer_irqs; /* arch dependent */ |
| 13 | unsigned int irq_spurious_count; | 13 | unsigned int irq_spurious_count; |
| 14 | #endif | 14 | #endif |
| 15 | unsigned int generic_irqs; /* arch dependent */ | ||
| 15 | #ifdef CONFIG_SMP | 16 | #ifdef CONFIG_SMP |
| 16 | unsigned int irq_resched_count; | 17 | unsigned int irq_resched_count; |
| 17 | unsigned int irq_call_count; | 18 | unsigned int irq_call_count; |
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 370e1c83bb49..b762ea49bd70 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
| @@ -27,6 +27,7 @@ | |||
| 27 | 27 | ||
| 28 | /* Interrupt handlers registered during init_IRQ */ | 28 | /* Interrupt handlers registered during init_IRQ */ |
| 29 | extern void apic_timer_interrupt(void); | 29 | extern void apic_timer_interrupt(void); |
| 30 | extern void generic_interrupt(void); | ||
| 30 | extern void error_interrupt(void); | 31 | extern void error_interrupt(void); |
| 31 | extern void spurious_interrupt(void); | 32 | extern void spurious_interrupt(void); |
| 32 | extern void thermal_interrupt(void); | 33 | extern void thermal_interrupt(void); |
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 48f0004db8c9..71c9e5183982 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
| @@ -172,7 +172,13 @@ static inline void __save_init_fpu(struct task_struct *tsk) | |||
| 172 | 172 | ||
| 173 | #else /* CONFIG_X86_32 */ | 173 | #else /* CONFIG_X86_32 */ |
| 174 | 174 | ||
| 175 | extern void finit(void); | 175 | #ifdef CONFIG_MATH_EMULATION |
| 176 | extern void finit_task(struct task_struct *tsk); | ||
| 177 | #else | ||
| 178 | static inline void finit_task(struct task_struct *tsk) | ||
| 179 | { | ||
| 180 | } | ||
| 181 | #endif | ||
| 176 | 182 | ||
| 177 | static inline void tolerant_fwait(void) | 183 | static inline void tolerant_fwait(void) |
| 178 | { | 184 | { |
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
new file mode 100644
index 000000000000..36fb1a6a5109
--- /dev/null
+++ b/arch/x86/include/asm/init.h
| @@ -0,0 +1,18 @@ | |||
| 1 | #ifndef _ASM_X86_INIT_32_H | ||
| 2 | #define _ASM_X86_INIT_32_H | ||
| 3 | |||
| 4 | #ifdef CONFIG_X86_32 | ||
| 5 | extern void __init early_ioremap_page_table_range_init(void); | ||
| 6 | #endif | ||
| 7 | |||
| 8 | extern unsigned long __init | ||
| 9 | kernel_physical_mapping_init(unsigned long start, | ||
| 10 | unsigned long end, | ||
| 11 | unsigned long page_size_mask); | ||
| 12 | |||
| 13 | |||
| 14 | extern unsigned long __initdata e820_table_start; | ||
| 15 | extern unsigned long __meminitdata e820_table_end; | ||
| 16 | extern unsigned long __meminitdata e820_table_top; | ||
| 17 | |||
| 18 | #endif /* _ASM_X86_INIT_32_H */ | ||
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 683d0b4c00fc..e5383e3d2f8c 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
| @@ -172,8 +172,6 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) | |||
| 172 | 172 | ||
| 173 | extern void iounmap(volatile void __iomem *addr); | 173 | extern void iounmap(volatile void __iomem *addr); |
| 174 | 174 | ||
| 175 | extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); | ||
| 176 | |||
| 177 | 175 | ||
| 178 | #ifdef CONFIG_X86_32 | 176 | #ifdef CONFIG_X86_32 |
| 179 | # include "io_32.h" | 177 | # include "io_32.h" |
| @@ -198,7 +196,6 @@ extern void early_ioremap_reset(void); | |||
| 198 | extern void __iomem *early_ioremap(unsigned long offset, unsigned long size); | 196 | extern void __iomem *early_ioremap(unsigned long offset, unsigned long size); |
| 199 | extern void __iomem *early_memremap(unsigned long offset, unsigned long size); | 197 | extern void __iomem *early_memremap(unsigned long offset, unsigned long size); |
| 200 | extern void early_iounmap(void __iomem *addr, unsigned long size); | 198 | extern void early_iounmap(void __iomem *addr, unsigned long size); |
| 201 | extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); | ||
| 202 | 199 | ||
| 203 | #define IO_SPACE_LIMIT 0xffff | 200 | #define IO_SPACE_LIMIT 0xffff |
| 204 | 201 | ||
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 107eb2196691..f38481bcd455 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
| @@ -36,6 +36,7 @@ static inline int irq_canonicalize(int irq) | |||
| 36 | extern void fixup_irqs(void); | 36 | extern void fixup_irqs(void); |
| 37 | #endif | 37 | #endif |
| 38 | 38 | ||
| 39 | extern void (*generic_interrupt_extension)(void); | ||
| 39 | extern void init_IRQ(void); | 40 | extern void init_IRQ(void); |
| 40 | extern void native_init_IRQ(void); | 41 | extern void native_init_IRQ(void); |
| 41 | extern bool handle_irq(unsigned irq, struct pt_regs *regs); | 42 | extern bool handle_irq(unsigned irq, struct pt_regs *regs); |
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 8a285f356f8a..3cbd79bbb47c 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
| @@ -112,6 +112,11 @@ | |||
| 112 | #define LOCAL_PERF_VECTOR 0xee | 112 | #define LOCAL_PERF_VECTOR 0xee |
| 113 | 113 | ||
| 114 | /* | 114 | /* |
| 115 | * Generic system vector for platform specific use | ||
| 116 | */ | ||
| 117 | #define GENERIC_INTERRUPT_VECTOR 0xed | ||
| 118 | |||
| 119 | /* | ||
| 115 | * First APIC vector available to drivers: (vectors 0x30-0xee) we | 120 | * First APIC vector available to drivers: (vectors 0x30-0xee) we |
| 116 | * start at 0x31(0x41) to spread out vectors evenly between priority | 121 | * start at 0x31(0x41) to spread out vectors evenly between priority |
| 117 | * levels. (0x80 is the syscall vector) | 122 | * levels. (0x80 is the syscall vector) |
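Taken together with the generic_interrupt entry stub (hw_irq.h, entry_arch.h), the generic_irqs counter (hardirq.h) and the generic_interrupt_extension hook (irq.h), the new GENERIC_INTERRUPT_VECTOR gives platform code a shared system vector without burning one vector per user. A rough, hypothetical sketch of a platform driver hooking it; the dispatcher that actually calls the hook is added in irq.c and is not part of this excerpt, and the send_IPI_self call assumes the apic driver structure exposes it:

/*
 * Hypothetical platform code using the new hook. The handler runs in
 * interrupt context when GENERIC_INTERRUPT_VECTOR fires on this CPU.
 */
#include <linux/init.h>
#include <asm/irq.h>
#include <asm/irq_vectors.h>
#include <asm/apic.h>

static void my_platform_vector_handler(void)
{
	/* platform-specific work goes here */
}

static void __init my_platform_setup(void)
{
	generic_interrupt_extension = my_platform_vector_handler;

	/* raise the vector on the local CPU to exercise the plumbing */
	apic->send_IPI_self(GENERIC_INTERRUPT_VECTOR);
}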
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 0ceb6d19ed30..317ff1703d0b 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
| @@ -9,13 +9,13 @@ | |||
| 9 | # define PAGES_NR 4 | 9 | # define PAGES_NR 4 |
| 10 | #else | 10 | #else |
| 11 | # define PA_CONTROL_PAGE 0 | 11 | # define PA_CONTROL_PAGE 0 |
| 12 | # define PA_TABLE_PAGE 1 | 12 | # define VA_CONTROL_PAGE 1 |
| 13 | # define PAGES_NR 2 | 13 | # define PA_TABLE_PAGE 2 |
| 14 | # define PA_SWAP_PAGE 3 | ||
| 15 | # define PAGES_NR 4 | ||
| 14 | #endif | 16 | #endif |
| 15 | 17 | ||
| 16 | #ifdef CONFIG_X86_32 | ||
| 17 | # define KEXEC_CONTROL_CODE_MAX_SIZE 2048 | 18 | # define KEXEC_CONTROL_CODE_MAX_SIZE 2048 |
| 18 | #endif | ||
| 19 | 19 | ||
| 20 | #ifndef __ASSEMBLY__ | 20 | #ifndef __ASSEMBLY__ |
| 21 | 21 | ||
| @@ -136,10 +136,11 @@ relocate_kernel(unsigned long indirection_page, | |||
| 136 | unsigned int has_pae, | 136 | unsigned int has_pae, |
| 137 | unsigned int preserve_context); | 137 | unsigned int preserve_context); |
| 138 | #else | 138 | #else |
| 139 | NORET_TYPE void | 139 | unsigned long |
| 140 | relocate_kernel(unsigned long indirection_page, | 140 | relocate_kernel(unsigned long indirection_page, |
| 141 | unsigned long page_list, | 141 | unsigned long page_list, |
| 142 | unsigned long start_address) ATTRIB_NORET; | 142 | unsigned long start_address, |
| 143 | unsigned int preserve_context); | ||
| 143 | #endif | 144 | #endif |
| 144 | 145 | ||
| 145 | #define ARCH_HAS_KIMAGE_ARCH | 146 | #define ARCH_HAS_KIMAGE_ARCH |
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 9320e2a8a26a..a0d70b46c27c 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
| @@ -4,11 +4,6 @@ | |||
| 4 | #undef notrace | 4 | #undef notrace |
| 5 | #define notrace __attribute__((no_instrument_function)) | 5 | #define notrace __attribute__((no_instrument_function)) |
| 6 | 6 | ||
| 7 | #ifdef CONFIG_X86_64 | ||
| 8 | #define __ALIGN .p2align 4,,15 | ||
| 9 | #define __ALIGN_STR ".p2align 4,,15" | ||
| 10 | #endif | ||
| 11 | |||
| 12 | #ifdef CONFIG_X86_32 | 7 | #ifdef CONFIG_X86_32 |
| 13 | #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) | 8 | #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) |
| 14 | /* | 9 | /* |
| @@ -50,16 +45,25 @@ | |||
| 50 | __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \ | 45 | __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \ |
| 51 | "g" (arg4), "g" (arg5), "g" (arg6)) | 46 | "g" (arg4), "g" (arg5), "g" (arg6)) |
| 52 | 47 | ||
| 53 | #endif | 48 | #endif /* CONFIG_X86_32 */ |
| 49 | |||
| 50 | #ifdef __ASSEMBLY__ | ||
| 54 | 51 | ||
| 55 | #define GLOBAL(name) \ | 52 | #define GLOBAL(name) \ |
| 56 | .globl name; \ | 53 | .globl name; \ |
| 57 | name: | 54 | name: |
| 58 | 55 | ||
| 56 | #ifdef CONFIG_X86_64 | ||
| 57 | #define __ALIGN .p2align 4,,15 | ||
| 58 | #define __ALIGN_STR ".p2align 4,,15" | ||
| 59 | #endif | ||
| 60 | |||
| 59 | #ifdef CONFIG_X86_ALIGNMENT_16 | 61 | #ifdef CONFIG_X86_ALIGNMENT_16 |
| 60 | #define __ALIGN .align 16,0x90 | 62 | #define __ALIGN .align 16,0x90 |
| 61 | #define __ALIGN_STR ".align 16,0x90" | 63 | #define __ALIGN_STR ".align 16,0x90" |
| 62 | #endif | 64 | #endif |
| 63 | 65 | ||
| 66 | #endif /* __ASSEMBLY__ */ | ||
| 67 | |||
| 64 | #endif /* _ASM_X86_LINKAGE_H */ | 68 | #endif /* _ASM_X86_LINKAGE_H */ |
| 65 | 69 | ||
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 32c6e17b960b..563933e06a35 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
| @@ -11,6 +11,8 @@ | |||
| 11 | */ | 11 | */ |
| 12 | 12 | ||
| 13 | #define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ | 13 | #define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ |
| 14 | #define MCG_EXT_P (1ULL<<9) /* Extended registers available */ | ||
| 15 | #define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ | ||
| 14 | 16 | ||
| 15 | #define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ | 17 | #define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ |
| 16 | #define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ | 18 | #define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ |
| @@ -90,14 +92,29 @@ extern int mce_disabled; | |||
| 90 | 92 | ||
| 91 | #include <asm/atomic.h> | 93 | #include <asm/atomic.h> |
| 92 | 94 | ||
| 95 | void mce_setup(struct mce *m); | ||
| 93 | void mce_log(struct mce *m); | 96 | void mce_log(struct mce *m); |
| 94 | DECLARE_PER_CPU(struct sys_device, device_mce); | 97 | DECLARE_PER_CPU(struct sys_device, device_mce); |
| 95 | extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); | 98 | extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); |
| 96 | 99 | ||
| 100 | /* | ||
| 101 | * To support more than 128 would need to escape the predefined | ||
| 102 | * Linux defined extended banks first. | ||
| 103 | */ | ||
| 104 | #define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) | ||
| 105 | |||
| 97 | #ifdef CONFIG_X86_MCE_INTEL | 106 | #ifdef CONFIG_X86_MCE_INTEL |
| 98 | void mce_intel_feature_init(struct cpuinfo_x86 *c); | 107 | void mce_intel_feature_init(struct cpuinfo_x86 *c); |
| 108 | void cmci_clear(void); | ||
| 109 | void cmci_reenable(void); | ||
| 110 | void cmci_rediscover(int dying); | ||
| 111 | void cmci_recheck(void); | ||
| 99 | #else | 112 | #else |
| 100 | static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } | 113 | static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } |
| 114 | static inline void cmci_clear(void) {} | ||
| 115 | static inline void cmci_reenable(void) {} | ||
| 116 | static inline void cmci_rediscover(int dying) {} | ||
| 117 | static inline void cmci_recheck(void) {} | ||
| 101 | #endif | 118 | #endif |
| 102 | 119 | ||
| 103 | #ifdef CONFIG_X86_MCE_AMD | 120 | #ifdef CONFIG_X86_MCE_AMD |
| @@ -106,11 +123,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c); | |||
| 106 | static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } | 123 | static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } |
| 107 | #endif | 124 | #endif |
| 108 | 125 | ||
| 109 | void mce_log_therm_throt_event(unsigned int cpu, __u64 status); | 126 | extern int mce_available(struct cpuinfo_x86 *c); |
| 127 | |||
| 128 | void mce_log_therm_throt_event(__u64 status); | ||
| 110 | 129 | ||
| 111 | extern atomic_t mce_entry; | 130 | extern atomic_t mce_entry; |
| 112 | 131 | ||
| 113 | extern void do_machine_check(struct pt_regs *, long); | 132 | extern void do_machine_check(struct pt_regs *, long); |
| 133 | |||
| 134 | typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); | ||
| 135 | DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); | ||
| 136 | |||
| 137 | enum mcp_flags { | ||
| 138 | MCP_TIMESTAMP = (1 << 0), /* log time stamp */ | ||
| 139 | MCP_UC = (1 << 1), /* log uncorrected errors */ | ||
| 140 | }; | ||
| 141 | extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); | ||
| 142 | |||
| 114 | extern int mce_notify_user(void); | 143 | extern int mce_notify_user(void); |
| 115 | 144 | ||
| 116 | #endif /* !CONFIG_X86_32 */ | 145 | #endif /* !CONFIG_X86_32 */ |
| @@ -120,8 +149,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c); | |||
| 120 | #else | 149 | #else |
| 121 | #define mcheck_init(c) do { } while (0) | 150 | #define mcheck_init(c) do { } while (0) |
| 122 | #endif | 151 | #endif |
| 123 | extern void stop_mce(void); | 152 | |
| 124 | extern void restart_mce(void); | 153 | extern void (*mce_threshold_vector)(void); |
| 125 | 154 | ||
| 126 | #endif /* __KERNEL__ */ | 155 | #endif /* __KERNEL__ */ |
| 127 | #endif /* _ASM_X86_MCE_H */ | 156 | #endif /* _ASM_X86_MCE_H */ |
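The new mce_banks_t bitmap and machine_check_poll() interface split corrected-error polling out of the #MC exception path: callers pass an mcp_flags mask plus the set of banks to scan. A sketch of a caller, mirroring how the periodic polling timer in mce_64.c invokes it (that caller sits outside this excerpt):

/*
 * Sketch of a machine_check_poll() caller per the API above; the real
 * periodic polling timer in mce_64.c does essentially this.
 */
#include <linux/percpu.h>
#include <asm/mce.h>
#include <asm/processor.h>

static void poll_corrected_errors(void)
{
	if (mce_available(&current_cpu_data))
		machine_check_poll(MCP_TIMESTAMP,
				   &__get_cpu_var(mce_poll_banks));
}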
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 105fb90a0635..ede6998bd92c 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
| @@ -91,46 +91,9 @@ static inline int pfn_valid(int pfn) | |||
| 91 | #endif /* CONFIG_DISCONTIGMEM */ | 91 | #endif /* CONFIG_DISCONTIGMEM */ |
| 92 | 92 | ||
| 93 | #ifdef CONFIG_NEED_MULTIPLE_NODES | 93 | #ifdef CONFIG_NEED_MULTIPLE_NODES |
| 94 | 94 | /* always use node 0 for bootmem on this numa platform */ | |
| 95 | /* | 95 | #define bootmem_arch_preferred_node(__bdata, size, align, goal, limit) \ |
| 96 | * Following are macros that are specific to this numa platform. | 96 | (NODE_DATA(0)->bdata) |
| 97 | */ | ||
| 98 | #define reserve_bootmem(addr, size, flags) \ | ||
| 99 | reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags)) | ||
| 100 | #define alloc_bootmem(x) \ | ||
| 101 | __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) | ||
| 102 | #define alloc_bootmem_nopanic(x) \ | ||
| 103 | __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ | ||
| 104 | __pa(MAX_DMA_ADDRESS)) | ||
| 105 | #define alloc_bootmem_low(x) \ | ||
| 106 | __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0) | ||
| 107 | #define alloc_bootmem_pages(x) \ | ||
| 108 | __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) | ||
| 109 | #define alloc_bootmem_pages_nopanic(x) \ | ||
| 110 | __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \ | ||
| 111 | __pa(MAX_DMA_ADDRESS)) | ||
| 112 | #define alloc_bootmem_low_pages(x) \ | ||
| 113 | __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) | ||
| 114 | #define alloc_bootmem_node(pgdat, x) \ | ||
| 115 | ({ \ | ||
| 116 | struct pglist_data __maybe_unused \ | ||
| 117 | *__alloc_bootmem_node__pgdat = (pgdat); \ | ||
| 118 | __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ | ||
| 119 | __pa(MAX_DMA_ADDRESS)); \ | ||
| 120 | }) | ||
| 121 | #define alloc_bootmem_pages_node(pgdat, x) \ | ||
| 122 | ({ \ | ||
| 123 | struct pglist_data __maybe_unused \ | ||
| 124 | *__alloc_bootmem_node__pgdat = (pgdat); \ | ||
| 125 | __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \ | ||
| 126 | __pa(MAX_DMA_ADDRESS)); \ | ||
| 127 | }) | ||
| 128 | #define alloc_bootmem_low_pages_node(pgdat, x) \ | ||
| 129 | ({ \ | ||
| 130 | struct pglist_data __maybe_unused \ | ||
| 131 | *__alloc_bootmem_node__pgdat = (pgdat); \ | ||
| 132 | __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \ | ||
| 133 | }) | ||
| 134 | #endif /* CONFIG_NEED_MULTIPLE_NODES */ | 97 | #endif /* CONFIG_NEED_MULTIPLE_NODES */ |
| 135 | 98 | ||
| 136 | #endif /* _ASM_X86_MMZONE_32_H */ | 99 | #endif /* _ASM_X86_MMZONE_32_H */ |
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 358acc59ae04..2dbd2314139e 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
| @@ -77,6 +77,11 @@ | |||
| 77 | #define MSR_IA32_MC0_ADDR 0x00000402 | 77 | #define MSR_IA32_MC0_ADDR 0x00000402 |
| 78 | #define MSR_IA32_MC0_MISC 0x00000403 | 78 | #define MSR_IA32_MC0_MISC 0x00000403 |
| 79 | 79 | ||
| 80 | /* These are consecutive and not in the normal 4er MCE bank block */ | ||
| 81 | #define MSR_IA32_MC0_CTL2 0x00000280 | ||
| 82 | #define CMCI_EN (1ULL << 30) | ||
| 83 | #define CMCI_THRESHOLD_MASK 0xffffULL | ||
| 84 | |||
| 80 | #define MSR_P6_PERFCTR0 0x000000c1 | 85 | #define MSR_P6_PERFCTR0 0x000000c1 |
| 81 | #define MSR_P6_PERFCTR1 0x000000c2 | 86 | #define MSR_P6_PERFCTR1 0x000000c2 |
| 82 | #define MSR_P6_EVNTSEL0 0x00000186 | 87 | #define MSR_P6_EVNTSEL0 0x00000186 |
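MSR_IA32_MC0_CTL2 is the base of the per-bank CTL2 MSRs used by CMCI: CMCI_EN arms corrected-error signalling and the low CMCI_THRESHOLD_MASK bits hold the error-count threshold. A simplified sketch, based only on the definitions above, of arming one bank; the real discovery logic (bank ownership, rediscovery on hotplug) lives in mce_intel_64.c and is not part of this excerpt:

/*
 * Simplified sketch: enable CMCI on MCA bank 'bank' with a
 * corrected-error threshold of 1.
 */
#include <asm/msr.h>

static void cmci_enable_bank(int bank)
{
	u64 val;

	rdmsrl(MSR_IA32_MC0_CTL2 + bank, val);
	val &= ~CMCI_THRESHOLD_MASK;
	val |= CMCI_EN | 1;
	wrmsrl(MSR_IA32_MC0_CTL2 + bank, val);
}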
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 2d625da6603c..826ad37006ab 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
| @@ -40,14 +40,8 @@ | |||
| 40 | 40 | ||
| 41 | #ifndef __ASSEMBLY__ | 41 | #ifndef __ASSEMBLY__ |
| 42 | 42 | ||
| 43 | struct pgprot; | ||
| 44 | |||
| 45 | extern int page_is_ram(unsigned long pagenr); | 43 | extern int page_is_ram(unsigned long pagenr); |
| 46 | extern int devmem_is_allowed(unsigned long pagenr); | 44 | extern int devmem_is_allowed(unsigned long pagenr); |
| 47 | extern void map_devmem(unsigned long pfn, unsigned long size, | ||
| 48 | struct pgprot vma_prot); | ||
| 49 | extern void unmap_devmem(unsigned long pfn, unsigned long size, | ||
| 50 | struct pgprot vma_prot); | ||
| 51 | 45 | ||
| 52 | extern unsigned long max_low_pfn_mapped; | 46 | extern unsigned long max_low_pfn_mapped; |
| 53 | extern unsigned long max_pfn_mapped; | 47 | extern unsigned long max_pfn_mapped; |
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index b0e70056838e..2cd07b9422f4 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
| @@ -2,6 +2,7 @@ | |||
| 2 | #define _ASM_X86_PAT_H | 2 | #define _ASM_X86_PAT_H |
| 3 | 3 | ||
| 4 | #include <linux/types.h> | 4 | #include <linux/types.h> |
| 5 | #include <asm/pgtable_types.h> | ||
| 5 | 6 | ||
| 6 | #ifdef CONFIG_X86_PAT | 7 | #ifdef CONFIG_X86_PAT |
| 7 | extern int pat_enabled; | 8 | extern int pat_enabled; |
| @@ -17,5 +18,9 @@ extern int free_memtype(u64 start, u64 end); | |||
| 17 | 18 | ||
| 18 | extern int kernel_map_sync_memtype(u64 base, unsigned long size, | 19 | extern int kernel_map_sync_memtype(u64 base, unsigned long size, |
| 19 | unsigned long flag); | 20 | unsigned long flag); |
| 21 | extern void map_devmem(unsigned long pfn, unsigned long size, | ||
| 22 | struct pgprot vma_prot); | ||
| 23 | extern void unmap_devmem(unsigned long pfn, unsigned long size, | ||
| 24 | struct pgprot vma_prot); | ||
| 20 | 25 | ||
| 21 | #endif /* _ASM_X86_PAT_H */ | 26 | #endif /* _ASM_X86_PAT_H */ |
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index aee103b26d01..8f1d2fbec1d4 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
| @@ -43,6 +43,14 @@ | |||
| 43 | #else /* ...!ASSEMBLY */ | 43 | #else /* ...!ASSEMBLY */ |
| 44 | 44 | ||
| 45 | #include <linux/stringify.h> | 45 | #include <linux/stringify.h> |
| 46 | #include <asm/sections.h> | ||
| 47 | |||
| 48 | #define __addr_to_pcpu_ptr(addr) \ | ||
| 49 | (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ | ||
| 50 | + (unsigned long)__per_cpu_start) | ||
| 51 | #define __pcpu_ptr_to_addr(ptr) \ | ||
| 52 | (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ | ||
| 53 | - (unsigned long)__per_cpu_start) | ||
| 46 | 54 | ||
| 47 | #ifdef CONFIG_SMP | 55 | #ifdef CONFIG_SMP |
| 48 | #define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x | 56 | #define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x |
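The two new helpers translate between an address inside the dynamic per-cpu allocator's base chunk (based at pcpu_base_addr) and the canonical per-cpu pointer, which is expressed relative to the link-time __per_cpu_start. A standalone toy model of that arithmetic (editorial sketch with made-up values, not the kernel macros themselves):

#include <assert.h>
#include <stdint.h>

/* Toy model: converting between the two address spaces just swaps one
 * base for the other, and the round trip is the identity. */
static uintptr_t pcpu_base_addr;	/* where the base chunk was mapped */
static uintptr_t per_cpu_start;		/* link-time start of .data.percpu */

static uintptr_t addr_to_pcpu_ptr(uintptr_t addr)
{
	return addr - pcpu_base_addr + per_cpu_start;
}

static uintptr_t pcpu_ptr_to_addr(uintptr_t ptr)
{
	return ptr + pcpu_base_addr - per_cpu_start;
}

int main(void)
{
	pcpu_base_addr = 0x100000;	/* made-up example values */
	per_cpu_start  = 0x800000;

	uintptr_t obj = pcpu_base_addr + 0x40;	/* object inside the chunk */
	assert(addr_to_pcpu_ptr(obj) == per_cpu_start + 0x40);
	assert(pcpu_ptr_to_addr(addr_to_pcpu_ptr(obj)) == obj);
	return 0;
}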
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1c097a3a6669..d0812e155f1d 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
| @@ -288,6 +288,8 @@ static inline int is_new_memtype_allowed(unsigned long flags, | |||
| 288 | return 1; | 288 | return 1; |
| 289 | } | 289 | } |
| 290 | 290 | ||
| 291 | pmd_t *populate_extra_pmd(unsigned long vaddr); | ||
| 292 | pte_t *populate_extra_pte(unsigned long vaddr); | ||
| 291 | #endif /* __ASSEMBLY__ */ | 293 | #endif /* __ASSEMBLY__ */ |
| 292 | 294 | ||
| 293 | #ifdef CONFIG_X86_32 | 295 | #ifdef CONFIG_X86_32 |
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index bd8df3b2fe04..2733fad45f98 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
| @@ -25,6 +25,11 @@ | |||
| 25 | * area for the same reason. ;) | 25 | * area for the same reason. ;) |
| 26 | */ | 26 | */ |
| 27 | #define VMALLOC_OFFSET (8 * 1024 * 1024) | 27 | #define VMALLOC_OFFSET (8 * 1024 * 1024) |
| 28 | |||
| 29 | #ifndef __ASSEMBLER__ | ||
| 30 | extern bool __vmalloc_start_set; /* set once high_memory is set */ | ||
| 31 | #endif | ||
| 32 | |||
| 28 | #define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET) | 33 | #define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET) |
| 29 | #ifdef CONFIG_X86_PAE | 34 | #ifdef CONFIG_X86_PAE |
| 30 | #define LAST_PKMAP 512 | 35 | #define LAST_PKMAP 512 |
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4d258ad76a0f..b8238dc8786d 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
| @@ -273,6 +273,7 @@ typedef struct page *pgtable_t; | |||
| 273 | 273 | ||
| 274 | extern pteval_t __supported_pte_mask; | 274 | extern pteval_t __supported_pte_mask; |
| 275 | extern int nx_enabled; | 275 | extern int nx_enabled; |
| 276 | extern void set_nx(void); | ||
| 276 | 277 | ||
| 277 | #define pgprot_writecombine pgprot_writecombine | 278 | #define pgprot_writecombine pgprot_writecombine |
| 278 | extern pgprot_t pgprot_writecombine(pgprot_t prot); | 279 | extern pgprot_t pgprot_writecombine(pgprot_t prot); |
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 777327ef05c1..9f4dfba33b28 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
| @@ -199,6 +199,10 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); | |||
| 199 | #define SCIR_CPU_ACTIVITY 0x02 /* not idle */ | 199 | #define SCIR_CPU_ACTIVITY 0x02 /* not idle */ |
| 200 | #define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */ | 200 | #define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */ |
| 201 | 201 | ||
| 202 | /* Loop through all installed blades */ | ||
| 203 | #define for_each_possible_blade(bid) \ | ||
| 204 | for ((bid) = 0; (bid) < uv_num_possible_blades(); (bid)++) | ||
| 205 | |||
| 202 | /* | 206 | /* |
| 203 | * Macros for converting between kernel virtual addresses, socket local physical | 207 | * Macros for converting between kernel virtual addresses, socket local physical |
| 204 | * addresses, and UV global physical addresses. | 208 | * addresses, and UV global physical addresses. |
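The new for_each_possible_blade() iterator is a plain counting loop over uv_num_possible_blades(). A hypothetical usage sketch; uv_blade_nr_possible_cpus() is an existing helper in this header:

/* Hypothetical caller of the new blade iterator. */
#include <linux/kernel.h>
#include <asm/uv/uv_hub.h>

static void uv_dump_blades(void)
{
	int bid;

	for_each_possible_blade(bid)
		printk(KERN_INFO "UV: blade %d, %d possible cpus\n",
		       bid, uv_blade_nr_possible_cpus(bid));
}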
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 4bd990ee43df..1a918dde46b5 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
| @@ -164,6 +164,7 @@ static inline pte_t __pte_ma(pteval_t x) | |||
| 164 | 164 | ||
| 165 | 165 | ||
| 166 | xmaddr_t arbitrary_virt_to_machine(void *address); | 166 | xmaddr_t arbitrary_virt_to_machine(void *address); |
| 167 | unsigned long arbitrary_virt_to_mfn(void *vaddr); | ||
| 167 | void make_lowmem_page_readonly(void *vaddr); | 168 | void make_lowmem_page_readonly(void *vaddr); |
| 168 | void make_lowmem_page_readwrite(void *vaddr); | 169 | void make_lowmem_page_readwrite(void *vaddr); |
| 169 | 170 | ||
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 95f216bbfaf1..339ce35648e6 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
| @@ -111,7 +111,7 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64 | |||
| 111 | ### | 111 | ### |
| 112 | # 64 bit specific files | 112 | # 64 bit specific files |
| 113 | ifeq ($(CONFIG_X86_64),y) | 113 | ifeq ($(CONFIG_X86_64),y) |
| 114 | obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o | 114 | obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o |
| 115 | obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o | 115 | obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o |
| 116 | obj-$(CONFIG_AUDIT) += audit_64.o | 116 | obj-$(CONFIG_AUDIT) += audit_64.o |
| 117 | 117 | ||
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 6907b8e85d52..4c80f1557433 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
| @@ -414,9 +414,17 @@ void __init alternative_instructions(void) | |||
| 414 | that might execute the to be patched code. | 414 | that might execute the to be patched code. |
| 415 | Other CPUs are not running. */ | 415 | Other CPUs are not running. */ |
| 416 | stop_nmi(); | 416 | stop_nmi(); |
| 417 | #ifdef CONFIG_X86_MCE | 417 | |
| 418 | stop_mce(); | 418 | /* |
| 419 | #endif | 419 | * Don't stop machine check exceptions while patching. |
| 420 | * MCEs only happen when something got corrupted and in this | ||
| 421 | * case we must do something about the corruption. | ||
| 422 | * Ignoring it is worse than a unlikely patching race. | ||
| 423 | * Also machine checks tend to be broadcast and if one CPU | ||
| 424 | * goes into machine check the others follow quickly, so we don't | ||
| 425 | * expect a machine check to cause undue problems during to code | ||
| 426 | * patching. | ||
| 427 | */ | ||
| 420 | 428 | ||
| 421 | apply_alternatives(__alt_instructions, __alt_instructions_end); | 429 | apply_alternatives(__alt_instructions, __alt_instructions_end); |
| 422 | 430 | ||
| @@ -456,9 +464,6 @@ void __init alternative_instructions(void) | |||
| 456 | (unsigned long)__smp_locks_end); | 464 | (unsigned long)__smp_locks_end); |
| 457 | 465 | ||
| 458 | restart_nmi(); | 466 | restart_nmi(); |
| 459 | #ifdef CONFIG_X86_MCE | ||
| 460 | restart_mce(); | ||
| 461 | #endif | ||
| 462 | } | 467 | } |
| 463 | 468 | ||
| 464 | /** | 469 | /** |
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f9cecdfd05c5..30909a258d0f 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
| @@ -46,6 +46,7 @@ | |||
| 46 | #include <asm/idle.h> | 46 | #include <asm/idle.h> |
| 47 | #include <asm/mtrr.h> | 47 | #include <asm/mtrr.h> |
| 48 | #include <asm/smp.h> | 48 | #include <asm/smp.h> |
| 49 | #include <asm/mce.h> | ||
| 49 | 50 | ||
| 50 | unsigned int num_processors; | 51 | unsigned int num_processors; |
| 51 | 52 | ||
| @@ -842,6 +843,14 @@ void clear_local_APIC(void) | |||
| 842 | apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); | 843 | apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); |
| 843 | } | 844 | } |
| 844 | #endif | 845 | #endif |
| 846 | #ifdef CONFIG_X86_MCE_INTEL | ||
| 847 | if (maxlvt >= 6) { | ||
| 848 | v = apic_read(APIC_LVTCMCI); | ||
| 849 | if (!(v & APIC_LVT_MASKED)) | ||
| 850 | apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED); | ||
| 851 | } | ||
| 852 | #endif | ||
| 853 | |||
| 845 | /* | 854 | /* |
| 846 | * Clean APIC state for other OSs: | 855 | * Clean APIC state for other OSs: |
| 847 | */ | 856 | */ |
| @@ -1241,6 +1250,12 @@ void __cpuinit setup_local_APIC(void) | |||
| 1241 | apic_write(APIC_LVT1, value); | 1250 | apic_write(APIC_LVT1, value); |
| 1242 | 1251 | ||
| 1243 | preempt_enable(); | 1252 | preempt_enable(); |
| 1253 | |||
| 1254 | #ifdef CONFIG_X86_MCE_INTEL | ||
| 1255 | /* Recheck CMCI information after local APIC is up on CPU #0 */ | ||
| 1256 | if (smp_processor_id() == 0) | ||
| 1257 | cmci_recheck(); | ||
| 1258 | #endif | ||
| 1244 | } | 1259 | } |
| 1245 | 1260 | ||
| 1246 | void __cpuinit end_local_APIC_setup(void) | 1261 | void __cpuinit end_local_APIC_setup(void) |
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 25423a5b80ed..f47df59016c5 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
| @@ -5,6 +5,7 @@ | |||
| 5 | #include <asm/io.h> | 5 | #include <asm/io.h> |
| 6 | #include <asm/processor.h> | 6 | #include <asm/processor.h> |
| 7 | #include <asm/apic.h> | 7 | #include <asm/apic.h> |
| 8 | #include <asm/cpu.h> | ||
| 8 | 9 | ||
| 9 | #ifdef CONFIG_X86_64 | 10 | #ifdef CONFIG_X86_64 |
| 10 | # include <asm/numa_64.h> | 11 | # include <asm/numa_64.h> |
| @@ -141,6 +142,55 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) | |||
| 141 | } | 142 | } |
| 142 | } | 143 | } |
| 143 | 144 | ||
| 145 | static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) | ||
| 146 | { | ||
| 147 | #ifdef CONFIG_SMP | ||
| 148 | /* calling is from identify_secondary_cpu() ? */ | ||
| 149 | if (c->cpu_index == boot_cpu_id) | ||
| 150 | return; | ||
| 151 | |||
| 152 | /* | ||
| 153 | * Certain Athlons might work (for various values of 'work') in SMP | ||
| 154 | * but they are not certified as MP capable. | ||
| 155 | */ | ||
| 156 | /* Athlon 660/661 is valid. */ | ||
| 157 | if ((c->x86_model == 6) && ((c->x86_mask == 0) || | ||
| 158 | (c->x86_mask == 1))) | ||
| 159 | goto valid_k7; | ||
| 160 | |||
| 161 | /* Duron 670 is valid */ | ||
| 162 | if ((c->x86_model == 7) && (c->x86_mask == 0)) | ||
| 163 | goto valid_k7; | ||
| 164 | |||
| 165 | /* | ||
| 166 | * Athlon 662, Duron 671, and Athlon >model 7 have capability | ||
| 167 | * bit. It's worth noting that the A5 stepping (662) of some | ||
| 168 | * Athlon XP's have the MP bit set. | ||
| 169 | * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for | ||
| 170 | * more. | ||
| 171 | */ | ||
| 172 | if (((c->x86_model == 6) && (c->x86_mask >= 2)) || | ||
| 173 | ((c->x86_model == 7) && (c->x86_mask >= 1)) || | ||
| 174 | (c->x86_model > 7)) | ||
| 175 | if (cpu_has_mp) | ||
| 176 | goto valid_k7; | ||
| 177 | |||
| 178 | /* If we get here, not a certified SMP capable AMD system. */ | ||
| 179 | |||
| 180 | /* | ||
| 181 | * Don't taint if we are running SMP kernel on a single non-MP | ||
| 182 | * approved Athlon | ||
| 183 | */ | ||
| 184 | WARN_ONCE(1, "WARNING: This combination of AMD" | ||
| 185 | "processors is not suitable for SMP.\n"); | ||
| 186 | if (!test_taint(TAINT_UNSAFE_SMP)) | ||
| 187 | add_taint(TAINT_UNSAFE_SMP); | ||
| 188 | |||
| 189 | valid_k7: | ||
| 190 | ; | ||
| 191 | #endif | ||
| 192 | } | ||
| 193 | |||
| 144 | static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) | 194 | static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) |
| 145 | { | 195 | { |
| 146 | u32 l, h; | 196 | u32 l, h; |
| @@ -175,6 +225,8 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) | |||
| 175 | } | 225 | } |
| 176 | 226 | ||
| 177 | set_cpu_cap(c, X86_FEATURE_K7); | 227 | set_cpu_cap(c, X86_FEATURE_K7); |
| 228 | |||
| 229 | amd_k7_smp_check(c); | ||
| 178 | } | 230 | } |
| 179 | #endif | 231 | #endif |
| 180 | 232 | ||
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 4b1c319d30c3..22590cf688ae 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
| @@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) | |||
| 601 | if (!data) | 601 | if (!data) |
| 602 | return -ENOMEM; | 602 | return -ENOMEM; |
| 603 | 603 | ||
| 604 | data->acpi_data = percpu_ptr(acpi_perf_data, cpu); | 604 | data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); |
| 605 | per_cpu(drv_data, cpu) = data; | 605 | per_cpu(drv_data, cpu) = data; |
| 606 | 606 | ||
| 607 | if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) | 607 | if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) |
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index b585e04cbc9e..3178c3acd97e 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
| @@ -277,7 +277,6 @@ static struct cpufreq_driver p4clockmod_driver = { | |||
| 277 | .name = "p4-clockmod", | 277 | .name = "p4-clockmod", |
| 278 | .owner = THIS_MODULE, | 278 | .owner = THIS_MODULE, |
| 279 | .attr = p4clockmod_attr, | 279 | .attr = p4clockmod_attr, |
| 280 | .hide_interface = 1, | ||
| 281 | }; | 280 | }; |
| 282 | 281 | ||
| 283 | 282 | ||
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 25c559ba8d54..191117f1ad51 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
| @@ -13,6 +13,7 @@ | |||
| 13 | #include <asm/uaccess.h> | 13 | #include <asm/uaccess.h> |
| 14 | #include <asm/ds.h> | 14 | #include <asm/ds.h> |
| 15 | #include <asm/bugs.h> | 15 | #include <asm/bugs.h> |
| 16 | #include <asm/cpu.h> | ||
| 16 | 17 | ||
| 17 | #ifdef CONFIG_X86_64 | 18 | #ifdef CONFIG_X86_64 |
| 18 | #include <asm/topology.h> | 19 | #include <asm/topology.h> |
| @@ -110,6 +111,28 @@ static void __cpuinit trap_init_f00f_bug(void) | |||
| 110 | } | 111 | } |
| 111 | #endif | 112 | #endif |
| 112 | 113 | ||
| 114 | static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) | ||
| 115 | { | ||
| 116 | #ifdef CONFIG_SMP | ||
| 117 | /* calling is from identify_secondary_cpu() ? */ | ||
| 118 | if (c->cpu_index == boot_cpu_id) | ||
| 119 | return; | ||
| 120 | |||
| 121 | /* | ||
| 122 | * Mask B, Pentium, but not Pentium MMX | ||
| 123 | */ | ||
| 124 | if (c->x86 == 5 && | ||
| 125 | c->x86_mask >= 1 && c->x86_mask <= 4 && | ||
| 126 | c->x86_model <= 3) { | ||
| 127 | /* | ||
| 128 | * Remember we have B step Pentia with bugs | ||
| 129 | */ | ||
| 130 | WARN_ONCE(1, "WARNING: SMP operation may be unreliable" | ||
| 131 | "with B stepping processors.\n"); | ||
| 132 | } | ||
| 133 | #endif | ||
| 134 | } | ||
| 135 | |||
| 113 | static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) | 136 | static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) |
| 114 | { | 137 | { |
| 115 | unsigned long lo, hi; | 138 | unsigned long lo, hi; |
| @@ -186,6 +209,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) | |||
| 186 | #ifdef CONFIG_X86_NUMAQ | 209 | #ifdef CONFIG_X86_NUMAQ |
| 187 | numaq_tsc_disable(); | 210 | numaq_tsc_disable(); |
| 188 | #endif | 211 | #endif |
| 212 | |||
| 213 | intel_smp_check(c); | ||
| 189 | } | 214 | } |
| 190 | #else | 215 | #else |
| 191 | static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) | 216 | static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) |
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index d7d2323bbb69..b2f89829bbe8 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
| @@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o | |||
| 4 | obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o | 4 | obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o |
| 5 | obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o | 5 | obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o |
| 6 | obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o | 6 | obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o |
| 7 | obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index dfaebce3633e..3552119b091d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
| @@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c) | |||
| 60 | } | 60 | } |
| 61 | } | 61 | } |
| 62 | 62 | ||
| 63 | static unsigned long old_cr4 __initdata; | ||
| 64 | |||
| 65 | void __init stop_mce(void) | ||
| 66 | { | ||
| 67 | old_cr4 = read_cr4(); | ||
| 68 | clear_in_cr4(X86_CR4_MCE); | ||
| 69 | } | ||
| 70 | |||
| 71 | void __init restart_mce(void) | ||
| 72 | { | ||
| 73 | if (old_cr4 & X86_CR4_MCE) | ||
| 74 | set_in_cr4(X86_CR4_MCE); | ||
| 75 | } | ||
| 76 | |||
| 77 | static int __init mcheck_disable(char *str) | 63 | static int __init mcheck_disable(char *str) |
| 78 | { | 64 | { |
| 79 | mce_disabled = 1; | 65 | mce_disabled = 1; |
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index fe79985ce0f2..ca14604611ec 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
| @@ -3,6 +3,8 @@ | |||
| 3 | * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. | 3 | * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. |
| 4 | * Rest from unknown author(s). | 4 | * Rest from unknown author(s). |
| 5 | * 2004 Andi Kleen. Rewrote most of it. | 5 | * 2004 Andi Kleen. Rewrote most of it. |
| 6 | * Copyright 2008 Intel Corporation | ||
| 7 | * Author: Andi Kleen | ||
| 6 | */ | 8 | */ |
| 7 | 9 | ||
| 8 | #include <linux/init.h> | 10 | #include <linux/init.h> |
| @@ -24,6 +26,9 @@ | |||
| 24 | #include <linux/ctype.h> | 26 | #include <linux/ctype.h> |
| 25 | #include <linux/kmod.h> | 27 | #include <linux/kmod.h> |
| 26 | #include <linux/kdebug.h> | 28 | #include <linux/kdebug.h> |
| 29 | #include <linux/kobject.h> | ||
| 30 | #include <linux/sysfs.h> | ||
| 31 | #include <linux/ratelimit.h> | ||
| 27 | #include <asm/processor.h> | 32 | #include <asm/processor.h> |
| 28 | #include <asm/msr.h> | 33 | #include <asm/msr.h> |
| 29 | #include <asm/mce.h> | 34 | #include <asm/mce.h> |
| @@ -32,7 +37,6 @@ | |||
| 32 | #include <asm/idle.h> | 37 | #include <asm/idle.h> |
| 33 | 38 | ||
| 34 | #define MISC_MCELOG_MINOR 227 | 39 | #define MISC_MCELOG_MINOR 227 |
| 35 | #define NR_SYSFS_BANKS 6 | ||
| 36 | 40 | ||
| 37 | atomic_t mce_entry; | 41 | atomic_t mce_entry; |
| 38 | 42 | ||
| @@ -47,7 +51,7 @@ static int mce_dont_init; | |||
| 47 | */ | 51 | */ |
| 48 | static int tolerant = 1; | 52 | static int tolerant = 1; |
| 49 | static int banks; | 53 | static int banks; |
| 50 | static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; | 54 | static u64 *bank; |
| 51 | static unsigned long notify_user; | 55 | static unsigned long notify_user; |
| 52 | static int rip_msr; | 56 | static int rip_msr; |
| 53 | static int mce_bootlog = -1; | 57 | static int mce_bootlog = -1; |
| @@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL }; | |||
| 58 | 62 | ||
| 59 | static DECLARE_WAIT_QUEUE_HEAD(mce_wait); | 63 | static DECLARE_WAIT_QUEUE_HEAD(mce_wait); |
| 60 | 64 | ||
| 65 | /* MCA banks polled by the period polling timer for corrected events */ | ||
| 66 | DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { | ||
| 67 | [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL | ||
| 68 | }; | ||
| 69 | |||
| 70 | /* Do initial initialization of a struct mce */ | ||
| 71 | void mce_setup(struct mce *m) | ||
| 72 | { | ||
| 73 | memset(m, 0, sizeof(struct mce)); | ||
| 74 | m->cpu = smp_processor_id(); | ||
| 75 | rdtscll(m->tsc); | ||
| 76 | } | ||
| 77 | |||
| 61 | /* | 78 | /* |
| 62 | * Lockless MCE logging infrastructure. | 79 | * Lockless MCE logging infrastructure. |
| 63 | * This avoids deadlocks on printk locks without having to break locks. Also | 80 | * This avoids deadlocks on printk locks without having to break locks. Also |
| @@ -119,11 +136,11 @@ static void print_mce(struct mce *m) | |||
| 119 | print_symbol("{%s}", m->ip); | 136 | print_symbol("{%s}", m->ip); |
| 120 | printk("\n"); | 137 | printk("\n"); |
| 121 | } | 138 | } |
| 122 | printk(KERN_EMERG "TSC %Lx ", m->tsc); | 139 | printk(KERN_EMERG "TSC %llx ", m->tsc); |
| 123 | if (m->addr) | 140 | if (m->addr) |
| 124 | printk("ADDR %Lx ", m->addr); | 141 | printk("ADDR %llx ", m->addr); |
| 125 | if (m->misc) | 142 | if (m->misc) |
| 126 | printk("MISC %Lx ", m->misc); | 143 | printk("MISC %llx ", m->misc); |
| 127 | printk("\n"); | 144 | printk("\n"); |
| 128 | printk(KERN_EMERG "This is not a software problem!\n"); | 145 | printk(KERN_EMERG "This is not a software problem!\n"); |
| 129 | printk(KERN_EMERG "Run through mcelog --ascii to decode " | 146 | printk(KERN_EMERG "Run through mcelog --ascii to decode " |
| @@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) | |||
| 149 | panic(msg); | 166 | panic(msg); |
| 150 | } | 167 | } |
| 151 | 168 | ||
| 152 | static int mce_available(struct cpuinfo_x86 *c) | 169 | int mce_available(struct cpuinfo_x86 *c) |
| 153 | { | 170 | { |
| 171 | if (mce_dont_init) | ||
| 172 | return 0; | ||
| 154 | return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); | 173 | return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); |
| 155 | } | 174 | } |
| 156 | 175 | ||
| @@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) | |||
| 172 | } | 191 | } |
| 173 | 192 | ||
| 174 | /* | 193 | /* |
| 175 | * The actual machine check handler | 194 | * Poll for corrected events or events that happened before reset. |
| 195 | * Those are just logged through /dev/mcelog. | ||
| 196 | * | ||
| 197 | * This is executed in standard interrupt context. | ||
| 198 | */ | ||
| 199 | void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | ||
| 200 | { | ||
| 201 | struct mce m; | ||
| 202 | int i; | ||
| 203 | |||
| 204 | mce_setup(&m); | ||
| 205 | |||
| 206 | rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); | ||
| 207 | for (i = 0; i < banks; i++) { | ||
| 208 | if (!bank[i] || !test_bit(i, *b)) | ||
| 209 | continue; | ||
| 210 | |||
| 211 | m.misc = 0; | ||
| 212 | m.addr = 0; | ||
| 213 | m.bank = i; | ||
| 214 | m.tsc = 0; | ||
| 215 | |||
| 216 | barrier(); | ||
| 217 | rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); | ||
| 218 | if (!(m.status & MCI_STATUS_VAL)) | ||
| 219 | continue; | ||
| 220 | |||
| 221 | /* | ||
| 222 | * Uncorrected events are handled by the exception handler | ||
| 223 | * when it is enabled. But when the exception is disabled log | ||
| 224 | * everything. | ||
| 225 | * | ||
| 226 | * TBD do the same check for MCI_STATUS_EN here? | ||
| 227 | */ | ||
| 228 | if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) | ||
| 229 | continue; | ||
| 230 | |||
| 231 | if (m.status & MCI_STATUS_MISCV) | ||
| 232 | rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); | ||
| 233 | if (m.status & MCI_STATUS_ADDRV) | ||
| 234 | rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); | ||
| 235 | |||
| 236 | if (!(flags & MCP_TIMESTAMP)) | ||
| 237 | m.tsc = 0; | ||
| 238 | /* | ||
| 239 | * Don't get the IP here because it's unlikely to | ||
| 240 | * have anything to do with the actual error location. | ||
| 241 | */ | ||
| 242 | |||
| 243 | mce_log(&m); | ||
| 244 | add_taint(TAINT_MACHINE_CHECK); | ||
| 245 | |||
| 246 | /* | ||
| 247 | * Clear state for this bank. | ||
| 248 | */ | ||
| 249 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
| 250 | } | ||
| 251 | |||
| 252 | /* | ||
| 253 | * Don't clear MCG_STATUS here because it's only defined for | ||
| 254 | * exceptions. | ||
| 255 | */ | ||
| 256 | } | ||
| 257 | |||
| 258 | /* | ||
| 259 | * The actual machine check handler. This only handles real | ||
| 260 | * exceptions when something got corrupted coming in through int 18. | ||
| 261 | * | ||
| 262 | * This is executed in NMI context not subject to normal locking rules. This | ||
| 263 | * implies that most kernel services cannot be safely used. Don't even | ||
| 264 | * think about putting a printk in there! | ||
| 176 | */ | 265 | */ |
| 177 | void do_machine_check(struct pt_regs * regs, long error_code) | 266 | void do_machine_check(struct pt_regs * regs, long error_code) |
| 178 | { | 267 | { |
| @@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code) | |||
| 190 | * error. | 279 | * error. |
| 191 | */ | 280 | */ |
| 192 | int kill_it = 0; | 281 | int kill_it = 0; |
| 282 | DECLARE_BITMAP(toclear, MAX_NR_BANKS); | ||
| 193 | 283 | ||
| 194 | atomic_inc(&mce_entry); | 284 | atomic_inc(&mce_entry); |
| 195 | 285 | ||
| 196 | if ((regs | 286 | if (notify_die(DIE_NMI, "machine check", regs, error_code, |
| 197 | && notify_die(DIE_NMI, "machine check", regs, error_code, | ||
| 198 | 18, SIGKILL) == NOTIFY_STOP) | 287 | 18, SIGKILL) == NOTIFY_STOP) |
| 199 | || !banks) | 288 | goto out2; |
| 289 | if (!banks) | ||
| 200 | goto out2; | 290 | goto out2; |
| 201 | 291 | ||
| 202 | memset(&m, 0, sizeof(struct mce)); | 292 | mce_setup(&m); |
| 203 | m.cpu = smp_processor_id(); | 293 | |
| 204 | rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); | 294 | rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); |
| 205 | /* if the restart IP is not valid, we're done for */ | 295 | /* if the restart IP is not valid, we're done for */ |
| 206 | if (!(m.mcgstatus & MCG_STATUS_RIPV)) | 296 | if (!(m.mcgstatus & MCG_STATUS_RIPV)) |
| @@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code) | |||
| 210 | barrier(); | 300 | barrier(); |
| 211 | 301 | ||
| 212 | for (i = 0; i < banks; i++) { | 302 | for (i = 0; i < banks; i++) { |
| 213 | if (i < NR_SYSFS_BANKS && !bank[i]) | 303 | __clear_bit(i, toclear); |
| 304 | if (!bank[i]) | ||
| 214 | continue; | 305 | continue; |
| 215 | 306 | ||
| 216 | m.misc = 0; | 307 | m.misc = 0; |
| 217 | m.addr = 0; | 308 | m.addr = 0; |
| 218 | m.bank = i; | 309 | m.bank = i; |
| 219 | m.tsc = 0; | ||
| 220 | 310 | ||
| 221 | rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); | 311 | rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); |
| 222 | if ((m.status & MCI_STATUS_VAL) == 0) | 312 | if ((m.status & MCI_STATUS_VAL) == 0) |
| 223 | continue; | 313 | continue; |
| 224 | 314 | ||
| 315 | /* | ||
| 316 | * Corrected errors are handled by machine_check_poll(). | ||
| 317 | * Leave them alone here. | ||
| 318 | */ | ||
| 319 | if ((m.status & MCI_STATUS_UC) == 0) | ||
| 320 | continue; | ||
| 321 | |||
| 322 | /* | ||
| 323 | * Set taint even when machine check was not enabled. | ||
| 324 | */ | ||
| 325 | add_taint(TAINT_MACHINE_CHECK); | ||
| 326 | |||
| 327 | __set_bit(i, toclear); | ||
| 328 | |||
| 225 | if (m.status & MCI_STATUS_EN) { | 329 | if (m.status & MCI_STATUS_EN) { |
| 226 | /* if PCC was set, there's no way out */ | 330 | /* if PCC was set, there's no way out */ |
| 227 | no_way_out |= !!(m.status & MCI_STATUS_PCC); | 331 | no_way_out |= !!(m.status & MCI_STATUS_PCC); |
| @@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code) | |||
| 235 | no_way_out = 1; | 339 | no_way_out = 1; |
| 236 | kill_it = 1; | 340 | kill_it = 1; |
| 237 | } | 341 | } |
| 342 | } else { | ||
| 343 | /* | ||
| 344 | * Machine check event was not enabled. Clear, but | ||
| 345 | * ignore. | ||
| 346 | */ | ||
| 347 | continue; | ||
| 238 | } | 348 | } |
| 239 | 349 | ||
| 240 | if (m.status & MCI_STATUS_MISCV) | 350 | if (m.status & MCI_STATUS_MISCV) |
| @@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) | |||
| 243 | rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); | 353 | rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); |
| 244 | 354 | ||
| 245 | mce_get_rip(&m, regs); | 355 | mce_get_rip(&m, regs); |
| 246 | if (error_code >= 0) | 356 | mce_log(&m); |
| 247 | rdtscll(m.tsc); | ||
| 248 | if (error_code != -2) | ||
| 249 | mce_log(&m); | ||
| 250 | 357 | ||
| 251 | /* Did this bank cause the exception? */ | 358 | /* Did this bank cause the exception? */ |
| 252 | /* Assume that the bank with uncorrectable errors did it, | 359 | /* Assume that the bank with uncorrectable errors did it, |
| @@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) | |||
| 255 | panicm = m; | 362 | panicm = m; |
| 256 | panicm_found = 1; | 363 | panicm_found = 1; |
| 257 | } | 364 | } |
| 258 | |||
| 259 | add_taint(TAINT_MACHINE_CHECK); | ||
| 260 | } | 365 | } |
| 261 | 366 | ||
| 262 | /* Never do anything final in the polling timer */ | ||
| 263 | if (!regs) | ||
| 264 | goto out; | ||
| 265 | |||
| 266 | /* If we didn't find an uncorrectable error, pick | 367 | /* If we didn't find an uncorrectable error, pick |
| 267 | the last one (shouldn't happen, just being safe). */ | 368 | the last one (shouldn't happen, just being safe). */ |
| 268 | if (!panicm_found) | 369 | if (!panicm_found) |
| @@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code) | |||
| 309 | /* notify userspace ASAP */ | 410 | /* notify userspace ASAP */ |
| 310 | set_thread_flag(TIF_MCE_NOTIFY); | 411 | set_thread_flag(TIF_MCE_NOTIFY); |
| 311 | 412 | ||
| 312 | out: | ||
| 313 | /* the last thing we do is clear state */ | 413 | /* the last thing we do is clear state */ |
| 314 | for (i = 0; i < banks; i++) | 414 | for (i = 0; i < banks; i++) { |
| 315 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | 415 | if (test_bit(i, toclear)) |
| 416 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
| 417 | } | ||
| 316 | wrmsrl(MSR_IA32_MCG_STATUS, 0); | 418 | wrmsrl(MSR_IA32_MCG_STATUS, 0); |
| 317 | out2: | 419 | out2: |
| 318 | atomic_dec(&mce_entry); | 420 | atomic_dec(&mce_entry); |
| @@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code) | |||
| 332 | * and historically has been the register value of the | 434 | * and historically has been the register value of the |
| 333 | * MSR_IA32_THERMAL_STATUS (Intel) msr. | 435 | * MSR_IA32_THERMAL_STATUS (Intel) msr. |
| 334 | */ | 436 | */ |
| 335 | void mce_log_therm_throt_event(unsigned int cpu, __u64 status) | 437 | void mce_log_therm_throt_event(__u64 status) |
| 336 | { | 438 | { |
| 337 | struct mce m; | 439 | struct mce m; |
| 338 | 440 | ||
| 339 | memset(&m, 0, sizeof(m)); | 441 | mce_setup(&m); |
| 340 | m.cpu = cpu; | ||
| 341 | m.bank = MCE_THERMAL_BANK; | 442 | m.bank = MCE_THERMAL_BANK; |
| 342 | m.status = status; | 443 | m.status = status; |
| 343 | rdtscll(m.tsc); | ||
| 344 | mce_log(&m); | 444 | mce_log(&m); |
| 345 | } | 445 | } |
| 346 | #endif /* CONFIG_X86_MCE_INTEL */ | 446 | #endif /* CONFIG_X86_MCE_INTEL */ |
| @@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status) | |||
| 353 | 453 | ||
| 354 | static int check_interval = 5 * 60; /* 5 minutes */ | 454 | static int check_interval = 5 * 60; /* 5 minutes */ |
| 355 | static int next_interval; /* in jiffies */ | 455 | static int next_interval; /* in jiffies */ |
| 356 | static void mcheck_timer(struct work_struct *work); | 456 | static void mcheck_timer(unsigned long); |
| 357 | static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); | 457 | static DEFINE_PER_CPU(struct timer_list, mce_timer); |
| 358 | 458 | ||
| 359 | static void mcheck_check_cpu(void *info) | 459 | static void mcheck_timer(unsigned long data) |
| 360 | { | 460 | { |
| 361 | if (mce_available(¤t_cpu_data)) | 461 | struct timer_list *t = &per_cpu(mce_timer, data); |
| 362 | do_machine_check(NULL, 0); | ||
| 363 | } | ||
| 364 | 462 | ||
| 365 | static void mcheck_timer(struct work_struct *work) | 463 | WARN_ON(smp_processor_id() != data); |
| 366 | { | 464 | |
| 367 | on_each_cpu(mcheck_check_cpu, NULL, 1); | 465 | if (mce_available(¤t_cpu_data)) |
| 466 | machine_check_poll(MCP_TIMESTAMP, | ||
| 467 | &__get_cpu_var(mce_poll_banks)); | ||
| 368 | 468 | ||
| 369 | /* | 469 | /* |
| 370 | * Alert userspace if needed. If we logged an MCE, reduce the | 470 | * Alert userspace if needed. If we logged an MCE, reduce the |
| @@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work) | |||
| 377 | (int)round_jiffies_relative(check_interval*HZ)); | 477 | (int)round_jiffies_relative(check_interval*HZ)); |
| 378 | } | 478 | } |
| 379 | 479 | ||
| 380 | schedule_delayed_work(&mcheck_work, next_interval); | 480 | t->expires = jiffies + next_interval; |
| 481 | add_timer(t); | ||
| 482 | } | ||
| 483 | |||
| 484 | static void mce_do_trigger(struct work_struct *work) | ||
| 485 | { | ||
| 486 | call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); | ||
| 381 | } | 487 | } |
| 382 | 488 | ||
| 489 | static DECLARE_WORK(mce_trigger_work, mce_do_trigger); | ||
| 490 | |||
| 383 | /* | 491 | /* |
| 384 | * This is only called from process context. This is where we do | 492 | * Notify the user(s) about new machine check events. |
| 385 | * anything we need to alert userspace about new MCEs. This is called | 493 | * Can be called from interrupt context, but not from machine check/NMI |
| 386 | * directly from the poller and also from entry.S and idle, thanks to | 494 | * context. |
| 387 | * TIF_MCE_NOTIFY. | ||
| 388 | */ | 495 | */ |
| 389 | int mce_notify_user(void) | 496 | int mce_notify_user(void) |
| 390 | { | 497 | { |
| 498 | /* Not more than two messages every minute */ | ||
| 499 | static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); | ||
| 500 | |||
| 391 | clear_thread_flag(TIF_MCE_NOTIFY); | 501 | clear_thread_flag(TIF_MCE_NOTIFY); |
| 392 | if (test_and_clear_bit(0, ¬ify_user)) { | 502 | if (test_and_clear_bit(0, ¬ify_user)) { |
| 393 | static unsigned long last_print; | ||
| 394 | unsigned long now = jiffies; | ||
| 395 | |||
| 396 | wake_up_interruptible(&mce_wait); | 503 | wake_up_interruptible(&mce_wait); |
| 397 | if (trigger[0]) | ||
| 398 | call_usermodehelper(trigger, trigger_argv, NULL, | ||
| 399 | UMH_NO_WAIT); | ||
| 400 | 504 | ||
| 401 | if (time_after_eq(now, last_print + (check_interval*HZ))) { | 505 | /* |
| 402 | last_print = now; | 506 | * There is no risk of missing notifications because |
| 507 | * work_pending is always cleared before the function is | ||
| 508 | * executed. | ||
| 509 | */ | ||
| 510 | if (trigger[0] && !work_pending(&mce_trigger_work)) | ||
| 511 | schedule_work(&mce_trigger_work); | ||
| 512 | |||
| 513 | if (__ratelimit(&ratelimit)) | ||
| 403 | printk(KERN_INFO "Machine check events logged\n"); | 514 | printk(KERN_INFO "Machine check events logged\n"); |
| 404 | } | ||
| 405 | 515 | ||
| 406 | return 1; | 516 | return 1; |
| 407 | } | 517 | } |
| @@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = { | |||
| 425 | 535 | ||
| 426 | static __init int periodic_mcheck_init(void) | 536 | static __init int periodic_mcheck_init(void) |
| 427 | { | 537 | { |
| 428 | next_interval = check_interval * HZ; | 538 | idle_notifier_register(&mce_idle_notifier); |
| 429 | if (next_interval) | 539 | return 0; |
| 430 | schedule_delayed_work(&mcheck_work, | ||
| 431 | round_jiffies_relative(next_interval)); | ||
| 432 | idle_notifier_register(&mce_idle_notifier); | ||
| 433 | return 0; | ||
| 434 | } | 540 | } |
| 435 | __initcall(periodic_mcheck_init); | 541 | __initcall(periodic_mcheck_init); |
| 436 | 542 | ||
| 437 | |||
| 438 | /* | 543 | /* |
| 439 | * Initialize Machine Checks for a CPU. | 544 | * Initialize Machine Checks for a CPU. |
| 440 | */ | 545 | */ |
| 441 | static void mce_init(void *dummy) | 546 | static int mce_cap_init(void) |
| 442 | { | 547 | { |
| 443 | u64 cap; | 548 | u64 cap; |
| 444 | int i; | 549 | unsigned b; |
| 445 | 550 | ||
| 446 | rdmsrl(MSR_IA32_MCG_CAP, cap); | 551 | rdmsrl(MSR_IA32_MCG_CAP, cap); |
| 447 | banks = cap & 0xff; | 552 | b = cap & 0xff; |
| 448 | if (banks > MCE_EXTENDED_BANK) { | 553 | if (b > MAX_NR_BANKS) { |
| 449 | banks = MCE_EXTENDED_BANK; | 554 | printk(KERN_WARNING |
| 450 | printk(KERN_INFO "MCE: warning: using only %d banks\n", | 555 | "MCE: Using only %u machine check banks out of %u\n", |
| 451 | MCE_EXTENDED_BANK); | 556 | MAX_NR_BANKS, b); |
| 557 | b = MAX_NR_BANKS; | ||
| 452 | } | 558 | } |
| 559 | |||
| 560 | /* Don't support asymmetric configurations today */ | ||
| 561 | WARN_ON(banks != 0 && b != banks); | ||
| 562 | banks = b; | ||
| 563 | if (!bank) { | ||
| 564 | bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); | ||
| 565 | if (!bank) | ||
| 566 | return -ENOMEM; | ||
| 567 | memset(bank, 0xff, banks * sizeof(u64)); | ||
| 568 | } | ||
| 569 | |||
| 453 | /* Use accurate RIP reporting if available. */ | 570 | /* Use accurate RIP reporting if available. */ |
| 454 | if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) | 571 | if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) |
| 455 | rip_msr = MSR_IA32_MCG_EIP; | 572 | rip_msr = MSR_IA32_MCG_EIP; |
| 456 | 573 | ||
| 457 | /* Log the machine checks left over from the previous reset. | 574 | return 0; |
| 458 | This also clears all registers */ | 575 | } |
| 459 | do_machine_check(NULL, mce_bootlog ? -1 : -2); | 576 | |
| 577 | static void mce_init(void *dummy) | ||
| 578 | { | ||
| 579 | u64 cap; | ||
| 580 | int i; | ||
| 581 | mce_banks_t all_banks; | ||
| 582 | |||
| 583 | /* | ||
| 584 | * Log the machine checks left over from the previous reset. | ||
| 585 | */ | ||
| 586 | bitmap_fill(all_banks, MAX_NR_BANKS); | ||
| 587 | machine_check_poll(MCP_UC, &all_banks); | ||
| 460 | 588 | ||
| 461 | set_in_cr4(X86_CR4_MCE); | 589 | set_in_cr4(X86_CR4_MCE); |
| 462 | 590 | ||
| 591 | rdmsrl(MSR_IA32_MCG_CAP, cap); | ||
| 463 | if (cap & MCG_CTL_P) | 592 | if (cap & MCG_CTL_P) |
| 464 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); | 593 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); |
| 465 | 594 | ||
| 466 | for (i = 0; i < banks; i++) { | 595 | for (i = 0; i < banks; i++) { |
| 467 | if (i < NR_SYSFS_BANKS) | 596 | wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); |
| 468 | wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); | ||
| 469 | else | ||
| 470 | wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); | ||
| 471 | |||
| 472 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | 597 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); |
| 473 | } | 598 | } |
| 474 | } | 599 | } |
| 475 | 600 | ||
| 476 | /* Add per CPU specific workarounds here */ | 601 | /* Add per CPU specific workarounds here */ |
| 477 | static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) | 602 | static void mce_cpu_quirks(struct cpuinfo_x86 *c) |
| 478 | { | 603 | { |
| 479 | /* This should be disabled by the BIOS, but isn't always */ | 604 | /* This should be disabled by the BIOS, but isn't always */ |
| 480 | if (c->x86_vendor == X86_VENDOR_AMD) { | 605 | if (c->x86_vendor == X86_VENDOR_AMD) { |
| 481 | if(c->x86 == 15) | 606 | if (c->x86 == 15 && banks > 4) |
| 482 | /* disable GART TBL walk error reporting, which trips off | 607 | /* disable GART TBL walk error reporting, which trips off |
| 483 | incorrectly with the IOMMU & 3ware & Cerberus. */ | 608 | incorrectly with the IOMMU & 3ware & Cerberus. */ |
| 484 | clear_bit(10, &bank[4]); | 609 | clear_bit(10, (unsigned long *)&bank[4]); |
| 485 | if(c->x86 <= 17 && mce_bootlog < 0) | 610 | if(c->x86 <= 17 && mce_bootlog < 0) |
| 486 | /* Lots of broken BIOS around that don't clear them | 611 | /* Lots of broken BIOS around that don't clear them |
| 487 | by default and leave crap in there. Don't log. */ | 612 | by default and leave crap in there. Don't log. */ |
| @@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) | |||
| 504 | } | 629 | } |
| 505 | } | 630 | } |
| 506 | 631 | ||
| 632 | static void mce_init_timer(void) | ||
| 633 | { | ||
| 634 | struct timer_list *t = &__get_cpu_var(mce_timer); | ||
| 635 | |||
| 636 | /* data race harmless because everyone sets to the same value */ | ||
| 637 | if (!next_interval) | ||
| 638 | next_interval = check_interval * HZ; | ||
| 639 | if (!next_interval) | ||
| 640 | return; | ||
| 641 | setup_timer(t, mcheck_timer, smp_processor_id()); | ||
| 642 | t->expires = round_jiffies(jiffies + next_interval); | ||
| 643 | add_timer(t); | ||
| 644 | } | ||
| 645 | |||
| 507 | /* | 646 | /* |
| 508 | * Called for each booted CPU to set up machine checks. | 647 | * Called for each booted CPU to set up machine checks. |
| 509 | * Must be called with preempt off. | 648 | * Must be called with preempt off. |
| 510 | */ | 649 | */ |
| 511 | void __cpuinit mcheck_init(struct cpuinfo_x86 *c) | 650 | void __cpuinit mcheck_init(struct cpuinfo_x86 *c) |
| 512 | { | 651 | { |
| 513 | mce_cpu_quirks(c); | 652 | if (!mce_available(c)) |
| 653 | return; | ||
| 514 | 654 | ||
| 515 | if (mce_dont_init || | 655 | if (mce_cap_init() < 0) { |
| 516 | !mce_available(c)) | 656 | mce_dont_init = 1; |
| 517 | return; | 657 | return; |
| 658 | } | ||
| 659 | mce_cpu_quirks(c); | ||
| 518 | 660 | ||
| 519 | mce_init(NULL); | 661 | mce_init(NULL); |
| 520 | mce_cpu_features(c); | 662 | mce_cpu_features(c); |
| 663 | mce_init_timer(); | ||
| 521 | } | 664 | } |
| 522 | 665 | ||
| 523 | /* | 666 | /* |
| @@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, | |||
| 573 | { | 716 | { |
| 574 | unsigned long *cpu_tsc; | 717 | unsigned long *cpu_tsc; |
| 575 | static DEFINE_MUTEX(mce_read_mutex); | 718 | static DEFINE_MUTEX(mce_read_mutex); |
| 576 | unsigned next; | 719 | unsigned prev, next; |
| 577 | char __user *buf = ubuf; | 720 | char __user *buf = ubuf; |
| 578 | int i, err; | 721 | int i, err; |
| 579 | 722 | ||
| @@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, | |||
| 592 | } | 735 | } |
| 593 | 736 | ||
| 594 | err = 0; | 737 | err = 0; |
| 595 | for (i = 0; i < next; i++) { | 738 | prev = 0; |
| 596 | unsigned long start = jiffies; | 739 | do { |
| 597 | 740 | for (i = prev; i < next; i++) { | |
| 598 | while (!mcelog.entry[i].finished) { | 741 | unsigned long start = jiffies; |
| 599 | if (time_after_eq(jiffies, start + 2)) { | 742 | |
| 600 | memset(mcelog.entry + i,0, sizeof(struct mce)); | 743 | while (!mcelog.entry[i].finished) { |
| 601 | goto timeout; | 744 | if (time_after_eq(jiffies, start + 2)) { |
| 745 | memset(mcelog.entry + i, 0, | ||
| 746 | sizeof(struct mce)); | ||
| 747 | goto timeout; | ||
| 748 | } | ||
| 749 | cpu_relax(); | ||
| 602 | } | 750 | } |
| 603 | cpu_relax(); | 751 | smp_rmb(); |
| 752 | err |= copy_to_user(buf, mcelog.entry + i, | ||
| 753 | sizeof(struct mce)); | ||
| 754 | buf += sizeof(struct mce); | ||
| 755 | timeout: | ||
| 756 | ; | ||
| 604 | } | 757 | } |
| 605 | smp_rmb(); | ||
| 606 | err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); | ||
| 607 | buf += sizeof(struct mce); | ||
| 608 | timeout: | ||
| 609 | ; | ||
| 610 | } | ||
| 611 | 758 | ||
| 612 | memset(mcelog.entry, 0, next * sizeof(struct mce)); | 759 | memset(mcelog.entry + prev, 0, |
| 613 | mcelog.next = 0; | 760 | (next - prev) * sizeof(struct mce)); |
| 761 | prev = next; | ||
| 762 | next = cmpxchg(&mcelog.next, prev, 0); | ||
| 763 | } while (next != prev); | ||
| 614 | 764 | ||
| 615 | synchronize_sched(); | 765 | synchronize_sched(); |
| 616 | 766 | ||
| @@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = { | |||
| 680 | &mce_chrdev_ops, | 830 | &mce_chrdev_ops, |
| 681 | }; | 831 | }; |
| 682 | 832 | ||
| 683 | static unsigned long old_cr4 __initdata; | ||
| 684 | |||
| 685 | void __init stop_mce(void) | ||
| 686 | { | ||
| 687 | old_cr4 = read_cr4(); | ||
| 688 | clear_in_cr4(X86_CR4_MCE); | ||
| 689 | } | ||
| 690 | |||
| 691 | void __init restart_mce(void) | ||
| 692 | { | ||
| 693 | if (old_cr4 & X86_CR4_MCE) | ||
| 694 | set_in_cr4(X86_CR4_MCE); | ||
| 695 | } | ||
| 696 | |||
| 697 | /* | 833 | /* |
| 698 | * Old style boot options parsing. Only for compatibility. | 834 | * Old style boot options parsing. Only for compatibility. |
| 699 | */ | 835 | */ |
| @@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str) | |||
| 703 | return 1; | 839 | return 1; |
| 704 | } | 840 | } |
| 705 | 841 | ||
| 706 | /* mce=off disables machine check. Note you can re-enable it later | 842 | /* mce=off disables machine check. |
| 707 | using sysfs. | ||
| 708 | mce=TOLERANCELEVEL (number, see above) | 843 | mce=TOLERANCELEVEL (number, see above) |
| 709 | mce=bootlog Log MCEs from before booting. Disabled by default on AMD. | 844 | mce=bootlog Log MCEs from before booting. Disabled by default on AMD. |
| 710 | mce=nobootlog Don't log MCEs from before booting. */ | 845 | mce=nobootlog Don't log MCEs from before booting. */ |
| @@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable); | |||
| 728 | * Sysfs support | 863 | * Sysfs support |
| 729 | */ | 864 | */ |
| 730 | 865 | ||
| 866 | /* | ||
| 867 | * Disable machine checks on suspend and shutdown. We can't really handle | ||
| 868 | * them later. | ||
| 869 | */ | ||
| 870 | static int mce_disable(void) | ||
| 871 | { | ||
| 872 | int i; | ||
| 873 | |||
| 874 | for (i = 0; i < banks; i++) | ||
| 875 | wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); | ||
| 876 | return 0; | ||
| 877 | } | ||
| 878 | |||
| 879 | static int mce_suspend(struct sys_device *dev, pm_message_t state) | ||
| 880 | { | ||
| 881 | return mce_disable(); | ||
| 882 | } | ||
| 883 | |||
| 884 | static int mce_shutdown(struct sys_device *dev) | ||
| 885 | { | ||
| 886 | return mce_disable(); | ||
| 887 | } | ||
| 888 | |||
| 731 | /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. | 889 | /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. |
| 732 | Only one CPU is active at this time, the others get readded later using | 890 | Only one CPU is active at this time, the others get readded later using |
| 733 | CPU hotplug. */ | 891 | CPU hotplug. */ |
| @@ -738,20 +896,24 @@ static int mce_resume(struct sys_device *dev) | |||
| 738 | return 0; | 896 | return 0; |
| 739 | } | 897 | } |
| 740 | 898 | ||
| 899 | static void mce_cpu_restart(void *data) | ||
| 900 | { | ||
| 901 | del_timer_sync(&__get_cpu_var(mce_timer)); | ||
| 902 | if (mce_available(¤t_cpu_data)) | ||
| 903 | mce_init(NULL); | ||
| 904 | mce_init_timer(); | ||
| 905 | } | ||
| 906 | |||
| 741 | /* Reinit MCEs after user configuration changes */ | 907 | /* Reinit MCEs after user configuration changes */ |
| 742 | static void mce_restart(void) | 908 | static void mce_restart(void) |
| 743 | { | 909 | { |
| 744 | if (next_interval) | ||
| 745 | cancel_delayed_work(&mcheck_work); | ||
| 746 | /* Timer race is harmless here */ | ||
| 747 | on_each_cpu(mce_init, NULL, 1); | ||
| 748 | next_interval = check_interval * HZ; | 910 | next_interval = check_interval * HZ; |
| 749 | if (next_interval) | 911 | on_each_cpu(mce_cpu_restart, NULL, 1); |
| 750 | schedule_delayed_work(&mcheck_work, | ||
| 751 | round_jiffies_relative(next_interval)); | ||
| 752 | } | 912 | } |
| 753 | 913 | ||
| 754 | static struct sysdev_class mce_sysclass = { | 914 | static struct sysdev_class mce_sysclass = { |
| 915 | .suspend = mce_suspend, | ||
| 916 | .shutdown = mce_shutdown, | ||
| 755 | .resume = mce_resume, | 917 | .resume = mce_resume, |
| 756 | .name = "machinecheck", | 918 | .name = "machinecheck", |
| 757 | }; | 919 | }; |
| @@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit | |||
| 778 | } \ | 940 | } \ |
| 779 | static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); | 941 | static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); |
| 780 | 942 | ||
| 781 | /* | 943 | static struct sysdev_attribute *bank_attrs; |
| 782 | * TBD should generate these dynamically based on number of available banks. | 944 | |
| 783 | * Have only 6 contol banks in /sysfs until then. | 945 | static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, |
| 784 | */ | 946 | char *buf) |
| 785 | ACCESSOR(bank0ctl,bank[0],mce_restart()) | 947 | { |
| 786 | ACCESSOR(bank1ctl,bank[1],mce_restart()) | 948 | u64 b = bank[attr - bank_attrs]; |
| 787 | ACCESSOR(bank2ctl,bank[2],mce_restart()) | 949 | return sprintf(buf, "%llx\n", b); |
| 788 | ACCESSOR(bank3ctl,bank[3],mce_restart()) | 950 | } |
| 789 | ACCESSOR(bank4ctl,bank[4],mce_restart()) | 951 | |
| 790 | ACCESSOR(bank5ctl,bank[5],mce_restart()) | 952 | static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, |
| 953 | const char *buf, size_t siz) | ||
| 954 | { | ||
| 955 | char *end; | ||
| 956 | u64 new = simple_strtoull(buf, &end, 0); | ||
| 957 | if (end == buf) | ||
| 958 | return -EINVAL; | ||
| 959 | bank[attr - bank_attrs] = new; | ||
| 960 | mce_restart(); | ||
| 961 | return end-buf; | ||
| 962 | } | ||
| 791 | 963 | ||
| 792 | static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, | 964 | static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, |
| 793 | char *buf) | 965 | char *buf) |
| @@ -814,8 +986,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); | |||
| 814 | static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); | 986 | static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); |
| 815 | ACCESSOR(check_interval,check_interval,mce_restart()) | 987 | ACCESSOR(check_interval,check_interval,mce_restart()) |
| 816 | static struct sysdev_attribute *mce_attributes[] = { | 988 | static struct sysdev_attribute *mce_attributes[] = { |
| 817 | &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, | ||
| 818 | &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, | ||
| 819 | &attr_tolerant.attr, &attr_check_interval, &attr_trigger, | 989 | &attr_tolerant.attr, &attr_check_interval, &attr_trigger, |
| 820 | NULL | 990 | NULL |
| 821 | }; | 991 | }; |
| @@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu) | |||
| 845 | if (err) | 1015 | if (err) |
| 846 | goto error; | 1016 | goto error; |
| 847 | } | 1017 | } |
| 1018 | for (i = 0; i < banks; i++) { | ||
| 1019 | err = sysdev_create_file(&per_cpu(device_mce, cpu), | ||
| 1020 | &bank_attrs[i]); | ||
| 1021 | if (err) | ||
| 1022 | goto error2; | ||
| 1023 | } | ||
| 848 | cpu_set(cpu, mce_device_initialized); | 1024 | cpu_set(cpu, mce_device_initialized); |
| 849 | 1025 | ||
| 850 | return 0; | 1026 | return 0; |
| 1027 | error2: | ||
| 1028 | while (--i >= 0) { | ||
| 1029 | sysdev_remove_file(&per_cpu(device_mce, cpu), | ||
| 1030 | &bank_attrs[i]); | ||
| 1031 | } | ||
| 851 | error: | 1032 | error: |
| 852 | while (i--) { | 1033 | while (--i >= 0) { |
| 853 | sysdev_remove_file(&per_cpu(device_mce,cpu), | 1034 | sysdev_remove_file(&per_cpu(device_mce,cpu), |
| 854 | mce_attributes[i]); | 1035 | mce_attributes[i]); |
| 855 | } | 1036 | } |
| @@ -868,15 +1049,46 @@ static __cpuinit void mce_remove_device(unsigned int cpu) | |||
| 868 | for (i = 0; mce_attributes[i]; i++) | 1049 | for (i = 0; mce_attributes[i]; i++) |
| 869 | sysdev_remove_file(&per_cpu(device_mce,cpu), | 1050 | sysdev_remove_file(&per_cpu(device_mce,cpu), |
| 870 | mce_attributes[i]); | 1051 | mce_attributes[i]); |
| 1052 | for (i = 0; i < banks; i++) | ||
| 1053 | sysdev_remove_file(&per_cpu(device_mce, cpu), | ||
| 1054 | &bank_attrs[i]); | ||
| 871 | sysdev_unregister(&per_cpu(device_mce,cpu)); | 1055 | sysdev_unregister(&per_cpu(device_mce,cpu)); |
| 872 | cpu_clear(cpu, mce_device_initialized); | 1056 | cpu_clear(cpu, mce_device_initialized); |
| 873 | } | 1057 | } |
| 874 | 1058 | ||
| 1059 | /* Make sure there are no machine checks on offlined CPUs. */ | ||
| 1060 | static void mce_disable_cpu(void *h) | ||
| 1061 | { | ||
| 1062 | int i; | ||
| 1063 | unsigned long action = *(unsigned long *)h; | ||
| 1064 | |||
| 1065 | if (!mce_available(¤t_cpu_data)) | ||
| 1066 | return; | ||
| 1067 | if (!(action & CPU_TASKS_FROZEN)) | ||
| 1068 | cmci_clear(); | ||
| 1069 | for (i = 0; i < banks; i++) | ||
| 1070 | wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); | ||
| 1071 | } | ||
| 1072 | |||
| 1073 | static void mce_reenable_cpu(void *h) | ||
| 1074 | { | ||
| 1075 | int i; | ||
| 1076 | unsigned long action = *(unsigned long *)h; | ||
| 1077 | |||
| 1078 | if (!mce_available(¤t_cpu_data)) | ||
| 1079 | return; | ||
| 1080 | if (!(action & CPU_TASKS_FROZEN)) | ||
| 1081 | cmci_reenable(); | ||
| 1082 | for (i = 0; i < banks; i++) | ||
| 1083 | wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); | ||
| 1084 | } | ||
| 1085 | |||
| 875 | /* Get notified when a cpu comes on/off. Be hotplug friendly. */ | 1086 | /* Get notified when a cpu comes on/off. Be hotplug friendly. */ |
| 876 | static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, | 1087 | static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, |
| 877 | unsigned long action, void *hcpu) | 1088 | unsigned long action, void *hcpu) |
| 878 | { | 1089 | { |
| 879 | unsigned int cpu = (unsigned long)hcpu; | 1090 | unsigned int cpu = (unsigned long)hcpu; |
| 1091 | struct timer_list *t = &per_cpu(mce_timer, cpu); | ||
| 880 | 1092 | ||
| 881 | switch (action) { | 1093 | switch (action) { |
| 882 | case CPU_ONLINE: | 1094 | case CPU_ONLINE: |
| @@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, | |||
| 891 | threshold_cpu_callback(action, cpu); | 1103 | threshold_cpu_callback(action, cpu); |
| 892 | mce_remove_device(cpu); | 1104 | mce_remove_device(cpu); |
| 893 | break; | 1105 | break; |
| 1106 | case CPU_DOWN_PREPARE: | ||
| 1107 | case CPU_DOWN_PREPARE_FROZEN: | ||
| 1108 | del_timer_sync(t); | ||
| 1109 | smp_call_function_single(cpu, mce_disable_cpu, &action, 1); | ||
| 1110 | break; | ||
| 1111 | case CPU_DOWN_FAILED: | ||
| 1112 | case CPU_DOWN_FAILED_FROZEN: | ||
| 1113 | t->expires = round_jiffies(jiffies + next_interval); | ||
| 1114 | add_timer_on(t, cpu); | ||
| 1115 | smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); | ||
| 1116 | break; | ||
| 1117 | case CPU_POST_DEAD: | ||
| 1118 | /* intentionally ignoring frozen here */ | ||
| 1119 | cmci_rediscover(cpu); | ||
| 1120 | break; | ||
| 894 | } | 1121 | } |
| 895 | return NOTIFY_OK; | 1122 | return NOTIFY_OK; |
| 896 | } | 1123 | } |
| @@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = { | |||
| 899 | .notifier_call = mce_cpu_callback, | 1126 | .notifier_call = mce_cpu_callback, |
| 900 | }; | 1127 | }; |
| 901 | 1128 | ||
| 1129 | static __init int mce_init_banks(void) | ||
| 1130 | { | ||
| 1131 | int i; | ||
| 1132 | |||
| 1133 | bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, | ||
| 1134 | GFP_KERNEL); | ||
| 1135 | if (!bank_attrs) | ||
| 1136 | return -ENOMEM; | ||
| 1137 | |||
| 1138 | for (i = 0; i < banks; i++) { | ||
| 1139 | struct sysdev_attribute *a = &bank_attrs[i]; | ||
| 1140 | a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); | ||
| 1141 | if (!a->attr.name) | ||
| 1142 | goto nomem; | ||
| 1143 | a->attr.mode = 0644; | ||
| 1144 | a->show = show_bank; | ||
| 1145 | a->store = set_bank; | ||
| 1146 | } | ||
| 1147 | return 0; | ||
| 1148 | |||
| 1149 | nomem: | ||
| 1150 | while (--i >= 0) | ||
| 1151 | kfree(bank_attrs[i].attr.name); | ||
| 1152 | kfree(bank_attrs); | ||
| 1153 | bank_attrs = NULL; | ||
| 1154 | return -ENOMEM; | ||
| 1155 | } | ||
| 1156 | |||
| 902 | static __init int mce_init_device(void) | 1157 | static __init int mce_init_device(void) |
| 903 | { | 1158 | { |
| 904 | int err; | 1159 | int err; |
| @@ -906,6 +1161,11 @@ static __init int mce_init_device(void) | |||
| 906 | 1161 | ||
| 907 | if (!mce_available(&boot_cpu_data)) | 1162 | if (!mce_available(&boot_cpu_data)) |
| 908 | return -EIO; | 1163 | return -EIO; |
| 1164 | |||
| 1165 | err = mce_init_banks(); | ||
| 1166 | if (err) | ||
| 1167 | return err; | ||
| 1168 | |||
| 909 | err = sysdev_class_register(&mce_sysclass); | 1169 | err = sysdev_class_register(&mce_sysclass); |
| 910 | if (err) | 1170 | if (err) |
| 911 | return err; | 1171 | return err; |
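The rewritten mce_read() above drains mcelog in rounds: it copies out entries [prev, next), then tries to swing mcelog.next back to 0 with cmpxchg, and repeats if new records were appended in the meantime. The following stand-alone sketch shows the same drain-until-stable pattern; the ring/record/consume names are illustrative, C11 atomics stand in for the kernel's cmpxchg(), and the per-entry finished/timeout handling is simplified to a skip, so this is a userspace analogue rather than the kernel code itself.

    #include <stdatomic.h>
    #include <string.h>

    #define RING_SIZE 32

    struct record { int finished; int payload; };

    static struct record ring[RING_SIZE];
    static atomic_uint ring_next;             /* producer index, like mcelog.next */

    static void consume(struct record *r)     /* stand-in for copy_to_user()      */
    {
        (void)r;
    }

    static void drain_ring(void)
    {
        unsigned int prev = 0;
        unsigned int next = atomic_load(&ring_next);

        do {
            /* copy out everything published since the last round */
            for (unsigned int i = prev; i < next; i++) {
                if (ring[i].finished)
                    consume(&ring[i]);
                memset(&ring[i], 0, sizeof(ring[i]));
            }
            prev = next;                      /* expected value for the CAS */
            /*
             * Try to reset the index to 0.  On failure 'next' is updated to
             * the current index and the outer loop handles the new entries;
             * on success next == prev and the loop terminates.
             */
            atomic_compare_exchange_strong(&ring_next, &next, 0);
        } while (next != prev);
    }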
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 9817506dd469..c5a32f92d07e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c | |||
| @@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = { | |||
| 79 | 79 | ||
| 80 | static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ | 80 | static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ |
| 81 | 81 | ||
| 82 | static void amd_threshold_interrupt(void); | ||
| 83 | |||
| 82 | /* | 84 | /* |
| 83 | * CPU Initialization | 85 | * CPU Initialization |
| 84 | */ | 86 | */ |
| @@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) | |||
| 174 | tr.reset = 0; | 176 | tr.reset = 0; |
| 175 | tr.old_limit = 0; | 177 | tr.old_limit = 0; |
| 176 | threshold_restart_bank(&tr); | 178 | threshold_restart_bank(&tr); |
| 179 | |||
| 180 | mce_threshold_vector = amd_threshold_interrupt; | ||
| 177 | } | 181 | } |
| 178 | } | 182 | } |
| 179 | } | 183 | } |
| @@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) | |||
| 187 | * the interrupt goes off when error_count reaches threshold_limit. | 191 | * the interrupt goes off when error_count reaches threshold_limit. |
| 188 | * the handler will simply log mcelog w/ software defined bank number. | 192 | * the handler will simply log mcelog w/ software defined bank number. |
| 189 | */ | 193 | */ |
| 190 | asmlinkage void mce_threshold_interrupt(void) | 194 | static void amd_threshold_interrupt(void) |
| 191 | { | 195 | { |
| 192 | unsigned int bank, block; | 196 | unsigned int bank, block; |
| 193 | struct mce m; | 197 | struct mce m; |
| 194 | u32 low = 0, high = 0, address = 0; | 198 | u32 low = 0, high = 0, address = 0; |
| 195 | 199 | ||
| 196 | ack_APIC_irq(); | 200 | mce_setup(&m); |
| 197 | exit_idle(); | ||
| 198 | irq_enter(); | ||
| 199 | |||
| 200 | memset(&m, 0, sizeof(m)); | ||
| 201 | rdtscll(m.tsc); | ||
| 202 | m.cpu = smp_processor_id(); | ||
| 203 | 201 | ||
| 204 | /* assume first bank caused it */ | 202 | /* assume first bank caused it */ |
| 205 | for (bank = 0; bank < NR_BANKS; ++bank) { | 203 | for (bank = 0; bank < NR_BANKS; ++bank) { |
| @@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void) | |||
| 233 | 231 | ||
| 234 | /* Log the machine check that caused the threshold | 232 | /* Log the machine check that caused the threshold |
| 235 | event. */ | 233 | event. */ |
| 236 | do_machine_check(NULL, 0); | 234 | machine_check_poll(MCP_TIMESTAMP, |
| 235 | &__get_cpu_var(mce_poll_banks)); | ||
| 237 | 236 | ||
| 238 | if (high & MASK_OVERFLOW_HI) { | 237 | if (high & MASK_OVERFLOW_HI) { |
| 239 | rdmsrl(address, m.misc); | 238 | rdmsrl(address, m.misc); |
| @@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void) | |||
| 243 | + bank * NR_BLOCKS | 242 | + bank * NR_BLOCKS |
| 244 | + block; | 243 | + block; |
| 245 | mce_log(&m); | 244 | mce_log(&m); |
| 246 | goto out; | 245 | return; |
| 247 | } | 246 | } |
| 248 | } | 247 | } |
| 249 | } | 248 | } |
| 250 | out: | ||
| 251 | inc_irq_stat(irq_threshold_count); | ||
| 252 | irq_exit(); | ||
| 253 | } | 249 | } |
| 254 | 250 | ||
| 255 | /* | 251 | /* |
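With this change the AMD code no longer implements the threshold interrupt entry itself; it just installs amd_threshold_interrupt() into the mce_threshold_vector function pointer that the new common handler (threshold.c, below) calls. A minimal userspace sketch of that dispatch shape, using the names from the patch but with printf() bodies standing in for the real work:

    #include <stdio.h>

    static void default_threshold_interrupt(void)
    {
        printf("Unexpected threshold interrupt\n");
    }

    /* common code calls through this pointer from the interrupt entry */
    static void (*mce_threshold_vector)(void) = default_threshold_interrupt;

    static void amd_threshold_interrupt(void)
    {
        printf("AMD: scan threshold blocks, log overflows\n");
    }

    static void mce_amd_feature_init(void)
    {
        /* vendor init hooks its handler in, as the patch does */
        mce_threshold_vector = amd_threshold_interrupt;
    }

    int main(void)
    {
        mce_threshold_vector();      /* before init: default handler   */
        mce_amd_feature_init();
        mce_threshold_vector();      /* after vendor init: AMD handler */
        return 0;
    }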
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index aa5e287c98e0..aaa7d9730938 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c | |||
| @@ -1,6 +1,8 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Intel specific MCE features. | 2 | * Intel specific MCE features. |
| 3 | * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> | 3 | * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> |
| 4 | * Copyright (C) 2008, 2009 Intel Corporation | ||
| 5 | * Author: Andi Kleen | ||
| 4 | */ | 6 | */ |
| 5 | 7 | ||
| 6 | #include <linux/init.h> | 8 | #include <linux/init.h> |
| @@ -13,6 +15,7 @@ | |||
| 13 | #include <asm/hw_irq.h> | 15 | #include <asm/hw_irq.h> |
| 14 | #include <asm/idle.h> | 16 | #include <asm/idle.h> |
| 15 | #include <asm/therm_throt.h> | 17 | #include <asm/therm_throt.h> |
| 18 | #include <asm/apic.h> | ||
| 16 | 19 | ||
| 17 | asmlinkage void smp_thermal_interrupt(void) | 20 | asmlinkage void smp_thermal_interrupt(void) |
| 18 | { | 21 | { |
| @@ -25,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void) | |||
| 25 | 28 | ||
| 26 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); | 29 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); |
| 27 | if (therm_throt_process(msr_val & 1)) | 30 | if (therm_throt_process(msr_val & 1)) |
| 28 | mce_log_therm_throt_event(smp_processor_id(), msr_val); | 31 | mce_log_therm_throt_event(msr_val); |
| 29 | 32 | ||
| 30 | inc_irq_stat(irq_thermal_count); | 33 | inc_irq_stat(irq_thermal_count); |
| 31 | irq_exit(); | 34 | irq_exit(); |
| @@ -85,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) | |||
| 85 | return; | 88 | return; |
| 86 | } | 89 | } |
| 87 | 90 | ||
| 91 | /* | ||
| 92 | * Support for Intel Corrected Machine Check Interrupts (CMCI). This allows | ||
| 93 | * the CPU to raise an interrupt when a corrected machine check happened. | ||
| 94 | * Normally we pick those up using a regular polling timer. | ||
| 95 | * Also supports reliable discovery of shared banks. | ||
| 96 | */ | ||
| 97 | |||
| 98 | static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); | ||
| 99 | |||
| 100 | /* | ||
| 101 | * cmci_discover_lock protects against parallel discovery attempts | ||
| 102 | * which could race against each other. | ||
| 103 | */ | ||
| 104 | static DEFINE_SPINLOCK(cmci_discover_lock); | ||
| 105 | |||
| 106 | #define CMCI_THRESHOLD 1 | ||
| 107 | |||
| 108 | static int cmci_supported(int *banks) | ||
| 109 | { | ||
| 110 | u64 cap; | ||
| 111 | |||
| 112 | /* | ||
| 113 | * The vendor check is not strictly needed, but the initial | ||
| 114 | * setup is vendor keyed, and this makes sure none of the | ||
| 115 | * vendor backdoors are entered otherwise. | ||
| 116 | */ | ||
| 117 | if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) | ||
| 118 | return 0; | ||
| 119 | if (!cpu_has_apic || lapic_get_maxlvt() < 6) | ||
| 120 | return 0; | ||
| 121 | rdmsrl(MSR_IA32_MCG_CAP, cap); | ||
| 122 | *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); | ||
| 123 | return !!(cap & MCG_CMCI_P); | ||
| 124 | } | ||
| 125 | |||
| 126 | /* | ||
| 127 | * The interrupt handler. This is called on every event. | ||
| 128 | * Just call the poller directly to log any events. | ||
| 129 | * This could in theory increase the threshold under high load, | ||
| 130 | * but doesn't for now. | ||
| 131 | */ | ||
| 132 | static void intel_threshold_interrupt(void) | ||
| 133 | { | ||
| 134 | machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); | ||
| 135 | mce_notify_user(); | ||
| 136 | } | ||
| 137 | |||
| 138 | static void print_update(char *type, int *hdr, int num) | ||
| 139 | { | ||
| 140 | if (*hdr == 0) | ||
| 141 | printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); | ||
| 142 | *hdr = 1; | ||
| 143 | printk(KERN_CONT " %s:%d", type, num); | ||
| 144 | } | ||
| 145 | |||
| 146 | /* | ||
| 147 | * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks | ||
| 148 | * on this CPU. Use the algorithm recommended in the SDM to discover shared | ||
| 149 | * banks. | ||
| 150 | */ | ||
| 151 | static void cmci_discover(int banks, int boot) | ||
| 152 | { | ||
| 153 | unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); | ||
| 154 | int hdr = 0; | ||
| 155 | int i; | ||
| 156 | |||
| 157 | spin_lock(&cmci_discover_lock); | ||
| 158 | for (i = 0; i < banks; i++) { | ||
| 159 | u64 val; | ||
| 160 | |||
| 161 | if (test_bit(i, owned)) | ||
| 162 | continue; | ||
| 163 | |||
| 164 | rdmsrl(MSR_IA32_MC0_CTL2 + i, val); | ||
| 165 | |||
| 166 | /* Already owned by someone else? */ | ||
| 167 | if (val & CMCI_EN) { | ||
| 168 | if (test_and_clear_bit(i, owned) || boot) | ||
| 169 | print_update("SHD", &hdr, i); | ||
| 170 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); | ||
| 171 | continue; | ||
| 172 | } | ||
| 173 | |||
| 174 | val |= CMCI_EN | CMCI_THRESHOLD; | ||
| 175 | wrmsrl(MSR_IA32_MC0_CTL2 + i, val); | ||
| 176 | rdmsrl(MSR_IA32_MC0_CTL2 + i, val); | ||
| 177 | |||
| 178 | /* Did the enable bit stick? -- the bank supports CMCI */ | ||
| 179 | if (val & CMCI_EN) { | ||
| 180 | if (!test_and_set_bit(i, owned) || boot) | ||
| 181 | print_update("CMCI", &hdr, i); | ||
| 182 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); | ||
| 183 | } else { | ||
| 184 | WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); | ||
| 185 | } | ||
| 186 | } | ||
| 187 | spin_unlock(&cmci_discover_lock); | ||
| 188 | if (hdr) | ||
| 189 | printk(KERN_CONT "\n"); | ||
| 190 | } | ||
| 191 | |||
| 192 | /* | ||
| 193 | * Just in case we missed an event during initialization, check | ||
| 194 | * all the CMCI owned banks. | ||
| 195 | */ | ||
| 196 | void cmci_recheck(void) | ||
| 197 | { | ||
| 198 | unsigned long flags; | ||
| 199 | int banks; | ||
| 200 | |||
| 201 | if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) | ||
| 202 | return; | ||
| 203 | local_irq_save(flags); | ||
| 204 | machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); | ||
| 205 | local_irq_restore(flags); | ||
| 206 | } | ||
| 207 | |||
| 208 | /* | ||
| 209 | * Disable CMCI on this CPU for all banks it owns when it goes down. | ||
| 210 | * This allows other CPUs to claim the banks on rediscovery. | ||
| 211 | */ | ||
| 212 | void cmci_clear(void) | ||
| 213 | { | ||
| 214 | int i; | ||
| 215 | int banks; | ||
| 216 | u64 val; | ||
| 217 | |||
| 218 | if (!cmci_supported(&banks)) | ||
| 219 | return; | ||
| 220 | spin_lock(&cmci_discover_lock); | ||
| 221 | for (i = 0; i < banks; i++) { | ||
| 222 | if (!test_bit(i, __get_cpu_var(mce_banks_owned))) | ||
| 223 | continue; | ||
| 224 | /* Disable CMCI */ | ||
| 225 | rdmsrl(MSR_IA32_MC0_CTL2 + i, val); | ||
| 226 | val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); | ||
| 227 | wrmsrl(MSR_IA32_MC0_CTL2 + i, val); | ||
| 228 | __clear_bit(i, __get_cpu_var(mce_banks_owned)); | ||
| 229 | } | ||
| 230 | spin_unlock(&cmci_discover_lock); | ||
| 231 | } | ||
| 232 | |||
| 233 | /* | ||
| 234 | * After a CPU went down, cycle through all the others and rediscover. | ||
| 235 | * Must run in process context. | ||
| 236 | */ | ||
| 237 | void cmci_rediscover(int dying) | ||
| 238 | { | ||
| 239 | int banks; | ||
| 240 | int cpu; | ||
| 241 | cpumask_var_t old; | ||
| 242 | |||
| 243 | if (!cmci_supported(&banks)) | ||
| 244 | return; | ||
| 245 | if (!alloc_cpumask_var(&old, GFP_KERNEL)) | ||
| 246 | return; | ||
| 247 | cpumask_copy(old, ¤t->cpus_allowed); | ||
| 248 | |||
| 249 | for_each_online_cpu (cpu) { | ||
| 250 | if (cpu == dying) | ||
| 251 | continue; | ||
| 252 | if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu))) | ||
| 253 | continue; | ||
| 254 | /* Recheck banks in case CPUs don't all have the same */ | ||
| 255 | if (cmci_supported(&banks)) | ||
| 256 | cmci_discover(banks, 0); | ||
| 257 | } | ||
| 258 | |||
| 259 | set_cpus_allowed_ptr(current, old); | ||
| 260 | free_cpumask_var(old); | ||
| 261 | } | ||
| 262 | |||
| 263 | /* | ||
| 264 | * Reenable CMCI on this CPU in case a CPU down failed. | ||
| 265 | */ | ||
| 266 | void cmci_reenable(void) | ||
| 267 | { | ||
| 268 | int banks; | ||
| 269 | if (cmci_supported(&banks)) | ||
| 270 | cmci_discover(banks, 0); | ||
| 271 | } | ||
| 272 | |||
| 273 | static __cpuinit void intel_init_cmci(void) | ||
| 274 | { | ||
| 275 | int banks; | ||
| 276 | |||
| 277 | if (!cmci_supported(&banks)) | ||
| 278 | return; | ||
| 279 | |||
| 280 | mce_threshold_vector = intel_threshold_interrupt; | ||
| 281 | cmci_discover(banks, 1); | ||
| 282 | /* | ||
| 283 | * For CPU #0 this runs with still disabled APIC, but that's | ||
| 284 | * ok because only the vector is set up. We still do another | ||
| 285 | * check for the banks later for CPU #0 just to make sure | ||
| 286 | * to not miss any events. | ||
| 287 | */ | ||
| 288 | apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); | ||
| 289 | cmci_recheck(); | ||
| 290 | } | ||
| 291 | |||
| 88 | void mce_intel_feature_init(struct cpuinfo_x86 *c) | 292 | void mce_intel_feature_init(struct cpuinfo_x86 *c) |
| 89 | { | 293 | { |
| 90 | intel_init_thermal(c); | 294 | intel_init_thermal(c); |
| 295 | intel_init_cmci(); | ||
| 91 | } | 296 | } |
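The core of cmci_discover() above is a claim-by-readback test: set CMCI_EN in the bank's CTL2 MSR, read it back, and treat the bank as owned only if the bit stuck (it stays clear on banks without CMCI support, and it is already set on banks another CPU claimed first). A kernel-context sketch of just that probe, assuming the rdmsrl/wrmsrl helpers and the CMCI_EN/CMCI_THRESHOLD/MSR_IA32_MC0_CTL2 definitions used elsewhere in this patch; the cmci_claim_bank() name and the omission of the discover lock and logging are simplifications:

    /* Returns nonzero if this CPU successfully claimed CMCI for 'bank'. */
    static int cmci_claim_bank(int bank)
    {
        u64 val;

        rdmsrl(MSR_IA32_MC0_CTL2 + bank, val);

        if (val & CMCI_EN)                  /* already owned by another CPU */
            return 0;

        val |= CMCI_EN | CMCI_THRESHOLD;    /* try to claim, threshold = 1  */
        wrmsrl(MSR_IA32_MC0_CTL2 + bank, val);
        rdmsrl(MSR_IA32_MC0_CTL2 + bank, val);

        /* the enable bit sticks only if the bank supports CMCI */
        return (val & CMCI_EN) != 0;
    }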
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c new file mode 100644 index 000000000000..23ee9e730f78 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/threshold.c | |||
| @@ -0,0 +1,29 @@ | |||
| 1 | /* | ||
| 2 | * Common corrected MCE threshold handler code: | ||
| 3 | */ | ||
| 4 | #include <linux/interrupt.h> | ||
| 5 | #include <linux/kernel.h> | ||
| 6 | |||
| 7 | #include <asm/irq_vectors.h> | ||
| 8 | #include <asm/apic.h> | ||
| 9 | #include <asm/idle.h> | ||
| 10 | #include <asm/mce.h> | ||
| 11 | |||
| 12 | static void default_threshold_interrupt(void) | ||
| 13 | { | ||
| 14 | printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n", | ||
| 15 | THRESHOLD_APIC_VECTOR); | ||
| 16 | } | ||
| 17 | |||
| 18 | void (*mce_threshold_vector)(void) = default_threshold_interrupt; | ||
| 19 | |||
| 20 | asmlinkage void mce_threshold_interrupt(void) | ||
| 21 | { | ||
| 22 | exit_idle(); | ||
| 23 | irq_enter(); | ||
| 24 | inc_irq_stat(irq_threshold_count); | ||
| 25 | mce_threshold_vector(); | ||
| 26 | irq_exit(); | ||
| 27 | /* Ack only at the end to avoid potential reentry */ | ||
| 28 | ack_APIC_irq(); | ||
| 29 | } | ||
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 169a120587be..87b67e3a765a 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c | |||
| @@ -729,7 +729,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, | |||
| 729 | 729 | ||
| 730 | spin_unlock_irqrestore(&ds_lock, irq); | 730 | spin_unlock_irqrestore(&ds_lock, irq); |
| 731 | 731 | ||
| 732 | ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); | 732 | ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); |
| 733 | ds_resume_pebs(tracer); | 733 | ds_resume_pebs(tracer); |
| 734 | 734 | ||
| 735 | return tracer; | 735 | return tracer; |
| @@ -1029,5 +1029,4 @@ void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) | |||
| 1029 | 1029 | ||
| 1030 | void ds_exit_thread(struct task_struct *tsk) | 1030 | void ds_exit_thread(struct task_struct *tsk) |
| 1031 | { | 1031 | { |
| 1032 | WARN_ON(tsk->thread.ds_ctx); | ||
| 1033 | } | 1032 | } |
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index b205272ad394..1736acc4d7aa 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c | |||
| @@ -469,7 +469,7 @@ void __init efi_enter_virtual_mode(void) | |||
| 469 | efi_memory_desc_t *md; | 469 | efi_memory_desc_t *md; |
| 470 | efi_status_t status; | 470 | efi_status_t status; |
| 471 | unsigned long size; | 471 | unsigned long size; |
| 472 | u64 end, systab, addr, npages; | 472 | u64 end, systab, addr, npages, end_pfn; |
| 473 | void *p, *va; | 473 | void *p, *va; |
| 474 | 474 | ||
| 475 | efi.systab = NULL; | 475 | efi.systab = NULL; |
| @@ -481,7 +481,10 @@ void __init efi_enter_virtual_mode(void) | |||
| 481 | size = md->num_pages << EFI_PAGE_SHIFT; | 481 | size = md->num_pages << EFI_PAGE_SHIFT; |
| 482 | end = md->phys_addr + size; | 482 | end = md->phys_addr + size; |
| 483 | 483 | ||
| 484 | if (PFN_UP(end) <= max_low_pfn_mapped) | 484 | end_pfn = PFN_UP(end); |
| 485 | if (end_pfn <= max_low_pfn_mapped | ||
| 486 | || (end_pfn > (1UL << (32 - PAGE_SHIFT)) | ||
| 487 | && end_pfn <= max_pfn_mapped)) | ||
| 485 | va = __va(md->phys_addr); | 488 | va = __va(md->phys_addr); |
| 486 | else | 489 | else |
| 487 | va = efi_ioremap(md->phys_addr, size); | 490 | va = efi_ioremap(md->phys_addr, size); |
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index a4ee29127fdf..22c3b7828c50 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c | |||
| @@ -100,24 +100,11 @@ void __init efi_call_phys_epilog(void) | |||
| 100 | 100 | ||
| 101 | void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) | 101 | void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) |
| 102 | { | 102 | { |
| 103 | static unsigned pages_mapped __initdata; | 103 | unsigned long last_map_pfn; |
| 104 | unsigned i, pages; | ||
| 105 | unsigned long offset; | ||
| 106 | 104 | ||
| 107 | pages = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr); | 105 | last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); |
| 108 | offset = phys_addr & ~PAGE_MASK; | 106 | if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) |
| 109 | phys_addr &= PAGE_MASK; | ||
| 110 | |||
| 111 | if (pages_mapped + pages > MAX_EFI_IO_PAGES) | ||
| 112 | return NULL; | 107 | return NULL; |
| 113 | 108 | ||
| 114 | for (i = 0; i < pages; i++) { | 109 | return (void __iomem *)__va(phys_addr); |
| 115 | __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped, | ||
| 116 | phys_addr, PAGE_KERNEL); | ||
| 117 | phys_addr += PAGE_SIZE; | ||
| 118 | pages_mapped++; | ||
| 119 | } | ||
| 120 | |||
| 121 | return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \ | ||
| 122 | (pages_mapped - pages)) + offset; | ||
| 123 | } | 110 | } |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 83d1836b9467..7ba4621c0dfa 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
| @@ -984,6 +984,8 @@ apicinterrupt UV_BAU_MESSAGE \ | |||
| 984 | #endif | 984 | #endif |
| 985 | apicinterrupt LOCAL_TIMER_VECTOR \ | 985 | apicinterrupt LOCAL_TIMER_VECTOR \ |
| 986 | apic_timer_interrupt smp_apic_timer_interrupt | 986 | apic_timer_interrupt smp_apic_timer_interrupt |
| 987 | apicinterrupt GENERIC_INTERRUPT_VECTOR \ | ||
| 988 | generic_interrupt smp_generic_interrupt | ||
| 987 | 989 | ||
| 988 | #ifdef CONFIG_SMP | 990 | #ifdef CONFIG_SMP |
| 989 | apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ | 991 | apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ |
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index b0f61f0dcd0a..f2f8540a7f3d 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c | |||
| @@ -136,7 +136,7 @@ int init_fpu(struct task_struct *tsk) | |||
| 136 | #ifdef CONFIG_X86_32 | 136 | #ifdef CONFIG_X86_32 |
| 137 | if (!HAVE_HWFP) { | 137 | if (!HAVE_HWFP) { |
| 138 | memset(tsk->thread.xstate, 0, xstate_size); | 138 | memset(tsk->thread.xstate, 0, xstate_size); |
| 139 | finit(); | 139 | finit_task(tsk); |
| 140 | set_stopped_child_used_math(tsk); | 140 | set_stopped_child_used_math(tsk); |
| 141 | return 0; | 141 | return 0; |
| 142 | } | 142 | } |
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index f13ca1650aaf..b864341dcc45 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c | |||
| @@ -15,6 +15,9 @@ | |||
| 15 | 15 | ||
| 16 | atomic_t irq_err_count; | 16 | atomic_t irq_err_count; |
| 17 | 17 | ||
| 18 | /* Function pointer for generic interrupt vector handling */ | ||
| 19 | void (*generic_interrupt_extension)(void) = NULL; | ||
| 20 | |||
| 18 | /* | 21 | /* |
| 19 | * 'what should we do if we get a hw irq event on an illegal vector'. | 22 | * 'what should we do if we get a hw irq event on an illegal vector'. |
| 20 | * each architecture has to answer this themselves. | 23 | * each architecture has to answer this themselves. |
| @@ -56,6 +59,12 @@ static int show_other_interrupts(struct seq_file *p) | |||
| 56 | seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); | 59 | seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); |
| 57 | seq_printf(p, " Local timer interrupts\n"); | 60 | seq_printf(p, " Local timer interrupts\n"); |
| 58 | #endif | 61 | #endif |
| 62 | if (generic_interrupt_extension) { | ||
| 63 | seq_printf(p, "PLT: "); | ||
| 64 | for_each_online_cpu(j) | ||
| 65 | seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); | ||
| 66 | seq_printf(p, " Platform interrupts\n"); | ||
| 67 | } | ||
| 59 | #ifdef CONFIG_SMP | 68 | #ifdef CONFIG_SMP |
| 60 | seq_printf(p, "RES: "); | 69 | seq_printf(p, "RES: "); |
| 61 | for_each_online_cpu(j) | 70 | for_each_online_cpu(j) |
| @@ -163,6 +172,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu) | |||
| 163 | #ifdef CONFIG_X86_LOCAL_APIC | 172 | #ifdef CONFIG_X86_LOCAL_APIC |
| 164 | sum += irq_stats(cpu)->apic_timer_irqs; | 173 | sum += irq_stats(cpu)->apic_timer_irqs; |
| 165 | #endif | 174 | #endif |
| 175 | if (generic_interrupt_extension) | ||
| 176 | sum += irq_stats(cpu)->generic_irqs; | ||
| 166 | #ifdef CONFIG_SMP | 177 | #ifdef CONFIG_SMP |
| 167 | sum += irq_stats(cpu)->irq_resched_count; | 178 | sum += irq_stats(cpu)->irq_resched_count; |
| 168 | sum += irq_stats(cpu)->irq_call_count; | 179 | sum += irq_stats(cpu)->irq_call_count; |
| @@ -226,4 +237,27 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) | |||
| 226 | return 1; | 237 | return 1; |
| 227 | } | 238 | } |
| 228 | 239 | ||
| 240 | /* | ||
| 241 | * Handler for GENERIC_INTERRUPT_VECTOR. | ||
| 242 | */ | ||
| 243 | void smp_generic_interrupt(struct pt_regs *regs) | ||
| 244 | { | ||
| 245 | struct pt_regs *old_regs = set_irq_regs(regs); | ||
| 246 | |||
| 247 | ack_APIC_irq(); | ||
| 248 | |||
| 249 | exit_idle(); | ||
| 250 | |||
| 251 | irq_enter(); | ||
| 252 | |||
| 253 | inc_irq_stat(generic_irqs); | ||
| 254 | |||
| 255 | if (generic_interrupt_extension) | ||
| 256 | generic_interrupt_extension(); | ||
| 257 | |||
| 258 | irq_exit(); | ||
| 259 | |||
| 260 | set_irq_regs(old_regs); | ||
| 261 | } | ||
| 262 | |||
| 229 | EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); | 263 | EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); |
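smp_generic_interrupt() above only acks the vector, bumps the generic_irqs count, and calls through generic_interrupt_extension if something registered one. A sketch of how platform code would be expected to hook it, assuming kernel context; the uv_platform_* names are hypothetical placeholders, while the generic_interrupt_extension pointer and GENERIC_INTERRUPT_VECTOR come from the patch:

    #include <linux/init.h>

    /* declared by the irq code added in this patch */
    extern void (*generic_interrupt_extension)(void);

    static void uv_platform_interrupt(void)
    {
        /* platform-specific acknowledgement and event handling */
    }

    static int __init uv_platform_irq_init(void)
    {
        /* after this, hardware raising GENERIC_INTERRUPT_VECTOR lands here */
        generic_interrupt_extension = uv_platform_interrupt;
        return 0;
    }
    early_initcall(uv_platform_irq_init);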
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 9dc6b2b24275..3b09634a5153 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c | |||
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/cpu.h> | 16 | #include <linux/cpu.h> |
| 17 | #include <linux/delay.h> | 17 | #include <linux/delay.h> |
| 18 | #include <linux/uaccess.h> | 18 | #include <linux/uaccess.h> |
| 19 | #include <linux/percpu.h> | ||
| 19 | 20 | ||
| 20 | #include <asm/apic.h> | 21 | #include <asm/apic.h> |
| 21 | 22 | ||
| @@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { } | |||
| 55 | union irq_ctx { | 56 | union irq_ctx { |
| 56 | struct thread_info tinfo; | 57 | struct thread_info tinfo; |
| 57 | u32 stack[THREAD_SIZE/sizeof(u32)]; | 58 | u32 stack[THREAD_SIZE/sizeof(u32)]; |
| 58 | }; | 59 | } __attribute__((aligned(PAGE_SIZE))); |
| 59 | 60 | ||
| 60 | static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; | 61 | static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); |
| 61 | static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; | 62 | static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); |
| 62 | 63 | ||
| 63 | static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; | 64 | static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack); |
| 64 | static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; | 65 | static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack); |
| 65 | 66 | ||
| 66 | static void call_on_stack(void *func, void *stack) | 67 | static void call_on_stack(void *func, void *stack) |
| 67 | { | 68 | { |
| @@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) | |||
| 81 | u32 *isp, arg1, arg2; | 82 | u32 *isp, arg1, arg2; |
| 82 | 83 | ||
| 83 | curctx = (union irq_ctx *) current_thread_info(); | 84 | curctx = (union irq_ctx *) current_thread_info(); |
| 84 | irqctx = hardirq_ctx[smp_processor_id()]; | 85 | irqctx = __get_cpu_var(hardirq_ctx); |
| 85 | 86 | ||
| 86 | /* | 87 | /* |
| 87 | * this is where we switch to the IRQ stack. However, if we are | 88 | * this is where we switch to the IRQ stack. However, if we are |
| @@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu) | |||
| 125 | { | 126 | { |
| 126 | union irq_ctx *irqctx; | 127 | union irq_ctx *irqctx; |
| 127 | 128 | ||
| 128 | if (hardirq_ctx[cpu]) | 129 | if (per_cpu(hardirq_ctx, cpu)) |
| 129 | return; | 130 | return; |
| 130 | 131 | ||
| 131 | irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; | 132 | irqctx = &per_cpu(hardirq_stack, cpu); |
| 132 | irqctx->tinfo.task = NULL; | 133 | irqctx->tinfo.task = NULL; |
| 133 | irqctx->tinfo.exec_domain = NULL; | 134 | irqctx->tinfo.exec_domain = NULL; |
| 134 | irqctx->tinfo.cpu = cpu; | 135 | irqctx->tinfo.cpu = cpu; |
| 135 | irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; | 136 | irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; |
| 136 | irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); | 137 | irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); |
| 137 | 138 | ||
| 138 | hardirq_ctx[cpu] = irqctx; | 139 | per_cpu(hardirq_ctx, cpu) = irqctx; |
| 139 | 140 | ||
| 140 | irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE]; | 141 | irqctx = &per_cpu(softirq_stack, cpu); |
| 141 | irqctx->tinfo.task = NULL; | 142 | irqctx->tinfo.task = NULL; |
| 142 | irqctx->tinfo.exec_domain = NULL; | 143 | irqctx->tinfo.exec_domain = NULL; |
| 143 | irqctx->tinfo.cpu = cpu; | 144 | irqctx->tinfo.cpu = cpu; |
| 144 | irqctx->tinfo.preempt_count = 0; | 145 | irqctx->tinfo.preempt_count = 0; |
| 145 | irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); | 146 | irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); |
| 146 | 147 | ||
| 147 | softirq_ctx[cpu] = irqctx; | 148 | per_cpu(softirq_ctx, cpu) = irqctx; |
| 148 | 149 | ||
| 149 | printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", | 150 | printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", |
| 150 | cpu, hardirq_ctx[cpu], softirq_ctx[cpu]); | 151 | cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); |
| 151 | } | 152 | } |
| 152 | 153 | ||
| 153 | void irq_ctx_exit(int cpu) | 154 | void irq_ctx_exit(int cpu) |
| 154 | { | 155 | { |
| 155 | hardirq_ctx[cpu] = NULL; | 156 | per_cpu(hardirq_ctx, cpu) = NULL; |
| 156 | } | 157 | } |
| 157 | 158 | ||
| 158 | asmlinkage void do_softirq(void) | 159 | asmlinkage void do_softirq(void) |
| @@ -169,7 +170,7 @@ asmlinkage void do_softirq(void) | |||
| 169 | 170 | ||
| 170 | if (local_softirq_pending()) { | 171 | if (local_softirq_pending()) { |
| 171 | curctx = current_thread_info(); | 172 | curctx = current_thread_info(); |
| 172 | irqctx = softirq_ctx[smp_processor_id()]; | 173 | irqctx = __get_cpu_var(softirq_ctx); |
| 173 | irqctx->tinfo.task = curctx->task; | 174 | irqctx->tinfo.task = curctx->task; |
| 174 | irqctx->tinfo.previous_esp = current_stack_pointer; | 175 | irqctx->tinfo.previous_esp = current_stack_pointer; |
| 175 | 176 | ||
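The irq_32.c change above is a straight conversion from NR_CPUS-sized arrays to per-CPU variables for the hard/soft IRQ stacks. The pattern in isolation, assuming kernel context (<linux/percpu.h>); struct irq_stack_ref and the helper names are placeholders, while DEFINE_PER_CPU, per_cpu() and __get_cpu_var() are the accessors the patch switches to:

    #include <linux/percpu.h>

    struct irq_stack_ref { void *stack; };

    /* before: static struct irq_stack_ref ctx[NR_CPUS]; indexed as ctx[cpu] */
    static DEFINE_PER_CPU(struct irq_stack_ref, ctx);

    static void init_for_cpu(int cpu, void *stack)
    {
        per_cpu(ctx, cpu).stack = stack;        /* another CPU's copy */
    }

    static void *current_cpu_stack(void)
    {
        return __get_cpu_var(ctx).stack;        /* this CPU's copy    */
    }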
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 50b8c3a3006c..bc1326105448 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c | |||
| @@ -175,6 +175,9 @@ void __init native_init_IRQ(void) | |||
| 175 | /* self generated IPI for local APIC timer */ | 175 | /* self generated IPI for local APIC timer */ |
| 176 | alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); | 176 | alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); |
| 177 | 177 | ||
| 178 | /* generic IPI for platform specific use */ | ||
| 179 | alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); | ||
| 180 | |||
| 178 | /* IPI vectors for APIC spurious and error interrupts */ | 181 | /* IPI vectors for APIC spurious and error interrupts */ |
| 179 | alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); | 182 | alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); |
| 180 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); | 183 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); |
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index da481a1e3f30..c7a49e0ffbfb 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c | |||
| @@ -147,6 +147,9 @@ static void __init apic_intr_init(void) | |||
| 147 | /* self generated IPI for local APIC timer */ | 147 | /* self generated IPI for local APIC timer */ |
| 148 | alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); | 148 | alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); |
| 149 | 149 | ||
| 150 | /* generic IPI for platform specific use */ | ||
| 151 | alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); | ||
| 152 | |||
| 150 | /* IPI vectors for APIC spurious and error interrupts */ | 153 | /* IPI vectors for APIC spurious and error interrupts */ |
| 151 | alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); | 154 | alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); |
| 152 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); | 155 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); |
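Both irqinit files gain the same registration, so 32-bit and 64-bit kernels route GENERIC_INTERRUPT_VECTOR to a common entry stub. The C-side handler behind that stub is added elsewhere in this series; a hedged sketch of its usual shape (treat the hook name and exact body as illustrative):

/* Platform code (e.g. the UV RTC driver below) installs a callback here. */
void (*generic_interrupt_extension)(void);

void smp_generic_interrupt(struct pt_regs *regs)
{
        ack_APIC_irq();                 /* EOI the local APIC first */
        irq_enter();
        if (generic_interrupt_extension)
                generic_interrupt_extension();
        irq_exit();
}

The point of the indirection is that a single spare vector can be claimed by whichever platform driver needs a cheap self/cross-CPU notification, without dedicating one vector per user.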
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index f5fc8c781a62..e7368c1da01d 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c | |||
| @@ -14,12 +14,12 @@ | |||
| 14 | #include <linux/ftrace.h> | 14 | #include <linux/ftrace.h> |
| 15 | #include <linux/suspend.h> | 15 | #include <linux/suspend.h> |
| 16 | #include <linux/gfp.h> | 16 | #include <linux/gfp.h> |
| 17 | #include <linux/io.h> | ||
| 17 | 18 | ||
| 18 | #include <asm/pgtable.h> | 19 | #include <asm/pgtable.h> |
| 19 | #include <asm/pgalloc.h> | 20 | #include <asm/pgalloc.h> |
| 20 | #include <asm/tlbflush.h> | 21 | #include <asm/tlbflush.h> |
| 21 | #include <asm/mmu_context.h> | 22 | #include <asm/mmu_context.h> |
| 22 | #include <asm/io.h> | ||
| 23 | #include <asm/apic.h> | 23 | #include <asm/apic.h> |
| 24 | #include <asm/cpufeature.h> | 24 | #include <asm/cpufeature.h> |
| 25 | #include <asm/desc.h> | 25 | #include <asm/desc.h> |
| @@ -63,7 +63,7 @@ static void load_segments(void) | |||
| 63 | "\tmovl %%eax,%%fs\n" | 63 | "\tmovl %%eax,%%fs\n" |
| 64 | "\tmovl %%eax,%%gs\n" | 64 | "\tmovl %%eax,%%gs\n" |
| 65 | "\tmovl %%eax,%%ss\n" | 65 | "\tmovl %%eax,%%ss\n" |
| 66 | ::: "eax", "memory"); | 66 | : : : "eax", "memory"); |
| 67 | #undef STR | 67 | #undef STR |
| 68 | #undef __STR | 68 | #undef __STR |
| 69 | } | 69 | } |
| @@ -205,7 +205,8 @@ void machine_kexec(struct kimage *image) | |||
| 205 | 205 | ||
| 206 | if (image->preserve_context) { | 206 | if (image->preserve_context) { |
| 207 | #ifdef CONFIG_X86_IO_APIC | 207 | #ifdef CONFIG_X86_IO_APIC |
| 208 | /* We need to put APICs in legacy mode so that we can | 208 | /* |
| 209 | * We need to put APICs in legacy mode so that we can | ||
| 209 | * get timer interrupts in second kernel. kexec/kdump | 210 | * get timer interrupts in second kernel. kexec/kdump |
| 210 | * paths already have calls to disable_IO_APIC() in | 211 | * paths already have calls to disable_IO_APIC() in |
| 211 | * one form or other. kexec jump path also need | 212 | * one form or other. kexec jump path also need |
| @@ -227,7 +228,8 @@ void machine_kexec(struct kimage *image) | |||
| 227 | page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) | 228 | page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) |
| 228 | << PAGE_SHIFT); | 229 | << PAGE_SHIFT); |
| 229 | 230 | ||
| 230 | /* The segment registers are funny things, they have both a | 231 | /* |
| 232 | * The segment registers are funny things, they have both a | ||
| 231 | * visible and an invisible part. Whenever the visible part is | 233 | * visible and an invisible part. Whenever the visible part is |
| 232 | * set to a specific selector, the invisible part is loaded | 234 | * set to a specific selector, the invisible part is loaded |
| 233 | * with from a table in memory. At no other time is the | 235 | * with from a table in memory. At no other time is the |
| @@ -237,11 +239,12 @@ void machine_kexec(struct kimage *image) | |||
| 237 | * segments, before I zap the gdt with an invalid value. | 239 | * segments, before I zap the gdt with an invalid value. |
| 238 | */ | 240 | */ |
| 239 | load_segments(); | 241 | load_segments(); |
| 240 | /* The gdt & idt are now invalid. | 242 | /* |
| 243 | * The gdt & idt are now invalid. | ||
| 241 | * If you want to load them you must set up your own idt & gdt. | 244 | * If you want to load them you must set up your own idt & gdt. |
| 242 | */ | 245 | */ |
| 243 | set_gdt(phys_to_virt(0),0); | 246 | set_gdt(phys_to_virt(0), 0); |
| 244 | set_idt(phys_to_virt(0),0); | 247 | set_idt(phys_to_virt(0), 0); |
| 245 | 248 | ||
| 246 | /* now call it */ | 249 | /* now call it */ |
| 247 | image->start = relocate_kernel_ptr((unsigned long)image->head, | 250 | image->start = relocate_kernel_ptr((unsigned long)image->head, |
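Right before jumping to the relocated code, the 32-bit path reloads flat data segments and then points the GDT and IDT at an empty table so nothing can rely on them afterwards. The helpers are thin wrappers around lgdt/lidt; an illustrative reimplementation (the real set_gdt()/set_idt() live earlier in machine_kexec_32.c and may differ in detail):

static void set_gdt(void *newgdt, __u16 limit)
{
        struct desc_ptr curgdt;

        /* ia32 supports unaligned loads and stores, so this is safe */
        curgdt.size    = limit;
        curgdt.address = (unsigned long)newgdt;

        load_gdt(&curgdt);              /* executes lgdt */
}

set_idt() is the same idea with load_idt(); passing phys_to_virt(0) with limit 0, as above, makes any later descriptor use fault in an obvious way rather than silently reading stale tables.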
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 6993d51b7fd8..89cea4d44679 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c | |||
| @@ -12,11 +12,47 @@ | |||
| 12 | #include <linux/reboot.h> | 12 | #include <linux/reboot.h> |
| 13 | #include <linux/numa.h> | 13 | #include <linux/numa.h> |
| 14 | #include <linux/ftrace.h> | 14 | #include <linux/ftrace.h> |
| 15 | #include <linux/io.h> | ||
| 16 | #include <linux/suspend.h> | ||
| 15 | 17 | ||
| 16 | #include <asm/pgtable.h> | 18 | #include <asm/pgtable.h> |
| 17 | #include <asm/tlbflush.h> | 19 | #include <asm/tlbflush.h> |
| 18 | #include <asm/mmu_context.h> | 20 | #include <asm/mmu_context.h> |
| 19 | #include <asm/io.h> | 21 | |
| 22 | static int init_one_level2_page(struct kimage *image, pgd_t *pgd, | ||
| 23 | unsigned long addr) | ||
| 24 | { | ||
| 25 | pud_t *pud; | ||
| 26 | pmd_t *pmd; | ||
| 27 | struct page *page; | ||
| 28 | int result = -ENOMEM; | ||
| 29 | |||
| 30 | addr &= PMD_MASK; | ||
| 31 | pgd += pgd_index(addr); | ||
| 32 | if (!pgd_present(*pgd)) { | ||
| 33 | page = kimage_alloc_control_pages(image, 0); | ||
| 34 | if (!page) | ||
| 35 | goto out; | ||
| 36 | pud = (pud_t *)page_address(page); | ||
| 37 | memset(pud, 0, PAGE_SIZE); | ||
| 38 | set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); | ||
| 39 | } | ||
| 40 | pud = pud_offset(pgd, addr); | ||
| 41 | if (!pud_present(*pud)) { | ||
| 42 | page = kimage_alloc_control_pages(image, 0); | ||
| 43 | if (!page) | ||
| 44 | goto out; | ||
| 45 | pmd = (pmd_t *)page_address(page); | ||
| 46 | memset(pmd, 0, PAGE_SIZE); | ||
| 47 | set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); | ||
| 48 | } | ||
| 49 | pmd = pmd_offset(pud, addr); | ||
| 50 | if (!pmd_present(*pmd)) | ||
| 51 | set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); | ||
| 52 | result = 0; | ||
| 53 | out: | ||
| 54 | return result; | ||
| 55 | } | ||
| 20 | 56 | ||
| 21 | static void init_level2_page(pmd_t *level2p, unsigned long addr) | 57 | static void init_level2_page(pmd_t *level2p, unsigned long addr) |
| 22 | { | 58 | { |
| @@ -83,9 +119,8 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p, | |||
| 83 | } | 119 | } |
| 84 | level3p = (pud_t *)page_address(page); | 120 | level3p = (pud_t *)page_address(page); |
| 85 | result = init_level3_page(image, level3p, addr, last_addr); | 121 | result = init_level3_page(image, level3p, addr, last_addr); |
| 86 | if (result) { | 122 | if (result) |
| 87 | goto out; | 123 | goto out; |
| 88 | } | ||
| 89 | set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); | 124 | set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); |
| 90 | addr += PGDIR_SIZE; | 125 | addr += PGDIR_SIZE; |
| 91 | } | 126 | } |
| @@ -156,6 +191,13 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) | |||
| 156 | result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); | 191 | result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); |
| 157 | if (result) | 192 | if (result) |
| 158 | return result; | 193 | return result; |
| 194 | /* | ||
| 195 | * image->start may be outside 0 ~ max_pfn, for example when | ||
| 196 | * jump back to original kernel from kexeced kernel | ||
| 197 | */ | ||
| 198 | result = init_one_level2_page(image, level4p, image->start); | ||
| 199 | if (result) | ||
| 200 | return result; | ||
| 159 | return init_transition_pgtable(image, level4p); | 201 | return init_transition_pgtable(image, level4p); |
| 160 | } | 202 | } |
| 161 | 203 | ||
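init_one_level2_page() only has to allocate the PGD- and PUD-level tables; the last level is a single 2MB PMD entry pointing straight at the (PMD_MASK-aligned) physical address. For orientation, a sketch of how the address bits are consumed on x86_64 for a 2MB mapping (the numeric example is made up; the bit ranges match the kernel's page-table layout):

/* Hypothetical target address, e.g. an image->start above max_pfn. */
unsigned long addr = 0x0000001234600000UL;

unsigned long pgd_idx = (addr >> 39) & 0x1ff;   /* bits 47..39 -> PGD slot */
unsigned long pud_idx = (addr >> 30) & 0x1ff;   /* bits 38..30 -> PUD slot */
unsigned long pmd_idx = (addr >> 21) & 0x1ff;   /* bits 29..21 -> PMD slot */
/* bits 20..0 are the offset inside the 2MB page; "addr &= PMD_MASK"
 * at the top of the function clears exactly these bits. */

This is why the jump-back case costs at most two extra control pages per mapped address: one possible PUD table and one possible PMD table.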
| @@ -229,20 +271,45 @@ void machine_kexec(struct kimage *image) | |||
| 229 | { | 271 | { |
| 230 | unsigned long page_list[PAGES_NR]; | 272 | unsigned long page_list[PAGES_NR]; |
| 231 | void *control_page; | 273 | void *control_page; |
| 274 | int save_ftrace_enabled; | ||
| 232 | 275 | ||
| 233 | tracer_disable(); | 276 | #ifdef CONFIG_KEXEC_JUMP |
| 277 | if (kexec_image->preserve_context) | ||
| 278 | save_processor_state(); | ||
| 279 | #endif | ||
| 280 | |||
| 281 | save_ftrace_enabled = __ftrace_enabled_save(); | ||
| 234 | 282 | ||
| 235 | /* Interrupts aren't acceptable while we reboot */ | 283 | /* Interrupts aren't acceptable while we reboot */ |
| 236 | local_irq_disable(); | 284 | local_irq_disable(); |
| 237 | 285 | ||
| 286 | if (image->preserve_context) { | ||
| 287 | #ifdef CONFIG_X86_IO_APIC | ||
| 288 | /* | ||
| 289 | * We need to put APICs in legacy mode so that we can | ||
| 290 | * get timer interrupts in second kernel. kexec/kdump | ||
| 291 | * paths already have calls to disable_IO_APIC() in | ||
| 292 | * one form or other. The kexec jump path also needs | ||
| 293 | * one. | ||
| 294 | */ | ||
| 295 | disable_IO_APIC(); | ||
| 296 | #endif | ||
| 297 | } | ||
| 298 | |||
| 238 | control_page = page_address(image->control_code_page) + PAGE_SIZE; | 299 | control_page = page_address(image->control_code_page) + PAGE_SIZE; |
| 239 | memcpy(control_page, relocate_kernel, PAGE_SIZE); | 300 | memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); |
| 240 | 301 | ||
| 241 | page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); | 302 | page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); |
| 303 | page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; | ||
| 242 | page_list[PA_TABLE_PAGE] = | 304 | page_list[PA_TABLE_PAGE] = |
| 243 | (unsigned long)__pa(page_address(image->control_code_page)); | 305 | (unsigned long)__pa(page_address(image->control_code_page)); |
| 244 | 306 | ||
| 245 | /* The segment registers are funny things, they have both a | 307 | if (image->type == KEXEC_TYPE_DEFAULT) |
| 308 | page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) | ||
| 309 | << PAGE_SHIFT); | ||
| 310 | |||
| 311 | /* | ||
| 312 | * The segment registers are funny things, they have both a | ||
| 246 | * visible and an invisible part. Whenever the visible part is | 313 | * visible and an invisible part. Whenever the visible part is |
| 247 | * set to a specific selector, the invisible part is loaded | 314 | * set to a specific selector, the invisible part is loaded |
| 248 | * with from a table in memory. At no other time is the | 315 | * with from a table in memory. At no other time is the |
| @@ -252,15 +319,25 @@ void machine_kexec(struct kimage *image) | |||
| 252 | * segments, before I zap the gdt with an invalid value. | 319 | * segments, before I zap the gdt with an invalid value. |
| 253 | */ | 320 | */ |
| 254 | load_segments(); | 321 | load_segments(); |
| 255 | /* The gdt & idt are now invalid. | 322 | /* |
| 323 | * The gdt & idt are now invalid. | ||
| 256 | * If you want to load them you must set up your own idt & gdt. | 324 | * If you want to load them you must set up your own idt & gdt. |
| 257 | */ | 325 | */ |
| 258 | set_gdt(phys_to_virt(0),0); | 326 | set_gdt(phys_to_virt(0), 0); |
| 259 | set_idt(phys_to_virt(0),0); | 327 | set_idt(phys_to_virt(0), 0); |
| 260 | 328 | ||
| 261 | /* now call it */ | 329 | /* now call it */ |
| 262 | relocate_kernel((unsigned long)image->head, (unsigned long)page_list, | 330 | image->start = relocate_kernel((unsigned long)image->head, |
| 263 | image->start); | 331 | (unsigned long)page_list, |
| 332 | image->start, | ||
| 333 | image->preserve_context); | ||
| 334 | |||
| 335 | #ifdef CONFIG_KEXEC_JUMP | ||
| 336 | if (kexec_image->preserve_context) | ||
| 337 | restore_processor_state(); | ||
| 338 | #endif | ||
| 339 | |||
| 340 | __ftrace_enabled_restore(save_ftrace_enabled); | ||
| 264 | } | 341 | } |
| 265 | 342 | ||
| 266 | void arch_crash_save_vmcoreinfo(void) | 343 | void arch_crash_save_vmcoreinfo(void) |
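On 64-bit the control page is now laid out like the 32-bit one: only the first KEXEC_CONTROL_CODE_MAX_SIZE bytes carry the copied trampoline (hence the changed memcpy length), and the rest of the page doubles as data storage and stack for the jump-back path. An illustrative view of that layout as a C struct; the offsets mirror the DATA() macros added to relocate_kernel_64.S below, but the struct itself is not part of the kernel sources:

struct kexec_control_page {                     /* one 4KB page */
        char code[KEXEC_CONTROL_CODE_MAX_SIZE]; /* relocated relocate_kernel text */
        /* minimal CPU state saved for jumping back */
        unsigned long rsp;                      /* DATA(0x00) */
        unsigned long cr0;                      /* DATA(0x08) */
        unsigned long cr3;                      /* DATA(0x10) */
        unsigned long cr4;                      /* DATA(0x18) */
        /* other data */
        unsigned long pa_table_page;            /* DATA(0x20) */
        unsigned long pa_swap_page;             /* DATA(0x28) */
        unsigned long pa_backup_pages_map;      /* DATA(0x30) */
        /* remainder of the page: scratch space and stack for the return trip */
};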

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 37cb1bda1baf..e8192401da47 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c | |||
| @@ -558,6 +558,19 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) | |||
| 558 | 558 | ||
| 559 | static struct mpf_intel *mpf_found; | 559 | static struct mpf_intel *mpf_found; |
| 560 | 560 | ||
| 561 | static unsigned long __init get_mpc_size(unsigned long physptr) | ||
| 562 | { | ||
| 563 | struct mpc_table *mpc; | ||
| 564 | unsigned long size; | ||
| 565 | |||
| 566 | mpc = early_ioremap(physptr, PAGE_SIZE); | ||
| 567 | size = mpc->length; | ||
| 568 | early_iounmap(mpc, PAGE_SIZE); | ||
| 569 | apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); | ||
| 570 | |||
| 571 | return size; | ||
| 572 | } | ||
| 573 | |||
| 561 | /* | 574 | /* |
| 562 | * Scan the memory blocks for an SMP configuration block. | 575 | * Scan the memory blocks for an SMP configuration block. |
| 563 | */ | 576 | */ |
| @@ -611,12 +624,16 @@ static void __init __get_smp_config(unsigned int early) | |||
| 611 | construct_default_ISA_mptable(mpf->feature1); | 624 | construct_default_ISA_mptable(mpf->feature1); |
| 612 | 625 | ||
| 613 | } else if (mpf->physptr) { | 626 | } else if (mpf->physptr) { |
| 627 | struct mpc_table *mpc; | ||
| 628 | unsigned long size; | ||
| 614 | 629 | ||
| 630 | size = get_mpc_size(mpf->physptr); | ||
| 631 | mpc = early_ioremap(mpf->physptr, size); | ||
| 615 | /* | 632 | /* |
| 616 | * Read the physical hardware table. Anything here will | 633 | * Read the physical hardware table. Anything here will |
| 617 | * override the defaults. | 634 | * override the defaults. |
| 618 | */ | 635 | */ |
| 619 | if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) { | 636 | if (!smp_read_mpc(mpc, early)) { |
| 620 | #ifdef CONFIG_X86_LOCAL_APIC | 637 | #ifdef CONFIG_X86_LOCAL_APIC |
| 621 | smp_found_config = 0; | 638 | smp_found_config = 0; |
| 622 | #endif | 639 | #endif |
| @@ -624,8 +641,10 @@ static void __init __get_smp_config(unsigned int early) | |||
| 624 | "BIOS bug, MP table errors detected!...\n"); | 641 | "BIOS bug, MP table errors detected!...\n"); |
| 625 | printk(KERN_ERR "... disabling SMP support. " | 642 | printk(KERN_ERR "... disabling SMP support. " |
| 626 | "(tell your hw vendor)\n"); | 643 | "(tell your hw vendor)\n"); |
| 644 | early_iounmap(mpc, size); | ||
| 627 | return; | 645 | return; |
| 628 | } | 646 | } |
| 647 | early_iounmap(mpc, size); | ||
| 629 | 648 | ||
| 630 | if (early) | 649 | if (early) |
| 631 | return; | 650 | return; |
| @@ -697,10 +716,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, | |||
| 697 | 716 | ||
| 698 | if (!reserve) | 717 | if (!reserve) |
| 699 | return 1; | 718 | return 1; |
| 700 | reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, | 719 | reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf), |
| 701 | BOOTMEM_DEFAULT); | 720 | BOOTMEM_DEFAULT); |
| 702 | if (mpf->physptr) { | 721 | if (mpf->physptr) { |
| 703 | unsigned long size = PAGE_SIZE; | 722 | unsigned long size = get_mpc_size(mpf->physptr); |
| 704 | #ifdef CONFIG_X86_32 | 723 | #ifdef CONFIG_X86_32 |
| 705 | /* | 724 | /* |
| 706 | * We cannot access to MPC table to compute | 725 | * We cannot access to MPC table to compute |
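The MP configuration table's real length is only known once its header has been read, so the table is now mapped twice with early_ioremap(): once for the header to fetch mpc->length (get_mpc_size() above), then again for the full table, and the bootmem reservation is sized from that instead of a blanket PAGE_SIZE. The resulting pattern, condensed (it mirrors the hunks above; error handling trimmed):

struct mpc_table *mpc;
unsigned long size;

mpc  = early_ioremap(physptr, PAGE_SIZE);       /* header only */
size = mpc->length;
early_iounmap(mpc, PAGE_SIZE);

mpc = early_ioremap(physptr, size);             /* the whole table */
if (!smp_read_mpc(mpc, early))
        ;                                       /* disable SMP, report the BIOS bug */
early_iounmap(mpc, size);

Every early_ioremap() must be paired with an early_iounmap() of the same length before the early fixmap machinery is torn down, which is why the error path above gained its own unmap call.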
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 1cc18d439bbb..2aef36d8aca2 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c | |||
| @@ -216,6 +216,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { | |||
| 216 | DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), | 216 | DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), |
| 217 | }, | 217 | }, |
| 218 | }, | 218 | }, |
| 219 | { /* Handle problems with rebooting on Dell XPS710 */ | ||
| 220 | .callback = set_bios_reboot, | ||
| 221 | .ident = "Dell XPS710", | ||
| 222 | .matches = { | ||
| 223 | DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), | ||
| 224 | DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"), | ||
| 225 | }, | ||
| 226 | }, | ||
| 219 | { } | 227 | { } |
| 220 | }; | 228 | }; |
| 221 | 229 | ||
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index 2064d0aa8d28..41235531b11c 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S | |||
| @@ -17,7 +17,8 @@ | |||
| 17 | 17 | ||
| 18 | #define PTR(x) (x << 2) | 18 | #define PTR(x) (x << 2) |
| 19 | 19 | ||
| 20 | /* control_page + KEXEC_CONTROL_CODE_MAX_SIZE | 20 | /* |
| 21 | * control_page + KEXEC_CONTROL_CODE_MAX_SIZE | ||
| 21 | * ~ control_page + PAGE_SIZE are used as data storage and stack for | 22 | * ~ control_page + PAGE_SIZE are used as data storage and stack for |
| 22 | * jumping back | 23 | * jumping back |
| 23 | */ | 24 | */ |
| @@ -76,8 +77,10 @@ relocate_kernel: | |||
| 76 | movl %eax, CP_PA_SWAP_PAGE(%edi) | 77 | movl %eax, CP_PA_SWAP_PAGE(%edi) |
| 77 | movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) | 78 | movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) |
| 78 | 79 | ||
| 79 | /* get physical address of control page now */ | 80 | /* |
| 80 | /* this is impossible after page table switch */ | 81 | * get physical address of control page now |
| 82 | * this is impossible after page table switch | ||
| 83 | */ | ||
| 81 | movl PTR(PA_CONTROL_PAGE)(%ebp), %edi | 84 | movl PTR(PA_CONTROL_PAGE)(%ebp), %edi |
| 82 | 85 | ||
| 83 | /* switch to new set of page tables */ | 86 | /* switch to new set of page tables */ |
| @@ -97,7 +100,8 @@ identity_mapped: | |||
| 97 | /* store the start address on the stack */ | 100 | /* store the start address on the stack */ |
| 98 | pushl %edx | 101 | pushl %edx |
| 99 | 102 | ||
| 100 | /* Set cr0 to a known state: | 103 | /* |
| 104 | * Set cr0 to a known state: | ||
| 101 | * - Paging disabled | 105 | * - Paging disabled |
| 102 | * - Alignment check disabled | 106 | * - Alignment check disabled |
| 103 | * - Write protect disabled | 107 | * - Write protect disabled |
| @@ -113,7 +117,8 @@ identity_mapped: | |||
| 113 | /* clear cr4 if applicable */ | 117 | /* clear cr4 if applicable */ |
| 114 | testl %ecx, %ecx | 118 | testl %ecx, %ecx |
| 115 | jz 1f | 119 | jz 1f |
| 116 | /* Set cr4 to a known state: | 120 | /* |
| 121 | * Set cr4 to a known state: | ||
| 117 | * Setting everything to zero seems safe. | 122 | * Setting everything to zero seems safe. |
| 118 | */ | 123 | */ |
| 119 | xorl %eax, %eax | 124 | xorl %eax, %eax |
| @@ -132,15 +137,18 @@ identity_mapped: | |||
| 132 | call swap_pages | 137 | call swap_pages |
| 133 | addl $8, %esp | 138 | addl $8, %esp |
| 134 | 139 | ||
| 135 | /* To be certain of avoiding problems with self-modifying code | 140 | /* |
| 141 | * To be certain of avoiding problems with self-modifying code | ||
| 136 | * I need to execute a serializing instruction here. | 142 | * I need to execute a serializing instruction here. |
| 137 | * So I flush the TLB, it's handy, and not processor dependent. | 143 | * So I flush the TLB, it's handy, and not processor dependent. |
| 138 | */ | 144 | */ |
| 139 | xorl %eax, %eax | 145 | xorl %eax, %eax |
| 140 | movl %eax, %cr3 | 146 | movl %eax, %cr3 |
| 141 | 147 | ||
| 142 | /* set all of the registers to known values */ | 148 | /* |
| 143 | /* leave %esp alone */ | 149 | * set all of the registers to known values |
| 150 | * leave %esp alone | ||
| 151 | */ | ||
| 144 | 152 | ||
| 145 | testl %esi, %esi | 153 | testl %esi, %esi |
| 146 | jnz 1f | 154 | jnz 1f |
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index d32cfb27a479..4de8f5b3d476 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S | |||
| @@ -19,29 +19,77 @@ | |||
| 19 | #define PTR(x) (x << 3) | 19 | #define PTR(x) (x << 3) |
| 20 | #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) | 20 | #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) |
| 21 | 21 | ||
| 22 | /* | ||
| 23 | * control_page + KEXEC_CONTROL_CODE_MAX_SIZE | ||
| 24 | * ~ control_page + PAGE_SIZE are used as data storage and stack for | ||
| 25 | * jumping back | ||
| 26 | */ | ||
| 27 | #define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) | ||
| 28 | |||
| 29 | /* Minimal CPU state */ | ||
| 30 | #define RSP DATA(0x0) | ||
| 31 | #define CR0 DATA(0x8) | ||
| 32 | #define CR3 DATA(0x10) | ||
| 33 | #define CR4 DATA(0x18) | ||
| 34 | |||
| 35 | /* other data */ | ||
| 36 | #define CP_PA_TABLE_PAGE DATA(0x20) | ||
| 37 | #define CP_PA_SWAP_PAGE DATA(0x28) | ||
| 38 | #define CP_PA_BACKUP_PAGES_MAP DATA(0x30) | ||
| 39 | |||
| 22 | .text | 40 | .text |
| 23 | .align PAGE_SIZE | 41 | .align PAGE_SIZE |
| 24 | .code64 | 42 | .code64 |
| 25 | .globl relocate_kernel | 43 | .globl relocate_kernel |
| 26 | relocate_kernel: | 44 | relocate_kernel: |
| 27 | /* %rdi indirection_page | 45 | /* |
| 46 | * %rdi indirection_page | ||
| 28 | * %rsi page_list | 47 | * %rsi page_list |
| 29 | * %rdx start address | 48 | * %rdx start address |
| 49 | * %rcx preserve_context | ||
| 30 | */ | 50 | */ |
| 31 | 51 | ||
| 52 | /* Save the CPU context, used for jumping back */ | ||
| 53 | pushq %rbx | ||
| 54 | pushq %rbp | ||
| 55 | pushq %r12 | ||
| 56 | pushq %r13 | ||
| 57 | pushq %r14 | ||
| 58 | pushq %r15 | ||
| 59 | pushf | ||
| 60 | |||
| 61 | movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 | ||
| 62 | movq %rsp, RSP(%r11) | ||
| 63 | movq %cr0, %rax | ||
| 64 | movq %rax, CR0(%r11) | ||
| 65 | movq %cr3, %rax | ||
| 66 | movq %rax, CR3(%r11) | ||
| 67 | movq %cr4, %rax | ||
| 68 | movq %rax, CR4(%r11) | ||
| 69 | |||
| 32 | /* zero out flags, and disable interrupts */ | 70 | /* zero out flags, and disable interrupts */ |
| 33 | pushq $0 | 71 | pushq $0 |
| 34 | popfq | 72 | popfq |
| 35 | 73 | ||
| 36 | /* get physical address of control page now */ | 74 | /* |
| 37 | /* this is impossible after page table switch */ | 75 | * get physical address of control page now |
| 76 | * this is impossible after page table switch | ||
| 77 | */ | ||
| 38 | movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 | 78 | movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 |
| 39 | 79 | ||
| 40 | /* get physical address of page table now too */ | 80 | /* get physical address of page table now too */ |
| 41 | movq PTR(PA_TABLE_PAGE)(%rsi), %rcx | 81 | movq PTR(PA_TABLE_PAGE)(%rsi), %r9 |
| 82 | |||
| 83 | /* get physical address of swap page now */ | ||
| 84 | movq PTR(PA_SWAP_PAGE)(%rsi), %r10 | ||
| 85 | |||
| 86 | /* save some information for jumping back */ | ||
| 87 | movq %r9, CP_PA_TABLE_PAGE(%r11) | ||
| 88 | movq %r10, CP_PA_SWAP_PAGE(%r11) | ||
| 89 | movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11) | ||
| 42 | 90 | ||
| 43 | /* Switch to the identity mapped page tables */ | 91 | /* Switch to the identity mapped page tables */ |
| 44 | movq %rcx, %cr3 | 92 | movq %r9, %cr3 |
| 45 | 93 | ||
| 46 | /* setup a new stack at the end of the physical control page */ | 94 | /* setup a new stack at the end of the physical control page */ |
| 47 | lea PAGE_SIZE(%r8), %rsp | 95 | lea PAGE_SIZE(%r8), %rsp |
| @@ -55,7 +103,8 @@ identity_mapped: | |||
| 55 | /* store the start address on the stack */ | 103 | /* store the start address on the stack */ |
| 56 | pushq %rdx | 104 | pushq %rdx |
| 57 | 105 | ||
| 58 | /* Set cr0 to a known state: | 106 | /* |
| 107 | * Set cr0 to a known state: | ||
| 59 | * - Paging enabled | 108 | * - Paging enabled |
| 60 | * - Alignment check disabled | 109 | * - Alignment check disabled |
| 61 | * - Write protect disabled | 110 | * - Write protect disabled |
| @@ -68,7 +117,8 @@ identity_mapped: | |||
| 68 | orl $(X86_CR0_PG | X86_CR0_PE), %eax | 117 | orl $(X86_CR0_PG | X86_CR0_PE), %eax |
| 69 | movq %rax, %cr0 | 118 | movq %rax, %cr0 |
| 70 | 119 | ||
| 71 | /* Set cr4 to a known state: | 120 | /* |
| 121 | * Set cr4 to a known state: | ||
| 72 | * - physical address extension enabled | 122 | * - physical address extension enabled |
| 73 | */ | 123 | */ |
| 74 | movq $X86_CR4_PAE, %rax | 124 | movq $X86_CR4_PAE, %rax |
| @@ -78,9 +128,87 @@ identity_mapped: | |||
| 78 | 1: | 128 | 1: |
| 79 | 129 | ||
| 80 | /* Flush the TLB (needed?) */ | 130 | /* Flush the TLB (needed?) */ |
| 81 | movq %rcx, %cr3 | 131 | movq %r9, %cr3 |
| 132 | |||
| 133 | movq %rcx, %r11 | ||
| 134 | call swap_pages | ||
| 135 | |||
| 136 | /* | ||
| 137 | * To be certain of avoiding problems with self-modifying code | ||
| 138 | * I need to execute a serializing instruction here. | ||
| 139 | * So I flush the TLB by reloading %cr3 here, it's handy, | ||
| 140 | * and not processor dependent. | ||
| 141 | */ | ||
| 142 | movq %cr3, %rax | ||
| 143 | movq %rax, %cr3 | ||
| 144 | |||
| 145 | /* | ||
| 146 | * set all of the registers to known values | ||
| 147 | * leave %rsp alone | ||
| 148 | */ | ||
| 149 | |||
| 150 | testq %r11, %r11 | ||
| 151 | jnz 1f | ||
| 152 | xorq %rax, %rax | ||
| 153 | xorq %rbx, %rbx | ||
| 154 | xorq %rcx, %rcx | ||
| 155 | xorq %rdx, %rdx | ||
| 156 | xorq %rsi, %rsi | ||
| 157 | xorq %rdi, %rdi | ||
| 158 | xorq %rbp, %rbp | ||
| 159 | xorq %r8, %r8 | ||
| 160 | xorq %r9, %r9 | ||
| 161 | xorq %r10, %r10 | ||
| 162 | xorq %r11, %r11 | ||
| 163 | xorq %r12, %r12 | ||
| 164 | xorq %r13, %r13 | ||
| 165 | xorq %r14, %r14 | ||
| 166 | xorq %r15, %r15 | ||
| 167 | |||
| 168 | ret | ||
| 169 | |||
| 170 | 1: | ||
| 171 | popq %rdx | ||
| 172 | leaq PAGE_SIZE(%r10), %rsp | ||
| 173 | call *%rdx | ||
| 174 | |||
| 175 | /* get the re-entry point of the peer system */ | ||
| 176 | movq 0(%rsp), %rbp | ||
| 177 | call 1f | ||
| 178 | 1: | ||
| 179 | popq %r8 | ||
| 180 | subq $(1b - relocate_kernel), %r8 | ||
| 181 | movq CP_PA_SWAP_PAGE(%r8), %r10 | ||
| 182 | movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi | ||
| 183 | movq CP_PA_TABLE_PAGE(%r8), %rax | ||
| 184 | movq %rax, %cr3 | ||
| 185 | lea PAGE_SIZE(%r8), %rsp | ||
| 186 | call swap_pages | ||
| 187 | movq $virtual_mapped, %rax | ||
| 188 | pushq %rax | ||
| 189 | ret | ||
| 190 | |||
| 191 | virtual_mapped: | ||
| 192 | movq RSP(%r8), %rsp | ||
| 193 | movq CR4(%r8), %rax | ||
| 194 | movq %rax, %cr4 | ||
| 195 | movq CR3(%r8), %rax | ||
| 196 | movq CR0(%r8), %r8 | ||
| 197 | movq %rax, %cr3 | ||
| 198 | movq %r8, %cr0 | ||
| 199 | movq %rbp, %rax | ||
| 200 | |||
| 201 | popf | ||
| 202 | popq %r15 | ||
| 203 | popq %r14 | ||
| 204 | popq %r13 | ||
| 205 | popq %r12 | ||
| 206 | popq %rbp | ||
| 207 | popq %rbx | ||
| 208 | ret | ||
| 82 | 209 | ||
| 83 | /* Do the copies */ | 210 | /* Do the copies */ |
| 211 | swap_pages: | ||
| 84 | movq %rdi, %rcx /* Put the page_list in %rcx */ | 212 | movq %rdi, %rcx /* Put the page_list in %rcx */ |
| 85 | xorq %rdi, %rdi | 213 | xorq %rdi, %rdi |
| 86 | xorq %rsi, %rsi | 214 | xorq %rsi, %rsi |
| @@ -112,36 +240,27 @@ identity_mapped: | |||
| 112 | movq %rcx, %rsi /* For ever source page do a copy */ | 240 | movq %rcx, %rsi /* For ever source page do a copy */ |
| 113 | andq $0xfffffffffffff000, %rsi | 241 | andq $0xfffffffffffff000, %rsi |
| 114 | 242 | ||
| 243 | movq %rdi, %rdx | ||
| 244 | movq %rsi, %rax | ||
| 245 | |||
| 246 | movq %r10, %rdi | ||
| 115 | movq $512, %rcx | 247 | movq $512, %rcx |
| 116 | rep ; movsq | 248 | rep ; movsq |
| 117 | jmp 0b | ||
| 118 | 3: | ||
| 119 | |||
| 120 | /* To be certain of avoiding problems with self-modifying code | ||
| 121 | * I need to execute a serializing instruction here. | ||
| 122 | * So I flush the TLB by reloading %cr3 here, it's handy, | ||
| 123 | * and not processor dependent. | ||
| 124 | */ | ||
| 125 | movq %cr3, %rax | ||
| 126 | movq %rax, %cr3 | ||
| 127 | 249 | ||
| 128 | /* set all of the registers to known values */ | 250 | movq %rax, %rdi |
| 129 | /* leave %rsp alone */ | 251 | movq %rdx, %rsi |
| 252 | movq $512, %rcx | ||
| 253 | rep ; movsq | ||
| 130 | 254 | ||
| 131 | xorq %rax, %rax | 255 | movq %rdx, %rdi |
| 132 | xorq %rbx, %rbx | 256 | movq %r10, %rsi |
| 133 | xorq %rcx, %rcx | 257 | movq $512, %rcx |
| 134 | xorq %rdx, %rdx | 258 | rep ; movsq |
| 135 | xorq %rsi, %rsi | ||
| 136 | xorq %rdi, %rdi | ||
| 137 | xorq %rbp, %rbp | ||
| 138 | xorq %r8, %r8 | ||
| 139 | xorq %r9, %r9 | ||
| 140 | xorq %r10, %r9 | ||
| 141 | xorq %r11, %r11 | ||
| 142 | xorq %r12, %r12 | ||
| 143 | xorq %r13, %r13 | ||
| 144 | xorq %r14, %r14 | ||
| 145 | xorq %r15, %r15 | ||
| 146 | 259 | ||
| 260 | lea PAGE_SIZE(%rax), %rsi | ||
| 261 | jmp 0b | ||
| 262 | 3: | ||
| 147 | ret | 263 | ret |
| 264 | |||
| 265 | .globl kexec_control_code_size | ||
| 266 | .set kexec_control_code_size, . - relocate_kernel | ||
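With preserve_context the destination pages may still belong to the original kernel, so the copy loop in swap_pages no longer simply overwrites them: each source/destination pair is exchanged through the swap page. Expressed in C, one iteration of the new loop does roughly this (sketch; the assembly works on physical addresses through the identity mapping):

static void swap_one_page(void *dest, void *src, void *swap_page)
{
        memcpy(swap_page, src, PAGE_SIZE);      /* save the source page      */
        memcpy(src, dest, PAGE_SIZE);           /* old destination -> source */
        memcpy(dest, swap_page, PAGE_SIZE);     /* saved source -> dest      */
}

The trailing kexec_control_code_size symbol lets C code verify that the trampoline text still fits below KEXEC_CONTROL_CODE_MAX_SIZE, since everything past that offset in the control page is now used as data and stack.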
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4c54bc0d8ff3..f28c56e6bf94 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
| @@ -202,7 +202,9 @@ struct ist_info ist_info; | |||
| 202 | #endif | 202 | #endif |
| 203 | 203 | ||
| 204 | #else | 204 | #else |
| 205 | struct cpuinfo_x86 boot_cpu_data __read_mostly; | 205 | struct cpuinfo_x86 boot_cpu_data __read_mostly = { |
| 206 | .x86_phys_bits = MAX_PHYSMEM_BITS, | ||
| 207 | }; | ||
| 206 | EXPORT_SYMBOL(boot_cpu_data); | 208 | EXPORT_SYMBOL(boot_cpu_data); |
| 207 | #endif | 209 | #endif |
| 208 | 210 | ||
| @@ -770,6 +772,9 @@ void __init setup_arch(char **cmdline_p) | |||
| 770 | 772 | ||
| 771 | finish_e820_parsing(); | 773 | finish_e820_parsing(); |
| 772 | 774 | ||
| 775 | if (efi_enabled) | ||
| 776 | efi_init(); | ||
| 777 | |||
| 773 | dmi_scan_machine(); | 778 | dmi_scan_machine(); |
| 774 | 779 | ||
| 775 | dmi_check_system(bad_bios_dmi_table); | 780 | dmi_check_system(bad_bios_dmi_table); |
| @@ -789,8 +794,6 @@ void __init setup_arch(char **cmdline_p) | |||
| 789 | insert_resource(&iomem_resource, &data_resource); | 794 | insert_resource(&iomem_resource, &data_resource); |
| 790 | insert_resource(&iomem_resource, &bss_resource); | 795 | insert_resource(&iomem_resource, &bss_resource); |
| 791 | 796 | ||
| 792 | if (efi_enabled) | ||
| 793 | efi_init(); | ||
| 794 | 797 | ||
| 795 | #ifdef CONFIG_X86_32 | 798 | #ifdef CONFIG_X86_32 |
| 796 | if (ppro_with_ram_bug()) { | 799 | if (ppro_with_ram_bug()) { |
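Initialising x86_phys_bits to MAX_PHYSMEM_BITS, instead of leaving it zero until CPUID fills it in, matters for early code that derives physical-address masks from it. A hypothetical helper, purely to illustrate the dependency:

/* Illustrative only: with the old zero default, this mask would be 0
 * during early boot and every physical address would be truncated. */
static inline u64 boot_phys_addr_mask(void)
{
        return (1ULL << boot_cpu_data.x86_phys_bits) - 1;
}

The efi_init() call is also moved ahead of dmi_scan_machine(), presumably so that DMI/SMBIOS tables located through the EFI configuration tables are already reachable when the DMI quirk lists are scanned.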
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index d992e6cff730..efa615f2bf43 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c | |||
| @@ -7,6 +7,7 @@ | |||
| 7 | #include <linux/crash_dump.h> | 7 | #include <linux/crash_dump.h> |
| 8 | #include <linux/smp.h> | 8 | #include <linux/smp.h> |
| 9 | #include <linux/topology.h> | 9 | #include <linux/topology.h> |
| 10 | #include <linux/pfn.h> | ||
| 10 | #include <asm/sections.h> | 11 | #include <asm/sections.h> |
| 11 | #include <asm/processor.h> | 12 | #include <asm/processor.h> |
| 12 | #include <asm/setup.h> | 13 | #include <asm/setup.h> |
| @@ -41,6 +42,352 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { | |||
| 41 | }; | 42 | }; |
| 42 | EXPORT_SYMBOL(__per_cpu_offset); | 43 | EXPORT_SYMBOL(__per_cpu_offset); |
| 43 | 44 | ||
| 45 | /* | ||
| 46 | * On x86_64 symbols referenced from code should be reachable using | ||
| 47 | * 32bit relocations. Reserve space for static percpu variables in | ||
| 48 | * modules so that they are always served from the first chunk which | ||
| 49 | * is located at the percpu segment base. On x86_32, anything can | ||
| 50 | * address anywhere. No need to reserve space in the first chunk. | ||
| 51 | */ | ||
| 52 | #ifdef CONFIG_X86_64 | ||
| 53 | #define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE | ||
| 54 | #else | ||
| 55 | #define PERCPU_FIRST_CHUNK_RESERVE 0 | ||
| 56 | #endif | ||
| 57 | |||
| 58 | /** | ||
| 59 | * pcpu_need_numa - determine percpu allocation needs to consider NUMA | ||
| 60 | * | ||
| 61 | * If NUMA is not configured or there is only one NUMA node available, | ||
| 62 | * there is no reason to consider NUMA. This function determines | ||
| 63 | * whether percpu allocation should consider NUMA or not. | ||
| 64 | * | ||
| 65 | * RETURNS: | ||
| 66 | * true if NUMA should be considered; otherwise, false. | ||
| 67 | */ | ||
| 68 | static bool __init pcpu_need_numa(void) | ||
| 69 | { | ||
| 70 | #ifdef CONFIG_NEED_MULTIPLE_NODES | ||
| 71 | pg_data_t *last = NULL; | ||
| 72 | unsigned int cpu; | ||
| 73 | |||
| 74 | for_each_possible_cpu(cpu) { | ||
| 75 | int node = early_cpu_to_node(cpu); | ||
| 76 | |||
| 77 | if (node_online(node) && NODE_DATA(node) && | ||
| 78 | last && last != NODE_DATA(node)) | ||
| 79 | return true; | ||
| 80 | |||
| 81 | last = NODE_DATA(node); | ||
| 82 | } | ||
| 83 | #endif | ||
| 84 | return false; | ||
| 85 | } | ||
| 86 | |||
| 87 | /** | ||
| 88 | * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu | ||
| 89 | * @cpu: cpu to allocate for | ||
| 90 | * @size: size allocation in bytes | ||
| 91 | * @align: alignment | ||
| 92 | * | ||
| 93 | * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper | ||
| 94 | * does the right thing for NUMA regardless of the current | ||
| 95 | * configuration. | ||
| 96 | * | ||
| 97 | * RETURNS: | ||
| 98 | * Pointer to the allocated area on success, NULL on failure. | ||
| 99 | */ | ||
| 100 | static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, | ||
| 101 | unsigned long align) | ||
| 102 | { | ||
| 103 | const unsigned long goal = __pa(MAX_DMA_ADDRESS); | ||
| 104 | #ifdef CONFIG_NEED_MULTIPLE_NODES | ||
| 105 | int node = early_cpu_to_node(cpu); | ||
| 106 | void *ptr; | ||
| 107 | |||
| 108 | if (!node_online(node) || !NODE_DATA(node)) { | ||
| 109 | ptr = __alloc_bootmem_nopanic(size, align, goal); | ||
| 110 | pr_info("cpu %d has no node %d or node-local memory\n", | ||
| 111 | cpu, node); | ||
| 112 | pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", | ||
| 113 | cpu, size, __pa(ptr)); | ||
| 114 | } else { | ||
| 115 | ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), | ||
| 116 | size, align, goal); | ||
| 117 | pr_debug("per cpu data for cpu%d %lu bytes on node%d at " | ||
| 118 | "%016lx\n", cpu, size, node, __pa(ptr)); | ||
| 119 | } | ||
| 120 | return ptr; | ||
| 121 | #else | ||
| 122 | return __alloc_bootmem_nopanic(size, align, goal); | ||
| 123 | #endif | ||
| 124 | } | ||
| 125 | |||
| 126 | /* | ||
| 127 | * Remap allocator | ||
| 128 | * | ||
| 129 | * This allocator uses PMD page as unit. A PMD page is allocated for | ||
| 130 | * each cpu and each is remapped into vmalloc area using PMD mapping. | ||
| 131 | * As PMD page is quite large, only part of it is used for the first | ||
| 132 | * chunk. Unused part is returned to the bootmem allocator. | ||
| 133 | * | ||
| 134 | * So, the PMD pages are mapped twice - once to the physical mapping | ||
| 135 | * and to the vmalloc area for the first percpu chunk. The double | ||
| 136 | * mapping does add one more PMD TLB entry pressure but still is much | ||
| 137 | * better than only using 4k mappings while still being NUMA friendly. | ||
| 138 | */ | ||
| 139 | #ifdef CONFIG_NEED_MULTIPLE_NODES | ||
| 140 | static size_t pcpur_size __initdata; | ||
| 141 | static void **pcpur_ptrs __initdata; | ||
| 142 | |||
| 143 | static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) | ||
| 144 | { | ||
| 145 | size_t off = (size_t)pageno << PAGE_SHIFT; | ||
| 146 | |||
| 147 | if (off >= pcpur_size) | ||
| 148 | return NULL; | ||
| 149 | |||
| 150 | return virt_to_page(pcpur_ptrs[cpu] + off); | ||
| 151 | } | ||
| 152 | |||
| 153 | static ssize_t __init setup_pcpu_remap(size_t static_size) | ||
| 154 | { | ||
| 155 | static struct vm_struct vm; | ||
| 156 | pg_data_t *last; | ||
| 157 | size_t ptrs_size, dyn_size; | ||
| 158 | unsigned int cpu; | ||
| 159 | ssize_t ret; | ||
| 160 | |||
| 161 | /* | ||
| 162 | * If large page isn't supported, there's no benefit in doing | ||
| 163 | * this. Also, on non-NUMA, embedding is better. | ||
| 164 | */ | ||
| 165 | if (!cpu_has_pse || pcpu_need_numa()) | ||
| 166 | return -EINVAL; | ||
| 167 | |||
| 168 | last = NULL; | ||
| 169 | for_each_possible_cpu(cpu) { | ||
| 170 | int node = early_cpu_to_node(cpu); | ||
| 171 | |||
| 172 | if (node_online(node) && NODE_DATA(node) && | ||
| 173 | last && last != NODE_DATA(node)) | ||
| 174 | goto proceed; | ||
| 175 | |||
| 176 | last = NODE_DATA(node); | ||
| 177 | } | ||
| 178 | return -EINVAL; | ||
| 179 | |||
| 180 | proceed: | ||
| 181 | /* | ||
| 182 | * Currently supports only single page. Supporting multiple | ||
| 183 | * pages won't be too difficult if it ever becomes necessary. | ||
| 184 | */ | ||
| 185 | pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + | ||
| 186 | PERCPU_DYNAMIC_RESERVE); | ||
| 187 | if (pcpur_size > PMD_SIZE) { | ||
| 188 | pr_warning("PERCPU: static data is larger than large page, " | ||
| 189 | "can't use large page\n"); | ||
| 190 | return -EINVAL; | ||
| 191 | } | ||
| 192 | dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; | ||
| 193 | |||
| 194 | /* allocate pointer array and alloc large pages */ | ||
| 195 | ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); | ||
| 196 | pcpur_ptrs = alloc_bootmem(ptrs_size); | ||
| 197 | |||
| 198 | for_each_possible_cpu(cpu) { | ||
| 199 | pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); | ||
| 200 | if (!pcpur_ptrs[cpu]) | ||
| 201 | goto enomem; | ||
| 202 | |||
| 203 | /* | ||
| 204 | * Only use pcpur_size bytes and give back the rest. | ||
| 205 | * | ||
| 206 | * Ingo: The 2MB up-rounding bootmem is needed to make | ||
| 207 | * sure the partial 2MB page is still fully RAM - it's | ||
| 208 | * not well-specified to have a PAT-incompatible area | ||
| 209 | * (unmapped RAM, device memory, etc.) in that hole. | ||
| 210 | */ | ||
| 211 | free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), | ||
| 212 | PMD_SIZE - pcpur_size); | ||
| 213 | |||
| 214 | memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); | ||
| 215 | } | ||
| 216 | |||
| 217 | /* allocate address and map */ | ||
| 218 | vm.flags = VM_ALLOC; | ||
| 219 | vm.size = num_possible_cpus() * PMD_SIZE; | ||
| 220 | vm_area_register_early(&vm, PMD_SIZE); | ||
| 221 | |||
| 222 | for_each_possible_cpu(cpu) { | ||
| 223 | pmd_t *pmd; | ||
| 224 | |||
| 225 | pmd = populate_extra_pmd((unsigned long)vm.addr | ||
| 226 | + cpu * PMD_SIZE); | ||
| 227 | set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), | ||
| 228 | PAGE_KERNEL_LARGE)); | ||
| 229 | } | ||
| 230 | |||
| 231 | /* we're ready, commit */ | ||
| 232 | pr_info("PERCPU: Remapped at %p with large pages, static data " | ||
| 233 | "%zu bytes\n", vm.addr, static_size); | ||
| 234 | |||
| 235 | ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, | ||
| 236 | PERCPU_FIRST_CHUNK_RESERVE, | ||
| 237 | PMD_SIZE, dyn_size, vm.addr, NULL); | ||
| 238 | goto out_free_ar; | ||
| 239 | |||
| 240 | enomem: | ||
| 241 | for_each_possible_cpu(cpu) | ||
| 242 | if (pcpur_ptrs[cpu]) | ||
| 243 | free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); | ||
| 244 | ret = -ENOMEM; | ||
| 245 | out_free_ar: | ||
| 246 | free_bootmem(__pa(pcpur_ptrs), ptrs_size); | ||
| 247 | return ret; | ||
| 248 | } | ||
| 249 | #else | ||
| 250 | static ssize_t __init setup_pcpu_remap(size_t static_size) | ||
| 251 | { | ||
| 252 | return -EINVAL; | ||
| 253 | } | ||
| 254 | #endif | ||
| 255 | |||
| 256 | /* | ||
| 257 | * Embedding allocator | ||
| 258 | * | ||
| 259 | * The first chunk is sized to just contain the static area plus | ||
| 260 | * module and dynamic reserves, and allocated as a contiguous area | ||
| 261 | * using bootmem allocator and used as-is without being mapped into | ||
| 262 | * vmalloc area. This enables the first chunk to piggy back on the | ||
| 263 | * linear physical PMD mapping and doesn't add any additional pressure | ||
| 264 | * to TLB. Note that if the needed size is smaller than the minimum | ||
| 265 | * unit size, the leftover is returned to the bootmem allocator. | ||
| 266 | */ | ||
| 267 | static void *pcpue_ptr __initdata; | ||
| 268 | static size_t pcpue_size __initdata; | ||
| 269 | static size_t pcpue_unit_size __initdata; | ||
| 270 | |||
| 271 | static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) | ||
| 272 | { | ||
| 273 | size_t off = (size_t)pageno << PAGE_SHIFT; | ||
| 274 | |||
| 275 | if (off >= pcpue_size) | ||
| 276 | return NULL; | ||
| 277 | |||
| 278 | return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); | ||
| 279 | } | ||
| 280 | |||
| 281 | static ssize_t __init setup_pcpu_embed(size_t static_size) | ||
| 282 | { | ||
| 283 | unsigned int cpu; | ||
| 284 | size_t dyn_size; | ||
| 285 | |||
| 286 | /* | ||
| 287 | * If large page isn't supported, there's no benefit in doing | ||
| 288 | * this. Also, embedding allocation doesn't play well with | ||
| 289 | * NUMA. | ||
| 290 | */ | ||
| 291 | if (!cpu_has_pse || pcpu_need_numa()) | ||
| 292 | return -EINVAL; | ||
| 293 | |||
| 294 | /* allocate and copy */ | ||
| 295 | pcpue_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + | ||
| 296 | PERCPU_DYNAMIC_RESERVE); | ||
| 297 | pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); | ||
| 298 | dyn_size = pcpue_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; | ||
| 299 | |||
| 300 | pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, | ||
| 301 | PAGE_SIZE); | ||
| 302 | if (!pcpue_ptr) | ||
| 303 | return -ENOMEM; | ||
| 304 | |||
| 305 | for_each_possible_cpu(cpu) { | ||
| 306 | void *ptr = pcpue_ptr + cpu * pcpue_unit_size; | ||
| 307 | |||
| 308 | free_bootmem(__pa(ptr + pcpue_size), | ||
| 309 | pcpue_unit_size - pcpue_size); | ||
| 310 | memcpy(ptr, __per_cpu_load, static_size); | ||
| 311 | } | ||
| 312 | |||
| 313 | /* we're ready, commit */ | ||
| 314 | pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", | ||
| 315 | pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); | ||
| 316 | |||
| 317 | return pcpu_setup_first_chunk(pcpue_get_page, static_size, | ||
| 318 | PERCPU_FIRST_CHUNK_RESERVE, | ||
| 319 | pcpue_unit_size, dyn_size, | ||
| 320 | pcpue_ptr, NULL); | ||
| 321 | } | ||
| 322 | |||
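setup_pcpu_embed() sizes the first chunk as the static area plus the module and dynamic reserves, rounds up to a page, and hands any slack back to bootmem. A worked example with made-up figures (the real reserve constants come from percpu.h and the linker-provided static percpu section size):

/* Hypothetical figures, for illustration only. */
size_t static_size = 45056;                     /* 44 KB of DEFINE_PER_CPU data */
size_t reserve     = 8192 + 20480;              /* module + dynamic reserves    */
size_t pcpue_size  = PFN_ALIGN(static_size + reserve);  /* 73728 bytes (72 KB)  */
size_t unit_size   = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);

/* cpu N's slice of the single bootmem allocation starts at a fixed stride:
 *      base_of(N) = pcpue_ptr + N * unit_size
 * and the unused tail of each slice (unit_size - pcpue_size bytes, plus
 * anything past pcpue_size) is returned with free_bootmem(). */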
| 323 | /* | ||
| 324 | * 4k page allocator | ||
| 325 | * | ||
| 326 | * This is the basic allocator. Static percpu area is allocated | ||
| 327 | * page-by-page and most of initialization is done by the generic | ||
| 328 | * setup function. | ||
| 329 | */ | ||
| 330 | static struct page **pcpu4k_pages __initdata; | ||
| 331 | static int pcpu4k_nr_static_pages __initdata; | ||
| 332 | |||
| 333 | static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) | ||
| 334 | { | ||
| 335 | if (pageno < pcpu4k_nr_static_pages) | ||
| 336 | return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; | ||
| 337 | return NULL; | ||
| 338 | } | ||
| 339 | |||
| 340 | static void __init pcpu4k_populate_pte(unsigned long addr) | ||
| 341 | { | ||
| 342 | populate_extra_pte(addr); | ||
| 343 | } | ||
| 344 | |||
| 345 | static ssize_t __init setup_pcpu_4k(size_t static_size) | ||
| 346 | { | ||
| 347 | size_t pages_size; | ||
| 348 | unsigned int cpu; | ||
| 349 | int i, j; | ||
| 350 | ssize_t ret; | ||
| 351 | |||
| 352 | pcpu4k_nr_static_pages = PFN_UP(static_size); | ||
| 353 | |||
| 354 | /* unaligned allocations can't be freed, round up to page size */ | ||
| 355 | pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() | ||
| 356 | * sizeof(pcpu4k_pages[0])); | ||
| 357 | pcpu4k_pages = alloc_bootmem(pages_size); | ||
| 358 | |||
| 359 | /* allocate and copy */ | ||
| 360 | j = 0; | ||
| 361 | for_each_possible_cpu(cpu) | ||
| 362 | for (i = 0; i < pcpu4k_nr_static_pages; i++) { | ||
| 363 | void *ptr; | ||
| 364 | |||
| 365 | ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); | ||
| 366 | if (!ptr) | ||
| 367 | goto enomem; | ||
| 368 | |||
| 369 | memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); | ||
| 370 | pcpu4k_pages[j++] = virt_to_page(ptr); | ||
| 371 | } | ||
| 372 | |||
| 373 | /* we're ready, commit */ | ||
| 374 | pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", | ||
| 375 | pcpu4k_nr_static_pages, static_size); | ||
| 376 | |||
| 377 | ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, | ||
| 378 | PERCPU_FIRST_CHUNK_RESERVE, -1, -1, NULL, | ||
| 379 | pcpu4k_populate_pte); | ||
| 380 | goto out_free_ar; | ||
| 381 | |||
| 382 | enomem: | ||
| 383 | while (--j >= 0) | ||
| 384 | free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); | ||
| 385 | ret = -ENOMEM; | ||
| 386 | out_free_ar: | ||
| 387 | free_bootmem(__pa(pcpu4k_pages), pages_size); | ||
| 388 | return ret; | ||
| 389 | } | ||
| 390 | |||
| 44 | static inline void setup_percpu_segment(int cpu) | 391 | static inline void setup_percpu_segment(int cpu) |
| 45 | { | 392 | { |
| 46 | #ifdef CONFIG_X86_32 | 393 | #ifdef CONFIG_X86_32 |
| @@ -61,38 +408,35 @@ static inline void setup_percpu_segment(int cpu) | |||
| 61 | */ | 408 | */ |
| 62 | void __init setup_per_cpu_areas(void) | 409 | void __init setup_per_cpu_areas(void) |
| 63 | { | 410 | { |
| 64 | ssize_t size; | 411 | size_t static_size = __per_cpu_end - __per_cpu_start; |
| 65 | char *ptr; | 412 | unsigned int cpu; |
| 66 | int cpu; | 413 | unsigned long delta; |
| 67 | 414 | size_t pcpu_unit_size; | |
| 68 | /* Copy section for each CPU (we discard the original) */ | 415 | ssize_t ret; |
| 69 | size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE); | ||
| 70 | 416 | ||
| 71 | pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", | 417 | pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", |
| 72 | NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); | 418 | NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); |
| 73 | 419 | ||
| 74 | pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size); | 420 | /* |
| 421 | * Allocate percpu area. If PSE is supported, try to make use | ||
| 422 | * of large page mappings. Please read comments on top of | ||
| 423 | * each allocator for details. | ||
| 424 | */ | ||
| 425 | ret = setup_pcpu_remap(static_size); | ||
| 426 | if (ret < 0) | ||
| 427 | ret = setup_pcpu_embed(static_size); | ||
| 428 | if (ret < 0) | ||
| 429 | ret = setup_pcpu_4k(static_size); | ||
| 430 | if (ret < 0) | ||
| 431 | panic("cannot allocate static percpu area (%zu bytes, err=%zd)", | ||
| 432 | static_size, ret); | ||
| 75 | 433 | ||
| 76 | for_each_possible_cpu(cpu) { | 434 | pcpu_unit_size = ret; |
| 77 | #ifndef CONFIG_NEED_MULTIPLE_NODES | ||
| 78 | ptr = alloc_bootmem_pages(size); | ||
| 79 | #else | ||
| 80 | int node = early_cpu_to_node(cpu); | ||
| 81 | if (!node_online(node) || !NODE_DATA(node)) { | ||
| 82 | ptr = alloc_bootmem_pages(size); | ||
| 83 | pr_info("cpu %d has no node %d or node-local memory\n", | ||
| 84 | cpu, node); | ||
| 85 | pr_debug("per cpu data for cpu%d at %016lx\n", | ||
| 86 | cpu, __pa(ptr)); | ||
| 87 | } else { | ||
| 88 | ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); | ||
| 89 | pr_debug("per cpu data for cpu%d on node%d at %016lx\n", | ||
| 90 | cpu, node, __pa(ptr)); | ||
| 91 | } | ||
| 92 | #endif | ||
| 93 | 435 | ||
| 94 | memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); | 436 | /* alrighty, percpu areas up and running */ |
| 95 | per_cpu_offset(cpu) = ptr - __per_cpu_start; | 437 | delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; |
| 438 | for_each_possible_cpu(cpu) { | ||
| 439 | per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; | ||
| 96 | per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); | 440 | per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); |
| 97 | per_cpu(cpu_number, cpu) = cpu; | 441 | per_cpu(cpu_number, cpu) = cpu; |
| 98 | setup_percpu_segment(cpu); | 442 | setup_percpu_segment(cpu); |
| @@ -125,8 +469,6 @@ void __init setup_per_cpu_areas(void) | |||
| 125 | */ | 469 | */ |
| 126 | if (cpu == boot_cpu_id) | 470 | if (cpu == boot_cpu_id) |
| 127 | switch_to_new_gdt(cpu); | 471 | switch_to_new_gdt(cpu); |
| 128 | |||
| 129 | DBG("PERCPU: cpu %4d %p\n", cpu, ptr); | ||
| 130 | } | 472 | } |
| 131 | 473 | ||
| 132 | /* indicate the early static arrays will soon be gone */ | 474 | /* indicate the early static arrays will soon be gone */ |
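Once pcpu_setup_first_chunk() has returned the unit size, every CPU's offset is a simple linear function of its index, which is what the loop above records in per_cpu_offset() and this_cpu_off. A sketch of the resulting address arithmetic (the helper is illustrative; the symbols it uses are the kernel's):

/* &per_cpu(var, cpu) == &var (link-time address) + per_cpu_offset(cpu) */
static void *percpu_addr(void *static_addr, unsigned int cpu)
{
        unsigned long delta = (unsigned long)pcpu_base_addr -
                              (unsigned long)__per_cpu_start;

        return (char *)static_addr + delta + cpu * pcpu_unit_size;
}

Previously each CPU got an independently allocated copy and per_cpu_offset() simply stored ptr - __per_cpu_start per CPU; the fixed stride is what lets the new dynamic percpu allocator hand out addresses that work with the same offsets.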
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 249334f5080a..ef7d10170c30 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
| @@ -114,10 +114,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); | |||
| 114 | 114 | ||
| 115 | atomic_t init_deasserted; | 115 | atomic_t init_deasserted; |
| 116 | 116 | ||
| 117 | |||
| 118 | /* Set if we find a B stepping CPU */ | ||
| 119 | static int __cpuinitdata smp_b_stepping; | ||
| 120 | |||
| 121 | #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) | 117 | #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) |
| 122 | 118 | ||
| 123 | /* which logical CPUs are on which nodes */ | 119 | /* which logical CPUs are on which nodes */ |
| @@ -271,8 +267,6 @@ static void __cpuinit smp_callin(void) | |||
| 271 | cpumask_set_cpu(cpuid, cpu_callin_mask); | 267 | cpumask_set_cpu(cpuid, cpu_callin_mask); |
| 272 | } | 268 | } |
| 273 | 269 | ||
| 274 | static int __cpuinitdata unsafe_smp; | ||
| 275 | |||
| 276 | /* | 270 | /* |
| 277 | * Activate a secondary processor. | 271 | * Activate a secondary processor. |
| 278 | */ | 272 | */ |
| @@ -340,76 +334,6 @@ notrace static void __cpuinit start_secondary(void *unused) | |||
| 340 | cpu_idle(); | 334 | cpu_idle(); |
| 341 | } | 335 | } |
| 342 | 336 | ||
| 343 | static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) | ||
| 344 | { | ||
| 345 | /* | ||
| 346 | * Mask B, Pentium, but not Pentium MMX | ||
| 347 | */ | ||
| 348 | if (c->x86_vendor == X86_VENDOR_INTEL && | ||
| 349 | c->x86 == 5 && | ||
| 350 | c->x86_mask >= 1 && c->x86_mask <= 4 && | ||
| 351 | c->x86_model <= 3) | ||
| 352 | /* | ||
| 353 | * Remember we have B step Pentia with bugs | ||
| 354 | */ | ||
| 355 | smp_b_stepping = 1; | ||
| 356 | |||
| 357 | /* | ||
| 358 | * Certain Athlons might work (for various values of 'work') in SMP | ||
| 359 | * but they are not certified as MP capable. | ||
| 360 | */ | ||
| 361 | if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) { | ||
| 362 | |||
| 363 | if (num_possible_cpus() == 1) | ||
| 364 | goto valid_k7; | ||
| 365 | |||
| 366 | /* Athlon 660/661 is valid. */ | ||
| 367 | if ((c->x86_model == 6) && ((c->x86_mask == 0) || | ||
| 368 | (c->x86_mask == 1))) | ||
| 369 | goto valid_k7; | ||
| 370 | |||
| 371 | /* Duron 670 is valid */ | ||
| 372 | if ((c->x86_model == 7) && (c->x86_mask == 0)) | ||
| 373 | goto valid_k7; | ||
| 374 | |||
| 375 | /* | ||
| 376 | * Athlon 662, Duron 671, and Athlon >model 7 have capability | ||
| 377 | * bit. It's worth noting that the A5 stepping (662) of some | ||
| 378 | * Athlon XP's have the MP bit set. | ||
| 379 | * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for | ||
| 380 | * more. | ||
| 381 | */ | ||
| 382 | if (((c->x86_model == 6) && (c->x86_mask >= 2)) || | ||
| 383 | ((c->x86_model == 7) && (c->x86_mask >= 1)) || | ||
| 384 | (c->x86_model > 7)) | ||
| 385 | if (cpu_has_mp) | ||
| 386 | goto valid_k7; | ||
| 387 | |||
| 388 | /* If we get here, not a certified SMP capable AMD system. */ | ||
| 389 | unsafe_smp = 1; | ||
| 390 | } | ||
| 391 | |||
| 392 | valid_k7: | ||
| 393 | ; | ||
| 394 | } | ||
| 395 | |||
| 396 | static void __cpuinit smp_checks(void) | ||
| 397 | { | ||
| 398 | if (smp_b_stepping) | ||
| 399 | printk(KERN_WARNING "WARNING: SMP operation may be unreliable" | ||
| 400 | "with B stepping processors.\n"); | ||
| 401 | |||
| 402 | /* | ||
| 403 | * Don't taint if we are running SMP kernel on a single non-MP | ||
| 404 | * approved Athlon | ||
| 405 | */ | ||
| 406 | if (unsafe_smp && num_online_cpus() > 1) { | ||
| 407 | printk(KERN_INFO "WARNING: This combination of AMD" | ||
| 408 | "processors is not suitable for SMP.\n"); | ||
| 409 | add_taint(TAINT_UNSAFE_SMP); | ||
| 410 | } | ||
| 411 | } | ||
| 412 | |||
| 413 | /* | 337 | /* |
| 414 | * The bootstrap kernel entry code has set these up. Save them for | 338 | * The bootstrap kernel entry code has set these up. Save them for |
| 415 | * a given CPU | 339 | * a given CPU |
| @@ -423,7 +347,6 @@ void __cpuinit smp_store_cpu_info(int id) | |||
| 423 | c->cpu_index = id; | 347 | c->cpu_index = id; |
| 424 | if (id != 0) | 348 | if (id != 0) |
| 425 | identify_secondary_cpu(c); | 349 | identify_secondary_cpu(c); |
| 426 | smp_apply_quirks(c); | ||
| 427 | } | 350 | } |
| 428 | 351 | ||
| 429 | 352 | ||
| @@ -1193,7 +1116,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus) | |||
| 1193 | pr_debug("Boot done.\n"); | 1116 | pr_debug("Boot done.\n"); |
| 1194 | 1117 | ||
| 1195 | impress_friends(); | 1118 | impress_friends(); |
| 1196 | smp_checks(); | ||
| 1197 | #ifdef CONFIG_X86_IO_APIC | 1119 | #ifdef CONFIG_X86_IO_APIC |
| 1198 | setup_ioapic_dest(); | 1120 | setup_ioapic_dest(); |
| 1199 | #endif | 1121 | #endif |
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index f04549afcfe9..d038b9c45cf8 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c | |||
| @@ -314,8 +314,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, | |||
| 314 | int locals = 0; | 314 | int locals = 0; |
| 315 | struct bau_desc *bau_desc; | 315 | struct bau_desc *bau_desc; |
| 316 | 316 | ||
| 317 | WARN_ON(!in_atomic()); | ||
| 318 | |||
| 319 | cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); | 317 | cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); |
| 320 | 318 | ||
| 321 | uv_cpu = uv_blade_processor_id(); | 319 | uv_cpu = uv_blade_processor_id(); |
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c new file mode 100644 index 000000000000..2ffb6c53326e --- /dev/null +++ b/arch/x86/kernel/uv_time.c | |||
| @@ -0,0 +1,393 @@ | |||
| 1 | /* | ||
| 2 | * SGI RTC clock/timer routines. | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, write to the Free Software | ||
| 16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
| 17 | * | ||
| 18 | * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved. | ||
| 19 | * Copyright (c) Dimitri Sivanich | ||
| 20 | */ | ||
| 21 | #include <linux/clockchips.h> | ||
| 22 | |||
| 23 | #include <asm/uv/uv_mmrs.h> | ||
| 24 | #include <asm/uv/uv_hub.h> | ||
| 25 | #include <asm/uv/bios.h> | ||
| 26 | #include <asm/uv/uv.h> | ||
| 27 | #include <asm/apic.h> | ||
| 28 | #include <asm/cpu.h> | ||
| 29 | |||
| 30 | #define RTC_NAME "sgi_rtc" | ||
| 31 | |||
| 32 | static cycle_t uv_read_rtc(void); | ||
| 33 | static int uv_rtc_next_event(unsigned long, struct clock_event_device *); | ||
| 34 | static void uv_rtc_timer_setup(enum clock_event_mode, | ||
| 35 | struct clock_event_device *); | ||
| 36 | |||
| 37 | static struct clocksource clocksource_uv = { | ||
| 38 | .name = RTC_NAME, | ||
| 39 | .rating = 400, | ||
| 40 | .read = uv_read_rtc, | ||
| 41 | .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, | ||
| 42 | .shift = 10, | ||
| 43 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | ||
| 44 | }; | ||
| 45 | |||
| 46 | static struct clock_event_device clock_event_device_uv = { | ||
| 47 | .name = RTC_NAME, | ||
| 48 | .features = CLOCK_EVT_FEAT_ONESHOT, | ||
| 49 | .shift = 20, | ||
| 50 | .rating = 400, | ||
| 51 | .irq = -1, | ||
| 52 | .set_next_event = uv_rtc_next_event, | ||
| 53 | .set_mode = uv_rtc_timer_setup, | ||
| 54 | .event_handler = NULL, | ||
| 55 | }; | ||
| 56 | |||
| 57 | static DEFINE_PER_CPU(struct clock_event_device, cpu_ced); | ||
| 58 | |||
| 59 | /* There is one of these allocated per node */ | ||
| 60 | struct uv_rtc_timer_head { | ||
| 61 | spinlock_t lock; | ||
| 62 | /* next cpu waiting for timer, local node relative: */ | ||
| 63 | int next_cpu; | ||
| 64 | /* number of cpus on this node: */ | ||
| 65 | int ncpus; | ||
| 66 | struct { | ||
| 67 | int lcpu; /* systemwide logical cpu number */ | ||
| 68 | u64 expires; /* next timer expiration for this cpu */ | ||
| 69 | } cpu[1]; | ||
| 70 | }; | ||
| 71 | |||
| 72 | /* | ||
| 73 | * Access to uv_rtc_timer_head via blade id. | ||
| 74 | */ | ||
| 75 | static struct uv_rtc_timer_head **blade_info __read_mostly; | ||
| 76 | |||
| 77 | static int uv_rtc_enable; | ||
| 78 | |||
| 79 | /* | ||
| 80 | * Hardware interface routines | ||
| 81 | */ | ||
| 82 | |||
| 83 | /* Send IPIs to another node */ | ||
| 84 | static void uv_rtc_send_IPI(int cpu) | ||
| 85 | { | ||
| 86 | unsigned long apicid, val; | ||
| 87 | int pnode; | ||
| 88 | |||
| 89 | apicid = cpu_physical_id(cpu); | ||
| 90 | pnode = uv_apicid_to_pnode(apicid); | ||
| 91 | val = (1UL << UVH_IPI_INT_SEND_SHFT) | | ||
| 92 | (apicid << UVH_IPI_INT_APIC_ID_SHFT) | | ||
| 93 | (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT); | ||
| 94 | |||
| 95 | uv_write_global_mmr64(pnode, UVH_IPI_INT, val); | ||
| 96 | } | ||
| 97 | |||
| 98 | /* Check for an RTC interrupt pending */ | ||
| 99 | static int uv_intr_pending(int pnode) | ||
| 100 | { | ||
| 101 | return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & | ||
| 102 | UVH_EVENT_OCCURRED0_RTC1_MASK; | ||
| 103 | } | ||
| 104 | |||
| 105 | /* Setup interrupt and return non-zero if early expiration occurred. */ | ||
| 106 | static int uv_setup_intr(int cpu, u64 expires) | ||
| 107 | { | ||
| 108 | u64 val; | ||
| 109 | int pnode = uv_cpu_to_pnode(cpu); | ||
| 110 | |||
| 111 | uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, | ||
| 112 | UVH_RTC1_INT_CONFIG_M_MASK); | ||
| 113 | uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L); | ||
| 114 | |||
| 115 | uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, | ||
| 116 | UVH_EVENT_OCCURRED0_RTC1_MASK); | ||
| 117 | |||
| 118 | val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | | ||
| 119 | ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); | ||
| 120 | |||
| 121 | /* Set configuration */ | ||
| 122 | uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val); | ||
| 123 | /* Initialize comparator value */ | ||
| 124 | uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); | ||
| 125 | |||
| 126 | return (expires < uv_read_rtc() && !uv_intr_pending(pnode)); | ||
| 127 | } | ||
| 128 | |||
| 129 | /* | ||
| 130 | * Per-cpu timer tracking routines | ||
| 131 | */ | ||
| 132 | |||
| 133 | static __init void uv_rtc_deallocate_timers(void) | ||
| 134 | { | ||
| 135 | int bid; | ||
| 136 | |||
| 137 | for_each_possible_blade(bid) { | ||
| 138 | kfree(blade_info[bid]); | ||
| 139 | } | ||
| 140 | kfree(blade_info); | ||
| 141 | } | ||
| 142 | |||
| 143 | /* Allocate per-node list of cpu timer expiration times. */ | ||
| 144 | static __init int uv_rtc_allocate_timers(void) | ||
| 145 | { | ||
| 146 | int cpu; | ||
| 147 | |||
| 148 | blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL); | ||
| 149 | if (!blade_info) | ||
| 150 | return -ENOMEM; | ||
| 151 | memset(blade_info, 0, uv_possible_blades * sizeof(void *)); | ||
| 152 | |||
| 153 | for_each_present_cpu(cpu) { | ||
| 154 | int nid = cpu_to_node(cpu); | ||
| 155 | int bid = uv_cpu_to_blade_id(cpu); | ||
| 156 | int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; | ||
| 157 | struct uv_rtc_timer_head *head = blade_info[bid]; | ||
| 158 | |||
| 159 | if (!head) { | ||
| 160 | head = kmalloc_node(sizeof(struct uv_rtc_timer_head) + | ||
| 161 | (uv_blade_nr_possible_cpus(bid) * | ||
| 162 | 2 * sizeof(u64)), | ||
| 163 | GFP_KERNEL, nid); | ||
| 164 | if (!head) { | ||
| 165 | uv_rtc_deallocate_timers(); | ||
| 166 | return -ENOMEM; | ||
| 167 | } | ||
| 168 | spin_lock_init(&head->lock); | ||
| 169 | head->ncpus = uv_blade_nr_possible_cpus(bid); | ||
| 170 | head->next_cpu = -1; | ||
| 171 | blade_info[bid] = head; | ||
| 172 | } | ||
| 173 | |||
| 174 | head->cpu[bcpu].lcpu = cpu; | ||
| 175 | head->cpu[bcpu].expires = ULLONG_MAX; | ||
| 176 | } | ||
| 177 | |||
| 178 | return 0; | ||
| 179 | } | ||
| 180 | |||
| 181 | /* Find and set the next expiring timer. */ | ||
| 182 | static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode) | ||
| 183 | { | ||
| 184 | u64 lowest = ULLONG_MAX; | ||
| 185 | int c, bcpu = -1; | ||
| 186 | |||
| 187 | head->next_cpu = -1; | ||
| 188 | for (c = 0; c < head->ncpus; c++) { | ||
| 189 | u64 exp = head->cpu[c].expires; | ||
| 190 | if (exp < lowest) { | ||
| 191 | bcpu = c; | ||
| 192 | lowest = exp; | ||
| 193 | } | ||
| 194 | } | ||
| 195 | if (bcpu >= 0) { | ||
| 196 | head->next_cpu = bcpu; | ||
| 197 | c = head->cpu[bcpu].lcpu; | ||
| 198 | if (uv_setup_intr(c, lowest)) | ||
| 199 | /* If we didn't set it up in time, trigger */ | ||
| 200 | uv_rtc_send_IPI(c); | ||
| 201 | } else { | ||
| 202 | uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, | ||
| 203 | UVH_RTC1_INT_CONFIG_M_MASK); | ||
| 204 | } | ||
| 205 | } | ||
| 206 | |||
| 207 | /* | ||
| 208 | * Set expiration time for current cpu. | ||
| 209 | * | ||
| 210 | * Returns 1 if we missed the expiration time. | ||
| 211 | */ | ||
| 212 | static int uv_rtc_set_timer(int cpu, u64 expires) | ||
| 213 | { | ||
| 214 | int pnode = uv_cpu_to_pnode(cpu); | ||
| 215 | int bid = uv_cpu_to_blade_id(cpu); | ||
| 216 | struct uv_rtc_timer_head *head = blade_info[bid]; | ||
| 217 | int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; | ||
| 218 | u64 *t = &head->cpu[bcpu].expires; | ||
| 219 | unsigned long flags; | ||
| 220 | int next_cpu; | ||
| 221 | |||
| 222 | spin_lock_irqsave(&head->lock, flags); | ||
| 223 | |||
| 224 | next_cpu = head->next_cpu; | ||
| 225 | *t = expires; | ||
| 226 | /* Will this one be next to go off? */ | ||
| 227 | if (next_cpu < 0 || bcpu == next_cpu || | ||
| 228 | expires < head->cpu[next_cpu].expires) { | ||
| 229 | head->next_cpu = bcpu; | ||
| 230 | if (uv_setup_intr(cpu, expires)) { | ||
| 231 | *t = ULLONG_MAX; | ||
| 232 | uv_rtc_find_next_timer(head, pnode); | ||
| 233 | spin_unlock_irqrestore(&head->lock, flags); | ||
| 234 | return 1; | ||
| 235 | } | ||
| 236 | } | ||
| 237 | |||
| 238 | spin_unlock_irqrestore(&head->lock, flags); | ||
| 239 | return 0; | ||
| 240 | } | ||
| 241 | |||
| 242 | /* | ||
| 243 | * Unset expiration time for current cpu. | ||
| 244 | * | ||
| 245 | * Returns 1 if this timer was pending. | ||
| 246 | */ | ||
| 247 | static int uv_rtc_unset_timer(int cpu) | ||
| 248 | { | ||
| 249 | int pnode = uv_cpu_to_pnode(cpu); | ||
| 250 | int bid = uv_cpu_to_blade_id(cpu); | ||
| 251 | struct uv_rtc_timer_head *head = blade_info[bid]; | ||
| 252 | int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; | ||
| 253 | u64 *t = &head->cpu[bcpu].expires; | ||
| 254 | unsigned long flags; | ||
| 255 | int rc = 0; | ||
| 256 | |||
| 257 | spin_lock_irqsave(&head->lock, flags); | ||
| 258 | |||
| 259 | if (head->next_cpu == bcpu && uv_read_rtc() >= *t) | ||
| 260 | rc = 1; | ||
| 261 | |||
| 262 | *t = ULLONG_MAX; | ||
| 263 | |||
| 264 | /* Was the hardware set up for this timer? */ | ||
| 265 | if (head->next_cpu == bcpu) | ||
| 266 | uv_rtc_find_next_timer(head, pnode); | ||
| 267 | |||
| 268 | spin_unlock_irqrestore(&head->lock, flags); | ||
| 269 | |||
| 270 | return rc; | ||
| 271 | } | ||
| 272 | |||
| 273 | |||
| 274 | /* | ||
| 275 | * Kernel interface routines. | ||
| 276 | */ | ||
| 277 | |||
| 278 | /* | ||
| 279 | * Read the RTC. | ||
| 280 | */ | ||
| 281 | static cycle_t uv_read_rtc(void) | ||
| 282 | { | ||
| 283 | return (cycle_t)uv_read_local_mmr(UVH_RTC); | ||
| 284 | } | ||
| 285 | |||
| 286 | /* | ||
| 287 | * Program the next event, relative to now | ||
| 288 | */ | ||
| 289 | static int uv_rtc_next_event(unsigned long delta, | ||
| 290 | struct clock_event_device *ced) | ||
| 291 | { | ||
| 292 | int ced_cpu = cpumask_first(ced->cpumask); | ||
| 293 | |||
| 294 | return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc()); | ||
| 295 | } | ||
| 296 | |||
| 297 | /* | ||
| 298 | * Setup the RTC timer in oneshot mode | ||
| 299 | */ | ||
| 300 | static void uv_rtc_timer_setup(enum clock_event_mode mode, | ||
| 301 | struct clock_event_device *evt) | ||
| 302 | { | ||
| 303 | int ced_cpu = cpumask_first(evt->cpumask); | ||
| 304 | |||
| 305 | switch (mode) { | ||
| 306 | case CLOCK_EVT_MODE_PERIODIC: | ||
| 307 | case CLOCK_EVT_MODE_ONESHOT: | ||
| 308 | case CLOCK_EVT_MODE_RESUME: | ||
| 309 | /* Nothing to do here yet */ | ||
| 310 | break; | ||
| 311 | case CLOCK_EVT_MODE_UNUSED: | ||
| 312 | case CLOCK_EVT_MODE_SHUTDOWN: | ||
| 313 | uv_rtc_unset_timer(ced_cpu); | ||
| 314 | break; | ||
| 315 | } | ||
| 316 | } | ||
| 317 | |||
| 318 | static void uv_rtc_interrupt(void) | ||
| 319 | { | ||
| 320 | struct clock_event_device *ced = &__get_cpu_var(cpu_ced); | ||
| 321 | int cpu = smp_processor_id(); | ||
| 322 | |||
| 323 | if (!ced || !ced->event_handler) | ||
| 324 | return; | ||
| 325 | |||
| 326 | if (uv_rtc_unset_timer(cpu) != 1) | ||
| 327 | return; | ||
| 328 | |||
| 329 | ced->event_handler(ced); | ||
| 330 | } | ||
| 331 | |||
| 332 | static int __init uv_enable_rtc(char *str) | ||
| 333 | { | ||
| 334 | uv_rtc_enable = 1; | ||
| 335 | |||
| 336 | return 1; | ||
| 337 | } | ||
| 338 | __setup("uvrtc", uv_enable_rtc); | ||
| 339 | |||
| 340 | static __init void uv_rtc_register_clockevents(struct work_struct *dummy) | ||
| 341 | { | ||
| 342 | struct clock_event_device *ced = &__get_cpu_var(cpu_ced); | ||
| 343 | |||
| 344 | *ced = clock_event_device_uv; | ||
| 345 | ced->cpumask = cpumask_of(smp_processor_id()); | ||
| 346 | clockevents_register_device(ced); | ||
| 347 | } | ||
| 348 | |||
| 349 | static __init int uv_rtc_setup_clock(void) | ||
| 350 | { | ||
| 351 | int rc; | ||
| 352 | |||
| 353 | if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) | ||
| 354 | return -ENODEV; | ||
| 355 | |||
| 356 | generic_interrupt_extension = uv_rtc_interrupt; | ||
| 357 | |||
| 358 | clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, | ||
| 359 | clocksource_uv.shift); | ||
| 360 | |||
| 361 | rc = clocksource_register(&clocksource_uv); | ||
| 362 | if (rc) { | ||
| 363 | generic_interrupt_extension = NULL; | ||
| 364 | return rc; | ||
| 365 | } | ||
| 366 | |||
| 367 | /* Setup and register clockevents */ | ||
| 368 | rc = uv_rtc_allocate_timers(); | ||
| 369 | if (rc) { | ||
| 370 | clocksource_unregister(&clocksource_uv); | ||
| 371 | generic_interrupt_extension = NULL; | ||
| 372 | return rc; | ||
| 373 | } | ||
| 374 | |||
| 375 | clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, | ||
| 376 | NSEC_PER_SEC, clock_event_device_uv.shift); | ||
| 377 | |||
| 378 | clock_event_device_uv.min_delta_ns = NSEC_PER_SEC / | ||
| 379 | sn_rtc_cycles_per_second; | ||
| 380 | |||
| 381 | clock_event_device_uv.max_delta_ns = clocksource_uv.mask * | ||
| 382 | (NSEC_PER_SEC / sn_rtc_cycles_per_second); | ||
| 383 | |||
| 384 | rc = schedule_on_each_cpu(uv_rtc_register_clockevents); | ||
| 385 | if (rc) { | ||
| 386 | clocksource_unregister(&clocksource_uv); | ||
| 387 | generic_interrupt_extension = NULL; | ||
| 388 | uv_rtc_deallocate_timers(); | ||
| 389 | } | ||
| 390 | |||
| 391 | return rc; | ||
| 392 | } | ||
| 393 | arch_initcall(uv_rtc_setup_clock); | ||
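The shift values chosen above (20 for the clock_event_device, 10 for the clocksource) let the generic timekeeping code convert between nanoseconds and RTC cycles with integer multiply-and-shift. Below is a minimal userspace sketch of that arithmetic, assuming a hypothetical 25 MHz RTC; the real rate comes from sn_rtc_cycles_per_second as supplied by the UV BIOS, and the in-kernel helpers div_sc() and clocksource_hz2mult() handle rounding slightly differently than this plain division.

	#include <stdint.h>
	#include <stdio.h>

	/* Hypothetical RTC rate; the driver gets the real one from the UV BIOS. */
	#define RTC_HZ       25000000ULL
	#define NSEC_PER_SEC 1000000000ULL

	int main(void)
	{
		/* div_sc(RTC_HZ, NSEC_PER_SEC, 20): cycles per ns, scaled by 2^20 */
		uint64_t ce_mult  = (RTC_HZ << 20) / NSEC_PER_SEC;

		/* clockevents core: delta in ns -> RTC cycles for set_next_event() */
		uint64_t delta_ns = 1000000;                     /* 1 ms */
		uint64_t cycles   = (delta_ns * ce_mult) >> 20;  /* ~25000 at 25 MHz */

		/* clocksource_hz2mult(RTC_HZ, 10): ns per cycle, scaled by 2^10 */
		uint64_t cs_mult  = (NSEC_PER_SEC << 10) / RTC_HZ;
		uint64_t back_ns  = (cycles * cs_mult) >> 10;    /* ~1 ms again */

		printf("mult=%llu cycles=%llu ns=%llu\n",
		       (unsigned long long)ce_mult,
		       (unsigned long long)cycles,
		       (unsigned long long)back_ns);
		return 0;
	}

With these parameters a 1 ms programmed delta maps to roughly 25 000 RTC cycles and converts back to roughly 1 ms, which is the round trip that set_next_event() and the clocksource read path rely on.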
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index fbfced6f6800..5bf54e40c6ef 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S | |||
| @@ -275,3 +275,10 @@ ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), | |||
| 275 | ASSERT((per_cpu__irq_stack_union == 0), | 275 | ASSERT((per_cpu__irq_stack_union == 0), |
| 276 | "irq_stack_union is not at start of per-cpu area"); | 276 | "irq_stack_union is not at start of per-cpu area"); |
| 277 | #endif | 277 | #endif |
| 278 | |||
| 279 | #ifdef CONFIG_KEXEC | ||
| 280 | #include <asm/kexec.h> | ||
| 281 | |||
| 282 | ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, | ||
| 283 | "kexec control code size is too big") | ||
| 284 | #endif | ||
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index f3a5305b8adf..9fe4ddaa8f6f 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c | |||
| @@ -348,6 +348,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, | |||
| 348 | * flush_tlb_user() for both user and kernel mappings unless | 348 | * flush_tlb_user() for both user and kernel mappings unless |
| 349 | * the Page Global Enable (PGE) feature bit is set. */ | 349 | * the Page Global Enable (PGE) feature bit is set. */ |
| 350 | *dx |= 0x00002000; | 350 | *dx |= 0x00002000; |
| 351 | /* We also lie, and say we're family id 5. 6 or greater | ||
| 352 | * leads to a rdmsr in early_init_intel which we can't handle. | ||
| 353 | * Family ID is returned as bits 8-11 in ax. */ | ||
| 354 | *ax &= 0xFFFFF0FF; | ||
| 355 | *ax |= 0x00000500; | ||
| 351 | break; | 356 | break; |
| 352 | case 0x80000000: | 357 | case 0x80000000: |
| 353 | /* Futureproof this a little: if they ask how much extended | 358 | /* Futureproof this a little: if they ask how much extended |
| @@ -594,19 +599,21 @@ static void __init lguest_init_IRQ(void) | |||
| 594 | /* Some systems map "vectors" to interrupts weirdly. Lguest has | 599 | /* Some systems map "vectors" to interrupts weirdly. Lguest has |
| 595 | * a straightforward 1 to 1 mapping, so force that here. */ | 600 | * a straightforward 1 to 1 mapping, so force that here. */ |
| 596 | __get_cpu_var(vector_irq)[vector] = i; | 601 | __get_cpu_var(vector_irq)[vector] = i; |
| 597 | if (vector != SYSCALL_VECTOR) { | 602 | if (vector != SYSCALL_VECTOR) |
| 598 | set_intr_gate(vector, | 603 | set_intr_gate(vector, interrupt[i]); |
| 599 | interrupt[vector-FIRST_EXTERNAL_VECTOR]); | ||
| 600 | set_irq_chip_and_handler_name(i, &lguest_irq_controller, | ||
| 601 | handle_level_irq, | ||
| 602 | "level"); | ||
| 603 | } | ||
| 604 | } | 604 | } |
| 605 | /* This call is required to set up for 4k stacks, where we have | 605 | /* This call is required to set up for 4k stacks, where we have |
| 606 | * separate stacks for hard and soft interrupts. */ | 606 | * separate stacks for hard and soft interrupts. */ |
| 607 | irq_ctx_init(smp_processor_id()); | 607 | irq_ctx_init(smp_processor_id()); |
| 608 | } | 608 | } |
| 609 | 609 | ||
| 610 | void lguest_setup_irq(unsigned int irq) | ||
| 611 | { | ||
| 612 | irq_to_desc_alloc_cpu(irq, 0); | ||
| 613 | set_irq_chip_and_handler_name(irq, &lguest_irq_controller, | ||
| 614 | handle_level_irq, "level"); | ||
| 615 | } | ||
| 616 | |||
| 610 | /* | 617 | /* |
| 611 | * Time. | 618 | * Time. |
| 612 | * | 619 | * |
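The CPUID adjustment above operates on the 4-bit family field in EAX, bits 8 through 11. A standalone sketch of the masking arithmetic, using a made-up family-6 EAX value for illustration (not part of the patch):

	#include <stdint.h>
	#include <stdio.h>

	/* Mirror of the masking done in lguest_cpuid(): force the family field to 5. */
	static uint32_t force_family_5(uint32_t eax)
	{
		eax &= 0xFFFFF0FF;	/* clear family, bits 8-11 */
		eax |= 0x00000500;	/* write family = 5 */
		return eax;
	}

	int main(void)
	{
		uint32_t eax = 0x000006fb;	/* hypothetical family-6 CPUID.1 EAX */

		printf("family before: %u\n", (eax >> 8) & 0xf);
		eax = force_family_5(eax);
		printf("family after:  %u\n", (eax >> 8) & 0xf);
		return 0;
	}

Reporting family 5 keeps early_init_intel() from issuing the MSR read it performs for family 6 and above, which the lguest host does not emulate.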
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c index 491e737ce547..aa0987088774 100644 --- a/arch/x86/math-emu/fpu_aux.c +++ b/arch/x86/math-emu/fpu_aux.c | |||
| @@ -30,20 +30,29 @@ static void fclex(void) | |||
| 30 | } | 30 | } |
| 31 | 31 | ||
| 32 | /* Needs to be externally visible */ | 32 | /* Needs to be externally visible */ |
| 33 | void finit(void) | 33 | void finit_task(struct task_struct *tsk) |
| 34 | { | 34 | { |
| 35 | control_word = 0x037f; | 35 | struct i387_soft_struct *soft = &tsk->thread.xstate->soft; |
| 36 | partial_status = 0; | 36 | struct address *oaddr, *iaddr; |
| 37 | top = 0; /* We don't keep top in the status word internally. */ | 37 | soft->cwd = 0x037f; |
| 38 | fpu_tag_word = 0xffff; | 38 | soft->swd = 0; |
| 39 | soft->ftop = 0; /* We don't keep top in the status word internally. */ | ||
| 40 | soft->twd = 0xffff; | ||
| 39 | /* The behaviour is different from that detailed in | 41 | /* The behaviour is different from that detailed in |
| 40 | Section 15.1.6 of the Intel manual */ | 42 | Section 15.1.6 of the Intel manual */ |
| 41 | operand_address.offset = 0; | 43 | oaddr = (struct address *)&soft->foo; |
| 42 | operand_address.selector = 0; | 44 | oaddr->offset = 0; |
| 43 | instruction_address.offset = 0; | 45 | oaddr->selector = 0; |
| 44 | instruction_address.selector = 0; | 46 | iaddr = (struct address *)&soft->fip; |
| 45 | instruction_address.opcode = 0; | 47 | iaddr->offset = 0; |
| 46 | no_ip_update = 1; | 48 | iaddr->selector = 0; |
| 49 | iaddr->opcode = 0; | ||
| 50 | soft->no_update = 1; | ||
| 51 | } | ||
| 52 | |||
| 53 | void finit(void) | ||
| 54 | { | ||
| 55 | finit_task(current); | ||
| 47 | } | 56 | } |
| 48 | 57 | ||
| 49 | /* | 58 | /* |
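The value 0x037f written into soft->cwd is the same control word a hardware FNINIT loads: all exceptions masked, 64-bit precision, round to nearest. A small decode of that constant, following the standard x87 control-word layout (illustration only, not part of the patch):

	#include <stdio.h>

	int main(void)
	{
		unsigned cw = 0x037f;	/* reset control word used by finit_task() */

		printf("exception masks (bits 0-5):    0x%02x\n", cw & 0x3f);	/* all six masked */
		printf("precision control (bits 8-9):  %u\n", (cw >> 8) & 0x3);	/* 3 = 64-bit */
		printf("rounding control (bits 10-11): %u\n", (cw >> 10) & 0x3);	/* 0 = nearest */
		return 0;
	}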
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 00f127c80b0e..d11745334a67 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c | |||
| @@ -158,7 +158,6 @@ EXPORT_SYMBOL(kunmap); | |||
| 158 | EXPORT_SYMBOL(kmap_atomic); | 158 | EXPORT_SYMBOL(kmap_atomic); |
| 159 | EXPORT_SYMBOL(kunmap_atomic); | 159 | EXPORT_SYMBOL(kunmap_atomic); |
| 160 | 160 | ||
| 161 | #ifdef CONFIG_NUMA | ||
| 162 | void __init set_highmem_pages_init(void) | 161 | void __init set_highmem_pages_init(void) |
| 163 | { | 162 | { |
| 164 | struct zone *zone; | 163 | struct zone *zone; |
| @@ -182,11 +181,3 @@ void __init set_highmem_pages_init(void) | |||
| 182 | } | 181 | } |
| 183 | totalram_pages += totalhigh_pages; | 182 | totalram_pages += totalhigh_pages; |
| 184 | } | 183 | } |
| 185 | #else | ||
| 186 | void __init set_highmem_pages_init(void) | ||
| 187 | { | ||
| 188 | add_highpages_with_active_regions(0, highstart_pfn, highend_pfn); | ||
| 189 | |||
| 190 | totalram_pages += totalhigh_pages; | ||
| 191 | } | ||
| 192 | #endif /* CONFIG_NUMA */ | ||
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index ce6a722587d8..15219e0d1243 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c | |||
| @@ -1,8 +1,345 @@ | |||
| 1 | #include <linux/ioport.h> | ||
| 1 | #include <linux/swap.h> | 2 | #include <linux/swap.h> |
| 3 | |||
| 2 | #include <asm/cacheflush.h> | 4 | #include <asm/cacheflush.h> |
| 5 | #include <asm/e820.h> | ||
| 6 | #include <asm/init.h> | ||
| 3 | #include <asm/page.h> | 7 | #include <asm/page.h> |
| 8 | #include <asm/page_types.h> | ||
| 4 | #include <asm/sections.h> | 9 | #include <asm/sections.h> |
| 5 | #include <asm/system.h> | 10 | #include <asm/system.h> |
| 11 | #include <asm/tlbflush.h> | ||
| 12 | |||
| 13 | unsigned long __initdata e820_table_start; | ||
| 14 | unsigned long __meminitdata e820_table_end; | ||
| 15 | unsigned long __meminitdata e820_table_top; | ||
| 16 | |||
| 17 | int after_bootmem; | ||
| 18 | |||
| 19 | int direct_gbpages | ||
| 20 | #ifdef CONFIG_DIRECT_GBPAGES | ||
| 21 | = 1 | ||
| 22 | #endif | ||
| 23 | ; | ||
| 24 | |||
| 25 | static void __init find_early_table_space(unsigned long end, int use_pse, | ||
| 26 | int use_gbpages) | ||
| 27 | { | ||
| 28 | unsigned long puds, pmds, ptes, tables, start; | ||
| 29 | |||
| 30 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; | ||
| 31 | tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); | ||
| 32 | |||
| 33 | if (use_gbpages) { | ||
| 34 | unsigned long extra; | ||
| 35 | |||
| 36 | extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); | ||
| 37 | pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; | ||
| 38 | } else | ||
| 39 | pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; | ||
| 40 | |||
| 41 | tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); | ||
| 42 | |||
| 43 | if (use_pse) { | ||
| 44 | unsigned long extra; | ||
| 45 | |||
| 46 | extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); | ||
| 47 | #ifdef CONFIG_X86_32 | ||
| 48 | extra += PMD_SIZE; | ||
| 49 | #endif | ||
| 50 | ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 51 | } else | ||
| 52 | ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 53 | |||
| 54 | tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); | ||
| 55 | |||
| 56 | #ifdef CONFIG_X86_32 | ||
| 57 | /* for fixmap */ | ||
| 58 | tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); | ||
| 59 | #endif | ||
| 60 | |||
| 61 | /* | ||
| 62 | * RED-PEN putting page tables only on node 0 could | ||
| 63 | * cause a hotspot and fill up ZONE_DMA. The page tables | ||
| 64 | * need roughly 0.5KB per GB. | ||
| 65 | */ | ||
| 66 | #ifdef CONFIG_X86_32 | ||
| 67 | start = 0x7000; | ||
| 68 | e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, | ||
| 69 | tables, PAGE_SIZE); | ||
| 70 | #else /* CONFIG_X86_64 */ | ||
| 71 | start = 0x8000; | ||
| 72 | e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE); | ||
| 73 | #endif | ||
| 74 | if (e820_table_start == -1UL) | ||
| 75 | panic("Cannot find space for the kernel page tables"); | ||
| 76 | |||
| 77 | e820_table_start >>= PAGE_SHIFT; | ||
| 78 | e820_table_end = e820_table_start; | ||
| 79 | e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); | ||
| 80 | |||
| 81 | printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", | ||
| 82 | end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); | ||
| 83 | } | ||
| 84 | |||
| 85 | struct map_range { | ||
| 86 | unsigned long start; | ||
| 87 | unsigned long end; | ||
| 88 | unsigned page_size_mask; | ||
| 89 | }; | ||
| 90 | |||
| 91 | #ifdef CONFIG_X86_32 | ||
| 92 | #define NR_RANGE_MR 3 | ||
| 93 | #else /* CONFIG_X86_64 */ | ||
| 94 | #define NR_RANGE_MR 5 | ||
| 95 | #endif | ||
| 96 | |||
| 97 | static int save_mr(struct map_range *mr, int nr_range, | ||
| 98 | unsigned long start_pfn, unsigned long end_pfn, | ||
| 99 | unsigned long page_size_mask) | ||
| 100 | { | ||
| 101 | if (start_pfn < end_pfn) { | ||
| 102 | if (nr_range >= NR_RANGE_MR) | ||
| 103 | panic("run out of range for init_memory_mapping\n"); | ||
| 104 | mr[nr_range].start = start_pfn<<PAGE_SHIFT; | ||
| 105 | mr[nr_range].end = end_pfn<<PAGE_SHIFT; | ||
| 106 | mr[nr_range].page_size_mask = page_size_mask; | ||
| 107 | nr_range++; | ||
| 108 | } | ||
| 109 | |||
| 110 | return nr_range; | ||
| 111 | } | ||
| 112 | |||
| 113 | #ifdef CONFIG_X86_64 | ||
| 114 | static void __init init_gbpages(void) | ||
| 115 | { | ||
| 116 | if (direct_gbpages && cpu_has_gbpages) | ||
| 117 | printk(KERN_INFO "Using GB pages for direct mapping\n"); | ||
| 118 | else | ||
| 119 | direct_gbpages = 0; | ||
| 120 | } | ||
| 121 | #else | ||
| 122 | static inline void init_gbpages(void) | ||
| 123 | { | ||
| 124 | } | ||
| 125 | #endif | ||
| 126 | |||
| 127 | /* | ||
| 128 | * Setup the direct mapping of the physical memory at PAGE_OFFSET. | ||
| 129 | * This runs before bootmem is initialized and gets pages directly from | ||
| 130 | * the physical memory. To access them they are temporarily mapped. | ||
| 131 | */ | ||
| 132 | unsigned long __init_refok init_memory_mapping(unsigned long start, | ||
| 133 | unsigned long end) | ||
| 134 | { | ||
| 135 | unsigned long page_size_mask = 0; | ||
| 136 | unsigned long start_pfn, end_pfn; | ||
| 137 | unsigned long ret = 0; | ||
| 138 | unsigned long pos; | ||
| 139 | |||
| 140 | struct map_range mr[NR_RANGE_MR]; | ||
| 141 | int nr_range, i; | ||
| 142 | int use_pse, use_gbpages; | ||
| 143 | |||
| 144 | printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); | ||
| 145 | |||
| 146 | if (!after_bootmem) | ||
| 147 | init_gbpages(); | ||
| 148 | |||
| 149 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
| 150 | /* | ||
| 151 | * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. | ||
| 152 | * This will simplify cpa(), which otherwise needs to support splitting | ||
| 153 | * large pages into small in interrupt context, etc. | ||
| 154 | */ | ||
| 155 | use_pse = use_gbpages = 0; | ||
| 156 | #else | ||
| 157 | use_pse = cpu_has_pse; | ||
| 158 | use_gbpages = direct_gbpages; | ||
| 159 | #endif | ||
| 160 | |||
| 161 | #ifdef CONFIG_X86_32 | ||
| 162 | #ifdef CONFIG_X86_PAE | ||
| 163 | set_nx(); | ||
| 164 | if (nx_enabled) | ||
| 165 | printk(KERN_INFO "NX (Execute Disable) protection: active\n"); | ||
| 166 | #endif | ||
| 167 | |||
| 168 | /* Enable PSE if available */ | ||
| 169 | if (cpu_has_pse) | ||
| 170 | set_in_cr4(X86_CR4_PSE); | ||
| 171 | |||
| 172 | /* Enable PGE if available */ | ||
| 173 | if (cpu_has_pge) { | ||
| 174 | set_in_cr4(X86_CR4_PGE); | ||
| 175 | __supported_pte_mask |= _PAGE_GLOBAL; | ||
| 176 | } | ||
| 177 | #endif | ||
| 178 | |||
| 179 | if (use_gbpages) | ||
| 180 | page_size_mask |= 1 << PG_LEVEL_1G; | ||
| 181 | if (use_pse) | ||
| 182 | page_size_mask |= 1 << PG_LEVEL_2M; | ||
| 183 | |||
| 184 | memset(mr, 0, sizeof(mr)); | ||
| 185 | nr_range = 0; | ||
| 186 | |||
| 187 | /* Head: range up to the first big-page boundary, mapped with 4k pages */ | ||
| 188 | start_pfn = start >> PAGE_SHIFT; | ||
| 189 | pos = start_pfn << PAGE_SHIFT; | ||
| 190 | #ifdef CONFIG_X86_32 | ||
| 191 | /* | ||
| 192 | * Don't use a large page for the first 2/4MB of memory | ||
| 193 | * because there are often fixed size MTRRs in there | ||
| 194 | * and overlapping MTRRs into large pages can cause | ||
| 195 | * slowdowns. | ||
| 196 | */ | ||
| 197 | if (pos == 0) | ||
| 198 | end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); | ||
| 199 | else | ||
| 200 | end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) | ||
| 201 | << (PMD_SHIFT - PAGE_SHIFT); | ||
| 202 | #else /* CONFIG_X86_64 */ | ||
| 203 | end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) | ||
| 204 | << (PMD_SHIFT - PAGE_SHIFT); | ||
| 205 | #endif | ||
| 206 | if (end_pfn > (end >> PAGE_SHIFT)) | ||
| 207 | end_pfn = end >> PAGE_SHIFT; | ||
| 208 | if (start_pfn < end_pfn) { | ||
| 209 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); | ||
| 210 | pos = end_pfn << PAGE_SHIFT; | ||
| 211 | } | ||
| 212 | |||
| 213 | /* big page (2M) range */ | ||
| 214 | start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) | ||
| 215 | << (PMD_SHIFT - PAGE_SHIFT); | ||
| 216 | #ifdef CONFIG_X86_32 | ||
| 217 | end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); | ||
| 218 | #else /* CONFIG_X86_64 */ | ||
| 219 | end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) | ||
| 220 | << (PUD_SHIFT - PAGE_SHIFT); | ||
| 221 | if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) | ||
| 222 | end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); | ||
| 223 | #endif | ||
| 224 | |||
| 225 | if (start_pfn < end_pfn) { | ||
| 226 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, | ||
| 227 | page_size_mask & (1<<PG_LEVEL_2M)); | ||
| 228 | pos = end_pfn << PAGE_SHIFT; | ||
| 229 | } | ||
| 230 | |||
| 231 | #ifdef CONFIG_X86_64 | ||
| 232 | /* big page (1G) range */ | ||
| 233 | start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) | ||
| 234 | << (PUD_SHIFT - PAGE_SHIFT); | ||
| 235 | end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); | ||
| 236 | if (start_pfn < end_pfn) { | ||
| 237 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, | ||
| 238 | page_size_mask & | ||
| 239 | ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); | ||
| 240 | pos = end_pfn << PAGE_SHIFT; | ||
| 241 | } | ||
| 242 | |||
| 243 | /* Tail: range below the last 1G boundary, mapped with 2M pages */ | ||
| 244 | start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) | ||
| 245 | << (PMD_SHIFT - PAGE_SHIFT); | ||
| 246 | end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); | ||
| 247 | if (start_pfn < end_pfn) { | ||
| 248 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, | ||
| 249 | page_size_mask & (1<<PG_LEVEL_2M)); | ||
| 250 | pos = end_pfn << PAGE_SHIFT; | ||
| 251 | } | ||
| 252 | #endif | ||
| 253 | |||
| 254 | /* Tail: remainder below the last 2M boundary, mapped with 4k pages */ | ||
| 255 | start_pfn = pos>>PAGE_SHIFT; | ||
| 256 | end_pfn = end>>PAGE_SHIFT; | ||
| 257 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); | ||
| 258 | |||
| 259 | /* Try to merge adjacent ranges with the same page size */ | ||
| 260 | for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { | ||
| 261 | unsigned long old_start; | ||
| 262 | if (mr[i].end != mr[i+1].start || | ||
| 263 | mr[i].page_size_mask != mr[i+1].page_size_mask) | ||
| 264 | continue; | ||
| 265 | /* move it */ | ||
| 266 | old_start = mr[i].start; | ||
| 267 | memmove(&mr[i], &mr[i+1], | ||
| 268 | (nr_range - 1 - i) * sizeof(struct map_range)); | ||
| 269 | mr[i--].start = old_start; | ||
| 270 | nr_range--; | ||
| 271 | } | ||
| 272 | |||
| 273 | for (i = 0; i < nr_range; i++) | ||
| 274 | printk(KERN_DEBUG " %010lx - %010lx page %s\n", | ||
| 275 | mr[i].start, mr[i].end, | ||
| 276 | (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":( | ||
| 277 | (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); | ||
| 278 | |||
| 279 | /* | ||
| 280 | * Find space for the kernel direct mapping tables. | ||
| 281 | * | ||
| 282 | * Later we should allocate these tables in the local node of the | ||
| 283 | * memory mapped. Unfortunately this is done currently before the | ||
| 284 | * nodes are discovered. | ||
| 285 | */ | ||
| 286 | if (!after_bootmem) | ||
| 287 | find_early_table_space(end, use_pse, use_gbpages); | ||
| 288 | |||
| 289 | #ifdef CONFIG_X86_32 | ||
| 290 | for (i = 0; i < nr_range; i++) | ||
| 291 | kernel_physical_mapping_init(mr[i].start, mr[i].end, | ||
| 292 | mr[i].page_size_mask); | ||
| 293 | ret = end; | ||
| 294 | #else /* CONFIG_X86_64 */ | ||
| 295 | for (i = 0; i < nr_range; i++) | ||
| 296 | ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, | ||
| 297 | mr[i].page_size_mask); | ||
| 298 | #endif | ||
| 299 | |||
| 300 | #ifdef CONFIG_X86_32 | ||
| 301 | early_ioremap_page_table_range_init(); | ||
| 302 | |||
| 303 | load_cr3(swapper_pg_dir); | ||
| 304 | #endif | ||
| 305 | |||
| 306 | #ifdef CONFIG_X86_64 | ||
| 307 | if (!after_bootmem) | ||
| 308 | mmu_cr4_features = read_cr4(); | ||
| 309 | #endif | ||
| 310 | __flush_tlb_all(); | ||
| 311 | |||
| 312 | if (!after_bootmem && e820_table_end > e820_table_start) | ||
| 313 | reserve_early(e820_table_start << PAGE_SHIFT, | ||
| 314 | e820_table_end << PAGE_SHIFT, "PGTABLE"); | ||
| 315 | |||
| 316 | if (!after_bootmem) | ||
| 317 | early_memtest(start, end); | ||
| 318 | |||
| 319 | return ret >> PAGE_SHIFT; | ||
| 320 | } | ||
| 321 | |||
| 322 | |||
| 323 | /* | ||
| 324 | * devmem_is_allowed() checks to see if /dev/mem access to a certain address | ||
| 325 | * is valid. The argument is a physical page number. | ||
| 326 | * | ||
| 327 | * | ||
| 328 | * On x86, access has to be given to the first megabyte of ram because that area | ||
| 329 | * contains bios code and data regions used by X and dosemu and similar apps. | ||
| 330 | * Access has to be given to non-kernel-ram areas as well, these contain the PCI | ||
| 331 | * mmio resources as well as potential bios/acpi data regions. | ||
| 332 | */ | ||
| 333 | int devmem_is_allowed(unsigned long pagenr) | ||
| 334 | { | ||
| 335 | if (pagenr <= 256) | ||
| 336 | return 1; | ||
| 337 | if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) | ||
| 338 | return 0; | ||
| 339 | if (!page_is_ram(pagenr)) | ||
| 340 | return 1; | ||
| 341 | return 0; | ||
| 342 | } | ||
| 6 | 343 | ||
| 7 | void free_init_pages(char *what, unsigned long begin, unsigned long end) | 344 | void free_init_pages(char *what, unsigned long begin, unsigned long end) |
| 8 | { | 345 | { |
| @@ -47,3 +384,10 @@ void free_initmem(void) | |||
| 47 | (unsigned long)(&__init_begin), | 384 | (unsigned long)(&__init_begin), |
| 48 | (unsigned long)(&__init_end)); | 385 | (unsigned long)(&__init_end)); |
| 49 | } | 386 | } |
| 387 | |||
| 388 | #ifdef CONFIG_BLK_DEV_INITRD | ||
| 389 | void free_initrd_mem(unsigned long start, unsigned long end) | ||
| 390 | { | ||
| 391 | free_init_pages("initrd memory", start, end); | ||
| 392 | } | ||
| 393 | #endif | ||
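The unified init_memory_mapping() above splits [start, end) into at most NR_RANGE_MR map_range entries according to 2M and 1G alignment, then merges neighbouring entries that ended up with the same page size. The userspace sketch below reproduces the 64-bit split for one hypothetical region, assuming both PSE and gbpages are usable; the kernel works in pfns and clamps every boundary against end, a case this simplified version avoids by choosing a friendly example input.

	#include <stdint.h>
	#include <stdio.h>

	#define SZ_2M (1ULL << 21)
	#define SZ_1G (1ULL << 30)

	static uint64_t round_up(uint64_t x, uint64_t a)   { return (x + a - 1) & ~(a - 1); }
	static uint64_t round_down(uint64_t x, uint64_t a) { return x & ~(a - 1); }

	static void emit(const char *what, uint64_t s, uint64_t e)
	{
		if (s < e)	/* empty ranges are skipped, as in save_mr() */
			printf("%012llx - %012llx  %s\n",
			       (unsigned long long)s, (unsigned long long)e, what);
	}

	int main(void)
	{
		/* Hypothetical region: 1 MB .. 5 GB */
		uint64_t start = 0x00100000ULL, end = 0x140000000ULL;

		uint64_t lo2m = round_up(start, SZ_2M), hi2m = round_down(end, SZ_2M);
		uint64_t lo1g = round_up(lo2m, SZ_1G),  hi1g = round_down(hi2m, SZ_1G);

		emit("4k head", start, lo2m);
		emit("2M",      lo2m,  lo1g);
		emit("1G",      lo1g,  hi1g);
		emit("2M tail", hi1g,  hi2m);
		emit("4k tail", hi2m,  end);
		return 0;
	}

For this input the result is a 4k head up to 2 MB, 2M pages up to 1 GB and 1G pages to the end, matching the "page 4k/2M/1G" lines the function prints at KERN_DEBUG.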
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0b087dcd2c18..db81e9a8556b 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c | |||
| @@ -49,6 +49,7 @@ | |||
| 49 | #include <asm/paravirt.h> | 49 | #include <asm/paravirt.h> |
| 50 | #include <asm/setup.h> | 50 | #include <asm/setup.h> |
| 51 | #include <asm/cacheflush.h> | 51 | #include <asm/cacheflush.h> |
| 52 | #include <asm/init.h> | ||
| 52 | 53 | ||
| 53 | unsigned long max_low_pfn_mapped; | 54 | unsigned long max_low_pfn_mapped; |
| 54 | unsigned long max_pfn_mapped; | 55 | unsigned long max_pfn_mapped; |
| @@ -58,19 +59,14 @@ unsigned long highstart_pfn, highend_pfn; | |||
| 58 | 59 | ||
| 59 | static noinline int do_test_wp_bit(void); | 60 | static noinline int do_test_wp_bit(void); |
| 60 | 61 | ||
| 61 | 62 | bool __read_mostly __vmalloc_start_set = false; | |
| 62 | static unsigned long __initdata table_start; | ||
| 63 | static unsigned long __meminitdata table_end; | ||
| 64 | static unsigned long __meminitdata table_top; | ||
| 65 | |||
| 66 | static int __initdata after_init_bootmem; | ||
| 67 | 63 | ||
| 68 | static __init void *alloc_low_page(void) | 64 | static __init void *alloc_low_page(void) |
| 69 | { | 65 | { |
| 70 | unsigned long pfn = table_end++; | 66 | unsigned long pfn = e820_table_end++; |
| 71 | void *adr; | 67 | void *adr; |
| 72 | 68 | ||
| 73 | if (pfn >= table_top) | 69 | if (pfn >= e820_table_top) |
| 74 | panic("alloc_low_page: ran out of memory"); | 70 | panic("alloc_low_page: ran out of memory"); |
| 75 | 71 | ||
| 76 | adr = __va(pfn * PAGE_SIZE); | 72 | adr = __va(pfn * PAGE_SIZE); |
| @@ -90,7 +86,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) | |||
| 90 | 86 | ||
| 91 | #ifdef CONFIG_X86_PAE | 87 | #ifdef CONFIG_X86_PAE |
| 92 | if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { | 88 | if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { |
| 93 | if (after_init_bootmem) | 89 | if (after_bootmem) |
| 94 | pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); | 90 | pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); |
| 95 | else | 91 | else |
| 96 | pmd_table = (pmd_t *)alloc_low_page(); | 92 | pmd_table = (pmd_t *)alloc_low_page(); |
| @@ -117,7 +113,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) | |||
| 117 | if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { | 113 | if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { |
| 118 | pte_t *page_table = NULL; | 114 | pte_t *page_table = NULL; |
| 119 | 115 | ||
| 120 | if (after_init_bootmem) { | 116 | if (after_bootmem) { |
| 121 | #ifdef CONFIG_DEBUG_PAGEALLOC | 117 | #ifdef CONFIG_DEBUG_PAGEALLOC |
| 122 | page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); | 118 | page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); |
| 123 | #endif | 119 | #endif |
| @@ -135,6 +131,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) | |||
| 135 | return pte_offset_kernel(pmd, 0); | 131 | return pte_offset_kernel(pmd, 0); |
| 136 | } | 132 | } |
| 137 | 133 | ||
| 134 | pmd_t * __init populate_extra_pmd(unsigned long vaddr) | ||
| 135 | { | ||
| 136 | int pgd_idx = pgd_index(vaddr); | ||
| 137 | int pmd_idx = pmd_index(vaddr); | ||
| 138 | |||
| 139 | return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx; | ||
| 140 | } | ||
| 141 | |||
| 142 | pte_t * __init populate_extra_pte(unsigned long vaddr) | ||
| 143 | { | ||
| 144 | int pte_idx = pte_index(vaddr); | ||
| 145 | pmd_t *pmd; | ||
| 146 | |||
| 147 | pmd = populate_extra_pmd(vaddr); | ||
| 148 | return one_page_table_init(pmd) + pte_idx; | ||
| 149 | } | ||
| 150 | |||
| 138 | static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, | 151 | static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, |
| 139 | unsigned long vaddr, pte_t *lastpte) | 152 | unsigned long vaddr, pte_t *lastpte) |
| 140 | { | 153 | { |
| @@ -151,12 +164,12 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, | |||
| 151 | if (pmd_idx_kmap_begin != pmd_idx_kmap_end | 164 | if (pmd_idx_kmap_begin != pmd_idx_kmap_end |
| 152 | && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin | 165 | && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin |
| 153 | && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end | 166 | && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end |
| 154 | && ((__pa(pte) >> PAGE_SHIFT) < table_start | 167 | && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start |
| 155 | || (__pa(pte) >> PAGE_SHIFT) >= table_end)) { | 168 | || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { |
| 156 | pte_t *newpte; | 169 | pte_t *newpte; |
| 157 | int i; | 170 | int i; |
| 158 | 171 | ||
| 159 | BUG_ON(after_init_bootmem); | 172 | BUG_ON(after_bootmem); |
| 160 | newpte = alloc_low_page(); | 173 | newpte = alloc_low_page(); |
| 161 | for (i = 0; i < PTRS_PER_PTE; i++) | 174 | for (i = 0; i < PTRS_PER_PTE; i++) |
| 162 | set_pte(newpte + i, pte[i]); | 175 | set_pte(newpte + i, pte[i]); |
| @@ -225,11 +238,14 @@ static inline int is_kernel_text(unsigned long addr) | |||
| 225 | * of max_low_pfn pages, by creating page tables starting from address | 238 | * of max_low_pfn pages, by creating page tables starting from address |
| 226 | * PAGE_OFFSET: | 239 | * PAGE_OFFSET: |
| 227 | */ | 240 | */ |
| 228 | static void __init kernel_physical_mapping_init(pgd_t *pgd_base, | 241 | unsigned long __init |
| 229 | unsigned long start_pfn, | 242 | kernel_physical_mapping_init(unsigned long start, |
| 230 | unsigned long end_pfn, | 243 | unsigned long end, |
| 231 | int use_pse) | 244 | unsigned long page_size_mask) |
| 232 | { | 245 | { |
| 246 | int use_pse = page_size_mask == (1<<PG_LEVEL_2M); | ||
| 247 | unsigned long start_pfn, end_pfn; | ||
| 248 | pgd_t *pgd_base = swapper_pg_dir; | ||
| 233 | int pgd_idx, pmd_idx, pte_ofs; | 249 | int pgd_idx, pmd_idx, pte_ofs; |
| 234 | unsigned long pfn; | 250 | unsigned long pfn; |
| 235 | pgd_t *pgd; | 251 | pgd_t *pgd; |
| @@ -238,6 +254,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, | |||
| 238 | unsigned pages_2m, pages_4k; | 254 | unsigned pages_2m, pages_4k; |
| 239 | int mapping_iter; | 255 | int mapping_iter; |
| 240 | 256 | ||
| 257 | start_pfn = start >> PAGE_SHIFT; | ||
| 258 | end_pfn = end >> PAGE_SHIFT; | ||
| 259 | |||
| 241 | /* | 260 | /* |
| 242 | * First iteration will setup identity mapping using large/small pages | 261 | * First iteration will setup identity mapping using large/small pages |
| 243 | * based on use_pse, with other attributes same as set by | 262 | * based on use_pse, with other attributes same as set by |
| @@ -352,26 +371,6 @@ repeat: | |||
| 352 | mapping_iter = 2; | 371 | mapping_iter = 2; |
| 353 | goto repeat; | 372 | goto repeat; |
| 354 | } | 373 | } |
| 355 | } | ||
| 356 | |||
| 357 | /* | ||
| 358 | * devmem_is_allowed() checks to see if /dev/mem access to a certain address | ||
| 359 | * is valid. The argument is a physical page number. | ||
| 360 | * | ||
| 361 | * | ||
| 362 | * On x86, access has to be given to the first megabyte of ram because that area | ||
| 363 | * contains bios code and data regions used by X and dosemu and similar apps. | ||
| 364 | * Access has to be given to non-kernel-ram areas as well, these contain the PCI | ||
| 365 | * mmio resources as well as potential bios/acpi data regions. | ||
| 366 | */ | ||
| 367 | int devmem_is_allowed(unsigned long pagenr) | ||
| 368 | { | ||
| 369 | if (pagenr <= 256) | ||
| 370 | return 1; | ||
| 371 | if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) | ||
| 372 | return 0; | ||
| 373 | if (!page_is_ram(pagenr)) | ||
| 374 | return 1; | ||
| 375 | return 0; | 374 | return 0; |
| 376 | } | 375 | } |
| 377 | 376 | ||
| @@ -528,8 +527,9 @@ void __init native_pagetable_setup_done(pgd_t *base) | |||
| 528 | * be partially populated, and so it avoids stomping on any existing | 527 | * be partially populated, and so it avoids stomping on any existing |
| 529 | * mappings. | 528 | * mappings. |
| 530 | */ | 529 | */ |
| 531 | static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base) | 530 | void __init early_ioremap_page_table_range_init(void) |
| 532 | { | 531 | { |
| 532 | pgd_t *pgd_base = swapper_pg_dir; | ||
| 533 | unsigned long vaddr, end; | 533 | unsigned long vaddr, end; |
| 534 | 534 | ||
| 535 | /* | 535 | /* |
| @@ -624,7 +624,7 @@ static int __init noexec_setup(char *str) | |||
| 624 | } | 624 | } |
| 625 | early_param("noexec", noexec_setup); | 625 | early_param("noexec", noexec_setup); |
| 626 | 626 | ||
| 627 | static void __init set_nx(void) | 627 | void __init set_nx(void) |
| 628 | { | 628 | { |
| 629 | unsigned int v[4], l, h; | 629 | unsigned int v[4], l, h; |
| 630 | 630 | ||
| @@ -776,6 +776,8 @@ void __init initmem_init(unsigned long start_pfn, | |||
| 776 | #ifdef CONFIG_FLATMEM | 776 | #ifdef CONFIG_FLATMEM |
| 777 | max_mapnr = num_physpages; | 777 | max_mapnr = num_physpages; |
| 778 | #endif | 778 | #endif |
| 779 | __vmalloc_start_set = true; | ||
| 780 | |||
| 779 | printk(KERN_NOTICE "%ldMB LOWMEM available.\n", | 781 | printk(KERN_NOTICE "%ldMB LOWMEM available.\n", |
| 780 | pages_to_mb(max_low_pfn)); | 782 | pages_to_mb(max_low_pfn)); |
| 781 | 783 | ||
| @@ -797,176 +799,66 @@ static void __init zone_sizes_init(void) | |||
| 797 | free_area_init_nodes(max_zone_pfns); | 799 | free_area_init_nodes(max_zone_pfns); |
| 798 | } | 800 | } |
| 799 | 801 | ||
| 802 | static unsigned long __init setup_node_bootmem(int nodeid, | ||
| 803 | unsigned long start_pfn, | ||
| 804 | unsigned long end_pfn, | ||
| 805 | unsigned long bootmap) | ||
| 806 | { | ||
| 807 | unsigned long bootmap_size; | ||
| 808 | |||
| 809 | /* don't touch min_low_pfn */ | ||
| 810 | bootmap_size = init_bootmem_node(NODE_DATA(nodeid), | ||
| 811 | bootmap >> PAGE_SHIFT, | ||
| 812 | start_pfn, end_pfn); | ||
| 813 | printk(KERN_INFO " node %d low ram: %08lx - %08lx\n", | ||
| 814 | nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); | ||
| 815 | printk(KERN_INFO " node %d bootmap %08lx - %08lx\n", | ||
| 816 | nodeid, bootmap, bootmap + bootmap_size); | ||
| 817 | free_bootmem_with_active_regions(nodeid, end_pfn); | ||
| 818 | early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); | ||
| 819 | |||
| 820 | return bootmap + bootmap_size; | ||
| 821 | } | ||
| 822 | |||
| 800 | void __init setup_bootmem_allocator(void) | 823 | void __init setup_bootmem_allocator(void) |
| 801 | { | 824 | { |
| 802 | int i; | 825 | int nodeid; |
| 803 | unsigned long bootmap_size, bootmap; | 826 | unsigned long bootmap_size, bootmap; |
| 804 | /* | 827 | /* |
| 805 | * Initialize the boot-time allocator (with low memory only): | 828 | * Initialize the boot-time allocator (with low memory only): |
| 806 | */ | 829 | */ |
| 807 | bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT; | 830 | bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT; |
| 808 | bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT, | 831 | bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size, |
| 809 | max_pfn_mapped<<PAGE_SHIFT, bootmap_size, | ||
| 810 | PAGE_SIZE); | 832 | PAGE_SIZE); |
| 811 | if (bootmap == -1L) | 833 | if (bootmap == -1L) |
| 812 | panic("Cannot find bootmem map of size %ld\n", bootmap_size); | 834 | panic("Cannot find bootmem map of size %ld\n", bootmap_size); |
| 813 | reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); | 835 | reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); |
| 814 | 836 | ||
| 815 | /* don't touch min_low_pfn */ | ||
| 816 | bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT, | ||
| 817 | min_low_pfn, max_low_pfn); | ||
| 818 | printk(KERN_INFO " mapped low ram: 0 - %08lx\n", | 837 | printk(KERN_INFO " mapped low ram: 0 - %08lx\n", |
| 819 | max_pfn_mapped<<PAGE_SHIFT); | 838 | max_pfn_mapped<<PAGE_SHIFT); |
| 820 | printk(KERN_INFO " low ram: %08lx - %08lx\n", | 839 | printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); |
| 821 | min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT); | ||
| 822 | printk(KERN_INFO " bootmap %08lx - %08lx\n", | ||
| 823 | bootmap, bootmap + bootmap_size); | ||
| 824 | for_each_online_node(i) | ||
| 825 | free_bootmem_with_active_regions(i, max_low_pfn); | ||
| 826 | early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT); | ||
| 827 | |||
| 828 | after_init_bootmem = 1; | ||
| 829 | } | ||
| 830 | |||
| 831 | static void __init find_early_table_space(unsigned long end, int use_pse) | ||
| 832 | { | ||
| 833 | unsigned long puds, pmds, ptes, tables, start; | ||
| 834 | |||
| 835 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; | ||
| 836 | tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); | ||
| 837 | |||
| 838 | pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; | ||
| 839 | tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); | ||
| 840 | |||
| 841 | if (use_pse) { | ||
| 842 | unsigned long extra; | ||
| 843 | 840 | ||
| 844 | extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); | 841 | for_each_online_node(nodeid) { |
| 845 | extra += PMD_SIZE; | 842 | unsigned long start_pfn, end_pfn; |
| 846 | ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 847 | } else | ||
| 848 | ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 849 | 843 | ||
| 850 | tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); | 844 | #ifdef CONFIG_NEED_MULTIPLE_NODES |
| 851 | 845 | start_pfn = node_start_pfn[nodeid]; | |
| 852 | /* for fixmap */ | 846 | end_pfn = node_end_pfn[nodeid]; |
| 853 | tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); | 847 | if (start_pfn > max_low_pfn) |
| 854 | 848 | continue; | |
| 855 | /* | 849 | if (end_pfn > max_low_pfn) |
| 856 | * RED-PEN putting page tables only on node 0 could | 850 | end_pfn = max_low_pfn; |
| 857 | * cause a hotspot and fill up ZONE_DMA. The page tables | ||
| 858 | * need roughly 0.5KB per GB. | ||
| 859 | */ | ||
| 860 | start = 0x7000; | ||
| 861 | table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, | ||
| 862 | tables, PAGE_SIZE); | ||
| 863 | if (table_start == -1UL) | ||
| 864 | panic("Cannot find space for the kernel page tables"); | ||
| 865 | |||
| 866 | table_start >>= PAGE_SHIFT; | ||
| 867 | table_end = table_start; | ||
| 868 | table_top = table_start + (tables>>PAGE_SHIFT); | ||
| 869 | |||
| 870 | printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", | ||
| 871 | end, table_start << PAGE_SHIFT, | ||
| 872 | (table_start << PAGE_SHIFT) + tables); | ||
| 873 | } | ||
| 874 | |||
| 875 | unsigned long __init_refok init_memory_mapping(unsigned long start, | ||
| 876 | unsigned long end) | ||
| 877 | { | ||
| 878 | pgd_t *pgd_base = swapper_pg_dir; | ||
| 879 | unsigned long start_pfn, end_pfn; | ||
| 880 | unsigned long big_page_start; | ||
| 881 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
| 882 | /* | ||
| 883 | * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. | ||
| 884 | * This will simplify cpa(), which otherwise needs to support splitting | ||
| 885 | * large pages into small in interrupt context, etc. | ||
| 886 | */ | ||
| 887 | int use_pse = 0; | ||
| 888 | #else | 851 | #else |
| 889 | int use_pse = cpu_has_pse; | 852 | start_pfn = 0; |
| 890 | #endif | 853 | end_pfn = max_low_pfn; |
| 891 | |||
| 892 | /* | ||
| 893 | * Find space for the kernel direct mapping tables. | ||
| 894 | */ | ||
| 895 | if (!after_init_bootmem) | ||
| 896 | find_early_table_space(end, use_pse); | ||
| 897 | |||
| 898 | #ifdef CONFIG_X86_PAE | ||
| 899 | set_nx(); | ||
| 900 | if (nx_enabled) | ||
| 901 | printk(KERN_INFO "NX (Execute Disable) protection: active\n"); | ||
| 902 | #endif | 854 | #endif |
| 903 | 855 | bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn, | |
| 904 | /* Enable PSE if available */ | 856 | bootmap); |
| 905 | if (cpu_has_pse) | ||
| 906 | set_in_cr4(X86_CR4_PSE); | ||
| 907 | |||
| 908 | /* Enable PGE if available */ | ||
| 909 | if (cpu_has_pge) { | ||
| 910 | set_in_cr4(X86_CR4_PGE); | ||
| 911 | __supported_pte_mask |= _PAGE_GLOBAL; | ||
| 912 | } | ||
| 913 | |||
| 914 | /* | ||
| 915 | * Don't use a large page for the first 2/4MB of memory | ||
| 916 | * because there are often fixed size MTRRs in there | ||
| 917 | * and overlapping MTRRs into large pages can cause | ||
| 918 | * slowdowns. | ||
| 919 | */ | ||
| 920 | big_page_start = PMD_SIZE; | ||
| 921 | |||
| 922 | if (start < big_page_start) { | ||
| 923 | start_pfn = start >> PAGE_SHIFT; | ||
| 924 | end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT); | ||
| 925 | } else { | ||
| 926 | /* head is not big page alignment ? */ | ||
| 927 | start_pfn = start >> PAGE_SHIFT; | ||
| 928 | end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) | ||
| 929 | << (PMD_SHIFT - PAGE_SHIFT); | ||
| 930 | } | ||
| 931 | if (start_pfn < end_pfn) | ||
| 932 | kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0); | ||
| 933 | |||
| 934 | /* big page range */ | ||
| 935 | start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) | ||
| 936 | << (PMD_SHIFT - PAGE_SHIFT); | ||
| 937 | if (start_pfn < (big_page_start >> PAGE_SHIFT)) | ||
| 938 | start_pfn = big_page_start >> PAGE_SHIFT; | ||
| 939 | end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); | ||
| 940 | if (start_pfn < end_pfn) | ||
| 941 | kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, | ||
| 942 | use_pse); | ||
| 943 | |||
| 944 | /* tail is not big page alignment ? */ | ||
| 945 | start_pfn = end_pfn; | ||
| 946 | if (start_pfn > (big_page_start>>PAGE_SHIFT)) { | ||
| 947 | end_pfn = end >> PAGE_SHIFT; | ||
| 948 | if (start_pfn < end_pfn) | ||
| 949 | kernel_physical_mapping_init(pgd_base, start_pfn, | ||
| 950 | end_pfn, 0); | ||
| 951 | } | 857 | } |
| 952 | 858 | ||
| 953 | early_ioremap_page_table_range_init(pgd_base); | 859 | after_bootmem = 1; |
| 954 | |||
| 955 | load_cr3(swapper_pg_dir); | ||
| 956 | |||
| 957 | __flush_tlb_all(); | ||
| 958 | |||
| 959 | if (!after_init_bootmem) | ||
| 960 | reserve_early(table_start << PAGE_SHIFT, | ||
| 961 | table_end << PAGE_SHIFT, "PGTABLE"); | ||
| 962 | |||
| 963 | if (!after_init_bootmem) | ||
| 964 | early_memtest(start, end); | ||
| 965 | |||
| 966 | return end >> PAGE_SHIFT; | ||
| 967 | } | 860 | } |
| 968 | 861 | ||
| 969 | |||
| 970 | /* | 862 | /* |
| 971 | * paging_init() sets up the page tables - note that the first 8MB are | 863 | * paging_init() sets up the page tables - note that the first 8MB are |
| 972 | * already mapped by head.S. | 864 | * already mapped by head.S. |
| @@ -1200,13 +1092,6 @@ void mark_rodata_ro(void) | |||
| 1200 | } | 1092 | } |
| 1201 | #endif | 1093 | #endif |
| 1202 | 1094 | ||
| 1203 | #ifdef CONFIG_BLK_DEV_INITRD | ||
| 1204 | void free_initrd_mem(unsigned long start, unsigned long end) | ||
| 1205 | { | ||
| 1206 | free_init_pages("initrd memory", start, end); | ||
| 1207 | } | ||
| 1208 | #endif | ||
| 1209 | |||
| 1210 | int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, | 1095 | int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, |
| 1211 | int flags) | 1096 | int flags) |
| 1212 | { | 1097 | { |
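The reworked setup_bootmem_allocator() sizes one bootmem bitmap for all of lowmem, one bit per 4k page rounded up to whole pages, and then lets each online node initialise its own window of it through setup_node_bootmem(). A rough userspace illustration of the sizing, assuming the classic 896 MB of 32-bit lowmem; bootmem_bootmap_pages() in the kernel performs essentially this calculation:

	#include <stdio.h>

	#define PAGE_SIZE  4096UL
	#define PAGE_SHIFT 12

	/* One bit per page, rounded up to whole pages. */
	static unsigned long bootmap_pages(unsigned long pages)
	{
		unsigned long bytes = (pages + 7) / 8;
		return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
	}

	int main(void)
	{
		unsigned long max_low_pfn = 896UL << (20 - PAGE_SHIFT);	/* 896 MB of lowmem */
		unsigned long size = bootmap_pages(max_low_pfn) << PAGE_SHIFT;

		printf("bootmap for %lu pages: %lu bytes\n", max_low_pfn, size);	/* 28672 */
		return 0;
	}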
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 724e537432e7..54efa57d1c03 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
| @@ -48,6 +48,7 @@ | |||
| 48 | #include <asm/kdebug.h> | 48 | #include <asm/kdebug.h> |
| 49 | #include <asm/numa.h> | 49 | #include <asm/numa.h> |
| 50 | #include <asm/cacheflush.h> | 50 | #include <asm/cacheflush.h> |
| 51 | #include <asm/init.h> | ||
| 51 | 52 | ||
| 52 | /* | 53 | /* |
| 53 | * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. | 54 | * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. |
| @@ -61,12 +62,6 @@ static unsigned long dma_reserve __initdata; | |||
| 61 | 62 | ||
| 62 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); | 63 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); |
| 63 | 64 | ||
| 64 | int direct_gbpages | ||
| 65 | #ifdef CONFIG_DIRECT_GBPAGES | ||
| 66 | = 1 | ||
| 67 | #endif | ||
| 68 | ; | ||
| 69 | |||
| 70 | static int __init parse_direct_gbpages_off(char *arg) | 65 | static int __init parse_direct_gbpages_off(char *arg) |
| 71 | { | 66 | { |
| 72 | direct_gbpages = 0; | 67 | direct_gbpages = 0; |
| @@ -87,12 +82,10 @@ early_param("gbpages", parse_direct_gbpages_on); | |||
| 87 | * around without checking the pgd every time. | 82 | * around without checking the pgd every time. |
| 88 | */ | 83 | */ |
| 89 | 84 | ||
| 90 | int after_bootmem; | ||
| 91 | |||
| 92 | pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; | 85 | pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; |
| 93 | EXPORT_SYMBOL_GPL(__supported_pte_mask); | 86 | EXPORT_SYMBOL_GPL(__supported_pte_mask); |
| 94 | 87 | ||
| 95 | static int do_not_nx __cpuinitdata; | 88 | static int disable_nx __cpuinitdata; |
| 96 | 89 | ||
| 97 | /* | 90 | /* |
| 98 | * noexec=on|off | 91 | * noexec=on|off |
| @@ -107,9 +100,9 @@ static int __init nonx_setup(char *str) | |||
| 107 | return -EINVAL; | 100 | return -EINVAL; |
| 108 | if (!strncmp(str, "on", 2)) { | 101 | if (!strncmp(str, "on", 2)) { |
| 109 | __supported_pte_mask |= _PAGE_NX; | 102 | __supported_pte_mask |= _PAGE_NX; |
| 110 | do_not_nx = 0; | 103 | disable_nx = 0; |
| 111 | } else if (!strncmp(str, "off", 3)) { | 104 | } else if (!strncmp(str, "off", 3)) { |
| 112 | do_not_nx = 1; | 105 | disable_nx = 1; |
| 113 | __supported_pte_mask &= ~_PAGE_NX; | 106 | __supported_pte_mask &= ~_PAGE_NX; |
| 114 | } | 107 | } |
| 115 | return 0; | 108 | return 0; |
| @@ -121,7 +114,7 @@ void __cpuinit check_efer(void) | |||
| 121 | unsigned long efer; | 114 | unsigned long efer; |
| 122 | 115 | ||
| 123 | rdmsrl(MSR_EFER, efer); | 116 | rdmsrl(MSR_EFER, efer); |
| 124 | if (!(efer & EFER_NX) || do_not_nx) | 117 | if (!(efer & EFER_NX) || disable_nx) |
| 125 | __supported_pte_mask &= ~_PAGE_NX; | 118 | __supported_pte_mask &= ~_PAGE_NX; |
| 126 | } | 119 | } |
| 127 | 120 | ||
| @@ -168,34 +161,51 @@ static __ref void *spp_getpage(void) | |||
| 168 | return ptr; | 161 | return ptr; |
| 169 | } | 162 | } |
| 170 | 163 | ||
| 171 | void | 164 | static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) |
| 172 | set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) | ||
| 173 | { | 165 | { |
| 174 | pud_t *pud; | 166 | if (pgd_none(*pgd)) { |
| 175 | pmd_t *pmd; | 167 | pud_t *pud = (pud_t *)spp_getpage(); |
| 176 | pte_t *pte; | 168 | pgd_populate(&init_mm, pgd, pud); |
| 169 | if (pud != pud_offset(pgd, 0)) | ||
| 170 | printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", | ||
| 171 | pud, pud_offset(pgd, 0)); | ||
| 172 | } | ||
| 173 | return pud_offset(pgd, vaddr); | ||
| 174 | } | ||
| 177 | 175 | ||
| 178 | pud = pud_page + pud_index(vaddr); | 176 | static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) |
| 177 | { | ||
| 179 | if (pud_none(*pud)) { | 178 | if (pud_none(*pud)) { |
| 180 | pmd = (pmd_t *) spp_getpage(); | 179 | pmd_t *pmd = (pmd_t *) spp_getpage(); |
| 181 | pud_populate(&init_mm, pud, pmd); | 180 | pud_populate(&init_mm, pud, pmd); |
| 182 | if (pmd != pmd_offset(pud, 0)) { | 181 | if (pmd != pmd_offset(pud, 0)) |
| 183 | printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", | 182 | printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", |
| 184 | pmd, pmd_offset(pud, 0)); | 183 | pmd, pmd_offset(pud, 0)); |
| 185 | return; | ||
| 186 | } | ||
| 187 | } | 184 | } |
| 188 | pmd = pmd_offset(pud, vaddr); | 185 | return pmd_offset(pud, vaddr); |
| 186 | } | ||
| 187 | |||
| 188 | static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) | ||
| 189 | { | ||
| 189 | if (pmd_none(*pmd)) { | 190 | if (pmd_none(*pmd)) { |
| 190 | pte = (pte_t *) spp_getpage(); | 191 | pte_t *pte = (pte_t *) spp_getpage(); |
| 191 | pmd_populate_kernel(&init_mm, pmd, pte); | 192 | pmd_populate_kernel(&init_mm, pmd, pte); |
| 192 | if (pte != pte_offset_kernel(pmd, 0)) { | 193 | if (pte != pte_offset_kernel(pmd, 0)) |
| 193 | printk(KERN_ERR "PAGETABLE BUG #02!\n"); | 194 | printk(KERN_ERR "PAGETABLE BUG #02!\n"); |
| 194 | return; | ||
| 195 | } | ||
| 196 | } | 195 | } |
| 196 | return pte_offset_kernel(pmd, vaddr); | ||
| 197 | } | ||
| 198 | |||
| 199 | void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) | ||
| 200 | { | ||
| 201 | pud_t *pud; | ||
| 202 | pmd_t *pmd; | ||
| 203 | pte_t *pte; | ||
| 204 | |||
| 205 | pud = pud_page + pud_index(vaddr); | ||
| 206 | pmd = fill_pmd(pud, vaddr); | ||
| 207 | pte = fill_pte(pmd, vaddr); | ||
| 197 | 208 | ||
| 198 | pte = pte_offset_kernel(pmd, vaddr); | ||
| 199 | set_pte(pte, new_pte); | 209 | set_pte(pte, new_pte); |
| 200 | 210 | ||
| 201 | /* | 211 | /* |
| @@ -205,8 +215,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) | |||
| 205 | __flush_tlb_one(vaddr); | 215 | __flush_tlb_one(vaddr); |
| 206 | } | 216 | } |
| 207 | 217 | ||
| 208 | void | 218 | void set_pte_vaddr(unsigned long vaddr, pte_t pteval) |
| 209 | set_pte_vaddr(unsigned long vaddr, pte_t pteval) | ||
| 210 | { | 219 | { |
| 211 | pgd_t *pgd; | 220 | pgd_t *pgd; |
| 212 | pud_t *pud_page; | 221 | pud_t *pud_page; |
| @@ -223,6 +232,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval) | |||
| 223 | set_pte_vaddr_pud(pud_page, vaddr, pteval); | 232 | set_pte_vaddr_pud(pud_page, vaddr, pteval); |
| 224 | } | 233 | } |
| 225 | 234 | ||
| 235 | pmd_t * __init populate_extra_pmd(unsigned long vaddr) | ||
| 236 | { | ||
| 237 | pgd_t *pgd; | ||
| 238 | pud_t *pud; | ||
| 239 | |||
| 240 | pgd = pgd_offset_k(vaddr); | ||
| 241 | pud = fill_pud(pgd, vaddr); | ||
| 242 | return fill_pmd(pud, vaddr); | ||
| 243 | } | ||
| 244 | |||
| 245 | pte_t * __init populate_extra_pte(unsigned long vaddr) | ||
| 246 | { | ||
| 247 | pmd_t *pmd; | ||
| 248 | |||
| 249 | pmd = populate_extra_pmd(vaddr); | ||
| 250 | return fill_pte(pmd, vaddr); | ||
| 251 | } | ||
| 252 | |||
| 226 | /* | 253 | /* |
| 227 | * Create large page table mappings for a range of physical addresses. | 254 | * Create large page table mappings for a range of physical addresses. |
| 228 | */ | 255 | */ |
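Reviewer note: the hunks above replace the open-coded pud/pmd/pte allocation in set_pte_vaddr_pud() with fill_pud()/fill_pmd()/fill_pte(), and the new populate_extra_pmd()/populate_extra_pte() reuse the same helpers for early mappings. The pattern is "return the entry for this address, allocating the next level if it is empty". A stand-alone sketch of that pattern, with made-up types and a plain calloc() allocator instead of spp_getpage():

/* Minimal sketch of the fill_*() pattern: look up one level of a
 * radix tree and allocate the next level on demand.  The struct and
 * allocator are illustrative, not the kernel's. */
#include <stdio.h>
#include <stdlib.h>

#define ENTRIES 512

struct level {
	struct level *next[ENTRIES];
};

/* Return the next-level table for slot 'idx', allocating it if empty. */
static struct level *fill_level(struct level *table, unsigned int idx)
{
	if (!table->next[idx])
		table->next[idx] = calloc(1, sizeof(struct level));
	return table->next[idx];
}

int main(void)
{
	struct level pgd = { { NULL } };
	/* Three chained fills mirror fill_pud() -> fill_pmd() -> fill_pte(). */
	struct level *pud = fill_level(&pgd, 1);
	struct level *pmd = fill_level(pud, 2);
	struct level *pte = fill_level(pmd, 3);

	printf("leaf table at %p\n", (void *)pte);
	return 0;
}

Chaining three such calls is essentially what populate_extra_pte() does with the real page-table levels.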
| @@ -291,13 +318,9 @@ void __init cleanup_highmap(void) | |||
| 291 | } | 318 | } |
| 292 | } | 319 | } |
| 293 | 320 | ||
| 294 | static unsigned long __initdata table_start; | ||
| 295 | static unsigned long __meminitdata table_end; | ||
| 296 | static unsigned long __meminitdata table_top; | ||
| 297 | |||
| 298 | static __ref void *alloc_low_page(unsigned long *phys) | 321 | static __ref void *alloc_low_page(unsigned long *phys) |
| 299 | { | 322 | { |
| 300 | unsigned long pfn = table_end++; | 323 | unsigned long pfn = e820_table_end++; |
| 301 | void *adr; | 324 | void *adr; |
| 302 | 325 | ||
| 303 | if (after_bootmem) { | 326 | if (after_bootmem) { |
| @@ -307,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys) | |||
| 307 | return adr; | 330 | return adr; |
| 308 | } | 331 | } |
| 309 | 332 | ||
| 310 | if (pfn >= table_top) | 333 | if (pfn >= e820_table_top) |
| 311 | panic("alloc_low_page: ran out of memory"); | 334 | panic("alloc_low_page: ran out of memory"); |
| 312 | 335 | ||
| 313 | adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); | 336 | adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); |
| @@ -547,58 +570,10 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, | |||
| 547 | return phys_pud_init(pud, addr, end, page_size_mask); | 570 | return phys_pud_init(pud, addr, end, page_size_mask); |
| 548 | } | 571 | } |
| 549 | 572 | ||
| 550 | static void __init find_early_table_space(unsigned long end, int use_pse, | 573 | unsigned long __init |
| 551 | int use_gbpages) | 574 | kernel_physical_mapping_init(unsigned long start, |
| 552 | { | 575 | unsigned long end, |
| 553 | unsigned long puds, pmds, ptes, tables, start; | 576 | unsigned long page_size_mask) |
| 554 | |||
| 555 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; | ||
| 556 | tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); | ||
| 557 | if (use_gbpages) { | ||
| 558 | unsigned long extra; | ||
| 559 | extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); | ||
| 560 | pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; | ||
| 561 | } else | ||
| 562 | pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; | ||
| 563 | tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); | ||
| 564 | |||
| 565 | if (use_pse) { | ||
| 566 | unsigned long extra; | ||
| 567 | extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); | ||
| 568 | ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 569 | } else | ||
| 570 | ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 571 | tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); | ||
| 572 | |||
| 573 | /* | ||
| 574 | * RED-PEN putting page tables only on node 0 could | ||
| 575 | * cause a hotspot and fill up ZONE_DMA. The page tables | ||
| 576 | * need roughly 0.5KB per GB. | ||
| 577 | */ | ||
| 578 | start = 0x8000; | ||
| 579 | table_start = find_e820_area(start, end, tables, PAGE_SIZE); | ||
| 580 | if (table_start == -1UL) | ||
| 581 | panic("Cannot find space for the kernel page tables"); | ||
| 582 | |||
| 583 | table_start >>= PAGE_SHIFT; | ||
| 584 | table_end = table_start; | ||
| 585 | table_top = table_start + (tables >> PAGE_SHIFT); | ||
| 586 | |||
| 587 | printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", | ||
| 588 | end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT); | ||
| 589 | } | ||
| 590 | |||
| 591 | static void __init init_gbpages(void) | ||
| 592 | { | ||
| 593 | if (direct_gbpages && cpu_has_gbpages) | ||
| 594 | printk(KERN_INFO "Using GB pages for direct mapping\n"); | ||
| 595 | else | ||
| 596 | direct_gbpages = 0; | ||
| 597 | } | ||
| 598 | |||
| 599 | static unsigned long __meminit kernel_physical_mapping_init(unsigned long start, | ||
| 600 | unsigned long end, | ||
| 601 | unsigned long page_size_mask) | ||
| 602 | { | 577 | { |
| 603 | 578 | ||
| 604 | unsigned long next, last_map_addr = end; | 579 | unsigned long next, last_map_addr = end; |
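Reviewer note: find_early_table_space() and init_gbpages() are deleted here, and the e820_table_* names used earlier in this patch suggest the bootstrap-table bookkeeping now lives outside this file. The sizing it did is simple arithmetic; a rough stand-alone restatement, assuming 2 MB leaf pages, 8-byte entries and an illustrative 4 GB range:

/* Rough restatement of the removed find_early_table_space() sizing,
 * assuming 2 MB pages and no 1 GB pages; constants and the example
 * range are illustrative only. */
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PMD_SHIFT 21
#define PUD_SHIFT 30

static unsigned long roundup(unsigned long x, unsigned long a)
{
	return (x + a - 1) / a * a;
}

int main(void)
{
	unsigned long end  = 4UL << 30;		/* map the first 4 GB */
	unsigned long puds = (end + (1UL << PUD_SHIFT) - 1) >> PUD_SHIFT;
	unsigned long pmds = (end + (1UL << PMD_SHIFT) - 1) >> PMD_SHIFT;
	unsigned long tables = roundup(puds * 8, PAGE_SIZE)	/* pud pages */
			     + roundup(pmds * 8, PAGE_SIZE);	/* pmd pages */

	/* With 2 MB leaf pages no pte pages are needed for the aligned bulk. */
	printf("%lu puds, %lu pmds -> %lu KB of early page tables\n",
	       puds, pmds, tables >> 10);
	return 0;
}

For this example the reservation comes out to about 20 KB for 4 GB of direct mapping.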
| @@ -635,176 +610,6 @@ static unsigned long __meminit kernel_physical_mapping_init(unsigned long start, | |||
| 635 | return last_map_addr; | 610 | return last_map_addr; |
| 636 | } | 611 | } |
| 637 | 612 | ||
| 638 | struct map_range { | ||
| 639 | unsigned long start; | ||
| 640 | unsigned long end; | ||
| 641 | unsigned page_size_mask; | ||
| 642 | }; | ||
| 643 | |||
| 644 | #define NR_RANGE_MR 5 | ||
| 645 | |||
| 646 | static int save_mr(struct map_range *mr, int nr_range, | ||
| 647 | unsigned long start_pfn, unsigned long end_pfn, | ||
| 648 | unsigned long page_size_mask) | ||
| 649 | { | ||
| 650 | |||
| 651 | if (start_pfn < end_pfn) { | ||
| 652 | if (nr_range >= NR_RANGE_MR) | ||
| 653 | panic("run out of range for init_memory_mapping\n"); | ||
| 654 | mr[nr_range].start = start_pfn<<PAGE_SHIFT; | ||
| 655 | mr[nr_range].end = end_pfn<<PAGE_SHIFT; | ||
| 656 | mr[nr_range].page_size_mask = page_size_mask; | ||
| 657 | nr_range++; | ||
| 658 | } | ||
| 659 | |||
| 660 | return nr_range; | ||
| 661 | } | ||
| 662 | |||
| 663 | /* | ||
| 664 | * Setup the direct mapping of the physical memory at PAGE_OFFSET. | ||
| 665 | * This runs before bootmem is initialized and gets pages directly from | ||
| 666 | * the physical memory. To access them they are temporarily mapped. | ||
| 667 | */ | ||
| 668 | unsigned long __init_refok init_memory_mapping(unsigned long start, | ||
| 669 | unsigned long end) | ||
| 670 | { | ||
| 671 | unsigned long last_map_addr = 0; | ||
| 672 | unsigned long page_size_mask = 0; | ||
| 673 | unsigned long start_pfn, end_pfn; | ||
| 674 | unsigned long pos; | ||
| 675 | |||
| 676 | struct map_range mr[NR_RANGE_MR]; | ||
| 677 | int nr_range, i; | ||
| 678 | int use_pse, use_gbpages; | ||
| 679 | |||
| 680 | printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); | ||
| 681 | |||
| 682 | /* | ||
| 683 | * Find space for the kernel direct mapping tables. | ||
| 684 | * | ||
| 685 | * Later we should allocate these tables in the local node of the | ||
| 686 | * memory mapped. Unfortunately this is done currently before the | ||
| 687 | * nodes are discovered. | ||
| 688 | */ | ||
| 689 | if (!after_bootmem) | ||
| 690 | init_gbpages(); | ||
| 691 | |||
| 692 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
| 693 | /* | ||
| 694 | * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. | ||
| 695 | * This will simplify cpa(), which otherwise needs to support splitting | ||
| 696 | * large pages into small in interrupt context, etc. | ||
| 697 | */ | ||
| 698 | use_pse = use_gbpages = 0; | ||
| 699 | #else | ||
| 700 | use_pse = cpu_has_pse; | ||
| 701 | use_gbpages = direct_gbpages; | ||
| 702 | #endif | ||
| 703 | |||
| 704 | if (use_gbpages) | ||
| 705 | page_size_mask |= 1 << PG_LEVEL_1G; | ||
| 706 | if (use_pse) | ||
| 707 | page_size_mask |= 1 << PG_LEVEL_2M; | ||
| 708 | |||
| 709 | memset(mr, 0, sizeof(mr)); | ||
| 710 | nr_range = 0; | ||
| 711 | |||
| 712 | /* head if not big page alignment ?*/ | ||
| 713 | start_pfn = start >> PAGE_SHIFT; | ||
| 714 | pos = start_pfn << PAGE_SHIFT; | ||
| 715 | end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) | ||
| 716 | << (PMD_SHIFT - PAGE_SHIFT); | ||
| 717 | if (end_pfn > (end >> PAGE_SHIFT)) | ||
| 718 | end_pfn = end >> PAGE_SHIFT; | ||
| 719 | if (start_pfn < end_pfn) { | ||
| 720 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); | ||
| 721 | pos = end_pfn << PAGE_SHIFT; | ||
| 722 | } | ||
| 723 | |||
| 724 | /* big page (2M) range*/ | ||
| 725 | start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) | ||
| 726 | << (PMD_SHIFT - PAGE_SHIFT); | ||
| 727 | end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) | ||
| 728 | << (PUD_SHIFT - PAGE_SHIFT); | ||
| 729 | if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) | ||
| 730 | end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); | ||
| 731 | if (start_pfn < end_pfn) { | ||
| 732 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, | ||
| 733 | page_size_mask & (1<<PG_LEVEL_2M)); | ||
| 734 | pos = end_pfn << PAGE_SHIFT; | ||
| 735 | } | ||
| 736 | |||
| 737 | /* big page (1G) range */ | ||
| 738 | start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) | ||
| 739 | << (PUD_SHIFT - PAGE_SHIFT); | ||
| 740 | end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); | ||
| 741 | if (start_pfn < end_pfn) { | ||
| 742 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, | ||
| 743 | page_size_mask & | ||
| 744 | ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); | ||
| 745 | pos = end_pfn << PAGE_SHIFT; | ||
| 746 | } | ||
| 747 | |||
| 748 | /* tail is not big page (1G) alignment */ | ||
| 749 | start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) | ||
| 750 | << (PMD_SHIFT - PAGE_SHIFT); | ||
| 751 | end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); | ||
| 752 | if (start_pfn < end_pfn) { | ||
| 753 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, | ||
| 754 | page_size_mask & (1<<PG_LEVEL_2M)); | ||
| 755 | pos = end_pfn << PAGE_SHIFT; | ||
| 756 | } | ||
| 757 | |||
| 758 | /* tail is not big page (2M) alignment */ | ||
| 759 | start_pfn = pos>>PAGE_SHIFT; | ||
| 760 | end_pfn = end>>PAGE_SHIFT; | ||
| 761 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); | ||
| 762 | |||
| 763 | /* try to merge same page size and continuous */ | ||
| 764 | for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { | ||
| 765 | unsigned long old_start; | ||
| 766 | if (mr[i].end != mr[i+1].start || | ||
| 767 | mr[i].page_size_mask != mr[i+1].page_size_mask) | ||
| 768 | continue; | ||
| 769 | /* move it */ | ||
| 770 | old_start = mr[i].start; | ||
| 771 | memmove(&mr[i], &mr[i+1], | ||
| 772 | (nr_range - 1 - i) * sizeof (struct map_range)); | ||
| 773 | mr[i--].start = old_start; | ||
| 774 | nr_range--; | ||
| 775 | } | ||
| 776 | |||
| 777 | for (i = 0; i < nr_range; i++) | ||
| 778 | printk(KERN_DEBUG " %010lx - %010lx page %s\n", | ||
| 779 | mr[i].start, mr[i].end, | ||
| 780 | (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":( | ||
| 781 | (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); | ||
| 782 | |||
| 783 | if (!after_bootmem) | ||
| 784 | find_early_table_space(end, use_pse, use_gbpages); | ||
| 785 | |||
| 786 | for (i = 0; i < nr_range; i++) | ||
| 787 | last_map_addr = kernel_physical_mapping_init( | ||
| 788 | mr[i].start, mr[i].end, | ||
| 789 | mr[i].page_size_mask); | ||
| 790 | |||
| 791 | if (!after_bootmem) | ||
| 792 | mmu_cr4_features = read_cr4(); | ||
| 793 | __flush_tlb_all(); | ||
| 794 | |||
| 795 | if (!after_bootmem && table_end > table_start) | ||
| 796 | reserve_early(table_start << PAGE_SHIFT, | ||
| 797 | table_end << PAGE_SHIFT, "PGTABLE"); | ||
| 798 | |||
| 799 | printk(KERN_INFO "last_map_addr: %lx end: %lx\n", | ||
| 800 | last_map_addr, end); | ||
| 801 | |||
| 802 | if (!after_bootmem) | ||
| 803 | early_memtest(start, end); | ||
| 804 | |||
| 805 | return last_map_addr >> PAGE_SHIFT; | ||
| 806 | } | ||
| 807 | |||
| 808 | #ifndef CONFIG_NUMA | 613 | #ifndef CONFIG_NUMA |
| 809 | void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) | 614 | void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) |
| 810 | { | 615 | { |
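Reviewer note: the large deletion above removes the map_range splitting and the whole init_memory_mapping() body from the 64-bit file. The splitting idea is still useful when reading the remaining kernel_physical_mapping_init(): a range is carved into an unaligned 4 kB head, 2 MB and 1 GB aligned middles, then 2 MB and 4 kB tails, so the largest usable page size covers the bulk. A stand-alone sketch of that carving, assuming both 2 MB and 1 GB pages are usable and using an invented example range:

/* Sketch of the alignment-based split done by the removed
 * init_memory_mapping() code.  Stand-alone illustration, not kernel code. */
#include <stdio.h>

#define SZ_2M 0x200000UL
#define SZ_1G 0x40000000UL

static unsigned long align_up(unsigned long x, unsigned long a)
{
	return (x + a - 1) & ~(a - 1);
}

static unsigned long align_down(unsigned long x, unsigned long a)
{
	return x & ~(a - 1);
}

static void emit(unsigned long s, unsigned long e, const char *sz)
{
	if (s < e)
		printf("  %010lx - %010lx  %s pages\n", s, e, sz);
}

int main(void)
{
	unsigned long start = 0x1ff000, end = 0x80200000, pos = start;

	/* 4 kB head up to the first 2 MB boundary. */
	unsigned long head_end = align_up(pos, SZ_2M);
	if (head_end > end)
		head_end = end;
	emit(pos, head_end, "4k");
	pos = head_end;

	/* 1 GB aligned middle, with 2 MB pages leading up to it. */
	unsigned long mid2m_end = align_down(end, SZ_2M);
	unsigned long gb_start  = align_up(pos, SZ_1G);
	unsigned long gb_end    = align_down(end, SZ_1G);
	if (gb_start < gb_end) {
		emit(pos, gb_start, "2M");
		emit(gb_start, gb_end, "1G");
		pos = gb_end;
	}

	/* 2 MB and 4 kB tails. */
	emit(pos, mid2m_end, "2M");
	if (mid2m_end > pos)
		pos = mid2m_end;
	emit(pos, end, "4k");
	return 0;
}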
| @@ -876,28 +681,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); | |||
| 876 | 681 | ||
| 877 | #endif /* CONFIG_MEMORY_HOTPLUG */ | 682 | #endif /* CONFIG_MEMORY_HOTPLUG */ |
| 878 | 683 | ||
| 879 | /* | ||
| 880 | * devmem_is_allowed() checks to see if /dev/mem access to a certain address | ||
| 881 | * is valid. The argument is a physical page number. | ||
| 882 | * | ||
| 883 | * | ||
| 884 | * On x86, access has to be given to the first megabyte of ram because that area | ||
| 885 | * contains bios code and data regions used by X and dosemu and similar apps. | ||
| 886 | * Access has to be given to non-kernel-ram areas as well, these contain the PCI | ||
| 887 | * mmio resources as well as potential bios/acpi data regions. | ||
| 888 | */ | ||
| 889 | int devmem_is_allowed(unsigned long pagenr) | ||
| 890 | { | ||
| 891 | if (pagenr <= 256) | ||
| 892 | return 1; | ||
| 893 | if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) | ||
| 894 | return 0; | ||
| 895 | if (!page_is_ram(pagenr)) | ||
| 896 | return 1; | ||
| 897 | return 0; | ||
| 898 | } | ||
| 899 | |||
| 900 | |||
| 901 | static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, | 684 | static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, |
| 902 | kcore_modules, kcore_vsyscall; | 685 | kcore_modules, kcore_vsyscall; |
| 903 | 686 | ||
| @@ -985,13 +768,6 @@ void mark_rodata_ro(void) | |||
| 985 | 768 | ||
| 986 | #endif | 769 | #endif |
| 987 | 770 | ||
| 988 | #ifdef CONFIG_BLK_DEV_INITRD | ||
| 989 | void free_initrd_mem(unsigned long start, unsigned long end) | ||
| 990 | { | ||
| 991 | free_init_pages("initrd memory", start, end); | ||
| 992 | } | ||
| 993 | #endif | ||
| 994 | |||
| 995 | int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, | 771 | int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, |
| 996 | int flags) | 772 | int flags) |
| 997 | { | 773 | { |
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 433f7bd4648a..aca924a30ee6 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c | |||
| @@ -38,8 +38,7 @@ unsigned long __phys_addr(unsigned long x) | |||
| 38 | } else { | 38 | } else { |
| 39 | VIRTUAL_BUG_ON(x < PAGE_OFFSET); | 39 | VIRTUAL_BUG_ON(x < PAGE_OFFSET); |
| 40 | x -= PAGE_OFFSET; | 40 | x -= PAGE_OFFSET; |
| 41 | VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM : | 41 | VIRTUAL_BUG_ON(!phys_addr_valid(x)); |
| 42 | !phys_addr_valid(x)); | ||
| 43 | } | 42 | } |
| 44 | return x; | 43 | return x; |
| 45 | } | 44 | } |
| @@ -56,10 +55,8 @@ bool __virt_addr_valid(unsigned long x) | |||
| 56 | if (x < PAGE_OFFSET) | 55 | if (x < PAGE_OFFSET) |
| 57 | return false; | 56 | return false; |
| 58 | x -= PAGE_OFFSET; | 57 | x -= PAGE_OFFSET; |
| 59 | if (system_state == SYSTEM_BOOTING ? | 58 | if (!phys_addr_valid(x)) |
| 60 | x > MAXMEM : !phys_addr_valid(x)) { | ||
| 61 | return false; | 59 | return false; |
| 62 | } | ||
| 63 | } | 60 | } |
| 64 | 61 | ||
| 65 | return pfn_valid(x >> PAGE_SHIFT); | 62 | return pfn_valid(x >> PAGE_SHIFT); |
| @@ -76,10 +73,9 @@ static inline int phys_addr_valid(unsigned long addr) | |||
| 76 | #ifdef CONFIG_DEBUG_VIRTUAL | 73 | #ifdef CONFIG_DEBUG_VIRTUAL |
| 77 | unsigned long __phys_addr(unsigned long x) | 74 | unsigned long __phys_addr(unsigned long x) |
| 78 | { | 75 | { |
| 79 | /* VMALLOC_* aren't constants; not available at the boot time */ | 76 | /* VMALLOC_* aren't constants */ |
| 80 | VIRTUAL_BUG_ON(x < PAGE_OFFSET); | 77 | VIRTUAL_BUG_ON(x < PAGE_OFFSET); |
| 81 | VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING && | 78 | VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); |
| 82 | is_vmalloc_addr((void *) x)); | ||
| 83 | return x - PAGE_OFFSET; | 79 | return x - PAGE_OFFSET; |
| 84 | } | 80 | } |
| 85 | EXPORT_SYMBOL(__phys_addr); | 81 | EXPORT_SYMBOL(__phys_addr); |
| @@ -89,7 +85,9 @@ bool __virt_addr_valid(unsigned long x) | |||
| 89 | { | 85 | { |
| 90 | if (x < PAGE_OFFSET) | 86 | if (x < PAGE_OFFSET) |
| 91 | return false; | 87 | return false; |
| 92 | if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x)) | 88 | if (__vmalloc_start_set && is_vmalloc_addr((void *) x)) |
| 89 | return false; | ||
| 90 | if (x >= FIXADDR_START) | ||
| 93 | return false; | 91 | return false; |
| 94 | return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); | 92 | return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); |
| 95 | } | 93 | } |
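Reviewer note: the hunks above drop the system_state == SYSTEM_BOOTING special cases in favour of phys_addr_valid() on 64-bit and a __vmalloc_start_set flag plus an explicit FIXADDR_START bound on 32-bit. The resulting shape of the 32-bit check, restated as a stand-alone predicate with invented boundary constants (the real code also requires pfn_valid() on the resulting frame and uses is_vmalloc_addr() rather than a single lower bound):

/* Simplified __virt_addr_valid()-style predicate; all boundaries here
 * are made up for illustration. */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_OFFSET   0xc0000000UL
#define FIXADDR_START 0xfff00000UL

static bool vmalloc_start_set = true;
static unsigned long vmalloc_start = 0xe0000000UL;

static bool virt_addr_valid_sketch(unsigned long x)
{
	if (x < PAGE_OFFSET)
		return false;			/* below the direct map */
	if (vmalloc_start_set && x >= vmalloc_start)
		return false;			/* vmalloc area, once known */
	if (x >= FIXADDR_START)
		return false;			/* fixmap area */
	return true;				/* real code also checks pfn_valid() */
}

int main(void)
{
	printf("%d %d %d\n",
	       virt_addr_valid_sketch(0xc0100000UL),
	       virt_addr_valid_sketch(0xe0000010UL),
	       virt_addr_valid_sketch(0x00001000UL));
	return 0;
}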
| @@ -508,13 +506,19 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr) | |||
| 508 | return &bm_pte[pte_index(addr)]; | 506 | return &bm_pte[pte_index(addr)]; |
| 509 | } | 507 | } |
| 510 | 508 | ||
| 509 | static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; | ||
| 510 | |||
| 511 | void __init early_ioremap_init(void) | 511 | void __init early_ioremap_init(void) |
| 512 | { | 512 | { |
| 513 | pmd_t *pmd; | 513 | pmd_t *pmd; |
| 514 | int i; | ||
| 514 | 515 | ||
| 515 | if (early_ioremap_debug) | 516 | if (early_ioremap_debug) |
| 516 | printk(KERN_INFO "early_ioremap_init()\n"); | 517 | printk(KERN_INFO "early_ioremap_init()\n"); |
| 517 | 518 | ||
| 519 | for (i = 0; i < FIX_BTMAPS_SLOTS; i++) | ||
| 520 | slot_virt[i] = fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); | ||
| 521 | |||
| 518 | pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); | 522 | pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); |
| 519 | memset(bm_pte, 0, sizeof(bm_pte)); | 523 | memset(bm_pte, 0, sizeof(bm_pte)); |
| 520 | pmd_populate_kernel(&init_mm, pmd, bm_pte); | 524 | pmd_populate_kernel(&init_mm, pmd, bm_pte); |
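Reviewer note: slot_virt[] caches the virtual address of each early-ioremap slot once in early_ioremap_init(); __early_ioremap() further down then indexes the array instead of re-deriving the address from the fixmap index on every call. A tiny stand-alone illustration of the same init-once/look-up-later split, with made-up layout constants:

/* Init-once lookup table for per-slot base addresses; the constants
 * and layout are invented for illustration only. */
#include <stdio.h>

#define SLOTS     4
#define SLOT_SIZE (64 * 4096UL)		/* 64 pages per slot */
#define AREA_TOP  0xffc00000UL		/* illustrative, not FIXADDR_TOP */

static unsigned long slot_virt[SLOTS];

static void slots_init(void)
{
	for (int i = 0; i < SLOTS; i++)
		slot_virt[i] = AREA_TOP - (i + 1) * SLOT_SIZE;
}

/* Later users just index the table. */
static unsigned long slot_base(int slot)
{
	return slot_virt[slot];
}

int main(void)
{
	slots_init();
	for (int i = 0; i < SLOTS; i++)
		printf("slot %d -> %#lx\n", i, slot_base(i));
	return 0;
}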
| @@ -581,6 +585,7 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx) | |||
| 581 | 585 | ||
| 582 | static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; | 586 | static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; |
| 583 | static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; | 587 | static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; |
| 588 | |||
| 584 | static int __init check_early_ioremap_leak(void) | 589 | static int __init check_early_ioremap_leak(void) |
| 585 | { | 590 | { |
| 586 | int count = 0; | 591 | int count = 0; |
| @@ -602,7 +607,8 @@ static int __init check_early_ioremap_leak(void) | |||
| 602 | } | 607 | } |
| 603 | late_initcall(check_early_ioremap_leak); | 608 | late_initcall(check_early_ioremap_leak); |
| 604 | 609 | ||
| 605 | static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) | 610 | static void __init __iomem * |
| 611 | __early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) | ||
| 606 | { | 612 | { |
| 607 | unsigned long offset, last_addr; | 613 | unsigned long offset, last_addr; |
| 608 | unsigned int nrpages; | 614 | unsigned int nrpages; |
| @@ -668,9 +674,9 @@ static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned lo | |||
| 668 | --nrpages; | 674 | --nrpages; |
| 669 | } | 675 | } |
| 670 | if (early_ioremap_debug) | 676 | if (early_ioremap_debug) |
| 671 | printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); | 677 | printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]); |
| 672 | 678 | ||
| 673 | prev_map[slot] = (void __iomem *)(offset + fix_to_virt(idx0)); | 679 | prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); |
| 674 | return prev_map[slot]; | 680 | return prev_map[slot]; |
| 675 | } | 681 | } |
| 676 | 682 | ||
| @@ -738,8 +744,3 @@ void __init early_iounmap(void __iomem *addr, unsigned long size) | |||
| 738 | } | 744 | } |
| 739 | prev_map[slot] = NULL; | 745 | prev_map[slot] = NULL; |
| 740 | } | 746 | } |
| 741 | |||
| 742 | void __this_fixmap_does_not_exist(void) | ||
| 743 | { | ||
| 744 | WARN_ON(1); | ||
| 745 | } | ||
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 93d82038af4b..6a518dd08a36 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c | |||
| @@ -32,11 +32,14 @@ struct kmmio_fault_page { | |||
| 32 | struct list_head list; | 32 | struct list_head list; |
| 33 | struct kmmio_fault_page *release_next; | 33 | struct kmmio_fault_page *release_next; |
| 34 | unsigned long page; /* location of the fault page */ | 34 | unsigned long page; /* location of the fault page */ |
| 35 | bool old_presence; /* page presence prior to arming */ | ||
| 36 | bool armed; | ||
| 35 | 37 | ||
| 36 | /* | 38 | /* |
| 37 | * Number of times this page has been registered as a part | 39 | * Number of times this page has been registered as a part |
| 38 | * of a probe. If zero, page is disarmed and this may be freed. | 40 | * of a probe. If zero, page is disarmed and this may be freed. |
| 39 | * Used only by writers (RCU). | 41 | * Used only by writers (RCU) and post_kmmio_handler(). |
| 42 | * Protected by kmmio_lock, when linked into kmmio_page_table. | ||
| 40 | */ | 43 | */ |
| 41 | int count; | 44 | int count; |
| 42 | }; | 45 | }; |
| @@ -105,57 +108,85 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) | |||
| 105 | return NULL; | 108 | return NULL; |
| 106 | } | 109 | } |
| 107 | 110 | ||
| 108 | static void set_page_present(unsigned long addr, bool present, | 111 | static void set_pmd_presence(pmd_t *pmd, bool present, bool *old) |
| 109 | unsigned int *pglevel) | 112 | { |
| 113 | pmdval_t v = pmd_val(*pmd); | ||
| 114 | *old = !!(v & _PAGE_PRESENT); | ||
| 115 | v &= ~_PAGE_PRESENT; | ||
| 116 | if (present) | ||
| 117 | v |= _PAGE_PRESENT; | ||
| 118 | set_pmd(pmd, __pmd(v)); | ||
| 119 | } | ||
| 120 | |||
| 121 | static void set_pte_presence(pte_t *pte, bool present, bool *old) | ||
| 122 | { | ||
| 123 | pteval_t v = pte_val(*pte); | ||
| 124 | *old = !!(v & _PAGE_PRESENT); | ||
| 125 | v &= ~_PAGE_PRESENT; | ||
| 126 | if (present) | ||
| 127 | v |= _PAGE_PRESENT; | ||
| 128 | set_pte_atomic(pte, __pte(v)); | ||
| 129 | } | ||
| 130 | |||
| 131 | static int set_page_presence(unsigned long addr, bool present, bool *old) | ||
| 110 | { | 132 | { |
| 111 | pteval_t pteval; | ||
| 112 | pmdval_t pmdval; | ||
| 113 | unsigned int level; | 133 | unsigned int level; |
| 114 | pmd_t *pmd; | ||
| 115 | pte_t *pte = lookup_address(addr, &level); | 134 | pte_t *pte = lookup_address(addr, &level); |
| 116 | 135 | ||
| 117 | if (!pte) { | 136 | if (!pte) { |
| 118 | pr_err("kmmio: no pte for page 0x%08lx\n", addr); | 137 | pr_err("kmmio: no pte for page 0x%08lx\n", addr); |
| 119 | return; | 138 | return -1; |
| 120 | } | 139 | } |
| 121 | 140 | ||
| 122 | if (pglevel) | ||
| 123 | *pglevel = level; | ||
| 124 | |||
| 125 | switch (level) { | 141 | switch (level) { |
| 126 | case PG_LEVEL_2M: | 142 | case PG_LEVEL_2M: |
| 127 | pmd = (pmd_t *)pte; | 143 | set_pmd_presence((pmd_t *)pte, present, old); |
| 128 | pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT; | ||
| 129 | if (present) | ||
| 130 | pmdval |= _PAGE_PRESENT; | ||
| 131 | set_pmd(pmd, __pmd(pmdval)); | ||
| 132 | break; | 144 | break; |
| 133 | |||
| 134 | case PG_LEVEL_4K: | 145 | case PG_LEVEL_4K: |
| 135 | pteval = pte_val(*pte) & ~_PAGE_PRESENT; | 146 | set_pte_presence(pte, present, old); |
| 136 | if (present) | ||
| 137 | pteval |= _PAGE_PRESENT; | ||
| 138 | set_pte_atomic(pte, __pte(pteval)); | ||
| 139 | break; | 147 | break; |
| 140 | |||
| 141 | default: | 148 | default: |
| 142 | pr_err("kmmio: unexpected page level 0x%x.\n", level); | 149 | pr_err("kmmio: unexpected page level 0x%x.\n", level); |
| 143 | return; | 150 | return -1; |
| 144 | } | 151 | } |
| 145 | 152 | ||
| 146 | __flush_tlb_one(addr); | 153 | __flush_tlb_one(addr); |
| 154 | return 0; | ||
| 147 | } | 155 | } |
| 148 | 156 | ||
| 149 | /** Mark the given page as not present. Access to it will trigger a fault. */ | 157 | /* |
| 150 | static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) | 158 | * Mark the given page as not present. Access to it will trigger a fault. |
| 159 | * | ||
| 160 | * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the | ||
| 161 | * protection is ignored here. RCU read lock is assumed held, so the struct | ||
| 162 | * will not disappear unexpectedly. Furthermore, the caller must guarantee, | ||
| 163 | * that double arming the same virtual address (page) cannot occur. | ||
| 164 | * | ||
| 165 | * Double disarming on the other hand is allowed, and may occur when a fault | ||
| 166 | * and mmiotrace shutdown happen simultaneously. | ||
| 167 | */ | ||
| 168 | static int arm_kmmio_fault_page(struct kmmio_fault_page *f) | ||
| 151 | { | 169 | { |
| 152 | set_page_present(page & PAGE_MASK, false, pglevel); | 170 | int ret; |
| 171 | WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n"); | ||
| 172 | if (f->armed) { | ||
| 173 | pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n", | ||
| 174 | f->page, f->count, f->old_presence); | ||
| 175 | } | ||
| 176 | ret = set_page_presence(f->page, false, &f->old_presence); | ||
| 177 | WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page); | ||
| 178 | f->armed = true; | ||
| 179 | return ret; | ||
| 153 | } | 180 | } |
| 154 | 181 | ||
| 155 | /** Mark the given page as present. */ | 182 | /** Restore the given page to saved presence state. */ |
| 156 | static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) | 183 | static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) |
| 157 | { | 184 | { |
| 158 | set_page_present(page & PAGE_MASK, true, pglevel); | 185 | bool tmp; |
| 186 | int ret = set_page_presence(f->page, f->old_presence, &tmp); | ||
| 187 | WARN_ONCE(ret < 0, | ||
| 188 | KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); | ||
| 189 | f->armed = false; | ||
| 159 | } | 190 | } |
| 160 | 191 | ||
| 161 | /* | 192 | /* |
| @@ -202,28 +233,32 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) | |||
| 202 | 233 | ||
| 203 | ctx = &get_cpu_var(kmmio_ctx); | 234 | ctx = &get_cpu_var(kmmio_ctx); |
| 204 | if (ctx->active) { | 235 | if (ctx->active) { |
| 205 | disarm_kmmio_fault_page(faultpage->page, NULL); | ||
| 206 | if (addr == ctx->addr) { | 236 | if (addr == ctx->addr) { |
| 207 | /* | 237 | /* |
| 208 | * On SMP we sometimes get recursive probe hits on the | 238 | * A second fault on the same page means some other |
| 209 | * same address. Context is already saved, fall out. | 239 | * condition needs handling by do_page_fault(), the |
| 240 | * page really not being present is the most common. | ||
| 210 | */ | 241 | */ |
| 211 | pr_debug("kmmio: duplicate probe hit on CPU %d, for " | 242 | pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n", |
| 212 | "address 0x%08lx.\n", | 243 | addr, smp_processor_id()); |
| 213 | smp_processor_id(), addr); | 244 | |
| 214 | ret = 1; | 245 | if (!faultpage->old_presence) |
| 215 | goto no_kmmio_ctx; | 246 | pr_info("kmmio: unexpected secondary hit for " |
| 216 | } | 247 | "address 0x%08lx on CPU %d.\n", addr, |
| 217 | /* | 248 | smp_processor_id()); |
| 218 | * Prevent overwriting already in-flight context. | 249 | } else { |
| 219 | * This should not happen, let's hope disarming at least | 250 | /* |
| 220 | * prevents a panic. | 251 | * Prevent overwriting already in-flight context. |
| 221 | */ | 252 | * This should not happen, let's hope disarming at |
| 222 | pr_emerg("kmmio: recursive probe hit on CPU %d, " | 253 | * least prevents a panic. |
| 254 | */ | ||
| 255 | pr_emerg("kmmio: recursive probe hit on CPU %d, " | ||
| 223 | "for address 0x%08lx. Ignoring.\n", | 256 | "for address 0x%08lx. Ignoring.\n", |
| 224 | smp_processor_id(), addr); | 257 | smp_processor_id(), addr); |
| 225 | pr_emerg("kmmio: previous hit was at 0x%08lx.\n", | 258 | pr_emerg("kmmio: previous hit was at 0x%08lx.\n", |
| 226 | ctx->addr); | 259 | ctx->addr); |
| 260 | disarm_kmmio_fault_page(faultpage); | ||
| 261 | } | ||
| 227 | goto no_kmmio_ctx; | 262 | goto no_kmmio_ctx; |
| 228 | } | 263 | } |
| 229 | ctx->active++; | 264 | ctx->active++; |
| @@ -244,7 +279,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) | |||
| 244 | regs->flags &= ~X86_EFLAGS_IF; | 279 | regs->flags &= ~X86_EFLAGS_IF; |
| 245 | 280 | ||
| 246 | /* Now we set present bit in PTE and single step. */ | 281 | /* Now we set present bit in PTE and single step. */ |
| 247 | disarm_kmmio_fault_page(ctx->fpage->page, NULL); | 282 | disarm_kmmio_fault_page(ctx->fpage); |
| 248 | 283 | ||
| 249 | /* | 284 | /* |
| 250 | * If another cpu accesses the same page while we are stepping, | 285 | * If another cpu accesses the same page while we are stepping, |
| @@ -275,7 +310,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) | |||
| 275 | struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); | 310 | struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); |
| 276 | 311 | ||
| 277 | if (!ctx->active) { | 312 | if (!ctx->active) { |
| 278 | pr_debug("kmmio: spurious debug trap on CPU %d.\n", | 313 | pr_warning("kmmio: spurious debug trap on CPU %d.\n", |
| 279 | smp_processor_id()); | 314 | smp_processor_id()); |
| 280 | goto out; | 315 | goto out; |
| 281 | } | 316 | } |
| @@ -283,7 +318,11 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) | |||
| 283 | if (ctx->probe && ctx->probe->post_handler) | 318 | if (ctx->probe && ctx->probe->post_handler) |
| 284 | ctx->probe->post_handler(ctx->probe, condition, regs); | 319 | ctx->probe->post_handler(ctx->probe, condition, regs); |
| 285 | 320 | ||
| 286 | arm_kmmio_fault_page(ctx->fpage->page, NULL); | 321 | /* Prevent racing against release_kmmio_fault_page(). */ |
| 322 | spin_lock(&kmmio_lock); | ||
| 323 | if (ctx->fpage->count) | ||
| 324 | arm_kmmio_fault_page(ctx->fpage); | ||
| 325 | spin_unlock(&kmmio_lock); | ||
| 287 | 326 | ||
| 288 | regs->flags &= ~X86_EFLAGS_TF; | 327 | regs->flags &= ~X86_EFLAGS_TF; |
| 289 | regs->flags |= ctx->saved_flags; | 328 | regs->flags |= ctx->saved_flags; |
| @@ -315,20 +354,24 @@ static int add_kmmio_fault_page(unsigned long page) | |||
| 315 | f = get_kmmio_fault_page(page); | 354 | f = get_kmmio_fault_page(page); |
| 316 | if (f) { | 355 | if (f) { |
| 317 | if (!f->count) | 356 | if (!f->count) |
| 318 | arm_kmmio_fault_page(f->page, NULL); | 357 | arm_kmmio_fault_page(f); |
| 319 | f->count++; | 358 | f->count++; |
| 320 | return 0; | 359 | return 0; |
| 321 | } | 360 | } |
| 322 | 361 | ||
| 323 | f = kmalloc(sizeof(*f), GFP_ATOMIC); | 362 | f = kzalloc(sizeof(*f), GFP_ATOMIC); |
| 324 | if (!f) | 363 | if (!f) |
| 325 | return -1; | 364 | return -1; |
| 326 | 365 | ||
| 327 | f->count = 1; | 366 | f->count = 1; |
| 328 | f->page = page; | 367 | f->page = page; |
| 329 | list_add_rcu(&f->list, kmmio_page_list(f->page)); | ||
| 330 | 368 | ||
| 331 | arm_kmmio_fault_page(f->page, NULL); | 369 | if (arm_kmmio_fault_page(f)) { |
| 370 | kfree(f); | ||
| 371 | return -1; | ||
| 372 | } | ||
| 373 | |||
| 374 | list_add_rcu(&f->list, kmmio_page_list(f->page)); | ||
| 332 | 375 | ||
| 333 | return 0; | 376 | return 0; |
| 334 | } | 377 | } |
| @@ -347,7 +390,7 @@ static void release_kmmio_fault_page(unsigned long page, | |||
| 347 | f->count--; | 390 | f->count--; |
| 348 | BUG_ON(f->count < 0); | 391 | BUG_ON(f->count < 0); |
| 349 | if (!f->count) { | 392 | if (!f->count) { |
| 350 | disarm_kmmio_fault_page(f->page, NULL); | 393 | disarm_kmmio_fault_page(f); |
| 351 | f->release_next = *release_list; | 394 | f->release_next = *release_list; |
| 352 | *release_list = f; | 395 | *release_list = f; |
| 353 | } | 396 | } |
| @@ -408,23 +451,24 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head) | |||
| 408 | 451 | ||
| 409 | static void remove_kmmio_fault_pages(struct rcu_head *head) | 452 | static void remove_kmmio_fault_pages(struct rcu_head *head) |
| 410 | { | 453 | { |
| 411 | struct kmmio_delayed_release *dr = container_of( | 454 | struct kmmio_delayed_release *dr = |
| 412 | head, | 455 | container_of(head, struct kmmio_delayed_release, rcu); |
| 413 | struct kmmio_delayed_release, | ||
| 414 | rcu); | ||
| 415 | struct kmmio_fault_page *p = dr->release_list; | 456 | struct kmmio_fault_page *p = dr->release_list; |
| 416 | struct kmmio_fault_page **prevp = &dr->release_list; | 457 | struct kmmio_fault_page **prevp = &dr->release_list; |
| 417 | unsigned long flags; | 458 | unsigned long flags; |
| 459 | |||
| 418 | spin_lock_irqsave(&kmmio_lock, flags); | 460 | spin_lock_irqsave(&kmmio_lock, flags); |
| 419 | while (p) { | 461 | while (p) { |
| 420 | if (!p->count) | 462 | if (!p->count) { |
| 421 | list_del_rcu(&p->list); | 463 | list_del_rcu(&p->list); |
| 422 | else | 464 | prevp = &p->release_next; |
| 465 | } else { | ||
| 423 | *prevp = p->release_next; | 466 | *prevp = p->release_next; |
| 424 | prevp = &p->release_next; | 467 | } |
| 425 | p = p->release_next; | 468 | p = p->release_next; |
| 426 | } | 469 | } |
| 427 | spin_unlock_irqrestore(&kmmio_lock, flags); | 470 | spin_unlock_irqrestore(&kmmio_lock, flags); |
| 471 | |||
| 428 | /* This is the real RCU destroy call. */ | 472 | /* This is the real RCU destroy call. */ |
| 429 | call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); | 473 | call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); |
| 430 | } | 474 | } |
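Reviewer note: the kmmio changes above split set_page_present() into set_pmd_presence()/set_pte_presence() behind a common set_page_presence(), record the pre-arming presence bit in struct kmmio_fault_page, and restore that saved state on disarm instead of unconditionally marking the page present. The essence of the save/clear/restore dance, restated as a stand-alone sketch on a plain flag word (the real code edits live pte/pmd entries and flushes the TLB):

/* Sketch of the arm/disarm bookkeeping: remember what the present bit
 * was before arming, and put exactly that back when disarming.
 * 'entry' stands in for a pte/pmd value; the flag is illustrative. */
#include <stdbool.h>
#include <stdio.h>

#define PRESENT 0x1UL

struct fault_page {
	unsigned long *entry;	/* the mapping entry we poke */
	bool old_presence;	/* presence before arming */
	bool armed;
};

static void set_presence(unsigned long *entry, bool present, bool *old)
{
	*old = *entry & PRESENT;
	*entry &= ~PRESENT;
	if (present)
		*entry |= PRESENT;
}

static void arm(struct fault_page *f)
{
	/* Double-arming would lose the original state, hence the warning
	 * in the real code; double-disarming is harmless. */
	set_presence(f->entry, false, &f->old_presence);
	f->armed = true;
}

static void disarm(struct fault_page *f)
{
	bool ignored;

	set_presence(f->entry, f->old_presence, &ignored);
	f->armed = false;
}

int main(void)
{
	unsigned long pte = PRESENT | 0x1000;
	struct fault_page f = { &pte, false, false };

	arm(&f);		/* faults now reach the mmiotrace handler */
	disarm(&f);		/* original presence restored, not forced on */
	printf("entry %#lx, old_presence %d\n", pte, f.old_presence);
	return 0;
}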
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 0bcd7883d036..605c8be06217 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c | |||
| @@ -100,6 +100,9 @@ static int __init parse_memtest(char *arg) | |||
| 100 | { | 100 | { |
| 101 | if (arg) | 101 | if (arg) |
| 102 | memtest_pattern = simple_strtoul(arg, NULL, 0); | 102 | memtest_pattern = simple_strtoul(arg, NULL, 0); |
| 103 | else | ||
| 104 | memtest_pattern = ARRAY_SIZE(patterns); | ||
| 105 | |||
| 103 | return 0; | 106 | return 0; |
| 104 | } | 107 | } |
| 105 | 108 | ||
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 451fe95a0352..3daefa04ace5 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c | |||
| @@ -416,10 +416,11 @@ void __init initmem_init(unsigned long start_pfn, | |||
| 416 | for_each_online_node(nid) | 416 | for_each_online_node(nid) |
| 417 | propagate_e820_map_node(nid); | 417 | propagate_e820_map_node(nid); |
| 418 | 418 | ||
| 419 | for_each_online_node(nid) | 419 | for_each_online_node(nid) { |
| 420 | memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); | 420 | memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); |
| 421 | NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; | ||
| 422 | } | ||
| 421 | 423 | ||
| 422 | NODE_DATA(0)->bdata = &bootmem_node_data[0]; | ||
| 423 | setup_bootmem_allocator(); | 424 | setup_bootmem_allocator(); |
| 424 | } | 425 | } |
| 425 | 426 | ||
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index ab50a8d7402c..427fd1b56df5 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Written by Pekka Paalanen, 2008 <pq@iki.fi> | 2 | * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi> |
| 3 | */ | 3 | */ |
| 4 | #include <linux/module.h> | 4 | #include <linux/module.h> |
| 5 | #include <linux/io.h> | 5 | #include <linux/io.h> |
| @@ -9,35 +9,74 @@ | |||
| 9 | 9 | ||
| 10 | static unsigned long mmio_address; | 10 | static unsigned long mmio_address; |
| 11 | module_param(mmio_address, ulong, 0); | 11 | module_param(mmio_address, ulong, 0); |
| 12 | MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); | 12 | MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB " |
| 13 | "(or 8 MB if read_far is non-zero)."); | ||
| 14 | |||
| 15 | static unsigned long read_far = 0x400100; | ||
| 16 | module_param(read_far, ulong, 0); | ||
| 17 | MODULE_PARM_DESC(read_far, " Offset of a 32-bit read within 8 MB " | ||
| 18 | "(default: 0x400100)."); | ||
| 19 | |||
| 20 | static unsigned v16(unsigned i) | ||
| 21 | { | ||
| 22 | return i * 12 + 7; | ||
| 23 | } | ||
| 24 | |||
| 25 | static unsigned v32(unsigned i) | ||
| 26 | { | ||
| 27 | return i * 212371 + 13; | ||
| 28 | } | ||
| 13 | 29 | ||
| 14 | static void do_write_test(void __iomem *p) | 30 | static void do_write_test(void __iomem *p) |
| 15 | { | 31 | { |
| 16 | unsigned int i; | 32 | unsigned int i; |
| 33 | pr_info(MODULE_NAME ": write test.\n"); | ||
| 17 | mmiotrace_printk("Write test.\n"); | 34 | mmiotrace_printk("Write test.\n"); |
| 35 | |||
| 18 | for (i = 0; i < 256; i++) | 36 | for (i = 0; i < 256; i++) |
| 19 | iowrite8(i, p + i); | 37 | iowrite8(i, p + i); |
| 38 | |||
| 20 | for (i = 1024; i < (5 * 1024); i += 2) | 39 | for (i = 1024; i < (5 * 1024); i += 2) |
| 21 | iowrite16(i * 12 + 7, p + i); | 40 | iowrite16(v16(i), p + i); |
| 41 | |||
| 22 | for (i = (5 * 1024); i < (16 * 1024); i += 4) | 42 | for (i = (5 * 1024); i < (16 * 1024); i += 4) |
| 23 | iowrite32(i * 212371 + 13, p + i); | 43 | iowrite32(v32(i), p + i); |
| 24 | } | 44 | } |
| 25 | 45 | ||
| 26 | static void do_read_test(void __iomem *p) | 46 | static void do_read_test(void __iomem *p) |
| 27 | { | 47 | { |
| 28 | unsigned int i; | 48 | unsigned int i; |
| 49 | unsigned errs[3] = { 0 }; | ||
| 50 | pr_info(MODULE_NAME ": read test.\n"); | ||
| 29 | mmiotrace_printk("Read test.\n"); | 51 | mmiotrace_printk("Read test.\n"); |
| 52 | |||
| 30 | for (i = 0; i < 256; i++) | 53 | for (i = 0; i < 256; i++) |
| 31 | ioread8(p + i); | 54 | if (ioread8(p + i) != i) |
| 55 | ++errs[0]; | ||
| 56 | |||
| 32 | for (i = 1024; i < (5 * 1024); i += 2) | 57 | for (i = 1024; i < (5 * 1024); i += 2) |
| 33 | ioread16(p + i); | 58 | if (ioread16(p + i) != v16(i)) |
| 59 | ++errs[1]; | ||
| 60 | |||
| 34 | for (i = (5 * 1024); i < (16 * 1024); i += 4) | 61 | for (i = (5 * 1024); i < (16 * 1024); i += 4) |
| 35 | ioread32(p + i); | 62 | if (ioread32(p + i) != v32(i)) |
| 63 | ++errs[2]; | ||
| 64 | |||
| 65 | mmiotrace_printk("Read errors: 8-bit %d, 16-bit %d, 32-bit %d.\n", | ||
| 66 | errs[0], errs[1], errs[2]); | ||
| 36 | } | 67 | } |
| 37 | 68 | ||
| 38 | static void do_test(void) | 69 | static void do_read_far_test(void __iomem *p) |
| 39 | { | 70 | { |
| 40 | void __iomem *p = ioremap_nocache(mmio_address, 0x4000); | 71 | pr_info(MODULE_NAME ": read far test.\n"); |
| 72 | mmiotrace_printk("Read far test.\n"); | ||
| 73 | |||
| 74 | ioread32(p + read_far); | ||
| 75 | } | ||
| 76 | |||
| 77 | static void do_test(unsigned long size) | ||
| 78 | { | ||
| 79 | void __iomem *p = ioremap_nocache(mmio_address, size); | ||
| 41 | if (!p) { | 80 | if (!p) { |
| 42 | pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); | 81 | pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); |
| 43 | return; | 82 | return; |
| @@ -45,11 +84,15 @@ static void do_test(void) | |||
| 45 | mmiotrace_printk("ioremap returned %p.\n", p); | 84 | mmiotrace_printk("ioremap returned %p.\n", p); |
| 46 | do_write_test(p); | 85 | do_write_test(p); |
| 47 | do_read_test(p); | 86 | do_read_test(p); |
| 87 | if (read_far && read_far < size - 4) | ||
| 88 | do_read_far_test(p); | ||
| 48 | iounmap(p); | 89 | iounmap(p); |
| 49 | } | 90 | } |
| 50 | 91 | ||
| 51 | static int __init init(void) | 92 | static int __init init(void) |
| 52 | { | 93 | { |
| 94 | unsigned long size = (read_far) ? (8 << 20) : (16 << 10); | ||
| 95 | |||
| 53 | if (mmio_address == 0) { | 96 | if (mmio_address == 0) { |
| 54 | pr_err(MODULE_NAME ": you have to use the module argument " | 97 | pr_err(MODULE_NAME ": you have to use the module argument " |
| 55 | "mmio_address.\n"); | 98 | "mmio_address.\n"); |
| @@ -58,10 +101,11 @@ static int __init init(void) | |||
| 58 | return -ENXIO; | 101 | return -ENXIO; |
| 59 | } | 102 | } |
| 60 | 103 | ||
| 61 | pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " | 104 | pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI " |
| 62 | "in PCI address space, and writing " | 105 | "address space, and writing 16 kB of rubbish in there.\n", |
| 63 | "rubbish in there.\n", mmio_address); | 106 | size >> 10, mmio_address); |
| 64 | do_test(); | 107 | do_test(size); |
| 108 | pr_info(MODULE_NAME ": All done.\n"); | ||
| 65 | return 0; | 109 | return 0; |
| 66 | } | 110 | } |
| 67 | 111 | ||
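Reviewer note: the test module now derives the written values from the offset via v16()/v32(), so the read pass can verify them and report per-width error counts instead of issuing unchecked reads. A stand-alone sketch of the write-then-verify idea over an ordinary buffer (the real test goes through ioremap'd MMIO space and the iowrite/ioread accessors):

/* Deterministic value-from-offset functions let a readback pass verify
 * every location without storing what was written. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t v32(unsigned int i)
{
	return i * 212371 + 13;		/* same formula the test module uses */
}

int main(void)
{
	uint8_t buf[16 * 1024];
	unsigned int i, errs = 0;

	memset(buf, 0, sizeof(buf));
	for (i = 5 * 1024; i < sizeof(buf); i += 4) {
		uint32_t v = v32(i);
		memcpy(buf + i, &v, 4);		/* stands in for iowrite32() */
	}
	for (i = 5 * 1024; i < sizeof(buf); i += 4) {
		uint32_t v;
		memcpy(&v, buf + i, 4);		/* stands in for ioread32() */
		if (v != v32(i))
			++errs;
	}
	printf("32-bit readback errors: %u\n", errs);
	return 0;
}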
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c52f4034c7fd..82cd39a6cbd3 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c | |||
| @@ -103,7 +103,7 @@ static void xen_vcpu_setup(int cpu) | |||
| 103 | 103 | ||
| 104 | vcpup = &per_cpu(xen_vcpu_info, cpu); | 104 | vcpup = &per_cpu(xen_vcpu_info, cpu); |
| 105 | 105 | ||
| 106 | info.mfn = virt_to_mfn(vcpup); | 106 | info.mfn = arbitrary_virt_to_mfn(vcpup); |
| 107 | info.offset = offset_in_page(vcpup); | 107 | info.offset = offset_in_page(vcpup); |
| 108 | 108 | ||
| 109 | printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", | 109 | printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", |
| @@ -301,8 +301,10 @@ static void xen_load_gdt(const struct desc_ptr *dtr) | |||
| 301 | frames = mcs.args; | 301 | frames = mcs.args; |
| 302 | 302 | ||
| 303 | for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { | 303 | for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { |
| 304 | frames[f] = virt_to_mfn(va); | 304 | frames[f] = arbitrary_virt_to_mfn((void *)va); |
| 305 | |||
| 305 | make_lowmem_page_readonly((void *)va); | 306 | make_lowmem_page_readonly((void *)va); |
| 307 | make_lowmem_page_readonly(mfn_to_virt(frames[f])); | ||
| 306 | } | 308 | } |
| 307 | 309 | ||
| 308 | MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); | 310 | MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); |
| @@ -314,7 +316,7 @@ static void load_TLS_descriptor(struct thread_struct *t, | |||
| 314 | unsigned int cpu, unsigned int i) | 316 | unsigned int cpu, unsigned int i) |
| 315 | { | 317 | { |
| 316 | struct desc_struct *gdt = get_cpu_gdt_table(cpu); | 318 | struct desc_struct *gdt = get_cpu_gdt_table(cpu); |
| 317 | xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); | 319 | xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); |
| 318 | struct multicall_space mc = __xen_mc_entry(0); | 320 | struct multicall_space mc = __xen_mc_entry(0); |
| 319 | 321 | ||
| 320 | MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); | 322 | MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); |
| @@ -488,7 +490,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, | |||
| 488 | break; | 490 | break; |
| 489 | 491 | ||
| 490 | default: { | 492 | default: { |
| 491 | xmaddr_t maddr = virt_to_machine(&dt[entry]); | 493 | xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]); |
| 492 | 494 | ||
| 493 | xen_mc_flush(); | 495 | xen_mc_flush(); |
| 494 | if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) | 496 | if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) |
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 319bd40a57c2..cb6afa4ec95c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c | |||
| @@ -276,6 +276,13 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) | |||
| 276 | p2m_top[topidx][idx] = mfn; | 276 | p2m_top[topidx][idx] = mfn; |
| 277 | } | 277 | } |
| 278 | 278 | ||
| 279 | unsigned long arbitrary_virt_to_mfn(void *vaddr) | ||
| 280 | { | ||
| 281 | xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); | ||
| 282 | |||
| 283 | return PFN_DOWN(maddr.maddr); | ||
| 284 | } | ||
| 285 | |||
| 279 | xmaddr_t arbitrary_virt_to_machine(void *vaddr) | 286 | xmaddr_t arbitrary_virt_to_machine(void *vaddr) |
| 280 | { | 287 | { |
| 281 | unsigned long address = (unsigned long)vaddr; | 288 | unsigned long address = (unsigned long)vaddr; |
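Reviewer note: the new arbitrary_virt_to_mfn() goes through arbitrary_virt_to_machine(), i.e. a page-table lookup, so it also works for addresses where the linear virt_to_mfn() arithmetic would be wrong; the callers switched in this patch (vcpu_info, GDT frames, TLS descriptors) all live in such memory. Once the machine address is known, the frame/offset part is a shift and a mask; a stand-alone illustration with an invented machine address:

/* PFN_DOWN() and offset_in_page() are plain arithmetic once the machine
 * address is known.  The address below is made up. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
	unsigned long maddr  = 0x2345678c40UL;		/* from the page-table walk */
	unsigned long mfn    = maddr >> PAGE_SHIFT;	/* like PFN_DOWN(maddr) */
	unsigned long offset = maddr & (PAGE_SIZE - 1);	/* like offset_in_page() */

	printf("mfn %#lx offset %#lx\n", mfn, offset);
	return 0;
}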
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 035582ae815d..8d470562ffc9 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c | |||
| @@ -219,6 +219,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) | |||
| 219 | { | 219 | { |
| 220 | struct vcpu_guest_context *ctxt; | 220 | struct vcpu_guest_context *ctxt; |
| 221 | struct desc_struct *gdt; | 221 | struct desc_struct *gdt; |
| 222 | unsigned long gdt_mfn; | ||
| 222 | 223 | ||
| 223 | if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) | 224 | if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) |
| 224 | return 0; | 225 | return 0; |
| @@ -248,9 +249,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) | |||
| 248 | ctxt->ldt_ents = 0; | 249 | ctxt->ldt_ents = 0; |
| 249 | 250 | ||
| 250 | BUG_ON((unsigned long)gdt & ~PAGE_MASK); | 251 | BUG_ON((unsigned long)gdt & ~PAGE_MASK); |
| 252 | |||
| 253 | gdt_mfn = arbitrary_virt_to_mfn(gdt); | ||
| 251 | make_lowmem_page_readonly(gdt); | 254 | make_lowmem_page_readonly(gdt); |
| 255 | make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); | ||
| 252 | 256 | ||
| 253 | ctxt->gdt_frames[0] = virt_to_mfn(gdt); | 257 | ctxt->gdt_frames[0] = gdt_mfn; |
| 254 | ctxt->gdt_ents = GDT_ENTRIES; | 258 | ctxt->gdt_ents = GDT_ENTRIES; |
| 255 | 259 | ||
| 256 | ctxt->user_regs.cs = __KERNEL_CS; | 260 | ctxt->user_regs.cs = __KERNEL_CS; |
