author     Linus Torvalds <torvalds@linux-foundation.org>  2019-05-06 18:56:41 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2019-05-06 18:56:41 -0400
commit     8f147727030bf9e81331ab9b8f42d4611bb6a3d9 (patch)
tree       d3f1e2410174bb8c479590a8f1c7e204e3a48eaf
parent     53f8b081c184328b82c8a7b5e70b8243b3cea8bd (diff)
parent     2c4645439e8f2f6e7c37f158feae6f6a82baa910 (diff)
Merge branch 'x86-irq-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 irq updates from Ingo Molnar:
 "Here are the main changes in this tree:

   - Introduce x86-64 IRQ/exception/debug stack guard pages to detect
     stack overflows immediately and deterministically.

   - Clean up over a decade worth of cruft accumulated.

  The outcome of this should be more clear-cut faults/crashes when any
  of the low level x86 CPU stacks overflow, instead of silent memory
  corruption and sporadic failures much later on"

* 'x86-irq-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits)
  x86/irq: Fix outdated comments
  x86/irq/64: Remove stack overflow debug code
  x86/irq/64: Remap the IRQ stack with guard pages
  x86/irq/64: Split the IRQ stack into its own pages
  x86/irq/64: Init hardirq_stack_ptr during CPU hotplug
  x86/irq/32: Handle irq stack allocation failure proper
  x86/irq/32: Invoke irq_ctx_init() from init_IRQ()
  x86/irq/64: Rename irq_stack_ptr to hardirq_stack_ptr
  x86/irq/32: Rename hard/softirq_stack to hard/softirq_stack_ptr
  x86/irq/32: Make irq stack a character array
  x86/irq/32: Define IRQ_STACK_SIZE
  x86/dumpstack/64: Speedup in_exception_stack()
  x86/exceptions: Split debug IST stack
  x86/exceptions: Enable IST guard pages
  x86/exceptions: Disconnect IST index and stack order
  x86/cpu: Remove orig_ist array
  x86/cpu: Prepare TSS.IST setup for guard pages
  x86/dumpstack/64: Use cpu_entry_area instead of orig_ist
  x86/irq/64: Use cpu entry area instead of orig_ist
  x86/traps: Use cpu_entry_area instead of orig_ist
  ...
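To illustrate the guard-page idea the pull request describes (this is not part of the patch): a minimal userspace sketch in which a downward-growing write that runs off the end of a mapped "stack" page hits a PROT_NONE guard page and faults immediately, instead of silently corrupting whatever memory sits below. Page size, layout and the SIGSEGV handling here are illustrative assumptions only; the series achieves the equivalent effect in the kernel by mapping the IRQ and IST stacks so that unmapped guard pages surround them.

#include <signal.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static void hit_guard(int sig)
{
	(void)sig;
	write(STDOUT_FILENO, "fault on the guard page\n", 24);
	_exit(0);
}

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	/* One unmapped guard page below one usable "stack" page. */
	char *area = mmap(NULL, 2 * page, PROT_NONE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (area == MAP_FAILED)
		return 1;
	/* Only the upper page becomes writable; the lower stays PROT_NONE. */
	if (mprotect(area + page, page, PROT_READ | PROT_WRITE))
		return 1;

	signal(SIGSEGV, hit_guard);

	char *sp = area + 2 * page;		/* top of the "stack", grows down */
	for (long i = 1; i <= 2 * page; i++)
		sp[-i] = 0;			/* eventually overflows into the guard page */

	puts("overflow went unnoticed");	/* never reached */
	return 0;
}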
 Documentation/x86/kernel-stacks          | 13
 arch/x86/Kconfig                         |  2
 arch/x86/entry/entry_64.S                | 16
 arch/x86/include/asm/cpu_entry_area.h    | 69
 arch/x86/include/asm/debugreg.h          |  2
 arch/x86/include/asm/irq.h               |  6
 arch/x86/include/asm/irq_vectors.h       |  4
 arch/x86/include/asm/page_32_types.h     |  8
 arch/x86/include/asm/page_64_types.h     | 16
 arch/x86/include/asm/processor.h         | 43
 arch/x86/include/asm/smp.h               |  2
 arch/x86/include/asm/stackprotector.h    |  6
 arch/x86/include/asm/stacktrace.h        |  2
 arch/x86/kernel/asm-offsets_64.c         |  4
 arch/x86/kernel/cpu/common.c             | 60
 arch/x86/kernel/dumpstack_32.c           |  8
 arch/x86/kernel/dumpstack_64.c           | 99
 arch/x86/kernel/head_64.S                |  2
 arch/x86/kernel/idt.c                    | 19
 arch/x86/kernel/irq_32.c                 | 41
 arch/x86/kernel/irq_64.c                 | 89
 arch/x86/kernel/irqinit.c                |  4
 arch/x86/kernel/nmi.c                    | 20
 arch/x86/kernel/setup_percpu.c           |  5
 arch/x86/kernel/smpboot.c                | 15
 arch/x86/kernel/vmlinux.lds.S            |  7
 arch/x86/mm/cpu_entry_area.c             | 64
 arch/x86/mm/fault.c                      |  3
 arch/x86/tools/relocs.c                  |  2
 arch/x86/xen/smp_pv.c                    |  4
 arch/x86/xen/xen-head.S                  | 10
 drivers/xen/events/events_base.c         |  1
 mm/slab.c                                | 48
 33 files changed, 377 insertions(+), 317 deletions(-)
diff --git a/Documentation/x86/kernel-stacks b/Documentation/x86/kernel-stacks
index 9a0aa4d3a866..d1bfb0b95ee0 100644
--- a/Documentation/x86/kernel-stacks
+++ b/Documentation/x86/kernel-stacks
@@ -59,7 +59,7 @@ If that assumption is ever broken then the stacks will become corrupt.
59 59
60The currently assigned IST stacks are :- 60The currently assigned IST stacks are :-
61 61
62* DOUBLEFAULT_STACK. EXCEPTION_STKSZ (PAGE_SIZE). 62* ESTACK_DF. EXCEPTION_STKSZ (PAGE_SIZE).
63 63
64 Used for interrupt 8 - Double Fault Exception (#DF). 64 Used for interrupt 8 - Double Fault Exception (#DF).
65 65
@@ -68,7 +68,7 @@ The currently assigned IST stacks are :-
68 Using a separate stack allows the kernel to recover from it well enough 68 Using a separate stack allows the kernel to recover from it well enough
69 in many cases to still output an oops. 69 in many cases to still output an oops.
70 70
71* NMI_STACK. EXCEPTION_STKSZ (PAGE_SIZE). 71* ESTACK_NMI. EXCEPTION_STKSZ (PAGE_SIZE).
72 72
73 Used for non-maskable interrupts (NMI). 73 Used for non-maskable interrupts (NMI).
74 74
@@ -76,7 +76,7 @@ The currently assigned IST stacks are :-
76 middle of switching stacks. Using IST for NMI events avoids making 76 middle of switching stacks. Using IST for NMI events avoids making
77 assumptions about the previous state of the kernel stack. 77 assumptions about the previous state of the kernel stack.
78 78
79* DEBUG_STACK. DEBUG_STKSZ 79* ESTACK_DB. EXCEPTION_STKSZ (PAGE_SIZE).
80 80
81 Used for hardware debug interrupts (interrupt 1) and for software 81 Used for hardware debug interrupts (interrupt 1) and for software
82 debug interrupts (INT3). 82 debug interrupts (INT3).
@@ -86,7 +86,12 @@ The currently assigned IST stacks are :-
86 avoids making assumptions about the previous state of the kernel 86 avoids making assumptions about the previous state of the kernel
87 stack. 87 stack.
88 88
89* MCE_STACK. EXCEPTION_STKSZ (PAGE_SIZE). 89 To handle nested #DB correctly there exist two instances of DB stacks. On
90 #DB entry the IST stackpointer for #DB is switched to the second instance
91 so a nested #DB starts from a clean stack. The nested #DB switches
92 the IST stackpointer to a guard hole to catch triple nesting.
93
94* ESTACK_MCE. EXCEPTION_STKSZ (PAGE_SIZE).
90 95
91 Used for interrupt 18 - Machine Check Exception (#MC). 96 Used for interrupt 18 - Machine Check Exception (#MC).
92 97
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7e59efc70b91..db95da6d644d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -14,6 +14,7 @@ config X86_32
14 select ARCH_WANT_IPC_PARSE_VERSION 14 select ARCH_WANT_IPC_PARSE_VERSION
15 select CLKSRC_I8253 15 select CLKSRC_I8253
16 select CLONE_BACKWARDS 16 select CLONE_BACKWARDS
17 select HAVE_DEBUG_STACKOVERFLOW
17 select MODULES_USE_ELF_REL 18 select MODULES_USE_ELF_REL
18 select OLD_SIGACTION 19 select OLD_SIGACTION
19 20
@@ -138,7 +139,6 @@ config X86
138 select HAVE_COPY_THREAD_TLS 139 select HAVE_COPY_THREAD_TLS
139 select HAVE_C_RECORDMCOUNT 140 select HAVE_C_RECORDMCOUNT
140 select HAVE_DEBUG_KMEMLEAK 141 select HAVE_DEBUG_KMEMLEAK
141 select HAVE_DEBUG_STACKOVERFLOW
142 select HAVE_DMA_CONTIGUOUS 142 select HAVE_DMA_CONTIGUOUS
143 select HAVE_DYNAMIC_FTRACE 143 select HAVE_DYNAMIC_FTRACE
144 select HAVE_DYNAMIC_FTRACE_WITH_REGS 144 select HAVE_DYNAMIC_FTRACE_WITH_REGS
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index e7e270603fe7..20e45d9b4e15 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -298,7 +298,7 @@ ENTRY(__switch_to_asm)
298 298
299#ifdef CONFIG_STACKPROTECTOR 299#ifdef CONFIG_STACKPROTECTOR
300 movq TASK_stack_canary(%rsi), %rbx 300 movq TASK_stack_canary(%rsi), %rbx
301 movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset 301 movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset
302#endif 302#endif
303 303
304#ifdef CONFIG_RETPOLINE 304#ifdef CONFIG_RETPOLINE
@@ -430,8 +430,8 @@ END(irq_entries_start)
430 * it before we actually move ourselves to the IRQ stack. 430 * it before we actually move ourselves to the IRQ stack.
431 */ 431 */
432 432
433 movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) 433 movq \old_rsp, PER_CPU_VAR(irq_stack_backing_store + IRQ_STACK_SIZE - 8)
434 movq PER_CPU_VAR(irq_stack_ptr), %rsp 434 movq PER_CPU_VAR(hardirq_stack_ptr), %rsp
435 435
436#ifdef CONFIG_DEBUG_ENTRY 436#ifdef CONFIG_DEBUG_ENTRY
437 /* 437 /*
@@ -840,7 +840,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
840/* 840/*
841 * Exception entry points. 841 * Exception entry points.
842 */ 842 */
843#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) 843#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)
844 844
845/** 845/**
846 * idtentry - Generate an IDT entry stub 846 * idtentry - Generate an IDT entry stub
@@ -878,7 +878,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
878 * @paranoid == 2 is special: the stub will never switch stacks. This is for 878 * @paranoid == 2 is special: the stub will never switch stacks. This is for
879 * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. 879 * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
880 */ 880 */
881.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 881.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0
882ENTRY(\sym) 882ENTRY(\sym)
883 UNWIND_HINT_IRET_REGS offset=\has_error_code*8 883 UNWIND_HINT_IRET_REGS offset=\has_error_code*8
884 884
@@ -924,13 +924,13 @@ ENTRY(\sym)
924 .endif 924 .endif
925 925
926 .if \shift_ist != -1 926 .if \shift_ist != -1
927 subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) 927 subq $\ist_offset, CPU_TSS_IST(\shift_ist)
928 .endif 928 .endif
929 929
930 call \do_sym 930 call \do_sym
931 931
932 .if \shift_ist != -1 932 .if \shift_ist != -1
933 addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) 933 addq $\ist_offset, CPU_TSS_IST(\shift_ist)
934 .endif 934 .endif
935 935
936 /* these procedures expect "no swapgs" flag in ebx */ 936 /* these procedures expect "no swapgs" flag in ebx */
@@ -1128,7 +1128,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \
1128 hv_stimer0_callback_vector hv_stimer0_vector_handler 1128 hv_stimer0_callback_vector hv_stimer0_vector_handler
1129#endif /* CONFIG_HYPERV */ 1129#endif /* CONFIG_HYPERV */
1130 1130
1131idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK 1131idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET
1132idtentry int3 do_int3 has_error_code=0 1132idtentry int3 do_int3 has_error_code=0
1133idtentry stack_segment do_stack_segment has_error_code=1 1133idtentry stack_segment do_stack_segment has_error_code=1
1134 1134
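For readers unfamiliar with the shift_ist/ist_offset mechanics in the idtentry hunk above, here is a toy userspace model (hypothetical addresses and assumed order-0 stack sizes, not kernel code) of why subtracting DB_STACK_OFFSET from the #DB IST slot on entry makes a nested #DB start on the DB1 stack, while a further nesting would point into the unmapped DB2 hole and fault.

#include <stdio.h>

#define PAGE_SIZE        4096ULL
#define EXCEPTION_STKSZ  PAGE_SIZE	/* order-0 stacks assumed, no KASAN */
/* DB_STACK_OFFSET = offsetof(DB_stack) - offsetof(DB1_stack): one stack plus one guard page */
#define DB_STACK_OFFSET  (EXCEPTION_STKSZ + PAGE_SIZE)

int main(void)
{
	unsigned long long db_top = 0xfffffe0000012000ULL;	/* hypothetical DB stack top */
	unsigned long long ist_db = db_top;			/* models tss.ist[IST_INDEX_DB] */

	ist_db -= DB_STACK_OFFSET;	/* #DB entry: a nested #DB now starts on DB1 */
	printf("after one #DB:   ist[DB] = %#llx (DB1 top)\n", ist_db);

	ist_db -= DB_STACK_OFFSET;	/* nested #DB entry: next level points at the DB2 hole */
	printf("after two #DBs:  ist[DB] = %#llx (unmapped, triple nesting faults)\n", ist_db);

	ist_db += 2 * DB_STACK_OFFSET;	/* each exit adds the offset back, restoring the slot */
	printf("after both exit: ist[DB] = %#llx (back to DB)\n", ist_db);
	return 0;
}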
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 29c706415443..cff3f3f3bfe0 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -7,6 +7,64 @@
7#include <asm/processor.h> 7#include <asm/processor.h>
8#include <asm/intel_ds.h> 8#include <asm/intel_ds.h>
9 9
10#ifdef CONFIG_X86_64
11
12/* Macro to enforce the same ordering and stack sizes */
13#define ESTACKS_MEMBERS(guardsize, db2_holesize)\
14 char DF_stack_guard[guardsize]; \
15 char DF_stack[EXCEPTION_STKSZ]; \
16 char NMI_stack_guard[guardsize]; \
17 char NMI_stack[EXCEPTION_STKSZ]; \
18 char DB2_stack_guard[guardsize]; \
19 char DB2_stack[db2_holesize]; \
20 char DB1_stack_guard[guardsize]; \
21 char DB1_stack[EXCEPTION_STKSZ]; \
22 char DB_stack_guard[guardsize]; \
23 char DB_stack[EXCEPTION_STKSZ]; \
24 char MCE_stack_guard[guardsize]; \
25 char MCE_stack[EXCEPTION_STKSZ]; \
26 char IST_top_guard[guardsize]; \
27
28/* The exception stacks' physical storage. No guard pages required */
29struct exception_stacks {
30 ESTACKS_MEMBERS(0, 0)
31};
32
33/* The effective cpu entry area mapping with guard pages. */
34struct cea_exception_stacks {
35 ESTACKS_MEMBERS(PAGE_SIZE, EXCEPTION_STKSZ)
36};
37
38/*
39 * The exception stack ordering in [cea_]exception_stacks
40 */
41enum exception_stack_ordering {
42 ESTACK_DF,
43 ESTACK_NMI,
44 ESTACK_DB2,
45 ESTACK_DB1,
46 ESTACK_DB,
47 ESTACK_MCE,
48 N_EXCEPTION_STACKS
49};
50
51#define CEA_ESTACK_SIZE(st) \
52 sizeof(((struct cea_exception_stacks *)0)->st## _stack)
53
54#define CEA_ESTACK_BOT(ceastp, st) \
55 ((unsigned long)&(ceastp)->st## _stack)
56
57#define CEA_ESTACK_TOP(ceastp, st) \
58 (CEA_ESTACK_BOT(ceastp, st) + CEA_ESTACK_SIZE(st))
59
60#define CEA_ESTACK_OFFS(st) \
61 offsetof(struct cea_exception_stacks, st## _stack)
62
63#define CEA_ESTACK_PAGES \
64 (sizeof(struct cea_exception_stacks) / PAGE_SIZE)
65
66#endif
67
10/* 68/*
11 * cpu_entry_area is a percpu region that contains things needed by the CPU 69 * cpu_entry_area is a percpu region that contains things needed by the CPU
12 * and early entry/exit code. Real types aren't used for all fields here 70 * and early entry/exit code. Real types aren't used for all fields here
@@ -32,12 +90,9 @@ struct cpu_entry_area {
32 90
33#ifdef CONFIG_X86_64 91#ifdef CONFIG_X86_64
34 /* 92 /*
35 * Exception stacks used for IST entries. 93 * Exception stacks used for IST entries with guard pages.
36 *
37 * In the future, this should have a separate slot for each stack
38 * with guard pages between them.
39 */ 94 */
40 char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; 95 struct cea_exception_stacks estacks;
41#endif 96#endif
42#ifdef CONFIG_CPU_SUP_INTEL 97#ifdef CONFIG_CPU_SUP_INTEL
43 /* 98 /*
@@ -57,6 +112,7 @@ struct cpu_entry_area {
57#define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) 112#define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS)
58 113
59DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); 114DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
115DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks);
60 116
61extern void setup_cpu_entry_areas(void); 117extern void setup_cpu_entry_areas(void);
62extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); 118extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
@@ -76,4 +132,7 @@ static inline struct entry_stack *cpu_entry_stack(int cpu)
76 return &get_cpu_entry_area(cpu)->entry_stack_page.stack; 132 return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
77} 133}
78 134
135#define __this_cpu_ist_top_va(name) \
136 CEA_ESTACK_TOP(__this_cpu_read(cea_exception_stacks), name)
137
79#endif 138#endif
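As a standalone sketch of how the CEA_ESTACK_* token-pasting macros resolve, the snippet below builds a userspace model of the stack layout with assumed 4 KiB pages (the real sizes depend on KASAN_STACK_ORDER) and applies the same sizeof/offsetof trick; it is illustrative only, not the kernel header.

#include <stddef.h>
#include <stdio.h>

#define PAGE_SIZE        4096UL		/* assumed; the guard is one page */
#define EXCEPTION_STKSZ  PAGE_SIZE	/* assumed order-0 stacks (no KASAN) */

/* Userspace model shaped like struct cea_exception_stacks */
struct cea_stacks_model {
	char DF_stack_guard[PAGE_SIZE];   char DF_stack[EXCEPTION_STKSZ];
	char NMI_stack_guard[PAGE_SIZE];  char NMI_stack[EXCEPTION_STKSZ];
	char DB2_stack_guard[PAGE_SIZE];  char DB2_stack[EXCEPTION_STKSZ];
	char DB1_stack_guard[PAGE_SIZE];  char DB1_stack[EXCEPTION_STKSZ];
	char DB_stack_guard[PAGE_SIZE];   char DB_stack[EXCEPTION_STKSZ];
	char MCE_stack_guard[PAGE_SIZE];  char MCE_stack[EXCEPTION_STKSZ];
	char IST_top_guard[PAGE_SIZE];
};

/* Same token-pasting idea as CEA_ESTACK_SIZE / CEA_ESTACK_OFFS in the patch */
#define MODEL_SIZE(st)	sizeof(((struct cea_stacks_model *)0)->st##_stack)
#define MODEL_OFFS(st)	offsetof(struct cea_stacks_model, st##_stack)

int main(void)
{
	printf("DB1: offset %zu, size %zu\n", MODEL_OFFS(DB1), MODEL_SIZE(DB1));
	printf("DB:  offset %zu, size %zu\n", MODEL_OFFS(DB),  MODEL_SIZE(DB));
	printf("total: %zu pages\n",
	       (size_t)(sizeof(struct cea_stacks_model) / PAGE_SIZE));
	return 0;
}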
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 9e5ca30738e5..1a8609a15856 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -104,11 +104,9 @@ static inline void debug_stack_usage_dec(void)
104{ 104{
105 __this_cpu_dec(debug_stack_usage); 105 __this_cpu_dec(debug_stack_usage);
106} 106}
107int is_debug_stack(unsigned long addr);
108void debug_stack_set_zero(void); 107void debug_stack_set_zero(void);
109void debug_stack_reset(void); 108void debug_stack_reset(void);
110#else /* !X86_64 */ 109#else /* !X86_64 */
111static inline int is_debug_stack(unsigned long addr) { return 0; }
112static inline void debug_stack_set_zero(void) { } 110static inline void debug_stack_set_zero(void) { }
113static inline void debug_stack_reset(void) { } 111static inline void debug_stack_reset(void) { }
114static inline void debug_stack_usage_inc(void) { } 112static inline void debug_stack_usage_inc(void) { }
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index fbb16e6b6c18..8f95686ec27e 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -16,11 +16,7 @@ static inline int irq_canonicalize(int irq)
16 return ((irq == 2) ? 9 : irq); 16 return ((irq == 2) ? 9 : irq);
17} 17}
18 18
19#ifdef CONFIG_X86_32 19extern int irq_init_percpu_irqstack(unsigned int cpu);
20extern void irq_ctx_init(int cpu);
21#else
22# define irq_ctx_init(cpu) do { } while (0)
23#endif
24 20
25#define __ARCH_HAS_DO_SOFTIRQ 21#define __ARCH_HAS_DO_SOFTIRQ
26 22
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 548d90bbf919..889f8b1b5b7f 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -18,8 +18,8 @@
18 * Vectors 0 ... 31 : system traps and exceptions - hardcoded events 18 * Vectors 0 ... 31 : system traps and exceptions - hardcoded events
19 * Vectors 32 ... 127 : device interrupts 19 * Vectors 32 ... 127 : device interrupts
20 * Vector 128 : legacy int80 syscall interface 20 * Vector 128 : legacy int80 syscall interface
21 * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts 21 * Vectors 129 ... LOCAL_TIMER_VECTOR-1
22 * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts 22 * Vectors LOCAL_TIMER_VECTOR ... 255 : special interrupts
23 * 23 *
24 * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. 24 * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
25 * 25 *
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index 0d5c739eebd7..565ad755c785 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -22,11 +22,9 @@
22#define THREAD_SIZE_ORDER 1 22#define THREAD_SIZE_ORDER 1
23#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) 23#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
24 24
25#define DOUBLEFAULT_STACK 1 25#define IRQ_STACK_SIZE THREAD_SIZE
26#define NMI_STACK 0 26
27#define DEBUG_STACK 0 27#define N_EXCEPTION_STACKS 1
28#define MCE_STACK 0
29#define N_EXCEPTION_STACKS 1
30 28
31#ifdef CONFIG_X86_PAE 29#ifdef CONFIG_X86_PAE
32/* 30/*
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 8f657286d599..793c14c372cb 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -14,22 +14,20 @@
14 14
15#define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER) 15#define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER)
16#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) 16#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
17#define CURRENT_MASK (~(THREAD_SIZE - 1))
18 17
19#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER) 18#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER)
20#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) 19#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
21 20
22#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
23#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
24
25#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER) 21#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER)
26#define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) 22#define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
27 23
28#define DOUBLEFAULT_STACK 1 24/*
29#define NMI_STACK 2 25 * The index for the tss.ist[] array. The hardware limit is 7 entries.
30#define DEBUG_STACK 3 26 */
31#define MCE_STACK 4 27#define IST_INDEX_DF 0
32#define N_EXCEPTION_STACKS 4 /* hw limit: 7 */ 28#define IST_INDEX_NMI 1
29#define IST_INDEX_DB 2
30#define IST_INDEX_MCE 3
33 31
34/* 32/*
35 * Set __PAGE_OFFSET to the most negative possible address + 33 * Set __PAGE_OFFSET to the most negative possible address +
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 2bb3a648fc12..7e99ef67bff0 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -367,6 +367,13 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
367#define __KERNEL_TSS_LIMIT \ 367#define __KERNEL_TSS_LIMIT \
368 (IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1) 368 (IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1)
369 369
370/* Per CPU interrupt stacks */
371struct irq_stack {
372 char stack[IRQ_STACK_SIZE];
373} __aligned(IRQ_STACK_SIZE);
374
375DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
376
370#ifdef CONFIG_X86_32 377#ifdef CONFIG_X86_32
371DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); 378DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
372#else 379#else
@@ -374,38 +381,25 @@ DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
374#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1 381#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
375#endif 382#endif
376 383
377/*
378 * Save the original ist values for checking stack pointers during debugging
379 */
380struct orig_ist {
381 unsigned long ist[7];
382};
383
384#ifdef CONFIG_X86_64 384#ifdef CONFIG_X86_64
385DECLARE_PER_CPU(struct orig_ist, orig_ist); 385struct fixed_percpu_data {
386
387union irq_stack_union {
388 char irq_stack[IRQ_STACK_SIZE];
389 /* 386 /*
390 * GCC hardcodes the stack canary as %gs:40. Since the 387 * GCC hardcodes the stack canary as %gs:40. Since the
391 * irq_stack is the object at %gs:0, we reserve the bottom 388 * irq_stack is the object at %gs:0, we reserve the bottom
392 * 48 bytes of the irq stack for the canary. 389 * 48 bytes of the irq stack for the canary.
393 */ 390 */
394 struct { 391 char gs_base[40];
395 char gs_base[40]; 392 unsigned long stack_canary;
396 unsigned long stack_canary;
397 };
398}; 393};
399 394
400DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; 395DECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible;
401DECLARE_INIT_PER_CPU(irq_stack_union); 396DECLARE_INIT_PER_CPU(fixed_percpu_data);
402 397
403static inline unsigned long cpu_kernelmode_gs_base(int cpu) 398static inline unsigned long cpu_kernelmode_gs_base(int cpu)
404{ 399{
405 return (unsigned long)per_cpu(irq_stack_union.gs_base, cpu); 400 return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu);
406} 401}
407 402
408DECLARE_PER_CPU(char *, irq_stack_ptr);
409DECLARE_PER_CPU(unsigned int, irq_count); 403DECLARE_PER_CPU(unsigned int, irq_count);
410extern asmlinkage void ignore_sysret(void); 404extern asmlinkage void ignore_sysret(void);
411 405
@@ -427,15 +421,8 @@ struct stack_canary {
427}; 421};
428DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); 422DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
429#endif 423#endif
430/* 424/* Per CPU softirq stack pointer */
431 * per-CPU IRQ handling stacks 425DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr);
432 */
433struct irq_stack {
434 u32 stack[THREAD_SIZE/sizeof(u32)];
435} __aligned(THREAD_SIZE);
436
437DECLARE_PER_CPU(struct irq_stack *, hardirq_stack);
438DECLARE_PER_CPU(struct irq_stack *, softirq_stack);
439#endif /* X86_64 */ 426#endif /* X86_64 */
440 427
441extern unsigned int fpu_kernel_xstate_size; 428extern unsigned int fpu_kernel_xstate_size;
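The layout constraint mentioned in the comment above ("GCC hardcodes the stack canary as %gs:40") can be checked with a trivial standalone snippet; the struct below is a userspace stand-in for fixed_percpu_data with the same two members, not the kernel definition.

#include <assert.h>
#include <stddef.h>

/* Userspace stand-in for struct fixed_percpu_data */
struct fixed_percpu_data_model {
	char gs_base[40];
	unsigned long stack_canary;
};

/* The canary must stay at offset 40 because GCC emits %gs:40 directly */
static_assert(offsetof(struct fixed_percpu_data_model, stack_canary) == 40,
	      "stack canary is expected at offset 40");

int main(void)
{
	return 0;
}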
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 2e95b6c1bca3..da545df207b2 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -131,7 +131,7 @@ void native_smp_prepare_boot_cpu(void);
131void native_smp_prepare_cpus(unsigned int max_cpus); 131void native_smp_prepare_cpus(unsigned int max_cpus);
132void calculate_max_logical_packages(void); 132void calculate_max_logical_packages(void);
133void native_smp_cpus_done(unsigned int max_cpus); 133void native_smp_cpus_done(unsigned int max_cpus);
134void common_cpu_up(unsigned int cpunum, struct task_struct *tidle); 134int common_cpu_up(unsigned int cpunum, struct task_struct *tidle);
135int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); 135int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
136int native_cpu_disable(void); 136int native_cpu_disable(void);
137int common_cpu_die(unsigned int cpu); 137int common_cpu_die(unsigned int cpu);
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 8ec97a62c245..91e29b6a86a5 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -13,7 +13,7 @@
13 * On x86_64, %gs is shared by percpu area and stack canary. All 13 * On x86_64, %gs is shared by percpu area and stack canary. All
14 * percpu symbols are zero based and %gs points to the base of percpu 14 * percpu symbols are zero based and %gs points to the base of percpu
15 * area. The first occupant of the percpu area is always 15 * area. The first occupant of the percpu area is always
16 * irq_stack_union which contains stack_canary at offset 40. Userland 16 * fixed_percpu_data which contains stack_canary at offset 40. Userland
17 * %gs is always saved and restored on kernel entry and exit using 17 * %gs is always saved and restored on kernel entry and exit using
18 * swapgs, so stack protector doesn't add any complexity there. 18 * swapgs, so stack protector doesn't add any complexity there.
19 * 19 *
@@ -64,7 +64,7 @@ static __always_inline void boot_init_stack_canary(void)
64 u64 tsc; 64 u64 tsc;
65 65
66#ifdef CONFIG_X86_64 66#ifdef CONFIG_X86_64
67 BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40); 67 BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40);
68#endif 68#endif
69 /* 69 /*
70 * We both use the random pool and the current TSC as a source 70 * We both use the random pool and the current TSC as a source
@@ -79,7 +79,7 @@ static __always_inline void boot_init_stack_canary(void)
79 79
80 current->stack_canary = canary; 80 current->stack_canary = canary;
81#ifdef CONFIG_X86_64 81#ifdef CONFIG_X86_64
82 this_cpu_write(irq_stack_union.stack_canary, canary); 82 this_cpu_write(fixed_percpu_data.stack_canary, canary);
83#else 83#else
84 this_cpu_write(stack_canary.canary, canary); 84 this_cpu_write(stack_canary.canary, canary);
85#endif 85#endif
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index beef7ad9e43a..a8d0cdf48616 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -9,6 +9,8 @@
9 9
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/ptrace.h> 11#include <linux/ptrace.h>
12
13#include <asm/cpu_entry_area.h>
12#include <asm/switch_to.h> 14#include <asm/switch_to.h>
13 15
14enum stack_type { 16enum stack_type {
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index ddced33184b5..d3d075226c0a 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -68,10 +68,12 @@ int main(void)
68#undef ENTRY 68#undef ENTRY
69 69
70 OFFSET(TSS_ist, tss_struct, x86_tss.ist); 70 OFFSET(TSS_ist, tss_struct, x86_tss.ist);
71 DEFINE(DB_STACK_OFFSET, offsetof(struct cea_exception_stacks, DB_stack) -
72 offsetof(struct cea_exception_stacks, DB1_stack));
71 BLANK(); 73 BLANK();
72 74
73#ifdef CONFIG_STACKPROTECTOR 75#ifdef CONFIG_STACKPROTECTOR
74 DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary)); 76 DEFINE(stack_canary_offset, offsetof(struct fixed_percpu_data, stack_canary));
75 BLANK(); 77 BLANK();
76#endif 78#endif
77 79
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 95a5faf3a6a0..37f7d438a6ef 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -507,19 +507,6 @@ void load_percpu_segment(int cpu)
507DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); 507DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
508#endif 508#endif
509 509
510#ifdef CONFIG_X86_64
511/*
512 * Special IST stacks which the CPU switches to when it calls
513 * an IST-marked descriptor entry. Up to 7 stacks (hardware
514 * limit), all of them are 4K, except the debug stack which
515 * is 8K.
516 */
517static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
518 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
519 [DEBUG_STACK - 1] = DEBUG_STKSZ
520};
521#endif
522
523/* Load the original GDT from the per-cpu structure */ 510/* Load the original GDT from the per-cpu structure */
524void load_direct_gdt(int cpu) 511void load_direct_gdt(int cpu)
525{ 512{
@@ -1511,9 +1498,9 @@ static __init int setup_clearcpuid(char *arg)
1511__setup("clearcpuid=", setup_clearcpuid); 1498__setup("clearcpuid=", setup_clearcpuid);
1512 1499
1513#ifdef CONFIG_X86_64 1500#ifdef CONFIG_X86_64
1514DEFINE_PER_CPU_FIRST(union irq_stack_union, 1501DEFINE_PER_CPU_FIRST(struct fixed_percpu_data,
1515 irq_stack_union) __aligned(PAGE_SIZE) __visible; 1502 fixed_percpu_data) __aligned(PAGE_SIZE) __visible;
1516EXPORT_PER_CPU_SYMBOL_GPL(irq_stack_union); 1503EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data);
1517 1504
1518/* 1505/*
1519 * The following percpu variables are hot. Align current_task to 1506 * The following percpu variables are hot. Align current_task to
@@ -1523,9 +1510,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
1523 &init_task; 1510 &init_task;
1524EXPORT_PER_CPU_SYMBOL(current_task); 1511EXPORT_PER_CPU_SYMBOL(current_task);
1525 1512
1526DEFINE_PER_CPU(char *, irq_stack_ptr) = 1513DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
1527 init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE;
1528
1529DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; 1514DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
1530 1515
1531DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; 1516DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
@@ -1562,23 +1547,7 @@ void syscall_init(void)
1562 X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); 1547 X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
1563} 1548}
1564 1549
1565/*
1566 * Copies of the original ist values from the tss are only accessed during
1567 * debugging, no special alignment required.
1568 */
1569DEFINE_PER_CPU(struct orig_ist, orig_ist);
1570
1571static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
1572DEFINE_PER_CPU(int, debug_stack_usage); 1550DEFINE_PER_CPU(int, debug_stack_usage);
1573
1574int is_debug_stack(unsigned long addr)
1575{
1576 return __this_cpu_read(debug_stack_usage) ||
1577 (addr <= __this_cpu_read(debug_stack_addr) &&
1578 addr > (__this_cpu_read(debug_stack_addr) - DEBUG_STKSZ));
1579}
1580NOKPROBE_SYMBOL(is_debug_stack);
1581
1582DEFINE_PER_CPU(u32, debug_idt_ctr); 1551DEFINE_PER_CPU(u32, debug_idt_ctr);
1583 1552
1584void debug_stack_set_zero(void) 1553void debug_stack_set_zero(void)
@@ -1690,17 +1659,14 @@ static void setup_getcpu(int cpu)
1690 * initialized (naturally) in the bootstrap process, such as the GDT 1659 * initialized (naturally) in the bootstrap process, such as the GDT
1691 * and IDT. We reload them nevertheless, this function acts as a 1660 * and IDT. We reload them nevertheless, this function acts as a
1692 * 'CPU state barrier', nothing should get across. 1661 * 'CPU state barrier', nothing should get across.
1693 * A lot of state is already set up in PDA init for 64 bit
1694 */ 1662 */
1695#ifdef CONFIG_X86_64 1663#ifdef CONFIG_X86_64
1696 1664
1697void cpu_init(void) 1665void cpu_init(void)
1698{ 1666{
1699 struct orig_ist *oist; 1667 int cpu = raw_smp_processor_id();
1700 struct task_struct *me; 1668 struct task_struct *me;
1701 struct tss_struct *t; 1669 struct tss_struct *t;
1702 unsigned long v;
1703 int cpu = raw_smp_processor_id();
1704 int i; 1670 int i;
1705 1671
1706 wait_for_master_cpu(cpu); 1672 wait_for_master_cpu(cpu);
@@ -1715,7 +1681,6 @@ void cpu_init(void)
1715 load_ucode_ap(); 1681 load_ucode_ap();
1716 1682
1717 t = &per_cpu(cpu_tss_rw, cpu); 1683 t = &per_cpu(cpu_tss_rw, cpu);
1718 oist = &per_cpu(orig_ist, cpu);
1719 1684
1720#ifdef CONFIG_NUMA 1685#ifdef CONFIG_NUMA
1721 if (this_cpu_read(numa_node) == 0 && 1686 if (this_cpu_read(numa_node) == 0 &&
@@ -1753,16 +1718,11 @@ void cpu_init(void)
1753 /* 1718 /*
1754 * set up and load the per-CPU TSS 1719 * set up and load the per-CPU TSS
1755 */ 1720 */
1756 if (!oist->ist[0]) { 1721 if (!t->x86_tss.ist[0]) {
1757 char *estacks = get_cpu_entry_area(cpu)->exception_stacks; 1722 t->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF);
1758 1723 t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
1759 for (v = 0; v < N_EXCEPTION_STACKS; v++) { 1724 t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
1760 estacks += exception_stack_sizes[v]; 1725 t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
1761 oist->ist[v] = t->x86_tss.ist[v] =
1762 (unsigned long)estacks;
1763 if (v == DEBUG_STACK-1)
1764 per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
1765 }
1766 } 1726 }
1767 1727
1768 t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; 1728 t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index cd53f3030e40..64a59d726639 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -34,14 +34,14 @@ const char *stack_type_name(enum stack_type type)
34 34
35static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) 35static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
36{ 36{
37 unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack); 37 unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
38 unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); 38 unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
39 39
40 /* 40 /*
41 * This is a software stack, so 'end' can be a valid stack pointer. 41 * This is a software stack, so 'end' can be a valid stack pointer.
42 * It just means the stack is empty. 42 * It just means the stack is empty.
43 */ 43 */
44 if (stack <= begin || stack > end) 44 if (stack < begin || stack > end)
45 return false; 45 return false;
46 46
47 info->type = STACK_TYPE_IRQ; 47 info->type = STACK_TYPE_IRQ;
@@ -59,14 +59,14 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
59 59
60static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) 60static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
61{ 61{
62 unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack); 62 unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr);
63 unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); 63 unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
64 64
65 /* 65 /*
66 * This is a software stack, so 'end' can be a valid stack pointer. 66 * This is a software stack, so 'end' can be a valid stack pointer.
67 * It just means the stack is empty. 67 * It just means the stack is empty.
68 */ 68 */
69 if (stack <= begin || stack > end) 69 if (stack < begin || stack > end)
70 return false; 70 return false;
71 71
72 info->type = STACK_TYPE_SOFTIRQ; 72 info->type = STACK_TYPE_SOFTIRQ;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 5cdb9e84da57..753b8cfe8b8a 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -16,23 +16,21 @@
16#include <linux/bug.h> 16#include <linux/bug.h>
17#include <linux/nmi.h> 17#include <linux/nmi.h>
18 18
19#include <asm/cpu_entry_area.h>
19#include <asm/stacktrace.h> 20#include <asm/stacktrace.h>
20 21
21static char *exception_stack_names[N_EXCEPTION_STACKS] = { 22static const char * const exception_stack_names[] = {
22 [ DOUBLEFAULT_STACK-1 ] = "#DF", 23 [ ESTACK_DF ] = "#DF",
23 [ NMI_STACK-1 ] = "NMI", 24 [ ESTACK_NMI ] = "NMI",
24 [ DEBUG_STACK-1 ] = "#DB", 25 [ ESTACK_DB2 ] = "#DB2",
25 [ MCE_STACK-1 ] = "#MC", 26 [ ESTACK_DB1 ] = "#DB1",
26}; 27 [ ESTACK_DB ] = "#DB",
27 28 [ ESTACK_MCE ] = "#MC",
28static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = {
29 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
30 [DEBUG_STACK - 1] = DEBUG_STKSZ
31}; 29};
32 30
33const char *stack_type_name(enum stack_type type) 31const char *stack_type_name(enum stack_type type)
34{ 32{
35 BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); 33 BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
36 34
37 if (type == STACK_TYPE_IRQ) 35 if (type == STACK_TYPE_IRQ)
38 return "IRQ"; 36 return "IRQ";
@@ -52,43 +50,84 @@ const char *stack_type_name(enum stack_type type)
52 return NULL; 50 return NULL;
53} 51}
54 52
53/**
54 * struct estack_pages - Page descriptor for exception stacks
55 * @offs: Offset from the start of the exception stack area
56 * @size: Size of the exception stack
57 * @type: Type to store in the stack_info struct
58 */
59struct estack_pages {
60 u32 offs;
61 u16 size;
62 u16 type;
63};
64
65#define EPAGERANGE(st) \
66 [PFN_DOWN(CEA_ESTACK_OFFS(st)) ... \
67 PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = { \
68 .offs = CEA_ESTACK_OFFS(st), \
69 .size = CEA_ESTACK_SIZE(st), \
70 .type = STACK_TYPE_EXCEPTION + ESTACK_ ##st, }
71
72/*
73 * Array of exception stack page descriptors. If the stack is larger than
74 * PAGE_SIZE, all pages covering a particular stack will have the same
75 * info. The guard pages including the not mapped DB2 stack are zeroed
76 * out.
77 */
78static const
79struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = {
80 EPAGERANGE(DF),
81 EPAGERANGE(NMI),
82 EPAGERANGE(DB1),
83 EPAGERANGE(DB),
84 EPAGERANGE(MCE),
85};
86
55static bool in_exception_stack(unsigned long *stack, struct stack_info *info) 87static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
56{ 88{
57 unsigned long *begin, *end; 89 unsigned long begin, end, stk = (unsigned long)stack;
90 const struct estack_pages *ep;
58 struct pt_regs *regs; 91 struct pt_regs *regs;
59 unsigned k; 92 unsigned int k;
60 93
61 BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); 94 BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
62 95
63 for (k = 0; k < N_EXCEPTION_STACKS; k++) { 96 begin = (unsigned long)__this_cpu_read(cea_exception_stacks);
64 end = (unsigned long *)raw_cpu_ptr(&orig_ist)->ist[k]; 97 end = begin + sizeof(struct cea_exception_stacks);
65 begin = end - (exception_stack_sizes[k] / sizeof(long)); 98 /* Bail if @stack is outside the exception stack area. */
66 regs = (struct pt_regs *)end - 1; 99 if (stk < begin || stk >= end)
67 100 return false;
68 if (stack <= begin || stack >= end)
69 continue;
70 101
71 info->type = STACK_TYPE_EXCEPTION + k; 102 /* Calc page offset from start of exception stacks */
72 info->begin = begin; 103 k = (stk - begin) >> PAGE_SHIFT;
73 info->end = end; 104 /* Lookup the page descriptor */
74 info->next_sp = (unsigned long *)regs->sp; 105 ep = &estack_pages[k];
106 /* Guard page? */
107 if (!ep->size)
108 return false;
75 109
76 return true; 110 begin += (unsigned long)ep->offs;
77 } 111 end = begin + (unsigned long)ep->size;
112 regs = (struct pt_regs *)end - 1;
78 113
79 return false; 114 info->type = ep->type;
115 info->begin = (unsigned long *)begin;
116 info->end = (unsigned long *)end;
117 info->next_sp = (unsigned long *)regs->sp;
118 return true;
80} 119}
81 120
82static bool in_irq_stack(unsigned long *stack, struct stack_info *info) 121static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
83{ 122{
84 unsigned long *end = (unsigned long *)this_cpu_read(irq_stack_ptr); 123 unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
85 unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); 124 unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long));
86 125
87 /* 126 /*
88 * This is a software stack, so 'end' can be a valid stack pointer. 127 * This is a software stack, so 'end' can be a valid stack pointer.
89 * It just means the stack is empty. 128 * It just means the stack is empty.
90 */ 129 */
91 if (stack <= begin || stack > end) 130 if (stack < begin || stack >= end)
92 return false; 131 return false;
93 132
94 info->type = STACK_TYPE_IRQ; 133 info->type = STACK_TYPE_IRQ;
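The estack_pages lookup added above replaces the old per-stack loop with one shift and one table read. Below is a self-contained userspace sketch of the same idea, with made-up sizes, a toy five-page area and only two "stacks"; names and values are illustrative, not the kernel's layout.

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

struct estack_page_model {
	unsigned int offs;
	unsigned short size;
	unsigned short type;
};

/* Toy area of 5 pages: [guard][DF][guard][NMI][guard].
 * Guard pages keep size == 0, like the zeroed entries in estack_pages. */
static const struct estack_page_model pages[5] = {
	[1] = { .offs = 1 * PAGE_SIZE, .size = PAGE_SIZE, .type = 1 },	/* "DF"  */
	[3] = { .offs = 3 * PAGE_SIZE, .size = PAGE_SIZE, .type = 2 },	/* "NMI" */
};

static bool lookup(unsigned long base, unsigned long addr)
{
	unsigned long k = (addr - base) >> PAGE_SHIFT;	/* page index, one shift */
	const struct estack_page_model *ep;

	if (k >= sizeof(pages) / sizeof(pages[0]))	/* outside the whole area */
		return false;
	ep = &pages[k];
	if (!ep->size)					/* landed on a guard page */
		return false;
	printf("%#lx -> type %u, stack [%#lx, %#lx)\n", addr, (unsigned)ep->type,
	       base + ep->offs, base + ep->offs + ep->size);
	return true;
}

int main(void)
{
	unsigned long base = 0x100000UL;	/* hypothetical start of the area */

	lookup(base, base + 1 * PAGE_SIZE + 64);	/* inside "DF" -> found    */
	lookup(base, base + 2 * PAGE_SIZE + 8);		/* guard page  -> rejected */
	return 0;
}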
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index d1dbe8e4eb82..bcd206c8ac90 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -265,7 +265,7 @@ ENDPROC(start_cpu0)
265 GLOBAL(initial_code) 265 GLOBAL(initial_code)
266 .quad x86_64_start_kernel 266 .quad x86_64_start_kernel
267 GLOBAL(initial_gs) 267 GLOBAL(initial_gs)
268 .quad INIT_PER_CPU_VAR(irq_stack_union) 268 .quad INIT_PER_CPU_VAR(fixed_percpu_data)
269 GLOBAL(initial_stack) 269 GLOBAL(initial_stack)
270 /* 270 /*
271 * The SIZEOF_PTREGS gap is a convention which helps the in-kernel 271 * The SIZEOF_PTREGS gap is a convention which helps the in-kernel
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 01adea278a71..6d8917875f44 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -41,13 +41,12 @@ struct idt_data {
41#define SYSG(_vector, _addr) \ 41#define SYSG(_vector, _addr) \
42 G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS) 42 G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS)
43 43
44/* Interrupt gate with interrupt stack */ 44/*
45 * Interrupt gate with interrupt stack. The _ist index is the index in
46 * the tss.ist[] array, but for the descriptor it needs to start at 1.
47 */
45#define ISTG(_vector, _addr, _ist) \ 48#define ISTG(_vector, _addr, _ist) \
46 G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS) 49 G(_vector, _addr, _ist + 1, GATE_INTERRUPT, DPL0, __KERNEL_CS)
47
48/* System interrupt gate with interrupt stack */
49#define SISTG(_vector, _addr, _ist) \
50 G(_vector, _addr, _ist, GATE_INTERRUPT, DPL3, __KERNEL_CS)
51 50
52/* Task gate */ 51/* Task gate */
53#define TSKG(_vector, _gdt) \ 52#define TSKG(_vector, _gdt) \
@@ -184,11 +183,11 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
184 * cpu_init() when the TSS has been initialized. 183 * cpu_init() when the TSS has been initialized.
185 */ 184 */
186static const __initconst struct idt_data ist_idts[] = { 185static const __initconst struct idt_data ist_idts[] = {
187 ISTG(X86_TRAP_DB, debug, DEBUG_STACK), 186 ISTG(X86_TRAP_DB, debug, IST_INDEX_DB),
188 ISTG(X86_TRAP_NMI, nmi, NMI_STACK), 187 ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI),
189 ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK), 188 ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF),
190#ifdef CONFIG_X86_MCE 189#ifdef CONFIG_X86_MCE
191 ISTG(X86_TRAP_MC, &machine_check, MCE_STACK), 190 ISTG(X86_TRAP_MC, &machine_check, IST_INDEX_MCE),
192#endif 191#endif
193}; 192};
194 193
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 95600a99ae93..fc34816c6f04 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -51,8 +51,8 @@ static inline int check_stack_overflow(void) { return 0; }
51static inline void print_stack_overflow(void) { } 51static inline void print_stack_overflow(void) { }
52#endif 52#endif
53 53
54DEFINE_PER_CPU(struct irq_stack *, hardirq_stack); 54DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
55DEFINE_PER_CPU(struct irq_stack *, softirq_stack); 55DEFINE_PER_CPU(struct irq_stack *, softirq_stack_ptr);
56 56
57static void call_on_stack(void *func, void *stack) 57static void call_on_stack(void *func, void *stack)
58{ 58{
@@ -76,7 +76,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
76 u32 *isp, *prev_esp, arg1; 76 u32 *isp, *prev_esp, arg1;
77 77
78 curstk = (struct irq_stack *) current_stack(); 78 curstk = (struct irq_stack *) current_stack();
79 irqstk = __this_cpu_read(hardirq_stack); 79 irqstk = __this_cpu_read(hardirq_stack_ptr);
80 80
81 /* 81 /*
82 * this is where we switch to the IRQ stack. However, if we are 82 * this is where we switch to the IRQ stack. However, if we are
@@ -107,27 +107,28 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
107} 107}
108 108
109/* 109/*
110 * allocate per-cpu stacks for hardirq and for softirq processing 110 * Allocate per-cpu stacks for hardirq and softirq processing
111 */ 111 */
112void irq_ctx_init(int cpu) 112int irq_init_percpu_irqstack(unsigned int cpu)
113{ 113{
114 struct irq_stack *irqstk; 114 int node = cpu_to_node(cpu);
115 115 struct page *ph, *ps;
116 if (per_cpu(hardirq_stack, cpu))
117 return;
118 116
119 irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), 117 if (per_cpu(hardirq_stack_ptr, cpu))
120 THREADINFO_GFP, 118 return 0;
121 THREAD_SIZE_ORDER));
122 per_cpu(hardirq_stack, cpu) = irqstk;
123 119
124 irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), 120 ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
125 THREADINFO_GFP, 121 if (!ph)
126 THREAD_SIZE_ORDER)); 122 return -ENOMEM;
127 per_cpu(softirq_stack, cpu) = irqstk; 123 ps = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
124 if (!ps) {
125 __free_pages(ph, THREAD_SIZE_ORDER);
126 return -ENOMEM;
127 }
128 128
129 printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", 129 per_cpu(hardirq_stack_ptr, cpu) = page_address(ph);
130 cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu)); 130 per_cpu(softirq_stack_ptr, cpu) = page_address(ps);
131 return 0;
131} 132}
132 133
133void do_softirq_own_stack(void) 134void do_softirq_own_stack(void)
@@ -135,7 +136,7 @@ void do_softirq_own_stack(void)
135 struct irq_stack *irqstk; 136 struct irq_stack *irqstk;
136 u32 *isp, *prev_esp; 137 u32 *isp, *prev_esp;
137 138
138 irqstk = __this_cpu_read(softirq_stack); 139 irqstk = __this_cpu_read(softirq_stack_ptr);
139 140
140 /* build the stack frame on the softirq stack */ 141 /* build the stack frame on the softirq stack */
141 isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); 142 isp = (u32 *) ((char *)irqstk + sizeof(*irqstk));
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 0469cd078db1..6bf6517a05bb 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,63 +18,64 @@
18#include <linux/uaccess.h> 18#include <linux/uaccess.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sched/task_stack.h> 20#include <linux/sched/task_stack.h>
21
22#include <asm/cpu_entry_area.h>
21#include <asm/io_apic.h> 23#include <asm/io_apic.h>
22#include <asm/apic.h> 24#include <asm/apic.h>
23 25
24int sysctl_panic_on_stackoverflow; 26DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible;
27DECLARE_INIT_PER_CPU(irq_stack_backing_store);
25 28
26/* 29bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
27 * Probabilistic stack overflow check:
28 *
29 * Only check the stack in process context, because everything else
30 * runs on the big interrupt stacks. Checking reliably is too expensive,
31 * so we just check from interrupts.
32 */
33static inline void stack_overflow_check(struct pt_regs *regs)
34{ 30{
35#ifdef CONFIG_DEBUG_STACKOVERFLOW 31 if (IS_ERR_OR_NULL(desc))
36#define STACK_TOP_MARGIN 128 32 return false;
37 struct orig_ist *oist;
38 u64 irq_stack_top, irq_stack_bottom;
39 u64 estack_top, estack_bottom;
40 u64 curbase = (u64)task_stack_page(current);
41 33
42 if (user_mode(regs)) 34 generic_handle_irq_desc(desc);
43 return; 35 return true;
36}
44 37
45 if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN && 38#ifdef CONFIG_VMAP_STACK
46 regs->sp <= curbase + THREAD_SIZE) 39/*
47 return; 40 * VMAP the backing store with guard pages
41 */
42static int map_irq_stack(unsigned int cpu)
43{
44 char *stack = (char *)per_cpu_ptr(&irq_stack_backing_store, cpu);
45 struct page *pages[IRQ_STACK_SIZE / PAGE_SIZE];
46 void *va;
47 int i;
48 48
49 irq_stack_top = (u64)this_cpu_ptr(irq_stack_union.irq_stack) + 49 for (i = 0; i < IRQ_STACK_SIZE / PAGE_SIZE; i++) {
50 STACK_TOP_MARGIN; 50 phys_addr_t pa = per_cpu_ptr_to_phys(stack + (i << PAGE_SHIFT));
51 irq_stack_bottom = (u64)__this_cpu_read(irq_stack_ptr);
52 if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom)
53 return;
54 51
55 oist = this_cpu_ptr(&orig_ist); 52 pages[i] = pfn_to_page(pa >> PAGE_SHIFT);
56 estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN; 53 }
57 estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1];
58 if (regs->sp >= estack_top && regs->sp <= estack_bottom)
59 return;
60 54
61 WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n", 55 va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL);
62 current->comm, curbase, regs->sp, 56 if (!va)
63 irq_stack_top, irq_stack_bottom, 57 return -ENOMEM;
64 estack_top, estack_bottom, (void *)regs->ip);
65 58
66 if (sysctl_panic_on_stackoverflow) 59 per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE;
67 panic("low stack detected by irq handler - check messages\n"); 60 return 0;
68#endif
69} 61}
70 62#else
71bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) 63/*
64 * If VMAP stacks are disabled due to KASAN, just use the per cpu
65 * backing store without guard pages.
66 */
67static int map_irq_stack(unsigned int cpu)
72{ 68{
73 stack_overflow_check(regs); 69 void *va = per_cpu_ptr(&irq_stack_backing_store, cpu);
74 70
75 if (IS_ERR_OR_NULL(desc)) 71 per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE;
76 return false; 72 return 0;
73}
74#endif
77 75
78 generic_handle_irq_desc(desc); 76int irq_init_percpu_irqstack(unsigned int cpu)
79 return true; 77{
78 if (per_cpu(hardirq_stack_ptr, cpu))
79 return 0;
80 return map_irq_stack(cpu);
80} 81}
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index a0693b71cfc1..16919a9671fa 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -91,6 +91,8 @@ void __init init_IRQ(void)
91 for (i = 0; i < nr_legacy_irqs(); i++) 91 for (i = 0; i < nr_legacy_irqs(); i++)
92 per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i); 92 per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i);
93 93
94 BUG_ON(irq_init_percpu_irqstack(smp_processor_id()));
95
94 x86_init.irqs.intr_init(); 96 x86_init.irqs.intr_init();
95} 97}
96 98
@@ -104,6 +106,4 @@ void __init native_init_IRQ(void)
104 106
105 if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) 107 if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs())
106 setup_irq(2, &irq2); 108 setup_irq(2, &irq2);
107
108 irq_ctx_init(smp_processor_id());
109} 109}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 18bc9b51ac9b..3755d0310026 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -21,13 +21,14 @@
21#include <linux/ratelimit.h> 21#include <linux/ratelimit.h>
22#include <linux/slab.h> 22#include <linux/slab.h>
23#include <linux/export.h> 23#include <linux/export.h>
24#include <linux/atomic.h>
24#include <linux/sched/clock.h> 25#include <linux/sched/clock.h>
25 26
26#if defined(CONFIG_EDAC) 27#if defined(CONFIG_EDAC)
27#include <linux/edac.h> 28#include <linux/edac.h>
28#endif 29#endif
29 30
30#include <linux/atomic.h> 31#include <asm/cpu_entry_area.h>
31#include <asm/traps.h> 32#include <asm/traps.h>
32#include <asm/mach_traps.h> 33#include <asm/mach_traps.h>
33#include <asm/nmi.h> 34#include <asm/nmi.h>
@@ -487,6 +488,23 @@ static DEFINE_PER_CPU(unsigned long, nmi_cr2);
487 * switch back to the original IDT. 488 * switch back to the original IDT.
488 */ 489 */
489static DEFINE_PER_CPU(int, update_debug_stack); 490static DEFINE_PER_CPU(int, update_debug_stack);
491
492static bool notrace is_debug_stack(unsigned long addr)
493{
494 struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks);
495 unsigned long top = CEA_ESTACK_TOP(cs, DB);
496 unsigned long bot = CEA_ESTACK_BOT(cs, DB1);
497
498 if (__this_cpu_read(debug_stack_usage))
499 return true;
500 /*
501 * Note, this covers the guard page between DB and DB1 as well to
502 * avoid two checks. But by all means @addr can never point into
503 * the guard page.
504 */
505 return addr >= bot && addr < top;
506}
507NOKPROBE_SYMBOL(is_debug_stack);
490#endif 508#endif
491 509
492dotraplinkage notrace void 510dotraplinkage notrace void
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 4bf46575568a..86663874ef04 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -244,11 +244,6 @@ void __init setup_per_cpu_areas(void)
244 per_cpu(x86_cpu_to_logical_apicid, cpu) = 244 per_cpu(x86_cpu_to_logical_apicid, cpu) =
245 early_per_cpu_map(x86_cpu_to_logical_apicid, cpu); 245 early_per_cpu_map(x86_cpu_to_logical_apicid, cpu);
246#endif 246#endif
247#ifdef CONFIG_X86_64
248 per_cpu(irq_stack_ptr, cpu) =
249 per_cpu(irq_stack_union.irq_stack, cpu) +
250 IRQ_STACK_SIZE;
251#endif
252#ifdef CONFIG_NUMA 247#ifdef CONFIG_NUMA
253 per_cpu(x86_cpu_to_node_map, cpu) = 248 per_cpu(x86_cpu_to_node_map, cpu) =
254 early_per_cpu_map(x86_cpu_to_node_map, cpu); 249 early_per_cpu_map(x86_cpu_to_node_map, cpu);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ce1a67b70168..c92b21f9e9dc 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -935,20 +935,27 @@ out:
935 return boot_error; 935 return boot_error;
936} 936}
937 937
938void common_cpu_up(unsigned int cpu, struct task_struct *idle) 938int common_cpu_up(unsigned int cpu, struct task_struct *idle)
939{ 939{
940 int ret;
941
940 /* Just in case we booted with a single CPU. */ 942 /* Just in case we booted with a single CPU. */
941 alternatives_enable_smp(); 943 alternatives_enable_smp();
942 944
943 per_cpu(current_task, cpu) = idle; 945 per_cpu(current_task, cpu) = idle;
944 946
947 /* Initialize the interrupt stack(s) */
948 ret = irq_init_percpu_irqstack(cpu);
949 if (ret)
950 return ret;
951
945#ifdef CONFIG_X86_32 952#ifdef CONFIG_X86_32
946 /* Stack for startup_32 can be just as for start_secondary onwards */ 953 /* Stack for startup_32 can be just as for start_secondary onwards */
947 irq_ctx_init(cpu);
948 per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); 954 per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
949#else 955#else
950 initial_gs = per_cpu_offset(cpu); 956 initial_gs = per_cpu_offset(cpu);
951#endif 957#endif
958 return 0;
952} 959}
953 960
954/* 961/*
@@ -1106,7 +1113,9 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
1106 /* the FPU context is blank, nobody can own it */ 1113 /* the FPU context is blank, nobody can own it */
1107 per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; 1114 per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
1108 1115
1109 common_cpu_up(cpu, tidle); 1116 err = common_cpu_up(cpu, tidle);
1117 if (err)
1118 return err;
1110 1119
1111 err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered); 1120 err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered);
1112 if (err) { 1121 if (err) {
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index a5127b2c195f..4d1517022a14 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -403,7 +403,8 @@ SECTIONS
403 */ 403 */
404#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load 404#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load
405INIT_PER_CPU(gdt_page); 405INIT_PER_CPU(gdt_page);
406INIT_PER_CPU(irq_stack_union); 406INIT_PER_CPU(fixed_percpu_data);
407INIT_PER_CPU(irq_stack_backing_store);
407 408
408/* 409/*
409 * Build-time check on the image size: 410 * Build-time check on the image size:
@@ -412,8 +413,8 @@ INIT_PER_CPU(irq_stack_union);
412 "kernel image bigger than KERNEL_IMAGE_SIZE"); 413 "kernel image bigger than KERNEL_IMAGE_SIZE");
413 414
414#ifdef CONFIG_SMP 415#ifdef CONFIG_SMP
415. = ASSERT((irq_stack_union == 0), 416. = ASSERT((fixed_percpu_data == 0),
416 "irq_stack_union is not at start of per-cpu area"); 417 "fixed_percpu_data is not at start of per-cpu area");
417#endif 418#endif
418 419
419#endif /* CONFIG_X86_32 */ 420#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index 19c6abf9ea31..752ad11d6868 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -13,8 +13,8 @@
13static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); 13static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);
14 14
15#ifdef CONFIG_X86_64 15#ifdef CONFIG_X86_64
16static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks 16static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks);
17 [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); 17DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks);
18#endif 18#endif
19 19
20struct cpu_entry_area *get_cpu_entry_area(int cpu) 20struct cpu_entry_area *get_cpu_entry_area(int cpu)
@@ -52,10 +52,10 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
52 cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); 52 cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
53} 53}
54 54
55static void __init percpu_setup_debug_store(int cpu) 55static void __init percpu_setup_debug_store(unsigned int cpu)
56{ 56{
57#ifdef CONFIG_CPU_SUP_INTEL 57#ifdef CONFIG_CPU_SUP_INTEL
58 int npages; 58 unsigned int npages;
59 void *cea; 59 void *cea;
60 60
61 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) 61 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
@@ -78,9 +78,43 @@ static void __init percpu_setup_debug_store(int cpu)
78#endif 78#endif
79} 79}
80 80
81#ifdef CONFIG_X86_64
82
83#define cea_map_stack(name) do { \
84 npages = sizeof(estacks->name## _stack) / PAGE_SIZE; \
85 cea_map_percpu_pages(cea->estacks.name## _stack, \
86 estacks->name## _stack, npages, PAGE_KERNEL); \
87 } while (0)
88
89static void __init percpu_setup_exception_stacks(unsigned int cpu)
90{
91 struct exception_stacks *estacks = per_cpu_ptr(&exception_stacks, cpu);
92 struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
93 unsigned int npages;
94
95 BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
96
97 per_cpu(cea_exception_stacks, cpu) = &cea->estacks;
98
99 /*
100 * The exceptions stack mappings in the per cpu area are protected
101 * by guard pages so each stack must be mapped separately. DB2 is
102 * not mapped; it just exists to catch triple nesting of #DB.
103 */
104 cea_map_stack(DF);
105 cea_map_stack(NMI);
106 cea_map_stack(DB1);
107 cea_map_stack(DB);
108 cea_map_stack(MCE);
109}
110#else
111static inline void percpu_setup_exception_stacks(unsigned int cpu) {}
112#endif
113
81/* Setup the fixmap mappings only once per-processor */ 114/* Setup the fixmap mappings only once per-processor */
82static void __init setup_cpu_entry_area(int cpu) 115static void __init setup_cpu_entry_area(unsigned int cpu)
83{ 116{
117 struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
84#ifdef CONFIG_X86_64 118#ifdef CONFIG_X86_64
85 /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ 119 /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
86 pgprot_t gdt_prot = PAGE_KERNEL_RO; 120 pgprot_t gdt_prot = PAGE_KERNEL_RO;
@@ -101,10 +135,9 @@ static void __init setup_cpu_entry_area(int cpu)
101 pgprot_t tss_prot = PAGE_KERNEL; 135 pgprot_t tss_prot = PAGE_KERNEL;
102#endif 136#endif
103 137
104 cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu), 138 cea_set_pte(&cea->gdt, get_cpu_gdt_paddr(cpu), gdt_prot);
105 gdt_prot);
106 139
107 cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page, 140 cea_map_percpu_pages(&cea->entry_stack_page,
108 per_cpu_ptr(&entry_stack_storage, cpu), 1, 141 per_cpu_ptr(&entry_stack_storage, cpu), 1,
109 PAGE_KERNEL); 142 PAGE_KERNEL);
110 143
@@ -128,22 +161,15 @@ static void __init setup_cpu_entry_area(int cpu)
128 BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ 161 BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
129 offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); 162 offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
130 BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); 163 BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
131 cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss, 164 cea_map_percpu_pages(&cea->tss, &per_cpu(cpu_tss_rw, cpu),
132 &per_cpu(cpu_tss_rw, cpu),
133 sizeof(struct tss_struct) / PAGE_SIZE, tss_prot); 165 sizeof(struct tss_struct) / PAGE_SIZE, tss_prot);
134 166
135#ifdef CONFIG_X86_32 167#ifdef CONFIG_X86_32
136 per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); 168 per_cpu(cpu_entry_area, cpu) = cea;
137#endif 169#endif
138 170
139#ifdef CONFIG_X86_64 171 percpu_setup_exception_stacks(cpu);
140 BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); 172
141 BUILD_BUG_ON(sizeof(exception_stacks) !=
142 sizeof(((struct cpu_entry_area *)0)->exception_stacks));
143 cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
144 &per_cpu(exception_stacks, cpu),
145 sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
146#endif
147 percpu_setup_debug_store(cpu); 173 percpu_setup_debug_store(cpu);
148} 174}
149 175
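
A note on the hunk above: cea_map_stack() maps each exception stack into the cpu_entry_area individually precisely so that the virtual space between the stacks stays unmapped and acts as guard pages; an overflow runs off the mapped stack and faults immediately and deterministically instead of silently corrupting whatever sits next to it. The same idea can be sketched in userspace (illustration only, assuming POSIX mmap/mprotect; none of these names come from the kernel):

/*
 * Userspace analogy of a guard-paged stack: reserve an inaccessible
 * region, then enable access to only the "stack" page. The PROT_NONE
 * page below it is the guard page, so running past the stack faults
 * immediately (the last store below intentionally raises SIGSEGV).
 */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	/* Reserve guard page + stack page, both inaccessible at first. */
	char *region = mmap(NULL, 2 * page, PROT_NONE,
			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (region == MAP_FAILED)
		return 1;

	/* Map only the upper page as usable "stack"; the lower page stays PROT_NONE. */
	if (mprotect(region + page, page, PROT_READ | PROT_WRITE))
		return 1;

	region[page] = 1;	/* fine: first byte of the mapped stack page */
	printf("stack page is writable\n");
	region[page - 1] = 1;	/* deterministic SIGSEGV on the guard page */
	return 0;
}

In the kernel case the reserved region is the per-CPU cea_exception_stacks slot in the cpu_entry_area; cea_map_stack() installs PTEs only for the real stack pages, and the gaps between them are simply never mapped.
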
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 667f1da36208..06c089513d39 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -28,6 +28,7 @@
28#include <asm/mmu_context.h> /* vma_pkey() */ 28#include <asm/mmu_context.h> /* vma_pkey() */
29#include <asm/efi.h> /* efi_recover_from_page_fault()*/ 29#include <asm/efi.h> /* efi_recover_from_page_fault()*/
30#include <asm/desc.h> /* store_idt(), ... */ 30#include <asm/desc.h> /* store_idt(), ... */
31#include <asm/cpu_entry_area.h> /* exception stack */
31 32
32#define CREATE_TRACE_POINTS 33#define CREATE_TRACE_POINTS
33#include <asm/trace/exceptions.h> 34#include <asm/trace/exceptions.h>
@@ -793,7 +794,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
793 if (is_vmalloc_addr((void *)address) && 794 if (is_vmalloc_addr((void *)address) &&
794 (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || 795 (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
795 address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { 796 address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
796 unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *); 797 unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
797 /* 798 /*
798 * We're likely to be running with very little stack space 799 * We're likely to be running with very little stack space
799 * left. It's plausible that we'd hit this condition but 800 * left. It's plausible that we'd hit this condition but
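
For context on the fault.c change above: the stack overflow heuristic is untouched, only the emergency stack pointer now comes from the cpu_entry_area-backed double fault (DF) IST stack rather than the removed orig_ist array. The heuristic treats a vmalloc-area fault landing within one page of either end of the current task stack as a probable kernel stack overflow. A standalone restatement of that range test (identifiers and constants here are illustrative placeholders, not taken from fault.c):

/*
 * Sketch of the overflow heuristic: with unsigned arithmetic, addresses
 * outside the one-page windows below the stack base or above the stack
 * top wrap around to huge values, so both comparisons simply fail.
 */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define THREAD_SIZE	(4 * PAGE_SIZE)	/* placeholder stack size for this sketch */

static bool probable_stack_overflow(unsigned long address, unsigned long stack)
{
	return (stack - 1 - address < PAGE_SIZE) ||
	       (address - (stack + THREAD_SIZE) < PAGE_SIZE);
}

int main(void)
{
	unsigned long stack = 0x7f0000100000UL;	/* made-up stack base */

	printf("%d %d\n",
	       probable_stack_overflow(stack - 8, stack),		  /* 1: just below the base */
	       probable_stack_overflow(stack + THREAD_SIZE / 2, stack)); /* 0: inside the stack */
	return 0;
}
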
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index f345586f5e50..ce7188cbdae5 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -754,7 +754,7 @@ static void percpu_init(void)
754 * __per_cpu_load 754 * __per_cpu_load
755 * 755 *
756 * The "gold" linker incorrectly associates: 756 * The "gold" linker incorrectly associates:
757 * init_per_cpu__irq_stack_union 757 * init_per_cpu__fixed_percpu_data
758 * init_per_cpu__gdt_page 758 * init_per_cpu__gdt_page
759 */ 759 */
760static int is_percpu_sym(ElfW(Sym) *sym, const char *symname) 760static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 145506f9fdbe..590fcf863006 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -361,7 +361,9 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle)
361{ 361{
362 int rc; 362 int rc;
363 363
364 common_cpu_up(cpu, idle); 364 rc = common_cpu_up(cpu, idle);
365 if (rc)
366 return rc;
365 367
366 xen_setup_runstate_info(cpu); 368 xen_setup_runstate_info(cpu);
367 369
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 5077ead5e59c..c1d8b90aa4e2 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -40,13 +40,13 @@ ENTRY(startup_xen)
40#ifdef CONFIG_X86_64 40#ifdef CONFIG_X86_64
41 /* Set up %gs. 41 /* Set up %gs.
42 * 42 *
43 * The base of %gs always points to the bottom of the irqstack 43 * The base of %gs always points to fixed_percpu_data. If the
44 * union. If the stack protector canary is enabled, it is 44 * stack protector canary is enabled, it is located at %gs:40.
45 * located at %gs:40. Note that, on SMP, the boot cpu uses 45 * Note that, on SMP, the boot cpu uses init data section until
46 * init data section till per cpu areas are set up. 46 * the per cpu areas are set up.
47 */ 47 */
48 movl $MSR_GS_BASE,%ecx 48 movl $MSR_GS_BASE,%ecx
49 movq $INIT_PER_CPU_VAR(irq_stack_union),%rax 49 movq $INIT_PER_CPU_VAR(fixed_percpu_data),%rax
50 cdq 50 cdq
51 wrmsr 51 wrmsr
52#endif 52#endif
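
The rewritten comment above relies on the layout of the new fixed_percpu_data object: it sits at the very start of the per-cpu area (the vmlinux.lds.S assertion earlier in this diff enforces the offset-zero part), and the stack protector canary is the field following a 40-byte reservation, which is what keeps GCC's hard-coded %gs:40 access working. A minimal compile-time check of that offset assumption (the struct below is a simplified stand-in written for this sketch, not copied from the kernel headers):

/*
 * Sketch: why the canary is reachable at %gs:40. If the per-cpu area
 * begins with an object whose first 40 bytes are reserved as the GS
 * base "home", the canary that follows lands at a fixed 40-byte offset
 * from the GS base.
 */
#include <assert.h>
#include <stddef.h>

struct fixed_percpu_data_sketch {
	char		gs_base[40];	/* reserved first 40 bytes of the per-cpu area */
	unsigned long	stack_canary;	/* stack protector reads this via %gs:40 */
};

static_assert(offsetof(struct fixed_percpu_data_sketch, stack_canary) == 40,
	      "canary must sit 40 bytes past the GS base");

int main(void)
{
	return 0;
}
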
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index 117e76b2f939..084e45882c73 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -1687,7 +1687,6 @@ void __init xen_init_IRQ(void)
1687 1687
1688#ifdef CONFIG_X86 1688#ifdef CONFIG_X86
1689 if (xen_pv_domain()) { 1689 if (xen_pv_domain()) {
1690 irq_ctx_init(smp_processor_id());
1691 if (xen_initial_domain()) 1690 if (xen_initial_domain())
1692 pci_xen_initial_domain(); 1691 pci_xen_initial_domain();
1693 } 1692 }
diff --git a/mm/slab.c b/mm/slab.c
index 9142ee992493..284ab737faee 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1467,53 +1467,17 @@ static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
1467} 1467}
1468 1468
1469#ifdef CONFIG_DEBUG_PAGEALLOC 1469#ifdef CONFIG_DEBUG_PAGEALLOC
1470static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, 1470static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map)
1471 unsigned long caller)
1472{
1473 int size = cachep->object_size;
1474
1475 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1476
1477 if (size < 5 * sizeof(unsigned long))
1478 return;
1479
1480 *addr++ = 0x12345678;
1481 *addr++ = caller;
1482 *addr++ = smp_processor_id();
1483 size -= 3 * sizeof(unsigned long);
1484 {
1485 unsigned long *sptr = &caller;
1486 unsigned long svalue;
1487
1488 while (!kstack_end(sptr)) {
1489 svalue = *sptr++;
1490 if (kernel_text_address(svalue)) {
1491 *addr++ = svalue;
1492 size -= sizeof(unsigned long);
1493 if (size <= sizeof(unsigned long))
1494 break;
1495 }
1496 }
1497
1498 }
1499 *addr++ = 0x87654321;
1500}
1501
1502static void slab_kernel_map(struct kmem_cache *cachep, void *objp,
1503 int map, unsigned long caller)
1504{ 1471{
1505 if (!is_debug_pagealloc_cache(cachep)) 1472 if (!is_debug_pagealloc_cache(cachep))
1506 return; 1473 return;
1507 1474
1508 if (caller)
1509 store_stackinfo(cachep, objp, caller);
1510
1511 kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); 1475 kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
1512} 1476}
1513 1477
1514#else 1478#else
1515static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp, 1479static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp,
1516 int map, unsigned long caller) {} 1480 int map) {}
1517 1481
1518#endif 1482#endif
1519 1483
@@ -1661,7 +1625,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
1661 1625
1662 if (cachep->flags & SLAB_POISON) { 1626 if (cachep->flags & SLAB_POISON) {
1663 check_poison_obj(cachep, objp); 1627 check_poison_obj(cachep, objp);
1664 slab_kernel_map(cachep, objp, 1, 0); 1628 slab_kernel_map(cachep, objp, 1);
1665 } 1629 }
1666 if (cachep->flags & SLAB_RED_ZONE) { 1630 if (cachep->flags & SLAB_RED_ZONE) {
1667 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 1631 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
@@ -2433,7 +2397,7 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
2433 /* need to poison the objs? */ 2397 /* need to poison the objs? */
2434 if (cachep->flags & SLAB_POISON) { 2398 if (cachep->flags & SLAB_POISON) {
2435 poison_obj(cachep, objp, POISON_FREE); 2399 poison_obj(cachep, objp, POISON_FREE);
2436 slab_kernel_map(cachep, objp, 0, 0); 2400 slab_kernel_map(cachep, objp, 0);
2437 } 2401 }
2438 } 2402 }
2439#endif 2403#endif
@@ -2812,7 +2776,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2812 2776
2813 if (cachep->flags & SLAB_POISON) { 2777 if (cachep->flags & SLAB_POISON) {
2814 poison_obj(cachep, objp, POISON_FREE); 2778 poison_obj(cachep, objp, POISON_FREE);
2815 slab_kernel_map(cachep, objp, 0, caller); 2779 slab_kernel_map(cachep, objp, 0);
2816 } 2780 }
2817 return objp; 2781 return objp;
2818} 2782}
@@ -3076,7 +3040,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3076 return objp; 3040 return objp;
3077 if (cachep->flags & SLAB_POISON) { 3041 if (cachep->flags & SLAB_POISON) {
3078 check_poison_obj(cachep, objp); 3042 check_poison_obj(cachep, objp);
3079 slab_kernel_map(cachep, objp, 1, 0); 3043 slab_kernel_map(cachep, objp, 1);
3080 poison_obj(cachep, objp, POISON_INUSE); 3044 poison_obj(cachep, objp, POISON_INUSE);
3081 } 3045 }
3082 if (cachep->flags & SLAB_STORE_USER) 3046 if (cachep->flags & SLAB_STORE_USER)