author    H. Peter Anvin <hpa@linux.intel.com>  2014-04-29 19:46:09 -0400
committer H. Peter Anvin <hpa@linux.intel.com>  2014-04-30 17:14:28 -0400
commit    3891a04aafd668686239349ea58f3314ea2af86b (patch)
tree      f1c3b49ceb091a875aaa6e99b6b4a91ea79dc2ec /arch/x86/kernel
parent    d1db0eea852497762cab43b905b879dfcd3b8987 (diff)
x86-64, espfix: Don't leak bits 31:16 of %esp returning to 16-bit stack
The IRET instruction, when returning to a 16-bit segment, only
restores the bottom 16 bits of the user space stack pointer.  This
causes some 16-bit software to break, but it also leaks kernel state
to user space.  We have a software workaround for that ("espfix") for
the 32-bit kernel, but it relies on a nonzero stack segment base which
is not available in 64-bit mode.

In checkin:

    b3b42ac2cbae x86-64, modify_ldt: Ban 16-bit segments on 64-bit kernels

we "solved" this by forbidding 16-bit segments on 64-bit kernels, with
the logic that 16-bit support is crippled on 64-bit kernels anyway (no
V86 support), but it turns out that people are doing stuff like running
old Win16 binaries under Wine and expect it to work.

This patch works around the problem by creating percpu "ministacks",
each of which is mapped 2^16 times, 64K apart.  When we detect that the
return SS is on the LDT, we copy the IRET frame to the ministack and
use the relevant alias to return to userspace.  The ministacks are
mapped readonly, so if IRET faults we promote #GP to #DF, which is an
IST vector and thus has its own stack; we then do the fixup in the #DF
handler.

(Making #GP an IST exception would make the msr_safe functions unsafe
in NMI/MC context, and quite possibly have other effects.)

Special thanks to:

- Andy Lutomirski, for the suggestion of using very small stack slots
  and copying (as opposed to mapping) the IRET frame there, and for the
  suggestion to mark them readonly and let the fault promote to #DF.
- Konrad Wilk for paravirt fixup and testing.
- Borislav Petkov for testing help and useful comments.

Reported-by: Brian Gerst <brgerst@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/r/1398816946-3351-1-git-send-email-hpa@linux.intel.com
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Andy Lutomirski <amluto@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Dirk Hohndel <dirk@hohndel.org>
Cc: Arjan van de Ven <arjan.van.de.ven@intel.com>
Cc: comex <comexk@gmail.com>
Cc: Alexander van Heukelum <heukelum@fastmail.fm>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: <stable@vger.kernel.org> # consider after upstream merge
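The core of the fix is a small amount of address arithmetic.  As a reading
aid (not part of the patch), here is a minimal user-space C sketch of how
espfix_base_addr() spreads a ministack slot's offset so that bits 31:16 of
the per-cpu stack base are zero, and how irq_return_ldt then splices the
user's own bits 31:16 into the alias.  The ESPFIX_BASE_ADDR value and the
sample page/slot numbers below are illustrative assumptions.

/* Minimal user-space sketch of the espfix address math; illustration only. */
#include <stdint.h>
#include <stdio.h>

/* Assumed value of ESPFIX_BASE_ADDR (defined elsewhere in this series). */
#define ESPFIX_BASE_ADDR 0xffffff0000000000ULL

int main(void)
{
	/* Hypothetical ministack slot: page 0x11, slot 5, 64-byte slots. */
	uint64_t addr = (0x11ULL << 12) + 5 * 64;

	/* espfix_base_addr(): keep bits 15:0 of the slot offset and move
	 * everything else above bit 31, so bits 31:16 of the per-cpu
	 * espfix_stack base are guaranteed to be zero. */
	addr = (addr & 0xffffULL) | ((addr & ~0xffffULL) << 16);
	addr += ESPFIX_BASE_ADDR;

	/* irq_return_ldt: andl $0xffff0000,%eax; orq espfix_stack,%rax.
	 * Bits 31:16 of the IRET-time RSP are taken from the user's own
	 * stack pointer, so a 16-bit IRET leaves no kernel bits behind;
	 * every possible value is a valid alias because the ministack
	 * page is mapped 2^16 times, 64K apart. */
	uint64_t user_rsp = 0x1234fff8ULL;
	uint64_t iret_rsp = addr | (user_rsp & 0xffff0000ULL);

	printf("espfix_stack = %#llx, IRET alias = %#llx\n",
	       (unsigned long long)addr, (unsigned long long)iret_rsp);
	return 0;
}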
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile      1
-rw-r--r--  arch/x86/kernel/entry_64.S   73
-rw-r--r--  arch/x86/kernel/espfix_64.c 208
-rw-r--r--  arch/x86/kernel/ldt.c        11
-rw-r--r--  arch/x86/kernel/smpboot.c     7
5 files changed, 285 insertions(+), 15 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index f4d96000d33a..1cc3789d99d9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
 obj-y				+= syscall_$(BITS).o vsyscall_gtod.o
 obj-$(CONFIG_X86_64)		+= vsyscall_64.o
 obj-$(CONFIG_X86_64)		+= vsyscall_emu_64.o
+obj-$(CONFIG_X86_64)		+= espfix_64.o
 obj-$(CONFIG_SYSFS)		+= ksysfs.o
 obj-y				+= bootflag.o e820.o
 obj-y				+= pci-dma.o quirks.o topology.o kdebugfs.o
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1e96c3628bf2..bffaa986cafc 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -58,6 +58,7 @@
 #include <asm/asm.h>
 #include <asm/context_tracking.h>
 #include <asm/smap.h>
+#include <asm/pgtable_types.h>
 #include <linux/err.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
@@ -1040,8 +1041,16 @@ restore_args:
 	RESTORE_ARGS 1,8,1
 
 irq_return:
+	/*
+	 * Are we returning to a stack segment from the LDT?  Note: in
+	 * 64-bit mode SS:RSP on the exception stack is always valid.
+	 */
+	testb $4,(SS-RIP)(%rsp)
+	jnz irq_return_ldt
+
+irq_return_iret:
 	INTERRUPT_RETURN
-	_ASM_EXTABLE(irq_return, bad_iret)
+	_ASM_EXTABLE(irq_return_iret, bad_iret)
 
 #ifdef CONFIG_PARAVIRT
 ENTRY(native_iret)
@@ -1049,6 +1058,30 @@ ENTRY(native_iret)
 	_ASM_EXTABLE(native_iret, bad_iret)
 #endif
 
+irq_return_ldt:
+	pushq_cfi %rax
+	pushq_cfi %rdi
+	SWAPGS
+	movq PER_CPU_VAR(espfix_waddr),%rdi
+	movq %rax,(0*8)(%rdi)	/* RAX */
+	movq (2*8)(%rsp),%rax	/* RIP */
+	movq %rax,(1*8)(%rdi)
+	movq (3*8)(%rsp),%rax	/* CS */
+	movq %rax,(2*8)(%rdi)
+	movq (4*8)(%rsp),%rax	/* RFLAGS */
+	movq %rax,(3*8)(%rdi)
+	movq (6*8)(%rsp),%rax	/* SS */
+	movq %rax,(5*8)(%rdi)
+	movq (5*8)(%rsp),%rax	/* RSP */
+	movq %rax,(4*8)(%rdi)
+	andl $0xffff0000,%eax
+	popq_cfi %rdi
+	orq PER_CPU_VAR(espfix_stack),%rax
+	SWAPGS
+	movq %rax,%rsp
+	popq_cfi %rax
+	jmp irq_return_iret
+
 	.section .fixup,"ax"
 bad_iret:
 	/*
@@ -1110,9 +1143,41 @@ ENTRY(retint_kernel)
 	call preempt_schedule_irq
 	jmp exit_intr
 #endif
-
 	CFI_ENDPROC
 END(common_interrupt)
+
+	/*
+	 * If IRET takes a fault on the espfix stack, then we
+	 * end up promoting it to a doublefault.  In that case,
+	 * modify the stack to make it look like we just entered
+	 * the #GP handler from user space, similar to bad_iret.
+	 */
+	ALIGN
+__do_double_fault:
+	XCPT_FRAME 1 RDI+8
+	movq RSP(%rdi),%rax		/* Trap on the espfix stack? */
+	sarq $PGDIR_SHIFT,%rax
+	cmpl $ESPFIX_PGD_ENTRY,%eax
+	jne do_double_fault		/* No, just deliver the fault */
+	cmpl $__KERNEL_CS,CS(%rdi)
+	jne do_double_fault
+	movq RIP(%rdi),%rax
+	cmpq $irq_return_iret,%rax
+#ifdef CONFIG_PARAVIRT
+	je 1f
+	cmpq $native_iret,%rax
+#endif
+	jne do_double_fault		/* This shouldn't happen... */
+1:
+	movq PER_CPU_VAR(kernel_stack),%rax
+	subq $(6*8-KERNEL_STACK_OFFSET),%rax	/* Reset to original stack */
+	movq %rax,RSP(%rdi)
+	movq $0,(%rax)			/* Missing (lost) #GP error code */
+	movq $general_protection,RIP(%rdi)
+	retq
+	CFI_ENDPROC
+END(__do_double_fault)
+
 /*
  * End of kprobes section
  */
@@ -1314,7 +1379,7 @@ zeroentry overflow do_overflow
 zeroentry bounds do_bounds
 zeroentry invalid_op do_invalid_op
 zeroentry device_not_available do_device_not_available
-paranoiderrorentry double_fault do_double_fault
+paranoiderrorentry double_fault __do_double_fault
 zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
 errorentry invalid_TSS do_invalid_TSS
 errorentry segment_not_present do_segment_not_present
@@ -1601,7 +1666,7 @@ error_sti:
  */
 error_kernelspace:
 	incl %ebx
-	leaq irq_return(%rip),%rcx
+	leaq irq_return_iret(%rip),%rcx
 	cmpq %rcx,RIP+8(%rsp)
 	je error_swapgs
 	movl %ecx,%eax			/* zero extend */
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
new file mode 100644
index 000000000000..8a64da36310f
--- /dev/null
+++ b/arch/x86/kernel/espfix_64.c
@@ -0,0 +1,208 @@
+/* ----------------------------------------------------------------------- *
+ *
+ *   Copyright 2014 Intel Corporation; author: H. Peter Anvin
+ *
+ *   This program is free software; you can redistribute it and/or modify it
+ *   under the terms and conditions of the GNU General Public License,
+ *   version 2, as published by the Free Software Foundation.
+ *
+ *   This program is distributed in the hope it will be useful, but WITHOUT
+ *   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ *   more details.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * The IRET instruction, when returning to a 16-bit segment, only
+ * restores the bottom 16 bits of the user space stack pointer.  This
+ * causes some 16-bit software to break, but it also leaks kernel state
+ * to user space.
+ *
+ * This works around this by creating percpu "ministacks", each of which
+ * is mapped 2^16 times 64K apart.  When we detect that the return SS is
+ * on the LDT, we copy the IRET frame to the ministack and use the
+ * relevant alias to return to userspace.  The ministacks are mapped
+ * readonly, so if IRET faults we promote #GP to #DF which is an IST
+ * vector and thus has its own stack; we then do the fixup in the #DF
+ * handler.
+ *
+ * This file sets up the ministacks and the related page tables.  The
+ * actual ministack invocation is in entry_64.S.
+ */
+
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/random.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/setup.h>
+
+/*
+ * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
+ * it up to a cache line to avoid unnecessary sharing.
+ */
+#define ESPFIX_STACK_SIZE	(8*8UL)
+#define ESPFIX_STACKS_PER_PAGE	(PAGE_SIZE/ESPFIX_STACK_SIZE)
+
+/* There is address space for how many espfix pages? */
+#define ESPFIX_PAGE_SPACE	(1UL << (PGDIR_SHIFT-PAGE_SHIFT-16))
+
+#define ESPFIX_MAX_CPUS		(ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
+#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
+# error "Need more than one PGD for the ESPFIX hack"
+#endif
+
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
+
+/* This contains the *bottom* address of the espfix stack */
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+/* Initialization mutex - should this be a spinlock? */
+static DEFINE_MUTEX(espfix_init_mutex);
+
+/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
+#define ESPFIX_MAX_PAGES  DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
+static void *espfix_pages[ESPFIX_MAX_PAGES];
+
+static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
+	__aligned(PAGE_SIZE);
+
+static unsigned int page_random, slot_random;
+
+/*
+ * This returns the bottom address of the espfix stack for a specific CPU.
+ * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case
+ * we have to account for some amount of padding at the end of each page.
+ */
+static inline unsigned long espfix_base_addr(unsigned int cpu)
+{
+	unsigned long page, slot;
+	unsigned long addr;
+
+	page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random;
+	slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE;
+	addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE);
+	addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
+	addr += ESPFIX_BASE_ADDR;
+	return addr;
+}
+
+#define PTE_STRIDE        (65536/PAGE_SIZE)
+#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE)
+#define ESPFIX_PMD_CLONES PTRS_PER_PMD
+#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES))
+
+#define PGTABLE_PROT	  ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX)
+
+static void init_espfix_random(void)
+{
+	unsigned long rand;
+
+	/*
+	 * This is run before the entropy pools are initialized,
+	 * but this is hopefully better than nothing.
+	 */
+	if (!arch_get_random_long(&rand)) {
+		/* The constant is an arbitrary large prime */
+		rdtscll(rand);
+		rand *= 0xc345c6b72fd16123UL;
+	}
+
+	slot_random = rand % ESPFIX_STACKS_PER_PAGE;
+	page_random = (rand / ESPFIX_STACKS_PER_PAGE)
+		& (ESPFIX_PAGE_SPACE - 1);
+}
+
+void __init init_espfix_bsp(void)
+{
+	pgd_t *pgd_p;
+	pteval_t ptemask;
+
+	ptemask = __supported_pte_mask;
+
+	/* Install the espfix pud into the kernel page directory */
+	pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+	pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+
+	/* Randomize the locations */
+	init_espfix_random();
+
+	/* The rest is the same as for any other processor */
+	init_espfix_ap();
+}
+
+void init_espfix_ap(void)
+{
+	unsigned int cpu, page;
+	unsigned long addr;
+	pud_t pud, *pud_p;
+	pmd_t pmd, *pmd_p;
+	pte_t pte, *pte_p;
+	int n;
+	void *stack_page;
+	pteval_t ptemask;
+
+	/* We only have to do this once... */
+	if (likely(this_cpu_read(espfix_stack)))
+		return;		/* Already initialized */
+
+	cpu = smp_processor_id();
+	addr = espfix_base_addr(cpu);
+	page = cpu/ESPFIX_STACKS_PER_PAGE;
+
+	/* Did another CPU already set this up? */
+	stack_page = ACCESS_ONCE(espfix_pages[page]);
+	if (likely(stack_page))
+		goto done;
+
+	mutex_lock(&espfix_init_mutex);
+
+	/* Did we race on the lock? */
+	stack_page = ACCESS_ONCE(espfix_pages[page]);
+	if (stack_page)
+		goto unlock_done;
+
+	ptemask = __supported_pte_mask;
+
+	pud_p = &espfix_pud_page[pud_index(addr)];
+	pud = *pud_p;
+	if (!pud_present(pud)) {
+		pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
+		pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
+		paravirt_alloc_pud(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
+		for (n = 0; n < ESPFIX_PUD_CLONES; n++)
+			set_pud(&pud_p[n], pud);
+	}
+
+	pmd_p = pmd_offset(&pud, addr);
+	pmd = *pmd_p;
+	if (!pmd_present(pmd)) {
+		pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
+		pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
+		paravirt_alloc_pmd(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
+		for (n = 0; n < ESPFIX_PMD_CLONES; n++)
+			set_pmd(&pmd_p[n], pmd);
+	}
+
+	pte_p = pte_offset_kernel(&pmd, addr);
+	stack_page = (void *)__get_free_page(GFP_KERNEL);
+	pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
+	paravirt_alloc_pte(&init_mm, __pa(stack_page) >> PAGE_SHIFT);
+	for (n = 0; n < ESPFIX_PTE_CLONES; n++)
+		set_pte(&pte_p[n*PTE_STRIDE], pte);
+
+	/* Job is done for this CPU and any CPU which shares this page */
+	ACCESS_ONCE(espfix_pages[page]) = stack_page;
+
+unlock_done:
+	mutex_unlock(&espfix_init_mutex);
+done:
+	this_cpu_write(espfix_stack, addr);
+	this_cpu_write(espfix_waddr, (unsigned long)stack_page
+		       + (addr & ~PAGE_MASK));
+}
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index af1d14a9ebda..ebc987398923 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -229,17 +229,6 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 		}
 	}
 
-	/*
-	 * On x86-64 we do not support 16-bit segments due to
-	 * IRET leaking the high bits of the kernel stack address.
-	 */
-#ifdef CONFIG_X86_64
-	if (!ldt_info.seg_32bit) {
-		error = -EINVAL;
-		goto out_unlock;
-	}
-#endif
-
 	fill_ldt(&ldt, &ldt_info);
 	if (oldmode)
 		ldt.avl = 0;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 34826934d4a7..61a5350850fb 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -244,6 +244,13 @@ static void notrace start_secondary(void *unused)
 	check_tsc_sync_target();
 
 	/*
+	 * Enable the espfix hack for this CPU
+	 */
+#ifdef CONFIG_X86_64
+	init_espfix_ap();
+#endif
+
+	/*
 	 * We need to hold vector_lock so there the set of online cpus
 	 * does not change while we are assigning vectors to cpus.  Holding
 	 * this lock ensures we don't half assign or remove an irq from a cpu.