-rw-r--r--  Documentation/x86/x86_64/mm.txt           |   2
-rw-r--r--  arch/x86/Kconfig                          |  25
-rw-r--r--  arch/x86/include/asm/espfix.h             |  16
-rw-r--r--  arch/x86/include/asm/pgtable_64_types.h   |   2
-rw-r--r--  arch/x86/include/asm/setup.h              |   2
-rw-r--r--  arch/x86/kernel/Makefile                  |   1
-rw-r--r--  arch/x86/kernel/entry_32.S                |  17
-rw-r--r--  arch/x86/kernel/entry_64.S                |  81
-rw-r--r--  arch/x86/kernel/espfix_64.c               | 209
-rw-r--r--  arch/x86/kernel/ldt.c                     |  10
-rw-r--r--  arch/x86/kernel/smpboot.c                 |   7
-rw-r--r--  arch/x86/mm/dump_pagetables.c             |  44
-rw-r--r--  arch/x86/vdso/vdso32-setup.c              |   8
-rw-r--r--  init/main.c                               |   4
14 files changed, 387 insertions(+), 41 deletions(-)
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index c584a51add15..afe68ddbe6a4 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,6 +12,8 @@ ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
 ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
 ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
+ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
+... unused hole ...
 ffffffff80000000 - ffffffffa0000000 (=512 MB)  kernel text mapping, from phys 0
 ffffffffa0000000 - ffffffffff5fffff (=1525 MB) module mapping space
 ffffffffff600000 - ffffffffffdfffff (=8 MB)   vsyscalls
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 272b493ea1bf..b660088c220d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -912,10 +912,27 @@ config VM86
 	default y
 	depends on X86_32
 	---help---
-	  This option is required by programs like DOSEMU to run 16-bit legacy
-	  code on X86 processors. It also may be needed by software like
-	  XFree86 to initialize some video cards via BIOS. Disabling this
-	  option saves about 6k.
+	  This option is required by programs like DOSEMU to run
+	  16-bit real mode legacy code on x86 processors. It also may
+	  be needed by software like XFree86 to initialize some video
+	  cards via BIOS. Disabling this option saves about 6K.
+
+config X86_16BIT
+	bool "Enable support for 16-bit segments" if EXPERT
+	default y
+	---help---
+	  This option is required by programs like Wine to run 16-bit
+	  protected mode legacy code on x86 processors. Disabling
+	  this option saves about 300 bytes on i386, or around 6K text
+	  plus 16K runtime memory on x86-64.
+
+config X86_ESPFIX32
+	def_bool y
+	depends on X86_16BIT && X86_32
+
+config X86_ESPFIX64
+	def_bool y
+	depends on X86_16BIT && X86_64
 
 config TOSHIBA
 	tristate "Toshiba Laptop support"
diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h
new file mode 100644
index 000000000000..99efebb2f69d
--- /dev/null
+++ b/arch/x86/include/asm/espfix.h
@@ -0,0 +1,16 @@
+#ifndef _ASM_X86_ESPFIX_H
+#define _ASM_X86_ESPFIX_H
+
+#ifdef CONFIG_X86_64
+
+#include <asm/percpu.h>
+
+DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+extern void init_espfix_bsp(void);
+extern void init_espfix_ap(void);
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* _ASM_X86_ESPFIX_H */
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index c883bf726398..7166e25ecb57 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -61,6 +61,8 @@ typedef struct { pteval_t pte; } pte_t;
 #define MODULES_VADDR		(__START_KERNEL_map + KERNEL_IMAGE_SIZE)
 #define MODULES_END		_AC(0xffffffffff000000, UL)
 #define MODULES_LEN		(MODULES_END - MODULES_VADDR)
+#define ESPFIX_PGD_ENTRY	_AC(-2, UL)
+#define ESPFIX_BASE_ADDR	(ESPFIX_PGD_ENTRY << PGDIR_SHIFT)
 
 #define EARLY_DYNAMIC_PAGE_TABLES	64
 
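
ESPFIX_PGD_ENTRY = -2 indexes the next-to-last slot of the kernel PGD, so
with 4-level paging (PGDIR_SHIFT == 39) the region starts at
0xffffff0000000000 and spans one 512 GB PGD entry -- exactly the
"%esp fixup stacks" range added to mm.txt above. A quick standalone check
(ordinary user-space C, not part of the patch):

#include <stdio.h>

#define PGDIR_SHIFT		39
#define ESPFIX_PGD_ENTRY	(-2UL)
#define ESPFIX_BASE_ADDR	(ESPFIX_PGD_ENTRY << PGDIR_SHIFT)

int main(void)
{
	/* Expect 0xffffff0000000000, matching the mm.txt entry */
	printf("%#lx\n", (unsigned long)ESPFIX_BASE_ADDR);
	return 0;
}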
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 9264f04a4c55..ff4e7b236e21 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -59,6 +59,8 @@ static inline void x86_ce4100_early_setup(void) { }
 
 #ifndef _SETUP
 
+#include <asm/espfix.h>
+
 /*
  * This is set up by the setup-routine at boot-time
  */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index f4d96000d33a..491ef3e59850 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
 obj-y				+= syscall_$(BITS).o vsyscall_gtod.o
 obj-$(CONFIG_X86_64)		+= vsyscall_64.o
 obj-$(CONFIG_X86_64)		+= vsyscall_emu_64.o
+obj-$(CONFIG_X86_ESPFIX64)	+= espfix_64.o
 obj-$(CONFIG_SYSFS)		+= ksysfs.o
 obj-y				+= bootflag.o e820.o
 obj-y				+= pci-dma.o quirks.o topology.o kdebugfs.o
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index a2a4f4697889..98313ffaae6a 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -527,6 +527,7 @@ syscall_exit:
 restore_all:
 	TRACE_IRQS_IRET
 restore_all_notrace:
+#ifdef CONFIG_X86_ESPFIX32
 	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS, SS and CS
 	# Warning: PT_OLDSS(%esp) contains the wrong/random values if we
 	# are returning to the kernel.
@@ -537,6 +538,7 @@ restore_all_notrace:
 	cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
 	CFI_REMEMBER_STATE
 	je ldt_ss			# returning to user-space with LDT SS
+#endif
 restore_nocheck:
 	RESTORE_REGS 4			# skip orig_eax/error_code
 irq_return:
@@ -549,13 +551,9 @@ ENTRY(iret_exc)
 .previous
 	_ASM_EXTABLE(irq_return,iret_exc)
 
+#ifdef CONFIG_X86_ESPFIX32
 	CFI_RESTORE_STATE
 ldt_ss:
-	larl PT_OLDSS(%esp), %eax
-	jnz restore_nocheck
-	testl $0x00400000, %eax		# returning to 32bit stack?
-	jnz restore_nocheck		# allright, normal return
-
 #ifdef CONFIG_PARAVIRT
 	/*
 	 * The kernel can't run on a non-flat stack if paravirt mode
@@ -597,6 +595,7 @@ ldt_ss:
 	lss (%esp), %esp		/* switch to espfix segment */
 	CFI_ADJUST_CFA_OFFSET -8
 	jmp restore_nocheck
+#endif
 	CFI_ENDPROC
 ENDPROC(system_call)
 
@@ -704,6 +703,7 @@ END(syscall_badsys)
  * the high word of the segment base from the GDT and swiches to the
  * normal stack and adjusts ESP with the matching offset.
  */
+#ifdef CONFIG_X86_ESPFIX32
 	/* fixup the stack */
 	mov GDT_ESPFIX_SS + 4, %al	/* bits 16..23 */
 	mov GDT_ESPFIX_SS + 7, %ah	/* bits 24..31 */
@@ -713,8 +713,10 @@ END(syscall_badsys)
 	pushl_cfi %eax
 	lss (%esp), %esp		/* switch to the normal stack segment */
 	CFI_ADJUST_CFA_OFFSET -8
+#endif
 .endm
 .macro UNWIND_ESPFIX_STACK
+#ifdef CONFIG_X86_ESPFIX32
 	movl %ss, %eax
 	/* see if on espfix stack */
 	cmpw $__ESPFIX_SS, %ax
@@ -725,6 +727,7 @@ END(syscall_badsys)
 	/* switch to normal stack */
 	FIXUP_ESPFIX_STACK
 27:
+#endif
 .endm
 
 /*
@@ -1355,11 +1358,13 @@ END(debug)
 ENTRY(nmi)
 	RING0_INT_FRAME
 	ASM_CLAC
+#ifdef CONFIG_X86_ESPFIX32
 	pushl_cfi %eax
 	movl %ss, %eax
 	cmpw $__ESPFIX_SS, %ax
 	popl_cfi %eax
 	je nmi_espfix_stack
+#endif
 	cmpl $ia32_sysenter_target,(%esp)
 	je nmi_stack_fixup
 	pushl_cfi %eax
@@ -1399,6 +1404,7 @@ nmi_debug_stack_check:
 	FIX_STACK 24, nmi_stack_correct, 1
 	jmp nmi_stack_correct
 
+#ifdef CONFIG_X86_ESPFIX32
 nmi_espfix_stack:
 	/* We have a RING0_INT_FRAME here.
 	 *
@@ -1420,6 +1426,7 @@ nmi_espfix_stack:
 	lss 12+4(%esp), %esp		# back to espfix stack
 	CFI_ADJUST_CFA_OFFSET -24
 	jmp irq_return
+#endif
 	CFI_ENDPROC
 END(nmi)
 
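
The FIXUP_ESPFIX_STACK macro patched above rebuilds the flat kernel stack
pointer from the espfix GDT descriptor: byte 4 of a descriptor holds base
bits 16..23, byte 7 holds bits 24..31, and the low 16 bits come from the
truncated %esp itself. A user-space sketch of the same math (descriptor
layout per the comments above; illustrative, not patch code):

#include <stdint.h>

/* Recover the flat stack address from an espfix GDT entry and the
 * 16-bit stack pointer left behind after an interrupt on a 16-bit SS. */
static uint32_t espfix_unwind(const uint8_t gdt_entry[8], uint16_t sp)
{
	uint32_t base_hi16 = ((uint32_t)gdt_entry[4] << 16) |
			     ((uint32_t)gdt_entry[7] << 24);
	return base_hi16 + sp;		/* flat %esp on the kernel stack */
}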
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index be846d2468f7..96987987c5de 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -58,6 +58,7 @@
 #include <asm/asm.h>
 #include <asm/context_tracking.h>
 #include <asm/smap.h>
+#include <asm/pgtable_types.h>
 #include <linux/err.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
@@ -1040,8 +1041,18 @@ restore_args:
 	RESTORE_ARGS 1,8,1
 
 irq_return:
+	/*
+	 * Are we returning to a stack segment from the LDT?  Note: in
+	 * 64-bit mode SS:RSP on the exception stack is always valid.
+	 */
+#ifdef CONFIG_X86_ESPFIX64
+	testb $4,(SS-RIP)(%rsp)
+	jnz irq_return_ldt
+#endif
+
+irq_return_iret:
 	INTERRUPT_RETURN
-	_ASM_EXTABLE(irq_return, bad_iret)
+	_ASM_EXTABLE(irq_return_iret, bad_iret)
 
 #ifdef CONFIG_PARAVIRT
 ENTRY(native_iret)
@@ -1049,6 +1060,32 @@ ENTRY(native_iret)
 	_ASM_EXTABLE(native_iret, bad_iret)
 #endif
 
+#ifdef CONFIG_X86_ESPFIX64
+irq_return_ldt:
+	pushq_cfi %rax
+	pushq_cfi %rdi
+	SWAPGS
+	movq PER_CPU_VAR(espfix_waddr),%rdi
+	movq %rax,(0*8)(%rdi)	/* RAX */
+	movq (2*8)(%rsp),%rax	/* RIP */
+	movq %rax,(1*8)(%rdi)
+	movq (3*8)(%rsp),%rax	/* CS */
+	movq %rax,(2*8)(%rdi)
+	movq (4*8)(%rsp),%rax	/* RFLAGS */
+	movq %rax,(3*8)(%rdi)
+	movq (6*8)(%rsp),%rax	/* SS */
+	movq %rax,(5*8)(%rdi)
+	movq (5*8)(%rsp),%rax	/* RSP */
+	movq %rax,(4*8)(%rdi)
+	andl $0xffff0000,%eax
+	popq_cfi %rdi
+	orq PER_CPU_VAR(espfix_stack),%rax
+	SWAPGS
+	movq %rax,%rsp
+	popq_cfi %rax
+	jmp irq_return_iret
+#endif
+
 	.section .fixup,"ax"
 bad_iret:
 	/*
@@ -1110,9 +1147,45 @@ ENTRY(retint_kernel)
 	call preempt_schedule_irq
 	jmp exit_intr
 #endif
-
 	CFI_ENDPROC
 END(common_interrupt)
+
+	/*
+	 * If IRET takes a fault on the espfix stack, then we
+	 * end up promoting it to a doublefault.  In that case,
+	 * modify the stack to make it look like we just entered
+	 * the #GP handler from user space, similar to bad_iret.
+	 */
+#ifdef CONFIG_X86_ESPFIX64
+	ALIGN
+__do_double_fault:
+	XCPT_FRAME 1 RDI+8
+	movq RSP(%rdi),%rax		/* Trap on the espfix stack? */
+	sarq $PGDIR_SHIFT,%rax
+	cmpl $ESPFIX_PGD_ENTRY,%eax
+	jne do_double_fault		/* No, just deliver the fault */
+	cmpl $__KERNEL_CS,CS(%rdi)
+	jne do_double_fault
+	movq RIP(%rdi),%rax
+	cmpq $irq_return_iret,%rax
+#ifdef CONFIG_PARAVIRT
+	je 1f
+	cmpq $native_iret,%rax
+#endif
+	jne do_double_fault		/* This shouldn't happen... */
+1:
+	movq PER_CPU_VAR(kernel_stack),%rax
+	subq $(6*8-KERNEL_STACK_OFFSET),%rax	/* Reset to original stack */
+	movq %rax,RSP(%rdi)
+	movq $0,(%rax)			/* Missing (lost) #GP error code */
+	movq $general_protection,RIP(%rdi)
+	retq
+	CFI_ENDPROC
+END(__do_double_fault)
+#else
+# define __do_double_fault do_double_fault
+#endif
+
 /*
  * End of kprobes section
  */
@@ -1289,7 +1362,7 @@ idtentry overflow do_overflow has_error_code=0
 idtentry bounds do_bounds has_error_code=0
 idtentry invalid_op do_invalid_op has_error_code=0
 idtentry device_not_available do_device_not_available has_error_code=0
-idtentry double_fault do_double_fault has_error_code=1 paranoid=1
+idtentry double_fault __do_double_fault has_error_code=1 paranoid=1
 idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
 idtentry invalid_TSS do_invalid_TSS has_error_code=1
 idtentry segment_not_present do_segment_not_present has_error_code=1
@@ -1576,7 +1649,7 @@ error_sti:
 	 */
 error_kernelspace:
 	incl %ebx
-	leaq irq_return(%rip),%rcx
+	leaq irq_return_iret(%rip),%rcx
 	cmpq %rcx,RIP+8(%rsp)
 	je error_swapgs
 	movl %ecx,%eax			/* zero extend */
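
The last instructions of irq_return_ldt are the heart of the fix: the
ministack aliases repeat every 64K, so the kernel picks the alias whose
bits 16..31 match the user RSP. When the 16-bit IRET then truncates %esp,
the bits 31:16 left behind are the user's own bits instead of kernel
stack bits. A minimal sketch of that selection (illustrative, not patch
code):

#include <stdint.h>

/* espfix_stack has bits 16..31 clear, so OR-ing in the user RSP's
 * bits 16..31 selects the matching 64K alias of the ministack page;
 * the andl in the assembly zero-extends, keeping only those bits. */
static uint64_t espfix_rsp(uint64_t espfix_stack, uint64_t user_rsp)
{
	return espfix_stack | (user_rsp & 0xffff0000ULL);
}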
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
new file mode 100644
index 000000000000..6afbb16e9b79
--- /dev/null
+++ b/arch/x86/kernel/espfix_64.c
@@ -0,0 +1,209 @@
+/* ----------------------------------------------------------------------- *
+ *
+ *   Copyright 2014 Intel Corporation; author: H. Peter Anvin
+ *
+ *   This program is free software; you can redistribute it and/or modify it
+ *   under the terms and conditions of the GNU General Public License,
+ *   version 2, as published by the Free Software Foundation.
+ *
+ *   This program is distributed in the hope it will be useful, but WITHOUT
+ *   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ *   more details.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * The IRET instruction, when returning to a 16-bit segment, only
+ * restores the bottom 16 bits of the user space stack pointer.  This
+ * causes some 16-bit software to break, but it also leaks kernel state
+ * to user space.
+ *
+ * We work around this by creating percpu "ministacks", each of which
+ * is mapped 2^16 times 64K apart.  When we detect that the return SS is
+ * on the LDT, we copy the IRET frame to the ministack and use the
+ * relevant alias to return to userspace.  The ministacks are mapped
+ * readonly, so if the IRET faults we promote #GP to #DF which is an IST
+ * vector and thus has its own stack; we then do the fixup in the #DF
+ * handler.
+ *
+ * This file sets up the ministacks and the related page tables.  The
+ * actual ministack invocation is in entry_64.S.
+ */
+
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/random.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/setup.h>
+#include <asm/espfix.h>
+
+/*
+ * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
+ * it up to a cache line to avoid unnecessary sharing.
+ */
+#define ESPFIX_STACK_SIZE	(8*8UL)
+#define ESPFIX_STACKS_PER_PAGE	(PAGE_SIZE/ESPFIX_STACK_SIZE)
+
+/* There is address space for how many espfix pages? */
+#define ESPFIX_PAGE_SPACE	(1UL << (PGDIR_SHIFT-PAGE_SHIFT-16))
+
+#define ESPFIX_MAX_CPUS		(ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
+#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
+# error "Need more than one PGD for the ESPFIX hack"
+#endif
+
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
+
+/* This contains the *bottom* address of the espfix stack */
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+/* Initialization mutex - should this be a spinlock? */
+static DEFINE_MUTEX(espfix_init_mutex);
+
+/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
+#define ESPFIX_MAX_PAGES  DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
+static void *espfix_pages[ESPFIX_MAX_PAGES];
+
+static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
+	__aligned(PAGE_SIZE);
+
+static unsigned int page_random, slot_random;
+
+/*
+ * This returns the bottom address of the espfix stack for a specific CPU.
+ * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case
+ * we have to account for some amount of padding at the end of each page.
+ */
+static inline unsigned long espfix_base_addr(unsigned int cpu)
+{
+	unsigned long page, slot;
+	unsigned long addr;
+
+	page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random;
+	slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE;
+	addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE);
+	addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
+	addr += ESPFIX_BASE_ADDR;
+	return addr;
+}
+
+#define PTE_STRIDE        (65536/PAGE_SIZE)
+#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE)
+#define ESPFIX_PMD_CLONES PTRS_PER_PMD
+#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES))
+
+#define PGTABLE_PROT	  ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX)
+
+static void init_espfix_random(void)
+{
+	unsigned long rand;
+
+	/*
+	 * This is run before the entropy pools are initialized,
+	 * but this is hopefully better than nothing.
+	 */
+	if (!arch_get_random_long(&rand)) {
+		/* The constant is an arbitrary large prime */
+		rdtscll(rand);
+		rand *= 0xc345c6b72fd16123UL;
+	}
+
+	slot_random = rand % ESPFIX_STACKS_PER_PAGE;
+	page_random = (rand / ESPFIX_STACKS_PER_PAGE)
+		& (ESPFIX_PAGE_SPACE - 1);
+}
+
+void __init init_espfix_bsp(void)
+{
+	pgd_t *pgd_p;
+	pteval_t ptemask;
+
+	ptemask = __supported_pte_mask;
+
+	/* Install the espfix pud into the kernel page directory */
+	pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+	pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+
+	/* Randomize the locations */
+	init_espfix_random();
+
+	/* The rest is the same as for any other processor */
+	init_espfix_ap();
+}
+
+void init_espfix_ap(void)
+{
+	unsigned int cpu, page;
+	unsigned long addr;
+	pud_t pud, *pud_p;
+	pmd_t pmd, *pmd_p;
+	pte_t pte, *pte_p;
+	int n;
+	void *stack_page;
+	pteval_t ptemask;
+
+	/* We only have to do this once... */
+	if (likely(this_cpu_read(espfix_stack)))
+		return;		/* Already initialized */
+
+	cpu = smp_processor_id();
+	addr = espfix_base_addr(cpu);
+	page = cpu/ESPFIX_STACKS_PER_PAGE;
+
+	/* Did another CPU already set this up? */
+	stack_page = ACCESS_ONCE(espfix_pages[page]);
+	if (likely(stack_page))
+		goto done;
+
+	mutex_lock(&espfix_init_mutex);
+
+	/* Did we race on the lock? */
+	stack_page = ACCESS_ONCE(espfix_pages[page]);
+	if (stack_page)
+		goto unlock_done;
+
+	ptemask = __supported_pte_mask;
+
+	pud_p = &espfix_pud_page[pud_index(addr)];
+	pud = *pud_p;
+	if (!pud_present(pud)) {
+		pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
+		pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
+		paravirt_alloc_pud(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
+		for (n = 0; n < ESPFIX_PUD_CLONES; n++)
+			set_pud(&pud_p[n], pud);
+	}
+
+	pmd_p = pmd_offset(&pud, addr);
+	pmd = *pmd_p;
+	if (!pmd_present(pmd)) {
+		pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
+		pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
+		paravirt_alloc_pmd(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
+		for (n = 0; n < ESPFIX_PMD_CLONES; n++)
+			set_pmd(&pmd_p[n], pmd);
+	}
+
+	pte_p = pte_offset_kernel(&pmd, addr);
+	stack_page = (void *)__get_free_page(GFP_KERNEL);
+	pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
+	paravirt_alloc_pte(&init_mm, __pa(stack_page) >> PAGE_SHIFT);
+	for (n = 0; n < ESPFIX_PTE_CLONES; n++)
+		set_pte(&pte_p[n*PTE_STRIDE], pte);
+
+	/* Job is done for this CPU and any CPU which shares this page */
+	ACCESS_ONCE(espfix_pages[page]) = stack_page;
+
+unlock_done:
+	mutex_unlock(&espfix_init_mutex);
+done:
+	this_cpu_write(espfix_stack, addr);
+	this_cpu_write(espfix_waddr, (unsigned long)stack_page
+		       + (addr & ~PAGE_MASK));
+}
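
To make the address interleaving in espfix_base_addr() concrete, here is
a stand-alone re-creation with the kernel constants spelled out and the
randomization zeroed (assumes PAGE_SIZE == 4096 and PGDIR_SHIFT == 39;
illustrative only, not patch code):

#include <stdio.h>

#define PAGE_SHIFT		12
#define PAGE_SIZE		(1UL << PAGE_SHIFT)
#define PGDIR_SHIFT		39
#define ESPFIX_BASE_ADDR	(-2UL << PGDIR_SHIFT)
#define ESPFIX_STACK_SIZE	(8*8UL)
#define ESPFIX_STACKS_PER_PAGE	(PAGE_SIZE/ESPFIX_STACK_SIZE)

static unsigned long espfix_base_addr(unsigned int cpu)
{
	unsigned long page = cpu / ESPFIX_STACKS_PER_PAGE;
	unsigned long slot = cpu % ESPFIX_STACKS_PER_PAGE;
	unsigned long addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE);

	/* Keep the in-page offset in bits 0..15 and push the page index
	 * up to bits 32+, leaving bits 16..31 free for the 64K aliases */
	addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
	return addr + ESPFIX_BASE_ADDR;
}

int main(void)
{
	unsigned int cpu;

	for (cpu = 0; cpu < 4; cpu++)
		printf("cpu %u -> %#lx\n", cpu, espfix_base_addr(cpu));
	return 0;	/* cpu 0 -> 0xffffff0000000000, cpu 1 -> ...0040 */
}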
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index dcbbaa165bde..c37886d759cc 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -20,8 +20,6 @@
 #include <asm/mmu_context.h>
 #include <asm/syscalls.h>
 
-int sysctl_ldt16 = 0;
-
 #ifdef CONFIG_SMP
 static void flush_ldt(void *current_mm)
 {
@@ -231,16 +229,10 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 		}
 	}
 
-	/*
-	 * On x86-64 we do not support 16-bit segments due to
-	 * IRET leaking the high bits of the kernel stack address.
-	 */
-#ifdef CONFIG_X86_64
-	if (!ldt_info.seg_32bit && !sysctl_ldt16) {
+	if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
 		error = -EINVAL;
 		goto out_unlock;
 	}
-#endif
 
 	fill_ldt(&ldt, &ldt_info);
 	if (oldmode)
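
For context, this is the operation the new gate governs: a user program
installing a 16-bit (seg_32bit = 0) LDT descriptor via modify_ldt(2), as
Wine and DOSEMU do. With CONFIG_X86_16BIT disabled it now fails with
EINVAL on both 32- and 64-bit kernels. A minimal sketch, error handling
elided:

#include <asm/ldt.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct user_desc desc;

	memset(&desc, 0, sizeof(desc));
	desc.entry_number = 0;
	desc.base_addr    = 0;
	desc.limit        = 0xffff;
	desc.seg_32bit    = 0;	/* 16-bit segment: the gated case */
	desc.contents     = MODIFY_LDT_CONTENTS_DATA;

	/* func 1 == write_ldt; fails with EINVAL if !CONFIG_X86_16BIT */
	if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) != 0)
		perror("modify_ldt");
	return 0;
}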
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 34826934d4a7..5d93ac1b72db 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -244,6 +244,13 @@ static void notrace start_secondary(void *unused)
 	check_tsc_sync_target();
 
 	/*
+	 * Enable the espfix hack for this CPU
+	 */
+#ifdef CONFIG_X86_ESPFIX64
+	init_espfix_ap();
+#endif
+
+	/*
 	 * We need to hold vector_lock so there the set of online cpus
 	 * does not change while we are assigning vectors to cpus.  Holding
 	 * this lock ensures we don't half assign or remove an irq from a cpu.
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 20621d753d5f..167ffcac16ed 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -30,12 +30,14 @@ struct pg_state {
 	unsigned long start_address;
 	unsigned long current_address;
 	const struct addr_marker *marker;
+	unsigned long lines;
 	bool to_dmesg;
 };
 
 struct addr_marker {
 	unsigned long start_address;
 	const char *name;
+	unsigned long max_lines;
 };
 
 /* indices for address_markers; keep sync'd w/ address_markers below */
41/* indices for address_markers; keep sync'd w/ address_markers below */ 43/* indices for address_markers; keep sync'd w/ address_markers below */
@@ -46,6 +48,7 @@ enum address_markers_idx {
 	LOW_KERNEL_NR,
 	VMALLOC_START_NR,
 	VMEMMAP_START_NR,
+	ESPFIX_START_NR,
 	HIGH_KERNEL_NR,
 	MODULES_VADDR_NR,
 	MODULES_END_NR,
@@ -68,6 +71,7 @@ static struct addr_marker address_markers[] = {
 	{ PAGE_OFFSET,		"Low Kernel Mapping" },
 	{ VMALLOC_START,	"vmalloc() Area" },
 	{ VMEMMAP_START,	"Vmemmap" },
+	{ ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
 	{ __START_KERNEL_map,	"High Kernel Mapping" },
 	{ MODULES_VADDR,	"Modules" },
 	{ MODULES_END,		"End Modules" },
@@ -182,7 +186,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
 		       pgprot_t new_prot, int level)
 {
 	pgprotval_t prot, cur;
-	static const char units[] = "KMGTPE";
+	static const char units[] = "BKMGTPE";
 
 	/*
 	 * If we have a "break" in the series, we need to flush the state that
@@ -197,6 +201,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
 		st->current_prot = new_prot;
 		st->level = level;
 		st->marker = address_markers;
+		st->lines = 0;
 		pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
 				   st->marker->name);
 	} else if (prot != cur || level != st->level ||
@@ -208,17 +213,24 @@ static void note_page(struct seq_file *m, struct pg_state *st,
 		/*
 		 * Now print the actual finished series
 		 */
-		pt_dump_seq_printf(m, st->to_dmesg, "0x%0*lx-0x%0*lx   ",
-				   width, st->start_address,
-				   width, st->current_address);
-
-		delta = (st->current_address - st->start_address) >> 10;
-		while (!(delta & 1023) && unit[1]) {
-			delta >>= 10;
-			unit++;
+		if (!st->marker->max_lines ||
+		    st->lines < st->marker->max_lines) {
+			pt_dump_seq_printf(m, st->to_dmesg,
+					   "0x%0*lx-0x%0*lx   ",
+					   width, st->start_address,
+					   width, st->current_address);
+
+			delta = st->current_address - st->start_address;
+			while (!(delta & 1023) && unit[1]) {
+				delta >>= 10;
+				unit++;
+			}
+			pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
+					    delta, *unit);
+			printk_prot(m, st->current_prot, st->level,
+				    st->to_dmesg);
 		}
-		pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ", delta, *unit);
-		printk_prot(m, st->current_prot, st->level, st->to_dmesg);
+		st->lines++;
 
 		/*
 		 * We print markers for special areas of address space,
@@ -226,7 +238,17 @@ static void note_page(struct seq_file *m, struct pg_state *st,
 		 * This helps in the interpretation.
 		 */
 		if (st->current_address >= st->marker[1].start_address) {
+			if (st->marker->max_lines &&
+			    st->lines > st->marker->max_lines) {
+				unsigned long nskip =
+					st->lines - st->marker->max_lines;
+				pt_dump_seq_printf(m, st->to_dmesg,
+						   "... %lu entr%s skipped ... \n",
+						   nskip,
+						   nskip == 1 ? "y" : "ies");
+			}
 			st->marker++;
+			st->lines = 0;
 			pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
 					   st->marker->name);
 		}
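
The practical effect on the page-table dump: the 16-line cap attached to
the "ESPfix Area" marker collapses the thousands of 64K-spaced ministack
aliases into a short listing followed by a skip notice. The shape of the
output, using only the format strings above (addresses and the count are
illustrative):

---[ ESPfix Area ]---
0xffffff0000000000-0xffffff0000001000 ...   (up to 16 mapping lines)
... 4080 entries skipped ... 
---[ High Kernel Mapping ]---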
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index e1f220e3ca68..00348980a3a6 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -39,7 +39,6 @@
 #ifdef CONFIG_X86_64
 #define vdso_enabled			sysctl_vsyscall32
 #define arch_setup_additional_pages	syscall32_setup_pages
-extern int sysctl_ldt16;
 #endif
 
 /*
@@ -250,13 +249,6 @@ static struct ctl_table abi_table2[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "ldt16",
-		.data		= &sysctl_ldt16,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{}
 };
 
diff --git a/init/main.c b/init/main.c
index 17d47bcdf573..0ec25157deef 100644
--- a/init/main.c
+++ b/init/main.c
@@ -617,6 +617,10 @@ asmlinkage __visible void __init start_kernel(void)
 	if (efi_enabled(EFI_RUNTIME_SERVICES))
 		efi_enter_virtual_mode();
 #endif
+#ifdef CONFIG_X86_ESPFIX64
+	/* Should be run before the first non-init thread is created */
+	init_espfix_bsp();
+#endif
 	thread_info_cache_init();
 	cred_init();
 	fork_init(totalram_pages);