Diffstat:

 Documentation/x86/x86_64/mm.txt          |   2
 arch/x86/Kconfig                         |  25
 arch/x86/include/asm/espfix.h            |  16
 arch/x86/include/asm/pgtable_64_types.h  |   2
 arch/x86/include/asm/setup.h             |   2
 arch/x86/kernel/Makefile                 |   1
 arch/x86/kernel/entry_32.S               |  17
 arch/x86/kernel/entry_64.S               |  81
 arch/x86/kernel/espfix_64.c              | 209
 arch/x86/kernel/ldt.c                    |  10
 arch/x86/kernel/smpboot.c                |   7
 arch/x86/mm/dump_pagetables.c            |  44
 arch/x86/vdso/vdso32-setup.c             |   8
 init/main.c                              |   4
 14 files changed, 387 insertions(+), 41 deletions(-)
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index c584a51add15..afe68ddbe6a4 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,6 +12,8 @@ ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
 ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
 ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
+ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
+... unused hole ...
 ffffffff80000000 - ffffffffa0000000 (=512 MB)  kernel text mapping, from phys 0
 ffffffffa0000000 - ffffffffff5fffff (=1525 MB) module mapping space
 ffffffffff600000 - ffffffffffdfffff (=8 MB)   vsyscalls
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 272b493ea1bf..b660088c220d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -912,10 +912,27 @@ config VM86
 	default y
 	depends on X86_32
 	---help---
-	  This option is required by programs like DOSEMU to run 16-bit legacy
-	  code on X86 processors. It also may be needed by software like
-	  XFree86 to initialize some video cards via BIOS. Disabling this
-	  option saves about 6k.
+	  This option is required by programs like DOSEMU to run
+	  16-bit real mode legacy code on x86 processors. It also may
+	  be needed by software like XFree86 to initialize some video
+	  cards via BIOS. Disabling this option saves about 6K.
+
+config X86_16BIT
+	bool "Enable support for 16-bit segments" if EXPERT
+	default y
+	---help---
+	  This option is required by programs like Wine to run 16-bit
+	  protected mode legacy code on x86 processors. Disabling
+	  this option saves about 300 bytes on i386, or around 6K text
+	  plus 16K runtime memory on x86-64.
+
+config X86_ESPFIX32
+	def_bool y
+	depends on X86_16BIT && X86_32
+
+config X86_ESPFIX64
+	def_bool y
+	depends on X86_16BIT && X86_64
 
 config TOSHIBA
 	tristate "Toshiba Laptop support"
diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h
new file mode 100644
index 000000000000..99efebb2f69d
--- /dev/null
+++ b/arch/x86/include/asm/espfix.h
@@ -0,0 +1,16 @@
+#ifndef _ASM_X86_ESPFIX_H
+#define _ASM_X86_ESPFIX_H
+
+#ifdef CONFIG_X86_64
+
+#include <asm/percpu.h>
+
+DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+extern void init_espfix_bsp(void);
+extern void init_espfix_ap(void);
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* _ASM_X86_ESPFIX_H */
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index c883bf726398..7166e25ecb57 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -61,6 +61,8 @@ typedef struct { pteval_t pte; } pte_t;
 #define MODULES_VADDR    (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
 #define MODULES_END      _AC(0xffffffffff000000, UL)
 #define MODULES_LEN      (MODULES_END - MODULES_VADDR)
+#define ESPFIX_PGD_ENTRY _AC(-2, UL)
+#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT)
 
 #define EARLY_DYNAMIC_PAGE_TABLES	64
 
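The two new constants place the fixup stacks in the second-to-last slot of the top-level page table, one whole PGD entry below the kernel text mapping. A quick standalone check of the arithmetic (a sketch; PGDIR_SHIFT is assumed to be 39, its value with 4-level paging):

#include <stdio.h>

/* Assumed value for 4-level paging; not taken from the patch. */
#define PGDIR_SHIFT      39

#define ESPFIX_PGD_ENTRY (-2UL)  /* second PGD slot from the top */
#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT)

int main(void)
{
        /* -2UL << 39 leaves the top 25 bits set and clears the low 39,
         * giving 0xffffff0000000000: the start of the "%esp fixup
         * stacks" range added to mm.txt above. */
        printf("base: %#lx\n", ESPFIX_BASE_ADDR);
        printf("size: %lu GB per PGD slot\n", (1UL << PGDIR_SHIFT) >> 30);
        return 0;
}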
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 9264f04a4c55..ff4e7b236e21 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -59,6 +59,8 @@ static inline void x86_ce4100_early_setup(void) { }
 
 #ifndef _SETUP
 
+#include <asm/espfix.h>
+
 /*
  * This is set up by the setup-routine at boot-time
  */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index f4d96000d33a..491ef3e59850 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-y			+= syscall_$(BITS).o vsyscall_gtod.o
 obj-$(CONFIG_X86_64)	+= vsyscall_64.o
 obj-$(CONFIG_X86_64)	+= vsyscall_emu_64.o
+obj-$(CONFIG_X86_ESPFIX64)	+= espfix_64.o
 obj-$(CONFIG_SYSFS)	+= ksysfs.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o topology.o kdebugfs.o
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index a2a4f4697889..98313ffaae6a 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -527,6 +527,7 @@ syscall_exit:
 restore_all:
 	TRACE_IRQS_IRET
 restore_all_notrace:
+#ifdef CONFIG_X86_ESPFIX32
 	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS, SS and CS
 	# Warning: PT_OLDSS(%esp) contains the wrong/random values if we
 	# are returning to the kernel.
@@ -537,6 +538,7 @@ restore_all_notrace:
 	cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
 	CFI_REMEMBER_STATE
 	je ldt_ss			# returning to user-space with LDT SS
+#endif
 restore_nocheck:
 	RESTORE_REGS 4			# skip orig_eax/error_code
 irq_return:
@@ -549,13 +551,9 @@ ENTRY(iret_exc)
 .previous
 	_ASM_EXTABLE(irq_return,iret_exc)
 
+#ifdef CONFIG_X86_ESPFIX32
 	CFI_RESTORE_STATE
 ldt_ss:
-	larl PT_OLDSS(%esp), %eax
-	jnz restore_nocheck
-	testl $0x00400000, %eax		# returning to 32bit stack?
-	jnz restore_nocheck		# allright, normal return
-
 #ifdef CONFIG_PARAVIRT
 	/*
 	 * The kernel can't run on a non-flat stack if paravirt mode
@@ -597,6 +595,7 @@ ldt_ss:
 	lss (%esp), %esp		/* switch to espfix segment */
 	CFI_ADJUST_CFA_OFFSET -8
 	jmp restore_nocheck
+#endif
 	CFI_ENDPROC
 ENDPROC(system_call)
 
@@ -704,6 +703,7 @@ END(syscall_badsys)
  * the high word of the segment base from the GDT and swiches to the
  * normal stack and adjusts ESP with the matching offset.
  */
+#ifdef CONFIG_X86_ESPFIX32
 	/* fixup the stack */
 	mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
 	mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
@@ -713,8 +713,10 @@ END(syscall_badsys)
 	pushl_cfi %eax
 	lss (%esp), %esp		/* switch to the normal stack segment */
 	CFI_ADJUST_CFA_OFFSET -8
+#endif
 .endm
 .macro UNWIND_ESPFIX_STACK
+#ifdef CONFIG_X86_ESPFIX32
 	movl %ss, %eax
 	/* see if on espfix stack */
 	cmpw $__ESPFIX_SS, %ax
@@ -725,6 +727,7 @@ END(syscall_badsys)
 	/* switch to normal stack */
 	FIXUP_ESPFIX_STACK
 27:
+#endif
 .endm
 
 /*
@@ -1355,11 +1358,13 @@ END(debug)
 ENTRY(nmi)
 	RING0_INT_FRAME
 	ASM_CLAC
+#ifdef CONFIG_X86_ESPFIX32
 	pushl_cfi %eax
 	movl %ss, %eax
 	cmpw $__ESPFIX_SS, %ax
 	popl_cfi %eax
 	je nmi_espfix_stack
+#endif
 	cmpl $ia32_sysenter_target,(%esp)
 	je nmi_stack_fixup
 	pushl_cfi %eax
@@ -1399,6 +1404,7 @@ nmi_debug_stack_check:
 	FIX_STACK 24, nmi_stack_correct, 1
 	jmp nmi_stack_correct
 
+#ifdef CONFIG_X86_ESPFIX32
 nmi_espfix_stack:
 	/* We have a RING0_INT_FRAME here.
 	 *
@@ -1420,6 +1426,7 @@ nmi_espfix_stack:
 	lss 12+4(%esp), %esp		# back to espfix stack
 	CFI_ADJUST_CFA_OFFSET -24
 	jmp irq_return
+#endif
 	CFI_ENDPROC
 END(nmi)
 
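For readers following the 32-bit side: the FIXUP_ESPFIX_STACK macro conditionalized above recovers the real stack after running on the 16-bit espfix segment. Read as C, it does roughly the following (espfix_unwind and its arguments are our illustration, not kernel code; descriptor bytes 4 and 7 hold segment base bits 16..23 and 24..31 per the x86 descriptor layout):

#include <stdint.h>

/*
 * Hypothetical C rendering of FIXUP_ESPFIX_STACK: rebuild a flat
 * pointer from the espfix segment's base plus the current ESP.
 * The espfix descriptor is constructed with base bits 0..15 clear,
 * so only descriptor bytes 4 and 7 need to be read.
 */
static uint32_t espfix_unwind(const uint8_t gdt_espfix_ss[8], uint32_t esp)
{
        uint32_t base = ((uint32_t)gdt_espfix_ss[4] << 16) |  /* bits 16..23 */
                        ((uint32_t)gdt_espfix_ss[7] << 24);   /* bits 24..31 */
        return base + esp;      /* flat address matching SS:ESP */
}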
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index be846d2468f7..96987987c5de 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -58,6 +58,7 @@
 #include <asm/asm.h>
 #include <asm/context_tracking.h>
 #include <asm/smap.h>
+#include <asm/pgtable_types.h>
 #include <linux/err.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
@@ -1040,8 +1041,18 @@ restore_args:
 	RESTORE_ARGS 1,8,1
 
 irq_return:
+	/*
+	 * Are we returning to a stack segment from the LDT?  Note: in
+	 * 64-bit mode SS:RSP on the exception stack is always valid.
+	 */
+#ifdef CONFIG_X86_ESPFIX64
+	testb $4,(SS-RIP)(%rsp)
+	jnz irq_return_ldt
+#endif
+
+irq_return_iret:
 	INTERRUPT_RETURN
-	_ASM_EXTABLE(irq_return, bad_iret)
+	_ASM_EXTABLE(irq_return_iret, bad_iret)
 
 #ifdef CONFIG_PARAVIRT
 ENTRY(native_iret)
@@ -1049,6 +1060,32 @@ ENTRY(native_iret)
 	_ASM_EXTABLE(native_iret, bad_iret)
 #endif
 
+#ifdef CONFIG_X86_ESPFIX64
+irq_return_ldt:
+	pushq_cfi %rax
+	pushq_cfi %rdi
+	SWAPGS
+	movq PER_CPU_VAR(espfix_waddr),%rdi
+	movq %rax,(0*8)(%rdi)		/* RAX */
+	movq (2*8)(%rsp),%rax		/* RIP */
+	movq %rax,(1*8)(%rdi)
+	movq (3*8)(%rsp),%rax		/* CS */
+	movq %rax,(2*8)(%rdi)
+	movq (4*8)(%rsp),%rax		/* RFLAGS */
+	movq %rax,(3*8)(%rdi)
+	movq (6*8)(%rsp),%rax		/* SS */
+	movq %rax,(5*8)(%rdi)
+	movq (5*8)(%rsp),%rax		/* RSP */
+	movq %rax,(4*8)(%rdi)
+	andl $0xffff0000,%eax
+	popq_cfi %rdi
+	orq PER_CPU_VAR(espfix_stack),%rax
+	SWAPGS
+	movq %rax,%rsp
+	popq_cfi %rax
+	jmp irq_return_iret
+#endif
+
 .section .fixup,"ax"
 bad_iret:
 	/*
@@ -1110,9 +1147,45 @@ ENTRY(retint_kernel)
 	call preempt_schedule_irq
 	jmp exit_intr
 #endif
-
 	CFI_ENDPROC
 END(common_interrupt)
+
+	/*
+	 * If IRET takes a fault on the espfix stack, then we
+	 * end up promoting it to a doublefault.  In that case,
+	 * modify the stack to make it look like we just entered
+	 * the #GP handler from user space, similar to bad_iret.
+	 */
+#ifdef CONFIG_X86_ESPFIX64
+	ALIGN
+__do_double_fault:
+	XCPT_FRAME 1 RDI+8
+	movq RSP(%rdi),%rax		/* Trap on the espfix stack? */
+	sarq $PGDIR_SHIFT,%rax
+	cmpl $ESPFIX_PGD_ENTRY,%eax
+	jne do_double_fault		/* No, just deliver the fault */
+	cmpl $__KERNEL_CS,CS(%rdi)
+	jne do_double_fault
+	movq RIP(%rdi),%rax
+	cmpq $irq_return_iret,%rax
+#ifdef CONFIG_PARAVIRT
+	je 1f
+	cmpq $native_iret,%rax
+#endif
+	jne do_double_fault		/* This shouldn't happen... */
+1:
+	movq PER_CPU_VAR(kernel_stack),%rax
+	subq $(6*8-KERNEL_STACK_OFFSET),%rax	/* Reset to original stack */
+	movq %rax,RSP(%rdi)
+	movq $0,(%rax)			/* Missing (lost) #GP error code */
+	movq $general_protection,RIP(%rdi)
+	retq
+	CFI_ENDPROC
+END(__do_double_fault)
+#else
+# define __do_double_fault do_double_fault
+#endif
+
 /*
  * End of kprobes section
  */
@@ -1289,7 +1362,7 @@ idtentry overflow do_overflow has_error_code=0
 idtentry bounds do_bounds has_error_code=0
 idtentry invalid_op do_invalid_op has_error_code=0
 idtentry device_not_available do_device_not_available has_error_code=0
-idtentry double_fault do_double_fault has_error_code=1 paranoid=1
+idtentry double_fault __do_double_fault has_error_code=1 paranoid=1
 idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
 idtentry invalid_TSS do_invalid_TSS has_error_code=1
 idtentry segment_not_present do_segment_not_present has_error_code=1
@@ -1576,7 +1649,7 @@ error_sti:
  */
 error_kernelspace:
 	incl %ebx
-	leaq irq_return_iret(%rip),%rcx
+	leaq irq_return_iret(%rip),%rcx
 	cmpq %rcx,RIP+8(%rsp)
 	je error_swapgs
 	movl %ecx,%eax			/* zero extend */
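In C, the irq_return_ldt sequence above does roughly the following: copy the saved RAX plus the five-word hardware IRET frame to the per-cpu writable alias, then switch RSP to the read-only alias whose bits 16..31 match the user RSP, so the 16 bits that IRET fails to restore leak nothing the process did not already have. The struct and helper below are illustrative only; espfix_waddr and espfix_stack are the per-cpu variables added by this patch:

/* Illustrative sketch, not kernel code. */
struct iret_frame {
        unsigned long rip;
        unsigned long cs;
        unsigned long rflags;
        unsigned long rsp;
        unsigned long ss;
};

struct espfix_ministack {
        unsigned long rax;      /* scratch slot, popped just before IRET */
        struct iret_frame frame;
};

static unsigned long espfix_switch_stack(const struct iret_frame *f,
                                         unsigned long rax,
                                         unsigned long espfix_waddr,
                                         unsigned long espfix_stack)
{
        /* espfix_waddr is the writable (direct-map) alias of the page */
        struct espfix_ministack *mini = (struct espfix_ministack *)espfix_waddr;

        mini->rax   = rax;      /* slot (0*8) in the asm */
        mini->frame = *f;       /* RIP, CS, RFLAGS, RSP, SS, same slots */

        /*
         * New RSP: bits 16..31 come from the user RSP, everything else
         * from the read-only alias base.  The ministack is mapped every
         * 64K across those bits, so any value there still points at the
         * frame just written.
         */
        return (f->rsp & 0xffff0000UL) | espfix_stack;
}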
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
new file mode 100644
index 000000000000..6afbb16e9b79
--- /dev/null
+++ b/arch/x86/kernel/espfix_64.c
@@ -0,0 +1,209 @@
+/* ----------------------------------------------------------------------- *
+ *
+ *   Copyright 2014 Intel Corporation; author: H. Peter Anvin
+ *
+ *   This program is free software; you can redistribute it and/or modify it
+ *   under the terms and conditions of the GNU General Public License,
+ *   version 2, as published by the Free Software Foundation.
+ *
+ *   This program is distributed in the hope it will be useful, but WITHOUT
+ *   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ *   more details.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * The IRET instruction, when returning to a 16-bit segment, only
+ * restores the bottom 16 bits of the user space stack pointer.  This
+ * causes some 16-bit software to break, but it also leaks kernel state
+ * to user space.
+ *
+ * This works around this by creating percpu "ministacks", each of which
+ * is mapped 2^16 times 64K apart.  When we detect that the return SS is
+ * on the LDT, we copy the IRET frame to the ministack and use the
+ * relevant alias to return to userspace.  The ministacks are mapped
+ * readonly, so if the IRET faults we promote #GP to #DF which is an IST
+ * vector and thus has its own stack; we then do the fixup in the #DF
+ * handler.
+ *
+ * This file sets up the ministacks and the related page tables.  The
+ * actual ministack invocation is in entry_64.S.
+ */
+
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/random.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/setup.h>
+#include <asm/espfix.h>
+
+/*
+ * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
+ * it up to a cache line to avoid unnecessary sharing.
+ */
+#define ESPFIX_STACK_SIZE	(8*8UL)
+#define ESPFIX_STACKS_PER_PAGE	(PAGE_SIZE/ESPFIX_STACK_SIZE)
+
+/* There is address space for how many espfix pages? */
+#define ESPFIX_PAGE_SPACE	(1UL << (PGDIR_SHIFT-PAGE_SHIFT-16))
+
+#define ESPFIX_MAX_CPUS		(ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
+#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
+# error "Need more than one PGD for the ESPFIX hack"
+#endif
+
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
+
+/* This contains the *bottom* address of the espfix stack */
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+/* Initialization mutex - should this be a spinlock? */
+static DEFINE_MUTEX(espfix_init_mutex);
+
+/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
+#define ESPFIX_MAX_PAGES  DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
+static void *espfix_pages[ESPFIX_MAX_PAGES];
+
+static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
+	__aligned(PAGE_SIZE);
+
+static unsigned int page_random, slot_random;
+
+/*
+ * This returns the bottom address of the espfix stack for a specific CPU.
+ * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case
+ * we have to account for some amount of padding at the end of each page.
+ */
+static inline unsigned long espfix_base_addr(unsigned int cpu)
+{
+	unsigned long page, slot;
+	unsigned long addr;
+
+	page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random;
+	slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE;
+	addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE);
+	addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
+	addr += ESPFIX_BASE_ADDR;
+	return addr;
+}
+
+#define PTE_STRIDE        (65536/PAGE_SIZE)
+#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE)
+#define ESPFIX_PMD_CLONES PTRS_PER_PMD
+#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES))
+
+#define PGTABLE_PROT	  ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX)
+
+static void init_espfix_random(void)
+{
+	unsigned long rand;
+
+	/*
+	 * This is run before the entropy pools are initialized,
+	 * but this is hopefully better than nothing.
+	 */
+	if (!arch_get_random_long(&rand)) {
+		/* The constant is an arbitrary large prime */
+		rdtscll(rand);
+		rand *= 0xc345c6b72fd16123UL;
+	}
+
+	slot_random = rand % ESPFIX_STACKS_PER_PAGE;
+	page_random = (rand / ESPFIX_STACKS_PER_PAGE)
+		& (ESPFIX_PAGE_SPACE - 1);
+}
+
+void __init init_espfix_bsp(void)
+{
+	pgd_t *pgd_p;
+	pteval_t ptemask;
+
+	ptemask = __supported_pte_mask;
+
+	/* Install the espfix pud into the kernel page directory */
+	pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+	pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+
+	/* Randomize the locations */
+	init_espfix_random();
+
+	/* The rest is the same as for any other processor */
+	init_espfix_ap();
+}
+
+void init_espfix_ap(void)
+{
+	unsigned int cpu, page;
+	unsigned long addr;
+	pud_t pud, *pud_p;
+	pmd_t pmd, *pmd_p;
+	pte_t pte, *pte_p;
+	int n;
+	void *stack_page;
+	pteval_t ptemask;
+
+	/* We only have to do this once... */
+	if (likely(this_cpu_read(espfix_stack)))
+		return;		/* Already initialized */
+
+	cpu = smp_processor_id();
+	addr = espfix_base_addr(cpu);
+	page = cpu/ESPFIX_STACKS_PER_PAGE;
+
+	/* Did another CPU already set this up? */
+	stack_page = ACCESS_ONCE(espfix_pages[page]);
+	if (likely(stack_page))
+		goto done;
+
+	mutex_lock(&espfix_init_mutex);
+
+	/* Did we race on the lock? */
+	stack_page = ACCESS_ONCE(espfix_pages[page]);
+	if (stack_page)
+		goto unlock_done;
+
+	ptemask = __supported_pte_mask;
+
+	pud_p = &espfix_pud_page[pud_index(addr)];
+	pud = *pud_p;
+	if (!pud_present(pud)) {
+		pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
+		pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
+		paravirt_alloc_pud(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
+		for (n = 0; n < ESPFIX_PUD_CLONES; n++)
+			set_pud(&pud_p[n], pud);
+	}
+
+	pmd_p = pmd_offset(&pud, addr);
+	pmd = *pmd_p;
+	if (!pmd_present(pmd)) {
+		pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
+		pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
+		paravirt_alloc_pmd(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
+		for (n = 0; n < ESPFIX_PMD_CLONES; n++)
+			set_pmd(&pmd_p[n], pmd);
+	}
+
+	pte_p = pte_offset_kernel(&pmd, addr);
+	stack_page = (void *)__get_free_page(GFP_KERNEL);
+	pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
+	paravirt_alloc_pte(&init_mm, __pa(stack_page) >> PAGE_SHIFT);
+	for (n = 0; n < ESPFIX_PTE_CLONES; n++)
+		set_pte(&pte_p[n*PTE_STRIDE], pte);
+
+	/* Job is done for this CPU and any CPU which shares this page */
+	ACCESS_ONCE(espfix_pages[page]) = stack_page;
+
+unlock_done:
+	mutex_unlock(&espfix_init_mutex);
+done:
+	this_cpu_write(espfix_stack, addr);
+	this_cpu_write(espfix_waddr, (unsigned long)stack_page
+		       + (addr & ~PAGE_MASK));
+}
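The interleaving line in espfix_base_addr() is worth a worked example: bits 0..15 of the logical offset stay in place (they become the 16 bits IRET preserves), while everything above is shifted past the 64K alias window. A standalone sketch, with the randomization zeroed for readability and 4-level-paging constants assumed:

#include <stdio.h>

/* Assumed constants (4-level paging); randomization zeroed for clarity. */
#define PAGE_SHIFT              12
#define PAGE_SIZE               (1UL << PAGE_SHIFT)
#define PGDIR_SHIFT             39
#define ESPFIX_BASE_ADDR        (-2UL << PGDIR_SHIFT)
#define ESPFIX_STACK_SIZE       (8*8UL)
#define ESPFIX_STACKS_PER_PAGE  (PAGE_SIZE/ESPFIX_STACK_SIZE)  /* 64 */

static unsigned long espfix_base_addr(unsigned int cpu)
{
        unsigned long page = cpu / ESPFIX_STACKS_PER_PAGE;
        unsigned long slot = cpu % ESPFIX_STACKS_PER_PAGE;
        unsigned long addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE);

        /* Keep bits 0..15, push the rest above the 64K alias window */
        addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
        return addr + ESPFIX_BASE_ADDR;
}

int main(void)
{
        unsigned int cpu;

        /* CPUs 0..63 share physical page 0 and differ only in the
         * 64-byte slot (bits 6..11); CPU 64 moves to the next page,
         * which lands a full 1 << 28 above the base. */
        for (cpu = 0; cpu <= 128; cpu += 64)
                printf("cpu %3u -> %#lx\n", cpu, espfix_base_addr(cpu));
        return 0;
}

Each 4K physical page thus reappears 2^16 times, 64K apart, which is exactly what the ESPFIX_PTE_CLONES/PMD_CLONES/PUD_CLONES loops in init_espfix_ap() set up.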
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index dcbbaa165bde..c37886d759cc 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -20,8 +20,6 @@
 #include <asm/mmu_context.h>
 #include <asm/syscalls.h>
 
-int sysctl_ldt16 = 0;
-
 #ifdef CONFIG_SMP
 static void flush_ldt(void *current_mm)
 {
@@ -231,16 +229,10 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 		}
 	}
 
-	/*
-	 * On x86-64 we do not support 16-bit segments due to
-	 * IRET leaking the high bits of the kernel stack address.
-	 */
-#ifdef CONFIG_X86_64
-	if (!ldt_info.seg_32bit && !sysctl_ldt16) {
+	if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
 		error = -EINVAL;
 		goto out_unlock;
 	}
-#endif
 
 	fill_ldt(&ldt, &ldt_info);
 	if (oldmode)
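The rejected case is easy to see from userspace: a modify_ldt() call installing a descriptor with seg_32bit clear now fails with -EINVAL exactly when CONFIG_X86_16BIT is off, on 32-bit and 64-bit kernels alike, replacing the old x86-64-only check and ldt16 sysctl. A minimal sketch of such a request (field values are illustrative):

#include <asm/ldt.h>            /* struct user_desc */
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        struct user_desc desc;

        memset(&desc, 0, sizeof(desc));
        desc.entry_number = 0;
        desc.base_addr    = 0;
        desc.limit        = 0xffff;
        desc.seg_32bit    = 0;          /* 16-bit segment: the case being gated */
        desc.useable      = 1;

        /* func 0x11 = write_ldt, new mode; this hits the check above */
        if (syscall(SYS_modify_ldt, 0x11, &desc, sizeof(desc)) < 0)
                perror("modify_ldt");   /* EINVAL when CONFIG_X86_16BIT=n */
        return 0;
}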
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 34826934d4a7..5d93ac1b72db 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -244,6 +244,13 @@ static void notrace start_secondary(void *unused)
 	check_tsc_sync_target();
 
 	/*
+	 * Enable the espfix hack for this CPU
+	 */
+#ifdef CONFIG_X86_ESPFIX64
+	init_espfix_ap();
+#endif
+
+	/*
 	 * We need to hold vector_lock so there the set of online cpus
 	 * does not change while we are assigning vectors to cpus.  Holding
 	 * this lock ensures we don't half assign or remove an irq from a cpu.
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 20621d753d5f..167ffcac16ed 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -30,12 +30,14 @@ struct pg_state {
 	unsigned long start_address;
 	unsigned long current_address;
 	const struct addr_marker *marker;
+	unsigned long lines;
 	bool to_dmesg;
 };
 
 struct addr_marker {
 	unsigned long start_address;
 	const char *name;
+	unsigned long max_lines;
 };
 
 /* indices for address_markers; keep sync'd w/ address_markers below */
@@ -46,6 +48,7 @@ enum address_markers_idx {
 	LOW_KERNEL_NR,
 	VMALLOC_START_NR,
 	VMEMMAP_START_NR,
+	ESPFIX_START_NR,
 	HIGH_KERNEL_NR,
 	MODULES_VADDR_NR,
 	MODULES_END_NR,
@@ -68,6 +71,7 @@ static struct addr_marker address_markers[] = {
 	{ PAGE_OFFSET,		"Low Kernel Mapping" },
 	{ VMALLOC_START,	"vmalloc() Area" },
 	{ VMEMMAP_START,	"Vmemmap" },
+	{ ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
 	{ __START_KERNEL_map,	"High Kernel Mapping" },
 	{ MODULES_VADDR,	"Modules" },
 	{ MODULES_END,		"End Modules" },
@@ -182,7 +186,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
 		      pgprot_t new_prot, int level)
 {
 	pgprotval_t prot, cur;
-	static const char units[] = "KMGTPE";
+	static const char units[] = "BKMGTPE";
 
 	/*
 	 * If we have a "break" in the series, we need to flush the state that
@@ -197,6 +201,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
 		st->current_prot = new_prot;
 		st->level = level;
 		st->marker = address_markers;
+		st->lines = 0;
 		pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
 				   st->marker->name);
 	} else if (prot != cur || level != st->level ||
@@ -208,17 +213,24 @@ static void note_page(struct seq_file *m, struct pg_state *st,
 		/*
 		 * Now print the actual finished series
 		 */
-		pt_dump_seq_printf(m, st->to_dmesg, "0x%0*lx-0x%0*lx   ",
-				   width, st->start_address,
-				   width, st->current_address);
-
-		delta = (st->current_address - st->start_address) >> 10;
-		while (!(delta & 1023) && unit[1]) {
-			delta >>= 10;
-			unit++;
+		if (!st->marker->max_lines ||
+		    st->lines < st->marker->max_lines) {
+			pt_dump_seq_printf(m, st->to_dmesg,
+					   "0x%0*lx-0x%0*lx   ",
+					   width, st->start_address,
+					   width, st->current_address);
+
+			delta = st->current_address - st->start_address;
+			while (!(delta & 1023) && unit[1]) {
+				delta >>= 10;
+				unit++;
+			}
+			pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
+					    delta, *unit);
+			printk_prot(m, st->current_prot, st->level,
+				    st->to_dmesg);
 		}
-		pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ", delta, *unit);
-		printk_prot(m, st->current_prot, st->level, st->to_dmesg);
+		st->lines++;
 
 		/*
 		 * We print markers for special areas of address space,
@@ -226,7 +238,17 @@ static void note_page(struct seq_file *m, struct pg_state *st,
 		 * This helps in the interpretation.
 		 */
 		if (st->current_address >= st->marker[1].start_address) {
+			if (st->marker->max_lines &&
+			    st->lines > st->marker->max_lines) {
+				unsigned long nskip =
+					st->lines - st->marker->max_lines;
+				pt_dump_seq_printf(m, st->to_dmesg,
+						   "... %lu entr%s skipped ... \n",
+						   nskip,
+						   nskip == 1 ? "y" : "ies");
+			}
 			st->marker++;
+			st->lines = 0;
 			pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
 					   st->marker->name);
 		}
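Since one espfix physical page appears 2^16 times in the page tables, an unbounded dump would emit tens of thousands of near-identical lines for the area; max_lines = 16 on the ESPfix marker caps that. The control flow of the two new checks, reduced to a toy model:

#include <stdio.h>

/* Toy model of the max_lines logic: print at most max_lines ranges
 * for a marker, then summarize the rest, as note_page() now does. */
int main(void)
{
        unsigned long lines = 0, max_lines = 16, total = 65536;
        unsigned long i;

        for (i = 0; i < total; i++) {
                if (!max_lines || lines < max_lines)
                        printf("range %lu\n", i);  /* stands in for the range printout */
                lines++;
        }
        if (max_lines && lines > max_lines)
                printf("... %lu entr%s skipped ...\n",
                       lines - max_lines, (lines - max_lines) == 1 ? "y" : "ies");
        return 0;
}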
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index e1f220e3ca68..00348980a3a6 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -39,7 +39,6 @@
 #ifdef CONFIG_X86_64
 #define vdso_enabled			sysctl_vsyscall32
 #define arch_setup_additional_pages	syscall32_setup_pages
-extern int sysctl_ldt16;
 #endif
 
 /*
@@ -250,13 +249,6 @@ static struct ctl_table abi_table2[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "ldt16",
-		.data		= &sysctl_ldt16,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{}
 };
 
diff --git a/init/main.c b/init/main.c
index 17d47bcdf573..0ec25157deef 100644
--- a/init/main.c
+++ b/init/main.c
@@ -617,6 +617,10 @@ asmlinkage __visible void __init start_kernel(void)
 	if (efi_enabled(EFI_RUNTIME_SERVICES))
 		efi_enter_virtual_mode();
 #endif
+#ifdef CONFIG_X86_ESPFIX64
+	/* Should be run before the first non-init thread is created */
+	init_espfix_bsp();
+#endif
 	thread_info_cache_init();
 	cred_init();
 	fork_init(totalram_pages);