author		Linus Torvalds <torvalds@linux-foundation.org>	2017-07-03 17:45:09 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-07-03 17:45:09 -0400
commit		7a69f9c60b49699579f5bfb71f928cceba0afe1a (patch)
tree		bf3b5640bbd9f23beeb5a55d18348d65bafff8e8 /arch/x86
parent		9bc088ab66be8978fbc981ba9644468fa2c2fd3f (diff)
parent		8781fb7e9749da424e01daacd14834b674658c63 (diff)
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
"The main changes in this cycle were:
- Continued work to add support for 5-level paging provided by future
Intel CPUs. In particular we switch the x86 GUP code to the generic
implementation. (Kirill A. Shutemov)
- Continued work to add PCID CPU support to native kernels as well.
In this round most of the focus is on reworking/refreshing the TLB
flush infrastructure for the upcoming PCID changes. (Andy
Lutomirski)"
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (34 commits)
x86/mm: Delete a big outdated comment about TLB flushing
x86/mm: Don't reenter flush_tlb_func_common()
x86/KASLR: Fix detection 32/64 bit bootloaders for 5-level paging
x86/ftrace: Exclude functions in head64.c from function-tracing
x86/mmap, ASLR: Do not treat unlimited-stack tasks as legacy mmap
x86/mm: Remove reset_lazy_tlbstate()
x86/ldt: Simplify the LDT switching logic
x86/boot/64: Put __startup_64() into .head.text
x86/mm: Add support for 5-level paging for KASLR
x86/mm: Make kernel_physical_mapping_init() support 5-level paging
x86/mm: Add sync_global_pgds() for configuration with 5-level paging
x86/boot/64: Add support of additional page table level during early boot
x86/boot/64: Rename init_level4_pgt and early_level4_pgt
x86/boot/64: Rewrite startup_64() in C
x86/boot/compressed: Enable 5-level paging during decompression stage
x86/boot/efi: Define __KERNEL32_CS GDT on 64-bit configurations
x86/boot/efi: Fix __KERNEL_CS definition of GDT entry on 64-bit configurations
x86/boot/efi: Cleanup initialization of GDT entries
x86/asm: Fix comment in return_from_SYSCALL_64()
x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation
...
Diffstat (limited to 'arch/x86')
54 files changed, 1061 insertions, 1191 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0efb4c9497bc..737212c0333e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -69,7 +69,7 @@ config X86 | |||
69 | select ARCH_USE_BUILTIN_BSWAP | 69 | select ARCH_USE_BUILTIN_BSWAP |
70 | select ARCH_USE_QUEUED_RWLOCKS | 70 | select ARCH_USE_QUEUED_RWLOCKS |
71 | select ARCH_USE_QUEUED_SPINLOCKS | 71 | select ARCH_USE_QUEUED_SPINLOCKS |
72 | select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP | 72 | select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH |
73 | select ARCH_WANT_FRAME_POINTERS | 73 | select ARCH_WANT_FRAME_POINTERS |
74 | select ARCH_WANTS_DYNAMIC_TASK_STRUCT | 74 | select ARCH_WANTS_DYNAMIC_TASK_STRUCT |
75 | select BUILDTIME_EXTABLE_SORT | 75 | select BUILDTIME_EXTABLE_SORT |
@@ -2793,6 +2793,9 @@ config X86_DMA_REMAP | |||
2793 | bool | 2793 | bool |
2794 | depends on STA2X11 | 2794 | depends on STA2X11 |
2795 | 2795 | ||
2796 | config HAVE_GENERIC_GUP | ||
2797 | def_bool y | ||
2798 | |||
2796 | source "net/Kconfig" | 2799 | source "net/Kconfig" |
2797 | 2800 | ||
2798 | source "drivers/Kconfig" | 2801 | source "drivers/Kconfig" |
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index cbf4b87f55b9..c3e869eaef0c 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -1046,9 +1046,31 @@ struct boot_params *efi_main(struct efi_config *c, | |||
1046 | memset((char *)gdt->address, 0x0, gdt->size); | 1046 | memset((char *)gdt->address, 0x0, gdt->size); |
1047 | desc = (struct desc_struct *)gdt->address; | 1047 | desc = (struct desc_struct *)gdt->address; |
1048 | 1048 | ||
1049 | /* The first GDT is a dummy and the second is unused. */ | 1049 | /* The first GDT is a dummy. */ |
1050 | desc += 2; | 1050 | desc++; |
1051 | |||
1052 | if (IS_ENABLED(CONFIG_X86_64)) { | ||
1053 | /* __KERNEL32_CS */ | ||
1054 | desc->limit0 = 0xffff; | ||
1055 | desc->base0 = 0x0000; | ||
1056 | desc->base1 = 0x0000; | ||
1057 | desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ; | ||
1058 | desc->s = DESC_TYPE_CODE_DATA; | ||
1059 | desc->dpl = 0; | ||
1060 | desc->p = 1; | ||
1061 | desc->limit = 0xf; | ||
1062 | desc->avl = 0; | ||
1063 | desc->l = 0; | ||
1064 | desc->d = SEG_OP_SIZE_32BIT; | ||
1065 | desc->g = SEG_GRANULARITY_4KB; | ||
1066 | desc->base2 = 0x00; | ||
1067 | desc++; | ||
1068 | } else { | ||
1069 | /* Second entry is unused on 32-bit */ | ||
1070 | desc++; | ||
1071 | } | ||
1051 | 1072 | ||
1073 | /* __KERNEL_CS */ | ||
1052 | desc->limit0 = 0xffff; | 1074 | desc->limit0 = 0xffff; |
1053 | desc->base0 = 0x0000; | 1075 | desc->base0 = 0x0000; |
1054 | desc->base1 = 0x0000; | 1076 | desc->base1 = 0x0000; |
@@ -1058,12 +1080,18 @@ struct boot_params *efi_main(struct efi_config *c, | |||
1058 | desc->p = 1; | 1080 | desc->p = 1; |
1059 | desc->limit = 0xf; | 1081 | desc->limit = 0xf; |
1060 | desc->avl = 0; | 1082 | desc->avl = 0; |
1061 | desc->l = 0; | 1083 | if (IS_ENABLED(CONFIG_X86_64)) { |
1062 | desc->d = SEG_OP_SIZE_32BIT; | 1084 | desc->l = 1; |
1085 | desc->d = 0; | ||
1086 | } else { | ||
1087 | desc->l = 0; | ||
1088 | desc->d = SEG_OP_SIZE_32BIT; | ||
1089 | } | ||
1063 | desc->g = SEG_GRANULARITY_4KB; | 1090 | desc->g = SEG_GRANULARITY_4KB; |
1064 | desc->base2 = 0x00; | 1091 | desc->base2 = 0x00; |
1065 | |||
1066 | desc++; | 1092 | desc++; |
1093 | |||
1094 | /* __KERNEL_DS */ | ||
1067 | desc->limit0 = 0xffff; | 1095 | desc->limit0 = 0xffff; |
1068 | desc->base0 = 0x0000; | 1096 | desc->base0 = 0x0000; |
1069 | desc->base1 = 0x0000; | 1097 | desc->base1 = 0x0000; |
@@ -1077,24 +1105,25 @@ struct boot_params *efi_main(struct efi_config *c, | |||
1077 | desc->d = SEG_OP_SIZE_32BIT; | 1105 | desc->d = SEG_OP_SIZE_32BIT; |
1078 | desc->g = SEG_GRANULARITY_4KB; | 1106 | desc->g = SEG_GRANULARITY_4KB; |
1079 | desc->base2 = 0x00; | 1107 | desc->base2 = 0x00; |
1080 | |||
1081 | #ifdef CONFIG_X86_64 | ||
1082 | /* Task segment value */ | ||
1083 | desc++; | 1108 | desc++; |
1084 | desc->limit0 = 0x0000; | 1109 | |
1085 | desc->base0 = 0x0000; | 1110 | if (IS_ENABLED(CONFIG_X86_64)) { |
1086 | desc->base1 = 0x0000; | 1111 | /* Task segment value */ |
1087 | desc->type = SEG_TYPE_TSS; | 1112 | desc->limit0 = 0x0000; |
1088 | desc->s = 0; | 1113 | desc->base0 = 0x0000; |
1089 | desc->dpl = 0; | 1114 | desc->base1 = 0x0000; |
1090 | desc->p = 1; | 1115 | desc->type = SEG_TYPE_TSS; |
1091 | desc->limit = 0x0; | 1116 | desc->s = 0; |
1092 | desc->avl = 0; | 1117 | desc->dpl = 0; |
1093 | desc->l = 0; | 1118 | desc->p = 1; |
1094 | desc->d = 0; | 1119 | desc->limit = 0x0; |
1095 | desc->g = SEG_GRANULARITY_4KB; | 1120 | desc->avl = 0; |
1096 | desc->base2 = 0x00; | 1121 | desc->l = 0; |
1097 | #endif /* CONFIG_X86_64 */ | 1122 | desc->d = 0; |
1123 | desc->g = SEG_GRANULARITY_4KB; | ||
1124 | desc->base2 = 0x00; | ||
1125 | desc++; | ||
1126 | } | ||
1098 | 1127 | ||
1099 | asm volatile("cli"); | 1128 | asm volatile("cli"); |
1100 | asm volatile ("lgdt %0" : : "m" (*gdt)); | 1129 | asm volatile ("lgdt %0" : : "m" (*gdt)); |
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index d2ae1f821e0c..fbf4c32d0b62 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -346,6 +346,48 @@ preferred_addr: | |||
346 | /* Set up the stack */ | 346 | /* Set up the stack */ |
347 | leaq boot_stack_end(%rbx), %rsp | 347 | leaq boot_stack_end(%rbx), %rsp |
348 | 348 | ||
349 | #ifdef CONFIG_X86_5LEVEL | ||
350 | /* Check if 5-level paging has already enabled */ | ||
351 | movq %cr4, %rax | ||
352 | testl $X86_CR4_LA57, %eax | ||
353 | jnz lvl5 | ||
354 | |||
355 | /* | ||
356 | * At this point we are in long mode with 4-level paging enabled, | ||
357 | * but we want to enable 5-level paging. | ||
358 | * | ||
359 | * The problem is that we cannot do it directly. Setting LA57 in | ||
360 | * long mode would trigger #GP. So we need to switch off long mode | ||
361 | * first. | ||
362 | * | ||
363 | * NOTE: This is not going to work if bootloader put us above 4G | ||
364 | * limit. | ||
365 | * | ||
366 | * The first step is go into compatibility mode. | ||
367 | */ | ||
368 | |||
369 | /* Clear additional page table */ | ||
370 | leaq lvl5_pgtable(%rbx), %rdi | ||
371 | xorq %rax, %rax | ||
372 | movq $(PAGE_SIZE/8), %rcx | ||
373 | rep stosq | ||
374 | |||
375 | /* | ||
376 | * Setup current CR3 as the first and only entry in a new top level | ||
377 | * page table. | ||
378 | */ | ||
379 | movq %cr3, %rdi | ||
380 | leaq 0x7 (%rdi), %rax | ||
381 | movq %rax, lvl5_pgtable(%rbx) | ||
382 | |||
383 | /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ | ||
384 | pushq $__KERNEL32_CS | ||
385 | leaq compatible_mode(%rip), %rax | ||
386 | pushq %rax | ||
387 | lretq | ||
388 | lvl5: | ||
389 | #endif | ||
390 | |||
349 | /* Zero EFLAGS */ | 391 | /* Zero EFLAGS */ |
350 | pushq $0 | 392 | pushq $0 |
351 | popfq | 393 | popfq |
@@ -429,6 +471,44 @@ relocated: | |||
429 | jmp *%rax | 471 | jmp *%rax |
430 | 472 | ||
431 | .code32 | 473 | .code32 |
474 | #ifdef CONFIG_X86_5LEVEL | ||
475 | compatible_mode: | ||
476 | /* Setup data and stack segments */ | ||
477 | movl $__KERNEL_DS, %eax | ||
478 | movl %eax, %ds | ||
479 | movl %eax, %ss | ||
480 | |||
481 | /* Disable paging */ | ||
482 | movl %cr0, %eax | ||
483 | btrl $X86_CR0_PG_BIT, %eax | ||
484 | movl %eax, %cr0 | ||
485 | |||
486 | /* Point CR3 to 5-level paging */ | ||
487 | leal lvl5_pgtable(%ebx), %eax | ||
488 | movl %eax, %cr3 | ||
489 | |||
490 | /* Enable PAE and LA57 mode */ | ||
491 | movl %cr4, %eax | ||
492 | orl $(X86_CR4_PAE | X86_CR4_LA57), %eax | ||
493 | movl %eax, %cr4 | ||
494 | |||
495 | /* Calculate address we are running at */ | ||
496 | call 1f | ||
497 | 1: popl %edi | ||
498 | subl $1b, %edi | ||
499 | |||
500 | /* Prepare stack for far return to Long Mode */ | ||
501 | pushl $__KERNEL_CS | ||
502 | leal lvl5(%edi), %eax | ||
503 | push %eax | ||
504 | |||
505 | /* Enable paging back */ | ||
506 | movl $(X86_CR0_PG | X86_CR0_PE), %eax | ||
507 | movl %eax, %cr0 | ||
508 | |||
509 | lret | ||
510 | #endif | ||
511 | |||
432 | no_longmode: | 512 | no_longmode: |
433 | /* This isn't an x86-64 CPU so hang */ | 513 | /* This isn't an x86-64 CPU so hang */ |
434 | 1: | 514 | 1: |
@@ -442,7 +522,7 @@ gdt: | |||
442 | .word gdt_end - gdt | 522 | .word gdt_end - gdt |
443 | .long gdt | 523 | .long gdt |
444 | .word 0 | 524 | .word 0 |
445 | .quad 0x0000000000000000 /* NULL descriptor */ | 525 | .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ |
446 | .quad 0x00af9a000000ffff /* __KERNEL_CS */ | 526 | .quad 0x00af9a000000ffff /* __KERNEL_CS */ |
447 | .quad 0x00cf92000000ffff /* __KERNEL_DS */ | 527 | .quad 0x00cf92000000ffff /* __KERNEL_DS */ |
448 | .quad 0x0080890000000000 /* TS descriptor */ | 528 | .quad 0x0080890000000000 /* TS descriptor */ |
@@ -486,3 +566,7 @@ boot_stack_end: | |||
486 | .balign 4096 | 566 | .balign 4096 |
487 | pgtable: | 567 | pgtable: |
488 | .fill BOOT_PGT_SIZE, 1, 0 | 568 | .fill BOOT_PGT_SIZE, 1, 0 |
569 | #ifdef CONFIG_X86_5LEVEL | ||
570 | lvl5_pgtable: | ||
571 | .fill PAGE_SIZE, 1, 0 | ||
572 | #endif | ||
diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index 1d78f1739087..28029be47fbb 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -63,7 +63,7 @@ static void *alloc_pgt_page(void *context) | |||
63 | static struct alloc_pgt_data pgt_data; | 63 | static struct alloc_pgt_data pgt_data; |
64 | 64 | ||
65 | /* The top level page table entry pointer. */ | 65 | /* The top level page table entry pointer. */ |
66 | static unsigned long level4p; | 66 | static unsigned long top_level_pgt; |
67 | 67 | ||
68 | /* | 68 | /* |
69 | * Mapping information structure passed to kernel_ident_mapping_init(). | 69 | * Mapping information structure passed to kernel_ident_mapping_init(). |
@@ -91,9 +91,15 @@ void initialize_identity_maps(void) | |||
91 | * If we came here via startup_32(), cr3 will be _pgtable already | 91 | * If we came here via startup_32(), cr3 will be _pgtable already |
92 | * and we must append to the existing area instead of entirely | 92 | * and we must append to the existing area instead of entirely |
93 | * overwriting it. | 93 | * overwriting it. |
94 | * | ||
95 | * With 5-level paging, we use '_pgtable' to allocate the p4d page table, | ||
96 | * the top-level page table is allocated separately. | ||
97 | * | ||
98 | * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level | ||
99 | * cases. On 4-level paging it's equal to 'top_level_pgt'. | ||
94 | */ | 100 | */ |
95 | level4p = read_cr3(); | 101 | top_level_pgt = read_cr3_pa(); |
96 | if (level4p == (unsigned long)_pgtable) { | 102 | if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) { |
97 | debug_putstr("booted via startup_32()\n"); | 103 | debug_putstr("booted via startup_32()\n"); |
98 | pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; | 104 | pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; |
99 | pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; | 105 | pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; |
@@ -103,7 +109,7 @@ void initialize_identity_maps(void) | |||
103 | pgt_data.pgt_buf = _pgtable; | 109 | pgt_data.pgt_buf = _pgtable; |
104 | pgt_data.pgt_buf_size = BOOT_PGT_SIZE; | 110 | pgt_data.pgt_buf_size = BOOT_PGT_SIZE; |
105 | memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); | 111 | memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); |
106 | level4p = (unsigned long)alloc_pgt_page(&pgt_data); | 112 | top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data); |
107 | } | 113 | } |
108 | } | 114 | } |
109 | 115 | ||
@@ -123,7 +129,7 @@ void add_identity_map(unsigned long start, unsigned long size) | |||
123 | return; | 129 | return; |
124 | 130 | ||
125 | /* Build the mapping. */ | 131 | /* Build the mapping. */ |
126 | kernel_ident_mapping_init(&mapping_info, (pgd_t *)level4p, | 132 | kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt, |
127 | start, end); | 133 | start, end); |
128 | } | 134 | } |
129 | 135 | ||
@@ -134,5 +140,5 @@ void add_identity_map(unsigned long start, unsigned long size) | |||
134 | */ | 140 | */ |
135 | void finalize_identity_maps(void) | 141 | void finalize_identity_maps(void) |
136 | { | 142 | { |
137 | write_cr3(level4p); | 143 | write_cr3(top_level_pgt); |
138 | } | 144 | } |
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 4a4c0834f965..a9a8027a6c0e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -265,7 +265,8 @@ return_from_SYSCALL_64: | |||
265 | * If width of "canonical tail" ever becomes variable, this will need | 265 | * If width of "canonical tail" ever becomes variable, this will need |
266 | * to be updated to remain correct on both old and new CPUs. | 266 | * to be updated to remain correct on both old and new CPUs. |
267 | * | 267 | * |
268 | * Change top 16 bits to be the sign-extension of 47th bit | 268 | * Change top bits to match most significant bit (47th or 56th bit |
269 | * depending on paging mode) in the address. | ||
269 | */ | 270 | */ |
270 | shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx | 271 | shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx |
271 | sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx | 272 | sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx |
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 628b8c556aab..2de0dd73830a 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2111,8 +2111,7 @@ static int x86_pmu_event_init(struct perf_event *event) | |||
2111 | 2111 | ||
2112 | static void refresh_pce(void *ignored) | 2112 | static void refresh_pce(void *ignored) |
2113 | { | 2113 | { |
2114 | if (current->active_mm) | 2114 | load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm)); |
2115 | load_mm_cr4(current->active_mm); | ||
2116 | } | 2115 | } |
2117 | 2116 | ||
2118 | static void x86_pmu_event_mapped(struct perf_event *event) | 2117 | static void x86_pmu_event_mapped(struct perf_event *event) |
@@ -2344,7 +2343,7 @@ static unsigned long get_segment_base(unsigned int segment) | |||
2344 | 2343 | ||
2345 | /* IRQs are off, so this synchronizes with smp_store_release */ | 2344 | /* IRQs are off, so this synchronizes with smp_store_release */ |
2346 | ldt = lockless_dereference(current->active_mm->context.ldt); | 2345 | ldt = lockless_dereference(current->active_mm->context.ldt); |
2347 | if (!ldt || idx > ldt->size) | 2346 | if (!ldt || idx > ldt->nr_entries) |
2348 | return 0; | 2347 | return 0; |
2349 | 2348 | ||
2350 | desc = &ldt->entries[idx]; | 2349 | desc = &ldt->entries[idx]; |
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 2f77bcefe6b4..d2ff779f347e 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -74,7 +74,7 @@ struct efi_scratch { | |||
74 | __kernel_fpu_begin(); \ | 74 | __kernel_fpu_begin(); \ |
75 | \ | 75 | \ |
76 | if (efi_scratch.use_pgd) { \ | 76 | if (efi_scratch.use_pgd) { \ |
77 | efi_scratch.prev_cr3 = read_cr3(); \ | 77 | efi_scratch.prev_cr3 = __read_cr3(); \ |
78 | write_cr3((unsigned long)efi_scratch.efi_pgt); \ | 78 | write_cr3((unsigned long)efi_scratch.efi_pgt); \ |
79 | __flush_tlb_all(); \ | 79 | __flush_tlb_all(); \ |
80 | } \ | 80 | } \ |
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 59405a248fc2..9b76cd331990 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -22,8 +22,8 @@ typedef struct { | |||
22 | #ifdef CONFIG_SMP | 22 | #ifdef CONFIG_SMP |
23 | unsigned int irq_resched_count; | 23 | unsigned int irq_resched_count; |
24 | unsigned int irq_call_count; | 24 | unsigned int irq_call_count; |
25 | unsigned int irq_tlb_count; | ||
26 | #endif | 25 | #endif |
26 | unsigned int irq_tlb_count; | ||
27 | #ifdef CONFIG_X86_THERMAL_VECTOR | 27 | #ifdef CONFIG_X86_THERMAL_VECTOR |
28 | unsigned int irq_thermal_count; | 28 | unsigned int irq_thermal_count; |
29 | #endif | 29 | #endif |
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index f9813b6d8b80..79b647a7ebd0 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -37,12 +37,6 @@ typedef struct { | |||
37 | #endif | 37 | #endif |
38 | } mm_context_t; | 38 | } mm_context_t; |
39 | 39 | ||
40 | #ifdef CONFIG_SMP | ||
41 | void leave_mm(int cpu); | 40 | void leave_mm(int cpu); |
42 | #else | ||
43 | static inline void leave_mm(int cpu) | ||
44 | { | ||
45 | } | ||
46 | #endif | ||
47 | 41 | ||
48 | #endif /* _ASM_X86_MMU_H */ | 42 | #endif /* _ASM_X86_MMU_H */ |
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 68b329d77b3a..ecfcb6643c9b 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -47,7 +47,7 @@ struct ldt_struct { | |||
47 | * allocations, but it's not worth trying to optimize. | 47 | * allocations, but it's not worth trying to optimize. |
48 | */ | 48 | */ |
49 | struct desc_struct *entries; | 49 | struct desc_struct *entries; |
50 | unsigned int size; | 50 | unsigned int nr_entries; |
51 | }; | 51 | }; |
52 | 52 | ||
53 | /* | 53 | /* |
@@ -87,22 +87,46 @@ static inline void load_mm_ldt(struct mm_struct *mm) | |||
87 | */ | 87 | */ |
88 | 88 | ||
89 | if (unlikely(ldt)) | 89 | if (unlikely(ldt)) |
90 | set_ldt(ldt->entries, ldt->size); | 90 | set_ldt(ldt->entries, ldt->nr_entries); |
91 | else | 91 | else |
92 | clear_LDT(); | 92 | clear_LDT(); |
93 | #else | 93 | #else |
94 | clear_LDT(); | 94 | clear_LDT(); |
95 | #endif | 95 | #endif |
96 | } | ||
97 | |||
98 | static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) | ||
99 | { | ||
100 | #ifdef CONFIG_MODIFY_LDT_SYSCALL | ||
101 | /* | ||
102 | * Load the LDT if either the old or new mm had an LDT. | ||
103 | * | ||
104 | * An mm will never go from having an LDT to not having an LDT. Two | ||
105 | * mms never share an LDT, so we don't gain anything by checking to | ||
106 | * see whether the LDT changed. There's also no guarantee that | ||
107 | * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL, | ||
108 | * then prev->context.ldt will also be non-NULL. | ||
109 | * | ||
110 | * If we really cared, we could optimize the case where prev == next | ||
111 | * and we're exiting lazy mode. Most of the time, if this happens, | ||
112 | * we don't actually need to reload LDTR, but modify_ldt() is mostly | ||
113 | * used by legacy code and emulators where we don't need this level of | ||
114 | * performance. | ||
115 | * | ||
116 | * This uses | instead of || because it generates better code. | ||
117 | */ | ||
118 | if (unlikely((unsigned long)prev->context.ldt | | ||
119 | (unsigned long)next->context.ldt)) | ||
120 | load_mm_ldt(next); | ||
121 | #endif | ||
96 | 122 | ||
97 | DEBUG_LOCKS_WARN_ON(preemptible()); | 123 | DEBUG_LOCKS_WARN_ON(preemptible()); |
98 | } | 124 | } |
99 | 125 | ||
100 | static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) | 126 | static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) |
101 | { | 127 | { |
102 | #ifdef CONFIG_SMP | ||
103 | if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) | 128 | if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) |
104 | this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); | 129 | this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); |
105 | #endif | ||
106 | } | 130 | } |
107 | 131 | ||
108 | static inline int init_new_context(struct task_struct *tsk, | 132 | static inline int init_new_context(struct task_struct *tsk, |
@@ -220,18 +244,6 @@ static inline int vma_pkey(struct vm_area_struct *vma) | |||
220 | } | 244 | } |
221 | #endif | 245 | #endif |
222 | 246 | ||
223 | static inline bool __pkru_allows_pkey(u16 pkey, bool write) | ||
224 | { | ||
225 | u32 pkru = read_pkru(); | ||
226 | |||
227 | if (!__pkru_allows_read(pkru, pkey)) | ||
228 | return false; | ||
229 | if (write && !__pkru_allows_write(pkru, pkey)) | ||
230 | return false; | ||
231 | |||
232 | return true; | ||
233 | } | ||
234 | |||
235 | /* | 247 | /* |
236 | * We only want to enforce protection keys on the current process | 248 | * We only want to enforce protection keys on the current process |
237 | * because we effectively have no access to PKRU for other | 249 | * because we effectively have no access to PKRU for other |
@@ -268,4 +280,23 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, | |||
268 | return __pkru_allows_pkey(vma_pkey(vma), write); | 280 | return __pkru_allows_pkey(vma_pkey(vma), write); |
269 | } | 281 | } |
270 | 282 | ||
283 | |||
284 | /* | ||
285 | * This can be used from process context to figure out what the value of | ||
286 | * CR3 is without needing to do a (slow) __read_cr3(). | ||
287 | * | ||
288 | * It's intended to be used for code like KVM that sneakily changes CR3 | ||
289 | * and needs to restore it. It needs to be used very carefully. | ||
290 | */ | ||
291 | static inline unsigned long __get_current_cr3_fast(void) | ||
292 | { | ||
293 | unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd); | ||
294 | |||
295 | /* For now, be very restrictive about when this can be called. */ | ||
296 | VM_WARN_ON(in_nmi() || !in_atomic()); | ||
297 | |||
298 | VM_BUG_ON(cr3 != __read_cr3()); | ||
299 | return cr3; | ||
300 | } | ||
301 | |||
271 | #endif /* _ASM_X86_MMU_CONTEXT_H */ | 302 | #endif /* _ASM_X86_MMU_CONTEXT_H */ |
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index a3dcf8944cb9..9ccac1926587 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x) | |||
61 | PVOP_VCALL1(pv_mmu_ops.write_cr2, x); | 61 | PVOP_VCALL1(pv_mmu_ops.write_cr2, x); |
62 | } | 62 | } |
63 | 63 | ||
64 | static inline unsigned long read_cr3(void) | 64 | static inline unsigned long __read_cr3(void) |
65 | { | 65 | { |
66 | return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3); | 66 | return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3); |
67 | } | 67 | } |
@@ -312,11 +312,9 @@ static inline void __flush_tlb_single(unsigned long addr) | |||
312 | } | 312 | } |
313 | 313 | ||
314 | static inline void flush_tlb_others(const struct cpumask *cpumask, | 314 | static inline void flush_tlb_others(const struct cpumask *cpumask, |
315 | struct mm_struct *mm, | 315 | const struct flush_tlb_info *info) |
316 | unsigned long start, | ||
317 | unsigned long end) | ||
318 | { | 316 | { |
319 | PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end); | 317 | PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info); |
320 | } | 318 | } |
321 | 319 | ||
322 | static inline int paravirt_pgd_alloc(struct mm_struct *mm) | 320 | static inline int paravirt_pgd_alloc(struct mm_struct *mm) |
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 7465d6fe336f..cb976bab6299 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -51,6 +51,7 @@ struct mm_struct; | |||
51 | struct desc_struct; | 51 | struct desc_struct; |
52 | struct task_struct; | 52 | struct task_struct; |
53 | struct cpumask; | 53 | struct cpumask; |
54 | struct flush_tlb_info; | ||
54 | 55 | ||
55 | /* | 56 | /* |
56 | * Wrapper type for pointers to code which uses the non-standard | 57 | * Wrapper type for pointers to code which uses the non-standard |
@@ -223,9 +224,7 @@ struct pv_mmu_ops { | |||
223 | void (*flush_tlb_kernel)(void); | 224 | void (*flush_tlb_kernel)(void); |
224 | void (*flush_tlb_single)(unsigned long addr); | 225 | void (*flush_tlb_single)(unsigned long addr); |
225 | void (*flush_tlb_others)(const struct cpumask *cpus, | 226 | void (*flush_tlb_others)(const struct cpumask *cpus, |
226 | struct mm_struct *mm, | 227 | const struct flush_tlb_info *info); |
227 | unsigned long start, | ||
228 | unsigned long end); | ||
229 | 228 | ||
230 | /* Hooks for allocating and freeing a pagetable top-level */ | 229 | /* Hooks for allocating and freeing a pagetable top-level */ |
231 | int (*pgd_alloc)(struct mm_struct *mm); | 230 | int (*pgd_alloc)(struct mm_struct *mm); |
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 50d35e3185f5..c8821bab938f 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -212,4 +212,51 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp) | |||
212 | #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) | 212 | #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) |
213 | #define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) | 213 | #define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) |
214 | 214 | ||
215 | #define gup_get_pte gup_get_pte | ||
216 | /* | ||
217 | * WARNING: only to be used in the get_user_pages_fast() implementation. | ||
218 | * | ||
219 | * With get_user_pages_fast(), we walk down the pagetables without taking | ||
220 | * any locks. For this we would like to load the pointers atomically, | ||
221 | * but that is not possible (without expensive cmpxchg8b) on PAE. What | ||
222 | * we do have is the guarantee that a PTE will only either go from not | ||
223 | * present to present, or present to not present or both -- it will not | ||
224 | * switch to a completely different present page without a TLB flush in | ||
225 | * between; something that we are blocking by holding interrupts off. | ||
226 | * | ||
227 | * Setting ptes from not present to present goes: | ||
228 | * | ||
229 | * ptep->pte_high = h; | ||
230 | * smp_wmb(); | ||
231 | * ptep->pte_low = l; | ||
232 | * | ||
233 | * And present to not present goes: | ||
234 | * | ||
235 | * ptep->pte_low = 0; | ||
236 | * smp_wmb(); | ||
237 | * ptep->pte_high = 0; | ||
238 | * | ||
239 | * We must ensure here that the load of pte_low sees 'l' iff pte_high | ||
240 | * sees 'h'. We load pte_high *after* loading pte_low, which ensures we | ||
241 | * don't see an older value of pte_high. *Then* we recheck pte_low, | ||
242 | * which ensures that we haven't picked up a changed pte high. We might | ||
243 | * have gotten rubbish values from pte_low and pte_high, but we are | ||
244 | * guaranteed that pte_low will not have the present bit set *unless* | ||
245 | * it is 'l'. Because get_user_pages_fast() only operates on present ptes | ||
246 | * we're safe. | ||
247 | */ | ||
248 | static inline pte_t gup_get_pte(pte_t *ptep) | ||
249 | { | ||
250 | pte_t pte; | ||
251 | |||
252 | do { | ||
253 | pte.pte_low = ptep->pte_low; | ||
254 | smp_rmb(); | ||
255 | pte.pte_high = ptep->pte_high; | ||
256 | smp_rmb(); | ||
257 | } while (unlikely(pte.pte_low != ptep->pte_low)); | ||
258 | |||
259 | return pte; | ||
260 | } | ||
261 | |||
215 | #endif /* _ASM_X86_PGTABLE_3LEVEL_H */ | 262 | #endif /* _ASM_X86_PGTABLE_3LEVEL_H */ |
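The gup_get_pte() loop above only works because the writer side follows the two store orderings spelled out in its comment; written out as code they look like this (illustrative, paraphrasing the comment rather than quoting any particular kernel function):

	/* not present -> present: make pte_low (with the present bit) visible last */
	ptep->pte_high = h;
	smp_wmb();
	ptep->pte_low = l;

	/* present -> not present: clear the present bit in pte_low first */
	ptep->pte_low = 0;
	smp_wmb();
	ptep->pte_high = 0;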
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index f5af95a0c6b8..77037b6f1caa 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -244,6 +244,11 @@ static inline int pud_devmap(pud_t pud) | |||
244 | return 0; | 244 | return 0; |
245 | } | 245 | } |
246 | #endif | 246 | #endif |
247 | |||
248 | static inline int pgd_devmap(pgd_t pgd) | ||
249 | { | ||
250 | return 0; | ||
251 | } | ||
247 | #endif | 252 | #endif |
248 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 253 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
249 | 254 | ||
@@ -917,7 +922,7 @@ extern pgd_t trampoline_pgd_entry; | |||
917 | static inline void __meminit init_trampoline_default(void) | 922 | static inline void __meminit init_trampoline_default(void) |
918 | { | 923 | { |
919 | /* Default trampoline pgd value */ | 924 | /* Default trampoline pgd value */ |
920 | trampoline_pgd_entry = init_level4_pgt[pgd_index(__PAGE_OFFSET)]; | 925 | trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)]; |
921 | } | 926 | } |
922 | # ifdef CONFIG_RANDOMIZE_MEMORY | 927 | # ifdef CONFIG_RANDOMIZE_MEMORY |
923 | void __meminit init_trampoline(void); | 928 | void __meminit init_trampoline(void); |
@@ -1185,6 +1190,54 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags) | |||
1185 | #endif | 1190 | #endif |
1186 | } | 1191 | } |
1187 | 1192 | ||
1193 | static inline bool __pkru_allows_pkey(u16 pkey, bool write) | ||
1194 | { | ||
1195 | u32 pkru = read_pkru(); | ||
1196 | |||
1197 | if (!__pkru_allows_read(pkru, pkey)) | ||
1198 | return false; | ||
1199 | if (write && !__pkru_allows_write(pkru, pkey)) | ||
1200 | return false; | ||
1201 | |||
1202 | return true; | ||
1203 | } | ||
1204 | |||
1205 | /* | ||
1206 | * 'pteval' can come from a PTE, PMD or PUD. We only check | ||
1207 | * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the | ||
1208 | * same value on all 3 types. | ||
1209 | */ | ||
1210 | static inline bool __pte_access_permitted(unsigned long pteval, bool write) | ||
1211 | { | ||
1212 | unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; | ||
1213 | |||
1214 | if (write) | ||
1215 | need_pte_bits |= _PAGE_RW; | ||
1216 | |||
1217 | if ((pteval & need_pte_bits) != need_pte_bits) | ||
1218 | return 0; | ||
1219 | |||
1220 | return __pkru_allows_pkey(pte_flags_pkey(pteval), write); | ||
1221 | } | ||
1222 | |||
1223 | #define pte_access_permitted pte_access_permitted | ||
1224 | static inline bool pte_access_permitted(pte_t pte, bool write) | ||
1225 | { | ||
1226 | return __pte_access_permitted(pte_val(pte), write); | ||
1227 | } | ||
1228 | |||
1229 | #define pmd_access_permitted pmd_access_permitted | ||
1230 | static inline bool pmd_access_permitted(pmd_t pmd, bool write) | ||
1231 | { | ||
1232 | return __pte_access_permitted(pmd_val(pmd), write); | ||
1233 | } | ||
1234 | |||
1235 | #define pud_access_permitted pud_access_permitted | ||
1236 | static inline bool pud_access_permitted(pud_t pud, bool write) | ||
1237 | { | ||
1238 | return __pte_access_permitted(pud_val(pud), write); | ||
1239 | } | ||
1240 | |||
1188 | #include <asm-generic/pgtable.h> | 1241 | #include <asm-generic/pgtable.h> |
1189 | #endif /* __ASSEMBLY__ */ | 1242 | #endif /* __ASSEMBLY__ */ |
1190 | 1243 | ||
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 9991224f6238..2160c1fee920 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -14,15 +14,17 @@ | |||
14 | #include <linux/bitops.h> | 14 | #include <linux/bitops.h> |
15 | #include <linux/threads.h> | 15 | #include <linux/threads.h> |
16 | 16 | ||
17 | extern p4d_t level4_kernel_pgt[512]; | ||
18 | extern p4d_t level4_ident_pgt[512]; | ||
17 | extern pud_t level3_kernel_pgt[512]; | 19 | extern pud_t level3_kernel_pgt[512]; |
18 | extern pud_t level3_ident_pgt[512]; | 20 | extern pud_t level3_ident_pgt[512]; |
19 | extern pmd_t level2_kernel_pgt[512]; | 21 | extern pmd_t level2_kernel_pgt[512]; |
20 | extern pmd_t level2_fixmap_pgt[512]; | 22 | extern pmd_t level2_fixmap_pgt[512]; |
21 | extern pmd_t level2_ident_pgt[512]; | 23 | extern pmd_t level2_ident_pgt[512]; |
22 | extern pte_t level1_fixmap_pgt[512]; | 24 | extern pte_t level1_fixmap_pgt[512]; |
23 | extern pgd_t init_level4_pgt[]; | 25 | extern pgd_t init_top_pgt[]; |
24 | 26 | ||
25 | #define swapper_pg_dir init_level4_pgt | 27 | #define swapper_pg_dir init_top_pgt |
26 | 28 | ||
27 | extern void paging_init(void); | 29 | extern void paging_init(void); |
28 | 30 | ||
@@ -227,6 +229,20 @@ extern void cleanup_highmap(void); | |||
227 | extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); | 229 | extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); |
228 | extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); | 230 | extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); |
229 | 231 | ||
230 | #endif /* !__ASSEMBLY__ */ | 232 | #define gup_fast_permitted gup_fast_permitted |
233 | static inline bool gup_fast_permitted(unsigned long start, int nr_pages, | ||
234 | int write) | ||
235 | { | ||
236 | unsigned long len, end; | ||
237 | |||
238 | len = (unsigned long)nr_pages << PAGE_SHIFT; | ||
239 | end = start + len; | ||
240 | if (end < start) | ||
241 | return false; | ||
242 | if (end >> __VIRTUAL_MASK_SHIFT) | ||
243 | return false; | ||
244 | return true; | ||
245 | } | ||
231 | 246 | ||
247 | #endif /* !__ASSEMBLY__ */ | ||
232 | #endif /* _ASM_X86_PGTABLE_64_H */ | 248 | #endif /* _ASM_X86_PGTABLE_64_H */ |
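The gup_fast_permitted() hook added above is the small piece of glue that lets x86 drop its private GUP walker and rely on the generic get_user_pages_fast(): it only has to reject ranges that wrap or reach beyond the user-visible virtual address space. A standalone sketch of that check, assuming __VIRTUAL_MASK_SHIFT is 47 (4-level paging; it becomes 56 with 5-level paging) and a 4 KiB PAGE_SHIFT of 12; the function name is made up for illustration:

	static int example_gup_fast_permitted(unsigned long start, int nr_pages)
	{
		unsigned long len = (unsigned long)nr_pages << 12;	/* PAGE_SHIFT */
		unsigned long end = start + len;

		if (end < start)		/* the range wrapped around */
			return 0;
		if (end >> 47)			/* beyond the user address mask */
			return 0;
		return 1;
	}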
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 39fb618e2211..79aa2f98398d 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -8,4 +8,40 @@ | |||
8 | #else | 8 | #else |
9 | #define X86_VM_MASK 0 /* No VM86 support */ | 9 | #define X86_VM_MASK 0 /* No VM86 support */ |
10 | #endif | 10 | #endif |
11 | |||
12 | /* | ||
13 | * CR3's layout varies depending on several things. | ||
14 | * | ||
15 | * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID. | ||
16 | * If PAE is enabled, then CR3[11:5] is part of the PDPT address | ||
17 | * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored. | ||
18 | * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and | ||
19 | * CR3[2:0] and CR3[11:5] are ignored. | ||
20 | * | ||
21 | * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD. | ||
22 | * | ||
23 | * CR3[63] is always read as zero. If CR4.PCIDE is set, then CR3[63] may be | ||
24 | * written as 1 to prevent the write to CR3 from flushing the TLB. | ||
25 | * | ||
26 | * On systems with SME, one bit (in a variable position!) is stolen to indicate | ||
27 | * that the top-level paging structure is encrypted. | ||
28 | * | ||
29 | * All of the remaining bits indicate the physical address of the top-level | ||
30 | * paging structure. | ||
31 | * | ||
32 | * CR3_ADDR_MASK is the mask used by read_cr3_pa(). | ||
33 | */ | ||
34 | #ifdef CONFIG_X86_64 | ||
35 | /* Mask off the address space ID bits. */ | ||
36 | #define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull | ||
37 | #define CR3_PCID_MASK 0xFFFull | ||
38 | #else | ||
39 | /* | ||
40 | * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save | ||
41 | * a tiny bit of code size by setting all the bits. | ||
42 | */ | ||
43 | #define CR3_ADDR_MASK 0xFFFFFFFFull | ||
44 | #define CR3_PCID_MASK 0ull | ||
45 | #endif | ||
46 | |||
11 | #endif /* _ASM_X86_PROCESSOR_FLAGS_H */ | 47 | #endif /* _ASM_X86_PROCESSOR_FLAGS_H */ |
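The layout comment above is easiest to see with a concrete value: on a 64-bit kernel the two new masks split CR3 into the physical address of the top-level page table and the address-space ID. The sample value below is arbitrary and not taken from the patch:

	unsigned long cr3  = 0x000000012345f001UL;	/* arbitrary sample value */
	unsigned long pa   = cr3 & CR3_ADDR_MASK;	/* 0x12345f000: table address */
	unsigned long pcid = cr3 & CR3_PCID_MASK;	/* 0x001: address space ID */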
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a28b671f1549..2e1696294af5 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -231,6 +231,14 @@ native_cpuid_reg(ebx) | |||
231 | native_cpuid_reg(ecx) | 231 | native_cpuid_reg(ecx) |
232 | native_cpuid_reg(edx) | 232 | native_cpuid_reg(edx) |
233 | 233 | ||
234 | /* | ||
235 | * Friendlier CR3 helpers. | ||
236 | */ | ||
237 | static inline unsigned long read_cr3_pa(void) | ||
238 | { | ||
239 | return __read_cr3() & CR3_ADDR_MASK; | ||
240 | } | ||
241 | |||
234 | static inline void load_cr3(pgd_t *pgdir) | 242 | static inline void load_cr3(pgd_t *pgdir) |
235 | { | 243 | { |
236 | write_cr3(__pa(pgdir)); | 244 | write_cr3(__pa(pgdir)); |
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 12af3e35edfa..9efaabf5b54b 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -39,7 +39,7 @@ static inline void native_write_cr2(unsigned long val) | |||
39 | asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); | 39 | asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); |
40 | } | 40 | } |
41 | 41 | ||
42 | static inline unsigned long native_read_cr3(void) | 42 | static inline unsigned long __native_read_cr3(void) |
43 | { | 43 | { |
44 | unsigned long val; | 44 | unsigned long val; |
45 | asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); | 45 | asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); |
@@ -159,9 +159,13 @@ static inline void write_cr2(unsigned long x) | |||
159 | native_write_cr2(x); | 159 | native_write_cr2(x); |
160 | } | 160 | } |
161 | 161 | ||
162 | static inline unsigned long read_cr3(void) | 162 | /* |
163 | * Careful! CR3 contains more than just an address. You probably want | ||
164 | * read_cr3_pa() instead. | ||
165 | */ | ||
166 | static inline unsigned long __read_cr3(void) | ||
163 | { | 167 | { |
164 | return native_read_cr3(); | 168 | return __native_read_cr3(); |
165 | } | 169 | } |
166 | 170 | ||
167 | static inline void write_cr3(unsigned long x) | 171 | static inline void write_cr3(unsigned long x) |
diff --git a/arch/x86/include/asm/tlbbatch.h b/arch/x86/include/asm/tlbbatch.h
new file mode 100644
index 000000000000..f4a6ff352a0e
--- /dev/null
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -0,0 +1,14 @@ | |||
1 | #ifndef _ARCH_X86_TLBBATCH_H | ||
2 | #define _ARCH_X86_TLBBATCH_H | ||
3 | |||
4 | #include <linux/cpumask.h> | ||
5 | |||
6 | struct arch_tlbflush_unmap_batch { | ||
7 | /* | ||
8 | * Each bit set is a CPU that potentially has a TLB entry for one of | ||
9 | * the PFNs being flushed.. | ||
10 | */ | ||
11 | struct cpumask cpumask; | ||
12 | }; | ||
13 | |||
14 | #endif /* _ARCH_X86_TLBBATCH_H */ | ||
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6ed9ea469b48..50ea3482e1d1 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -7,6 +7,7 @@ | |||
7 | #include <asm/processor.h> | 7 | #include <asm/processor.h> |
8 | #include <asm/cpufeature.h> | 8 | #include <asm/cpufeature.h> |
9 | #include <asm/special_insns.h> | 9 | #include <asm/special_insns.h> |
10 | #include <asm/smp.h> | ||
10 | 11 | ||
11 | static inline void __invpcid(unsigned long pcid, unsigned long addr, | 12 | static inline void __invpcid(unsigned long pcid, unsigned long addr, |
12 | unsigned long type) | 13 | unsigned long type) |
@@ -65,10 +66,14 @@ static inline void invpcid_flush_all_nonglobals(void) | |||
65 | #endif | 66 | #endif |
66 | 67 | ||
67 | struct tlb_state { | 68 | struct tlb_state { |
68 | #ifdef CONFIG_SMP | 69 | /* |
69 | struct mm_struct *active_mm; | 70 | * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts |
71 | * are on. This means that it may not match current->active_mm, | ||
72 | * which will contain the previous user mm when we're in lazy TLB | ||
73 | * mode even if we've already switched back to swapper_pg_dir. | ||
74 | */ | ||
75 | struct mm_struct *loaded_mm; | ||
70 | int state; | 76 | int state; |
71 | #endif | ||
72 | 77 | ||
73 | /* | 78 | /* |
74 | * Access to this CR4 shadow and to H/W CR4 is protected by | 79 | * Access to this CR4 shadow and to H/W CR4 is protected by |
@@ -151,7 +156,7 @@ static inline void __native_flush_tlb(void) | |||
151 | * back: | 156 | * back: |
152 | */ | 157 | */ |
153 | preempt_disable(); | 158 | preempt_disable(); |
154 | native_write_cr3(native_read_cr3()); | 159 | native_write_cr3(__native_read_cr3()); |
155 | preempt_enable(); | 160 | preempt_enable(); |
156 | } | 161 | } |
157 | 162 | ||
@@ -220,84 +225,16 @@ static inline void __flush_tlb_one(unsigned long addr) | |||
220 | * - flush_tlb_page(vma, vmaddr) flushes one page | 225 | * - flush_tlb_page(vma, vmaddr) flushes one page |
221 | * - flush_tlb_range(vma, start, end) flushes a range of pages | 226 | * - flush_tlb_range(vma, start, end) flushes a range of pages |
222 | * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages | 227 | * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages |
223 | * - flush_tlb_others(cpumask, mm, start, end) flushes TLBs on other cpus | 228 | * - flush_tlb_others(cpumask, info) flushes TLBs on other cpus |
224 | * | 229 | * |
225 | * ..but the i386 has somewhat limited tlb flushing capabilities, | 230 | * ..but the i386 has somewhat limited tlb flushing capabilities, |
226 | * and page-granular flushes are available only on i486 and up. | 231 | * and page-granular flushes are available only on i486 and up. |
227 | */ | 232 | */ |
228 | 233 | struct flush_tlb_info { | |
229 | #ifndef CONFIG_SMP | 234 | struct mm_struct *mm; |
230 | 235 | unsigned long start; | |
231 | /* "_up" is for UniProcessor. | 236 | unsigned long end; |
232 | * | 237 | }; |
233 | * This is a helper for other header functions. *Not* intended to be called | ||
234 | * directly. All global TLB flushes need to either call this, or to bump the | ||
235 | * vm statistics themselves. | ||
236 | */ | ||
237 | static inline void __flush_tlb_up(void) | ||
238 | { | ||
239 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); | ||
240 | __flush_tlb(); | ||
241 | } | ||
242 | |||
243 | static inline void flush_tlb_all(void) | ||
244 | { | ||
245 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); | ||
246 | __flush_tlb_all(); | ||
247 | } | ||
248 | |||
249 | static inline void local_flush_tlb(void) | ||
250 | { | ||
251 | __flush_tlb_up(); | ||
252 | } | ||
253 | |||
254 | static inline void flush_tlb_mm(struct mm_struct *mm) | ||
255 | { | ||
256 | if (mm == current->active_mm) | ||
257 | __flush_tlb_up(); | ||
258 | } | ||
259 | |||
260 | static inline void flush_tlb_page(struct vm_area_struct *vma, | ||
261 | unsigned long addr) | ||
262 | { | ||
263 | if (vma->vm_mm == current->active_mm) | ||
264 | __flush_tlb_one(addr); | ||
265 | } | ||
266 | |||
267 | static inline void flush_tlb_range(struct vm_area_struct *vma, | ||
268 | unsigned long start, unsigned long end) | ||
269 | { | ||
270 | if (vma->vm_mm == current->active_mm) | ||
271 | __flush_tlb_up(); | ||
272 | } | ||
273 | |||
274 | static inline void flush_tlb_mm_range(struct mm_struct *mm, | ||
275 | unsigned long start, unsigned long end, unsigned long vmflag) | ||
276 | { | ||
277 | if (mm == current->active_mm) | ||
278 | __flush_tlb_up(); | ||
279 | } | ||
280 | |||
281 | static inline void native_flush_tlb_others(const struct cpumask *cpumask, | ||
282 | struct mm_struct *mm, | ||
283 | unsigned long start, | ||
284 | unsigned long end) | ||
285 | { | ||
286 | } | ||
287 | |||
288 | static inline void reset_lazy_tlbstate(void) | ||
289 | { | ||
290 | } | ||
291 | |||
292 | static inline void flush_tlb_kernel_range(unsigned long start, | ||
293 | unsigned long end) | ||
294 | { | ||
295 | flush_tlb_all(); | ||
296 | } | ||
297 | |||
298 | #else /* SMP */ | ||
299 | |||
300 | #include <asm/smp.h> | ||
301 | 238 | ||
302 | #define local_flush_tlb() __flush_tlb() | 239 | #define local_flush_tlb() __flush_tlb() |
303 | 240 | ||
@@ -307,29 +244,32 @@ static inline void flush_tlb_kernel_range(unsigned long start, | |||
307 | flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) | 244 | flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) |
308 | 245 | ||
309 | extern void flush_tlb_all(void); | 246 | extern void flush_tlb_all(void); |
310 | extern void flush_tlb_page(struct vm_area_struct *, unsigned long); | ||
311 | extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, | 247 | extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, |
312 | unsigned long end, unsigned long vmflag); | 248 | unsigned long end, unsigned long vmflag); |
313 | extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); | 249 | extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); |
314 | 250 | ||
251 | static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) | ||
252 | { | ||
253 | flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE); | ||
254 | } | ||
255 | |||
315 | void native_flush_tlb_others(const struct cpumask *cpumask, | 256 | void native_flush_tlb_others(const struct cpumask *cpumask, |
316 | struct mm_struct *mm, | 257 | const struct flush_tlb_info *info); |
317 | unsigned long start, unsigned long end); | ||
318 | 258 | ||
319 | #define TLBSTATE_OK 1 | 259 | #define TLBSTATE_OK 1 |
320 | #define TLBSTATE_LAZY 2 | 260 | #define TLBSTATE_LAZY 2 |
321 | 261 | ||
322 | static inline void reset_lazy_tlbstate(void) | 262 | static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, |
263 | struct mm_struct *mm) | ||
323 | { | 264 | { |
324 | this_cpu_write(cpu_tlbstate.state, 0); | 265 | cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); |
325 | this_cpu_write(cpu_tlbstate.active_mm, &init_mm); | ||
326 | } | 266 | } |
327 | 267 | ||
328 | #endif /* SMP */ | 268 | extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); |
329 | 269 | ||
330 | #ifndef CONFIG_PARAVIRT | 270 | #ifndef CONFIG_PARAVIRT |
331 | #define flush_tlb_others(mask, mm, start, end) \ | 271 | #define flush_tlb_others(mask, info) \ |
332 | native_flush_tlb_others(mask, mm, start, end) | 272 | native_flush_tlb_others(mask, info) |
333 | #endif | 273 | #endif |
334 | 274 | ||
335 | #endif /* _ASM_X86_TLBFLUSH_H */ | 275 | #endif /* _ASM_X86_TLBFLUSH_H */ |
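With ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH now selected unconditionally (see the Kconfig hunk at the top), remote flushes during unmap are accumulated through the two batch helpers declared above. A minimal sketch of how a hypothetical caller would drive them, with the actual unmap work elided:

	static void example_batched_unmap(struct arch_tlbflush_unmap_batch *batch,
					  struct mm_struct *mm)
	{
		/* Record every CPU that may hold TLB entries for this mm. */
		arch_tlbbatch_add_mm(batch, mm);

		/* ... unmap the pages belonging to 'mm' ... */

		/* One flush round covers everything accumulated in the batch. */
		arch_tlbbatch_flush(batch);
	}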
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index 6686820feae9..b5a32231abd8 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -1,6 +1,8 @@ | |||
1 | #ifndef _ASM_X86_UV_UV_H | 1 | #ifndef _ASM_X86_UV_UV_H |
2 | #define _ASM_X86_UV_UV_H | 2 | #define _ASM_X86_UV_UV_H |
3 | 3 | ||
4 | #include <asm/tlbflush.h> | ||
5 | |||
4 | enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; | 6 | enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; |
5 | 7 | ||
6 | struct cpumask; | 8 | struct cpumask; |
@@ -15,10 +17,7 @@ extern void uv_cpu_init(void); | |||
15 | extern void uv_nmi_init(void); | 17 | extern void uv_nmi_init(void); |
16 | extern void uv_system_init(void); | 18 | extern void uv_system_init(void); |
17 | extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, | 19 | extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, |
18 | struct mm_struct *mm, | 20 | const struct flush_tlb_info *info); |
19 | unsigned long start, | ||
20 | unsigned long end, | ||
21 | unsigned int cpu); | ||
22 | 21 | ||
23 | #else /* X86_UV */ | 22 | #else /* X86_UV */ |
24 | 23 | ||
@@ -28,8 +27,8 @@ static inline int is_uv_hubless(void) { return 0; } | |||
28 | static inline void uv_cpu_init(void) { } | 27 | static inline void uv_cpu_init(void) { } |
29 | static inline void uv_system_init(void) { } | 28 | static inline void uv_system_init(void) { } |
30 | static inline const struct cpumask * | 29 | static inline const struct cpumask * |
31 | uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, | 30 | uv_flush_tlb_others(const struct cpumask *cpumask, |
32 | unsigned long start, unsigned long end, unsigned int cpu) | 31 | const struct flush_tlb_info *info) |
33 | { return cpumask; } | 32 | { return cpumask; } |
34 | 33 | ||
35 | #endif /* X86_UV */ | 34 | #endif /* X86_UV */ |
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 567de50a4c2a..185f3d10c194 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -104,6 +104,8 @@ | |||
104 | #define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT) | 104 | #define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT) |
105 | #define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */ | 105 | #define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */ |
106 | #define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT) | 106 | #define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT) |
107 | #define X86_CR4_LA57_BIT 12 /* enable 5-level page tables */ | ||
108 | #define X86_CR4_LA57 _BITUL(X86_CR4_LA57_BIT) | ||
107 | #define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */ | 109 | #define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */ |
108 | #define X86_CR4_VMXE _BITUL(X86_CR4_VMXE_BIT) | 110 | #define X86_CR4_VMXE _BITUL(X86_CR4_VMXE_BIT) |
109 | #define X86_CR4_SMXE_BIT 14 /* enable safer mode (TXT) */ | 111 | #define X86_CR4_SMXE_BIT 14 /* enable safer mode (TXT) */ |
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 3c7c419c4e3e..a01892bdd61a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -18,6 +18,7 @@ CFLAGS_REMOVE_pvclock.o = -pg | |||
18 | CFLAGS_REMOVE_kvmclock.o = -pg | 18 | CFLAGS_REMOVE_kvmclock.o = -pg |
19 | CFLAGS_REMOVE_ftrace.o = -pg | 19 | CFLAGS_REMOVE_ftrace.o = -pg |
20 | CFLAGS_REMOVE_early_printk.o = -pg | 20 | CFLAGS_REMOVE_early_printk.o = -pg |
21 | CFLAGS_REMOVE_head64.o = -pg | ||
21 | endif | 22 | endif |
22 | 23 | ||
23 | KASAN_SANITIZE_head$(BITS).o := n | 24 | KASAN_SANITIZE_head$(BITS).o := n |
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 8e598a1ad986..6b91e2eb8d3f 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -125,7 +125,7 @@ void __init init_espfix_bsp(void) | |||
125 | p4d_t *p4d; | 125 | p4d_t *p4d; |
126 | 126 | ||
127 | /* Install the espfix pud into the kernel page directory */ | 127 | /* Install the espfix pud into the kernel page directory */ |
128 | pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; | 128 | pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)]; |
129 | p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR); | 129 | p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR); |
130 | p4d_populate(&init_mm, p4d, espfix_pud_page); | 130 | p4d_populate(&init_mm, p4d, espfix_pud_page); |
131 | 131 | ||
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 43b7002f44fb..46c3c73e7f43 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -33,17 +33,120 @@ | |||
33 | /* | 33 | /* |
34 | * Manage page tables very early on. | 34 | * Manage page tables very early on. |
35 | */ | 35 | */ |
36 | extern pgd_t early_level4_pgt[PTRS_PER_PGD]; | 36 | extern pgd_t early_top_pgt[PTRS_PER_PGD]; |
37 | extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; | 37 | extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; |
38 | static unsigned int __initdata next_early_pgt = 2; | 38 | static unsigned int __initdata next_early_pgt; |
39 | pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); | 39 | pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); |
40 | 40 | ||
41 | #define __head __section(.head.text) | ||
42 | |||
43 | static void __head *fixup_pointer(void *ptr, unsigned long physaddr) | ||
44 | { | ||
45 | return ptr - (void *)_text + (void *)physaddr; | ||
46 | } | ||
47 | |||
48 | void __head __startup_64(unsigned long physaddr) | ||
49 | { | ||
50 | unsigned long load_delta, *p; | ||
51 | pgdval_t *pgd; | ||
52 | p4dval_t *p4d; | ||
53 | pudval_t *pud; | ||
54 | pmdval_t *pmd, pmd_entry; | ||
55 | int i; | ||
56 | |||
57 | /* Is the address too large? */ | ||
58 | if (physaddr >> MAX_PHYSMEM_BITS) | ||
59 | for (;;); | ||
60 | |||
61 | /* | ||
62 | * Compute the delta between the address I am compiled to run at | ||
63 | * and the address I am actually running at. | ||
64 | */ | ||
65 | load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map); | ||
66 | |||
67 | /* Is the address not 2M aligned? */ | ||
68 | if (load_delta & ~PMD_PAGE_MASK) | ||
69 | for (;;); | ||
70 | |||
71 | /* Fixup the physical addresses in the page table */ | ||
72 | |||
73 | pgd = fixup_pointer(&early_top_pgt, physaddr); | ||
74 | pgd[pgd_index(__START_KERNEL_map)] += load_delta; | ||
75 | |||
76 | if (IS_ENABLED(CONFIG_X86_5LEVEL)) { | ||
77 | p4d = fixup_pointer(&level4_kernel_pgt, physaddr); | ||
78 | p4d[511] += load_delta; | ||
79 | } | ||
80 | |||
81 | pud = fixup_pointer(&level3_kernel_pgt, physaddr); | ||
82 | pud[510] += load_delta; | ||
83 | pud[511] += load_delta; | ||
84 | |||
85 | pmd = fixup_pointer(level2_fixmap_pgt, physaddr); | ||
86 | pmd[506] += load_delta; | ||
87 | |||
88 | /* | ||
89 | * Set up the identity mapping for the switchover. These | ||
90 | * entries should *NOT* have the global bit set! This also | ||
91 | * creates a bunch of nonsense entries but that is fine -- | ||
92 | * it avoids problems around wraparound. | ||
93 | */ | ||
94 | |||
95 | pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); | ||
96 | pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); | ||
97 | |||
98 | if (IS_ENABLED(CONFIG_X86_5LEVEL)) { | ||
99 | p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); | ||
100 | |||
101 | i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; | ||
102 | pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE; | ||
103 | pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE; | ||
104 | |||
105 | i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D; | ||
106 | p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; | ||
107 | p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; | ||
108 | } else { | ||
109 | i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; | ||
110 | pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; | ||
111 | pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; | ||
112 | } | ||
113 | |||
114 | i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD; | ||
115 | pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE; | ||
116 | pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE; | ||
117 | |||
118 | pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; | ||
119 | pmd_entry += physaddr; | ||
120 | |||
121 | for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { | ||
122 | int idx = i + (physaddr >> PMD_SHIFT) % PTRS_PER_PMD; | ||
123 | pmd[idx] = pmd_entry + i * PMD_SIZE; | ||
124 | } | ||
125 | |||
126 | /* | ||
127 | * Fixup the kernel text+data virtual addresses. Note that | ||
128 | * we might write invalid pmds, when the kernel is relocated | ||
129 | * cleanup_highmap() fixes this up along with the mappings | ||
130 | * beyond _end. | ||
131 | */ | ||
132 | |||
133 | pmd = fixup_pointer(level2_kernel_pgt, physaddr); | ||
134 | for (i = 0; i < PTRS_PER_PMD; i++) { | ||
135 | if (pmd[i] & _PAGE_PRESENT) | ||
136 | pmd[i] += load_delta; | ||
137 | } | ||
138 | |||
139 | /* Fixup phys_base */ | ||
140 | p = fixup_pointer(&phys_base, physaddr); | ||
141 | *p += load_delta; | ||
142 | } | ||
143 | |||
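The new __startup_64() above replaces the hand-written assembly relocation fixup (removed from head_64.S further down). As a rough, self-contained model of what it computes, with illustrative constants standing in for the kernel's (not the real definitions):

    #include <stdio.h>

    /* Stand-ins for the kernel constants; values are illustrative only. */
    #define START_KERNEL_MAP 0xffffffff80000000ULL
    #define PMD_PAGE_MASK    (~((1ULL << 21) - 1))     /* 2 MiB alignment */

    static unsigned long long text_vaddr = START_KERNEL_MAP + 0x1000000ULL;
    static unsigned long long pgd[512];                /* fake top-level table */

    /*
     * Model of __startup_64(): work out how far the kernel was loaded from
     * its link-time address, then shift the physical addresses baked into
     * the static page tables by that delta.
     */
    static void fixup_tables(unsigned long long physaddr)
    {
        unsigned long long load_delta =
            physaddr - (text_vaddr - START_KERNEL_MAP);

        if (load_delta & ~PMD_PAGE_MASK)               /* must stay 2M aligned */
            return;                                    /* the real code just hangs */

        pgd[511] += load_delta;                        /* kernel high mapping slot */
    }

    int main(void)
    {
        pgd[511] = 0x1000000ULL | 0x63;                /* fake phys addr + flag bits */
        fixup_tables(0x4000000ULL);                    /* pretend we were loaded at 64 MiB */
        printf("pgd[511] = %#llx\n", pgd[511]);
        return 0;
    }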
41 | /* Wipe all early page tables except for the kernel symbol map */ | 144 | /* Wipe all early page tables except for the kernel symbol map */ |
42 | static void __init reset_early_page_tables(void) | 145 | static void __init reset_early_page_tables(void) |
43 | { | 146 | { |
44 | memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); | 147 | memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); |
45 | next_early_pgt = 0; | 148 | next_early_pgt = 0; |
46 | write_cr3(__pa_nodebug(early_level4_pgt)); | 149 | write_cr3(__pa_nodebug(early_top_pgt)); |
47 | } | 150 | } |
48 | 151 | ||
49 | /* Create a new PMD entry */ | 152 | /* Create a new PMD entry */ |
@@ -51,15 +154,16 @@ int __init early_make_pgtable(unsigned long address) | |||
51 | { | 154 | { |
52 | unsigned long physaddr = address - __PAGE_OFFSET; | 155 | unsigned long physaddr = address - __PAGE_OFFSET; |
53 | pgdval_t pgd, *pgd_p; | 156 | pgdval_t pgd, *pgd_p; |
157 | p4dval_t p4d, *p4d_p; | ||
54 | pudval_t pud, *pud_p; | 158 | pudval_t pud, *pud_p; |
55 | pmdval_t pmd, *pmd_p; | 159 | pmdval_t pmd, *pmd_p; |
56 | 160 | ||
57 | /* Invalid address or early pgt is done ? */ | 161 | /* Invalid address or early pgt is done ? */ |
58 | if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt)) | 162 | if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt)) |
59 | return -1; | 163 | return -1; |
60 | 164 | ||
61 | again: | 165 | again: |
62 | pgd_p = &early_level4_pgt[pgd_index(address)].pgd; | 166 | pgd_p = &early_top_pgt[pgd_index(address)].pgd; |
63 | pgd = *pgd_p; | 167 | pgd = *pgd_p; |
64 | 168 | ||
65 | /* | 169 | /* |
@@ -67,8 +171,25 @@ again: | |||
67 | * critical -- __PAGE_OFFSET would point us back into the dynamic | 171 | * critical -- __PAGE_OFFSET would point us back into the dynamic |
68 | * range and we might end up looping forever... | 172 | * range and we might end up looping forever... |
69 | */ | 173 | */ |
70 | if (pgd) | 174 | if (!IS_ENABLED(CONFIG_X86_5LEVEL)) |
71 | pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); | 175 | p4d_p = pgd_p; |
176 | else if (pgd) | ||
177 | p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); | ||
178 | else { | ||
179 | if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { | ||
180 | reset_early_page_tables(); | ||
181 | goto again; | ||
182 | } | ||
183 | |||
184 | p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++]; | ||
185 | memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); | ||
186 | *pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; | ||
187 | } | ||
188 | p4d_p += p4d_index(address); | ||
189 | p4d = *p4d_p; | ||
190 | |||
191 | if (p4d) | ||
192 | pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); | ||
72 | else { | 193 | else { |
73 | if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { | 194 | if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { |
74 | reset_early_page_tables(); | 195 | reset_early_page_tables(); |
@@ -77,7 +198,7 @@ again: | |||
77 | 198 | ||
78 | pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; | 199 | pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; |
79 | memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); | 200 | memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); |
80 | *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; | 201 | *p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; |
81 | } | 202 | } |
82 | pud_p += pud_index(address); | 203 | pud_p += pud_index(address); |
83 | pud = *pud_p; | 204 | pud = *pud_p; |
@@ -156,7 +277,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) | |||
156 | 277 | ||
157 | clear_bss(); | 278 | clear_bss(); |
158 | 279 | ||
159 | clear_page(init_level4_pgt); | 280 | clear_page(init_top_pgt); |
160 | 281 | ||
161 | kasan_early_init(); | 282 | kasan_early_init(); |
162 | 283 | ||
@@ -171,8 +292,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) | |||
171 | */ | 292 | */ |
172 | load_ucode_bsp(); | 293 | load_ucode_bsp(); |
173 | 294 | ||
174 | /* set init_level4_pgt kernel high mapping */ | 295 | /* set init_top_pgt kernel high mapping */ |
175 | init_level4_pgt[511] = early_level4_pgt[511]; | 296 | init_top_pgt[511] = early_top_pgt[511]; |
176 | 297 | ||
177 | x86_64_start_reservations(real_mode_data); | 298 | x86_64_start_reservations(real_mode_data); |
178 | } | 299 | } |
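The early_make_pgtable() changes add one more walk step: when a pgd slot is empty a p4d page is taken from the small static pool, and when CONFIG_X86_5LEVEL is off the p4d level is simply folded onto the pgd. A stripped-down sketch of that branch, using hypothetical simplified types rather than the kernel's:

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical, simplified model of the early page-table pool. */
    #define EARLY_DYNAMIC_PAGE_TABLES 64
    #define PTRS_PER_TABLE            512

    static uint64_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_TABLE];
    static int next_early_pgt;
    static int five_level_enabled;          /* stands in for CONFIG_X86_5LEVEL */

    /*
     * Return a pointer to the p4d slot for an address, allocating the p4d
     * page from the static pool on first use.  Without 5-level paging the
     * p4d level is folded away and the pgd slot itself is used, mirroring
     * the patched early_make_pgtable().
     */
    static uint64_t *early_get_p4d(uint64_t *pgd_slot, unsigned int p4d_index)
    {
        uint64_t *p4d_table;

        if (!five_level_enabled)
            return pgd_slot;

        if (*pgd_slot) {
            /* model only: strip flag bits to recover the table pointer */
            p4d_table = (uint64_t *)(uintptr_t)(*pgd_slot & ~0xfffULL);
        } else {
            if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES)
                return NULL;                /* real code resets the pool and retries */
            p4d_table = early_dynamic_pgts[next_early_pgt++];
            memset(p4d_table, 0, PTRS_PER_TABLE * sizeof(uint64_t));
            *pgd_slot = (uint64_t)(uintptr_t)p4d_table | 0x63; /* table flags */
        }
        return &p4d_table[p4d_index];
    }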
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ac9d327d2e42..6225550883df 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S | |||
@@ -37,10 +37,11 @@ | |||
37 | * | 37 | * |
38 | */ | 38 | */ |
39 | 39 | ||
40 | #define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1)) | ||
40 | #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) | 41 | #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) |
41 | 42 | ||
42 | L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) | 43 | PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) |
43 | L4_START_KERNEL = pgd_index(__START_KERNEL_map) | 44 | PGD_START_KERNEL = pgd_index(__START_KERNEL_map) |
44 | L3_START_KERNEL = pud_index(__START_KERNEL_map) | 45 | L3_START_KERNEL = pud_index(__START_KERNEL_map) |
45 | 46 | ||
46 | .text | 47 | .text |
@@ -72,101 +73,12 @@ startup_64: | |||
72 | /* Sanitize CPU configuration */ | 73 | /* Sanitize CPU configuration */ |
73 | call verify_cpu | 74 | call verify_cpu |
74 | 75 | ||
75 | /* | ||
76 | * Compute the delta between the address I am compiled to run at and the | ||
77 | * address I am actually running at. | ||
78 | */ | ||
79 | leaq _text(%rip), %rbp | ||
80 | subq $_text - __START_KERNEL_map, %rbp | ||
81 | |||
82 | /* Is the address not 2M aligned? */ | ||
83 | testl $~PMD_PAGE_MASK, %ebp | ||
84 | jnz bad_address | ||
85 | |||
86 | /* | ||
87 | * Is the address too large? | ||
88 | */ | ||
89 | leaq _text(%rip), %rax | ||
90 | shrq $MAX_PHYSMEM_BITS, %rax | ||
91 | jnz bad_address | ||
92 | |||
93 | /* | ||
94 | * Fixup the physical addresses in the page table | ||
95 | */ | ||
96 | addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip) | ||
97 | |||
98 | addq %rbp, level3_kernel_pgt + (510*8)(%rip) | ||
99 | addq %rbp, level3_kernel_pgt + (511*8)(%rip) | ||
100 | |||
101 | addq %rbp, level2_fixmap_pgt + (506*8)(%rip) | ||
102 | |||
103 | /* | ||
104 | * Set up the identity mapping for the switchover. These | ||
105 | * entries should *NOT* have the global bit set! This also | ||
106 | * creates a bunch of nonsense entries but that is fine -- | ||
107 | * it avoids problems around wraparound. | ||
108 | */ | ||
109 | leaq _text(%rip), %rdi | 76 | leaq _text(%rip), %rdi |
110 | leaq early_level4_pgt(%rip), %rbx | 77 | pushq %rsi |
111 | 78 | call __startup_64 | |
112 | movq %rdi, %rax | 79 | popq %rsi |
113 | shrq $PGDIR_SHIFT, %rax | ||
114 | |||
115 | leaq (PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx | ||
116 | movq %rdx, 0(%rbx,%rax,8) | ||
117 | movq %rdx, 8(%rbx,%rax,8) | ||
118 | |||
119 | addq $PAGE_SIZE, %rdx | ||
120 | movq %rdi, %rax | ||
121 | shrq $PUD_SHIFT, %rax | ||
122 | andl $(PTRS_PER_PUD-1), %eax | ||
123 | movq %rdx, PAGE_SIZE(%rbx,%rax,8) | ||
124 | incl %eax | ||
125 | andl $(PTRS_PER_PUD-1), %eax | ||
126 | movq %rdx, PAGE_SIZE(%rbx,%rax,8) | ||
127 | |||
128 | addq $PAGE_SIZE * 2, %rbx | ||
129 | movq %rdi, %rax | ||
130 | shrq $PMD_SHIFT, %rdi | ||
131 | addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax | ||
132 | leaq (_end - 1)(%rip), %rcx | ||
133 | shrq $PMD_SHIFT, %rcx | ||
134 | subq %rdi, %rcx | ||
135 | incl %ecx | ||
136 | |||
137 | 1: | ||
138 | andq $(PTRS_PER_PMD - 1), %rdi | ||
139 | movq %rax, (%rbx,%rdi,8) | ||
140 | incq %rdi | ||
141 | addq $PMD_SIZE, %rax | ||
142 | decl %ecx | ||
143 | jnz 1b | ||
144 | |||
145 | test %rbp, %rbp | ||
146 | jz .Lskip_fixup | ||
147 | 80 | ||
148 | /* | 81 | movq $(early_top_pgt - __START_KERNEL_map), %rax |
149 | * Fixup the kernel text+data virtual addresses. Note that | ||
150 | * we might write invalid pmds, when the kernel is relocated | ||
151 | * cleanup_highmap() fixes this up along with the mappings | ||
152 | * beyond _end. | ||
153 | */ | ||
154 | leaq level2_kernel_pgt(%rip), %rdi | ||
155 | leaq PAGE_SIZE(%rdi), %r8 | ||
156 | /* See if it is a valid page table entry */ | ||
157 | 1: testb $_PAGE_PRESENT, 0(%rdi) | ||
158 | jz 2f | ||
159 | addq %rbp, 0(%rdi) | ||
160 | /* Go to the next page */ | ||
161 | 2: addq $8, %rdi | ||
162 | cmp %r8, %rdi | ||
163 | jne 1b | ||
164 | |||
165 | /* Fixup phys_base */ | ||
166 | addq %rbp, phys_base(%rip) | ||
167 | |||
168 | .Lskip_fixup: | ||
169 | movq $(early_level4_pgt - __START_KERNEL_map), %rax | ||
170 | jmp 1f | 82 | jmp 1f |
171 | ENTRY(secondary_startup_64) | 83 | ENTRY(secondary_startup_64) |
172 | /* | 84 | /* |
@@ -186,14 +98,17 @@ ENTRY(secondary_startup_64) | |||
186 | /* Sanitize CPU configuration */ | 98 | /* Sanitize CPU configuration */ |
187 | call verify_cpu | 99 | call verify_cpu |
188 | 100 | ||
189 | movq $(init_level4_pgt - __START_KERNEL_map), %rax | 101 | movq $(init_top_pgt - __START_KERNEL_map), %rax |
190 | 1: | 102 | 1: |
191 | 103 | ||
192 | /* Enable PAE mode and PGE */ | 104 | /* Enable PAE mode, PGE and LA57 */ |
193 | movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx | 105 | movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx |
106 | #ifdef CONFIG_X86_5LEVEL | ||
107 | orl $X86_CR4_LA57, %ecx | ||
108 | #endif | ||
194 | movq %rcx, %cr4 | 109 | movq %rcx, %cr4 |
195 | 110 | ||
196 | /* Setup early boot stage 4 level pagetables. */ | 111 | /* Setup early boot stage 4-/5-level pagetables. */ |
197 | addq phys_base(%rip), %rax | 112 | addq phys_base(%rip), %rax |
198 | movq %rax, %cr3 | 113 | movq %rax, %cr3 |
199 | 114 | ||
@@ -417,9 +332,13 @@ GLOBAL(name) | |||
417 | .endr | 332 | .endr |
418 | 333 | ||
419 | __INITDATA | 334 | __INITDATA |
420 | NEXT_PAGE(early_level4_pgt) | 335 | NEXT_PAGE(early_top_pgt) |
421 | .fill 511,8,0 | 336 | .fill 511,8,0 |
337 | #ifdef CONFIG_X86_5LEVEL | ||
338 | .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE | ||
339 | #else | ||
422 | .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE | 340 | .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE |
341 | #endif | ||
423 | 342 | ||
424 | NEXT_PAGE(early_dynamic_pgts) | 343 | NEXT_PAGE(early_dynamic_pgts) |
425 | .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 | 344 | .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 |
@@ -427,14 +346,14 @@ NEXT_PAGE(early_dynamic_pgts) | |||
427 | .data | 346 | .data |
428 | 347 | ||
429 | #ifndef CONFIG_XEN | 348 | #ifndef CONFIG_XEN |
430 | NEXT_PAGE(init_level4_pgt) | 349 | NEXT_PAGE(init_top_pgt) |
431 | .fill 512,8,0 | 350 | .fill 512,8,0 |
432 | #else | 351 | #else |
433 | NEXT_PAGE(init_level4_pgt) | 352 | NEXT_PAGE(init_top_pgt) |
434 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE | 353 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE |
435 | .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 | 354 | .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 |
436 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE | 355 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE |
437 | .org init_level4_pgt + L4_START_KERNEL*8, 0 | 356 | .org init_top_pgt + PGD_START_KERNEL*8, 0 |
438 | /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ | 357 | /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ |
439 | .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE | 358 | .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE |
440 | 359 | ||
@@ -448,6 +367,12 @@ NEXT_PAGE(level2_ident_pgt) | |||
448 | PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) | 367 | PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) |
449 | #endif | 368 | #endif |
450 | 369 | ||
370 | #ifdef CONFIG_X86_5LEVEL | ||
371 | NEXT_PAGE(level4_kernel_pgt) | ||
372 | .fill 511,8,0 | ||
373 | .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE | ||
374 | #endif | ||
375 | |||
451 | NEXT_PAGE(level3_kernel_pgt) | 376 | NEXT_PAGE(level3_kernel_pgt) |
452 | .fill L3_START_KERNEL,8,0 | 377 | .fill L3_START_KERNEL,8,0 |
453 | /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ | 378 | /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ |
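In the assembly above, the boot CPU now defers the relocation work to __startup_64() and, on 5-level builds, the top-level table gains the extra level4_kernel_pgt indirection while CR4.LA57 is set alongside PAE and PGE. A small illustrative helper showing the CR4 value being assembled (bit positions per the SDM; boot_cr4_value() is not a kernel function):

    /* CR4 bit positions per the Intel SDM; illustrative helper, not kernel API. */
    #define X86_CR4_PAE  (1UL << 5)
    #define X86_CR4_PGE  (1UL << 7)
    #define X86_CR4_LA57 (1UL << 12)

    /* Compute the CR4 value the startup path would load before writing CR3. */
    static unsigned long boot_cr4_value(int five_level)
    {
        unsigned long cr4 = X86_CR4_PAE | X86_CR4_PGE;

        if (five_level)
            cr4 |= X86_CR4_LA57;    /* 57-bit linear addresses, needs 5-level tables */
        return cr4;
    }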
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index d4a15831ac58..a870910c8565 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c | |||
@@ -22,24 +22,25 @@ | |||
22 | #include <asm/syscalls.h> | 22 | #include <asm/syscalls.h> |
23 | 23 | ||
24 | /* context.lock is held for us, so we don't need any locking. */ | 24 | /* context.lock is held for us, so we don't need any locking. */ |
25 | static void flush_ldt(void *current_mm) | 25 | static void flush_ldt(void *__mm) |
26 | { | 26 | { |
27 | struct mm_struct *mm = __mm; | ||
27 | mm_context_t *pc; | 28 | mm_context_t *pc; |
28 | 29 | ||
29 | if (current->active_mm != current_mm) | 30 | if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) |
30 | return; | 31 | return; |
31 | 32 | ||
32 | pc = ¤t->active_mm->context; | 33 | pc = &mm->context; |
33 | set_ldt(pc->ldt->entries, pc->ldt->size); | 34 | set_ldt(pc->ldt->entries, pc->ldt->nr_entries); |
34 | } | 35 | } |
35 | 36 | ||
36 | /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ | 37 | /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ |
37 | static struct ldt_struct *alloc_ldt_struct(unsigned int size) | 38 | static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) |
38 | { | 39 | { |
39 | struct ldt_struct *new_ldt; | 40 | struct ldt_struct *new_ldt; |
40 | unsigned int alloc_size; | 41 | unsigned int alloc_size; |
41 | 42 | ||
42 | if (size > LDT_ENTRIES) | 43 | if (num_entries > LDT_ENTRIES) |
43 | return NULL; | 44 | return NULL; |
44 | 45 | ||
45 | new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL); | 46 | new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL); |
@@ -47,7 +48,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size) | |||
47 | return NULL; | 48 | return NULL; |
48 | 49 | ||
49 | BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct)); | 50 | BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct)); |
50 | alloc_size = size * LDT_ENTRY_SIZE; | 51 | alloc_size = num_entries * LDT_ENTRY_SIZE; |
51 | 52 | ||
52 | /* | 53 | /* |
53 | * Xen is very picky: it requires a page-aligned LDT that has no | 54 | * Xen is very picky: it requires a page-aligned LDT that has no |
@@ -65,14 +66,14 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size) | |||
65 | return NULL; | 66 | return NULL; |
66 | } | 67 | } |
67 | 68 | ||
68 | new_ldt->size = size; | 69 | new_ldt->nr_entries = num_entries; |
69 | return new_ldt; | 70 | return new_ldt; |
70 | } | 71 | } |
71 | 72 | ||
72 | /* After calling this, the LDT is immutable. */ | 73 | /* After calling this, the LDT is immutable. */ |
73 | static void finalize_ldt_struct(struct ldt_struct *ldt) | 74 | static void finalize_ldt_struct(struct ldt_struct *ldt) |
74 | { | 75 | { |
75 | paravirt_alloc_ldt(ldt->entries, ldt->size); | 76 | paravirt_alloc_ldt(ldt->entries, ldt->nr_entries); |
76 | } | 77 | } |
77 | 78 | ||
78 | /* context.lock is held */ | 79 | /* context.lock is held */ |
@@ -91,8 +92,8 @@ static void free_ldt_struct(struct ldt_struct *ldt) | |||
91 | if (likely(!ldt)) | 92 | if (likely(!ldt)) |
92 | return; | 93 | return; |
93 | 94 | ||
94 | paravirt_free_ldt(ldt->entries, ldt->size); | 95 | paravirt_free_ldt(ldt->entries, ldt->nr_entries); |
95 | if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) | 96 | if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE) |
96 | vfree_atomic(ldt->entries); | 97 | vfree_atomic(ldt->entries); |
97 | else | 98 | else |
98 | free_page((unsigned long)ldt->entries); | 99 | free_page((unsigned long)ldt->entries); |
@@ -122,14 +123,14 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) | |||
122 | goto out_unlock; | 123 | goto out_unlock; |
123 | } | 124 | } |
124 | 125 | ||
125 | new_ldt = alloc_ldt_struct(old_mm->context.ldt->size); | 126 | new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries); |
126 | if (!new_ldt) { | 127 | if (!new_ldt) { |
127 | retval = -ENOMEM; | 128 | retval = -ENOMEM; |
128 | goto out_unlock; | 129 | goto out_unlock; |
129 | } | 130 | } |
130 | 131 | ||
131 | memcpy(new_ldt->entries, old_mm->context.ldt->entries, | 132 | memcpy(new_ldt->entries, old_mm->context.ldt->entries, |
132 | new_ldt->size * LDT_ENTRY_SIZE); | 133 | new_ldt->nr_entries * LDT_ENTRY_SIZE); |
133 | finalize_ldt_struct(new_ldt); | 134 | finalize_ldt_struct(new_ldt); |
134 | 135 | ||
135 | mm->context.ldt = new_ldt; | 136 | mm->context.ldt = new_ldt; |
@@ -152,9 +153,9 @@ void destroy_context_ldt(struct mm_struct *mm) | |||
152 | 153 | ||
153 | static int read_ldt(void __user *ptr, unsigned long bytecount) | 154 | static int read_ldt(void __user *ptr, unsigned long bytecount) |
154 | { | 155 | { |
155 | int retval; | ||
156 | unsigned long size; | ||
157 | struct mm_struct *mm = current->mm; | 156 | struct mm_struct *mm = current->mm; |
157 | unsigned long entries_size; | ||
158 | int retval; | ||
158 | 159 | ||
159 | mutex_lock(&mm->context.lock); | 160 | mutex_lock(&mm->context.lock); |
160 | 161 | ||
@@ -166,18 +167,18 @@ static int read_ldt(void __user *ptr, unsigned long bytecount) | |||
166 | if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) | 167 | if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) |
167 | bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; | 168 | bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; |
168 | 169 | ||
169 | size = mm->context.ldt->size * LDT_ENTRY_SIZE; | 170 | entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE; |
170 | if (size > bytecount) | 171 | if (entries_size > bytecount) |
171 | size = bytecount; | 172 | entries_size = bytecount; |
172 | 173 | ||
173 | if (copy_to_user(ptr, mm->context.ldt->entries, size)) { | 174 | if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) { |
174 | retval = -EFAULT; | 175 | retval = -EFAULT; |
175 | goto out_unlock; | 176 | goto out_unlock; |
176 | } | 177 | } |
177 | 178 | ||
178 | if (size != bytecount) { | 179 | if (entries_size != bytecount) { |
179 | /* Zero-fill the rest and pretend we read bytecount bytes. */ | 180 | /* Zero-fill the rest and pretend we read bytecount bytes. */ |
180 | if (clear_user(ptr + size, bytecount - size)) { | 181 | if (clear_user(ptr + entries_size, bytecount - entries_size)) { |
181 | retval = -EFAULT; | 182 | retval = -EFAULT; |
182 | goto out_unlock; | 183 | goto out_unlock; |
183 | } | 184 | } |
@@ -208,7 +209,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) | |||
208 | { | 209 | { |
209 | struct mm_struct *mm = current->mm; | 210 | struct mm_struct *mm = current->mm; |
210 | struct ldt_struct *new_ldt, *old_ldt; | 211 | struct ldt_struct *new_ldt, *old_ldt; |
211 | unsigned int oldsize, newsize; | 212 | unsigned int old_nr_entries, new_nr_entries; |
212 | struct user_desc ldt_info; | 213 | struct user_desc ldt_info; |
213 | struct desc_struct ldt; | 214 | struct desc_struct ldt; |
214 | int error; | 215 | int error; |
@@ -247,17 +248,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) | |||
247 | 248 | ||
248 | mutex_lock(&mm->context.lock); | 249 | mutex_lock(&mm->context.lock); |
249 | 250 | ||
250 | old_ldt = mm->context.ldt; | 251 | old_ldt = mm->context.ldt; |
251 | oldsize = old_ldt ? old_ldt->size : 0; | 252 | old_nr_entries = old_ldt ? old_ldt->nr_entries : 0; |
252 | newsize = max(ldt_info.entry_number + 1, oldsize); | 253 | new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries); |
253 | 254 | ||
254 | error = -ENOMEM; | 255 | error = -ENOMEM; |
255 | new_ldt = alloc_ldt_struct(newsize); | 256 | new_ldt = alloc_ldt_struct(new_nr_entries); |
256 | if (!new_ldt) | 257 | if (!new_ldt) |
257 | goto out_unlock; | 258 | goto out_unlock; |
258 | 259 | ||
259 | if (old_ldt) | 260 | if (old_ldt) |
260 | memcpy(new_ldt->entries, old_ldt->entries, oldsize * LDT_ENTRY_SIZE); | 261 | memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE); |
262 | |||
261 | new_ldt->entries[ldt_info.entry_number] = ldt; | 263 | new_ldt->entries[ldt_info.entry_number] = ldt; |
262 | finalize_ldt_struct(new_ldt); | 264 | finalize_ldt_struct(new_ldt); |
263 | 265 | ||
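The ldt.c churn is a rename from ->size to ->nr_entries, which removes the entries-versus-bytes ambiguity: the byte count is always derived as nr_entries * LDT_ENTRY_SIZE. A toy illustration with a hypothetical structure modelled on ldt_struct:

    #include <stdlib.h>

    #define LDT_ENTRY_SIZE 8        /* each descriptor is 8 bytes */
    #define LDT_ENTRIES    8192

    /* Hypothetical stand-in for struct ldt_struct. */
    struct toy_ldt {
        void        *entries;
        unsigned int nr_entries;    /* count of descriptors, never bytes */
    };

    static struct toy_ldt *toy_alloc_ldt(unsigned int nr_entries)
    {
        struct toy_ldt *ldt;

        if (nr_entries > LDT_ENTRIES)
            return NULL;

        ldt = malloc(sizeof(*ldt));
        if (!ldt)
            return NULL;

        /* the byte size is always computed from the entry count */
        ldt->entries = calloc(nr_entries, LDT_ENTRY_SIZE);
        if (!ldt->entries) {
            free(ldt);
            return NULL;
        }
        ldt->nr_entries = nr_entries;
        return ldt;
    }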
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 6f5ca4ebe6e5..cb0a30473c23 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c | |||
@@ -347,7 +347,7 @@ void machine_kexec(struct kimage *image) | |||
347 | void arch_crash_save_vmcoreinfo(void) | 347 | void arch_crash_save_vmcoreinfo(void) |
348 | { | 348 | { |
349 | VMCOREINFO_NUMBER(phys_base); | 349 | VMCOREINFO_NUMBER(phys_base); |
350 | VMCOREINFO_SYMBOL(init_level4_pgt); | 350 | VMCOREINFO_SYMBOL(init_top_pgt); |
351 | 351 | ||
352 | #ifdef CONFIG_NUMA | 352 | #ifdef CONFIG_NUMA |
353 | VMCOREINFO_SYMBOL(node_data); | 353 | VMCOREINFO_SYMBOL(node_data); |
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 3586996fc50d..bc0a849589bb 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c | |||
@@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { | |||
391 | 391 | ||
392 | .read_cr2 = native_read_cr2, | 392 | .read_cr2 = native_read_cr2, |
393 | .write_cr2 = native_write_cr2, | 393 | .write_cr2 = native_write_cr2, |
394 | .read_cr3 = native_read_cr3, | 394 | .read_cr3 = __native_read_cr3, |
395 | .write_cr3 = native_write_cr3, | 395 | .write_cr3 = native_write_cr3, |
396 | 396 | ||
397 | .flush_tlb_user = native_flush_tlb, | 397 | .flush_tlb_user = native_flush_tlb, |
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index ffeae818aa7a..c6d6dc5f8bb2 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
@@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all) | |||
92 | 92 | ||
93 | cr0 = read_cr0(); | 93 | cr0 = read_cr0(); |
94 | cr2 = read_cr2(); | 94 | cr2 = read_cr2(); |
95 | cr3 = read_cr3(); | 95 | cr3 = __read_cr3(); |
96 | cr4 = __read_cr4(); | 96 | cr4 = __read_cr4(); |
97 | printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", | 97 | printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", |
98 | cr0, cr2, cr3, cr4); | 98 | cr0, cr2, cr3, cr4); |
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b6840bf3940b..c3169be4c596 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -104,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all) | |||
104 | 104 | ||
105 | cr0 = read_cr0(); | 105 | cr0 = read_cr0(); |
106 | cr2 = read_cr2(); | 106 | cr2 = read_cr2(); |
107 | cr3 = read_cr3(); | 107 | cr3 = __read_cr3(); |
108 | cr4 = __read_cr4(); | 108 | cr4 = __read_cr4(); |
109 | 109 | ||
110 | printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", | 110 | printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", |
@@ -142,7 +142,7 @@ void release_thread(struct task_struct *dead_task) | |||
142 | pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", | 142 | pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", |
143 | dead_task->comm, | 143 | dead_task->comm, |
144 | dead_task->mm->context.ldt->entries, | 144 | dead_task->mm->context.ldt->entries, |
145 | dead_task->mm->context.ldt->size); | 145 | dead_task->mm->context.ldt->nr_entries); |
146 | BUG(); | 146 | BUG(); |
147 | } | 147 | } |
148 | #endif | 148 | #endif |
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 045e4f993bd2..b474c8de7fba 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -1589,7 +1589,6 @@ void native_cpu_die(unsigned int cpu) | |||
1589 | void play_dead_common(void) | 1589 | void play_dead_common(void) |
1590 | { | 1590 | { |
1591 | idle_task_exit(); | 1591 | idle_task_exit(); |
1592 | reset_lazy_tlbstate(); | ||
1593 | 1592 | ||
1594 | /* Ack it */ | 1593 | /* Ack it */ |
1595 | (void)cpu_report_death(); | 1594 | (void)cpu_report_death(); |
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index f07f83b3611b..5f25cfbd952e 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c | |||
@@ -34,7 +34,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re | |||
34 | 34 | ||
35 | mutex_lock(&child->mm->context.lock); | 35 | mutex_lock(&child->mm->context.lock); |
36 | if (unlikely(!child->mm->context.ldt || | 36 | if (unlikely(!child->mm->context.ldt || |
37 | seg >= child->mm->context.ldt->size)) | 37 | seg >= child->mm->context.ldt->nr_entries)) |
38 | addr = -1L; /* bogus selector, access would fault */ | 38 | addr = -1L; /* bogus selector, access would fault */ |
39 | else { | 39 | else { |
40 | desc = &child->mm->context.ldt->entries[seg]; | 40 | desc = &child->mm->context.ldt->entries[seg]; |
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1b469b6c762f..6dcc4873e435 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -49,6 +49,7 @@ | |||
49 | #include <asm/kexec.h> | 49 | #include <asm/kexec.h> |
50 | #include <asm/apic.h> | 50 | #include <asm/apic.h> |
51 | #include <asm/irq_remapping.h> | 51 | #include <asm/irq_remapping.h> |
52 | #include <asm/mmu_context.h> | ||
52 | 53 | ||
53 | #include "trace.h" | 54 | #include "trace.h" |
54 | #include "pmu.h" | 55 | #include "pmu.h" |
@@ -597,6 +598,7 @@ struct vcpu_vmx { | |||
597 | int gs_ldt_reload_needed; | 598 | int gs_ldt_reload_needed; |
598 | int fs_reload_needed; | 599 | int fs_reload_needed; |
599 | u64 msr_host_bndcfgs; | 600 | u64 msr_host_bndcfgs; |
601 | unsigned long vmcs_host_cr3; /* May not match real cr3 */ | ||
600 | unsigned long vmcs_host_cr4; /* May not match real cr4 */ | 602 | unsigned long vmcs_host_cr4; /* May not match real cr4 */ |
601 | } host_state; | 603 | } host_state; |
602 | struct { | 604 | struct { |
@@ -5013,12 +5015,19 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) | |||
5013 | u32 low32, high32; | 5015 | u32 low32, high32; |
5014 | unsigned long tmpl; | 5016 | unsigned long tmpl; |
5015 | struct desc_ptr dt; | 5017 | struct desc_ptr dt; |
5016 | unsigned long cr0, cr4; | 5018 | unsigned long cr0, cr3, cr4; |
5017 | 5019 | ||
5018 | cr0 = read_cr0(); | 5020 | cr0 = read_cr0(); |
5019 | WARN_ON(cr0 & X86_CR0_TS); | 5021 | WARN_ON(cr0 & X86_CR0_TS); |
5020 | vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ | 5022 | vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ |
5021 | vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ | 5023 | |
5024 | /* | ||
5025 | * Save the most likely value for this task's CR3 in the VMCS. | ||
5026 | * We can't use __get_current_cr3_fast() because we're not atomic. | ||
5027 | */ | ||
5028 | cr3 = __read_cr3(); | ||
5029 | vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ | ||
5030 | vmx->host_state.vmcs_host_cr3 = cr3; | ||
5022 | 5031 | ||
5023 | /* Save the most likely value for this task's CR4 in the VMCS. */ | 5032 | /* Save the most likely value for this task's CR4 in the VMCS. */ |
5024 | cr4 = cr4_read_shadow(); | 5033 | cr4 = cr4_read_shadow(); |
@@ -8822,7 +8831,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) | |||
8822 | static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | 8831 | static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) |
8823 | { | 8832 | { |
8824 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 8833 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
8825 | unsigned long debugctlmsr, cr4; | 8834 | unsigned long debugctlmsr, cr3, cr4; |
8826 | 8835 | ||
8827 | /* Don't enter VMX if guest state is invalid, let the exit handler | 8836 | /* Don't enter VMX if guest state is invalid, let the exit handler |
8828 | start emulation until we arrive back to a valid state */ | 8837 | start emulation until we arrive back to a valid state */ |
@@ -8844,6 +8853,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
8844 | if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) | 8853 | if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) |
8845 | vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); | 8854 | vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); |
8846 | 8855 | ||
8856 | cr3 = __get_current_cr3_fast(); | ||
8857 | if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) { | ||
8858 | vmcs_writel(HOST_CR3, cr3); | ||
8859 | vmx->host_state.vmcs_host_cr3 = cr3; | ||
8860 | } | ||
8861 | |||
8847 | cr4 = cr4_read_shadow(); | 8862 | cr4 = cr4_read_shadow(); |
8848 | if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { | 8863 | if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { |
8849 | vmcs_writel(HOST_CR4, cr4); | 8864 | vmcs_writel(HOST_CR4, cr4); |
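The VMX change mirrors the existing CR4 handling: remember the last CR3 value written into the VMCS and skip the vmwrite when it has not changed since the previous vmentry. A generic sketch of that write-avoidance pattern (types and helper names here are placeholders, not KVM API):

    /* Placeholder types standing in for the VMCS accessors. */
    struct host_state {
        unsigned long cached_cr3;
    };

    static void vmcs_write_host_cr3(unsigned long cr3)
    {
        /* would be vmcs_writel(HOST_CR3, cr3) in the real code */
        (void)cr3;
    }

    /* Only touch the (comparatively expensive) VMCS field when the value changed. */
    static void sync_host_cr3(struct host_state *hs, unsigned long current_cr3)
    {
        if (current_cr3 != hs->cached_cr3) {
            vmcs_write_host_cr3(current_cr3);
            hs->cached_cr3 = current_cr3;
        }
    }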
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index 5e044d506b7a..a179254a5122 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h | |||
@@ -27,7 +27,7 @@ static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg) | |||
27 | #ifdef CONFIG_MODIFY_LDT_SYSCALL | 27 | #ifdef CONFIG_MODIFY_LDT_SYSCALL |
28 | seg >>= 3; | 28 | seg >>= 3; |
29 | mutex_lock(¤t->mm->context.lock); | 29 | mutex_lock(¤t->mm->context.lock); |
30 | if (current->mm->context.ldt && seg < current->mm->context.ldt->size) | 30 | if (current->mm->context.ldt && seg < current->mm->context.ldt->nr_entries) |
31 | ret = current->mm->context.ldt->entries[seg]; | 31 | ret = current->mm->context.ldt->entries[seg]; |
32 | mutex_unlock(¤t->mm->context.lock); | 32 | mutex_unlock(¤t->mm->context.lock); |
33 | #endif | 33 | #endif |
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 96d2b847e09e..0fbdcb64f9f8 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile | |||
@@ -2,7 +2,7 @@ | |||
2 | KCOV_INSTRUMENT_tlb.o := n | 2 | KCOV_INSTRUMENT_tlb.o := n |
3 | 3 | ||
4 | obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ | 4 | obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ |
5 | pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o | 5 | pat.o pgtable.o physaddr.o setup_nx.o tlb.o |
6 | 6 | ||
7 | # Make sure __phys_addr has no stackprotector | 7 | # Make sure __phys_addr has no stackprotector |
8 | nostackp := $(call cc-option, -fno-stack-protector) | 8 | nostackp := $(call cc-option, -fno-stack-protector) |
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index bce6990b1d81..0470826d2bdc 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c | |||
@@ -431,7 +431,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, | |||
431 | bool checkwx) | 431 | bool checkwx) |
432 | { | 432 | { |
433 | #ifdef CONFIG_X86_64 | 433 | #ifdef CONFIG_X86_64 |
434 | pgd_t *start = (pgd_t *) &init_level4_pgt; | 434 | pgd_t *start = (pgd_t *) &init_top_pgt; |
435 | #else | 435 | #else |
436 | pgd_t *start = swapper_pg_dir; | 436 | pgd_t *start = swapper_pg_dir; |
437 | #endif | 437 | #endif |
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8ad91a01cbc8..2a1fa10c6a98 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
@@ -346,7 +346,7 @@ static noinline int vmalloc_fault(unsigned long address) | |||
346 | * Do _not_ use "current" here. We might be inside | 346 | * Do _not_ use "current" here. We might be inside |
347 | * an interrupt in the middle of a task switch.. | 347 | * an interrupt in the middle of a task switch.. |
348 | */ | 348 | */ |
349 | pgd_paddr = read_cr3(); | 349 | pgd_paddr = read_cr3_pa(); |
350 | pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); | 350 | pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); |
351 | if (!pmd_k) | 351 | if (!pmd_k) |
352 | return -1; | 352 | return -1; |
@@ -388,7 +388,7 @@ static bool low_pfn(unsigned long pfn) | |||
388 | 388 | ||
389 | static void dump_pagetable(unsigned long address) | 389 | static void dump_pagetable(unsigned long address) |
390 | { | 390 | { |
391 | pgd_t *base = __va(read_cr3()); | 391 | pgd_t *base = __va(read_cr3_pa()); |
392 | pgd_t *pgd = &base[pgd_index(address)]; | 392 | pgd_t *pgd = &base[pgd_index(address)]; |
393 | p4d_t *p4d; | 393 | p4d_t *p4d; |
394 | pud_t *pud; | 394 | pud_t *pud; |
@@ -451,7 +451,7 @@ static noinline int vmalloc_fault(unsigned long address) | |||
451 | * happen within a race in page table update. In the later | 451 | * happen within a race in page table update. In the later |
452 | * case just flush: | 452 | * case just flush: |
453 | */ | 453 | */ |
454 | pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address); | 454 | pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address); |
455 | pgd_ref = pgd_offset_k(address); | 455 | pgd_ref = pgd_offset_k(address); |
456 | if (pgd_none(*pgd_ref)) | 456 | if (pgd_none(*pgd_ref)) |
457 | return -1; | 457 | return -1; |
@@ -555,7 +555,7 @@ static int bad_address(void *p) | |||
555 | 555 | ||
556 | static void dump_pagetable(unsigned long address) | 556 | static void dump_pagetable(unsigned long address) |
557 | { | 557 | { |
558 | pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); | 558 | pgd_t *base = __va(read_cr3_pa()); |
559 | pgd_t *pgd = base + pgd_index(address); | 559 | pgd_t *pgd = base + pgd_index(address); |
560 | p4d_t *p4d; | 560 | p4d_t *p4d; |
561 | pud_t *pud; | 561 | pud_t *pud; |
@@ -700,7 +700,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, | |||
700 | pgd_t *pgd; | 700 | pgd_t *pgd; |
701 | pte_t *pte; | 701 | pte_t *pte; |
702 | 702 | ||
703 | pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); | 703 | pgd = __va(read_cr3_pa()); |
704 | pgd += pgd_index(address); | 704 | pgd += pgd_index(address); |
705 | 705 | ||
706 | pte = lookup_address_in_pgd(pgd, address, &level); | 706 | pte = lookup_address_in_pgd(pgd, address, &level); |
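Several of the call sites above move from read_cr3() to read_cr3_pa(), which matters once CR3 may carry bits other than the page-table base (for example a PCID in the upcoming work). A rough model of the distinction, assuming a mask covering physical-address bits 12-51:

    #include <stdint.h>

    /* Assumed mask: bits 12..51 hold the page-table base address. */
    #define CR3_ADDR_MASK 0x000ffffffffff000ULL

    /* Model: the raw register value vs. just the physical address it points at. */
    static inline uint64_t cr3_pa(uint64_t raw_cr3)
    {
        return raw_cr3 & CR3_ADDR_MASK;   /* strips PCID/flag bits */
    }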
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c deleted file mode 100644 index 456dfdfd2249..000000000000 --- a/arch/x86/mm/gup.c +++ /dev/null | |||
@@ -1,496 +0,0 @@ | |||
1 | /* | ||
2 | * Lockless get_user_pages_fast for x86 | ||
3 | * | ||
4 | * Copyright (C) 2008 Nick Piggin | ||
5 | * Copyright (C) 2008 Novell Inc. | ||
6 | */ | ||
7 | #include <linux/sched.h> | ||
8 | #include <linux/mm.h> | ||
9 | #include <linux/vmstat.h> | ||
10 | #include <linux/highmem.h> | ||
11 | #include <linux/swap.h> | ||
12 | #include <linux/memremap.h> | ||
13 | |||
14 | #include <asm/mmu_context.h> | ||
15 | #include <asm/pgtable.h> | ||
16 | |||
17 | static inline pte_t gup_get_pte(pte_t *ptep) | ||
18 | { | ||
19 | #ifndef CONFIG_X86_PAE | ||
20 | return READ_ONCE(*ptep); | ||
21 | #else | ||
22 | /* | ||
23 | * With get_user_pages_fast, we walk down the pagetables without taking | ||
24 | * any locks. For this we would like to load the pointers atomically, | ||
25 | * but that is not possible (without expensive cmpxchg8b) on PAE. What | ||
26 | * we do have is the guarantee that a pte will only either go from not | ||
27 | * present to present, or present to not present or both -- it will not | ||
28 | * switch to a completely different present page without a TLB flush in | ||
29 | * between; something that we are blocking by holding interrupts off. | ||
30 | * | ||
31 | * Setting ptes from not present to present goes: | ||
32 | * ptep->pte_high = h; | ||
33 | * smp_wmb(); | ||
34 | * ptep->pte_low = l; | ||
35 | * | ||
36 | * And present to not present goes: | ||
37 | * ptep->pte_low = 0; | ||
38 | * smp_wmb(); | ||
39 | * ptep->pte_high = 0; | ||
40 | * | ||
41 | * We must ensure here that the load of pte_low sees l iff pte_high | ||
42 | * sees h. We load pte_high *after* loading pte_low, which ensures we | ||
43 | * don't see an older value of pte_high. *Then* we recheck pte_low, | ||
44 | * which ensures that we haven't picked up a changed pte high. We might | ||
45 | * have got rubbish values from pte_low and pte_high, but we are | ||
46 | * guaranteed that pte_low will not have the present bit set *unless* | ||
47 | * it is 'l'. And get_user_pages_fast only operates on present ptes, so | ||
48 | * we're safe. | ||
49 | * | ||
50 | * gup_get_pte should not be used or copied outside gup.c without being | ||
51 | * very careful -- it does not atomically load the pte or anything that | ||
52 | * is likely to be useful for you. | ||
53 | */ | ||
54 | pte_t pte; | ||
55 | |||
56 | retry: | ||
57 | pte.pte_low = ptep->pte_low; | ||
58 | smp_rmb(); | ||
59 | pte.pte_high = ptep->pte_high; | ||
60 | smp_rmb(); | ||
61 | if (unlikely(pte.pte_low != ptep->pte_low)) | ||
62 | goto retry; | ||
63 | |||
64 | return pte; | ||
65 | #endif | ||
66 | } | ||
67 | |||
68 | static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) | ||
69 | { | ||
70 | while ((*nr) - nr_start) { | ||
71 | struct page *page = pages[--(*nr)]; | ||
72 | |||
73 | ClearPageReferenced(page); | ||
74 | put_page(page); | ||
75 | } | ||
76 | } | ||
77 | |||
78 | /* | ||
79 | * 'pteval' can come from a pte, pmd, pud or p4d. We only check | ||
80 | * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the | ||
81 | * same value on all 4 types. | ||
82 | */ | ||
83 | static inline int pte_allows_gup(unsigned long pteval, int write) | ||
84 | { | ||
85 | unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; | ||
86 | |||
87 | if (write) | ||
88 | need_pte_bits |= _PAGE_RW; | ||
89 | |||
90 | if ((pteval & need_pte_bits) != need_pte_bits) | ||
91 | return 0; | ||
92 | |||
93 | /* Check memory protection keys permissions. */ | ||
94 | if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write)) | ||
95 | return 0; | ||
96 | |||
97 | return 1; | ||
98 | } | ||
99 | |||
100 | /* | ||
101 | * The performance critical leaf functions are made noinline otherwise gcc | ||
102 | * inlines everything into a single function which results in too much | ||
103 | * register pressure. | ||
104 | */ | ||
105 | static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, | ||
106 | unsigned long end, int write, struct page **pages, int *nr) | ||
107 | { | ||
108 | struct dev_pagemap *pgmap = NULL; | ||
109 | int nr_start = *nr, ret = 0; | ||
110 | pte_t *ptep, *ptem; | ||
111 | |||
112 | /* | ||
113 | * Keep the original mapped PTE value (ptem) around since we | ||
114 | * might increment ptep off the end of the page when finishing | ||
115 | * our loop iteration. | ||
116 | */ | ||
117 | ptem = ptep = pte_offset_map(&pmd, addr); | ||
118 | do { | ||
119 | pte_t pte = gup_get_pte(ptep); | ||
120 | struct page *page; | ||
121 | |||
122 | /* Similar to the PMD case, NUMA hinting must take slow path */ | ||
123 | if (pte_protnone(pte)) | ||
124 | break; | ||
125 | |||
126 | if (!pte_allows_gup(pte_val(pte), write)) | ||
127 | break; | ||
128 | |||
129 | if (pte_devmap(pte)) { | ||
130 | pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); | ||
131 | if (unlikely(!pgmap)) { | ||
132 | undo_dev_pagemap(nr, nr_start, pages); | ||
133 | break; | ||
134 | } | ||
135 | } else if (pte_special(pte)) | ||
136 | break; | ||
137 | |||
138 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | ||
139 | page = pte_page(pte); | ||
140 | get_page(page); | ||
141 | put_dev_pagemap(pgmap); | ||
142 | SetPageReferenced(page); | ||
143 | pages[*nr] = page; | ||
144 | (*nr)++; | ||
145 | |||
146 | } while (ptep++, addr += PAGE_SIZE, addr != end); | ||
147 | if (addr == end) | ||
148 | ret = 1; | ||
149 | pte_unmap(ptem); | ||
150 | |||
151 | return ret; | ||
152 | } | ||
153 | |||
154 | static inline void get_head_page_multiple(struct page *page, int nr) | ||
155 | { | ||
156 | VM_BUG_ON_PAGE(page != compound_head(page), page); | ||
157 | VM_BUG_ON_PAGE(page_count(page) == 0, page); | ||
158 | page_ref_add(page, nr); | ||
159 | SetPageReferenced(page); | ||
160 | } | ||
161 | |||
162 | static int __gup_device_huge(unsigned long pfn, unsigned long addr, | ||
163 | unsigned long end, struct page **pages, int *nr) | ||
164 | { | ||
165 | int nr_start = *nr; | ||
166 | struct dev_pagemap *pgmap = NULL; | ||
167 | |||
168 | do { | ||
169 | struct page *page = pfn_to_page(pfn); | ||
170 | |||
171 | pgmap = get_dev_pagemap(pfn, pgmap); | ||
172 | if (unlikely(!pgmap)) { | ||
173 | undo_dev_pagemap(nr, nr_start, pages); | ||
174 | return 0; | ||
175 | } | ||
176 | SetPageReferenced(page); | ||
177 | pages[*nr] = page; | ||
178 | get_page(page); | ||
179 | put_dev_pagemap(pgmap); | ||
180 | (*nr)++; | ||
181 | pfn++; | ||
182 | } while (addr += PAGE_SIZE, addr != end); | ||
183 | return 1; | ||
184 | } | ||
185 | |||
186 | static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, | ||
187 | unsigned long end, struct page **pages, int *nr) | ||
188 | { | ||
189 | unsigned long fault_pfn; | ||
190 | |||
191 | fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); | ||
192 | return __gup_device_huge(fault_pfn, addr, end, pages, nr); | ||
193 | } | ||
194 | |||
195 | static int __gup_device_huge_pud(pud_t pud, unsigned long addr, | ||
196 | unsigned long end, struct page **pages, int *nr) | ||
197 | { | ||
198 | unsigned long fault_pfn; | ||
199 | |||
200 | fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); | ||
201 | return __gup_device_huge(fault_pfn, addr, end, pages, nr); | ||
202 | } | ||
203 | |||
204 | static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, | ||
205 | unsigned long end, int write, struct page **pages, int *nr) | ||
206 | { | ||
207 | struct page *head, *page; | ||
208 | int refs; | ||
209 | |||
210 | if (!pte_allows_gup(pmd_val(pmd), write)) | ||
211 | return 0; | ||
212 | |||
213 | VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); | ||
214 | if (pmd_devmap(pmd)) | ||
215 | return __gup_device_huge_pmd(pmd, addr, end, pages, nr); | ||
216 | |||
217 | /* hugepages are never "special" */ | ||
218 | VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL); | ||
219 | |||
220 | refs = 0; | ||
221 | head = pmd_page(pmd); | ||
222 | page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); | ||
223 | do { | ||
224 | VM_BUG_ON_PAGE(compound_head(page) != head, page); | ||
225 | pages[*nr] = page; | ||
226 | (*nr)++; | ||
227 | page++; | ||
228 | refs++; | ||
229 | } while (addr += PAGE_SIZE, addr != end); | ||
230 | get_head_page_multiple(head, refs); | ||
231 | |||
232 | return 1; | ||
233 | } | ||
234 | |||
235 | static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, | ||
236 | int write, struct page **pages, int *nr) | ||
237 | { | ||
238 | unsigned long next; | ||
239 | pmd_t *pmdp; | ||
240 | |||
241 | pmdp = pmd_offset(&pud, addr); | ||
242 | do { | ||
243 | pmd_t pmd = *pmdp; | ||
244 | |||
245 | next = pmd_addr_end(addr, end); | ||
246 | if (pmd_none(pmd)) | ||
247 | return 0; | ||
248 | if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { | ||
249 | /* | ||
250 | * NUMA hinting faults need to be handled in the GUP | ||
251 | * slowpath for accounting purposes and so that they | ||
252 | * can be serialised against THP migration. | ||
253 | */ | ||
254 | if (pmd_protnone(pmd)) | ||
255 | return 0; | ||
256 | if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) | ||
257 | return 0; | ||
258 | } else { | ||
259 | if (!gup_pte_range(pmd, addr, next, write, pages, nr)) | ||
260 | return 0; | ||
261 | } | ||
262 | } while (pmdp++, addr = next, addr != end); | ||
263 | |||
264 | return 1; | ||
265 | } | ||
266 | |||
267 | static noinline int gup_huge_pud(pud_t pud, unsigned long addr, | ||
268 | unsigned long end, int write, struct page **pages, int *nr) | ||
269 | { | ||
270 | struct page *head, *page; | ||
271 | int refs; | ||
272 | |||
273 | if (!pte_allows_gup(pud_val(pud), write)) | ||
274 | return 0; | ||
275 | |||
276 | VM_BUG_ON(!pfn_valid(pud_pfn(pud))); | ||
277 | if (pud_devmap(pud)) | ||
278 | return __gup_device_huge_pud(pud, addr, end, pages, nr); | ||
279 | |||
280 | /* hugepages are never "special" */ | ||
281 | VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL); | ||
282 | |||
283 | refs = 0; | ||
284 | head = pud_page(pud); | ||
285 | page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); | ||
286 | do { | ||
287 | VM_BUG_ON_PAGE(compound_head(page) != head, page); | ||
288 | pages[*nr] = page; | ||
289 | (*nr)++; | ||
290 | page++; | ||
291 | refs++; | ||
292 | } while (addr += PAGE_SIZE, addr != end); | ||
293 | get_head_page_multiple(head, refs); | ||
294 | |||
295 | return 1; | ||
296 | } | ||
297 | |||
298 | static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, | ||
299 | int write, struct page **pages, int *nr) | ||
300 | { | ||
301 | unsigned long next; | ||
302 | pud_t *pudp; | ||
303 | |||
304 | pudp = pud_offset(&p4d, addr); | ||
305 | do { | ||
306 | pud_t pud = *pudp; | ||
307 | |||
308 | next = pud_addr_end(addr, end); | ||
309 | if (pud_none(pud)) | ||
310 | return 0; | ||
311 | if (unlikely(pud_large(pud))) { | ||
312 | if (!gup_huge_pud(pud, addr, next, write, pages, nr)) | ||
313 | return 0; | ||
314 | } else { | ||
315 | if (!gup_pmd_range(pud, addr, next, write, pages, nr)) | ||
316 | return 0; | ||
317 | } | ||
318 | } while (pudp++, addr = next, addr != end); | ||
319 | |||
320 | return 1; | ||
321 | } | ||
322 | |||
323 | static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, | ||
324 | int write, struct page **pages, int *nr) | ||
325 | { | ||
326 | unsigned long next; | ||
327 | p4d_t *p4dp; | ||
328 | |||
329 | p4dp = p4d_offset(&pgd, addr); | ||
330 | do { | ||
331 | p4d_t p4d = *p4dp; | ||
332 | |||
333 | next = p4d_addr_end(addr, end); | ||
334 | if (p4d_none(p4d)) | ||
335 | return 0; | ||
336 | BUILD_BUG_ON(p4d_large(p4d)); | ||
337 | if (!gup_pud_range(p4d, addr, next, write, pages, nr)) | ||
338 | return 0; | ||
339 | } while (p4dp++, addr = next, addr != end); | ||
340 | |||
341 | return 1; | ||
342 | } | ||
343 | |||
344 | /* | ||
345 | * Like get_user_pages_fast() except its IRQ-safe in that it won't fall | ||
346 | * back to the regular GUP. | ||
347 | */ | ||
348 | int __get_user_pages_fast(unsigned long start, int nr_pages, int write, | ||
349 | struct page **pages) | ||
350 | { | ||
351 | struct mm_struct *mm = current->mm; | ||
352 | unsigned long addr, len, end; | ||
353 | unsigned long next; | ||
354 | unsigned long flags; | ||
355 | pgd_t *pgdp; | ||
356 | int nr = 0; | ||
357 | |||
358 | start &= PAGE_MASK; | ||
359 | addr = start; | ||
360 | len = (unsigned long) nr_pages << PAGE_SHIFT; | ||
361 | end = start + len; | ||
362 | if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, | ||
363 | (void __user *)start, len))) | ||
364 | return 0; | ||
365 | |||
366 | /* | ||
367 | * XXX: batch / limit 'nr', to avoid large irq off latency | ||
368 | * needs some instrumenting to determine the common sizes used by | ||
369 | * important workloads (eg. DB2), and whether limiting the batch size | ||
370 | * will decrease performance. | ||
371 | * | ||
372 | * It seems like we're in the clear for the moment. Direct-IO is | ||
373 | * the main guy that batches up lots of get_user_pages, and even | ||
374 | * they are limited to 64-at-a-time which is not so many. | ||
375 | */ | ||
376 | /* | ||
377 | * This doesn't prevent pagetable teardown, but does prevent | ||
378 | * the pagetables and pages from being freed on x86. | ||
379 | * | ||
380 | * So long as we atomically load page table pointers versus teardown | ||
381 | * (which we do on x86, with the above PAE exception), we can follow the | ||
382 | * address down to the the page and take a ref on it. | ||
383 | */ | ||
384 | local_irq_save(flags); | ||
385 | pgdp = pgd_offset(mm, addr); | ||
386 | do { | ||
387 | pgd_t pgd = *pgdp; | ||
388 | |||
389 | next = pgd_addr_end(addr, end); | ||
390 | if (pgd_none(pgd)) | ||
391 | break; | ||
392 | if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) | ||
393 | break; | ||
394 | } while (pgdp++, addr = next, addr != end); | ||
395 | local_irq_restore(flags); | ||
396 | |||
397 | return nr; | ||
398 | } | ||
399 | |||
400 | /** | ||
401 | * get_user_pages_fast() - pin user pages in memory | ||
402 | * @start: starting user address | ||
403 | * @nr_pages: number of pages from start to pin | ||
404 | * @write: whether pages will be written to | ||
405 | * @pages: array that receives pointers to the pages pinned. | ||
406 | * Should be at least nr_pages long. | ||
407 | * | ||
408 | * Attempt to pin user pages in memory without taking mm->mmap_sem. | ||
409 | * If not successful, it will fall back to taking the lock and | ||
410 | * calling get_user_pages(). | ||
411 | * | ||
412 | * Returns number of pages pinned. This may be fewer than the number | ||
413 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
414 | * were pinned, returns -errno. | ||
415 | */ | ||
416 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, | ||
417 | struct page **pages) | ||
418 | { | ||
419 | struct mm_struct *mm = current->mm; | ||
420 | unsigned long addr, len, end; | ||
421 | unsigned long next; | ||
422 | pgd_t *pgdp; | ||
423 | int nr = 0; | ||
424 | |||
425 | start &= PAGE_MASK; | ||
426 | addr = start; | ||
427 | len = (unsigned long) nr_pages << PAGE_SHIFT; | ||
428 | |||
429 | end = start + len; | ||
430 | if (end < start) | ||
431 | goto slow_irqon; | ||
432 | |||
433 | #ifdef CONFIG_X86_64 | ||
434 | if (end >> __VIRTUAL_MASK_SHIFT) | ||
435 | goto slow_irqon; | ||
436 | #endif | ||
437 | |||
438 | /* | ||
439 | * XXX: batch / limit 'nr', to avoid large irq off latency | ||
440 | * needs some instrumenting to determine the common sizes used by | ||
441 | * important workloads (eg. DB2), and whether limiting the batch size | ||
442 | * will decrease performance. | ||
443 | * | ||
444 | * It seems like we're in the clear for the moment. Direct-IO is | ||
445 | * the main guy that batches up lots of get_user_pages, and even | ||
446 | * they are limited to 64-at-a-time which is not so many. | ||
447 | */ | ||
448 | /* | ||
449 | * This doesn't prevent pagetable teardown, but does prevent | ||
450 | * the pagetables and pages from being freed on x86. | ||
451 | * | ||
452 | * So long as we atomically load page table pointers versus teardown | ||
453 | * (which we do on x86, with the above PAE exception), we can follow the | ||
454 | * address down to the page and take a ref on it. | ||
455 | */ | ||
456 | local_irq_disable(); | ||
457 | pgdp = pgd_offset(mm, addr); | ||
458 | do { | ||
459 | pgd_t pgd = *pgdp; | ||
460 | |||
461 | next = pgd_addr_end(addr, end); | ||
462 | if (pgd_none(pgd)) | ||
463 | goto slow; | ||
464 | if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) | ||
465 | goto slow; | ||
466 | } while (pgdp++, addr = next, addr != end); | ||
467 | local_irq_enable(); | ||
468 | |||
469 | VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); | ||
470 | return nr; | ||
471 | |||
472 | { | ||
473 | int ret; | ||
474 | |||
475 | slow: | ||
476 | local_irq_enable(); | ||
477 | slow_irqon: | ||
478 | /* Try to get the remaining pages with get_user_pages */ | ||
479 | start += nr << PAGE_SHIFT; | ||
480 | pages += nr; | ||
481 | |||
482 | ret = get_user_pages_unlocked(start, | ||
483 | (end - start) >> PAGE_SHIFT, | ||
484 | pages, write ? FOLL_WRITE : 0); | ||
485 | |||
486 | /* Have to be a bit careful with return values */ | ||
487 | if (nr > 0) { | ||
488 | if (ret < 0) | ||
489 | ret = nr; | ||
490 | else | ||
491 | ret += nr; | ||
492 | } | ||
493 | |||
494 | return ret; | ||
495 | } | ||
496 | } | ||
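With HAVE_GENERIC_GUP selected in Kconfig, the x86-specific fast-GUP walker above is deleted and the generic mm/gup.c implementation takes over; callers keep the same entry points. A hedged sketch of a typical caller of the fast path (kernel context assumed, and pin_user_buffer() is a hypothetical helper, not part of this patch):

    #include <linux/mm.h>
    #include <linux/errno.h>

    /*
     * Hypothetical caller of get_user_pages_fast(); the signature matches
     * the one in the deleted x86 walker, now provided by generic mm/gup.c.
     */
    static int pin_user_buffer(unsigned long uaddr, int nr_pages,
                               struct page **pages)
    {
        int pinned = get_user_pages_fast(uaddr, nr_pages, 1 /* write */, pages);

        if (pinned < nr_pages) {
            while (pinned > 0)
                put_page(pages[--pinned]);      /* undo a partial pin */
            return -EFAULT;
        }
        return 0;
    }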
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 9b3f9fa5b283..673541eb3b3f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c | |||
@@ -811,10 +811,8 @@ void __init zone_sizes_init(void) | |||
811 | } | 811 | } |
812 | 812 | ||
813 | DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { | 813 | DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { |
814 | #ifdef CONFIG_SMP | 814 | .loaded_mm = &init_mm, |
815 | .active_mm = &init_mm, | ||
816 | .state = 0, | 815 | .state = 0, |
817 | #endif | ||
818 | .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ | 816 | .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ |
819 | }; | 817 | }; |
820 | EXPORT_SYMBOL_GPL(cpu_tlbstate); | 818 | EXPORT_SYMBOL_GPL(cpu_tlbstate); |
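cpu_tlbstate now always carries loaded_mm, the mm whose page tables are live in CR3, instead of the SMP-only active_mm/state pair; flush_ldt() above already keys off it. A placeholder-typed sketch of the resulting check:

    #include <stdbool.h>

    struct mm;                              /* opaque placeholder for mm_struct */

    /* Per-CPU TLB state, modelled loosely on the reworked cpu_tlbstate. */
    struct tlb_state {
        struct mm *loaded_mm;               /* mm whose tables are in CR3 */
    };

    /* Skip work aimed at an mm this CPU is not currently running. */
    static bool cpu_runs_mm(const struct tlb_state *ts, const struct mm *mm)
    {
        return ts->loaded_mm == mm;
    }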
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0a59daf799f8..dae6a5e5ad4a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -92,6 +92,44 @@ __setup("noexec32=", nonx32_setup); | |||
92 | * When memory was added make sure all the processes MM have | 92 | * When memory was added make sure all the processes MM have |
93 | * suitable PGD entries in the local PGD level page. | 93 | * suitable PGD entries in the local PGD level page. |
94 | */ | 94 | */ |
95 | #ifdef CONFIG_X86_5LEVEL | ||
96 | void sync_global_pgds(unsigned long start, unsigned long end) | ||
97 | { | ||
98 | unsigned long addr; | ||
99 | |||
100 | for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) { | ||
101 | const pgd_t *pgd_ref = pgd_offset_k(addr); | ||
102 | struct page *page; | ||
103 | |||
104 | /* Check for overflow */ | ||
105 | if (addr < start) | ||
106 | break; | ||
107 | |||
108 | if (pgd_none(*pgd_ref)) | ||
109 | continue; | ||
110 | |||
111 | spin_lock(&pgd_lock); | ||
112 | list_for_each_entry(page, &pgd_list, lru) { | ||
113 | pgd_t *pgd; | ||
114 | spinlock_t *pgt_lock; | ||
115 | |||
116 | pgd = (pgd_t *)page_address(page) + pgd_index(addr); | ||
117 | /* the pgt_lock only for Xen */ | ||
118 | pgt_lock = &pgd_page_get_mm(page)->page_table_lock; | ||
119 | spin_lock(pgt_lock); | ||
120 | |||
121 | if (!pgd_none(*pgd_ref) && !pgd_none(*pgd)) | ||
122 | BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); | ||
123 | |||
124 | if (pgd_none(*pgd)) | ||
125 | set_pgd(pgd, *pgd_ref); | ||
126 | |||
127 | spin_unlock(pgt_lock); | ||
128 | } | ||
129 | spin_unlock(&pgd_lock); | ||
130 | } | ||
131 | } | ||
132 | #else | ||
95 | void sync_global_pgds(unsigned long start, unsigned long end) | 133 | void sync_global_pgds(unsigned long start, unsigned long end) |
96 | { | 134 | { |
97 | unsigned long addr; | 135 | unsigned long addr; |
@@ -135,6 +173,7 @@ void sync_global_pgds(unsigned long start, unsigned long end) | |||
135 | spin_unlock(&pgd_lock); | 173 | spin_unlock(&pgd_lock); |
136 | } | 174 | } |
137 | } | 175 | } |
176 | #endif | ||
138 | 177 | ||
139 | /* | 178 | /* |
140 | * NOTE: This function is marked __ref because it calls __init function | 179 | * NOTE: This function is marked __ref because it calls __init function |
@@ -585,6 +624,57 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, | |||
585 | return paddr_last; | 624 | return paddr_last; |
586 | } | 625 | } |
587 | 626 | ||
627 | static unsigned long __meminit | ||
628 | phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, | ||
629 | unsigned long page_size_mask) | ||
630 | { | ||
631 | unsigned long paddr_next, paddr_last = paddr_end; | ||
632 | unsigned long vaddr = (unsigned long)__va(paddr); | ||
633 | int i = p4d_index(vaddr); | ||
634 | |||
635 | if (!IS_ENABLED(CONFIG_X86_5LEVEL)) | ||
636 | return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask); | ||
637 | |||
638 | for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) { | ||
639 | p4d_t *p4d; | ||
640 | pud_t *pud; | ||
641 | |||
642 | vaddr = (unsigned long)__va(paddr); | ||
643 | p4d = p4d_page + p4d_index(vaddr); | ||
644 | paddr_next = (paddr & P4D_MASK) + P4D_SIZE; | ||
645 | |||
646 | if (paddr >= paddr_end) { | ||
647 | if (!after_bootmem && | ||
648 | !e820__mapped_any(paddr & P4D_MASK, paddr_next, | ||
649 | E820_TYPE_RAM) && | ||
650 | !e820__mapped_any(paddr & P4D_MASK, paddr_next, | ||
651 | E820_TYPE_RESERVED_KERN)) | ||
652 | set_p4d(p4d, __p4d(0)); | ||
653 | continue; | ||
654 | } | ||
655 | |||
656 | if (!p4d_none(*p4d)) { | ||
657 | pud = pud_offset(p4d, 0); | ||
658 | paddr_last = phys_pud_init(pud, paddr, | ||
659 | paddr_end, | ||
660 | page_size_mask); | ||
661 | __flush_tlb_all(); | ||
662 | continue; | ||
663 | } | ||
664 | |||
665 | pud = alloc_low_page(); | ||
666 | paddr_last = phys_pud_init(pud, paddr, paddr_end, | ||
667 | page_size_mask); | ||
668 | |||
669 | spin_lock(&init_mm.page_table_lock); | ||
670 | p4d_populate(&init_mm, p4d, pud); | ||
671 | spin_unlock(&init_mm.page_table_lock); | ||
672 | } | ||
673 | __flush_tlb_all(); | ||
674 | |||
675 | return paddr_last; | ||
676 | } | ||
677 | |||
588 | /* | 678 | /* |
589 | * Create page table mapping for the physical memory for specific physical | 679 | * Create page table mapping for the physical memory for specific physical |
590 | * addresses. The virtual and physical addresses have to be aligned on PMD level | 680 | * addresses. The virtual and physical addresses have to be aligned on PMD level |
@@ -606,26 +696,26 @@ kernel_physical_mapping_init(unsigned long paddr_start, | |||
606 | for (; vaddr < vaddr_end; vaddr = vaddr_next) { | 696 | for (; vaddr < vaddr_end; vaddr = vaddr_next) { |
607 | pgd_t *pgd = pgd_offset_k(vaddr); | 697 | pgd_t *pgd = pgd_offset_k(vaddr); |
608 | p4d_t *p4d; | 698 | p4d_t *p4d; |
609 | pud_t *pud; | ||
610 | 699 | ||
611 | vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE; | 700 | vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE; |
612 | 701 | ||
613 | BUILD_BUG_ON(pgd_none(*pgd)); | 702 | if (pgd_val(*pgd)) { |
614 | p4d = p4d_offset(pgd, vaddr); | 703 | p4d = (p4d_t *)pgd_page_vaddr(*pgd); |
615 | if (p4d_val(*p4d)) { | 704 | paddr_last = phys_p4d_init(p4d, __pa(vaddr), |
616 | pud = (pud_t *)p4d_page_vaddr(*p4d); | ||
617 | paddr_last = phys_pud_init(pud, __pa(vaddr), | ||
618 | __pa(vaddr_end), | 705 | __pa(vaddr_end), |
619 | page_size_mask); | 706 | page_size_mask); |
620 | continue; | 707 | continue; |
621 | } | 708 | } |
622 | 709 | ||
623 | pud = alloc_low_page(); | 710 | p4d = alloc_low_page(); |
624 | paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end), | 711 | paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end), |
625 | page_size_mask); | 712 | page_size_mask); |
626 | 713 | ||
627 | spin_lock(&init_mm.page_table_lock); | 714 | spin_lock(&init_mm.page_table_lock); |
628 | p4d_populate(&init_mm, p4d, pud); | 715 | if (IS_ENABLED(CONFIG_X86_5LEVEL)) |
716 | pgd_populate(&init_mm, pgd, p4d); | ||
717 | else | ||
718 | p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d); | ||
629 | spin_unlock(&init_mm.page_table_lock); | 719 | spin_unlock(&init_mm.page_table_lock); |
630 | pgd_changed = true; | 720 | pgd_changed = true; |
631 | } | 721 | } |
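
The new phys_p4d_init() above only walks a real p4d level when CONFIG_X86_5LEVEL is set; otherwise it hands the page straight to phys_pud_init(), because without the fifth level the p4d index folds into the pgd. As a rough illustration (a standalone sketch, not kernel code; the shifts are the usual x86-64 9-bits-per-level split), this is how the index breakdown of a virtual address changes when the extra level appears:

    #include <stdio.h>
    #include <stdint.h>

    #define IDX(va, shift) ((unsigned)(((va) >> (shift)) & 0x1ff)) /* 9 bits of index per level */

    static void split(uint64_t va, int five_level)
    {
        int pgd_shift = five_level ? 48 : 39;       /* the top level moves up by 9 bits */

        printf("%d-level: pgd=%u", five_level ? 5 : 4, IDX(va, pgd_shift));
        if (five_level)
            printf(" p4d=%u", IDX(va, 39));         /* p4d index only exists with 5-level paging */
        printf(" pud=%u pmd=%u pte=%u\n", IDX(va, 30), IDX(va, 21), IDX(va, 12));
    }

    int main(void)
    {
        uint64_t va = 0xffff888000000000ull;        /* arbitrary example address */

        split(va, 0);
        split(va, 1);
        return 0;
    }
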
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index bbc558b88a88..4c1b5fd0c7ad 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -424,7 +424,7 @@ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; | |||
424 | static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) | 424 | static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) |
425 | { | 425 | { |
426 | /* Don't assume we're using swapper_pg_dir at this point */ | 426 | /* Don't assume we're using swapper_pg_dir at this point */ |
427 | pgd_t *base = __va(read_cr3()); | 427 | pgd_t *base = __va(read_cr3_pa()); |
428 | pgd_t *pgd = &base[pgd_index(addr)]; | 428 | pgd_t *pgd = &base[pgd_index(addr)]; |
429 | p4d_t *p4d = p4d_offset(pgd, addr); | 429 | p4d_t *p4d = p4d_offset(pgd, addr); |
430 | pud_t *pud = pud_offset(p4d, addr); | 430 | pud_t *pud = pud_offset(p4d, addr); |
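
The read_cr3() to read_cr3_pa()/__read_cr3() conversions in this and several later hunks exist because, with PCID support being prepared, CR3 will carry more than the physical address of the top-level page table. A minimal sketch of the distinction, assuming the usual encoding (low 12 bits for a PCID, bit 63 as a no-flush flag); the mask values here are stated as assumptions mirroring the kernel's definitions:

    #include <stdio.h>
    #include <stdint.h>

    #define CR3_ADDR_MASK 0x7ffffffffffff000ull  /* assumed: bits 62:12 hold the table's physical address */
    #define CR3_PCID_MASK 0xfffull               /* assumed: bits 11:0 hold the PCID */

    int main(void)
    {
        uint64_t cr3 = 0x800000012345f001ull;    /* made-up CR3 value: noflush flag | PA | PCID 1 */

        printf("pa   = %#llx\n", (unsigned long long)(cr3 & CR3_ADDR_MASK));
        printf("pcid = %#llx\n", (unsigned long long)(cr3 & CR3_PCID_MASK));
        return 0;
    }
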
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 0c7d8129bed6..88215ac16b24 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -12,7 +12,7 @@ | |||
12 | #include <asm/tlbflush.h> | 12 | #include <asm/tlbflush.h> |
13 | #include <asm/sections.h> | 13 | #include <asm/sections.h> |
14 | 14 | ||
15 | extern pgd_t early_level4_pgt[PTRS_PER_PGD]; | 15 | extern pgd_t early_top_pgt[PTRS_PER_PGD]; |
16 | extern struct range pfn_mapped[E820_MAX_ENTRIES]; | 16 | extern struct range pfn_mapped[E820_MAX_ENTRIES]; |
17 | 17 | ||
18 | static int __init map_range(struct range *range) | 18 | static int __init map_range(struct range *range) |
@@ -109,8 +109,8 @@ void __init kasan_early_init(void) | |||
109 | for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++) | 109 | for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++) |
110 | kasan_zero_p4d[i] = __p4d(p4d_val); | 110 | kasan_zero_p4d[i] = __p4d(p4d_val); |
111 | 111 | ||
112 | kasan_map_early_shadow(early_level4_pgt); | 112 | kasan_map_early_shadow(early_top_pgt); |
113 | kasan_map_early_shadow(init_level4_pgt); | 113 | kasan_map_early_shadow(init_top_pgt); |
114 | } | 114 | } |
115 | 115 | ||
116 | void __init kasan_init(void) | 116 | void __init kasan_init(void) |
@@ -121,8 +121,8 @@ void __init kasan_init(void) | |||
121 | register_die_notifier(&kasan_die_notifier); | 121 | register_die_notifier(&kasan_die_notifier); |
122 | #endif | 122 | #endif |
123 | 123 | ||
124 | memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt)); | 124 | memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt)); |
125 | load_cr3(early_level4_pgt); | 125 | load_cr3(early_top_pgt); |
126 | __flush_tlb_all(); | 126 | __flush_tlb_all(); |
127 | 127 | ||
128 | clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); | 128 | clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); |
@@ -148,7 +148,7 @@ void __init kasan_init(void) | |||
148 | kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), | 148 | kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), |
149 | (void *)KASAN_SHADOW_END); | 149 | (void *)KASAN_SHADOW_END); |
150 | 150 | ||
151 | load_cr3(init_level4_pgt); | 151 | load_cr3(init_top_pgt); |
152 | __flush_tlb_all(); | 152 | __flush_tlb_all(); |
153 | 153 | ||
154 | /* | 154 | /* |
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index aed206475aa7..af599167fe3c 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -6,12 +6,12 @@ | |||
6 | * | 6 | * |
7 | * Entropy is generated using the KASLR early boot functions now shared in | 7 | * Entropy is generated using the KASLR early boot functions now shared in |
8 | * the lib directory (originally written by Kees Cook). Randomization is | 8 | * the lib directory (originally written by Kees Cook). Randomization is |
9 | * done on PGD & PUD page table levels to increase possible addresses. The | 9 | * done on PGD & P4D/PUD page table levels to increase possible addresses. |
10 | * physical memory mapping code was adapted to support PUD level virtual | 10 | * The physical memory mapping code was adapted to support P4D/PUD level |
11 | * addresses. This implementation on the best configuration provides 30,000 | 11 | * virtual addresses. This implementation on the best configuration provides |
12 | * possible virtual addresses in average for each memory region. An additional | 12 | * 30,000 possible virtual addresses in average for each memory region. |
13 | * low memory page is used to ensure each CPU can start with a PGD aligned | 13 | * An additional low memory page is used to ensure each CPU can start with |
14 | * virtual address (for realmode). | 14 | * a PGD aligned virtual address (for realmode). |
15 | * | 15 | * |
16 | * The order of each memory region is not changed. The feature looks at | 16 | * The order of each memory region is not changed. The feature looks at |
17 | * the available space for the regions based on different configuration | 17 | * the available space for the regions based on different configuration |
@@ -70,7 +70,7 @@ static __initdata struct kaslr_memory_region { | |||
70 | unsigned long *base; | 70 | unsigned long *base; |
71 | unsigned long size_tb; | 71 | unsigned long size_tb; |
72 | } kaslr_regions[] = { | 72 | } kaslr_regions[] = { |
73 | { &page_offset_base, 64/* Maximum */ }, | 73 | { &page_offset_base, 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) /* Maximum */ }, |
74 | { &vmalloc_base, VMALLOC_SIZE_TB }, | 74 | { &vmalloc_base, VMALLOC_SIZE_TB }, |
75 | { &vmemmap_base, 1 }, | 75 | { &vmemmap_base, 1 }, |
76 | }; | 76 | }; |
@@ -142,7 +142,10 @@ void __init kernel_randomize_memory(void) | |||
142 | */ | 142 | */ |
143 | entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); | 143 | entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); |
144 | prandom_bytes_state(&rand_state, &rand, sizeof(rand)); | 144 | prandom_bytes_state(&rand_state, &rand, sizeof(rand)); |
145 | entropy = (rand % (entropy + 1)) & PUD_MASK; | 145 | if (IS_ENABLED(CONFIG_X86_5LEVEL)) |
146 | entropy = (rand % (entropy + 1)) & P4D_MASK; | ||
147 | else | ||
148 | entropy = (rand % (entropy + 1)) & PUD_MASK; | ||
146 | vaddr += entropy; | 149 | vaddr += entropy; |
147 | *kaslr_regions[i].base = vaddr; | 150 | *kaslr_regions[i].base = vaddr; |
148 | 151 | ||
@@ -151,27 +154,21 @@ void __init kernel_randomize_memory(void) | |||
151 | * randomization alignment. | 154 | * randomization alignment. |
152 | */ | 155 | */ |
153 | vaddr += get_padding(&kaslr_regions[i]); | 156 | vaddr += get_padding(&kaslr_regions[i]); |
154 | vaddr = round_up(vaddr + 1, PUD_SIZE); | 157 | if (IS_ENABLED(CONFIG_X86_5LEVEL)) |
158 | vaddr = round_up(vaddr + 1, P4D_SIZE); | ||
159 | else | ||
160 | vaddr = round_up(vaddr + 1, PUD_SIZE); | ||
155 | remain_entropy -= entropy; | 161 | remain_entropy -= entropy; |
156 | } | 162 | } |
157 | } | 163 | } |
158 | 164 | ||
159 | /* | 165 | static void __meminit init_trampoline_pud(void) |
160 | * Create PGD aligned trampoline table to allow real mode initialization | ||
161 | * of additional CPUs. Consume only 1 low memory page. | ||
162 | */ | ||
163 | void __meminit init_trampoline(void) | ||
164 | { | 166 | { |
165 | unsigned long paddr, paddr_next; | 167 | unsigned long paddr, paddr_next; |
166 | pgd_t *pgd; | 168 | pgd_t *pgd; |
167 | pud_t *pud_page, *pud_page_tramp; | 169 | pud_t *pud_page, *pud_page_tramp; |
168 | int i; | 170 | int i; |
169 | 171 | ||
170 | if (!kaslr_memory_enabled()) { | ||
171 | init_trampoline_default(); | ||
172 | return; | ||
173 | } | ||
174 | |||
175 | pud_page_tramp = alloc_low_page(); | 172 | pud_page_tramp = alloc_low_page(); |
176 | 173 | ||
177 | paddr = 0; | 174 | paddr = 0; |
@@ -192,3 +189,49 @@ void __meminit init_trampoline(void) | |||
192 | set_pgd(&trampoline_pgd_entry, | 189 | set_pgd(&trampoline_pgd_entry, |
193 | __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); | 190 | __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); |
194 | } | 191 | } |
192 | |||
193 | static void __meminit init_trampoline_p4d(void) | ||
194 | { | ||
195 | unsigned long paddr, paddr_next; | ||
196 | pgd_t *pgd; | ||
197 | p4d_t *p4d_page, *p4d_page_tramp; | ||
198 | int i; | ||
199 | |||
200 | p4d_page_tramp = alloc_low_page(); | ||
201 | |||
202 | paddr = 0; | ||
203 | pgd = pgd_offset_k((unsigned long)__va(paddr)); | ||
204 | p4d_page = (p4d_t *) pgd_page_vaddr(*pgd); | ||
205 | |||
206 | for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) { | ||
207 | p4d_t *p4d, *p4d_tramp; | ||
208 | unsigned long vaddr = (unsigned long)__va(paddr); | ||
209 | |||
210 | p4d_tramp = p4d_page_tramp + p4d_index(paddr); | ||
211 | p4d = p4d_page + p4d_index(vaddr); | ||
212 | paddr_next = (paddr & P4D_MASK) + P4D_SIZE; | ||
213 | |||
214 | *p4d_tramp = *p4d; | ||
215 | } | ||
216 | |||
217 | set_pgd(&trampoline_pgd_entry, | ||
218 | __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); | ||
219 | } | ||
220 | |||
221 | /* | ||
222 | * Create PGD aligned trampoline table to allow real mode initialization | ||
223 | * of additional CPUs. Consume only 1 low memory page. | ||
224 | */ | ||
225 | void __meminit init_trampoline(void) | ||
226 | { | ||
227 | |||
228 | if (!kaslr_memory_enabled()) { | ||
229 | init_trampoline_default(); | ||
230 | return; | ||
231 | } | ||
232 | |||
233 | if (IS_ENABLED(CONFIG_X86_5LEVEL)) | ||
234 | init_trampoline_p4d(); | ||
235 | else | ||
236 | init_trampoline_pud(); | ||
237 | } | ||
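
To put numbers on the kaslr.c changes above: the direct-mapping region's maximum size is now derived from __PHYSICAL_MASK_SHIFT instead of being hard-coded to 64 TB, and each region is randomized with P4D rather than PUD granularity once 5-level paging is enabled. A small sketch of the arithmetic (the shift values are assumptions matching the conventional x86-64 configuration):

    #include <stdio.h>

    int main(void)
    {
        /* 4-level: 46-bit physical mask, align on PUD (1 GiB = 2^30).   */
        /* 5-level: 52-bit physical mask, align on P4D (512 GiB = 2^39). */
        int phys_shift[2]  = { 46, 52 };
        int align_shift[2] = { 30, 39 };

        for (int i = 0; i < 2; i++) {
            unsigned long long max_tb = 1ull << (phys_shift[i] - 40); /* TB_SHIFT = 40 */

            printf("%d-level: direct map max = %llu TiB, randomization step = %llu GiB\n",
                   i ? 5 : 4, max_tb, 1ull << (align_shift[i] - 30));
        }
        return 0;
    }
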
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 19ad095b41df..797295e792b2 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -74,9 +74,6 @@ static int mmap_is_legacy(void) | |||
74 | if (current->personality & ADDR_COMPAT_LAYOUT) | 74 | if (current->personality & ADDR_COMPAT_LAYOUT) |
75 | return 1; | 75 | return 1; |
76 | 76 | ||
77 | if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) | ||
78 | return 1; | ||
79 | |||
80 | return sysctl_legacy_va_layout; | 77 | return sysctl_legacy_va_layout; |
81 | } | 78 | } |
82 | 79 | ||
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6e7bedf69af7..014d07a80053 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -15,7 +15,7 @@ | |||
15 | #include <linux/debugfs.h> | 15 | #include <linux/debugfs.h> |
16 | 16 | ||
17 | /* | 17 | /* |
18 | * Smarter SMP flushing macros. | 18 | * TLB flushing, formerly SMP-only |
19 | * c/o Linus Torvalds. | 19 | * c/o Linus Torvalds. |
20 | * | 20 | * |
21 | * These mean you can really definitely utterly forget about | 21 | * These mean you can really definitely utterly forget about |
@@ -28,39 +28,28 @@ | |||
28 | * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi | 28 | * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi |
29 | */ | 29 | */ |
30 | 30 | ||
31 | #ifdef CONFIG_SMP | ||
32 | |||
33 | struct flush_tlb_info { | ||
34 | struct mm_struct *flush_mm; | ||
35 | unsigned long flush_start; | ||
36 | unsigned long flush_end; | ||
37 | }; | ||
38 | |||
39 | /* | ||
40 | * We cannot call mmdrop() because we are in interrupt context, | ||
41 | * instead update mm->cpu_vm_mask. | ||
42 | */ | ||
43 | void leave_mm(int cpu) | 31 | void leave_mm(int cpu) |
44 | { | 32 | { |
45 | struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm); | 33 | struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); |
34 | |||
35 | /* | ||
36 | * It's plausible that we're in lazy TLB mode while our mm is init_mm. | ||
37 | * If so, our callers still expect us to flush the TLB, but there | ||
38 | * aren't any user TLB entries in init_mm to worry about. | ||
39 | * | ||
40 | * This needs to happen before any other sanity checks due to | ||
41 | * intel_idle's shenanigans. | ||
42 | */ | ||
43 | if (loaded_mm == &init_mm) | ||
44 | return; | ||
45 | |||
46 | if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) | 46 | if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) |
47 | BUG(); | 47 | BUG(); |
48 | if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { | 48 | |
49 | cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); | 49 | switch_mm(NULL, &init_mm, NULL); |
50 | load_cr3(swapper_pg_dir); | ||
51 | /* | ||
52 | * This gets called in the idle path where RCU | ||
53 | * functions differently. Tracing normally | ||
54 | * uses RCU, so we have to call the tracepoint | ||
55 | * specially here. | ||
56 | */ | ||
57 | trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); | ||
58 | } | ||
59 | } | 50 | } |
60 | EXPORT_SYMBOL_GPL(leave_mm); | 51 | EXPORT_SYMBOL_GPL(leave_mm); |
61 | 52 | ||
62 | #endif /* CONFIG_SMP */ | ||
63 | |||
64 | void switch_mm(struct mm_struct *prev, struct mm_struct *next, | 53 | void switch_mm(struct mm_struct *prev, struct mm_struct *next, |
65 | struct task_struct *tsk) | 54 | struct task_struct *tsk) |
66 | { | 55 | { |
@@ -75,216 +64,167 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, | |||
75 | struct task_struct *tsk) | 64 | struct task_struct *tsk) |
76 | { | 65 | { |
77 | unsigned cpu = smp_processor_id(); | 66 | unsigned cpu = smp_processor_id(); |
67 | struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); | ||
78 | 68 | ||
79 | if (likely(prev != next)) { | 69 | /* |
80 | if (IS_ENABLED(CONFIG_VMAP_STACK)) { | 70 | * NB: The scheduler will call us with prev == next when |
81 | /* | 71 | * switching from lazy TLB mode to normal mode if active_mm |
82 | * If our current stack is in vmalloc space and isn't | 72 | * isn't changing. When this happens, there is no guarantee |
83 | * mapped in the new pgd, we'll double-fault. Forcibly | 73 | * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next. |
84 | * map it. | 74 | * |
85 | */ | 75 | * NB: leave_mm() calls us with prev == NULL and tsk == NULL. |
86 | unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); | 76 | */ |
87 | |||
88 | pgd_t *pgd = next->pgd + stack_pgd_index; | ||
89 | |||
90 | if (unlikely(pgd_none(*pgd))) | ||
91 | set_pgd(pgd, init_mm.pgd[stack_pgd_index]); | ||
92 | } | ||
93 | 77 | ||
94 | #ifdef CONFIG_SMP | 78 | this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); |
95 | this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); | ||
96 | this_cpu_write(cpu_tlbstate.active_mm, next); | ||
97 | #endif | ||
98 | 79 | ||
99 | cpumask_set_cpu(cpu, mm_cpumask(next)); | 80 | if (real_prev == next) { |
81 | /* | ||
82 | * There's nothing to do: we always keep the per-mm control | ||
83 | * regs in sync with cpu_tlbstate.loaded_mm. Just | ||
84 | * sanity-check mm_cpumask. | ||
85 | */ | ||
86 | if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next)))) | ||
87 | cpumask_set_cpu(cpu, mm_cpumask(next)); | ||
88 | return; | ||
89 | } | ||
100 | 90 | ||
91 | if (IS_ENABLED(CONFIG_VMAP_STACK)) { | ||
101 | /* | 92 | /* |
102 | * Re-load page tables. | 93 | * If our current stack is in vmalloc space and isn't |
103 | * | 94 | * mapped in the new pgd, we'll double-fault. Forcibly |
104 | * This logic has an ordering constraint: | 95 | * map it. |
105 | * | ||
106 | * CPU 0: Write to a PTE for 'next' | ||
107 | * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. | ||
108 | * CPU 1: set bit 1 in next's mm_cpumask | ||
109 | * CPU 1: load from the PTE that CPU 0 writes (implicit) | ||
110 | * | ||
111 | * We need to prevent an outcome in which CPU 1 observes | ||
112 | * the new PTE value and CPU 0 observes bit 1 clear in | ||
113 | * mm_cpumask. (If that occurs, then the IPI will never | ||
114 | * be sent, and CPU 0's TLB will contain a stale entry.) | ||
115 | * | ||
116 | * The bad outcome can occur if either CPU's load is | ||
117 | * reordered before that CPU's store, so both CPUs must | ||
118 | * execute full barriers to prevent this from happening. | ||
119 | * | ||
120 | * Thus, switch_mm needs a full barrier between the | ||
121 | * store to mm_cpumask and any operation that could load | ||
122 | * from next->pgd. TLB fills are special and can happen | ||
123 | * due to instruction fetches or for no reason at all, | ||
124 | * and neither LOCK nor MFENCE orders them. | ||
125 | * Fortunately, load_cr3() is serializing and gives the | ||
126 | * ordering guarantee we need. | ||
127 | * | ||
128 | */ | 96 | */ |
129 | load_cr3(next->pgd); | 97 | unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); |
130 | 98 | ||
131 | trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); | 99 | pgd_t *pgd = next->pgd + stack_pgd_index; |
132 | 100 | ||
133 | /* Stop flush ipis for the previous mm */ | 101 | if (unlikely(pgd_none(*pgd))) |
134 | cpumask_clear_cpu(cpu, mm_cpumask(prev)); | 102 | set_pgd(pgd, init_mm.pgd[stack_pgd_index]); |
103 | } | ||
135 | 104 | ||
136 | /* Load per-mm CR4 state */ | 105 | this_cpu_write(cpu_tlbstate.loaded_mm, next); |
137 | load_mm_cr4(next); | ||
138 | 106 | ||
139 | #ifdef CONFIG_MODIFY_LDT_SYSCALL | 107 | WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); |
140 | /* | 108 | cpumask_set_cpu(cpu, mm_cpumask(next)); |
141 | * Load the LDT, if the LDT is different. | 109 | |
142 | * | 110 | /* |
143 | * It's possible that prev->context.ldt doesn't match | 111 | * Re-load page tables. |
144 | * the LDT register. This can happen if leave_mm(prev) | 112 | * |
145 | * was called and then modify_ldt changed | 113 | * This logic has an ordering constraint: |
146 | * prev->context.ldt but suppressed an IPI to this CPU. | 114 | * |
147 | * In this case, prev->context.ldt != NULL, because we | 115 | * CPU 0: Write to a PTE for 'next' |
148 | * never set context.ldt to NULL while the mm still | 116 | * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. |
149 | * exists. That means that next->context.ldt != | 117 | * CPU 1: set bit 1 in next's mm_cpumask |
150 | * prev->context.ldt, because mms never share an LDT. | 118 | * CPU 1: load from the PTE that CPU 0 writes (implicit) |
151 | */ | 119 | * |
152 | if (unlikely(prev->context.ldt != next->context.ldt)) | 120 | * We need to prevent an outcome in which CPU 1 observes |
153 | load_mm_ldt(next); | 121 | * the new PTE value and CPU 0 observes bit 1 clear in |
154 | #endif | 122 | * mm_cpumask. (If that occurs, then the IPI will never |
123 | * be sent, and CPU 0's TLB will contain a stale entry.) | ||
124 | * | ||
125 | * The bad outcome can occur if either CPU's load is | ||
126 | * reordered before that CPU's store, so both CPUs must | ||
127 | * execute full barriers to prevent this from happening. | ||
128 | * | ||
129 | * Thus, switch_mm needs a full barrier between the | ||
130 | * store to mm_cpumask and any operation that could load | ||
131 | * from next->pgd. TLB fills are special and can happen | ||
132 | * due to instruction fetches or for no reason at all, | ||
133 | * and neither LOCK nor MFENCE orders them. | ||
134 | * Fortunately, load_cr3() is serializing and gives the | ||
135 | * ordering guarantee we need. | ||
136 | */ | ||
137 | load_cr3(next->pgd); | ||
138 | |||
139 | /* | ||
140 | * This gets called via leave_mm() in the idle path where RCU | ||
141 | * functions differently. Tracing normally uses RCU, so we have to | ||
142 | * call the tracepoint specially here. | ||
143 | */ | ||
144 | trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); | ||
145 | |||
146 | /* Stop flush ipis for the previous mm */ | ||
147 | WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && | ||
148 | real_prev != &init_mm); | ||
149 | cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); | ||
150 | |||
151 | /* Load per-mm CR4 and LDTR state */ | ||
152 | load_mm_cr4(next); | ||
153 | switch_ldt(real_prev, next); | ||
154 | } | ||
155 | |||
156 | static void flush_tlb_func_common(const struct flush_tlb_info *f, | ||
157 | bool local, enum tlb_flush_reason reason) | ||
158 | { | ||
159 | /* This code cannot presently handle being reentered. */ | ||
160 | VM_WARN_ON(!irqs_disabled()); | ||
161 | |||
162 | if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) { | ||
163 | leave_mm(smp_processor_id()); | ||
164 | return; | ||
155 | } | 165 | } |
156 | #ifdef CONFIG_SMP | ||
157 | else { | ||
158 | this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); | ||
159 | BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); | ||
160 | |||
161 | if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { | ||
162 | /* | ||
163 | * On established mms, the mm_cpumask is only changed | ||
164 | * from irq context, from ptep_clear_flush() while in | ||
165 | * lazy tlb mode, and here. Irqs are blocked during | ||
166 | * schedule, protecting us from simultaneous changes. | ||
167 | */ | ||
168 | cpumask_set_cpu(cpu, mm_cpumask(next)); | ||
169 | 166 | ||
170 | /* | 167 | if (f->end == TLB_FLUSH_ALL) { |
171 | * We were in lazy tlb mode and leave_mm disabled | 168 | local_flush_tlb(); |
172 | * tlb flush IPI delivery. We must reload CR3 | 169 | if (local) |
173 | * to make sure to use no freed page tables. | 170 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); |
174 | * | 171 | trace_tlb_flush(reason, TLB_FLUSH_ALL); |
175 | * As above, load_cr3() is serializing and orders TLB | 172 | } else { |
176 | * fills with respect to the mm_cpumask write. | 173 | unsigned long addr; |
177 | */ | 174 | unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; |
178 | load_cr3(next->pgd); | 175 | addr = f->start; |
179 | trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); | 176 | while (addr < f->end) { |
180 | load_mm_cr4(next); | 177 | __flush_tlb_single(addr); |
181 | load_mm_ldt(next); | 178 | addr += PAGE_SIZE; |
182 | } | 179 | } |
180 | if (local) | ||
181 | count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); | ||
182 | trace_tlb_flush(reason, nr_pages); | ||
183 | } | 183 | } |
184 | #endif | ||
185 | } | 184 | } |
186 | 185 | ||
187 | #ifdef CONFIG_SMP | 186 | static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) |
187 | { | ||
188 | const struct flush_tlb_info *f = info; | ||
188 | 189 | ||
189 | /* | 190 | flush_tlb_func_common(f, true, reason); |
190 | * The flush IPI assumes that a thread switch happens in this order: | 191 | } |
191 | * [cpu0: the cpu that switches] | ||
192 | * 1) switch_mm() either 1a) or 1b) | ||
193 | * 1a) thread switch to a different mm | ||
194 | * 1a1) set cpu_tlbstate to TLBSTATE_OK | ||
195 | * Now the tlb flush NMI handler flush_tlb_func won't call leave_mm | ||
196 | * if cpu0 was in lazy tlb mode. | ||
197 | * 1a2) update cpu active_mm | ||
198 | * Now cpu0 accepts tlb flushes for the new mm. | ||
199 | * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask); | ||
200 | * Now the other cpus will send tlb flush ipis. | ||
201 | * 1a4) change cr3. | ||
202 | * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask); | ||
203 | * Stop ipi delivery for the old mm. This is not synchronized with | ||
204 | * the other cpus, but flush_tlb_func ignore flush ipis for the wrong | ||
205 | * mm, and in the worst case we perform a superfluous tlb flush. | ||
206 | * 1b) thread switch without mm change | ||
207 | * cpu active_mm is correct, cpu0 already handles flush ipis. | ||
208 | * 1b1) set cpu_tlbstate to TLBSTATE_OK | ||
209 | * 1b2) test_and_set the cpu bit in cpu_vm_mask. | ||
210 | * Atomically set the bit [other cpus will start sending flush ipis], | ||
211 | * and test the bit. | ||
212 | * 1b3) if the bit was 0: leave_mm was called, flush the tlb. | ||
213 | * 2) switch %%esp, ie current | ||
214 | * | ||
215 | * The interrupt must handle 2 special cases: | ||
216 | * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. | ||
217 | * - the cpu performs speculative tlb reads, i.e. even if the cpu only | ||
218 | * runs in kernel space, the cpu could load tlb entries for user space | ||
219 | * pages. | ||
220 | * | ||
221 | * The good news is that cpu_tlbstate is local to each cpu, no | ||
222 | * write/read ordering problems. | ||
223 | */ | ||
224 | 192 | ||
225 | /* | 193 | static void flush_tlb_func_remote(void *info) |
226 | * TLB flush funcation: | ||
227 | * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. | ||
228 | * 2) Leave the mm if we are in the lazy tlb mode. | ||
229 | */ | ||
230 | static void flush_tlb_func(void *info) | ||
231 | { | 194 | { |
232 | struct flush_tlb_info *f = info; | 195 | const struct flush_tlb_info *f = info; |
233 | 196 | ||
234 | inc_irq_stat(irq_tlb_count); | 197 | inc_irq_stat(irq_tlb_count); |
235 | 198 | ||
236 | if (f->flush_mm && f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) | 199 | if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm)) |
237 | return; | 200 | return; |
238 | 201 | ||
239 | count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); | 202 | count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); |
240 | if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { | 203 | flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN); |
241 | if (f->flush_end == TLB_FLUSH_ALL) { | ||
242 | local_flush_tlb(); | ||
243 | trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL); | ||
244 | } else { | ||
245 | unsigned long addr; | ||
246 | unsigned long nr_pages = | ||
247 | (f->flush_end - f->flush_start) / PAGE_SIZE; | ||
248 | addr = f->flush_start; | ||
249 | while (addr < f->flush_end) { | ||
250 | __flush_tlb_single(addr); | ||
251 | addr += PAGE_SIZE; | ||
252 | } | ||
253 | trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages); | ||
254 | } | ||
255 | } else | ||
256 | leave_mm(smp_processor_id()); | ||
257 | |||
258 | } | 204 | } |
259 | 205 | ||
260 | void native_flush_tlb_others(const struct cpumask *cpumask, | 206 | void native_flush_tlb_others(const struct cpumask *cpumask, |
261 | struct mm_struct *mm, unsigned long start, | 207 | const struct flush_tlb_info *info) |
262 | unsigned long end) | ||
263 | { | 208 | { |
264 | struct flush_tlb_info info; | ||
265 | |||
266 | info.flush_mm = mm; | ||
267 | info.flush_start = start; | ||
268 | info.flush_end = end; | ||
269 | |||
270 | count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); | 209 | count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); |
271 | if (end == TLB_FLUSH_ALL) | 210 | if (info->end == TLB_FLUSH_ALL) |
272 | trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); | 211 | trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); |
273 | else | 212 | else |
274 | trace_tlb_flush(TLB_REMOTE_SEND_IPI, | 213 | trace_tlb_flush(TLB_REMOTE_SEND_IPI, |
275 | (end - start) >> PAGE_SHIFT); | 214 | (info->end - info->start) >> PAGE_SHIFT); |
276 | 215 | ||
277 | if (is_uv_system()) { | 216 | if (is_uv_system()) { |
278 | unsigned int cpu; | 217 | unsigned int cpu; |
279 | 218 | ||
280 | cpu = smp_processor_id(); | 219 | cpu = smp_processor_id(); |
281 | cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu); | 220 | cpumask = uv_flush_tlb_others(cpumask, info); |
282 | if (cpumask) | 221 | if (cpumask) |
283 | smp_call_function_many(cpumask, flush_tlb_func, | 222 | smp_call_function_many(cpumask, flush_tlb_func_remote, |
284 | &info, 1); | 223 | (void *)info, 1); |
285 | return; | 224 | return; |
286 | } | 225 | } |
287 | smp_call_function_many(cpumask, flush_tlb_func, &info, 1); | 226 | smp_call_function_many(cpumask, flush_tlb_func_remote, |
227 | (void *)info, 1); | ||
288 | } | 228 | } |
289 | 229 | ||
290 | /* | 230 | /* |
@@ -302,85 +242,41 @@ static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; | |||
302 | void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, | 242 | void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, |
303 | unsigned long end, unsigned long vmflag) | 243 | unsigned long end, unsigned long vmflag) |
304 | { | 244 | { |
305 | unsigned long addr; | 245 | int cpu; |
306 | /* do a global flush by default */ | ||
307 | unsigned long base_pages_to_flush = TLB_FLUSH_ALL; | ||
308 | |||
309 | preempt_disable(); | ||
310 | 246 | ||
311 | if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) | 247 | struct flush_tlb_info info = { |
312 | base_pages_to_flush = (end - start) >> PAGE_SHIFT; | 248 | .mm = mm, |
313 | if (base_pages_to_flush > tlb_single_page_flush_ceiling) | 249 | }; |
314 | base_pages_to_flush = TLB_FLUSH_ALL; | ||
315 | 250 | ||
316 | if (current->active_mm != mm) { | 251 | cpu = get_cpu(); |
317 | /* Synchronize with switch_mm. */ | ||
318 | smp_mb(); | ||
319 | 252 | ||
320 | goto out; | 253 | /* Synchronize with switch_mm. */ |
321 | } | 254 | smp_mb(); |
322 | |||
323 | if (!current->mm) { | ||
324 | leave_mm(smp_processor_id()); | ||
325 | 255 | ||
326 | /* Synchronize with switch_mm. */ | 256 | /* Should we flush just the requested range? */ |
327 | smp_mb(); | 257 | if ((end != TLB_FLUSH_ALL) && |
328 | 258 | !(vmflag & VM_HUGETLB) && | |
329 | goto out; | 259 | ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) { |
330 | } | 260 | info.start = start; |
331 | 261 | info.end = end; | |
332 | /* | ||
333 | * Both branches below are implicit full barriers (MOV to CR or | ||
334 | * INVLPG) that synchronize with switch_mm. | ||
335 | */ | ||
336 | if (base_pages_to_flush == TLB_FLUSH_ALL) { | ||
337 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); | ||
338 | local_flush_tlb(); | ||
339 | } else { | 262 | } else { |
340 | /* flush range by one by one 'invlpg' */ | 263 | info.start = 0UL; |
341 | for (addr = start; addr < end; addr += PAGE_SIZE) { | 264 | info.end = TLB_FLUSH_ALL; |
342 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); | ||
343 | __flush_tlb_single(addr); | ||
344 | } | ||
345 | } | ||
346 | trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush); | ||
347 | out: | ||
348 | if (base_pages_to_flush == TLB_FLUSH_ALL) { | ||
349 | start = 0UL; | ||
350 | end = TLB_FLUSH_ALL; | ||
351 | } | 265 | } |
352 | if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) | ||
353 | flush_tlb_others(mm_cpumask(mm), mm, start, end); | ||
354 | preempt_enable(); | ||
355 | } | ||
356 | 266 | ||
357 | void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) | 267 | if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { |
358 | { | 268 | VM_WARN_ON(irqs_disabled()); |
359 | struct mm_struct *mm = vma->vm_mm; | 269 | local_irq_disable(); |
360 | 270 | flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN); | |
361 | preempt_disable(); | 271 | local_irq_enable(); |
362 | |||
363 | if (current->active_mm == mm) { | ||
364 | if (current->mm) { | ||
365 | /* | ||
366 | * Implicit full barrier (INVLPG) that synchronizes | ||
367 | * with switch_mm. | ||
368 | */ | ||
369 | __flush_tlb_one(start); | ||
370 | } else { | ||
371 | leave_mm(smp_processor_id()); | ||
372 | |||
373 | /* Synchronize with switch_mm. */ | ||
374 | smp_mb(); | ||
375 | } | ||
376 | } | 272 | } |
377 | 273 | ||
378 | if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) | 274 | if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) |
379 | flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE); | 275 | flush_tlb_others(mm_cpumask(mm), &info); |
380 | 276 | put_cpu(); | |
381 | preempt_enable(); | ||
382 | } | 277 | } |
383 | 278 | ||
279 | |||
384 | static void do_flush_tlb_all(void *info) | 280 | static void do_flush_tlb_all(void *info) |
385 | { | 281 | { |
386 | count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); | 282 | count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); |
@@ -401,7 +297,7 @@ static void do_kernel_range_flush(void *info) | |||
401 | unsigned long addr; | 297 | unsigned long addr; |
402 | 298 | ||
403 | /* flush range by one by one 'invlpg' */ | 299 | /* flush range by one by one 'invlpg' */ |
404 | for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE) | 300 | for (addr = f->start; addr < f->end; addr += PAGE_SIZE) |
405 | __flush_tlb_single(addr); | 301 | __flush_tlb_single(addr); |
406 | } | 302 | } |
407 | 303 | ||
@@ -410,16 +306,40 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) | |||
410 | 306 | ||
411 | /* Balance as user space task's flush, a bit conservative */ | 307 | /* Balance as user space task's flush, a bit conservative */ |
412 | if (end == TLB_FLUSH_ALL || | 308 | if (end == TLB_FLUSH_ALL || |
413 | (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) { | 309 | (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { |
414 | on_each_cpu(do_flush_tlb_all, NULL, 1); | 310 | on_each_cpu(do_flush_tlb_all, NULL, 1); |
415 | } else { | 311 | } else { |
416 | struct flush_tlb_info info; | 312 | struct flush_tlb_info info; |
417 | info.flush_start = start; | 313 | info.start = start; |
418 | info.flush_end = end; | 314 | info.end = end; |
419 | on_each_cpu(do_kernel_range_flush, &info, 1); | 315 | on_each_cpu(do_kernel_range_flush, &info, 1); |
420 | } | 316 | } |
421 | } | 317 | } |
422 | 318 | ||
319 | void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) | ||
320 | { | ||
321 | struct flush_tlb_info info = { | ||
322 | .mm = NULL, | ||
323 | .start = 0UL, | ||
324 | .end = TLB_FLUSH_ALL, | ||
325 | }; | ||
326 | |||
327 | int cpu = get_cpu(); | ||
328 | |||
329 | if (cpumask_test_cpu(cpu, &batch->cpumask)) { | ||
330 | VM_WARN_ON(irqs_disabled()); | ||
331 | local_irq_disable(); | ||
332 | flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN); | ||
333 | local_irq_enable(); | ||
334 | } | ||
335 | |||
336 | if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) | ||
337 | flush_tlb_others(&batch->cpumask, &info); | ||
338 | cpumask_clear(&batch->cpumask); | ||
339 | |||
340 | put_cpu(); | ||
341 | } | ||
342 | |||
423 | static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, | 343 | static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, |
424 | size_t count, loff_t *ppos) | 344 | size_t count, loff_t *ppos) |
425 | { | 345 | { |
@@ -465,5 +385,3 @@ static int __init create_tlb_single_page_flush_ceiling(void) | |||
465 | return 0; | 385 | return 0; |
466 | } | 386 | } |
467 | late_initcall(create_tlb_single_page_flush_ceiling); | 387 | late_initcall(create_tlb_single_page_flush_ceiling); |
468 | |||
469 | #endif /* CONFIG_SMP */ | ||
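
A compact way to read the tlb.c rework above: the separate local-flush and IPI-handler code paths are folded into one flush_tlb_info that callers fill in and one flush_tlb_func_common() that both paths share, flushing either everything or the requested range page by page. The following is a much-simplified userspace sketch of that shape (no mm, cpumask or tracing handling), not the kernel implementation:

    #include <stdio.h>
    #include <stdbool.h>

    #define PAGE_SHIFT     12
    #define PAGE_SIZE      (1ul << PAGE_SHIFT)
    #define TLB_FLUSH_ALL  (~0ul)

    struct flush_tlb_info {
        unsigned long start;
        unsigned long end;
    };

    static unsigned long tlb_single_page_flush_ceiling = 33;   /* same default as the kernel's */

    /* Shared by the local path and the remote-IPI handler in the real code. */
    static void flush_tlb_func_common(const struct flush_tlb_info *f, bool local)
    {
        if (f->end == TLB_FLUSH_ALL) {
            printf("%s: full TLB flush\n", local ? "local" : "remote");
            return;
        }
        for (unsigned long addr = f->start; addr < f->end; addr += PAGE_SIZE)
            printf("%s: invlpg %#lx\n", local ? "local" : "remote", addr);
    }

    int main(void)
    {
        unsigned long start = 0x400000, end = 0x404000;
        struct flush_tlb_info info = { .start = 0, .end = TLB_FLUSH_ALL };

        /* Mirrors flush_tlb_mm_range(): only a small range is flushed page by page. */
        if ((end - start) >> PAGE_SHIFT <= tlb_single_page_flush_ceiling) {
            info.start = start;
            info.end = end;
        }
        flush_tlb_func_common(&info, true);    /* what the calling CPU does */
        flush_tlb_func_common(&info, false);   /* what each IPI target would do */
        return 0;
    }
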
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 8ff1f95627f9..9bf72f5bfedb 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -80,7 +80,7 @@ pgd_t * __init efi_call_phys_prolog(void) | |||
80 | int n_pgds, i, j; | 80 | int n_pgds, i, j; |
81 | 81 | ||
82 | if (!efi_enabled(EFI_OLD_MEMMAP)) { | 82 | if (!efi_enabled(EFI_OLD_MEMMAP)) { |
83 | save_pgd = (pgd_t *)read_cr3(); | 83 | save_pgd = (pgd_t *)__read_cr3(); |
84 | write_cr3((unsigned long)efi_scratch.efi_pgt); | 84 | write_cr3((unsigned long)efi_scratch.efi_pgt); |
85 | goto out; | 85 | goto out; |
86 | } | 86 | } |
@@ -649,7 +649,7 @@ efi_status_t efi_thunk_set_virtual_address_map( | |||
649 | efi_sync_low_kernel_mappings(); | 649 | efi_sync_low_kernel_mappings(); |
650 | local_irq_save(flags); | 650 | local_irq_save(flags); |
651 | 651 | ||
652 | efi_scratch.prev_cr3 = read_cr3(); | 652 | efi_scratch.prev_cr3 = __read_cr3(); |
653 | write_cr3((unsigned long)efi_scratch.efi_pgt); | 653 | write_cr3((unsigned long)efi_scratch.efi_pgt); |
654 | __flush_tlb_all(); | 654 | __flush_tlb_all(); |
655 | 655 | ||
diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c
index c5350fd27d70..0668aaff8bfe 100644
--- a/arch/x86/platform/olpc/olpc-xo1-pm.c
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -77,7 +77,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state) | |||
77 | 77 | ||
78 | asmlinkage __visible int xo1_do_sleep(u8 sleep_state) | 78 | asmlinkage __visible int xo1_do_sleep(u8 sleep_state) |
79 | { | 79 | { |
80 | void *pgd_addr = __va(read_cr3()); | 80 | void *pgd_addr = __va(read_cr3_pa()); |
81 | 81 | ||
82 | /* Program wakeup mask (using dword access to CS5536_PM1_EN) */ | 82 | /* Program wakeup mask (using dword access to CS5536_PM1_EN) */ |
83 | outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS); | 83 | outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS); |
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 795671593528..2983faab5b18 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1123,11 +1123,9 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp, | |||
1123 | * done. The returned pointer is valid till preemption is re-enabled. | 1123 | * done. The returned pointer is valid till preemption is re-enabled. |
1124 | */ | 1124 | */ |
1125 | const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, | 1125 | const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, |
1126 | struct mm_struct *mm, | 1126 | const struct flush_tlb_info *info) |
1127 | unsigned long start, | ||
1128 | unsigned long end, | ||
1129 | unsigned int cpu) | ||
1130 | { | 1127 | { |
1128 | unsigned int cpu = smp_processor_id(); | ||
1131 | int locals = 0, remotes = 0, hubs = 0; | 1129 | int locals = 0, remotes = 0, hubs = 0; |
1132 | struct bau_desc *bau_desc; | 1130 | struct bau_desc *bau_desc; |
1133 | struct cpumask *flush_mask; | 1131 | struct cpumask *flush_mask; |
@@ -1181,8 +1179,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, | |||
1181 | 1179 | ||
1182 | record_send_statistics(stat, locals, hubs, remotes, bau_desc); | 1180 | record_send_statistics(stat, locals, hubs, remotes, bau_desc); |
1183 | 1181 | ||
1184 | if (!end || (end - start) <= PAGE_SIZE) | 1182 | if (!info->end || (info->end - info->start) <= PAGE_SIZE) |
1185 | address = start; | 1183 | address = info->start; |
1186 | else | 1184 | else |
1187 | address = TLB_FLUSH_ALL; | 1185 | address = TLB_FLUSH_ALL; |
1188 | 1186 | ||
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 6b05a9219ea2..78459a6d455a 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -129,7 +129,7 @@ static void __save_processor_state(struct saved_context *ctxt) | |||
129 | */ | 129 | */ |
130 | ctxt->cr0 = read_cr0(); | 130 | ctxt->cr0 = read_cr0(); |
131 | ctxt->cr2 = read_cr2(); | 131 | ctxt->cr2 = read_cr2(); |
132 | ctxt->cr3 = read_cr3(); | 132 | ctxt->cr3 = __read_cr3(); |
133 | ctxt->cr4 = __read_cr4(); | 133 | ctxt->cr4 = __read_cr4(); |
134 | #ifdef CONFIG_X86_64 | 134 | #ifdef CONFIG_X86_64 |
135 | ctxt->cr8 = read_cr8(); | 135 | ctxt->cr8 = read_cr8(); |
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index a6e21fee22ea..e3e62c8a8e70 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -150,7 +150,8 @@ static int relocate_restore_code(void) | |||
150 | memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE); | 150 | memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE); |
151 | 151 | ||
152 | /* Make the page containing the relocated code executable */ | 152 | /* Make the page containing the relocated code executable */ |
153 | pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); | 153 | pgd = (pgd_t *)__va(read_cr3_pa()) + |
154 | pgd_index(relocated_restore_code); | ||
154 | p4d = p4d_offset(pgd, relocated_restore_code); | 155 | p4d = p4d_offset(pgd, relocated_restore_code); |
155 | if (p4d_large(*p4d)) { | 156 | if (p4d_large(*p4d)) { |
156 | set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); | 157 | set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); |
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index a163a90af4aa..cd4be19c36dc 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -102,7 +102,7 @@ static void __init setup_real_mode(void) | |||
102 | 102 | ||
103 | trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); | 103 | trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); |
104 | trampoline_pgd[0] = trampoline_pgd_entry.pgd; | 104 | trampoline_pgd[0] = trampoline_pgd_entry.pgd; |
105 | trampoline_pgd[511] = init_level4_pgt[511].pgd; | 105 | trampoline_pgd[511] = init_top_pgt[511].pgd; |
106 | #endif | 106 | #endif |
107 | } | 107 | } |
108 | 108 | ||
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 1f386d7fdf70..1d7a7213a310 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -975,37 +975,32 @@ static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) | |||
975 | spin_unlock(&mm->page_table_lock); | 975 | spin_unlock(&mm->page_table_lock); |
976 | } | 976 | } |
977 | 977 | ||
978 | 978 | static void drop_mm_ref_this_cpu(void *info) | |
979 | #ifdef CONFIG_SMP | ||
980 | /* Another cpu may still have their %cr3 pointing at the pagetable, so | ||
981 | we need to repoint it somewhere else before we can unpin it. */ | ||
982 | static void drop_other_mm_ref(void *info) | ||
983 | { | 979 | { |
984 | struct mm_struct *mm = info; | 980 | struct mm_struct *mm = info; |
985 | struct mm_struct *active_mm; | ||
986 | |||
987 | active_mm = this_cpu_read(cpu_tlbstate.active_mm); | ||
988 | 981 | ||
989 | if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) | 982 | if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) |
990 | leave_mm(smp_processor_id()); | 983 | leave_mm(smp_processor_id()); |
991 | 984 | ||
992 | /* If this cpu still has a stale cr3 reference, then make sure | 985 | /* |
993 | it has been flushed. */ | 986 | * If this cpu still has a stale cr3 reference, then make sure |
987 | * it has been flushed. | ||
988 | */ | ||
994 | if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd)) | 989 | if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd)) |
995 | load_cr3(swapper_pg_dir); | 990 | xen_mc_flush(); |
996 | } | 991 | } |
997 | 992 | ||
993 | #ifdef CONFIG_SMP | ||
994 | /* | ||
995 | * Another cpu may still have their %cr3 pointing at the pagetable, so | ||
996 | * we need to repoint it somewhere else before we can unpin it. | ||
997 | */ | ||
998 | static void xen_drop_mm_ref(struct mm_struct *mm) | 998 | static void xen_drop_mm_ref(struct mm_struct *mm) |
999 | { | 999 | { |
1000 | cpumask_var_t mask; | 1000 | cpumask_var_t mask; |
1001 | unsigned cpu; | 1001 | unsigned cpu; |
1002 | 1002 | ||
1003 | if (current->active_mm == mm) { | 1003 | drop_mm_ref_this_cpu(mm); |
1004 | if (current->mm == mm) | ||
1005 | load_cr3(swapper_pg_dir); | ||
1006 | else | ||
1007 | leave_mm(smp_processor_id()); | ||
1008 | } | ||
1009 | 1004 | ||
1010 | /* Get the "official" set of cpus referring to our pagetable. */ | 1005 | /* Get the "official" set of cpus referring to our pagetable. */ |
1011 | if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { | 1006 | if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { |
@@ -1013,31 +1008,31 @@ static void xen_drop_mm_ref(struct mm_struct *mm) | |||
1013 | if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) | 1008 | if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) |
1014 | && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) | 1009 | && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) |
1015 | continue; | 1010 | continue; |
1016 | smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); | 1011 | smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1); |
1017 | } | 1012 | } |
1018 | return; | 1013 | return; |
1019 | } | 1014 | } |
1020 | cpumask_copy(mask, mm_cpumask(mm)); | 1015 | cpumask_copy(mask, mm_cpumask(mm)); |
1021 | 1016 | ||
1022 | /* It's possible that a vcpu may have a stale reference to our | 1017 | /* |
1023 | cr3, because its in lazy mode, and it hasn't yet flushed | 1018 | * It's possible that a vcpu may have a stale reference to our |
1024 | its set of pending hypercalls yet. In this case, we can | 1019 | * cr3, because its in lazy mode, and it hasn't yet flushed |
1025 | look at its actual current cr3 value, and force it to flush | 1020 | * its set of pending hypercalls yet. In this case, we can |
1026 | if needed. */ | 1021 | * look at its actual current cr3 value, and force it to flush |
1022 | * if needed. | ||
1023 | */ | ||
1027 | for_each_online_cpu(cpu) { | 1024 | for_each_online_cpu(cpu) { |
1028 | if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) | 1025 | if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) |
1029 | cpumask_set_cpu(cpu, mask); | 1026 | cpumask_set_cpu(cpu, mask); |
1030 | } | 1027 | } |
1031 | 1028 | ||
1032 | if (!cpumask_empty(mask)) | 1029 | smp_call_function_many(mask, drop_mm_ref_this_cpu, mm, 1); |
1033 | smp_call_function_many(mask, drop_other_mm_ref, mm, 1); | ||
1034 | free_cpumask_var(mask); | 1030 | free_cpumask_var(mask); |
1035 | } | 1031 | } |
1036 | #else | 1032 | #else |
1037 | static void xen_drop_mm_ref(struct mm_struct *mm) | 1033 | static void xen_drop_mm_ref(struct mm_struct *mm) |
1038 | { | 1034 | { |
1039 | if (current->active_mm == mm) | 1035 | drop_mm_ref_this_cpu(mm); |
1040 | load_cr3(swapper_pg_dir); | ||
1041 | } | 1036 | } |
1042 | #endif | 1037 | #endif |
1043 | 1038 | ||
@@ -1366,8 +1361,7 @@ static void xen_flush_tlb_single(unsigned long addr) | |||
1366 | } | 1361 | } |
1367 | 1362 | ||
1368 | static void xen_flush_tlb_others(const struct cpumask *cpus, | 1363 | static void xen_flush_tlb_others(const struct cpumask *cpus, |
1369 | struct mm_struct *mm, unsigned long start, | 1364 | const struct flush_tlb_info *info) |
1370 | unsigned long end) | ||
1371 | { | 1365 | { |
1372 | struct { | 1366 | struct { |
1373 | struct mmuext_op op; | 1367 | struct mmuext_op op; |
@@ -1379,7 +1373,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, | |||
1379 | } *args; | 1373 | } *args; |
1380 | struct multicall_space mcs; | 1374 | struct multicall_space mcs; |
1381 | 1375 | ||
1382 | trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); | 1376 | trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end); |
1383 | 1377 | ||
1384 | if (cpumask_empty(cpus)) | 1378 | if (cpumask_empty(cpus)) |
1385 | return; /* nothing to do */ | 1379 | return; /* nothing to do */ |
@@ -1393,9 +1387,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, | |||
1393 | cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); | 1387 | cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); |
1394 | 1388 | ||
1395 | args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; | 1389 | args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; |
1396 | if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { | 1390 | if (info->end != TLB_FLUSH_ALL && |
1391 | (info->end - info->start) <= PAGE_SIZE) { | ||
1397 | args->op.cmd = MMUEXT_INVLPG_MULTI; | 1392 | args->op.cmd = MMUEXT_INVLPG_MULTI; |
1398 | args->op.arg1.linear_addr = start; | 1393 | args->op.arg1.linear_addr = info->start; |
1399 | } | 1394 | } |
1400 | 1395 | ||
1401 | MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); | 1396 | MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); |
@@ -1470,8 +1465,8 @@ static void xen_write_cr3(unsigned long cr3) | |||
1470 | * At the start of the day - when Xen launches a guest, it has already | 1465 | * At the start of the day - when Xen launches a guest, it has already |
1471 | * built pagetables for the guest. We diligently look over them | 1466 | * built pagetables for the guest. We diligently look over them |
1472 | * in xen_setup_kernel_pagetable and graft as appropriate them in the | 1467 | * in xen_setup_kernel_pagetable and graft as appropriate them in the |
1473 | * init_level4_pgt and its friends. Then when we are happy we load | 1468 | * init_top_pgt and its friends. Then when we are happy we load |
1474 | * the new init_level4_pgt - and continue on. | 1469 | * the new init_top_pgt - and continue on. |
1475 | * | 1470 | * |
1476 | * The generic code starts (start_kernel) and 'init_mem_mapping' sets | 1471 | * The generic code starts (start_kernel) and 'init_mem_mapping' sets |
1477 | * up the rest of the pagetables. When it has completed it loads the cr3. | 1472 | * up the rest of the pagetables. When it has completed it loads the cr3. |
@@ -1914,12 +1909,12 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) | |||
1914 | pt_end = pt_base + xen_start_info->nr_pt_frames; | 1909 | pt_end = pt_base + xen_start_info->nr_pt_frames; |
1915 | 1910 | ||
1916 | /* Zap identity mapping */ | 1911 | /* Zap identity mapping */ |
1917 | init_level4_pgt[0] = __pgd(0); | 1912 | init_top_pgt[0] = __pgd(0); |
1918 | 1913 | ||
1919 | /* Pre-constructed entries are in pfn, so convert to mfn */ | 1914 | /* Pre-constructed entries are in pfn, so convert to mfn */ |
1920 | /* L4[272] -> level3_ident_pgt */ | 1915 | /* L4[272] -> level3_ident_pgt */ |
1921 | /* L4[511] -> level3_kernel_pgt */ | 1916 | /* L4[511] -> level3_kernel_pgt */ |
1922 | convert_pfn_mfn(init_level4_pgt); | 1917 | convert_pfn_mfn(init_top_pgt); |
1923 | 1918 | ||
1924 | /* L3_i[0] -> level2_ident_pgt */ | 1919 | /* L3_i[0] -> level2_ident_pgt */ |
1925 | convert_pfn_mfn(level3_ident_pgt); | 1920 | convert_pfn_mfn(level3_ident_pgt); |
@@ -1950,10 +1945,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) | |||
1950 | /* Copy the initial P->M table mappings if necessary. */ | 1945 | /* Copy the initial P->M table mappings if necessary. */ |
1951 | i = pgd_index(xen_start_info->mfn_list); | 1946 | i = pgd_index(xen_start_info->mfn_list); |
1952 | if (i && i < pgd_index(__START_KERNEL_map)) | 1947 | if (i && i < pgd_index(__START_KERNEL_map)) |
1953 | init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; | 1948 | init_top_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; |
1954 | 1949 | ||
1955 | /* Make pagetable pieces RO */ | 1950 | /* Make pagetable pieces RO */ |
1956 | set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); | 1951 | set_page_prot(init_top_pgt, PAGE_KERNEL_RO); |
1957 | set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); | 1952 | set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); |
1958 | set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); | 1953 | set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); |
1959 | set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); | 1954 | set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); |
@@ -1964,7 +1959,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) | |||
1964 | 1959 | ||
1965 | /* Pin down new L4 */ | 1960 | /* Pin down new L4 */ |
1966 | pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, | 1961 | pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, |
1967 | PFN_DOWN(__pa_symbol(init_level4_pgt))); | 1962 | PFN_DOWN(__pa_symbol(init_top_pgt))); |
1968 | 1963 | ||
1969 | /* Unpin Xen-provided one */ | 1964 | /* Unpin Xen-provided one */ |
1970 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); | 1965 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); |
@@ -1974,7 +1969,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) | |||
1974 | * attach it to, so make sure we just set kernel pgd. | 1969 | * attach it to, so make sure we just set kernel pgd. |
1975 | */ | 1970 | */ |
1976 | xen_mc_batch(); | 1971 | xen_mc_batch(); |
1977 | __xen_write_cr3(true, __pa(init_level4_pgt)); | 1972 | __xen_write_cr3(true, __pa(init_top_pgt)); |
1978 | xen_mc_issue(PARAVIRT_LAZY_CPU); | 1973 | xen_mc_issue(PARAVIRT_LAZY_CPU); |
1979 | 1974 | ||
1980 | /* We can't that easily rip out L3 and L2, as the Xen pagetables are | 1975 | /* We can't that easily rip out L3 and L2, as the Xen pagetables are |
@@ -2022,7 +2017,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) | |||
2022 | pmd_t pmd; | 2017 | pmd_t pmd; |
2023 | pte_t pte; | 2018 | pte_t pte; |
2024 | 2019 | ||
2025 | pa = read_cr3(); | 2020 | pa = read_cr3_pa(); |
2026 | pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * | 2021 | pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * |
2027 | sizeof(pgd))); | 2022 | sizeof(pgd))); |
2028 | if (!pgd_present(pgd)) | 2023 | if (!pgd_present(pgd)) |
@@ -2102,7 +2097,7 @@ void __init xen_relocate_p2m(void) | |||
2102 | pt_phys = pmd_phys + PFN_PHYS(n_pmd); | 2097 | pt_phys = pmd_phys + PFN_PHYS(n_pmd); |
2103 | p2m_pfn = PFN_DOWN(pt_phys) + n_pt; | 2098 | p2m_pfn = PFN_DOWN(pt_phys) + n_pt; |
2104 | 2099 | ||
2105 | pgd = __va(read_cr3()); | 2100 | pgd = __va(read_cr3_pa()); |
2106 | new_p2m = (unsigned long *)(2 * PGDIR_SIZE); | 2101 | new_p2m = (unsigned long *)(2 * PGDIR_SIZE); |
2107 | idx_p4d = 0; | 2102 | idx_p4d = 0; |
2108 | save_pud = n_pud; | 2103 | save_pud = n_pud; |
@@ -2209,7 +2204,7 @@ static void __init xen_write_cr3_init(unsigned long cr3) | |||
2209 | { | 2204 | { |
2210 | unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); | 2205 | unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); |
2211 | 2206 | ||
2212 | BUG_ON(read_cr3() != __pa(initial_page_table)); | 2207 | BUG_ON(read_cr3_pa() != __pa(initial_page_table)); |
2213 | BUG_ON(cr3 != __pa(swapper_pg_dir)); | 2208 | BUG_ON(cr3 != __pa(swapper_pg_dir)); |
2214 | 2209 | ||
2215 | /* | 2210 | /* |
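
The Xen change above collapses the old drop_other_mm_ref() plus open-coded local handling into a single drop_mm_ref_this_cpu() that is called directly for the current CPU and reused as the cross-call callback for the others, which is also why the UP variant reduces to one call. A toy userspace sketch of that pattern (the cross-call below is a plain stand-in, not the real IPI or multicall flush):

    #include <stdio.h>

    struct mm { int id; };

    /* Same role as drop_mm_ref_this_cpu(): safe to run on any CPU, including this one. */
    static void drop_mm_ref_this_cpu(void *info)
    {
        struct mm *mm = info;

        printf("cpu drops reference to mm %d\n", mm->id);
    }

    /* Stand-in for smp_call_function_single(): run fn(info) on another CPU. */
    static void cross_call(int cpu, void (*fn)(void *), void *info)
    {
        printf("IPI to cpu %d: ", cpu);
        fn(info);
    }

    static void xen_drop_mm_ref(struct mm *mm)
    {
        drop_mm_ref_this_cpu(mm);               /* current CPU, no IPI needed */
        for (int cpu = 1; cpu < 4; cpu++)       /* pretend CPUs 1-3 still reference mm */
            cross_call(cpu, drop_mm_ref_this_cpu, mm);
    }

    int main(void)
    {
        struct mm mm = { .id = 42 };

        xen_drop_mm_ref(&mm);
        return 0;
    }
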
diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S
index 5e246716d58f..e1a5fbeae08d 100644
--- a/arch/x86/xen/xen-pvh.S
+++ b/arch/x86/xen/xen-pvh.S
@@ -87,7 +87,7 @@ ENTRY(pvh_start_xen) | |||
87 | wrmsr | 87 | wrmsr |
88 | 88 | ||
89 | /* Enable pre-constructed page tables. */ | 89 | /* Enable pre-constructed page tables. */ |
90 | mov $_pa(init_level4_pgt), %eax | 90 | mov $_pa(init_top_pgt), %eax |
91 | mov %eax, %cr3 | 91 | mov %eax, %cr3 |
92 | mov $(X86_CR0_PG | X86_CR0_PE), %eax | 92 | mov $(X86_CR0_PG | X86_CR0_PE), %eax |
93 | mov %eax, %cr0 | 93 | mov %eax, %cr0 |