author    Linus Torvalds <torvalds@linux-foundation.org>  2017-07-03 17:45:09 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2017-07-03 17:45:09 -0400
commit    7a69f9c60b49699579f5bfb71f928cceba0afe1a (patch)
tree      bf3b5640bbd9f23beeb5a55d18348d65bafff8e8 /arch/x86
parent    9bc088ab66be8978fbc981ba9644468fa2c2fd3f (diff)
parent    8781fb7e9749da424e01daacd14834b674658c63 (diff)

Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "The main changes in this cycle were:

   - Continued work to add support for 5-level paging provided by future
     Intel CPUs. In particular we switch the x86 GUP code to the generic
     implementation. (Kirill A. Shutemov)

   - Continued work to add PCID CPU support to native kernels as well.
     In this round most of the focus is on reworking/refreshing the TLB
     flush infrastructure for the upcoming PCID changes. (Andy
     Lutomirski)"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (34 commits)
  x86/mm: Delete a big outdated comment about TLB flushing
  x86/mm: Don't reenter flush_tlb_func_common()
  x86/KASLR: Fix detection 32/64 bit bootloaders for 5-level paging
  x86/ftrace: Exclude functions in head64.c from function-tracing
  x86/mmap, ASLR: Do not treat unlimited-stack tasks as legacy mmap
  x86/mm: Remove reset_lazy_tlbstate()
  x86/ldt: Simplify the LDT switching logic
  x86/boot/64: Put __startup_64() into .head.text
  x86/mm: Add support for 5-level paging for KASLR
  x86/mm: Make kernel_physical_mapping_init() support 5-level paging
  x86/mm: Add sync_global_pgds() for configuration with 5-level paging
  x86/boot/64: Add support of additional page table level during early boot
  x86/boot/64: Rename init_level4_pgt and early_level4_pgt
  x86/boot/64: Rewrite startup_64() in C
  x86/boot/compressed: Enable 5-level paging during decompression stage
  x86/boot/efi: Define __KERNEL32_CS GDT on 64-bit configurations
  x86/boot/efi: Fix __KERNEL_CS definition of GDT entry on 64-bit configurations
  x86/boot/efi: Cleanup initialization of GDT entries
  x86/asm: Fix comment in return_from_SYSCALL_64()
  x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation
  ...
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 5
-rw-r--r--  arch/x86/boot/compressed/eboot.c | 73
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 86
-rw-r--r--  arch/x86/boot/compressed/pagetable.c | 18
-rw-r--r--  arch/x86/entry/entry_64.S | 3
-rw-r--r--  arch/x86/events/core.c | 5
-rw-r--r--  arch/x86/include/asm/efi.h | 2
-rw-r--r--  arch/x86/include/asm/hardirq.h | 2
-rw-r--r--  arch/x86/include/asm/mmu.h | 6
-rw-r--r--  arch/x86/include/asm/mmu_context.h | 63
-rw-r--r--  arch/x86/include/asm/paravirt.h | 8
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 5
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h | 47
-rw-r--r--  arch/x86/include/asm/pgtable.h | 55
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 22
-rw-r--r--  arch/x86/include/asm/processor-flags.h | 36
-rw-r--r--  arch/x86/include/asm/processor.h | 8
-rw-r--r--  arch/x86/include/asm/special_insns.h | 10
-rw-r--r--  arch/x86/include/asm/tlbbatch.h | 14
-rw-r--r--  arch/x86/include/asm/tlbflush.h | 114
-rw-r--r--  arch/x86/include/asm/uv/uv.h | 11
-rw-r--r--  arch/x86/include/uapi/asm/processor-flags.h | 2
-rw-r--r--  arch/x86/kernel/Makefile | 1
-rw-r--r--  arch/x86/kernel/espfix_64.c | 2
-rw-r--r--  arch/x86/kernel/head64.c | 145
-rw-r--r--  arch/x86/kernel/head_64.S | 131
-rw-r--r--  arch/x86/kernel/ldt.c | 56
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 2
-rw-r--r--  arch/x86/kernel/paravirt.c | 2
-rw-r--r--  arch/x86/kernel/process_32.c | 2
-rw-r--r--  arch/x86/kernel/process_64.c | 4
-rw-r--r--  arch/x86/kernel/smpboot.c | 1
-rw-r--r--  arch/x86/kernel/step.c | 2
-rw-r--r--  arch/x86/kvm/vmx.c | 21
-rw-r--r--  arch/x86/math-emu/fpu_system.h | 2
-rw-r--r--  arch/x86/mm/Makefile | 2
-rw-r--r--  arch/x86/mm/dump_pagetables.c | 2
-rw-r--r--  arch/x86/mm/fault.c | 10
-rw-r--r--  arch/x86/mm/gup.c | 496
-rw-r--r--  arch/x86/mm/init.c | 4
-rw-r--r--  arch/x86/mm/init_64.c | 108
-rw-r--r--  arch/x86/mm/ioremap.c | 2
-rw-r--r--  arch/x86/mm/kasan_init_64.c | 12
-rw-r--r--  arch/x86/mm/kaslr.c | 81
-rw-r--r--  arch/x86/mm/mmap.c | 3
-rw-r--r--  arch/x86/mm/tlb.c | 458
-rw-r--r--  arch/x86/platform/efi/efi_64.c | 4
-rw-r--r--  arch/x86/platform/olpc/olpc-xo1-pm.c | 2
-rw-r--r--  arch/x86/platform/uv/tlb_uv.c | 10
-rw-r--r--  arch/x86/power/cpu.c | 2
-rw-r--r--  arch/x86/power/hibernate_64.c | 3
-rw-r--r--  arch/x86/realmode/init.c | 2
-rw-r--r--  arch/x86/xen/mmu_pv.c | 83
-rw-r--r--  arch/x86/xen/xen-pvh.S | 2
54 files changed, 1061 insertions, 1191 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0efb4c9497bc..737212c0333e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -69,7 +69,7 @@ config X86
69 select ARCH_USE_BUILTIN_BSWAP 69 select ARCH_USE_BUILTIN_BSWAP
70 select ARCH_USE_QUEUED_RWLOCKS 70 select ARCH_USE_QUEUED_RWLOCKS
71 select ARCH_USE_QUEUED_SPINLOCKS 71 select ARCH_USE_QUEUED_SPINLOCKS
72 select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP 72 select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
73 select ARCH_WANT_FRAME_POINTERS 73 select ARCH_WANT_FRAME_POINTERS
74 select ARCH_WANTS_DYNAMIC_TASK_STRUCT 74 select ARCH_WANTS_DYNAMIC_TASK_STRUCT
75 select BUILDTIME_EXTABLE_SORT 75 select BUILDTIME_EXTABLE_SORT
@@ -2793,6 +2793,9 @@ config X86_DMA_REMAP
2793 bool 2793 bool
2794 depends on STA2X11 2794 depends on STA2X11
2795 2795
2796config HAVE_GENERIC_GUP
2797 def_bool y
2798
2796source "net/Kconfig" 2799source "net/Kconfig"
2797 2800
2798source "drivers/Kconfig" 2801source "drivers/Kconfig"
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index cbf4b87f55b9..c3e869eaef0c 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -1046,9 +1046,31 @@ struct boot_params *efi_main(struct efi_config *c,
1046 memset((char *)gdt->address, 0x0, gdt->size); 1046 memset((char *)gdt->address, 0x0, gdt->size);
1047 desc = (struct desc_struct *)gdt->address; 1047 desc = (struct desc_struct *)gdt->address;
1048 1048
1049 /* The first GDT is a dummy and the second is unused. */ 1049 /* The first GDT is a dummy. */
1050 desc += 2; 1050 desc++;
1051
1052 if (IS_ENABLED(CONFIG_X86_64)) {
1053 /* __KERNEL32_CS */
1054 desc->limit0 = 0xffff;
1055 desc->base0 = 0x0000;
1056 desc->base1 = 0x0000;
1057 desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
1058 desc->s = DESC_TYPE_CODE_DATA;
1059 desc->dpl = 0;
1060 desc->p = 1;
1061 desc->limit = 0xf;
1062 desc->avl = 0;
1063 desc->l = 0;
1064 desc->d = SEG_OP_SIZE_32BIT;
1065 desc->g = SEG_GRANULARITY_4KB;
1066 desc->base2 = 0x00;
1067 desc++;
1068 } else {
1069 /* Second entry is unused on 32-bit */
1070 desc++;
1071 }
1051 1072
1073 /* __KERNEL_CS */
1052 desc->limit0 = 0xffff; 1074 desc->limit0 = 0xffff;
1053 desc->base0 = 0x0000; 1075 desc->base0 = 0x0000;
1054 desc->base1 = 0x0000; 1076 desc->base1 = 0x0000;
@@ -1058,12 +1080,18 @@ struct boot_params *efi_main(struct efi_config *c,
1058 desc->p = 1; 1080 desc->p = 1;
1059 desc->limit = 0xf; 1081 desc->limit = 0xf;
1060 desc->avl = 0; 1082 desc->avl = 0;
1061 desc->l = 0; 1083 if (IS_ENABLED(CONFIG_X86_64)) {
1062 desc->d = SEG_OP_SIZE_32BIT; 1084 desc->l = 1;
1085 desc->d = 0;
1086 } else {
1087 desc->l = 0;
1088 desc->d = SEG_OP_SIZE_32BIT;
1089 }
1063 desc->g = SEG_GRANULARITY_4KB; 1090 desc->g = SEG_GRANULARITY_4KB;
1064 desc->base2 = 0x00; 1091 desc->base2 = 0x00;
1065
1066 desc++; 1092 desc++;
1093
1094 /* __KERNEL_DS */
1067 desc->limit0 = 0xffff; 1095 desc->limit0 = 0xffff;
1068 desc->base0 = 0x0000; 1096 desc->base0 = 0x0000;
1069 desc->base1 = 0x0000; 1097 desc->base1 = 0x0000;
@@ -1077,24 +1105,25 @@ struct boot_params *efi_main(struct efi_config *c,
1077 desc->d = SEG_OP_SIZE_32BIT; 1105 desc->d = SEG_OP_SIZE_32BIT;
1078 desc->g = SEG_GRANULARITY_4KB; 1106 desc->g = SEG_GRANULARITY_4KB;
1079 desc->base2 = 0x00; 1107 desc->base2 = 0x00;
1080
1081#ifdef CONFIG_X86_64
1082 /* Task segment value */
1083 desc++; 1108 desc++;
1084 desc->limit0 = 0x0000; 1109
1085 desc->base0 = 0x0000; 1110 if (IS_ENABLED(CONFIG_X86_64)) {
1086 desc->base1 = 0x0000; 1111 /* Task segment value */
1087 desc->type = SEG_TYPE_TSS; 1112 desc->limit0 = 0x0000;
1088 desc->s = 0; 1113 desc->base0 = 0x0000;
1089 desc->dpl = 0; 1114 desc->base1 = 0x0000;
1090 desc->p = 1; 1115 desc->type = SEG_TYPE_TSS;
1091 desc->limit = 0x0; 1116 desc->s = 0;
1092 desc->avl = 0; 1117 desc->dpl = 0;
1093 desc->l = 0; 1118 desc->p = 1;
1094 desc->d = 0; 1119 desc->limit = 0x0;
1095 desc->g = SEG_GRANULARITY_4KB; 1120 desc->avl = 0;
1096 desc->base2 = 0x00; 1121 desc->l = 0;
1097#endif /* CONFIG_X86_64 */ 1122 desc->d = 0;
1123 desc->g = SEG_GRANULARITY_4KB;
1124 desc->base2 = 0x00;
1125 desc++;
1126 }
1098 1127
1099 asm volatile("cli"); 1128 asm volatile("cli");
1100 asm volatile ("lgdt %0" : : "m" (*gdt)); 1129 asm volatile ("lgdt %0" : : "m" (*gdt));
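The two code-segment quads that appear in the compressed-boot GDT later in this series (0x00cf9a000000ffff for __KERNEL32_CS, 0x00af9a000000ffff for __KERNEL_CS) are the packed form of exactly the fields efi_main() fills in above; the only difference between the two segments is the L/D bit pair. A minimal userspace sketch of that packing, using the architectural descriptor layout (the helper name is made up):

#include <stdint.h>
#include <stdio.h>

/* Pack a flat 4G code-segment descriptor; bit positions per the x86 SDM.
 * Field names mirror the desc_struct members used in eboot.c above. */
static uint64_t pack_code_desc(int l, int d)
{
	uint64_t desc = 0;

	desc |= 0xffffULL;		/* limit0 = 0xffff          */
	desc |= 0xaULL << 40;		/* type   = exec/read code  */
	desc |= 1ULL << 44;		/* s      = code/data       */
	desc |= 1ULL << 47;		/* p      = present         */
	desc |= 0xfULL << 48;		/* limit1 = 0xf             */
	desc |= (uint64_t)l << 53;	/* l      = 64-bit code     */
	desc |= (uint64_t)d << 54;	/* d      = default op size */
	desc |= 1ULL << 55;		/* g      = 4K granularity  */
	return desc;
}

int main(void)
{
	printf("__KERNEL32_CS = %#llx\n",
	       (unsigned long long)pack_code_desc(0, 1)); /* 0x00cf9a000000ffff */
	printf("__KERNEL_CS   = %#llx\n",
	       (unsigned long long)pack_code_desc(1, 0)); /* 0x00af9a000000ffff */
	return 0;
}

With l=0/d=1 the descriptor is a 32-bit code segment; with l=1/d=0 it is a 64-bit one, which is why the new 64-bit branch of efi_main() flips exactly those two bits.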
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index d2ae1f821e0c..fbf4c32d0b62 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -346,6 +346,48 @@ preferred_addr:
346 /* Set up the stack */ 346 /* Set up the stack */
347 leaq boot_stack_end(%rbx), %rsp 347 leaq boot_stack_end(%rbx), %rsp
348 348
349#ifdef CONFIG_X86_5LEVEL
350 /* Check if 5-level paging has already enabled */
351 movq %cr4, %rax
352 testl $X86_CR4_LA57, %eax
353 jnz lvl5
354
355 /*
356 * At this point we are in long mode with 4-level paging enabled,
357 * but we want to enable 5-level paging.
358 *
359 * The problem is that we cannot do it directly. Setting LA57 in
360 * long mode would trigger #GP. So we need to switch off long mode
361 * first.
362 *
363 * NOTE: This is not going to work if bootloader put us above 4G
364 * limit.
365 *
366 * The first step is go into compatibility mode.
367 */
368
369 /* Clear additional page table */
370 leaq lvl5_pgtable(%rbx), %rdi
371 xorq %rax, %rax
372 movq $(PAGE_SIZE/8), %rcx
373 rep stosq
374
375 /*
376 * Setup current CR3 as the first and only entry in a new top level
377 * page table.
378 */
379 movq %cr3, %rdi
380 leaq 0x7 (%rdi), %rax
381 movq %rax, lvl5_pgtable(%rbx)
382
383 /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
384 pushq $__KERNEL32_CS
385 leaq compatible_mode(%rip), %rax
386 pushq %rax
387 lretq
388lvl5:
389#endif
390
349 /* Zero EFLAGS */ 391 /* Zero EFLAGS */
350 pushq $0 392 pushq $0
351 popfq 393 popfq
@@ -429,6 +471,44 @@ relocated:
429 jmp *%rax 471 jmp *%rax
430 472
431 .code32 473 .code32
474#ifdef CONFIG_X86_5LEVEL
475compatible_mode:
476 /* Setup data and stack segments */
477 movl $__KERNEL_DS, %eax
478 movl %eax, %ds
479 movl %eax, %ss
480
481 /* Disable paging */
482 movl %cr0, %eax
483 btrl $X86_CR0_PG_BIT, %eax
484 movl %eax, %cr0
485
486 /* Point CR3 to 5-level paging */
487 leal lvl5_pgtable(%ebx), %eax
488 movl %eax, %cr3
489
490 /* Enable PAE and LA57 mode */
491 movl %cr4, %eax
492 orl $(X86_CR4_PAE | X86_CR4_LA57), %eax
493 movl %eax, %cr4
494
495 /* Calculate address we are running at */
496 call 1f
4971: popl %edi
498 subl $1b, %edi
499
500 /* Prepare stack for far return to Long Mode */
501 pushl $__KERNEL_CS
502 leal lvl5(%edi), %eax
503 push %eax
504
505 /* Enable paging back */
506 movl $(X86_CR0_PG | X86_CR0_PE), %eax
507 movl %eax, %cr0
508
509 lret
510#endif
511
432no_longmode: 512no_longmode:
433 /* This isn't an x86-64 CPU so hang */ 513 /* This isn't an x86-64 CPU so hang */
4341: 5141:
@@ -442,7 +522,7 @@ gdt:
442 .word gdt_end - gdt 522 .word gdt_end - gdt
443 .long gdt 523 .long gdt
444 .word 0 524 .word 0
445 .quad 0x0000000000000000 /* NULL descriptor */ 525 .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
446 .quad 0x00af9a000000ffff /* __KERNEL_CS */ 526 .quad 0x00af9a000000ffff /* __KERNEL_CS */
447 .quad 0x00cf92000000ffff /* __KERNEL_DS */ 527 .quad 0x00cf92000000ffff /* __KERNEL_DS */
448 .quad 0x0080890000000000 /* TS descriptor */ 528 .quad 0x0080890000000000 /* TS descriptor */
@@ -486,3 +566,7 @@ boot_stack_end:
486 .balign 4096 566 .balign 4096
487pgtable: 567pgtable:
488 .fill BOOT_PGT_SIZE, 1, 0 568 .fill BOOT_PGT_SIZE, 1, 0
569#ifdef CONFIG_X86_5LEVEL
570lvl5_pgtable:
571 .fill PAGE_SIZE, 1, 0
572#endif
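The trampoline above cannot set CR4.LA57 while long mode is active, so it drops to compatibility mode, turns paging off, installs lvl5_pgtable as the new top level and re-enters long mode. The new table needs only one entry: the old 4-level CR3 plus the low flag bits. A small sketch of that entry construction, assuming the 0x7 added by "leaq 0x7 (%rdi)" is the usual present/write/user PTE bits and using a made-up CR3 value:

#include <stdint.h>
#include <stdio.h>

#define _PAGE_PRESENT	0x001ULL
#define _PAGE_RW	0x002ULL
#define _PAGE_USER	0x004ULL

int main(void)
{
	/* Pretend the current 4-level CR3 points here (hypothetical value). */
	uint64_t old_cr3 = 0x3ff000;
	uint64_t lvl5_pgtable[512] = { 0 };

	/* The old top-level table becomes entry 0 of the new top level. */
	lvl5_pgtable[0] = old_cr3 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER);

	printf("lvl5_pgtable[0] = %#llx\n", (unsigned long long)lvl5_pgtable[0]);
	return 0;
}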
diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index 1d78f1739087..28029be47fbb 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -63,7 +63,7 @@ static void *alloc_pgt_page(void *context)
63static struct alloc_pgt_data pgt_data; 63static struct alloc_pgt_data pgt_data;
64 64
65/* The top level page table entry pointer. */ 65/* The top level page table entry pointer. */
66static unsigned long level4p; 66static unsigned long top_level_pgt;
67 67
68/* 68/*
69 * Mapping information structure passed to kernel_ident_mapping_init(). 69 * Mapping information structure passed to kernel_ident_mapping_init().
@@ -91,9 +91,15 @@ void initialize_identity_maps(void)
91 * If we came here via startup_32(), cr3 will be _pgtable already 91 * If we came here via startup_32(), cr3 will be _pgtable already
92 * and we must append to the existing area instead of entirely 92 * and we must append to the existing area instead of entirely
93 * overwriting it. 93 * overwriting it.
94 *
95 * With 5-level paging, we use '_pgtable' to allocate the p4d page table,
96 * the top-level page table is allocated separately.
97 *
98 * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
99 * cases. On 4-level paging it's equal to 'top_level_pgt'.
94 */ 100 */
95 level4p = read_cr3(); 101 top_level_pgt = read_cr3_pa();
96 if (level4p == (unsigned long)_pgtable) { 102 if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
97 debug_putstr("booted via startup_32()\n"); 103 debug_putstr("booted via startup_32()\n");
98 pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; 104 pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
99 pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; 105 pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
@@ -103,7 +109,7 @@ void initialize_identity_maps(void)
103 pgt_data.pgt_buf = _pgtable; 109 pgt_data.pgt_buf = _pgtable;
104 pgt_data.pgt_buf_size = BOOT_PGT_SIZE; 110 pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
105 memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); 111 memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
106 level4p = (unsigned long)alloc_pgt_page(&pgt_data); 112 top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
107 } 113 }
108} 114}
109 115
@@ -123,7 +129,7 @@ void add_identity_map(unsigned long start, unsigned long size)
123 return; 129 return;
124 130
125 /* Build the mapping. */ 131 /* Build the mapping. */
126 kernel_ident_mapping_init(&mapping_info, (pgd_t *)level4p, 132 kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt,
127 start, end); 133 start, end);
128} 134}
129 135
@@ -134,5 +140,5 @@ void add_identity_map(unsigned long start, unsigned long size)
134 */ 140 */
135void finalize_identity_maps(void) 141void finalize_identity_maps(void)
136{ 142{
137 write_cr3(level4p); 143 write_cr3(top_level_pgt);
138} 144}
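The check above works for both page-table layouts because p4d_offset() collapses to the pgd itself when the p4d level is folded (4-level paging) and only descends one extra level when CONFIG_X86_5LEVEL is enabled. A toy userspace model of that folding, not the kernel's real implementation:

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

typedef struct { uint64_t pgd; } pgd_t;
typedef struct { uint64_t p4d; } p4d_t;

static bool five_level = false;	/* flip to model CONFIG_X86_5LEVEL=y */

#define PTRS_PER_P4D	512
#define P4D_SHIFT	39

static p4d_t *p4d_offset(pgd_t *pgd, uint64_t addr)
{
	if (!five_level)
		return (p4d_t *)pgd;	/* folded: the "p4d" is the pgd entry itself */
	/* 5-level: follow the pgd entry down to the p4d page. */
	return (p4d_t *)(uintptr_t)(pgd->pgd & ~0xfffULL) +
	       ((addr >> P4D_SHIFT) & (PTRS_PER_P4D - 1));
}

int main(void)
{
	pgd_t top = { 0 };

	printf("4-level: p4d_offset(top, 0) == &top ? %d\n",
	       p4d_offset(&top, 0) == (p4d_t *)&top);
	return 0;
}

So on 4-level kernels the new comparison degenerates to the old "top_level_pgt == _pgtable" test.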
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 4a4c0834f965..a9a8027a6c0e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -265,7 +265,8 @@ return_from_SYSCALL_64:
265 * If width of "canonical tail" ever becomes variable, this will need 265 * If width of "canonical tail" ever becomes variable, this will need
266 * to be updated to remain correct on both old and new CPUs. 266 * to be updated to remain correct on both old and new CPUs.
267 * 267 *
268 * Change top 16 bits to be the sign-extension of 47th bit 268 * Change top bits to match most significant bit (47th or 56th bit
269 * depending on paging mode) in the address.
269 */ 270 */
270 shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx 271 shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
271 sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx 272 sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
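The shl/sar pair sign-extends the topmost implemented virtual-address bit into the unused high bits, so the same two instructions produce a canonical address for both the 47-bit and 56-bit layouts once __VIRTUAL_MASK_SHIFT changes. The same trick in plain C (the input value is illustrative):

#include <stdint.h>
#include <stdio.h>

/* __VIRTUAL_MASK_SHIFT is 47 with 4-level paging and 56 with 5-level paging. */
static uint64_t canonicalize(uint64_t addr, int virtual_mask_shift)
{
	int s = 64 - (virtual_mask_shift + 1);

	/* shl + sar: replicate the top implemented bit into the high bits. */
	return (uint64_t)(((int64_t)(addr << s)) >> s);
}

int main(void)
{
	/* Bit 47 set, so the tail sign-extends: prints 0xffff800000000000. */
	printf("%#llx\n", (unsigned long long)canonicalize(0x0000800000000000ULL, 47));
	return 0;
}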
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 628b8c556aab..2de0dd73830a 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2111,8 +2111,7 @@ static int x86_pmu_event_init(struct perf_event *event)
2111 2111
2112static void refresh_pce(void *ignored) 2112static void refresh_pce(void *ignored)
2113{ 2113{
2114 if (current->active_mm) 2114 load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
2115 load_mm_cr4(current->active_mm);
2116} 2115}
2117 2116
2118static void x86_pmu_event_mapped(struct perf_event *event) 2117static void x86_pmu_event_mapped(struct perf_event *event)
@@ -2344,7 +2343,7 @@ static unsigned long get_segment_base(unsigned int segment)
2344 2343
2345 /* IRQs are off, so this synchronizes with smp_store_release */ 2344 /* IRQs are off, so this synchronizes with smp_store_release */
2346 ldt = lockless_dereference(current->active_mm->context.ldt); 2345 ldt = lockless_dereference(current->active_mm->context.ldt);
2347 if (!ldt || idx > ldt->size) 2346 if (!ldt || idx > ldt->nr_entries)
2348 return 0; 2347 return 0;
2349 2348
2350 desc = &ldt->entries[idx]; 2349 desc = &ldt->entries[idx];
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 2f77bcefe6b4..d2ff779f347e 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -74,7 +74,7 @@ struct efi_scratch {
74 __kernel_fpu_begin(); \ 74 __kernel_fpu_begin(); \
75 \ 75 \
76 if (efi_scratch.use_pgd) { \ 76 if (efi_scratch.use_pgd) { \
77 efi_scratch.prev_cr3 = read_cr3(); \ 77 efi_scratch.prev_cr3 = __read_cr3(); \
78 write_cr3((unsigned long)efi_scratch.efi_pgt); \ 78 write_cr3((unsigned long)efi_scratch.efi_pgt); \
79 __flush_tlb_all(); \ 79 __flush_tlb_all(); \
80 } \ 80 } \
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 59405a248fc2..9b76cd331990 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -22,8 +22,8 @@ typedef struct {
22#ifdef CONFIG_SMP 22#ifdef CONFIG_SMP
23 unsigned int irq_resched_count; 23 unsigned int irq_resched_count;
24 unsigned int irq_call_count; 24 unsigned int irq_call_count;
25 unsigned int irq_tlb_count;
26#endif 25#endif
26 unsigned int irq_tlb_count;
27#ifdef CONFIG_X86_THERMAL_VECTOR 27#ifdef CONFIG_X86_THERMAL_VECTOR
28 unsigned int irq_thermal_count; 28 unsigned int irq_thermal_count;
29#endif 29#endif
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index f9813b6d8b80..79b647a7ebd0 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -37,12 +37,6 @@ typedef struct {
37#endif 37#endif
38} mm_context_t; 38} mm_context_t;
39 39
40#ifdef CONFIG_SMP
41void leave_mm(int cpu); 40void leave_mm(int cpu);
42#else
43static inline void leave_mm(int cpu)
44{
45}
46#endif
47 41
48#endif /* _ASM_X86_MMU_H */ 42#endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 68b329d77b3a..ecfcb6643c9b 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -47,7 +47,7 @@ struct ldt_struct {
47 * allocations, but it's not worth trying to optimize. 47 * allocations, but it's not worth trying to optimize.
48 */ 48 */
49 struct desc_struct *entries; 49 struct desc_struct *entries;
50 unsigned int size; 50 unsigned int nr_entries;
51}; 51};
52 52
53/* 53/*
@@ -87,22 +87,46 @@ static inline void load_mm_ldt(struct mm_struct *mm)
87 */ 87 */
88 88
89 if (unlikely(ldt)) 89 if (unlikely(ldt))
90 set_ldt(ldt->entries, ldt->size); 90 set_ldt(ldt->entries, ldt->nr_entries);
91 else 91 else
92 clear_LDT(); 92 clear_LDT();
93#else 93#else
94 clear_LDT(); 94 clear_LDT();
95#endif 95#endif
96}
97
98static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
99{
100#ifdef CONFIG_MODIFY_LDT_SYSCALL
101 /*
102 * Load the LDT if either the old or new mm had an LDT.
103 *
104 * An mm will never go from having an LDT to not having an LDT. Two
105 * mms never share an LDT, so we don't gain anything by checking to
106 * see whether the LDT changed. There's also no guarantee that
107 * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
108 * then prev->context.ldt will also be non-NULL.
109 *
110 * If we really cared, we could optimize the case where prev == next
111 * and we're exiting lazy mode. Most of the time, if this happens,
112 * we don't actually need to reload LDTR, but modify_ldt() is mostly
113 * used by legacy code and emulators where we don't need this level of
114 * performance.
115 *
116 * This uses | instead of || because it generates better code.
117 */
118 if (unlikely((unsigned long)prev->context.ldt |
119 (unsigned long)next->context.ldt))
120 load_mm_ldt(next);
121#endif
96 122
97 DEBUG_LOCKS_WARN_ON(preemptible()); 123 DEBUG_LOCKS_WARN_ON(preemptible());
98} 124}
99 125
100static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 126static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
101{ 127{
102#ifdef CONFIG_SMP
103 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) 128 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
104 this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); 129 this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
105#endif
106} 130}
107 131
108static inline int init_new_context(struct task_struct *tsk, 132static inline int init_new_context(struct task_struct *tsk,
@@ -220,18 +244,6 @@ static inline int vma_pkey(struct vm_area_struct *vma)
220} 244}
221#endif 245#endif
222 246
223static inline bool __pkru_allows_pkey(u16 pkey, bool write)
224{
225 u32 pkru = read_pkru();
226
227 if (!__pkru_allows_read(pkru, pkey))
228 return false;
229 if (write && !__pkru_allows_write(pkru, pkey))
230 return false;
231
232 return true;
233}
234
235/* 247/*
236 * We only want to enforce protection keys on the current process 248 * We only want to enforce protection keys on the current process
237 * because we effectively have no access to PKRU for other 249 * because we effectively have no access to PKRU for other
@@ -268,4 +280,23 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
268 return __pkru_allows_pkey(vma_pkey(vma), write); 280 return __pkru_allows_pkey(vma_pkey(vma), write);
269} 281}
270 282
283
284/*
285 * This can be used from process context to figure out what the value of
286 * CR3 is without needing to do a (slow) __read_cr3().
287 *
288 * It's intended to be used for code like KVM that sneakily changes CR3
289 * and needs to restore it. It needs to be used very carefully.
290 */
291static inline unsigned long __get_current_cr3_fast(void)
292{
293 unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
294
295 /* For now, be very restrictive about when this can be called. */
296 VM_WARN_ON(in_nmi() || !in_atomic());
297
298 VM_BUG_ON(cr3 != __read_cr3());
299 return cr3;
300}
301
271#endif /* _ASM_X86_MMU_CONTEXT_H */ 302#endif /* _ASM_X86_MMU_CONTEXT_H */
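With cpu_tlbstate.loaded_mm tracking what is actually in CR3, a caller such as KVM can snapshot the current value without a CR3 read and restore it later. A userspace-shaped sketch of that pattern; the globals below merely stand in for the per-CPU state and the real __pa() translation:

#include <stdint.h>
#include <stdio.h>

struct mm_struct { uint64_t pgd_phys; };

static struct mm_struct init_mm = { .pgd_phys = 0x1000 };
static struct mm_struct *loaded_mm = &init_mm;	/* cpu_tlbstate.loaded_mm */

static uint64_t get_current_cr3_fast(void)
{
	return loaded_mm->pgd_phys;	/* __pa(loaded_mm->pgd) in the kernel */
}

int main(void)
{
	uint64_t saved = get_current_cr3_fast();	/* e.g. around a VM entry */

	/* ... the guest may have run with a different CR3 here ... */
	printf("restore CR3 to %#llx\n", (unsigned long long)saved);
	return 0;
}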
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index a3dcf8944cb9..9ccac1926587 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x)
61 PVOP_VCALL1(pv_mmu_ops.write_cr2, x); 61 PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
62} 62}
63 63
64static inline unsigned long read_cr3(void) 64static inline unsigned long __read_cr3(void)
65{ 65{
66 return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3); 66 return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
67} 67}
@@ -312,11 +312,9 @@ static inline void __flush_tlb_single(unsigned long addr)
312} 312}
313 313
314static inline void flush_tlb_others(const struct cpumask *cpumask, 314static inline void flush_tlb_others(const struct cpumask *cpumask,
315 struct mm_struct *mm, 315 const struct flush_tlb_info *info)
316 unsigned long start,
317 unsigned long end)
318{ 316{
319 PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end); 317 PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info);
320} 318}
321 319
322static inline int paravirt_pgd_alloc(struct mm_struct *mm) 320static inline int paravirt_pgd_alloc(struct mm_struct *mm)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 7465d6fe336f..cb976bab6299 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -51,6 +51,7 @@ struct mm_struct;
51struct desc_struct; 51struct desc_struct;
52struct task_struct; 52struct task_struct;
53struct cpumask; 53struct cpumask;
54struct flush_tlb_info;
54 55
55/* 56/*
56 * Wrapper type for pointers to code which uses the non-standard 57 * Wrapper type for pointers to code which uses the non-standard
@@ -223,9 +224,7 @@ struct pv_mmu_ops {
223 void (*flush_tlb_kernel)(void); 224 void (*flush_tlb_kernel)(void);
224 void (*flush_tlb_single)(unsigned long addr); 225 void (*flush_tlb_single)(unsigned long addr);
225 void (*flush_tlb_others)(const struct cpumask *cpus, 226 void (*flush_tlb_others)(const struct cpumask *cpus,
226 struct mm_struct *mm, 227 const struct flush_tlb_info *info);
227 unsigned long start,
228 unsigned long end);
229 228
230 /* Hooks for allocating and freeing a pagetable top-level */ 229 /* Hooks for allocating and freeing a pagetable top-level */
231 int (*pgd_alloc)(struct mm_struct *mm); 230 int (*pgd_alloc)(struct mm_struct *mm);
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 50d35e3185f5..c8821bab938f 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -212,4 +212,51 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
212#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) 212#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
213#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) 213#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
214 214
215#define gup_get_pte gup_get_pte
216/*
217 * WARNING: only to be used in the get_user_pages_fast() implementation.
218 *
219 * With get_user_pages_fast(), we walk down the pagetables without taking
220 * any locks. For this we would like to load the pointers atomically,
221 * but that is not possible (without expensive cmpxchg8b) on PAE. What
222 * we do have is the guarantee that a PTE will only either go from not
223 * present to present, or present to not present or both -- it will not
224 * switch to a completely different present page without a TLB flush in
225 * between; something that we are blocking by holding interrupts off.
226 *
227 * Setting ptes from not present to present goes:
228 *
229 * ptep->pte_high = h;
230 * smp_wmb();
231 * ptep->pte_low = l;
232 *
233 * And present to not present goes:
234 *
235 * ptep->pte_low = 0;
236 * smp_wmb();
237 * ptep->pte_high = 0;
238 *
239 * We must ensure here that the load of pte_low sees 'l' iff pte_high
240 * sees 'h'. We load pte_high *after* loading pte_low, which ensures we
241 * don't see an older value of pte_high. *Then* we recheck pte_low,
242 * which ensures that we haven't picked up a changed pte high. We might
243 * have gotten rubbish values from pte_low and pte_high, but we are
244 * guaranteed that pte_low will not have the present bit set *unless*
245 * it is 'l'. Because get_user_pages_fast() only operates on present ptes
246 * we're safe.
247 */
248static inline pte_t gup_get_pte(pte_t *ptep)
249{
250 pte_t pte;
251
252 do {
253 pte.pte_low = ptep->pte_low;
254 smp_rmb();
255 pte.pte_high = ptep->pte_high;
256 smp_rmb();
257 } while (unlikely(pte.pte_low != ptep->pte_low));
258
259 return pte;
260}
261
215#endif /* _ASM_X86_PGTABLE_3LEVEL_H */ 262#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
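The retry loop above only has to guard against tearing between the two 32-bit halves of a PAE pte, because the writer always publishes pte_low last when making an entry present. A userspace model of the read side (single-threaded, so the loop exits immediately; it only shows the shape of the check):

#include <stdint.h>
#include <stdio.h>

/* A 64-bit pte stored as two 32-bit halves that cannot be loaded atomically. */
struct pae_pte { volatile uint32_t pte_low, pte_high; };

static uint64_t gup_get_pte_model(struct pae_pte *ptep)
{
	uint32_t lo, hi;

	do {
		lo = ptep->pte_low;
		/* smp_rmb() here in the kernel */
		hi = ptep->pte_high;
		/* smp_rmb() here in the kernel */
	} while (lo != ptep->pte_low);	/* reread until pte_low is stable */

	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	struct pae_pte pte = { .pte_low = 0x00000067, .pte_high = 0x12345 };

	printf("pte = %#llx\n", (unsigned long long)gup_get_pte_model(&pte));
	return 0;
}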
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index f5af95a0c6b8..77037b6f1caa 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -244,6 +244,11 @@ static inline int pud_devmap(pud_t pud)
244 return 0; 244 return 0;
245} 245}
246#endif 246#endif
247
248static inline int pgd_devmap(pgd_t pgd)
249{
250 return 0;
251}
247#endif 252#endif
248#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 253#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
249 254
@@ -917,7 +922,7 @@ extern pgd_t trampoline_pgd_entry;
917static inline void __meminit init_trampoline_default(void) 922static inline void __meminit init_trampoline_default(void)
918{ 923{
919 /* Default trampoline pgd value */ 924 /* Default trampoline pgd value */
920 trampoline_pgd_entry = init_level4_pgt[pgd_index(__PAGE_OFFSET)]; 925 trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
921} 926}
922# ifdef CONFIG_RANDOMIZE_MEMORY 927# ifdef CONFIG_RANDOMIZE_MEMORY
923void __meminit init_trampoline(void); 928void __meminit init_trampoline(void);
@@ -1185,6 +1190,54 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags)
1185#endif 1190#endif
1186} 1191}
1187 1192
1193static inline bool __pkru_allows_pkey(u16 pkey, bool write)
1194{
1195 u32 pkru = read_pkru();
1196
1197 if (!__pkru_allows_read(pkru, pkey))
1198 return false;
1199 if (write && !__pkru_allows_write(pkru, pkey))
1200 return false;
1201
1202 return true;
1203}
1204
1205/*
1206 * 'pteval' can come from a PTE, PMD or PUD. We only check
1207 * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
1208 * same value on all 3 types.
1209 */
1210static inline bool __pte_access_permitted(unsigned long pteval, bool write)
1211{
1212 unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
1213
1214 if (write)
1215 need_pte_bits |= _PAGE_RW;
1216
1217 if ((pteval & need_pte_bits) != need_pte_bits)
1218 return 0;
1219
1220 return __pkru_allows_pkey(pte_flags_pkey(pteval), write);
1221}
1222
1223#define pte_access_permitted pte_access_permitted
1224static inline bool pte_access_permitted(pte_t pte, bool write)
1225{
1226 return __pte_access_permitted(pte_val(pte), write);
1227}
1228
1229#define pmd_access_permitted pmd_access_permitted
1230static inline bool pmd_access_permitted(pmd_t pmd, bool write)
1231{
1232 return __pte_access_permitted(pmd_val(pmd), write);
1233}
1234
1235#define pud_access_permitted pud_access_permitted
1236static inline bool pud_access_permitted(pud_t pud, bool write)
1237{
1238 return __pte_access_permitted(pud_val(pud), write);
1239}
1240
1188#include <asm-generic/pgtable.h> 1241#include <asm-generic/pgtable.h>
1189#endif /* __ASSEMBLY__ */ 1242#endif /* __ASSEMBLY__ */
1190 1243
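pte_access_permitted() is what lets the generic GUP code reject non-present, kernel-only or read-only mappings (and, on x86, pkey-protected ones) without taking page-table locks. A minimal model of the flag test, leaving out the PKRU part since that needs the real register:

#include <stdio.h>
#include <stdbool.h>

#define _PAGE_PRESENT	0x001UL
#define _PAGE_RW	0x002UL
#define _PAGE_USER	0x004UL

/* Model of __pte_access_permitted() minus the protection-key check. */
static bool pte_access_permitted_model(unsigned long pteval, bool write)
{
	unsigned long need = _PAGE_PRESENT | _PAGE_USER;

	if (write)
		need |= _PAGE_RW;
	return (pteval & need) == need;
}

int main(void)
{
	/* Read-only user page: readable but not writable. */
	printf("%d %d\n",
	       pte_access_permitted_model(_PAGE_PRESENT | _PAGE_USER, false),
	       pte_access_permitted_model(_PAGE_PRESENT | _PAGE_USER, true));
	return 0;
}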
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 9991224f6238..2160c1fee920 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -14,15 +14,17 @@
14#include <linux/bitops.h> 14#include <linux/bitops.h>
15#include <linux/threads.h> 15#include <linux/threads.h>
16 16
17extern p4d_t level4_kernel_pgt[512];
18extern p4d_t level4_ident_pgt[512];
17extern pud_t level3_kernel_pgt[512]; 19extern pud_t level3_kernel_pgt[512];
18extern pud_t level3_ident_pgt[512]; 20extern pud_t level3_ident_pgt[512];
19extern pmd_t level2_kernel_pgt[512]; 21extern pmd_t level2_kernel_pgt[512];
20extern pmd_t level2_fixmap_pgt[512]; 22extern pmd_t level2_fixmap_pgt[512];
21extern pmd_t level2_ident_pgt[512]; 23extern pmd_t level2_ident_pgt[512];
22extern pte_t level1_fixmap_pgt[512]; 24extern pte_t level1_fixmap_pgt[512];
23extern pgd_t init_level4_pgt[]; 25extern pgd_t init_top_pgt[];
24 26
25#define swapper_pg_dir init_level4_pgt 27#define swapper_pg_dir init_top_pgt
26 28
27extern void paging_init(void); 29extern void paging_init(void);
28 30
@@ -227,6 +229,20 @@ extern void cleanup_highmap(void);
227extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); 229extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
228extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); 230extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
229 231
230#endif /* !__ASSEMBLY__ */ 232#define gup_fast_permitted gup_fast_permitted
233static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
234 int write)
235{
236 unsigned long len, end;
237
238 len = (unsigned long)nr_pages << PAGE_SHIFT;
239 end = start + len;
240 if (end < start)
241 return false;
242 if (end >> __VIRTUAL_MASK_SHIFT)
243 return false;
244 return true;
245}
231 246
247#endif /* !__ASSEMBLY__ */
232#endif /* _ASM_X86_PGTABLE_64_H */ 248#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 39fb618e2211..79aa2f98398d 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -8,4 +8,40 @@
8#else 8#else
9#define X86_VM_MASK 0 /* No VM86 support */ 9#define X86_VM_MASK 0 /* No VM86 support */
10#endif 10#endif
11
12/*
13 * CR3's layout varies depending on several things.
14 *
15 * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
16 * If PAE is enabled, then CR3[11:5] is part of the PDPT address
17 * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
18 * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
19 * CR3[2:0] and CR3[11:5] are ignored.
20 *
21 * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
22 *
23 * CR3[63] is always read as zero. If CR4.PCIDE is set, then CR3[63] may be
24 * written as 1 to prevent the write to CR3 from flushing the TLB.
25 *
26 * On systems with SME, one bit (in a variable position!) is stolen to indicate
27 * that the top-level paging structure is encrypted.
28 *
29 * All of the remaining bits indicate the physical address of the top-level
30 * paging structure.
31 *
32 * CR3_ADDR_MASK is the mask used by read_cr3_pa().
33 */
34#ifdef CONFIG_X86_64
35/* Mask off the address space ID bits. */
36#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
37#define CR3_PCID_MASK 0xFFFull
38#else
39/*
40 * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
41 * a tiny bit of code size by setting all the bits.
42 */
43#define CR3_ADDR_MASK 0xFFFFFFFFull
44#define CR3_PCID_MASK 0ull
45#endif
46
11#endif /* _ASM_X86_PROCESSOR_FLAGS_H */ 47#endif /* _ASM_X86_PROCESSOR_FLAGS_H */
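Splitting a CR3 value with these masks is what read_cr3_pa() (added in the next hunk) does for the address half; the PCID half only becomes meaningful once CR4.PCIDE is enabled. A small sketch with a made-up CR3 value:

#include <stdint.h>
#include <stdio.h>

#define CR3_ADDR_MASK	0x7FFFFFFFFFFFF000ull
#define CR3_PCID_MASK	0xFFFull

int main(void)
{
	/* Hypothetical CR3 with PCIDE set: the low 12 bits carry the ASID. */
	uint64_t cr3 = 0x000000012345f00aULL;

	printf("pgd phys = %#llx, pcid = %#llx\n",
	       (unsigned long long)(cr3 & CR3_ADDR_MASK),
	       (unsigned long long)(cr3 & CR3_PCID_MASK));
	return 0;
}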
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a28b671f1549..2e1696294af5 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -231,6 +231,14 @@ native_cpuid_reg(ebx)
231native_cpuid_reg(ecx) 231native_cpuid_reg(ecx)
232native_cpuid_reg(edx) 232native_cpuid_reg(edx)
233 233
234/*
235 * Friendlier CR3 helpers.
236 */
237static inline unsigned long read_cr3_pa(void)
238{
239 return __read_cr3() & CR3_ADDR_MASK;
240}
241
234static inline void load_cr3(pgd_t *pgdir) 242static inline void load_cr3(pgd_t *pgdir)
235{ 243{
236 write_cr3(__pa(pgdir)); 244 write_cr3(__pa(pgdir));
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 12af3e35edfa..9efaabf5b54b 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -39,7 +39,7 @@ static inline void native_write_cr2(unsigned long val)
39 asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); 39 asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
40} 40}
41 41
42static inline unsigned long native_read_cr3(void) 42static inline unsigned long __native_read_cr3(void)
43{ 43{
44 unsigned long val; 44 unsigned long val;
45 asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); 45 asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
@@ -159,9 +159,13 @@ static inline void write_cr2(unsigned long x)
159 native_write_cr2(x); 159 native_write_cr2(x);
160} 160}
161 161
162static inline unsigned long read_cr3(void) 162/*
163 * Careful! CR3 contains more than just an address. You probably want
164 * read_cr3_pa() instead.
165 */
166static inline unsigned long __read_cr3(void)
163{ 167{
164 return native_read_cr3(); 168 return __native_read_cr3();
165} 169}
166 170
167static inline void write_cr3(unsigned long x) 171static inline void write_cr3(unsigned long x)
diff --git a/arch/x86/include/asm/tlbbatch.h b/arch/x86/include/asm/tlbbatch.h
new file mode 100644
index 000000000000..f4a6ff352a0e
--- /dev/null
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -0,0 +1,14 @@
1#ifndef _ARCH_X86_TLBBATCH_H
2#define _ARCH_X86_TLBBATCH_H
3
4#include <linux/cpumask.h>
5
6struct arch_tlbflush_unmap_batch {
7 /*
8 * Each bit set is a CPU that potentially has a TLB entry for one of
9 * the PFNs being flushed..
10 */
11 struct cpumask cpumask;
12};
13
14#endif /* _ARCH_X86_TLBBATCH_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6ed9ea469b48..50ea3482e1d1 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -7,6 +7,7 @@
7#include <asm/processor.h> 7#include <asm/processor.h>
8#include <asm/cpufeature.h> 8#include <asm/cpufeature.h>
9#include <asm/special_insns.h> 9#include <asm/special_insns.h>
10#include <asm/smp.h>
10 11
11static inline void __invpcid(unsigned long pcid, unsigned long addr, 12static inline void __invpcid(unsigned long pcid, unsigned long addr,
12 unsigned long type) 13 unsigned long type)
@@ -65,10 +66,14 @@ static inline void invpcid_flush_all_nonglobals(void)
65#endif 66#endif
66 67
67struct tlb_state { 68struct tlb_state {
68#ifdef CONFIG_SMP 69 /*
69 struct mm_struct *active_mm; 70 * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
71 * are on. This means that it may not match current->active_mm,
72 * which will contain the previous user mm when we're in lazy TLB
73 * mode even if we've already switched back to swapper_pg_dir.
74 */
75 struct mm_struct *loaded_mm;
70 int state; 76 int state;
71#endif
72 77
73 /* 78 /*
74 * Access to this CR4 shadow and to H/W CR4 is protected by 79 * Access to this CR4 shadow and to H/W CR4 is protected by
@@ -151,7 +156,7 @@ static inline void __native_flush_tlb(void)
151 * back: 156 * back:
152 */ 157 */
153 preempt_disable(); 158 preempt_disable();
154 native_write_cr3(native_read_cr3()); 159 native_write_cr3(__native_read_cr3());
155 preempt_enable(); 160 preempt_enable();
156} 161}
157 162
@@ -220,84 +225,16 @@ static inline void __flush_tlb_one(unsigned long addr)
220 * - flush_tlb_page(vma, vmaddr) flushes one page 225 * - flush_tlb_page(vma, vmaddr) flushes one page
221 * - flush_tlb_range(vma, start, end) flushes a range of pages 226 * - flush_tlb_range(vma, start, end) flushes a range of pages
222 * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages 227 * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
223 * - flush_tlb_others(cpumask, mm, start, end) flushes TLBs on other cpus 228 * - flush_tlb_others(cpumask, info) flushes TLBs on other cpus
224 * 229 *
225 * ..but the i386 has somewhat limited tlb flushing capabilities, 230 * ..but the i386 has somewhat limited tlb flushing capabilities,
226 * and page-granular flushes are available only on i486 and up. 231 * and page-granular flushes are available only on i486 and up.
227 */ 232 */
228 233struct flush_tlb_info {
229#ifndef CONFIG_SMP 234 struct mm_struct *mm;
230 235 unsigned long start;
231/* "_up" is for UniProcessor. 236 unsigned long end;
232 * 237};
233 * This is a helper for other header functions. *Not* intended to be called
234 * directly. All global TLB flushes need to either call this, or to bump the
235 * vm statistics themselves.
236 */
237static inline void __flush_tlb_up(void)
238{
239 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
240 __flush_tlb();
241}
242
243static inline void flush_tlb_all(void)
244{
245 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
246 __flush_tlb_all();
247}
248
249static inline void local_flush_tlb(void)
250{
251 __flush_tlb_up();
252}
253
254static inline void flush_tlb_mm(struct mm_struct *mm)
255{
256 if (mm == current->active_mm)
257 __flush_tlb_up();
258}
259
260static inline void flush_tlb_page(struct vm_area_struct *vma,
261 unsigned long addr)
262{
263 if (vma->vm_mm == current->active_mm)
264 __flush_tlb_one(addr);
265}
266
267static inline void flush_tlb_range(struct vm_area_struct *vma,
268 unsigned long start, unsigned long end)
269{
270 if (vma->vm_mm == current->active_mm)
271 __flush_tlb_up();
272}
273
274static inline void flush_tlb_mm_range(struct mm_struct *mm,
275 unsigned long start, unsigned long end, unsigned long vmflag)
276{
277 if (mm == current->active_mm)
278 __flush_tlb_up();
279}
280
281static inline void native_flush_tlb_others(const struct cpumask *cpumask,
282 struct mm_struct *mm,
283 unsigned long start,
284 unsigned long end)
285{
286}
287
288static inline void reset_lazy_tlbstate(void)
289{
290}
291
292static inline void flush_tlb_kernel_range(unsigned long start,
293 unsigned long end)
294{
295 flush_tlb_all();
296}
297
298#else /* SMP */
299
300#include <asm/smp.h>
301 238
302#define local_flush_tlb() __flush_tlb() 239#define local_flush_tlb() __flush_tlb()
303 240
@@ -307,29 +244,32 @@ static inline void flush_tlb_kernel_range(unsigned long start,
307 flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) 244 flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
308 245
309extern void flush_tlb_all(void); 246extern void flush_tlb_all(void);
310extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
311extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, 247extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
312 unsigned long end, unsigned long vmflag); 248 unsigned long end, unsigned long vmflag);
313extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); 249extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
314 250
251static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
252{
253 flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
254}
255
315void native_flush_tlb_others(const struct cpumask *cpumask, 256void native_flush_tlb_others(const struct cpumask *cpumask,
316 struct mm_struct *mm, 257 const struct flush_tlb_info *info);
317 unsigned long start, unsigned long end);
318 258
319#define TLBSTATE_OK 1 259#define TLBSTATE_OK 1
320#define TLBSTATE_LAZY 2 260#define TLBSTATE_LAZY 2
321 261
322static inline void reset_lazy_tlbstate(void) 262static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
263 struct mm_struct *mm)
323{ 264{
324 this_cpu_write(cpu_tlbstate.state, 0); 265 cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
325 this_cpu_write(cpu_tlbstate.active_mm, &init_mm);
326} 266}
327 267
328#endif /* SMP */ 268extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
329 269
330#ifndef CONFIG_PARAVIRT 270#ifndef CONFIG_PARAVIRT
331#define flush_tlb_others(mask, mm, start, end) \ 271#define flush_tlb_others(mask, info) \
332 native_flush_tlb_others(mask, mm, start, end) 272 native_flush_tlb_others(mask, info)
333#endif 273#endif
334 274
335#endif /* _ASM_X86_TLBFLUSH_H */ 275#endif /* _ASM_X86_TLBFLUSH_H */
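Passing one struct flush_tlb_info instead of an (mm, start, end) triple keeps the remote-flush interface stable while the PCID work adds more state to a flush request. A caller-side sketch of the new shape; flush_tlb_others() itself is a kernel interface, so it is stubbed here:

#include <stdio.h>

struct mm_struct;

struct flush_tlb_info {
	struct mm_struct *mm;
	unsigned long start;
	unsigned long end;
};

/* Stand-in for native_flush_tlb_others(cpumask, info). */
static void flush_tlb_others_stub(const struct flush_tlb_info *info)
{
	printf("flush %#lx-%#lx\n", info->start, info->end);
}

int main(void)
{
	struct flush_tlb_info info = {
		.mm    = NULL,		/* stand-in for the target mm */
		.start = 0x400000,
		.end   = 0x600000,
	};

	flush_tlb_others_stub(&info);
	return 0;
}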
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index 6686820feae9..b5a32231abd8 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_X86_UV_UV_H 1#ifndef _ASM_X86_UV_UV_H
2#define _ASM_X86_UV_UV_H 2#define _ASM_X86_UV_UV_H
3 3
4#include <asm/tlbflush.h>
5
4enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; 6enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
5 7
6struct cpumask; 8struct cpumask;
@@ -15,10 +17,7 @@ extern void uv_cpu_init(void);
15extern void uv_nmi_init(void); 17extern void uv_nmi_init(void);
16extern void uv_system_init(void); 18extern void uv_system_init(void);
17extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, 19extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
18 struct mm_struct *mm, 20 const struct flush_tlb_info *info);
19 unsigned long start,
20 unsigned long end,
21 unsigned int cpu);
22 21
23#else /* X86_UV */ 22#else /* X86_UV */
24 23
@@ -28,8 +27,8 @@ static inline int is_uv_hubless(void) { return 0; }
28static inline void uv_cpu_init(void) { } 27static inline void uv_cpu_init(void) { }
29static inline void uv_system_init(void) { } 28static inline void uv_system_init(void) { }
30static inline const struct cpumask * 29static inline const struct cpumask *
31uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, 30uv_flush_tlb_others(const struct cpumask *cpumask,
32 unsigned long start, unsigned long end, unsigned int cpu) 31 const struct flush_tlb_info *info)
33{ return cpumask; } 32{ return cpumask; }
34 33
35#endif /* X86_UV */ 34#endif /* X86_UV */
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 567de50a4c2a..185f3d10c194 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -104,6 +104,8 @@
104#define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT) 104#define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT)
105#define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */ 105#define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */
106#define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT) 106#define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT)
107#define X86_CR4_LA57_BIT 12 /* enable 5-level page tables */
108#define X86_CR4_LA57 _BITUL(X86_CR4_LA57_BIT)
107#define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */ 109#define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */
108#define X86_CR4_VMXE _BITUL(X86_CR4_VMXE_BIT) 110#define X86_CR4_VMXE _BITUL(X86_CR4_VMXE_BIT)
109#define X86_CR4_SMXE_BIT 14 /* enable safer mode (TXT) */ 111#define X86_CR4_SMXE_BIT 14 /* enable safer mode (TXT) */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 3c7c419c4e3e..a01892bdd61a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -18,6 +18,7 @@ CFLAGS_REMOVE_pvclock.o = -pg
18CFLAGS_REMOVE_kvmclock.o = -pg 18CFLAGS_REMOVE_kvmclock.o = -pg
19CFLAGS_REMOVE_ftrace.o = -pg 19CFLAGS_REMOVE_ftrace.o = -pg
20CFLAGS_REMOVE_early_printk.o = -pg 20CFLAGS_REMOVE_early_printk.o = -pg
21CFLAGS_REMOVE_head64.o = -pg
21endif 22endif
22 23
23KASAN_SANITIZE_head$(BITS).o := n 24KASAN_SANITIZE_head$(BITS).o := n
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 8e598a1ad986..6b91e2eb8d3f 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -125,7 +125,7 @@ void __init init_espfix_bsp(void)
125 p4d_t *p4d; 125 p4d_t *p4d;
126 126
127 /* Install the espfix pud into the kernel page directory */ 127 /* Install the espfix pud into the kernel page directory */
128 pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; 128 pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
129 p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR); 129 p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
130 p4d_populate(&init_mm, p4d, espfix_pud_page); 130 p4d_populate(&init_mm, p4d, espfix_pud_page);
131 131
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 43b7002f44fb..46c3c73e7f43 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -33,17 +33,120 @@
33/* 33/*
34 * Manage page tables very early on. 34 * Manage page tables very early on.
35 */ 35 */
36extern pgd_t early_level4_pgt[PTRS_PER_PGD]; 36extern pgd_t early_top_pgt[PTRS_PER_PGD];
37extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; 37extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
38static unsigned int __initdata next_early_pgt = 2; 38static unsigned int __initdata next_early_pgt;
39pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); 39pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
40 40
41#define __head __section(.head.text)
42
43static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
44{
45 return ptr - (void *)_text + (void *)physaddr;
46}
47
48void __head __startup_64(unsigned long physaddr)
49{
50 unsigned long load_delta, *p;
51 pgdval_t *pgd;
52 p4dval_t *p4d;
53 pudval_t *pud;
54 pmdval_t *pmd, pmd_entry;
55 int i;
56
57 /* Is the address too large? */
58 if (physaddr >> MAX_PHYSMEM_BITS)
59 for (;;);
60
61 /*
62 * Compute the delta between the address I am compiled to run at
63 * and the address I am actually running at.
64 */
65 load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
66
67 /* Is the address not 2M aligned? */
68 if (load_delta & ~PMD_PAGE_MASK)
69 for (;;);
70
71 /* Fixup the physical addresses in the page table */
72
73 pgd = fixup_pointer(&early_top_pgt, physaddr);
74 pgd[pgd_index(__START_KERNEL_map)] += load_delta;
75
76 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
77 p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
78 p4d[511] += load_delta;
79 }
80
81 pud = fixup_pointer(&level3_kernel_pgt, physaddr);
82 pud[510] += load_delta;
83 pud[511] += load_delta;
84
85 pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
86 pmd[506] += load_delta;
87
88 /*
89 * Set up the identity mapping for the switchover. These
90 * entries should *NOT* have the global bit set! This also
91 * creates a bunch of nonsense entries but that is fine --
92 * it avoids problems around wraparound.
93 */
94
95 pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
96 pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
97
98 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
99 p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
100
101 i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
102 pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE;
103 pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE;
104
105 i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
106 p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
107 p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
108 } else {
109 i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
110 pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
111 pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
112 }
113
114 i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
115 pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE;
116 pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE;
117
118 pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
119 pmd_entry += physaddr;
120
121 for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
122 int idx = i + (physaddr >> PMD_SHIFT) % PTRS_PER_PMD;
123 pmd[idx] = pmd_entry + i * PMD_SIZE;
124 }
125
126 /*
127 * Fixup the kernel text+data virtual addresses. Note that
128 * we might write invalid pmds, when the kernel is relocated
129 * cleanup_highmap() fixes this up along with the mappings
130 * beyond _end.
131 */
132
133 pmd = fixup_pointer(level2_kernel_pgt, physaddr);
134 for (i = 0; i < PTRS_PER_PMD; i++) {
135 if (pmd[i] & _PAGE_PRESENT)
136 pmd[i] += load_delta;
137 }
138
139 /* Fixup phys_base */
140 p = fixup_pointer(&phys_base, physaddr);
141 *p += load_delta;
142}
143
41/* Wipe all early page tables except for the kernel symbol map */ 144/* Wipe all early page tables except for the kernel symbol map */
42static void __init reset_early_page_tables(void) 145static void __init reset_early_page_tables(void)
43{ 146{
44 memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); 147 memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
45 next_early_pgt = 0; 148 next_early_pgt = 0;
46 write_cr3(__pa_nodebug(early_level4_pgt)); 149 write_cr3(__pa_nodebug(early_top_pgt));
47} 150}
48 151
49/* Create a new PMD entry */ 152/* Create a new PMD entry */
@@ -51,15 +154,16 @@ int __init early_make_pgtable(unsigned long address)
51{ 154{
52 unsigned long physaddr = address - __PAGE_OFFSET; 155 unsigned long physaddr = address - __PAGE_OFFSET;
53 pgdval_t pgd, *pgd_p; 156 pgdval_t pgd, *pgd_p;
157 p4dval_t p4d, *p4d_p;
54 pudval_t pud, *pud_p; 158 pudval_t pud, *pud_p;
55 pmdval_t pmd, *pmd_p; 159 pmdval_t pmd, *pmd_p;
56 160
57 /* Invalid address or early pgt is done ? */ 161 /* Invalid address or early pgt is done ? */
58 if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt)) 162 if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
59 return -1; 163 return -1;
60 164
61again: 165again:
62 pgd_p = &early_level4_pgt[pgd_index(address)].pgd; 166 pgd_p = &early_top_pgt[pgd_index(address)].pgd;
63 pgd = *pgd_p; 167 pgd = *pgd_p;
64 168
65 /* 169 /*
@@ -67,8 +171,25 @@ again:
67 * critical -- __PAGE_OFFSET would point us back into the dynamic 171 * critical -- __PAGE_OFFSET would point us back into the dynamic
68 * range and we might end up looping forever... 172 * range and we might end up looping forever...
69 */ 173 */
70 if (pgd) 174 if (!IS_ENABLED(CONFIG_X86_5LEVEL))
71 pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); 175 p4d_p = pgd_p;
176 else if (pgd)
177 p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
178 else {
179 if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
180 reset_early_page_tables();
181 goto again;
182 }
183
184 p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++];
185 memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
186 *pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
187 }
188 p4d_p += p4d_index(address);
189 p4d = *p4d_p;
190
191 if (p4d)
192 pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
72 else { 193 else {
73 if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { 194 if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
74 reset_early_page_tables(); 195 reset_early_page_tables();
@@ -77,7 +198,7 @@ again:
77 198
78 pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; 199 pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
79 memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); 200 memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
80 *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; 201 *p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
81 } 202 }
82 pud_p += pud_index(address); 203 pud_p += pud_index(address);
83 pud = *pud_p; 204 pud = *pud_p;
@@ -156,7 +277,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
156 277
157 clear_bss(); 278 clear_bss();
158 279
159 clear_page(init_level4_pgt); 280 clear_page(init_top_pgt);
160 281
161 kasan_early_init(); 282 kasan_early_init();
162 283
@@ -171,8 +292,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
171 */ 292 */
172 load_ucode_bsp(); 293 load_ucode_bsp();
173 294
174 /* set init_level4_pgt kernel high mapping*/ 295 /* set init_top_pgt kernel high mapping*/
175 init_level4_pgt[511] = early_level4_pgt[511]; 296 init_top_pgt[511] = early_top_pgt[511];
176 297
177 x86_64_start_reservations(real_mode_data); 298 x86_64_start_reservations(real_mode_data);
178} 299}
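__startup_64() runs from the identity mapping before the final kernel mapping exists, so every symbol it touches has to be converted from its link-time virtual address to the physical address the kernel was actually loaded at; that is all fixup_pointer() does. A sketch of the arithmetic with made-up addresses:

#include <stdint.h>
#include <stdio.h>

/* Model of fixup_pointer(): ptr - _text + physaddr. */
static uint64_t fixup_pointer_model(uint64_t ptr, uint64_t text_va,
				    uint64_t physaddr)
{
	return ptr - text_va + physaddr;
}

int main(void)
{
	uint64_t text_va  = 0xffffffff81000000ULL;	/* where _text is linked   */
	uint64_t physaddr = 0x0000000001000000ULL;	/* where we really booted  */
	uint64_t sym      = 0xffffffff81234000ULL;	/* some link-time address  */

	printf("phys = %#llx\n",
	       (unsigned long long)fixup_pointer_model(sym, text_va, physaddr));
	return 0;
}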
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index ac9d327d2e42..6225550883df 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -37,10 +37,11 @@
37 * 37 *
38 */ 38 */
39 39
40#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
40#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) 41#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
41 42
42L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) 43PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
43L4_START_KERNEL = pgd_index(__START_KERNEL_map) 44PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
44L3_START_KERNEL = pud_index(__START_KERNEL_map) 45L3_START_KERNEL = pud_index(__START_KERNEL_map)
45 46
46 .text 47 .text
@@ -72,101 +73,12 @@ startup_64:
72 /* Sanitize CPU configuration */ 73 /* Sanitize CPU configuration */
73 call verify_cpu 74 call verify_cpu
74 75
75 /*
76 * Compute the delta between the address I am compiled to run at and the
77 * address I am actually running at.
78 */
79 leaq _text(%rip), %rbp
80 subq $_text - __START_KERNEL_map, %rbp
81
82 /* Is the address not 2M aligned? */
83 testl $~PMD_PAGE_MASK, %ebp
84 jnz bad_address
85
86 /*
87 * Is the address too large?
88 */
89 leaq _text(%rip), %rax
90 shrq $MAX_PHYSMEM_BITS, %rax
91 jnz bad_address
92
93 /*
94 * Fixup the physical addresses in the page table
95 */
96 addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
97
98 addq %rbp, level3_kernel_pgt + (510*8)(%rip)
99 addq %rbp, level3_kernel_pgt + (511*8)(%rip)
100
101 addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
102
103 /*
104 * Set up the identity mapping for the switchover. These
105 * entries should *NOT* have the global bit set! This also
106 * creates a bunch of nonsense entries but that is fine --
107 * it avoids problems around wraparound.
108 */
109 leaq _text(%rip), %rdi 76 leaq _text(%rip), %rdi
110 leaq early_level4_pgt(%rip), %rbx 77 pushq %rsi
111 78 call __startup_64
112 movq %rdi, %rax 79 popq %rsi
113 shrq $PGDIR_SHIFT, %rax
114
115 leaq (PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx
116 movq %rdx, 0(%rbx,%rax,8)
117 movq %rdx, 8(%rbx,%rax,8)
118
119 addq $PAGE_SIZE, %rdx
120 movq %rdi, %rax
121 shrq $PUD_SHIFT, %rax
122 andl $(PTRS_PER_PUD-1), %eax
123 movq %rdx, PAGE_SIZE(%rbx,%rax,8)
124 incl %eax
125 andl $(PTRS_PER_PUD-1), %eax
126 movq %rdx, PAGE_SIZE(%rbx,%rax,8)
127
128 addq $PAGE_SIZE * 2, %rbx
129 movq %rdi, %rax
130 shrq $PMD_SHIFT, %rdi
131 addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
132 leaq (_end - 1)(%rip), %rcx
133 shrq $PMD_SHIFT, %rcx
134 subq %rdi, %rcx
135 incl %ecx
136
1371:
138 andq $(PTRS_PER_PMD - 1), %rdi
139 movq %rax, (%rbx,%rdi,8)
140 incq %rdi
141 addq $PMD_SIZE, %rax
142 decl %ecx
143 jnz 1b
144
145 test %rbp, %rbp
146 jz .Lskip_fixup
147 80
148 /* 81 movq $(early_top_pgt - __START_KERNEL_map), %rax
149 * Fixup the kernel text+data virtual addresses. Note that
150 * we might write invalid pmds, when the kernel is relocated
151 * cleanup_highmap() fixes this up along with the mappings
152 * beyond _end.
153 */
154 leaq level2_kernel_pgt(%rip), %rdi
155 leaq PAGE_SIZE(%rdi), %r8
156 /* See if it is a valid page table entry */
1571: testb $_PAGE_PRESENT, 0(%rdi)
158 jz 2f
159 addq %rbp, 0(%rdi)
160 /* Go to the next page */
1612: addq $8, %rdi
162 cmp %r8, %rdi
163 jne 1b
164
165 /* Fixup phys_base */
166 addq %rbp, phys_base(%rip)
167
168.Lskip_fixup:
169 movq $(early_level4_pgt - __START_KERNEL_map), %rax
170 jmp 1f 82 jmp 1f
171ENTRY(secondary_startup_64) 83ENTRY(secondary_startup_64)
172 /* 84 /*
@@ -186,14 +98,17 @@ ENTRY(secondary_startup_64)
186 /* Sanitize CPU configuration */ 98 /* Sanitize CPU configuration */
187 call verify_cpu 99 call verify_cpu
188 100
189 movq $(init_level4_pgt - __START_KERNEL_map), %rax 101 movq $(init_top_pgt - __START_KERNEL_map), %rax
1901: 1021:
191 103
192 /* Enable PAE mode and PGE */ 104 /* Enable PAE mode, PGE and LA57 */
193 movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx 105 movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
106#ifdef CONFIG_X86_5LEVEL
107 orl $X86_CR4_LA57, %ecx
108#endif
194 movq %rcx, %cr4 109 movq %rcx, %cr4
195 110
196 /* Setup early boot stage 4 level pagetables. */ 111 /* Setup early boot stage 4-/5-level pagetables. */
197 addq phys_base(%rip), %rax 112 addq phys_base(%rip), %rax
198 movq %rax, %cr3 113 movq %rax, %cr3
199 114
@@ -417,9 +332,13 @@ GLOBAL(name)
417 .endr 332 .endr
418 333
419 __INITDATA 334 __INITDATA
420NEXT_PAGE(early_level4_pgt) 335NEXT_PAGE(early_top_pgt)
421 .fill 511,8,0 336 .fill 511,8,0
337#ifdef CONFIG_X86_5LEVEL
338 .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
339#else
422 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 340 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
341#endif
423 342
424NEXT_PAGE(early_dynamic_pgts) 343NEXT_PAGE(early_dynamic_pgts)
425 .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 344 .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -427,14 +346,14 @@ NEXT_PAGE(early_dynamic_pgts)
427 .data 346 .data
428 347
429#ifndef CONFIG_XEN 348#ifndef CONFIG_XEN
430NEXT_PAGE(init_level4_pgt) 349NEXT_PAGE(init_top_pgt)
431 .fill 512,8,0 350 .fill 512,8,0
432#else 351#else
433NEXT_PAGE(init_level4_pgt) 352NEXT_PAGE(init_top_pgt)
434 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 353 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
435 .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 354 .org init_top_pgt + PGD_PAGE_OFFSET*8, 0
436 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 355 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
437 .org init_level4_pgt + L4_START_KERNEL*8, 0 356 .org init_top_pgt + PGD_START_KERNEL*8, 0
438 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ 357 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
439 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 358 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
440 359
@@ -448,6 +367,12 @@ NEXT_PAGE(level2_ident_pgt)
448 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) 367 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
449#endif 368#endif
450 369
370#ifdef CONFIG_X86_5LEVEL
371NEXT_PAGE(level4_kernel_pgt)
372 .fill 511,8,0
373 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
374#endif
375
451NEXT_PAGE(level3_kernel_pgt) 376NEXT_PAGE(level3_kernel_pgt)
452 .fill L3_START_KERNEL,8,0 377 .fill L3_START_KERNEL,8,0
453 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ 378 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
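The renamed PGD_PAGE_OFFSET/PGD_START_KERNEL symbols and the .org directives above rely on the two index values quoted in the comments (511 and 510). A tiny stand-alone check of that arithmetic, assuming __START_KERNEL_map is the usual -2 GiB kernel text base, 0xffffffff80000000; the names below are illustrative only:

#include <stdio.h>
#include <stdint.h>

#define START_KERNEL_MAP 0xffffffff80000000ULL  /* assumed __START_KERNEL_map */

int main(void)
{
        /* pgd_index(): bits 39-47 (matching the /(2^39) comment); pud_index(): bits 30-38 */
        unsigned int pgd_start_kernel = (unsigned int)((START_KERNEL_MAP >> 39) & 511);
        unsigned int l3_start_kernel  = (unsigned int)((START_KERNEL_MAP >> 30) & 511);

        printf("PGD_START_KERNEL = %u\n", pgd_start_kernel);    /* prints 511 */
        printf("L3_START_KERNEL  = %u\n", l3_start_kernel);     /* prints 510 */
        return 0;
}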
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index d4a15831ac58..a870910c8565 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -22,24 +22,25 @@
22#include <asm/syscalls.h> 22#include <asm/syscalls.h>
23 23
24/* context.lock is held for us, so we don't need any locking. */ 24/* context.lock is held for us, so we don't need any locking. */
25static void flush_ldt(void *current_mm) 25static void flush_ldt(void *__mm)
26{ 26{
27 struct mm_struct *mm = __mm;
27 mm_context_t *pc; 28 mm_context_t *pc;
28 29
29 if (current->active_mm != current_mm) 30 if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
30 return; 31 return;
31 32
32 pc = &current->active_mm->context; 33 pc = &mm->context;
33 set_ldt(pc->ldt->entries, pc->ldt->size); 34 set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
34} 35}
35 36
36/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ 37/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
37static struct ldt_struct *alloc_ldt_struct(unsigned int size) 38static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
38{ 39{
39 struct ldt_struct *new_ldt; 40 struct ldt_struct *new_ldt;
40 unsigned int alloc_size; 41 unsigned int alloc_size;
41 42
42 if (size > LDT_ENTRIES) 43 if (num_entries > LDT_ENTRIES)
43 return NULL; 44 return NULL;
44 45
45 new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL); 46 new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
@@ -47,7 +48,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size)
47 return NULL; 48 return NULL;
48 49
49 BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct)); 50 BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
50 alloc_size = size * LDT_ENTRY_SIZE; 51 alloc_size = num_entries * LDT_ENTRY_SIZE;
51 52
52 /* 53 /*
53 * Xen is very picky: it requires a page-aligned LDT that has no 54 * Xen is very picky: it requires a page-aligned LDT that has no
@@ -65,14 +66,14 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size)
65 return NULL; 66 return NULL;
66 } 67 }
67 68
68 new_ldt->size = size; 69 new_ldt->nr_entries = num_entries;
69 return new_ldt; 70 return new_ldt;
70} 71}
71 72
72/* After calling this, the LDT is immutable. */ 73/* After calling this, the LDT is immutable. */
73static void finalize_ldt_struct(struct ldt_struct *ldt) 74static void finalize_ldt_struct(struct ldt_struct *ldt)
74{ 75{
75 paravirt_alloc_ldt(ldt->entries, ldt->size); 76 paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
76} 77}
77 78
78/* context.lock is held */ 79/* context.lock is held */
@@ -91,8 +92,8 @@ static void free_ldt_struct(struct ldt_struct *ldt)
91 if (likely(!ldt)) 92 if (likely(!ldt))
92 return; 93 return;
93 94
94 paravirt_free_ldt(ldt->entries, ldt->size); 95 paravirt_free_ldt(ldt->entries, ldt->nr_entries);
95 if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) 96 if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE)
96 vfree_atomic(ldt->entries); 97 vfree_atomic(ldt->entries);
97 else 98 else
98 free_page((unsigned long)ldt->entries); 99 free_page((unsigned long)ldt->entries);
@@ -122,14 +123,14 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
122 goto out_unlock; 123 goto out_unlock;
123 } 124 }
124 125
125 new_ldt = alloc_ldt_struct(old_mm->context.ldt->size); 126 new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
126 if (!new_ldt) { 127 if (!new_ldt) {
127 retval = -ENOMEM; 128 retval = -ENOMEM;
128 goto out_unlock; 129 goto out_unlock;
129 } 130 }
130 131
131 memcpy(new_ldt->entries, old_mm->context.ldt->entries, 132 memcpy(new_ldt->entries, old_mm->context.ldt->entries,
132 new_ldt->size * LDT_ENTRY_SIZE); 133 new_ldt->nr_entries * LDT_ENTRY_SIZE);
133 finalize_ldt_struct(new_ldt); 134 finalize_ldt_struct(new_ldt);
134 135
135 mm->context.ldt = new_ldt; 136 mm->context.ldt = new_ldt;
@@ -152,9 +153,9 @@ void destroy_context_ldt(struct mm_struct *mm)
152 153
153static int read_ldt(void __user *ptr, unsigned long bytecount) 154static int read_ldt(void __user *ptr, unsigned long bytecount)
154{ 155{
155 int retval;
156 unsigned long size;
157 struct mm_struct *mm = current->mm; 156 struct mm_struct *mm = current->mm;
157 unsigned long entries_size;
158 int retval;
158 159
159 mutex_lock(&mm->context.lock); 160 mutex_lock(&mm->context.lock);
160 161
@@ -166,18 +167,18 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
166 if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) 167 if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
167 bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; 168 bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
168 169
169 size = mm->context.ldt->size * LDT_ENTRY_SIZE; 170 entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE;
170 if (size > bytecount) 171 if (entries_size > bytecount)
171 size = bytecount; 172 entries_size = bytecount;
172 173
173 if (copy_to_user(ptr, mm->context.ldt->entries, size)) { 174 if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) {
174 retval = -EFAULT; 175 retval = -EFAULT;
175 goto out_unlock; 176 goto out_unlock;
176 } 177 }
177 178
178 if (size != bytecount) { 179 if (entries_size != bytecount) {
179 /* Zero-fill the rest and pretend we read bytecount bytes. */ 180 /* Zero-fill the rest and pretend we read bytecount bytes. */
180 if (clear_user(ptr + size, bytecount - size)) { 181 if (clear_user(ptr + entries_size, bytecount - entries_size)) {
181 retval = -EFAULT; 182 retval = -EFAULT;
182 goto out_unlock; 183 goto out_unlock;
183 } 184 }
@@ -208,7 +209,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
208{ 209{
209 struct mm_struct *mm = current->mm; 210 struct mm_struct *mm = current->mm;
210 struct ldt_struct *new_ldt, *old_ldt; 211 struct ldt_struct *new_ldt, *old_ldt;
211 unsigned int oldsize, newsize; 212 unsigned int old_nr_entries, new_nr_entries;
212 struct user_desc ldt_info; 213 struct user_desc ldt_info;
213 struct desc_struct ldt; 214 struct desc_struct ldt;
214 int error; 215 int error;
@@ -247,17 +248,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
247 248
248 mutex_lock(&mm->context.lock); 249 mutex_lock(&mm->context.lock);
249 250
250 old_ldt = mm->context.ldt; 251 old_ldt = mm->context.ldt;
251 oldsize = old_ldt ? old_ldt->size : 0; 252 old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
252 newsize = max(ldt_info.entry_number + 1, oldsize); 253 new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries);
253 254
254 error = -ENOMEM; 255 error = -ENOMEM;
255 new_ldt = alloc_ldt_struct(newsize); 256 new_ldt = alloc_ldt_struct(new_nr_entries);
256 if (!new_ldt) 257 if (!new_ldt)
257 goto out_unlock; 258 goto out_unlock;
258 259
259 if (old_ldt) 260 if (old_ldt)
260 memcpy(new_ldt->entries, old_ldt->entries, oldsize * LDT_ENTRY_SIZE); 261 memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE);
262
261 new_ldt->entries[ldt_info.entry_number] = ldt; 263 new_ldt->entries[ldt_info.entry_number] = ldt;
262 finalize_ldt_struct(new_ldt); 264 finalize_ldt_struct(new_ldt);
263 265
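The size -> nr_entries rename separates the entry count from the byte count: an LDT allocation is nr_entries * LDT_ENTRY_SIZE bytes, and anything larger than one page has to come from vmalloc (and is vfree'd on the matching path above). A minimal sketch of just that sizing decision, with LDT_ENTRY_SIZE and PAGE_SIZE hard-coded to their usual x86 values and a made-up helper name:

#include <stdio.h>

#define LDT_ENTRY_SIZE 8        /* sizeof(struct desc_struct) on x86 */
#define PAGE_SIZE      4096

/* Returns 1 if an LDT with this many entries needs a vmalloc'ed buffer. */
static int ldt_needs_vmalloc(unsigned int nr_entries)
{
        unsigned int alloc_size = nr_entries * LDT_ENTRY_SIZE;

        return alloc_size > PAGE_SIZE;
}

int main(void)
{
        printf("512 entries -> %s\n", ldt_needs_vmalloc(512) ? "vmalloc" : "one page");
        printf("513 entries -> %s\n", ldt_needs_vmalloc(513) ? "vmalloc" : "one page");
        return 0;
}

With 8-byte descriptors, 512 entries exactly fill one page, so only LDTs of 513 entries or more take the vmalloc path.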
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 6f5ca4ebe6e5..cb0a30473c23 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -347,7 +347,7 @@ void machine_kexec(struct kimage *image)
347void arch_crash_save_vmcoreinfo(void) 347void arch_crash_save_vmcoreinfo(void)
348{ 348{
349 VMCOREINFO_NUMBER(phys_base); 349 VMCOREINFO_NUMBER(phys_base);
350 VMCOREINFO_SYMBOL(init_level4_pgt); 350 VMCOREINFO_SYMBOL(init_top_pgt);
351 351
352#ifdef CONFIG_NUMA 352#ifdef CONFIG_NUMA
353 VMCOREINFO_SYMBOL(node_data); 353 VMCOREINFO_SYMBOL(node_data);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 3586996fc50d..bc0a849589bb 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
391 391
392 .read_cr2 = native_read_cr2, 392 .read_cr2 = native_read_cr2,
393 .write_cr2 = native_write_cr2, 393 .write_cr2 = native_write_cr2,
394 .read_cr3 = native_read_cr3, 394 .read_cr3 = __native_read_cr3,
395 .write_cr3 = native_write_cr3, 395 .write_cr3 = native_write_cr3,
396 396
397 .flush_tlb_user = native_flush_tlb, 397 .flush_tlb_user = native_flush_tlb,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ffeae818aa7a..c6d6dc5f8bb2 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
92 92
93 cr0 = read_cr0(); 93 cr0 = read_cr0();
94 cr2 = read_cr2(); 94 cr2 = read_cr2();
95 cr3 = read_cr3(); 95 cr3 = __read_cr3();
96 cr4 = __read_cr4(); 96 cr4 = __read_cr4();
97 printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", 97 printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
98 cr0, cr2, cr3, cr4); 98 cr0, cr2, cr3, cr4);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b6840bf3940b..c3169be4c596 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -104,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all)
104 104
105 cr0 = read_cr0(); 105 cr0 = read_cr0();
106 cr2 = read_cr2(); 106 cr2 = read_cr2();
107 cr3 = read_cr3(); 107 cr3 = __read_cr3();
108 cr4 = __read_cr4(); 108 cr4 = __read_cr4();
109 109
110 printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 110 printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
@@ -142,7 +142,7 @@ void release_thread(struct task_struct *dead_task)
142 pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", 142 pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
143 dead_task->comm, 143 dead_task->comm,
144 dead_task->mm->context.ldt->entries, 144 dead_task->mm->context.ldt->entries,
145 dead_task->mm->context.ldt->size); 145 dead_task->mm->context.ldt->nr_entries);
146 BUG(); 146 BUG();
147 } 147 }
148#endif 148#endif
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 045e4f993bd2..b474c8de7fba 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1589,7 +1589,6 @@ void native_cpu_die(unsigned int cpu)
1589void play_dead_common(void) 1589void play_dead_common(void)
1590{ 1590{
1591 idle_task_exit(); 1591 idle_task_exit();
1592 reset_lazy_tlbstate();
1593 1592
1594 /* Ack it */ 1593 /* Ack it */
1595 (void)cpu_report_death(); 1594 (void)cpu_report_death();
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index f07f83b3611b..5f25cfbd952e 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -34,7 +34,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
34 34
35 mutex_lock(&child->mm->context.lock); 35 mutex_lock(&child->mm->context.lock);
36 if (unlikely(!child->mm->context.ldt || 36 if (unlikely(!child->mm->context.ldt ||
37 seg >= child->mm->context.ldt->size)) 37 seg >= child->mm->context.ldt->nr_entries))
38 addr = -1L; /* bogus selector, access would fault */ 38 addr = -1L; /* bogus selector, access would fault */
39 else { 39 else {
40 desc = &child->mm->context.ldt->entries[seg]; 40 desc = &child->mm->context.ldt->entries[seg];
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1b469b6c762f..6dcc4873e435 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -49,6 +49,7 @@
49#include <asm/kexec.h> 49#include <asm/kexec.h>
50#include <asm/apic.h> 50#include <asm/apic.h>
51#include <asm/irq_remapping.h> 51#include <asm/irq_remapping.h>
52#include <asm/mmu_context.h>
52 53
53#include "trace.h" 54#include "trace.h"
54#include "pmu.h" 55#include "pmu.h"
@@ -597,6 +598,7 @@ struct vcpu_vmx {
597 int gs_ldt_reload_needed; 598 int gs_ldt_reload_needed;
598 int fs_reload_needed; 599 int fs_reload_needed;
599 u64 msr_host_bndcfgs; 600 u64 msr_host_bndcfgs;
601 unsigned long vmcs_host_cr3; /* May not match real cr3 */
600 unsigned long vmcs_host_cr4; /* May not match real cr4 */ 602 unsigned long vmcs_host_cr4; /* May not match real cr4 */
601 } host_state; 603 } host_state;
602 struct { 604 struct {
@@ -5013,12 +5015,19 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
5013 u32 low32, high32; 5015 u32 low32, high32;
5014 unsigned long tmpl; 5016 unsigned long tmpl;
5015 struct desc_ptr dt; 5017 struct desc_ptr dt;
5016 unsigned long cr0, cr4; 5018 unsigned long cr0, cr3, cr4;
5017 5019
5018 cr0 = read_cr0(); 5020 cr0 = read_cr0();
5019 WARN_ON(cr0 & X86_CR0_TS); 5021 WARN_ON(cr0 & X86_CR0_TS);
5020 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ 5022 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
5021 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ 5023
5024 /*
5025 * Save the most likely value for this task's CR3 in the VMCS.
5026 * We can't use __get_current_cr3_fast() because we're not atomic.
5027 */
5028 cr3 = __read_cr3();
5029 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
5030 vmx->host_state.vmcs_host_cr3 = cr3;
5022 5031
5023 /* Save the most likely value for this task's CR4 in the VMCS. */ 5032 /* Save the most likely value for this task's CR4 in the VMCS. */
5024 cr4 = cr4_read_shadow(); 5033 cr4 = cr4_read_shadow();
@@ -8822,7 +8831,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
8822static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) 8831static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
8823{ 8832{
8824 struct vcpu_vmx *vmx = to_vmx(vcpu); 8833 struct vcpu_vmx *vmx = to_vmx(vcpu);
8825 unsigned long debugctlmsr, cr4; 8834 unsigned long debugctlmsr, cr3, cr4;
8826 8835
8827 /* Don't enter VMX if guest state is invalid, let the exit handler 8836 /* Don't enter VMX if guest state is invalid, let the exit handler
8828 start emulation until we arrive back to a valid state */ 8837 start emulation until we arrive back to a valid state */
@@ -8844,6 +8853,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
8844 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) 8853 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
8845 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); 8854 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
8846 8855
8856 cr3 = __get_current_cr3_fast();
8857 if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) {
8858 vmcs_writel(HOST_CR3, cr3);
8859 vmx->host_state.vmcs_host_cr3 = cr3;
8860 }
8861
8847 cr4 = cr4_read_shadow(); 8862 cr4 = cr4_read_shadow();
8848 if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { 8863 if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
8849 vmcs_writel(HOST_CR4, cr4); 8864 vmcs_writel(HOST_CR4, cr4);
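The two vmx.c hunks apply to CR3 the trick already used for CR4: remember the value last written into the VMCS host-state field and only issue another VMCS write when the current value differs. A self-contained sketch of that cache-and-compare idiom, where write_field() is a hypothetical stand-in for vmcs_writel(HOST_CR3, ...):

#include <stdio.h>

static unsigned long cached_host_cr3;  /* last value written to HOST_CR3 */

/* Hypothetical stand-in for vmcs_writel(HOST_CR3, val). */
static void write_field(unsigned long val)
{
        printf("vmwrite HOST_CR3 = %#lx\n", val);
}

/* Called with the current CR3 before every VM entry; writes only on change. */
static void sync_host_cr3(unsigned long cr3)
{
        if (cr3 != cached_host_cr3) {
                write_field(cr3);
                cached_host_cr3 = cr3;
        }
}

int main(void)
{
        sync_host_cr3(0x1000);  /* first entry: writes the field   */
        sync_host_cr3(0x1000);  /* same CR3: no VMCS write at all  */
        sync_host_cr3(0x2000);  /* CR3 changed: writes the field   */
        return 0;
}

On the hot vmx_vcpu_run() path this turns the common case, CR3 unchanged since the last entry, into a single compare.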
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
index 5e044d506b7a..a179254a5122 100644
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -27,7 +27,7 @@ static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg)
27#ifdef CONFIG_MODIFY_LDT_SYSCALL 27#ifdef CONFIG_MODIFY_LDT_SYSCALL
28 seg >>= 3; 28 seg >>= 3;
29 mutex_lock(&current->mm->context.lock); 29 mutex_lock(&current->mm->context.lock);
30 if (current->mm->context.ldt && seg < current->mm->context.ldt->size) 30 if (current->mm->context.ldt && seg < current->mm->context.ldt->nr_entries)
31 ret = current->mm->context.ldt->entries[seg]; 31 ret = current->mm->context.ldt->entries[seg];
32 mutex_unlock(&current->mm->context.lock); 32 mutex_unlock(&current->mm->context.lock);
33#endif 33#endif
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 96d2b847e09e..0fbdcb64f9f8 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -2,7 +2,7 @@
2KCOV_INSTRUMENT_tlb.o := n 2KCOV_INSTRUMENT_tlb.o := n
3 3
4obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 4obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
5 pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o 5 pat.o pgtable.o physaddr.o setup_nx.o tlb.o
6 6
7# Make sure __phys_addr has no stackprotector 7# Make sure __phys_addr has no stackprotector
8nostackp := $(call cc-option, -fno-stack-protector) 8nostackp := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index bce6990b1d81..0470826d2bdc 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -431,7 +431,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
431 bool checkwx) 431 bool checkwx)
432{ 432{
433#ifdef CONFIG_X86_64 433#ifdef CONFIG_X86_64
434 pgd_t *start = (pgd_t *) &init_level4_pgt; 434 pgd_t *start = (pgd_t *) &init_top_pgt;
435#else 435#else
436 pgd_t *start = swapper_pg_dir; 436 pgd_t *start = swapper_pg_dir;
437#endif 437#endif
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8ad91a01cbc8..2a1fa10c6a98 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -346,7 +346,7 @@ static noinline int vmalloc_fault(unsigned long address)
346 * Do _not_ use "current" here. We might be inside 346 * Do _not_ use "current" here. We might be inside
347 * an interrupt in the middle of a task switch.. 347 * an interrupt in the middle of a task switch..
348 */ 348 */
349 pgd_paddr = read_cr3(); 349 pgd_paddr = read_cr3_pa();
350 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); 350 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
351 if (!pmd_k) 351 if (!pmd_k)
352 return -1; 352 return -1;
@@ -388,7 +388,7 @@ static bool low_pfn(unsigned long pfn)
388 388
389static void dump_pagetable(unsigned long address) 389static void dump_pagetable(unsigned long address)
390{ 390{
391 pgd_t *base = __va(read_cr3()); 391 pgd_t *base = __va(read_cr3_pa());
392 pgd_t *pgd = &base[pgd_index(address)]; 392 pgd_t *pgd = &base[pgd_index(address)];
393 p4d_t *p4d; 393 p4d_t *p4d;
394 pud_t *pud; 394 pud_t *pud;
@@ -451,7 +451,7 @@ static noinline int vmalloc_fault(unsigned long address)
451 * happen within a race in page table update. In the later 451 * happen within a race in page table update. In the later
452 * case just flush: 452 * case just flush:
453 */ 453 */
454 pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address); 454 pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
455 pgd_ref = pgd_offset_k(address); 455 pgd_ref = pgd_offset_k(address);
456 if (pgd_none(*pgd_ref)) 456 if (pgd_none(*pgd_ref))
457 return -1; 457 return -1;
@@ -555,7 +555,7 @@ static int bad_address(void *p)
555 555
556static void dump_pagetable(unsigned long address) 556static void dump_pagetable(unsigned long address)
557{ 557{
558 pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); 558 pgd_t *base = __va(read_cr3_pa());
559 pgd_t *pgd = base + pgd_index(address); 559 pgd_t *pgd = base + pgd_index(address);
560 p4d_t *p4d; 560 p4d_t *p4d;
561 pud_t *pud; 561 pud_t *pud;
@@ -700,7 +700,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
700 pgd_t *pgd; 700 pgd_t *pgd;
701 pte_t *pte; 701 pte_t *pte;
702 702
703 pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); 703 pgd = __va(read_cr3_pa());
704 pgd += pgd_index(address); 704 pgd += pgd_index(address);
705 705
706 pte = lookup_address_in_pgd(pgd, address, &level); 706 pte = lookup_address_in_pgd(pgd, address, &level);
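Every read_cr3() in fault.c that really wanted a page-table pointer now goes through read_cr3_pa(), which also replaces the explicit "& PHYSICAL_PAGE_MASK" seen in the old dump_pagetable() and show_fault_oops(). The helper itself is introduced elsewhere in this series; the sketch below is only an assumption about what it amounts to (the mask value and names are illustrative): strip the low PCID/flag bits and the bit-63 no-flush hint so that only the physical address of the top-level table remains.

#include <stdio.h>
#include <stdint.h>

/* Assumed layout: bits 0-11 carry PCID/flags, bit 63 is the no-flush hint. */
#define CR3_ADDR_MASK 0x7ffffffffffff000ULL

/* Extract the physical address of the top-level page table from a CR3 value. */
static uint64_t cr3_pa(uint64_t cr3)
{
        return cr3 & CR3_ADDR_MASK;
}

int main(void)
{
        uint64_t cr3 = 0x000000012345a001ULL;   /* example value, PCID 1 in the low bits */

        printf("page-table base = %#llx\n", (unsigned long long)cr3_pa(cr3));
        return 0;
}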
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
deleted file mode 100644
index 456dfdfd2249..000000000000
--- a/arch/x86/mm/gup.c
+++ /dev/null
@@ -1,496 +0,0 @@
1/*
2 * Lockless get_user_pages_fast for x86
3 *
4 * Copyright (C) 2008 Nick Piggin
5 * Copyright (C) 2008 Novell Inc.
6 */
7#include <linux/sched.h>
8#include <linux/mm.h>
9#include <linux/vmstat.h>
10#include <linux/highmem.h>
11#include <linux/swap.h>
12#include <linux/memremap.h>
13
14#include <asm/mmu_context.h>
15#include <asm/pgtable.h>
16
17static inline pte_t gup_get_pte(pte_t *ptep)
18{
19#ifndef CONFIG_X86_PAE
20 return READ_ONCE(*ptep);
21#else
22 /*
23 * With get_user_pages_fast, we walk down the pagetables without taking
24 * any locks. For this we would like to load the pointers atomically,
25 * but that is not possible (without expensive cmpxchg8b) on PAE. What
26 * we do have is the guarantee that a pte will only either go from not
27 * present to present, or present to not present or both -- it will not
28 * switch to a completely different present page without a TLB flush in
29 * between; something that we are blocking by holding interrupts off.
30 *
31 * Setting ptes from not present to present goes:
32 * ptep->pte_high = h;
33 * smp_wmb();
34 * ptep->pte_low = l;
35 *
36 * And present to not present goes:
37 * ptep->pte_low = 0;
38 * smp_wmb();
39 * ptep->pte_high = 0;
40 *
41 * We must ensure here that the load of pte_low sees l iff pte_high
42 * sees h. We load pte_high *after* loading pte_low, which ensures we
43 * don't see an older value of pte_high. *Then* we recheck pte_low,
44 * which ensures that we haven't picked up a changed pte high. We might
45 * have got rubbish values from pte_low and pte_high, but we are
46 * guaranteed that pte_low will not have the present bit set *unless*
47 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
48 * we're safe.
49 *
50 * gup_get_pte should not be used or copied outside gup.c without being
51 * very careful -- it does not atomically load the pte or anything that
52 * is likely to be useful for you.
53 */
54 pte_t pte;
55
56retry:
57 pte.pte_low = ptep->pte_low;
58 smp_rmb();
59 pte.pte_high = ptep->pte_high;
60 smp_rmb();
61 if (unlikely(pte.pte_low != ptep->pte_low))
62 goto retry;
63
64 return pte;
65#endif
66}
67
68static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
69{
70 while ((*nr) - nr_start) {
71 struct page *page = pages[--(*nr)];
72
73 ClearPageReferenced(page);
74 put_page(page);
75 }
76}
77
78/*
79 * 'pteval' can come from a pte, pmd, pud or p4d. We only check
80 * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
81 * same value on all 4 types.
82 */
83static inline int pte_allows_gup(unsigned long pteval, int write)
84{
85 unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
86
87 if (write)
88 need_pte_bits |= _PAGE_RW;
89
90 if ((pteval & need_pte_bits) != need_pte_bits)
91 return 0;
92
93 /* Check memory protection keys permissions. */
94 if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write))
95 return 0;
96
97 return 1;
98}
99
100/*
101 * The performance critical leaf functions are made noinline otherwise gcc
102 * inlines everything into a single function which results in too much
103 * register pressure.
104 */
105static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
106 unsigned long end, int write, struct page **pages, int *nr)
107{
108 struct dev_pagemap *pgmap = NULL;
109 int nr_start = *nr, ret = 0;
110 pte_t *ptep, *ptem;
111
112 /*
113 * Keep the original mapped PTE value (ptem) around since we
114 * might increment ptep off the end of the page when finishing
115 * our loop iteration.
116 */
117 ptem = ptep = pte_offset_map(&pmd, addr);
118 do {
119 pte_t pte = gup_get_pte(ptep);
120 struct page *page;
121
122 /* Similar to the PMD case, NUMA hinting must take slow path */
123 if (pte_protnone(pte))
124 break;
125
126 if (!pte_allows_gup(pte_val(pte), write))
127 break;
128
129 if (pte_devmap(pte)) {
130 pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
131 if (unlikely(!pgmap)) {
132 undo_dev_pagemap(nr, nr_start, pages);
133 break;
134 }
135 } else if (pte_special(pte))
136 break;
137
138 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
139 page = pte_page(pte);
140 get_page(page);
141 put_dev_pagemap(pgmap);
142 SetPageReferenced(page);
143 pages[*nr] = page;
144 (*nr)++;
145
146 } while (ptep++, addr += PAGE_SIZE, addr != end);
147 if (addr == end)
148 ret = 1;
149 pte_unmap(ptem);
150
151 return ret;
152}
153
154static inline void get_head_page_multiple(struct page *page, int nr)
155{
156 VM_BUG_ON_PAGE(page != compound_head(page), page);
157 VM_BUG_ON_PAGE(page_count(page) == 0, page);
158 page_ref_add(page, nr);
159 SetPageReferenced(page);
160}
161
162static int __gup_device_huge(unsigned long pfn, unsigned long addr,
163 unsigned long end, struct page **pages, int *nr)
164{
165 int nr_start = *nr;
166 struct dev_pagemap *pgmap = NULL;
167
168 do {
169 struct page *page = pfn_to_page(pfn);
170
171 pgmap = get_dev_pagemap(pfn, pgmap);
172 if (unlikely(!pgmap)) {
173 undo_dev_pagemap(nr, nr_start, pages);
174 return 0;
175 }
176 SetPageReferenced(page);
177 pages[*nr] = page;
178 get_page(page);
179 put_dev_pagemap(pgmap);
180 (*nr)++;
181 pfn++;
182 } while (addr += PAGE_SIZE, addr != end);
183 return 1;
184}
185
186static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
187 unsigned long end, struct page **pages, int *nr)
188{
189 unsigned long fault_pfn;
190
191 fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
192 return __gup_device_huge(fault_pfn, addr, end, pages, nr);
193}
194
195static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
196 unsigned long end, struct page **pages, int *nr)
197{
198 unsigned long fault_pfn;
199
200 fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
201 return __gup_device_huge(fault_pfn, addr, end, pages, nr);
202}
203
204static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
205 unsigned long end, int write, struct page **pages, int *nr)
206{
207 struct page *head, *page;
208 int refs;
209
210 if (!pte_allows_gup(pmd_val(pmd), write))
211 return 0;
212
213 VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
214 if (pmd_devmap(pmd))
215 return __gup_device_huge_pmd(pmd, addr, end, pages, nr);
216
217 /* hugepages are never "special" */
218 VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
219
220 refs = 0;
221 head = pmd_page(pmd);
222 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
223 do {
224 VM_BUG_ON_PAGE(compound_head(page) != head, page);
225 pages[*nr] = page;
226 (*nr)++;
227 page++;
228 refs++;
229 } while (addr += PAGE_SIZE, addr != end);
230 get_head_page_multiple(head, refs);
231
232 return 1;
233}
234
235static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
236 int write, struct page **pages, int *nr)
237{
238 unsigned long next;
239 pmd_t *pmdp;
240
241 pmdp = pmd_offset(&pud, addr);
242 do {
243 pmd_t pmd = *pmdp;
244
245 next = pmd_addr_end(addr, end);
246 if (pmd_none(pmd))
247 return 0;
248 if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
249 /*
250 * NUMA hinting faults need to be handled in the GUP
251 * slowpath for accounting purposes and so that they
252 * can be serialised against THP migration.
253 */
254 if (pmd_protnone(pmd))
255 return 0;
256 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
257 return 0;
258 } else {
259 if (!gup_pte_range(pmd, addr, next, write, pages, nr))
260 return 0;
261 }
262 } while (pmdp++, addr = next, addr != end);
263
264 return 1;
265}
266
267static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
268 unsigned long end, int write, struct page **pages, int *nr)
269{
270 struct page *head, *page;
271 int refs;
272
273 if (!pte_allows_gup(pud_val(pud), write))
274 return 0;
275
276 VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
277 if (pud_devmap(pud))
278 return __gup_device_huge_pud(pud, addr, end, pages, nr);
279
280 /* hugepages are never "special" */
281 VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
282
283 refs = 0;
284 head = pud_page(pud);
285 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
286 do {
287 VM_BUG_ON_PAGE(compound_head(page) != head, page);
288 pages[*nr] = page;
289 (*nr)++;
290 page++;
291 refs++;
292 } while (addr += PAGE_SIZE, addr != end);
293 get_head_page_multiple(head, refs);
294
295 return 1;
296}
297
298static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
299 int write, struct page **pages, int *nr)
300{
301 unsigned long next;
302 pud_t *pudp;
303
304 pudp = pud_offset(&p4d, addr);
305 do {
306 pud_t pud = *pudp;
307
308 next = pud_addr_end(addr, end);
309 if (pud_none(pud))
310 return 0;
311 if (unlikely(pud_large(pud))) {
312 if (!gup_huge_pud(pud, addr, next, write, pages, nr))
313 return 0;
314 } else {
315 if (!gup_pmd_range(pud, addr, next, write, pages, nr))
316 return 0;
317 }
318 } while (pudp++, addr = next, addr != end);
319
320 return 1;
321}
322
323static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
324 int write, struct page **pages, int *nr)
325{
326 unsigned long next;
327 p4d_t *p4dp;
328
329 p4dp = p4d_offset(&pgd, addr);
330 do {
331 p4d_t p4d = *p4dp;
332
333 next = p4d_addr_end(addr, end);
334 if (p4d_none(p4d))
335 return 0;
336 BUILD_BUG_ON(p4d_large(p4d));
337 if (!gup_pud_range(p4d, addr, next, write, pages, nr))
338 return 0;
339 } while (p4dp++, addr = next, addr != end);
340
341 return 1;
342}
343
344/*
345 * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
346 * back to the regular GUP.
347 */
348int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
349 struct page **pages)
350{
351 struct mm_struct *mm = current->mm;
352 unsigned long addr, len, end;
353 unsigned long next;
354 unsigned long flags;
355 pgd_t *pgdp;
356 int nr = 0;
357
358 start &= PAGE_MASK;
359 addr = start;
360 len = (unsigned long) nr_pages << PAGE_SHIFT;
361 end = start + len;
362 if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
363 (void __user *)start, len)))
364 return 0;
365
366 /*
367 * XXX: batch / limit 'nr', to avoid large irq off latency
368 * needs some instrumenting to determine the common sizes used by
369 * important workloads (eg. DB2), and whether limiting the batch size
370 * will decrease performance.
371 *
372 * It seems like we're in the clear for the moment. Direct-IO is
373 * the main guy that batches up lots of get_user_pages, and even
374 * they are limited to 64-at-a-time which is not so many.
375 */
376 /*
377 * This doesn't prevent pagetable teardown, but does prevent
378 * the pagetables and pages from being freed on x86.
379 *
380 * So long as we atomically load page table pointers versus teardown
381 * (which we do on x86, with the above PAE exception), we can follow the
382 * address down to the the page and take a ref on it.
383 */
384 local_irq_save(flags);
385 pgdp = pgd_offset(mm, addr);
386 do {
387 pgd_t pgd = *pgdp;
388
389 next = pgd_addr_end(addr, end);
390 if (pgd_none(pgd))
391 break;
392 if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
393 break;
394 } while (pgdp++, addr = next, addr != end);
395 local_irq_restore(flags);
396
397 return nr;
398}
399
400/**
401 * get_user_pages_fast() - pin user pages in memory
402 * @start: starting user address
403 * @nr_pages: number of pages from start to pin
404 * @write: whether pages will be written to
405 * @pages: array that receives pointers to the pages pinned.
406 * Should be at least nr_pages long.
407 *
408 * Attempt to pin user pages in memory without taking mm->mmap_sem.
409 * If not successful, it will fall back to taking the lock and
410 * calling get_user_pages().
411 *
412 * Returns number of pages pinned. This may be fewer than the number
413 * requested. If nr_pages is 0 or negative, returns 0. If no pages
414 * were pinned, returns -errno.
415 */
416int get_user_pages_fast(unsigned long start, int nr_pages, int write,
417 struct page **pages)
418{
419 struct mm_struct *mm = current->mm;
420 unsigned long addr, len, end;
421 unsigned long next;
422 pgd_t *pgdp;
423 int nr = 0;
424
425 start &= PAGE_MASK;
426 addr = start;
427 len = (unsigned long) nr_pages << PAGE_SHIFT;
428
429 end = start + len;
430 if (end < start)
431 goto slow_irqon;
432
433#ifdef CONFIG_X86_64
434 if (end >> __VIRTUAL_MASK_SHIFT)
435 goto slow_irqon;
436#endif
437
438 /*
439 * XXX: batch / limit 'nr', to avoid large irq off latency
440 * needs some instrumenting to determine the common sizes used by
441 * important workloads (eg. DB2), and whether limiting the batch size
442 * will decrease performance.
443 *
444 * It seems like we're in the clear for the moment. Direct-IO is
445 * the main guy that batches up lots of get_user_pages, and even
446 * they are limited to 64-at-a-time which is not so many.
447 */
448 /*
449 * This doesn't prevent pagetable teardown, but does prevent
450 * the pagetables and pages from being freed on x86.
451 *
452 * So long as we atomically load page table pointers versus teardown
453 * (which we do on x86, with the above PAE exception), we can follow the
454 * address down to the the page and take a ref on it.
455 */
456 local_irq_disable();
457 pgdp = pgd_offset(mm, addr);
458 do {
459 pgd_t pgd = *pgdp;
460
461 next = pgd_addr_end(addr, end);
462 if (pgd_none(pgd))
463 goto slow;
464 if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
465 goto slow;
466 } while (pgdp++, addr = next, addr != end);
467 local_irq_enable();
468
469 VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
470 return nr;
471
472 {
473 int ret;
474
475slow:
476 local_irq_enable();
477slow_irqon:
478 /* Try to get the remaining pages with get_user_pages */
479 start += nr << PAGE_SHIFT;
480 pages += nr;
481
482 ret = get_user_pages_unlocked(start,
483 (end - start) >> PAGE_SHIFT,
484 pages, write ? FOLL_WRITE : 0);
485
486 /* Have to be a bit careful with return values */
487 if (nr > 0) {
488 if (ret < 0)
489 ret = nr;
490 else
491 ret += nr;
492 }
493
494 return ret;
495 }
496}
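The deleted fallback tail ("Have to be a bit careful with return values") merges the count pinned on the lockless fast path with whatever the regular get_user_pages_unlocked() call returns, which may be a negative errno. A small worked version of that merge, under a made-up function name:

#include <stdio.h>

/*
 * Combine the count pinned on the lockless fast path (nr) with the result
 * of the regular get_user_pages() fallback (ret, possibly a negative errno).
 */
static int merge_gup_results(int nr, int ret)
{
        if (nr > 0) {
                if (ret < 0)
                        ret = nr;       /* report the pages we did pin */
                else
                        ret += nr;      /* fast-path pages + slow-path pages */
        }
        return ret;
}

int main(void)
{
        printf("%d\n", merge_gup_results(3, -14));      /* 3: error after a partial pin */
        printf("%d\n", merge_gup_results(3, 5));        /* 8: both paths pinned pages   */
        printf("%d\n", merge_gup_results(0, -14));      /* -14: nothing pinned at all   */
        return 0;
}

A partial fast-path success is never turned into an error; the errno only escapes when no pages were pinned at all.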
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 9b3f9fa5b283..673541eb3b3f 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -811,10 +811,8 @@ void __init zone_sizes_init(void)
811} 811}
812 812
813DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { 813DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
814#ifdef CONFIG_SMP 814 .loaded_mm = &init_mm,
815 .active_mm = &init_mm,
816 .state = 0, 815 .state = 0,
817#endif
818 .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ 816 .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
819}; 817};
820EXPORT_SYMBOL_GPL(cpu_tlbstate); 818EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 0a59daf799f8..dae6a5e5ad4a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -92,6 +92,44 @@ __setup("noexec32=", nonx32_setup);
92 * When memory was added make sure all the processes MM have 92 * When memory was added make sure all the processes MM have
93 * suitable PGD entries in the local PGD level page. 93 * suitable PGD entries in the local PGD level page.
94 */ 94 */
95#ifdef CONFIG_X86_5LEVEL
96void sync_global_pgds(unsigned long start, unsigned long end)
97{
98 unsigned long addr;
99
100 for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
101 const pgd_t *pgd_ref = pgd_offset_k(addr);
102 struct page *page;
103
104 /* Check for overflow */
105 if (addr < start)
106 break;
107
108 if (pgd_none(*pgd_ref))
109 continue;
110
111 spin_lock(&pgd_lock);
112 list_for_each_entry(page, &pgd_list, lru) {
113 pgd_t *pgd;
114 spinlock_t *pgt_lock;
115
116 pgd = (pgd_t *)page_address(page) + pgd_index(addr);
117 /* the pgt_lock only for Xen */
118 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
119 spin_lock(pgt_lock);
120
121 if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
122 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
123
124 if (pgd_none(*pgd))
125 set_pgd(pgd, *pgd_ref);
126
127 spin_unlock(pgt_lock);
128 }
129 spin_unlock(&pgd_lock);
130 }
131}
132#else
95void sync_global_pgds(unsigned long start, unsigned long end) 133void sync_global_pgds(unsigned long start, unsigned long end)
96{ 134{
97 unsigned long addr; 135 unsigned long addr;
@@ -135,6 +173,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)
135 spin_unlock(&pgd_lock); 173 spin_unlock(&pgd_lock);
136 } 174 }
137} 175}
176#endif
138 177
139/* 178/*
140 * NOTE: This function is marked __ref because it calls __init function 179 * NOTE: This function is marked __ref because it calls __init function
@@ -585,6 +624,57 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
585 return paddr_last; 624 return paddr_last;
586} 625}
587 626
627static unsigned long __meminit
628phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
629 unsigned long page_size_mask)
630{
631 unsigned long paddr_next, paddr_last = paddr_end;
632 unsigned long vaddr = (unsigned long)__va(paddr);
633 int i = p4d_index(vaddr);
634
635 if (!IS_ENABLED(CONFIG_X86_5LEVEL))
636 return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);
637
638 for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
639 p4d_t *p4d;
640 pud_t *pud;
641
642 vaddr = (unsigned long)__va(paddr);
643 p4d = p4d_page + p4d_index(vaddr);
644 paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
645
646 if (paddr >= paddr_end) {
647 if (!after_bootmem &&
648 !e820__mapped_any(paddr & P4D_MASK, paddr_next,
649 E820_TYPE_RAM) &&
650 !e820__mapped_any(paddr & P4D_MASK, paddr_next,
651 E820_TYPE_RESERVED_KERN))
652 set_p4d(p4d, __p4d(0));
653 continue;
654 }
655
656 if (!p4d_none(*p4d)) {
657 pud = pud_offset(p4d, 0);
658 paddr_last = phys_pud_init(pud, paddr,
659 paddr_end,
660 page_size_mask);
661 __flush_tlb_all();
662 continue;
663 }
664
665 pud = alloc_low_page();
666 paddr_last = phys_pud_init(pud, paddr, paddr_end,
667 page_size_mask);
668
669 spin_lock(&init_mm.page_table_lock);
670 p4d_populate(&init_mm, p4d, pud);
671 spin_unlock(&init_mm.page_table_lock);
672 }
673 __flush_tlb_all();
674
675 return paddr_last;
676}
677
588/* 678/*
589 * Create page table mapping for the physical memory for specific physical 679 * Create page table mapping for the physical memory for specific physical
590 * addresses. The virtual and physical addresses have to be aligned on PMD level 680 * addresses. The virtual and physical addresses have to be aligned on PMD level
@@ -606,26 +696,26 @@ kernel_physical_mapping_init(unsigned long paddr_start,
606 for (; vaddr < vaddr_end; vaddr = vaddr_next) { 696 for (; vaddr < vaddr_end; vaddr = vaddr_next) {
607 pgd_t *pgd = pgd_offset_k(vaddr); 697 pgd_t *pgd = pgd_offset_k(vaddr);
608 p4d_t *p4d; 698 p4d_t *p4d;
609 pud_t *pud;
610 699
611 vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE; 700 vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
612 701
613 BUILD_BUG_ON(pgd_none(*pgd)); 702 if (pgd_val(*pgd)) {
614 p4d = p4d_offset(pgd, vaddr); 703 p4d = (p4d_t *)pgd_page_vaddr(*pgd);
615 if (p4d_val(*p4d)) { 704 paddr_last = phys_p4d_init(p4d, __pa(vaddr),
616 pud = (pud_t *)p4d_page_vaddr(*p4d);
617 paddr_last = phys_pud_init(pud, __pa(vaddr),
618 __pa(vaddr_end), 705 __pa(vaddr_end),
619 page_size_mask); 706 page_size_mask);
620 continue; 707 continue;
621 } 708 }
622 709
623 pud = alloc_low_page(); 710 p4d = alloc_low_page();
624 paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end), 711 paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
625 page_size_mask); 712 page_size_mask);
626 713
627 spin_lock(&init_mm.page_table_lock); 714 spin_lock(&init_mm.page_table_lock);
628 p4d_populate(&init_mm, p4d, pud); 715 if (IS_ENABLED(CONFIG_X86_5LEVEL))
716 pgd_populate(&init_mm, pgd, p4d);
717 else
718 p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
629 spin_unlock(&init_mm.page_table_lock); 719 spin_unlock(&init_mm.page_table_lock);
630 pgd_changed = true; 720 pgd_changed = true;
631 } 721 }
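phys_p4d_init() above only has real work to do when CONFIG_X86_5LEVEL is set; otherwise the IS_ENABLED() check at its top hands the page straight to phys_pud_init(), because with 4-level paging the p4d level is folded into the pgd. A toy illustration of that folding, using made-up macros rather than the kernel's pgtable-nop4d definitions:

#include <stdio.h>

/* Toggle to compare 5-level (a real p4d table) with 4-level (p4d folded). */
#define FIVE_LEVEL 1

#if FIVE_LEVEL
#define PTRS_PER_P4D 512        /* a real table level: 9 bits of index at bit 39 */
#else
#define PTRS_PER_P4D 1          /* folded: the "p4d" is just the pgd entry       */
#endif
#define P4D_SHIFT 39

/*
 * With PTRS_PER_P4D == 1 this always returns 0, so the single p4d slot is the
 * pgd entry itself; that is why the non-5-level path above can cast the page
 * to pud_t * and pass it straight to phys_pud_init().
 */
static unsigned int p4d_index(unsigned long long addr)
{
        return (unsigned int)((addr >> P4D_SHIFT) & (PTRS_PER_P4D - 1));
}

int main(void)
{
        printf("p4d_index(0xffffffff80000000) = %u\n",
               p4d_index(0xffffffff80000000ULL));
        return 0;
}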
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index bbc558b88a88..4c1b5fd0c7ad 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -424,7 +424,7 @@ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
424static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) 424static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
425{ 425{
426 /* Don't assume we're using swapper_pg_dir at this point */ 426 /* Don't assume we're using swapper_pg_dir at this point */
427 pgd_t *base = __va(read_cr3()); 427 pgd_t *base = __va(read_cr3_pa());
428 pgd_t *pgd = &base[pgd_index(addr)]; 428 pgd_t *pgd = &base[pgd_index(addr)];
429 p4d_t *p4d = p4d_offset(pgd, addr); 429 p4d_t *p4d = p4d_offset(pgd, addr);
430 pud_t *pud = pud_offset(p4d, addr); 430 pud_t *pud = pud_offset(p4d, addr);
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 0c7d8129bed6..88215ac16b24 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -12,7 +12,7 @@
12#include <asm/tlbflush.h> 12#include <asm/tlbflush.h>
13#include <asm/sections.h> 13#include <asm/sections.h>
14 14
15extern pgd_t early_level4_pgt[PTRS_PER_PGD]; 15extern pgd_t early_top_pgt[PTRS_PER_PGD];
16extern struct range pfn_mapped[E820_MAX_ENTRIES]; 16extern struct range pfn_mapped[E820_MAX_ENTRIES];
17 17
18static int __init map_range(struct range *range) 18static int __init map_range(struct range *range)
@@ -109,8 +109,8 @@ void __init kasan_early_init(void)
109 for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++) 109 for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
110 kasan_zero_p4d[i] = __p4d(p4d_val); 110 kasan_zero_p4d[i] = __p4d(p4d_val);
111 111
112 kasan_map_early_shadow(early_level4_pgt); 112 kasan_map_early_shadow(early_top_pgt);
113 kasan_map_early_shadow(init_level4_pgt); 113 kasan_map_early_shadow(init_top_pgt);
114} 114}
115 115
116void __init kasan_init(void) 116void __init kasan_init(void)
@@ -121,8 +121,8 @@ void __init kasan_init(void)
121 register_die_notifier(&kasan_die_notifier); 121 register_die_notifier(&kasan_die_notifier);
122#endif 122#endif
123 123
124 memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt)); 124 memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
125 load_cr3(early_level4_pgt); 125 load_cr3(early_top_pgt);
126 __flush_tlb_all(); 126 __flush_tlb_all();
127 127
128 clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); 128 clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
@@ -148,7 +148,7 @@ void __init kasan_init(void)
148 kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), 148 kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
149 (void *)KASAN_SHADOW_END); 149 (void *)KASAN_SHADOW_END);
150 150
151 load_cr3(init_level4_pgt); 151 load_cr3(init_top_pgt);
152 __flush_tlb_all(); 152 __flush_tlb_all();
153 153
154 /* 154 /*
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index aed206475aa7..af599167fe3c 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -6,12 +6,12 @@
6 * 6 *
7 * Entropy is generated using the KASLR early boot functions now shared in 7 * Entropy is generated using the KASLR early boot functions now shared in
8 * the lib directory (originally written by Kees Cook). Randomization is 8 * the lib directory (originally written by Kees Cook). Randomization is
9 * done on PGD & PUD page table levels to increase possible addresses. The 9 * done on PGD & P4D/PUD page table levels to increase possible addresses.
10 * physical memory mapping code was adapted to support PUD level virtual 10 * The physical memory mapping code was adapted to support P4D/PUD level
11 * addresses. This implementation on the best configuration provides 30,000 11 * virtual addresses. This implementation on the best configuration provides
12 * possible virtual addresses in average for each memory region. An additional 12 * 30,000 possible virtual addresses in average for each memory region.
13 * low memory page is used to ensure each CPU can start with a PGD aligned 13 * An additional low memory page is used to ensure each CPU can start with
14 * virtual address (for realmode). 14 * a PGD aligned virtual address (for realmode).
15 * 15 *
16 * The order of each memory region is not changed. The feature looks at 16 * The order of each memory region is not changed. The feature looks at
17 * the available space for the regions based on different configuration 17 * the available space for the regions based on different configuration
@@ -70,7 +70,7 @@ static __initdata struct kaslr_memory_region {
70 unsigned long *base; 70 unsigned long *base;
71 unsigned long size_tb; 71 unsigned long size_tb;
72} kaslr_regions[] = { 72} kaslr_regions[] = {
73 { &page_offset_base, 64/* Maximum */ }, 73 { &page_offset_base, 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) /* Maximum */ },
74 { &vmalloc_base, VMALLOC_SIZE_TB }, 74 { &vmalloc_base, VMALLOC_SIZE_TB },
75 { &vmemmap_base, 1 }, 75 { &vmemmap_base, 1 },
76}; 76};
@@ -142,7 +142,10 @@ void __init kernel_randomize_memory(void)
142 */ 142 */
143 entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); 143 entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
144 prandom_bytes_state(&rand_state, &rand, sizeof(rand)); 144 prandom_bytes_state(&rand_state, &rand, sizeof(rand));
145 entropy = (rand % (entropy + 1)) & PUD_MASK; 145 if (IS_ENABLED(CONFIG_X86_5LEVEL))
146 entropy = (rand % (entropy + 1)) & P4D_MASK;
147 else
148 entropy = (rand % (entropy + 1)) & PUD_MASK;
146 vaddr += entropy; 149 vaddr += entropy;
147 *kaslr_regions[i].base = vaddr; 150 *kaslr_regions[i].base = vaddr;
148 151
@@ -151,27 +154,21 @@ void __init kernel_randomize_memory(void)
151 * randomization alignment. 154 * randomization alignment.
152 */ 155 */
153 vaddr += get_padding(&kaslr_regions[i]); 156 vaddr += get_padding(&kaslr_regions[i]);
154 vaddr = round_up(vaddr + 1, PUD_SIZE); 157 if (IS_ENABLED(CONFIG_X86_5LEVEL))
158 vaddr = round_up(vaddr + 1, P4D_SIZE);
159 else
160 vaddr = round_up(vaddr + 1, PUD_SIZE);
155 remain_entropy -= entropy; 161 remain_entropy -= entropy;
156 } 162 }
157} 163}
158 164
159/* 165static void __meminit init_trampoline_pud(void)
160 * Create PGD aligned trampoline table to allow real mode initialization
161 * of additional CPUs. Consume only 1 low memory page.
162 */
163void __meminit init_trampoline(void)
164{ 166{
165 unsigned long paddr, paddr_next; 167 unsigned long paddr, paddr_next;
166 pgd_t *pgd; 168 pgd_t *pgd;
167 pud_t *pud_page, *pud_page_tramp; 169 pud_t *pud_page, *pud_page_tramp;
168 int i; 170 int i;
169 171
170 if (!kaslr_memory_enabled()) {
171 init_trampoline_default();
172 return;
173 }
174
175 pud_page_tramp = alloc_low_page(); 172 pud_page_tramp = alloc_low_page();
176 173
177 paddr = 0; 174 paddr = 0;
@@ -192,3 +189,49 @@ void __meminit init_trampoline(void)
192 set_pgd(&trampoline_pgd_entry, 189 set_pgd(&trampoline_pgd_entry,
193 __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); 190 __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
194} 191}
192
193static void __meminit init_trampoline_p4d(void)
194{
195 unsigned long paddr, paddr_next;
196 pgd_t *pgd;
197 p4d_t *p4d_page, *p4d_page_tramp;
198 int i;
199
200 p4d_page_tramp = alloc_low_page();
201
202 paddr = 0;
203 pgd = pgd_offset_k((unsigned long)__va(paddr));
204 p4d_page = (p4d_t *) pgd_page_vaddr(*pgd);
205
206 for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) {
207 p4d_t *p4d, *p4d_tramp;
208 unsigned long vaddr = (unsigned long)__va(paddr);
209
210 p4d_tramp = p4d_page_tramp + p4d_index(paddr);
211 p4d = p4d_page + p4d_index(vaddr);
212 paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
213
214 *p4d_tramp = *p4d;
215 }
216
217 set_pgd(&trampoline_pgd_entry,
218 __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)));
219}
220
221/*
222 * Create PGD aligned trampoline table to allow real mode initialization
223 * of additional CPUs. Consume only 1 low memory page.
224 */
225void __meminit init_trampoline(void)
226{
227
228 if (!kaslr_memory_enabled()) {
229 init_trampoline_default();
230 return;
231 }
232
233 if (IS_ENABLED(CONFIG_X86_5LEVEL))
234 init_trampoline_p4d();
235 else
236 init_trampoline_pud();
237}
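The first kaslr_regions[] entry is now sized from the physical address width rather than the old hard-coded 64 TB. With TB_SHIFT = 40, the expression 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) works out as below; the 46- and 52-bit widths are the usual 4-level and 5-level limits and are assumptions here, not taken from this hunk:

#include <stdio.h>

#define TB_SHIFT 40

/* Maximum direct-mapping size, in TB, for a given physical address width. */
static unsigned long max_direct_map_tb(unsigned int physical_mask_shift)
{
        return 1UL << (physical_mask_shift - TB_SHIFT);
}

int main(void)
{
        printf("46-bit physical: %4lu TB\n", max_direct_map_tb(46));    /* 64 TB, the old constant */
        printf("52-bit physical: %4lu TB\n", max_direct_map_tb(52));    /* 4096 TB with 5-level    */
        return 0;
}

So existing 4-level configurations keep the old 64 TB sizing, and the region grows automatically when 5-level paging raises the physical address width.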
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 19ad095b41df..797295e792b2 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -74,9 +74,6 @@ static int mmap_is_legacy(void)
74 if (current->personality & ADDR_COMPAT_LAYOUT) 74 if (current->personality & ADDR_COMPAT_LAYOUT)
75 return 1; 75 return 1;
76 76
77 if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
78 return 1;
79
80 return sysctl_legacy_va_layout; 77 return sysctl_legacy_va_layout;
81} 78}
82 79
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6e7bedf69af7..014d07a80053 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -15,7 +15,7 @@
15#include <linux/debugfs.h> 15#include <linux/debugfs.h>
16 16
17/* 17/*
18 * Smarter SMP flushing macros. 18 * TLB flushing, formerly SMP-only
19 * c/o Linus Torvalds. 19 * c/o Linus Torvalds.
20 * 20 *
21 * These mean you can really definitely utterly forget about 21 * These mean you can really definitely utterly forget about
@@ -28,39 +28,28 @@
28 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi 28 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
29 */ 29 */
30 30
31#ifdef CONFIG_SMP
32
33struct flush_tlb_info {
34 struct mm_struct *flush_mm;
35 unsigned long flush_start;
36 unsigned long flush_end;
37};
38
39/*
40 * We cannot call mmdrop() because we are in interrupt context,
41 * instead update mm->cpu_vm_mask.
42 */
43void leave_mm(int cpu) 31void leave_mm(int cpu)
44{ 32{
45 struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm); 33 struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
34
35 /*
36 * It's plausible that we're in lazy TLB mode while our mm is init_mm.
37 * If so, our callers still expect us to flush the TLB, but there
38 * aren't any user TLB entries in init_mm to worry about.
39 *
40 * This needs to happen before any other sanity checks due to
41 * intel_idle's shenanigans.
42 */
43 if (loaded_mm == &init_mm)
44 return;
45
46 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) 46 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
47 BUG(); 47 BUG();
48 if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { 48
49 cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); 49 switch_mm(NULL, &init_mm, NULL);
50 load_cr3(swapper_pg_dir);
51 /*
52 * This gets called in the idle path where RCU
53 * functions differently. Tracing normally
54 * uses RCU, so we have to call the tracepoint
55 * specially here.
56 */
57 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
58 }
59} 50}
60EXPORT_SYMBOL_GPL(leave_mm); 51EXPORT_SYMBOL_GPL(leave_mm);
61 52
62#endif /* CONFIG_SMP */
63
64void switch_mm(struct mm_struct *prev, struct mm_struct *next, 53void switch_mm(struct mm_struct *prev, struct mm_struct *next,
65 struct task_struct *tsk) 54 struct task_struct *tsk)
66{ 55{
@@ -75,216 +64,167 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
75 struct task_struct *tsk) 64 struct task_struct *tsk)
76{ 65{
77 unsigned cpu = smp_processor_id(); 66 unsigned cpu = smp_processor_id();
67 struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
78 68
79 if (likely(prev != next)) { 69 /*
80 if (IS_ENABLED(CONFIG_VMAP_STACK)) { 70 * NB: The scheduler will call us with prev == next when
81 /* 71 * switching from lazy TLB mode to normal mode if active_mm
82 * If our current stack is in vmalloc space and isn't 72 * isn't changing. When this happens, there is no guarantee
83 * mapped in the new pgd, we'll double-fault. Forcibly 73 * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
84 * map it. 74 *
85 */ 75 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
86 unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); 76 */
87
88 pgd_t *pgd = next->pgd + stack_pgd_index;
89
90 if (unlikely(pgd_none(*pgd)))
91 set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
92 }
93 77
94#ifdef CONFIG_SMP 78 this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
95 this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
96 this_cpu_write(cpu_tlbstate.active_mm, next);
97#endif
98 79
99 cpumask_set_cpu(cpu, mm_cpumask(next)); 80 if (real_prev == next) {
81 /*
82 * There's nothing to do: we always keep the per-mm control
83 * regs in sync with cpu_tlbstate.loaded_mm. Just
84 * sanity-check mm_cpumask.
85 */
86 if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
87 cpumask_set_cpu(cpu, mm_cpumask(next));
88 return;
89 }
100 90
91 if (IS_ENABLED(CONFIG_VMAP_STACK)) {
101 /* 92 /*
102 * Re-load page tables. 93 * If our current stack is in vmalloc space and isn't
103 * 94 * mapped in the new pgd, we'll double-fault. Forcibly
104 * This logic has an ordering constraint: 95 * map it.
105 *
106 * CPU 0: Write to a PTE for 'next'
107 * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
108 * CPU 1: set bit 1 in next's mm_cpumask
109 * CPU 1: load from the PTE that CPU 0 writes (implicit)
110 *
111 * We need to prevent an outcome in which CPU 1 observes
112 * the new PTE value and CPU 0 observes bit 1 clear in
113 * mm_cpumask. (If that occurs, then the IPI will never
114 * be sent, and CPU 0's TLB will contain a stale entry.)
115 *
116 * The bad outcome can occur if either CPU's load is
117 * reordered before that CPU's store, so both CPUs must
118 * execute full barriers to prevent this from happening.
119 *
120 * Thus, switch_mm needs a full barrier between the
121 * store to mm_cpumask and any operation that could load
122 * from next->pgd. TLB fills are special and can happen
123 * due to instruction fetches or for no reason at all,
124 * and neither LOCK nor MFENCE orders them.
125 * Fortunately, load_cr3() is serializing and gives the
126 * ordering guarantee we need.
127 *
128 */ 96 */
129 load_cr3(next->pgd); 97 unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
130 98
131 trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); 99 pgd_t *pgd = next->pgd + stack_pgd_index;
132 100
133 /* Stop flush ipis for the previous mm */ 101 if (unlikely(pgd_none(*pgd)))
134 cpumask_clear_cpu(cpu, mm_cpumask(prev)); 102 set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
103 }
135 104
136 /* Load per-mm CR4 state */ 105 this_cpu_write(cpu_tlbstate.loaded_mm, next);
137 load_mm_cr4(next);
138 106
139#ifdef CONFIG_MODIFY_LDT_SYSCALL 107 WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
140 /* 108 cpumask_set_cpu(cpu, mm_cpumask(next));
141 * Load the LDT, if the LDT is different. 109
142 * 110 /*
143 * It's possible that prev->context.ldt doesn't match 111 * Re-load page tables.
144 * the LDT register. This can happen if leave_mm(prev) 112 *
145 * was called and then modify_ldt changed 113 * This logic has an ordering constraint:
146 * prev->context.ldt but suppressed an IPI to this CPU. 114 *
147 * In this case, prev->context.ldt != NULL, because we 115 * CPU 0: Write to a PTE for 'next'
148 * never set context.ldt to NULL while the mm still 116 * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
149 * exists. That means that next->context.ldt != 117 * CPU 1: set bit 1 in next's mm_cpumask
150 * prev->context.ldt, because mms never share an LDT. 118 * CPU 1: load from the PTE that CPU 0 writes (implicit)
151 */ 119 *
152 if (unlikely(prev->context.ldt != next->context.ldt)) 120 * We need to prevent an outcome in which CPU 1 observes
153 load_mm_ldt(next); 121 * the new PTE value and CPU 0 observes bit 1 clear in
154#endif 122 * mm_cpumask. (If that occurs, then the IPI will never
123 * be sent, and CPU 0's TLB will contain a stale entry.)
124 *
125 * The bad outcome can occur if either CPU's load is
126 * reordered before that CPU's store, so both CPUs must
127 * execute full barriers to prevent this from happening.
128 *
129 * Thus, switch_mm needs a full barrier between the
130 * store to mm_cpumask and any operation that could load
131 * from next->pgd. TLB fills are special and can happen
132 * due to instruction fetches or for no reason at all,
133 * and neither LOCK nor MFENCE orders them.
134 * Fortunately, load_cr3() is serializing and gives the
135 * ordering guarantee we need.
136 */
137 load_cr3(next->pgd);
138
139 /*
140 * This gets called via leave_mm() in the idle path where RCU
141 * functions differently. Tracing normally uses RCU, so we have to
142 * call the tracepoint specially here.
143 */
144 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
145
146 /* Stop flush ipis for the previous mm */
147 WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
148 real_prev != &init_mm);
149 cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
150
151 /* Load per-mm CR4 and LDTR state */
152 load_mm_cr4(next);
153 switch_ldt(real_prev, next);
154}
155
156static void flush_tlb_func_common(const struct flush_tlb_info *f,
157 bool local, enum tlb_flush_reason reason)
158{
159 /* This code cannot presently handle being reentered. */
160 VM_WARN_ON(!irqs_disabled());
161
162 if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
163 leave_mm(smp_processor_id());
164 return;
155 } 165 }
156#ifdef CONFIG_SMP
157 else {
158 this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
159 BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
160
161 if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
162 /*
163 * On established mms, the mm_cpumask is only changed
164 * from irq context, from ptep_clear_flush() while in
165 * lazy tlb mode, and here. Irqs are blocked during
166 * schedule, protecting us from simultaneous changes.
167 */
168 cpumask_set_cpu(cpu, mm_cpumask(next));
169 166
170 /* 167 if (f->end == TLB_FLUSH_ALL) {
171 * We were in lazy tlb mode and leave_mm disabled 168 local_flush_tlb();
172 * tlb flush IPI delivery. We must reload CR3 169 if (local)
173 * to make sure to use no freed page tables. 170 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
174 * 171 trace_tlb_flush(reason, TLB_FLUSH_ALL);
175 * As above, load_cr3() is serializing and orders TLB 172 } else {
176 * fills with respect to the mm_cpumask write. 173 unsigned long addr;
177 */ 174 unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
178 load_cr3(next->pgd); 175 addr = f->start;
179 trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); 176 while (addr < f->end) {
180 load_mm_cr4(next); 177 __flush_tlb_single(addr);
181 load_mm_ldt(next); 178 addr += PAGE_SIZE;
182 } 179 }
180 if (local)
181 count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
182 trace_tlb_flush(reason, nr_pages);
183 } 183 }
184#endif
185} 184}
186 185
187#ifdef CONFIG_SMP 186static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
187{
188 const struct flush_tlb_info *f = info;
188 189
189/* 190 flush_tlb_func_common(f, true, reason);
190 * The flush IPI assumes that a thread switch happens in this order: 191}
191 * [cpu0: the cpu that switches]
192 * 1) switch_mm() either 1a) or 1b)
193 * 1a) thread switch to a different mm
194 * 1a1) set cpu_tlbstate to TLBSTATE_OK
195 * Now the tlb flush NMI handler flush_tlb_func won't call leave_mm
196 * if cpu0 was in lazy tlb mode.
197 * 1a2) update cpu active_mm
198 * Now cpu0 accepts tlb flushes for the new mm.
199 * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
200 * Now the other cpus will send tlb flush ipis.
201 * 1a4) change cr3.
202 * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
203 * Stop ipi delivery for the old mm. This is not synchronized with
204 * the other cpus, but flush_tlb_func ignores flush ipis for the wrong
205 * mm, and in the worst case we perform a superfluous tlb flush.
206 * 1b) thread switch without mm change
207 * cpu active_mm is correct, cpu0 already handles flush ipis.
208 * 1b1) set cpu_tlbstate to TLBSTATE_OK
209 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
210 * Atomically set the bit [other cpus will start sending flush ipis],
211 * and test the bit.
212 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
213 * 2) switch %%esp, ie current
214 *
215 * The interrupt must handle 2 special cases:
216 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
217 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
218 * runs in kernel space, the cpu could load tlb entries for user space
219 * pages.
220 *
221 * The good news is that cpu_tlbstate is local to each cpu, no
222 * write/read ordering problems.
223 */
224 192
225/* 193static void flush_tlb_func_remote(void *info)
226 * TLB flush function:
227 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
228 * 2) Leave the mm if we are in the lazy tlb mode.
229 */
230static void flush_tlb_func(void *info)
231{ 194{
232 struct flush_tlb_info *f = info; 195 const struct flush_tlb_info *f = info;
233 196
234 inc_irq_stat(irq_tlb_count); 197 inc_irq_stat(irq_tlb_count);
235 198
236 if (f->flush_mm && f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) 199 if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
237 return; 200 return;
238 201
239 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); 202 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
240 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { 203 flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
241 if (f->flush_end == TLB_FLUSH_ALL) {
242 local_flush_tlb();
243 trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
244 } else {
245 unsigned long addr;
246 unsigned long nr_pages =
247 (f->flush_end - f->flush_start) / PAGE_SIZE;
248 addr = f->flush_start;
249 while (addr < f->flush_end) {
250 __flush_tlb_single(addr);
251 addr += PAGE_SIZE;
252 }
253 trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
254 }
255 } else
256 leave_mm(smp_processor_id());
257
258} 204}
259 205
260void native_flush_tlb_others(const struct cpumask *cpumask, 206void native_flush_tlb_others(const struct cpumask *cpumask,
261 struct mm_struct *mm, unsigned long start, 207 const struct flush_tlb_info *info)
262 unsigned long end)
263{ 208{
264 struct flush_tlb_info info;
265
266 info.flush_mm = mm;
267 info.flush_start = start;
268 info.flush_end = end;
269
270 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); 209 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
271 if (end == TLB_FLUSH_ALL) 210 if (info->end == TLB_FLUSH_ALL)
272 trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); 211 trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
273 else 212 else
274 trace_tlb_flush(TLB_REMOTE_SEND_IPI, 213 trace_tlb_flush(TLB_REMOTE_SEND_IPI,
275 (end - start) >> PAGE_SHIFT); 214 (info->end - info->start) >> PAGE_SHIFT);
276 215
277 if (is_uv_system()) { 216 if (is_uv_system()) {
278 unsigned int cpu; 217 unsigned int cpu;
279 218
280 cpu = smp_processor_id(); 219 cpu = smp_processor_id();
281 cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu); 220 cpumask = uv_flush_tlb_others(cpumask, info);
282 if (cpumask) 221 if (cpumask)
283 smp_call_function_many(cpumask, flush_tlb_func, 222 smp_call_function_many(cpumask, flush_tlb_func_remote,
284 &info, 1); 223 (void *)info, 1);
285 return; 224 return;
286 } 225 }
287 smp_call_function_many(cpumask, flush_tlb_func, &info, 1); 226 smp_call_function_many(cpumask, flush_tlb_func_remote,
227 (void *)info, 1);
288} 228}
289 229
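Editor's note: the hunk above also splits the old flush_tlb_func() into a shared body, flush_tlb_func_common(), with thin local and remote entry points, so the same range walk runs whether a flush is requested on the current CPU or arrives via IPI. A minimal sketch of that wrapper pattern follows (the sketch_* names are invented for illustration; the real helpers also bump VM event counters, emit tracepoints, and fall back to leave_mm() in lazy TLB mode):

/*
 * Sketch of the common/local/remote split introduced above (illustrative
 * only, not the full kernel implementation).
 */
static void sketch_flush_common(const struct flush_tlb_info *f)
{
	unsigned long addr;

	if (f->end == TLB_FLUSH_ALL) {
		local_flush_tlb();		/* flush the whole TLB */
		return;
	}
	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
		__flush_tlb_single(addr);	/* one INVLPG per page */
}

static void sketch_flush_local(const struct flush_tlb_info *f)
{
	sketch_flush_common(f);			/* caller runs with IRQs disabled */
}

static void sketch_flush_remote(void *info)	/* smp_call_function_many() callback */
{
	sketch_flush_common(info);
}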
290/* 230/*
@@ -302,85 +242,41 @@ static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
302void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, 242void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
303 unsigned long end, unsigned long vmflag) 243 unsigned long end, unsigned long vmflag)
304{ 244{
305 unsigned long addr; 245 int cpu;
306 /* do a global flush by default */
307 unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
308
309 preempt_disable();
310 246
311 if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) 247 struct flush_tlb_info info = {
312 base_pages_to_flush = (end - start) >> PAGE_SHIFT; 248 .mm = mm,
313 if (base_pages_to_flush > tlb_single_page_flush_ceiling) 249 };
314 base_pages_to_flush = TLB_FLUSH_ALL;
315 250
316 if (current->active_mm != mm) { 251 cpu = get_cpu();
317 /* Synchronize with switch_mm. */
318 smp_mb();
319 252
320 goto out; 253 /* Synchronize with switch_mm. */
321 } 254 smp_mb();
322
323 if (!current->mm) {
324 leave_mm(smp_processor_id());
325 255
326 /* Synchronize with switch_mm. */ 256 /* Should we flush just the requested range? */
327 smp_mb(); 257 if ((end != TLB_FLUSH_ALL) &&
328 258 !(vmflag & VM_HUGETLB) &&
329 goto out; 259 ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
330 } 260 info.start = start;
331 261 info.end = end;
332 /*
333 * Both branches below are implicit full barriers (MOV to CR or
334 * INVLPG) that synchronize with switch_mm.
335 */
336 if (base_pages_to_flush == TLB_FLUSH_ALL) {
337 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
338 local_flush_tlb();
339 } else { 262 } else {
340 /* flush range by one by one 'invlpg' */ 263 info.start = 0UL;
341 for (addr = start; addr < end; addr += PAGE_SIZE) { 264 info.end = TLB_FLUSH_ALL;
342 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
343 __flush_tlb_single(addr);
344 }
345 }
346 trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
347out:
348 if (base_pages_to_flush == TLB_FLUSH_ALL) {
349 start = 0UL;
350 end = TLB_FLUSH_ALL;
351 } 265 }
352 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
353 flush_tlb_others(mm_cpumask(mm), mm, start, end);
354 preempt_enable();
355}
356 266
357void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) 267 if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
358{ 268 VM_WARN_ON(irqs_disabled());
359 struct mm_struct *mm = vma->vm_mm; 269 local_irq_disable();
360 270 flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
361 preempt_disable(); 271 local_irq_enable();
362
363 if (current->active_mm == mm) {
364 if (current->mm) {
365 /*
366 * Implicit full barrier (INVLPG) that synchronizes
367 * with switch_mm.
368 */
369 __flush_tlb_one(start);
370 } else {
371 leave_mm(smp_processor_id());
372
373 /* Synchronize with switch_mm. */
374 smp_mb();
375 }
376 } 272 }
377 273
378 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) 274 if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
379 flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE); 275 flush_tlb_others(mm_cpumask(mm), &info);
380 276 put_cpu();
381 preempt_enable();
382} 277}
383 278
279
384static void do_flush_tlb_all(void *info) 280static void do_flush_tlb_all(void *info)
385{ 281{
386 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); 282 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
@@ -401,7 +297,7 @@ static void do_kernel_range_flush(void *info)
401 unsigned long addr; 297 unsigned long addr;
402 298
403 /* flush range by one by one 'invlpg' */ 299 /* flush range by one by one 'invlpg' */
404 for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE) 300 for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
405 __flush_tlb_single(addr); 301 __flush_tlb_single(addr);
406} 302}
407 303
@@ -410,16 +306,40 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
410 306
411 /* Balance as user space task's flush, a bit conservative */ 307 /* Balance as user space task's flush, a bit conservative */
412 if (end == TLB_FLUSH_ALL || 308 if (end == TLB_FLUSH_ALL ||
413 (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) { 309 (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
414 on_each_cpu(do_flush_tlb_all, NULL, 1); 310 on_each_cpu(do_flush_tlb_all, NULL, 1);
415 } else { 311 } else {
416 struct flush_tlb_info info; 312 struct flush_tlb_info info;
417 info.flush_start = start; 313 info.start = start;
418 info.flush_end = end; 314 info.end = end;
419 on_each_cpu(do_kernel_range_flush, &info, 1); 315 on_each_cpu(do_kernel_range_flush, &info, 1);
420 } 316 }
421} 317}
422 318
319void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
320{
321 struct flush_tlb_info info = {
322 .mm = NULL,
323 .start = 0UL,
324 .end = TLB_FLUSH_ALL,
325 };
326
327 int cpu = get_cpu();
328
329 if (cpumask_test_cpu(cpu, &batch->cpumask)) {
330 VM_WARN_ON(irqs_disabled());
331 local_irq_disable();
332 flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
333 local_irq_enable();
334 }
335
336 if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
337 flush_tlb_others(&batch->cpumask, &info);
338 cpumask_clear(&batch->cpumask);
339
340 put_cpu();
341}
342
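Editor's note: the new arch_tlbbatch_flush() above is the x86 side of the batched-unmap TLB flush. The caller accumulates, in batch->cpumask, the CPUs that may still cache translations for the pages being unmapped, then pays for one flush round at the end. A hypothetical caller could look like the sketch below (sketch_unmap_range() and its body are invented; only the batch structure and arch_tlbbatch_flush() come from this series):

/*
 * Hypothetical user of the batched-unmap flush hook (sketch only; in the
 * kernel the real caller is the generic page-reclaim code).
 */
static void sketch_unmap_range(struct mm_struct *mm)
{
	struct arch_tlbflush_unmap_batch batch;

	cpumask_clear(&batch.cpumask);

	/* ... clear the PTEs, noting which CPUs might have them cached ... */
	cpumask_or(&batch.cpumask, &batch.cpumask, mm_cpumask(mm));

	/* One IPI round for the whole batch instead of one per page. */
	arch_tlbbatch_flush(&batch);
}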
423static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, 343static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
424 size_t count, loff_t *ppos) 344 size_t count, loff_t *ppos)
425{ 345{
@@ -465,5 +385,3 @@ static int __init create_tlb_single_page_flush_ceiling(void)
465 return 0; 385 return 0;
466} 386}
467late_initcall(create_tlb_single_page_flush_ceiling); 387late_initcall(create_tlb_single_page_flush_ceiling);
468
469#endif /* CONFIG_SMP */
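Editor's note: stepping back from the tlb.c changes as a whole, every flush request is now described once by a struct flush_tlb_info (an mm plus a start/end range, with TLB_FLUSH_ALL meaning everything), and that single descriptor is handed both to the local flush helper and to flush_tlb_others(). A condensed sketch of the resulting flush_tlb_mm_range() flow, leaving out the tlb_single_page_flush_ceiling heuristic and the statistics shown in the hunks above:

/*
 * Condensed sketch of the new flush_tlb_mm_range() flow (illustrative;
 * see the hunks above for the complete version).
 */
static void sketch_flush_mm_range(struct mm_struct *mm,
				  unsigned long start, unsigned long end)
{
	struct flush_tlb_info info = {
		.mm    = mm,
		.start = start,
		.end   = end,		/* or TLB_FLUSH_ALL for a full flush */
	};
	int cpu = get_cpu();		/* stay on this CPU, like the real code */

	/* Flush locally only if this mm is actually loaded here. */
	if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
		local_irq_disable();
		flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
		local_irq_enable();
	}

	/* The very same descriptor is then passed on to the other CPUs. */
	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), &info);

	put_cpu();
}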
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 8ff1f95627f9..9bf72f5bfedb 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -80,7 +80,7 @@ pgd_t * __init efi_call_phys_prolog(void)
80 int n_pgds, i, j; 80 int n_pgds, i, j;
81 81
82 if (!efi_enabled(EFI_OLD_MEMMAP)) { 82 if (!efi_enabled(EFI_OLD_MEMMAP)) {
83 save_pgd = (pgd_t *)read_cr3(); 83 save_pgd = (pgd_t *)__read_cr3();
84 write_cr3((unsigned long)efi_scratch.efi_pgt); 84 write_cr3((unsigned long)efi_scratch.efi_pgt);
85 goto out; 85 goto out;
86 } 86 }
@@ -649,7 +649,7 @@ efi_status_t efi_thunk_set_virtual_address_map(
649 efi_sync_low_kernel_mappings(); 649 efi_sync_low_kernel_mappings();
650 local_irq_save(flags); 650 local_irq_save(flags);
651 651
652 efi_scratch.prev_cr3 = read_cr3(); 652 efi_scratch.prev_cr3 = __read_cr3();
653 write_cr3((unsigned long)efi_scratch.efi_pgt); 653 write_cr3((unsigned long)efi_scratch.efi_pgt);
654 __flush_tlb_all(); 654 __flush_tlb_all();
655 655
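Editor's note: the efi_64.c hunks above are part of a mechanical change that runs through the rest of this diff: read_cr3() is replaced by __read_cr3() where the raw register value (which may now carry PCID bits) is wanted, and by read_cr3_pa() where only the physical address of the page-table root is needed. Roughly, and only as a sketch of the helpers' intent rather than a copy of the new headers:

/*
 * Rough model of the new CR3 accessors (sketch; the real definitions live
 * in the x86 headers touched earlier in this series).
 */
static inline unsigned long sketch_read_cr3_pa(void)
{
	/* Mask off PCID/flag bits so only the pgd's physical address remains. */
	return __read_cr3() & CR3_ADDR_MASK;
}

static pgd_t *sketch_current_pgd(void)
{
	/* What callers such as the olpc and hibernate code below now effectively do. */
	return (pgd_t *)__va(sketch_read_cr3_pa());
}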
diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c
index c5350fd27d70..0668aaff8bfe 100644
--- a/arch/x86/platform/olpc/olpc-xo1-pm.c
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -77,7 +77,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state)
77 77
78asmlinkage __visible int xo1_do_sleep(u8 sleep_state) 78asmlinkage __visible int xo1_do_sleep(u8 sleep_state)
79{ 79{
80 void *pgd_addr = __va(read_cr3()); 80 void *pgd_addr = __va(read_cr3_pa());
81 81
82 /* Program wakeup mask (using dword access to CS5536_PM1_EN) */ 82 /* Program wakeup mask (using dword access to CS5536_PM1_EN) */
83 outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS); 83 outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS);
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 795671593528..2983faab5b18 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1123,11 +1123,9 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
1123 * done. The returned pointer is valid till preemption is re-enabled. 1123 * done. The returned pointer is valid till preemption is re-enabled.
1124 */ 1124 */
1125const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, 1125const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
1126 struct mm_struct *mm, 1126 const struct flush_tlb_info *info)
1127 unsigned long start,
1128 unsigned long end,
1129 unsigned int cpu)
1130{ 1127{
1128 unsigned int cpu = smp_processor_id();
1131 int locals = 0, remotes = 0, hubs = 0; 1129 int locals = 0, remotes = 0, hubs = 0;
1132 struct bau_desc *bau_desc; 1130 struct bau_desc *bau_desc;
1133 struct cpumask *flush_mask; 1131 struct cpumask *flush_mask;
@@ -1181,8 +1179,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
1181 1179
1182 record_send_statistics(stat, locals, hubs, remotes, bau_desc); 1180 record_send_statistics(stat, locals, hubs, remotes, bau_desc);
1183 1181
1184 if (!end || (end - start) <= PAGE_SIZE) 1182 if (!info->end || (info->end - info->start) <= PAGE_SIZE)
1185 address = start; 1183 address = info->start;
1186 else 1184 else
1187 address = TLB_FLUSH_ALL; 1185 address = TLB_FLUSH_ALL;
1188 1186
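Editor's note: with the uv_flush_tlb_others() change above, every flush backend — native_flush_tlb_others(), the UV BAU path here, and xen_flush_tlb_others() further down — shares one calling convention: a CPU mask plus a pointer to the caller's flush_tlb_info. A hypothetical backend following that convention could be as small as the sketch below (the sketch_send_* helpers are invented placeholders; only the signature and the single-page check mirror this diff):

/*
 * Hypothetical flush_tlb_others() backend using the new convention
 * (sketch only; sketch_send_* are not real kernel APIs).
 */
void sketch_send_invlpg(const struct cpumask *cpumask, unsigned long addr);
void sketch_send_full_flush(const struct cpumask *cpumask);

static void sketch_flush_tlb_others(const struct cpumask *cpumask,
				    const struct flush_tlb_info *info)
{
	if (info->end != TLB_FLUSH_ALL &&
	    (info->end - info->start) <= PAGE_SIZE) {
		/* A single page: ask the remote CPUs for a targeted INVLPG. */
		sketch_send_invlpg(cpumask, info->start);
	} else {
		/* Anything larger: request a full TLB flush instead. */
		sketch_send_full_flush(cpumask);
	}
}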
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 6b05a9219ea2..78459a6d455a 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -129,7 +129,7 @@ static void __save_processor_state(struct saved_context *ctxt)
129 */ 129 */
130 ctxt->cr0 = read_cr0(); 130 ctxt->cr0 = read_cr0();
131 ctxt->cr2 = read_cr2(); 131 ctxt->cr2 = read_cr2();
132 ctxt->cr3 = read_cr3(); 132 ctxt->cr3 = __read_cr3();
133 ctxt->cr4 = __read_cr4(); 133 ctxt->cr4 = __read_cr4();
134#ifdef CONFIG_X86_64 134#ifdef CONFIG_X86_64
135 ctxt->cr8 = read_cr8(); 135 ctxt->cr8 = read_cr8();
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index a6e21fee22ea..e3e62c8a8e70 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -150,7 +150,8 @@ static int relocate_restore_code(void)
150 memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE); 150 memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE);
151 151
152 /* Make the page containing the relocated code executable */ 152 /* Make the page containing the relocated code executable */
153 pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); 153 pgd = (pgd_t *)__va(read_cr3_pa()) +
154 pgd_index(relocated_restore_code);
154 p4d = p4d_offset(pgd, relocated_restore_code); 155 p4d = p4d_offset(pgd, relocated_restore_code);
155 if (p4d_large(*p4d)) { 156 if (p4d_large(*p4d)) {
156 set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); 157 set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index a163a90af4aa..cd4be19c36dc 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -102,7 +102,7 @@ static void __init setup_real_mode(void)
102 102
103 trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); 103 trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
104 trampoline_pgd[0] = trampoline_pgd_entry.pgd; 104 trampoline_pgd[0] = trampoline_pgd_entry.pgd;
105 trampoline_pgd[511] = init_level4_pgt[511].pgd; 105 trampoline_pgd[511] = init_top_pgt[511].pgd;
106#endif 106#endif
107} 107}
108 108
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 1f386d7fdf70..1d7a7213a310 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -975,37 +975,32 @@ static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
975 spin_unlock(&mm->page_table_lock); 975 spin_unlock(&mm->page_table_lock);
976} 976}
977 977
978 978static void drop_mm_ref_this_cpu(void *info)
979#ifdef CONFIG_SMP
980/* Another cpu may still have their %cr3 pointing at the pagetable, so
981 we need to repoint it somewhere else before we can unpin it. */
982static void drop_other_mm_ref(void *info)
983{ 979{
984 struct mm_struct *mm = info; 980 struct mm_struct *mm = info;
985 struct mm_struct *active_mm;
986
987 active_mm = this_cpu_read(cpu_tlbstate.active_mm);
988 981
989 if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) 982 if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm)
990 leave_mm(smp_processor_id()); 983 leave_mm(smp_processor_id());
991 984
992 /* If this cpu still has a stale cr3 reference, then make sure 985 /*
993 it has been flushed. */ 986 * If this cpu still has a stale cr3 reference, then make sure
987 * it has been flushed.
988 */
994 if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd)) 989 if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
995 load_cr3(swapper_pg_dir); 990 xen_mc_flush();
996} 991}
997 992
993#ifdef CONFIG_SMP
994/*
995 * Another cpu may still have their %cr3 pointing at the pagetable, so
996 * we need to repoint it somewhere else before we can unpin it.
997 */
998static void xen_drop_mm_ref(struct mm_struct *mm) 998static void xen_drop_mm_ref(struct mm_struct *mm)
999{ 999{
1000 cpumask_var_t mask; 1000 cpumask_var_t mask;
1001 unsigned cpu; 1001 unsigned cpu;
1002 1002
1003 if (current->active_mm == mm) { 1003 drop_mm_ref_this_cpu(mm);
1004 if (current->mm == mm)
1005 load_cr3(swapper_pg_dir);
1006 else
1007 leave_mm(smp_processor_id());
1008 }
1009 1004
1010 /* Get the "official" set of cpus referring to our pagetable. */ 1005 /* Get the "official" set of cpus referring to our pagetable. */
1011 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { 1006 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
@@ -1013,31 +1008,31 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
1013 if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) 1008 if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1014 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) 1009 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1015 continue; 1010 continue;
1016 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); 1011 smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1);
1017 } 1012 }
1018 return; 1013 return;
1019 } 1014 }
1020 cpumask_copy(mask, mm_cpumask(mm)); 1015 cpumask_copy(mask, mm_cpumask(mm));
1021 1016
1022 /* It's possible that a vcpu may have a stale reference to our 1017 /*
1023 cr3, because it's in lazy mode, and it hasn't yet flushed 1018 * cr3, because it's in lazy mode, and it hasn't yet flushed
1024 its set of pending hypercalls yet. In this case, we can 1019 * cr3, because its in lazy mode, and it hasn't yet flushed
1025 look at its actual current cr3 value, and force it to flush 1020 * its set of pending hypercalls yet. In this case, we can
1026 if needed. */ 1021 * look at its actual current cr3 value, and force it to flush
1022 * if needed.
1023 */
1027 for_each_online_cpu(cpu) { 1024 for_each_online_cpu(cpu) {
1028 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) 1025 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1029 cpumask_set_cpu(cpu, mask); 1026 cpumask_set_cpu(cpu, mask);
1030 } 1027 }
1031 1028
1032 if (!cpumask_empty(mask)) 1029 smp_call_function_many(mask, drop_mm_ref_this_cpu, mm, 1);
1033 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1034 free_cpumask_var(mask); 1030 free_cpumask_var(mask);
1035} 1031}
1036#else 1032#else
1037static void xen_drop_mm_ref(struct mm_struct *mm) 1033static void xen_drop_mm_ref(struct mm_struct *mm)
1038{ 1034{
1039 if (current->active_mm == mm) 1035 drop_mm_ref_this_cpu(mm);
1040 load_cr3(swapper_pg_dir);
1041} 1036}
1042#endif 1037#endif
1043 1038
@@ -1366,8 +1361,7 @@ static void xen_flush_tlb_single(unsigned long addr)
1366} 1361}
1367 1362
1368static void xen_flush_tlb_others(const struct cpumask *cpus, 1363static void xen_flush_tlb_others(const struct cpumask *cpus,
1369 struct mm_struct *mm, unsigned long start, 1364 const struct flush_tlb_info *info)
1370 unsigned long end)
1371{ 1365{
1372 struct { 1366 struct {
1373 struct mmuext_op op; 1367 struct mmuext_op op;
@@ -1379,7 +1373,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
1379 } *args; 1373 } *args;
1380 struct multicall_space mcs; 1374 struct multicall_space mcs;
1381 1375
1382 trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); 1376 trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end);
1383 1377
1384 if (cpumask_empty(cpus)) 1378 if (cpumask_empty(cpus))
1385 return; /* nothing to do */ 1379 return; /* nothing to do */
@@ -1393,9 +1387,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
1393 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); 1387 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1394 1388
1395 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; 1389 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1396 if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { 1390 if (info->end != TLB_FLUSH_ALL &&
1391 (info->end - info->start) <= PAGE_SIZE) {
1397 args->op.cmd = MMUEXT_INVLPG_MULTI; 1392 args->op.cmd = MMUEXT_INVLPG_MULTI;
1398 args->op.arg1.linear_addr = start; 1393 args->op.arg1.linear_addr = info->start;
1399 } 1394 }
1400 1395
1401 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); 1396 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
@@ -1470,8 +1465,8 @@ static void xen_write_cr3(unsigned long cr3)
1470 * At the start of the day - when Xen launches a guest, it has already 1465 * At the start of the day - when Xen launches a guest, it has already
1471 * built pagetables for the guest. We diligently look over them 1466 * built pagetables for the guest. We diligently look over them
1472 * in xen_setup_kernel_pagetable and graft them, as appropriate, into the 1467 * in xen_setup_kernel_pagetable and graft them, as appropriate, into the
1473 * init_level4_pgt and its friends. Then when we are happy we load 1468 * init_top_pgt and its friends. Then when we are happy we load
1474 * the new init_level4_pgt - and continue on. 1469 * the new init_top_pgt - and continue on.
1475 * 1470 *
1476 * The generic code starts (start_kernel) and 'init_mem_mapping' sets 1471 * The generic code starts (start_kernel) and 'init_mem_mapping' sets
1477 * up the rest of the pagetables. When it has completed it loads the cr3. 1472 * up the rest of the pagetables. When it has completed it loads the cr3.
@@ -1914,12 +1909,12 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1914 pt_end = pt_base + xen_start_info->nr_pt_frames; 1909 pt_end = pt_base + xen_start_info->nr_pt_frames;
1915 1910
1916 /* Zap identity mapping */ 1911 /* Zap identity mapping */
1917 init_level4_pgt[0] = __pgd(0); 1912 init_top_pgt[0] = __pgd(0);
1918 1913
1919 /* Pre-constructed entries are in pfn, so convert to mfn */ 1914 /* Pre-constructed entries are in pfn, so convert to mfn */
1920 /* L4[272] -> level3_ident_pgt */ 1915 /* L4[272] -> level3_ident_pgt */
1921 /* L4[511] -> level3_kernel_pgt */ 1916 /* L4[511] -> level3_kernel_pgt */
1922 convert_pfn_mfn(init_level4_pgt); 1917 convert_pfn_mfn(init_top_pgt);
1923 1918
1924 /* L3_i[0] -> level2_ident_pgt */ 1919 /* L3_i[0] -> level2_ident_pgt */
1925 convert_pfn_mfn(level3_ident_pgt); 1920 convert_pfn_mfn(level3_ident_pgt);
@@ -1950,10 +1945,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1950 /* Copy the initial P->M table mappings if necessary. */ 1945 /* Copy the initial P->M table mappings if necessary. */
1951 i = pgd_index(xen_start_info->mfn_list); 1946 i = pgd_index(xen_start_info->mfn_list);
1952 if (i && i < pgd_index(__START_KERNEL_map)) 1947 if (i && i < pgd_index(__START_KERNEL_map))
1953 init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; 1948 init_top_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
1954 1949
1955 /* Make pagetable pieces RO */ 1950 /* Make pagetable pieces RO */
1956 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); 1951 set_page_prot(init_top_pgt, PAGE_KERNEL_RO);
1957 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); 1952 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1958 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); 1953 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1959 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); 1954 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
@@ -1964,7 +1959,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1964 1959
1965 /* Pin down new L4 */ 1960 /* Pin down new L4 */
1966 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, 1961 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1967 PFN_DOWN(__pa_symbol(init_level4_pgt))); 1962 PFN_DOWN(__pa_symbol(init_top_pgt)));
1968 1963
1969 /* Unpin Xen-provided one */ 1964 /* Unpin Xen-provided one */
1970 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 1965 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
@@ -1974,7 +1969,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1974 * attach it to, so make sure we just set kernel pgd. 1969 * attach it to, so make sure we just set kernel pgd.
1975 */ 1970 */
1976 xen_mc_batch(); 1971 xen_mc_batch();
1977 __xen_write_cr3(true, __pa(init_level4_pgt)); 1972 __xen_write_cr3(true, __pa(init_top_pgt));
1978 xen_mc_issue(PARAVIRT_LAZY_CPU); 1973 xen_mc_issue(PARAVIRT_LAZY_CPU);
1979 1974
1980 /* We can't that easily rip out L3 and L2, as the Xen pagetables are 1975 /* We can't that easily rip out L3 and L2, as the Xen pagetables are
@@ -2022,7 +2017,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
2022 pmd_t pmd; 2017 pmd_t pmd;
2023 pte_t pte; 2018 pte_t pte;
2024 2019
2025 pa = read_cr3(); 2020 pa = read_cr3_pa();
2026 pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * 2021 pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
2027 sizeof(pgd))); 2022 sizeof(pgd)));
2028 if (!pgd_present(pgd)) 2023 if (!pgd_present(pgd))
@@ -2102,7 +2097,7 @@ void __init xen_relocate_p2m(void)
2102 pt_phys = pmd_phys + PFN_PHYS(n_pmd); 2097 pt_phys = pmd_phys + PFN_PHYS(n_pmd);
2103 p2m_pfn = PFN_DOWN(pt_phys) + n_pt; 2098 p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
2104 2099
2105 pgd = __va(read_cr3()); 2100 pgd = __va(read_cr3_pa());
2106 new_p2m = (unsigned long *)(2 * PGDIR_SIZE); 2101 new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
2107 idx_p4d = 0; 2102 idx_p4d = 0;
2108 save_pud = n_pud; 2103 save_pud = n_pud;
@@ -2209,7 +2204,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
2209{ 2204{
2210 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); 2205 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
2211 2206
2212 BUG_ON(read_cr3() != __pa(initial_page_table)); 2207 BUG_ON(read_cr3_pa() != __pa(initial_page_table));
2213 BUG_ON(cr3 != __pa(swapper_pg_dir)); 2208 BUG_ON(cr3 != __pa(swapper_pg_dir));
2214 2209
2215 /* 2210 /*
diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S
index 5e246716d58f..e1a5fbeae08d 100644
--- a/arch/x86/xen/xen-pvh.S
+++ b/arch/x86/xen/xen-pvh.S
@@ -87,7 +87,7 @@ ENTRY(pvh_start_xen)
87 wrmsr 87 wrmsr
88 88
89 /* Enable pre-constructed page tables. */ 89 /* Enable pre-constructed page tables. */
90 mov $_pa(init_level4_pgt), %eax 90 mov $_pa(init_top_pgt), %eax
91 mov %eax, %cr3 91 mov %eax, %cr3
92 mov $(X86_CR0_PG | X86_CR0_PE), %eax 92 mov $(X86_CR0_PG | X86_CR0_PE), %eax
93 mov %eax, %cr0 93 mov %eax, %cr0