author	Linus Torvalds <torvalds@linux-foundation.org>	2018-12-26 21:08:18 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-12-26 21:08:18 -0500
commit	e57d9f638af9673f38d9f09de66fa0a28303127d (patch)
tree	1948825bba57b4563ef0a5e7dd7e90634441b66e
parent	d6e867a6ae13bc02cd01c535764e5b051d26cf28 (diff)
parent	6848ac7ca39a226ede5df7af0efcc4ef0611e99c (diff)
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
 "The main changes in this cycle were:

   - Update and clean up x86 fault handling, by Andy Lutomirski.

   - Drop usage of __flush_tlb_all() in kernel_physical_mapping_init()
     and related fallout, by Dan Williams.

   - CPA cleanups and reorganization by Peter Zijlstra: simplify the
     flow and remove a few warts.

   - Other misc cleanups"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (29 commits)
  x86/mm/dump_pagetables: Use DEFINE_SHOW_ATTRIBUTE()
  x86/mm/cpa: Rename @addrinarray to @numpages
  x86/mm/cpa: Better use CLFLUSHOPT
  x86/mm/cpa: Fold cpa_flush_range() and cpa_flush_array() into a single cpa_flush() function
  x86/mm/cpa: Make cpa_data::numpages invariant
  x86/mm/cpa: Optimize cpa_flush_array() TLB invalidation
  x86/mm/cpa: Simplify the code after making cpa->vaddr invariant
  x86/mm/cpa: Make cpa_data::vaddr invariant
  x86/mm/cpa: Add __cpa_addr() helper
  x86/mm/cpa: Add ARRAY and PAGES_ARRAY selftests
  x86/mm: Drop usage of __flush_tlb_all() in kernel_physical_mapping_init()
  x86/mm: Validate kernel_physical_mapping_init() PTE population
  generic/pgtable: Introduce set_pte_safe()
  generic/pgtable: Introduce {p4d,pgd}_same()
  generic/pgtable: Make {pmd, pud}_same() unconditionally available
  x86/fault: Clean up the page fault oops decoder a bit
  x86/fault: Decode page fault OOPSes better
  x86/vsyscall/64: Use X86_PF constants in the simulated #PF error code
  x86/oops: Show the correct CS value in show_regs()
  x86/fault: Don't try to recover from an implicit supervisor access
  ...
-rw-r--r--	arch/x86/entry/vsyscall/vsyscall_64.c	2
-rw-r--r--	arch/x86/include/asm/disabled-features.h	8
-rw-r--r--	arch/x86/include/asm/pgalloc.h	27
-rw-r--r--	arch/x86/kernel/process_64.c	5
-rw-r--r--	arch/x86/mm/debug_pagetables.c	58
-rw-r--r--	arch/x86/mm/fault.c	244
-rw-r--r--	arch/x86/mm/init_64.c	30
-rw-r--r--	arch/x86/mm/mm_internal.h	2
-rw-r--r--	arch/x86/mm/pageattr-test.c	31
-rw-r--r--	arch/x86/mm/pageattr.c	271
-rw-r--r--	arch/x86/mm/tlb.c	4
-rw-r--r--	include/asm-generic/5level-fixup.h	1
-rw-r--r--	include/asm-generic/pgtable-nop4d-hack.h	1
-rw-r--r--	include/asm-generic/pgtable-nop4d.h	1
-rw-r--r--	include/asm-generic/pgtable-nopud.h	1
-rw-r--r--	include/asm-generic/pgtable.h	56
16 files changed, 396 insertions, 346 deletions
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 85fd85d52ffd..d78bcc03e60e 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -102,7 +102,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
102 if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { 102 if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
103 struct thread_struct *thread = &current->thread; 103 struct thread_struct *thread = &current->thread;
104 104
105 thread->error_code = 6; /* user fault, no page, write */ 105 thread->error_code = X86_PF_USER | X86_PF_WRITE;
106 thread->cr2 = ptr; 106 thread->cr2 = ptr;
107 thread->trap_nr = X86_TRAP_PF; 107 thread->trap_nr = X86_TRAP_PF;
108 108
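
For reference, the X86_PF_* page-fault error-code bits used above are the hardware-defined bits from arch/x86/include/asm/traps.h; the magic constant 6 that this hunk replaces is exactly X86_PF_USER | X86_PF_WRITE:

	/* #PF error code bits as defined by the CPU (arch/x86/include/asm/traps.h): */
	enum x86_pf_error_code {
		X86_PF_PROT	= 1 << 0,	/* fault on a present page (protection violation) */
		X86_PF_WRITE	= 1 << 1,	/* access was a write */
		X86_PF_USER	= 1 << 2,	/* access originated in user mode */
		X86_PF_RSVD	= 1 << 3,	/* reserved bit set in a paging structure */
		X86_PF_INSTR	= 1 << 4,	/* fault was an instruction fetch */
		X86_PF_PK	= 1 << 5,	/* protection-keys violation */
	};
	/* X86_PF_USER | X86_PF_WRITE == 4 | 2 == 6, the literal the old code used. */
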
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 33833d1909af..a5ea841cc6d2 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -16,6 +16,12 @@
16# define DISABLE_MPX (1<<(X86_FEATURE_MPX & 31)) 16# define DISABLE_MPX (1<<(X86_FEATURE_MPX & 31))
17#endif 17#endif
18 18
19#ifdef CONFIG_X86_SMAP
20# define DISABLE_SMAP 0
21#else
22# define DISABLE_SMAP (1<<(X86_FEATURE_SMAP & 31))
23#endif
24
19#ifdef CONFIG_X86_INTEL_UMIP 25#ifdef CONFIG_X86_INTEL_UMIP
20# define DISABLE_UMIP 0 26# define DISABLE_UMIP 0
21#else 27#else
@@ -68,7 +74,7 @@
68#define DISABLED_MASK6 0 74#define DISABLED_MASK6 0
69#define DISABLED_MASK7 (DISABLE_PTI) 75#define DISABLED_MASK7 (DISABLE_PTI)
70#define DISABLED_MASK8 0 76#define DISABLED_MASK8 0
71#define DISABLED_MASK9 (DISABLE_MPX) 77#define DISABLED_MASK9 (DISABLE_MPX|DISABLE_SMAP)
72#define DISABLED_MASK10 0 78#define DISABLED_MASK10 0
73#define DISABLED_MASK11 0 79#define DISABLED_MASK11 0
74#define DISABLED_MASK12 0 80#define DISABLED_MASK12 0
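
The point of adding DISABLE_SMAP to DISABLED_MASK9 is that cpu_feature_enabled(X86_FEATURE_SMAP) can then constant-fold to 0 when CONFIG_X86_SMAP is off, which is what lets the fault-handling rework below drop the explicit IS_ENABLED() test from smap_violation(). A rough sketch of the consumer side (from memory of arch/x86/include/asm/cpufeature.h, not copied from this diff):

	/* Roughly: a feature bit present in the build-time DISABLED_MASK makes the
	 * whole expression a compile-time 0, so the SMAP branch is optimized away. */
	#define cpu_feature_enabled(bit)					\
		(__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ?	\
			0 : static_cpu_has(bit))
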
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index ec7f43327033..1ea41aaef68b 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -80,6 +80,13 @@ static inline void pmd_populate_kernel(struct mm_struct *mm,
80 set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); 80 set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
81} 81}
82 82
83static inline void pmd_populate_kernel_safe(struct mm_struct *mm,
84 pmd_t *pmd, pte_t *pte)
85{
86 paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
87 set_pmd_safe(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
88}
89
83static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, 90static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
84 struct page *pte) 91 struct page *pte)
85{ 92{
@@ -132,6 +139,12 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
132 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); 139 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
133 set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))); 140 set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
134} 141}
142
143static inline void pud_populate_safe(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
144{
145 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
146 set_pud_safe(pud, __pud(_PAGE_TABLE | __pa(pmd)));
147}
135#endif /* CONFIG_X86_PAE */ 148#endif /* CONFIG_X86_PAE */
136 149
137#if CONFIG_PGTABLE_LEVELS > 3 150#if CONFIG_PGTABLE_LEVELS > 3
@@ -141,6 +154,12 @@ static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
141 set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud))); 154 set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
142} 155}
143 156
157static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
158{
159 paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
160 set_p4d_safe(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
161}
162
144static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) 163static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
145{ 164{
146 gfp_t gfp = GFP_KERNEL_ACCOUNT; 165 gfp_t gfp = GFP_KERNEL_ACCOUNT;
@@ -173,6 +192,14 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
173 set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d))); 192 set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
174} 193}
175 194
195static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
196{
197 if (!pgtable_l5_enabled())
198 return;
199 paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
200 set_pgd_safe(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
201}
202
176static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr) 203static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
177{ 204{
178 gfp_t gfp = GFP_KERNEL_ACCOUNT; 205 gfp_t gfp = GFP_KERNEL_ACCOUNT;
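
The new *_populate_*_safe() variants rely on the set_*_safe() helpers added by the generic/pgtable patches in this series (the include/asm-generic/pgtable.h entry in the diffstat). The idea, sketched here for the PTE level, is to warn if an already-present entry would be silently changed to point somewhere else -- the situation that previously motivated the paranoid __flush_tlb_all() calls in kernel_physical_mapping_init():

	/* Sketch of the set_*_safe() idea from include/asm-generic/pgtable.h;
	 * the pmd/pud/p4d/pgd variants follow the same pattern: */
	#define set_pte_safe(ptep, pte) \
	({ \
		WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \
		set_pte(ptep, pte); \
	})
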
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 721d02bd2d0d..6a62f4af9fcf 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -68,7 +68,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
68 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; 68 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
69 unsigned long d0, d1, d2, d3, d6, d7; 69 unsigned long d0, d1, d2, d3, d6, d7;
70 unsigned int fsindex, gsindex; 70 unsigned int fsindex, gsindex;
71 unsigned int ds, cs, es; 71 unsigned int ds, es;
72 72
73 show_iret_regs(regs); 73 show_iret_regs(regs);
74 74
@@ -100,7 +100,6 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
100 } 100 }
101 101
102 asm("movl %%ds,%0" : "=r" (ds)); 102 asm("movl %%ds,%0" : "=r" (ds));
103 asm("movl %%cs,%0" : "=r" (cs));
104 asm("movl %%es,%0" : "=r" (es)); 103 asm("movl %%es,%0" : "=r" (es));
105 asm("movl %%fs,%0" : "=r" (fsindex)); 104 asm("movl %%fs,%0" : "=r" (fsindex));
106 asm("movl %%gs,%0" : "=r" (gsindex)); 105 asm("movl %%gs,%0" : "=r" (gsindex));
@@ -116,7 +115,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
116 115
117 printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 116 printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
118 fs, fsindex, gs, gsindex, shadowgs); 117 fs, fsindex, gs, gsindex, shadowgs);
119 printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, 118 printk(KERN_DEFAULT "CS: %04lx DS: %04x ES: %04x CR0: %016lx\n", regs->cs, ds,
120 es, cr0); 119 es, cr0);
121 printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, 120 printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
122 cr4); 121 cr4);
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index 225fe2f0bfec..cd84f067e41d 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -10,20 +10,9 @@ static int ptdump_show(struct seq_file *m, void *v)
10 return 0; 10 return 0;
11} 11}
12 12
13static int ptdump_open(struct inode *inode, struct file *filp) 13DEFINE_SHOW_ATTRIBUTE(ptdump);
14{
15 return single_open(filp, ptdump_show, NULL);
16}
17
18static const struct file_operations ptdump_fops = {
19 .owner = THIS_MODULE,
20 .open = ptdump_open,
21 .read = seq_read,
22 .llseek = seq_lseek,
23 .release = single_release,
24};
25 14
26static int ptdump_show_curknl(struct seq_file *m, void *v) 15static int ptdump_curknl_show(struct seq_file *m, void *v)
27{ 16{
28 if (current->mm->pgd) { 17 if (current->mm->pgd) {
29 down_read(&current->mm->mmap_sem); 18 down_read(&current->mm->mmap_sem);
@@ -33,23 +22,12 @@ static int ptdump_show_curknl(struct seq_file *m, void *v)
33 return 0; 22 return 0;
34} 23}
35 24
36static int ptdump_open_curknl(struct inode *inode, struct file *filp) 25DEFINE_SHOW_ATTRIBUTE(ptdump_curknl);
37{
38 return single_open(filp, ptdump_show_curknl, NULL);
39}
40
41static const struct file_operations ptdump_curknl_fops = {
42 .owner = THIS_MODULE,
43 .open = ptdump_open_curknl,
44 .read = seq_read,
45 .llseek = seq_lseek,
46 .release = single_release,
47};
48 26
49#ifdef CONFIG_PAGE_TABLE_ISOLATION 27#ifdef CONFIG_PAGE_TABLE_ISOLATION
50static struct dentry *pe_curusr; 28static struct dentry *pe_curusr;
51 29
52static int ptdump_show_curusr(struct seq_file *m, void *v) 30static int ptdump_curusr_show(struct seq_file *m, void *v)
53{ 31{
54 if (current->mm->pgd) { 32 if (current->mm->pgd) {
55 down_read(&current->mm->mmap_sem); 33 down_read(&current->mm->mmap_sem);
@@ -59,42 +37,20 @@ static int ptdump_show_curusr(struct seq_file *m, void *v)
59 return 0; 37 return 0;
60} 38}
61 39
62static int ptdump_open_curusr(struct inode *inode, struct file *filp) 40DEFINE_SHOW_ATTRIBUTE(ptdump_curusr);
63{
64 return single_open(filp, ptdump_show_curusr, NULL);
65}
66
67static const struct file_operations ptdump_curusr_fops = {
68 .owner = THIS_MODULE,
69 .open = ptdump_open_curusr,
70 .read = seq_read,
71 .llseek = seq_lseek,
72 .release = single_release,
73};
74#endif 41#endif
75 42
76#if defined(CONFIG_EFI) && defined(CONFIG_X86_64) 43#if defined(CONFIG_EFI) && defined(CONFIG_X86_64)
77static struct dentry *pe_efi; 44static struct dentry *pe_efi;
78 45
79static int ptdump_show_efi(struct seq_file *m, void *v) 46static int ptdump_efi_show(struct seq_file *m, void *v)
80{ 47{
81 if (efi_mm.pgd) 48 if (efi_mm.pgd)
82 ptdump_walk_pgd_level_debugfs(m, efi_mm.pgd, false); 49 ptdump_walk_pgd_level_debugfs(m, efi_mm.pgd, false);
83 return 0; 50 return 0;
84} 51}
85 52
86static int ptdump_open_efi(struct inode *inode, struct file *filp) 53DEFINE_SHOW_ATTRIBUTE(ptdump_efi);
87{
88 return single_open(filp, ptdump_show_efi, NULL);
89}
90
91static const struct file_operations ptdump_efi_fops = {
92 .owner = THIS_MODULE,
93 .open = ptdump_open_efi,
94 .read = seq_read,
95 .llseek = seq_lseek,
96 .release = single_release,
97};
98#endif 54#endif
99 55
100static struct dentry *dir, *pe_knl, *pe_curknl; 56static struct dentry *dir, *pe_knl, *pe_curknl;
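
For context, DEFINE_SHOW_ATTRIBUTE() comes from include/linux/seq_file.h and generates the boilerplate that each removed block above spelled out by hand: a <name>_open() wrapper around single_open() plus a <name>_fops structure. Approximately, DEFINE_SHOW_ATTRIBUTE(ptdump) expands to:

	/* Approximate expansion -- this is why only the *_show() functions
	 * need to remain in debug_pagetables.c: */
	static int ptdump_open(struct inode *inode, struct file *file)
	{
		return single_open(file, ptdump_show, inode->i_private);
	}

	static const struct file_operations ptdump_fops = {
		.owner		= THIS_MODULE,
		.open		= ptdump_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= single_release,
	};
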
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 71d4b9d4d43f..2ff25ad33233 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -27,6 +27,7 @@
27#include <asm/vm86.h> /* struct vm86 */ 27#include <asm/vm86.h> /* struct vm86 */
28#include <asm/mmu_context.h> /* vma_pkey() */ 28#include <asm/mmu_context.h> /* vma_pkey() */
29#include <asm/efi.h> /* efi_recover_from_page_fault()*/ 29#include <asm/efi.h> /* efi_recover_from_page_fault()*/
30#include <asm/desc.h> /* store_idt(), ... */
30 31
31#define CREATE_TRACE_POINTS 32#define CREATE_TRACE_POINTS
32#include <asm/trace/exceptions.h> 33#include <asm/trace/exceptions.h>
@@ -571,10 +572,55 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
571 return 0; 572 return 0;
572} 573}
573 574
575static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
576{
577 u32 offset = (index >> 3) * sizeof(struct desc_struct);
578 unsigned long addr;
579 struct ldttss_desc desc;
580
581 if (index == 0) {
582 pr_alert("%s: NULL\n", name);
583 return;
584 }
585
586 if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
587 pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
588 return;
589 }
590
591 if (probe_kernel_read(&desc, (void *)(gdt->address + offset),
592 sizeof(struct ldttss_desc))) {
593 pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
594 name, index);
595 return;
596 }
597
598 addr = desc.base0 | (desc.base1 << 16) | (desc.base2 << 24);
599#ifdef CONFIG_X86_64
600 addr |= ((u64)desc.base3 << 32);
601#endif
602 pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
603 name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
604}
605
606/*
607 * This helper function transforms the #PF error_code bits into
608 * "[PROT] [USER]" type of descriptive, almost human-readable error strings:
609 */
610static void err_str_append(unsigned long error_code, char *buf, unsigned long mask, const char *txt)
611{
612 if (error_code & mask) {
613 if (buf[0])
614 strcat(buf, " ");
615 strcat(buf, txt);
616 }
617}
618
574static void 619static void
575show_fault_oops(struct pt_regs *regs, unsigned long error_code, 620show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
576 unsigned long address)
577{ 621{
622 char err_txt[64];
623
578 if (!oops_may_print()) 624 if (!oops_may_print())
579 return; 625 return;
580 626
@@ -602,6 +648,52 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
602 address < PAGE_SIZE ? "NULL pointer dereference" : "paging request", 648 address < PAGE_SIZE ? "NULL pointer dereference" : "paging request",
603 (void *)address); 649 (void *)address);
604 650
651 err_txt[0] = 0;
652
653 /*
654 * Note: length of these appended strings including the separation space and the
655 * zero delimiter must fit into err_txt[].
656 */
657 err_str_append(error_code, err_txt, X86_PF_PROT, "[PROT]" );
658 err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]");
659 err_str_append(error_code, err_txt, X86_PF_USER, "[USER]" );
660 err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" );
661 err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]");
662 err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" );
663
664 pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]");
665
666 if (!(error_code & X86_PF_USER) && user_mode(regs)) {
667 struct desc_ptr idt, gdt;
668 u16 ldtr, tr;
669
670 pr_alert("This was a system access from user code\n");
671
672 /*
673 * This can happen for quite a few reasons. The more obvious
674 * ones are faults accessing the GDT, or LDT. Perhaps
675 * surprisingly, if the CPU tries to deliver a benign or
676 * contributory exception from user code and gets a page fault
677 * during delivery, the page fault can be delivered as though
678 * it originated directly from user code. This could happen
679 * due to wrong permissions on the IDT, GDT, LDT, TSS, or
680 * kernel or IST stack.
681 */
682 store_idt(&idt);
683
684 /* Usable even on Xen PV -- it's just slow. */
685 native_store_gdt(&gdt);
686
687 pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
688 idt.address, idt.size, gdt.address, gdt.size);
689
690 store_ldt(ldtr);
691 show_ldttss(&gdt, "LDTR", ldtr);
692
693 store_tr(tr);
694 show_ldttss(&gdt, "TR", tr);
695 }
696
605 dump_pagetable(address); 697 dump_pagetable(address);
606} 698}
607 699
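
To illustrate the decoder above (values are made up, not taken from a real oops): a user-mode write to a present page, error_code == X86_PF_PROT | X86_PF_WRITE | X86_PF_USER (0x7), would be reported as

	#PF error: [PROT] [WRITE] [USER]

while a plain not-present kernel read (error_code == 0) prints

	#PF error: [normal kernel read fault]
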
@@ -621,16 +713,30 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code,
621 tsk->comm, address); 713 tsk->comm, address);
622 dump_pagetable(address); 714 dump_pagetable(address);
623 715
624 tsk->thread.cr2 = address;
625 tsk->thread.trap_nr = X86_TRAP_PF;
626 tsk->thread.error_code = error_code;
627
628 if (__die("Bad pagetable", regs, error_code)) 716 if (__die("Bad pagetable", regs, error_code))
629 sig = 0; 717 sig = 0;
630 718
631 oops_end(flags, regs, sig); 719 oops_end(flags, regs, sig);
632} 720}
633 721
722static void set_signal_archinfo(unsigned long address,
723 unsigned long error_code)
724{
725 struct task_struct *tsk = current;
726
727 /*
728 * To avoid leaking information about the kernel page
729 * table layout, pretend that user-mode accesses to
730 * kernel addresses are always protection faults.
731 */
732 if (address >= TASK_SIZE_MAX)
733 error_code |= X86_PF_PROT;
734
735 tsk->thread.trap_nr = X86_TRAP_PF;
736 tsk->thread.error_code = error_code | X86_PF_USER;
737 tsk->thread.cr2 = address;
738}
739
634static noinline void 740static noinline void
635no_context(struct pt_regs *regs, unsigned long error_code, 741no_context(struct pt_regs *regs, unsigned long error_code,
636 unsigned long address, int signal, int si_code) 742 unsigned long address, int signal, int si_code)
@@ -639,6 +745,15 @@ no_context(struct pt_regs *regs, unsigned long error_code,
639 unsigned long flags; 745 unsigned long flags;
640 int sig; 746 int sig;
641 747
748 if (user_mode(regs)) {
749 /*
750 * This is an implicit supervisor-mode access from user
751 * mode. Bypass all the kernel-mode recovery code and just
752 * OOPS.
753 */
754 goto oops;
755 }
756
642 /* Are we prepared to handle this kernel fault? */ 757 /* Are we prepared to handle this kernel fault? */
643 if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) { 758 if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
644 /* 759 /*
@@ -656,9 +771,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
656 * faulting through the emulate_vsyscall() logic. 771 * faulting through the emulate_vsyscall() logic.
657 */ 772 */
658 if (current->thread.sig_on_uaccess_err && signal) { 773 if (current->thread.sig_on_uaccess_err && signal) {
659 tsk->thread.trap_nr = X86_TRAP_PF; 774 set_signal_archinfo(address, error_code);
660 tsk->thread.error_code = error_code | X86_PF_USER;
661 tsk->thread.cr2 = address;
662 775
663 /* XXX: hwpoison faults will set the wrong code. */ 776 /* XXX: hwpoison faults will set the wrong code. */
664 force_sig_fault(signal, si_code, (void __user *)address, 777 force_sig_fault(signal, si_code, (void __user *)address,
@@ -726,6 +839,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
726 if (IS_ENABLED(CONFIG_EFI)) 839 if (IS_ENABLED(CONFIG_EFI))
727 efi_recover_from_page_fault(address); 840 efi_recover_from_page_fault(address);
728 841
842oops:
729 /* 843 /*
730 * Oops. The kernel tried to access some bad page. We'll have to 844 * Oops. The kernel tried to access some bad page. We'll have to
731 * terminate things with extreme prejudice: 845 * terminate things with extreme prejudice:
@@ -737,10 +851,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
737 if (task_stack_end_corrupted(tsk)) 851 if (task_stack_end_corrupted(tsk))
738 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); 852 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
739 853
740 tsk->thread.cr2 = address;
741 tsk->thread.trap_nr = X86_TRAP_PF;
742 tsk->thread.error_code = error_code;
743
744 sig = SIGKILL; 854 sig = SIGKILL;
745 if (__die("Oops", regs, error_code)) 855 if (__die("Oops", regs, error_code))
746 sig = 0; 856 sig = 0;
@@ -794,7 +904,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
794 struct task_struct *tsk = current; 904 struct task_struct *tsk = current;
795 905
796 /* User mode accesses just cause a SIGSEGV */ 906 /* User mode accesses just cause a SIGSEGV */
797 if (error_code & X86_PF_USER) { 907 if (user_mode(regs) && (error_code & X86_PF_USER)) {
798 /* 908 /*
799 * It's possible to have interrupts off here: 909 * It's possible to have interrupts off here:
800 */ 910 */
@@ -821,9 +931,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
821 if (likely(show_unhandled_signals)) 931 if (likely(show_unhandled_signals))
822 show_signal_msg(regs, error_code, address, tsk); 932 show_signal_msg(regs, error_code, address, tsk);
823 933
824 tsk->thread.cr2 = address; 934 set_signal_archinfo(address, error_code);
825 tsk->thread.error_code = error_code;
826 tsk->thread.trap_nr = X86_TRAP_PF;
827 935
828 if (si_code == SEGV_PKUERR) 936 if (si_code == SEGV_PKUERR)
829 force_sig_pkuerr((void __user *)address, pkey); 937 force_sig_pkuerr((void __user *)address, pkey);
@@ -937,9 +1045,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
937 if (is_prefetch(regs, error_code, address)) 1045 if (is_prefetch(regs, error_code, address))
938 return; 1046 return;
939 1047
940 tsk->thread.cr2 = address; 1048 set_signal_archinfo(address, error_code);
941 tsk->thread.error_code = error_code;
942 tsk->thread.trap_nr = X86_TRAP_PF;
943 1049
944#ifdef CONFIG_MEMORY_FAILURE 1050#ifdef CONFIG_MEMORY_FAILURE
945 if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { 1051 if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
@@ -1148,23 +1254,6 @@ static int fault_in_kernel_space(unsigned long address)
1148 return address >= TASK_SIZE_MAX; 1254 return address >= TASK_SIZE_MAX;
1149} 1255}
1150 1256
1151static inline bool smap_violation(int error_code, struct pt_regs *regs)
1152{
1153 if (!IS_ENABLED(CONFIG_X86_SMAP))
1154 return false;
1155
1156 if (!static_cpu_has(X86_FEATURE_SMAP))
1157 return false;
1158
1159 if (error_code & X86_PF_USER)
1160 return false;
1161
1162 if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
1163 return false;
1164
1165 return true;
1166}
1167
1168/* 1257/*
1169 * Called for all faults where 'address' is part of the kernel address 1258 * Called for all faults where 'address' is part of the kernel address
1170 * space. Might get called for faults that originate from *code* that 1259 * space. Might get called for faults that originate from *code* that
@@ -1230,7 +1319,6 @@ void do_user_addr_fault(struct pt_regs *regs,
1230 unsigned long hw_error_code, 1319 unsigned long hw_error_code,
1231 unsigned long address) 1320 unsigned long address)
1232{ 1321{
1233 unsigned long sw_error_code;
1234 struct vm_area_struct *vma; 1322 struct vm_area_struct *vma;
1235 struct task_struct *tsk; 1323 struct task_struct *tsk;
1236 struct mm_struct *mm; 1324 struct mm_struct *mm;
@@ -1252,10 +1340,16 @@ void do_user_addr_fault(struct pt_regs *regs,
1252 pgtable_bad(regs, hw_error_code, address); 1340 pgtable_bad(regs, hw_error_code, address);
1253 1341
1254 /* 1342 /*
1255 * Check for invalid kernel (supervisor) access to user 1343 * If SMAP is on, check for invalid kernel (supervisor) access to user
1256 * pages in the user address space. 1344 * pages in the user address space. The odd case here is WRUSS,
1345 * which, according to the preliminary documentation, does not respect
1346 * SMAP and will have the USER bit set so, in all cases, SMAP
1347 * enforcement appears to be consistent with the USER bit.
1257 */ 1348 */
1258 if (unlikely(smap_violation(hw_error_code, regs))) { 1349 if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
1350 !(hw_error_code & X86_PF_USER) &&
1351 !(regs->flags & X86_EFLAGS_AC)))
1352 {
1259 bad_area_nosemaphore(regs, hw_error_code, address); 1353 bad_area_nosemaphore(regs, hw_error_code, address);
1260 return; 1354 return;
1261 } 1355 }
@@ -1270,13 +1364,6 @@ void do_user_addr_fault(struct pt_regs *regs,
1270 } 1364 }
1271 1365
1272 /* 1366 /*
1273 * hw_error_code is literally the "page fault error code" passed to
1274 * the kernel directly from the hardware. But, we will shortly be
1275 * modifying it in software, so give it a new name.
1276 */
1277 sw_error_code = hw_error_code;
1278
1279 /*
1280 * It's safe to allow irq's after cr2 has been saved and the 1367 * It's safe to allow irq's after cr2 has been saved and the
1281 * vmalloc fault has been handled. 1368 * vmalloc fault has been handled.
1282 * 1369 *
@@ -1285,26 +1372,6 @@ void do_user_addr_fault(struct pt_regs *regs,
1285 */ 1372 */
1286 if (user_mode(regs)) { 1373 if (user_mode(regs)) {
1287 local_irq_enable(); 1374 local_irq_enable();
1288 /*
1289 * Up to this point, X86_PF_USER set in hw_error_code
1290 * indicated a user-mode access. But, after this,
1291 * X86_PF_USER in sw_error_code will indicate either
1292 * that, *or* an implicit kernel(supervisor)-mode access
1293 * which originated from user mode.
1294 */
1295 if (!(hw_error_code & X86_PF_USER)) {
1296 /*
1297 * The CPU was in user mode, but the CPU says
1298 * the fault was not a user-mode access.
1299 * Must be an implicit kernel-mode access,
1300 * which we do not expect to happen in the
1301 * user address space.
1302 */
1303 pr_warn_once("kernel-mode error from user-mode: %lx\n",
1304 hw_error_code);
1305
1306 sw_error_code |= X86_PF_USER;
1307 }
1308 flags |= FAULT_FLAG_USER; 1375 flags |= FAULT_FLAG_USER;
1309 } else { 1376 } else {
1310 if (regs->flags & X86_EFLAGS_IF) 1377 if (regs->flags & X86_EFLAGS_IF)
@@ -1313,9 +1380,9 @@ void do_user_addr_fault(struct pt_regs *regs,
1313 1380
1314 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); 1381 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
1315 1382
1316 if (sw_error_code & X86_PF_WRITE) 1383 if (hw_error_code & X86_PF_WRITE)
1317 flags |= FAULT_FLAG_WRITE; 1384 flags |= FAULT_FLAG_WRITE;
1318 if (sw_error_code & X86_PF_INSTR) 1385 if (hw_error_code & X86_PF_INSTR)
1319 flags |= FAULT_FLAG_INSTRUCTION; 1386 flags |= FAULT_FLAG_INSTRUCTION;
1320 1387
1321#ifdef CONFIG_X86_64 1388#ifdef CONFIG_X86_64
@@ -1328,7 +1395,7 @@ void do_user_addr_fault(struct pt_regs *regs,
1328 * The vsyscall page does not have a "real" VMA, so do this 1395 * The vsyscall page does not have a "real" VMA, so do this
1329 * emulation before we go searching for VMAs. 1396 * emulation before we go searching for VMAs.
1330 */ 1397 */
1331 if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) { 1398 if ((hw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
1332 if (emulate_vsyscall(regs, address)) 1399 if (emulate_vsyscall(regs, address))
1333 return; 1400 return;
1334 } 1401 }
@@ -1344,18 +1411,15 @@ void do_user_addr_fault(struct pt_regs *regs,
1344 * Only do the expensive exception table search when we might be at 1411 * Only do the expensive exception table search when we might be at
1345 * risk of a deadlock. This happens if we 1412 * risk of a deadlock. This happens if we
1346 * 1. Failed to acquire mmap_sem, and 1413 * 1. Failed to acquire mmap_sem, and
1347 * 2. The access did not originate in userspace. Note: either the 1414 * 2. The access did not originate in userspace.
1348 * hardware or earlier page fault code may set X86_PF_USER
1349 * in sw_error_code.
1350 */ 1415 */
1351 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 1416 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
1352 if (!(sw_error_code & X86_PF_USER) && 1417 if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
1353 !search_exception_tables(regs->ip)) {
1354 /* 1418 /*
1355 * Fault from code in kernel from 1419 * Fault from code in kernel from
1356 * which we do not expect faults. 1420 * which we do not expect faults.
1357 */ 1421 */
1358 bad_area_nosemaphore(regs, sw_error_code, address); 1422 bad_area_nosemaphore(regs, hw_error_code, address);
1359 return; 1423 return;
1360 } 1424 }
1361retry: 1425retry:
@@ -1371,29 +1435,17 @@ retry:
1371 1435
1372 vma = find_vma(mm, address); 1436 vma = find_vma(mm, address);
1373 if (unlikely(!vma)) { 1437 if (unlikely(!vma)) {
1374 bad_area(regs, sw_error_code, address); 1438 bad_area(regs, hw_error_code, address);
1375 return; 1439 return;
1376 } 1440 }
1377 if (likely(vma->vm_start <= address)) 1441 if (likely(vma->vm_start <= address))
1378 goto good_area; 1442 goto good_area;
1379 if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { 1443 if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
1380 bad_area(regs, sw_error_code, address); 1444 bad_area(regs, hw_error_code, address);
1381 return; 1445 return;
1382 } 1446 }
1383 if (sw_error_code & X86_PF_USER) {
1384 /*
1385 * Accessing the stack below %sp is always a bug.
1386 * The large cushion allows instructions like enter
1387 * and pusha to work. ("enter $65535, $31" pushes
1388 * 32 pointers and then decrements %sp by 65535.)
1389 */
1390 if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
1391 bad_area(regs, sw_error_code, address);
1392 return;
1393 }
1394 }
1395 if (unlikely(expand_stack(vma, address))) { 1447 if (unlikely(expand_stack(vma, address))) {
1396 bad_area(regs, sw_error_code, address); 1448 bad_area(regs, hw_error_code, address);
1397 return; 1449 return;
1398 } 1450 }
1399 1451
@@ -1402,8 +1454,8 @@ retry:
1402 * we can handle it.. 1454 * we can handle it..
1403 */ 1455 */
1404good_area: 1456good_area:
1405 if (unlikely(access_error(sw_error_code, vma))) { 1457 if (unlikely(access_error(hw_error_code, vma))) {
1406 bad_area_access_error(regs, sw_error_code, address, vma); 1458 bad_area_access_error(regs, hw_error_code, address, vma);
1407 return; 1459 return;
1408 } 1460 }
1409 1461
@@ -1442,13 +1494,13 @@ good_area:
1442 return; 1494 return;
1443 1495
1444 /* Not returning to user mode? Handle exceptions or die: */ 1496 /* Not returning to user mode? Handle exceptions or die: */
1445 no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR); 1497 no_context(regs, hw_error_code, address, SIGBUS, BUS_ADRERR);
1446 return; 1498 return;
1447 } 1499 }
1448 1500
1449 up_read(&mm->mmap_sem); 1501 up_read(&mm->mmap_sem);
1450 if (unlikely(fault & VM_FAULT_ERROR)) { 1502 if (unlikely(fault & VM_FAULT_ERROR)) {
1451 mm_fault_error(regs, sw_error_code, address, fault); 1503 mm_fault_error(regs, hw_error_code, address, fault);
1452 return; 1504 return;
1453 } 1505 }
1454 1506
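
The recurring theme of the fault.c changes above is that the hardware X86_PF_USER bit and user_mode(regs) answer different questions, so the old sw_error_code munging can be removed. A condensed summary (mine, not a comment from the patch):

	/*
	 * error_code & X86_PF_USER -- the *access itself* was a user-mode access
	 * user_mode(regs)          -- the faulting *instruction* ran in user mode
	 *
	 * They disagree for "implicit supervisor" accesses from user code, e.g.
	 * the CPU touching the GDT/LDT/TSS or an IST stack while delivering an
	 * exception.  Such faults are no longer promoted to X86_PF_USER; they
	 * now go straight to the oops path in no_context().
	 */
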
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5fab264948c2..484c1b92f078 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -432,7 +432,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
432 E820_TYPE_RAM) && 432 E820_TYPE_RAM) &&
433 !e820__mapped_any(paddr & PAGE_MASK, paddr_next, 433 !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
434 E820_TYPE_RESERVED_KERN)) 434 E820_TYPE_RESERVED_KERN))
435 set_pte(pte, __pte(0)); 435 set_pte_safe(pte, __pte(0));
436 continue; 436 continue;
437 } 437 }
438 438
@@ -452,7 +452,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
452 pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr, 452 pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr,
453 pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte); 453 pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte);
454 pages++; 454 pages++;
455 set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); 455 set_pte_safe(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
456 paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE; 456 paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE;
457 } 457 }
458 458
@@ -487,7 +487,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
487 E820_TYPE_RAM) && 487 E820_TYPE_RAM) &&
488 !e820__mapped_any(paddr & PMD_MASK, paddr_next, 488 !e820__mapped_any(paddr & PMD_MASK, paddr_next,
489 E820_TYPE_RESERVED_KERN)) 489 E820_TYPE_RESERVED_KERN))
490 set_pmd(pmd, __pmd(0)); 490 set_pmd_safe(pmd, __pmd(0));
491 continue; 491 continue;
492 } 492 }
493 493
@@ -524,7 +524,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
524 if (page_size_mask & (1<<PG_LEVEL_2M)) { 524 if (page_size_mask & (1<<PG_LEVEL_2M)) {
525 pages++; 525 pages++;
526 spin_lock(&init_mm.page_table_lock); 526 spin_lock(&init_mm.page_table_lock);
527 set_pte((pte_t *)pmd, 527 set_pte_safe((pte_t *)pmd,
528 pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT, 528 pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
529 __pgprot(pgprot_val(prot) | _PAGE_PSE))); 529 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
530 spin_unlock(&init_mm.page_table_lock); 530 spin_unlock(&init_mm.page_table_lock);
@@ -536,7 +536,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
536 paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot); 536 paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot);
537 537
538 spin_lock(&init_mm.page_table_lock); 538 spin_lock(&init_mm.page_table_lock);
539 pmd_populate_kernel(&init_mm, pmd, pte); 539 pmd_populate_kernel_safe(&init_mm, pmd, pte);
540 spin_unlock(&init_mm.page_table_lock); 540 spin_unlock(&init_mm.page_table_lock);
541 } 541 }
542 update_page_count(PG_LEVEL_2M, pages); 542 update_page_count(PG_LEVEL_2M, pages);
@@ -573,7 +573,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
573 E820_TYPE_RAM) && 573 E820_TYPE_RAM) &&
574 !e820__mapped_any(paddr & PUD_MASK, paddr_next, 574 !e820__mapped_any(paddr & PUD_MASK, paddr_next,
575 E820_TYPE_RESERVED_KERN)) 575 E820_TYPE_RESERVED_KERN))
576 set_pud(pud, __pud(0)); 576 set_pud_safe(pud, __pud(0));
577 continue; 577 continue;
578 } 578 }
579 579
@@ -584,7 +584,6 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
584 paddr_end, 584 paddr_end,
585 page_size_mask, 585 page_size_mask,
586 prot); 586 prot);
587 __flush_tlb_all();
588 continue; 587 continue;
589 } 588 }
590 /* 589 /*
@@ -611,7 +610,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
611 if (page_size_mask & (1<<PG_LEVEL_1G)) { 610 if (page_size_mask & (1<<PG_LEVEL_1G)) {
612 pages++; 611 pages++;
613 spin_lock(&init_mm.page_table_lock); 612 spin_lock(&init_mm.page_table_lock);
614 set_pte((pte_t *)pud, 613 set_pte_safe((pte_t *)pud,
615 pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT, 614 pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT,
616 PAGE_KERNEL_LARGE)); 615 PAGE_KERNEL_LARGE));
617 spin_unlock(&init_mm.page_table_lock); 616 spin_unlock(&init_mm.page_table_lock);
@@ -624,10 +623,9 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
624 page_size_mask, prot); 623 page_size_mask, prot);
625 624
626 spin_lock(&init_mm.page_table_lock); 625 spin_lock(&init_mm.page_table_lock);
627 pud_populate(&init_mm, pud, pmd); 626 pud_populate_safe(&init_mm, pud, pmd);
628 spin_unlock(&init_mm.page_table_lock); 627 spin_unlock(&init_mm.page_table_lock);
629 } 628 }
630 __flush_tlb_all();
631 629
632 update_page_count(PG_LEVEL_1G, pages); 630 update_page_count(PG_LEVEL_1G, pages);
633 631
@@ -659,7 +657,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
659 E820_TYPE_RAM) && 657 E820_TYPE_RAM) &&
660 !e820__mapped_any(paddr & P4D_MASK, paddr_next, 658 !e820__mapped_any(paddr & P4D_MASK, paddr_next,
661 E820_TYPE_RESERVED_KERN)) 659 E820_TYPE_RESERVED_KERN))
662 set_p4d(p4d, __p4d(0)); 660 set_p4d_safe(p4d, __p4d(0));
663 continue; 661 continue;
664 } 662 }
665 663
@@ -668,7 +666,6 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
668 paddr_last = phys_pud_init(pud, paddr, 666 paddr_last = phys_pud_init(pud, paddr,
669 paddr_end, 667 paddr_end,
670 page_size_mask); 668 page_size_mask);
671 __flush_tlb_all();
672 continue; 669 continue;
673 } 670 }
674 671
@@ -677,10 +674,9 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
677 page_size_mask); 674 page_size_mask);
678 675
679 spin_lock(&init_mm.page_table_lock); 676 spin_lock(&init_mm.page_table_lock);
680 p4d_populate(&init_mm, p4d, pud); 677 p4d_populate_safe(&init_mm, p4d, pud);
681 spin_unlock(&init_mm.page_table_lock); 678 spin_unlock(&init_mm.page_table_lock);
682 } 679 }
683 __flush_tlb_all();
684 680
685 return paddr_last; 681 return paddr_last;
686} 682}
@@ -723,9 +719,9 @@ kernel_physical_mapping_init(unsigned long paddr_start,
723 719
724 spin_lock(&init_mm.page_table_lock); 720 spin_lock(&init_mm.page_table_lock);
725 if (pgtable_l5_enabled()) 721 if (pgtable_l5_enabled())
726 pgd_populate(&init_mm, pgd, p4d); 722 pgd_populate_safe(&init_mm, pgd, p4d);
727 else 723 else
728 p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d); 724 p4d_populate_safe(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
729 spin_unlock(&init_mm.page_table_lock); 725 spin_unlock(&init_mm.page_table_lock);
730 pgd_changed = true; 726 pgd_changed = true;
731 } 727 }
@@ -733,8 +729,6 @@ kernel_physical_mapping_init(unsigned long paddr_start,
733 if (pgd_changed) 729 if (pgd_changed)
734 sync_global_pgds(vaddr_start, vaddr_end - 1); 730 sync_global_pgds(vaddr_start, vaddr_end - 1);
735 731
736 __flush_tlb_all();
737
738 return paddr_last; 732 return paddr_last;
739} 733}
740 734
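
The set_*_safe() conversions above lean on the *_same() comparisons that the asm-generic/pgtable.h patches in this series make available at every page-table level (see the include/asm-generic entries in the diffstat). A sketch of the generic fallbacks, which architectures may override:

	static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
	{
		return pmd_val(pmd_a) == pmd_val(pmd_b);
	}

	static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b)
	{
		return p4d_val(p4d_a) == p4d_val(p4d_b);
	}

	static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b)
	{
		return pgd_val(pgd_a) == pgd_val(pgd_b);
	}
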
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
index 4e1f6e1b8159..319bde386d5f 100644
--- a/arch/x86/mm/mm_internal.h
+++ b/arch/x86/mm/mm_internal.h
@@ -19,4 +19,6 @@ extern int after_bootmem;
19 19
20void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache); 20void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache);
21 21
22extern unsigned long tlb_single_page_flush_ceiling;
23
22#endif /* __X86_MM_INTERNAL_H */ 24#endif /* __X86_MM_INTERNAL_H */
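
tlb_single_page_flush_ceiling lives in arch/x86/mm/tlb.c (the small tlb.c change in the diffstat makes it non-static); it is the same heuristic flush_tlb_mm_range() already uses, and cpa_flush() in pageattr.c below reuses it to choose between per-page and full TLB flushes. As of this series its default is, to the best of my knowledge, 33 pages:

	/* From arch/x86/mm/tlb.c: flush at most this many pages one by one
	 * before falling back to a full TLB flush. */
	unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
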
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 08f8f76a4852..facce271e8b9 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -23,7 +23,8 @@
23static __read_mostly int print = 1; 23static __read_mostly int print = 1;
24 24
25enum { 25enum {
26 NTEST = 400, 26 NTEST = 3 * 100,
27 NPAGES = 100,
27#ifdef CONFIG_X86_64 28#ifdef CONFIG_X86_64
28 LPS = (1 << PMD_SHIFT), 29 LPS = (1 << PMD_SHIFT),
29#elif defined(CONFIG_X86_PAE) 30#elif defined(CONFIG_X86_PAE)
@@ -110,6 +111,9 @@ static int print_split(struct split_state *s)
110static unsigned long addr[NTEST]; 111static unsigned long addr[NTEST];
111static unsigned int len[NTEST]; 112static unsigned int len[NTEST];
112 113
114static struct page *pages[NPAGES];
115static unsigned long addrs[NPAGES];
116
113/* Change the global bit on random pages in the direct mapping */ 117/* Change the global bit on random pages in the direct mapping */
114static int pageattr_test(void) 118static int pageattr_test(void)
115{ 119{
@@ -120,7 +124,6 @@ static int pageattr_test(void)
120 unsigned int level; 124 unsigned int level;
121 int i, k; 125 int i, k;
122 int err; 126 int err;
123 unsigned long test_addr;
124 127
125 if (print) 128 if (print)
126 printk(KERN_INFO "CPA self-test:\n"); 129 printk(KERN_INFO "CPA self-test:\n");
@@ -137,7 +140,7 @@ static int pageattr_test(void)
137 unsigned long pfn = prandom_u32() % max_pfn_mapped; 140 unsigned long pfn = prandom_u32() % max_pfn_mapped;
138 141
139 addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT); 142 addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT);
140 len[i] = prandom_u32() % 100; 143 len[i] = prandom_u32() % NPAGES;
141 len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1); 144 len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1);
142 145
143 if (len[i] == 0) 146 if (len[i] == 0)
@@ -167,14 +170,29 @@ static int pageattr_test(void)
167 break; 170 break;
168 } 171 }
169 __set_bit(pfn + k, bm); 172 __set_bit(pfn + k, bm);
173 addrs[k] = addr[i] + k*PAGE_SIZE;
174 pages[k] = pfn_to_page(pfn + k);
170 } 175 }
171 if (!addr[i] || !pte || !k) { 176 if (!addr[i] || !pte || !k) {
172 addr[i] = 0; 177 addr[i] = 0;
173 continue; 178 continue;
174 } 179 }
175 180
176 test_addr = addr[i]; 181 switch (i % 3) {
177 err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0); 182 case 0:
183 err = change_page_attr_set(&addr[i], len[i], PAGE_CPA_TEST, 0);
184 break;
185
186 case 1:
187 err = change_page_attr_set(addrs, len[1], PAGE_CPA_TEST, 1);
188 break;
189
190 case 2:
191 err = cpa_set_pages_array(pages, len[i], PAGE_CPA_TEST);
192 break;
193 }
194
195
178 if (err < 0) { 196 if (err < 0) {
179 printk(KERN_ERR "CPA %d failed %d\n", i, err); 197 printk(KERN_ERR "CPA %d failed %d\n", i, err);
180 failed++; 198 failed++;
@@ -206,8 +224,7 @@ static int pageattr_test(void)
206 failed++; 224 failed++;
207 continue; 225 continue;
208 } 226 }
209 test_addr = addr[i]; 227 err = change_page_attr_clear(&addr[i], len[i], PAGE_CPA_TEST, 0);
210 err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0);
211 if (err < 0) { 228 if (err < 0) {
212 printk(KERN_ERR "CPA reverting failed: %d\n", err); 229 printk(KERN_ERR "CPA reverting failed: %d\n", err);
213 failed++; 230 failed++;
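
With the switch() added above, the selftest now rotates through the three addressing modes the CPA code supports, so the ARRAY and PAGES_ARRAY paths finally get exercised. For reference, the three call shapes look like this (addr, addr_array, page_array and numpages are placeholder names, not variables from the test):

	/* Range mode: one base address, numpages contiguous pages. */
	err = change_page_attr_set(&addr, numpages, PAGE_CPA_TEST, 0);

	/* CPA_ARRAY mode: an array of page-sized virtual addresses. */
	err = change_page_attr_set(addr_array, numpages, PAGE_CPA_TEST, 1);

	/* CPA_PAGES_ARRAY mode: an array of struct page pointers. */
	err = cpa_set_pages_array(page_array, numpages, PAGE_CPA_TEST);
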
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e44fe1a63f72..4f8972311a77 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -26,6 +26,8 @@
26#include <asm/pat.h> 26#include <asm/pat.h>
27#include <asm/set_memory.h> 27#include <asm/set_memory.h>
28 28
29#include "mm_internal.h"
30
29/* 31/*
30 * The current flushing context - we pass it instead of 5 arguments: 32 * The current flushing context - we pass it instead of 5 arguments:
31 */ 33 */
@@ -35,11 +37,11 @@ struct cpa_data {
35 pgprot_t mask_set; 37 pgprot_t mask_set;
36 pgprot_t mask_clr; 38 pgprot_t mask_clr;
37 unsigned long numpages; 39 unsigned long numpages;
38 int flags; 40 unsigned long curpage;
39 unsigned long pfn; 41 unsigned long pfn;
40 unsigned force_split : 1, 42 unsigned int flags;
43 unsigned int force_split : 1,
41 force_static_prot : 1; 44 force_static_prot : 1;
42 int curpage;
43 struct page **pages; 45 struct page **pages;
44}; 46};
45 47
@@ -228,19 +230,28 @@ static bool __cpa_pfn_in_highmap(unsigned long pfn)
228 230
229#endif 231#endif
230 232
233static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx)
234{
235 if (cpa->flags & CPA_PAGES_ARRAY) {
236 struct page *page = cpa->pages[idx];
237
238 if (unlikely(PageHighMem(page)))
239 return 0;
240
241 return (unsigned long)page_address(page);
242 }
243
244 if (cpa->flags & CPA_ARRAY)
245 return cpa->vaddr[idx];
246
247 return *cpa->vaddr + idx * PAGE_SIZE;
248}
249
231/* 250/*
232 * Flushing functions 251 * Flushing functions
233 */ 252 */
234 253
235/** 254static void clflush_cache_range_opt(void *vaddr, unsigned int size)
236 * clflush_cache_range - flush a cache range with clflush
237 * @vaddr: virtual start address
238 * @size: number of bytes to flush
239 *
240 * clflushopt is an unordered instruction which needs fencing with mfence or
241 * sfence to avoid ordering issues.
242 */
243void clflush_cache_range(void *vaddr, unsigned int size)
244{ 255{
245 const unsigned long clflush_size = boot_cpu_data.x86_clflush_size; 256 const unsigned long clflush_size = boot_cpu_data.x86_clflush_size;
246 void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1)); 257 void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1));
@@ -249,11 +260,22 @@ void clflush_cache_range(void *vaddr, unsigned int size)
249 if (p >= vend) 260 if (p >= vend)
250 return; 261 return;
251 262
252 mb();
253
254 for (; p < vend; p += clflush_size) 263 for (; p < vend; p += clflush_size)
255 clflushopt(p); 264 clflushopt(p);
265}
256 266
267/**
268 * clflush_cache_range - flush a cache range with clflush
269 * @vaddr: virtual start address
270 * @size: number of bytes to flush
271 *
272 * CLFLUSHOPT is an unordered instruction which needs fencing with MFENCE or
273 * SFENCE to avoid ordering issues.
274 */
275void clflush_cache_range(void *vaddr, unsigned int size)
276{
277 mb();
278 clflush_cache_range_opt(vaddr, size);
257 mb(); 279 mb();
258} 280}
259EXPORT_SYMBOL_GPL(clflush_cache_range); 281EXPORT_SYMBOL_GPL(clflush_cache_range);
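
Splitting out clflush_cache_range_opt() lets cpa_flush() further down hoist the fence pair out of its per-page loop (the "Better use CLFLUSHOPT" change), while the exported clflush_cache_range() keeps its own fencing for external callers. An illustrative comparison of the two call patterns (simplified; the real cpa_flush() also skips non-present PTEs):

	/* External callers: fenced, self-contained, semantics unchanged. */
	clflush_cache_range(vaddr, size);	/* mb(); clflushopt loop; mb(); */

	/* cpa_flush(): one fence pair around the whole loop, not per page. */
	mb();
	for (i = 0; i < cpa->numpages; i++)
		clflush_cache_range_opt((void *)__cpa_addr(cpa, i), PAGE_SIZE);
	mb();
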
@@ -285,87 +307,49 @@ static void cpa_flush_all(unsigned long cache)
285 on_each_cpu(__cpa_flush_all, (void *) cache, 1); 307 on_each_cpu(__cpa_flush_all, (void *) cache, 1);
286} 308}
287 309
288static bool __inv_flush_all(int cache) 310void __cpa_flush_tlb(void *data)
289{ 311{
290 BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); 312 struct cpa_data *cpa = data;
313 unsigned int i;
291 314
292 if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { 315 for (i = 0; i < cpa->numpages; i++)
293 cpa_flush_all(cache); 316 __flush_tlb_one_kernel(__cpa_addr(cpa, i));
294 return true;
295 }
296
297 return false;
298} 317}
299 318
300static void cpa_flush_range(unsigned long start, int numpages, int cache) 319static void cpa_flush(struct cpa_data *data, int cache)
301{ 320{
302 unsigned int i, level; 321 struct cpa_data *cpa = data;
303 unsigned long addr; 322 unsigned int i;
304 323
305 WARN_ON(PAGE_ALIGN(start) != start); 324 BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
306
307 if (__inv_flush_all(cache))
308 return;
309
310 flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages);
311 325
312 if (!cache) 326 if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
327 cpa_flush_all(cache);
313 return; 328 return;
314
315 /*
316 * We only need to flush on one CPU,
317 * clflush is a MESI-coherent instruction that
318 * will cause all other CPUs to flush the same
319 * cachelines:
320 */
321 for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
322 pte_t *pte = lookup_address(addr, &level);
323
324 /*
325 * Only flush present addresses:
326 */
327 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
328 clflush_cache_range((void *) addr, PAGE_SIZE);
329 } 329 }
330}
331 330
332static void cpa_flush_array(unsigned long baddr, unsigned long *start, 331 if (cpa->numpages <= tlb_single_page_flush_ceiling)
333 int numpages, int cache, 332 on_each_cpu(__cpa_flush_tlb, cpa, 1);
334 int in_flags, struct page **pages) 333 else
335{ 334 flush_tlb_all();
336 unsigned int i, level;
337
338 if (__inv_flush_all(cache))
339 return;
340
341 flush_tlb_all();
342 335
343 if (!cache) 336 if (!cache)
344 return; 337 return;
345 338
346 /* 339 mb();
347 * We only need to flush on one CPU, 340 for (i = 0; i < cpa->numpages; i++) {
348 * clflush is a MESI-coherent instruction that 341 unsigned long addr = __cpa_addr(cpa, i);
349 * will cause all other CPUs to flush the same 342 unsigned int level;
350 * cachelines:
351 */
352 for (i = 0; i < numpages; i++) {
353 unsigned long addr;
354 pte_t *pte;
355
356 if (in_flags & CPA_PAGES_ARRAY)
357 addr = (unsigned long)page_address(pages[i]);
358 else
359 addr = start[i];
360 343
361 pte = lookup_address(addr, &level); 344 pte_t *pte = lookup_address(addr, &level);
362 345
363 /* 346 /*
364 * Only flush present addresses: 347 * Only flush present addresses:
365 */ 348 */
366 if (pte && (pte_val(*pte) & _PAGE_PRESENT)) 349 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
367 clflush_cache_range((void *)addr, PAGE_SIZE); 350 clflush_cache_range_opt((void *)addr, PAGE_SIZE);
368 } 351 }
352 mb();
369} 353}
370 354
371static bool overlaps(unsigned long r1_start, unsigned long r1_end, 355static bool overlaps(unsigned long r1_start, unsigned long r1_end,
@@ -1476,15 +1460,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
1476 unsigned int level; 1460 unsigned int level;
1477 pte_t *kpte, old_pte; 1461 pte_t *kpte, old_pte;
1478 1462
1479 if (cpa->flags & CPA_PAGES_ARRAY) { 1463 address = __cpa_addr(cpa, cpa->curpage);
1480 struct page *page = cpa->pages[cpa->curpage];
1481 if (unlikely(PageHighMem(page)))
1482 return 0;
1483 address = (unsigned long)page_address(page);
1484 } else if (cpa->flags & CPA_ARRAY)
1485 address = cpa->vaddr[cpa->curpage];
1486 else
1487 address = *cpa->vaddr;
1488repeat: 1464repeat:
1489 kpte = _lookup_address_cpa(cpa, address, &level); 1465 kpte = _lookup_address_cpa(cpa, address, &level);
1490 if (!kpte) 1466 if (!kpte)
@@ -1565,22 +1541,14 @@ static int cpa_process_alias(struct cpa_data *cpa)
1565 * No need to redo, when the primary call touched the direct 1541 * No need to redo, when the primary call touched the direct
1566 * mapping already: 1542 * mapping already:
1567 */ 1543 */
1568 if (cpa->flags & CPA_PAGES_ARRAY) { 1544 vaddr = __cpa_addr(cpa, cpa->curpage);
1569 struct page *page = cpa->pages[cpa->curpage];
1570 if (unlikely(PageHighMem(page)))
1571 return 0;
1572 vaddr = (unsigned long)page_address(page);
1573 } else if (cpa->flags & CPA_ARRAY)
1574 vaddr = cpa->vaddr[cpa->curpage];
1575 else
1576 vaddr = *cpa->vaddr;
1577
1578 if (!(within(vaddr, PAGE_OFFSET, 1545 if (!(within(vaddr, PAGE_OFFSET,
1579 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { 1546 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
1580 1547
1581 alias_cpa = *cpa; 1548 alias_cpa = *cpa;
1582 alias_cpa.vaddr = &laddr; 1549 alias_cpa.vaddr = &laddr;
1583 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); 1550 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1551 alias_cpa.curpage = 0;
1584 1552
1585 ret = __change_page_attr_set_clr(&alias_cpa, 0); 1553 ret = __change_page_attr_set_clr(&alias_cpa, 0);
1586 if (ret) 1554 if (ret)
@@ -1600,6 +1568,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
1600 alias_cpa = *cpa; 1568 alias_cpa = *cpa;
1601 alias_cpa.vaddr = &temp_cpa_vaddr; 1569 alias_cpa.vaddr = &temp_cpa_vaddr;
1602 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); 1570 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1571 alias_cpa.curpage = 0;
1603 1572
1604 /* 1573 /*
1605 * The high mapping range is imprecise, so ignore the 1574 * The high mapping range is imprecise, so ignore the
@@ -1615,14 +1584,15 @@ static int cpa_process_alias(struct cpa_data *cpa)
1615static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) 1584static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
1616{ 1585{
1617 unsigned long numpages = cpa->numpages; 1586 unsigned long numpages = cpa->numpages;
1618 int ret; 1587 unsigned long rempages = numpages;
1588 int ret = 0;
1619 1589
1620 while (numpages) { 1590 while (rempages) {
1621 /* 1591 /*
1622 * Store the remaining nr of pages for the large page 1592 * Store the remaining nr of pages for the large page
1623 * preservation check. 1593 * preservation check.
1624 */ 1594 */
1625 cpa->numpages = numpages; 1595 cpa->numpages = rempages;
1626 /* for array changes, we can't use large page */ 1596 /* for array changes, we can't use large page */
1627 if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) 1597 if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
1628 cpa->numpages = 1; 1598 cpa->numpages = 1;
@@ -1633,12 +1603,12 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
1633 if (!debug_pagealloc_enabled()) 1603 if (!debug_pagealloc_enabled())
1634 spin_unlock(&cpa_lock); 1604 spin_unlock(&cpa_lock);
1635 if (ret) 1605 if (ret)
1636 return ret; 1606 goto out;
1637 1607
1638 if (checkalias) { 1608 if (checkalias) {
1639 ret = cpa_process_alias(cpa); 1609 ret = cpa_process_alias(cpa);
1640 if (ret) 1610 if (ret)
1641 return ret; 1611 goto out;
1642 } 1612 }
1643 1613
1644 /* 1614 /*
@@ -1646,15 +1616,15 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
1646 * CPA operation. Either a large page has been 1616 * CPA operation. Either a large page has been
1647 * preserved or a single page update happened. 1617 * preserved or a single page update happened.
1648 */ 1618 */
1649 BUG_ON(cpa->numpages > numpages || !cpa->numpages); 1619 BUG_ON(cpa->numpages > rempages || !cpa->numpages);
1650 numpages -= cpa->numpages; 1620 rempages -= cpa->numpages;
1651 if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) 1621 cpa->curpage += cpa->numpages;
1652 cpa->curpage++;
1653 else
1654 *cpa->vaddr += cpa->numpages * PAGE_SIZE;
1655
1656 } 1622 }
1657 return 0; 1623
1624out:
1625 /* Restore the original numpages */
1626 cpa->numpages = numpages;
1627 return ret;
1658} 1628}
1659 1629
1660/* 1630/*
@@ -1687,7 +1657,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
1687{ 1657{
1688 struct cpa_data cpa; 1658 struct cpa_data cpa;
1689 int ret, cache, checkalias; 1659 int ret, cache, checkalias;
1690 unsigned long baddr = 0;
1691 1660
1692 memset(&cpa, 0, sizeof(cpa)); 1661 memset(&cpa, 0, sizeof(cpa));
1693 1662
@@ -1721,11 +1690,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
1721 */ 1690 */
1722 WARN_ON_ONCE(1); 1691 WARN_ON_ONCE(1);
1723 } 1692 }
1724 /*
1725 * Save address for cache flush. *addr is modified in the call
1726 * to __change_page_attr_set_clr() below.
1727 */
1728 baddr = make_addr_canonical_again(*addr);
1729 } 1693 }
1730 1694
1731 /* Must avoid aliasing mappings in the highmem code */ 1695 /* Must avoid aliasing mappings in the highmem code */
@@ -1773,13 +1737,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
1773 goto out; 1737 goto out;
1774 } 1738 }
1775 1739
1776 if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { 1740 cpa_flush(&cpa, cache);
1777 cpa_flush_array(baddr, addr, numpages, cache,
1778 cpa.flags, pages);
1779 } else {
1780 cpa_flush_range(baddr, numpages, cache);
1781 }
1782
1783out: 1741out:
1784 return ret; 1742 return ret;
1785} 1743}
@@ -1850,14 +1808,14 @@ out_err:
1850} 1808}
1851EXPORT_SYMBOL(set_memory_uc); 1809EXPORT_SYMBOL(set_memory_uc);
1852 1810
1853static int _set_memory_array(unsigned long *addr, int addrinarray, 1811static int _set_memory_array(unsigned long *addr, int numpages,
1854 enum page_cache_mode new_type) 1812 enum page_cache_mode new_type)
1855{ 1813{
1856 enum page_cache_mode set_type; 1814 enum page_cache_mode set_type;
1857 int i, j; 1815 int i, j;
1858 int ret; 1816 int ret;
1859 1817
1860 for (i = 0; i < addrinarray; i++) { 1818 for (i = 0; i < numpages; i++) {
1861 ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, 1819 ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
1862 new_type, NULL); 1820 new_type, NULL);
1863 if (ret) 1821 if (ret)
@@ -1868,11 +1826,11 @@ static int _set_memory_array(unsigned long *addr, int addrinarray,
1868 set_type = (new_type == _PAGE_CACHE_MODE_WC) ? 1826 set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
1869 _PAGE_CACHE_MODE_UC_MINUS : new_type; 1827 _PAGE_CACHE_MODE_UC_MINUS : new_type;
1870 1828
1871 ret = change_page_attr_set(addr, addrinarray, 1829 ret = change_page_attr_set(addr, numpages,
1872 cachemode2pgprot(set_type), 1); 1830 cachemode2pgprot(set_type), 1);
1873 1831
1874 if (!ret && new_type == _PAGE_CACHE_MODE_WC) 1832 if (!ret && new_type == _PAGE_CACHE_MODE_WC)
1875 ret = change_page_attr_set_clr(addr, addrinarray, 1833 ret = change_page_attr_set_clr(addr, numpages,
1876 cachemode2pgprot( 1834 cachemode2pgprot(
1877 _PAGE_CACHE_MODE_WC), 1835 _PAGE_CACHE_MODE_WC),
1878 __pgprot(_PAGE_CACHE_MASK), 1836 __pgprot(_PAGE_CACHE_MASK),
@@ -1889,36 +1847,34 @@ out_free:
1889 return ret; 1847 return ret;
1890} 1848}
1891 1849
1892int set_memory_array_uc(unsigned long *addr, int addrinarray) 1850int set_memory_array_uc(unsigned long *addr, int numpages)
1893{ 1851{
1894 return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); 1852 return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_UC_MINUS);
1895} 1853}
1896EXPORT_SYMBOL(set_memory_array_uc); 1854EXPORT_SYMBOL(set_memory_array_uc);
1897 1855
1898int set_memory_array_wc(unsigned long *addr, int addrinarray) 1856int set_memory_array_wc(unsigned long *addr, int numpages)
1899{ 1857{
1900 return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC); 1858 return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_WC);
1901} 1859}
1902EXPORT_SYMBOL(set_memory_array_wc); 1860EXPORT_SYMBOL(set_memory_array_wc);
1903 1861
1904int set_memory_array_wt(unsigned long *addr, int addrinarray) 1862int set_memory_array_wt(unsigned long *addr, int numpages)
1905{ 1863{
1906 return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT); 1864 return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_WT);
1907} 1865}
1908EXPORT_SYMBOL_GPL(set_memory_array_wt); 1866EXPORT_SYMBOL_GPL(set_memory_array_wt);
1909 1867
1910int _set_memory_wc(unsigned long addr, int numpages) 1868int _set_memory_wc(unsigned long addr, int numpages)
1911{ 1869{
1912 int ret; 1870 int ret;
1913 unsigned long addr_copy = addr;
1914 1871
1915 ret = change_page_attr_set(&addr, numpages, 1872 ret = change_page_attr_set(&addr, numpages,
1916 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 1873 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
1917 0); 1874 0);
1918 if (!ret) { 1875 if (!ret) {
1919 ret = change_page_attr_set_clr(&addr_copy, numpages, 1876 ret = change_page_attr_set_clr(&addr, numpages,
1920 cachemode2pgprot( 1877 cachemode2pgprot(_PAGE_CACHE_MODE_WC),
1921 _PAGE_CACHE_MODE_WC),
1922 __pgprot(_PAGE_CACHE_MASK), 1878 __pgprot(_PAGE_CACHE_MASK),
1923 0, 0, NULL); 1879 0, 0, NULL);
1924 } 1880 }
@@ -1985,18 +1941,18 @@ int set_memory_wb(unsigned long addr, int numpages)
1985} 1941}
1986EXPORT_SYMBOL(set_memory_wb); 1942EXPORT_SYMBOL(set_memory_wb);
1987 1943
1988int set_memory_array_wb(unsigned long *addr, int addrinarray) 1944int set_memory_array_wb(unsigned long *addr, int numpages)
1989{ 1945{
1990 int i; 1946 int i;
1991 int ret; 1947 int ret;
1992 1948
1993 /* WB cache mode is hard wired to all cache attribute bits being 0 */ 1949 /* WB cache mode is hard wired to all cache attribute bits being 0 */
1994 ret = change_page_attr_clear(addr, addrinarray, 1950 ret = change_page_attr_clear(addr, numpages,
1995 __pgprot(_PAGE_CACHE_MASK), 1); 1951 __pgprot(_PAGE_CACHE_MASK), 1);
1996 if (ret) 1952 if (ret)
1997 return ret; 1953 return ret;
1998 1954
1999 for (i = 0; i < addrinarray; i++) 1955 for (i = 0; i < numpages; i++)
2000 free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); 1956 free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);
2001 1957
2002 return 0; 1958 return 0;
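
Throughout the *_array() helpers the renamed @numpages is still the number of entries in @addr, one page per entry. A hedged usage sketch of the exported pair; the allocation scaffolding below is invented for illustration, only set_memory_array_wc() and set_memory_array_wb() are the real interfaces:

#include <linux/errno.h>
#include <linux/gfp.h>
#include <asm/set_memory.h>

#define NR_EXAMPLE_PAGES 8	/* example count, not from the patch */

static unsigned long example_addrs[NR_EXAMPLE_PAGES];

static int example_map_wc(void)
{
	int i, ret = -ENOMEM;

	for (i = 0; i < NR_EXAMPLE_PAGES; i++) {
		example_addrs[i] = get_zeroed_page(GFP_KERNEL);
		if (!example_addrs[i])
			goto free;
	}

	/* One call changes the attribute on all (scattered) pages. */
	ret = set_memory_array_wc(example_addrs, NR_EXAMPLE_PAGES);
	if (!ret)
		return 0;
free:
	while (i--)
		free_page(example_addrs[i]);
	return ret;
}

static void example_unmap_wc(void)
{
	int i;

	/* Restore write-back before the pages are reused. */
	set_memory_array_wb(example_addrs, NR_EXAMPLE_PAGES);
	for (i = 0; i < NR_EXAMPLE_PAGES; i++)
		free_page(example_addrs[i]);
}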
@@ -2066,7 +2022,6 @@ int set_memory_global(unsigned long addr, int numpages)
2066static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) 2022static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
2067{ 2023{
2068 struct cpa_data cpa; 2024 struct cpa_data cpa;
2069 unsigned long start;
2070 int ret; 2025 int ret;
2071 2026
2072 /* Nothing to do if memory encryption is not active */ 2027 /* Nothing to do if memory encryption is not active */
@@ -2077,8 +2032,6 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
2077 if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr)) 2032 if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
2078 addr &= PAGE_MASK; 2033 addr &= PAGE_MASK;
2079 2034
2080 start = addr;
2081
2082 memset(&cpa, 0, sizeof(cpa)); 2035 memset(&cpa, 0, sizeof(cpa));
2083 cpa.vaddr = &addr; 2036 cpa.vaddr = &addr;
2084 cpa.numpages = numpages; 2037 cpa.numpages = numpages;
@@ -2093,18 +2046,18 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
2093 /* 2046 /*
2094 * Before changing the encryption attribute, we need to flush caches. 2047 * Before changing the encryption attribute, we need to flush caches.
2095 */ 2048 */
2096 cpa_flush_range(start, numpages, 1); 2049 cpa_flush(&cpa, 1);
2097 2050
2098 ret = __change_page_attr_set_clr(&cpa, 1); 2051 ret = __change_page_attr_set_clr(&cpa, 1);
2099 2052
2100 /* 2053 /*
2101 * After changing the encryption attribute, we need to flush TLBs 2054 * After changing the encryption attribute, we need to flush TLBs again
2102 * again in case any speculative TLB caching occurred (but no need 2055 * in case any speculative TLB caching occurred (but no need to flush
2103 * to flush caches again). We could just use cpa_flush_all(), but 2056 * caches again). We could just use cpa_flush_all(), but in case TLB
2104 * in case TLB flushing gets optimized in the cpa_flush_range() 2057 * flushing gets optimized in the cpa_flush() path use the same logic
2105 * path use the same logic as above. 2058 * as above.
2106 */ 2059 */
2107 cpa_flush_range(start, numpages, 0); 2060 cpa_flush(&cpa, 0);
2108 2061
2109 return ret; 2062 return ret;
2110} 2063}
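
__set_memory_enc_dec() is reached through set_memory_decrypted() and set_memory_encrypted(), which is how a driver shares specific pages with the hypervisor or a device while SME/SEV keeps the rest of memory encrypted. A hedged sketch of that usage; the two wrappers below are invented for illustration, the set_memory_*() calls are the existing exported API:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <asm/set_memory.h>

static void *example_alloc_shared_page(void)
{
	struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (!page)
		return NULL;

	/* Clear the C-bit on this one page so the other side reads plaintext. */
	if (set_memory_decrypted((unsigned long)page_address(page), 1)) {
		__free_page(page);
		return NULL;
	}
	return page_address(page);
}

static void example_free_shared_page(void *va)
{
	/* Re-encrypt before the page goes back to the allocator. */
	set_memory_encrypted((unsigned long)va, 1);
	free_page((unsigned long)va);
}

When memory encryption is not active both calls return early, so the same driver code also works on unencrypted hosts.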
@@ -2129,7 +2082,7 @@ int set_pages_uc(struct page *page, int numpages)
2129} 2082}
2130EXPORT_SYMBOL(set_pages_uc); 2083EXPORT_SYMBOL(set_pages_uc);
2131 2084
2132static int _set_pages_array(struct page **pages, int addrinarray, 2085static int _set_pages_array(struct page **pages, int numpages,
2133 enum page_cache_mode new_type) 2086 enum page_cache_mode new_type)
2134{ 2087{
2135 unsigned long start; 2088 unsigned long start;
@@ -2139,7 +2092,7 @@ static int _set_pages_array(struct page **pages, int addrinarray,
2139 int free_idx; 2092 int free_idx;
2140 int ret; 2093 int ret;
2141 2094
2142 for (i = 0; i < addrinarray; i++) { 2095 for (i = 0; i < numpages; i++) {
2143 if (PageHighMem(pages[i])) 2096 if (PageHighMem(pages[i]))
2144 continue; 2097 continue;
2145 start = page_to_pfn(pages[i]) << PAGE_SHIFT; 2098 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
@@ -2152,10 +2105,10 @@ static int _set_pages_array(struct page **pages, int addrinarray,
2152 set_type = (new_type == _PAGE_CACHE_MODE_WC) ? 2105 set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
2153 _PAGE_CACHE_MODE_UC_MINUS : new_type; 2106 _PAGE_CACHE_MODE_UC_MINUS : new_type;
2154 2107
2155 ret = cpa_set_pages_array(pages, addrinarray, 2108 ret = cpa_set_pages_array(pages, numpages,
2156 cachemode2pgprot(set_type)); 2109 cachemode2pgprot(set_type));
2157 if (!ret && new_type == _PAGE_CACHE_MODE_WC) 2110 if (!ret && new_type == _PAGE_CACHE_MODE_WC)
2158 ret = change_page_attr_set_clr(NULL, addrinarray, 2111 ret = change_page_attr_set_clr(NULL, numpages,
2159 cachemode2pgprot( 2112 cachemode2pgprot(
2160 _PAGE_CACHE_MODE_WC), 2113 _PAGE_CACHE_MODE_WC),
2161 __pgprot(_PAGE_CACHE_MASK), 2114 __pgprot(_PAGE_CACHE_MASK),
@@ -2175,21 +2128,21 @@ err_out:
2175 return -EINVAL; 2128 return -EINVAL;
2176} 2129}
2177 2130
2178int set_pages_array_uc(struct page **pages, int addrinarray) 2131int set_pages_array_uc(struct page **pages, int numpages)
2179{ 2132{
2180 return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); 2133 return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_UC_MINUS);
2181} 2134}
2182EXPORT_SYMBOL(set_pages_array_uc); 2135EXPORT_SYMBOL(set_pages_array_uc);
2183 2136
2184int set_pages_array_wc(struct page **pages, int addrinarray) 2137int set_pages_array_wc(struct page **pages, int numpages)
2185{ 2138{
2186 return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC); 2139 return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WC);
2187} 2140}
2188EXPORT_SYMBOL(set_pages_array_wc); 2141EXPORT_SYMBOL(set_pages_array_wc);
2189 2142
2190int set_pages_array_wt(struct page **pages, int addrinarray) 2143int set_pages_array_wt(struct page **pages, int numpages)
2191{ 2144{
2192 return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WT); 2145 return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WT);
2193} 2146}
2194EXPORT_SYMBOL_GPL(set_pages_array_wt); 2147EXPORT_SYMBOL_GPL(set_pages_array_wt);
2195 2148
@@ -2201,7 +2154,7 @@ int set_pages_wb(struct page *page, int numpages)
2201} 2154}
2202EXPORT_SYMBOL(set_pages_wb); 2155EXPORT_SYMBOL(set_pages_wb);
2203 2156
2204int set_pages_array_wb(struct page **pages, int addrinarray) 2157int set_pages_array_wb(struct page **pages, int numpages)
2205{ 2158{
2206 int retval; 2159 int retval;
2207 unsigned long start; 2160 unsigned long start;
@@ -2209,12 +2162,12 @@ int set_pages_array_wb(struct page **pages, int addrinarray)
2209 int i; 2162 int i;
2210 2163
2211 /* WB cache mode is hard wired to all cache attribute bits being 0 */ 2164 /* WB cache mode is hard wired to all cache attribute bits being 0 */
2212 retval = cpa_clear_pages_array(pages, addrinarray, 2165 retval = cpa_clear_pages_array(pages, numpages,
2213 __pgprot(_PAGE_CACHE_MASK)); 2166 __pgprot(_PAGE_CACHE_MASK));
2214 if (retval) 2167 if (retval)
2215 return retval; 2168 return retval;
2216 2169
2217 for (i = 0; i < addrinarray; i++) { 2170 for (i = 0; i < numpages; i++) {
2218 if (PageHighMem(pages[i])) 2171 if (PageHighMem(pages[i]))
2219 continue; 2172 continue;
2220 start = page_to_pfn(pages[i]) << PAGE_SHIFT; 2173 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 03b6b4c2238d..999d6d8f0bef 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -15,6 +15,8 @@
15#include <asm/apic.h> 15#include <asm/apic.h>
16#include <asm/uv/uv.h> 16#include <asm/uv/uv.h>
17 17
18#include "mm_internal.h"
19
18/* 20/*
19 * TLB flushing, formerly SMP-only 21 * TLB flushing, formerly SMP-only
20 * c/o Linus Torvalds. 22 * c/o Linus Torvalds.
@@ -721,7 +723,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
721 * 723 *
722 * This is in units of pages. 724 * This is in units of pages.
723 */ 725 */
724static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; 726unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
725 727
726void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, 728void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
727 unsigned long end, unsigned int stride_shift, 729 unsigned long end, unsigned int stride_shift,
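
Dropping the static (together with the mm_internal.h include above) lets the pageattr code share the heuristic flush_tlb_mm_range() already uses: up to tlb_single_page_flush_ceiling pages are invalidated individually, anything larger gets a full flush. A hedged sketch of a consumer of that ceiling; the helper is illustrative, and its per-page branch only touches the local CPU, unlike the real flush paths which broadcast:

#include <asm/tlbflush.h>
#include "mm_internal.h"	/* declares tlb_single_page_flush_ceiling */

static void example_flush_kernel_range(unsigned long start, unsigned int numpages)
{
	unsigned int i;

	if (numpages > tlb_single_page_flush_ceiling) {
		/* Past the ceiling, one global flush beats N INVLPGs. */
		flush_tlb_all();
		return;
	}

	/* Few pages: invalidate them one at a time (local CPU only here;
	 * the kernel's own code broadcasts via on_each_cpu()). */
	for (i = 0; i < numpages; i++)
		__flush_tlb_one_kernel(start + i * PAGE_SIZE);
}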
diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h
index 73474bb52344..bb6cb347018c 100644
--- a/include/asm-generic/5level-fixup.h
+++ b/include/asm-generic/5level-fixup.h
@@ -26,6 +26,7 @@
26#define p4d_clear(p4d) pgd_clear(p4d) 26#define p4d_clear(p4d) pgd_clear(p4d)
27#define p4d_val(p4d) pgd_val(p4d) 27#define p4d_val(p4d) pgd_val(p4d)
28#define p4d_populate(mm, p4d, pud) pgd_populate(mm, p4d, pud) 28#define p4d_populate(mm, p4d, pud) pgd_populate(mm, p4d, pud)
29#define p4d_populate_safe(mm, p4d, pud) pgd_populate(mm, p4d, pud)
29#define p4d_page(p4d) pgd_page(p4d) 30#define p4d_page(p4d) pgd_page(p4d)
30#define p4d_page_vaddr(p4d) pgd_page_vaddr(p4d) 31#define p4d_page_vaddr(p4d) pgd_page_vaddr(p4d)
31 32
diff --git a/include/asm-generic/pgtable-nop4d-hack.h b/include/asm-generic/pgtable-nop4d-hack.h
index 1d6dd38c0e5e..829bdb0d6327 100644
--- a/include/asm-generic/pgtable-nop4d-hack.h
+++ b/include/asm-generic/pgtable-nop4d-hack.h
@@ -31,6 +31,7 @@ static inline void pgd_clear(pgd_t *pgd) { }
31#define pud_ERROR(pud) (pgd_ERROR((pud).pgd)) 31#define pud_ERROR(pud) (pgd_ERROR((pud).pgd))
32 32
33#define pgd_populate(mm, pgd, pud) do { } while (0) 33#define pgd_populate(mm, pgd, pud) do { } while (0)
34#define pgd_populate_safe(mm, pgd, pud) do { } while (0)
34/* 35/*
35 * (puds are folded into pgds so this doesn't get actually called, 36 * (puds are folded into pgds so this doesn't get actually called,
36 * but the define is needed for a generic inline function.) 37 * but the define is needed for a generic inline function.)
diff --git a/include/asm-generic/pgtable-nop4d.h b/include/asm-generic/pgtable-nop4d.h
index 04cb913797bc..aebab905e6cd 100644
--- a/include/asm-generic/pgtable-nop4d.h
+++ b/include/asm-generic/pgtable-nop4d.h
@@ -26,6 +26,7 @@ static inline void pgd_clear(pgd_t *pgd) { }
26#define p4d_ERROR(p4d) (pgd_ERROR((p4d).pgd)) 26#define p4d_ERROR(p4d) (pgd_ERROR((p4d).pgd))
27 27
28#define pgd_populate(mm, pgd, p4d) do { } while (0) 28#define pgd_populate(mm, pgd, p4d) do { } while (0)
29#define pgd_populate_safe(mm, pgd, p4d) do { } while (0)
29/* 30/*
30 * (p4ds are folded into pgds so this doesn't get actually called, 31 * (p4ds are folded into pgds so this doesn't get actually called,
31 * but the define is needed for a generic inline function.) 32 * but the define is needed for a generic inline function.)
diff --git a/include/asm-generic/pgtable-nopud.h b/include/asm-generic/pgtable-nopud.h
index 9bef475db6fe..c77a1d301155 100644
--- a/include/asm-generic/pgtable-nopud.h
+++ b/include/asm-generic/pgtable-nopud.h
@@ -35,6 +35,7 @@ static inline void p4d_clear(p4d_t *p4d) { }
35#define pud_ERROR(pud) (p4d_ERROR((pud).p4d)) 35#define pud_ERROR(pud) (p4d_ERROR((pud).p4d))
36 36
37#define p4d_populate(mm, p4d, pud) do { } while (0) 37#define p4d_populate(mm, p4d, pud) do { } while (0)
38#define p4d_populate_safe(mm, p4d, pud) do { } while (0)
38/* 39/*
39 * (puds are folded into p4ds so this doesn't get actually called, 40 * (puds are folded into p4ds so this doesn't get actually called,
40 * but the define is needed for a generic inline function.) 41 * but the define is needed for a generic inline function.)
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 359fb935ded6..a9cac82e9a7a 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -375,7 +375,6 @@ static inline int pte_unused(pte_t pte)
375#endif 375#endif
376 376
377#ifndef __HAVE_ARCH_PMD_SAME 377#ifndef __HAVE_ARCH_PMD_SAME
378#ifdef CONFIG_TRANSPARENT_HUGEPAGE
379static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) 378static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
380{ 379{
381 return pmd_val(pmd_a) == pmd_val(pmd_b); 380 return pmd_val(pmd_a) == pmd_val(pmd_b);
@@ -385,21 +384,60 @@ static inline int pud_same(pud_t pud_a, pud_t pud_b)
385{ 384{
386 return pud_val(pud_a) == pud_val(pud_b); 385 return pud_val(pud_a) == pud_val(pud_b);
387} 386}
388#else /* CONFIG_TRANSPARENT_HUGEPAGE */ 387#endif
389static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) 388
389#ifndef __HAVE_ARCH_P4D_SAME
390static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b)
390{ 391{
391 BUILD_BUG(); 392 return p4d_val(p4d_a) == p4d_val(p4d_b);
392 return 0;
393} 393}
394#endif
394 395
395static inline int pud_same(pud_t pud_a, pud_t pud_b) 396#ifndef __HAVE_ARCH_PGD_SAME
397static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b)
396{ 398{
397 BUILD_BUG(); 399 return pgd_val(pgd_a) == pgd_val(pgd_b);
398 return 0;
399} 400}
400#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
401#endif 401#endif
402 402
403/*
404 * Use set_p*_safe(), and elide TLB flushing, when confident that *no*
405 * TLB flush will be required as a result of the "set". For example, use
406 * in scenarios where it is known ahead of time that the routine is
407 * setting non-present entries, or re-setting an existing entry to the
408 * same value. Otherwise, use the typical "set" helpers and flush the
409 * TLB.
410 */
411#define set_pte_safe(ptep, pte) \
412({ \
413 WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \
414 set_pte(ptep, pte); \
415})
416
417#define set_pmd_safe(pmdp, pmd) \
418({ \
419 WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \
420 set_pmd(pmdp, pmd); \
421})
422
423#define set_pud_safe(pudp, pud) \
424({ \
425 WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \
426 set_pud(pudp, pud); \
427})
428
429#define set_p4d_safe(p4dp, p4d) \
430({ \
431 WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \
432 set_p4d(p4dp, p4d); \
433})
434
435#define set_pgd_safe(pgdp, pgd) \
436({ \
437 WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \
438 set_pgd(pgdp, pgd); \
439})
440
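
The p4d_same()/pgd_same() helpers introduced above exist mainly so that these WARN_ON_ONCE() comparisons compile at every page-table level. A hedged sketch of the intended use in population code that may legitimately rewrite an entry with the value it already holds; the function below is invented for illustration and is not part of the patch:

/* Illustrative only: (re)populate one PTE.  The caller guarantees that any
 * entry already present maps the same frame with the same protections. */
static void example_populate_pte(pte_t *ptep, unsigned long pfn, pgprot_t prot)
{
	/*
	 * Warns (once) if *ptep is present but differs from the new value;
	 * otherwise this is exactly set_pte(), and no TLB flush is needed
	 * because nothing the hardware can observe has changed.
	 */
	set_pte_safe(ptep, pfn_pte(pfn, prot));
}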
403#ifndef __HAVE_ARCH_DO_SWAP_PAGE 441#ifndef __HAVE_ARCH_DO_SWAP_PAGE
404/* 442/*
405 * Some architectures support metadata associated with a page. When a 443 * Some architectures support metadata associated with a page. When a