Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/fault.c    | 61
-rw-r--r--  arch/x86/mm/numa.c     | 21
-rw-r--r--  arch/x86/mm/numa_32.c  |  2
-rw-r--r--  arch/x86/mm/srat.c     | 16
-rw-r--r--  arch/x86/mm/tlb.c      | 52
5 files changed, 82 insertions, 70 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9d591c895803..a10c8c792161 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1001,6 +1001,12 @@ static int fault_in_kernel_space(unsigned long address)
 
 static inline bool smap_violation(int error_code, struct pt_regs *regs)
 {
+        if (!IS_ENABLED(CONFIG_X86_SMAP))
+                return false;
+
+        if (!static_cpu_has(X86_FEATURE_SMAP))
+                return false;
+
         if (error_code & PF_USER)
                 return false;
 
@@ -1014,13 +1020,17 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
  * This routine handles page faults. It determines the address,
  * and the problem, and then passes it off to one of the appropriate
  * routines.
+ *
+ * This function must have noinline because both callers
+ * {,trace_}do_page_fault() have notrace on. Having this an actual function
+ * guarantees there's a function trace entry.
  */
-static void __kprobes
-__do_page_fault(struct pt_regs *regs, unsigned long error_code)
+static void __kprobes noinline
+__do_page_fault(struct pt_regs *regs, unsigned long error_code,
+                unsigned long address)
 {
         struct vm_area_struct *vma;
         struct task_struct *tsk;
-        unsigned long address;
         struct mm_struct *mm;
         int fault;
         unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
@@ -1028,9 +1038,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
         tsk = current;
         mm = tsk->mm;
 
-        /* Get the faulting address: */
-        address = read_cr2();
-
         /*
          * Detect and handle instructions that would cause a page fault for
          * both a tracked kernel page and a userspace page.
@@ -1087,11 +1094,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
         if (unlikely(error_code & PF_RSVD))
                 pgtable_bad(regs, error_code, address);
 
-        if (static_cpu_has(X86_FEATURE_SMAP)) {
-                if (unlikely(smap_violation(error_code, regs))) {
-                        bad_area_nosemaphore(regs, error_code, address);
-                        return;
-                }
+        if (unlikely(smap_violation(error_code, regs))) {
+                bad_area_nosemaphore(regs, error_code, address);
+                return;
         }
 
         /*
@@ -1244,32 +1249,50 @@ good_area:
         up_read(&mm->mmap_sem);
 }
 
-dotraplinkage void __kprobes
+dotraplinkage void __kprobes notrace
 do_page_fault(struct pt_regs *regs, unsigned long error_code)
 {
+        unsigned long address = read_cr2(); /* Get the faulting address */
         enum ctx_state prev_state;
 
+        /*
+         * We must have this function tagged with __kprobes, notrace and call
+         * read_cr2() before calling anything else. To avoid calling any kind
+         * of tracing machinery before we've observed the CR2 value.
+         *
+         * exception_{enter,exit}() contain all sorts of tracepoints.
+         */
+
         prev_state = exception_enter();
-        __do_page_fault(regs, error_code);
+        __do_page_fault(regs, error_code, address);
         exception_exit(prev_state);
 }
 
-static void trace_page_fault_entries(struct pt_regs *regs,
+#ifdef CONFIG_TRACING
+static void trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
                                      unsigned long error_code)
 {
         if (user_mode(regs))
-                trace_page_fault_user(read_cr2(), regs, error_code);
+                trace_page_fault_user(address, regs, error_code);
         else
-                trace_page_fault_kernel(read_cr2(), regs, error_code);
+                trace_page_fault_kernel(address, regs, error_code);
 }
 
-dotraplinkage void __kprobes
+dotraplinkage void __kprobes notrace
 trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
 {
+        /*
+         * The exception_enter and tracepoint processing could
+         * trigger another page faults (user space callchain
+         * reading) and destroy the original cr2 value, so read
+         * the faulting address now.
+         */
+        unsigned long address = read_cr2();
         enum ctx_state prev_state;
 
         prev_state = exception_enter();
-        trace_page_fault_entries(regs, error_code);
-        __do_page_fault(regs, error_code);
+        trace_page_fault_entries(address, regs, error_code);
+        __do_page_fault(regs, error_code, address);
         exception_exit(prev_state);
 }
+#endif /* CONFIG_TRACING */
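
Note on the hunks above: CR2 is now sampled in both entry points before exception_enter() and any tracepoint can run, because a tracing hook may itself fault (for example while reading a user-space callchain) and overwrite CR2. The following is a minimal user-space sketch of that ordering hazard, not kernel code; fake_cr2, instrumentation_hook() and the two handler names are made up for illustration.

#include <stdio.h>

/* Stand-in for the CR2 register: the address of the most recent fault. */
static unsigned long fake_cr2;

/* Stand-in for exception_enter()/tracepoints: it may fault again and
 * clobber fake_cr2 before the handler has looked at it. */
static void instrumentation_hook(void)
{
        fake_cr2 = 0xdeadbeef;  /* a nested fault overwrites the register */
}

/* Buggy order: instrumentation runs before the register is sampled. */
static unsigned long handle_fault_late_read(void)
{
        instrumentation_hook();
        return fake_cr2;                /* reads the clobbered value */
}

/* Fixed order (what the patch enforces): sample first, then trace. */
static unsigned long handle_fault_early_read(void)
{
        unsigned long address = fake_cr2;       /* read_cr2() first */
        instrumentation_hook();
        return address;                         /* original value preserved */
}

int main(void)
{
        fake_cr2 = 0x1000;      /* the "real" faulting address */
        printf("late read:  %#lx\n", handle_fault_late_read());

        fake_cr2 = 0x1000;
        printf("early read: %#lx\n", handle_fault_early_read());
        return 0;
}

The late read prints 0xdeadbeef while the early read keeps 0x1000, which is why read_cr2() moved ahead of exception_enter() and why the functions are tagged notrace.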
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 81b2750f3666..27aa0455fab3 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -493,14 +493,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
                 struct numa_memblk *mb = &mi->blk[i];
                 memblock_set_node(mb->start, mb->end - mb->start,
                                   &memblock.memory, mb->nid);
-
-                /*
-                 * At this time, all memory regions reserved by memblock are
-                 * used by the kernel. Set the nid in memblock.reserved will
-                 * mark out all the nodes the kernel resides in.
-                 */
-                memblock_set_node(mb->start, mb->end - mb->start,
-                                  &memblock.reserved, mb->nid);
         }
 
         /*
@@ -565,10 +557,21 @@ static void __init numa_init_array(void)
 static void __init numa_clear_kernel_node_hotplug(void)
 {
         int i, nid;
-        nodemask_t numa_kernel_nodes;
+        nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
         unsigned long start, end;
         struct memblock_type *type = &memblock.reserved;
 
+        /*
+         * At this time, all memory regions reserved by memblock are
+         * used by the kernel. Set the nid in memblock.reserved will
+         * mark out all the nodes the kernel resides in.
+         */
+        for (i = 0; i < numa_meminfo.nr_blks; i++) {
+                struct numa_memblk *mb = &numa_meminfo.blk[i];
+                memblock_set_node(mb->start, mb->end - mb->start,
+                                  &memblock.reserved, mb->nid);
+        }
+
         /* Mark all kernel nodes. */
         for (i = 0; i < type->cnt; i++)
                 node_set(type->regions[i].nid, numa_kernel_nodes);
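
Note on the numa.c hunks: besides moving the memblock.reserved marking next to its only consumer, the patch initializes the on-stack mask with NODE_MASK_NONE so node_set() ORs bits into a known-empty mask rather than stack garbage. A minimal sketch of that accumulate-into-an-empty-mask pattern, with a plain unsigned long standing in for nodemask_t and made-up region data (not the kernel structures):

#include <stdio.h>

struct region { unsigned long start, end; int nid; };

/* Hypothetical data standing in for memblock.reserved / numa_meminfo. */
static const struct region reserved[] = {
        { 0x00000000, 0x10000000, 0 },  /* kernel image + early allocations */
        { 0x40000000, 0x40100000, 1 },  /* an initrd copy on node 1 */
};

int main(void)
{
        /* Like "nodemask_t mask = NODE_MASK_NONE": start from an empty mask.
         * An uninitialized stack mask could carry stale bits that would later
         * be treated as "kernel node" and wrongly lose their hotplug flag. */
        unsigned long kernel_nodes = 0;
        unsigned int i;

        for (i = 0; i < sizeof(reserved) / sizeof(reserved[0]); i++)
                kernel_nodes |= 1UL << reserved[i].nid; /* node_set() */

        for (i = 0; i < 8 * sizeof(kernel_nodes); i++)
                if (kernel_nodes & (1UL << i))
                        printf("node %u hosts kernel data; keep it non-hotpluggable\n", i);
        return 0;
}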
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 0342d27ca798..47b6436e41c2 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -52,6 +52,8 @@ void memory_present(int nid, unsigned long start, unsigned long end)
                         nid, start, end);
         printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
         printk(KERN_DEBUG " ");
+        start = round_down(start, PAGES_PER_SECTION);
+        end = round_up(end, PAGES_PER_SECTION);
         for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
                 physnode_map[pfn / PAGES_PER_SECTION] = nid;
                 printk(KERN_CONT "%lx ", pfn);
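
Note on the numa_32.c hunk: the loop advances one whole section per iteration, so with an unaligned start it can step past end before visiting the last section the range touches, leaving physnode_map entries unset. A small sketch of the miss, assuming an illustrative PAGES_PER_SECTION of 0x4000 pages (the real value depends on SECTION_SIZE_BITS) and simplified power-of-two round_down/round_up macros:

#include <stdio.h>

#define PAGES_PER_SECTION 0x4000UL                      /* illustrative value */
#define round_down(x, y)  ((x) & ~((y) - 1))            /* power-of-two y only */
#define round_up(x, y)    (((x) + (y) - 1) & ~((y) - 1))

/* Count how many distinct sections the loop body actually visits. */
static unsigned long sections_touched(unsigned long start, unsigned long end)
{
        unsigned long pfn, last = ~0UL, n = 0;

        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
                unsigned long sec = pfn / PAGES_PER_SECTION;
                if (sec != last) {
                        n++;
                        last = sec;
                }
        }
        return n;
}

int main(void)
{
        unsigned long start = 0x3f00, end = 0x8100;     /* spans sections 0..2 */

        printf("unaligned: %lu sections\n", sections_touched(start, end));
        printf("rounded:   %lu sections\n",
               sections_touched(round_down(start, PAGES_PER_SECTION),
                                round_up(end, PAGES_PER_SECTION)));
        return 0;
}

The unaligned walk visits only 2 sections even though the range touches 3; after rounding, all 3 get their physnode_map entry.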
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 1a25187e151e..1953e9c9391a 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -42,15 +42,25 @@ static __init inline int srat_disabled(void)
         return acpi_numa < 0;
 }
 
-/* Callback for SLIT parsing */
+/*
+ * Callback for SLIT parsing. pxm_to_node() returns NUMA_NO_NODE for
+ * I/O localities since SRAT does not list them. I/O localities are
+ * not supported at this point.
+ */
 void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
 {
         int i, j;
 
-        for (i = 0; i < slit->locality_count; i++)
-                for (j = 0; j < slit->locality_count; j++)
+        for (i = 0; i < slit->locality_count; i++) {
+                if (pxm_to_node(i) == NUMA_NO_NODE)
+                        continue;
+                for (j = 0; j < slit->locality_count; j++) {
+                        if (pxm_to_node(j) == NUMA_NO_NODE)
+                                continue;
                         numa_set_distance(pxm_to_node(i), pxm_to_node(j),
                                 slit->entry[slit->locality_count * i + j]);
+                }
+        }
 }
 
 /* Callback for Proximity Domain -> x2APIC mapping */
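
Note on the srat.c hunk: the SLIT is a flattened locality_count x locality_count matrix indexed as entry[count * i + j], and pxm_to_node() returns NUMA_NO_NODE (-1) for I/O-only proximity domains, which must not be fed into numa_set_distance(). A small sketch of the guarded walk; the 3-locality table, the pxm_to_node() stub and the printing numa_set_distance() are made up for illustration:

#include <stdio.h>

#define NUMA_NO_NODE (-1)

/* Hypothetical 3-locality SLIT: entry[count * i + j] is the distance
 * from locality i to locality j. Locality 2 is an I/O-only locality. */
static const int locality_count = 3;
static const unsigned char entry[] = {
        10,  20, 255,
        20,  10, 255,
        255, 255, 10,
};

/* Stub: PXM 2 has no memory/CPU node behind it. */
static int pxm_to_node(int pxm)
{
        return pxm == 2 ? NUMA_NO_NODE : pxm;
}

static void numa_set_distance(int from, int to, int distance)
{
        printf("distance %d -> %d = %d\n", from, to, distance);
}

int main(void)
{
        int i, j;

        for (i = 0; i < locality_count; i++) {
                if (pxm_to_node(i) == NUMA_NO_NODE)
                        continue;       /* I/O locality: no node to set */
                for (j = 0; j < locality_count; j++) {
                        if (pxm_to_node(j) == NUMA_NO_NODE)
                                continue;
                        numa_set_distance(pxm_to_node(i), pxm_to_node(j),
                                          entry[locality_count * i + j]);
                }
        }
        return 0;
}

Only the distances between localities 0 and 1 are recorded; without the guard, rows and columns for the I/O locality would be pushed into the distance table under node index -1.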
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ae699b3bbac8..dd8dda167a24 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -103,7 +103,7 @@ static void flush_tlb_func(void *info)
         if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
                 return;
 
-        count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
         if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
                 if (f->flush_end == TLB_FLUSH_ALL)
                         local_flush_tlb();
@@ -131,7 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
         info.flush_start = start;
         info.flush_end = end;
 
-        count_vm_event(NR_TLB_REMOTE_FLUSH);
+        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
         if (is_uv_system()) {
                 unsigned int cpu;
 
@@ -151,44 +151,19 @@ void flush_tlb_current_task(void)
 
         preempt_disable();
 
-        count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+        count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
         local_flush_tlb();
         if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
                 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
         preempt_enable();
 }
 
-/*
- * It can find out the THP large page, or
- * HUGETLB page in tlb_flush when THP disabled
- */
-static inline unsigned long has_large_page(struct mm_struct *mm,
-                                unsigned long start, unsigned long end)
-{
-        pgd_t *pgd;
-        pud_t *pud;
-        pmd_t *pmd;
-        unsigned long addr = ALIGN(start, HPAGE_SIZE);
-        for (; addr < end; addr += HPAGE_SIZE) {
-                pgd = pgd_offset(mm, addr);
-                if (likely(!pgd_none(*pgd))) {
-                        pud = pud_offset(pgd, addr);
-                        if (likely(!pud_none(*pud))) {
-                                pmd = pmd_offset(pud, addr);
-                                if (likely(!pmd_none(*pmd)))
-                                        if (pmd_large(*pmd))
-                                                return addr;
-                        }
-                }
-        }
-        return 0;
-}
-
 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
                                 unsigned long end, unsigned long vmflag)
 {
         unsigned long addr;
         unsigned act_entries, tlb_entries = 0;
+        unsigned long nr_base_pages;
 
         preempt_disable();
         if (current->active_mm != mm)
@@ -210,21 +185,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
                 tlb_entries = tlb_lli_4k[ENTRIES];
         else
                 tlb_entries = tlb_lld_4k[ENTRIES];
+
         /* Assume all of TLB entries was occupied by this task */
-        act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
+        act_entries = tlb_entries >> tlb_flushall_shift;
+        act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
+        nr_base_pages = (end - start) >> PAGE_SHIFT;
 
         /* tlb_flushall_shift is on balance point, details in commit log */
-        if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) {
-                count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+        if (nr_base_pages > act_entries) {
+                count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
                 local_flush_tlb();
         } else {
-                if (has_large_page(mm, start, end)) {
-                        local_flush_tlb();
-                        goto flush_all;
-                }
                 /* flush range by one by one 'invlpg' */
                 for (addr = start; addr < end; addr += PAGE_SIZE) {
-                        count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
+                        count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
                         __flush_tlb_single(addr);
                 }
 
@@ -262,7 +236,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
 
 static void do_flush_tlb_all(void *info)
 {
-        count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
         __flush_tlb_all();
         if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
                 leave_mm(smp_processor_id());
@@ -270,7 +244,7 @@ static void do_flush_tlb_all(void *info)
 
 void flush_tlb_all(void)
 {
-        count_vm_event(NR_TLB_REMOTE_FLUSH);
+        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
         on_each_cpu(do_flush_tlb_all, NULL, 1);
 }
 
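
Note on the flush_tlb_mm_range() hunks: the flush decision now compares the number of 4K pages in the range against act_entries, which is the smaller of mm->total_vm and tlb_entries >> tlb_flushall_shift. A worked sketch of that arithmetic, assuming a hypothetical 512-entry TLB and a shift of 2 (so anything above 128 pages takes the full-flush path); wants_full_flush() and the constants are illustrative, not values from any real CPU table:

#include <stdio.h>

#define PAGE_SHIFT 12

/* Hypothetical hardware/tuning values. */
static unsigned long tlb_entries = 512;
static int tlb_flushall_shift = 2;

/* Returns 1 for a full TLB flush, 0 for per-page invlpg flushes. */
static int wants_full_flush(unsigned long total_vm_pages,
                            unsigned long start, unsigned long end)
{
        unsigned long act_entries, nr_base_pages;

        act_entries = tlb_entries >> tlb_flushall_shift;        /* 512 >> 2 = 128 */
        act_entries = total_vm_pages > act_entries ? act_entries : total_vm_pages;
        nr_base_pages = (end - start) >> PAGE_SHIFT;

        return nr_base_pages > act_entries;
}

int main(void)
{
        /* 256 KiB range: 64 pages <= 128, so flush page by page. */
        printf("256 KiB: %s\n",
               wants_full_flush(100000, 0, 256 << 10) ? "full" : "per-page");
        /* 1 MiB range: 256 pages > 128, cheaper to flush everything. */
        printf("1 MiB:   %s\n",
               wants_full_flush(100000, 0, 1 << 20) ? "full" : "per-page");
        return 0;
}

With the has_large_page() special case gone, this single threshold is the only thing that decides between invlpg loops and a full flush.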