Diffstat (limited to 'arch/x86/mm')

-rw-r--r--  arch/x86/mm/fault.c   | 61
-rw-r--r--  arch/x86/mm/numa.c    | 21
-rw-r--r--  arch/x86/mm/numa_32.c |  2
-rw-r--r--  arch/x86/mm/srat.c    | 16
-rw-r--r--  arch/x86/mm/tlb.c     | 52

5 files changed, 82 insertions, 70 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9d591c895803..a10c8c792161 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1001,6 +1001,12 @@ static int fault_in_kernel_space(unsigned long address)
 
 static inline bool smap_violation(int error_code, struct pt_regs *regs)
 {
+        if (!IS_ENABLED(CONFIG_X86_SMAP))
+                return false;
+
+        if (!static_cpu_has(X86_FEATURE_SMAP))
+                return false;
+
         if (error_code & PF_USER)
                 return false;
 
@@ -1014,13 +1020,17 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
  * This routine handles page faults. It determines the address,
  * and the problem, and then passes it off to one of the appropriate
  * routines.
+ *
+ * This function must have noinline because both callers
+ * {,trace_}do_page_fault() have notrace on. Having this an actual function
+ * guarantees there's a function trace entry.
  */
-static void __kprobes
-__do_page_fault(struct pt_regs *regs, unsigned long error_code)
+static void __kprobes noinline
+__do_page_fault(struct pt_regs *regs, unsigned long error_code,
+                unsigned long address)
 {
         struct vm_area_struct *vma;
         struct task_struct *tsk;
-        unsigned long address;
         struct mm_struct *mm;
         int fault;
         unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
@@ -1028,9 +1038,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
         tsk = current;
         mm = tsk->mm;
 
-        /* Get the faulting address: */
-        address = read_cr2();
-
         /*
          * Detect and handle instructions that would cause a page fault for
          * both a tracked kernel page and a userspace page.
@@ -1087,11 +1094,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
         if (unlikely(error_code & PF_RSVD))
                 pgtable_bad(regs, error_code, address);
 
-        if (static_cpu_has(X86_FEATURE_SMAP)) {
-                if (unlikely(smap_violation(error_code, regs))) {
-                        bad_area_nosemaphore(regs, error_code, address);
-                        return;
-                }
+        if (unlikely(smap_violation(error_code, regs))) {
+                bad_area_nosemaphore(regs, error_code, address);
+                return;
         }
 
         /*
@@ -1244,32 +1249,50 @@ good_area:
         up_read(&mm->mmap_sem);
 }
 
-dotraplinkage void __kprobes
+dotraplinkage void __kprobes notrace
 do_page_fault(struct pt_regs *regs, unsigned long error_code)
 {
+        unsigned long address = read_cr2(); /* Get the faulting address */
         enum ctx_state prev_state;
 
+        /*
+         * We must have this function tagged with __kprobes, notrace and call
+         * read_cr2() before calling anything else. To avoid calling any kind
+         * of tracing machinery before we've observed the CR2 value.
+         *
+         * exception_{enter,exit}() contain all sorts of tracepoints.
+         */
+
         prev_state = exception_enter();
-        __do_page_fault(regs, error_code);
+        __do_page_fault(regs, error_code, address);
         exception_exit(prev_state);
 }
 
-static void trace_page_fault_entries(struct pt_regs *regs,
+#ifdef CONFIG_TRACING
+static void trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
                                      unsigned long error_code)
 {
         if (user_mode(regs))
-                trace_page_fault_user(read_cr2(), regs, error_code);
+                trace_page_fault_user(address, regs, error_code);
         else
-                trace_page_fault_kernel(read_cr2(), regs, error_code);
+                trace_page_fault_kernel(address, regs, error_code);
 }
 
-dotraplinkage void __kprobes
+dotraplinkage void __kprobes notrace
 trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
 {
+        /*
+         * The exception_enter and tracepoint processing could
+         * trigger another page faults (user space callchain
+         * reading) and destroy the original cr2 value, so read
+         * the faulting address now.
+         */
+        unsigned long address = read_cr2();
         enum ctx_state prev_state;
 
         prev_state = exception_enter();
-        trace_page_fault_entries(regs, error_code);
-        __do_page_fault(regs, error_code);
+        trace_page_fault_entries(address, regs, error_code);
+        __do_page_fault(regs, error_code, address);
         exception_exit(prev_state);
 }
+#endif /* CONFIG_TRACING */
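
The fault.c hunks above are all about sampling CR2 before exception_enter() or any tracepoint can run and possibly fault again. Below is a toy userspace sketch of that ordering hazard; it is not kernel code, and names such as fake_cr2 and tracepoint_that_faults() are invented purely for illustration.

/* Toy model of why the faulting address must be read before any tracing
 * hook runs: a hook that itself faults overwrites the single CR2-like
 * slot, so reading it late returns the wrong address. */
#include <stdio.h>

static unsigned long fake_cr2;                  /* stands in for the CR2 register */

static unsigned long read_fake_cr2(void) { return fake_cr2; }

/* A tracing hook that happens to touch user memory and fault again,
 * clobbering the saved address (e.g. user-space callchain unwinding). */
static void tracepoint_that_faults(void)
{
        fake_cr2 = 0xdeadbeef;                  /* nested fault rewrites CR2 */
}

static void handle_fault_late_read(void)
{
        tracepoint_that_faults();
        printf("late read:  %#lx (wrong)\n", read_fake_cr2());
}

static void handle_fault_early_read(void)
{
        unsigned long address = read_fake_cr2();  /* grab CR2 first, as the patch does */
        tracepoint_that_faults();
        printf("early read: %#lx (correct)\n", address);
}

int main(void)
{
        fake_cr2 = 0x1000;                      /* pretend a fault at 0x1000 occurred */
        handle_fault_late_read();
        fake_cr2 = 0x1000;
        handle_fault_early_read();
        return 0;
}

Running it prints the clobbered value for the late read and the preserved 0x1000 for the early read, which is the behaviour the notrace/read-CR2-first arrangement above is guarding.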
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 81b2750f3666..27aa0455fab3 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -493,14 +493,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
                 struct numa_memblk *mb = &mi->blk[i];
                 memblock_set_node(mb->start, mb->end - mb->start,
                                   &memblock.memory, mb->nid);
-
-                /*
-                 * At this time, all memory regions reserved by memblock are
-                 * used by the kernel. Set the nid in memblock.reserved will
-                 * mark out all the nodes the kernel resides in.
-                 */
-                memblock_set_node(mb->start, mb->end - mb->start,
-                                  &memblock.reserved, mb->nid);
         }
 
         /*
@@ -565,10 +557,21 @@ static void __init numa_init_array(void)
 static void __init numa_clear_kernel_node_hotplug(void)
 {
         int i, nid;
-        nodemask_t numa_kernel_nodes;
+        nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
         unsigned long start, end;
         struct memblock_type *type = &memblock.reserved;
 
+        /*
+         * At this time, all memory regions reserved by memblock are
+         * used by the kernel. Set the nid in memblock.reserved will
+         * mark out all the nodes the kernel resides in.
+         */
+        for (i = 0; i < numa_meminfo.nr_blks; i++) {
+                struct numa_memblk *mb = &numa_meminfo.blk[i];
+                memblock_set_node(mb->start, mb->end - mb->start,
+                                  &memblock.reserved, mb->nid);
+        }
+
         /* Mark all kernel nodes. */
         for (i = 0; i < type->cnt; i++)
                 node_set(type->regions[i].nid, numa_kernel_nodes);
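
The numa.c change moves the memblock.reserved node marking into numa_clear_kernel_node_hotplug() and starts from an explicitly empty nodemask (NODE_MASK_NONE). A minimal userspace sketch of that idea follows; the region data is invented and a plain bitmask stands in for nodemask_t.

/* Sketch: build the set of nodes that host kernel/reserved memory,
 * starting from an empty mask so no stale bits leak in. */
#include <stdio.h>

#define MAX_NODES 8

struct region { unsigned long start, end; int nid; };

int main(void)
{
        /* pretend these are memblock.reserved regions with node ids set */
        struct region reserved[] = {
                { 0x00000000UL, 0x10000000UL, 0 },
                { 0x40000000UL, 0x48000000UL, 2 },
        };
        unsigned long kernel_nodes = 0;        /* analogue of NODE_MASK_NONE */
        unsigned int i;

        for (i = 0; i < sizeof(reserved) / sizeof(reserved[0]); i++)
                kernel_nodes |= 1UL << reserved[i].nid;

        for (i = 0; i < MAX_NODES; i++)
                if (kernel_nodes & (1UL << i))
                        printf("node %u hosts kernel memory; keep it non-hotpluggable\n", i);
        return 0;
}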
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 0342d27ca798..47b6436e41c2 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -52,6 +52,8 @@ void memory_present(int nid, unsigned long start, unsigned long end)
                         nid, start, end);
         printk(KERN_DEBUG "  Setting physnode_map array to node %d for pfns:\n", nid);
         printk(KERN_DEBUG "  ");
+        start = round_down(start, PAGES_PER_SECTION);
+        end = round_up(end, PAGES_PER_SECTION);
         for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
                 physnode_map[pfn / PAGES_PER_SECTION] = nid;
                 printk(KERN_CONT "%lx ", pfn);
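
The numa_32.c hunk rounds the pfn range out to section boundaries so that a range starting or ending inside a section still marks a physnode_map[] slot for the partial sections. A small standalone sketch of the rounding arithmetic, with an illustrative (not the real) PAGES_PER_SECTION value and power-of-two alignment assumed:

/* Sketch of the section rounding: widen [start, end) outward to full
 * sections, then index one map slot per section. */
#include <stdio.h>

#define PAGES_PER_SECTION 0x40000UL            /* illustrative value only */

static unsigned long round_down_pfn(unsigned long x, unsigned long a) { return x & ~(a - 1); }
static unsigned long round_up_pfn(unsigned long x, unsigned long a) { return (x + a - 1) & ~(a - 1); }

int main(void)
{
        unsigned long start = 0x50000, end = 0xb0000, pfn;   /* unaligned range */

        start = round_down_pfn(start, PAGES_PER_SECTION);    /* -> 0x40000 */
        end = round_up_pfn(end, PAGES_PER_SECTION);          /* -> 0xc0000 */

        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                printf("section index %lu covers pfn %#lx\n",
                       pfn / PAGES_PER_SECTION, pfn);
        return 0;
}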
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 1a25187e151e..1953e9c9391a 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -42,15 +42,25 @@ static __init inline int srat_disabled(void)
         return acpi_numa < 0;
 }
 
-/* Callback for SLIT parsing */
+/*
+ * Callback for SLIT parsing.  pxm_to_node() returns NUMA_NO_NODE for
+ * I/O localities since SRAT does not list them.  I/O localities are
+ * not supported at this point.
+ */
 void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
 {
         int i, j;
 
-        for (i = 0; i < slit->locality_count; i++)
-                for (j = 0; j < slit->locality_count; j++)
+        for (i = 0; i < slit->locality_count; i++) {
+                if (pxm_to_node(i) == NUMA_NO_NODE)
+                        continue;
+                for (j = 0; j < slit->locality_count; j++) {
+                        if (pxm_to_node(j) == NUMA_NO_NODE)
+                                continue;
                         numa_set_distance(pxm_to_node(i), pxm_to_node(j),
                                 slit->entry[slit->locality_count * i + j]);
+                }
+        }
 }
 
 /* Callback for Proximity Domain -> x2APIC mapping */
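
The srat.c loop reads the SLIT as a flattened row-major matrix, entry[locality_count * i + j], and now skips proximity domains that map to NUMA_NO_NODE (I/O localities). A self-contained sketch with made-up distances and a stub pxm_to_node():

/* Sketch: walk a flattened locality-distance matrix and skip entries
 * whose proximity domain has no NUMA node behind it. */
#include <stdio.h>

#define NUMA_NO_NODE    (-1)
#define LOCALITY_COUNT  3

/* pretend PXM 2 is an I/O locality with no memory/CPU node behind it */
static int pxm_to_node_stub(int pxm)
{
        static const int map[LOCALITY_COUNT] = { 0, 1, NUMA_NO_NODE };
        return map[pxm];
}

int main(void)
{
        /* row-major distances, entry[count * i + j], as in the SLIT table */
        static const unsigned char entry[LOCALITY_COUNT * LOCALITY_COUNT] = {
                 10,  20, 255,
                 20,  10, 255,
                255, 255,  10,
        };
        int i, j;

        for (i = 0; i < LOCALITY_COUNT; i++) {
                if (pxm_to_node_stub(i) == NUMA_NO_NODE)
                        continue;
                for (j = 0; j < LOCALITY_COUNT; j++) {
                        if (pxm_to_node_stub(j) == NUMA_NO_NODE)
                                continue;
                        printf("distance node %d -> node %d = %d\n",
                               pxm_to_node_stub(i), pxm_to_node_stub(j),
                               entry[LOCALITY_COUNT * i + j]);
                }
        }
        return 0;
}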
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ae699b3bbac8..dd8dda167a24 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -103,7 +103,7 @@ static void flush_tlb_func(void *info)
         if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
                 return;
 
-        count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
         if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
                 if (f->flush_end == TLB_FLUSH_ALL)
                         local_flush_tlb();
@@ -131,7 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
         info.flush_start = start;
         info.flush_end = end;
 
-        count_vm_event(NR_TLB_REMOTE_FLUSH);
+        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
         if (is_uv_system()) {
                 unsigned int cpu;
 
@@ -151,44 +151,19 @@ void flush_tlb_current_task(void)
 
         preempt_disable();
 
-        count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+        count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
         local_flush_tlb();
         if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
                 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
         preempt_enable();
 }
 
-/*
- * It can find out the THP large page, or
- * HUGETLB page in tlb_flush when THP disabled
- */
-static inline unsigned long has_large_page(struct mm_struct *mm,
-                                 unsigned long start, unsigned long end)
-{
-        pgd_t *pgd;
-        pud_t *pud;
-        pmd_t *pmd;
-        unsigned long addr = ALIGN(start, HPAGE_SIZE);
-        for (; addr < end; addr += HPAGE_SIZE) {
-                pgd = pgd_offset(mm, addr);
-                if (likely(!pgd_none(*pgd))) {
-                        pud = pud_offset(pgd, addr);
-                        if (likely(!pud_none(*pud))) {
-                                pmd = pmd_offset(pud, addr);
-                                if (likely(!pmd_none(*pmd)))
-                                        if (pmd_large(*pmd))
-                                                return addr;
-                        }
-                }
-        }
-        return 0;
-}
-
 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
                                 unsigned long end, unsigned long vmflag)
 {
         unsigned long addr;
         unsigned act_entries, tlb_entries = 0;
+        unsigned long nr_base_pages;
 
         preempt_disable();
         if (current->active_mm != mm)
@@ -210,21 +185,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
                 tlb_entries = tlb_lli_4k[ENTRIES];
         else
                 tlb_entries = tlb_lld_4k[ENTRIES];
+
         /* Assume all of TLB entries was occupied by this task */
-        act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
+        act_entries = tlb_entries >> tlb_flushall_shift;
+        act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
+        nr_base_pages = (end - start) >> PAGE_SHIFT;
 
         /* tlb_flushall_shift is on balance point, details in commit log */
-        if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) {
-                count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+        if (nr_base_pages > act_entries) {
+                count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
                 local_flush_tlb();
         } else {
-                if (has_large_page(mm, start, end)) {
-                        local_flush_tlb();
-                        goto flush_all;
-                }
                 /* flush range by one by one 'invlpg' */
                 for (addr = start; addr < end; addr += PAGE_SIZE) {
-                        count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
+                        count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
                         __flush_tlb_single(addr);
                 }
 
@@ -262,7 +236,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
 
 static void do_flush_tlb_all(void *info)
 {
-        count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
         __flush_tlb_all();
         if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
                 leave_mm(smp_processor_id());
@@ -270,7 +244,7 @@ static void do_flush_tlb_all(void *info)
 
 void flush_tlb_all(void)
 {
-        count_vm_event(NR_TLB_REMOTE_FLUSH);
+        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
         on_each_cpu(do_flush_tlb_all, NULL, 1);
 }
 
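
The flush_tlb_mm_range() rework changes the flush-all-versus-invlpg heuristic: the TLB size is shifted by tlb_flushall_shift first, capped by the task's mapped pages, and then compared against the number of 4K pages in the range. A rough userspace sketch of that arithmetic, with all numbers invented:

/* Sketch of the rebalanced heuristic from the hunk above. */
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        unsigned long tlb_entries = 512;        /* e.g. 4K DTLB entries */
        unsigned long tlb_flushall_shift = 5;   /* per-CPU tuning value */
        unsigned long total_vm = 4096;          /* task's mapped pages */
        unsigned long start = 0x400000, end = 0x450000;

        unsigned long act_entries = tlb_entries >> tlb_flushall_shift;
        unsigned long nr_base_pages = (end - start) >> PAGE_SHIFT;

        if (total_vm < act_entries)
                act_entries = total_vm;

        if (nr_base_pages > act_entries)
                printf("flush the whole TLB (%lu pages > %lu entries)\n",
                       nr_base_pages, act_entries);
        else
                printf("flush %lu pages one by one with invlpg\n", nr_base_pages);
        return 0;
}

With these values, 80 pages exceed the 16-entry budget, so the sketch chooses the full flush, mirroring the nr_base_pages > act_entries test in the patched code.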