Diffstat (limited to 'arch/x86/mm')
 -rw-r--r--  arch/x86/mm/fault.c   | 61
 -rw-r--r--  arch/x86/mm/numa.c    | 21
 -rw-r--r--  arch/x86/mm/numa_32.c |  2
 -rw-r--r--  arch/x86/mm/srat.c    | 16
 -rw-r--r--  arch/x86/mm/tlb.c     | 52
 5 files changed, 82 insertions(+), 70 deletions(-)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9d591c895803..a10c8c792161 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1001,6 +1001,12 @@ static int fault_in_kernel_space(unsigned long address)
 
 static inline bool smap_violation(int error_code, struct pt_regs *regs)
 {
+        if (!IS_ENABLED(CONFIG_X86_SMAP))
+                return false;
+
+        if (!static_cpu_has(X86_FEATURE_SMAP))
+                return false;
+
         if (error_code & PF_USER)
                 return false;
 
@@ -1014,13 +1020,17 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
  * This routine handles page faults. It determines the address,
  * and the problem, and then passes it off to one of the appropriate
  * routines.
+ *
+ * This function must have noinline because both callers
+ * {,trace_}do_page_fault() have notrace on. Having this an actual function
+ * guarantees there's a function trace entry.
  */
-static void __kprobes
-__do_page_fault(struct pt_regs *regs, unsigned long error_code)
+static void __kprobes noinline
+__do_page_fault(struct pt_regs *regs, unsigned long error_code,
+                unsigned long address)
 {
         struct vm_area_struct *vma;
         struct task_struct *tsk;
-        unsigned long address;
         struct mm_struct *mm;
         int fault;
         unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
@@ -1028,9 +1038,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
         tsk = current;
         mm = tsk->mm;
 
-        /* Get the faulting address: */
-        address = read_cr2();
-
         /*
          * Detect and handle instructions that would cause a page fault for
          * both a tracked kernel page and a userspace page.
@@ -1087,11 +1094,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
         if (unlikely(error_code & PF_RSVD))
                 pgtable_bad(regs, error_code, address);
 
-        if (static_cpu_has(X86_FEATURE_SMAP)) {
-                if (unlikely(smap_violation(error_code, regs))) {
-                        bad_area_nosemaphore(regs, error_code, address);
-                        return;
-                }
+        if (unlikely(smap_violation(error_code, regs))) {
+                bad_area_nosemaphore(regs, error_code, address);
+                return;
         }
 
         /*
@@ -1244,32 +1249,50 @@ good_area:
         up_read(&mm->mmap_sem);
 }
 
-dotraplinkage void __kprobes
+dotraplinkage void __kprobes notrace
 do_page_fault(struct pt_regs *regs, unsigned long error_code)
 {
+        unsigned long address = read_cr2(); /* Get the faulting address */
         enum ctx_state prev_state;
 
+        /*
+         * We must have this function tagged with __kprobes, notrace and call
+         * read_cr2() before calling anything else. To avoid calling any kind
+         * of tracing machinery before we've observed the CR2 value.
+         *
+         * exception_{enter,exit}() contain all sorts of tracepoints.
+         */
+
         prev_state = exception_enter();
-        __do_page_fault(regs, error_code);
+        __do_page_fault(regs, error_code, address);
         exception_exit(prev_state);
 }
 
-static void trace_page_fault_entries(struct pt_regs *regs,
+#ifdef CONFIG_TRACING
+static void trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
         unsigned long error_code)
 {
         if (user_mode(regs))
-                trace_page_fault_user(read_cr2(), regs, error_code);
+                trace_page_fault_user(address, regs, error_code);
         else
-                trace_page_fault_kernel(read_cr2(), regs, error_code);
+                trace_page_fault_kernel(address, regs, error_code);
 }
 
-dotraplinkage void __kprobes
+dotraplinkage void __kprobes notrace
 trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
 {
+        /*
+         * The exception_enter and tracepoint processing could
+         * trigger another page faults (user space callchain
+         * reading) and destroy the original cr2 value, so read
+         * the faulting address now.
+         */
+        unsigned long address = read_cr2();
         enum ctx_state prev_state;
 
         prev_state = exception_enter();
-        trace_page_fault_entries(regs, error_code);
-        __do_page_fault(regs, error_code);
+        trace_page_fault_entries(address, regs, error_code);
+        __do_page_fault(regs, error_code, address);
         exception_exit(prev_state);
 }
+#endif /* CONFIG_TRACING */
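
The fault.c changes above all serve one ordering rule: latch CR2 before exception_enter() or any tracepoint runs, because that machinery can itself take a page fault (for example while reading a user-space callchain, as the added comment says) and overwrite CR2. Below is a stand-alone toy model of that hazard; none of the identifiers in it are kernel APIs, they only mimic the ordering.

#include <stdio.h>

/* Toy model only: fake_cr2 stands in for the CR2 register. */
static unsigned long fake_cr2;

static unsigned long read_cr2_model(void) { return fake_cr2; }

/* Models a tracepoint/exception_enter() path that itself faults
 * and therefore overwrites CR2 before the handler reads it. */
static void tracing_machinery(void)
{
        fake_cr2 = 0xdeadbeef;
}

static void handle_fault(unsigned long address)
{
        printf("handling fault at %#lx\n", address);
}

int main(void)
{
        unsigned long address;

        /* Broken ordering: tracing runs first and clobbers CR2. */
        fake_cr2 = 0x12345000;                  /* the original faulting address */
        tracing_machinery();
        handle_fault(read_cr2_model());         /* prints 0xdeadbeef - wrong */

        /* Fixed ordering: latch the address before any tracing, as the patch does. */
        fake_cr2 = 0x12345000;
        address = read_cr2_model();
        tracing_machinery();
        handle_fault(address);                  /* prints 0x12345000 - correct */
        return 0;
}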
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 81b2750f3666..27aa0455fab3 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -493,14 +493,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
                 struct numa_memblk *mb = &mi->blk[i];
                 memblock_set_node(mb->start, mb->end - mb->start,
                                   &memblock.memory, mb->nid);
-
-                /*
-                 * At this time, all memory regions reserved by memblock are
-                 * used by the kernel. Set the nid in memblock.reserved will
-                 * mark out all the nodes the kernel resides in.
-                 */
-                memblock_set_node(mb->start, mb->end - mb->start,
-                                  &memblock.reserved, mb->nid);
         }
 
         /*
@@ -565,10 +557,21 @@ static void __init numa_init_array(void)
 static void __init numa_clear_kernel_node_hotplug(void)
 {
         int i, nid;
-        nodemask_t numa_kernel_nodes;
+        nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
         unsigned long start, end;
         struct memblock_type *type = &memblock.reserved;
 
+        /*
+         * At this time, all memory regions reserved by memblock are
+         * used by the kernel. Set the nid in memblock.reserved will
+         * mark out all the nodes the kernel resides in.
+         */
+        for (i = 0; i < numa_meminfo.nr_blks; i++) {
+                struct numa_memblk *mb = &numa_meminfo.blk[i];
+                memblock_set_node(mb->start, mb->end - mb->start,
+                                  &memblock.reserved, mb->nid);
+        }
+
         /* Mark all kernel nodes. */
         for (i = 0; i < type->cnt; i++)
                 node_set(type->regions[i].nid, numa_kernel_nodes);
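
Besides moving the memblock.reserved nid tagging into numa_clear_kernel_node_hotplug(), the hunk above initialises the on-stack nodemask with NODE_MASK_NONE. A small user-space sketch of why that matters follows; the bitmask here is a toy, not the kernel's nodemask_t, and the "stale stack contents" value is invented for illustration.

#include <stdio.h>
#include <string.h>

/* Toy nodemask: one bit per node, normally an on-stack variable. */
struct toy_nodemask { unsigned char bits; };

static void toy_node_set(int nid, struct toy_nodemask *m) { m->bits |= 1u << nid; }
static int toy_node_isset(int nid, const struct toy_nodemask *m) { return (m->bits >> nid) & 1; }

static void mark_kernel_nodes(struct toy_nodemask *mask, int initialize)
{
        if (initialize)
                memset(mask, 0, sizeof(*mask)); /* the NODE_MASK_NONE equivalent */
        toy_node_set(0, mask);                  /* pretend only node 0 holds kernel memory */
}

int main(void)
{
        /* Simulate stale stack contents left behind by an earlier call. */
        struct toy_nodemask mask = { .bits = 0x06 };

        mark_kernel_nodes(&mask, 0);
        printf("uninitialised: node 1 wrongly flagged = %d\n", toy_node_isset(1, &mask));

        mask.bits = 0x06;
        mark_kernel_nodes(&mask, 1);
        printf("initialised:   node 1 wrongly flagged = %d\n", toy_node_isset(1, &mask));
        return 0;
}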
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 0342d27ca798..47b6436e41c2 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -52,6 +52,8 @@ void memory_present(int nid, unsigned long start, unsigned long end)
                         nid, start, end);
         printk(KERN_DEBUG "  Setting physnode_map array to node %d for pfns:\n", nid);
         printk(KERN_DEBUG "  ");
+        start = round_down(start, PAGES_PER_SECTION);
+        end = round_up(end, PAGES_PER_SECTION);
         for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
                 physnode_map[pfn / PAGES_PER_SECTION] = nid;
                 printk(KERN_CONT "%lx ", pfn);
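
The two added round_down()/round_up() lines make memory_present() walk whole sections: stepping by PAGES_PER_SECTION from an unaligned start can stop before the section that contains end - 1, so that section's physnode_map slot is never set. A stand-alone sketch of the arithmetic is below; the section size and pfn range are made-up values, not taken from any real configuration.

#include <stdio.h>

/* Illustration only: assume 0x1000 pfns per section. */
#define PAGES_PER_SECTION 0x1000UL

static unsigned long round_down_pfn(unsigned long x) { return x & ~(PAGES_PER_SECTION - 1); }
static unsigned long round_up_pfn(unsigned long x)   { return round_down_pfn(x + PAGES_PER_SECTION - 1); }

int main(void)
{
        unsigned long start = 0x0800, end = 0x1800;     /* hypothetical node pfn range */
        unsigned long pfn;

        printf("unaligned walk marks sections:");
        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                printf(" %lu", pfn / PAGES_PER_SECTION); /* only section 0 */

        printf("\naligned walk marks sections:  ");
        for (pfn = round_down_pfn(start); pfn < round_up_pfn(end); pfn += PAGES_PER_SECTION)
                printf(" %lu", pfn / PAGES_PER_SECTION); /* sections 0 and 1 */
        printf("\n");
        return 0;
}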
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 1a25187e151e..1953e9c9391a 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -42,15 +42,25 @@ static __init inline int srat_disabled(void)
         return acpi_numa < 0;
 }
 
-/* Callback for SLIT parsing */
+/*
+ * Callback for SLIT parsing. pxm_to_node() returns NUMA_NO_NODE for
+ * I/O localities since SRAT does not list them. I/O localities are
+ * not supported at this point.
+ */
 void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
 {
         int i, j;
 
-        for (i = 0; i < slit->locality_count; i++)
-                for (j = 0; j < slit->locality_count; j++)
+        for (i = 0; i < slit->locality_count; i++) {
+                if (pxm_to_node(i) == NUMA_NO_NODE)
+                        continue;
+                for (j = 0; j < slit->locality_count; j++) {
+                        if (pxm_to_node(j) == NUMA_NO_NODE)
+                                continue;
                         numa_set_distance(pxm_to_node(i), pxm_to_node(j),
                                 slit->entry[slit->locality_count * i + j]);
+                }
+        }
 }
 
 /* Callback for Proximity Domain -> x2APIC mapping */
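
With the guards above, any SLIT row or column whose proximity domain has no NUMA node (an I/O locality) is skipped rather than being handed to numa_set_distance() as NUMA_NO_NODE. The snippet below models that filtering in plain user-space C; the pxm-to-node table and distance matrix are invented for illustration, and printf() stands in for numa_set_distance().

#include <stdio.h>

#define NUMA_NO_NODE    -1
#define LOCALITY_COUNT  3

/* Hypothetical mapping: pxm 2 is an I/O locality with no NUMA node. */
static const int pxm_to_node_tbl[LOCALITY_COUNT] = { 0, 1, NUMA_NO_NODE };

/* Hypothetical SLIT entries, row-major, indexed locality_count * i + j. */
static const unsigned char slit_entry[LOCALITY_COUNT * LOCALITY_COUNT] = {
        10, 20, 30,
        20, 10, 30,
        30, 30, 10,
};

int main(void)
{
        int i, j;

        for (i = 0; i < LOCALITY_COUNT; i++) {
                if (pxm_to_node_tbl[i] == NUMA_NO_NODE)
                        continue;
                for (j = 0; j < LOCALITY_COUNT; j++) {
                        if (pxm_to_node_tbl[j] == NUMA_NO_NODE)
                                continue;
                        /* Stand-in for numa_set_distance(node_i, node_j, dist). */
                        printf("numa_set_distance(%d, %d, %u)\n",
                               pxm_to_node_tbl[i], pxm_to_node_tbl[j],
                               slit_entry[LOCALITY_COUNT * i + j]);
                }
        }
        return 0;
}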
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ae699b3bbac8..dd8dda167a24 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -103,7 +103,7 @@ static void flush_tlb_func(void *info)
         if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
                 return;
 
-        count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
         if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
                 if (f->flush_end == TLB_FLUSH_ALL)
                         local_flush_tlb();
@@ -131,7 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
         info.flush_start = start;
         info.flush_end = end;
 
-        count_vm_event(NR_TLB_REMOTE_FLUSH);
+        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
         if (is_uv_system()) {
                 unsigned int cpu;
 
@@ -151,44 +151,19 @@ void flush_tlb_current_task(void)
 
         preempt_disable();
 
-        count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+        count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
         local_flush_tlb();
         if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
                 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
         preempt_enable();
 }
 
-/*
- * It can find out the THP large page, or
- * HUGETLB page in tlb_flush when THP disabled
- */
-static inline unsigned long has_large_page(struct mm_struct *mm,
-                                unsigned long start, unsigned long end)
-{
-        pgd_t *pgd;
-        pud_t *pud;
-        pmd_t *pmd;
-        unsigned long addr = ALIGN(start, HPAGE_SIZE);
-        for (; addr < end; addr += HPAGE_SIZE) {
-                pgd = pgd_offset(mm, addr);
-                if (likely(!pgd_none(*pgd))) {
-                        pud = pud_offset(pgd, addr);
-                        if (likely(!pud_none(*pud))) {
-                                pmd = pmd_offset(pud, addr);
-                                if (likely(!pmd_none(*pmd)))
-                                        if (pmd_large(*pmd))
-                                                return addr;
-                        }
-                }
-        }
-        return 0;
-}
-
 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
                                 unsigned long end, unsigned long vmflag)
 {
         unsigned long addr;
         unsigned act_entries, tlb_entries = 0;
+        unsigned long nr_base_pages;
 
         preempt_disable();
         if (current->active_mm != mm)
@@ -210,21 +185,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
                 tlb_entries = tlb_lli_4k[ENTRIES];
         else
                 tlb_entries = tlb_lld_4k[ENTRIES];
+
         /* Assume all of TLB entries was occupied by this task */
-        act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
+        act_entries = tlb_entries >> tlb_flushall_shift;
+        act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
+        nr_base_pages = (end - start) >> PAGE_SHIFT;
 
         /* tlb_flushall_shift is on balance point, details in commit log */
-        if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) {
-                count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+        if (nr_base_pages > act_entries) {
+                count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
                 local_flush_tlb();
         } else {
-                if (has_large_page(mm, start, end)) {
-                        local_flush_tlb();
-                        goto flush_all;
-                }
                 /* flush range by one by one 'invlpg' */
                 for (addr = start; addr < end; addr += PAGE_SIZE) {
-                        count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
+                        count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
                         __flush_tlb_single(addr);
                 }
 
@@ -262,7 +236,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
 
 static void do_flush_tlb_all(void *info)
 {
-        count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
         __flush_tlb_all();
         if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
                 leave_mm(smp_processor_id());
@@ -270,7 +244,7 @@ static void do_flush_tlb_all(void *info)
 
 void flush_tlb_all(void)
 {
-        count_vm_event(NR_TLB_REMOTE_FLUSH);
+        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
         on_each_cpu(do_flush_tlb_all, NULL, 1);
 }
 
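
The flush_tlb_mm_range() rework above pre-scales the TLB size by tlb_flushall_shift once, clamps it by the task's total_vm, and compares the number of 4K pages in the range against that, while dropping the has_large_page() walk from the invlpg path. A stand-alone sketch of the new decision is below; the TLB size and shift are invented example numbers, not real hardware values.

#include <stdio.h>

/* Invented example values: a 512-entry 4K TLB and a flush-all shift of 6. */
#define TLB_ENTRIES             512UL
#define TLB_FLUSHALL_SHIFT      6
#define PAGE_SHIFT              12

/* Returns 1 for a full TLB flush, 0 for per-page invlpg flushes. */
static int want_full_flush(unsigned long total_vm_pages,
                           unsigned long start, unsigned long end)
{
        unsigned long act_entries, nr_base_pages;

        /* Assume all of the TLB entries were occupied by this task. */
        act_entries = TLB_ENTRIES >> TLB_FLUSHALL_SHIFT;
        if (total_vm_pages < act_entries)
                act_entries = total_vm_pages;
        nr_base_pages = (end - start) >> PAGE_SHIFT;

        return nr_base_pages > act_entries;
}

int main(void)
{
        /* A 16-page flush of a large task: 16 > (512 >> 6) == 8, so flush everything. */
        printf("%d\n", want_full_flush(100000, 0x100000, 0x110000));
        /* A 4-page flush: 4 <= 8, so flush each page with invlpg instead. */
        printf("%d\n", want_full_flush(100000, 0x100000, 0x104000));
        return 0;
}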
