Diffstat (limited to 'arch/x86/mm/fault.c')
 -rw-r--r--  arch/x86/mm/fault.c | 148
 1 file changed, 94 insertions(+), 54 deletions(-)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 4c4508e8a204..2dbf6bf4c7e5 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -11,6 +11,8 @@
 #include <linux/kprobes.h>		/* __kprobes, ...		*/
 #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
 #include <linux/perf_event.h>		/* perf_sw_event		*/
+#include <linux/hugetlb.h>		/* hstate_index_to_shift	*/
+#include <linux/prefetch.h>		/* prefetchw			*/
 
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
@@ -160,15 +162,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
 
 static void
 force_sig_info_fault(int si_signo, int si_code, unsigned long address,
-		     struct task_struct *tsk)
+		     struct task_struct *tsk, int fault)
 {
+	unsigned lsb = 0;
 	siginfo_t info;
 
 	info.si_signo = si_signo;
 	info.si_errno = 0;
 	info.si_code = si_code;
 	info.si_addr = (void __user *)address;
-	info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
+	if (fault & VM_FAULT_HWPOISON_LARGE)
+		lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+	if (fault & VM_FAULT_HWPOISON)
+		lsb = PAGE_SHIFT;
+	info.si_addr_lsb = lsb;
 
 	force_sig_info(si_signo, &info, tsk);
 }
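
A note on the si_addr_lsb change in the hunk above: reporting the mapping shift instead of a hard-coded PAGE_SHIFT lets a user-space SIGBUS handler see how large a region a poisoned huge page took out. Below is a minimal, hypothetical user-space sketch of such a handler, not part of this patch; it assumes a glibc new enough to expose si_addr_lsb and the BUS_MCEERR_* codes, and it ignores async-signal-safety for brevity.

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

/* Hypothetical handler: report how much memory the poison event covers. */
static void sigbus_handler(int sig, siginfo_t *info, void *ctx)
{
	if (info->si_code == BUS_MCEERR_AR || info->si_code == BUS_MCEERR_AO) {
		/* si_addr_lsb is PAGE_SHIFT for a 4K page, larger for huge pages. */
		unsigned long len = 1UL << info->si_addr_lsb;

		fprintf(stderr, "hwpoison at %p, %lu bytes affected\n",
			info->si_addr, len);
	}
	_exit(1);
}

int main(void)
{
	struct sigaction sa = {
		.sa_sigaction = sigbus_handler,
		.sa_flags = SA_SIGINFO,
	};

	sigaction(SIGBUS, &sa, NULL);
	/* ... touch memory that might have been poisoned ... */
	return 0;
}
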
@@ -223,16 +230,24 @@ void vmalloc_sync_all(void)
 	for (address = VMALLOC_START & PMD_MASK;
 	     address >= TASK_SIZE && address < FIXADDR_TOP;
 	     address += PMD_SIZE) {
-
-		unsigned long flags;
 		struct page *page;
 
-		spin_lock_irqsave(&pgd_lock, flags);
+		spin_lock(&pgd_lock);
 		list_for_each_entry(page, &pgd_list, lru) {
-			if (!vmalloc_sync_one(page_address(page), address))
+			spinlock_t *pgt_lock;
+			pmd_t *ret;
+
+			/* the pgt_lock only for Xen */
+			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+
+			spin_lock(pgt_lock);
+			ret = vmalloc_sync_one(page_address(page), address);
+			spin_unlock(pgt_lock);
+
+			if (!ret)
 				break;
 		}
-		spin_unlock_irqrestore(&pgd_lock, flags);
+		spin_unlock(&pgd_lock);
 	}
 }
 
@@ -251,6 +266,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
 	if (!(address >= VMALLOC_START && address < VMALLOC_END))
 		return -1;
 
+	WARN_ON_ONCE(in_nmi());
+
 	/*
 	 * Synchronize this task's top level page-table
 	 * with the 'reference' page table.
@@ -326,29 +343,7 @@ out:
 
 void vmalloc_sync_all(void)
 {
-	unsigned long address;
-
-	for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
-	     address += PGDIR_SIZE) {
-
-		const pgd_t *pgd_ref = pgd_offset_k(address);
-		unsigned long flags;
-		struct page *page;
-
-		if (pgd_none(*pgd_ref))
-			continue;
-
-		spin_lock_irqsave(&pgd_lock, flags);
-		list_for_each_entry(page, &pgd_list, lru) {
-			pgd_t *pgd;
-			pgd = (pgd_t *)page_address(page) + pgd_index(address);
-			if (pgd_none(*pgd))
-				set_pgd(pgd, *pgd_ref);
-			else
-				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-		}
-		spin_unlock_irqrestore(&pgd_lock, flags);
-	}
+	sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
 }
 
 /*
@@ -369,6 +364,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
 	if (!(address >= VMALLOC_START && address < VMALLOC_END))
 		return -1;
 
+	WARN_ON_ONCE(in_nmi());
+
 	/*
 	 * Copy kernel mappings over when needed. This can also
 	 * happen within a race in page table update. In the later
@@ -731,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
 		tsk->thread.trap_no = 14;
 
-		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+		force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
 
 		return;
 	}
@@ -816,28 +813,51 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 	tsk->thread.trap_no = 14;
 
 #ifdef CONFIG_MEMORY_FAILURE
-	if (fault & VM_FAULT_HWPOISON) {
+	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
 		printk(KERN_ERR
 	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
 			tsk->comm, tsk->pid, address);
 		code = BUS_MCEERR_AR;
 	}
 #endif
-	force_sig_info_fault(SIGBUS, code, address, tsk);
+	force_sig_info_fault(SIGBUS, code, address, tsk, fault);
 }
 
-static noinline void
+static noinline int
 mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	       unsigned long address, unsigned int fault)
 {
+	/*
+	 * Pagefault was interrupted by SIGKILL. We have no reason to
+	 * continue pagefault.
+	 */
+	if (fatal_signal_pending(current)) {
+		if (!(fault & VM_FAULT_RETRY))
+			up_read(&current->mm->mmap_sem);
+		if (!(error_code & PF_USER))
+			no_context(regs, error_code, address);
+		return 1;
+	}
+	if (!(fault & VM_FAULT_ERROR))
+		return 0;
+
 	if (fault & VM_FAULT_OOM) {
+		/* Kernel mode? Handle exceptions or die: */
+		if (!(error_code & PF_USER)) {
+			up_read(&current->mm->mmap_sem);
+			no_context(regs, error_code, address);
+			return 1;
+		}
+
 		out_of_memory(regs, error_code, address);
 	} else {
-		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
+		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+			     VM_FAULT_HWPOISON_LARGE))
 			do_sigbus(regs, error_code, address, fault);
 		else
 			BUG();
 	}
+	return 1;
 }
 
 static int spurious_fault_check(unsigned long error_code, pte_t *pte)
@@ -894,8 +914,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
 	if (pmd_large(*pmd))
 		return spurious_fault_check(error_code, (pte_t *) pmd);
 
+	/*
+	 * Note: don't use pte_present() here, since it returns true
+	 * if the _PAGE_PROTNONE bit is set.  However, this aliases the
+	 * _PAGE_GLOBAL bit, which for kernel pages give false positives
+	 * when CONFIG_DEBUG_PAGEALLOC is used.
+	 */
 	pte = pte_offset_kernel(pmd, address);
-	if (!pte_present(*pte))
+	if (!(pte_flags(*pte) & _PAGE_PRESENT))
 		return 0;
 
 	ret = spurious_fault_check(error_code, pte);
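
For background on the pte_present() note above: at this point x86's pte_present() also accepts _PAGE_PROTNONE, and that bit shares its position with _PAGE_GLOBAL, so a kernel page unmapped by CONFIG_DEBUG_PAGEALLOC (present clear, global still set) would wrongly look present. Below is a stand-alone toy illustration of that aliasing; the X_* values are made up for the example and are not the real kernel macros.

#include <stdio.h>

#define X_PRESENT	0x001	/* stand-in for a "present" bit               */
#define X_PROTNONE	0x100	/* user mapping: "prot_none" placeholder bit  */
#define X_GLOBAL	0x100	/* kernel mapping: the same bit means GLOBAL  */

/* Mimics a pte_present()-style test that also accepts PROTNONE. */
static int present_like_pte_present(unsigned long flags)
{
	return (flags & (X_PRESENT | X_PROTNONE)) != 0;
}

int main(void)
{
	/* A kernel page unmapped by DEBUG_PAGEALLOC: PRESENT clear, GLOBAL set. */
	unsigned long flags = X_GLOBAL;

	printf("pte_present-style check: %d  (false positive)\n",
	       present_like_pte_present(flags));
	printf("PRESENT-bit-only check:  %d\n", (int)((flags & X_PRESENT) != 0));
	return 0;
}
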
@@ -915,9 +941,9 @@ spurious_fault(unsigned long error_code, unsigned long address)
 int show_unhandled_signals = 1;
 
 static inline int
-access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
+access_error(unsigned long error_code, struct vm_area_struct *vma)
 {
-	if (write) {
+	if (error_code & PF_WRITE) {
 		/* write, present and write, not present: */
 		if (unlikely(!(vma->vm_flags & VM_WRITE)))
 			return 1;
@@ -952,8 +978,10 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	struct task_struct *tsk;
 	unsigned long address;
 	struct mm_struct *mm;
-	int write;
 	int fault;
+	int write = error_code & PF_WRITE;
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
+					(write ? FAULT_FLAG_WRITE : 0);
 
 	tsk = current;
 	mm = tsk->mm;
@@ -1064,6 +1092,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 			bad_area_nosemaphore(regs, error_code, address);
 			return;
 		}
+retry:
 		down_read(&mm->mmap_sem);
 	} else {
 		/*
@@ -1107,9 +1136,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	 * we can handle it..
 	 */
 good_area:
-	write = error_code & PF_WRITE;
-
-	if (unlikely(access_error(error_code, write, vma))) {
+	if (unlikely(access_error(error_code, vma))) {
 		bad_area_access_error(regs, error_code, address);
 		return;
 	}
@@ -1119,21 +1146,34 @@ good_area:
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault:
 	 */
-	fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
+	fault = handle_mm_fault(mm, vma, address, flags);
 
-	if (unlikely(fault & VM_FAULT_ERROR)) {
-		mm_fault_error(regs, error_code, address, fault);
-		return;
+	if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
+		if (mm_fault_error(regs, error_code, address, fault))
+			return;
 	}
 
-	if (fault & VM_FAULT_MAJOR) {
-		tsk->maj_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
-			      regs, address);
-	} else {
-		tsk->min_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
-			      regs, address);
+	/*
+	 * Major/minor page fault accounting is only done on the
+	 * initial attempt. If we go through a retry, it is extremely
+	 * likely that the page will be found in page cache at that point.
+	 */
+	if (flags & FAULT_FLAG_ALLOW_RETRY) {
+		if (fault & VM_FAULT_MAJOR) {
+			tsk->maj_flt++;
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+				      regs, address);
+		} else {
+			tsk->min_flt++;
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+				      regs, address);
+		}
+		if (fault & VM_FAULT_RETRY) {
+			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
+			 * of starvation. */
+			flags &= ~FAULT_FLAG_ALLOW_RETRY;
+			goto retry;
+		}
 	}
 
 	check_v8086_mode(regs, address, tsk);
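
The retry logic added in the final hunk amounts to: attempt the fault with FAULT_FLAG_ALLOW_RETRY set, so handle_mm_fault() may drop mmap_sem and report VM_FAULT_RETRY instead of sleeping on I/O with the lock held, then redo the fault exactly once with the flag cleared so the handler can never spin forever. Below is a toy, self-contained C sketch of that "retry at most once" policy; the helper name is hypothetical and not a kernel API.

#include <stdbool.h>
#include <stdio.h>

enum fault_result { DONE, RETRY };

/* Stand-in for handle_mm_fault(): pretend the first, non-blocking attempt
 * finds the data not ready yet and asks the caller to try again. */
static enum fault_result do_fault(bool allow_retry)
{
	return allow_retry ? RETRY : DONE;
}

int main(void)
{
	bool allow_retry = true;	/* mirrors FAULT_FLAG_ALLOW_RETRY */

retry:
	if (do_fault(allow_retry) == RETRY) {
		allow_retry = false;	/* cleared so we retry at most once */
		goto retry;
	}
	printf("fault handled\n");
	return 0;
}
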