Diffstat (limited to 'arch/x86/mm/fault.c')

 arch/x86/mm/fault.c | 148 +++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 94 insertions(+), 54 deletions(-)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 4c4508e8a204..2dbf6bf4c7e5 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -11,6 +11,8 @@
 #include <linux/kprobes.h>		/* __kprobes, ...		*/
 #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
 #include <linux/perf_event.h>		/* perf_sw_event		*/
+#include <linux/hugetlb.h>		/* hstate_index_to_shift	*/
+#include <linux/prefetch.h>		/* prefetchw			*/
 
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
@@ -160,15 +162,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
 
 static void
 force_sig_info_fault(int si_signo, int si_code, unsigned long address,
-		     struct task_struct *tsk)
+		     struct task_struct *tsk, int fault)
 {
+	unsigned lsb = 0;
 	siginfo_t info;
 
 	info.si_signo = si_signo;
 	info.si_errno = 0;
 	info.si_code = si_code;
 	info.si_addr = (void __user *)address;
-	info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
+	if (fault & VM_FAULT_HWPOISON_LARGE)
+		lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+	if (fault & VM_FAULT_HWPOISON)
+		lsb = PAGE_SHIFT;
+	info.si_addr_lsb = lsb;
 
 	force_sig_info(si_signo, &info, tsk);
 }
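With hugepage poisoning, si_addr_lsb now tells the SIGBUS handler how large the poisoned region is: hstate_index_to_shift() recovers the huge-page shift encoded in the fault flags, while an ordinary poisoned page still reports PAGE_SHIFT. A standalone model of that selection, using stand-in flag values and a fixed 2 MB huge page rather than the kernel's definitions:

/*
 * Illustrative userspace model of the new si_addr_lsb logic, not
 * kernel code: the flag values and huge-page shift are stand-ins.
 */
#include <stdio.h>

#define VM_FAULT_HWPOISON	0x0010	/* stand-in values */
#define VM_FAULT_HWPOISON_LARGE	0x0020
#define PAGE_SHIFT		12
#define HUGE_PAGE_SHIFT		21	/* 2 MB page on x86_64 */

/* stands in for the hstate_index_to_shift() lookup, single size */
static unsigned int fault_lsb(int fault)
{
	unsigned int lsb = 0;

	if (fault & VM_FAULT_HWPOISON_LARGE)
		lsb = HUGE_PAGE_SHIFT;	/* a whole huge page is gone */
	if (fault & VM_FAULT_HWPOISON)
		lsb = PAGE_SHIFT;	/* a single 4 kB page is gone */
	return lsb;
}

int main(void)
{
	printf("small page lsb: %u\n", fault_lsb(VM_FAULT_HWPOISON));
	printf("huge page lsb:  %u\n", fault_lsb(VM_FAULT_HWPOISON_LARGE));
	return 0;
}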
@@ -223,16 +230,24 @@ void vmalloc_sync_all(void)
 	for (address = VMALLOC_START & PMD_MASK;
 	     address >= TASK_SIZE && address < FIXADDR_TOP;
 	     address += PMD_SIZE) {
-
-		unsigned long flags;
 		struct page *page;
 
-		spin_lock_irqsave(&pgd_lock, flags);
+		spin_lock(&pgd_lock);
 		list_for_each_entry(page, &pgd_list, lru) {
-			if (!vmalloc_sync_one(page_address(page), address))
+			spinlock_t *pgt_lock;
+			pmd_t *ret;
+
+			/* the pgt_lock only for Xen */
+			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+
+			spin_lock(pgt_lock);
+			ret = vmalloc_sync_one(page_address(page), address);
+			spin_unlock(pgt_lock);
+
+			if (!ret)
 				break;
 		}
-		spin_unlock_irqrestore(&pgd_lock, flags);
+		spin_unlock(&pgd_lock);
 	}
 }
 
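The 32-bit sync loop now takes pgd_lock with plain spin_lock() instead of the IRQ-saving variant, and nests each mm's page_table_lock inside it; per the new comment, the inner lock only matters for Xen, which can update page tables behind the kernel's back. A minimal pthread sketch of that lock nesting, purely illustrative (names and types are stand-ins, and the real inner work is vmalloc_sync_one()):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pgd_lock = PTHREAD_MUTEX_INITIALIZER;

struct mm {
	pthread_mutex_t page_table_lock;	/* the per-mm "pgt_lock" */
	int synced;
};

static void sync_all(struct mm *mms, int n)
{
	pthread_mutex_lock(&pgd_lock);		/* outer: protects the list walk */
	for (int i = 0; i < n; i++) {
		pthread_mutex_lock(&mms[i].page_table_lock);	/* inner */
		mms[i].synced = 1;		/* stands in for vmalloc_sync_one() */
		pthread_mutex_unlock(&mms[i].page_table_lock);
	}
	pthread_mutex_unlock(&pgd_lock);
}

int main(void)
{
	static struct mm mms[2] = {
		{ PTHREAD_MUTEX_INITIALIZER, 0 },
		{ PTHREAD_MUTEX_INITIALIZER, 0 },
	};

	sync_all(mms, 2);
	printf("synced: %d %d\n", mms[0].synced, mms[1].synced);
	return 0;
}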
@@ -251,6 +266,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
 	if (!(address >= VMALLOC_START && address < VMALLOC_END))
 		return -1;
 
+	WARN_ON_ONCE(in_nmi());
+
 	/*
 	 * Synchronize this task's top level page-table
 	 * with the 'reference' page table.
@@ -326,29 +343,7 @@ out:
 
 void vmalloc_sync_all(void)
 {
-	unsigned long address;
-
-	for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
-	     address += PGDIR_SIZE) {
-
-		const pgd_t *pgd_ref = pgd_offset_k(address);
-		unsigned long flags;
-		struct page *page;
-
-		if (pgd_none(*pgd_ref))
-			continue;
-
-		spin_lock_irqsave(&pgd_lock, flags);
-		list_for_each_entry(page, &pgd_list, lru) {
-			pgd_t *pgd;
-			pgd = (pgd_t *)page_address(page) + pgd_index(address);
-			if (pgd_none(*pgd))
-				set_pgd(pgd, *pgd_ref);
-			else
-				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-		}
-		spin_unlock_irqrestore(&pgd_lock, flags);
-	}
+	sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
 }
 
 /*
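The open-coded 64-bit loop did not disappear; it moved into the shared helper sync_global_pgds() so that other callers (memory hotplug, for instance) can reuse it. A rough sketch of the helper's shape, reconstructed from the loop deleted above plus the newer plain spin_lock(); the real version lives in arch/x86/mm/init_64.c and may differ in detail:

/* sketch only: approximates sync_global_pgds(), not the actual source */
void sync_global_pgds(unsigned long start, unsigned long end)
{
	unsigned long address;

	for (address = start; address <= end; address += PGDIR_SIZE) {
		const pgd_t *pgd_ref = pgd_offset_k(address);
		struct page *page;

		if (pgd_none(*pgd_ref))
			continue;	/* nothing to propagate */

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd = (pgd_t *)page_address(page)
					+ pgd_index(address);

			if (pgd_none(*pgd))
				set_pgd(pgd, *pgd_ref);	/* copy kernel entry */
			else
				BUG_ON(pgd_page_vaddr(*pgd)
				       != pgd_page_vaddr(*pgd_ref));
		}
		spin_unlock(&pgd_lock);
	}
}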
@@ -369,6 +364,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
 	if (!(address >= VMALLOC_START && address < VMALLOC_END))
 		return -1;
 
+	WARN_ON_ONCE(in_nmi());
+
 	/*
	 * Copy kernel mappings over when needed. This can also
	 * happen within a race in page table update. In the later
@@ -731,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 	tsk->thread.error_code = error_code | (address >= TASK_SIZE);
 	tsk->thread.trap_no = 14;
 
-	force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+	force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
 
 	return;
 }
@@ -816,28 +813,51 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 	tsk->thread.trap_no = 14;
 
 #ifdef CONFIG_MEMORY_FAILURE
-	if (fault & VM_FAULT_HWPOISON) {
+	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
 		printk(KERN_ERR
 	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
 			tsk->comm, tsk->pid, address);
 		code = BUS_MCEERR_AR;
 	}
 #endif
-	force_sig_info_fault(SIGBUS, code, address, tsk);
+	force_sig_info_fault(SIGBUS, code, address, tsk, fault);
 }
 
-static noinline void
+static noinline int
 mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	       unsigned long address, unsigned int fault)
 {
+	/*
+	 * Pagefault was interrupted by SIGKILL. We have no reason to
+	 * continue pagefault.
+	 */
+	if (fatal_signal_pending(current)) {
+		if (!(fault & VM_FAULT_RETRY))
+			up_read(&current->mm->mmap_sem);
+		if (!(error_code & PF_USER))
+			no_context(regs, error_code, address);
+		return 1;
+	}
+	if (!(fault & VM_FAULT_ERROR))
+		return 0;
+
 	if (fault & VM_FAULT_OOM) {
+		/* Kernel mode? Handle exceptions or die: */
+		if (!(error_code & PF_USER)) {
+			up_read(&current->mm->mmap_sem);
+			no_context(regs, error_code, address);
+			return 1;
+		}
+
 		out_of_memory(regs, error_code, address);
 	} else {
-		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
+		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+			     VM_FAULT_HWPOISON_LARGE))
 			do_sigbus(regs, error_code, address, fault);
 		else
 			BUG();
 	}
+	return 1;
 }
 
 static int spurious_fault_check(unsigned long error_code, pte_t *pte)
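mm_fault_error() now returns a status instead of void: nonzero means the fault was fully disposed of (including dropping mmap_sem where required and raising the signal), zero means the caller should fall through to the normal accounting and retry path. A userspace model of that contract, with stand-in flag values:

#include <stdio.h>
#include <stdbool.h>

#define VM_FAULT_OOM	0x0001	/* stand-in values */
#define VM_FAULT_SIGBUS	0x0002
#define VM_FAULT_RETRY	0x0400
#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS)

/* models the new return-value contract, not the kernel function */
static int model_mm_fault_error(bool fatal_signal, unsigned int fault)
{
	if (fatal_signal)
		return 1;	/* SIGKILL pending: abandon the fault */
	if (!(fault & VM_FAULT_ERROR))
		return 0;	/* e.g. bare VM_FAULT_RETRY: caller continues */
	return 1;		/* OOM or SIGBUS paths handle it fully */
}

int main(void)
{
	printf("retry, no signal -> %d (caller retries)\n",
	       model_mm_fault_error(false, VM_FAULT_RETRY));
	printf("retry + SIGKILL  -> %d (fault abandoned)\n",
	       model_mm_fault_error(true, VM_FAULT_RETRY));
	printf("OOM              -> %d (fault consumed)\n",
	       model_mm_fault_error(false, VM_FAULT_OOM));
	return 0;
}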
@@ -894,8 +914,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
 	if (pmd_large(*pmd))
 		return spurious_fault_check(error_code, (pte_t *) pmd);
 
+	/*
+	 * Note: don't use pte_present() here, since it returns true
+	 * if the _PAGE_PROTNONE bit is set. However, this aliases the
+	 * _PAGE_GLOBAL bit, which for kernel pages give false positives
+	 * when CONFIG_DEBUG_PAGEALLOC is used.
+	 */
 	pte = pte_offset_kernel(pmd, address);
-	if (!pte_present(*pte))
+	if (!(pte_flags(*pte) & _PAGE_PRESENT))
 		return 0;
 
 	ret = spurious_fault_check(error_code, pte);
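The new comment points at a subtle encoding: on x86, _PAGE_PROTNONE reuses the _PAGE_GLOBAL bit (bit 8), and pte_present() accepts either _PAGE_PRESENT or _PAGE_PROTNONE. A global kernel page that CONFIG_DEBUG_PAGEALLOC has temporarily unmapped therefore still looks "present" to pte_present(). A small standalone demonstration; the bit positions match arch/x86/include/asm/pgtable_types.h:

#include <stdio.h>

#define _PAGE_PRESENT	(1UL << 0)
#define _PAGE_GLOBAL	(1UL << 8)
#define _PAGE_PROTNONE	(1UL << 8)	/* same bit as _PAGE_GLOBAL */

/* mirrors the old pte_present() test on the flag bits */
static int old_pte_present(unsigned long flags)
{
	return !!(flags & (_PAGE_PRESENT | _PAGE_PROTNONE));
}

int main(void)
{
	/*
	 * A kernel global page unmapped by DEBUG_PAGEALLOC:
	 * present bit clear, global bit still set.
	 */
	unsigned long flags = _PAGE_GLOBAL;

	printf("old check (pte_present):        %d  <- false positive\n",
	       old_pte_present(flags));
	printf("new check (_PAGE_PRESENT only): %d\n",
	       !!(flags & _PAGE_PRESENT));
	return 0;
}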
@@ -915,9 +941,9 @@ spurious_fault(unsigned long error_code, unsigned long address)
 int show_unhandled_signals = 1;
 
 static inline int
-access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
+access_error(unsigned long error_code, struct vm_area_struct *vma)
 {
-	if (write) {
+	if (error_code & PF_WRITE) {
 		/* write, present and write, not present: */
 		if (unlikely(!(vma->vm_flags & VM_WRITE)))
 			return 1;
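access_error() now decodes the write bit straight out of the hardware error code instead of taking a precomputed int. For reference, the x86 page-fault error-code bits as fault.c defines them, in a toy standalone form (the enum values match the file; main() is illustration only):

#include <stdio.h>

enum x86_pf_error_code {
	PF_PROT		= 1 << 0,	/* 0: no page found, 1: protection fault */
	PF_WRITE	= 1 << 1,	/* 0: read access,   1: write access */
	PF_USER		= 1 << 2,	/* 0: kernel mode,   1: user mode */
	PF_RSVD		= 1 << 3,	/* reserved bit set in a paging entry */
	PF_INSTR	= 1 << 4,	/* fault was an instruction fetch */
};

int main(void)
{
	unsigned long error_code = PF_USER | PF_WRITE;	/* user-mode write fault */

	printf("write fault? %s\n",
	       (error_code & PF_WRITE) ? "yes" : "no");
	return 0;
}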
@@ -952,8 +978,10 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	struct task_struct *tsk;
 	unsigned long address;
 	struct mm_struct *mm;
-	int write;
 	int fault;
+	int write = error_code & PF_WRITE;
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
+					(write ? FAULT_FLAG_WRITE : 0);
 
 	tsk = current;
 	mm = tsk->mm;
@@ -1064,6 +1092,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 			bad_area_nosemaphore(regs, error_code, address);
 			return;
 		}
+retry:
 		down_read(&mm->mmap_sem);
 	} else {
 		/*
@@ -1107,9 +1136,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	 * we can handle it..
 	 */
 good_area:
-	write = error_code & PF_WRITE;
-
-	if (unlikely(access_error(error_code, write, vma))) {
+	if (unlikely(access_error(error_code, vma))) {
 		bad_area_access_error(regs, error_code, address);
 		return;
 	}
@@ -1119,21 +1146,34 @@ good_area:
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault:
 	 */
-	fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
+	fault = handle_mm_fault(mm, vma, address, flags);
 
-	if (unlikely(fault & VM_FAULT_ERROR)) {
-		mm_fault_error(regs, error_code, address, fault);
-		return;
+	if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
+		if (mm_fault_error(regs, error_code, address, fault))
+			return;
 	}
 
-	if (fault & VM_FAULT_MAJOR) {
-		tsk->maj_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
-				regs, address);
-	} else {
-		tsk->min_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
-				regs, address);
+	/*
+	 * Major/minor page fault accounting is only done on the
+	 * initial attempt. If we go through a retry, it is extremely
+	 * likely that the page will be found in page cache at that point.
+	 */
+	if (flags & FAULT_FLAG_ALLOW_RETRY) {
+		if (fault & VM_FAULT_MAJOR) {
+			tsk->maj_flt++;
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+				      regs, address);
+		} else {
+			tsk->min_flt++;
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+				      regs, address);
+		}
+		if (fault & VM_FAULT_RETRY) {
+			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
+			 * of starvation. */
+			flags &= ~FAULT_FLAG_ALLOW_RETRY;
+			goto retry;
+		}
 	}
 
 	check_v8086_mode(regs, address, tsk);
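Taken together, the retry changes implement a simple protocol: the first attempt passes FAULT_FLAG_ALLOW_RETRY, handle_mm_fault() may drop mmap_sem and return VM_FAULT_RETRY rather than block on disk I/O with the lock held, and the fault is then retried at most once with the flag cleared so a perpetually busy page cannot livelock the handler. A userspace model of that flow, with stand-in flag values:

#include <stdio.h>

#define FAULT_FLAG_WRITE	0x01	/* stand-in values */
#define FAULT_FLAG_ALLOW_RETRY	0x02
#define VM_FAULT_RETRY		0x04
#define VM_FAULT_MAJOR		0x08

/* pretend the page needs disk I/O exactly once */
static unsigned int fake_handle_mm_fault(unsigned int flags, int *io_done)
{
	if ((flags & FAULT_FLAG_ALLOW_RETRY) && !*io_done) {
		*io_done = 1;		/* "dropped mmap_sem, started I/O" */
		return VM_FAULT_RETRY;
	}
	return VM_FAULT_MAJOR;		/* fault completes after the I/O */
}

int main(void)
{
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_WRITE;
	int io_done = 0, attempts = 0;
	unsigned int fault;

retry:
	attempts++;
	fault = fake_handle_mm_fault(flags, &io_done);
	if ((flags & FAULT_FLAG_ALLOW_RETRY) && (fault & VM_FAULT_RETRY)) {
		flags &= ~FAULT_FLAG_ALLOW_RETRY;	/* avoid starvation */
		goto retry;
	}
	printf("fault handled after %d attempt(s)\n", attempts);
	return 0;
}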