diff options
author | Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> | 2010-11-16 14:06:22 -0500 |
---|---|---|
committer | Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> | 2010-11-16 14:06:22 -0500 |
commit | 20b4755e4fbb226eb42951bd40b53fcbce9ef944 (patch) | |
tree | 43da70e0b32ee423d3643ecd422821383411ab72 /arch/x86/mm/fault.c | |
parent | 744f9f104ea262de1dc3e29265870c649f0d9473 (diff) | |
parent | e53beacd23d9cb47590da6a7a7f6d417b941a994 (diff) |
Merge commit 'v2.6.37-rc2' into upstream/xenfs
* commit 'v2.6.37-rc2': (10093 commits)
Linux 2.6.37-rc2
capabilities/syslog: open code cap_syslog logic to fix build failure
i2c: Sanity checks on adapter registration
i2c: Mark i2c_adapter.id as deprecated
i2c: Drivers shouldn't include <linux/i2c-id.h>
i2c: Delete unused adapter IDs
i2c: Remove obsolete cleanup for clientdata
include/linux/kernel.h: Move logging bits to include/linux/printk.h
Fix gcc 4.5.1 miscompiling drivers/char/i8k.c (again)
hwmon: (w83795) Check for BEEP pin availability
hwmon: (w83795) Clear intrusion alarm immediately
hwmon: (w83795) Read the intrusion state properly
hwmon: (w83795) Print the actual temperature channels as sources
hwmon: (w83795) List all usable temperature sources
hwmon: (w83795) Expose fan control method
hwmon: (w83795) Fix fan control mode attributes
hwmon: (lm95241) Check validity of input values
hwmon: Change mail address of Hans J. Koch
PCI: sysfs: fix printk warnings
GFS2: Fix inode deallocation race
...
Diffstat (limited to 'arch/x86/mm/fault.c')
-rw-r--r-- | arch/x86/mm/fault.c | 110 |
1 files changed, 64 insertions, 46 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4c4508e8a204..7d90ceb882a4 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/kprobes.h> /* __kprobes, ... */ | 11 | #include <linux/kprobes.h> /* __kprobes, ... */ |
12 | #include <linux/mmiotrace.h> /* kmmio_handler, ... */ | 12 | #include <linux/mmiotrace.h> /* kmmio_handler, ... */ |
13 | #include <linux/perf_event.h> /* perf_sw_event */ | 13 | #include <linux/perf_event.h> /* perf_sw_event */ |
14 | #include <linux/hugetlb.h> /* hstate_index_to_shift */ | ||
14 | 15 | ||
15 | #include <asm/traps.h> /* dotraplinkage, ... */ | 16 | #include <asm/traps.h> /* dotraplinkage, ... */ |
16 | #include <asm/pgalloc.h> /* pgd_*(), ... */ | 17 | #include <asm/pgalloc.h> /* pgd_*(), ... */ |
@@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) | |||
160 | 161 | ||
161 | static void | 162 | static void |
162 | force_sig_info_fault(int si_signo, int si_code, unsigned long address, | 163 | force_sig_info_fault(int si_signo, int si_code, unsigned long address, |
163 | struct task_struct *tsk) | 164 | struct task_struct *tsk, int fault) |
164 | { | 165 | { |
166 | unsigned lsb = 0; | ||
165 | siginfo_t info; | 167 | siginfo_t info; |
166 | 168 | ||
167 | info.si_signo = si_signo; | 169 | info.si_signo = si_signo; |
168 | info.si_errno = 0; | 170 | info.si_errno = 0; |
169 | info.si_code = si_code; | 171 | info.si_code = si_code; |
170 | info.si_addr = (void __user *)address; | 172 | info.si_addr = (void __user *)address; |
171 | info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; | 173 | if (fault & VM_FAULT_HWPOISON_LARGE) |
174 | lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); | ||
175 | if (fault & VM_FAULT_HWPOISON) | ||
176 | lsb = PAGE_SHIFT; | ||
177 | info.si_addr_lsb = lsb; | ||
172 | 178 | ||
173 | force_sig_info(si_signo, &info, tsk); | 179 | force_sig_info(si_signo, &info, tsk); |
174 | } | 180 | } |
@@ -229,7 +235,16 @@ void vmalloc_sync_all(void) | |||
229 | 235 | ||
230 | spin_lock_irqsave(&pgd_lock, flags); | 236 | spin_lock_irqsave(&pgd_lock, flags); |
231 | list_for_each_entry(page, &pgd_list, lru) { | 237 | list_for_each_entry(page, &pgd_list, lru) { |
232 | if (!vmalloc_sync_one(page_address(page), address)) | 238 | spinlock_t *pgt_lock; |
239 | pmd_t *ret; | ||
240 | |||
241 | pgt_lock = &pgd_page_get_mm(page)->page_table_lock; | ||
242 | |||
243 | spin_lock(pgt_lock); | ||
244 | ret = vmalloc_sync_one(page_address(page), address); | ||
245 | spin_unlock(pgt_lock); | ||
246 | |||
247 | if (!ret) | ||
233 | break; | 248 | break; |
234 | } | 249 | } |
235 | spin_unlock_irqrestore(&pgd_lock, flags); | 250 | spin_unlock_irqrestore(&pgd_lock, flags); |
@@ -251,6 +266,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) | |||
251 | if (!(address >= VMALLOC_START && address < VMALLOC_END)) | 266 | if (!(address >= VMALLOC_START && address < VMALLOC_END)) |
252 | return -1; | 267 | return -1; |
253 | 268 | ||
269 | WARN_ON_ONCE(in_nmi()); | ||
270 | |||
254 | /* | 271 | /* |
255 | * Synchronize this task's top level page-table | 272 | * Synchronize this task's top level page-table |
256 | * with the 'reference' page table. | 273 | * with the 'reference' page table. |
@@ -326,29 +343,7 @@ out: | |||
326 | 343 | ||
327 | void vmalloc_sync_all(void) | 344 | void vmalloc_sync_all(void) |
328 | { | 345 | { |
329 | unsigned long address; | 346 | sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); |
330 | |||
331 | for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; | ||
332 | address += PGDIR_SIZE) { | ||
333 | |||
334 | const pgd_t *pgd_ref = pgd_offset_k(address); | ||
335 | unsigned long flags; | ||
336 | struct page *page; | ||
337 | |||
338 | if (pgd_none(*pgd_ref)) | ||
339 | continue; | ||
340 | |||
341 | spin_lock_irqsave(&pgd_lock, flags); | ||
342 | list_for_each_entry(page, &pgd_list, lru) { | ||
343 | pgd_t *pgd; | ||
344 | pgd = (pgd_t *)page_address(page) + pgd_index(address); | ||
345 | if (pgd_none(*pgd)) | ||
346 | set_pgd(pgd, *pgd_ref); | ||
347 | else | ||
348 | BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); | ||
349 | } | ||
350 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
351 | } | ||
352 | } | 347 | } |
353 | 348 | ||
354 | /* | 349 | /* |
@@ -369,6 +364,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) | |||
369 | if (!(address >= VMALLOC_START && address < VMALLOC_END)) | 364 | if (!(address >= VMALLOC_START && address < VMALLOC_END)) |
370 | return -1; | 365 | return -1; |
371 | 366 | ||
367 | WARN_ON_ONCE(in_nmi()); | ||
368 | |||
372 | /* | 369 | /* |
373 | * Copy kernel mappings over when needed. This can also | 370 | * Copy kernel mappings over when needed. This can also |
373 | * happen within a race in page table update. In the latter | 371 | * happen within a race in page table update. In the latter |
@@ -731,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, | |||
731 | tsk->thread.error_code = error_code | (address >= TASK_SIZE); | 728 | tsk->thread.error_code = error_code | (address >= TASK_SIZE); |
732 | tsk->thread.trap_no = 14; | 729 | tsk->thread.trap_no = 14; |
733 | 730 | ||
734 | force_sig_info_fault(SIGSEGV, si_code, address, tsk); | 731 | force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); |
735 | 732 | ||
736 | return; | 733 | return; |
737 | } | 734 | } |
@@ -816,14 +813,14 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, | |||
816 | tsk->thread.trap_no = 14; | 813 | tsk->thread.trap_no = 14; |
817 | 814 | ||
818 | #ifdef CONFIG_MEMORY_FAILURE | 815 | #ifdef CONFIG_MEMORY_FAILURE |
819 | if (fault & VM_FAULT_HWPOISON) { | 816 | if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { |
820 | printk(KERN_ERR | 817 | printk(KERN_ERR |
821 | "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", | 818 | "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", |
822 | tsk->comm, tsk->pid, address); | 819 | tsk->comm, tsk->pid, address); |
823 | code = BUS_MCEERR_AR; | 820 | code = BUS_MCEERR_AR; |
824 | } | 821 | } |
825 | #endif | 822 | #endif |
826 | force_sig_info_fault(SIGBUS, code, address, tsk); | 823 | force_sig_info_fault(SIGBUS, code, address, tsk, fault); |
827 | } | 824 | } |
828 | 825 | ||
829 | static noinline void | 826 | static noinline void |
@@ -833,7 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, | |||
833 | if (fault & VM_FAULT_OOM) { | 830 | if (fault & VM_FAULT_OOM) { |
834 | out_of_memory(regs, error_code, address); | 831 | out_of_memory(regs, error_code, address); |
835 | } else { | 832 | } else { |
836 | if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) | 833 | if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| |
834 | VM_FAULT_HWPOISON_LARGE)) | ||
837 | do_sigbus(regs, error_code, address, fault); | 835 | do_sigbus(regs, error_code, address, fault); |
838 | else | 836 | else |
839 | BUG(); | 837 | BUG(); |
@@ -894,8 +892,14 @@ spurious_fault(unsigned long error_code, unsigned long address) | |||
894 | if (pmd_large(*pmd)) | 892 | if (pmd_large(*pmd)) |
895 | return spurious_fault_check(error_code, (pte_t *) pmd); | 893 | return spurious_fault_check(error_code, (pte_t *) pmd); |
896 | 894 | ||
895 | /* | ||
896 | * Note: don't use pte_present() here, since it returns true | ||
897 | * if the _PAGE_PROTNONE bit is set. However, this aliases the | ||
898 | * _PAGE_GLOBAL bit, which for kernel pages gives false positives | ||
899 | * when CONFIG_DEBUG_PAGEALLOC is used. | ||
900 | */ | ||
897 | pte = pte_offset_kernel(pmd, address); | 901 | pte = pte_offset_kernel(pmd, address); |
898 | if (!pte_present(*pte)) | 902 | if (!(pte_flags(*pte) & _PAGE_PRESENT)) |
899 | return 0; | 903 | return 0; |
900 | 904 | ||
901 | ret = spurious_fault_check(error_code, pte); | 905 | ret = spurious_fault_check(error_code, pte); |
@@ -915,9 +919,9 @@ spurious_fault(unsigned long error_code, unsigned long address) | |||
915 | int show_unhandled_signals = 1; | 919 | int show_unhandled_signals = 1; |
916 | 920 | ||
917 | static inline int | 921 | static inline int |
918 | access_error(unsigned long error_code, int write, struct vm_area_struct *vma) | 922 | access_error(unsigned long error_code, struct vm_area_struct *vma) |
919 | { | 923 | { |
920 | if (write) { | 924 | if (error_code & PF_WRITE) { |
921 | /* write, present and write, not present: */ | 925 | /* write, present and write, not present: */ |
922 | if (unlikely(!(vma->vm_flags & VM_WRITE))) | 926 | if (unlikely(!(vma->vm_flags & VM_WRITE))) |
923 | return 1; | 927 | return 1; |
@@ -952,8 +956,10 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
952 | struct task_struct *tsk; | 956 | struct task_struct *tsk; |
953 | unsigned long address; | 957 | unsigned long address; |
954 | struct mm_struct *mm; | 958 | struct mm_struct *mm; |
955 | int write; | ||
956 | int fault; | 959 | int fault; |
960 | int write = error_code & PF_WRITE; | ||
961 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | | ||
962 | (write ? FAULT_FLAG_WRITE : 0); | ||
957 | 963 | ||
958 | tsk = current; | 964 | tsk = current; |
959 | mm = tsk->mm; | 965 | mm = tsk->mm; |
@@ -1064,6 +1070,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
1064 | bad_area_nosemaphore(regs, error_code, address); | 1070 | bad_area_nosemaphore(regs, error_code, address); |
1065 | return; | 1071 | return; |
1066 | } | 1072 | } |
1073 | retry: | ||
1067 | down_read(&mm->mmap_sem); | 1074 | down_read(&mm->mmap_sem); |
1068 | } else { | 1075 | } else { |
1069 | /* | 1076 | /* |
@@ -1107,9 +1114,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
1107 | * we can handle it.. | 1114 | * we can handle it.. |
1108 | */ | 1115 | */ |
1109 | good_area: | 1116 | good_area: |
1110 | write = error_code & PF_WRITE; | 1117 | if (unlikely(access_error(error_code, vma))) { |
1111 | |||
1112 | if (unlikely(access_error(error_code, write, vma))) { | ||
1113 | bad_area_access_error(regs, error_code, address); | 1118 | bad_area_access_error(regs, error_code, address); |
1114 | return; | 1119 | return; |
1115 | } | 1120 | } |
@@ -1119,21 +1124,34 @@ good_area: | |||
1119 | * make sure we exit gracefully rather than endlessly redo | 1124 | * make sure we exit gracefully rather than endlessly redo |
1120 | * the fault: | 1125 | * the fault: |
1121 | */ | 1126 | */ |
1122 | fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); | 1127 | fault = handle_mm_fault(mm, vma, address, flags); |
1123 | 1128 | ||
1124 | if (unlikely(fault & VM_FAULT_ERROR)) { | 1129 | if (unlikely(fault & VM_FAULT_ERROR)) { |
1125 | mm_fault_error(regs, error_code, address, fault); | 1130 | mm_fault_error(regs, error_code, address, fault); |
1126 | return; | 1131 | return; |
1127 | } | 1132 | } |
1128 | 1133 | ||
1129 | if (fault & VM_FAULT_MAJOR) { | 1134 | /* |
1130 | tsk->maj_flt++; | 1135 | * Major/minor page fault accounting is only done on the |
1131 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, | 1136 | * initial attempt. If we go through a retry, it is extremely |
1132 | regs, address); | 1137 | * likely that the page will be found in page cache at that point. |
1133 | } else { | 1138 | */ |
1134 | tsk->min_flt++; | 1139 | if (flags & FAULT_FLAG_ALLOW_RETRY) { |
1135 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, | 1140 | if (fault & VM_FAULT_MAJOR) { |
1136 | regs, address); | 1141 | tsk->maj_flt++; |
1142 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, | ||
1143 | regs, address); | ||
1144 | } else { | ||
1145 | tsk->min_flt++; | ||
1146 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, | ||
1147 | regs, address); | ||
1148 | } | ||
1149 | if (fault & VM_FAULT_RETRY) { | ||
1150 | /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk | ||
1151 | * of starvation. */ | ||
1152 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | ||
1153 | goto retry; | ||
1154 | } | ||
1137 | } | 1155 | } |
1138 | 1156 | ||
1139 | check_v8086_mode(regs, address, tsk); | 1157 | check_v8086_mode(regs, address, tsk); |