about | summary | refs | log | tree | commit | diff | stats
path: root/arch/x86/mm/fault.c
diff options
context:
space:
mode:
author: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> 2010-11-16 14:06:22 -0500
committer: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> 2010-11-16 14:06:22 -0500
commit: 20b4755e4fbb226eb42951bd40b53fcbce9ef944 (patch)
tree: 43da70e0b32ee423d3643ecd422821383411ab72 /arch/x86/mm/fault.c
parent: 744f9f104ea262de1dc3e29265870c649f0d9473 (diff)
parent: e53beacd23d9cb47590da6a7a7f6d417b941a994 (diff)
Merge commit 'v2.6.37-rc2' into upstream/xenfs
* commit 'v2.6.37-rc2': (10093 commits)
  Linux 2.6.37-rc2
  capabilities/syslog: open code cap_syslog logic to fix build failure
  i2c: Sanity checks on adapter registration
  i2c: Mark i2c_adapter.id as deprecated
  i2c: Drivers shouldn't include <linux/i2c-id.h>
  i2c: Delete unused adapter IDs
  i2c: Remove obsolete cleanup for clientdata
  include/linux/kernel.h: Move logging bits to include/linux/printk.h
  Fix gcc 4.5.1 miscompiling drivers/char/i8k.c (again)
  hwmon: (w83795) Check for BEEP pin availability
  hwmon: (w83795) Clear intrusion alarm immediately
  hwmon: (w83795) Read the intrusion state properly
  hwmon: (w83795) Print the actual temperature channels as sources
  hwmon: (w83795) List all usable temperature sources
  hwmon: (w83795) Expose fan control method
  hwmon: (w83795) Fix fan control mode attributes
  hwmon: (lm95241) Check validity of input values
  hwmon: Change mail address of Hans J. Koch
  PCI: sysfs: fix printk warnings
  GFS2: Fix inode deallocation race
  ...
Diffstat (limited to 'arch/x86/mm/fault.c')
-rw-r--r-- arch/x86/mm/fault.c 110
1 files changed, 64 insertions, 46 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 4c4508e8a204..7d90ceb882a4 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -11,6 +11,7 @@
11#include <linux/kprobes.h> /* __kprobes, ... */ 11#include <linux/kprobes.h> /* __kprobes, ... */
12#include <linux/mmiotrace.h> /* kmmio_handler, ... */ 12#include <linux/mmiotrace.h> /* kmmio_handler, ... */
13#include <linux/perf_event.h> /* perf_sw_event */ 13#include <linux/perf_event.h> /* perf_sw_event */
14#include <linux/hugetlb.h> /* hstate_index_to_shift */
14 15
15#include <asm/traps.h> /* dotraplinkage, ... */ 16#include <asm/traps.h> /* dotraplinkage, ... */
16#include <asm/pgalloc.h> /* pgd_*(), ... */ 17#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
160 161
161static void 162static void
162force_sig_info_fault(int si_signo, int si_code, unsigned long address, 163force_sig_info_fault(int si_signo, int si_code, unsigned long address,
163 struct task_struct *tsk) 164 struct task_struct *tsk, int fault)
164{ 165{
166 unsigned lsb = 0;
165 siginfo_t info; 167 siginfo_t info;
166 168
167 info.si_signo = si_signo; 169 info.si_signo = si_signo;
168 info.si_errno = 0; 170 info.si_errno = 0;
169 info.si_code = si_code; 171 info.si_code = si_code;
170 info.si_addr = (void __user *)address; 172 info.si_addr = (void __user *)address;
171 info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; 173 if (fault & VM_FAULT_HWPOISON_LARGE)
174 lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
175 if (fault & VM_FAULT_HWPOISON)
176 lsb = PAGE_SHIFT;
177 info.si_addr_lsb = lsb;
172 178
173 force_sig_info(si_signo, &info, tsk); 179 force_sig_info(si_signo, &info, tsk);
174} 180}
@@ -229,7 +235,16 @@ void vmalloc_sync_all(void)
229 235
230 spin_lock_irqsave(&pgd_lock, flags); 236 spin_lock_irqsave(&pgd_lock, flags);
231 list_for_each_entry(page, &pgd_list, lru) { 237 list_for_each_entry(page, &pgd_list, lru) {
232 if (!vmalloc_sync_one(page_address(page), address)) 238 spinlock_t *pgt_lock;
239 pmd_t *ret;
240
241 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
242
243 spin_lock(pgt_lock);
244 ret = vmalloc_sync_one(page_address(page), address);
245 spin_unlock(pgt_lock);
246
247 if (!ret)
233 break; 248 break;
234 } 249 }
235 spin_unlock_irqrestore(&pgd_lock, flags); 250 spin_unlock_irqrestore(&pgd_lock, flags);
@@ -251,6 +266,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
251 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 266 if (!(address >= VMALLOC_START && address < VMALLOC_END))
252 return -1; 267 return -1;
253 268
269 WARN_ON_ONCE(in_nmi());
270
254 /* 271 /*
255 * Synchronize this task's top level page-table 272 * Synchronize this task's top level page-table
256 * with the 'reference' page table. 273 * with the 'reference' page table.
@@ -326,29 +343,7 @@ out:
326 343
327void vmalloc_sync_all(void) 344void vmalloc_sync_all(void)
328{ 345{
329 unsigned long address; 346 sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
330
331 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
332 address += PGDIR_SIZE) {
333
334 const pgd_t *pgd_ref = pgd_offset_k(address);
335 unsigned long flags;
336 struct page *page;
337
338 if (pgd_none(*pgd_ref))
339 continue;
340
341 spin_lock_irqsave(&pgd_lock, flags);
342 list_for_each_entry(page, &pgd_list, lru) {
343 pgd_t *pgd;
344 pgd = (pgd_t *)page_address(page) + pgd_index(address);
345 if (pgd_none(*pgd))
346 set_pgd(pgd, *pgd_ref);
347 else
348 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
349 }
350 spin_unlock_irqrestore(&pgd_lock, flags);
351 }
352} 347}
353 348
354/* 349/*
@@ -369,6 +364,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
369 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 364 if (!(address >= VMALLOC_START && address < VMALLOC_END))
370 return -1; 365 return -1;
371 366
367 WARN_ON_ONCE(in_nmi());
368
372 /* 369 /*
373 * Copy kernel mappings over when needed. This can also 370 * Copy kernel mappings over when needed. This can also
374 * happen within a race in page table update. In the later 371 * happen within a race in page table update. In the later
@@ -731,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
731 tsk->thread.error_code = error_code | (address >= TASK_SIZE); 728 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
732 tsk->thread.trap_no = 14; 729 tsk->thread.trap_no = 14;
733 730
734 force_sig_info_fault(SIGSEGV, si_code, address, tsk); 731 force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
735 732
736 return; 733 return;
737 } 734 }
@@ -816,14 +813,14 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
816 tsk->thread.trap_no = 14; 813 tsk->thread.trap_no = 14;
817 814
818#ifdef CONFIG_MEMORY_FAILURE 815#ifdef CONFIG_MEMORY_FAILURE
819 if (fault & VM_FAULT_HWPOISON) { 816 if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
820 printk(KERN_ERR 817 printk(KERN_ERR
821 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", 818 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
822 tsk->comm, tsk->pid, address); 819 tsk->comm, tsk->pid, address);
823 code = BUS_MCEERR_AR; 820 code = BUS_MCEERR_AR;
824 } 821 }
825#endif 822#endif
826 force_sig_info_fault(SIGBUS, code, address, tsk); 823 force_sig_info_fault(SIGBUS, code, address, tsk, fault);
827} 824}
828 825
829static noinline void 826static noinline void
@@ -833,7 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
833 if (fault & VM_FAULT_OOM) { 830 if (fault & VM_FAULT_OOM) {
834 out_of_memory(regs, error_code, address); 831 out_of_memory(regs, error_code, address);
835 } else { 832 } else {
836 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) 833 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
834 VM_FAULT_HWPOISON_LARGE))
837 do_sigbus(regs, error_code, address, fault); 835 do_sigbus(regs, error_code, address, fault);
838 else 836 else
839 BUG(); 837 BUG();
@@ -894,8 +892,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
894 if (pmd_large(*pmd)) 892 if (pmd_large(*pmd))
895 return spurious_fault_check(error_code, (pte_t *) pmd); 893 return spurious_fault_check(error_code, (pte_t *) pmd);
896 894
895 /*
896 * Note: don't use pte_present() here, since it returns true
897 * if the _PAGE_PROTNONE bit is set. However, this aliases the
898 * _PAGE_GLOBAL bit, which for kernel pages give false positives
899 * when CONFIG_DEBUG_PAGEALLOC is used.
900 */
897 pte = pte_offset_kernel(pmd, address); 901 pte = pte_offset_kernel(pmd, address);
898 if (!pte_present(*pte)) 902 if (!(pte_flags(*pte) & _PAGE_PRESENT))
899 return 0; 903 return 0;
900 904
901 ret = spurious_fault_check(error_code, pte); 905 ret = spurious_fault_check(error_code, pte);
@@ -915,9 +919,9 @@ spurious_fault(unsigned long error_code, unsigned long address)
915int show_unhandled_signals = 1; 919int show_unhandled_signals = 1;
916 920
917static inline int 921static inline int
918access_error(unsigned long error_code, int write, struct vm_area_struct *vma) 922access_error(unsigned long error_code, struct vm_area_struct *vma)
919{ 923{
920 if (write) { 924 if (error_code & PF_WRITE) {
921 /* write, present and write, not present: */ 925 /* write, present and write, not present: */
922 if (unlikely(!(vma->vm_flags & VM_WRITE))) 926 if (unlikely(!(vma->vm_flags & VM_WRITE)))
923 return 1; 927 return 1;
@@ -952,8 +956,10 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
952 struct task_struct *tsk; 956 struct task_struct *tsk;
953 unsigned long address; 957 unsigned long address;
954 struct mm_struct *mm; 958 struct mm_struct *mm;
955 int write;
956 int fault; 959 int fault;
960 int write = error_code & PF_WRITE;
961 unsigned int flags = FAULT_FLAG_ALLOW_RETRY |
962 (write ? FAULT_FLAG_WRITE : 0);
957 963
958 tsk = current; 964 tsk = current;
959 mm = tsk->mm; 965 mm = tsk->mm;
@@ -1064,6 +1070,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1064 bad_area_nosemaphore(regs, error_code, address); 1070 bad_area_nosemaphore(regs, error_code, address);
1065 return; 1071 return;
1066 } 1072 }
1073retry:
1067 down_read(&mm->mmap_sem); 1074 down_read(&mm->mmap_sem);
1068 } else { 1075 } else {
1069 /* 1076 /*
@@ -1107,9 +1114,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1107 * we can handle it.. 1114 * we can handle it..
1108 */ 1115 */
1109good_area: 1116good_area:
1110 write = error_code & PF_WRITE; 1117 if (unlikely(access_error(error_code, vma))) {
1111
1112 if (unlikely(access_error(error_code, write, vma))) {
1113 bad_area_access_error(regs, error_code, address); 1118 bad_area_access_error(regs, error_code, address);
1114 return; 1119 return;
1115 } 1120 }
@@ -1119,21 +1124,34 @@ good_area:
1119 * make sure we exit gracefully rather than endlessly redo 1124 * make sure we exit gracefully rather than endlessly redo
1120 * the fault: 1125 * the fault:
1121 */ 1126 */
1122 fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); 1127 fault = handle_mm_fault(mm, vma, address, flags);
1123 1128
1124 if (unlikely(fault & VM_FAULT_ERROR)) { 1129 if (unlikely(fault & VM_FAULT_ERROR)) {
1125 mm_fault_error(regs, error_code, address, fault); 1130 mm_fault_error(regs, error_code, address, fault);
1126 return; 1131 return;
1127 } 1132 }
1128 1133
1129 if (fault & VM_FAULT_MAJOR) { 1134 /*
1130 tsk->maj_flt++; 1135 * Major/minor page fault accounting is only done on the
1131 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, 1136 * initial attempt. If we go through a retry, it is extremely
1132 regs, address); 1137 * likely that the page will be found in page cache at that point.
1133 } else { 1138 */
1134 tsk->min_flt++; 1139 if (flags & FAULT_FLAG_ALLOW_RETRY) {
1135 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, 1140 if (fault & VM_FAULT_MAJOR) {
1136 regs, address); 1141 tsk->maj_flt++;
1142 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
1143 regs, address);
1144 } else {
1145 tsk->min_flt++;
1146 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
1147 regs, address);
1148 }
1149 if (fault & VM_FAULT_RETRY) {
1150 /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
1151 * of starvation. */
1152 flags &= ~FAULT_FLAG_ALLOW_RETRY;
1153 goto retry;
1154 }
1137 } 1155 }
1138 1156
1139 check_v8086_mode(regs, address, tsk); 1157 check_v8086_mode(regs, address, tsk);