author     Jan Beulich <jbeulich@novell.com>        2006-03-23 05:59:45 -0500
committer  Linus Torvalds <torvalds@g5.osdl.org>    2006-03-23 10:38:05 -0500
commit     101f12af16fb12f8da8100899a13ee1b1b576a0a (patch)
tree       0bea73d2702ba438e8e82bc8000b498aa50aee6e
parent     44fd22992cb76dc51c52cf4b8aff1bc7899bb23c (diff)
[PATCH] i386: actively synchronize vmalloc area when registering certain callbacks
Registering a callback handler through register_die_notifier() is obviously
primarily intended for use by modules.  However, the way these callbacks
currently get invoked makes it basically impossible for modules to actually
use them: on non-PAE configurations there is a good chance (the larger the
module, the better) of the system crashing as a result.

This is because the callback gets invoked

(a) in the page fault path before the top-level page table propagation
    has been carried out (hence a fault to propagate the top-level page
    table entry/entries mapping the module's code/data would nest
    infinitely), and

(b) in the NMI path, where nested faults must absolutely not happen,
    since otherwise the IRET from the nested fault re-enables NMIs,
    potentially resulting in nested NMI occurrences.

Besides the modular aspect, similar problems would arise even for in-kernel
consumers of the API if they touched ioremap()ed or vmalloc()ed memory
inside their handlers.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
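For context, here is a minimal sketch of the kind of consumer this change
targets: a hypothetical module (names invented, not part of the commit)
registering a die notifier.  The point is that merely invoking the handler
executes the module's vmalloc()ed text, which is what used to fault
recursively.  Error handling and teardown are omitted for brevity.

/* Hypothetical example module; illustration only, not part of the patch. */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <asm/kdebug.h>

static int example_die_event(struct notifier_block *self,
                             unsigned long val, void *data)
{
        struct die_args *args = data;

        /* Running this handler at all touches module text/data that
           lives in the vmalloc area. */
        printk(KERN_INFO "die event %lu, trap %d\n", val, args->trapnr);
        return NOTIFY_DONE;
}

static struct notifier_block example_die_nb = {
        .notifier_call = example_die_event,
};

static int __init example_init(void)
{
        /* With this patch, register_die_notifier() first calls
           vmalloc_sync_all(), so the handler above becomes reachable
           from the fault and NMI paths without nesting another fault. */
        return register_die_notifier(&example_die_nb);
}
module_init(example_init);
MODULE_LICENSE("GPL");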
-rw-r--r--  arch/i386/kernel/traps.c              3
-rw-r--r--  arch/i386/mm/fault.c                173
-rw-r--r--  include/asm-i386/pgtable-2level.h     2
-rw-r--r--  include/asm-i386/pgtable-3level.h     2
4 files changed, 123 insertions, 57 deletions
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index f20797b8da1d..d510de7e4f2a 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -99,6 +99,8 @@ int register_die_notifier(struct notifier_block *nb)
 {
         int err = 0;
         unsigned long flags;
+
+        vmalloc_sync_all();
         spin_lock_irqsave(&die_notifier_lock, flags);
         err = notifier_chain_register(&i386die_chain, nb);
         spin_unlock_irqrestore(&die_notifier_lock, flags);
@@ -713,6 +715,7 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code)
 
 void set_nmi_callback(nmi_callback_t callback)
 {
+        vmalloc_sync_all();
         rcu_assign_pointer(nmi_callback, callback);
 }
 EXPORT_SYMBOL_GPL(set_nmi_callback);
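Note the ordering both hunks establish: vmalloc_sync_all() runs before the
callback becomes reachable, i.e. before notifier_chain_register() and before
the rcu_assign_pointer() publication.  By the time the fault or NMI path can
invoke the handler, the registering module's vmalloc mappings have therefore
already been propagated into every top-level page table.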
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index cf572d9a3b6e..bbb24af5d860 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -214,6 +214,68 @@ static noinline void force_sig_info_fault(int si_signo, int si_code,
 
 fastcall void do_invalid_op(struct pt_regs *, unsigned long);
 
+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
+{
+        unsigned index = pgd_index(address);
+        pgd_t *pgd_k;
+        pud_t *pud, *pud_k;
+        pmd_t *pmd, *pmd_k;
+
+        pgd += index;
+        pgd_k = init_mm.pgd + index;
+
+        if (!pgd_present(*pgd_k))
+                return NULL;
+
+        /*
+         * set_pgd(pgd, *pgd_k); here would be useless on PAE
+         * and redundant with the set_pmd() on non-PAE. As would
+         * set_pud.
+         */
+
+        pud = pud_offset(pgd, address);
+        pud_k = pud_offset(pgd_k, address);
+        if (!pud_present(*pud_k))
+                return NULL;
+
+        pmd = pmd_offset(pud, address);
+        pmd_k = pmd_offset(pud_k, address);
+        if (!pmd_present(*pmd_k))
+                return NULL;
+        if (!pmd_present(*pmd))
+                set_pmd(pmd, *pmd_k);
+        else
+                BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
+        return pmd_k;
+}
+
+/*
+ * Handle a fault on the vmalloc or module mapping area
+ *
+ * This assumes no large pages in there.
+ */
+static inline int vmalloc_fault(unsigned long address)
+{
+        unsigned long pgd_paddr;
+        pmd_t *pmd_k;
+        pte_t *pte_k;
+        /*
+         * Synchronize this task's top level page-table
+         * with the 'reference' page table.
+         *
+         * Do _not_ use "current" here. We might be inside
+         * an interrupt in the middle of a task switch..
+         */
+        pgd_paddr = read_cr3();
+        pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+        if (!pmd_k)
+                return -1;
+        pte_k = pte_offset_kernel(pmd_k, address);
+        if (!pte_present(*pte_k))
+                return -1;
+        return 0;
+}
+
 /*
  * This routine handles page faults. It determines the address,
  * and the problem, and then passes it off to one of the appropriate
@@ -223,6 +285,8 @@ fastcall void do_invalid_op(struct pt_regs *, unsigned long);
  * bit 0 == 0 means no page found, 1 means protection fault
  * bit 1 == 0 means read, 1 means write
  * bit 2 == 0 means kernel, 1 means user-mode
+ * bit 3 == 1 means use of reserved bit detected
+ * bit 4 == 1 means fault was an instruction fetch
  */
 fastcall void __kprobes do_page_fault(struct pt_regs *regs,
                                       unsigned long error_code)
@@ -237,13 +301,6 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs,
         /* get the address */
         address = read_cr2();
 
-        if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
-                       SIGSEGV) == NOTIFY_STOP)
-                return;
-        /* It's safe to allow irq's after cr2 has been saved */
-        if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
-                local_irq_enable();
-
         tsk = current;
 
         si_code = SEGV_MAPERR;
@@ -259,17 +316,29 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs,
          *
          * This verifies that the fault happens in kernel space
          * (error_code & 4) == 0, and that the fault was not a
-         * protection error (error_code & 1) == 0.
+         * protection error (error_code & 9) == 0.
          */
         if (unlikely(address >= TASK_SIZE)) {
-                if (!(error_code & 5))
-                        goto vmalloc_fault;
+                if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
+                        return;
+                if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
+                               SIGSEGV) == NOTIFY_STOP)
+                        return;
                 /*
                  * Don't take the mm semaphore here. If we fixup a prefetch
                  * fault we could otherwise deadlock.
                  */
                 goto bad_area_nosemaphore;
         }
+
+        if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
+                       SIGSEGV) == NOTIFY_STOP)
+                return;
+
+        /* It's safe to allow irq's after cr2 has been saved and the vmalloc
+           fault has been handled. */
+        if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
+                local_irq_enable();
 
         mm = tsk->mm;
 
@@ -510,51 +579,41 @@ do_sigbus:
         tsk->thread.error_code = error_code;
         tsk->thread.trap_no = 14;
         force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
-        return;
-
-vmalloc_fault:
-        {
-                /*
-                 * Synchronize this task's top level page-table
-                 * with the 'reference' page table.
-                 *
-                 * Do _not_ use "tsk" here. We might be inside
-                 * an interrupt in the middle of a task switch..
-                 */
-                int index = pgd_index(address);
-                unsigned long pgd_paddr;
-                pgd_t *pgd, *pgd_k;
-                pud_t *pud, *pud_k;
-                pmd_t *pmd, *pmd_k;
-                pte_t *pte_k;
-
-                pgd_paddr = read_cr3();
-                pgd = index + (pgd_t *)__va(pgd_paddr);
-                pgd_k = init_mm.pgd + index;
-
-                if (!pgd_present(*pgd_k))
-                        goto no_context;
-
-                /*
-                 * set_pgd(pgd, *pgd_k); here would be useless on PAE
-                 * and redundant with the set_pmd() on non-PAE. As would
-                 * set_pud.
-                 */
-
-                pud = pud_offset(pgd, address);
-                pud_k = pud_offset(pgd_k, address);
-                if (!pud_present(*pud_k))
-                        goto no_context;
-
-                pmd = pmd_offset(pud, address);
-                pmd_k = pmd_offset(pud_k, address);
-                if (!pmd_present(*pmd_k))
-                        goto no_context;
-                set_pmd(pmd, *pmd_k);
-
-                pte_k = pte_offset_kernel(pmd_k, address);
-                if (!pte_present(*pte_k))
-                        goto no_context;
-                return;
-        }
 }
+
+#ifndef CONFIG_X86_PAE
+void vmalloc_sync_all(void)
+{
+        /*
+         * Note that races in the updates of insync and start aren't
+         * problematic: insync can only get set bits added, and updates to
+         * start are only improving performance (without affecting
+         * correctness if undone).
+         */
+        static DECLARE_BITMAP(insync, PTRS_PER_PGD);
+        static unsigned long start = TASK_SIZE;
+        unsigned long address;
+
+        BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
+        for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
+                if (!test_bit(pgd_index(address), insync)) {
+                        unsigned long flags;
+                        struct page *page;
+
+                        spin_lock_irqsave(&pgd_lock, flags);
+                        for (page = pgd_list; page; page =
+                                        (struct page *)page->index)
+                                if (!vmalloc_sync_one(page_address(page),
+                                                      address)) {
+                                        BUG_ON(page != pgd_list);
+                                        break;
+                                }
+                        spin_unlock_irqrestore(&pgd_lock, flags);
+                        if (!page)
+                                set_bit(pgd_index(address), insync);
+                }
+                if (address == start && test_bit(pgd_index(address), insync))
+                        start = address + PGDIR_SIZE;
+        }
+}
+#endif
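A note on the new guard in do_page_fault() above, which is terse as written.
The mask names below are borrowed from later kernels purely for illustration
and do not exist in this tree; the updated comment's "(error_code & 9) == 0"
likewise refers to bits 0 and 3.

/* x86 page fault error code bits, matching the updated comment: */
#define PF_PROT 0x1     /* 0: page not present, 1: protection violation */
#define PF_USER 0x4     /* 0: kernel-mode access, 1: user-mode access */
#define PF_RSVD 0x8     /* 1: reserved bit set in a paging entry */

/*
 * 0x0000000d == PF_PROT | PF_USER | PF_RSVD, so the fixup runs only
 * for a kernel-mode fault on a not-present page with no reserved-bit
 * violation: exactly the signature of a vmalloc mapping that has not
 * yet been copied into the faulting task's page table.  It runs
 * before notify_die() so the notifier chain itself can be walked
 * safely, and interrupts stay disabled until it has been handled.
 */
if (!(error_code & (PF_PROT | PF_USER | PF_RSVD)) &&
    vmalloc_fault(address) >= 0)
        return;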
diff --git a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h
index 74ef721b534d..27bde973abc7 100644
--- a/include/asm-i386/pgtable-2level.h
+++ b/include/asm-i386/pgtable-2level.h
@@ -61,4 +61,6 @@ static inline int pte_exec_kernel(pte_t pte)
 #define __pte_to_swp_entry(pte)         ((swp_entry_t) { (pte).pte_low })
 #define __swp_entry_to_pte(x)           ((pte_t) { (x).val })
 
+void vmalloc_sync_all(void);
+
 #endif /* _I386_PGTABLE_2LEVEL_H */
diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h
index f1a8b454920a..36a5aa63cbbf 100644
--- a/include/asm-i386/pgtable-3level.h
+++ b/include/asm-i386/pgtable-3level.h
@@ -152,4 +152,6 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
 
 #define __pmd_free_tlb(tlb, x)          do { } while (0)
 
+#define vmalloc_sync_all() ((void)0)
+
 #endif /* _I386_PGTABLE_3LEVEL_H */
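The asymmetry between these two headers is the design point of the patch.
In this kernel, the 3-level (PAE) layout clones each page table's kernel-area
pgd entries from swapper_pg_dir so that they reference shared kernel pmd
pages; a set_pmd() on init_mm's tables is therefore immediately visible to
every task, and vmalloc_sync_all() can be stubbed out as ((void)0), echoing
the in-diff comment that set_pgd() "would be useless on PAE".  Only the
2-level layout gives every process a private copy of the kernel's top-level
entries, so only it needs the real implementation added to
arch/i386/mm/fault.c, and only non-PAE kernels could hit the recursive-fault
scenario described in the commit message.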