diff options
author | Jan Beulich <jbeulich@novell.com> | 2006-03-23 05:59:45 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-03-23 10:38:05 -0500 |
commit | 101f12af16fb12f8da8100899a13ee1b1b576a0a (patch) | |
tree | 0bea73d2702ba438e8e82bc8000b498aa50aee6e | |
parent | 44fd22992cb76dc51c52cf4b8aff1bc7899bb23c (diff) |
[PATCH] i386: actively synchronize vmalloc area when registering certain callbacks
Registering a callback handler through register_die_notifier() is obviously
primarily intended for use by modules. However, the way these currently
get called it is basically impossible for them to actually be used by
modules, as there is, on non-PAE configurationes, a good chance (the larger
the module, the better) for the system to crash as a result.
This is because the callback gets invoked
(a) in the page fault path before the top level page table propagation
gets carried out (hence a fault to propagate the top level page table
entry/entries mapping to module's code/data would nest infinitly) and
(b) in the NMI path, where nested faults must absolutely not happen,
since otherwise the IRET from the nested fault re-enables NMIs,
potentially resulting in nested NMI occurences.
Besides the modular aspect, similar problems would even arise for in-
kernel consumers of the API if they touched ioremap()ed or vmalloc()ed
memory inside their handlers.
Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- | arch/i386/kernel/traps.c | 3 | ||||
-rw-r--r-- | arch/i386/mm/fault.c | 173 | ||||
-rw-r--r-- | include/asm-i386/pgtable-2level.h | 2 | ||||
-rw-r--r-- | include/asm-i386/pgtable-3level.h | 2 |
4 files changed, 123 insertions, 57 deletions
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index f20797b8da1d..d510de7e4f2a 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c | |||
@@ -99,6 +99,8 @@ int register_die_notifier(struct notifier_block *nb) | |||
99 | { | 99 | { |
100 | int err = 0; | 100 | int err = 0; |
101 | unsigned long flags; | 101 | unsigned long flags; |
102 | |||
103 | vmalloc_sync_all(); | ||
102 | spin_lock_irqsave(&die_notifier_lock, flags); | 104 | spin_lock_irqsave(&die_notifier_lock, flags); |
103 | err = notifier_chain_register(&i386die_chain, nb); | 105 | err = notifier_chain_register(&i386die_chain, nb); |
104 | spin_unlock_irqrestore(&die_notifier_lock, flags); | 106 | spin_unlock_irqrestore(&die_notifier_lock, flags); |
@@ -713,6 +715,7 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code) | |||
713 | 715 | ||
714 | void set_nmi_callback(nmi_callback_t callback) | 716 | void set_nmi_callback(nmi_callback_t callback) |
715 | { | 717 | { |
718 | vmalloc_sync_all(); | ||
716 | rcu_assign_pointer(nmi_callback, callback); | 719 | rcu_assign_pointer(nmi_callback, callback); |
717 | } | 720 | } |
718 | EXPORT_SYMBOL_GPL(set_nmi_callback); | 721 | EXPORT_SYMBOL_GPL(set_nmi_callback); |
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index cf572d9a3b6e..bbb24af5d860 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c | |||
@@ -214,6 +214,68 @@ static noinline void force_sig_info_fault(int si_signo, int si_code, | |||
214 | 214 | ||
215 | fastcall void do_invalid_op(struct pt_regs *, unsigned long); | 215 | fastcall void do_invalid_op(struct pt_regs *, unsigned long); |
216 | 216 | ||
217 | static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) | ||
218 | { | ||
219 | unsigned index = pgd_index(address); | ||
220 | pgd_t *pgd_k; | ||
221 | pud_t *pud, *pud_k; | ||
222 | pmd_t *pmd, *pmd_k; | ||
223 | |||
224 | pgd += index; | ||
225 | pgd_k = init_mm.pgd + index; | ||
226 | |||
227 | if (!pgd_present(*pgd_k)) | ||
228 | return NULL; | ||
229 | |||
230 | /* | ||
231 | * set_pgd(pgd, *pgd_k); here would be useless on PAE | ||
232 | * and redundant with the set_pmd() on non-PAE. As would | ||
233 | * set_pud. | ||
234 | */ | ||
235 | |||
236 | pud = pud_offset(pgd, address); | ||
237 | pud_k = pud_offset(pgd_k, address); | ||
238 | if (!pud_present(*pud_k)) | ||
239 | return NULL; | ||
240 | |||
241 | pmd = pmd_offset(pud, address); | ||
242 | pmd_k = pmd_offset(pud_k, address); | ||
243 | if (!pmd_present(*pmd_k)) | ||
244 | return NULL; | ||
245 | if (!pmd_present(*pmd)) | ||
246 | set_pmd(pmd, *pmd_k); | ||
247 | else | ||
248 | BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); | ||
249 | return pmd_k; | ||
250 | } | ||
251 | |||
252 | /* | ||
253 | * Handle a fault on the vmalloc or module mapping area | ||
254 | * | ||
255 | * This assumes no large pages in there. | ||
256 | */ | ||
257 | static inline int vmalloc_fault(unsigned long address) | ||
258 | { | ||
259 | unsigned long pgd_paddr; | ||
260 | pmd_t *pmd_k; | ||
261 | pte_t *pte_k; | ||
262 | /* | ||
263 | * Synchronize this task's top level page-table | ||
264 | * with the 'reference' page table. | ||
265 | * | ||
266 | * Do _not_ use "current" here. We might be inside | ||
267 | * an interrupt in the middle of a task switch.. | ||
268 | */ | ||
269 | pgd_paddr = read_cr3(); | ||
270 | pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); | ||
271 | if (!pmd_k) | ||
272 | return -1; | ||
273 | pte_k = pte_offset_kernel(pmd_k, address); | ||
274 | if (!pte_present(*pte_k)) | ||
275 | return -1; | ||
276 | return 0; | ||
277 | } | ||
278 | |||
217 | /* | 279 | /* |
218 | * This routine handles page faults. It determines the address, | 280 | * This routine handles page faults. It determines the address, |
219 | * and the problem, and then passes it off to one of the appropriate | 281 | * and the problem, and then passes it off to one of the appropriate |
@@ -223,6 +285,8 @@ fastcall void do_invalid_op(struct pt_regs *, unsigned long); | |||
223 | * bit 0 == 0 means no page found, 1 means protection fault | 285 | * bit 0 == 0 means no page found, 1 means protection fault |
224 | * bit 1 == 0 means read, 1 means write | 286 | * bit 1 == 0 means read, 1 means write |
225 | * bit 2 == 0 means kernel, 1 means user-mode | 287 | * bit 2 == 0 means kernel, 1 means user-mode |
288 | * bit 3 == 1 means use of reserved bit detected | ||
289 | * bit 4 == 1 means fault was an instruction fetch | ||
226 | */ | 290 | */ |
227 | fastcall void __kprobes do_page_fault(struct pt_regs *regs, | 291 | fastcall void __kprobes do_page_fault(struct pt_regs *regs, |
228 | unsigned long error_code) | 292 | unsigned long error_code) |
@@ -237,13 +301,6 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, | |||
237 | /* get the address */ | 301 | /* get the address */ |
238 | address = read_cr2(); | 302 | address = read_cr2(); |
239 | 303 | ||
240 | if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, | ||
241 | SIGSEGV) == NOTIFY_STOP) | ||
242 | return; | ||
243 | /* It's safe to allow irq's after cr2 has been saved */ | ||
244 | if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) | ||
245 | local_irq_enable(); | ||
246 | |||
247 | tsk = current; | 304 | tsk = current; |
248 | 305 | ||
249 | si_code = SEGV_MAPERR; | 306 | si_code = SEGV_MAPERR; |
@@ -259,17 +316,29 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, | |||
259 | * | 316 | * |
260 | * This verifies that the fault happens in kernel space | 317 | * This verifies that the fault happens in kernel space |
261 | * (error_code & 4) == 0, and that the fault was not a | 318 | * (error_code & 4) == 0, and that the fault was not a |
262 | * protection error (error_code & 1) == 0. | 319 | * protection error (error_code & 9) == 0. |
263 | */ | 320 | */ |
264 | if (unlikely(address >= TASK_SIZE)) { | 321 | if (unlikely(address >= TASK_SIZE)) { |
265 | if (!(error_code & 5)) | 322 | if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0) |
266 | goto vmalloc_fault; | 323 | return; |
267 | /* | 324 | if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, |
325 | SIGSEGV) == NOTIFY_STOP) | ||
326 | return; | ||
327 | /* | ||
268 | * Don't take the mm semaphore here. If we fixup a prefetch | 328 | * Don't take the mm semaphore here. If we fixup a prefetch |
269 | * fault we could otherwise deadlock. | 329 | * fault we could otherwise deadlock. |
270 | */ | 330 | */ |
271 | goto bad_area_nosemaphore; | 331 | goto bad_area_nosemaphore; |
272 | } | 332 | } |
333 | |||
334 | if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, | ||
335 | SIGSEGV) == NOTIFY_STOP) | ||
336 | return; | ||
337 | |||
338 | /* It's safe to allow irq's after cr2 has been saved and the vmalloc | ||
339 | fault has been handled. */ | ||
340 | if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) | ||
341 | local_irq_enable(); | ||
273 | 342 | ||
274 | mm = tsk->mm; | 343 | mm = tsk->mm; |
275 | 344 | ||
@@ -510,51 +579,41 @@ do_sigbus: | |||
510 | tsk->thread.error_code = error_code; | 579 | tsk->thread.error_code = error_code; |
511 | tsk->thread.trap_no = 14; | 580 | tsk->thread.trap_no = 14; |
512 | force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); | 581 | force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); |
513 | return; | 582 | } |
514 | |||
515 | vmalloc_fault: | ||
516 | { | ||
517 | /* | ||
518 | * Synchronize this task's top level page-table | ||
519 | * with the 'reference' page table. | ||
520 | * | ||
521 | * Do _not_ use "tsk" here. We might be inside | ||
522 | * an interrupt in the middle of a task switch.. | ||
523 | */ | ||
524 | int index = pgd_index(address); | ||
525 | unsigned long pgd_paddr; | ||
526 | pgd_t *pgd, *pgd_k; | ||
527 | pud_t *pud, *pud_k; | ||
528 | pmd_t *pmd, *pmd_k; | ||
529 | pte_t *pte_k; | ||
530 | |||
531 | pgd_paddr = read_cr3(); | ||
532 | pgd = index + (pgd_t *)__va(pgd_paddr); | ||
533 | pgd_k = init_mm.pgd + index; | ||
534 | |||
535 | if (!pgd_present(*pgd_k)) | ||
536 | goto no_context; | ||
537 | |||
538 | /* | ||
539 | * set_pgd(pgd, *pgd_k); here would be useless on PAE | ||
540 | * and redundant with the set_pmd() on non-PAE. As would | ||
541 | * set_pud. | ||
542 | */ | ||
543 | 583 | ||
544 | pud = pud_offset(pgd, address); | 584 | #ifndef CONFIG_X86_PAE |
545 | pud_k = pud_offset(pgd_k, address); | 585 | void vmalloc_sync_all(void) |
546 | if (!pud_present(*pud_k)) | 586 | { |
547 | goto no_context; | 587 | /* |
548 | 588 | * Note that races in the updates of insync and start aren't | |
549 | pmd = pmd_offset(pud, address); | 589 | * problematic: insync can only get set bits added, and updates to |
550 | pmd_k = pmd_offset(pud_k, address); | 590 | * start are only improving performance (without affecting correctness |
551 | if (!pmd_present(*pmd_k)) | 591 | * if undone). |
552 | goto no_context; | 592 | */ |
553 | set_pmd(pmd, *pmd_k); | 593 | static DECLARE_BITMAP(insync, PTRS_PER_PGD); |
594 | static unsigned long start = TASK_SIZE; | ||
595 | unsigned long address; | ||
554 | 596 | ||
555 | pte_k = pte_offset_kernel(pmd_k, address); | 597 | BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); |
556 | if (!pte_present(*pte_k)) | 598 | for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { |
557 | goto no_context; | 599 | if (!test_bit(pgd_index(address), insync)) { |
558 | return; | 600 | unsigned long flags; |
601 | struct page *page; | ||
602 | |||
603 | spin_lock_irqsave(&pgd_lock, flags); | ||
604 | for (page = pgd_list; page; page = | ||
605 | (struct page *)page->index) | ||
606 | if (!vmalloc_sync_one(page_address(page), | ||
607 | address)) { | ||
608 | BUG_ON(page != pgd_list); | ||
609 | break; | ||
610 | } | ||
611 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
612 | if (!page) | ||
613 | set_bit(pgd_index(address), insync); | ||
614 | } | ||
615 | if (address == start && test_bit(pgd_index(address), insync)) | ||
616 | start = address + PGDIR_SIZE; | ||
559 | } | 617 | } |
560 | } | 618 | } |
619 | #endif | ||
diff --git a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h index 74ef721b534d..27bde973abc7 100644 --- a/include/asm-i386/pgtable-2level.h +++ b/include/asm-i386/pgtable-2level.h | |||
@@ -61,4 +61,6 @@ static inline int pte_exec_kernel(pte_t pte) | |||
61 | #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) | 61 | #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) |
62 | #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) | 62 | #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) |
63 | 63 | ||
64 | void vmalloc_sync_all(void); | ||
65 | |||
64 | #endif /* _I386_PGTABLE_2LEVEL_H */ | 66 | #endif /* _I386_PGTABLE_2LEVEL_H */ |
diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h index f1a8b454920a..36a5aa63cbbf 100644 --- a/include/asm-i386/pgtable-3level.h +++ b/include/asm-i386/pgtable-3level.h | |||
@@ -152,4 +152,6 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) | |||
152 | 152 | ||
153 | #define __pmd_free_tlb(tlb, x) do { } while (0) | 153 | #define __pmd_free_tlb(tlb, x) do { } while (0) |
154 | 154 | ||
155 | #define vmalloc_sync_all() ((void)0) | ||
156 | |||
155 | #endif /* _I386_PGTABLE_3LEVEL_H */ | 157 | #endif /* _I386_PGTABLE_3LEVEL_H */ |