aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJan Beulich <jbeulich@novell.com>2006-03-25 10:29:40 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-03-25 12:10:53 -0500
commit8c914cb704a11460eec7ed2a572bb5e9bd513d24 (patch)
tree3d735f0e33f474b296f106dee70935d77e267a74
parent85f9eebccde51e24896f31383f5b70776362e1a6 (diff)
[PATCH] x86_64: actively synchronize vmalloc area when registering certain callbacks
While the modular aspect of the respective i386 patch doesn't apply to x86-64 (as the top level page directory entry is shared between modules and the base kernel), handlers registered with register_die_notifier() are still under similar constraints for touching ioremap()ed or vmalloc()ed memory. The likelihood of this problem becoming visible is of course significantly lower, as the assigned virtual addresses would have to cross a 2**39 byte boundary. This is because the callback gets invoked (a) in the page fault path before the top level page table propagation gets carried out (hence a fault to propagate the top level page table entry/entries mapping to module's code/data would nest infinitely) and (b) in the NMI path, where nested faults must absolutely not happen, since otherwise the IRET from the nested fault re-enables NMIs, potentially resulting in nested NMI occurrences. Signed-off-by: Jan Beulich <jbeulich@novell.com> Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--arch/x86_64/kernel/nmi.c1
-rw-r--r--arch/x86_64/kernel/traps.c2
-rw-r--r--arch/x86_64/mm/fault.c73
-rw-r--r--include/asm-x86_64/pgalloc.h28
-rw-r--r--include/asm-x86_64/pgtable.h4
5 files changed, 95 insertions, 13 deletions
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 66c009e10bac..d9e4067faf05 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -534,6 +534,7 @@ asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code)
534 534
535void set_nmi_callback(nmi_callback_t callback) 535void set_nmi_callback(nmi_callback_t callback)
536{ 536{
537 vmalloc_sync_all();
537 rcu_assign_pointer(nmi_callback, callback); 538 rcu_assign_pointer(nmi_callback, callback);
538} 539}
539 540
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 28d50dc540e8..b25bc904d42d 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -78,6 +78,8 @@ int register_die_notifier(struct notifier_block *nb)
78{ 78{
79 int err = 0; 79 int err = 0;
80 unsigned long flags; 80 unsigned long flags;
81
82 vmalloc_sync_all();
81 spin_lock_irqsave(&die_notifier_lock, flags); 83 spin_lock_irqsave(&die_notifier_lock, flags);
82 err = notifier_chain_register(&die_chain, nb); 84 err = notifier_chain_register(&die_chain, nb);
83 spin_unlock_irqrestore(&die_notifier_lock, flags); 85 spin_unlock_irqrestore(&die_notifier_lock, flags);
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 2e7c3c8ffe03..de91e17daf6f 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -264,6 +264,8 @@ static int vmalloc_fault(unsigned long address)
264 return -1; 264 return -1;
265 if (pgd_none(*pgd)) 265 if (pgd_none(*pgd))
266 set_pgd(pgd, *pgd_ref); 266 set_pgd(pgd, *pgd_ref);
267 else
268 BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
267 269
268 /* Below here mismatches are bugs because these lower tables 270 /* Below here mismatches are bugs because these lower tables
269 are shared */ 271 are shared */
@@ -314,16 +316,6 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
314 316
315 /* get the address */ 317 /* get the address */
316 __asm__("movq %%cr2,%0":"=r" (address)); 318 __asm__("movq %%cr2,%0":"=r" (address));
317 if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
318 SIGSEGV) == NOTIFY_STOP)
319 return;
320
321 if (likely(regs->eflags & X86_EFLAGS_IF))
322 local_irq_enable();
323
324 if (unlikely(page_fault_trace))
325 printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
326 regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
327 319
328 tsk = current; 320 tsk = current;
329 mm = tsk->mm; 321 mm = tsk->mm;
@@ -351,10 +343,12 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
351 */ 343 */
352 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && 344 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
353 ((address >= VMALLOC_START && address < VMALLOC_END))) { 345 ((address >= VMALLOC_START && address < VMALLOC_END))) {
354 if (vmalloc_fault(address) < 0) 346 if (vmalloc_fault(address) >= 0)
355 goto bad_area_nosemaphore; 347 return;
356 return;
357 } 348 }
349 if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
350 SIGSEGV) == NOTIFY_STOP)
351 return;
358 /* 352 /*
359 * Don't take the mm semaphore here. If we fixup a prefetch 353 * Don't take the mm semaphore here. If we fixup a prefetch
360 * fault we could otherwise deadlock. 354 * fault we could otherwise deadlock.
@@ -362,6 +356,17 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
362 goto bad_area_nosemaphore; 356 goto bad_area_nosemaphore;
363 } 357 }
364 358
359 if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
360 SIGSEGV) == NOTIFY_STOP)
361 return;
362
363 if (likely(regs->eflags & X86_EFLAGS_IF))
364 local_irq_enable();
365
366 if (unlikely(page_fault_trace))
367 printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
368 regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
369
365 if (unlikely(error_code & PF_RSVD)) 370 if (unlikely(error_code & PF_RSVD))
366 pgtable_bad(address, regs, error_code); 371 pgtable_bad(address, regs, error_code);
367 372
@@ -571,6 +576,48 @@ do_sigbus:
571 return; 576 return;
572} 577}
573 578
579DEFINE_SPINLOCK(pgd_lock);
580struct page *pgd_list;
581
582void vmalloc_sync_all(void)
583{
584 /* Note that races in the updates of insync and start aren't
585 problematic:
586 insync can only get set bits added, and updates to start are only
587 improving performance (without affecting correctness if undone). */
588 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
589 static unsigned long start = VMALLOC_START & PGDIR_MASK;
590 unsigned long address;
591
592 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
593 if (!test_bit(pgd_index(address), insync)) {
594 const pgd_t *pgd_ref = pgd_offset_k(address);
595 struct page *page;
596
597 if (pgd_none(*pgd_ref))
598 continue;
599 spin_lock(&pgd_lock);
600 for (page = pgd_list; page;
601 page = (struct page *)page->index) {
602 pgd_t *pgd;
603 pgd = (pgd_t *)page_address(page) + pgd_index(address);
604 if (pgd_none(*pgd))
605 set_pgd(pgd, *pgd_ref);
606 else
607 BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
608 }
609 spin_unlock(&pgd_lock);
610 set_bit(pgd_index(address), insync);
611 }
612 if (address == start)
613 start = address + PGDIR_SIZE;
614 }
615 /* Check that there is no need to do the same for the modules area. */
616 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
617 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
618 (__START_KERNEL & PGDIR_MASK)));
619}
620
574static int __init enable_pagefaulttrace(char *str) 621static int __init enable_pagefaulttrace(char *str)
575{ 622{
576 page_fault_trace = 1; 623 page_fault_trace = 1;
diff --git a/include/asm-x86_64/pgalloc.h b/include/asm-x86_64/pgalloc.h
index 08cad2482bcb..43d4c333a8b1 100644
--- a/include/asm-x86_64/pgalloc.h
+++ b/include/asm-x86_64/pgalloc.h
@@ -45,12 +45,39 @@ static inline void pud_free (pud_t *pud)
45 free_page((unsigned long)pud); 45 free_page((unsigned long)pud);
46} 46}
47 47
48static inline void pgd_list_add(pgd_t *pgd)
49{
50 struct page *page = virt_to_page(pgd);
51
52 spin_lock(&pgd_lock);
53 page->index = (pgoff_t)pgd_list;
54 if (pgd_list)
55 pgd_list->private = (unsigned long)&page->index;
56 pgd_list = page;
57 page->private = (unsigned long)&pgd_list;
58 spin_unlock(&pgd_lock);
59}
60
61static inline void pgd_list_del(pgd_t *pgd)
62{
63 struct page *next, **pprev, *page = virt_to_page(pgd);
64
65 spin_lock(&pgd_lock);
66 next = (struct page *)page->index;
67 pprev = (struct page **)page->private;
68 *pprev = next;
69 if (next)
70 next->private = (unsigned long)pprev;
71 spin_unlock(&pgd_lock);
72}
73
48static inline pgd_t *pgd_alloc(struct mm_struct *mm) 74static inline pgd_t *pgd_alloc(struct mm_struct *mm)
49{ 75{
50 unsigned boundary; 76 unsigned boundary;
51 pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); 77 pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
52 if (!pgd) 78 if (!pgd)
53 return NULL; 79 return NULL;
80 pgd_list_add(pgd);
54 /* 81 /*
55 * Copy kernel pointers in from init. 82 * Copy kernel pointers in from init.
56 * Could keep a freelist or slab cache of those because the kernel 83 * Could keep a freelist or slab cache of those because the kernel
@@ -67,6 +94,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
67static inline void pgd_free(pgd_t *pgd) 94static inline void pgd_free(pgd_t *pgd)
68{ 95{
69 BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); 96 BUG_ON((unsigned long)pgd & (PAGE_SIZE-1));
97 pgd_list_del(pgd);
70 free_page((unsigned long)pgd); 98 free_page((unsigned long)pgd);
71} 99}
72 100
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index def903287193..31e83c3bd022 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -420,6 +420,10 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
420#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) 420#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
421#define __swp_entry_to_pte(x) ((pte_t) { (x).val }) 421#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
422 422
423extern spinlock_t pgd_lock;
424extern struct page *pgd_list;
425void vmalloc_sync_all(void);
426
423#endif /* !__ASSEMBLY__ */ 427#endif /* !__ASSEMBLY__ */
424 428
425extern int kern_addr_valid(unsigned long addr); 429extern int kern_addr_valid(unsigned long addr);