diff options
Diffstat (limited to 'arch/i386/mm')
-rw-r--r-- | arch/i386/mm/fault.c | 210 | ||||
-rw-r--r-- | arch/i386/mm/hugetlbpage.c | 12 | ||||
-rw-r--r-- | arch/i386/mm/init.c | 47 | ||||
-rw-r--r-- | arch/i386/mm/pageattr.c | 20 |
4 files changed, 174 insertions, 115 deletions
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index cf572d9a3b6e..7f0fcf219a26 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c | |||
@@ -214,6 +214,68 @@ static noinline void force_sig_info_fault(int si_signo, int si_code, | |||
214 | 214 | ||
215 | fastcall void do_invalid_op(struct pt_regs *, unsigned long); | 215 | fastcall void do_invalid_op(struct pt_regs *, unsigned long); |
216 | 216 | ||
217 | static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) | ||
218 | { | ||
219 | unsigned index = pgd_index(address); | ||
220 | pgd_t *pgd_k; | ||
221 | pud_t *pud, *pud_k; | ||
222 | pmd_t *pmd, *pmd_k; | ||
223 | |||
224 | pgd += index; | ||
225 | pgd_k = init_mm.pgd + index; | ||
226 | |||
227 | if (!pgd_present(*pgd_k)) | ||
228 | return NULL; | ||
229 | |||
230 | /* | ||
231 | * set_pgd(pgd, *pgd_k); here would be useless on PAE | ||
232 | * and redundant with the set_pmd() on non-PAE. As would | ||
233 | * set_pud. | ||
234 | */ | ||
235 | |||
236 | pud = pud_offset(pgd, address); | ||
237 | pud_k = pud_offset(pgd_k, address); | ||
238 | if (!pud_present(*pud_k)) | ||
239 | return NULL; | ||
240 | |||
241 | pmd = pmd_offset(pud, address); | ||
242 | pmd_k = pmd_offset(pud_k, address); | ||
243 | if (!pmd_present(*pmd_k)) | ||
244 | return NULL; | ||
245 | if (!pmd_present(*pmd)) | ||
246 | set_pmd(pmd, *pmd_k); | ||
247 | else | ||
248 | BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); | ||
249 | return pmd_k; | ||
250 | } | ||
251 | |||
252 | /* | ||
253 | * Handle a fault on the vmalloc or module mapping area | ||
254 | * | ||
255 | * This assumes no large pages in there. | ||
256 | */ | ||
257 | static inline int vmalloc_fault(unsigned long address) | ||
258 | { | ||
259 | unsigned long pgd_paddr; | ||
260 | pmd_t *pmd_k; | ||
261 | pte_t *pte_k; | ||
262 | /* | ||
263 | * Synchronize this task's top level page-table | ||
264 | * with the 'reference' page table. | ||
265 | * | ||
266 | * Do _not_ use "current" here. We might be inside | ||
267 | * an interrupt in the middle of a task switch.. | ||
268 | */ | ||
269 | pgd_paddr = read_cr3(); | ||
270 | pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); | ||
271 | if (!pmd_k) | ||
272 | return -1; | ||
273 | pte_k = pte_offset_kernel(pmd_k, address); | ||
274 | if (!pte_present(*pte_k)) | ||
275 | return -1; | ||
276 | return 0; | ||
277 | } | ||
278 | |||
217 | /* | 279 | /* |
218 | * This routine handles page faults. It determines the address, | 280 | * This routine handles page faults. It determines the address, |
219 | * and the problem, and then passes it off to one of the appropriate | 281 | * and the problem, and then passes it off to one of the appropriate |
@@ -223,6 +285,8 @@ fastcall void do_invalid_op(struct pt_regs *, unsigned long); | |||
223 | * bit 0 == 0 means no page found, 1 means protection fault | 285 | * bit 0 == 0 means no page found, 1 means protection fault |
224 | * bit 1 == 0 means read, 1 means write | 286 | * bit 1 == 0 means read, 1 means write |
225 | * bit 2 == 0 means kernel, 1 means user-mode | 287 | * bit 2 == 0 means kernel, 1 means user-mode |
288 | * bit 3 == 1 means use of reserved bit detected | ||
289 | * bit 4 == 1 means fault was an instruction fetch | ||
226 | */ | 290 | */ |
227 | fastcall void __kprobes do_page_fault(struct pt_regs *regs, | 291 | fastcall void __kprobes do_page_fault(struct pt_regs *regs, |
228 | unsigned long error_code) | 292 | unsigned long error_code) |
@@ -237,13 +301,6 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, | |||
237 | /* get the address */ | 301 | /* get the address */ |
238 | address = read_cr2(); | 302 | address = read_cr2(); |
239 | 303 | ||
240 | if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, | ||
241 | SIGSEGV) == NOTIFY_STOP) | ||
242 | return; | ||
243 | /* It's safe to allow irq's after cr2 has been saved */ | ||
244 | if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) | ||
245 | local_irq_enable(); | ||
246 | |||
247 | tsk = current; | 304 | tsk = current; |
248 | 305 | ||
249 | si_code = SEGV_MAPERR; | 306 | si_code = SEGV_MAPERR; |
@@ -259,17 +316,29 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, | |||
259 | * | 316 | * |
260 | * This verifies that the fault happens in kernel space | 317 | * This verifies that the fault happens in kernel space |
261 | * (error_code & 4) == 0, and that the fault was not a | 318 | * (error_code & 4) == 0, and that the fault was not a |
262 | * protection error (error_code & 1) == 0. | 319 | * protection error (error_code & 9) == 0. |
263 | */ | 320 | */ |
264 | if (unlikely(address >= TASK_SIZE)) { | 321 | if (unlikely(address >= TASK_SIZE)) { |
265 | if (!(error_code & 5)) | 322 | if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0) |
266 | goto vmalloc_fault; | 323 | return; |
267 | /* | 324 | if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, |
325 | SIGSEGV) == NOTIFY_STOP) | ||
326 | return; | ||
327 | /* | ||
268 | * Don't take the mm semaphore here. If we fixup a prefetch | 328 | * Don't take the mm semaphore here. If we fixup a prefetch |
269 | * fault we could otherwise deadlock. | 329 | * fault we could otherwise deadlock. |
270 | */ | 330 | */ |
271 | goto bad_area_nosemaphore; | 331 | goto bad_area_nosemaphore; |
272 | } | 332 | } |
333 | |||
334 | if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, | ||
335 | SIGSEGV) == NOTIFY_STOP) | ||
336 | return; | ||
337 | |||
338 | /* It's safe to allow irq's after cr2 has been saved and the vmalloc | ||
339 | fault has been handled. */ | ||
340 | if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) | ||
341 | local_irq_enable(); | ||
273 | 342 | ||
274 | mm = tsk->mm; | 343 | mm = tsk->mm; |
275 | 344 | ||
@@ -440,24 +509,31 @@ no_context: | |||
440 | 509 | ||
441 | bust_spinlocks(1); | 510 | bust_spinlocks(1); |
442 | 511 | ||
443 | #ifdef CONFIG_X86_PAE | 512 | if (oops_may_print()) { |
444 | if (error_code & 16) { | 513 | #ifdef CONFIG_X86_PAE |
445 | pte_t *pte = lookup_address(address); | 514 | if (error_code & 16) { |
515 | pte_t *pte = lookup_address(address); | ||
446 | 516 | ||
447 | if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) | 517 | if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) |
448 | printk(KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", current->uid); | 518 | printk(KERN_CRIT "kernel tried to execute " |
519 | "NX-protected page - exploit attempt? " | ||
520 | "(uid: %d)\n", current->uid); | ||
521 | } | ||
522 | #endif | ||
523 | if (address < PAGE_SIZE) | ||
524 | printk(KERN_ALERT "BUG: unable to handle kernel NULL " | ||
525 | "pointer dereference"); | ||
526 | else | ||
527 | printk(KERN_ALERT "BUG: unable to handle kernel paging" | ||
528 | " request"); | ||
529 | printk(" at virtual address %08lx\n",address); | ||
530 | printk(KERN_ALERT " printing eip:\n"); | ||
531 | printk("%08lx\n", regs->eip); | ||
449 | } | 532 | } |
450 | #endif | ||
451 | if (address < PAGE_SIZE) | ||
452 | printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); | ||
453 | else | ||
454 | printk(KERN_ALERT "Unable to handle kernel paging request"); | ||
455 | printk(" at virtual address %08lx\n",address); | ||
456 | printk(KERN_ALERT " printing eip:\n"); | ||
457 | printk("%08lx\n", regs->eip); | ||
458 | page = read_cr3(); | 533 | page = read_cr3(); |
459 | page = ((unsigned long *) __va(page))[address >> 22]; | 534 | page = ((unsigned long *) __va(page))[address >> 22]; |
460 | printk(KERN_ALERT "*pde = %08lx\n", page); | 535 | if (oops_may_print()) |
536 | printk(KERN_ALERT "*pde = %08lx\n", page); | ||
461 | /* | 537 | /* |
462 | * We must not directly access the pte in the highpte | 538 | * We must not directly access the pte in the highpte |
463 | * case, the page table might be allocated in highmem. | 539 | * case, the page table might be allocated in highmem. |
@@ -465,7 +541,7 @@ no_context: | |||
465 | * it's allocated already. | 541 | * it's allocated already. |
466 | */ | 542 | */ |
467 | #ifndef CONFIG_HIGHPTE | 543 | #ifndef CONFIG_HIGHPTE |
468 | if (page & 1) { | 544 | if ((page & 1) && oops_may_print()) { |
469 | page &= PAGE_MASK; | 545 | page &= PAGE_MASK; |
470 | address &= 0x003ff000; | 546 | address &= 0x003ff000; |
471 | page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; | 547 | page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; |
@@ -510,51 +586,41 @@ do_sigbus: | |||
510 | tsk->thread.error_code = error_code; | 586 | tsk->thread.error_code = error_code; |
511 | tsk->thread.trap_no = 14; | 587 | tsk->thread.trap_no = 14; |
512 | force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); | 588 | force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); |
513 | return; | 589 | } |
514 | |||
515 | vmalloc_fault: | ||
516 | { | ||
517 | /* | ||
518 | * Synchronize this task's top level page-table | ||
519 | * with the 'reference' page table. | ||
520 | * | ||
521 | * Do _not_ use "tsk" here. We might be inside | ||
522 | * an interrupt in the middle of a task switch.. | ||
523 | */ | ||
524 | int index = pgd_index(address); | ||
525 | unsigned long pgd_paddr; | ||
526 | pgd_t *pgd, *pgd_k; | ||
527 | pud_t *pud, *pud_k; | ||
528 | pmd_t *pmd, *pmd_k; | ||
529 | pte_t *pte_k; | ||
530 | |||
531 | pgd_paddr = read_cr3(); | ||
532 | pgd = index + (pgd_t *)__va(pgd_paddr); | ||
533 | pgd_k = init_mm.pgd + index; | ||
534 | |||
535 | if (!pgd_present(*pgd_k)) | ||
536 | goto no_context; | ||
537 | |||
538 | /* | ||
539 | * set_pgd(pgd, *pgd_k); here would be useless on PAE | ||
540 | * and redundant with the set_pmd() on non-PAE. As would | ||
541 | * set_pud. | ||
542 | */ | ||
543 | 590 | ||
544 | pud = pud_offset(pgd, address); | 591 | #ifndef CONFIG_X86_PAE |
545 | pud_k = pud_offset(pgd_k, address); | 592 | void vmalloc_sync_all(void) |
546 | if (!pud_present(*pud_k)) | 593 | { |
547 | goto no_context; | 594 | /* |
548 | 595 | * Note that races in the updates of insync and start aren't | |
549 | pmd = pmd_offset(pud, address); | 596 | * problematic: insync can only get set bits added, and updates to |
550 | pmd_k = pmd_offset(pud_k, address); | 597 | * start are only improving performance (without affecting correctness |
551 | if (!pmd_present(*pmd_k)) | 598 | * if undone). |
552 | goto no_context; | 599 | */ |
553 | set_pmd(pmd, *pmd_k); | 600 | static DECLARE_BITMAP(insync, PTRS_PER_PGD); |
601 | static unsigned long start = TASK_SIZE; | ||
602 | unsigned long address; | ||
554 | 603 | ||
555 | pte_k = pte_offset_kernel(pmd_k, address); | 604 | BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); |
556 | if (!pte_present(*pte_k)) | 605 | for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { |
557 | goto no_context; | 606 | if (!test_bit(pgd_index(address), insync)) { |
558 | return; | 607 | unsigned long flags; |
608 | struct page *page; | ||
609 | |||
610 | spin_lock_irqsave(&pgd_lock, flags); | ||
611 | for (page = pgd_list; page; page = | ||
612 | (struct page *)page->index) | ||
613 | if (!vmalloc_sync_one(page_address(page), | ||
614 | address)) { | ||
615 | BUG_ON(page != pgd_list); | ||
616 | break; | ||
617 | } | ||
618 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
619 | if (!page) | ||
620 | set_bit(pgd_index(address), insync); | ||
621 | } | ||
622 | if (address == start && test_bit(pgd_index(address), insync)) | ||
623 | start = address + PGDIR_SIZE; | ||
559 | } | 624 | } |
560 | } | 625 | } |
626 | #endif | ||
diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c index d524127c9afc..a7d891585411 100644 --- a/arch/i386/mm/hugetlbpage.c +++ b/arch/i386/mm/hugetlbpage.c | |||
@@ -48,18 +48,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | |||
48 | return (pte_t *) pmd; | 48 | return (pte_t *) pmd; |
49 | } | 49 | } |
50 | 50 | ||
51 | /* | ||
52 | * This function checks for proper alignment of input addr and len parameters. | ||
53 | */ | ||
54 | int is_aligned_hugepage_range(unsigned long addr, unsigned long len) | ||
55 | { | ||
56 | if (len & ~HPAGE_MASK) | ||
57 | return -EINVAL; | ||
58 | if (addr & ~HPAGE_MASK) | ||
59 | return -EINVAL; | ||
60 | return 0; | ||
61 | } | ||
62 | |||
63 | #if 0 /* This is just for testing */ | 51 | #if 0 /* This is just for testing */ |
64 | struct page * | 52 | struct page * |
65 | follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) | 53 | follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) |
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 2700f01994ba..9f66ac582a8b 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c | |||
@@ -270,7 +270,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) | |||
270 | 270 | ||
271 | static void __meminit free_new_highpage(struct page *page) | 271 | static void __meminit free_new_highpage(struct page *page) |
272 | { | 272 | { |
273 | set_page_count(page, 1); | 273 | init_page_count(page); |
274 | __free_page(page); | 274 | __free_page(page); |
275 | totalhigh_pages++; | 275 | totalhigh_pages++; |
276 | } | 276 | } |
@@ -720,21 +720,6 @@ static int noinline do_test_wp_bit(void) | |||
720 | return flag; | 720 | return flag; |
721 | } | 721 | } |
722 | 722 | ||
723 | void free_initmem(void) | ||
724 | { | ||
725 | unsigned long addr; | ||
726 | |||
727 | addr = (unsigned long)(&__init_begin); | ||
728 | for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { | ||
729 | ClearPageReserved(virt_to_page(addr)); | ||
730 | set_page_count(virt_to_page(addr), 1); | ||
731 | memset((void *)addr, 0xcc, PAGE_SIZE); | ||
732 | free_page(addr); | ||
733 | totalram_pages++; | ||
734 | } | ||
735 | printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", (__init_end - __init_begin) >> 10); | ||
736 | } | ||
737 | |||
738 | #ifdef CONFIG_DEBUG_RODATA | 723 | #ifdef CONFIG_DEBUG_RODATA |
739 | 724 | ||
740 | extern char __start_rodata, __end_rodata; | 725 | extern char __start_rodata, __end_rodata; |
@@ -758,17 +743,31 @@ void mark_rodata_ro(void) | |||
758 | } | 743 | } |
759 | #endif | 744 | #endif |
760 | 745 | ||
746 | void free_init_pages(char *what, unsigned long begin, unsigned long end) | ||
747 | { | ||
748 | unsigned long addr; | ||
749 | |||
750 | for (addr = begin; addr < end; addr += PAGE_SIZE) { | ||
751 | ClearPageReserved(virt_to_page(addr)); | ||
752 | init_page_count(virt_to_page(addr)); | ||
753 | memset((void *)addr, 0xcc, PAGE_SIZE); | ||
754 | free_page(addr); | ||
755 | totalram_pages++; | ||
756 | } | ||
757 | printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); | ||
758 | } | ||
759 | |||
760 | void free_initmem(void) | ||
761 | { | ||
762 | free_init_pages("unused kernel memory", | ||
763 | (unsigned long)(&__init_begin), | ||
764 | (unsigned long)(&__init_end)); | ||
765 | } | ||
761 | 766 | ||
762 | #ifdef CONFIG_BLK_DEV_INITRD | 767 | #ifdef CONFIG_BLK_DEV_INITRD |
763 | void free_initrd_mem(unsigned long start, unsigned long end) | 768 | void free_initrd_mem(unsigned long start, unsigned long end) |
764 | { | 769 | { |
765 | if (start < end) | 770 | free_init_pages("initrd memory", start, end); |
766 | printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); | ||
767 | for (; start < end; start += PAGE_SIZE) { | ||
768 | ClearPageReserved(virt_to_page(start)); | ||
769 | set_page_count(virt_to_page(start), 1); | ||
770 | free_page(start); | ||
771 | totalram_pages++; | ||
772 | } | ||
773 | } | 771 | } |
774 | #endif | 772 | #endif |
773 | |||
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index d0cadb33b54c..92c3d9f0e731 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c | |||
@@ -51,6 +51,13 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, | |||
51 | if (!base) | 51 | if (!base) |
52 | return NULL; | 52 | return NULL; |
53 | 53 | ||
54 | /* | ||
55 | * page_private is used to track the number of entries in | ||
56 | * the page table page that have non standard attributes. | ||
57 | */ | ||
58 | SetPagePrivate(base); | ||
59 | page_private(base) = 0; | ||
60 | |||
54 | address = __pa(address); | 61 | address = __pa(address); |
55 | addr = address & LARGE_PAGE_MASK; | 62 | addr = address & LARGE_PAGE_MASK; |
56 | pbase = (pte_t *)page_address(base); | 63 | pbase = (pte_t *)page_address(base); |
@@ -143,11 +150,12 @@ __change_page_attr(struct page *page, pgprot_t prot) | |||
143 | return -ENOMEM; | 150 | return -ENOMEM; |
144 | set_pmd_pte(kpte,address,mk_pte(split, ref_prot)); | 151 | set_pmd_pte(kpte,address,mk_pte(split, ref_prot)); |
145 | kpte_page = split; | 152 | kpte_page = split; |
146 | } | 153 | } |
147 | get_page(kpte_page); | 154 | page_private(kpte_page)++; |
148 | } else if ((pte_val(*kpte) & _PAGE_PSE) == 0) { | 155 | } else if ((pte_val(*kpte) & _PAGE_PSE) == 0) { |
149 | set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL)); | 156 | set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL)); |
150 | __put_page(kpte_page); | 157 | BUG_ON(page_private(kpte_page) == 0); |
158 | page_private(kpte_page)--; | ||
151 | } else | 159 | } else |
152 | BUG(); | 160 | BUG(); |
153 | 161 | ||
@@ -157,10 +165,8 @@ __change_page_attr(struct page *page, pgprot_t prot) | |||
157 | * replace it with a largepage. | 165 | * replace it with a largepage. |
158 | */ | 166 | */ |
159 | if (!PageReserved(kpte_page)) { | 167 | if (!PageReserved(kpte_page)) { |
160 | /* memleak and potential failed 2M page regeneration */ | 168 | if (cpu_has_pse && (page_private(kpte_page) == 0)) { |
161 | BUG_ON(!page_count(kpte_page)); | 169 | ClearPagePrivate(kpte_page); |
162 | |||
163 | if (cpu_has_pse && (page_count(kpte_page) == 1)) { | ||
164 | list_add(&kpte_page->lru, &df_list); | 170 | list_add(&kpte_page->lru, &df_list); |
165 | revert_page(kpte_page, address); | 171 | revert_page(kpte_page, address); |
166 | } | 172 | } |