aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNick Piggin <npiggin@suse.de>2009-01-19 22:24:26 -0500
committerIngo Molnar <mingo@elte.hu>2009-01-20 07:14:23 -0500
commit92181f190b649f7ef2b79cbf5c00f26ccc66da2a (patch)
treec5c15a4d998d8f21dfa389559408268f917696ac
parent0ce1c383681370964e7f77dd44506aeb3a6ba657 (diff)
x86: optimise x86's do_page_fault (C entry point for the page fault path)
Impact: cleanup, restructure code to improve assembly. gcc isn't _all_ that smart about spilling registers to stack or reusing stack slots, even with branch annotations. do_page_fault contained a lot of functionality, so split unlikely paths into their own functions, and mark them as noinline just to be sure. I consider this actually to be somewhat of a cleanup too: the main function now contains about half the number of lines so the normal path is easier to read, while the error cases are also nicely split away. Also, ensure the order of arguments to functions is always the same: regs, error_code, addr. This can reduce code size a tiny bit, and just looks neater too. And add a couple of branch annotations. Before: do_page_fault: subq $360, %rsp #, After: do_page_fault: subq $56, %rsp #, bloat-o-meter: add/remove: 8/0 grow/shrink: 0/1 up/down: 2222/-1680 (542) function old new delta __bad_area_nosemaphore - 506 +506 no_context - 474 +474 vmalloc_fault - 424 +424 spurious_fault - 358 +358 mm_fault_error - 272 +272 bad_area_access_error - 89 +89 bad_area - 89 +89 bad_area_nosemaphore - 10 +10 do_page_fault 2464 784 -1680 Yes, the total size increases by 542 bytes, due to the extra function calls. But these will very rarely be called (except for vmalloc_fault) in a normal workload. Importantly, do_page_fault is less than 1/3rd its original size, and touches far less stack. Existing gotos and branch hints did move a lot of the infrequently used text out of the fastpath, but that's even further improved after this patch. Signed-off-by: Nick Piggin <npiggin@suse.de> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--arch/x86/mm/fault.c438
1 files changed, 256 insertions, 182 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 90dfae511a41..033292dc9e21 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -91,8 +91,8 @@ static inline int notify_page_fault(struct pt_regs *regs)
91 * 91 *
92 * Opcode checker based on code by Richard Brunner 92 * Opcode checker based on code by Richard Brunner
93 */ 93 */
94static int is_prefetch(struct pt_regs *regs, unsigned long addr, 94static int is_prefetch(struct pt_regs *regs, unsigned long error_code,
95 unsigned long error_code) 95 unsigned long addr)
96{ 96{
97 unsigned char *instr; 97 unsigned char *instr;
98 int scan_more = 1; 98 int scan_more = 1;
@@ -409,15 +409,15 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
409} 409}
410 410
411#ifdef CONFIG_X86_64 411#ifdef CONFIG_X86_64
412static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, 412static noinline void pgtable_bad(struct pt_regs *regs,
413 unsigned long error_code) 413 unsigned long error_code, unsigned long address)
414{ 414{
415 unsigned long flags = oops_begin(); 415 unsigned long flags = oops_begin();
416 int sig = SIGKILL; 416 int sig = SIGKILL;
417 struct task_struct *tsk; 417 struct task_struct *tsk = current;
418 418
419 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", 419 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
420 current->comm, address); 420 tsk->comm, address);
421 dump_pagetable(address); 421 dump_pagetable(address);
422 tsk = current; 422 tsk = current;
423 tsk->thread.cr2 = address; 423 tsk->thread.cr2 = address;
@@ -429,6 +429,190 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
429} 429}
430#endif 430#endif
431 431
432static noinline void no_context(struct pt_regs *regs,
433 unsigned long error_code, unsigned long address)
434{
435 struct task_struct *tsk = current;
436#ifdef CONFIG_X86_64
437 unsigned long flags;
438 int sig;
439#endif
440
441 /* Are we prepared to handle this kernel fault? */
442 if (fixup_exception(regs))
443 return;
444
445 /*
446 * X86_32
447 * Valid to do another page fault here, because if this fault
448 * had been triggered by is_prefetch fixup_exception would have
449 * handled it.
450 *
451 * X86_64
452 * Hall of shame of CPU/BIOS bugs.
453 */
454 if (is_prefetch(regs, error_code, address))
455 return;
456
457 if (is_errata93(regs, address))
458 return;
459
460 /*
461 * Oops. The kernel tried to access some bad page. We'll have to
462 * terminate things with extreme prejudice.
463 */
464#ifdef CONFIG_X86_32
465 bust_spinlocks(1);
466#else
467 flags = oops_begin();
468#endif
469
470 show_fault_oops(regs, error_code, address);
471
472 tsk->thread.cr2 = address;
473 tsk->thread.trap_no = 14;
474 tsk->thread.error_code = error_code;
475
476#ifdef CONFIG_X86_32
477 die("Oops", regs, error_code);
478 bust_spinlocks(0);
479 do_exit(SIGKILL);
480#else
481 sig = SIGKILL;
482 if (__die("Oops", regs, error_code))
483 sig = 0;
484 /* Executive summary in case the body of the oops scrolled away */
485 printk(KERN_EMERG "CR2: %016lx\n", address);
486 oops_end(flags, regs, sig);
487#endif
488}
489
490static void __bad_area_nosemaphore(struct pt_regs *regs,
491 unsigned long error_code, unsigned long address,
492 int si_code)
493{
494 struct task_struct *tsk = current;
495
496 /* User mode accesses just cause a SIGSEGV */
497 if (error_code & PF_USER) {
498 /*
499 * It's possible to have interrupts off here.
500 */
501 local_irq_enable();
502
503 /*
504 * Valid to do another page fault here because this one came
505 * from user space.
506 */
507 if (is_prefetch(regs, error_code, address))
508 return;
509
510 if (is_errata100(regs, address))
511 return;
512
513 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
514 printk_ratelimit()) {
515 printk(
516 "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
517 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
518 tsk->comm, task_pid_nr(tsk), address,
519 (void *) regs->ip, (void *) regs->sp, error_code);
520 print_vma_addr(" in ", regs->ip);
521 printk("\n");
522 }
523
524 tsk->thread.cr2 = address;
525 /* Kernel addresses are always protection faults */
526 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
527 tsk->thread.trap_no = 14;
528 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
529 return;
530 }
531
532 if (is_f00f_bug(regs, address))
533 return;
534
535 no_context(regs, error_code, address);
536}
537
538static noinline void bad_area_nosemaphore(struct pt_regs *regs,
539 unsigned long error_code, unsigned long address)
540{
541 __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
542}
543
544static void __bad_area(struct pt_regs *regs,
545 unsigned long error_code, unsigned long address,
546 int si_code)
547{
548 struct mm_struct *mm = current->mm;
549
550 /*
551 * Something tried to access memory that isn't in our memory map..
552 * Fix it, but check if it's kernel or user first..
553 */
554 up_read(&mm->mmap_sem);
555
556 __bad_area_nosemaphore(regs, error_code, address, si_code);
557}
558
559static noinline void bad_area(struct pt_regs *regs,
560 unsigned long error_code, unsigned long address)
561{
562 __bad_area(regs, error_code, address, SEGV_MAPERR);
563}
564
565static noinline void bad_area_access_error(struct pt_regs *regs,
566 unsigned long error_code, unsigned long address)
567{
568 __bad_area(regs, error_code, address, SEGV_ACCERR);
569}
570
571/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
572static void out_of_memory(struct pt_regs *regs,
573 unsigned long error_code, unsigned long address)
574{
575 /*
576 * We ran out of memory, call the OOM killer, and return the userspace
577 * (which will retry the fault, or kill us if we got oom-killed).
578 */
579 up_read(&current->mm->mmap_sem);
580 pagefault_out_of_memory();
581}
582
583static void do_sigbus(struct pt_regs *regs,
584 unsigned long error_code, unsigned long address)
585{
586 struct task_struct *tsk = current;
587 struct mm_struct *mm = tsk->mm;
588
589 up_read(&mm->mmap_sem);
590
591 /* Kernel mode? Handle exceptions or die */
592 if (!(error_code & PF_USER))
593 no_context(regs, error_code, address);
594#ifdef CONFIG_X86_32
595 /* User space => ok to do another page fault */
596 if (is_prefetch(regs, error_code, address))
597 return;
598#endif
599 tsk->thread.cr2 = address;
600 tsk->thread.error_code = error_code;
601 tsk->thread.trap_no = 14;
602 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
603}
604
605static noinline void mm_fault_error(struct pt_regs *regs,
606 unsigned long error_code, unsigned long address, unsigned int fault)
607{
608 if (fault & VM_FAULT_OOM)
609 out_of_memory(regs, error_code, address);
610 else if (fault & VM_FAULT_SIGBUS)
611 do_sigbus(regs, error_code, address);
612 else
613 BUG();
614}
615
432static int spurious_fault_check(unsigned long error_code, pte_t *pte) 616static int spurious_fault_check(unsigned long error_code, pte_t *pte)
433{ 617{
434 if ((error_code & PF_WRITE) && !pte_write(*pte)) 618 if ((error_code & PF_WRITE) && !pte_write(*pte))
@@ -448,8 +632,8 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
448 * There are no security implications to leaving a stale TLB when 632 * There are no security implications to leaving a stale TLB when
449 * increasing the permissions on a page. 633 * increasing the permissions on a page.
450 */ 634 */
451static int spurious_fault(unsigned long address, 635static noinline int spurious_fault(unsigned long error_code,
452 unsigned long error_code) 636 unsigned long address)
453{ 637{
454 pgd_t *pgd; 638 pgd_t *pgd;
455 pud_t *pud; 639 pud_t *pud;
@@ -494,7 +678,7 @@ static int spurious_fault(unsigned long address,
494 * 678 *
495 * This assumes no large pages in there. 679 * This assumes no large pages in there.
496 */ 680 */
497static int vmalloc_fault(unsigned long address) 681static noinline int vmalloc_fault(unsigned long address)
498{ 682{
499#ifdef CONFIG_X86_32 683#ifdef CONFIG_X86_32
500 unsigned long pgd_paddr; 684 unsigned long pgd_paddr;
@@ -573,6 +757,25 @@ static int vmalloc_fault(unsigned long address)
573 757
574int show_unhandled_signals = 1; 758int show_unhandled_signals = 1;
575 759
760static inline int access_error(unsigned long error_code, int write,
761 struct vm_area_struct *vma)
762{
763 if (write) {
764 /* write, present and write, not present */
765 if (unlikely(!(vma->vm_flags & VM_WRITE)))
766 return 1;
767 } else if (unlikely(error_code & PF_PROT)) {
768 /* read, present */
769 return 1;
770 } else {
771 /* read, not present */
772 if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
773 return 1;
774 }
775
776 return 0;
777}
778
576/* 779/*
577 * This routine handles page faults. It determines the address, 780 * This routine handles page faults. It determines the address,
578 * and the problem, and then passes it off to one of the appropriate 781 * and the problem, and then passes it off to one of the appropriate
@@ -583,16 +786,12 @@ asmlinkage
583#endif 786#endif
584void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) 787void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
585{ 788{
789 unsigned long address;
586 struct task_struct *tsk; 790 struct task_struct *tsk;
587 struct mm_struct *mm; 791 struct mm_struct *mm;
588 struct vm_area_struct *vma; 792 struct vm_area_struct *vma;
589 unsigned long address; 793 int write;
590 int write, si_code;
591 int fault; 794 int fault;
592#ifdef CONFIG_X86_64
593 unsigned long flags;
594 int sig;
595#endif
596 795
597 tsk = current; 796 tsk = current;
598 mm = tsk->mm; 797 mm = tsk->mm;
@@ -601,9 +800,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
601 /* get the address */ 800 /* get the address */
602 address = read_cr2(); 801 address = read_cr2();
603 802
604 si_code = SEGV_MAPERR; 803 if (unlikely(notify_page_fault(regs)))
605
606 if (notify_page_fault(regs))
607 return; 804 return;
608 if (unlikely(kmmio_fault(regs, address))) 805 if (unlikely(kmmio_fault(regs, address)))
609 return; 806 return;
@@ -631,17 +828,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
631 return; 828 return;
632 829
633 /* Can handle a stale RO->RW TLB */ 830 /* Can handle a stale RO->RW TLB */
634 if (spurious_fault(address, error_code)) 831 if (spurious_fault(error_code, address))
635 return; 832 return;
636 833
637 /* 834 /*
638 * Don't take the mm semaphore here. If we fixup a prefetch 835 * Don't take the mm semaphore here. If we fixup a prefetch
639 * fault we could otherwise deadlock. 836 * fault we could otherwise deadlock.
640 */ 837 */
641 goto bad_area_nosemaphore; 838 bad_area_nosemaphore(regs, error_code, address);
839 return;
642 } 840 }
643 841
644
645 /* 842 /*
646 * It's safe to allow irq's after cr2 has been saved and the 843 * It's safe to allow irq's after cr2 has been saved and the
647 * vmalloc fault has been handled. 844 * vmalloc fault has been handled.
@@ -657,15 +854,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
657 854
658#ifdef CONFIG_X86_64 855#ifdef CONFIG_X86_64
659 if (unlikely(error_code & PF_RSVD)) 856 if (unlikely(error_code & PF_RSVD))
660 pgtable_bad(address, regs, error_code); 857 pgtable_bad(regs, error_code, address);
661#endif 858#endif
662 859
663 /* 860 /*
664 * If we're in an interrupt, have no user context or are running in an 861 * If we're in an interrupt, have no user context or are running in an
665 * atomic region then we must not take the fault. 862 * atomic region then we must not take the fault.
666 */ 863 */
667 if (unlikely(in_atomic() || !mm)) 864 if (unlikely(in_atomic() || !mm)) {
668 goto bad_area_nosemaphore; 865 bad_area_nosemaphore(regs, error_code, address);
866 return;
867 }
669 868
670 /* 869 /*
671 * When running in the kernel we expect faults to occur only to 870 * When running in the kernel we expect faults to occur only to
@@ -683,20 +882,26 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
683 * source. If this is invalid we can skip the address space check, 882 * source. If this is invalid we can skip the address space check,
684 * thus avoiding the deadlock. 883 * thus avoiding the deadlock.
685 */ 884 */
686 if (!down_read_trylock(&mm->mmap_sem)) { 885 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
687 if ((error_code & PF_USER) == 0 && 886 if ((error_code & PF_USER) == 0 &&
688 !search_exception_tables(regs->ip)) 887 !search_exception_tables(regs->ip)) {
689 goto bad_area_nosemaphore; 888 bad_area_nosemaphore(regs, error_code, address);
889 return;
890 }
690 down_read(&mm->mmap_sem); 891 down_read(&mm->mmap_sem);
691 } 892 }
692 893
693 vma = find_vma(mm, address); 894 vma = find_vma(mm, address);
694 if (!vma) 895 if (unlikely(!vma)) {
695 goto bad_area; 896 bad_area(regs, error_code, address);
696 if (vma->vm_start <= address) 897 return;
898 }
899 if (likely(vma->vm_start <= address))
697 goto good_area; 900 goto good_area;
698 if (!(vma->vm_flags & VM_GROWSDOWN)) 901 if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
699 goto bad_area; 902 bad_area(regs, error_code, address);
903 return;
904 }
700 if (error_code & PF_USER) { 905 if (error_code & PF_USER) {
701 /* 906 /*
702 * Accessing the stack below %sp is always a bug. 907 * Accessing the stack below %sp is always a bug.
@@ -704,31 +909,25 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
704 * and pusha to work. ("enter $65535,$31" pushes 909 * and pusha to work. ("enter $65535,$31" pushes
705 * 32 pointers and then decrements %sp by 65535.) 910 * 32 pointers and then decrements %sp by 65535.)
706 */ 911 */
707 if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) 912 if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
708 goto bad_area; 913 bad_area(regs, error_code, address);
914 return;
915 }
709 } 916 }
710 if (expand_stack(vma, address)) 917 if (unlikely(expand_stack(vma, address))) {
711 goto bad_area; 918 bad_area(regs, error_code, address);
712/* 919 return;
713 * Ok, we have a good vm_area for this memory access, so 920 }
714 * we can handle it.. 921
715 */ 922 /*
923 * Ok, we have a good vm_area for this memory access, so
924 * we can handle it..
925 */
716good_area: 926good_area:
717 si_code = SEGV_ACCERR; 927 write = error_code & PF_WRITE;
718 write = 0; 928 if (unlikely(access_error(error_code, write, vma))) {
719 switch (error_code & (PF_PROT|PF_WRITE)) { 929 bad_area_access_error(regs, error_code, address);
720 default: /* 3: write, present */ 930 return;
721 /* fall through */
722 case PF_WRITE: /* write, not present */
723 if (!(vma->vm_flags & VM_WRITE))
724 goto bad_area;
725 write++;
726 break;
727 case PF_PROT: /* read, present */
728 goto bad_area;
729 case 0: /* read, not present */
730 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
731 goto bad_area;
732 } 931 }
733 932
734 /* 933 /*
@@ -738,11 +937,8 @@ good_area:
738 */ 937 */
739 fault = handle_mm_fault(mm, vma, address, write); 938 fault = handle_mm_fault(mm, vma, address, write);
740 if (unlikely(fault & VM_FAULT_ERROR)) { 939 if (unlikely(fault & VM_FAULT_ERROR)) {
741 if (fault & VM_FAULT_OOM) 940 mm_fault_error(regs, error_code, address, fault);
742 goto out_of_memory; 941 return;
743 else if (fault & VM_FAULT_SIGBUS)
744 goto do_sigbus;
745 BUG();
746 } 942 }
747 if (fault & VM_FAULT_MAJOR) 943 if (fault & VM_FAULT_MAJOR)
748 tsk->maj_flt++; 944 tsk->maj_flt++;
@@ -760,128 +956,6 @@ good_area:
760 } 956 }
761#endif 957#endif
762 up_read(&mm->mmap_sem); 958 up_read(&mm->mmap_sem);
763 return;
764
765/*
766 * Something tried to access memory that isn't in our memory map..
767 * Fix it, but check if it's kernel or user first..
768 */
769bad_area:
770 up_read(&mm->mmap_sem);
771
772bad_area_nosemaphore:
773 /* User mode accesses just cause a SIGSEGV */
774 if (error_code & PF_USER) {
775 /*
776 * It's possible to have interrupts off here.
777 */
778 local_irq_enable();
779
780 /*
781 * Valid to do another page fault here because this one came
782 * from user space.
783 */
784 if (is_prefetch(regs, address, error_code))
785 return;
786
787 if (is_errata100(regs, address))
788 return;
789
790 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
791 printk_ratelimit()) {
792 printk(
793 "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
794 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
795 tsk->comm, task_pid_nr(tsk), address,
796 (void *) regs->ip, (void *) regs->sp, error_code);
797 print_vma_addr(" in ", regs->ip);
798 printk("\n");
799 }
800
801 tsk->thread.cr2 = address;
802 /* Kernel addresses are always protection faults */
803 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
804 tsk->thread.trap_no = 14;
805 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
806 return;
807 }
808
809 if (is_f00f_bug(regs, address))
810 return;
811
812no_context:
813 /* Are we prepared to handle this kernel fault? */
814 if (fixup_exception(regs))
815 return;
816
817 /*
818 * X86_32
819 * Valid to do another page fault here, because if this fault
820 * had been triggered by is_prefetch fixup_exception would have
821 * handled it.
822 *
823 * X86_64
824 * Hall of shame of CPU/BIOS bugs.
825 */
826 if (is_prefetch(regs, address, error_code))
827 return;
828
829 if (is_errata93(regs, address))
830 return;
831
832/*
833 * Oops. The kernel tried to access some bad page. We'll have to
834 * terminate things with extreme prejudice.
835 */
836#ifdef CONFIG_X86_32
837 bust_spinlocks(1);
838#else
839 flags = oops_begin();
840#endif
841
842 show_fault_oops(regs, error_code, address);
843
844 tsk->thread.cr2 = address;
845 tsk->thread.trap_no = 14;
846 tsk->thread.error_code = error_code;
847
848#ifdef CONFIG_X86_32
849 die("Oops", regs, error_code);
850 bust_spinlocks(0);
851 do_exit(SIGKILL);
852#else
853 sig = SIGKILL;
854 if (__die("Oops", regs, error_code))
855 sig = 0;
856 /* Executive summary in case the body of the oops scrolled away */
857 printk(KERN_EMERG "CR2: %016lx\n", address);
858 oops_end(flags, regs, sig);
859#endif
860
861out_of_memory:
862 /*
863 * We ran out of memory, call the OOM killer, and return the userspace
864 * (which will retry the fault, or kill us if we got oom-killed).
865 */
866 up_read(&mm->mmap_sem);
867 pagefault_out_of_memory();
868 return;
869
870do_sigbus:
871 up_read(&mm->mmap_sem);
872
873 /* Kernel mode? Handle exceptions or die */
874 if (!(error_code & PF_USER))
875 goto no_context;
876#ifdef CONFIG_X86_32
877 /* User space => ok to do another page fault */
878 if (is_prefetch(regs, address, error_code))
879 return;
880#endif
881 tsk->thread.cr2 = address;
882 tsk->thread.error_code = error_code;
883 tsk->thread.trap_no = 14;
884 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
885} 959}
886 960
887DEFINE_SPINLOCK(pgd_lock); 961DEFINE_SPINLOCK(pgd_lock);