aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/extable.c6
-rw-r--r--arch/x86/mm/fault.c446
-rw-r--r--arch/x86/mm/init_32.c49
-rw-r--r--arch/x86/mm/init_64.c2
-rw-r--r--arch/x86/mm/iomap_32.c10
-rw-r--r--arch/x86/mm/ioremap.c27
-rw-r--r--arch/x86/mm/numa_64.c217
-rw-r--r--arch/x86/mm/pageattr.c49
-rw-r--r--arch/x86/mm/pat.c74
-rw-r--r--arch/x86/mm/srat_64.c1
-rw-r--r--arch/x86/mm/tlb.c296
12 files changed, 926 insertions, 253 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index d8cc96a2738f..9f05157220f5 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,6 +1,8 @@
1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
2 pat.o pgtable.o gup.o 2 pat.o pgtable.o gup.o
3 3
4obj-$(CONFIG_X86_SMP) += tlb.o
5
4obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o 6obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
5 7
6obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o 8obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 7e8db53528a7..61b41ca3b5a2 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -23,6 +23,12 @@ int fixup_exception(struct pt_regs *regs)
23 23
24 fixup = search_exception_tables(regs->ip); 24 fixup = search_exception_tables(regs->ip);
25 if (fixup) { 25 if (fixup) {
26 /* If fixup is less than 16, it means uaccess error */
27 if (fixup->fixup < 16) {
28 current_thread_info()->uaccess_err = -EFAULT;
29 regs->ip += fixup->fixup;
30 return 1;
31 }
26 regs->ip = fixup->fixup; 32 regs->ip = fixup->fixup;
27 return 1; 33 return 1;
28 } 34 }
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 90dfae511a41..d3eee74f830a 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -26,6 +26,7 @@
26#include <linux/kprobes.h> 26#include <linux/kprobes.h>
27#include <linux/uaccess.h> 27#include <linux/uaccess.h>
28#include <linux/kdebug.h> 28#include <linux/kdebug.h>
29#include <linux/magic.h>
29 30
30#include <asm/system.h> 31#include <asm/system.h>
31#include <asm/desc.h> 32#include <asm/desc.h>
@@ -91,8 +92,8 @@ static inline int notify_page_fault(struct pt_regs *regs)
91 * 92 *
92 * Opcode checker based on code by Richard Brunner 93 * Opcode checker based on code by Richard Brunner
93 */ 94 */
94static int is_prefetch(struct pt_regs *regs, unsigned long addr, 95static int is_prefetch(struct pt_regs *regs, unsigned long error_code,
95 unsigned long error_code) 96 unsigned long addr)
96{ 97{
97 unsigned char *instr; 98 unsigned char *instr;
98 int scan_more = 1; 99 int scan_more = 1;
@@ -409,17 +410,16 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
409} 410}
410 411
411#ifdef CONFIG_X86_64 412#ifdef CONFIG_X86_64
412static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, 413static noinline void pgtable_bad(struct pt_regs *regs,
413 unsigned long error_code) 414 unsigned long error_code, unsigned long address)
414{ 415{
415 unsigned long flags = oops_begin(); 416 unsigned long flags = oops_begin();
416 int sig = SIGKILL; 417 int sig = SIGKILL;
417 struct task_struct *tsk; 418 struct task_struct *tsk = current;
418 419
419 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", 420 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
420 current->comm, address); 421 tsk->comm, address);
421 dump_pagetable(address); 422 dump_pagetable(address);
422 tsk = current;
423 tsk->thread.cr2 = address; 423 tsk->thread.cr2 = address;
424 tsk->thread.trap_no = 14; 424 tsk->thread.trap_no = 14;
425 tsk->thread.error_code = error_code; 425 tsk->thread.error_code = error_code;
@@ -429,6 +429,196 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
429} 429}
430#endif 430#endif
431 431
432static noinline void no_context(struct pt_regs *regs,
433 unsigned long error_code, unsigned long address)
434{
435 struct task_struct *tsk = current;
436 unsigned long *stackend;
437
438#ifdef CONFIG_X86_64
439 unsigned long flags;
440 int sig;
441#endif
442
443 /* Are we prepared to handle this kernel fault? */
444 if (fixup_exception(regs))
445 return;
446
447 /*
448 * X86_32
449 * Valid to do another page fault here, because if this fault
450 * had been triggered by is_prefetch fixup_exception would have
451 * handled it.
452 *
453 * X86_64
454 * Hall of shame of CPU/BIOS bugs.
455 */
456 if (is_prefetch(regs, error_code, address))
457 return;
458
459 if (is_errata93(regs, address))
460 return;
461
462 /*
463 * Oops. The kernel tried to access some bad page. We'll have to
464 * terminate things with extreme prejudice.
465 */
466#ifdef CONFIG_X86_32
467 bust_spinlocks(1);
468#else
469 flags = oops_begin();
470#endif
471
472 show_fault_oops(regs, error_code, address);
473
474 stackend = end_of_stack(tsk);
475 if (*stackend != STACK_END_MAGIC)
476 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
477
478 tsk->thread.cr2 = address;
479 tsk->thread.trap_no = 14;
480 tsk->thread.error_code = error_code;
481
482#ifdef CONFIG_X86_32
483 die("Oops", regs, error_code);
484 bust_spinlocks(0);
485 do_exit(SIGKILL);
486#else
487 sig = SIGKILL;
488 if (__die("Oops", regs, error_code))
489 sig = 0;
490 /* Executive summary in case the body of the oops scrolled away */
491 printk(KERN_EMERG "CR2: %016lx\n", address);
492 oops_end(flags, regs, sig);
493#endif
494}
495
496static void __bad_area_nosemaphore(struct pt_regs *regs,
497 unsigned long error_code, unsigned long address,
498 int si_code)
499{
500 struct task_struct *tsk = current;
501
502 /* User mode accesses just cause a SIGSEGV */
503 if (error_code & PF_USER) {
504 /*
505 * It's possible to have interrupts off here.
506 */
507 local_irq_enable();
508
509 /*
510 * Valid to do another page fault here because this one came
511 * from user space.
512 */
513 if (is_prefetch(regs, error_code, address))
514 return;
515
516 if (is_errata100(regs, address))
517 return;
518
519 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
520 printk_ratelimit()) {
521 printk(
522 "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
523 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
524 tsk->comm, task_pid_nr(tsk), address,
525 (void *) regs->ip, (void *) regs->sp, error_code);
526 print_vma_addr(" in ", regs->ip);
527 printk("\n");
528 }
529
530 tsk->thread.cr2 = address;
531 /* Kernel addresses are always protection faults */
532 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
533 tsk->thread.trap_no = 14;
534 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
535 return;
536 }
537
538 if (is_f00f_bug(regs, address))
539 return;
540
541 no_context(regs, error_code, address);
542}
543
544static noinline void bad_area_nosemaphore(struct pt_regs *regs,
545 unsigned long error_code, unsigned long address)
546{
547 __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
548}
549
550static void __bad_area(struct pt_regs *regs,
551 unsigned long error_code, unsigned long address,
552 int si_code)
553{
554 struct mm_struct *mm = current->mm;
555
556 /*
557 * Something tried to access memory that isn't in our memory map..
558 * Fix it, but check if it's kernel or user first..
559 */
560 up_read(&mm->mmap_sem);
561
562 __bad_area_nosemaphore(regs, error_code, address, si_code);
563}
564
565static noinline void bad_area(struct pt_regs *regs,
566 unsigned long error_code, unsigned long address)
567{
568 __bad_area(regs, error_code, address, SEGV_MAPERR);
569}
570
571static noinline void bad_area_access_error(struct pt_regs *regs,
572 unsigned long error_code, unsigned long address)
573{
574 __bad_area(regs, error_code, address, SEGV_ACCERR);
575}
576
577/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
578static void out_of_memory(struct pt_regs *regs,
579 unsigned long error_code, unsigned long address)
580{
581 /*
582 * We ran out of memory, call the OOM killer, and return the userspace
583 * (which will retry the fault, or kill us if we got oom-killed).
584 */
585 up_read(&current->mm->mmap_sem);
586 pagefault_out_of_memory();
587}
588
589static void do_sigbus(struct pt_regs *regs,
590 unsigned long error_code, unsigned long address)
591{
592 struct task_struct *tsk = current;
593 struct mm_struct *mm = tsk->mm;
594
595 up_read(&mm->mmap_sem);
596
597 /* Kernel mode? Handle exceptions or die */
598 if (!(error_code & PF_USER))
599 no_context(regs, error_code, address);
600#ifdef CONFIG_X86_32
601 /* User space => ok to do another page fault */
602 if (is_prefetch(regs, error_code, address))
603 return;
604#endif
605 tsk->thread.cr2 = address;
606 tsk->thread.error_code = error_code;
607 tsk->thread.trap_no = 14;
608 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
609}
610
611static noinline void mm_fault_error(struct pt_regs *regs,
612 unsigned long error_code, unsigned long address, unsigned int fault)
613{
614 if (fault & VM_FAULT_OOM)
615 out_of_memory(regs, error_code, address);
616 else if (fault & VM_FAULT_SIGBUS)
617 do_sigbus(regs, error_code, address);
618 else
619 BUG();
620}
621
432static int spurious_fault_check(unsigned long error_code, pte_t *pte) 622static int spurious_fault_check(unsigned long error_code, pte_t *pte)
433{ 623{
434 if ((error_code & PF_WRITE) && !pte_write(*pte)) 624 if ((error_code & PF_WRITE) && !pte_write(*pte))
@@ -448,8 +638,8 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
448 * There are no security implications to leaving a stale TLB when 638 * There are no security implications to leaving a stale TLB when
449 * increasing the permissions on a page. 639 * increasing the permissions on a page.
450 */ 640 */
451static int spurious_fault(unsigned long address, 641static noinline int spurious_fault(unsigned long error_code,
452 unsigned long error_code) 642 unsigned long address)
453{ 643{
454 pgd_t *pgd; 644 pgd_t *pgd;
455 pud_t *pud; 645 pud_t *pud;
@@ -494,7 +684,7 @@ static int spurious_fault(unsigned long address,
494 * 684 *
495 * This assumes no large pages in there. 685 * This assumes no large pages in there.
496 */ 686 */
497static int vmalloc_fault(unsigned long address) 687static noinline int vmalloc_fault(unsigned long address)
498{ 688{
499#ifdef CONFIG_X86_32 689#ifdef CONFIG_X86_32
500 unsigned long pgd_paddr; 690 unsigned long pgd_paddr;
@@ -573,6 +763,25 @@ static int vmalloc_fault(unsigned long address)
573 763
574int show_unhandled_signals = 1; 764int show_unhandled_signals = 1;
575 765
766static inline int access_error(unsigned long error_code, int write,
767 struct vm_area_struct *vma)
768{
769 if (write) {
770 /* write, present and write, not present */
771 if (unlikely(!(vma->vm_flags & VM_WRITE)))
772 return 1;
773 } else if (unlikely(error_code & PF_PROT)) {
774 /* read, present */
775 return 1;
776 } else {
777 /* read, not present */
778 if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
779 return 1;
780 }
781
782 return 0;
783}
784
576/* 785/*
577 * This routine handles page faults. It determines the address, 786 * This routine handles page faults. It determines the address,
578 * and the problem, and then passes it off to one of the appropriate 787 * and the problem, and then passes it off to one of the appropriate
@@ -583,16 +792,12 @@ asmlinkage
583#endif 792#endif
584void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) 793void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
585{ 794{
795 unsigned long address;
586 struct task_struct *tsk; 796 struct task_struct *tsk;
587 struct mm_struct *mm; 797 struct mm_struct *mm;
588 struct vm_area_struct *vma; 798 struct vm_area_struct *vma;
589 unsigned long address; 799 int write;
590 int write, si_code;
591 int fault; 800 int fault;
592#ifdef CONFIG_X86_64
593 unsigned long flags;
594 int sig;
595#endif
596 801
597 tsk = current; 802 tsk = current;
598 mm = tsk->mm; 803 mm = tsk->mm;
@@ -601,9 +806,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
601 /* get the address */ 806 /* get the address */
602 address = read_cr2(); 807 address = read_cr2();
603 808
604 si_code = SEGV_MAPERR; 809 if (unlikely(notify_page_fault(regs)))
605
606 if (notify_page_fault(regs))
607 return; 810 return;
608 if (unlikely(kmmio_fault(regs, address))) 811 if (unlikely(kmmio_fault(regs, address)))
609 return; 812 return;
@@ -631,17 +834,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
631 return; 834 return;
632 835
633 /* Can handle a stale RO->RW TLB */ 836 /* Can handle a stale RO->RW TLB */
634 if (spurious_fault(address, error_code)) 837 if (spurious_fault(error_code, address))
635 return; 838 return;
636 839
637 /* 840 /*
638 * Don't take the mm semaphore here. If we fixup a prefetch 841 * Don't take the mm semaphore here. If we fixup a prefetch
639 * fault we could otherwise deadlock. 842 * fault we could otherwise deadlock.
640 */ 843 */
641 goto bad_area_nosemaphore; 844 bad_area_nosemaphore(regs, error_code, address);
845 return;
642 } 846 }
643 847
644
645 /* 848 /*
646 * It's safe to allow irq's after cr2 has been saved and the 849 * It's safe to allow irq's after cr2 has been saved and the
647 * vmalloc fault has been handled. 850 * vmalloc fault has been handled.
@@ -657,15 +860,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
657 860
658#ifdef CONFIG_X86_64 861#ifdef CONFIG_X86_64
659 if (unlikely(error_code & PF_RSVD)) 862 if (unlikely(error_code & PF_RSVD))
660 pgtable_bad(address, regs, error_code); 863 pgtable_bad(regs, error_code, address);
661#endif 864#endif
662 865
663 /* 866 /*
664 * If we're in an interrupt, have no user context or are running in an 867 * If we're in an interrupt, have no user context or are running in an
665 * atomic region then we must not take the fault. 868 * atomic region then we must not take the fault.
666 */ 869 */
667 if (unlikely(in_atomic() || !mm)) 870 if (unlikely(in_atomic() || !mm)) {
668 goto bad_area_nosemaphore; 871 bad_area_nosemaphore(regs, error_code, address);
872 return;
873 }
669 874
670 /* 875 /*
671 * When running in the kernel we expect faults to occur only to 876 * When running in the kernel we expect faults to occur only to
@@ -683,20 +888,26 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
683 * source. If this is invalid we can skip the address space check, 888 * source. If this is invalid we can skip the address space check,
684 * thus avoiding the deadlock. 889 * thus avoiding the deadlock.
685 */ 890 */
686 if (!down_read_trylock(&mm->mmap_sem)) { 891 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
687 if ((error_code & PF_USER) == 0 && 892 if ((error_code & PF_USER) == 0 &&
688 !search_exception_tables(regs->ip)) 893 !search_exception_tables(regs->ip)) {
689 goto bad_area_nosemaphore; 894 bad_area_nosemaphore(regs, error_code, address);
895 return;
896 }
690 down_read(&mm->mmap_sem); 897 down_read(&mm->mmap_sem);
691 } 898 }
692 899
693 vma = find_vma(mm, address); 900 vma = find_vma(mm, address);
694 if (!vma) 901 if (unlikely(!vma)) {
695 goto bad_area; 902 bad_area(regs, error_code, address);
696 if (vma->vm_start <= address) 903 return;
904 }
905 if (likely(vma->vm_start <= address))
697 goto good_area; 906 goto good_area;
698 if (!(vma->vm_flags & VM_GROWSDOWN)) 907 if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
699 goto bad_area; 908 bad_area(regs, error_code, address);
909 return;
910 }
700 if (error_code & PF_USER) { 911 if (error_code & PF_USER) {
701 /* 912 /*
702 * Accessing the stack below %sp is always a bug. 913 * Accessing the stack below %sp is always a bug.
@@ -704,31 +915,25 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
704 * and pusha to work. ("enter $65535,$31" pushes 915 * and pusha to work. ("enter $65535,$31" pushes
705 * 32 pointers and then decrements %sp by 65535.) 916 * 32 pointers and then decrements %sp by 65535.)
706 */ 917 */
707 if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) 918 if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
708 goto bad_area; 919 bad_area(regs, error_code, address);
920 return;
921 }
709 } 922 }
710 if (expand_stack(vma, address)) 923 if (unlikely(expand_stack(vma, address))) {
711 goto bad_area; 924 bad_area(regs, error_code, address);
712/* 925 return;
713 * Ok, we have a good vm_area for this memory access, so 926 }
714 * we can handle it.. 927
715 */ 928 /*
929 * Ok, we have a good vm_area for this memory access, so
930 * we can handle it..
931 */
716good_area: 932good_area:
717 si_code = SEGV_ACCERR; 933 write = error_code & PF_WRITE;
718 write = 0; 934 if (unlikely(access_error(error_code, write, vma))) {
719 switch (error_code & (PF_PROT|PF_WRITE)) { 935 bad_area_access_error(regs, error_code, address);
720 default: /* 3: write, present */ 936 return;
721 /* fall through */
722 case PF_WRITE: /* write, not present */
723 if (!(vma->vm_flags & VM_WRITE))
724 goto bad_area;
725 write++;
726 break;
727 case PF_PROT: /* read, present */
728 goto bad_area;
729 case 0: /* read, not present */
730 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
731 goto bad_area;
732 } 937 }
733 938
734 /* 939 /*
@@ -738,11 +943,8 @@ good_area:
738 */ 943 */
739 fault = handle_mm_fault(mm, vma, address, write); 944 fault = handle_mm_fault(mm, vma, address, write);
740 if (unlikely(fault & VM_FAULT_ERROR)) { 945 if (unlikely(fault & VM_FAULT_ERROR)) {
741 if (fault & VM_FAULT_OOM) 946 mm_fault_error(regs, error_code, address, fault);
742 goto out_of_memory; 947 return;
743 else if (fault & VM_FAULT_SIGBUS)
744 goto do_sigbus;
745 BUG();
746 } 948 }
747 if (fault & VM_FAULT_MAJOR) 949 if (fault & VM_FAULT_MAJOR)
748 tsk->maj_flt++; 950 tsk->maj_flt++;
@@ -760,128 +962,6 @@ good_area:
760 } 962 }
761#endif 963#endif
762 up_read(&mm->mmap_sem); 964 up_read(&mm->mmap_sem);
763 return;
764
765/*
766 * Something tried to access memory that isn't in our memory map..
767 * Fix it, but check if it's kernel or user first..
768 */
769bad_area:
770 up_read(&mm->mmap_sem);
771
772bad_area_nosemaphore:
773 /* User mode accesses just cause a SIGSEGV */
774 if (error_code & PF_USER) {
775 /*
776 * It's possible to have interrupts off here.
777 */
778 local_irq_enable();
779
780 /*
781 * Valid to do another page fault here because this one came
782 * from user space.
783 */
784 if (is_prefetch(regs, address, error_code))
785 return;
786
787 if (is_errata100(regs, address))
788 return;
789
790 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
791 printk_ratelimit()) {
792 printk(
793 "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
794 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
795 tsk->comm, task_pid_nr(tsk), address,
796 (void *) regs->ip, (void *) regs->sp, error_code);
797 print_vma_addr(" in ", regs->ip);
798 printk("\n");
799 }
800
801 tsk->thread.cr2 = address;
802 /* Kernel addresses are always protection faults */
803 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
804 tsk->thread.trap_no = 14;
805 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
806 return;
807 }
808
809 if (is_f00f_bug(regs, address))
810 return;
811
812no_context:
813 /* Are we prepared to handle this kernel fault? */
814 if (fixup_exception(regs))
815 return;
816
817 /*
818 * X86_32
819 * Valid to do another page fault here, because if this fault
820 * had been triggered by is_prefetch fixup_exception would have
821 * handled it.
822 *
823 * X86_64
824 * Hall of shame of CPU/BIOS bugs.
825 */
826 if (is_prefetch(regs, address, error_code))
827 return;
828
829 if (is_errata93(regs, address))
830 return;
831
832/*
833 * Oops. The kernel tried to access some bad page. We'll have to
834 * terminate things with extreme prejudice.
835 */
836#ifdef CONFIG_X86_32
837 bust_spinlocks(1);
838#else
839 flags = oops_begin();
840#endif
841
842 show_fault_oops(regs, error_code, address);
843
844 tsk->thread.cr2 = address;
845 tsk->thread.trap_no = 14;
846 tsk->thread.error_code = error_code;
847
848#ifdef CONFIG_X86_32
849 die("Oops", regs, error_code);
850 bust_spinlocks(0);
851 do_exit(SIGKILL);
852#else
853 sig = SIGKILL;
854 if (__die("Oops", regs, error_code))
855 sig = 0;
856 /* Executive summary in case the body of the oops scrolled away */
857 printk(KERN_EMERG "CR2: %016lx\n", address);
858 oops_end(flags, regs, sig);
859#endif
860
861out_of_memory:
862 /*
863 * We ran out of memory, call the OOM killer, and return the userspace
864 * (which will retry the fault, or kill us if we got oom-killed).
865 */
866 up_read(&mm->mmap_sem);
867 pagefault_out_of_memory();
868 return;
869
870do_sigbus:
871 up_read(&mm->mmap_sem);
872
873 /* Kernel mode? Handle exceptions or die */
874 if (!(error_code & PF_USER))
875 goto no_context;
876#ifdef CONFIG_X86_32
877 /* User space => ok to do another page fault */
878 if (is_prefetch(regs, address, error_code))
879 return;
880#endif
881 tsk->thread.cr2 = address;
882 tsk->thread.error_code = error_code;
883 tsk->thread.trap_no = 14;
884 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
885} 965}
886 966
887DEFINE_SPINLOCK(pgd_lock); 967DEFINE_SPINLOCK(pgd_lock);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 88f1b10de3be..00263bf07a88 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,7 +49,6 @@
49#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/setup.h> 50#include <asm/setup.h>
51#include <asm/cacheflush.h> 51#include <asm/cacheflush.h>
52#include <asm/smp.h>
53 52
54unsigned int __VMALLOC_RESERVE = 128 << 20; 53unsigned int __VMALLOC_RESERVE = 128 << 20;
55 54
@@ -138,6 +137,47 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
138 return pte_offset_kernel(pmd, 0); 137 return pte_offset_kernel(pmd, 0);
139} 138}
140 139
140static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
141 unsigned long vaddr, pte_t *lastpte)
142{
143#ifdef CONFIG_HIGHMEM
144 /*
145 * Something (early fixmap) may already have put a pte
146 * page here, which causes the page table allocation
147 * to become nonlinear. Attempt to fix it, and if it
148 * is still nonlinear then we have to bug.
149 */
150 int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
151 int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
152
153 if (pmd_idx_kmap_begin != pmd_idx_kmap_end
154 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
155 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
156 && ((__pa(pte) >> PAGE_SHIFT) < table_start
157 || (__pa(pte) >> PAGE_SHIFT) >= table_end)) {
158 pte_t *newpte;
159 int i;
160
161 BUG_ON(after_init_bootmem);
162 newpte = alloc_low_page();
163 for (i = 0; i < PTRS_PER_PTE; i++)
164 set_pte(newpte + i, pte[i]);
165
166 paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
167 set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
168 BUG_ON(newpte != pte_offset_kernel(pmd, 0));
169 __flush_tlb_all();
170
171 paravirt_release_pte(__pa(pte) >> PAGE_SHIFT);
172 pte = newpte;
173 }
174 BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1)
175 && vaddr > fix_to_virt(FIX_KMAP_END)
176 && lastpte && lastpte + PTRS_PER_PTE != pte);
177#endif
178 return pte;
179}
180
141/* 181/*
142 * This function initializes a certain range of kernel virtual memory 182 * This function initializes a certain range of kernel virtual memory
143 * with new bootmem page tables, everywhere page tables are missing in 183 * with new bootmem page tables, everywhere page tables are missing in
@@ -154,6 +194,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
154 unsigned long vaddr; 194 unsigned long vaddr;
155 pgd_t *pgd; 195 pgd_t *pgd;
156 pmd_t *pmd; 196 pmd_t *pmd;
197 pte_t *pte = NULL;
157 198
158 vaddr = start; 199 vaddr = start;
159 pgd_idx = pgd_index(vaddr); 200 pgd_idx = pgd_index(vaddr);
@@ -165,7 +206,8 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
165 pmd = pmd + pmd_index(vaddr); 206 pmd = pmd + pmd_index(vaddr);
166 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); 207 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
167 pmd++, pmd_idx++) { 208 pmd++, pmd_idx++) {
168 one_page_table_init(pmd); 209 pte = page_table_kmap_check(one_page_table_init(pmd),
210 pmd, vaddr, pte);
169 211
170 vaddr += PMD_SIZE; 212 vaddr += PMD_SIZE;
171 } 213 }
@@ -508,7 +550,6 @@ static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
508 * Fixed mappings, only the page table structure has to be 550 * Fixed mappings, only the page table structure has to be
509 * created - mappings will be set by set_fixmap(): 551 * created - mappings will be set by set_fixmap():
510 */ 552 */
511 early_ioremap_clear();
512 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; 553 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
513 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; 554 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
514 page_table_range_init(vaddr, end, pgd_base); 555 page_table_range_init(vaddr, end, pgd_base);
@@ -801,7 +842,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse)
801 tables += PAGE_ALIGN(ptes * sizeof(pte_t)); 842 tables += PAGE_ALIGN(ptes * sizeof(pte_t));
802 843
803 /* for fixmap */ 844 /* for fixmap */
804 tables += PAGE_SIZE * 2; 845 tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t));
805 846
806 /* 847 /*
807 * RED-PEN putting page tables only on node 0 could 848 * RED-PEN putting page tables only on node 0 could
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 23f68e77ad1f..e6d36b490250 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -596,7 +596,7 @@ static void __init init_gbpages(void)
596 direct_gbpages = 0; 596 direct_gbpages = 0;
597} 597}
598 598
599static unsigned long __init kernel_physical_mapping_init(unsigned long start, 599static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
600 unsigned long end, 600 unsigned long end,
601 unsigned long page_size_mask) 601 unsigned long page_size_mask)
602{ 602{
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index d0151d8ce452..ca53224fc56c 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <asm/iomap.h> 19#include <asm/iomap.h>
20#include <asm/pat.h>
20#include <linux/module.h> 21#include <linux/module.h>
21 22
22/* Map 'pfn' using fixed map 'type' and protections 'prot' 23/* Map 'pfn' using fixed map 'type' and protections 'prot'
@@ -29,6 +30,15 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
29 30
30 pagefault_disable(); 31 pagefault_disable();
31 32
33 /*
34 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
35 * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
36 * MTRR is UC or WC. UC_MINUS gets the real intention, of the
37 * user, which is "WC if the MTRR is WC, UC if you can't do that."
38 */
39 if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
40 prot = PAGE_KERNEL_UC_MINUS;
41
32 idx = type + KM_TYPE_NR*smp_processor_id(); 42 idx = type + KM_TYPE_NR*smp_processor_id();
33 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); 43 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
34 set_pte(kmap_pte-idx, pfn_pte(pfn, prot)); 44 set_pte(kmap_pte-idx, pfn_pte(pfn, prot));
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index bd85d42819e1..1448bcb7f22f 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -367,7 +367,7 @@ EXPORT_SYMBOL(ioremap_nocache);
367 * 367 *
368 * Must be freed with iounmap. 368 * Must be freed with iounmap.
369 */ 369 */
370void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) 370void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
371{ 371{
372 if (pat_enabled) 372 if (pat_enabled)
373 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, 373 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
@@ -557,34 +557,9 @@ void __init early_ioremap_init(void)
557 } 557 }
558} 558}
559 559
560void __init early_ioremap_clear(void)
561{
562 pmd_t *pmd;
563
564 if (early_ioremap_debug)
565 printk(KERN_INFO "early_ioremap_clear()\n");
566
567 pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
568 pmd_clear(pmd);
569 paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT);
570 __flush_tlb_all();
571}
572
573void __init early_ioremap_reset(void) 560void __init early_ioremap_reset(void)
574{ 561{
575 enum fixed_addresses idx;
576 unsigned long addr, phys;
577 pte_t *pte;
578
579 after_paging_init = 1; 562 after_paging_init = 1;
580 for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
581 addr = fix_to_virt(idx);
582 pte = early_ioremap_pte(addr);
583 if (pte_present(*pte)) {
584 phys = pte_val(*pte) & PAGE_MASK;
585 set_fixmap(idx, phys);
586 }
587 }
588} 563}
589 564
590static void __init __early_set_fixmap(enum fixed_addresses idx, 565static void __init __early_set_fixmap(enum fixed_addresses idx,
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 71a14f89f89e..08d140fbc31b 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -20,6 +20,12 @@
20#include <asm/acpi.h> 20#include <asm/acpi.h>
21#include <asm/k8.h> 21#include <asm/k8.h>
22 22
23#ifdef CONFIG_DEBUG_PER_CPU_MAPS
24# define DBG(x...) printk(KERN_DEBUG x)
25#else
26# define DBG(x...)
27#endif
28
23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 29struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
24EXPORT_SYMBOL(node_data); 30EXPORT_SYMBOL(node_data);
25 31
@@ -33,6 +39,21 @@ int numa_off __initdata;
33static unsigned long __initdata nodemap_addr; 39static unsigned long __initdata nodemap_addr;
34static unsigned long __initdata nodemap_size; 40static unsigned long __initdata nodemap_size;
35 41
42DEFINE_PER_CPU(int, node_number) = 0;
43EXPORT_PER_CPU_SYMBOL(node_number);
44
45/*
46 * Map cpu index to node index
47 */
48DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
49EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
50
51/*
52 * Which logical CPUs are on which nodes
53 */
54cpumask_t *node_to_cpumask_map;
55EXPORT_SYMBOL(node_to_cpumask_map);
56
36/* 57/*
37 * Given a shift value, try to populate memnodemap[] 58 * Given a shift value, try to populate memnodemap[]
38 * Returns : 59 * Returns :
@@ -640,3 +661,199 @@ void __init init_cpu_to_node(void)
640#endif 661#endif
641 662
642 663
664/*
665 * Allocate node_to_cpumask_map based on number of available nodes
666 * Requires node_possible_map to be valid.
667 *
668 * Note: node_to_cpumask() is not valid until after this is done.
669 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
670 */
671void __init setup_node_to_cpumask_map(void)
672{
673 unsigned int node, num = 0;
674 cpumask_t *map;
675
676 /* setup nr_node_ids if not done yet */
677 if (nr_node_ids == MAX_NUMNODES) {
678 for_each_node_mask(node, node_possible_map)
679 num = node;
680 nr_node_ids = num + 1;
681 }
682
683 /* allocate the map */
684 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
685 DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids);
686
687 pr_debug("Node to cpumask map at %p for %d nodes\n",
688 map, nr_node_ids);
689
690 /* node_to_cpumask() will now work */
691 node_to_cpumask_map = map;
692}
693
694void __cpuinit numa_set_node(int cpu, int node)
695{
696 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
697
698 /* early setting, no percpu area yet */
699 if (cpu_to_node_map) {
700 cpu_to_node_map[cpu] = node;
701 return;
702 }
703
704#ifdef CONFIG_DEBUG_PER_CPU_MAPS
705 if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) {
706 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
707 dump_stack();
708 return;
709 }
710#endif
711 per_cpu(x86_cpu_to_node_map, cpu) = node;
712
713 if (node != NUMA_NO_NODE)
714 per_cpu(node_number, cpu) = node;
715}
716
717void __cpuinit numa_clear_node(int cpu)
718{
719 numa_set_node(cpu, NUMA_NO_NODE);
720}
721
722#ifndef CONFIG_DEBUG_PER_CPU_MAPS
723
724void __cpuinit numa_add_cpu(int cpu)
725{
726 cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
727}
728
729void __cpuinit numa_remove_cpu(int cpu)
730{
731 cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
732}
733
734#else /* CONFIG_DEBUG_PER_CPU_MAPS */
735
736/*
737 * --------- debug versions of the numa functions ---------
738 */
739static void __cpuinit numa_set_cpumask(int cpu, int enable)
740{
741 int node = early_cpu_to_node(cpu);
742 cpumask_t *mask;
743 char buf[64];
744
745 if (node_to_cpumask_map == NULL) {
746 printk(KERN_ERR "node_to_cpumask_map NULL\n");
747 dump_stack();
748 return;
749 }
750
751 mask = &node_to_cpumask_map[node];
752 if (enable)
753 cpu_set(cpu, *mask);
754 else
755 cpu_clear(cpu, *mask);
756
757 cpulist_scnprintf(buf, sizeof(buf), mask);
758 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
759 enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
760}
761
762void __cpuinit numa_add_cpu(int cpu)
763{
764 numa_set_cpumask(cpu, 1);
765}
766
767void __cpuinit numa_remove_cpu(int cpu)
768{
769 numa_set_cpumask(cpu, 0);
770}
771
772int cpu_to_node(int cpu)
773{
774 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
775 printk(KERN_WARNING
776 "cpu_to_node(%d): usage too early!\n", cpu);
777 dump_stack();
778 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
779 }
780 return per_cpu(x86_cpu_to_node_map, cpu);
781}
782EXPORT_SYMBOL(cpu_to_node);
783
784/*
785 * Same function as cpu_to_node() but used if called before the
786 * per_cpu areas are setup.
787 */
788int early_cpu_to_node(int cpu)
789{
790 if (early_per_cpu_ptr(x86_cpu_to_node_map))
791 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
792
793 if (!per_cpu_offset(cpu)) {
794 printk(KERN_WARNING
795 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
796 dump_stack();
797 return NUMA_NO_NODE;
798 }
799 return per_cpu(x86_cpu_to_node_map, cpu);
800}
801
802
803/* empty cpumask */
804static const cpumask_t cpu_mask_none;
805
806/*
807 * Returns a pointer to the bitmask of CPUs on Node 'node'.
808 */
809const cpumask_t *cpumask_of_node(int node)
810{
811 if (node_to_cpumask_map == NULL) {
812 printk(KERN_WARNING
813 "cpumask_of_node(%d): no node_to_cpumask_map!\n",
814 node);
815 dump_stack();
816 return (const cpumask_t *)&cpu_online_map;
817 }
818 if (node >= nr_node_ids) {
819 printk(KERN_WARNING
820 "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
821 node, nr_node_ids);
822 dump_stack();
823 return &cpu_mask_none;
824 }
825 return &node_to_cpumask_map[node];
826}
827EXPORT_SYMBOL(cpumask_of_node);
828
829/*
830 * Returns a bitmask of CPUs on Node 'node'.
831 *
832 * Side note: this function creates the returned cpumask on the stack
833 * so with a high NR_CPUS count, excessive stack space is used. The
834 * node_to_cpumask_ptr function should be used whenever possible.
835 */
836cpumask_t node_to_cpumask(int node)
837{
838 if (node_to_cpumask_map == NULL) {
839 printk(KERN_WARNING
840 "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
841 dump_stack();
842 return cpu_online_map;
843 }
844 if (node >= nr_node_ids) {
845 printk(KERN_WARNING
846 "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
847 node, nr_node_ids);
848 dump_stack();
849 return cpu_mask_none;
850 }
851 return node_to_cpumask_map[node];
852}
853EXPORT_SYMBOL(node_to_cpumask);
854
855/*
856 * --------- end of debug versions of the numa functions ---------
857 */
858
859#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e89d24815f26..84ba74820ad6 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -534,6 +534,36 @@ out_unlock:
534 return 0; 534 return 0;
535} 535}
536 536
537static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
538 int primary)
539{
540 /*
541 * Ignore all non primary paths.
542 */
543 if (!primary)
544 return 0;
545
546 /*
547 * Ignore the NULL PTE for kernel identity mapping, as it is expected
548 * to have holes.
549 * Also set numpages to '1' indicating that we processed cpa req for
550 * one virtual address page and its pfn. TBD: numpages can be set based
551 * on the initial value and the level returned by lookup_address().
552 */
553 if (within(vaddr, PAGE_OFFSET,
554 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
555 cpa->numpages = 1;
556 cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
557 return 0;
558 } else {
559 WARN(1, KERN_WARNING "CPA: called for zero pte. "
560 "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
561 *cpa->vaddr);
562
563 return -EFAULT;
564 }
565}
566
537static int __change_page_attr(struct cpa_data *cpa, int primary) 567static int __change_page_attr(struct cpa_data *cpa, int primary)
538{ 568{
539 unsigned long address; 569 unsigned long address;
@@ -549,17 +579,11 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
549repeat: 579repeat:
550 kpte = lookup_address(address, &level); 580 kpte = lookup_address(address, &level);
551 if (!kpte) 581 if (!kpte)
552 return 0; 582 return __cpa_process_fault(cpa, address, primary);
553 583
554 old_pte = *kpte; 584 old_pte = *kpte;
555 if (!pte_val(old_pte)) { 585 if (!pte_val(old_pte))
556 if (!primary) 586 return __cpa_process_fault(cpa, address, primary);
557 return 0;
558 WARN(1, KERN_WARNING "CPA: called for zero pte. "
559 "vaddr = %lx cpa->vaddr = %lx\n", address,
560 *cpa->vaddr);
561 return -EINVAL;
562 }
563 587
564 if (level == PG_LEVEL_4K) { 588 if (level == PG_LEVEL_4K) {
565 pte_t new_pte; 589 pte_t new_pte;
@@ -657,12 +681,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
657 vaddr = *cpa->vaddr; 681 vaddr = *cpa->vaddr;
658 682
659 if (!(within(vaddr, PAGE_OFFSET, 683 if (!(within(vaddr, PAGE_OFFSET,
660 PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT)) 684 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
661#ifdef CONFIG_X86_64
662 || within(vaddr, PAGE_OFFSET + (1UL<<32),
663 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
664#endif
665 )) {
666 685
667 alias_cpa = *cpa; 686 alias_cpa = *cpa;
668 temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); 687 temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 8b08fb955274..9127e31c7268 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -30,7 +30,7 @@
30#ifdef CONFIG_X86_PAT 30#ifdef CONFIG_X86_PAT
31int __read_mostly pat_enabled = 1; 31int __read_mostly pat_enabled = 1;
32 32
33void __cpuinit pat_disable(char *reason) 33void __cpuinit pat_disable(const char *reason)
34{ 34{
35 pat_enabled = 0; 35 pat_enabled = 0;
36 printk(KERN_INFO "%s\n", reason); 36 printk(KERN_INFO "%s\n", reason);
@@ -42,6 +42,11 @@ static int __init nopat(char *str)
42 return 0; 42 return 0;
43} 43}
44early_param("nopat", nopat); 44early_param("nopat", nopat);
45#else
46static inline void pat_disable(const char *reason)
47{
48 (void)reason;
49}
45#endif 50#endif
46 51
47 52
@@ -78,16 +83,20 @@ void pat_init(void)
78 if (!pat_enabled) 83 if (!pat_enabled)
79 return; 84 return;
80 85
81 /* Paranoia check. */ 86 if (!cpu_has_pat) {
82 if (!cpu_has_pat && boot_pat_state) { 87 if (!boot_pat_state) {
83 /* 88 pat_disable("PAT not supported by CPU.");
84 * If this happens we are on a secondary CPU, but 89 return;
85 * switched to PAT on the boot CPU. We have no way to 90 } else {
86 * undo PAT. 91 /*
87 */ 92 * If this happens we are on a secondary CPU, but
88 printk(KERN_ERR "PAT enabled, " 93 * switched to PAT on the boot CPU. We have no way to
89 "but not supported by secondary CPU\n"); 94 * undo PAT.
90 BUG(); 95 */
96 printk(KERN_ERR "PAT enabled, "
97 "but not supported by secondary CPU\n");
98 BUG();
99 }
91 } 100 }
92 101
93 /* Set PWT to Write-Combining. All other bits stay the same */ 102 /* Set PWT to Write-Combining. All other bits stay the same */
@@ -333,11 +342,23 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
333 req_type & _PAGE_CACHE_MASK); 342 req_type & _PAGE_CACHE_MASK);
334 } 343 }
335 344
336 is_range_ram = pagerange_is_ram(start, end); 345 if (new_type)
337 if (is_range_ram == 1) 346 *new_type = actual_type;
338 return reserve_ram_pages_type(start, end, req_type, new_type); 347
339 else if (is_range_ram < 0) 348 /*
340 return -EINVAL; 349 * For legacy reasons, some parts of the physical address range in the
350 * legacy 1MB region is treated as non-RAM (even when listed as RAM in
351 * the e820 tables). So we will track the memory attributes of this
352 * legacy 1MB region using the linear memtype_list always.
353 */
354 if (end >= ISA_END_ADDRESS) {
355 is_range_ram = pagerange_is_ram(start, end);
356 if (is_range_ram == 1)
357 return reserve_ram_pages_type(start, end, req_type,
358 new_type);
359 else if (is_range_ram < 0)
360 return -EINVAL;
361 }
341 362
342 new = kmalloc(sizeof(struct memtype), GFP_KERNEL); 363 new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
343 if (!new) 364 if (!new)
@@ -347,9 +368,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
347 new->end = end; 368 new->end = end;
348 new->type = actual_type; 369 new->type = actual_type;
349 370
350 if (new_type)
351 *new_type = actual_type;
352
353 spin_lock(&memtype_lock); 371 spin_lock(&memtype_lock);
354 372
355 if (cached_entry && start >= cached_start) 373 if (cached_entry && start >= cached_start)
@@ -437,11 +455,19 @@ int free_memtype(u64 start, u64 end)
437 if (is_ISA_range(start, end - 1)) 455 if (is_ISA_range(start, end - 1))
438 return 0; 456 return 0;
439 457
440 is_range_ram = pagerange_is_ram(start, end); 458 /*
441 if (is_range_ram == 1) 459 * For legacy reasons, some parts of the physical address range in the
442 return free_ram_pages_type(start, end); 460 * legacy 1MB region is treated as non-RAM (even when listed as RAM in
443 else if (is_range_ram < 0) 461 * the e820 tables). So we will track the memory attributes of this
444 return -EINVAL; 462 * legacy 1MB region using the linear memtype_list always.
463 */
464 if (end >= ISA_END_ADDRESS) {
465 is_range_ram = pagerange_is_ram(start, end);
466 if (is_range_ram == 1)
467 return free_ram_pages_type(start, end);
468 else if (is_range_ram < 0)
469 return -EINVAL;
470 }
445 471
446 spin_lock(&memtype_lock); 472 spin_lock(&memtype_lock);
447 list_for_each_entry(entry, &memtype_list, nd) { 473 list_for_each_entry(entry, &memtype_list, nd) {
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 09737c8af074..15df1baee100 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -21,6 +21,7 @@
21#include <asm/numa.h> 21#include <asm/numa.h>
22#include <asm/e820.h> 22#include <asm/e820.h>
23#include <asm/genapic.h> 23#include <asm/genapic.h>
24#include <asm/uv/uv.h>
24 25
25int acpi_numa __initdata; 26int acpi_numa __initdata;
26 27
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
new file mode 100644
index 000000000000..72a6d4ebe34d
--- /dev/null
+++ b/arch/x86/mm/tlb.c
@@ -0,0 +1,296 @@
1#include <linux/init.h>
2
3#include <linux/mm.h>
4#include <linux/spinlock.h>
5#include <linux/smp.h>
6#include <linux/interrupt.h>
7#include <linux/module.h>
8
9#include <asm/tlbflush.h>
10#include <asm/mmu_context.h>
11#include <asm/apic.h>
12#include <asm/uv/uv.h>
13
14DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
15 = { &init_mm, 0, };
16
17#include <mach_ipi.h>
18/*
19 * Smarter SMP flushing macros.
20 * c/o Linus Torvalds.
21 *
22 * These mean you can really definitely utterly forget about
23 * writing to user space from interrupts. (Its not allowed anyway).
24 *
25 * Optimizations Manfred Spraul <manfred@colorfullife.com>
26 *
27 * More scalable flush, from Andi Kleen
28 *
29 * To avoid global state use 8 different call vectors.
30 * Each CPU uses a specific vector to trigger flushes on other
31 * CPUs. Depending on the received vector the target CPUs look into
32 * the right array slot for the flush data.
33 *
34 * With more than 8 CPUs they are hashed to the 8 available
35 * vectors. The limited global vector space forces us to this right now.
36 * In future when interrupts are split into per CPU domains this could be
37 * fixed, at the cost of triggering multiple IPIs in some cases.
38 */
39
40union smp_flush_state {
41 struct {
42 struct mm_struct *flush_mm;
43 unsigned long flush_va;
44 spinlock_t tlbstate_lock;
45 DECLARE_BITMAP(flush_cpumask, NR_CPUS);
46 };
47 char pad[CONFIG_X86_INTERNODE_CACHE_BYTES];
48} ____cacheline_internodealigned_in_smp;
49
50/* State is put into the per CPU data section, but padded
51 to a full cache line because other CPUs can access it and we don't
52 want false sharing in the per cpu data segment. */
53static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
54
55/*
56 * We cannot call mmdrop() because we are in interrupt context,
57 * instead update mm->cpu_vm_mask.
58 */
59void leave_mm(int cpu)
60{
61 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
62 BUG();
63 cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
64 load_cr3(swapper_pg_dir);
65}
66EXPORT_SYMBOL_GPL(leave_mm);
67
68/*
69 *
70 * The flush IPI assumes that a thread switch happens in this order:
71 * [cpu0: the cpu that switches]
72 * 1) switch_mm() either 1a) or 1b)
73 * 1a) thread switch to a different mm
74 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
75 * Stop ipi delivery for the old mm. This is not synchronized with
76 * the other cpus, but smp_invalidate_interrupt ignore flush ipis
77 * for the wrong mm, and in the worst case we perform a superfluous
78 * tlb flush.
79 * 1a2) set cpu mmu_state to TLBSTATE_OK
80 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
81 * was in lazy tlb mode.
82 * 1a3) update cpu active_mm
83 * Now cpu0 accepts tlb flushes for the new mm.
84 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
85 * Now the other cpus will send tlb flush ipis.
86 * 1a4) change cr3.
87 * 1b) thread switch without mm change
88 * cpu active_mm is correct, cpu0 already handles
89 * flush ipis.
90 * 1b1) set cpu mmu_state to TLBSTATE_OK
91 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
92 * Atomically set the bit [other cpus will start sending flush ipis],
93 * and test the bit.
94 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
95 * 2) switch %%esp, ie current
96 *
97 * The interrupt must handle 2 special cases:
98 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
99 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
100 * runs in kernel space, the cpu could load tlb entries for user space
101 * pages.
102 *
103 * The good news is that cpu mmu_state is local to each cpu, no
104 * write/read ordering problems.
105 */
106
107/*
108 * TLB flush IPI:
109 *
110 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
111 * 2) Leave the mm if we are in the lazy tlb mode.
112 *
113 * Interrupts are disabled.
114 */
115
116/*
117 * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop
118 * but still used for documentation purpose but the usage is slightly
119 * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt
120 * entry calls in with the first parameter in %eax. Maybe define
121 * intrlinkage?
122 */
123#ifdef CONFIG_X86_64
124asmlinkage
125#endif
126void smp_invalidate_interrupt(struct pt_regs *regs)
127{
128 unsigned int cpu;
129 unsigned int sender;
130 union smp_flush_state *f;
131
132 cpu = smp_processor_id();
133 /*
134 * orig_rax contains the negated interrupt vector.
135 * Use that to determine where the sender put the data.
136 */
137 sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
138 f = &flush_state[sender];
139
140 if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
141 goto out;
142 /*
143 * This was a BUG() but until someone can quote me the
144 * line from the intel manual that guarantees an IPI to
145 * multiple CPUs is retried _only_ on the erroring CPUs
146 * its staying as a return
147 *
148 * BUG();
149 */
150
151 if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
152 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
153 if (f->flush_va == TLB_FLUSH_ALL)
154 local_flush_tlb();
155 else
156 __flush_tlb_one(f->flush_va);
157 } else
158 leave_mm(cpu);
159 }
160out:
161 ack_APIC_irq();
162 smp_mb__before_clear_bit();
163 cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
164 smp_mb__after_clear_bit();
165 inc_irq_stat(irq_tlb_count);
166}
167
168static void flush_tlb_others_ipi(const struct cpumask *cpumask,
169 struct mm_struct *mm, unsigned long va)
170{
171 unsigned int sender;
172 union smp_flush_state *f;
173
174 /* Caller has disabled preemption */
175 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
176 f = &flush_state[sender];
177
178 /*
179 * Could avoid this lock when
180 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
181 * probably not worth checking this for a cache-hot lock.
182 */
183 spin_lock(&f->tlbstate_lock);
184
185 f->flush_mm = mm;
186 f->flush_va = va;
187 cpumask_andnot(to_cpumask(f->flush_cpumask),
188 cpumask, cpumask_of(smp_processor_id()));
189
190 /*
191 * Make the above memory operations globally visible before
192 * sending the IPI.
193 */
194 smp_mb();
195 /*
196 * We have to send the IPI only to
197 * CPUs affected.
198 */
199 send_IPI_mask(to_cpumask(f->flush_cpumask),
200 INVALIDATE_TLB_VECTOR_START + sender);
201
202 while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
203 cpu_relax();
204
205 f->flush_mm = NULL;
206 f->flush_va = 0;
207 spin_unlock(&f->tlbstate_lock);
208}
209
210void native_flush_tlb_others(const struct cpumask *cpumask,
211 struct mm_struct *mm, unsigned long va)
212{
213 if (is_uv_system()) {
214 unsigned int cpu;
215
216 cpu = get_cpu();
217 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
218 if (cpumask)
219 flush_tlb_others_ipi(cpumask, mm, va);
220 put_cpu();
221 return;
222 }
223 flush_tlb_others_ipi(cpumask, mm, va);
224}
225
226static int __cpuinit init_smp_flush(void)
227{
228 int i;
229
230 for (i = 0; i < ARRAY_SIZE(flush_state); i++)
231 spin_lock_init(&flush_state[i].tlbstate_lock);
232
233 return 0;
234}
235core_initcall(init_smp_flush);
236
237void flush_tlb_current_task(void)
238{
239 struct mm_struct *mm = current->mm;
240
241 preempt_disable();
242
243 local_flush_tlb();
244 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
245 flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
246 preempt_enable();
247}
248
249void flush_tlb_mm(struct mm_struct *mm)
250{
251 preempt_disable();
252
253 if (current->active_mm == mm) {
254 if (current->mm)
255 local_flush_tlb();
256 else
257 leave_mm(smp_processor_id());
258 }
259 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
260 flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
261
262 preempt_enable();
263}
264
265void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
266{
267 struct mm_struct *mm = vma->vm_mm;
268
269 preempt_disable();
270
271 if (current->active_mm == mm) {
272 if (current->mm)
273 __flush_tlb_one(va);
274 else
275 leave_mm(smp_processor_id());
276 }
277
278 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
279 flush_tlb_others(&mm->cpu_vm_mask, mm, va);
280
281 preempt_enable();
282}
283
284static void do_flush_tlb_all(void *info)
285{
286 unsigned long cpu = smp_processor_id();
287
288 __flush_tlb_all();
289 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
290 leave_mm(cpu);
291}
292
293void flush_tlb_all(void)
294{
295 on_each_cpu(do_flush_tlb_all, NULL, 1);
296}