Diffstat (limited to 'arch/x86/mm')
-rw-r--r--   arch/x86/mm/Makefile   |   2
-rw-r--r--   arch/x86/mm/extable.c  |   6
-rw-r--r--   arch/x86/mm/fault.c    | 446
-rw-r--r--   arch/x86/mm/init_32.c  |  49
-rw-r--r--   arch/x86/mm/init_64.c  |   2
-rw-r--r--   arch/x86/mm/iomap_32.c |  10
-rw-r--r--   arch/x86/mm/ioremap.c  |  27
-rw-r--r--   arch/x86/mm/numa_64.c  | 217
-rw-r--r--   arch/x86/mm/pageattr.c |  49
-rw-r--r--   arch/x86/mm/pat.c      |  74
-rw-r--r--   arch/x86/mm/srat_64.c  |   1
-rw-r--r--   arch/x86/mm/tlb.c      | 296
12 files changed, 926 insertions, 253 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index d8cc96a2738f..9f05157220f5 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,6 +1,8 @@
 obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
          pat.o pgtable.o gup.o
 
+obj-$(CONFIG_X86_SMP) += tlb.o
+
 obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
 
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 7e8db53528a7..61b41ca3b5a2 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -23,6 +23,12 @@ int fixup_exception(struct pt_regs *regs)
 
 	fixup = search_exception_tables(regs->ip);
 	if (fixup) {
+		/* If fixup is less than 16, it means uaccess error */
+		if (fixup->fixup < 16) {
+			current_thread_info()->uaccess_err = -EFAULT;
+			regs->ip += fixup->fixup;
+			return 1;
+		}
 		regs->ip = fixup->fixup;
 		return 1;
 	}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 90dfae511a41..d3eee74f830a 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/kprobes.h> | 26 | #include <linux/kprobes.h> |
27 | #include <linux/uaccess.h> | 27 | #include <linux/uaccess.h> |
28 | #include <linux/kdebug.h> | 28 | #include <linux/kdebug.h> |
29 | #include <linux/magic.h> | ||
29 | 30 | ||
30 | #include <asm/system.h> | 31 | #include <asm/system.h> |
31 | #include <asm/desc.h> | 32 | #include <asm/desc.h> |
@@ -91,8 +92,8 @@ static inline int notify_page_fault(struct pt_regs *regs) | |||
91 | * | 92 | * |
92 | * Opcode checker based on code by Richard Brunner | 93 | * Opcode checker based on code by Richard Brunner |
93 | */ | 94 | */ |
94 | static int is_prefetch(struct pt_regs *regs, unsigned long addr, | 95 | static int is_prefetch(struct pt_regs *regs, unsigned long error_code, |
95 | unsigned long error_code) | 96 | unsigned long addr) |
96 | { | 97 | { |
97 | unsigned char *instr; | 98 | unsigned char *instr; |
98 | int scan_more = 1; | 99 | int scan_more = 1; |
@@ -409,17 +410,16 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, | |||
409 | } | 410 | } |
410 | 411 | ||
411 | #ifdef CONFIG_X86_64 | 412 | #ifdef CONFIG_X86_64 |
412 | static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, | 413 | static noinline void pgtable_bad(struct pt_regs *regs, |
413 | unsigned long error_code) | 414 | unsigned long error_code, unsigned long address) |
414 | { | 415 | { |
415 | unsigned long flags = oops_begin(); | 416 | unsigned long flags = oops_begin(); |
416 | int sig = SIGKILL; | 417 | int sig = SIGKILL; |
417 | struct task_struct *tsk; | 418 | struct task_struct *tsk = current; |
418 | 419 | ||
419 | printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", | 420 | printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", |
420 | current->comm, address); | 421 | tsk->comm, address); |
421 | dump_pagetable(address); | 422 | dump_pagetable(address); |
422 | tsk = current; | ||
423 | tsk->thread.cr2 = address; | 423 | tsk->thread.cr2 = address; |
424 | tsk->thread.trap_no = 14; | 424 | tsk->thread.trap_no = 14; |
425 | tsk->thread.error_code = error_code; | 425 | tsk->thread.error_code = error_code; |
@@ -429,6 +429,196 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, | |||
429 | } | 429 | } |
430 | #endif | 430 | #endif |
431 | 431 | ||
432 | static noinline void no_context(struct pt_regs *regs, | ||
433 | unsigned long error_code, unsigned long address) | ||
434 | { | ||
435 | struct task_struct *tsk = current; | ||
436 | unsigned long *stackend; | ||
437 | |||
438 | #ifdef CONFIG_X86_64 | ||
439 | unsigned long flags; | ||
440 | int sig; | ||
441 | #endif | ||
442 | |||
443 | /* Are we prepared to handle this kernel fault? */ | ||
444 | if (fixup_exception(regs)) | ||
445 | return; | ||
446 | |||
447 | /* | ||
448 | * X86_32 | ||
449 | * Valid to do another page fault here, because if this fault | ||
450 | * had been triggered by is_prefetch fixup_exception would have | ||
451 | * handled it. | ||
452 | * | ||
453 | * X86_64 | ||
454 | * Hall of shame of CPU/BIOS bugs. | ||
455 | */ | ||
456 | if (is_prefetch(regs, error_code, address)) | ||
457 | return; | ||
458 | |||
459 | if (is_errata93(regs, address)) | ||
460 | return; | ||
461 | |||
462 | /* | ||
463 | * Oops. The kernel tried to access some bad page. We'll have to | ||
464 | * terminate things with extreme prejudice. | ||
465 | */ | ||
466 | #ifdef CONFIG_X86_32 | ||
467 | bust_spinlocks(1); | ||
468 | #else | ||
469 | flags = oops_begin(); | ||
470 | #endif | ||
471 | |||
472 | show_fault_oops(regs, error_code, address); | ||
473 | |||
474 | stackend = end_of_stack(tsk); | ||
475 | if (*stackend != STACK_END_MAGIC) | ||
476 | printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); | ||
477 | |||
478 | tsk->thread.cr2 = address; | ||
479 | tsk->thread.trap_no = 14; | ||
480 | tsk->thread.error_code = error_code; | ||
481 | |||
482 | #ifdef CONFIG_X86_32 | ||
483 | die("Oops", regs, error_code); | ||
484 | bust_spinlocks(0); | ||
485 | do_exit(SIGKILL); | ||
486 | #else | ||
487 | sig = SIGKILL; | ||
488 | if (__die("Oops", regs, error_code)) | ||
489 | sig = 0; | ||
490 | /* Executive summary in case the body of the oops scrolled away */ | ||
491 | printk(KERN_EMERG "CR2: %016lx\n", address); | ||
492 | oops_end(flags, regs, sig); | ||
493 | #endif | ||
494 | } | ||
495 | |||
496 | static void __bad_area_nosemaphore(struct pt_regs *regs, | ||
497 | unsigned long error_code, unsigned long address, | ||
498 | int si_code) | ||
499 | { | ||
500 | struct task_struct *tsk = current; | ||
501 | |||
502 | /* User mode accesses just cause a SIGSEGV */ | ||
503 | if (error_code & PF_USER) { | ||
504 | /* | ||
505 | * It's possible to have interrupts off here. | ||
506 | */ | ||
507 | local_irq_enable(); | ||
508 | |||
509 | /* | ||
510 | * Valid to do another page fault here because this one came | ||
511 | * from user space. | ||
512 | */ | ||
513 | if (is_prefetch(regs, error_code, address)) | ||
514 | return; | ||
515 | |||
516 | if (is_errata100(regs, address)) | ||
517 | return; | ||
518 | |||
519 | if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && | ||
520 | printk_ratelimit()) { | ||
521 | printk( | ||
522 | "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", | ||
523 | task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, | ||
524 | tsk->comm, task_pid_nr(tsk), address, | ||
525 | (void *) regs->ip, (void *) regs->sp, error_code); | ||
526 | print_vma_addr(" in ", regs->ip); | ||
527 | printk("\n"); | ||
528 | } | ||
529 | |||
530 | tsk->thread.cr2 = address; | ||
531 | /* Kernel addresses are always protection faults */ | ||
532 | tsk->thread.error_code = error_code | (address >= TASK_SIZE); | ||
533 | tsk->thread.trap_no = 14; | ||
534 | force_sig_info_fault(SIGSEGV, si_code, address, tsk); | ||
535 | return; | ||
536 | } | ||
537 | |||
538 | if (is_f00f_bug(regs, address)) | ||
539 | return; | ||
540 | |||
541 | no_context(regs, error_code, address); | ||
542 | } | ||
543 | |||
544 | static noinline void bad_area_nosemaphore(struct pt_regs *regs, | ||
545 | unsigned long error_code, unsigned long address) | ||
546 | { | ||
547 | __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); | ||
548 | } | ||
549 | |||
550 | static void __bad_area(struct pt_regs *regs, | ||
551 | unsigned long error_code, unsigned long address, | ||
552 | int si_code) | ||
553 | { | ||
554 | struct mm_struct *mm = current->mm; | ||
555 | |||
556 | /* | ||
557 | * Something tried to access memory that isn't in our memory map.. | ||
558 | * Fix it, but check if it's kernel or user first.. | ||
559 | */ | ||
560 | up_read(&mm->mmap_sem); | ||
561 | |||
562 | __bad_area_nosemaphore(regs, error_code, address, si_code); | ||
563 | } | ||
564 | |||
565 | static noinline void bad_area(struct pt_regs *regs, | ||
566 | unsigned long error_code, unsigned long address) | ||
567 | { | ||
568 | __bad_area(regs, error_code, address, SEGV_MAPERR); | ||
569 | } | ||
570 | |||
571 | static noinline void bad_area_access_error(struct pt_regs *regs, | ||
572 | unsigned long error_code, unsigned long address) | ||
573 | { | ||
574 | __bad_area(regs, error_code, address, SEGV_ACCERR); | ||
575 | } | ||
576 | |||
577 | /* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ | ||
578 | static void out_of_memory(struct pt_regs *regs, | ||
579 | unsigned long error_code, unsigned long address) | ||
580 | { | ||
581 | /* | ||
582 | * We ran out of memory, call the OOM killer, and return the userspace | ||
583 | * (which will retry the fault, or kill us if we got oom-killed). | ||
584 | */ | ||
585 | up_read(¤t->mm->mmap_sem); | ||
586 | pagefault_out_of_memory(); | ||
587 | } | ||
588 | |||
589 | static void do_sigbus(struct pt_regs *regs, | ||
590 | unsigned long error_code, unsigned long address) | ||
591 | { | ||
592 | struct task_struct *tsk = current; | ||
593 | struct mm_struct *mm = tsk->mm; | ||
594 | |||
595 | up_read(&mm->mmap_sem); | ||
596 | |||
597 | /* Kernel mode? Handle exceptions or die */ | ||
598 | if (!(error_code & PF_USER)) | ||
599 | no_context(regs, error_code, address); | ||
600 | #ifdef CONFIG_X86_32 | ||
601 | /* User space => ok to do another page fault */ | ||
602 | if (is_prefetch(regs, error_code, address)) | ||
603 | return; | ||
604 | #endif | ||
605 | tsk->thread.cr2 = address; | ||
606 | tsk->thread.error_code = error_code; | ||
607 | tsk->thread.trap_no = 14; | ||
608 | force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); | ||
609 | } | ||
610 | |||
611 | static noinline void mm_fault_error(struct pt_regs *regs, | ||
612 | unsigned long error_code, unsigned long address, unsigned int fault) | ||
613 | { | ||
614 | if (fault & VM_FAULT_OOM) | ||
615 | out_of_memory(regs, error_code, address); | ||
616 | else if (fault & VM_FAULT_SIGBUS) | ||
617 | do_sigbus(regs, error_code, address); | ||
618 | else | ||
619 | BUG(); | ||
620 | } | ||
621 | |||
432 | static int spurious_fault_check(unsigned long error_code, pte_t *pte) | 622 | static int spurious_fault_check(unsigned long error_code, pte_t *pte) |
433 | { | 623 | { |
434 | if ((error_code & PF_WRITE) && !pte_write(*pte)) | 624 | if ((error_code & PF_WRITE) && !pte_write(*pte)) |
@@ -448,8 +638,8 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) | |||
448 | * There are no security implications to leaving a stale TLB when | 638 | * There are no security implications to leaving a stale TLB when |
449 | * increasing the permissions on a page. | 639 | * increasing the permissions on a page. |
450 | */ | 640 | */ |
451 | static int spurious_fault(unsigned long address, | 641 | static noinline int spurious_fault(unsigned long error_code, |
452 | unsigned long error_code) | 642 | unsigned long address) |
453 | { | 643 | { |
454 | pgd_t *pgd; | 644 | pgd_t *pgd; |
455 | pud_t *pud; | 645 | pud_t *pud; |
@@ -494,7 +684,7 @@ static int spurious_fault(unsigned long address, | |||
494 | * | 684 | * |
495 | * This assumes no large pages in there. | 685 | * This assumes no large pages in there. |
496 | */ | 686 | */ |
497 | static int vmalloc_fault(unsigned long address) | 687 | static noinline int vmalloc_fault(unsigned long address) |
498 | { | 688 | { |
499 | #ifdef CONFIG_X86_32 | 689 | #ifdef CONFIG_X86_32 |
500 | unsigned long pgd_paddr; | 690 | unsigned long pgd_paddr; |
@@ -573,6 +763,25 @@ static int vmalloc_fault(unsigned long address) | |||
573 | 763 | ||
574 | int show_unhandled_signals = 1; | 764 | int show_unhandled_signals = 1; |
575 | 765 | ||
766 | static inline int access_error(unsigned long error_code, int write, | ||
767 | struct vm_area_struct *vma) | ||
768 | { | ||
769 | if (write) { | ||
770 | /* write, present and write, not present */ | ||
771 | if (unlikely(!(vma->vm_flags & VM_WRITE))) | ||
772 | return 1; | ||
773 | } else if (unlikely(error_code & PF_PROT)) { | ||
774 | /* read, present */ | ||
775 | return 1; | ||
776 | } else { | ||
777 | /* read, not present */ | ||
778 | if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) | ||
779 | return 1; | ||
780 | } | ||
781 | |||
782 | return 0; | ||
783 | } | ||
784 | |||
576 | /* | 785 | /* |
577 | * This routine handles page faults. It determines the address, | 786 | * This routine handles page faults. It determines the address, |
578 | * and the problem, and then passes it off to one of the appropriate | 787 | * and the problem, and then passes it off to one of the appropriate |
@@ -583,16 +792,12 @@ asmlinkage | |||
583 | #endif | 792 | #endif |
584 | void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) | 793 | void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) |
585 | { | 794 | { |
795 | unsigned long address; | ||
586 | struct task_struct *tsk; | 796 | struct task_struct *tsk; |
587 | struct mm_struct *mm; | 797 | struct mm_struct *mm; |
588 | struct vm_area_struct *vma; | 798 | struct vm_area_struct *vma; |
589 | unsigned long address; | 799 | int write; |
590 | int write, si_code; | ||
591 | int fault; | 800 | int fault; |
592 | #ifdef CONFIG_X86_64 | ||
593 | unsigned long flags; | ||
594 | int sig; | ||
595 | #endif | ||
596 | 801 | ||
597 | tsk = current; | 802 | tsk = current; |
598 | mm = tsk->mm; | 803 | mm = tsk->mm; |
@@ -601,9 +806,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
601 | /* get the address */ | 806 | /* get the address */ |
602 | address = read_cr2(); | 807 | address = read_cr2(); |
603 | 808 | ||
604 | si_code = SEGV_MAPERR; | 809 | if (unlikely(notify_page_fault(regs))) |
605 | |||
606 | if (notify_page_fault(regs)) | ||
607 | return; | 810 | return; |
608 | if (unlikely(kmmio_fault(regs, address))) | 811 | if (unlikely(kmmio_fault(regs, address))) |
609 | return; | 812 | return; |
@@ -631,17 +834,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
631 | return; | 834 | return; |
632 | 835 | ||
633 | /* Can handle a stale RO->RW TLB */ | 836 | /* Can handle a stale RO->RW TLB */ |
634 | if (spurious_fault(address, error_code)) | 837 | if (spurious_fault(error_code, address)) |
635 | return; | 838 | return; |
636 | 839 | ||
637 | /* | 840 | /* |
638 | * Don't take the mm semaphore here. If we fixup a prefetch | 841 | * Don't take the mm semaphore here. If we fixup a prefetch |
639 | * fault we could otherwise deadlock. | 842 | * fault we could otherwise deadlock. |
640 | */ | 843 | */ |
641 | goto bad_area_nosemaphore; | 844 | bad_area_nosemaphore(regs, error_code, address); |
845 | return; | ||
642 | } | 846 | } |
643 | 847 | ||
644 | |||
645 | /* | 848 | /* |
646 | * It's safe to allow irq's after cr2 has been saved and the | 849 | * It's safe to allow irq's after cr2 has been saved and the |
647 | * vmalloc fault has been handled. | 850 | * vmalloc fault has been handled. |
@@ -657,15 +860,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
657 | 860 | ||
658 | #ifdef CONFIG_X86_64 | 861 | #ifdef CONFIG_X86_64 |
659 | if (unlikely(error_code & PF_RSVD)) | 862 | if (unlikely(error_code & PF_RSVD)) |
660 | pgtable_bad(address, regs, error_code); | 863 | pgtable_bad(regs, error_code, address); |
661 | #endif | 864 | #endif |
662 | 865 | ||
663 | /* | 866 | /* |
664 | * If we're in an interrupt, have no user context or are running in an | 867 | * If we're in an interrupt, have no user context or are running in an |
665 | * atomic region then we must not take the fault. | 868 | * atomic region then we must not take the fault. |
666 | */ | 869 | */ |
667 | if (unlikely(in_atomic() || !mm)) | 870 | if (unlikely(in_atomic() || !mm)) { |
668 | goto bad_area_nosemaphore; | 871 | bad_area_nosemaphore(regs, error_code, address); |
872 | return; | ||
873 | } | ||
669 | 874 | ||
670 | /* | 875 | /* |
671 | * When running in the kernel we expect faults to occur only to | 876 | * When running in the kernel we expect faults to occur only to |
@@ -683,20 +888,26 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
683 | * source. If this is invalid we can skip the address space check, | 888 | * source. If this is invalid we can skip the address space check, |
684 | * thus avoiding the deadlock. | 889 | * thus avoiding the deadlock. |
685 | */ | 890 | */ |
686 | if (!down_read_trylock(&mm->mmap_sem)) { | 891 | if (unlikely(!down_read_trylock(&mm->mmap_sem))) { |
687 | if ((error_code & PF_USER) == 0 && | 892 | if ((error_code & PF_USER) == 0 && |
688 | !search_exception_tables(regs->ip)) | 893 | !search_exception_tables(regs->ip)) { |
689 | goto bad_area_nosemaphore; | 894 | bad_area_nosemaphore(regs, error_code, address); |
895 | return; | ||
896 | } | ||
690 | down_read(&mm->mmap_sem); | 897 | down_read(&mm->mmap_sem); |
691 | } | 898 | } |
692 | 899 | ||
693 | vma = find_vma(mm, address); | 900 | vma = find_vma(mm, address); |
694 | if (!vma) | 901 | if (unlikely(!vma)) { |
695 | goto bad_area; | 902 | bad_area(regs, error_code, address); |
696 | if (vma->vm_start <= address) | 903 | return; |
904 | } | ||
905 | if (likely(vma->vm_start <= address)) | ||
697 | goto good_area; | 906 | goto good_area; |
698 | if (!(vma->vm_flags & VM_GROWSDOWN)) | 907 | if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { |
699 | goto bad_area; | 908 | bad_area(regs, error_code, address); |
909 | return; | ||
910 | } | ||
700 | if (error_code & PF_USER) { | 911 | if (error_code & PF_USER) { |
701 | /* | 912 | /* |
702 | * Accessing the stack below %sp is always a bug. | 913 | * Accessing the stack below %sp is always a bug. |
@@ -704,31 +915,25 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
704 | * and pusha to work. ("enter $65535,$31" pushes | 915 | * and pusha to work. ("enter $65535,$31" pushes |
705 | * 32 pointers and then decrements %sp by 65535.) | 916 | * 32 pointers and then decrements %sp by 65535.) |
706 | */ | 917 | */ |
707 | if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) | 918 | if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { |
708 | goto bad_area; | 919 | bad_area(regs, error_code, address); |
920 | return; | ||
921 | } | ||
709 | } | 922 | } |
710 | if (expand_stack(vma, address)) | 923 | if (unlikely(expand_stack(vma, address))) { |
711 | goto bad_area; | 924 | bad_area(regs, error_code, address); |
712 | /* | 925 | return; |
713 | * Ok, we have a good vm_area for this memory access, so | 926 | } |
714 | * we can handle it.. | 927 | |
715 | */ | 928 | /* |
929 | * Ok, we have a good vm_area for this memory access, so | ||
930 | * we can handle it.. | ||
931 | */ | ||
716 | good_area: | 932 | good_area: |
717 | si_code = SEGV_ACCERR; | 933 | write = error_code & PF_WRITE; |
718 | write = 0; | 934 | if (unlikely(access_error(error_code, write, vma))) { |
719 | switch (error_code & (PF_PROT|PF_WRITE)) { | 935 | bad_area_access_error(regs, error_code, address); |
720 | default: /* 3: write, present */ | 936 | return; |
721 | /* fall through */ | ||
722 | case PF_WRITE: /* write, not present */ | ||
723 | if (!(vma->vm_flags & VM_WRITE)) | ||
724 | goto bad_area; | ||
725 | write++; | ||
726 | break; | ||
727 | case PF_PROT: /* read, present */ | ||
728 | goto bad_area; | ||
729 | case 0: /* read, not present */ | ||
730 | if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) | ||
731 | goto bad_area; | ||
732 | } | 937 | } |
733 | 938 | ||
734 | /* | 939 | /* |
@@ -738,11 +943,8 @@ good_area: | |||
738 | */ | 943 | */ |
739 | fault = handle_mm_fault(mm, vma, address, write); | 944 | fault = handle_mm_fault(mm, vma, address, write); |
740 | if (unlikely(fault & VM_FAULT_ERROR)) { | 945 | if (unlikely(fault & VM_FAULT_ERROR)) { |
741 | if (fault & VM_FAULT_OOM) | 946 | mm_fault_error(regs, error_code, address, fault); |
742 | goto out_of_memory; | 947 | return; |
743 | else if (fault & VM_FAULT_SIGBUS) | ||
744 | goto do_sigbus; | ||
745 | BUG(); | ||
746 | } | 948 | } |
747 | if (fault & VM_FAULT_MAJOR) | 949 | if (fault & VM_FAULT_MAJOR) |
748 | tsk->maj_flt++; | 950 | tsk->maj_flt++; |
@@ -760,128 +962,6 @@ good_area: | |||
760 | } | 962 | } |
761 | #endif | 963 | #endif |
762 | up_read(&mm->mmap_sem); | 964 | up_read(&mm->mmap_sem); |
763 | return; | ||
764 | |||
765 | /* | ||
766 | * Something tried to access memory that isn't in our memory map.. | ||
767 | * Fix it, but check if it's kernel or user first.. | ||
768 | */ | ||
769 | bad_area: | ||
770 | up_read(&mm->mmap_sem); | ||
771 | |||
772 | bad_area_nosemaphore: | ||
773 | /* User mode accesses just cause a SIGSEGV */ | ||
774 | if (error_code & PF_USER) { | ||
775 | /* | ||
776 | * It's possible to have interrupts off here. | ||
777 | */ | ||
778 | local_irq_enable(); | ||
779 | |||
780 | /* | ||
781 | * Valid to do another page fault here because this one came | ||
782 | * from user space. | ||
783 | */ | ||
784 | if (is_prefetch(regs, address, error_code)) | ||
785 | return; | ||
786 | |||
787 | if (is_errata100(regs, address)) | ||
788 | return; | ||
789 | |||
790 | if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && | ||
791 | printk_ratelimit()) { | ||
792 | printk( | ||
793 | "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", | ||
794 | task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, | ||
795 | tsk->comm, task_pid_nr(tsk), address, | ||
796 | (void *) regs->ip, (void *) regs->sp, error_code); | ||
797 | print_vma_addr(" in ", regs->ip); | ||
798 | printk("\n"); | ||
799 | } | ||
800 | |||
801 | tsk->thread.cr2 = address; | ||
802 | /* Kernel addresses are always protection faults */ | ||
803 | tsk->thread.error_code = error_code | (address >= TASK_SIZE); | ||
804 | tsk->thread.trap_no = 14; | ||
805 | force_sig_info_fault(SIGSEGV, si_code, address, tsk); | ||
806 | return; | ||
807 | } | ||
808 | |||
809 | if (is_f00f_bug(regs, address)) | ||
810 | return; | ||
811 | |||
812 | no_context: | ||
813 | /* Are we prepared to handle this kernel fault? */ | ||
814 | if (fixup_exception(regs)) | ||
815 | return; | ||
816 | |||
817 | /* | ||
818 | * X86_32 | ||
819 | * Valid to do another page fault here, because if this fault | ||
820 | * had been triggered by is_prefetch fixup_exception would have | ||
821 | * handled it. | ||
822 | * | ||
823 | * X86_64 | ||
824 | * Hall of shame of CPU/BIOS bugs. | ||
825 | */ | ||
826 | if (is_prefetch(regs, address, error_code)) | ||
827 | return; | ||
828 | |||
829 | if (is_errata93(regs, address)) | ||
830 | return; | ||
831 | |||
832 | /* | ||
833 | * Oops. The kernel tried to access some bad page. We'll have to | ||
834 | * terminate things with extreme prejudice. | ||
835 | */ | ||
836 | #ifdef CONFIG_X86_32 | ||
837 | bust_spinlocks(1); | ||
838 | #else | ||
839 | flags = oops_begin(); | ||
840 | #endif | ||
841 | |||
842 | show_fault_oops(regs, error_code, address); | ||
843 | |||
844 | tsk->thread.cr2 = address; | ||
845 | tsk->thread.trap_no = 14; | ||
846 | tsk->thread.error_code = error_code; | ||
847 | |||
848 | #ifdef CONFIG_X86_32 | ||
849 | die("Oops", regs, error_code); | ||
850 | bust_spinlocks(0); | ||
851 | do_exit(SIGKILL); | ||
852 | #else | ||
853 | sig = SIGKILL; | ||
854 | if (__die("Oops", regs, error_code)) | ||
855 | sig = 0; | ||
856 | /* Executive summary in case the body of the oops scrolled away */ | ||
857 | printk(KERN_EMERG "CR2: %016lx\n", address); | ||
858 | oops_end(flags, regs, sig); | ||
859 | #endif | ||
860 | |||
861 | out_of_memory: | ||
862 | /* | ||
863 | * We ran out of memory, call the OOM killer, and return the userspace | ||
864 | * (which will retry the fault, or kill us if we got oom-killed). | ||
865 | */ | ||
866 | up_read(&mm->mmap_sem); | ||
867 | pagefault_out_of_memory(); | ||
868 | return; | ||
869 | |||
870 | do_sigbus: | ||
871 | up_read(&mm->mmap_sem); | ||
872 | |||
873 | /* Kernel mode? Handle exceptions or die */ | ||
874 | if (!(error_code & PF_USER)) | ||
875 | goto no_context; | ||
876 | #ifdef CONFIG_X86_32 | ||
877 | /* User space => ok to do another page fault */ | ||
878 | if (is_prefetch(regs, address, error_code)) | ||
879 | return; | ||
880 | #endif | ||
881 | tsk->thread.cr2 = address; | ||
882 | tsk->thread.error_code = error_code; | ||
883 | tsk->thread.trap_no = 14; | ||
884 | force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); | ||
885 | } | 965 | } |
886 | 966 | ||
887 | DEFINE_SPINLOCK(pgd_lock); | 967 | DEFINE_SPINLOCK(pgd_lock); |
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 88f1b10de3be..00263bf07a88 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,7 +49,6 @@ | |||
49 | #include <asm/paravirt.h> | 49 | #include <asm/paravirt.h> |
50 | #include <asm/setup.h> | 50 | #include <asm/setup.h> |
51 | #include <asm/cacheflush.h> | 51 | #include <asm/cacheflush.h> |
52 | #include <asm/smp.h> | ||
53 | 52 | ||
54 | unsigned int __VMALLOC_RESERVE = 128 << 20; | 53 | unsigned int __VMALLOC_RESERVE = 128 << 20; |
55 | 54 | ||
@@ -138,6 +137,47 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) | |||
138 | return pte_offset_kernel(pmd, 0); | 137 | return pte_offset_kernel(pmd, 0); |
139 | } | 138 | } |
140 | 139 | ||
140 | static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, | ||
141 | unsigned long vaddr, pte_t *lastpte) | ||
142 | { | ||
143 | #ifdef CONFIG_HIGHMEM | ||
144 | /* | ||
145 | * Something (early fixmap) may already have put a pte | ||
146 | * page here, which causes the page table allocation | ||
147 | * to become nonlinear. Attempt to fix it, and if it | ||
148 | * is still nonlinear then we have to bug. | ||
149 | */ | ||
150 | int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT; | ||
151 | int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT; | ||
152 | |||
153 | if (pmd_idx_kmap_begin != pmd_idx_kmap_end | ||
154 | && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin | ||
155 | && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end | ||
156 | && ((__pa(pte) >> PAGE_SHIFT) < table_start | ||
157 | || (__pa(pte) >> PAGE_SHIFT) >= table_end)) { | ||
158 | pte_t *newpte; | ||
159 | int i; | ||
160 | |||
161 | BUG_ON(after_init_bootmem); | ||
162 | newpte = alloc_low_page(); | ||
163 | for (i = 0; i < PTRS_PER_PTE; i++) | ||
164 | set_pte(newpte + i, pte[i]); | ||
165 | |||
166 | paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT); | ||
167 | set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE)); | ||
168 | BUG_ON(newpte != pte_offset_kernel(pmd, 0)); | ||
169 | __flush_tlb_all(); | ||
170 | |||
171 | paravirt_release_pte(__pa(pte) >> PAGE_SHIFT); | ||
172 | pte = newpte; | ||
173 | } | ||
174 | BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1) | ||
175 | && vaddr > fix_to_virt(FIX_KMAP_END) | ||
176 | && lastpte && lastpte + PTRS_PER_PTE != pte); | ||
177 | #endif | ||
178 | return pte; | ||
179 | } | ||
180 | |||
141 | /* | 181 | /* |
142 | * This function initializes a certain range of kernel virtual memory | 182 | * This function initializes a certain range of kernel virtual memory |
143 | * with new bootmem page tables, everywhere page tables are missing in | 183 | * with new bootmem page tables, everywhere page tables are missing in |
@@ -154,6 +194,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) | |||
154 | unsigned long vaddr; | 194 | unsigned long vaddr; |
155 | pgd_t *pgd; | 195 | pgd_t *pgd; |
156 | pmd_t *pmd; | 196 | pmd_t *pmd; |
197 | pte_t *pte = NULL; | ||
157 | 198 | ||
158 | vaddr = start; | 199 | vaddr = start; |
159 | pgd_idx = pgd_index(vaddr); | 200 | pgd_idx = pgd_index(vaddr); |
@@ -165,7 +206,8 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) | |||
165 | pmd = pmd + pmd_index(vaddr); | 206 | pmd = pmd + pmd_index(vaddr); |
166 | for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); | 207 | for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); |
167 | pmd++, pmd_idx++) { | 208 | pmd++, pmd_idx++) { |
168 | one_page_table_init(pmd); | 209 | pte = page_table_kmap_check(one_page_table_init(pmd), |
210 | pmd, vaddr, pte); | ||
169 | 211 | ||
170 | vaddr += PMD_SIZE; | 212 | vaddr += PMD_SIZE; |
171 | } | 213 | } |
@@ -508,7 +550,6 @@ static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base) | |||
508 | * Fixed mappings, only the page table structure has to be | 550 | * Fixed mappings, only the page table structure has to be |
509 | * created - mappings will be set by set_fixmap(): | 551 | * created - mappings will be set by set_fixmap(): |
510 | */ | 552 | */ |
511 | early_ioremap_clear(); | ||
512 | vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; | 553 | vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; |
513 | end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; | 554 | end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; |
514 | page_table_range_init(vaddr, end, pgd_base); | 555 | page_table_range_init(vaddr, end, pgd_base); |
@@ -801,7 +842,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse) | |||
801 | tables += PAGE_ALIGN(ptes * sizeof(pte_t)); | 842 | tables += PAGE_ALIGN(ptes * sizeof(pte_t)); |
802 | 843 | ||
803 | /* for fixmap */ | 844 | /* for fixmap */ |
804 | tables += PAGE_SIZE * 2; | 845 | tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t)); |
805 | 846 | ||
806 | /* | 847 | /* |
807 | * RED-PEN putting page tables only on node 0 could | 848 | * RED-PEN putting page tables only on node 0 could |
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 23f68e77ad1f..e6d36b490250 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -596,7 +596,7 @@ static void __init init_gbpages(void)
 		direct_gbpages = 0;
 }
 
-static unsigned long __init kernel_physical_mapping_init(unsigned long start,
+static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
 						unsigned long end,
 						unsigned long page_size_mask)
 {
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index d0151d8ce452..ca53224fc56c 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -17,6 +17,7 @@
  */
 
 #include <asm/iomap.h>
+#include <asm/pat.h>
 #include <linux/module.h>
 
 /* Map 'pfn' using fixed map 'type' and protections 'prot'
@@ -29,6 +30,15 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
 
 	pagefault_disable();
 
+	/*
+	 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
+	 * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
+	 * MTRR is UC or WC. UC_MINUS gets the real intention, of the
+	 * user, which is "WC if the MTRR is WC, UC if you can't do that."
+	 */
+	if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
+		prot = PAGE_KERNEL_UC_MINUS;
+
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
 	set_pte(kmap_pte-idx, pfn_pte(pfn, prot));
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index bd85d42819e1..1448bcb7f22f 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -367,7 +367,7 @@ EXPORT_SYMBOL(ioremap_nocache);
  *
  * Must be freed with iounmap.
  */
-void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
+void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
 	if (pat_enabled)
 		return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
@@ -557,34 +557,9 @@ void __init early_ioremap_init(void)
 	}
 }
 
-void __init early_ioremap_clear(void)
-{
-	pmd_t *pmd;
-
-	if (early_ioremap_debug)
-		printk(KERN_INFO "early_ioremap_clear()\n");
-
-	pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
-	pmd_clear(pmd);
-	paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT);
-	__flush_tlb_all();
-}
-
 void __init early_ioremap_reset(void)
 {
-	enum fixed_addresses idx;
-	unsigned long addr, phys;
-	pte_t *pte;
-
 	after_paging_init = 1;
-	for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
-		addr = fix_to_virt(idx);
-		pte = early_ioremap_pte(addr);
-		if (pte_present(*pte)) {
-			phys = pte_val(*pte) & PAGE_MASK;
-			set_fixmap(idx, phys);
-		}
-	}
 }
 
 static void __init __early_set_fixmap(enum fixed_addresses idx,
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 71a14f89f89e..08d140fbc31b 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -20,6 +20,12 @@ | |||
20 | #include <asm/acpi.h> | 20 | #include <asm/acpi.h> |
21 | #include <asm/k8.h> | 21 | #include <asm/k8.h> |
22 | 22 | ||
23 | #ifdef CONFIG_DEBUG_PER_CPU_MAPS | ||
24 | # define DBG(x...) printk(KERN_DEBUG x) | ||
25 | #else | ||
26 | # define DBG(x...) | ||
27 | #endif | ||
28 | |||
23 | struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; | 29 | struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; |
24 | EXPORT_SYMBOL(node_data); | 30 | EXPORT_SYMBOL(node_data); |
25 | 31 | ||
@@ -33,6 +39,21 @@ int numa_off __initdata; | |||
33 | static unsigned long __initdata nodemap_addr; | 39 | static unsigned long __initdata nodemap_addr; |
34 | static unsigned long __initdata nodemap_size; | 40 | static unsigned long __initdata nodemap_size; |
35 | 41 | ||
42 | DEFINE_PER_CPU(int, node_number) = 0; | ||
43 | EXPORT_PER_CPU_SYMBOL(node_number); | ||
44 | |||
45 | /* | ||
46 | * Map cpu index to node index | ||
47 | */ | ||
48 | DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); | ||
49 | EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); | ||
50 | |||
51 | /* | ||
52 | * Which logical CPUs are on which nodes | ||
53 | */ | ||
54 | cpumask_t *node_to_cpumask_map; | ||
55 | EXPORT_SYMBOL(node_to_cpumask_map); | ||
56 | |||
36 | /* | 57 | /* |
37 | * Given a shift value, try to populate memnodemap[] | 58 | * Given a shift value, try to populate memnodemap[] |
38 | * Returns : | 59 | * Returns : |
@@ -640,3 +661,199 @@ void __init init_cpu_to_node(void) | |||
640 | #endif | 661 | #endif |
641 | 662 | ||
642 | 663 | ||
664 | /* | ||
665 | * Allocate node_to_cpumask_map based on number of available nodes | ||
666 | * Requires node_possible_map to be valid. | ||
667 | * | ||
668 | * Note: node_to_cpumask() is not valid until after this is done. | ||
669 | * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) | ||
670 | */ | ||
671 | void __init setup_node_to_cpumask_map(void) | ||
672 | { | ||
673 | unsigned int node, num = 0; | ||
674 | cpumask_t *map; | ||
675 | |||
676 | /* setup nr_node_ids if not done yet */ | ||
677 | if (nr_node_ids == MAX_NUMNODES) { | ||
678 | for_each_node_mask(node, node_possible_map) | ||
679 | num = node; | ||
680 | nr_node_ids = num + 1; | ||
681 | } | ||
682 | |||
683 | /* allocate the map */ | ||
684 | map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); | ||
685 | DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids); | ||
686 | |||
687 | pr_debug("Node to cpumask map at %p for %d nodes\n", | ||
688 | map, nr_node_ids); | ||
689 | |||
690 | /* node_to_cpumask() will now work */ | ||
691 | node_to_cpumask_map = map; | ||
692 | } | ||
693 | |||
694 | void __cpuinit numa_set_node(int cpu, int node) | ||
695 | { | ||
696 | int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); | ||
697 | |||
698 | /* early setting, no percpu area yet */ | ||
699 | if (cpu_to_node_map) { | ||
700 | cpu_to_node_map[cpu] = node; | ||
701 | return; | ||
702 | } | ||
703 | |||
704 | #ifdef CONFIG_DEBUG_PER_CPU_MAPS | ||
705 | if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) { | ||
706 | printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); | ||
707 | dump_stack(); | ||
708 | return; | ||
709 | } | ||
710 | #endif | ||
711 | per_cpu(x86_cpu_to_node_map, cpu) = node; | ||
712 | |||
713 | if (node != NUMA_NO_NODE) | ||
714 | per_cpu(node_number, cpu) = node; | ||
715 | } | ||
716 | |||
717 | void __cpuinit numa_clear_node(int cpu) | ||
718 | { | ||
719 | numa_set_node(cpu, NUMA_NO_NODE); | ||
720 | } | ||
721 | |||
722 | #ifndef CONFIG_DEBUG_PER_CPU_MAPS | ||
723 | |||
724 | void __cpuinit numa_add_cpu(int cpu) | ||
725 | { | ||
726 | cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); | ||
727 | } | ||
728 | |||
729 | void __cpuinit numa_remove_cpu(int cpu) | ||
730 | { | ||
731 | cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); | ||
732 | } | ||
733 | |||
734 | #else /* CONFIG_DEBUG_PER_CPU_MAPS */ | ||
735 | |||
736 | /* | ||
737 | * --------- debug versions of the numa functions --------- | ||
738 | */ | ||
739 | static void __cpuinit numa_set_cpumask(int cpu, int enable) | ||
740 | { | ||
741 | int node = early_cpu_to_node(cpu); | ||
742 | cpumask_t *mask; | ||
743 | char buf[64]; | ||
744 | |||
745 | if (node_to_cpumask_map == NULL) { | ||
746 | printk(KERN_ERR "node_to_cpumask_map NULL\n"); | ||
747 | dump_stack(); | ||
748 | return; | ||
749 | } | ||
750 | |||
751 | mask = &node_to_cpumask_map[node]; | ||
752 | if (enable) | ||
753 | cpu_set(cpu, *mask); | ||
754 | else | ||
755 | cpu_clear(cpu, *mask); | ||
756 | |||
757 | cpulist_scnprintf(buf, sizeof(buf), mask); | ||
758 | printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", | ||
759 | enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); | ||
760 | } | ||
761 | |||
762 | void __cpuinit numa_add_cpu(int cpu) | ||
763 | { | ||
764 | numa_set_cpumask(cpu, 1); | ||
765 | } | ||
766 | |||
767 | void __cpuinit numa_remove_cpu(int cpu) | ||
768 | { | ||
769 | numa_set_cpumask(cpu, 0); | ||
770 | } | ||
771 | |||
772 | int cpu_to_node(int cpu) | ||
773 | { | ||
774 | if (early_per_cpu_ptr(x86_cpu_to_node_map)) { | ||
775 | printk(KERN_WARNING | ||
776 | "cpu_to_node(%d): usage too early!\n", cpu); | ||
777 | dump_stack(); | ||
778 | return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; | ||
779 | } | ||
780 | return per_cpu(x86_cpu_to_node_map, cpu); | ||
781 | } | ||
782 | EXPORT_SYMBOL(cpu_to_node); | ||
783 | |||
784 | /* | ||
785 | * Same function as cpu_to_node() but used if called before the | ||
786 | * per_cpu areas are setup. | ||
787 | */ | ||
788 | int early_cpu_to_node(int cpu) | ||
789 | { | ||
790 | if (early_per_cpu_ptr(x86_cpu_to_node_map)) | ||
791 | return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; | ||
792 | |||
793 | if (!per_cpu_offset(cpu)) { | ||
794 | printk(KERN_WARNING | ||
795 | "early_cpu_to_node(%d): no per_cpu area!\n", cpu); | ||
796 | dump_stack(); | ||
797 | return NUMA_NO_NODE; | ||
798 | } | ||
799 | return per_cpu(x86_cpu_to_node_map, cpu); | ||
800 | } | ||
801 | |||
802 | |||
803 | /* empty cpumask */ | ||
804 | static const cpumask_t cpu_mask_none; | ||
805 | |||
806 | /* | ||
807 | * Returns a pointer to the bitmask of CPUs on Node 'node'. | ||
808 | */ | ||
809 | const cpumask_t *cpumask_of_node(int node) | ||
810 | { | ||
811 | if (node_to_cpumask_map == NULL) { | ||
812 | printk(KERN_WARNING | ||
813 | "cpumask_of_node(%d): no node_to_cpumask_map!\n", | ||
814 | node); | ||
815 | dump_stack(); | ||
816 | return (const cpumask_t *)&cpu_online_map; | ||
817 | } | ||
818 | if (node >= nr_node_ids) { | ||
819 | printk(KERN_WARNING | ||
820 | "cpumask_of_node(%d): node > nr_node_ids(%d)\n", | ||
821 | node, nr_node_ids); | ||
822 | dump_stack(); | ||
823 | return &cpu_mask_none; | ||
824 | } | ||
825 | return &node_to_cpumask_map[node]; | ||
826 | } | ||
827 | EXPORT_SYMBOL(cpumask_of_node); | ||
828 | |||
829 | /* | ||
830 | * Returns a bitmask of CPUs on Node 'node'. | ||
831 | * | ||
832 | * Side note: this function creates the returned cpumask on the stack | ||
833 | * so with a high NR_CPUS count, excessive stack space is used. The | ||
834 | * node_to_cpumask_ptr function should be used whenever possible. | ||
835 | */ | ||
836 | cpumask_t node_to_cpumask(int node) | ||
837 | { | ||
838 | if (node_to_cpumask_map == NULL) { | ||
839 | printk(KERN_WARNING | ||
840 | "node_to_cpumask(%d): no node_to_cpumask_map!\n", node); | ||
841 | dump_stack(); | ||
842 | return cpu_online_map; | ||
843 | } | ||
844 | if (node >= nr_node_ids) { | ||
845 | printk(KERN_WARNING | ||
846 | "node_to_cpumask(%d): node > nr_node_ids(%d)\n", | ||
847 | node, nr_node_ids); | ||
848 | dump_stack(); | ||
849 | return cpu_mask_none; | ||
850 | } | ||
851 | return node_to_cpumask_map[node]; | ||
852 | } | ||
853 | EXPORT_SYMBOL(node_to_cpumask); | ||
854 | |||
855 | /* | ||
856 | * --------- end of debug versions of the numa functions --------- | ||
857 | */ | ||
858 | |||
859 | #endif /* CONFIG_DEBUG_PER_CPU_MAPS */ | ||
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e89d24815f26..84ba74820ad6 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -534,6 +534,36 @@ out_unlock: | |||
534 | return 0; | 534 | return 0; |
535 | } | 535 | } |
536 | 536 | ||
537 | static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, | ||
538 | int primary) | ||
539 | { | ||
540 | /* | ||
541 | * Ignore all non primary paths. | ||
542 | */ | ||
543 | if (!primary) | ||
544 | return 0; | ||
545 | |||
546 | /* | ||
547 | * Ignore the NULL PTE for kernel identity mapping, as it is expected | ||
548 | * to have holes. | ||
549 | * Also set numpages to '1' indicating that we processed cpa req for | ||
550 | * one virtual address page and its pfn. TBD: numpages can be set based | ||
551 | * on the initial value and the level returned by lookup_address(). | ||
552 | */ | ||
553 | if (within(vaddr, PAGE_OFFSET, | ||
554 | PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { | ||
555 | cpa->numpages = 1; | ||
556 | cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; | ||
557 | return 0; | ||
558 | } else { | ||
559 | WARN(1, KERN_WARNING "CPA: called for zero pte. " | ||
560 | "vaddr = %lx cpa->vaddr = %lx\n", vaddr, | ||
561 | *cpa->vaddr); | ||
562 | |||
563 | return -EFAULT; | ||
564 | } | ||
565 | } | ||
566 | |||
537 | static int __change_page_attr(struct cpa_data *cpa, int primary) | 567 | static int __change_page_attr(struct cpa_data *cpa, int primary) |
538 | { | 568 | { |
539 | unsigned long address; | 569 | unsigned long address; |
@@ -549,17 +579,11 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) | |||
549 | repeat: | 579 | repeat: |
550 | kpte = lookup_address(address, &level); | 580 | kpte = lookup_address(address, &level); |
551 | if (!kpte) | 581 | if (!kpte) |
552 | return 0; | 582 | return __cpa_process_fault(cpa, address, primary); |
553 | 583 | ||
554 | old_pte = *kpte; | 584 | old_pte = *kpte; |
555 | if (!pte_val(old_pte)) { | 585 | if (!pte_val(old_pte)) |
556 | if (!primary) | 586 | return __cpa_process_fault(cpa, address, primary); |
557 | return 0; | ||
558 | WARN(1, KERN_WARNING "CPA: called for zero pte. " | ||
559 | "vaddr = %lx cpa->vaddr = %lx\n", address, | ||
560 | *cpa->vaddr); | ||
561 | return -EINVAL; | ||
562 | } | ||
563 | 587 | ||
564 | if (level == PG_LEVEL_4K) { | 588 | if (level == PG_LEVEL_4K) { |
565 | pte_t new_pte; | 589 | pte_t new_pte; |
@@ -657,12 +681,7 @@ static int cpa_process_alias(struct cpa_data *cpa) | |||
657 | vaddr = *cpa->vaddr; | 681 | vaddr = *cpa->vaddr; |
658 | 682 | ||
659 | if (!(within(vaddr, PAGE_OFFSET, | 683 | if (!(within(vaddr, PAGE_OFFSET, |
660 | PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT)) | 684 | PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { |
661 | #ifdef CONFIG_X86_64 | ||
662 | || within(vaddr, PAGE_OFFSET + (1UL<<32), | ||
663 | PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)) | ||
664 | #endif | ||
665 | )) { | ||
666 | 685 | ||
667 | alias_cpa = *cpa; | 686 | alias_cpa = *cpa; |
668 | temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); | 687 | temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); |
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 8b08fb955274..9127e31c7268 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -30,7 +30,7 @@ | |||
30 | #ifdef CONFIG_X86_PAT | 30 | #ifdef CONFIG_X86_PAT |
31 | int __read_mostly pat_enabled = 1; | 31 | int __read_mostly pat_enabled = 1; |
32 | 32 | ||
33 | void __cpuinit pat_disable(char *reason) | 33 | void __cpuinit pat_disable(const char *reason) |
34 | { | 34 | { |
35 | pat_enabled = 0; | 35 | pat_enabled = 0; |
36 | printk(KERN_INFO "%s\n", reason); | 36 | printk(KERN_INFO "%s\n", reason); |
@@ -42,6 +42,11 @@ static int __init nopat(char *str) | |||
42 | return 0; | 42 | return 0; |
43 | } | 43 | } |
44 | early_param("nopat", nopat); | 44 | early_param("nopat", nopat); |
45 | #else | ||
46 | static inline void pat_disable(const char *reason) | ||
47 | { | ||
48 | (void)reason; | ||
49 | } | ||
45 | #endif | 50 | #endif |
46 | 51 | ||
47 | 52 | ||
@@ -78,16 +83,20 @@ void pat_init(void) | |||
78 | if (!pat_enabled) | 83 | if (!pat_enabled) |
79 | return; | 84 | return; |
80 | 85 | ||
81 | /* Paranoia check. */ | 86 | if (!cpu_has_pat) { |
82 | if (!cpu_has_pat && boot_pat_state) { | 87 | if (!boot_pat_state) { |
83 | /* | 88 | pat_disable("PAT not supported by CPU."); |
84 | * If this happens we are on a secondary CPU, but | 89 | return; |
85 | * switched to PAT on the boot CPU. We have no way to | 90 | } else { |
86 | * undo PAT. | 91 | /* |
87 | */ | 92 | * If this happens we are on a secondary CPU, but |
88 | printk(KERN_ERR "PAT enabled, " | 93 | * switched to PAT on the boot CPU. We have no way to |
89 | "but not supported by secondary CPU\n"); | 94 | * undo PAT. |
90 | BUG(); | 95 | */ |
96 | printk(KERN_ERR "PAT enabled, " | ||
97 | "but not supported by secondary CPU\n"); | ||
98 | BUG(); | ||
99 | } | ||
91 | } | 100 | } |
92 | 101 | ||
93 | /* Set PWT to Write-Combining. All other bits stay the same */ | 102 | /* Set PWT to Write-Combining. All other bits stay the same */ |
@@ -333,11 +342,23 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, | |||
333 | req_type & _PAGE_CACHE_MASK); | 342 | req_type & _PAGE_CACHE_MASK); |
334 | } | 343 | } |
335 | 344 | ||
336 | is_range_ram = pagerange_is_ram(start, end); | 345 | if (new_type) |
337 | if (is_range_ram == 1) | 346 | *new_type = actual_type; |
338 | return reserve_ram_pages_type(start, end, req_type, new_type); | 347 | |
339 | else if (is_range_ram < 0) | 348 | /* |
340 | return -EINVAL; | 349 | * For legacy reasons, some parts of the physical address range in the |
350 | * legacy 1MB region is treated as non-RAM (even when listed as RAM in | ||
351 | * the e820 tables). So we will track the memory attributes of this | ||
352 | * legacy 1MB region using the linear memtype_list always. | ||
353 | */ | ||
354 | if (end >= ISA_END_ADDRESS) { | ||
355 | is_range_ram = pagerange_is_ram(start, end); | ||
356 | if (is_range_ram == 1) | ||
357 | return reserve_ram_pages_type(start, end, req_type, | ||
358 | new_type); | ||
359 | else if (is_range_ram < 0) | ||
360 | return -EINVAL; | ||
361 | } | ||
341 | 362 | ||
342 | new = kmalloc(sizeof(struct memtype), GFP_KERNEL); | 363 | new = kmalloc(sizeof(struct memtype), GFP_KERNEL); |
343 | if (!new) | 364 | if (!new) |
@@ -347,9 +368,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, | |||
347 | new->end = end; | 368 | new->end = end; |
348 | new->type = actual_type; | 369 | new->type = actual_type; |
349 | 370 | ||
350 | if (new_type) | ||
351 | *new_type = actual_type; | ||
352 | |||
353 | spin_lock(&memtype_lock); | 371 | spin_lock(&memtype_lock); |
354 | 372 | ||
355 | if (cached_entry && start >= cached_start) | 373 | if (cached_entry && start >= cached_start) |
@@ -437,11 +455,19 @@ int free_memtype(u64 start, u64 end) | |||
437 | if (is_ISA_range(start, end - 1)) | 455 | if (is_ISA_range(start, end - 1)) |
438 | return 0; | 456 | return 0; |
439 | 457 | ||
440 | is_range_ram = pagerange_is_ram(start, end); | 458 | /* |
441 | if (is_range_ram == 1) | 459 | * For legacy reasons, some parts of the physical address range in the |
442 | return free_ram_pages_type(start, end); | 460 | * legacy 1MB region is treated as non-RAM (even when listed as RAM in |
443 | else if (is_range_ram < 0) | 461 | * the e820 tables). So we will track the memory attributes of this |
444 | return -EINVAL; | 462 | * legacy 1MB region using the linear memtype_list always. |
463 | */ | ||
464 | if (end >= ISA_END_ADDRESS) { | ||
465 | is_range_ram = pagerange_is_ram(start, end); | ||
466 | if (is_range_ram == 1) | ||
467 | return free_ram_pages_type(start, end); | ||
468 | else if (is_range_ram < 0) | ||
469 | return -EINVAL; | ||
470 | } | ||
445 | 471 | ||
446 | spin_lock(&memtype_lock); | 472 | spin_lock(&memtype_lock); |
447 | list_for_each_entry(entry, &memtype_list, nd) { | 473 | list_for_each_entry(entry, &memtype_list, nd) { |
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 09737c8af074..15df1baee100 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -21,6 +21,7 @@
 #include <asm/numa.h>
 #include <asm/e820.h>
 #include <asm/genapic.h>
+#include <asm/uv/uv.h>
 
 int acpi_numa __initdata;
 
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
new file mode 100644
index 000000000000..72a6d4ebe34d
--- /dev/null
+++ b/arch/x86/mm/tlb.c
@@ -0,0 +1,296 @@ | |||
1 | #include <linux/init.h> | ||
2 | |||
3 | #include <linux/mm.h> | ||
4 | #include <linux/spinlock.h> | ||
5 | #include <linux/smp.h> | ||
6 | #include <linux/interrupt.h> | ||
7 | #include <linux/module.h> | ||
8 | |||
9 | #include <asm/tlbflush.h> | ||
10 | #include <asm/mmu_context.h> | ||
11 | #include <asm/apic.h> | ||
12 | #include <asm/uv/uv.h> | ||
13 | |||
14 | DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) | ||
15 | = { &init_mm, 0, }; | ||
16 | |||
17 | #include <mach_ipi.h> | ||
18 | /* | ||
19 | * Smarter SMP flushing macros. | ||
20 | * c/o Linus Torvalds. | ||
21 | * | ||
22 | * These mean you can really definitely utterly forget about | ||
23 | * writing to user space from interrupts. (Its not allowed anyway). | ||
24 | * | ||
25 | * Optimizations Manfred Spraul <manfred@colorfullife.com> | ||
26 | * | ||
27 | * More scalable flush, from Andi Kleen | ||
28 | * | ||
29 | * To avoid global state use 8 different call vectors. | ||
30 | * Each CPU uses a specific vector to trigger flushes on other | ||
31 | * CPUs. Depending on the received vector the target CPUs look into | ||
32 | * the right array slot for the flush data. | ||
33 | * | ||
34 | * With more than 8 CPUs they are hashed to the 8 available | ||
35 | * vectors. The limited global vector space forces us to this right now. | ||
36 | * In future when interrupts are split into per CPU domains this could be | ||
37 | * fixed, at the cost of triggering multiple IPIs in some cases. | ||
38 | */ | ||
39 | |||
40 | union smp_flush_state { | ||
41 | struct { | ||
42 | struct mm_struct *flush_mm; | ||
43 | unsigned long flush_va; | ||
44 | spinlock_t tlbstate_lock; | ||
45 | DECLARE_BITMAP(flush_cpumask, NR_CPUS); | ||
46 | }; | ||
47 | char pad[CONFIG_X86_INTERNODE_CACHE_BYTES]; | ||
48 | } ____cacheline_internodealigned_in_smp; | ||
49 | |||
50 | /* State is put into the per CPU data section, but padded | ||
51 | to a full cache line because other CPUs can access it and we don't | ||
52 | want false sharing in the per cpu data segment. */ | ||
53 | static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; | ||
54 | |||
55 | /* | ||
56 | * We cannot call mmdrop() because we are in interrupt context, | ||
57 | * instead update mm->cpu_vm_mask. | ||
58 | */ | ||
59 | void leave_mm(int cpu) | ||
60 | { | ||
61 | if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) | ||
62 | BUG(); | ||
63 | cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask); | ||
64 | load_cr3(swapper_pg_dir); | ||
65 | } | ||
66 | EXPORT_SYMBOL_GPL(leave_mm); | ||
67 | |||
68 | /* | ||
69 | * | ||
70 | * The flush IPI assumes that a thread switch happens in this order: | ||
71 | * [cpu0: the cpu that switches] | ||
72 | * 1) switch_mm() either 1a) or 1b) | ||
73 | * 1a) thread switch to a different mm | ||
74 | * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); | ||
75 | * Stop ipi delivery for the old mm. This is not synchronized with | ||
76 | * the other cpus, but smp_invalidate_interrupt ignore flush ipis | ||
77 | * for the wrong mm, and in the worst case we perform a superfluous | ||
78 | * tlb flush. | ||
79 | * 1a2) set cpu mmu_state to TLBSTATE_OK | ||
80 | * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 | ||
81 | * was in lazy tlb mode. | ||
82 | * 1a3) update cpu active_mm | ||
83 | * Now cpu0 accepts tlb flushes for the new mm. | ||
84 | * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); | ||
85 | * Now the other cpus will send tlb flush ipis. | ||
86 | * 1a4) change cr3. | ||
87 | * 1b) thread switch without mm change | ||
88 | * cpu active_mm is correct, cpu0 already handles | ||
89 | * flush ipis. | ||
90 | * 1b1) set cpu mmu_state to TLBSTATE_OK | ||
91 | * 1b2) test_and_set the cpu bit in cpu_vm_mask. | ||
92 | * Atomically set the bit [other cpus will start sending flush ipis], | ||
93 | * and test the bit. | ||
94 | * 1b3) if the bit was 0: leave_mm was called, flush the tlb. | ||
95 | * 2) switch %%esp, ie current | ||
96 | * | ||
97 | * The interrupt must handle 2 special cases: | ||
98 | * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. | ||
99 | * - the cpu performs speculative tlb reads, i.e. even if the cpu only | ||
100 | * runs in kernel space, the cpu could load tlb entries for user space | ||
101 | * pages. | ||
102 | * | ||
103 | * The good news is that cpu mmu_state is local to each cpu, no | ||
104 | * write/read ordering problems. | ||
105 | */ | ||
106 | |||
107 | /* | ||
108 | * TLB flush IPI: | ||
109 | * | ||
110 | * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. | ||
111 | * 2) Leave the mm if we are in the lazy tlb mode. | ||
112 | * | ||
113 | * Interrupts are disabled. | ||
114 | */ | ||
115 | |||
116 | /* | ||
117 | * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop | ||
118 | * but still used for documentation purpose but the usage is slightly | ||
119 | * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt | ||
120 | * entry calls in with the first parameter in %eax. Maybe define | ||
121 | * intrlinkage? | ||
122 | */ | ||
123 | #ifdef CONFIG_X86_64 | ||
124 | asmlinkage | ||
125 | #endif | ||
126 | void smp_invalidate_interrupt(struct pt_regs *regs) | ||
127 | { | ||
128 | unsigned int cpu; | ||
129 | unsigned int sender; | ||
130 | union smp_flush_state *f; | ||
131 | |||
132 | cpu = smp_processor_id(); | ||
133 | /* | ||
134 | * orig_rax contains the negated interrupt vector. | ||
135 | * Use that to determine where the sender put the data. | ||
136 | */ | ||
137 | sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; | ||
138 | f = &flush_state[sender]; | ||
139 | |||
140 | if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask))) | ||
141 | goto out; | ||
142 | /* | ||
143 | * This was a BUG() but until someone can quote me the | ||
144 | * line from the intel manual that guarantees an IPI to | ||
145 | * multiple CPUs is retried _only_ on the erroring CPUs | ||
146 | * its staying as a return | ||
147 | * | ||
148 | * BUG(); | ||
149 | */ | ||
150 | |||
151 | if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) { | ||
152 | if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { | ||
153 | if (f->flush_va == TLB_FLUSH_ALL) | ||
154 | local_flush_tlb(); | ||
155 | else | ||
156 | __flush_tlb_one(f->flush_va); | ||
157 | } else | ||
158 | leave_mm(cpu); | ||
159 | } | ||
160 | out: | ||
161 | ack_APIC_irq(); | ||
162 | smp_mb__before_clear_bit(); | ||
163 | cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask)); | ||
164 | smp_mb__after_clear_bit(); | ||
165 | inc_irq_stat(irq_tlb_count); | ||
166 | } | ||
167 | |||
168 | static void flush_tlb_others_ipi(const struct cpumask *cpumask, | ||
169 | struct mm_struct *mm, unsigned long va) | ||
170 | { | ||
171 | unsigned int sender; | ||
172 | union smp_flush_state *f; | ||
173 | |||
174 | /* Caller has disabled preemption */ | ||
175 | sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; | ||
176 | f = &flush_state[sender]; | ||
177 | |||
178 | /* | ||
179 | * Could avoid this lock when | ||
180 | * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is | ||
181 | * probably not worth checking this for a cache-hot lock. | ||
182 | */ | ||
183 | spin_lock(&f->tlbstate_lock); | ||
184 | |||
185 | f->flush_mm = mm; | ||
186 | f->flush_va = va; | ||
187 | cpumask_andnot(to_cpumask(f->flush_cpumask), | ||
188 | cpumask, cpumask_of(smp_processor_id())); | ||
189 | |||
190 | /* | ||
191 | * Make the above memory operations globally visible before | ||
192 | * sending the IPI. | ||
193 | */ | ||
194 | smp_mb(); | ||
195 | /* | ||
196 | * We have to send the IPI only to | ||
197 | * CPUs affected. | ||
198 | */ | ||
199 | send_IPI_mask(to_cpumask(f->flush_cpumask), | ||
200 | INVALIDATE_TLB_VECTOR_START + sender); | ||
201 | |||
202 | while (!cpumask_empty(to_cpumask(f->flush_cpumask))) | ||
203 | cpu_relax(); | ||
204 | |||
205 | f->flush_mm = NULL; | ||
206 | f->flush_va = 0; | ||
207 | spin_unlock(&f->tlbstate_lock); | ||
208 | } | ||
209 | |||
210 | void native_flush_tlb_others(const struct cpumask *cpumask, | ||
211 | struct mm_struct *mm, unsigned long va) | ||
212 | { | ||
213 | if (is_uv_system()) { | ||
214 | unsigned int cpu; | ||
215 | |||
216 | cpu = get_cpu(); | ||
217 | cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); | ||
218 | if (cpumask) | ||
219 | flush_tlb_others_ipi(cpumask, mm, va); | ||
220 | put_cpu(); | ||
221 | return; | ||
222 | } | ||
223 | flush_tlb_others_ipi(cpumask, mm, va); | ||
224 | } | ||
225 | |||
226 | static int __cpuinit init_smp_flush(void) | ||
227 | { | ||
228 | int i; | ||
229 | |||
230 | for (i = 0; i < ARRAY_SIZE(flush_state); i++) | ||
231 | spin_lock_init(&flush_state[i].tlbstate_lock); | ||
232 | |||
233 | return 0; | ||
234 | } | ||
235 | core_initcall(init_smp_flush); | ||
236 | |||
237 | void flush_tlb_current_task(void) | ||
238 | { | ||
239 | struct mm_struct *mm = current->mm; | ||
240 | |||
241 | preempt_disable(); | ||
242 | |||
243 | local_flush_tlb(); | ||
244 | if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) | ||
245 | flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); | ||
246 | preempt_enable(); | ||
247 | } | ||
248 | |||
249 | void flush_tlb_mm(struct mm_struct *mm) | ||
250 | { | ||
251 | preempt_disable(); | ||
252 | |||
253 | if (current->active_mm == mm) { | ||
254 | if (current->mm) | ||
255 | local_flush_tlb(); | ||
256 | else | ||
257 | leave_mm(smp_processor_id()); | ||
258 | } | ||
259 | if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) | ||
260 | flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); | ||
261 | |||
262 | preempt_enable(); | ||
263 | } | ||
264 | |||
265 | void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) | ||
266 | { | ||
267 | struct mm_struct *mm = vma->vm_mm; | ||
268 | |||
269 | preempt_disable(); | ||
270 | |||
271 | if (current->active_mm == mm) { | ||
272 | if (current->mm) | ||
273 | __flush_tlb_one(va); | ||
274 | else | ||
275 | leave_mm(smp_processor_id()); | ||
276 | } | ||
277 | |||
278 | if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) | ||
279 | flush_tlb_others(&mm->cpu_vm_mask, mm, va); | ||
280 | |||
281 | preempt_enable(); | ||
282 | } | ||
283 | |||
284 | static void do_flush_tlb_all(void *info) | ||
285 | { | ||
286 | unsigned long cpu = smp_processor_id(); | ||
287 | |||
288 | __flush_tlb_all(); | ||
289 | if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) | ||
290 | leave_mm(cpu); | ||
291 | } | ||
292 | |||
293 | void flush_tlb_all(void) | ||
294 | { | ||
295 | on_each_cpu(do_flush_tlb_all, NULL, 1); | ||
296 | } | ||