Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/Makefile    |    2
-rw-r--r--  arch/x86/mm/extable.c   |    6
-rw-r--r--  arch/x86/mm/fault.c     |    7
-rw-r--r--  arch/x86/mm/init_32.c   |   49
-rw-r--r--  arch/x86/mm/init_64.c   |    2
-rw-r--r--  arch/x86/mm/iomap_32.c  |   10
-rw-r--r--  arch/x86/mm/ioremap.c   |   27
-rw-r--r--  arch/x86/mm/numa_64.c   |  217
-rw-r--r--  arch/x86/mm/pageattr.c  |   49
-rw-r--r--  arch/x86/mm/pat.c       |   74
-rw-r--r--  arch/x86/mm/srat_64.c   |    1
-rw-r--r--  arch/x86/mm/tlb.c       |  296
12 files changed, 670 insertions, 70 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index d8cc96a2738f..9f05157220f5 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,6 +1,8 @@
 obj-y	:=  init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
	    pat.o pgtable.o gup.o
 
+obj-$(CONFIG_X86_SMP)		+= tlb.o
+
 obj-$(CONFIG_X86_32)		+= pgtable_32.o iomap_32.o
 
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 7e8db53528a7..61b41ca3b5a2 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -23,6 +23,12 @@ int fixup_exception(struct pt_regs *regs)
 
 	fixup = search_exception_tables(regs->ip);
 	if (fixup) {
+		/* If fixup is less than 16, it means uaccess error */
+		if (fixup->fixup < 16) {
+			current_thread_info()->uaccess_err = -EFAULT;
+			regs->ip += fixup->fixup;
+			return 1;
+		}
 		regs->ip = fixup->fixup;
 		return 1;
 	}
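Note on the hunk above: it treats exception-table fixup values below 16 as uaccess error markers (a small instruction skip plus -EFAULT) rather than as resume addresses. Below is a minimal userspace sketch of that dual-use table idea; all names, addresses, and the threshold convention are illustrative, not the kernel's.

/* sketch: a lookup table whose "fixup" field is either a small relative
 * adjustment (signalling an error to the caller) or an absolute resume
 * address, distinguished by a threshold -- illustrative only. */
#include <stdio.h>

struct fixup_entry {
	unsigned long fault_ip;	/* instruction that may fault */
	unsigned long fixup;	/* < 16: relative skip + error, else resume address */
};

static const struct fixup_entry table[] = {
	{ 0x1000, 4 },		/* uaccess-style: skip 4 bytes, report -EFAULT */
	{ 0x2000, 0x2100 },	/* classic: jump to recovery code at 0x2100 */
};

static int handle_fault(unsigned long ip, unsigned long *new_ip, int *err)
{
	for (unsigned int i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
		if (table[i].fault_ip != ip)
			continue;
		if (table[i].fixup < 16) {	/* encoded error case */
			*err = -14;		/* -EFAULT */
			*new_ip = ip + table[i].fixup;
		} else {			/* encoded resume address */
			*new_ip = table[i].fixup;
		}
		return 1;
	}
	return 0;
}

int main(void)
{
	unsigned long ip;
	int err = 0;

	if (handle_fault(0x1000, &ip, &err))
		printf("resume at %#lx, err=%d\n", ip, err);
	return 0;
}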
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8f4b859a04b3..d3eee74f830a 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -26,6 +26,7 @@
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/kdebug.h>
+#include <linux/magic.h>
 
 #include <asm/system.h>
 #include <asm/desc.h>
@@ -432,6 +433,8 @@ static noinline void no_context(struct pt_regs *regs,
 		      unsigned long error_code, unsigned long address)
 {
 	struct task_struct *tsk = current;
+	unsigned long *stackend;
+
 #ifdef CONFIG_X86_64
 	unsigned long flags;
 	int sig;
@@ -468,6 +471,10 @@ static noinline void no_context(struct pt_regs *regs,
 
 	show_fault_oops(regs, error_code, address);
 
+	stackend = end_of_stack(tsk);
+	if (*stackend != STACK_END_MAGIC)
+		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
+
 	tsk->thread.cr2 = address;
 	tsk->thread.trap_no = 14;
 	tsk->thread.error_code = error_code;
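The new STACK_END_MAGIC test above is a canary check: a known word is planted at the far end of the task stack, and the oops path reports corruption if it has been overwritten. A standalone sketch of the same idea, with made-up names and a toy "stack":

/* sketch: place a magic word at the end of a region used as a stack and
 * detect overruns by checking it -- illustrative only. */
#include <stdio.h>
#include <string.h>

#define MY_STACK_END_MAGIC 0x57AC6E9DUL
#define STACK_WORDS 64

static unsigned long stack_area[STACK_WORDS];

static void stack_init(void)
{
	stack_area[0] = MY_STACK_END_MAGIC;	/* canary at the lowest usable word */
}

static int stack_overran(void)
{
	return stack_area[0] != MY_STACK_END_MAGIC;
}

int main(void)
{
	stack_init();

	/* simulate a routine that writes too deep and clobbers the canary */
	memset(stack_area, 0, 8 * sizeof(unsigned long));

	if (stack_overran())
		printf("Thread overran stack, or stack corrupted\n");
	return 0;
}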
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 88f1b10de3be..00263bf07a88 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,7 +49,6 @@
 #include <asm/paravirt.h>
 #include <asm/setup.h>
 #include <asm/cacheflush.h>
-#include <asm/smp.h>
 
 unsigned int __VMALLOC_RESERVE = 128 << 20;
 
@@ -138,6 +137,47 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 	return pte_offset_kernel(pmd, 0);
 }
 
+static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
+					   unsigned long vaddr, pte_t *lastpte)
+{
+#ifdef CONFIG_HIGHMEM
+	/*
+	 * Something (early fixmap) may already have put a pte
+	 * page here, which causes the page table allocation
+	 * to become nonlinear. Attempt to fix it, and if it
+	 * is still nonlinear then we have to bug.
+	 */
+	int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+	int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+
+	if (pmd_idx_kmap_begin != pmd_idx_kmap_end
+	    && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
+	    && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
+	    && ((__pa(pte) >> PAGE_SHIFT) < table_start
+		|| (__pa(pte) >> PAGE_SHIFT) >= table_end)) {
+		pte_t *newpte;
+		int i;
+
+		BUG_ON(after_init_bootmem);
+		newpte = alloc_low_page();
+		for (i = 0; i < PTRS_PER_PTE; i++)
+			set_pte(newpte + i, pte[i]);
+
+		paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
+		set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
+		BUG_ON(newpte != pte_offset_kernel(pmd, 0));
+		__flush_tlb_all();
+
+		paravirt_release_pte(__pa(pte) >> PAGE_SHIFT);
+		pte = newpte;
+	}
+	BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1)
+	       && vaddr > fix_to_virt(FIX_KMAP_END)
+	       && lastpte && lastpte + PTRS_PER_PTE != pte);
+#endif
+	return pte;
+}
+
 /*
  * This function initializes a certain range of kernel virtual memory
  * with new bootmem page tables, everywhere page tables are missing in
@@ -154,6 +194,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
 	unsigned long vaddr;
 	pgd_t *pgd;
 	pmd_t *pmd;
+	pte_t *pte = NULL;
 
 	vaddr = start;
 	pgd_idx = pgd_index(vaddr);
@@ -165,7 +206,8 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
 		pmd = pmd + pmd_index(vaddr);
 		for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
 						pmd++, pmd_idx++) {
-			one_page_table_init(pmd);
+			pte = page_table_kmap_check(one_page_table_init(pmd),
+						    pmd, vaddr, pte);
 
 			vaddr += PMD_SIZE;
 		}
@@ -508,7 +550,6 @@ static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
 	 * Fixed mappings, only the page table structure has to be
 	 * created - mappings will be set by set_fixmap():
 	 */
-	early_ioremap_clear();
 	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
 	end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
 	page_table_range_init(vaddr, end, pgd_base);
@@ -801,7 +842,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse)
 	tables += PAGE_ALIGN(ptes * sizeof(pte_t));
 
 	/* for fixmap */
-	tables += PAGE_SIZE * 2;
+	tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t));
 
 	/*
 	 * RED-PEN putting page tables only on node 0 could
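The last hunk above sizes the early fixmap page-table reservation from the actual number of fixed addresses instead of a hard-coded two pages. A small arithmetic sketch of that computation, with invented constants standing in for the kernel's values:

/* sketch: reserve enough pte space for N fixed-map slots, rounded up to
 * whole pages -- the numbers are illustrative, not the kernel's. */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long pte_size = 8;		/* pretend sizeof(pte_t) with PAE */
	unsigned long nr_fixed_addresses = 1024;	/* pretend __end_of_fixed_addresses */

	unsigned long old_reservation = PAGE_SIZE * 2;
	unsigned long new_reservation =
		PAGE_ALIGN(nr_fixed_addresses * pte_size);

	printf("old: %lu bytes, new: %lu bytes\n",
	       old_reservation, new_reservation);
	return 0;
}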
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 23f68e77ad1f..e6d36b490250 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -596,7 +596,7 @@ static void __init init_gbpages(void)
 		direct_gbpages = 0;
 }
 
-static unsigned long __init kernel_physical_mapping_init(unsigned long start,
+static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
 						unsigned long end,
 						unsigned long page_size_mask)
 {
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index d0151d8ce452..ca53224fc56c 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -17,6 +17,7 @@
  */
 
 #include <asm/iomap.h>
+#include <asm/pat.h>
 #include <linux/module.h>
 
 /* Map 'pfn' using fixed map 'type' and protections 'prot'
@@ -29,6 +30,15 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
 
 	pagefault_disable();
 
+	/*
+	 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
+	 * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
+	 * MTRR is UC or WC.  UC_MINUS gets the real intention, of the
+	 * user, which is "WC if the MTRR is WC, UC if you can't do that."
+	 */
+	if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
+		prot = PAGE_KERNEL_UC_MINUS;
+
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
 	set_pte(kmap_pte-idx, pfn_pte(pfn, prot));
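The added check above falls back from write-combining to UC_MINUS when PAT is not available, leaving the final type to the MTRRs. A condensed userspace sketch of that selection logic; the enum and flag are stand-ins, not the kernel's types:

/* sketch: pick an effective memory type from a requested one, depending on
 * whether PAT is available -- illustrative, not the kernel's pgprot types. */
#include <stdio.h>

enum mem_type { MT_WB, MT_WC, MT_UC_MINUS, MT_UC };

static enum mem_type effective_type(int pat_enabled, enum mem_type requested)
{
	/* without PAT, WC cannot be expressed in the PTE; fall back to UC-
	 * and let the MTRR decide whether the range really is WC */
	if (!pat_enabled && requested == MT_WC)
		return MT_UC_MINUS;
	return requested;
}

int main(void)
{
	printf("PAT on : WC -> %d\n", effective_type(1, MT_WC));
	printf("PAT off: WC -> %d\n", effective_type(0, MT_WC));
	return 0;
}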
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index bd85d42819e1..1448bcb7f22f 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -367,7 +367,7 @@ EXPORT_SYMBOL(ioremap_nocache);
  *
  * Must be freed with iounmap.
  */
-void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
+void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
 	if (pat_enabled)
 		return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
@@ -557,34 +557,9 @@ void __init early_ioremap_init(void)
 	}
 }
 
-void __init early_ioremap_clear(void)
-{
-	pmd_t *pmd;
-
-	if (early_ioremap_debug)
-		printk(KERN_INFO "early_ioremap_clear()\n");
-
-	pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
-	pmd_clear(pmd);
-	paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT);
-	__flush_tlb_all();
-}
-
 void __init early_ioremap_reset(void)
 {
-	enum fixed_addresses idx;
-	unsigned long addr, phys;
-	pte_t *pte;
-
 	after_paging_init = 1;
-	for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
-		addr = fix_to_virt(idx);
-		pte = early_ioremap_pte(addr);
-		if (pte_present(*pte)) {
-			phys = pte_val(*pte) & PAGE_MASK;
-			set_fixmap(idx, phys);
-		}
-	}
 }
 
 static void __init __early_set_fixmap(enum fixed_addresses idx,
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 71a14f89f89e..08d140fbc31b 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -20,6 +20,12 @@
 #include <asm/acpi.h>
 #include <asm/k8.h>
 
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+# define DBG(x...) printk(KERN_DEBUG x)
+#else
+# define DBG(x...)
+#endif
+
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
@@ -33,6 +39,21 @@ int numa_off __initdata;
 static unsigned long __initdata nodemap_addr;
 static unsigned long __initdata nodemap_size;
 
+DEFINE_PER_CPU(int, node_number) = 0;
+EXPORT_PER_CPU_SYMBOL(node_number);
+
+/*
+ * Map cpu index to node index
+ */
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+
+/*
+ * Which logical CPUs are on which nodes
+ */
+cpumask_t *node_to_cpumask_map;
+EXPORT_SYMBOL(node_to_cpumask_map);
+
 /*
  * Given a shift value, try to populate memnodemap[]
  * Returns :
@@ -640,3 +661,199 @@ void __init init_cpu_to_node(void)
 #endif
 
 
+/*
+ * Allocate node_to_cpumask_map based on number of available nodes
+ * Requires node_possible_map to be valid.
+ *
+ * Note: node_to_cpumask() is not valid until after this is done.
+ * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
+ */
+void __init setup_node_to_cpumask_map(void)
+{
+	unsigned int node, num = 0;
+	cpumask_t *map;
+
+	/* setup nr_node_ids if not done yet */
+	if (nr_node_ids == MAX_NUMNODES) {
+		for_each_node_mask(node, node_possible_map)
+			num = node;
+		nr_node_ids = num + 1;
+	}
+
+	/* allocate the map */
+	map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
+	DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids);
+
+	pr_debug("Node to cpumask map at %p for %d nodes\n",
+		 map, nr_node_ids);
+
+	/* node_to_cpumask() will now work */
+	node_to_cpumask_map = map;
+}
+
+void __cpuinit numa_set_node(int cpu, int node)
+{
+	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
+
+	/* early setting, no percpu area yet */
+	if (cpu_to_node_map) {
+		cpu_to_node_map[cpu] = node;
+		return;
+	}
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+	if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) {
+		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
+		dump_stack();
+		return;
+	}
+#endif
+	per_cpu(x86_cpu_to_node_map, cpu) = node;
+
+	if (node != NUMA_NO_NODE)
+		per_cpu(node_number, cpu) = node;
+}
+
+void __cpuinit numa_clear_node(int cpu)
+{
+	numa_set_node(cpu, NUMA_NO_NODE);
+}
+
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+	cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+
+#else /* CONFIG_DEBUG_PER_CPU_MAPS */
+
+/*
+ * --------- debug versions of the numa functions ---------
+ */
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+	int node = early_cpu_to_node(cpu);
+	cpumask_t *mask;
+	char buf[64];
+
+	if (node_to_cpumask_map == NULL) {
+		printk(KERN_ERR "node_to_cpumask_map NULL\n");
+		dump_stack();
+		return;
+	}
+
+	mask = &node_to_cpumask_map[node];
+	if (enable)
+		cpu_set(cpu, *mask);
+	else
+		cpu_clear(cpu, *mask);
+
+	cpulist_scnprintf(buf, sizeof(buf), mask);
+	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+		enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
+}
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 1);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 0);
+}
+
+int cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
+		printk(KERN_WARNING
+			"cpu_to_node(%d): usage too early!\n", cpu);
+		dump_stack();
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+	}
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
+EXPORT_SYMBOL(cpu_to_node);
+
+/*
+ * Same function as cpu_to_node() but used if called before the
+ * per_cpu areas are setup.
+ */
+int early_cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map))
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+
+	if (!per_cpu_offset(cpu)) {
+		printk(KERN_WARNING
+			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
+		dump_stack();
+		return NUMA_NO_NODE;
+	}
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
+
+
+/* empty cpumask */
+static const cpumask_t cpu_mask_none;
+
+/*
+ * Returns a pointer to the bitmask of CPUs on Node 'node'.
+ */
+const cpumask_t *cpumask_of_node(int node)
+{
+	if (node_to_cpumask_map == NULL) {
+		printk(KERN_WARNING
+			"cpumask_of_node(%d): no node_to_cpumask_map!\n",
+			node);
+		dump_stack();
+		return (const cpumask_t *)&cpu_online_map;
+	}
+	if (node >= nr_node_ids) {
+		printk(KERN_WARNING
+			"cpumask_of_node(%d): node > nr_node_ids(%d)\n",
+			node, nr_node_ids);
+		dump_stack();
+		return &cpu_mask_none;
+	}
+	return &node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(cpumask_of_node);
+
+/*
+ * Returns a bitmask of CPUs on Node 'node'.
+ *
+ * Side note: this function creates the returned cpumask on the stack
+ * so with a high NR_CPUS count, excessive stack space is used. The
+ * node_to_cpumask_ptr function should be used whenever possible.
+ */
+cpumask_t node_to_cpumask(int node)
+{
+	if (node_to_cpumask_map == NULL) {
+		printk(KERN_WARNING
+			"node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
+		dump_stack();
+		return cpu_online_map;
+	}
+	if (node >= nr_node_ids) {
+		printk(KERN_WARNING
+			"node_to_cpumask(%d): node > nr_node_ids(%d)\n",
+			node, nr_node_ids);
+		dump_stack();
+		return cpu_mask_none;
+	}
+	return node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(node_to_cpumask);
+
+/*
+ * --------- end of debug versions of the numa functions ---------
+ */
+
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
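setup_node_to_cpumask_map() above derives nr_node_ids as the highest possible node plus one before allocating the map. A self-contained sketch of that derivation over a plain bitmask; the mask handling here is simplified and is not the kernel's nodemask API:

/* sketch: size a per-node array from a bitmask of possible nodes by taking
 * the highest set bit plus one -- simplified, illustrative only. */
#include <stdio.h>

static unsigned int nr_ids_from_mask(unsigned long possible_mask)
{
	unsigned int node, highest = 0;

	for (node = 0; node < 8 * sizeof(possible_mask); node++)
		if (possible_mask & (1UL << node))
			highest = node;
	return highest + 1;
}

int main(void)
{
	unsigned long node_possible = (1UL << 0) | (1UL << 2) | (1UL << 5);

	printf("nr_node_ids = %u\n", nr_ids_from_mask(node_possible));	/* 6 */
	return 0;
}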
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e89d24815f26..84ba74820ad6 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -534,6 +534,36 @@ out_unlock:
 	return 0;
 }
 
+static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
+			       int primary)
+{
+	/*
+	 * Ignore all non primary paths.
+	 */
+	if (!primary)
+		return 0;
+
+	/*
+	 * Ignore the NULL PTE for kernel identity mapping, as it is expected
+	 * to have holes.
+	 * Also set numpages to '1' indicating that we processed cpa req for
+	 * one virtual address page and its pfn. TBD: numpages can be set based
+	 * on the initial value and the level returned by lookup_address().
+	 */
+	if (within(vaddr, PAGE_OFFSET,
+		   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
+		cpa->numpages = 1;
+		cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
+		return 0;
+	} else {
+		WARN(1, KERN_WARNING "CPA: called for zero pte. "
+		     "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
+		     *cpa->vaddr);
+
+		return -EFAULT;
+	}
+}
+
 static int __change_page_attr(struct cpa_data *cpa, int primary)
 {
 	unsigned long address;
@@ -549,17 +579,11 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
 repeat:
 	kpte = lookup_address(address, &level);
 	if (!kpte)
-		return 0;
+		return __cpa_process_fault(cpa, address, primary);
 
 	old_pte = *kpte;
-	if (!pte_val(old_pte)) {
-		if (!primary)
-			return 0;
-		WARN(1, KERN_WARNING "CPA: called for zero pte. "
-		       "vaddr = %lx cpa->vaddr = %lx\n", address,
-			*cpa->vaddr);
-		return -EINVAL;
-	}
+	if (!pte_val(old_pte))
+		return __cpa_process_fault(cpa, address, primary);
 
 	if (level == PG_LEVEL_4K) {
 		pte_t new_pte;
@@ -657,12 +681,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
 	vaddr = *cpa->vaddr;
 
 	if (!(within(vaddr, PAGE_OFFSET,
-		    PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
-#ifdef CONFIG_X86_64
-	    || within(vaddr, PAGE_OFFSET + (1UL<<32),
-		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
-#endif
-	)) {
+		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
 
 		alias_cpa = *cpa;
 		temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
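__cpa_process_fault() above tolerates a missing PTE only when the address falls inside the kernel identity mapping, which it tests with within(). A tiny sketch of that half-open range check with invented bounds:

/* sketch: half-open range test used to decide whether a missing mapping is
 * an expected hole or a real error -- bounds are illustrative. */
#include <stdio.h>

static int within(unsigned long addr, unsigned long start, unsigned long end)
{
	return addr >= start && addr < end;
}

int main(void)
{
	unsigned long ident_start = 0xc0000000UL;	/* pretend PAGE_OFFSET */
	unsigned long ident_end   = 0xf0000000UL;	/* pretend end of direct map */

	printf("0xc1000000 inside: %d\n",
	       within(0xc1000000UL, ident_start, ident_end));	/* 1: ignore fault */
	printf("0xf1000000 inside: %d\n",
	       within(0xf1000000UL, ident_start, ident_end));	/* 0: report error */
	return 0;
}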
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 8b08fb955274..9127e31c7268 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -30,7 +30,7 @@
 #ifdef CONFIG_X86_PAT
 int __read_mostly pat_enabled = 1;
 
-void __cpuinit pat_disable(char *reason)
+void __cpuinit pat_disable(const char *reason)
 {
 	pat_enabled = 0;
 	printk(KERN_INFO "%s\n", reason);
@@ -42,6 +42,11 @@ static int __init nopat(char *str)
 	return 0;
 }
 early_param("nopat", nopat);
+#else
+static inline void pat_disable(const char *reason)
+{
+	(void)reason;
+}
 #endif
 
 
@@ -78,16 +83,20 @@ void pat_init(void)
 	if (!pat_enabled)
 		return;
 
-	/* Paranoia check. */
-	if (!cpu_has_pat && boot_pat_state) {
-		/*
-		 * If this happens we are on a secondary CPU, but
-		 * switched to PAT on the boot CPU. We have no way to
-		 * undo PAT.
-		 */
-		printk(KERN_ERR "PAT enabled, "
-		       "but not supported by secondary CPU\n");
-		BUG();
+	if (!cpu_has_pat) {
+		if (!boot_pat_state) {
+			pat_disable("PAT not supported by CPU.");
+			return;
+		} else {
+			/*
+			 * If this happens we are on a secondary CPU, but
+			 * switched to PAT on the boot CPU. We have no way to
+			 * undo PAT.
+			 */
+			printk(KERN_ERR "PAT enabled, "
+			       "but not supported by secondary CPU\n");
+			BUG();
+		}
 	}
 
 	/* Set PWT to Write-Combining. All other bits stay the same */
@@ -333,11 +342,23 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 				req_type & _PAGE_CACHE_MASK);
 	}
 
-	is_range_ram = pagerange_is_ram(start, end);
-	if (is_range_ram == 1)
-		return reserve_ram_pages_type(start, end, req_type, new_type);
-	else if (is_range_ram < 0)
-		return -EINVAL;
+	if (new_type)
+		*new_type = actual_type;
+
+	/*
+	 * For legacy reasons, some parts of the physical address range in the
+	 * legacy 1MB region is treated as non-RAM (even when listed as RAM in
+	 * the e820 tables). So we will track the memory attributes of this
+	 * legacy 1MB region using the linear memtype_list always.
+	 */
+	if (end >= ISA_END_ADDRESS) {
+		is_range_ram = pagerange_is_ram(start, end);
+		if (is_range_ram == 1)
+			return reserve_ram_pages_type(start, end, req_type,
+						      new_type);
+		else if (is_range_ram < 0)
+			return -EINVAL;
+	}
 
 	new  = kmalloc(sizeof(struct memtype), GFP_KERNEL);
 	if (!new)
@@ -347,9 +368,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 	new->end	= end;
 	new->type	= actual_type;
 
-	if (new_type)
-		*new_type = actual_type;
-
 	spin_lock(&memtype_lock);
 
 	if (cached_entry && start >= cached_start)
@@ -437,11 +455,19 @@ int free_memtype(u64 start, u64 end)
 	if (is_ISA_range(start, end - 1))
 		return 0;
 
-	is_range_ram = pagerange_is_ram(start, end);
-	if (is_range_ram == 1)
-		return free_ram_pages_type(start, end);
-	else if (is_range_ram < 0)
-		return -EINVAL;
+	/*
+	 * For legacy reasons, some parts of the physical address range in the
+	 * legacy 1MB region is treated as non-RAM (even when listed as RAM in
+	 * the e820 tables). So we will track the memory attributes of this
+	 * legacy 1MB region using the linear memtype_list always.
+	 */
+	if (end >= ISA_END_ADDRESS) {
+		is_range_ram = pagerange_is_ram(start, end);
+		if (is_range_ram == 1)
+			return free_ram_pages_type(start, end);
+		else if (is_range_ram < 0)
+			return -EINVAL;
+	}
 
 	spin_lock(&memtype_lock);
 	list_for_each_entry(entry, &memtype_list, nd) {
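Both reserve_memtype() and free_memtype() above now use page-based RAM tracking only above the 1MB ISA boundary and keep everything below it on the linear memtype list. A compact sketch of that dispatch; the helpers are stubs, not the kernel's functions:

/* sketch: route attribute tracking by physical range -- below 1MB always use
 * the linear list, above it prefer per-page tracking for RAM. Illustrative. */
#include <stdio.h>

#define ISA_END_ADDRESS 0x100000ULL	/* 1MB */

static int range_is_ram(unsigned long long start, unsigned long long end)
{
	(void)start; (void)end;
	return 1;			/* pretend e820 says RAM */
}

static const char *track_memtype(unsigned long long start, unsigned long long end)
{
	if (end >= ISA_END_ADDRESS && range_is_ram(start, end))
		return "per-page RAM tracking";
	return "linear memtype list";
}

int main(void)
{
	printf("0x90000-0xa0000     : %s\n", track_memtype(0x90000, 0xa0000));
	printf("0x10000000-0x10010000: %s\n",
	       track_memtype(0x10000000, 0x10010000));
	return 0;
}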
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 09737c8af074..15df1baee100 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -21,6 +21,7 @@
 #include <asm/numa.h>
 #include <asm/e820.h>
 #include <asm/genapic.h>
+#include <asm/uv/uv.h>
 
 int acpi_numa __initdata;
 
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
new file mode 100644
index 000000000000..72a6d4ebe34d
--- /dev/null
+++ b/arch/x86/mm/tlb.c
@@ -0,0 +1,296 @@
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/apic.h>
+#include <asm/uv/uv.h>
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
+			= { &init_mm, 0, };
+
+#include <mach_ipi.h>
+/*
+ *	Smarter SMP flushing macros.
+ *		c/o Linus Torvalds.
+ *
+ *	These mean you can really definitely utterly forget about
+ *	writing to user space from interrupts. (Its not allowed anyway).
+ *
+ *	Optimizations Manfred Spraul <manfred@colorfullife.com>
+ *
+ *	More scalable flush, from Andi Kleen
+ *
+ *	To avoid global state use 8 different call vectors.
+ *	Each CPU uses a specific vector to trigger flushes on other
+ *	CPUs. Depending on the received vector the target CPUs look into
+ *	the right array slot for the flush data.
+ *
+ *	With more than 8 CPUs they are hashed to the 8 available
+ *	vectors. The limited global vector space forces us to this right now.
+ *	In future when interrupts are split into per CPU domains this could be
+ *	fixed, at the cost of triggering multiple IPIs in some cases.
+ */
+
+union smp_flush_state {
+	struct {
+		struct mm_struct *flush_mm;
+		unsigned long flush_va;
+		spinlock_t tlbstate_lock;
+		DECLARE_BITMAP(flush_cpumask, NR_CPUS);
+	};
+	char pad[CONFIG_X86_INTERNODE_CACHE_BYTES];
+} ____cacheline_internodealigned_in_smp;
+
+/* State is put into the per CPU data section, but padded
+   to a full cache line because other CPUs can access it and we don't
+   want false sharing in the per cpu data segment. */
+static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
+
+/*
+ * We cannot call mmdrop() because we are in interrupt context,
+ * instead update mm->cpu_vm_mask.
+ */
+void leave_mm(int cpu)
+{
+	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+		BUG();
+	cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
+	load_cr3(swapper_pg_dir);
+}
+EXPORT_SYMBOL_GPL(leave_mm);
+
+/*
+ *
+ * The flush IPI assumes that a thread switch happens in this order:
+ * [cpu0: the cpu that switches]
+ * 1) switch_mm() either 1a) or 1b)
+ * 1a) thread switch to a different mm
+ * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
+ *	Stop ipi delivery for the old mm. This is not synchronized with
+ *	the other cpus, but smp_invalidate_interrupt ignore flush ipis
+ *	for the wrong mm, and in the worst case we perform a superfluous
+ *	tlb flush.
+ * 1a2) set cpu mmu_state to TLBSTATE_OK
+ *	Now the smp_invalidate_interrupt won't call leave_mm if cpu0
+ *	was in lazy tlb mode.
+ * 1a3) update cpu active_mm
+ *	Now cpu0 accepts tlb flushes for the new mm.
+ * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
+ *	Now the other cpus will send tlb flush ipis.
+ * 1a4) change cr3.
+ * 1b) thread switch without mm change
+ *	cpu active_mm is correct, cpu0 already handles
+ *	flush ipis.
+ * 1b1) set cpu mmu_state to TLBSTATE_OK
+ * 1b2) test_and_set the cpu bit in cpu_vm_mask.
+ *	Atomically set the bit [other cpus will start sending flush ipis],
+ *	and test the bit.
+ * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
+ * 2) switch %%esp, ie current
+ *
+ * The interrupt must handle 2 special cases:
+ * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
+ * - the cpu performs speculative tlb reads, i.e. even if the cpu only
+ *   runs in kernel space, the cpu could load tlb entries for user space
+ *   pages.
+ *
+ * The good news is that cpu mmu_state is local to each cpu, no
+ * write/read ordering problems.
+ */
+
+/*
+ * TLB flush IPI:
+ *
+ * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
+ * 2) Leave the mm if we are in the lazy tlb mode.
+ *
+ * Interrupts are disabled.
+ */
+
+/*
+ * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop
+ * but still used for documentation purpose but the usage is slightly
+ * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt
+ * entry calls in with the first parameter in %eax. Maybe define
+ * intrlinkage?
+ */
+#ifdef CONFIG_X86_64
+asmlinkage
+#endif
+void smp_invalidate_interrupt(struct pt_regs *regs)
+{
+	unsigned int cpu;
+	unsigned int sender;
+	union smp_flush_state *f;
+
+	cpu = smp_processor_id();
+	/*
+	 * orig_rax contains the negated interrupt vector.
+	 * Use that to determine where the sender put the data.
+	 */
+	sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
+	f = &flush_state[sender];
+
+	if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
+		goto out;
+		/*
+		 * This was a BUG() but until someone can quote me the
+		 * line from the intel manual that guarantees an IPI to
+		 * multiple CPUs is retried _only_ on the erroring CPUs
+		 * its staying as a return
+		 *
+		 * BUG();
+		 */
+
+	if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
+		if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
+			if (f->flush_va == TLB_FLUSH_ALL)
+				local_flush_tlb();
+			else
+				__flush_tlb_one(f->flush_va);
+		} else
+			leave_mm(cpu);
+	}
+out:
+	ack_APIC_irq();
+	smp_mb__before_clear_bit();
+	cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
+	smp_mb__after_clear_bit();
+	inc_irq_stat(irq_tlb_count);
+}
+
+static void flush_tlb_others_ipi(const struct cpumask *cpumask,
+				 struct mm_struct *mm, unsigned long va)
+{
+	unsigned int sender;
+	union smp_flush_state *f;
+
+	/* Caller has disabled preemption */
+	sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
+	f = &flush_state[sender];
+
+	/*
+	 * Could avoid this lock when
+	 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
+	 * probably not worth checking this for a cache-hot lock.
+	 */
+	spin_lock(&f->tlbstate_lock);
+
+	f->flush_mm = mm;
+	f->flush_va = va;
+	cpumask_andnot(to_cpumask(f->flush_cpumask),
+		       cpumask, cpumask_of(smp_processor_id()));
+
+	/*
+	 * Make the above memory operations globally visible before
+	 * sending the IPI.
+	 */
+	smp_mb();
+	/*
+	 * We have to send the IPI only to
+	 * CPUs affected.
+	 */
+	send_IPI_mask(to_cpumask(f->flush_cpumask),
+		      INVALIDATE_TLB_VECTOR_START + sender);
+
+	while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
+		cpu_relax();
+
+	f->flush_mm = NULL;
+	f->flush_va = 0;
+	spin_unlock(&f->tlbstate_lock);
+}
+
+void native_flush_tlb_others(const struct cpumask *cpumask,
+			     struct mm_struct *mm, unsigned long va)
+{
+	if (is_uv_system()) {
+		unsigned int cpu;
+
+		cpu = get_cpu();
+		cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
+		if (cpumask)
+			flush_tlb_others_ipi(cpumask, mm, va);
+		put_cpu();
+		return;
+	}
+	flush_tlb_others_ipi(cpumask, mm, va);
+}
+
+static int __cpuinit init_smp_flush(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(flush_state); i++)
+		spin_lock_init(&flush_state[i].tlbstate_lock);
+
+	return 0;
+}
+core_initcall(init_smp_flush);
+
+void flush_tlb_current_task(void)
+{
+	struct mm_struct *mm = current->mm;
+
+	preempt_disable();
+
+	local_flush_tlb();
+	if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+		flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
+	preempt_enable();
+}
+
+void flush_tlb_mm(struct mm_struct *mm)
+{
+	preempt_disable();
+
+	if (current->active_mm == mm) {
+		if (current->mm)
+			local_flush_tlb();
+		else
+			leave_mm(smp_processor_id());
+	}
+	if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+		flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
+
+	preempt_enable();
+}
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	preempt_disable();
+
+	if (current->active_mm == mm) {
+		if (current->mm)
+			__flush_tlb_one(va);
+		else
+			leave_mm(smp_processor_id());
+	}
+
+	if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+		flush_tlb_others(&mm->cpu_vm_mask, mm, va);
+
+	preempt_enable();
+}
+
+static void do_flush_tlb_all(void *info)
+{
+	unsigned long cpu = smp_processor_id();
+
+	__flush_tlb_all();
+	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
+		leave_mm(cpu);
+}
+
+void flush_tlb_all(void)
+{
+	on_each_cpu(do_flush_tlb_all, NULL, 1);
+}