Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/Makefile                                             |   6
-rw-r--r--  arch/x86/mm/dump_pagetables.c                                    |   4
-rw-r--r--  arch/x86/mm/fault.c                                              |  67
-rw-r--r--  arch/x86/mm/gup.c                                                |  10
-rw-r--r--  arch/x86/mm/highmem_32.c                                         |   1
-rw-r--r--  arch/x86/mm/init_32.c                                            |  90
-rw-r--r--  arch/x86/mm/init_64.c                                            | 184
-rw-r--r--  arch/x86/mm/ioremap.c                                            | 197
-rw-r--r--  arch/x86/mm/memtest.c                                            |   7
-rw-r--r--  arch/x86/mm/mmio-mod.c                                           |  87
-rw-r--r--  arch/x86/mm/numa_32.c (renamed from arch/x86/mm/discontig_32.c)  |   2
-rw-r--r--  arch/x86/mm/numa_64.c                                            |  10
-rw-r--r--  arch/x86/mm/pageattr-test.c                                      |   9
-rw-r--r--  arch/x86/mm/pageattr.c                                           | 476
-rw-r--r--  arch/x86/mm/pat.c                                                | 132
-rw-r--r--  arch/x86/mm/pf_in.c                                              | 121
-rw-r--r--  arch/x86/mm/pgtable.c                                            |   6
-rw-r--r--  arch/x86/mm/pgtable_32.c                                         |   3
-rw-r--r--  arch/x86/mm/srat_64.c                                            |   2
-rw-r--r--  arch/x86/mm/testmmiotrace.c                                      |   4
20 files changed, 916 insertions(+), 502 deletions(-)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index dfb932dcf13..59f89b434b4 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -13,12 +13,8 @@ obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
 mmiotrace-y			:= pf_in.o mmio-mod.o
 obj-$(CONFIG_MMIOTRACE_TEST)	+= testmmiotrace.o
 
-ifeq ($(CONFIG_X86_32),y)
-obj-$(CONFIG_NUMA)		+= discontig_32.o
-else
-obj-$(CONFIG_NUMA)		+= numa_64.o
+obj-$(CONFIG_NUMA)		+= numa_$(BITS).o
 obj-$(CONFIG_K8_NUMA)		+= k8topology_64.o
-endif
 obj-$(CONFIG_ACPI_NUMA)	+= srat_$(BITS).o
 
 obj-$(CONFIG_MEMTEST)		+= memtest.o
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index a20d1fa64b4..e7277cbcfb4 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
 	 * we have now. "break" is either changing perms, levels or
 	 * address space marker.
 	 */
-	prot = pgprot_val(new_prot) & ~(PTE_PFN_MASK);
-	cur = pgprot_val(st->current_prot) & ~(PTE_PFN_MASK);
+	prot = pgprot_val(new_prot) & PTE_FLAGS_MASK;
+	cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK;
 
 	if (!st->level) {
 		/* First entry */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 455f3fe67b4..31e8730fa24 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -35,6 +35,7 @@
 #include <asm/tlbflush.h>
 #include <asm/proto.h>
 #include <asm-generic/sections.h>
+#include <asm/traps.h>
 
 /*
  * Page fault error code bits
@@ -357,8 +358,6 @@ static int is_errata100(struct pt_regs *regs, unsigned long address)
 	return 0;
 }
 
-void do_invalid_op(struct pt_regs *, unsigned long);
-
 static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
 {
 #ifdef CONFIG_X86_F00F_BUG
@@ -593,11 +592,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	unsigned long flags;
 #endif
 
-	/*
-	 * We can fault from pretty much anywhere, with unknown IRQ state.
-	 */
-	trace_hardirqs_fixup();
-
 	tsk = current;
 	mm = tsk->mm;
 	prefetchw(&mm->mmap_sem);
@@ -646,24 +640,23 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	}
 
 
-#ifdef CONFIG_X86_32
-	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
-	   fault has been handled. */
-	if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
-		local_irq_enable();
-
 	/*
-	 * If we're in an interrupt, have no user context or are running in an
-	 * atomic region then we must not take the fault.
+	 * It's safe to allow irq's after cr2 has been saved and the
+	 * vmalloc fault has been handled.
+	 *
+	 * User-mode registers count as a user access even for any
+	 * potential system fault or CPU buglet.
 	 */
-	if (in_atomic() || !mm)
-		goto bad_area_nosemaphore;
-#else /* CONFIG_X86_64 */
-	if (likely(regs->flags & X86_EFLAGS_IF))
+	if (user_mode_vm(regs)) {
+		local_irq_enable();
+		error_code |= PF_USER;
+	} else if (regs->flags & X86_EFLAGS_IF)
 		local_irq_enable();
 
+#ifdef CONFIG_X86_64
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(address, regs, error_code);
+#endif
 
 	/*
 	 * If we're in an interrupt, have no user context or are running in an
@@ -672,15 +665,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	if (unlikely(in_atomic() || !mm))
 		goto bad_area_nosemaphore;
 
-	/*
-	 * User-mode registers count as a user access even for any
-	 * potential system fault or CPU buglet.
-	 */
-	if (user_mode_vm(regs))
-		error_code |= PF_USER;
 again:
-#endif
-	/* When running in the kernel we expect faults to occur only to
+	/*
+	 * When running in the kernel we expect faults to occur only to
 	 * addresses in user space. All other faults represent errors in the
 	 * kernel and should generate an OOPS. Unfortunately, in the case of an
 	 * erroneous fault occurring in a code path which already holds mmap_sem
@@ -743,9 +730,6 @@ good_area:
 		goto bad_area;
 	}
 
-#ifdef CONFIG_X86_32
-survive:
-#endif
 	/*
 	 * If for any reason at all we couldn't handle the fault,
 	 * make sure we exit gracefully rather than endlessly redo
@@ -880,12 +864,11 @@ out_of_memory:
 		up_read(&mm->mmap_sem);
 		if (is_global_init(tsk)) {
 			yield();
-#ifdef CONFIG_X86_32
-			down_read(&mm->mmap_sem);
-			goto survive;
-#else
+			/*
+			 * Re-lookup the vma - in theory the vma tree might
+			 * have changed:
+			 */
 			goto again;
-#endif
 		}
 
 	printk("VM: killing process %s\n", tsk->comm);
@@ -915,15 +898,15 @@ LIST_HEAD(pgd_list);
 
 void vmalloc_sync_all(void)
 {
-#ifdef CONFIG_X86_32
-	unsigned long start = VMALLOC_START & PGDIR_MASK;
 	unsigned long address;
 
+#ifdef CONFIG_X86_32
 	if (SHARED_KERNEL_PMD)
 		return;
 
-	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
-	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
+	for (address = VMALLOC_START & PMD_MASK;
+	     address >= TASK_SIZE && address < FIXADDR_TOP;
+	     address += PMD_SIZE) {
 		unsigned long flags;
 		struct page *page;
 
@@ -936,10 +919,8 @@ void vmalloc_sync_all(void)
 		spin_unlock_irqrestore(&pgd_lock, flags);
 	}
 #else /* CONFIG_X86_64 */
-	unsigned long start = VMALLOC_START & PGDIR_MASK;
-	unsigned long address;
-
-	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
+	for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
+	     address += PGDIR_SIZE) {
 		const pgd_t *pgd_ref = pgd_offset_k(address);
 		unsigned long flags;
 		struct page *page;
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 007bb06c750..4ba373c5b8c 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -82,7 +82,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 		pte_t pte = gup_get_pte(ptep);
 		struct page *page;
 
-		if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+		if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
 			pte_unmap(ptep);
 			return 0;
 		}
@@ -116,10 +116,10 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
 	mask = _PAGE_PRESENT|_PAGE_USER;
 	if (write)
 		mask |= _PAGE_RW;
-	if ((pte_val(pte) & mask) != mask)
+	if ((pte_flags(pte) & mask) != mask)
 		return 0;
 	/* hugepages are never "special" */
-	VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+	VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
 	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 
 	refs = 0;
@@ -173,10 +173,10 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
 	mask = _PAGE_PRESENT|_PAGE_USER;
 	if (write)
 		mask |= _PAGE_RW;
-	if ((pte_val(pte) & mask) != mask)
+	if ((pte_flags(pte) & mask) != mask)
 		return 0;
 	/* hugepages are never "special" */
-	VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+	VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
 	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 
 	refs = 0;
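
A note on the pte_val() -> pte_flags() conversions in this file: pte_val() returns the whole PTE, frame number and flag bits together, while pte_flags() returns only the flag bits. For permission checks that is sufficient, and under paravirtualization (e.g. Xen) it can reportedly also be cheaper, since the full value may require translating the machine frame number while the flags need no translation. A minimal sketch of the distinction - the mask value below is an assumed illustration, not the kernel's exact definition:

	typedef unsigned long pteval_t;

	/* assumed 64-bit layout: bits 12..51 hold the page frame number */
	#define PTE_PFN_MASK	((pteval_t)0x000ffffffffff000UL)
	#define PTE_FLAGS_MASK	(~PTE_PFN_MASK)

	static inline pteval_t pte_flags_sketch(pteval_t val)
	{
		return val & PTE_FLAGS_MASK;	/* flag bits only */
	}
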
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 165c871ba9a..bcc079c282d 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -137,6 +137,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
 
 	return (void*) vaddr;
 }
+EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
 
 struct page *kmap_atomic_to_page(void *ptr)
 {
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 60ec1d08ff2..8396868e82c 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -31,6 +31,7 @@
 #include <linux/cpumask.h>
 
 #include <asm/asm.h>
+#include <asm/bios_ebda.h>
 #include <asm/processor.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -47,6 +48,7 @@
 #include <asm/paravirt.h>
 #include <asm/setup.h>
 #include <asm/cacheflush.h>
+#include <asm/smp.h>
 
 unsigned int __VMALLOC_RESERVE = 128 << 20;
 
@@ -194,11 +196,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
 	pgd_t *pgd;
 	pmd_t *pmd;
 	pte_t *pte;
-	unsigned pages_2m = 0, pages_4k = 0;
+	unsigned pages_2m, pages_4k;
+	int mapping_iter;
+
+	/*
+	 * First iteration will setup identity mapping using large/small pages
+	 * based on use_pse, with other attributes same as set by
+	 * the early code in head_32.S
+	 *
+	 * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
+	 * as desired for the kernel identity mapping.
+	 *
+	 * This two pass mechanism conforms to the TLB app note which says:
+	 *
+	 *     "Software should not write to a paging-structure entry in a way
+	 *      that would change, for any linear address, both the page size
+	 *      and either the page frame or attributes."
+	 */
+	mapping_iter = 1;
 
 	if (!cpu_has_pse)
 		use_pse = 0;
 
+repeat:
+	pages_2m = pages_4k = 0;
 	pfn = start_pfn;
 	pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
 	pgd = pgd_base + pgd_idx;
@@ -224,6 +245,13 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
 			if (use_pse) {
 				unsigned int addr2;
 				pgprot_t prot = PAGE_KERNEL_LARGE;
+				/*
+				 * first pass will use the same initial
+				 * identity mapping attribute + _PAGE_PSE.
+				 */
+				pgprot_t init_prot =
+					__pgprot(PTE_IDENT_ATTR |
+						 _PAGE_PSE);
 
 				addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
 					PAGE_OFFSET + PAGE_SIZE-1;
@@ -233,7 +261,10 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
 					prot = PAGE_KERNEL_LARGE_EXEC;
 
 				pages_2m++;
-				set_pmd(pmd, pfn_pmd(pfn, prot));
+				if (mapping_iter == 1)
+					set_pmd(pmd, pfn_pmd(pfn, init_prot));
+				else
+					set_pmd(pmd, pfn_pmd(pfn, prot));
 
 				pfn += PTRS_PER_PTE;
 				continue;
@@ -245,17 +276,43 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
 			for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
 			     pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
 				pgprot_t prot = PAGE_KERNEL;
+				/*
+				 * first pass will use the same initial
+				 * identity mapping attribute.
+				 */
+				pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
 
 				if (is_kernel_text(addr))
 					prot = PAGE_KERNEL_EXEC;
 
 				pages_4k++;
-				set_pte(pte, pfn_pte(pfn, prot));
+				if (mapping_iter == 1)
+					set_pte(pte, pfn_pte(pfn, init_prot));
+				else
+					set_pte(pte, pfn_pte(pfn, prot));
 			}
 		}
 	}
-	update_page_count(PG_LEVEL_2M, pages_2m);
-	update_page_count(PG_LEVEL_4K, pages_4k);
+	if (mapping_iter == 1) {
+		/*
+		 * update direct mapping page count only in the first
+		 * iteration.
+		 */
+		update_page_count(PG_LEVEL_2M, pages_2m);
+		update_page_count(PG_LEVEL_4K, pages_4k);
+
+		/*
+		 * local global flush tlb, which will flush the previous
+		 * mappings present in both small and large page TLB's.
+		 */
+		__flush_tlb_all();
+
+		/*
+		 * Second iteration will set the actual desired PTE attributes.
+		 */
+		mapping_iter = 2;
+		goto repeat;
+	}
 }
 
 /*
@@ -501,7 +558,7 @@ void zap_low_mappings(void)
 
 int nx_enabled;
 
-pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
+pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
 #ifdef CONFIG_X86_PAE
@@ -718,7 +775,7 @@ void __init setup_bootmem_allocator(void)
 	after_init_bootmem = 1;
 }
 
-static void __init find_early_table_space(unsigned long end)
+static void __init find_early_table_space(unsigned long end, int use_pse)
 {
 	unsigned long puds, pmds, ptes, tables, start;
 
@@ -728,7 +785,7 @@ static void __init find_early_table_space(unsigned long end)
 	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
 	tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
 
-	if (cpu_has_pse) {
+	if (use_pse) {
 		unsigned long extra;
 
 		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
@@ -768,12 +825,22 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	pgd_t *pgd_base = swapper_pg_dir;
 	unsigned long start_pfn, end_pfn;
 	unsigned long big_page_start;
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	/*
+	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+	 * This will simplify cpa(), which otherwise needs to support splitting
+	 * large pages into small in interrupt context, etc.
+	 */
+	int use_pse = 0;
+#else
+	int use_pse = cpu_has_pse;
+#endif
 
 	/*
 	 * Find space for the kernel direct mapping tables.
 	 */
 	if (!after_init_bootmem)
-		find_early_table_space(end);
+		find_early_table_space(end, use_pse);
 
 #ifdef CONFIG_X86_PAE
 	set_nx();
@@ -819,7 +886,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
 	if (start_pfn < end_pfn)
 		kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
-						cpu_has_pse);
+						use_pse);
 
 	/* tail is not big page alignment ? */
 	start_pfn = end_pfn;
@@ -903,6 +970,8 @@ void __init mem_init(void)
 	int codesize, reservedpages, datasize, initsize;
 	int tmp;
 
+	start_periodic_check_for_corruption();
+
 #ifdef CONFIG_FLATMEM
 	BUG_ON(!mem_map);
 #endif
@@ -982,7 +1051,6 @@ void __init mem_init(void)
 	if (boot_cpu_data.wp_works_ok < 0)
 		test_wp_bit();
 
-	cpa_init();
 	save_pg_dir();
 	zap_low_mappings();
 }
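
The two-pass mapping adopted above is the whole of the TLB-app-note story: never change both the page size and the frame/attributes of a translation in one write. A self-contained sketch of the pattern follows; the names and attribute values are invented for illustration, this is not the kernel code:

	#include <stdio.h>

	#define NENT 4
	enum { BOOT_ATTRS = 0x1,	/* stands in for PTE_IDENT_ATTR      */
	       FINAL_ATTRS = 0x3 };	/* stands in for PAGE_KERNEL bits    */

	static unsigned long entries[NENT];

	static void flush_tlb_all_stub(void) { /* stands in for __flush_tlb_all() */ }

	int main(void)
	{
		/* pass 1: keep the attributes the boot code already used;
		 * pass 2: after a full flush, write the final attributes. */
		for (int pass = 1; pass <= 2; pass++) {
			for (int i = 0; i < NENT; i++)
				entries[i] = (pass == 1) ? BOOT_ATTRS : FINAL_ATTRS;
			if (pass == 1)
				flush_tlb_all_stub();
		}
		printf("final attrs: %#lx\n", entries[0]);
		return 0;
	}
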
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d3746efb060..b8e461d4941 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -31,6 +31,7 @@
 #include <linux/nmi.h>
 
 #include <asm/processor.h>
+#include <asm/bios_ebda.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -88,6 +89,62 @@ early_param("gbpages", parse_direct_gbpages_on);
 
 int after_bootmem;
 
+pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+static int do_not_nx __cpuinitdata;
+
+/*
+ * noexec=on|off
+ * Control non-executable mappings for 64-bit processes.
+ *
+ * on	Enable (default)
+ * off	Disable
+ */
+static int __init nonx_setup(char *str)
+{
+	if (!str)
+		return -EINVAL;
+	if (!strncmp(str, "on", 2)) {
+		__supported_pte_mask |= _PAGE_NX;
+		do_not_nx = 0;
+	} else if (!strncmp(str, "off", 3)) {
+		do_not_nx = 1;
+		__supported_pte_mask &= ~_PAGE_NX;
+	}
+	return 0;
+}
+early_param("noexec", nonx_setup);
+
+void __cpuinit check_efer(void)
+{
+	unsigned long efer;
+
+	rdmsrl(MSR_EFER, efer);
+	if (!(efer & EFER_NX) || do_not_nx)
+		__supported_pte_mask &= ~_PAGE_NX;
+}
+
+int force_personality32;
+
+/*
+ * noexec32=on|off
+ * Control non executable heap for 32bit processes.
+ * To control the stack too use noexec=off
+ *
+ * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
+ * off	PROT_READ implies PROT_EXEC
+ */
+static int __init nonx32_setup(char *str)
+{
+	if (!strcmp(str, "on"))
+		force_personality32 &= ~READ_IMPLIES_EXEC;
+	else if (!strcmp(str, "off"))
+		force_personality32 |= READ_IMPLIES_EXEC;
+	return 1;
+}
+__setup("noexec32=", nonx32_setup);
+
 /*
  * NOTE: This function is marked __ref because it calls __init function
  * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -139,9 +196,6 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
 	}
 
 	pte = pte_offset_kernel(pmd, vaddr);
-	if (!pte_none(*pte) && pte_val(new_pte) &&
-	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
-		pte_ERROR(*pte);
 	set_pte(pte, new_pte);
 
 	/*
@@ -225,7 +279,7 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
 void __init cleanup_highmap(void)
 {
 	unsigned long vaddr = __START_KERNEL_map;
-	unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
+	unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
 	pmd_t *pmd = level2_kernel_pgt;
 	pmd_t *last_pmd = pmd + PTRS_PER_PMD;
 
@@ -256,7 +310,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
 	if (pfn >= table_top)
 		panic("alloc_low_page: ran out of memory");
 
-	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
+	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
 	memset(adr, 0, PAGE_SIZE);
 	*phys  = pfn * PAGE_SIZE;
 	return adr;
@@ -271,7 +325,8 @@ static __ref void unmap_low_page(void *adr)
 }
 
 static unsigned long __meminit
-phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
+phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
+	      pgprot_t prot)
 {
 	unsigned pages = 0;
 	unsigned long last_map_addr = end;
@@ -289,36 +344,43 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
 			break;
 		}
 
+		/*
+		 * We will re-use the existing mapping.
+		 * Xen for example has some special requirements, like mapping
+		 * pagetable pages as RO. So assume someone who pre-setup
+		 * these mappings are more intelligent.
+		 */
 		if (pte_val(*pte))
 			continue;
 
 		if (0)
 			printk("   pte=%p addr=%lx pte=%016lx\n",
 			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
-		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
-		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
 		pages++;
+		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
+		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
 	}
+
 	update_page_count(PG_LEVEL_4K, pages);
 
 	return last_map_addr;
 }
 
 static unsigned long __meminit
-phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
+phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
+		pgprot_t prot)
 {
 	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
 
-	return phys_pte_init(pte, address, end);
+	return phys_pte_init(pte, address, end, prot);
 }
 
 static unsigned long __meminit
 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
-	      unsigned long page_size_mask)
+	      unsigned long page_size_mask, pgprot_t prot)
 {
 	unsigned long pages = 0;
 	unsigned long last_map_addr = end;
-	unsigned long start = address;
 
 	int i = pmd_index(address);
 
@@ -326,6 +388,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 		unsigned long pte_phys;
 		pmd_t *pmd = pmd_page + pmd_index(address);
 		pte_t *pte;
+		pgprot_t new_prot = prot;
 
 		if (address >= end) {
 			if (!after_bootmem) {
@@ -339,27 +402,40 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 			if (!pmd_large(*pmd)) {
 				spin_lock(&init_mm.page_table_lock);
 				last_map_addr = phys_pte_update(pmd, address,
-								end);
+								end, prot);
 				spin_unlock(&init_mm.page_table_lock);
+				continue;
 			}
-			/* Count entries we're using from level2_ident_pgt */
-			if (start == 0)
-				pages++;
-			continue;
+			/*
+			 * If we are ok with PG_LEVEL_2M mapping, then we will
+			 * use the existing mapping,
+			 *
+			 * Otherwise, we will split the large page mapping but
+			 * use the same existing protection bits except for
+			 * large page, so that we don't violate Intel's TLB
+			 * Application note (317080) which says, while changing
+			 * the page sizes, new and old translations should
+			 * not differ with respect to page frame and
+			 * attributes.
+			 */
+			if (page_size_mask & (1 << PG_LEVEL_2M))
+				continue;
+			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
 		}
 
 		if (page_size_mask & (1<<PG_LEVEL_2M)) {
 			pages++;
 			spin_lock(&init_mm.page_table_lock);
 			set_pte((pte_t *)pmd,
-				pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+				pfn_pte(address >> PAGE_SHIFT,
+					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
 			spin_unlock(&init_mm.page_table_lock);
 			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
 			continue;
 		}
 
 		pte = alloc_low_page(&pte_phys);
-		last_map_addr = phys_pte_init(pte, address, end);
+		last_map_addr = phys_pte_init(pte, address, end, new_prot);
 		unmap_low_page(pte);
 
 		spin_lock(&init_mm.page_table_lock);
@@ -372,12 +448,12 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 
 static unsigned long __meminit
 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
-	      unsigned long page_size_mask)
+	      unsigned long page_size_mask, pgprot_t prot)
 {
 	pmd_t *pmd = pmd_offset(pud, 0);
 	unsigned long last_map_addr;
 
-	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
+	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
 	__flush_tlb_all();
 	return last_map_addr;
 }
@@ -394,6 +470,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 		unsigned long pmd_phys;
 		pud_t *pud = pud_page + pud_index(addr);
 		pmd_t *pmd;
+		pgprot_t prot = PAGE_KERNEL;
 
 		if (addr >= end)
 			break;
@@ -405,10 +482,26 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 		}
 
 		if (pud_val(*pud)) {
-			if (!pud_large(*pud))
+			if (!pud_large(*pud)) {
 				last_map_addr = phys_pmd_update(pud, addr, end,
-							 page_size_mask);
+							 page_size_mask, prot);
 				continue;
+			}
+			/*
+			 * If we are ok with PG_LEVEL_1G mapping, then we will
+			 * use the existing mapping.
+			 *
+			 * Otherwise, we will split the gbpage mapping but use
+			 * the same existing protection bits except for large
+			 * page, so that we don't violate Intel's TLB
+			 * Application note (317080) which says, while changing
+			 * the page sizes, new and old translations should
+			 * not differ with respect to page frame and
+			 * attributes.
+			 */
+			if (page_size_mask & (1 << PG_LEVEL_1G))
+				continue;
+			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
 		}
 
 		if (page_size_mask & (1<<PG_LEVEL_1G)) {
@@ -422,7 +515,8 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 		}
 
 		pmd = alloc_low_page(&pmd_phys);
-		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
+		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
+					      prot);
 		unmap_low_page(pmd);
 
 		spin_lock(&init_mm.page_table_lock);
@@ -430,6 +524,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 		spin_unlock(&init_mm.page_table_lock);
 	}
 	__flush_tlb_all();
+
 	update_page_count(PG_LEVEL_1G, pages);
 
 	return last_map_addr;
@@ -446,27 +541,28 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
 	return phys_pud_init(pud, addr, end, page_size_mask);
 }
 
-static void __init find_early_table_space(unsigned long end)
+static void __init find_early_table_space(unsigned long end, int use_pse,
+					  int use_gbpages)
 {
 	unsigned long puds, pmds, ptes, tables, start;
 
 	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
-	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
-	if (direct_gbpages) {
+	tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+	if (use_gbpages) {
 		unsigned long extra;
 		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
 		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
 	} else
 		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
-	tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
+	tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
 
-	if (cpu_has_pse) {
+	if (use_pse) {
 		unsigned long extra;
 		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
 		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	} else
 		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
+	tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
 
 	/*
 	 * RED-PEN putting page tables only on node 0 could
@@ -528,6 +624,7 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start,
 		pgd_populate(&init_mm, pgd, __va(pud_phys));
 		spin_unlock(&init_mm.page_table_lock);
 	}
+	__flush_tlb_all();
 
 	return last_map_addr;
 }
@@ -571,6 +668,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 
 	struct map_range mr[NR_RANGE_MR];
 	int nr_range, i;
+	int use_pse, use_gbpages;
 
 	printk(KERN_INFO "init_memory_mapping\n");
 
@@ -584,9 +682,21 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	if (!after_bootmem)
 		init_gbpages();
 
-	if (direct_gbpages)
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	/*
+	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+	 * This will simplify cpa(), which otherwise needs to support splitting
+	 * large pages into small in interrupt context, etc.
+	 */
+	use_pse = use_gbpages = 0;
+#else
+	use_pse = cpu_has_pse;
+	use_gbpages = direct_gbpages;
+#endif
+
+	if (use_gbpages)
 		page_size_mask |= 1 << PG_LEVEL_1G;
-	if (cpu_has_pse)
+	if (use_pse)
 		page_size_mask |= 1 << PG_LEVEL_2M;
 
 	memset(mr, 0, sizeof(mr));
@@ -636,7 +746,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 			old_start = mr[i].start;
 			memmove(&mr[i], &mr[i+1],
 				(nr_range - 1 - i) * sizeof (struct map_range));
-			mr[i].start = old_start;
+			mr[i--].start = old_start;
 			nr_range--;
 		}
 
@@ -647,7 +757,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 			(mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
 
 	if (!after_bootmem)
-		find_early_table_space(end);
+		find_early_table_space(end, use_pse, use_gbpages);
 
 	for (i = 0; i < nr_range; i++)
 		last_map_addr = kernel_physical_mapping_init(
@@ -769,6 +879,8 @@ void __init mem_init(void)
 {
 	long codesize, reservedpages, datasize, initsize;
 
+	start_periodic_check_for_corruption();
+
 	pci_iommu_alloc();
 
 	/* clear_bss() already clear the empty_zero_page */
@@ -806,8 +918,6 @@ void __init mem_init(void)
 		reservedpages << (PAGE_SHIFT-10),
 		datasize >> 10,
 		initsize >> 10);
-
-	cpa_init();
 }
 
 void free_init_pages(char *what, unsigned long begin, unsigned long end)
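
For reference, the nonx_setup()/nonx32_setup() handlers that init_64.c gains above are driven from the kernel command line; per the comments in the hunk, the accepted forms are:

	noexec=on	# enable non-executable mappings (default)
	noexec=off	# clear _PAGE_NX from __supported_pte_mask
	noexec32=on	# PROT_READ does not imply PROT_EXEC for 32-bit tasks (default)
	noexec32=off	# PROT_READ implies PROT_EXEC for 32-bit tasks
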
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index d4b6e6a29ae..ae71e11eb3e 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,18 +24,47 @@
 
 #ifdef CONFIG_X86_64
 
+static inline int phys_addr_valid(unsigned long addr)
+{
+	return addr < (1UL << boot_cpu_data.x86_phys_bits);
+}
+
 unsigned long __phys_addr(unsigned long x)
 {
-	if (x >= __START_KERNEL_map)
-		return x - __START_KERNEL_map + phys_base;
-	return x - PAGE_OFFSET;
+	if (x >= __START_KERNEL_map) {
+		x -= __START_KERNEL_map;
+		VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
+		x += phys_base;
+	} else {
+		VIRTUAL_BUG_ON(x < PAGE_OFFSET);
+		x -= PAGE_OFFSET;
+		VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM :
+					!phys_addr_valid(x));
+	}
+	return x;
 }
 EXPORT_SYMBOL(__phys_addr);
 
-static inline int phys_addr_valid(unsigned long addr)
+bool __virt_addr_valid(unsigned long x)
 {
-	return addr < (1UL << boot_cpu_data.x86_phys_bits);
+	if (x >= __START_KERNEL_map) {
+		x -= __START_KERNEL_map;
+		if (x >= KERNEL_IMAGE_SIZE)
+			return false;
+		x += phys_base;
+	} else {
+		if (x < PAGE_OFFSET)
+			return false;
+		x -= PAGE_OFFSET;
+		if (system_state == SYSTEM_BOOTING ?
+				x > MAXMEM : !phys_addr_valid(x)) {
+			return false;
+		}
+	}
+
+	return pfn_valid(x >> PAGE_SHIFT);
 }
+EXPORT_SYMBOL(__virt_addr_valid);
 
 #else
 
@@ -44,6 +73,28 @@ static inline int phys_addr_valid(unsigned long addr)
 	return 1;
 }
 
+#ifdef CONFIG_DEBUG_VIRTUAL
+unsigned long __phys_addr(unsigned long x)
+{
+	/* VMALLOC_* aren't constants; not available at the boot time */
+	VIRTUAL_BUG_ON(x < PAGE_OFFSET);
+	VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING &&
+				is_vmalloc_addr((void *) x));
+	return x - PAGE_OFFSET;
+}
+EXPORT_SYMBOL(__phys_addr);
+#endif
+
+bool __virt_addr_valid(unsigned long x)
+{
+	if (x < PAGE_OFFSET)
+		return false;
+	if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x))
+		return false;
+	return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(__virt_addr_valid);
+
 #endif
 
 int page_is_ram(unsigned long pagenr)
@@ -83,6 +134,25 @@ int page_is_ram(unsigned long pagenr)
 	return 0;
 }
 
+int pagerange_is_ram(unsigned long start, unsigned long end)
+{
+	int ram_page = 0, not_rampage = 0;
+	unsigned long page_nr;
+
+	for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
+	     ++page_nr) {
+		if (page_is_ram(page_nr))
+			ram_page = 1;
+		else
+			not_rampage = 1;
+
+		if (ram_page == not_rampage)
+			return -1;
+	}
+
+	return ram_page;
+}
+
 /*
  * Fix up the linear direct mapping of the kernel to avoid cache attribute
  * conflicts.
@@ -150,6 +220,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 		return (__force void __iomem *)phys_to_virt(phys_addr);
 
 	/*
+	 * Check if the request spans more than any BAR in the iomem resource
+	 * tree.
+	 */
+	WARN_ON(iomem_map_sanity_check(phys_addr, size));
+
+	/*
 	 * Don't allow anybody to remap normal RAM that we're using..
 	 */
 	for (pfn = phys_addr >> PAGE_SHIFT;
@@ -204,16 +280,16 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 	switch (prot_val) {
 	case _PAGE_CACHE_UC:
 	default:
-		prot = PAGE_KERNEL_NOCACHE;
+		prot = PAGE_KERNEL_IO_NOCACHE;
 		break;
 	case _PAGE_CACHE_UC_MINUS:
-		prot = PAGE_KERNEL_UC_MINUS;
+		prot = PAGE_KERNEL_IO_UC_MINUS;
 		break;
 	case _PAGE_CACHE_WC:
-		prot = PAGE_KERNEL_WC;
+		prot = PAGE_KERNEL_IO_WC;
 		break;
 	case _PAGE_CACHE_WB:
-		prot = PAGE_KERNEL;
+		prot = PAGE_KERNEL_IO;
 		break;
 	}
 
@@ -421,7 +497,7 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
 	return;
 }
 
-int __initdata early_ioremap_debug;
+static int __initdata early_ioremap_debug;
 
 static int __init early_ioremap_debug_setup(char *str)
 {
@@ -530,12 +606,12 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
 }
 
 static inline void __init early_set_fixmap(enum fixed_addresses idx,
-					   unsigned long phys)
+					   unsigned long phys, pgprot_t prot)
 {
 	if (after_paging_init)
-		set_fixmap(idx, phys);
+		__set_fixmap(idx, phys, prot);
 	else
-		__early_set_fixmap(idx, phys, PAGE_KERNEL);
+		__early_set_fixmap(idx, phys, prot);
 }
 
 static inline void __init early_clear_fixmap(enum fixed_addresses idx)
@@ -546,16 +622,22 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
 	__early_set_fixmap(idx, 0, __pgprot(0));
 }
 
-
-int __initdata early_ioremap_nested;
-
+static void *prev_map[FIX_BTMAPS_SLOTS] __initdata;
+static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
 static int __init check_early_ioremap_leak(void)
 {
-	if (!early_ioremap_nested)
+	int count = 0;
+	int i;
+
+	for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+		if (prev_map[i])
+			count++;
+
+	if (!count)
 		return 0;
 	WARN(1, KERN_WARNING
 	       "Debug warning: early ioremap leak of %d areas detected.\n",
-		early_ioremap_nested);
+		count);
 	printk(KERN_WARNING
 		"please boot with early_ioremap_debug and report the dmesg.\n");
 
@@ -563,18 +645,33 @@ static int __init check_early_ioremap_leak(void)
 }
 late_initcall(check_early_ioremap_leak);
 
-void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
+static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
 {
 	unsigned long offset, last_addr;
-	unsigned int nrpages, nesting;
+	unsigned int nrpages;
 	enum fixed_addresses idx0, idx;
+	int i, slot;
 
 	WARN_ON(system_state != SYSTEM_BOOTING);
 
-	nesting = early_ioremap_nested;
+	slot = -1;
+	for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+		if (!prev_map[i]) {
+			slot = i;
+			break;
+		}
+	}
+
+	if (slot < 0) {
+		printk(KERN_INFO "early_iomap(%08lx, %08lx) not found slot\n",
+			 phys_addr, size);
+		WARN_ON(1);
+		return NULL;
+	}
+
 	if (early_ioremap_debug) {
 		printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
-		       phys_addr, size, nesting);
+		       phys_addr, size, slot);
 		dump_stack();
 	}
 
@@ -585,17 +682,13 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
 		return NULL;
 	}
 
-	if (nesting >= FIX_BTMAPS_NESTING) {
-		WARN_ON(1);
-		return NULL;
-	}
-	early_ioremap_nested++;
+	prev_size[slot] = size;
 	/*
 	 * Mappings have to be page-aligned
 	 */
 	offset = phys_addr & ~PAGE_MASK;
 	phys_addr &= PAGE_MASK;
-	size = PAGE_ALIGN(last_addr) - phys_addr;
+	size = PAGE_ALIGN(last_addr + 1) - phys_addr;
 
 	/*
 	 * Mappings have to fit in the FIX_BTMAP area.
@@ -609,10 +702,10 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
 	/*
 	 * Ok, go for it..
 	 */
-	idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
+	idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
 	idx = idx0;
 	while (nrpages > 0) {
-		early_set_fixmap(idx, phys_addr);
+		early_set_fixmap(idx, phys_addr, prot);
 		phys_addr += PAGE_SIZE;
 		--idx;
 		--nrpages;
@@ -620,7 +713,20 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
 	if (early_ioremap_debug)
 		printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
 
-	return (void *) (offset + fix_to_virt(idx0));
+	prev_map[slot] = (void *) (offset + fix_to_virt(idx0));
+	return prev_map[slot];
+}
+
+/* Remap an IO device */
+void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
+{
+	return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO);
+}
+
+/* Remap memory */
+void __init *early_memremap(unsigned long phys_addr, unsigned long size)
+{
+	return __early_ioremap(phys_addr, size, PAGE_KERNEL);
 }
 
 void __init early_iounmap(void *addr, unsigned long size)
@@ -629,15 +735,33 @@ void __init early_iounmap(void *addr, unsigned long size)
 	unsigned long offset;
 	unsigned int nrpages;
 	enum fixed_addresses idx;
-	int nesting;
+	int i, slot;
+
+	slot = -1;
+	for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+		if (prev_map[i] == addr) {
+			slot = i;
+			break;
+		}
+	}
+
+	if (slot < 0) {
+		printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n",
+			 addr, size);
+		WARN_ON(1);
+		return;
+	}
 
-	nesting = --early_ioremap_nested;
-	if (WARN_ON(nesting < 0))
+	if (prev_size[slot] != size) {
+		printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
+			 addr, size, slot, prev_size[slot]);
+		WARN_ON(1);
 		return;
+	}
 
 	if (early_ioremap_debug) {
 		printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
-			size, nesting);
+			size, slot);
 		dump_stack();
 	}
 
@@ -649,12 +773,13 @@ void __init early_iounmap(void *addr, unsigned long size)
 	offset = virt_addr & ~PAGE_MASK;
 	nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
 
-	idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
+	idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
 	while (nrpages > 0) {
 		early_clear_fixmap(idx);
 		--idx;
 		--nrpages;
 	}
+	prev_map[slot] = 0;
 }
 
 void __this_fixmap_does_not_exist(void)
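
The prev_map[]/prev_size[] bookkeeping above replaces the old early_ioremap_nested counter: each mapping takes a free FIX_BTMAPS slot keyed by the returned pointer, so teardown no longer has to happen in strict LIFO order, and early_iounmap() can verify the caller passed back the same size it mapped. A usage sketch follows - the function and the physical addresses are hypothetical examples, not kernel code:

	void __init early_map_example(void)
	{
		void *regs = early_ioremap(0xfed00000UL, 0x1000);	/* takes slot 0 */
		void *tbl  = early_memremap(0x000f0000UL, 0x400);	/* takes slot 1 */

		/* any unmap order works now, but sizes must match */
		early_iounmap(regs, 0x1000);
		early_iounmap(tbl, 0x400);
	}
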
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 672e17f8262..9cab18b0b85 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -61,9 +61,9 @@ static void __init memtest(unsigned long start_phys, unsigned long size,
 			last_bad += incr;
 		} else {
 			if (start_bad) {
-				printk(KERN_CONT "\n  %010lx bad mem addr %010lx - %010lx reserved",
+				printk(KERN_CONT "\n  %016lx bad mem addr %010lx - %010lx reserved",
 					val, start_bad, last_bad + incr);
-				reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
+				reserve_early(start_bad, last_bad + incr, "BAD RAM");
 			}
 			start_bad = last_bad = start_phys_aligned;
 		}
@@ -72,9 +72,8 @@ static void __init memtest(unsigned long start_phys, unsigned long size,
 	if (start_bad) {
 		printk(KERN_CONT "\n  %016lx bad mem addr %010lx - %010lx reserved",
 			val, start_bad, last_bad + incr);
-		reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
+		reserve_early(start_bad, last_bad + incr, "BAD RAM");
 	}
-
 }
 
 /* default is disabled */
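
The memtest.c change fixes an argument mix-up: reserve_early() takes a (start, end) physical range, but the old calls passed a length (last_bad - start_bad) where the end address belongs. A worked example, assuming a bad region with start_bad = 0x100000 and last_bad + incr = 0x101000: the old call reserve_early(0x100000, 0x1000, "BAD RAM") describes an empty, backwards range [0x100000, 0x1000), while the fixed reserve_early(0x100000, 0x101000, "BAD RAM") reserves the whole 4 KiB of bad memory. The %010lx -> %016lx change simply prints the full 64-bit test pattern.
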
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 635b50e8558..2c4baa88f2c 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -56,13 +56,6 @@ struct remap_trace {
 static DEFINE_PER_CPU(struct trap_reason, pf_reason);
 static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
 
-#if 0 /* XXX: no way gather this info anymore */
-/* Access to this is not per-cpu. */
-static DEFINE_PER_CPU(atomic_t, dropped);
-#endif
-
-static struct dentry *marker_file;
-
 static DEFINE_MUTEX(mmiotrace_mutex);
 static DEFINE_SPINLOCK(trace_lock);
 static atomic_t mmiotrace_enabled;
@@ -75,7 +68,7 @@ static LIST_HEAD(trace_list); /* struct remap_trace */
  * and trace_lock.
  * - Routines depending on is_enabled() must take trace_lock.
  * - trace_list users must hold trace_lock.
- * - is_enabled() guarantees that mmio_trace_record is allowed.
+ * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed.
  * - pre/post callbacks assume the effect of is_enabled() being true.
  */
 
@@ -97,44 +90,6 @@ static bool is_enabled(void)
 	return atomic_read(&mmiotrace_enabled);
 }
 
-#if 0 /* XXX: needs rewrite */
-/*
- * Write callback for the debugfs entry:
- * Read a marker and write it to the mmio trace log
- */
-static ssize_t write_marker(struct file *file, const char __user *buffer,
-						size_t count, loff_t *ppos)
-{
-	char *event = NULL;
-	struct mm_io_header *headp;
-	ssize_t len = (count > 65535) ? 65535 : count;
-
-	event = kzalloc(sizeof(*headp) + len, GFP_KERNEL);
-	if (!event)
-		return -ENOMEM;
-
-	headp = (struct mm_io_header *)event;
-	headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT);
-	headp->data_len = len;
-
-	if (copy_from_user(event + sizeof(*headp), buffer, len)) {
-		kfree(event);
-		return -EFAULT;
-	}
-
-	spin_lock_irq(&trace_lock);
-#if 0 /* XXX: convert this to use tracing */
-	if (is_enabled())
-		relay_write(chan, event, sizeof(*headp) + len);
-	else
-#endif
-		len = -EINVAL;
-	spin_unlock_irq(&trace_lock);
-	kfree(event);
-	return len;
-}
-#endif
-
 static void print_pte(unsigned long address)
 {
 	unsigned int level;
@@ -307,8 +262,10 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size,
 	map.map_id = trace->id;
 
 	spin_lock_irq(&trace_lock);
-	if (!is_enabled())
+	if (!is_enabled()) {
+		kfree(trace);
 		goto not_enabled;
+	}
 
 	mmio_trace_mapping(&map);
 	list_add_tail(&trace->list, &trace_list);
@@ -377,6 +334,23 @@ void mmiotrace_iounmap(volatile void __iomem *addr)
 		iounmap_trace_core(addr);
 }
 
+int mmiotrace_printk(const char *fmt, ...)
+{
+	int ret = 0;
+	va_list args;
+	unsigned long flags;
+	va_start(args, fmt);
+
+	spin_lock_irqsave(&trace_lock, flags);
+	if (is_enabled())
+		ret = mmio_trace_printk(fmt, args);
+	spin_unlock_irqrestore(&trace_lock, flags);
+
+	va_end(args);
+	return ret;
+}
+EXPORT_SYMBOL(mmiotrace_printk);
+
 static void clear_trace_list(void)
 {
 	struct remap_trace *trace;
@@ -462,26 +436,12 @@ static void leave_uniprocessor(void)
 }
 #endif
 
-#if 0 /* XXX: out of order */
-static struct file_operations fops_marker = {
-	.owner =	THIS_MODULE,
-	.write =	write_marker
-};
-#endif
-
 void enable_mmiotrace(void)
 {
 	mutex_lock(&mmiotrace_mutex);
 	if (is_enabled())
 		goto out;
 
-#if 0 /* XXX: tracing does not support text entries */
-	marker_file = debugfs_create_file("marker", 0660, dir, NULL,
-						&fops_marker);
-	if (!marker_file)
-		pr_err(NAME "marker file creation failed.\n");
-#endif
-
 	if (nommiotrace)
 		pr_info(NAME "MMIO tracing disabled.\n");
 	enter_uniprocessor();
@@ -506,11 +466,6 @@ void disable_mmiotrace(void)
 
 	clear_trace_list(); /* guarantees: no more kmmio callbacks */
 	leave_uniprocessor();
-	if (marker_file) {
-		debugfs_remove(marker_file);
-		marker_file = NULL;
-	}
-
 	pr_info(NAME "disabled.\n");
 out:
 	mutex_unlock(&mmiotrace_mutex);
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/numa_32.c
index 62fa440678d..847c164725f 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -328,7 +328,7 @@ void __init initmem_init(unsigned long start_pfn,
 
 	get_memcfg_numa();
 
-	kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
+	kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
 
 	kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
 	do {
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a4dd793d600..cebcbf152d4 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -79,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void)
 		return 0;
 
 	addr = 0x8000;
-	nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
+	nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
 	nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
 				      nodemap_size, L1_CACHE_BYTES);
 	if (nodemap_addr == -1UL) {
@@ -176,10 +176,10 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
 	unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
 	unsigned long bootmap_start, nodedata_phys;
 	void *bootmap;
-	const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
+	const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
 	int nid;
 
-	start = round_up(start, ZONE_ALIGN);
+	start = roundup(start, ZONE_ALIGN);
 
 	printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
 	       start, end);
@@ -210,9 +210,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
 	bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
 	nid = phys_to_nid(nodedata_phys);
 	if (nid == nodeid)
-		bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
+		bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
 	else
-		bootmap_start = round_up(start, PAGE_SIZE);
+		bootmap_start = roundup(start, PAGE_SIZE);
 	/*
 	 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
 	 * to use that to align to PAGE_SIZE
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index d4aa503caaa..e1d10690921 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -32,7 +32,7 @@ enum {
32 GPS = (1<<30) 32 GPS = (1<<30)
33}; 33};
34 34
35#define PAGE_TESTBIT __pgprot(_PAGE_UNUSED1) 35#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST)
36 36
37static int pte_testbit(pte_t pte) 37static int pte_testbit(pte_t pte)
38{ 38{
@@ -118,6 +118,7 @@ static int pageattr_test(void)
118 unsigned int level; 118 unsigned int level;
119 int i, k; 119 int i, k;
120 int err; 120 int err;
121 unsigned long test_addr;
121 122
122 if (print) 123 if (print)
123 printk(KERN_INFO "CPA self-test:\n"); 124 printk(KERN_INFO "CPA self-test:\n");
@@ -172,7 +173,8 @@ static int pageattr_test(void)
172 continue; 173 continue;
173 } 174 }
174 175
175 err = change_page_attr_set(addr[i], len[i], PAGE_TESTBIT); 176 test_addr = addr[i];
177 err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0);
176 if (err < 0) { 178 if (err < 0) {
177 printk(KERN_ERR "CPA %d failed %d\n", i, err); 179 printk(KERN_ERR "CPA %d failed %d\n", i, err);
178 failed++; 180 failed++;
@@ -204,7 +206,8 @@ static int pageattr_test(void)
204 failed++; 206 failed++;
205 continue; 207 continue;
206 } 208 }
207 err = change_page_attr_clear(addr[i], len[i], PAGE_TESTBIT); 209 test_addr = addr[i];
210 err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0);
208 if (err < 0) { 211 if (err < 0) {
209 printk(KERN_ERR "CPA reverting failed: %d\n", err); 212 printk(KERN_ERR "CPA reverting failed: %d\n", err);
210 failed++; 213 failed++;
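
The test_addr temporary above is not cosmetic: the reworked CPA entry points (changed in pageattr.c below) take unsigned long *addr, and in non-array mode the core page-aligns the value and advances it as pages are processed, so a caller must hand in a scratch copy of any address it still needs. A condensed sketch of the two calling modes (illustrative only: change_page_attr_set() is static to pageattr.c, and vaddr/va0..va2 are hypothetical page-aligned kernel addresses):

	/* Single-range mode: 4 contiguous pages from vaddr.  The core
	 * may rewrite the value behind the pointer, hence the copy. */
	unsigned long scratch = vaddr;
	err = change_page_attr_set(&scratch, 4, __pgprot(_PAGE_NX), 0);

	/* Array mode: one page per element, pages need not be contiguous;
	 * the core forces numpages = 1 per step, so large pages are
	 * never used for array entries. */
	unsigned long pages[3] = { va0, va1, va2 };
	err = change_page_attr_set(pages, 3, __pgprot(_PAGE_NX), 1);
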
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 43e2f8483e4..f1dc1b75d16 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -25,15 +25,27 @@
25 * The current flushing context - we pass it instead of 5 arguments: 25 * The current flushing context - we pass it instead of 5 arguments:
26 */ 26 */
27struct cpa_data { 27struct cpa_data {
28 unsigned long vaddr; 28 unsigned long *vaddr;
29 pgprot_t mask_set; 29 pgprot_t mask_set;
30 pgprot_t mask_clr; 30 pgprot_t mask_clr;
31 int numpages; 31 int numpages;
32 int flushtlb; 32 int flags;
33 unsigned long pfn; 33 unsigned long pfn;
34 unsigned force_split : 1; 34 unsigned force_split : 1;
35 int curpage;
35}; 36};
36 37
38/*
39 * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
40 * using cpa_lock, so that we don't allow any other cpu, with stale large tlb
41 * entries, to change the page attributes in parallel while some other cpu is
42 * splitting a large page entry and changing the attributes.
43 */
44static DEFINE_SPINLOCK(cpa_lock);
45
46#define CPA_FLUSHTLB 1
47#define CPA_ARRAY 2
48
37#ifdef CONFIG_PROC_FS 49#ifdef CONFIG_PROC_FS
38static unsigned long direct_pages_count[PG_LEVEL_NUM]; 50static unsigned long direct_pages_count[PG_LEVEL_NUM];
39 51
@@ -53,23 +65,22 @@ static void split_page_count(int level)
53 direct_pages_count[level - 1] += PTRS_PER_PTE; 65 direct_pages_count[level - 1] += PTRS_PER_PTE;
54} 66}
55 67
56int arch_report_meminfo(char *page) 68void arch_report_meminfo(struct seq_file *m)
57{ 69{
58 int n = sprintf(page, "DirectMap4k: %8lu kB\n", 70 seq_printf(m, "DirectMap4k: %8lu kB\n",
59 direct_pages_count[PG_LEVEL_4K] << 2); 71 direct_pages_count[PG_LEVEL_4K] << 2);
60#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 72#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
61 n += sprintf(page + n, "DirectMap2M: %8lu kB\n", 73 seq_printf(m, "DirectMap2M: %8lu kB\n",
62 direct_pages_count[PG_LEVEL_2M] << 11); 74 direct_pages_count[PG_LEVEL_2M] << 11);
63#else 75#else
64 n += sprintf(page + n, "DirectMap4M: %8lu kB\n", 76 seq_printf(m, "DirectMap4M: %8lu kB\n",
65 direct_pages_count[PG_LEVEL_2M] << 12); 77 direct_pages_count[PG_LEVEL_2M] << 12);
66#endif 78#endif
67#ifdef CONFIG_X86_64 79#ifdef CONFIG_X86_64
68 if (direct_gbpages) 80 if (direct_gbpages)
69 n += sprintf(page + n, "DirectMap1G: %8lu kB\n", 81 seq_printf(m, "DirectMap1G: %8lu kB\n",
70 direct_pages_count[PG_LEVEL_1G] << 20); 82 direct_pages_count[PG_LEVEL_1G] << 20);
71#endif 83#endif
72 return n;
73} 84}
74#else 85#else
75static inline void split_page_count(int level) { } 86static inline void split_page_count(int level) { }
@@ -84,7 +95,7 @@ static inline unsigned long highmap_start_pfn(void)
84 95
85static inline unsigned long highmap_end_pfn(void) 96static inline unsigned long highmap_end_pfn(void)
86{ 97{
87 return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; 98 return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
88} 99}
89 100
90#endif 101#endif
@@ -190,6 +201,41 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
190 } 201 }
191} 202}
192 203
204static void cpa_flush_array(unsigned long *start, int numpages, int cache)
205{
206 unsigned int i, level;
207 unsigned long *addr;
208
209 BUG_ON(irqs_disabled());
210
211 on_each_cpu(__cpa_flush_range, NULL, 1);
212
213 if (!cache)
214 return;
215
216 /* 4M threshold */
217 if (numpages >= 1024) {
218 if (boot_cpu_data.x86_model >= 4)
219 wbinvd();
220 return;
221 }
222 /*
223 * We only need to flush on one CPU:
224 * clflush is a MESI-coherent instruction that
225 * will cause all other CPUs to flush the same
226 * cachelines:
227 */
228 for (i = 0, addr = start; i < numpages; i++, addr++) {
229 pte_t *pte = lookup_address(*addr, &level);
230
231 /*
232 * Only flush present addresses:
233 */
234 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
235 clflush_cache_range((void *) *addr, PAGE_SIZE);
236 }
237}
238
193/* 239/*
194 * Certain areas of memory on x86 require very specific protection flags, 240 * Certain areas of memory on x86 require very specific protection flags,
195 * for example the BIOS area or kernel text. Callers don't always get this 241 * for example the BIOS area or kernel text. Callers don't always get this
@@ -398,7 +444,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
398 */ 444 */
399 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); 445 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
400 __set_pmd_pte(kpte, address, new_pte); 446 __set_pmd_pte(kpte, address, new_pte);
401 cpa->flushtlb = 1; 447 cpa->flags |= CPA_FLUSHTLB;
402 do_split = 0; 448 do_split = 0;
403 } 449 }
404 450
@@ -408,84 +454,6 @@ out_unlock:
408 return do_split; 454 return do_split;
409} 455}
410 456
411static LIST_HEAD(page_pool);
412static unsigned long pool_size, pool_pages, pool_low;
413static unsigned long pool_used, pool_failed;
414
415static void cpa_fill_pool(struct page **ret)
416{
417 gfp_t gfp = GFP_KERNEL;
418 unsigned long flags;
419 struct page *p;
420
421 /*
422 * Avoid recursion (on debug-pagealloc) and also signal
423 * our priority to get to these pagetables:
424 */
425 if (current->flags & PF_MEMALLOC)
426 return;
427 current->flags |= PF_MEMALLOC;
428
429 /*
430 * Allocate atomically from atomic contexts:
431 */
432 if (in_atomic() || irqs_disabled() || debug_pagealloc)
433 gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
434
435 while (pool_pages < pool_size || (ret && !*ret)) {
436 p = alloc_pages(gfp, 0);
437 if (!p) {
438 pool_failed++;
439 break;
440 }
441 /*
442 * If the call site needs a page right now, provide it:
443 */
444 if (ret && !*ret) {
445 *ret = p;
446 continue;
447 }
448 spin_lock_irqsave(&pgd_lock, flags);
449 list_add(&p->lru, &page_pool);
450 pool_pages++;
451 spin_unlock_irqrestore(&pgd_lock, flags);
452 }
453
454 current->flags &= ~PF_MEMALLOC;
455}
456
457#define SHIFT_MB (20 - PAGE_SHIFT)
458#define ROUND_MB_GB ((1 << 10) - 1)
459#define SHIFT_MB_GB 10
460#define POOL_PAGES_PER_GB 16
461
462void __init cpa_init(void)
463{
464 struct sysinfo si;
465 unsigned long gb;
466
467 si_meminfo(&si);
468 /*
469 * Calculate the number of pool pages:
470 *
471 * Convert totalram (nr of pages) to MiB and round to the next
472 * GiB. Shift MiB to Gib and multiply the result by
473 * POOL_PAGES_PER_GB:
474 */
475 if (debug_pagealloc) {
476 gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
477 pool_size = POOL_PAGES_PER_GB * gb;
478 } else {
479 pool_size = 1;
480 }
481 pool_low = pool_size;
482
483 cpa_fill_pool(NULL);
484 printk(KERN_DEBUG
485 "CPA: page pool initialized %lu of %lu pages preallocated\n",
486 pool_pages, pool_size);
487}
488
489static int split_large_page(pte_t *kpte, unsigned long address) 457static int split_large_page(pte_t *kpte, unsigned long address)
490{ 458{
491 unsigned long flags, pfn, pfninc = 1; 459 unsigned long flags, pfn, pfninc = 1;
@@ -494,28 +462,15 @@ static int split_large_page(pte_t *kpte, unsigned long address)
494 pgprot_t ref_prot; 462 pgprot_t ref_prot;
495 struct page *base; 463 struct page *base;
496 464
497 /* 465 if (!debug_pagealloc)
498 * Get a page from the pool. The pool list is protected by the 466 spin_unlock(&cpa_lock);
499 * pgd_lock, which we have to take anyway for the split 467 base = alloc_pages(GFP_KERNEL, 0);
500 * operation: 468 if (!debug_pagealloc)
501 */ 469 spin_lock(&cpa_lock);
502 spin_lock_irqsave(&pgd_lock, flags); 470 if (!base)
503 if (list_empty(&page_pool)) { 471 return -ENOMEM;
504 spin_unlock_irqrestore(&pgd_lock, flags);
505 base = NULL;
506 cpa_fill_pool(&base);
507 if (!base)
508 return -ENOMEM;
509 spin_lock_irqsave(&pgd_lock, flags);
510 } else {
511 base = list_first_entry(&page_pool, struct page, lru);
512 list_del(&base->lru);
513 pool_pages--;
514
515 if (pool_pages < pool_low)
516 pool_low = pool_pages;
517 }
518 472
473 spin_lock_irqsave(&pgd_lock, flags);
519 /* 474 /*
520 * Check for races, another CPU might have split this page 475 * Check for races, another CPU might have split this page
521 * up for us already: 476 * up for us already:
@@ -572,11 +527,8 @@ out_unlock:
572 * If we dropped out via the lookup_address check under 527 * If we dropped out via the lookup_address check under
573 * pgd_lock then stick the page back into the pool: 528 * pgd_lock then stick the page back into the pool:
574 */ 529 */
575 if (base) { 530 if (base)
576 list_add(&base->lru, &page_pool); 531 __free_page(base);
577 pool_pages++;
578 } else
579 pool_used++;
580 spin_unlock_irqrestore(&pgd_lock, flags); 532 spin_unlock_irqrestore(&pgd_lock, flags);
581 533
582 return 0; 534 return 0;
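
This unlock/alloc/relock sequence is the heart of the page-pool removal: cpa_lock is a spinlock, while alloc_pages(GFP_KERNEL, 0) may sleep, so the lock is dropped around the allocation and retaken before the pagetables are touched. The window this opens is closed by the existing lookup_address() re-check under pgd_lock, which frees the page again if another CPU already split the entry. A condensed restatement of the pattern, trimmed from the code above (not a drop-in replacement):

	if (!debug_pagealloc)
		spin_unlock(&cpa_lock);		/* GFP_KERNEL allocation may sleep */
	base = alloc_pages(GFP_KERNEL, 0);
	if (!debug_pagealloc)
		spin_lock(&cpa_lock);		/* cpa_lock only used for !DEBUG_PAGEALLOC */
	if (!base)
		return -ENOMEM;

	spin_lock_irqsave(&pgd_lock, flags);
	tmp = lookup_address(address, &level);	/* re-check under pgd_lock */
	if (tmp != kpte)
		goto out_unlock;		/* raced: out_unlock frees base */
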
@@ -584,11 +536,16 @@ out_unlock:
584 536
585static int __change_page_attr(struct cpa_data *cpa, int primary) 537static int __change_page_attr(struct cpa_data *cpa, int primary)
586{ 538{
587 unsigned long address = cpa->vaddr; 539 unsigned long address;
588 int do_split, err; 540 int do_split, err;
589 unsigned int level; 541 unsigned int level;
590 pte_t *kpte, old_pte; 542 pte_t *kpte, old_pte;
591 543
544 if (cpa->flags & CPA_ARRAY)
545 address = cpa->vaddr[cpa->curpage];
546 else
547 address = *cpa->vaddr;
548
592repeat: 549repeat:
593 kpte = lookup_address(address, &level); 550 kpte = lookup_address(address, &level);
594 if (!kpte) 551 if (!kpte)
@@ -600,7 +557,7 @@ repeat:
600 return 0; 557 return 0;
601 WARN(1, KERN_WARNING "CPA: called for zero pte. " 558 WARN(1, KERN_WARNING "CPA: called for zero pte. "
602 "vaddr = %lx cpa->vaddr = %lx\n", address, 559 "vaddr = %lx cpa->vaddr = %lx\n", address,
603 cpa->vaddr); 560 *cpa->vaddr);
604 return -EINVAL; 561 return -EINVAL;
605 } 562 }
606 563
@@ -626,7 +583,7 @@ repeat:
626 */ 583 */
627 if (pte_val(old_pte) != pte_val(new_pte)) { 584 if (pte_val(old_pte) != pte_val(new_pte)) {
628 set_pte_atomic(kpte, new_pte); 585 set_pte_atomic(kpte, new_pte);
629 cpa->flushtlb = 1; 586 cpa->flags |= CPA_FLUSHTLB;
630 } 587 }
631 cpa->numpages = 1; 588 cpa->numpages = 1;
632 return 0; 589 return 0;
@@ -650,7 +607,25 @@ repeat:
650 */ 607 */
651 err = split_large_page(kpte, address); 608 err = split_large_page(kpte, address);
652 if (!err) { 609 if (!err) {
653 cpa->flushtlb = 1; 610 /*
611 * Do a global tlb flush after splitting the large page
612 * and before we actually change the page attribute in the PTE.
613 *
614 * Without this, we violate the TLB application note, which says
615 * "The TLBs may contain both ordinary and large-page
616 * translations for a 4-KByte range of linear addresses. This
617 * may occur if software modifies the paging structures so that
618 * the page size used for the address range changes. If the two
619 * translations differ with respect to page frame or attributes
620 * (e.g., permissions), processor behavior is undefined and may
621 * be implementation-specific."
622 *
623 * We do this global tlb flush inside the cpa_lock, so that we
624 * don't allow any other cpu, with stale tlb entries, to change the
625 * page attribute in parallel for an address that also falls into the
626 * just-split large page entry.
627 */
628 flush_tlb_all();
654 goto repeat; 629 goto repeat;
655 } 630 }
656 631
@@ -663,6 +638,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
663{ 638{
664 struct cpa_data alias_cpa; 639 struct cpa_data alias_cpa;
665 int ret = 0; 640 int ret = 0;
641 unsigned long temp_cpa_vaddr, vaddr;
666 642
667 if (cpa->pfn >= max_pfn_mapped) 643 if (cpa->pfn >= max_pfn_mapped)
668 return 0; 644 return 0;
@@ -675,16 +651,24 @@ static int cpa_process_alias(struct cpa_data *cpa)
675 * No need to redo, when the primary call touched the direct 651 * No need to redo, when the primary call touched the direct
676 * mapping already: 652 * mapping already:
677 */ 653 */
678 if (!(within(cpa->vaddr, PAGE_OFFSET, 654 if (cpa->flags & CPA_ARRAY)
655 vaddr = cpa->vaddr[cpa->curpage];
656 else
657 vaddr = *cpa->vaddr;
658
659 if (!(within(vaddr, PAGE_OFFSET,
679 PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT)) 660 PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
680#ifdef CONFIG_X86_64 661#ifdef CONFIG_X86_64
681 || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32), 662 || within(vaddr, PAGE_OFFSET + (1UL<<32),
682 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)) 663 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
683#endif 664#endif
684 )) { 665 )) {
685 666
686 alias_cpa = *cpa; 667 alias_cpa = *cpa;
687 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); 668 temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
669 alias_cpa.vaddr = &temp_cpa_vaddr;
670 alias_cpa.flags &= ~CPA_ARRAY;
671
688 672
689 ret = __change_page_attr_set_clr(&alias_cpa, 0); 673 ret = __change_page_attr_set_clr(&alias_cpa, 0);
690 } 674 }
@@ -696,7 +680,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
696 * No need to redo, when the primary call touched the high 680 * No need to redo, when the primary call touched the high
697 * mapping already: 681 * mapping already:
698 */ 682 */
699 if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end)) 683 if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
700 return 0; 684 return 0;
701 685
702 /* 686 /*
@@ -707,8 +691,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
707 return 0; 691 return 0;
708 692
709 alias_cpa = *cpa; 693 alias_cpa = *cpa;
710 alias_cpa.vaddr = 694 temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
711 (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; 695 alias_cpa.vaddr = &temp_cpa_vaddr;
696 alias_cpa.flags &= ~CPA_ARRAY;
712 697
713 /* 698 /*
714 * The high mapping range is imprecise, so ignore the return value. 699 * The high mapping range is imprecise, so ignore the return value.
@@ -728,8 +713,15 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
728 * preservation check. 713 * preservation check.
729 */ 714 */
730 cpa->numpages = numpages; 715 cpa->numpages = numpages;
716 /* for array changes, we can't use large pages */
717 if (cpa->flags & CPA_ARRAY)
718 cpa->numpages = 1;
731 719
720 if (!debug_pagealloc)
721 spin_lock(&cpa_lock);
732 ret = __change_page_attr(cpa, checkalias); 722 ret = __change_page_attr(cpa, checkalias);
723 if (!debug_pagealloc)
724 spin_unlock(&cpa_lock);
733 if (ret) 725 if (ret)
734 return ret; 726 return ret;
735 727
@@ -746,7 +738,11 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
746 */ 738 */
747 BUG_ON(cpa->numpages > numpages); 739 BUG_ON(cpa->numpages > numpages);
748 numpages -= cpa->numpages; 740 numpages -= cpa->numpages;
749 cpa->vaddr += cpa->numpages * PAGE_SIZE; 741 if (cpa->flags & CPA_ARRAY)
742 cpa->curpage++;
743 else
744 *cpa->vaddr += cpa->numpages * PAGE_SIZE;
745
750 } 746 }
751 return 0; 747 return 0;
752} 748}
@@ -757,9 +753,9 @@ static inline int cache_attr(pgprot_t attr)
757 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); 753 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
758} 754}
759 755
760static int change_page_attr_set_clr(unsigned long addr, int numpages, 756static int change_page_attr_set_clr(unsigned long *addr, int numpages,
761 pgprot_t mask_set, pgprot_t mask_clr, 757 pgprot_t mask_set, pgprot_t mask_clr,
762 int force_split) 758 int force_split, int array)
763{ 759{
764 struct cpa_data cpa; 760 struct cpa_data cpa;
765 int ret, cache, checkalias; 761 int ret, cache, checkalias;
@@ -774,21 +770,40 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
774 return 0; 770 return 0;
775 771
776 /* Ensure we are PAGE_SIZE aligned */ 772 /* Ensure we are PAGE_SIZE aligned */
777 if (addr & ~PAGE_MASK) { 773 if (!array) {
778 addr &= PAGE_MASK; 774 if (*addr & ~PAGE_MASK) {
779 /* 775 *addr &= PAGE_MASK;
780 * People should not be passing in unaligned addresses: 776 /*
781 */ 777 * People should not be passing in unaligned addresses:
782 WARN_ON_ONCE(1); 778 */
779 WARN_ON_ONCE(1);
780 }
781 } else {
782 int i;
783 for (i = 0; i < numpages; i++) {
784 if (addr[i] & ~PAGE_MASK) {
785 addr[i] &= PAGE_MASK;
786 WARN_ON_ONCE(1);
787 }
788 }
783 } 789 }
784 790
791 /* Must avoid aliasing mappings in the highmem code */
792 kmap_flush_unused();
793
794 vm_unmap_aliases();
795
785 cpa.vaddr = addr; 796 cpa.vaddr = addr;
786 cpa.numpages = numpages; 797 cpa.numpages = numpages;
787 cpa.mask_set = mask_set; 798 cpa.mask_set = mask_set;
788 cpa.mask_clr = mask_clr; 799 cpa.mask_clr = mask_clr;
789 cpa.flushtlb = 0; 800 cpa.flags = 0;
801 cpa.curpage = 0;
790 cpa.force_split = force_split; 802 cpa.force_split = force_split;
791 803
804 if (array)
805 cpa.flags |= CPA_ARRAY;
806
792 /* No alias checking for _NX bit modifications */ 807 /* No alias checking for _NX bit modifications */
793 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; 808 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
794 809
@@ -797,7 +812,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
797 /* 812 /*
798 * Check whether we really changed something: 813 * Check whether we really changed something:
799 */ 814 */
800 if (!cpa.flushtlb) 815 if (!(cpa.flags & CPA_FLUSHTLB))
801 goto out; 816 goto out;
802 817
803 /* 818 /*
@@ -812,27 +827,30 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
812 * error case we fall back to cpa_flush_all (which uses 827 * error case we fall back to cpa_flush_all (which uses
813 * wbindv): 828 * wbindv):
814 */ 829 */
815 if (!ret && cpu_has_clflush) 830 if (!ret && cpu_has_clflush) {
816 cpa_flush_range(addr, numpages, cache); 831 if (cpa.flags & CPA_ARRAY)
817 else 832 cpa_flush_array(addr, numpages, cache);
833 else
834 cpa_flush_range(*addr, numpages, cache);
835 } else
818 cpa_flush_all(cache); 836 cpa_flush_all(cache);
819 837
820out: 838out:
821 cpa_fill_pool(NULL);
822
823 return ret; 839 return ret;
824} 840}
825 841
826static inline int change_page_attr_set(unsigned long addr, int numpages, 842static inline int change_page_attr_set(unsigned long *addr, int numpages,
827 pgprot_t mask) 843 pgprot_t mask, int array)
828{ 844{
829 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0); 845 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
846 array);
830} 847}
831 848
832static inline int change_page_attr_clear(unsigned long addr, int numpages, 849static inline int change_page_attr_clear(unsigned long *addr, int numpages,
833 pgprot_t mask) 850 pgprot_t mask, int array)
834{ 851{
835 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0); 852 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
853 array);
836} 854}
837 855
838int _set_memory_uc(unsigned long addr, int numpages) 856int _set_memory_uc(unsigned long addr, int numpages)
@@ -840,8 +858,8 @@ int _set_memory_uc(unsigned long addr, int numpages)
840 /* 858 /*
841 * for now UC MINUS. see comments in ioremap_nocache() 859 * for now UC MINUS. see comments in ioremap_nocache()
842 */ 860 */
843 return change_page_attr_set(addr, numpages, 861 return change_page_attr_set(&addr, numpages,
844 __pgprot(_PAGE_CACHE_UC_MINUS)); 862 __pgprot(_PAGE_CACHE_UC_MINUS), 0);
845} 863}
846 864
847int set_memory_uc(unsigned long addr, int numpages) 865int set_memory_uc(unsigned long addr, int numpages)
@@ -857,10 +875,48 @@ int set_memory_uc(unsigned long addr, int numpages)
857} 875}
858EXPORT_SYMBOL(set_memory_uc); 876EXPORT_SYMBOL(set_memory_uc);
859 877
878int set_memory_array_uc(unsigned long *addr, int addrinarray)
879{
880 unsigned long start;
881 unsigned long end;
882 int i;
883 /*
884 * for now UC MINUS. see comments in ioremap_nocache()
885 */
886 for (i = 0; i < addrinarray; i++) {
887 start = __pa(addr[i]);
888 for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
889 if (end != __pa(addr[i + 1]))
890 break;
891 i++;
892 }
893 if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
894 goto out;
895 }
896
897 return change_page_attr_set(addr, addrinarray,
898 __pgprot(_PAGE_CACHE_UC_MINUS), 1);
899out:
900 for (i = 0; i < addrinarray; i++) {
901 unsigned long tmp = __pa(addr[i]);
902
903 if (tmp == start)
904 break;
905 for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
906 if (end != __pa(addr[i + 1]))
907 break;
908 i++;
909 }
910 free_memtype(tmp, end);
911 }
912 return -EINVAL;
913}
914EXPORT_SYMBOL(set_memory_array_uc);
915
860int _set_memory_wc(unsigned long addr, int numpages) 916int _set_memory_wc(unsigned long addr, int numpages)
861{ 917{
862 return change_page_attr_set(addr, numpages, 918 return change_page_attr_set(&addr, numpages,
863 __pgprot(_PAGE_CACHE_WC)); 919 __pgprot(_PAGE_CACHE_WC), 0);
864} 920}
865 921
866int set_memory_wc(unsigned long addr, int numpages) 922int set_memory_wc(unsigned long addr, int numpages)
@@ -878,8 +934,8 @@ EXPORT_SYMBOL(set_memory_wc);
878 934
879int _set_memory_wb(unsigned long addr, int numpages) 935int _set_memory_wb(unsigned long addr, int numpages)
880{ 936{
881 return change_page_attr_clear(addr, numpages, 937 return change_page_attr_clear(&addr, numpages,
882 __pgprot(_PAGE_CACHE_MASK)); 938 __pgprot(_PAGE_CACHE_MASK), 0);
883} 939}
884 940
885int set_memory_wb(unsigned long addr, int numpages) 941int set_memory_wb(unsigned long addr, int numpages)
@@ -890,37 +946,59 @@ int set_memory_wb(unsigned long addr, int numpages)
890} 946}
891EXPORT_SYMBOL(set_memory_wb); 947EXPORT_SYMBOL(set_memory_wb);
892 948
949int set_memory_array_wb(unsigned long *addr, int addrinarray)
950{
951 int i;
952
953 for (i = 0; i < addrinarray; i++) {
954 unsigned long start = __pa(addr[i]);
955 unsigned long end;
956
957 for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
958 if (end != __pa(addr[i + 1]))
959 break;
960 i++;
961 }
962 free_memtype(start, end);
963 }
964 return change_page_attr_clear(addr, addrinarray,
965 __pgprot(_PAGE_CACHE_MASK), 1);
966}
967EXPORT_SYMBOL(set_memory_array_wb);
968
893int set_memory_x(unsigned long addr, int numpages) 969int set_memory_x(unsigned long addr, int numpages)
894{ 970{
895 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); 971 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
896} 972}
897EXPORT_SYMBOL(set_memory_x); 973EXPORT_SYMBOL(set_memory_x);
898 974
899int set_memory_nx(unsigned long addr, int numpages) 975int set_memory_nx(unsigned long addr, int numpages)
900{ 976{
901 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); 977 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
902} 978}
903EXPORT_SYMBOL(set_memory_nx); 979EXPORT_SYMBOL(set_memory_nx);
904 980
905int set_memory_ro(unsigned long addr, int numpages) 981int set_memory_ro(unsigned long addr, int numpages)
906{ 982{
907 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); 983 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
908} 984}
985EXPORT_SYMBOL_GPL(set_memory_ro);
909 986
910int set_memory_rw(unsigned long addr, int numpages) 987int set_memory_rw(unsigned long addr, int numpages)
911{ 988{
912 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); 989 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
913} 990}
991EXPORT_SYMBOL_GPL(set_memory_rw);
914 992
915int set_memory_np(unsigned long addr, int numpages) 993int set_memory_np(unsigned long addr, int numpages)
916{ 994{
917 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); 995 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
918} 996}
919 997
920int set_memory_4k(unsigned long addr, int numpages) 998int set_memory_4k(unsigned long addr, int numpages)
921{ 999{
922 return change_page_attr_set_clr(addr, numpages, __pgprot(0), 1000 return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
923 __pgprot(0), 1); 1001 __pgprot(0), 1, 0);
924} 1002}
925 1003
926int set_pages_uc(struct page *page, int numpages) 1004int set_pages_uc(struct page *page, int numpages)
@@ -973,22 +1051,38 @@ int set_pages_rw(struct page *page, int numpages)
973 1051
974static int __set_pages_p(struct page *page, int numpages) 1052static int __set_pages_p(struct page *page, int numpages)
975{ 1053{
976 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), 1054 unsigned long tempaddr = (unsigned long) page_address(page);
1055 struct cpa_data cpa = { .vaddr = &tempaddr,
977 .numpages = numpages, 1056 .numpages = numpages,
978 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), 1057 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
979 .mask_clr = __pgprot(0)}; 1058 .mask_clr = __pgprot(0),
1059 .flags = 0};
980 1060
981 return __change_page_attr_set_clr(&cpa, 1); 1061 /*
1062 * No alias checking needed for setting the present flag; otherwise,
1063 * we may need to break large pages for 64-bit kernel text
1064 * mappings (this adds to complexity if we want to do this from
1065 * atomic context especially). Let's keep it simple!
1066 */
1067 return __change_page_attr_set_clr(&cpa, 0);
982} 1068}
983 1069
984static int __set_pages_np(struct page *page, int numpages) 1070static int __set_pages_np(struct page *page, int numpages)
985{ 1071{
986 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), 1072 unsigned long tempaddr = (unsigned long) page_address(page);
1073 struct cpa_data cpa = { .vaddr = &tempaddr,
987 .numpages = numpages, 1074 .numpages = numpages,
988 .mask_set = __pgprot(0), 1075 .mask_set = __pgprot(0),
989 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)}; 1076 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1077 .flags = 0};
990 1078
991 return __change_page_attr_set_clr(&cpa, 1); 1079 /*
1080 * No alias checking needed for setting the not-present flag; otherwise,
1081 * we may need to break large pages for 64-bit kernel text
1082 * mappings (this adds to complexity if we want to do this from
1083 * atomic context especially). Let's keep it simple!
1084 */
1085 return __change_page_attr_set_clr(&cpa, 0);
992} 1086}
993 1087
994void kernel_map_pages(struct page *page, int numpages, int enable) 1088void kernel_map_pages(struct page *page, int numpages, int enable)
@@ -1008,11 +1102,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
1008 1102
1009 /* 1103 /*
1010 * The return value is ignored as the calls cannot fail. 1104 * The return value is ignored as the calls cannot fail.
1011 * Large pages are kept enabled at boot time, and are 1105 * Large pages for identity mappings are not used at boot time
1012 * split up quickly with DEBUG_PAGEALLOC. If a splitup 1106 * and hence no memory allocations happen during large page splits.
1013 * fails here (due to temporary memory shortage) no damage
1014 * is done because we just keep the largepage intact up
1015 * to the next attempt when it will likely be split up:
1016 */ 1107 */
1017 if (enable) 1108 if (enable)
1018 __set_pages_p(page, numpages); 1109 __set_pages_p(page, numpages);
@@ -1024,53 +1115,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
1024 * but that can deadlock->flush only current cpu: 1115 * but that can deadlock->flush only current cpu:
1025 */ 1116 */
1026 __flush_tlb_all(); 1117 __flush_tlb_all();
1027
1028 /*
1029 * Try to refill the page pool here. We can do this only after
1030 * the tlb flush.
1031 */
1032 cpa_fill_pool(NULL);
1033} 1118}
1034 1119
1035#ifdef CONFIG_DEBUG_FS
1036static int dpa_show(struct seq_file *m, void *v)
1037{
1038 seq_puts(m, "DEBUG_PAGEALLOC\n");
1039 seq_printf(m, "pool_size : %lu\n", pool_size);
1040 seq_printf(m, "pool_pages : %lu\n", pool_pages);
1041 seq_printf(m, "pool_low : %lu\n", pool_low);
1042 seq_printf(m, "pool_used : %lu\n", pool_used);
1043 seq_printf(m, "pool_failed : %lu\n", pool_failed);
1044
1045 return 0;
1046}
1047
1048static int dpa_open(struct inode *inode, struct file *filp)
1049{
1050 return single_open(filp, dpa_show, NULL);
1051}
1052
1053static const struct file_operations dpa_fops = {
1054 .open = dpa_open,
1055 .read = seq_read,
1056 .llseek = seq_lseek,
1057 .release = single_release,
1058};
1059
1060static int __init debug_pagealloc_proc_init(void)
1061{
1062 struct dentry *de;
1063
1064 de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
1065 &dpa_fops);
1066 if (!de)
1067 return -ENOMEM;
1068
1069 return 0;
1070}
1071__initcall(debug_pagealloc_proc_init);
1072#endif
1073
1074#ifdef CONFIG_HIBERNATION 1120#ifdef CONFIG_HIBERNATION
1075 1121
1076bool kernel_page_present(struct page *page) 1122bool kernel_page_present(struct page *page)
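
The pageattr.c changes culminate in the two exported array calls, set_memory_array_uc() and set_memory_array_wb(), which let a driver convert many scattered pages with a single CPA pass and a single cache/TLB flush instead of one call per page. A hypothetical driver fragment (the allocation loop and failure unwinding are simplified):

static int uc_map_scattered(unsigned long *va, int n)
{
	int i, err;

	for (i = 0; i < n; i++) {
		struct page *pg = alloc_page(GFP_KERNEL);

		if (!pg)
			return -ENOMEM;		/* unwind omitted for brevity */
		va[i] = (unsigned long)page_address(pg);
	}

	/* One CPA invocation and one flush for all n pages: */
	err = set_memory_array_uc(va, n);
	if (err)
		return err;

	/* ... hand the pages to the device ... */

	return set_memory_array_wb(va, n);	/* restore write-back */
}
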
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 2a50e0fa64a..738fd0f2495 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -7,24 +7,24 @@
7 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. 7 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
8 */ 8 */
9 9
10#include <linux/mm.h> 10#include <linux/seq_file.h>
11#include <linux/bootmem.h>
12#include <linux/debugfs.h>
11#include <linux/kernel.h> 13#include <linux/kernel.h>
12#include <linux/gfp.h> 14#include <linux/gfp.h>
15#include <linux/mm.h>
13#include <linux/fs.h> 16#include <linux/fs.h>
14#include <linux/bootmem.h>
15#include <linux/debugfs.h>
16#include <linux/seq_file.h>
17 17
18#include <asm/msr.h> 18#include <asm/cacheflush.h>
19#include <asm/tlbflush.h>
20#include <asm/processor.h> 19#include <asm/processor.h>
21#include <asm/page.h> 20#include <asm/tlbflush.h>
22#include <asm/pgtable.h> 21#include <asm/pgtable.h>
23#include <asm/pat.h>
24#include <asm/e820.h>
25#include <asm/cacheflush.h>
26#include <asm/fcntl.h> 22#include <asm/fcntl.h>
23#include <asm/e820.h>
27#include <asm/mtrr.h> 24#include <asm/mtrr.h>
25#include <asm/page.h>
26#include <asm/msr.h>
27#include <asm/pat.h>
28#include <asm/io.h> 28#include <asm/io.h>
29 29
30#ifdef CONFIG_X86_PAT 30#ifdef CONFIG_X86_PAT
@@ -46,6 +46,7 @@ early_param("nopat", nopat);
46 46
47 47
48static int debug_enable; 48static int debug_enable;
49
49static int __init pat_debug_setup(char *str) 50static int __init pat_debug_setup(char *str)
50{ 51{
51 debug_enable = 1; 52 debug_enable = 1;
@@ -145,14 +146,14 @@ static char *cattr_name(unsigned long flags)
145 */ 146 */
146 147
147struct memtype { 148struct memtype {
148 u64 start; 149 u64 start;
149 u64 end; 150 u64 end;
150 unsigned long type; 151 unsigned long type;
151 struct list_head nd; 152 struct list_head nd;
152}; 153};
153 154
154static LIST_HEAD(memtype_list); 155static LIST_HEAD(memtype_list);
155static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ 156static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
156 157
157/* 158/*
158 * Does intersection of PAT memory type and MTRR memory type and returns 159 * Does intersection of PAT memory type and MTRR memory type and returns
@@ -180,8 +181,8 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
180 return req_type; 181 return req_type;
181} 182}
182 183
183static int chk_conflict(struct memtype *new, struct memtype *entry, 184static int
184 unsigned long *type) 185chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
185{ 186{
186 if (new->type != entry->type) { 187 if (new->type != entry->type) {
187 if (type) { 188 if (type) {
@@ -211,6 +212,66 @@ static struct memtype *cached_entry;
211static u64 cached_start; 212static u64 cached_start;
212 213
213/* 214/*
215 * For RAM pages, mark the pages as non WB memory type using
216 * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or
217 * set_memory_wc() on a RAM page at a time before marking it as WB again.
218 * This is ok, because only one driver will own the page and issue
219 * the set_memory_*() calls.
220 *
221 * For now, we use PageNonWB to track that the RAM page is being mapped
222 * as non WB. In the future, we will have to use one more flag
223 * (or some other mechanism in page_struct) to distinguish between
224 * UC and WC mapping.
225 */
226static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
227 unsigned long *new_type)
228{
229 struct page *page;
230 u64 pfn, end_pfn;
231
232 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
233 page = pfn_to_page(pfn);
234 if (page_mapped(page) || PageNonWB(page))
235 goto out;
236
237 SetPageNonWB(page);
238 }
239 return 0;
240
241out:
242 end_pfn = pfn;
243 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
244 page = pfn_to_page(pfn);
245 ClearPageNonWB(page);
246 }
247
248 return -EINVAL;
249}
250
251static int free_ram_pages_type(u64 start, u64 end)
252{
253 struct page *page;
254 u64 pfn, end_pfn;
255
256 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
257 page = pfn_to_page(pfn);
258 if (page_mapped(page) || !PageNonWB(page))
259 goto out;
260
261 ClearPageNonWB(page);
262 }
263 return 0;
264
265out:
266 end_pfn = pfn;
267 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
268 page = pfn_to_page(pfn);
269 SetPageNonWB(page);
270 }
271 return -EINVAL;
272}
273
274/*
214 * req_type typically has one of the following: 275 * req_type typically has one of the following:
215 * - _PAGE_CACHE_WB 276 * - _PAGE_CACHE_WB
216 * - _PAGE_CACHE_WC 277 * - _PAGE_CACHE_WC
@@ -226,14 +287,15 @@ static u64 cached_start;
226 * it will return a negative return value. 287 * it will return a negative return value.
227 */ 288 */
228int reserve_memtype(u64 start, u64 end, unsigned long req_type, 289int reserve_memtype(u64 start, u64 end, unsigned long req_type,
229 unsigned long *new_type) 290 unsigned long *new_type)
230{ 291{
231 struct memtype *new, *entry; 292 struct memtype *new, *entry;
232 unsigned long actual_type; 293 unsigned long actual_type;
233 struct list_head *where; 294 struct list_head *where;
295 int is_range_ram;
234 int err = 0; 296 int err = 0;
235 297
236 BUG_ON(start >= end); /* end is exclusive */ 298 BUG_ON(start >= end); /* end is exclusive */
237 299
238 if (!pat_enabled) { 300 if (!pat_enabled) {
239 /* This is identical to page table setting without PAT */ 301 /* This is identical to page table setting without PAT */
@@ -266,17 +328,24 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
266 actual_type = _PAGE_CACHE_WB; 328 actual_type = _PAGE_CACHE_WB;
267 else 329 else
268 actual_type = _PAGE_CACHE_UC_MINUS; 330 actual_type = _PAGE_CACHE_UC_MINUS;
269 } else 331 } else {
270 actual_type = pat_x_mtrr_type(start, end, 332 actual_type = pat_x_mtrr_type(start, end,
271 req_type & _PAGE_CACHE_MASK); 333 req_type & _PAGE_CACHE_MASK);
334 }
335
336 is_range_ram = pagerange_is_ram(start, end);
337 if (is_range_ram == 1)
338 return reserve_ram_pages_type(start, end, req_type, new_type);
339 else if (is_range_ram < 0)
340 return -EINVAL;
272 341
273 new = kmalloc(sizeof(struct memtype), GFP_KERNEL); 342 new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
274 if (!new) 343 if (!new)
275 return -ENOMEM; 344 return -ENOMEM;
276 345
277 new->start = start; 346 new->start = start;
278 new->end = end; 347 new->end = end;
279 new->type = actual_type; 348 new->type = actual_type;
280 349
281 if (new_type) 350 if (new_type)
282 *new_type = actual_type; 351 *new_type = actual_type;
@@ -335,6 +404,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
335 start, end, cattr_name(new->type), cattr_name(req_type)); 404 start, end, cattr_name(new->type), cattr_name(req_type));
336 kfree(new); 405 kfree(new);
337 spin_unlock(&memtype_lock); 406 spin_unlock(&memtype_lock);
407
338 return err; 408 return err;
339 } 409 }
340 410
@@ -358,6 +428,7 @@ int free_memtype(u64 start, u64 end)
358{ 428{
359 struct memtype *entry; 429 struct memtype *entry;
360 int err = -EINVAL; 430 int err = -EINVAL;
431 int is_range_ram;
361 432
362 if (!pat_enabled) 433 if (!pat_enabled)
363 return 0; 434 return 0;
@@ -366,6 +437,12 @@ int free_memtype(u64 start, u64 end)
366 if (is_ISA_range(start, end - 1)) 437 if (is_ISA_range(start, end - 1))
367 return 0; 438 return 0;
368 439
440 is_range_ram = pagerange_is_ram(start, end);
441 if (is_range_ram == 1)
442 return free_ram_pages_type(start, end);
443 else if (is_range_ram < 0)
444 return -EINVAL;
445
369 spin_lock(&memtype_lock); 446 spin_lock(&memtype_lock);
370 list_for_each_entry(entry, &memtype_list, nd) { 447 list_for_each_entry(entry, &memtype_list, nd) {
371 if (entry->start == start && entry->end == end) { 448 if (entry->start == start && entry->end == end) {
@@ -386,6 +463,7 @@ int free_memtype(u64 start, u64 end)
386 } 463 }
387 464
388 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); 465 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
466
389 return err; 467 return err;
390} 468}
391 469
@@ -492,9 +570,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
492 570
493void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) 571void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
494{ 572{
573 unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
495 u64 addr = (u64)pfn << PAGE_SHIFT; 574 u64 addr = (u64)pfn << PAGE_SHIFT;
496 unsigned long flags; 575 unsigned long flags;
497 unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
498 576
499 reserve_memtype(addr, addr + size, want_flags, &flags); 577 reserve_memtype(addr, addr + size, want_flags, &flags);
500 if (flags != want_flags) { 578 if (flags != want_flags) {
@@ -514,7 +592,7 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
514 free_memtype(addr, addr + size); 592 free_memtype(addr, addr + size);
515} 593}
516 594
517#if defined(CONFIG_DEBUG_FS) 595#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
518 596
519/* get Nth element of the linked list */ 597/* get Nth element of the linked list */
520static struct memtype *memtype_get_idx(loff_t pos) 598static struct memtype *memtype_get_idx(loff_t pos)
@@ -537,6 +615,7 @@ static struct memtype *memtype_get_idx(loff_t pos)
537 } 615 }
538 spin_unlock(&memtype_lock); 616 spin_unlock(&memtype_lock);
539 kfree(print_entry); 617 kfree(print_entry);
618
540 return NULL; 619 return NULL;
541} 620}
542 621
@@ -567,6 +646,7 @@ static int memtype_seq_show(struct seq_file *seq, void *v)
567 seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type), 646 seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
568 print_entry->start, print_entry->end); 647 print_entry->start, print_entry->end);
569 kfree(print_entry); 648 kfree(print_entry);
649
570 return 0; 650 return 0;
571} 651}
572 652
@@ -598,4 +678,4 @@ static int __init pat_memtype_list_init(void)
598 678
599late_initcall(pat_memtype_list_init); 679late_initcall(pat_memtype_list_init);
600 680
601#endif /* CONFIG_DEBUG_FS */ 681#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
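
For identity-mapped RAM, reserve_memtype() and free_memtype() now bypass the memtype list entirely and keep the state in the PageNonWB page flag, which is why only one non-WB reservation per RAM page can be outstanding at a time. A condensed sketch of the resulting rules (pg is a hypothetical RAM page; return values follow the code above):

	u64 start = page_to_phys(pg);
	u64 end = start + PAGE_SIZE;

	reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL);
						/* 0: page marked PageNonWB */
	reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL);
						/* -EINVAL: already non-WB */
	free_memtype(start, end);		/* 0: PageNonWB cleared */
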
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index efa1911e20c..df3d5c861cd 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -79,25 +79,34 @@ static unsigned int mw32[] = { 0xC7 };
79static unsigned int mw64[] = { 0x89, 0x8B }; 79static unsigned int mw64[] = { 0x89, 0x8B };
80#endif /* not __i386__ */ 80#endif /* not __i386__ */
81 81
82static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged, 82struct prefix_bits {
83 int *rexr) 83 unsigned shorted:1;
84 unsigned enlarged:1;
85 unsigned rexr:1;
86 unsigned rex:1;
87};
88
89static int skip_prefix(unsigned char *addr, struct prefix_bits *prf)
84{ 90{
85 int i; 91 int i;
86 unsigned char *p = addr; 92 unsigned char *p = addr;
87 *shorted = 0; 93 prf->shorted = 0;
88 *enlarged = 0; 94 prf->enlarged = 0;
89 *rexr = 0; 95 prf->rexr = 0;
96 prf->rex = 0;
90 97
91restart: 98restart:
92 for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { 99 for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) {
93 if (*p == prefix_codes[i]) { 100 if (*p == prefix_codes[i]) {
94 if (*p == 0x66) 101 if (*p == 0x66)
95 *shorted = 1; 102 prf->shorted = 1;
96#ifdef __amd64__ 103#ifdef __amd64__
97 if ((*p & 0xf8) == 0x48) 104 if ((*p & 0xf8) == 0x48)
98 *enlarged = 1; 105 prf->enlarged = 1;
99 if ((*p & 0xf4) == 0x44) 106 if ((*p & 0xf4) == 0x44)
100 *rexr = 1; 107 prf->rexr = 1;
108 if ((*p & 0xf0) == 0x40)
109 prf->rex = 1;
101#endif 110#endif
102 p++; 111 p++;
103 goto restart; 112 goto restart;
@@ -135,12 +144,12 @@ enum reason_type get_ins_type(unsigned long ins_addr)
135{ 144{
136 unsigned int opcode; 145 unsigned int opcode;
137 unsigned char *p; 146 unsigned char *p;
138 int shorted, enlarged, rexr; 147 struct prefix_bits prf;
139 int i; 148 int i;
140 enum reason_type rv = OTHERS; 149 enum reason_type rv = OTHERS;
141 150
142 p = (unsigned char *)ins_addr; 151 p = (unsigned char *)ins_addr;
143 p += skip_prefix(p, &shorted, &enlarged, &rexr); 152 p += skip_prefix(p, &prf);
144 p += get_opcode(p, &opcode); 153 p += get_opcode(p, &opcode);
145 154
146 CHECK_OP_TYPE(opcode, reg_rop, REG_READ); 155 CHECK_OP_TYPE(opcode, reg_rop, REG_READ);
@@ -156,10 +165,11 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr)
156{ 165{
157 unsigned int opcode; 166 unsigned int opcode;
158 unsigned char *p; 167 unsigned char *p;
159 int i, shorted, enlarged, rexr; 168 struct prefix_bits prf;
169 int i;
160 170
161 p = (unsigned char *)ins_addr; 171 p = (unsigned char *)ins_addr;
162 p += skip_prefix(p, &shorted, &enlarged, &rexr); 172 p += skip_prefix(p, &prf);
163 p += get_opcode(p, &opcode); 173 p += get_opcode(p, &opcode);
164 174
165 for (i = 0; i < ARRAY_SIZE(rw8); i++) 175 for (i = 0; i < ARRAY_SIZE(rw8); i++)
@@ -168,7 +178,7 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr)
168 178
169 for (i = 0; i < ARRAY_SIZE(rw32); i++) 179 for (i = 0; i < ARRAY_SIZE(rw32); i++)
170 if (rw32[i] == opcode) 180 if (rw32[i] == opcode)
171 return (shorted ? 2 : (enlarged ? 8 : 4)); 181 return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
172 182
173 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); 183 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
174 return 0; 184 return 0;
@@ -178,10 +188,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr)
178{ 188{
179 unsigned int opcode; 189 unsigned int opcode;
180 unsigned char *p; 190 unsigned char *p;
181 int i, shorted, enlarged, rexr; 191 struct prefix_bits prf;
192 int i;
182 193
183 p = (unsigned char *)ins_addr; 194 p = (unsigned char *)ins_addr;
184 p += skip_prefix(p, &shorted, &enlarged, &rexr); 195 p += skip_prefix(p, &prf);
185 p += get_opcode(p, &opcode); 196 p += get_opcode(p, &opcode);
186 197
187 for (i = 0; i < ARRAY_SIZE(mw8); i++) 198 for (i = 0; i < ARRAY_SIZE(mw8); i++)
@@ -194,11 +205,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr)
194 205
195 for (i = 0; i < ARRAY_SIZE(mw32); i++) 206 for (i = 0; i < ARRAY_SIZE(mw32); i++)
196 if (mw32[i] == opcode) 207 if (mw32[i] == opcode)
197 return shorted ? 2 : 4; 208 return prf.shorted ? 2 : 4;
198 209
199 for (i = 0; i < ARRAY_SIZE(mw64); i++) 210 for (i = 0; i < ARRAY_SIZE(mw64); i++)
200 if (mw64[i] == opcode) 211 if (mw64[i] == opcode)
201 return shorted ? 2 : (enlarged ? 8 : 4); 212 return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
202 213
203 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); 214 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
204 return 0; 215 return 0;
@@ -238,7 +249,7 @@ enum {
238#endif 249#endif
239}; 250};
240 251
241static unsigned char *get_reg_w8(int no, struct pt_regs *regs) 252static unsigned char *get_reg_w8(int no, int rex, struct pt_regs *regs)
242{ 253{
243 unsigned char *rv = NULL; 254 unsigned char *rv = NULL;
244 255
@@ -255,18 +266,6 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
255 case arg_DL: 266 case arg_DL:
256 rv = (unsigned char *)&regs->dx; 267 rv = (unsigned char *)&regs->dx;
257 break; 268 break;
258 case arg_AH:
259 rv = 1 + (unsigned char *)&regs->ax;
260 break;
261 case arg_BH:
262 rv = 1 + (unsigned char *)&regs->bx;
263 break;
264 case arg_CH:
265 rv = 1 + (unsigned char *)&regs->cx;
266 break;
267 case arg_DH:
268 rv = 1 + (unsigned char *)&regs->dx;
269 break;
270#ifdef __amd64__ 269#ifdef __amd64__
271 case arg_R8: 270 case arg_R8:
272 rv = (unsigned char *)&regs->r8; 271 rv = (unsigned char *)&regs->r8;
@@ -294,9 +293,55 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
294 break; 293 break;
295#endif 294#endif
296 default: 295 default:
297 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
298 break; 296 break;
299 } 297 }
298
299 if (rv)
300 return rv;
301
302 if (rex) {
303 /*
304 * If a REX prefix exists, access the low bytes of SI etc.
305 * instead of AH etc.
306 */
307 switch (no) {
308 case arg_SI:
309 rv = (unsigned char *)&regs->si;
310 break;
311 case arg_DI:
312 rv = (unsigned char *)&regs->di;
313 break;
314 case arg_BP:
315 rv = (unsigned char *)&regs->bp;
316 break;
317 case arg_SP:
318 rv = (unsigned char *)&regs->sp;
319 break;
320 default:
321 break;
322 }
323 } else {
324 switch (no) {
325 case arg_AH:
326 rv = 1 + (unsigned char *)&regs->ax;
327 break;
328 case arg_BH:
329 rv = 1 + (unsigned char *)&regs->bx;
330 break;
331 case arg_CH:
332 rv = 1 + (unsigned char *)&regs->cx;
333 break;
334 case arg_DH:
335 rv = 1 + (unsigned char *)&regs->dx;
336 break;
337 default:
338 break;
339 }
340 }
341
342 if (!rv)
343 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
344
300 return rv; 345 return rv;
301} 346}
302 347
@@ -368,11 +413,12 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
368 unsigned char mod_rm; 413 unsigned char mod_rm;
369 int reg; 414 int reg;
370 unsigned char *p; 415 unsigned char *p;
371 int i, shorted, enlarged, rexr; 416 struct prefix_bits prf;
417 int i;
372 unsigned long rv; 418 unsigned long rv;
373 419
374 p = (unsigned char *)ins_addr; 420 p = (unsigned char *)ins_addr;
375 p += skip_prefix(p, &shorted, &enlarged, &rexr); 421 p += skip_prefix(p, &prf);
376 p += get_opcode(p, &opcode); 422 p += get_opcode(p, &opcode);
377 for (i = 0; i < ARRAY_SIZE(reg_rop); i++) 423 for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
378 if (reg_rop[i] == opcode) { 424 if (reg_rop[i] == opcode) {
@@ -392,10 +438,10 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
392 438
393do_work: 439do_work:
394 mod_rm = *p; 440 mod_rm = *p;
395 reg = ((mod_rm >> 3) & 0x7) | (rexr << 3); 441 reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3);
396 switch (get_ins_reg_width(ins_addr)) { 442 switch (get_ins_reg_width(ins_addr)) {
397 case 1: 443 case 1:
398 return *get_reg_w8(reg, regs); 444 return *get_reg_w8(reg, prf.rex, regs);
399 445
400 case 2: 446 case 2:
401 return *(unsigned short *)get_reg_w32(reg, regs); 447 return *(unsigned short *)get_reg_w32(reg, regs);
@@ -422,11 +468,12 @@ unsigned long get_ins_imm_val(unsigned long ins_addr)
422 unsigned char mod_rm; 468 unsigned char mod_rm;
423 unsigned char mod; 469 unsigned char mod;
424 unsigned char *p; 470 unsigned char *p;
425 int i, shorted, enlarged, rexr; 471 struct prefix_bits prf;
472 int i;
426 unsigned long rv; 473 unsigned long rv;
427 474
428 p = (unsigned char *)ins_addr; 475 p = (unsigned char *)ins_addr;
429 p += skip_prefix(p, &shorted, &enlarged, &rexr); 476 p += skip_prefix(p, &prf);
430 p += get_opcode(p, &opcode); 477 p += get_opcode(p, &opcode);
431 for (i = 0; i < ARRAY_SIZE(imm_wop); i++) 478 for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
432 if (imm_wop[i] == opcode) { 479 if (imm_wop[i] == opcode) {
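
The new rex bit exists because x86-64 overloads the byte-register encodings: ModRM reg values 4-7 select AH/CH/DH/BH in legacy encodings, but select SPL/BPL/SIL/DIL whenever any REX prefix (0x40-0x4f) is present. A worked example of the two decodings that get_reg_w8() now distinguishes (instruction bytes are illustrative):

/*
 * Both instructions store a byte register to (%rsi) and carry the
 * same ModRM reg field (4); only the REX prefix differs:
 *
 *   88 26       mov %ah,  (%rsi)     no REX  -> reg 4 means AH
 *   40 88 26    mov %spl, (%rsi)     REX     -> reg 4 means SPL
 *
 * Without tracking prf->rex, the decoder would return the high byte
 * of rAX (arg_AH) for the second instruction instead of the low byte
 * of rSP (arg_SP), yielding a wrong value in the trace.
 */
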
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index d50302774fe..86f2ffc43c3 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -63,10 +63,8 @@ static inline void pgd_list_del(pgd_t *pgd)
63#define UNSHARED_PTRS_PER_PGD \ 63#define UNSHARED_PTRS_PER_PGD \
64 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) 64 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
65 65
66static void pgd_ctor(void *p) 66static void pgd_ctor(pgd_t *pgd)
67{ 67{
68 pgd_t *pgd = p;
69
70 /* If the pgd points to a shared pagetable level (either the 68 /* If the pgd points to a shared pagetable level (either the
71 ptes in non-PAE, or shared PMD in PAE), then just copy the 69 ptes in non-PAE, or shared PMD in PAE), then just copy the
72 references from swapper_pg_dir. */ 70 references from swapper_pg_dir. */
@@ -87,7 +85,7 @@ static void pgd_ctor(void *p)
87 pgd_list_add(pgd); 85 pgd_list_add(pgd);
88} 86}
89 87
90static void pgd_dtor(void *pgd) 88static void pgd_dtor(pgd_t *pgd)
91{ 89{
92 unsigned long flags; /* can be called from interrupt context */ 90 unsigned long flags; /* can be called from interrupt context */
93 91
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index cab0abbd1eb..0951db9ee51 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -123,7 +123,8 @@ static int __init parse_vmalloc(char *arg)
123 if (!arg) 123 if (!arg)
124 return -EINVAL; 124 return -EINVAL;
125 125
126 __VMALLOC_RESERVE = memparse(arg, &arg); 126 /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/
127 __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET;
127 return 0; 128 return 0;
128} 129}
129early_param("vmalloc", parse_vmalloc); 130early_param("vmalloc", parse_vmalloc);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 1b4763e26ea..51c0a2fc14f 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -138,7 +138,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
138 return; 138 return;
139 } 139 }
140 140
141 if (is_uv_system()) 141 if (get_uv_system_type() >= UV_X2APIC)
142 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; 142 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
143 else 143 else
144 apic_id = pa->apic_id; 144 apic_id = pa->apic_id;
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index d877c5b423e..ab50a8d7402 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -3,6 +3,7 @@
3 */ 3 */
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/io.h> 5#include <linux/io.h>
6#include <linux/mmiotrace.h>
6 7
7#define MODULE_NAME "testmmiotrace" 8#define MODULE_NAME "testmmiotrace"
8 9
@@ -13,6 +14,7 @@ MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB.");
13static void do_write_test(void __iomem *p) 14static void do_write_test(void __iomem *p)
14{ 15{
15 unsigned int i; 16 unsigned int i;
17 mmiotrace_printk("Write test.\n");
16 for (i = 0; i < 256; i++) 18 for (i = 0; i < 256; i++)
17 iowrite8(i, p + i); 19 iowrite8(i, p + i);
18 for (i = 1024; i < (5 * 1024); i += 2) 20 for (i = 1024; i < (5 * 1024); i += 2)
@@ -24,6 +26,7 @@ static void do_write_test(void __iomem *p)
24static void do_read_test(void __iomem *p) 26static void do_read_test(void __iomem *p)
25{ 27{
26 unsigned int i; 28 unsigned int i;
29 mmiotrace_printk("Read test.\n");
27 for (i = 0; i < 256; i++) 30 for (i = 0; i < 256; i++)
28 ioread8(p + i); 31 ioread8(p + i);
29 for (i = 1024; i < (5 * 1024); i += 2) 32 for (i = 1024; i < (5 * 1024); i += 2)
@@ -39,6 +42,7 @@ static void do_test(void)
39 pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); 42 pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
40 return; 43 return;
41 } 44 }
45 mmiotrace_printk("ioremap returned %p.\n", p);
42 do_write_test(p); 46 do_write_test(p);
43 do_read_test(p); 47 do_read_test(p);
44 iounmap(p); 48 iounmap(p);