Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/Makefile          |   2
-rw-r--r--  arch/x86/mm/discontig_32.c    |  26
-rw-r--r--  arch/x86/mm/dump_pagetables.c |   2
-rw-r--r--  arch/x86/mm/highmem_32.c      |   1
-rw-r--r--  arch/x86/mm/init_32.c         |  69
-rw-r--r--  arch/x86/mm/init_64.c         |  74
-rw-r--r--  arch/x86/mm/ioremap.c         |  67
-rw-r--r--  arch/x86/mm/k8topology_64.c   |  38
-rw-r--r--  arch/x86/mm/numa_64.c         |  42
-rw-r--r--  arch/x86/mm/pageattr.c        |  16
-rw-r--r--  arch/x86/mm/pat.c             | 207
-rw-r--r--  arch/x86/mm/pgtable.c         | 276
-rw-r--r--  arch/x86/mm/pgtable_32.c      | 204
-rw-r--r--  arch/x86/mm/srat_64.c         |   2
14 files changed, 695 insertions, 331 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 20941d2954e2..b7b3e4c7cfc9 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,5 @@
1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
2 pat.o 2 pat.o pgtable.o
3 3
4obj-$(CONFIG_X86_32) += pgtable_32.o 4obj-$(CONFIG_X86_32) += pgtable_32.o
5 5
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 18378850e25a..914ccf983687 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -476,29 +476,3 @@ int memory_add_physaddr_to_nid(u64 addr)
476 476
477EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 477EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
478#endif 478#endif
479
480#ifndef CONFIG_HAVE_ARCH_PARSE_SRAT
481/*
482 * XXX FIXME: Make SLIT table parsing available to 32-bit NUMA
483 *
484 * These stub functions are needed to compile 32-bit NUMA when SRAT is
485 * not set. There are functions in srat_64.c for parsing this table
486 * and it may be possible to make them common functions.
487 */
488void acpi_numa_slit_init (struct acpi_table_slit *slit)
489{
490 printk(KERN_INFO "ACPI: No support for parsing SLIT table\n");
491}
492
493void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa)
494{
495}
496
497void acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma)
498{
499}
500
501void acpi_numa_arch_fixup(void)
502{
503}
504#endif /* CONFIG_HAVE_ARCH_PARSE_SRAT */
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 6791b8334bc6..2c24bea92c66 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -324,7 +324,7 @@ static const struct file_operations ptdump_fops = {
324 .release = single_release, 324 .release = single_release,
325}; 325};
326 326
327int pt_dump_init(void) 327static int pt_dump_init(void)
328{ 328{
329 struct dentry *pe; 329 struct dentry *pe;
330 330
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 9cf33d3ee5bc..165c871ba9af 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -155,4 +155,3 @@ EXPORT_SYMBOL(kmap);
155EXPORT_SYMBOL(kunmap); 155EXPORT_SYMBOL(kunmap);
156EXPORT_SYMBOL(kmap_atomic); 156EXPORT_SYMBOL(kmap_atomic);
157EXPORT_SYMBOL(kunmap_atomic); 157EXPORT_SYMBOL(kunmap_atomic);
158EXPORT_SYMBOL(kmap_atomic_to_page);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9ec62da85fd7..de236e419cb5 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -71,7 +71,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
71 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 71 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
72 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); 72 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
73 73
74 paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); 74 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
75 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 75 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
76 pud = pud_offset(pgd, 0); 76 pud = pud_offset(pgd, 0);
77 BUG_ON(pmd_table != pmd_offset(pud, 0)); 77 BUG_ON(pmd_table != pmd_offset(pud, 0));
@@ -100,7 +100,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
100 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); 100 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
101 } 101 }
102 102
103 paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); 103 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
104 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); 104 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
105 BUG_ON(page_table != pte_offset_kernel(pmd, 0)); 105 BUG_ON(page_table != pte_offset_kernel(pmd, 0));
106 } 106 }
@@ -227,6 +227,25 @@ static inline int page_kills_ppro(unsigned long pagenr)
227 return 0; 227 return 0;
228} 228}
229 229
230/*
231 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
232 * is valid. The argument is a physical page number.
233 *
234 *
235 * On x86, access has to be given to the first megabyte of ram because that area
236 * contains bios code and data regions used by X and dosemu and similar apps.
237 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
238 * mmio resources as well as potential bios/acpi data regions.
239 */
240int devmem_is_allowed(unsigned long pagenr)
241{
242 if (pagenr <= 256)
243 return 1;
244 if (!page_is_ram(pagenr))
245 return 1;
246 return 0;
247}
248
230#ifdef CONFIG_HIGHMEM 249#ifdef CONFIG_HIGHMEM
231pte_t *kmap_pte; 250pte_t *kmap_pte;
232pgprot_t kmap_prot; 251pgprot_t kmap_prot;
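
The devmem_is_allowed() helper added above encodes the /dev/mem access policy by physical page number: the first megabyte is always allowed because it holds BIOS code and data used by X, dosemu and similar programs, and any page that is not kernel RAM (PCI MMIO, BIOS/ACPI regions) is allowed as well; only kernel RAM above 1 MB is refused. A minimal user-space sketch of the same decision, with page_is_ram() replaced by a hard-coded stand-in (the real helper walks the e820 map):

#include <stdio.h>

/* Stand-in for the kernel's page_is_ram(): pretend RAM covers pages
 * 256..131071 (1 MB .. 512 MB).  Purely illustrative. */
static int page_is_ram(unsigned long pagenr)
{
        return pagenr >= 256 && pagenr < 131072;
}

/* Same policy as the devmem_is_allowed() added in init_32.c/init_64.c:
 * allow the first megabyte and anything that is not kernel RAM. */
static int devmem_is_allowed(unsigned long pagenr)
{
        if (pagenr <= 256)
                return 1;
        if (!page_is_ram(pagenr))
                return 1;
        return 0;
}

int main(void)
{
        unsigned long pfns[] = { 0, 160, 256, 257, 0x10000, 0x40000 };

        for (unsigned int i = 0; i < sizeof(pfns) / sizeof(pfns[0]); i++)
                printf("pfn 0x%lx -> %s\n", pfns[i],
                       devmem_is_allowed(pfns[i]) ? "allowed" : "refused");
        return 0;
}

With the stand-in above, pages 0..256 and the non-RAM pfn 0x40000 come back allowed, while the in-RAM pfns 257 and 0x10000 are refused, which is exactly the split the patch wants for a non-promiscuous /dev/mem.
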
@@ -268,47 +287,17 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
268 pkmap_page_table = pte; 287 pkmap_page_table = pte;
269} 288}
270 289
271static void __meminit free_new_highpage(struct page *page)
272{
273 init_page_count(page);
274 __free_page(page);
275 totalhigh_pages++;
276}
277
278void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) 290void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
279{ 291{
280 if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { 292 if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
281 ClearPageReserved(page); 293 ClearPageReserved(page);
282 free_new_highpage(page); 294 init_page_count(page);
295 __free_page(page);
296 totalhigh_pages++;
283 } else 297 } else
284 SetPageReserved(page); 298 SetPageReserved(page);
285} 299}
286 300
287static int __meminit
288add_one_highpage_hotplug(struct page *page, unsigned long pfn)
289{
290 free_new_highpage(page);
291 totalram_pages++;
292#ifdef CONFIG_FLATMEM
293 max_mapnr = max(pfn, max_mapnr);
294#endif
295 num_physpages++;
296
297 return 0;
298}
299
300/*
301 * Not currently handling the NUMA case.
302 * Assuming single node and all memory that
303 * has been added dynamically that would be
304 * onlined here is in HIGHMEM.
305 */
306void __meminit online_page(struct page *page)
307{
308 ClearPageReserved(page);
309 add_one_highpage_hotplug(page, page_to_pfn(page));
310}
311
312#ifndef CONFIG_NUMA 301#ifndef CONFIG_NUMA
313static void __init set_highmem_pages_init(int bad_ppro) 302static void __init set_highmem_pages_init(int bad_ppro)
314{ 303{
@@ -365,7 +354,7 @@ void __init native_pagetable_setup_start(pgd_t *base)
365 354
366 pte_clear(NULL, va, pte); 355 pte_clear(NULL, va, pte);
367 } 356 }
368 paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT); 357 paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
369} 358}
370 359
371void __init native_pagetable_setup_done(pgd_t *base) 360void __init native_pagetable_setup_done(pgd_t *base)
@@ -457,7 +446,7 @@ void zap_low_mappings(void)
457 * Note that "pgd_clear()" doesn't do it for 446 * Note that "pgd_clear()" doesn't do it for
458 * us, because pgd_clear() is a no-op on i386. 447 * us, because pgd_clear() is a no-op on i386.
459 */ 448 */
460 for (i = 0; i < USER_PTRS_PER_PGD; i++) { 449 for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
461#ifdef CONFIG_X86_PAE 450#ifdef CONFIG_X86_PAE
462 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); 451 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
463#else 452#else
@@ -547,9 +536,9 @@ void __init paging_init(void)
547 536
548/* 537/*
549 * Test if the WP bit works in supervisor mode. It isn't supported on 386's 538 * Test if the WP bit works in supervisor mode. It isn't supported on 386's
550 * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This 539 * and also on some strange 486's. All 586+'s are OK. This used to involve
551 * used to involve black magic jumps to work around some nasty CPU bugs, 540 * black magic jumps to work around some nasty CPU bugs, but fortunately the
552 * but fortunately the switch to using exceptions got rid of all that. 541 * switch to using exceptions got rid of all that.
553 */ 542 */
554static void __init test_wp_bit(void) 543static void __init test_wp_bit(void)
555{ 544{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 1ff7906a9a4d..32ba13b0f818 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -135,7 +135,7 @@ static __init void *spp_getpage(void)
135 return ptr; 135 return ptr;
136} 136}
137 137
138static __init void 138static void
139set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) 139set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
140{ 140{
141 pgd_t *pgd; 141 pgd_t *pgd;
@@ -173,7 +173,7 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
173 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); 173 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
174 174
175 pte = pte_offset_kernel(pmd, vaddr); 175 pte = pte_offset_kernel(pmd, vaddr);
176 if (!pte_none(*pte) && 176 if (!pte_none(*pte) && pte_val(new_pte) &&
177 pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) 177 pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
178 pte_ERROR(*pte); 178 pte_ERROR(*pte);
179 set_pte(pte, new_pte); 179 set_pte(pte, new_pte);
@@ -214,8 +214,7 @@ void __init cleanup_highmap(void)
214} 214}
215 215
216/* NOTE: this is meant to be run only at boot */ 216/* NOTE: this is meant to be run only at boot */
217void __init 217void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
218__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
219{ 218{
220 unsigned long address = __fix_to_virt(idx); 219 unsigned long address = __fix_to_virt(idx);
221 220
@@ -621,15 +620,6 @@ void __init paging_init(void)
621/* 620/*
622 * Memory hotplug specific functions 621 * Memory hotplug specific functions
623 */ 622 */
624void online_page(struct page *page)
625{
626 ClearPageReserved(page);
627 init_page_count(page);
628 __free_page(page);
629 totalram_pages++;
630 num_physpages++;
631}
632
633#ifdef CONFIG_MEMORY_HOTPLUG 623#ifdef CONFIG_MEMORY_HOTPLUG
634/* 624/*
635 * Memory is added always to NORMAL zone. This means you will never get 625 * Memory is added always to NORMAL zone. This means you will never get
@@ -664,6 +654,26 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
664 654
665#endif /* CONFIG_MEMORY_HOTPLUG */ 655#endif /* CONFIG_MEMORY_HOTPLUG */
666 656
657/*
658 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
659 * is valid. The argument is a physical page number.
660 *
661 *
662 * On x86, access has to be given to the first megabyte of ram because that area
663 * contains bios code and data regions used by X and dosemu and similar apps.
664 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
665 * mmio resources as well as potential bios/acpi data regions.
666 */
667int devmem_is_allowed(unsigned long pagenr)
668{
669 if (pagenr <= 256)
670 return 1;
671 if (!page_is_ram(pagenr))
672 return 1;
673 return 0;
674}
675
676
667static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, 677static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
668 kcore_modules, kcore_vsyscall; 678 kcore_modules, kcore_vsyscall;
669 679
@@ -791,7 +801,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
791void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 801void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
792{ 802{
793#ifdef CONFIG_NUMA 803#ifdef CONFIG_NUMA
794 int nid = phys_to_nid(phys); 804 int nid, next_nid;
795#endif 805#endif
796 unsigned long pfn = phys >> PAGE_SHIFT; 806 unsigned long pfn = phys >> PAGE_SHIFT;
797 807
@@ -810,10 +820,16 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
810 820
811 /* Should check here against the e820 map to avoid double free */ 821 /* Should check here against the e820 map to avoid double free */
812#ifdef CONFIG_NUMA 822#ifdef CONFIG_NUMA
813 reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); 823 nid = phys_to_nid(phys);
824 next_nid = phys_to_nid(phys + len - 1);
825 if (nid == next_nid)
826 reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
827 else
828 reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
814#else 829#else
815 reserve_bootmem(phys, len, BOOTMEM_DEFAULT); 830 reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
816#endif 831#endif
832
817 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { 833 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
818 dma_reserve += len / PAGE_SIZE; 834 dma_reserve += len / PAGE_SIZE;
819 set_dma_reserve(dma_reserve); 835 set_dma_reserve(dma_reserve);
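
The reserve_bootmem_generic() change just above no longer assumes the reserved range lives on a single NUMA node: it looks up the node of the first and of the last byte and only uses the per-node reserve_bootmem_node() when the two agree, falling back to the generic reserve_bootmem() when the range straddles a node boundary. A small sketch of that dispatch, with phys_to_nid() modelled as a hypothetical two-node split at 4 GB:

#include <stdio.h>

#define NODE_SPLIT (4ULL << 30)   /* hypothetical: node 0 below 4 GB, node 1 above */

static int phys_to_nid(unsigned long long phys)
{
        return phys < NODE_SPLIT ? 0 : 1;
}

/* Mirrors the dispatch added to reserve_bootmem_generic(): per-node
 * reservation only when the range does not cross a node boundary. */
static void reserve_range(unsigned long long phys, unsigned long long len)
{
        int nid = phys_to_nid(phys);
        int next_nid = phys_to_nid(phys + len - 1);

        if (nid == next_nid)
                printf("reserve_bootmem_node(node %d, %#llx, %#llx)\n",
                       nid, phys, len);
        else
                printf("reserve_bootmem(%#llx, %#llx)  /* spans nodes %d..%d */\n",
                       phys, len, nid, next_nid);
}

int main(void)
{
        reserve_range(0x100000, 0x10000);            /* fully inside node 0 */
        reserve_range(NODE_SPLIT - 0x1000, 0x2000);  /* crosses the boundary */
        return 0;
}
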
@@ -907,6 +923,10 @@ const char *arch_vma_name(struct vm_area_struct *vma)
907/* 923/*
908 * Initialise the sparsemem vmemmap using huge-pages at the PMD level. 924 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
909 */ 925 */
926static long __meminitdata addr_start, addr_end;
927static void __meminitdata *p_start, *p_end;
928static int __meminitdata node_start;
929
910int __meminit 930int __meminit
911vmemmap_populate(struct page *start_page, unsigned long size, int node) 931vmemmap_populate(struct page *start_page, unsigned long size, int node)
912{ 932{
@@ -941,12 +961,32 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
941 PAGE_KERNEL_LARGE); 961 PAGE_KERNEL_LARGE);
942 set_pmd(pmd, __pmd(pte_val(entry))); 962 set_pmd(pmd, __pmd(pte_val(entry)));
943 963
944 printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n", 964 /* check to see if we have contiguous blocks */
945 addr, addr + PMD_SIZE - 1, p, node); 965 if (p_end != p || node_start != node) {
966 if (p_start)
967 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
968 addr_start, addr_end-1, p_start, p_end-1, node_start);
969 addr_start = addr;
970 node_start = node;
971 p_start = p;
972 }
973 addr_end = addr + PMD_SIZE;
974 p_end = p + PMD_SIZE;
946 } else { 975 } else {
947 vmemmap_verify((pte_t *)pmd, node, addr, next); 976 vmemmap_verify((pte_t *)pmd, node, addr, next);
948 } 977 }
949 } 978 }
950 return 0; 979 return 0;
951} 980}
981
982void __meminit vmemmap_populate_print_last(void)
983{
984 if (p_start) {
985 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
986 addr_start, addr_end-1, p_start, p_end-1, node_start);
987 p_start = NULL;
988 p_end = NULL;
989 node_start = 0;
990 }
991}
952#endif 992#endif
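
The vmemmap_populate() hunk above replaces one KERN_DEBUG line per 2 MB PMD mapping with a coalescing scheme: addr_start/addr_end, p_start/p_end and node_start accumulate while the mapped blocks stay contiguous and on the same node, a single range line is printed only when contiguity breaks, and vmemmap_populate_print_last() flushes the final range. The same batching pattern as a standalone sketch, tracking only addresses and nodes (block size and values are illustrative):

#include <stdio.h>

#define BLOCK 0x200000UL                /* 2 MB, like one PMD mapping */

static unsigned long addr_start, addr_end;
static int node_start = -1;

static void flush_range(void)
{
        if (node_start >= 0)
                printf(" [%lx-%lx] on node %d\n",
                       addr_start, addr_end - 1, node_start);
        node_start = -1;
}

/* Coalesce consecutive blocks into one report, mirroring the
 * contiguity check added in vmemmap_populate(). */
static void map_block(unsigned long addr, int node)
{
        if (node_start < 0 || addr != addr_end || node != node_start) {
                flush_range();
                addr_start = addr;
                node_start = node;
        }
        addr_end = addr + BLOCK;
}

int main(void)
{
        map_block(0x0,       0);
        map_block(0x200000,  0);        /* contiguous: merged */
        map_block(0x400000,  0);        /* contiguous: merged */
        map_block(0x1000000, 1);        /* gap + new node: new range */
        flush_range();                  /* like vmemmap_populate_print_last() */
        return 0;
}
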
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 3a4baf95e24d..71bb3159031a 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -117,8 +117,8 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
117 * have to convert them into an offset in a page-aligned mapping, but the 117 * have to convert them into an offset in a page-aligned mapping, but the
118 * caller shouldn't need to know that small detail. 118 * caller shouldn't need to know that small detail.
119 */ 119 */
120static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, 120static void __iomem *__ioremap_caller(resource_size_t phys_addr,
121 unsigned long prot_val) 121 unsigned long size, unsigned long prot_val, void *caller)
122{ 122{
123 unsigned long pfn, offset, vaddr; 123 unsigned long pfn, offset, vaddr;
124 resource_size_t last_addr; 124 resource_size_t last_addr;
@@ -149,7 +149,8 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
149 * Don't allow anybody to remap normal RAM that we're using.. 149 * Don't allow anybody to remap normal RAM that we're using..
150 */ 150 */
151 for (pfn = phys_addr >> PAGE_SHIFT; 151 for (pfn = phys_addr >> PAGE_SHIFT;
152 (pfn << PAGE_SHIFT) < last_addr; pfn++) { 152 (pfn << PAGE_SHIFT) < (last_addr & PAGE_MASK);
153 pfn++) {
153 154
154 int is_ram = page_is_ram(pfn); 155 int is_ram = page_is_ram(pfn);
155 156
@@ -176,11 +177,11 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
176 /* 177 /*
177 * Do not fallback to certain memory types with certain 178 * Do not fallback to certain memory types with certain
178 * requested type: 179 * requested type:
179 * - request is uncached, return cannot be write-back 180 * - request is uc-, return cannot be write-back
180 * - request is uncached, return cannot be write-combine 181 * - request is uc-, return cannot be write-combine
181 * - request is write-combine, return cannot be write-back 182 * - request is write-combine, return cannot be write-back
182 */ 183 */
183 if ((prot_val == _PAGE_CACHE_UC && 184 if ((prot_val == _PAGE_CACHE_UC_MINUS &&
184 (new_prot_val == _PAGE_CACHE_WB || 185 (new_prot_val == _PAGE_CACHE_WB ||
185 new_prot_val == _PAGE_CACHE_WC)) || 186 new_prot_val == _PAGE_CACHE_WC)) ||
186 (prot_val == _PAGE_CACHE_WC && 187 (prot_val == _PAGE_CACHE_WC &&
@@ -201,6 +202,9 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
201 default: 202 default:
202 prot = PAGE_KERNEL_NOCACHE; 203 prot = PAGE_KERNEL_NOCACHE;
203 break; 204 break;
205 case _PAGE_CACHE_UC_MINUS:
206 prot = PAGE_KERNEL_UC_MINUS;
207 break;
204 case _PAGE_CACHE_WC: 208 case _PAGE_CACHE_WC:
205 prot = PAGE_KERNEL_WC; 209 prot = PAGE_KERNEL_WC;
206 break; 210 break;
@@ -212,7 +216,7 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
212 /* 216 /*
213 * Ok, go for it.. 217 * Ok, go for it..
214 */ 218 */
215 area = get_vm_area(size, VM_IOREMAP); 219 area = get_vm_area_caller(size, VM_IOREMAP, caller);
216 if (!area) 220 if (!area)
217 return NULL; 221 return NULL;
218 area->phys_addr = phys_addr; 222 area->phys_addr = phys_addr;
@@ -255,7 +259,17 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
255 */ 259 */
256void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) 260void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
257{ 261{
258 return __ioremap(phys_addr, size, _PAGE_CACHE_UC); 262 /*
263 * Ideally, this should be:
264 * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
265 *
266 * Till we fix all X drivers to use ioremap_wc(), we will use
267 * UC MINUS.
268 */
269 unsigned long val = _PAGE_CACHE_UC_MINUS;
270
271 return __ioremap_caller(phys_addr, size, val,
272 __builtin_return_address(0));
259} 273}
260EXPORT_SYMBOL(ioremap_nocache); 274EXPORT_SYMBOL(ioremap_nocache);
261 275
@@ -272,7 +286,8 @@ EXPORT_SYMBOL(ioremap_nocache);
272void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) 286void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
273{ 287{
274 if (pat_wc_enabled) 288 if (pat_wc_enabled)
275 return __ioremap(phys_addr, size, _PAGE_CACHE_WC); 289 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
290 __builtin_return_address(0));
276 else 291 else
277 return ioremap_nocache(phys_addr, size); 292 return ioremap_nocache(phys_addr, size);
278} 293}
@@ -280,7 +295,8 @@ EXPORT_SYMBOL(ioremap_wc);
280 295
281void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) 296void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
282{ 297{
283 return __ioremap(phys_addr, size, _PAGE_CACHE_WB); 298 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
299 __builtin_return_address(0));
284} 300}
285EXPORT_SYMBOL(ioremap_cache); 301EXPORT_SYMBOL(ioremap_cache);
286 302
@@ -336,6 +352,35 @@ void iounmap(volatile void __iomem *addr)
336} 352}
337EXPORT_SYMBOL(iounmap); 353EXPORT_SYMBOL(iounmap);
338 354
355/*
356 * Convert a physical pointer to a virtual kernel pointer for /dev/mem
357 * access
358 */
359void *xlate_dev_mem_ptr(unsigned long phys)
360{
361 void *addr;
362 unsigned long start = phys & PAGE_MASK;
363
364 /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
365 if (page_is_ram(start >> PAGE_SHIFT))
366 return __va(phys);
367
368 addr = (void *)ioremap(start, PAGE_SIZE);
369 if (addr)
370 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
371
372 return addr;
373}
374
375void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
376{
377 if (page_is_ram(phys >> PAGE_SHIFT))
378 return;
379
380 iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
381 return;
382}
383
339#ifdef CONFIG_X86_32 384#ifdef CONFIG_X86_32
340 385
341int __initdata early_ioremap_debug; 386int __initdata early_ioremap_debug;
@@ -407,7 +452,7 @@ void __init early_ioremap_clear(void)
407 452
408 pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); 453 pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
409 pmd_clear(pmd); 454 pmd_clear(pmd);
410 paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); 455 paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT);
411 __flush_tlb_all(); 456 __flush_tlb_all();
412} 457}
413 458
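
xlate_dev_mem_ptr() and unxlate_dev_mem_ptr() above give /dev/mem one way to reach any physical address: RAM is accessed through the direct mapping via __va() (nothing to undo later), while non-RAM gets a temporary page-sized ioremap() whose return value is combined with the sub-page offset and unmapped again afterwards. The pointer arithmetic of the non-RAM path as a standalone check, with a made-up mapping base standing in for what ioremap() would return:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long phys    = 0xfed000a4UL;      /* arbitrary MMIO byte */
        unsigned long start   = phys & PAGE_MASK;  /* page ioremap() is asked to map */
        unsigned long mapping = 0xf8a40000UL;      /* hypothetical ioremap() result */
        unsigned long ptr     = mapping | (phys & ~PAGE_MASK);

        printf("phys %#lx: map page %#lx, access at %#lx (offset %#lx)\n",
               phys, start, ptr, phys & ~PAGE_MASK);
        /* unxlate_dev_mem_ptr() later unmaps ptr & PAGE_MASK, i.e. the mapping base */
        printf("unmap %#lx\n", ptr & PAGE_MASK);
        return 0;
}
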
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 86808e666f9c..1f476e477844 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -13,12 +13,15 @@
13#include <linux/nodemask.h> 13#include <linux/nodemask.h>
14#include <asm/io.h> 14#include <asm/io.h>
15#include <linux/pci_ids.h> 15#include <linux/pci_ids.h>
16#include <linux/acpi.h>
16#include <asm/types.h> 17#include <asm/types.h>
17#include <asm/mmzone.h> 18#include <asm/mmzone.h>
18#include <asm/proto.h> 19#include <asm/proto.h>
19#include <asm/e820.h> 20#include <asm/e820.h>
20#include <asm/pci-direct.h> 21#include <asm/pci-direct.h>
21#include <asm/numa.h> 22#include <asm/numa.h>
23#include <asm/mpspec.h>
24#include <asm/apic.h>
22 25
23static __init int find_northbridge(void) 26static __init int find_northbridge(void)
24{ 27{
@@ -44,6 +47,30 @@ static __init int find_northbridge(void)
44 return -1; 47 return -1;
45} 48}
46 49
50static __init void early_get_boot_cpu_id(void)
51{
52 /*
53 * need to get boot_cpu_id so can use that to create apicid_to_node
54 * in k8_scan_nodes()
55 */
56 /*
57 * Find possible boot-time SMP configuration:
58 */
59 early_find_smp_config();
60#ifdef CONFIG_ACPI
61 /*
62 * Read APIC information from ACPI tables.
63 */
64 early_acpi_boot_init();
65#endif
66 /*
67 * get boot-time SMP configuration:
68 */
69 if (smp_found_config)
70 early_get_smp_config();
71 early_init_lapic_mapping();
72}
73
47int __init k8_scan_nodes(unsigned long start, unsigned long end) 74int __init k8_scan_nodes(unsigned long start, unsigned long end)
48{ 75{
49 unsigned long prevbase; 76 unsigned long prevbase;
@@ -56,6 +83,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
56 unsigned cores; 83 unsigned cores;
57 unsigned bits; 84 unsigned bits;
58 int j; 85 int j;
86 unsigned apicid_base;
59 87
60 if (!early_pci_allowed()) 88 if (!early_pci_allowed())
61 return -1; 89 return -1;
@@ -174,11 +202,19 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
174 /* use the coreid bits from early_identify_cpu */ 202 /* use the coreid bits from early_identify_cpu */
175 bits = boot_cpu_data.x86_coreid_bits; 203 bits = boot_cpu_data.x86_coreid_bits;
176 cores = (1<<bits); 204 cores = (1<<bits);
205 apicid_base = 0;
206 /* need to get boot_cpu_id early for system with apicid lifting */
207 early_get_boot_cpu_id();
208 if (boot_cpu_physical_apicid > 0) {
209 printk(KERN_INFO "BSP APIC ID: %02x\n",
210 boot_cpu_physical_apicid);
211 apicid_base = boot_cpu_physical_apicid;
212 }
177 213
178 for (i = 0; i < 8; i++) { 214 for (i = 0; i < 8; i++) {
179 if (nodes[i].start != nodes[i].end) { 215 if (nodes[i].start != nodes[i].end) {
180 nodeid = nodeids[i]; 216 nodeid = nodeids[i];
181 for (j = 0; j < cores; j++) 217 for (j = apicid_base; j < cores + apicid_base; j++)
182 apicid_to_node[(nodeid << bits) + j] = i; 218 apicid_to_node[(nodeid << bits) + j] = i;
183 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 219 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
184 } 220 }
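
k8_scan_nodes() now offsets the apicid_to_node[] fill by the boot CPU's APIC ID, obtained early via early_get_boot_cpu_id(), so systems that lift APIC IDs (BSP APIC ID != 0) still map every core's APIC ID to the correct node. A standalone sketch of the table fill for a hypothetical two-node box with two cores per node and an APIC ID base of 4:

#include <stdio.h>

#define MAX_APICS 32

int main(void)
{
        int apicid_to_node[MAX_APICS];
        int bits = 1;                   /* x86_coreid_bits: 2 cores per node */
        int cores = 1 << bits;
        int apicid_base = 4;            /* hypothetical lifted BSP APIC ID */
        int nodeids[] = { 0, 1 };       /* illustrative northbridge node IDs */

        for (int k = 0; k < MAX_APICS; k++)
                apicid_to_node[k] = -1; /* unassigned */

        /* Same fill loop as the patch: start at apicid_base instead of 0. */
        for (int i = 0; i < 2; i++) {
                int nodeid = nodeids[i];
                for (int j = apicid_base; j < cores + apicid_base; j++)
                        apicid_to_node[(nodeid << bits) + j] = i;
        }

        for (int k = 0; k < MAX_APICS; k++)
                if (apicid_to_node[k] >= 0)
                        printf("apicid %2d -> node %d\n", k, apicid_to_node[k]);
        return 0;
}

With apicid_base 0 the entries would sit at APIC IDs 0..3; with the lifted base of 4 they land at 4..7, matching what such firmware actually hands out.
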
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 9a6892200b27..c5066d519e5d 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -196,6 +196,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
196 unsigned long bootmap_start, nodedata_phys; 196 unsigned long bootmap_start, nodedata_phys;
197 void *bootmap; 197 void *bootmap;
198 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 198 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
199 int nid;
199 200
200 start = round_up(start, ZONE_ALIGN); 201 start = round_up(start, ZONE_ALIGN);
201 202
@@ -218,9 +219,19 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
218 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 219 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
219 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; 220 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
220 221
221 /* Find a place for the bootmem map */ 222 /*
223 * Find a place for the bootmem map
224 * nodedata_phys could be on other nodes by alloc_bootmem,
225 * so need to sure bootmap_start not to be small, otherwise
226 * early_node_mem will get that with find_e820_area instead
227 * of alloc_bootmem, that could clash with reserved range
228 */
222 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 229 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
223 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 230 nid = phys_to_nid(nodedata_phys);
231 if (nid == nodeid)
232 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
233 else
234 bootmap_start = round_up(start, PAGE_SIZE);
224 /* 235 /*
225 * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like 236 * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like
226 * to use that to align to PAGE_SIZE 237 * to use that to align to PAGE_SIZE
@@ -245,10 +256,29 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
245 256
246 free_bootmem_with_active_regions(nodeid, end); 257 free_bootmem_with_active_regions(nodeid, end);
247 258
248 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size, 259 /*
249 BOOTMEM_DEFAULT); 260 * convert early reserve to bootmem reserve earlier
250 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, 261 * otherwise early_node_mem could use early reserved mem
251 bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT); 262 * on previous node
263 */
264 early_res_to_bootmem(start, end);
265
266 /*
267 * in some case early_node_mem could use alloc_bootmem
268 * to get range on other node, don't reserve that again
269 */
270 if (nid != nodeid)
271 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
272 else
273 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
274 pgdat_size, BOOTMEM_DEFAULT);
275 nid = phys_to_nid(bootmap_start);
276 if (nid != nodeid)
277 printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
278 else
279 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
280 bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
281
252#ifdef CONFIG_ACPI_NUMA 282#ifdef CONFIG_ACPI_NUMA
253 srat_reserve_add_area(nodeid); 283 srat_reserve_add_area(nodeid);
254#endif 284#endif
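
The setup_node_bootmem() changes above handle the case where alloc_bootmem hands back NODE_DATA (or the bootmap) on a different node than the one being set up: the bootmap is placed right after nodedata_phys only when nodedata_phys really is on this node, otherwise at the start of the node's own range, and the bootmem reservation is skipped for memory another node owns. A compact sketch of that placement decision, with round_up() and a hypothetical two-node phys_to_nid() as stand-ins:

#include <stdio.h>

#define PAGE_SIZE  4096ULL
#define NODE_SPLIT (4ULL << 30)          /* hypothetical: node 0 below 4 GB */

static unsigned long long round_up_to(unsigned long long x, unsigned long long a)
{
        return (x + a - 1) & ~(a - 1);
}

static int phys_to_nid(unsigned long long phys)
{
        return phys < NODE_SPLIT ? 0 : 1;
}

/* Mirrors the bootmap_start choice added in setup_node_bootmem(). */
static void place_bootmap(int nodeid, unsigned long long start,
                          unsigned long long nodedata_phys,
                          unsigned long long pgdat_size)
{
        unsigned long long bootmap_start;
        int nid = phys_to_nid(nodedata_phys);

        if (nid == nodeid)
                bootmap_start = round_up_to(nodedata_phys + pgdat_size, PAGE_SIZE);
        else
                bootmap_start = round_up_to(start, PAGE_SIZE);

        printf("node %d: NODE_DATA at %#llx is on node %d -> bootmap at %#llx%s\n",
               nodeid, nodedata_phys, nid, bootmap_start,
               nid == nodeid ? "" : " (NODE_DATA reservation skipped here)");
}

int main(void)
{
        /* node 1 starts at 4 GB; first call got node-local NODE_DATA,
         * second call got it from node 0 via alloc_bootmem. */
        place_bootmap(1, NODE_SPLIT, NODE_SPLIT + 0x1000, 0x4000);
        place_bootmap(1, NODE_SPLIT, 0x200000, 0x4000);
        return 0;
}
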
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index f7823a172868..60bcb5b6a37e 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -483,9 +483,7 @@ static int split_large_page(pte_t *kpte, unsigned long address)
483 goto out_unlock; 483 goto out_unlock;
484 484
485 pbase = (pte_t *)page_address(base); 485 pbase = (pte_t *)page_address(base);
486#ifdef CONFIG_X86_32 486 paravirt_alloc_pte(&init_mm, page_to_pfn(base));
487 paravirt_alloc_pt(&init_mm, page_to_pfn(base));
488#endif
489 ref_prot = pte_pgprot(pte_clrhuge(*kpte)); 487 ref_prot = pte_pgprot(pte_clrhuge(*kpte));
490 488
491#ifdef CONFIG_X86_64 489#ifdef CONFIG_X86_64
@@ -779,14 +777,20 @@ static inline int change_page_attr_clear(unsigned long addr, int numpages,
779 777
780int _set_memory_uc(unsigned long addr, int numpages) 778int _set_memory_uc(unsigned long addr, int numpages)
781{ 779{
780 /*
781 * for now UC MINUS. see comments in ioremap_nocache()
782 */
782 return change_page_attr_set(addr, numpages, 783 return change_page_attr_set(addr, numpages,
783 __pgprot(_PAGE_CACHE_UC)); 784 __pgprot(_PAGE_CACHE_UC_MINUS));
784} 785}
785 786
786int set_memory_uc(unsigned long addr, int numpages) 787int set_memory_uc(unsigned long addr, int numpages)
787{ 788{
789 /*
790 * for now UC MINUS. see comments in ioremap_nocache()
791 */
788 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, 792 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
789 _PAGE_CACHE_UC, NULL)) 793 _PAGE_CACHE_UC_MINUS, NULL))
790 return -EINVAL; 794 return -EINVAL;
791 795
792 return _set_memory_uc(addr, numpages); 796 return _set_memory_uc(addr, numpages);
@@ -993,7 +997,7 @@ static const struct file_operations dpa_fops = {
993 .release = single_release, 997 .release = single_release,
994}; 998};
995 999
996int __init debug_pagealloc_proc_init(void) 1000static int __init debug_pagealloc_proc_init(void)
997{ 1001{
998 struct dentry *de; 1002 struct dentry *de;
999 1003
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 72c0f6097402..277446cd30b6 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -11,16 +11,19 @@
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/gfp.h> 12#include <linux/gfp.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/bootmem.h>
14 15
15#include <asm/msr.h> 16#include <asm/msr.h>
16#include <asm/tlbflush.h> 17#include <asm/tlbflush.h>
17#include <asm/processor.h> 18#include <asm/processor.h>
19#include <asm/page.h>
18#include <asm/pgtable.h> 20#include <asm/pgtable.h>
19#include <asm/pat.h> 21#include <asm/pat.h>
20#include <asm/e820.h> 22#include <asm/e820.h>
21#include <asm/cacheflush.h> 23#include <asm/cacheflush.h>
22#include <asm/fcntl.h> 24#include <asm/fcntl.h>
23#include <asm/mtrr.h> 25#include <asm/mtrr.h>
26#include <asm/io.h>
24 27
25int pat_wc_enabled = 1; 28int pat_wc_enabled = 1;
26 29
@@ -190,6 +193,21 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
190 return 0; 193 return 0;
191} 194}
192 195
196/*
197 * req_type typically has one of the:
198 * - _PAGE_CACHE_WB
199 * - _PAGE_CACHE_WC
200 * - _PAGE_CACHE_UC_MINUS
201 * - _PAGE_CACHE_UC
202 *
203 * req_type will have a special case value '-1', when requester want to inherit
204 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
205 *
206 * If ret_type is NULL, function will return an error if it cannot reserve the
207 * region with req_type. If ret_type is non-null, function will return
208 * available type in ret_type in case of no error. In case of any error
209 * it will return a negative return value.
210 */
193int reserve_memtype(u64 start, u64 end, unsigned long req_type, 211int reserve_memtype(u64 start, u64 end, unsigned long req_type,
194 unsigned long *ret_type) 212 unsigned long *ret_type)
195{ 213{
@@ -200,9 +218,14 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
200 218
201 /* Only track when pat_wc_enabled */ 219 /* Only track when pat_wc_enabled */
202 if (!pat_wc_enabled) { 220 if (!pat_wc_enabled) {
203 if (ret_type) 221 /* This is identical to page table setting without PAT */
204 *ret_type = req_type; 222 if (ret_type) {
205 223 if (req_type == -1) {
224 *ret_type = _PAGE_CACHE_WB;
225 } else {
226 *ret_type = req_type;
227 }
228 }
206 return 0; 229 return 0;
207 } 230 }
208 231
@@ -214,8 +237,29 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
214 return 0; 237 return 0;
215 } 238 }
216 239
217 req_type &= _PAGE_CACHE_MASK; 240 if (req_type == -1) {
218 err = pat_x_mtrr_type(start, end, req_type, &actual_type); 241 /*
242 * Special case where caller wants to inherit from mtrr or
243 * existing pat mapping, defaulting to UC_MINUS in case of
244 * no match.
245 */
246 u8 mtrr_type = mtrr_type_lookup(start, end);
247 if (mtrr_type == 0xFE) { /* MTRR match error */
248 err = -1;
249 }
250
251 if (mtrr_type == MTRR_TYPE_WRBACK) {
252 req_type = _PAGE_CACHE_WB;
253 actual_type = _PAGE_CACHE_WB;
254 } else {
255 req_type = _PAGE_CACHE_UC_MINUS;
256 actual_type = _PAGE_CACHE_UC_MINUS;
257 }
258 } else {
259 req_type &= _PAGE_CACHE_MASK;
260 err = pat_x_mtrr_type(start, end, req_type, &actual_type);
261 }
262
219 if (err) { 263 if (err) {
220 if (ret_type) 264 if (ret_type)
221 *ret_type = actual_type; 265 *ret_type = actual_type;
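
reserve_memtype() gains a special req_type value of -1: the caller asks to inherit the effective memory type instead of imposing one, and the new branch resolves it to WB when mtrr_type_lookup() reports write-back for the range and to UC_MINUS otherwise (0xFE flags an MTRR lookup error). A hedged sketch of just that resolution step, with the MTRR lookup results fed in as plain values and the error handling simplified to an early return:

#include <stdio.h>

/* Cache attribute encodings; values here are illustrative only. */
enum { CACHE_WB, CACHE_WC, CACHE_UC_MINUS, CACHE_UC };

static const char *cattr_name(int t)
{
        static const char *n[] = {
                "write-back", "write-combine", "uncached-minus", "uncached"
        };
        return n[t];
}

#define MTRR_TYPE_WRBACK 6      /* real MTRR encoding for write-back */
#define MTRR_MATCH_ERROR 0xFE   /* the error value checked in the patch */

/* Mirrors the req_type == -1 branch added to reserve_memtype():
 * inherit WB only when the MTRRs say WB, else fall back to UC_MINUS. */
static int resolve_inherited_type(unsigned char mtrr_type, int *actual_type)
{
        if (mtrr_type == MTRR_MATCH_ERROR)
                return -1;
        *actual_type = (mtrr_type == MTRR_TYPE_WRBACK) ? CACHE_WB : CACHE_UC_MINUS;
        return 0;
}

int main(void)
{
        unsigned char lookups[] = { MTRR_TYPE_WRBACK, 0, MTRR_MATCH_ERROR };

        for (unsigned int i = 0; i < sizeof(lookups) / sizeof(lookups[0]); i++) {
                int type;

                if (resolve_inherited_type(lookups[i], &type) < 0)
                        printf("mtrr lookup %#x -> error\n", (unsigned)lookups[i]);
                else
                        printf("mtrr lookup %#x -> %s\n",
                               (unsigned)lookups[i], cattr_name(type));
        }
        return 0;
}
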
@@ -241,7 +285,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
241 struct memtype *saved_ptr; 285 struct memtype *saved_ptr;
242 286
243 if (parse->start >= end) { 287 if (parse->start >= end) {
244 printk("New Entry\n"); 288 pr_debug("New Entry\n");
245 list_add(&new_entry->nd, parse->nd.prev); 289 list_add(&new_entry->nd, parse->nd.prev);
246 new_entry = NULL; 290 new_entry = NULL;
247 break; 291 break;
@@ -291,7 +335,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
291 break; 335 break;
292 } 336 }
293 337
294 printk("Overlap at 0x%Lx-0x%Lx\n", 338 pr_debug("Overlap at 0x%Lx-0x%Lx\n",
295 saved_ptr->start, saved_ptr->end); 339 saved_ptr->start, saved_ptr->end);
296 /* No conflict. Go ahead and add this new entry */ 340 /* No conflict. Go ahead and add this new entry */
297 list_add(&new_entry->nd, saved_ptr->nd.prev); 341 list_add(&new_entry->nd, saved_ptr->nd.prev);
@@ -343,8 +387,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
343 break; 387 break;
344 } 388 }
345 389
346 printk("Overlap at 0x%Lx-0x%Lx\n", 390 pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
347 saved_ptr->start, saved_ptr->end); 391 saved_ptr->start, saved_ptr->end);
348 /* No conflict. Go ahead and add this new entry */ 392 /* No conflict. Go ahead and add this new entry */
349 list_add(&new_entry->nd, &saved_ptr->nd); 393 list_add(&new_entry->nd, &saved_ptr->nd);
350 new_entry = NULL; 394 new_entry = NULL;
@@ -353,7 +397,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
353 } 397 }
354 398
355 if (err) { 399 if (err) {
356 printk( 400 printk(KERN_INFO
357 "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", 401 "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
358 start, end, cattr_name(new_entry->type), 402 start, end, cattr_name(new_entry->type),
359 cattr_name(req_type)); 403 cattr_name(req_type));
@@ -365,16 +409,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
365 if (new_entry) { 409 if (new_entry) {
366 /* No conflict. Not yet added to the list. Add to the tail */ 410 /* No conflict. Not yet added to the list. Add to the tail */
367 list_add_tail(&new_entry->nd, &memtype_list); 411 list_add_tail(&new_entry->nd, &memtype_list);
368 printk("New Entry\n"); 412 pr_debug("New Entry\n");
369 } 413 }
370 414
371 if (ret_type) { 415 if (ret_type) {
372 printk( 416 pr_debug(
373 "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", 417 "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
374 start, end, cattr_name(actual_type), 418 start, end, cattr_name(actual_type),
375 cattr_name(req_type), cattr_name(*ret_type)); 419 cattr_name(req_type), cattr_name(*ret_type));
376 } else { 420 } else {
377 printk( 421 pr_debug(
378 "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n", 422 "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
379 start, end, cattr_name(actual_type), 423 start, end, cattr_name(actual_type),
380 cattr_name(req_type)); 424 cattr_name(req_type));
@@ -411,11 +455,142 @@ int free_memtype(u64 start, u64 end)
411 spin_unlock(&memtype_lock); 455 spin_unlock(&memtype_lock);
412 456
413 if (err) { 457 if (err) {
414 printk(KERN_DEBUG "%s:%d freeing invalid memtype %Lx-%Lx\n", 458 printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
415 current->comm, current->pid, start, end); 459 current->comm, current->pid, start, end);
416 } 460 }
417 461
418 printk( "free_memtype request 0x%Lx-0x%Lx\n", start, end); 462 pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
419 return err; 463 return err;
420} 464}
421 465
466
467/*
468 * /dev/mem mmap interface. The memtype used for mapping varies:
469 * - Use UC for mappings with O_SYNC flag
470 * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
471 * inherit the memtype from existing mapping.
472 * - Else use UC_MINUS memtype (for backward compatibility with existing
473 * X drivers.
474 */
475pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
476 unsigned long size, pgprot_t vma_prot)
477{
478 return vma_prot;
479}
480
481#ifdef CONFIG_NONPROMISC_DEVMEM
482/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/
483static inline int range_is_allowed(unsigned long pfn, unsigned long size)
484{
485 return 1;
486}
487#else
488static inline int range_is_allowed(unsigned long pfn, unsigned long size)
489{
490 u64 from = ((u64)pfn) << PAGE_SHIFT;
491 u64 to = from + size;
492 u64 cursor = from;
493
494 while (cursor < to) {
495 if (!devmem_is_allowed(pfn)) {
496 printk(KERN_INFO
497 "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
498 current->comm, from, to);
499 return 0;
500 }
501 cursor += PAGE_SIZE;
502 pfn++;
503 }
504 return 1;
505}
506#endif /* CONFIG_NONPROMISC_DEVMEM */
507
508int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
509 unsigned long size, pgprot_t *vma_prot)
510{
511 u64 offset = ((u64) pfn) << PAGE_SHIFT;
512 unsigned long flags = _PAGE_CACHE_UC_MINUS;
513 int retval;
514
515 if (!range_is_allowed(pfn, size))
516 return 0;
517
518 if (file->f_flags & O_SYNC) {
519 flags = _PAGE_CACHE_UC;
520 }
521
522#ifdef CONFIG_X86_32
523 /*
524 * On the PPro and successors, the MTRRs are used to set
525 * memory types for physical addresses outside main memory,
526 * so blindly setting UC or PWT on those pages is wrong.
527 * For Pentiums and earlier, the surround logic should disable
528 * caching for the high addresses through the KEN pin, but
529 * we maintain the tradition of paranoia in this code.
530 */
531 if (!pat_wc_enabled &&
532 ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
533 test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
534 test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
535 test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
536 (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
537 flags = _PAGE_CACHE_UC;
538 }
539#endif
540
541 /*
542 * With O_SYNC, we can only take UC mapping. Fail if we cannot.
543 * Without O_SYNC, we want to get
544 * - WB for WB-able memory and no other conflicting mappings
545 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
546 * - Inherit from confliting mappings otherwise
547 */
548 if (flags != _PAGE_CACHE_UC_MINUS) {
549 retval = reserve_memtype(offset, offset + size, flags, NULL);
550 } else {
551 retval = reserve_memtype(offset, offset + size, -1, &flags);
552 }
553
554 if (retval < 0)
555 return 0;
556
557 if (pfn <= max_pfn_mapped &&
558 ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
559 free_memtype(offset, offset + size);
560 printk(KERN_INFO
561 "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
562 current->comm, current->pid,
563 cattr_name(flags),
564 offset, offset + size);
565 return 0;
566 }
567
568 *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
569 flags);
570 return 1;
571}
572
573void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
574{
575 u64 addr = (u64)pfn << PAGE_SHIFT;
576 unsigned long flags;
577 unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
578
579 reserve_memtype(addr, addr + size, want_flags, &flags);
580 if (flags != want_flags) {
581 printk(KERN_INFO
582 "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n",
583 current->comm, current->pid,
584 cattr_name(want_flags),
585 addr, addr + size,
586 cattr_name(flags));
587 }
588}
589
590void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
591{
592 u64 addr = (u64)pfn << PAGE_SHIFT;
593
594 free_memtype(addr, addr + size);
595}
596
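
phys_mem_access_prot_allowed() above picks the memtype for a /dev/mem mmap: O_SYNC forces strong UC, otherwise the request goes through reserve_memtype() with -1 so the mapping inherits from the MTRRs or an existing PAT entry, defaulting to UC_MINUS, and the result is folded into vma_prot. A trimmed-down model of that selection; reserve_memtype() is faked to always succeed and to resolve the inherit request to UC_MINUS, and the 32-bit legacy checks and the identity-map fixup are left out:

#include <stdio.h>

enum { CACHE_WB, CACHE_WC, CACHE_UC_MINUS, CACHE_UC };
#define REQ_INHERIT (-1)

static const char *cattr_name(int t)
{
        static const char *n[] = { "WB", "WC", "UC-", "UC" };
        return n[t];
}

/* Fake reserve_memtype(): always succeeds, and resolves the inherit
 * request to UC_MINUS, as the patch does when nothing conflicts and
 * the MTRRs are not write-back. */
static int reserve_memtype_stub(int req_type, int *ret_type)
{
        if (ret_type)
                *ret_type = (req_type == REQ_INHERIT) ? CACHE_UC_MINUS : req_type;
        return 0;
}

/* Mirrors the O_SYNC vs. inherit split in phys_mem_access_prot_allowed(). */
static int pick_devmem_type(int o_sync, int *flags)
{
        *flags = o_sync ? CACHE_UC : CACHE_UC_MINUS;

        if (*flags != CACHE_UC_MINUS)
                return reserve_memtype_stub(*flags, NULL);      /* must get UC */
        return reserve_memtype_stub(REQ_INHERIT, flags);        /* inherit type */
}

int main(void)
{
        for (int o_sync = 0; o_sync <= 1; o_sync++) {
                int flags;

                if (pick_devmem_type(o_sync, &flags) == 0)
                        printf("O_SYNC=%d -> mapping type %s\n",
                               o_sync, cattr_name(flags));
        }
        return 0;
}
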
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
new file mode 100644
index 000000000000..50159764f694
--- /dev/null
+++ b/arch/x86/mm/pgtable.c
@@ -0,0 +1,276 @@
1#include <linux/mm.h>
2#include <asm/pgalloc.h>
3#include <asm/pgtable.h>
4#include <asm/tlb.h>
5
6pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
7{
8 return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
9}
10
11pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
12{
13 struct page *pte;
14
15#ifdef CONFIG_HIGHPTE
16 pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
17#else
18 pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
19#endif
20 if (pte)
21 pgtable_page_ctor(pte);
22 return pte;
23}
24
25void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
26{
27 pgtable_page_dtor(pte);
28 paravirt_release_pte(page_to_pfn(pte));
29 tlb_remove_page(tlb, pte);
30}
31
32#if PAGETABLE_LEVELS > 2
33void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
34{
35 paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
36 tlb_remove_page(tlb, virt_to_page(pmd));
37}
38
39#if PAGETABLE_LEVELS > 3
40void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
41{
42 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
43 tlb_remove_page(tlb, virt_to_page(pud));
44}
45#endif /* PAGETABLE_LEVELS > 3 */
46#endif /* PAGETABLE_LEVELS > 2 */
47
48static inline void pgd_list_add(pgd_t *pgd)
49{
50 struct page *page = virt_to_page(pgd);
51
52 list_add(&page->lru, &pgd_list);
53}
54
55static inline void pgd_list_del(pgd_t *pgd)
56{
57 struct page *page = virt_to_page(pgd);
58
59 list_del(&page->lru);
60}
61
62#define UNSHARED_PTRS_PER_PGD \
63 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
64
65static void pgd_ctor(void *p)
66{
67 pgd_t *pgd = p;
68 unsigned long flags;
69
70 /* Clear usermode parts of PGD */
71 memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
72
73 spin_lock_irqsave(&pgd_lock, flags);
74
75 /* If the pgd points to a shared pagetable level (either the
76 ptes in non-PAE, or shared PMD in PAE), then just copy the
77 references from swapper_pg_dir. */
78 if (PAGETABLE_LEVELS == 2 ||
79 (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
80 PAGETABLE_LEVELS == 4) {
81 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
82 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
83 KERNEL_PGD_PTRS);
84 paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
85 __pa(swapper_pg_dir) >> PAGE_SHIFT,
86 KERNEL_PGD_BOUNDARY,
87 KERNEL_PGD_PTRS);
88 }
89
90 /* list required to sync kernel mapping updates */
91 if (!SHARED_KERNEL_PMD)
92 pgd_list_add(pgd);
93
94 spin_unlock_irqrestore(&pgd_lock, flags);
95}
96
97static void pgd_dtor(void *pgd)
98{
99 unsigned long flags; /* can be called from interrupt context */
100
101 if (SHARED_KERNEL_PMD)
102 return;
103
104 spin_lock_irqsave(&pgd_lock, flags);
105 pgd_list_del(pgd);
106 spin_unlock_irqrestore(&pgd_lock, flags);
107}
108
109/*
110 * List of all pgd's needed for non-PAE so it can invalidate entries
111 * in both cached and uncached pgd's; not needed for PAE since the
112 * kernel pmd is shared. If PAE were not to share the pmd a similar
113 * tactic would be needed. This is essentially codepath-based locking
114 * against pageattr.c; it is the unique case in which a valid change
115 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
116 * vmalloc faults work because attached pagetables are never freed.
117 * -- wli
118 */
119
120#ifdef CONFIG_X86_PAE
121/*
122 * Mop up any pmd pages which may still be attached to the pgd.
123 * Normally they will be freed by munmap/exit_mmap, but any pmd we
124 * preallocate which never got a corresponding vma will need to be
125 * freed manually.
126 */
127static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
128{
129 int i;
130
131 for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
132 pgd_t pgd = pgdp[i];
133
134 if (pgd_val(pgd) != 0) {
135 pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
136
137 pgdp[i] = native_make_pgd(0);
138
139 paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
140 pmd_free(mm, pmd);
141 }
142 }
143}
144
145/*
146 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
147 * updating the top-level pagetable entries to guarantee the
148 * processor notices the update. Since this is expensive, and
149 * all 4 top-level entries are used almost immediately in a
150 * new process's life, we just pre-populate them here.
151 *
152 * Also, if we're in a paravirt environment where the kernel pmd is
153 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
154 * and initialize the kernel pmds here.
155 */
156static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
157{
158 pud_t *pud;
159 unsigned long addr;
160 int i;
161
162 pud = pud_offset(pgd, 0);
163 for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
164 i++, pud++, addr += PUD_SIZE) {
165 pmd_t *pmd = pmd_alloc_one(mm, addr);
166
167 if (!pmd) {
168 pgd_mop_up_pmds(mm, pgd);
169 return 0;
170 }
171
172 if (i >= KERNEL_PGD_BOUNDARY)
173 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
174 sizeof(pmd_t) * PTRS_PER_PMD);
175
176 pud_populate(mm, pud, pmd);
177 }
178
179 return 1;
180}
181
182void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
183{
184 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
185
186 /* Note: almost everything apart from _PAGE_PRESENT is
187 reserved at the pmd (PDPT) level. */
188 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
189
190 /*
191 * According to Intel App note "TLBs, Paging-Structure Caches,
192 * and Their Invalidation", April 2007, document 317080-001,
193 * section 8.1: in PAE mode we explicitly have to flush the
194 * TLB via cr3 if the top-level pgd is changed...
195 */
196 if (mm == current->active_mm)
197 write_cr3(read_cr3());
198}
199#else /* !CONFIG_X86_PAE */
200/* No need to prepopulate any pagetable entries in non-PAE modes. */
201static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
202{
203 return 1;
204}
205
206static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
207{
208}
209#endif /* CONFIG_X86_PAE */
210
211pgd_t *pgd_alloc(struct mm_struct *mm)
212{
213 pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
214
215 /* so that alloc_pmd can use it */
216 mm->pgd = pgd;
217 if (pgd)
218 pgd_ctor(pgd);
219
220 if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
221 pgd_dtor(pgd);
222 free_page((unsigned long)pgd);
223 pgd = NULL;
224 }
225
226 return pgd;
227}
228
229void pgd_free(struct mm_struct *mm, pgd_t *pgd)
230{
231 pgd_mop_up_pmds(mm, pgd);
232 pgd_dtor(pgd);
233 free_page((unsigned long)pgd);
234}
235
236int ptep_set_access_flags(struct vm_area_struct *vma,
237 unsigned long address, pte_t *ptep,
238 pte_t entry, int dirty)
239{
240 int changed = !pte_same(*ptep, entry);
241
242 if (changed && dirty) {
243 *ptep = entry;
244 pte_update_defer(vma->vm_mm, address, ptep);
245 flush_tlb_page(vma, address);
246 }
247
248 return changed;
249}
250
251int ptep_test_and_clear_young(struct vm_area_struct *vma,
252 unsigned long addr, pte_t *ptep)
253{
254 int ret = 0;
255
256 if (pte_young(*ptep))
257 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
258 &ptep->pte);
259
260 if (ret)
261 pte_update(vma->vm_mm, addr, ptep);
262
263 return ret;
264}
265
266int ptep_clear_flush_young(struct vm_area_struct *vma,
267 unsigned long address, pte_t *ptep)
268{
269 int young;
270
271 young = ptep_test_and_clear_young(vma, address, ptep);
272 if (young)
273 flush_tlb_page(vma, address);
274
275 return young;
276}
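
The consolidated pgd_ctor() in the new pgtable.c clears the user slots of a fresh pgd and clones the kernel slots from swapper_pg_dir, with KERNEL_PGD_BOUNDARY (the pgd index of PAGE_OFFSET) replacing the USER_PTRS_PER_PGD bound used by the old pgtable_32.c code. For the classic 32-bit non-PAE 3G/1G split that boundary is pgd index 768; the standalone arithmetic below reproduces it (constants hard-coded for that configuration, PAE and 64-bit use different shifts):

#include <stdio.h>

/* 32-bit non-PAE constants: 4 MB per pgd entry, 1024 entries, kernel at 3 GB. */
#define PGDIR_SHIFT   22
#define PTRS_PER_PGD  1024
#define PAGE_OFFSET   0xC0000000UL

#define pgd_index(addr)      (((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
#define KERNEL_PGD_BOUNDARY  pgd_index(PAGE_OFFSET)
#define KERNEL_PGD_PTRS      (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)

int main(void)
{
        printf("user slots   : 0 .. %lu  (memset to 0 by pgd_ctor)\n",
               KERNEL_PGD_BOUNDARY - 1);
        printf("kernel slots : %lu .. %d (cloned from swapper_pg_dir)\n",
               KERNEL_PGD_BOUNDARY, PTRS_PER_PGD - 1);
        printf("KERNEL_PGD_PTRS = %lu\n", KERNEL_PGD_PTRS);
        return 0;
}
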
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 6fb9e7c6893f..9ee007be9142 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -173,210 +173,6 @@ void reserve_top_address(unsigned long reserve)
173 __VMALLOC_RESERVE += reserve; 173 __VMALLOC_RESERVE += reserve;
174} 174}
175 175
176pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
177{
178 return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
179}
180
181pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
182{
183 struct page *pte;
184
185#ifdef CONFIG_HIGHPTE
186 pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
187#else
188 pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
189#endif
190 if (pte)
191 pgtable_page_ctor(pte);
192 return pte;
193}
194
195/*
196 * List of all pgd's needed for non-PAE so it can invalidate entries
197 * in both cached and uncached pgd's; not needed for PAE since the
198 * kernel pmd is shared. If PAE were not to share the pmd a similar
199 * tactic would be needed. This is essentially codepath-based locking
200 * against pageattr.c; it is the unique case in which a valid change
201 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
202 * vmalloc faults work because attached pagetables are never freed.
203 * -- wli
204 */
205static inline void pgd_list_add(pgd_t *pgd)
206{
207 struct page *page = virt_to_page(pgd);
208
209 list_add(&page->lru, &pgd_list);
210}
211
212static inline void pgd_list_del(pgd_t *pgd)
213{
214 struct page *page = virt_to_page(pgd);
215
216 list_del(&page->lru);
217}
218
219#define UNSHARED_PTRS_PER_PGD \
220 (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
221
222static void pgd_ctor(void *p)
223{
224 pgd_t *pgd = p;
225 unsigned long flags;
226
227 /* Clear usermode parts of PGD */
228 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
229
230 spin_lock_irqsave(&pgd_lock, flags);
231
232 /* If the pgd points to a shared pagetable level (either the
233 ptes in non-PAE, or shared PMD in PAE), then just copy the
234 references from swapper_pg_dir. */
235 if (PAGETABLE_LEVELS == 2 ||
236 (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
237 clone_pgd_range(pgd + USER_PTRS_PER_PGD,
238 swapper_pg_dir + USER_PTRS_PER_PGD,
239 KERNEL_PGD_PTRS);
240 paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
241 __pa(swapper_pg_dir) >> PAGE_SHIFT,
242 USER_PTRS_PER_PGD,
243 KERNEL_PGD_PTRS);
244 }
245
246 /* list required to sync kernel mapping updates */
247 if (!SHARED_KERNEL_PMD)
248 pgd_list_add(pgd);
249
250 spin_unlock_irqrestore(&pgd_lock, flags);
251}
252
253static void pgd_dtor(void *pgd)
254{
255 unsigned long flags; /* can be called from interrupt context */
256
257 if (SHARED_KERNEL_PMD)
258 return;
259
260 spin_lock_irqsave(&pgd_lock, flags);
261 pgd_list_del(pgd);
262 spin_unlock_irqrestore(&pgd_lock, flags);
263}
264
265#ifdef CONFIG_X86_PAE
266/*
267 * Mop up any pmd pages which may still be attached to the pgd.
268 * Normally they will be freed by munmap/exit_mmap, but any pmd we
269 * preallocate which never got a corresponding vma will need to be
270 * freed manually.
271 */
272static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
273{
274 int i;
275
276 for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
277 pgd_t pgd = pgdp[i];
278
279 if (pgd_val(pgd) != 0) {
280 pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
281
282 pgdp[i] = native_make_pgd(0);
283
284 paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
285 pmd_free(mm, pmd);
286 }
287 }
288}
289
290/*
291 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
292 * updating the top-level pagetable entries to guarantee the
293 * processor notices the update. Since this is expensive, and
294 * all 4 top-level entries are used almost immediately in a
295 * new process's life, we just pre-populate them here.
296 *
297 * Also, if we're in a paravirt environment where the kernel pmd is
298 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
299 * and initialize the kernel pmds here.
300 */
301static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
302{
303 pud_t *pud;
304 unsigned long addr;
305 int i;
306
307 pud = pud_offset(pgd, 0);
308 for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
309 i++, pud++, addr += PUD_SIZE) {
310 pmd_t *pmd = pmd_alloc_one(mm, addr);
311
312 if (!pmd) {
313 pgd_mop_up_pmds(mm, pgd);
314 return 0;
315 }
316
317 if (i >= USER_PTRS_PER_PGD)
318 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
319 sizeof(pmd_t) * PTRS_PER_PMD);
320
321 pud_populate(mm, pud, pmd);
322 }
323
324 return 1;
325}
326#else /* !CONFIG_X86_PAE */
327/* No need to prepopulate any pagetable entries in non-PAE modes. */
328static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
329{
330 return 1;
331}
332
333static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
334{
335}
336#endif /* CONFIG_X86_PAE */
337
338pgd_t *pgd_alloc(struct mm_struct *mm)
339{
340 pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
341
342 /* so that alloc_pd can use it */
343 mm->pgd = pgd;
344 if (pgd)
345 pgd_ctor(pgd);
346
347 if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
348 pgd_dtor(pgd);
349 free_page((unsigned long)pgd);
350 pgd = NULL;
351 }
352
353 return pgd;
354}
355
356void pgd_free(struct mm_struct *mm, pgd_t *pgd)
357{
358 pgd_mop_up_pmds(mm, pgd);
359 pgd_dtor(pgd);
360 free_page((unsigned long)pgd);
361}
362
363void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
364{
365 pgtable_page_dtor(pte);
366 paravirt_release_pt(page_to_pfn(pte));
367 tlb_remove_page(tlb, pte);
368}
369
370#ifdef CONFIG_X86_PAE
371
372void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
373{
374 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
375 tlb_remove_page(tlb, virt_to_page(pmd));
376}
377
378#endif
379
380int pmd_bad(pmd_t pmd) 176int pmd_bad(pmd_t pmd)
381{ 177{
382 WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd)); 178 WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd));
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index fb43d89f46f3..3890234e5b26 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -163,7 +163,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
163 pxm, apic_id, node); 163 pxm, apic_id, node);
164} 164}
165 165
166int update_end_of_memory(unsigned long end) {return -1;} 166static int update_end_of_memory(unsigned long end) {return -1;}
167static int hotadd_enough_memory(struct bootnode *nd) {return 1;} 167static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
168#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 168#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
169static inline int save_add_info(void) {return 1;} 169static inline int save_add_info(void) {return 1;}