Diffstat (limited to 'arch/x86/mm/init_64.c')
 arch/x86/mm/init_64.c | 184
 1 file changed, 147 insertions(+), 37 deletions(-)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d3746efb060d..b8e461d49412 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -31,6 +31,7 @@
 #include <linux/nmi.h>
 
 #include <asm/processor.h>
+#include <asm/bios_ebda.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -88,6 +89,62 @@ early_param("gbpages", parse_direct_gbpages_on);
 
 int after_bootmem;
 
+pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+static int do_not_nx __cpuinitdata;
+
+/*
+ * noexec=on|off
+ * Control non-executable mappings for 64-bit processes.
+ *
+ * on	Enable (default)
+ * off	Disable
+ */
+static int __init nonx_setup(char *str)
+{
+	if (!str)
+		return -EINVAL;
+	if (!strncmp(str, "on", 2)) {
+		__supported_pte_mask |= _PAGE_NX;
+		do_not_nx = 0;
+	} else if (!strncmp(str, "off", 3)) {
+		do_not_nx = 1;
+		__supported_pte_mask &= ~_PAGE_NX;
+	}
+	return 0;
+}
+early_param("noexec", nonx_setup);
+
+void __cpuinit check_efer(void)
+{
+	unsigned long efer;
+
+	rdmsrl(MSR_EFER, efer);
+	if (!(efer & EFER_NX) || do_not_nx)
+		__supported_pte_mask &= ~_PAGE_NX;
+}
+
+int force_personality32;
+
+/*
+ * noexec32=on|off
+ * Control the non-executable heap for 32-bit processes.
+ * To control the stack too, use noexec=off.
+ *
+ * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
+ * off	PROT_READ implies PROT_EXEC
+ */
+static int __init nonx32_setup(char *str)
+{
+	if (!strcmp(str, "on"))
+		force_personality32 &= ~READ_IMPLIES_EXEC;
+	else if (!strcmp(str, "off"))
+		force_personality32 |= READ_IMPLIES_EXEC;
+	return 1;
+}
+__setup("noexec32=", nonx32_setup);
+
 /*
  * NOTE: This function is marked __ref because it calls __init function
  * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -139,9 +196,6 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
 	}
 
 	pte = pte_offset_kernel(pmd, vaddr);
-	if (!pte_none(*pte) && pte_val(new_pte) &&
-	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
-		pte_ERROR(*pte);
 	set_pte(pte, new_pte);
 
 	/*
@@ -225,7 +279,7 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
 void __init cleanup_highmap(void)
 {
 	unsigned long vaddr = __START_KERNEL_map;
-	unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
+	unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
 	pmd_t *pmd = level2_kernel_pgt;
 	pmd_t *last_pmd = pmd + PTRS_PER_PMD;
 
@@ -256,7 +310,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
 	if (pfn >= table_top)
 		panic("alloc_low_page: ran out of memory");
 
-	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
+	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
 	memset(adr, 0, PAGE_SIZE);
 	*phys = pfn * PAGE_SIZE;
 	return adr;
@@ -271,7 +325,8 @@ static __ref void unmap_low_page(void *adr)
 }
 
 static unsigned long __meminit
-phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
+phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
+	      pgprot_t prot)
 {
 	unsigned pages = 0;
 	unsigned long last_map_addr = end;
@@ -289,36 +344,43 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
 			break;
 		}
 
+		/*
+		 * We will re-use the existing mapping.
+		 * Xen for example has some special requirements, like mapping
+		 * pagetable pages as RO. So assume whoever pre-set up these
+		 * mappings knew what they were doing.
+		 */
 		if (pte_val(*pte))
 			continue;
 
 		if (0)
 			printk("	pte=%p addr=%lx pte=%016lx\n",
 				pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
-		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
-		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
 		pages++;
+		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
+		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
 	}
+
 	update_page_count(PG_LEVEL_4K, pages);
 
 	return last_map_addr;
 }
 
 static unsigned long __meminit
-phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
+phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
+		pgprot_t prot)
 {
 	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
 
-	return phys_pte_init(pte, address, end);
+	return phys_pte_init(pte, address, end, prot);
 }
 
 static unsigned long __meminit
 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
-	      unsigned long page_size_mask)
+	      unsigned long page_size_mask, pgprot_t prot)
 {
 	unsigned long pages = 0;
 	unsigned long last_map_addr = end;
-	unsigned long start = address;
 
 	int i = pmd_index(address);
 
@@ -326,6 +388,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 		unsigned long pte_phys;
 		pmd_t *pmd = pmd_page + pmd_index(address);
 		pte_t *pte;
+		pgprot_t new_prot = prot;
 
 		if (address >= end) {
 			if (!after_bootmem) {
@@ -339,27 +402,40 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 			if (!pmd_large(*pmd)) {
 				spin_lock(&init_mm.page_table_lock);
 				last_map_addr = phys_pte_update(pmd, address,
-								end);
+								end, prot);
 				spin_unlock(&init_mm.page_table_lock);
+				continue;
 			}
-			/* Count entries we're using from level2_ident_pgt */
-			if (start == 0)
-				pages++;
-			continue;
+			/*
+			 * If we are OK with a PG_LEVEL_2M mapping, we will
+			 * use the existing mapping.
+			 *
+			 * Otherwise, we will split the large page mapping but
+			 * use the same existing protection bits, except for
+			 * the large-page bit, so that we don't violate Intel's
+			 * TLB Application note (317080), which says that while
+			 * changing the page sizes, new and old translations
+			 * should not differ with respect to page frame and
+			 * attributes.
+			 */
+			if (page_size_mask & (1 << PG_LEVEL_2M))
+				continue;
+			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
 		}
 
 		if (page_size_mask & (1<<PG_LEVEL_2M)) {
 			pages++;
 			spin_lock(&init_mm.page_table_lock);
 			set_pte((pte_t *)pmd,
-				pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+				pfn_pte(address >> PAGE_SHIFT,
+					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
 			spin_unlock(&init_mm.page_table_lock);
 			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
 			continue;
 		}
 
 		pte = alloc_low_page(&pte_phys);
-		last_map_addr = phys_pte_init(pte, address, end);
+		last_map_addr = phys_pte_init(pte, address, end, new_prot);
 		unmap_low_page(pte);
 
 		spin_lock(&init_mm.page_table_lock);
@@ -372,12 +448,12 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 
 static unsigned long __meminit
 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
-		unsigned long page_size_mask)
+		unsigned long page_size_mask, pgprot_t prot)
 {
 	pmd_t *pmd = pmd_offset(pud, 0);
 	unsigned long last_map_addr;
 
-	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
+	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
 	__flush_tlb_all();
 	return last_map_addr;
 }
@@ -394,6 +470,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 		unsigned long pmd_phys;
 		pud_t *pud = pud_page + pud_index(addr);
 		pmd_t *pmd;
+		pgprot_t prot = PAGE_KERNEL;
 
 		if (addr >= end)
 			break;
@@ -405,10 +482,26 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 		}
 
 		if (pud_val(*pud)) {
-			if (!pud_large(*pud))
+			if (!pud_large(*pud)) {
 				last_map_addr = phys_pmd_update(pud, addr, end,
-							 page_size_mask);
+							 page_size_mask, prot);
 				continue;
+			}
+			/*
+			 * If we are OK with a PG_LEVEL_1G mapping, we will
+			 * use the existing mapping.
+			 *
+			 * Otherwise, we will split the gbpage mapping but use
+			 * the same existing protection bits, except for the
+			 * large-page bit, so that we don't violate Intel's TLB
+			 * Application note (317080), which says that while
+			 * changing the page sizes, new and old translations
+			 * should not differ with respect to page frame and
+			 * attributes.
+			 */
+			if (page_size_mask & (1 << PG_LEVEL_1G))
+				continue;
+			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
 		}
 
 		if (page_size_mask & (1<<PG_LEVEL_1G)) {
@@ -422,7 +515,8 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 		}
 
 		pmd = alloc_low_page(&pmd_phys);
-		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
+		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
+					      prot);
 		unmap_low_page(pmd);
 
 		spin_lock(&init_mm.page_table_lock);
@@ -430,6 +524,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 		spin_unlock(&init_mm.page_table_lock);
 	}
 	__flush_tlb_all();
+
 	update_page_count(PG_LEVEL_1G, pages);
 
 	return last_map_addr;
@@ -446,27 +541,28 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
 	return phys_pud_init(pud, addr, end, page_size_mask);
 }
 
-static void __init find_early_table_space(unsigned long end)
+static void __init find_early_table_space(unsigned long end, int use_pse,
+					  int use_gbpages)
 {
 	unsigned long puds, pmds, ptes, tables, start;
 
 	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
-	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
-	if (direct_gbpages) {
+	tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+	if (use_gbpages) {
 		unsigned long extra;
 		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
 		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
 	} else
 		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
-	tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
+	tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
 
-	if (cpu_has_pse) {
+	if (use_pse) {
 		unsigned long extra;
 		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
 		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	} else
 		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
+	tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
 
 	/*
 	 * RED-PEN putting page tables only on node 0 could
@@ -528,6 +624,7 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start,
 		pgd_populate(&init_mm, pgd, __va(pud_phys));
 		spin_unlock(&init_mm.page_table_lock);
 	}
+	__flush_tlb_all();
 
 	return last_map_addr;
 }
@@ -571,6 +668,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 
 	struct map_range mr[NR_RANGE_MR];
 	int nr_range, i;
+	int use_pse, use_gbpages;
 
 	printk(KERN_INFO "init_memory_mapping\n");
 
@@ -584,9 +682,21 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	if (!after_bootmem)
 		init_gbpages();
 
-	if (direct_gbpages)
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	/*
+	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+	 * This will simplify cpa(), which otherwise needs to support splitting
+	 * large pages into small pages in interrupt context, etc.
+	 */
+	use_pse = use_gbpages = 0;
+#else
+	use_pse = cpu_has_pse;
+	use_gbpages = direct_gbpages;
+#endif
+
+	if (use_gbpages)
 		page_size_mask |= 1 << PG_LEVEL_1G;
-	if (cpu_has_pse)
+	if (use_pse)
 		page_size_mask |= 1 << PG_LEVEL_2M;
 
 	memset(mr, 0, sizeof(mr));
@@ -636,7 +746,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 			old_start = mr[i].start;
 			memmove(&mr[i], &mr[i+1],
 				(nr_range - 1 - i) * sizeof (struct map_range));
-			mr[i].start = old_start;
+			mr[i--].start = old_start;
 			nr_range--;
 		}
 
@@ -647,7 +757,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 			(mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
 
 	if (!after_bootmem)
-		find_early_table_space(end);
+		find_early_table_space(end, use_pse, use_gbpages);
 
 	for (i = 0; i < nr_range; i++)
 		last_map_addr = kernel_physical_mapping_init(
@@ -769,6 +879,8 @@ void __init mem_init(void)
 {
 	long codesize, reservedpages, datasize, initsize;
 
+	start_periodic_check_for_corruption();
+
 	pci_iommu_alloc();
 
 	/* clear_bss() already clear the empty_zero_page */
@@ -806,8 +918,6 @@ void __init mem_init(void)
 		reservedpages << (PAGE_SHIFT-10),
 		datasize >> 10,
 		initsize >> 10);
-
-	cpa_init();
 }
 
 void free_init_pages(char *what, unsigned long begin, unsigned long end)