aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/mm/init_64.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm/init_64.c')
-rw-r--r--arch/x86/mm/init_64.c174
1 files changed, 142 insertions, 32 deletions
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d3746efb060d..83e13f2d53d2 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -88,6 +88,62 @@ early_param("gbpages", parse_direct_gbpages_on);
88 88
89int after_bootmem; 89int after_bootmem;
90 90
/*
 * Starts with every bit set. Within this section the only bit that is
 * ever set or cleared in it is _PAGE_NX (see nonx_setup() and
 * check_efer()). Exported to GPL modules.
 */
91unsigned long __supported_pte_mask __read_mostly = ~0UL;
92EXPORT_SYMBOL_GPL(__supported_pte_mask);
93
/*
 * Set to 1 by "noexec=off" and back to 0 by "noexec=on" in nonx_setup().
 * check_efer() clears _PAGE_NX from __supported_pte_mask whenever this
 * is non-zero.
 */
94static int do_not_nx __cpuinitdata;
95
96/*
97 * noexec=on|off
98 * Control non-executable mappings for 64-bit processes.
99 *
100 * on Enable (default)
101 * off Disable
102 */
/*
 * Handler registered for the "noexec" early parameter.
 *
 * str: the option value; a NULL pointer is rejected with -EINVAL.
 *
 * Only a prefix of the value is compared (strncmp with length 2 or 3), so
 * any value that merely starts with "on" or "off" is accepted as such.
 * A value matching neither leaves the state untouched and is still
 * reported as success.
 *
 * Returns 0, or -EINVAL when str is NULL.
 */
103static int __init nonx_setup(char *str)
104{
105 if (!str)
106 return -EINVAL;
107 if (!strncmp(str, "on", 2)) {
 /* Allow the NX bit in page-table entries and clear the override. */
108 __supported_pte_mask |= _PAGE_NX;
109 do_not_nx = 0;
110 } else if (!strncmp(str, "off", 3)) {
 /* Record the override (read by check_efer()) and mask out NX now. */
111 do_not_nx = 1;
112 __supported_pte_mask &= ~_PAGE_NX;
113 }
114 return 0;
115}
116early_param("noexec", nonx_setup);
117
/*
 * Read the EFER MSR and remove _PAGE_NX from __supported_pte_mask when
 * either the EFER_NX bit is not set in it or do_not_nx is non-zero
 * (do_not_nx is set by "noexec=off" in nonx_setup()).
 *
 * This function only ever clears the bit; it never sets it.
 */
118void __cpuinit check_efer(void)
119{
120 unsigned long efer;
121
122 rdmsrl(MSR_EFER, efer);
123 if (!(efer & EFER_NX) || do_not_nx)
124 __supported_pte_mask &= ~_PAGE_NX;
125}
126
/*
 * Flag word adjusted by the "noexec32=" option: nonx32_setup() sets or
 * clears READ_IMPLIES_EXEC in it. Zero-initialised as a file-scope int.
 * NOTE(review): presumably consumed when setting up 32-bit processes;
 * the consumer is not visible here - confirm.
 */
127int force_personality32;
128
129/*
130 * noexec32=on|off
131 * Control non-executable heap for 32-bit processes.
132 * To control the stack too, use noexec=off
133 *
134 * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
135 * off PROT_READ implies PROT_EXEC
136 */
/*
 * Handler registered for the "noexec32=" boot option.
 *
 * str: the option value. It is compared exactly (strcmp) against "on"
 *      and "off"; any other value leaves force_personality32 unchanged.
 *      str is not checked for NULL - NOTE(review): assumes the __setup
 *      machinery always passes a valid string for "noexec32="; confirm.
 *
 * Always returns 1.
 */
137static int __init nonx32_setup(char *str)
138{
139 if (!strcmp(str, "on"))
140 force_personality32 &= ~READ_IMPLIES_EXEC;
141 else if (!strcmp(str, "off"))
142 force_personality32 |= READ_IMPLIES_EXEC;
143 return 1;
144}
145__setup("noexec32=", nonx32_setup);
146
91/* 147/*
92 * NOTE: This function is marked __ref because it calls __init function 148 * NOTE: This function is marked __ref because it calls __init function
93 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. 149 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -225,7 +281,7 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
225void __init cleanup_highmap(void) 281void __init cleanup_highmap(void)
226{ 282{
227 unsigned long vaddr = __START_KERNEL_map; 283 unsigned long vaddr = __START_KERNEL_map;
228 unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1; 284 unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
229 pmd_t *pmd = level2_kernel_pgt; 285 pmd_t *pmd = level2_kernel_pgt;
230 pmd_t *last_pmd = pmd + PTRS_PER_PMD; 286 pmd_t *last_pmd = pmd + PTRS_PER_PMD;
231 287
@@ -271,7 +327,8 @@ static __ref void unmap_low_page(void *adr)
271} 327}
272 328
273static unsigned long __meminit 329static unsigned long __meminit
274phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) 330phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
331 pgprot_t prot)
275{ 332{
276 unsigned pages = 0; 333 unsigned pages = 0;
277 unsigned long last_map_addr = end; 334 unsigned long last_map_addr = end;
@@ -289,36 +346,43 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
289 break; 346 break;
290 } 347 }
291 348
349 /*
350 * We will re-use the existing mapping.
351 * Xen for example has some special requirements, like mapping
352 * pagetable pages as RO. So assume someone who pre-setup
353 * these mappings are more intelligent.
354 */
292 if (pte_val(*pte)) 355 if (pte_val(*pte))
293 continue; 356 continue;
294 357
295 if (0) 358 if (0)
296 printk(" pte=%p addr=%lx pte=%016lx\n", 359 printk(" pte=%p addr=%lx pte=%016lx\n",
297 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); 360 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
298 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
299 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
300 pages++; 361 pages++;
362 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
363 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
301 } 364 }
365
302 update_page_count(PG_LEVEL_4K, pages); 366 update_page_count(PG_LEVEL_4K, pages);
303 367
304 return last_map_addr; 368 return last_map_addr;
305} 369}
306 370
307static unsigned long __meminit 371static unsigned long __meminit
308phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end) 372phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
373 pgprot_t prot)
309{ 374{
310 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); 375 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
311 376
312 return phys_pte_init(pte, address, end); 377 return phys_pte_init(pte, address, end, prot);
313} 378}
314 379
315static unsigned long __meminit 380static unsigned long __meminit
316phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, 381phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
317 unsigned long page_size_mask) 382 unsigned long page_size_mask, pgprot_t prot)
318{ 383{
319 unsigned long pages = 0; 384 unsigned long pages = 0;
320 unsigned long last_map_addr = end; 385 unsigned long last_map_addr = end;
321 unsigned long start = address;
322 386
323 int i = pmd_index(address); 387 int i = pmd_index(address);
324 388
@@ -326,6 +390,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
326 unsigned long pte_phys; 390 unsigned long pte_phys;
327 pmd_t *pmd = pmd_page + pmd_index(address); 391 pmd_t *pmd = pmd_page + pmd_index(address);
328 pte_t *pte; 392 pte_t *pte;
393 pgprot_t new_prot = prot;
329 394
330 if (address >= end) { 395 if (address >= end) {
331 if (!after_bootmem) { 396 if (!after_bootmem) {
@@ -339,27 +404,40 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
339 if (!pmd_large(*pmd)) { 404 if (!pmd_large(*pmd)) {
340 spin_lock(&init_mm.page_table_lock); 405 spin_lock(&init_mm.page_table_lock);
341 last_map_addr = phys_pte_update(pmd, address, 406 last_map_addr = phys_pte_update(pmd, address,
342 end); 407 end, prot);
343 spin_unlock(&init_mm.page_table_lock); 408 spin_unlock(&init_mm.page_table_lock);
409 continue;
344 } 410 }
345 /* Count entries we're using from level2_ident_pgt */ 411 /*
346 if (start == 0) 412 * If we are ok with PG_LEVEL_2M mapping, then we will
347 pages++; 413 * use the existing mapping,
348 continue; 414 *
415 * Otherwise, we will split the large page mapping but
416 * use the same existing protection bits except for
417 * large page, so that we don't violate Intel's TLB
418 * Application note (317080) which says, while changing
419 * the page sizes, new and old translations should
420 * not differ with respect to page frame and
421 * attributes.
422 */
423 if (page_size_mask & (1 << PG_LEVEL_2M))
424 continue;
425 new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
349 } 426 }
350 427
351 if (page_size_mask & (1<<PG_LEVEL_2M)) { 428 if (page_size_mask & (1<<PG_LEVEL_2M)) {
352 pages++; 429 pages++;
353 spin_lock(&init_mm.page_table_lock); 430 spin_lock(&init_mm.page_table_lock);
354 set_pte((pte_t *)pmd, 431 set_pte((pte_t *)pmd,
355 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 432 pfn_pte(address >> PAGE_SHIFT,
433 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
356 spin_unlock(&init_mm.page_table_lock); 434 spin_unlock(&init_mm.page_table_lock);
357 last_map_addr = (address & PMD_MASK) + PMD_SIZE; 435 last_map_addr = (address & PMD_MASK) + PMD_SIZE;
358 continue; 436 continue;
359 } 437 }
360 438
361 pte = alloc_low_page(&pte_phys); 439 pte = alloc_low_page(&pte_phys);
362 last_map_addr = phys_pte_init(pte, address, end); 440 last_map_addr = phys_pte_init(pte, address, end, new_prot);
363 unmap_low_page(pte); 441 unmap_low_page(pte);
364 442
365 spin_lock(&init_mm.page_table_lock); 443 spin_lock(&init_mm.page_table_lock);
@@ -372,12 +450,12 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
372 450
373static unsigned long __meminit 451static unsigned long __meminit
374phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, 452phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
375 unsigned long page_size_mask) 453 unsigned long page_size_mask, pgprot_t prot)
376{ 454{
377 pmd_t *pmd = pmd_offset(pud, 0); 455 pmd_t *pmd = pmd_offset(pud, 0);
378 unsigned long last_map_addr; 456 unsigned long last_map_addr;
379 457
380 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask); 458 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
381 __flush_tlb_all(); 459 __flush_tlb_all();
382 return last_map_addr; 460 return last_map_addr;
383} 461}
@@ -394,6 +472,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
394 unsigned long pmd_phys; 472 unsigned long pmd_phys;
395 pud_t *pud = pud_page + pud_index(addr); 473 pud_t *pud = pud_page + pud_index(addr);
396 pmd_t *pmd; 474 pmd_t *pmd;
475 pgprot_t prot = PAGE_KERNEL;
397 476
398 if (addr >= end) 477 if (addr >= end)
399 break; 478 break;
@@ -405,10 +484,26 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
405 } 484 }
406 485
407 if (pud_val(*pud)) { 486 if (pud_val(*pud)) {
408 if (!pud_large(*pud)) 487 if (!pud_large(*pud)) {
409 last_map_addr = phys_pmd_update(pud, addr, end, 488 last_map_addr = phys_pmd_update(pud, addr, end,
410 page_size_mask); 489 page_size_mask, prot);
411 continue; 490 continue;
491 }
492 /*
493 * If we are ok with PG_LEVEL_1G mapping, then we will
494 * use the existing mapping.
495 *
496 * Otherwise, we will split the gbpage mapping but use
497 * the same existing protection bits except for large
498 * page, so that we don't violate Intel's TLB
499 * Application note (317080) which says, while changing
500 * the page sizes, new and old translations should
501 * not differ with respect to page frame and
502 * attributes.
503 */
504 if (page_size_mask & (1 << PG_LEVEL_1G))
505 continue;
506 prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
412 } 507 }
413 508
414 if (page_size_mask & (1<<PG_LEVEL_1G)) { 509 if (page_size_mask & (1<<PG_LEVEL_1G)) {
@@ -422,7 +517,8 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
422 } 517 }
423 518
424 pmd = alloc_low_page(&pmd_phys); 519 pmd = alloc_low_page(&pmd_phys);
425 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask); 520 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
521 prot);
426 unmap_low_page(pmd); 522 unmap_low_page(pmd);
427 523
428 spin_lock(&init_mm.page_table_lock); 524 spin_lock(&init_mm.page_table_lock);
@@ -430,6 +526,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
430 spin_unlock(&init_mm.page_table_lock); 526 spin_unlock(&init_mm.page_table_lock);
431 } 527 }
432 __flush_tlb_all(); 528 __flush_tlb_all();
529
433 update_page_count(PG_LEVEL_1G, pages); 530 update_page_count(PG_LEVEL_1G, pages);
434 531
435 return last_map_addr; 532 return last_map_addr;
@@ -446,27 +543,28 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
446 return phys_pud_init(pud, addr, end, page_size_mask); 543 return phys_pud_init(pud, addr, end, page_size_mask);
447} 544}
448 545
449static void __init find_early_table_space(unsigned long end) 546static void __init find_early_table_space(unsigned long end, int use_pse,
547 int use_gbpages)
450{ 548{
451 unsigned long puds, pmds, ptes, tables, start; 549 unsigned long puds, pmds, ptes, tables, start;
452 550
453 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 551 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
454 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); 552 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
455 if (direct_gbpages) { 553 if (use_gbpages) {
456 unsigned long extra; 554 unsigned long extra;
457 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); 555 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
458 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; 556 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
459 } else 557 } else
460 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 558 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
461 tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); 559 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
462 560
463 if (cpu_has_pse) { 561 if (use_pse) {
464 unsigned long extra; 562 unsigned long extra;
465 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); 563 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
466 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; 564 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
467 } else 565 } else
468 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; 566 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
469 tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE); 567 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
470 568
471 /* 569 /*
472 * RED-PEN putting page tables only on node 0 could 570 * RED-PEN putting page tables only on node 0 could
@@ -528,6 +626,7 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start,
528 pgd_populate(&init_mm, pgd, __va(pud_phys)); 626 pgd_populate(&init_mm, pgd, __va(pud_phys));
529 spin_unlock(&init_mm.page_table_lock); 627 spin_unlock(&init_mm.page_table_lock);
530 } 628 }
629 __flush_tlb_all();
531 630
532 return last_map_addr; 631 return last_map_addr;
533} 632}
@@ -571,6 +670,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
571 670
572 struct map_range mr[NR_RANGE_MR]; 671 struct map_range mr[NR_RANGE_MR];
573 int nr_range, i; 672 int nr_range, i;
673 int use_pse, use_gbpages;
574 674
575 printk(KERN_INFO "init_memory_mapping\n"); 675 printk(KERN_INFO "init_memory_mapping\n");
576 676
@@ -584,9 +684,21 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
584 if (!after_bootmem) 684 if (!after_bootmem)
585 init_gbpages(); 685 init_gbpages();
586 686
587 if (direct_gbpages) 687#ifdef CONFIG_DEBUG_PAGEALLOC
688 /*
689 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
690 * This will simplify cpa(), which otherwise needs to support splitting
691 * large pages into small in interrupt context, etc.
692 */
693 use_pse = use_gbpages = 0;
694#else
695 use_pse = cpu_has_pse;
696 use_gbpages = direct_gbpages;
697#endif
698
699 if (use_gbpages)
588 page_size_mask |= 1 << PG_LEVEL_1G; 700 page_size_mask |= 1 << PG_LEVEL_1G;
589 if (cpu_has_pse) 701 if (use_pse)
590 page_size_mask |= 1 << PG_LEVEL_2M; 702 page_size_mask |= 1 << PG_LEVEL_2M;
591 703
592 memset(mr, 0, sizeof(mr)); 704 memset(mr, 0, sizeof(mr));
@@ -647,7 +759,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
647 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); 759 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
648 760
649 if (!after_bootmem) 761 if (!after_bootmem)
650 find_early_table_space(end); 762 find_early_table_space(end, use_pse, use_gbpages);
651 763
652 for (i = 0; i < nr_range; i++) 764 for (i = 0; i < nr_range; i++)
653 last_map_addr = kernel_physical_mapping_init( 765 last_map_addr = kernel_physical_mapping_init(
@@ -806,8 +918,6 @@ void __init mem_init(void)
806 reservedpages << (PAGE_SHIFT-10), 918 reservedpages << (PAGE_SHIFT-10),
807 datasize >> 10, 919 datasize >> 10,
808 initsize >> 10); 920 initsize >> 10);
809
810 cpa_init();
811} 921}
812 922
813void free_init_pages(char *what, unsigned long begin, unsigned long end) 923void free_init_pages(char *what, unsigned long begin, unsigned long end)