Diffstat (limited to 'arch/x86/mm/init_64.c')
-rw-r--r-- | arch/x86/mm/init_64.c | 174
1 file changed, 142 insertions(+), 32 deletions(-)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index f4db5276fa21..d84d3e91d348 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -88,6 +88,62 @@ early_param("gbpages", parse_direct_gbpages_on); | |||
88 | 88 | ||
89 | int after_bootmem; | 89 | int after_bootmem; |
90 | 90 | ||
91 | unsigned long __supported_pte_mask __read_mostly = ~0UL; | ||
92 | EXPORT_SYMBOL_GPL(__supported_pte_mask); | ||
93 | |||
94 | static int do_not_nx __cpuinitdata; | ||
95 | |||
96 | /* | ||
97 | * noexec=on|off | ||
98 | * Control non-executable mappings for 64-bit processes. | ||
99 | * | ||
100 | * on Enable (default) | ||
101 | * off Disable | ||
102 | */ | ||
103 | static int __init nonx_setup(char *str) | ||
104 | { | ||
105 | if (!str) | ||
106 | return -EINVAL; | ||
107 | if (!strncmp(str, "on", 2)) { | ||
108 | __supported_pte_mask |= _PAGE_NX; | ||
109 | do_not_nx = 0; | ||
110 | } else if (!strncmp(str, "off", 3)) { | ||
111 | do_not_nx = 1; | ||
112 | __supported_pte_mask &= ~_PAGE_NX; | ||
113 | } | ||
114 | return 0; | ||
115 | } | ||
116 | early_param("noexec", nonx_setup); | ||
117 | |||
118 | void __cpuinit check_efer(void) | ||
119 | { | ||
120 | unsigned long efer; | ||
121 | |||
122 | rdmsrl(MSR_EFER, efer); | ||
123 | if (!(efer & EFER_NX) || do_not_nx) | ||
124 | __supported_pte_mask &= ~_PAGE_NX; | ||
125 | } | ||
126 | |||
127 | int force_personality32; | ||
128 | |||
129 | /* | ||
130 | * noexec32=on|off | ||
131 | * Control non executable heap for 32bit processes. | ||
132 | * To control the stack too use noexec=off | ||
133 | * | ||
134 | * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default) | ||
135 | * off PROT_READ implies PROT_EXEC | ||
136 | */ | ||
137 | static int __init nonx32_setup(char *str) | ||
138 | { | ||
139 | if (!strcmp(str, "on")) | ||
140 | force_personality32 &= ~READ_IMPLIES_EXEC; | ||
141 | else if (!strcmp(str, "off")) | ||
142 | force_personality32 |= READ_IMPLIES_EXEC; | ||
143 | return 1; | ||
144 | } | ||
145 | __setup("noexec32=", nonx32_setup); | ||
146 | |||
91 | /* | 147 | /* |
92 | * NOTE: This function is marked __ref because it calls __init function | 148 | * NOTE: This function is marked __ref because it calls __init function |
93 | * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. | 149 | * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. |
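For context on the block added in this hunk: every protection value the kernel later writes into a PTE is ANDed with __supported_pte_mask, so clearing _PAGE_NX once (EFER.NX absent, or noexec=off) is enough to strip NX from all subsequent mappings. A minimal user-space sketch of that filtering, assuming made-up bit positions rather than the real _PAGE_* definitions:

/*
 * Stand-alone sketch (not kernel code) of how __supported_pte_mask gates
 * _PAGE_NX.  Bit values are illustrative only; the real definitions live
 * in asm/pgtable_types.h.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_PRESENT (1ULL << 0)
#define PAGE_RW      (1ULL << 1)
#define PAGE_NX      (1ULL << 63)	/* hypothetical stand-in for _PAGE_NX */

static uint64_t supported_pte_mask = ~0ULL;	/* mirrors __supported_pte_mask */

/* What check_efer() effectively does when EFER.NX is clear or noexec=off. */
static void disable_nx(void)
{
	supported_pte_mask &= ~PAGE_NX;
}

/* Every PTE the kernel writes is masked like this, so NX silently drops out. */
static uint64_t make_pte(uint64_t prot)
{
	return prot & supported_pte_mask;
}

int main(void)
{
	uint64_t prot = PAGE_PRESENT | PAGE_RW | PAGE_NX;

	printf("with NX:    %#llx\n", (unsigned long long)make_pte(prot));
	disable_nx();
	printf("without NX: %#llx\n", (unsigned long long)make_pte(prot));
	return 0;
}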
@@ -225,7 +281,7 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) | |||
225 | void __init cleanup_highmap(void) | 281 | void __init cleanup_highmap(void) |
226 | { | 282 | { |
227 | unsigned long vaddr = __START_KERNEL_map; | 283 | unsigned long vaddr = __START_KERNEL_map; |
228 | unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1; | 284 | unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1; |
229 | pmd_t *pmd = level2_kernel_pgt; | 285 | pmd_t *pmd = level2_kernel_pgt; |
230 | pmd_t *last_pmd = pmd + PTRS_PER_PMD; | 286 | pmd_t *last_pmd = pmd + PTRS_PER_PMD; |
231 | 287 | ||
@@ -271,7 +327,8 @@ static __ref void unmap_low_page(void *adr) | |||
271 | } | 327 | } |
272 | 328 | ||
273 | static unsigned long __meminit | 329 | static unsigned long __meminit |
274 | phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) | 330 | phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, |
331 | pgprot_t prot) | ||
275 | { | 332 | { |
276 | unsigned pages = 0; | 333 | unsigned pages = 0; |
277 | unsigned long last_map_addr = end; | 334 | unsigned long last_map_addr = end; |
@@ -289,36 +346,43 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) | |||
289 | break; | 346 | break; |
290 | } | 347 | } |
291 | 348 | ||
349 | /* | ||
350 | * We will re-use the existing mapping. | ||
351 | * Xen for example has some special requirements, like mapping | ||
352 | * pagetable pages as RO. So assume someone who pre-setup | ||
353 | * these mappings are more intelligent. | ||
354 | */ | ||
292 | if (pte_val(*pte)) | 355 | if (pte_val(*pte)) |
293 | continue; | 356 | continue; |
294 | 357 | ||
295 | if (0) | 358 | if (0) |
296 | printk(" pte=%p addr=%lx pte=%016lx\n", | 359 | printk(" pte=%p addr=%lx pte=%016lx\n", |
297 | pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); | 360 | pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); |
298 | set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL)); | ||
299 | last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; | ||
300 | pages++; | 361 | pages++; |
362 | set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot)); | ||
363 | last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; | ||
301 | } | 364 | } |
365 | |||
302 | update_page_count(PG_LEVEL_4K, pages); | 366 | update_page_count(PG_LEVEL_4K, pages); |
303 | 367 | ||
304 | return last_map_addr; | 368 | return last_map_addr; |
305 | } | 369 | } |
306 | 370 | ||
307 | static unsigned long __meminit | 371 | static unsigned long __meminit |
308 | phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end) | 372 | phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end, |
373 | pgprot_t prot) | ||
309 | { | 374 | { |
310 | pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); | 375 | pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); |
311 | 376 | ||
312 | return phys_pte_init(pte, address, end); | 377 | return phys_pte_init(pte, address, end, prot); |
313 | } | 378 | } |
314 | 379 | ||
315 | static unsigned long __meminit | 380 | static unsigned long __meminit |
316 | phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, | 381 | phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, |
317 | unsigned long page_size_mask) | 382 | unsigned long page_size_mask, pgprot_t prot) |
318 | { | 383 | { |
319 | unsigned long pages = 0; | 384 | unsigned long pages = 0; |
320 | unsigned long last_map_addr = end; | 385 | unsigned long last_map_addr = end; |
321 | unsigned long start = address; | ||
322 | 386 | ||
323 | int i = pmd_index(address); | 387 | int i = pmd_index(address); |
324 | 388 | ||
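The fill loop in phys_pte_init() above sets the pattern the rest of the series follows: entries that are already populated (Xen, for example, pre-maps pagetable pages read-only) are left untouched, and only empty slots are filled with the caller-supplied prot. A stripped-down user-space sketch of that loop, using an array of integers as a stand-in for the PTE page and made-up protection constants:

/*
 * Stand-alone sketch of the phys_pte_init() fill loop: respect entries that
 * already exist, only initialise empty ones, and report how far the mapping
 * got.  Constants and types are illustrative, not the kernel's.
 */
#include <stdio.h>
#include <stdint.h>

#define PTRS_PER_PTE	512
#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

static uint64_t pte_page[PTRS_PER_PTE];	/* one page-table page */

static unsigned long fill_ptes(unsigned long addr, unsigned long end,
			       uint64_t prot)
{
	unsigned long last_map_addr = end;
	unsigned pages = 0;
	int i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);

	for (; i < PTRS_PER_PTE && addr < end; i++, addr += PAGE_SIZE) {
		if (pte_page[i])	/* pre-existing mapping: keep it */
			continue;
		pte_page[i] = (addr & ~(PAGE_SIZE - 1)) | prot;
		last_map_addr = (addr & ~(PAGE_SIZE - 1)) + PAGE_SIZE;
		pages++;
	}
	printf("mapped %u 4k pages, last_map_addr=%#lx\n", pages, last_map_addr);
	return last_map_addr;
}

int main(void)
{
	pte_page[3] = 0xdeadbeef;		/* pretend page 3 was pre-mapped */
	fill_ptes(0, 16 * PAGE_SIZE, 0x3);	/* prot = present|rw, made up */
	return 0;
}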
@@ -326,6 +390,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, | |||
326 | unsigned long pte_phys; | 390 | unsigned long pte_phys; |
327 | pmd_t *pmd = pmd_page + pmd_index(address); | 391 | pmd_t *pmd = pmd_page + pmd_index(address); |
328 | pte_t *pte; | 392 | pte_t *pte; |
393 | pgprot_t new_prot = prot; | ||
329 | 394 | ||
330 | if (address >= end) { | 395 | if (address >= end) { |
331 | if (!after_bootmem) { | 396 | if (!after_bootmem) { |
@@ -339,27 +404,40 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, | |||
339 | if (!pmd_large(*pmd)) { | 404 | if (!pmd_large(*pmd)) { |
340 | spin_lock(&init_mm.page_table_lock); | 405 | spin_lock(&init_mm.page_table_lock); |
341 | last_map_addr = phys_pte_update(pmd, address, | 406 | last_map_addr = phys_pte_update(pmd, address, |
342 | end); | 407 | end, prot); |
343 | spin_unlock(&init_mm.page_table_lock); | 408 | spin_unlock(&init_mm.page_table_lock); |
409 | continue; | ||
344 | } | 410 | } |
345 | /* Count entries we're using from level2_ident_pgt */ | 411 | /* |
346 | if (start == 0) | 412 | * If we are ok with PG_LEVEL_2M mapping, then we will |
347 | pages++; | 413 | * use the existing mapping, |
348 | continue; | 414 | * |
415 | * Otherwise, we will split the large page mapping but | ||
416 | * use the same existing protection bits except for | ||
417 | * large page, so that we don't violate Intel's TLB | ||
418 | * Application note (317080) which says, while changing | ||
419 | * the page sizes, new and old translations should | ||
420 | * not differ with respect to page frame and | ||
421 | * attributes. | ||
422 | */ | ||
423 | if (page_size_mask & (1 << PG_LEVEL_2M)) | ||
424 | continue; | ||
425 | new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); | ||
349 | } | 426 | } |
350 | 427 | ||
351 | if (page_size_mask & (1<<PG_LEVEL_2M)) { | 428 | if (page_size_mask & (1<<PG_LEVEL_2M)) { |
352 | pages++; | 429 | pages++; |
353 | spin_lock(&init_mm.page_table_lock); | 430 | spin_lock(&init_mm.page_table_lock); |
354 | set_pte((pte_t *)pmd, | 431 | set_pte((pte_t *)pmd, |
355 | pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); | 432 | pfn_pte(address >> PAGE_SHIFT, |
433 | __pgprot(pgprot_val(prot) | _PAGE_PSE))); | ||
356 | spin_unlock(&init_mm.page_table_lock); | 434 | spin_unlock(&init_mm.page_table_lock); |
357 | last_map_addr = (address & PMD_MASK) + PMD_SIZE; | 435 | last_map_addr = (address & PMD_MASK) + PMD_SIZE; |
358 | continue; | 436 | continue; |
359 | } | 437 | } |
360 | 438 | ||
361 | pte = alloc_low_page(&pte_phys); | 439 | pte = alloc_low_page(&pte_phys); |
362 | last_map_addr = phys_pte_init(pte, address, end); | 440 | last_map_addr = phys_pte_init(pte, address, end, new_prot); |
363 | unmap_low_page(pte); | 441 | unmap_low_page(pte); |
364 | 442 | ||
365 | spin_lock(&init_mm.page_table_lock); | 443 | spin_lock(&init_mm.page_table_lock); |
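The comment block above carries the key rule from Intel's application note 317080: when an existing 2M mapping has to be split because the caller only allows 4k pages, the new 4k entries must keep the old attributes with only the PSE bit dropped, so old and new translations never disagree on frame or attributes. A rough stand-alone illustration of that prot handling; the bit layout is invented for the demo, while in the patch the real work is done by pte_pgprot(pte_clrhuge(...)):

/*
 * Stand-alone sketch of splitting a large-page entry while keeping its
 * protection bits, as phys_pmd_init() does.  Bit positions are illustrative.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21

#define PAGE_PSE	(1ULL << 7)		/* large-page bit */
#define ADDR_MASK	(~0ULL << PAGE_SHIFT)	/* frame bits (simplified) */

int main(void)
{
	/* A 2M entry: frame at 0x200000, present+rw+global+PSE (made up). */
	uint64_t pmd_entry = 0x200000ULL | 0x1 | 0x2 | 0x100 | PAGE_PSE;

	/* Keep every attribute except PSE — the "same attributes" rule. */
	uint64_t new_prot = (pmd_entry & ~ADDR_MASK) & ~PAGE_PSE;

	uint64_t base = pmd_entry & ADDR_MASK & ~((1ULL << PMD_SHIFT) - 1);
	for (int i = 0; i < 4; i++) {	/* show the first few of 512 PTEs */
		uint64_t pte = (base + ((uint64_t)i << PAGE_SHIFT)) | new_prot;
		printf("pte[%d] = %#llx\n", i, (unsigned long long)pte);
	}
	return 0;
}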
@@ -372,12 +450,12 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, | |||
372 | 450 | ||
373 | static unsigned long __meminit | 451 | static unsigned long __meminit |
374 | phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, | 452 | phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, |
375 | unsigned long page_size_mask) | 453 | unsigned long page_size_mask, pgprot_t prot) |
376 | { | 454 | { |
377 | pmd_t *pmd = pmd_offset(pud, 0); | 455 | pmd_t *pmd = pmd_offset(pud, 0); |
378 | unsigned long last_map_addr; | 456 | unsigned long last_map_addr; |
379 | 457 | ||
380 | last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask); | 458 | last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot); |
381 | __flush_tlb_all(); | 459 | __flush_tlb_all(); |
382 | return last_map_addr; | 460 | return last_map_addr; |
383 | } | 461 | } |
@@ -394,6 +472,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, | |||
394 | unsigned long pmd_phys; | 472 | unsigned long pmd_phys; |
395 | pud_t *pud = pud_page + pud_index(addr); | 473 | pud_t *pud = pud_page + pud_index(addr); |
396 | pmd_t *pmd; | 474 | pmd_t *pmd; |
475 | pgprot_t prot = PAGE_KERNEL; | ||
397 | 476 | ||
398 | if (addr >= end) | 477 | if (addr >= end) |
399 | break; | 478 | break; |
@@ -405,10 +484,26 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, | |||
405 | } | 484 | } |
406 | 485 | ||
407 | if (pud_val(*pud)) { | 486 | if (pud_val(*pud)) { |
408 | if (!pud_large(*pud)) | 487 | if (!pud_large(*pud)) { |
409 | last_map_addr = phys_pmd_update(pud, addr, end, | 488 | last_map_addr = phys_pmd_update(pud, addr, end, |
410 | page_size_mask); | 489 | page_size_mask, prot); |
411 | continue; | 490 | continue; |
491 | } | ||
492 | /* | ||
493 | * If we are ok with PG_LEVEL_1G mapping, then we will | ||
494 | * use the existing mapping. | ||
495 | * | ||
496 | * Otherwise, we will split the gbpage mapping but use | ||
497 | * the same existing protection bits except for large | ||
498 | * page, so that we don't violate Intel's TLB | ||
499 | * Application note (317080) which says, while changing | ||
500 | * the page sizes, new and old translations should | ||
501 | * not differ with respect to page frame and | ||
502 | * attributes. | ||
503 | */ | ||
504 | if (page_size_mask & (1 << PG_LEVEL_1G)) | ||
505 | continue; | ||
506 | prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); | ||
412 | } | 507 | } |
413 | 508 | ||
414 | if (page_size_mask & (1<<PG_LEVEL_1G)) { | 509 | if (page_size_mask & (1<<PG_LEVEL_1G)) { |
@@ -422,7 +517,8 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, | |||
422 | } | 517 | } |
423 | 518 | ||
424 | pmd = alloc_low_page(&pmd_phys); | 519 | pmd = alloc_low_page(&pmd_phys); |
425 | last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask); | 520 | last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, |
521 | prot); | ||
426 | unmap_low_page(pmd); | 522 | unmap_low_page(pmd); |
427 | 523 | ||
428 | spin_lock(&init_mm.page_table_lock); | 524 | spin_lock(&init_mm.page_table_lock); |
@@ -430,6 +526,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, | |||
430 | spin_unlock(&init_mm.page_table_lock); | 526 | spin_unlock(&init_mm.page_table_lock); |
431 | } | 527 | } |
432 | __flush_tlb_all(); | 528 | __flush_tlb_all(); |
529 | |||
433 | update_page_count(PG_LEVEL_1G, pages); | 530 | update_page_count(PG_LEVEL_1G, pages); |
434 | 531 | ||
435 | return last_map_addr; | 532 | return last_map_addr; |
@@ -446,27 +543,28 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, | |||
446 | return phys_pud_init(pud, addr, end, page_size_mask); | 543 | return phys_pud_init(pud, addr, end, page_size_mask); |
447 | } | 544 | } |
448 | 545 | ||
449 | static void __init find_early_table_space(unsigned long end) | 546 | static void __init find_early_table_space(unsigned long end, int use_pse, |
547 | int use_gbpages) | ||
450 | { | 548 | { |
451 | unsigned long puds, pmds, ptes, tables, start; | 549 | unsigned long puds, pmds, ptes, tables, start; |
452 | 550 | ||
453 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; | 551 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; |
454 | tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); | 552 | tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); |
455 | if (direct_gbpages) { | 553 | if (use_gbpages) { |
456 | unsigned long extra; | 554 | unsigned long extra; |
457 | extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); | 555 | extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); |
458 | pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; | 556 | pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; |
459 | } else | 557 | } else |
460 | pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; | 558 | pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; |
461 | tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); | 559 | tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); |
462 | 560 | ||
463 | if (cpu_has_pse) { | 561 | if (use_pse) { |
464 | unsigned long extra; | 562 | unsigned long extra; |
465 | extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); | 563 | extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); |
466 | ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; | 564 | ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; |
467 | } else | 565 | } else |
468 | ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; | 566 | ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; |
469 | tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE); | 567 | tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); |
470 | 568 | ||
471 | /* | 569 | /* |
472 | * RED-PEN putting page tables only on node 0 could | 570 | * RED-PEN putting page tables only on node 0 could |
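The sizing logic above is easiest to see with numbers: with gbpages only the tail below the last PUD boundary needs PMD pages, and with PSE only the tail below the last PMD boundary needs PTE pages, which is why the function now takes use_pse/use_gbpages explicitly instead of reading cpu_has_pse. A throwaway user-space calculator restating the same estimate (8-byte entries and x86-64 shifts assumed):

/*
 * Stand-alone re-statement of the find_early_table_space() estimate.
 * Assumes 8-byte entries and x86-64 shifts; purely illustrative.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21
#define PUD_SHIFT	30
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PMD_SIZE	(1UL << PMD_SHIFT)
#define PUD_SIZE	(1UL << PUD_SHIFT)

#define ROUNDUP(x, y)	((((x) + (y) - 1) / (y)) * (y))

static unsigned long table_space(unsigned long end, int use_pse, int use_gbpages)
{
	unsigned long puds, pmds, ptes, tables, extra;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	tables = ROUNDUP(puds * 8, PAGE_SIZE);

	if (use_gbpages) {
		extra = end - ((end >> PUD_SHIFT) << PUD_SHIFT);
		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
	} else
		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	tables += ROUNDUP(pmds * 8, PAGE_SIZE);

	if (use_pse) {
		extra = end - ((end >> PMD_SHIFT) << PMD_SHIFT);
		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
	} else
		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
	tables += ROUNDUP(ptes * 8, PAGE_SIZE);

	return tables;
}

int main(void)
{
	unsigned long end = 4UL << 30;	/* map 4 GB of physical memory */

	printf("4k only : %lu KB\n", table_space(end, 0, 0) >> 10);
	printf("2M pages: %lu KB\n", table_space(end, 1, 0) >> 10);
	printf("1G pages: %lu KB\n", table_space(end, 1, 1) >> 10);
	return 0;
}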
@@ -528,6 +626,7 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start, | |||
528 | pgd_populate(&init_mm, pgd, __va(pud_phys)); | 626 | pgd_populate(&init_mm, pgd, __va(pud_phys)); |
529 | spin_unlock(&init_mm.page_table_lock); | 627 | spin_unlock(&init_mm.page_table_lock); |
530 | } | 628 | } |
629 | __flush_tlb_all(); | ||
531 | 630 | ||
532 | return last_map_addr; | 631 | return last_map_addr; |
533 | } | 632 | } |
@@ -571,6 +670,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, | |||
571 | 670 | ||
572 | struct map_range mr[NR_RANGE_MR]; | 671 | struct map_range mr[NR_RANGE_MR]; |
573 | int nr_range, i; | 672 | int nr_range, i; |
673 | int use_pse, use_gbpages; | ||
574 | 674 | ||
575 | printk(KERN_INFO "init_memory_mapping\n"); | 675 | printk(KERN_INFO "init_memory_mapping\n"); |
576 | 676 | ||
@@ -584,9 +684,21 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, | |||
584 | if (!after_bootmem) | 684 | if (!after_bootmem) |
585 | init_gbpages(); | 685 | init_gbpages(); |
586 | 686 | ||
587 | if (direct_gbpages) | 687 | #ifdef CONFIG_DEBUG_PAGEALLOC |
688 | /* | ||
689 | * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. | ||
690 | * This will simplify cpa(), which otherwise needs to support splitting | ||
691 | * large pages into small in interrupt context, etc. | ||
692 | */ | ||
693 | use_pse = use_gbpages = 0; | ||
694 | #else | ||
695 | use_pse = cpu_has_pse; | ||
696 | use_gbpages = direct_gbpages; | ||
697 | #endif | ||
698 | |||
699 | if (use_gbpages) | ||
588 | page_size_mask |= 1 << PG_LEVEL_1G; | 700 | page_size_mask |= 1 << PG_LEVEL_1G; |
589 | if (cpu_has_pse) | 701 | if (use_pse) |
590 | page_size_mask |= 1 << PG_LEVEL_2M; | 702 | page_size_mask |= 1 << PG_LEVEL_2M; |
591 | 703 | ||
592 | memset(mr, 0, sizeof(mr)); | 704 | memset(mr, 0, sizeof(mr)); |
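To make the #ifdef above concrete: under CONFIG_DEBUG_PAGEALLOC the identity mapping deliberately falls back to 4k pages so cpa() never has to split a large page in interrupt context; otherwise the CPU and command-line capabilities feed page_size_mask. A small sketch of how the mask is built and read back; the PG_LEVEL_* ordering mirrors the kernel's enum but is assumed here:

/*
 * Stand-alone sketch of the page_size_mask selection.  The PG_LEVEL_*
 * values and flag variables are stand-ins for the kernel's.
 */
#include <stdio.h>

enum { PG_LEVEL_NONE, PG_LEVEL_4K, PG_LEVEL_2M, PG_LEVEL_1G };

int main(void)
{
	int debug_pagealloc = 1;	/* stand-in for CONFIG_DEBUG_PAGEALLOC */
	int cpu_has_pse = 1, direct_gbpages = 1;

	/* With DEBUG_PAGEALLOC, force 4k so cpa() never splits in irq context. */
	int use_pse = debug_pagealloc ? 0 : cpu_has_pse;
	int use_gbpages = debug_pagealloc ? 0 : direct_gbpages;

	unsigned long page_size_mask = 0;
	if (use_gbpages)
		page_size_mask |= 1UL << PG_LEVEL_1G;
	if (use_pse)
		page_size_mask |= 1UL << PG_LEVEL_2M;

	printf("page_size_mask = %#lx -> %s\n", page_size_mask,
	       page_size_mask & (1UL << PG_LEVEL_1G) ? "1G/2M/4k" :
	       page_size_mask & (1UL << PG_LEVEL_2M) ? "2M/4k" : "4k only");
	return 0;
}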
@@ -647,7 +759,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, | |||
647 | (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); | 759 | (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); |
648 | 760 | ||
649 | if (!after_bootmem) | 761 | if (!after_bootmem) |
650 | find_early_table_space(end); | 762 | find_early_table_space(end, use_pse, use_gbpages); |
651 | 763 | ||
652 | for (i = 0; i < nr_range; i++) | 764 | for (i = 0; i < nr_range; i++) |
653 | last_map_addr = kernel_physical_mapping_init( | 765 | last_map_addr = kernel_physical_mapping_init( |
@@ -808,8 +920,6 @@ void __init mem_init(void) | |||
808 | reservedpages << (PAGE_SHIFT-10), | 920 | reservedpages << (PAGE_SHIFT-10), |
809 | datasize >> 10, | 921 | datasize >> 10, |
810 | initsize >> 10); | 922 | initsize >> 10); |
811 | |||
812 | cpa_init(); | ||
813 | } | 923 | } |
814 | 924 | ||
815 | void free_init_pages(char *what, unsigned long begin, unsigned long end) | 925 | void free_init_pages(char *what, unsigned long begin, unsigned long end) |