author		Suresh Siddha <suresh.b.siddha@intel.com>	2008-10-07 16:58:46 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-10-10 13:29:21 -0400
commit		b27a43c1e90582facad44de67d02bc9e9f900289
tree		5cdf26c4043e9345cb132798a8535ccb9c20aa9a	/arch/x86/mm/init_64.c
parent		ad2cde16a21985cdc4302e4a4b0fc373d666fdf7
x86, cpa: make the kernel physical mapping initialization a two pass sequence, fix
Jeremy Fitzhardinge wrote:

> I'd noticed that current tip/master hasn't been booting under Xen, and I
> just got around to bisecting it down to this change.
>
> commit 065ae73c5462d42e9761afb76f2b52965ff45bd6
> Author: Suresh Siddha <suresh.b.siddha@intel.com>
>
>     x86, cpa: make the kernel physical mapping initialization a two pass sequence
>
> This patch is causing Xen to fail various pagetable updates because it
> ends up remapping pagetables to RW, which Xen explicitly prohibits (as
> that would allow guests to make arbitrary changes to pagetables, rather
> than have them mediated by the hypervisor).

Instead of making the initialization a two-pass sequence to satisfy Intel's
TLB application note (developer.intel.com/design/processor/applnots/317080.pdf,
Section 6, page 26), we preserve the original page permissions when
fragmenting the large mappings and don't touch the existing memory mapping
(which satisfies Xen's requirements).

The only open issue: on a native Linux kernel we go back to mapping the
first 0-1GB of the kernel identity mapping as executable (because of the
static mapping set up in head_64.S). We can fix this in a different patch
if needed.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
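To make the permission-preserving split concrete, here is a minimal, self-contained
userspace C sketch of the rule the patch follows. It is an illustration only, not
kernel code: the _PAGE_* values and the attrs()/frame() helpers are simplified
stand-ins rather than the kernel's definitions.

/*
 * Toy userspace model (not kernel code) of the rule described above:
 * when a large mapping is split into 4K entries, every new entry keeps
 * the page frame and attribute bits of the original entry; only the
 * page-size (PSE) bit is dropped.  The _PAGE_* constants here are
 * simplified stand-ins for the real x86 flags.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT    12
#define PMD_SHIFT     21
#define PTRS_PER_PTE  (1u << (PMD_SHIFT - PAGE_SHIFT))	/* 512 */

#define _PAGE_PRESENT 0x001ull
#define _PAGE_RW      0x002ull
#define _PAGE_PSE     0x080ull				/* "large page" bit */
#define ATTR_MASK     0xfffull				/* low attribute bits */

static uint64_t attrs(uint64_t e) { return e & ATTR_MASK & ~_PAGE_PSE; }
static uint64_t frame(uint64_t e) { return e & ~ATTR_MASK; }

int main(void)
{
	/* A read-only 2MB mapping, e.g. one Xen set up for pagetable pages. */
	uint64_t pmd = 0x200000ull | _PAGE_PRESENT | _PAGE_PSE;	/* no _PAGE_RW */
	uint64_t pte[PTRS_PER_PTE];
	unsigned i;

	/* Split it: inherit the existing attributes, clear only _PAGE_PSE. */
	for (i = 0; i < PTRS_PER_PTE; i++)
		pte[i] = (frame(pmd) + ((uint64_t)i << PAGE_SHIFT)) | attrs(pmd);

	/* New and old translations agree on frame and attributes ... */
	assert(frame(pte[0]) == frame(pmd));
	assert(attrs(pte[0]) == attrs(pmd));
	/* ... so a read-only mapping stays read-only after the split. */
	assert(!(pte[0] & _PAGE_RW));

	printf("split one 2M entry into %u 4K entries, attributes preserved\n",
	       PTRS_PER_PTE);
	return 0;
}

In the patch itself the same idea appears as pte_pgprot(pte_clrhuge(*(pte_t *)pmd))
(and the pud variant): read the protections of the existing large mapping, drop the
huge-page bit, and hand them down to the newly created smaller entries, so new and
old translations never differ in page frame or attributes.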
Diffstat (limited to 'arch/x86/mm/init_64.c')
-rw-r--r--	arch/x86/mm/init_64.c	149
1 file changed, 61 insertions, 88 deletions
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 6116ff0d7416..8c7eae490a2c 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -270,10 +270,9 @@ static __ref void unmap_low_page(void *adr)
 	early_iounmap(adr, PAGE_SIZE);
 }
 
-static int physical_mapping_iter;
-
 static unsigned long __meminit
-phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
+phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
+	      pgprot_t prot)
 {
 	unsigned pages = 0;
 	unsigned long last_map_addr = end;
@@ -291,35 +290,40 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
 			break;
 		}
 
+		/*
+		 * We will re-use the existing mapping.
+		 * Xen for example has some special requirements, like mapping
+		 * pagetable pages as RO. So assume someone who pre-setup
+		 * these mappings are more intelligent.
+		 */
 		if (pte_val(*pte))
-			goto repeat_set_pte;
+			continue;
 
 		if (0)
 			printk(" pte=%p addr=%lx pte=%016lx\n",
			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
 		pages++;
-repeat_set_pte:
-		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
+		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
 		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
 	}
 
-	if (physical_mapping_iter == 1)
-		update_page_count(PG_LEVEL_4K, pages);
+	update_page_count(PG_LEVEL_4K, pages);
 
 	return last_map_addr;
 }
 
 static unsigned long __meminit
-phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
+phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
+		pgprot_t prot)
 {
 	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
 
-	return phys_pte_init(pte, address, end);
+	return phys_pte_init(pte, address, end, prot);
 }
 
 static unsigned long __meminit
 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
-	      unsigned long page_size_mask)
+	      unsigned long page_size_mask, pgprot_t prot)
 {
 	unsigned long pages = 0;
 	unsigned long last_map_addr = end;
@@ -330,6 +334,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 		unsigned long pte_phys;
 		pmd_t *pmd = pmd_page + pmd_index(address);
 		pte_t *pte;
+		pgprot_t new_prot = prot;
 
 		if (address >= end) {
 			if (!after_bootmem) {
@@ -343,45 +348,58 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 			if (!pmd_large(*pmd)) {
 				spin_lock(&init_mm.page_table_lock);
 				last_map_addr = phys_pte_update(pmd, address,
-								end);
+								end, prot);
 				spin_unlock(&init_mm.page_table_lock);
 				continue;
 			}
-			goto repeat_set_pte;
+			/*
+			 * If we are ok with PG_LEVEL_2M mapping, then we will
+			 * use the existing mapping,
+			 *
+			 * Otherwise, we will split the large page mapping but
+			 * use the same existing protection bits except for
+			 * large page, so that we don't violate Intel's TLB
+			 * Application note (317080) which says, while changing
+			 * the page sizes, new and old translations should
+			 * not differ with respect to page frame and
+			 * attributes.
+			 */
+			if (page_size_mask & (1 << PG_LEVEL_2M))
+				continue;
+			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
 		}
 
 		if (page_size_mask & (1<<PG_LEVEL_2M)) {
 			pages++;
-repeat_set_pte:
 			spin_lock(&init_mm.page_table_lock);
 			set_pte((pte_t *)pmd,
-				pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+				pfn_pte(address >> PAGE_SHIFT,
+					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
 			spin_unlock(&init_mm.page_table_lock);
 			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
 			continue;
 		}
 
 		pte = alloc_low_page(&pte_phys);
-		last_map_addr = phys_pte_init(pte, address, end);
+		last_map_addr = phys_pte_init(pte, address, end, new_prot);
 		unmap_low_page(pte);
 
 		spin_lock(&init_mm.page_table_lock);
 		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
 		spin_unlock(&init_mm.page_table_lock);
 	}
-	if (physical_mapping_iter == 1)
-		update_page_count(PG_LEVEL_2M, pages);
+	update_page_count(PG_LEVEL_2M, pages);
 	return last_map_addr;
 }
 
 static unsigned long __meminit
 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
-		unsigned long page_size_mask)
+		unsigned long page_size_mask, pgprot_t prot)
 {
 	pmd_t *pmd = pmd_offset(pud, 0);
 	unsigned long last_map_addr;
 
-	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
+	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
 	__flush_tlb_all();
 	return last_map_addr;
 }
@@ -398,6 +416,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 		unsigned long pmd_phys;
 		pud_t *pud = pud_page + pud_index(addr);
 		pmd_t *pmd;
+		pgprot_t prot = PAGE_KERNEL;
 
 		if (addr >= end)
 			break;
@@ -411,16 +430,28 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 		if (pud_val(*pud)) {
 			if (!pud_large(*pud)) {
 				last_map_addr = phys_pmd_update(pud, addr, end,
-							 page_size_mask);
+							 page_size_mask, prot);
 				continue;
 			}
-
-			goto repeat_set_pte;
+			/*
+			 * If we are ok with PG_LEVEL_1G mapping, then we will
+			 * use the existing mapping.
+			 *
+			 * Otherwise, we will split the gbpage mapping but use
+			 * the same existing protection bits except for large
+			 * page, so that we don't violate Intel's TLB
+			 * Application note (317080) which says, while changing
+			 * the page sizes, new and old translations should
+			 * not differ with respect to page frame and
+			 * attributes.
+			 */
+			if (page_size_mask & (1 << PG_LEVEL_1G))
+				continue;
+			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
 		}
 
 		if (page_size_mask & (1<<PG_LEVEL_1G)) {
 			pages++;
-repeat_set_pte:
 			spin_lock(&init_mm.page_table_lock);
 			set_pte((pte_t *)pud,
 				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
@@ -430,7 +461,8 @@ repeat_set_pte:
 		}
 
 		pmd = alloc_low_page(&pmd_phys);
-		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
+		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
+					      prot);
 		unmap_low_page(pmd);
 
 		spin_lock(&init_mm.page_table_lock);
@@ -439,8 +471,7 @@ repeat_set_pte:
 	}
 	__flush_tlb_all();
 
-	if (physical_mapping_iter == 1)
-		update_page_count(PG_LEVEL_1G, pages);
+	update_page_count(PG_LEVEL_1G, pages);
 
 	return last_map_addr;
 }
@@ -505,54 +536,15 @@ static void __init init_gbpages(void)
 		direct_gbpages = 0;
 }
 
-static int is_kernel(unsigned long pfn)
-{
-	unsigned long pg_addresss = pfn << PAGE_SHIFT;
-
-	if (pg_addresss >= (unsigned long) __pa(_text) &&
-	    pg_addresss < (unsigned long) __pa(_end))
-		return 1;
-
-	return 0;
-}
-
 static unsigned long __init kernel_physical_mapping_init(unsigned long start,
 						unsigned long end,
 						unsigned long page_size_mask)
 {
 
-	unsigned long next, last_map_addr;
-	u64 cached_supported_pte_mask = __supported_pte_mask;
-	unsigned long cache_start = start;
-	unsigned long cache_end = end;
-
-	/*
-	 * First iteration will setup identity mapping using large/small pages
-	 * based on page_size_mask, with other attributes same as set by
-	 * the early code in head_64.S
-	 *
-	 * Second iteration will setup the appropriate attributes
-	 * as desired for the kernel identity mapping.
-	 *
-	 * This two pass mechanism conforms to the TLB app note which says:
-	 *
-	 *     "Software should not write to a paging-structure entry in a way
-	 *      that would change, for any linear address, both the page size
-	 *      and either the page frame or attributes."
-	 *
-	 * For now, only difference between very early PTE attributes used in
-	 * head_64.S and here is _PAGE_NX.
-	 */
-	BUILD_BUG_ON((__PAGE_KERNEL_LARGE & ~__PAGE_KERNEL_IDENT_LARGE_EXEC)
-		     != _PAGE_NX);
-	__supported_pte_mask &= ~(_PAGE_NX);
-	physical_mapping_iter = 1;
+	unsigned long next, last_map_addr = end;
 
-repeat:
-	last_map_addr = cache_end;
-
-	start = (unsigned long)__va(cache_start);
-	end = (unsigned long)__va(cache_end);
+	start = (unsigned long)__va(start);
+	end = (unsigned long)__va(end);
 
 	for (; start < end; start = next) {
 		pgd_t *pgd = pgd_offset_k(start);
@@ -564,21 +556,11 @@ repeat:
 			next = end;
 
 		if (pgd_val(*pgd)) {
-			/*
-			 * Static identity mappings will be overwritten
-			 * with run-time mappings. For example, this allows
-			 * the static 0-1GB identity mapping to be mapped
-			 * non-executable with this.
-			 */
-			if (is_kernel(pte_pfn(*((pte_t *) pgd))))
-				goto realloc;
-
 			last_map_addr = phys_pud_update(pgd, __pa(start),
 						 __pa(end), page_size_mask);
 			continue;
 		}
 
-realloc:
 		pud = alloc_low_page(&pud_phys);
 		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
 						 page_size_mask);
@@ -590,15 +572,6 @@ realloc:
 	}
 	__flush_tlb_all();
 
-	if (physical_mapping_iter == 1) {
-		physical_mapping_iter = 2;
-		/*
-		 * Second iteration will set the actual desired PTE attributes.
-		 */
-		__supported_pte_mask = cached_supported_pte_mask;
-		goto repeat;
-	}
-
 	return last_map_addr;
 }
 