author     Suresh Siddha <suresh.b.siddha@intel.com>    2008-10-07 16:58:46 -0400
committer  Ingo Molnar <mingo@elte.hu>                  2008-10-10 13:29:21 -0400
commit     b27a43c1e90582facad44de67d02bc9e9f900289 (patch)
tree       5cdf26c4043e9345cb132798a8535ccb9c20aa9a
parent     ad2cde16a21985cdc4302e4a4b0fc373d666fdf7 (diff)
x86, cpa: make the kernel physical mapping initialization a two pass sequence, fix
Jeremy Fitzhardinge wrote:
> I'd noticed that current tip/master hasn't been booting under Xen, and I
> just got around to bisecting it down to this change.
>
> commit 065ae73c5462d42e9761afb76f2b52965ff45bd6
> Author: Suresh Siddha <suresh.b.siddha@intel.com>
>
> x86, cpa: make the kernel physical mapping initialization a two pass sequence
>
> This patch is causing Xen to fail various pagetable updates because it
> ends up remapping pagetables to RW, which Xen explicitly prohibits (as
> that would allow guests to make arbitrary changes to pagetables, rather
> than have them mediated by the hypervisor).
Instead of making init a two-pass sequence to satisfy Intel's TLB
Application Note (developer.intel.com/design/processor/applnots/317080.pdf,
Section 6, page 26), we preserve the original page permissions when
fragmenting the large mappings and don't touch the existing memory
mapping (which satisfies Xen's requirements).
The only open issue is that, on a native Linux kernel, we will go back to
mapping the first 0-1GB of the kernel identity mapping as executable
(because of the static mapping set up in head_64.S). We can fix this in a
separate patch if needed.
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
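
To make the rule above concrete, here is a small illustrative user-space
sketch (not kernel code; the TOY_* macros and the bit layout are invented
for the example): when an existing 2MB mapping is fragmented into 4KB
entries, each new entry keeps the page frame and the existing attribute
bits and only drops the large-page (PSE) bit, so no translation changes
frame or attributes, which is what the Intel note requires and what keeps
pre-set permissions (e.g. Xen's) intact.

/*
 * Toy user-space model of the splitting rule (not kernel code; the
 * bit layout below is simplified and the names are made up).
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_PSE     (1ull << 7)          /* "large page" bit */
#define TOY_FLAGS   0xfffull             /* low attribute bits */
#define PMD_2M      (2ull << 20)
#define PAGE_4K     (4ull << 10)

int main(void)
{
        /* An existing 2MB mapping: frame 0x40000000 plus some attribute bits. */
        uint64_t large = 0x40000000ull | TOY_PSE | 0x063;

        /* Split it: each 4KB entry keeps the attributes, minus the PSE bit. */
        uint64_t prot = (large & TOY_FLAGS) & ~TOY_PSE;
        uint64_t base = large & ~TOY_FLAGS;

        for (uint64_t off = 0; off < PMD_2M; off += PAGE_4K) {
                uint64_t pte = (base + off) | prot;
                if (off < 2 * PAGE_4K)   /* print the first couple only */
                        printf("pte for %#llx = %#llx\n",
                               (unsigned long long)(base + off),
                               (unsigned long long)pte);
        }
        return 0;
}

In the patch itself this corresponds to deriving new_prot from
pte_pgprot(pte_clrhuge(*(pte_t *)pmd)) before calling phys_pte_init().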
-rw-r--r--   arch/x86/mm/init_64.c | 149
1 file changed, 61 insertions(+), 88 deletions(-)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 6116ff0d7416..8c7eae490a2c 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -270,10 +270,9 @@ static __ref void unmap_low_page(void *adr)
         early_iounmap(adr, PAGE_SIZE);
 }
 
-static int physical_mapping_iter;
-
 static unsigned long __meminit
-phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
+phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
+              pgprot_t prot)
 {
         unsigned pages = 0;
         unsigned long last_map_addr = end;
@@ -291,35 +290,40 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
                         break;
                 }
 
+                /*
+                 * We will re-use the existing mapping.
+                 * Xen for example has some special requirements, like mapping
+                 * pagetable pages as RO. So assume someone who pre-setup
+                 * these mappings are more intelligent.
+                 */
                 if (pte_val(*pte))
-                        goto repeat_set_pte;
+                        continue;
 
                 if (0)
                         printk(" pte=%p addr=%lx pte=%016lx\n",
                                pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
                 pages++;
-repeat_set_pte:
-                set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
+                set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
                 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
         }
 
-        if (physical_mapping_iter == 1)
-                update_page_count(PG_LEVEL_4K, pages);
+        update_page_count(PG_LEVEL_4K, pages);
 
         return last_map_addr;
 }
 
 static unsigned long __meminit
-phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
+phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
+                pgprot_t prot)
 {
         pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
 
-        return phys_pte_init(pte, address, end);
+        return phys_pte_init(pte, address, end, prot);
 }
 
 static unsigned long __meminit
 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
-              unsigned long page_size_mask)
+              unsigned long page_size_mask, pgprot_t prot)
 {
         unsigned long pages = 0;
         unsigned long last_map_addr = end;
@@ -330,6 +334,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
                 unsigned long pte_phys;
                 pmd_t *pmd = pmd_page + pmd_index(address);
                 pte_t *pte;
+                pgprot_t new_prot = prot;
 
                 if (address >= end) {
                         if (!after_bootmem) {
@@ -343,45 +348,58 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
                         if (!pmd_large(*pmd)) {
                                 spin_lock(&init_mm.page_table_lock);
                                 last_map_addr = phys_pte_update(pmd, address,
-                                                                end);
+                                                                end, prot);
                                 spin_unlock(&init_mm.page_table_lock);
                                 continue;
                         }
-                        goto repeat_set_pte;
+                        /*
+                         * If we are ok with PG_LEVEL_2M mapping, then we will
+                         * use the existing mapping,
+                         *
+                         * Otherwise, we will split the large page mapping but
+                         * use the same existing protection bits except for
+                         * large page, so that we don't violate Intel's TLB
+                         * Application note (317080) which says, while changing
+                         * the page sizes, new and old translations should
+                         * not differ with respect to page frame and
+                         * attributes.
+                         */
+                        if (page_size_mask & (1 << PG_LEVEL_2M))
+                                continue;
+                        new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
                 }
 
                 if (page_size_mask & (1<<PG_LEVEL_2M)) {
                         pages++;
-repeat_set_pte:
                         spin_lock(&init_mm.page_table_lock);
                         set_pte((pte_t *)pmd,
-                                pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+                                pfn_pte(address >> PAGE_SHIFT,
+                                        __pgprot(pgprot_val(prot) | _PAGE_PSE)));
                         spin_unlock(&init_mm.page_table_lock);
                         last_map_addr = (address & PMD_MASK) + PMD_SIZE;
                         continue;
                 }
 
                 pte = alloc_low_page(&pte_phys);
-                last_map_addr = phys_pte_init(pte, address, end);
+                last_map_addr = phys_pte_init(pte, address, end, new_prot);
                 unmap_low_page(pte);
 
                 spin_lock(&init_mm.page_table_lock);
                 pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
                 spin_unlock(&init_mm.page_table_lock);
         }
-        if (physical_mapping_iter == 1)
-                update_page_count(PG_LEVEL_2M, pages);
+        update_page_count(PG_LEVEL_2M, pages);
         return last_map_addr;
 }
 
 static unsigned long __meminit
 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
-                unsigned long page_size_mask)
+                unsigned long page_size_mask, pgprot_t prot)
 {
         pmd_t *pmd = pmd_offset(pud, 0);
         unsigned long last_map_addr;
 
-        last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
+        last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
         __flush_tlb_all();
         return last_map_addr;
 }
@@ -398,6 +416,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
                 unsigned long pmd_phys;
                 pud_t *pud = pud_page + pud_index(addr);
                 pmd_t *pmd;
+                pgprot_t prot = PAGE_KERNEL;
 
                 if (addr >= end)
                         break;
@@ -411,16 +430,28 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
                 if (pud_val(*pud)) {
                         if (!pud_large(*pud)) {
                                 last_map_addr = phys_pmd_update(pud, addr, end,
-                                                                page_size_mask);
+                                                                page_size_mask, prot);
                                 continue;
                         }
-
-                        goto repeat_set_pte;
+                        /*
+                         * If we are ok with PG_LEVEL_1G mapping, then we will
+                         * use the existing mapping.
+                         *
+                         * Otherwise, we will split the gbpage mapping but use
+                         * the same existing protection bits except for large
+                         * page, so that we don't violate Intel's TLB
+                         * Application note (317080) which says, while changing
+                         * the page sizes, new and old translations should
+                         * not differ with respect to page frame and
+                         * attributes.
+                         */
+                        if (page_size_mask & (1 << PG_LEVEL_1G))
+                                continue;
+                        prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
                 }
 
                 if (page_size_mask & (1<<PG_LEVEL_1G)) {
                         pages++;
-repeat_set_pte:
                         spin_lock(&init_mm.page_table_lock);
                         set_pte((pte_t *)pud,
                                 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
@@ -430,7 +461,8 @@ repeat_set_pte:
                 }
 
                 pmd = alloc_low_page(&pmd_phys);
-                last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
+                last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
+                                              prot);
                 unmap_low_page(pmd);
 
                 spin_lock(&init_mm.page_table_lock);
@@ -439,8 +471,7 @@ repeat_set_pte:
         }
         __flush_tlb_all();
 
-        if (physical_mapping_iter == 1)
-                update_page_count(PG_LEVEL_1G, pages);
+        update_page_count(PG_LEVEL_1G, pages);
 
         return last_map_addr;
 }
@@ -505,54 +536,15 @@ static void __init init_gbpages(void)
         direct_gbpages = 0;
 }
 
-static int is_kernel(unsigned long pfn)
-{
-        unsigned long pg_addresss = pfn << PAGE_SHIFT;
-
-        if (pg_addresss >= (unsigned long) __pa(_text) &&
-                pg_addresss < (unsigned long) __pa(_end))
-                return 1;
-
-        return 0;
-}
-
 static unsigned long __init kernel_physical_mapping_init(unsigned long start,
                                                 unsigned long end,
                                                 unsigned long page_size_mask)
 {
 
-        unsigned long next, last_map_addr;
-        u64 cached_supported_pte_mask = __supported_pte_mask;
-        unsigned long cache_start = start;
-        unsigned long cache_end = end;
-
-        /*
-         * First iteration will setup identity mapping using large/small pages
-         * based on page_size_mask, with other attributes same as set by
-         * the early code in head_64.S
-         *
-         * Second iteration will setup the appropriate attributes
-         * as desired for the kernel identity mapping.
-         *
-         * This two pass mechanism conforms to the TLB app note which says:
-         *
-         *     "Software should not write to a paging-structure entry in a way
-         *      that would change, for any linear address, both the page size
-         *      and either the page frame or attributes."
-         *
-         * For now, only difference between very early PTE attributes used in
-         * head_64.S and here is _PAGE_NX.
-         */
-        BUILD_BUG_ON((__PAGE_KERNEL_LARGE & ~__PAGE_KERNEL_IDENT_LARGE_EXEC)
-                     != _PAGE_NX);
-        __supported_pte_mask &= ~(_PAGE_NX);
-        physical_mapping_iter = 1;
+        unsigned long next, last_map_addr = end;
 
-repeat:
-        last_map_addr = cache_end;
-
-        start = (unsigned long)__va(cache_start);
-        end = (unsigned long)__va(cache_end);
+        start = (unsigned long)__va(start);
+        end = (unsigned long)__va(end);
 
         for (; start < end; start = next) {
                 pgd_t *pgd = pgd_offset_k(start);
@@ -564,21 +556,11 @@ repeat:
                         next = end;
 
                 if (pgd_val(*pgd)) {
-                        /*
-                         * Static identity mappings will be overwritten
-                         * with run-time mappings. For example, this allows
-                         * the static 0-1GB identity mapping to be mapped
-                         * non-executable with this.
-                         */
-                        if (is_kernel(pte_pfn(*((pte_t *) pgd))))
-                                goto realloc;
-
                         last_map_addr = phys_pud_update(pgd, __pa(start),
                                                  __pa(end), page_size_mask);
                         continue;
                 }
 
-realloc:
                 pud = alloc_low_page(&pud_phys);
                 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
                                                  page_size_mask);
@@ -590,15 +572,6 @@ realloc:
         }
         __flush_tlb_all();
 
-        if (physical_mapping_iter == 1) {
-                physical_mapping_iter = 2;
-                /*
-                 * Second iteration will set the actual desired PTE attributes.
-                 */
-                __supported_pte_mask = cached_supported_pte_mask;
-                goto repeat;
-        }
-
         return last_map_addr;
 }
 
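
As a closing illustration of the Xen-relevant part of the change, the sketch
below (again a stand-alone toy, not kernel code; TOY_PRESENT and TOY_RW are
invented names) models the new behaviour of phys_pte_init(): slots that
already carry a mapping, such as page-table pages a hypervisor mapped
read-only, are skipped rather than rewritten, and only empty slots receive
the requested protection.

/*
 * Toy model of the "leave pre-existing entries alone" rule: only empty
 * slots get the new protection; pre-populated entries are not rewritten.
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_PRESENT (1u << 0)
#define TOY_RW      (1u << 1)

int main(void)
{
        /* Slot 1 was pre-mapped read-only by the "hypervisor". */
        uint32_t pte[4] = { 0, TOY_PRESENT, 0, 0 };
        uint32_t want = TOY_PRESENT | TOY_RW;   /* what we would normally set */

        for (int i = 0; i < 4; i++) {
                if (pte[i] & TOY_PRESENT)
                        continue;               /* reuse the existing mapping */
                pte[i] = want;
        }

        for (int i = 0; i < 4; i++)
                printf("pte[%d] = %#x\n", i, pte[i]);
        return 0;
}

The expected output shows pte[1] keeping its original read-only value while
the empty slots are filled with the new protection.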