diff options
author | Ingo Molnar <mingo@elte.hu> | 2011-05-12 03:36:18 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2011-05-12 03:36:18 -0400 |
commit | 9cb5baba5e3acba0994ad899ee908799104c9965 (patch) | |
tree | d5ff16000256a0bf56279926e6114b4603ede2b4 /arch/x86/xen | |
parent | 7142d17e8f935fa842e9f6eece2281b6d41625d6 (diff) | |
parent | 693d92a1bbc9e42681c42ed190bd42b636ca876f (diff) |
Merge commit 'v2.6.39-rc7' into sched/core
Diffstat (limited to 'arch/x86/xen')
-rw-r--r-- | arch/x86/xen/mmu.c | 138 | ||||
-rw-r--r-- | arch/x86/xen/setup.c | 2 |
2 files changed, 134 insertions, 6 deletions
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index a991b57f91fe..55c965b38c27 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c | |||
@@ -1463,6 +1463,119 @@ static int xen_pgd_alloc(struct mm_struct *mm) | |||
1463 | return ret; | 1463 | return ret; |
1464 | } | 1464 | } |
1465 | 1465 | ||
1466 | #ifdef CONFIG_X86_64 | ||
1467 | static __initdata u64 __last_pgt_set_rw = 0; | ||
1468 | static __initdata u64 __pgt_buf_start = 0; | ||
1469 | static __initdata u64 __pgt_buf_end = 0; | ||
1470 | static __initdata u64 __pgt_buf_top = 0; | ||
1471 | /* | ||
1472 | * As a consequence of the commit: | ||
1473 | * | ||
1474 | * commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e | ||
1475 | * Author: Yinghai Lu <yinghai@kernel.org> | ||
1476 | * Date: Fri Dec 17 16:58:28 2010 -0800 | ||
1477 | * | ||
1478 | * x86-64, mm: Put early page table high | ||
1479 | * | ||
1480 | * at some point init_memory_mapping is going to reach the pagetable pages | ||
1481 | * area and map those pages too (mapping them as normal memory that falls | ||
1482 | * in the range of addresses passed to init_memory_mapping as argument). | ||
1483 | * Some of those pages are already pagetable pages (they are in the range | ||
1484 | * pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and | ||
1485 | * everything is fine. | ||
1486 | * Some of these pages are not pagetable pages yet (they fall in the range | ||
1487 | * pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they | ||
1488 | * are going to be mapped RW. When these pages become pagetable pages and | ||
1489 | * are hooked into the pagetable, xen will find that the guest has already | ||
1490 | * a RW mapping of them somewhere and fail the operation. | ||
1491 | * The reason Xen requires pagetables to be RO is that the hypervisor needs | ||
1492 | * to verify that the pagetables are valid before using them. The validation | ||
1493 | * operations are called "pinning". | ||
1494 | * | ||
1495 | * In order to fix the issue we mark all the pages in the entire range | ||
1496 | * pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation | ||
1497 | * is completed only the range pgt_buf_start-pgt_buf_end is reserved by | ||
1498 | * init_memory_mapping. Hence the kernel is going to crash as soon as one | ||
1499 | * of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those | ||
1500 | * ranges are RO). | ||
1501 | * | ||
1502 | * For this reason, 'mark_rw_past_pgt' is introduced which is called _after_ | ||
1503 | * the init_memory_mapping has completed (in a perfect world we would | ||
1504 | * call this function from init_memory_mapping, but lets ignore that). | ||
1505 | * | ||
1506 | * Because we are called _after_ init_memory_mapping the pgt_buf_[start, | ||
1507 | * end,top] have all changed to new values (b/c init_memory_mapping | ||
1508 | * is called and setting up another new page-table). Hence, the first time | ||
1509 | * we enter this function, we save away the pgt_buf_start value and update | ||
1510 | * the pgt_buf_[end,top]. | ||
1511 | * | ||
1512 | * When we detect that the "old" pgt_buf_start through pgt_buf_end | ||
1513 | * PFNs have been reserved (so memblock_x86_reserve_range has been called), | ||
1514 | * we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top. | ||
1515 | * | ||
1516 | * And then we update those "old" pgt_buf_[end|top] with the new ones | ||
1517 | * so that we can redo this on the next pagetable. | ||
1518 | */ | ||
1519 | static __init void mark_rw_past_pgt(void) { | ||
1520 | |||
1521 | if (pgt_buf_end > pgt_buf_start) { | ||
1522 | u64 addr, size; | ||
1523 | |||
1524 | /* Save it away. */ | ||
1525 | if (!__pgt_buf_start) { | ||
1526 | __pgt_buf_start = pgt_buf_start; | ||
1527 | __pgt_buf_end = pgt_buf_end; | ||
1528 | __pgt_buf_top = pgt_buf_top; | ||
1529 | return; | ||
1530 | } | ||
1531 | /* If we get the range that starts at __pgt_buf_end that means | ||
1532 | * the range is reserved, and that in 'init_memory_mapping' | ||
1533 | * the 'memblock_x86_reserve_range' has been called with the | ||
1534 | * outdated __pgt_buf_start, __pgt_buf_end (the "new" | ||
1535 | * pgt_buf_[start|end|top] refer now to a new pagetable. | ||
1536 | * Note: we are called _after_ the pgt_buf_[..] have been | ||
1537 | * updated.*/ | ||
1538 | |||
1539 | addr = memblock_x86_find_in_range_size(PFN_PHYS(__pgt_buf_start), | ||
1540 | &size, PAGE_SIZE); | ||
1541 | |||
1542 | /* Still not reserved, meaning 'memblock_x86_reserve_range' | ||
1543 | * hasn't been called yet. Update the _end and _top.*/ | ||
1544 | if (addr == PFN_PHYS(__pgt_buf_start)) { | ||
1545 | __pgt_buf_end = pgt_buf_end; | ||
1546 | __pgt_buf_top = pgt_buf_top; | ||
1547 | return; | ||
1548 | } | ||
1549 | |||
1550 | /* OK, the area is reserved, meaning it is time for us to | ||
1551 | * set RW for the old end->top PFNs. */ | ||
1552 | |||
1553 | /* ..unless we had already done this. */ | ||
1554 | if (__pgt_buf_end == __last_pgt_set_rw) | ||
1555 | return; | ||
1556 | |||
1557 | addr = PFN_PHYS(__pgt_buf_end); | ||
1558 | |||
1559 | /* set as RW the rest */ | ||
1560 | printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", | ||
1561 | PFN_PHYS(__pgt_buf_end), PFN_PHYS(__pgt_buf_top)); | ||
1562 | |||
1563 | while (addr < PFN_PHYS(__pgt_buf_top)) { | ||
1564 | make_lowmem_page_readwrite(__va(addr)); | ||
1565 | addr += PAGE_SIZE; | ||
1566 | } | ||
1567 | /* And update everything so that we are ready for the next | ||
1568 | * pagetable (the one created for regions past 4GB) */ | ||
1569 | __last_pgt_set_rw = __pgt_buf_end; | ||
1570 | __pgt_buf_start = pgt_buf_start; | ||
1571 | __pgt_buf_end = pgt_buf_end; | ||
1572 | __pgt_buf_top = pgt_buf_top; | ||
1573 | } | ||
1574 | return; | ||
1575 | } | ||
1576 | #else | ||
1577 | static __init void mark_rw_past_pgt(void) { } | ||
1578 | #endif | ||
1466 | static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) | 1579 | static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) |
1467 | { | 1580 | { |
1468 | #ifdef CONFIG_X86_64 | 1581 | #ifdef CONFIG_X86_64 |
@@ -1473,30 +1586,43 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) | |||
1473 | #endif | 1586 | #endif |
1474 | } | 1587 | } |
1475 | 1588 | ||
1589 | #ifdef CONFIG_X86_32 | ||
1476 | static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) | 1590 | static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) |
1477 | { | 1591 | { |
1478 | unsigned long pfn = pte_pfn(pte); | ||
1479 | |||
1480 | #ifdef CONFIG_X86_32 | ||
1481 | /* If there's an existing pte, then don't allow _PAGE_RW to be set */ | 1592 | /* If there's an existing pte, then don't allow _PAGE_RW to be set */ |
1482 | if (pte_val_ma(*ptep) & _PAGE_PRESENT) | 1593 | if (pte_val_ma(*ptep) & _PAGE_PRESENT) |
1483 | pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & | 1594 | pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & |
1484 | pte_val_ma(pte)); | 1595 | pte_val_ma(pte)); |
1485 | #endif | 1596 | |
1597 | return pte; | ||
1598 | } | ||
1599 | #else /* CONFIG_X86_64 */ | ||
1600 | static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) | ||
1601 | { | ||
1602 | unsigned long pfn = pte_pfn(pte); | ||
1486 | 1603 | ||
1487 | /* | 1604 | /* |
1605 | * A bit of optimization. We do not need to call the workaround | ||
1606 | * when xen_set_pte_init is called with a PTE with 0 as PFN. | ||
1607 | * That is b/c the pagetable at that point are just being populated | ||
1608 | * with empty values and we can save some cycles by not calling | ||
1609 | * the 'memblock' code.*/ | ||
1610 | if (pfn) | ||
1611 | mark_rw_past_pgt(); | ||
1612 | /* | ||
1488 | * If the new pfn is within the range of the newly allocated | 1613 | * If the new pfn is within the range of the newly allocated |
1489 | * kernel pagetable, and it isn't being mapped into an | 1614 | * kernel pagetable, and it isn't being mapped into an |
1490 | * early_ioremap fixmap slot as a freshly allocated page, make sure | 1615 | * early_ioremap fixmap slot as a freshly allocated page, make sure |
1491 | * it is RO. | 1616 | * it is RO. |
1492 | */ | 1617 | */ |
1493 | if (((!is_early_ioremap_ptep(ptep) && | 1618 | if (((!is_early_ioremap_ptep(ptep) && |
1494 | pfn >= pgt_buf_start && pfn < pgt_buf_end)) || | 1619 | pfn >= pgt_buf_start && pfn < pgt_buf_top)) || |
1495 | (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1))) | 1620 | (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1))) |
1496 | pte = pte_wrprotect(pte); | 1621 | pte = pte_wrprotect(pte); |
1497 | 1622 | ||
1498 | return pte; | 1623 | return pte; |
1499 | } | 1624 | } |
1625 | #endif /* CONFIG_X86_64 */ | ||
1500 | 1626 | ||
1501 | /* Init-time set_pte while constructing initial pagetables, which | 1627 | /* Init-time set_pte while constructing initial pagetables, which |
1502 | doesn't allow RO pagetable pages to be remapped RW */ | 1628 | doesn't allow RO pagetable pages to be remapped RW */ |
@@ -1992,6 +2118,8 @@ __init void xen_ident_map_ISA(void) | |||
1992 | 2118 | ||
1993 | static __init void xen_post_allocator_init(void) | 2119 | static __init void xen_post_allocator_init(void) |
1994 | { | 2120 | { |
2121 | mark_rw_past_pgt(); | ||
2122 | |||
1995 | #ifdef CONFIG_XEN_DEBUG | 2123 | #ifdef CONFIG_XEN_DEBUG |
1996 | pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); | 2124 | pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); |
1997 | #endif | 2125 | #endif |
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index fa0269a99377..90bac0aac3a5 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c | |||
@@ -227,7 +227,7 @@ char * __init xen_memory_setup(void) | |||
227 | 227 | ||
228 | memcpy(map_raw, map, sizeof(map)); | 228 | memcpy(map_raw, map, sizeof(map)); |
229 | e820.nr_map = 0; | 229 | e820.nr_map = 0; |
230 | xen_extra_mem_start = mem_end; | 230 | xen_extra_mem_start = max((1ULL << 32), mem_end); |
231 | for (i = 0; i < memmap.nr_entries; i++) { | 231 | for (i = 0; i < memmap.nr_entries; i++) { |
232 | unsigned long long end; | 232 | unsigned long long end; |
233 | 233 | ||