aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorKonrad Rzeszutek Wilk <konrad.wilk@oracle.com>2011-04-29 11:34:00 -0400
committerKonrad Rzeszutek Wilk <konrad.wilk@oracle.com>2011-05-02 16:33:34 -0400
commita38647837a411f7df79623128421eef2118b5884 (patch)
tree49c444f812e80f72e36cfd86d653bb43519615f7 /arch
parent8e10cd74342c7f5ce259cceca36f6eba084f5d58 (diff)
xen/mmu: Add workaround "x86-64, mm: Put early page table high"
As a consequence of the commit: commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e Author: Yinghai Lu <yinghai@kernel.org> Date: Fri Dec 17 16:58:28 2010 -0800 x86-64, mm: Put early page table high it causes the Linux kernel to crash under Xen: mapping kernel into physical memory Xen: setup ISA identity maps about to get started... (XEN) mm.c:2466:d0 Bad type (saw 7400000000000001 != exp 1000000000000000) for mfn b1d89 (pfn bacf7) (XEN) mm.c:3027:d0 Error while pinning mfn b1d89 (XEN) traps.c:481:d0 Unhandled invalid opcode fault/trap [#6] on VCPU 0 [ec=0000] (XEN) domain_crash_sync called from entry.S (XEN) Domain 0 (vcpu#0) crashed on cpu#0: ... The reason is that at some point init_memory_mapping is going to reach the pagetable pages area and map those pages too (mapping them as normal memory that falls in the range of addresses passed to init_memory_mapping as argument). Some of those pages are already pagetable pages (they are in the range pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and everything is fine. Some of these pages are not pagetable pages yet (they fall in the range pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they are going to be mapped RW. When these pages become pagetable pages and are hooked into the pagetable, xen will find that the guest has already a RW mapping of them somewhere and fail the operation. The reason Xen requires pagetables to be RO is that the hypervisor needs to verify that the pagetables are valid before using them. The validation operations are called "pinning" (more details in arch/x86/xen/mmu.c). In order to fix the issue we mark all the pages in the entire range pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation is completed only the range pgt_buf_start-pgt_buf_end is reserved by init_memory_mapping. Hence the kernel is going to crash as soon as one of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those ranges are RO). For this reason, this function is introduced which is called _after_ the init_memory_mapping has completed (in a perfect world we would call this function from init_memory_mapping, but lets ignore that). Because we are called _after_ init_memory_mapping the pgt_buf_[start, end,top] have all changed to new values (b/c another init_memory_mapping is called). Hence, the first time we enter this function, we save away the pgt_buf_start value and update the pgt_buf_[end,top]. When we detect that the "old" pgt_buf_start through pgt_buf_end PFNs have been reserved (so memblock_x86_reserve_range has been called), we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top. And then we update those "old" pgt_buf_[end|top] with the new ones so that we can redo this on the next pagetable. Acked-by: "H. Peter Anvin" <hpa@zytor.com> Reviewed-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> [v1: Updated with Jeremy's comments] [v2: Added the crash output] Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Diffstat (limited to 'arch')
-rw-r--r--arch/x86/xen/mmu.c123
1 files changed, 123 insertions, 0 deletions
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index aef7af92b28b..1bca25f60ff2 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1463,6 +1463,119 @@ static int xen_pgd_alloc(struct mm_struct *mm)
1463 return ret; 1463 return ret;
1464} 1464}
1465 1465
1466#ifdef CONFIG_X86_64
1467static __initdata u64 __last_pgt_set_rw = 0;
1468static __initdata u64 __pgt_buf_start = 0;
1469static __initdata u64 __pgt_buf_end = 0;
1470static __initdata u64 __pgt_buf_top = 0;
1471/*
1472 * As a consequence of the commit:
1473 *
1474 * commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e
1475 * Author: Yinghai Lu <yinghai@kernel.org>
1476 * Date: Fri Dec 17 16:58:28 2010 -0800
1477 *
1478 * x86-64, mm: Put early page table high
1479 *
1480 * at some point init_memory_mapping is going to reach the pagetable pages
1481 * area and map those pages too (mapping them as normal memory that falls
1482 * in the range of addresses passed to init_memory_mapping as argument).
1483 * Some of those pages are already pagetable pages (they are in the range
1484 * pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and
1485 * everything is fine.
1486 * Some of these pages are not pagetable pages yet (they fall in the range
1487 * pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they
1488 * are going to be mapped RW. When these pages become pagetable pages and
1489 * are hooked into the pagetable, xen will find that the guest has already
1490 * a RW mapping of them somewhere and fail the operation.
1491 * The reason Xen requires pagetables to be RO is that the hypervisor needs
1492 * to verify that the pagetables are valid before using them. The validation
1493 * operations are called "pinning".
1494 *
1495 * In order to fix the issue we mark all the pages in the entire range
1496 * pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation
1497 * is completed only the range pgt_buf_start-pgt_buf_end is reserved by
1498 * init_memory_mapping. Hence the kernel is going to crash as soon as one
1499 * of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those
1500 * ranges are RO).
1501 *
1502 * For this reason, 'mark_rw_past_pgt' is introduced which is called _after_
1503 * the init_memory_mapping has completed (in a perfect world we would
1504 * call this function from init_memory_mapping, but lets ignore that).
1505 *
1506 * Because we are called _after_ init_memory_mapping the pgt_buf_[start,
1507 * end,top] have all changed to new values (b/c init_memory_mapping
1508 * is called and setting up another new page-table). Hence, the first time
1509 * we enter this function, we save away the pgt_buf_start value and update
1510 * the pgt_buf_[end,top].
1511 *
1512 * When we detect that the "old" pgt_buf_start through pgt_buf_end
1513 * PFNs have been reserved (so memblock_x86_reserve_range has been called),
1514 * we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top.
1515 *
1516 * And then we update those "old" pgt_buf_[end|top] with the new ones
1517 * so that we can redo this on the next pagetable.
1518 */
1519static __init void mark_rw_past_pgt(void) {
1520
1521 if (pgt_buf_end > pgt_buf_start) {
1522 u64 addr, size;
1523
1524 /* Save it away. */
1525 if (!__pgt_buf_start) {
1526 __pgt_buf_start = pgt_buf_start;
1527 __pgt_buf_end = pgt_buf_end;
1528 __pgt_buf_top = pgt_buf_top;
1529 return;
1530 }
1531 /* If we get the range that starts at __pgt_buf_end that means
1532 * the range is reserved, and that in 'init_memory_mapping'
1533 * the 'memblock_x86_reserve_range' has been called with the
1534 * outdated __pgt_buf_start, __pgt_buf_end (the "new"
1535 * pgt_buf_[start|end|top] refer now to a new pagetable.
1536 * Note: we are called _after_ the pgt_buf_[..] have been
1537 * updated.*/
1538
1539 addr = memblock_x86_find_in_range_size(PFN_PHYS(__pgt_buf_start),
1540 &size, PAGE_SIZE);
1541
1542 /* Still not reserved, meaning 'memblock_x86_reserve_range'
1543 * hasn't been called yet. Update the _end and _top.*/
1544 if (addr == PFN_PHYS(__pgt_buf_start)) {
1545 __pgt_buf_end = pgt_buf_end;
1546 __pgt_buf_top = pgt_buf_top;
1547 return;
1548 }
1549
1550 /* OK, the area is reserved, meaning it is time for us to
1551 * set RW for the old end->top PFNs. */
1552
1553 /* ..unless we had already done this. */
1554 if (__pgt_buf_end == __last_pgt_set_rw)
1555 return;
1556
1557 addr = PFN_PHYS(__pgt_buf_end);
1558
1559 /* set as RW the rest */
1560 printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n",
1561 PFN_PHYS(__pgt_buf_end), PFN_PHYS(__pgt_buf_top));
1562
1563 while (addr < PFN_PHYS(__pgt_buf_top)) {
1564 make_lowmem_page_readwrite(__va(addr));
1565 addr += PAGE_SIZE;
1566 }
1567 /* And update everything so that we are ready for the next
1568 * pagetable (the one created for regions past 4GB) */
1569 __last_pgt_set_rw = __pgt_buf_end;
1570 __pgt_buf_start = pgt_buf_start;
1571 __pgt_buf_end = pgt_buf_end;
1572 __pgt_buf_top = pgt_buf_top;
1573 }
1574 return;
1575}
1576#else
1577static __init void mark_rw_past_pgt(void) { }
1578#endif
1466static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) 1579static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1467{ 1580{
1468#ifdef CONFIG_X86_64 1581#ifdef CONFIG_X86_64
@@ -1489,6 +1602,14 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1489 unsigned long pfn = pte_pfn(pte); 1602 unsigned long pfn = pte_pfn(pte);
1490 1603
1491 /* 1604 /*
1605 * A bit of optimization. We do not need to call the workaround
1606 * when xen_set_pte_init is called with a PTE with 0 as PFN.
1607 * That is b/c the pagetable at that point are just being populated
1608 * with empty values and we can save some cycles by not calling
1609 * the 'memblock' code.*/
1610 if (pfn)
1611 mark_rw_past_pgt();
1612 /*
1492 * If the new pfn is within the range of the newly allocated 1613 * If the new pfn is within the range of the newly allocated
1493 * kernel pagetable, and it isn't being mapped into an 1614 * kernel pagetable, and it isn't being mapped into an
1494 * early_ioremap fixmap slot as a freshly allocated page, make sure 1615 * early_ioremap fixmap slot as a freshly allocated page, make sure
@@ -1997,6 +2118,8 @@ __init void xen_ident_map_ISA(void)
1997 2118
1998static __init void xen_post_allocator_init(void) 2119static __init void xen_post_allocator_init(void)
1999{ 2120{
2121 mark_rw_past_pgt();
2122
2000#ifdef CONFIG_XEN_DEBUG 2123#ifdef CONFIG_XEN_DEBUG
2001 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); 2124 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
2002#endif 2125#endif