| field | value | date |
|---|---|---|
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-05-12 15:21:51 -0400 |
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-05-12 15:21:51 -0400 |
| commit | 0c5e1577f1108e88475ca7b7ca75c411460173e1 | |
| tree | d74285bb8d89c7c6d2d6896410fdcf57ff2a3b96 | |
| parent | 982b2035d9d7033f63db187bac55e9d8998b0266 | |
| parent | 53f8023febf9b3e18d8fb0d99c55010e473ce53d | |
Merge branch 'stable/bug-fixes-for-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen
* 'stable/bug-fixes-for-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen:
x86/mm: Fix section mismatch derived from native_pagetable_reserve()
x86,xen: introduce x86_init.mapping.pagetable_reserve
Revert "xen/mmu: Add workaround "x86-64, mm: Put early page table high""
| file | lines changed |
|---|---|
| arch/x86/include/asm/pgtable_types.h | 1 |
| arch/x86/include/asm/x86_init.h | 12 |
| arch/x86/kernel/x86_init.c | 4 |
| arch/x86/mm/init.c | 24 |
| arch/x86/xen/mmu.c | 138 |

5 files changed, 54 insertions, 125 deletions

```diff
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 7db7723d1f32..d56187c6b838 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -299,6 +299,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 /* Install a pte for a particular vaddr in kernel space. */
 void set_pte_vaddr(unsigned long vaddr, pte_t pte);
 
+extern void native_pagetable_reserve(u64 start, u64 end);
 #ifdef CONFIG_X86_32
 extern void native_pagetable_setup_start(pgd_t *base);
 extern void native_pagetable_setup_done(pgd_t *base);
```

```diff
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 643ebf2e2ad8..d3d859035af9 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -68,6 +68,17 @@ struct x86_init_oem {
 };
 
 /**
+ * struct x86_init_mapping - platform specific initial kernel pagetable setup
+ * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage
+ *
+ * For more details on the purpose of this hook, look in
+ * init_memory_mapping and the commit that added it.
+ */
+struct x86_init_mapping {
+	void (*pagetable_reserve)(u64 start, u64 end);
+};
+
+/**
  * struct x86_init_paging - platform specific paging functions
  * @pagetable_setup_start: platform specific pre paging_init() call
  * @pagetable_setup_done: platform specific post paging_init() call
@@ -123,6 +134,7 @@ struct x86_init_ops {
 	struct x86_init_mpparse mpparse;
 	struct x86_init_irqs irqs;
 	struct x86_init_oem oem;
+	struct x86_init_mapping mapping;
 	struct x86_init_paging paging;
 	struct x86_init_timers timers;
 	struct x86_init_iommu iommu;
```

```diff
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index c11514e9128b..75ef4b18e9b7 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -61,6 +61,10 @@ struct x86_init_ops x86_init __initdata = {
 		.banner = default_banner,
 	},
 
+	.mapping = {
+		.pagetable_reserve = native_pagetable_reserve,
+	},
+
 	.paging = {
 		.pagetable_setup_start = native_pagetable_setup_start,
 		.pagetable_setup_done = native_pagetable_setup_done,
```

```diff
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 286d289b039b..37b8b0fe8320 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -81,6 +81,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
 		end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
 }
 
+void __init native_pagetable_reserve(u64 start, u64 end)
+{
+	memblock_x86_reserve_range(start, end, "PGTABLE");
+}
+
 struct map_range {
 	unsigned long start;
 	unsigned long end;
@@ -272,9 +277,24 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 
 	__flush_tlb_all();
 
+	/*
+	 * Reserve the kernel pagetable pages we used (pgt_buf_start -
+	 * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
+	 * so that they can be reused for other purposes.
+	 *
+	 * On native it just means calling memblock_x86_reserve_range, on Xen it
+	 * also means marking RW the pagetable pages that we allocated before
+	 * but that haven't been used.
+	 *
+	 * In fact on xen we mark RO the whole range pgt_buf_start -
+	 * pgt_buf_top, because we have to make sure that when
+	 * init_memory_mapping reaches the pagetable pages area, it maps
+	 * RO all the pagetable pages, including the ones that are beyond
+	 * pgt_buf_end at that time.
+	 */
 	if (!after_bootmem && pgt_buf_end > pgt_buf_start)
-		memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT,
-				pgt_buf_end << PAGE_SHIFT, "PGTABLE");
+		x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
+				PFN_PHYS(pgt_buf_end));
 
 	if (!after_bootmem)
 		early_memtest(start, end);
```
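
The call site above also switches from open-coded `pgt_buf_* << PAGE_SHIFT` shifts to `PFN_PHYS()`. The two spellings are equivalent; a standalone check, assuming 4 KiB pages and redefining the macros locally for illustration (in the kernel they come from the arch headers):

```c
/*
 * Standalone check that PFN_PHYS(pfn) equals pfn << PAGE_SHIFT.
 * PAGE_SHIFT and PFN_PHYS are redefined here for illustration.
 */
#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT	12	/* assuming 4 KiB pages */
#define PFN_PHYS(x)	((uint64_t)(x) << PAGE_SHIFT)

int main(void)
{
	uint64_t pgt_buf_start = 0x1234;	/* a page frame number */

	/* frame 0x1234 starts at physical address 0x1234000 */
	assert(PFN_PHYS(pgt_buf_start) == 0x1234000ULL);
	assert(PFN_PHYS(pgt_buf_start) == (pgt_buf_start << PAGE_SHIFT));
	return 0;
}
```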

```diff
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 55c965b38c27..0684f3c74d53 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1275,6 +1275,20 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
 {
 }
 
+static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
+{
+	/* reserve the range used */
+	native_pagetable_reserve(start, end);
+
+	/* set as RW the rest */
+	printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
+			PFN_PHYS(pgt_buf_top));
+	while (end < PFN_PHYS(pgt_buf_top)) {
+		make_lowmem_page_readwrite(__va(end));
+		end += PAGE_SIZE;
+	}
+}
+
 static void xen_post_allocator_init(void);
 
 static __init void xen_pagetable_setup_done(pgd_t *base)
@@ -1463,119 +1477,6 @@ static int xen_pgd_alloc(struct mm_struct *mm)
 	return ret;
 }
 
-#ifdef CONFIG_X86_64
-static __initdata u64 __last_pgt_set_rw = 0;
-static __initdata u64 __pgt_buf_start = 0;
-static __initdata u64 __pgt_buf_end = 0;
-static __initdata u64 __pgt_buf_top = 0;
-/*
- * As a consequence of the commit:
- *
- * commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e
- * Author: Yinghai Lu <yinghai@kernel.org>
- * Date:   Fri Dec 17 16:58:28 2010 -0800
- *
- *     x86-64, mm: Put early page table high
- *
- * at some point init_memory_mapping is going to reach the pagetable pages
- * area and map those pages too (mapping them as normal memory that falls
- * in the range of addresses passed to init_memory_mapping as argument).
- * Some of those pages are already pagetable pages (they are in the range
- * pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and
- * everything is fine.
- * Some of these pages are not pagetable pages yet (they fall in the range
- * pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they
- * are going to be mapped RW. When these pages become pagetable pages and
- * are hooked into the pagetable, xen will find that the guest has already
- * a RW mapping of them somewhere and fail the operation.
- * The reason Xen requires pagetables to be RO is that the hypervisor needs
- * to verify that the pagetables are valid before using them. The validation
- * operations are called "pinning".
- *
- * In order to fix the issue we mark all the pages in the entire range
- * pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation
- * is completed only the range pgt_buf_start-pgt_buf_end is reserved by
- * init_memory_mapping. Hence the kernel is going to crash as soon as one
- * of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those
- * ranges are RO).
- *
- * For this reason, 'mark_rw_past_pgt' is introduced which is called _after_
- * the init_memory_mapping has completed (in a perfect world we would
- * call this function from init_memory_mapping, but lets ignore that).
- *
- * Because we are called _after_ init_memory_mapping the pgt_buf_[start,
- * end,top] have all changed to new values (b/c init_memory_mapping
- * is called and setting up another new page-table). Hence, the first time
- * we enter this function, we save away the pgt_buf_start value and update
- * the pgt_buf_[end,top].
- *
- * When we detect that the "old" pgt_buf_start through pgt_buf_end
- * PFNs have been reserved (so memblock_x86_reserve_range has been called),
- * we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top.
- *
- * And then we update those "old" pgt_buf_[end|top] with the new ones
- * so that we can redo this on the next pagetable.
- */
-static __init void mark_rw_past_pgt(void) {
-
-	if (pgt_buf_end > pgt_buf_start) {
-		u64 addr, size;
-
-		/* Save it away. */
-		if (!__pgt_buf_start) {
-			__pgt_buf_start = pgt_buf_start;
-			__pgt_buf_end = pgt_buf_end;
-			__pgt_buf_top = pgt_buf_top;
-			return;
-		}
-		/* If we get the range that starts at __pgt_buf_end that means
-		 * the range is reserved, and that in 'init_memory_mapping'
-		 * the 'memblock_x86_reserve_range' has been called with the
-		 * outdated __pgt_buf_start, __pgt_buf_end (the "new"
-		 * pgt_buf_[start|end|top] refer now to a new pagetable.
-		 * Note: we are called _after_ the pgt_buf_[..] have been
-		 * updated.*/
-
-		addr = memblock_x86_find_in_range_size(PFN_PHYS(__pgt_buf_start),
-						       &size, PAGE_SIZE);
-
-		/* Still not reserved, meaning 'memblock_x86_reserve_range'
-		 * hasn't been called yet. Update the _end and _top.*/
-		if (addr == PFN_PHYS(__pgt_buf_start)) {
-			__pgt_buf_end = pgt_buf_end;
-			__pgt_buf_top = pgt_buf_top;
-			return;
-		}
-
-		/* OK, the area is reserved, meaning it is time for us to
-		 * set RW for the old end->top PFNs. */
-
-		/* ..unless we had already done this. */
-		if (__pgt_buf_end == __last_pgt_set_rw)
-			return;
-
-		addr = PFN_PHYS(__pgt_buf_end);
-
-		/* set as RW the rest */
-		printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n",
-			PFN_PHYS(__pgt_buf_end), PFN_PHYS(__pgt_buf_top));
-
-		while (addr < PFN_PHYS(__pgt_buf_top)) {
-			make_lowmem_page_readwrite(__va(addr));
-			addr += PAGE_SIZE;
-		}
-		/* And update everything so that we are ready for the next
-		 * pagetable (the one created for regions past 4GB) */
-		__last_pgt_set_rw = __pgt_buf_end;
-		__pgt_buf_start = pgt_buf_start;
-		__pgt_buf_end = pgt_buf_end;
-		__pgt_buf_top = pgt_buf_top;
-	}
-	return;
-}
-#else
-static __init void mark_rw_past_pgt(void) { }
-#endif
 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 #ifdef CONFIG_X86_64
@@ -1602,14 +1503,6 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
 	unsigned long pfn = pte_pfn(pte);
 
 	/*
-	 * A bit of optimization. We do not need to call the workaround
-	 * when xen_set_pte_init is called with a PTE with 0 as PFN.
-	 * That is b/c the pagetable at that point are just being populated
-	 * with empty values and we can save some cycles by not calling
-	 * the 'memblock' code.*/
-	if (pfn)
-		mark_rw_past_pgt();
-	/*
 	 * If the new pfn is within the range of the newly allocated
 	 * kernel pagetable, and it isn't being mapped into an
 	 * early_ioremap fixmap slot as a freshly allocated page, make sure
@@ -2118,8 +2011,6 @@ __init void xen_ident_map_ISA(void)
 
 static __init void xen_post_allocator_init(void)
 {
-	mark_rw_past_pgt();
-
 #ifdef CONFIG_XEN_DEBUG
 	pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
 #endif
@@ -2228,6 +2119,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
 
 void __init xen_init_mmu_ops(void)
 {
+	x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
 	x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
 	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
 	pv_mmu_ops = xen_mmu_ops;
```
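
The new `xen_mapping_pagetable_reserve` above reserves the used range and then walks the unused tail of the pagetable buffer one page at a time, flipping each page back to RW. A compilable sketch of just that loop shape, with the Xen helper stubbed out and the sample range an illustrative assumption (the real helper takes a virtual address via `__va()`):

```c
/*
 * Sketch of the page-granular walk in xen_mapping_pagetable_reserve.
 * make_lowmem_page_readwrite() is a real Xen helper; it is stubbed
 * here to print instead. PAGE_SIZE and the addresses are assumptions.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL

static void make_lowmem_page_readwrite(uint64_t phys)	/* stub */
{
	printf("RW page at %#llx\n", (unsigned long long)phys);
}

int main(void)
{
	uint64_t end = 0x3000;	/* PFN_PHYS(pgt_buf_end): first unused page */
	uint64_t top = 0x6000;	/* PFN_PHYS(pgt_buf_top): end of the buffer */

	/* same loop shape as the patch: one call per unused page, so
	 * pages at 0x3000, 0x4000 and 0x5000 are flipped back to RW */
	while (end < top) {
		make_lowmem_page_readwrite(end);
		end += PAGE_SIZE;
	}
	return 0;
}
```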
