aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2011-05-12 15:21:51 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2011-05-12 15:21:51 -0400
commit0c5e1577f1108e88475ca7b7ca75c411460173e1 (patch)
treed74285bb8d89c7c6d2d6896410fdcf57ff2a3b96 /arch
parent982b2035d9d7033f63db187bac55e9d8998b0266 (diff)
parent53f8023febf9b3e18d8fb0d99c55010e473ce53d (diff)
Merge branch 'stable/bug-fixes-for-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen
* 'stable/bug-fixes-for-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen: x86/mm: Fix section mismatch derived from native_pagetable_reserve() x86,xen: introduce x86_init.mapping.pagetable_reserve Revert "xen/mmu: Add workaround "x86-64, mm: Put early page table high""
Diffstat (limited to 'arch')
-rw-r--r--arch/x86/include/asm/pgtable_types.h1
-rw-r--r--arch/x86/include/asm/x86_init.h12
-rw-r--r--arch/x86/kernel/x86_init.c4
-rw-r--r--arch/x86/mm/init.c24
-rw-r--r--arch/x86/xen/mmu.c138
5 files changed, 54 insertions, 125 deletions
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 7db7723d1f32..d56187c6b838 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -299,6 +299,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
299/* Install a pte for a particular vaddr in kernel space. */ 299/* Install a pte for a particular vaddr in kernel space. */
300void set_pte_vaddr(unsigned long vaddr, pte_t pte); 300void set_pte_vaddr(unsigned long vaddr, pte_t pte);
301 301
302extern void native_pagetable_reserve(u64 start, u64 end);
302#ifdef CONFIG_X86_32 303#ifdef CONFIG_X86_32
303extern void native_pagetable_setup_start(pgd_t *base); 304extern void native_pagetable_setup_start(pgd_t *base);
304extern void native_pagetable_setup_done(pgd_t *base); 305extern void native_pagetable_setup_done(pgd_t *base);
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 643ebf2e2ad8..d3d859035af9 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -68,6 +68,17 @@ struct x86_init_oem {
68}; 68};
69 69
70/** 70/**
71 * struct x86_init_mapping - platform specific initial kernel pagetable setup
72 * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage
73 *
74 * For more details on the purpose of this hook, look in
75 * init_memory_mapping and the commit that added it.
76 */
77struct x86_init_mapping {
78 void (*pagetable_reserve)(u64 start, u64 end);
79};
80
81/**
71 * struct x86_init_paging - platform specific paging functions 82 * struct x86_init_paging - platform specific paging functions
72 * @pagetable_setup_start: platform specific pre paging_init() call 83 * @pagetable_setup_start: platform specific pre paging_init() call
73 * @pagetable_setup_done: platform specific post paging_init() call 84 * @pagetable_setup_done: platform specific post paging_init() call
@@ -123,6 +134,7 @@ struct x86_init_ops {
123 struct x86_init_mpparse mpparse; 134 struct x86_init_mpparse mpparse;
124 struct x86_init_irqs irqs; 135 struct x86_init_irqs irqs;
125 struct x86_init_oem oem; 136 struct x86_init_oem oem;
137 struct x86_init_mapping mapping;
126 struct x86_init_paging paging; 138 struct x86_init_paging paging;
127 struct x86_init_timers timers; 139 struct x86_init_timers timers;
128 struct x86_init_iommu iommu; 140 struct x86_init_iommu iommu;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index c11514e9128b..75ef4b18e9b7 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -61,6 +61,10 @@ struct x86_init_ops x86_init __initdata = {
61 .banner = default_banner, 61 .banner = default_banner,
62 }, 62 },
63 63
64 .mapping = {
65 .pagetable_reserve = native_pagetable_reserve,
66 },
67
64 .paging = { 68 .paging = {
65 .pagetable_setup_start = native_pagetable_setup_start, 69 .pagetable_setup_start = native_pagetable_setup_start,
66 .pagetable_setup_done = native_pagetable_setup_done, 70 .pagetable_setup_done = native_pagetable_setup_done,
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 286d289b039b..37b8b0fe8320 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -81,6 +81,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
81 end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT); 81 end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
82} 82}
83 83
84void __init native_pagetable_reserve(u64 start, u64 end)
85{
86 memblock_x86_reserve_range(start, end, "PGTABLE");
87}
88
84struct map_range { 89struct map_range {
85 unsigned long start; 90 unsigned long start;
86 unsigned long end; 91 unsigned long end;
@@ -272,9 +277,24 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
272 277
273 __flush_tlb_all(); 278 __flush_tlb_all();
274 279
280 /*
281 * Reserve the kernel pagetable pages we used (pgt_buf_start -
282 * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
283 * so that they can be reused for other purposes.
284 *
285 * On native it just means calling memblock_x86_reserve_range, on Xen it
286 * also means marking RW the pagetable pages that we allocated before
287 * but that haven't been used.
288 *
289 * In fact on xen we mark RO the whole range pgt_buf_start -
290 * pgt_buf_top, because we have to make sure that when
291 * init_memory_mapping reaches the pagetable pages area, it maps
292 * RO all the pagetable pages, including the ones that are beyond
293 * pgt_buf_end at that time.
294 */
275 if (!after_bootmem && pgt_buf_end > pgt_buf_start) 295 if (!after_bootmem && pgt_buf_end > pgt_buf_start)
276 memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT, 296 x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
277 pgt_buf_end << PAGE_SHIFT, "PGTABLE"); 297 PFN_PHYS(pgt_buf_end));
278 298
279 if (!after_bootmem) 299 if (!after_bootmem)
280 early_memtest(start, end); 300 early_memtest(start, end);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 55c965b38c27..0684f3c74d53 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1275,6 +1275,20 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
1275{ 1275{
1276} 1276}
1277 1277
1278static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
1279{
1280 /* reserve the range used */
1281 native_pagetable_reserve(start, end);
1282
1283 /* set as RW the rest */
1284 printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
1285 PFN_PHYS(pgt_buf_top));
1286 while (end < PFN_PHYS(pgt_buf_top)) {
1287 make_lowmem_page_readwrite(__va(end));
1288 end += PAGE_SIZE;
1289 }
1290}
1291
1278static void xen_post_allocator_init(void); 1292static void xen_post_allocator_init(void);
1279 1293
1280static __init void xen_pagetable_setup_done(pgd_t *base) 1294static __init void xen_pagetable_setup_done(pgd_t *base)
@@ -1463,119 +1477,6 @@ static int xen_pgd_alloc(struct mm_struct *mm)
1463 return ret; 1477 return ret;
1464} 1478}
1465 1479
1466#ifdef CONFIG_X86_64
1467static __initdata u64 __last_pgt_set_rw = 0;
1468static __initdata u64 __pgt_buf_start = 0;
1469static __initdata u64 __pgt_buf_end = 0;
1470static __initdata u64 __pgt_buf_top = 0;
1471/*
1472 * As a consequence of the commit:
1473 *
1474 * commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e
1475 * Author: Yinghai Lu <yinghai@kernel.org>
1476 * Date: Fri Dec 17 16:58:28 2010 -0800
1477 *
1478 * x86-64, mm: Put early page table high
1479 *
1480 * at some point init_memory_mapping is going to reach the pagetable pages
1481 * area and map those pages too (mapping them as normal memory that falls
1482 * in the range of addresses passed to init_memory_mapping as argument).
1483 * Some of those pages are already pagetable pages (they are in the range
1484 * pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and
1485 * everything is fine.
1486 * Some of these pages are not pagetable pages yet (they fall in the range
1487 * pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they
1488 * are going to be mapped RW. When these pages become pagetable pages and
1489 * are hooked into the pagetable, xen will find that the guest has already
1490 * a RW mapping of them somewhere and fail the operation.
1491 * The reason Xen requires pagetables to be RO is that the hypervisor needs
1492 * to verify that the pagetables are valid before using them. The validation
1493 * operations are called "pinning".
1494 *
1495 * In order to fix the issue we mark all the pages in the entire range
1496 * pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation
1497 * is completed only the range pgt_buf_start-pgt_buf_end is reserved by
1498 * init_memory_mapping. Hence the kernel is going to crash as soon as one
1499 * of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those
1500 * ranges are RO).
1501 *
1502 * For this reason, 'mark_rw_past_pgt' is introduced which is called _after_
1503 * the init_memory_mapping has completed (in a perfect world we would
1504 * call this function from init_memory_mapping, but lets ignore that).
1505 *
1506 * Because we are called _after_ init_memory_mapping the pgt_buf_[start,
1507 * end,top] have all changed to new values (b/c init_memory_mapping
1508 * is called and setting up another new page-table). Hence, the first time
1509 * we enter this function, we save away the pgt_buf_start value and update
1510 * the pgt_buf_[end,top].
1511 *
1512 * When we detect that the "old" pgt_buf_start through pgt_buf_end
1513 * PFNs have been reserved (so memblock_x86_reserve_range has been called),
1514 * we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top.
1515 *
1516 * And then we update those "old" pgt_buf_[end|top] with the new ones
1517 * so that we can redo this on the next pagetable.
1518 */
1519static __init void mark_rw_past_pgt(void) {
1520
1521 if (pgt_buf_end > pgt_buf_start) {
1522 u64 addr, size;
1523
1524 /* Save it away. */
1525 if (!__pgt_buf_start) {
1526 __pgt_buf_start = pgt_buf_start;
1527 __pgt_buf_end = pgt_buf_end;
1528 __pgt_buf_top = pgt_buf_top;
1529 return;
1530 }
1531 /* If we get the range that starts at __pgt_buf_end that means
1532 * the range is reserved, and that in 'init_memory_mapping'
1533 * the 'memblock_x86_reserve_range' has been called with the
1534 * outdated __pgt_buf_start, __pgt_buf_end (the "new"
1535 * pgt_buf_[start|end|top] refer now to a new pagetable.
1536 * Note: we are called _after_ the pgt_buf_[..] have been
1537 * updated.*/
1538
1539 addr = memblock_x86_find_in_range_size(PFN_PHYS(__pgt_buf_start),
1540 &size, PAGE_SIZE);
1541
1542 /* Still not reserved, meaning 'memblock_x86_reserve_range'
1543 * hasn't been called yet. Update the _end and _top.*/
1544 if (addr == PFN_PHYS(__pgt_buf_start)) {
1545 __pgt_buf_end = pgt_buf_end;
1546 __pgt_buf_top = pgt_buf_top;
1547 return;
1548 }
1549
1550 /* OK, the area is reserved, meaning it is time for us to
1551 * set RW for the old end->top PFNs. */
1552
1553 /* ..unless we had already done this. */
1554 if (__pgt_buf_end == __last_pgt_set_rw)
1555 return;
1556
1557 addr = PFN_PHYS(__pgt_buf_end);
1558
1559 /* set as RW the rest */
1560 printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n",
1561 PFN_PHYS(__pgt_buf_end), PFN_PHYS(__pgt_buf_top));
1562
1563 while (addr < PFN_PHYS(__pgt_buf_top)) {
1564 make_lowmem_page_readwrite(__va(addr));
1565 addr += PAGE_SIZE;
1566 }
1567 /* And update everything so that we are ready for the next
1568 * pagetable (the one created for regions past 4GB) */
1569 __last_pgt_set_rw = __pgt_buf_end;
1570 __pgt_buf_start = pgt_buf_start;
1571 __pgt_buf_end = pgt_buf_end;
1572 __pgt_buf_top = pgt_buf_top;
1573 }
1574 return;
1575}
1576#else
1577static __init void mark_rw_past_pgt(void) { }
1578#endif
1579static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) 1480static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1580{ 1481{
1581#ifdef CONFIG_X86_64 1482#ifdef CONFIG_X86_64
@@ -1602,14 +1503,6 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1602 unsigned long pfn = pte_pfn(pte); 1503 unsigned long pfn = pte_pfn(pte);
1603 1504
1604 /* 1505 /*
1605 * A bit of optimization. We do not need to call the workaround
1606 * when xen_set_pte_init is called with a PTE with 0 as PFN.
1607 * That is b/c the pagetable at that point are just being populated
1608 * with empty values and we can save some cycles by not calling
1609 * the 'memblock' code.*/
1610 if (pfn)
1611 mark_rw_past_pgt();
1612 /*
1613 * If the new pfn is within the range of the newly allocated 1506 * If the new pfn is within the range of the newly allocated
1614 * kernel pagetable, and it isn't being mapped into an 1507 * kernel pagetable, and it isn't being mapped into an
1615 * early_ioremap fixmap slot as a freshly allocated page, make sure 1508 * early_ioremap fixmap slot as a freshly allocated page, make sure
@@ -2118,8 +2011,6 @@ __init void xen_ident_map_ISA(void)
2118 2011
2119static __init void xen_post_allocator_init(void) 2012static __init void xen_post_allocator_init(void)
2120{ 2013{
2121 mark_rw_past_pgt();
2122
2123#ifdef CONFIG_XEN_DEBUG 2014#ifdef CONFIG_XEN_DEBUG
2124 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); 2015 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
2125#endif 2016#endif
@@ -2228,6 +2119,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
2228 2119
2229void __init xen_init_mmu_ops(void) 2120void __init xen_init_mmu_ops(void)
2230{ 2121{
2122 x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
2231 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; 2123 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
2232 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; 2124 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2233 pv_mmu_ops = xen_mmu_ops; 2125 pv_mmu_ops = xen_mmu_ops;