author	Thomas Gleixner <tglx@linutronix.de>	2011-05-14 06:06:36 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2011-05-14 06:06:36 -0400
commit	a18f22a968de17b29f2310cdb7ba69163e65ec15 (patch)
tree	a7d56d88fad5e444d7661484109758a2f436129e /arch/x86/xen
parent	a1c57e0fec53defe745e64417eacdbd3618c3e66 (diff)
parent	798778b8653f64b7b2162a70eca10367cff6ce8 (diff)

Merge branch 'consolidate-clksrc-i8253' of master.kernel.org:~rmk/linux-2.6-arm into timers/clocksource

Conflicts:
	arch/ia64/kernel/cyclone.c
	arch/mips/kernel/i8253.c
	arch/x86/kernel/i8253.c

Reason: Resolve conflicts so further cleanups do not conflict further

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'arch/x86/xen')
-rw-r--r--	arch/x86/xen/Kconfig     |  11
-rw-r--r--	arch/x86/xen/enlighten.c |  29
-rw-r--r--	arch/x86/xen/mmu.c       | 244
-rw-r--r--	arch/x86/xen/p2m.c       | 336
-rw-r--r--	arch/x86/xen/setup.c     |  70
-rw-r--r--	arch/x86/xen/smp.c       |  38
-rw-r--r--	arch/x86/xen/suspend.c   |   8
-rw-r--r--	arch/x86/xen/time.c      |   4
-rw-r--r--	arch/x86/xen/xen-head.S  |   4
-rw-r--r--	arch/x86/xen/xen-ops.h   |   2
10 files changed, 683 insertions(+), 63 deletions(-)
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 5b54892e4bc3..5cc821cb2e09 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -38,7 +38,8 @@ config XEN_MAX_DOMAIN_MEMORY
 
 config XEN_SAVE_RESTORE
 	bool
-	depends on XEN && PM
+	depends on XEN
+	select HIBERNATE_CALLBACKS
 	default y
 
 config XEN_DEBUG_FS
@@ -48,3 +49,11 @@ config XEN_DEBUG_FS
 	help
 	  Enable statistics output and various tuning options in debugfs.
 	  Enabling this option may incur a significant performance overhead.
+
+config XEN_DEBUG
+	bool "Enable Xen debug checks"
+	depends on XEN
+	default n
+	help
+	  Enable various WARN_ON checks in the Xen MMU code.
+	  Enabling this option WILL incur a significant performance overhead.
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 50542efe45fb..e3c6a06cf725 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -238,6 +238,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
 static __init void xen_init_cpuid_mask(void)
 {
 	unsigned int ax, bx, cx, dx;
+	unsigned int xsave_mask;
 
 	cpuid_leaf1_edx_mask =
 		~((1 << X86_FEATURE_MCE)  |  /* disable MCE */
@@ -249,24 +250,16 @@ static __init void xen_init_cpuid_mask(void)
 	cpuid_leaf1_edx_mask &=
 		~((1 << X86_FEATURE_APIC) |  /* disable local APIC */
 		  (1 << X86_FEATURE_ACPI));  /* disable ACPI */
-
 	ax = 1;
-	cx = 0;
 	xen_cpuid(&ax, &bx, &cx, &dx);
 
-	/* cpuid claims we support xsave; try enabling it to see what happens */
-	if (cx & (1 << (X86_FEATURE_XSAVE % 32))) {
-		unsigned long cr4;
-
-		set_in_cr4(X86_CR4_OSXSAVE);
-
-		cr4 = read_cr4();
-
-		if ((cr4 & X86_CR4_OSXSAVE) == 0)
-			cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32));
+	xsave_mask =
+		(1 << (X86_FEATURE_XSAVE % 32)) |
+		(1 << (X86_FEATURE_OSXSAVE % 32));
 
-		clear_in_cr4(X86_CR4_OSXSAVE);
-	}
+	/* Xen will set CR4.OSXSAVE if supported and not disabled by force */
+	if ((cx & xsave_mask) != xsave_mask)
+		cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
 }
 
 static void xen_set_debugreg(int reg, unsigned long val)
@@ -1284,15 +1277,14 @@ static int init_hvm_pv_info(int *major, int *minor)
 
 	xen_setup_features();
 
-	pv_info = xen_info;
-	pv_info.kernel_rpl = 0;
+	pv_info.name = "Xen HVM";
 
 	xen_domain_type = XEN_HVM_DOMAIN;
 
 	return 0;
 }
 
-void xen_hvm_init_shared_info(void)
+void __ref xen_hvm_init_shared_info(void)
 {
 	int cpu;
 	struct xen_add_to_physmap xatp;
@@ -1331,6 +1323,8 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
 	switch (action) {
 	case CPU_UP_PREPARE:
 		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+		if (xen_have_vector_callback)
+			xen_init_lock_cpu(cpu);
 		break;
 	default:
 		break;
@@ -1355,6 +1349,7 @@ static void __init xen_hvm_guest_init(void)
 
 	if (xen_feature(XENFEAT_hvm_callback_vector))
 		xen_have_vector_callback = 1;
+	xen_hvm_smp_init();
 	register_cpu_notifier(&xen_hvm_cpu_notifier);
 	xen_unplug_emulated_devices();
 	have_vcpu_info_placement = 0;
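
Note on the xen_init_cpuid_mask() change above: instead of probing whether CR4.OSXSAVE sticks, the guest now trusts CPUID and hides XSAVE unless both feature bits are reported. A minimal standalone sketch of just that mask logic follows — bit positions 26 and 27 are the architectural CPUID.1:ECX XSAVE/OSXSAVE bits, but the function name and harness are illustrative, not kernel code:

/* Sketch of the XSAVE/OSXSAVE masking; assumes only the architectural
 * CPUID.1:ECX bit positions, not the kernel's cpufeature API. */
#include <stdio.h>

#define XSAVE_BIT   (1u << 26)	/* CPUID.1:ECX.XSAVE */
#define OSXSAVE_BIT (1u << 27)	/* CPUID.1:ECX.OSXSAVE */

static unsigned int mask_xsave(unsigned int leaf1_ecx, unsigned int ecx_mask)
{
	unsigned int xsave_mask = XSAVE_BIT | OSXSAVE_BIT;

	/* Hide both features unless the hypervisor reports both. */
	if ((leaf1_ecx & xsave_mask) != xsave_mask)
		ecx_mask &= ~xsave_mask;
	return ecx_mask;
}

int main(void)
{
	/* XSAVE present but OSXSAVE filtered out: both bits get masked. */
	printf("%08x\n", mask_xsave(XSAVE_BIT, ~0u));	/* prints f3ffffff */
	return 0;
}

Treating the two bits as a pair is the point: guest user space keys off OSXSAVE to decide whether xgetbv is safe to execute, so advertising XSAVE alone would invite faults.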
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 5e92b61ad574..55c965b38c27 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -46,6 +46,7 @@
 #include <linux/module.h>
 #include <linux/gfp.h>
 #include <linux/memblock.h>
+#include <linux/seq_file.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -78,8 +79,7 @@
 
 /*
  * Protects atomic reservation decrease/increase against concurrent increases.
- * Also protects non-atomic updates of current_pages and driver_pages, and
- * balloon lists.
+ * Also protects non-atomic updates of current_pages and balloon lists.
  */
 DEFINE_SPINLOCK(xen_reservation_lock);
 
@@ -416,8 +416,12 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
 	if (val & _PAGE_PRESENT) {
 		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 		pteval_t flags = val & PTE_FLAGS_MASK;
-		unsigned long mfn = pfn_to_mfn(pfn);
+		unsigned long mfn;
 
+		if (!xen_feature(XENFEAT_auto_translated_physmap))
+			mfn = get_phys_to_machine(pfn);
+		else
+			mfn = pfn;
 		/*
 		 * If there's no mfn for the pfn, then just create an
 		 * empty non-present pte.  Unfortunately this loses
@@ -427,8 +431,18 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
 		if (unlikely(mfn == INVALID_P2M_ENTRY)) {
 			mfn = 0;
 			flags = 0;
+		} else {
+			/*
+			 * Paramount to do this test _after_ the
+			 * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
+			 * IDENTITY_FRAME_BIT resolves to true.
+			 */
+			mfn &= ~FOREIGN_FRAME_BIT;
+			if (mfn & IDENTITY_FRAME_BIT) {
+				mfn &= ~IDENTITY_FRAME_BIT;
+				flags |= _PAGE_IOMAP;
+			}
 		}
-
 		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
 	}
 
@@ -532,6 +546,41 @@ pte_t xen_make_pte(pteval_t pte)
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
 
+#ifdef CONFIG_XEN_DEBUG
+pte_t xen_make_pte_debug(pteval_t pte)
+{
+	phys_addr_t addr = (pte & PTE_PFN_MASK);
+	phys_addr_t other_addr;
+	bool io_page = false;
+	pte_t _pte;
+
+	if (pte & _PAGE_IOMAP)
+		io_page = true;
+
+	_pte = xen_make_pte(pte);
+
+	if (!addr)
+		return _pte;
+
+	if (io_page &&
+	    (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
+		other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
+		WARN_ONCE(addr != other_addr,
+			"0x%lx is using VM_IO, but it is 0x%lx!\n",
+			(unsigned long)addr, (unsigned long)other_addr);
+	} else {
+		pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
+		other_addr = (_pte.pte & PTE_PFN_MASK);
+		WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set),
+			"0x%lx is missing VM_IO (and wasn't fixed)!\n",
+			(unsigned long)addr);
+	}
+
+	return _pte;
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
+#endif
+
 pgd_t xen_make_pgd(pgdval_t pgd)
 {
 	pgd = pte_pfn_to_mfn(pgd);
@@ -986,10 +1035,9 @@ static void xen_pgd_pin(struct mm_struct *mm)
  */
 void xen_mm_pin_all(void)
 {
-	unsigned long flags;
 	struct page *page;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 
 	list_for_each_entry(page, &pgd_list, lru) {
 		if (!PagePinned(page)) {
@@ -998,7 +1046,7 @@ void xen_mm_pin_all(void)
 		}
 	}
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 /*
@@ -1099,10 +1147,9 @@ static void xen_pgd_unpin(struct mm_struct *mm)
  */
 void xen_mm_unpin_all(void)
 {
-	unsigned long flags;
 	struct page *page;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 
 	list_for_each_entry(page, &pgd_list, lru) {
 		if (PageSavePinned(page)) {
@@ -1112,7 +1159,7 @@ void xen_mm_unpin_all(void)
 		}
 	}
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
@@ -1416,6 +1463,119 @@ static int xen_pgd_alloc(struct mm_struct *mm)
 	return ret;
 }
 
+#ifdef CONFIG_X86_64
+static __initdata u64 __last_pgt_set_rw = 0;
+static __initdata u64 __pgt_buf_start = 0;
+static __initdata u64 __pgt_buf_end = 0;
+static __initdata u64 __pgt_buf_top = 0;
+/*
+ * As a consequence of the commit:
+ *
+ * commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e
+ * Author: Yinghai Lu <yinghai@kernel.org>
+ * Date:   Fri Dec 17 16:58:28 2010 -0800
+ *
+ *     x86-64, mm: Put early page table high
+ *
+ * at some point init_memory_mapping is going to reach the pagetable pages
+ * area and map those pages too (mapping them as normal memory that falls
+ * in the range of addresses passed to init_memory_mapping as argument).
+ * Some of those pages are already pagetable pages (they are in the range
+ * pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and
+ * everything is fine.
+ * Some of these pages are not pagetable pages yet (they fall in the range
+ * pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they
+ * are going to be mapped RW.  When these pages become pagetable pages and
+ * are hooked into the pagetable, xen will find that the guest has already
+ * a RW mapping of them somewhere and fail the operation.
+ * The reason Xen requires pagetables to be RO is that the hypervisor needs
+ * to verify that the pagetables are valid before using them. The validation
+ * operations are called "pinning".
+ *
+ * In order to fix the issue we mark all the pages in the entire range
+ * pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation
+ * is completed only the range pgt_buf_start-pgt_buf_end is reserved by
+ * init_memory_mapping. Hence the kernel is going to crash as soon as one
+ * of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those
+ * ranges are RO).
+ *
+ * For this reason, 'mark_rw_past_pgt' is introduced which is called _after_
+ * the init_memory_mapping has completed (in a perfect world we would
+ * call this function from init_memory_mapping, but lets ignore that).
+ *
+ * Because we are called _after_ init_memory_mapping the pgt_buf_[start,
+ * end,top] have all changed to new values (b/c init_memory_mapping
+ * is called and setting up another new page-table).  Hence, the first time
+ * we enter this function, we save away the pgt_buf_start value and update
+ * the pgt_buf_[end,top].
+ *
+ * When we detect that the "old" pgt_buf_start through pgt_buf_end
+ * PFNs have been reserved (so memblock_x86_reserve_range has been called),
+ * we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top.
+ *
+ * And then we update those "old" pgt_buf_[end|top] with the new ones
+ * so that we can redo this on the next pagetable.
+ */
+static __init void mark_rw_past_pgt(void) {
+
+	if (pgt_buf_end > pgt_buf_start) {
+		u64 addr, size;
+
+		/* Save it away. */
+		if (!__pgt_buf_start) {
+			__pgt_buf_start = pgt_buf_start;
+			__pgt_buf_end = pgt_buf_end;
+			__pgt_buf_top = pgt_buf_top;
+			return;
+		}
+		/* If we get the range that starts at __pgt_buf_end that means
+		 * the range is reserved, and that in 'init_memory_mapping'
+		 * the 'memblock_x86_reserve_range' has been called with the
+		 * outdated __pgt_buf_start, __pgt_buf_end (the "new"
+		 * pgt_buf_[start|end|top] refer now to a new pagetable.
+		 * Note: we are called _after_ the pgt_buf_[..] have been
+		 * updated.*/
+
+		addr = memblock_x86_find_in_range_size(PFN_PHYS(__pgt_buf_start),
+						       &size, PAGE_SIZE);
+
+		/* Still not reserved, meaning 'memblock_x86_reserve_range'
+		 * hasn't been called yet. Update the _end and _top.*/
+		if (addr == PFN_PHYS(__pgt_buf_start)) {
+			__pgt_buf_end = pgt_buf_end;
+			__pgt_buf_top = pgt_buf_top;
+			return;
+		}
+
+		/* OK, the area is reserved, meaning it is time for us to
+		 * set RW for the old end->top PFNs. */
+
+		/* ..unless we had already done this. */
+		if (__pgt_buf_end == __last_pgt_set_rw)
+			return;
+
+		addr = PFN_PHYS(__pgt_buf_end);
+
+		/* set as RW the rest */
+		printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n",
+			PFN_PHYS(__pgt_buf_end), PFN_PHYS(__pgt_buf_top));
+
+		while (addr < PFN_PHYS(__pgt_buf_top)) {
+			make_lowmem_page_readwrite(__va(addr));
+			addr += PAGE_SIZE;
+		}
+		/* And update everything so that we are ready for the next
+		 * pagetable (the one created for regions past 4GB) */
+		__last_pgt_set_rw = __pgt_buf_end;
+		__pgt_buf_start = pgt_buf_start;
+		__pgt_buf_end = pgt_buf_end;
+		__pgt_buf_top = pgt_buf_top;
+	}
+	return;
+}
+#else
+static __init void mark_rw_past_pgt(void) { }
+#endif
 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 #ifdef CONFIG_X86_64
@@ -1426,28 +1586,43 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
 #endif
 }
 
+#ifdef CONFIG_X86_32
 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
 {
-	unsigned long pfn = pte_pfn(pte);
-
-#ifdef CONFIG_X86_32
 	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
 	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
 		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
 			       pte_val_ma(pte));
-#endif
+
+	return pte;
+}
+#else /* CONFIG_X86_64 */
+static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
+{
+	unsigned long pfn = pte_pfn(pte);
 
 	/*
+	 * A bit of optimization. We do not need to call the workaround
+	 * when xen_set_pte_init is called with a PTE with 0 as PFN.
+	 * That is b/c the pagetable at that point are just being populated
+	 * with empty values and we can save some cycles by not calling
+	 * the 'memblock' code.*/
+	if (pfn)
+		mark_rw_past_pgt();
+	/*
 	 * If the new pfn is within the range of the newly allocated
 	 * kernel pagetable, and it isn't being mapped into an
-	 * early_ioremap fixmap slot, make sure it is RO.
+	 * early_ioremap fixmap slot as a freshly allocated page, make sure
+	 * it is RO.
 	 */
-	if (!is_early_ioremap_ptep(ptep) &&
-	    pfn >= e820_table_start && pfn < e820_table_end)
+	if (((!is_early_ioremap_ptep(ptep) &&
+	    pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
+	    (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
 		pte = pte_wrprotect(pte);
 
 	return pte;
 }
+#endif /* CONFIG_X86_64 */
 
 /* Init-time set_pte while constructing initial pagetables, which
    doesn't allow RO pagetable pages to be remapped RW */
@@ -1653,9 +1828,6 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
 	for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
 		pte_t pte;
 
-		if (pfn > max_pfn_mapped)
-			max_pfn_mapped = pfn;
-
 		if (!pte_none(pte_page[pteidx]))
 			continue;
 
@@ -1697,7 +1869,7 @@ static void convert_pfn_mfn(void *v)
 }
 
 /*
- * Set up the inital kernel pagetable.
+ * Set up the initial kernel pagetable.
  *
  * We can construct this by grafting the Xen provided pagetable into
  * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
@@ -1713,6 +1885,12 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
 	pud_t *l3;
 	pmd_t *l2;
 
+	/* max_pfn_mapped is the last pfn mapped in the initial memory
+	 * mappings. Considering that on Xen after the kernel mappings we
+	 * have the mappings of some pages that don't exist in pfn space, we
+	 * set max_pfn_mapped to the last real pfn mapped. */
+	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
+
 	/* Zap identity mapping */
 	init_level4_pgt[0] = __pgd(0);
 
@@ -1817,9 +1995,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
 	initial_kernel_pmd =
 		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
 
-	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
-				  xen_start_info->nr_pt_frames * PAGE_SIZE +
-				  512*1024);
+	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
 
 	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
 	memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
@@ -1942,6 +2118,11 @@ __init void xen_ident_map_ISA(void)
 
 static __init void xen_post_allocator_init(void)
 {
+	mark_rw_past_pgt();
+
+#ifdef CONFIG_XEN_DEBUG
+	pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
+#endif
 	pv_mmu_ops.set_pte = xen_set_pte;
 	pv_mmu_ops.set_pmd = xen_set_pmd;
 	pv_mmu_ops.set_pud = xen_set_pud;
@@ -2074,7 +2255,7 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
 		in_frames[i] = virt_to_mfn(vaddr);
 
 		MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
-		set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
+		__set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
 
 		if (out_frames)
 			out_frames[i] = virt_to_pfn(vaddr);
@@ -2353,6 +2534,18 @@ EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
 
 #ifdef CONFIG_XEN_DEBUG_FS
 
+static int p2m_dump_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, p2m_dump_show, NULL);
+}
+
+static const struct file_operations p2m_dump_fops = {
+	.open		= p2m_dump_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 static struct dentry *d_mmu_debug;
 
 static int __init xen_mmu_debugfs(void)
@@ -2408,6 +2601,7 @@ static int __init xen_mmu_debugfs(void)
 	debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
 			   &mmu_stats.prot_commit_batched);
 
+	debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
 	return 0;
 }
 fs_initcall(xen_mmu_debugfs);
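
Worth a closer look is the IDENTITY_FRAME_BIT decode that pte_pfn_to_mfn() grows above. A standalone sketch of the bit handling — the FOREIGN/IDENTITY layout in the top two bits of an unsigned long follows this series' asm/xen/page.h, while the helper name, harness and the _PAGE_IOMAP value are illustrative:

/* Sketch of the identity-frame decode; assumptions noted above. */
#include <stdio.h>

#define BITS_PER_LONG		64
#define FOREIGN_FRAME_BIT	(1UL << (BITS_PER_LONG - 1))
#define IDENTITY_FRAME_BIT	(1UL << (BITS_PER_LONG - 2))
#define INVALID_P2M_ENTRY	(~0UL)
#define _PAGE_IOMAP		(1UL << 10)	/* illustrative flag value */

static unsigned long decode_mfn(unsigned long mfn, unsigned long *flags)
{
	if (mfn == INVALID_P2M_ENTRY)
		return 0;	/* caller builds an empty non-present pte */
	/*
	 * Test order matters: INVALID_P2M_ENTRY has the identity bit set
	 * too, so it must be filtered out before the check below.
	 */
	mfn &= ~FOREIGN_FRAME_BIT;
	if (mfn & IDENTITY_FRAME_BIT) {
		mfn &= ~IDENTITY_FRAME_BIT;
		*flags |= _PAGE_IOMAP;	/* 1-1 frames are I/O space */
	}
	return mfn;
}

int main(void)
{
	unsigned long flags = 0;
	/* An identity-mapped PCI hole frame: pfn 0xc0000 comes back as-is. */
	printf("mfn=%lx\n", decode_mfn(0xc0000 | IDENTITY_FRAME_BIT, &flags));
	printf("flags has IOMAP: %d\n", !!(flags & _PAGE_IOMAP));
	return 0;
}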
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index fd12d7ce7ff9..141eb0de8b06 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -23,6 +23,129 @@
  * P2M_PER_PAGE depends on the architecture, as a mfn is always
  * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
  * 512 and 1024 entries respectively.
+ *
+ * In short, these structures contain the Machine Frame Number (MFN) of the PFN.
+ *
+ * However not all entries are filled with MFNs. Specifically for all other
+ * leaf entries, or for the top root, or middle one, for which there is a void
+ * entry, we assume it is "missing". So (for example)
+ *  pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY.
+ *
+ * We also have the possibility of setting 1-1 mappings on certain regions, so
+ * that:
+ *  pfn_to_mfn(0xc0000)=0xc0000
+ *
+ * The benefit of this is, that we can assume for non-RAM regions (think
+ * PCI BARs, or ACPI spaces), we can create mappings easily b/c we
+ * get the PFN value to match the MFN.
+ *
+ * For this to work efficiently we have one new page p2m_identity and
+ * allocate (via reserved_brk) any other pages we need to cover the sides
+ * (1GB or 4MB boundary violations). All entries in p2m_identity are set to
+ * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs,
+ * no other fancy value).
+ *
+ * On lookup we spot that the entry points to p2m_identity and return the
+ * identity value instead of dereferencing and returning INVALID_P2M_ENTRY.
+ * If the entry points to an allocated page, we just proceed as before and
+ * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in
+ * appropriate functions (pfn_to_mfn).
+ *
+ * The reason for having the IDENTITY_FRAME_BIT instead of just returning the
+ * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a
+ * non-identity pfn. To protect ourselves against we elect to set (and get) the
+ * IDENTITY_FRAME_BIT on all identity mapped PFNs.
+ *
+ * This simplistic diagram is used to explain the more subtle piece of code.
+ * There is also a digram of the P2M at the end that can help.
+ * Imagine your E820 looking as so:
+ *
+ *                    1GB                                           2GB
+ * /-------------------+---------\/----\         /----------\    /---+-----\
+ * | System RAM        | Sys RAM ||ACPI|         | reserved |    | Sys RAM |
+ * \-------------------+---------/\----/         \----------/    \---+-----/
+ *                               ^- 1029MB                       ^- 2001MB
+ *
+ * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100),
+ *  2048MB = 524288 (0x80000)]
+ *
+ * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB
+ * is actually not present (would have to kick the balloon driver to put it in).
+ *
+ * When we are told to set the PFNs for identity mapping (see patch: "xen/setup:
+ * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start
+ * of the PFN and the end PFN (263424 and 512256 respectively). The first step
+ * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page
+ * covers 512^2 of page estate (1GB) and in case the start or end PFN is not
+ * aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn
+ * to end pfn. We reserve_brk top leaf pages if they are missing (means they
+ * point to p2m_mid_missing).
+ *
+ * With the E820 example above, 263424 is not 1GB aligned so we allocate a
+ * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000.
+ * Each entry in the allocate page is "missing" (points to p2m_missing).
+ *
+ * Next stage is to determine if we need to do a more granular boundary check
+ * on the 4MB (or 2MB depending on architecture) off the start and end pfn's.
+ * We check if the start pfn and end pfn violate that boundary check, and if
+ * so reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer
+ * granularity of setting which PFNs are missing and which ones are identity.
+ * In our example 263424 and 512256 both fail the check so we reserve_brk two
+ * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing"
+ * values) and assign them to p2m[1][2] and p2m[1][488] respectively.
+ *
+ * At this point we would at minimum reserve_brk one page, but could be up to
+ * three. Each call to set_phys_range_identity has at maximum a three page
+ * cost. If we were to query the P2M at this stage, all those entries from
+ * start PFN through end PFN (so 1029MB -> 2001MB) would return
+ * INVALID_P2M_ENTRY ("missing").
+ *
+ * The next step is to walk from the start pfn to the end pfn setting
+ * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity.
+ * If we find that the middle leaf is pointing to p2m_missing we can swap it
+ * over to p2m_identity - this way covering 4MB (or 2MB) PFN space. At this
+ * point we do not need to worry about boundary aligment (so no need to
+ * reserve_brk a middle page, figure out which PFNs are "missing" and which
+ * ones are identity), as that has been done earlier. If we find that the
+ * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference
+ * that page (which covers 512 PFNs) and set the appropriate PFN with
+ * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we
+ * set from p2m[1][2][256->511] and p2m[1][488][0->256] with
+ * IDENTITY_FRAME_BIT set.
+ *
+ * All other regions that are void (or not filled) either point to p2m_missing
+ * (considered missing) or have the default value of INVALID_P2M_ENTRY (also
+ * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511]
+ * contain the INVALID_P2M_ENTRY value and are considered "missing."
+ *
+ * This is what the p2m ends up looking (for the E820 above) with this
+ * fabulous drawing:
+ *
+ *    p2m         /--------------\
+ *  /-----\       | &mfn_list[0],|                           /-----------------\
+ *  |  0  |------>| &mfn_list[1],|    /---------------\      | ~0, ~0, ..      |
+ *  |-----|       |  ..., ~0, ~0 |    | ~0, ~0, [x]---+----->| IDENTITY [@256] |
+ *  |  1  |---\   \--------------/    | [p2m_identity]+\     | IDENTITY [@257] |
+ *  |-----|    \                      | [p2m_identity]+\\    | ....            |
+ *  |  2  |--\  \-------------------->|  ...          | \\   \----------------/
+ *  |-----|   \                       \---------------/  \\
+ *  |  3  |\   \                                          \\  p2m_identity
+ *  |-----| \   \-------------------->/---------------\   /-----------------\
+ *  | ..  +->+                        | [p2m_identity]+-->| ~0, ~0, ~0, ... |
+ *  \-----/ /                         | [p2m_identity]+-->| ..., ~0         |
+ *         /                          /---------------\   | ....            | \-----------------/
+ *        /                           | IDENTITY[@0]  |  /-+-[x], ~0, ~0.. |
+ *       /                            | IDENTITY[@256]|<----/ \---------------/
+ *      /                             | ~0, ~0, ....  |
+ *     |                              \---------------/
+ *     |
+ *   p2m_missing             p2m_missing
+ * /------------------\     /------------\
+ * | [p2m_mid_missing]+---->| ~0, ~0, ~0 |
+ * | [p2m_mid_missing]+---->| ..., ~0    |
+ * \------------------/     \------------/
+ *
+ * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT)
  */
 
 #include <linux/init.h>
@@ -30,6 +153,7 @@
 #include <linux/list.h>
 #include <linux/hash.h>
 #include <linux/sched.h>
+#include <linux/seq_file.h>
 
 #include <asm/cache.h>
 #include <asm/setup.h>
@@ -59,9 +183,15 @@ static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
 static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
 static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
 
+static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
+
 RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
 RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
 
+/* We might hit two boundary violations at the start and end, at max each
+ * boundary violation will require three middle nodes. */
+RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3);
+
 static inline unsigned p2m_top_index(unsigned long pfn)
 {
 	BUG_ON(pfn >= MAX_P2M_PFN);
@@ -136,7 +266,7 @@ static void p2m_init(unsigned long *p2m)
  *  - After resume we're called from within stop_machine, but the mfn
  *    tree should alreay be completely allocated.
  */
-void xen_build_mfn_list_list(void)
+void __ref xen_build_mfn_list_list(void)
 {
 	unsigned long pfn;
 
@@ -221,6 +351,9 @@ void __init xen_build_dynamic_phys_to_machine(void)
 	p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
 	p2m_top_init(p2m_top);
 
+	p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
+	p2m_init(p2m_identity);
+
 	/*
 	 * The domain builder gives us a pre-constructed p2m array in
 	 * mfn_list for all the pages initially given to us, so we just
@@ -266,6 +399,14 @@ unsigned long get_phys_to_machine(unsigned long pfn)
 	mididx = p2m_mid_index(pfn);
 	idx = p2m_index(pfn);
 
+	/*
+	 * The INVALID_P2M_ENTRY is filled in both p2m_*identity
+	 * and in p2m_*missing, so returning the INVALID_P2M_ENTRY
+	 * would be wrong.
+	 */
+	if (p2m_top[topidx][mididx] == p2m_identity)
+		return IDENTITY_FRAME(pfn);
+
 	return p2m_top[topidx][mididx][idx];
 }
 EXPORT_SYMBOL_GPL(get_phys_to_machine);
@@ -335,9 +476,11 @@ static bool alloc_p2m(unsigned long pfn)
 		p2m_top_mfn_p[topidx] = mid_mfn;
 	}
 
-	if (p2m_top[topidx][mididx] == p2m_missing) {
+	if (p2m_top[topidx][mididx] == p2m_identity ||
+	    p2m_top[topidx][mididx] == p2m_missing) {
 		/* p2m leaf page is missing */
 		unsigned long *p2m;
+		unsigned long *p2m_orig = p2m_top[topidx][mididx];
 
 		p2m = alloc_p2m_page();
 		if (!p2m)
@@ -345,7 +488,7 @@ static bool alloc_p2m(unsigned long pfn)
 
 		p2m_init(p2m);
 
-		if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
+		if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig)
 			free_p2m_page(p2m);
 		else
 			mid_mfn[mididx] = virt_to_mfn(p2m);
@@ -354,11 +497,91 @@ static bool alloc_p2m(unsigned long pfn)
 	return true;
 }
 
+static bool __init __early_alloc_p2m(unsigned long pfn)
+{
+	unsigned topidx, mididx, idx;
+
+	topidx = p2m_top_index(pfn);
+	mididx = p2m_mid_index(pfn);
+	idx = p2m_index(pfn);
+
+	/* Pfff.. No boundary cross-over, lets get out. */
+	if (!idx)
+		return false;
+
+	WARN(p2m_top[topidx][mididx] == p2m_identity,
+		"P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n",
+		topidx, mididx);
+
+	/*
+	 * Could be done by xen_build_dynamic_phys_to_machine..
+	 */
+	if (p2m_top[topidx][mididx] != p2m_missing)
+		return false;
+
+	/* Boundary cross-over for the edges: */
+	if (idx) {
+		unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
+
+		p2m_init(p2m);
+
+		p2m_top[topidx][mididx] = p2m;
+
+	}
+	return idx != 0;
+}
+unsigned long __init set_phys_range_identity(unsigned long pfn_s,
+				      unsigned long pfn_e)
+{
+	unsigned long pfn;
+
+	if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN))
+		return 0;
+
+	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
+		return pfn_e - pfn_s;
+
+	if (pfn_s > pfn_e)
+		return 0;
+
+	for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1));
+		pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE));
+		pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
+	{
+		unsigned topidx = p2m_top_index(pfn);
+		if (p2m_top[topidx] == p2m_mid_missing) {
+			unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
+
+			p2m_mid_init(mid);
+
+			p2m_top[topidx] = mid;
+		}
+	}
+
+	__early_alloc_p2m(pfn_s);
+	__early_alloc_p2m(pfn_e);
+
+	for (pfn = pfn_s; pfn < pfn_e; pfn++)
+		if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
+			break;
+
+	if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s),
+		"Identity mapping failed. We are %ld short of 1-1 mappings!\n",
+		(pfn_e - pfn_s) - (pfn - pfn_s)))
+		printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn);
+
+	return pfn - pfn_s;
+}
+
 /* Try to install p2m mapping; fail if intermediate bits missing */
 bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 {
 	unsigned topidx, mididx, idx;
 
+	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
+		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+		return true;
+	}
 	if (unlikely(pfn >= MAX_P2M_PFN)) {
 		BUG_ON(mfn != INVALID_P2M_ENTRY);
 		return true;
@@ -368,6 +591,21 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 	mididx = p2m_mid_index(pfn);
 	idx = p2m_index(pfn);
 
+	/* For sparse holes were the p2m leaf has real PFN along with
+	 * PCI holes, stick in the PFN as the MFN value.
+	 */
+	if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) {
+		if (p2m_top[topidx][mididx] == p2m_identity)
+			return true;
+
+		/* Swap over from MISSING to IDENTITY if needed. */
+		if (p2m_top[topidx][mididx] == p2m_missing) {
+			WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing,
+				p2m_identity) != p2m_missing);
+			return true;
+		}
+	}
+
 	if (p2m_top[topidx][mididx] == p2m_missing)
 		return mfn == INVALID_P2M_ENTRY;
 
@@ -378,11 +616,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 
 bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 {
-	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
-		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
-		return true;
-	}
-
 	if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
 		if (!alloc_p2m(pfn))
 			return false;
@@ -421,7 +654,7 @@ int m2p_add_override(unsigned long mfn, struct page *page)
 {
 	unsigned long flags;
 	unsigned long pfn;
-	unsigned long address;
+	unsigned long uninitialized_var(address);
 	unsigned level;
 	pte_t *ptep = NULL;
 
@@ -438,7 +671,9 @@ int m2p_add_override(unsigned long mfn, struct page *page)
 	page->private = mfn;
 	page->index = pfn_to_mfn(pfn);
 
-	__set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
+	if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
+		return -ENOMEM;
+
 	if (!PageHighMem(page))
 		/* Just zap old mapping for now */
 		pte_clear(&init_mm, address, ptep);
@@ -455,7 +690,7 @@ int m2p_remove_override(struct page *page)
 	unsigned long flags;
 	unsigned long mfn;
 	unsigned long pfn;
-	unsigned long address;
+	unsigned long uninitialized_var(address);
 	unsigned level;
 	pte_t *ptep = NULL;
 
@@ -476,7 +711,7 @@ int m2p_remove_override(struct page *page)
 	spin_lock_irqsave(&m2p_override_lock, flags);
 	list_del(&page->lru);
 	spin_unlock_irqrestore(&m2p_override_lock, flags);
-	__set_phys_to_machine(pfn, page->index);
+	set_phys_to_machine(pfn, page->index);
 
 	if (!PageHighMem(page))
 		set_pte_at(&init_mm, address, ptep,
@@ -520,3 +755,80 @@ unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
 	return ret;
 }
 EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
+
+#ifdef CONFIG_XEN_DEBUG_FS
+
+int p2m_dump_show(struct seq_file *m, void *v)
+{
+	static const char * const level_name[] = { "top", "middle",
+						"entry", "abnormal" };
+	static const char * const type_name[] = { "identity", "missing",
+						"pfn", "abnormal"};
+#define TYPE_IDENTITY 0
+#define TYPE_MISSING 1
+#define TYPE_PFN 2
+#define TYPE_UNKNOWN 3
+	unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0;
+	unsigned int uninitialized_var(prev_level);
+	unsigned int uninitialized_var(prev_type);
+
+	if (!p2m_top)
+		return 0;
+
+	for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) {
+		unsigned topidx = p2m_top_index(pfn);
+		unsigned mididx = p2m_mid_index(pfn);
+		unsigned idx = p2m_index(pfn);
+		unsigned lvl, type;
+
+		lvl = 4;
+		type = TYPE_UNKNOWN;
+		if (p2m_top[topidx] == p2m_mid_missing) {
+			lvl = 0; type = TYPE_MISSING;
+		} else if (p2m_top[topidx] == NULL) {
+			lvl = 0; type = TYPE_UNKNOWN;
+		} else if (p2m_top[topidx][mididx] == NULL) {
+			lvl = 1; type = TYPE_UNKNOWN;
+		} else if (p2m_top[topidx][mididx] == p2m_identity) {
+			lvl = 1; type = TYPE_IDENTITY;
+		} else if (p2m_top[topidx][mididx] == p2m_missing) {
+			lvl = 1; type = TYPE_MISSING;
+		} else if (p2m_top[topidx][mididx][idx] == 0) {
+			lvl = 2; type = TYPE_UNKNOWN;
+		} else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) {
+			lvl = 2; type = TYPE_IDENTITY;
+		} else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) {
+			lvl = 2; type = TYPE_MISSING;
+		} else if (p2m_top[topidx][mididx][idx] == pfn) {
+			lvl = 2; type = TYPE_PFN;
+		} else if (p2m_top[topidx][mididx][idx] != pfn) {
+			lvl = 2; type = TYPE_PFN;
+		}
+		if (pfn == 0) {
+			prev_level = lvl;
+			prev_type = type;
+		}
+		if (pfn == MAX_DOMAIN_PAGES-1) {
+			lvl = 3;
+			type = TYPE_UNKNOWN;
+		}
+		if (prev_type != type) {
+			seq_printf(m, " [0x%lx->0x%lx] %s\n",
+				prev_pfn_type, pfn, type_name[prev_type]);
+			prev_pfn_type = pfn;
+			prev_type = type;
+		}
+		if (prev_level != lvl) {
+			seq_printf(m, " [0x%lx->0x%lx] level %s\n",
+				prev_pfn_level, pfn, level_name[prev_level]);
+			prev_pfn_level = pfn;
+			prev_level = lvl;
+		}
+	}
+	return 0;
+#undef TYPE_IDENTITY
+#undef TYPE_MISSING
+#undef TYPE_PFN
+#undef TYPE_UNKNOWN
+}
+#endif
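
The p2m_identity trick that the long comment block above describes boils down to a pointer comparison in the lookup path. A scaled-down, standalone sketch (one tree level instead of three, eight entries per page instead of 512; the names mirror the patch but the harness is illustrative):

/* Sketch of the shared-identity-leaf lookup; sizes scaled down. */
#include <stdio.h>

#define P2M_PER_PAGE		8	/* 512 in the real 64-bit code */
#define IDENTITY_FRAME_BIT	(1UL << 62)
#define IDENTITY_FRAME(pfn)	((pfn) | IDENTITY_FRAME_BIT)
#define INVALID_P2M_ENTRY	(~0UL)

static unsigned long p2m_missing[P2M_PER_PAGE];
static unsigned long p2m_identity[P2M_PER_PAGE];
static unsigned long *p2m[P2M_PER_PAGE];	/* one level, for brevity */

static unsigned long get_phys_to_machine(unsigned long pfn)
{
	unsigned topidx = pfn / P2M_PER_PAGE;
	/*
	 * Both sentinel pages hold INVALID_P2M_ENTRY, so identity ranges
	 * must be recognized by pointer, not by the stored value.
	 */
	if (p2m[topidx] == p2m_identity)
		return IDENTITY_FRAME(pfn);
	return p2m[topidx][pfn % P2M_PER_PAGE];
}

int main(void)
{
	unsigned i;

	for (i = 0; i < P2M_PER_PAGE; i++)
		p2m_missing[i] = p2m_identity[i] = INVALID_P2M_ENTRY;
	for (i = 0; i < P2M_PER_PAGE; i++)
		p2m[i] = p2m_missing;

	p2m[1] = p2m_identity;		/* mark pfns 8..15 as 1-1 mapped */
	printf("%lx\n", get_phys_to_machine(9));   /* 9 | IDENTITY_FRAME_BIT */
	printf("%lx\n", get_phys_to_machine(3));   /* INVALID_P2M_ENTRY */
	return 0;
}

Sharing one leaf page for every identity-mapped region is what keeps the reserve_brk cost of set_phys_range_identity() bounded at a few pages per call.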
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index a8a66a50d446..90bac0aac3a5 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -52,6 +52,8 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
 
 static __init void xen_add_extra_mem(unsigned long pages)
 {
+	unsigned long pfn;
+
 	u64 size = (u64)pages * PAGE_SIZE;
 	u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
 
@@ -66,6 +68,9 @@ static __init void xen_add_extra_mem(unsigned long pages)
 	xen_extra_mem_size += size;
 
 	xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
+
+	for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++)
+		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 }
 
 static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
@@ -104,7 +109,7 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
 		WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
 		     start, end, ret);
 		if (ret == 1) {
-			set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+			__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 			len++;
 		}
 	}
@@ -138,12 +143,55 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
 	return released;
 }
 
+static unsigned long __init xen_set_identity(const struct e820entry *list,
+					     ssize_t map_size)
+{
+	phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS;
+	phys_addr_t start_pci = last;
+	const struct e820entry *entry;
+	unsigned long identity = 0;
+	int i;
+
+	for (i = 0, entry = list; i < map_size; i++, entry++) {
+		phys_addr_t start = entry->addr;
+		phys_addr_t end = start + entry->size;
+
+		if (start < last)
+			start = last;
+
+		if (end <= start)
+			continue;
+
+		/* Skip over the 1MB region. */
+		if (last > end)
+			continue;
+
+		if (entry->type == E820_RAM) {
+			if (start > start_pci)
+				identity += set_phys_range_identity(
+						PFN_UP(start_pci), PFN_DOWN(start));
+
+			/* Without saving 'last' we would gooble RAM too
+			 * at the end of the loop. */
+			last = end;
+			start_pci = end;
+			continue;
+		}
+		start_pci = min(start, start_pci);
+		last = end;
+	}
+	if (last > start_pci)
+		identity += set_phys_range_identity(
+			PFN_UP(start_pci), PFN_DOWN(last));
+	return identity;
+}
 /**
  * machine_specific_memory_setup - Hook for machine specific memory setup.
  **/
 char * __init xen_memory_setup(void)
 {
 	static struct e820entry map[E820MAX] __initdata;
+	static struct e820entry map_raw[E820MAX] __initdata;
 
 	unsigned long max_pfn = xen_start_info->nr_pages;
 	unsigned long long mem_end;
@@ -151,6 +199,7 @@ char * __init xen_memory_setup(void)
 	struct xen_memory_map memmap;
 	unsigned long extra_pages = 0;
 	unsigned long extra_limit;
+	unsigned long identity_pages = 0;
 	int i;
 	int op;
 
@@ -176,8 +225,9 @@ char * __init xen_memory_setup(void)
 	}
 	BUG_ON(rc);
 
+	memcpy(map_raw, map, sizeof(map));
 	e820.nr_map = 0;
-	xen_extra_mem_start = mem_end;
+	xen_extra_mem_start = max((1ULL << 32), mem_end);
 	for (i = 0; i < memmap.nr_entries; i++) {
 		unsigned long long end;
 
@@ -194,6 +244,15 @@ char * __init xen_memory_setup(void)
 		end -= delta;
 
 		extra_pages += PFN_DOWN(delta);
+		/*
+		 * Set RAM below 4GB that is not for us to be unusable.
+		 * This prevents "System RAM" address space from being
+		 * used as potential resource for I/O address (happens
+		 * when 'allocate_resource' is called).
+		 */
+		if (delta &&
+			(xen_initial_domain() && end < 0x100000000ULL))
+			e820_add_region(end, delta, E820_UNUSABLE);
 	}
 
 	if (map[i].size > 0 && end > xen_extra_mem_start)
@@ -251,6 +310,13 @@ char * __init xen_memory_setup(void)
 
 	xen_add_extra_mem(extra_pages);
 
+	/*
+	 * Set P2M for all non-RAM pages and E820 gaps to be identity
+	 * type PFNs. We supply it with the non-sanitized version
+	 * of the E820.
+	 */
+	identity_pages = xen_set_identity(map_raw, memmap.nr_entries);
+	printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages);
 	return "Xen";
 }
 
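
The xen_set_identity() walk added above pairs each stretch of non-RAM E820 space (including the gaps between entries) with one set_phys_range_identity() call. A standalone sketch of the same sweep over a toy map — the entry types, PFN macros and low-1MB skip here are simplified stand-ins, not the kernel's definitions:

/* Sketch of the E820 sweep; simplified constants as noted above. */
#include <stdio.h>

#define E820_RAM	1
#define E820_RESERVED	2
#define PAGE_SHIFT	12
#define PFN_UP(x)	(((x) + 4095) >> PAGE_SHIFT)
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

struct e820entry { unsigned long long addr, size; int type; };

static unsigned long long set_phys_range_identity(unsigned long long s,
						  unsigned long long e)
{
	printf("identity: pfn 0x%llx -> 0x%llx\n", s, e);
	return e > s ? e - s : 0;
}

static unsigned long long xen_set_identity(const struct e820entry *list, int n)
{
	unsigned long long last = 0x100000;	/* domU case: skip low 1MB */
	unsigned long long start_pci = last;
	unsigned long long identity = 0;
	int i;

	for (i = 0; i < n; i++) {
		unsigned long long start = list[i].addr;
		unsigned long long end = start + list[i].size;

		if (start < last)
			start = last;
		if (end <= start)
			continue;

		if (list[i].type == E820_RAM) {
			if (start > start_pci)	/* close the gap before RAM */
				identity += set_phys_range_identity(
					PFN_UP(start_pci), PFN_DOWN(start));
			last = end;
			start_pci = end;
			continue;
		}
		start_pci = start_pci < start ? start_pci : start;
		last = end;
	}
	if (last > start_pci)			/* trailing non-RAM region */
		identity += set_phys_range_identity(
			PFN_UP(start_pci), PFN_DOWN(last));
	return identity;
}

int main(void)
{
	const struct e820entry map[] = {
		{ 0x00100000ULL, 0x3ff00000ULL, E820_RAM      },
		{ 0x40000000ULL, 0x00100000ULL, E820_RESERVED },
		{ 0x40100000ULL, 0x3ff00000ULL, E820_RAM      },
	};
	printf("%llu identity pages\n", xen_set_identity(map, 3));
	return 0;
}

On the toy map the reserved megabyte at 1GB yields one call covering 256 PFNs; feeding in the raw (non-sanitized) map, as xen_memory_setup() does with map_raw, is what lets the gaps show up at all.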
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 72a4c7959045..30612441ed99 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -509,3 +509,41 @@ void __init xen_smp_init(void)
 	xen_fill_possible_map();
 	xen_init_spinlocks();
 }
+
+static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
+{
+	native_smp_prepare_cpus(max_cpus);
+	WARN_ON(xen_smp_intr_init(0));
+
+	if (!xen_have_vector_callback)
+		return;
+	xen_init_lock_cpu(0);
+	xen_init_spinlocks();
+}
+
+static int __cpuinit xen_hvm_cpu_up(unsigned int cpu)
+{
+	int rc;
+	rc = native_cpu_up(cpu);
+	WARN_ON (xen_smp_intr_init(cpu));
+	return rc;
+}
+
+static void xen_hvm_cpu_die(unsigned int cpu)
+{
+	unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
+	native_cpu_die(cpu);
+}
+
+void __init xen_hvm_smp_init(void)
+{
+	smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
+	smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
+	smp_ops.cpu_up = xen_hvm_cpu_up;
+	smp_ops.cpu_die = xen_hvm_cpu_die;
+	smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
+	smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
+}
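
The xen_hvm_smp_init() ops table above is a wrap-don't-replace pattern: CPUs still come up through the native path, with Xen IPI plumbing layered on top. A standalone sketch of that delegation, using stand-in types (this is not the kernel's smp_ops):

/* Sketch of the HVM smp_ops delegation; stand-in types throughout. */
#include <stdio.h>

struct smp_ops_sketch {
	int (*cpu_up)(unsigned int cpu);
};

static int native_cpu_up(unsigned int cpu)
{
	printf("native bringup of cpu %u\n", cpu);
	return 0;
}

static int xen_smp_intr_init(unsigned int cpu)
{
	printf("bind xen IPIs for cpu %u\n", cpu);
	return 0;
}

/* Wrap, don't replace: HVM guests still boot CPUs the native way. */
static int xen_hvm_cpu_up(unsigned int cpu)
{
	int rc = native_cpu_up(cpu);

	xen_smp_intr_init(cpu);
	return rc;
}

static struct smp_ops_sketch smp_ops = { .cpu_up = native_cpu_up };

int main(void)
{
	smp_ops.cpu_up = xen_hvm_cpu_up;	/* what xen_hvm_smp_init() does */
	return smp_ops.cpu_up(1);
}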
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 9bbd63a129b5..45329c8c226e 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -12,7 +12,7 @@
 #include "xen-ops.h"
 #include "mmu.h"
 
-void xen_pre_suspend(void)
+void xen_arch_pre_suspend(void)
 {
 	xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
 	xen_start_info->console.domU.mfn =
@@ -26,8 +26,9 @@ void xen_pre_suspend(void)
 		BUG();
 }
 
-void xen_hvm_post_suspend(int suspend_cancelled)
+void xen_arch_hvm_post_suspend(int suspend_cancelled)
 {
+#ifdef CONFIG_XEN_PVHVM
 	int cpu;
 	xen_hvm_init_shared_info();
 	xen_callback_vector();
@@ -37,9 +38,10 @@ void xen_hvm_post_suspend(int suspend_cancelled)
 			xen_setup_runstate_info(cpu);
 		}
 	}
+#endif
 }
 
-void xen_post_suspend(int suspend_cancelled)
+void xen_arch_post_suspend(int suspend_cancelled)
 {
 	xen_build_mfn_list_list();
 
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 04e11597a8c5..c532d280ce36 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -393,7 +393,9 @@ void xen_setup_timer(int cpu)
 		name = "<timer kasprintf failed>";
 
 	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
-				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER,
+				      IRQF_DISABLED|IRQF_PERCPU|
+				      IRQF_NOBALANCING|IRQF_TIMER|
+				      IRQF_FORCE_RESUME,
 				      name, NULL);
 
 	evt = &per_cpu(xen_clock_events, cpu);
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 1a5ff24e29c0..aaa7291c9259 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -28,9 +28,9 @@ ENTRY(startup_xen)
 	__FINIT
 
 .pushsection .text
-	.align PAGE_SIZE_asm
+	.align PAGE_SIZE
 ENTRY(hypercall_page)
-	.skip PAGE_SIZE_asm
+	.skip PAGE_SIZE
 .popsection
 
 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,    .asciz "linux")
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9d41bf985757..3112f55638c4 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -64,10 +64,12 @@ void xen_setup_vcpu_info_placement(void);
 
 #ifdef CONFIG_SMP
 void xen_smp_init(void);
+void __init xen_hvm_smp_init(void);
 
 extern cpumask_var_t xen_cpu_initialized_map;
 #else
 static inline void xen_smp_init(void) {}
+static inline void xen_hvm_smp_init(void) {}
 #endif
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS