aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/include/asm/spinlock.h3
-rw-r--r--arch/x86/kernel/alternative.c2
-rw-r--r--arch/x86/kernel/irq.c2
-rw-r--r--arch/x86/kernel/microcode_amd.c7
-rw-r--r--arch/x86/kvm/emulate.c30
-rw-r--r--arch/x86/kvm/mmu.c13
-rw-r--r--arch/x86/kvm/x86.c5
-rw-r--r--arch/x86/xen/enlighten.c118
-rw-r--r--arch/x86/xen/mmu.c2
-rw-r--r--arch/x86/xen/p2m.c95
-rw-r--r--arch/x86/xen/setup.c9
-rw-r--r--arch/x86/xen/suspend.c2
-rw-r--r--arch/x86/xen/xen-ops.h2
13 files changed, 155 insertions, 135 deletions
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index b315a33867f2..33692eaabab5 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -12,8 +12,7 @@
12 * Simple spin lock operations. There are two variants, one clears IRQ's 12 * Simple spin lock operations. There are two variants, one clears IRQ's
13 * on the local processor, one does not. 13 * on the local processor, one does not.
14 * 14 *
15 * These are fair FIFO ticket locks, which are currently limited to 256 15 * These are fair FIFO ticket locks, which support up to 2^16 CPUs.
16 * CPUs.
17 * 16 *
18 * (the type definitions are in asm/spinlock_types.h) 17 * (the type definitions are in asm/spinlock_types.h)
19 */ 18 */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index afb7ff79a29f..ced4534baed5 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -165,7 +165,7 @@ static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
165#endif 165#endif
166 166
167#ifdef P6_NOP1 167#ifdef P6_NOP1
168static const unsigned char __initconst_or_module p6nops[] = 168static const unsigned char p6nops[] =
169{ 169{
170 P6_NOP1, 170 P6_NOP1,
171 P6_NOP2, 171 P6_NOP2,
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 7ad683d78645..d44f7829968e 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -270,7 +270,7 @@ void fixup_irqs(void)
270 270
271 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { 271 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
272 break_affinity = 1; 272 break_affinity = 1;
273 affinity = cpu_all_mask; 273 affinity = cpu_online_mask;
274 } 274 }
275 275
276 chip = irq_data_get_irq_chip(data); 276 chip = irq_data_get_irq_chip(data);
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 8a2ce8fd41c0..82746f942cd8 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -143,11 +143,12 @@ static int get_matching_microcode(int cpu, const u8 *ucode_ptr,
143 unsigned int *current_size) 143 unsigned int *current_size)
144{ 144{
145 struct microcode_header_amd *mc_hdr; 145 struct microcode_header_amd *mc_hdr;
146 unsigned int actual_size; 146 unsigned int actual_size, patch_size;
147 u16 equiv_cpu_id; 147 u16 equiv_cpu_id;
148 148
149 /* size of the current patch we're staring at */ 149 /* size of the current patch we're staring at */
150 *current_size = *(u32 *)(ucode_ptr + 4) + SECTION_HDR_SIZE; 150 patch_size = *(u32 *)(ucode_ptr + 4);
151 *current_size = patch_size + SECTION_HDR_SIZE;
151 152
152 equiv_cpu_id = find_equiv_id(); 153 equiv_cpu_id = find_equiv_id();
153 if (!equiv_cpu_id) 154 if (!equiv_cpu_id)
@@ -174,7 +175,7 @@ static int get_matching_microcode(int cpu, const u8 *ucode_ptr,
174 /* 175 /*
175 * now that the header looks sane, verify its size 176 * now that the header looks sane, verify its size
176 */ 177 */
177 actual_size = verify_ucode_size(cpu, *current_size, leftover_size); 178 actual_size = verify_ucode_size(cpu, patch_size, leftover_size);
178 if (!actual_size) 179 if (!actual_size)
179 return 0; 180 return 0;
180 181
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 97d9a9914ba8..a3b57a27be88 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -475,13 +475,26 @@ register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg)
475 return address_mask(ctxt, reg); 475 return address_mask(ctxt, reg);
476} 476}
477 477
478static void masked_increment(ulong *reg, ulong mask, int inc)
479{
480 assign_masked(reg, *reg + inc, mask);
481}
482
478static inline void 483static inline void
479register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc) 484register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc)
480{ 485{
486 ulong mask;
487
481 if (ctxt->ad_bytes == sizeof(unsigned long)) 488 if (ctxt->ad_bytes == sizeof(unsigned long))
482 *reg += inc; 489 mask = ~0UL;
483 else 490 else
484 *reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt)); 491 mask = ad_mask(ctxt);
492 masked_increment(reg, mask, inc);
493}
494
495static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
496{
497 masked_increment(&ctxt->regs[VCPU_REGS_RSP], stack_mask(ctxt), inc);
485} 498}
486 499
487static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) 500static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
@@ -1522,8 +1535,8 @@ static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
1522{ 1535{
1523 struct segmented_address addr; 1536 struct segmented_address addr;
1524 1537
1525 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes); 1538 rsp_increment(ctxt, -bytes);
1526 addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); 1539 addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt);
1527 addr.seg = VCPU_SREG_SS; 1540 addr.seg = VCPU_SREG_SS;
1528 1541
1529 return segmented_write(ctxt, addr, data, bytes); 1542 return segmented_write(ctxt, addr, data, bytes);
@@ -1542,13 +1555,13 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1542 int rc; 1555 int rc;
1543 struct segmented_address addr; 1556 struct segmented_address addr;
1544 1557
1545 addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); 1558 addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt);
1546 addr.seg = VCPU_SREG_SS; 1559 addr.seg = VCPU_SREG_SS;
1547 rc = segmented_read(ctxt, addr, dest, len); 1560 rc = segmented_read(ctxt, addr, dest, len);
1548 if (rc != X86EMUL_CONTINUE) 1561 if (rc != X86EMUL_CONTINUE)
1549 return rc; 1562 return rc;
1550 1563
1551 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len); 1564 rsp_increment(ctxt, len);
1552 return rc; 1565 return rc;
1553} 1566}
1554 1567
@@ -1688,8 +1701,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
1688 1701
1689 while (reg >= VCPU_REGS_RAX) { 1702 while (reg >= VCPU_REGS_RAX) {
1690 if (reg == VCPU_REGS_RSP) { 1703 if (reg == VCPU_REGS_RSP) {
1691 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], 1704 rsp_increment(ctxt, ctxt->op_bytes);
1692 ctxt->op_bytes);
1693 --reg; 1705 --reg;
1694 } 1706 }
1695 1707
@@ -2825,7 +2837,7 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
2825 rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); 2837 rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
2826 if (rc != X86EMUL_CONTINUE) 2838 if (rc != X86EMUL_CONTINUE)
2827 return rc; 2839 return rc;
2828 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val); 2840 rsp_increment(ctxt, ctxt->src.val);
2829 return X86EMUL_CONTINUE; 2841 return X86EMUL_CONTINUE;
2830} 2842}
2831 2843
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 01ca00423938..7fbd0d273ea8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4113,16 +4113,21 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
4113 LIST_HEAD(invalid_list); 4113 LIST_HEAD(invalid_list);
4114 4114
4115 /* 4115 /*
4116 * Never scan more than sc->nr_to_scan VM instances.
4117 * Will not hit this condition practically since we do not try
4118 * to shrink more than one VM and it is very unlikely to see
4119 * !n_used_mmu_pages so many times.
4120 */
4121 if (!nr_to_scan--)
4122 break;
4123 /*
4116 * n_used_mmu_pages is accessed without holding kvm->mmu_lock 4124 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
4117 * here. We may skip a VM instance errorneosly, but we do not 4125 * here. We may skip a VM instance errorneosly, but we do not
4118 * want to shrink a VM that only started to populate its MMU 4126 * want to shrink a VM that only started to populate its MMU
4119 * anyway. 4127 * anyway.
4120 */ 4128 */
4121 if (kvm->arch.n_used_mmu_pages > 0) { 4129 if (!kvm->arch.n_used_mmu_pages)
4122 if (!nr_to_scan--)
4123 break;
4124 continue; 4130 continue;
4125 }
4126 4131
4127 idx = srcu_read_lock(&kvm->srcu); 4132 idx = srcu_read_lock(&kvm->srcu);
4128 spin_lock(&kvm->mmu_lock); 4133 spin_lock(&kvm->mmu_lock);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 42bce48f6928..148ed666e311 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -806,7 +806,7 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
806 * kvm-specific. Those are put in the beginning of the list. 806 * kvm-specific. Those are put in the beginning of the list.
807 */ 807 */
808 808
809#define KVM_SAVE_MSRS_BEGIN 9 809#define KVM_SAVE_MSRS_BEGIN 10
810static u32 msrs_to_save[] = { 810static u32 msrs_to_save[] = {
811 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 811 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
812 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 812 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
@@ -2000,6 +2000,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2000 case MSR_KVM_STEAL_TIME: 2000 case MSR_KVM_STEAL_TIME:
2001 data = vcpu->arch.st.msr_val; 2001 data = vcpu->arch.st.msr_val;
2002 break; 2002 break;
2003 case MSR_KVM_PV_EOI_EN:
2004 data = vcpu->arch.pv_eoi.msr_val;
2005 break;
2003 case MSR_IA32_P5_MC_ADDR: 2006 case MSR_IA32_P5_MC_ADDR:
2004 case MSR_IA32_P5_MC_TYPE: 2007 case MSR_IA32_P5_MC_TYPE:
2005 case MSR_IA32_MCG_CAP: 2008 case MSR_IA32_MCG_CAP:
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bf4bda6d3e9a..9642d4a38602 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -31,7 +31,6 @@
31#include <linux/pci.h> 31#include <linux/pci.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/memblock.h> 33#include <linux/memblock.h>
34#include <linux/syscore_ops.h>
35 34
36#include <xen/xen.h> 35#include <xen/xen.h>
37#include <xen/interface/xen.h> 36#include <xen/interface/xen.h>
@@ -1470,130 +1469,38 @@ asmlinkage void __init xen_start_kernel(void)
1470#endif 1469#endif
1471} 1470}
1472 1471
1473#ifdef CONFIG_XEN_PVHVM 1472void __ref xen_hvm_init_shared_info(void)
1474/*
1475 * The pfn containing the shared_info is located somewhere in RAM. This
1476 * will cause trouble if the current kernel is doing a kexec boot into a
1477 * new kernel. The new kernel (and its startup code) can not know where
1478 * the pfn is, so it can not reserve the page. The hypervisor will
1479 * continue to update the pfn, and as a result memory corruption occours
1480 * in the new kernel.
1481 *
1482 * One way to work around this issue is to allocate a page in the
1483 * xen-platform pci device's BAR memory range. But pci init is done very
1484 * late and the shared_info page is already in use very early to read
1485 * the pvclock. So moving the pfn from RAM to MMIO is racy because some
1486 * code paths on other vcpus could access the pfn during the small
1487 * window when the old pfn is moved to the new pfn. There is even a
1488 * small window were the old pfn is not backed by a mfn, and during that
1489 * time all reads return -1.
1490 *
1491 * Because it is not known upfront where the MMIO region is located it
1492 * can not be used right from the start in xen_hvm_init_shared_info.
1493 *
1494 * To minimise trouble the move of the pfn is done shortly before kexec.
1495 * This does not eliminate the race because all vcpus are still online
1496 * when the syscore_ops will be called. But hopefully there is no work
1497 * pending at this point in time. Also the syscore_op is run last which
1498 * reduces the risk further.
1499 */
1500
1501static struct shared_info *xen_hvm_shared_info;
1502
1503static void xen_hvm_connect_shared_info(unsigned long pfn)
1504{ 1473{
1474 int cpu;
1505 struct xen_add_to_physmap xatp; 1475 struct xen_add_to_physmap xatp;
1476 static struct shared_info *shared_info_page = 0;
1506 1477
1478 if (!shared_info_page)
1479 shared_info_page = (struct shared_info *)
1480 extend_brk(PAGE_SIZE, PAGE_SIZE);
1507 xatp.domid = DOMID_SELF; 1481 xatp.domid = DOMID_SELF;
1508 xatp.idx = 0; 1482 xatp.idx = 0;
1509 xatp.space = XENMAPSPACE_shared_info; 1483 xatp.space = XENMAPSPACE_shared_info;
1510 xatp.gpfn = pfn; 1484 xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
1511 if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) 1485 if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
1512 BUG(); 1486 BUG();
1513 1487
1514} 1488 HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
1515static void xen_hvm_set_shared_info(struct shared_info *sip)
1516{
1517 int cpu;
1518
1519 HYPERVISOR_shared_info = sip;
1520 1489
1521 /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info 1490 /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
1522 * page, we use it in the event channel upcall and in some pvclock 1491 * page, we use it in the event channel upcall and in some pvclock
1523 * related functions. We don't need the vcpu_info placement 1492 * related functions. We don't need the vcpu_info placement
1524 * optimizations because we don't use any pv_mmu or pv_irq op on 1493 * optimizations because we don't use any pv_mmu or pv_irq op on
1525 * HVM. 1494 * HVM.
1526 * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is 1495 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
1527 * online but xen_hvm_set_shared_info is run at resume time too and 1496 * online but xen_hvm_init_shared_info is run at resume time too and
1528 * in that case multiple vcpus might be online. */ 1497 * in that case multiple vcpus might be online. */
1529 for_each_online_cpu(cpu) { 1498 for_each_online_cpu(cpu) {
1530 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; 1499 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
1531 } 1500 }
1532} 1501}
1533 1502
1534/* Reconnect the shared_info pfn to a mfn */ 1503#ifdef CONFIG_XEN_PVHVM
1535void xen_hvm_resume_shared_info(void)
1536{
1537 xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
1538}
1539
1540#ifdef CONFIG_KEXEC
1541static struct shared_info *xen_hvm_shared_info_kexec;
1542static unsigned long xen_hvm_shared_info_pfn_kexec;
1543
1544/* Remember a pfn in MMIO space for kexec reboot */
1545void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn)
1546{
1547 xen_hvm_shared_info_kexec = sip;
1548 xen_hvm_shared_info_pfn_kexec = pfn;
1549}
1550
1551static void xen_hvm_syscore_shutdown(void)
1552{
1553 struct xen_memory_reservation reservation = {
1554 .domid = DOMID_SELF,
1555 .nr_extents = 1,
1556 };
1557 unsigned long prev_pfn;
1558 int rc;
1559
1560 if (!xen_hvm_shared_info_kexec)
1561 return;
1562
1563 prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;
1564 set_xen_guest_handle(reservation.extent_start, &prev_pfn);
1565
1566 /* Move pfn to MMIO, disconnects previous pfn from mfn */
1567 xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec);
1568
1569 /* Update pointers, following hypercall is also a memory barrier */
1570 xen_hvm_set_shared_info(xen_hvm_shared_info_kexec);
1571
1572 /* Allocate new mfn for previous pfn */
1573 do {
1574 rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
1575 if (rc == 0)
1576 msleep(123);
1577 } while (rc == 0);
1578
1579 /* Make sure the previous pfn is really connected to a (new) mfn */
1580 BUG_ON(rc != 1);
1581}
1582
1583static struct syscore_ops xen_hvm_syscore_ops = {
1584 .shutdown = xen_hvm_syscore_shutdown,
1585};
1586#endif
1587
1588/* Use a pfn in RAM, may move to MMIO before kexec. */
1589static void __init xen_hvm_init_shared_info(void)
1590{
1591 /* Remember pointer for resume */
1592 xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
1593 xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
1594 xen_hvm_set_shared_info(xen_hvm_shared_info);
1595}
1596
1597static void __init init_hvm_pv_info(void) 1504static void __init init_hvm_pv_info(void)
1598{ 1505{
1599 int major, minor; 1506 int major, minor;
@@ -1644,9 +1551,6 @@ static void __init xen_hvm_guest_init(void)
1644 init_hvm_pv_info(); 1551 init_hvm_pv_info();
1645 1552
1646 xen_hvm_init_shared_info(); 1553 xen_hvm_init_shared_info();
1647#ifdef CONFIG_KEXEC
1648 register_syscore_ops(&xen_hvm_syscore_ops);
1649#endif
1650 1554
1651 if (xen_feature(XENFEAT_hvm_callback_vector)) 1555 if (xen_feature(XENFEAT_hvm_callback_vector))
1652 xen_have_vector_callback = 1; 1556 xen_have_vector_callback = 1;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index b65a76133f4f..5141d808e751 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1283,7 +1283,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
1283 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); 1283 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1284 1284
1285 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; 1285 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1286 if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { 1286 if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
1287 args->op.cmd = MMUEXT_INVLPG_MULTI; 1287 args->op.cmd = MMUEXT_INVLPG_MULTI;
1288 args->op.arg1.linear_addr = start; 1288 args->op.arg1.linear_addr = start;
1289 } 1289 }
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index b2e91d40a4cb..76ba0e97e530 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -196,9 +196,11 @@ RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3);
196 196
197/* When we populate back during bootup, the amount of pages can vary. The 197/* When we populate back during bootup, the amount of pages can vary. The
198 * max we have is seen is 395979, but that does not mean it can't be more. 198 * max we have is seen is 395979, but that does not mean it can't be more.
199 * But some machines can have 3GB I/O holes even. So lets reserve enough 199 * Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle
200 * for 4GB of I/O and E820 holes. */ 200 * it can re-use Xen provided mfn_list array, so we only need to allocate at
201RESERVE_BRK(p2m_populated, PMD_SIZE * 4); 201 * most three P2M top nodes. */
202RESERVE_BRK(p2m_populated, PAGE_SIZE * 3);
203
202static inline unsigned p2m_top_index(unsigned long pfn) 204static inline unsigned p2m_top_index(unsigned long pfn)
203{ 205{
204 BUG_ON(pfn >= MAX_P2M_PFN); 206 BUG_ON(pfn >= MAX_P2M_PFN);
@@ -575,12 +577,99 @@ static bool __init early_alloc_p2m(unsigned long pfn)
575 } 577 }
576 return true; 578 return true;
577} 579}
580
581/*
582 * Skim over the P2M tree looking at pages that are either filled with
583 * INVALID_P2M_ENTRY or with 1:1 PFNs. If found, re-use that page and
584 * replace the P2M leaf with a p2m_missing or p2m_identity.
585 * Stick the old page in the new P2M tree location.
586 */
587bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_mfn)
588{
589 unsigned topidx;
590 unsigned mididx;
591 unsigned ident_pfns;
592 unsigned inv_pfns;
593 unsigned long *p2m;
594 unsigned long *mid_mfn_p;
595 unsigned idx;
596 unsigned long pfn;
597
598 /* We only look when this entails a P2M middle layer */
599 if (p2m_index(set_pfn))
600 return false;
601
602 for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) {
603 topidx = p2m_top_index(pfn);
604
605 if (!p2m_top[topidx])
606 continue;
607
608 if (p2m_top[topidx] == p2m_mid_missing)
609 continue;
610
611 mididx = p2m_mid_index(pfn);
612 p2m = p2m_top[topidx][mididx];
613 if (!p2m)
614 continue;
615
616 if ((p2m == p2m_missing) || (p2m == p2m_identity))
617 continue;
618
619 if ((unsigned long)p2m == INVALID_P2M_ENTRY)
620 continue;
621
622 ident_pfns = 0;
623 inv_pfns = 0;
624 for (idx = 0; idx < P2M_PER_PAGE; idx++) {
625 /* IDENTITY_PFNs are 1:1 */
626 if (p2m[idx] == IDENTITY_FRAME(pfn + idx))
627 ident_pfns++;
628 else if (p2m[idx] == INVALID_P2M_ENTRY)
629 inv_pfns++;
630 else
631 break;
632 }
633 if ((ident_pfns == P2M_PER_PAGE) || (inv_pfns == P2M_PER_PAGE))
634 goto found;
635 }
636 return false;
637found:
638 /* Found one, replace old with p2m_identity or p2m_missing */
639 p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing);
640 /* And the other for save/restore.. */
641 mid_mfn_p = p2m_top_mfn_p[topidx];
642 /* NOTE: Even if it is a p2m_identity it should still be point to
643 * a page filled with INVALID_P2M_ENTRY entries. */
644 mid_mfn_p[mididx] = virt_to_mfn(p2m_missing);
645
646 /* Reset where we want to stick the old page in. */
647 topidx = p2m_top_index(set_pfn);
648 mididx = p2m_mid_index(set_pfn);
649
650 /* This shouldn't happen */
651 if (WARN_ON(p2m_top[topidx] == p2m_mid_missing))
652 early_alloc_p2m(set_pfn);
653
654 if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing))
655 return false;
656
657 p2m_init(p2m);
658 p2m_top[topidx][mididx] = p2m;
659 mid_mfn_p = p2m_top_mfn_p[topidx];
660 mid_mfn_p[mididx] = virt_to_mfn(p2m);
661
662 return true;
663}
578bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) 664bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
579{ 665{
580 if (unlikely(!__set_phys_to_machine(pfn, mfn))) { 666 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
581 if (!early_alloc_p2m(pfn)) 667 if (!early_alloc_p2m(pfn))
582 return false; 668 return false;
583 669
670 if (early_can_reuse_p2m_middle(pfn, mfn))
671 return __set_phys_to_machine(pfn, mfn);
672
584 if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/)) 673 if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/))
585 return false; 674 return false;
586 675
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index ead85576d54a..d11ca11d14fc 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -78,9 +78,16 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
78 memblock_reserve(start, size); 78 memblock_reserve(start, size);
79 79
80 xen_max_p2m_pfn = PFN_DOWN(start + size); 80 xen_max_p2m_pfn = PFN_DOWN(start + size);
81 for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
82 unsigned long mfn = pfn_to_mfn(pfn);
83
84 if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
85 continue;
86 WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
87 pfn, mfn);
81 88
82 for (pfn = PFN_DOWN(start); pfn <= xen_max_p2m_pfn; pfn++)
83 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); 89 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
90 }
84} 91}
85 92
86static unsigned long __init xen_do_chunk(unsigned long start, 93static unsigned long __init xen_do_chunk(unsigned long start,
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index ae8a00c39de4..45329c8c226e 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled)
30{ 30{
31#ifdef CONFIG_XEN_PVHVM 31#ifdef CONFIG_XEN_PVHVM
32 int cpu; 32 int cpu;
33 xen_hvm_resume_shared_info(); 33 xen_hvm_init_shared_info();
34 xen_callback_vector(); 34 xen_callback_vector();
35 xen_unplug_emulated_devices(); 35 xen_unplug_emulated_devices();
36 if (xen_feature(XENFEAT_hvm_safe_pvclock)) { 36 if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 1e4329e04e0f..202d4c150154 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -41,7 +41,7 @@ void xen_enable_syscall(void);
41void xen_vcpu_restore(void); 41void xen_vcpu_restore(void);
42 42
43void xen_callback_vector(void); 43void xen_callback_vector(void);
44void xen_hvm_resume_shared_info(void); 44void xen_hvm_init_shared_info(void);
45void xen_unplug_emulated_devices(void); 45void xen_unplug_emulated_devices(void);
46 46
47void __init xen_build_dynamic_phys_to_machine(void); 47void __init xen_build_dynamic_phys_to_machine(void);