path: root/arch/x86
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2012-09-16 20:31:36 -0400
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2012-09-16 20:31:36 -0400
commit	7ac3c93e5dd74486ca4f8f0b02ae55182658d2e5 (patch)
tree	08b949c872aefbc0f8e12bdcc4dc82297bdd0f2e /arch/x86
parent	23666a74c9f552bc9cfef20ded1b8b29bedb80c6 (diff)
parent	5698bd757d55b1bb87edd1a9744ab09c142abfc2 (diff)
Merge 3.6-rc6 into tty-next
This pulls in the fixes in 3.6-rc6.

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/include/asm/spinlock.h            |   3
-rw-r--r--  arch/x86/kernel/alternative.c              |   2
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c     |   1
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_lbr.c |   3
-rw-r--r--  arch/x86/kernel/irq.c                      |   2
-rw-r--r--  arch/x86/kernel/microcode_amd.c            |   7
-rw-r--r--  arch/x86/kernel/microcode_core.c           |   3
-rw-r--r--  arch/x86/kvm/emulate.c                     |  30
-rw-r--r--  arch/x86/kvm/i8259.c                       |   2
-rw-r--r--  arch/x86/kvm/mmu.c                         |  13
-rw-r--r--  arch/x86/kvm/vmx.c                         |  23
-rw-r--r--  arch/x86/kvm/x86.c                         |  18
-rw-r--r--  arch/x86/xen/enlighten.c                   | 118
-rw-r--r--  arch/x86/xen/mmu.c                         |   2
-rw-r--r--  arch/x86/xen/p2m.c                         |  95
-rw-r--r--  arch/x86/xen/setup.c                       |   9
-rw-r--r--  arch/x86/xen/suspend.c                     |   2
-rw-r--r--  arch/x86/xen/xen-ops.h                     |   2
18 files changed, 190 insertions(+), 145 deletions(-)
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index b315a33867f2..33692eaabab5 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -12,8 +12,7 @@
  * Simple spin lock operations. There are two variants, one clears IRQ's
  * on the local processor, one does not.
  *
- * These are fair FIFO ticket locks, which are currently limited to 256
- * CPUs.
+ * These are fair FIFO ticket locks, which support up to 2^16 CPUs.
  *
  * (the type definitions are in asm/spinlock_types.h)
  */
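
For reference, a minimal user-space sketch of the ticket-lock scheme the updated comment describes: with 16-bit head and tail counters the lock can order up to 2^16 waiters, which is where the new limit comes from. This is an illustration in C11 atomics, not the kernel's implementation.

#include <stdatomic.h>
#include <stdio.h>

struct ticket_lock {
	atomic_ushort head;	/* next ticket being served */
	atomic_ushort tail;	/* next ticket to hand out */
};

static void ticket_lock(struct ticket_lock *l)
{
	unsigned short me = atomic_fetch_add(&l->tail, 1);

	while (atomic_load(&l->head) != me)
		;	/* spin: strictly FIFO, hence "fair" */
}

static void ticket_unlock(struct ticket_lock *l)
{
	atomic_fetch_add(&l->head, 1);
}

int main(void)
{
	struct ticket_lock l = { 0, 0 };

	ticket_lock(&l);
	puts("in critical section");
	ticket_unlock(&l);
	return 0;
}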
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index afb7ff79a29f..ced4534baed5 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -165,7 +165,7 @@ static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
 #endif
 
 #ifdef P6_NOP1
-static const unsigned char __initconst_or_module p6nops[] =
+static const unsigned char p6nops[] =
 {
 	P6_NOP1,
 	P6_NOP2,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 7f2739e03e79..0d3d63afa76a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2008,6 +2008,7 @@ __init int intel_pmu_init(void)
 		break;
 
 	case 28: /* Atom */
+	case 54: /* Cedariew */
 		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 520b4265fcd2..da02e9cc3754 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -686,7 +686,8 @@ void intel_pmu_lbr_init_atom(void)
 	 * to have an operational LBR which can freeze
 	 * on PMU interrupt
 	 */
-	if (boot_cpu_data.x86_mask < 10) {
+	if (boot_cpu_data.x86_model == 28
+	    && boot_cpu_data.x86_mask < 10) {
 		pr_cont("LBR disabled due to erratum");
 		return;
 	}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 7ad683d78645..d44f7829968e 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -270,7 +270,7 @@ void fixup_irqs(void)
 
 		if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
 			break_affinity = 1;
-			affinity = cpu_all_mask;
+			affinity = cpu_online_mask;
 		}
 
 		chip = irq_data_get_irq_chip(data);
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 8a2ce8fd41c0..82746f942cd8 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -143,11 +143,12 @@ static int get_matching_microcode(int cpu, const u8 *ucode_ptr,
 				     unsigned int *current_size)
 {
 	struct microcode_header_amd *mc_hdr;
-	unsigned int actual_size;
+	unsigned int actual_size, patch_size;
 	u16 equiv_cpu_id;
 
 	/* size of the current patch we're staring at */
-	*current_size = *(u32 *)(ucode_ptr + 4) + SECTION_HDR_SIZE;
+	patch_size = *(u32 *)(ucode_ptr + 4);
+	*current_size = patch_size + SECTION_HDR_SIZE;
 
 	equiv_cpu_id = find_equiv_id();
 	if (!equiv_cpu_id)
@@ -174,7 +175,7 @@ static int get_matching_microcode(int cpu, const u8 *ucode_ptr,
 	/*
 	 * now that the header looks sane, verify its size
 	 */
-	actual_size = verify_ucode_size(cpu, *current_size, leftover_size);
+	actual_size = verify_ucode_size(cpu, patch_size, leftover_size);
 	if (!actual_size)
 		return 0;
 
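
The fix above separates two sizes the old code conflated: patch_size, read from the 4-byte length field of the section header, and *current_size, the distance to advance to the next section in the container. A stand-alone sketch of that bookkeeping, with SECTION_HDR_SIZE mirroring the kernel's 8-byte section header and the buffer contents made up:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SECTION_HDR_SIZE 8	/* 4-byte section type + 4-byte size field */

int main(void)
{
	uint8_t ucode[16] = { 0 };
	uint32_t len = 0x1234, patch_size, current_size;

	memcpy(ucode + 4, &len, sizeof(len));	/* fake section header */

	patch_size   = *(uint32_t *)(ucode + 4);	/* patch body only */
	current_size = patch_size + SECTION_HDR_SIZE;	/* advance in container */
	printf("patch=%#x advance=%#x\n", patch_size, current_size);
	return 0;
}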
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 4873e62db6a1..9e5bcf1e2376 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -225,6 +225,9 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
 	if (do_microcode_update(buf, len) == 0)
 		ret = (ssize_t)len;
 
+	if (ret > 0)
+		perf_check_microcode();
+
 	mutex_unlock(&microcode_mutex);
 	put_online_cpus();
 
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 97d9a9914ba8..a3b57a27be88 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -475,13 +475,26 @@ register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg)
 	return address_mask(ctxt, reg);
 }
 
+static void masked_increment(ulong *reg, ulong mask, int inc)
+{
+	assign_masked(reg, *reg + inc, mask);
+}
+
 static inline void
 register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc)
 {
+	ulong mask;
+
 	if (ctxt->ad_bytes == sizeof(unsigned long))
-		*reg += inc;
+		mask = ~0UL;
 	else
-		*reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt));
+		mask = ad_mask(ctxt);
+	masked_increment(reg, mask, inc);
+}
+
+static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
+{
+	masked_increment(&ctxt->regs[VCPU_REGS_RSP], stack_mask(ctxt), inc);
 }
 
 static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
@@ -1522,8 +1535,8 @@ static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
 {
 	struct segmented_address addr;
 
-	register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes);
-	addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
+	rsp_increment(ctxt, -bytes);
+	addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt);
 	addr.seg = VCPU_SREG_SS;
 
 	return segmented_write(ctxt, addr, data, bytes);
@@ -1542,13 +1555,13 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
 	int rc;
 	struct segmented_address addr;
 
-	addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
+	addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt);
 	addr.seg = VCPU_SREG_SS;
 	rc = segmented_read(ctxt, addr, dest, len);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len);
+	rsp_increment(ctxt, len);
 	return rc;
 }
 
@@ -1688,8 +1701,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
 
 	while (reg >= VCPU_REGS_RAX) {
 		if (reg == VCPU_REGS_RSP) {
-			register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP],
-						   ctxt->op_bytes);
+			rsp_increment(ctxt, ctxt->op_bytes);
 			--reg;
 		}
 
@@ -2825,7 +2837,7 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
 	rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
-	register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val);
+	rsp_increment(ctxt, ctxt->src.val);
 	return X86EMUL_CONTINUE;
 }
 
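
The refactoring above funnels every stack-pointer update through masked_increment(), so only the low stack_mask() bits of RSP change and the upper bits survive 16- or 32-bit stack operations. A user-space sketch of the masking; assign_masked() here matches the shape of the emulator's helper, but this is an illustration, not the emulator code:

#include <stdio.h>

typedef unsigned long ulong;

static void assign_masked(ulong *dest, ulong src, ulong mask)
{
	*dest = (*dest & ~mask) | (src & mask);
}

static void masked_increment(ulong *reg, ulong mask, int inc)
{
	assign_masked(reg, *reg + inc, mask);
}

int main(void)
{
	ulong rsp = 0xffff00000002UL;

	masked_increment(&rsp, 0xffffUL, -4);	/* 16-bit stack push */
	printf("%#lx\n", rsp);	/* low word wraps, upper bits intact */
	return 0;
}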
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index e498b18f010c..9fc9aa7ac703 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -318,7 +318,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 	if (val & 0x10) {
 		u8 edge_irr = s->irr & ~s->elcr;
 		int i;
-		bool found;
+		bool found = false;
 		struct kvm_vcpu *vcpu;
 
 		s->init4 = val & 1;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 01ca00423938..7fbd0d273ea8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4113,16 +4113,21 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 		LIST_HEAD(invalid_list);
 
 		/*
+		 * Never scan more than sc->nr_to_scan VM instances.
+		 * Will not hit this condition practically since we do not try
+		 * to shrink more than one VM and it is very unlikely to see
+		 * !n_used_mmu_pages so many times.
+		 */
+		if (!nr_to_scan--)
+			break;
+		/*
 		 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
 		 * here. We may skip a VM instance errorneosly, but we do not
 		 * want to shrink a VM that only started to populate its MMU
 		 * anyway.
 		 */
-		if (kvm->arch.n_used_mmu_pages > 0) {
-			if (!nr_to_scan--)
-				break;
+		if (!kvm->arch.n_used_mmu_pages)
 			continue;
-		}
 
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
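
The reordering above charges the sc->nr_to_scan budget for every VM visited, before the n_used_mmu_pages test, so the shrinker can no longer iterate unboundedly over VMs whose MMUs are empty. A toy model of the corrected loop; the names are stand-ins, not the kernel's:

#include <stdio.h>

struct vm { int n_used_mmu_pages; };

static int shrink_scan(struct vm *vms, int nvms, int nr_to_scan)
{
	int i, zapped = 0;

	for (i = 0; i < nvms; i++) {
		if (!nr_to_scan--)	/* budget spent per VM visited */
			break;
		if (!vms[i].n_used_mmu_pages)
			continue;	/* nothing to reclaim here */
		zapped++;		/* stand-in for zapping a page */
	}
	return zapped;
}

int main(void)
{
	struct vm vms[4] = { { 0 }, { 0 }, { 5 }, { 7 } };

	printf("zapped %d\n", shrink_scan(vms, 4, 3));
	return 0;
}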
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c00f03de1b79..b1eb202ee76a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3619,6 +3619,7 @@ static void seg_setup(int seg)
 
 static int alloc_apic_access_page(struct kvm *kvm)
 {
+	struct page *page;
 	struct kvm_userspace_memory_region kvm_userspace_mem;
 	int r = 0;
 
@@ -3633,7 +3634,13 @@ static int alloc_apic_access_page(struct kvm *kvm)
 	if (r)
 		goto out;
 
-	kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
+	page = gfn_to_page(kvm, 0xfee00);
+	if (is_error_page(page)) {
+		r = -EFAULT;
+		goto out;
+	}
+
+	kvm->arch.apic_access_page = page;
 out:
 	mutex_unlock(&kvm->slots_lock);
 	return r;
@@ -3641,6 +3648,7 @@ out:
 
 static int alloc_identity_pagetable(struct kvm *kvm)
 {
+	struct page *page;
 	struct kvm_userspace_memory_region kvm_userspace_mem;
 	int r = 0;
 
@@ -3656,8 +3664,13 @@ static int alloc_identity_pagetable(struct kvm *kvm)
 	if (r)
 		goto out;
 
-	kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
-			kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
+	page = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
+	if (is_error_page(page)) {
+		r = -EFAULT;
+		goto out;
+	}
+
+	kvm->arch.ept_identity_pagetable = page;
 out:
 	mutex_unlock(&kvm->slots_lock);
 	return r;
@@ -6575,7 +6588,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 	/* Exposing INVPCID only when PCID is exposed */
 	best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
 	if (vmx_invpcid_supported() &&
-	    best && (best->ecx & bit(X86_FEATURE_INVPCID)) &&
+	    best && (best->ebx & bit(X86_FEATURE_INVPCID)) &&
 	    guest_cpuid_has_pcid(vcpu)) {
 		exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
 		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
@@ -6585,7 +6598,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
 			     exec_control);
 		if (best)
-			best->ecx &= ~bit(X86_FEATURE_INVPCID);
+			best->ebx &= ~bit(X86_FEATURE_INVPCID);
 	}
 }
 
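
The ecx-to-ebx change above matters because INVPCID is enumerated in CPUID leaf 7, subleaf 0, EBX bit 10; the old code tested the wrong output register, so the feature check could never match. A quick user-space check of the same bit, assuming GCC's <cpuid.h> is available:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
		return 1;
	printf("INVPCID %ssupported\n", (ebx & (1u << 10)) ? "" : "not ");
	return 0;
}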
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 42bce48f6928..2966c847d489 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -806,7 +806,7 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
  * kvm-specific. Those are put in the beginning of the list.
  */
 
-#define KVM_SAVE_MSRS_BEGIN	9
+#define KVM_SAVE_MSRS_BEGIN	10
 static u32 msrs_to_save[] = {
 	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
@@ -2000,6 +2000,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_KVM_STEAL_TIME:
 		data = vcpu->arch.st.msr_val;
 		break;
+	case MSR_KVM_PV_EOI_EN:
+		data = vcpu->arch.pv_eoi.msr_val;
+		break;
 	case MSR_IA32_P5_MC_ADDR:
 	case MSR_IA32_P5_MC_TYPE:
 	case MSR_IA32_MCG_CAP:
@@ -5110,17 +5113,20 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
 		!kvm_event_needs_reinjection(vcpu);
 }
 
-static void vapic_enter(struct kvm_vcpu *vcpu)
+static int vapic_enter(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	struct page *page;
 
 	if (!apic || !apic->vapic_addr)
-		return;
+		return 0;
 
 	page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
+	if (is_error_page(page))
+		return -EFAULT;
 
 	vcpu->arch.apic->vapic_page = page;
+	return 0;
 }
 
 static void vapic_exit(struct kvm_vcpu *vcpu)
@@ -5427,7 +5433,11 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 	}
 
 	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
-	vapic_enter(vcpu);
+	r = vapic_enter(vcpu);
+	if (r) {
+		srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+		return r;
+	}
 
 	r = 1;
 	while (r > 0) {
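
vapic_enter() now reports failure instead of silently caching an error page, and __vcpu_run() has to drop the SRCU read side it already holds before returning that error. A reduced model of the error path; every name below is a placeholder for illustration:

#include <errno.h>
#include <stdio.h>

static int enter_ok;	/* 0 simulates gfn_to_page() returning an error page */

static int vapic_enter_stub(void)
{
	return enter_ok ? 0 : -EFAULT;
}

static int vcpu_run_stub(void)
{
	int r;

	/* srcu_read_lock() would be taken here */
	r = vapic_enter_stub();
	if (r) {
		/* the read lock must be released on this path too */
		return r;
	}
	/* ... main run loop, then srcu_read_unlock() ... */
	return 0;
}

int main(void)
{
	printf("run -> %d\n", vcpu_run_stub());
	return 0;
}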
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bf4bda6d3e9a..9642d4a38602 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -31,7 +31,6 @@
 #include <linux/pci.h>
 #include <linux/gfp.h>
 #include <linux/memblock.h>
-#include <linux/syscore_ops.h>
 
 #include <xen/xen.h>
 #include <xen/interface/xen.h>
@@ -1470,130 +1469,38 @@ asmlinkage void __init xen_start_kernel(void)
 #endif
 }
 
-#ifdef CONFIG_XEN_PVHVM
-/*
- * The pfn containing the shared_info is located somewhere in RAM. This
- * will cause trouble if the current kernel is doing a kexec boot into a
- * new kernel. The new kernel (and its startup code) can not know where
- * the pfn is, so it can not reserve the page. The hypervisor will
- * continue to update the pfn, and as a result memory corruption occours
- * in the new kernel.
- *
- * One way to work around this issue is to allocate a page in the
- * xen-platform pci device's BAR memory range. But pci init is done very
- * late and the shared_info page is already in use very early to read
- * the pvclock. So moving the pfn from RAM to MMIO is racy because some
- * code paths on other vcpus could access the pfn during the small
- * window when the old pfn is moved to the new pfn. There is even a
- * small window were the old pfn is not backed by a mfn, and during that
- * time all reads return -1.
- *
- * Because it is not known upfront where the MMIO region is located it
- * can not be used right from the start in xen_hvm_init_shared_info.
- *
- * To minimise trouble the move of the pfn is done shortly before kexec.
- * This does not eliminate the race because all vcpus are still online
- * when the syscore_ops will be called. But hopefully there is no work
- * pending at this point in time. Also the syscore_op is run last which
- * reduces the risk further.
- */
-
-static struct shared_info *xen_hvm_shared_info;
-
-static void xen_hvm_connect_shared_info(unsigned long pfn)
+void __ref xen_hvm_init_shared_info(void)
 {
+	int cpu;
 	struct xen_add_to_physmap xatp;
+	static struct shared_info *shared_info_page = 0;
 
+	if (!shared_info_page)
+		shared_info_page = (struct shared_info *)
+			extend_brk(PAGE_SIZE, PAGE_SIZE);
 	xatp.domid = DOMID_SELF;
 	xatp.idx = 0;
 	xatp.space = XENMAPSPACE_shared_info;
-	xatp.gpfn = pfn;
+	xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
 	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
 		BUG();
 
-}
-static void xen_hvm_set_shared_info(struct shared_info *sip)
-{
-	int cpu;
-
-	HYPERVISOR_shared_info = sip;
+	HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
 
 	/* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
 	 * page, we use it in the event channel upcall and in some pvclock
 	 * related functions. We don't need the vcpu_info placement
 	 * optimizations because we don't use any pv_mmu or pv_irq op on
 	 * HVM.
-	 * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is
-	 * online but xen_hvm_set_shared_info is run at resume time too and
+	 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
+	 * online but xen_hvm_init_shared_info is run at resume time too and
 	 * in that case multiple vcpus might be online. */
 	for_each_online_cpu(cpu) {
 		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
 	}
 }
 
-/* Reconnect the shared_info pfn to a mfn */
-void xen_hvm_resume_shared_info(void)
-{
-	xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
-}
-
-#ifdef CONFIG_KEXEC
-static struct shared_info *xen_hvm_shared_info_kexec;
-static unsigned long xen_hvm_shared_info_pfn_kexec;
-
-/* Remember a pfn in MMIO space for kexec reboot */
-void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn)
-{
-	xen_hvm_shared_info_kexec = sip;
-	xen_hvm_shared_info_pfn_kexec = pfn;
-}
-
-static void xen_hvm_syscore_shutdown(void)
-{
-	struct xen_memory_reservation reservation = {
-		.domid = DOMID_SELF,
-		.nr_extents = 1,
-	};
-	unsigned long prev_pfn;
-	int rc;
-
-	if (!xen_hvm_shared_info_kexec)
-		return;
-
-	prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;
-	set_xen_guest_handle(reservation.extent_start, &prev_pfn);
-
-	/* Move pfn to MMIO, disconnects previous pfn from mfn */
-	xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec);
-
-	/* Update pointers, following hypercall is also a memory barrier */
-	xen_hvm_set_shared_info(xen_hvm_shared_info_kexec);
-
-	/* Allocate new mfn for previous pfn */
-	do {
-		rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
-		if (rc == 0)
-			msleep(123);
-	} while (rc == 0);
-
-	/* Make sure the previous pfn is really connected to a (new) mfn */
-	BUG_ON(rc != 1);
-}
-
-static struct syscore_ops xen_hvm_syscore_ops = {
-	.shutdown = xen_hvm_syscore_shutdown,
-};
-#endif
-
-/* Use a pfn in RAM, may move to MMIO before kexec. */
-static void __init xen_hvm_init_shared_info(void)
-{
-	/* Remember pointer for resume */
-	xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
-	xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
-	xen_hvm_set_shared_info(xen_hvm_shared_info);
-}
-
+#ifdef CONFIG_XEN_PVHVM
 static void __init init_hvm_pv_info(void)
 {
 	int major, minor;
@@ -1644,9 +1551,6 @@ static void __init xen_hvm_guest_init(void)
 	init_hvm_pv_info();
 
 	xen_hvm_init_shared_info();
-#ifdef CONFIG_KEXEC
-	register_syscore_ops(&xen_hvm_syscore_ops);
-#endif
 
 	if (xen_feature(XENFEAT_hvm_callback_vector))
 		xen_have_vector_callback = 1;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index b65a76133f4f..5141d808e751 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1283,7 +1283,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
 	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
 
 	args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
-	if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
+	if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
 		args->op.cmd = MMUEXT_INVLPG_MULTI;
 		args->op.arg1.linear_addr = start;
 	}
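
The one-character fix above tests the right field: callers signal a full flush by passing TLB_FLUSH_ALL in end, so that is what must be checked before downgrading to a single-page INVLPG. A self-contained sketch of the corrected decision:

#include <stdio.h>

#define TLB_FLUSH_ALL	(-1UL)
#define PAGE_SIZE	4096UL

static const char *pick_flush_op(unsigned long start, unsigned long end)
{
	if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE)
		return "MMUEXT_INVLPG_MULTI";	/* single page */
	return "MMUEXT_TLB_FLUSH_MULTI";	/* everything */
}

int main(void)
{
	printf("%s\n", pick_flush_op(0x1000, 0x2000));
	printf("%s\n", pick_flush_op(0x1000, TLB_FLUSH_ALL));
	return 0;
}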
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index b2e91d40a4cb..76ba0e97e530 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -196,9 +196,11 @@ RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3);
 
 /* When we populate back during bootup, the amount of pages can vary. The
  * max we have is seen is 395979, but that does not mean it can't be more.
- * But some machines can have 3GB I/O holes even. So lets reserve enough
- * for 4GB of I/O and E820 holes. */
-RESERVE_BRK(p2m_populated, PMD_SIZE * 4);
+ * Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle
+ * it can re-use Xen provided mfn_list array, so we only need to allocate at
+ * most three P2M top nodes. */
+RESERVE_BRK(p2m_populated, PAGE_SIZE * 3);
+
 static inline unsigned p2m_top_index(unsigned long pfn)
 {
 	BUG_ON(pfn >= MAX_P2M_PFN);
@@ -575,12 +577,99 @@ static bool __init early_alloc_p2m(unsigned long pfn)
 	}
 	return true;
 }
+
+/*
+ * Skim over the P2M tree looking at pages that are either filled with
+ * INVALID_P2M_ENTRY or with 1:1 PFNs. If found, re-use that page and
+ * replace the P2M leaf with a p2m_missing or p2m_identity.
+ * Stick the old page in the new P2M tree location.
+ */
+bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_mfn)
+{
+	unsigned topidx;
+	unsigned mididx;
+	unsigned ident_pfns;
+	unsigned inv_pfns;
+	unsigned long *p2m;
+	unsigned long *mid_mfn_p;
+	unsigned idx;
+	unsigned long pfn;
+
+	/* We only look when this entails a P2M middle layer */
+	if (p2m_index(set_pfn))
+		return false;
+
+	for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) {
+		topidx = p2m_top_index(pfn);
+
+		if (!p2m_top[topidx])
+			continue;
+
+		if (p2m_top[topidx] == p2m_mid_missing)
+			continue;
+
+		mididx = p2m_mid_index(pfn);
+		p2m = p2m_top[topidx][mididx];
+		if (!p2m)
+			continue;
+
+		if ((p2m == p2m_missing) || (p2m == p2m_identity))
+			continue;
+
+		if ((unsigned long)p2m == INVALID_P2M_ENTRY)
+			continue;
+
+		ident_pfns = 0;
+		inv_pfns = 0;
+		for (idx = 0; idx < P2M_PER_PAGE; idx++) {
+			/* IDENTITY_PFNs are 1:1 */
+			if (p2m[idx] == IDENTITY_FRAME(pfn + idx))
+				ident_pfns++;
+			else if (p2m[idx] == INVALID_P2M_ENTRY)
+				inv_pfns++;
+			else
+				break;
+		}
+		if ((ident_pfns == P2M_PER_PAGE) || (inv_pfns == P2M_PER_PAGE))
+			goto found;
+	}
+	return false;
+found:
+	/* Found one, replace old with p2m_identity or p2m_missing */
+	p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing);
+	/* And the other for save/restore.. */
+	mid_mfn_p = p2m_top_mfn_p[topidx];
+	/* NOTE: Even if it is a p2m_identity it should still be point to
+	 * a page filled with INVALID_P2M_ENTRY entries. */
+	mid_mfn_p[mididx] = virt_to_mfn(p2m_missing);
+
+	/* Reset where we want to stick the old page in. */
+	topidx = p2m_top_index(set_pfn);
+	mididx = p2m_mid_index(set_pfn);
+
+	/* This shouldn't happen */
+	if (WARN_ON(p2m_top[topidx] == p2m_mid_missing))
+		early_alloc_p2m(set_pfn);
+
+	if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing))
+		return false;
+
+	p2m_init(p2m);
+	p2m_top[topidx][mididx] = p2m;
+	mid_mfn_p = p2m_top_mfn_p[topidx];
+	mid_mfn_p[mididx] = virt_to_mfn(p2m);
+
+	return true;
+}
 bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 {
 	if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
 		if (!early_alloc_p2m(pfn))
 			return false;
 
+		if (early_can_reuse_p2m_middle(pfn, mfn))
+			return __set_phys_to_machine(pfn, mfn);
+
 		if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/))
 			return false;
 
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index ead85576d54a..d11ca11d14fc 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -78,9 +78,16 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
 	memblock_reserve(start, size);
 
 	xen_max_p2m_pfn = PFN_DOWN(start + size);
+	for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
+		unsigned long mfn = pfn_to_mfn(pfn);
+
+		if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
+			continue;
+		WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
+			pfn, mfn);
 
-	for (pfn = PFN_DOWN(start); pfn <= xen_max_p2m_pfn; pfn++)
 		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+	}
 }
 
 static unsigned long __init xen_do_chunk(unsigned long start,
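
Besides warning when a 1-1 mapping would be clobbered, the hunk above also tightens the loop bound: PFN_DOWN(start + size) is one past the last page of an aligned region, so iterating with < rather than <= stops the loop from invalidating one extra p2m entry. A toy demonstration of the boundary:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

int main(void)
{
	unsigned long start = 0x100000, size = 0x3000, pfn;
	unsigned long max_pfn = PFN_DOWN(start + size);	/* exclusive end */

	for (pfn = PFN_DOWN(start); pfn < max_pfn; pfn++)
		printf("invalidate pfn %#lx\n", pfn);	/* 3 pages, not 4 */
	return 0;
}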
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index ae8a00c39de4..45329c8c226e 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled)
 {
 #ifdef CONFIG_XEN_PVHVM
 	int cpu;
-	xen_hvm_resume_shared_info();
+	xen_hvm_init_shared_info();
 	xen_callback_vector();
 	xen_unplug_emulated_devices();
 	if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 1e4329e04e0f..202d4c150154 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -41,7 +41,7 @@ void xen_enable_syscall(void);
 void xen_vcpu_restore(void);
 
 void xen_callback_vector(void);
-void xen_hvm_resume_shared_info(void);
+void xen_hvm_init_shared_info(void);
 void xen_unplug_emulated_devices(void);
 
 void __init xen_build_dynamic_phys_to_machine(void);