path: root/arch/x86/xen
author     Linus Torvalds <torvalds@linux-foundation.org>  2012-07-24 16:14:03 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-07-24 16:14:03 -0400
commit     62c4d9afa4bcf5315e2745a17a0228bf65b9ba40 (patch)
tree       a7b9d97283441ea5f0c738fa388e120c4c1491b6 /arch/x86/xen
parent     5fecc9d8f59e765c2a48379dd7c6f5cf88c7d75a (diff)
parent     c3d93f880197953f86ab90d9da4744e926b38e33 (diff)
Merge tag 'stable/for-linus-3.6-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen
Pull Xen update from Konrad Rzeszutek Wilk:
 "Features:
  * Performance improvement to lower the number of traps the hypervisor
    has to handle for 32-bit guests, mainly for setting PTE entries and
    updating TLS descriptors.
  * MCE polling driver to collect the hypervisor MCE buffer and present
    it to /dev/mcelog.
  * Physical CPU online/offline support.  When a privileged guest is
    booted it is presented with virtual CPUs, which might have a 1:1
    mapping to physical CPUs but usually don't.  This provides a
    mechanism to offline/online physical CPUs.
 Bug-fixes for:
  * Coverity-found issues in the console and ACPI processor drivers.
  * PVonHVM kexec, along with some cleanups.
  * Pages that fall within E820 gaps and non-RAM regions (and had been
    released to the hypervisor) would be populated back, but potentially
    in non-RAM regions."

* tag 'stable/for-linus-3.6-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen:
  xen: populate correct number of pages when across mem boundary (v2)
  xen PVonHVM: move shared_info to MMIO before kexec
  xen: simplify init_hvm_pv_info
  xen: remove cast from HYPERVISOR_shared_info assignment
  xen: enable platform-pci only in a Xen guest
  xen/pv-on-hvm kexec: shutdown watches from old kernel
  xen/x86: avoid updating TLS descriptors if they haven't changed
  xen/x86: add desc_equal() to compare GDT descriptors
  xen/mm: zero PTEs for non-present MFNs in the initial page table
  xen/mm: do direct hypercall in xen_set_pte() if batching is unavailable
  xen/hvc: Fix up checks when the info is allocated.
  xen/acpi: Fix potential memory leak.
  xen/mce: add .poll method for mcelog device driver
  xen/mce: schedule a workqueue to avoid sleep in atomic context
  xen/pcpu: Xen physical cpus online/offline sys interface
  xen/mce: Register native mce handler as vMCE bounce back point
  x86, MCE, AMD: Adjust initcall sequence for xen
  xen/mce: Add mcelog support for Xen platform
Diffstat (limited to 'arch/x86/xen')
-rw-r--r--  arch/x86/xen/enlighten.c  224
-rw-r--r--  arch/x86/xen/mmu.c         39
-rw-r--r--  arch/x86/xen/setup.c       23
-rw-r--r--  arch/x86/xen/suspend.c      2
-rw-r--r--  arch/x86/xen/xen-ops.h      2
5 files changed, 217 insertions, 73 deletions
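The /dev/mcelog interface that the new Xen MCE polling driver feeds is the ordinary x86 mcelog character device. As a rough illustration of the consumer side, the user-space sketch below reads whole records from the device; it is not part of this merge, and it assumes the x86 uapi header <asm/mce.h> for struct mce and the MCE_GET_RECORD_LEN ioctl (root is needed to open the device).

/* Minimal user-space sketch: drain /dev/mcelog and print each record. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <asm/mce.h>		/* struct mce, MCE_GET_RECORD_LEN */

int main(void)
{
	int fd = open("/dev/mcelog", O_RDONLY);
	int rec_len = 0;
	char buf[4096];
	ssize_t n;

	if (fd < 0 || ioctl(fd, MCE_GET_RECORD_LEN, &rec_len) < 0 ||
	    rec_len <= 0) {
		perror("mcelog");
		return 1;
	}
	/* The device returns whole records; read as many as fit. */
	n = read(fd, buf, (sizeof(buf) / rec_len) * rec_len);
	for (ssize_t off = 0; off + rec_len <= n; off += rec_len) {
		struct mce m = { 0 };

		memcpy(&m, buf + off, rec_len < (int)sizeof(m) ?
		       (size_t)rec_len : sizeof(m));
		printf("cpu %u bank %u status %#llx addr %#llx\n",
		       m.extcpu, (unsigned)m.bank,
		       (unsigned long long)m.status,
		       (unsigned long long)m.addr);
	}
	close(fd);
	return 0;
}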
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index ed7d54985d0..bf4bda6d3e9 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -31,6 +31,7 @@
 #include <linux/pci.h>
 #include <linux/gfp.h>
 #include <linux/memblock.h>
+#include <linux/syscore_ops.h>
 
 #include <xen/xen.h>
 #include <xen/interface/xen.h>
@@ -38,6 +39,7 @@
 #include <xen/interface/physdev.h>
 #include <xen/interface/vcpu.h>
 #include <xen/interface/memory.h>
+#include <xen/interface/xen-mca.h>
 #include <xen/features.h>
 #include <xen/page.h>
 #include <xen/hvm.h>
@@ -107,7 +109,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback);
  * Point at some empty memory to start with. We map the real shared_info
  * page as soon as fixmap is up and running.
  */
-struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
+struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
 
 /*
  * Flag to determine whether vcpu info placement is available on all
@@ -124,6 +126,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
  */
 static int have_vcpu_info_placement = 1;
 
+struct tls_descs {
+	struct desc_struct desc[3];
+};
+
+/*
+ * Updating the 3 TLS descriptors in the GDT on every task switch is
+ * surprisingly expensive so we avoid updating them if they haven't
+ * changed.  Since Xen writes different descriptors than the one
+ * passed in the update_descriptor hypercall we keep shadow copies to
+ * compare against.
+ */
+static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
+
 static void clamp_max_cpus(void)
 {
 #ifdef CONFIG_SMP
@@ -341,9 +356,7 @@ static void __init xen_init_cpuid_mask(void)
 	unsigned int xsave_mask;
 
 	cpuid_leaf1_edx_mask =
-		~((1 << X86_FEATURE_MCE)  | /* disable MCE */
-		  (1 << X86_FEATURE_MCA)  | /* disable MCA */
-		  (1 << X86_FEATURE_MTRR) | /* disable MTRR */
+		~((1 << X86_FEATURE_MTRR) | /* disable MTRR */
 		  (1 << X86_FEATURE_ACC));  /* thermal monitoring */
 
 	if (!xen_initial_domain())
@@ -540,12 +553,28 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
 		BUG();
 }
 
+static inline bool desc_equal(const struct desc_struct *d1,
+			      const struct desc_struct *d2)
+{
+	return d1->a == d2->a && d1->b == d2->b;
+}
+
 static void load_TLS_descriptor(struct thread_struct *t,
 				unsigned int cpu, unsigned int i)
 {
-	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-	xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
-	struct multicall_space mc = __xen_mc_entry(0);
+	struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
+	struct desc_struct *gdt;
+	xmaddr_t maddr;
+	struct multicall_space mc;
+
+	if (desc_equal(shadow, &t->tls_array[i]))
+		return;
+
+	*shadow = t->tls_array[i];
+
+	gdt = get_cpu_gdt_table(cpu);
+	maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+	mc = __xen_mc_entry(0);
 
 	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
 }
@@ -627,8 +656,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
 	/*
 	 * Look for known traps using IST, and substitute them
 	 * appropriately.  The debugger ones are the only ones we care
-	 * about.  Xen will handle faults like double_fault and
-	 * machine_check, so we should never see them.  Warn if
+	 * about.  Xen will handle faults like double_fault,
+	 * so we should never see them.  Warn if
 	 * there's an unexpected IST-using fault handler.
 	 */
 	if (addr == (unsigned long)debug)
@@ -643,7 +672,11 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
 		return 0;
 #ifdef CONFIG_X86_MCE
 	} else if (addr == (unsigned long)machine_check) {
-		return 0;
+		/*
+		 * When the Xen hypervisor injects a vMCE into the guest,
+		 * the native MCE handler is used to handle it.
+		 */
+		;
 #endif
 	} else {
 		/* Some other trap using IST? */
@@ -1437,64 +1470,155 @@ asmlinkage void __init xen_start_kernel(void)
 #endif
 }
 
-static int init_hvm_pv_info(int *major, int *minor)
-{
-	uint32_t eax, ebx, ecx, edx, pages, msr, base;
-	u64 pfn;
-
-	base = xen_cpuid_base();
-	cpuid(base + 1, &eax, &ebx, &ecx, &edx);
-
-	*major = eax >> 16;
-	*minor = eax & 0xffff;
-	printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
-
-	cpuid(base + 2, &pages, &msr, &ecx, &edx);
-
-	pfn = __pa(hypercall_page);
-	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
-
-	xen_setup_features();
-
-	pv_info.name = "Xen HVM";
-
-	xen_domain_type = XEN_HVM_DOMAIN;
+#ifdef CONFIG_XEN_PVHVM
+/*
+ * The pfn containing the shared_info is located somewhere in RAM. This
+ * will cause trouble if the current kernel is doing a kexec boot into a
+ * new kernel. The new kernel (and its startup code) can not know where
+ * the pfn is, so it can not reserve the page. The hypervisor will
+ * continue to update the pfn, and as a result memory corruption occurs
+ * in the new kernel.
+ *
+ * One way to work around this issue is to allocate a page in the
+ * xen-platform pci device's BAR memory range. But pci init is done very
+ * late and the shared_info page is already in use very early to read
+ * the pvclock. So moving the pfn from RAM to MMIO is racy because some
+ * code paths on other vcpus could access the pfn during the small
+ * window when the old pfn is moved to the new pfn. There is even a
+ * small window where the old pfn is not backed by a mfn, and during
+ * that time all reads return -1.
+ *
+ * Because it is not known upfront where the MMIO region is located it
+ * can not be used right from the start in xen_hvm_init_shared_info.
+ *
+ * To minimise trouble the move of the pfn is done shortly before kexec.
+ * This does not eliminate the race because all vcpus are still online
+ * when the syscore_ops will be called. But hopefully there is no work
+ * pending at this point in time. Also the syscore_op is run last which
+ * reduces the risk further.
+ */
 
-	return 0;
-}
+static struct shared_info *xen_hvm_shared_info;
 
-void __ref xen_hvm_init_shared_info(void)
+static void xen_hvm_connect_shared_info(unsigned long pfn)
 {
-	int cpu;
 	struct xen_add_to_physmap xatp;
-	static struct shared_info *shared_info_page = 0;
 
-	if (!shared_info_page)
-		shared_info_page = (struct shared_info *)
-			extend_brk(PAGE_SIZE, PAGE_SIZE);
 	xatp.domid = DOMID_SELF;
 	xatp.idx = 0;
 	xatp.space = XENMAPSPACE_shared_info;
-	xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
+	xatp.gpfn = pfn;
 	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
 		BUG();
 
-	HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
+}
+static void xen_hvm_set_shared_info(struct shared_info *sip)
+{
+	int cpu;
+
+	HYPERVISOR_shared_info = sip;
 
 	/* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
 	 * page, we use it in the event channel upcall and in some pvclock
 	 * related functions. We don't need the vcpu_info placement
 	 * optimizations because we don't use any pv_mmu or pv_irq op on
 	 * HVM.
-	 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
-	 * online but xen_hvm_init_shared_info is run at resume time too and
+	 * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is
+	 * online but xen_hvm_set_shared_info is run at resume time too and
 	 * in that case multiple vcpus might be online. */
 	for_each_online_cpu(cpu) {
 		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
 	}
 }
 
-#ifdef CONFIG_XEN_PVHVM
+/* Reconnect the shared_info pfn to a mfn */
+void xen_hvm_resume_shared_info(void)
+{
+	xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
+}
+
+#ifdef CONFIG_KEXEC
+static struct shared_info *xen_hvm_shared_info_kexec;
+static unsigned long xen_hvm_shared_info_pfn_kexec;
+
+/* Remember a pfn in MMIO space for kexec reboot */
+void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn)
+{
+	xen_hvm_shared_info_kexec = sip;
+	xen_hvm_shared_info_pfn_kexec = pfn;
+}
+
+static void xen_hvm_syscore_shutdown(void)
+{
+	struct xen_memory_reservation reservation = {
+		.domid = DOMID_SELF,
+		.nr_extents = 1,
+	};
+	unsigned long prev_pfn;
+	int rc;
+
+	if (!xen_hvm_shared_info_kexec)
+		return;
+
+	prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;
+	set_xen_guest_handle(reservation.extent_start, &prev_pfn);
+
+	/* Move pfn to MMIO, disconnects previous pfn from mfn */
+	xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec);
+
+	/* Update pointers, following hypercall is also a memory barrier */
+	xen_hvm_set_shared_info(xen_hvm_shared_info_kexec);
+
+	/* Allocate new mfn for previous pfn */
+	do {
+		rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
+		if (rc == 0)
+			msleep(123);
+	} while (rc == 0);
+
+	/* Make sure the previous pfn is really connected to a (new) mfn */
+	BUG_ON(rc != 1);
+}
+
+static struct syscore_ops xen_hvm_syscore_ops = {
+	.shutdown = xen_hvm_syscore_shutdown,
+};
+#endif
+
+/* Use a pfn in RAM, may move to MMIO before kexec. */
+static void __init xen_hvm_init_shared_info(void)
+{
+	/* Remember pointer for resume */
+	xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
+	xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
+	xen_hvm_set_shared_info(xen_hvm_shared_info);
+}
+
+static void __init init_hvm_pv_info(void)
+{
+	int major, minor;
+	uint32_t eax, ebx, ecx, edx, pages, msr, base;
+	u64 pfn;
+
+	base = xen_cpuid_base();
+	cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+
+	major = eax >> 16;
+	minor = eax & 0xffff;
+	printk(KERN_INFO "Xen version %d.%d.\n", major, minor);
+
+	cpuid(base + 2, &pages, &msr, &ecx, &edx);
+
+	pfn = __pa(hypercall_page);
+	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+	xen_setup_features();
+
+	pv_info.name = "Xen HVM";
+
+	xen_domain_type = XEN_HVM_DOMAIN;
+}
+
 static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
 					unsigned long action, void *hcpu)
 {
@@ -1517,14 +1641,12 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
 
 static void __init xen_hvm_guest_init(void)
 {
-	int r;
-	int major, minor;
-
-	r = init_hvm_pv_info(&major, &minor);
-	if (r < 0)
-		return;
+	init_hvm_pv_info();
 
 	xen_hvm_init_shared_info();
+#ifdef CONFIG_KEXEC
+	register_syscore_ops(&xen_hvm_syscore_ops);
+#endif
 
 	if (xen_feature(XENFEAT_hvm_callback_vector))
 		xen_have_vector_callback = 1;
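The effect of the two TLS patches above (desc_equal() plus the shadow_tls_desc copies) can be modelled outside the kernel. The standalone C sketch below is illustrative only: desc and load_tls_slot() are hypothetical stand-ins, and the counter replaces the MULTI_update_descriptor() multicall that the real code skips when a descriptor is unchanged.

/* Standalone sketch of the shadow-copy idea, not kernel code. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct desc { uint32_t a, b; };	/* two 32-bit words, as in desc_struct */

static struct desc shadow_tls[3];	/* per-CPU in the kernel; one CPU here */
static int updates;			/* stands in for issued hypercalls */

static bool desc_equal(const struct desc *d1, const struct desc *d2)
{
	return d1->a == d2->a && d1->b == d2->b;
}

static void load_tls_slot(const struct desc *new, int i)
{
	if (desc_equal(&shadow_tls[i], new))
		return;			/* unchanged: skip the update */
	shadow_tls[i] = *new;
	updates++;			/* would be MULTI_update_descriptor() */
}

int main(void)
{
	struct desc d = { .a = 0x12345678, .b = 0x9abcdef0 };

	load_tls_slot(&d, 0);	/* first task switch: update issued */
	load_tls_slot(&d, 0);	/* same descriptor: update skipped */
	d.a ^= 1;
	load_tls_slot(&d, 0);	/* changed: update issued again */
	printf("updates issued: %d (of 3 task switches)\n", updates);
	return 0;
}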
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3a73785631c..27336dfcda8 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -308,8 +308,20 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
 
 static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
 {
-	if (!xen_batched_set_pte(ptep, pteval))
-		native_set_pte(ptep, pteval);
+	if (!xen_batched_set_pte(ptep, pteval)) {
+		/*
+		 * Could call native_set_pte() here and trap and
+		 * emulate the PTE write but with 32-bit guests this
+		 * needs two traps (one for each of the two 32-bit
+		 * words in the PTE) so do one hypercall directly
+		 * instead.
+		 */
+		struct mmu_update u;
+
+		u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
+		u.val = pte_val_ma(pteval);
+		HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
+	}
 }
 
 static void xen_set_pte(pte_t *ptep, pte_t pteval)
@@ -1416,13 +1428,28 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
 }
 #endif /* CONFIG_X86_64 */
 
-/* Init-time set_pte while constructing initial pagetables, which
-   doesn't allow RO pagetable pages to be remapped RW */
+/*
+ * Init-time set_pte while constructing initial pagetables, which
+ * doesn't allow RO page table pages to be remapped RW.
+ *
+ * If there is no MFN for this PFN then this page is initially
+ * ballooned out so clear the PTE (as in decrease_reservation() in
+ * drivers/xen/balloon.c).
+ *
+ * Many of these PTE updates are done on unpinned and writable pages
+ * and doing a hypercall for these is unnecessary and expensive.  At
+ * this point it is not possible to tell if a page is pinned or not,
+ * so always write the PTE directly and rely on Xen trapping and
+ * emulating any updates as necessary.
+ */
 static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
 {
-	pte = mask_rw_pte(ptep, pte);
+	if (pte_mfn(pte) != INVALID_P2M_ENTRY)
+		pte = mask_rw_pte(ptep, pte);
+	else
+		pte = __pte_ma(0);
 
-	xen_set_pte(ptep, pte);
+	native_set_pte(ptep, pte);
 }
 
 static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
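The trade-off behind the __xen_set_pte() change above, one explicit mmu_update hypercall instead of letting Xen trap and emulate the write, comes from the fact that a 64-bit PTE store on a 32-bit (PAE) guest is compiled as two 32-bit stores, hence two traps. The standalone sketch below models that with a hypothetical trap counter; it is not kernel code.

/* Standalone sketch of the two-trap cost of a 64-bit PTE write. */
#include <stdint.h>
#include <stdio.h>

static int traps;		/* each 32-bit store to a "PTE" = one trap */

static void emulated_store32(volatile uint32_t *word, uint32_t val)
{
	traps++;		/* Xen would fault and emulate this write */
	*word = val;
}

/* What a native 64-bit PTE store amounts to on 32-bit PAE: two writes */
static void native_set_pte_32bit(volatile uint64_t *pte, uint64_t val)
{
	volatile uint32_t *w = (volatile uint32_t *)pte;

	emulated_store32(&w[0], (uint32_t)val);
	emulated_store32(&w[1], (uint32_t)(val >> 32));
}

int main(void)
{
	volatile uint64_t pte = 0;

	native_set_pte_32bit(&pte, 0x8000000012345067ULL);
	printf("trap-and-emulate path: %d traps\n", traps);
	printf("direct HYPERVISOR_mmu_update: 1 hypercall\n");
	return 0;
}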
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index a4790bf22c5..ead85576d54 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -157,25 +157,24 @@ static unsigned long __init xen_populate_chunk(
 	unsigned long dest_pfn;
 
 	for (i = 0, entry = list; i < map_size; i++, entry++) {
-		unsigned long credits = credits_left;
 		unsigned long s_pfn;
 		unsigned long e_pfn;
 		unsigned long pfns;
 		long capacity;
 
-		if (credits <= 0)
+		if (credits_left <= 0)
 			break;
 
 		if (entry->type != E820_RAM)
 			continue;
 
-		e_pfn = PFN_UP(entry->addr + entry->size);
+		e_pfn = PFN_DOWN(entry->addr + entry->size);
 
 		/* We only care about E820 after the xen_start_info->nr_pages */
 		if (e_pfn <= max_pfn)
 			continue;
 
-		s_pfn = PFN_DOWN(entry->addr);
+		s_pfn = PFN_UP(entry->addr);
 		/* If the E820 falls within the nr_pages, we want to start
 		 * at the nr_pages PFN.
 		 * If that would mean going past the E820 entry, skip it
@@ -184,23 +183,19 @@ static unsigned long __init xen_populate_chunk(
 			capacity = e_pfn - max_pfn;
 			dest_pfn = max_pfn;
 		} else {
-			/* last_pfn MUST be within E820_RAM regions */
-			if (*last_pfn && e_pfn >= *last_pfn)
-				s_pfn = *last_pfn;
 			capacity = e_pfn - s_pfn;
 			dest_pfn = s_pfn;
 		}
-		/* If we had filled this E820_RAM entry, go to the next one. */
-		if (capacity <= 0)
-			continue;
 
-		if (credits > capacity)
-			credits = capacity;
+		if (credits_left < capacity)
+			capacity = credits_left;
 
-		pfns = xen_do_chunk(dest_pfn, dest_pfn + credits, false);
+		pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
 		done += pfns;
-		credits_left -= pfns;
 		*last_pfn = (dest_pfn + pfns);
+		if (pfns < capacity)
+			break;
+		credits_left -= pfns;
 	}
 	return done;
 }
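The PFN_UP/PFN_DOWN swap in xen_populate_chunk() above changes the rounding so that only page frames lying entirely inside an E820 RAM entry are populated, which is what keeps released pages out of adjacent non-RAM gaps. A standalone sketch of the arithmetic, using a hypothetical unaligned E820 entry:

/* Standalone sketch of the rounding fix, not kernel code. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

int main(void)
{
	/* hypothetical E820 RAM entry not aligned to page boundaries */
	unsigned long addr = 0x9e800, size = 0x61000;	/* ends at 0xff800 */

	printf("old rounding: pfns %lu..%lu (spills into the gaps)\n",
	       PFN_DOWN(addr), PFN_UP(addr + size));
	printf("new rounding: pfns %lu..%lu (whole pages only)\n",
	       PFN_UP(addr), PFN_DOWN(addr + size));
	return 0;
}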
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 45329c8c226..ae8a00c39de 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled)
 {
 #ifdef CONFIG_XEN_PVHVM
 	int cpu;
-	xen_hvm_init_shared_info();
+	xen_hvm_resume_shared_info();
 	xen_callback_vector();
 	xen_unplug_emulated_devices();
 	if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 202d4c15015..1e4329e04e0 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -41,7 +41,7 @@ void xen_enable_syscall(void);
 void xen_vcpu_restore(void);
 
 void xen_callback_vector(void);
-void xen_hvm_init_shared_info(void);
+void xen_hvm_resume_shared_info(void);
 void xen_unplug_emulated_devices(void);
 
 void __init xen_build_dynamic_phys_to_machine(void);