author     Linus Torvalds <torvalds@linux-foundation.org>   2012-07-24 16:14:03 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-07-24 16:14:03 -0400
commit     62c4d9afa4bcf5315e2745a17a0228bf65b9ba40 (patch)
tree       a7b9d97283441ea5f0c738fa388e120c4c1491b6 /arch/x86/xen/enlighten.c
parent     5fecc9d8f59e765c2a48379dd7c6f5cf88c7d75a (diff)
parent     c3d93f880197953f86ab90d9da4744e926b38e33 (diff)
Merge tag 'stable/for-linus-3.6-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen
Pull Xen update from Konrad Rzeszutek Wilk:
"Features:
* Performance improvement to lower the amount of traps the hypervisor
has to do 32-bit guests. Mainly for setting PTE entries and
updating TLS descriptors.
* MCE polling driver to collect hypervisor MCE buffer and present
them to /dev/mcelog.
* Physical CPU online/offline support. When an privileged guest is
booted it is present with virtual CPUs, which might have an 1:1 to
physical CPUs but usually don't. This provides mechanism to
offline/online physical CPUs.
Bug-fixes for:
* Coverity found fixes in the console and ACPI processor driver.
* PVonHVM kexec fixes along with some cleanups.
* Pages that fall within E820 gaps and non-RAM regions (and had been
released to hypervisor) would be populated back, but potentially in
non-RAM regions."
* tag 'stable/for-linus-3.6-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen:
xen: populate correct number of pages when across mem boundary (v2)
xen PVonHVM: move shared_info to MMIO before kexec
xen: simplify init_hvm_pv_info
xen: remove cast from HYPERVISOR_shared_info assignment
xen: enable platform-pci only in a Xen guest
xen/pv-on-hvm kexec: shutdown watches from old kernel
xen/x86: avoid updating TLS descriptors if they haven't changed
xen/x86: add desc_equal() to compare GDT descriptors
xen/mm: zero PTEs for non-present MFNs in the initial page table
xen/mm: do direct hypercall in xen_set_pte() if batching is unavailable
xen/hvc: Fix up checks when the info is allocated.
xen/acpi: Fix potential memory leak.
xen/mce: add .poll method for mcelog device driver
xen/mce: schedule a workqueue to avoid sleep in atomic context
xen/pcpu: Xen physical cpus online/offline sys interface
xen/mce: Register native mce handler as vMCE bounce back point
x86, MCE, AMD: Adjust initcall sequence for xen
xen/mce: Add mcelog support for Xen platform
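As referenced above, the TLS change ("xen/x86: avoid updating TLS
descriptors if they haven't changed") works by keeping a shadow copy of
the last descriptor sent to the hypervisor and skipping the update
hypercall when nothing changed. Below is a minimal, self-contained C
sketch of that pattern; struct desc, expensive_update() and load_tls()
are illustrative stand-ins for the kernel's desc_struct, the
update_descriptor hypercall and load_TLS_descriptor(), not real API:

#include <stdbool.h>

/* Stand-in for the kernel's desc_struct: two 32-bit words per entry. */
struct desc {
        unsigned int a, b;
};

/* Last values actually sent (per CPU in the real code; a single set
 * here to keep the sketch short). */
static struct desc shadow_tls[3];

static bool desc_equal(const struct desc *d1, const struct desc *d2)
{
        return d1->a == d2->a && d1->b == d2->b;
}

/* Stand-in for the update_descriptor hypercall, i.e. the trap we
 * want to avoid. */
static void expensive_update(int i, const struct desc *d)
{
        (void)i;
        (void)d;
}

/* Called on every task switch: only descriptors that differ from
 * their shadow copy cause an expensive update. */
void load_tls(const struct desc next[3])
{
        for (int i = 0; i < 3; i++) {
                if (desc_equal(&shadow_tls[i], &next[i]))
                        continue;       /* unchanged: skip the trap */
                shadow_tls[i] = next[i];
                expensive_update(i, &next[i]);
        }
}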
Diffstat (limited to 'arch/x86/xen/enlighten.c')
-rw-r--r--  arch/x86/xen/enlighten.c | 224
1 file changed, 173 insertions(+), 51 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index ed7d54985d0c..bf4bda6d3e9a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -31,6 +31,7 @@
 #include <linux/pci.h>
 #include <linux/gfp.h>
 #include <linux/memblock.h>
+#include <linux/syscore_ops.h>
 
 #include <xen/xen.h>
 #include <xen/interface/xen.h>
@@ -38,6 +39,7 @@
 #include <xen/interface/physdev.h>
 #include <xen/interface/vcpu.h>
 #include <xen/interface/memory.h>
+#include <xen/interface/xen-mca.h>
 #include <xen/features.h>
 #include <xen/page.h>
 #include <xen/hvm.h>
@@ -107,7 +109,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback);
  * Point at some empty memory to start with. We map the real shared_info
  * page as soon as fixmap is up and running.
  */
-struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
+struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
 
 /*
  * Flag to determine whether vcpu info placement is available on all
@@ -124,6 +126,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
  */
 static int have_vcpu_info_placement = 1;
 
+struct tls_descs {
+        struct desc_struct desc[3];
+};
+
+/*
+ * Updating the 3 TLS descriptors in the GDT on every task switch is
+ * surprisingly expensive so we avoid updating them if they haven't
+ * changed. Since Xen writes different descriptors than the one
+ * passed in the update_descriptor hypercall we keep shadow copies to
+ * compare against.
+ */
+static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
+
 static void clamp_max_cpus(void)
 {
 #ifdef CONFIG_SMP
@@ -341,9 +356,7 @@ static void __init xen_init_cpuid_mask(void)
         unsigned int xsave_mask;
 
         cpuid_leaf1_edx_mask =
-                ~((1 << X86_FEATURE_MCE)  | /* disable MCE */
-                  (1 << X86_FEATURE_MCA)  | /* disable MCA */
-                  (1 << X86_FEATURE_MTRR) | /* disable MTRR */
+                ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */
                   (1 << X86_FEATURE_ACC)); /* thermal monitoring */
 
         if (!xen_initial_domain())
@@ -540,12 +553,28 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
                 BUG();
 }
 
+static inline bool desc_equal(const struct desc_struct *d1,
+                              const struct desc_struct *d2)
+{
+        return d1->a == d2->a && d1->b == d2->b;
+}
+
 static void load_TLS_descriptor(struct thread_struct *t,
                                 unsigned int cpu, unsigned int i)
 {
-        struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-        xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
-        struct multicall_space mc = __xen_mc_entry(0);
+        struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
+        struct desc_struct *gdt;
+        xmaddr_t maddr;
+        struct multicall_space mc;
+
+        if (desc_equal(shadow, &t->tls_array[i]))
+                return;
+
+        *shadow = t->tls_array[i];
+
+        gdt = get_cpu_gdt_table(cpu);
+        maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+        mc = __xen_mc_entry(0);
 
         MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
 }
@@ -627,8 +656,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
         /*
          * Look for known traps using IST, and substitute them
          * appropriately. The debugger ones are the only ones we care
-         * about. Xen will handle faults like double_fault and
-         * machine_check, so we should never see them. Warn if
+         * about. Xen will handle faults like double_fault,
+         * so we should never see them. Warn if
          * there's an unexpected IST-using fault handler.
          */
         if (addr == (unsigned long)debug)
@@ -643,7 +672,11 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
                 return 0;
 #ifdef CONFIG_X86_MCE
         } else if (addr == (unsigned long)machine_check) {
-                return 0;
+                /*
+                 * when xen hypervisor inject vMCE to guest,
+                 * use native mce handler to handle it
+                 */
+                ;
 #endif
         } else {
                 /* Some other trap using IST? */
@@ -1437,64 +1470,155 @@ asmlinkage void __init xen_start_kernel(void)
 #endif
 }
 
-static int init_hvm_pv_info(int *major, int *minor)
-{
-        uint32_t eax, ebx, ecx, edx, pages, msr, base;
-        u64 pfn;
-
-        base = xen_cpuid_base();
-        cpuid(base + 1, &eax, &ebx, &ecx, &edx);
-
-        *major = eax >> 16;
-        *minor = eax & 0xffff;
-        printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
-
-        cpuid(base + 2, &pages, &msr, &ecx, &edx);
-
-        pfn = __pa(hypercall_page);
-        wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
-
-        xen_setup_features();
-
-        pv_info.name = "Xen HVM";
-
-        xen_domain_type = XEN_HVM_DOMAIN;
+#ifdef CONFIG_XEN_PVHVM
+/*
+ * The pfn containing the shared_info is located somewhere in RAM. This
+ * will cause trouble if the current kernel is doing a kexec boot into a
+ * new kernel. The new kernel (and its startup code) can not know where
+ * the pfn is, so it can not reserve the page. The hypervisor will
+ * continue to update the pfn, and as a result memory corruption occours
+ * in the new kernel.
+ *
+ * One way to work around this issue is to allocate a page in the
+ * xen-platform pci device's BAR memory range. But pci init is done very
+ * late and the shared_info page is already in use very early to read
+ * the pvclock. So moving the pfn from RAM to MMIO is racy because some
+ * code paths on other vcpus could access the pfn during the small
+ * window when the old pfn is moved to the new pfn. There is even a
+ * small window were the old pfn is not backed by a mfn, and during that
+ * time all reads return -1.
+ *
+ * Because it is not known upfront where the MMIO region is located it
+ * can not be used right from the start in xen_hvm_init_shared_info.
+ *
+ * To minimise trouble the move of the pfn is done shortly before kexec.
+ * This does not eliminate the race because all vcpus are still online
+ * when the syscore_ops will be called. But hopefully there is no work
+ * pending at this point in time. Also the syscore_op is run last which
+ * reduces the risk further.
+ */
 
-        return 0;
-}
+static struct shared_info *xen_hvm_shared_info;
 
-void __ref xen_hvm_init_shared_info(void)
+static void xen_hvm_connect_shared_info(unsigned long pfn)
 {
-        int cpu;
         struct xen_add_to_physmap xatp;
-        static struct shared_info *shared_info_page = 0;
 
-        if (!shared_info_page)
-                shared_info_page = (struct shared_info *)
-                        extend_brk(PAGE_SIZE, PAGE_SIZE);
         xatp.domid = DOMID_SELF;
         xatp.idx = 0;
         xatp.space = XENMAPSPACE_shared_info;
-        xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
+        xatp.gpfn = pfn;
         if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
                 BUG();
 
-        HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
+}
+static void xen_hvm_set_shared_info(struct shared_info *sip)
+{
+        int cpu;
+
+        HYPERVISOR_shared_info = sip;
 
         /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
          * page, we use it in the event channel upcall and in some pvclock
          * related functions. We don't need the vcpu_info placement
          * optimizations because we don't use any pv_mmu or pv_irq op on
          * HVM.
-         * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
-         * online but xen_hvm_init_shared_info is run at resume time too and
+         * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is
+         * online but xen_hvm_set_shared_info is run at resume time too and
          * in that case multiple vcpus might be online. */
         for_each_online_cpu(cpu) {
                 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
         }
 }
 
-#ifdef CONFIG_XEN_PVHVM
+/* Reconnect the shared_info pfn to a mfn */
+void xen_hvm_resume_shared_info(void)
+{
+        xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
+}
+
+#ifdef CONFIG_KEXEC
+static struct shared_info *xen_hvm_shared_info_kexec;
+static unsigned long xen_hvm_shared_info_pfn_kexec;
+
+/* Remember a pfn in MMIO space for kexec reboot */
+void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn)
+{
+        xen_hvm_shared_info_kexec = sip;
+        xen_hvm_shared_info_pfn_kexec = pfn;
+}
+
+static void xen_hvm_syscore_shutdown(void)
+{
+        struct xen_memory_reservation reservation = {
+                .domid = DOMID_SELF,
+                .nr_extents = 1,
+        };
+        unsigned long prev_pfn;
+        int rc;
+
+        if (!xen_hvm_shared_info_kexec)
+                return;
+
+        prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;
+        set_xen_guest_handle(reservation.extent_start, &prev_pfn);
+
+        /* Move pfn to MMIO, disconnects previous pfn from mfn */
+        xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec);
+
+        /* Update pointers, following hypercall is also a memory barrier */
+        xen_hvm_set_shared_info(xen_hvm_shared_info_kexec);
+
+        /* Allocate new mfn for previous pfn */
+        do {
+                rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
+                if (rc == 0)
+                        msleep(123);
+        } while (rc == 0);
+
+        /* Make sure the previous pfn is really connected to a (new) mfn */
+        BUG_ON(rc != 1);
+}
+
+static struct syscore_ops xen_hvm_syscore_ops = {
+        .shutdown = xen_hvm_syscore_shutdown,
+};
+#endif
+
+/* Use a pfn in RAM, may move to MMIO before kexec. */
+static void __init xen_hvm_init_shared_info(void)
+{
+        /* Remember pointer for resume */
+        xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
+        xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
+        xen_hvm_set_shared_info(xen_hvm_shared_info);
+}
+
+static void __init init_hvm_pv_info(void)
+{
+        int major, minor;
+        uint32_t eax, ebx, ecx, edx, pages, msr, base;
+        u64 pfn;
+
+        base = xen_cpuid_base();
+        cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+
+        major = eax >> 16;
+        minor = eax & 0xffff;
+        printk(KERN_INFO "Xen version %d.%d.\n", major, minor);
+
+        cpuid(base + 2, &pages, &msr, &ecx, &edx);
+
+        pfn = __pa(hypercall_page);
+        wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+        xen_setup_features();
+
+        pv_info.name = "Xen HVM";
+
+        xen_domain_type = XEN_HVM_DOMAIN;
+}
+
 static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
                                         unsigned long action, void *hcpu)
 {
@@ -1517,14 +1641,12 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
 
 static void __init xen_hvm_guest_init(void)
 {
-        int r;
-        int major, minor;
-
-        r = init_hvm_pv_info(&major, &minor);
-        if (r < 0)
-                return;
+        init_hvm_pv_info();
 
         xen_hvm_init_shared_info();
+#ifdef CONFIG_KEXEC
+        register_syscore_ops(&xen_hvm_syscore_ops);
+#endif
 
         if (xen_feature(XENFEAT_hvm_callback_vector))
                 xen_have_vector_callback = 1;
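
The kexec-related hunks above hinge on the syscore shutdown hook, which
the kernel invokes once, late in the shutdown/kexec path, after normal
device teardown. For readers unfamiliar with that interface, here is a
minimal sketch of registering such a hook; struct syscore_ops and
register_syscore_ops() are the real kernel interface used in the diff,
while example_shutdown() and its body are placeholders:

#include <linux/init.h>
#include <linux/syscore_ops.h>

/* Invoked once, late in shutdown or kexec, after devices have been
 * quiesced. */
static void example_shutdown(void)
{
        /* last-minute work, e.g. handing resources back to a
         * hypervisor, as xen_hvm_syscore_shutdown() does above */
}

static struct syscore_ops example_syscore_ops = {
        .shutdown = example_shutdown,
};

static int __init example_register(void)
{
        register_syscore_ops(&example_syscore_ops);
        return 0;
}
late_initcall(example_register);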