aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorOlaf Hering <olaf@aepfle.de>2012-07-17 11:43:35 -0400
committerKonrad Rzeszutek Wilk <konrad.wilk@oracle.com>2012-07-19 15:52:05 -0400
commit00e37bdb0113a98408de42db85be002f21dbffd3 (patch)
tree13207109cddbc8c3550659eb67e3345ed6cca9a7
parent4ff2d06255461390ad685843d0d7364aaa6642d2 (diff)
xen PVonHVM: move shared_info to MMIO before kexec
Currently kexec in a PVonHVM guest fails with a triple fault because the new kernel overwrites the shared info page. The exact failure depends on the size of the kernel image. This patch moves the pfn from RAM into MMIO space before the kexec boot. The pfn containing the shared_info is located somewhere in RAM. This will cause trouble if the current kernel is doing a kexec boot into a new kernel. The new kernel (and its startup code) can not know where the pfn is, so it can not reserve the page. The hypervisor will continue to update the pfn, and as a result memory corruption occurs in the new kernel. One way to work around this issue is to allocate a page in the xen-platform pci device's BAR memory range. But pci init is done very late and the shared_info page is already in use very early to read the pvclock. So moving the pfn from RAM to MMIO is racy because some code paths on other vcpus could access the pfn during the small window when the old pfn is moved to the new pfn. There is even a small window where the old pfn is not backed by a mfn, and during that time all reads return -1. Because it is not known upfront where the MMIO region is located, it can not be used right from the start in xen_hvm_init_shared_info. To minimise trouble the move of the pfn is done shortly before kexec. This does not eliminate the race because all vcpus are still online when the syscore_ops will be called. But hopefully there is no work pending at this point in time. Also the syscore_op is run last which reduces the risk further. Signed-off-by: Olaf Hering <olaf@aepfle.de> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
-rw-r--r--arch/x86/xen/enlighten.c118
-rw-r--r--arch/x86/xen/suspend.c2
-rw-r--r--arch/x86/xen/xen-ops.h2
-rw-r--r--drivers/xen/platform-pci.c15
-rw-r--r--include/xen/events.h2
5 files changed, 126 insertions, 13 deletions
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f1814fc2cb77..a6f8acbdfc9a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -31,6 +31,7 @@
31#include <linux/pci.h> 31#include <linux/pci.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/memblock.h> 33#include <linux/memblock.h>
34#include <linux/syscore_ops.h>
34 35
35#include <xen/xen.h> 36#include <xen/xen.h>
36#include <xen/interface/xen.h> 37#include <xen/interface/xen.h>
@@ -1471,38 +1472,130 @@ asmlinkage void __init xen_start_kernel(void)
1471#endif 1472#endif
1472} 1473}
1473 1474
1474void __ref xen_hvm_init_shared_info(void) 1475#ifdef CONFIG_XEN_PVHVM
1476/*
1477 * The pfn containing the shared_info is located somewhere in RAM. This
1478 * will cause trouble if the current kernel is doing a kexec boot into a
1479 * new kernel. The new kernel (and its startup code) can not know where
1480 * the pfn is, so it can not reserve the page. The hypervisor will
1481 * continue to update the pfn, and as a result memory corruption occurs
1482 * in the new kernel.
1483 *
1484 * One way to work around this issue is to allocate a page in the
1485 * xen-platform pci device's BAR memory range. But pci init is done very
1486 * late and the shared_info page is already in use very early to read
1487 * the pvclock. So moving the pfn from RAM to MMIO is racy because some
1488 * code paths on other vcpus could access the pfn during the small
1489 * window when the old pfn is moved to the new pfn. There is even a
1490 * small window where the old pfn is not backed by a mfn, and during that
1491 * time all reads return -1.
1492 *
1493 * Because it is not known upfront where the MMIO region is located it
1494 * can not be used right from the start in xen_hvm_init_shared_info.
1495 *
1496 * To minimise trouble the move of the pfn is done shortly before kexec.
1497 * This does not eliminate the race because all vcpus are still online
1498 * when the syscore_ops will be called. But hopefully there is no work
1499 * pending at this point in time. Also the syscore_op is run last which
1500 * reduces the risk further.
1501 */
1502
1503static struct shared_info *xen_hvm_shared_info;
1504
1505static void xen_hvm_connect_shared_info(unsigned long pfn)
1475{ 1506{
1476 int cpu;
1477 struct xen_add_to_physmap xatp; 1507 struct xen_add_to_physmap xatp;
1478 static struct shared_info *shared_info_page = 0;
1479 1508
1480 if (!shared_info_page)
1481 shared_info_page = (struct shared_info *)
1482 extend_brk(PAGE_SIZE, PAGE_SIZE);
1483 xatp.domid = DOMID_SELF; 1509 xatp.domid = DOMID_SELF;
1484 xatp.idx = 0; 1510 xatp.idx = 0;
1485 xatp.space = XENMAPSPACE_shared_info; 1511 xatp.space = XENMAPSPACE_shared_info;
1486 xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; 1512 xatp.gpfn = pfn;
1487 if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) 1513 if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
1488 BUG(); 1514 BUG();
1489 1515
1490 HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; 1516}
1517static void xen_hvm_set_shared_info(struct shared_info *sip)
1518{
1519 int cpu;
1520
1521 HYPERVISOR_shared_info = sip;
1491 1522
1492 /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info 1523 /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
1493 * page, we use it in the event channel upcall and in some pvclock 1524 * page, we use it in the event channel upcall and in some pvclock
1494 * related functions. We don't need the vcpu_info placement 1525 * related functions. We don't need the vcpu_info placement
1495 * optimizations because we don't use any pv_mmu or pv_irq op on 1526 * optimizations because we don't use any pv_mmu or pv_irq op on
1496 * HVM. 1527 * HVM.
1497 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is 1528 * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is
1498 * online but xen_hvm_init_shared_info is run at resume time too and 1529 * online but xen_hvm_set_shared_info is run at resume time too and
1499 * in that case multiple vcpus might be online. */ 1530 * in that case multiple vcpus might be online. */
1500 for_each_online_cpu(cpu) { 1531 for_each_online_cpu(cpu) {
1501 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; 1532 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
1502 } 1533 }
1503} 1534}
1504 1535
1505#ifdef CONFIG_XEN_PVHVM 1536/* Reconnect the shared_info pfn to a mfn */
1537void xen_hvm_resume_shared_info(void)
1538{
1539 xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
1540}
1541
1542#ifdef CONFIG_KEXEC
1543static struct shared_info *xen_hvm_shared_info_kexec;
1544static unsigned long xen_hvm_shared_info_pfn_kexec;
1545
1546/* Remember a pfn in MMIO space for kexec reboot */
1547void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn)
1548{
1549 xen_hvm_shared_info_kexec = sip;
1550 xen_hvm_shared_info_pfn_kexec = pfn;
1551}
1552
1553static void xen_hvm_syscore_shutdown(void)
1554{
1555 struct xen_memory_reservation reservation = {
1556 .domid = DOMID_SELF,
1557 .nr_extents = 1,
1558 };
1559 unsigned long prev_pfn;
1560 int rc;
1561
1562 if (!xen_hvm_shared_info_kexec)
1563 return;
1564
1565 prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;
1566 set_xen_guest_handle(reservation.extent_start, &prev_pfn);
1567
1568 /* Move pfn to MMIO, disconnects previous pfn from mfn */
1569 xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec);
1570
1571 /* Update pointers, following hypercall is also a memory barrier */
1572 xen_hvm_set_shared_info(xen_hvm_shared_info_kexec);
1573
1574 /* Allocate new mfn for previous pfn */
1575 do {
1576 rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
1577 if (rc == 0)
1578 msleep(123);
1579 } while (rc == 0);
1580
1581 /* Make sure the previous pfn is really connected to a (new) mfn */
1582 BUG_ON(rc != 1);
1583}
1584
1585static struct syscore_ops xen_hvm_syscore_ops = {
1586 .shutdown = xen_hvm_syscore_shutdown,
1587};
1588#endif
1589
1590/* Use a pfn in RAM, may move to MMIO before kexec. */
1591static void __init xen_hvm_init_shared_info(void)
1592{
1593 /* Remember pointer for resume */
1594 xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
1595 xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
1596 xen_hvm_set_shared_info(xen_hvm_shared_info);
1597}
1598
1506static void __init init_hvm_pv_info(void) 1599static void __init init_hvm_pv_info(void)
1507{ 1600{
1508 int major, minor; 1601 int major, minor;
@@ -1553,6 +1646,9 @@ static void __init xen_hvm_guest_init(void)
1553 init_hvm_pv_info(); 1646 init_hvm_pv_info();
1554 1647
1555 xen_hvm_init_shared_info(); 1648 xen_hvm_init_shared_info();
1649#ifdef CONFIG_KEXEC
1650 register_syscore_ops(&xen_hvm_syscore_ops);
1651#endif
1556 1652
1557 if (xen_feature(XENFEAT_hvm_callback_vector)) 1653 if (xen_feature(XENFEAT_hvm_callback_vector))
1558 xen_have_vector_callback = 1; 1654 xen_have_vector_callback = 1;
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 45329c8c226e..ae8a00c39de4 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled)
30{ 30{
31#ifdef CONFIG_XEN_PVHVM 31#ifdef CONFIG_XEN_PVHVM
32 int cpu; 32 int cpu;
33 xen_hvm_init_shared_info(); 33 xen_hvm_resume_shared_info();
34 xen_callback_vector(); 34 xen_callback_vector();
35 xen_unplug_emulated_devices(); 35 xen_unplug_emulated_devices();
36 if (xen_feature(XENFEAT_hvm_safe_pvclock)) { 36 if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 202d4c150154..1e4329e04e0f 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -41,7 +41,7 @@ void xen_enable_syscall(void);
41void xen_vcpu_restore(void); 41void xen_vcpu_restore(void);
42 42
43void xen_callback_vector(void); 43void xen_callback_vector(void);
44void xen_hvm_init_shared_info(void); 44void xen_hvm_resume_shared_info(void);
45void xen_unplug_emulated_devices(void); 45void xen_unplug_emulated_devices(void);
46 46
47void __init xen_build_dynamic_phys_to_machine(void); 47void __init xen_build_dynamic_phys_to_machine(void);
diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c
index 97ca359ae2bd..d4c50d63acbc 100644
--- a/drivers/xen/platform-pci.c
+++ b/drivers/xen/platform-pci.c
@@ -101,6 +101,19 @@ static int platform_pci_resume(struct pci_dev *pdev)
101 return 0; 101 return 0;
102} 102}
103 103
104static void __devinit prepare_shared_info(void)
105{
106#ifdef CONFIG_KEXEC
107 unsigned long addr;
108 struct shared_info *hvm_shared_info;
109
110 addr = alloc_xen_mmio(PAGE_SIZE);
111 hvm_shared_info = ioremap(addr, PAGE_SIZE);
112 memset(hvm_shared_info, 0, PAGE_SIZE);
113 xen_hvm_prepare_kexec(hvm_shared_info, addr >> PAGE_SHIFT);
114#endif
115}
116
104static int __devinit platform_pci_init(struct pci_dev *pdev, 117static int __devinit platform_pci_init(struct pci_dev *pdev,
105 const struct pci_device_id *ent) 118 const struct pci_device_id *ent)
106{ 119{
@@ -138,6 +151,8 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
138 platform_mmio = mmio_addr; 151 platform_mmio = mmio_addr;
139 platform_mmiolen = mmio_len; 152 platform_mmiolen = mmio_len;
140 153
154 prepare_shared_info();
155
141 if (!xen_have_vector_callback) { 156 if (!xen_have_vector_callback) {
142 ret = xen_allocate_irq(pdev); 157 ret = xen_allocate_irq(pdev);
143 if (ret) { 158 if (ret) {
diff --git a/include/xen/events.h b/include/xen/events.h
index 04399b28e821..9c641deb65d2 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -58,6 +58,8 @@ void notify_remote_via_irq(int irq);
58 58
59void xen_irq_resume(void); 59void xen_irq_resume(void);
60 60
61void xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn);
62
61/* Clear an irq's pending state, in preparation for polling on it */ 63/* Clear an irq's pending state, in preparation for polling on it */
62void xen_clear_irq_pending(int irq); 64void xen_clear_irq_pending(int irq);
63void xen_set_irq_pending(int irq); 65void xen_set_irq_pending(int irq);