aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/xen/enlighten.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-07-24 16:14:03 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2012-07-24 16:14:03 -0400
commit62c4d9afa4bcf5315e2745a17a0228bf65b9ba40 (patch)
treea7b9d97283441ea5f0c738fa388e120c4c1491b6 /arch/x86/xen/enlighten.c
parent5fecc9d8f59e765c2a48379dd7c6f5cf88c7d75a (diff)
parentc3d93f880197953f86ab90d9da4744e926b38e33 (diff)
Merge tag 'stable/for-linus-3.6-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen
Pull Xen update from Konrad Rzeszutek Wilk: "Features: * Performance improvement to lower the number of traps the hypervisor has to handle for 32-bit guests. Mainly for setting PTE entries and updating TLS descriptors. * MCE polling driver to collect the hypervisor MCE buffer and present it to /dev/mcelog. * Physical CPU online/offline support. When a privileged guest is booted it is presented with virtual CPUs, which might have a 1:1 mapping to physical CPUs but usually don't. This provides a mechanism to offline/online physical CPUs. Bug-fixes for: * Coverity found fixes in the console and ACPI processor driver. * PVonHVM kexec fixes along with some cleanups. * Pages that fall within E820 gaps and non-RAM regions (and had been released to hypervisor) would be populated back, but potentially in non-RAM regions." * tag 'stable/for-linus-3.6-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen: xen: populate correct number of pages when across mem boundary (v2) xen PVonHVM: move shared_info to MMIO before kexec xen: simplify init_hvm_pv_info xen: remove cast from HYPERVISOR_shared_info assignment xen: enable platform-pci only in a Xen guest xen/pv-on-hvm kexec: shutdown watches from old kernel xen/x86: avoid updating TLS descriptors if they haven't changed xen/x86: add desc_equal() to compare GDT descriptors xen/mm: zero PTEs for non-present MFNs in the initial page table xen/mm: do direct hypercall in xen_set_pte() if batching is unavailable xen/hvc: Fix up checks when the info is allocated. xen/acpi: Fix potential memory leak. xen/mce: add .poll method for mcelog device driver xen/mce: schedule a workqueue to avoid sleep in atomic context xen/pcpu: Xen physical cpus online/offline sys interface xen/mce: Register native mce handler as vMCE bounce back point x86, MCE, AMD: Adjust initcall sequence for xen xen/mce: Add mcelog support for Xen platform
Diffstat (limited to 'arch/x86/xen/enlighten.c')
-rw-r--r--arch/x86/xen/enlighten.c224
1 files changed, 173 insertions, 51 deletions
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index ed7d54985d0c..bf4bda6d3e9a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -31,6 +31,7 @@
31#include <linux/pci.h> 31#include <linux/pci.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/memblock.h> 33#include <linux/memblock.h>
34#include <linux/syscore_ops.h>
34 35
35#include <xen/xen.h> 36#include <xen/xen.h>
36#include <xen/interface/xen.h> 37#include <xen/interface/xen.h>
@@ -38,6 +39,7 @@
38#include <xen/interface/physdev.h> 39#include <xen/interface/physdev.h>
39#include <xen/interface/vcpu.h> 40#include <xen/interface/vcpu.h>
40#include <xen/interface/memory.h> 41#include <xen/interface/memory.h>
42#include <xen/interface/xen-mca.h>
41#include <xen/features.h> 43#include <xen/features.h>
42#include <xen/page.h> 44#include <xen/page.h>
43#include <xen/hvm.h> 45#include <xen/hvm.h>
@@ -107,7 +109,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback);
107 * Point at some empty memory to start with. We map the real shared_info 109 * Point at some empty memory to start with. We map the real shared_info
108 * page as soon as fixmap is up and running. 110 * page as soon as fixmap is up and running.
109 */ 111 */
110struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; 112struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
111 113
112/* 114/*
113 * Flag to determine whether vcpu info placement is available on all 115 * Flag to determine whether vcpu info placement is available on all
@@ -124,6 +126,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
124 */ 126 */
125static int have_vcpu_info_placement = 1; 127static int have_vcpu_info_placement = 1;
126 128
129struct tls_descs {
130 struct desc_struct desc[3];
131};
132
133/*
134 * Updating the 3 TLS descriptors in the GDT on every task switch is
135 * surprisingly expensive so we avoid updating them if they haven't
136 * changed. Since Xen writes different descriptors than the one
137 * passed in the update_descriptor hypercall we keep shadow copies to
138 * compare against.
139 */
140static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
141
127static void clamp_max_cpus(void) 142static void clamp_max_cpus(void)
128{ 143{
129#ifdef CONFIG_SMP 144#ifdef CONFIG_SMP
@@ -341,9 +356,7 @@ static void __init xen_init_cpuid_mask(void)
341 unsigned int xsave_mask; 356 unsigned int xsave_mask;
342 357
343 cpuid_leaf1_edx_mask = 358 cpuid_leaf1_edx_mask =
344 ~((1 << X86_FEATURE_MCE) | /* disable MCE */ 359 ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */
345 (1 << X86_FEATURE_MCA) | /* disable MCA */
346 (1 << X86_FEATURE_MTRR) | /* disable MTRR */
347 (1 << X86_FEATURE_ACC)); /* thermal monitoring */ 360 (1 << X86_FEATURE_ACC)); /* thermal monitoring */
348 361
349 if (!xen_initial_domain()) 362 if (!xen_initial_domain())
@@ -540,12 +553,28 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
540 BUG(); 553 BUG();
541} 554}
542 555
556static inline bool desc_equal(const struct desc_struct *d1,
557 const struct desc_struct *d2)
558{
559 return d1->a == d2->a && d1->b == d2->b;
560}
561
543static void load_TLS_descriptor(struct thread_struct *t, 562static void load_TLS_descriptor(struct thread_struct *t,
544 unsigned int cpu, unsigned int i) 563 unsigned int cpu, unsigned int i)
545{ 564{
546 struct desc_struct *gdt = get_cpu_gdt_table(cpu); 565 struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
547 xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); 566 struct desc_struct *gdt;
548 struct multicall_space mc = __xen_mc_entry(0); 567 xmaddr_t maddr;
568 struct multicall_space mc;
569
570 if (desc_equal(shadow, &t->tls_array[i]))
571 return;
572
573 *shadow = t->tls_array[i];
574
575 gdt = get_cpu_gdt_table(cpu);
576 maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
577 mc = __xen_mc_entry(0);
549 578
550 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); 579 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
551} 580}
@@ -627,8 +656,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
627 /* 656 /*
628 * Look for known traps using IST, and substitute them 657 * Look for known traps using IST, and substitute them
629 * appropriately. The debugger ones are the only ones we care 658 * appropriately. The debugger ones are the only ones we care
630 * about. Xen will handle faults like double_fault and 659 * about. Xen will handle faults like double_fault,
631 * machine_check, so we should never see them. Warn if 660 * so we should never see them. Warn if
632 * there's an unexpected IST-using fault handler. 661 * there's an unexpected IST-using fault handler.
633 */ 662 */
634 if (addr == (unsigned long)debug) 663 if (addr == (unsigned long)debug)
@@ -643,7 +672,11 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
643 return 0; 672 return 0;
644#ifdef CONFIG_X86_MCE 673#ifdef CONFIG_X86_MCE
645 } else if (addr == (unsigned long)machine_check) { 674 } else if (addr == (unsigned long)machine_check) {
646 return 0; 675 /*
676 * When the Xen hypervisor injects a vMCE into the guest,
677 * use the native MCE handler to handle it.
678 */
679 ;
647#endif 680#endif
648 } else { 681 } else {
649 /* Some other trap using IST? */ 682 /* Some other trap using IST? */
@@ -1437,64 +1470,155 @@ asmlinkage void __init xen_start_kernel(void)
1437#endif 1470#endif
1438} 1471}
1439 1472
1440static int init_hvm_pv_info(int *major, int *minor) 1473#ifdef CONFIG_XEN_PVHVM
1441{ 1474/*
1442 uint32_t eax, ebx, ecx, edx, pages, msr, base; 1475 * The pfn containing the shared_info is located somewhere in RAM. This
1443 u64 pfn; 1476 * will cause trouble if the current kernel is doing a kexec boot into a
1444 1477 * new kernel. The new kernel (and its startup code) can not know where
1445 base = xen_cpuid_base(); 1478 * the pfn is, so it can not reserve the page. The hypervisor will
1446 cpuid(base + 1, &eax, &ebx, &ecx, &edx); 1479 * continue to update the pfn, and as a result memory corruption occurs
1447 1480 * in the new kernel.
1448 *major = eax >> 16; 1481 *
1449 *minor = eax & 0xffff; 1482 * One way to work around this issue is to allocate a page in the
1450 printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor); 1483 * xen-platform pci device's BAR memory range. But pci init is done very
1451 1484 * late and the shared_info page is already in use very early to read
1452 cpuid(base + 2, &pages, &msr, &ecx, &edx); 1485 * the pvclock. So moving the pfn from RAM to MMIO is racy because some
1453 1486 * code paths on other vcpus could access the pfn during the small
1454 pfn = __pa(hypercall_page); 1487 * window when the old pfn is moved to the new pfn. There is even a
1455 wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); 1488 * small window where the old pfn is not backed by a mfn, and during that
1456 1489 * time all reads return -1.
1457 xen_setup_features(); 1490 *
1458 1491 * Because it is not known upfront where the MMIO region is located it
1459 pv_info.name = "Xen HVM"; 1492 * can not be used right from the start in xen_hvm_init_shared_info.
1460 1493 *
1461 xen_domain_type = XEN_HVM_DOMAIN; 1494 * To minimise trouble the move of the pfn is done shortly before kexec.
1495 * This does not eliminate the race because all vcpus are still online
1496 * when the syscore_ops will be called. But hopefully there is no work
1497 * pending at this point in time. Also the syscore_op is run last which
1498 * reduces the risk further.
1499 */
1462 1500
1463 return 0; 1501static struct shared_info *xen_hvm_shared_info;
1464}
1465 1502
1466void __ref xen_hvm_init_shared_info(void) 1503static void xen_hvm_connect_shared_info(unsigned long pfn)
1467{ 1504{
1468 int cpu;
1469 struct xen_add_to_physmap xatp; 1505 struct xen_add_to_physmap xatp;
1470 static struct shared_info *shared_info_page = 0;
1471 1506
1472 if (!shared_info_page)
1473 shared_info_page = (struct shared_info *)
1474 extend_brk(PAGE_SIZE, PAGE_SIZE);
1475 xatp.domid = DOMID_SELF; 1507 xatp.domid = DOMID_SELF;
1476 xatp.idx = 0; 1508 xatp.idx = 0;
1477 xatp.space = XENMAPSPACE_shared_info; 1509 xatp.space = XENMAPSPACE_shared_info;
1478 xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; 1510 xatp.gpfn = pfn;
1479 if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) 1511 if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
1480 BUG(); 1512 BUG();
1481 1513
1482 HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; 1514}
1515static void xen_hvm_set_shared_info(struct shared_info *sip)
1516{
1517 int cpu;
1518
1519 HYPERVISOR_shared_info = sip;
1483 1520
1484 /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info 1521 /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
1485 * page, we use it in the event channel upcall and in some pvclock 1522 * page, we use it in the event channel upcall and in some pvclock
1486 * related functions. We don't need the vcpu_info placement 1523 * related functions. We don't need the vcpu_info placement
1487 * optimizations because we don't use any pv_mmu or pv_irq op on 1524 * optimizations because we don't use any pv_mmu or pv_irq op on
1488 * HVM. 1525 * HVM.
1489 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is 1526 * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is
1490 * online but xen_hvm_init_shared_info is run at resume time too and 1527 * online but xen_hvm_set_shared_info is run at resume time too and
1491 * in that case multiple vcpus might be online. */ 1528 * in that case multiple vcpus might be online. */
1492 for_each_online_cpu(cpu) { 1529 for_each_online_cpu(cpu) {
1493 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; 1530 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
1494 } 1531 }
1495} 1532}
1496 1533
1497#ifdef CONFIG_XEN_PVHVM 1534/* Reconnect the shared_info pfn to a mfn */
1535void xen_hvm_resume_shared_info(void)
1536{
1537 xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
1538}
1539
1540#ifdef CONFIG_KEXEC
1541static struct shared_info *xen_hvm_shared_info_kexec;
1542static unsigned long xen_hvm_shared_info_pfn_kexec;
1543
1544/* Remember a pfn in MMIO space for kexec reboot */
1545void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn)
1546{
1547 xen_hvm_shared_info_kexec = sip;
1548 xen_hvm_shared_info_pfn_kexec = pfn;
1549}
1550
1551static void xen_hvm_syscore_shutdown(void)
1552{
1553 struct xen_memory_reservation reservation = {
1554 .domid = DOMID_SELF,
1555 .nr_extents = 1,
1556 };
1557 unsigned long prev_pfn;
1558 int rc;
1559
1560 if (!xen_hvm_shared_info_kexec)
1561 return;
1562
1563 prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;
1564 set_xen_guest_handle(reservation.extent_start, &prev_pfn);
1565
1566 /* Move pfn to MMIO, disconnects previous pfn from mfn */
1567 xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec);
1568
1569 /* Update pointers, following hypercall is also a memory barrier */
1570 xen_hvm_set_shared_info(xen_hvm_shared_info_kexec);
1571
1572 /* Allocate new mfn for previous pfn */
1573 do {
1574 rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
1575 if (rc == 0)
1576 msleep(123);
1577 } while (rc == 0);
1578
1579 /* Make sure the previous pfn is really connected to a (new) mfn */
1580 BUG_ON(rc != 1);
1581}
1582
1583static struct syscore_ops xen_hvm_syscore_ops = {
1584 .shutdown = xen_hvm_syscore_shutdown,
1585};
1586#endif
1587
1588/* Use a pfn in RAM, may move to MMIO before kexec. */
1589static void __init xen_hvm_init_shared_info(void)
1590{
1591 /* Remember pointer for resume */
1592 xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
1593 xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
1594 xen_hvm_set_shared_info(xen_hvm_shared_info);
1595}
1596
1597static void __init init_hvm_pv_info(void)
1598{
1599 int major, minor;
1600 uint32_t eax, ebx, ecx, edx, pages, msr, base;
1601 u64 pfn;
1602
1603 base = xen_cpuid_base();
1604 cpuid(base + 1, &eax, &ebx, &ecx, &edx);
1605
1606 major = eax >> 16;
1607 minor = eax & 0xffff;
1608 printk(KERN_INFO "Xen version %d.%d.\n", major, minor);
1609
1610 cpuid(base + 2, &pages, &msr, &ecx, &edx);
1611
1612 pfn = __pa(hypercall_page);
1613 wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
1614
1615 xen_setup_features();
1616
1617 pv_info.name = "Xen HVM";
1618
1619 xen_domain_type = XEN_HVM_DOMAIN;
1620}
1621
1498static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, 1622static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
1499 unsigned long action, void *hcpu) 1623 unsigned long action, void *hcpu)
1500{ 1624{
@@ -1517,14 +1641,12 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
1517 1641
1518static void __init xen_hvm_guest_init(void) 1642static void __init xen_hvm_guest_init(void)
1519{ 1643{
1520 int r; 1644 init_hvm_pv_info();
1521 int major, minor;
1522
1523 r = init_hvm_pv_info(&major, &minor);
1524 if (r < 0)
1525 return;
1526 1645
1527 xen_hvm_init_shared_info(); 1646 xen_hvm_init_shared_info();
1647#ifdef CONFIG_KEXEC
1648 register_syscore_ops(&xen_hvm_syscore_ops);
1649#endif
1528 1650
1529 if (xen_feature(XENFEAT_hvm_callback_vector)) 1651 if (xen_feature(XENFEAT_hvm_callback_vector))
1530 xen_have_vector_callback = 1; 1652 xen_have_vector_callback = 1;