author     Linus Torvalds <torvalds@linux-foundation.org>   2012-07-24 16:14:03 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-07-24 16:14:03 -0400
commit     62c4d9afa4bcf5315e2745a17a0228bf65b9ba40 (patch)
tree       a7b9d97283441ea5f0c738fa388e120c4c1491b6 /arch/x86/xen/enlighten.c
parent     5fecc9d8f59e765c2a48379dd7c6f5cf88c7d75a (diff)
parent     c3d93f880197953f86ab90d9da4744e926b38e33 (diff)
Merge tag 'stable/for-linus-3.6-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen
Pull Xen update from Konrad Rzeszutek Wilk:
"Features:
* Performance improvement to lower the amount of traps the hypervisor
has to do 32-bit guests. Mainly for setting PTE entries and
updating TLS descriptors.
* MCE polling driver to collect hypervisor MCE buffer and present
them to /dev/mcelog.
* Physical CPU online/offline support. When an privileged guest is
booted it is present with virtual CPUs, which might have an 1:1 to
physical CPUs but usually don't. This provides mechanism to
offline/online physical CPUs.
Bug-fixes for:
* Coverity found fixes in the console and ACPI processor driver.
* PVonHVM kexec fixes along with some cleanups.
* Pages that fall within E820 gaps and non-RAM regions (and had been
released to hypervisor) would be populated back, but potentially in
non-RAM regions."
* tag 'stable/for-linus-3.6-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen:
xen: populate correct number of pages when across mem boundary (v2)
xen PVonHVM: move shared_info to MMIO before kexec
xen: simplify init_hvm_pv_info
xen: remove cast from HYPERVISOR_shared_info assignment
xen: enable platform-pci only in a Xen guest
xen/pv-on-hvm kexec: shutdown watches from old kernel
xen/x86: avoid updating TLS descriptors if they haven't changed
xen/x86: add desc_equal() to compare GDT descriptors
xen/mm: zero PTEs for non-present MFNs in the initial page table
xen/mm: do direct hypercall in xen_set_pte() if batching is unavailable
xen/hvc: Fix up checks when the info is allocated.
xen/acpi: Fix potential memory leak.
xen/mce: add .poll method for mcelog device driver
xen/mce: schedule a workqueue to avoid sleep in atomic context
xen/pcpu: Xen physical cpus online/offline sys interface
xen/mce: Register native mce handler as vMCE bounce back point
x86, MCE, AMD: Adjust initcall sequence for xen
xen/mce: Add mcelog support for Xen platform
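As referenced above, the TLS change ("xen/x86: avoid updating TLS
descriptors if they haven't changed") works by keeping a shadow copy of
the last descriptor sent to the hypervisor and skipping the update
hypercall when nothing changed. Below is a minimal, self-contained C
sketch of that pattern; struct desc, expensive_update() and load_tls()
are illustrative stand-ins for the kernel's desc_struct, the
update_descriptor hypercall and load_TLS_descriptor(), not real API:

#include <stdbool.h>

/* Stand-in for the kernel's desc_struct: two 32-bit words per entry. */
struct desc {
        unsigned int a, b;
};

/* Last values actually sent (per CPU in the real code; a single set
 * here to keep the sketch short). */
static struct desc shadow_tls[3];

static bool desc_equal(const struct desc *d1, const struct desc *d2)
{
        return d1->a == d2->a && d1->b == d2->b;
}

/* Stand-in for the update_descriptor hypercall, i.e. the trap we
 * want to avoid. */
static void expensive_update(int i, const struct desc *d)
{
        (void)i;
        (void)d;
}

/* Called on every task switch: only descriptors that differ from
 * their shadow copy cause an expensive update. */
void load_tls(const struct desc next[3])
{
        for (int i = 0; i < 3; i++) {
                if (desc_equal(&shadow_tls[i], &next[i]))
                        continue;       /* unchanged: skip the trap */
                shadow_tls[i] = next[i];
                expensive_update(i, &next[i]);
        }
}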
Diffstat (limited to 'arch/x86/xen/enlighten.c')
-rw-r--r--  arch/x86/xen/enlighten.c | 224
1 file changed, 173 insertions(+), 51 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index ed7d54985d0c..bf4bda6d3e9a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -31,6 +31,7 @@
 #include <linux/pci.h>
 #include <linux/gfp.h>
 #include <linux/memblock.h>
+#include <linux/syscore_ops.h>
 
 #include <xen/xen.h>
 #include <xen/interface/xen.h>
@@ -38,6 +39,7 @@
 #include <xen/interface/physdev.h>
 #include <xen/interface/vcpu.h>
 #include <xen/interface/memory.h>
+#include <xen/interface/xen-mca.h>
 #include <xen/features.h>
 #include <xen/page.h>
 #include <xen/hvm.h>
@@ -107,7 +109,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback);
  * Point at some empty memory to start with. We map the real shared_info
  * page as soon as fixmap is up and running.
  */
-struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
+struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
 
 /*
  * Flag to determine whether vcpu info placement is available on all
@@ -124,6 +126,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
  */
 static int have_vcpu_info_placement = 1;
 
+struct tls_descs {
+        struct desc_struct desc[3];
+};
+
+/*
+ * Updating the 3 TLS descriptors in the GDT on every task switch is
+ * surprisingly expensive so we avoid updating them if they haven't
+ * changed. Since Xen writes different descriptors than the one
+ * passed in the update_descriptor hypercall we keep shadow copies to
+ * compare against.
+ */
+static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
+
 static void clamp_max_cpus(void)
 {
 #ifdef CONFIG_SMP
@@ -341,9 +356,7 @@ static void __init xen_init_cpuid_mask(void)
         unsigned int xsave_mask;
 
         cpuid_leaf1_edx_mask =
-                ~((1 << X86_FEATURE_MCE)  | /* disable MCE */
-                  (1 << X86_FEATURE_MCA)  | /* disable MCA */
-                  (1 << X86_FEATURE_MTRR) | /* disable MTRR */
+                ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */
                   (1 << X86_FEATURE_ACC)); /* thermal monitoring */
 
         if (!xen_initial_domain())
@@ -540,12 +553,28 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
                 BUG();
 }
 
+static inline bool desc_equal(const struct desc_struct *d1,
+                              const struct desc_struct *d2)
+{
+        return d1->a == d2->a && d1->b == d2->b;
+}
+
 static void load_TLS_descriptor(struct thread_struct *t,
                                 unsigned int cpu, unsigned int i)
 {
-        struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-        xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
-        struct multicall_space mc = __xen_mc_entry(0);
+        struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
+        struct desc_struct *gdt;
+        xmaddr_t maddr;
+        struct multicall_space mc;
+
+        if (desc_equal(shadow, &t->tls_array[i]))
+                return;
+
+        *shadow = t->tls_array[i];
+
+        gdt = get_cpu_gdt_table(cpu);
+        maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+        mc = __xen_mc_entry(0);
 
         MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
 }
@@ -627,8 +656,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
         /*
          * Look for known traps using IST, and substitute them
          * appropriately. The debugger ones are the only ones we care
-         * about. Xen will handle faults like double_fault and
-         * machine_check, so we should never see them. Warn if
+         * about. Xen will handle faults like double_fault,
+         * so we should never see them. Warn if
          * there's an unexpected IST-using fault handler.
          */
         if (addr == (unsigned long)debug)
@@ -643,7 +672,11 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
                 return 0;
 #ifdef CONFIG_X86_MCE
         } else if (addr == (unsigned long)machine_check) {
-                return 0;
+                /*
+                 * when xen hypervisor inject vMCE to guest,
+                 * use native mce handler to handle it
+                 */
+                ;
 #endif
         } else {
                 /* Some other trap using IST? */
@@ -1437,64 +1470,155 @@ asmlinkage void __init xen_start_kernel(void)
 #endif
 }
 
-static int init_hvm_pv_info(int *major, int *minor)
-{
-        uint32_t eax, ebx, ecx, edx, pages, msr, base;
-        u64 pfn;
-
-        base = xen_cpuid_base();
-        cpuid(base + 1, &eax, &ebx, &ecx, &edx);
-
-        *major = eax >> 16;
-        *minor = eax & 0xffff;
-        printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
-
-        cpuid(base + 2, &pages, &msr, &ecx, &edx);
-
-        pfn = __pa(hypercall_page);
-        wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
-
-        xen_setup_features();
-
-        pv_info.name = "Xen HVM";
-
-        xen_domain_type = XEN_HVM_DOMAIN;
+#ifdef CONFIG_XEN_PVHVM
+/*
+ * The pfn containing the shared_info is located somewhere in RAM. This
+ * will cause trouble if the current kernel is doing a kexec boot into a
+ * new kernel. The new kernel (and its startup code) can not know where
+ * the pfn is, so it can not reserve the page. The hypervisor will
+ * continue to update the pfn, and as a result memory corruption occours
+ * in the new kernel.
+ *
+ * One way to work around this issue is to allocate a page in the
+ * xen-platform pci device's BAR memory range. But pci init is done very
+ * late and the shared_info page is already in use very early to read
+ * the pvclock. So moving the pfn from RAM to MMIO is racy because some
+ * code paths on other vcpus could access the pfn during the small
+ * window when the old pfn is moved to the new pfn. There is even a
+ * small window were the old pfn is not backed by a mfn, and during that
+ * time all reads return -1.
+ *
+ * Because it is not known upfront where the MMIO region is located it
+ * can not be used right from the start in xen_hvm_init_shared_info.
+ *
+ * To minimise trouble the move of the pfn is done shortly before kexec.
+ * This does not eliminate the race because all vcpus are still online
+ * when the syscore_ops will be called. But hopefully there is no work
+ * pending at this point in time. Also the syscore_op is run last which
+ * reduces the risk further.
+ */
 
-        return 0;
-}
+static struct shared_info *xen_hvm_shared_info;
 
-void __ref xen_hvm_init_shared_info(void)
+static void xen_hvm_connect_shared_info(unsigned long pfn)
 {
-        int cpu;
         struct xen_add_to_physmap xatp;
-        static struct shared_info *shared_info_page = 0;
 
-        if (!shared_info_page)
-                shared_info_page = (struct shared_info *)
-                        extend_brk(PAGE_SIZE, PAGE_SIZE);
         xatp.domid = DOMID_SELF;
         xatp.idx = 0;
         xatp.space = XENMAPSPACE_shared_info;
-        xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
+        xatp.gpfn = pfn;
         if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
                 BUG();
 
-        HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
+}
+static void xen_hvm_set_shared_info(struct shared_info *sip)
+{
+        int cpu;
+
+        HYPERVISOR_shared_info = sip;
 
         /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
          * page, we use it in the event channel upcall and in some pvclock
          * related functions. We don't need the vcpu_info placement
          * optimizations because we don't use any pv_mmu or pv_irq op on
          * HVM.
-         * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
-         * online but xen_hvm_init_shared_info is run at resume time too and
+         * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is
+         * online but xen_hvm_set_shared_info is run at resume time too and
          * in that case multiple vcpus might be online. */
         for_each_online_cpu(cpu) {
                 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
         }
 }
 
-#ifdef CONFIG_XEN_PVHVM
+/* Reconnect the shared_info pfn to a mfn */
+void xen_hvm_resume_shared_info(void)
+{
+        xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
+}
+
+#ifdef CONFIG_KEXEC
+static struct shared_info *xen_hvm_shared_info_kexec;
+static unsigned long xen_hvm_shared_info_pfn_kexec;
+
+/* Remember a pfn in MMIO space for kexec reboot */
+void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn)
+{
+        xen_hvm_shared_info_kexec = sip;
+        xen_hvm_shared_info_pfn_kexec = pfn;
+}
+
+static void xen_hvm_syscore_shutdown(void)
+{
+        struct xen_memory_reservation reservation = {
+                .domid = DOMID_SELF,
+                .nr_extents = 1,
+        };
+        unsigned long prev_pfn;
+        int rc;
+
+        if (!xen_hvm_shared_info_kexec)
+                return;
+
+        prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;
+        set_xen_guest_handle(reservation.extent_start, &prev_pfn);
+
+        /* Move pfn to MMIO, disconnects previous pfn from mfn */
+        xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec);
+
+        /* Update pointers, following hypercall is also a memory barrier */
+        xen_hvm_set_shared_info(xen_hvm_shared_info_kexec);
+
+        /* Allocate new mfn for previous pfn */
+        do {
+                rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
+                if (rc == 0)
+                        msleep(123);
+        } while (rc == 0);
+
+        /* Make sure the previous pfn is really connected to a (new) mfn */
+        BUG_ON(rc != 1);
+}
+
+static struct syscore_ops xen_hvm_syscore_ops = {
+        .shutdown = xen_hvm_syscore_shutdown,
+};
+#endif
+
+/* Use a pfn in RAM, may move to MMIO before kexec. */
+static void __init xen_hvm_init_shared_info(void)
+{
+        /* Remember pointer for resume */
+        xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
+        xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
+        xen_hvm_set_shared_info(xen_hvm_shared_info);
+}
+
+static void __init init_hvm_pv_info(void)
+{
+        int major, minor;
+        uint32_t eax, ebx, ecx, edx, pages, msr, base;
+        u64 pfn;
+
+        base = xen_cpuid_base();
+        cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+
+        major = eax >> 16;
+        minor = eax & 0xffff;
+        printk(KERN_INFO "Xen version %d.%d.\n", major, minor);
+
+        cpuid(base + 2, &pages, &msr, &ecx, &edx);
+
+        pfn = __pa(hypercall_page);
+        wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+        xen_setup_features();
+
+        pv_info.name = "Xen HVM";
+
+        xen_domain_type = XEN_HVM_DOMAIN;
+}
+
 static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
                                         unsigned long action, void *hcpu)
 {
@@ -1517,14 +1641,12 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
 
 static void __init xen_hvm_guest_init(void)
 {
-        int r;
-        int major, minor;
-
-        r = init_hvm_pv_info(&major, &minor);
-        if (r < 0)
-                return;
+        init_hvm_pv_info();
 
         xen_hvm_init_shared_info();
+#ifdef CONFIG_KEXEC
+        register_syscore_ops(&xen_hvm_syscore_ops);
+#endif
 
         if (xen_feature(XENFEAT_hvm_callback_vector))
                 xen_have_vector_callback = 1;
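
The kexec-related hunks above hinge on the syscore shutdown hook, which
the kernel invokes once, late in the shutdown/kexec path, after normal
device teardown. For readers unfamiliar with that interface, here is a
minimal sketch of registering such a hook; struct syscore_ops and
register_syscore_ops() are the real kernel interface used in the diff,
while example_shutdown() and its body are placeholders:

#include <linux/init.h>
#include <linux/syscore_ops.h>

/* Invoked once, late in shutdown or kexec, after devices have been
 * quiesced. */
static void example_shutdown(void)
{
        /* last-minute work, e.g. handing resources back to a
         * hypervisor, as xen_hvm_syscore_shutdown() does above */
}

static struct syscore_ops example_syscore_ops = {
        .shutdown = example_shutdown,
};

static int __init example_register(void)
{
        register_syscore_ops(&example_syscore_ops);
        return 0;
}
late_initcall(example_register);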