author	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-17 14:10:11 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-17 14:10:11 -0400
commit	fb9fc395174138983a49f2da982ed14caabbe741 (patch)
tree	5d5d3643ee6853a899205613da272cc343fdc1a4 /arch/x86/xen
parent	0eafaae84e21ac033815cc9f33c3ae889cd7ccfe (diff)
parent	ace2e92e193126711cb3a83a3752b2c5b8396950 (diff)
Merge branch 'xen-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen
* 'xen-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen:
  xfs: eagerly remove vmap mappings to avoid upsetting Xen
  xen: add some debug output for failed multicalls
  xen: fix incorrect vcpu_register_vcpu_info hypercall argument
  xen: ask the hypervisor how much space it needs reserved
  xen: lock pte pages while pinning/unpinning
  xen: deal with stale cr3 values when unpinning pagetables
  xen: add batch completion callbacks
  xen: yield to IPI target if necessary
  Clean up duplicate includes in arch/i386/xen/
  remove dead code in pgtable_cache_init
  paravirt: clean up lazy mode handling
  paravirt: refactor struct paravirt_ops into smaller pv_*_ops
Diffstat (limited to 'arch/x86/xen')
-rw-r--r--	arch/x86/xen/enlighten.c	233
-rw-r--r--	arch/x86/xen/mmu.c		145
-rw-r--r--	arch/x86/xen/multicalls.c	52
-rw-r--r--	arch/x86/xen/multicalls.h	5
-rw-r--r--	arch/x86/xen/smp.c		14
-rw-r--r--	arch/x86/xen/time.c		6
-rw-r--r--	arch/x86/xen/xen-ops.h		10
7 files changed, 324 insertions, 141 deletions
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 493a083f6886..94c39aaf695f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -25,7 +25,6 @@
 #include <linux/mm.h>
 #include <linux/page-flags.h>
 #include <linux/highmem.h>
-#include <linux/smp.h>

 #include <xen/interface/xen.h>
 #include <xen/interface/physdev.h>
@@ -52,11 +51,25 @@

 EXPORT_SYMBOL_GPL(hypercall_page);

-DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
-
 DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
 DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
-DEFINE_PER_CPU(unsigned long, xen_cr3);
+
+/*
+ * Note about cr3 (pagetable base) values:
+ *
+ * xen_cr3 contains the current logical cr3 value; it contains the
+ * last set cr3.  This may not be the current effective cr3, because
+ * its update may be being lazily deferred.  However, a vcpu looking
+ * at its own cr3 can use this value knowing that it everything will
+ * be self-consistent.
+ *
+ * xen_current_cr3 contains the actual vcpu cr3; it is set once the
+ * hypercall to set the vcpu cr3 is complete (so it may be a little
+ * out of date, but it will never be set early).  If one vcpu is
+ * looking at another vcpu's cr3 value, it should use this variable.
+ */
+DEFINE_PER_CPU(unsigned long, xen_cr3);	 /* cr3 stored as physaddr */
+DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */

 struct start_info *xen_start_info;
 EXPORT_SYMBOL_GPL(xen_start_info);
@@ -100,7 +113,7 @@ static void __init xen_vcpu_setup(int cpu)
 	info.mfn = virt_to_mfn(vcpup);
 	info.offset = offset_in_page(vcpup);

-	printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n",
+	printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
 	       cpu, vcpup, info.mfn, info.offset);

 	/* Check to see if the hypervisor will put the vcpu_info
@@ -124,7 +137,7 @@ static void __init xen_vcpu_setup(int cpu)
 static void __init xen_banner(void)
 {
 	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
-	       paravirt_ops.name);
+	       pv_info.name);
 	printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
 }

@@ -249,29 +262,10 @@ static void xen_halt(void)
 	xen_safe_halt();
 }

-static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
+static void xen_leave_lazy(void)
 {
-	BUG_ON(preemptible());
-
-	switch (mode) {
-	case PARAVIRT_LAZY_NONE:
-		BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
-		break;
-
-	case PARAVIRT_LAZY_MMU:
-	case PARAVIRT_LAZY_CPU:
-		BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
-		break;
-
-	case PARAVIRT_LAZY_FLUSH:
-		/* flush if necessary, but don't change state */
-		if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
-			xen_mc_flush();
-		return;
-	}
-
+	paravirt_leave_lazy(paravirt_get_lazy_mode());
 	xen_mc_flush();
-	x86_write_percpu(xen_lazy_mode, mode);
 }

 static unsigned long xen_store_tr(void)
@@ -358,7 +352,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
 	 * loaded properly.  This will go away as soon as Xen has been
 	 * modified to not save/restore %gs for normal hypercalls.
 	 */
-	if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
 		loadsegment(gs, 0);
 }

@@ -632,32 +626,36 @@ static unsigned long xen_read_cr3(void)
 	return x86_read_percpu(xen_cr3);
 }

+static void set_current_cr3(void *v)
+{
+	x86_write_percpu(xen_current_cr3, (unsigned long)v);
+}
+
 static void xen_write_cr3(unsigned long cr3)
 {
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+	unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
+
 	BUG_ON(preemptible());

-	if (cr3 == x86_read_percpu(xen_cr3)) {
-		/* just a simple tlb flush */
-		xen_flush_tlb();
-		return;
-	}
+	mcs = xen_mc_entry(sizeof(*op));  /* disables interrupts */

+	/* Update while interrupts are disabled, so its atomic with
+	   respect to ipis */
 	x86_write_percpu(xen_cr3, cr3);

+	op = mcs.args;
+	op->cmd = MMUEXT_NEW_BASEPTR;
+	op->arg1.mfn = mfn;

-	{
-		struct mmuext_op *op;
-		struct multicall_space mcs = xen_mc_entry(sizeof(*op));
-		unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
-
-		op = mcs.args;
-		op->cmd = MMUEXT_NEW_BASEPTR;
-		op->arg1.mfn = mfn;
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

-		MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+	/* Update xen_update_cr3 once the batch has actually
+	   been submitted. */
+	xen_mc_callback(set_current_cr3, (void *)cr3);

-		xen_mc_issue(PARAVIRT_LAZY_CPU);
-	}
+	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
 }

 /* Early in boot, while setting up the initial pagetable, assume
@@ -668,6 +666,15 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
 	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
 }

+static void pin_pagetable_pfn(unsigned level, unsigned long pfn)
+{
+	struct mmuext_op op;
+	op.cmd = level;
+	op.arg1.mfn = pfn_to_mfn(pfn);
+	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+		BUG();
+}
+
 /* This needs to make sure the new pte page is pinned iff its being
    attached to a pinned pagetable. */
 static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
@@ -677,9 +684,10 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
 	if (PagePinned(virt_to_page(mm->pgd))) {
 		SetPagePinned(page);

-		if (!PageHighMem(page))
+		if (!PageHighMem(page)) {
 			make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-		else
+			pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+		} else
 			/* make sure there are no stray mappings of
 			   this page */
 			kmap_flush_unused();
@@ -692,8 +700,10 @@ static void xen_release_pt(u32 pfn)
 	struct page *page = pfn_to_page(pfn);

 	if (PagePinned(page)) {
-		if (!PageHighMem(page))
+		if (!PageHighMem(page)) {
+			pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
 			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+		}
 	}
 }

@@ -738,7 +748,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
 	pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;

 	/* special set_pte for pagetable initialization */
-	paravirt_ops.set_pte = xen_set_pte_init;
+	pv_mmu_ops.set_pte = xen_set_pte_init;

 	init_mm.pgd = base;
 	/*
@@ -785,8 +795,8 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
 {
 	/* This will work as long as patching hasn't happened yet
 	   (which it hasn't) */
-	paravirt_ops.alloc_pt = xen_alloc_pt;
-	paravirt_ops.set_pte = xen_set_pte;
+	pv_mmu_ops.alloc_pt = xen_alloc_pt;
+	pv_mmu_ops.set_pte = xen_set_pte;

 	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
 		/*
@@ -808,15 +818,15 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
 	/* Actually pin the pagetable down, but we can't set PG_pinned
 	   yet because the page structures don't exist yet. */
 	{
-		struct mmuext_op op;
+		unsigned level;
+
 #ifdef CONFIG_X86_PAE
-		op.cmd = MMUEXT_PIN_L3_TABLE;
+		level = MMUEXT_PIN_L3_TABLE;
 #else
-		op.cmd = MMUEXT_PIN_L3_TABLE;
+		level = MMUEXT_PIN_L2_TABLE;
 #endif
-		op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
-		if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-			BUG();
+
+		pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
 	}
 }

@@ -833,12 +843,12 @@ void __init xen_setup_vcpu_info_placement(void)
 	if (have_vcpu_info_placement) {
 		printk(KERN_INFO "Xen: using vcpu_info placement\n");

-		paravirt_ops.save_fl = xen_save_fl_direct;
-		paravirt_ops.restore_fl = xen_restore_fl_direct;
-		paravirt_ops.irq_disable = xen_irq_disable_direct;
-		paravirt_ops.irq_enable = xen_irq_enable_direct;
-		paravirt_ops.read_cr2 = xen_read_cr2_direct;
-		paravirt_ops.iret = xen_iret_direct;
+		pv_irq_ops.save_fl = xen_save_fl_direct;
+		pv_irq_ops.restore_fl = xen_restore_fl_direct;
+		pv_irq_ops.irq_disable = xen_irq_disable_direct;
+		pv_irq_ops.irq_enable = xen_irq_enable_direct;
+		pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
+		pv_cpu_ops.iret = xen_iret_direct;
 	}
 }

@@ -850,8 +860,8 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,

 	start = end = reloc = NULL;

-#define SITE(x)						\
-	case PARAVIRT_PATCH(x):				\
+#define SITE(op, x)					\
+	case PARAVIRT_PATCH(op.x):			\
 	if (have_vcpu_info_placement) {			\
 		start = (char *)xen_##x##_direct;	\
 		end = xen_##x##_direct_end;		\
@@ -860,10 +870,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
 	goto patch_site

 	switch (type) {
-		SITE(irq_enable);
-		SITE(irq_disable);
-		SITE(save_fl);
-		SITE(restore_fl);
+		SITE(pv_irq_ops, irq_enable);
+		SITE(pv_irq_ops, irq_disable);
+		SITE(pv_irq_ops, save_fl);
+		SITE(pv_irq_ops, restore_fl);
 #undef SITE

 	patch_site:
@@ -895,26 +905,32 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
 	return ret;
 }

-static const struct paravirt_ops xen_paravirt_ops __initdata = {
+static const struct pv_info xen_info __initdata = {
 	.paravirt_enabled = 1,
 	.shared_kernel_pmd = 0,

 	.name = "Xen",
-	.banner = xen_banner,
+};

+static const struct pv_init_ops xen_init_ops __initdata = {
 	.patch = xen_patch,

+	.banner = xen_banner,
 	.memory_setup = xen_memory_setup,
 	.arch_setup = xen_arch_setup,
-	.init_IRQ = xen_init_IRQ,
 	.post_allocator_init = xen_mark_init_mm_pinned,
+};

+static const struct pv_time_ops xen_time_ops __initdata = {
 	.time_init = xen_time_init,
+
 	.set_wallclock = xen_set_wallclock,
 	.get_wallclock = xen_get_wallclock,
 	.get_cpu_khz = xen_cpu_khz,
 	.sched_clock = xen_sched_clock,
+};

+static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 	.cpuid = xen_cpuid,

 	.set_debugreg = xen_set_debugreg,
@@ -925,22 +941,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.read_cr0 = native_read_cr0,
 	.write_cr0 = native_write_cr0,

-	.read_cr2 = xen_read_cr2,
-	.write_cr2 = xen_write_cr2,
-
-	.read_cr3 = xen_read_cr3,
-	.write_cr3 = xen_write_cr3,
-
 	.read_cr4 = native_read_cr4,
 	.read_cr4_safe = native_read_cr4_safe,
 	.write_cr4 = xen_write_cr4,

-	.save_fl = xen_save_fl,
-	.restore_fl = xen_restore_fl,
-	.irq_disable = xen_irq_disable,
-	.irq_enable = xen_irq_enable,
-	.safe_halt = xen_safe_halt,
-	.halt = xen_halt,
 	.wbinvd = native_wbinvd,

 	.read_msr = native_read_msr_safe,
@@ -969,6 +973,23 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.set_iopl_mask = xen_set_iopl_mask,
 	.io_delay = xen_io_delay,

+	.lazy_mode = {
+		.enter = paravirt_enter_lazy_cpu,
+		.leave = xen_leave_lazy,
+	},
+};
+
+static const struct pv_irq_ops xen_irq_ops __initdata = {
+	.init_IRQ = xen_init_IRQ,
+	.save_fl = xen_save_fl,
+	.restore_fl = xen_restore_fl,
+	.irq_disable = xen_irq_disable,
+	.irq_enable = xen_irq_enable,
+	.safe_halt = xen_safe_halt,
+	.halt = xen_halt,
+};
+
+static const struct pv_apic_ops xen_apic_ops __initdata = {
 #ifdef CONFIG_X86_LOCAL_APIC
 	.apic_write = xen_apic_write,
 	.apic_write_atomic = xen_apic_write,
@@ -977,6 +998,17 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.setup_secondary_clock = paravirt_nop,
 	.startup_ipi_hook = paravirt_nop,
 #endif
+};
+
+static const struct pv_mmu_ops xen_mmu_ops __initdata = {
+	.pagetable_setup_start = xen_pagetable_setup_start,
+	.pagetable_setup_done = xen_pagetable_setup_done,
+
+	.read_cr2 = xen_read_cr2,
+	.write_cr2 = xen_write_cr2,
+
+	.read_cr3 = xen_read_cr3,
+	.write_cr3 = xen_write_cr3,

 	.flush_tlb_user = xen_flush_tlb,
 	.flush_tlb_kernel = xen_flush_tlb,
@@ -986,9 +1018,6 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.pte_update = paravirt_nop,
 	.pte_update_defer = paravirt_nop,

-	.pagetable_setup_start = xen_pagetable_setup_start,
-	.pagetable_setup_done = xen_pagetable_setup_done,
-
 	.alloc_pt = xen_alloc_pt_init,
 	.release_pt = xen_release_pt,
 	.alloc_pd = paravirt_nop,
@@ -1024,7 +1053,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.dup_mmap = xen_dup_mmap,
 	.exit_mmap = xen_exit_mmap,

-	.set_lazy_mode = xen_set_lazy_mode,
+	.lazy_mode = {
+		.enter = paravirt_enter_lazy_mmu,
+		.leave = xen_leave_lazy,
+	},
 };

 #ifdef CONFIG_SMP
@@ -1080,6 +1112,17 @@ static const struct machine_ops __initdata xen_machine_ops = {
 };


+static void __init xen_reserve_top(void)
+{
+	unsigned long top = HYPERVISOR_VIRT_START;
+	struct xen_platform_parameters pp;
+
+	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
+		top = pp.virt_start;
+
+	reserve_top_address(-top + 2 * PAGE_SIZE);
+}
+
 /* First C function to be called on Xen boot */
 asmlinkage void __init xen_start_kernel(void)
 {
@@ -1091,7 +1134,14 @@ asmlinkage void __init xen_start_kernel(void)
 	BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);

 	/* Install Xen paravirt ops */
-	paravirt_ops = xen_paravirt_ops;
+	pv_info = xen_info;
+	pv_init_ops = xen_init_ops;
+	pv_time_ops = xen_time_ops;
+	pv_cpu_ops = xen_cpu_ops;
+	pv_irq_ops = xen_irq_ops;
+	pv_apic_ops = xen_apic_ops;
+	pv_mmu_ops = xen_mmu_ops;
+
 	machine_ops = xen_machine_ops;

 #ifdef CONFIG_SMP
@@ -1113,6 +1163,7 @@ asmlinkage void __init xen_start_kernel(void)
 	/* keep using Xen gdt for now; no urgent need to change it */

 	x86_write_percpu(xen_cr3, __pa(pgd));
+	x86_write_percpu(xen_current_cr3, __pa(pgd));

 #ifdef CONFIG_SMP
 	/* Don't do the full vcpu_info placement stuff until we have a
@@ -1124,12 +1175,12 @@ asmlinkage void __init xen_start_kernel(void)
 	xen_setup_vcpu_info_placement();
 #endif

-	paravirt_ops.kernel_rpl = 1;
+	pv_info.kernel_rpl = 1;
 	if (xen_feature(XENFEAT_supervisor_mode_kernel))
-		paravirt_ops.kernel_rpl = 0;
+		pv_info.kernel_rpl = 0;

 	/* set the limit of our address space */
-	reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);
+	xen_reserve_top();

 	/* set up basic CPUID stuff */
 	cpu_detect(&new_cpu_data);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 874db0cd1d2a..b2e32f9d0071 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -41,7 +41,6 @@
 #include <linux/sched.h>
 #include <linux/highmem.h>
 #include <linux/bug.h>
-#include <linux/sched.h>

 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -155,7 +154,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 		    pte_t *ptep, pte_t pteval)
 {
 	if (mm == current->mm || mm == &init_mm) {
-		if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
+		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
 			struct multicall_space mcs;
 			mcs = xen_mc_entry(0);

@@ -304,7 +303,12 @@ pgd_t xen_make_pgd(unsigned long pgd)
 }
 #endif /* CONFIG_X86_PAE */

-
+enum pt_level {
+	PT_PGD,
+	PT_PUD,
+	PT_PMD,
+	PT_PTE
+};

 /*
   (Yet another) pagetable walker.  This one is intended for pinning a
@@ -316,7 +320,7 @@ pgd_t xen_make_pgd(unsigned long pgd)
   FIXADDR_TOP.  But the important bit is that we don't pin beyond
   there, because then we start getting into Xen's ptes.
 */
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
+static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
 		    unsigned long limit)
 {
 	pgd_t *pgd = pgd_base;
@@ -341,7 +345,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 		pud = pud_offset(pgd, 0);

 		if (PTRS_PER_PUD > 1) /* not folded */
-			flush |= (*func)(virt_to_page(pud), 0);
+			flush |= (*func)(virt_to_page(pud), PT_PUD);

 		for (; addr != pud_limit; pud++, addr = pud_next) {
 			pmd_t *pmd;
@@ -360,7 +364,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 			pmd = pmd_offset(pud, 0);

 			if (PTRS_PER_PMD > 1) /* not folded */
-				flush |= (*func)(virt_to_page(pmd), 0);
+				flush |= (*func)(virt_to_page(pmd), PT_PMD);

 			for (; addr != pmd_limit; pmd++) {
 				addr += (PAGE_SIZE * PTRS_PER_PTE);
@@ -372,17 +376,47 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 				if (pmd_none(*pmd))
 					continue;

-				flush |= (*func)(pmd_page(*pmd), 0);
+				flush |= (*func)(pmd_page(*pmd), PT_PTE);
 			}
 		}
 	}

-	flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
+	flush |= (*func)(virt_to_page(pgd_base), PT_PGD);

 	return flush;
 }

-static int pin_page(struct page *page, unsigned flags)
+static spinlock_t *lock_pte(struct page *page)
+{
+	spinlock_t *ptl = NULL;
+
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+	ptl = __pte_lockptr(page);
+	spin_lock(ptl);
+#endif
+
+	return ptl;
+}
+
+static void do_unlock(void *v)
+{
+	spinlock_t *ptl = v;
+	spin_unlock(ptl);
+}
+
+static void xen_do_pin(unsigned level, unsigned long pfn)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+
+	mcs = __xen_mc_entry(sizeof(*op));
+	op = mcs.args;
+	op->cmd = level;
+	op->arg1.mfn = pfn_to_mfn(pfn);
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+}
+
+static int pin_page(struct page *page, enum pt_level level)
 {
 	unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
 	int flush;
@@ -397,12 +431,26 @@ static int pin_page(struct page *page, unsigned flags)
 		void *pt = lowmem_page_address(page);
 		unsigned long pfn = page_to_pfn(page);
 		struct multicall_space mcs = __xen_mc_entry(0);
+		spinlock_t *ptl;

 		flush = 0;

+		ptl = NULL;
+		if (level == PT_PTE)
+			ptl = lock_pte(page);
+
 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 					pfn_pte(pfn, PAGE_KERNEL_RO),
-					flags);
+					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+		if (level == PT_PTE)
+			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
+
+		if (ptl) {
+			/* Queue a deferred unlock for when this batch
+			   is completed. */
+			xen_mc_callback(do_unlock, ptl);
+		}
 	}

 	return flush;
@@ -413,8 +461,7 @@ static int pin_page(struct page *page, unsigned flags)
    read-only, and can be pinned. */
 void xen_pgd_pin(pgd_t *pgd)
 {
-	struct multicall_space mcs;
-	struct mmuext_op *op;
+	unsigned level;

 	xen_mc_batch();

@@ -425,16 +472,13 @@ void xen_pgd_pin(pgd_t *pgd)
 		xen_mc_batch();
 	}

-	mcs = __xen_mc_entry(sizeof(*op));
-	op = mcs.args;
-
 #ifdef CONFIG_X86_PAE
-	op->cmd = MMUEXT_PIN_L3_TABLE;
+	level = MMUEXT_PIN_L3_TABLE;
 #else
-	op->cmd = MMUEXT_PIN_L2_TABLE;
+	level = MMUEXT_PIN_L2_TABLE;
 #endif
-	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_do_pin(level, PFN_DOWN(__pa(pgd)));

 	xen_mc_issue(0);
 }
@@ -442,7 +486,7 @@ void xen_pgd_pin(pgd_t *pgd)
 /* The init_mm pagetable is really pinned as soon as its created, but
    that's before we have page structures to store the bits.  So do all
    the book-keeping now. */
-static __init int mark_pinned(struct page *page, unsigned flags)
+static __init int mark_pinned(struct page *page, enum pt_level level)
 {
 	SetPagePinned(page);
 	return 0;
@@ -453,18 +497,32 @@ void __init xen_mark_init_mm_pinned(void)
 	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
 }

-static int unpin_page(struct page *page, unsigned flags)
+static int unpin_page(struct page *page, enum pt_level level)
 {
 	unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);

 	if (pgfl && !PageHighMem(page)) {
 		void *pt = lowmem_page_address(page);
 		unsigned long pfn = page_to_pfn(page);
-		struct multicall_space mcs = __xen_mc_entry(0);
+		spinlock_t *ptl = NULL;
+		struct multicall_space mcs;
+
+		if (level == PT_PTE) {
+			ptl = lock_pte(page);
+
+			xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
+		}
+
+		mcs = __xen_mc_entry(0);

 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 					pfn_pte(pfn, PAGE_KERNEL),
-					flags);
+					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+		if (ptl) {
+			/* unlock when batch completed */
+			xen_mc_callback(do_unlock, ptl);
+		}
 	}

 	return 0;		/* never need to flush on unpin */
@@ -473,18 +531,9 @@ static int unpin_page(struct page *page, unsigned flags)
 /* Release a pagetables pages back as normal RW */
 static void xen_pgd_unpin(pgd_t *pgd)
 {
-	struct mmuext_op *op;
-	struct multicall_space mcs;
-
 	xen_mc_batch();

-	mcs = __xen_mc_entry(sizeof(*op));
-
-	op = mcs.args;
-	op->cmd = MMUEXT_UNPIN_TABLE;
-	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

 	pgd_walk(pgd, unpin_page, TASK_SIZE);

@@ -515,20 +564,43 @@ static void drop_other_mm_ref(void *info)

 	if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
 		leave_mm(smp_processor_id());
+
+	/* If this cpu still has a stale cr3 reference, then make sure
+	   it has been flushed. */
+	if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
+		load_cr3(swapper_pg_dir);
+		arch_flush_lazy_cpu_mode();
+	}
 }

 static void drop_mm_ref(struct mm_struct *mm)
 {
+	cpumask_t mask;
+	unsigned cpu;
+
 	if (current->active_mm == mm) {
 		if (current->mm == mm)
 			load_cr3(swapper_pg_dir);
 		else
 			leave_mm(smp_processor_id());
+		arch_flush_lazy_cpu_mode();
 	}

-	if (!cpus_empty(mm->cpu_vm_mask))
-		xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
-					   mm, 1);
+	/* Get the "official" set of cpus referring to our pagetable. */
+	mask = mm->cpu_vm_mask;
+
+	/* It's possible that a vcpu may have a stale reference to our
+	   cr3, because its in lazy mode, and it hasn't yet flushed
+	   its set of pending hypercalls yet.  In this case, we can
+	   look at its actual current cr3 value, and force it to flush
+	   if needed. */
+	for_each_online_cpu(cpu) {
+		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
+			cpu_set(cpu, mask);
+	}
+
+	if (!cpus_empty(mask))
+		xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
 }
 #else
 static void drop_mm_ref(struct mm_struct *mm)
@@ -563,5 +635,6 @@ void xen_exit_mmap(struct mm_struct *mm)
 	/* pgd may not be pinned in the error exit path of execve */
 	if (PagePinned(virt_to_page(mm->pgd)))
 		xen_pgd_unpin(mm->pgd);
+
 	spin_unlock(&mm->page_table_lock);
 }
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index c837e8e463db..5e6f36f6d876 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -26,13 +26,22 @@

 #include "multicalls.h"

+#define MC_DEBUG	1
+
 #define MC_BATCH	32
 #define MC_ARGS		(MC_BATCH * 16 / sizeof(u64))

 struct mc_buffer {
 	struct multicall_entry entries[MC_BATCH];
+#if MC_DEBUG
+	struct multicall_entry debug[MC_BATCH];
+#endif
 	u64 args[MC_ARGS];
-	unsigned mcidx, argidx;
+	struct callback {
+		void (*fn)(void *);
+		void *data;
+	} callbacks[MC_BATCH];
+	unsigned mcidx, argidx, cbidx;
 };

 static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
@@ -43,6 +52,7 @@ void xen_mc_flush(void)
 	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
 	int ret = 0;
 	unsigned long flags;
+	int i;

 	BUG_ON(preemptible());

@@ -51,13 +61,31 @@ void xen_mc_flush(void)
 	local_irq_save(flags);

 	if (b->mcidx) {
-		int i;
+#if MC_DEBUG
+		memcpy(b->debug, b->entries,
+		       b->mcidx * sizeof(struct multicall_entry));
+#endif

 		if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
 			BUG();
 		for (i = 0; i < b->mcidx; i++)
 			if (b->entries[i].result < 0)
 				ret++;
+
+#if MC_DEBUG
+		if (ret) {
+			printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
+			       ret, smp_processor_id());
+			for(i = 0; i < b->mcidx; i++) {
+				printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
+				       i+1, b->mcidx,
+				       b->debug[i].op,
+				       b->debug[i].args[0],
+				       b->entries[i].result);
+			}
+		}
+#endif
+
 		b->mcidx = 0;
 		b->argidx = 0;
 	} else
@@ -65,6 +93,13 @@ void xen_mc_flush(void)

 	local_irq_restore(flags);

+	for(i = 0; i < b->cbidx; i++) {
+		struct callback *cb = &b->callbacks[i];
+
+		(*cb->fn)(cb->data);
+	}
+	b->cbidx = 0;
+
 	BUG_ON(ret);
 }

@@ -88,3 +123,16 @@ struct multicall_space __xen_mc_entry(size_t args)

 	return ret;
 }
+
+void xen_mc_callback(void (*fn)(void *), void *data)
+{
+	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+	struct callback *cb;
+
+	if (b->cbidx == MC_BATCH)
+		xen_mc_flush();
+
+	cb = &b->callbacks[b->cbidx++];
+	cb->fn = fn;
+	cb->data = data;
+}
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index e6f7530b156c..8bae996d99a3 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -35,11 +35,14 @@ void xen_mc_flush(void);
 /* Issue a multicall if we're not in a lazy mode */
 static inline void xen_mc_issue(unsigned mode)
 {
-	if ((xen_get_lazy_mode() & mode) == 0)
+	if ((paravirt_get_lazy_mode() & mode) == 0)
 		xen_mc_flush();

 	/* restore flags saved in xen_mc_batch */
 	local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
 }

+/* Set up a callback to be called when the current batch is flushed */
+void xen_mc_callback(void (*fn)(void *), void *data);
+
 #endif /* _XEN_MULTICALLS_H */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 4fa33c27ccb6..d53bf9d8a72d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -370,7 +370,8 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
 			       void *info, int wait)
 {
 	struct call_data_struct data;
-	int cpus;
+	int cpus, cpu;
+	bool yield;

 	/* Holding any lock stops cpus from going down. */
 	spin_lock(&call_lock);
@@ -399,9 +400,14 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
 	/* Send a message to other CPUs and wait for them to respond */
 	xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);

-	/* Make sure other vcpus get a chance to run.
-	   XXX too severe?  Maybe we should check the other CPU's states? */
-	HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+	/* Make sure other vcpus get a chance to run if they need to. */
+	yield = false;
+	for_each_cpu_mask(cpu, mask)
+		if (xen_vcpu_stolen(cpu))
+			yield = true;
+
+	if (yield)
+		HYPERVISOR_sched_op(SCHEDOP_yield, 0);

 	/* Wait for response */
 	while (atomic_read(&data.started) != cpus ||
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index dfd6db69ead5..d083ff5ef088 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -105,6 +105,12 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
 	} while (get64(&state->state_entry_time) != state_time);
 }

+/* return true when a vcpu could run but has no real cpu to run on */
+bool xen_vcpu_stolen(int vcpu)
+{
+	return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
+}
+
 static void setup_runstate_info(int cpu)
 {
 	struct vcpu_register_runstate_memory_area area;
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index b9aaea45f07f..b02a909bfd4c 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -11,6 +11,7 @@ void xen_copy_trap_info(struct trap_info *traps);

 DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
 DECLARE_PER_CPU(unsigned long, xen_cr3);
+DECLARE_PER_CPU(unsigned long, xen_current_cr3);

 extern struct start_info *xen_start_info;
 extern struct shared_info *HYPERVISOR_shared_info;
@@ -27,14 +28,9 @@ unsigned long xen_get_wallclock(void);
 int xen_set_wallclock(unsigned long time);
 unsigned long long xen_sched_clock(void);

-void xen_mark_init_mm_pinned(void);
-
-DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
+bool xen_vcpu_stolen(int vcpu);

-static inline unsigned xen_get_lazy_mode(void)
-{
-	return x86_read_percpu(xen_lazy_mode);
-}
+void xen_mark_init_mm_pinned(void);

 void __init xen_fill_possible_map(void);
