diff options
author | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-10-17 14:10:11 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-10-17 14:10:11 -0400 |
commit | fb9fc395174138983a49f2da982ed14caabbe741 (patch) | |
tree | 5d5d3643ee6853a899205613da272cc343fdc1a4 /arch/x86/xen | |
parent | 0eafaae84e21ac033815cc9f33c3ae889cd7ccfe (diff) | |
parent | ace2e92e193126711cb3a83a3752b2c5b8396950 (diff) |
Merge branch 'xen-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen
* 'xen-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen:
xfs: eagerly remove vmap mappings to avoid upsetting Xen
xen: add some debug output for failed multicalls
xen: fix incorrect vcpu_register_vcpu_info hypercall argument
xen: ask the hypervisor how much space it needs reserved
xen: lock pte pages while pinning/unpinning
xen: deal with stale cr3 values when unpinning pagetables
xen: add batch completion callbacks
xen: yield to IPI target if necessary
Clean up duplicate includes in arch/i386/xen/
remove dead code in pgtable_cache_init
paravirt: clean up lazy mode handling
paravirt: refactor struct paravirt_ops into smaller pv_*_ops
Diffstat (limited to 'arch/x86/xen')
-rw-r--r-- | arch/x86/xen/enlighten.c | 233 | ||||
-rw-r--r-- | arch/x86/xen/mmu.c | 145 | ||||
-rw-r--r-- | arch/x86/xen/multicalls.c | 52 | ||||
-rw-r--r-- | arch/x86/xen/multicalls.h | 5 | ||||
-rw-r--r-- | arch/x86/xen/smp.c | 14 | ||||
-rw-r--r-- | arch/x86/xen/time.c | 6 | ||||
-rw-r--r-- | arch/x86/xen/xen-ops.h | 10 |
7 files changed, 324 insertions, 141 deletions
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 493a083f6886..94c39aaf695f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c | |||
@@ -25,7 +25,6 @@ | |||
25 | #include <linux/mm.h> | 25 | #include <linux/mm.h> |
26 | #include <linux/page-flags.h> | 26 | #include <linux/page-flags.h> |
27 | #include <linux/highmem.h> | 27 | #include <linux/highmem.h> |
28 | #include <linux/smp.h> | ||
29 | 28 | ||
30 | #include <xen/interface/xen.h> | 29 | #include <xen/interface/xen.h> |
31 | #include <xen/interface/physdev.h> | 30 | #include <xen/interface/physdev.h> |
@@ -52,11 +51,25 @@ | |||
52 | 51 | ||
53 | EXPORT_SYMBOL_GPL(hypercall_page); | 52 | EXPORT_SYMBOL_GPL(hypercall_page); |
54 | 53 | ||
55 | DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); | ||
56 | |||
57 | DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); | 54 | DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); |
58 | DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); | 55 | DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); |
59 | DEFINE_PER_CPU(unsigned long, xen_cr3); | 56 | |
57 | /* | ||
58 | * Note about cr3 (pagetable base) values: | ||
59 | * | ||
60 | * xen_cr3 contains the current logical cr3 value; it contains the | ||
61 | * last set cr3. This may not be the current effective cr3, because | ||
62 | * its update may be being lazily deferred. However, a vcpu looking | ||
63 | * at its own cr3 can use this value knowing that everything will | ||
64 | * be self-consistent. | ||
65 | * | ||
66 | * xen_current_cr3 contains the actual vcpu cr3; it is set once the | ||
67 | * hypercall to set the vcpu cr3 is complete (so it may be a little | ||
68 | * out of date, but it will never be set early). If one vcpu is | ||
69 | * looking at another vcpu's cr3 value, it should use this variable. | ||
70 | */ | ||
71 | DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ | ||
72 | DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ | ||
60 | 73 | ||
61 | struct start_info *xen_start_info; | 74 | struct start_info *xen_start_info; |
62 | EXPORT_SYMBOL_GPL(xen_start_info); | 75 | EXPORT_SYMBOL_GPL(xen_start_info); |
@@ -100,7 +113,7 @@ static void __init xen_vcpu_setup(int cpu) | |||
100 | info.mfn = virt_to_mfn(vcpup); | 113 | info.mfn = virt_to_mfn(vcpup); |
101 | info.offset = offset_in_page(vcpup); | 114 | info.offset = offset_in_page(vcpup); |
102 | 115 | ||
103 | printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n", | 116 | printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", |
104 | cpu, vcpup, info.mfn, info.offset); | 117 | cpu, vcpup, info.mfn, info.offset); |
105 | 118 | ||
106 | /* Check to see if the hypervisor will put the vcpu_info | 119 | /* Check to see if the hypervisor will put the vcpu_info |
@@ -124,7 +137,7 @@ static void __init xen_vcpu_setup(int cpu) | |||
124 | static void __init xen_banner(void) | 137 | static void __init xen_banner(void) |
125 | { | 138 | { |
126 | printk(KERN_INFO "Booting paravirtualized kernel on %s\n", | 139 | printk(KERN_INFO "Booting paravirtualized kernel on %s\n", |
127 | paravirt_ops.name); | 140 | pv_info.name); |
128 | printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); | 141 | printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); |
129 | } | 142 | } |
130 | 143 | ||
@@ -249,29 +262,10 @@ static void xen_halt(void) | |||
249 | xen_safe_halt(); | 262 | xen_safe_halt(); |
250 | } | 263 | } |
251 | 264 | ||
252 | static void xen_set_lazy_mode(enum paravirt_lazy_mode mode) | 265 | static void xen_leave_lazy(void) |
253 | { | 266 | { |
254 | BUG_ON(preemptible()); | 267 | paravirt_leave_lazy(paravirt_get_lazy_mode()); |
255 | |||
256 | switch (mode) { | ||
257 | case PARAVIRT_LAZY_NONE: | ||
258 | BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE); | ||
259 | break; | ||
260 | |||
261 | case PARAVIRT_LAZY_MMU: | ||
262 | case PARAVIRT_LAZY_CPU: | ||
263 | BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE); | ||
264 | break; | ||
265 | |||
266 | case PARAVIRT_LAZY_FLUSH: | ||
267 | /* flush if necessary, but don't change state */ | ||
268 | if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE) | ||
269 | xen_mc_flush(); | ||
270 | return; | ||
271 | } | ||
272 | |||
273 | xen_mc_flush(); | 268 | xen_mc_flush(); |
274 | x86_write_percpu(xen_lazy_mode, mode); | ||
275 | } | 269 | } |
276 | 270 | ||
277 | static unsigned long xen_store_tr(void) | 271 | static unsigned long xen_store_tr(void) |
@@ -358,7 +352,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) | |||
358 | * loaded properly. This will go away as soon as Xen has been | 352 | * loaded properly. This will go away as soon as Xen has been |
359 | * modified to not save/restore %gs for normal hypercalls. | 353 | * modified to not save/restore %gs for normal hypercalls. |
360 | */ | 354 | */ |
361 | if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU) | 355 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) |
362 | loadsegment(gs, 0); | 356 | loadsegment(gs, 0); |
363 | } | 357 | } |
364 | 358 | ||
@@ -632,32 +626,36 @@ static unsigned long xen_read_cr3(void) | |||
632 | return x86_read_percpu(xen_cr3); | 626 | return x86_read_percpu(xen_cr3); |
633 | } | 627 | } |
634 | 628 | ||
629 | static void set_current_cr3(void *v) | ||
630 | { | ||
631 | x86_write_percpu(xen_current_cr3, (unsigned long)v); | ||
632 | } | ||
633 | |||
635 | static void xen_write_cr3(unsigned long cr3) | 634 | static void xen_write_cr3(unsigned long cr3) |
636 | { | 635 | { |
636 | struct mmuext_op *op; | ||
637 | struct multicall_space mcs; | ||
638 | unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); | ||
639 | |||
637 | BUG_ON(preemptible()); | 640 | BUG_ON(preemptible()); |
638 | 641 | ||
639 | if (cr3 == x86_read_percpu(xen_cr3)) { | 642 | mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ |
640 | /* just a simple tlb flush */ | ||
641 | xen_flush_tlb(); | ||
642 | return; | ||
643 | } | ||
644 | 643 | ||
644 | /* Update while interrupts are disabled, so it's atomic with | ||
645 | respect to ipis */ | ||
645 | x86_write_percpu(xen_cr3, cr3); | 646 | x86_write_percpu(xen_cr3, cr3); |
646 | 647 | ||
648 | op = mcs.args; | ||
649 | op->cmd = MMUEXT_NEW_BASEPTR; | ||
650 | op->arg1.mfn = mfn; | ||
647 | 651 | ||
648 | { | 652 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); |
649 | struct mmuext_op *op; | ||
650 | struct multicall_space mcs = xen_mc_entry(sizeof(*op)); | ||
651 | unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); | ||
652 | |||
653 | op = mcs.args; | ||
654 | op->cmd = MMUEXT_NEW_BASEPTR; | ||
655 | op->arg1.mfn = mfn; | ||
656 | 653 | ||
657 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | 654 | /* Update xen_current_cr3 once the batch has actually |
655 | been submitted. */ | ||
656 | xen_mc_callback(set_current_cr3, (void *)cr3); | ||
658 | 657 | ||
659 | xen_mc_issue(PARAVIRT_LAZY_CPU); | 658 | xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ |
660 | } | ||
661 | } | 659 | } |
662 | 660 | ||
663 | /* Early in boot, while setting up the initial pagetable, assume | 661 | /* Early in boot, while setting up the initial pagetable, assume |
@@ -668,6 +666,15 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) | |||
668 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); | 666 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); |
669 | } | 667 | } |
670 | 668 | ||
669 | static void pin_pagetable_pfn(unsigned level, unsigned long pfn) | ||
670 | { | ||
671 | struct mmuext_op op; | ||
672 | op.cmd = level; | ||
673 | op.arg1.mfn = pfn_to_mfn(pfn); | ||
674 | if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) | ||
675 | BUG(); | ||
676 | } | ||
677 | |||
671 | /* This needs to make sure the new pte page is pinned iff its being | 678 | /* This needs to make sure the new pte page is pinned iff its being |
672 | attached to a pinned pagetable. */ | 679 | attached to a pinned pagetable. */ |
673 | static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) | 680 | static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) |
@@ -677,9 +684,10 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) | |||
677 | if (PagePinned(virt_to_page(mm->pgd))) { | 684 | if (PagePinned(virt_to_page(mm->pgd))) { |
678 | SetPagePinned(page); | 685 | SetPagePinned(page); |
679 | 686 | ||
680 | if (!PageHighMem(page)) | 687 | if (!PageHighMem(page)) { |
681 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); | 688 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); |
682 | else | 689 | pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); |
690 | } else | ||
683 | /* make sure there are no stray mappings of | 691 | /* make sure there are no stray mappings of |
684 | this page */ | 692 | this page */ |
685 | kmap_flush_unused(); | 693 | kmap_flush_unused(); |
@@ -692,8 +700,10 @@ static void xen_release_pt(u32 pfn) | |||
692 | struct page *page = pfn_to_page(pfn); | 700 | struct page *page = pfn_to_page(pfn); |
693 | 701 | ||
694 | if (PagePinned(page)) { | 702 | if (PagePinned(page)) { |
695 | if (!PageHighMem(page)) | 703 | if (!PageHighMem(page)) { |
704 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); | ||
696 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); | 705 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); |
706 | } | ||
697 | } | 707 | } |
698 | } | 708 | } |
699 | 709 | ||
@@ -738,7 +748,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base) | |||
738 | pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; | 748 | pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; |
739 | 749 | ||
740 | /* special set_pte for pagetable initialization */ | 750 | /* special set_pte for pagetable initialization */ |
741 | paravirt_ops.set_pte = xen_set_pte_init; | 751 | pv_mmu_ops.set_pte = xen_set_pte_init; |
742 | 752 | ||
743 | init_mm.pgd = base; | 753 | init_mm.pgd = base; |
744 | /* | 754 | /* |
@@ -785,8 +795,8 @@ static __init void xen_pagetable_setup_done(pgd_t *base) | |||
785 | { | 795 | { |
786 | /* This will work as long as patching hasn't happened yet | 796 | /* This will work as long as patching hasn't happened yet |
787 | (which it hasn't) */ | 797 | (which it hasn't) */ |
788 | paravirt_ops.alloc_pt = xen_alloc_pt; | 798 | pv_mmu_ops.alloc_pt = xen_alloc_pt; |
789 | paravirt_ops.set_pte = xen_set_pte; | 799 | pv_mmu_ops.set_pte = xen_set_pte; |
790 | 800 | ||
791 | if (!xen_feature(XENFEAT_auto_translated_physmap)) { | 801 | if (!xen_feature(XENFEAT_auto_translated_physmap)) { |
792 | /* | 802 | /* |
@@ -808,15 +818,15 @@ static __init void xen_pagetable_setup_done(pgd_t *base) | |||
808 | /* Actually pin the pagetable down, but we can't set PG_pinned | 818 | /* Actually pin the pagetable down, but we can't set PG_pinned |
809 | yet because the page structures don't exist yet. */ | 819 | yet because the page structures don't exist yet. */ |
810 | { | 820 | { |
811 | struct mmuext_op op; | 821 | unsigned level; |
822 | |||
812 | #ifdef CONFIG_X86_PAE | 823 | #ifdef CONFIG_X86_PAE |
813 | op.cmd = MMUEXT_PIN_L3_TABLE; | 824 | level = MMUEXT_PIN_L3_TABLE; |
814 | #else | 825 | #else |
815 | op.cmd = MMUEXT_PIN_L3_TABLE; | 826 | level = MMUEXT_PIN_L2_TABLE; |
816 | #endif | 827 | #endif |
817 | op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base))); | 828 | |
818 | if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) | 829 | pin_pagetable_pfn(level, PFN_DOWN(__pa(base))); |
819 | BUG(); | ||
820 | } | 830 | } |
821 | } | 831 | } |
822 | 832 | ||
@@ -833,12 +843,12 @@ void __init xen_setup_vcpu_info_placement(void) | |||
833 | if (have_vcpu_info_placement) { | 843 | if (have_vcpu_info_placement) { |
834 | printk(KERN_INFO "Xen: using vcpu_info placement\n"); | 844 | printk(KERN_INFO "Xen: using vcpu_info placement\n"); |
835 | 845 | ||
836 | paravirt_ops.save_fl = xen_save_fl_direct; | 846 | pv_irq_ops.save_fl = xen_save_fl_direct; |
837 | paravirt_ops.restore_fl = xen_restore_fl_direct; | 847 | pv_irq_ops.restore_fl = xen_restore_fl_direct; |
838 | paravirt_ops.irq_disable = xen_irq_disable_direct; | 848 | pv_irq_ops.irq_disable = xen_irq_disable_direct; |
839 | paravirt_ops.irq_enable = xen_irq_enable_direct; | 849 | pv_irq_ops.irq_enable = xen_irq_enable_direct; |
840 | paravirt_ops.read_cr2 = xen_read_cr2_direct; | 850 | pv_mmu_ops.read_cr2 = xen_read_cr2_direct; |
841 | paravirt_ops.iret = xen_iret_direct; | 851 | pv_cpu_ops.iret = xen_iret_direct; |
842 | } | 852 | } |
843 | } | 853 | } |
844 | 854 | ||
@@ -850,8 +860,8 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, | |||
850 | 860 | ||
851 | start = end = reloc = NULL; | 861 | start = end = reloc = NULL; |
852 | 862 | ||
853 | #define SITE(x) \ | 863 | #define SITE(op, x) \ |
854 | case PARAVIRT_PATCH(x): \ | 864 | case PARAVIRT_PATCH(op.x): \ |
855 | if (have_vcpu_info_placement) { \ | 865 | if (have_vcpu_info_placement) { \ |
856 | start = (char *)xen_##x##_direct; \ | 866 | start = (char *)xen_##x##_direct; \ |
857 | end = xen_##x##_direct_end; \ | 867 | end = xen_##x##_direct_end; \ |
@@ -860,10 +870,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, | |||
860 | goto patch_site | 870 | goto patch_site |
861 | 871 | ||
862 | switch (type) { | 872 | switch (type) { |
863 | SITE(irq_enable); | 873 | SITE(pv_irq_ops, irq_enable); |
864 | SITE(irq_disable); | 874 | SITE(pv_irq_ops, irq_disable); |
865 | SITE(save_fl); | 875 | SITE(pv_irq_ops, save_fl); |
866 | SITE(restore_fl); | 876 | SITE(pv_irq_ops, restore_fl); |
867 | #undef SITE | 877 | #undef SITE |
868 | 878 | ||
869 | patch_site: | 879 | patch_site: |
@@ -895,26 +905,32 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, | |||
895 | return ret; | 905 | return ret; |
896 | } | 906 | } |
897 | 907 | ||
898 | static const struct paravirt_ops xen_paravirt_ops __initdata = { | 908 | static const struct pv_info xen_info __initdata = { |
899 | .paravirt_enabled = 1, | 909 | .paravirt_enabled = 1, |
900 | .shared_kernel_pmd = 0, | 910 | .shared_kernel_pmd = 0, |
901 | 911 | ||
902 | .name = "Xen", | 912 | .name = "Xen", |
903 | .banner = xen_banner, | 913 | }; |
904 | 914 | ||
915 | static const struct pv_init_ops xen_init_ops __initdata = { | ||
905 | .patch = xen_patch, | 916 | .patch = xen_patch, |
906 | 917 | ||
918 | .banner = xen_banner, | ||
907 | .memory_setup = xen_memory_setup, | 919 | .memory_setup = xen_memory_setup, |
908 | .arch_setup = xen_arch_setup, | 920 | .arch_setup = xen_arch_setup, |
909 | .init_IRQ = xen_init_IRQ, | ||
910 | .post_allocator_init = xen_mark_init_mm_pinned, | 921 | .post_allocator_init = xen_mark_init_mm_pinned, |
922 | }; | ||
911 | 923 | ||
924 | static const struct pv_time_ops xen_time_ops __initdata = { | ||
912 | .time_init = xen_time_init, | 925 | .time_init = xen_time_init, |
926 | |||
913 | .set_wallclock = xen_set_wallclock, | 927 | .set_wallclock = xen_set_wallclock, |
914 | .get_wallclock = xen_get_wallclock, | 928 | .get_wallclock = xen_get_wallclock, |
915 | .get_cpu_khz = xen_cpu_khz, | 929 | .get_cpu_khz = xen_cpu_khz, |
916 | .sched_clock = xen_sched_clock, | 930 | .sched_clock = xen_sched_clock, |
931 | }; | ||
917 | 932 | ||
933 | static const struct pv_cpu_ops xen_cpu_ops __initdata = { | ||
918 | .cpuid = xen_cpuid, | 934 | .cpuid = xen_cpuid, |
919 | 935 | ||
920 | .set_debugreg = xen_set_debugreg, | 936 | .set_debugreg = xen_set_debugreg, |
@@ -925,22 +941,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { | |||
925 | .read_cr0 = native_read_cr0, | 941 | .read_cr0 = native_read_cr0, |
926 | .write_cr0 = native_write_cr0, | 942 | .write_cr0 = native_write_cr0, |
927 | 943 | ||
928 | .read_cr2 = xen_read_cr2, | ||
929 | .write_cr2 = xen_write_cr2, | ||
930 | |||
931 | .read_cr3 = xen_read_cr3, | ||
932 | .write_cr3 = xen_write_cr3, | ||
933 | |||
934 | .read_cr4 = native_read_cr4, | 944 | .read_cr4 = native_read_cr4, |
935 | .read_cr4_safe = native_read_cr4_safe, | 945 | .read_cr4_safe = native_read_cr4_safe, |
936 | .write_cr4 = xen_write_cr4, | 946 | .write_cr4 = xen_write_cr4, |
937 | 947 | ||
938 | .save_fl = xen_save_fl, | ||
939 | .restore_fl = xen_restore_fl, | ||
940 | .irq_disable = xen_irq_disable, | ||
941 | .irq_enable = xen_irq_enable, | ||
942 | .safe_halt = xen_safe_halt, | ||
943 | .halt = xen_halt, | ||
944 | .wbinvd = native_wbinvd, | 948 | .wbinvd = native_wbinvd, |
945 | 949 | ||
946 | .read_msr = native_read_msr_safe, | 950 | .read_msr = native_read_msr_safe, |
@@ -969,6 +973,23 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { | |||
969 | .set_iopl_mask = xen_set_iopl_mask, | 973 | .set_iopl_mask = xen_set_iopl_mask, |
970 | .io_delay = xen_io_delay, | 974 | .io_delay = xen_io_delay, |
971 | 975 | ||
976 | .lazy_mode = { | ||
977 | .enter = paravirt_enter_lazy_cpu, | ||
978 | .leave = xen_leave_lazy, | ||
979 | }, | ||
980 | }; | ||
981 | |||
982 | static const struct pv_irq_ops xen_irq_ops __initdata = { | ||
983 | .init_IRQ = xen_init_IRQ, | ||
984 | .save_fl = xen_save_fl, | ||
985 | .restore_fl = xen_restore_fl, | ||
986 | .irq_disable = xen_irq_disable, | ||
987 | .irq_enable = xen_irq_enable, | ||
988 | .safe_halt = xen_safe_halt, | ||
989 | .halt = xen_halt, | ||
990 | }; | ||
991 | |||
992 | static const struct pv_apic_ops xen_apic_ops __initdata = { | ||
972 | #ifdef CONFIG_X86_LOCAL_APIC | 993 | #ifdef CONFIG_X86_LOCAL_APIC |
973 | .apic_write = xen_apic_write, | 994 | .apic_write = xen_apic_write, |
974 | .apic_write_atomic = xen_apic_write, | 995 | .apic_write_atomic = xen_apic_write, |
@@ -977,6 +998,17 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { | |||
977 | .setup_secondary_clock = paravirt_nop, | 998 | .setup_secondary_clock = paravirt_nop, |
978 | .startup_ipi_hook = paravirt_nop, | 999 | .startup_ipi_hook = paravirt_nop, |
979 | #endif | 1000 | #endif |
1001 | }; | ||
1002 | |||
1003 | static const struct pv_mmu_ops xen_mmu_ops __initdata = { | ||
1004 | .pagetable_setup_start = xen_pagetable_setup_start, | ||
1005 | .pagetable_setup_done = xen_pagetable_setup_done, | ||
1006 | |||
1007 | .read_cr2 = xen_read_cr2, | ||
1008 | .write_cr2 = xen_write_cr2, | ||
1009 | |||
1010 | .read_cr3 = xen_read_cr3, | ||
1011 | .write_cr3 = xen_write_cr3, | ||
980 | 1012 | ||
981 | .flush_tlb_user = xen_flush_tlb, | 1013 | .flush_tlb_user = xen_flush_tlb, |
982 | .flush_tlb_kernel = xen_flush_tlb, | 1014 | .flush_tlb_kernel = xen_flush_tlb, |
@@ -986,9 +1018,6 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { | |||
986 | .pte_update = paravirt_nop, | 1018 | .pte_update = paravirt_nop, |
987 | .pte_update_defer = paravirt_nop, | 1019 | .pte_update_defer = paravirt_nop, |
988 | 1020 | ||
989 | .pagetable_setup_start = xen_pagetable_setup_start, | ||
990 | .pagetable_setup_done = xen_pagetable_setup_done, | ||
991 | |||
992 | .alloc_pt = xen_alloc_pt_init, | 1021 | .alloc_pt = xen_alloc_pt_init, |
993 | .release_pt = xen_release_pt, | 1022 | .release_pt = xen_release_pt, |
994 | .alloc_pd = paravirt_nop, | 1023 | .alloc_pd = paravirt_nop, |
@@ -1024,7 +1053,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { | |||
1024 | .dup_mmap = xen_dup_mmap, | 1053 | .dup_mmap = xen_dup_mmap, |
1025 | .exit_mmap = xen_exit_mmap, | 1054 | .exit_mmap = xen_exit_mmap, |
1026 | 1055 | ||
1027 | .set_lazy_mode = xen_set_lazy_mode, | 1056 | .lazy_mode = { |
1057 | .enter = paravirt_enter_lazy_mmu, | ||
1058 | .leave = xen_leave_lazy, | ||
1059 | }, | ||
1028 | }; | 1060 | }; |
1029 | 1061 | ||
1030 | #ifdef CONFIG_SMP | 1062 | #ifdef CONFIG_SMP |
@@ -1080,6 +1112,17 @@ static const struct machine_ops __initdata xen_machine_ops = { | |||
1080 | }; | 1112 | }; |
1081 | 1113 | ||
1082 | 1114 | ||
1115 | static void __init xen_reserve_top(void) | ||
1116 | { | ||
1117 | unsigned long top = HYPERVISOR_VIRT_START; | ||
1118 | struct xen_platform_parameters pp; | ||
1119 | |||
1120 | if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) | ||
1121 | top = pp.virt_start; | ||
1122 | |||
1123 | reserve_top_address(-top + 2 * PAGE_SIZE); | ||
1124 | } | ||
1125 | |||
1083 | /* First C function to be called on Xen boot */ | 1126 | /* First C function to be called on Xen boot */ |
1084 | asmlinkage void __init xen_start_kernel(void) | 1127 | asmlinkage void __init xen_start_kernel(void) |
1085 | { | 1128 | { |
@@ -1091,7 +1134,14 @@ asmlinkage void __init xen_start_kernel(void) | |||
1091 | BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0); | 1134 | BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0); |
1092 | 1135 | ||
1093 | /* Install Xen paravirt ops */ | 1136 | /* Install Xen paravirt ops */ |
1094 | paravirt_ops = xen_paravirt_ops; | 1137 | pv_info = xen_info; |
1138 | pv_init_ops = xen_init_ops; | ||
1139 | pv_time_ops = xen_time_ops; | ||
1140 | pv_cpu_ops = xen_cpu_ops; | ||
1141 | pv_irq_ops = xen_irq_ops; | ||
1142 | pv_apic_ops = xen_apic_ops; | ||
1143 | pv_mmu_ops = xen_mmu_ops; | ||
1144 | |||
1095 | machine_ops = xen_machine_ops; | 1145 | machine_ops = xen_machine_ops; |
1096 | 1146 | ||
1097 | #ifdef CONFIG_SMP | 1147 | #ifdef CONFIG_SMP |
@@ -1113,6 +1163,7 @@ asmlinkage void __init xen_start_kernel(void) | |||
1113 | /* keep using Xen gdt for now; no urgent need to change it */ | 1163 | /* keep using Xen gdt for now; no urgent need to change it */ |
1114 | 1164 | ||
1115 | x86_write_percpu(xen_cr3, __pa(pgd)); | 1165 | x86_write_percpu(xen_cr3, __pa(pgd)); |
1166 | x86_write_percpu(xen_current_cr3, __pa(pgd)); | ||
1116 | 1167 | ||
1117 | #ifdef CONFIG_SMP | 1168 | #ifdef CONFIG_SMP |
1118 | /* Don't do the full vcpu_info placement stuff until we have a | 1169 | /* Don't do the full vcpu_info placement stuff until we have a |
@@ -1124,12 +1175,12 @@ asmlinkage void __init xen_start_kernel(void) | |||
1124 | xen_setup_vcpu_info_placement(); | 1175 | xen_setup_vcpu_info_placement(); |
1125 | #endif | 1176 | #endif |
1126 | 1177 | ||
1127 | paravirt_ops.kernel_rpl = 1; | 1178 | pv_info.kernel_rpl = 1; |
1128 | if (xen_feature(XENFEAT_supervisor_mode_kernel)) | 1179 | if (xen_feature(XENFEAT_supervisor_mode_kernel)) |
1129 | paravirt_ops.kernel_rpl = 0; | 1180 | pv_info.kernel_rpl = 0; |
1130 | 1181 | ||
1131 | /* set the limit of our address space */ | 1182 | /* set the limit of our address space */ |
1132 | reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE); | 1183 | xen_reserve_top(); |
1133 | 1184 | ||
1134 | /* set up basic CPUID stuff */ | 1185 | /* set up basic CPUID stuff */ |
1135 | cpu_detect(&new_cpu_data); | 1186 | cpu_detect(&new_cpu_data); |
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 874db0cd1d2a..b2e32f9d0071 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c | |||
@@ -41,7 +41,6 @@ | |||
41 | #include <linux/sched.h> | 41 | #include <linux/sched.h> |
42 | #include <linux/highmem.h> | 42 | #include <linux/highmem.h> |
43 | #include <linux/bug.h> | 43 | #include <linux/bug.h> |
44 | #include <linux/sched.h> | ||
45 | 44 | ||
46 | #include <asm/pgtable.h> | 45 | #include <asm/pgtable.h> |
47 | #include <asm/tlbflush.h> | 46 | #include <asm/tlbflush.h> |
@@ -155,7 +154,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, | |||
155 | pte_t *ptep, pte_t pteval) | 154 | pte_t *ptep, pte_t pteval) |
156 | { | 155 | { |
157 | if (mm == current->mm || mm == &init_mm) { | 156 | if (mm == current->mm || mm == &init_mm) { |
158 | if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) { | 157 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { |
159 | struct multicall_space mcs; | 158 | struct multicall_space mcs; |
160 | mcs = xen_mc_entry(0); | 159 | mcs = xen_mc_entry(0); |
161 | 160 | ||
@@ -304,7 +303,12 @@ pgd_t xen_make_pgd(unsigned long pgd) | |||
304 | } | 303 | } |
305 | #endif /* CONFIG_X86_PAE */ | 304 | #endif /* CONFIG_X86_PAE */ |
306 | 305 | ||
307 | 306 | enum pt_level { | |
307 | PT_PGD, | ||
308 | PT_PUD, | ||
309 | PT_PMD, | ||
310 | PT_PTE | ||
311 | }; | ||
308 | 312 | ||
309 | /* | 313 | /* |
310 | (Yet another) pagetable walker. This one is intended for pinning a | 314 | (Yet another) pagetable walker. This one is intended for pinning a |
@@ -316,7 +320,7 @@ pgd_t xen_make_pgd(unsigned long pgd) | |||
316 | FIXADDR_TOP. But the important bit is that we don't pin beyond | 320 | FIXADDR_TOP. But the important bit is that we don't pin beyond |
317 | there, because then we start getting into Xen's ptes. | 321 | there, because then we start getting into Xen's ptes. |
318 | */ | 322 | */ |
319 | static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), | 323 | static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), |
320 | unsigned long limit) | 324 | unsigned long limit) |
321 | { | 325 | { |
322 | pgd_t *pgd = pgd_base; | 326 | pgd_t *pgd = pgd_base; |
@@ -341,7 +345,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), | |||
341 | pud = pud_offset(pgd, 0); | 345 | pud = pud_offset(pgd, 0); |
342 | 346 | ||
343 | if (PTRS_PER_PUD > 1) /* not folded */ | 347 | if (PTRS_PER_PUD > 1) /* not folded */ |
344 | flush |= (*func)(virt_to_page(pud), 0); | 348 | flush |= (*func)(virt_to_page(pud), PT_PUD); |
345 | 349 | ||
346 | for (; addr != pud_limit; pud++, addr = pud_next) { | 350 | for (; addr != pud_limit; pud++, addr = pud_next) { |
347 | pmd_t *pmd; | 351 | pmd_t *pmd; |
@@ -360,7 +364,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), | |||
360 | pmd = pmd_offset(pud, 0); | 364 | pmd = pmd_offset(pud, 0); |
361 | 365 | ||
362 | if (PTRS_PER_PMD > 1) /* not folded */ | 366 | if (PTRS_PER_PMD > 1) /* not folded */ |
363 | flush |= (*func)(virt_to_page(pmd), 0); | 367 | flush |= (*func)(virt_to_page(pmd), PT_PMD); |
364 | 368 | ||
365 | for (; addr != pmd_limit; pmd++) { | 369 | for (; addr != pmd_limit; pmd++) { |
366 | addr += (PAGE_SIZE * PTRS_PER_PTE); | 370 | addr += (PAGE_SIZE * PTRS_PER_PTE); |
@@ -372,17 +376,47 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), | |||
372 | if (pmd_none(*pmd)) | 376 | if (pmd_none(*pmd)) |
373 | continue; | 377 | continue; |
374 | 378 | ||
375 | flush |= (*func)(pmd_page(*pmd), 0); | 379 | flush |= (*func)(pmd_page(*pmd), PT_PTE); |
376 | } | 380 | } |
377 | } | 381 | } |
378 | } | 382 | } |
379 | 383 | ||
380 | flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH); | 384 | flush |= (*func)(virt_to_page(pgd_base), PT_PGD); |
381 | 385 | ||
382 | return flush; | 386 | return flush; |
383 | } | 387 | } |
384 | 388 | ||
385 | static int pin_page(struct page *page, unsigned flags) | 389 | static spinlock_t *lock_pte(struct page *page) |
390 | { | ||
391 | spinlock_t *ptl = NULL; | ||
392 | |||
393 | #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS | ||
394 | ptl = __pte_lockptr(page); | ||
395 | spin_lock(ptl); | ||
396 | #endif | ||
397 | |||
398 | return ptl; | ||
399 | } | ||
400 | |||
401 | static void do_unlock(void *v) | ||
402 | { | ||
403 | spinlock_t *ptl = v; | ||
404 | spin_unlock(ptl); | ||
405 | } | ||
406 | |||
407 | static void xen_do_pin(unsigned level, unsigned long pfn) | ||
408 | { | ||
409 | struct mmuext_op *op; | ||
410 | struct multicall_space mcs; | ||
411 | |||
412 | mcs = __xen_mc_entry(sizeof(*op)); | ||
413 | op = mcs.args; | ||
414 | op->cmd = level; | ||
415 | op->arg1.mfn = pfn_to_mfn(pfn); | ||
416 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | ||
417 | } | ||
418 | |||
419 | static int pin_page(struct page *page, enum pt_level level) | ||
386 | { | 420 | { |
387 | unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags); | 421 | unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags); |
388 | int flush; | 422 | int flush; |
@@ -397,12 +431,26 @@ static int pin_page(struct page *page, unsigned flags) | |||
397 | void *pt = lowmem_page_address(page); | 431 | void *pt = lowmem_page_address(page); |
398 | unsigned long pfn = page_to_pfn(page); | 432 | unsigned long pfn = page_to_pfn(page); |
399 | struct multicall_space mcs = __xen_mc_entry(0); | 433 | struct multicall_space mcs = __xen_mc_entry(0); |
434 | spinlock_t *ptl; | ||
400 | 435 | ||
401 | flush = 0; | 436 | flush = 0; |
402 | 437 | ||
438 | ptl = NULL; | ||
439 | if (level == PT_PTE) | ||
440 | ptl = lock_pte(page); | ||
441 | |||
403 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, | 442 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, |
404 | pfn_pte(pfn, PAGE_KERNEL_RO), | 443 | pfn_pte(pfn, PAGE_KERNEL_RO), |
405 | flags); | 444 | level == PT_PGD ? UVMF_TLB_FLUSH : 0); |
445 | |||
446 | if (level == PT_PTE) | ||
447 | xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); | ||
448 | |||
449 | if (ptl) { | ||
450 | /* Queue a deferred unlock for when this batch | ||
451 | is completed. */ | ||
452 | xen_mc_callback(do_unlock, ptl); | ||
453 | } | ||
406 | } | 454 | } |
407 | 455 | ||
408 | return flush; | 456 | return flush; |
@@ -413,8 +461,7 @@ static int pin_page(struct page *page, unsigned flags) | |||
413 | read-only, and can be pinned. */ | 461 | read-only, and can be pinned. */ |
414 | void xen_pgd_pin(pgd_t *pgd) | 462 | void xen_pgd_pin(pgd_t *pgd) |
415 | { | 463 | { |
416 | struct multicall_space mcs; | 464 | unsigned level; |
417 | struct mmuext_op *op; | ||
418 | 465 | ||
419 | xen_mc_batch(); | 466 | xen_mc_batch(); |
420 | 467 | ||
@@ -425,16 +472,13 @@ void xen_pgd_pin(pgd_t *pgd) | |||
425 | xen_mc_batch(); | 472 | xen_mc_batch(); |
426 | } | 473 | } |
427 | 474 | ||
428 | mcs = __xen_mc_entry(sizeof(*op)); | ||
429 | op = mcs.args; | ||
430 | |||
431 | #ifdef CONFIG_X86_PAE | 475 | #ifdef CONFIG_X86_PAE |
432 | op->cmd = MMUEXT_PIN_L3_TABLE; | 476 | level = MMUEXT_PIN_L3_TABLE; |
433 | #else | 477 | #else |
434 | op->cmd = MMUEXT_PIN_L2_TABLE; | 478 | level = MMUEXT_PIN_L2_TABLE; |
435 | #endif | 479 | #endif |
436 | op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); | 480 | |
437 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | 481 | xen_do_pin(level, PFN_DOWN(__pa(pgd))); |
438 | 482 | ||
439 | xen_mc_issue(0); | 483 | xen_mc_issue(0); |
440 | } | 484 | } |
@@ -442,7 +486,7 @@ void xen_pgd_pin(pgd_t *pgd) | |||
442 | /* The init_mm pagetable is really pinned as soon as its created, but | 486 | /* The init_mm pagetable is really pinned as soon as its created, but |
443 | that's before we have page structures to store the bits. So do all | 487 | that's before we have page structures to store the bits. So do all |
444 | the book-keeping now. */ | 488 | the book-keeping now. */ |
445 | static __init int mark_pinned(struct page *page, unsigned flags) | 489 | static __init int mark_pinned(struct page *page, enum pt_level level) |
446 | { | 490 | { |
447 | SetPagePinned(page); | 491 | SetPagePinned(page); |
448 | return 0; | 492 | return 0; |
@@ -453,18 +497,32 @@ void __init xen_mark_init_mm_pinned(void) | |||
453 | pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); | 497 | pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); |
454 | } | 498 | } |
455 | 499 | ||
456 | static int unpin_page(struct page *page, unsigned flags) | 500 | static int unpin_page(struct page *page, enum pt_level level) |
457 | { | 501 | { |
458 | unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); | 502 | unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); |
459 | 503 | ||
460 | if (pgfl && !PageHighMem(page)) { | 504 | if (pgfl && !PageHighMem(page)) { |
461 | void *pt = lowmem_page_address(page); | 505 | void *pt = lowmem_page_address(page); |
462 | unsigned long pfn = page_to_pfn(page); | 506 | unsigned long pfn = page_to_pfn(page); |
463 | struct multicall_space mcs = __xen_mc_entry(0); | 507 | spinlock_t *ptl = NULL; |
508 | struct multicall_space mcs; | ||
509 | |||
510 | if (level == PT_PTE) { | ||
511 | ptl = lock_pte(page); | ||
512 | |||
513 | xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); | ||
514 | } | ||
515 | |||
516 | mcs = __xen_mc_entry(0); | ||
464 | 517 | ||
465 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, | 518 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, |
466 | pfn_pte(pfn, PAGE_KERNEL), | 519 | pfn_pte(pfn, PAGE_KERNEL), |
467 | flags); | 520 | level == PT_PGD ? UVMF_TLB_FLUSH : 0); |
521 | |||
522 | if (ptl) { | ||
523 | /* unlock when batch completed */ | ||
524 | xen_mc_callback(do_unlock, ptl); | ||
525 | } | ||
468 | } | 526 | } |
469 | 527 | ||
470 | return 0; /* never need to flush on unpin */ | 528 | return 0; /* never need to flush on unpin */ |
@@ -473,18 +531,9 @@ static int unpin_page(struct page *page, unsigned flags) | |||
473 | /* Release a pagetables pages back as normal RW */ | 531 | /* Release a pagetables pages back as normal RW */ |
474 | static void xen_pgd_unpin(pgd_t *pgd) | 532 | static void xen_pgd_unpin(pgd_t *pgd) |
475 | { | 533 | { |
476 | struct mmuext_op *op; | ||
477 | struct multicall_space mcs; | ||
478 | |||
479 | xen_mc_batch(); | 534 | xen_mc_batch(); |
480 | 535 | ||
481 | mcs = __xen_mc_entry(sizeof(*op)); | 536 | xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); |
482 | |||
483 | op = mcs.args; | ||
484 | op->cmd = MMUEXT_UNPIN_TABLE; | ||
485 | op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); | ||
486 | |||
487 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | ||
488 | 537 | ||
489 | pgd_walk(pgd, unpin_page, TASK_SIZE); | 538 | pgd_walk(pgd, unpin_page, TASK_SIZE); |
490 | 539 | ||
@@ -515,20 +564,43 @@ static void drop_other_mm_ref(void *info) | |||
515 | 564 | ||
516 | if (__get_cpu_var(cpu_tlbstate).active_mm == mm) | 565 | if (__get_cpu_var(cpu_tlbstate).active_mm == mm) |
517 | leave_mm(smp_processor_id()); | 566 | leave_mm(smp_processor_id()); |
567 | |||
568 | /* If this cpu still has a stale cr3 reference, then make sure | ||
569 | it has been flushed. */ | ||
570 | if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) { | ||
571 | load_cr3(swapper_pg_dir); | ||
572 | arch_flush_lazy_cpu_mode(); | ||
573 | } | ||
518 | } | 574 | } |
519 | 575 | ||
520 | static void drop_mm_ref(struct mm_struct *mm) | 576 | static void drop_mm_ref(struct mm_struct *mm) |
521 | { | 577 | { |
578 | cpumask_t mask; | ||
579 | unsigned cpu; | ||
580 | |||
522 | if (current->active_mm == mm) { | 581 | if (current->active_mm == mm) { |
523 | if (current->mm == mm) | 582 | if (current->mm == mm) |
524 | load_cr3(swapper_pg_dir); | 583 | load_cr3(swapper_pg_dir); |
525 | else | 584 | else |
526 | leave_mm(smp_processor_id()); | 585 | leave_mm(smp_processor_id()); |
586 | arch_flush_lazy_cpu_mode(); | ||
527 | } | 587 | } |
528 | 588 | ||
529 | if (!cpus_empty(mm->cpu_vm_mask)) | 589 | /* Get the "official" set of cpus referring to our pagetable. */ |
530 | xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref, | 590 | mask = mm->cpu_vm_mask; |
531 | mm, 1); | 591 | |
592 | /* It's possible that a vcpu may have a stale reference to our | ||
593 | cr3, because its in lazy mode, and it hasn't yet flushed | ||
594 | its set of pending hypercalls yet. In this case, we can | ||
595 | look at its actual current cr3 value, and force it to flush | ||
596 | if needed. */ | ||
597 | for_each_online_cpu(cpu) { | ||
598 | if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) | ||
599 | cpu_set(cpu, mask); | ||
600 | } | ||
601 | |||
602 | if (!cpus_empty(mask)) | ||
603 | xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); | ||
532 | } | 604 | } |
533 | #else | 605 | #else |
534 | static void drop_mm_ref(struct mm_struct *mm) | 606 | static void drop_mm_ref(struct mm_struct *mm) |
@@ -563,5 +635,6 @@ void xen_exit_mmap(struct mm_struct *mm) | |||
563 | /* pgd may not be pinned in the error exit path of execve */ | 635 | /* pgd may not be pinned in the error exit path of execve */ |
564 | if (PagePinned(virt_to_page(mm->pgd))) | 636 | if (PagePinned(virt_to_page(mm->pgd))) |
565 | xen_pgd_unpin(mm->pgd); | 637 | xen_pgd_unpin(mm->pgd); |
638 | |||
566 | spin_unlock(&mm->page_table_lock); | 639 | spin_unlock(&mm->page_table_lock); |
567 | } | 640 | } |
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index c837e8e463db..5e6f36f6d876 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c | |||
@@ -26,13 +26,22 @@ | |||
26 | 26 | ||
27 | #include "multicalls.h" | 27 | #include "multicalls.h" |
28 | 28 | ||
29 | #define MC_DEBUG 1 | ||
30 | |||
29 | #define MC_BATCH 32 | 31 | #define MC_BATCH 32 |
30 | #define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) | 32 | #define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) |
31 | 33 | ||
32 | struct mc_buffer { | 34 | struct mc_buffer { |
33 | struct multicall_entry entries[MC_BATCH]; | 35 | struct multicall_entry entries[MC_BATCH]; |
36 | #if MC_DEBUG | ||
37 | struct multicall_entry debug[MC_BATCH]; | ||
38 | #endif | ||
34 | u64 args[MC_ARGS]; | 39 | u64 args[MC_ARGS]; |
35 | unsigned mcidx, argidx; | 40 | struct callback { |
41 | void (*fn)(void *); | ||
42 | void *data; | ||
43 | } callbacks[MC_BATCH]; | ||
44 | unsigned mcidx, argidx, cbidx; | ||
36 | }; | 45 | }; |
37 | 46 | ||
38 | static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); | 47 | static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); |
@@ -43,6 +52,7 @@ void xen_mc_flush(void) | |||
43 | struct mc_buffer *b = &__get_cpu_var(mc_buffer); | 52 | struct mc_buffer *b = &__get_cpu_var(mc_buffer); |
44 | int ret = 0; | 53 | int ret = 0; |
45 | unsigned long flags; | 54 | unsigned long flags; |
55 | int i; | ||
46 | 56 | ||
47 | BUG_ON(preemptible()); | 57 | BUG_ON(preemptible()); |
48 | 58 | ||
@@ -51,13 +61,31 @@ void xen_mc_flush(void) | |||
51 | local_irq_save(flags); | 61 | local_irq_save(flags); |
52 | 62 | ||
53 | if (b->mcidx) { | 63 | if (b->mcidx) { |
54 | int i; | 64 | #if MC_DEBUG |
65 | memcpy(b->debug, b->entries, | ||
66 | b->mcidx * sizeof(struct multicall_entry)); | ||
67 | #endif | ||
55 | 68 | ||
56 | if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0) | 69 | if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0) |
57 | BUG(); | 70 | BUG(); |
58 | for (i = 0; i < b->mcidx; i++) | 71 | for (i = 0; i < b->mcidx; i++) |
59 | if (b->entries[i].result < 0) | 72 | if (b->entries[i].result < 0) |
60 | ret++; | 73 | ret++; |
74 | |||
75 | #if MC_DEBUG | ||
76 | if (ret) { | ||
77 | printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", | ||
78 | ret, smp_processor_id()); | ||
79 | for(i = 0; i < b->mcidx; i++) { | ||
80 | printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", | ||
81 | i+1, b->mcidx, | ||
82 | b->debug[i].op, | ||
83 | b->debug[i].args[0], | ||
84 | b->entries[i].result); | ||
85 | } | ||
86 | } | ||
87 | #endif | ||
88 | |||
61 | b->mcidx = 0; | 89 | b->mcidx = 0; |
62 | b->argidx = 0; | 90 | b->argidx = 0; |
63 | } else | 91 | } else |
@@ -65,6 +93,13 @@ void xen_mc_flush(void) | |||
65 | 93 | ||
66 | local_irq_restore(flags); | 94 | local_irq_restore(flags); |
67 | 95 | ||
96 | for(i = 0; i < b->cbidx; i++) { | ||
97 | struct callback *cb = &b->callbacks[i]; | ||
98 | |||
99 | (*cb->fn)(cb->data); | ||
100 | } | ||
101 | b->cbidx = 0; | ||
102 | |||
68 | BUG_ON(ret); | 103 | BUG_ON(ret); |
69 | } | 104 | } |
70 | 105 | ||
@@ -88,3 +123,16 @@ struct multicall_space __xen_mc_entry(size_t args) | |||
88 | 123 | ||
89 | return ret; | 124 | return ret; |
90 | } | 125 | } |
126 | |||
127 | void xen_mc_callback(void (*fn)(void *), void *data) | ||
128 | { | ||
129 | struct mc_buffer *b = &__get_cpu_var(mc_buffer); | ||
130 | struct callback *cb; | ||
131 | |||
132 | if (b->cbidx == MC_BATCH) | ||
133 | xen_mc_flush(); | ||
134 | |||
135 | cb = &b->callbacks[b->cbidx++]; | ||
136 | cb->fn = fn; | ||
137 | cb->data = data; | ||
138 | } | ||
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h index e6f7530b156c..8bae996d99a3 100644 --- a/arch/x86/xen/multicalls.h +++ b/arch/x86/xen/multicalls.h | |||
@@ -35,11 +35,14 @@ void xen_mc_flush(void); | |||
35 | /* Issue a multicall if we're not in a lazy mode */ | 35 | /* Issue a multicall if we're not in a lazy mode */ |
36 | static inline void xen_mc_issue(unsigned mode) | 36 | static inline void xen_mc_issue(unsigned mode) |
37 | { | 37 | { |
38 | if ((xen_get_lazy_mode() & mode) == 0) | 38 | if ((paravirt_get_lazy_mode() & mode) == 0) |
39 | xen_mc_flush(); | 39 | xen_mc_flush(); |
40 | 40 | ||
41 | /* restore flags saved in xen_mc_batch */ | 41 | /* restore flags saved in xen_mc_batch */ |
42 | local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); | 42 | local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); |
43 | } | 43 | } |
44 | 44 | ||
45 | /* Set up a callback to be called when the current batch is flushed */ | ||
46 | void xen_mc_callback(void (*fn)(void *), void *data); | ||
47 | |||
45 | #endif /* _XEN_MULTICALLS_H */ | 48 | #endif /* _XEN_MULTICALLS_H */ |
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 4fa33c27ccb6..d53bf9d8a72d 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c | |||
@@ -370,7 +370,8 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), | |||
370 | void *info, int wait) | 370 | void *info, int wait) |
371 | { | 371 | { |
372 | struct call_data_struct data; | 372 | struct call_data_struct data; |
373 | int cpus; | 373 | int cpus, cpu; |
374 | bool yield; | ||
374 | 375 | ||
375 | /* Holding any lock stops cpus from going down. */ | 376 | /* Holding any lock stops cpus from going down. */ |
376 | spin_lock(&call_lock); | 377 | spin_lock(&call_lock); |
@@ -399,9 +400,14 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), | |||
399 | /* Send a message to other CPUs and wait for them to respond */ | 400 | /* Send a message to other CPUs and wait for them to respond */ |
400 | xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); | 401 | xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); |
401 | 402 | ||
402 | /* Make sure other vcpus get a chance to run. | 403 | /* Make sure other vcpus get a chance to run if they need to. */ |
403 | XXX too severe? Maybe we should check the other CPU's states? */ | 404 | yield = false; |
404 | HYPERVISOR_sched_op(SCHEDOP_yield, 0); | 405 | for_each_cpu_mask(cpu, mask) |
406 | if (xen_vcpu_stolen(cpu)) | ||
407 | yield = true; | ||
408 | |||
409 | if (yield) | ||
410 | HYPERVISOR_sched_op(SCHEDOP_yield, 0); | ||
405 | 411 | ||
406 | /* Wait for response */ | 412 | /* Wait for response */ |
407 | while (atomic_read(&data.started) != cpus || | 413 | while (atomic_read(&data.started) != cpus || |
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index dfd6db69ead5..d083ff5ef088 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c | |||
@@ -105,6 +105,12 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res) | |||
105 | } while (get64(&state->state_entry_time) != state_time); | 105 | } while (get64(&state->state_entry_time) != state_time); |
106 | } | 106 | } |
107 | 107 | ||
108 | /* return true when a vcpu could run but has no real cpu to run on */ | ||
109 | bool xen_vcpu_stolen(int vcpu) | ||
110 | { | ||
111 | return per_cpu(runstate, vcpu).state == RUNSTATE_runnable; | ||
112 | } | ||
113 | |||
108 | static void setup_runstate_info(int cpu) | 114 | static void setup_runstate_info(int cpu) |
109 | { | 115 | { |
110 | struct vcpu_register_runstate_memory_area area; | 116 | struct vcpu_register_runstate_memory_area area; |
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index b9aaea45f07f..b02a909bfd4c 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h | |||
@@ -11,6 +11,7 @@ void xen_copy_trap_info(struct trap_info *traps); | |||
11 | 11 | ||
12 | DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); | 12 | DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); |
13 | DECLARE_PER_CPU(unsigned long, xen_cr3); | 13 | DECLARE_PER_CPU(unsigned long, xen_cr3); |
14 | DECLARE_PER_CPU(unsigned long, xen_current_cr3); | ||
14 | 15 | ||
15 | extern struct start_info *xen_start_info; | 16 | extern struct start_info *xen_start_info; |
16 | extern struct shared_info *HYPERVISOR_shared_info; | 17 | extern struct shared_info *HYPERVISOR_shared_info; |
@@ -27,14 +28,9 @@ unsigned long xen_get_wallclock(void); | |||
27 | int xen_set_wallclock(unsigned long time); | 28 | int xen_set_wallclock(unsigned long time); |
28 | unsigned long long xen_sched_clock(void); | 29 | unsigned long long xen_sched_clock(void); |
29 | 30 | ||
30 | void xen_mark_init_mm_pinned(void); | 31 | bool xen_vcpu_stolen(int vcpu); |
31 | |||
32 | DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); | ||
33 | 32 | ||
34 | static inline unsigned xen_get_lazy_mode(void) | 33 | void xen_mark_init_mm_pinned(void); |
35 | { | ||
36 | return x86_read_percpu(xen_lazy_mode); | ||
37 | } | ||
38 | 34 | ||
39 | void __init xen_fill_possible_map(void); | 35 | void __init xen_fill_possible_map(void); |
40 | 36 | ||