aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/xen
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-02-05 09:54:27 -0500
committerIngo Molnar <mingo@elte.hu>2009-02-05 09:54:27 -0500
commit69b745ff91836dfc25b26d7be0ead02a6fc0286e (patch)
treec37bf097914cb1da1e1c55cbe0c6ce58d0358edc /arch/x86/xen
parentef3892bd63420380d115f755d351d2071f1f805f (diff)
parente4d0407185cdbdcfd99fc23bde2e5454bbc46329 (diff)
Merge branch 'tj-percpu' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc into core/percpu
Diffstat (limited to 'arch/x86/xen')
-rw-r--r--arch/x86/xen/Makefile3
-rw-r--r--arch/x86/xen/enlighten.c767
-rw-r--r--arch/x86/xen/irq.c14
-rw-r--r--arch/x86/xen/mmu.c745
-rw-r--r--arch/x86/xen/mmu.h3
-rw-r--r--arch/x86/xen/smp.c6
-rw-r--r--arch/x86/xen/xen-asm.S140
-rw-r--r--arch/x86/xen/xen-asm.h12
-rw-r--r--arch/x86/xen/xen-asm_32.S111
-rw-r--r--arch/x86/xen/xen-asm_64.S134
-rw-r--r--arch/x86/xen/xen-ops.h10
11 files changed, 960 insertions, 985 deletions
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 6dcefba7836f..3b767d03fd6a 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -6,7 +6,8 @@ CFLAGS_REMOVE_irq.o = -pg
6endif 6endif
7 7
8obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ 8obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
9 time.o xen-asm_$(BITS).o grant-table.o suspend.o 9 time.o xen-asm.o xen-asm_$(BITS).o \
10 grant-table.o suspend.o
10 11
11obj-$(CONFIG_SMP) += smp.o spinlock.o 12obj-$(CONFIG_SMP) += smp.o spinlock.o
12obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file 13obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index fe19c88a5029..37230342c2c4 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -61,40 +61,13 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
61enum xen_domain_type xen_domain_type = XEN_NATIVE; 61enum xen_domain_type xen_domain_type = XEN_NATIVE;
62EXPORT_SYMBOL_GPL(xen_domain_type); 62EXPORT_SYMBOL_GPL(xen_domain_type);
63 63
64/*
65 * Identity map, in addition to plain kernel map. This needs to be
66 * large enough to allocate page table pages to allocate the rest.
67 * Each page can map 2MB.
68 */
69static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
70
71#ifdef CONFIG_X86_64
72/* l3 pud for userspace vsyscall mapping */
73static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
74#endif /* CONFIG_X86_64 */
75
76/*
77 * Note about cr3 (pagetable base) values:
78 *
79 * xen_cr3 contains the current logical cr3 value; it contains the
80 * last set cr3. This may not be the current effective cr3, because
81 * its update may be being lazily deferred. However, a vcpu looking
82 * at its own cr3 can use this value knowing that it everything will
83 * be self-consistent.
84 *
85 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
86 * hypercall to set the vcpu cr3 is complete (so it may be a little
87 * out of date, but it will never be set early). If one vcpu is
88 * looking at another vcpu's cr3 value, it should use this variable.
89 */
90DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
91DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
92
93struct start_info *xen_start_info; 64struct start_info *xen_start_info;
94EXPORT_SYMBOL_GPL(xen_start_info); 65EXPORT_SYMBOL_GPL(xen_start_info);
95 66
96struct shared_info xen_dummy_shared_info; 67struct shared_info xen_dummy_shared_info;
97 68
69void *xen_initial_gdt;
70
98/* 71/*
99 * Point at some empty memory to start with. We map the real shared_info 72 * Point at some empty memory to start with. We map the real shared_info
100 * page as soon as fixmap is up and running. 73 * page as soon as fixmap is up and running.
@@ -114,14 +87,7 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
114 * 87 *
115 * 0: not available, 1: available 88 * 0: not available, 1: available
116 */ 89 */
117static int have_vcpu_info_placement = 90static int have_vcpu_info_placement = 1;
118#ifdef CONFIG_X86_32
119 1
120#else
121 0
122#endif
123 ;
124
125 91
126static void xen_vcpu_setup(int cpu) 92static void xen_vcpu_setup(int cpu)
127{ 93{
@@ -237,7 +203,7 @@ static unsigned long xen_get_debugreg(int reg)
237 return HYPERVISOR_get_debugreg(reg); 203 return HYPERVISOR_get_debugreg(reg);
238} 204}
239 205
240static void xen_leave_lazy(void) 206void xen_leave_lazy(void)
241{ 207{
242 paravirt_leave_lazy(paravirt_get_lazy_mode()); 208 paravirt_leave_lazy(paravirt_get_lazy_mode());
243 xen_mc_flush(); 209 xen_mc_flush();
@@ -598,76 +564,6 @@ static struct apic_ops xen_basic_apic_ops = {
598 564
599#endif 565#endif
600 566
601static void xen_flush_tlb(void)
602{
603 struct mmuext_op *op;
604 struct multicall_space mcs;
605
606 preempt_disable();
607
608 mcs = xen_mc_entry(sizeof(*op));
609
610 op = mcs.args;
611 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
612 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
613
614 xen_mc_issue(PARAVIRT_LAZY_MMU);
615
616 preempt_enable();
617}
618
619static void xen_flush_tlb_single(unsigned long addr)
620{
621 struct mmuext_op *op;
622 struct multicall_space mcs;
623
624 preempt_disable();
625
626 mcs = xen_mc_entry(sizeof(*op));
627 op = mcs.args;
628 op->cmd = MMUEXT_INVLPG_LOCAL;
629 op->arg1.linear_addr = addr & PAGE_MASK;
630 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
631
632 xen_mc_issue(PARAVIRT_LAZY_MMU);
633
634 preempt_enable();
635}
636
637static void xen_flush_tlb_others(const struct cpumask *cpus,
638 struct mm_struct *mm, unsigned long va)
639{
640 struct {
641 struct mmuext_op op;
642 DECLARE_BITMAP(mask, NR_CPUS);
643 } *args;
644 struct multicall_space mcs;
645
646 BUG_ON(cpumask_empty(cpus));
647 BUG_ON(!mm);
648
649 mcs = xen_mc_entry(sizeof(*args));
650 args = mcs.args;
651 args->op.arg2.vcpumask = to_cpumask(args->mask);
652
653 /* Remove us, and any offline CPUS. */
654 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
655 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
656 if (unlikely(cpumask_empty(to_cpumask(args->mask))))
657 goto issue;
658
659 if (va == TLB_FLUSH_ALL) {
660 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
661 } else {
662 args->op.cmd = MMUEXT_INVLPG_MULTI;
663 args->op.arg1.linear_addr = va;
664 }
665
666 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
667
668issue:
669 xen_mc_issue(PARAVIRT_LAZY_MMU);
670}
671 567
672static void xen_clts(void) 568static void xen_clts(void)
673{ 569{
@@ -693,21 +589,6 @@ static void xen_write_cr0(unsigned long cr0)
693 xen_mc_issue(PARAVIRT_LAZY_CPU); 589 xen_mc_issue(PARAVIRT_LAZY_CPU);
694} 590}
695 591
696static void xen_write_cr2(unsigned long cr2)
697{
698 percpu_read(xen_vcpu)->arch.cr2 = cr2;
699}
700
701static unsigned long xen_read_cr2(void)
702{
703 return percpu_read(xen_vcpu)->arch.cr2;
704}
705
706static unsigned long xen_read_cr2_direct(void)
707{
708 return percpu_read(xen_vcpu_info.arch.cr2);
709}
710
711static void xen_write_cr4(unsigned long cr4) 592static void xen_write_cr4(unsigned long cr4)
712{ 593{
713 cr4 &= ~X86_CR4_PGE; 594 cr4 &= ~X86_CR4_PGE;
@@ -716,71 +597,6 @@ static void xen_write_cr4(unsigned long cr4)
716 native_write_cr4(cr4); 597 native_write_cr4(cr4);
717} 598}
718 599
719static unsigned long xen_read_cr3(void)
720{
721 return percpu_read(xen_cr3);
722}
723
724static void set_current_cr3(void *v)
725{
726 percpu_write(xen_current_cr3, (unsigned long)v);
727}
728
729static void __xen_write_cr3(bool kernel, unsigned long cr3)
730{
731 struct mmuext_op *op;
732 struct multicall_space mcs;
733 unsigned long mfn;
734
735 if (cr3)
736 mfn = pfn_to_mfn(PFN_DOWN(cr3));
737 else
738 mfn = 0;
739
740 WARN_ON(mfn == 0 && kernel);
741
742 mcs = __xen_mc_entry(sizeof(*op));
743
744 op = mcs.args;
745 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
746 op->arg1.mfn = mfn;
747
748 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
749
750 if (kernel) {
751 percpu_write(xen_cr3, cr3);
752
753 /* Update xen_current_cr3 once the batch has actually
754 been submitted. */
755 xen_mc_callback(set_current_cr3, (void *)cr3);
756 }
757}
758
759static void xen_write_cr3(unsigned long cr3)
760{
761 BUG_ON(preemptible());
762
763 xen_mc_batch(); /* disables interrupts */
764
765 /* Update while interrupts are disabled, so its atomic with
766 respect to ipis */
767 percpu_write(xen_cr3, cr3);
768
769 __xen_write_cr3(true, cr3);
770
771#ifdef CONFIG_X86_64
772 {
773 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
774 if (user_pgd)
775 __xen_write_cr3(false, __pa(user_pgd));
776 else
777 __xen_write_cr3(false, 0);
778 }
779#endif
780
781 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
782}
783
784static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) 600static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
785{ 601{
786 int ret; 602 int ret;
@@ -822,185 +638,6 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
822 return ret; 638 return ret;
823} 639}
824 640
825/* Early in boot, while setting up the initial pagetable, assume
826 everything is pinned. */
827static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
828{
829#ifdef CONFIG_FLATMEM
830 BUG_ON(mem_map); /* should only be used early */
831#endif
832 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
833}
834
835/* Early release_pte assumes that all pts are pinned, since there's
836 only init_mm and anything attached to that is pinned. */
837static void xen_release_pte_init(unsigned long pfn)
838{
839 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
840}
841
842static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
843{
844 struct mmuext_op op;
845 op.cmd = cmd;
846 op.arg1.mfn = pfn_to_mfn(pfn);
847 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
848 BUG();
849}
850
851/* This needs to make sure the new pte page is pinned iff its being
852 attached to a pinned pagetable. */
853static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
854{
855 struct page *page = pfn_to_page(pfn);
856
857 if (PagePinned(virt_to_page(mm->pgd))) {
858 SetPagePinned(page);
859
860 vm_unmap_aliases();
861 if (!PageHighMem(page)) {
862 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
863 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
864 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
865 } else {
866 /* make sure there are no stray mappings of
867 this page */
868 kmap_flush_unused();
869 }
870 }
871}
872
873static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
874{
875 xen_alloc_ptpage(mm, pfn, PT_PTE);
876}
877
878static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
879{
880 xen_alloc_ptpage(mm, pfn, PT_PMD);
881}
882
883static int xen_pgd_alloc(struct mm_struct *mm)
884{
885 pgd_t *pgd = mm->pgd;
886 int ret = 0;
887
888 BUG_ON(PagePinned(virt_to_page(pgd)));
889
890#ifdef CONFIG_X86_64
891 {
892 struct page *page = virt_to_page(pgd);
893 pgd_t *user_pgd;
894
895 BUG_ON(page->private != 0);
896
897 ret = -ENOMEM;
898
899 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
900 page->private = (unsigned long)user_pgd;
901
902 if (user_pgd != NULL) {
903 user_pgd[pgd_index(VSYSCALL_START)] =
904 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
905 ret = 0;
906 }
907
908 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
909 }
910#endif
911
912 return ret;
913}
914
915static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
916{
917#ifdef CONFIG_X86_64
918 pgd_t *user_pgd = xen_get_user_pgd(pgd);
919
920 if (user_pgd)
921 free_page((unsigned long)user_pgd);
922#endif
923}
924
925/* This should never happen until we're OK to use struct page */
926static void xen_release_ptpage(unsigned long pfn, unsigned level)
927{
928 struct page *page = pfn_to_page(pfn);
929
930 if (PagePinned(page)) {
931 if (!PageHighMem(page)) {
932 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
933 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
934 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
935 }
936 ClearPagePinned(page);
937 }
938}
939
940static void xen_release_pte(unsigned long pfn)
941{
942 xen_release_ptpage(pfn, PT_PTE);
943}
944
945static void xen_release_pmd(unsigned long pfn)
946{
947 xen_release_ptpage(pfn, PT_PMD);
948}
949
950#if PAGETABLE_LEVELS == 4
951static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
952{
953 xen_alloc_ptpage(mm, pfn, PT_PUD);
954}
955
956static void xen_release_pud(unsigned long pfn)
957{
958 xen_release_ptpage(pfn, PT_PUD);
959}
960#endif
961
962#ifdef CONFIG_HIGHPTE
963static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
964{
965 pgprot_t prot = PAGE_KERNEL;
966
967 if (PagePinned(page))
968 prot = PAGE_KERNEL_RO;
969
970 if (0 && PageHighMem(page))
971 printk("mapping highpte %lx type %d prot %s\n",
972 page_to_pfn(page), type,
973 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
974
975 return kmap_atomic_prot(page, type, prot);
976}
977#endif
978
979#ifdef CONFIG_X86_32
980static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
981{
982 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
983 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
984 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
985 pte_val_ma(pte));
986
987 return pte;
988}
989
990/* Init-time set_pte while constructing initial pagetables, which
991 doesn't allow RO pagetable pages to be remapped RW */
992static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
993{
994 pte = mask_rw_pte(ptep, pte);
995
996 xen_set_pte(ptep, pte);
997}
998#endif
999
1000static __init void xen_pagetable_setup_start(pgd_t *base)
1001{
1002}
1003
1004void xen_setup_shared_info(void) 641void xen_setup_shared_info(void)
1005{ 642{
1006 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 643 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
@@ -1021,37 +658,6 @@ void xen_setup_shared_info(void)
1021 xen_setup_mfn_list_list(); 658 xen_setup_mfn_list_list();
1022} 659}
1023 660
1024static __init void xen_pagetable_setup_done(pgd_t *base)
1025{
1026 xen_setup_shared_info();
1027}
1028
1029static __init void xen_post_allocator_init(void)
1030{
1031 pv_mmu_ops.set_pte = xen_set_pte;
1032 pv_mmu_ops.set_pmd = xen_set_pmd;
1033 pv_mmu_ops.set_pud = xen_set_pud;
1034#if PAGETABLE_LEVELS == 4
1035 pv_mmu_ops.set_pgd = xen_set_pgd;
1036#endif
1037
1038 /* This will work as long as patching hasn't happened yet
1039 (which it hasn't) */
1040 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1041 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1042 pv_mmu_ops.release_pte = xen_release_pte;
1043 pv_mmu_ops.release_pmd = xen_release_pmd;
1044#if PAGETABLE_LEVELS == 4
1045 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1046 pv_mmu_ops.release_pud = xen_release_pud;
1047#endif
1048
1049#ifdef CONFIG_X86_64
1050 SetPagePinned(virt_to_page(level3_user_vsyscall));
1051#endif
1052 xen_mark_init_mm_pinned();
1053}
1054
1055/* This is called once we have the cpu_possible_map */ 661/* This is called once we have the cpu_possible_map */
1056void xen_setup_vcpu_info_placement(void) 662void xen_setup_vcpu_info_placement(void)
1057{ 663{
@@ -1065,10 +671,10 @@ void xen_setup_vcpu_info_placement(void)
1065 if (have_vcpu_info_placement) { 671 if (have_vcpu_info_placement) {
1066 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 672 printk(KERN_INFO "Xen: using vcpu_info placement\n");
1067 673
1068 pv_irq_ops.save_fl = xen_save_fl_direct; 674 pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
1069 pv_irq_ops.restore_fl = xen_restore_fl_direct; 675 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
1070 pv_irq_ops.irq_disable = xen_irq_disable_direct; 676 pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
1071 pv_irq_ops.irq_enable = xen_irq_enable_direct; 677 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
1072 pv_mmu_ops.read_cr2 = xen_read_cr2_direct; 678 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
1073 } 679 }
1074} 680}
@@ -1126,49 +732,6 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
1126 return ret; 732 return ret;
1127} 733}
1128 734
1129static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1130{
1131 pte_t pte;
1132
1133 phys >>= PAGE_SHIFT;
1134
1135 switch (idx) {
1136 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1137#ifdef CONFIG_X86_F00F_BUG
1138 case FIX_F00F_IDT:
1139#endif
1140#ifdef CONFIG_X86_32
1141 case FIX_WP_TEST:
1142 case FIX_VDSO:
1143# ifdef CONFIG_HIGHMEM
1144 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1145# endif
1146#else
1147 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1148#endif
1149#ifdef CONFIG_X86_LOCAL_APIC
1150 case FIX_APIC_BASE: /* maps dummy local APIC */
1151#endif
1152 pte = pfn_pte(phys, prot);
1153 break;
1154
1155 default:
1156 pte = mfn_pte(phys, prot);
1157 break;
1158 }
1159
1160 __native_set_fixmap(idx, pte);
1161
1162#ifdef CONFIG_X86_64
1163 /* Replicate changes to map the vsyscall page into the user
1164 pagetable vsyscall mapping. */
1165 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1166 unsigned long vaddr = __fix_to_virt(idx);
1167 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1168 }
1169#endif
1170}
1171
1172static const struct pv_info xen_info __initdata = { 735static const struct pv_info xen_info __initdata = {
1173 .paravirt_enabled = 1, 736 .paravirt_enabled = 1,
1174 .shared_kernel_pmd = 0, 737 .shared_kernel_pmd = 0,
@@ -1264,87 +827,6 @@ static const struct pv_apic_ops xen_apic_ops __initdata = {
1264#endif 827#endif
1265}; 828};
1266 829
1267static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1268 .pagetable_setup_start = xen_pagetable_setup_start,
1269 .pagetable_setup_done = xen_pagetable_setup_done,
1270
1271 .read_cr2 = xen_read_cr2,
1272 .write_cr2 = xen_write_cr2,
1273
1274 .read_cr3 = xen_read_cr3,
1275 .write_cr3 = xen_write_cr3,
1276
1277 .flush_tlb_user = xen_flush_tlb,
1278 .flush_tlb_kernel = xen_flush_tlb,
1279 .flush_tlb_single = xen_flush_tlb_single,
1280 .flush_tlb_others = xen_flush_tlb_others,
1281
1282 .pte_update = paravirt_nop,
1283 .pte_update_defer = paravirt_nop,
1284
1285 .pgd_alloc = xen_pgd_alloc,
1286 .pgd_free = xen_pgd_free,
1287
1288 .alloc_pte = xen_alloc_pte_init,
1289 .release_pte = xen_release_pte_init,
1290 .alloc_pmd = xen_alloc_pte_init,
1291 .alloc_pmd_clone = paravirt_nop,
1292 .release_pmd = xen_release_pte_init,
1293
1294#ifdef CONFIG_HIGHPTE
1295 .kmap_atomic_pte = xen_kmap_atomic_pte,
1296#endif
1297
1298#ifdef CONFIG_X86_64
1299 .set_pte = xen_set_pte,
1300#else
1301 .set_pte = xen_set_pte_init,
1302#endif
1303 .set_pte_at = xen_set_pte_at,
1304 .set_pmd = xen_set_pmd_hyper,
1305
1306 .ptep_modify_prot_start = __ptep_modify_prot_start,
1307 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1308
1309 .pte_val = xen_pte_val,
1310 .pte_flags = native_pte_flags,
1311 .pgd_val = xen_pgd_val,
1312
1313 .make_pte = xen_make_pte,
1314 .make_pgd = xen_make_pgd,
1315
1316#ifdef CONFIG_X86_PAE
1317 .set_pte_atomic = xen_set_pte_atomic,
1318 .set_pte_present = xen_set_pte_at,
1319 .pte_clear = xen_pte_clear,
1320 .pmd_clear = xen_pmd_clear,
1321#endif /* CONFIG_X86_PAE */
1322 .set_pud = xen_set_pud_hyper,
1323
1324 .make_pmd = xen_make_pmd,
1325 .pmd_val = xen_pmd_val,
1326
1327#if PAGETABLE_LEVELS == 4
1328 .pud_val = xen_pud_val,
1329 .make_pud = xen_make_pud,
1330 .set_pgd = xen_set_pgd_hyper,
1331
1332 .alloc_pud = xen_alloc_pte_init,
1333 .release_pud = xen_release_pte_init,
1334#endif /* PAGETABLE_LEVELS == 4 */
1335
1336 .activate_mm = xen_activate_mm,
1337 .dup_mmap = xen_dup_mmap,
1338 .exit_mmap = xen_exit_mmap,
1339
1340 .lazy_mode = {
1341 .enter = paravirt_enter_lazy_mmu,
1342 .leave = xen_leave_lazy,
1343 },
1344
1345 .set_fixmap = xen_set_fixmap,
1346};
1347
1348static void xen_reboot(int reason) 830static void xen_reboot(int reason)
1349{ 831{
1350 struct sched_shutdown r = { .reason = reason }; 832 struct sched_shutdown r = { .reason = reason };
@@ -1387,223 +869,6 @@ static const struct machine_ops __initdata xen_machine_ops = {
1387}; 869};
1388 870
1389 871
1390static void __init xen_reserve_top(void)
1391{
1392#ifdef CONFIG_X86_32
1393 unsigned long top = HYPERVISOR_VIRT_START;
1394 struct xen_platform_parameters pp;
1395
1396 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1397 top = pp.virt_start;
1398
1399 reserve_top_address(-top);
1400#endif /* CONFIG_X86_32 */
1401}
1402
1403/*
1404 * Like __va(), but returns address in the kernel mapping (which is
1405 * all we have until the physical memory mapping has been set up.
1406 */
1407static void *__ka(phys_addr_t paddr)
1408{
1409#ifdef CONFIG_X86_64
1410 return (void *)(paddr + __START_KERNEL_map);
1411#else
1412 return __va(paddr);
1413#endif
1414}
1415
1416/* Convert a machine address to physical address */
1417static unsigned long m2p(phys_addr_t maddr)
1418{
1419 phys_addr_t paddr;
1420
1421 maddr &= PTE_PFN_MASK;
1422 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1423
1424 return paddr;
1425}
1426
1427/* Convert a machine address to kernel virtual */
1428static void *m2v(phys_addr_t maddr)
1429{
1430 return __ka(m2p(maddr));
1431}
1432
1433static void set_page_prot(void *addr, pgprot_t prot)
1434{
1435 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1436 pte_t pte = pfn_pte(pfn, prot);
1437
1438 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1439 BUG();
1440}
1441
1442static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1443{
1444 unsigned pmdidx, pteidx;
1445 unsigned ident_pte;
1446 unsigned long pfn;
1447
1448 ident_pte = 0;
1449 pfn = 0;
1450 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1451 pte_t *pte_page;
1452
1453 /* Reuse or allocate a page of ptes */
1454 if (pmd_present(pmd[pmdidx]))
1455 pte_page = m2v(pmd[pmdidx].pmd);
1456 else {
1457 /* Check for free pte pages */
1458 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1459 break;
1460
1461 pte_page = &level1_ident_pgt[ident_pte];
1462 ident_pte += PTRS_PER_PTE;
1463
1464 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1465 }
1466
1467 /* Install mappings */
1468 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1469 pte_t pte;
1470
1471 if (pfn > max_pfn_mapped)
1472 max_pfn_mapped = pfn;
1473
1474 if (!pte_none(pte_page[pteidx]))
1475 continue;
1476
1477 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1478 pte_page[pteidx] = pte;
1479 }
1480 }
1481
1482 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1483 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1484
1485 set_page_prot(pmd, PAGE_KERNEL_RO);
1486}
1487
1488#ifdef CONFIG_X86_64
1489static void convert_pfn_mfn(void *v)
1490{
1491 pte_t *pte = v;
1492 int i;
1493
1494 /* All levels are converted the same way, so just treat them
1495 as ptes. */
1496 for (i = 0; i < PTRS_PER_PTE; i++)
1497 pte[i] = xen_make_pte(pte[i].pte);
1498}
1499
1500/*
1501 * Set up the inital kernel pagetable.
1502 *
1503 * We can construct this by grafting the Xen provided pagetable into
1504 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1505 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1506 * means that only the kernel has a physical mapping to start with -
1507 * but that's enough to get __va working. We need to fill in the rest
1508 * of the physical mapping once some sort of allocator has been set
1509 * up.
1510 */
1511static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1512 unsigned long max_pfn)
1513{
1514 pud_t *l3;
1515 pmd_t *l2;
1516
1517 /* Zap identity mapping */
1518 init_level4_pgt[0] = __pgd(0);
1519
1520 /* Pre-constructed entries are in pfn, so convert to mfn */
1521 convert_pfn_mfn(init_level4_pgt);
1522 convert_pfn_mfn(level3_ident_pgt);
1523 convert_pfn_mfn(level3_kernel_pgt);
1524
1525 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1526 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1527
1528 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1529 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1530
1531 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1532 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1533 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1534
1535 /* Set up identity map */
1536 xen_map_identity_early(level2_ident_pgt, max_pfn);
1537
1538 /* Make pagetable pieces RO */
1539 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1540 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1541 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1542 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1543 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1544 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1545
1546 /* Pin down new L4 */
1547 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1548 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1549
1550 /* Unpin Xen-provided one */
1551 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1552
1553 /* Switch over */
1554 pgd = init_level4_pgt;
1555
1556 /*
1557 * At this stage there can be no user pgd, and no page
1558 * structure to attach it to, so make sure we just set kernel
1559 * pgd.
1560 */
1561 xen_mc_batch();
1562 __xen_write_cr3(true, __pa(pgd));
1563 xen_mc_issue(PARAVIRT_LAZY_CPU);
1564
1565 reserve_early(__pa(xen_start_info->pt_base),
1566 __pa(xen_start_info->pt_base +
1567 xen_start_info->nr_pt_frames * PAGE_SIZE),
1568 "XEN PAGETABLES");
1569
1570 return pgd;
1571}
1572#else /* !CONFIG_X86_64 */
1573static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1574
1575static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1576 unsigned long max_pfn)
1577{
1578 pmd_t *kernel_pmd;
1579
1580 init_pg_tables_start = __pa(pgd);
1581 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1582 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1583
1584 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1585 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1586
1587 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1588
1589 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1590 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1591 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1592
1593 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1594 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1595 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1596
1597 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1598
1599 xen_write_cr3(__pa(swapper_pg_dir));
1600
1601 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1602
1603 return swapper_pg_dir;
1604}
1605#endif /* CONFIG_X86_64 */
1606
1607/* First C function to be called on Xen boot */ 872/* First C function to be called on Xen boot */
1608asmlinkage void __init xen_start_kernel(void) 873asmlinkage void __init xen_start_kernel(void)
1609{ 874{
@@ -1643,12 +908,18 @@ asmlinkage void __init xen_start_kernel(void)
1643 machine_ops = xen_machine_ops; 908 machine_ops = xen_machine_ops;
1644 909
1645#ifdef CONFIG_X86_64 910#ifdef CONFIG_X86_64
1646 /* Disable until direct per-cpu data access. */ 911 /*
1647 have_vcpu_info_placement = 0; 912 * Setup percpu state. We only need to do this for 64-bit
1648#endif 913 * because 32-bit already has %fs set properly.
1649 914 */
1650 /* setup percpu state */
1651 load_percpu_segment(0); 915 load_percpu_segment(0);
916#endif
917 /*
918 * The only reliable way to retain the initial address of the
919 * percpu gdt_page is to remember it here, so we can go and
920 * mark it RW later, when the initial percpu area is freed.
921 */
922 xen_initial_gdt = &per_cpu(gdt_page, 0);
1652 923
1653 xen_smp_init(); 924 xen_smp_init();
1654 925
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 2e8271431e1a..5a070900ad35 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -50,6 +50,7 @@ static unsigned long xen_save_fl(void)
50 */ 50 */
51 return (-flags) & X86_EFLAGS_IF; 51 return (-flags) & X86_EFLAGS_IF;
52} 52}
53PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
53 54
54static void xen_restore_fl(unsigned long flags) 55static void xen_restore_fl(unsigned long flags)
55{ 56{
@@ -76,6 +77,7 @@ static void xen_restore_fl(unsigned long flags)
76 xen_force_evtchn_callback(); 77 xen_force_evtchn_callback();
77 } 78 }
78} 79}
80PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
79 81
80static void xen_irq_disable(void) 82static void xen_irq_disable(void)
81{ 83{
@@ -86,6 +88,7 @@ static void xen_irq_disable(void)
86 percpu_read(xen_vcpu)->evtchn_upcall_mask = 1; 88 percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
87 preempt_enable_no_resched(); 89 preempt_enable_no_resched();
88} 90}
91PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
89 92
90static void xen_irq_enable(void) 93static void xen_irq_enable(void)
91{ 94{
@@ -106,6 +109,7 @@ static void xen_irq_enable(void)
106 if (unlikely(vcpu->evtchn_upcall_pending)) 109 if (unlikely(vcpu->evtchn_upcall_pending))
107 xen_force_evtchn_callback(); 110 xen_force_evtchn_callback();
108} 111}
112PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
109 113
110static void xen_safe_halt(void) 114static void xen_safe_halt(void)
111{ 115{
@@ -124,10 +128,12 @@ static void xen_halt(void)
124 128
125static const struct pv_irq_ops xen_irq_ops __initdata = { 129static const struct pv_irq_ops xen_irq_ops __initdata = {
126 .init_IRQ = __xen_init_IRQ, 130 .init_IRQ = __xen_init_IRQ,
127 .save_fl = xen_save_fl, 131
128 .restore_fl = xen_restore_fl, 132 .save_fl = PV_CALLEE_SAVE(xen_save_fl),
129 .irq_disable = xen_irq_disable, 133 .restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
130 .irq_enable = xen_irq_enable, 134 .irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
135 .irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
136
131 .safe_halt = xen_safe_halt, 137 .safe_halt = xen_safe_halt,
132 .halt = xen_halt, 138 .halt = xen_halt,
133#ifdef CONFIG_X86_64 139#ifdef CONFIG_X86_64
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 98cb9869eb24..d2e8ed1aff3d 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -47,6 +47,7 @@
47#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/fixmap.h> 48#include <asm/fixmap.h>
49#include <asm/mmu_context.h> 49#include <asm/mmu_context.h>
50#include <asm/setup.h>
50#include <asm/paravirt.h> 51#include <asm/paravirt.h>
51#include <asm/linkage.h> 52#include <asm/linkage.h>
52 53
@@ -55,6 +56,8 @@
55 56
56#include <xen/page.h> 57#include <xen/page.h>
57#include <xen/interface/xen.h> 58#include <xen/interface/xen.h>
59#include <xen/interface/version.h>
60#include <xen/hvc-console.h>
58 61
59#include "multicalls.h" 62#include "multicalls.h"
60#include "mmu.h" 63#include "mmu.h"
@@ -114,6 +117,37 @@ static inline void check_zero(void)
114 117
115#endif /* CONFIG_XEN_DEBUG_FS */ 118#endif /* CONFIG_XEN_DEBUG_FS */
116 119
120
121/*
122 * Identity map, in addition to plain kernel map. This needs to be
123 * large enough to allocate page table pages to allocate the rest.
124 * Each page can map 2MB.
125 */
126static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
127
128#ifdef CONFIG_X86_64
129/* l3 pud for userspace vsyscall mapping */
130static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
131#endif /* CONFIG_X86_64 */
132
133/*
134 * Note about cr3 (pagetable base) values:
135 *
136 * xen_cr3 contains the current logical cr3 value; it contains the
137 * last set cr3. This may not be the current effective cr3, because
138 * its update may be being lazily deferred. However, a vcpu looking
139 * at its own cr3 can use this value knowing that it everything will
140 * be self-consistent.
141 *
142 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
143 * hypercall to set the vcpu cr3 is complete (so it may be a little
144 * out of date, but it will never be set early). If one vcpu is
145 * looking at another vcpu's cr3 value, it should use this variable.
146 */
147DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
148DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
149
150
117/* 151/*
118 * Just beyond the highest usermode address. STACK_TOP_MAX has a 152 * Just beyond the highest usermode address. STACK_TOP_MAX has a
119 * redzone above it, so round it up to a PGD boundary. 153 * redzone above it, so round it up to a PGD boundary.
@@ -458,28 +492,33 @@ pteval_t xen_pte_val(pte_t pte)
458{ 492{
459 return pte_mfn_to_pfn(pte.pte); 493 return pte_mfn_to_pfn(pte.pte);
460} 494}
495PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
461 496
462pgdval_t xen_pgd_val(pgd_t pgd) 497pgdval_t xen_pgd_val(pgd_t pgd)
463{ 498{
464 return pte_mfn_to_pfn(pgd.pgd); 499 return pte_mfn_to_pfn(pgd.pgd);
465} 500}
501PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
466 502
467pte_t xen_make_pte(pteval_t pte) 503pte_t xen_make_pte(pteval_t pte)
468{ 504{
469 pte = pte_pfn_to_mfn(pte); 505 pte = pte_pfn_to_mfn(pte);
470 return native_make_pte(pte); 506 return native_make_pte(pte);
471} 507}
508PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
472 509
473pgd_t xen_make_pgd(pgdval_t pgd) 510pgd_t xen_make_pgd(pgdval_t pgd)
474{ 511{
475 pgd = pte_pfn_to_mfn(pgd); 512 pgd = pte_pfn_to_mfn(pgd);
476 return native_make_pgd(pgd); 513 return native_make_pgd(pgd);
477} 514}
515PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
478 516
479pmdval_t xen_pmd_val(pmd_t pmd) 517pmdval_t xen_pmd_val(pmd_t pmd)
480{ 518{
481 return pte_mfn_to_pfn(pmd.pmd); 519 return pte_mfn_to_pfn(pmd.pmd);
482} 520}
521PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
483 522
484void xen_set_pud_hyper(pud_t *ptr, pud_t val) 523void xen_set_pud_hyper(pud_t *ptr, pud_t val)
485{ 524{
@@ -556,12 +595,14 @@ pmd_t xen_make_pmd(pmdval_t pmd)
556 pmd = pte_pfn_to_mfn(pmd); 595 pmd = pte_pfn_to_mfn(pmd);
557 return native_make_pmd(pmd); 596 return native_make_pmd(pmd);
558} 597}
598PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
559 599
560#if PAGETABLE_LEVELS == 4 600#if PAGETABLE_LEVELS == 4
561pudval_t xen_pud_val(pud_t pud) 601pudval_t xen_pud_val(pud_t pud)
562{ 602{
563 return pte_mfn_to_pfn(pud.pud); 603 return pte_mfn_to_pfn(pud.pud);
564} 604}
605PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
565 606
566pud_t xen_make_pud(pudval_t pud) 607pud_t xen_make_pud(pudval_t pud)
567{ 608{
@@ -569,6 +610,7 @@ pud_t xen_make_pud(pudval_t pud)
569 610
570 return native_make_pud(pud); 611 return native_make_pud(pud);
571} 612}
613PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
572 614
573pgd_t *xen_get_user_pgd(pgd_t *pgd) 615pgd_t *xen_get_user_pgd(pgd_t *pgd)
574{ 616{
@@ -1152,6 +1194,709 @@ void xen_exit_mmap(struct mm_struct *mm)
1152 spin_unlock(&mm->page_table_lock); 1194 spin_unlock(&mm->page_table_lock);
1153} 1195}
1154 1196
1197static __init void xen_pagetable_setup_start(pgd_t *base)
1198{
1199}
1200
1201static __init void xen_pagetable_setup_done(pgd_t *base)
1202{
1203 xen_setup_shared_info();
1204}
1205
1206static void xen_write_cr2(unsigned long cr2)
1207{
1208 percpu_read(xen_vcpu)->arch.cr2 = cr2;
1209}
1210
1211static unsigned long xen_read_cr2(void)
1212{
1213 return percpu_read(xen_vcpu)->arch.cr2;
1214}
1215
1216unsigned long xen_read_cr2_direct(void)
1217{
1218 return percpu_read(xen_vcpu_info.arch.cr2);
1219}
1220
1221static void xen_flush_tlb(void)
1222{
1223 struct mmuext_op *op;
1224 struct multicall_space mcs;
1225
1226 preempt_disable();
1227
1228 mcs = xen_mc_entry(sizeof(*op));
1229
1230 op = mcs.args;
1231 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1232 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1233
1234 xen_mc_issue(PARAVIRT_LAZY_MMU);
1235
1236 preempt_enable();
1237}
1238
1239static void xen_flush_tlb_single(unsigned long addr)
1240{
1241 struct mmuext_op *op;
1242 struct multicall_space mcs;
1243
1244 preempt_disable();
1245
1246 mcs = xen_mc_entry(sizeof(*op));
1247 op = mcs.args;
1248 op->cmd = MMUEXT_INVLPG_LOCAL;
1249 op->arg1.linear_addr = addr & PAGE_MASK;
1250 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1251
1252 xen_mc_issue(PARAVIRT_LAZY_MMU);
1253
1254 preempt_enable();
1255}
1256
1257static void xen_flush_tlb_others(const struct cpumask *cpus,
1258 struct mm_struct *mm, unsigned long va)
1259{
1260 struct {
1261 struct mmuext_op op;
1262 DECLARE_BITMAP(mask, NR_CPUS);
1263 } *args;
1264 struct multicall_space mcs;
1265
1266 BUG_ON(cpumask_empty(cpus));
1267 BUG_ON(!mm);
1268
1269 mcs = xen_mc_entry(sizeof(*args));
1270 args = mcs.args;
1271 args->op.arg2.vcpumask = to_cpumask(args->mask);
1272
1273 /* Remove us, and any offline CPUS. */
1274 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1275 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1276 if (unlikely(cpumask_empty(to_cpumask(args->mask))))
1277 goto issue;
1278
1279 if (va == TLB_FLUSH_ALL) {
1280 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1281 } else {
1282 args->op.cmd = MMUEXT_INVLPG_MULTI;
1283 args->op.arg1.linear_addr = va;
1284 }
1285
1286 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1287
1288issue:
1289 xen_mc_issue(PARAVIRT_LAZY_MMU);
1290}
1291
1292static unsigned long xen_read_cr3(void)
1293{
1294 return percpu_read(xen_cr3);
1295}
1296
1297static void set_current_cr3(void *v)
1298{
1299 percpu_write(xen_current_cr3, (unsigned long)v);
1300}
1301
1302static void __xen_write_cr3(bool kernel, unsigned long cr3)
1303{
1304 struct mmuext_op *op;
1305 struct multicall_space mcs;
1306 unsigned long mfn;
1307
1308 if (cr3)
1309 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1310 else
1311 mfn = 0;
1312
1313 WARN_ON(mfn == 0 && kernel);
1314
1315 mcs = __xen_mc_entry(sizeof(*op));
1316
1317 op = mcs.args;
1318 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1319 op->arg1.mfn = mfn;
1320
1321 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1322
1323 if (kernel) {
1324 percpu_write(xen_cr3, cr3);
1325
1326 /* Update xen_current_cr3 once the batch has actually
1327 been submitted. */
1328 xen_mc_callback(set_current_cr3, (void *)cr3);
1329 }
1330}
1331
1332static void xen_write_cr3(unsigned long cr3)
1333{
1334 BUG_ON(preemptible());
1335
1336 xen_mc_batch(); /* disables interrupts */
1337
1338 /* Update while interrupts are disabled, so its atomic with
1339 respect to ipis */
1340 percpu_write(xen_cr3, cr3);
1341
1342 __xen_write_cr3(true, cr3);
1343
1344#ifdef CONFIG_X86_64
1345 {
1346 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1347 if (user_pgd)
1348 __xen_write_cr3(false, __pa(user_pgd));
1349 else
1350 __xen_write_cr3(false, 0);
1351 }
1352#endif
1353
1354 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1355}
1356
1357static int xen_pgd_alloc(struct mm_struct *mm)
1358{
1359 pgd_t *pgd = mm->pgd;
1360 int ret = 0;
1361
1362 BUG_ON(PagePinned(virt_to_page(pgd)));
1363
1364#ifdef CONFIG_X86_64
1365 {
1366 struct page *page = virt_to_page(pgd);
1367 pgd_t *user_pgd;
1368
1369 BUG_ON(page->private != 0);
1370
1371 ret = -ENOMEM;
1372
1373 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1374 page->private = (unsigned long)user_pgd;
1375
1376 if (user_pgd != NULL) {
1377 user_pgd[pgd_index(VSYSCALL_START)] =
1378 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1379 ret = 0;
1380 }
1381
1382 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1383 }
1384#endif
1385
1386 return ret;
1387}
1388
1389static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1390{
1391#ifdef CONFIG_X86_64
1392 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1393
1394 if (user_pgd)
1395 free_page((unsigned long)user_pgd);
1396#endif
1397}
1398
1399#ifdef CONFIG_HIGHPTE
1400static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
1401{
1402 pgprot_t prot = PAGE_KERNEL;
1403
1404 if (PagePinned(page))
1405 prot = PAGE_KERNEL_RO;
1406
1407 if (0 && PageHighMem(page))
1408 printk("mapping highpte %lx type %d prot %s\n",
1409 page_to_pfn(page), type,
1410 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
1411
1412 return kmap_atomic_prot(page, type, prot);
1413}
1414#endif
1415
1416#ifdef CONFIG_X86_32
1417static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1418{
1419 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1420 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1421 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1422 pte_val_ma(pte));
1423
1424 return pte;
1425}
1426
1427/* Init-time set_pte while constructing initial pagetables, which
1428 doesn't allow RO pagetable pages to be remapped RW */
1429static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1430{
1431 pte = mask_rw_pte(ptep, pte);
1432
1433 xen_set_pte(ptep, pte);
1434}
1435#endif
1436
1437/* Early in boot, while setting up the initial pagetable, assume
1438 everything is pinned. */
1439static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1440{
1441#ifdef CONFIG_FLATMEM
1442 BUG_ON(mem_map); /* should only be used early */
1443#endif
1444 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1445}
1446
1447/* Early release_pte assumes that all pts are pinned, since there's
1448 only init_mm and anything attached to that is pinned. */
1449static void xen_release_pte_init(unsigned long pfn)
1450{
1451 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1452}
1453
1454static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1455{
1456 struct mmuext_op op;
1457 op.cmd = cmd;
1458 op.arg1.mfn = pfn_to_mfn(pfn);
1459 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1460 BUG();
1461}
1462
1463/* This needs to make sure the new pte page is pinned iff its being
1464 attached to a pinned pagetable. */
1465static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1466{
1467 struct page *page = pfn_to_page(pfn);
1468
1469 if (PagePinned(virt_to_page(mm->pgd))) {
1470 SetPagePinned(page);
1471
1472 vm_unmap_aliases();
1473 if (!PageHighMem(page)) {
1474 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1475 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1476 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1477 } else {
1478 /* make sure there are no stray mappings of
1479 this page */
1480 kmap_flush_unused();
1481 }
1482 }
1483}
1484
1485static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1486{
1487 xen_alloc_ptpage(mm, pfn, PT_PTE);
1488}
1489
1490static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1491{
1492 xen_alloc_ptpage(mm, pfn, PT_PMD);
1493}
1494
1495/* This should never happen until we're OK to use struct page */
1496static void xen_release_ptpage(unsigned long pfn, unsigned level)
1497{
1498 struct page *page = pfn_to_page(pfn);
1499
1500 if (PagePinned(page)) {
1501 if (!PageHighMem(page)) {
1502 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1503 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1504 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1505 }
1506 ClearPagePinned(page);
1507 }
1508}
1509
1510static void xen_release_pte(unsigned long pfn)
1511{
1512 xen_release_ptpage(pfn, PT_PTE);
1513}
1514
1515static void xen_release_pmd(unsigned long pfn)
1516{
1517 xen_release_ptpage(pfn, PT_PMD);
1518}
1519
1520#if PAGETABLE_LEVELS == 4
1521static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1522{
1523 xen_alloc_ptpage(mm, pfn, PT_PUD);
1524}
1525
1526static void xen_release_pud(unsigned long pfn)
1527{
1528 xen_release_ptpage(pfn, PT_PUD);
1529}
1530#endif
1531
1532void __init xen_reserve_top(void)
1533{
1534#ifdef CONFIG_X86_32
1535 unsigned long top = HYPERVISOR_VIRT_START;
1536 struct xen_platform_parameters pp;
1537
1538 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1539 top = pp.virt_start;
1540
1541 reserve_top_address(-top);
1542#endif /* CONFIG_X86_32 */
1543}
1544
1545/*
1546 * Like __va(), but returns address in the kernel mapping (which is
1547 * all we have until the physical memory mapping has been set up.
1548 */
1549static void *__ka(phys_addr_t paddr)
1550{
1551#ifdef CONFIG_X86_64
1552 return (void *)(paddr + __START_KERNEL_map);
1553#else
1554 return __va(paddr);
1555#endif
1556}
1557
1558/* Convert a machine address to physical address */
1559static unsigned long m2p(phys_addr_t maddr)
1560{
1561 phys_addr_t paddr;
1562
1563 maddr &= PTE_PFN_MASK;
1564 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1565
1566 return paddr;
1567}
1568
1569/* Convert a machine address to kernel virtual */
1570static void *m2v(phys_addr_t maddr)
1571{
1572 return __ka(m2p(maddr));
1573}
1574
1575static void set_page_prot(void *addr, pgprot_t prot)
1576{
1577 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1578 pte_t pte = pfn_pte(pfn, prot);
1579
1580 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1581 BUG();
1582}
1583
1584static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1585{
1586 unsigned pmdidx, pteidx;
1587 unsigned ident_pte;
1588 unsigned long pfn;
1589
1590 ident_pte = 0;
1591 pfn = 0;
1592 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1593 pte_t *pte_page;
1594
1595 /* Reuse or allocate a page of ptes */
1596 if (pmd_present(pmd[pmdidx]))
1597 pte_page = m2v(pmd[pmdidx].pmd);
1598 else {
1599 /* Check for free pte pages */
1600 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1601 break;
1602
1603 pte_page = &level1_ident_pgt[ident_pte];
1604 ident_pte += PTRS_PER_PTE;
1605
1606 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1607 }
1608
1609 /* Install mappings */
1610 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1611 pte_t pte;
1612
1613 if (pfn > max_pfn_mapped)
1614 max_pfn_mapped = pfn;
1615
1616 if (!pte_none(pte_page[pteidx]))
1617 continue;
1618
1619 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1620 pte_page[pteidx] = pte;
1621 }
1622 }
1623
1624 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1625 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1626
1627 set_page_prot(pmd, PAGE_KERNEL_RO);
1628}
1629
1630#ifdef CONFIG_X86_64
1631static void convert_pfn_mfn(void *v)
1632{
1633 pte_t *pte = v;
1634 int i;
1635
1636 /* All levels are converted the same way, so just treat them
1637 as ptes. */
1638 for (i = 0; i < PTRS_PER_PTE; i++)
1639 pte[i] = xen_make_pte(pte[i].pte);
1640}
1641
1642/*
1643 * Set up the inital kernel pagetable.
1644 *
1645 * We can construct this by grafting the Xen provided pagetable into
1646 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1647 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1648 * means that only the kernel has a physical mapping to start with -
1649 * but that's enough to get __va working. We need to fill in the rest
1650 * of the physical mapping once some sort of allocator has been set
1651 * up.
1652 */
1653__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1654 unsigned long max_pfn)
1655{
1656 pud_t *l3;
1657 pmd_t *l2;
1658
1659 /* Zap identity mapping */
1660 init_level4_pgt[0] = __pgd(0);
1661
1662 /* Pre-constructed entries are in pfn, so convert to mfn */
1663 convert_pfn_mfn(init_level4_pgt);
1664 convert_pfn_mfn(level3_ident_pgt);
1665 convert_pfn_mfn(level3_kernel_pgt);
1666
1667 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1668 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1669
1670 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1671 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1672
1673 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1674 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1675 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1676
1677 /* Set up identity map */
1678 xen_map_identity_early(level2_ident_pgt, max_pfn);
1679
1680 /* Make pagetable pieces RO */
1681 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1682 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1683 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1684 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1685 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1686 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1687
1688 /* Pin down new L4 */
1689 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1690 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1691
1692 /* Unpin Xen-provided one */
1693 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1694
1695 /* Switch over */
1696 pgd = init_level4_pgt;
1697
1698 /*
1699 * At this stage there can be no user pgd, and no page
1700 * structure to attach it to, so make sure we just set kernel
1701 * pgd.
1702 */
1703 xen_mc_batch();
1704 __xen_write_cr3(true, __pa(pgd));
1705 xen_mc_issue(PARAVIRT_LAZY_CPU);
1706
1707 reserve_early(__pa(xen_start_info->pt_base),
1708 __pa(xen_start_info->pt_base +
1709 xen_start_info->nr_pt_frames * PAGE_SIZE),
1710 "XEN PAGETABLES");
1711
1712 return pgd;
1713}
1714#else /* !CONFIG_X86_64 */
1715static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1716
1717__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1718 unsigned long max_pfn)
1719{
1720 pmd_t *kernel_pmd;
1721
1722 init_pg_tables_start = __pa(pgd);
1723 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1724 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1725
1726 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1727 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1728
1729 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1730
1731 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1732 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1733 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1734
1735 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1736 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1737 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1738
1739 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1740
1741 xen_write_cr3(__pa(swapper_pg_dir));
1742
1743 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1744
1745 return swapper_pg_dir;
1746}
1747#endif /* CONFIG_X86_64 */
1748
1749static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1750{
1751 pte_t pte;
1752
1753 phys >>= PAGE_SHIFT;
1754
1755 switch (idx) {
1756 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1757#ifdef CONFIG_X86_F00F_BUG
1758 case FIX_F00F_IDT:
1759#endif
1760#ifdef CONFIG_X86_32
1761 case FIX_WP_TEST:
1762 case FIX_VDSO:
1763# ifdef CONFIG_HIGHMEM
1764 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1765# endif
1766#else
1767 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1768#endif
1769#ifdef CONFIG_X86_LOCAL_APIC
1770 case FIX_APIC_BASE: /* maps dummy local APIC */
1771#endif
1772 pte = pfn_pte(phys, prot);
1773 break;
1774
1775 default:
1776 pte = mfn_pte(phys, prot);
1777 break;
1778 }
1779
1780 __native_set_fixmap(idx, pte);
1781
1782#ifdef CONFIG_X86_64
1783 /* Replicate changes to map the vsyscall page into the user
1784 pagetable vsyscall mapping. */
1785 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1786 unsigned long vaddr = __fix_to_virt(idx);
1787 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1788 }
1789#endif
1790}
1791
1792__init void xen_post_allocator_init(void)
1793{
1794 pv_mmu_ops.set_pte = xen_set_pte;
1795 pv_mmu_ops.set_pmd = xen_set_pmd;
1796 pv_mmu_ops.set_pud = xen_set_pud;
1797#if PAGETABLE_LEVELS == 4
1798 pv_mmu_ops.set_pgd = xen_set_pgd;
1799#endif
1800
1801 /* This will work as long as patching hasn't happened yet
1802 (which it hasn't) */
1803 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1804 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1805 pv_mmu_ops.release_pte = xen_release_pte;
1806 pv_mmu_ops.release_pmd = xen_release_pmd;
1807#if PAGETABLE_LEVELS == 4
1808 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1809 pv_mmu_ops.release_pud = xen_release_pud;
1810#endif
1811
1812#ifdef CONFIG_X86_64
1813 SetPagePinned(virt_to_page(level3_user_vsyscall));
1814#endif
1815 xen_mark_init_mm_pinned();
1816}
1817
1818
1819const struct pv_mmu_ops xen_mmu_ops __initdata = {
1820 .pagetable_setup_start = xen_pagetable_setup_start,
1821 .pagetable_setup_done = xen_pagetable_setup_done,
1822
1823 .read_cr2 = xen_read_cr2,
1824 .write_cr2 = xen_write_cr2,
1825
1826 .read_cr3 = xen_read_cr3,
1827 .write_cr3 = xen_write_cr3,
1828
1829 .flush_tlb_user = xen_flush_tlb,
1830 .flush_tlb_kernel = xen_flush_tlb,
1831 .flush_tlb_single = xen_flush_tlb_single,
1832 .flush_tlb_others = xen_flush_tlb_others,
1833
1834 .pte_update = paravirt_nop,
1835 .pte_update_defer = paravirt_nop,
1836
1837 .pgd_alloc = xen_pgd_alloc,
1838 .pgd_free = xen_pgd_free,
1839
1840 .alloc_pte = xen_alloc_pte_init,
1841 .release_pte = xen_release_pte_init,
1842 .alloc_pmd = xen_alloc_pte_init,
1843 .alloc_pmd_clone = paravirt_nop,
1844 .release_pmd = xen_release_pte_init,
1845
1846#ifdef CONFIG_HIGHPTE
1847 .kmap_atomic_pte = xen_kmap_atomic_pte,
1848#endif
1849
1850#ifdef CONFIG_X86_64
1851 .set_pte = xen_set_pte,
1852#else
1853 .set_pte = xen_set_pte_init,
1854#endif
1855 .set_pte_at = xen_set_pte_at,
1856 .set_pmd = xen_set_pmd_hyper,
1857
1858 .ptep_modify_prot_start = __ptep_modify_prot_start,
1859 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1860
1861 .pte_val = PV_CALLEE_SAVE(xen_pte_val),
1862 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
1863
1864 .make_pte = PV_CALLEE_SAVE(xen_make_pte),
1865 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
1866
1867#ifdef CONFIG_X86_PAE
1868 .set_pte_atomic = xen_set_pte_atomic,
1869 .set_pte_present = xen_set_pte_at,
1870 .pte_clear = xen_pte_clear,
1871 .pmd_clear = xen_pmd_clear,
1872#endif /* CONFIG_X86_PAE */
1873 .set_pud = xen_set_pud_hyper,
1874
1875 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
1876 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
1877
1878#if PAGETABLE_LEVELS == 4
1879 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
1880 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
1881 .set_pgd = xen_set_pgd_hyper,
1882
1883 .alloc_pud = xen_alloc_pte_init,
1884 .release_pud = xen_release_pte_init,
1885#endif /* PAGETABLE_LEVELS == 4 */
1886
1887 .activate_mm = xen_activate_mm,
1888 .dup_mmap = xen_dup_mmap,
1889 .exit_mmap = xen_exit_mmap,
1890
1891 .lazy_mode = {
1892 .enter = paravirt_enter_lazy_mmu,
1893 .leave = xen_leave_lazy,
1894 },
1895
1896 .set_fixmap = xen_set_fixmap,
1897};
1898
1899
1155#ifdef CONFIG_XEN_DEBUG_FS 1900#ifdef CONFIG_XEN_DEBUG_FS
1156 1901
1157static struct dentry *d_mmu_debug; 1902static struct dentry *d_mmu_debug;
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 98d71659da5a..24d1b44a337d 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -54,4 +54,7 @@ pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t
54void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 54void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
55 pte_t *ptep, pte_t pte); 55 pte_t *ptep, pte_t pte);
56 56
57unsigned long xen_read_cr2_direct(void);
58
59extern const struct pv_mmu_ops xen_mmu_ops;
57#endif /* _XEN_MMU_H */ 60#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 88d5d5ec6beb..035582ae815d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -170,8 +170,7 @@ static void __init xen_smp_prepare_boot_cpu(void)
170 170
171 /* We've switched to the "real" per-cpu gdt, so make sure the 171 /* We've switched to the "real" per-cpu gdt, so make sure the
172 old memory can be recycled */ 172 old memory can be recycled */
173 make_lowmem_page_readwrite(__per_cpu_load + 173 make_lowmem_page_readwrite(xen_initial_gdt);
174 (unsigned long)&per_cpu_var(gdt_page));
175 174
176 xen_setup_vcpu_info_placement(); 175 xen_setup_vcpu_info_placement();
177} 176}
@@ -287,6 +286,9 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
287 irq_ctx_init(cpu); 286 irq_ctx_init(cpu);
288#else 287#else
289 clear_tsk_thread_flag(idle, TIF_FORK); 288 clear_tsk_thread_flag(idle, TIF_FORK);
289 per_cpu(kernel_stack, cpu) =
290 (unsigned long)task_stack_page(idle) -
291 KERNEL_STACK_OFFSET + THREAD_SIZE;
290#endif 292#endif
291 xen_setup_timer(cpu); 293 xen_setup_timer(cpu);
292 xen_init_lock_cpu(cpu); 294 xen_init_lock_cpu(cpu);
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
new file mode 100644
index 000000000000..4c6f96799131
--- /dev/null
+++ b/arch/x86/xen/xen-asm.S
@@ -0,0 +1,140 @@
1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
5
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
8
9 We only bother with direct forms (ie, vcpu in percpu data) of
10 the operations here; the indirect forms are better handled in
11 C, since they're generally too large to inline anyway.
12 */
13
14#include <asm/asm-offsets.h>
15#include <asm/percpu.h>
16#include <asm/processor-flags.h>
17
18#include "xen-asm.h"
19
20/*
21 Enable events. This clears the event mask and tests the pending
22 event status with one and operation. If there are pending
23 events, then enter the hypervisor to get them handled.
24 */
25ENTRY(xen_irq_enable_direct)
26 /* Unmask events */
27 movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
28
29 /* Preempt here doesn't matter because that will deal with
30 any pending interrupts. The pending check may end up being
31 run on the wrong CPU, but that doesn't hurt. */
32
33 /* Test for pending */
34 testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
35 jz 1f
36
372: call check_events
381:
39ENDPATCH(xen_irq_enable_direct)
40 ret
41 ENDPROC(xen_irq_enable_direct)
42 RELOC(xen_irq_enable_direct, 2b+1)
43
44
45/*
46 Disabling events is simply a matter of making the event mask
47 non-zero.
48 */
49ENTRY(xen_irq_disable_direct)
50 movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
51ENDPATCH(xen_irq_disable_direct)
52 ret
53 ENDPROC(xen_irq_disable_direct)
54 RELOC(xen_irq_disable_direct, 0)
55
56/*
57 (xen_)save_fl is used to get the current interrupt enable status.
58 Callers expect the status to be in X86_EFLAGS_IF, and other bits
59 may be set in the return value. We take advantage of this by
60 making sure that X86_EFLAGS_IF has the right value (and other bits
61 in that byte are 0), but other bits in the return value are
62 undefined. We need to toggle the state of the bit, because
63 Xen and x86 use opposite senses (mask vs enable).
64 */
65ENTRY(xen_save_fl_direct)
66 testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
67 setz %ah
68 addb %ah,%ah
69ENDPATCH(xen_save_fl_direct)
70 ret
71 ENDPROC(xen_save_fl_direct)
72 RELOC(xen_save_fl_direct, 0)
73
74
75/*
76 In principle the caller should be passing us a value return
77 from xen_save_fl_direct, but for robustness sake we test only
78 the X86_EFLAGS_IF flag rather than the whole byte. After
79 setting the interrupt mask state, it checks for unmasked
80 pending events and enters the hypervisor to get them delivered
81 if so.
82 */
83ENTRY(xen_restore_fl_direct)
84#ifdef CONFIG_X86_64
85 testw $X86_EFLAGS_IF, %di
86#else
87 testb $X86_EFLAGS_IF>>8, %ah
88#endif
89 setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
90 /* Preempt here doesn't matter because that will deal with
91 any pending interrupts. The pending check may end up being
92 run on the wrong CPU, but that doesn't hurt. */
93
94 /* check for unmasked and pending */
95 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
96 jz 1f
972: call check_events
981:
99ENDPATCH(xen_restore_fl_direct)
100 ret
101 ENDPROC(xen_restore_fl_direct)
102 RELOC(xen_restore_fl_direct, 2b+1)
103
104
105/*
106 Force an event check by making a hypercall,
107 but preserve regs before making the call.
108 */
109check_events:
110#ifdef CONFIG_X86_32
111 push %eax
112 push %ecx
113 push %edx
114 call xen_force_evtchn_callback
115 pop %edx
116 pop %ecx
117 pop %eax
118#else
119 push %rax
120 push %rcx
121 push %rdx
122 push %rsi
123 push %rdi
124 push %r8
125 push %r9
126 push %r10
127 push %r11
128 call xen_force_evtchn_callback
129 pop %r11
130 pop %r10
131 pop %r9
132 pop %r8
133 pop %rdi
134 pop %rsi
135 pop %rdx
136 pop %rcx
137 pop %rax
138#endif
139 ret
140
diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h
new file mode 100644
index 000000000000..465276467a47
--- /dev/null
+++ b/arch/x86/xen/xen-asm.h
@@ -0,0 +1,12 @@
1#ifndef _XEN_XEN_ASM_H
2#define _XEN_XEN_ASM_H
3
4#include <linux/linkage.h>
5
6#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
7#define ENDPATCH(x) .globl x##_end; x##_end=.
8
9/* Pseudo-flag used for virtual NMI, which we don't implement yet */
10#define XEN_EFLAGS_NMI 0x80000000
11
12#endif
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 42786f59d9c0..082d173caaf3 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -11,101 +11,28 @@
11 generally too large to inline anyway. 11 generally too large to inline anyway.
12 */ 12 */
13 13
14#include <linux/linkage.h> 14//#include <asm/asm-offsets.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/thread_info.h> 15#include <asm/thread_info.h>
18#include <asm/percpu.h>
19#include <asm/processor-flags.h> 16#include <asm/processor-flags.h>
20#include <asm/segment.h> 17#include <asm/segment.h>
21 18
22#include <xen/interface/xen.h> 19#include <xen/interface/xen.h>
23 20
24#define RELOC(x, v) .globl x##_reloc; x##_reloc=v 21#include "xen-asm.h"
25#define ENDPATCH(x) .globl x##_end; x##_end=.
26
27/* Pseudo-flag used for virtual NMI, which we don't implement yet */
28#define XEN_EFLAGS_NMI 0x80000000
29
30/*
31 Enable events. This clears the event mask and tests the pending
32 event status with one and operation. If there are pending
33 events, then enter the hypervisor to get them handled.
34 */
35ENTRY(xen_irq_enable_direct)
36 /* Unmask events */
37 movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
38
39 /* Preempt here doesn't matter because that will deal with
40 any pending interrupts. The pending check may end up being
41 run on the wrong CPU, but that doesn't hurt. */
42
43 /* Test for pending */
44 testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
45 jz 1f
46
472: call check_events
481:
49ENDPATCH(xen_irq_enable_direct)
50 ret
51 ENDPROC(xen_irq_enable_direct)
52 RELOC(xen_irq_enable_direct, 2b+1)
53
54 22
55/* 23/*
56 Disabling events is simply a matter of making the event mask 24 Force an event check by making a hypercall,
57 non-zero. 25 but preserve regs before making the call.
58 */
59ENTRY(xen_irq_disable_direct)
60 movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
61ENDPATCH(xen_irq_disable_direct)
62 ret
63 ENDPROC(xen_irq_disable_direct)
64 RELOC(xen_irq_disable_direct, 0)
65
66/*
67 (xen_)save_fl is used to get the current interrupt enable status.
68 Callers expect the status to be in X86_EFLAGS_IF, and other bits
69 may be set in the return value. We take advantage of this by
70 making sure that X86_EFLAGS_IF has the right value (and other bits
71 in that byte are 0), but other bits in the return value are
72 undefined. We need to toggle the state of the bit, because
73 Xen and x86 use opposite senses (mask vs enable).
74 */
75ENTRY(xen_save_fl_direct)
76 testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
77 setz %ah
78 addb %ah,%ah
79ENDPATCH(xen_save_fl_direct)
80 ret
81 ENDPROC(xen_save_fl_direct)
82 RELOC(xen_save_fl_direct, 0)
83
84
85/*
86 In principle the caller should be passing us a value return
87 from xen_save_fl_direct, but for robustness sake we test only
88 the X86_EFLAGS_IF flag rather than the whole byte. After
89 setting the interrupt mask state, it checks for unmasked
90 pending events and enters the hypervisor to get them delivered
91 if so.
92 */ 26 */
93ENTRY(xen_restore_fl_direct) 27check_events:
94 testb $X86_EFLAGS_IF>>8, %ah 28 push %eax
95 setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask 29 push %ecx
96 /* Preempt here doesn't matter because that will deal with 30 push %edx
97 any pending interrupts. The pending check may end up being 31 call xen_force_evtchn_callback
98 run on the wrong CPU, but that doesn't hurt. */ 32 pop %edx
99 33 pop %ecx
100 /* check for unmasked and pending */ 34 pop %eax
101 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
102 jz 1f
1032: call check_events
1041:
105ENDPATCH(xen_restore_fl_direct)
106 ret 35 ret
107 ENDPROC(xen_restore_fl_direct)
108 RELOC(xen_restore_fl_direct, 2b+1)
109 36
110/* 37/*
111 We can't use sysexit directly, because we're not running in ring0. 38 We can't use sysexit directly, because we're not running in ring0.
@@ -289,17 +216,3 @@ ENTRY(xen_iret_crit_fixup)
289 lea 4(%edi),%esp /* point esp to new frame */ 216 lea 4(%edi),%esp /* point esp to new frame */
2902: jmp xen_do_upcall 2172: jmp xen_do_upcall
291 218
292
293/*
294 Force an event check by making a hypercall,
295 but preserve regs before making the call.
296 */
297check_events:
298 push %eax
299 push %ecx
300 push %edx
301 call xen_force_evtchn_callback
302 pop %edx
303 pop %ecx
304 pop %eax
305 ret
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index d6fc51f4ce85..d205a283efe0 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -11,142 +11,14 @@
11 generally too large to inline anyway. 11 generally too large to inline anyway.
12 */ 12 */
13 13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/processor-flags.h>
18#include <asm/errno.h> 14#include <asm/errno.h>
19#include <asm/segment.h>
20#include <asm/percpu.h> 15#include <asm/percpu.h>
16#include <asm/processor-flags.h>
17#include <asm/segment.h>
21 18
22#include <xen/interface/xen.h> 19#include <xen/interface/xen.h>
23 20
24#define RELOC(x, v) .globl x##_reloc; x##_reloc=v 21#include "xen-asm.h"
25#define ENDPATCH(x) .globl x##_end; x##_end=.
26
27/* Pseudo-flag used for virtual NMI, which we don't implement yet */
28#define XEN_EFLAGS_NMI 0x80000000
29
30#if 1
31/*
32 FIXME: x86_64 now can support direct access to percpu variables
33 via a segment override. Update xen accordingly.
34 */
35#define BUG ud2a
36#endif
37
38/*
39 Enable events. This clears the event mask and tests the pending
40 event status with one and operation. If there are pending
41 events, then enter the hypervisor to get them handled.
42 */
43ENTRY(xen_irq_enable_direct)
44 BUG
45
46 /* Unmask events */
47 movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
48
49 /* Preempt here doesn't matter because that will deal with
50 any pending interrupts. The pending check may end up being
51 run on the wrong CPU, but that doesn't hurt. */
52
53 /* Test for pending */
54 testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
55 jz 1f
56
572: call check_events
581:
59ENDPATCH(xen_irq_enable_direct)
60 ret
61 ENDPROC(xen_irq_enable_direct)
62 RELOC(xen_irq_enable_direct, 2b+1)
63
64/*
65 Disabling events is simply a matter of making the event mask
66 non-zero.
67 */
68ENTRY(xen_irq_disable_direct)
69 BUG
70
71 movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
72ENDPATCH(xen_irq_disable_direct)
73 ret
74 ENDPROC(xen_irq_disable_direct)
75 RELOC(xen_irq_disable_direct, 0)
76
77/*
78 (xen_)save_fl is used to get the current interrupt enable status.
79 Callers expect the status to be in X86_EFLAGS_IF, and other bits
80 may be set in the return value. We take advantage of this by
81 making sure that X86_EFLAGS_IF has the right value (and other bits
82 in that byte are 0), but other bits in the return value are
83 undefined. We need to toggle the state of the bit, because
84 Xen and x86 use opposite senses (mask vs enable).
85 */
86ENTRY(xen_save_fl_direct)
87 BUG
88
89 testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
90 setz %ah
91 addb %ah,%ah
92ENDPATCH(xen_save_fl_direct)
93 ret
94 ENDPROC(xen_save_fl_direct)
95 RELOC(xen_save_fl_direct, 0)
96
97/*
98 In principle the caller should be passing us a value return
99 from xen_save_fl_direct, but for robustness sake we test only
100 the X86_EFLAGS_IF flag rather than the whole byte. After
101 setting the interrupt mask state, it checks for unmasked
102 pending events and enters the hypervisor to get them delivered
103 if so.
104 */
105ENTRY(xen_restore_fl_direct)
106 BUG
107
108 testb $X86_EFLAGS_IF>>8, %ah
109 setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
110 /* Preempt here doesn't matter because that will deal with
111 any pending interrupts. The pending check may end up being
112 run on the wrong CPU, but that doesn't hurt. */
113
114 /* check for unmasked and pending */
115 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
116 jz 1f
1172: call check_events
1181:
119ENDPATCH(xen_restore_fl_direct)
120 ret
121 ENDPROC(xen_restore_fl_direct)
122 RELOC(xen_restore_fl_direct, 2b+1)
123
124
125/*
126 Force an event check by making a hypercall,
127 but preserve regs before making the call.
128 */
129check_events:
130 push %rax
131 push %rcx
132 push %rdx
133 push %rsi
134 push %rdi
135 push %r8
136 push %r9
137 push %r10
138 push %r11
139 call xen_force_evtchn_callback
140 pop %r11
141 pop %r10
142 pop %r9
143 pop %r8
144 pop %rdi
145 pop %rsi
146 pop %rdx
147 pop %rcx
148 pop %rax
149 ret
150 22
151ENTRY(xen_adjust_exception_frame) 23ENTRY(xen_adjust_exception_frame)
152 mov 8+0(%rsp),%rcx 24 mov 8+0(%rsp),%rcx
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index c1f8faf0a2c5..2f5ef2632ea2 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -10,9 +10,12 @@
10extern const char xen_hypervisor_callback[]; 10extern const char xen_hypervisor_callback[];
11extern const char xen_failsafe_callback[]; 11extern const char xen_failsafe_callback[];
12 12
13extern void *xen_initial_gdt;
14
13struct trap_info; 15struct trap_info;
14void xen_copy_trap_info(struct trap_info *traps); 16void xen_copy_trap_info(struct trap_info *traps);
15 17
18DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
16DECLARE_PER_CPU(unsigned long, xen_cr3); 19DECLARE_PER_CPU(unsigned long, xen_cr3);
17DECLARE_PER_CPU(unsigned long, xen_current_cr3); 20DECLARE_PER_CPU(unsigned long, xen_current_cr3);
18 21
@@ -22,6 +25,13 @@ extern struct shared_info *HYPERVISOR_shared_info;
22 25
23void xen_setup_mfn_list_list(void); 26void xen_setup_mfn_list_list(void);
24void xen_setup_shared_info(void); 27void xen_setup_shared_info(void);
28void xen_setup_machphys_mapping(void);
29pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
30void xen_ident_map_ISA(void);
31void xen_reserve_top(void);
32
33void xen_leave_lazy(void);
34void xen_post_allocator_init(void);
25 35
26char * __init xen_memory_setup(void); 36char * __init xen_memory_setup(void);
27void __init xen_arch_setup(void); 37void __init xen_arch_setup(void);