aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/xen
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-03-26 16:39:17 -0400
committerIngo Molnar <mingo@elte.hu>2009-03-27 12:28:43 -0400
commit6e15cf04860074ad032e88c306bea656bbdd0f22 (patch)
treec346383bb7563e8d66b2f4a502f875b259c34870 /arch/x86/xen
parentbe0ea69674ed95e1e98cb3687a241badc756d228 (diff)
parent60db56422043aaa455ac7f858ce23c273220f9d9 (diff)
Merge branch 'core/percpu' into percpu-cpumask-x86-for-linus-2
Conflicts: arch/parisc/kernel/irq.c arch/x86/include/asm/fixmap_64.h arch/x86/include/asm/setup.h kernel/irq/handle.c Semantic merge: arch/x86/include/asm/fixmap.h Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/xen')
-rw-r--r--arch/x86/xen/Kconfig2
-rw-r--r--arch/x86/xen/Makefile3
-rw-r--r--arch/x86/xen/enlighten.c816
-rw-r--r--arch/x86/xen/irq.c39
-rw-r--r--arch/x86/xen/mmu.c757
-rw-r--r--arch/x86/xen/mmu.h3
-rw-r--r--arch/x86/xen/multicalls.c15
-rw-r--r--arch/x86/xen/multicalls.h2
-rw-r--r--arch/x86/xen/smp.c47
-rw-r--r--arch/x86/xen/suspend.c1
-rw-r--r--arch/x86/xen/xen-asm.S142
-rw-r--r--arch/x86/xen/xen-asm.h12
-rw-r--r--arch/x86/xen/xen-asm_32.S343
-rw-r--r--arch/x86/xen/xen-asm_64.S252
-rw-r--r--arch/x86/xen/xen-head.S2
-rw-r--r--arch/x86/xen/xen-ops.h10
16 files changed, 1204 insertions, 1242 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 87b9ab166423..b83e119fbeb0 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,7 +6,7 @@ config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 select PARAVIRT_CLOCK 8 select PARAVIRT_CLOCK
9 depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER)) 9 depends on X86_64 || (X86_32 && X86_PAE && !X86_VISWS)
10 depends on X86_CMPXCHG && X86_TSC 10 depends on X86_CMPXCHG && X86_TSC
11 help 11 help
12 This is the Linux Xen port. Enabling this will allow the 12 This is the Linux Xen port. Enabling this will allow the
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 6dcefba7836f..3b767d03fd6a 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -6,7 +6,8 @@ CFLAGS_REMOVE_irq.o = -pg
6endif 6endif
7 7
8obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ 8obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
9 time.o xen-asm_$(BITS).o grant-table.o suspend.o 9 time.o xen-asm.o xen-asm_$(BITS).o \
10 grant-table.o suspend.o
10 11
11obj-$(CONFIG_SMP) += smp.o spinlock.o 12obj-$(CONFIG_SMP) += smp.o spinlock.o
12obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file 13obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index b58e96338149..82cd39a6cbd3 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -61,40 +61,13 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
61enum xen_domain_type xen_domain_type = XEN_NATIVE; 61enum xen_domain_type xen_domain_type = XEN_NATIVE;
62EXPORT_SYMBOL_GPL(xen_domain_type); 62EXPORT_SYMBOL_GPL(xen_domain_type);
63 63
64/*
65 * Identity map, in addition to plain kernel map. This needs to be
66 * large enough to allocate page table pages to allocate the rest.
67 * Each page can map 2MB.
68 */
69static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
70
71#ifdef CONFIG_X86_64
72/* l3 pud for userspace vsyscall mapping */
73static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
74#endif /* CONFIG_X86_64 */
75
76/*
77 * Note about cr3 (pagetable base) values:
78 *
79 * xen_cr3 contains the current logical cr3 value; it contains the
80 * last set cr3. This may not be the current effective cr3, because
81 * its update may be being lazily deferred. However, a vcpu looking
82 * at its own cr3 can use this value knowing that it everything will
83 * be self-consistent.
84 *
85 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
86 * hypercall to set the vcpu cr3 is complete (so it may be a little
87 * out of date, but it will never be set early). If one vcpu is
88 * looking at another vcpu's cr3 value, it should use this variable.
89 */
90DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
91DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
92
93struct start_info *xen_start_info; 64struct start_info *xen_start_info;
94EXPORT_SYMBOL_GPL(xen_start_info); 65EXPORT_SYMBOL_GPL(xen_start_info);
95 66
96struct shared_info xen_dummy_shared_info; 67struct shared_info xen_dummy_shared_info;
97 68
69void *xen_initial_gdt;
70
98/* 71/*
99 * Point at some empty memory to start with. We map the real shared_info 72 * Point at some empty memory to start with. We map the real shared_info
100 * page as soon as fixmap is up and running. 73 * page as soon as fixmap is up and running.
@@ -114,14 +87,7 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
114 * 87 *
115 * 0: not available, 1: available 88 * 0: not available, 1: available
116 */ 89 */
117static int have_vcpu_info_placement = 90static int have_vcpu_info_placement = 1;
118#ifdef CONFIG_X86_32
119 1
120#else
121 0
122#endif
123 ;
124
125 91
126static void xen_vcpu_setup(int cpu) 92static void xen_vcpu_setup(int cpu)
127{ 93{
@@ -137,7 +103,7 @@ static void xen_vcpu_setup(int cpu)
137 103
138 vcpup = &per_cpu(xen_vcpu_info, cpu); 104 vcpup = &per_cpu(xen_vcpu_info, cpu);
139 105
140 info.mfn = virt_to_mfn(vcpup); 106 info.mfn = arbitrary_virt_to_mfn(vcpup);
141 info.offset = offset_in_page(vcpup); 107 info.offset = offset_in_page(vcpup);
142 108
143 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", 109 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
@@ -237,7 +203,7 @@ static unsigned long xen_get_debugreg(int reg)
237 return HYPERVISOR_get_debugreg(reg); 203 return HYPERVISOR_get_debugreg(reg);
238} 204}
239 205
240static void xen_leave_lazy(void) 206void xen_leave_lazy(void)
241{ 207{
242 paravirt_leave_lazy(paravirt_get_lazy_mode()); 208 paravirt_leave_lazy(paravirt_get_lazy_mode());
243 xen_mc_flush(); 209 xen_mc_flush();
@@ -335,8 +301,10 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
335 frames = mcs.args; 301 frames = mcs.args;
336 302
337 for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { 303 for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
338 frames[f] = virt_to_mfn(va); 304 frames[f] = arbitrary_virt_to_mfn((void *)va);
305
339 make_lowmem_page_readonly((void *)va); 306 make_lowmem_page_readonly((void *)va);
307 make_lowmem_page_readonly(mfn_to_virt(frames[f]));
340 } 308 }
341 309
342 MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); 310 MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
@@ -348,7 +316,7 @@ static void load_TLS_descriptor(struct thread_struct *t,
348 unsigned int cpu, unsigned int i) 316 unsigned int cpu, unsigned int i)
349{ 317{
350 struct desc_struct *gdt = get_cpu_gdt_table(cpu); 318 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
351 xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); 319 xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
352 struct multicall_space mc = __xen_mc_entry(0); 320 struct multicall_space mc = __xen_mc_entry(0);
353 321
354 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); 322 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
@@ -357,13 +325,14 @@ static void load_TLS_descriptor(struct thread_struct *t,
357static void xen_load_tls(struct thread_struct *t, unsigned int cpu) 325static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
358{ 326{
359 /* 327 /*
360 * XXX sleazy hack: If we're being called in a lazy-cpu zone, 328 * XXX sleazy hack: If we're being called in a lazy-cpu zone
361 * it means we're in a context switch, and %gs has just been 329 * and lazy gs handling is enabled, it means we're in a
362 * saved. This means we can zero it out to prevent faults on 330 * context switch, and %gs has just been saved. This means we
363 * exit from the hypervisor if the next process has no %gs. 331 * can zero it out to prevent faults on exit from the
364 * Either way, it has been saved, and the new value will get 332 * hypervisor if the next process has no %gs. Either way, it
365 * loaded properly. This will go away as soon as Xen has been 333 * has been saved, and the new value will get loaded properly.
366 * modified to not save/restore %gs for normal hypercalls. 334 * This will go away as soon as Xen has been modified to not
335 * save/restore %gs for normal hypercalls.
367 * 336 *
368 * On x86_64, this hack is not used for %gs, because gs points 337 * On x86_64, this hack is not used for %gs, because gs points
369 * to KERNEL_GS_BASE (and uses it for PDA references), so we 338 * to KERNEL_GS_BASE (and uses it for PDA references), so we
@@ -375,7 +344,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
375 */ 344 */
376 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { 345 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
377#ifdef CONFIG_X86_32 346#ifdef CONFIG_X86_32
378 loadsegment(gs, 0); 347 lazy_load_gs(0);
379#else 348#else
380 loadsegment(fs, 0); 349 loadsegment(fs, 0);
381#endif 350#endif
@@ -521,7 +490,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
521 break; 490 break;
522 491
523 default: { 492 default: {
524 xmaddr_t maddr = virt_to_machine(&dt[entry]); 493 xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
525 494
526 xen_mc_flush(); 495 xen_mc_flush();
527 if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) 496 if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
@@ -587,94 +556,18 @@ static u32 xen_safe_apic_wait_icr_idle(void)
587 return 0; 556 return 0;
588} 557}
589 558
590static struct apic_ops xen_basic_apic_ops = { 559static void set_xen_basic_apic_ops(void)
591 .read = xen_apic_read,
592 .write = xen_apic_write,
593 .icr_read = xen_apic_icr_read,
594 .icr_write = xen_apic_icr_write,
595 .wait_icr_idle = xen_apic_wait_icr_idle,
596 .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
597};
598
599#endif
600
601static void xen_flush_tlb(void)
602{
603 struct mmuext_op *op;
604 struct multicall_space mcs;
605
606 preempt_disable();
607
608 mcs = xen_mc_entry(sizeof(*op));
609
610 op = mcs.args;
611 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
612 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
613
614 xen_mc_issue(PARAVIRT_LAZY_MMU);
615
616 preempt_enable();
617}
618
619static void xen_flush_tlb_single(unsigned long addr)
620{ 560{
621 struct mmuext_op *op; 561 apic->read = xen_apic_read;
622 struct multicall_space mcs; 562 apic->write = xen_apic_write;
623 563 apic->icr_read = xen_apic_icr_read;
624 preempt_disable(); 564 apic->icr_write = xen_apic_icr_write;
625 565 apic->wait_icr_idle = xen_apic_wait_icr_idle;
626 mcs = xen_mc_entry(sizeof(*op)); 566 apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
627 op = mcs.args;
628 op->cmd = MMUEXT_INVLPG_LOCAL;
629 op->arg1.linear_addr = addr & PAGE_MASK;
630 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
631
632 xen_mc_issue(PARAVIRT_LAZY_MMU);
633
634 preempt_enable();
635} 567}
636 568
637static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, 569#endif
638 unsigned long va)
639{
640 struct {
641 struct mmuext_op op;
642 cpumask_t mask;
643 } *args;
644 cpumask_t cpumask = *cpus;
645 struct multicall_space mcs;
646
647 /*
648 * A couple of (to be removed) sanity checks:
649 *
650 * - current CPU must not be in mask
651 * - mask must exist :)
652 */
653 BUG_ON(cpus_empty(cpumask));
654 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
655 BUG_ON(!mm);
656
657 /* If a CPU which we ran on has gone down, OK. */
658 cpus_and(cpumask, cpumask, cpu_online_map);
659 if (cpus_empty(cpumask))
660 return;
661
662 mcs = xen_mc_entry(sizeof(*args));
663 args = mcs.args;
664 args->mask = cpumask;
665 args->op.arg2.vcpumask = &args->mask;
666
667 if (va == TLB_FLUSH_ALL) {
668 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
669 } else {
670 args->op.cmd = MMUEXT_INVLPG_MULTI;
671 args->op.arg1.linear_addr = va;
672 }
673
674 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
675 570
676 xen_mc_issue(PARAVIRT_LAZY_MMU);
677}
678 571
679static void xen_clts(void) 572static void xen_clts(void)
680{ 573{
@@ -700,21 +593,6 @@ static void xen_write_cr0(unsigned long cr0)
700 xen_mc_issue(PARAVIRT_LAZY_CPU); 593 xen_mc_issue(PARAVIRT_LAZY_CPU);
701} 594}
702 595
703static void xen_write_cr2(unsigned long cr2)
704{
705 x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
706}
707
708static unsigned long xen_read_cr2(void)
709{
710 return x86_read_percpu(xen_vcpu)->arch.cr2;
711}
712
713static unsigned long xen_read_cr2_direct(void)
714{
715 return x86_read_percpu(xen_vcpu_info.arch.cr2);
716}
717
718static void xen_write_cr4(unsigned long cr4) 596static void xen_write_cr4(unsigned long cr4)
719{ 597{
720 cr4 &= ~X86_CR4_PGE; 598 cr4 &= ~X86_CR4_PGE;
@@ -723,71 +601,6 @@ static void xen_write_cr4(unsigned long cr4)
723 native_write_cr4(cr4); 601 native_write_cr4(cr4);
724} 602}
725 603
726static unsigned long xen_read_cr3(void)
727{
728 return x86_read_percpu(xen_cr3);
729}
730
731static void set_current_cr3(void *v)
732{
733 x86_write_percpu(xen_current_cr3, (unsigned long)v);
734}
735
736static void __xen_write_cr3(bool kernel, unsigned long cr3)
737{
738 struct mmuext_op *op;
739 struct multicall_space mcs;
740 unsigned long mfn;
741
742 if (cr3)
743 mfn = pfn_to_mfn(PFN_DOWN(cr3));
744 else
745 mfn = 0;
746
747 WARN_ON(mfn == 0 && kernel);
748
749 mcs = __xen_mc_entry(sizeof(*op));
750
751 op = mcs.args;
752 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
753 op->arg1.mfn = mfn;
754
755 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
756
757 if (kernel) {
758 x86_write_percpu(xen_cr3, cr3);
759
760 /* Update xen_current_cr3 once the batch has actually
761 been submitted. */
762 xen_mc_callback(set_current_cr3, (void *)cr3);
763 }
764}
765
766static void xen_write_cr3(unsigned long cr3)
767{
768 BUG_ON(preemptible());
769
770 xen_mc_batch(); /* disables interrupts */
771
772 /* Update while interrupts are disabled, so its atomic with
773 respect to ipis */
774 x86_write_percpu(xen_cr3, cr3);
775
776 __xen_write_cr3(true, cr3);
777
778#ifdef CONFIG_X86_64
779 {
780 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
781 if (user_pgd)
782 __xen_write_cr3(false, __pa(user_pgd));
783 else
784 __xen_write_cr3(false, 0);
785 }
786#endif
787
788 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
789}
790
791static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) 604static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
792{ 605{
793 int ret; 606 int ret;
@@ -829,185 +642,6 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
829 return ret; 642 return ret;
830} 643}
831 644
832/* Early in boot, while setting up the initial pagetable, assume
833 everything is pinned. */
834static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
835{
836#ifdef CONFIG_FLATMEM
837 BUG_ON(mem_map); /* should only be used early */
838#endif
839 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
840}
841
842/* Early release_pte assumes that all pts are pinned, since there's
843 only init_mm and anything attached to that is pinned. */
844static void xen_release_pte_init(unsigned long pfn)
845{
846 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
847}
848
849static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
850{
851 struct mmuext_op op;
852 op.cmd = cmd;
853 op.arg1.mfn = pfn_to_mfn(pfn);
854 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
855 BUG();
856}
857
858/* This needs to make sure the new pte page is pinned iff its being
859 attached to a pinned pagetable. */
860static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
861{
862 struct page *page = pfn_to_page(pfn);
863
864 if (PagePinned(virt_to_page(mm->pgd))) {
865 SetPagePinned(page);
866
867 vm_unmap_aliases();
868 if (!PageHighMem(page)) {
869 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
870 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
871 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
872 } else {
873 /* make sure there are no stray mappings of
874 this page */
875 kmap_flush_unused();
876 }
877 }
878}
879
880static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
881{
882 xen_alloc_ptpage(mm, pfn, PT_PTE);
883}
884
885static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
886{
887 xen_alloc_ptpage(mm, pfn, PT_PMD);
888}
889
890static int xen_pgd_alloc(struct mm_struct *mm)
891{
892 pgd_t *pgd = mm->pgd;
893 int ret = 0;
894
895 BUG_ON(PagePinned(virt_to_page(pgd)));
896
897#ifdef CONFIG_X86_64
898 {
899 struct page *page = virt_to_page(pgd);
900 pgd_t *user_pgd;
901
902 BUG_ON(page->private != 0);
903
904 ret = -ENOMEM;
905
906 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
907 page->private = (unsigned long)user_pgd;
908
909 if (user_pgd != NULL) {
910 user_pgd[pgd_index(VSYSCALL_START)] =
911 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
912 ret = 0;
913 }
914
915 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
916 }
917#endif
918
919 return ret;
920}
921
922static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
923{
924#ifdef CONFIG_X86_64
925 pgd_t *user_pgd = xen_get_user_pgd(pgd);
926
927 if (user_pgd)
928 free_page((unsigned long)user_pgd);
929#endif
930}
931
932/* This should never happen until we're OK to use struct page */
933static void xen_release_ptpage(unsigned long pfn, unsigned level)
934{
935 struct page *page = pfn_to_page(pfn);
936
937 if (PagePinned(page)) {
938 if (!PageHighMem(page)) {
939 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
940 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
941 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
942 }
943 ClearPagePinned(page);
944 }
945}
946
947static void xen_release_pte(unsigned long pfn)
948{
949 xen_release_ptpage(pfn, PT_PTE);
950}
951
952static void xen_release_pmd(unsigned long pfn)
953{
954 xen_release_ptpage(pfn, PT_PMD);
955}
956
957#if PAGETABLE_LEVELS == 4
958static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
959{
960 xen_alloc_ptpage(mm, pfn, PT_PUD);
961}
962
963static void xen_release_pud(unsigned long pfn)
964{
965 xen_release_ptpage(pfn, PT_PUD);
966}
967#endif
968
969#ifdef CONFIG_HIGHPTE
970static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
971{
972 pgprot_t prot = PAGE_KERNEL;
973
974 if (PagePinned(page))
975 prot = PAGE_KERNEL_RO;
976
977 if (0 && PageHighMem(page))
978 printk("mapping highpte %lx type %d prot %s\n",
979 page_to_pfn(page), type,
980 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
981
982 return kmap_atomic_prot(page, type, prot);
983}
984#endif
985
986#ifdef CONFIG_X86_32
987static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
988{
989 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
990 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
991 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
992 pte_val_ma(pte));
993
994 return pte;
995}
996
997/* Init-time set_pte while constructing initial pagetables, which
998 doesn't allow RO pagetable pages to be remapped RW */
999static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1000{
1001 pte = mask_rw_pte(ptep, pte);
1002
1003 xen_set_pte(ptep, pte);
1004}
1005#endif
1006
1007static __init void xen_pagetable_setup_start(pgd_t *base)
1008{
1009}
1010
1011void xen_setup_shared_info(void) 645void xen_setup_shared_info(void)
1012{ 646{
1013 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 647 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
@@ -1028,37 +662,6 @@ void xen_setup_shared_info(void)
1028 xen_setup_mfn_list_list(); 662 xen_setup_mfn_list_list();
1029} 663}
1030 664
1031static __init void xen_pagetable_setup_done(pgd_t *base)
1032{
1033 xen_setup_shared_info();
1034}
1035
1036static __init void xen_post_allocator_init(void)
1037{
1038 pv_mmu_ops.set_pte = xen_set_pte;
1039 pv_mmu_ops.set_pmd = xen_set_pmd;
1040 pv_mmu_ops.set_pud = xen_set_pud;
1041#if PAGETABLE_LEVELS == 4
1042 pv_mmu_ops.set_pgd = xen_set_pgd;
1043#endif
1044
1045 /* This will work as long as patching hasn't happened yet
1046 (which it hasn't) */
1047 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1048 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1049 pv_mmu_ops.release_pte = xen_release_pte;
1050 pv_mmu_ops.release_pmd = xen_release_pmd;
1051#if PAGETABLE_LEVELS == 4
1052 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1053 pv_mmu_ops.release_pud = xen_release_pud;
1054#endif
1055
1056#ifdef CONFIG_X86_64
1057 SetPagePinned(virt_to_page(level3_user_vsyscall));
1058#endif
1059 xen_mark_init_mm_pinned();
1060}
1061
1062/* This is called once we have the cpu_possible_map */ 665/* This is called once we have the cpu_possible_map */
1063void xen_setup_vcpu_info_placement(void) 666void xen_setup_vcpu_info_placement(void)
1064{ 667{
@@ -1072,10 +675,10 @@ void xen_setup_vcpu_info_placement(void)
1072 if (have_vcpu_info_placement) { 675 if (have_vcpu_info_placement) {
1073 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 676 printk(KERN_INFO "Xen: using vcpu_info placement\n");
1074 677
1075 pv_irq_ops.save_fl = xen_save_fl_direct; 678 pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
1076 pv_irq_ops.restore_fl = xen_restore_fl_direct; 679 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
1077 pv_irq_ops.irq_disable = xen_irq_disable_direct; 680 pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
1078 pv_irq_ops.irq_enable = xen_irq_enable_direct; 681 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
1079 pv_mmu_ops.read_cr2 = xen_read_cr2_direct; 682 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
1080 } 683 }
1081} 684}
@@ -1133,49 +736,6 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
1133 return ret; 736 return ret;
1134} 737}
1135 738
1136static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1137{
1138 pte_t pte;
1139
1140 phys >>= PAGE_SHIFT;
1141
1142 switch (idx) {
1143 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1144#ifdef CONFIG_X86_F00F_BUG
1145 case FIX_F00F_IDT:
1146#endif
1147#ifdef CONFIG_X86_32
1148 case FIX_WP_TEST:
1149 case FIX_VDSO:
1150# ifdef CONFIG_HIGHMEM
1151 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1152# endif
1153#else
1154 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1155#endif
1156#ifdef CONFIG_X86_LOCAL_APIC
1157 case FIX_APIC_BASE: /* maps dummy local APIC */
1158#endif
1159 pte = pfn_pte(phys, prot);
1160 break;
1161
1162 default:
1163 pte = mfn_pte(phys, prot);
1164 break;
1165 }
1166
1167 __native_set_fixmap(idx, pte);
1168
1169#ifdef CONFIG_X86_64
1170 /* Replicate changes to map the vsyscall page into the user
1171 pagetable vsyscall mapping. */
1172 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1173 unsigned long vaddr = __fix_to_virt(idx);
1174 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1175 }
1176#endif
1177}
1178
1179static const struct pv_info xen_info __initdata = { 739static const struct pv_info xen_info __initdata = {
1180 .paravirt_enabled = 1, 740 .paravirt_enabled = 1,
1181 .shared_kernel_pmd = 0, 741 .shared_kernel_pmd = 0,
@@ -1271,87 +831,6 @@ static const struct pv_apic_ops xen_apic_ops __initdata = {
1271#endif 831#endif
1272}; 832};
1273 833
1274static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1275 .pagetable_setup_start = xen_pagetable_setup_start,
1276 .pagetable_setup_done = xen_pagetable_setup_done,
1277
1278 .read_cr2 = xen_read_cr2,
1279 .write_cr2 = xen_write_cr2,
1280
1281 .read_cr3 = xen_read_cr3,
1282 .write_cr3 = xen_write_cr3,
1283
1284 .flush_tlb_user = xen_flush_tlb,
1285 .flush_tlb_kernel = xen_flush_tlb,
1286 .flush_tlb_single = xen_flush_tlb_single,
1287 .flush_tlb_others = xen_flush_tlb_others,
1288
1289 .pte_update = paravirt_nop,
1290 .pte_update_defer = paravirt_nop,
1291
1292 .pgd_alloc = xen_pgd_alloc,
1293 .pgd_free = xen_pgd_free,
1294
1295 .alloc_pte = xen_alloc_pte_init,
1296 .release_pte = xen_release_pte_init,
1297 .alloc_pmd = xen_alloc_pte_init,
1298 .alloc_pmd_clone = paravirt_nop,
1299 .release_pmd = xen_release_pte_init,
1300
1301#ifdef CONFIG_HIGHPTE
1302 .kmap_atomic_pte = xen_kmap_atomic_pte,
1303#endif
1304
1305#ifdef CONFIG_X86_64
1306 .set_pte = xen_set_pte,
1307#else
1308 .set_pte = xen_set_pte_init,
1309#endif
1310 .set_pte_at = xen_set_pte_at,
1311 .set_pmd = xen_set_pmd_hyper,
1312
1313 .ptep_modify_prot_start = __ptep_modify_prot_start,
1314 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1315
1316 .pte_val = xen_pte_val,
1317 .pte_flags = native_pte_flags,
1318 .pgd_val = xen_pgd_val,
1319
1320 .make_pte = xen_make_pte,
1321 .make_pgd = xen_make_pgd,
1322
1323#ifdef CONFIG_X86_PAE
1324 .set_pte_atomic = xen_set_pte_atomic,
1325 .set_pte_present = xen_set_pte_at,
1326 .pte_clear = xen_pte_clear,
1327 .pmd_clear = xen_pmd_clear,
1328#endif /* CONFIG_X86_PAE */
1329 .set_pud = xen_set_pud_hyper,
1330
1331 .make_pmd = xen_make_pmd,
1332 .pmd_val = xen_pmd_val,
1333
1334#if PAGETABLE_LEVELS == 4
1335 .pud_val = xen_pud_val,
1336 .make_pud = xen_make_pud,
1337 .set_pgd = xen_set_pgd_hyper,
1338
1339 .alloc_pud = xen_alloc_pte_init,
1340 .release_pud = xen_release_pte_init,
1341#endif /* PAGETABLE_LEVELS == 4 */
1342
1343 .activate_mm = xen_activate_mm,
1344 .dup_mmap = xen_dup_mmap,
1345 .exit_mmap = xen_exit_mmap,
1346
1347 .lazy_mode = {
1348 .enter = paravirt_enter_lazy_mmu,
1349 .leave = xen_leave_lazy,
1350 },
1351
1352 .set_fixmap = xen_set_fixmap,
1353};
1354
1355static void xen_reboot(int reason) 834static void xen_reboot(int reason)
1356{ 835{
1357 struct sched_shutdown r = { .reason = reason }; 836 struct sched_shutdown r = { .reason = reason };
@@ -1394,223 +873,6 @@ static const struct machine_ops __initdata xen_machine_ops = {
1394}; 873};
1395 874
1396 875
1397static void __init xen_reserve_top(void)
1398{
1399#ifdef CONFIG_X86_32
1400 unsigned long top = HYPERVISOR_VIRT_START;
1401 struct xen_platform_parameters pp;
1402
1403 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1404 top = pp.virt_start;
1405
1406 reserve_top_address(-top);
1407#endif /* CONFIG_X86_32 */
1408}
1409
1410/*
1411 * Like __va(), but returns address in the kernel mapping (which is
1412 * all we have until the physical memory mapping has been set up.
1413 */
1414static void *__ka(phys_addr_t paddr)
1415{
1416#ifdef CONFIG_X86_64
1417 return (void *)(paddr + __START_KERNEL_map);
1418#else
1419 return __va(paddr);
1420#endif
1421}
1422
1423/* Convert a machine address to physical address */
1424static unsigned long m2p(phys_addr_t maddr)
1425{
1426 phys_addr_t paddr;
1427
1428 maddr &= PTE_PFN_MASK;
1429 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1430
1431 return paddr;
1432}
1433
1434/* Convert a machine address to kernel virtual */
1435static void *m2v(phys_addr_t maddr)
1436{
1437 return __ka(m2p(maddr));
1438}
1439
1440static void set_page_prot(void *addr, pgprot_t prot)
1441{
1442 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1443 pte_t pte = pfn_pte(pfn, prot);
1444
1445 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1446 BUG();
1447}
1448
1449static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1450{
1451 unsigned pmdidx, pteidx;
1452 unsigned ident_pte;
1453 unsigned long pfn;
1454
1455 ident_pte = 0;
1456 pfn = 0;
1457 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1458 pte_t *pte_page;
1459
1460 /* Reuse or allocate a page of ptes */
1461 if (pmd_present(pmd[pmdidx]))
1462 pte_page = m2v(pmd[pmdidx].pmd);
1463 else {
1464 /* Check for free pte pages */
1465 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1466 break;
1467
1468 pte_page = &level1_ident_pgt[ident_pte];
1469 ident_pte += PTRS_PER_PTE;
1470
1471 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1472 }
1473
1474 /* Install mappings */
1475 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1476 pte_t pte;
1477
1478 if (pfn > max_pfn_mapped)
1479 max_pfn_mapped = pfn;
1480
1481 if (!pte_none(pte_page[pteidx]))
1482 continue;
1483
1484 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1485 pte_page[pteidx] = pte;
1486 }
1487 }
1488
1489 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1490 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1491
1492 set_page_prot(pmd, PAGE_KERNEL_RO);
1493}
1494
1495#ifdef CONFIG_X86_64
1496static void convert_pfn_mfn(void *v)
1497{
1498 pte_t *pte = v;
1499 int i;
1500
1501 /* All levels are converted the same way, so just treat them
1502 as ptes. */
1503 for (i = 0; i < PTRS_PER_PTE; i++)
1504 pte[i] = xen_make_pte(pte[i].pte);
1505}
1506
1507/*
1508 * Set up the inital kernel pagetable.
1509 *
1510 * We can construct this by grafting the Xen provided pagetable into
1511 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1512 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1513 * means that only the kernel has a physical mapping to start with -
1514 * but that's enough to get __va working. We need to fill in the rest
1515 * of the physical mapping once some sort of allocator has been set
1516 * up.
1517 */
1518static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1519 unsigned long max_pfn)
1520{
1521 pud_t *l3;
1522 pmd_t *l2;
1523
1524 /* Zap identity mapping */
1525 init_level4_pgt[0] = __pgd(0);
1526
1527 /* Pre-constructed entries are in pfn, so convert to mfn */
1528 convert_pfn_mfn(init_level4_pgt);
1529 convert_pfn_mfn(level3_ident_pgt);
1530 convert_pfn_mfn(level3_kernel_pgt);
1531
1532 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1533 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1534
1535 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1536 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1537
1538 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1539 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1540 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1541
1542 /* Set up identity map */
1543 xen_map_identity_early(level2_ident_pgt, max_pfn);
1544
1545 /* Make pagetable pieces RO */
1546 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1547 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1548 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1549 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1550 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1551 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1552
1553 /* Pin down new L4 */
1554 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1555 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1556
1557 /* Unpin Xen-provided one */
1558 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1559
1560 /* Switch over */
1561 pgd = init_level4_pgt;
1562
1563 /*
1564 * At this stage there can be no user pgd, and no page
1565 * structure to attach it to, so make sure we just set kernel
1566 * pgd.
1567 */
1568 xen_mc_batch();
1569 __xen_write_cr3(true, __pa(pgd));
1570 xen_mc_issue(PARAVIRT_LAZY_CPU);
1571
1572 reserve_early(__pa(xen_start_info->pt_base),
1573 __pa(xen_start_info->pt_base +
1574 xen_start_info->nr_pt_frames * PAGE_SIZE),
1575 "XEN PAGETABLES");
1576
1577 return pgd;
1578}
1579#else /* !CONFIG_X86_64 */
1580static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1581
1582static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1583 unsigned long max_pfn)
1584{
1585 pmd_t *kernel_pmd;
1586
1587 init_pg_tables_start = __pa(pgd);
1588 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1589 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1590
1591 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1592 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1593
1594 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1595
1596 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1597 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1598 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1599
1600 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1601 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1602 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1603
1604 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1605
1606 xen_write_cr3(__pa(swapper_pg_dir));
1607
1608 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1609
1610 return swapper_pg_dir;
1611}
1612#endif /* CONFIG_X86_64 */
1613
1614/* First C function to be called on Xen boot */ 876/* First C function to be called on Xen boot */
1615asmlinkage void __init xen_start_kernel(void) 877asmlinkage void __init xen_start_kernel(void)
1616{ 878{
@@ -1639,7 +901,7 @@ asmlinkage void __init xen_start_kernel(void)
1639 /* 901 /*
1640 * set up the basic apic ops. 902 * set up the basic apic ops.
1641 */ 903 */
1642 apic_ops = &xen_basic_apic_ops; 904 set_xen_basic_apic_ops();
1643#endif 905#endif
1644 906
1645 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { 907 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
@@ -1650,10 +912,18 @@ asmlinkage void __init xen_start_kernel(void)
1650 machine_ops = xen_machine_ops; 912 machine_ops = xen_machine_ops;
1651 913
1652#ifdef CONFIG_X86_64 914#ifdef CONFIG_X86_64
1653 /* Disable until direct per-cpu data access. */ 915 /*
1654 have_vcpu_info_placement = 0; 916 * Setup percpu state. We only need to do this for 64-bit
1655 x86_64_init_pda(); 917 * because 32-bit already has %fs set properly.
918 */
919 load_percpu_segment(0);
1656#endif 920#endif
921 /*
922 * The only reliable way to retain the initial address of the
923 * percpu gdt_page is to remember it here, so we can go and
924 * mark it RW later, when the initial percpu area is freed.
925 */
926 xen_initial_gdt = &per_cpu(gdt_page, 0);
1657 927
1658 xen_smp_init(); 928 xen_smp_init();
1659 929
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index bb042608c602..cfd17799bd6d 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -19,27 +19,12 @@ void xen_force_evtchn_callback(void)
19 (void)HYPERVISOR_xen_version(0, NULL); 19 (void)HYPERVISOR_xen_version(0, NULL);
20} 20}
21 21
22static void __init __xen_init_IRQ(void)
23{
24 int i;
25
26 /* Create identity vector->irq map */
27 for(i = 0; i < NR_VECTORS; i++) {
28 int cpu;
29
30 for_each_possible_cpu(cpu)
31 per_cpu(vector_irq, cpu)[i] = i;
32 }
33
34 xen_init_IRQ();
35}
36
37static unsigned long xen_save_fl(void) 22static unsigned long xen_save_fl(void)
38{ 23{
39 struct vcpu_info *vcpu; 24 struct vcpu_info *vcpu;
40 unsigned long flags; 25 unsigned long flags;
41 26
42 vcpu = x86_read_percpu(xen_vcpu); 27 vcpu = percpu_read(xen_vcpu);
43 28
44 /* flag has opposite sense of mask */ 29 /* flag has opposite sense of mask */
45 flags = !vcpu->evtchn_upcall_mask; 30 flags = !vcpu->evtchn_upcall_mask;
@@ -50,6 +35,7 @@ static unsigned long xen_save_fl(void)
50 */ 35 */
51 return (-flags) & X86_EFLAGS_IF; 36 return (-flags) & X86_EFLAGS_IF;
52} 37}
38PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
53 39
54static void xen_restore_fl(unsigned long flags) 40static void xen_restore_fl(unsigned long flags)
55{ 41{
@@ -62,7 +48,7 @@ static void xen_restore_fl(unsigned long flags)
62 make sure we're don't switch CPUs between getting the vcpu 48 make sure we're don't switch CPUs between getting the vcpu
63 pointer and updating the mask. */ 49 pointer and updating the mask. */
64 preempt_disable(); 50 preempt_disable();
65 vcpu = x86_read_percpu(xen_vcpu); 51 vcpu = percpu_read(xen_vcpu);
66 vcpu->evtchn_upcall_mask = flags; 52 vcpu->evtchn_upcall_mask = flags;
67 preempt_enable_no_resched(); 53 preempt_enable_no_resched();
68 54
@@ -76,6 +62,7 @@ static void xen_restore_fl(unsigned long flags)
76 xen_force_evtchn_callback(); 62 xen_force_evtchn_callback();
77 } 63 }
78} 64}
65PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
79 66
80static void xen_irq_disable(void) 67static void xen_irq_disable(void)
81{ 68{
@@ -83,9 +70,10 @@ static void xen_irq_disable(void)
83 make sure we're don't switch CPUs between getting the vcpu 70 make sure we're don't switch CPUs between getting the vcpu
84 pointer and updating the mask. */ 71 pointer and updating the mask. */
85 preempt_disable(); 72 preempt_disable();
86 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; 73 percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
87 preempt_enable_no_resched(); 74 preempt_enable_no_resched();
88} 75}
76PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
89 77
90static void xen_irq_enable(void) 78static void xen_irq_enable(void)
91{ 79{
@@ -96,7 +84,7 @@ static void xen_irq_enable(void)
96 the caller is confused and is trying to re-enable interrupts 84 the caller is confused and is trying to re-enable interrupts
97 on an indeterminate processor. */ 85 on an indeterminate processor. */
98 86
99 vcpu = x86_read_percpu(xen_vcpu); 87 vcpu = percpu_read(xen_vcpu);
100 vcpu->evtchn_upcall_mask = 0; 88 vcpu->evtchn_upcall_mask = 0;
101 89
102 /* Doesn't matter if we get preempted here, because any 90 /* Doesn't matter if we get preempted here, because any
@@ -106,6 +94,7 @@ static void xen_irq_enable(void)
106 if (unlikely(vcpu->evtchn_upcall_pending)) 94 if (unlikely(vcpu->evtchn_upcall_pending))
107 xen_force_evtchn_callback(); 95 xen_force_evtchn_callback();
108} 96}
97PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
109 98
110static void xen_safe_halt(void) 99static void xen_safe_halt(void)
111{ 100{
@@ -123,11 +112,13 @@ static void xen_halt(void)
123} 112}
124 113
125static const struct pv_irq_ops xen_irq_ops __initdata = { 114static const struct pv_irq_ops xen_irq_ops __initdata = {
126 .init_IRQ = __xen_init_IRQ, 115 .init_IRQ = xen_init_IRQ,
127 .save_fl = xen_save_fl, 116
128 .restore_fl = xen_restore_fl, 117 .save_fl = PV_CALLEE_SAVE(xen_save_fl),
129 .irq_disable = xen_irq_disable, 118 .restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
130 .irq_enable = xen_irq_enable, 119 .irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
120 .irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
121
131 .safe_halt = xen_safe_halt, 122 .safe_halt = xen_safe_halt,
132 .halt = xen_halt, 123 .halt = xen_halt,
133#ifdef CONFIG_X86_64 124#ifdef CONFIG_X86_64
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 503c240e26c7..cb6afa4ec95c 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -47,6 +47,7 @@
47#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/fixmap.h> 48#include <asm/fixmap.h>
49#include <asm/mmu_context.h> 49#include <asm/mmu_context.h>
50#include <asm/setup.h>
50#include <asm/paravirt.h> 51#include <asm/paravirt.h>
51#include <asm/linkage.h> 52#include <asm/linkage.h>
52 53
@@ -55,6 +56,8 @@
55 56
56#include <xen/page.h> 57#include <xen/page.h>
57#include <xen/interface/xen.h> 58#include <xen/interface/xen.h>
59#include <xen/interface/version.h>
60#include <xen/hvc-console.h>
58 61
59#include "multicalls.h" 62#include "multicalls.h"
60#include "mmu.h" 63#include "mmu.h"
@@ -114,6 +117,37 @@ static inline void check_zero(void)
114 117
115#endif /* CONFIG_XEN_DEBUG_FS */ 118#endif /* CONFIG_XEN_DEBUG_FS */
116 119
120
121/*
122 * Identity map, in addition to plain kernel map. This needs to be
123 * large enough to allocate page table pages to allocate the rest.
124 * Each page can map 2MB.
125 */
126static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
127
128#ifdef CONFIG_X86_64
129/* l3 pud for userspace vsyscall mapping */
130static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
131#endif /* CONFIG_X86_64 */
132
133/*
134 * Note about cr3 (pagetable base) values:
135 *
136 * xen_cr3 contains the current logical cr3 value; it contains the
137 * last set cr3. This may not be the current effective cr3, because
138 * its update may be being lazily deferred. However, a vcpu looking
139 * at its own cr3 can use this value knowing that it everything will
140 * be self-consistent.
141 *
142 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
143 * hypercall to set the vcpu cr3 is complete (so it may be a little
144 * out of date, but it will never be set early). If one vcpu is
145 * looking at another vcpu's cr3 value, it should use this variable.
146 */
147DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
148DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
149
150
117/* 151/*
118 * Just beyond the highest usermode address. STACK_TOP_MAX has a 152 * Just beyond the highest usermode address. STACK_TOP_MAX has a
119 * redzone above it, so round it up to a PGD boundary. 153 * redzone above it, so round it up to a PGD boundary.
@@ -242,6 +276,13 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
242 p2m_top[topidx][idx] = mfn; 276 p2m_top[topidx][idx] = mfn;
243} 277}
244 278
279unsigned long arbitrary_virt_to_mfn(void *vaddr)
280{
281 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
282
283 return PFN_DOWN(maddr.maddr);
284}
285
245xmaddr_t arbitrary_virt_to_machine(void *vaddr) 286xmaddr_t arbitrary_virt_to_machine(void *vaddr)
246{ 287{
247 unsigned long address = (unsigned long)vaddr; 288 unsigned long address = (unsigned long)vaddr;
@@ -458,28 +499,33 @@ pteval_t xen_pte_val(pte_t pte)
458{ 499{
459 return pte_mfn_to_pfn(pte.pte); 500 return pte_mfn_to_pfn(pte.pte);
460} 501}
502PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
461 503
462pgdval_t xen_pgd_val(pgd_t pgd) 504pgdval_t xen_pgd_val(pgd_t pgd)
463{ 505{
464 return pte_mfn_to_pfn(pgd.pgd); 506 return pte_mfn_to_pfn(pgd.pgd);
465} 507}
508PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
466 509
467pte_t xen_make_pte(pteval_t pte) 510pte_t xen_make_pte(pteval_t pte)
468{ 511{
469 pte = pte_pfn_to_mfn(pte); 512 pte = pte_pfn_to_mfn(pte);
470 return native_make_pte(pte); 513 return native_make_pte(pte);
471} 514}
515PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
472 516
473pgd_t xen_make_pgd(pgdval_t pgd) 517pgd_t xen_make_pgd(pgdval_t pgd)
474{ 518{
475 pgd = pte_pfn_to_mfn(pgd); 519 pgd = pte_pfn_to_mfn(pgd);
476 return native_make_pgd(pgd); 520 return native_make_pgd(pgd);
477} 521}
522PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
478 523
479pmdval_t xen_pmd_val(pmd_t pmd) 524pmdval_t xen_pmd_val(pmd_t pmd)
480{ 525{
481 return pte_mfn_to_pfn(pmd.pmd); 526 return pte_mfn_to_pfn(pmd.pmd);
482} 527}
528PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
483 529
484void xen_set_pud_hyper(pud_t *ptr, pud_t val) 530void xen_set_pud_hyper(pud_t *ptr, pud_t val)
485{ 531{
@@ -556,12 +602,14 @@ pmd_t xen_make_pmd(pmdval_t pmd)
556 pmd = pte_pfn_to_mfn(pmd); 602 pmd = pte_pfn_to_mfn(pmd);
557 return native_make_pmd(pmd); 603 return native_make_pmd(pmd);
558} 604}
605PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
559 606
560#if PAGETABLE_LEVELS == 4 607#if PAGETABLE_LEVELS == 4
561pudval_t xen_pud_val(pud_t pud) 608pudval_t xen_pud_val(pud_t pud)
562{ 609{
563 return pte_mfn_to_pfn(pud.pud); 610 return pte_mfn_to_pfn(pud.pud);
564} 611}
612PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
565 613
566pud_t xen_make_pud(pudval_t pud) 614pud_t xen_make_pud(pudval_t pud)
567{ 615{
@@ -569,6 +617,7 @@ pud_t xen_make_pud(pudval_t pud)
569 617
570 return native_make_pud(pud); 618 return native_make_pud(pud);
571} 619}
620PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
572 621
573pgd_t *xen_get_user_pgd(pgd_t *pgd) 622pgd_t *xen_get_user_pgd(pgd_t *pgd)
574{ 623{
@@ -1063,18 +1112,14 @@ static void drop_other_mm_ref(void *info)
1063 struct mm_struct *mm = info; 1112 struct mm_struct *mm = info;
1064 struct mm_struct *active_mm; 1113 struct mm_struct *active_mm;
1065 1114
1066#ifdef CONFIG_X86_64 1115 active_mm = percpu_read(cpu_tlbstate.active_mm);
1067 active_mm = read_pda(active_mm);
1068#else
1069 active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
1070#endif
1071 1116
1072 if (active_mm == mm) 1117 if (active_mm == mm)
1073 leave_mm(smp_processor_id()); 1118 leave_mm(smp_processor_id());
1074 1119
1075 /* If this cpu still has a stale cr3 reference, then make sure 1120 /* If this cpu still has a stale cr3 reference, then make sure
1076 it has been flushed. */ 1121 it has been flushed. */
1077 if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) { 1122 if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
1078 load_cr3(swapper_pg_dir); 1123 load_cr3(swapper_pg_dir);
1079 arch_flush_lazy_cpu_mode(); 1124 arch_flush_lazy_cpu_mode();
1080 } 1125 }
@@ -1156,6 +1201,706 @@ void xen_exit_mmap(struct mm_struct *mm)
1156 spin_unlock(&mm->page_table_lock); 1201 spin_unlock(&mm->page_table_lock);
1157} 1202}
1158 1203
1204static __init void xen_pagetable_setup_start(pgd_t *base)
1205{
1206}
1207
1208static __init void xen_pagetable_setup_done(pgd_t *base)
1209{
1210 xen_setup_shared_info();
1211}
1212
1213static void xen_write_cr2(unsigned long cr2)
1214{
1215 percpu_read(xen_vcpu)->arch.cr2 = cr2;
1216}
1217
1218static unsigned long xen_read_cr2(void)
1219{
1220 return percpu_read(xen_vcpu)->arch.cr2;
1221}
1222
1223unsigned long xen_read_cr2_direct(void)
1224{
1225 return percpu_read(xen_vcpu_info.arch.cr2);
1226}
1227
1228static void xen_flush_tlb(void)
1229{
1230 struct mmuext_op *op;
1231 struct multicall_space mcs;
1232
1233 preempt_disable();
1234
1235 mcs = xen_mc_entry(sizeof(*op));
1236
1237 op = mcs.args;
1238 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1239 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1240
1241 xen_mc_issue(PARAVIRT_LAZY_MMU);
1242
1243 preempt_enable();
1244}
1245
1246static void xen_flush_tlb_single(unsigned long addr)
1247{
1248 struct mmuext_op *op;
1249 struct multicall_space mcs;
1250
1251 preempt_disable();
1252
1253 mcs = xen_mc_entry(sizeof(*op));
1254 op = mcs.args;
1255 op->cmd = MMUEXT_INVLPG_LOCAL;
1256 op->arg1.linear_addr = addr & PAGE_MASK;
1257 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1258
1259 xen_mc_issue(PARAVIRT_LAZY_MMU);
1260
1261 preempt_enable();
1262}
1263
1264static void xen_flush_tlb_others(const struct cpumask *cpus,
1265 struct mm_struct *mm, unsigned long va)
1266{
1267 struct {
1268 struct mmuext_op op;
1269 DECLARE_BITMAP(mask, NR_CPUS);
1270 } *args;
1271 struct multicall_space mcs;
1272
1273 BUG_ON(cpumask_empty(cpus));
1274 BUG_ON(!mm);
1275
1276 mcs = xen_mc_entry(sizeof(*args));
1277 args = mcs.args;
1278 args->op.arg2.vcpumask = to_cpumask(args->mask);
1279
1280 /* Remove us, and any offline CPUS. */
1281 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1282 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1283
1284 if (va == TLB_FLUSH_ALL) {
1285 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1286 } else {
1287 args->op.cmd = MMUEXT_INVLPG_MULTI;
1288 args->op.arg1.linear_addr = va;
1289 }
1290
1291 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1292
1293 xen_mc_issue(PARAVIRT_LAZY_MMU);
1294}
1295
1296static unsigned long xen_read_cr3(void)
1297{
1298 return percpu_read(xen_cr3);
1299}
1300
1301static void set_current_cr3(void *v)
1302{
1303 percpu_write(xen_current_cr3, (unsigned long)v);
1304}
1305
1306static void __xen_write_cr3(bool kernel, unsigned long cr3)
1307{
1308 struct mmuext_op *op;
1309 struct multicall_space mcs;
1310 unsigned long mfn;
1311
1312 if (cr3)
1313 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1314 else
1315 mfn = 0;
1316
1317 WARN_ON(mfn == 0 && kernel);
1318
1319 mcs = __xen_mc_entry(sizeof(*op));
1320
1321 op = mcs.args;
1322 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1323 op->arg1.mfn = mfn;
1324
1325 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1326
1327 if (kernel) {
1328 percpu_write(xen_cr3, cr3);
1329
1330 /* Update xen_current_cr3 once the batch has actually
1331 been submitted. */
1332 xen_mc_callback(set_current_cr3, (void *)cr3);
1333 }
1334}
1335
1336static void xen_write_cr3(unsigned long cr3)
1337{
1338 BUG_ON(preemptible());
1339
1340 xen_mc_batch(); /* disables interrupts */
1341
1342 /* Update while interrupts are disabled, so its atomic with
1343 respect to ipis */
1344 percpu_write(xen_cr3, cr3);
1345
1346 __xen_write_cr3(true, cr3);
1347
1348#ifdef CONFIG_X86_64
1349 {
1350 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1351 if (user_pgd)
1352 __xen_write_cr3(false, __pa(user_pgd));
1353 else
1354 __xen_write_cr3(false, 0);
1355 }
1356#endif
1357
1358 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1359}
1360
1361static int xen_pgd_alloc(struct mm_struct *mm)
1362{
1363 pgd_t *pgd = mm->pgd;
1364 int ret = 0;
1365
1366 BUG_ON(PagePinned(virt_to_page(pgd)));
1367
1368#ifdef CONFIG_X86_64
1369 {
1370 struct page *page = virt_to_page(pgd);
1371 pgd_t *user_pgd;
1372
1373 BUG_ON(page->private != 0);
1374
1375 ret = -ENOMEM;
1376
1377 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1378 page->private = (unsigned long)user_pgd;
1379
1380 if (user_pgd != NULL) {
1381 user_pgd[pgd_index(VSYSCALL_START)] =
1382 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1383 ret = 0;
1384 }
1385
1386 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1387 }
1388#endif
1389
1390 return ret;
1391}
1392
1393static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1394{
1395#ifdef CONFIG_X86_64
1396 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1397
1398 if (user_pgd)
1399 free_page((unsigned long)user_pgd);
1400#endif
1401}
1402
1403#ifdef CONFIG_HIGHPTE
1404static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
1405{
1406 pgprot_t prot = PAGE_KERNEL;
1407
1408 if (PagePinned(page))
1409 prot = PAGE_KERNEL_RO;
1410
1411 if (0 && PageHighMem(page))
1412 printk("mapping highpte %lx type %d prot %s\n",
1413 page_to_pfn(page), type,
1414 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
1415
1416 return kmap_atomic_prot(page, type, prot);
1417}
1418#endif
1419
1420#ifdef CONFIG_X86_32
1421static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1422{
1423 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1424 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1425 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1426 pte_val_ma(pte));
1427
1428 return pte;
1429}
1430
1431/* Init-time set_pte while constructing initial pagetables, which
1432 doesn't allow RO pagetable pages to be remapped RW */
1433static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1434{
1435 pte = mask_rw_pte(ptep, pte);
1436
1437 xen_set_pte(ptep, pte);
1438}
1439#endif
1440
1441/* Early in boot, while setting up the initial pagetable, assume
1442 everything is pinned. */
1443static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1444{
1445#ifdef CONFIG_FLATMEM
1446 BUG_ON(mem_map); /* should only be used early */
1447#endif
1448 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1449}
1450
1451/* Early release_pte assumes that all pts are pinned, since there's
1452 only init_mm and anything attached to that is pinned. */
1453static void xen_release_pte_init(unsigned long pfn)
1454{
1455 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1456}
1457
1458static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1459{
1460 struct mmuext_op op;
1461 op.cmd = cmd;
1462 op.arg1.mfn = pfn_to_mfn(pfn);
1463 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1464 BUG();
1465}
1466
1467/* This needs to make sure the new pte page is pinned iff its being
1468 attached to a pinned pagetable. */
1469static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1470{
1471 struct page *page = pfn_to_page(pfn);
1472
1473 if (PagePinned(virt_to_page(mm->pgd))) {
1474 SetPagePinned(page);
1475
1476 vm_unmap_aliases();
1477 if (!PageHighMem(page)) {
1478 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1479 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1480 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1481 } else {
1482 /* make sure there are no stray mappings of
1483 this page */
1484 kmap_flush_unused();
1485 }
1486 }
1487}
1488
1489static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1490{
1491 xen_alloc_ptpage(mm, pfn, PT_PTE);
1492}
1493
1494static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1495{
1496 xen_alloc_ptpage(mm, pfn, PT_PMD);
1497}
1498
1499/* This should never happen until we're OK to use struct page */
1500static void xen_release_ptpage(unsigned long pfn, unsigned level)
1501{
1502 struct page *page = pfn_to_page(pfn);
1503
1504 if (PagePinned(page)) {
1505 if (!PageHighMem(page)) {
1506 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1507 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1508 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1509 }
1510 ClearPagePinned(page);
1511 }
1512}
1513
1514static void xen_release_pte(unsigned long pfn)
1515{
1516 xen_release_ptpage(pfn, PT_PTE);
1517}
1518
1519static void xen_release_pmd(unsigned long pfn)
1520{
1521 xen_release_ptpage(pfn, PT_PMD);
1522}
1523
1524#if PAGETABLE_LEVELS == 4
1525static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1526{
1527 xen_alloc_ptpage(mm, pfn, PT_PUD);
1528}
1529
1530static void xen_release_pud(unsigned long pfn)
1531{
1532 xen_release_ptpage(pfn, PT_PUD);
1533}
1534#endif
1535
1536void __init xen_reserve_top(void)
1537{
1538#ifdef CONFIG_X86_32
1539 unsigned long top = HYPERVISOR_VIRT_START;
1540 struct xen_platform_parameters pp;
1541
1542 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1543 top = pp.virt_start;
1544
1545 reserve_top_address(-top);
1546#endif /* CONFIG_X86_32 */
1547}
1548
1549/*
1550 * Like __va(), but returns address in the kernel mapping (which is
1551 * all we have until the physical memory mapping has been set up.
1552 */
1553static void *__ka(phys_addr_t paddr)
1554{
1555#ifdef CONFIG_X86_64
1556 return (void *)(paddr + __START_KERNEL_map);
1557#else
1558 return __va(paddr);
1559#endif
1560}
1561
1562/* Convert a machine address to physical address */
1563static unsigned long m2p(phys_addr_t maddr)
1564{
1565 phys_addr_t paddr;
1566
1567 maddr &= PTE_PFN_MASK;
1568 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1569
1570 return paddr;
1571}
1572
1573/* Convert a machine address to kernel virtual */
1574static void *m2v(phys_addr_t maddr)
1575{
1576 return __ka(m2p(maddr));
1577}
1578
1579static void set_page_prot(void *addr, pgprot_t prot)
1580{
1581 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1582 pte_t pte = pfn_pte(pfn, prot);
1583
1584 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1585 BUG();
1586}
1587
1588static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1589{
1590 unsigned pmdidx, pteidx;
1591 unsigned ident_pte;
1592 unsigned long pfn;
1593
1594 ident_pte = 0;
1595 pfn = 0;
1596 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1597 pte_t *pte_page;
1598
1599 /* Reuse or allocate a page of ptes */
1600 if (pmd_present(pmd[pmdidx]))
1601 pte_page = m2v(pmd[pmdidx].pmd);
1602 else {
1603 /* Check for free pte pages */
1604 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1605 break;
1606
1607 pte_page = &level1_ident_pgt[ident_pte];
1608 ident_pte += PTRS_PER_PTE;
1609
1610 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1611 }
1612
1613 /* Install mappings */
1614 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1615 pte_t pte;
1616
1617 if (pfn > max_pfn_mapped)
1618 max_pfn_mapped = pfn;
1619
1620 if (!pte_none(pte_page[pteidx]))
1621 continue;
1622
1623 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1624 pte_page[pteidx] = pte;
1625 }
1626 }
1627
1628 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1629 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1630
1631 set_page_prot(pmd, PAGE_KERNEL_RO);
1632}
1633
1634#ifdef CONFIG_X86_64
1635static void convert_pfn_mfn(void *v)
1636{
1637 pte_t *pte = v;
1638 int i;
1639
1640 /* All levels are converted the same way, so just treat them
1641 as ptes. */
1642 for (i = 0; i < PTRS_PER_PTE; i++)
1643 pte[i] = xen_make_pte(pte[i].pte);
1644}
1645
1646/*
1647 * Set up the inital kernel pagetable.
1648 *
1649 * We can construct this by grafting the Xen provided pagetable into
1650 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1651 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1652 * means that only the kernel has a physical mapping to start with -
1653 * but that's enough to get __va working. We need to fill in the rest
1654 * of the physical mapping once some sort of allocator has been set
1655 * up.
1656 */
1657__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1658 unsigned long max_pfn)
1659{
1660 pud_t *l3;
1661 pmd_t *l2;
1662
1663 /* Zap identity mapping */
1664 init_level4_pgt[0] = __pgd(0);
1665
1666 /* Pre-constructed entries are in pfn, so convert to mfn */
1667 convert_pfn_mfn(init_level4_pgt);
1668 convert_pfn_mfn(level3_ident_pgt);
1669 convert_pfn_mfn(level3_kernel_pgt);
1670
1671 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1672 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1673
1674 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1675 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1676
1677 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1678 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1679 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1680
1681 /* Set up identity map */
1682 xen_map_identity_early(level2_ident_pgt, max_pfn);
1683
1684 /* Make pagetable pieces RO */
1685 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1686 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1687 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1688 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1689 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1690 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1691
1692 /* Pin down new L4 */
1693 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1694 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1695
1696 /* Unpin Xen-provided one */
1697 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1698
1699 /* Switch over */
1700 pgd = init_level4_pgt;
1701
1702 /*
1703 * At this stage there can be no user pgd, and no page
1704 * structure to attach it to, so make sure we just set kernel
1705 * pgd.
1706 */
1707 xen_mc_batch();
1708 __xen_write_cr3(true, __pa(pgd));
1709 xen_mc_issue(PARAVIRT_LAZY_CPU);
1710
1711 reserve_early(__pa(xen_start_info->pt_base),
1712 __pa(xen_start_info->pt_base +
1713 xen_start_info->nr_pt_frames * PAGE_SIZE),
1714 "XEN PAGETABLES");
1715
1716 return pgd;
1717}
1718#else /* !CONFIG_X86_64 */
1719static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1720
1721__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1722 unsigned long max_pfn)
1723{
1724 pmd_t *kernel_pmd;
1725
1726 init_pg_tables_start = __pa(pgd);
1727 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1728 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1729
1730 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1731 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1732
1733 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1734
1735 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1736 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1737 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1738
1739 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1740 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1741 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1742
1743 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1744
1745 xen_write_cr3(__pa(swapper_pg_dir));
1746
1747 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1748
1749 return swapper_pg_dir;
1750}
1751#endif /* CONFIG_X86_64 */
1752
1753static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1754{
1755 pte_t pte;
1756
1757 phys >>= PAGE_SHIFT;
1758
1759 switch (idx) {
1760 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1761#ifdef CONFIG_X86_F00F_BUG
1762 case FIX_F00F_IDT:
1763#endif
1764#ifdef CONFIG_X86_32
1765 case FIX_WP_TEST:
1766 case FIX_VDSO:
1767# ifdef CONFIG_HIGHMEM
1768 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1769# endif
1770#else
1771 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1772#endif
1773#ifdef CONFIG_X86_LOCAL_APIC
1774 case FIX_APIC_BASE: /* maps dummy local APIC */
1775#endif
1776 pte = pfn_pte(phys, prot);
1777 break;
1778
1779 default:
1780 pte = mfn_pte(phys, prot);
1781 break;
1782 }
1783
1784 __native_set_fixmap(idx, pte);
1785
1786#ifdef CONFIG_X86_64
1787 /* Replicate changes to map the vsyscall page into the user
1788 pagetable vsyscall mapping. */
1789 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1790 unsigned long vaddr = __fix_to_virt(idx);
1791 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1792 }
1793#endif
1794}
1795
1796__init void xen_post_allocator_init(void)
1797{
1798 pv_mmu_ops.set_pte = xen_set_pte;
1799 pv_mmu_ops.set_pmd = xen_set_pmd;
1800 pv_mmu_ops.set_pud = xen_set_pud;
1801#if PAGETABLE_LEVELS == 4
1802 pv_mmu_ops.set_pgd = xen_set_pgd;
1803#endif
1804
1805 /* This will work as long as patching hasn't happened yet
1806 (which it hasn't) */
1807 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1808 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1809 pv_mmu_ops.release_pte = xen_release_pte;
1810 pv_mmu_ops.release_pmd = xen_release_pmd;
1811#if PAGETABLE_LEVELS == 4
1812 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1813 pv_mmu_ops.release_pud = xen_release_pud;
1814#endif
1815
1816#ifdef CONFIG_X86_64
1817 SetPagePinned(virt_to_page(level3_user_vsyscall));
1818#endif
1819 xen_mark_init_mm_pinned();
1820}
1821
1822
1823const struct pv_mmu_ops xen_mmu_ops __initdata = {
1824 .pagetable_setup_start = xen_pagetable_setup_start,
1825 .pagetable_setup_done = xen_pagetable_setup_done,
1826
1827 .read_cr2 = xen_read_cr2,
1828 .write_cr2 = xen_write_cr2,
1829
1830 .read_cr3 = xen_read_cr3,
1831 .write_cr3 = xen_write_cr3,
1832
1833 .flush_tlb_user = xen_flush_tlb,
1834 .flush_tlb_kernel = xen_flush_tlb,
1835 .flush_tlb_single = xen_flush_tlb_single,
1836 .flush_tlb_others = xen_flush_tlb_others,
1837
1838 .pte_update = paravirt_nop,
1839 .pte_update_defer = paravirt_nop,
1840
1841 .pgd_alloc = xen_pgd_alloc,
1842 .pgd_free = xen_pgd_free,
1843
1844 .alloc_pte = xen_alloc_pte_init,
1845 .release_pte = xen_release_pte_init,
1846 .alloc_pmd = xen_alloc_pte_init,
1847 .alloc_pmd_clone = paravirt_nop,
1848 .release_pmd = xen_release_pte_init,
1849
1850#ifdef CONFIG_HIGHPTE
1851 .kmap_atomic_pte = xen_kmap_atomic_pte,
1852#endif
1853
1854#ifdef CONFIG_X86_64
1855 .set_pte = xen_set_pte,
1856#else
1857 .set_pte = xen_set_pte_init,
1858#endif
1859 .set_pte_at = xen_set_pte_at,
1860 .set_pmd = xen_set_pmd_hyper,
1861
1862 .ptep_modify_prot_start = __ptep_modify_prot_start,
1863 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1864
1865 .pte_val = PV_CALLEE_SAVE(xen_pte_val),
1866 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
1867
1868 .make_pte = PV_CALLEE_SAVE(xen_make_pte),
1869 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
1870
1871#ifdef CONFIG_X86_PAE
1872 .set_pte_atomic = xen_set_pte_atomic,
1873 .set_pte_present = xen_set_pte_at,
1874 .pte_clear = xen_pte_clear,
1875 .pmd_clear = xen_pmd_clear,
1876#endif /* CONFIG_X86_PAE */
1877 .set_pud = xen_set_pud_hyper,
1878
1879 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
1880 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
1881
1882#if PAGETABLE_LEVELS == 4
1883 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
1884 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
1885 .set_pgd = xen_set_pgd_hyper,
1886
1887 .alloc_pud = xen_alloc_pte_init,
1888 .release_pud = xen_release_pte_init,
1889#endif /* PAGETABLE_LEVELS == 4 */
1890
1891 .activate_mm = xen_activate_mm,
1892 .dup_mmap = xen_dup_mmap,
1893 .exit_mmap = xen_exit_mmap,
1894
1895 .lazy_mode = {
1896 .enter = paravirt_enter_lazy_mmu,
1897 .leave = xen_leave_lazy,
1898 },
1899
1900 .set_fixmap = xen_set_fixmap,
1901};
1902
1903
1159#ifdef CONFIG_XEN_DEBUG_FS 1904#ifdef CONFIG_XEN_DEBUG_FS
1160 1905
1161static struct dentry *d_mmu_debug; 1906static struct dentry *d_mmu_debug;
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 98d71659da5a..24d1b44a337d 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -54,4 +54,7 @@ pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t
54void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 54void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
55 pte_t *ptep, pte_t pte); 55 pte_t *ptep, pte_t pte);
56 56
57unsigned long xen_read_cr2_direct(void);
58
59extern const struct pv_mmu_ops xen_mmu_ops;
57#endif /* _XEN_MMU_H */ 60#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index c738644b5435..8bff7e7c290b 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -39,6 +39,7 @@ struct mc_buffer {
39 struct multicall_entry entries[MC_BATCH]; 39 struct multicall_entry entries[MC_BATCH];
40#if MC_DEBUG 40#if MC_DEBUG
41 struct multicall_entry debug[MC_BATCH]; 41 struct multicall_entry debug[MC_BATCH];
42 void *caller[MC_BATCH];
42#endif 43#endif
43 unsigned char args[MC_ARGS]; 44 unsigned char args[MC_ARGS];
44 struct callback { 45 struct callback {
@@ -154,11 +155,12 @@ void xen_mc_flush(void)
154 ret, smp_processor_id()); 155 ret, smp_processor_id());
155 dump_stack(); 156 dump_stack();
156 for (i = 0; i < b->mcidx; i++) { 157 for (i = 0; i < b->mcidx; i++) {
157 printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\n", 158 printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\t%pF\n",
158 i+1, b->mcidx, 159 i+1, b->mcidx,
159 b->debug[i].op, 160 b->debug[i].op,
160 b->debug[i].args[0], 161 b->debug[i].args[0],
161 b->entries[i].result); 162 b->entries[i].result,
163 b->caller[i]);
162 } 164 }
163 } 165 }
164#endif 166#endif
@@ -168,8 +170,6 @@ void xen_mc_flush(void)
168 } else 170 } else
169 BUG_ON(b->argidx != 0); 171 BUG_ON(b->argidx != 0);
170 172
171 local_irq_restore(flags);
172
173 for (i = 0; i < b->cbidx; i++) { 173 for (i = 0; i < b->cbidx; i++) {
174 struct callback *cb = &b->callbacks[i]; 174 struct callback *cb = &b->callbacks[i];
175 175
@@ -177,7 +177,9 @@ void xen_mc_flush(void)
177 } 177 }
178 b->cbidx = 0; 178 b->cbidx = 0;
179 179
180 BUG_ON(ret); 180 local_irq_restore(flags);
181
182 WARN_ON(ret);
181} 183}
182 184
183struct multicall_space __xen_mc_entry(size_t args) 185struct multicall_space __xen_mc_entry(size_t args)
@@ -197,6 +199,9 @@ struct multicall_space __xen_mc_entry(size_t args)
197 } 199 }
198 200
199 ret.mc = &b->entries[b->mcidx]; 201 ret.mc = &b->entries[b->mcidx];
202#ifdef MC_DEBUG
203 b->caller[b->mcidx] = __builtin_return_address(0);
204#endif
200 b->mcidx++; 205 b->mcidx++;
201 ret.args = &b->args[argidx]; 206 ret.args = &b->args[argidx];
202 b->argidx = argidx + args; 207 b->argidx = argidx + args;
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index fa3e10725d98..9e565da5d1f7 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -41,7 +41,7 @@ static inline void xen_mc_issue(unsigned mode)
41 xen_mc_flush(); 41 xen_mc_flush();
42 42
43 /* restore flags saved in xen_mc_batch */ 43 /* restore flags saved in xen_mc_batch */
44 local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); 44 local_irq_restore(percpu_read(xen_mc_irq_flags));
45} 45}
46 46
47/* Set up a callback to be called when the current batch is flushed */ 47/* Set up a callback to be called when the current batch is flushed */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index c44e2069c7c7..8d470562ffc9 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -50,11 +50,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
50 */ 50 */
51static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) 51static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
52{ 52{
53#ifdef CONFIG_X86_32 53 inc_irq_stat(irq_resched_count);
54 __get_cpu_var(irq_stat).irq_resched_count++;
55#else
56 add_pda(irq_resched_count, 1);
57#endif
58 54
59 return IRQ_HANDLED; 55 return IRQ_HANDLED;
60} 56}
@@ -78,7 +74,7 @@ static __cpuinit void cpu_bringup(void)
78 xen_setup_cpu_clockevents(); 74 xen_setup_cpu_clockevents();
79 75
80 cpu_set(cpu, cpu_online_map); 76 cpu_set(cpu, cpu_online_map);
81 x86_write_percpu(cpu_state, CPU_ONLINE); 77 percpu_write(cpu_state, CPU_ONLINE);
82 wmb(); 78 wmb();
83 79
84 /* We can take interrupts now: we're officially "up". */ 80 /* We can take interrupts now: we're officially "up". */
@@ -174,7 +170,7 @@ static void __init xen_smp_prepare_boot_cpu(void)
174 170
175 /* We've switched to the "real" per-cpu gdt, so make sure the 171 /* We've switched to the "real" per-cpu gdt, so make sure the
176 old memory can be recycled */ 172 old memory can be recycled */
177 make_lowmem_page_readwrite(&per_cpu_var(gdt_page)); 173 make_lowmem_page_readwrite(xen_initial_gdt);
178 174
179 xen_setup_vcpu_info_placement(); 175 xen_setup_vcpu_info_placement();
180} 176}
@@ -223,6 +219,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
223{ 219{
224 struct vcpu_guest_context *ctxt; 220 struct vcpu_guest_context *ctxt;
225 struct desc_struct *gdt; 221 struct desc_struct *gdt;
222 unsigned long gdt_mfn;
226 223
227 if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) 224 if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
228 return 0; 225 return 0;
@@ -239,6 +236,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
239 ctxt->user_regs.ss = __KERNEL_DS; 236 ctxt->user_regs.ss = __KERNEL_DS;
240#ifdef CONFIG_X86_32 237#ifdef CONFIG_X86_32
241 ctxt->user_regs.fs = __KERNEL_PERCPU; 238 ctxt->user_regs.fs = __KERNEL_PERCPU;
239#else
240 ctxt->gs_base_kernel = per_cpu_offset(cpu);
242#endif 241#endif
243 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; 242 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
244 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ 243 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
@@ -250,9 +249,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
250 ctxt->ldt_ents = 0; 249 ctxt->ldt_ents = 0;
251 250
252 BUG_ON((unsigned long)gdt & ~PAGE_MASK); 251 BUG_ON((unsigned long)gdt & ~PAGE_MASK);
252
253 gdt_mfn = arbitrary_virt_to_mfn(gdt);
253 make_lowmem_page_readonly(gdt); 254 make_lowmem_page_readonly(gdt);
255 make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
254 256
255 ctxt->gdt_frames[0] = virt_to_mfn(gdt); 257 ctxt->gdt_frames[0] = gdt_mfn;
256 ctxt->gdt_ents = GDT_ENTRIES; 258 ctxt->gdt_ents = GDT_ENTRIES;
257 259
258 ctxt->user_regs.cs = __KERNEL_CS; 260 ctxt->user_regs.cs = __KERNEL_CS;
@@ -283,23 +285,14 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
283 struct task_struct *idle = idle_task(cpu); 285 struct task_struct *idle = idle_task(cpu);
284 int rc; 286 int rc;
285 287
286#ifdef CONFIG_X86_64
287 /* Allocate node local memory for AP pdas */
288 WARN_ON(cpu == 0);
289 if (cpu > 0) {
290 rc = get_local_pda(cpu);
291 if (rc)
292 return rc;
293 }
294#endif
295
296#ifdef CONFIG_X86_32
297 init_gdt(cpu);
298 per_cpu(current_task, cpu) = idle; 288 per_cpu(current_task, cpu) = idle;
289#ifdef CONFIG_X86_32
299 irq_ctx_init(cpu); 290 irq_ctx_init(cpu);
300#else 291#else
301 cpu_pda(cpu)->pcurrent = idle;
302 clear_tsk_thread_flag(idle, TIF_FORK); 292 clear_tsk_thread_flag(idle, TIF_FORK);
293 per_cpu(kernel_stack, cpu) =
294 (unsigned long)task_stack_page(idle) -
295 KERNEL_STACK_OFFSET + THREAD_SIZE;
303#endif 296#endif
304 xen_setup_timer(cpu); 297 xen_setup_timer(cpu);
305 xen_init_lock_cpu(cpu); 298 xen_init_lock_cpu(cpu);
@@ -445,11 +438,7 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
445{ 438{
446 irq_enter(); 439 irq_enter();
447 generic_smp_call_function_interrupt(); 440 generic_smp_call_function_interrupt();
448#ifdef CONFIG_X86_32 441 inc_irq_stat(irq_call_count);
449 __get_cpu_var(irq_stat).irq_call_count++;
450#else
451 add_pda(irq_call_count, 1);
452#endif
453 irq_exit(); 442 irq_exit();
454 443
455 return IRQ_HANDLED; 444 return IRQ_HANDLED;
@@ -459,11 +448,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
459{ 448{
460 irq_enter(); 449 irq_enter();
461 generic_smp_call_function_single_interrupt(); 450 generic_smp_call_function_single_interrupt();
462#ifdef CONFIG_X86_32 451 inc_irq_stat(irq_call_count);
463 __get_cpu_var(irq_stat).irq_call_count++;
464#else
465 add_pda(irq_call_count, 1);
466#endif
467 irq_exit(); 452 irq_exit();
468 453
469 return IRQ_HANDLED; 454 return IRQ_HANDLED;
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 212ffe012b76..95be7b434724 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -6,6 +6,7 @@
6 6
7#include <asm/xen/hypercall.h> 7#include <asm/xen/hypercall.h>
8#include <asm/xen/page.h> 8#include <asm/xen/page.h>
9#include <asm/fixmap.h>
9 10
10#include "xen-ops.h" 11#include "xen-ops.h"
11#include "mmu.h" 12#include "mmu.h"
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
new file mode 100644
index 000000000000..79d7362ad6d1
--- /dev/null
+++ b/arch/x86/xen/xen-asm.S
@@ -0,0 +1,142 @@
1/*
2 * Asm versions of Xen pv-ops, suitable for either direct use or
3 * inlining. The inline versions are the same as the direct-use
4 * versions, with the pre- and post-amble chopped off.
5 *
6 * This code is encoded for size rather than absolute efficiency, with
7 * a view to being able to inline as much as possible.
8 *
9 * We only bother with direct forms (ie, vcpu in percpu data) of the
10 * operations here; the indirect forms are better handled in C, since
11 * they're generally too large to inline anyway.
12 */
13
14#include <asm/asm-offsets.h>
15#include <asm/percpu.h>
16#include <asm/processor-flags.h>
17
18#include "xen-asm.h"
19
20/*
21 * Enable events. This clears the event mask and tests the pending
22 * event status with one and operation. If there are pending events,
23 * then enter the hypervisor to get them handled.
24 */
25ENTRY(xen_irq_enable_direct)
26 /* Unmask events */
27 movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
28
29 /*
30 * Preempt here doesn't matter because that will deal with any
31 * pending interrupts. The pending check may end up being run
32 * on the wrong CPU, but that doesn't hurt.
33 */
34
35 /* Test for pending */
36 testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
37 jz 1f
38
392: call check_events
401:
41ENDPATCH(xen_irq_enable_direct)
42 ret
43 ENDPROC(xen_irq_enable_direct)
44 RELOC(xen_irq_enable_direct, 2b+1)
45
46
47/*
48 * Disabling events is simply a matter of making the event mask
49 * non-zero.
50 */
51ENTRY(xen_irq_disable_direct)
52 movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
53ENDPATCH(xen_irq_disable_direct)
54 ret
55 ENDPROC(xen_irq_disable_direct)
56 RELOC(xen_irq_disable_direct, 0)
57
58/*
59 * (xen_)save_fl is used to get the current interrupt enable status.
60 * Callers expect the status to be in X86_EFLAGS_IF, and other bits
61 * may be set in the return value. We take advantage of this by
62 * making sure that X86_EFLAGS_IF has the right value (and other bits
63 * in that byte are 0), but other bits in the return value are
64 * undefined. We need to toggle the state of the bit, because Xen and
65 * x86 use opposite senses (mask vs enable).
66 */
67ENTRY(xen_save_fl_direct)
68 testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
69 setz %ah
70 addb %ah, %ah
71ENDPATCH(xen_save_fl_direct)
72 ret
73 ENDPROC(xen_save_fl_direct)
74 RELOC(xen_save_fl_direct, 0)
75
76
77/*
78 * In principle the caller should be passing us a value return from
79 * xen_save_fl_direct, but for robustness sake we test only the
80 * X86_EFLAGS_IF flag rather than the whole byte. After setting the
81 * interrupt mask state, it checks for unmasked pending events and
82 * enters the hypervisor to get them delivered if so.
83 */
84ENTRY(xen_restore_fl_direct)
85#ifdef CONFIG_X86_64
86 testw $X86_EFLAGS_IF, %di
87#else
88 testb $X86_EFLAGS_IF>>8, %ah
89#endif
90 setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
91 /*
92 * Preempt here doesn't matter because that will deal with any
93 * pending interrupts. The pending check may end up being run
94 * on the wrong CPU, but that doesn't hurt.
95 */
96
97 /* check for unmasked and pending */
98 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
99 jz 1f
1002: call check_events
1011:
102ENDPATCH(xen_restore_fl_direct)
103 ret
104 ENDPROC(xen_restore_fl_direct)
105 RELOC(xen_restore_fl_direct, 2b+1)
106
107
108/*
109 * Force an event check by making a hypercall, but preserve regs
110 * before making the call.
111 */
112check_events:
113#ifdef CONFIG_X86_32
114 push %eax
115 push %ecx
116 push %edx
117 call xen_force_evtchn_callback
118 pop %edx
119 pop %ecx
120 pop %eax
121#else
122 push %rax
123 push %rcx
124 push %rdx
125 push %rsi
126 push %rdi
127 push %r8
128 push %r9
129 push %r10
130 push %r11
131 call xen_force_evtchn_callback
132 pop %r11
133 pop %r10
134 pop %r9
135 pop %r8
136 pop %rdi
137 pop %rsi
138 pop %rdx
139 pop %rcx
140 pop %rax
141#endif
142 ret
diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h
new file mode 100644
index 000000000000..465276467a47
--- /dev/null
+++ b/arch/x86/xen/xen-asm.h
@@ -0,0 +1,12 @@
1#ifndef _XEN_XEN_ASM_H
2#define _XEN_XEN_ASM_H
3
4#include <linux/linkage.h>
5
6#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
7#define ENDPATCH(x) .globl x##_end; x##_end=.
8
9/* Pseudo-flag used for virtual NMI, which we don't implement yet */
10#define XEN_EFLAGS_NMI 0x80000000
11
12#endif
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 42786f59d9c0..88e15deb8b82 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -1,117 +1,43 @@
1/* 1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining. 2 * Asm versions of Xen pv-ops, suitable for either direct use or
3 The inline versions are the same as the direct-use versions, with the 3 * inlining. The inline versions are the same as the direct-use
4 pre- and post-amble chopped off. 4 * versions, with the pre- and post-amble chopped off.
5 5 *
6 This code is encoded for size rather than absolute efficiency, 6 * This code is encoded for size rather than absolute efficiency, with
7 with a view to being able to inline as much as possible. 7 * a view to being able to inline as much as possible.
8 8 *
9 We only bother with direct forms (ie, vcpu in pda) of the operations 9 * We only bother with direct forms (ie, vcpu in pda) of the
10 here; the indirect forms are better handled in C, since they're 10 * operations here; the indirect forms are better handled in C, since
11 generally too large to inline anyway. 11 * they're generally too large to inline anyway.
12 */ 12 */
13 13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/thread_info.h> 14#include <asm/thread_info.h>
18#include <asm/percpu.h>
19#include <asm/processor-flags.h> 15#include <asm/processor-flags.h>
20#include <asm/segment.h> 16#include <asm/segment.h>
21 17
22#include <xen/interface/xen.h> 18#include <xen/interface/xen.h>
23 19
24#define RELOC(x, v) .globl x##_reloc; x##_reloc=v 20#include "xen-asm.h"
25#define ENDPATCH(x) .globl x##_end; x##_end=.
26
27/* Pseudo-flag used for virtual NMI, which we don't implement yet */
28#define XEN_EFLAGS_NMI 0x80000000
29
30/*
31 Enable events. This clears the event mask and tests the pending
32 event status with one and operation. If there are pending
33 events, then enter the hypervisor to get them handled.
34 */
35ENTRY(xen_irq_enable_direct)
36 /* Unmask events */
37 movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
38
39 /* Preempt here doesn't matter because that will deal with
40 any pending interrupts. The pending check may end up being
41 run on the wrong CPU, but that doesn't hurt. */
42
43 /* Test for pending */
44 testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
45 jz 1f
46
472: call check_events
481:
49ENDPATCH(xen_irq_enable_direct)
50 ret
51 ENDPROC(xen_irq_enable_direct)
52 RELOC(xen_irq_enable_direct, 2b+1)
53
54
55/*
56 Disabling events is simply a matter of making the event mask
57 non-zero.
58 */
59ENTRY(xen_irq_disable_direct)
60 movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
61ENDPATCH(xen_irq_disable_direct)
62 ret
63 ENDPROC(xen_irq_disable_direct)
64 RELOC(xen_irq_disable_direct, 0)
65 21
66/* 22/*
67 (xen_)save_fl is used to get the current interrupt enable status. 23 * Force an event check by making a hypercall, but preserve regs
68 Callers expect the status to be in X86_EFLAGS_IF, and other bits 24 * before making the call.
69 may be set in the return value. We take advantage of this by
70 making sure that X86_EFLAGS_IF has the right value (and other bits
71 in that byte are 0), but other bits in the return value are
72 undefined. We need to toggle the state of the bit, because
73 Xen and x86 use opposite senses (mask vs enable).
74 */ 25 */
75ENTRY(xen_save_fl_direct) 26check_events:
76 testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask 27 push %eax
77 setz %ah 28 push %ecx
78 addb %ah,%ah 29 push %edx
79ENDPATCH(xen_save_fl_direct) 30 call xen_force_evtchn_callback
80 ret 31 pop %edx
81 ENDPROC(xen_save_fl_direct) 32 pop %ecx
82 RELOC(xen_save_fl_direct, 0) 33 pop %eax
83
84
85/*
86 In principle the caller should be passing us a value return
87 from xen_save_fl_direct, but for robustness sake we test only
88 the X86_EFLAGS_IF flag rather than the whole byte. After
89 setting the interrupt mask state, it checks for unmasked
90 pending events and enters the hypervisor to get them delivered
91 if so.
92 */
93ENTRY(xen_restore_fl_direct)
94 testb $X86_EFLAGS_IF>>8, %ah
95 setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
96 /* Preempt here doesn't matter because that will deal with
97 any pending interrupts. The pending check may end up being
98 run on the wrong CPU, but that doesn't hurt. */
99
100 /* check for unmasked and pending */
101 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
102 jz 1f
1032: call check_events
1041:
105ENDPATCH(xen_restore_fl_direct)
106 ret 34 ret
107 ENDPROC(xen_restore_fl_direct)
108 RELOC(xen_restore_fl_direct, 2b+1)
109 35
110/* 36/*
111 We can't use sysexit directly, because we're not running in ring0. 37 * We can't use sysexit directly, because we're not running in ring0.
112 But we can easily fake it up using iret. Assuming xen_sysexit 38 * But we can easily fake it up using iret. Assuming xen_sysexit is
113 is jumped to with a standard stack frame, we can just strip it 39 * jumped to with a standard stack frame, we can just strip it back to
114 back to a standard iret frame and use iret. 40 * a standard iret frame and use iret.
115 */ 41 */
116ENTRY(xen_sysexit) 42ENTRY(xen_sysexit)
117 movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */ 43 movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
@@ -122,33 +48,31 @@ ENTRY(xen_sysexit)
122ENDPROC(xen_sysexit) 48ENDPROC(xen_sysexit)
123 49
124/* 50/*
125 This is run where a normal iret would be run, with the same stack setup: 51 * This is run where a normal iret would be run, with the same stack setup:
126 8: eflags 52 * 8: eflags
127 4: cs 53 * 4: cs
128 esp-> 0: eip 54 * esp-> 0: eip
129 55 *
130 This attempts to make sure that any pending events are dealt 56 * This attempts to make sure that any pending events are dealt with
131 with on return to usermode, but there is a small window in 57 * on return to usermode, but there is a small window in which an
132 which an event can happen just before entering usermode. If 58 * event can happen just before entering usermode. If the nested
133 the nested interrupt ends up setting one of the TIF_WORK_MASK 59 * interrupt ends up setting one of the TIF_WORK_MASK pending work
134 pending work flags, they will not be tested again before 60 * flags, they will not be tested again before returning to
135 returning to usermode. This means that a process can end up 61 * usermode. This means that a process can end up with pending work,
136 with pending work, which will be unprocessed until the process 62 * which will be unprocessed until the process enters and leaves the
137 enters and leaves the kernel again, which could be an 63 * kernel again, which could be an unbounded amount of time. This
138 unbounded amount of time. This means that a pending signal or 64 * means that a pending signal or reschedule event could be
139 reschedule event could be indefinitely delayed. 65 * indefinitely delayed.
140 66 *
141 The fix is to notice a nested interrupt in the critical 67 * The fix is to notice a nested interrupt in the critical window, and
142 window, and if one occurs, then fold the nested interrupt into 68 * if one occurs, then fold the nested interrupt into the current
143 the current interrupt stack frame, and re-process it 69 * interrupt stack frame, and re-process it iteratively rather than
144 iteratively rather than recursively. This means that it will 70 * recursively. This means that it will exit via the normal path, and
145 exit via the normal path, and all pending work will be dealt 71 * all pending work will be dealt with appropriately.
146 with appropriately. 72 *
147 73 * Because the nested interrupt handler needs to deal with the current
148 Because the nested interrupt handler needs to deal with the 74 * stack state in whatever form its in, we keep things simple by only
149 current stack state in whatever form its in, we keep things 75 * using a single register which is pushed/popped on the stack.
150 simple by only using a single register which is pushed/popped
151 on the stack.
152 */ 76 */
153ENTRY(xen_iret) 77ENTRY(xen_iret)
154 /* test eflags for special cases */ 78 /* test eflags for special cases */
@@ -158,13 +82,15 @@ ENTRY(xen_iret)
158 push %eax 82 push %eax
159 ESP_OFFSET=4 # bytes pushed onto stack 83 ESP_OFFSET=4 # bytes pushed onto stack
160 84
161 /* Store vcpu_info pointer for easy access. Do it this 85 /*
162 way to avoid having to reload %fs */ 86 * Store vcpu_info pointer for easy access. Do it this way to
87 * avoid having to reload %fs
88 */
163#ifdef CONFIG_SMP 89#ifdef CONFIG_SMP
164 GET_THREAD_INFO(%eax) 90 GET_THREAD_INFO(%eax)
165 movl TI_cpu(%eax),%eax 91 movl TI_cpu(%eax), %eax
166 movl __per_cpu_offset(,%eax,4),%eax 92 movl __per_cpu_offset(,%eax,4), %eax
167 mov per_cpu__xen_vcpu(%eax),%eax 93 mov per_cpu__xen_vcpu(%eax), %eax
168#else 94#else
169 movl per_cpu__xen_vcpu, %eax 95 movl per_cpu__xen_vcpu, %eax
170#endif 96#endif
@@ -172,37 +98,46 @@ ENTRY(xen_iret)
172 /* check IF state we're restoring */ 98 /* check IF state we're restoring */
173 testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) 99 testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
174 100
175 /* Maybe enable events. Once this happens we could get a 101 /*
176 recursive event, so the critical region starts immediately 102 * Maybe enable events. Once this happens we could get a
177 afterwards. However, if that happens we don't end up 103 * recursive event, so the critical region starts immediately
178 resuming the code, so we don't have to be worried about 104 * afterwards. However, if that happens we don't end up
179 being preempted to another CPU. */ 105 * resuming the code, so we don't have to be worried about
106 * being preempted to another CPU.
107 */
180 setz XEN_vcpu_info_mask(%eax) 108 setz XEN_vcpu_info_mask(%eax)
181xen_iret_start_crit: 109xen_iret_start_crit:
182 110
183 /* check for unmasked and pending */ 111 /* check for unmasked and pending */
184 cmpw $0x0001, XEN_vcpu_info_pending(%eax) 112 cmpw $0x0001, XEN_vcpu_info_pending(%eax)
185 113
186 /* If there's something pending, mask events again so we 114 /*
187 can jump back into xen_hypervisor_callback */ 115 * If there's something pending, mask events again so we can
116 * jump back into xen_hypervisor_callback
117 */
188 sete XEN_vcpu_info_mask(%eax) 118 sete XEN_vcpu_info_mask(%eax)
189 119
190 popl %eax 120 popl %eax
191 121
192 /* From this point on the registers are restored and the stack 122 /*
193 updated, so we don't need to worry about it if we're preempted */ 123 * From this point on the registers are restored and the stack
124 * updated, so we don't need to worry about it if we're
125 * preempted
126 */
194iret_restore_end: 127iret_restore_end:
195 128
196 /* Jump to hypervisor_callback after fixing up the stack. 129 /*
197 Events are masked, so jumping out of the critical 130 * Jump to hypervisor_callback after fixing up the stack.
198 region is OK. */ 131 * Events are masked, so jumping out of the critical region is
132 * OK.
133 */
199 je xen_hypervisor_callback 134 je xen_hypervisor_callback
200 135
2011: iret 1361: iret
202xen_iret_end_crit: 137xen_iret_end_crit:
203.section __ex_table,"a" 138.section __ex_table, "a"
204 .align 4 139 .align 4
205 .long 1b,iret_exc 140 .long 1b, iret_exc
206.previous 141.previous
207 142
208hyper_iret: 143hyper_iret:
@@ -212,55 +147,55 @@ hyper_iret:
212 .globl xen_iret_start_crit, xen_iret_end_crit 147 .globl xen_iret_start_crit, xen_iret_end_crit
213 148
214/* 149/*
215 This is called by xen_hypervisor_callback in entry.S when it sees 150 * This is called by xen_hypervisor_callback in entry.S when it sees
216 that the EIP at the time of interrupt was between xen_iret_start_crit 151 * that the EIP at the time of interrupt was between
217 and xen_iret_end_crit. We're passed the EIP in %eax so we can do 152 * xen_iret_start_crit and xen_iret_end_crit. We're passed the EIP in
218 a more refined determination of what to do. 153 * %eax so we can do a more refined determination of what to do.
219 154 *
220 The stack format at this point is: 155 * The stack format at this point is:
221 ---------------- 156 * ----------------
222 ss : (ss/esp may be present if we came from usermode) 157 * ss : (ss/esp may be present if we came from usermode)
223 esp : 158 * esp :
224 eflags } outer exception info 159 * eflags } outer exception info
225 cs } 160 * cs }
226 eip } 161 * eip }
227 ---------------- <- edi (copy dest) 162 * ---------------- <- edi (copy dest)
228 eax : outer eax if it hasn't been restored 163 * eax : outer eax if it hasn't been restored
229 ---------------- 164 * ----------------
230 eflags } nested exception info 165 * eflags } nested exception info
231 cs } (no ss/esp because we're nested 166 * cs } (no ss/esp because we're nested
232 eip } from the same ring) 167 * eip } from the same ring)
233 orig_eax }<- esi (copy src) 168 * orig_eax }<- esi (copy src)
234 - - - - - - - - 169 * - - - - - - - -
235 fs } 170 * fs }
236 es } 171 * es }
237 ds } SAVE_ALL state 172 * ds } SAVE_ALL state
238 eax } 173 * eax }
239 : : 174 * : :
240 ebx }<- esp 175 * ebx }<- esp
241 ---------------- 176 * ----------------
242 177 *
243 In order to deliver the nested exception properly, we need to shift 178 * In order to deliver the nested exception properly, we need to shift
244 everything from the return addr up to the error code so it 179 * everything from the return addr up to the error code so it sits
245 sits just under the outer exception info. This means that when we 180 * just under the outer exception info. This means that when we
246 handle the exception, we do it in the context of the outer exception 181 * handle the exception, we do it in the context of the outer
247 rather than starting a new one. 182 * exception rather than starting a new one.
248 183 *
249 The only caveat is that if the outer eax hasn't been 184 * The only caveat is that if the outer eax hasn't been restored yet
250 restored yet (ie, it's still on stack), we need to insert 185 * (ie, it's still on stack), we need to insert its value into the
251 its value into the SAVE_ALL state before going on, since 186 * SAVE_ALL state before going on, since it's usermode state which we
252 it's usermode state which we eventually need to restore. 187 * eventually need to restore.
253 */ 188 */
254ENTRY(xen_iret_crit_fixup) 189ENTRY(xen_iret_crit_fixup)
255 /* 190 /*
256 Paranoia: Make sure we're really coming from kernel space. 191 * Paranoia: Make sure we're really coming from kernel space.
257 One could imagine a case where userspace jumps into the 192 * One could imagine a case where userspace jumps into the
258 critical range address, but just before the CPU delivers a GP, 193 * critical range address, but just before the CPU delivers a
259 it decides to deliver an interrupt instead. Unlikely? 194 * GP, it decides to deliver an interrupt instead. Unlikely?
260 Definitely. Easy to avoid? Yes. The Intel documents 195 * Definitely. Easy to avoid? Yes. The Intel documents
261 explicitly say that the reported EIP for a bad jump is the 196 * explicitly say that the reported EIP for a bad jump is the
262 jump instruction itself, not the destination, but some virtual 197 * jump instruction itself, not the destination, but some
263 environments get this wrong. 198 * virtual environments get this wrong.
264 */ 199 */
265 movl PT_CS(%esp), %ecx 200 movl PT_CS(%esp), %ecx
266 andl $SEGMENT_RPL_MASK, %ecx 201 andl $SEGMENT_RPL_MASK, %ecx
@@ -270,15 +205,17 @@ ENTRY(xen_iret_crit_fixup)
270 lea PT_ORIG_EAX(%esp), %esi 205 lea PT_ORIG_EAX(%esp), %esi
271 lea PT_EFLAGS(%esp), %edi 206 lea PT_EFLAGS(%esp), %edi
272 207
273 /* If eip is before iret_restore_end then stack 208 /*
274 hasn't been restored yet. */ 209 * If eip is before iret_restore_end then stack
210 * hasn't been restored yet.
211 */
275 cmp $iret_restore_end, %eax 212 cmp $iret_restore_end, %eax
276 jae 1f 213 jae 1f
277 214
278 movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */ 215 movl 0+4(%edi), %eax /* copy EAX (just above top of frame) */
279 movl %eax, PT_EAX(%esp) 216 movl %eax, PT_EAX(%esp)
280 217
281 lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ 218 lea ESP_OFFSET(%edi), %edi /* move dest up over saved regs */
282 219
283 /* set up the copy */ 220 /* set up the copy */
2841: std 2211: std
@@ -286,20 +223,6 @@ ENTRY(xen_iret_crit_fixup)
286 rep movsl 223 rep movsl
287 cld 224 cld
288 225
289 lea 4(%edi),%esp /* point esp to new frame */ 226 lea 4(%edi), %esp /* point esp to new frame */
2902: jmp xen_do_upcall 2272: jmp xen_do_upcall
291 228
292
293/*
294 Force an event check by making a hypercall,
295 but preserve regs before making the call.
296 */
297check_events:
298 push %eax
299 push %ecx
300 push %edx
301 call xen_force_evtchn_callback
302 pop %edx
303 pop %ecx
304 pop %eax
305 ret
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 05794c566e87..02f496a8dbaa 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -1,174 +1,45 @@
1/* 1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining. 2 * Asm versions of Xen pv-ops, suitable for either direct use or
3 The inline versions are the same as the direct-use versions, with the 3 * inlining. The inline versions are the same as the direct-use
4 pre- and post-amble chopped off. 4 * versions, with the pre- and post-amble chopped off.
5 5 *
6 This code is encoded for size rather than absolute efficiency, 6 * This code is encoded for size rather than absolute efficiency, with
7 with a view to being able to inline as much as possible. 7 * a view to being able to inline as much as possible.
8 8 *
9 We only bother with direct forms (ie, vcpu in pda) of the operations 9 * We only bother with direct forms (ie, vcpu in pda) of the
10 here; the indirect forms are better handled in C, since they're 10 * operations here; the indirect forms are better handled in C, since
11 generally too large to inline anyway. 11 * they're generally too large to inline anyway.
12 */ 12 */
13 13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/processor-flags.h>
18#include <asm/errno.h> 14#include <asm/errno.h>
15#include <asm/percpu.h>
16#include <asm/processor-flags.h>
19#include <asm/segment.h> 17#include <asm/segment.h>
20 18
21#include <xen/interface/xen.h> 19#include <xen/interface/xen.h>
22 20
23#define RELOC(x, v) .globl x##_reloc; x##_reloc=v 21#include "xen-asm.h"
24#define ENDPATCH(x) .globl x##_end; x##_end=.
25
26/* Pseudo-flag used for virtual NMI, which we don't implement yet */
27#define XEN_EFLAGS_NMI 0x80000000
28
29#if 1
30/*
31 x86-64 does not yet support direct access to percpu variables
32 via a segment override, so we just need to make sure this code
33 never gets used
34 */
35#define BUG ud2a
36#define PER_CPU_VAR(var, off) 0xdeadbeef
37#endif
38
39/*
40 Enable events. This clears the event mask and tests the pending
41 event status with one and operation. If there are pending
42 events, then enter the hypervisor to get them handled.
43 */
44ENTRY(xen_irq_enable_direct)
45 BUG
46
47 /* Unmask events */
48 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
49
50 /* Preempt here doesn't matter because that will deal with
51 any pending interrupts. The pending check may end up being
52 run on the wrong CPU, but that doesn't hurt. */
53
54 /* Test for pending */
55 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
56 jz 1f
57
582: call check_events
591:
60ENDPATCH(xen_irq_enable_direct)
61 ret
62 ENDPROC(xen_irq_enable_direct)
63 RELOC(xen_irq_enable_direct, 2b+1)
64
65/*
66 Disabling events is simply a matter of making the event mask
67 non-zero.
68 */
69ENTRY(xen_irq_disable_direct)
70 BUG
71
72 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
73ENDPATCH(xen_irq_disable_direct)
74 ret
75 ENDPROC(xen_irq_disable_direct)
76 RELOC(xen_irq_disable_direct, 0)
77
78/*
79 (xen_)save_fl is used to get the current interrupt enable status.
80 Callers expect the status to be in X86_EFLAGS_IF, and other bits
81 may be set in the return value. We take advantage of this by
82 making sure that X86_EFLAGS_IF has the right value (and other bits
83 in that byte are 0), but other bits in the return value are
84 undefined. We need to toggle the state of the bit, because
85 Xen and x86 use opposite senses (mask vs enable).
86 */
87ENTRY(xen_save_fl_direct)
88 BUG
89
90 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
91 setz %ah
92 addb %ah,%ah
93ENDPATCH(xen_save_fl_direct)
94 ret
95 ENDPROC(xen_save_fl_direct)
96 RELOC(xen_save_fl_direct, 0)
97
98/*
99 In principle the caller should be passing us a value return
100 from xen_save_fl_direct, but for robustness sake we test only
101 the X86_EFLAGS_IF flag rather than the whole byte. After
102 setting the interrupt mask state, it checks for unmasked
103 pending events and enters the hypervisor to get them delivered
104 if so.
105 */
106ENTRY(xen_restore_fl_direct)
107 BUG
108
109 testb $X86_EFLAGS_IF>>8, %ah
110 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
111 /* Preempt here doesn't matter because that will deal with
112 any pending interrupts. The pending check may end up being
113 run on the wrong CPU, but that doesn't hurt. */
114
115 /* check for unmasked and pending */
116 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
117 jz 1f
1182: call check_events
1191:
120ENDPATCH(xen_restore_fl_direct)
121 ret
122 ENDPROC(xen_restore_fl_direct)
123 RELOC(xen_restore_fl_direct, 2b+1)
124
125
126/*
127 Force an event check by making a hypercall,
128 but preserve regs before making the call.
129 */
130check_events:
131 push %rax
132 push %rcx
133 push %rdx
134 push %rsi
135 push %rdi
136 push %r8
137 push %r9
138 push %r10
139 push %r11
140 call xen_force_evtchn_callback
141 pop %r11
142 pop %r10
143 pop %r9
144 pop %r8
145 pop %rdi
146 pop %rsi
147 pop %rdx
148 pop %rcx
149 pop %rax
150 ret
151 22
152ENTRY(xen_adjust_exception_frame) 23ENTRY(xen_adjust_exception_frame)
153 mov 8+0(%rsp),%rcx 24 mov 8+0(%rsp), %rcx
154 mov 8+8(%rsp),%r11 25 mov 8+8(%rsp), %r11
155 ret $16 26 ret $16
156 27
157hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 28hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
158/* 29/*
159 Xen64 iret frame: 30 * Xen64 iret frame:
160 31 *
161 ss 32 * ss
162 rsp 33 * rsp
163 rflags 34 * rflags
164 cs 35 * cs
165 rip <-- standard iret frame 36 * rip <-- standard iret frame
166 37 *
167 flags 38 * flags
168 39 *
169 rcx } 40 * rcx }
170 r11 }<-- pushed by hypercall page 41 * r11 }<-- pushed by hypercall page
171rsp -> rax } 42 * rsp->rax }
172 */ 43 */
173ENTRY(xen_iret) 44ENTRY(xen_iret)
174 pushq $0 45 pushq $0
@@ -177,8 +48,8 @@ ENDPATCH(xen_iret)
177RELOC(xen_iret, 1b+1) 48RELOC(xen_iret, 1b+1)
178 49
179/* 50/*
180 sysexit is not used for 64-bit processes, so it's 51 * sysexit is not used for 64-bit processes, so it's only ever used to
181 only ever used to return to 32-bit compat userspace. 52 * return to 32-bit compat userspace.
182 */ 53 */
183ENTRY(xen_sysexit) 54ENTRY(xen_sysexit)
184 pushq $__USER32_DS 55 pushq $__USER32_DS
@@ -193,13 +64,15 @@ ENDPATCH(xen_sysexit)
193RELOC(xen_sysexit, 1b+1) 64RELOC(xen_sysexit, 1b+1)
194 65
195ENTRY(xen_sysret64) 66ENTRY(xen_sysret64)
196 /* We're already on the usermode stack at this point, but still 67 /*
197 with the kernel gs, so we can easily switch back */ 68 * We're already on the usermode stack at this point, but
198 movq %rsp, %gs:pda_oldrsp 69 * still with the kernel gs, so we can easily switch back
199 movq %gs:pda_kernelstack,%rsp 70 */
71 movq %rsp, PER_CPU_VAR(old_rsp)
72 movq PER_CPU_VAR(kernel_stack), %rsp
200 73
201 pushq $__USER_DS 74 pushq $__USER_DS
202 pushq %gs:pda_oldrsp 75 pushq PER_CPU_VAR(old_rsp)
203 pushq %r11 76 pushq %r11
204 pushq $__USER_CS 77 pushq $__USER_CS
205 pushq %rcx 78 pushq %rcx
@@ -210,13 +83,15 @@ ENDPATCH(xen_sysret64)
210RELOC(xen_sysret64, 1b+1) 83RELOC(xen_sysret64, 1b+1)
211 84
212ENTRY(xen_sysret32) 85ENTRY(xen_sysret32)
213 /* We're already on the usermode stack at this point, but still 86 /*
214 with the kernel gs, so we can easily switch back */ 87 * We're already on the usermode stack at this point, but
215 movq %rsp, %gs:pda_oldrsp 88 * still with the kernel gs, so we can easily switch back
216 movq %gs:pda_kernelstack, %rsp 89 */
90 movq %rsp, PER_CPU_VAR(old_rsp)
91 movq PER_CPU_VAR(kernel_stack), %rsp
217 92
218 pushq $__USER32_DS 93 pushq $__USER32_DS
219 pushq %gs:pda_oldrsp 94 pushq PER_CPU_VAR(old_rsp)
220 pushq %r11 95 pushq %r11
221 pushq $__USER32_CS 96 pushq $__USER32_CS
222 pushq %rcx 97 pushq %rcx
@@ -227,28 +102,27 @@ ENDPATCH(xen_sysret32)
227RELOC(xen_sysret32, 1b+1) 102RELOC(xen_sysret32, 1b+1)
228 103
229/* 104/*
230 Xen handles syscall callbacks much like ordinary exceptions, 105 * Xen handles syscall callbacks much like ordinary exceptions, which
231 which means we have: 106 * means we have:
232 - kernel gs 107 * - kernel gs
233 - kernel rsp 108 * - kernel rsp
234 - an iret-like stack frame on the stack (including rcx and r11): 109 * - an iret-like stack frame on the stack (including rcx and r11):
235 ss 110 * ss
236 rsp 111 * rsp
237 rflags 112 * rflags
238 cs 113 * cs
239 rip 114 * rip
240 r11 115 * r11
241 rsp-> rcx 116 * rsp->rcx
242 117 *
243 In all the entrypoints, we undo all that to make it look 118 * In all the entrypoints, we undo all that to make it look like a
244 like a CPU-generated syscall/sysenter and jump to the normal 119 * CPU-generated syscall/sysenter and jump to the normal entrypoint.
245 entrypoint.
246 */ 120 */
247 121
248.macro undo_xen_syscall 122.macro undo_xen_syscall
249 mov 0*8(%rsp),%rcx 123 mov 0*8(%rsp), %rcx
250 mov 1*8(%rsp),%r11 124 mov 1*8(%rsp), %r11
251 mov 5*8(%rsp),%rsp 125 mov 5*8(%rsp), %rsp
252.endm 126.endm
253 127
254/* Normal 64-bit system call target */ 128/* Normal 64-bit system call target */
@@ -275,7 +149,7 @@ ENDPROC(xen_sysenter_target)
275 149
276ENTRY(xen_syscall32_target) 150ENTRY(xen_syscall32_target)
277ENTRY(xen_sysenter_target) 151ENTRY(xen_sysenter_target)
278 lea 16(%rsp), %rsp /* strip %rcx,%r11 */ 152 lea 16(%rsp), %rsp /* strip %rcx, %r11 */
279 mov $-ENOSYS, %rax 153 mov $-ENOSYS, %rax
280 pushq $VGCF_in_syscall 154 pushq $VGCF_in_syscall
281 jmp hypercall_iret 155 jmp hypercall_iret
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 63d49a523ed3..1a5ff24e29c0 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -8,7 +8,7 @@
8 8
9#include <asm/boot.h> 9#include <asm/boot.h>
10#include <asm/asm.h> 10#include <asm/asm.h>
11#include <asm/page.h> 11#include <asm/page_types.h>
12 12
13#include <xen/interface/elfnote.h> 13#include <xen/interface/elfnote.h>
14#include <asm/xen/interface.h> 14#include <asm/xen/interface.h>
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index c1f8faf0a2c5..2f5ef2632ea2 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -10,9 +10,12 @@
10extern const char xen_hypervisor_callback[]; 10extern const char xen_hypervisor_callback[];
11extern const char xen_failsafe_callback[]; 11extern const char xen_failsafe_callback[];
12 12
13extern void *xen_initial_gdt;
14
13struct trap_info; 15struct trap_info;
14void xen_copy_trap_info(struct trap_info *traps); 16void xen_copy_trap_info(struct trap_info *traps);
15 17
18DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
16DECLARE_PER_CPU(unsigned long, xen_cr3); 19DECLARE_PER_CPU(unsigned long, xen_cr3);
17DECLARE_PER_CPU(unsigned long, xen_current_cr3); 20DECLARE_PER_CPU(unsigned long, xen_current_cr3);
18 21
@@ -22,6 +25,13 @@ extern struct shared_info *HYPERVISOR_shared_info;
22 25
23void xen_setup_mfn_list_list(void); 26void xen_setup_mfn_list_list(void);
24void xen_setup_shared_info(void); 27void xen_setup_shared_info(void);
28void xen_setup_machphys_mapping(void);
29pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
30void xen_ident_map_ISA(void);
31void xen_reserve_top(void);
32
33void xen_leave_lazy(void);
34void xen_post_allocator_init(void);
25 35
26char * __init xen_memory_setup(void); 36char * __init xen_memory_setup(void);
27void __init xen_arch_setup(void); 37void __init xen_arch_setup(void);