path: root/arch/x86/xen/enlighten.c
Diffstat (limited to 'arch/x86/xen/enlighten.c')
-rw-r--r--	arch/x86/xen/enlighten.c	835
1 file changed, 678 insertions(+), 157 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f09c1c69c37..194bbd6e324 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -33,6 +33,7 @@
 #include <xen/interface/sched.h>
 #include <xen/features.h>
 #include <xen/page.h>
+#include <xen/hvc-console.h>
 
 #include <asm/paravirt.h>
 #include <asm/page.h>
@@ -40,6 +41,7 @@
 #include <asm/xen/hypervisor.h>
 #include <asm/fixmap.h>
 #include <asm/processor.h>
+#include <asm/msr-index.h>
 #include <asm/setup.h>
 #include <asm/desc.h>
 #include <asm/pgtable.h>
@@ -56,6 +58,18 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
 DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
 
 /*
+ * Identity map, in addition to plain kernel map.  This needs to be
+ * large enough to allocate page table pages to allocate the rest.
+ * Each page can map 2MB.
+ */
+static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
+
+#ifdef CONFIG_X86_64
+/* l3 pud for userspace vsyscall mapping */
+static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
+#endif /* CONFIG_X86_64 */
+
+/*
  * Note about cr3 (pagetable base) values:
  *
  * xen_cr3 contains the current logical cr3 value; it contains the
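The sizing in the new identity-map comment can be checked directly: level1_ident_pgt holds PTRS_PER_PTE * 4 entries, i.e. four PTE pages, and each PTE page maps PTRS_PER_PTE * 4 KiB = 2 MiB. A stand-alone sketch of that arithmetic (the constants are assumed here, not taken from kernel headers):

/* Hypothetical user-space check of the comment's sizing claim. */
#include <stdio.h>

#define PTRS_PER_PTE 512	/* entries per PTE page on 64-bit x86 */
#define PAGE_SIZE    4096UL	/* bytes mapped per PTE */

int main(void)
{
	unsigned long entries = PTRS_PER_PTE * 4;	/* level1_ident_pgt */
	unsigned long per_page = PTRS_PER_PTE * PAGE_SIZE;

	printf("one PTE page maps %lu MiB\n", per_page >> 20);		/* 2 */
	printf("level1_ident_pgt maps %lu MiB\n",
	       (entries * PAGE_SIZE) >> 20);				/* 8 */
	return 0;
}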
@@ -75,13 +89,13 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
 struct start_info *xen_start_info;
 EXPORT_SYMBOL_GPL(xen_start_info);
 
-static /* __initdata */ struct shared_info dummy_shared_info;
+struct shared_info xen_dummy_shared_info;
 
 /*
  * Point at some empty memory to start with.  We map the real shared_info
  * page as soon as fixmap is up and running.
  */
-struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
+struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
 
 /*
  * Flag to determine whether vcpu info placement is available on all
@@ -98,13 +112,13 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
  */
 static int have_vcpu_info_placement = 1;
 
-static void __init xen_vcpu_setup(int cpu)
+static void xen_vcpu_setup(int cpu)
 {
 	struct vcpu_register_vcpu_info info;
 	int err;
 	struct vcpu_info *vcpup;
 
-	BUG_ON(HYPERVISOR_shared_info == &dummy_shared_info);
+	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
 	per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
 
 	if (!have_vcpu_info_placement)
@@ -136,11 +150,45 @@ static void __init xen_vcpu_setup(int cpu)
 	}
 }
 
+/*
+ * On restore, set the vcpu placement up again.
+ * If it fails, then we're in a bad state, since
+ * we can't back out from using it...
+ */
+void xen_vcpu_restore(void)
+{
+	if (have_vcpu_info_placement) {
+		int cpu;
+
+		for_each_online_cpu(cpu) {
+			bool other_cpu = (cpu != smp_processor_id());
+
+			if (other_cpu &&
+			    HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
+				BUG();
+
+			xen_vcpu_setup(cpu);
+
+			if (other_cpu &&
+			    HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
+				BUG();
+		}
+
+		BUG_ON(!have_vcpu_info_placement);
+	}
+}
+
 static void __init xen_banner(void)
 {
+	unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
+	struct xen_extraversion extra;
+	HYPERVISOR_xen_version(XENVER_extraversion, &extra);
+
 	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
 	       pv_info.name);
-	printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
+	printk(KERN_INFO "Xen version: %d.%d%s%s\n",
+	       version >> 16, version & 0xffff, extra.extraversion,
+	       xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
 }
 
 static void xen_cpuid(unsigned int *ax, unsigned int *bx,
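XENVER_version returns the major and minor numbers packed into one word, which is exactly what the new banner code unpacks with the shift and mask above. A minimal sketch of that decoding, with a made-up version value in place of a real hypercall:

/* Toy decoder for the packed version word; 3.2 is an invented example. */
#include <stdio.h>

int main(void)
{
	unsigned version = (3 << 16) | 2;	/* major 3, minor 2 */

	printf("Xen version: %d.%d\n", version >> 16, version & 0xffff);
	return 0;
}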
@@ -235,13 +283,13 @@ static void xen_irq_enable(void)
 {
 	struct vcpu_info *vcpu;
 
-	/* There's a one instruction preempt window here.  We need to
-	   make sure we're don't switch CPUs between getting the vcpu
-	   pointer and updating the mask. */
-	preempt_disable();
+	/* We don't need to worry about being preempted here, since
+	   either a) interrupts are disabled, so no preemption, or b)
+	   the caller is confused and is trying to re-enable interrupts
+	   on an indeterminate processor. */
+
 	vcpu = x86_read_percpu(xen_vcpu);
 	vcpu->evtchn_upcall_mask = 0;
-	preempt_enable_no_resched();
 
 	/* Doesn't matter if we get preempted here, because any
 	   pending event will get dealt with anyway. */
@@ -254,7 +302,7 @@ static void xen_irq_enable(void)
 static void xen_safe_halt(void)
 {
 	/* Blocking includes an implicit local_irq_enable(). */
-	if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
+	if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
 		BUG();
 }
 
@@ -332,14 +380,6 @@ static void load_TLS_descriptor(struct thread_struct *t,
 
 static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
 {
-	xen_mc_batch();
-
-	load_TLS_descriptor(t, cpu, 0);
-	load_TLS_descriptor(t, cpu, 1);
-	load_TLS_descriptor(t, cpu, 2);
-
-	xen_mc_issue(PARAVIRT_LAZY_CPU);
-
 	/*
 	 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
 	 * it means we're in a context switch, and %gs has just been
@@ -348,10 +388,39 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
 	 * Either way, it has been saved, and the new value will get
 	 * loaded properly.  This will go away as soon as Xen has been
 	 * modified to not save/restore %gs for normal hypercalls.
+	 *
+	 * On x86_64, this hack is not used for %gs, because gs points
+	 * to KERNEL_GS_BASE (and uses it for PDA references), so we
+	 * must not zero %gs on x86_64
+	 *
+	 * For x86_64, we need to zero %fs, otherwise we may get an
+	 * exception between the new %fs descriptor being loaded and
+	 * %fs being effectively cleared at __switch_to().
 	 */
-	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
+#ifdef CONFIG_X86_32
 		loadsegment(gs, 0);
+#else
+		loadsegment(fs, 0);
+#endif
+	}
+
+	xen_mc_batch();
+
+	load_TLS_descriptor(t, cpu, 0);
+	load_TLS_descriptor(t, cpu, 1);
+	load_TLS_descriptor(t, cpu, 2);
+
+	xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+#ifdef CONFIG_X86_64
+static void xen_load_gs_index(unsigned int idx)
+{
+	if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
+		BUG();
 }
+#endif
 
 static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
 				const void *ptr)
@@ -369,23 +438,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
 	preempt_enable();
 }
 
-static int cvt_gate_to_trap(int vector, u32 low, u32 high,
+static int cvt_gate_to_trap(int vector, const gate_desc *val,
 			    struct trap_info *info)
 {
-	u8 type, dpl;
-
-	type = (high >> 8) & 0x1f;
-	dpl = (high >> 13) & 3;
-
-	if (type != 0xf && type != 0xe)
+	if (val->type != 0xf && val->type != 0xe)
 		return 0;
 
 	info->vector = vector;
-	info->address = (high & 0xffff0000) | (low & 0x0000ffff);
-	info->cs = low >> 16;
-	info->flags = dpl;
+	info->address = gate_offset(*val);
+	info->cs = gate_segment(*val);
+	info->flags = val->dpl;
 	/* interrupt gates clear IF */
-	if (type == 0xe)
+	if (val->type == 0xe)
 		info->flags |= 4;
 
 	return 1;
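gate_offset() and gate_segment() hide the same bit-slicing that the removed open-coded path performed: the handler offset is split across the two descriptor words, the code selector sits in the low word, and the type and DPL live in bits 8-12 and 13-14 of the high word. A stand-alone decoder for a 32-bit IDT gate, using an invented descriptor value rather than a real IDT entry:

/* Toy 32-bit gate decoder mirroring the removed open-coded path. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t low  = 0x00601234;	/* selector 0x0060, offset low 0x1234 */
	uint32_t high = 0x5678ee00;	/* offset high 0x5678, P=1 DPL=3 type=0xe */

	uint8_t  type = (high >> 8) & 0x1f;	/* 0xe: interrupt gate */
	uint8_t  dpl  = (high >> 13) & 3;	/* 3 */
	uint32_t off  = (high & 0xffff0000) | (low & 0x0000ffff);
	uint16_t cs   = low >> 16;

	printf("type=%#x dpl=%u cs=%#x offset=%#x\n", type, dpl, cs, off);
	return 0;
}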
@@ -412,11 +476,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
 
 	if (p >= start && (p + 8) <= end) {
 		struct trap_info info[2];
-		u32 *desc = (u32 *)g;
 
 		info[1].address = 0;
 
-		if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0]))
+		if (cvt_gate_to_trap(entrynum, g, &info[0]))
 			if (HYPERVISOR_set_trap_table(info))
 				BUG();
 	}
@@ -429,13 +492,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
 {
 	unsigned in, out, count;
 
-	count = (desc->size+1) / 8;
+	count = (desc->size+1) / sizeof(gate_desc);
 	BUG_ON(count > 256);
 
 	for (in = out = 0; in < count; in++) {
-		const u32 *entry = (u32 *)(desc->address + in * 8);
+		gate_desc *entry = (gate_desc*)(desc->address) + in;
 
-		if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
+		if (cvt_gate_to_trap(in, entry, &traps[out]))
 			out++;
 	}
 	traps[out].address = 0;
@@ -607,6 +670,30 @@ static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
 	xen_mc_issue(PARAVIRT_LAZY_MMU);
 }
 
+static void xen_clts(void)
+{
+	struct multicall_space mcs;
+
+	mcs = xen_mc_entry(0);
+
+	MULTI_fpu_taskswitch(mcs.mc, 0);
+
+	xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void xen_write_cr0(unsigned long cr0)
+{
+	struct multicall_space mcs;
+
+	/* Only pay attention to cr0.TS; everything else is
+	   ignored. */
+	mcs = xen_mc_entry(0);
+
+	MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);
+
+	xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
 static void xen_write_cr2(unsigned long cr2)
 {
 	x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
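xen_clts() and xen_write_cr0() both lean on the multicall batching machinery: the fpu_taskswitch operation is queued via xen_mc_entry() and only flushed immediately if no lazy-CPU batch is open. A toy user-space model of that queue-then-flush pattern (all names here are invented for the sketch; the real API lives in arch/x86/xen/multicalls.h):

/* Minimal sketch of deferred-batch dispatch, assuming a tiny fixed queue. */
#include <stdio.h>

#define MC_MAX 8

struct mc_entry { const char *what; unsigned long arg; };

static struct mc_entry mc_queue[MC_MAX];
static int mc_count;
static int lazy_cpu_mode;	/* analogue of PARAVIRT_LAZY_CPU */

static void mc_entry(const char *what, unsigned long arg)
{
	mc_queue[mc_count].what = what;
	mc_queue[mc_count].arg = arg;
	mc_count++;
}

static void mc_flush(void)
{
	/* One "hypercall" covers everything queued so far. */
	for (int i = 0; i < mc_count; i++)
		printf("multicall: %s(%lu)\n", mc_queue[i].what, mc_queue[i].arg);
	mc_count = 0;
}

static void mc_issue(void)
{
	if (!lazy_cpu_mode)	/* outside lazy mode: flush immediately */
		mc_flush();	/* inside lazy mode: leave queued */
}

int main(void)
{
	lazy_cpu_mode = 1;
	mc_entry("fpu_taskswitch", 0);	/* like xen_clts() */
	mc_entry("fpu_taskswitch", 1);	/* like xen_write_cr0() setting TS */
	mc_issue();			/* still queued */
	lazy_cpu_mode = 0;
	mc_flush();			/* the batch goes out as one call */
	return 0;
}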
@@ -624,8 +711,10 @@ static unsigned long xen_read_cr2_direct(void)
 
 static void xen_write_cr4(unsigned long cr4)
 {
-	/* Just ignore cr4 changes; Xen doesn't allow us to do
-	   anything anyway. */
+	cr4 &= ~X86_CR4_PGE;
+	cr4 &= ~X86_CR4_PSE;
+
+	native_write_cr4(cr4);
 }
 
 static unsigned long xen_read_cr3(void)
@@ -638,33 +727,89 @@ static void set_current_cr3(void *v)
 	x86_write_percpu(xen_current_cr3, (unsigned long)v);
 }
 
-static void xen_write_cr3(unsigned long cr3)
+static void __xen_write_cr3(bool kernel, unsigned long cr3)
 {
 	struct mmuext_op *op;
 	struct multicall_space mcs;
-	unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
+	unsigned long mfn;
 
-	BUG_ON(preemptible());
+	if (cr3)
+		mfn = pfn_to_mfn(PFN_DOWN(cr3));
+	else
+		mfn = 0;
 
-	mcs = xen_mc_entry(sizeof(*op));  /* disables interrupts */
+	WARN_ON(mfn == 0 && kernel);
 
-	/* Update while interrupts are disabled, so its atomic with
-	   respect to ipis */
-	x86_write_percpu(xen_cr3, cr3);
+	mcs = __xen_mc_entry(sizeof(*op));
 
 	op = mcs.args;
-	op->cmd = MMUEXT_NEW_BASEPTR;
+	op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
 	op->arg1.mfn = mfn;
 
 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 
-	/* Update xen_update_cr3 once the batch has actually
-	   been submitted. */
-	xen_mc_callback(set_current_cr3, (void *)cr3);
+	if (kernel) {
+		x86_write_percpu(xen_cr3, cr3);
+
+		/* Update xen_current_cr3 once the batch has actually
+		   been submitted. */
+		xen_mc_callback(set_current_cr3, (void *)cr3);
+	}
+}
+
+static void xen_write_cr3(unsigned long cr3)
+{
+	BUG_ON(preemptible());
+
+	xen_mc_batch();  /* disables interrupts */
+
+	/* Update while interrupts are disabled, so its atomic with
+	   respect to ipis */
+	x86_write_percpu(xen_cr3, cr3);
+
+	__xen_write_cr3(true, cr3);
+
+#ifdef CONFIG_X86_64
+	{
+		pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
+		if (user_pgd)
+			__xen_write_cr3(false, __pa(user_pgd));
+		else
+			__xen_write_cr3(false, 0);
+	}
+#endif
 
 	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
 }
 
+static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
+{
+	int ret;
+
+	ret = 0;
+
+	switch(msr) {
+#ifdef CONFIG_X86_64
+	unsigned which;
+	u64 base;
+
+	case MSR_FS_BASE:		which = SEGBASE_FS; goto set;
+	case MSR_KERNEL_GS_BASE:	which = SEGBASE_GS_USER; goto set;
+	case MSR_GS_BASE:		which = SEGBASE_GS_KERNEL; goto set;
+
+	set:
+		base = ((u64)high << 32) | low;
+		if (HYPERVISOR_set_segment_base(which, base) != 0)
+			ret = -EFAULT;
+		break;
+#endif
+	default:
+		ret = native_write_msr_safe(msr, low, high);
+	}
+
+	return ret;
+}
+
 /* Early in boot, while setting up the initial pagetable, assume
    everything is pinned. */
 static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
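The wrmsr interface delivers a 64-bit value as two 32-bit halves, so xen_write_msr_safe() has to reassemble them before handing the base to HYPERVISOR_set_segment_base(). The packing step in isolation, with an arbitrary example value:

/* Sketch of the high/low reassembly; the base address is made up. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t low = 0x0000f000, high = 0x00007fff;	/* example halves */
	uint64_t base = ((uint64_t)high << 32) | low;

	printf("segment base = %#llx\n", (unsigned long long)base);
	return 0;
}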
@@ -721,6 +866,48 @@ static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
 	xen_alloc_ptpage(mm, pfn, PT_PMD);
 }
 
+static int xen_pgd_alloc(struct mm_struct *mm)
+{
+	pgd_t *pgd = mm->pgd;
+	int ret = 0;
+
+	BUG_ON(PagePinned(virt_to_page(pgd)));
+
+#ifdef CONFIG_X86_64
+	{
+		struct page *page = virt_to_page(pgd);
+		pgd_t *user_pgd;
+
+		BUG_ON(page->private != 0);
+
+		ret = -ENOMEM;
+
+		user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+		page->private = (unsigned long)user_pgd;
+
+		if (user_pgd != NULL) {
+			user_pgd[pgd_index(VSYSCALL_START)] =
+				__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
+			ret = 0;
+		}
+
+		BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
+	}
+#endif
+
+	return ret;
+}
+
+static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+	pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+	if (user_pgd)
+		free_page((unsigned long)user_pgd);
+#endif
+}
+
 /* This should never happen until we're OK to use struct page */
 static void xen_release_ptpage(u32 pfn, unsigned level)
 {
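On x86_64 each process now carries a second, user-mode toplevel pagetable, stashed in the pgd page's private field so that allocation and freeing stay paired. A rough user-space analogue of keeping such a pair together (the structure and names are invented for illustration):

/* Sketch: pairing a kernel toplevel with its user-mode shadow. */
#include <stdio.h>
#include <stdlib.h>

struct toplevel {
	unsigned long *kernel_pgd;
	unsigned long *user_pgd;	/* plays the role of page->private */
};

static int toplevel_alloc(struct toplevel *t)
{
	t->kernel_pgd = calloc(512, sizeof(unsigned long));
	t->user_pgd = calloc(512, sizeof(unsigned long));
	if (!t->kernel_pgd || !t->user_pgd) {
		free(t->kernel_pgd);
		free(t->user_pgd);
		return -1;	/* mirrors the -ENOMEM path */
	}
	return 0;
}

static void toplevel_free(struct toplevel *t)
{
	free(t->user_pgd);	/* mirrors xen_pgd_free() */
	free(t->kernel_pgd);
}

int main(void)
{
	struct toplevel t;

	if (toplevel_alloc(&t) == 0) {
		printf("kernel pgd %p shadowed by user pgd %p\n",
		       (void *)t.kernel_pgd, (void *)t.user_pgd);
		toplevel_free(&t);
	}
	return 0;
}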
@@ -746,6 +933,18 @@ static void xen_release_pmd(u32 pfn)
 	xen_release_ptpage(pfn, PT_PMD);
 }
 
+#if PAGETABLE_LEVELS == 4
+static void xen_alloc_pud(struct mm_struct *mm, u32 pfn)
+{
+	xen_alloc_ptpage(mm, pfn, PT_PUD);
+}
+
+static void xen_release_pud(u32 pfn)
+{
+	xen_release_ptpage(pfn, PT_PUD);
+}
+#endif
+
 #ifdef CONFIG_HIGHPTE
 static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
 {
@@ -784,68 +983,16 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
 
 static __init void xen_pagetable_setup_start(pgd_t *base)
 {
-	pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
-	int i;
-
-	/* special set_pte for pagetable initialization */
-	pv_mmu_ops.set_pte = xen_set_pte_init;
-
-	init_mm.pgd = base;
-	/*
-	 * copy top-level of Xen-supplied pagetable into place.  This
-	 * is a stand-in while we copy the pmd pages.
-	 */
-	memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
-
-	/*
-	 * For PAE, need to allocate new pmds, rather than
-	 * share Xen's, since Xen doesn't like pmd's being
-	 * shared between address spaces.
-	 */
-	for (i = 0; i < PTRS_PER_PGD; i++) {
-		if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
-			pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
-
-			memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
-			       PAGE_SIZE);
-
-			make_lowmem_page_readonly(pmd);
-
-			set_pgd(&base[i], __pgd(1 + __pa(pmd)));
-		} else
-			pgd_clear(&base[i]);
-	}
-
-	/* make sure zero_page is mapped RO so we can use it in pagetables */
-	make_lowmem_page_readonly(empty_zero_page);
-	make_lowmem_page_readonly(base);
-	/*
-	 * Switch to new pagetable.  This is done before
-	 * pagetable_init has done anything so that the new pages
-	 * added to the table can be prepared properly for Xen.
-	 */
-	xen_write_cr3(__pa(base));
-
-	/* Unpin initial Xen pagetable */
-	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
-			  PFN_DOWN(__pa(xen_start_info->pt_base)));
 }
 
-static __init void setup_shared_info(void)
+void xen_setup_shared_info(void)
 {
 	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
-		unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP);
-
-		/*
-		 * Create a mapping for the shared info page.
-		 * Should be set_fixmap(), but shared_info is a machine
-		 * address with no corresponding pseudo-phys address.
-		 */
-		set_pte_mfn(addr,
-			    PFN_DOWN(xen_start_info->shared_info),
-			    PAGE_KERNEL);
-
-		HYPERVISOR_shared_info = (struct shared_info *)addr;
+		set_fixmap(FIX_PARAVIRT_BOOTMAP,
+			   xen_start_info->shared_info);
+
+		HYPERVISOR_shared_info =
+			(struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
 	} else
 		HYPERVISOR_shared_info =
 			(struct shared_info *)__va(xen_start_info->shared_info);
@@ -854,27 +1001,43 @@ static __init void setup_shared_info(void)
 	/* In UP this is as good a place as any to set up shared info */
 	xen_setup_vcpu_info_placement();
 #endif
+
+	xen_setup_mfn_list_list();
 }
 
 static __init void xen_pagetable_setup_done(pgd_t *base)
 {
+	xen_setup_shared_info();
+}
+
+static __init void xen_post_allocator_init(void)
+{
+	pv_mmu_ops.set_pte = xen_set_pte;
+	pv_mmu_ops.set_pmd = xen_set_pmd;
+	pv_mmu_ops.set_pud = xen_set_pud;
+#if PAGETABLE_LEVELS == 4
+	pv_mmu_ops.set_pgd = xen_set_pgd;
+#endif
+
 	/* This will work as long as patching hasn't happened yet
 	   (which it hasn't) */
 	pv_mmu_ops.alloc_pte = xen_alloc_pte;
 	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
 	pv_mmu_ops.release_pte = xen_release_pte;
 	pv_mmu_ops.release_pmd = xen_release_pmd;
-	pv_mmu_ops.set_pte = xen_set_pte;
-
-	setup_shared_info();
+#if PAGETABLE_LEVELS == 4
+	pv_mmu_ops.alloc_pud = xen_alloc_pud;
+	pv_mmu_ops.release_pud = xen_release_pud;
+#endif
 
-	/* Actually pin the pagetable down, but we can't set PG_pinned
-	   yet because the page structures don't exist yet. */
-	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
+#ifdef CONFIG_X86_64
+	SetPagePinned(virt_to_page(level3_user_vsyscall));
+#endif
+	xen_mark_init_mm_pinned();
 }
 
 /* This is called once we have the cpu_possible_map */
-void __init xen_setup_vcpu_info_placement(void)
+void xen_setup_vcpu_info_placement(void)
 {
 	int cpu;
 
@@ -883,6 +1046,7 @@ void __init xen_setup_vcpu_info_placement(void)
 
 	/* xen_vcpu_setup managed to place the vcpu_info within the
 	   percpu area for all cpus, so make use of it */
+#ifdef CONFIG_X86_32
 	if (have_vcpu_info_placement) {
 		printk(KERN_INFO "Xen: using vcpu_info placement\n");
 
@@ -892,6 +1056,7 @@ void __init xen_setup_vcpu_info_placement(void)
 		pv_irq_ops.irq_enable = xen_irq_enable_direct;
 		pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
 	}
+#endif
 }
 
 static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -912,10 +1077,12 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
 		goto patch_site
 
 	switch (type) {
+#ifdef CONFIG_X86_32
 		SITE(pv_irq_ops, irq_enable);
 		SITE(pv_irq_ops, irq_disable);
 		SITE(pv_irq_ops, save_fl);
 		SITE(pv_irq_ops, restore_fl);
+#endif /* CONFIG_X86_32 */
 #undef SITE
 
 	patch_site:
@@ -947,6 +1114,49 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
 	return ret;
 }
 
+static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
+{
+	pte_t pte;
+
+	phys >>= PAGE_SHIFT;
+
+	switch (idx) {
+	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
+#ifdef CONFIG_X86_F00F_BUG
+	case FIX_F00F_IDT:
+#endif
+#ifdef CONFIG_X86_32
+	case FIX_WP_TEST:
+	case FIX_VDSO:
+# ifdef CONFIG_HIGHMEM
+	case FIX_KMAP_BEGIN ... FIX_KMAP_END:
+# endif
+#else
+	case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+	case FIX_APIC_BASE:	/* maps dummy local APIC */
+#endif
+		pte = pfn_pte(phys, prot);
+		break;
+
+	default:
+		pte = mfn_pte(phys, prot);
+		break;
+	}
+
+	__native_set_fixmap(idx, pte);
+
+#ifdef CONFIG_X86_64
+	/* Replicate changes to map the vsyscall page into the user
+	   pagetable vsyscall mapping. */
+	if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
+		unsigned long vaddr = __fix_to_virt(idx);
+		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
+	}
+#endif
+}
+
 static const struct pv_info xen_info __initdata = {
 	.paravirt_enabled = 1,
 	.shared_kernel_pmd = 0,
@@ -960,7 +1170,7 @@ static const struct pv_init_ops xen_init_ops __initdata = {
 	.banner = xen_banner,
 	.memory_setup = xen_memory_setup,
 	.arch_setup = xen_arch_setup,
-	.post_allocator_init = xen_mark_init_mm_pinned,
+	.post_allocator_init = xen_post_allocator_init,
 };
 
 static const struct pv_time_ops xen_time_ops __initdata = {
@@ -968,7 +1178,7 @@ static const struct pv_time_ops xen_time_ops __initdata = {
 
 	.set_wallclock = xen_set_wallclock,
 	.get_wallclock = xen_get_wallclock,
-	.get_cpu_khz = xen_cpu_khz,
+	.get_tsc_khz = xen_tsc_khz,
 	.sched_clock = xen_sched_clock,
 };
 
@@ -978,10 +1188,10 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 	.set_debugreg = xen_set_debugreg,
 	.get_debugreg = xen_get_debugreg,
 
-	.clts = native_clts,
+	.clts = xen_clts,
 
 	.read_cr0 = native_read_cr0,
-	.write_cr0 = native_write_cr0,
+	.write_cr0 = xen_write_cr0,
 
 	.read_cr4 = native_read_cr4,
 	.read_cr4_safe = native_read_cr4_safe,
@@ -990,18 +1200,25 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 	.wbinvd = native_wbinvd,
 
 	.read_msr = native_read_msr_safe,
-	.write_msr = native_write_msr_safe,
+	.write_msr = xen_write_msr_safe,
 	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
 
 	.iret = xen_iret,
-	.irq_enable_syscall_ret = xen_sysexit,
+	.irq_enable_sysexit = xen_sysexit,
+#ifdef CONFIG_X86_64
+	.usergs_sysret32 = xen_sysret32,
+	.usergs_sysret64 = xen_sysret64,
+#endif
 
 	.load_tr_desc = paravirt_nop,
 	.set_ldt = xen_set_ldt,
 	.load_gdt = xen_load_gdt,
 	.load_idt = xen_load_idt,
 	.load_tls = xen_load_tls,
+#ifdef CONFIG_X86_64
+	.load_gs_index = xen_load_gs_index,
+#endif
 
 	.store_gdt = native_store_gdt,
 	.store_idt = native_store_idt,
@@ -1015,26 +1232,48 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 	.set_iopl_mask = xen_set_iopl_mask,
 	.io_delay = xen_io_delay,
 
+	/* Xen takes care of %gs when switching to usermode for us */
+	.swapgs = paravirt_nop,
+
 	.lazy_mode = {
 		.enter = paravirt_enter_lazy_cpu,
 		.leave = xen_leave_lazy,
 	},
 };
 
+static void __init __xen_init_IRQ(void)
+{
+#ifdef CONFIG_X86_64
+	int i;
+
+	/* Create identity vector->irq map */
+	for(i = 0; i < NR_VECTORS; i++) {
+		int cpu;
+
+		for_each_possible_cpu(cpu)
+			per_cpu(vector_irq, cpu)[i] = i;
+	}
+#endif	/* CONFIG_X86_64 */
+
+	xen_init_IRQ();
+}
+
 static const struct pv_irq_ops xen_irq_ops __initdata = {
-	.init_IRQ = xen_init_IRQ,
+	.init_IRQ = __xen_init_IRQ,
 	.save_fl = xen_save_fl,
 	.restore_fl = xen_restore_fl,
 	.irq_disable = xen_irq_disable,
 	.irq_enable = xen_irq_enable,
 	.safe_halt = xen_safe_halt,
 	.halt = xen_halt,
+#ifdef CONFIG_X86_64
+	.adjust_exception_frame = xen_adjust_exception_frame,
+#endif
 };
 
 static const struct pv_apic_ops xen_apic_ops __initdata = {
 #ifdef CONFIG_X86_LOCAL_APIC
 	.apic_write = xen_apic_write,
-	.apic_write_atomic = xen_apic_write,
 	.apic_read = xen_apic_read,
 	.setup_boot_clock = paravirt_nop,
 	.setup_secondary_clock = paravirt_nop,
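All of the pv_*_ops structures follow the same pattern: a plain table of function pointers that xen_start_kernel() overwrites wholesale, so every later call site dispatches to the Xen variant instead of the native one. A stripped-down illustration of that pattern (the names are invented for the sketch):

/* Toy ops table: swap implementations by reassigning the struct. */
#include <stdio.h>

struct irq_ops {
	void (*irq_disable)(void);
	void (*irq_enable)(void);
};

static void native_irq_disable(void) { puts("cli"); }
static void native_irq_enable(void)  { puts("sti"); }
static void toy_xen_irq_disable(void) { puts("set evtchn_upcall_mask"); }
static void toy_xen_irq_enable(void)  { puts("clear evtchn_upcall_mask"); }

static struct irq_ops ops = { native_irq_disable, native_irq_enable };

int main(void)
{
	ops.irq_enable();	/* native path */
	ops = (struct irq_ops){ toy_xen_irq_disable, toy_xen_irq_enable };
	ops.irq_enable();	/* paravirtualized path */
	return 0;
}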
@@ -1060,6 +1299,9 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
 	.pte_update = paravirt_nop,
 	.pte_update_defer = paravirt_nop,
 
+	.pgd_alloc = xen_pgd_alloc,
+	.pgd_free = xen_pgd_free,
+
 	.alloc_pte = xen_alloc_pte_init,
 	.release_pte = xen_release_pte_init,
 	.alloc_pmd = xen_alloc_pte_init,
@@ -1070,25 +1312,44 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
 	.kmap_atomic_pte = xen_kmap_atomic_pte,
 #endif
 
-	.set_pte = NULL,	/* see xen_pagetable_setup_* */
+#ifdef CONFIG_X86_64
+	.set_pte = xen_set_pte,
+#else
+	.set_pte = xen_set_pte_init,
+#endif
 	.set_pte_at = xen_set_pte_at,
-	.set_pmd = xen_set_pmd,
+	.set_pmd = xen_set_pmd_hyper,
+
+	.ptep_modify_prot_start = __ptep_modify_prot_start,
+	.ptep_modify_prot_commit = __ptep_modify_prot_commit,
 
 	.pte_val = xen_pte_val,
+	.pte_flags = native_pte_val,
 	.pgd_val = xen_pgd_val,
 
 	.make_pte = xen_make_pte,
 	.make_pgd = xen_make_pgd,
 
+#ifdef CONFIG_X86_PAE
 	.set_pte_atomic = xen_set_pte_atomic,
 	.set_pte_present = xen_set_pte_at,
-	.set_pud = xen_set_pud,
 	.pte_clear = xen_pte_clear,
 	.pmd_clear = xen_pmd_clear,
+#endif	/* CONFIG_X86_PAE */
+	.set_pud = xen_set_pud_hyper,
 
 	.make_pmd = xen_make_pmd,
 	.pmd_val = xen_pmd_val,
 
+#if PAGETABLE_LEVELS == 4
+	.pud_val = xen_pud_val,
+	.make_pud = xen_make_pud,
+	.set_pgd = xen_set_pgd_hyper,
+
+	.alloc_pud = xen_alloc_pte_init,
+	.release_pud = xen_release_pte_init,
+#endif	/* PAGETABLE_LEVELS == 4 */
+
 	.activate_mm = xen_activate_mm,
 	.dup_mmap = xen_dup_mmap,
 	.exit_mmap = xen_exit_mmap,
@@ -1097,28 +1358,19 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
 		.enter = paravirt_enter_lazy_mmu,
 		.leave = xen_leave_lazy,
 	},
-};
 
-#ifdef CONFIG_SMP
-static const struct smp_ops xen_smp_ops __initdata = {
-	.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
-	.smp_prepare_cpus = xen_smp_prepare_cpus,
-	.cpu_up = xen_cpu_up,
-	.smp_cpus_done = xen_smp_cpus_done,
-
-	.smp_send_stop = xen_smp_send_stop,
-	.smp_send_reschedule = xen_smp_send_reschedule,
-	.smp_call_function_mask = xen_smp_call_function_mask,
+	.set_fixmap = xen_set_fixmap,
 };
-#endif	/* CONFIG_SMP */
 
 static void xen_reboot(int reason)
 {
+	struct sched_shutdown r = { .reason = reason };
+
 #ifdef CONFIG_SMP
 	smp_send_stop();
 #endif
 
-	if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason))
+	if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
 		BUG();
 }
 
@@ -1154,6 +1406,7 @@ static const struct machine_ops __initdata xen_machine_ops = {
 
 static void __init xen_reserve_top(void)
 {
+#ifdef CONFIG_X86_32
 	unsigned long top = HYPERVISOR_VIRT_START;
 	struct xen_platform_parameters pp;
 
@@ -1161,8 +1414,248 @@ static void __init xen_reserve_top(void)
 		top = pp.virt_start;
 
 	reserve_top_address(-top + 2 * PAGE_SIZE);
+#endif	/* CONFIG_X86_32 */
+}
+
+/*
+ * Like __va(), but returns address in the kernel mapping (which is
+ * all we have until the physical memory mapping has been set up).
+ */
+static void *__ka(phys_addr_t paddr)
+{
+#ifdef CONFIG_X86_64
+	return (void *)(paddr + __START_KERNEL_map);
+#else
+	return __va(paddr);
+#endif
+}
+
+/* Convert a machine address to physical address */
+static unsigned long m2p(phys_addr_t maddr)
+{
+	phys_addr_t paddr;
+
+	maddr &= PTE_MASK;
+	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
+
+	return paddr;
 }
 
+/* Convert a machine address to kernel virtual */
+static void *m2v(phys_addr_t maddr)
+{
+	return __ka(m2p(maddr));
+}
+
+#ifdef CONFIG_X86_64
+static void walk(pgd_t *pgd, unsigned long addr)
+{
+	unsigned l4idx = pgd_index(addr);
+	unsigned l3idx = pud_index(addr);
+	unsigned l2idx = pmd_index(addr);
+	unsigned l1idx = pte_index(addr);
+	pgd_t l4;
+	pud_t l3;
+	pmd_t l2;
+	pte_t l1;
+
+	xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
+		       pgd, addr, l4idx, l3idx, l2idx, l1idx);
+
+	l4 = pgd[l4idx];
+	xen_raw_printk("  l4: %016lx\n", l4.pgd);
+	xen_raw_printk("      %016lx\n", pgd_val(l4));
+
+	l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
+	xen_raw_printk("  l3: %016lx\n", l3.pud);
+	xen_raw_printk("      %016lx\n", pud_val(l3));
+
+	l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
+	xen_raw_printk("  l2: %016lx\n", l2.pmd);
+	xen_raw_printk("      %016lx\n", pmd_val(l2));
+
+	l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
+	xen_raw_printk("  l1: %016lx\n", l1.pte);
+	xen_raw_printk("      %016lx\n", pte_val(l1));
+}
+#endif
+
+static void set_page_prot(void *addr, pgprot_t prot)
+{
+	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
+	pte_t pte = pfn_pte(pfn, prot);
+
+	xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
+		       addr, pfn, get_phys_to_machine(pfn),
+		       pgprot_val(prot), pte.pte);
+
+	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
+		BUG();
+}
+
+static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+{
+	unsigned pmdidx, pteidx;
+	unsigned ident_pte;
+	unsigned long pfn;
+
+	ident_pte = 0;
+	pfn = 0;
+	for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
+		pte_t *pte_page;
+
+		/* Reuse or allocate a page of ptes */
+		if (pmd_present(pmd[pmdidx]))
+			pte_page = m2v(pmd[pmdidx].pmd);
+		else {
+			/* Check for free pte pages */
+			if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
+				break;
+
+			pte_page = &level1_ident_pgt[ident_pte];
+			ident_pte += PTRS_PER_PTE;
+
+			pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
+		}
+
+		/* Install mappings */
+		for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
+			pte_t pte;
+
+			if (pfn > max_pfn_mapped)
+				max_pfn_mapped = pfn;
+
+			if (!pte_none(pte_page[pteidx]))
+				continue;
+
+			pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
+			pte_page[pteidx] = pte;
+		}
+	}
+
+	for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
+		set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
+
+	set_page_prot(pmd, PAGE_KERNEL_RO);
+}
+
+#ifdef CONFIG_X86_64
+static void convert_pfn_mfn(void *v)
+{
+	pte_t *pte = v;
+	int i;
+
+	/* All levels are converted the same way, so just treat them
+	   as ptes. */
+	for(i = 0; i < PTRS_PER_PTE; i++)
+		pte[i] = xen_make_pte(pte[i].pte);
+}
+
+/*
+ * Set up the initial kernel pagetable.
+ *
+ * We can construct this by grafting the Xen provided pagetable into
+ * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
+ * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
+ * means that only the kernel has a physical mapping to start with -
+ * but that's enough to get __va working.  We need to fill in the rest
+ * of the physical mapping once some sort of allocator has been set
+ * up.
+ */
+static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
+{
+	pud_t *l3;
+	pmd_t *l2;
+
+	/* Zap identity mapping */
+	init_level4_pgt[0] = __pgd(0);
+
+	/* Pre-constructed entries are in pfn, so convert to mfn */
+	convert_pfn_mfn(init_level4_pgt);
+	convert_pfn_mfn(level3_ident_pgt);
+	convert_pfn_mfn(level3_kernel_pgt);
+
+	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
+	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
+
+	memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+	memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+
+	l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
+	l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
+	memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+
+	/* Set up identity map */
+	xen_map_identity_early(level2_ident_pgt, max_pfn);
+
+	/* Make pagetable pieces RO */
+	set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
+	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
+	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
+
+	/* Pin down new L4 */
+	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
+			  PFN_DOWN(__pa_symbol(init_level4_pgt)));
+
+	/* Unpin Xen-provided one */
+	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+	/* Switch over */
+	pgd = init_level4_pgt;
+
+	/*
+	 * At this stage there can be no user pgd, and no page
+	 * structure to attach it to, so make sure we just set kernel
+	 * pgd.
+	 */
+	xen_mc_batch();
+	__xen_write_cr3(true, __pa(pgd));
+	xen_mc_issue(PARAVIRT_LAZY_CPU);
+
+	reserve_early(__pa(xen_start_info->pt_base),
+		      __pa(xen_start_info->pt_base +
+			   xen_start_info->nr_pt_frames * PAGE_SIZE),
+		      "XEN PAGETABLES");
+
+	return pgd;
+}
+#else	/* !CONFIG_X86_64 */
+static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
+
+static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
+{
+	pmd_t *kernel_pmd;
+
+	init_pg_tables_start = __pa(pgd);
+	init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
+	max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
+
+	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
+	memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
+
+	xen_map_identity_early(level2_kernel_pgt, max_pfn);
+
+	memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
+	set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
+			__pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
+
+	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
+	set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
+
+	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+	xen_write_cr3(__pa(swapper_pg_dir));
+
+	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
+
+	return swapper_pg_dir;
+}
+#endif	/* CONFIG_X86_64 */
+
 /* First C function to be called on Xen boot */
 asmlinkage void __init xen_start_kernel(void)
 {
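Much of the new pagetable code pivots on the split between pseudo-physical frame numbers (pfns) and machine frame numbers (mfns): convert_pfn_mfn() rewrites pfn-based entries to mfns, while m2p() walks the reverse mapping. A toy model of the two lookup directions (the tiny table and values are invented; the real mappings are maintained by xen_build_dynamic_phys_to_machine() and the hypervisor's m2p table):

/* Sketch of pfn<->mfn translation over an invented 4-entry p2m table. */
#include <stdio.h>

#define NPAGES 4

static const unsigned long p2m[NPAGES] = { 7, 2, 5, 11 };

static unsigned long m2p_lookup(unsigned long mfn)
{
	for (unsigned long pfn = 0; pfn < NPAGES; pfn++)
		if (p2m[pfn] == mfn)
			return pfn;
	return ~0UL;	/* not one of our frames */
}

int main(void)
{
	unsigned long pfn = 2;
	unsigned long mfn = p2m[pfn];		/* like pfn_to_mfn() */

	printf("pfn %lu -> mfn %lu -> pfn %lu\n",
	       pfn, mfn, m2p_lookup(mfn));	/* like mfn_to_pfn() */
	return 0;
}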
@@ -1173,6 +1666,8 @@ asmlinkage void __init xen_start_kernel(void)
 
 	BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);
 
+	xen_setup_features();
+
 	/* Install Xen paravirt ops */
 	pv_info = xen_info;
 	pv_init_ops = xen_init_ops;
@@ -1182,59 +1677,85 @@ asmlinkage void __init xen_start_kernel(void)
 	pv_apic_ops = xen_apic_ops;
 	pv_mmu_ops = xen_mmu_ops;
 
+	if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
+		pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
+		pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
+	}
+
 	machine_ops = xen_machine_ops;
 
-#ifdef CONFIG_SMP
-	smp_ops = xen_smp_ops;
+#ifdef CONFIG_X86_64
+	/* Disable until direct per-cpu data access. */
+	have_vcpu_info_placement = 0;
+	x86_64_init_pda();
 #endif
 
-	xen_setup_features();
+	xen_smp_init();
 
 	/* Get mfn list */
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
-		phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list;
+		xen_build_dynamic_phys_to_machine();
 
 	pgd = (pgd_t *)xen_start_info->pt_base;
 
-	init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
-
-	init_mm.pgd = pgd; /* use the Xen pagetables to start */
-
-	/* keep using Xen gdt for now; no urgent need to change it */
-
-	x86_write_percpu(xen_cr3, __pa(pgd));
-	x86_write_percpu(xen_current_cr3, __pa(pgd));
+	/* Prevent unwanted bits from being set in PTEs. */
+	__supported_pte_mask &= ~_PAGE_GLOBAL;
+	if (!is_initial_xendomain())
+		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
 
 	/* Don't do the full vcpu_info placement stuff until we have a
 	   possible map and a non-dummy shared_info. */
 	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
 
+	xen_raw_console_write("mapping kernel into physical memory\n");
+	pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
+
+	init_mm.pgd = pgd;
+
+	/* keep using Xen gdt for now; no urgent need to change it */
+
 	pv_info.kernel_rpl = 1;
 	if (xen_feature(XENFEAT_supervisor_mode_kernel))
 		pv_info.kernel_rpl = 0;
 
-	/* Prevent unwanted bits from being set in PTEs. */
-	__supported_pte_mask &= ~_PAGE_GLOBAL;
-	if (!is_initial_xendomain())
-		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
-
 	/* set the limit of our address space */
 	xen_reserve_top();
 
+#ifdef CONFIG_X86_32
 	/* set up basic CPUID stuff */
 	cpu_detect(&new_cpu_data);
 	new_cpu_data.hard_math = 1;
 	new_cpu_data.x86_capability[0] = cpuid_edx(1);
+#endif
 
 	/* Poke various useful things into boot_params */
 	boot_params.hdr.type_of_loader = (9 << 4) | 0;
 	boot_params.hdr.ramdisk_image = xen_start_info->mod_start
 		? __pa(xen_start_info->mod_start) : 0;
 	boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
+	boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
 
-	if (!is_initial_xendomain())
+	if (!is_initial_xendomain()) {
+		add_preferred_console("xenboot", 0, NULL);
+		add_preferred_console("tty", 0, NULL);
 		add_preferred_console("hvc", 0, NULL);
+	}
+
+	xen_raw_console_write("about to get started...\n");
+
+#if 0
+	xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
+		       &boot_params, __pa_symbol(&boot_params),
+		       __va(__pa_symbol(&boot_params)));
+
+	walk(pgd, &boot_params);
+	walk(pgd, __va(__pa(&boot_params)));
+#endif
 
 	/* Start the world */
-	start_kernel();
+#ifdef CONFIG_X86_32
+	i386_start_kernel();
+#else
+	x86_64_start_reservations((char *)__pa_symbol(&boot_params));
+#endif
 }