Diffstat (limited to 'arch/x86/xen')
-rw-r--r--  arch/x86/xen/Kconfig                                              |  20
-rw-r--r--  arch/x86/xen/Makefile                                             |   2
-rw-r--r--  arch/x86/xen/enlighten.c                                          | 835
-rw-r--r--  arch/x86/xen/manage.c                                             | 143
-rw-r--r--  arch/x86/xen/mmu.c                                                | 568
-rw-r--r--  arch/x86/xen/mmu.h                                                |  39
-rw-r--r--  arch/x86/xen/multicalls.c                                         |  41
-rw-r--r--  arch/x86/xen/multicalls.h                                         |  12
-rw-r--r--  arch/x86/xen/setup.c                                              | 109
-rw-r--r--  arch/x86/xen/smp.c                                                | 268
-rw-r--r--  arch/x86/xen/suspend.c                                            |  48
-rw-r--r--  arch/x86/xen/time.c                                               |  17
-rw-r--r--  arch/x86/xen/xen-asm_32.S (renamed from arch/x86/xen/xen-asm.S)   |   0
-rw-r--r--  arch/x86/xen/xen-asm_64.S                                         | 271
-rw-r--r--  arch/x86/xen/xen-head.S                                           |  31
-rw-r--r--  arch/x86/xen/xen-ops.h                                            |  35
16 files changed, 1837 insertions, 602 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 6c388e593bc8..3815e425f470 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,9 +6,25 @@ config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 select PARAVIRT_CLOCK 8 select PARAVIRT_CLOCK
9 depends on X86_32 9 depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
10 depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) 10 depends on X86_CMPXCHG && X86_TSC
11 help 11 help
12 This is the Linux Xen port. Enabling this will allow the 12 This is the Linux Xen port. Enabling this will allow the
13 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
14 Xen hypervisor. 14 Xen hypervisor.
15
16config XEN_MAX_DOMAIN_MEMORY
17 int "Maximum allowed size of a domain in gigabytes"
18 default 8 if X86_32
19 default 32 if X86_64
20 depends on XEN
21 help
22 The pseudo-physical to machine address array is sized
23 according to the maximum possible memory size of a Xen
24 domain. This array uses 1 page per gigabyte, so there's no
25 need to be too stingy here.
26
27config XEN_SAVE_RESTORE
28 bool
29 depends on PM
30 default y \ No newline at end of file
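
Editor's note, not part of the patch: the help text above says the pseudo-physical-to-machine lookup costs roughly one page per gigabyte of domain memory. A minimal standalone C sketch of that arithmetic follows, assuming 4 KiB pages and 8-byte p2m entries as on x86_64 (both assumptions, not taken from the patch itself):

	#include <stdio.h>

	int main(void)
	{
		unsigned long page_size = 4096;                        /* bytes per page (assumed) */
		unsigned long entries = page_size / sizeof(void *);    /* 512 p2m entries per page on x86_64 */
		unsigned long ram_per_leaf = entries * page_size;      /* one leaf p2m page covers 2 MiB */
		unsigned long ram_per_top = entries * ram_per_leaf;    /* one page of top-level pointers covers 1 GiB */

		printf("one page of top-level p2m pointers covers %lu MiB\n",
		       ram_per_top >> 20);
		return 0;
	}

So the top-level pointer array grows by about one page per configured gigabyte, which is why the help text suggests there is no need to be stingy with XEN_MAX_DOMAIN_MEMORY.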
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 3d8df981d5fd..59c1e539aed2 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
1obj-y := enlighten.o setup.o multicalls.o mmu.o \ 1obj-y := enlighten.o setup.o multicalls.o mmu.o \
2 time.o manage.o xen-asm.o grant-table.o 2 time.o xen-asm_$(BITS).o grant-table.o suspend.o
3 3
4obj-$(CONFIG_SMP) += smp.o 4obj-$(CONFIG_SMP) += smp.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f09c1c69c37a..194bbd6e3241 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -33,6 +33,7 @@
33#include <xen/interface/sched.h> 33#include <xen/interface/sched.h>
34#include <xen/features.h> 34#include <xen/features.h>
35#include <xen/page.h> 35#include <xen/page.h>
36#include <xen/hvc-console.h>
36 37
37#include <asm/paravirt.h> 38#include <asm/paravirt.h>
38#include <asm/page.h> 39#include <asm/page.h>
@@ -40,6 +41,7 @@
40#include <asm/xen/hypervisor.h> 41#include <asm/xen/hypervisor.h>
41#include <asm/fixmap.h> 42#include <asm/fixmap.h>
42#include <asm/processor.h> 43#include <asm/processor.h>
44#include <asm/msr-index.h>
43#include <asm/setup.h> 45#include <asm/setup.h>
44#include <asm/desc.h> 46#include <asm/desc.h>
45#include <asm/pgtable.h> 47#include <asm/pgtable.h>
@@ -56,6 +58,18 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
56DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); 58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
57 59
58/* 60/*
61 * Identity map, in addition to plain kernel map. This needs to be
62 * large enough to allocate page table pages to allocate the rest.
63 * Each page can map 2MB.
64 */
65static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
66
67#ifdef CONFIG_X86_64
68/* l3 pud for userspace vsyscall mapping */
69static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
70#endif /* CONFIG_X86_64 */
71
72/*
59 * Note about cr3 (pagetable base) values: 73 * Note about cr3 (pagetable base) values:
60 * 74 *
61 * xen_cr3 contains the current logical cr3 value; it contains the 75 * xen_cr3 contains the current logical cr3 value; it contains the
@@ -75,13 +89,13 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
75struct start_info *xen_start_info; 89struct start_info *xen_start_info;
76EXPORT_SYMBOL_GPL(xen_start_info); 90EXPORT_SYMBOL_GPL(xen_start_info);
77 91
78static /* __initdata */ struct shared_info dummy_shared_info; 92struct shared_info xen_dummy_shared_info;
79 93
80/* 94/*
81 * Point at some empty memory to start with. We map the real shared_info 95 * Point at some empty memory to start with. We map the real shared_info
82 * page as soon as fixmap is up and running. 96 * page as soon as fixmap is up and running.
83 */ 97 */
84struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info; 98struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
85 99
86/* 100/*
87 * Flag to determine whether vcpu info placement is available on all 101 * Flag to determine whether vcpu info placement is available on all
@@ -98,13 +112,13 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
98 */ 112 */
99static int have_vcpu_info_placement = 1; 113static int have_vcpu_info_placement = 1;
100 114
101static void __init xen_vcpu_setup(int cpu) 115static void xen_vcpu_setup(int cpu)
102{ 116{
103 struct vcpu_register_vcpu_info info; 117 struct vcpu_register_vcpu_info info;
104 int err; 118 int err;
105 struct vcpu_info *vcpup; 119 struct vcpu_info *vcpup;
106 120
107 BUG_ON(HYPERVISOR_shared_info == &dummy_shared_info); 121 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
108 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; 122 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
109 123
110 if (!have_vcpu_info_placement) 124 if (!have_vcpu_info_placement)
@@ -136,11 +150,45 @@ static void __init xen_vcpu_setup(int cpu)
136 } 150 }
137} 151}
138 152
153/*
154 * On restore, set the vcpu placement up again.
155 * If it fails, then we're in a bad state, since
156 * we can't back out from using it...
157 */
158void xen_vcpu_restore(void)
159{
160 if (have_vcpu_info_placement) {
161 int cpu;
162
163 for_each_online_cpu(cpu) {
164 bool other_cpu = (cpu != smp_processor_id());
165
166 if (other_cpu &&
167 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
168 BUG();
169
170 xen_vcpu_setup(cpu);
171
172 if (other_cpu &&
173 HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
174 BUG();
175 }
176
177 BUG_ON(!have_vcpu_info_placement);
178 }
179}
180
139static void __init xen_banner(void) 181static void __init xen_banner(void)
140{ 182{
183 unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
184 struct xen_extraversion extra;
185 HYPERVISOR_xen_version(XENVER_extraversion, &extra);
186
141 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 187 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
142 pv_info.name); 188 pv_info.name);
143 printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); 189 printk(KERN_INFO "Xen version: %d.%d%s%s\n",
190 version >> 16, version & 0xffff, extra.extraversion,
191 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
144} 192}
145 193
146static void xen_cpuid(unsigned int *ax, unsigned int *bx, 194static void xen_cpuid(unsigned int *ax, unsigned int *bx,
@@ -235,13 +283,13 @@ static void xen_irq_enable(void)
235{ 283{
236 struct vcpu_info *vcpu; 284 struct vcpu_info *vcpu;
237 285
238 /* There's a one instruction preempt window here. We need to 286 /* We don't need to worry about being preempted here, since
239 make sure we're don't switch CPUs between getting the vcpu 287 either a) interrupts are disabled, so no preemption, or b)
240 pointer and updating the mask. */ 288 the caller is confused and is trying to re-enable interrupts
241 preempt_disable(); 289 on an indeterminate processor. */
290
242 vcpu = x86_read_percpu(xen_vcpu); 291 vcpu = x86_read_percpu(xen_vcpu);
243 vcpu->evtchn_upcall_mask = 0; 292 vcpu->evtchn_upcall_mask = 0;
244 preempt_enable_no_resched();
245 293
246 /* Doesn't matter if we get preempted here, because any 294 /* Doesn't matter if we get preempted here, because any
247 pending event will get dealt with anyway. */ 295 pending event will get dealt with anyway. */
@@ -254,7 +302,7 @@ static void xen_irq_enable(void)
254static void xen_safe_halt(void) 302static void xen_safe_halt(void)
255{ 303{
256 /* Blocking includes an implicit local_irq_enable(). */ 304 /* Blocking includes an implicit local_irq_enable(). */
257 if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0) 305 if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
258 BUG(); 306 BUG();
259} 307}
260 308
@@ -332,14 +380,6 @@ static void load_TLS_descriptor(struct thread_struct *t,
332 380
333static void xen_load_tls(struct thread_struct *t, unsigned int cpu) 381static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
334{ 382{
335 xen_mc_batch();
336
337 load_TLS_descriptor(t, cpu, 0);
338 load_TLS_descriptor(t, cpu, 1);
339 load_TLS_descriptor(t, cpu, 2);
340
341 xen_mc_issue(PARAVIRT_LAZY_CPU);
342
343 /* 383 /*
344 * XXX sleazy hack: If we're being called in a lazy-cpu zone, 384 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
345 * it means we're in a context switch, and %gs has just been 385 * it means we're in a context switch, and %gs has just been
@@ -348,10 +388,39 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
348 * Either way, it has been saved, and the new value will get 388 * Either way, it has been saved, and the new value will get
349 * loaded properly. This will go away as soon as Xen has been 389 * loaded properly. This will go away as soon as Xen has been
350 * modified to not save/restore %gs for normal hypercalls. 390 * modified to not save/restore %gs for normal hypercalls.
391 *
392 * On x86_64, this hack is not used for %gs, because gs points
393 * to KERNEL_GS_BASE (and uses it for PDA references), so we
394 * must not zero %gs on x86_64
395 *
396 * For x86_64, we need to zero %fs, otherwise we may get an
397 * exception between the new %fs descriptor being loaded and
398 * %fs being effectively cleared at __switch_to().
351 */ 399 */
352 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) 400 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
401#ifdef CONFIG_X86_32
353 loadsegment(gs, 0); 402 loadsegment(gs, 0);
403#else
404 loadsegment(fs, 0);
405#endif
406 }
407
408 xen_mc_batch();
409
410 load_TLS_descriptor(t, cpu, 0);
411 load_TLS_descriptor(t, cpu, 1);
412 load_TLS_descriptor(t, cpu, 2);
413
414 xen_mc_issue(PARAVIRT_LAZY_CPU);
415}
416
417#ifdef CONFIG_X86_64
418static void xen_load_gs_index(unsigned int idx)
419{
420 if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
421 BUG();
354} 422}
423#endif
355 424
356static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, 425static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
357 const void *ptr) 426 const void *ptr)
@@ -369,23 +438,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
369 preempt_enable(); 438 preempt_enable();
370} 439}
371 440
372static int cvt_gate_to_trap(int vector, u32 low, u32 high, 441static int cvt_gate_to_trap(int vector, const gate_desc *val,
373 struct trap_info *info) 442 struct trap_info *info)
374{ 443{
375 u8 type, dpl; 444 if (val->type != 0xf && val->type != 0xe)
376
377 type = (high >> 8) & 0x1f;
378 dpl = (high >> 13) & 3;
379
380 if (type != 0xf && type != 0xe)
381 return 0; 445 return 0;
382 446
383 info->vector = vector; 447 info->vector = vector;
384 info->address = (high & 0xffff0000) | (low & 0x0000ffff); 448 info->address = gate_offset(*val);
385 info->cs = low >> 16; 449 info->cs = gate_segment(*val);
386 info->flags = dpl; 450 info->flags = val->dpl;
387 /* interrupt gates clear IF */ 451 /* interrupt gates clear IF */
388 if (type == 0xe) 452 if (val->type == 0xe)
389 info->flags |= 4; 453 info->flags |= 4;
390 454
391 return 1; 455 return 1;
@@ -412,11 +476,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
412 476
413 if (p >= start && (p + 8) <= end) { 477 if (p >= start && (p + 8) <= end) {
414 struct trap_info info[2]; 478 struct trap_info info[2];
415 u32 *desc = (u32 *)g;
416 479
417 info[1].address = 0; 480 info[1].address = 0;
418 481
419 if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0])) 482 if (cvt_gate_to_trap(entrynum, g, &info[0]))
420 if (HYPERVISOR_set_trap_table(info)) 483 if (HYPERVISOR_set_trap_table(info))
421 BUG(); 484 BUG();
422 } 485 }
@@ -429,13 +492,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
429{ 492{
430 unsigned in, out, count; 493 unsigned in, out, count;
431 494
432 count = (desc->size+1) / 8; 495 count = (desc->size+1) / sizeof(gate_desc);
433 BUG_ON(count > 256); 496 BUG_ON(count > 256);
434 497
435 for (in = out = 0; in < count; in++) { 498 for (in = out = 0; in < count; in++) {
436 const u32 *entry = (u32 *)(desc->address + in * 8); 499 gate_desc *entry = (gate_desc*)(desc->address) + in;
437 500
438 if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) 501 if (cvt_gate_to_trap(in, entry, &traps[out]))
439 out++; 502 out++;
440 } 503 }
441 traps[out].address = 0; 504 traps[out].address = 0;
@@ -607,6 +670,30 @@ static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
607 xen_mc_issue(PARAVIRT_LAZY_MMU); 670 xen_mc_issue(PARAVIRT_LAZY_MMU);
608} 671}
609 672
673static void xen_clts(void)
674{
675 struct multicall_space mcs;
676
677 mcs = xen_mc_entry(0);
678
679 MULTI_fpu_taskswitch(mcs.mc, 0);
680
681 xen_mc_issue(PARAVIRT_LAZY_CPU);
682}
683
684static void xen_write_cr0(unsigned long cr0)
685{
686 struct multicall_space mcs;
687
688 /* Only pay attention to cr0.TS; everything else is
689 ignored. */
690 mcs = xen_mc_entry(0);
691
692 MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);
693
694 xen_mc_issue(PARAVIRT_LAZY_CPU);
695}
696
610static void xen_write_cr2(unsigned long cr2) 697static void xen_write_cr2(unsigned long cr2)
611{ 698{
612 x86_read_percpu(xen_vcpu)->arch.cr2 = cr2; 699 x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
@@ -624,8 +711,10 @@ static unsigned long xen_read_cr2_direct(void)
624 711
625static void xen_write_cr4(unsigned long cr4) 712static void xen_write_cr4(unsigned long cr4)
626{ 713{
627 /* Just ignore cr4 changes; Xen doesn't allow us to do 714 cr4 &= ~X86_CR4_PGE;
628 anything anyway. */ 715 cr4 &= ~X86_CR4_PSE;
716
717 native_write_cr4(cr4);
629} 718}
630 719
631static unsigned long xen_read_cr3(void) 720static unsigned long xen_read_cr3(void)
@@ -638,33 +727,89 @@ static void set_current_cr3(void *v)
638 x86_write_percpu(xen_current_cr3, (unsigned long)v); 727 x86_write_percpu(xen_current_cr3, (unsigned long)v);
639} 728}
640 729
641static void xen_write_cr3(unsigned long cr3) 730static void __xen_write_cr3(bool kernel, unsigned long cr3)
642{ 731{
643 struct mmuext_op *op; 732 struct mmuext_op *op;
644 struct multicall_space mcs; 733 struct multicall_space mcs;
645 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); 734 unsigned long mfn;
646 735
647 BUG_ON(preemptible()); 736 if (cr3)
737 mfn = pfn_to_mfn(PFN_DOWN(cr3));
738 else
739 mfn = 0;
648 740
649 mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ 741 WARN_ON(mfn == 0 && kernel);
650 742
651 /* Update while interrupts are disabled, so its atomic with 743 mcs = __xen_mc_entry(sizeof(*op));
652 respect to ipis */
653 x86_write_percpu(xen_cr3, cr3);
654 744
655 op = mcs.args; 745 op = mcs.args;
656 op->cmd = MMUEXT_NEW_BASEPTR; 746 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
657 op->arg1.mfn = mfn; 747 op->arg1.mfn = mfn;
658 748
659 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 749 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
660 750
661 /* Update xen_update_cr3 once the batch has actually 751 if (kernel) {
662 been submitted. */ 752 x86_write_percpu(xen_cr3, cr3);
663 xen_mc_callback(set_current_cr3, (void *)cr3); 753
754 /* Update xen_current_cr3 once the batch has actually
755 been submitted. */
756 xen_mc_callback(set_current_cr3, (void *)cr3);
757 }
758}
759
760static void xen_write_cr3(unsigned long cr3)
761{
762 BUG_ON(preemptible());
763
764 xen_mc_batch(); /* disables interrupts */
765
766 /* Update while interrupts are disabled, so its atomic with
767 respect to ipis */
768 x86_write_percpu(xen_cr3, cr3);
769
770 __xen_write_cr3(true, cr3);
771
772#ifdef CONFIG_X86_64
773 {
774 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
775 if (user_pgd)
776 __xen_write_cr3(false, __pa(user_pgd));
777 else
778 __xen_write_cr3(false, 0);
779 }
780#endif
664 781
665 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 782 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
666} 783}
667 784
785static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
786{
787 int ret;
788
789 ret = 0;
790
791 switch(msr) {
792#ifdef CONFIG_X86_64
793 unsigned which;
794 u64 base;
795
796 case MSR_FS_BASE: which = SEGBASE_FS; goto set;
797 case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
798 case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
799
800 set:
801 base = ((u64)high << 32) | low;
802 if (HYPERVISOR_set_segment_base(which, base) != 0)
803 ret = -EFAULT;
804 break;
805#endif
806 default:
807 ret = native_write_msr_safe(msr, low, high);
808 }
809
810 return ret;
811}
812
668/* Early in boot, while setting up the initial pagetable, assume 813/* Early in boot, while setting up the initial pagetable, assume
669 everything is pinned. */ 814 everything is pinned. */
670static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) 815static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
@@ -721,6 +866,48 @@ static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
721 xen_alloc_ptpage(mm, pfn, PT_PMD); 866 xen_alloc_ptpage(mm, pfn, PT_PMD);
722} 867}
723 868
869static int xen_pgd_alloc(struct mm_struct *mm)
870{
871 pgd_t *pgd = mm->pgd;
872 int ret = 0;
873
874 BUG_ON(PagePinned(virt_to_page(pgd)));
875
876#ifdef CONFIG_X86_64
877 {
878 struct page *page = virt_to_page(pgd);
879 pgd_t *user_pgd;
880
881 BUG_ON(page->private != 0);
882
883 ret = -ENOMEM;
884
885 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
886 page->private = (unsigned long)user_pgd;
887
888 if (user_pgd != NULL) {
889 user_pgd[pgd_index(VSYSCALL_START)] =
890 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
891 ret = 0;
892 }
893
894 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
895 }
896#endif
897
898 return ret;
899}
900
901static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
902{
903#ifdef CONFIG_X86_64
904 pgd_t *user_pgd = xen_get_user_pgd(pgd);
905
906 if (user_pgd)
907 free_page((unsigned long)user_pgd);
908#endif
909}
910
724/* This should never happen until we're OK to use struct page */ 911/* This should never happen until we're OK to use struct page */
725static void xen_release_ptpage(u32 pfn, unsigned level) 912static void xen_release_ptpage(u32 pfn, unsigned level)
726{ 913{
@@ -746,6 +933,18 @@ static void xen_release_pmd(u32 pfn)
746 xen_release_ptpage(pfn, PT_PMD); 933 xen_release_ptpage(pfn, PT_PMD);
747} 934}
748 935
936#if PAGETABLE_LEVELS == 4
937static void xen_alloc_pud(struct mm_struct *mm, u32 pfn)
938{
939 xen_alloc_ptpage(mm, pfn, PT_PUD);
940}
941
942static void xen_release_pud(u32 pfn)
943{
944 xen_release_ptpage(pfn, PT_PUD);
945}
946#endif
947
749#ifdef CONFIG_HIGHPTE 948#ifdef CONFIG_HIGHPTE
750static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) 949static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
751{ 950{
@@ -784,68 +983,16 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
784 983
785static __init void xen_pagetable_setup_start(pgd_t *base) 984static __init void xen_pagetable_setup_start(pgd_t *base)
786{ 985{
787 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
788 int i;
789
790 /* special set_pte for pagetable initialization */
791 pv_mmu_ops.set_pte = xen_set_pte_init;
792
793 init_mm.pgd = base;
794 /*
795 * copy top-level of Xen-supplied pagetable into place. This
796 * is a stand-in while we copy the pmd pages.
797 */
798 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
799
800 /*
801 * For PAE, need to allocate new pmds, rather than
802 * share Xen's, since Xen doesn't like pmd's being
803 * shared between address spaces.
804 */
805 for (i = 0; i < PTRS_PER_PGD; i++) {
806 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
807 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
808
809 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
810 PAGE_SIZE);
811
812 make_lowmem_page_readonly(pmd);
813
814 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
815 } else
816 pgd_clear(&base[i]);
817 }
818
819 /* make sure zero_page is mapped RO so we can use it in pagetables */
820 make_lowmem_page_readonly(empty_zero_page);
821 make_lowmem_page_readonly(base);
822 /*
823 * Switch to new pagetable. This is done before
824 * pagetable_init has done anything so that the new pages
825 * added to the table can be prepared properly for Xen.
826 */
827 xen_write_cr3(__pa(base));
828
829 /* Unpin initial Xen pagetable */
830 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
831 PFN_DOWN(__pa(xen_start_info->pt_base)));
832} 986}
833 987
834static __init void setup_shared_info(void) 988void xen_setup_shared_info(void)
835{ 989{
836 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 990 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
837 unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); 991 set_fixmap(FIX_PARAVIRT_BOOTMAP,
838 992 xen_start_info->shared_info);
839 /* 993
840 * Create a mapping for the shared info page. 994 HYPERVISOR_shared_info =
841 * Should be set_fixmap(), but shared_info is a machine 995 (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
842 * address with no corresponding pseudo-phys address.
843 */
844 set_pte_mfn(addr,
845 PFN_DOWN(xen_start_info->shared_info),
846 PAGE_KERNEL);
847
848 HYPERVISOR_shared_info = (struct shared_info *)addr;
849 } else 996 } else
850 HYPERVISOR_shared_info = 997 HYPERVISOR_shared_info =
851 (struct shared_info *)__va(xen_start_info->shared_info); 998 (struct shared_info *)__va(xen_start_info->shared_info);
@@ -854,27 +1001,43 @@ static __init void setup_shared_info(void)
854 /* In UP this is as good a place as any to set up shared info */ 1001 /* In UP this is as good a place as any to set up shared info */
855 xen_setup_vcpu_info_placement(); 1002 xen_setup_vcpu_info_placement();
856#endif 1003#endif
1004
1005 xen_setup_mfn_list_list();
857} 1006}
858 1007
859static __init void xen_pagetable_setup_done(pgd_t *base) 1008static __init void xen_pagetable_setup_done(pgd_t *base)
860{ 1009{
1010 xen_setup_shared_info();
1011}
1012
1013static __init void xen_post_allocator_init(void)
1014{
1015 pv_mmu_ops.set_pte = xen_set_pte;
1016 pv_mmu_ops.set_pmd = xen_set_pmd;
1017 pv_mmu_ops.set_pud = xen_set_pud;
1018#if PAGETABLE_LEVELS == 4
1019 pv_mmu_ops.set_pgd = xen_set_pgd;
1020#endif
1021
861 /* This will work as long as patching hasn't happened yet 1022 /* This will work as long as patching hasn't happened yet
862 (which it hasn't) */ 1023 (which it hasn't) */
863 pv_mmu_ops.alloc_pte = xen_alloc_pte; 1024 pv_mmu_ops.alloc_pte = xen_alloc_pte;
864 pv_mmu_ops.alloc_pmd = xen_alloc_pmd; 1025 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
865 pv_mmu_ops.release_pte = xen_release_pte; 1026 pv_mmu_ops.release_pte = xen_release_pte;
866 pv_mmu_ops.release_pmd = xen_release_pmd; 1027 pv_mmu_ops.release_pmd = xen_release_pmd;
867 pv_mmu_ops.set_pte = xen_set_pte; 1028#if PAGETABLE_LEVELS == 4
868 1029 pv_mmu_ops.alloc_pud = xen_alloc_pud;
869 setup_shared_info(); 1030 pv_mmu_ops.release_pud = xen_release_pud;
1031#endif
870 1032
871 /* Actually pin the pagetable down, but we can't set PG_pinned 1033#ifdef CONFIG_X86_64
872 yet because the page structures don't exist yet. */ 1034 SetPagePinned(virt_to_page(level3_user_vsyscall));
873 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base))); 1035#endif
1036 xen_mark_init_mm_pinned();
874} 1037}
875 1038
876/* This is called once we have the cpu_possible_map */ 1039/* This is called once we have the cpu_possible_map */
877void __init xen_setup_vcpu_info_placement(void) 1040void xen_setup_vcpu_info_placement(void)
878{ 1041{
879 int cpu; 1042 int cpu;
880 1043
@@ -883,6 +1046,7 @@ void __init xen_setup_vcpu_info_placement(void)
883 1046
884 /* xen_vcpu_setup managed to place the vcpu_info within the 1047 /* xen_vcpu_setup managed to place the vcpu_info within the
885 percpu area for all cpus, so make use of it */ 1048 percpu area for all cpus, so make use of it */
1049#ifdef CONFIG_X86_32
886 if (have_vcpu_info_placement) { 1050 if (have_vcpu_info_placement) {
887 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 1051 printk(KERN_INFO "Xen: using vcpu_info placement\n");
888 1052
@@ -892,6 +1056,7 @@ void __init xen_setup_vcpu_info_placement(void)
892 pv_irq_ops.irq_enable = xen_irq_enable_direct; 1056 pv_irq_ops.irq_enable = xen_irq_enable_direct;
893 pv_mmu_ops.read_cr2 = xen_read_cr2_direct; 1057 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
894 } 1058 }
1059#endif
895} 1060}
896 1061
897static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, 1062static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -912,10 +1077,12 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
912 goto patch_site 1077 goto patch_site
913 1078
914 switch (type) { 1079 switch (type) {
1080#ifdef CONFIG_X86_32
915 SITE(pv_irq_ops, irq_enable); 1081 SITE(pv_irq_ops, irq_enable);
916 SITE(pv_irq_ops, irq_disable); 1082 SITE(pv_irq_ops, irq_disable);
917 SITE(pv_irq_ops, save_fl); 1083 SITE(pv_irq_ops, save_fl);
918 SITE(pv_irq_ops, restore_fl); 1084 SITE(pv_irq_ops, restore_fl);
1085#endif /* CONFIG_X86_32 */
919#undef SITE 1086#undef SITE
920 1087
921 patch_site: 1088 patch_site:
@@ -947,6 +1114,49 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
947 return ret; 1114 return ret;
948} 1115}
949 1116
1117static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1118{
1119 pte_t pte;
1120
1121 phys >>= PAGE_SHIFT;
1122
1123 switch (idx) {
1124 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1125#ifdef CONFIG_X86_F00F_BUG
1126 case FIX_F00F_IDT:
1127#endif
1128#ifdef CONFIG_X86_32
1129 case FIX_WP_TEST:
1130 case FIX_VDSO:
1131# ifdef CONFIG_HIGHMEM
1132 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1133# endif
1134#else
1135 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1136#endif
1137#ifdef CONFIG_X86_LOCAL_APIC
1138 case FIX_APIC_BASE: /* maps dummy local APIC */
1139#endif
1140 pte = pfn_pte(phys, prot);
1141 break;
1142
1143 default:
1144 pte = mfn_pte(phys, prot);
1145 break;
1146 }
1147
1148 __native_set_fixmap(idx, pte);
1149
1150#ifdef CONFIG_X86_64
1151 /* Replicate changes to map the vsyscall page into the user
1152 pagetable vsyscall mapping. */
1153 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1154 unsigned long vaddr = __fix_to_virt(idx);
1155 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1156 }
1157#endif
1158}
1159
950static const struct pv_info xen_info __initdata = { 1160static const struct pv_info xen_info __initdata = {
951 .paravirt_enabled = 1, 1161 .paravirt_enabled = 1,
952 .shared_kernel_pmd = 0, 1162 .shared_kernel_pmd = 0,
@@ -960,7 +1170,7 @@ static const struct pv_init_ops xen_init_ops __initdata = {
960 .banner = xen_banner, 1170 .banner = xen_banner,
961 .memory_setup = xen_memory_setup, 1171 .memory_setup = xen_memory_setup,
962 .arch_setup = xen_arch_setup, 1172 .arch_setup = xen_arch_setup,
963 .post_allocator_init = xen_mark_init_mm_pinned, 1173 .post_allocator_init = xen_post_allocator_init,
964}; 1174};
965 1175
966static const struct pv_time_ops xen_time_ops __initdata = { 1176static const struct pv_time_ops xen_time_ops __initdata = {
@@ -968,7 +1178,7 @@ static const struct pv_time_ops xen_time_ops __initdata = {
968 1178
969 .set_wallclock = xen_set_wallclock, 1179 .set_wallclock = xen_set_wallclock,
970 .get_wallclock = xen_get_wallclock, 1180 .get_wallclock = xen_get_wallclock,
971 .get_cpu_khz = xen_cpu_khz, 1181 .get_tsc_khz = xen_tsc_khz,
972 .sched_clock = xen_sched_clock, 1182 .sched_clock = xen_sched_clock,
973}; 1183};
974 1184
@@ -978,10 +1188,10 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
978 .set_debugreg = xen_set_debugreg, 1188 .set_debugreg = xen_set_debugreg,
979 .get_debugreg = xen_get_debugreg, 1189 .get_debugreg = xen_get_debugreg,
980 1190
981 .clts = native_clts, 1191 .clts = xen_clts,
982 1192
983 .read_cr0 = native_read_cr0, 1193 .read_cr0 = native_read_cr0,
984 .write_cr0 = native_write_cr0, 1194 .write_cr0 = xen_write_cr0,
985 1195
986 .read_cr4 = native_read_cr4, 1196 .read_cr4 = native_read_cr4,
987 .read_cr4_safe = native_read_cr4_safe, 1197 .read_cr4_safe = native_read_cr4_safe,
@@ -990,18 +1200,25 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
990 .wbinvd = native_wbinvd, 1200 .wbinvd = native_wbinvd,
991 1201
992 .read_msr = native_read_msr_safe, 1202 .read_msr = native_read_msr_safe,
993 .write_msr = native_write_msr_safe, 1203 .write_msr = xen_write_msr_safe,
994 .read_tsc = native_read_tsc, 1204 .read_tsc = native_read_tsc,
995 .read_pmc = native_read_pmc, 1205 .read_pmc = native_read_pmc,
996 1206
997 .iret = xen_iret, 1207 .iret = xen_iret,
998 .irq_enable_syscall_ret = xen_sysexit, 1208 .irq_enable_sysexit = xen_sysexit,
1209#ifdef CONFIG_X86_64
1210 .usergs_sysret32 = xen_sysret32,
1211 .usergs_sysret64 = xen_sysret64,
1212#endif
999 1213
1000 .load_tr_desc = paravirt_nop, 1214 .load_tr_desc = paravirt_nop,
1001 .set_ldt = xen_set_ldt, 1215 .set_ldt = xen_set_ldt,
1002 .load_gdt = xen_load_gdt, 1216 .load_gdt = xen_load_gdt,
1003 .load_idt = xen_load_idt, 1217 .load_idt = xen_load_idt,
1004 .load_tls = xen_load_tls, 1218 .load_tls = xen_load_tls,
1219#ifdef CONFIG_X86_64
1220 .load_gs_index = xen_load_gs_index,
1221#endif
1005 1222
1006 .store_gdt = native_store_gdt, 1223 .store_gdt = native_store_gdt,
1007 .store_idt = native_store_idt, 1224 .store_idt = native_store_idt,
@@ -1015,26 +1232,48 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1015 .set_iopl_mask = xen_set_iopl_mask, 1232 .set_iopl_mask = xen_set_iopl_mask,
1016 .io_delay = xen_io_delay, 1233 .io_delay = xen_io_delay,
1017 1234
1235 /* Xen takes care of %gs when switching to usermode for us */
1236 .swapgs = paravirt_nop,
1237
1018 .lazy_mode = { 1238 .lazy_mode = {
1019 .enter = paravirt_enter_lazy_cpu, 1239 .enter = paravirt_enter_lazy_cpu,
1020 .leave = xen_leave_lazy, 1240 .leave = xen_leave_lazy,
1021 }, 1241 },
1022}; 1242};
1023 1243
1244static void __init __xen_init_IRQ(void)
1245{
1246#ifdef CONFIG_X86_64
1247 int i;
1248
1249 /* Create identity vector->irq map */
1250 for(i = 0; i < NR_VECTORS; i++) {
1251 int cpu;
1252
1253 for_each_possible_cpu(cpu)
1254 per_cpu(vector_irq, cpu)[i] = i;
1255 }
1256#endif /* CONFIG_X86_64 */
1257
1258 xen_init_IRQ();
1259}
1260
1024static const struct pv_irq_ops xen_irq_ops __initdata = { 1261static const struct pv_irq_ops xen_irq_ops __initdata = {
1025 .init_IRQ = xen_init_IRQ, 1262 .init_IRQ = __xen_init_IRQ,
1026 .save_fl = xen_save_fl, 1263 .save_fl = xen_save_fl,
1027 .restore_fl = xen_restore_fl, 1264 .restore_fl = xen_restore_fl,
1028 .irq_disable = xen_irq_disable, 1265 .irq_disable = xen_irq_disable,
1029 .irq_enable = xen_irq_enable, 1266 .irq_enable = xen_irq_enable,
1030 .safe_halt = xen_safe_halt, 1267 .safe_halt = xen_safe_halt,
1031 .halt = xen_halt, 1268 .halt = xen_halt,
1269#ifdef CONFIG_X86_64
1270 .adjust_exception_frame = xen_adjust_exception_frame,
1271#endif
1032}; 1272};
1033 1273
1034static const struct pv_apic_ops xen_apic_ops __initdata = { 1274static const struct pv_apic_ops xen_apic_ops __initdata = {
1035#ifdef CONFIG_X86_LOCAL_APIC 1275#ifdef CONFIG_X86_LOCAL_APIC
1036 .apic_write = xen_apic_write, 1276 .apic_write = xen_apic_write,
1037 .apic_write_atomic = xen_apic_write,
1038 .apic_read = xen_apic_read, 1277 .apic_read = xen_apic_read,
1039 .setup_boot_clock = paravirt_nop, 1278 .setup_boot_clock = paravirt_nop,
1040 .setup_secondary_clock = paravirt_nop, 1279 .setup_secondary_clock = paravirt_nop,
@@ -1060,6 +1299,9 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1060 .pte_update = paravirt_nop, 1299 .pte_update = paravirt_nop,
1061 .pte_update_defer = paravirt_nop, 1300 .pte_update_defer = paravirt_nop,
1062 1301
1302 .pgd_alloc = xen_pgd_alloc,
1303 .pgd_free = xen_pgd_free,
1304
1063 .alloc_pte = xen_alloc_pte_init, 1305 .alloc_pte = xen_alloc_pte_init,
1064 .release_pte = xen_release_pte_init, 1306 .release_pte = xen_release_pte_init,
1065 .alloc_pmd = xen_alloc_pte_init, 1307 .alloc_pmd = xen_alloc_pte_init,
@@ -1070,25 +1312,44 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1070 .kmap_atomic_pte = xen_kmap_atomic_pte, 1312 .kmap_atomic_pte = xen_kmap_atomic_pte,
1071#endif 1313#endif
1072 1314
1073 .set_pte = NULL, /* see xen_pagetable_setup_* */ 1315#ifdef CONFIG_X86_64
1316 .set_pte = xen_set_pte,
1317#else
1318 .set_pte = xen_set_pte_init,
1319#endif
1074 .set_pte_at = xen_set_pte_at, 1320 .set_pte_at = xen_set_pte_at,
1075 .set_pmd = xen_set_pmd, 1321 .set_pmd = xen_set_pmd_hyper,
1322
1323 .ptep_modify_prot_start = __ptep_modify_prot_start,
1324 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1076 1325
1077 .pte_val = xen_pte_val, 1326 .pte_val = xen_pte_val,
1327 .pte_flags = native_pte_val,
1078 .pgd_val = xen_pgd_val, 1328 .pgd_val = xen_pgd_val,
1079 1329
1080 .make_pte = xen_make_pte, 1330 .make_pte = xen_make_pte,
1081 .make_pgd = xen_make_pgd, 1331 .make_pgd = xen_make_pgd,
1082 1332
1333#ifdef CONFIG_X86_PAE
1083 .set_pte_atomic = xen_set_pte_atomic, 1334 .set_pte_atomic = xen_set_pte_atomic,
1084 .set_pte_present = xen_set_pte_at, 1335 .set_pte_present = xen_set_pte_at,
1085 .set_pud = xen_set_pud,
1086 .pte_clear = xen_pte_clear, 1336 .pte_clear = xen_pte_clear,
1087 .pmd_clear = xen_pmd_clear, 1337 .pmd_clear = xen_pmd_clear,
1338#endif /* CONFIG_X86_PAE */
1339 .set_pud = xen_set_pud_hyper,
1088 1340
1089 .make_pmd = xen_make_pmd, 1341 .make_pmd = xen_make_pmd,
1090 .pmd_val = xen_pmd_val, 1342 .pmd_val = xen_pmd_val,
1091 1343
1344#if PAGETABLE_LEVELS == 4
1345 .pud_val = xen_pud_val,
1346 .make_pud = xen_make_pud,
1347 .set_pgd = xen_set_pgd_hyper,
1348
1349 .alloc_pud = xen_alloc_pte_init,
1350 .release_pud = xen_release_pte_init,
1351#endif /* PAGETABLE_LEVELS == 4 */
1352
1092 .activate_mm = xen_activate_mm, 1353 .activate_mm = xen_activate_mm,
1093 .dup_mmap = xen_dup_mmap, 1354 .dup_mmap = xen_dup_mmap,
1094 .exit_mmap = xen_exit_mmap, 1355 .exit_mmap = xen_exit_mmap,
@@ -1097,28 +1358,19 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1097 .enter = paravirt_enter_lazy_mmu, 1358 .enter = paravirt_enter_lazy_mmu,
1098 .leave = xen_leave_lazy, 1359 .leave = xen_leave_lazy,
1099 }, 1360 },
1100};
1101 1361
1102#ifdef CONFIG_SMP 1362 .set_fixmap = xen_set_fixmap,
1103static const struct smp_ops xen_smp_ops __initdata = {
1104 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
1105 .smp_prepare_cpus = xen_smp_prepare_cpus,
1106 .cpu_up = xen_cpu_up,
1107 .smp_cpus_done = xen_smp_cpus_done,
1108
1109 .smp_send_stop = xen_smp_send_stop,
1110 .smp_send_reschedule = xen_smp_send_reschedule,
1111 .smp_call_function_mask = xen_smp_call_function_mask,
1112}; 1363};
1113#endif /* CONFIG_SMP */
1114 1364
1115static void xen_reboot(int reason) 1365static void xen_reboot(int reason)
1116{ 1366{
1367 struct sched_shutdown r = { .reason = reason };
1368
1117#ifdef CONFIG_SMP 1369#ifdef CONFIG_SMP
1118 smp_send_stop(); 1370 smp_send_stop();
1119#endif 1371#endif
1120 1372
1121 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason)) 1373 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
1122 BUG(); 1374 BUG();
1123} 1375}
1124 1376
@@ -1154,6 +1406,7 @@ static const struct machine_ops __initdata xen_machine_ops = {
1154 1406
1155static void __init xen_reserve_top(void) 1407static void __init xen_reserve_top(void)
1156{ 1408{
1409#ifdef CONFIG_X86_32
1157 unsigned long top = HYPERVISOR_VIRT_START; 1410 unsigned long top = HYPERVISOR_VIRT_START;
1158 struct xen_platform_parameters pp; 1411 struct xen_platform_parameters pp;
1159 1412
@@ -1161,8 +1414,248 @@ static void __init xen_reserve_top(void)
1161 top = pp.virt_start; 1414 top = pp.virt_start;
1162 1415
1163 reserve_top_address(-top + 2 * PAGE_SIZE); 1416 reserve_top_address(-top + 2 * PAGE_SIZE);
1417#endif /* CONFIG_X86_32 */
1418}
1419
1420/*
1421 * Like __va(), but returns address in the kernel mapping (which is
1422 * all we have until the physical memory mapping has been set up.
1423 */
1424static void *__ka(phys_addr_t paddr)
1425{
1426#ifdef CONFIG_X86_64
1427 return (void *)(paddr + __START_KERNEL_map);
1428#else
1429 return __va(paddr);
1430#endif
1431}
1432
1433/* Convert a machine address to physical address */
1434static unsigned long m2p(phys_addr_t maddr)
1435{
1436 phys_addr_t paddr;
1437
1438 maddr &= PTE_MASK;
1439 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1440
1441 return paddr;
1164} 1442}
1165 1443
1444/* Convert a machine address to kernel virtual */
1445static void *m2v(phys_addr_t maddr)
1446{
1447 return __ka(m2p(maddr));
1448}
1449
1450#ifdef CONFIG_X86_64
1451static void walk(pgd_t *pgd, unsigned long addr)
1452{
1453 unsigned l4idx = pgd_index(addr);
1454 unsigned l3idx = pud_index(addr);
1455 unsigned l2idx = pmd_index(addr);
1456 unsigned l1idx = pte_index(addr);
1457 pgd_t l4;
1458 pud_t l3;
1459 pmd_t l2;
1460 pte_t l1;
1461
1462 xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
1463 pgd, addr, l4idx, l3idx, l2idx, l1idx);
1464
1465 l4 = pgd[l4idx];
1466 xen_raw_printk(" l4: %016lx\n", l4.pgd);
1467 xen_raw_printk(" %016lx\n", pgd_val(l4));
1468
1469 l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
1470 xen_raw_printk(" l3: %016lx\n", l3.pud);
1471 xen_raw_printk(" %016lx\n", pud_val(l3));
1472
1473 l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
1474 xen_raw_printk(" l2: %016lx\n", l2.pmd);
1475 xen_raw_printk(" %016lx\n", pmd_val(l2));
1476
1477 l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
1478 xen_raw_printk(" l1: %016lx\n", l1.pte);
1479 xen_raw_printk(" %016lx\n", pte_val(l1));
1480}
1481#endif
1482
1483static void set_page_prot(void *addr, pgprot_t prot)
1484{
1485 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1486 pte_t pte = pfn_pte(pfn, prot);
1487
1488 xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
1489 addr, pfn, get_phys_to_machine(pfn),
1490 pgprot_val(prot), pte.pte);
1491
1492 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1493 BUG();
1494}
1495
1496static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1497{
1498 unsigned pmdidx, pteidx;
1499 unsigned ident_pte;
1500 unsigned long pfn;
1501
1502 ident_pte = 0;
1503 pfn = 0;
1504 for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1505 pte_t *pte_page;
1506
1507 /* Reuse or allocate a page of ptes */
1508 if (pmd_present(pmd[pmdidx]))
1509 pte_page = m2v(pmd[pmdidx].pmd);
1510 else {
1511 /* Check for free pte pages */
1512 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1513 break;
1514
1515 pte_page = &level1_ident_pgt[ident_pte];
1516 ident_pte += PTRS_PER_PTE;
1517
1518 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1519 }
1520
1521 /* Install mappings */
1522 for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1523 pte_t pte;
1524
1525 if (pfn > max_pfn_mapped)
1526 max_pfn_mapped = pfn;
1527
1528 if (!pte_none(pte_page[pteidx]))
1529 continue;
1530
1531 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1532 pte_page[pteidx] = pte;
1533 }
1534 }
1535
1536 for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1537 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1538
1539 set_page_prot(pmd, PAGE_KERNEL_RO);
1540}
1541
1542#ifdef CONFIG_X86_64
1543static void convert_pfn_mfn(void *v)
1544{
1545 pte_t *pte = v;
1546 int i;
1547
1548 /* All levels are converted the same way, so just treat them
1549 as ptes. */
1550 for(i = 0; i < PTRS_PER_PTE; i++)
1551 pte[i] = xen_make_pte(pte[i].pte);
1552}
1553
1554/*
1555 * Set up the inital kernel pagetable.
1556 *
1557 * We can construct this by grafting the Xen provided pagetable into
1558 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1559 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1560 * means that only the kernel has a physical mapping to start with -
1561 * but that's enough to get __va working. We need to fill in the rest
1562 * of the physical mapping once some sort of allocator has been set
1563 * up.
1564 */
1565static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1566{
1567 pud_t *l3;
1568 pmd_t *l2;
1569
1570 /* Zap identity mapping */
1571 init_level4_pgt[0] = __pgd(0);
1572
1573 /* Pre-constructed entries are in pfn, so convert to mfn */
1574 convert_pfn_mfn(init_level4_pgt);
1575 convert_pfn_mfn(level3_ident_pgt);
1576 convert_pfn_mfn(level3_kernel_pgt);
1577
1578 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1579 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1580
1581 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1582 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1583
1584 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1585 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1586 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1587
1588 /* Set up identity map */
1589 xen_map_identity_early(level2_ident_pgt, max_pfn);
1590
1591 /* Make pagetable pieces RO */
1592 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1593 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1594 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1595 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1596 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1597 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1598
1599 /* Pin down new L4 */
1600 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1601 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1602
1603 /* Unpin Xen-provided one */
1604 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1605
1606 /* Switch over */
1607 pgd = init_level4_pgt;
1608
1609 /*
1610 * At this stage there can be no user pgd, and no page
1611 * structure to attach it to, so make sure we just set kernel
1612 * pgd.
1613 */
1614 xen_mc_batch();
1615 __xen_write_cr3(true, __pa(pgd));
1616 xen_mc_issue(PARAVIRT_LAZY_CPU);
1617
1618 reserve_early(__pa(xen_start_info->pt_base),
1619 __pa(xen_start_info->pt_base +
1620 xen_start_info->nr_pt_frames * PAGE_SIZE),
1621 "XEN PAGETABLES");
1622
1623 return pgd;
1624}
1625#else /* !CONFIG_X86_64 */
1626static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1627
1628static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1629{
1630 pmd_t *kernel_pmd;
1631
1632 init_pg_tables_start = __pa(pgd);
1633 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1634 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1635
1636 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1637 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1638
1639 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1640
1641 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1642 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1643 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1644
1645 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1646 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1647 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1648
1649 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1650
1651 xen_write_cr3(__pa(swapper_pg_dir));
1652
1653 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1654
1655 return swapper_pg_dir;
1656}
1657#endif /* CONFIG_X86_64 */
1658
1166/* First C function to be called on Xen boot */ 1659/* First C function to be called on Xen boot */
1167asmlinkage void __init xen_start_kernel(void) 1660asmlinkage void __init xen_start_kernel(void)
1168{ 1661{
@@ -1173,6 +1666,8 @@ asmlinkage void __init xen_start_kernel(void)
1173 1666
1174 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); 1667 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);
1175 1668
1669 xen_setup_features();
1670
1176 /* Install Xen paravirt ops */ 1671 /* Install Xen paravirt ops */
1177 pv_info = xen_info; 1672 pv_info = xen_info;
1178 pv_init_ops = xen_init_ops; 1673 pv_init_ops = xen_init_ops;
@@ -1182,59 +1677,85 @@ asmlinkage void __init xen_start_kernel(void)
1182 pv_apic_ops = xen_apic_ops; 1677 pv_apic_ops = xen_apic_ops;
1183 pv_mmu_ops = xen_mmu_ops; 1678 pv_mmu_ops = xen_mmu_ops;
1184 1679
1680 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
1681 pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
1682 pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
1683 }
1684
1185 machine_ops = xen_machine_ops; 1685 machine_ops = xen_machine_ops;
1186 1686
1187#ifdef CONFIG_SMP 1687#ifdef CONFIG_X86_64
1188 smp_ops = xen_smp_ops; 1688 /* Disable until direct per-cpu data access. */
1689 have_vcpu_info_placement = 0;
1690 x86_64_init_pda();
1189#endif 1691#endif
1190 1692
1191 xen_setup_features(); 1693 xen_smp_init();
1192 1694
1193 /* Get mfn list */ 1695 /* Get mfn list */
1194 if (!xen_feature(XENFEAT_auto_translated_physmap)) 1696 if (!xen_feature(XENFEAT_auto_translated_physmap))
1195 phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list; 1697 xen_build_dynamic_phys_to_machine();
1196 1698
1197 pgd = (pgd_t *)xen_start_info->pt_base; 1699 pgd = (pgd_t *)xen_start_info->pt_base;
1198 1700
1199 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; 1701 /* Prevent unwanted bits from being set in PTEs. */
1200 1702 __supported_pte_mask &= ~_PAGE_GLOBAL;
1201 init_mm.pgd = pgd; /* use the Xen pagetables to start */ 1703 if (!is_initial_xendomain())
1202 1704 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1203 /* keep using Xen gdt for now; no urgent need to change it */
1204
1205 x86_write_percpu(xen_cr3, __pa(pgd));
1206 x86_write_percpu(xen_current_cr3, __pa(pgd));
1207 1705
1208 /* Don't do the full vcpu_info placement stuff until we have a 1706 /* Don't do the full vcpu_info placement stuff until we have a
1209 possible map and a non-dummy shared_info. */ 1707 possible map and a non-dummy shared_info. */
1210 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1708 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1211 1709
1710 xen_raw_console_write("mapping kernel into physical memory\n");
1711 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
1712
1713 init_mm.pgd = pgd;
1714
1715 /* keep using Xen gdt for now; no urgent need to change it */
1716
1212 pv_info.kernel_rpl = 1; 1717 pv_info.kernel_rpl = 1;
1213 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1718 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1214 pv_info.kernel_rpl = 0; 1719 pv_info.kernel_rpl = 0;
1215 1720
1216 /* Prevent unwanted bits from being set in PTEs. */
1217 __supported_pte_mask &= ~_PAGE_GLOBAL;
1218 if (!is_initial_xendomain())
1219 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1220
1221 /* set the limit of our address space */ 1721 /* set the limit of our address space */
1222 xen_reserve_top(); 1722 xen_reserve_top();
1223 1723
1724#ifdef CONFIG_X86_32
1224 /* set up basic CPUID stuff */ 1725 /* set up basic CPUID stuff */
1225 cpu_detect(&new_cpu_data); 1726 cpu_detect(&new_cpu_data);
1226 new_cpu_data.hard_math = 1; 1727 new_cpu_data.hard_math = 1;
1227 new_cpu_data.x86_capability[0] = cpuid_edx(1); 1728 new_cpu_data.x86_capability[0] = cpuid_edx(1);
1729#endif
1228 1730
1229 /* Poke various useful things into boot_params */ 1731 /* Poke various useful things into boot_params */
1230 boot_params.hdr.type_of_loader = (9 << 4) | 0; 1732 boot_params.hdr.type_of_loader = (9 << 4) | 0;
1231 boot_params.hdr.ramdisk_image = xen_start_info->mod_start 1733 boot_params.hdr.ramdisk_image = xen_start_info->mod_start
1232 ? __pa(xen_start_info->mod_start) : 0; 1734 ? __pa(xen_start_info->mod_start) : 0;
1233 boot_params.hdr.ramdisk_size = xen_start_info->mod_len; 1735 boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1736 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1234 1737
1235 if (!is_initial_xendomain()) 1738 if (!is_initial_xendomain()) {
1739 add_preferred_console("xenboot", 0, NULL);
1740 add_preferred_console("tty", 0, NULL);
1236 add_preferred_console("hvc", 0, NULL); 1741 add_preferred_console("hvc", 0, NULL);
1742 }
1743
1744 xen_raw_console_write("about to get started...\n");
1745
1746#if 0
1747 xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
1748 &boot_params, __pa_symbol(&boot_params),
1749 __va(__pa_symbol(&boot_params)));
1750
1751 walk(pgd, &boot_params);
1752 walk(pgd, __va(__pa(&boot_params)));
1753#endif
1237 1754
1238 /* Start the world */ 1755 /* Start the world */
1239 start_kernel(); 1756#ifdef CONFIG_X86_32
1757 i386_start_kernel();
1758#else
1759 x86_64_start_reservations((char *)__pa_symbol(&boot_params));
1760#endif
1240} 1761}
diff --git a/arch/x86/xen/manage.c b/arch/x86/xen/manage.c
deleted file mode 100644
index aa7af9e6abc0..000000000000
--- a/arch/x86/xen/manage.c
+++ /dev/null
@@ -1,143 +0,0 @@
1/*
2 * Handle extern requests for shutdown, reboot and sysrq
3 */
4#include <linux/kernel.h>
5#include <linux/err.h>
6#include <linux/reboot.h>
7#include <linux/sysrq.h>
8
9#include <xen/xenbus.h>
10
11#define SHUTDOWN_INVALID -1
12#define SHUTDOWN_POWEROFF 0
13#define SHUTDOWN_SUSPEND 2
14/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
15 * report a crash, not be instructed to crash!
16 * HALT is the same as POWEROFF, as far as we're concerned. The tools use
17 * the distinction when we return the reason code to them.
18 */
19#define SHUTDOWN_HALT 4
20
21/* Ignore multiple shutdown requests. */
22static int shutting_down = SHUTDOWN_INVALID;
23
24static void shutdown_handler(struct xenbus_watch *watch,
25 const char **vec, unsigned int len)
26{
27 char *str;
28 struct xenbus_transaction xbt;
29 int err;
30
31 if (shutting_down != SHUTDOWN_INVALID)
32 return;
33
34 again:
35 err = xenbus_transaction_start(&xbt);
36 if (err)
37 return;
38
39 str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
40 /* Ignore read errors and empty reads. */
41 if (XENBUS_IS_ERR_READ(str)) {
42 xenbus_transaction_end(xbt, 1);
43 return;
44 }
45
46 xenbus_write(xbt, "control", "shutdown", "");
47
48 err = xenbus_transaction_end(xbt, 0);
49 if (err == -EAGAIN) {
50 kfree(str);
51 goto again;
52 }
53
54 if (strcmp(str, "poweroff") == 0 ||
55 strcmp(str, "halt") == 0)
56 orderly_poweroff(false);
57 else if (strcmp(str, "reboot") == 0)
58 ctrl_alt_del();
59 else {
60 printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
61 shutting_down = SHUTDOWN_INVALID;
62 }
63
64 kfree(str);
65}
66
67static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
68 unsigned int len)
69{
70 char sysrq_key = '\0';
71 struct xenbus_transaction xbt;
72 int err;
73
74 again:
75 err = xenbus_transaction_start(&xbt);
76 if (err)
77 return;
78 if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
79 printk(KERN_ERR "Unable to read sysrq code in "
80 "control/sysrq\n");
81 xenbus_transaction_end(xbt, 1);
82 return;
83 }
84
85 if (sysrq_key != '\0')
86 xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
87
88 err = xenbus_transaction_end(xbt, 0);
89 if (err == -EAGAIN)
90 goto again;
91
92 if (sysrq_key != '\0')
93 handle_sysrq(sysrq_key, NULL);
94}
95
96static struct xenbus_watch shutdown_watch = {
97 .node = "control/shutdown",
98 .callback = shutdown_handler
99};
100
101static struct xenbus_watch sysrq_watch = {
102 .node = "control/sysrq",
103 .callback = sysrq_handler
104};
105
106static int setup_shutdown_watcher(void)
107{
108 int err;
109
110 err = register_xenbus_watch(&shutdown_watch);
111 if (err) {
112 printk(KERN_ERR "Failed to set shutdown watcher\n");
113 return err;
114 }
115
116 err = register_xenbus_watch(&sysrq_watch);
117 if (err) {
118 printk(KERN_ERR "Failed to set sysrq watcher\n");
119 return err;
120 }
121
122 return 0;
123}
124
125static int shutdown_event(struct notifier_block *notifier,
126 unsigned long event,
127 void *data)
128{
129 setup_shutdown_watcher();
130 return NOTIFY_DONE;
131}
132
133static int __init setup_shutdown_event(void)
134{
135 static struct notifier_block xenstore_notifier = {
136 .notifier_call = shutdown_event
137 };
138 register_xenstore_notifier(&xenstore_notifier);
139
140 return 0;
141}
142
143subsys_initcall(setup_shutdown_event);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index df40bf74ea75..a44d56e38bd1 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -44,8 +44,10 @@
44 44
45#include <asm/pgtable.h> 45#include <asm/pgtable.h>
46#include <asm/tlbflush.h> 46#include <asm/tlbflush.h>
47#include <asm/fixmap.h>
47#include <asm/mmu_context.h> 48#include <asm/mmu_context.h>
48#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/linkage.h>
49 51
50#include <asm/xen/hypercall.h> 52#include <asm/xen/hypercall.h>
51#include <asm/xen/hypervisor.h> 53#include <asm/xen/hypervisor.h>
@@ -56,15 +58,144 @@
56#include "multicalls.h" 58#include "multicalls.h"
57#include "mmu.h" 59#include "mmu.h"
58 60
59xmaddr_t arbitrary_virt_to_machine(unsigned long address) 61/*
62 * Just beyond the highest usermode address. STACK_TOP_MAX has a
63 * redzone above it, so round it up to a PGD boundary.
64 */
65#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
66
67
68#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
69#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
70
71/* Placeholder for holes in the address space */
72static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
73 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
74
75 /* Array of pointers to pages containing p2m entries */
76static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
77 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
78
79/* Arrays of p2m arrays expressed in mfns used for save/restore */
80static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
81
82static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
83 __page_aligned_bss;
84
85static inline unsigned p2m_top_index(unsigned long pfn)
86{
87 BUG_ON(pfn >= MAX_DOMAIN_PAGES);
88 return pfn / P2M_ENTRIES_PER_PAGE;
89}
90
91static inline unsigned p2m_index(unsigned long pfn)
92{
93 return pfn % P2M_ENTRIES_PER_PAGE;
94}
95
96/* Build the parallel p2m_top_mfn structures */
97void xen_setup_mfn_list_list(void)
98{
99 unsigned pfn, idx;
100
101 for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
102 unsigned topidx = p2m_top_index(pfn);
103
104 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
105 }
106
107 for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
108 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
109 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
110 }
111
112 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
113
114 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
115 virt_to_mfn(p2m_top_mfn_list);
116 HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
117}
118
119/* Set up p2m_top to point to the domain-builder provided p2m pages */
120void __init xen_build_dynamic_phys_to_machine(void)
121{
122 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
123 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
124 unsigned pfn;
125
126 for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
127 unsigned topidx = p2m_top_index(pfn);
128
129 p2m_top[topidx] = &mfn_list[pfn];
130 }
131}
132
133unsigned long get_phys_to_machine(unsigned long pfn)
134{
135 unsigned topidx, idx;
136
137 if (unlikely(pfn >= MAX_DOMAIN_PAGES))
138 return INVALID_P2M_ENTRY;
139
140 topidx = p2m_top_index(pfn);
141 idx = p2m_index(pfn);
142 return p2m_top[topidx][idx];
143}
144EXPORT_SYMBOL_GPL(get_phys_to_machine);
145
146static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
147{
148 unsigned long *p;
149 unsigned i;
150
151 p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
152 BUG_ON(p == NULL);
153
154 for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
155 p[i] = INVALID_P2M_ENTRY;
156
157 if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
158 free_page((unsigned long)p);
159 else
160 *mfnp = virt_to_mfn(p);
161}
162
163void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
164{
165 unsigned topidx, idx;
166
167 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
168 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
169 return;
170 }
171
172 if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
173 BUG_ON(mfn != INVALID_P2M_ENTRY);
174 return;
175 }
176
177 topidx = p2m_top_index(pfn);
178 if (p2m_top[topidx] == p2m_missing) {
179 /* no need to allocate a page to store an invalid entry */
180 if (mfn == INVALID_P2M_ENTRY)
181 return;
182 alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
183 }
184
185 idx = p2m_index(pfn);
186 p2m_top[topidx][idx] = mfn;
187}
188
189xmaddr_t arbitrary_virt_to_machine(void *vaddr)
60{ 190{
191 unsigned long address = (unsigned long)vaddr;
61 unsigned int level; 192 unsigned int level;
62 pte_t *pte = lookup_address(address, &level); 193 pte_t *pte = lookup_address(address, &level);
63 unsigned offset = address & ~PAGE_MASK; 194 unsigned offset = address & ~PAGE_MASK;
64 195
65 BUG_ON(pte == NULL); 196 BUG_ON(pte == NULL);
66 197
67 return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); 198 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
68} 199}
69 200
70void make_lowmem_page_readonly(void *vaddr) 201void make_lowmem_page_readonly(void *vaddr)
@@ -98,59 +229,68 @@ void make_lowmem_page_readwrite(void *vaddr)
98} 229}
99 230
100 231
101void xen_set_pmd(pmd_t *ptr, pmd_t val) 232static bool page_pinned(void *ptr)
233{
234 struct page *page = virt_to_page(ptr);
235
236 return PagePinned(page);
237}
238
239static void extend_mmu_update(const struct mmu_update *update)
102{ 240{
103 struct multicall_space mcs; 241 struct multicall_space mcs;
104 struct mmu_update *u; 242 struct mmu_update *u;
105 243
106 preempt_disable(); 244 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
245
246 if (mcs.mc != NULL)
247 mcs.mc->args[1]++;
248 else {
249 mcs = __xen_mc_entry(sizeof(*u));
250 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
251 }
107 252
108 mcs = xen_mc_entry(sizeof(*u));
109 u = mcs.args; 253 u = mcs.args;
110 u->ptr = virt_to_machine(ptr).maddr; 254 *u = *update;
111 u->val = pmd_val_ma(val); 255}
112 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); 256
257void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
258{
259 struct mmu_update u;
260
261 preempt_disable();
262
263 xen_mc_batch();
264
265 /* ptr may be ioremapped for 64-bit pagetable setup */
266 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
267 u.val = pmd_val_ma(val);
268 extend_mmu_update(&u);
113 269
114 xen_mc_issue(PARAVIRT_LAZY_MMU); 270 xen_mc_issue(PARAVIRT_LAZY_MMU);
115 271
116 preempt_enable(); 272 preempt_enable();
117} 273}
118 274
275void xen_set_pmd(pmd_t *ptr, pmd_t val)
276{
277 /* If page is not pinned, we can just update the entry
278 directly */
279 if (!page_pinned(ptr)) {
280 *ptr = val;
281 return;
282 }
283
284 xen_set_pmd_hyper(ptr, val);
285}
286
119/* 287/*
120 * Associate a virtual page frame with a given physical page frame 288 * Associate a virtual page frame with a given physical page frame
121 * and protection flags for that frame. 289 * and protection flags for that frame.
122 */ 290 */
123void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) 291void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
124{ 292{
125 pgd_t *pgd; 293 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
126 pud_t *pud;
127 pmd_t *pmd;
128 pte_t *pte;
129
130 pgd = swapper_pg_dir + pgd_index(vaddr);
131 if (pgd_none(*pgd)) {
132 BUG();
133 return;
134 }
135 pud = pud_offset(pgd, vaddr);
136 if (pud_none(*pud)) {
137 BUG();
138 return;
139 }
140 pmd = pmd_offset(pud, vaddr);
141 if (pmd_none(*pmd)) {
142 BUG();
143 return;
144 }
145 pte = pte_offset_kernel(pmd, vaddr);
146 /* <mfn,flags> stored as-is, to permit clearing entries */
147 xen_set_pte(pte, mfn_pte(mfn, flags));
148
149 /*
150 * It's enough to flush this one mapping.
151 * (PGE mappings get flushed as well)
152 */
153 __flush_tlb_one(vaddr);
154} 294}
155 295
156void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 296void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -179,13 +319,33 @@ out:
179 preempt_enable(); 319 preempt_enable();
180} 320}
181 321
322pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
323{
324 /* Just return the pte as-is. We preserve the bits on commit */
325 return *ptep;
326}
327
328void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
329 pte_t *ptep, pte_t pte)
330{
331 struct mmu_update u;
332
333 xen_mc_batch();
334
335 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
336 u.val = pte_val_ma(pte);
337 extend_mmu_update(&u);
338
339 xen_mc_issue(PARAVIRT_LAZY_MMU);
340}
341
182/* Assume pteval_t is equivalent to all the other *val_t types. */ 342/* Assume pteval_t is equivalent to all the other *val_t types. */
183static pteval_t pte_mfn_to_pfn(pteval_t val) 343static pteval_t pte_mfn_to_pfn(pteval_t val)
184{ 344{
185 if (val & _PAGE_PRESENT) { 345 if (val & _PAGE_PRESENT) {
186 unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; 346 unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT;
187 pteval_t flags = val & ~PTE_MASK; 347 pteval_t flags = val & ~PTE_MASK;
188 val = (mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; 348 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
189 } 349 }
190 350
191 return val; 351 return val;
@@ -196,7 +356,7 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
196 if (val & _PAGE_PRESENT) { 356 if (val & _PAGE_PRESENT) {
197 unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; 357 unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT;
198 pteval_t flags = val & ~PTE_MASK; 358 pteval_t flags = val & ~PTE_MASK;
199 val = (pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; 359 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
200 } 360 }
201 361
202 return val; 362 return val;
@@ -229,34 +389,51 @@ pmdval_t xen_pmd_val(pmd_t pmd)
229 return pte_mfn_to_pfn(pmd.pmd); 389 return pte_mfn_to_pfn(pmd.pmd);
230} 390}
231 391
232void xen_set_pud(pud_t *ptr, pud_t val) 392void xen_set_pud_hyper(pud_t *ptr, pud_t val)
233{ 393{
234 struct multicall_space mcs; 394 struct mmu_update u;
235 struct mmu_update *u;
236 395
237 preempt_disable(); 396 preempt_disable();
238 397
239 mcs = xen_mc_entry(sizeof(*u)); 398 xen_mc_batch();
240 u = mcs.args; 399
241 u->ptr = virt_to_machine(ptr).maddr; 400 /* ptr may be ioremapped for 64-bit pagetable setup */
242 u->val = pud_val_ma(val); 401 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
243 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); 402 u.val = pud_val_ma(val);
403 extend_mmu_update(&u);
244 404
245 xen_mc_issue(PARAVIRT_LAZY_MMU); 405 xen_mc_issue(PARAVIRT_LAZY_MMU);
246 406
247 preempt_enable(); 407 preempt_enable();
248} 408}
249 409
410void xen_set_pud(pud_t *ptr, pud_t val)
411{
412 /* If page is not pinned, we can just update the entry
413 directly */
414 if (!page_pinned(ptr)) {
415 *ptr = val;
416 return;
417 }
418
419 xen_set_pud_hyper(ptr, val);
420}
421
250void xen_set_pte(pte_t *ptep, pte_t pte) 422void xen_set_pte(pte_t *ptep, pte_t pte)
251{ 423{
424#ifdef CONFIG_X86_PAE
252 ptep->pte_high = pte.pte_high; 425 ptep->pte_high = pte.pte_high;
253 smp_wmb(); 426 smp_wmb();
254 ptep->pte_low = pte.pte_low; 427 ptep->pte_low = pte.pte_low;
428#else
429 *ptep = pte;
430#endif
255} 431}
256 432
433#ifdef CONFIG_X86_PAE
257void xen_set_pte_atomic(pte_t *ptep, pte_t pte) 434void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
258{ 435{
259 set_64bit((u64 *)ptep, pte_val_ma(pte)); 436 set_64bit((u64 *)ptep, native_pte_val(pte));
260} 437}
261 438
262void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 439void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -268,8 +445,9 @@ void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
268 445
269void xen_pmd_clear(pmd_t *pmdp) 446void xen_pmd_clear(pmd_t *pmdp)
270{ 447{
271 xen_set_pmd(pmdp, __pmd(0)); 448 set_pmd(pmdp, __pmd(0));
272} 449}
450#endif /* CONFIG_X86_PAE */
273 451
274pmd_t xen_make_pmd(pmdval_t pmd) 452pmd_t xen_make_pmd(pmdval_t pmd)
275{ 453{
@@ -277,78 +455,189 @@ pmd_t xen_make_pmd(pmdval_t pmd)
277 return native_make_pmd(pmd); 455 return native_make_pmd(pmd);
278} 456}
279 457
458#if PAGETABLE_LEVELS == 4
459pudval_t xen_pud_val(pud_t pud)
460{
461 return pte_mfn_to_pfn(pud.pud);
462}
463
464pud_t xen_make_pud(pudval_t pud)
465{
466 pud = pte_pfn_to_mfn(pud);
467
468 return native_make_pud(pud);
469}
470
471pgd_t *xen_get_user_pgd(pgd_t *pgd)
472{
473 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
474 unsigned offset = pgd - pgd_page;
475 pgd_t *user_ptr = NULL;
476
477 if (offset < pgd_index(USER_LIMIT)) {
478 struct page *page = virt_to_page(pgd_page);
479 user_ptr = (pgd_t *)page->private;
480 if (user_ptr)
481 user_ptr += offset;
482 }
483
484 return user_ptr;
485}
486
487static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
488{
489 struct mmu_update u;
490
491 u.ptr = virt_to_machine(ptr).maddr;
492 u.val = pgd_val_ma(val);
493 extend_mmu_update(&u);
494}
495
280/* 496/*
281 (Yet another) pagetable walker. This one is intended for pinning a 497 * Raw hypercall-based set_pgd, intended for in early boot before
282 pagetable. This means that it walks a pagetable and calls the 498 * there's a page structure. This implies:
283 callback function on each page it finds making up the page table, 499 * 1. The only existing pagetable is the kernel's
284 at every level. It walks the entire pagetable, but it only bothers 500 * 2. It is always pinned
285 pinning pte pages which are below pte_limit. In the normal case 501 * 3. It has no user pagetable attached to it
286 this will be TASK_SIZE, but at boot we need to pin up to 502 */
287 FIXADDR_TOP. But the important bit is that we don't pin beyond 503void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
288 there, because then we start getting into Xen's ptes. 504{
289*/ 505 preempt_disable();
290static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), 506
507 xen_mc_batch();
508
509 __xen_set_pgd_hyper(ptr, val);
510
511 xen_mc_issue(PARAVIRT_LAZY_MMU);
512
513 preempt_enable();
514}
515
516void xen_set_pgd(pgd_t *ptr, pgd_t val)
517{
518 pgd_t *user_ptr = xen_get_user_pgd(ptr);
519
520 /* If page is not pinned, we can just update the entry
521 directly */
522 if (!page_pinned(ptr)) {
523 *ptr = val;
524 if (user_ptr) {
525 WARN_ON(page_pinned(user_ptr));
526 *user_ptr = val;
527 }
528 return;
529 }
530
531 /* If it's pinned, then we can at least batch the kernel and
532 user updates together. */
533 xen_mc_batch();
534
535 __xen_set_pgd_hyper(ptr, val);
536 if (user_ptr)
537 __xen_set_pgd_hyper(user_ptr, val);
538
539 xen_mc_issue(PARAVIRT_LAZY_MMU);
540}
541#endif /* PAGETABLE_LEVELS == 4 */
542
543/*
544 * (Yet another) pagetable walker. This one is intended for pinning a
545 * pagetable. This means that it walks a pagetable and calls the
546 * callback function on each page it finds making up the page table,
547 * at every level. It walks the entire pagetable, but it only bothers
548 * pinning pte pages which are below limit. In the normal case this
549 * will be STACK_TOP_MAX, but at boot we need to pin up to
550 * FIXADDR_TOP.
551 *
552 * For 32-bit the important bit is that we don't pin beyond there,
553 * because then we start getting into Xen's ptes.
554 *
555 * For 64-bit, we must skip the Xen hole in the middle of the address
556 * space, just after the big x86-64 virtual hole.
557 */
558static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
291 unsigned long limit) 559 unsigned long limit)
292{ 560{
293 pgd_t *pgd = pgd_base;
294 int flush = 0; 561 int flush = 0;
295 unsigned long addr = 0; 562 unsigned hole_low, hole_high;
296 unsigned long pgd_next; 563 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
564 unsigned pgdidx, pudidx, pmdidx;
297 565
298 BUG_ON(limit > FIXADDR_TOP); 566 /* The limit is the last byte to be touched */
567 limit--;
568 BUG_ON(limit >= FIXADDR_TOP);
299 569
300 if (xen_feature(XENFEAT_auto_translated_physmap)) 570 if (xen_feature(XENFEAT_auto_translated_physmap))
301 return 0; 571 return 0;
302 572
303 for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { 573 /*
574 * 64-bit has a great big hole in the middle of the address
575 * space, which contains the Xen mappings. On 32-bit these
576 * will end up making a zero-sized hole, so this is a no-op.
577 */
578 hole_low = pgd_index(USER_LIMIT);
579 hole_high = pgd_index(PAGE_OFFSET);
580
581 pgdidx_limit = pgd_index(limit);
582#if PTRS_PER_PUD > 1
583 pudidx_limit = pud_index(limit);
584#else
585 pudidx_limit = 0;
586#endif
587#if PTRS_PER_PMD > 1
588 pmdidx_limit = pmd_index(limit);
589#else
590 pmdidx_limit = 0;
591#endif
592
593 flush |= (*func)(virt_to_page(pgd), PT_PGD);
594
595 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
304 pud_t *pud; 596 pud_t *pud;
305 unsigned long pud_limit, pud_next;
306 597
307 pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); 598 if (pgdidx >= hole_low && pgdidx < hole_high)
599 continue;
308 600
309 if (!pgd_val(*pgd)) 601 if (!pgd_val(pgd[pgdidx]))
310 continue; 602 continue;
311 603
312 pud = pud_offset(pgd, 0); 604 pud = pud_offset(&pgd[pgdidx], 0);
313 605
314 if (PTRS_PER_PUD > 1) /* not folded */ 606 if (PTRS_PER_PUD > 1) /* not folded */
315 flush |= (*func)(virt_to_page(pud), PT_PUD); 607 flush |= (*func)(virt_to_page(pud), PT_PUD);
316 608
317 for (; addr != pud_limit; pud++, addr = pud_next) { 609 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
318 pmd_t *pmd; 610 pmd_t *pmd;
319 unsigned long pmd_limit;
320 611
321 pud_next = pud_addr_end(addr, pud_limit); 612 if (pgdidx == pgdidx_limit &&
322 613 pudidx > pudidx_limit)
323 if (pud_next < limit) 614 goto out;
324 pmd_limit = pud_next;
325 else
326 pmd_limit = limit;
327 615
328 if (pud_none(*pud)) 616 if (pud_none(pud[pudidx]))
329 continue; 617 continue;
330 618
331 pmd = pmd_offset(pud, 0); 619 pmd = pmd_offset(&pud[pudidx], 0);
332 620
333 if (PTRS_PER_PMD > 1) /* not folded */ 621 if (PTRS_PER_PMD > 1) /* not folded */
334 flush |= (*func)(virt_to_page(pmd), PT_PMD); 622 flush |= (*func)(virt_to_page(pmd), PT_PMD);
335 623
336 for (; addr != pmd_limit; pmd++) { 624 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
337 addr += (PAGE_SIZE * PTRS_PER_PTE); 625 struct page *pte;
338 if ((pmd_limit-1) < (addr-1)) { 626
339 addr = pmd_limit; 627 if (pgdidx == pgdidx_limit &&
340 break; 628 pudidx == pudidx_limit &&
341 } 629 pmdidx > pmdidx_limit)
630 goto out;
342 631
343 if (pmd_none(*pmd)) 632 if (pmd_none(pmd[pmdidx]))
344 continue; 633 continue;
345 634
346 flush |= (*func)(pmd_page(*pmd), PT_PTE); 635 pte = pmd_page(pmd[pmdidx]);
636 flush |= (*func)(pte, PT_PTE);
347 } 637 }
348 } 638 }
349 } 639 }
350 640out:
351 flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
352 641
353 return flush; 642 return flush;
354} 643}
@@ -430,20 +719,62 @@ void xen_pgd_pin(pgd_t *pgd)
430{ 719{
431 xen_mc_batch(); 720 xen_mc_batch();
432 721
433 if (pgd_walk(pgd, pin_page, TASK_SIZE)) { 722 if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
434 /* re-enable interrupts for kmap_flush_unused */ 723 /* re-enable interrupts for kmap_flush_unused */
435 xen_mc_issue(0); 724 xen_mc_issue(0);
436 kmap_flush_unused(); 725 kmap_flush_unused();
437 xen_mc_batch(); 726 xen_mc_batch();
438 } 727 }
439 728
729#ifdef CONFIG_X86_64
730 {
731 pgd_t *user_pgd = xen_get_user_pgd(pgd);
732
733 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
734
735 if (user_pgd) {
736 pin_page(virt_to_page(user_pgd), PT_PGD);
737 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
738 }
739 }
740#else /* CONFIG_X86_32 */
741#ifdef CONFIG_X86_PAE
742 /* Need to make sure unshared kernel PMD is pinnable */
743 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
744#endif
440 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 745 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
746#endif /* CONFIG_X86_64 */
441 xen_mc_issue(0); 747 xen_mc_issue(0);
442} 748}
443 749
444/* The init_mm pagetable is really pinned as soon as its created, but 750/*
445 that's before we have page structures to store the bits. So do all 751 * On save, we need to pin all pagetables to make sure they get their
446 the book-keeping now. */ 752 * mfns turned into pfns. Search the list for any unpinned pgds and pin
753 * them (unpinned pgds are not currently in use, probably because the
754 * process is under construction or destruction).
755 */
756void xen_mm_pin_all(void)
757{
758 unsigned long flags;
759 struct page *page;
760
761 spin_lock_irqsave(&pgd_lock, flags);
762
763 list_for_each_entry(page, &pgd_list, lru) {
764 if (!PagePinned(page)) {
765 xen_pgd_pin((pgd_t *)page_address(page));
766 SetPageSavePinned(page);
767 }
768 }
769
770 spin_unlock_irqrestore(&pgd_lock, flags);
771}
772
773/*
774 * The init_mm pagetable is really pinned as soon as it's created, but
775 * that's before we have page structures to store the bits. So do all
776 * the book-keeping now.
777 */
447static __init int mark_pinned(struct page *page, enum pt_level level) 778static __init int mark_pinned(struct page *page, enum pt_level level)
448{ 779{
449 SetPagePinned(page); 780 SetPagePinned(page);
@@ -493,11 +824,49 @@ static void xen_pgd_unpin(pgd_t *pgd)
493 824
494 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 825 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
495 826
496 pgd_walk(pgd, unpin_page, TASK_SIZE); 827#ifdef CONFIG_X86_64
828 {
829 pgd_t *user_pgd = xen_get_user_pgd(pgd);
830
831 if (user_pgd) {
832 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
833 unpin_page(virt_to_page(user_pgd), PT_PGD);
834 }
835 }
836#endif
837
838#ifdef CONFIG_X86_PAE
839 /* Need to make sure unshared kernel PMD is unpinned */
840 unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
841#endif
842
843 pgd_walk(pgd, unpin_page, USER_LIMIT);
497 844
498 xen_mc_issue(0); 845 xen_mc_issue(0);
499} 846}
500 847
848/*
849 * On resume, undo any pinning done at save, so that the rest of the
850 * kernel doesn't see any unexpected pinned pagetables.
851 */
852void xen_mm_unpin_all(void)
853{
854 unsigned long flags;
855 struct page *page;
856
857 spin_lock_irqsave(&pgd_lock, flags);
858
859 list_for_each_entry(page, &pgd_list, lru) {
860 if (PageSavePinned(page)) {
861 BUG_ON(!PagePinned(page));
862 xen_pgd_unpin((pgd_t *)page_address(page));
863 ClearPageSavePinned(page);
864 }
865 }
866
867 spin_unlock_irqrestore(&pgd_lock, flags);
868}
869
501void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 870void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
502{ 871{
503 spin_lock(&next->page_table_lock); 872 spin_lock(&next->page_table_lock);
@@ -519,8 +888,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
519static void drop_other_mm_ref(void *info) 888static void drop_other_mm_ref(void *info)
520{ 889{
521 struct mm_struct *mm = info; 890 struct mm_struct *mm = info;
891 struct mm_struct *active_mm;
892
893#ifdef CONFIG_X86_64
894 active_mm = read_pda(active_mm);
895#else
896 active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
897#endif
522 898
523 if (__get_cpu_var(cpu_tlbstate).active_mm == mm) 899 if (active_mm == mm)
524 leave_mm(smp_processor_id()); 900 leave_mm(smp_processor_id());
525 901
526 /* If this cpu still has a stale cr3 reference, then make sure 902 /* If this cpu still has a stale cr3 reference, then make sure
@@ -558,7 +934,7 @@ static void drop_mm_ref(struct mm_struct *mm)
558 } 934 }
559 935
560 if (!cpus_empty(mask)) 936 if (!cpus_empty(mask))
561 xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); 937 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
562} 938}
563#else 939#else
564static void drop_mm_ref(struct mm_struct *mm) 940static void drop_mm_ref(struct mm_struct *mm)
@@ -591,7 +967,7 @@ void xen_exit_mmap(struct mm_struct *mm)
591 spin_lock(&mm->page_table_lock); 967 spin_lock(&mm->page_table_lock);
592 968
593 /* pgd may not be pinned in the error exit path of execve */ 969 /* pgd may not be pinned in the error exit path of execve */
594 if (PagePinned(virt_to_page(mm->pgd))) 970 if (page_pinned(mm->pgd))
595 xen_pgd_unpin(mm->pgd); 971 xen_pgd_unpin(mm->pgd);
596 972
597 spin_unlock(&mm->page_table_lock); 973 spin_unlock(&mm->page_table_lock);
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 5fe961caffd4..0f59bd03f9e3 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -10,25 +10,9 @@ enum pt_level {
10 PT_PTE 10 PT_PTE
11}; 11};
12 12
13/*
14 * Page-directory addresses above 4GB do not fit into architectural %cr3.
15 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
16 * must use the following accessor macros to pack/unpack valid MFNs.
17 *
18 * Note that Xen is using the fact that the pagetable base is always
19 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
20 * of cr3.
21 */
22#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
23#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
24
25 13
26void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 14void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
27 15
28void xen_set_pte(pte_t *ptep, pte_t pteval);
29void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
30 pte_t *ptep, pte_t pteval);
31void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
32 16
33void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); 17void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
34void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); 18void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
@@ -45,11 +29,32 @@ pte_t xen_make_pte(pteval_t);
45pmd_t xen_make_pmd(pmdval_t); 29pmd_t xen_make_pmd(pmdval_t);
46pgd_t xen_make_pgd(pgdval_t); 30pgd_t xen_make_pgd(pgdval_t);
47 31
32void xen_set_pte(pte_t *ptep, pte_t pteval);
48void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 33void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
49 pte_t *ptep, pte_t pteval); 34 pte_t *ptep, pte_t pteval);
35
36#ifdef CONFIG_X86_PAE
50void xen_set_pte_atomic(pte_t *ptep, pte_t pte); 37void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
51void xen_set_pud(pud_t *ptr, pud_t val);
52void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 38void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
53void xen_pmd_clear(pmd_t *pmdp); 39void xen_pmd_clear(pmd_t *pmdp);
40#endif /* CONFIG_X86_PAE */
41
42void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
43void xen_set_pud(pud_t *ptr, pud_t val);
44void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
45void xen_set_pud_hyper(pud_t *ptr, pud_t val);
46
47#if PAGETABLE_LEVELS == 4
48pudval_t xen_pud_val(pud_t pud);
49pud_t xen_make_pud(pudval_t pudval);
50void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
51void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
52#endif
53
54pgd_t *xen_get_user_pgd(pgd_t *pgd);
55
56pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
57void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
58 pte_t *ptep, pte_t pte);
54 59
55#endif /* _XEN_MMU_H */ 60#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 5791eb2e3750..9efd1c6c9776 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -29,14 +29,14 @@
29#define MC_DEBUG 1 29#define MC_DEBUG 1
30 30
31#define MC_BATCH 32 31#define MC_BATCH 32
32#define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) 32#define MC_ARGS (MC_BATCH * 16)
33 33
34struct mc_buffer { 34struct mc_buffer {
35 struct multicall_entry entries[MC_BATCH]; 35 struct multicall_entry entries[MC_BATCH];
36#if MC_DEBUG 36#if MC_DEBUG
37 struct multicall_entry debug[MC_BATCH]; 37 struct multicall_entry debug[MC_BATCH];
38#endif 38#endif
39 u64 args[MC_ARGS]; 39 unsigned char args[MC_ARGS];
40 struct callback { 40 struct callback {
41 void (*fn)(void *); 41 void (*fn)(void *);
42 void *data; 42 void *data;
@@ -76,6 +76,7 @@ void xen_mc_flush(void)
76 if (ret) { 76 if (ret) {
77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", 77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
78 ret, smp_processor_id()); 78 ret, smp_processor_id());
79 dump_stack();
79 for (i = 0; i < b->mcidx; i++) { 80 for (i = 0; i < b->mcidx; i++) {
80 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", 81 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
81 i+1, b->mcidx, 82 i+1, b->mcidx,
@@ -107,20 +108,48 @@ struct multicall_space __xen_mc_entry(size_t args)
107{ 108{
108 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 109 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
109 struct multicall_space ret; 110 struct multicall_space ret;
110 unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64); 111 unsigned argidx = roundup(b->argidx, sizeof(u64));
111 112
112 BUG_ON(preemptible()); 113 BUG_ON(preemptible());
113 BUG_ON(argspace > MC_ARGS); 114 BUG_ON(b->argidx > MC_ARGS);
114 115
115 if (b->mcidx == MC_BATCH || 116 if (b->mcidx == MC_BATCH ||
116 (b->argidx + argspace) > MC_ARGS) 117 (argidx + args) > MC_ARGS) {
117 xen_mc_flush(); 118 xen_mc_flush();
119 argidx = roundup(b->argidx, sizeof(u64));
120 }
118 121
119 ret.mc = &b->entries[b->mcidx]; 122 ret.mc = &b->entries[b->mcidx];
120 b->mcidx++; 123 b->mcidx++;
124 ret.args = &b->args[argidx];
125 b->argidx = argidx + args;
126
127 BUG_ON(b->argidx > MC_ARGS);
128 return ret;
129}
130
131struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
132{
133 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
134 struct multicall_space ret = { NULL, NULL };
135
136 BUG_ON(preemptible());
137 BUG_ON(b->argidx > MC_ARGS);
138
139 if (b->mcidx == 0)
140 return ret;
141
142 if (b->entries[b->mcidx - 1].op != op)
143 return ret;
144
145 if ((b->argidx + size) > MC_ARGS)
146 return ret;
147
148 ret.mc = &b->entries[b->mcidx - 1];
121 ret.args = &b->args[b->argidx]; 149 ret.args = &b->args[b->argidx];
122 b->argidx += argspace; 150 b->argidx += size;
123 151
152 BUG_ON(b->argidx > MC_ARGS);
124 return ret; 153 return ret;
125} 154}
126 155
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 8bae996d99a3..858938241616 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -45,4 +45,16 @@ static inline void xen_mc_issue(unsigned mode)
45/* Set up a callback to be called when the current batch is flushed */ 45/* Set up a callback to be called when the current batch is flushed */
46void xen_mc_callback(void (*fn)(void *), void *data); 46void xen_mc_callback(void (*fn)(void *), void *data);
47 47
48/*
49 * Try to extend the arguments of the previous multicall command. The
50 * previous command's op must match. If it does, then it attempts to
51 * extend the argument space allocated to the multicall entry by
52 * arg_size bytes.
53 *
54 * In the returned multicall_space, mc points to the extended command
55 * on success (or is NULL on failure), and args points to the newly
56 * allocated argument space.
57 */
58struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size);
59
48#endif /* _XEN_MULTICALLS_H */ 60#endif /* _XEN_MULTICALLS_H */
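As a usage sketch of this interface (it mirrors what extend_mmu_update() in mmu.c does earlier in this series; update stands for the caller's prepared struct mmu_update): try to piggy-back on the previous entry first, and fall back to a fresh multicall entry only when that fails:

	struct multicall_space mcs;
	struct mmu_update *u;

	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
	if (mcs.mc != NULL)
		mcs.mc->args[1]++;		/* previous entry is an mmu_update: bump its count */
	else {
		mcs = __xen_mc_entry(sizeof(*u));	/* no luck: start a new multicall entry */
		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
	}

	u = mcs.args;
	*u = *update;				/* the request lands in the (possibly extended) space */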
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 82517e4a752a..b6acc3a0af46 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -13,9 +13,11 @@
13#include <asm/vdso.h> 13#include <asm/vdso.h>
14#include <asm/e820.h> 14#include <asm/e820.h>
15#include <asm/setup.h> 15#include <asm/setup.h>
16#include <asm/acpi.h>
16#include <asm/xen/hypervisor.h> 17#include <asm/xen/hypervisor.h>
17#include <asm/xen/hypercall.h> 18#include <asm/xen/hypercall.h>
18 19
20#include <xen/page.h>
19#include <xen/interface/callback.h> 21#include <xen/interface/callback.h>
20#include <xen/interface/physdev.h> 22#include <xen/interface/physdev.h>
21#include <xen/features.h> 23#include <xen/features.h>
@@ -27,8 +29,6 @@
27extern const char xen_hypervisor_callback[]; 29extern const char xen_hypervisor_callback[];
28extern const char xen_failsafe_callback[]; 30extern const char xen_failsafe_callback[];
29 31
30unsigned long *phys_to_machine_mapping;
31EXPORT_SYMBOL(phys_to_machine_mapping);
32 32
33/** 33/**
34 * machine_specific_memory_setup - Hook for machine specific memory setup. 34 * machine_specific_memory_setup - Hook for machine specific memory setup.
@@ -38,9 +38,31 @@ char * __init xen_memory_setup(void)
38{ 38{
39 unsigned long max_pfn = xen_start_info->nr_pages; 39 unsigned long max_pfn = xen_start_info->nr_pages;
40 40
41 max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
42
41 e820.nr_map = 0; 43 e820.nr_map = 0;
42 add_memory_region(0, LOWMEMSIZE(), E820_RAM); 44
43 add_memory_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM); 45 e820_add_region(0, PFN_PHYS(max_pfn), E820_RAM);
46
47 /*
48 * Even though this is normal, usable memory under Xen, reserve
49 * ISA memory anyway because too many things think they can poke
50 * about in there.
51 */
52 e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
53 E820_RESERVED);
54
55 /*
56 * Reserve Xen bits:
57 * - mfn_list
58 * - xen_start_info
59 * See comment above "struct start_info" in <xen/interface/xen.h>
60 */
61 e820_add_region(__pa(xen_start_info->mfn_list),
62 xen_start_info->pt_base - xen_start_info->mfn_list,
63 E820_RESERVED);
64
65 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
44 66
45 return "Xen"; 67 return "Xen";
46} 68}
@@ -61,30 +83,72 @@ static void xen_idle(void)
61 83
62/* 84/*
63 * Set the bit indicating "nosegneg" library variants should be used. 85 * Set the bit indicating "nosegneg" library variants should be used.
86 * We only need to bother in pure 32-bit mode; compat 32-bit processes
87 * can have un-truncated segments, so wrapping around is allowed.
64 */ 88 */
65static void __init fiddle_vdso(void) 89static void __init fiddle_vdso(void)
66{ 90{
67 extern const char vdso32_default_start; 91#ifdef CONFIG_X86_32
68 u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); 92 u32 *mask;
93 mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
94 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
95 mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
69 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; 96 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
97#endif
70} 98}
71 99
72void xen_enable_sysenter(void) 100static __cpuinit int register_callback(unsigned type, const void *func)
73{ 101{
74 int cpu = smp_processor_id(); 102 struct callback_register callback = {
75 extern void xen_sysenter_target(void); 103 .type = type,
76 /* Mask events on entry, even though they get enabled immediately */ 104 .address = XEN_CALLBACK(__KERNEL_CS, func),
77 static struct callback_register sysenter = {
78 .type = CALLBACKTYPE_sysenter,
79 .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
80 .flags = CALLBACKF_mask_events, 105 .flags = CALLBACKF_mask_events,
81 }; 106 };
82 107
83 if (!boot_cpu_has(X86_FEATURE_SEP) || 108 return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
84 HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) { 109}
85 clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); 110
86 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); 111void __cpuinit xen_enable_sysenter(void)
112{
113 extern void xen_sysenter_target(void);
114 int ret;
115 unsigned sysenter_feature;
116
117#ifdef CONFIG_X86_32
118 sysenter_feature = X86_FEATURE_SEP;
119#else
120 sysenter_feature = X86_FEATURE_SYSENTER32;
121#endif
122
123 if (!boot_cpu_has(sysenter_feature))
124 return;
125
126 ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
127 if(ret != 0)
128 setup_clear_cpu_cap(sysenter_feature);
129}
130
131void __cpuinit xen_enable_syscall(void)
132{
133#ifdef CONFIG_X86_64
134 int ret;
135 extern void xen_syscall_target(void);
136 extern void xen_syscall32_target(void);
137
138 ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
139 if (ret != 0) {
140 printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
141 /* Pretty fatal; 64-bit userspace has no other
142 mechanism for syscalls. */
143 }
144
145 if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
146 ret = register_callback(CALLBACKTYPE_syscall32,
147 xen_syscall32_target);
148 if (ret != 0)
149 setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
87 } 150 }
151#endif /* CONFIG_X86_64 */
88} 152}
89 153
90void __init xen_arch_setup(void) 154void __init xen_arch_setup(void)
@@ -98,10 +162,12 @@ void __init xen_arch_setup(void)
98 if (!xen_feature(XENFEAT_auto_translated_physmap)) 162 if (!xen_feature(XENFEAT_auto_translated_physmap))
99 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); 163 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
100 164
101 HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, 165 if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
102 __KERNEL_CS, (unsigned long)xen_failsafe_callback); 166 register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
167 BUG();
103 168
104 xen_enable_sysenter(); 169 xen_enable_sysenter();
170 xen_enable_syscall();
105 171
106 set_iopl.iopl = 1; 172 set_iopl.iopl = 1;
107 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 173 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
@@ -121,11 +187,6 @@ void __init xen_arch_setup(void)
121 187
122 pm_idle = xen_idle; 188 pm_idle = xen_idle;
123 189
124#ifdef CONFIG_SMP
125 /* fill cpus_possible with all available cpus */
126 xen_fill_possible_map();
127#endif
128
129 paravirt_disable_iospace(); 190 paravirt_disable_iospace();
130 191
131 fiddle_vdso(); 192 fiddle_vdso();
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 94e69000f982..f702199312a5 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -35,28 +35,15 @@
35#include "xen-ops.h" 35#include "xen-ops.h"
36#include "mmu.h" 36#include "mmu.h"
37 37
38static cpumask_t xen_cpu_initialized_map; 38cpumask_t xen_cpu_initialized_map;
39static DEFINE_PER_CPU(int, resched_irq) = -1;
40static DEFINE_PER_CPU(int, callfunc_irq) = -1;
41static DEFINE_PER_CPU(int, debug_irq) = -1;
42 39
43/* 40static DEFINE_PER_CPU(int, resched_irq);
44 * Structure and data for smp_call_function(). This is designed to minimise 41static DEFINE_PER_CPU(int, callfunc_irq);
45 * static memory requirements. It also looks cleaner. 42static DEFINE_PER_CPU(int, callfuncsingle_irq);
46 */ 43static DEFINE_PER_CPU(int, debug_irq) = -1;
47static DEFINE_SPINLOCK(call_lock);
48
49struct call_data_struct {
50 void (*func) (void *info);
51 void *info;
52 atomic_t started;
53 atomic_t finished;
54 int wait;
55};
56 44
57static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); 45static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
58 46static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
59static struct call_data_struct *call_data;
60 47
61/* 48/*
62 * Reschedule call back. Nothing to do, 49 * Reschedule call back. Nothing to do,
@@ -65,6 +52,12 @@ static struct call_data_struct *call_data;
65 */ 52 */
66static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) 53static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
67{ 54{
55#ifdef CONFIG_X86_32
56 __get_cpu_var(irq_stat).irq_resched_count++;
57#else
58 add_pda(irq_resched_count, 1);
59#endif
60
68 return IRQ_HANDLED; 61 return IRQ_HANDLED;
69} 62}
70 63
@@ -73,13 +66,22 @@ static __cpuinit void cpu_bringup_and_idle(void)
73 int cpu = smp_processor_id(); 66 int cpu = smp_processor_id();
74 67
75 cpu_init(); 68 cpu_init();
69 preempt_disable();
70
76 xen_enable_sysenter(); 71 xen_enable_sysenter();
72 xen_enable_syscall();
77 73
78 preempt_disable(); 74 cpu = smp_processor_id();
79 per_cpu(cpu_state, cpu) = CPU_ONLINE; 75 smp_store_cpu_info(cpu);
76 cpu_data(cpu).x86_max_cores = 1;
77 set_cpu_sibling_map(cpu);
80 78
81 xen_setup_cpu_clockevents(); 79 xen_setup_cpu_clockevents();
82 80
81 cpu_set(cpu, cpu_online_map);
82 x86_write_percpu(cpu_state, CPU_ONLINE);
83 wmb();
84
83 /* We can take interrupts now: we're officially "up". */ 85 /* We can take interrupts now: we're officially "up". */
84 local_irq_enable(); 86 local_irq_enable();
85 87
@@ -122,6 +124,17 @@ static int xen_smp_intr_init(unsigned int cpu)
122 goto fail; 124 goto fail;
123 per_cpu(debug_irq, cpu) = rc; 125 per_cpu(debug_irq, cpu) = rc;
124 126
127 callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
128 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
129 cpu,
130 xen_call_function_single_interrupt,
131 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
132 callfunc_name,
133 NULL);
134 if (rc < 0)
135 goto fail;
136 per_cpu(callfuncsingle_irq, cpu) = rc;
137
125 return 0; 138 return 0;
126 139
127 fail: 140 fail:
@@ -131,59 +144,43 @@ static int xen_smp_intr_init(unsigned int cpu)
131 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); 144 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
132 if (per_cpu(debug_irq, cpu) >= 0) 145 if (per_cpu(debug_irq, cpu) >= 0)
133 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); 146 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
147 if (per_cpu(callfuncsingle_irq, cpu) >= 0)
148 unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
149
134 return rc; 150 return rc;
135} 151}
136 152
137void __init xen_fill_possible_map(void) 153static void __init xen_fill_possible_map(void)
138{ 154{
139 int i, rc; 155 int i, rc;
140 156
141 for (i = 0; i < NR_CPUS; i++) { 157 for (i = 0; i < NR_CPUS; i++) {
142 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 158 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
143 if (rc >= 0) 159 if (rc >= 0) {
160 num_processors++;
144 cpu_set(i, cpu_possible_map); 161 cpu_set(i, cpu_possible_map);
162 }
145 } 163 }
146} 164}
147 165
148void __init xen_smp_prepare_boot_cpu(void) 166static void __init xen_smp_prepare_boot_cpu(void)
149{ 167{
150 int cpu;
151
152 BUG_ON(smp_processor_id() != 0); 168 BUG_ON(smp_processor_id() != 0);
153 native_smp_prepare_boot_cpu(); 169 native_smp_prepare_boot_cpu();
154 170
155 /* We've switched to the "real" per-cpu gdt, so make sure the 171 /* We've switched to the "real" per-cpu gdt, so make sure the
156 old memory can be recycled */ 172 old memory can be recycled */
157 make_lowmem_page_readwrite(&per_cpu__gdt_page); 173 make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
158
159 for_each_possible_cpu(cpu) {
160 cpus_clear(per_cpu(cpu_sibling_map, cpu));
161 /*
162 * cpu_core_map lives in a per cpu area that is cleared
163 * when the per cpu array is allocated.
164 *
165 * cpus_clear(per_cpu(cpu_core_map, cpu));
166 */
167 }
168 174
169 xen_setup_vcpu_info_placement(); 175 xen_setup_vcpu_info_placement();
170} 176}
171 177
172void __init xen_smp_prepare_cpus(unsigned int max_cpus) 178static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
173{ 179{
174 unsigned cpu; 180 unsigned cpu;
175 181
176 for_each_possible_cpu(cpu) {
177 cpus_clear(per_cpu(cpu_sibling_map, cpu));
178 /*
179 * cpu_core_ map will be zeroed when the per
180 * cpu area is allocated.
181 *
182 * cpus_clear(per_cpu(cpu_core_map, cpu));
183 */
184 }
185
186 smp_store_cpu_info(0); 182 smp_store_cpu_info(0);
183 cpu_data(0).x86_max_cores = 1;
187 set_cpu_sibling_map(0); 184 set_cpu_sibling_map(0);
188 185
189 if (xen_smp_intr_init(0)) 186 if (xen_smp_intr_init(0))
@@ -218,7 +215,7 @@ static __cpuinit int
218cpu_initialize_context(unsigned int cpu, struct task_struct *idle) 215cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
219{ 216{
220 struct vcpu_guest_context *ctxt; 217 struct vcpu_guest_context *ctxt;
221 struct gdt_page *gdt = &per_cpu(gdt_page, cpu); 218 struct desc_struct *gdt;
222 219
223 if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) 220 if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
224 return 0; 221 return 0;
@@ -227,12 +224,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
227 if (ctxt == NULL) 224 if (ctxt == NULL)
228 return -ENOMEM; 225 return -ENOMEM;
229 226
227 gdt = get_cpu_gdt_table(cpu);
228
230 ctxt->flags = VGCF_IN_KERNEL; 229 ctxt->flags = VGCF_IN_KERNEL;
231 ctxt->user_regs.ds = __USER_DS; 230 ctxt->user_regs.ds = __USER_DS;
232 ctxt->user_regs.es = __USER_DS; 231 ctxt->user_regs.es = __USER_DS;
233 ctxt->user_regs.fs = __KERNEL_PERCPU;
234 ctxt->user_regs.gs = 0;
235 ctxt->user_regs.ss = __KERNEL_DS; 232 ctxt->user_regs.ss = __KERNEL_DS;
233#ifdef CONFIG_X86_32
234 ctxt->user_regs.fs = __KERNEL_PERCPU;
235#endif
236 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; 236 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
237 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ 237 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
238 238
@@ -242,11 +242,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
242 242
243 ctxt->ldt_ents = 0; 243 ctxt->ldt_ents = 0;
244 244
245 BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); 245 BUG_ON((unsigned long)gdt & ~PAGE_MASK);
246 make_lowmem_page_readonly(gdt->gdt); 246 make_lowmem_page_readonly(gdt);
247 247
248 ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); 248 ctxt->gdt_frames[0] = virt_to_mfn(gdt);
249 ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); 249 ctxt->gdt_ents = GDT_ENTRIES;
250 250
251 ctxt->user_regs.cs = __KERNEL_CS; 251 ctxt->user_regs.cs = __KERNEL_CS;
252 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); 252 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
@@ -254,9 +254,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
254 ctxt->kernel_ss = __KERNEL_DS; 254 ctxt->kernel_ss = __KERNEL_DS;
255 ctxt->kernel_sp = idle->thread.sp0; 255 ctxt->kernel_sp = idle->thread.sp0;
256 256
257#ifdef CONFIG_X86_32
257 ctxt->event_callback_cs = __KERNEL_CS; 258 ctxt->event_callback_cs = __KERNEL_CS;
258 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
259 ctxt->failsafe_callback_cs = __KERNEL_CS; 259 ctxt->failsafe_callback_cs = __KERNEL_CS;
260#endif
261 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
260 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; 262 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
261 263
262 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); 264 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
@@ -269,7 +271,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
269 return 0; 271 return 0;
270} 272}
271 273
272int __cpuinit xen_cpu_up(unsigned int cpu) 274static int __cpuinit xen_cpu_up(unsigned int cpu)
273{ 275{
274 struct task_struct *idle = idle_task(cpu); 276 struct task_struct *idle = idle_task(cpu);
275 int rc; 277 int rc;
@@ -280,11 +282,28 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
280 return rc; 282 return rc;
281#endif 283#endif
282 284
285#ifdef CONFIG_X86_64
286 /* Allocate node local memory for AP pdas */
287 WARN_ON(cpu == 0);
288 if (cpu > 0) {
289 rc = get_local_pda(cpu);
290 if (rc)
291 return rc;
292 }
293#endif
294
295#ifdef CONFIG_X86_32
283 init_gdt(cpu); 296 init_gdt(cpu);
284 per_cpu(current_task, cpu) = idle; 297 per_cpu(current_task, cpu) = idle;
285 irq_ctx_init(cpu); 298 irq_ctx_init(cpu);
299#else
300 cpu_pda(cpu)->pcurrent = idle;
301 clear_tsk_thread_flag(idle, TIF_FORK);
302#endif
286 xen_setup_timer(cpu); 303 xen_setup_timer(cpu);
287 304
305 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
306
288 /* make sure interrupts start blocked */ 307 /* make sure interrupts start blocked */
289 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; 308 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
290 309
@@ -299,20 +318,18 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
299 if (rc) 318 if (rc)
300 return rc; 319 return rc;
301 320
302 smp_store_cpu_info(cpu);
303 set_cpu_sibling_map(cpu);
304 /* This must be done before setting cpu_online_map */
305 wmb();
306
307 cpu_set(cpu, cpu_online_map);
308
309 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); 321 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
310 BUG_ON(rc); 322 BUG_ON(rc);
311 323
324 while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
325 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
326 barrier();
327 }
328
312 return 0; 329 return 0;
313} 330}
314 331
315void xen_smp_cpus_done(unsigned int max_cpus) 332static void xen_smp_cpus_done(unsigned int max_cpus)
316{ 333{
317} 334}
318 335
@@ -328,17 +345,16 @@ static void stop_self(void *v)
328 BUG(); 345 BUG();
329} 346}
330 347
331void xen_smp_send_stop(void) 348static void xen_smp_send_stop(void)
332{ 349{
333 smp_call_function(stop_self, NULL, 0, 0); 350 smp_call_function(stop_self, NULL, 0);
334} 351}
335 352
336void xen_smp_send_reschedule(int cpu) 353static void xen_smp_send_reschedule(int cpu)
337{ 354{
338 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); 355 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
339} 356}
340 357
341
342static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) 358static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
343{ 359{
344 unsigned cpu; 360 unsigned cpu;
@@ -349,83 +365,69 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
349 xen_send_IPI_one(cpu, vector); 365 xen_send_IPI_one(cpu, vector);
350} 366}
351 367
368static void xen_smp_send_call_function_ipi(cpumask_t mask)
369{
370 int cpu;
371
372 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
373
374 /* Make sure other vcpus get a chance to run if they need to. */
375 for_each_cpu_mask(cpu, mask) {
376 if (xen_vcpu_stolen(cpu)) {
377 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
378 break;
379 }
380 }
381}
382
383static void xen_smp_send_call_function_single_ipi(int cpu)
384{
385 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
386}
387
352static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) 388static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
353{ 389{
354 void (*func) (void *info) = call_data->func;
355 void *info = call_data->info;
356 int wait = call_data->wait;
357
358 /*
359 * Notify initiating CPU that I've grabbed the data and am
360 * about to execute the function
361 */
362 mb();
363 atomic_inc(&call_data->started);
364 /*
365 * At this point the info structure may be out of scope unless wait==1
366 */
367 irq_enter(); 390 irq_enter();
368 (*func)(info); 391 generic_smp_call_function_interrupt();
392#ifdef CONFIG_X86_32
369 __get_cpu_var(irq_stat).irq_call_count++; 393 __get_cpu_var(irq_stat).irq_call_count++;
394#else
395 add_pda(irq_call_count, 1);
396#endif
370 irq_exit(); 397 irq_exit();
371 398
372 if (wait) {
373 mb(); /* commit everything before setting finished */
374 atomic_inc(&call_data->finished);
375 }
376
377 return IRQ_HANDLED; 399 return IRQ_HANDLED;
378} 400}
379 401
380int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), 402static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
381 void *info, int wait)
382{ 403{
383 struct call_data_struct data; 404 irq_enter();
384 int cpus, cpu; 405 generic_smp_call_function_single_interrupt();
385 bool yield; 406#ifdef CONFIG_X86_32
386 407 __get_cpu_var(irq_stat).irq_call_count++;
387 /* Holding any lock stops cpus from going down. */ 408#else
388 spin_lock(&call_lock); 409 add_pda(irq_call_count, 1);
389 410#endif
390 cpu_clear(smp_processor_id(), mask); 411 irq_exit();
391
392 cpus = cpus_weight(mask);
393 if (!cpus) {
394 spin_unlock(&call_lock);
395 return 0;
396 }
397
398 /* Can deadlock when called with interrupts disabled */
399 WARN_ON(irqs_disabled());
400
401 data.func = func;
402 data.info = info;
403 atomic_set(&data.started, 0);
404 data.wait = wait;
405 if (wait)
406 atomic_set(&data.finished, 0);
407
408 call_data = &data;
409 mb(); /* write everything before IPI */
410
411 /* Send a message to other CPUs and wait for them to respond */
412 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
413 412
414 /* Make sure other vcpus get a chance to run if they need to. */ 413 return IRQ_HANDLED;
415 yield = false; 414}
416 for_each_cpu_mask(cpu, mask)
417 if (xen_vcpu_stolen(cpu))
418 yield = true;
419 415
420 if (yield) 416static const struct smp_ops xen_smp_ops __initdata = {
421 HYPERVISOR_sched_op(SCHEDOP_yield, 0); 417 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
418 .smp_prepare_cpus = xen_smp_prepare_cpus,
419 .cpu_up = xen_cpu_up,
420 .smp_cpus_done = xen_smp_cpus_done,
422 421
423 /* Wait for response */ 422 .smp_send_stop = xen_smp_send_stop,
424 while (atomic_read(&data.started) != cpus || 423 .smp_send_reschedule = xen_smp_send_reschedule,
425 (wait && atomic_read(&data.finished) != cpus))
426 cpu_relax();
427 424
428 spin_unlock(&call_lock); 425 .send_call_func_ipi = xen_smp_send_call_function_ipi,
426 .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
427};
429 428
430 return 0; 429void __init xen_smp_init(void)
430{
431 smp_ops = xen_smp_ops;
432 xen_fill_possible_map();
431} 433}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
new file mode 100644
index 000000000000..2a234db5949b
--- /dev/null
+++ b/arch/x86/xen/suspend.c
@@ -0,0 +1,48 @@
1#include <linux/types.h>
2
3#include <xen/interface/xen.h>
4#include <xen/grant_table.h>
5#include <xen/events.h>
6
7#include <asm/xen/hypercall.h>
8#include <asm/xen/page.h>
9
10#include "xen-ops.h"
11#include "mmu.h"
12
13void xen_pre_suspend(void)
14{
15 xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
16 xen_start_info->console.domU.mfn =
17 mfn_to_pfn(xen_start_info->console.domU.mfn);
18
19 BUG_ON(!irqs_disabled());
20
21 HYPERVISOR_shared_info = &xen_dummy_shared_info;
22 if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
23 __pte_ma(0), 0))
24 BUG();
25}
26
27void xen_post_suspend(int suspend_cancelled)
28{
29 xen_setup_shared_info();
30
31 if (suspend_cancelled) {
32 xen_start_info->store_mfn =
33 pfn_to_mfn(xen_start_info->store_mfn);
34 xen_start_info->console.domU.mfn =
35 pfn_to_mfn(xen_start_info->console.domU.mfn);
36 } else {
37#ifdef CONFIG_SMP
38 xen_cpu_initialized_map = cpu_online_map;
39#endif
40 xen_vcpu_restore();
41 }
42
43}
44
45void xen_arch_resume(void)
46{
47 /* nothing */
48}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 41e217503c96..685b77470fc3 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -197,8 +197,8 @@ unsigned long long xen_sched_clock(void)
197} 197}
198 198
199 199
200/* Get the CPU speed from Xen */ 200/* Get the TSC speed from Xen */
201unsigned long xen_cpu_khz(void) 201unsigned long xen_tsc_khz(void)
202{ 202{
203 u64 xen_khz = 1000000ULL << 32; 203 u64 xen_khz = 1000000ULL << 32;
204 const struct pvclock_vcpu_time_info *info = 204 const struct pvclock_vcpu_time_info *info =
@@ -459,6 +459,19 @@ void xen_setup_cpu_clockevents(void)
459 clockevents_register_device(&__get_cpu_var(xen_clock_events)); 459 clockevents_register_device(&__get_cpu_var(xen_clock_events));
460} 460}
461 461
462void xen_timer_resume(void)
463{
464 int cpu;
465
466 if (xen_clockevent != &xen_vcpuop_clockevent)
467 return;
468
469 for_each_online_cpu(cpu) {
470 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
471 BUG();
472 }
473}
474
462__init void xen_time_init(void) 475__init void xen_time_init(void)
463{ 476{
464 int cpu = smp_processor_id(); 477 int cpu = smp_processor_id();
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41de..2497a30f41de 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm_32.S
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
new file mode 100644
index 000000000000..4038cbfe3331
--- /dev/null
+++ b/arch/x86/xen/xen-asm_64.S
@@ -0,0 +1,271 @@
1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
5
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
8
9 We only bother with direct forms (ie, vcpu in pda) of the operations
10 here; the indirect forms are better handled in C, since they're
11 generally too large to inline anyway.
12 */
13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/processor-flags.h>
18#include <asm/errno.h>
19#include <asm/segment.h>
20
21#include <xen/interface/xen.h>
22
23#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
24#define ENDPATCH(x) .globl x##_end; x##_end=.
25
26/* Pseudo-flag used for virtual NMI, which we don't implement yet */
27#define XEN_EFLAGS_NMI 0x80000000
28
29#if 0
30#include <asm/percpu.h>
31
32/*
33 Enable events. This clears the event mask and tests the pending
34 event status with one and operation. If there are pending
35 events, then enter the hypervisor to get them handled.
36 */
37ENTRY(xen_irq_enable_direct)
38 /* Unmask events */
39 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
40
41 /* Preempt here doesn't matter because that will deal with
42 any pending interrupts. The pending check may end up being
43 run on the wrong CPU, but that doesn't hurt. */
44
45 /* Test for pending */
46 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
47 jz 1f
48
492: call check_events
501:
51ENDPATCH(xen_irq_enable_direct)
52 ret
53 ENDPROC(xen_irq_enable_direct)
54 RELOC(xen_irq_enable_direct, 2b+1)
55
56/*
57 Disabling events is simply a matter of making the event mask
58 non-zero.
59 */
60ENTRY(xen_irq_disable_direct)
61 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
62ENDPATCH(xen_irq_disable_direct)
63 ret
64 ENDPROC(xen_irq_disable_direct)
65 RELOC(xen_irq_disable_direct, 0)
66
67/*
68 (xen_)save_fl is used to get the current interrupt enable status.
69 Callers expect the status to be in X86_EFLAGS_IF, and other bits
70 may be set in the return value. We take advantage of this by
71 making sure that X86_EFLAGS_IF has the right value (and other bits
72 in that byte are 0), but other bits in the return value are
73 undefined. We need to toggle the state of the bit, because
74 Xen and x86 use opposite senses (mask vs enable).
75 */
76ENTRY(xen_save_fl_direct)
77 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
78 setz %ah
79 addb %ah,%ah
80ENDPATCH(xen_save_fl_direct)
81 ret
82 ENDPROC(xen_save_fl_direct)
83 RELOC(xen_save_fl_direct, 0)
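A worked reading of the encoding above (not from the patch): X86_EFLAGS_IF is bit 9, i.e. 0x200, and %ah holds bits 8-15 of %eax, so:

	mask == 0 (events enabled):  setz sets %ah = 1, addb %ah,%ah makes it 2,
	                             so bit 9 (X86_EFLAGS_IF) is set in the result;
	mask != 0 (events disabled): setz sets %ah = 0, addb leaves it 0,
	                             so X86_EFLAGS_IF is clear in the result.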
84
85/*
86 In principle the caller should be passing us a value returned
87 from xen_save_fl_direct, but for robustness' sake we test only
88 the X86_EFLAGS_IF flag rather than the whole byte. After
89 setting the interrupt mask state, it checks for unmasked
90 pending events and enters the hypervisor to get them delivered
91 if so.
92 */
93ENTRY(xen_restore_fl_direct)
94 testb $X86_EFLAGS_IF>>8, %ah
95 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
96 /* Preempt here doesn't matter because that will deal with
97 any pending interrupts. The pending check may end up being
98 run on the wrong CPU, but that doesn't hurt. */
99
100 /* check for unmasked and pending */
101 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
102 jz 1f
1032: call check_events
1041:
105ENDPATCH(xen_restore_fl_direct)
106 ret
107 ENDPROC(xen_restore_fl_direct)
108 RELOC(xen_restore_fl_direct, 2b+1)
109
110
111/*
112 Force an event check by making a hypercall,
113 but preserve regs before making the call.
114 */
115check_events:
116 push %rax
117 push %rcx
118 push %rdx
119 push %rsi
120 push %rdi
121 push %r8
122 push %r9
123 push %r10
124 push %r11
125 call force_evtchn_callback
126 pop %r11
127 pop %r10
128 pop %r9
129 pop %r8
130 pop %rdi
131 pop %rsi
132 pop %rdx
133 pop %rcx
134 pop %rax
135 ret
136#endif
137
138ENTRY(xen_adjust_exception_frame)
139 mov 8+0(%rsp),%rcx
140 mov 8+8(%rsp),%r11
141 ret $16
142
143hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
144/*
145 Xen64 iret frame:
146
147 ss
148 rsp
149 rflags
150 cs
151 rip <-- standard iret frame
152
153 flags
154
155 rcx }
156 r11 }<-- pushed by hypercall page
157rsp -> rax }
158 */
159ENTRY(xen_iret)
160 pushq $0
1611: jmp hypercall_iret
162ENDPATCH(xen_iret)
163RELOC(xen_iret, 1b+1)
164
165/*
166 sysexit is not used for 64-bit processes, so it's
167 only ever used to return to 32-bit compat userspace.
168 */
169ENTRY(xen_sysexit)
170 pushq $__USER32_DS
171 pushq %rcx
172 pushq $X86_EFLAGS_IF
173 pushq $__USER32_CS
174 pushq %rdx
175
176 pushq $VGCF_in_syscall
1771: jmp hypercall_iret
178ENDPATCH(xen_sysexit)
179RELOC(xen_sysexit, 1b+1)
180
181ENTRY(xen_sysret64)
182 /* We're already on the usermode stack at this point, but still
183 with the kernel gs, so we can easily switch back */
184 movq %rsp, %gs:pda_oldrsp
185 movq %gs:pda_kernelstack,%rsp
186
 187	pushq $__USER_DS		/* ss */
 188	pushq %gs:pda_oldrsp		/* rsp: the user stack pointer saved above */
 189	pushq %r11			/* rflags: native sysret keeps user rflags in %r11 */
 190	pushq $__USER_CS		/* cs */
 191	pushq %rcx			/* rip: native sysret keeps the return address in %rcx */
192
193 pushq $VGCF_in_syscall
1941: jmp hypercall_iret
195ENDPATCH(xen_sysret64)
196RELOC(xen_sysret64, 1b+1)
197
198ENTRY(xen_sysret32)
199 /* We're already on the usermode stack at this point, but still
200 with the kernel gs, so we can easily switch back */
201 movq %rsp, %gs:pda_oldrsp
202 movq %gs:pda_kernelstack, %rsp
203
204 pushq $__USER32_DS
205 pushq %gs:pda_oldrsp
206 pushq %r11
207 pushq $__USER32_CS
208 pushq %rcx
209
210 pushq $VGCF_in_syscall
2111: jmp hypercall_iret
212ENDPATCH(xen_sysret32)
213RELOC(xen_sysret32, 1b+1)
214
215/*
216 Xen handles syscall callbacks much like ordinary exceptions,
217 which means we have:
218 - kernel gs
219 - kernel rsp
220 - an iret-like stack frame on the stack (including rcx and r11):
221 ss
222 rsp
223 rflags
224 cs
225 rip
226 r11
227 rsp-> rcx
228
229 In all the entrypoints, we undo all that to make it look
230 like a CPU-generated syscall/sysenter and jump to the normal
231 entrypoint.
232 */
233
234.macro undo_xen_syscall
 235	mov 0*8(%rsp),%rcx	/* user rip (syscall had left it in %rcx) */
 236	mov 1*8(%rsp),%r11	/* user rflags (likewise in %r11) */
 237	mov 5*8(%rsp),%rsp	/* back to the user stack pointer */
238.endm
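
For reference, a hypothetical C view of that callback frame, annotated with the offsets undo_xen_syscall relies on (the type name is illustrative, not a kernel type):

	#include <stdint.h>

	struct xen_syscall_frame_sketch {
		uint64_t rcx;			/* 0*8: user rip, which syscall had copied into %rcx */
		uint64_t r11;			/* 1*8: user rflags, likewise copied into %r11 */
		uint64_t rip, cs, rflags;
		uint64_t rsp;			/* 5*8: user stack pointer, reloaded last */
		uint64_t ss;
	};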
239
240/* Normal 64-bit system call target */
241ENTRY(xen_syscall_target)
242 undo_xen_syscall
243 jmp system_call_after_swapgs
244ENDPROC(xen_syscall_target)
245
246#ifdef CONFIG_IA32_EMULATION
247
248/* 32-bit compat syscall target */
249ENTRY(xen_syscall32_target)
250 undo_xen_syscall
251 jmp ia32_cstar_target
252ENDPROC(xen_syscall32_target)
253
254/* 32-bit compat sysenter target */
255ENTRY(xen_sysenter_target)
256 undo_xen_syscall
257 jmp ia32_sysenter_target
258ENDPROC(xen_sysenter_target)
259
260#else /* !CONFIG_IA32_EMULATION */
261
262ENTRY(xen_syscall32_target)
263ENTRY(xen_sysenter_target)
264 lea 16(%rsp), %rsp /* strip %rcx,%r11 */
265 mov $-ENOSYS, %rax
266 pushq $VGCF_in_syscall
267 jmp hypercall_iret
268ENDPROC(xen_syscall32_target)
269ENDPROC(xen_sysenter_target)
270
271#endif /* CONFIG_IA32_EMULATION */
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 6ec3b4f7719b..63d49a523ed3 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -5,14 +5,24 @@
5 5
6#include <linux/elfnote.h> 6#include <linux/elfnote.h>
7#include <linux/init.h> 7#include <linux/init.h>
8
8#include <asm/boot.h> 9#include <asm/boot.h>
10#include <asm/asm.h>
11#include <asm/page.h>
12
9#include <xen/interface/elfnote.h> 13#include <xen/interface/elfnote.h>
14#include <asm/xen/interface.h>
10 15
11 __INIT 16 __INIT
12ENTRY(startup_xen) 17ENTRY(startup_xen)
13 movl %esi,xen_start_info
14 cld 18 cld
15 movl $(init_thread_union+THREAD_SIZE),%esp 19#ifdef CONFIG_X86_32
20 mov %esi,xen_start_info
21 mov $init_thread_union+THREAD_SIZE,%esp
22#else
23 mov %rsi,xen_start_info
24 mov $init_thread_union+THREAD_SIZE,%rsp
25#endif
16 jmp xen_start_kernel 26 jmp xen_start_kernel
17 27
18 __FINIT 28 __FINIT
@@ -20,17 +30,26 @@ ENTRY(startup_xen)
20.pushsection .text 30.pushsection .text
21 .align PAGE_SIZE_asm 31 .align PAGE_SIZE_asm
22ENTRY(hypercall_page) 32ENTRY(hypercall_page)
23 .skip 0x1000 33 .skip PAGE_SIZE_asm
24.popsection 34.popsection
25 35
26 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") 36 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
27 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") 37 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
28 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") 38 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
29 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) 39#ifdef CONFIG_X86_32
30 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) 40 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
31 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) 41#else
42 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
43#endif
44 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
45 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
32 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") 46 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
33 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") 47 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
34 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") 48 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
49 ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
50 .quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
51 ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
52 ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
53 ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
35 54
36#endif /*CONFIG_XEN */ 55#endif /*CONFIG_XEN */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index f1063ae08037..dd3c23152a2e 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -9,22 +9,31 @@
9extern const char xen_hypervisor_callback[]; 9extern const char xen_hypervisor_callback[];
10extern const char xen_failsafe_callback[]; 10extern const char xen_failsafe_callback[];
11 11
12struct trap_info;
12void xen_copy_trap_info(struct trap_info *traps); 13void xen_copy_trap_info(struct trap_info *traps);
13 14
14DECLARE_PER_CPU(unsigned long, xen_cr3); 15DECLARE_PER_CPU(unsigned long, xen_cr3);
15DECLARE_PER_CPU(unsigned long, xen_current_cr3); 16DECLARE_PER_CPU(unsigned long, xen_current_cr3);
16 17
17extern struct start_info *xen_start_info; 18extern struct start_info *xen_start_info;
19extern struct shared_info xen_dummy_shared_info;
18extern struct shared_info *HYPERVISOR_shared_info; 20extern struct shared_info *HYPERVISOR_shared_info;
19 21
22void xen_setup_mfn_list_list(void);
23void xen_setup_shared_info(void);
24
20char * __init xen_memory_setup(void); 25char * __init xen_memory_setup(void);
21void __init xen_arch_setup(void); 26void __init xen_arch_setup(void);
22void __init xen_init_IRQ(void); 27void __init xen_init_IRQ(void);
23void xen_enable_sysenter(void); 28void xen_enable_sysenter(void);
29void xen_enable_syscall(void);
30void xen_vcpu_restore(void);
31
32void __init xen_build_dynamic_phys_to_machine(void);
24 33
25void xen_setup_timer(int cpu); 34void xen_setup_timer(int cpu);
26void xen_setup_cpu_clockevents(void); 35void xen_setup_cpu_clockevents(void);
27unsigned long xen_cpu_khz(void); 36unsigned long xen_tsc_khz(void);
28void __init xen_time_init(void); 37void __init xen_time_init(void);
29unsigned long xen_get_wallclock(void); 38unsigned long xen_get_wallclock(void);
30int xen_set_wallclock(unsigned long time); 39int xen_set_wallclock(unsigned long time);
@@ -36,23 +45,15 @@ bool xen_vcpu_stolen(int vcpu);
36 45
37void xen_mark_init_mm_pinned(void); 46void xen_mark_init_mm_pinned(void);
38 47
39void __init xen_fill_possible_map(void);
40
41void __init xen_setup_vcpu_info_placement(void); 48void __init xen_setup_vcpu_info_placement(void);
42void xen_smp_prepare_boot_cpu(void);
43void xen_smp_prepare_cpus(unsigned int max_cpus);
44int xen_cpu_up(unsigned int cpu);
45void xen_smp_cpus_done(unsigned int max_cpus);
46 49
47void xen_smp_send_stop(void); 50#ifdef CONFIG_SMP
48void xen_smp_send_reschedule(int cpu); 51void xen_smp_init(void);
49int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
50 int wait);
51int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
52 int nonatomic, int wait);
53 52
54int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), 53extern cpumask_t xen_cpu_initialized_map;
55 void *info, int wait); 54#else
55static inline void xen_smp_init(void) {}
56#endif
56 57
57 58
58/* Declare an asm function, along with symbols needed to make it 59/* Declare an asm function, along with symbols needed to make it
@@ -67,7 +68,11 @@ DECL_ASM(void, xen_irq_disable_direct, void);
67DECL_ASM(unsigned long, xen_save_fl_direct, void); 68DECL_ASM(unsigned long, xen_save_fl_direct, void);
68DECL_ASM(void, xen_restore_fl_direct, unsigned long); 69DECL_ASM(void, xen_restore_fl_direct, unsigned long);
69 70
71/* These are not functions, and cannot be called normally */
70void xen_iret(void); 72void xen_iret(void);
71void xen_sysexit(void); 73void xen_sysexit(void);
74void xen_sysret32(void);
75void xen_sysret64(void);
76void xen_adjust_exception_frame(void);
72 77
73#endif /* XEN_OPS_H */ 78#endif /* XEN_OPS_H */