path: root/arch/x86/xen
Diffstat (limited to 'arch/x86/xen')
-rw-r--r--   arch/x86/xen/Kconfig                                        20
-rw-r--r--   arch/x86/xen/Makefile                                        2
-rw-r--r--   arch/x86/xen/enlighten.c                                   880
-rw-r--r--   arch/x86/xen/manage.c                                      143
-rw-r--r--   arch/x86/xen/mmu.c                                         572
-rw-r--r--   arch/x86/xen/mmu.h                                          39
-rw-r--r--   arch/x86/xen/multicalls.c                                   41
-rw-r--r--   arch/x86/xen/multicalls.h                                   12
-rw-r--r--   arch/x86/xen/setup.c                                       109
-rw-r--r--   arch/x86/xen/smp.c                                         419
-rw-r--r--   arch/x86/xen/suspend.c                                      48
-rw-r--r--   arch/x86/xen/time.c                                         17
-rw-r--r--   arch/x86/xen/xen-asm_32.S (renamed from arch/x86/xen/xen-asm.S)  0
-rw-r--r--   arch/x86/xen/xen-asm_64.S                                  271
-rw-r--r--   arch/x86/xen/xen-head.S                                     31
-rw-r--r--   arch/x86/xen/xen-ops.h                                      35
16 files changed, 2041 insertions, 598 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 6c388e593bc8..3815e425f470 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,9 +6,25 @@ config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 select PARAVIRT_CLOCK 8 select PARAVIRT_CLOCK
9 depends on X86_32 9 depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
10 depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) 10 depends on X86_CMPXCHG && X86_TSC
11 help 11 help
12 This is the Linux Xen port. Enabling this will allow the 12 This is the Linux Xen port. Enabling this will allow the
13 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
14 Xen hypervisor. 14 Xen hypervisor.
15
16config XEN_MAX_DOMAIN_MEMORY
17 int "Maximum allowed size of a domain in gigabytes"
18 default 8 if X86_32
19 default 32 if X86_64
20 depends on XEN
21 help
22 The pseudo-physical to machine address array is sized
23 according to the maximum possible memory size of a Xen
24 domain. This array uses 1 page per gigabyte, so there's no
25 need to be too stingy here.
26
27config XEN_SAVE_RESTORE
28 bool
29 depends on PM
30 default y \ No newline at end of file
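
The XEN_MAX_DOMAIN_MEMORY help text is the sizing rule behind the p2m lookup tables that the mmu.c hunk further down introduces (P2M_ENTRIES_PER_PAGE, TOP_ENTRIES, p2m_top). A rough standalone sketch of the arithmetic, assuming 4 KiB pages and 8-byte entries as on x86_64, and assuming MAX_DOMAIN_PAGES is simply the configured gigabyte count expressed in 4 KiB frames:

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define GIGABYTES		32UL	/* stand-in for CONFIG_XEN_MAX_DOMAIN_MEMORY */
#define MAX_DOMAIN_PAGES	(GIGABYTES << (30 - 12))	/* 4 KiB frames covered */
#define P2M_ENTRIES_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long))
#define TOP_ENTRIES		(MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)

int main(void)
{
	unsigned long top_bytes = TOP_ENTRIES * sizeof(unsigned long *);

	/* one leaf page holds 512 entries, i.e. covers 2 MiB of guest memory;
	   p2m_top needs one pointer per leaf page */
	printf("frames covered : %lu\n", MAX_DOMAIN_PAGES);
	printf("p2m leaf pages : %lu\n", TOP_ENTRIES);
	printf("p2m_top size   : %lu bytes (%lu pages)\n",
	       top_bytes, top_bytes / PAGE_SIZE);
	return 0;
}

For the 64-bit default of 32 GiB this works out to 16384 leaf pages tracked by a 32-page p2m_top array, roughly one page of static bookkeeping per gigabyte, which is what the help text means by there being no need to be stingy.
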
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 3d8df981d5fd..59c1e539aed2 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
 obj-y		:= enlighten.o setup.o multicalls.o mmu.o \
-			time.o manage.o xen-asm.o grant-table.o
+			time.o xen-asm_$(BITS).o grant-table.o suspend.o
 
 obj-$(CONFIG_SMP)	+= smp.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f09c1c69c37a..8d28925ebed9 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -33,13 +33,16 @@
33#include <xen/interface/sched.h> 33#include <xen/interface/sched.h>
34#include <xen/features.h> 34#include <xen/features.h>
35#include <xen/page.h> 35#include <xen/page.h>
36#include <xen/hvc-console.h>
36 37
37#include <asm/paravirt.h> 38#include <asm/paravirt.h>
39#include <asm/apic.h>
38#include <asm/page.h> 40#include <asm/page.h>
39#include <asm/xen/hypercall.h> 41#include <asm/xen/hypercall.h>
40#include <asm/xen/hypervisor.h> 42#include <asm/xen/hypervisor.h>
41#include <asm/fixmap.h> 43#include <asm/fixmap.h>
42#include <asm/processor.h> 44#include <asm/processor.h>
45#include <asm/msr-index.h>
43#include <asm/setup.h> 46#include <asm/setup.h>
44#include <asm/desc.h> 47#include <asm/desc.h>
45#include <asm/pgtable.h> 48#include <asm/pgtable.h>
@@ -56,6 +59,18 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
56DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); 59DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
57 60
58/* 61/*
62 * Identity map, in addition to plain kernel map. This needs to be
63 * large enough to allocate page table pages to allocate the rest.
64 * Each page can map 2MB.
65 */
66static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
67
68#ifdef CONFIG_X86_64
69/* l3 pud for userspace vsyscall mapping */
70static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
71#endif /* CONFIG_X86_64 */
72
73/*
59 * Note about cr3 (pagetable base) values: 74 * Note about cr3 (pagetable base) values:
60 * 75 *
61 * xen_cr3 contains the current logical cr3 value; it contains the 76 * xen_cr3 contains the current logical cr3 value; it contains the
@@ -75,13 +90,13 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
75struct start_info *xen_start_info; 90struct start_info *xen_start_info;
76EXPORT_SYMBOL_GPL(xen_start_info); 91EXPORT_SYMBOL_GPL(xen_start_info);
77 92
78static /* __initdata */ struct shared_info dummy_shared_info; 93struct shared_info xen_dummy_shared_info;
79 94
80/* 95/*
81 * Point at some empty memory to start with. We map the real shared_info 96 * Point at some empty memory to start with. We map the real shared_info
82 * page as soon as fixmap is up and running. 97 * page as soon as fixmap is up and running.
83 */ 98 */
84struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info; 99struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
85 100
86/* 101/*
87 * Flag to determine whether vcpu info placement is available on all 102 * Flag to determine whether vcpu info placement is available on all
@@ -98,13 +113,13 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
98 */ 113 */
99static int have_vcpu_info_placement = 1; 114static int have_vcpu_info_placement = 1;
100 115
101static void __init xen_vcpu_setup(int cpu) 116static void xen_vcpu_setup(int cpu)
102{ 117{
103 struct vcpu_register_vcpu_info info; 118 struct vcpu_register_vcpu_info info;
104 int err; 119 int err;
105 struct vcpu_info *vcpup; 120 struct vcpu_info *vcpup;
106 121
107 BUG_ON(HYPERVISOR_shared_info == &dummy_shared_info); 122 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
108 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; 123 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
109 124
110 if (!have_vcpu_info_placement) 125 if (!have_vcpu_info_placement)
@@ -136,11 +151,45 @@ static void __init xen_vcpu_setup(int cpu)
136 } 151 }
137} 152}
138 153
154/*
155 * On restore, set the vcpu placement up again.
156 * If it fails, then we're in a bad state, since
157 * we can't back out from using it...
158 */
159void xen_vcpu_restore(void)
160{
161 if (have_vcpu_info_placement) {
162 int cpu;
163
164 for_each_online_cpu(cpu) {
165 bool other_cpu = (cpu != smp_processor_id());
166
167 if (other_cpu &&
168 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
169 BUG();
170
171 xen_vcpu_setup(cpu);
172
173 if (other_cpu &&
174 HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
175 BUG();
176 }
177
178 BUG_ON(!have_vcpu_info_placement);
179 }
180}
181
139static void __init xen_banner(void) 182static void __init xen_banner(void)
140{ 183{
184 unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
185 struct xen_extraversion extra;
186 HYPERVISOR_xen_version(XENVER_extraversion, &extra);
187
141 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 188 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
142 pv_info.name); 189 pv_info.name);
143 printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); 190 printk(KERN_INFO "Xen version: %d.%d%s%s\n",
191 version >> 16, version & 0xffff, extra.extraversion,
192 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
144} 193}
145 194
146static void xen_cpuid(unsigned int *ax, unsigned int *bx, 195static void xen_cpuid(unsigned int *ax, unsigned int *bx,
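
The reworked xen_banner() above replaces the start-info magic string with the version reported by the XENVER_version hypercall, which packs the major number in the upper 16 bits and the minor in the lower 16. A trivial sketch of the unpacking, with an invented value:

#include <stdio.h>

int main(void)
{
	unsigned version = (3u << 16) | 2;	/* invented: would mean Xen 3.2 */

	printf("Xen version: %u.%u\n", version >> 16, version & 0xffff);
	return 0;
}
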
@@ -235,13 +284,13 @@ static void xen_irq_enable(void)
235{ 284{
236 struct vcpu_info *vcpu; 285 struct vcpu_info *vcpu;
237 286
238 /* There's a one instruction preempt window here. We need to 287 /* We don't need to worry about being preempted here, since
239 make sure we're don't switch CPUs between getting the vcpu 288 either a) interrupts are disabled, so no preemption, or b)
240 pointer and updating the mask. */ 289 the caller is confused and is trying to re-enable interrupts
241 preempt_disable(); 290 on an indeterminate processor. */
291
242 vcpu = x86_read_percpu(xen_vcpu); 292 vcpu = x86_read_percpu(xen_vcpu);
243 vcpu->evtchn_upcall_mask = 0; 293 vcpu->evtchn_upcall_mask = 0;
244 preempt_enable_no_resched();
245 294
246 /* Doesn't matter if we get preempted here, because any 295 /* Doesn't matter if we get preempted here, because any
247 pending event will get dealt with anyway. */ 296 pending event will get dealt with anyway. */
@@ -254,7 +303,7 @@ static void xen_irq_enable(void)
254static void xen_safe_halt(void) 303static void xen_safe_halt(void)
255{ 304{
256 /* Blocking includes an implicit local_irq_enable(). */ 305 /* Blocking includes an implicit local_irq_enable(). */
257 if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0) 306 if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
258 BUG(); 307 BUG();
259} 308}
260 309
@@ -332,14 +381,6 @@ static void load_TLS_descriptor(struct thread_struct *t,
332 381
333static void xen_load_tls(struct thread_struct *t, unsigned int cpu) 382static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
334{ 383{
335 xen_mc_batch();
336
337 load_TLS_descriptor(t, cpu, 0);
338 load_TLS_descriptor(t, cpu, 1);
339 load_TLS_descriptor(t, cpu, 2);
340
341 xen_mc_issue(PARAVIRT_LAZY_CPU);
342
343 /* 384 /*
344 * XXX sleazy hack: If we're being called in a lazy-cpu zone, 385 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
345 * it means we're in a context switch, and %gs has just been 386 * it means we're in a context switch, and %gs has just been
@@ -348,11 +389,40 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
348 * Either way, it has been saved, and the new value will get 389 * Either way, it has been saved, and the new value will get
349 * loaded properly. This will go away as soon as Xen has been 390 * loaded properly. This will go away as soon as Xen has been
350 * modified to not save/restore %gs for normal hypercalls. 391 * modified to not save/restore %gs for normal hypercalls.
392 *
393 * On x86_64, this hack is not used for %gs, because gs points
394 * to KERNEL_GS_BASE (and uses it for PDA references), so we
395 * must not zero %gs on x86_64
396 *
397 * For x86_64, we need to zero %fs, otherwise we may get an
398 * exception between the new %fs descriptor being loaded and
399 * %fs being effectively cleared at __switch_to().
351 */ 400 */
352 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) 401 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
402#ifdef CONFIG_X86_32
353 loadsegment(gs, 0); 403 loadsegment(gs, 0);
404#else
405 loadsegment(fs, 0);
406#endif
407 }
408
409 xen_mc_batch();
410
411 load_TLS_descriptor(t, cpu, 0);
412 load_TLS_descriptor(t, cpu, 1);
413 load_TLS_descriptor(t, cpu, 2);
414
415 xen_mc_issue(PARAVIRT_LAZY_CPU);
354} 416}
355 417
418#ifdef CONFIG_X86_64
419static void xen_load_gs_index(unsigned int idx)
420{
421 if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
422 BUG();
423}
424#endif
425
356static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, 426static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
357 const void *ptr) 427 const void *ptr)
358{ 428{
@@ -369,23 +439,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
369 preempt_enable(); 439 preempt_enable();
370} 440}
371 441
372static int cvt_gate_to_trap(int vector, u32 low, u32 high, 442static int cvt_gate_to_trap(int vector, const gate_desc *val,
373 struct trap_info *info) 443 struct trap_info *info)
374{ 444{
375 u8 type, dpl; 445 if (val->type != 0xf && val->type != 0xe)
376
377 type = (high >> 8) & 0x1f;
378 dpl = (high >> 13) & 3;
379
380 if (type != 0xf && type != 0xe)
381 return 0; 446 return 0;
382 447
383 info->vector = vector; 448 info->vector = vector;
384 info->address = (high & 0xffff0000) | (low & 0x0000ffff); 449 info->address = gate_offset(*val);
385 info->cs = low >> 16; 450 info->cs = gate_segment(*val);
386 info->flags = dpl; 451 info->flags = val->dpl;
387 /* interrupt gates clear IF */ 452 /* interrupt gates clear IF */
388 if (type == 0xe) 453 if (val->type == 0xe)
389 info->flags |= 4; 454 info->flags |= 4;
390 455
391 return 1; 456 return 1;
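
The hunk above switches cvt_gate_to_trap() from hand-parsing the two raw 32-bit words of an IDT entry to using gate_desc and its gate_offset()/gate_segment() accessors, which also cope with 64-bit gates. For reference, a standalone sketch of what the removed open-coded path extracted; struct trap_like is a stand-in for Xen's trap_info, and the sample descriptor words are invented:

#include <stdio.h>
#include <stdint.h>

struct trap_like {		/* stand-in for Xen's struct trap_info */
	uint8_t  vector;
	uint8_t  flags;		/* DPL in bits 0-1, bit 2 set = clears IF */
	uint16_t cs;
	uint32_t address;
};

static int decode_gate32(int vector, uint32_t low, uint32_t high,
			 struct trap_like *t)
{
	uint8_t type = (high >> 8) & 0x1f;
	uint8_t dpl  = (high >> 13) & 3;

	if (type != 0xf && type != 0xe)		/* only trap/interrupt gates */
		return 0;

	t->vector  = vector;
	t->address = (high & 0xffff0000) | (low & 0x0000ffff);
	t->cs      = low >> 16;
	t->flags   = dpl;
	if (type == 0xe)			/* interrupt gates clear IF */
		t->flags |= 4;
	return 1;
}

int main(void)
{
	struct trap_like t;

	/* invented gate: handler 0xc01234ab, selector 0x60, DPL 0, type 0xe */
	if (decode_gate32(14, 0x006034ab, 0xc0128e00, &t))
		printf("vec %d -> %#x:%#x flags %#x\n",
		       t.vector, t.cs, t.address, t.flags);
	return 0;
}
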
@@ -412,11 +477,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
412 477
413 if (p >= start && (p + 8) <= end) { 478 if (p >= start && (p + 8) <= end) {
414 struct trap_info info[2]; 479 struct trap_info info[2];
415 u32 *desc = (u32 *)g;
416 480
417 info[1].address = 0; 481 info[1].address = 0;
418 482
419 if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0])) 483 if (cvt_gate_to_trap(entrynum, g, &info[0]))
420 if (HYPERVISOR_set_trap_table(info)) 484 if (HYPERVISOR_set_trap_table(info))
421 BUG(); 485 BUG();
422 } 486 }
@@ -429,13 +493,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
429{ 493{
430 unsigned in, out, count; 494 unsigned in, out, count;
431 495
432 count = (desc->size+1) / 8; 496 count = (desc->size+1) / sizeof(gate_desc);
433 BUG_ON(count > 256); 497 BUG_ON(count > 256);
434 498
435 for (in = out = 0; in < count; in++) { 499 for (in = out = 0; in < count; in++) {
436 const u32 *entry = (u32 *)(desc->address + in * 8); 500 gate_desc *entry = (gate_desc*)(desc->address) + in;
437 501
438 if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) 502 if (cvt_gate_to_trap(in, entry, &traps[out]))
439 out++; 503 out++;
440 } 504 }
441 traps[out].address = 0; 505 traps[out].address = 0;
@@ -517,16 +581,47 @@ static void xen_io_delay(void)
517} 581}
518 582
519#ifdef CONFIG_X86_LOCAL_APIC 583#ifdef CONFIG_X86_LOCAL_APIC
520static u32 xen_apic_read(unsigned long reg) 584static u32 xen_apic_read(u32 reg)
521{ 585{
522 return 0; 586 return 0;
523} 587}
524 588
525static void xen_apic_write(unsigned long reg, u32 val) 589static void xen_apic_write(u32 reg, u32 val)
526{ 590{
527 /* Warn to see if there's any stray references */ 591 /* Warn to see if there's any stray references */
528 WARN_ON(1); 592 WARN_ON(1);
529} 593}
594
595static u64 xen_apic_icr_read(void)
596{
597 return 0;
598}
599
600static void xen_apic_icr_write(u32 low, u32 id)
601{
602 /* Warn to see if there's any stray references */
603 WARN_ON(1);
604}
605
606static void xen_apic_wait_icr_idle(void)
607{
608 return;
609}
610
611static u32 xen_safe_apic_wait_icr_idle(void)
612{
613 return 0;
614}
615
616static struct apic_ops xen_basic_apic_ops = {
617 .read = xen_apic_read,
618 .write = xen_apic_write,
619 .icr_read = xen_apic_icr_read,
620 .icr_write = xen_apic_icr_write,
621 .wait_icr_idle = xen_apic_wait_icr_idle,
622 .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
623};
624
530#endif 625#endif
531 626
532static void xen_flush_tlb(void) 627static void xen_flush_tlb(void)
@@ -607,6 +702,30 @@ static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
607 xen_mc_issue(PARAVIRT_LAZY_MMU); 702 xen_mc_issue(PARAVIRT_LAZY_MMU);
608} 703}
609 704
705static void xen_clts(void)
706{
707 struct multicall_space mcs;
708
709 mcs = xen_mc_entry(0);
710
711 MULTI_fpu_taskswitch(mcs.mc, 0);
712
713 xen_mc_issue(PARAVIRT_LAZY_CPU);
714}
715
716static void xen_write_cr0(unsigned long cr0)
717{
718 struct multicall_space mcs;
719
720 /* Only pay attention to cr0.TS; everything else is
721 ignored. */
722 mcs = xen_mc_entry(0);
723
724 MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);
725
726 xen_mc_issue(PARAVIRT_LAZY_CPU);
727}
728
610static void xen_write_cr2(unsigned long cr2) 729static void xen_write_cr2(unsigned long cr2)
611{ 730{
612 x86_read_percpu(xen_vcpu)->arch.cr2 = cr2; 731 x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
@@ -624,8 +743,10 @@ static unsigned long xen_read_cr2_direct(void)
624 743
625static void xen_write_cr4(unsigned long cr4) 744static void xen_write_cr4(unsigned long cr4)
626{ 745{
627 /* Just ignore cr4 changes; Xen doesn't allow us to do 746 cr4 &= ~X86_CR4_PGE;
628 anything anyway. */ 747 cr4 &= ~X86_CR4_PSE;
748
749 native_write_cr4(cr4);
629} 750}
630 751
631static unsigned long xen_read_cr3(void) 752static unsigned long xen_read_cr3(void)
@@ -638,33 +759,89 @@ static void set_current_cr3(void *v)
638 x86_write_percpu(xen_current_cr3, (unsigned long)v); 759 x86_write_percpu(xen_current_cr3, (unsigned long)v);
639} 760}
640 761
641static void xen_write_cr3(unsigned long cr3) 762static void __xen_write_cr3(bool kernel, unsigned long cr3)
642{ 763{
643 struct mmuext_op *op; 764 struct mmuext_op *op;
644 struct multicall_space mcs; 765 struct multicall_space mcs;
645 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); 766 unsigned long mfn;
646 767
647 BUG_ON(preemptible()); 768 if (cr3)
769 mfn = pfn_to_mfn(PFN_DOWN(cr3));
770 else
771 mfn = 0;
648 772
649 mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ 773 WARN_ON(mfn == 0 && kernel);
650 774
651 /* Update while interrupts are disabled, so its atomic with 775 mcs = __xen_mc_entry(sizeof(*op));
652 respect to ipis */
653 x86_write_percpu(xen_cr3, cr3);
654 776
655 op = mcs.args; 777 op = mcs.args;
656 op->cmd = MMUEXT_NEW_BASEPTR; 778 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
657 op->arg1.mfn = mfn; 779 op->arg1.mfn = mfn;
658 780
659 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 781 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
660 782
661 /* Update xen_update_cr3 once the batch has actually 783 if (kernel) {
662 been submitted. */ 784 x86_write_percpu(xen_cr3, cr3);
663 xen_mc_callback(set_current_cr3, (void *)cr3); 785
786 /* Update xen_current_cr3 once the batch has actually
787 been submitted. */
788 xen_mc_callback(set_current_cr3, (void *)cr3);
789 }
790}
791
792static void xen_write_cr3(unsigned long cr3)
793{
794 BUG_ON(preemptible());
795
796 xen_mc_batch(); /* disables interrupts */
797
798 /* Update while interrupts are disabled, so its atomic with
799 respect to ipis */
800 x86_write_percpu(xen_cr3, cr3);
801
802 __xen_write_cr3(true, cr3);
803
804#ifdef CONFIG_X86_64
805 {
806 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
807 if (user_pgd)
808 __xen_write_cr3(false, __pa(user_pgd));
809 else
810 __xen_write_cr3(false, 0);
811 }
812#endif
664 813
665 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 814 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
666} 815}
667 816
817static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
818{
819 int ret;
820
821 ret = 0;
822
823 switch(msr) {
824#ifdef CONFIG_X86_64
825 unsigned which;
826 u64 base;
827
828 case MSR_FS_BASE: which = SEGBASE_FS; goto set;
829 case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
830 case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
831
832 set:
833 base = ((u64)high << 32) | low;
834 if (HYPERVISOR_set_segment_base(which, base) != 0)
835 ret = -EFAULT;
836 break;
837#endif
838 default:
839 ret = native_write_msr_safe(msr, low, high);
840 }
841
842 return ret;
843}
844
668/* Early in boot, while setting up the initial pagetable, assume 845/* Early in boot, while setting up the initial pagetable, assume
669 everything is pinned. */ 846 everything is pinned. */
670static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) 847static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
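
A few lines up, the new xen_write_msr_safe() diverts the FS/GS base MSRs to HYPERVISOR_set_segment_base(), which takes the full 64-bit base, whereas the wrmsr-style interface hands the value over as separate low and high 32-bit halves. The reassembly step in isolation, with invented values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* invented FS base as it would arrive via the MSR write interface */
	uint32_t low = 0x00005000, high = 0x00007f12;
	uint64_t base = ((uint64_t)high << 32) | low;

	printf("segment base = %#llx\n", (unsigned long long)base);
	return 0;
}
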
@@ -721,6 +898,48 @@ static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
721 xen_alloc_ptpage(mm, pfn, PT_PMD); 898 xen_alloc_ptpage(mm, pfn, PT_PMD);
722} 899}
723 900
901static int xen_pgd_alloc(struct mm_struct *mm)
902{
903 pgd_t *pgd = mm->pgd;
904 int ret = 0;
905
906 BUG_ON(PagePinned(virt_to_page(pgd)));
907
908#ifdef CONFIG_X86_64
909 {
910 struct page *page = virt_to_page(pgd);
911 pgd_t *user_pgd;
912
913 BUG_ON(page->private != 0);
914
915 ret = -ENOMEM;
916
917 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
918 page->private = (unsigned long)user_pgd;
919
920 if (user_pgd != NULL) {
921 user_pgd[pgd_index(VSYSCALL_START)] =
922 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
923 ret = 0;
924 }
925
926 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
927 }
928#endif
929
930 return ret;
931}
932
933static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
934{
935#ifdef CONFIG_X86_64
936 pgd_t *user_pgd = xen_get_user_pgd(pgd);
937
938 if (user_pgd)
939 free_page((unsigned long)user_pgd);
940#endif
941}
942
724/* This should never happen until we're OK to use struct page */ 943/* This should never happen until we're OK to use struct page */
725static void xen_release_ptpage(u32 pfn, unsigned level) 944static void xen_release_ptpage(u32 pfn, unsigned level)
726{ 945{
@@ -746,6 +965,18 @@ static void xen_release_pmd(u32 pfn)
746 xen_release_ptpage(pfn, PT_PMD); 965 xen_release_ptpage(pfn, PT_PMD);
747} 966}
748 967
968#if PAGETABLE_LEVELS == 4
969static void xen_alloc_pud(struct mm_struct *mm, u32 pfn)
970{
971 xen_alloc_ptpage(mm, pfn, PT_PUD);
972}
973
974static void xen_release_pud(u32 pfn)
975{
976 xen_release_ptpage(pfn, PT_PUD);
977}
978#endif
979
749#ifdef CONFIG_HIGHPTE 980#ifdef CONFIG_HIGHPTE
750static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) 981static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
751{ 982{
@@ -784,68 +1015,16 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
784 1015
785static __init void xen_pagetable_setup_start(pgd_t *base) 1016static __init void xen_pagetable_setup_start(pgd_t *base)
786{ 1017{
787 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
788 int i;
789
790 /* special set_pte for pagetable initialization */
791 pv_mmu_ops.set_pte = xen_set_pte_init;
792
793 init_mm.pgd = base;
794 /*
795 * copy top-level of Xen-supplied pagetable into place. This
796 * is a stand-in while we copy the pmd pages.
797 */
798 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
799
800 /*
801 * For PAE, need to allocate new pmds, rather than
802 * share Xen's, since Xen doesn't like pmd's being
803 * shared between address spaces.
804 */
805 for (i = 0; i < PTRS_PER_PGD; i++) {
806 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
807 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
808
809 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
810 PAGE_SIZE);
811
812 make_lowmem_page_readonly(pmd);
813
814 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
815 } else
816 pgd_clear(&base[i]);
817 }
818
819 /* make sure zero_page is mapped RO so we can use it in pagetables */
820 make_lowmem_page_readonly(empty_zero_page);
821 make_lowmem_page_readonly(base);
822 /*
823 * Switch to new pagetable. This is done before
824 * pagetable_init has done anything so that the new pages
825 * added to the table can be prepared properly for Xen.
826 */
827 xen_write_cr3(__pa(base));
828
829 /* Unpin initial Xen pagetable */
830 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
831 PFN_DOWN(__pa(xen_start_info->pt_base)));
832} 1018}
833 1019
834static __init void setup_shared_info(void) 1020void xen_setup_shared_info(void)
835{ 1021{
836 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 1022 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
837 unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); 1023 set_fixmap(FIX_PARAVIRT_BOOTMAP,
838 1024 xen_start_info->shared_info);
839 /* 1025
840 * Create a mapping for the shared info page. 1026 HYPERVISOR_shared_info =
841 * Should be set_fixmap(), but shared_info is a machine 1027 (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
842 * address with no corresponding pseudo-phys address.
843 */
844 set_pte_mfn(addr,
845 PFN_DOWN(xen_start_info->shared_info),
846 PAGE_KERNEL);
847
848 HYPERVISOR_shared_info = (struct shared_info *)addr;
849 } else 1028 } else
850 HYPERVISOR_shared_info = 1029 HYPERVISOR_shared_info =
851 (struct shared_info *)__va(xen_start_info->shared_info); 1030 (struct shared_info *)__va(xen_start_info->shared_info);
@@ -854,27 +1033,43 @@ static __init void setup_shared_info(void)
854 /* In UP this is as good a place as any to set up shared info */ 1033 /* In UP this is as good a place as any to set up shared info */
855 xen_setup_vcpu_info_placement(); 1034 xen_setup_vcpu_info_placement();
856#endif 1035#endif
1036
1037 xen_setup_mfn_list_list();
857} 1038}
858 1039
859static __init void xen_pagetable_setup_done(pgd_t *base) 1040static __init void xen_pagetable_setup_done(pgd_t *base)
860{ 1041{
1042 xen_setup_shared_info();
1043}
1044
1045static __init void xen_post_allocator_init(void)
1046{
1047 pv_mmu_ops.set_pte = xen_set_pte;
1048 pv_mmu_ops.set_pmd = xen_set_pmd;
1049 pv_mmu_ops.set_pud = xen_set_pud;
1050#if PAGETABLE_LEVELS == 4
1051 pv_mmu_ops.set_pgd = xen_set_pgd;
1052#endif
1053
861 /* This will work as long as patching hasn't happened yet 1054 /* This will work as long as patching hasn't happened yet
862 (which it hasn't) */ 1055 (which it hasn't) */
863 pv_mmu_ops.alloc_pte = xen_alloc_pte; 1056 pv_mmu_ops.alloc_pte = xen_alloc_pte;
864 pv_mmu_ops.alloc_pmd = xen_alloc_pmd; 1057 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
865 pv_mmu_ops.release_pte = xen_release_pte; 1058 pv_mmu_ops.release_pte = xen_release_pte;
866 pv_mmu_ops.release_pmd = xen_release_pmd; 1059 pv_mmu_ops.release_pmd = xen_release_pmd;
867 pv_mmu_ops.set_pte = xen_set_pte; 1060#if PAGETABLE_LEVELS == 4
868 1061 pv_mmu_ops.alloc_pud = xen_alloc_pud;
869 setup_shared_info(); 1062 pv_mmu_ops.release_pud = xen_release_pud;
1063#endif
870 1064
871 /* Actually pin the pagetable down, but we can't set PG_pinned 1065#ifdef CONFIG_X86_64
872 yet because the page structures don't exist yet. */ 1066 SetPagePinned(virt_to_page(level3_user_vsyscall));
873 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base))); 1067#endif
1068 xen_mark_init_mm_pinned();
874} 1069}
875 1070
876/* This is called once we have the cpu_possible_map */ 1071/* This is called once we have the cpu_possible_map */
877void __init xen_setup_vcpu_info_placement(void) 1072void xen_setup_vcpu_info_placement(void)
878{ 1073{
879 int cpu; 1074 int cpu;
880 1075
@@ -883,6 +1078,7 @@ void __init xen_setup_vcpu_info_placement(void)
883 1078
884 /* xen_vcpu_setup managed to place the vcpu_info within the 1079 /* xen_vcpu_setup managed to place the vcpu_info within the
885 percpu area for all cpus, so make use of it */ 1080 percpu area for all cpus, so make use of it */
1081#ifdef CONFIG_X86_32
886 if (have_vcpu_info_placement) { 1082 if (have_vcpu_info_placement) {
887 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 1083 printk(KERN_INFO "Xen: using vcpu_info placement\n");
888 1084
@@ -892,6 +1088,7 @@ void __init xen_setup_vcpu_info_placement(void)
892 pv_irq_ops.irq_enable = xen_irq_enable_direct; 1088 pv_irq_ops.irq_enable = xen_irq_enable_direct;
893 pv_mmu_ops.read_cr2 = xen_read_cr2_direct; 1089 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
894 } 1090 }
1091#endif
895} 1092}
896 1093
897static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, 1094static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -912,10 +1109,12 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
912 goto patch_site 1109 goto patch_site
913 1110
914 switch (type) { 1111 switch (type) {
1112#ifdef CONFIG_X86_32
915 SITE(pv_irq_ops, irq_enable); 1113 SITE(pv_irq_ops, irq_enable);
916 SITE(pv_irq_ops, irq_disable); 1114 SITE(pv_irq_ops, irq_disable);
917 SITE(pv_irq_ops, save_fl); 1115 SITE(pv_irq_ops, save_fl);
918 SITE(pv_irq_ops, restore_fl); 1116 SITE(pv_irq_ops, restore_fl);
1117#endif /* CONFIG_X86_32 */
919#undef SITE 1118#undef SITE
920 1119
921 patch_site: 1120 patch_site:
@@ -947,6 +1146,49 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
947 return ret; 1146 return ret;
948} 1147}
949 1148
1149static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1150{
1151 pte_t pte;
1152
1153 phys >>= PAGE_SHIFT;
1154
1155 switch (idx) {
1156 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1157#ifdef CONFIG_X86_F00F_BUG
1158 case FIX_F00F_IDT:
1159#endif
1160#ifdef CONFIG_X86_32
1161 case FIX_WP_TEST:
1162 case FIX_VDSO:
1163# ifdef CONFIG_HIGHMEM
1164 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1165# endif
1166#else
1167 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1168#endif
1169#ifdef CONFIG_X86_LOCAL_APIC
1170 case FIX_APIC_BASE: /* maps dummy local APIC */
1171#endif
1172 pte = pfn_pte(phys, prot);
1173 break;
1174
1175 default:
1176 pte = mfn_pte(phys, prot);
1177 break;
1178 }
1179
1180 __native_set_fixmap(idx, pte);
1181
1182#ifdef CONFIG_X86_64
1183 /* Replicate changes to map the vsyscall page into the user
1184 pagetable vsyscall mapping. */
1185 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1186 unsigned long vaddr = __fix_to_virt(idx);
1187 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1188 }
1189#endif
1190}
1191
950static const struct pv_info xen_info __initdata = { 1192static const struct pv_info xen_info __initdata = {
951 .paravirt_enabled = 1, 1193 .paravirt_enabled = 1,
952 .shared_kernel_pmd = 0, 1194 .shared_kernel_pmd = 0,
@@ -960,7 +1202,7 @@ static const struct pv_init_ops xen_init_ops __initdata = {
960 .banner = xen_banner, 1202 .banner = xen_banner,
961 .memory_setup = xen_memory_setup, 1203 .memory_setup = xen_memory_setup,
962 .arch_setup = xen_arch_setup, 1204 .arch_setup = xen_arch_setup,
963 .post_allocator_init = xen_mark_init_mm_pinned, 1205 .post_allocator_init = xen_post_allocator_init,
964}; 1206};
965 1207
966static const struct pv_time_ops xen_time_ops __initdata = { 1208static const struct pv_time_ops xen_time_ops __initdata = {
@@ -968,7 +1210,7 @@ static const struct pv_time_ops xen_time_ops __initdata = {
968 1210
969 .set_wallclock = xen_set_wallclock, 1211 .set_wallclock = xen_set_wallclock,
970 .get_wallclock = xen_get_wallclock, 1212 .get_wallclock = xen_get_wallclock,
971 .get_cpu_khz = xen_cpu_khz, 1213 .get_tsc_khz = xen_tsc_khz,
972 .sched_clock = xen_sched_clock, 1214 .sched_clock = xen_sched_clock,
973}; 1215};
974 1216
@@ -978,10 +1220,10 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
978 .set_debugreg = xen_set_debugreg, 1220 .set_debugreg = xen_set_debugreg,
979 .get_debugreg = xen_get_debugreg, 1221 .get_debugreg = xen_get_debugreg,
980 1222
981 .clts = native_clts, 1223 .clts = xen_clts,
982 1224
983 .read_cr0 = native_read_cr0, 1225 .read_cr0 = native_read_cr0,
984 .write_cr0 = native_write_cr0, 1226 .write_cr0 = xen_write_cr0,
985 1227
986 .read_cr4 = native_read_cr4, 1228 .read_cr4 = native_read_cr4,
987 .read_cr4_safe = native_read_cr4_safe, 1229 .read_cr4_safe = native_read_cr4_safe,
@@ -990,18 +1232,25 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
990 .wbinvd = native_wbinvd, 1232 .wbinvd = native_wbinvd,
991 1233
992 .read_msr = native_read_msr_safe, 1234 .read_msr = native_read_msr_safe,
993 .write_msr = native_write_msr_safe, 1235 .write_msr = xen_write_msr_safe,
994 .read_tsc = native_read_tsc, 1236 .read_tsc = native_read_tsc,
995 .read_pmc = native_read_pmc, 1237 .read_pmc = native_read_pmc,
996 1238
997 .iret = xen_iret, 1239 .iret = xen_iret,
998 .irq_enable_syscall_ret = xen_sysexit, 1240 .irq_enable_sysexit = xen_sysexit,
1241#ifdef CONFIG_X86_64
1242 .usergs_sysret32 = xen_sysret32,
1243 .usergs_sysret64 = xen_sysret64,
1244#endif
999 1245
1000 .load_tr_desc = paravirt_nop, 1246 .load_tr_desc = paravirt_nop,
1001 .set_ldt = xen_set_ldt, 1247 .set_ldt = xen_set_ldt,
1002 .load_gdt = xen_load_gdt, 1248 .load_gdt = xen_load_gdt,
1003 .load_idt = xen_load_idt, 1249 .load_idt = xen_load_idt,
1004 .load_tls = xen_load_tls, 1250 .load_tls = xen_load_tls,
1251#ifdef CONFIG_X86_64
1252 .load_gs_index = xen_load_gs_index,
1253#endif
1005 1254
1006 .store_gdt = native_store_gdt, 1255 .store_gdt = native_store_gdt,
1007 .store_idt = native_store_idt, 1256 .store_idt = native_store_idt,
@@ -1015,27 +1264,47 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1015 .set_iopl_mask = xen_set_iopl_mask, 1264 .set_iopl_mask = xen_set_iopl_mask,
1016 .io_delay = xen_io_delay, 1265 .io_delay = xen_io_delay,
1017 1266
1267 /* Xen takes care of %gs when switching to usermode for us */
1268 .swapgs = paravirt_nop,
1269
1018 .lazy_mode = { 1270 .lazy_mode = {
1019 .enter = paravirt_enter_lazy_cpu, 1271 .enter = paravirt_enter_lazy_cpu,
1020 .leave = xen_leave_lazy, 1272 .leave = xen_leave_lazy,
1021 }, 1273 },
1022}; 1274};
1023 1275
1276static void __init __xen_init_IRQ(void)
1277{
1278#ifdef CONFIG_X86_64
1279 int i;
1280
1281 /* Create identity vector->irq map */
1282 for(i = 0; i < NR_VECTORS; i++) {
1283 int cpu;
1284
1285 for_each_possible_cpu(cpu)
1286 per_cpu(vector_irq, cpu)[i] = i;
1287 }
1288#endif /* CONFIG_X86_64 */
1289
1290 xen_init_IRQ();
1291}
1292
1024static const struct pv_irq_ops xen_irq_ops __initdata = { 1293static const struct pv_irq_ops xen_irq_ops __initdata = {
1025 .init_IRQ = xen_init_IRQ, 1294 .init_IRQ = __xen_init_IRQ,
1026 .save_fl = xen_save_fl, 1295 .save_fl = xen_save_fl,
1027 .restore_fl = xen_restore_fl, 1296 .restore_fl = xen_restore_fl,
1028 .irq_disable = xen_irq_disable, 1297 .irq_disable = xen_irq_disable,
1029 .irq_enable = xen_irq_enable, 1298 .irq_enable = xen_irq_enable,
1030 .safe_halt = xen_safe_halt, 1299 .safe_halt = xen_safe_halt,
1031 .halt = xen_halt, 1300 .halt = xen_halt,
1301#ifdef CONFIG_X86_64
1302 .adjust_exception_frame = xen_adjust_exception_frame,
1303#endif
1032}; 1304};
1033 1305
1034static const struct pv_apic_ops xen_apic_ops __initdata = { 1306static const struct pv_apic_ops xen_apic_ops __initdata = {
1035#ifdef CONFIG_X86_LOCAL_APIC 1307#ifdef CONFIG_X86_LOCAL_APIC
1036 .apic_write = xen_apic_write,
1037 .apic_write_atomic = xen_apic_write,
1038 .apic_read = xen_apic_read,
1039 .setup_boot_clock = paravirt_nop, 1308 .setup_boot_clock = paravirt_nop,
1040 .setup_secondary_clock = paravirt_nop, 1309 .setup_secondary_clock = paravirt_nop,
1041 .startup_ipi_hook = paravirt_nop, 1310 .startup_ipi_hook = paravirt_nop,
@@ -1060,6 +1329,9 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1060 .pte_update = paravirt_nop, 1329 .pte_update = paravirt_nop,
1061 .pte_update_defer = paravirt_nop, 1330 .pte_update_defer = paravirt_nop,
1062 1331
1332 .pgd_alloc = xen_pgd_alloc,
1333 .pgd_free = xen_pgd_free,
1334
1063 .alloc_pte = xen_alloc_pte_init, 1335 .alloc_pte = xen_alloc_pte_init,
1064 .release_pte = xen_release_pte_init, 1336 .release_pte = xen_release_pte_init,
1065 .alloc_pmd = xen_alloc_pte_init, 1337 .alloc_pmd = xen_alloc_pte_init,
@@ -1070,25 +1342,44 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1070 .kmap_atomic_pte = xen_kmap_atomic_pte, 1342 .kmap_atomic_pte = xen_kmap_atomic_pte,
1071#endif 1343#endif
1072 1344
1073 .set_pte = NULL, /* see xen_pagetable_setup_* */ 1345#ifdef CONFIG_X86_64
1346 .set_pte = xen_set_pte,
1347#else
1348 .set_pte = xen_set_pte_init,
1349#endif
1074 .set_pte_at = xen_set_pte_at, 1350 .set_pte_at = xen_set_pte_at,
1075 .set_pmd = xen_set_pmd, 1351 .set_pmd = xen_set_pmd_hyper,
1352
1353 .ptep_modify_prot_start = __ptep_modify_prot_start,
1354 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1076 1355
1077 .pte_val = xen_pte_val, 1356 .pte_val = xen_pte_val,
1357 .pte_flags = native_pte_val,
1078 .pgd_val = xen_pgd_val, 1358 .pgd_val = xen_pgd_val,
1079 1359
1080 .make_pte = xen_make_pte, 1360 .make_pte = xen_make_pte,
1081 .make_pgd = xen_make_pgd, 1361 .make_pgd = xen_make_pgd,
1082 1362
1363#ifdef CONFIG_X86_PAE
1083 .set_pte_atomic = xen_set_pte_atomic, 1364 .set_pte_atomic = xen_set_pte_atomic,
1084 .set_pte_present = xen_set_pte_at, 1365 .set_pte_present = xen_set_pte_at,
1085 .set_pud = xen_set_pud,
1086 .pte_clear = xen_pte_clear, 1366 .pte_clear = xen_pte_clear,
1087 .pmd_clear = xen_pmd_clear, 1367 .pmd_clear = xen_pmd_clear,
1368#endif /* CONFIG_X86_PAE */
1369 .set_pud = xen_set_pud_hyper,
1088 1370
1089 .make_pmd = xen_make_pmd, 1371 .make_pmd = xen_make_pmd,
1090 .pmd_val = xen_pmd_val, 1372 .pmd_val = xen_pmd_val,
1091 1373
1374#if PAGETABLE_LEVELS == 4
1375 .pud_val = xen_pud_val,
1376 .make_pud = xen_make_pud,
1377 .set_pgd = xen_set_pgd_hyper,
1378
1379 .alloc_pud = xen_alloc_pte_init,
1380 .release_pud = xen_release_pte_init,
1381#endif /* PAGETABLE_LEVELS == 4 */
1382
1092 .activate_mm = xen_activate_mm, 1383 .activate_mm = xen_activate_mm,
1093 .dup_mmap = xen_dup_mmap, 1384 .dup_mmap = xen_dup_mmap,
1094 .exit_mmap = xen_exit_mmap, 1385 .exit_mmap = xen_exit_mmap,
@@ -1097,28 +1388,19 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1097 .enter = paravirt_enter_lazy_mmu, 1388 .enter = paravirt_enter_lazy_mmu,
1098 .leave = xen_leave_lazy, 1389 .leave = xen_leave_lazy,
1099 }, 1390 },
1100};
1101 1391
1102#ifdef CONFIG_SMP 1392 .set_fixmap = xen_set_fixmap,
1103static const struct smp_ops xen_smp_ops __initdata = {
1104 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
1105 .smp_prepare_cpus = xen_smp_prepare_cpus,
1106 .cpu_up = xen_cpu_up,
1107 .smp_cpus_done = xen_smp_cpus_done,
1108
1109 .smp_send_stop = xen_smp_send_stop,
1110 .smp_send_reschedule = xen_smp_send_reschedule,
1111 .smp_call_function_mask = xen_smp_call_function_mask,
1112}; 1393};
1113#endif /* CONFIG_SMP */
1114 1394
1115static void xen_reboot(int reason) 1395static void xen_reboot(int reason)
1116{ 1396{
1397 struct sched_shutdown r = { .reason = reason };
1398
1117#ifdef CONFIG_SMP 1399#ifdef CONFIG_SMP
1118 smp_send_stop(); 1400 smp_send_stop();
1119#endif 1401#endif
1120 1402
1121 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason)) 1403 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
1122 BUG(); 1404 BUG();
1123} 1405}
1124 1406
@@ -1154,6 +1436,7 @@ static const struct machine_ops __initdata xen_machine_ops = {
1154 1436
1155static void __init xen_reserve_top(void) 1437static void __init xen_reserve_top(void)
1156{ 1438{
1439#ifdef CONFIG_X86_32
1157 unsigned long top = HYPERVISOR_VIRT_START; 1440 unsigned long top = HYPERVISOR_VIRT_START;
1158 struct xen_platform_parameters pp; 1441 struct xen_platform_parameters pp;
1159 1442
@@ -1161,7 +1444,247 @@ static void __init xen_reserve_top(void)
1161 top = pp.virt_start; 1444 top = pp.virt_start;
1162 1445
1163 reserve_top_address(-top + 2 * PAGE_SIZE); 1446 reserve_top_address(-top + 2 * PAGE_SIZE);
1447#endif /* CONFIG_X86_32 */
1448}
1449
1450/*
1451 * Like __va(), but returns address in the kernel mapping (which is
1452 * all we have until the physical memory mapping has been set up.
1453 */
1454static void *__ka(phys_addr_t paddr)
1455{
1456#ifdef CONFIG_X86_64
1457 return (void *)(paddr + __START_KERNEL_map);
1458#else
1459 return __va(paddr);
1460#endif
1461}
1462
1463/* Convert a machine address to physical address */
1464static unsigned long m2p(phys_addr_t maddr)
1465{
1466 phys_addr_t paddr;
1467
1468 maddr &= PTE_PFN_MASK;
1469 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1470
1471 return paddr;
1472}
1473
1474/* Convert a machine address to kernel virtual */
1475static void *m2v(phys_addr_t maddr)
1476{
1477 return __ka(m2p(maddr));
1478}
1479
1480#ifdef CONFIG_X86_64
1481static void walk(pgd_t *pgd, unsigned long addr)
1482{
1483 unsigned l4idx = pgd_index(addr);
1484 unsigned l3idx = pud_index(addr);
1485 unsigned l2idx = pmd_index(addr);
1486 unsigned l1idx = pte_index(addr);
1487 pgd_t l4;
1488 pud_t l3;
1489 pmd_t l2;
1490 pte_t l1;
1491
1492 xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
1493 pgd, addr, l4idx, l3idx, l2idx, l1idx);
1494
1495 l4 = pgd[l4idx];
1496 xen_raw_printk(" l4: %016lx\n", l4.pgd);
1497 xen_raw_printk(" %016lx\n", pgd_val(l4));
1498
1499 l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
1500 xen_raw_printk(" l3: %016lx\n", l3.pud);
1501 xen_raw_printk(" %016lx\n", pud_val(l3));
1502
1503 l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
1504 xen_raw_printk(" l2: %016lx\n", l2.pmd);
1505 xen_raw_printk(" %016lx\n", pmd_val(l2));
1506
1507 l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
1508 xen_raw_printk(" l1: %016lx\n", l1.pte);
1509 xen_raw_printk(" %016lx\n", pte_val(l1));
1510}
1511#endif
1512
1513static void set_page_prot(void *addr, pgprot_t prot)
1514{
1515 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1516 pte_t pte = pfn_pte(pfn, prot);
1517
1518 xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
1519 addr, pfn, get_phys_to_machine(pfn),
1520 pgprot_val(prot), pte.pte);
1521
1522 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1523 BUG();
1524}
1525
1526static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1527{
1528 unsigned pmdidx, pteidx;
1529 unsigned ident_pte;
1530 unsigned long pfn;
1531
1532 ident_pte = 0;
1533 pfn = 0;
1534 for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1535 pte_t *pte_page;
1536
1537 /* Reuse or allocate a page of ptes */
1538 if (pmd_present(pmd[pmdidx]))
1539 pte_page = m2v(pmd[pmdidx].pmd);
1540 else {
1541 /* Check for free pte pages */
1542 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1543 break;
1544
1545 pte_page = &level1_ident_pgt[ident_pte];
1546 ident_pte += PTRS_PER_PTE;
1547
1548 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1549 }
1550
1551 /* Install mappings */
1552 for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1553 pte_t pte;
1554
1555 if (pfn > max_pfn_mapped)
1556 max_pfn_mapped = pfn;
1557
1558 if (!pte_none(pte_page[pteidx]))
1559 continue;
1560
1561 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1562 pte_page[pteidx] = pte;
1563 }
1564 }
1565
1566 for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1567 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1568
1569 set_page_prot(pmd, PAGE_KERNEL_RO);
1570}
1571
1572#ifdef CONFIG_X86_64
1573static void convert_pfn_mfn(void *v)
1574{
1575 pte_t *pte = v;
1576 int i;
1577
1578 /* All levels are converted the same way, so just treat them
1579 as ptes. */
1580 for(i = 0; i < PTRS_PER_PTE; i++)
1581 pte[i] = xen_make_pte(pte[i].pte);
1582}
1583
1584/*
1585 * Set up the inital kernel pagetable.
1586 *
1587 * We can construct this by grafting the Xen provided pagetable into
1588 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1589 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1590 * means that only the kernel has a physical mapping to start with -
1591 * but that's enough to get __va working. We need to fill in the rest
1592 * of the physical mapping once some sort of allocator has been set
1593 * up.
1594 */
1595static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1596{
1597 pud_t *l3;
1598 pmd_t *l2;
1599
1600 /* Zap identity mapping */
1601 init_level4_pgt[0] = __pgd(0);
1602
1603 /* Pre-constructed entries are in pfn, so convert to mfn */
1604 convert_pfn_mfn(init_level4_pgt);
1605 convert_pfn_mfn(level3_ident_pgt);
1606 convert_pfn_mfn(level3_kernel_pgt);
1607
1608 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1609 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1610
1611 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1612 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1613
1614 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1615 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1616 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1617
1618 /* Set up identity map */
1619 xen_map_identity_early(level2_ident_pgt, max_pfn);
1620
1621 /* Make pagetable pieces RO */
1622 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1623 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1624 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1625 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1626 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1627 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1628
1629 /* Pin down new L4 */
1630 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1631 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1632
1633 /* Unpin Xen-provided one */
1634 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1635
1636 /* Switch over */
1637 pgd = init_level4_pgt;
1638
1639 /*
1640 * At this stage there can be no user pgd, and no page
1641 * structure to attach it to, so make sure we just set kernel
1642 * pgd.
1643 */
1644 xen_mc_batch();
1645 __xen_write_cr3(true, __pa(pgd));
1646 xen_mc_issue(PARAVIRT_LAZY_CPU);
1647
1648 reserve_early(__pa(xen_start_info->pt_base),
1649 __pa(xen_start_info->pt_base +
1650 xen_start_info->nr_pt_frames * PAGE_SIZE),
1651 "XEN PAGETABLES");
1652
1653 return pgd;
1164} 1654}
1655#else /* !CONFIG_X86_64 */
1656static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1657
1658static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1659{
1660 pmd_t *kernel_pmd;
1661
1662 init_pg_tables_start = __pa(pgd);
1663 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1664 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1665
1666 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1667 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1668
1669 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1670
1671 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1672 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1673 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1674
1675 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1676 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1677 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1678
1679 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1680
1681 xen_write_cr3(__pa(swapper_pg_dir));
1682
1683 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1684
1685 return swapper_pg_dir;
1686}
1687#endif /* CONFIG_X86_64 */
1165 1688
1166/* First C function to be called on Xen boot */ 1689/* First C function to be called on Xen boot */
1167asmlinkage void __init xen_start_kernel(void) 1690asmlinkage void __init xen_start_kernel(void)
@@ -1173,6 +1696,8 @@ asmlinkage void __init xen_start_kernel(void)
1173 1696
1174 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); 1697 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);
1175 1698
1699 xen_setup_features();
1700
1176 /* Install Xen paravirt ops */ 1701 /* Install Xen paravirt ops */
1177 pv_info = xen_info; 1702 pv_info = xen_info;
1178 pv_init_ops = xen_init_ops; 1703 pv_init_ops = xen_init_ops;
@@ -1182,59 +1707,92 @@ asmlinkage void __init xen_start_kernel(void)
1182 pv_apic_ops = xen_apic_ops; 1707 pv_apic_ops = xen_apic_ops;
1183 pv_mmu_ops = xen_mmu_ops; 1708 pv_mmu_ops = xen_mmu_ops;
1184 1709
1710#ifdef CONFIG_X86_LOCAL_APIC
1711 /*
1712 * set up the basic apic ops.
1713 */
1714 apic_ops = &xen_basic_apic_ops;
1715#endif
1716
1717 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
1718 pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
1719 pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
1720 }
1721
1185 machine_ops = xen_machine_ops; 1722 machine_ops = xen_machine_ops;
1186 1723
1187#ifdef CONFIG_SMP 1724#ifdef CONFIG_X86_64
1188 smp_ops = xen_smp_ops; 1725 /* Disable until direct per-cpu data access. */
1726 have_vcpu_info_placement = 0;
1727 x86_64_init_pda();
1189#endif 1728#endif
1190 1729
1191 xen_setup_features(); 1730 xen_smp_init();
1192 1731
1193 /* Get mfn list */ 1732 /* Get mfn list */
1194 if (!xen_feature(XENFEAT_auto_translated_physmap)) 1733 if (!xen_feature(XENFEAT_auto_translated_physmap))
1195 phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list; 1734 xen_build_dynamic_phys_to_machine();
1196 1735
1197 pgd = (pgd_t *)xen_start_info->pt_base; 1736 pgd = (pgd_t *)xen_start_info->pt_base;
1198 1737
1199 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; 1738 /* Prevent unwanted bits from being set in PTEs. */
1200 1739 __supported_pte_mask &= ~_PAGE_GLOBAL;
1201 init_mm.pgd = pgd; /* use the Xen pagetables to start */ 1740 if (!is_initial_xendomain())
1202 1741 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1203 /* keep using Xen gdt for now; no urgent need to change it */
1204
1205 x86_write_percpu(xen_cr3, __pa(pgd));
1206 x86_write_percpu(xen_current_cr3, __pa(pgd));
1207 1742
1208 /* Don't do the full vcpu_info placement stuff until we have a 1743 /* Don't do the full vcpu_info placement stuff until we have a
1209 possible map and a non-dummy shared_info. */ 1744 possible map and a non-dummy shared_info. */
1210 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1745 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1211 1746
1747 xen_raw_console_write("mapping kernel into physical memory\n");
1748 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
1749
1750 init_mm.pgd = pgd;
1751
1752 /* keep using Xen gdt for now; no urgent need to change it */
1753
1212 pv_info.kernel_rpl = 1; 1754 pv_info.kernel_rpl = 1;
1213 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1755 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1214 pv_info.kernel_rpl = 0; 1756 pv_info.kernel_rpl = 0;
1215 1757
1216 /* Prevent unwanted bits from being set in PTEs. */
1217 __supported_pte_mask &= ~_PAGE_GLOBAL;
1218 if (!is_initial_xendomain())
1219 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1220
1221 /* set the limit of our address space */ 1758 /* set the limit of our address space */
1222 xen_reserve_top(); 1759 xen_reserve_top();
1223 1760
1761#ifdef CONFIG_X86_32
1224 /* set up basic CPUID stuff */ 1762 /* set up basic CPUID stuff */
1225 cpu_detect(&new_cpu_data); 1763 cpu_detect(&new_cpu_data);
1226 new_cpu_data.hard_math = 1; 1764 new_cpu_data.hard_math = 1;
1227 new_cpu_data.x86_capability[0] = cpuid_edx(1); 1765 new_cpu_data.x86_capability[0] = cpuid_edx(1);
1766#endif
1228 1767
1229 /* Poke various useful things into boot_params */ 1768 /* Poke various useful things into boot_params */
1230 boot_params.hdr.type_of_loader = (9 << 4) | 0; 1769 boot_params.hdr.type_of_loader = (9 << 4) | 0;
1231 boot_params.hdr.ramdisk_image = xen_start_info->mod_start 1770 boot_params.hdr.ramdisk_image = xen_start_info->mod_start
1232 ? __pa(xen_start_info->mod_start) : 0; 1771 ? __pa(xen_start_info->mod_start) : 0;
1233 boot_params.hdr.ramdisk_size = xen_start_info->mod_len; 1772 boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1773 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1234 1774
1235 if (!is_initial_xendomain()) 1775 if (!is_initial_xendomain()) {
1776 add_preferred_console("xenboot", 0, NULL);
1777 add_preferred_console("tty", 0, NULL);
1236 add_preferred_console("hvc", 0, NULL); 1778 add_preferred_console("hvc", 0, NULL);
1779 }
1780
1781 xen_raw_console_write("about to get started...\n");
1782
1783#if 0
1784 xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
1785 &boot_params, __pa_symbol(&boot_params),
1786 __va(__pa_symbol(&boot_params)));
1787
1788 walk(pgd, &boot_params);
1789 walk(pgd, __va(__pa(&boot_params)));
1790#endif
1237 1791
1238 /* Start the world */ 1792 /* Start the world */
1239 start_kernel(); 1793#ifdef CONFIG_X86_32
1794 i386_start_kernel();
1795#else
1796 x86_64_start_reservations((char *)__pa_symbol(&boot_params));
1797#endif
1240} 1798}
diff --git a/arch/x86/xen/manage.c b/arch/x86/xen/manage.c
deleted file mode 100644
index aa7af9e6abc0..000000000000
--- a/arch/x86/xen/manage.c
+++ /dev/null
@@ -1,143 +0,0 @@
1/*
2 * Handle extern requests for shutdown, reboot and sysrq
3 */
4#include <linux/kernel.h>
5#include <linux/err.h>
6#include <linux/reboot.h>
7#include <linux/sysrq.h>
8
9#include <xen/xenbus.h>
10
11#define SHUTDOWN_INVALID -1
12#define SHUTDOWN_POWEROFF 0
13#define SHUTDOWN_SUSPEND 2
14/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
15 * report a crash, not be instructed to crash!
16 * HALT is the same as POWEROFF, as far as we're concerned. The tools use
17 * the distinction when we return the reason code to them.
18 */
19#define SHUTDOWN_HALT 4
20
21/* Ignore multiple shutdown requests. */
22static int shutting_down = SHUTDOWN_INVALID;
23
24static void shutdown_handler(struct xenbus_watch *watch,
25 const char **vec, unsigned int len)
26{
27 char *str;
28 struct xenbus_transaction xbt;
29 int err;
30
31 if (shutting_down != SHUTDOWN_INVALID)
32 return;
33
34 again:
35 err = xenbus_transaction_start(&xbt);
36 if (err)
37 return;
38
39 str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
40 /* Ignore read errors and empty reads. */
41 if (XENBUS_IS_ERR_READ(str)) {
42 xenbus_transaction_end(xbt, 1);
43 return;
44 }
45
46 xenbus_write(xbt, "control", "shutdown", "");
47
48 err = xenbus_transaction_end(xbt, 0);
49 if (err == -EAGAIN) {
50 kfree(str);
51 goto again;
52 }
53
54 if (strcmp(str, "poweroff") == 0 ||
55 strcmp(str, "halt") == 0)
56 orderly_poweroff(false);
57 else if (strcmp(str, "reboot") == 0)
58 ctrl_alt_del();
59 else {
60 printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
61 shutting_down = SHUTDOWN_INVALID;
62 }
63
64 kfree(str);
65}
66
67static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
68 unsigned int len)
69{
70 char sysrq_key = '\0';
71 struct xenbus_transaction xbt;
72 int err;
73
74 again:
75 err = xenbus_transaction_start(&xbt);
76 if (err)
77 return;
78 if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
79 printk(KERN_ERR "Unable to read sysrq code in "
80 "control/sysrq\n");
81 xenbus_transaction_end(xbt, 1);
82 return;
83 }
84
85 if (sysrq_key != '\0')
86 xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
87
88 err = xenbus_transaction_end(xbt, 0);
89 if (err == -EAGAIN)
90 goto again;
91
92 if (sysrq_key != '\0')
93 handle_sysrq(sysrq_key, NULL);
94}
95
96static struct xenbus_watch shutdown_watch = {
97 .node = "control/shutdown",
98 .callback = shutdown_handler
99};
100
101static struct xenbus_watch sysrq_watch = {
102 .node = "control/sysrq",
103 .callback = sysrq_handler
104};
105
106static int setup_shutdown_watcher(void)
107{
108 int err;
109
110 err = register_xenbus_watch(&shutdown_watch);
111 if (err) {
112 printk(KERN_ERR "Failed to set shutdown watcher\n");
113 return err;
114 }
115
116 err = register_xenbus_watch(&sysrq_watch);
117 if (err) {
118 printk(KERN_ERR "Failed to set sysrq watcher\n");
119 return err;
120 }
121
122 return 0;
123}
124
125static int shutdown_event(struct notifier_block *notifier,
126 unsigned long event,
127 void *data)
128{
129 setup_shutdown_watcher();
130 return NOTIFY_DONE;
131}
132
133static int __init setup_shutdown_event(void)
134{
135 static struct notifier_block xenstore_notifier = {
136 .notifier_call = shutdown_event
137 };
138 register_xenstore_notifier(&xenstore_notifier);
139
140 return 0;
141}
142
143subsys_initcall(setup_shutdown_event);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 4e527e7893a8..aa37469da696 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -44,8 +44,10 @@
44 44
45#include <asm/pgtable.h> 45#include <asm/pgtable.h>
46#include <asm/tlbflush.h> 46#include <asm/tlbflush.h>
47#include <asm/fixmap.h>
47#include <asm/mmu_context.h> 48#include <asm/mmu_context.h>
48#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/linkage.h>
49 51
50#include <asm/xen/hypercall.h> 52#include <asm/xen/hypercall.h>
51#include <asm/xen/hypervisor.h> 53#include <asm/xen/hypervisor.h>
@@ -56,15 +58,144 @@
56#include "multicalls.h" 58#include "multicalls.h"
57#include "mmu.h" 59#include "mmu.h"
58 60
59xmaddr_t arbitrary_virt_to_machine(unsigned long address) 61/*
62 * Just beyond the highest usermode address. STACK_TOP_MAX has a
63 * redzone above it, so round it up to a PGD boundary.
64 */
65#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
66
67
68#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
69#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
70
71/* Placeholder for holes in the address space */
72static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
73 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
74
75 /* Array of pointers to pages containing p2m entries */
76static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
77 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
78
79/* Arrays of p2m arrays expressed in mfns used for save/restore */
80static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
81
82static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
83 __page_aligned_bss;
84
85static inline unsigned p2m_top_index(unsigned long pfn)
86{
87 BUG_ON(pfn >= MAX_DOMAIN_PAGES);
88 return pfn / P2M_ENTRIES_PER_PAGE;
89}
90
91static inline unsigned p2m_index(unsigned long pfn)
92{
93 return pfn % P2M_ENTRIES_PER_PAGE;
94}
95
96/* Build the parallel p2m_top_mfn structures */
97void xen_setup_mfn_list_list(void)
98{
99 unsigned pfn, idx;
100
101 for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
102 unsigned topidx = p2m_top_index(pfn);
103
104 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
105 }
106
107 for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
108 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
109 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
110 }
111
112 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
113
114 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
115 virt_to_mfn(p2m_top_mfn_list);
116 HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
117}
118
119/* Set up p2m_top to point to the domain-builder provided p2m pages */
120void __init xen_build_dynamic_phys_to_machine(void)
121{
122 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
123 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
124 unsigned pfn;
125
126 for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
127 unsigned topidx = p2m_top_index(pfn);
128
129 p2m_top[topidx] = &mfn_list[pfn];
130 }
131}
132
133unsigned long get_phys_to_machine(unsigned long pfn)
134{
135 unsigned topidx, idx;
136
137 if (unlikely(pfn >= MAX_DOMAIN_PAGES))
138 return INVALID_P2M_ENTRY;
139
140 topidx = p2m_top_index(pfn);
141 idx = p2m_index(pfn);
142 return p2m_top[topidx][idx];
143}
144EXPORT_SYMBOL_GPL(get_phys_to_machine);
145
146static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
147{
148 unsigned long *p;
149 unsigned i;
150
151 p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
152 BUG_ON(p == NULL);
153
154 for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
155 p[i] = INVALID_P2M_ENTRY;
156
157 if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
158 free_page((unsigned long)p);
159 else
160 *mfnp = virt_to_mfn(p);
161}
162
163void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
164{
165 unsigned topidx, idx;
166
167 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
168 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
169 return;
170 }
171
172 if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
173 BUG_ON(mfn != INVALID_P2M_ENTRY);
174 return;
175 }
176
177 topidx = p2m_top_index(pfn);
178 if (p2m_top[topidx] == p2m_missing) {
179 /* no need to allocate a page to store an invalid entry */
180 if (mfn == INVALID_P2M_ENTRY)
181 return;
182 alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
183 }
184
185 idx = p2m_index(pfn);
186 p2m_top[topidx][idx] = mfn;
187}
188
189xmaddr_t arbitrary_virt_to_machine(void *vaddr)
60{ 190{
191 unsigned long address = (unsigned long)vaddr;
61 unsigned int level; 192 unsigned int level;
62 pte_t *pte = lookup_address(address, &level); 193 pte_t *pte = lookup_address(address, &level);
63 unsigned offset = address & ~PAGE_MASK; 194 unsigned offset = address & ~PAGE_MASK;
64 195
65 BUG_ON(pte == NULL); 196 BUG_ON(pte == NULL);
66 197
67 return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); 198 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
68} 199}
69 200
70void make_lowmem_page_readonly(void *vaddr) 201void make_lowmem_page_readonly(void *vaddr)
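
The phys-to-machine map introduced above is a two-level radix structure:
p2m_top holds one pointer per page-sized leaf, every never-populated leaf
points at the shared p2m_missing page, and set_phys_to_machine() swaps in a
freshly allocated leaf (via cmpxchg, so concurrent populaters race safely)
the first time a valid mfn lands in a hole. The index split is plain
arithmetic; the following standalone sketch is a userspace model of it, not
the kernel code, assuming the same 4 KiB page size and unsigned long entries:

#include <stdio.h>

#define PAGE_SIZE            4096UL
#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))

static unsigned p2m_top_index(unsigned long pfn)
{
	return pfn / P2M_ENTRIES_PER_PAGE;	/* which leaf page */
}

static unsigned p2m_index(unsigned long pfn)
{
	return pfn % P2M_ENTRIES_PER_PAGE;	/* slot within that leaf */
}

int main(void)
{
	unsigned long pfn = 0x12345;

	printf("entries per leaf: %lu\n", (unsigned long)P2M_ENTRIES_PER_PAGE);
	printf("pfn %#lx -> top %u, idx %u\n",
	       pfn, p2m_top_index(pfn), p2m_index(pfn));
	return 0;
}

With 8-byte entries this gives 512 entries per leaf, so a fully populated map
costs one leaf page per 2 MiB of guest memory on top of the fixed
p2m_top/p2m_top_mfn arrays.
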
@@ -98,59 +229,68 @@ void make_lowmem_page_readwrite(void *vaddr)
98} 229}
99 230
100 231
101void xen_set_pmd(pmd_t *ptr, pmd_t val) 232static bool page_pinned(void *ptr)
233{
234 struct page *page = virt_to_page(ptr);
235
236 return PagePinned(page);
237}
238
239static void extend_mmu_update(const struct mmu_update *update)
102{ 240{
103 struct multicall_space mcs; 241 struct multicall_space mcs;
104 struct mmu_update *u; 242 struct mmu_update *u;
105 243
106 preempt_disable(); 244 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
245
246 if (mcs.mc != NULL)
247 mcs.mc->args[1]++;
248 else {
249 mcs = __xen_mc_entry(sizeof(*u));
250 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
251 }
107 252
108 mcs = xen_mc_entry(sizeof(*u));
109 u = mcs.args; 253 u = mcs.args;
110 u->ptr = virt_to_machine(ptr).maddr; 254 *u = *update;
111 u->val = pmd_val_ma(val); 255}
112 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); 256
257void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
258{
259 struct mmu_update u;
260
261 preempt_disable();
262
263 xen_mc_batch();
264
265 /* ptr may be ioremapped for 64-bit pagetable setup */
266 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
267 u.val = pmd_val_ma(val);
268 extend_mmu_update(&u);
113 269
114 xen_mc_issue(PARAVIRT_LAZY_MMU); 270 xen_mc_issue(PARAVIRT_LAZY_MMU);
115 271
116 preempt_enable(); 272 preempt_enable();
117} 273}
118 274
275void xen_set_pmd(pmd_t *ptr, pmd_t val)
276{
277 /* If page is not pinned, we can just update the entry
278 directly */
279 if (!page_pinned(ptr)) {
280 *ptr = val;
281 return;
282 }
283
284 xen_set_pmd_hyper(ptr, val);
285}
286
119/* 287/*
120 * Associate a virtual page frame with a given physical page frame 288 * Associate a virtual page frame with a given physical page frame
121 * and protection flags for that frame. 289 * and protection flags for that frame.
122 */ 290 */
123void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) 291void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
124{ 292{
125 pgd_t *pgd; 293 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
126 pud_t *pud;
127 pmd_t *pmd;
128 pte_t *pte;
129
130 pgd = swapper_pg_dir + pgd_index(vaddr);
131 if (pgd_none(*pgd)) {
132 BUG();
133 return;
134 }
135 pud = pud_offset(pgd, vaddr);
136 if (pud_none(*pud)) {
137 BUG();
138 return;
139 }
140 pmd = pmd_offset(pud, vaddr);
141 if (pmd_none(*pmd)) {
142 BUG();
143 return;
144 }
145 pte = pte_offset_kernel(pmd, vaddr);
146 /* <mfn,flags> stored as-is, to permit clearing entries */
147 xen_set_pte(pte, mfn_pte(mfn, flags));
148
149 /*
150 * It's enough to flush this one mapping.
151 * (PGE mappings get flushed as well)
152 */
153 __flush_tlb_one(vaddr);
154} 294}
155 295
156void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 296void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -179,12 +319,32 @@ out:
179 preempt_enable(); 319 preempt_enable();
180} 320}
181 321
322pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
323{
324 /* Just return the pte as-is. We preserve the bits on commit */
325 return *ptep;
326}
327
328void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
329 pte_t *ptep, pte_t pte)
330{
331 struct mmu_update u;
332
333 xen_mc_batch();
334
335 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
336 u.val = pte_val_ma(pte);
337 extend_mmu_update(&u);
338
339 xen_mc_issue(PARAVIRT_LAZY_MMU);
340}
341
182/* Assume pteval_t is equivalent to all the other *val_t types. */ 342/* Assume pteval_t is equivalent to all the other *val_t types. */
183static pteval_t pte_mfn_to_pfn(pteval_t val) 343static pteval_t pte_mfn_to_pfn(pteval_t val)
184{ 344{
185 if (val & _PAGE_PRESENT) { 345 if (val & _PAGE_PRESENT) {
186 unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; 346 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
187 pteval_t flags = val & ~PTE_MASK; 347 pteval_t flags = val & PTE_FLAGS_MASK;
188 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; 348 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
189 } 349 }
190 350
@@ -194,8 +354,8 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)
194static pteval_t pte_pfn_to_mfn(pteval_t val) 354static pteval_t pte_pfn_to_mfn(pteval_t val)
195{ 355{
196 if (val & _PAGE_PRESENT) { 356 if (val & _PAGE_PRESENT) {
197 unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; 357 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
198 pteval_t flags = val & ~PTE_MASK; 358 pteval_t flags = val & PTE_FLAGS_MASK;
199 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; 359 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
200 } 360 }
201 361
@@ -229,34 +389,51 @@ pmdval_t xen_pmd_val(pmd_t pmd)
229 return pte_mfn_to_pfn(pmd.pmd); 389 return pte_mfn_to_pfn(pmd.pmd);
230} 390}
231 391
232void xen_set_pud(pud_t *ptr, pud_t val) 392void xen_set_pud_hyper(pud_t *ptr, pud_t val)
233{ 393{
234 struct multicall_space mcs; 394 struct mmu_update u;
235 struct mmu_update *u;
236 395
237 preempt_disable(); 396 preempt_disable();
238 397
239 mcs = xen_mc_entry(sizeof(*u)); 398 xen_mc_batch();
240 u = mcs.args; 399
241 u->ptr = virt_to_machine(ptr).maddr; 400 /* ptr may be ioremapped for 64-bit pagetable setup */
242 u->val = pud_val_ma(val); 401 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
243 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); 402 u.val = pud_val_ma(val);
403 extend_mmu_update(&u);
244 404
245 xen_mc_issue(PARAVIRT_LAZY_MMU); 405 xen_mc_issue(PARAVIRT_LAZY_MMU);
246 406
247 preempt_enable(); 407 preempt_enable();
248} 408}
249 409
410void xen_set_pud(pud_t *ptr, pud_t val)
411{
412 /* If page is not pinned, we can just update the entry
413 directly */
414 if (!page_pinned(ptr)) {
415 *ptr = val;
416 return;
417 }
418
419 xen_set_pud_hyper(ptr, val);
420}
421
250void xen_set_pte(pte_t *ptep, pte_t pte) 422void xen_set_pte(pte_t *ptep, pte_t pte)
251{ 423{
424#ifdef CONFIG_X86_PAE
252 ptep->pte_high = pte.pte_high; 425 ptep->pte_high = pte.pte_high;
253 smp_wmb(); 426 smp_wmb();
254 ptep->pte_low = pte.pte_low; 427 ptep->pte_low = pte.pte_low;
428#else
429 *ptep = pte;
430#endif
255} 431}
256 432
433#ifdef CONFIG_X86_PAE
257void xen_set_pte_atomic(pte_t *ptep, pte_t pte) 434void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
258{ 435{
259 set_64bit((u64 *)ptep, pte_val_ma(pte)); 436 set_64bit((u64 *)ptep, native_pte_val(pte));
260} 437}
261 438
262void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 439void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -268,8 +445,9 @@ void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
268 445
269void xen_pmd_clear(pmd_t *pmdp) 446void xen_pmd_clear(pmd_t *pmdp)
270{ 447{
271 xen_set_pmd(pmdp, __pmd(0)); 448 set_pmd(pmdp, __pmd(0));
272} 449}
450#endif /* CONFIG_X86_PAE */
273 451
274pmd_t xen_make_pmd(pmdval_t pmd) 452pmd_t xen_make_pmd(pmdval_t pmd)
275{ 453{
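
xen_set_pte() now differs by pte width: where a pte is a single word it is
stored directly, while under PAE it is two 32-bit words and the present bit
lives in the low word, so the code writes pte_high first, issues smp_wmb(),
and writes pte_low last; a concurrent walker therefore never sees a pte that
is marked present but still carries a stale high word. A userspace model of
that ordering (the struct layout and the fence builtin are stand-ins for the
kernel definitions, not taken from the patch):

#include <stdint.h>
#include <stdio.h>

/* Model of a PAE pte: 64 bits stored as two 32-bit halves; bit 0 of the
 * low half plays the role of _PAGE_PRESENT. */
struct pae_pte_model {
	volatile uint32_t pte_low;
	volatile uint32_t pte_high;
};

static void set_pte_model(struct pae_pte_model *p, uint64_t val)
{
	p->pte_high = (uint32_t)(val >> 32);
	__atomic_thread_fence(__ATOMIC_RELEASE);	/* stands in for smp_wmb() */
	p->pte_low = (uint32_t)val;	/* present bit becomes visible last */
}

int main(void)
{
	struct pae_pte_model pte = { 0, 0 };

	set_pte_model(&pte, 0x0000000123456001ULL);	/* present bit set */
	printf("low=%#x high=%#x\n", pte.pte_low, pte.pte_high);
	return 0;
}
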
@@ -277,78 +455,189 @@ pmd_t xen_make_pmd(pmdval_t pmd)
277 return native_make_pmd(pmd); 455 return native_make_pmd(pmd);
278} 456}
279 457
458#if PAGETABLE_LEVELS == 4
459pudval_t xen_pud_val(pud_t pud)
460{
461 return pte_mfn_to_pfn(pud.pud);
462}
463
464pud_t xen_make_pud(pudval_t pud)
465{
466 pud = pte_pfn_to_mfn(pud);
467
468 return native_make_pud(pud);
469}
470
471pgd_t *xen_get_user_pgd(pgd_t *pgd)
472{
473 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
474 unsigned offset = pgd - pgd_page;
475 pgd_t *user_ptr = NULL;
476
477 if (offset < pgd_index(USER_LIMIT)) {
478 struct page *page = virt_to_page(pgd_page);
479 user_ptr = (pgd_t *)page->private;
480 if (user_ptr)
481 user_ptr += offset;
482 }
483
484 return user_ptr;
485}
486
487static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
488{
489 struct mmu_update u;
490
491 u.ptr = virt_to_machine(ptr).maddr;
492 u.val = pgd_val_ma(val);
493 extend_mmu_update(&u);
494}
495
280/* 496/*
281 (Yet another) pagetable walker. This one is intended for pinning a 497 * Raw hypercall-based set_pgd, intended for in early boot before
282 pagetable. This means that it walks a pagetable and calls the 498 * there's a page structure. This implies:
283 callback function on each page it finds making up the page table, 499 * 1. The only existing pagetable is the kernel's
284 at every level. It walks the entire pagetable, but it only bothers 500 * 2. It is always pinned
285 pinning pte pages which are below pte_limit. In the normal case 501 * 3. It has no user pagetable attached to it
286 this will be TASK_SIZE, but at boot we need to pin up to 502 */
287 FIXADDR_TOP. But the important bit is that we don't pin beyond 503void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
288 there, because then we start getting into Xen's ptes. 504{
289*/ 505 preempt_disable();
290static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), 506
507 xen_mc_batch();
508
509 __xen_set_pgd_hyper(ptr, val);
510
511 xen_mc_issue(PARAVIRT_LAZY_MMU);
512
513 preempt_enable();
514}
515
516void xen_set_pgd(pgd_t *ptr, pgd_t val)
517{
518 pgd_t *user_ptr = xen_get_user_pgd(ptr);
519
520 /* If page is not pinned, we can just update the entry
521 directly */
522 if (!page_pinned(ptr)) {
523 *ptr = val;
524 if (user_ptr) {
525 WARN_ON(page_pinned(user_ptr));
526 *user_ptr = val;
527 }
528 return;
529 }
530
531 /* If it's pinned, then we can at least batch the kernel and
532 user updates together. */
533 xen_mc_batch();
534
535 __xen_set_pgd_hyper(ptr, val);
536 if (user_ptr)
537 __xen_set_pgd_hyper(user_ptr, val);
538
539 xen_mc_issue(PARAVIRT_LAZY_MMU);
540}
541#endif /* PAGETABLE_LEVELS == 4 */
542
543/*
544 * (Yet another) pagetable walker. This one is intended for pinning a
545 * pagetable. This means that it walks a pagetable and calls the
546 * callback function on each page it finds making up the page table,
547 * at every level. It walks the entire pagetable, but it only bothers
548 * pinning pte pages which are below limit. In the normal case this
549 * will be STACK_TOP_MAX, but at boot we need to pin up to
550 * FIXADDR_TOP.
551 *
552 * For 32-bit the important bit is that we don't pin beyond there,
553 * because then we start getting into Xen's ptes.
554 *
555 * For 64-bit, we must skip the Xen hole in the middle of the address
556 * space, just after the big x86-64 virtual hole.
557 */
558static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
291 unsigned long limit) 559 unsigned long limit)
292{ 560{
293 pgd_t *pgd = pgd_base;
294 int flush = 0; 561 int flush = 0;
295 unsigned long addr = 0; 562 unsigned hole_low, hole_high;
296 unsigned long pgd_next; 563 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
564 unsigned pgdidx, pudidx, pmdidx;
297 565
298 BUG_ON(limit > FIXADDR_TOP); 566 /* The limit is the last byte to be touched */
567 limit--;
568 BUG_ON(limit >= FIXADDR_TOP);
299 569
300 if (xen_feature(XENFEAT_auto_translated_physmap)) 570 if (xen_feature(XENFEAT_auto_translated_physmap))
301 return 0; 571 return 0;
302 572
303 for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { 573 /*
574 * 64-bit has a great big hole in the middle of the address
575 * space, which contains the Xen mappings. On 32-bit these
576 * will end up making a zero-sized hole and so is a no-op.
577 */
578 hole_low = pgd_index(USER_LIMIT);
579 hole_high = pgd_index(PAGE_OFFSET);
580
581 pgdidx_limit = pgd_index(limit);
582#if PTRS_PER_PUD > 1
583 pudidx_limit = pud_index(limit);
584#else
585 pudidx_limit = 0;
586#endif
587#if PTRS_PER_PMD > 1
588 pmdidx_limit = pmd_index(limit);
589#else
590 pmdidx_limit = 0;
591#endif
592
593 flush |= (*func)(virt_to_page(pgd), PT_PGD);
594
595 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
304 pud_t *pud; 596 pud_t *pud;
305 unsigned long pud_limit, pud_next;
306 597
307 pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); 598 if (pgdidx >= hole_low && pgdidx < hole_high)
599 continue;
308 600
309 if (!pgd_val(*pgd)) 601 if (!pgd_val(pgd[pgdidx]))
310 continue; 602 continue;
311 603
312 pud = pud_offset(pgd, 0); 604 pud = pud_offset(&pgd[pgdidx], 0);
313 605
314 if (PTRS_PER_PUD > 1) /* not folded */ 606 if (PTRS_PER_PUD > 1) /* not folded */
315 flush |= (*func)(virt_to_page(pud), PT_PUD); 607 flush |= (*func)(virt_to_page(pud), PT_PUD);
316 608
317 for (; addr != pud_limit; pud++, addr = pud_next) { 609 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
318 pmd_t *pmd; 610 pmd_t *pmd;
319 unsigned long pmd_limit;
320 611
321 pud_next = pud_addr_end(addr, pud_limit); 612 if (pgdidx == pgdidx_limit &&
322 613 pudidx > pudidx_limit)
323 if (pud_next < limit) 614 goto out;
324 pmd_limit = pud_next;
325 else
326 pmd_limit = limit;
327 615
328 if (pud_none(*pud)) 616 if (pud_none(pud[pudidx]))
329 continue; 617 continue;
330 618
331 pmd = pmd_offset(pud, 0); 619 pmd = pmd_offset(&pud[pudidx], 0);
332 620
333 if (PTRS_PER_PMD > 1) /* not folded */ 621 if (PTRS_PER_PMD > 1) /* not folded */
334 flush |= (*func)(virt_to_page(pmd), PT_PMD); 622 flush |= (*func)(virt_to_page(pmd), PT_PMD);
335 623
336 for (; addr != pmd_limit; pmd++) { 624 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
337 addr += (PAGE_SIZE * PTRS_PER_PTE); 625 struct page *pte;
338 if ((pmd_limit-1) < (addr-1)) { 626
339 addr = pmd_limit; 627 if (pgdidx == pgdidx_limit &&
340 break; 628 pudidx == pudidx_limit &&
341 } 629 pmdidx > pmdidx_limit)
630 goto out;
342 631
343 if (pmd_none(*pmd)) 632 if (pmd_none(pmd[pmdidx]))
344 continue; 633 continue;
345 634
346 flush |= (*func)(pmd_page(*pmd), PT_PTE); 635 pte = pmd_page(pmd[pmdidx]);
636 flush |= (*func)(pte, PT_PTE);
347 } 637 }
348 } 638 }
349 } 639 }
350 640out:
351 flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
352 641
353 return flush; 642 return flush;
354} 643}
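
The rewritten pgd_walk() iterates by explicit indices instead of advancing an
address: it visits the PGD page first, bounds the loops with
pgdidx_limit/pudidx_limit/pmdidx_limit computed from the last byte below the
limit, and skips every pgd slot in [pgd_index(USER_LIMIT),
pgd_index(PAGE_OFFSET)), which on x86-64 is the hole holding Xen's own
mappings (on 32-bit the two indices are equal, so nothing is skipped). A
standalone sketch of the bound and hole arithmetic follows; PGDIR_SHIFT,
PTRS_PER_PGD and the three addresses are assumed stand-ins for the kernel's
constants, so the printed slot numbers are indicative only:

#include <stdio.h>

/* Assumed x86-64 4-level paging parameters (not taken from the patch). */
#define PGDIR_SHIFT   39
#define PTRS_PER_PGD  512UL

static unsigned pgd_index(unsigned long addr)
{
	return (addr >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
}

int main(void)
{
	/* Hypothetical values for USER_LIMIT and PAGE_OFFSET. */
	unsigned long user_limit  = 0x0000800000000000UL;
	unsigned long page_offset = 0xffff880000000000UL;
	unsigned long limit = user_limit;	/* as used by xen_pgd_pin() */

	unsigned hole_low  = pgd_index(user_limit);
	unsigned hole_high = pgd_index(page_offset);
	unsigned pgdidx_limit = pgd_index(limit - 1);	/* last byte, as in pgd_walk */

	printf("skip pgd slots [%u, %u); walk slots 0..%u\n",
	       hole_low, hole_high, pgdidx_limit);
	return 0;
}
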
@@ -430,20 +719,62 @@ void xen_pgd_pin(pgd_t *pgd)
430{ 719{
431 xen_mc_batch(); 720 xen_mc_batch();
432 721
433 if (pgd_walk(pgd, pin_page, TASK_SIZE)) { 722 if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
434 /* re-enable interrupts for kmap_flush_unused */ 723 /* re-enable interrupts for kmap_flush_unused */
435 xen_mc_issue(0); 724 xen_mc_issue(0);
436 kmap_flush_unused(); 725 kmap_flush_unused();
437 xen_mc_batch(); 726 xen_mc_batch();
438 } 727 }
439 728
729#ifdef CONFIG_X86_64
730 {
731 pgd_t *user_pgd = xen_get_user_pgd(pgd);
732
733 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
734
735 if (user_pgd) {
736 pin_page(virt_to_page(user_pgd), PT_PGD);
737 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
738 }
739 }
740#else /* CONFIG_X86_32 */
741#ifdef CONFIG_X86_PAE
742 /* Need to make sure unshared kernel PMD is pinnable */
743 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
744#endif
440 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 745 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
746#endif /* CONFIG_X86_64 */
441 xen_mc_issue(0); 747 xen_mc_issue(0);
442} 748}
443 749
444/* The init_mm pagetable is really pinned as soon as its created, but 750/*
445 that's before we have page structures to store the bits. So do all 751 * On save, we need to pin all pagetables to make sure they get their
446 the book-keeping now. */ 752 * mfns turned into pfns. Search the list for any unpinned pgds and pin
753 * them (unpinned pgds are not currently in use, probably because the
754 * process is under construction or destruction).
755 */
756void xen_mm_pin_all(void)
757{
758 unsigned long flags;
759 struct page *page;
760
761 spin_lock_irqsave(&pgd_lock, flags);
762
763 list_for_each_entry(page, &pgd_list, lru) {
764 if (!PagePinned(page)) {
765 xen_pgd_pin((pgd_t *)page_address(page));
766 SetPageSavePinned(page);
767 }
768 }
769
770 spin_unlock_irqrestore(&pgd_lock, flags);
771}
772
773/*
774 * The init_mm pagetable is really pinned as soon as it's created, but
775 * that's before we have page structures to store the bits. So do all
776 * the book-keeping now.
777 */
447static __init int mark_pinned(struct page *page, enum pt_level level) 778static __init int mark_pinned(struct page *page, enum pt_level level)
448{ 779{
449 SetPagePinned(page); 780 SetPagePinned(page);
@@ -493,11 +824,49 @@ static void xen_pgd_unpin(pgd_t *pgd)
493 824
494 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 825 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
495 826
496 pgd_walk(pgd, unpin_page, TASK_SIZE); 827#ifdef CONFIG_X86_64
828 {
829 pgd_t *user_pgd = xen_get_user_pgd(pgd);
830
831 if (user_pgd) {
832 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
833 unpin_page(virt_to_page(user_pgd), PT_PGD);
834 }
835 }
836#endif
837
838#ifdef CONFIG_X86_PAE
839 /* Need to make sure unshared kernel PMD is unpinned */
840 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
841#endif
842
843 pgd_walk(pgd, unpin_page, USER_LIMIT);
497 844
498 xen_mc_issue(0); 845 xen_mc_issue(0);
499} 846}
500 847
848/*
849 * On resume, undo any pinning done at save, so that the rest of the
850 * kernel doesn't see any unexpected pinned pagetables.
851 */
852void xen_mm_unpin_all(void)
853{
854 unsigned long flags;
855 struct page *page;
856
857 spin_lock_irqsave(&pgd_lock, flags);
858
859 list_for_each_entry(page, &pgd_list, lru) {
860 if (PageSavePinned(page)) {
861 BUG_ON(!PagePinned(page));
862 xen_pgd_unpin((pgd_t *)page_address(page));
863 ClearPageSavePinned(page);
864 }
865 }
866
867 spin_unlock_irqrestore(&pgd_lock, flags);
868}
869
501void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 870void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
502{ 871{
503 spin_lock(&next->page_table_lock); 872 spin_lock(&next->page_table_lock);
@@ -519,8 +888,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
519static void drop_other_mm_ref(void *info) 888static void drop_other_mm_ref(void *info)
520{ 889{
521 struct mm_struct *mm = info; 890 struct mm_struct *mm = info;
891 struct mm_struct *active_mm;
892
893#ifdef CONFIG_X86_64
894 active_mm = read_pda(active_mm);
895#else
896 active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
897#endif
522 898
523 if (__get_cpu_var(cpu_tlbstate).active_mm == mm) 899 if (active_mm == mm)
524 leave_mm(smp_processor_id()); 900 leave_mm(smp_processor_id());
525 901
526 /* If this cpu still has a stale cr3 reference, then make sure 902 /* If this cpu still has a stale cr3 reference, then make sure
@@ -558,7 +934,7 @@ static void drop_mm_ref(struct mm_struct *mm)
558 } 934 }
559 935
560 if (!cpus_empty(mask)) 936 if (!cpus_empty(mask))
561 xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); 937 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
562} 938}
563#else 939#else
564static void drop_mm_ref(struct mm_struct *mm) 940static void drop_mm_ref(struct mm_struct *mm)
@@ -591,7 +967,7 @@ void xen_exit_mmap(struct mm_struct *mm)
591 spin_lock(&mm->page_table_lock); 967 spin_lock(&mm->page_table_lock);
592 968
593 /* pgd may not be pinned in the error exit path of execve */ 969 /* pgd may not be pinned in the error exit path of execve */
594 if (PagePinned(virt_to_page(mm->pgd))) 970 if (page_pinned(mm->pgd))
595 xen_pgd_unpin(mm->pgd); 971 xen_pgd_unpin(mm->pgd);
596 972
597 spin_unlock(&mm->page_table_lock); 973 spin_unlock(&mm->page_table_lock);
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 5fe961caffd4..0f59bd03f9e3 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -10,25 +10,9 @@ enum pt_level {
10 PT_PTE 10 PT_PTE
11}; 11};
12 12
13/*
14 * Page-directory addresses above 4GB do not fit into architectural %cr3.
15 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
16 * must use the following accessor macros to pack/unpack valid MFNs.
17 *
18 * Note that Xen is using the fact that the pagetable base is always
19 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
20 * of cr3.
21 */
22#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
23#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
24
25 13
26void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 14void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
27 15
28void xen_set_pte(pte_t *ptep, pte_t pteval);
29void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
30 pte_t *ptep, pte_t pteval);
31void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
32 16
33void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); 17void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
34void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); 18void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
@@ -45,11 +29,32 @@ pte_t xen_make_pte(pteval_t);
45pmd_t xen_make_pmd(pmdval_t); 29pmd_t xen_make_pmd(pmdval_t);
46pgd_t xen_make_pgd(pgdval_t); 30pgd_t xen_make_pgd(pgdval_t);
47 31
32void xen_set_pte(pte_t *ptep, pte_t pteval);
48void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 33void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
49 pte_t *ptep, pte_t pteval); 34 pte_t *ptep, pte_t pteval);
35
36#ifdef CONFIG_X86_PAE
50void xen_set_pte_atomic(pte_t *ptep, pte_t pte); 37void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
51void xen_set_pud(pud_t *ptr, pud_t val);
52void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 38void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
53void xen_pmd_clear(pmd_t *pmdp); 39void xen_pmd_clear(pmd_t *pmdp);
40#endif /* CONFIG_X86_PAE */
41
42void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
43void xen_set_pud(pud_t *ptr, pud_t val);
44void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
45void xen_set_pud_hyper(pud_t *ptr, pud_t val);
46
47#if PAGETABLE_LEVELS == 4
48pudval_t xen_pud_val(pud_t pud);
49pud_t xen_make_pud(pudval_t pudval);
50void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
51void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
52#endif
53
54pgd_t *xen_get_user_pgd(pgd_t *pgd);
55
56pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
57void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
58 pte_t *ptep, pte_t pte);
54 59
55#endif /* _XEN_MMU_H */ 60#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 5791eb2e3750..9efd1c6c9776 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -29,14 +29,14 @@
29#define MC_DEBUG 1 29#define MC_DEBUG 1
30 30
31#define MC_BATCH 32 31#define MC_BATCH 32
32#define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) 32#define MC_ARGS (MC_BATCH * 16)
33 33
34struct mc_buffer { 34struct mc_buffer {
35 struct multicall_entry entries[MC_BATCH]; 35 struct multicall_entry entries[MC_BATCH];
36#if MC_DEBUG 36#if MC_DEBUG
37 struct multicall_entry debug[MC_BATCH]; 37 struct multicall_entry debug[MC_BATCH];
38#endif 38#endif
39 u64 args[MC_ARGS]; 39 unsigned char args[MC_ARGS];
40 struct callback { 40 struct callback {
41 void (*fn)(void *); 41 void (*fn)(void *);
42 void *data; 42 void *data;
@@ -76,6 +76,7 @@ void xen_mc_flush(void)
76 if (ret) { 76 if (ret) {
77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", 77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
78 ret, smp_processor_id()); 78 ret, smp_processor_id());
79 dump_stack();
79 for (i = 0; i < b->mcidx; i++) { 80 for (i = 0; i < b->mcidx; i++) {
80 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", 81 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
81 i+1, b->mcidx, 82 i+1, b->mcidx,
@@ -107,20 +108,48 @@ struct multicall_space __xen_mc_entry(size_t args)
107{ 108{
108 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 109 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
109 struct multicall_space ret; 110 struct multicall_space ret;
110 unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64); 111 unsigned argidx = roundup(b->argidx, sizeof(u64));
111 112
112 BUG_ON(preemptible()); 113 BUG_ON(preemptible());
113 BUG_ON(argspace > MC_ARGS); 114 BUG_ON(b->argidx > MC_ARGS);
114 115
115 if (b->mcidx == MC_BATCH || 116 if (b->mcidx == MC_BATCH ||
116 (b->argidx + argspace) > MC_ARGS) 117 (argidx + args) > MC_ARGS) {
117 xen_mc_flush(); 118 xen_mc_flush();
119 argidx = roundup(b->argidx, sizeof(u64));
120 }
118 121
119 ret.mc = &b->entries[b->mcidx]; 122 ret.mc = &b->entries[b->mcidx];
120 b->mcidx++; 123 b->mcidx++;
124 ret.args = &b->args[argidx];
125 b->argidx = argidx + args;
126
127 BUG_ON(b->argidx > MC_ARGS);
128 return ret;
129}
130
131struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
132{
133 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
134 struct multicall_space ret = { NULL, NULL };
135
136 BUG_ON(preemptible());
137 BUG_ON(b->argidx > MC_ARGS);
138
139 if (b->mcidx == 0)
140 return ret;
141
142 if (b->entries[b->mcidx - 1].op != op)
143 return ret;
144
145 if ((b->argidx + size) > MC_ARGS)
146 return ret;
147
148 ret.mc = &b->entries[b->mcidx - 1];
121 ret.args = &b->args[b->argidx]; 149 ret.args = &b->args[b->argidx];
122 b->argidx += argspace; 150 b->argidx += size;
123 151
152 BUG_ON(b->argidx > MC_ARGS);
124 return ret; 153 return ret;
125} 154}
126 155
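
The multicall argument buffer becomes a flat byte array instead of u64 slots:
__xen_mc_entry() rounds the running argidx up to u64 alignment before
reserving the requested number of bytes (flushing the batch first if either
the entry table or the argument space would overflow), while the new
xen_mc_extend_args() appends extra bytes to the previous entry only when its
op matches and there is room, returning mc == NULL otherwise. A standalone
model of that packing, with MC_BATCH/MC_ARGS taken from the values above and
everything else simplified (no hypercalls, no op matching):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>

#define MC_BATCH 32
#define MC_ARGS  (MC_BATCH * 16)	/* bytes, as in the patch */

#define ROUNDUP(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

struct mc_buffer_model {
	unsigned mcidx;			/* entries used */
	size_t argidx;			/* bytes of argument space used */
	unsigned char args[MC_ARGS];
};

/* Model of __xen_mc_entry(): reserve 'args' bytes at a u64-aligned offset. */
static size_t mc_entry(struct mc_buffer_model *b, size_t args)
{
	size_t argidx = ROUNDUP(b->argidx, sizeof(uint64_t));

	if (b->mcidx == MC_BATCH || argidx + args > MC_ARGS) {
		/* the real code flushes the batch to the hypervisor here */
		b->mcidx = 0;
		b->argidx = 0;
		argidx = 0;
	}
	b->mcidx++;
	b->argidx = argidx + args;
	return argidx;			/* offset of the reserved space */
}

/* Model of xen_mc_extend_args(): grow the previous entry's arguments. */
static int mc_extend(struct mc_buffer_model *b, size_t size)
{
	if (b->mcidx == 0 || b->argidx + size > MC_ARGS)
		return 0;		/* caller must start a new entry */
	b->argidx += size;
	return 1;
}

int main(void)
{
	struct mc_buffer_model b;

	memset(&b, 0, sizeof(b));
	printf("first entry args at %zu\n", mc_entry(&b, 12));	/* 0 */
	printf("second entry args at %zu\n", mc_entry(&b, 16));	/* 16 */
	printf("extend by 8: %s\n", mc_extend(&b, 8) ? "ok" : "no room");
	return 0;
}
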
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 8bae996d99a3..858938241616 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -45,4 +45,16 @@ static inline void xen_mc_issue(unsigned mode)
45/* Set up a callback to be called when the current batch is flushed */ 45/* Set up a callback to be called when the current batch is flushed */
46void xen_mc_callback(void (*fn)(void *), void *data); 46void xen_mc_callback(void (*fn)(void *), void *data);
47 47
48/*
49 * Try to extend the arguments of the previous multicall command. The
50 * previous command's op must match. If it does, then it attempts to
51 * extend the argument space allocated to the multicall entry by
52 * arg_size bytes.
53 *
54 * The returned multicall_space will return with mc pointing to the
55 * command on success, or NULL on failure, and args pointing to the
56 * newly allocated space.
57 */
58struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size);
59
48#endif /* _XEN_MULTICALLS_H */ 60#endif /* _XEN_MULTICALLS_H */
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 82517e4a752a..b6acc3a0af46 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -13,9 +13,11 @@
13#include <asm/vdso.h> 13#include <asm/vdso.h>
14#include <asm/e820.h> 14#include <asm/e820.h>
15#include <asm/setup.h> 15#include <asm/setup.h>
16#include <asm/acpi.h>
16#include <asm/xen/hypervisor.h> 17#include <asm/xen/hypervisor.h>
17#include <asm/xen/hypercall.h> 18#include <asm/xen/hypercall.h>
18 19
20#include <xen/page.h>
19#include <xen/interface/callback.h> 21#include <xen/interface/callback.h>
20#include <xen/interface/physdev.h> 22#include <xen/interface/physdev.h>
21#include <xen/features.h> 23#include <xen/features.h>
@@ -27,8 +29,6 @@
27extern const char xen_hypervisor_callback[]; 29extern const char xen_hypervisor_callback[];
28extern const char xen_failsafe_callback[]; 30extern const char xen_failsafe_callback[];
29 31
30unsigned long *phys_to_machine_mapping;
31EXPORT_SYMBOL(phys_to_machine_mapping);
32 32
33/** 33/**
34 * machine_specific_memory_setup - Hook for machine specific memory setup. 34 * machine_specific_memory_setup - Hook for machine specific memory setup.
@@ -38,9 +38,31 @@ char * __init xen_memory_setup(void)
38{ 38{
39 unsigned long max_pfn = xen_start_info->nr_pages; 39 unsigned long max_pfn = xen_start_info->nr_pages;
40 40
41 max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
42
41 e820.nr_map = 0; 43 e820.nr_map = 0;
42 add_memory_region(0, LOWMEMSIZE(), E820_RAM); 44
43 add_memory_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM); 45 e820_add_region(0, PFN_PHYS(max_pfn), E820_RAM);
46
47 /*
48 * Even though this is normal, usable memory under Xen, reserve
49 * ISA memory anyway because too many things think they can poke
50 * about in there.
51 */
52 e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
53 E820_RESERVED);
54
55 /*
56 * Reserve Xen bits:
57 * - mfn_list
58 * - xen_start_info
59 * See comment above "struct start_info" in <xen/interface/xen.h>
60 */
61 e820_add_region(__pa(xen_start_info->mfn_list),
62 xen_start_info->pt_base - xen_start_info->mfn_list,
63 E820_RESERVED);
64
65 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
44 66
45 return "Xen"; 67 return "Xen";
46} 68}
@@ -61,30 +83,72 @@ static void xen_idle(void)
61 83
62/* 84/*
63 * Set the bit indicating "nosegneg" library variants should be used. 85 * Set the bit indicating "nosegneg" library variants should be used.
86 * We only need to bother in pure 32-bit mode; compat 32-bit processes
87 * can have un-truncated segments, so wrapping around is allowed.
64 */ 88 */
65static void __init fiddle_vdso(void) 89static void __init fiddle_vdso(void)
66{ 90{
67 extern const char vdso32_default_start; 91#ifdef CONFIG_X86_32
68 u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); 92 u32 *mask;
93 mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
94 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
95 mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
69 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; 96 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
97#endif
70} 98}
71 99
72void xen_enable_sysenter(void) 100static __cpuinit int register_callback(unsigned type, const void *func)
73{ 101{
74 int cpu = smp_processor_id(); 102 struct callback_register callback = {
75 extern void xen_sysenter_target(void); 103 .type = type,
76 /* Mask events on entry, even though they get enabled immediately */ 104 .address = XEN_CALLBACK(__KERNEL_CS, func),
77 static struct callback_register sysenter = {
78 .type = CALLBACKTYPE_sysenter,
79 .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
80 .flags = CALLBACKF_mask_events, 105 .flags = CALLBACKF_mask_events,
81 }; 106 };
82 107
83 if (!boot_cpu_has(X86_FEATURE_SEP) || 108 return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
84 HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) { 109}
85 clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); 110
86 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); 111void __cpuinit xen_enable_sysenter(void)
112{
113 extern void xen_sysenter_target(void);
114 int ret;
115 unsigned sysenter_feature;
116
117#ifdef CONFIG_X86_32
118 sysenter_feature = X86_FEATURE_SEP;
119#else
120 sysenter_feature = X86_FEATURE_SYSENTER32;
121#endif
122
123 if (!boot_cpu_has(sysenter_feature))
124 return;
125
126 ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
127 if(ret != 0)
128 setup_clear_cpu_cap(sysenter_feature);
129}
130
131void __cpuinit xen_enable_syscall(void)
132{
133#ifdef CONFIG_X86_64
134 int ret;
135 extern void xen_syscall_target(void);
136 extern void xen_syscall32_target(void);
137
138 ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
139 if (ret != 0) {
140 printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
141 /* Pretty fatal; 64-bit userspace has no other
142 mechanism for syscalls. */
143 }
144
145 if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
146 ret = register_callback(CALLBACKTYPE_syscall32,
147 xen_syscall32_target);
148 if (ret != 0)
149 setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
87 } 150 }
151#endif /* CONFIG_X86_64 */
88} 152}
89 153
90void __init xen_arch_setup(void) 154void __init xen_arch_setup(void)
@@ -98,10 +162,12 @@ void __init xen_arch_setup(void)
98 if (!xen_feature(XENFEAT_auto_translated_physmap)) 162 if (!xen_feature(XENFEAT_auto_translated_physmap))
99 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); 163 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
100 164
101 HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, 165 if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
102 __KERNEL_CS, (unsigned long)xen_failsafe_callback); 166 register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
167 BUG();
103 168
104 xen_enable_sysenter(); 169 xen_enable_sysenter();
170 xen_enable_syscall();
105 171
106 set_iopl.iopl = 1; 172 set_iopl.iopl = 1;
107 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 173 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
@@ -121,11 +187,6 @@ void __init xen_arch_setup(void)
121 187
122 pm_idle = xen_idle; 188 pm_idle = xen_idle;
123 189
124#ifdef CONFIG_SMP
125 /* fill cpus_possible with all available cpus */
126 xen_fill_possible_map();
127#endif
128
129 paravirt_disable_iospace(); 190 paravirt_disable_iospace();
130 191
131 fiddle_vdso(); 192 fiddle_vdso();
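
xen_memory_setup() now builds the whole e820 from start_info rather than the
old LOWMEMSIZE()/HIGH_MEMORY pair: one RAM region covering [0, max_pfn), the
ISA window re-reserved on top of it, and the span from mfn_list to pt_base
reserved so the kernel never hands the domain builder's p2m list back to the
allocator. A standalone mock-up of the resulting layout; every number below
except the 0xa0000-0x100000 ISA window is hypothetical:

#include <stdio.h>

struct region_model {
	unsigned long long start, end;
	const char *type;
};

int main(void)
{
	/* Hypothetical 512 MiB domain; mfn_list/pt_base values are made up. */
	unsigned long long max_pfn  = 0x20000;		/* 512 MiB of 4 KiB pages */
	unsigned long long mfn_list = 0x00c01000ULL;	/* __pa of start_info->mfn_list */
	unsigned long long pt_base  = 0x00d01000ULL;	/* builder-provided pagetables */

	struct region_model map[] = {
		{ 0,        max_pfn << 12, "RAM" },
		{ 0xa0000,  0x100000,      "RESERVED (ISA)" },
		{ mfn_list, pt_base,       "RESERVED (Xen start_info/p2m)" },
	};
	unsigned i;

	for (i = 0; i < sizeof(map) / sizeof(map[0]); i++)
		printf("%012llx - %012llx  %s\n",
		       map[i].start, map[i].end, map[i].type);
	return 0;
}
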
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 94e69000f982..d8faf79a0a1d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -15,6 +15,7 @@
15 * This does not handle HOTPLUG_CPU yet. 15 * This does not handle HOTPLUG_CPU yet.
16 */ 16 */
17#include <linux/sched.h> 17#include <linux/sched.h>
18#include <linux/kernel_stat.h>
18#include <linux/err.h> 19#include <linux/err.h>
19#include <linux/smp.h> 20#include <linux/smp.h>
20 21
@@ -35,28 +36,17 @@
35#include "xen-ops.h" 36#include "xen-ops.h"
36#include "mmu.h" 37#include "mmu.h"
37 38
38static cpumask_t xen_cpu_initialized_map; 39static void __cpuinit xen_init_lock_cpu(int cpu);
39static DEFINE_PER_CPU(int, resched_irq) = -1;
40static DEFINE_PER_CPU(int, callfunc_irq) = -1;
41static DEFINE_PER_CPU(int, debug_irq) = -1;
42 40
43/* 41cpumask_t xen_cpu_initialized_map;
44 * Structure and data for smp_call_function(). This is designed to minimise
45 * static memory requirements. It also looks cleaner.
46 */
47static DEFINE_SPINLOCK(call_lock);
48
49struct call_data_struct {
50 void (*func) (void *info);
51 void *info;
52 atomic_t started;
53 atomic_t finished;
54 int wait;
55};
56 42
57static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); 43static DEFINE_PER_CPU(int, resched_irq);
44static DEFINE_PER_CPU(int, callfunc_irq);
45static DEFINE_PER_CPU(int, callfuncsingle_irq);
46static DEFINE_PER_CPU(int, debug_irq) = -1;
58 47
59static struct call_data_struct *call_data; 48static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
49static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
60 50
61/* 51/*
62 * Reschedule call back. Nothing to do, 52 * Reschedule call back. Nothing to do,
@@ -65,6 +55,12 @@ static struct call_data_struct *call_data;
65 */ 55 */
66static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) 56static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
67{ 57{
58#ifdef CONFIG_X86_32
59 __get_cpu_var(irq_stat).irq_resched_count++;
60#else
61 add_pda(irq_resched_count, 1);
62#endif
63
68 return IRQ_HANDLED; 64 return IRQ_HANDLED;
69} 65}
70 66
@@ -73,13 +69,22 @@ static __cpuinit void cpu_bringup_and_idle(void)
73 int cpu = smp_processor_id(); 69 int cpu = smp_processor_id();
74 70
75 cpu_init(); 71 cpu_init();
72 preempt_disable();
73
76 xen_enable_sysenter(); 74 xen_enable_sysenter();
75 xen_enable_syscall();
77 76
78 preempt_disable(); 77 cpu = smp_processor_id();
79 per_cpu(cpu_state, cpu) = CPU_ONLINE; 78 smp_store_cpu_info(cpu);
79 cpu_data(cpu).x86_max_cores = 1;
80 set_cpu_sibling_map(cpu);
80 81
81 xen_setup_cpu_clockevents(); 82 xen_setup_cpu_clockevents();
82 83
84 cpu_set(cpu, cpu_online_map);
85 x86_write_percpu(cpu_state, CPU_ONLINE);
86 wmb();
87
83 /* We can take interrupts now: we're officially "up". */ 88 /* We can take interrupts now: we're officially "up". */
84 local_irq_enable(); 89 local_irq_enable();
85 90
@@ -122,6 +127,17 @@ static int xen_smp_intr_init(unsigned int cpu)
122 goto fail; 127 goto fail;
123 per_cpu(debug_irq, cpu) = rc; 128 per_cpu(debug_irq, cpu) = rc;
124 129
130 callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
131 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
132 cpu,
133 xen_call_function_single_interrupt,
134 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
135 callfunc_name,
136 NULL);
137 if (rc < 0)
138 goto fail;
139 per_cpu(callfuncsingle_irq, cpu) = rc;
140
125 return 0; 141 return 0;
126 142
127 fail: 143 fail:
@@ -131,59 +147,45 @@ static int xen_smp_intr_init(unsigned int cpu)
131 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); 147 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
132 if (per_cpu(debug_irq, cpu) >= 0) 148 if (per_cpu(debug_irq, cpu) >= 0)
133 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); 149 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
150 if (per_cpu(callfuncsingle_irq, cpu) >= 0)
151 unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
152
134 return rc; 153 return rc;
135} 154}
136 155
137void __init xen_fill_possible_map(void) 156static void __init xen_fill_possible_map(void)
138{ 157{
139 int i, rc; 158 int i, rc;
140 159
141 for (i = 0; i < NR_CPUS; i++) { 160 for (i = 0; i < NR_CPUS; i++) {
142 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 161 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
143 if (rc >= 0) 162 if (rc >= 0) {
163 num_processors++;
144 cpu_set(i, cpu_possible_map); 164 cpu_set(i, cpu_possible_map);
165 }
145 } 166 }
146} 167}
147 168
148void __init xen_smp_prepare_boot_cpu(void) 169static void __init xen_smp_prepare_boot_cpu(void)
149{ 170{
150 int cpu;
151
152 BUG_ON(smp_processor_id() != 0); 171 BUG_ON(smp_processor_id() != 0);
153 native_smp_prepare_boot_cpu(); 172 native_smp_prepare_boot_cpu();
154 173
155 /* We've switched to the "real" per-cpu gdt, so make sure the 174 /* We've switched to the "real" per-cpu gdt, so make sure the
156 old memory can be recycled */ 175 old memory can be recycled */
157 make_lowmem_page_readwrite(&per_cpu__gdt_page); 176 make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
158
159 for_each_possible_cpu(cpu) {
160 cpus_clear(per_cpu(cpu_sibling_map, cpu));
161 /*
162 * cpu_core_map lives in a per cpu area that is cleared
163 * when the per cpu array is allocated.
164 *
165 * cpus_clear(per_cpu(cpu_core_map, cpu));
166 */
167 }
168 177
169 xen_setup_vcpu_info_placement(); 178 xen_setup_vcpu_info_placement();
170} 179}
171 180
172void __init xen_smp_prepare_cpus(unsigned int max_cpus) 181static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
173{ 182{
174 unsigned cpu; 183 unsigned cpu;
175 184
176 for_each_possible_cpu(cpu) { 185 xen_init_lock_cpu(0);
177 cpus_clear(per_cpu(cpu_sibling_map, cpu));
178 /*
179 * cpu_core_ map will be zeroed when the per
180 * cpu area is allocated.
181 *
182 * cpus_clear(per_cpu(cpu_core_map, cpu));
183 */
184 }
185 186
186 smp_store_cpu_info(0); 187 smp_store_cpu_info(0);
188 cpu_data(0).x86_max_cores = 1;
187 set_cpu_sibling_map(0); 189 set_cpu_sibling_map(0);
188 190
189 if (xen_smp_intr_init(0)) 191 if (xen_smp_intr_init(0))
@@ -218,7 +220,7 @@ static __cpuinit int
218cpu_initialize_context(unsigned int cpu, struct task_struct *idle) 220cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
219{ 221{
220 struct vcpu_guest_context *ctxt; 222 struct vcpu_guest_context *ctxt;
221 struct gdt_page *gdt = &per_cpu(gdt_page, cpu); 223 struct desc_struct *gdt;
222 224
223 if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) 225 if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
224 return 0; 226 return 0;
@@ -227,12 +229,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
227 if (ctxt == NULL) 229 if (ctxt == NULL)
228 return -ENOMEM; 230 return -ENOMEM;
229 231
232 gdt = get_cpu_gdt_table(cpu);
233
230 ctxt->flags = VGCF_IN_KERNEL; 234 ctxt->flags = VGCF_IN_KERNEL;
231 ctxt->user_regs.ds = __USER_DS; 235 ctxt->user_regs.ds = __USER_DS;
232 ctxt->user_regs.es = __USER_DS; 236 ctxt->user_regs.es = __USER_DS;
233 ctxt->user_regs.fs = __KERNEL_PERCPU;
234 ctxt->user_regs.gs = 0;
235 ctxt->user_regs.ss = __KERNEL_DS; 237 ctxt->user_regs.ss = __KERNEL_DS;
238#ifdef CONFIG_X86_32
239 ctxt->user_regs.fs = __KERNEL_PERCPU;
240#endif
236 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; 241 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
237 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ 242 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
238 243
@@ -242,11 +247,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
242 247
243 ctxt->ldt_ents = 0; 248 ctxt->ldt_ents = 0;
244 249
245 BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); 250 BUG_ON((unsigned long)gdt & ~PAGE_MASK);
246 make_lowmem_page_readonly(gdt->gdt); 251 make_lowmem_page_readonly(gdt);
247 252
248 ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); 253 ctxt->gdt_frames[0] = virt_to_mfn(gdt);
249 ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); 254 ctxt->gdt_ents = GDT_ENTRIES;
250 255
251 ctxt->user_regs.cs = __KERNEL_CS; 256 ctxt->user_regs.cs = __KERNEL_CS;
252 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); 257 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
@@ -254,9 +259,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
254 ctxt->kernel_ss = __KERNEL_DS; 259 ctxt->kernel_ss = __KERNEL_DS;
255 ctxt->kernel_sp = idle->thread.sp0; 260 ctxt->kernel_sp = idle->thread.sp0;
256 261
262#ifdef CONFIG_X86_32
257 ctxt->event_callback_cs = __KERNEL_CS; 263 ctxt->event_callback_cs = __KERNEL_CS;
258 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
259 ctxt->failsafe_callback_cs = __KERNEL_CS; 264 ctxt->failsafe_callback_cs = __KERNEL_CS;
265#endif
266 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
260 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; 267 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
261 268
262 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); 269 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
@@ -269,7 +276,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
269 return 0; 276 return 0;
270} 277}
271 278
272int __cpuinit xen_cpu_up(unsigned int cpu) 279static int __cpuinit xen_cpu_up(unsigned int cpu)
273{ 280{
274 struct task_struct *idle = idle_task(cpu); 281 struct task_struct *idle = idle_task(cpu);
275 int rc; 282 int rc;
@@ -280,10 +287,28 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
280 return rc; 287 return rc;
281#endif 288#endif
282 289
290#ifdef CONFIG_X86_64
291 /* Allocate node local memory for AP pdas */
292 WARN_ON(cpu == 0);
293 if (cpu > 0) {
294 rc = get_local_pda(cpu);
295 if (rc)
296 return rc;
297 }
298#endif
299
300#ifdef CONFIG_X86_32
283 init_gdt(cpu); 301 init_gdt(cpu);
284 per_cpu(current_task, cpu) = idle; 302 per_cpu(current_task, cpu) = idle;
285 irq_ctx_init(cpu); 303 irq_ctx_init(cpu);
304#else
305 cpu_pda(cpu)->pcurrent = idle;
306 clear_tsk_thread_flag(idle, TIF_FORK);
307#endif
286 xen_setup_timer(cpu); 308 xen_setup_timer(cpu);
309 xen_init_lock_cpu(cpu);
310
311 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
287 312
288 /* make sure interrupts start blocked */ 313 /* make sure interrupts start blocked */
289 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; 314 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -299,20 +324,18 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
299 if (rc) 324 if (rc)
300 return rc; 325 return rc;
301 326
302 smp_store_cpu_info(cpu);
303 set_cpu_sibling_map(cpu);
304 /* This must be done before setting cpu_online_map */
305 wmb();
306
307 cpu_set(cpu, cpu_online_map);
308
309 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); 327 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
310 BUG_ON(rc); 328 BUG_ON(rc);
311 329
330 while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
331 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
332 barrier();
333 }
334
312 return 0; 335 return 0;
313} 336}
314 337
315void xen_smp_cpus_done(unsigned int max_cpus) 338static void xen_smp_cpus_done(unsigned int max_cpus)
316{ 339{
317} 340}
318 341
@@ -328,104 +351,254 @@ static void stop_self(void *v)
328 BUG(); 351 BUG();
329} 352}
330 353
331void xen_smp_send_stop(void) 354static void xen_smp_send_stop(void)
332{ 355{
333 smp_call_function(stop_self, NULL, 0, 0); 356 smp_call_function(stop_self, NULL, 0);
334} 357}
335 358
336void xen_smp_send_reschedule(int cpu) 359static void xen_smp_send_reschedule(int cpu)
337{ 360{
338 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); 361 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
339} 362}
340 363
341
342static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) 364static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
343{ 365{
344 unsigned cpu; 366 unsigned cpu;
345 367
346 cpus_and(mask, mask, cpu_online_map); 368 cpus_and(mask, mask, cpu_online_map);
347 369
348 for_each_cpu_mask(cpu, mask) 370 for_each_cpu_mask_nr(cpu, mask)
349 xen_send_IPI_one(cpu, vector); 371 xen_send_IPI_one(cpu, vector);
350} 372}
351 373
374static void xen_smp_send_call_function_ipi(cpumask_t mask)
375{
376 int cpu;
377
378 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
379
380 /* Make sure other vcpus get a chance to run if they need to. */
381 for_each_cpu_mask_nr(cpu, mask) {
382 if (xen_vcpu_stolen(cpu)) {
383 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
384 break;
385 }
386 }
387}
388
389static void xen_smp_send_call_function_single_ipi(int cpu)
390{
391 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
392}
393
352static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) 394static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
353{ 395{
354 void (*func) (void *info) = call_data->func;
355 void *info = call_data->info;
356 int wait = call_data->wait;
357
358 /*
359 * Notify initiating CPU that I've grabbed the data and am
360 * about to execute the function
361 */
362 mb();
363 atomic_inc(&call_data->started);
364 /*
365 * At this point the info structure may be out of scope unless wait==1
366 */
367 irq_enter(); 396 irq_enter();
368 (*func)(info); 397 generic_smp_call_function_interrupt();
398#ifdef CONFIG_X86_32
369 __get_cpu_var(irq_stat).irq_call_count++; 399 __get_cpu_var(irq_stat).irq_call_count++;
400#else
401 add_pda(irq_call_count, 1);
402#endif
370 irq_exit(); 403 irq_exit();
371 404
372 if (wait) { 405 return IRQ_HANDLED;
373 mb(); /* commit everything before setting finished */ 406}
374 atomic_inc(&call_data->finished); 407
375 } 408static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
409{
410 irq_enter();
411 generic_smp_call_function_single_interrupt();
412#ifdef CONFIG_X86_32
413 __get_cpu_var(irq_stat).irq_call_count++;
414#else
415 add_pda(irq_call_count, 1);
416#endif
417 irq_exit();
376 418
377 return IRQ_HANDLED; 419 return IRQ_HANDLED;
378} 420}
379 421
380int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), 422struct xen_spinlock {
381 void *info, int wait) 423 unsigned char lock; /* 0 -> free; 1 -> locked */
424 unsigned short spinners; /* count of waiting cpus */
425};
426
427static int xen_spin_is_locked(struct raw_spinlock *lock)
382{ 428{
383 struct call_data_struct data; 429 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
384 int cpus, cpu;
385 bool yield;
386 430
387 /* Holding any lock stops cpus from going down. */ 431 return xl->lock != 0;
388 spin_lock(&call_lock); 432}
389 433
390 cpu_clear(smp_processor_id(), mask); 434static int xen_spin_is_contended(struct raw_spinlock *lock)
435{
436 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
391 437
392 cpus = cpus_weight(mask); 438 /* Not strictly true; this is only the count of contended
393 if (!cpus) { 439 lock-takers entering the slow path. */
394 spin_unlock(&call_lock); 440 return xl->spinners != 0;
441}
442
443static int xen_spin_trylock(struct raw_spinlock *lock)
444{
445 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
446 u8 old = 1;
447
448 asm("xchgb %b0,%1"
449 : "+q" (old), "+m" (xl->lock) : : "memory");
450
451 return old == 0;
452}
453
454static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
455static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
456
457static inline void spinning_lock(struct xen_spinlock *xl)
458{
459 __get_cpu_var(lock_spinners) = xl;
460 wmb(); /* set lock of interest before count */
461 asm(LOCK_PREFIX " incw %0"
462 : "+m" (xl->spinners) : : "memory");
463}
464
465static inline void unspinning_lock(struct xen_spinlock *xl)
466{
467 asm(LOCK_PREFIX " decw %0"
468 : "+m" (xl->spinners) : : "memory");
469 wmb(); /* decrement count before clearing lock */
470 __get_cpu_var(lock_spinners) = NULL;
471}
472
473static noinline int xen_spin_lock_slow(struct raw_spinlock *lock)
474{
475 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
476 int irq = __get_cpu_var(lock_kicker_irq);
477 int ret;
478
479 /* If kicker interrupts not initialized yet, just spin */
480 if (irq == -1)
395 return 0; 481 return 0;
482
483 /* announce we're spinning */
484 spinning_lock(xl);
485
486 /* clear pending */
487 xen_clear_irq_pending(irq);
488
489 /* check again make sure it didn't become free while
490 we weren't looking */
491 ret = xen_spin_trylock(lock);
492 if (ret)
493 goto out;
494
495 /* block until irq becomes pending */
496 xen_poll_irq(irq);
497 kstat_this_cpu.irqs[irq]++;
498
499out:
500 unspinning_lock(xl);
501 return ret;
502}
503
504static void xen_spin_lock(struct raw_spinlock *lock)
505{
506 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
507 int timeout;
508 u8 oldval;
509
510 do {
511 timeout = 1 << 10;
512
513 asm("1: xchgb %1,%0\n"
514 " testb %1,%1\n"
515 " jz 3f\n"
516 "2: rep;nop\n"
517 " cmpb $0,%0\n"
518 " je 1b\n"
519 " dec %2\n"
520 " jnz 2b\n"
521 "3:\n"
522 : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
523 : "1" (1)
524 : "memory");
525
526 } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock)));
527}
528
529static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
530{
531 int cpu;
532
533 for_each_online_cpu(cpu) {
534 /* XXX should mix up next cpu selection */
535 if (per_cpu(lock_spinners, cpu) == xl) {
536 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
537 break;
538 }
396 } 539 }
540}
397 541
398 /* Can deadlock when called with interrupts disabled */ 542static void xen_spin_unlock(struct raw_spinlock *lock)
399 WARN_ON(irqs_disabled()); 543{
544 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
400 545
401 data.func = func; 546 smp_wmb(); /* make sure no writes get moved after unlock */
402 data.info = info; 547 xl->lock = 0; /* release lock */
403 atomic_set(&data.started, 0);
404 data.wait = wait;
405 if (wait)
406 atomic_set(&data.finished, 0);
407 548
408 call_data = &data; 549 /* make sure unlock happens before kick */
409 mb(); /* write everything before IPI */ 550 barrier();
410 551
411 /* Send a message to other CPUs and wait for them to respond */ 552 if (unlikely(xl->spinners))
412 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); 553 xen_spin_unlock_slow(xl);
554}
413 555
414 /* Make sure other vcpus get a chance to run if they need to. */ 556static __cpuinit void xen_init_lock_cpu(int cpu)
415 yield = false; 557{
416 for_each_cpu_mask(cpu, mask) 558 int irq;
417 if (xen_vcpu_stolen(cpu)) 559 const char *name;
418 yield = true; 560
561 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
562 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
563 cpu,
564 xen_reschedule_interrupt,
565 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
566 name,
567 NULL);
568
569 if (irq >= 0) {
570 disable_irq(irq); /* make sure it's never delivered */
571 per_cpu(lock_kicker_irq, cpu) = irq;
572 }
419 573
420 if (yield) 574 printk("cpu %d spinlock event irq %d\n", cpu, irq);
421 HYPERVISOR_sched_op(SCHEDOP_yield, 0); 575}
422 576
423 /* Wait for response */ 577static void __init xen_init_spinlocks(void)
424 while (atomic_read(&data.started) != cpus || 578{
425 (wait && atomic_read(&data.finished) != cpus)) 579 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
426 cpu_relax(); 580 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
581 pv_lock_ops.spin_lock = xen_spin_lock;
582 pv_lock_ops.spin_trylock = xen_spin_trylock;
583 pv_lock_ops.spin_unlock = xen_spin_unlock;
584}
427 585
428 spin_unlock(&call_lock); 586static const struct smp_ops xen_smp_ops __initdata = {
587 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
588 .smp_prepare_cpus = xen_smp_prepare_cpus,
589 .cpu_up = xen_cpu_up,
590 .smp_cpus_done = xen_smp_cpus_done,
429 591
430 return 0; 592 .smp_send_stop = xen_smp_send_stop,
593 .smp_send_reschedule = xen_smp_send_reschedule,
594
595 .send_call_func_ipi = xen_smp_send_call_function_ipi,
596 .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
597};
598
599void __init xen_smp_init(void)
600{
601 smp_ops = xen_smp_ops;
602 xen_fill_possible_map();
603 xen_init_spinlocks();
431} 604}
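
The paravirtual spinlock added here is a byte lock with an out-of-band waiter
count: the fast path is a single xchgb of 1 (an old value of 0 means the lock
was taken), unlock is a plain store of 0, and only when xl->spinners is
non-zero does the unlocker look for a CPU parked in xen_spin_lock_slow() to
kick through its per-cpu event channel. Keeping the count outside the lock
byte means the uncontended unlock stays a store plus a read, with all
hypervisor interaction pushed into the contended path. A single-threaded
userspace model of the fast-path semantics, using GCC atomic builtins in
place of the inline asm and omitting the poll/kick machinery entirely:

#include <stdio.h>
#include <stdint.h>

struct xen_spinlock_model {
	uint8_t  lock;		/* 0 -> free; 1 -> locked */
	uint16_t spinners;	/* waiters sitting in the slow path */
};

static int trylock(struct xen_spinlock_model *xl)
{
	/* xchgb: fetch the old value while storing 1; 0 means we got it */
	return __atomic_exchange_n(&xl->lock, 1, __ATOMIC_ACQUIRE) == 0;
}

static void unlock(struct xen_spinlock_model *xl)
{
	__atomic_store_n(&xl->lock, 0, __ATOMIC_RELEASE);
	/* the kernel version then kicks a waiter if xl->spinners != 0 */
}

int main(void)
{
	struct xen_spinlock_model xl = { 0, 0 };

	printf("first trylock:  %d\n", trylock(&xl));	/* 1: acquired */
	printf("second trylock: %d\n", trylock(&xl));	/* 0: already held */
	unlock(&xl);
	printf("after unlock:   %d\n", trylock(&xl));	/* 1: acquired again */
	return 0;
}
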
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
new file mode 100644
index 000000000000..2a234db5949b
--- /dev/null
+++ b/arch/x86/xen/suspend.c
@@ -0,0 +1,48 @@
1#include <linux/types.h>
2
3#include <xen/interface/xen.h>
4#include <xen/grant_table.h>
5#include <xen/events.h>
6
7#include <asm/xen/hypercall.h>
8#include <asm/xen/page.h>
9
10#include "xen-ops.h"
11#include "mmu.h"
12
13void xen_pre_suspend(void)
14{
15 xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
16 xen_start_info->console.domU.mfn =
17 mfn_to_pfn(xen_start_info->console.domU.mfn);
18
19 BUG_ON(!irqs_disabled());
20
21 HYPERVISOR_shared_info = &xen_dummy_shared_info;
22 if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
23 __pte_ma(0), 0))
24 BUG();
25}
26
27void xen_post_suspend(int suspend_cancelled)
28{
29 xen_setup_shared_info();
30
31 if (suspend_cancelled) {
32 xen_start_info->store_mfn =
33 pfn_to_mfn(xen_start_info->store_mfn);
34 xen_start_info->console.domU.mfn =
35 pfn_to_mfn(xen_start_info->console.domU.mfn);
36 } else {
37#ifdef CONFIG_SMP
38 xen_cpu_initialized_map = cpu_online_map;
39#endif
40 xen_vcpu_restore();
41 }
42
43}
44
45void xen_arch_resume(void)
46{
47 /* nothing */
48}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 41e217503c96..685b77470fc3 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -197,8 +197,8 @@ unsigned long long xen_sched_clock(void)
197} 197}
198 198
199 199
200/* Get the CPU speed from Xen */ 200/* Get the TSC speed from Xen */
201unsigned long xen_cpu_khz(void) 201unsigned long xen_tsc_khz(void)
202{ 202{
203 u64 xen_khz = 1000000ULL << 32; 203 u64 xen_khz = 1000000ULL << 32;
204 const struct pvclock_vcpu_time_info *info = 204 const struct pvclock_vcpu_time_info *info =
@@ -459,6 +459,19 @@ void xen_setup_cpu_clockevents(void)
459 clockevents_register_device(&__get_cpu_var(xen_clock_events)); 459 clockevents_register_device(&__get_cpu_var(xen_clock_events));
460} 460}
461 461
462void xen_timer_resume(void)
463{
464 int cpu;
465
466 if (xen_clockevent != &xen_vcpuop_clockevent)
467 return;
468
469 for_each_online_cpu(cpu) {
470 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
471 BUG();
472 }
473}
474
462__init void xen_time_init(void) 475__init void xen_time_init(void)
463{ 476{
464 int cpu = smp_processor_id(); 477 int cpu = smp_processor_id();
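The new xen_timer_resume() only matters when the per-vcpu one-shot clockevent is in use (hence the early-exit test against xen_vcpuop_clockevent); it stops the hypervisor's periodic tick on every online vcpu so that, after a restore, timekeeping continues purely on explicitly armed single-shot events. A hedged sketch of how such an event is armed, assuming the VCPUOP_set_singleshot_timer interface and leaving out the delta-to-absolute-time conversion the real clockevent performs:

/* Sketch, not the driver itself: arm one absolute single-shot timeout. */
static int sketch_arm_oneshot(int cpu, unsigned long long deadline_abs_ns)
{
	struct vcpu_set_singleshot_timer single = {
		.timeout_abs_ns = deadline_abs_ns,
		.flags = VCPU_SSHOTTMR_future,	/* fail rather than fire in the past */
	};

	return HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
}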
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41de..2497a30f41de 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm_32.S
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
new file mode 100644
index 000000000000..7f58304fafb3
--- /dev/null
+++ b/arch/x86/xen/xen-asm_64.S
@@ -0,0 +1,271 @@
1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
5
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
8
9 We only bother with direct forms (ie, vcpu in pda) of the operations
10 here; the indirect forms are better handled in C, since they're
11 generally too large to inline anyway.
12 */
13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/processor-flags.h>
18#include <asm/errno.h>
19#include <asm/segment.h>
20
21#include <xen/interface/xen.h>
22
23#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
24#define ENDPATCH(x) .globl x##_end; x##_end=.
25
26/* Pseudo-flag used for virtual NMI, which we don't implement yet */
27#define XEN_EFLAGS_NMI 0x80000000
28
29#if 0
30#include <asm/percpu.h>
31
32/*
33 Enable events. This clears the event mask and tests the pending
 34 event status with a single 'and' operation. If there are pending
35 events, then enter the hypervisor to get them handled.
36 */
37ENTRY(xen_irq_enable_direct)
38 /* Unmask events */
39 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
40
41 /* Preempt here doesn't matter because that will deal with
42 any pending interrupts. The pending check may end up being
43 run on the wrong CPU, but that doesn't hurt. */
44
45 /* Test for pending */
46 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
47 jz 1f
48
492: call check_events
501:
51ENDPATCH(xen_irq_enable_direct)
52 ret
53 ENDPROC(xen_irq_enable_direct)
54 RELOC(xen_irq_enable_direct, 2b+1)
55
56/*
57 Disabling events is simply a matter of making the event mask
58 non-zero.
59 */
60ENTRY(xen_irq_disable_direct)
61 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
62ENDPATCH(xen_irq_disable_direct)
63 ret
64 ENDPROC(xen_irq_disable_direct)
65 RELOC(xen_irq_disable_direct, 0)
66
67/*
68 (xen_)save_fl is used to get the current interrupt enable status.
69 Callers expect the status to be in X86_EFLAGS_IF, and other bits
70 may be set in the return value. We take advantage of this by
71 making sure that X86_EFLAGS_IF has the right value (and other bits
72 in that byte are 0), but other bits in the return value are
73 undefined. We need to toggle the state of the bit, because
74 Xen and x86 use opposite senses (mask vs enable).
75 */
76ENTRY(xen_save_fl_direct)
77 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
78 setz %ah
79 addb %ah,%ah
80ENDPATCH(xen_save_fl_direct)
81 ret
82 ENDPROC(xen_save_fl_direct)
83 RELOC(xen_save_fl_direct, 0)
84
85/*
 86 In principle the caller should be passing us a value returned
 87 from xen_save_fl_direct, but for robustness' sake we test only
88 the X86_EFLAGS_IF flag rather than the whole byte. After
89 setting the interrupt mask state, it checks for unmasked
90 pending events and enters the hypervisor to get them delivered
91 if so.
92 */
93ENTRY(xen_restore_fl_direct)
94 testb $X86_EFLAGS_IF>>8, %ah
95 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
96 /* Preempt here doesn't matter because that will deal with
97 any pending interrupts. The pending check may end up being
98 run on the wrong CPU, but that doesn't hurt. */
99
100 /* check for unmasked and pending */
101 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
102 jz 1f
1032: call check_events
1041:
105ENDPATCH(xen_restore_fl_direct)
106 ret
107 ENDPROC(xen_restore_fl_direct)
108 RELOC(xen_restore_fl_direct, 2b+1)
109
110
111/*
112 Force an event check by making a hypercall,
113 but preserve regs before making the call.
114 */
115check_events:
116 push %rax
117 push %rcx
118 push %rdx
119 push %rsi
120 push %rdi
121 push %r8
122 push %r9
123 push %r10
124 push %r11
125 call force_evtchn_callback
126 pop %r11
127 pop %r10
128 pop %r9
129 pop %r8
130 pop %rdi
131 pop %rsi
132 pop %rdx
133 pop %rcx
134 pop %rax
135 ret
136#endif
137
138ENTRY(xen_adjust_exception_frame)
139 mov 8+0(%rsp),%rcx
140 mov 8+8(%rsp),%r11
141 ret $16
142
143hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
144/*
145 Xen64 iret frame:
146
147 ss
148 rsp
149 rflags
150 cs
151 rip <-- standard iret frame
152
153 flags
154
155 rcx }
156 r11 }<-- pushed by hypercall page
157rsp -> rax }
158 */
159ENTRY(xen_iret)
160 pushq $0
1611: jmp hypercall_iret
162ENDPATCH(xen_iret)
163RELOC(xen_iret, 1b+1)
164
165/*
166 sysexit is not used for 64-bit processes, so it's
167 only ever used to return to 32-bit compat userspace.
168 */
169ENTRY(xen_sysexit)
170 pushq $__USER32_DS
171 pushq %rcx
172 pushq $X86_EFLAGS_IF
173 pushq $__USER32_CS
174 pushq %rdx
175
176 pushq $0
1771: jmp hypercall_iret
178ENDPATCH(xen_sysexit)
179RELOC(xen_sysexit, 1b+1)
180
181ENTRY(xen_sysret64)
182 /* We're already on the usermode stack at this point, but still
183 with the kernel gs, so we can easily switch back */
184 movq %rsp, %gs:pda_oldrsp
185 movq %gs:pda_kernelstack,%rsp
186
187 pushq $__USER_DS
188 pushq %gs:pda_oldrsp
189 pushq %r11
190 pushq $__USER_CS
191 pushq %rcx
192
193 pushq $VGCF_in_syscall
1941: jmp hypercall_iret
195ENDPATCH(xen_sysret64)
196RELOC(xen_sysret64, 1b+1)
197
198ENTRY(xen_sysret32)
199 /* We're already on the usermode stack at this point, but still
200 with the kernel gs, so we can easily switch back */
201 movq %rsp, %gs:pda_oldrsp
202 movq %gs:pda_kernelstack, %rsp
203
204 pushq $__USER32_DS
205 pushq %gs:pda_oldrsp
206 pushq %r11
207 pushq $__USER32_CS
208 pushq %rcx
209
210 pushq $VGCF_in_syscall
2111: jmp hypercall_iret
212ENDPATCH(xen_sysret32)
213RELOC(xen_sysret32, 1b+1)
214
215/*
216 Xen handles syscall callbacks much like ordinary exceptions,
217 which means we have:
218 - kernel gs
219 - kernel rsp
220 - an iret-like stack frame on the stack (including rcx and r11):
221 ss
222 rsp
223 rflags
224 cs
225 rip
226 r11
227 rsp-> rcx
228
229 In all the entrypoints, we undo all that to make it look
230 like a CPU-generated syscall/sysenter and jump to the normal
231 entrypoint.
232 */
233
234.macro undo_xen_syscall
235 mov 0*8(%rsp),%rcx
236 mov 1*8(%rsp),%r11
237 mov 5*8(%rsp),%rsp
238.endm
239
240/* Normal 64-bit system call target */
241ENTRY(xen_syscall_target)
242 undo_xen_syscall
243 jmp system_call_after_swapgs
244ENDPROC(xen_syscall_target)
245
246#ifdef CONFIG_IA32_EMULATION
247
248/* 32-bit compat syscall target */
249ENTRY(xen_syscall32_target)
250 undo_xen_syscall
251 jmp ia32_cstar_target
252ENDPROC(xen_syscall32_target)
253
254/* 32-bit compat sysenter target */
255ENTRY(xen_sysenter_target)
256 undo_xen_syscall
257 jmp ia32_sysenter_target
258ENDPROC(xen_sysenter_target)
259
260#else /* !CONFIG_IA32_EMULATION */
261
262ENTRY(xen_syscall32_target)
263ENTRY(xen_sysenter_target)
264 lea 16(%rsp), %rsp /* strip %rcx,%r11 */
265 mov $-ENOSYS, %rax
266 pushq $VGCF_in_syscall
267 jmp hypercall_iret
268ENDPROC(xen_syscall32_target)
269ENDPROC(xen_sysenter_target)
270
271#endif /* CONFIG_IA32_EMULATION */
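Two details in this file are easy to misread. First, hypercall_iret is just an offset into the hypercall page: every hypercall stub occupies 32 bytes, so entry N sits at hypercall_page + N * 32. Second, the flag ops (still bracketed by #if 0 here) work by inverting Xen's per-vcpu event mask, because the mask and EFLAGS.IF have opposite senses. A C rendering of those direct-form flag ops, as a sketch only; the real entry points are the patched assembly above, and deliver_pending_events() stands in for the check_events/force_evtchn_callback path:

#define SKETCH_EFLAGS_IF 0x00000200UL	/* X86_EFLAGS_IF */

struct vcpu_info_sketch {			/* the two relevant vcpu_info fields */
	unsigned char evtchn_upcall_pending;
	unsigned char evtchn_upcall_mask;	/* non-zero: event delivery blocked */
};

static void deliver_pending_events(void)
{
	/* in the real code: a hypercall that forces pending events through */
}

static unsigned long sketch_save_fl(const struct vcpu_info_sketch *v)
{
	/* Xen's mask and the x86 IF flag have opposite senses. */
	return v->evtchn_upcall_mask ? 0 : SKETCH_EFLAGS_IF;
}

static void sketch_restore_fl(struct vcpu_info_sketch *v, unsigned long flags)
{
	v->evtchn_upcall_mask = (flags & SKETCH_EFLAGS_IF) ? 0 : 1;
	if (!v->evtchn_upcall_mask && v->evtchn_upcall_pending)
		deliver_pending_events();	/* the asm calls check_events here */
}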
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 6ec3b4f7719b..63d49a523ed3 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -5,14 +5,24 @@
5 5
6#include <linux/elfnote.h> 6#include <linux/elfnote.h>
7#include <linux/init.h> 7#include <linux/init.h>
8
8#include <asm/boot.h> 9#include <asm/boot.h>
10#include <asm/asm.h>
11#include <asm/page.h>
12
9#include <xen/interface/elfnote.h> 13#include <xen/interface/elfnote.h>
14#include <asm/xen/interface.h>
10 15
11 __INIT 16 __INIT
12ENTRY(startup_xen) 17ENTRY(startup_xen)
13 movl %esi,xen_start_info
14 cld 18 cld
15 movl $(init_thread_union+THREAD_SIZE),%esp 19#ifdef CONFIG_X86_32
20 mov %esi,xen_start_info
21 mov $init_thread_union+THREAD_SIZE,%esp
22#else
23 mov %rsi,xen_start_info
24 mov $init_thread_union+THREAD_SIZE,%rsp
25#endif
16 jmp xen_start_kernel 26 jmp xen_start_kernel
17 27
18 __FINIT 28 __FINIT
@@ -20,17 +30,26 @@ ENTRY(startup_xen)
20.pushsection .text 30.pushsection .text
21 .align PAGE_SIZE_asm 31 .align PAGE_SIZE_asm
22ENTRY(hypercall_page) 32ENTRY(hypercall_page)
23 .skip 0x1000 33 .skip PAGE_SIZE_asm
24.popsection 34.popsection
25 35
26 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") 36 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
27 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") 37 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
28 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") 38 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
29 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) 39#ifdef CONFIG_X86_32
30 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) 40 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
31 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) 41#else
42 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
43#endif
44 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
45 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
32 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") 46 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
33 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") 47 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
34 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") 48 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
49 ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
50 .quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
51 ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
52 ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
53 ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
35 54
36#endif /*CONFIG_XEN */ 55#endif /*CONFIG_XEN */
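The note payloads above switch from a hard-coded .long to _ASM_PTR from the newly included asm/asm.h, so one source file emits native pointer-sized values for both 32-bit and 64-bit kernels. The idea, roughly (a sketch, not the exact macro):

/* Sketch of what _ASM_PTR selects; see asm/asm.h for the real definition. */
#ifdef CONFIG_X86_32
# define ASM_PTR_SKETCH	.long	/* 4-byte note payload */
#else
# define ASM_PTR_SKETCH	.quad	/* 8-byte note payload */
#endif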
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index f1063ae08037..dd3c23152a2e 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -9,22 +9,31 @@
9extern const char xen_hypervisor_callback[]; 9extern const char xen_hypervisor_callback[];
10extern const char xen_failsafe_callback[]; 10extern const char xen_failsafe_callback[];
11 11
12struct trap_info;
12void xen_copy_trap_info(struct trap_info *traps); 13void xen_copy_trap_info(struct trap_info *traps);
13 14
14DECLARE_PER_CPU(unsigned long, xen_cr3); 15DECLARE_PER_CPU(unsigned long, xen_cr3);
15DECLARE_PER_CPU(unsigned long, xen_current_cr3); 16DECLARE_PER_CPU(unsigned long, xen_current_cr3);
16 17
17extern struct start_info *xen_start_info; 18extern struct start_info *xen_start_info;
19extern struct shared_info xen_dummy_shared_info;
18extern struct shared_info *HYPERVISOR_shared_info; 20extern struct shared_info *HYPERVISOR_shared_info;
19 21
22void xen_setup_mfn_list_list(void);
23void xen_setup_shared_info(void);
24
20char * __init xen_memory_setup(void); 25char * __init xen_memory_setup(void);
21void __init xen_arch_setup(void); 26void __init xen_arch_setup(void);
22void __init xen_init_IRQ(void); 27void __init xen_init_IRQ(void);
23void xen_enable_sysenter(void); 28void xen_enable_sysenter(void);
29void xen_enable_syscall(void);
30void xen_vcpu_restore(void);
31
32void __init xen_build_dynamic_phys_to_machine(void);
24 33
25void xen_setup_timer(int cpu); 34void xen_setup_timer(int cpu);
26void xen_setup_cpu_clockevents(void); 35void xen_setup_cpu_clockevents(void);
27unsigned long xen_cpu_khz(void); 36unsigned long xen_tsc_khz(void);
28void __init xen_time_init(void); 37void __init xen_time_init(void);
29unsigned long xen_get_wallclock(void); 38unsigned long xen_get_wallclock(void);
30int xen_set_wallclock(unsigned long time); 39int xen_set_wallclock(unsigned long time);
@@ -36,23 +45,15 @@ bool xen_vcpu_stolen(int vcpu);
36 45
37void xen_mark_init_mm_pinned(void); 46void xen_mark_init_mm_pinned(void);
38 47
39void __init xen_fill_possible_map(void);
40
41void __init xen_setup_vcpu_info_placement(void); 48void __init xen_setup_vcpu_info_placement(void);
42void xen_smp_prepare_boot_cpu(void);
43void xen_smp_prepare_cpus(unsigned int max_cpus);
44int xen_cpu_up(unsigned int cpu);
45void xen_smp_cpus_done(unsigned int max_cpus);
46 49
47void xen_smp_send_stop(void); 50#ifdef CONFIG_SMP
48void xen_smp_send_reschedule(int cpu); 51void xen_smp_init(void);
49int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
50 int wait);
51int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
52 int nonatomic, int wait);
53 52
54int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), 53extern cpumask_t xen_cpu_initialized_map;
55 void *info, int wait); 54#else
55static inline void xen_smp_init(void) {}
56#endif
56 57
57 58
58/* Declare an asm function, along with symbols needed to make it 59/* Declare an asm function, along with symbols needed to make it
@@ -67,7 +68,11 @@ DECL_ASM(void, xen_irq_disable_direct, void);
67DECL_ASM(unsigned long, xen_save_fl_direct, void); 68DECL_ASM(unsigned long, xen_save_fl_direct, void);
68DECL_ASM(void, xen_restore_fl_direct, unsigned long); 69DECL_ASM(void, xen_restore_fl_direct, unsigned long);
69 70
71/* These are not functions, and cannot be called normally */
70void xen_iret(void); 72void xen_iret(void);
71void xen_sysexit(void); 73void xen_sysexit(void);
74void xen_sysret32(void);
75void xen_sysret64(void);
76void xen_adjust_exception_frame(void);
72 77
73#endif /* XEN_OPS_H */ 78#endif /* XEN_OPS_H */
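The DECL_ASM() comment at the end is cut off by the hunk boundary; its purpose is to declare a raw assembly entry point together with the companion symbols that ENDPATCH() and RELOC() emit in xen-asm_64.S, so the paravirt patcher can copy and relocate the body inline. A hypothetical reconstruction, for illustration only, since the real definition lies outside this hunk:

/* Hypothetical sketch of such a macro; not the kernel's exact DECL_ASM(). */
#define DECL_ASM_SKETCH(ret, name, ...)		\
	ret name(__VA_ARGS__);			\
	extern char name##_end[];		\
	extern char name##_reloc[]

/* name##_end comes from ENDPATCH(name) and name##_reloc from RELOC(name, ...);
   the patching code uses them to measure the body and fix up its branch target. */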