path: root/arch/x86/xen
author		Ingo Molnar <mingo@elte.hu>	2008-07-22 03:06:21 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-07-22 03:06:21 -0400
commit		76c3bb15d6786a0b8da0ad0090e0c9c3672fc08b (patch)
tree		3824e008db9d554229a70c85fbbc13238276bd7a /arch/x86/xen
parent		7be42004065ce4df193aeef5befd26805267d0d9 (diff)
parent		93ded9b8fd42abe2c3607097963d8de6ad9117eb (diff)
Merge branch 'linus' into x86/x2apic
Diffstat (limited to 'arch/x86/xen')
-rw-r--r--	arch/x86/xen/Kconfig                                            |  14
-rw-r--r--	arch/x86/xen/Makefile                                           |   2
-rw-r--r--	arch/x86/xen/enlighten.c                                        | 696
-rw-r--r--	arch/x86/xen/mmu.c                                              | 316
-rw-r--r--	arch/x86/xen/mmu.h                                              |  29
-rw-r--r--	arch/x86/xen/multicalls.c                                       |   1
-rw-r--r--	arch/x86/xen/setup.c                                            |  79
-rw-r--r--	arch/x86/xen/smp.c                                              | 306
-rw-r--r--	arch/x86/xen/suspend.c                                          |   5
-rw-r--r--	arch/x86/xen/xen-asm_32.S (renamed from arch/x86/xen/xen-asm.S) |   0
-rw-r--r--	arch/x86/xen/xen-asm_64.S                                       | 271
-rw-r--r--	arch/x86/xen/xen-head.S                                         |  28
-rw-r--r--	arch/x86/xen/xen-ops.h                                          |  21
13 files changed, 1425 insertions, 343 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index c2cc99580871..3815e425f470 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,8 +6,8 @@ config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 select PARAVIRT_CLOCK 8 select PARAVIRT_CLOCK
9 depends on X86_32 9 depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
10 depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) 10 depends on X86_CMPXCHG && X86_TSC
11 help 11 help
12 This is the Linux Xen port. Enabling this will allow the 12 This is the Linux Xen port. Enabling this will allow the
13 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
@@ -15,10 +15,16 @@ config XEN
15 15
16config XEN_MAX_DOMAIN_MEMORY 16config XEN_MAX_DOMAIN_MEMORY
17 int "Maximum allowed size of a domain in gigabytes" 17 int "Maximum allowed size of a domain in gigabytes"
18 default 8 18 default 8 if X86_32
19 default 32 if X86_64
19 depends on XEN 20 depends on XEN
20 help 21 help
21 The pseudo-physical to machine address array is sized 22 The pseudo-physical to machine address array is sized
22 according to the maximum possible memory size of a Xen 23 according to the maximum possible memory size of a Xen
23 domain. This array uses 1 page per gigabyte, so there's no 24 domain. This array uses 1 page per gigabyte, so there's no
24 need to be too stingy here. \ No newline at end of file 25 need to be too stingy here.
26
27config XEN_SAVE_RESTORE
28 bool
29 depends on PM
30 default y \ No newline at end of file
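
The help text above refers to the pseudo-physical-to-machine (p2m) arrays declared in arch/x86/xen/mmu.c later in this diff. A rough sketch of how the gigabyte limit becomes array bounds follows; the MAX_DOMAIN_PAGES derivation here is an assumption for illustration, while P2M_ENTRIES_PER_PAGE and TOP_ENTRIES appear verbatim in the mmu.c hunk:

	/* Assumed derivation: one unsigned long p2m entry per guest page. */
	#define MAX_DOMAIN_PAGES \
		((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))

	/* As declared in the mmu.c hunk below: */
	#define P2M_ENTRIES_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long))
	#define TOP_ENTRIES		(MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
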
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 2ba2d1649131..59c1e539aed2 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
1obj-y := enlighten.o setup.o multicalls.o mmu.o \ 1obj-y := enlighten.o setup.o multicalls.o mmu.o \
2 time.o xen-asm.o grant-table.o suspend.o 2 time.o xen-asm_$(BITS).o grant-table.o suspend.o
3 3
4obj-$(CONFIG_SMP) += smp.o 4obj-$(CONFIG_SMP) += smp.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index e4d1459a63df..c910345860c3 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -33,6 +33,7 @@
33#include <xen/interface/sched.h> 33#include <xen/interface/sched.h>
34#include <xen/features.h> 34#include <xen/features.h>
35#include <xen/page.h> 35#include <xen/page.h>
36#include <xen/hvc-console.h>
36 37
37#include <asm/paravirt.h> 38#include <asm/paravirt.h>
38#include <asm/apic.h> 39#include <asm/apic.h>
@@ -41,12 +42,12 @@
41#include <asm/xen/hypervisor.h> 42#include <asm/xen/hypervisor.h>
42#include <asm/fixmap.h> 43#include <asm/fixmap.h>
43#include <asm/processor.h> 44#include <asm/processor.h>
45#include <asm/msr-index.h>
44#include <asm/setup.h> 46#include <asm/setup.h>
45#include <asm/desc.h> 47#include <asm/desc.h>
46#include <asm/pgtable.h> 48#include <asm/pgtable.h>
47#include <asm/tlbflush.h> 49#include <asm/tlbflush.h>
48#include <asm/reboot.h> 50#include <asm/reboot.h>
49#include <asm/pgalloc.h>
50 51
51#include "xen-ops.h" 52#include "xen-ops.h"
52#include "mmu.h" 53#include "mmu.h"
@@ -58,6 +59,18 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); 59DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
59 60
60/* 61/*
62 * Identity map, in addition to plain kernel map. This needs to be
63 * large enough to allocate page table pages to allocate the rest.
64 * Each page can map 2MB.
65 */
66static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
67
68#ifdef CONFIG_X86_64
69/* l3 pud for userspace vsyscall mapping */
70static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
71#endif /* CONFIG_X86_64 */
72
73/*
61 * Note about cr3 (pagetable base) values: 74 * Note about cr3 (pagetable base) values:
62 * 75 *
63 * xen_cr3 contains the current logical cr3 value; it contains the 76 * xen_cr3 contains the current logical cr3 value; it contains the
@@ -168,10 +181,14 @@ void xen_vcpu_restore(void)
168 181
169static void __init xen_banner(void) 182static void __init xen_banner(void)
170{ 183{
184 unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
185 struct xen_extraversion extra;
186 HYPERVISOR_xen_version(XENVER_extraversion, &extra);
187
171 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 188 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
172 pv_info.name); 189 pv_info.name);
173 printk(KERN_INFO "Hypervisor signature: %s%s\n", 190 printk(KERN_INFO "Xen version: %d.%d%s%s\n",
174 xen_start_info->magic, 191 version >> 16, version & 0xffff, extra.extraversion,
175 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); 192 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
176} 193}
177 194
@@ -364,14 +381,6 @@ static void load_TLS_descriptor(struct thread_struct *t,
364 381
365static void xen_load_tls(struct thread_struct *t, unsigned int cpu) 382static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
366{ 383{
367 xen_mc_batch();
368
369 load_TLS_descriptor(t, cpu, 0);
370 load_TLS_descriptor(t, cpu, 1);
371 load_TLS_descriptor(t, cpu, 2);
372
373 xen_mc_issue(PARAVIRT_LAZY_CPU);
374
375 /* 384 /*
376 * XXX sleazy hack: If we're being called in a lazy-cpu zone, 385 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
377 * it means we're in a context switch, and %gs has just been 386 * it means we're in a context switch, and %gs has just been
@@ -380,10 +389,39 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
380 * Either way, it has been saved, and the new value will get 389 * Either way, it has been saved, and the new value will get
381 * loaded properly. This will go away as soon as Xen has been 390 * loaded properly. This will go away as soon as Xen has been
382 * modified to not save/restore %gs for normal hypercalls. 391 * modified to not save/restore %gs for normal hypercalls.
392 *
393 * On x86_64, this hack is not used for %gs, because gs points
394 * to KERNEL_GS_BASE (and uses it for PDA references), so we
395 * must not zero %gs on x86_64
396 *
397 * For x86_64, we need to zero %fs, otherwise we may get an
398 * exception between the new %fs descriptor being loaded and
399 * %fs being effectively cleared at __switch_to().
383 */ 400 */
384 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) 401 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
402#ifdef CONFIG_X86_32
385 loadsegment(gs, 0); 403 loadsegment(gs, 0);
404#else
405 loadsegment(fs, 0);
406#endif
407 }
408
409 xen_mc_batch();
410
411 load_TLS_descriptor(t, cpu, 0);
412 load_TLS_descriptor(t, cpu, 1);
413 load_TLS_descriptor(t, cpu, 2);
414
415 xen_mc_issue(PARAVIRT_LAZY_CPU);
416}
417
418#ifdef CONFIG_X86_64
419static void xen_load_gs_index(unsigned int idx)
420{
421 if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
422 BUG();
386} 423}
424#endif
387 425
388static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, 426static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
389 const void *ptr) 427 const void *ptr)
@@ -401,23 +439,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
401 preempt_enable(); 439 preempt_enable();
402} 440}
403 441
404static int cvt_gate_to_trap(int vector, u32 low, u32 high, 442static int cvt_gate_to_trap(int vector, const gate_desc *val,
405 struct trap_info *info) 443 struct trap_info *info)
406{ 444{
407 u8 type, dpl; 445 if (val->type != 0xf && val->type != 0xe)
408
409 type = (high >> 8) & 0x1f;
410 dpl = (high >> 13) & 3;
411
412 if (type != 0xf && type != 0xe)
413 return 0; 446 return 0;
414 447
415 info->vector = vector; 448 info->vector = vector;
416 info->address = (high & 0xffff0000) | (low & 0x0000ffff); 449 info->address = gate_offset(*val);
417 info->cs = low >> 16; 450 info->cs = gate_segment(*val);
418 info->flags = dpl; 451 info->flags = val->dpl;
419 /* interrupt gates clear IF */ 452 /* interrupt gates clear IF */
420 if (type == 0xe) 453 if (val->type == 0xe)
421 info->flags |= 4; 454 info->flags |= 4;
422 455
423 return 1; 456 return 1;
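
The rewritten cvt_gate_to_trap() above works for both 32-bit and 64-bit IDT entries because it reads the gate through the generic gate_desc accessors rather than raw low/high words. For reference, those accessors have roughly this shape (a sketch of asm/desc_defs.h from this era; field names may differ slightly):

	#ifdef CONFIG_X86_64
	# define gate_offset(g)		((g).offset_low | \
					 ((unsigned long)(g).offset_middle << 16) | \
					 ((unsigned long)(g).offset_high << 32))
	# define gate_segment(g)	((g).segment)
	#else
	# define gate_offset(g)		(((g).b & 0xffff0000) | ((g).a & 0x0000ffff))
	# define gate_segment(g)	((g).a >> 16)
	#endif
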
@@ -444,11 +477,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
444 477
445 if (p >= start && (p + 8) <= end) { 478 if (p >= start && (p + 8) <= end) {
446 struct trap_info info[2]; 479 struct trap_info info[2];
447 u32 *desc = (u32 *)g;
448 480
449 info[1].address = 0; 481 info[1].address = 0;
450 482
451 if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0])) 483 if (cvt_gate_to_trap(entrynum, g, &info[0]))
452 if (HYPERVISOR_set_trap_table(info)) 484 if (HYPERVISOR_set_trap_table(info))
453 BUG(); 485 BUG();
454 } 486 }
@@ -461,13 +493,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
461{ 493{
462 unsigned in, out, count; 494 unsigned in, out, count;
463 495
464 count = (desc->size+1) / 8; 496 count = (desc->size+1) / sizeof(gate_desc);
465 BUG_ON(count > 256); 497 BUG_ON(count > 256);
466 498
467 for (in = out = 0; in < count; in++) { 499 for (in = out = 0; in < count; in++) {
468 const u32 *entry = (u32 *)(desc->address + in * 8); 500 gate_desc *entry = (gate_desc*)(desc->address) + in;
469 501
470 if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) 502 if (cvt_gate_to_trap(in, entry, &traps[out]))
471 out++; 503 out++;
472 } 504 }
473 traps[out].address = 0; 505 traps[out].address = 0;
@@ -727,33 +759,89 @@ static void set_current_cr3(void *v)
727 x86_write_percpu(xen_current_cr3, (unsigned long)v); 759 x86_write_percpu(xen_current_cr3, (unsigned long)v);
728} 760}
729 761
730static void xen_write_cr3(unsigned long cr3) 762static void __xen_write_cr3(bool kernel, unsigned long cr3)
731{ 763{
732 struct mmuext_op *op; 764 struct mmuext_op *op;
733 struct multicall_space mcs; 765 struct multicall_space mcs;
734 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); 766 unsigned long mfn;
735 767
736 BUG_ON(preemptible()); 768 if (cr3)
769 mfn = pfn_to_mfn(PFN_DOWN(cr3));
770 else
771 mfn = 0;
737 772
738 mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ 773 WARN_ON(mfn == 0 && kernel);
739 774
740 /* Update while interrupts are disabled, so its atomic with 775 mcs = __xen_mc_entry(sizeof(*op));
741 respect to ipis */
742 x86_write_percpu(xen_cr3, cr3);
743 776
744 op = mcs.args; 777 op = mcs.args;
745 op->cmd = MMUEXT_NEW_BASEPTR; 778 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
746 op->arg1.mfn = mfn; 779 op->arg1.mfn = mfn;
747 780
748 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 781 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
749 782
750 /* Update xen_update_cr3 once the batch has actually 783 if (kernel) {
751 been submitted. */ 784 x86_write_percpu(xen_cr3, cr3);
752 xen_mc_callback(set_current_cr3, (void *)cr3); 785
786 /* Update xen_current_cr3 once the batch has actually
787 been submitted. */
788 xen_mc_callback(set_current_cr3, (void *)cr3);
789 }
790}
791
792static void xen_write_cr3(unsigned long cr3)
793{
794 BUG_ON(preemptible());
795
796 xen_mc_batch(); /* disables interrupts */
797
798 /* Update while interrupts are disabled, so its atomic with
799 respect to ipis */
800 x86_write_percpu(xen_cr3, cr3);
801
802 __xen_write_cr3(true, cr3);
803
804#ifdef CONFIG_X86_64
805 {
806 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
807 if (user_pgd)
808 __xen_write_cr3(false, __pa(user_pgd));
809 else
810 __xen_write_cr3(false, 0);
811 }
812#endif
753 813
754 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 814 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
755} 815}
756 816
817static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
818{
819 int ret;
820
821 ret = 0;
822
823 switch(msr) {
824#ifdef CONFIG_X86_64
825 unsigned which;
826 u64 base;
827
828 case MSR_FS_BASE: which = SEGBASE_FS; goto set;
829 case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
830 case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
831
832 set:
833 base = ((u64)high << 32) | low;
834 if (HYPERVISOR_set_segment_base(which, base) != 0)
835 ret = -EFAULT;
836 break;
837#endif
838 default:
839 ret = native_write_msr_safe(msr, low, high);
840 }
841
842 return ret;
843}
844
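
On x86_64 a Xen PV guest keeps two pagetable roots per address space, so xen_write_cr3() above pushes both a kernel and a user base pointer in one multicall batch. Stripped of the batching, the sequence amounts to roughly the following (illustrative sketch; user_pgd stands for the value returned by xen_get_user_pgd()):

	struct mmuext_op op = {
		.cmd      = MMUEXT_NEW_BASEPTR,			/* kernel root */
		.arg1.mfn = pfn_to_mfn(PFN_DOWN(cr3)),
	};

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
		BUG();

	op.cmd      = MMUEXT_NEW_USER_BASEPTR;			/* user root */
	op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(user_pgd)));

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
		BUG();
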
757/* Early in boot, while setting up the initial pagetable, assume 845/* Early in boot, while setting up the initial pagetable, assume
758 everything is pinned. */ 846 everything is pinned. */
759static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) 847static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
@@ -810,6 +898,48 @@ static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
810 xen_alloc_ptpage(mm, pfn, PT_PMD); 898 xen_alloc_ptpage(mm, pfn, PT_PMD);
811} 899}
812 900
901static int xen_pgd_alloc(struct mm_struct *mm)
902{
903 pgd_t *pgd = mm->pgd;
904 int ret = 0;
905
906 BUG_ON(PagePinned(virt_to_page(pgd)));
907
908#ifdef CONFIG_X86_64
909 {
910 struct page *page = virt_to_page(pgd);
911 pgd_t *user_pgd;
912
913 BUG_ON(page->private != 0);
914
915 ret = -ENOMEM;
916
917 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
918 page->private = (unsigned long)user_pgd;
919
920 if (user_pgd != NULL) {
921 user_pgd[pgd_index(VSYSCALL_START)] =
922 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
923 ret = 0;
924 }
925
926 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
927 }
928#endif
929
930 return ret;
931}
932
933static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
934{
935#ifdef CONFIG_X86_64
936 pgd_t *user_pgd = xen_get_user_pgd(pgd);
937
938 if (user_pgd)
939 free_page((unsigned long)user_pgd);
940#endif
941}
942
813/* This should never happen until we're OK to use struct page */ 943/* This should never happen until we're OK to use struct page */
814static void xen_release_ptpage(u32 pfn, unsigned level) 944static void xen_release_ptpage(u32 pfn, unsigned level)
815{ 945{
@@ -835,6 +965,18 @@ static void xen_release_pmd(u32 pfn)
835 xen_release_ptpage(pfn, PT_PMD); 965 xen_release_ptpage(pfn, PT_PMD);
836} 966}
837 967
968#if PAGETABLE_LEVELS == 4
969static void xen_alloc_pud(struct mm_struct *mm, u32 pfn)
970{
971 xen_alloc_ptpage(mm, pfn, PT_PUD);
972}
973
974static void xen_release_pud(u32 pfn)
975{
976 xen_release_ptpage(pfn, PT_PUD);
977}
978#endif
979
838#ifdef CONFIG_HIGHPTE 980#ifdef CONFIG_HIGHPTE
839static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) 981static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
840{ 982{
@@ -873,68 +1015,16 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
873 1015
874static __init void xen_pagetable_setup_start(pgd_t *base) 1016static __init void xen_pagetable_setup_start(pgd_t *base)
875{ 1017{
876 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
877 int i;
878
879 /* special set_pte for pagetable initialization */
880 pv_mmu_ops.set_pte = xen_set_pte_init;
881
882 init_mm.pgd = base;
883 /*
884 * copy top-level of Xen-supplied pagetable into place. This
885 * is a stand-in while we copy the pmd pages.
886 */
887 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
888
889 /*
890 * For PAE, need to allocate new pmds, rather than
891 * share Xen's, since Xen doesn't like pmd's being
892 * shared between address spaces.
893 */
894 for (i = 0; i < PTRS_PER_PGD; i++) {
895 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
896 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
897
898 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
899 PAGE_SIZE);
900
901 make_lowmem_page_readonly(pmd);
902
903 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
904 } else
905 pgd_clear(&base[i]);
906 }
907
908 /* make sure zero_page is mapped RO so we can use it in pagetables */
909 make_lowmem_page_readonly(empty_zero_page);
910 make_lowmem_page_readonly(base);
911 /*
912 * Switch to new pagetable. This is done before
913 * pagetable_init has done anything so that the new pages
914 * added to the table can be prepared properly for Xen.
915 */
916 xen_write_cr3(__pa(base));
917
918 /* Unpin initial Xen pagetable */
919 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
920 PFN_DOWN(__pa(xen_start_info->pt_base)));
921} 1018}
922 1019
923void xen_setup_shared_info(void) 1020void xen_setup_shared_info(void)
924{ 1021{
925 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 1022 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
926 unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); 1023 set_fixmap(FIX_PARAVIRT_BOOTMAP,
927 1024 xen_start_info->shared_info);
928 /* 1025
929 * Create a mapping for the shared info page. 1026 HYPERVISOR_shared_info =
930 * Should be set_fixmap(), but shared_info is a machine 1027 (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
931 * address with no corresponding pseudo-phys address.
932 */
933 set_pte_mfn(addr,
934 PFN_DOWN(xen_start_info->shared_info),
935 PAGE_KERNEL);
936
937 HYPERVISOR_shared_info = (struct shared_info *)addr;
938 } else 1028 } else
939 HYPERVISOR_shared_info = 1029 HYPERVISOR_shared_info =
940 (struct shared_info *)__va(xen_start_info->shared_info); 1030 (struct shared_info *)__va(xen_start_info->shared_info);
@@ -949,26 +1039,32 @@ void xen_setup_shared_info(void)
949 1039
950static __init void xen_pagetable_setup_done(pgd_t *base) 1040static __init void xen_pagetable_setup_done(pgd_t *base)
951{ 1041{
952 /* This will work as long as patching hasn't happened yet
953 (which it hasn't) */
954 pv_mmu_ops.alloc_pte = xen_alloc_pte;
955 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
956 pv_mmu_ops.release_pte = xen_release_pte;
957 pv_mmu_ops.release_pmd = xen_release_pmd;
958 pv_mmu_ops.set_pte = xen_set_pte;
959
960 xen_setup_shared_info(); 1042 xen_setup_shared_info();
961
962 /* Actually pin the pagetable down, but we can't set PG_pinned
963 yet because the page structures don't exist yet. */
964 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
965} 1043}
966 1044
967static __init void xen_post_allocator_init(void) 1045static __init void xen_post_allocator_init(void)
968{ 1046{
1047 pv_mmu_ops.set_pte = xen_set_pte;
969 pv_mmu_ops.set_pmd = xen_set_pmd; 1048 pv_mmu_ops.set_pmd = xen_set_pmd;
970 pv_mmu_ops.set_pud = xen_set_pud; 1049 pv_mmu_ops.set_pud = xen_set_pud;
1050#if PAGETABLE_LEVELS == 4
1051 pv_mmu_ops.set_pgd = xen_set_pgd;
1052#endif
1053
1054 /* This will work as long as patching hasn't happened yet
1055 (which it hasn't) */
1056 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1057 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1058 pv_mmu_ops.release_pte = xen_release_pte;
1059 pv_mmu_ops.release_pmd = xen_release_pmd;
1060#if PAGETABLE_LEVELS == 4
1061 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1062 pv_mmu_ops.release_pud = xen_release_pud;
1063#endif
971 1064
1065#ifdef CONFIG_X86_64
1066 SetPagePinned(virt_to_page(level3_user_vsyscall));
1067#endif
972 xen_mark_init_mm_pinned(); 1068 xen_mark_init_mm_pinned();
973} 1069}
974 1070
@@ -982,6 +1078,7 @@ void xen_setup_vcpu_info_placement(void)
982 1078
983 /* xen_vcpu_setup managed to place the vcpu_info within the 1079 /* xen_vcpu_setup managed to place the vcpu_info within the
984 percpu area for all cpus, so make use of it */ 1080 percpu area for all cpus, so make use of it */
1081#ifdef CONFIG_X86_32
985 if (have_vcpu_info_placement) { 1082 if (have_vcpu_info_placement) {
986 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 1083 printk(KERN_INFO "Xen: using vcpu_info placement\n");
987 1084
@@ -991,6 +1088,7 @@ void xen_setup_vcpu_info_placement(void)
991 pv_irq_ops.irq_enable = xen_irq_enable_direct; 1088 pv_irq_ops.irq_enable = xen_irq_enable_direct;
992 pv_mmu_ops.read_cr2 = xen_read_cr2_direct; 1089 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
993 } 1090 }
1091#endif
994} 1092}
995 1093
996static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, 1094static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -1011,10 +1109,12 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
1011 goto patch_site 1109 goto patch_site
1012 1110
1013 switch (type) { 1111 switch (type) {
1112#ifdef CONFIG_X86_32
1014 SITE(pv_irq_ops, irq_enable); 1113 SITE(pv_irq_ops, irq_enable);
1015 SITE(pv_irq_ops, irq_disable); 1114 SITE(pv_irq_ops, irq_disable);
1016 SITE(pv_irq_ops, save_fl); 1115 SITE(pv_irq_ops, save_fl);
1017 SITE(pv_irq_ops, restore_fl); 1116 SITE(pv_irq_ops, restore_fl);
1117#endif /* CONFIG_X86_32 */
1018#undef SITE 1118#undef SITE
1019 1119
1020 patch_site: 1120 patch_site:
@@ -1057,8 +1157,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1057#ifdef CONFIG_X86_F00F_BUG 1157#ifdef CONFIG_X86_F00F_BUG
1058 case FIX_F00F_IDT: 1158 case FIX_F00F_IDT:
1059#endif 1159#endif
1160#ifdef CONFIG_X86_32
1060 case FIX_WP_TEST: 1161 case FIX_WP_TEST:
1061 case FIX_VDSO: 1162 case FIX_VDSO:
1163# ifdef CONFIG_HIGHMEM
1164 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1165# endif
1166#else
1167 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1168#endif
1062#ifdef CONFIG_X86_LOCAL_APIC 1169#ifdef CONFIG_X86_LOCAL_APIC
1063 case FIX_APIC_BASE: /* maps dummy local APIC */ 1170 case FIX_APIC_BASE: /* maps dummy local APIC */
1064#endif 1171#endif
@@ -1071,6 +1178,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1071 } 1178 }
1072 1179
1073 __native_set_fixmap(idx, pte); 1180 __native_set_fixmap(idx, pte);
1181
1182#ifdef CONFIG_X86_64
1183 /* Replicate changes to map the vsyscall page into the user
1184 pagetable vsyscall mapping. */
1185 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1186 unsigned long vaddr = __fix_to_virt(idx);
1187 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1188 }
1189#endif
1074} 1190}
1075 1191
1076static const struct pv_info xen_info __initdata = { 1192static const struct pv_info xen_info __initdata = {
@@ -1116,18 +1232,25 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1116 .wbinvd = native_wbinvd, 1232 .wbinvd = native_wbinvd,
1117 1233
1118 .read_msr = native_read_msr_safe, 1234 .read_msr = native_read_msr_safe,
1119 .write_msr = native_write_msr_safe, 1235 .write_msr = xen_write_msr_safe,
1120 .read_tsc = native_read_tsc, 1236 .read_tsc = native_read_tsc,
1121 .read_pmc = native_read_pmc, 1237 .read_pmc = native_read_pmc,
1122 1238
1123 .iret = xen_iret, 1239 .iret = xen_iret,
1124 .irq_enable_sysexit = xen_sysexit, 1240 .irq_enable_sysexit = xen_sysexit,
1241#ifdef CONFIG_X86_64
1242 .usergs_sysret32 = xen_sysret32,
1243 .usergs_sysret64 = xen_sysret64,
1244#endif
1125 1245
1126 .load_tr_desc = paravirt_nop, 1246 .load_tr_desc = paravirt_nop,
1127 .set_ldt = xen_set_ldt, 1247 .set_ldt = xen_set_ldt,
1128 .load_gdt = xen_load_gdt, 1248 .load_gdt = xen_load_gdt,
1129 .load_idt = xen_load_idt, 1249 .load_idt = xen_load_idt,
1130 .load_tls = xen_load_tls, 1250 .load_tls = xen_load_tls,
1251#ifdef CONFIG_X86_64
1252 .load_gs_index = xen_load_gs_index,
1253#endif
1131 1254
1132 .store_gdt = native_store_gdt, 1255 .store_gdt = native_store_gdt,
1133 .store_idt = native_store_idt, 1256 .store_idt = native_store_idt,
@@ -1141,14 +1264,34 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1141 .set_iopl_mask = xen_set_iopl_mask, 1264 .set_iopl_mask = xen_set_iopl_mask,
1142 .io_delay = xen_io_delay, 1265 .io_delay = xen_io_delay,
1143 1266
1267 /* Xen takes care of %gs when switching to usermode for us */
1268 .swapgs = paravirt_nop,
1269
1144 .lazy_mode = { 1270 .lazy_mode = {
1145 .enter = paravirt_enter_lazy_cpu, 1271 .enter = paravirt_enter_lazy_cpu,
1146 .leave = xen_leave_lazy, 1272 .leave = xen_leave_lazy,
1147 }, 1273 },
1148}; 1274};
1149 1275
1276static void __init __xen_init_IRQ(void)
1277{
1278#ifdef CONFIG_X86_64
1279 int i;
1280
1281 /* Create identity vector->irq map */
1282 for(i = 0; i < NR_VECTORS; i++) {
1283 int cpu;
1284
1285 for_each_possible_cpu(cpu)
1286 per_cpu(vector_irq, cpu)[i] = i;
1287 }
1288#endif /* CONFIG_X86_64 */
1289
1290 xen_init_IRQ();
1291}
1292
1150static const struct pv_irq_ops xen_irq_ops __initdata = { 1293static const struct pv_irq_ops xen_irq_ops __initdata = {
1151 .init_IRQ = xen_init_IRQ, 1294 .init_IRQ = __xen_init_IRQ,
1152 .save_fl = xen_save_fl, 1295 .save_fl = xen_save_fl,
1153 .restore_fl = xen_restore_fl, 1296 .restore_fl = xen_restore_fl,
1154 .irq_disable = xen_irq_disable, 1297 .irq_disable = xen_irq_disable,
@@ -1156,7 +1299,7 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
1156 .safe_halt = xen_safe_halt, 1299 .safe_halt = xen_safe_halt,
1157 .halt = xen_halt, 1300 .halt = xen_halt,
1158#ifdef CONFIG_X86_64 1301#ifdef CONFIG_X86_64
1159 .adjust_exception_frame = paravirt_nop, 1302 .adjust_exception_frame = xen_adjust_exception_frame,
1160#endif 1303#endif
1161}; 1304};
1162 1305
@@ -1186,8 +1329,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1186 .pte_update = paravirt_nop, 1329 .pte_update = paravirt_nop,
1187 .pte_update_defer = paravirt_nop, 1330 .pte_update_defer = paravirt_nop,
1188 1331
1189 .pgd_alloc = __paravirt_pgd_alloc, 1332 .pgd_alloc = xen_pgd_alloc,
1190 .pgd_free = paravirt_nop, 1333 .pgd_free = xen_pgd_free,
1191 1334
1192 .alloc_pte = xen_alloc_pte_init, 1335 .alloc_pte = xen_alloc_pte_init,
1193 .release_pte = xen_release_pte_init, 1336 .release_pte = xen_release_pte_init,
@@ -1199,7 +1342,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1199 .kmap_atomic_pte = xen_kmap_atomic_pte, 1342 .kmap_atomic_pte = xen_kmap_atomic_pte,
1200#endif 1343#endif
1201 1344
1202 .set_pte = NULL, /* see xen_pagetable_setup_* */ 1345#ifdef CONFIG_X86_64
1346 .set_pte = xen_set_pte,
1347#else
1348 .set_pte = xen_set_pte_init,
1349#endif
1203 .set_pte_at = xen_set_pte_at, 1350 .set_pte_at = xen_set_pte_at,
1204 .set_pmd = xen_set_pmd_hyper, 1351 .set_pmd = xen_set_pmd_hyper,
1205 1352
@@ -1213,15 +1360,26 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1213 .make_pte = xen_make_pte, 1360 .make_pte = xen_make_pte,
1214 .make_pgd = xen_make_pgd, 1361 .make_pgd = xen_make_pgd,
1215 1362
1363#ifdef CONFIG_X86_PAE
1216 .set_pte_atomic = xen_set_pte_atomic, 1364 .set_pte_atomic = xen_set_pte_atomic,
1217 .set_pte_present = xen_set_pte_at, 1365 .set_pte_present = xen_set_pte_at,
1218 .set_pud = xen_set_pud_hyper,
1219 .pte_clear = xen_pte_clear, 1366 .pte_clear = xen_pte_clear,
1220 .pmd_clear = xen_pmd_clear, 1367 .pmd_clear = xen_pmd_clear,
1368#endif /* CONFIG_X86_PAE */
1369 .set_pud = xen_set_pud_hyper,
1221 1370
1222 .make_pmd = xen_make_pmd, 1371 .make_pmd = xen_make_pmd,
1223 .pmd_val = xen_pmd_val, 1372 .pmd_val = xen_pmd_val,
1224 1373
1374#if PAGETABLE_LEVELS == 4
1375 .pud_val = xen_pud_val,
1376 .make_pud = xen_make_pud,
1377 .set_pgd = xen_set_pgd_hyper,
1378
1379 .alloc_pud = xen_alloc_pte_init,
1380 .release_pud = xen_release_pte_init,
1381#endif /* PAGETABLE_LEVELS == 4 */
1382
1225 .activate_mm = xen_activate_mm, 1383 .activate_mm = xen_activate_mm,
1226 .dup_mmap = xen_dup_mmap, 1384 .dup_mmap = xen_dup_mmap,
1227 .exit_mmap = xen_exit_mmap, 1385 .exit_mmap = xen_exit_mmap,
@@ -1234,21 +1392,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1234 .set_fixmap = xen_set_fixmap, 1392 .set_fixmap = xen_set_fixmap,
1235}; 1393};
1236 1394
1237#ifdef CONFIG_SMP
1238static const struct smp_ops xen_smp_ops __initdata = {
1239 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
1240 .smp_prepare_cpus = xen_smp_prepare_cpus,
1241 .cpu_up = xen_cpu_up,
1242 .smp_cpus_done = xen_smp_cpus_done,
1243
1244 .smp_send_stop = xen_smp_send_stop,
1245 .smp_send_reschedule = xen_smp_send_reschedule,
1246
1247 .send_call_func_ipi = xen_smp_send_call_function_ipi,
1248 .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
1249};
1250#endif /* CONFIG_SMP */
1251
1252static void xen_reboot(int reason) 1395static void xen_reboot(int reason)
1253{ 1396{
1254 struct sched_shutdown r = { .reason = reason }; 1397 struct sched_shutdown r = { .reason = reason };
@@ -1293,6 +1436,7 @@ static const struct machine_ops __initdata xen_machine_ops = {
1293 1436
1294static void __init xen_reserve_top(void) 1437static void __init xen_reserve_top(void)
1295{ 1438{
1439#ifdef CONFIG_X86_32
1296 unsigned long top = HYPERVISOR_VIRT_START; 1440 unsigned long top = HYPERVISOR_VIRT_START;
1297 struct xen_platform_parameters pp; 1441 struct xen_platform_parameters pp;
1298 1442
@@ -1300,8 +1444,248 @@ static void __init xen_reserve_top(void)
1300 top = pp.virt_start; 1444 top = pp.virt_start;
1301 1445
1302 reserve_top_address(-top + 2 * PAGE_SIZE); 1446 reserve_top_address(-top + 2 * PAGE_SIZE);
1447#endif /* CONFIG_X86_32 */
1448}
1449
1450/*
1451 * Like __va(), but returns address in the kernel mapping (which is
1452 * all we have until the physical memory mapping has been set up.
1453 */
1454static void *__ka(phys_addr_t paddr)
1455{
1456#ifdef CONFIG_X86_64
1457 return (void *)(paddr + __START_KERNEL_map);
1458#else
1459 return __va(paddr);
1460#endif
1303} 1461}
1304 1462
1463/* Convert a machine address to physical address */
1464static unsigned long m2p(phys_addr_t maddr)
1465{
1466 phys_addr_t paddr;
1467
1468 maddr &= PTE_MASK;
1469 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1470
1471 return paddr;
1472}
1473
1474/* Convert a machine address to kernel virtual */
1475static void *m2v(phys_addr_t maddr)
1476{
1477 return __ka(m2p(maddr));
1478}
1479
1480#ifdef CONFIG_X86_64
1481static void walk(pgd_t *pgd, unsigned long addr)
1482{
1483 unsigned l4idx = pgd_index(addr);
1484 unsigned l3idx = pud_index(addr);
1485 unsigned l2idx = pmd_index(addr);
1486 unsigned l1idx = pte_index(addr);
1487 pgd_t l4;
1488 pud_t l3;
1489 pmd_t l2;
1490 pte_t l1;
1491
1492 xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
1493 pgd, addr, l4idx, l3idx, l2idx, l1idx);
1494
1495 l4 = pgd[l4idx];
1496 xen_raw_printk(" l4: %016lx\n", l4.pgd);
1497 xen_raw_printk(" %016lx\n", pgd_val(l4));
1498
1499 l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
1500 xen_raw_printk(" l3: %016lx\n", l3.pud);
1501 xen_raw_printk(" %016lx\n", pud_val(l3));
1502
1503 l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
1504 xen_raw_printk(" l2: %016lx\n", l2.pmd);
1505 xen_raw_printk(" %016lx\n", pmd_val(l2));
1506
1507 l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
1508 xen_raw_printk(" l1: %016lx\n", l1.pte);
1509 xen_raw_printk(" %016lx\n", pte_val(l1));
1510}
1511#endif
1512
1513static void set_page_prot(void *addr, pgprot_t prot)
1514{
1515 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1516 pte_t pte = pfn_pte(pfn, prot);
1517
1518 xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
1519 addr, pfn, get_phys_to_machine(pfn),
1520 pgprot_val(prot), pte.pte);
1521
1522 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1523 BUG();
1524}
1525
1526static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1527{
1528 unsigned pmdidx, pteidx;
1529 unsigned ident_pte;
1530 unsigned long pfn;
1531
1532 ident_pte = 0;
1533 pfn = 0;
1534 for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1535 pte_t *pte_page;
1536
1537 /* Reuse or allocate a page of ptes */
1538 if (pmd_present(pmd[pmdidx]))
1539 pte_page = m2v(pmd[pmdidx].pmd);
1540 else {
1541 /* Check for free pte pages */
1542 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1543 break;
1544
1545 pte_page = &level1_ident_pgt[ident_pte];
1546 ident_pte += PTRS_PER_PTE;
1547
1548 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1549 }
1550
1551 /* Install mappings */
1552 for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1553 pte_t pte;
1554
1555 if (pfn > max_pfn_mapped)
1556 max_pfn_mapped = pfn;
1557
1558 if (!pte_none(pte_page[pteidx]))
1559 continue;
1560
1561 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1562 pte_page[pteidx] = pte;
1563 }
1564 }
1565
1566 for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1567 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1568
1569 set_page_prot(pmd, PAGE_KERNEL_RO);
1570}
1571
1572#ifdef CONFIG_X86_64
1573static void convert_pfn_mfn(void *v)
1574{
1575 pte_t *pte = v;
1576 int i;
1577
1578 /* All levels are converted the same way, so just treat them
1579 as ptes. */
1580 for(i = 0; i < PTRS_PER_PTE; i++)
1581 pte[i] = xen_make_pte(pte[i].pte);
1582}
1583
1584/*
 1585 * Set up the initial kernel pagetable.
1586 *
1587 * We can construct this by grafting the Xen provided pagetable into
1588 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1589 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1590 * means that only the kernel has a physical mapping to start with -
1591 * but that's enough to get __va working. We need to fill in the rest
1592 * of the physical mapping once some sort of allocator has been set
1593 * up.
1594 */
1595static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1596{
1597 pud_t *l3;
1598 pmd_t *l2;
1599
1600 /* Zap identity mapping */
1601 init_level4_pgt[0] = __pgd(0);
1602
1603 /* Pre-constructed entries are in pfn, so convert to mfn */
1604 convert_pfn_mfn(init_level4_pgt);
1605 convert_pfn_mfn(level3_ident_pgt);
1606 convert_pfn_mfn(level3_kernel_pgt);
1607
1608 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1609 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1610
1611 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1612 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1613
1614 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1615 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1616 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1617
1618 /* Set up identity map */
1619 xen_map_identity_early(level2_ident_pgt, max_pfn);
1620
1621 /* Make pagetable pieces RO */
1622 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1623 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1624 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1625 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1626 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1627 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1628
1629 /* Pin down new L4 */
1630 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1631 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1632
1633 /* Unpin Xen-provided one */
1634 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1635
1636 /* Switch over */
1637 pgd = init_level4_pgt;
1638
1639 /*
1640 * At this stage there can be no user pgd, and no page
1641 * structure to attach it to, so make sure we just set kernel
1642 * pgd.
1643 */
1644 xen_mc_batch();
1645 __xen_write_cr3(true, __pa(pgd));
1646 xen_mc_issue(PARAVIRT_LAZY_CPU);
1647
1648 reserve_early(__pa(xen_start_info->pt_base),
1649 __pa(xen_start_info->pt_base +
1650 xen_start_info->nr_pt_frames * PAGE_SIZE),
1651 "XEN PAGETABLES");
1652
1653 return pgd;
1654}
1655#else /* !CONFIG_X86_64 */
1656static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1657
1658static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1659{
1660 pmd_t *kernel_pmd;
1661
1662 init_pg_tables_start = __pa(pgd);
1663 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1664 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1665
1666 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1667 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1668
1669 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1670
1671 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1672 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1673 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1674
1675 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1676 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1677 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1678
1679 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1680
1681 xen_write_cr3(__pa(swapper_pg_dir));
1682
1683 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1684
1685 return swapper_pg_dir;
1686}
1687#endif /* CONFIG_X86_64 */
1688
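
The pagetable bring-up above pins and unpins tables via pin_pagetable_pfn(), which this hunk does not touch. For context, that helper is essentially a one-op wrapper around the mmuext hypercall, roughly (a sketch; the real definition lives elsewhere in enlighten.c):

	static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
	{
		struct mmuext_op op;

		op.cmd = cmd;			/* e.g. MMUEXT_PIN_L4_TABLE */
		op.arg1.mfn = pfn_to_mfn(pfn);
		if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
			BUG();
	}
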
1305/* First C function to be called on Xen boot */ 1689/* First C function to be called on Xen boot */
1306asmlinkage void __init xen_start_kernel(void) 1690asmlinkage void __init xen_start_kernel(void)
1307{ 1691{
@@ -1337,53 +1721,56 @@ asmlinkage void __init xen_start_kernel(void)
1337 1721
1338 machine_ops = xen_machine_ops; 1722 machine_ops = xen_machine_ops;
1339 1723
1340#ifdef CONFIG_SMP 1724#ifdef CONFIG_X86_64
1341 smp_ops = xen_smp_ops; 1725 /* Disable until direct per-cpu data access. */
1726 have_vcpu_info_placement = 0;
1727 x86_64_init_pda();
1342#endif 1728#endif
1343 1729
1730 xen_smp_init();
1731
1344 /* Get mfn list */ 1732 /* Get mfn list */
1345 if (!xen_feature(XENFEAT_auto_translated_physmap)) 1733 if (!xen_feature(XENFEAT_auto_translated_physmap))
1346 xen_build_dynamic_phys_to_machine(); 1734 xen_build_dynamic_phys_to_machine();
1347 1735
1348 pgd = (pgd_t *)xen_start_info->pt_base; 1736 pgd = (pgd_t *)xen_start_info->pt_base;
1349 1737
1350 init_pg_tables_start = __pa(pgd); 1738 /* Prevent unwanted bits from being set in PTEs. */
1351 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; 1739 __supported_pte_mask &= ~_PAGE_GLOBAL;
1352 max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT; 1740 if (!is_initial_xendomain())
1353 1741 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1354 init_mm.pgd = pgd; /* use the Xen pagetables to start */
1355
1356 /* keep using Xen gdt for now; no urgent need to change it */
1357
1358 x86_write_percpu(xen_cr3, __pa(pgd));
1359 x86_write_percpu(xen_current_cr3, __pa(pgd));
1360 1742
1361 /* Don't do the full vcpu_info placement stuff until we have a 1743 /* Don't do the full vcpu_info placement stuff until we have a
1362 possible map and a non-dummy shared_info. */ 1744 possible map and a non-dummy shared_info. */
1363 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1745 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1364 1746
1747 xen_raw_console_write("mapping kernel into physical memory\n");
1748 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
1749
1750 init_mm.pgd = pgd;
1751
1752 /* keep using Xen gdt for now; no urgent need to change it */
1753
1365 pv_info.kernel_rpl = 1; 1754 pv_info.kernel_rpl = 1;
1366 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1755 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1367 pv_info.kernel_rpl = 0; 1756 pv_info.kernel_rpl = 0;
1368 1757
1369 /* Prevent unwanted bits from being set in PTEs. */
1370 __supported_pte_mask &= ~_PAGE_GLOBAL;
1371 if (!is_initial_xendomain())
1372 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1373
1374 /* set the limit of our address space */ 1758 /* set the limit of our address space */
1375 xen_reserve_top(); 1759 xen_reserve_top();
1376 1760
1761#ifdef CONFIG_X86_32
1377 /* set up basic CPUID stuff */ 1762 /* set up basic CPUID stuff */
1378 cpu_detect(&new_cpu_data); 1763 cpu_detect(&new_cpu_data);
1379 new_cpu_data.hard_math = 1; 1764 new_cpu_data.hard_math = 1;
1380 new_cpu_data.x86_capability[0] = cpuid_edx(1); 1765 new_cpu_data.x86_capability[0] = cpuid_edx(1);
1766#endif
1381 1767
1382 /* Poke various useful things into boot_params */ 1768 /* Poke various useful things into boot_params */
1383 boot_params.hdr.type_of_loader = (9 << 4) | 0; 1769 boot_params.hdr.type_of_loader = (9 << 4) | 0;
1384 boot_params.hdr.ramdisk_image = xen_start_info->mod_start 1770 boot_params.hdr.ramdisk_image = xen_start_info->mod_start
1385 ? __pa(xen_start_info->mod_start) : 0; 1771 ? __pa(xen_start_info->mod_start) : 0;
1386 boot_params.hdr.ramdisk_size = xen_start_info->mod_len; 1772 boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1773 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1387 1774
1388 if (!is_initial_xendomain()) { 1775 if (!is_initial_xendomain()) {
1389 add_preferred_console("xenboot", 0, NULL); 1776 add_preferred_console("xenboot", 0, NULL);
@@ -1391,6 +1778,21 @@ asmlinkage void __init xen_start_kernel(void)
1391 add_preferred_console("hvc", 0, NULL); 1778 add_preferred_console("hvc", 0, NULL);
1392 } 1779 }
1393 1780
1781 xen_raw_console_write("about to get started...\n");
1782
1783#if 0
1784 xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
1785 &boot_params, __pa_symbol(&boot_params),
1786 __va(__pa_symbol(&boot_params)));
1787
1788 walk(pgd, &boot_params);
1789 walk(pgd, __va(__pa(&boot_params)));
1790#endif
1791
1394 /* Start the world */ 1792 /* Start the world */
1793#ifdef CONFIG_X86_32
1395 i386_start_kernel(); 1794 i386_start_kernel();
1795#else
1796 x86_64_start_reservations((char *)__pa_symbol(&boot_params));
1797#endif
1396} 1798}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index ff0aa74afaa1..a44d56e38bd1 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -44,8 +44,10 @@
44 44
45#include <asm/pgtable.h> 45#include <asm/pgtable.h>
46#include <asm/tlbflush.h> 46#include <asm/tlbflush.h>
47#include <asm/fixmap.h>
47#include <asm/mmu_context.h> 48#include <asm/mmu_context.h>
48#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/linkage.h>
49 51
50#include <asm/xen/hypercall.h> 52#include <asm/xen/hypercall.h>
51#include <asm/xen/hypervisor.h> 53#include <asm/xen/hypervisor.h>
@@ -56,26 +58,29 @@
56#include "multicalls.h" 58#include "multicalls.h"
57#include "mmu.h" 59#include "mmu.h"
58 60
61/*
62 * Just beyond the highest usermode address. STACK_TOP_MAX has a
63 * redzone above it, so round it up to a PGD boundary.
64 */
65#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
66
67
59#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) 68#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
60#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) 69#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
61 70
62/* Placeholder for holes in the address space */ 71/* Placeholder for holes in the address space */
63static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] 72static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
64 __attribute__((section(".data.page_aligned"))) =
65 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; 73 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
66 74
67 /* Array of pointers to pages containing p2m entries */ 75 /* Array of pointers to pages containing p2m entries */
68static unsigned long *p2m_top[TOP_ENTRIES] 76static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
69 __attribute__((section(".data.page_aligned"))) =
70 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; 77 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
71 78
72/* Arrays of p2m arrays expressed in mfns used for save/restore */ 79/* Arrays of p2m arrays expressed in mfns used for save/restore */
73static unsigned long p2m_top_mfn[TOP_ENTRIES] 80static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
74 __attribute__((section(".bss.page_aligned")));
75 81
76static unsigned long p2m_top_mfn_list[ 82static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
77 PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)] 83 __page_aligned_bss;
78 __attribute__((section(".bss.page_aligned")));
79 84
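
The arrays above form a two-level pfn-to-mfn (p2m) table: p2m_top holds one pointer per P2M_ENTRIES_PER_PAGE-sized chunk of pfns, with p2m_missing standing in for unpopulated chunks. A lookup therefore reduces to two array indexes, roughly (a sketch; get_phys_to_machine() elsewhere in this file has this shape):

	static inline unsigned p2m_index(unsigned long pfn)
	{
		return pfn % P2M_ENTRIES_PER_PAGE;
	}

	unsigned long get_phys_to_machine(unsigned long pfn)
	{
		if (unlikely(pfn >= MAX_DOMAIN_PAGES))
			return INVALID_P2M_ENTRY;	/* beyond the configured limit */

		return p2m_top[p2m_top_index(pfn)][p2m_index(pfn)];
	}
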
80static inline unsigned p2m_top_index(unsigned long pfn) 85static inline unsigned p2m_top_index(unsigned long pfn)
81{ 86{
@@ -181,15 +186,16 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
181 p2m_top[topidx][idx] = mfn; 186 p2m_top[topidx][idx] = mfn;
182} 187}
183 188
184xmaddr_t arbitrary_virt_to_machine(unsigned long address) 189xmaddr_t arbitrary_virt_to_machine(void *vaddr)
185{ 190{
191 unsigned long address = (unsigned long)vaddr;
186 unsigned int level; 192 unsigned int level;
187 pte_t *pte = lookup_address(address, &level); 193 pte_t *pte = lookup_address(address, &level);
188 unsigned offset = address & ~PAGE_MASK; 194 unsigned offset = address & ~PAGE_MASK;
189 195
190 BUG_ON(pte == NULL); 196 BUG_ON(pte == NULL);
191 197
192 return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); 198 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
193} 199}
194 200
195void make_lowmem_page_readonly(void *vaddr) 201void make_lowmem_page_readonly(void *vaddr)
@@ -256,7 +262,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
256 262
257 xen_mc_batch(); 263 xen_mc_batch();
258 264
259 u.ptr = virt_to_machine(ptr).maddr; 265 /* ptr may be ioremapped for 64-bit pagetable setup */
266 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
260 u.val = pmd_val_ma(val); 267 u.val = pmd_val_ma(val);
261 extend_mmu_update(&u); 268 extend_mmu_update(&u);
262 269
@@ -283,35 +290,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
283 */ 290 */
284void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) 291void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
285{ 292{
286 pgd_t *pgd; 293 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
287 pud_t *pud;
288 pmd_t *pmd;
289 pte_t *pte;
290
291 pgd = swapper_pg_dir + pgd_index(vaddr);
292 if (pgd_none(*pgd)) {
293 BUG();
294 return;
295 }
296 pud = pud_offset(pgd, vaddr);
297 if (pud_none(*pud)) {
298 BUG();
299 return;
300 }
301 pmd = pmd_offset(pud, vaddr);
302 if (pmd_none(*pmd)) {
303 BUG();
304 return;
305 }
306 pte = pte_offset_kernel(pmd, vaddr);
307 /* <mfn,flags> stored as-is, to permit clearing entries */
308 xen_set_pte(pte, mfn_pte(mfn, flags));
309
310 /*
311 * It's enough to flush this one mapping.
312 * (PGE mappings get flushed as well)
313 */
314 __flush_tlb_one(vaddr);
315} 294}
316 295
317void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 296void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -418,7 +397,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
418 397
419 xen_mc_batch(); 398 xen_mc_batch();
420 399
421 u.ptr = virt_to_machine(ptr).maddr; 400 /* ptr may be ioremapped for 64-bit pagetable setup */
401 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
422 u.val = pud_val_ma(val); 402 u.val = pud_val_ma(val);
423 extend_mmu_update(&u); 403 extend_mmu_update(&u);
424 404
@@ -441,14 +421,19 @@ void xen_set_pud(pud_t *ptr, pud_t val)
441 421
442void xen_set_pte(pte_t *ptep, pte_t pte) 422void xen_set_pte(pte_t *ptep, pte_t pte)
443{ 423{
424#ifdef CONFIG_X86_PAE
444 ptep->pte_high = pte.pte_high; 425 ptep->pte_high = pte.pte_high;
445 smp_wmb(); 426 smp_wmb();
446 ptep->pte_low = pte.pte_low; 427 ptep->pte_low = pte.pte_low;
428#else
429 *ptep = pte;
430#endif
447} 431}
448 432
433#ifdef CONFIG_X86_PAE
449void xen_set_pte_atomic(pte_t *ptep, pte_t pte) 434void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
450{ 435{
451 set_64bit((u64 *)ptep, pte_val_ma(pte)); 436 set_64bit((u64 *)ptep, native_pte_val(pte));
452} 437}
453 438
454void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 439void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -462,6 +447,7 @@ void xen_pmd_clear(pmd_t *pmdp)
462{ 447{
463 set_pmd(pmdp, __pmd(0)); 448 set_pmd(pmdp, __pmd(0));
464} 449}
450#endif /* CONFIG_X86_PAE */
465 451
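
The PAE branch of xen_set_pte() above must split the 64-bit entry into two 32-bit stores; writing the high word first and the low word (which carries the present bit) last, separated by smp_wmb(), keeps a concurrent walker from ever seeing a present entry with a stale high half. The native PAE helper uses the same ordering, roughly (a sketch based on asm/pgtable-3level.h):

	static inline void native_set_pte(pte_t *ptep, pte_t pte)
	{
		ptep->pte_high = pte.pte_high;	/* new high half first */
		smp_wmb();			/* order the two halves */
		ptep->pte_low = pte.pte_low;	/* present bit lands last */
	}
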
466pmd_t xen_make_pmd(pmdval_t pmd) 452pmd_t xen_make_pmd(pmdval_t pmd)
467{ 453{
@@ -469,78 +455,189 @@ pmd_t xen_make_pmd(pmdval_t pmd)
469 return native_make_pmd(pmd); 455 return native_make_pmd(pmd);
470} 456}
471 457
458#if PAGETABLE_LEVELS == 4
459pudval_t xen_pud_val(pud_t pud)
460{
461 return pte_mfn_to_pfn(pud.pud);
462}
463
464pud_t xen_make_pud(pudval_t pud)
465{
466 pud = pte_pfn_to_mfn(pud);
467
468 return native_make_pud(pud);
469}
470
471pgd_t *xen_get_user_pgd(pgd_t *pgd)
472{
473 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
474 unsigned offset = pgd - pgd_page;
475 pgd_t *user_ptr = NULL;
476
477 if (offset < pgd_index(USER_LIMIT)) {
478 struct page *page = virt_to_page(pgd_page);
479 user_ptr = (pgd_t *)page->private;
480 if (user_ptr)
481 user_ptr += offset;
482 }
483
484 return user_ptr;
485}
486
487static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
488{
489 struct mmu_update u;
490
491 u.ptr = virt_to_machine(ptr).maddr;
492 u.val = pgd_val_ma(val);
493 extend_mmu_update(&u);
494}
495
496/*
497 * Raw hypercall-based set_pgd, intended for in early boot before
498 * there's a page structure. This implies:
499 * 1. The only existing pagetable is the kernel's
500 * 2. It is always pinned
501 * 3. It has no user pagetable attached to it
502 */
503void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
504{
505 preempt_disable();
506
507 xen_mc_batch();
508
509 __xen_set_pgd_hyper(ptr, val);
510
511 xen_mc_issue(PARAVIRT_LAZY_MMU);
512
513 preempt_enable();
514}
515
516void xen_set_pgd(pgd_t *ptr, pgd_t val)
517{
518 pgd_t *user_ptr = xen_get_user_pgd(ptr);
519
520 /* If page is not pinned, we can just update the entry
521 directly */
522 if (!page_pinned(ptr)) {
523 *ptr = val;
524 if (user_ptr) {
525 WARN_ON(page_pinned(user_ptr));
526 *user_ptr = val;
527 }
528 return;
529 }
530
531 /* If it's pinned, then we can at least batch the kernel and
532 user updates together. */
533 xen_mc_batch();
534
535 __xen_set_pgd_hyper(ptr, val);
536 if (user_ptr)
537 __xen_set_pgd_hyper(user_ptr, val);
538
539 xen_mc_issue(PARAVIRT_LAZY_MMU);
540}
541#endif /* PAGETABLE_LEVELS == 4 */
542
472/* 543/*
473 (Yet another) pagetable walker. This one is intended for pinning a 544 * (Yet another) pagetable walker. This one is intended for pinning a
474 pagetable. This means that it walks a pagetable and calls the 545 * pagetable. This means that it walks a pagetable and calls the
475 callback function on each page it finds making up the page table, 546 * callback function on each page it finds making up the page table,
476 at every level. It walks the entire pagetable, but it only bothers 547 * at every level. It walks the entire pagetable, but it only bothers
477 pinning pte pages which are below pte_limit. In the normal case 548 * pinning pte pages which are below limit. In the normal case this
478 this will be TASK_SIZE, but at boot we need to pin up to 549 * will be STACK_TOP_MAX, but at boot we need to pin up to
479 FIXADDR_TOP. But the important bit is that we don't pin beyond 550 * FIXADDR_TOP.
480 there, because then we start getting into Xen's ptes. 551 *
481*/ 552 * For 32-bit the important bit is that we don't pin beyond there,
482static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), 553 * because then we start getting into Xen's ptes.
554 *
555 * For 64-bit, we must skip the Xen hole in the middle of the address
556 * space, just after the big x86-64 virtual hole.
557 */
558static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
483 unsigned long limit) 559 unsigned long limit)
484{ 560{
485 pgd_t *pgd = pgd_base;
486 int flush = 0; 561 int flush = 0;
487 unsigned long addr = 0; 562 unsigned hole_low, hole_high;
488 unsigned long pgd_next; 563 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
564 unsigned pgdidx, pudidx, pmdidx;
489 565
490 BUG_ON(limit > FIXADDR_TOP); 566 /* The limit is the last byte to be touched */
567 limit--;
568 BUG_ON(limit >= FIXADDR_TOP);
491 569
492 if (xen_feature(XENFEAT_auto_translated_physmap)) 570 if (xen_feature(XENFEAT_auto_translated_physmap))
493 return 0; 571 return 0;
494 572
495 for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { 573 /*
574 * 64-bit has a great big hole in the middle of the address
575 * space, which contains the Xen mappings. On 32-bit these
576 * will end up making a zero-sized hole and so is a no-op.
577 */
578 hole_low = pgd_index(USER_LIMIT);
579 hole_high = pgd_index(PAGE_OFFSET);
580
581 pgdidx_limit = pgd_index(limit);
582#if PTRS_PER_PUD > 1
583 pudidx_limit = pud_index(limit);
584#else
585 pudidx_limit = 0;
586#endif
587#if PTRS_PER_PMD > 1
588 pmdidx_limit = pmd_index(limit);
589#else
590 pmdidx_limit = 0;
591#endif
592
593 flush |= (*func)(virt_to_page(pgd), PT_PGD);
594
595 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
496 pud_t *pud; 596 pud_t *pud;
497 unsigned long pud_limit, pud_next;
498 597
499 pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); 598 if (pgdidx >= hole_low && pgdidx < hole_high)
599 continue;
500 600
501 if (!pgd_val(*pgd)) 601 if (!pgd_val(pgd[pgdidx]))
502 continue; 602 continue;
503 603
504 pud = pud_offset(pgd, 0); 604 pud = pud_offset(&pgd[pgdidx], 0);
505 605
506 if (PTRS_PER_PUD > 1) /* not folded */ 606 if (PTRS_PER_PUD > 1) /* not folded */
507 flush |= (*func)(virt_to_page(pud), PT_PUD); 607 flush |= (*func)(virt_to_page(pud), PT_PUD);
508 608
509 for (; addr != pud_limit; pud++, addr = pud_next) { 609 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
510 pmd_t *pmd; 610 pmd_t *pmd;
511 unsigned long pmd_limit;
512 611
513 pud_next = pud_addr_end(addr, pud_limit); 612 if (pgdidx == pgdidx_limit &&
514 613 pudidx > pudidx_limit)
515 if (pud_next < limit) 614 goto out;
516 pmd_limit = pud_next;
517 else
518 pmd_limit = limit;
519 615
520 if (pud_none(*pud)) 616 if (pud_none(pud[pudidx]))
521 continue; 617 continue;
522 618
523 pmd = pmd_offset(pud, 0); 619 pmd = pmd_offset(&pud[pudidx], 0);
524 620
525 if (PTRS_PER_PMD > 1) /* not folded */ 621 if (PTRS_PER_PMD > 1) /* not folded */
526 flush |= (*func)(virt_to_page(pmd), PT_PMD); 622 flush |= (*func)(virt_to_page(pmd), PT_PMD);
527 623
528 for (; addr != pmd_limit; pmd++) { 624 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
529 addr += (PAGE_SIZE * PTRS_PER_PTE); 625 struct page *pte;
530 if ((pmd_limit-1) < (addr-1)) { 626
531 addr = pmd_limit; 627 if (pgdidx == pgdidx_limit &&
532 break; 628 pudidx == pudidx_limit &&
533 } 629 pmdidx > pmdidx_limit)
630 goto out;
534 631
535 if (pmd_none(*pmd)) 632 if (pmd_none(pmd[pmdidx]))
536 continue; 633 continue;
537 634
538 flush |= (*func)(pmd_page(*pmd), PT_PTE); 635 pte = pmd_page(pmd[pmdidx]);
636 flush |= (*func)(pte, PT_PTE);
539 } 637 }
540 } 638 }
541 } 639 }
542 640out:
543 flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
544 641
545 return flush; 642 return flush;
546} 643}
@@ -622,14 +719,31 @@ void xen_pgd_pin(pgd_t *pgd)
622{ 719{
623 xen_mc_batch(); 720 xen_mc_batch();
624 721
625 if (pgd_walk(pgd, pin_page, TASK_SIZE)) { 722 if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
626 /* re-enable interrupts for kmap_flush_unused */ 723 /* re-enable interrupts for kmap_flush_unused */
627 xen_mc_issue(0); 724 xen_mc_issue(0);
628 kmap_flush_unused(); 725 kmap_flush_unused();
629 xen_mc_batch(); 726 xen_mc_batch();
630 } 727 }
631 728
729#ifdef CONFIG_X86_64
730 {
731 pgd_t *user_pgd = xen_get_user_pgd(pgd);
732
733 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
734
735 if (user_pgd) {
736 pin_page(virt_to_page(user_pgd), PT_PGD);
737 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
738 }
739 }
740#else /* CONFIG_X86_32 */
741#ifdef CONFIG_X86_PAE
742 /* Need to make sure unshared kernel PMD is pinnable */
743 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
744#endif
632 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 745 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
746#endif /* CONFIG_X86_64 */
633 xen_mc_issue(0); 747 xen_mc_issue(0);
634} 748}
635 749
@@ -656,9 +770,11 @@ void xen_mm_pin_all(void)
656 spin_unlock_irqrestore(&pgd_lock, flags); 770 spin_unlock_irqrestore(&pgd_lock, flags);
657} 771}
658 772
659/* The init_mm pagetable is really pinned as soon as its created, but 773/*
660 that's before we have page structures to store the bits. So do all 774 * The init_mm pagetable is really pinned as soon as its created, but
661 the book-keeping now. */ 775 * that's before we have page structures to store the bits. So do all
776 * the book-keeping now.
777 */
662static __init int mark_pinned(struct page *page, enum pt_level level) 778static __init int mark_pinned(struct page *page, enum pt_level level)
663{ 779{
664 SetPagePinned(page); 780 SetPagePinned(page);
@@ -708,7 +824,23 @@ static void xen_pgd_unpin(pgd_t *pgd)
708 824
709 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 825 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
710 826
711 pgd_walk(pgd, unpin_page, TASK_SIZE); 827#ifdef CONFIG_X86_64
828 {
829 pgd_t *user_pgd = xen_get_user_pgd(pgd);
830
831 if (user_pgd) {
832 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
833 unpin_page(virt_to_page(user_pgd), PT_PGD);
834 }
835 }
836#endif
837
838#ifdef CONFIG_X86_PAE
839 /* Need to make sure unshared kernel PMD is unpinned */
 840 unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
841#endif
842
843 pgd_walk(pgd, unpin_page, USER_LIMIT);
712 844
713 xen_mc_issue(0); 845 xen_mc_issue(0);
714} 846}
@@ -727,7 +859,6 @@ void xen_mm_unpin_all(void)
727 list_for_each_entry(page, &pgd_list, lru) { 859 list_for_each_entry(page, &pgd_list, lru) {
728 if (PageSavePinned(page)) { 860 if (PageSavePinned(page)) {
729 BUG_ON(!PagePinned(page)); 861 BUG_ON(!PagePinned(page));
730 printk("unpinning pinned %p\n", page_address(page));
731 xen_pgd_unpin((pgd_t *)page_address(page)); 862 xen_pgd_unpin((pgd_t *)page_address(page));
732 ClearPageSavePinned(page); 863 ClearPageSavePinned(page);
733 } 864 }
@@ -757,8 +888,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
757static void drop_other_mm_ref(void *info) 888static void drop_other_mm_ref(void *info)
758{ 889{
759 struct mm_struct *mm = info; 890 struct mm_struct *mm = info;
891 struct mm_struct *active_mm;
892
893#ifdef CONFIG_X86_64
894 active_mm = read_pda(active_mm);
895#else
896 active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
897#endif
760 898
761 if (__get_cpu_var(cpu_tlbstate).active_mm == mm) 899 if (active_mm == mm)
762 leave_mm(smp_processor_id()); 900 leave_mm(smp_processor_id());
763 901
764 /* If this cpu still has a stale cr3 reference, then make sure 902 /* If this cpu still has a stale cr3 reference, then make sure
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 297bf9f5b8bc..0f59bd03f9e3 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -10,18 +10,6 @@ enum pt_level {
10 PT_PTE 10 PT_PTE
11}; 11};
12 12
13/*
14 * Page-directory addresses above 4GB do not fit into architectural %cr3.
15 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
16 * must use the following accessor macros to pack/unpack valid MFNs.
17 *
18 * Note that Xen is using the fact that the pagetable base is always
19 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
20 * of cr3.
21 */
22#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
23#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
24
25 13
26void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 14void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
27 15
@@ -44,13 +32,26 @@ pgd_t xen_make_pgd(pgdval_t);
44void xen_set_pte(pte_t *ptep, pte_t pteval); 32void xen_set_pte(pte_t *ptep, pte_t pteval);
45void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 33void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
46 pte_t *ptep, pte_t pteval); 34 pte_t *ptep, pte_t pteval);
35
36#ifdef CONFIG_X86_PAE
47void xen_set_pte_atomic(pte_t *ptep, pte_t pte); 37void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
38void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
39void xen_pmd_clear(pmd_t *pmdp);
40#endif /* CONFIG_X86_PAE */
41
48void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); 42void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
49void xen_set_pud(pud_t *ptr, pud_t val); 43void xen_set_pud(pud_t *ptr, pud_t val);
50void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval); 44void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
51void xen_set_pud_hyper(pud_t *ptr, pud_t val); 45void xen_set_pud_hyper(pud_t *ptr, pud_t val);
52void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 46
53void xen_pmd_clear(pmd_t *pmdp); 47#if PAGETABLE_LEVELS == 4
48pudval_t xen_pud_val(pud_t pud);
49pud_t xen_make_pud(pudval_t pudval);
50void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
51void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
52#endif
53
54pgd_t *xen_get_user_pgd(pgd_t *pgd);
54 55
55pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 56pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
56void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 57void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 3c63c4da7ed1..9efd1c6c9776 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -76,6 +76,7 @@ void xen_mc_flush(void)
76 if (ret) { 76 if (ret) {
77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", 77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
78 ret, smp_processor_id()); 78 ret, smp_processor_id());
79 dump_stack();
79 for (i = 0; i < b->mcidx; i++) { 80 for (i = 0; i < b->mcidx; i++) {
80 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", 81 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
81 i+1, b->mcidx, 82 i+1, b->mcidx,
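
For orientation, the batch that xen_mc_flush() drains above is normally filled and issued with the existing multicall helpers. A hedged sketch of the typical calling pattern follows; vaddr and pteval are placeholders, not kernel symbols.

/*
 * Hedged sketch of queueing work into the multicall batch that
 * xen_mc_flush() later drains.
 */
static void queue_va_update(unsigned long vaddr, pte_t pteval)
{
	struct multicall_space mcs;

	xen_mc_batch();			/* open a batch (disables interrupts) */

	mcs = xen_mc_entry(0);		/* reserve one multicall slot */
	MULTI_update_va_mapping(mcs.mc, vaddr, pteval, 0);

	xen_mc_issue(PARAVIRT_LAZY_MMU);	/* flush now unless in lazy MMU mode */
}
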
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index e0a39595bde3..b6acc3a0af46 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -83,30 +83,72 @@ static void xen_idle(void)
83 83
84/* 84/*
85 * Set the bit indicating "nosegneg" library variants should be used. 85 * Set the bit indicating "nosegneg" library variants should be used.
86 * We only need to bother in pure 32-bit mode; compat 32-bit processes
87 * can have un-truncated segments, so wrapping around is allowed.
86 */ 88 */
87static void __init fiddle_vdso(void) 89static void __init fiddle_vdso(void)
88{ 90{
89 extern const char vdso32_default_start; 91#ifdef CONFIG_X86_32
90 u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); 92 u32 *mask;
93 mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
91 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; 94 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
95 mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
96 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
97#endif
92} 98}
93 99
94void xen_enable_sysenter(void) 100static __cpuinit int register_callback(unsigned type, const void *func)
95{ 101{
96 int cpu = smp_processor_id(); 102 struct callback_register callback = {
97 extern void xen_sysenter_target(void); 103 .type = type,
98 /* Mask events on entry, even though they get enabled immediately */ 104 .address = XEN_CALLBACK(__KERNEL_CS, func),
99 static struct callback_register sysenter = {
100 .type = CALLBACKTYPE_sysenter,
101 .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
102 .flags = CALLBACKF_mask_events, 105 .flags = CALLBACKF_mask_events,
103 }; 106 };
104 107
105 if (!boot_cpu_has(X86_FEATURE_SEP) || 108 return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
106 HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) { 109}
107 clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); 110
108 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); 111void __cpuinit xen_enable_sysenter(void)
112{
113 extern void xen_sysenter_target(void);
114 int ret;
115 unsigned sysenter_feature;
116
117#ifdef CONFIG_X86_32
118 sysenter_feature = X86_FEATURE_SEP;
119#else
120 sysenter_feature = X86_FEATURE_SYSENTER32;
121#endif
122
123 if (!boot_cpu_has(sysenter_feature))
124 return;
125
126 ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
 127 if (ret != 0)
128 setup_clear_cpu_cap(sysenter_feature);
129}
130
131void __cpuinit xen_enable_syscall(void)
132{
133#ifdef CONFIG_X86_64
134 int ret;
135 extern void xen_syscall_target(void);
136 extern void xen_syscall32_target(void);
137
138 ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
139 if (ret != 0) {
140 printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
141 /* Pretty fatal; 64-bit userspace has no other
142 mechanism for syscalls. */
109 } 143 }
144
145 if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
146 ret = register_callback(CALLBACKTYPE_syscall32,
147 xen_syscall32_target);
148 if (ret != 0)
149 setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
150 }
151#endif /* CONFIG_X86_64 */
110} 152}
111 153
112void __init xen_arch_setup(void) 154void __init xen_arch_setup(void)
@@ -120,10 +162,12 @@ void __init xen_arch_setup(void)
120 if (!xen_feature(XENFEAT_auto_translated_physmap)) 162 if (!xen_feature(XENFEAT_auto_translated_physmap))
121 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); 163 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
122 164
123 HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, 165 if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
124 __KERNEL_CS, (unsigned long)xen_failsafe_callback); 166 register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
167 BUG();
125 168
126 xen_enable_sysenter(); 169 xen_enable_sysenter();
170 xen_enable_syscall();
127 171
128 set_iopl.iopl = 1; 172 set_iopl.iopl = 1;
129 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 173 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
@@ -143,11 +187,6 @@ void __init xen_arch_setup(void)
143 187
144 pm_idle = xen_idle; 188 pm_idle = xen_idle;
145 189
146#ifdef CONFIG_SMP
147 /* fill cpus_possible with all available cpus */
148 xen_fill_possible_map();
149#endif
150
151 paravirt_disable_iospace(); 190 paravirt_disable_iospace();
152 191
153 fiddle_vdso(); 192 fiddle_vdso();
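
The setup.c changes funnel all callback registration through register_callback() and degrade gracefully when a callback type cannot be registered. A hedged sketch of that register-or-disable pattern, as used for sysenter and syscall above; my_feature, cb_type and my_entry are placeholders, not kernel symbols.

/*
 * Hedged sketch of the register-or-disable pattern; parameters are
 * placeholders standing in for X86_FEATURE_SEP/SYSENTER32 etc.
 */
static void __cpuinit enable_fast_entry(unsigned my_feature, unsigned cb_type,
					const void *my_entry)
{
	if (!boot_cpu_has(my_feature))
		return;				/* CPU lacks this entry path entirely */

	if (register_callback(cb_type, my_entry) != 0)
		setup_clear_cpu_cap(my_feature);	/* hide it so userspace won't use it */
}
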
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 233156f39b7f..e693812ac59a 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -15,6 +15,7 @@
15 * This does not handle HOTPLUG_CPU yet. 15 * This does not handle HOTPLUG_CPU yet.
16 */ 16 */
17#include <linux/sched.h> 17#include <linux/sched.h>
18#include <linux/kernel_stat.h>
18#include <linux/err.h> 19#include <linux/err.h>
19#include <linux/smp.h> 20#include <linux/smp.h>
20 21
@@ -35,6 +36,8 @@
35#include "xen-ops.h" 36#include "xen-ops.h"
36#include "mmu.h" 37#include "mmu.h"
37 38
39static void __cpuinit xen_init_lock_cpu(int cpu);
40
38cpumask_t xen_cpu_initialized_map; 41cpumask_t xen_cpu_initialized_map;
39 42
40static DEFINE_PER_CPU(int, resched_irq); 43static DEFINE_PER_CPU(int, resched_irq);
@@ -66,13 +69,22 @@ static __cpuinit void cpu_bringup_and_idle(void)
66 int cpu = smp_processor_id(); 69 int cpu = smp_processor_id();
67 70
68 cpu_init(); 71 cpu_init();
72 preempt_disable();
73
69 xen_enable_sysenter(); 74 xen_enable_sysenter();
75 xen_enable_syscall();
70 76
71 preempt_disable(); 77 cpu = smp_processor_id();
72 per_cpu(cpu_state, cpu) = CPU_ONLINE; 78 smp_store_cpu_info(cpu);
79 cpu_data(cpu).x86_max_cores = 1;
80 set_cpu_sibling_map(cpu);
73 81
74 xen_setup_cpu_clockevents(); 82 xen_setup_cpu_clockevents();
75 83
84 cpu_set(cpu, cpu_online_map);
85 x86_write_percpu(cpu_state, CPU_ONLINE);
86 wmb();
87
76 /* We can take interrupts now: we're officially "up". */ 88 /* We can take interrupts now: we're officially "up". */
77 local_irq_enable(); 89 local_irq_enable();
78 90
@@ -141,56 +153,39 @@ static int xen_smp_intr_init(unsigned int cpu)
141 return rc; 153 return rc;
142} 154}
143 155
144void __init xen_fill_possible_map(void) 156static void __init xen_fill_possible_map(void)
145{ 157{
146 int i, rc; 158 int i, rc;
147 159
148 for (i = 0; i < NR_CPUS; i++) { 160 for (i = 0; i < NR_CPUS; i++) {
149 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 161 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
150 if (rc >= 0) 162 if (rc >= 0) {
163 num_processors++;
151 cpu_set(i, cpu_possible_map); 164 cpu_set(i, cpu_possible_map);
165 }
152 } 166 }
153} 167}
154 168
155void __init xen_smp_prepare_boot_cpu(void) 169static void __init xen_smp_prepare_boot_cpu(void)
156{ 170{
157 int cpu;
158
159 BUG_ON(smp_processor_id() != 0); 171 BUG_ON(smp_processor_id() != 0);
160 native_smp_prepare_boot_cpu(); 172 native_smp_prepare_boot_cpu();
161 173
162 /* We've switched to the "real" per-cpu gdt, so make sure the 174 /* We've switched to the "real" per-cpu gdt, so make sure the
163 old memory can be recycled */ 175 old memory can be recycled */
164 make_lowmem_page_readwrite(&per_cpu__gdt_page); 176 make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
165
166 for_each_possible_cpu(cpu) {
167 cpus_clear(per_cpu(cpu_sibling_map, cpu));
168 /*
169 * cpu_core_map lives in a per cpu area that is cleared
170 * when the per cpu array is allocated.
171 *
172 * cpus_clear(per_cpu(cpu_core_map, cpu));
173 */
174 }
175 177
176 xen_setup_vcpu_info_placement(); 178 xen_setup_vcpu_info_placement();
177} 179}
178 180
179void __init xen_smp_prepare_cpus(unsigned int max_cpus) 181static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
180{ 182{
181 unsigned cpu; 183 unsigned cpu;
182 184
183 for_each_possible_cpu(cpu) { 185 xen_init_lock_cpu(0);
184 cpus_clear(per_cpu(cpu_sibling_map, cpu));
185 /*
186 * cpu_core_ map will be zeroed when the per
187 * cpu area is allocated.
188 *
189 * cpus_clear(per_cpu(cpu_core_map, cpu));
190 */
191 }
192 186
193 smp_store_cpu_info(0); 187 smp_store_cpu_info(0);
188 cpu_data(0).x86_max_cores = 1;
194 set_cpu_sibling_map(0); 189 set_cpu_sibling_map(0);
195 190
196 if (xen_smp_intr_init(0)) 191 if (xen_smp_intr_init(0))
@@ -225,7 +220,7 @@ static __cpuinit int
225cpu_initialize_context(unsigned int cpu, struct task_struct *idle) 220cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
226{ 221{
227 struct vcpu_guest_context *ctxt; 222 struct vcpu_guest_context *ctxt;
228 struct gdt_page *gdt = &per_cpu(gdt_page, cpu); 223 struct desc_struct *gdt;
229 224
230 if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) 225 if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
231 return 0; 226 return 0;
@@ -234,12 +229,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
234 if (ctxt == NULL) 229 if (ctxt == NULL)
235 return -ENOMEM; 230 return -ENOMEM;
236 231
232 gdt = get_cpu_gdt_table(cpu);
233
237 ctxt->flags = VGCF_IN_KERNEL; 234 ctxt->flags = VGCF_IN_KERNEL;
238 ctxt->user_regs.ds = __USER_DS; 235 ctxt->user_regs.ds = __USER_DS;
239 ctxt->user_regs.es = __USER_DS; 236 ctxt->user_regs.es = __USER_DS;
240 ctxt->user_regs.fs = __KERNEL_PERCPU;
241 ctxt->user_regs.gs = 0;
242 ctxt->user_regs.ss = __KERNEL_DS; 237 ctxt->user_regs.ss = __KERNEL_DS;
238#ifdef CONFIG_X86_32
239 ctxt->user_regs.fs = __KERNEL_PERCPU;
240#endif
243 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; 241 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
244 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ 242 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
245 243
@@ -249,11 +247,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
249 247
250 ctxt->ldt_ents = 0; 248 ctxt->ldt_ents = 0;
251 249
252 BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); 250 BUG_ON((unsigned long)gdt & ~PAGE_MASK);
253 make_lowmem_page_readonly(gdt->gdt); 251 make_lowmem_page_readonly(gdt);
254 252
255 ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); 253 ctxt->gdt_frames[0] = virt_to_mfn(gdt);
256 ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); 254 ctxt->gdt_ents = GDT_ENTRIES;
257 255
258 ctxt->user_regs.cs = __KERNEL_CS; 256 ctxt->user_regs.cs = __KERNEL_CS;
259 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); 257 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
@@ -261,9 +259,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
261 ctxt->kernel_ss = __KERNEL_DS; 259 ctxt->kernel_ss = __KERNEL_DS;
262 ctxt->kernel_sp = idle->thread.sp0; 260 ctxt->kernel_sp = idle->thread.sp0;
263 261
262#ifdef CONFIG_X86_32
264 ctxt->event_callback_cs = __KERNEL_CS; 263 ctxt->event_callback_cs = __KERNEL_CS;
265 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
266 ctxt->failsafe_callback_cs = __KERNEL_CS; 264 ctxt->failsafe_callback_cs = __KERNEL_CS;
265#endif
266 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
267 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; 267 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
268 268
269 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); 269 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
@@ -276,7 +276,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
276 return 0; 276 return 0;
277} 277}
278 278
279int __cpuinit xen_cpu_up(unsigned int cpu) 279static int __cpuinit xen_cpu_up(unsigned int cpu)
280{ 280{
281 struct task_struct *idle = idle_task(cpu); 281 struct task_struct *idle = idle_task(cpu);
282 int rc; 282 int rc;
@@ -287,10 +287,28 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
287 return rc; 287 return rc;
288#endif 288#endif
289 289
290#ifdef CONFIG_X86_64
291 /* Allocate node local memory for AP pdas */
292 WARN_ON(cpu == 0);
293 if (cpu > 0) {
294 rc = get_local_pda(cpu);
295 if (rc)
296 return rc;
297 }
298#endif
299
300#ifdef CONFIG_X86_32
290 init_gdt(cpu); 301 init_gdt(cpu);
291 per_cpu(current_task, cpu) = idle; 302 per_cpu(current_task, cpu) = idle;
292 irq_ctx_init(cpu); 303 irq_ctx_init(cpu);
304#else
305 cpu_pda(cpu)->pcurrent = idle;
306 clear_tsk_thread_flag(idle, TIF_FORK);
307#endif
293 xen_setup_timer(cpu); 308 xen_setup_timer(cpu);
309 xen_init_lock_cpu(cpu);
310
311 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
294 312
295 /* make sure interrupts start blocked */ 313 /* make sure interrupts start blocked */
296 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; 314 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -306,20 +324,18 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
306 if (rc) 324 if (rc)
307 return rc; 325 return rc;
308 326
309 smp_store_cpu_info(cpu);
310 set_cpu_sibling_map(cpu);
311 /* This must be done before setting cpu_online_map */
312 wmb();
313
314 cpu_set(cpu, cpu_online_map);
315
316 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); 327 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
317 BUG_ON(rc); 328 BUG_ON(rc);
318 329
330 while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
331 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
332 barrier();
333 }
334
319 return 0; 335 return 0;
320} 336}
321 337
322void xen_smp_cpus_done(unsigned int max_cpus) 338static void xen_smp_cpus_done(unsigned int max_cpus)
323{ 339{
324} 340}
325 341
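
With these changes the boot CPU no longer marks the new CPU online itself: cpu_bringup_and_idle() sets CPU_ONLINE on the AP, and xen_cpu_up() just waits for that to happen. A hedged sketch of the handshake follows; ap_side() and bp_side() are illustrative names, not kernel functions, and elided steps are marked with comments.

/*
 * Hedged sketch of the bringup handshake; assumes the per-cpu helpers
 * used in the patch (per_cpu, x86_write_percpu, cpu_set, ...).
 */
static void ap_side(int cpu)		/* runs on the new vcpu, from cpu_bringup_and_idle() */
{
	/* ... cpu_init(), sibling maps, clockevents ... */
	cpu_set(cpu, cpu_online_map);
	x86_write_percpu(cpu_state, CPU_ONLINE);
	wmb();				/* publish CPU_ONLINE before idling */
	local_irq_enable();
}

static void bp_side(int cpu)		/* runs on the boot cpu, from xen_cpu_up() */
{
	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
	/* ... context setup, VCPUOP_up hypercall ... */
	while (per_cpu(cpu_state, cpu) != CPU_ONLINE) {
		HYPERVISOR_sched_op(SCHEDOP_yield, 0);	/* let the new vcpu run */
		barrier();
	}
}
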
@@ -335,12 +351,12 @@ static void stop_self(void *v)
335 BUG(); 351 BUG();
336} 352}
337 353
338void xen_smp_send_stop(void) 354static void xen_smp_send_stop(void)
339{ 355{
340 smp_call_function(stop_self, NULL, 0); 356 smp_call_function(stop_self, NULL, 0);
341} 357}
342 358
343void xen_smp_send_reschedule(int cpu) 359static void xen_smp_send_reschedule(int cpu)
344{ 360{
345 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); 361 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
346} 362}
@@ -355,7 +371,7 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
355 xen_send_IPI_one(cpu, vector); 371 xen_send_IPI_one(cpu, vector);
356} 372}
357 373
358void xen_smp_send_call_function_ipi(cpumask_t mask) 374static void xen_smp_send_call_function_ipi(cpumask_t mask)
359{ 375{
360 int cpu; 376 int cpu;
361 377
@@ -370,7 +386,7 @@ void xen_smp_send_call_function_ipi(cpumask_t mask)
370 } 386 }
371} 387}
372 388
373void xen_smp_send_call_function_single_ipi(int cpu) 389static void xen_smp_send_call_function_single_ipi(int cpu)
374{ 390{
375 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); 391 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
376} 392}
@@ -379,7 +395,11 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
379{ 395{
380 irq_enter(); 396 irq_enter();
381 generic_smp_call_function_interrupt(); 397 generic_smp_call_function_interrupt();
398#ifdef CONFIG_X86_32
382 __get_cpu_var(irq_stat).irq_call_count++; 399 __get_cpu_var(irq_stat).irq_call_count++;
400#else
401 add_pda(irq_call_count, 1);
402#endif
383 irq_exit(); 403 irq_exit();
384 404
385 return IRQ_HANDLED; 405 return IRQ_HANDLED;
@@ -389,8 +409,196 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
389{ 409{
390 irq_enter(); 410 irq_enter();
391 generic_smp_call_function_single_interrupt(); 411 generic_smp_call_function_single_interrupt();
412#ifdef CONFIG_X86_32
392 __get_cpu_var(irq_stat).irq_call_count++; 413 __get_cpu_var(irq_stat).irq_call_count++;
414#else
415 add_pda(irq_call_count, 1);
416#endif
393 irq_exit(); 417 irq_exit();
394 418
395 return IRQ_HANDLED; 419 return IRQ_HANDLED;
396} 420}
421
422struct xen_spinlock {
423 unsigned char lock; /* 0 -> free; 1 -> locked */
424 unsigned short spinners; /* count of waiting cpus */
425};
426
427static int xen_spin_is_locked(struct raw_spinlock *lock)
428{
429 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
430
431 return xl->lock != 0;
432}
433
434static int xen_spin_is_contended(struct raw_spinlock *lock)
435{
436 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
437
438 /* Not strictly true; this is only the count of contended
439 lock-takers entering the slow path. */
440 return xl->spinners != 0;
441}
442
443static int xen_spin_trylock(struct raw_spinlock *lock)
444{
445 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
446 u8 old = 1;
447
448 asm("xchgb %b0,%1"
449 : "+q" (old), "+m" (xl->lock) : : "memory");
450
451 return old == 0;
452}
453
454static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
455static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
456
457static inline void spinning_lock(struct xen_spinlock *xl)
458{
459 __get_cpu_var(lock_spinners) = xl;
460 wmb(); /* set lock of interest before count */
461 asm(LOCK_PREFIX " incw %0"
462 : "+m" (xl->spinners) : : "memory");
463}
464
465static inline void unspinning_lock(struct xen_spinlock *xl)
466{
467 asm(LOCK_PREFIX " decw %0"
468 : "+m" (xl->spinners) : : "memory");
469 wmb(); /* decrement count before clearing lock */
470 __get_cpu_var(lock_spinners) = NULL;
471}
472
473static noinline int xen_spin_lock_slow(struct raw_spinlock *lock)
474{
475 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
476 int irq = __get_cpu_var(lock_kicker_irq);
477 int ret;
478
479 /* If kicker interrupts not initialized yet, just spin */
480 if (irq == -1)
481 return 0;
482
483 /* announce we're spinning */
484 spinning_lock(xl);
485
486 /* clear pending */
487 xen_clear_irq_pending(irq);
488
 489 /* check again to make sure it didn't become free while
490 we weren't looking */
491 ret = xen_spin_trylock(lock);
492 if (ret)
493 goto out;
494
495 /* block until irq becomes pending */
496 xen_poll_irq(irq);
497 kstat_this_cpu.irqs[irq]++;
498
499out:
500 unspinning_lock(xl);
501 return ret;
502}
503
504static void xen_spin_lock(struct raw_spinlock *lock)
505{
506 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
507 int timeout;
508 u8 oldval;
509
510 do {
511 timeout = 1 << 10;
512
513 asm("1: xchgb %1,%0\n"
514 " testb %1,%1\n"
515 " jz 3f\n"
516 "2: rep;nop\n"
517 " cmpb $0,%0\n"
518 " je 1b\n"
519 " dec %2\n"
520 " jnz 2b\n"
521 "3:\n"
522 : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
523 : "1" (1)
524 : "memory");
525
526 } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock)));
527}
528
529static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
530{
531 int cpu;
532
533 for_each_online_cpu(cpu) {
534 /* XXX should mix up next cpu selection */
535 if (per_cpu(lock_spinners, cpu) == xl) {
536 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
537 break;
538 }
539 }
540}
541
542static void xen_spin_unlock(struct raw_spinlock *lock)
543{
544 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
545
546 smp_wmb(); /* make sure no writes get moved after unlock */
547 xl->lock = 0; /* release lock */
548
549 /* make sure unlock happens before kick */
550 barrier();
551
552 if (unlikely(xl->spinners))
553 xen_spin_unlock_slow(xl);
554}
555
556static __cpuinit void xen_init_lock_cpu(int cpu)
557{
558 int irq;
559 const char *name;
560
561 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
562 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
563 cpu,
564 xen_reschedule_interrupt,
565 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
566 name,
567 NULL);
568
569 if (irq >= 0) {
570 disable_irq(irq); /* make sure it's never delivered */
571 per_cpu(lock_kicker_irq, cpu) = irq;
572 }
573
 574 printk(KERN_DEBUG "cpu %d spinlock event irq %d\n", cpu, irq);
575}
576
577static void __init xen_init_spinlocks(void)
578{
579 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
580 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
581 pv_lock_ops.spin_lock = xen_spin_lock;
582 pv_lock_ops.spin_trylock = xen_spin_trylock;
583 pv_lock_ops.spin_unlock = xen_spin_unlock;
584}
585
586static const struct smp_ops xen_smp_ops __initdata = {
587 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
588 .smp_prepare_cpus = xen_smp_prepare_cpus,
589 .cpu_up = xen_cpu_up,
590 .smp_cpus_done = xen_smp_cpus_done,
591
592 .smp_send_stop = xen_smp_send_stop,
593 .smp_send_reschedule = xen_smp_send_reschedule,
594
595 .send_call_func_ipi = xen_smp_send_call_function_ipi,
596 .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
597};
598
599void __init xen_smp_init(void)
600{
601 smp_ops = xen_smp_ops;
602 xen_fill_possible_map();
603 xen_init_spinlocks();
604}
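
Taken together, the new pv spinlock is a byte lock with an event-channel slow path: a waiter advertises which lock it is spinning on, blocks on its per-cpu "kick" interrupt, and the unlocker kicks exactly one such waiter. A hedged sketch of the protocol follows; poll_for_kick() and kick_cpu() are illustrative stand-ins for xen_poll_irq() and xen_send_IPI_one(), waiting_on plays the role of the per-cpu lock_spinners pointer, and the atomics are shown as plain C.

/*
 * Hedged sketch of the lock/unlock protocol, outside the kernel's
 * raw_spinlock plumbing; kernel macros (xchg, smp_wmb, NR_CPUS) assumed.
 */
struct byte_lock {
	unsigned char lock;		/* 0 -> free; 1 -> held */
	unsigned short spinners;	/* waiters in the slow path */
};

static struct byte_lock *waiting_on[NR_CPUS];	/* per-cpu in the real code */
static void poll_for_kick(int cpu);	/* block until kicked (event channel poll) */
static void kick_cpu(int cpu);		/* send the per-cpu unlock IPI */

static void sketch_lock(struct byte_lock *bl, int me)
{
	while (xchg(&bl->lock, 1) != 0) {	/* fast path: grab the byte */
		waiting_on[me] = bl;		/* advertise what we spin on */
		bl->spinners++;			/* LOCK-prefixed inc in the real code */
		if (bl->lock != 0)		/* re-check: may have been freed */
			poll_for_kick(me);	/* sleep until the unlocker kicks us */
		bl->spinners--;
		waiting_on[me] = NULL;
	}
}

static void sketch_unlock(struct byte_lock *bl, int ncpus)
{
	int cpu;

	smp_wmb();			/* order protected stores before release */
	bl->lock = 0;
	barrier();			/* release before checking for waiters */
	if (bl->spinners == 0)
		return;
	for (cpu = 0; cpu < ncpus; cpu++)	/* kick exactly one waiter */
		if (waiting_on[cpu] == bl) {
			kick_cpu(cpu);
			break;
		}
}
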
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 251669a932d4..2a234db5949b 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -38,8 +38,11 @@ void xen_post_suspend(int suspend_cancelled)
38 xen_cpu_initialized_map = cpu_online_map; 38 xen_cpu_initialized_map = cpu_online_map;
39#endif 39#endif
40 xen_vcpu_restore(); 40 xen_vcpu_restore();
41 xen_timer_resume();
42 } 41 }
43 42
44} 43}
45 44
45void xen_arch_resume(void)
46{
47 /* nothing */
48}
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41de..2497a30f41de 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm_32.S
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
new file mode 100644
index 000000000000..4038cbfe3331
--- /dev/null
+++ b/arch/x86/xen/xen-asm_64.S
@@ -0,0 +1,271 @@
1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
5
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
8
9 We only bother with direct forms (ie, vcpu in pda) of the operations
10 here; the indirect forms are better handled in C, since they're
11 generally too large to inline anyway.
12 */
13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/processor-flags.h>
18#include <asm/errno.h>
19#include <asm/segment.h>
20
21#include <xen/interface/xen.h>
22
23#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
24#define ENDPATCH(x) .globl x##_end; x##_end=.
25
26/* Pseudo-flag used for virtual NMI, which we don't implement yet */
27#define XEN_EFLAGS_NMI 0x80000000
28
29#if 0
30#include <asm/percpu.h>
31
32/*
33 Enable events. This clears the event mask and tests the pending
 34 event status with a single and operation. If there are pending
35 events, then enter the hypervisor to get them handled.
36 */
37ENTRY(xen_irq_enable_direct)
38 /* Unmask events */
39 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
40
41 /* Preempt here doesn't matter because that will deal with
42 any pending interrupts. The pending check may end up being
43 run on the wrong CPU, but that doesn't hurt. */
44
45 /* Test for pending */
46 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
47 jz 1f
48
492: call check_events
501:
51ENDPATCH(xen_irq_enable_direct)
52 ret
53 ENDPROC(xen_irq_enable_direct)
54 RELOC(xen_irq_enable_direct, 2b+1)
55
56/*
57 Disabling events is simply a matter of making the event mask
58 non-zero.
59 */
60ENTRY(xen_irq_disable_direct)
61 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
62ENDPATCH(xen_irq_disable_direct)
63 ret
64 ENDPROC(xen_irq_disable_direct)
65 RELOC(xen_irq_disable_direct, 0)
66
67/*
68 (xen_)save_fl is used to get the current interrupt enable status.
69 Callers expect the status to be in X86_EFLAGS_IF, and other bits
70 may be set in the return value. We take advantage of this by
71 making sure that X86_EFLAGS_IF has the right value (and other bits
72 in that byte are 0), but other bits in the return value are
73 undefined. We need to toggle the state of the bit, because
74 Xen and x86 use opposite senses (mask vs enable).
75 */
76ENTRY(xen_save_fl_direct)
77 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
78 setz %ah
79 addb %ah,%ah
80ENDPATCH(xen_save_fl_direct)
81 ret
82 ENDPROC(xen_save_fl_direct)
83 RELOC(xen_save_fl_direct, 0)
84
85/*
 86 In principle the caller should be passing us a value returned
 87 from xen_save_fl_direct, but for robustness' sake we test only
88 the X86_EFLAGS_IF flag rather than the whole byte. After
89 setting the interrupt mask state, it checks for unmasked
90 pending events and enters the hypervisor to get them delivered
91 if so.
92 */
93ENTRY(xen_restore_fl_direct)
94 testb $X86_EFLAGS_IF>>8, %ah
95 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
96 /* Preempt here doesn't matter because that will deal with
97 any pending interrupts. The pending check may end up being
98 run on the wrong CPU, but that doesn't hurt. */
99
100 /* check for unmasked and pending */
101 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
102 jz 1f
1032: call check_events
1041:
105ENDPATCH(xen_restore_fl_direct)
106 ret
107 ENDPROC(xen_restore_fl_direct)
108 RELOC(xen_restore_fl_direct, 2b+1)
109
110
111/*
112 Force an event check by making a hypercall,
113 but preserve regs before making the call.
114 */
115check_events:
116 push %rax
117 push %rcx
118 push %rdx
119 push %rsi
120 push %rdi
121 push %r8
122 push %r9
123 push %r10
124 push %r11
125 call force_evtchn_callback
126 pop %r11
127 pop %r10
128 pop %r9
129 pop %r8
130 pop %rdi
131 pop %rsi
132 pop %rdx
133 pop %rcx
134 pop %rax
135 ret
136#endif
137
138ENTRY(xen_adjust_exception_frame)
139 mov 8+0(%rsp),%rcx
140 mov 8+8(%rsp),%r11
141 ret $16
142
143hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
144/*
145 Xen64 iret frame:
146
147 ss
148 rsp
149 rflags
150 cs
151 rip <-- standard iret frame
152
153 flags
154
155 rcx }
156 r11 }<-- pushed by hypercall page
157rsp -> rax }
158 */
159ENTRY(xen_iret)
160 pushq $0
1611: jmp hypercall_iret
162ENDPATCH(xen_iret)
163RELOC(xen_iret, 1b+1)
164
165/*
166 sysexit is not used for 64-bit processes, so it's
167 only ever used to return to 32-bit compat userspace.
168 */
169ENTRY(xen_sysexit)
170 pushq $__USER32_DS
171 pushq %rcx
172 pushq $X86_EFLAGS_IF
173 pushq $__USER32_CS
174 pushq %rdx
175
176 pushq $VGCF_in_syscall
1771: jmp hypercall_iret
178ENDPATCH(xen_sysexit)
179RELOC(xen_sysexit, 1b+1)
180
181ENTRY(xen_sysret64)
182 /* We're already on the usermode stack at this point, but still
183 with the kernel gs, so we can easily switch back */
184 movq %rsp, %gs:pda_oldrsp
185 movq %gs:pda_kernelstack,%rsp
186
187 pushq $__USER_DS
188 pushq %gs:pda_oldrsp
189 pushq %r11
190 pushq $__USER_CS
191 pushq %rcx
192
193 pushq $VGCF_in_syscall
1941: jmp hypercall_iret
195ENDPATCH(xen_sysret64)
196RELOC(xen_sysret64, 1b+1)
197
198ENTRY(xen_sysret32)
199 /* We're already on the usermode stack at this point, but still
200 with the kernel gs, so we can easily switch back */
201 movq %rsp, %gs:pda_oldrsp
202 movq %gs:pda_kernelstack, %rsp
203
204 pushq $__USER32_DS
205 pushq %gs:pda_oldrsp
206 pushq %r11
207 pushq $__USER32_CS
208 pushq %rcx
209
210 pushq $VGCF_in_syscall
2111: jmp hypercall_iret
212ENDPATCH(xen_sysret32)
213RELOC(xen_sysret32, 1b+1)
214
215/*
216 Xen handles syscall callbacks much like ordinary exceptions,
217 which means we have:
218 - kernel gs
219 - kernel rsp
220 - an iret-like stack frame on the stack (including rcx and r11):
221 ss
222 rsp
223 rflags
224 cs
225 rip
226 r11
227 rsp-> rcx
228
229 In all the entrypoints, we undo all that to make it look
230 like a CPU-generated syscall/sysenter and jump to the normal
231 entrypoint.
232 */
233
234.macro undo_xen_syscall
235 mov 0*8(%rsp),%rcx
236 mov 1*8(%rsp),%r11
237 mov 5*8(%rsp),%rsp
238.endm
239
240/* Normal 64-bit system call target */
241ENTRY(xen_syscall_target)
242 undo_xen_syscall
243 jmp system_call_after_swapgs
244ENDPROC(xen_syscall_target)
245
246#ifdef CONFIG_IA32_EMULATION
247
248/* 32-bit compat syscall target */
249ENTRY(xen_syscall32_target)
250 undo_xen_syscall
251 jmp ia32_cstar_target
252ENDPROC(xen_syscall32_target)
253
254/* 32-bit compat sysenter target */
255ENTRY(xen_sysenter_target)
256 undo_xen_syscall
257 jmp ia32_sysenter_target
258ENDPROC(xen_sysenter_target)
259
260#else /* !CONFIG_IA32_EMULATION */
261
262ENTRY(xen_syscall32_target)
263ENTRY(xen_sysenter_target)
264 lea 16(%rsp), %rsp /* strip %rcx,%r11 */
265 mov $-ENOSYS, %rax
266 pushq $VGCF_in_syscall
267 jmp hypercall_iret
268ENDPROC(xen_syscall32_target)
269ENDPROC(xen_sysenter_target)
270
271#endif /* CONFIG_IA32_EMULATION */
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 7c0cf6320a0a..63d49a523ed3 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -5,15 +5,24 @@
5 5
6#include <linux/elfnote.h> 6#include <linux/elfnote.h>
7#include <linux/init.h> 7#include <linux/init.h>
8
8#include <asm/boot.h> 9#include <asm/boot.h>
10#include <asm/asm.h>
11#include <asm/page.h>
12
9#include <xen/interface/elfnote.h> 13#include <xen/interface/elfnote.h>
10#include <asm/xen/interface.h> 14#include <asm/xen/interface.h>
11 15
12 __INIT 16 __INIT
13ENTRY(startup_xen) 17ENTRY(startup_xen)
14 movl %esi,xen_start_info
15 cld 18 cld
16 movl $(init_thread_union+THREAD_SIZE),%esp 19#ifdef CONFIG_X86_32
20 mov %esi,xen_start_info
21 mov $init_thread_union+THREAD_SIZE,%esp
22#else
23 mov %rsi,xen_start_info
24 mov $init_thread_union+THREAD_SIZE,%rsp
25#endif
17 jmp xen_start_kernel 26 jmp xen_start_kernel
18 27
19 __FINIT 28 __FINIT
@@ -21,21 +30,26 @@ ENTRY(startup_xen)
21.pushsection .text 30.pushsection .text
22 .align PAGE_SIZE_asm 31 .align PAGE_SIZE_asm
23ENTRY(hypercall_page) 32ENTRY(hypercall_page)
24 .skip 0x1000 33 .skip PAGE_SIZE_asm
25.popsection 34.popsection
26 35
27 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") 36 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
28 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") 37 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
29 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") 38 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
30 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) 39#ifdef CONFIG_X86_32
31 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) 40 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
32 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) 41#else
42 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
43#endif
44 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
45 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
33 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") 46 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
34 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") 47 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
35 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") 48 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
36 ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, 49 ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
37 .quad _PAGE_PRESENT; .quad _PAGE_PRESENT) 50 .quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
38 ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) 51 ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
39 ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long __HYPERVISOR_VIRT_START) 52 ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
53 ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
40 54
41#endif /*CONFIG_XEN */ 55#endif /*CONFIG_XEN */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 6f4b1045c1c2..dd3c23152a2e 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -26,6 +26,7 @@ char * __init xen_memory_setup(void);
26void __init xen_arch_setup(void); 26void __init xen_arch_setup(void);
27void __init xen_init_IRQ(void); 27void __init xen_init_IRQ(void);
28void xen_enable_sysenter(void); 28void xen_enable_sysenter(void);
29void xen_enable_syscall(void);
29void xen_vcpu_restore(void); 30void xen_vcpu_restore(void);
30 31
31void __init xen_build_dynamic_phys_to_machine(void); 32void __init xen_build_dynamic_phys_to_machine(void);
@@ -37,7 +38,6 @@ void __init xen_time_init(void);
37unsigned long xen_get_wallclock(void); 38unsigned long xen_get_wallclock(void);
38int xen_set_wallclock(unsigned long time); 39int xen_set_wallclock(unsigned long time);
39unsigned long long xen_sched_clock(void); 40unsigned long long xen_sched_clock(void);
40void xen_timer_resume(void);
41 41
42irqreturn_t xen_debug_interrupt(int irq, void *dev_id); 42irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
43 43
@@ -45,20 +45,15 @@ bool xen_vcpu_stolen(int vcpu);
45 45
46void xen_mark_init_mm_pinned(void); 46void xen_mark_init_mm_pinned(void);
47 47
48void __init xen_fill_possible_map(void);
49
50void __init xen_setup_vcpu_info_placement(void); 48void __init xen_setup_vcpu_info_placement(void);
51void xen_smp_prepare_boot_cpu(void);
52void xen_smp_prepare_cpus(unsigned int max_cpus);
53int xen_cpu_up(unsigned int cpu);
54void xen_smp_cpus_done(unsigned int max_cpus);
55 49
56void xen_smp_send_stop(void); 50#ifdef CONFIG_SMP
57void xen_smp_send_reschedule(int cpu); 51void xen_smp_init(void);
58void xen_smp_send_call_function_ipi(cpumask_t mask);
59void xen_smp_send_call_function_single_ipi(int cpu);
60 52
61extern cpumask_t xen_cpu_initialized_map; 53extern cpumask_t xen_cpu_initialized_map;
54#else
55static inline void xen_smp_init(void) {}
56#endif
62 57
63 58
64/* Declare an asm function, along with symbols needed to make it 59/* Declare an asm function, along with symbols needed to make it
@@ -73,7 +68,11 @@ DECL_ASM(void, xen_irq_disable_direct, void);
73DECL_ASM(unsigned long, xen_save_fl_direct, void); 68DECL_ASM(unsigned long, xen_save_fl_direct, void);
74DECL_ASM(void, xen_restore_fl_direct, unsigned long); 69DECL_ASM(void, xen_restore_fl_direct, unsigned long);
75 70
71/* These are not functions, and cannot be called normally */
76void xen_iret(void); 72void xen_iret(void);
77void xen_sysexit(void); 73void xen_sysexit(void);
74void xen_sysret32(void);
75void xen_sysret64(void);
76void xen_adjust_exception_frame(void);
78 77
79#endif /* XEN_OPS_H */ 78#endif /* XEN_OPS_H */