author | Glenn Elliott <gelliott@cs.unc.edu> | 2012-03-04 19:47:13 -0500
committer | Glenn Elliott <gelliott@cs.unc.edu> | 2012-03-04 19:47:13 -0500
commit | c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree | ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /arch/x86/xen
parent | ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent | 6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts:
litmus/sched_cedf.c
Diffstat (limited to 'arch/x86/xen')
-rw-r--r-- | arch/x86/xen/Kconfig | 32
-rw-r--r-- | arch/x86/xen/Makefile | 3
-rw-r--r-- | arch/x86/xen/debugfs.c | 1
-rw-r--r-- | arch/x86/xen/enlighten.c | 152
-rw-r--r-- | arch/x86/xen/irq.c | 4
-rw-r--r-- | arch/x86/xen/mmu.c | 870
-rw-r--r-- | arch/x86/xen/mmu.h | 38
-rw-r--r-- | arch/x86/xen/multicalls.c | 12
-rw-r--r-- | arch/x86/xen/multicalls.h | 2
-rw-r--r-- | arch/x86/xen/p2m.c | 859
-rw-r--r-- | arch/x86/xen/pci-swiotlb-xen.c | 11
-rw-r--r-- | arch/x86/xen/platform-pci-unplug.c | 2
-rw-r--r-- | arch/x86/xen/setup.c | 241
-rw-r--r-- | arch/x86/xen/smp.c | 90
-rw-r--r-- | arch/x86/xen/spinlock.c | 10
-rw-r--r-- | arch/x86/xen/suspend.c | 9
-rw-r--r-- | arch/x86/xen/time.c | 28
-rw-r--r-- | arch/x86/xen/xen-head.S | 4
-rw-r--r-- | arch/x86/xen/xen-ops.h | 9
19 files changed, 1749 insertions, 628 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 68128a1b401a..5cc821cb2e09 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig | |||
@@ -13,25 +13,33 @@ config XEN | |||
13 | kernel to boot in a paravirtualized environment under the | 13 | kernel to boot in a paravirtualized environment under the |
14 | Xen hypervisor. | 14 | Xen hypervisor. |
15 | 15 | ||
16 | config XEN_DOM0 | ||
17 | def_bool y | ||
18 | depends on XEN && PCI_XEN && SWIOTLB_XEN | ||
19 | depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI | ||
20 | |||
21 | # Dummy symbol since people have come to rely on the PRIVILEGED_GUEST | ||
22 | # name in tools. | ||
23 | config XEN_PRIVILEGED_GUEST | ||
24 | def_bool XEN_DOM0 | ||
25 | |||
16 | config XEN_PVHVM | 26 | config XEN_PVHVM |
17 | def_bool y | 27 | def_bool y |
18 | depends on XEN | 28 | depends on XEN |
19 | depends on X86_LOCAL_APIC | 29 | depends on X86_LOCAL_APIC |
20 | 30 | ||
21 | config XEN_MAX_DOMAIN_MEMORY | 31 | config XEN_MAX_DOMAIN_MEMORY |
22 | int "Maximum allowed size of a domain in gigabytes" | 32 | int |
23 | default 8 if X86_32 | 33 | default 128 |
24 | default 32 if X86_64 | ||
25 | depends on XEN | 34 | depends on XEN |
26 | help | 35 | help |
27 | The pseudo-physical to machine address array is sized | 36 | This only affects the sizing of some bss arrays, the unused |
28 | according to the maximum possible memory size of a Xen | 37 | portions of which are freed. |
29 | domain. This array uses 1 page per gigabyte, so there's no | ||
30 | need to be too stingy here. | ||
31 | 38 | ||
32 | config XEN_SAVE_RESTORE | 39 | config XEN_SAVE_RESTORE |
33 | bool | 40 | bool |
34 | depends on XEN && PM | 41 | depends on XEN |
42 | select HIBERNATE_CALLBACKS | ||
35 | default y | 43 | default y |
36 | 44 | ||
37 | config XEN_DEBUG_FS | 45 | config XEN_DEBUG_FS |
@@ -41,3 +49,11 @@ config XEN_DEBUG_FS | |||
41 | help | 49 | help |
42 | Enable statistics output and various tuning options in debugfs. | 50 | Enable statistics output and various tuning options in debugfs. |
43 | Enabling this option may incur a significant performance overhead. | 51 | Enabling this option may incur a significant performance overhead. |
52 | |||
53 | config XEN_DEBUG | ||
54 | bool "Enable Xen debug checks" | ||
55 | depends on XEN | ||
56 | default n | ||
57 | help | ||
58 | Enable various WARN_ON checks in the Xen MMU code. | ||
59 | Enabling this option WILL incur a significant performance overhead. | ||
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 779385158915..17c565de3d64 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile | |||
@@ -12,7 +12,8 @@ CFLAGS_mmu.o := $(nostackp) | |||
12 | 12 | ||
13 | obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ | 13 | obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ |
14 | time.o xen-asm.o xen-asm_$(BITS).o \ | 14 | time.o xen-asm.o xen-asm_$(BITS).o \ |
15 | grant-table.o suspend.o platform-pci-unplug.o | 15 | grant-table.o suspend.o platform-pci-unplug.o \ |
16 | p2m.o | ||
16 | 17 | ||
17 | obj-$(CONFIG_SMP) += smp.o | 18 | obj-$(CONFIG_SMP) += smp.o |
18 | obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o | 19 | obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o |
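The only functional change in the Makefile is the new p2m.o object: the physical-to-machine (p2m) translation code is being moved out of mmu.c (see the large removal in the mmu.c hunks further down) into its own file, which the diffstat shows growing by 859 lines. As orientation for that code, the following is a rough userspace model, not kernel code, of the lookup it implements: a top-level array of pointers to pages of mfn entries, with one shared page of "missing" entries standing in for unpopulated ranges. All names and sizes below are illustrative.

```c
/*
 * Standalone model (not kernel code) of the pfn -> mfn lookup that the
 * removed mmu.c code performs and that the new p2m.c takes over.
 * Sizes are illustrative; the kernel indexes real page-sized arrays.
 */
#include <stdio.h>

#define ENTRIES_PER_PAGE  512          /* PAGE_SIZE / sizeof(unsigned long) on 64-bit */
#define MAX_PAGES         (4 * ENTRIES_PER_PAGE)
#define TOP_ENTRIES       (MAX_PAGES / ENTRIES_PER_PAGE)
#define INVALID_P2M_ENTRY (~0UL)

/* One shared page of "missing" entries stands in for unpopulated ranges. */
static unsigned long p2m_missing[ENTRIES_PER_PAGE] =
	{ [0 ... ENTRIES_PER_PAGE - 1] = INVALID_P2M_ENTRY };
static unsigned long *p2m_top[TOP_ENTRIES] =
	{ [0 ... TOP_ENTRIES - 1] = p2m_missing };

static unsigned long get_phys_to_machine(unsigned long pfn)
{
	if (pfn >= MAX_PAGES)
		return INVALID_P2M_ENTRY;
	return p2m_top[pfn / ENTRIES_PER_PAGE][pfn % ENTRIES_PER_PAGE];
}

int main(void)
{
	static unsigned long leaf[ENTRIES_PER_PAGE];   /* a populated p2m page */

	leaf[7] = 0x1234;              /* pretend pfn 7 maps to mfn 0x1234 */
	p2m_top[0] = leaf;

	printf("pfn 7   -> mfn %#lx\n", get_phys_to_machine(7));
	printf("pfn 600 -> %#lx (missing)\n", get_phys_to_machine(600));
	return 0;
}
```

Compile with gcc; the range designators are a GNU extension, matching the kernel's own usage in the removed p2m arrays.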
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index 1304bcec8ee5..7c0fedd98ea0 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c | |||
@@ -106,6 +106,7 @@ static const struct file_operations u32_array_fops = { | |||
106 | .open = u32_array_open, | 106 | .open = u32_array_open, |
107 | .release= xen_array_release, | 107 | .release= xen_array_release, |
108 | .read = u32_array_read, | 108 | .read = u32_array_read, |
109 | .llseek = no_llseek, | ||
109 | }; | 110 | }; |
110 | 111 | ||
111 | struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, | 112 | struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, |
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 7d46c8441418..5525163a0398 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/console.h> | 30 | #include <linux/console.h> |
31 | #include <linux/pci.h> | 31 | #include <linux/pci.h> |
32 | #include <linux/gfp.h> | 32 | #include <linux/gfp.h> |
33 | #include <linux/memblock.h> | ||
33 | 34 | ||
34 | #include <xen/xen.h> | 35 | #include <xen/xen.h> |
35 | #include <xen/interface/xen.h> | 36 | #include <xen/interface/xen.h> |
@@ -45,6 +46,7 @@ | |||
45 | #include <asm/paravirt.h> | 46 | #include <asm/paravirt.h> |
46 | #include <asm/apic.h> | 47 | #include <asm/apic.h> |
47 | #include <asm/page.h> | 48 | #include <asm/page.h> |
49 | #include <asm/xen/pci.h> | ||
48 | #include <asm/xen/hypercall.h> | 50 | #include <asm/xen/hypercall.h> |
49 | #include <asm/xen/hypervisor.h> | 51 | #include <asm/xen/hypervisor.h> |
50 | #include <asm/fixmap.h> | 52 | #include <asm/fixmap.h> |
@@ -58,7 +60,6 @@ | |||
58 | #include <asm/pgtable.h> | 60 | #include <asm/pgtable.h> |
59 | #include <asm/tlbflush.h> | 61 | #include <asm/tlbflush.h> |
60 | #include <asm/reboot.h> | 62 | #include <asm/reboot.h> |
61 | #include <asm/setup.h> | ||
62 | #include <asm/stackprotector.h> | 63 | #include <asm/stackprotector.h> |
63 | #include <asm/hypervisor.h> | 64 | #include <asm/hypervisor.h> |
64 | 65 | ||
@@ -74,6 +75,11 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); | |||
74 | enum xen_domain_type xen_domain_type = XEN_NATIVE; | 75 | enum xen_domain_type xen_domain_type = XEN_NATIVE; |
75 | EXPORT_SYMBOL_GPL(xen_domain_type); | 76 | EXPORT_SYMBOL_GPL(xen_domain_type); |
76 | 77 | ||
78 | unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; | ||
79 | EXPORT_SYMBOL(machine_to_phys_mapping); | ||
80 | unsigned int machine_to_phys_order; | ||
81 | EXPORT_SYMBOL(machine_to_phys_order); | ||
82 | |||
77 | struct start_info *xen_start_info; | 83 | struct start_info *xen_start_info; |
78 | EXPORT_SYMBOL_GPL(xen_start_info); | 84 | EXPORT_SYMBOL_GPL(xen_start_info); |
79 | 85 | ||
@@ -135,9 +141,6 @@ static void xen_vcpu_setup(int cpu) | |||
135 | info.mfn = arbitrary_virt_to_mfn(vcpup); | 141 | info.mfn = arbitrary_virt_to_mfn(vcpup); |
136 | info.offset = offset_in_page(vcpup); | 142 | info.offset = offset_in_page(vcpup); |
137 | 143 | ||
138 | printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", | ||
139 | cpu, vcpup, info.mfn, info.offset); | ||
140 | |||
141 | /* Check to see if the hypervisor will put the vcpu_info | 144 | /* Check to see if the hypervisor will put the vcpu_info |
142 | structure where we want it, which allows direct access via | 145 | structure where we want it, which allows direct access via |
143 | a percpu-variable. */ | 146 | a percpu-variable. */ |
@@ -151,9 +154,6 @@ static void xen_vcpu_setup(int cpu) | |||
151 | /* This cpu is using the registered vcpu info, even if | 154 | /* This cpu is using the registered vcpu info, even if |
152 | later ones fail to. */ | 155 | later ones fail to. */ |
153 | per_cpu(xen_vcpu, cpu) = vcpup; | 156 | per_cpu(xen_vcpu, cpu) = vcpup; |
154 | |||
155 | printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n", | ||
156 | cpu, vcpup); | ||
157 | } | 157 | } |
158 | } | 158 | } |
159 | 159 | ||
@@ -235,37 +235,31 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, | |||
235 | *dx &= maskedx; | 235 | *dx &= maskedx; |
236 | } | 236 | } |
237 | 237 | ||
238 | static __init void xen_init_cpuid_mask(void) | 238 | static void __init xen_init_cpuid_mask(void) |
239 | { | 239 | { |
240 | unsigned int ax, bx, cx, dx; | 240 | unsigned int ax, bx, cx, dx; |
241 | unsigned int xsave_mask; | ||
241 | 242 | ||
242 | cpuid_leaf1_edx_mask = | 243 | cpuid_leaf1_edx_mask = |
243 | ~((1 << X86_FEATURE_MCE) | /* disable MCE */ | 244 | ~((1 << X86_FEATURE_MCE) | /* disable MCE */ |
244 | (1 << X86_FEATURE_MCA) | /* disable MCA */ | 245 | (1 << X86_FEATURE_MCA) | /* disable MCA */ |
246 | (1 << X86_FEATURE_MTRR) | /* disable MTRR */ | ||
245 | (1 << X86_FEATURE_ACC)); /* thermal monitoring */ | 247 | (1 << X86_FEATURE_ACC)); /* thermal monitoring */ |
246 | 248 | ||
247 | if (!xen_initial_domain()) | 249 | if (!xen_initial_domain()) |
248 | cpuid_leaf1_edx_mask &= | 250 | cpuid_leaf1_edx_mask &= |
249 | ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ | 251 | ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ |
250 | (1 << X86_FEATURE_ACPI)); /* disable ACPI */ | 252 | (1 << X86_FEATURE_ACPI)); /* disable ACPI */ |
251 | |||
252 | ax = 1; | 253 | ax = 1; |
253 | cx = 0; | ||
254 | xen_cpuid(&ax, &bx, &cx, &dx); | 254 | xen_cpuid(&ax, &bx, &cx, &dx); |
255 | 255 | ||
256 | /* cpuid claims we support xsave; try enabling it to see what happens */ | 256 | xsave_mask = |
257 | if (cx & (1 << (X86_FEATURE_XSAVE % 32))) { | 257 | (1 << (X86_FEATURE_XSAVE % 32)) | |
258 | unsigned long cr4; | 258 | (1 << (X86_FEATURE_OSXSAVE % 32)); |
259 | |||
260 | set_in_cr4(X86_CR4_OSXSAVE); | ||
261 | |||
262 | cr4 = read_cr4(); | ||
263 | |||
264 | if ((cr4 & X86_CR4_OSXSAVE) == 0) | ||
265 | cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32)); | ||
266 | 259 | ||
267 | clear_in_cr4(X86_CR4_OSXSAVE); | 260 | /* Xen will set CR4.OSXSAVE if supported and not disabled by force */ |
268 | } | 261 | if ((cx & xsave_mask) != xsave_mask) |
262 | cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */ | ||
269 | } | 263 | } |
270 | 264 | ||
271 | static void xen_set_debugreg(int reg, unsigned long val) | 265 | static void xen_set_debugreg(int reg, unsigned long val) |
@@ -406,7 +400,7 @@ static void xen_load_gdt(const struct desc_ptr *dtr) | |||
406 | /* | 400 | /* |
407 | * load_gdt for early boot, when the gdt is only mapped once | 401 | * load_gdt for early boot, when the gdt is only mapped once |
408 | */ | 402 | */ |
409 | static __init void xen_load_gdt_boot(const struct desc_ptr *dtr) | 403 | static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) |
410 | { | 404 | { |
411 | unsigned long va = dtr->address; | 405 | unsigned long va = dtr->address; |
412 | unsigned int size = dtr->size + 1; | 406 | unsigned int size = dtr->size + 1; |
@@ -573,8 +567,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) | |||
573 | 567 | ||
574 | preempt_disable(); | 568 | preempt_disable(); |
575 | 569 | ||
576 | start = __get_cpu_var(idt_desc).address; | 570 | start = __this_cpu_read(idt_desc.address); |
577 | end = start + __get_cpu_var(idt_desc).size + 1; | 571 | end = start + __this_cpu_read(idt_desc.size) + 1; |
578 | 572 | ||
579 | xen_mc_flush(); | 573 | xen_mc_flush(); |
580 | 574 | ||
@@ -668,7 +662,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, | |||
668 | * Version of write_gdt_entry for use at early boot-time needed to | 662 | * Version of write_gdt_entry for use at early boot-time needed to |
669 | * update an entry as simply as possible. | 663 | * update an entry as simply as possible. |
670 | */ | 664 | */ |
671 | static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, | 665 | static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, |
672 | const void *desc, int type) | 666 | const void *desc, int type) |
673 | { | 667 | { |
674 | switch (type) { | 668 | switch (type) { |
@@ -835,6 +829,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) | |||
835 | Xen console noise. */ | 829 | Xen console noise. */ |
836 | break; | 830 | break; |
837 | 831 | ||
832 | case MSR_IA32_CR_PAT: | ||
833 | if (smp_processor_id() == 0) | ||
834 | xen_set_pat(((u64)high << 32) | low); | ||
835 | break; | ||
836 | |||
838 | default: | 837 | default: |
839 | ret = native_write_msr_safe(msr, low, high); | 838 | ret = native_write_msr_safe(msr, low, high); |
840 | } | 839 | } |
@@ -873,8 +872,6 @@ void xen_setup_vcpu_info_placement(void) | |||
873 | /* xen_vcpu_setup managed to place the vcpu_info within the | 872 | /* xen_vcpu_setup managed to place the vcpu_info within the |
874 | percpu area for all cpus, so make use of it */ | 873 | percpu area for all cpus, so make use of it */ |
875 | if (have_vcpu_info_placement) { | 874 | if (have_vcpu_info_placement) { |
876 | printk(KERN_INFO "Xen: using vcpu_info placement\n"); | ||
877 | |||
878 | pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); | 875 | pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); |
879 | pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); | 876 | pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); |
880 | pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); | 877 | pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); |
@@ -936,18 +933,18 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, | |||
936 | return ret; | 933 | return ret; |
937 | } | 934 | } |
938 | 935 | ||
939 | static const struct pv_info xen_info __initdata = { | 936 | static const struct pv_info xen_info __initconst = { |
940 | .paravirt_enabled = 1, | 937 | .paravirt_enabled = 1, |
941 | .shared_kernel_pmd = 0, | 938 | .shared_kernel_pmd = 0, |
942 | 939 | ||
943 | .name = "Xen", | 940 | .name = "Xen", |
944 | }; | 941 | }; |
945 | 942 | ||
946 | static const struct pv_init_ops xen_init_ops __initdata = { | 943 | static const struct pv_init_ops xen_init_ops __initconst = { |
947 | .patch = xen_patch, | 944 | .patch = xen_patch, |
948 | }; | 945 | }; |
949 | 946 | ||
950 | static const struct pv_cpu_ops xen_cpu_ops __initdata = { | 947 | static const struct pv_cpu_ops xen_cpu_ops __initconst = { |
951 | .cpuid = xen_cpuid, | 948 | .cpuid = xen_cpuid, |
952 | 949 | ||
953 | .set_debugreg = xen_set_debugreg, | 950 | .set_debugreg = xen_set_debugreg, |
@@ -1007,7 +1004,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { | |||
1007 | .end_context_switch = xen_end_context_switch, | 1004 | .end_context_switch = xen_end_context_switch, |
1008 | }; | 1005 | }; |
1009 | 1006 | ||
1010 | static const struct pv_apic_ops xen_apic_ops __initdata = { | 1007 | static const struct pv_apic_ops xen_apic_ops __initconst = { |
1011 | #ifdef CONFIG_X86_LOCAL_APIC | 1008 | #ifdef CONFIG_X86_LOCAL_APIC |
1012 | .startup_ipi_hook = paravirt_nop, | 1009 | .startup_ipi_hook = paravirt_nop, |
1013 | #endif | 1010 | #endif |
@@ -1017,10 +1014,6 @@ static void xen_reboot(int reason) | |||
1017 | { | 1014 | { |
1018 | struct sched_shutdown r = { .reason = reason }; | 1015 | struct sched_shutdown r = { .reason = reason }; |
1019 | 1016 | ||
1020 | #ifdef CONFIG_SMP | ||
1021 | smp_send_stop(); | ||
1022 | #endif | ||
1023 | |||
1024 | if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) | 1017 | if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) |
1025 | BUG(); | 1018 | BUG(); |
1026 | } | 1019 | } |
@@ -1040,6 +1033,13 @@ static void xen_machine_halt(void) | |||
1040 | xen_reboot(SHUTDOWN_poweroff); | 1033 | xen_reboot(SHUTDOWN_poweroff); |
1041 | } | 1034 | } |
1042 | 1035 | ||
1036 | static void xen_machine_power_off(void) | ||
1037 | { | ||
1038 | if (pm_power_off) | ||
1039 | pm_power_off(); | ||
1040 | xen_reboot(SHUTDOWN_poweroff); | ||
1041 | } | ||
1042 | |||
1043 | static void xen_crash_shutdown(struct pt_regs *regs) | 1043 | static void xen_crash_shutdown(struct pt_regs *regs) |
1044 | { | 1044 | { |
1045 | xen_reboot(SHUTDOWN_crash); | 1045 | xen_reboot(SHUTDOWN_crash); |
@@ -1062,10 +1062,10 @@ int xen_panic_handler_init(void) | |||
1062 | return 0; | 1062 | return 0; |
1063 | } | 1063 | } |
1064 | 1064 | ||
1065 | static const struct machine_ops __initdata xen_machine_ops = { | 1065 | static const struct machine_ops xen_machine_ops __initconst = { |
1066 | .restart = xen_restart, | 1066 | .restart = xen_restart, |
1067 | .halt = xen_machine_halt, | 1067 | .halt = xen_machine_halt, |
1068 | .power_off = xen_machine_halt, | 1068 | .power_off = xen_machine_power_off, |
1069 | .shutdown = xen_machine_halt, | 1069 | .shutdown = xen_machine_halt, |
1070 | .crash_shutdown = xen_crash_shutdown, | 1070 | .crash_shutdown = xen_crash_shutdown, |
1071 | .emergency_restart = xen_emergency_restart, | 1071 | .emergency_restart = xen_emergency_restart, |
@@ -1091,6 +1091,8 @@ static void __init xen_setup_stackprotector(void) | |||
1091 | /* First C function to be called on Xen boot */ | 1091 | /* First C function to be called on Xen boot */ |
1092 | asmlinkage void __init xen_start_kernel(void) | 1092 | asmlinkage void __init xen_start_kernel(void) |
1093 | { | 1093 | { |
1094 | struct physdev_set_iopl set_iopl; | ||
1095 | int rc; | ||
1094 | pgd_t *pgd; | 1096 | pgd_t *pgd; |
1095 | 1097 | ||
1096 | if (!xen_start_info) | 1098 | if (!xen_start_info) |
@@ -1098,6 +1100,8 @@ asmlinkage void __init xen_start_kernel(void) | |||
1098 | 1100 | ||
1099 | xen_domain_type = XEN_PV_DOMAIN; | 1101 | xen_domain_type = XEN_PV_DOMAIN; |
1100 | 1102 | ||
1103 | xen_setup_machphys_mapping(); | ||
1104 | |||
1101 | /* Install Xen paravirt ops */ | 1105 | /* Install Xen paravirt ops */ |
1102 | pv_info = xen_info; | 1106 | pv_info = xen_info; |
1103 | pv_init_ops = xen_init_ops; | 1107 | pv_init_ops = xen_init_ops; |
@@ -1170,6 +1174,15 @@ asmlinkage void __init xen_start_kernel(void) | |||
1170 | 1174 | ||
1171 | xen_smp_init(); | 1175 | xen_smp_init(); |
1172 | 1176 | ||
1177 | #ifdef CONFIG_ACPI_NUMA | ||
1178 | /* | ||
1179 | * The pages we get from Xen are not related to machine pages, so | ||
1180 | * any NUMA information the kernel tries to get from ACPI will | ||
1181 | * be meaningless. Prevent it from trying. | ||
1182 | */ | ||
1183 | acpi_numa = -1; | ||
1184 | #endif | ||
1185 | |||
1173 | pgd = (pgd_t *)xen_start_info->pt_base; | 1186 | pgd = (pgd_t *)xen_start_info->pt_base; |
1174 | 1187 | ||
1175 | if (!xen_initial_domain()) | 1188 | if (!xen_initial_domain()) |
@@ -1181,12 +1194,16 @@ asmlinkage void __init xen_start_kernel(void) | |||
1181 | per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; | 1194 | per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; |
1182 | 1195 | ||
1183 | local_irq_disable(); | 1196 | local_irq_disable(); |
1184 | early_boot_irqs_off(); | 1197 | early_boot_irqs_disabled = true; |
1198 | |||
1199 | memblock_init(); | ||
1185 | 1200 | ||
1186 | xen_raw_console_write("mapping kernel into physical memory\n"); | 1201 | xen_raw_console_write("mapping kernel into physical memory\n"); |
1187 | pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); | 1202 | pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); |
1203 | xen_ident_map_ISA(); | ||
1188 | 1204 | ||
1189 | init_mm.pgd = pgd; | 1205 | /* Allocate and initialize top and mid mfn levels for p2m structure */ |
1206 | xen_build_mfn_list_list(); | ||
1190 | 1207 | ||
1191 | /* keep using Xen gdt for now; no urgent need to change it */ | 1208 | /* keep using Xen gdt for now; no urgent need to change it */ |
1192 | 1209 | ||
@@ -1197,10 +1214,18 @@ asmlinkage void __init xen_start_kernel(void) | |||
1197 | #else | 1214 | #else |
1198 | pv_info.kernel_rpl = 0; | 1215 | pv_info.kernel_rpl = 0; |
1199 | #endif | 1216 | #endif |
1200 | |||
1201 | /* set the limit of our address space */ | 1217 | /* set the limit of our address space */ |
1202 | xen_reserve_top(); | 1218 | xen_reserve_top(); |
1203 | 1219 | ||
1220 | /* We used to do this in xen_arch_setup, but that is too late on AMD | ||
1221 | * where early_cpu_init (run before ->arch_setup()) calls early_amd_init | ||
1222 | * which pokes 0xcf8 port. | ||
1223 | */ | ||
1224 | set_iopl.iopl = 1; | ||
1225 | rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); | ||
1226 | if (rc != 0) | ||
1227 | xen_raw_printk("physdev_op failed %d\n", rc); | ||
1228 | |||
1204 | #ifdef CONFIG_X86_32 | 1229 | #ifdef CONFIG_X86_32 |
1205 | /* set up basic CPUID stuff */ | 1230 | /* set up basic CPUID stuff */ |
1206 | cpu_detect(&new_cpu_data); | 1231 | cpu_detect(&new_cpu_data); |
@@ -1220,6 +1245,8 @@ asmlinkage void __init xen_start_kernel(void) | |||
1220 | add_preferred_console("xenboot", 0, NULL); | 1245 | add_preferred_console("xenboot", 0, NULL); |
1221 | add_preferred_console("tty", 0, NULL); | 1246 | add_preferred_console("tty", 0, NULL); |
1222 | add_preferred_console("hvc", 0, NULL); | 1247 | add_preferred_console("hvc", 0, NULL); |
1248 | if (pci_xen) | ||
1249 | x86_init.pci.arch_init = pci_xen_init; | ||
1223 | } else { | 1250 | } else { |
1224 | /* Make sure ACS will be enabled */ | 1251 | /* Make sure ACS will be enabled */ |
1225 | pci_request_acs(); | 1252 | pci_request_acs(); |
@@ -1238,25 +1265,6 @@ asmlinkage void __init xen_start_kernel(void) | |||
1238 | #endif | 1265 | #endif |
1239 | } | 1266 | } |
1240 | 1267 | ||
1241 | static uint32_t xen_cpuid_base(void) | ||
1242 | { | ||
1243 | uint32_t base, eax, ebx, ecx, edx; | ||
1244 | char signature[13]; | ||
1245 | |||
1246 | for (base = 0x40000000; base < 0x40010000; base += 0x100) { | ||
1247 | cpuid(base, &eax, &ebx, &ecx, &edx); | ||
1248 | *(uint32_t *)(signature + 0) = ebx; | ||
1249 | *(uint32_t *)(signature + 4) = ecx; | ||
1250 | *(uint32_t *)(signature + 8) = edx; | ||
1251 | signature[12] = 0; | ||
1252 | |||
1253 | if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2)) | ||
1254 | return base; | ||
1255 | } | ||
1256 | |||
1257 | return 0; | ||
1258 | } | ||
1259 | |||
1260 | static int init_hvm_pv_info(int *major, int *minor) | 1268 | static int init_hvm_pv_info(int *major, int *minor) |
1261 | { | 1269 | { |
1262 | uint32_t eax, ebx, ecx, edx, pages, msr, base; | 1270 | uint32_t eax, ebx, ecx, edx, pages, msr, base; |
@@ -1276,15 +1284,14 @@ static int init_hvm_pv_info(int *major, int *minor) | |||
1276 | 1284 | ||
1277 | xen_setup_features(); | 1285 | xen_setup_features(); |
1278 | 1286 | ||
1279 | pv_info = xen_info; | 1287 | pv_info.name = "Xen HVM"; |
1280 | pv_info.kernel_rpl = 0; | ||
1281 | 1288 | ||
1282 | xen_domain_type = XEN_HVM_DOMAIN; | 1289 | xen_domain_type = XEN_HVM_DOMAIN; |
1283 | 1290 | ||
1284 | return 0; | 1291 | return 0; |
1285 | } | 1292 | } |
1286 | 1293 | ||
1287 | void xen_hvm_init_shared_info(void) | 1294 | void __ref xen_hvm_init_shared_info(void) |
1288 | { | 1295 | { |
1289 | int cpu; | 1296 | int cpu; |
1290 | struct xen_add_to_physmap xatp; | 1297 | struct xen_add_to_physmap xatp; |
@@ -1323,6 +1330,8 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, | |||
1323 | switch (action) { | 1330 | switch (action) { |
1324 | case CPU_UP_PREPARE: | 1331 | case CPU_UP_PREPARE: |
1325 | per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; | 1332 | per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; |
1333 | if (xen_have_vector_callback) | ||
1334 | xen_init_lock_cpu(cpu); | ||
1326 | break; | 1335 | break; |
1327 | default: | 1336 | default: |
1328 | break; | 1337 | break; |
@@ -1330,7 +1339,7 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, | |||
1330 | return NOTIFY_OK; | 1339 | return NOTIFY_OK; |
1331 | } | 1340 | } |
1332 | 1341 | ||
1333 | static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = { | 1342 | static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = { |
1334 | .notifier_call = xen_hvm_cpu_notify, | 1343 | .notifier_call = xen_hvm_cpu_notify, |
1335 | }; | 1344 | }; |
1336 | 1345 | ||
@@ -1347,6 +1356,7 @@ static void __init xen_hvm_guest_init(void) | |||
1347 | 1356 | ||
1348 | if (xen_feature(XENFEAT_hvm_callback_vector)) | 1357 | if (xen_feature(XENFEAT_hvm_callback_vector)) |
1349 | xen_have_vector_callback = 1; | 1358 | xen_have_vector_callback = 1; |
1359 | xen_hvm_smp_init(); | ||
1350 | register_cpu_notifier(&xen_hvm_cpu_notifier); | 1360 | register_cpu_notifier(&xen_hvm_cpu_notifier); |
1351 | xen_unplug_emulated_devices(); | 1361 | xen_unplug_emulated_devices(); |
1352 | have_vcpu_info_placement = 0; | 1362 | have_vcpu_info_placement = 0; |
@@ -1366,7 +1376,19 @@ static bool __init xen_hvm_platform(void) | |||
1366 | return true; | 1376 | return true; |
1367 | } | 1377 | } |
1368 | 1378 | ||
1369 | const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = { | 1379 | bool xen_hvm_need_lapic(void) |
1380 | { | ||
1381 | if (xen_pv_domain()) | ||
1382 | return false; | ||
1383 | if (!xen_hvm_domain()) | ||
1384 | return false; | ||
1385 | if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback) | ||
1386 | return false; | ||
1387 | return true; | ||
1388 | } | ||
1389 | EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); | ||
1390 | |||
1391 | const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = { | ||
1370 | .name = "Xen HVM", | 1392 | .name = "Xen HVM", |
1371 | .detect = xen_hvm_platform, | 1393 | .detect = xen_hvm_platform, |
1372 | .init_platform = xen_hvm_guest_init, | 1394 | .init_platform = xen_hvm_guest_init, |
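Among the enlighten.c changes above, the open-coded xen_cpuid_base() loop is removed: it scanned the hypervisor CPUID leaves for the "XenVMMXenVMM" signature and required at least two extra leaves. For reference while reading the surrounding HVM-detection code, here is a hedged userspace re-creation of that probe using GCC's <cpuid.h> intrinsics; it is an illustration only, not the helper the kernel relies on after this change.

```c
/*
 * Userspace re-creation (illustrative only) of the signature scan that the
 * removed xen_cpuid_base() performed: walk the hypervisor CPUID leaves in
 * 0x100 steps and look for "XenVMMXenVMM" with at least 2 extra leaves.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <cpuid.h>

static uint32_t xen_cpuid_base(void)
{
	uint32_t base, eax, ebx, ecx, edx;
	char sig[13];

	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
		__cpuid(base, eax, ebx, ecx, edx);
		memcpy(sig + 0, &ebx, 4);
		memcpy(sig + 4, &ecx, 4);
		memcpy(sig + 8, &edx, 4);
		sig[12] = '\0';

		if (!strcmp(sig, "XenVMMXenVMM") && (eax - base) >= 2)
			return base;
	}
	return 0;
}

int main(void)
{
	uint32_t base = xen_cpuid_base();

	if (base)
		printf("Xen signature found at CPUID leaf %#x\n", base);
	else
		printf("not running on Xen (or signature not exposed)\n");
	return 0;
}
```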
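The new MSR_IA32_CR_PAT case in xen_write_msr_safe() (added above) forwards the kernel's PAT programming to xen_set_pat(), and the mmu.c part of this diff only accepts the one layout it can translate, 0x0007010600070106, i.e. entries 0 to 3 set to WB, WC, UC-, UC. The small decoder below is not part of the patch; it simply unpacks such a value using the architectural PAT encodings so the table in the mmu.c comment is easier to check.

```c
/*
 * Hedged helper (not from the patch) that decodes a 64-bit IA32_PAT MSR
 * value into its eight entry types, using the architectural encodings.
 * Running it on 0x0007010600070106 reproduces the layout described in
 * the mmu.c comment: entries 0-3 are WB, WC, UC-, UC (4-7 repeat them).
 */
#include <stdio.h>
#include <stdint.h>

static const char *pat_type(uint8_t enc)
{
	switch (enc) {
	case 0x00: return "UC";
	case 0x01: return "WC";
	case 0x04: return "WT";
	case 0x05: return "WP";
	case 0x06: return "WB";
	case 0x07: return "UC-";
	default:   return "reserved";
	}
}

int main(void)
{
	uint64_t pat = 0x0007010600070106ull;   /* value xen_set_pat() expects */
	int i;

	for (i = 0; i < 8; i++)
		printf("PAT entry %d: %s\n", i, pat_type((pat >> (8 * i)) & 0xff));
	return 0;
}
```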
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 9d30105a0c4a..8bbb465b6f0a 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c | |||
@@ -113,7 +113,7 @@ static void xen_halt(void) | |||
113 | xen_safe_halt(); | 113 | xen_safe_halt(); |
114 | } | 114 | } |
115 | 115 | ||
116 | static const struct pv_irq_ops xen_irq_ops __initdata = { | 116 | static const struct pv_irq_ops xen_irq_ops __initconst = { |
117 | .save_fl = PV_CALLEE_SAVE(xen_save_fl), | 117 | .save_fl = PV_CALLEE_SAVE(xen_save_fl), |
118 | .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), | 118 | .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), |
119 | .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), | 119 | .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), |
@@ -126,7 +126,7 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { | |||
126 | #endif | 126 | #endif |
127 | }; | 127 | }; |
128 | 128 | ||
129 | void __init xen_init_irq_ops() | 129 | void __init xen_init_irq_ops(void) |
130 | { | 130 | { |
131 | pv_irq_ops = xen_irq_ops; | 131 | pv_irq_ops = xen_irq_ops; |
132 | x86_init.irqs.intr_init = xen_init_IRQ; | 132 | x86_init.irqs.intr_init = xen_init_IRQ; |
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 42086ac406af..0ccccb67a993 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c | |||
@@ -45,6 +45,8 @@ | |||
45 | #include <linux/vmalloc.h> | 45 | #include <linux/vmalloc.h> |
46 | #include <linux/module.h> | 46 | #include <linux/module.h> |
47 | #include <linux/gfp.h> | 47 | #include <linux/gfp.h> |
48 | #include <linux/memblock.h> | ||
49 | #include <linux/seq_file.h> | ||
48 | 50 | ||
49 | #include <asm/pgtable.h> | 51 | #include <asm/pgtable.h> |
50 | #include <asm/tlbflush.h> | 52 | #include <asm/tlbflush.h> |
@@ -55,6 +57,9 @@ | |||
55 | #include <asm/e820.h> | 57 | #include <asm/e820.h> |
56 | #include <asm/linkage.h> | 58 | #include <asm/linkage.h> |
57 | #include <asm/page.h> | 59 | #include <asm/page.h> |
60 | #include <asm/init.h> | ||
61 | #include <asm/pat.h> | ||
62 | #include <asm/smp.h> | ||
58 | 63 | ||
59 | #include <asm/xen/hypercall.h> | 64 | #include <asm/xen/hypercall.h> |
60 | #include <asm/xen/hypervisor.h> | 65 | #include <asm/xen/hypervisor.h> |
@@ -71,74 +76,19 @@ | |||
71 | #include "mmu.h" | 76 | #include "mmu.h" |
72 | #include "debugfs.h" | 77 | #include "debugfs.h" |
73 | 78 | ||
74 | #define MMU_UPDATE_HISTO 30 | ||
75 | |||
76 | /* | 79 | /* |
77 | * Protects atomic reservation decrease/increase against concurrent increases. | 80 | * Protects atomic reservation decrease/increase against concurrent increases. |
78 | * Also protects non-atomic updates of current_pages and driver_pages, and | 81 | * Also protects non-atomic updates of current_pages and balloon lists. |
79 | * balloon lists. | ||
80 | */ | 82 | */ |
81 | DEFINE_SPINLOCK(xen_reservation_lock); | 83 | DEFINE_SPINLOCK(xen_reservation_lock); |
82 | 84 | ||
83 | #ifdef CONFIG_XEN_DEBUG_FS | ||
84 | |||
85 | static struct { | ||
86 | u32 pgd_update; | ||
87 | u32 pgd_update_pinned; | ||
88 | u32 pgd_update_batched; | ||
89 | |||
90 | u32 pud_update; | ||
91 | u32 pud_update_pinned; | ||
92 | u32 pud_update_batched; | ||
93 | |||
94 | u32 pmd_update; | ||
95 | u32 pmd_update_pinned; | ||
96 | u32 pmd_update_batched; | ||
97 | |||
98 | u32 pte_update; | ||
99 | u32 pte_update_pinned; | ||
100 | u32 pte_update_batched; | ||
101 | |||
102 | u32 mmu_update; | ||
103 | u32 mmu_update_extended; | ||
104 | u32 mmu_update_histo[MMU_UPDATE_HISTO]; | ||
105 | |||
106 | u32 prot_commit; | ||
107 | u32 prot_commit_batched; | ||
108 | |||
109 | u32 set_pte_at; | ||
110 | u32 set_pte_at_batched; | ||
111 | u32 set_pte_at_pinned; | ||
112 | u32 set_pte_at_current; | ||
113 | u32 set_pte_at_kernel; | ||
114 | } mmu_stats; | ||
115 | |||
116 | static u8 zero_stats; | ||
117 | |||
118 | static inline void check_zero(void) | ||
119 | { | ||
120 | if (unlikely(zero_stats)) { | ||
121 | memset(&mmu_stats, 0, sizeof(mmu_stats)); | ||
122 | zero_stats = 0; | ||
123 | } | ||
124 | } | ||
125 | |||
126 | #define ADD_STATS(elem, val) \ | ||
127 | do { check_zero(); mmu_stats.elem += (val); } while(0) | ||
128 | |||
129 | #else /* !CONFIG_XEN_DEBUG_FS */ | ||
130 | |||
131 | #define ADD_STATS(elem, val) do { (void)(val); } while(0) | ||
132 | |||
133 | #endif /* CONFIG_XEN_DEBUG_FS */ | ||
134 | |||
135 | |||
136 | /* | 85 | /* |
137 | * Identity map, in addition to plain kernel map. This needs to be | 86 | * Identity map, in addition to plain kernel map. This needs to be |
138 | * large enough to allocate page table pages to allocate the rest. | 87 | * large enough to allocate page table pages to allocate the rest. |
139 | * Each page can map 2MB. | 88 | * Each page can map 2MB. |
140 | */ | 89 | */ |
141 | static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; | 90 | #define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) |
91 | static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); | ||
142 | 92 | ||
143 | #ifdef CONFIG_X86_64 | 93 | #ifdef CONFIG_X86_64 |
144 | /* l3 pud for userspace vsyscall mapping */ | 94 | /* l3 pud for userspace vsyscall mapping */ |
@@ -169,160 +119,6 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ | |||
169 | */ | 119 | */ |
170 | #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) | 120 | #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) |
171 | 121 | ||
172 | |||
173 | #define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) | ||
174 | #define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) | ||
175 | |||
176 | /* Placeholder for holes in the address space */ | ||
177 | static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data = | ||
178 | { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; | ||
179 | |||
180 | /* Array of pointers to pages containing p2m entries */ | ||
181 | static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data = | ||
182 | { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; | ||
183 | |||
184 | /* Arrays of p2m arrays expressed in mfns used for save/restore */ | ||
185 | static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss; | ||
186 | |||
187 | static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE] | ||
188 | __page_aligned_bss; | ||
189 | |||
190 | static inline unsigned p2m_top_index(unsigned long pfn) | ||
191 | { | ||
192 | BUG_ON(pfn >= MAX_DOMAIN_PAGES); | ||
193 | return pfn / P2M_ENTRIES_PER_PAGE; | ||
194 | } | ||
195 | |||
196 | static inline unsigned p2m_index(unsigned long pfn) | ||
197 | { | ||
198 | return pfn % P2M_ENTRIES_PER_PAGE; | ||
199 | } | ||
200 | |||
201 | /* Build the parallel p2m_top_mfn structures */ | ||
202 | void xen_build_mfn_list_list(void) | ||
203 | { | ||
204 | unsigned pfn, idx; | ||
205 | |||
206 | for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { | ||
207 | unsigned topidx = p2m_top_index(pfn); | ||
208 | |||
209 | p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); | ||
210 | } | ||
211 | |||
212 | for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { | ||
213 | unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; | ||
214 | p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); | ||
215 | } | ||
216 | } | ||
217 | |||
218 | void xen_setup_mfn_list_list(void) | ||
219 | { | ||
220 | BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); | ||
221 | |||
222 | HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = | ||
223 | virt_to_mfn(p2m_top_mfn_list); | ||
224 | HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages; | ||
225 | } | ||
226 | |||
227 | /* Set up p2m_top to point to the domain-builder provided p2m pages */ | ||
228 | void __init xen_build_dynamic_phys_to_machine(void) | ||
229 | { | ||
230 | unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; | ||
231 | unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); | ||
232 | unsigned pfn; | ||
233 | |||
234 | for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { | ||
235 | unsigned topidx = p2m_top_index(pfn); | ||
236 | |||
237 | p2m_top[topidx] = &mfn_list[pfn]; | ||
238 | } | ||
239 | |||
240 | xen_build_mfn_list_list(); | ||
241 | } | ||
242 | |||
243 | unsigned long get_phys_to_machine(unsigned long pfn) | ||
244 | { | ||
245 | unsigned topidx, idx; | ||
246 | |||
247 | if (unlikely(pfn >= MAX_DOMAIN_PAGES)) | ||
248 | return INVALID_P2M_ENTRY; | ||
249 | |||
250 | topidx = p2m_top_index(pfn); | ||
251 | idx = p2m_index(pfn); | ||
252 | return p2m_top[topidx][idx]; | ||
253 | } | ||
254 | EXPORT_SYMBOL_GPL(get_phys_to_machine); | ||
255 | |||
256 | /* install a new p2m_top page */ | ||
257 | bool install_p2mtop_page(unsigned long pfn, unsigned long *p) | ||
258 | { | ||
259 | unsigned topidx = p2m_top_index(pfn); | ||
260 | unsigned long **pfnp, *mfnp; | ||
261 | unsigned i; | ||
262 | |||
263 | pfnp = &p2m_top[topidx]; | ||
264 | mfnp = &p2m_top_mfn[topidx]; | ||
265 | |||
266 | for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) | ||
267 | p[i] = INVALID_P2M_ENTRY; | ||
268 | |||
269 | if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) { | ||
270 | *mfnp = virt_to_mfn(p); | ||
271 | return true; | ||
272 | } | ||
273 | |||
274 | return false; | ||
275 | } | ||
276 | |||
277 | static void alloc_p2m(unsigned long pfn) | ||
278 | { | ||
279 | unsigned long *p; | ||
280 | |||
281 | p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); | ||
282 | BUG_ON(p == NULL); | ||
283 | |||
284 | if (!install_p2mtop_page(pfn, p)) | ||
285 | free_page((unsigned long)p); | ||
286 | } | ||
287 | |||
288 | /* Try to install p2m mapping; fail if intermediate bits missing */ | ||
289 | bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) | ||
290 | { | ||
291 | unsigned topidx, idx; | ||
292 | |||
293 | if (unlikely(pfn >= MAX_DOMAIN_PAGES)) { | ||
294 | BUG_ON(mfn != INVALID_P2M_ENTRY); | ||
295 | return true; | ||
296 | } | ||
297 | |||
298 | topidx = p2m_top_index(pfn); | ||
299 | if (p2m_top[topidx] == p2m_missing) { | ||
300 | if (mfn == INVALID_P2M_ENTRY) | ||
301 | return true; | ||
302 | return false; | ||
303 | } | ||
304 | |||
305 | idx = p2m_index(pfn); | ||
306 | p2m_top[topidx][idx] = mfn; | ||
307 | |||
308 | return true; | ||
309 | } | ||
310 | |||
311 | void set_phys_to_machine(unsigned long pfn, unsigned long mfn) | ||
312 | { | ||
313 | if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { | ||
314 | BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); | ||
315 | return; | ||
316 | } | ||
317 | |||
318 | if (unlikely(!__set_phys_to_machine(pfn, mfn))) { | ||
319 | alloc_p2m(pfn); | ||
320 | |||
321 | if (!__set_phys_to_machine(pfn, mfn)) | ||
322 | BUG(); | ||
323 | } | ||
324 | } | ||
325 | |||
326 | unsigned long arbitrary_virt_to_mfn(void *vaddr) | 122 | unsigned long arbitrary_virt_to_mfn(void *vaddr) |
327 | { | 123 | { |
328 | xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); | 124 | xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); |
@@ -351,6 +147,7 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr) | |||
351 | offset = address & ~PAGE_MASK; | 147 | offset = address & ~PAGE_MASK; |
352 | return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset); | 148 | return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset); |
353 | } | 149 | } |
150 | EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine); | ||
354 | 151 | ||
355 | void make_lowmem_page_readonly(void *vaddr) | 152 | void make_lowmem_page_readonly(void *vaddr) |
356 | { | 153 | { |
@@ -359,7 +156,8 @@ void make_lowmem_page_readonly(void *vaddr) | |||
359 | unsigned int level; | 156 | unsigned int level; |
360 | 157 | ||
361 | pte = lookup_address(address, &level); | 158 | pte = lookup_address(address, &level); |
362 | BUG_ON(pte == NULL); | 159 | if (pte == NULL) |
160 | return; /* vaddr missing */ | ||
363 | 161 | ||
364 | ptev = pte_wrprotect(*pte); | 162 | ptev = pte_wrprotect(*pte); |
365 | 163 | ||
@@ -374,7 +172,8 @@ void make_lowmem_page_readwrite(void *vaddr) | |||
374 | unsigned int level; | 172 | unsigned int level; |
375 | 173 | ||
376 | pte = lookup_address(address, &level); | 174 | pte = lookup_address(address, &level); |
377 | BUG_ON(pte == NULL); | 175 | if (pte == NULL) |
176 | return; /* vaddr missing */ | ||
378 | 177 | ||
379 | ptev = pte_mkwrite(*pte); | 178 | ptev = pte_mkwrite(*pte); |
380 | 179 | ||
@@ -390,12 +189,7 @@ static bool xen_page_pinned(void *ptr) | |||
390 | return PagePinned(page); | 189 | return PagePinned(page); |
391 | } | 190 | } |
392 | 191 | ||
393 | static bool xen_iomap_pte(pte_t pte) | 192 | void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) |
394 | { | ||
395 | return pte_flags(pte) & _PAGE_IOMAP; | ||
396 | } | ||
397 | |||
398 | static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) | ||
399 | { | 193 | { |
400 | struct multicall_space mcs; | 194 | struct multicall_space mcs; |
401 | struct mmu_update *u; | 195 | struct mmu_update *u; |
@@ -404,13 +198,14 @@ static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) | |||
404 | u = mcs.args; | 198 | u = mcs.args; |
405 | 199 | ||
406 | /* ptep might be kmapped when using 32-bit HIGHPTE */ | 200 | /* ptep might be kmapped when using 32-bit HIGHPTE */ |
407 | u->ptr = arbitrary_virt_to_machine(ptep).maddr; | 201 | u->ptr = virt_to_machine(ptep).maddr; |
408 | u->val = pte_val_ma(pteval); | 202 | u->val = pte_val_ma(pteval); |
409 | 203 | ||
410 | MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO); | 204 | MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); |
411 | 205 | ||
412 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 206 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
413 | } | 207 | } |
208 | EXPORT_SYMBOL_GPL(xen_set_domain_pte); | ||
414 | 209 | ||
415 | static void xen_extend_mmu_update(const struct mmu_update *update) | 210 | static void xen_extend_mmu_update(const struct mmu_update *update) |
416 | { | 211 | { |
@@ -420,27 +215,17 @@ static void xen_extend_mmu_update(const struct mmu_update *update) | |||
420 | mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); | 215 | mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); |
421 | 216 | ||
422 | if (mcs.mc != NULL) { | 217 | if (mcs.mc != NULL) { |
423 | ADD_STATS(mmu_update_extended, 1); | ||
424 | ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1); | ||
425 | |||
426 | mcs.mc->args[1]++; | 218 | mcs.mc->args[1]++; |
427 | |||
428 | if (mcs.mc->args[1] < MMU_UPDATE_HISTO) | ||
429 | ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1); | ||
430 | else | ||
431 | ADD_STATS(mmu_update_histo[0], 1); | ||
432 | } else { | 219 | } else { |
433 | ADD_STATS(mmu_update, 1); | ||
434 | mcs = __xen_mc_entry(sizeof(*u)); | 220 | mcs = __xen_mc_entry(sizeof(*u)); |
435 | MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); | 221 | MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); |
436 | ADD_STATS(mmu_update_histo[1], 1); | ||
437 | } | 222 | } |
438 | 223 | ||
439 | u = mcs.args; | 224 | u = mcs.args; |
440 | *u = *update; | 225 | *u = *update; |
441 | } | 226 | } |
442 | 227 | ||
443 | void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) | 228 | static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) |
444 | { | 229 | { |
445 | struct mmu_update u; | 230 | struct mmu_update u; |
446 | 231 | ||
@@ -453,17 +238,13 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) | |||
453 | u.val = pmd_val_ma(val); | 238 | u.val = pmd_val_ma(val); |
454 | xen_extend_mmu_update(&u); | 239 | xen_extend_mmu_update(&u); |
455 | 240 | ||
456 | ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); | ||
457 | |||
458 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 241 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
459 | 242 | ||
460 | preempt_enable(); | 243 | preempt_enable(); |
461 | } | 244 | } |
462 | 245 | ||
463 | void xen_set_pmd(pmd_t *ptr, pmd_t val) | 246 | static void xen_set_pmd(pmd_t *ptr, pmd_t val) |
464 | { | 247 | { |
465 | ADD_STATS(pmd_update, 1); | ||
466 | |||
467 | /* If page is not pinned, we can just update the entry | 248 | /* If page is not pinned, we can just update the entry |
468 | directly */ | 249 | directly */ |
469 | if (!xen_page_pinned(ptr)) { | 250 | if (!xen_page_pinned(ptr)) { |
@@ -471,8 +252,6 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val) | |||
471 | return; | 252 | return; |
472 | } | 253 | } |
473 | 254 | ||
474 | ADD_STATS(pmd_update_pinned, 1); | ||
475 | |||
476 | xen_set_pmd_hyper(ptr, val); | 255 | xen_set_pmd_hyper(ptr, val); |
477 | } | 256 | } |
478 | 257 | ||
@@ -485,35 +264,34 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) | |||
485 | set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); | 264 | set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); |
486 | } | 265 | } |
487 | 266 | ||
488 | void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, | 267 | static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) |
489 | pte_t *ptep, pte_t pteval) | ||
490 | { | 268 | { |
491 | if (xen_iomap_pte(pteval)) { | 269 | struct mmu_update u; |
492 | xen_set_iomap_pte(ptep, pteval); | 270 | |
493 | goto out; | 271 | if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) |
494 | } | 272 | return false; |
495 | 273 | ||
496 | ADD_STATS(set_pte_at, 1); | 274 | xen_mc_batch(); |
497 | // ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); | ||
498 | ADD_STATS(set_pte_at_current, mm == current->mm); | ||
499 | ADD_STATS(set_pte_at_kernel, mm == &init_mm); | ||
500 | 275 | ||
501 | if (mm == current->mm || mm == &init_mm) { | 276 | u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; |
502 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { | 277 | u.val = pte_val_ma(pteval); |
503 | struct multicall_space mcs; | 278 | xen_extend_mmu_update(&u); |
504 | mcs = xen_mc_entry(0); | ||
505 | 279 | ||
506 | MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); | 280 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
507 | ADD_STATS(set_pte_at_batched, 1); | 281 | |
508 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 282 | return true; |
509 | goto out; | 283 | } |
510 | } else | ||
511 | if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0) | ||
512 | goto out; | ||
513 | } | ||
514 | xen_set_pte(ptep, pteval); | ||
515 | 284 | ||
516 | out: return; | 285 | static void xen_set_pte(pte_t *ptep, pte_t pteval) |
286 | { | ||
287 | if (!xen_batched_set_pte(ptep, pteval)) | ||
288 | native_set_pte(ptep, pteval); | ||
289 | } | ||
290 | |||
291 | static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, | ||
292 | pte_t *ptep, pte_t pteval) | ||
293 | { | ||
294 | xen_set_pte(ptep, pteval); | ||
517 | } | 295 | } |
518 | 296 | ||
519 | pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, | 297 | pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, |
@@ -530,13 +308,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, | |||
530 | 308 | ||
531 | xen_mc_batch(); | 309 | xen_mc_batch(); |
532 | 310 | ||
533 | u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; | 311 | u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; |
534 | u.val = pte_val_ma(pte); | 312 | u.val = pte_val_ma(pte); |
535 | xen_extend_mmu_update(&u); | 313 | xen_extend_mmu_update(&u); |
536 | 314 | ||
537 | ADD_STATS(prot_commit, 1); | ||
538 | ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); | ||
539 | |||
540 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 315 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
541 | } | 316 | } |
542 | 317 | ||
@@ -557,7 +332,34 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) | |||
557 | if (val & _PAGE_PRESENT) { | 332 | if (val & _PAGE_PRESENT) { |
558 | unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; | 333 | unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; |
559 | pteval_t flags = val & PTE_FLAGS_MASK; | 334 | pteval_t flags = val & PTE_FLAGS_MASK; |
560 | val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; | 335 | unsigned long mfn; |
336 | |||
337 | if (!xen_feature(XENFEAT_auto_translated_physmap)) | ||
338 | mfn = get_phys_to_machine(pfn); | ||
339 | else | ||
340 | mfn = pfn; | ||
341 | /* | ||
342 | * If there's no mfn for the pfn, then just create an | ||
343 | * empty non-present pte. Unfortunately this loses | ||
344 | * information about the original pfn, so | ||
345 | * pte_mfn_to_pfn is asymmetric. | ||
346 | */ | ||
347 | if (unlikely(mfn == INVALID_P2M_ENTRY)) { | ||
348 | mfn = 0; | ||
349 | flags = 0; | ||
350 | } else { | ||
351 | /* | ||
352 | * Paramount to do this test _after_ the | ||
353 | * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY & | ||
354 | * IDENTITY_FRAME_BIT resolves to true. | ||
355 | */ | ||
356 | mfn &= ~FOREIGN_FRAME_BIT; | ||
357 | if (mfn & IDENTITY_FRAME_BIT) { | ||
358 | mfn &= ~IDENTITY_FRAME_BIT; | ||
359 | flags |= _PAGE_IOMAP; | ||
360 | } | ||
361 | } | ||
362 | val = ((pteval_t)mfn << PAGE_SHIFT) | flags; | ||
561 | } | 363 | } |
562 | 364 | ||
563 | return val; | 365 | return val; |
@@ -577,25 +379,71 @@ static pteval_t iomap_pte(pteval_t val) | |||
577 | return val; | 379 | return val; |
578 | } | 380 | } |
579 | 381 | ||
580 | pteval_t xen_pte_val(pte_t pte) | 382 | static pteval_t xen_pte_val(pte_t pte) |
581 | { | 383 | { |
582 | if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP)) | 384 | pteval_t pteval = pte.pte; |
583 | return pte.pte; | ||
584 | 385 | ||
585 | return pte_mfn_to_pfn(pte.pte); | 386 | /* If this is a WC pte, convert back from Xen WC to Linux WC */ |
387 | if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) { | ||
388 | WARN_ON(!pat_enabled); | ||
389 | pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT; | ||
390 | } | ||
391 | |||
392 | if (xen_initial_domain() && (pteval & _PAGE_IOMAP)) | ||
393 | return pteval; | ||
394 | |||
395 | return pte_mfn_to_pfn(pteval); | ||
586 | } | 396 | } |
587 | PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); | 397 | PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); |
588 | 398 | ||
589 | pgdval_t xen_pgd_val(pgd_t pgd) | 399 | static pgdval_t xen_pgd_val(pgd_t pgd) |
590 | { | 400 | { |
591 | return pte_mfn_to_pfn(pgd.pgd); | 401 | return pte_mfn_to_pfn(pgd.pgd); |
592 | } | 402 | } |
593 | PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); | 403 | PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); |
594 | 404 | ||
595 | pte_t xen_make_pte(pteval_t pte) | 405 | /* |
406 | * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7 | ||
407 | * are reserved for now, to correspond to the Intel-reserved PAT | ||
408 | * types. | ||
409 | * | ||
410 | * We expect Linux's PAT set as follows: | ||
411 | * | ||
412 | * Idx PTE flags Linux Xen Default | ||
413 | * 0 WB WB WB | ||
414 | * 1 PWT WC WT WT | ||
415 | * 2 PCD UC- UC- UC- | ||
416 | * 3 PCD PWT UC UC UC | ||
417 | * 4 PAT WB WC WB | ||
418 | * 5 PAT PWT WC WP WT | ||
419 | * 6 PAT PCD UC- UC UC- | ||
420 | * 7 PAT PCD PWT UC UC UC | ||
421 | */ | ||
422 | |||
423 | void xen_set_pat(u64 pat) | ||
424 | { | ||
425 | /* We expect Linux to use a PAT setting of | ||
426 | * UC UC- WC WB (ignoring the PAT flag) */ | ||
427 | WARN_ON(pat != 0x0007010600070106ull); | ||
428 | } | ||
429 | |||
430 | static pte_t xen_make_pte(pteval_t pte) | ||
596 | { | 431 | { |
597 | phys_addr_t addr = (pte & PTE_PFN_MASK); | 432 | phys_addr_t addr = (pte & PTE_PFN_MASK); |
598 | 433 | ||
434 | /* If Linux is trying to set a WC pte, then map to the Xen WC. | ||
435 | * If _PAGE_PAT is set, then it probably means it is really | ||
436 | * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope | ||
437 | * things work out OK... | ||
438 | * | ||
439 | * (We should never see kernel mappings with _PAGE_PSE set, | ||
440 | * but we could see hugetlbfs mappings, I think.). | ||
441 | */ | ||
442 | if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) { | ||
443 | if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT) | ||
444 | pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT; | ||
445 | } | ||
446 | |||
599 | /* | 447 | /* |
600 | * Unprivileged domains are allowed to do IOMAPpings for | 448 | * Unprivileged domains are allowed to do IOMAPpings for |
601 | * PCI passthrough, but not map ISA space. The ISA | 449 | * PCI passthrough, but not map ISA space. The ISA |
@@ -614,20 +462,55 @@ pte_t xen_make_pte(pteval_t pte) | |||
614 | } | 462 | } |
615 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); | 463 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); |
616 | 464 | ||
617 | pgd_t xen_make_pgd(pgdval_t pgd) | 465 | #ifdef CONFIG_XEN_DEBUG |
466 | pte_t xen_make_pte_debug(pteval_t pte) | ||
467 | { | ||
468 | phys_addr_t addr = (pte & PTE_PFN_MASK); | ||
469 | phys_addr_t other_addr; | ||
470 | bool io_page = false; | ||
471 | pte_t _pte; | ||
472 | |||
473 | if (pte & _PAGE_IOMAP) | ||
474 | io_page = true; | ||
475 | |||
476 | _pte = xen_make_pte(pte); | ||
477 | |||
478 | if (!addr) | ||
479 | return _pte; | ||
480 | |||
481 | if (io_page && | ||
482 | (xen_initial_domain() || addr >= ISA_END_ADDRESS)) { | ||
483 | other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT; | ||
484 | WARN_ONCE(addr != other_addr, | ||
485 | "0x%lx is using VM_IO, but it is 0x%lx!\n", | ||
486 | (unsigned long)addr, (unsigned long)other_addr); | ||
487 | } else { | ||
488 | pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP; | ||
489 | other_addr = (_pte.pte & PTE_PFN_MASK); | ||
490 | WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set), | ||
491 | "0x%lx is missing VM_IO (and wasn't fixed)!\n", | ||
492 | (unsigned long)addr); | ||
493 | } | ||
494 | |||
495 | return _pte; | ||
496 | } | ||
497 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug); | ||
498 | #endif | ||
499 | |||
500 | static pgd_t xen_make_pgd(pgdval_t pgd) | ||
618 | { | 501 | { |
619 | pgd = pte_pfn_to_mfn(pgd); | 502 | pgd = pte_pfn_to_mfn(pgd); |
620 | return native_make_pgd(pgd); | 503 | return native_make_pgd(pgd); |
621 | } | 504 | } |
622 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); | 505 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); |
623 | 506 | ||
624 | pmdval_t xen_pmd_val(pmd_t pmd) | 507 | static pmdval_t xen_pmd_val(pmd_t pmd) |
625 | { | 508 | { |
626 | return pte_mfn_to_pfn(pmd.pmd); | 509 | return pte_mfn_to_pfn(pmd.pmd); |
627 | } | 510 | } |
628 | PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); | 511 | PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); |
629 | 512 | ||
630 | void xen_set_pud_hyper(pud_t *ptr, pud_t val) | 513 | static void xen_set_pud_hyper(pud_t *ptr, pud_t val) |
631 | { | 514 | { |
632 | struct mmu_update u; | 515 | struct mmu_update u; |
633 | 516 | ||
@@ -640,17 +523,13 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) | |||
640 | u.val = pud_val_ma(val); | 523 | u.val = pud_val_ma(val); |
641 | xen_extend_mmu_update(&u); | 524 | xen_extend_mmu_update(&u); |
642 | 525 | ||
643 | ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); | ||
644 | |||
645 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 526 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
646 | 527 | ||
647 | preempt_enable(); | 528 | preempt_enable(); |
648 | } | 529 | } |
649 | 530 | ||
650 | void xen_set_pud(pud_t *ptr, pud_t val) | 531 | static void xen_set_pud(pud_t *ptr, pud_t val) |
651 | { | 532 | { |
652 | ADD_STATS(pud_update, 1); | ||
653 | |||
654 | /* If page is not pinned, we can just update the entry | 533 | /* If page is not pinned, we can just update the entry |
655 | directly */ | 534 | directly */ |
656 | if (!xen_page_pinned(ptr)) { | 535 | if (!xen_page_pinned(ptr)) { |
@@ -658,56 +537,28 @@ void xen_set_pud(pud_t *ptr, pud_t val) | |||
658 | return; | 537 | return; |
659 | } | 538 | } |
660 | 539 | ||
661 | ADD_STATS(pud_update_pinned, 1); | ||
662 | |||
663 | xen_set_pud_hyper(ptr, val); | 540 | xen_set_pud_hyper(ptr, val); |
664 | } | 541 | } |
665 | 542 | ||
666 | void xen_set_pte(pte_t *ptep, pte_t pte) | ||
667 | { | ||
668 | if (xen_iomap_pte(pte)) { | ||
669 | xen_set_iomap_pte(ptep, pte); | ||
670 | return; | ||
671 | } | ||
672 | |||
673 | ADD_STATS(pte_update, 1); | ||
674 | // ADD_STATS(pte_update_pinned, xen_page_pinned(ptep)); | ||
675 | ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); | ||
676 | |||
677 | #ifdef CONFIG_X86_PAE | 543 | #ifdef CONFIG_X86_PAE |
678 | ptep->pte_high = pte.pte_high; | 544 | static void xen_set_pte_atomic(pte_t *ptep, pte_t pte) |
679 | smp_wmb(); | ||
680 | ptep->pte_low = pte.pte_low; | ||
681 | #else | ||
682 | *ptep = pte; | ||
683 | #endif | ||
684 | } | ||
685 | |||
686 | #ifdef CONFIG_X86_PAE | ||
687 | void xen_set_pte_atomic(pte_t *ptep, pte_t pte) | ||
688 | { | 545 | { |
689 | if (xen_iomap_pte(pte)) { | ||
690 | xen_set_iomap_pte(ptep, pte); | ||
691 | return; | ||
692 | } | ||
693 | |||
694 | set_64bit((u64 *)ptep, native_pte_val(pte)); | 546 | set_64bit((u64 *)ptep, native_pte_val(pte)); |
695 | } | 547 | } |
696 | 548 | ||
697 | void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | 549 | static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) |
698 | { | 550 | { |
699 | ptep->pte_low = 0; | 551 | if (!xen_batched_set_pte(ptep, native_make_pte(0))) |
700 | smp_wmb(); /* make sure low gets written first */ | 552 | native_pte_clear(mm, addr, ptep); |
701 | ptep->pte_high = 0; | ||
702 | } | 553 | } |
703 | 554 | ||
704 | void xen_pmd_clear(pmd_t *pmdp) | 555 | static void xen_pmd_clear(pmd_t *pmdp) |
705 | { | 556 | { |
706 | set_pmd(pmdp, __pmd(0)); | 557 | set_pmd(pmdp, __pmd(0)); |
707 | } | 558 | } |
708 | #endif /* CONFIG_X86_PAE */ | 559 | #endif /* CONFIG_X86_PAE */ |
709 | 560 | ||
710 | pmd_t xen_make_pmd(pmdval_t pmd) | 561 | static pmd_t xen_make_pmd(pmdval_t pmd) |
711 | { | 562 | { |
712 | pmd = pte_pfn_to_mfn(pmd); | 563 | pmd = pte_pfn_to_mfn(pmd); |
713 | return native_make_pmd(pmd); | 564 | return native_make_pmd(pmd); |
@@ -715,13 +566,13 @@ pmd_t xen_make_pmd(pmdval_t pmd) | |||
715 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); | 566 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); |
716 | 567 | ||
717 | #if PAGETABLE_LEVELS == 4 | 568 | #if PAGETABLE_LEVELS == 4 |
718 | pudval_t xen_pud_val(pud_t pud) | 569 | static pudval_t xen_pud_val(pud_t pud) |
719 | { | 570 | { |
720 | return pte_mfn_to_pfn(pud.pud); | 571 | return pte_mfn_to_pfn(pud.pud); |
721 | } | 572 | } |
722 | PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); | 573 | PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); |
723 | 574 | ||
724 | pud_t xen_make_pud(pudval_t pud) | 575 | static pud_t xen_make_pud(pudval_t pud) |
725 | { | 576 | { |
726 | pud = pte_pfn_to_mfn(pud); | 577 | pud = pte_pfn_to_mfn(pud); |
727 | 578 | ||
@@ -729,7 +580,7 @@ pud_t xen_make_pud(pudval_t pud) | |||
729 | } | 580 | } |
730 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); | 581 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); |
731 | 582 | ||
732 | pgd_t *xen_get_user_pgd(pgd_t *pgd) | 583 | static pgd_t *xen_get_user_pgd(pgd_t *pgd) |
733 | { | 584 | { |
734 | pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); | 585 | pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); |
735 | unsigned offset = pgd - pgd_page; | 586 | unsigned offset = pgd - pgd_page; |
@@ -761,7 +612,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) | |||
761 | * 2. It is always pinned | 612 | * 2. It is always pinned |
762 | * 3. It has no user pagetable attached to it | 613 | * 3. It has no user pagetable attached to it |
763 | */ | 614 | */ |
764 | void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) | 615 | static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) |
765 | { | 616 | { |
766 | preempt_disable(); | 617 | preempt_disable(); |
767 | 618 | ||
@@ -774,12 +625,10 @@ void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) | |||
774 | preempt_enable(); | 625 | preempt_enable(); |
775 | } | 626 | } |
776 | 627 | ||
777 | void xen_set_pgd(pgd_t *ptr, pgd_t val) | 628 | static void xen_set_pgd(pgd_t *ptr, pgd_t val) |
778 | { | 629 | { |
779 | pgd_t *user_ptr = xen_get_user_pgd(ptr); | 630 | pgd_t *user_ptr = xen_get_user_pgd(ptr); |
780 | 631 | ||
781 | ADD_STATS(pgd_update, 1); | ||
782 | |||
783 | /* If page is not pinned, we can just update the entry | 632 | /* If page is not pinned, we can just update the entry |
784 | directly */ | 633 | directly */ |
785 | if (!xen_page_pinned(ptr)) { | 634 | if (!xen_page_pinned(ptr)) { |
@@ -791,9 +640,6 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) | |||
791 | return; | 640 | return; |
792 | } | 641 | } |
793 | 642 | ||
794 | ADD_STATS(pgd_update_pinned, 1); | ||
795 | ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); | ||
796 | |||
797 | /* If it's pinned, then we can at least batch the kernel and | 643 | /* If it's pinned, then we can at least batch the kernel and |
798 | user updates together. */ | 644 | user updates together. */ |
799 | xen_mc_batch(); | 645 | xen_mc_batch(); |
@@ -1068,10 +914,9 @@ static void xen_pgd_pin(struct mm_struct *mm) | |||
1068 | */ | 914 | */ |
1069 | void xen_mm_pin_all(void) | 915 | void xen_mm_pin_all(void) |
1070 | { | 916 | { |
1071 | unsigned long flags; | ||
1072 | struct page *page; | 917 | struct page *page; |
1073 | 918 | ||
1074 | spin_lock_irqsave(&pgd_lock, flags); | 919 | spin_lock(&pgd_lock); |
1075 | 920 | ||
1076 | list_for_each_entry(page, &pgd_list, lru) { | 921 | list_for_each_entry(page, &pgd_list, lru) { |
1077 | if (!PagePinned(page)) { | 922 | if (!PagePinned(page)) { |
@@ -1080,7 +925,7 @@ void xen_mm_pin_all(void) | |||
1080 | } | 925 | } |
1081 | } | 926 | } |
1082 | 927 | ||
1083 | spin_unlock_irqrestore(&pgd_lock, flags); | 928 | spin_unlock(&pgd_lock); |
1084 | } | 929 | } |
1085 | 930 | ||
1086 | /* | 931 | /* |
@@ -1088,7 +933,7 @@ void xen_mm_pin_all(void) | |||
1088 | * that's before we have page structures to store the bits. So do all | 933 | * that's before we have page structures to store the bits. So do all |
1089 | * the book-keeping now. | 934 | * the book-keeping now. |
1090 | */ | 935 | */ |
1091 | static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page, | 936 | static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, |
1092 | enum pt_level level) | 937 | enum pt_level level) |
1093 | { | 938 | { |
1094 | SetPagePinned(page); | 939 | SetPagePinned(page); |
@@ -1181,10 +1026,9 @@ static void xen_pgd_unpin(struct mm_struct *mm) | |||
1181 | */ | 1026 | */ |
1182 | void xen_mm_unpin_all(void) | 1027 | void xen_mm_unpin_all(void) |
1183 | { | 1028 | { |
1184 | unsigned long flags; | ||
1185 | struct page *page; | 1029 | struct page *page; |
1186 | 1030 | ||
1187 | spin_lock_irqsave(&pgd_lock, flags); | 1031 | spin_lock(&pgd_lock); |
1188 | 1032 | ||
1189 | list_for_each_entry(page, &pgd_list, lru) { | 1033 | list_for_each_entry(page, &pgd_list, lru) { |
1190 | if (PageSavePinned(page)) { | 1034 | if (PageSavePinned(page)) { |
@@ -1194,17 +1038,17 @@ void xen_mm_unpin_all(void) | |||
1194 | } | 1038 | } |
1195 | } | 1039 | } |
1196 | 1040 | ||
1197 | spin_unlock_irqrestore(&pgd_lock, flags); | 1041 | spin_unlock(&pgd_lock); |
1198 | } | 1042 | } |
1199 | 1043 | ||
1200 | void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) | 1044 | static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) |
1201 | { | 1045 | { |
1202 | spin_lock(&next->page_table_lock); | 1046 | spin_lock(&next->page_table_lock); |
1203 | xen_pgd_pin(next); | 1047 | xen_pgd_pin(next); |
1204 | spin_unlock(&next->page_table_lock); | 1048 | spin_unlock(&next->page_table_lock); |
1205 | } | 1049 | } |
1206 | 1050 | ||
1207 | void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) | 1051 | static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) |
1208 | { | 1052 | { |
1209 | spin_lock(&mm->page_table_lock); | 1053 | spin_lock(&mm->page_table_lock); |
1210 | xen_pgd_pin(mm); | 1054 | xen_pgd_pin(mm); |
@@ -1222,7 +1066,7 @@ static void drop_other_mm_ref(void *info) | |||
1222 | 1066 | ||
1223 | active_mm = percpu_read(cpu_tlbstate.active_mm); | 1067 | active_mm = percpu_read(cpu_tlbstate.active_mm); |
1224 | 1068 | ||
1225 | if (active_mm == mm) | 1069 | if (active_mm == mm && percpu_read(cpu_tlbstate.state) != TLBSTATE_OK) |
1226 | leave_mm(smp_processor_id()); | 1070 | leave_mm(smp_processor_id()); |
1227 | 1071 | ||
1228 | /* If this cpu still has a stale cr3 reference, then make sure | 1072 | /* If this cpu still has a stale cr3 reference, then make sure |
@@ -1291,7 +1135,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm) | |||
1291 | * pagetable because of lazy tlb flushing. This means we need to | 1135 | * pagetable because of lazy tlb flushing. This means we need to |
1292 | * switch all CPUs off this pagetable before we can unpin it. | 1136 | * switch all CPUs off this pagetable before we can unpin it. |
1293 | */ | 1137 | */ |
1294 | void xen_exit_mmap(struct mm_struct *mm) | 1138 | static void xen_exit_mmap(struct mm_struct *mm) |
1295 | { | 1139 | { |
1296 | get_cpu(); /* make sure we don't move around */ | 1140 | get_cpu(); /* make sure we don't move around */ |
1297 | xen_drop_mm_ref(mm); | 1141 | xen_drop_mm_ref(mm); |
@@ -1306,13 +1150,27 @@ void xen_exit_mmap(struct mm_struct *mm) | |||
1306 | spin_unlock(&mm->page_table_lock); | 1150 | spin_unlock(&mm->page_table_lock); |
1307 | } | 1151 | } |
1308 | 1152 | ||
1309 | static __init void xen_pagetable_setup_start(pgd_t *base) | 1153 | static void __init xen_pagetable_setup_start(pgd_t *base) |
1310 | { | 1154 | { |
1311 | } | 1155 | } |
1312 | 1156 | ||
1157 | static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) | ||
1158 | { | ||
1159 | /* reserve the range used */ | ||
1160 | native_pagetable_reserve(start, end); | ||
1161 | |||
1162 | /* set as RW the rest */ | ||
1163 | printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end, | ||
1164 | PFN_PHYS(pgt_buf_top)); | ||
1165 | while (end < PFN_PHYS(pgt_buf_top)) { | ||
1166 | make_lowmem_page_readwrite(__va(end)); | ||
1167 | end += PAGE_SIZE; | ||
1168 | } | ||
1169 | } | ||
1170 | |||
1313 | static void xen_post_allocator_init(void); | 1171 | static void xen_post_allocator_init(void); |
1314 | 1172 | ||
1315 | static __init void xen_pagetable_setup_done(pgd_t *base) | 1173 | static void __init xen_pagetable_setup_done(pgd_t *base) |
1316 | { | 1174 | { |
1317 | xen_setup_shared_info(); | 1175 | xen_setup_shared_info(); |
1318 | xen_post_allocator_init(); | 1176 | xen_post_allocator_init(); |
@@ -1374,7 +1232,11 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, | |||
1374 | { | 1232 | { |
1375 | struct { | 1233 | struct { |
1376 | struct mmuext_op op; | 1234 | struct mmuext_op op; |
1235 | #ifdef CONFIG_SMP | ||
1236 | DECLARE_BITMAP(mask, num_processors); | ||
1237 | #else | ||
1377 | DECLARE_BITMAP(mask, NR_CPUS); | 1238 | DECLARE_BITMAP(mask, NR_CPUS); |
1239 | #endif | ||
1378 | } *args; | 1240 | } *args; |
1379 | struct multicall_space mcs; | 1241 | struct multicall_space mcs; |
1380 | 1242 | ||
@@ -1509,7 +1371,7 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) | |||
1509 | } | 1371 | } |
1510 | 1372 | ||
1511 | #ifdef CONFIG_X86_32 | 1373 | #ifdef CONFIG_X86_32 |
1512 | static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) | 1374 | static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) |
1513 | { | 1375 | { |
1514 | /* If there's an existing pte, then don't allow _PAGE_RW to be set */ | 1376 | /* If there's an existing pte, then don't allow _PAGE_RW to be set */ |
1515 | if (pte_val_ma(*ptep) & _PAGE_PRESENT) | 1377 | if (pte_val_ma(*ptep) & _PAGE_PRESENT) |
@@ -1518,16 +1380,34 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) | |||
1518 | 1380 | ||
1519 | return pte; | 1381 | return pte; |
1520 | } | 1382 | } |
1383 | #else /* CONFIG_X86_64 */ | ||
1384 | static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) | ||
1385 | { | ||
1386 | unsigned long pfn = pte_pfn(pte); | ||
1387 | |||
1388 | /* | ||
1389 | * If the new pfn is within the range of the newly allocated | ||
1390 | * kernel pagetable, and it isn't being mapped into an | ||
1391 | * early_ioremap fixmap slot as a freshly allocated page, make sure | ||
1392 | * it is RO. | ||
1393 | */ | ||
1394 | if (((!is_early_ioremap_ptep(ptep) && | ||
1395 | pfn >= pgt_buf_start && pfn < pgt_buf_top)) || | ||
1396 | (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1))) | ||
1397 | pte = pte_wrprotect(pte); | ||
1398 | |||
1399 | return pte; | ||
1400 | } | ||
1401 | #endif /* CONFIG_X86_64 */ | ||
1521 | 1402 | ||
1522 | /* Init-time set_pte while constructing initial pagetables, which | 1403 | /* Init-time set_pte while constructing initial pagetables, which |
1523 | doesn't allow RO pagetable pages to be remapped RW */ | 1404 | doesn't allow RO pagetable pages to be remapped RW */ |
1524 | static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) | 1405 | static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) |
1525 | { | 1406 | { |
1526 | pte = mask_rw_pte(ptep, pte); | 1407 | pte = mask_rw_pte(ptep, pte); |
1527 | 1408 | ||
1528 | xen_set_pte(ptep, pte); | 1409 | xen_set_pte(ptep, pte); |
1529 | } | 1410 | } |
1530 | #endif | ||
1531 | 1411 | ||
1532 | static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) | 1412 | static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) |
1533 | { | 1413 | { |
@@ -1540,7 +1420,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) | |||
1540 | 1420 | ||
1541 | /* Early in boot, while setting up the initial pagetable, assume | 1421 | /* Early in boot, while setting up the initial pagetable, assume |
1542 | everything is pinned. */ | 1422 | everything is pinned. */ |
1543 | static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) | 1423 | static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) |
1544 | { | 1424 | { |
1545 | #ifdef CONFIG_FLATMEM | 1425 | #ifdef CONFIG_FLATMEM |
1546 | BUG_ON(mem_map); /* should only be used early */ | 1426 | BUG_ON(mem_map); /* should only be used early */ |
@@ -1550,7 +1430,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) | |||
1550 | } | 1430 | } |
1551 | 1431 | ||
1552 | /* Used for pmd and pud */ | 1432 | /* Used for pmd and pud */ |
1553 | static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) | 1433 | static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) |
1554 | { | 1434 | { |
1555 | #ifdef CONFIG_FLATMEM | 1435 | #ifdef CONFIG_FLATMEM |
1556 | BUG_ON(mem_map); /* should only be used early */ | 1436 | BUG_ON(mem_map); /* should only be used early */ |
@@ -1560,13 +1440,13 @@ static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) | |||
1560 | 1440 | ||
1561 | /* Early release_pte assumes that all pts are pinned, since there's | 1441 | /* Early release_pte assumes that all pts are pinned, since there's |
1562 | only init_mm and anything attached to that is pinned. */ | 1442 | only init_mm and anything attached to that is pinned. */ |
1563 | static __init void xen_release_pte_init(unsigned long pfn) | 1443 | static void __init xen_release_pte_init(unsigned long pfn) |
1564 | { | 1444 | { |
1565 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); | 1445 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); |
1566 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); | 1446 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); |
1567 | } | 1447 | } |
1568 | 1448 | ||
1569 | static __init void xen_release_pmd_init(unsigned long pfn) | 1449 | static void __init xen_release_pmd_init(unsigned long pfn) |
1570 | { | 1450 | { |
1571 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); | 1451 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); |
1572 | } | 1452 | } |
@@ -1682,6 +1562,7 @@ static void *m2v(phys_addr_t maddr) | |||
1682 | return __ka(m2p(maddr)); | 1562 | return __ka(m2p(maddr)); |
1683 | } | 1563 | } |
1684 | 1564 | ||
1565 | /* Set the page permissions on identity-mapped pages */ | ||
1685 | static void set_page_prot(void *addr, pgprot_t prot) | 1566 | static void set_page_prot(void *addr, pgprot_t prot) |
1686 | { | 1567 | { |
1687 | unsigned long pfn = __pa(addr) >> PAGE_SHIFT; | 1568 | unsigned long pfn = __pa(addr) >> PAGE_SHIFT; |
@@ -1691,12 +1572,15 @@ static void set_page_prot(void *addr, pgprot_t prot) | |||
1691 | BUG(); | 1572 | BUG(); |
1692 | } | 1573 | } |
1693 | 1574 | ||
1694 | static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) | 1575 | static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) |
1695 | { | 1576 | { |
1696 | unsigned pmdidx, pteidx; | 1577 | unsigned pmdidx, pteidx; |
1697 | unsigned ident_pte; | 1578 | unsigned ident_pte; |
1698 | unsigned long pfn; | 1579 | unsigned long pfn; |
1699 | 1580 | ||
1581 | level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES, | ||
1582 | PAGE_SIZE); | ||
1583 | |||
1700 | ident_pte = 0; | 1584 | ident_pte = 0; |
1701 | pfn = 0; | 1585 | pfn = 0; |
1702 | for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { | 1586 | for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { |
@@ -1707,7 +1591,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) | |||
1707 | pte_page = m2v(pmd[pmdidx].pmd); | 1591 | pte_page = m2v(pmd[pmdidx].pmd); |
1708 | else { | 1592 | else { |
1709 | /* Check for free pte pages */ | 1593 | /* Check for free pte pages */ |
1710 | if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) | 1594 | if (ident_pte == LEVEL1_IDENT_ENTRIES) |
1711 | break; | 1595 | break; |
1712 | 1596 | ||
1713 | pte_page = &level1_ident_pgt[ident_pte]; | 1597 | pte_page = &level1_ident_pgt[ident_pte]; |
@@ -1720,8 +1604,10 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) | |||
1720 | for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { | 1604 | for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { |
1721 | pte_t pte; | 1605 | pte_t pte; |
1722 | 1606 | ||
1607 | #ifdef CONFIG_X86_32 | ||
1723 | if (pfn > max_pfn_mapped) | 1608 | if (pfn > max_pfn_mapped) |
1724 | max_pfn_mapped = pfn; | 1609 | max_pfn_mapped = pfn; |
1610 | #endif | ||
1725 | 1611 | ||
1726 | if (!pte_none(pte_page[pteidx])) | 1612 | if (!pte_none(pte_page[pteidx])) |
1727 | continue; | 1613 | continue; |
@@ -1737,6 +1623,20 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) | |||
1737 | set_page_prot(pmd, PAGE_KERNEL_RO); | 1623 | set_page_prot(pmd, PAGE_KERNEL_RO); |
1738 | } | 1624 | } |
1739 | 1625 | ||
1626 | void __init xen_setup_machphys_mapping(void) | ||
1627 | { | ||
1628 | struct xen_machphys_mapping mapping; | ||
1629 | unsigned long machine_to_phys_nr_ents; | ||
1630 | |||
1631 | if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { | ||
1632 | machine_to_phys_mapping = (unsigned long *)mapping.v_start; | ||
1633 | machine_to_phys_nr_ents = mapping.max_mfn + 1; | ||
1634 | } else { | ||
1635 | machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES; | ||
1636 | } | ||
1637 | machine_to_phys_order = fls(machine_to_phys_nr_ents - 1); | ||
1638 | } | ||
1639 | |||
1740 | #ifdef CONFIG_X86_64 | 1640 | #ifdef CONFIG_X86_64 |
1741 | static void convert_pfn_mfn(void *v) | 1641 | static void convert_pfn_mfn(void *v) |
1742 | { | 1642 | { |
@@ -1750,7 +1650,7 @@ static void convert_pfn_mfn(void *v) | |||
1750 | } | 1650 | } |
1751 | 1651 | ||
1752 | /* | 1652 | /* |
1753 | * Set up the inital kernel pagetable. | 1653 | * Set up the initial kernel pagetable. |
1754 | * | 1654 | * |
1755 | * We can construct this by grafting the Xen provided pagetable into | 1655 | * We can construct this by grafting the Xen provided pagetable into |
1756 | * head_64.S's preconstructed pagetables. We copy the Xen L2's into | 1656 | * head_64.S's preconstructed pagetables. We copy the Xen L2's into |
@@ -1760,12 +1660,18 @@ static void convert_pfn_mfn(void *v) | |||
1760 | * of the physical mapping once some sort of allocator has been set | 1660 | * of the physical mapping once some sort of allocator has been set |
1761 | * up. | 1661 | * up. |
1762 | */ | 1662 | */ |
1763 | __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, | 1663 | pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, |
1764 | unsigned long max_pfn) | 1664 | unsigned long max_pfn) |
1765 | { | 1665 | { |
1766 | pud_t *l3; | 1666 | pud_t *l3; |
1767 | pmd_t *l2; | 1667 | pmd_t *l2; |
1768 | 1668 | ||
1669 | /* max_pfn_mapped is the last pfn mapped in the initial memory | ||
1670 | * mappings. Considering that on Xen after the kernel mappings we | ||
1671 | * have the mappings of some pages that don't exist in pfn space, we | ||
1672 | * set max_pfn_mapped to the last real pfn mapped. */ | ||
1673 | max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); | ||
1674 | |||
1769 | /* Zap identity mapping */ | 1675 | /* Zap identity mapping */ |
1770 | init_level4_pgt[0] = __pgd(0); | 1676 | init_level4_pgt[0] = __pgd(0); |
1771 | 1677 | ||
@@ -1814,7 +1720,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, | |||
1814 | __xen_write_cr3(true, __pa(pgd)); | 1720 | __xen_write_cr3(true, __pa(pgd)); |
1815 | xen_mc_issue(PARAVIRT_LAZY_CPU); | 1721 | xen_mc_issue(PARAVIRT_LAZY_CPU); |
1816 | 1722 | ||
1817 | reserve_early(__pa(xen_start_info->pt_base), | 1723 | memblock_x86_reserve_range(__pa(xen_start_info->pt_base), |
1818 | __pa(xen_start_info->pt_base + | 1724 | __pa(xen_start_info->pt_base + |
1819 | xen_start_info->nr_pt_frames * PAGE_SIZE), | 1725 | xen_start_info->nr_pt_frames * PAGE_SIZE), |
1820 | "XEN PAGETABLES"); | 1726 | "XEN PAGETABLES"); |
@@ -1822,45 +1728,88 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, | |||
1822 | return pgd; | 1728 | return pgd; |
1823 | } | 1729 | } |
1824 | #else /* !CONFIG_X86_64 */ | 1730 | #else /* !CONFIG_X86_64 */ |
1825 | static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; | 1731 | static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); |
1732 | static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); | ||
1733 | |||
1734 | static void __init xen_write_cr3_init(unsigned long cr3) | ||
1735 | { | ||
1736 | unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); | ||
1737 | |||
1738 | BUG_ON(read_cr3() != __pa(initial_page_table)); | ||
1739 | BUG_ON(cr3 != __pa(swapper_pg_dir)); | ||
1740 | |||
1741 | /* | ||
1742 | * We are switching to swapper_pg_dir for the first time (from | ||
1743 | * initial_page_table) and therefore need to mark that page | ||
1744 | * read-only and then pin it. | ||
1745 | * | ||
1746 | * Xen disallows sharing of kernel PMDs for PAE | ||
1747 | * guests. Therefore we must copy the kernel PMD from | ||
1748 | * initial_page_table into a new kernel PMD to be used in | ||
1749 | * swapper_pg_dir. | ||
1750 | */ | ||
1751 | swapper_kernel_pmd = | ||
1752 | extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); | ||
1753 | memcpy(swapper_kernel_pmd, initial_kernel_pmd, | ||
1754 | sizeof(pmd_t) * PTRS_PER_PMD); | ||
1755 | swapper_pg_dir[KERNEL_PGD_BOUNDARY] = | ||
1756 | __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); | ||
1757 | set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); | ||
1758 | |||
1759 | set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); | ||
1760 | xen_write_cr3(cr3); | ||
1761 | pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn); | ||
1762 | |||
1763 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, | ||
1764 | PFN_DOWN(__pa(initial_page_table))); | ||
1765 | set_page_prot(initial_page_table, PAGE_KERNEL); | ||
1766 | set_page_prot(initial_kernel_pmd, PAGE_KERNEL); | ||
1826 | 1767 | ||
1827 | __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, | 1768 | pv_mmu_ops.write_cr3 = &xen_write_cr3; |
1769 | } | ||
1770 | |||
1771 | pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, | ||
1828 | unsigned long max_pfn) | 1772 | unsigned long max_pfn) |
1829 | { | 1773 | { |
1830 | pmd_t *kernel_pmd; | 1774 | pmd_t *kernel_pmd; |
1831 | 1775 | ||
1776 | initial_kernel_pmd = | ||
1777 | extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); | ||
1778 | |||
1832 | max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + | 1779 | max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + |
1833 | xen_start_info->nr_pt_frames * PAGE_SIZE + | 1780 | xen_start_info->nr_pt_frames * PAGE_SIZE + |
1834 | 512*1024); | 1781 | 512*1024); |
1835 | 1782 | ||
1836 | kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); | 1783 | kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); |
1837 | memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); | 1784 | memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); |
1838 | 1785 | ||
1839 | xen_map_identity_early(level2_kernel_pgt, max_pfn); | 1786 | xen_map_identity_early(initial_kernel_pmd, max_pfn); |
1840 | 1787 | ||
1841 | memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); | 1788 | memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD); |
1842 | set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], | 1789 | initial_page_table[KERNEL_PGD_BOUNDARY] = |
1843 | __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); | 1790 | __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); |
1844 | 1791 | ||
1845 | set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); | 1792 | set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO); |
1846 | set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); | 1793 | set_page_prot(initial_page_table, PAGE_KERNEL_RO); |
1847 | set_page_prot(empty_zero_page, PAGE_KERNEL_RO); | 1794 | set_page_prot(empty_zero_page, PAGE_KERNEL_RO); |
1848 | 1795 | ||
1849 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); | 1796 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); |
1850 | 1797 | ||
1851 | xen_write_cr3(__pa(swapper_pg_dir)); | 1798 | pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, |
1799 | PFN_DOWN(__pa(initial_page_table))); | ||
1800 | xen_write_cr3(__pa(initial_page_table)); | ||
1852 | 1801 | ||
1853 | pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); | 1802 | memblock_x86_reserve_range(__pa(xen_start_info->pt_base), |
1854 | |||
1855 | reserve_early(__pa(xen_start_info->pt_base), | ||
1856 | __pa(xen_start_info->pt_base + | 1803 | __pa(xen_start_info->pt_base + |
1857 | xen_start_info->nr_pt_frames * PAGE_SIZE), | 1804 | xen_start_info->nr_pt_frames * PAGE_SIZE), |
1858 | "XEN PAGETABLES"); | 1805 | "XEN PAGETABLES"); |
1859 | 1806 | ||
1860 | return swapper_pg_dir; | 1807 | return initial_page_table; |
1861 | } | 1808 | } |
1862 | #endif /* CONFIG_X86_64 */ | 1809 | #endif /* CONFIG_X86_64 */ |
1863 | 1810 | ||
1811 | static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; | ||
1812 | |||
1864 | static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) | 1813 | static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) |
1865 | { | 1814 | { |
1866 | pte_t pte; | 1815 | pte_t pte; |
@@ -1881,15 +1830,28 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) | |||
1881 | #else | 1830 | #else |
1882 | case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: | 1831 | case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: |
1883 | #endif | 1832 | #endif |
1884 | #ifdef CONFIG_X86_LOCAL_APIC | ||
1885 | case FIX_APIC_BASE: /* maps dummy local APIC */ | ||
1886 | #endif | ||
1887 | case FIX_TEXT_POKE0: | 1833 | case FIX_TEXT_POKE0: |
1888 | case FIX_TEXT_POKE1: | 1834 | case FIX_TEXT_POKE1: |
1889 | /* All local page mappings */ | 1835 | /* All local page mappings */ |
1890 | pte = pfn_pte(phys, prot); | 1836 | pte = pfn_pte(phys, prot); |
1891 | break; | 1837 | break; |
1892 | 1838 | ||
1839 | #ifdef CONFIG_X86_LOCAL_APIC | ||
1840 | case FIX_APIC_BASE: /* maps dummy local APIC */ | ||
1841 | pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); | ||
1842 | break; | ||
1843 | #endif | ||
1844 | |||
1845 | #ifdef CONFIG_X86_IO_APIC | ||
1846 | case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END: | ||
1847 | /* | ||
1848 | * We just don't map the IO APIC - all access is via | ||
1849 | * hypercalls. Keep the address in the pte for reference. | ||
1850 | */ | ||
1851 | pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); | ||
1852 | break; | ||
1853 | #endif | ||
1854 | |||
1893 | case FIX_PARAVIRT_BOOTMAP: | 1855 | case FIX_PARAVIRT_BOOTMAP: |
1894 | /* This is an MFN, but it isn't an IO mapping from the | 1856 | /* This is an MFN, but it isn't an IO mapping from the |
1895 | IO domain */ | 1857 | IO domain */ |
@@ -1914,8 +1876,34 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) | |||
1914 | #endif | 1876 | #endif |
1915 | } | 1877 | } |
1916 | 1878 | ||
1917 | static __init void xen_post_allocator_init(void) | 1879 | void __init xen_ident_map_ISA(void) |
1880 | { | ||
1881 | unsigned long pa; | ||
1882 | |||
1883 | /* | ||
1884 | * If we're dom0, then linear map the ISA machine addresses into | ||
1885 | * the kernel's address space. | ||
1886 | */ | ||
1887 | if (!xen_initial_domain()) | ||
1888 | return; | ||
1889 | |||
1890 | xen_raw_printk("Xen: setup ISA identity maps\n"); | ||
1891 | |||
1892 | for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) { | ||
1893 | pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO); | ||
1894 | |||
1895 | if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0)) | ||
1896 | BUG(); | ||
1897 | } | ||
1898 | |||
1899 | xen_flush_tlb(); | ||
1900 | } | ||
1901 | |||
1902 | static void __init xen_post_allocator_init(void) | ||
1918 | { | 1903 | { |
1904 | #ifdef CONFIG_XEN_DEBUG | ||
1905 | pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); | ||
1906 | #endif | ||
1919 | pv_mmu_ops.set_pte = xen_set_pte; | 1907 | pv_mmu_ops.set_pte = xen_set_pte; |
1920 | pv_mmu_ops.set_pmd = xen_set_pmd; | 1908 | pv_mmu_ops.set_pmd = xen_set_pmd; |
1921 | pv_mmu_ops.set_pud = xen_set_pud; | 1909 | pv_mmu_ops.set_pud = xen_set_pud; |
@@ -1948,12 +1936,16 @@ static void xen_leave_lazy_mmu(void) | |||
1948 | preempt_enable(); | 1936 | preempt_enable(); |
1949 | } | 1937 | } |
1950 | 1938 | ||
1951 | static const struct pv_mmu_ops xen_mmu_ops __initdata = { | 1939 | static const struct pv_mmu_ops xen_mmu_ops __initconst = { |
1952 | .read_cr2 = xen_read_cr2, | 1940 | .read_cr2 = xen_read_cr2, |
1953 | .write_cr2 = xen_write_cr2, | 1941 | .write_cr2 = xen_write_cr2, |
1954 | 1942 | ||
1955 | .read_cr3 = xen_read_cr3, | 1943 | .read_cr3 = xen_read_cr3, |
1944 | #ifdef CONFIG_X86_32 | ||
1945 | .write_cr3 = xen_write_cr3_init, | ||
1946 | #else | ||
1956 | .write_cr3 = xen_write_cr3, | 1947 | .write_cr3 = xen_write_cr3, |
1948 | #endif | ||
1957 | 1949 | ||
1958 | .flush_tlb_user = xen_flush_tlb, | 1950 | .flush_tlb_user = xen_flush_tlb, |
1959 | .flush_tlb_kernel = xen_flush_tlb, | 1951 | .flush_tlb_kernel = xen_flush_tlb, |
@@ -1969,14 +1961,9 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { | |||
1969 | .alloc_pte = xen_alloc_pte_init, | 1961 | .alloc_pte = xen_alloc_pte_init, |
1970 | .release_pte = xen_release_pte_init, | 1962 | .release_pte = xen_release_pte_init, |
1971 | .alloc_pmd = xen_alloc_pmd_init, | 1963 | .alloc_pmd = xen_alloc_pmd_init, |
1972 | .alloc_pmd_clone = paravirt_nop, | ||
1973 | .release_pmd = xen_release_pmd_init, | 1964 | .release_pmd = xen_release_pmd_init, |
1974 | 1965 | ||
1975 | #ifdef CONFIG_X86_64 | ||
1976 | .set_pte = xen_set_pte, | ||
1977 | #else | ||
1978 | .set_pte = xen_set_pte_init, | 1966 | .set_pte = xen_set_pte_init, |
1979 | #endif | ||
1980 | .set_pte_at = xen_set_pte_at, | 1967 | .set_pte_at = xen_set_pte_at, |
1981 | .set_pmd = xen_set_pmd_hyper, | 1968 | .set_pmd = xen_set_pmd_hyper, |
1982 | 1969 | ||
@@ -2022,11 +2009,12 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { | |||
2022 | 2009 | ||
2023 | void __init xen_init_mmu_ops(void) | 2010 | void __init xen_init_mmu_ops(void) |
2024 | { | 2011 | { |
2012 | x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; | ||
2025 | x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; | 2013 | x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; |
2026 | x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; | 2014 | x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; |
2027 | pv_mmu_ops = xen_mmu_ops; | 2015 | pv_mmu_ops = xen_mmu_ops; |
2028 | 2016 | ||
2029 | vmap_lazy_unmap = false; | 2017 | memset(dummy_mapping, 0xff, PAGE_SIZE); |
2030 | } | 2018 | } |
2031 | 2019 | ||
2032 | /* Protected by xen_reservation_lock. */ | 2020 | /* Protected by xen_reservation_lock. */ |
@@ -2049,7 +2037,7 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, | |||
2049 | in_frames[i] = virt_to_mfn(vaddr); | 2037 | in_frames[i] = virt_to_mfn(vaddr); |
2050 | 2038 | ||
2051 | MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); | 2039 | MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); |
2052 | set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); | 2040 | __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); |
2053 | 2041 | ||
2054 | if (out_frames) | 2042 | if (out_frames) |
2055 | out_frames[i] = virt_to_pfn(vaddr); | 2043 | out_frames[i] = virt_to_pfn(vaddr); |
@@ -2259,65 +2247,83 @@ void __init xen_hvm_init_mmu_ops(void) | |||
2259 | } | 2247 | } |
2260 | #endif | 2248 | #endif |
2261 | 2249 | ||
2262 | #ifdef CONFIG_XEN_DEBUG_FS | 2250 | #define REMAP_BATCH_SIZE 16 |
2263 | 2251 | ||
2264 | static struct dentry *d_mmu_debug; | 2252 | struct remap_data { |
2253 | unsigned long mfn; | ||
2254 | pgprot_t prot; | ||
2255 | struct mmu_update *mmu_update; | ||
2256 | }; | ||
2265 | 2257 | ||
2266 | static int __init xen_mmu_debugfs(void) | 2258 | static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, |
2259 | unsigned long addr, void *data) | ||
2267 | { | 2260 | { |
2268 | struct dentry *d_xen = xen_init_debugfs(); | 2261 | struct remap_data *rmd = data; |
2269 | 2262 | pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); | |
2270 | if (d_xen == NULL) | ||
2271 | return -ENOMEM; | ||
2272 | 2263 | ||
2273 | d_mmu_debug = debugfs_create_dir("mmu", d_xen); | 2264 | rmd->mmu_update->ptr = virt_to_machine(ptep).maddr; |
2274 | 2265 | rmd->mmu_update->val = pte_val_ma(pte); | |
2275 | debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats); | 2266 | rmd->mmu_update++; |
2276 | |||
2277 | debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update); | ||
2278 | debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug, | ||
2279 | &mmu_stats.pgd_update_pinned); | ||
2280 | debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug, | ||
2281 | &mmu_stats.pgd_update_pinned); | ||
2282 | |||
2283 | debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update); | ||
2284 | debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug, | ||
2285 | &mmu_stats.pud_update_pinned); | ||
2286 | debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug, | ||
2287 | &mmu_stats.pud_update_pinned); | ||
2288 | |||
2289 | debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update); | ||
2290 | debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug, | ||
2291 | &mmu_stats.pmd_update_pinned); | ||
2292 | debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug, | ||
2293 | &mmu_stats.pmd_update_pinned); | ||
2294 | |||
2295 | debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update); | ||
2296 | // debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug, | ||
2297 | // &mmu_stats.pte_update_pinned); | ||
2298 | debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug, | ||
2299 | &mmu_stats.pte_update_pinned); | ||
2300 | |||
2301 | debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update); | ||
2302 | debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug, | ||
2303 | &mmu_stats.mmu_update_extended); | ||
2304 | xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug, | ||
2305 | mmu_stats.mmu_update_histo, 20); | ||
2306 | |||
2307 | debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at); | ||
2308 | debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug, | ||
2309 | &mmu_stats.set_pte_at_batched); | ||
2310 | debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug, | ||
2311 | &mmu_stats.set_pte_at_current); | ||
2312 | debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug, | ||
2313 | &mmu_stats.set_pte_at_kernel); | ||
2314 | |||
2315 | debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit); | ||
2316 | debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug, | ||
2317 | &mmu_stats.prot_commit_batched); | ||
2318 | 2267 | ||
2319 | return 0; | 2268 | return 0; |
2320 | } | 2269 | } |
2321 | fs_initcall(xen_mmu_debugfs); | ||
2322 | 2270 | ||
2323 | #endif /* CONFIG_XEN_DEBUG_FS */ | 2271 | int xen_remap_domain_mfn_range(struct vm_area_struct *vma, |
2272 | unsigned long addr, | ||
2273 | unsigned long mfn, int nr, | ||
2274 | pgprot_t prot, unsigned domid) | ||
2275 | { | ||
2276 | struct remap_data rmd; | ||
2277 | struct mmu_update mmu_update[REMAP_BATCH_SIZE]; | ||
2278 | int batch; | ||
2279 | unsigned long range; | ||
2280 | int err = 0; | ||
2281 | |||
2282 | prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); | ||
2283 | |||
2284 | BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) == | ||
2285 | (VM_PFNMAP | VM_RESERVED | VM_IO))); | ||
2286 | |||
2287 | rmd.mfn = mfn; | ||
2288 | rmd.prot = prot; | ||
2289 | |||
2290 | while (nr) { | ||
2291 | batch = min(REMAP_BATCH_SIZE, nr); | ||
2292 | range = (unsigned long)batch << PAGE_SHIFT; | ||
2293 | |||
2294 | rmd.mmu_update = mmu_update; | ||
2295 | err = apply_to_page_range(vma->vm_mm, addr, range, | ||
2296 | remap_area_mfn_pte_fn, &rmd); | ||
2297 | if (err) | ||
2298 | goto out; | ||
2299 | |||
2300 | err = -EFAULT; | ||
2301 | if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0) | ||
2302 | goto out; | ||
2303 | |||
2304 | nr -= batch; | ||
2305 | addr += range; | ||
2306 | } | ||
2307 | |||
2308 | err = 0; | ||
2309 | out: | ||
2310 | |||
2311 | flush_tlb_all(); | ||
2312 | |||
2313 | return err; | ||
2314 | } | ||
2315 | EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); | ||
2316 | |||
2317 | #ifdef CONFIG_XEN_DEBUG_FS | ||
2318 | static int p2m_dump_open(struct inode *inode, struct file *filp) | ||
2319 | { | ||
2320 | return single_open(filp, p2m_dump_show, NULL); | ||
2321 | } | ||
2322 | |||
2323 | static const struct file_operations p2m_dump_fops = { | ||
2324 | .open = p2m_dump_open, | ||
2325 | .read = seq_read, | ||
2326 | .llseek = seq_lseek, | ||
2327 | .release = single_release, | ||
2328 | }; | ||
2329 | #endif /* CONFIG_XEN_DEBUG_FS */ | ||
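For context, a caller of the xen_remap_domain_mfn_range() helper added above (for example a privcmd-style foreign-mapping driver) might look roughly like the following sketch; the wrapper name and its arguments are illustrative and not part of this patch:

    /* Illustrative only: the vma is assumed to already have
     * VM_PFNMAP | VM_RESERVED | VM_IO set, as the BUG_ON in the
     * helper requires. */
    static int example_map_foreign_frames(struct vm_area_struct *vma,
                                          unsigned long first_gmfn,
                                          int nr_frames, unsigned int domid)
    {
            return xen_remap_domain_mfn_range(vma, vma->vm_start,
                                              first_gmfn, nr_frames,
                                              vma->vm_page_prot, domid);
    }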
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index fa938c4aa2f7..73809bb951b4 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h | |||
@@ -12,47 +12,9 @@ enum pt_level { | |||
12 | 12 | ||
13 | 13 | ||
14 | bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); | 14 | bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); |
15 | bool install_p2mtop_page(unsigned long pfn, unsigned long *p); | ||
16 | 15 | ||
17 | void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); | 16 | void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); |
18 | 17 | ||
19 | |||
20 | void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); | ||
21 | void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); | ||
22 | void xen_exit_mmap(struct mm_struct *mm); | ||
23 | |||
24 | pteval_t xen_pte_val(pte_t); | ||
25 | pmdval_t xen_pmd_val(pmd_t); | ||
26 | pgdval_t xen_pgd_val(pgd_t); | ||
27 | |||
28 | pte_t xen_make_pte(pteval_t); | ||
29 | pmd_t xen_make_pmd(pmdval_t); | ||
30 | pgd_t xen_make_pgd(pgdval_t); | ||
31 | |||
32 | void xen_set_pte(pte_t *ptep, pte_t pteval); | ||
33 | void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, | ||
34 | pte_t *ptep, pte_t pteval); | ||
35 | |||
36 | #ifdef CONFIG_X86_PAE | ||
37 | void xen_set_pte_atomic(pte_t *ptep, pte_t pte); | ||
38 | void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); | ||
39 | void xen_pmd_clear(pmd_t *pmdp); | ||
40 | #endif /* CONFIG_X86_PAE */ | ||
41 | |||
42 | void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); | ||
43 | void xen_set_pud(pud_t *ptr, pud_t val); | ||
44 | void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval); | ||
45 | void xen_set_pud_hyper(pud_t *ptr, pud_t val); | ||
46 | |||
47 | #if PAGETABLE_LEVELS == 4 | ||
48 | pudval_t xen_pud_val(pud_t pud); | ||
49 | pud_t xen_make_pud(pudval_t pudval); | ||
50 | void xen_set_pgd(pgd_t *pgdp, pgd_t pgd); | ||
51 | void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd); | ||
52 | #endif | ||
53 | |||
54 | pgd_t *xen_get_user_pgd(pgd_t *pgd); | ||
55 | |||
56 | pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); | 18 | pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); |
57 | void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, | 19 | void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, |
58 | pte_t *ptep, pte_t pte); | 20 | pte_t *ptep, pte_t pte); |
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 8bff7e7c290b..1b2b73ff0a6e 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c | |||
@@ -189,10 +189,10 @@ struct multicall_space __xen_mc_entry(size_t args) | |||
189 | unsigned argidx = roundup(b->argidx, sizeof(u64)); | 189 | unsigned argidx = roundup(b->argidx, sizeof(u64)); |
190 | 190 | ||
191 | BUG_ON(preemptible()); | 191 | BUG_ON(preemptible()); |
192 | BUG_ON(b->argidx > MC_ARGS); | 192 | BUG_ON(b->argidx >= MC_ARGS); |
193 | 193 | ||
194 | if (b->mcidx == MC_BATCH || | 194 | if (b->mcidx == MC_BATCH || |
195 | (argidx + args) > MC_ARGS) { | 195 | (argidx + args) >= MC_ARGS) { |
196 | mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS); | 196 | mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS); |
197 | xen_mc_flush(); | 197 | xen_mc_flush(); |
198 | argidx = roundup(b->argidx, sizeof(u64)); | 198 | argidx = roundup(b->argidx, sizeof(u64)); |
@@ -206,7 +206,7 @@ struct multicall_space __xen_mc_entry(size_t args) | |||
206 | ret.args = &b->args[argidx]; | 206 | ret.args = &b->args[argidx]; |
207 | b->argidx = argidx + args; | 207 | b->argidx = argidx + args; |
208 | 208 | ||
209 | BUG_ON(b->argidx > MC_ARGS); | 209 | BUG_ON(b->argidx >= MC_ARGS); |
210 | return ret; | 210 | return ret; |
211 | } | 211 | } |
212 | 212 | ||
@@ -216,7 +216,7 @@ struct multicall_space xen_mc_extend_args(unsigned long op, size_t size) | |||
216 | struct multicall_space ret = { NULL, NULL }; | 216 | struct multicall_space ret = { NULL, NULL }; |
217 | 217 | ||
218 | BUG_ON(preemptible()); | 218 | BUG_ON(preemptible()); |
219 | BUG_ON(b->argidx > MC_ARGS); | 219 | BUG_ON(b->argidx >= MC_ARGS); |
220 | 220 | ||
221 | if (b->mcidx == 0) | 221 | if (b->mcidx == 0) |
222 | return ret; | 222 | return ret; |
@@ -224,14 +224,14 @@ struct multicall_space xen_mc_extend_args(unsigned long op, size_t size) | |||
224 | if (b->entries[b->mcidx - 1].op != op) | 224 | if (b->entries[b->mcidx - 1].op != op) |
225 | return ret; | 225 | return ret; |
226 | 226 | ||
227 | if ((b->argidx + size) > MC_ARGS) | 227 | if ((b->argidx + size) >= MC_ARGS) |
228 | return ret; | 228 | return ret; |
229 | 229 | ||
230 | ret.mc = &b->entries[b->mcidx - 1]; | 230 | ret.mc = &b->entries[b->mcidx - 1]; |
231 | ret.args = &b->args[b->argidx]; | 231 | ret.args = &b->args[b->argidx]; |
232 | b->argidx += size; | 232 | b->argidx += size; |
233 | 233 | ||
234 | BUG_ON(b->argidx > MC_ARGS); | 234 | BUG_ON(b->argidx >= MC_ARGS); |
235 | return ret; | 235 | return ret; |
236 | } | 236 | } |
237 | 237 | ||
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h index 9e565da5d1f7..4ec8035e3216 100644 --- a/arch/x86/xen/multicalls.h +++ b/arch/x86/xen/multicalls.h | |||
@@ -22,7 +22,7 @@ static inline void xen_mc_batch(void) | |||
22 | unsigned long flags; | 22 | unsigned long flags; |
23 | /* need to disable interrupts until this entry is complete */ | 23 | /* need to disable interrupts until this entry is complete */ |
24 | local_irq_save(flags); | 24 | local_irq_save(flags); |
25 | __get_cpu_var(xen_mc_irq_flags) = flags; | 25 | __this_cpu_write(xen_mc_irq_flags, flags); |
26 | } | 26 | } |
27 | 27 | ||
28 | static inline struct multicall_space xen_mc_entry(size_t args) | 28 | static inline struct multicall_space xen_mc_entry(size_t args) |
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c new file mode 100644 index 000000000000..58efeb9d5440 --- /dev/null +++ b/arch/x86/xen/p2m.c | |||
@@ -0,0 +1,859 @@ | |||
1 | /* | ||
2 | * Xen leaves the responsibility for maintaining p2m mappings to the | ||
3 | * guests themselves, but it must also access and update the p2m array | ||
4 | * during suspend/resume when all the pages are reallocated. | ||
5 | * | ||
6 | * The p2m table is logically a flat array, but we implement it as a | ||
7 | * three-level tree to allow the address space to be sparse. | ||
8 | * | ||
9 | * Xen | ||
10 | * | | ||
11 | * p2m_top p2m_top_mfn | ||
12 | * / \ / \ | ||
13 | * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn | ||
14 | * / \ / \ / / | ||
15 | * p2m p2m p2m p2m p2m p2m p2m ... | ||
16 | * | ||
17 | * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. | ||
18 | * | ||
19 | * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the | ||
20 | * maximum representable pseudo-physical address space is: | ||
21 | * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages | ||
22 | * | ||
23 | * P2M_PER_PAGE depends on the architecture, as a mfn is always | ||
24 | * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to | ||
25 | * 512 and 1024 entries respectively. | ||
26 | * | ||
27 | * In short, these structures contain the Machine Frame Number (MFN) of the PFN. | ||
28 | * | ||
29 | * However not all entries are filled with MFNs. Specifically for all other | ||
30 | * leaf entries, or for the top root, or middle one, for which there is a void | ||
31 | * entry, we assume it is "missing". So (for example) | ||
32 | * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY. | ||
33 | * | ||
34 | * We also have the possibility of setting 1-1 mappings on certain regions, so | ||
35 | * that: | ||
36 | * pfn_to_mfn(0xc0000)=0xc0000 | ||
37 | * | ||
38 | * The benefit of this is, that we can assume for non-RAM regions (think | ||
39 | * PCI BARs, or ACPI spaces), we can create mappings easily b/c we | ||
40 | * get the PFN value to match the MFN. | ||
41 | * | ||
42 | * For this to work efficiently we have one new page p2m_identity and | ||
43 | * allocate (via reserved_brk) any other pages we need to cover the sides | ||
44 | * (1GB or 4MB boundary violations). All entries in p2m_identity are set to | ||
45 | * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs, | ||
46 | * no other fancy value). | ||
47 | * | ||
48 | * On lookup we spot that the entry points to p2m_identity and return the | ||
49 | * identity value instead of dereferencing and returning INVALID_P2M_ENTRY. | ||
50 | * If the entry points to an allocated page, we just proceed as before and | ||
51 | * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in | ||
52 | * appropriate functions (pfn_to_mfn). | ||
53 | * | ||
54 | * The reason for having the IDENTITY_FRAME_BIT instead of just returning the | ||
55 | * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a | ||
56 | * non-identity pfn. To protect ourselves against we elect to set (and get) the | ||
57 | * IDENTITY_FRAME_BIT on all identity mapped PFNs. | ||
58 | * | ||
59 | * This simplistic diagram is used to explain the more subtle piece of code. | ||
60 | * There is also a diagram of the P2M at the end that can help. | ||
61 | * Imagine your E820 looking as so: | ||
62 | * | ||
63 | * 1GB 2GB | ||
64 | * /-------------------+---------\/----\ /----------\ /---+-----\ | ||
65 | * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM | | ||
66 | * \-------------------+---------/\----/ \----------/ \---+-----/ | ||
67 | * ^- 1029MB ^- 2001MB | ||
68 | * | ||
69 | * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100), | ||
70 | * 2048MB = 524288 (0x80000)] | ||
71 | * | ||
72 | * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB | ||
73 | * is actually not present (would have to kick the balloon driver to put it in). | ||
74 | * | ||
75 | * When we are told to set the PFNs for identity mapping (see patch: "xen/setup: | ||
76 | * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start | ||
77 | * of the PFN and the end PFN (263424 and 512256 respectively). The first step | ||
78 | * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page | ||
79 | * covers 512^2 of page estate (1GB) and in case the start or end PFN is not | ||
80 | * aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn | ||
81 | * to end pfn. We reserve_brk top leaf pages if they are missing (means they | ||
82 | * point to p2m_mid_missing). | ||
83 | * | ||
84 | * With the E820 example above, 263424 is not 1GB aligned so we allocate a | ||
85 | * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000. | ||
86 | * Each entry in the allocated page is "missing" (points to p2m_missing). | ||
87 | * | ||
88 | * Next stage is to determine if we need to do a more granular boundary check | ||
89 | * on the 4MB (or 2MB depending on architecture) off the start and end pfn's. | ||
90 | * We check if the start pfn and end pfn violate that boundary check, and if | ||
91 | * so reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer | ||
92 | * granularity of setting which PFNs are missing and which ones are identity. | ||
93 | * In our example 263424 and 512256 both fail the check so we reserve_brk two | ||
94 | * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing" | ||
95 | * values) and assign them to p2m[1][2] and p2m[1][488] respectively. | ||
96 | * | ||
97 | * At this point we would at minimum reserve_brk one page, but could be up to | ||
98 | * three. Each call to set_phys_range_identity has at maximum a three page | ||
99 | * cost. If we were to query the P2M at this stage, all those entries from | ||
100 | * start PFN through end PFN (so 1029MB -> 2001MB) would return | ||
101 | * INVALID_P2M_ENTRY ("missing"). | ||
102 | * | ||
103 | * The next step is to walk from the start pfn to the end pfn setting | ||
104 | * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity. | ||
105 | * If we find that the middle leaf is pointing to p2m_missing we can swap it | ||
106 | * over to p2m_identity - this way covering 4MB (or 2MB) PFN space. At this | ||
107 | * point we do not need to worry about boundary alignment (so no need to | ||
108 | * reserve_brk a middle page, figure out which PFNs are "missing" and which | ||
109 | * ones are identity), as that has been done earlier. If we find that the | ||
110 | * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference | ||
111 | * that page (which covers 512 PFNs) and set the appropriate PFN with | ||
112 | * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we | ||
113 | * set from p2m[1][2][256->511] and p2m[1][488][0->256] with | ||
114 | * IDENTITY_FRAME_BIT set. | ||
115 | * | ||
116 | * All other regions that are void (or not filled) either point to p2m_missing | ||
117 | * (considered missing) or have the default value of INVALID_P2M_ENTRY (also | ||
118 | * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511] | ||
119 | * contain the INVALID_P2M_ENTRY value and are considered "missing." | ||
120 | * | ||
121 | * This is what the p2m ends up looking (for the E820 above) with this | ||
122 | * fabulous drawing: | ||
123 | * | ||
124 | * p2m /--------------\ | ||
125 | * /-----\ | &mfn_list[0],| /-----------------\ | ||
126 | * | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. | | ||
127 | * |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] | | ||
128 | * | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] | | ||
129 | * |-----| \ | [p2m_identity]+\\ | .... | | ||
130 | * | 2 |--\ \-------------------->| ... | \\ \----------------/ | ||
131 | * |-----| \ \---------------/ \\ | ||
132 | * | 3 |\ \ \\ p2m_identity | ||
133 | * |-----| \ \-------------------->/---------------\ /-----------------\ | ||
134 | * | .. +->+ | [p2m_identity]+-->| ~0, ~0, ~0, ... | | ||
135 | * \-----/ / | [p2m_identity]+-->| ..., ~0 | | ||
136 | * / /---------------\ | .... | \-----------------/ | ||
137 | * / | IDENTITY[@0] | /-+-[x], ~0, ~0.. | | ||
138 | * / | IDENTITY[@256]|<----/ \---------------/ | ||
139 | * / | ~0, ~0, .... | | ||
140 | * | \---------------/ | ||
141 | * | | ||
142 | * p2m_missing p2m_missing | ||
143 | * /------------------\ /------------\ | ||
144 | * | [p2m_mid_missing]+---->| ~0, ~0, ~0 | | ||
145 | * | [p2m_mid_missing]+---->| ..., ~0 | | ||
146 | * \------------------/ \------------/ | ||
147 | * | ||
148 | * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT) | ||
149 | */ | ||
150 | |||
151 | #include <linux/init.h> | ||
152 | #include <linux/module.h> | ||
153 | #include <linux/list.h> | ||
154 | #include <linux/hash.h> | ||
155 | #include <linux/sched.h> | ||
156 | #include <linux/seq_file.h> | ||
157 | |||
158 | #include <asm/cache.h> | ||
159 | #include <asm/setup.h> | ||
160 | |||
161 | #include <asm/xen/page.h> | ||
162 | #include <asm/xen/hypercall.h> | ||
163 | #include <asm/xen/hypervisor.h> | ||
164 | |||
165 | #include "xen-ops.h" | ||
166 | |||
167 | static void __init m2p_override_init(void); | ||
168 | |||
169 | unsigned long xen_max_p2m_pfn __read_mostly; | ||
170 | |||
171 | #define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) | ||
172 | #define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) | ||
173 | #define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) | ||
174 | |||
175 | #define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) | ||
176 | |||
177 | /* Placeholders for holes in the address space */ | ||
178 | static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); | ||
179 | static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); | ||
180 | static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); | ||
181 | |||
182 | static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); | ||
183 | static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); | ||
184 | static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); | ||
185 | |||
186 | static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE); | ||
187 | |||
188 | RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); | ||
189 | RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); | ||
190 | |||
191 | /* We might hit two boundary violations at the start and end, at max each | ||
192 | * boundary violation will require three middle nodes. */ | ||
193 | RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3); | ||
194 | |||
195 | static inline unsigned p2m_top_index(unsigned long pfn) | ||
196 | { | ||
197 | BUG_ON(pfn >= MAX_P2M_PFN); | ||
198 | return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); | ||
199 | } | ||
200 | |||
201 | static inline unsigned p2m_mid_index(unsigned long pfn) | ||
202 | { | ||
203 | return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; | ||
204 | } | ||
205 | |||
206 | static inline unsigned p2m_index(unsigned long pfn) | ||
207 | { | ||
208 | return pfn % P2M_PER_PAGE; | ||
209 | } | ||
210 | |||
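To make the lookup arithmetic concrete, the 1029MB example PFN from the comment at the top of this file decomposes as follows (illustrative values only, assuming 64-bit so P2M_PER_PAGE == 512):

    unsigned long pfn = 0x40500;             /* 263424, i.e. 1029MB           */
    unsigned topidx   = p2m_top_index(pfn);  /* 263424 / (512 * 512)  == 1    */
    unsigned mididx   = p2m_mid_index(pfn);  /* (263424 / 512) % 512  == 2    */
    unsigned idx      = p2m_index(pfn);      /* 263424 % 512          == 256  */
    /* The MFN then lives at p2m_top[1][2][256], unless that leaf is
     * p2m_missing or p2m_identity. */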
211 | static void p2m_top_init(unsigned long ***top) | ||
212 | { | ||
213 | unsigned i; | ||
214 | |||
215 | for (i = 0; i < P2M_TOP_PER_PAGE; i++) | ||
216 | top[i] = p2m_mid_missing; | ||
217 | } | ||
218 | |||
219 | static void p2m_top_mfn_init(unsigned long *top) | ||
220 | { | ||
221 | unsigned i; | ||
222 | |||
223 | for (i = 0; i < P2M_TOP_PER_PAGE; i++) | ||
224 | top[i] = virt_to_mfn(p2m_mid_missing_mfn); | ||
225 | } | ||
226 | |||
227 | static void p2m_top_mfn_p_init(unsigned long **top) | ||
228 | { | ||
229 | unsigned i; | ||
230 | |||
231 | for (i = 0; i < P2M_TOP_PER_PAGE; i++) | ||
232 | top[i] = p2m_mid_missing_mfn; | ||
233 | } | ||
234 | |||
235 | static void p2m_mid_init(unsigned long **mid) | ||
236 | { | ||
237 | unsigned i; | ||
238 | |||
239 | for (i = 0; i < P2M_MID_PER_PAGE; i++) | ||
240 | mid[i] = p2m_missing; | ||
241 | } | ||
242 | |||
243 | static void p2m_mid_mfn_init(unsigned long *mid) | ||
244 | { | ||
245 | unsigned i; | ||
246 | |||
247 | for (i = 0; i < P2M_MID_PER_PAGE; i++) | ||
248 | mid[i] = virt_to_mfn(p2m_missing); | ||
249 | } | ||
250 | |||
251 | static void p2m_init(unsigned long *p2m) | ||
252 | { | ||
253 | unsigned i; | ||
254 | |||
255 | for (i = 0; i < P2M_MID_PER_PAGE; i++) | ||
256 | p2m[i] = INVALID_P2M_ENTRY; | ||
257 | } | ||
258 | |||
259 | /* | ||
260 | * Build the parallel p2m_top_mfn and p2m_mid_mfn structures | ||
261 | * | ||
262 | * This is called both at boot time, and after resuming from suspend: | ||
263 | * - At boot time we're called very early, and must use extend_brk() | ||
264 | * to allocate memory. | ||
265 | * | ||
266 | * - After resume we're called from within stop_machine, but the mfn | ||
267 | * tree should already be completely allocated. | ||
268 | */ | ||
269 | void __ref xen_build_mfn_list_list(void) | ||
270 | { | ||
271 | unsigned long pfn; | ||
272 | |||
273 | /* Pre-initialize p2m_top_mfn to be completely missing */ | ||
274 | if (p2m_top_mfn == NULL) { | ||
275 | p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
276 | p2m_mid_mfn_init(p2m_mid_missing_mfn); | ||
277 | |||
278 | p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
279 | p2m_top_mfn_p_init(p2m_top_mfn_p); | ||
280 | |||
281 | p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
282 | p2m_top_mfn_init(p2m_top_mfn); | ||
283 | } else { | ||
284 | /* Reinitialise, mfn's all change after migration */ | ||
285 | p2m_mid_mfn_init(p2m_mid_missing_mfn); | ||
286 | } | ||
287 | |||
288 | for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { | ||
289 | unsigned topidx = p2m_top_index(pfn); | ||
290 | unsigned mididx = p2m_mid_index(pfn); | ||
291 | unsigned long **mid; | ||
292 | unsigned long *mid_mfn_p; | ||
293 | |||
294 | mid = p2m_top[topidx]; | ||
295 | mid_mfn_p = p2m_top_mfn_p[topidx]; | ||
296 | |||
297 | /* Don't bother allocating any mfn mid levels if | ||
298 | * they're just missing, just update the stored mfn, | ||
299 | * since all could have changed over a migrate. | ||
300 | */ | ||
301 | if (mid == p2m_mid_missing) { | ||
302 | BUG_ON(mididx); | ||
303 | BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); | ||
304 | p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); | ||
305 | pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; | ||
306 | continue; | ||
307 | } | ||
308 | |||
309 | if (mid_mfn_p == p2m_mid_missing_mfn) { | ||
310 | /* | ||
311 | * XXX boot-time only! We should never find | ||
312 | * missing parts of the mfn tree after | ||
313 | * runtime. extend_brk() will BUG if we call | ||
314 | * it too late. | ||
315 | */ | ||
316 | mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
317 | p2m_mid_mfn_init(mid_mfn_p); | ||
318 | |||
319 | p2m_top_mfn_p[topidx] = mid_mfn_p; | ||
320 | } | ||
321 | |||
322 | p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); | ||
323 | mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); | ||
324 | } | ||
325 | } | ||
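The mfn tree built above mirrors the p2m tree (p2m_top / mid / leaf) one page per level, so each level multiplies coverage by the number of entries per page. A stand-alone sketch of that fan-out (illustrative only; assumes the usual 64-bit layout of 4 KiB pages holding 8-byte entries, which is what the P2M_*_PER_PAGE constants earlier in p2m.c reduce to):

#include <stdio.h>

int main(void)
{
	unsigned long entries = 4096 / sizeof(unsigned long);	/* 512 on 64-bit */
	unsigned long leaf = entries;		/* pfns covered by one leaf page */
	unsigned long mid  = entries * leaf;	/* pfns covered by one mid page  */
	unsigned long top  = entries * mid;	/* pfns covered by the top page  */

	printf("leaf page: %lu pfns = %lu MiB\n", leaf, leaf * 4096 >> 20);
	printf("mid page:  %lu pfns = %lu GiB\n", mid,  mid  * 4096 >> 30);
	printf("top page:  %lu pfns = %lu GiB\n", top,  top  * 4096 >> 30);
	return 0;
}

One brk-allocated page per level is therefore enough to describe hundreds of GiB of guest memory, which is why the boot-time path above can rely on extend_brk().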
326 | |||
327 | void xen_setup_mfn_list_list(void) | ||
328 | { | ||
329 | BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); | ||
330 | |||
331 | HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = | ||
332 | virt_to_mfn(p2m_top_mfn); | ||
333 | HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; | ||
334 | } | ||
335 | |||
336 | /* Set up p2m_top to point to the domain-builder provided p2m pages */ | ||
337 | void __init xen_build_dynamic_phys_to_machine(void) | ||
338 | { | ||
339 | unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; | ||
340 | unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); | ||
341 | unsigned long pfn; | ||
342 | |||
343 | xen_max_p2m_pfn = max_pfn; | ||
344 | |||
345 | p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
346 | p2m_init(p2m_missing); | ||
347 | |||
348 | p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
349 | p2m_mid_init(p2m_mid_missing); | ||
350 | |||
351 | p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
352 | p2m_top_init(p2m_top); | ||
353 | |||
354 | p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
355 | p2m_init(p2m_identity); | ||
356 | |||
357 | /* | ||
358 | * The domain builder gives us a pre-constructed p2m array in | ||
359 | * mfn_list for all the pages initially given to us, so we just | ||
360 | * need to graft that into our tree structure. | ||
361 | */ | ||
362 | for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { | ||
363 | unsigned topidx = p2m_top_index(pfn); | ||
364 | unsigned mididx = p2m_mid_index(pfn); | ||
365 | |||
366 | if (p2m_top[topidx] == p2m_mid_missing) { | ||
367 | unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
368 | p2m_mid_init(mid); | ||
369 | |||
370 | p2m_top[topidx] = mid; | ||
371 | } | ||
372 | |||
373 | /* | ||
374 | * As long as the mfn_list has enough entries to completely | ||
375 | * fill a p2m page, pointing into the array is ok. But if | ||
376 | * not, the entries beyond the last pfn will be undefined. | ||
377 | */ | ||
378 | if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) { | ||
379 | unsigned long p2midx; | ||
380 | |||
381 | p2midx = max_pfn % P2M_PER_PAGE; | ||
382 | for ( ; p2midx < P2M_PER_PAGE; p2midx++) | ||
383 | mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY; | ||
384 | } | ||
385 | p2m_top[topidx][mididx] = &mfn_list[pfn]; | ||
386 | } | ||
387 | |||
388 | m2p_override_init(); | ||
389 | } | ||
390 | |||
391 | unsigned long get_phys_to_machine(unsigned long pfn) | ||
392 | { | ||
393 | unsigned topidx, mididx, idx; | ||
394 | |||
395 | if (unlikely(pfn >= MAX_P2M_PFN)) | ||
396 | return INVALID_P2M_ENTRY; | ||
397 | |||
398 | topidx = p2m_top_index(pfn); | ||
399 | mididx = p2m_mid_index(pfn); | ||
400 | idx = p2m_index(pfn); | ||
401 | |||
402 | /* | ||
403 | * The INVALID_P2M_ENTRY is filled in both p2m_*identity | ||
404 | * and in p2m_*missing, so returning the INVALID_P2M_ENTRY | ||
405 | * would be wrong. | ||
406 | */ | ||
407 | if (p2m_top[topidx][mididx] == p2m_identity) | ||
408 | return IDENTITY_FRAME(pfn); | ||
409 | |||
410 | return p2m_top[topidx][mididx][idx]; | ||
411 | } | ||
412 | EXPORT_SYMBOL_GPL(get_phys_to_machine); | ||
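get_phys_to_machine() is the raw lookup; callers such as pfn_to_mfn() normally strip the tag bits before using the result. A minimal caller sketch (the example_ name is hypothetical; IDENTITY_FRAME_BIT and FOREIGN_FRAME_BIT are the tag bits this file uses for identity and foreign entries):

static unsigned long example_pfn_to_mfn(unsigned long pfn)
{
	unsigned long mfn = get_phys_to_machine(pfn);

	if (mfn == INVALID_P2M_ENTRY)
		return mfn;	/* no machine frame backs this pfn */

	/* Identity/foreign entries carry a tag in the top bits; mask it
	 * off to recover the plain machine frame number. */
	return mfn & ~(IDENTITY_FRAME_BIT | FOREIGN_FRAME_BIT);
}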
413 | |||
414 | static void *alloc_p2m_page(void) | ||
415 | { | ||
416 | return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); | ||
417 | } | ||
418 | |||
419 | static void free_p2m_page(void *p) | ||
420 | { | ||
421 | free_page((unsigned long)p); | ||
422 | } | ||
423 | |||
424 | /* | ||
425 | * Fully allocate the p2m structure for a given pfn. We need to check | ||
426 | * that both the top and mid levels are allocated, and make sure the | ||
427 | * parallel mfn tree is kept in sync. We may race with other cpus, so | ||
428 | * the new pages are installed with cmpxchg; if we lose the race then | ||
429 | * simply free the page we allocated and use the one that's there. | ||
430 | */ | ||
431 | static bool alloc_p2m(unsigned long pfn) | ||
432 | { | ||
433 | unsigned topidx, mididx; | ||
434 | unsigned long ***top_p, **mid; | ||
435 | unsigned long *top_mfn_p, *mid_mfn; | ||
436 | |||
437 | topidx = p2m_top_index(pfn); | ||
438 | mididx = p2m_mid_index(pfn); | ||
439 | |||
440 | top_p = &p2m_top[topidx]; | ||
441 | mid = *top_p; | ||
442 | |||
443 | if (mid == p2m_mid_missing) { | ||
444 | /* Mid level is missing, allocate a new one */ | ||
445 | mid = alloc_p2m_page(); | ||
446 | if (!mid) | ||
447 | return false; | ||
448 | |||
449 | p2m_mid_init(mid); | ||
450 | |||
451 | if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) | ||
452 | free_p2m_page(mid); | ||
453 | } | ||
454 | |||
455 | top_mfn_p = &p2m_top_mfn[topidx]; | ||
456 | mid_mfn = p2m_top_mfn_p[topidx]; | ||
457 | |||
458 | BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); | ||
459 | |||
460 | if (mid_mfn == p2m_mid_missing_mfn) { | ||
461 | /* Separately check the mid mfn level */ | ||
462 | unsigned long missing_mfn; | ||
463 | unsigned long mid_mfn_mfn; | ||
464 | |||
465 | mid_mfn = alloc_p2m_page(); | ||
466 | if (!mid_mfn) | ||
467 | return false; | ||
468 | |||
469 | p2m_mid_mfn_init(mid_mfn); | ||
470 | |||
471 | missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); | ||
472 | mid_mfn_mfn = virt_to_mfn(mid_mfn); | ||
473 | if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) | ||
474 | free_p2m_page(mid_mfn); | ||
475 | else | ||
476 | p2m_top_mfn_p[topidx] = mid_mfn; | ||
477 | } | ||
478 | |||
479 | if (p2m_top[topidx][mididx] == p2m_identity || | ||
480 | p2m_top[topidx][mididx] == p2m_missing) { | ||
481 | /* p2m leaf page is missing */ | ||
482 | unsigned long *p2m; | ||
483 | unsigned long *p2m_orig = p2m_top[topidx][mididx]; | ||
484 | |||
485 | p2m = alloc_p2m_page(); | ||
486 | if (!p2m) | ||
487 | return false; | ||
488 | |||
489 | p2m_init(p2m); | ||
490 | |||
491 | if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig) | ||
492 | free_p2m_page(p2m); | ||
493 | else | ||
494 | mid_mfn[mididx] = virt_to_mfn(p2m); | ||
495 | } | ||
496 | |||
497 | return true; | ||
498 | } | ||
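Each level in alloc_p2m() uses the same lock-free publish pattern: allocate a fresh page, fill it with the "missing" placeholder values, and cmpxchg() it into the shared slot; whoever loses the race frees its own copy and adopts the winner's page. The idiom in isolation, as a sketch reusing the helpers above (install_leaf is a hypothetical name):

static unsigned long *install_leaf(unsigned long **slot)
{
	unsigned long *fresh = alloc_p2m_page();

	if (!fresh)
		return NULL;
	p2m_init(fresh);		/* every entry INVALID_P2M_ENTRY */

	if (cmpxchg(slot, p2m_missing, fresh) != p2m_missing) {
		free_p2m_page(fresh);	/* another CPU won the race */
		fresh = *slot;		/* carry on with its page */
	}
	return fresh;
}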
499 | |||
500 | static bool __init __early_alloc_p2m(unsigned long pfn) | ||
501 | { | ||
502 | unsigned topidx, mididx, idx; | ||
503 | |||
504 | topidx = p2m_top_index(pfn); | ||
505 | mididx = p2m_mid_index(pfn); | ||
506 | idx = p2m_index(pfn); | ||
507 | |||
508 | /* Pfff.. No boundary cross-over, let's get out. */ | ||
509 | if (!idx) | ||
510 | return false; | ||
511 | |||
512 | WARN(p2m_top[topidx][mididx] == p2m_identity, | ||
513 | "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n", | ||
514 | topidx, mididx); | ||
515 | |||
516 | /* | ||
517 | * Could be done by xen_build_dynamic_phys_to_machine.. | ||
518 | */ | ||
519 | if (p2m_top[topidx][mididx] != p2m_missing) | ||
520 | return false; | ||
521 | |||
522 | /* Boundary cross-over for the edges: */ | ||
523 | if (idx) { | ||
524 | unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
525 | unsigned long *mid_mfn_p; | ||
526 | |||
527 | p2m_init(p2m); | ||
528 | |||
529 | p2m_top[topidx][mididx] = p2m; | ||
530 | |||
531 | /* For save/restore we need the MFN of the saved P2M page */ | ||
532 | |||
533 | mid_mfn_p = p2m_top_mfn_p[topidx]; | ||
534 | WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), | ||
535 | "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", | ||
536 | topidx, mididx); | ||
537 | mid_mfn_p[mididx] = virt_to_mfn(p2m); | ||
538 | |||
539 | } | ||
540 | return idx != 0; | ||
541 | } | ||
542 | unsigned long __init set_phys_range_identity(unsigned long pfn_s, | ||
543 | unsigned long pfn_e) | ||
544 | { | ||
545 | unsigned long pfn; | ||
546 | |||
547 | if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN)) | ||
548 | return 0; | ||
549 | |||
550 | if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) | ||
551 | return pfn_e - pfn_s; | ||
552 | |||
553 | if (pfn_s > pfn_e) | ||
554 | return 0; | ||
555 | |||
556 | for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1)); | ||
557 | pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE)); | ||
558 | pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE) | ||
559 | { | ||
560 | unsigned topidx = p2m_top_index(pfn); | ||
561 | unsigned long *mid_mfn_p; | ||
562 | unsigned long **mid; | ||
563 | |||
564 | mid = p2m_top[topidx]; | ||
565 | mid_mfn_p = p2m_top_mfn_p[topidx]; | ||
566 | if (mid == p2m_mid_missing) { | ||
567 | mid = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
568 | |||
569 | p2m_mid_init(mid); | ||
570 | |||
571 | p2m_top[topidx] = mid; | ||
572 | |||
573 | BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); | ||
574 | } | ||
575 | /* And the save/restore P2M tables.. */ | ||
576 | if (mid_mfn_p == p2m_mid_missing_mfn) { | ||
577 | mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
578 | p2m_mid_mfn_init(mid_mfn_p); | ||
579 | |||
580 | p2m_top_mfn_p[topidx] = mid_mfn_p; | ||
581 | p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); | ||
582 | /* Note: we don't set mid_mfn_p[mididx] here, | ||
583 | * look in __early_alloc_p2m */ | ||
584 | } | ||
585 | } | ||
586 | |||
587 | __early_alloc_p2m(pfn_s); | ||
588 | __early_alloc_p2m(pfn_e); | ||
589 | |||
590 | for (pfn = pfn_s; pfn < pfn_e; pfn++) | ||
591 | if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) | ||
592 | break; | ||
593 | |||
594 | if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s), | ||
595 | "Identity mapping failed. We are %ld short of 1-1 mappings!\n", | ||
596 | (pfn_e - pfn_s) - (pfn - pfn_s))) | ||
597 | printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn); | ||
598 | |||
599 | return pfn - pfn_s; | ||
600 | } | ||
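Typical use, mirroring what xen_set_identity() does in the setup.c hunk further down: mark a non-RAM region such as a PCI hole below 4 GiB so that pfn == mfn inside it. A sketch with assumed, purely illustrative addresses (example_mark_pci_hole is a hypothetical helper):

static void __init example_mark_pci_hole(void)
{
	unsigned long start_pfn = PFN_UP(0xc0000000ULL);	/* assumed hole start */
	unsigned long end_pfn   = PFN_DOWN(0x100000000ULL);	/* up to 4 GiB       */
	unsigned long done;

	done = set_phys_range_identity(start_pfn, end_pfn);
	if (done != end_pfn - start_pfn)
		printk(KERN_WARNING "only %lu of %lu pfns set 1-1\n",
		       done, end_pfn - start_pfn);
}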
601 | |||
602 | /* Try to install p2m mapping; fail if intermediate bits missing */ | ||
603 | bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) | ||
604 | { | ||
605 | unsigned topidx, mididx, idx; | ||
606 | |||
607 | if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { | ||
608 | BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); | ||
609 | return true; | ||
610 | } | ||
611 | if (unlikely(pfn >= MAX_P2M_PFN)) { | ||
612 | BUG_ON(mfn != INVALID_P2M_ENTRY); | ||
613 | return true; | ||
614 | } | ||
615 | |||
616 | topidx = p2m_top_index(pfn); | ||
617 | mididx = p2m_mid_index(pfn); | ||
618 | idx = p2m_index(pfn); | ||
619 | |||
620 | /* For sparse holes where the p2m leaf has a real PFN along with | ||
621 | * PCI holes, stick in the PFN as the MFN value. | ||
622 | */ | ||
623 | if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) { | ||
624 | if (p2m_top[topidx][mididx] == p2m_identity) | ||
625 | return true; | ||
626 | |||
627 | /* Swap over from MISSING to IDENTITY if needed. */ | ||
628 | if (p2m_top[topidx][mididx] == p2m_missing) { | ||
629 | WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing, | ||
630 | p2m_identity) != p2m_missing); | ||
631 | return true; | ||
632 | } | ||
633 | } | ||
634 | |||
635 | if (p2m_top[topidx][mididx] == p2m_missing) | ||
636 | return mfn == INVALID_P2M_ENTRY; | ||
637 | |||
638 | p2m_top[topidx][mididx][idx] = mfn; | ||
639 | |||
640 | return true; | ||
641 | } | ||
642 | |||
643 | bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) | ||
644 | { | ||
645 | if (unlikely(!__set_phys_to_machine(pfn, mfn))) { | ||
646 | if (!alloc_p2m(pfn)) | ||
647 | return false; | ||
648 | |||
649 | if (!__set_phys_to_machine(pfn, mfn)) | ||
650 | return false; | ||
651 | } | ||
652 | |||
653 | return true; | ||
654 | } | ||
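set_phys_to_machine() is the allocating wrapper: it retries __set_phys_to_machine() after alloc_p2m() has filled in any missing mid/leaf pages, so it is the call balloon-style code makes when (re)populating a pfn. A usage sketch ('frame' stands in for an MFN obtained from a populate_physmap hypercall; the example_ names are hypothetical):

static int example_repopulate(unsigned long pfn, unsigned long frame)
{
	/* May allocate missing mid/leaf p2m pages via alloc_p2m(). */
	if (!set_phys_to_machine(pfn, frame))
		return -ENOMEM;
	return 0;
}

static void example_release(unsigned long pfn)
{
	/* Handing the page back to Xen: the entry simply becomes
	 * invalid, which never needs an allocation. */
	set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
}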
655 | |||
656 | #define M2P_OVERRIDE_HASH_SHIFT 10 | ||
657 | #define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT) | ||
658 | |||
659 | static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH); | ||
660 | static DEFINE_SPINLOCK(m2p_override_lock); | ||
661 | |||
662 | static void __init m2p_override_init(void) | ||
663 | { | ||
664 | unsigned i; | ||
665 | |||
666 | m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH, | ||
667 | sizeof(unsigned long)); | ||
668 | |||
669 | for (i = 0; i < M2P_OVERRIDE_HASH; i++) | ||
670 | INIT_LIST_HEAD(&m2p_overrides[i]); | ||
671 | } | ||
672 | |||
673 | static unsigned long mfn_hash(unsigned long mfn) | ||
674 | { | ||
675 | return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT); | ||
676 | } | ||
677 | |||
678 | /* Add an MFN override for a particular page */ | ||
679 | int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte) | ||
680 | { | ||
681 | unsigned long flags; | ||
682 | unsigned long pfn; | ||
683 | unsigned long uninitialized_var(address); | ||
684 | unsigned level; | ||
685 | pte_t *ptep = NULL; | ||
686 | |||
687 | pfn = page_to_pfn(page); | ||
688 | if (!PageHighMem(page)) { | ||
689 | address = (unsigned long)__va(pfn << PAGE_SHIFT); | ||
690 | ptep = lookup_address(address, &level); | ||
691 | if (WARN(ptep == NULL || level != PG_LEVEL_4K, | ||
692 | "m2p_add_override: pfn %lx not mapped", pfn)) | ||
693 | return -EINVAL; | ||
694 | } | ||
695 | |||
696 | page->private = mfn; | ||
697 | page->index = pfn_to_mfn(pfn); | ||
698 | |||
699 | if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) | ||
700 | return -ENOMEM; | ||
701 | |||
702 | if (clear_pte && !PageHighMem(page)) | ||
703 | /* Just zap old mapping for now */ | ||
704 | pte_clear(&init_mm, address, ptep); | ||
705 | spin_lock_irqsave(&m2p_override_lock, flags); | ||
706 | list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); | ||
707 | spin_unlock_irqrestore(&m2p_override_lock, flags); | ||
708 | |||
709 | return 0; | ||
710 | } | ||
711 | EXPORT_SYMBOL_GPL(m2p_add_override); | ||
712 | int m2p_remove_override(struct page *page, bool clear_pte) | ||
713 | { | ||
714 | unsigned long flags; | ||
715 | unsigned long mfn; | ||
716 | unsigned long pfn; | ||
717 | unsigned long uninitialized_var(address); | ||
718 | unsigned level; | ||
719 | pte_t *ptep = NULL; | ||
720 | |||
721 | pfn = page_to_pfn(page); | ||
722 | mfn = get_phys_to_machine(pfn); | ||
723 | if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) | ||
724 | return -EINVAL; | ||
725 | |||
726 | if (!PageHighMem(page)) { | ||
727 | address = (unsigned long)__va(pfn << PAGE_SHIFT); | ||
728 | ptep = lookup_address(address, &level); | ||
729 | |||
730 | if (WARN(ptep == NULL || level != PG_LEVEL_4K, | ||
731 | "m2p_remove_override: pfn %lx not mapped", pfn)) | ||
732 | return -EINVAL; | ||
733 | } | ||
734 | |||
735 | spin_lock_irqsave(&m2p_override_lock, flags); | ||
736 | list_del(&page->lru); | ||
737 | spin_unlock_irqrestore(&m2p_override_lock, flags); | ||
738 | set_phys_to_machine(pfn, page->index); | ||
739 | |||
740 | if (clear_pte && !PageHighMem(page)) | ||
741 | set_pte_at(&init_mm, address, ptep, | ||
742 | pfn_pte(pfn, PAGE_KERNEL)); | ||
743 | /* No tlb flush necessary because the caller already | ||
744 | * left the pte unmapped. */ | ||
745 | |||
746 | return 0; | ||
747 | } | ||
748 | EXPORT_SYMBOL_GPL(m2p_remove_override); | ||
749 | |||
750 | struct page *m2p_find_override(unsigned long mfn) | ||
751 | { | ||
752 | unsigned long flags; | ||
753 | struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)]; | ||
754 | struct page *p, *ret; | ||
755 | |||
756 | ret = NULL; | ||
757 | |||
758 | spin_lock_irqsave(&m2p_override_lock, flags); | ||
759 | |||
760 | list_for_each_entry(p, bucket, lru) { | ||
761 | if (p->private == mfn) { | ||
762 | ret = p; | ||
763 | break; | ||
764 | } | ||
765 | } | ||
766 | |||
767 | spin_unlock_irqrestore(&m2p_override_lock, flags); | ||
768 | |||
769 | return ret; | ||
770 | } | ||
771 | |||
772 | unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn) | ||
773 | { | ||
774 | struct page *p = m2p_find_override(mfn); | ||
775 | unsigned long ret = pfn; | ||
776 | |||
777 | if (p) | ||
778 | ret = page_to_pfn(p); | ||
779 | |||
780 | return ret; | ||
781 | } | ||
782 | EXPORT_SYMBOL_GPL(m2p_find_override_pfn); | ||
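The override machinery above is meant to be used in pairs around the lifetime of a grant-mapped foreign page, e.g. in a backend driver. A sketch of that pairing ('mfn' is the foreign machine frame the page was grant-mapped to; example_use_foreign_page is hypothetical and error handling is trimmed):

static int example_use_foreign_page(struct page *page, unsigned long mfn)
{
	int ret;

	ret = m2p_add_override(mfn, page, false);
	if (ret)
		return ret;

	/*
	 * While the override is in place, m2p_find_override_pfn(mfn, ...)
	 * resolves the foreign mfn back to this page's local pfn.
	 */

	return m2p_remove_override(page, false);
}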
783 | |||
784 | #ifdef CONFIG_XEN_DEBUG_FS | ||
785 | |||
786 | int p2m_dump_show(struct seq_file *m, void *v) | ||
787 | { | ||
788 | static const char * const level_name[] = { "top", "middle", | ||
789 | "entry", "abnormal" }; | ||
790 | static const char * const type_name[] = { "identity", "missing", | ||
791 | "pfn", "abnormal"}; | ||
792 | #define TYPE_IDENTITY 0 | ||
793 | #define TYPE_MISSING 1 | ||
794 | #define TYPE_PFN 2 | ||
795 | #define TYPE_UNKNOWN 3 | ||
796 | unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0; | ||
797 | unsigned int uninitialized_var(prev_level); | ||
798 | unsigned int uninitialized_var(prev_type); | ||
799 | |||
800 | if (!p2m_top) | ||
801 | return 0; | ||
802 | |||
803 | for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) { | ||
804 | unsigned topidx = p2m_top_index(pfn); | ||
805 | unsigned mididx = p2m_mid_index(pfn); | ||
806 | unsigned idx = p2m_index(pfn); | ||
807 | unsigned lvl, type; | ||
808 | |||
809 | lvl = 4; | ||
810 | type = TYPE_UNKNOWN; | ||
811 | if (p2m_top[topidx] == p2m_mid_missing) { | ||
812 | lvl = 0; type = TYPE_MISSING; | ||
813 | } else if (p2m_top[topidx] == NULL) { | ||
814 | lvl = 0; type = TYPE_UNKNOWN; | ||
815 | } else if (p2m_top[topidx][mididx] == NULL) { | ||
816 | lvl = 1; type = TYPE_UNKNOWN; | ||
817 | } else if (p2m_top[topidx][mididx] == p2m_identity) { | ||
818 | lvl = 1; type = TYPE_IDENTITY; | ||
819 | } else if (p2m_top[topidx][mididx] == p2m_missing) { | ||
820 | lvl = 1; type = TYPE_MISSING; | ||
821 | } else if (p2m_top[topidx][mididx][idx] == 0) { | ||
822 | lvl = 2; type = TYPE_UNKNOWN; | ||
823 | } else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) { | ||
824 | lvl = 2; type = TYPE_IDENTITY; | ||
825 | } else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) { | ||
826 | lvl = 2; type = TYPE_MISSING; | ||
827 | } else if (p2m_top[topidx][mididx][idx] == pfn) { | ||
828 | lvl = 2; type = TYPE_PFN; | ||
829 | } else if (p2m_top[topidx][mididx][idx] != pfn) { | ||
830 | lvl = 2; type = TYPE_PFN; | ||
831 | } | ||
832 | if (pfn == 0) { | ||
833 | prev_level = lvl; | ||
834 | prev_type = type; | ||
835 | } | ||
836 | if (pfn == MAX_DOMAIN_PAGES-1) { | ||
837 | lvl = 3; | ||
838 | type = TYPE_UNKNOWN; | ||
839 | } | ||
840 | if (prev_type != type) { | ||
841 | seq_printf(m, " [0x%lx->0x%lx] %s\n", | ||
842 | prev_pfn_type, pfn, type_name[prev_type]); | ||
843 | prev_pfn_type = pfn; | ||
844 | prev_type = type; | ||
845 | } | ||
846 | if (prev_level != lvl) { | ||
847 | seq_printf(m, " [0x%lx->0x%lx] level %s\n", | ||
848 | prev_pfn_level, pfn, level_name[prev_level]); | ||
849 | prev_pfn_level = pfn; | ||
850 | prev_level = lvl; | ||
851 | } | ||
852 | } | ||
853 | return 0; | ||
854 | #undef TYPE_IDENTITY | ||
855 | #undef TYPE_MISSING | ||
856 | #undef TYPE_PFN | ||
857 | #undef TYPE_UNKNOWN | ||
858 | } | ||
859 | #endif | ||
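p2m_dump_show() is a seq_file show routine; exposing it as a debugfs file is the usual single_open() boilerplate. A sketch of that hookup (the real registration lives behind CONFIG_XEN_DEBUG_FS elsewhere in the Xen code; the parent dentry is omitted here):

static int p2m_dump_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, p2m_dump_show, NULL);
}

static const struct file_operations p2m_dump_fops = {
	.open		= p2m_dump_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

/* e.g.: debugfs_create_file("p2m", 0440, parent, NULL, &p2m_dump_fops); */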
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index a013ec9d0c54..b480d4207a4c 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c | |||
@@ -1,10 +1,12 @@ | |||
1 | /* Glue code to lib/swiotlb-xen.c */ | 1 | /* Glue code to lib/swiotlb-xen.c */ |
2 | 2 | ||
3 | #include <linux/dma-mapping.h> | 3 | #include <linux/dma-mapping.h> |
4 | #include <linux/pci.h> | ||
4 | #include <xen/swiotlb-xen.h> | 5 | #include <xen/swiotlb-xen.h> |
5 | 6 | ||
6 | #include <asm/xen/hypervisor.h> | 7 | #include <asm/xen/hypervisor.h> |
7 | #include <xen/xen.h> | 8 | #include <xen/xen.h> |
9 | #include <asm/iommu_table.h> | ||
8 | 10 | ||
9 | int xen_swiotlb __read_mostly; | 11 | int xen_swiotlb __read_mostly; |
10 | 12 | ||
@@ -34,7 +36,7 @@ int __init pci_xen_swiotlb_detect(void) | |||
34 | 36 | ||
35 | /* If running as PV guest, either iommu=soft, or swiotlb=force will | 37 | /* If running as PV guest, either iommu=soft, or swiotlb=force will |
36 | * activate this IOMMU. If running as PV privileged, activate it | 38 | * activate this IOMMU. If running as PV privileged, activate it |
37 | * irregardlesss. | 39 | * irregardless. |
38 | */ | 40 | */ |
39 | if ((xen_initial_domain() || swiotlb || swiotlb_force) && | 41 | if ((xen_initial_domain() || swiotlb || swiotlb_force) && |
40 | (xen_pv_domain())) | 42 | (xen_pv_domain())) |
@@ -54,5 +56,12 @@ void __init pci_xen_swiotlb_init(void) | |||
54 | if (xen_swiotlb) { | 56 | if (xen_swiotlb) { |
55 | xen_swiotlb_init(1); | 57 | xen_swiotlb_init(1); |
56 | dma_ops = &xen_swiotlb_dma_ops; | 58 | dma_ops = &xen_swiotlb_dma_ops; |
59 | |||
60 | /* Make sure ACS will be enabled */ | ||
61 | pci_request_acs(); | ||
57 | } | 62 | } |
58 | } | 63 | } |
64 | IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, | ||
65 | 0, | ||
66 | pci_xen_swiotlb_init, | ||
67 | 0); | ||
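IOMMU_INIT_FINISH() registers the detect/init pair in the boot-time IOMMU table rather than calling them directly. Roughly what the line above expands to (a sketch; the real macro lives in <asm/iommu_table.h> and the exact entry name and attributes differ):

static const struct iommu_table_entry __iommu_entry_xen_swiotlb
	__attribute__((__used__, __section__(".iommu_table"))) = {
	.detect		= pci_xen_swiotlb_detect,
	.depend		= NULL,
	.init		= pci_xen_swiotlb_init,
	.late_init	= NULL,
	.flags		= IOMMU_FINISH_IF_DETECTED,
};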
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index 0f456386cce5..25c52f94a27c 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c | |||
@@ -68,7 +68,7 @@ static int __init check_platform_magic(void) | |||
68 | return 0; | 68 | return 0; |
69 | } | 69 | } |
70 | 70 | ||
71 | void __init xen_unplug_emulated_devices(void) | 71 | void xen_unplug_emulated_devices(void) |
72 | { | 72 | { |
73 | int r; | 73 | int r; |
74 | 74 | ||
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 328b00305426..60aeeb56948f 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c | |||
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/sched.h> | 8 | #include <linux/sched.h> |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/pm.h> | 10 | #include <linux/pm.h> |
11 | #include <linux/memblock.h> | ||
11 | 12 | ||
12 | #include <asm/elf.h> | 13 | #include <asm/elf.h> |
13 | #include <asm/vdso.h> | 14 | #include <asm/vdso.h> |
@@ -17,10 +18,11 @@ | |||
17 | #include <asm/xen/hypervisor.h> | 18 | #include <asm/xen/hypervisor.h> |
18 | #include <asm/xen/hypercall.h> | 19 | #include <asm/xen/hypercall.h> |
19 | 20 | ||
21 | #include <xen/xen.h> | ||
20 | #include <xen/page.h> | 22 | #include <xen/page.h> |
21 | #include <xen/interface/callback.h> | 23 | #include <xen/interface/callback.h> |
22 | #include <xen/interface/physdev.h> | ||
23 | #include <xen/interface/memory.h> | 24 | #include <xen/interface/memory.h> |
25 | #include <xen/interface/physdev.h> | ||
24 | #include <xen/features.h> | 26 | #include <xen/features.h> |
25 | 27 | ||
26 | #include "xen-ops.h" | 28 | #include "xen-ops.h" |
@@ -33,6 +35,44 @@ extern void xen_sysenter_target(void); | |||
33 | extern void xen_syscall_target(void); | 35 | extern void xen_syscall_target(void); |
34 | extern void xen_syscall32_target(void); | 36 | extern void xen_syscall32_target(void); |
35 | 37 | ||
38 | /* Amount of extra memory space we add to the e820 ranges */ | ||
39 | phys_addr_t xen_extra_mem_start, xen_extra_mem_size; | ||
40 | |||
41 | /* | ||
42 | * The maximum amount of extra memory compared to the base size. The | ||
43 | * main scaling factor is the size of struct page. At extreme ratios | ||
44 | * of base:extra, all the base memory can be filled with page | ||
45 | * structures for the extra memory, leaving no space for anything | ||
46 | * else. | ||
47 | * | ||
48 | * 10x seems like a reasonable balance between scaling flexibility and | ||
49 | * leaving a practically usable system. | ||
50 | */ | ||
51 | #define EXTRA_MEM_RATIO (10) | ||
52 | |||
53 | static void __init xen_add_extra_mem(unsigned long pages) | ||
54 | { | ||
55 | unsigned long pfn; | ||
56 | |||
57 | u64 size = (u64)pages * PAGE_SIZE; | ||
58 | u64 extra_start = xen_extra_mem_start + xen_extra_mem_size; | ||
59 | |||
60 | if (!pages) | ||
61 | return; | ||
62 | |||
63 | e820_add_region(extra_start, size, E820_RAM); | ||
64 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); | ||
65 | |||
66 | memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA"); | ||
67 | |||
68 | xen_extra_mem_size += size; | ||
69 | |||
70 | xen_max_p2m_pfn = PFN_DOWN(extra_start + size); | ||
71 | |||
72 | for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++) | ||
73 | __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); | ||
74 | } | ||
75 | |||
36 | static unsigned long __init xen_release_chunk(phys_addr_t start_addr, | 76 | static unsigned long __init xen_release_chunk(phys_addr_t start_addr, |
37 | phys_addr_t end_addr) | 77 | phys_addr_t end_addr) |
38 | { | 78 | { |
@@ -69,7 +109,7 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr, | |||
69 | WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", | 109 | WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", |
70 | start, end, ret); | 110 | start, end, ret); |
71 | if (ret == 1) { | 111 | if (ret == 1) { |
72 | set_phys_to_machine(pfn, INVALID_P2M_ENTRY); | 112 | __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); |
73 | len++; | 113 | len++; |
74 | } | 114 | } |
75 | } | 115 | } |
@@ -82,16 +122,18 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, | |||
82 | const struct e820map *e820) | 122 | const struct e820map *e820) |
83 | { | 123 | { |
84 | phys_addr_t max_addr = PFN_PHYS(max_pfn); | 124 | phys_addr_t max_addr = PFN_PHYS(max_pfn); |
85 | phys_addr_t last_end = 0; | 125 | phys_addr_t last_end = ISA_END_ADDRESS; |
86 | unsigned long released = 0; | 126 | unsigned long released = 0; |
87 | int i; | 127 | int i; |
88 | 128 | ||
129 | /* Free any unused memory above the low 1Mbyte. */ | ||
89 | for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { | 130 | for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { |
90 | phys_addr_t end = e820->map[i].addr; | 131 | phys_addr_t end = e820->map[i].addr; |
91 | end = min(max_addr, end); | 132 | end = min(max_addr, end); |
92 | 133 | ||
93 | released += xen_release_chunk(last_end, end); | 134 | if (last_end < end) |
94 | last_end = e820->map[i].addr + e820->map[i].size; | 135 | released += xen_release_chunk(last_end, end); |
136 | last_end = max(last_end, e820->map[i].addr + e820->map[i].size); | ||
95 | } | 137 | } |
96 | 138 | ||
97 | if (last_end < max_addr) | 139 | if (last_end < max_addr) |
@@ -101,24 +143,140 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, | |||
101 | return released; | 143 | return released; |
102 | } | 144 | } |
103 | 145 | ||
146 | static unsigned long __init xen_set_identity(const struct e820entry *list, | ||
147 | ssize_t map_size) | ||
148 | { | ||
149 | phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS; | ||
150 | phys_addr_t start_pci = last; | ||
151 | const struct e820entry *entry; | ||
152 | unsigned long identity = 0; | ||
153 | int i; | ||
154 | |||
155 | for (i = 0, entry = list; i < map_size; i++, entry++) { | ||
156 | phys_addr_t start = entry->addr; | ||
157 | phys_addr_t end = start + entry->size; | ||
158 | |||
159 | if (start < last) | ||
160 | start = last; | ||
161 | |||
162 | if (end <= start) | ||
163 | continue; | ||
164 | |||
165 | /* Skip over the 1MB region. */ | ||
166 | if (last > end) | ||
167 | continue; | ||
168 | |||
169 | if ((entry->type == E820_RAM) || (entry->type == E820_UNUSABLE)) { | ||
170 | if (start > start_pci) | ||
171 | identity += set_phys_range_identity( | ||
172 | PFN_UP(start_pci), PFN_DOWN(start)); | ||
173 | |||
174 | /* Without saving 'last' we would gobble up RAM at | ||
175 | * the end of the loop as well. */ | ||
176 | last = end; | ||
177 | start_pci = end; | ||
178 | continue; | ||
179 | } | ||
180 | start_pci = min(start, start_pci); | ||
181 | last = end; | ||
182 | } | ||
183 | if (last > start_pci) | ||
184 | identity += set_phys_range_identity( | ||
185 | PFN_UP(start_pci), PFN_DOWN(last)); | ||
186 | return identity; | ||
187 | } | ||
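A worked pass over an assumed domU map makes the cursor handling clearer. With E820 entries [0, 640K) RAM, [640K, 1M) reserved, [1M, 3G) RAM, [3G, 4G) reserved and [4G, 5G) RAM, both cursors start at ISA_END_ADDRESS (1M) and everything below that is skipped; the first RAM entry slides last and start_pci to 3G, the reserved entry extends last to 4G, and the final RAM entry then marks [3G, 4G) as identity before moving both cursors past 5G. The result is 262144 identity pfns, i.e. exactly the PCI hole.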
104 | /** | 188 | /** |
105 | * machine_specific_memory_setup - Hook for machine specific memory setup. | 189 | * machine_specific_memory_setup - Hook for machine specific memory setup. |
106 | **/ | 190 | **/ |
107 | |||
108 | char * __init xen_memory_setup(void) | 191 | char * __init xen_memory_setup(void) |
109 | { | 192 | { |
193 | static struct e820entry map[E820MAX] __initdata; | ||
194 | static struct e820entry map_raw[E820MAX] __initdata; | ||
195 | |||
110 | unsigned long max_pfn = xen_start_info->nr_pages; | 196 | unsigned long max_pfn = xen_start_info->nr_pages; |
197 | unsigned long long mem_end; | ||
198 | int rc; | ||
199 | struct xen_memory_map memmap; | ||
200 | unsigned long extra_pages = 0; | ||
201 | unsigned long extra_limit; | ||
202 | unsigned long identity_pages = 0; | ||
203 | int i; | ||
204 | int op; | ||
111 | 205 | ||
112 | max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); | 206 | max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); |
207 | mem_end = PFN_PHYS(max_pfn); | ||
208 | |||
209 | memmap.nr_entries = E820MAX; | ||
210 | set_xen_guest_handle(memmap.buffer, map); | ||
211 | |||
212 | op = xen_initial_domain() ? | ||
213 | XENMEM_machine_memory_map : | ||
214 | XENMEM_memory_map; | ||
215 | rc = HYPERVISOR_memory_op(op, &memmap); | ||
216 | if (rc == -ENOSYS) { | ||
217 | BUG_ON(xen_initial_domain()); | ||
218 | memmap.nr_entries = 1; | ||
219 | map[0].addr = 0ULL; | ||
220 | map[0].size = mem_end; | ||
221 | /* 8MB slack (to balance backend allocations). */ | ||
222 | map[0].size += 8ULL << 20; | ||
223 | map[0].type = E820_RAM; | ||
224 | rc = 0; | ||
225 | } | ||
226 | BUG_ON(rc); | ||
113 | 227 | ||
228 | memcpy(map_raw, map, sizeof(map)); | ||
114 | e820.nr_map = 0; | 229 | e820.nr_map = 0; |
230 | xen_extra_mem_start = mem_end; | ||
231 | for (i = 0; i < memmap.nr_entries; i++) { | ||
232 | unsigned long long end; | ||
233 | |||
234 | /* Guard against non-page aligned E820 entries. */ | ||
235 | if (map[i].type == E820_RAM) | ||
236 | map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE; | ||
237 | |||
238 | end = map[i].addr + map[i].size; | ||
239 | if (map[i].type == E820_RAM && end > mem_end) { | ||
240 | /* RAM off the end - may be partially included */ | ||
241 | u64 delta = min(map[i].size, end - mem_end); | ||
242 | |||
243 | map[i].size -= delta; | ||
244 | end -= delta; | ||
245 | |||
246 | extra_pages += PFN_DOWN(delta); | ||
247 | /* | ||
248 | * Set RAM below 4GB that is not for us to be unusable. | ||
249 | * This prevents "System RAM" address space from being | ||
250 | * used as a potential resource for I/O addresses (happens | ||
251 | * when 'allocate_resource' is called). | ||
252 | */ | ||
253 | if (delta && | ||
254 | (xen_initial_domain() && end < 0x100000000ULL)) | ||
255 | e820_add_region(end, delta, E820_UNUSABLE); | ||
256 | } | ||
115 | 257 | ||
116 | e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM); | 258 | if (map[i].size > 0 && end > xen_extra_mem_start) |
259 | xen_extra_mem_start = end; | ||
260 | |||
261 | /* Add region if any remains */ | ||
262 | if (map[i].size > 0) | ||
263 | e820_add_region(map[i].addr, map[i].size, map[i].type); | ||
264 | } | ||
265 | /* Align the balloon area so that max_low_pfn does not get set | ||
266 | * to be at the _end_ of the PCI gap at the far end (fee01000). | ||
267 | * Note that xen_extra_mem_start gets set in the loop above to be | ||
268 | * past the last E820 region. */ | ||
269 | if (xen_initial_domain() && (xen_extra_mem_start < (1ULL<<32))) | ||
270 | xen_extra_mem_start = (1ULL<<32); | ||
117 | 271 | ||
118 | /* | 272 | /* |
119 | * Even though this is normal, usable memory under Xen, reserve | 273 | * In domU, the ISA region is normal, usable memory, but we |
120 | * ISA memory anyway because too many things think they can poke | 274 | * reserve ISA memory anyway because too many things poke |
121 | * about in there. | 275 | * about in there. |
276 | * | ||
277 | * In Dom0, the host E820 information can leave gaps in the | ||
278 | * ISA range, which would cause us to release those pages. To | ||
279 | * avoid this, we unconditionally reserve them here. | ||
122 | */ | 280 | */ |
123 | e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, | 281 | e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, |
124 | E820_RESERVED); | 282 | E820_RESERVED); |
@@ -129,29 +287,43 @@ char * __init xen_memory_setup(void) | |||
129 | * - xen_start_info | 287 | * - xen_start_info |
130 | * See comment above "struct start_info" in <xen/interface/xen.h> | 288 | * See comment above "struct start_info" in <xen/interface/xen.h> |
131 | */ | 289 | */ |
132 | reserve_early(__pa(xen_start_info->mfn_list), | 290 | memblock_x86_reserve_range(__pa(xen_start_info->mfn_list), |
133 | __pa(xen_start_info->pt_base), | 291 | __pa(xen_start_info->pt_base), |
134 | "XEN START INFO"); | 292 | "XEN START INFO"); |
135 | 293 | ||
136 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); | 294 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); |
137 | 295 | ||
138 | xen_return_unused_memory(xen_start_info->nr_pages, &e820); | 296 | extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); |
139 | 297 | ||
140 | return "Xen"; | 298 | /* |
141 | } | 299 | * Clamp the amount of extra memory to a EXTRA_MEM_RATIO |
300 | * factor the base size. On non-highmem systems, the base | ||
301 | * size is the full initial memory allocation; on highmem it | ||
302 | * is limited to the max size of lowmem, so that it doesn't | ||
303 | * get completely filled. | ||
304 | * | ||
305 | * In principle there could be a problem in lowmem systems if | ||
306 | * the initial memory is also very large with respect to | ||
307 | * lowmem, but we won't try to deal with that here. | ||
308 | */ | ||
309 | extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), | ||
310 | max_pfn + extra_pages); | ||
142 | 311 | ||
143 | static void xen_idle(void) | 312 | if (extra_limit >= max_pfn) |
144 | { | 313 | extra_pages = extra_limit - max_pfn; |
145 | local_irq_disable(); | 314 | else |
146 | 315 | extra_pages = 0; | |
147 | if (need_resched()) | 316 | |
148 | local_irq_enable(); | 317 | xen_add_extra_mem(extra_pages); |
149 | else { | 318 | |
150 | current_thread_info()->status &= ~TS_POLLING; | 319 | /* |
151 | smp_mb__after_clear_bit(); | 320 | * Set P2M for all non-RAM pages and E820 gaps to be identity |
152 | safe_halt(); | 321 | * type PFNs. We supply it with the non-sanitized version |
153 | current_thread_info()->status |= TS_POLLING; | 322 | * of the E820. |
154 | } | 323 | */ |
324 | identity_pages = xen_set_identity(map_raw, memmap.nr_entries); | ||
325 | printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages); | ||
326 | return "Xen"; | ||
155 | } | 327 | } |
156 | 328 | ||
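To see how EXTRA_MEM_RATIO (defined near the top of this setup.c diff) and the clamp above interact, consider an assumed 1 GiB domU whose E820 trimming produced 16 GiB worth of candidate extra pages; MAXMEM is ignored for brevity and the numbers are purely illustrative:

#include <stdio.h>

int main(void)
{
	unsigned long max_pfn     = 262144;		/* 1 GiB of base RAM        */
	unsigned long extra_pages = 4194304;		/* 16 GiB of candidate pfns */
	unsigned long extra_limit = 10 * max_pfn;	/* EXTRA_MEM_RATIO * base   */

	if (extra_limit > max_pfn + extra_pages)
		extra_limit = max_pfn + extra_pages;

	extra_pages = extra_limit >= max_pfn ? extra_limit - max_pfn : 0;

	printf("extra capped at %lu pfns (%lu GiB)\n",
	       extra_pages, extra_pages * 4096UL >> 30);	/* -> 9 GiB */
	return 0;
}

The 10x ratio caps the ballooned-up maximum at 10 GiB total here, so only 9 GiB of the 16 GiB candidate range is added to the E820.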
157 | /* | 329 | /* |
@@ -170,7 +342,7 @@ static void __init fiddle_vdso(void) | |||
170 | #endif | 342 | #endif |
171 | } | 343 | } |
172 | 344 | ||
173 | static __cpuinit int register_callback(unsigned type, const void *func) | 345 | static int __cpuinit register_callback(unsigned type, const void *func) |
174 | { | 346 | { |
175 | struct callback_register callback = { | 347 | struct callback_register callback = { |
176 | .type = type, | 348 | .type = type, |
@@ -223,9 +395,6 @@ void __cpuinit xen_enable_syscall(void) | |||
223 | 395 | ||
224 | void __init xen_arch_setup(void) | 396 | void __init xen_arch_setup(void) |
225 | { | 397 | { |
226 | struct physdev_set_iopl set_iopl; | ||
227 | int rc; | ||
228 | |||
229 | xen_panic_handler_init(); | 398 | xen_panic_handler_init(); |
230 | 399 | ||
231 | HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); | 400 | HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); |
@@ -242,11 +411,6 @@ void __init xen_arch_setup(void) | |||
242 | xen_enable_sysenter(); | 411 | xen_enable_sysenter(); |
243 | xen_enable_syscall(); | 412 | xen_enable_syscall(); |
244 | 413 | ||
245 | set_iopl.iopl = 1; | ||
246 | rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); | ||
247 | if (rc != 0) | ||
248 | printk(KERN_INFO "physdev_op failed %d\n", rc); | ||
249 | |||
250 | #ifdef CONFIG_ACPI | 414 | #ifdef CONFIG_ACPI |
251 | if (!(xen_start_info->flags & SIF_INITDOMAIN)) { | 415 | if (!(xen_start_info->flags & SIF_INITDOMAIN)) { |
252 | printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); | 416 | printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); |
@@ -258,9 +422,12 @@ void __init xen_arch_setup(void) | |||
258 | MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? | 422 | MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? |
259 | COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); | 423 | COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); |
260 | 424 | ||
261 | pm_idle = xen_idle; | 425 | /* Set up idle, making sure it calls safe_halt() pvop */ |
262 | 426 | #ifdef CONFIG_X86_32 | |
263 | paravirt_disable_iospace(); | 427 | boot_cpu_data.hlt_works_ok = 1; |
428 | #endif | ||
429 | pm_idle = default_idle; | ||
430 | boot_option_idle_override = IDLE_HALT; | ||
264 | 431 | ||
265 | fiddle_vdso(); | 432 | fiddle_vdso(); |
266 | } | 433 | } |
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 25f232b18a82..b4533a86d7e4 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <asm/xen/interface.h> | 28 | #include <asm/xen/interface.h> |
29 | #include <asm/xen/hypercall.h> | 29 | #include <asm/xen/hypercall.h> |
30 | 30 | ||
31 | #include <xen/xen.h> | ||
31 | #include <xen/page.h> | 32 | #include <xen/page.h> |
32 | #include <xen/events.h> | 33 | #include <xen/events.h> |
33 | 34 | ||
@@ -45,18 +46,17 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); | |||
45 | static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); | 46 | static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); |
46 | 47 | ||
47 | /* | 48 | /* |
48 | * Reschedule call back. Nothing to do, | 49 | * Reschedule call back. |
49 | * all the work is done automatically when | ||
50 | * we return from the interrupt. | ||
51 | */ | 50 | */ |
52 | static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) | 51 | static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) |
53 | { | 52 | { |
54 | inc_irq_stat(irq_resched_count); | 53 | inc_irq_stat(irq_resched_count); |
54 | scheduler_ipi(); | ||
55 | 55 | ||
56 | return IRQ_HANDLED; | 56 | return IRQ_HANDLED; |
57 | } | 57 | } |
58 | 58 | ||
59 | static __cpuinit void cpu_bringup(void) | 59 | static void __cpuinit cpu_bringup(void) |
60 | { | 60 | { |
61 | int cpu = smp_processor_id(); | 61 | int cpu = smp_processor_id(); |
62 | 62 | ||
@@ -84,7 +84,7 @@ static __cpuinit void cpu_bringup(void) | |||
84 | wmb(); /* make sure everything is out */ | 84 | wmb(); /* make sure everything is out */ |
85 | } | 85 | } |
86 | 86 | ||
87 | static __cpuinit void cpu_bringup_and_idle(void) | 87 | static void __cpuinit cpu_bringup_and_idle(void) |
88 | { | 88 | { |
89 | cpu_bringup(); | 89 | cpu_bringup(); |
90 | cpu_idle(); | 90 | cpu_idle(); |
@@ -156,6 +156,9 @@ static void __init xen_fill_possible_map(void) | |||
156 | { | 156 | { |
157 | int i, rc; | 157 | int i, rc; |
158 | 158 | ||
159 | if (xen_initial_domain()) | ||
160 | return; | ||
161 | |||
159 | for (i = 0; i < nr_cpu_ids; i++) { | 162 | for (i = 0; i < nr_cpu_ids; i++) { |
160 | rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); | 163 | rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); |
161 | if (rc >= 0) { | 164 | if (rc >= 0) { |
@@ -165,6 +168,27 @@ static void __init xen_fill_possible_map(void) | |||
165 | } | 168 | } |
166 | } | 169 | } |
167 | 170 | ||
171 | static void __init xen_filter_cpu_maps(void) | ||
172 | { | ||
173 | int i, rc; | ||
174 | |||
175 | if (!xen_initial_domain()) | ||
176 | return; | ||
177 | |||
178 | num_processors = 0; | ||
179 | disabled_cpus = 0; | ||
180 | for (i = 0; i < nr_cpu_ids; i++) { | ||
181 | rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); | ||
182 | if (rc >= 0) { | ||
183 | num_processors++; | ||
184 | set_cpu_possible(i, true); | ||
185 | } else { | ||
186 | set_cpu_possible(i, false); | ||
187 | set_cpu_present(i, false); | ||
188 | } | ||
189 | } | ||
190 | } | ||
191 | |||
168 | static void __init xen_smp_prepare_boot_cpu(void) | 192 | static void __init xen_smp_prepare_boot_cpu(void) |
169 | { | 193 | { |
170 | BUG_ON(smp_processor_id() != 0); | 194 | BUG_ON(smp_processor_id() != 0); |
@@ -174,17 +198,25 @@ static void __init xen_smp_prepare_boot_cpu(void) | |||
174 | old memory can be recycled */ | 198 | old memory can be recycled */ |
175 | make_lowmem_page_readwrite(xen_initial_gdt); | 199 | make_lowmem_page_readwrite(xen_initial_gdt); |
176 | 200 | ||
201 | xen_filter_cpu_maps(); | ||
177 | xen_setup_vcpu_info_placement(); | 202 | xen_setup_vcpu_info_placement(); |
178 | } | 203 | } |
179 | 204 | ||
180 | static void __init xen_smp_prepare_cpus(unsigned int max_cpus) | 205 | static void __init xen_smp_prepare_cpus(unsigned int max_cpus) |
181 | { | 206 | { |
182 | unsigned cpu; | 207 | unsigned cpu; |
208 | unsigned int i; | ||
183 | 209 | ||
184 | xen_init_lock_cpu(0); | 210 | xen_init_lock_cpu(0); |
185 | 211 | ||
186 | smp_store_cpu_info(0); | 212 | smp_store_cpu_info(0); |
187 | cpu_data(0).x86_max_cores = 1; | 213 | cpu_data(0).x86_max_cores = 1; |
214 | |||
215 | for_each_possible_cpu(i) { | ||
216 | zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); | ||
217 | zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); | ||
218 | zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); | ||
219 | } | ||
188 | set_cpu_sibling_map(0); | 220 | set_cpu_sibling_map(0); |
189 | 221 | ||
190 | if (xen_smp_intr_init(0)) | 222 | if (xen_smp_intr_init(0)) |
@@ -216,7 +248,7 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) | |||
216 | } | 248 | } |
217 | } | 249 | } |
218 | 250 | ||
219 | static __cpuinit int | 251 | static int __cpuinit |
220 | cpu_initialize_context(unsigned int cpu, struct task_struct *idle) | 252 | cpu_initialize_context(unsigned int cpu, struct task_struct *idle) |
221 | { | 253 | { |
222 | struct vcpu_guest_context *ctxt; | 254 | struct vcpu_guest_context *ctxt; |
@@ -400,9 +432,9 @@ static void stop_self(void *v) | |||
400 | BUG(); | 432 | BUG(); |
401 | } | 433 | } |
402 | 434 | ||
403 | static void xen_smp_send_stop(void) | 435 | static void xen_stop_other_cpus(int wait) |
404 | { | 436 | { |
405 | smp_call_function(stop_self, NULL, 0); | 437 | smp_call_function(stop_self, NULL, wait); |
406 | } | 438 | } |
407 | 439 | ||
408 | static void xen_smp_send_reschedule(int cpu) | 440 | static void xen_smp_send_reschedule(int cpu) |
@@ -460,7 +492,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) | |||
460 | return IRQ_HANDLED; | 492 | return IRQ_HANDLED; |
461 | } | 493 | } |
462 | 494 | ||
463 | static const struct smp_ops xen_smp_ops __initdata = { | 495 | static const struct smp_ops xen_smp_ops __initconst = { |
464 | .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, | 496 | .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, |
465 | .smp_prepare_cpus = xen_smp_prepare_cpus, | 497 | .smp_prepare_cpus = xen_smp_prepare_cpus, |
466 | .smp_cpus_done = xen_smp_cpus_done, | 498 | .smp_cpus_done = xen_smp_cpus_done, |
@@ -470,7 +502,7 @@ static const struct smp_ops xen_smp_ops __initdata = { | |||
470 | .cpu_disable = xen_cpu_disable, | 502 | .cpu_disable = xen_cpu_disable, |
471 | .play_dead = xen_play_dead, | 503 | .play_dead = xen_play_dead, |
472 | 504 | ||
473 | .smp_send_stop = xen_smp_send_stop, | 505 | .stop_other_cpus = xen_stop_other_cpus, |
474 | .smp_send_reschedule = xen_smp_send_reschedule, | 506 | .smp_send_reschedule = xen_smp_send_reschedule, |
475 | 507 | ||
476 | .send_call_func_ipi = xen_smp_send_call_function_ipi, | 508 | .send_call_func_ipi = xen_smp_send_call_function_ipi, |
@@ -483,3 +515,41 @@ void __init xen_smp_init(void) | |||
483 | xen_fill_possible_map(); | 515 | xen_fill_possible_map(); |
484 | xen_init_spinlocks(); | 516 | xen_init_spinlocks(); |
485 | } | 517 | } |
518 | |||
519 | static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) | ||
520 | { | ||
521 | native_smp_prepare_cpus(max_cpus); | ||
522 | WARN_ON(xen_smp_intr_init(0)); | ||
523 | |||
524 | if (!xen_have_vector_callback) | ||
525 | return; | ||
526 | xen_init_lock_cpu(0); | ||
527 | xen_init_spinlocks(); | ||
528 | } | ||
529 | |||
530 | static int __cpuinit xen_hvm_cpu_up(unsigned int cpu) | ||
531 | { | ||
532 | int rc; | ||
533 | rc = native_cpu_up(cpu); | ||
534 | WARN_ON (xen_smp_intr_init(cpu)); | ||
535 | return rc; | ||
536 | } | ||
537 | |||
538 | static void xen_hvm_cpu_die(unsigned int cpu) | ||
539 | { | ||
540 | unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL); | ||
541 | unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); | ||
542 | unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); | ||
543 | unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); | ||
544 | native_cpu_die(cpu); | ||
545 | } | ||
546 | |||
547 | void __init xen_hvm_smp_init(void) | ||
548 | { | ||
549 | smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; | ||
550 | smp_ops.smp_send_reschedule = xen_smp_send_reschedule; | ||
551 | smp_ops.cpu_up = xen_hvm_cpu_up; | ||
552 | smp_ops.cpu_die = xen_hvm_cpu_die; | ||
553 | smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; | ||
554 | smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; | ||
555 | } | ||
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index e0500646585d..cc9b1e182fcf 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c | |||
@@ -159,8 +159,8 @@ static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl) | |||
159 | { | 159 | { |
160 | struct xen_spinlock *prev; | 160 | struct xen_spinlock *prev; |
161 | 161 | ||
162 | prev = __get_cpu_var(lock_spinners); | 162 | prev = __this_cpu_read(lock_spinners); |
163 | __get_cpu_var(lock_spinners) = xl; | 163 | __this_cpu_write(lock_spinners, xl); |
164 | 164 | ||
165 | wmb(); /* set lock of interest before count */ | 165 | wmb(); /* set lock of interest before count */ |
166 | 166 | ||
@@ -179,14 +179,14 @@ static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock | |||
179 | asm(LOCK_PREFIX " decw %0" | 179 | asm(LOCK_PREFIX " decw %0" |
180 | : "+m" (xl->spinners) : : "memory"); | 180 | : "+m" (xl->spinners) : : "memory"); |
181 | wmb(); /* decrement count before restoring lock */ | 181 | wmb(); /* decrement count before restoring lock */ |
182 | __get_cpu_var(lock_spinners) = prev; | 182 | __this_cpu_write(lock_spinners, prev); |
183 | } | 183 | } |
184 | 184 | ||
185 | static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable) | 185 | static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable) |
186 | { | 186 | { |
187 | struct xen_spinlock *xl = (struct xen_spinlock *)lock; | 187 | struct xen_spinlock *xl = (struct xen_spinlock *)lock; |
188 | struct xen_spinlock *prev; | 188 | struct xen_spinlock *prev; |
189 | int irq = __get_cpu_var(lock_kicker_irq); | 189 | int irq = __this_cpu_read(lock_kicker_irq); |
190 | int ret; | 190 | int ret; |
191 | u64 start; | 191 | u64 start; |
192 | 192 | ||
@@ -224,7 +224,7 @@ static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enab | |||
224 | goto out; | 224 | goto out; |
225 | } | 225 | } |
226 | 226 | ||
227 | flags = __raw_local_save_flags(); | 227 | flags = arch_local_save_flags(); |
228 | if (irq_enable) { | 228 | if (irq_enable) { |
229 | ADD_STATS(taken_slow_irqenable, 1); | 229 | ADD_STATS(taken_slow_irqenable, 1); |
230 | raw_local_irq_enable(); | 230 | raw_local_irq_enable(); |
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 1d789d56877c..45329c8c226e 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c | |||
@@ -12,7 +12,7 @@ | |||
12 | #include "xen-ops.h" | 12 | #include "xen-ops.h" |
13 | #include "mmu.h" | 13 | #include "mmu.h" |
14 | 14 | ||
15 | void xen_pre_suspend(void) | 15 | void xen_arch_pre_suspend(void) |
16 | { | 16 | { |
17 | xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); | 17 | xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); |
18 | xen_start_info->console.domU.mfn = | 18 | xen_start_info->console.domU.mfn = |
@@ -26,19 +26,22 @@ void xen_pre_suspend(void) | |||
26 | BUG(); | 26 | BUG(); |
27 | } | 27 | } |
28 | 28 | ||
29 | void xen_hvm_post_suspend(int suspend_cancelled) | 29 | void xen_arch_hvm_post_suspend(int suspend_cancelled) |
30 | { | 30 | { |
31 | #ifdef CONFIG_XEN_PVHVM | ||
31 | int cpu; | 32 | int cpu; |
32 | xen_hvm_init_shared_info(); | 33 | xen_hvm_init_shared_info(); |
33 | xen_callback_vector(); | 34 | xen_callback_vector(); |
35 | xen_unplug_emulated_devices(); | ||
34 | if (xen_feature(XENFEAT_hvm_safe_pvclock)) { | 36 | if (xen_feature(XENFEAT_hvm_safe_pvclock)) { |
35 | for_each_online_cpu(cpu) { | 37 | for_each_online_cpu(cpu) { |
36 | xen_setup_runstate_info(cpu); | 38 | xen_setup_runstate_info(cpu); |
37 | } | 39 | } |
38 | } | 40 | } |
41 | #endif | ||
39 | } | 42 | } |
40 | 43 | ||
41 | void xen_post_suspend(int suspend_cancelled) | 44 | void xen_arch_post_suspend(int suspend_cancelled) |
42 | { | 45 | { |
43 | xen_build_mfn_list_list(); | 46 | xen_build_mfn_list_list(); |
44 | 47 | ||
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index b2bb5aa3b054..5158c505bef9 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c | |||
@@ -26,8 +26,6 @@ | |||
26 | 26 | ||
27 | #include "xen-ops.h" | 27 | #include "xen-ops.h" |
28 | 28 | ||
29 | #define XEN_SHIFT 22 | ||
30 | |||
31 | /* Xen may fire a timer up to this many ns early */ | 29 | /* Xen may fire a timer up to this many ns early */ |
32 | #define TIMER_SLOP 100000 | 30 | #define TIMER_SLOP 100000 |
33 | #define NS_PER_TICK (1000000000LL / HZ) | 31 | #define NS_PER_TICK (1000000000LL / HZ) |
@@ -135,24 +133,24 @@ static void do_stolen_accounting(void) | |||
135 | 133 | ||
136 | /* Add the appropriate number of ticks of stolen time, | 134 | /* Add the appropriate number of ticks of stolen time, |
137 | including any left-overs from last time. */ | 135 | including any left-overs from last time. */ |
138 | stolen = runnable + offline + __get_cpu_var(xen_residual_stolen); | 136 | stolen = runnable + offline + __this_cpu_read(xen_residual_stolen); |
139 | 137 | ||
140 | if (stolen < 0) | 138 | if (stolen < 0) |
141 | stolen = 0; | 139 | stolen = 0; |
142 | 140 | ||
143 | ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); | 141 | ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); |
144 | __get_cpu_var(xen_residual_stolen) = stolen; | 142 | __this_cpu_write(xen_residual_stolen, stolen); |
145 | account_steal_ticks(ticks); | 143 | account_steal_ticks(ticks); |
146 | 144 | ||
147 | /* Add the appropriate number of ticks of blocked time, | 145 | /* Add the appropriate number of ticks of blocked time, |
148 | including any left-overs from last time. */ | 146 | including any left-overs from last time. */ |
149 | blocked += __get_cpu_var(xen_residual_blocked); | 147 | blocked += __this_cpu_read(xen_residual_blocked); |
150 | 148 | ||
151 | if (blocked < 0) | 149 | if (blocked < 0) |
152 | blocked = 0; | 150 | blocked = 0; |
153 | 151 | ||
154 | ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); | 152 | ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); |
155 | __get_cpu_var(xen_residual_blocked) = blocked; | 153 | __this_cpu_write(xen_residual_blocked, blocked); |
156 | account_idle_ticks(ticks); | 154 | account_idle_ticks(ticks); |
157 | } | 155 | } |
158 | 156 | ||
@@ -211,8 +209,6 @@ static struct clocksource xen_clocksource __read_mostly = { | |||
211 | .rating = 400, | 209 | .rating = 400, |
212 | .read = xen_clocksource_get_cycles, | 210 | .read = xen_clocksource_get_cycles, |
213 | .mask = ~0, | 211 | .mask = ~0, |
214 | .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */ | ||
215 | .shift = XEN_SHIFT, | ||
216 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | 212 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, |
217 | }; | 213 | }; |
218 | 214 | ||
@@ -397,7 +393,9 @@ void xen_setup_timer(int cpu) | |||
397 | name = "<timer kasprintf failed>"; | 393 | name = "<timer kasprintf failed>"; |
398 | 394 | ||
399 | irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, | 395 | irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, |
400 | IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER, | 396 | IRQF_DISABLED|IRQF_PERCPU| |
397 | IRQF_NOBALANCING|IRQF_TIMER| | ||
398 | IRQF_FORCE_RESUME, | ||
401 | name, NULL); | 399 | name, NULL); |
402 | 400 | ||
403 | evt = &per_cpu(xen_clock_events, cpu); | 401 | evt = &per_cpu(xen_clock_events, cpu); |
@@ -426,6 +424,8 @@ void xen_timer_resume(void) | |||
426 | { | 424 | { |
427 | int cpu; | 425 | int cpu; |
428 | 426 | ||
427 | pvclock_resume(); | ||
428 | |||
429 | if (xen_clockevent != &xen_vcpuop_clockevent) | 429 | if (xen_clockevent != &xen_vcpuop_clockevent) |
430 | return; | 430 | return; |
431 | 431 | ||
@@ -435,16 +435,16 @@ void xen_timer_resume(void) | |||
435 | } | 435 | } |
436 | } | 436 | } |
437 | 437 | ||
438 | static const struct pv_time_ops xen_time_ops __initdata = { | 438 | static const struct pv_time_ops xen_time_ops __initconst = { |
439 | .sched_clock = xen_clocksource_read, | 439 | .sched_clock = xen_clocksource_read, |
440 | }; | 440 | }; |
441 | 441 | ||
442 | static __init void xen_time_init(void) | 442 | static void __init xen_time_init(void) |
443 | { | 443 | { |
444 | int cpu = smp_processor_id(); | 444 | int cpu = smp_processor_id(); |
445 | struct timespec tp; | 445 | struct timespec tp; |
446 | 446 | ||
447 | clocksource_register(&xen_clocksource); | 447 | clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC); |
448 | 448 | ||
449 | if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { | 449 | if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { |
450 | /* Successfully turned off 100Hz tick, so we have the | 450 | /* Successfully turned off 100Hz tick, so we have the |
@@ -464,7 +464,7 @@ static __init void xen_time_init(void) | |||
464 | xen_setup_cpu_clockevents(); | 464 | xen_setup_cpu_clockevents(); |
465 | } | 465 | } |
466 | 466 | ||
467 | __init void xen_init_time_ops(void) | 467 | void __init xen_init_time_ops(void) |
468 | { | 468 | { |
469 | pv_time_ops = xen_time_ops; | 469 | pv_time_ops = xen_time_ops; |
470 | 470 | ||
@@ -486,7 +486,7 @@ static void xen_hvm_setup_cpu_clockevents(void) | |||
486 | xen_setup_cpu_clockevents(); | 486 | xen_setup_cpu_clockevents(); |
487 | } | 487 | } |
488 | 488 | ||
489 | __init void xen_hvm_init_time_ops(void) | 489 | void __init xen_hvm_init_time_ops(void) |
490 | { | 490 | { |
491 | /* vector callback is needed otherwise we cannot receive interrupts | 491 | /* vector callback is needed otherwise we cannot receive interrupts |
492 | * on cpu > 0 and at this point we don't know how many cpus are | 492 | * on cpu > 0 and at this point we don't know how many cpus are |
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 1a5ff24e29c0..aaa7291c9259 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S | |||
@@ -28,9 +28,9 @@ ENTRY(startup_xen) | |||
28 | __FINIT | 28 | __FINIT |
29 | 29 | ||
30 | .pushsection .text | 30 | .pushsection .text |
31 | .align PAGE_SIZE_asm | 31 | .align PAGE_SIZE |
32 | ENTRY(hypercall_page) | 32 | ENTRY(hypercall_page) |
33 | .skip PAGE_SIZE_asm | 33 | .skip PAGE_SIZE |
34 | .popsection | 34 | .popsection |
35 | 35 | ||
36 | ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") | 36 | ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") |
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 7c8ab86163e9..97dfdc8757b3 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h | |||
@@ -30,6 +30,9 @@ void xen_setup_machphys_mapping(void); | |||
30 | pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); | 30 | pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); |
31 | void xen_ident_map_ISA(void); | 31 | void xen_ident_map_ISA(void); |
32 | void xen_reserve_top(void); | 32 | void xen_reserve_top(void); |
33 | extern unsigned long xen_max_p2m_pfn; | ||
34 | |||
35 | void xen_set_pat(u64); | ||
33 | 36 | ||
34 | char * __init xen_memory_setup(void); | 37 | char * __init xen_memory_setup(void); |
35 | void __init xen_arch_setup(void); | 38 | void __init xen_arch_setup(void); |
@@ -40,7 +43,7 @@ void xen_vcpu_restore(void); | |||
40 | 43 | ||
41 | void xen_callback_vector(void); | 44 | void xen_callback_vector(void); |
42 | void xen_hvm_init_shared_info(void); | 45 | void xen_hvm_init_shared_info(void); |
43 | void __init xen_unplug_emulated_devices(void); | 46 | void xen_unplug_emulated_devices(void); |
44 | 47 | ||
45 | void __init xen_build_dynamic_phys_to_machine(void); | 48 | void __init xen_build_dynamic_phys_to_machine(void); |
46 | 49 | ||
@@ -61,15 +64,17 @@ void xen_setup_vcpu_info_placement(void); | |||
61 | 64 | ||
62 | #ifdef CONFIG_SMP | 65 | #ifdef CONFIG_SMP |
63 | void xen_smp_init(void); | 66 | void xen_smp_init(void); |
67 | void __init xen_hvm_smp_init(void); | ||
64 | 68 | ||
65 | extern cpumask_var_t xen_cpu_initialized_map; | 69 | extern cpumask_var_t xen_cpu_initialized_map; |
66 | #else | 70 | #else |
67 | static inline void xen_smp_init(void) {} | 71 | static inline void xen_smp_init(void) {} |
72 | static inline void xen_hvm_smp_init(void) {} | ||
68 | #endif | 73 | #endif |
69 | 74 | ||
70 | #ifdef CONFIG_PARAVIRT_SPINLOCKS | 75 | #ifdef CONFIG_PARAVIRT_SPINLOCKS |
71 | void __init xen_init_spinlocks(void); | 76 | void __init xen_init_spinlocks(void); |
72 | __cpuinit void xen_init_lock_cpu(int cpu); | 77 | void __cpuinit xen_init_lock_cpu(int cpu); |
73 | void xen_uninit_lock_cpu(int cpu); | 78 | void xen_uninit_lock_cpu(int cpu); |
74 | #else | 79 | #else |
75 | static inline void xen_init_spinlocks(void) | 80 | static inline void xen_init_spinlocks(void) |