author		Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
committer	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
commit		c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree		ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /arch/x86/xen
parent		ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent		6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
	litmus/sched_cedf.c
Diffstat (limited to 'arch/x86/xen')
-rw-r--r--	arch/x86/xen/Kconfig			|  32
-rw-r--r--	arch/x86/xen/Makefile			|   3
-rw-r--r--	arch/x86/xen/debugfs.c			|   1
-rw-r--r--	arch/x86/xen/enlighten.c		| 152
-rw-r--r--	arch/x86/xen/irq.c			|   4
-rw-r--r--	arch/x86/xen/mmu.c			| 870
-rw-r--r--	arch/x86/xen/mmu.h			|  38
-rw-r--r--	arch/x86/xen/multicalls.c		|  12
-rw-r--r--	arch/x86/xen/multicalls.h		|   2
-rw-r--r--	arch/x86/xen/p2m.c			| 859
-rw-r--r--	arch/x86/xen/pci-swiotlb-xen.c		|  11
-rw-r--r--	arch/x86/xen/platform-pci-unplug.c	|   2
-rw-r--r--	arch/x86/xen/setup.c			| 241
-rw-r--r--	arch/x86/xen/smp.c			|  90
-rw-r--r--	arch/x86/xen/spinlock.c			|  10
-rw-r--r--	arch/x86/xen/suspend.c			|   9
-rw-r--r--	arch/x86/xen/time.c			|  28
-rw-r--r--	arch/x86/xen/xen-head.S			|   4
-rw-r--r--	arch/x86/xen/xen-ops.h			|   9
19 files changed, 1749 insertions(+), 628 deletions(-)
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 68128a1b401a..5cc821cb2e09 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -13,25 +13,33 @@ config XEN
 	  kernel to boot in a paravirtualized environment under the
 	  Xen hypervisor.
 
+config XEN_DOM0
+	def_bool y
+	depends on XEN && PCI_XEN && SWIOTLB_XEN
+	depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI
+
+# Dummy symbol since people have come to rely on the PRIVILEGED_GUEST
+# name in tools.
+config XEN_PRIVILEGED_GUEST
+	def_bool XEN_DOM0
+
 config XEN_PVHVM
 	def_bool y
 	depends on XEN
 	depends on X86_LOCAL_APIC
 
 config XEN_MAX_DOMAIN_MEMORY
-	int "Maximum allowed size of a domain in gigabytes"
-	default 8 if X86_32
-	default 32 if X86_64
+	int
+	default 128
 	depends on XEN
 	help
-	  The pseudo-physical to machine address array is sized
-	  according to the maximum possible memory size of a Xen
-	  domain. This array uses 1 page per gigabyte, so there's no
-	  need to be too stingy here.
+	  This only affects the sizing of some bss arrays, the unused
+	  portions of which are freed.
 
 config XEN_SAVE_RESTORE
 	bool
-	depends on XEN && PM
+	depends on XEN
+	select HIBERNATE_CALLBACKS
 	default y
 
 config XEN_DEBUG_FS
@@ -41,3 +49,11 @@ config XEN_DEBUG_FS
 	help
 	  Enable statistics output and various tuning options in debugfs.
 	  Enabling this option may incur a significant performance overhead.
+
+config XEN_DEBUG
+	bool "Enable Xen debug checks"
+	depends on XEN
+	default n
+	help
+	  Enable various WARN_ON checks in the Xen MMU code.
+	  Enabling this option WILL incur a significant performance overhead.
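The upshot of the Kconfig hunk: dom0 support stops being a user-visible choice and becomes a def_bool derived from the PCI/SWIOTLB/APIC/ACPI dependencies, with XEN_PRIVILEGED_GUEST kept purely as an alias for external tools. A toy C model of how such a derived symbol evaluates (all names here are illustrative stand-ins for the Kconfig logic, not code from this patch):

	#include <stdbool.h>
	#include <stdio.h>

	int main(void)
	{
		/* Kconfig inputs, modelled as booleans */
		bool XEN = true, PCI_XEN = true, SWIOTLB_XEN = true;
		bool X86_LOCAL_APIC = true, X86_IO_APIC = true, ACPI = true, PCI = true;

		/* def_bool y plus the two "depends on" lines */
		bool XEN_DOM0 = XEN && PCI_XEN && SWIOTLB_XEN &&
				X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI;

		/* dummy alias kept for tools that grep for PRIVILEGED_GUEST */
		bool XEN_PRIVILEGED_GUEST = XEN_DOM0;

		printf("XEN_DOM0=%d XEN_PRIVILEGED_GUEST=%d\n",
		       XEN_DOM0, XEN_PRIVILEGED_GUEST);
		return 0;
	}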
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 779385158915..17c565de3d64 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -12,7 +12,8 @@ CFLAGS_mmu.o := $(nostackp)
 
 obj-y		:= enlighten.o setup.o multicalls.o mmu.o irq.o \
 			time.o xen-asm.o xen-asm_$(BITS).o \
-			grant-table.o suspend.o platform-pci-unplug.o
+			grant-table.o suspend.o platform-pci-unplug.o \
+			p2m.o
 
 obj-$(CONFIG_SMP)		+= smp.o
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index 1304bcec8ee5..7c0fedd98ea0 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -106,6 +106,7 @@ static const struct file_operations u32_array_fops = {
 	.open	= u32_array_open,
 	.release= xen_array_release,
 	.read	= u32_array_read,
+	.llseek = no_llseek,
 };
 
 struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 7d46c8441418..5525163a0398 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -30,6 +30,7 @@
 #include <linux/console.h>
 #include <linux/pci.h>
 #include <linux/gfp.h>
+#include <linux/memblock.h>
 
 #include <xen/xen.h>
 #include <xen/interface/xen.h>
@@ -45,6 +46,7 @@
 #include <asm/paravirt.h>
 #include <asm/apic.h>
 #include <asm/page.h>
+#include <asm/xen/pci.h>
 #include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/fixmap.h>
@@ -58,7 +60,6 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/reboot.h>
-#include <asm/setup.h>
 #include <asm/stackprotector.h>
 #include <asm/hypervisor.h>
 
@@ -74,6 +75,11 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
 enum xen_domain_type xen_domain_type = XEN_NATIVE;
 EXPORT_SYMBOL_GPL(xen_domain_type);
 
+unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
+EXPORT_SYMBOL(machine_to_phys_mapping);
+unsigned int machine_to_phys_order;
+EXPORT_SYMBOL(machine_to_phys_order);
+
 struct start_info *xen_start_info;
 EXPORT_SYMBOL_GPL(xen_start_info);
 
@@ -135,9 +141,6 @@ static void xen_vcpu_setup(int cpu)
 	info.mfn = arbitrary_virt_to_mfn(vcpup);
 	info.offset = offset_in_page(vcpup);
 
-	printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
-	       cpu, vcpup, info.mfn, info.offset);
-
 	/* Check to see if the hypervisor will put the vcpu_info
 	   structure where we want it, which allows direct access via
 	   a percpu-variable. */
@@ -151,9 +154,6 @@ static void xen_vcpu_setup(int cpu)
 		/* This cpu is using the registered vcpu info, even if
 		   later ones fail to. */
 		per_cpu(xen_vcpu, cpu) = vcpup;
-
-		printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
-		       cpu, vcpup);
 	}
 }
 
@@ -235,37 +235,31 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
 	*dx &= maskedx;
 }
 
-static __init void xen_init_cpuid_mask(void)
+static void __init xen_init_cpuid_mask(void)
 {
 	unsigned int ax, bx, cx, dx;
+	unsigned int xsave_mask;
 
 	cpuid_leaf1_edx_mask =
 		~((1 << X86_FEATURE_MCE)  |  /* disable MCE */
 		  (1 << X86_FEATURE_MCA)  |  /* disable MCA */
+		  (1 << X86_FEATURE_MTRR) |  /* disable MTRR */
 		  (1 << X86_FEATURE_ACC));   /* thermal monitoring */
 
 	if (!xen_initial_domain())
 		cpuid_leaf1_edx_mask &=
 			~((1 << X86_FEATURE_APIC) |  /* disable local APIC */
 			  (1 << X86_FEATURE_ACPI));  /* disable ACPI */
-
 	ax = 1;
-	cx = 0;
 	xen_cpuid(&ax, &bx, &cx, &dx);
 
-	/* cpuid claims we support xsave; try enabling it to see what happens */
-	if (cx & (1 << (X86_FEATURE_XSAVE % 32))) {
-		unsigned long cr4;
-
-		set_in_cr4(X86_CR4_OSXSAVE);
-
-		cr4 = read_cr4();
-
-		if ((cr4 & X86_CR4_OSXSAVE) == 0)
-			cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32));
-
-		clear_in_cr4(X86_CR4_OSXSAVE);
-	}
+	xsave_mask =
+		(1 << (X86_FEATURE_XSAVE % 32)) |
+		(1 << (X86_FEATURE_OSXSAVE % 32));
+
+	/* Xen will set CR4.OSXSAVE if supported and not disabled by force */
+	if ((cx & xsave_mask) != xsave_mask)
+		cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
 }
 
 static void xen_set_debugreg(int reg, unsigned long val)
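The rewritten XSAVE handling is plain mask arithmetic on CPUID leaf 1 ECX: unless the hypervisor reports both XSAVE and OSXSAVE, both bits are hidden from the guest. A freestanding sketch of the same check (bits 26 and 27 are the architectural leaf-1 ECX positions that X86_FEATURE_XSAVE % 32 and X86_FEATURE_OSXSAVE % 32 evaluate to; the ecx value below is a made-up example):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t ecx = 0x04000000;                /* example: XSAVE set, OSXSAVE clear */
		uint32_t mask = (1u << 26) | (1u << 27);  /* XSAVE | OSXSAVE in leaf-1 ECX */
		uint32_t leaf1_ecx_mask = ~0u;

		/* Hide both features unless the hypervisor reports both. */
		if ((ecx & mask) != mask)
			leaf1_ecx_mask &= ~mask;

		printf("XSAVE visible to guest: %s\n",
		       (ecx & leaf1_ecx_mask & (1u << 26)) ? "yes" : "no");
		return 0;
	}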
@@ -406,7 +400,7 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
 /*
  * load_gdt for early boot, when the gdt is only mapped once
  */
-static __init void xen_load_gdt_boot(const struct desc_ptr *dtr)
+static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
 {
 	unsigned long va = dtr->address;
 	unsigned int size = dtr->size + 1;
@@ -573,8 +567,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
 
 	preempt_disable();
 
-	start = __get_cpu_var(idt_desc).address;
-	end = start + __get_cpu_var(idt_desc).size + 1;
+	start = __this_cpu_read(idt_desc.address);
+	end = start + __this_cpu_read(idt_desc.size) + 1;
 
 	xen_mc_flush();
 
@@ -668,7 +662,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
  * Version of write_gdt_entry for use at early boot-time needed to
  * update an entry as simply as possible.
  */
-static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
+static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
 					    const void *desc, int type)
 {
 	switch (type) {
@@ -835,6 +829,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
 	   Xen console noise. */
 		break;
 
+	case MSR_IA32_CR_PAT:
+		if (smp_processor_id() == 0)
+			xen_set_pat(((u64)high << 32) | low);
+		break;
+
 	default:
 		ret = native_write_msr_safe(msr, low, high);
 	}
@@ -873,8 +872,6 @@ void xen_setup_vcpu_info_placement(void)
 	/* xen_vcpu_setup managed to place the vcpu_info within the
 	   percpu area for all cpus, so make use of it */
 	if (have_vcpu_info_placement) {
-		printk(KERN_INFO "Xen: using vcpu_info placement\n");
-
 		pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
 		pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
 		pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
@@ -936,18 +933,18 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
 	return ret;
 }
 
-static const struct pv_info xen_info __initdata = {
+static const struct pv_info xen_info __initconst = {
 	.paravirt_enabled = 1,
 	.shared_kernel_pmd = 0,
 
 	.name = "Xen",
 };
 
-static const struct pv_init_ops xen_init_ops __initdata = {
+static const struct pv_init_ops xen_init_ops __initconst = {
 	.patch = xen_patch,
 };
 
-static const struct pv_cpu_ops xen_cpu_ops __initdata = {
+static const struct pv_cpu_ops xen_cpu_ops __initconst = {
 	.cpuid = xen_cpuid,
 
 	.set_debugreg = xen_set_debugreg,
@@ -1007,7 +1004,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 	.end_context_switch = xen_end_context_switch,
 };
 
-static const struct pv_apic_ops xen_apic_ops __initdata = {
+static const struct pv_apic_ops xen_apic_ops __initconst = {
 #ifdef CONFIG_X86_LOCAL_APIC
 	.startup_ipi_hook = paravirt_nop,
 #endif
@@ -1017,10 +1014,6 @@ static void xen_reboot(int reason)
 {
 	struct sched_shutdown r = { .reason = reason };
 
-#ifdef CONFIG_SMP
-	smp_send_stop();
-#endif
-
 	if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
 		BUG();
 }
@@ -1040,6 +1033,13 @@ static void xen_machine_halt(void)
 	xen_reboot(SHUTDOWN_poweroff);
 }
 
+static void xen_machine_power_off(void)
+{
+	if (pm_power_off)
+		pm_power_off();
+	xen_reboot(SHUTDOWN_poweroff);
+}
+
 static void xen_crash_shutdown(struct pt_regs *regs)
 {
 	xen_reboot(SHUTDOWN_crash);
@@ -1062,10 +1062,10 @@ int xen_panic_handler_init(void)
 	return 0;
 }
 
-static const struct machine_ops __initdata xen_machine_ops = {
+static const struct machine_ops xen_machine_ops __initconst = {
 	.restart = xen_restart,
 	.halt = xen_machine_halt,
-	.power_off = xen_machine_halt,
+	.power_off = xen_machine_power_off,
 	.shutdown = xen_machine_halt,
 	.crash_shutdown = xen_crash_shutdown,
 	.emergency_restart = xen_emergency_restart,
@@ -1091,6 +1091,8 @@ static void __init xen_setup_stackprotector(void)
 /* First C function to be called on Xen boot */
 asmlinkage void __init xen_start_kernel(void)
 {
+	struct physdev_set_iopl set_iopl;
+	int rc;
 	pgd_t *pgd;
 
 	if (!xen_start_info)
@@ -1098,6 +1100,8 @@ asmlinkage void __init xen_start_kernel(void)
 
 	xen_domain_type = XEN_PV_DOMAIN;
 
+	xen_setup_machphys_mapping();
+
 	/* Install Xen paravirt ops */
 	pv_info = xen_info;
 	pv_init_ops = xen_init_ops;
@@ -1170,6 +1174,15 @@ asmlinkage void __init xen_start_kernel(void)
 
 	xen_smp_init();
 
+#ifdef CONFIG_ACPI_NUMA
+	/*
+	 * The pages we get from Xen are not related to machine pages, so
+	 * any NUMA information the kernel tries to get from ACPI will
+	 * be meaningless.  Prevent it from trying.
+	 */
+	acpi_numa = -1;
+#endif
+
 	pgd = (pgd_t *)xen_start_info->pt_base;
 
 	if (!xen_initial_domain())
@@ -1181,12 +1194,16 @@ asmlinkage void __init xen_start_kernel(void)
 	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
 
 	local_irq_disable();
-	early_boot_irqs_off();
+	early_boot_irqs_disabled = true;
+
+	memblock_init();
 
 	xen_raw_console_write("mapping kernel into physical memory\n");
 	pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
+	xen_ident_map_ISA();
 
-	init_mm.pgd = pgd;
+	/* Allocate and initialize top and mid mfn levels for p2m structure */
+	xen_build_mfn_list_list();
 
 	/* keep using Xen gdt for now; no urgent need to change it */
 
@@ -1197,10 +1214,18 @@ asmlinkage void __init xen_start_kernel(void)
 #else
 	pv_info.kernel_rpl = 0;
 #endif
-
 	/* set the limit of our address space */
 	xen_reserve_top();
 
+	/* We used to do this in xen_arch_setup, but that is too late on AMD
+	 * where early_cpu_init (run before ->arch_setup()) calls early_amd_init
+	 * which pokes the 0xcf8 port.
+	 */
+	set_iopl.iopl = 1;
+	rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+	if (rc != 0)
+		xen_raw_printk("physdev_op failed %d\n", rc);
+
 #ifdef CONFIG_X86_32
 	/* set up basic CPUID stuff */
 	cpu_detect(&new_cpu_data);
@@ -1220,6 +1245,8 @@ asmlinkage void __init xen_start_kernel(void)
 		add_preferred_console("xenboot", 0, NULL);
 		add_preferred_console("tty", 0, NULL);
 		add_preferred_console("hvc", 0, NULL);
+		if (pci_xen)
+			x86_init.pci.arch_init = pci_xen_init;
 	} else {
 		/* Make sure ACS will be enabled */
 		pci_request_acs();
@@ -1238,25 +1265,6 @@ asmlinkage void __init xen_start_kernel(void)
 #endif
 }
 
-static uint32_t xen_cpuid_base(void)
-{
-	uint32_t base, eax, ebx, ecx, edx;
-	char signature[13];
-
-	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
-		cpuid(base, &eax, &ebx, &ecx, &edx);
-		*(uint32_t *)(signature + 0) = ebx;
-		*(uint32_t *)(signature + 4) = ecx;
-		*(uint32_t *)(signature + 8) = edx;
-		signature[12] = 0;
-
-		if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
-			return base;
-	}
-
-	return 0;
-}
-
 static int init_hvm_pv_info(int *major, int *minor)
 {
 	uint32_t eax, ebx, ecx, edx, pages, msr, base;
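The deleted helper scanned the hypervisor CPUID range for Xen's signature; init_hvm_pv_info() below still uses a detected base leaf, so an equivalent presumably lives in shared code after this change. The detection logic itself is part of the Xen ABI and can be reproduced as a freestanding sketch (userspace, using GCC's cpuid.h):

	#include <cpuid.h>
	#include <stdint.h>
	#include <string.h>

	/* Sketch of the removed helper: scan the hypervisor CPUID leaves
	 * (0x40000000..0x4000ff00, stepping by 0x100) for the Xen signature
	 * in EBX/ECX/EDX.  Returns the base leaf, or 0 if not found. */
	static uint32_t find_xen_cpuid_base(void)
	{
		uint32_t base, eax, ebx, ecx, edx;
		char sig[13];

		for (base = 0x40000000; base < 0x40010000; base += 0x100) {
			__cpuid(base, eax, ebx, ecx, edx);
			memcpy(sig + 0, &ebx, 4);
			memcpy(sig + 4, &ecx, 4);
			memcpy(sig + 8, &edx, 4);
			sig[12] = 0;

			/* eax - base >= 2: leaves base+1 and base+2 exist too */
			if (!strcmp(sig, "XenVMMXenVMM") && (eax - base) >= 2)
				return base;
		}
		return 0;
	}

	int main(void)
	{
		return find_xen_cpuid_base() ? 0 : 1;
	}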
@@ -1276,15 +1284,14 @@ static int init_hvm_pv_info(int *major, int *minor)
 
 	xen_setup_features();
 
-	pv_info = xen_info;
-	pv_info.kernel_rpl = 0;
+	pv_info.name = "Xen HVM";
 
 	xen_domain_type = XEN_HVM_DOMAIN;
 
 	return 0;
 }
 
-void xen_hvm_init_shared_info(void)
+void __ref xen_hvm_init_shared_info(void)
 {
 	int cpu;
 	struct xen_add_to_physmap xatp;
@@ -1323,6 +1330,8 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
 	switch (action) {
 	case CPU_UP_PREPARE:
 		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+		if (xen_have_vector_callback)
+			xen_init_lock_cpu(cpu);
 		break;
 	default:
 		break;
@@ -1330,7 +1339,7 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = {
+static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
 	.notifier_call	= xen_hvm_cpu_notify,
 };
 
@@ -1347,6 +1356,7 @@ static void __init xen_hvm_guest_init(void)
 
 	if (xen_feature(XENFEAT_hvm_callback_vector))
 		xen_have_vector_callback = 1;
+	xen_hvm_smp_init();
 	register_cpu_notifier(&xen_hvm_cpu_notifier);
 	xen_unplug_emulated_devices();
 	have_vcpu_info_placement = 0;
@@ -1366,7 +1376,19 @@ static bool __init xen_hvm_platform(void)
 	return true;
 }
 
-const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = {
+bool xen_hvm_need_lapic(void)
+{
+	if (xen_pv_domain())
+		return false;
+	if (!xen_hvm_domain())
+		return false;
+	if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback)
+		return false;
+	return true;
+}
+EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
+
+const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = {
 	.name			= "Xen HVM",
 	.detect			= xen_hvm_platform,
 	.init_platform		= xen_hvm_guest_init,
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 9d30105a0c4a..8bbb465b6f0a 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -113,7 +113,7 @@ static void xen_halt(void)
 	xen_safe_halt();
 }
 
-static const struct pv_irq_ops xen_irq_ops __initdata = {
+static const struct pv_irq_ops xen_irq_ops __initconst = {
 	.save_fl = PV_CALLEE_SAVE(xen_save_fl),
 	.restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
 	.irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
@@ -126,7 +126,7 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
 #endif
 };
 
-void __init xen_init_irq_ops()
+void __init xen_init_irq_ops(void)
 {
 	pv_irq_ops = xen_irq_ops;
 	x86_init.irqs.intr_init = xen_init_IRQ;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 42086ac406af..0ccccb67a993 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -45,6 +45,8 @@
 #include <linux/vmalloc.h>
 #include <linux/module.h>
 #include <linux/gfp.h>
+#include <linux/memblock.h>
+#include <linux/seq_file.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -55,6 +57,9 @@
 #include <asm/e820.h>
 #include <asm/linkage.h>
 #include <asm/page.h>
+#include <asm/init.h>
+#include <asm/pat.h>
+#include <asm/smp.h>
 
 #include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>
@@ -71,74 +76,19 @@
 #include "mmu.h"
 #include "debugfs.h"
 
-#define MMU_UPDATE_HISTO	30
-
 /*
  * Protects atomic reservation decrease/increase against concurrent increases.
- * Also protects non-atomic updates of current_pages and driver_pages, and
- * balloon lists.
+ * Also protects non-atomic updates of current_pages and balloon lists.
  */
 DEFINE_SPINLOCK(xen_reservation_lock);
 
-#ifdef CONFIG_XEN_DEBUG_FS
-
-static struct {
-	u32 pgd_update;
-	u32 pgd_update_pinned;
-	u32 pgd_update_batched;
-
-	u32 pud_update;
-	u32 pud_update_pinned;
-	u32 pud_update_batched;
-
-	u32 pmd_update;
-	u32 pmd_update_pinned;
-	u32 pmd_update_batched;
-
-	u32 pte_update;
-	u32 pte_update_pinned;
-	u32 pte_update_batched;
-
-	u32 mmu_update;
-	u32 mmu_update_extended;
-	u32 mmu_update_histo[MMU_UPDATE_HISTO];
-
-	u32 prot_commit;
-	u32 prot_commit_batched;
-
-	u32 set_pte_at;
-	u32 set_pte_at_batched;
-	u32 set_pte_at_pinned;
-	u32 set_pte_at_current;
-	u32 set_pte_at_kernel;
-} mmu_stats;
-
-static u8 zero_stats;
-
-static inline void check_zero(void)
-{
-	if (unlikely(zero_stats)) {
-		memset(&mmu_stats, 0, sizeof(mmu_stats));
-		zero_stats = 0;
-	}
-}
-
-#define ADD_STATS(elem, val)			\
-	do { check_zero(); mmu_stats.elem += (val); } while(0)
-
-#else  /* !CONFIG_XEN_DEBUG_FS */
-
-#define ADD_STATS(elem, val)	do { (void)(val); } while(0)
-
-#endif /* CONFIG_XEN_DEBUG_FS */
-
-
 /*
  * Identity map, in addition to plain kernel map.  This needs to be
  * large enough to allocate page table pages to allocate the rest.
  * Each page can map 2MB.
  */
-static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
+#define LEVEL1_IDENT_ENTRIES	(PTRS_PER_PTE * 4)
+static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
 
 #ifdef CONFIG_X86_64
 /* l3 pud for userspace vsyscall mapping */
@@ -169,160 +119,6 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
  */
 #define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
 
-
-#define P2M_ENTRIES_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long))
-#define TOP_ENTRIES		(MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
-
-/* Placeholder for holes in the address space */
-static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
-		{ [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
-
- /* Array of pointers to pages containing p2m entries */
-static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
-		{ [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
-
-/* Arrays of p2m arrays expressed in mfns used for save/restore */
-static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
-
-static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
-	__page_aligned_bss;
-
-static inline unsigned p2m_top_index(unsigned long pfn)
-{
-	BUG_ON(pfn >= MAX_DOMAIN_PAGES);
-	return pfn / P2M_ENTRIES_PER_PAGE;
-}
-
-static inline unsigned p2m_index(unsigned long pfn)
-{
-	return pfn % P2M_ENTRIES_PER_PAGE;
-}
-
-/* Build the parallel p2m_top_mfn structures */
-void xen_build_mfn_list_list(void)
-{
-	unsigned pfn, idx;
-
-	for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
-		unsigned topidx = p2m_top_index(pfn);
-
-		p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
-	}
-
-	for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
-		unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
-		p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
-	}
-}
-
-void xen_setup_mfn_list_list(void)
-{
-	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
-
-	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
-		virt_to_mfn(p2m_top_mfn_list);
-	HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
-}
-
-/* Set up p2m_top to point to the domain-builder provided p2m pages */
-void __init xen_build_dynamic_phys_to_machine(void)
-{
-	unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
-	unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
-	unsigned pfn;
-
-	for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
-		unsigned topidx = p2m_top_index(pfn);
-
-		p2m_top[topidx] = &mfn_list[pfn];
-	}
-
-	xen_build_mfn_list_list();
-}
-
-unsigned long get_phys_to_machine(unsigned long pfn)
-{
-	unsigned topidx, idx;
-
-	if (unlikely(pfn >= MAX_DOMAIN_PAGES))
-		return INVALID_P2M_ENTRY;
-
-	topidx = p2m_top_index(pfn);
-	idx = p2m_index(pfn);
-	return p2m_top[topidx][idx];
-}
-EXPORT_SYMBOL_GPL(get_phys_to_machine);
-
-/* install a new p2m_top page */
-bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
-{
-	unsigned topidx = p2m_top_index(pfn);
-	unsigned long **pfnp, *mfnp;
-	unsigned i;
-
-	pfnp = &p2m_top[topidx];
-	mfnp = &p2m_top_mfn[topidx];
-
-	for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
-		p[i] = INVALID_P2M_ENTRY;
-
-	if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
-		*mfnp = virt_to_mfn(p);
-		return true;
-	}
-
-	return false;
-}
-
-static void alloc_p2m(unsigned long pfn)
-{
-	unsigned long *p;
-
-	p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
-	BUG_ON(p == NULL);
-
-	if (!install_p2mtop_page(pfn, p))
-		free_page((unsigned long)p);
-}
-
-/* Try to install p2m mapping; fail if intermediate bits missing */
-bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
-{
-	unsigned topidx, idx;
-
-	if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
-		BUG_ON(mfn != INVALID_P2M_ENTRY);
-		return true;
-	}
-
-	topidx = p2m_top_index(pfn);
-	if (p2m_top[topidx] == p2m_missing) {
-		if (mfn == INVALID_P2M_ENTRY)
-			return true;
-		return false;
-	}
-
-	idx = p2m_index(pfn);
-	p2m_top[topidx][idx] = mfn;
-
-	return true;
-}
-
-void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
-{
-	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
-		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
-		return;
-	}
-
-	if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
-		alloc_p2m(pfn);
-
-		if (!__set_phys_to_machine(pfn, mfn))
-			BUG();
-	}
-}
-
 unsigned long arbitrary_virt_to_mfn(void *vaddr)
 {
 	xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
@@ -351,6 +147,7 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 	offset = address & ~PAGE_MASK;
 	return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
 }
+EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
 
 void make_lowmem_page_readonly(void *vaddr)
 {
@@ -359,7 +156,8 @@ void make_lowmem_page_readonly(void *vaddr)
 	unsigned int level;
 
 	pte = lookup_address(address, &level);
-	BUG_ON(pte == NULL);
+	if (pte == NULL)
+		return;		/* vaddr missing */
 
 	ptev = pte_wrprotect(*pte);
 
@@ -374,7 +172,8 @@ void make_lowmem_page_readwrite(void *vaddr)
 	unsigned int level;
 
 	pte = lookup_address(address, &level);
-	BUG_ON(pte == NULL);
+	if (pte == NULL)
+		return;		/* vaddr missing */
 
 	ptev = pte_mkwrite(*pte);
 
@@ -390,12 +189,7 @@ static bool xen_page_pinned(void *ptr)
 	return PagePinned(page);
 }
 
-static bool xen_iomap_pte(pte_t pte)
-{
-	return pte_flags(pte) & _PAGE_IOMAP;
-}
-
-static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
+void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
 {
 	struct multicall_space mcs;
 	struct mmu_update *u;
@@ -404,13 +198,14 @@ static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
 	u = mcs.args;
 
 	/* ptep might be kmapped when using 32-bit HIGHPTE */
-	u->ptr = arbitrary_virt_to_machine(ptep).maddr;
+	u->ptr = virt_to_machine(ptep).maddr;
 	u->val = pte_val_ma(pteval);
 
-	MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO);
+	MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
 
 	xen_mc_issue(PARAVIRT_LAZY_MMU);
 }
+EXPORT_SYMBOL_GPL(xen_set_domain_pte);
 
 static void xen_extend_mmu_update(const struct mmu_update *update)
 {
@@ -420,27 +215,17 @@ static void xen_extend_mmu_update(const struct mmu_update *update)
 	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
 
 	if (mcs.mc != NULL) {
-		ADD_STATS(mmu_update_extended, 1);
-		ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
-
 		mcs.mc->args[1]++;
-
-		if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
-			ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
-		else
-			ADD_STATS(mmu_update_histo[0], 1);
 	} else {
-		ADD_STATS(mmu_update, 1);
 		mcs = __xen_mc_entry(sizeof(*u));
 		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
-		ADD_STATS(mmu_update_histo[1], 1);
 	}
 
 	u = mcs.args;
 	*u = *update;
 }
 
-void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
+static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 {
 	struct mmu_update u;
 
@@ -453,17 +238,13 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 	u.val = pmd_val_ma(val);
 	xen_extend_mmu_update(&u);
 
-	ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
-
 	xen_mc_issue(PARAVIRT_LAZY_MMU);
 
 	preempt_enable();
 }
 
-void xen_set_pmd(pmd_t *ptr, pmd_t val)
+static void xen_set_pmd(pmd_t *ptr, pmd_t val)
 {
-	ADD_STATS(pmd_update, 1);
-
 	/* If page is not pinned, we can just update the entry
 	   directly */
 	if (!xen_page_pinned(ptr)) {
@@ -471,8 +252,6 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
 		return;
 	}
 
-	ADD_STATS(pmd_update_pinned, 1);
-
 	xen_set_pmd_hyper(ptr, val);
 }
 
@@ -485,35 +264,34 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 	set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
 }
 
-void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
-		    pte_t *ptep, pte_t pteval)
+static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
 {
-	if (xen_iomap_pte(pteval)) {
-		xen_set_iomap_pte(ptep, pteval);
-		goto out;
-	}
+	struct mmu_update u;
 
-	ADD_STATS(set_pte_at, 1);
-//	ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
-	ADD_STATS(set_pte_at_current, mm == current->mm);
-	ADD_STATS(set_pte_at_kernel, mm == &init_mm);
+	if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
+		return false;
 
-	if (mm == current->mm || mm == &init_mm) {
-		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
-			struct multicall_space mcs;
-			mcs = xen_mc_entry(0);
+	xen_mc_batch();
 
-			MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
-			ADD_STATS(set_pte_at_batched, 1);
-			xen_mc_issue(PARAVIRT_LAZY_MMU);
-			goto out;
-		} else
-			if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
-				goto out;
-	}
-	xen_set_pte(ptep, pteval);
+	u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
+	u.val = pte_val_ma(pteval);
+	xen_extend_mmu_update(&u);
 
-out:	return;
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+	return true;
+}
+
+static void xen_set_pte(pte_t *ptep, pte_t pteval)
+{
+	if (!xen_batched_set_pte(ptep, pteval))
+		native_set_pte(ptep, pteval);
+}
+
+static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+		    pte_t *ptep, pte_t pteval)
+{
+	xen_set_pte(ptep, pteval);
 }
 
 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
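xen_batched_set_pte() only claims a PTE write when a lazy-MMU batch is open; everything else falls back to native stores. The point of the pattern is that N queued mmu_update entries cost one hypervisor trap instead of N. A userspace toy model of that batching behaviour (illustrative only; the real queue lives in multicalls.c):

	/* Toy model of lazy-MMU batching: updates are queued and "issued"
	 * (one simulated hypercall) only when the batch is flushed. */
	#include <stdio.h>

	struct mmu_update { unsigned long ptr, val; };

	static struct mmu_update batch[8];
	static int batched, lazy;

	static void issue(void)
	{
		if (batched)
			printf("hypercall: %d update(s) in one trap\n", batched);
		batched = 0;
	}

	static void queue_update(unsigned long ptr, unsigned long val)
	{
		batch[batched++] = (struct mmu_update){ ptr, val };
		if (!lazy || batched == 8)	/* flush at once outside lazy mode */
			issue();
	}

	int main(void)
	{
		lazy = 1;			/* enter lazy MMU mode (modelled) */
		for (int i = 0; i < 4; i++)
			queue_update(0x1000 + i * 8, i);
		lazy = 0;
		issue();			/* leave lazy mode: one trap for all 4 */
		return 0;
	}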
@@ -530,13 +308,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 
 	xen_mc_batch();
 
-	u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
+	u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
 	u.val = pte_val_ma(pte);
 	xen_extend_mmu_update(&u);
 
-	ADD_STATS(prot_commit, 1);
-	ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
-
 	xen_mc_issue(PARAVIRT_LAZY_MMU);
 }
 
@@ -557,7 +332,34 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
 	if (val & _PAGE_PRESENT) {
 		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 		pteval_t flags = val & PTE_FLAGS_MASK;
-		val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
+		unsigned long mfn;
+
+		if (!xen_feature(XENFEAT_auto_translated_physmap))
+			mfn = get_phys_to_machine(pfn);
+		else
+			mfn = pfn;
+		/*
+		 * If there's no mfn for the pfn, then just create an
+		 * empty non-present pte.  Unfortunately this loses
+		 * information about the original pfn, so
+		 * pte_mfn_to_pfn is asymmetric.
+		 */
+		if (unlikely(mfn == INVALID_P2M_ENTRY)) {
+			mfn = 0;
+			flags = 0;
+		} else {
+			/*
+			 * Paramount to do this test _after_ the
+			 * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
+			 * IDENTITY_FRAME_BIT resolves to true.
+			 */
+			mfn &= ~FOREIGN_FRAME_BIT;
+			if (mfn & IDENTITY_FRAME_BIT) {
+				mfn &= ~IDENTITY_FRAME_BIT;
+				flags |= _PAGE_IOMAP;
+			}
+		}
+		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
 	}
 
 	return val;
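The decode order above matters because INVALID_P2M_ENTRY is all-ones, so any flag-bit test on it would spuriously succeed. A freestanding sketch of the same decode, assuming the conventional encoding where the top two bits of a p2m entry carry the foreign/identity markers (treat the exact bit positions as an assumption here):

	#include <stdio.h>

	#define INVALID_P2M_ENTRY	(~0UL)
	/* Assumed encodings: top two bits of an entry flag the frame type. */
	#define FOREIGN_FRAME_BIT	(1UL << (sizeof(unsigned long) * 8 - 1))
	#define IDENTITY_FRAME_BIT	(1UL << (sizeof(unsigned long) * 8 - 2))

	static void decode(unsigned long e)
	{
		if (e == INVALID_P2M_ENTRY) {		/* must test before the bits: */
			printf("invalid (no mfn)\n");	/* ~0 has both bits set too  */
			return;
		}
		e &= ~FOREIGN_FRAME_BIT;		/* strip "foreign" marker */
		if (e & IDENTITY_FRAME_BIT)		/* 1:1 mapped, e.g. MMIO */
			printf("identity mfn %#lx (gets _PAGE_IOMAP)\n",
			       e & ~IDENTITY_FRAME_BIT);
		else
			printf("ordinary mfn %#lx\n", e);
	}

	int main(void)
	{
		decode(INVALID_P2M_ENTRY);
		decode(0x1234);
		decode(0x1234 | IDENTITY_FRAME_BIT);
		return 0;
	}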
@@ -577,25 +379,71 @@ static pteval_t iomap_pte(pteval_t val)
 	return val;
 }
 
-pteval_t xen_pte_val(pte_t pte)
+static pteval_t xen_pte_val(pte_t pte)
 {
-	if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP))
-		return pte.pte;
+	pteval_t pteval = pte.pte;
+
+	/* If this is a WC pte, convert back from Xen WC to Linux WC */
+	if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
+		WARN_ON(!pat_enabled);
+		pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
+	}
+
+	if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
+		return pteval;
 
-	return pte_mfn_to_pfn(pte.pte);
+	return pte_mfn_to_pfn(pteval);
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
 
-pgdval_t xen_pgd_val(pgd_t pgd)
+static pgdval_t xen_pgd_val(pgd_t pgd)
 {
 	return pte_mfn_to_pfn(pgd.pgd);
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
 
-pte_t xen_make_pte(pteval_t pte)
+/*
+ * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
+ * are reserved for now, to correspond to the Intel-reserved PAT
+ * types.
+ *
+ * We expect Linux's PAT set as follows:
+ *
+ * Idx  PTE flags        Linux    Xen    Default
+ * 0                     WB       WB     WB
+ * 1            PWT      WC       WT     WT
+ * 2        PCD          UC-      UC-    UC-
+ * 3        PCD PWT      UC       UC     UC
+ * 4    PAT              WB       WC     WB
+ * 5    PAT     PWT      WC       WP     WT
+ * 6    PAT PCD          UC-      UC     UC-
+ * 7    PAT PCD PWT      UC       UC     UC
+ */
+
+void xen_set_pat(u64 pat)
+{
+	/* We expect Linux to use a PAT setting of
+	 * UC UC- WC WB (ignoring the PAT flag) */
+	WARN_ON(pat != 0x0007010600070106ull);
+}
+
+static pte_t xen_make_pte(pteval_t pte)
 {
 	phys_addr_t addr = (pte & PTE_PFN_MASK);
 
+	/* If Linux is trying to set a WC pte, then map to the Xen WC.
+	 * If _PAGE_PAT is set, then it probably means it is really
+	 * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
+	 * things work out OK...
+	 *
+	 * (We should never see kernel mappings with _PAGE_PSE set,
+	 * but we could see hugetlbfs mappings, I think.).
+	 */
+	if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
+		if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
+			pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
+	}
+
 	/*
 	 * Unprivileged domains are allowed to do IOMAPpings for
 	 * PCI passthrough, but not map ISA space.  The ISA
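The magic constant checked by xen_set_pat() is just the PAT MSR image of the table above: each PAT entry occupies one byte of the MSR, using the architectural type codes 0=UC, 1=WC, 4=WT, 5=WP, 6=WB, 7=UC-. Decoding it reproduces the expected Linux layout (WB, WC, UC-, UC, mirrored in the upper half):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		const char *type[] = { "UC", "WC", "reserved", "reserved",
				       "WT", "WP", "WB", "UC-" };
		uint64_t pat = 0x0007010600070106ull;	/* value checked in xen_set_pat() */

		/* PAT entry i lives in byte i of the MSR */
		for (int i = 0; i < 8; i++)
			printf("PAT%d = %s\n", i, type[(pat >> (i * 8)) & 7]);
		return 0;
	}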
@@ -614,20 +462,55 @@ pte_t xen_make_pte(pteval_t pte)
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
 
-pgd_t xen_make_pgd(pgdval_t pgd)
+#ifdef CONFIG_XEN_DEBUG
+pte_t xen_make_pte_debug(pteval_t pte)
+{
+	phys_addr_t addr = (pte & PTE_PFN_MASK);
+	phys_addr_t other_addr;
+	bool io_page = false;
+	pte_t _pte;
+
+	if (pte & _PAGE_IOMAP)
+		io_page = true;
+
+	_pte = xen_make_pte(pte);
+
+	if (!addr)
+		return _pte;
+
+	if (io_page &&
+	    (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
+		other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
+		WARN_ONCE(addr != other_addr,
+			"0x%lx is using VM_IO, but it is 0x%lx!\n",
+			(unsigned long)addr, (unsigned long)other_addr);
+	} else {
+		pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
+		other_addr = (_pte.pte & PTE_PFN_MASK);
+		WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set),
+			"0x%lx is missing VM_IO (and wasn't fixed)!\n",
+			(unsigned long)addr);
+	}
+
+	return _pte;
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
+#endif
+
+static pgd_t xen_make_pgd(pgdval_t pgd)
 {
 	pgd = pte_pfn_to_mfn(pgd);
 	return native_make_pgd(pgd);
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
 
-pmdval_t xen_pmd_val(pmd_t pmd)
+static pmdval_t xen_pmd_val(pmd_t pmd)
 {
 	return pte_mfn_to_pfn(pmd.pmd);
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
 
-void xen_set_pud_hyper(pud_t *ptr, pud_t val)
+static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 {
 	struct mmu_update u;
 
@@ -640,17 +523,13 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 	u.val = pud_val_ma(val);
 	xen_extend_mmu_update(&u);
 
-	ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
-
 	xen_mc_issue(PARAVIRT_LAZY_MMU);
 
 	preempt_enable();
 }
 
-void xen_set_pud(pud_t *ptr, pud_t val)
+static void xen_set_pud(pud_t *ptr, pud_t val)
 {
-	ADD_STATS(pud_update, 1);
-
 	/* If page is not pinned, we can just update the entry
 	   directly */
 	if (!xen_page_pinned(ptr)) {
@@ -658,56 +537,28 @@ void xen_set_pud(pud_t *ptr, pud_t val)
 		return;
 	}
 
-	ADD_STATS(pud_update_pinned, 1);
-
 	xen_set_pud_hyper(ptr, val);
 }
 
-void xen_set_pte(pte_t *ptep, pte_t pte)
-{
-	if (xen_iomap_pte(pte)) {
-		xen_set_iomap_pte(ptep, pte);
-		return;
-	}
-
-	ADD_STATS(pte_update, 1);
-//	ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
-	ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
-
 #ifdef CONFIG_X86_PAE
-	ptep->pte_high = pte.pte_high;
-	smp_wmb();
-	ptep->pte_low = pte.pte_low;
-#else
-	*ptep = pte;
-#endif
-}
-
-#ifdef CONFIG_X86_PAE
-void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
+static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
-	if (xen_iomap_pte(pte)) {
-		xen_set_iomap_pte(ptep, pte);
-		return;
-	}
-
 	set_64bit((u64 *)ptep, native_pte_val(pte));
 }
 
-void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
-	ptep->pte_low = 0;
-	smp_wmb();		/* make sure low gets written first */
-	ptep->pte_high = 0;
+	if (!xen_batched_set_pte(ptep, native_make_pte(0)))
+		native_pte_clear(mm, addr, ptep);
 }
 
-void xen_pmd_clear(pmd_t *pmdp)
+static void xen_pmd_clear(pmd_t *pmdp)
 {
 	set_pmd(pmdp, __pmd(0));
 }
 #endif	/* CONFIG_X86_PAE */
 
-pmd_t xen_make_pmd(pmdval_t pmd)
+static pmd_t xen_make_pmd(pmdval_t pmd)
 {
 	pmd = pte_pfn_to_mfn(pmd);
 	return native_make_pmd(pmd);
@@ -715,13 +566,13 @@ pmd_t xen_make_pmd(pmdval_t pmd)
 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
 
 #if PAGETABLE_LEVELS == 4
-pudval_t xen_pud_val(pud_t pud)
+static pudval_t xen_pud_val(pud_t pud)
 {
 	return pte_mfn_to_pfn(pud.pud);
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
 
-pud_t xen_make_pud(pudval_t pud)
+static pud_t xen_make_pud(pudval_t pud)
 {
 	pud = pte_pfn_to_mfn(pud);
 
@@ -729,7 +580,7 @@ pud_t xen_make_pud(pudval_t pud)
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
 
-pgd_t *xen_get_user_pgd(pgd_t *pgd)
+static pgd_t *xen_get_user_pgd(pgd_t *pgd)
 {
 	pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
 	unsigned offset = pgd - pgd_page;
@@ -761,7 +612,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
  *  2. It is always pinned
  *  3. It has no user pagetable attached to it
  */
-void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 {
 	preempt_disable();
 
@@ -774,12 +625,10 @@ void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 	preempt_enable();
 }
 
-void xen_set_pgd(pgd_t *ptr, pgd_t val)
+static void xen_set_pgd(pgd_t *ptr, pgd_t val)
 {
 	pgd_t *user_ptr = xen_get_user_pgd(ptr);
 
-	ADD_STATS(pgd_update, 1);
-
 	/* If page is not pinned, we can just update the entry
 	   directly */
 	if (!xen_page_pinned(ptr)) {
@@ -791,9 +640,6 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
 		return;
 	}
 
-	ADD_STATS(pgd_update_pinned, 1);
-	ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
-
 	/* If it's pinned, then we can at least batch the kernel and
 	   user updates together. */
 	xen_mc_batch();
@@ -1068,10 +914,9 @@ static void xen_pgd_pin(struct mm_struct *mm)
  */
 void xen_mm_pin_all(void)
 {
-	unsigned long flags;
 	struct page *page;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 
 	list_for_each_entry(page, &pgd_list, lru) {
 		if (!PagePinned(page)) {
@@ -1080,7 +925,7 @@ void xen_mm_pin_all(void)
 		}
 	}
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 /*
@@ -1088,7 +933,7 @@ void xen_mm_pin_all(void)
  * that's before we have page structures to store the bits.  So do all
  * the book-keeping now.
  */
-static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
+static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
 				  enum pt_level level)
 {
 	SetPagePinned(page);
@@ -1181,10 +1026,9 @@ static void xen_pgd_unpin(struct mm_struct *mm)
  */
 void xen_mm_unpin_all(void)
 {
-	unsigned long flags;
 	struct page *page;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 
 	list_for_each_entry(page, &pgd_list, lru) {
 		if (PageSavePinned(page)) {
@@ -1194,17 +1038,17 @@ void xen_mm_unpin_all(void)
 		}
 	}
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
-void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
+static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
 {
 	spin_lock(&next->page_table_lock);
 	xen_pgd_pin(next);
 	spin_unlock(&next->page_table_lock);
 }
 
-void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 {
 	spin_lock(&mm->page_table_lock);
 	xen_pgd_pin(mm);
@@ -1222,7 +1066,7 @@ static void drop_other_mm_ref(void *info)
 
 	active_mm = percpu_read(cpu_tlbstate.active_mm);
 
-	if (active_mm == mm)
+	if (active_mm == mm && percpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
 		leave_mm(smp_processor_id());
 
 	/* If this cpu still has a stale cr3 reference, then make sure
@@ -1291,7 +1135,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
  * pagetable because of lazy tlb flushing.  This means we need to
  * switch all CPUs off this pagetable before we can unpin it.
  */
-void xen_exit_mmap(struct mm_struct *mm)
+static void xen_exit_mmap(struct mm_struct *mm)
 {
 	get_cpu();		/* make sure we don't move around */
 	xen_drop_mm_ref(mm);
@@ -1306,13 +1150,27 @@ void xen_exit_mmap(struct mm_struct *mm)
 	spin_unlock(&mm->page_table_lock);
 }
 
-static __init void xen_pagetable_setup_start(pgd_t *base)
+static void __init xen_pagetable_setup_start(pgd_t *base)
 {
 }
 
+static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
+{
+	/* reserve the range used */
+	native_pagetable_reserve(start, end);
+
+	/* set as RW the rest */
+	printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
+			PFN_PHYS(pgt_buf_top));
+	while (end < PFN_PHYS(pgt_buf_top)) {
+		make_lowmem_page_readwrite(__va(end));
+		end += PAGE_SIZE;
+	}
+}
+
 static void xen_post_allocator_init(void);
 
-static __init void xen_pagetable_setup_done(pgd_t *base)
+static void __init xen_pagetable_setup_done(pgd_t *base)
 {
 	xen_setup_shared_info();
 	xen_post_allocator_init();
@@ -1374,7 +1232,11 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
 {
 	struct {
 		struct mmuext_op op;
+#ifdef CONFIG_SMP
+		DECLARE_BITMAP(mask, num_processors);
+#else
 		DECLARE_BITMAP(mask, NR_CPUS);
+#endif
 	} *args;
 	struct multicall_space mcs;
 
@@ -1509,7 +1371,7 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
 }
 
 #ifdef CONFIG_X86_32
-static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
+static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
 {
 	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
 	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
@@ -1518,16 +1380,34 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
 
 	return pte;
 }
+#else /* CONFIG_X86_64 */
+static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
+{
+	unsigned long pfn = pte_pfn(pte);
+
+	/*
+	 * If the new pfn is within the range of the newly allocated
+	 * kernel pagetable, and it isn't being mapped into an
+	 * early_ioremap fixmap slot as a freshly allocated page, make sure
+	 * it is RO.
+	 */
+	if (((!is_early_ioremap_ptep(ptep) &&
+			pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
+			(is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
+		pte = pte_wrprotect(pte);
+
+	return pte;
+}
+#endif /* CONFIG_X86_64 */
 
 /* Init-time set_pte while constructing initial pagetables, which
    doesn't allow RO pagetable pages to be remapped RW */
-static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
+static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
 {
 	pte = mask_rw_pte(ptep, pte);
 
 	xen_set_pte(ptep, pte);
 }
-#endif
 
 static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
 {
@@ -1540,7 +1420,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1540 1420
1541/* Early in boot, while setting up the initial pagetable, assume 1421/* Early in boot, while setting up the initial pagetable, assume
1542 everything is pinned. */ 1422 everything is pinned. */
1543static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) 1423static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1544{ 1424{
1545#ifdef CONFIG_FLATMEM 1425#ifdef CONFIG_FLATMEM
1546 BUG_ON(mem_map); /* should only be used early */ 1426 BUG_ON(mem_map); /* should only be used early */
@@ -1550,7 +1430,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1550} 1430}
1551 1431
1552/* Used for pmd and pud */ 1432/* Used for pmd and pud */
1553static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) 1433static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1554{ 1434{
1555#ifdef CONFIG_FLATMEM 1435#ifdef CONFIG_FLATMEM
1556 BUG_ON(mem_map); /* should only be used early */ 1436 BUG_ON(mem_map); /* should only be used early */
@@ -1560,13 +1440,13 @@ static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1560 1440
1561/* Early release_pte assumes that all pts are pinned, since there's 1441/* Early release_pte assumes that all pts are pinned, since there's
1562 only init_mm and anything attached to that is pinned. */ 1442 only init_mm and anything attached to that is pinned. */
1563static __init void xen_release_pte_init(unsigned long pfn) 1443static void __init xen_release_pte_init(unsigned long pfn)
1564{ 1444{
1565 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); 1445 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1566 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 1446 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1567} 1447}
1568 1448
1569static __init void xen_release_pmd_init(unsigned long pfn) 1449static void __init xen_release_pmd_init(unsigned long pfn)
1570{ 1450{
1571 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 1451 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1572} 1452}
@@ -1682,6 +1562,7 @@ static void *m2v(phys_addr_t maddr)
1682 return __ka(m2p(maddr)); 1562 return __ka(m2p(maddr));
1683} 1563}
1684 1564
1565/* Set the page permissions on identity-mapped pages */
1685static void set_page_prot(void *addr, pgprot_t prot) 1566static void set_page_prot(void *addr, pgprot_t prot)
1686{ 1567{
1687 unsigned long pfn = __pa(addr) >> PAGE_SHIFT; 1568 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
@@ -1691,12 +1572,15 @@ static void set_page_prot(void *addr, pgprot_t prot)
1691 BUG(); 1572 BUG();
1692} 1573}
1693 1574
1694static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) 1575static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1695{ 1576{
1696 unsigned pmdidx, pteidx; 1577 unsigned pmdidx, pteidx;
1697 unsigned ident_pte; 1578 unsigned ident_pte;
1698 unsigned long pfn; 1579 unsigned long pfn;
1699 1580
1581 level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1582 PAGE_SIZE);
1583
1700 ident_pte = 0; 1584 ident_pte = 0;
1701 pfn = 0; 1585 pfn = 0;
1702 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { 1586 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
@@ -1707,7 +1591,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1707 pte_page = m2v(pmd[pmdidx].pmd); 1591 pte_page = m2v(pmd[pmdidx].pmd);
1708 else { 1592 else {
1709 /* Check for free pte pages */ 1593 /* Check for free pte pages */
1710 if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) 1594 if (ident_pte == LEVEL1_IDENT_ENTRIES)
1711 break; 1595 break;
1712 1596
1713 pte_page = &level1_ident_pgt[ident_pte]; 1597 pte_page = &level1_ident_pgt[ident_pte];
@@ -1720,8 +1604,10 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1720 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { 1604 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1721 pte_t pte; 1605 pte_t pte;
1722 1606
1607#ifdef CONFIG_X86_32
1723 if (pfn > max_pfn_mapped) 1608 if (pfn > max_pfn_mapped)
1724 max_pfn_mapped = pfn; 1609 max_pfn_mapped = pfn;
1610#endif
1725 1611
1726 if (!pte_none(pte_page[pteidx])) 1612 if (!pte_none(pte_page[pteidx]))
1727 continue; 1613 continue;
@@ -1737,6 +1623,20 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1737 set_page_prot(pmd, PAGE_KERNEL_RO); 1623 set_page_prot(pmd, PAGE_KERNEL_RO);
1738} 1624}
1739 1625
1626void __init xen_setup_machphys_mapping(void)
1627{
1628 struct xen_machphys_mapping mapping;
1629 unsigned long machine_to_phys_nr_ents;
1630
1631 if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1632 machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1633 machine_to_phys_nr_ents = mapping.max_mfn + 1;
1634 } else {
1635 machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
1636 }
1637 machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
1638}
1639
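
The fls(n - 1) idiom above computes ceil(log2(n)), i.e. the smallest power-of-two exponent whose range covers all machine-to-phys entries. A quick check of the arithmetic (illustrative values, not from the patch):

    fls(16 - 1) = fls(15) = 4   ->  2^4 = 16 entries covered
    fls(17 - 1) = fls(16) = 5   ->  2^5 = 32 entries covered
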
1740#ifdef CONFIG_X86_64 1640#ifdef CONFIG_X86_64
1741static void convert_pfn_mfn(void *v) 1641static void convert_pfn_mfn(void *v)
1742{ 1642{
@@ -1750,7 +1650,7 @@ static void convert_pfn_mfn(void *v)
1750} 1650}
1751 1651
1752/* 1652/*
1753 * Set up the inital kernel pagetable. 1653 * Set up the initial kernel pagetable.
1754 * 1654 *
1755 * We can construct this by grafting the Xen provided pagetable into 1655 * We can construct this by grafting the Xen provided pagetable into
1756 * head_64.S's preconstructed pagetables. We copy the Xen L2's into 1656 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
@@ -1760,12 +1660,18 @@ static void convert_pfn_mfn(void *v)
1760 * of the physical mapping once some sort of allocator has been set 1660 * of the physical mapping once some sort of allocator has been set
1761 * up. 1661 * up.
1762 */ 1662 */
1763__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, 1663pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
1764 unsigned long max_pfn) 1664 unsigned long max_pfn)
1765{ 1665{
1766 pud_t *l3; 1666 pud_t *l3;
1767 pmd_t *l2; 1667 pmd_t *l2;
1768 1668
1669 /* max_pfn_mapped is the last pfn mapped in the initial memory
1670 * mappings. Considering that on Xen after the kernel mappings we
1671 * have the mappings of some pages that don't exist in pfn space, we
1672 * set max_pfn_mapped to the last real pfn mapped. */
1673 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1674
1769 /* Zap identity mapping */ 1675 /* Zap identity mapping */
1770 init_level4_pgt[0] = __pgd(0); 1676 init_level4_pgt[0] = __pgd(0);
1771 1677
@@ -1814,7 +1720,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1814 __xen_write_cr3(true, __pa(pgd)); 1720 __xen_write_cr3(true, __pa(pgd));
1815 xen_mc_issue(PARAVIRT_LAZY_CPU); 1721 xen_mc_issue(PARAVIRT_LAZY_CPU);
1816 1722
1817 reserve_early(__pa(xen_start_info->pt_base), 1723 memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
1818 __pa(xen_start_info->pt_base + 1724 __pa(xen_start_info->pt_base +
1819 xen_start_info->nr_pt_frames * PAGE_SIZE), 1725 xen_start_info->nr_pt_frames * PAGE_SIZE),
1820 "XEN PAGETABLES"); 1726 "XEN PAGETABLES");
@@ -1822,45 +1728,88 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1822 return pgd; 1728 return pgd;
1823} 1729}
1824#else /* !CONFIG_X86_64 */ 1730#else /* !CONFIG_X86_64 */
1825static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; 1731static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
1732static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
1733
1734static void __init xen_write_cr3_init(unsigned long cr3)
1735{
1736 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
1737
1738 BUG_ON(read_cr3() != __pa(initial_page_table));
1739 BUG_ON(cr3 != __pa(swapper_pg_dir));
1740
1741 /*
1742 * We are switching to swapper_pg_dir for the first time (from
1743 * initial_page_table) and therefore need to mark that page
1744 * read-only and then pin it.
1745 *
1746 * Xen disallows sharing of kernel PMDs for PAE
1747 * guests. Therefore we must copy the kernel PMD from
1748 * initial_page_table into a new kernel PMD to be used in
1749 * swapper_pg_dir.
1750 */
1751 swapper_kernel_pmd =
1752 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1753 memcpy(swapper_kernel_pmd, initial_kernel_pmd,
1754 sizeof(pmd_t) * PTRS_PER_PMD);
1755 swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
1756 __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
1757 set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
1758
1759 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1760 xen_write_cr3(cr3);
1761 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
1762
1763 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
1764 PFN_DOWN(__pa(initial_page_table)));
1765 set_page_prot(initial_page_table, PAGE_KERNEL);
1766 set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
1826 1767
1827__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, 1768 pv_mmu_ops.write_cr3 = &xen_write_cr3;
1769}
1770
1771pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
1828 unsigned long max_pfn) 1772 unsigned long max_pfn)
1829{ 1773{
1830 pmd_t *kernel_pmd; 1774 pmd_t *kernel_pmd;
1831 1775
1776 initial_kernel_pmd =
1777 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1778
1832 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + 1779 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
1833 xen_start_info->nr_pt_frames * PAGE_SIZE + 1780 xen_start_info->nr_pt_frames * PAGE_SIZE +
1834 512*1024); 1781 512*1024);
1835 1782
1836 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); 1783 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1837 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); 1784 memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1838 1785
1839 xen_map_identity_early(level2_kernel_pgt, max_pfn); 1786 xen_map_identity_early(initial_kernel_pmd, max_pfn);
1840 1787
1841 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); 1788 memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1842 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], 1789 initial_page_table[KERNEL_PGD_BOUNDARY] =
1843 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); 1790 __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
1844 1791
1845 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); 1792 set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
1846 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); 1793 set_page_prot(initial_page_table, PAGE_KERNEL_RO);
1847 set_page_prot(empty_zero_page, PAGE_KERNEL_RO); 1794 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1848 1795
1849 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 1796 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1850 1797
1851 xen_write_cr3(__pa(swapper_pg_dir)); 1798 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
1799 PFN_DOWN(__pa(initial_page_table)));
1800 xen_write_cr3(__pa(initial_page_table));
1852 1801
1853 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); 1802 memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
1854
1855 reserve_early(__pa(xen_start_info->pt_base),
1856 __pa(xen_start_info->pt_base + 1803 __pa(xen_start_info->pt_base +
1857 xen_start_info->nr_pt_frames * PAGE_SIZE), 1804 xen_start_info->nr_pt_frames * PAGE_SIZE),
1858 "XEN PAGETABLES"); 1805 "XEN PAGETABLES");
1859 1806
1860 return swapper_pg_dir; 1807 return initial_page_table;
1861} 1808}
1862#endif /* CONFIG_X86_64 */ 1809#endif /* CONFIG_X86_64 */
1863 1810
1811static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
1812
1864static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) 1813static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1865{ 1814{
1866 pte_t pte; 1815 pte_t pte;
@@ -1881,15 +1830,28 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1881#else 1830#else
1882 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: 1831 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1883#endif 1832#endif
1884#ifdef CONFIG_X86_LOCAL_APIC
1885 case FIX_APIC_BASE: /* maps dummy local APIC */
1886#endif
1887 case FIX_TEXT_POKE0: 1833 case FIX_TEXT_POKE0:
1888 case FIX_TEXT_POKE1: 1834 case FIX_TEXT_POKE1:
1889 /* All local page mappings */ 1835 /* All local page mappings */
1890 pte = pfn_pte(phys, prot); 1836 pte = pfn_pte(phys, prot);
1891 break; 1837 break;
1892 1838
1839#ifdef CONFIG_X86_LOCAL_APIC
1840 case FIX_APIC_BASE: /* maps dummy local APIC */
1841 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1842 break;
1843#endif
1844
1845#ifdef CONFIG_X86_IO_APIC
1846 case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
1847 /*
1848 * We just don't map the IO APIC - all access is via
1849 * hypercalls. Keep the address in the pte for reference.
1850 */
1851 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1852 break;
1853#endif
1854
1893 case FIX_PARAVIRT_BOOTMAP: 1855 case FIX_PARAVIRT_BOOTMAP:
1894 /* This is an MFN, but it isn't an IO mapping from the 1856 /* This is an MFN, but it isn't an IO mapping from the
1895 IO domain */ 1857 IO domain */
@@ -1914,8 +1876,34 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1914#endif 1876#endif
1915} 1877}
1916 1878
1917static __init void xen_post_allocator_init(void) 1879void __init xen_ident_map_ISA(void)
1880{
1881 unsigned long pa;
1882
1883 /*
 1884 * If we're dom0, then linearly map the ISA machine addresses into
1885 * the kernel's address space.
1886 */
1887 if (!xen_initial_domain())
1888 return;
1889
1890 xen_raw_printk("Xen: setup ISA identity maps\n");
1891
1892 for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
1893 pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
1894
1895 if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0))
1896 BUG();
1897 }
1898
1899 xen_flush_tlb();
1900}
1901
1902static void __init xen_post_allocator_init(void)
1918{ 1903{
1904#ifdef CONFIG_XEN_DEBUG
1905 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
1906#endif
1919 pv_mmu_ops.set_pte = xen_set_pte; 1907 pv_mmu_ops.set_pte = xen_set_pte;
1920 pv_mmu_ops.set_pmd = xen_set_pmd; 1908 pv_mmu_ops.set_pmd = xen_set_pmd;
1921 pv_mmu_ops.set_pud = xen_set_pud; 1909 pv_mmu_ops.set_pud = xen_set_pud;
@@ -1948,12 +1936,16 @@ static void xen_leave_lazy_mmu(void)
1948 preempt_enable(); 1936 preempt_enable();
1949} 1937}
1950 1938
1951static const struct pv_mmu_ops xen_mmu_ops __initdata = { 1939static const struct pv_mmu_ops xen_mmu_ops __initconst = {
1952 .read_cr2 = xen_read_cr2, 1940 .read_cr2 = xen_read_cr2,
1953 .write_cr2 = xen_write_cr2, 1941 .write_cr2 = xen_write_cr2,
1954 1942
1955 .read_cr3 = xen_read_cr3, 1943 .read_cr3 = xen_read_cr3,
1944#ifdef CONFIG_X86_32
1945 .write_cr3 = xen_write_cr3_init,
1946#else
1956 .write_cr3 = xen_write_cr3, 1947 .write_cr3 = xen_write_cr3,
1948#endif
1957 1949
1958 .flush_tlb_user = xen_flush_tlb, 1950 .flush_tlb_user = xen_flush_tlb,
1959 .flush_tlb_kernel = xen_flush_tlb, 1951 .flush_tlb_kernel = xen_flush_tlb,
@@ -1969,14 +1961,9 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1969 .alloc_pte = xen_alloc_pte_init, 1961 .alloc_pte = xen_alloc_pte_init,
1970 .release_pte = xen_release_pte_init, 1962 .release_pte = xen_release_pte_init,
1971 .alloc_pmd = xen_alloc_pmd_init, 1963 .alloc_pmd = xen_alloc_pmd_init,
1972 .alloc_pmd_clone = paravirt_nop,
1973 .release_pmd = xen_release_pmd_init, 1964 .release_pmd = xen_release_pmd_init,
1974 1965
1975#ifdef CONFIG_X86_64
1976 .set_pte = xen_set_pte,
1977#else
1978 .set_pte = xen_set_pte_init, 1966 .set_pte = xen_set_pte_init,
1979#endif
1980 .set_pte_at = xen_set_pte_at, 1967 .set_pte_at = xen_set_pte_at,
1981 .set_pmd = xen_set_pmd_hyper, 1968 .set_pmd = xen_set_pmd_hyper,
1982 1969
@@ -2022,11 +2009,12 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
2022 2009
2023void __init xen_init_mmu_ops(void) 2010void __init xen_init_mmu_ops(void)
2024{ 2011{
2012 x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
2025 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; 2013 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
2026 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; 2014 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2027 pv_mmu_ops = xen_mmu_ops; 2015 pv_mmu_ops = xen_mmu_ops;
2028 2016
2029 vmap_lazy_unmap = false; 2017 memset(dummy_mapping, 0xff, PAGE_SIZE);
2030} 2018}
2031 2019
2032/* Protected by xen_reservation_lock. */ 2020/* Protected by xen_reservation_lock. */
@@ -2049,7 +2037,7 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2049 in_frames[i] = virt_to_mfn(vaddr); 2037 in_frames[i] = virt_to_mfn(vaddr);
2050 2038
2051 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); 2039 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2052 set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); 2040 __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2053 2041
2054 if (out_frames) 2042 if (out_frames)
2055 out_frames[i] = virt_to_pfn(vaddr); 2043 out_frames[i] = virt_to_pfn(vaddr);
@@ -2259,65 +2247,83 @@ void __init xen_hvm_init_mmu_ops(void)
2259} 2247}
2260#endif 2248#endif
2261 2249
2262#ifdef CONFIG_XEN_DEBUG_FS 2250#define REMAP_BATCH_SIZE 16
2263 2251
2264static struct dentry *d_mmu_debug; 2252struct remap_data {
2253 unsigned long mfn;
2254 pgprot_t prot;
2255 struct mmu_update *mmu_update;
2256};
2265 2257
2266static int __init xen_mmu_debugfs(void) 2258static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2259 unsigned long addr, void *data)
2267{ 2260{
2268 struct dentry *d_xen = xen_init_debugfs(); 2261 struct remap_data *rmd = data;
2269 2262 pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
2270 if (d_xen == NULL)
2271 return -ENOMEM;
2272 2263
2273 d_mmu_debug = debugfs_create_dir("mmu", d_xen); 2264 rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
2274 2265 rmd->mmu_update->val = pte_val_ma(pte);
2275 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats); 2266 rmd->mmu_update++;
2276
2277 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
2278 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
2279 &mmu_stats.pgd_update_pinned);
2280 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
2281 &mmu_stats.pgd_update_pinned);
2282
2283 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
2284 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
2285 &mmu_stats.pud_update_pinned);
2286 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
2287 &mmu_stats.pud_update_pinned);
2288
2289 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
2290 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
2291 &mmu_stats.pmd_update_pinned);
2292 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
2293 &mmu_stats.pmd_update_pinned);
2294
2295 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
2296// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
2297// &mmu_stats.pte_update_pinned);
2298 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
2299 &mmu_stats.pte_update_pinned);
2300
2301 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
2302 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
2303 &mmu_stats.mmu_update_extended);
2304 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
2305 mmu_stats.mmu_update_histo, 20);
2306
2307 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
2308 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
2309 &mmu_stats.set_pte_at_batched);
2310 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
2311 &mmu_stats.set_pte_at_current);
2312 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
2313 &mmu_stats.set_pte_at_kernel);
2314
2315 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
2316 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2317 &mmu_stats.prot_commit_batched);
2318 2267
2319 return 0; 2268 return 0;
2320} 2269}
2321fs_initcall(xen_mmu_debugfs);
2322 2270
2323#endif /* CONFIG_XEN_DEBUG_FS */ 2271int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2272 unsigned long addr,
2273 unsigned long mfn, int nr,
2274 pgprot_t prot, unsigned domid)
2275{
2276 struct remap_data rmd;
2277 struct mmu_update mmu_update[REMAP_BATCH_SIZE];
2278 int batch;
2279 unsigned long range;
2280 int err = 0;
2281
2282 prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
2283
2284 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) ==
2285 (VM_PFNMAP | VM_RESERVED | VM_IO)));
2286
2287 rmd.mfn = mfn;
2288 rmd.prot = prot;
2289
2290 while (nr) {
2291 batch = min(REMAP_BATCH_SIZE, nr);
2292 range = (unsigned long)batch << PAGE_SHIFT;
2293
2294 rmd.mmu_update = mmu_update;
2295 err = apply_to_page_range(vma->vm_mm, addr, range,
2296 remap_area_mfn_pte_fn, &rmd);
2297 if (err)
2298 goto out;
2299
2300 err = -EFAULT;
2301 if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
2302 goto out;
2303
2304 nr -= batch;
2305 addr += range;
2306 }
2307
2308 err = 0;
2309out:
2310
2311 flush_tlb_all();
2312
2313 return err;
2314}
2315EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
2316
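A typical consumer of this export is an mmap handler that maps another domain's frames into a userspace vma. The sketch below is hypothetical (the handler name and the way the MFN is obtained are illustrative; only xen_remap_domain_mfn_range itself is defined above), but it shows the contract the BUG_ON enforces:

    /* Hypothetical privcmd-style mmap handler: map 'nr' frames starting
     * at foreign MFN 'mfn' into the caller's vma. */
    static int example_remap_mmap(struct vm_area_struct *vma,
                                  unsigned long mfn, domid_t domid)
    {
            int nr = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;

            /* Satisfy the BUG_ON above: the vma must be a raw PFN
             * mapping that the core VM will leave alone. */
            vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;

            return xen_remap_domain_mfn_range(vma, vma->vm_start, mfn, nr,
                                              vma->vm_page_prot, domid);
    }
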
2317#ifdef CONFIG_XEN_DEBUG_FS
2318static int p2m_dump_open(struct inode *inode, struct file *filp)
2319{
2320 return single_open(filp, p2m_dump_show, NULL);
2321}
2322
2323static const struct file_operations p2m_dump_fops = {
2324 .open = p2m_dump_open,
2325 .read = seq_read,
2326 .llseek = seq_lseek,
2327 .release = single_release,
2328};
2329#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index fa938c4aa2f7..73809bb951b4 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -12,47 +12,9 @@ enum pt_level {
12 12
13 13
14bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); 14bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
15bool install_p2mtop_page(unsigned long pfn, unsigned long *p);
16 15
17void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 16void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
18 17
19
20void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
21void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
22void xen_exit_mmap(struct mm_struct *mm);
23
24pteval_t xen_pte_val(pte_t);
25pmdval_t xen_pmd_val(pmd_t);
26pgdval_t xen_pgd_val(pgd_t);
27
28pte_t xen_make_pte(pteval_t);
29pmd_t xen_make_pmd(pmdval_t);
30pgd_t xen_make_pgd(pgdval_t);
31
32void xen_set_pte(pte_t *ptep, pte_t pteval);
33void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
34 pte_t *ptep, pte_t pteval);
35
36#ifdef CONFIG_X86_PAE
37void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
38void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
39void xen_pmd_clear(pmd_t *pmdp);
40#endif /* CONFIG_X86_PAE */
41
42void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
43void xen_set_pud(pud_t *ptr, pud_t val);
44void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
45void xen_set_pud_hyper(pud_t *ptr, pud_t val);
46
47#if PAGETABLE_LEVELS == 4
48pudval_t xen_pud_val(pud_t pud);
49pud_t xen_make_pud(pudval_t pudval);
50void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
51void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
52#endif
53
54pgd_t *xen_get_user_pgd(pgd_t *pgd);
55
56pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 18pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
57void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 19void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
58 pte_t *ptep, pte_t pte); 20 pte_t *ptep, pte_t pte);
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 8bff7e7c290b..1b2b73ff0a6e 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -189,10 +189,10 @@ struct multicall_space __xen_mc_entry(size_t args)
189 unsigned argidx = roundup(b->argidx, sizeof(u64)); 189 unsigned argidx = roundup(b->argidx, sizeof(u64));
190 190
191 BUG_ON(preemptible()); 191 BUG_ON(preemptible());
192 BUG_ON(b->argidx > MC_ARGS); 192 BUG_ON(b->argidx >= MC_ARGS);
193 193
194 if (b->mcidx == MC_BATCH || 194 if (b->mcidx == MC_BATCH ||
195 (argidx + args) > MC_ARGS) { 195 (argidx + args) >= MC_ARGS) {
196 mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS); 196 mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS);
197 xen_mc_flush(); 197 xen_mc_flush();
198 argidx = roundup(b->argidx, sizeof(u64)); 198 argidx = roundup(b->argidx, sizeof(u64));
@@ -206,7 +206,7 @@ struct multicall_space __xen_mc_entry(size_t args)
206 ret.args = &b->args[argidx]; 206 ret.args = &b->args[argidx];
207 b->argidx = argidx + args; 207 b->argidx = argidx + args;
208 208
209 BUG_ON(b->argidx > MC_ARGS); 209 BUG_ON(b->argidx >= MC_ARGS);
210 return ret; 210 return ret;
211} 211}
212 212
@@ -216,7 +216,7 @@ struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
216 struct multicall_space ret = { NULL, NULL }; 216 struct multicall_space ret = { NULL, NULL };
217 217
218 BUG_ON(preemptible()); 218 BUG_ON(preemptible());
219 BUG_ON(b->argidx > MC_ARGS); 219 BUG_ON(b->argidx >= MC_ARGS);
220 220
221 if (b->mcidx == 0) 221 if (b->mcidx == 0)
222 return ret; 222 return ret;
@@ -224,14 +224,14 @@ struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
224 if (b->entries[b->mcidx - 1].op != op) 224 if (b->entries[b->mcidx - 1].op != op)
225 return ret; 225 return ret;
226 226
227 if ((b->argidx + size) > MC_ARGS) 227 if ((b->argidx + size) >= MC_ARGS)
228 return ret; 228 return ret;
229 229
230 ret.mc = &b->entries[b->mcidx - 1]; 230 ret.mc = &b->entries[b->mcidx - 1];
231 ret.args = &b->args[b->argidx]; 231 ret.args = &b->args[b->argidx];
232 b->argidx += size; 232 b->argidx += size;
233 233
234 BUG_ON(b->argidx > MC_ARGS); 234 BUG_ON(b->argidx >= MC_ARGS);
235 return ret; 235 return ret;
236} 236}
237 237
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 9e565da5d1f7..4ec8035e3216 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -22,7 +22,7 @@ static inline void xen_mc_batch(void)
22 unsigned long flags; 22 unsigned long flags;
23 /* need to disable interrupts until this entry is complete */ 23 /* need to disable interrupts until this entry is complete */
24 local_irq_save(flags); 24 local_irq_save(flags);
25 __get_cpu_var(xen_mc_irq_flags) = flags; 25 __this_cpu_write(xen_mc_irq_flags, flags);
26} 26}
27 27
28static inline struct multicall_space xen_mc_entry(size_t args) 28static inline struct multicall_space xen_mc_entry(size_t args)
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
new file mode 100644
index 000000000000..58efeb9d5440
--- /dev/null
+++ b/arch/x86/xen/p2m.c
@@ -0,0 +1,859 @@
1/*
2 * Xen leaves the responsibility for maintaining p2m mappings to the
3 * guests themselves, but it must also access and update the p2m array
4 * during suspend/resume when all the pages are reallocated.
5 *
6 * The p2m table is logically a flat array, but we implement it as a
7 * three-level tree to allow the address space to be sparse.
8 *
9 * Xen
10 * |
11 * p2m_top p2m_top_mfn
12 * / \ / \
13 * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
14 * / \ / \ / /
15 * p2m p2m p2m p2m p2m p2m p2m ...
16 *
17 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
18 *
19 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
20 * maximum representable pseudo-physical address space is:
21 * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
22 *
23 * P2M_PER_PAGE depends on the architecture, as a mfn is always
24 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
25 * 512 and 1024 entries respectively.
26 *
27 * In short, these structures contain the Machine Frame Number (MFN) of the PFN.
28 *
29 * However, not all entries are filled with MFNs. Any leaf entry, or any
30 * top-level or middle-level entry, that is void is assumed to be
31 * "missing". So (for example)
32 * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY.
33 *
34 * We also have the possibility of setting 1-1 mappings on certain regions, so
35 * that:
36 * pfn_to_mfn(0xc0000)=0xc0000
37 *
38 * The benefit of this is that for non-RAM regions (think PCI BARs, or
39 * ACPI spaces) we can create mappings easily, because the PFN value
40 * matches the MFN.
41 *
42 * For this to work efficiently we have one new page p2m_identity and
43 * allocate (via reserve_brk) any other pages we need to cover the sides
44 * (1GB or 4MB boundary violations). All entries in p2m_identity are set to
45 * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs,
46 * no other fancy value).
47 *
48 * On lookup we spot that the entry points to p2m_identity and return the
49 * identity value instead of dereferencing and returning INVALID_P2M_ENTRY.
50 * If the entry points to an allocated page, we just proceed as before and
51 * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in
52 * appropriate functions (pfn_to_mfn).
53 *
54 * The reason for having the IDENTITY_FRAME_BIT instead of just returning the
55 * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a
56 * non-identity pfn. To protect ourselves against that, we elect to set (and
57 * get) the IDENTITY_FRAME_BIT on all identity-mapped PFNs.
58 *
59 * This simplified diagram is used to explain the more subtle pieces of code.
60 * There is also a diagram of the P2M at the end that can help.
61 * Imagine your E820 looking as so:
62 *
63 * 1GB 2GB
64 * /-------------------+---------\/----\ /----------\ /---+-----\
65 * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM |
66 * \-------------------+---------/\----/ \----------/ \---+-----/
67 * ^- 1029MB ^- 2001MB
68 *
69 * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100),
70 * 2048MB = 524288 (0x80000)]
71 *
72 * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB
73 * is actually not present (would have to kick the balloon driver to put it in).
74 *
75 * When we are told to set the PFNs for identity mapping (see patch: "xen/setup:
76 * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start
77 * PFN and the end PFN (263424 and 512256 respectively). The first step
78 * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page
79 * covers 512^2 of page estate (1GB) and in case the start or end PFN is not
80 * aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn
81 * to end pfn. We reserve_brk top leaf pages if they are missing (meaning they
82 * point to p2m_mid_missing).
83 *
84 * With the E820 example above, 263424 is not 1GB aligned so we allocate a
85 * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000.
86 * Each entry in the allocated page is "missing" (points to p2m_missing).
87 *
88 * Next stage is to determine if we need to do a more granular boundary check
89 * on the 4MB (or 2MB depending on architecture) off the start and end pfn's.
90 * We check if the start pfn and end pfn violate that boundary check, and if
91 * so reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer
92 * granularity of setting which PFNs are missing and which ones are identity.
93 * In our example 263424 and 512256 both fail the check so we reserve_brk two
94 * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing"
95 * values) and assign them to p2m[1][2] and p2m[1][488] respectively.
96 *
97 * At this point we would at minimum reserve_brk one page, but could be up to
98 * three. Each call to set_phys_range_identity has at maximum a three page
99 * cost. If we were to query the P2M at this stage, all those entries from
100 * start PFN through end PFN (so 1029MB -> 2001MB) would return
101 * INVALID_P2M_ENTRY ("missing").
102 *
103 * The next step is to walk from the start pfn to the end pfn setting
104 * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity.
105 * If we find that the middle leaf is pointing to p2m_missing we can swap it
106 * over to p2m_identity - this way covering 4MB (or 2MB) PFN space. At this
107 * point we do not need to worry about boundary alignment (so no need to
108 * reserve_brk a middle page, figure out which PFNs are "missing" and which
109 * ones are identity), as that has been done earlier. If we find that the
110 * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference
111 * that page (which covers 512 PFNs) and set the appropriate PFN with
112 * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we
113 * set from p2m[1][2][256->511] and p2m[1][488][0->256] with
114 * IDENTITY_FRAME_BIT set.
115 *
116 * All other regions that are void (or not filled) either point to p2m_missing
117 * (considered missing) or have the default value of INVALID_P2M_ENTRY (also
118 * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511]
119 * contain the INVALID_P2M_ENTRY value and are considered "missing."
120 *
121 * This is what the p2m ends up looking (for the E820 above) with this
122 * fabulous drawing:
123 *
124 * p2m /--------------\
125 * /-----\ | &mfn_list[0],| /-----------------\
126 * | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. |
127 * |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] |
128 * | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] |
129 * |-----| \ | [p2m_identity]+\\ | .... |
130 * | 2 |--\ \-------------------->| ... | \\ \----------------/
131 * |-----| \ \---------------/ \\
132 * | 3 |\ \ \\ p2m_identity
133 * |-----| \ \-------------------->/---------------\ /-----------------\
134 * | .. +->+ | [p2m_identity]+-->| ~0, ~0, ~0, ... |
135 * \-----/ / | [p2m_identity]+-->| ..., ~0 |
136 * / /---------------\ | .... | \-----------------/
137 * / | IDENTITY[@0] | /-+-[x], ~0, ~0.. |
138 * / | IDENTITY[@256]|<----/ \---------------/
139 * / | ~0, ~0, .... |
140 * | \---------------/
141 * |
142 * p2m_missing p2m_missing
143 * /------------------\ /------------\
144 * | [p2m_mid_missing]+---->| ~0, ~0, ~0 |
145 * | [p2m_mid_missing]+---->| ..., ~0 |
146 * \------------------/ \------------/
147 *
148 * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT)
149 */
150
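To make the comment's index arithmetic concrete, here is a minimal standalone sketch (not part of the patch), assuming a 64-bit build where P2M_PER_PAGE = 4096 / 8 = 512; it reproduces the p2m[1][2][256] coordinate used in the 1029MB example above:

    #include <stdio.h>

    /* Mirrors the p2m index helpers below, for a 64-bit build. */
    #define P2M_PER_PAGE     512UL
    #define P2M_MID_PER_PAGE 512UL

    int main(void)
    {
            unsigned long pfn = 263424; /* the 1029MB boundary (0x40500) */
            unsigned long top = pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
            unsigned long mid = (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
            unsigned long idx = pfn % P2M_PER_PAGE;

            /* Prints "p2m[1][2][256]", matching the comment's example. */
            printf("p2m[%lu][%lu][%lu]\n", top, mid, idx);
            return 0;
    }
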
151#include <linux/init.h>
152#include <linux/module.h>
153#include <linux/list.h>
154#include <linux/hash.h>
155#include <linux/sched.h>
156#include <linux/seq_file.h>
157
158#include <asm/cache.h>
159#include <asm/setup.h>
160
161#include <asm/xen/page.h>
162#include <asm/xen/hypercall.h>
163#include <asm/xen/hypervisor.h>
164
165#include "xen-ops.h"
166
167static void __init m2p_override_init(void);
168
169unsigned long xen_max_p2m_pfn __read_mostly;
170
171#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
172#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
173#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
174
175#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
176
177/* Placeholders for holes in the address space */
178static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
179static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
180static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
181
182static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
183static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
184static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
185
186static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
187
188RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
189RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
190
191/* We might hit two boundary violations at the start and end, at max each
192 * boundary violation will require three middle nodes. */
193RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3);
194
195static inline unsigned p2m_top_index(unsigned long pfn)
196{
197 BUG_ON(pfn >= MAX_P2M_PFN);
198 return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
199}
200
201static inline unsigned p2m_mid_index(unsigned long pfn)
202{
203 return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
204}
205
206static inline unsigned p2m_index(unsigned long pfn)
207{
208 return pfn % P2M_PER_PAGE;
209}
210
211static void p2m_top_init(unsigned long ***top)
212{
213 unsigned i;
214
215 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
216 top[i] = p2m_mid_missing;
217}
218
219static void p2m_top_mfn_init(unsigned long *top)
220{
221 unsigned i;
222
223 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
224 top[i] = virt_to_mfn(p2m_mid_missing_mfn);
225}
226
227static void p2m_top_mfn_p_init(unsigned long **top)
228{
229 unsigned i;
230
231 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
232 top[i] = p2m_mid_missing_mfn;
233}
234
235static void p2m_mid_init(unsigned long **mid)
236{
237 unsigned i;
238
239 for (i = 0; i < P2M_MID_PER_PAGE; i++)
240 mid[i] = p2m_missing;
241}
242
243static void p2m_mid_mfn_init(unsigned long *mid)
244{
245 unsigned i;
246
247 for (i = 0; i < P2M_MID_PER_PAGE; i++)
248 mid[i] = virt_to_mfn(p2m_missing);
249}
250
251static void p2m_init(unsigned long *p2m)
252{
253 unsigned i;
254
255 for (i = 0; i < P2M_MID_PER_PAGE; i++)
256 p2m[i] = INVALID_P2M_ENTRY;
257}
258
259/*
260 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
261 *
262 * This is called both at boot time, and after resuming from suspend:
263 * - At boot time we're called very early, and must use extend_brk()
264 * to allocate memory.
265 *
266 * - After resume we're called from within stop_machine, but the mfn
267 * tree should already be completely allocated.
268 */
269void __ref xen_build_mfn_list_list(void)
270{
271 unsigned long pfn;
272
273 /* Pre-initialize p2m_top_mfn to be completely missing */
274 if (p2m_top_mfn == NULL) {
275 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
276 p2m_mid_mfn_init(p2m_mid_missing_mfn);
277
278 p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
279 p2m_top_mfn_p_init(p2m_top_mfn_p);
280
281 p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
282 p2m_top_mfn_init(p2m_top_mfn);
283 } else {
284 /* Reinitialise: MFNs all change after migration */
285 p2m_mid_mfn_init(p2m_mid_missing_mfn);
286 }
287
288 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
289 unsigned topidx = p2m_top_index(pfn);
290 unsigned mididx = p2m_mid_index(pfn);
291 unsigned long **mid;
292 unsigned long *mid_mfn_p;
293
294 mid = p2m_top[topidx];
295 mid_mfn_p = p2m_top_mfn_p[topidx];
296
297 /* Don't bother allocating any mfn mid levels if
298 * they're just missing; just update the stored mfn,
299 * since all could have changed over a migration.
300 */
301 if (mid == p2m_mid_missing) {
302 BUG_ON(mididx);
303 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
304 p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
305 pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
306 continue;
307 }
308
309 if (mid_mfn_p == p2m_mid_missing_mfn) {
310 /*
311 * XXX boot-time only! We should never find
312 * missing parts of the mfn tree after
313 * runtime. extend_brk() will BUG if we call
314 * it too late.
315 */
316 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
317 p2m_mid_mfn_init(mid_mfn_p);
318
319 p2m_top_mfn_p[topidx] = mid_mfn_p;
320 }
321
322 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
323 mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
324 }
325}
326
327void xen_setup_mfn_list_list(void)
328{
329 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
330
331 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
332 virt_to_mfn(p2m_top_mfn);
333 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
334}
335
336/* Set up p2m_top to point to the domain-builder provided p2m pages */
337void __init xen_build_dynamic_phys_to_machine(void)
338{
339 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
340 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
341 unsigned long pfn;
342
343 xen_max_p2m_pfn = max_pfn;
344
345 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
346 p2m_init(p2m_missing);
347
348 p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
349 p2m_mid_init(p2m_mid_missing);
350
351 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
352 p2m_top_init(p2m_top);
353
354 p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
355 p2m_init(p2m_identity);
356
357 /*
358 * The domain builder gives us a pre-constructed p2m array in
359 * mfn_list for all the pages initially given to us, so we just
360 * need to graft that into our tree structure.
361 */
362 for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
363 unsigned topidx = p2m_top_index(pfn);
364 unsigned mididx = p2m_mid_index(pfn);
365
366 if (p2m_top[topidx] == p2m_mid_missing) {
367 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
368 p2m_mid_init(mid);
369
370 p2m_top[topidx] = mid;
371 }
372
373 /*
374 * As long as the mfn_list has enough entries to completely
375 * fill a p2m page, pointing into the array is ok. But if
376 * not the entries beyond the last pfn will be undefined.
377 */
378 if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
379 unsigned long p2midx;
380
381 p2midx = max_pfn % P2M_PER_PAGE;
382 for ( ; p2midx < P2M_PER_PAGE; p2midx++)
383 mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY;
384 }
385 p2m_top[topidx][mididx] = &mfn_list[pfn];
386 }
387
388 m2p_override_init();
389}
390
391unsigned long get_phys_to_machine(unsigned long pfn)
392{
393 unsigned topidx, mididx, idx;
394
395 if (unlikely(pfn >= MAX_P2M_PFN))
396 return INVALID_P2M_ENTRY;
397
398 topidx = p2m_top_index(pfn);
399 mididx = p2m_mid_index(pfn);
400 idx = p2m_index(pfn);
401
402 /*
403 * The INVALID_P2M_ENTRY is filled in both p2m_*identity
404 * and in p2m_*missing, so returning the INVALID_P2M_ENTRY
405 * would be wrong.
406 */
407 if (p2m_top[topidx][mididx] == p2m_identity)
408 return IDENTITY_FRAME(pfn);
409
410 return p2m_top[topidx][mididx][idx];
411}
412EXPORT_SYMBOL_GPL(get_phys_to_machine);
413
414static void *alloc_p2m_page(void)
415{
416 return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
417}
418
419static void free_p2m_page(void *p)
420{
421 free_page((unsigned long)p);
422}
423
424/*
425 * Fully allocate the p2m structure for a given pfn. We need to check
426 * that both the top and mid levels are allocated, and make sure the
427 * parallel mfn tree is kept in sync. We may race with other cpus, so
428 * the new pages are installed with cmpxchg; if we lose the race then
429 * simply free the page we allocated and use the one that's there.
430 */
431static bool alloc_p2m(unsigned long pfn)
432{
433 unsigned topidx, mididx;
434 unsigned long ***top_p, **mid;
435 unsigned long *top_mfn_p, *mid_mfn;
436
437 topidx = p2m_top_index(pfn);
438 mididx = p2m_mid_index(pfn);
439
440 top_p = &p2m_top[topidx];
441 mid = *top_p;
442
443 if (mid == p2m_mid_missing) {
444 /* Mid level is missing, allocate a new one */
445 mid = alloc_p2m_page();
446 if (!mid)
447 return false;
448
449 p2m_mid_init(mid);
450
451 if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
452 free_p2m_page(mid);
453 }
454
455 top_mfn_p = &p2m_top_mfn[topidx];
456 mid_mfn = p2m_top_mfn_p[topidx];
457
458 BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
459
460 if (mid_mfn == p2m_mid_missing_mfn) {
461 /* Separately check the mid mfn level */
462 unsigned long missing_mfn;
463 unsigned long mid_mfn_mfn;
464
465 mid_mfn = alloc_p2m_page();
466 if (!mid_mfn)
467 return false;
468
469 p2m_mid_mfn_init(mid_mfn);
470
471 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
472 mid_mfn_mfn = virt_to_mfn(mid_mfn);
473 if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
474 free_p2m_page(mid_mfn);
475 else
476 p2m_top_mfn_p[topidx] = mid_mfn;
477 }
478
479 if (p2m_top[topidx][mididx] == p2m_identity ||
480 p2m_top[topidx][mididx] == p2m_missing) {
481 /* p2m leaf page is missing */
482 unsigned long *p2m;
483 unsigned long *p2m_orig = p2m_top[topidx][mididx];
484
485 p2m = alloc_p2m_page();
486 if (!p2m)
487 return false;
488
489 p2m_init(p2m);
490
491 if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig)
492 free_p2m_page(p2m);
493 else
494 mid_mfn[mididx] = virt_to_mfn(p2m);
495 }
496
497 return true;
498}
499
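All three levels above use the same install-or-discard idiom. Distilled into one helper (a sketch, not code from the patch; the helper name is invented), the pattern is:

    /* Allocate a candidate page, try to swing *slot from the shared
     * placeholder to it, and free the candidate if another CPU won. */
    static unsigned long *install_p2m_page(unsigned long **slot,
                                           unsigned long *placeholder)
    {
            unsigned long *new = alloc_p2m_page();

            if (!new)
                    return NULL;
            p2m_init(new);

            if (cmpxchg(slot, placeholder, new) != placeholder) {
                    free_p2m_page(new); /* lost the race */
                    new = *slot;        /* use the winner's page */
            }
            return new;
    }
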
500static bool __init __early_alloc_p2m(unsigned long pfn)
501{
502 unsigned topidx, mididx, idx;
503
504 topidx = p2m_top_index(pfn);
505 mididx = p2m_mid_index(pfn);
506 idx = p2m_index(pfn);
507
508 /* Pfff.. No boundary cross-over, let's get out. */
509 if (!idx)
510 return false;
511
512 WARN(p2m_top[topidx][mididx] == p2m_identity,
513 "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n",
514 topidx, mididx);
515
516 /*
517 * Could be done by xen_build_dynamic_phys_to_machine..
518 */
519 if (p2m_top[topidx][mididx] != p2m_missing)
520 return false;
521
522 /* Boundary cross-over for the edges: */
523 if (idx) {
524 unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
525 unsigned long *mid_mfn_p;
526
527 p2m_init(p2m);
528
529 p2m_top[topidx][mididx] = p2m;
530
531 /* For save/restore we need the MFN of the P2M saved */
532
533 mid_mfn_p = p2m_top_mfn_p[topidx];
534 WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing),
535 "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n",
536 topidx, mididx);
537 mid_mfn_p[mididx] = virt_to_mfn(p2m);
538
539 }
540 return idx != 0;
541}
542unsigned long __init set_phys_range_identity(unsigned long pfn_s,
543 unsigned long pfn_e)
544{
545 unsigned long pfn;
546
547 if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN))
548 return 0;
549
550 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
551 return pfn_e - pfn_s;
552
553 if (pfn_s > pfn_e)
554 return 0;
555
556 for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1));
557 pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE));
558 pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
559 {
560 unsigned topidx = p2m_top_index(pfn);
561 unsigned long *mid_mfn_p;
562 unsigned long **mid;
563
564 mid = p2m_top[topidx];
565 mid_mfn_p = p2m_top_mfn_p[topidx];
566 if (mid == p2m_mid_missing) {
567 mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
568
569 p2m_mid_init(mid);
570
571 p2m_top[topidx] = mid;
572
573 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
574 }
575 /* And the save/restore P2M tables.. */
576 if (mid_mfn_p == p2m_mid_missing_mfn) {
577 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
578 p2m_mid_mfn_init(mid_mfn_p);
579
580 p2m_top_mfn_p[topidx] = mid_mfn_p;
581 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
582 /* Note: we don't set mid_mfn_p[mididx] here,
583 * look in __early_alloc_p2m */
584 }
585 }
586
587 __early_alloc_p2m(pfn_s);
588 __early_alloc_p2m(pfn_e);
589
590 for (pfn = pfn_s; pfn < pfn_e; pfn++)
591 if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
592 break;
593
594 if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s),
595 "Identity mapping failed. We are %ld short of 1-1 mappings!\n",
596 (pfn_e - pfn_s) - (pfn - pfn_s)))
597 printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn);
598
599 return pfn - pfn_s;
600}
601
602/* Try to install p2m mapping; fail if intermediate bits missing */
603bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
604{
605 unsigned topidx, mididx, idx;
606
607 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
608 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
609 return true;
610 }
611 if (unlikely(pfn >= MAX_P2M_PFN)) {
612 BUG_ON(mfn != INVALID_P2M_ENTRY);
613 return true;
614 }
615
616 topidx = p2m_top_index(pfn);
617 mididx = p2m_mid_index(pfn);
618 idx = p2m_index(pfn);
619
620 /* For sparse holes where the p2m leaf has a real PFN, along with
621 * PCI holes, stick the PFN in as the MFN value.
622 */
623 if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) {
624 if (p2m_top[topidx][mididx] == p2m_identity)
625 return true;
626
627 /* Swap over from MISSING to IDENTITY if needed. */
628 if (p2m_top[topidx][mididx] == p2m_missing) {
629 WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing,
630 p2m_identity) != p2m_missing);
631 return true;
632 }
633 }
634
635 if (p2m_top[topidx][mididx] == p2m_missing)
636 return mfn == INVALID_P2M_ENTRY;
637
638 p2m_top[topidx][mididx][idx] = mfn;
639
640 return true;
641}
642
643bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
644{
645 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
646 if (!alloc_p2m(pfn))
647 return false;
648
649 if (!__set_phys_to_machine(pfn, mfn))
650 return false;
651 }
652
653 return true;
654}
655
656#define M2P_OVERRIDE_HASH_SHIFT 10
657#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT)
658
659static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
660static DEFINE_SPINLOCK(m2p_override_lock);
661
662static void __init m2p_override_init(void)
663{
664 unsigned i;
665
666 m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
667 sizeof(unsigned long));
668
669 for (i = 0; i < M2P_OVERRIDE_HASH; i++)
670 INIT_LIST_HEAD(&m2p_overrides[i]);
671}
672
673static unsigned long mfn_hash(unsigned long mfn)
674{
675 return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
676}
677
678/* Add an MFN override for a particular page */
679int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte)
680{
681 unsigned long flags;
682 unsigned long pfn;
683 unsigned long uninitialized_var(address);
684 unsigned level;
685 pte_t *ptep = NULL;
686
687 pfn = page_to_pfn(page);
688 if (!PageHighMem(page)) {
689 address = (unsigned long)__va(pfn << PAGE_SHIFT);
690 ptep = lookup_address(address, &level);
691 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
692 "m2p_add_override: pfn %lx not mapped", pfn))
693 return -EINVAL;
694 }
695
696 page->private = mfn;
697 page->index = pfn_to_mfn(pfn);
698
699 if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
700 return -ENOMEM;
701
702 if (clear_pte && !PageHighMem(page))
703 /* Just zap old mapping for now */
704 pte_clear(&init_mm, address, ptep);
705 spin_lock_irqsave(&m2p_override_lock, flags);
706 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
707 spin_unlock_irqrestore(&m2p_override_lock, flags);
708
709 return 0;
710}
711EXPORT_SYMBOL_GPL(m2p_add_override);
712int m2p_remove_override(struct page *page, bool clear_pte)
713{
714 unsigned long flags;
715 unsigned long mfn;
716 unsigned long pfn;
717 unsigned long uninitialized_var(address);
718 unsigned level;
719 pte_t *ptep = NULL;
720
721 pfn = page_to_pfn(page);
722 mfn = get_phys_to_machine(pfn);
723 if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
724 return -EINVAL;
725
726 if (!PageHighMem(page)) {
727 address = (unsigned long)__va(pfn << PAGE_SHIFT);
728 ptep = lookup_address(address, &level);
729
730 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
731 "m2p_remove_override: pfn %lx not mapped", pfn))
732 return -EINVAL;
733 }
734
735 spin_lock_irqsave(&m2p_override_lock, flags);
736 list_del(&page->lru);
737 spin_unlock_irqrestore(&m2p_override_lock, flags);
738 set_phys_to_machine(pfn, page->index);
739
740 if (clear_pte && !PageHighMem(page))
741 set_pte_at(&init_mm, address, ptep,
742 pfn_pte(pfn, PAGE_KERNEL));
743 /* No tlb flush necessary because the caller already
744 * left the pte unmapped. */
745
746 return 0;
747}
748EXPORT_SYMBOL_GPL(m2p_remove_override);
749
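These two exports are meant to bracket the lifetime of a foreign page, e.g. one mapped via a grant. A hypothetical pairing (function names are illustrative, not from the patch):

    /* After the grant-map succeeds for 'page': make m2p lookups on
     * 'mfn' resolve to the local PFN. */
    static int track_foreign_page(unsigned long mfn, struct page *page)
    {
            return m2p_add_override(mfn, page, false /* keep the pte */);
    }

    /* Before the grant-unmap: restore the original translation. */
    static int untrack_foreign_page(struct page *page)
    {
            return m2p_remove_override(page, false);
    }
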
750struct page *m2p_find_override(unsigned long mfn)
751{
752 unsigned long flags;
753 struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
754 struct page *p, *ret;
755
756 ret = NULL;
757
758 spin_lock_irqsave(&m2p_override_lock, flags);
759
760 list_for_each_entry(p, bucket, lru) {
761 if (p->private == mfn) {
762 ret = p;
763 break;
764 }
765 }
766
767 spin_unlock_irqrestore(&m2p_override_lock, flags);
768
769 return ret;
770}
771
772unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
773{
774 struct page *p = m2p_find_override(mfn);
775 unsigned long ret = pfn;
776
777 if (p)
778 ret = page_to_pfn(p);
779
780 return ret;
781}
782EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
783
784#ifdef CONFIG_XEN_DEBUG_FS
785
786int p2m_dump_show(struct seq_file *m, void *v)
787{
788 static const char * const level_name[] = { "top", "middle",
789 "entry", "abnormal" };
790 static const char * const type_name[] = { "identity", "missing",
791 "pfn", "abnormal"};
792#define TYPE_IDENTITY 0
793#define TYPE_MISSING 1
794#define TYPE_PFN 2
795#define TYPE_UNKNOWN 3
796 unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0;
797 unsigned int uninitialized_var(prev_level);
798 unsigned int uninitialized_var(prev_type);
799
800 if (!p2m_top)
801 return 0;
802
803 for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) {
804 unsigned topidx = p2m_top_index(pfn);
805 unsigned mididx = p2m_mid_index(pfn);
806 unsigned idx = p2m_index(pfn);
807 unsigned lvl, type;
808
809 lvl = 4;
810 type = TYPE_UNKNOWN;
811 if (p2m_top[topidx] == p2m_mid_missing) {
812 lvl = 0; type = TYPE_MISSING;
813 } else if (p2m_top[topidx] == NULL) {
814 lvl = 0; type = TYPE_UNKNOWN;
815 } else if (p2m_top[topidx][mididx] == NULL) {
816 lvl = 1; type = TYPE_UNKNOWN;
817 } else if (p2m_top[topidx][mididx] == p2m_identity) {
818 lvl = 1; type = TYPE_IDENTITY;
819 } else if (p2m_top[topidx][mididx] == p2m_missing) {
820 lvl = 1; type = TYPE_MISSING;
821 } else if (p2m_top[topidx][mididx][idx] == 0) {
822 lvl = 2; type = TYPE_UNKNOWN;
823 } else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) {
824 lvl = 2; type = TYPE_IDENTITY;
825 } else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) {
826 lvl = 2; type = TYPE_MISSING;
827 } else if (p2m_top[topidx][mididx][idx] == pfn) {
828 lvl = 2; type = TYPE_PFN;
829 } else if (p2m_top[topidx][mididx][idx] != pfn) {
830 lvl = 2; type = TYPE_PFN;
831 }
832 if (pfn == 0) {
833 prev_level = lvl;
834 prev_type = type;
835 }
836 if (pfn == MAX_DOMAIN_PAGES-1) {
837 lvl = 3;
838 type = TYPE_UNKNOWN;
839 }
840 if (prev_type != type) {
841 seq_printf(m, " [0x%lx->0x%lx] %s\n",
842 prev_pfn_type, pfn, type_name[prev_type]);
843 prev_pfn_type = pfn;
844 prev_type = type;
845 }
846 if (prev_level != lvl) {
847 seq_printf(m, " [0x%lx->0x%lx] level %s\n",
848 prev_pfn_level, pfn, level_name[prev_level]);
849 prev_pfn_level = pfn;
850 prev_level = lvl;
851 }
852 }
853 return 0;
854#undef TYPE_IDENTITY
855#undef TYPE_MISSING
856#undef TYPE_PFN
857#undef TYPE_UNKNOWN
858}
859#endif
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
index a013ec9d0c54..b480d4207a4c 100644
--- a/arch/x86/xen/pci-swiotlb-xen.c
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -1,10 +1,12 @@
1/* Glue code to lib/swiotlb-xen.c */ 1/* Glue code to lib/swiotlb-xen.c */
2 2
3#include <linux/dma-mapping.h> 3#include <linux/dma-mapping.h>
4#include <linux/pci.h>
4#include <xen/swiotlb-xen.h> 5#include <xen/swiotlb-xen.h>
5 6
6#include <asm/xen/hypervisor.h> 7#include <asm/xen/hypervisor.h>
7#include <xen/xen.h> 8#include <xen/xen.h>
9#include <asm/iommu_table.h>
8 10
9int xen_swiotlb __read_mostly; 11int xen_swiotlb __read_mostly;
10 12
@@ -34,7 +36,7 @@ int __init pci_xen_swiotlb_detect(void)
34 36
35 /* If running as PV guest, either iommu=soft, or swiotlb=force will 37 /* If running as PV guest, either iommu=soft, or swiotlb=force will
36 * activate this IOMMU. If running as PV privileged, activate it 38 * activate this IOMMU. If running as PV privileged, activate it
37 * irregardlesss. 39 * irregardless.
38 */ 40 */
39 if ((xen_initial_domain() || swiotlb || swiotlb_force) && 41 if ((xen_initial_domain() || swiotlb || swiotlb_force) &&
40 (xen_pv_domain())) 42 (xen_pv_domain()))
@@ -54,5 +56,12 @@ void __init pci_xen_swiotlb_init(void)
54 if (xen_swiotlb) { 56 if (xen_swiotlb) {
55 xen_swiotlb_init(1); 57 xen_swiotlb_init(1);
56 dma_ops = &xen_swiotlb_dma_ops; 58 dma_ops = &xen_swiotlb_dma_ops;
59
60 /* Make sure ACS will be enabled */
61 pci_request_acs();
57 } 62 }
58} 63}
64IOMMU_INIT_FINISH(pci_xen_swiotlb_detect,
65 0,
66 pci_xen_swiotlb_init,
67 0);
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 0f456386cce5..25c52f94a27c 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -68,7 +68,7 @@ static int __init check_platform_magic(void)
68 return 0; 68 return 0;
69} 69}
70 70
71void __init xen_unplug_emulated_devices(void) 71void xen_unplug_emulated_devices(void)
72{ 72{
73 int r; 73 int r;
74 74
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 328b00305426..60aeeb56948f 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -8,6 +8,7 @@
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/pm.h> 10#include <linux/pm.h>
11#include <linux/memblock.h>
11 12
12#include <asm/elf.h> 13#include <asm/elf.h>
13#include <asm/vdso.h> 14#include <asm/vdso.h>
@@ -17,10 +18,11 @@
17#include <asm/xen/hypervisor.h> 18#include <asm/xen/hypervisor.h>
18#include <asm/xen/hypercall.h> 19#include <asm/xen/hypercall.h>
19 20
21#include <xen/xen.h>
20#include <xen/page.h> 22#include <xen/page.h>
21#include <xen/interface/callback.h> 23#include <xen/interface/callback.h>
22#include <xen/interface/physdev.h>
23#include <xen/interface/memory.h> 24#include <xen/interface/memory.h>
25#include <xen/interface/physdev.h>
24#include <xen/features.h> 26#include <xen/features.h>
25 27
26#include "xen-ops.h" 28#include "xen-ops.h"
@@ -33,6 +35,44 @@ extern void xen_sysenter_target(void);
33extern void xen_syscall_target(void); 35extern void xen_syscall_target(void);
34extern void xen_syscall32_target(void); 36extern void xen_syscall32_target(void);
35 37
38/* Amount of extra memory space we add to the e820 ranges */
39phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
40
41/*
42 * The maximum amount of extra memory compared to the base size. The
43 * main scaling factor is the size of struct page. At extreme ratios
44 * of base:extra, all the base memory can be filled with page
45 * structures for the extra memory, leaving no space for anything
46 * else.
47 *
48 * 10x seems like a reasonable balance between scaling flexibility and
49 * leaving a practically usable system.
50 */
51#define EXTRA_MEM_RATIO (10)
52
53static void __init xen_add_extra_mem(unsigned long pages)
54{
55 unsigned long pfn;
56
57 u64 size = (u64)pages * PAGE_SIZE;
58 u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
59
60 if (!pages)
61 return;
62
63 e820_add_region(extra_start, size, E820_RAM);
64 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
65
66 memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA");
67
68 xen_extra_mem_size += size;
69
70 xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
71
72 for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++)
73 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
74}
75
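
Rough numbers behind the 10x cap above, assuming 4 KiB pages and a 64-byte struct page (the actual size varies by config); the point is that even at the full ratio, the page structures for the extra region consume only a modest slice of base memory:

    #include <stdio.h>

    #define PAGE_SIZE       4096ULL
    #define STRUCT_PAGE     64ULL   /* assumed sizeof(struct page) */
    #define EXTRA_MEM_RATIO 10ULL

    int main(void)
    {
            unsigned long long base_pages  = 262144;  /* 1 GiB base */
            unsigned long long extra_pages = EXTRA_MEM_RATIO * base_pages;
            unsigned long long overhead    = extra_pages * STRUCT_PAGE;

            /* prints: extra: 10240 MiB, struct page cost: 160 MiB (15.6% of base) */
            printf("extra: %llu MiB, struct page cost: %llu MiB (%.1f%% of base)\n",
                   extra_pages * PAGE_SIZE >> 20, overhead >> 20,
                   100.0 * overhead / (base_pages * PAGE_SIZE));
            return 0;
    }
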
36static unsigned long __init xen_release_chunk(phys_addr_t start_addr, 76static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
37 phys_addr_t end_addr) 77 phys_addr_t end_addr)
38{ 78{
@@ -69,7 +109,7 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
69 WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", 109 WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
70 start, end, ret); 110 start, end, ret);
71 if (ret == 1) { 111 if (ret == 1) {
72 set_phys_to_machine(pfn, INVALID_P2M_ENTRY); 112 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
73 len++; 113 len++;
74 } 114 }
75 } 115 }
@@ -82,16 +122,18 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
82 const struct e820map *e820) 122 const struct e820map *e820)
83{ 123{
84 phys_addr_t max_addr = PFN_PHYS(max_pfn); 124 phys_addr_t max_addr = PFN_PHYS(max_pfn);
85 phys_addr_t last_end = 0; 125 phys_addr_t last_end = ISA_END_ADDRESS;
86 unsigned long released = 0; 126 unsigned long released = 0;
87 int i; 127 int i;
88 128
129 /* Free any unused memory above the low 1Mbyte. */
89 for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { 130 for (i = 0; i < e820->nr_map && last_end < max_addr; i++) {
90 phys_addr_t end = e820->map[i].addr; 131 phys_addr_t end = e820->map[i].addr;
91 end = min(max_addr, end); 132 end = min(max_addr, end);
92 133
93 released += xen_release_chunk(last_end, end); 134 if (last_end < end)
94 last_end = e820->map[i].addr + e820->map[i].size; 135 released += xen_release_chunk(last_end, end);
136 last_end = max(last_end, e820->map[i].addr + e820->map[i].size);
95 } 137 }
96 138
97 if (last_end < max_addr) 139 if (last_end < max_addr)
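
The loop above walks the sorted E820 map and hands back every page that falls in a hole between regions, starting above the low 1 MiB. A userspace model with an invented map, where release_chunk() just counts pages instead of issuing the hypercall:

    #include <stdio.h>

    struct region { unsigned long addr, size; };

    #define PAGE_SHIFT 12
    #define ISA_END    0x100000UL   /* 1 MiB */

    static unsigned long release_chunk(unsigned long start, unsigned long end)
    {
            return (end - start) >> PAGE_SHIFT; /* pages we would hand back */
    }

    int main(void)
    {
            struct region map[] = {
                    { 0x0,        0x9f000    },  /* low RAM            */
                    { 0x100000,   0x3ff00000 },  /* 1 MiB .. ~1 GiB    */
                    { 0x40100000, 0x100000   },  /* island past a hole */
            };
            unsigned long max_addr = 0x40200000UL, last_end = ISA_END;
            unsigned long released = 0;

            for (unsigned i = 0; i < 3 && last_end < max_addr; i++) {
                    unsigned long end = map[i].addr;

                    if (end > max_addr)
                            end = max_addr;
                    if (last_end < end)
                            released += release_chunk(last_end, end);
                    if (map[i].addr + map[i].size > last_end)
                            last_end = map[i].addr + map[i].size;
            }
            if (last_end < max_addr)
                    released += release_chunk(last_end, max_addr);

            printf("released %lu pages\n", released); /* 256 */
            return 0;
    }
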
@@ -101,24 +143,140 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
101 return released; 143 return released;
102} 144}
103 145
146static unsigned long __init xen_set_identity(const struct e820entry *list,
147 ssize_t map_size)
148{
149 phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS;
150 phys_addr_t start_pci = last;
151 const struct e820entry *entry;
152 unsigned long identity = 0;
153 int i;
154
155 for (i = 0, entry = list; i < map_size; i++, entry++) {
156 phys_addr_t start = entry->addr;
157 phys_addr_t end = start + entry->size;
158
159 if (start < last)
160 start = last;
161
162 if (end <= start)
163 continue;
164
165 /* Skip over the 1MB region. */
166 if (last > end)
167 continue;
168
169 if ((entry->type == E820_RAM) || (entry->type == E820_UNUSABLE)) {
170 if (start > start_pci)
171 identity += set_phys_range_identity(
172 PFN_UP(start_pci), PFN_DOWN(start));
173
 174 /* Without saving 'last' we would gobble RAM too
175 * at the end of the loop. */
176 last = end;
177 start_pci = end;
178 continue;
179 }
180 start_pci = min(start, start_pci);
181 last = end;
182 }
183 if (last > start_pci)
184 identity += set_phys_range_identity(
185 PFN_UP(start_pci), PFN_DOWN(last));
186 return identity;
187}
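
xen_set_identity() accumulates a pending non-RAM span in [start_pci, last) and flushes it as a 1:1 range whenever a RAM (or unusable) entry resumes. A toy trace of that state machine over an invented map, with set_identity() printing instead of touching the P2M:

    #include <stdio.h>

    #define PAGE_SHIFT 12

    struct ent { unsigned long addr, size; int ram; };

    static unsigned long set_identity(unsigned long s, unsigned long e)
    {
            printf("1:1 map %#lx-%#lx\n", s, e);
            return (e - s) >> PAGE_SHIFT;
    }

    int main(void)
    {
            struct ent map[] = {
                    { 0x0,        0xa0000,    1 },  /* low RAM         */
                    { 0xa0000,    0x60000,    0 },  /* legacy/reserved */
                    { 0x100000,   0x3ff00000, 1 },  /* RAM up to 1 GiB */
                    { 0x40000000, 0x1000000,  0 },  /* MMIO past RAM   */
            };
            unsigned long last = 0, start_pci = 0, identity = 0;

            for (unsigned i = 0; i < 4; i++) {
                    unsigned long start = map[i].addr;
                    unsigned long end = start + map[i].size;

                    if (start < last)
                            start = last;
                    if (end <= start)
                            continue;
                    if (map[i].ram) {
                            if (start > start_pci)
                                    identity += set_identity(start_pci, start);
                            last = end;
                            start_pci = end;
                            continue;
                    }
                    start_pci = start < start_pci ? start : start_pci;
                    last = end;
            }
            if (last > start_pci)
                    identity += set_identity(start_pci, last);
            printf("%lu identity pages\n", identity); /* 96 + 4096 */
            return 0;
    }
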
104/** 188/**
105 * machine_specific_memory_setup - Hook for machine specific memory setup. 189 * machine_specific_memory_setup - Hook for machine specific memory setup.
106 **/ 190 **/
107
108char * __init xen_memory_setup(void) 191char * __init xen_memory_setup(void)
109{ 192{
193 static struct e820entry map[E820MAX] __initdata;
194 static struct e820entry map_raw[E820MAX] __initdata;
195
110 unsigned long max_pfn = xen_start_info->nr_pages; 196 unsigned long max_pfn = xen_start_info->nr_pages;
197 unsigned long long mem_end;
198 int rc;
199 struct xen_memory_map memmap;
200 unsigned long extra_pages = 0;
201 unsigned long extra_limit;
202 unsigned long identity_pages = 0;
203 int i;
204 int op;
111 205
112 max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); 206 max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
207 mem_end = PFN_PHYS(max_pfn);
208
209 memmap.nr_entries = E820MAX;
210 set_xen_guest_handle(memmap.buffer, map);
211
212 op = xen_initial_domain() ?
213 XENMEM_machine_memory_map :
214 XENMEM_memory_map;
215 rc = HYPERVISOR_memory_op(op, &memmap);
216 if (rc == -ENOSYS) {
217 BUG_ON(xen_initial_domain());
218 memmap.nr_entries = 1;
219 map[0].addr = 0ULL;
220 map[0].size = mem_end;
221 /* 8MB slack (to balance backend allocations). */
222 map[0].size += 8ULL << 20;
223 map[0].type = E820_RAM;
224 rc = 0;
225 }
226 BUG_ON(rc);
113 227
228 memcpy(map_raw, map, sizeof(map));
114 e820.nr_map = 0; 229 e820.nr_map = 0;
230 xen_extra_mem_start = mem_end;
231 for (i = 0; i < memmap.nr_entries; i++) {
232 unsigned long long end;
233
234 /* Guard against non-page aligned E820 entries. */
235 if (map[i].type == E820_RAM)
236 map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE;
237
238 end = map[i].addr + map[i].size;
239 if (map[i].type == E820_RAM && end > mem_end) {
240 /* RAM off the end - may be partially included */
241 u64 delta = min(map[i].size, end - mem_end);
242
243 map[i].size -= delta;
244 end -= delta;
245
246 extra_pages += PFN_DOWN(delta);
247 /*
248 * Set RAM below 4GB that is not for us to be unusable.
249 * This prevents "System RAM" address space from being
 250 * used as a potential resource for I/O addresses (happens
251 * when 'allocate_resource' is called).
252 */
253 if (delta &&
254 (xen_initial_domain() && end < 0x100000000ULL))
255 e820_add_region(end, delta, E820_UNUSABLE);
256 }
115 257
116 e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM); 258 if (map[i].size > 0 && end > xen_extra_mem_start)
259 xen_extra_mem_start = end;
260
261 /* Add region if any remains */
262 if (map[i].size > 0)
263 e820_add_region(map[i].addr, map[i].size, map[i].type);
264 }
265 /* Align the balloon area so that max_low_pfn does not get set
266 * to be at the _end_ of the PCI gap at the far end (fee01000).
267 * Note that xen_extra_mem_start gets set in the loop above to be
268 * past the last E820 region. */
269 if (xen_initial_domain() && (xen_extra_mem_start < (1ULL<<32)))
270 xen_extra_mem_start = (1ULL<<32);
117 271
118 /* 272 /*
119 * Even though this is normal, usable memory under Xen, reserve 273 * In domU, the ISA region is normal, usable memory, but we
120 * ISA memory anyway because too many things think they can poke 274 * reserve ISA memory anyway because too many things poke
121 * about in there. 275 * about in there.
276 *
277 * In Dom0, the host E820 information can leave gaps in the
278 * ISA range, which would cause us to release those pages. To
279 * avoid this, we unconditionally reserve them here.
122 */ 280 */
123 e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, 281 e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
124 E820_RESERVED); 282 E820_RESERVED);
@@ -129,29 +287,43 @@ char * __init xen_memory_setup(void)
129 * - xen_start_info 287 * - xen_start_info
130 * See comment above "struct start_info" in <xen/interface/xen.h> 288 * See comment above "struct start_info" in <xen/interface/xen.h>
131 */ 289 */
132 reserve_early(__pa(xen_start_info->mfn_list), 290 memblock_x86_reserve_range(__pa(xen_start_info->mfn_list),
133 __pa(xen_start_info->pt_base), 291 __pa(xen_start_info->pt_base),
134 "XEN START INFO"); 292 "XEN START INFO");
135 293
136 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 294 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
137 295
138 xen_return_unused_memory(xen_start_info->nr_pages, &e820); 296 extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820);
139 297
140 return "Xen"; 298 /*
 141} 299 * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
 300 * factor of the base size. On non-highmem systems, the base
301 * size is the full initial memory allocation; on highmem it
302 * is limited to the max size of lowmem, so that it doesn't
303 * get completely filled.
304 *
305 * In principle there could be a problem in lowmem systems if
306 * the initial memory is also very large with respect to
307 * lowmem, but we won't try to deal with that here.
308 */
309 extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
310 max_pfn + extra_pages);
142 311
143static void xen_idle(void) 312 if (extra_limit >= max_pfn)
144{ 313 extra_pages = extra_limit - max_pfn;
145 local_irq_disable(); 314 else
146 315 extra_pages = 0;
147 if (need_resched()) 316
148 local_irq_enable(); 317 xen_add_extra_mem(extra_pages);
149 else { 318
150 current_thread_info()->status &= ~TS_POLLING; 319 /*
151 smp_mb__after_clear_bit(); 320 * Set P2M for all non-RAM pages and E820 gaps to be identity
152 safe_halt(); 321 * type PFNs. We supply it with the non-sanitized version
153 current_thread_info()->status |= TS_POLLING; 322 * of the E820.
154 } 323 */
324 identity_pages = xen_set_identity(map_raw, memmap.nr_entries);
325 printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages);
326 return "Xen";
155} 327}
156 328
157/* 329/*
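
Numeric walk-through of the extra-memory clamp near the end of xen_memory_setup(); the figures (512 MiB initial allocation, a ~896 MiB lowmem ceiling, two million candidate extra pages) are invented, only the min()/subtraction logic mirrors the code:

    #include <stdio.h>

    #define EXTRA_MEM_RATIO 10UL

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
            return a < b ? a : b;
    }

    int main(void)
    {
            unsigned long max_pfn     = 131072;  /* 512 MiB of base pages   */
            unsigned long maxmem_pfn  = 229376;  /* ~896 MiB lowmem ceiling */
            unsigned long extra_pages = 2000000; /* returned + off-the-end  */

            unsigned long extra_limit =
                    min_ul(EXTRA_MEM_RATIO * min_ul(max_pfn, maxmem_pfn),
                           max_pfn + extra_pages);

            extra_pages = extra_limit >= max_pfn ? extra_limit - max_pfn : 0;
            /* prints: extra_limit=1310720 pfns, extra_pages=1179648 (4608 MiB) */
            printf("extra_limit=%lu pfns, extra_pages=%lu (%lu MiB)\n",
                   extra_limit, extra_pages, extra_pages >> 8);
            return 0;
    }
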
@@ -170,7 +342,7 @@ static void __init fiddle_vdso(void)
170#endif 342#endif
171} 343}
172 344
173static __cpuinit int register_callback(unsigned type, const void *func) 345static int __cpuinit register_callback(unsigned type, const void *func)
174{ 346{
175 struct callback_register callback = { 347 struct callback_register callback = {
176 .type = type, 348 .type = type,
@@ -223,9 +395,6 @@ void __cpuinit xen_enable_syscall(void)
223 395
224void __init xen_arch_setup(void) 396void __init xen_arch_setup(void)
225{ 397{
226 struct physdev_set_iopl set_iopl;
227 int rc;
228
229 xen_panic_handler_init(); 398 xen_panic_handler_init();
230 399
231 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); 400 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
@@ -242,11 +411,6 @@ void __init xen_arch_setup(void)
242 xen_enable_sysenter(); 411 xen_enable_sysenter();
243 xen_enable_syscall(); 412 xen_enable_syscall();
244 413
245 set_iopl.iopl = 1;
246 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
247 if (rc != 0)
248 printk(KERN_INFO "physdev_op failed %d\n", rc);
249
250#ifdef CONFIG_ACPI 414#ifdef CONFIG_ACPI
251 if (!(xen_start_info->flags & SIF_INITDOMAIN)) { 415 if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
252 printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); 416 printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
@@ -258,9 +422,12 @@ void __init xen_arch_setup(void)
258 MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? 422 MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
259 COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); 423 COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
260 424
261 pm_idle = xen_idle; 425 /* Set up idle, making sure it calls safe_halt() pvop */
262 426#ifdef CONFIG_X86_32
263 paravirt_disable_iospace(); 427 boot_cpu_data.hlt_works_ok = 1;
428#endif
429 pm_idle = default_idle;
430 boot_option_idle_override = IDLE_HALT;
264 431
265 fiddle_vdso(); 432 fiddle_vdso();
266} 433}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 25f232b18a82..b4533a86d7e4 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -28,6 +28,7 @@
28#include <asm/xen/interface.h> 28#include <asm/xen/interface.h>
29#include <asm/xen/hypercall.h> 29#include <asm/xen/hypercall.h>
30 30
31#include <xen/xen.h>
31#include <xen/page.h> 32#include <xen/page.h>
32#include <xen/events.h> 33#include <xen/events.h>
33 34
@@ -45,18 +46,17 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
45static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); 46static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
46 47
47/* 48/*
48 * Reschedule call back. Nothing to do, 49 * Reschedule call back.
49 * all the work is done automatically when
50 * we return from the interrupt.
51 */ 50 */
52static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) 51static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
53{ 52{
54 inc_irq_stat(irq_resched_count); 53 inc_irq_stat(irq_resched_count);
54 scheduler_ipi();
55 55
56 return IRQ_HANDLED; 56 return IRQ_HANDLED;
57} 57}
58 58
59static __cpuinit void cpu_bringup(void) 59static void __cpuinit cpu_bringup(void)
60{ 60{
61 int cpu = smp_processor_id(); 61 int cpu = smp_processor_id();
62 62
@@ -84,7 +84,7 @@ static __cpuinit void cpu_bringup(void)
84 wmb(); /* make sure everything is out */ 84 wmb(); /* make sure everything is out */
85} 85}
86 86
87static __cpuinit void cpu_bringup_and_idle(void) 87static void __cpuinit cpu_bringup_and_idle(void)
88{ 88{
89 cpu_bringup(); 89 cpu_bringup();
90 cpu_idle(); 90 cpu_idle();
@@ -156,6 +156,9 @@ static void __init xen_fill_possible_map(void)
156{ 156{
157 int i, rc; 157 int i, rc;
158 158
159 if (xen_initial_domain())
160 return;
161
159 for (i = 0; i < nr_cpu_ids; i++) { 162 for (i = 0; i < nr_cpu_ids; i++) {
160 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 163 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
161 if (rc >= 0) { 164 if (rc >= 0) {
@@ -165,6 +168,27 @@ static void __init xen_fill_possible_map(void)
165 } 168 }
166} 169}
167 170
171static void __init xen_filter_cpu_maps(void)
172{
173 int i, rc;
174
175 if (!xen_initial_domain())
176 return;
177
178 num_processors = 0;
179 disabled_cpus = 0;
180 for (i = 0; i < nr_cpu_ids; i++) {
181 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
182 if (rc >= 0) {
183 num_processors++;
184 set_cpu_possible(i, true);
185 } else {
186 set_cpu_possible(i, false);
187 set_cpu_present(i, false);
188 }
189 }
190}
191
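
xen_filter_cpu_maps() shrinks the ACPI-derived possible/present maps in dom0 down to the VCPUs the hypervisor actually handed out. A userspace model where probe() stands in for HYPERVISOR_vcpu_op(VCPUOP_is_up, ...):

    #include <stdio.h>

    #define NR_CPUS 8

    /* Pretend the hypervisor gave this domain 4 VCPUs. */
    static int probe(int cpu)
    {
            return cpu < 4 ? 0 : -1;
    }

    int main(void)
    {
            int possible[NR_CPUS], num_processors = 0;

            for (int i = 0; i < NR_CPUS; i++) {
                    possible[i] = probe(i) >= 0;
                    if (possible[i])
                            num_processors++;
            }
            printf("num_processors=%d\n", num_processors); /* 4 */
            return 0;
    }
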
168static void __init xen_smp_prepare_boot_cpu(void) 192static void __init xen_smp_prepare_boot_cpu(void)
169{ 193{
170 BUG_ON(smp_processor_id() != 0); 194 BUG_ON(smp_processor_id() != 0);
@@ -174,17 +198,25 @@ static void __init xen_smp_prepare_boot_cpu(void)
174 old memory can be recycled */ 198 old memory can be recycled */
175 make_lowmem_page_readwrite(xen_initial_gdt); 199 make_lowmem_page_readwrite(xen_initial_gdt);
176 200
201 xen_filter_cpu_maps();
177 xen_setup_vcpu_info_placement(); 202 xen_setup_vcpu_info_placement();
178} 203}
179 204
180static void __init xen_smp_prepare_cpus(unsigned int max_cpus) 205static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
181{ 206{
182 unsigned cpu; 207 unsigned cpu;
208 unsigned int i;
183 209
184 xen_init_lock_cpu(0); 210 xen_init_lock_cpu(0);
185 211
186 smp_store_cpu_info(0); 212 smp_store_cpu_info(0);
187 cpu_data(0).x86_max_cores = 1; 213 cpu_data(0).x86_max_cores = 1;
214
215 for_each_possible_cpu(i) {
216 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
217 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
218 zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
219 }
188 set_cpu_sibling_map(0); 220 set_cpu_sibling_map(0);
189 221
190 if (xen_smp_intr_init(0)) 222 if (xen_smp_intr_init(0))
@@ -216,7 +248,7 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
216 } 248 }
217} 249}
218 250
219static __cpuinit int 251static int __cpuinit
220cpu_initialize_context(unsigned int cpu, struct task_struct *idle) 252cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
221{ 253{
222 struct vcpu_guest_context *ctxt; 254 struct vcpu_guest_context *ctxt;
@@ -400,9 +432,9 @@ static void stop_self(void *v)
400 BUG(); 432 BUG();
401} 433}
402 434
403static void xen_smp_send_stop(void) 435static void xen_stop_other_cpus(int wait)
404{ 436{
405 smp_call_function(stop_self, NULL, 0); 437 smp_call_function(stop_self, NULL, wait);
406} 438}
407 439
408static void xen_smp_send_reschedule(int cpu) 440static void xen_smp_send_reschedule(int cpu)
@@ -460,7 +492,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
460 return IRQ_HANDLED; 492 return IRQ_HANDLED;
461} 493}
462 494
463static const struct smp_ops xen_smp_ops __initdata = { 495static const struct smp_ops xen_smp_ops __initconst = {
464 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, 496 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
465 .smp_prepare_cpus = xen_smp_prepare_cpus, 497 .smp_prepare_cpus = xen_smp_prepare_cpus,
466 .smp_cpus_done = xen_smp_cpus_done, 498 .smp_cpus_done = xen_smp_cpus_done,
@@ -470,7 +502,7 @@ static const struct smp_ops xen_smp_ops __initdata = {
470 .cpu_disable = xen_cpu_disable, 502 .cpu_disable = xen_cpu_disable,
471 .play_dead = xen_play_dead, 503 .play_dead = xen_play_dead,
472 504
473 .smp_send_stop = xen_smp_send_stop, 505 .stop_other_cpus = xen_stop_other_cpus,
474 .smp_send_reschedule = xen_smp_send_reschedule, 506 .smp_send_reschedule = xen_smp_send_reschedule,
475 507
476 .send_call_func_ipi = xen_smp_send_call_function_ipi, 508 .send_call_func_ipi = xen_smp_send_call_function_ipi,
@@ -483,3 +515,41 @@ void __init xen_smp_init(void)
483 xen_fill_possible_map(); 515 xen_fill_possible_map();
484 xen_init_spinlocks(); 516 xen_init_spinlocks();
485} 517}
518
519static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
520{
521 native_smp_prepare_cpus(max_cpus);
522 WARN_ON(xen_smp_intr_init(0));
523
524 if (!xen_have_vector_callback)
525 return;
526 xen_init_lock_cpu(0);
527 xen_init_spinlocks();
528}
529
530static int __cpuinit xen_hvm_cpu_up(unsigned int cpu)
531{
532 int rc;
533 rc = native_cpu_up(cpu);
 534 WARN_ON(xen_smp_intr_init(cpu));
535 return rc;
536}
537
538static void xen_hvm_cpu_die(unsigned int cpu)
539{
540 unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
541 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
542 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
543 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
544 native_cpu_die(cpu);
545}
546
547void __init xen_hvm_smp_init(void)
548{
549 smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
550 smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
551 smp_ops.cpu_up = xen_hvm_cpu_up;
552 smp_ops.cpu_die = xen_hvm_cpu_die;
553 smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
554 smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
555}
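
xen_hvm_smp_init() keeps the native smp_ops table but swaps in hooks that bolt Xen event-channel setup onto the native bring-up and tear-down paths. A generic sketch of that override pattern; the struct and names below are invented:

    #include <stdio.h>

    struct smp_ops_model {
            void (*prepare_cpus)(int max);
            int  (*cpu_up)(int cpu);
    };

    static void native_prepare(int max) { printf("native prepare %d\n", max); }
    static int  native_up(int cpu)      { printf("native up %d\n", cpu); return 0; }

    static int hvm_up(int cpu)
    {
            int rc = native_up(cpu);               /* reuse the native path */
            printf("bind xen IPIs on %d\n", cpu);  /* then add the Xen bits */
            return rc;
    }

    static struct smp_ops_model smp_ops = { native_prepare, native_up };

    int main(void)
    {
            smp_ops.cpu_up = hvm_up;  /* the xen_hvm_smp_init() step */
            smp_ops.prepare_cpus(2);
            return smp_ops.cpu_up(1);
    }
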
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index e0500646585d..cc9b1e182fcf 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -159,8 +159,8 @@ static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
159{ 159{
160 struct xen_spinlock *prev; 160 struct xen_spinlock *prev;
161 161
162 prev = __get_cpu_var(lock_spinners); 162 prev = __this_cpu_read(lock_spinners);
163 __get_cpu_var(lock_spinners) = xl; 163 __this_cpu_write(lock_spinners, xl);
164 164
165 wmb(); /* set lock of interest before count */ 165 wmb(); /* set lock of interest before count */
166 166
@@ -179,14 +179,14 @@ static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock
179 asm(LOCK_PREFIX " decw %0" 179 asm(LOCK_PREFIX " decw %0"
180 : "+m" (xl->spinners) : : "memory"); 180 : "+m" (xl->spinners) : : "memory");
181 wmb(); /* decrement count before restoring lock */ 181 wmb(); /* decrement count before restoring lock */
182 __get_cpu_var(lock_spinners) = prev; 182 __this_cpu_write(lock_spinners, prev);
183} 183}
184 184
185static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable) 185static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable)
186{ 186{
187 struct xen_spinlock *xl = (struct xen_spinlock *)lock; 187 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
188 struct xen_spinlock *prev; 188 struct xen_spinlock *prev;
189 int irq = __get_cpu_var(lock_kicker_irq); 189 int irq = __this_cpu_read(lock_kicker_irq);
190 int ret; 190 int ret;
191 u64 start; 191 u64 start;
192 192
@@ -224,7 +224,7 @@ static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enab
224 goto out; 224 goto out;
225 } 225 }
226 226
227 flags = __raw_local_save_flags(); 227 flags = arch_local_save_flags();
228 if (irq_enable) { 228 if (irq_enable) {
229 ADD_STATS(taken_slow_irqenable, 1); 229 ADD_STATS(taken_slow_irqenable, 1);
230 raw_local_irq_enable(); 230 raw_local_irq_enable();
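
The conversions above replace the address-taking __get_cpu_var() with the direct __this_cpu_read()/__this_cpu_write() accessors, which compile to a single %gs-relative instruction. A loose userspace analogue using C11 thread-local storage (an analogy only, not the kernel implementation):

    #include <stdio.h>

    /* One instance per thread, as the kernel has one per CPU. */
    static _Thread_local void *lock_spinners;

    static void *read_spinners(void)      { return lock_spinners; }
    static void  write_spinners(void *xl) { lock_spinners = xl; }

    int main(void)
    {
            int xl;
            void *prev = read_spinners();  /* like __this_cpu_read()  */
            write_spinners(&xl);           /* like __this_cpu_write() */
            printf("prev=%p now=%p\n", prev, read_spinners());
            return 0;
    }
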
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 1d789d56877c..45329c8c226e 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -12,7 +12,7 @@
12#include "xen-ops.h" 12#include "xen-ops.h"
13#include "mmu.h" 13#include "mmu.h"
14 14
15void xen_pre_suspend(void) 15void xen_arch_pre_suspend(void)
16{ 16{
17 xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); 17 xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
18 xen_start_info->console.domU.mfn = 18 xen_start_info->console.domU.mfn =
@@ -26,19 +26,22 @@ void xen_pre_suspend(void)
26 BUG(); 26 BUG();
27} 27}
28 28
29void xen_hvm_post_suspend(int suspend_cancelled) 29void xen_arch_hvm_post_suspend(int suspend_cancelled)
30{ 30{
31#ifdef CONFIG_XEN_PVHVM
31 int cpu; 32 int cpu;
32 xen_hvm_init_shared_info(); 33 xen_hvm_init_shared_info();
33 xen_callback_vector(); 34 xen_callback_vector();
35 xen_unplug_emulated_devices();
34 if (xen_feature(XENFEAT_hvm_safe_pvclock)) { 36 if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
35 for_each_online_cpu(cpu) { 37 for_each_online_cpu(cpu) {
36 xen_setup_runstate_info(cpu); 38 xen_setup_runstate_info(cpu);
37 } 39 }
38 } 40 }
41#endif
39} 42}
40 43
41void xen_post_suspend(int suspend_cancelled) 44void xen_arch_post_suspend(int suspend_cancelled)
42{ 45{
43 xen_build_mfn_list_list(); 46 xen_build_mfn_list_list();
44 47
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index b2bb5aa3b054..5158c505bef9 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -26,8 +26,6 @@
26 26
27#include "xen-ops.h" 27#include "xen-ops.h"
28 28
29#define XEN_SHIFT 22
30
31/* Xen may fire a timer up to this many ns early */ 29/* Xen may fire a timer up to this many ns early */
32#define TIMER_SLOP 100000 30#define TIMER_SLOP 100000
33#define NS_PER_TICK (1000000000LL / HZ) 31#define NS_PER_TICK (1000000000LL / HZ)
@@ -135,24 +133,24 @@ static void do_stolen_accounting(void)
135 133
136 /* Add the appropriate number of ticks of stolen time, 134 /* Add the appropriate number of ticks of stolen time,
137 including any left-overs from last time. */ 135 including any left-overs from last time. */
138 stolen = runnable + offline + __get_cpu_var(xen_residual_stolen); 136 stolen = runnable + offline + __this_cpu_read(xen_residual_stolen);
139 137
140 if (stolen < 0) 138 if (stolen < 0)
141 stolen = 0; 139 stolen = 0;
142 140
143 ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); 141 ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
144 __get_cpu_var(xen_residual_stolen) = stolen; 142 __this_cpu_write(xen_residual_stolen, stolen);
145 account_steal_ticks(ticks); 143 account_steal_ticks(ticks);
146 144
147 /* Add the appropriate number of ticks of blocked time, 145 /* Add the appropriate number of ticks of blocked time,
148 including any left-overs from last time. */ 146 including any left-overs from last time. */
149 blocked += __get_cpu_var(xen_residual_blocked); 147 blocked += __this_cpu_read(xen_residual_blocked);
150 148
151 if (blocked < 0) 149 if (blocked < 0)
152 blocked = 0; 150 blocked = 0;
153 151
154 ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); 152 ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
155 __get_cpu_var(xen_residual_blocked) = blocked; 153 __this_cpu_write(xen_residual_blocked, blocked);
156 account_idle_ticks(ticks); 154 account_idle_ticks(ticks);
157} 155}
158 156
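
The accounting above converts stolen/blocked nanoseconds into whole ticks with iter_div_u64_rem(), carrying the sub-tick remainder into the next sample so nothing is lost to rounding. A worked example with plain division, assuming HZ=250 for illustration:

    #include <stdio.h>
    #include <stdint.h>

    #define NS_PER_TICK (1000000000LL / 250)  /* 4 ms */

    int main(void)
    {
            int64_t residual = 0;
            int64_t samples[] = { 6500000, 2000000, 9500000 };  /* ns stolen */

            for (int i = 0; i < 3; i++) {
                    int64_t stolen = samples[i] + residual;
                    int64_t ticks  = stolen / NS_PER_TICK;

                    residual = stolen % NS_PER_TICK;
                    /* 1 tick + 2.5ms, then 1 tick + 0.5ms, then 2 ticks + 2ms */
                    printf("sample %d: %lld tick(s), %lld ns carried\n",
                           i, (long long)ticks, (long long)residual);
            }
            return 0;
    }
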
@@ -211,8 +209,6 @@ static struct clocksource xen_clocksource __read_mostly = {
211 .rating = 400, 209 .rating = 400,
212 .read = xen_clocksource_get_cycles, 210 .read = xen_clocksource_get_cycles,
213 .mask = ~0, 211 .mask = ~0,
214 .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
215 .shift = XEN_SHIFT,
216 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 212 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
217}; 213};
218 214
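
Dropping the hand-rolled XEN_SHIFT pair works because the Xen clocksource already counts nanoseconds, so registering it via clocksource_register_hz(..., NSEC_PER_SEC) further down lets the core derive an equivalent mult/shift. Quick check that the old fixed values were an identity under the core's cycles-to-ns rule:

    #include <stdio.h>
    #include <stdint.h>

    #define XEN_SHIFT 22

    int main(void)
    {
            uint64_t cycles = 123456789ULL;  /* the "counter" is already ns */
            uint32_t mult = 1u << XEN_SHIFT, shift = XEN_SHIFT;
            uint64_t ns = (cycles * mult) >> shift;  /* core conversion rule */

            printf("%llu cycles -> %llu ns\n",
                   (unsigned long long)cycles, (unsigned long long)ns);
            return 0;
    }
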
@@ -397,7 +393,9 @@ void xen_setup_timer(int cpu)
397 name = "<timer kasprintf failed>"; 393 name = "<timer kasprintf failed>";
398 394
399 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, 395 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
400 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER, 396 IRQF_DISABLED|IRQF_PERCPU|
397 IRQF_NOBALANCING|IRQF_TIMER|
398 IRQF_FORCE_RESUME,
401 name, NULL); 399 name, NULL);
402 400
403 evt = &per_cpu(xen_clock_events, cpu); 401 evt = &per_cpu(xen_clock_events, cpu);
@@ -426,6 +424,8 @@ void xen_timer_resume(void)
426{ 424{
427 int cpu; 425 int cpu;
428 426
427 pvclock_resume();
428
429 if (xen_clockevent != &xen_vcpuop_clockevent) 429 if (xen_clockevent != &xen_vcpuop_clockevent)
430 return; 430 return;
431 431
@@ -435,16 +435,16 @@ void xen_timer_resume(void)
435 } 435 }
436} 436}
437 437
438static const struct pv_time_ops xen_time_ops __initdata = { 438static const struct pv_time_ops xen_time_ops __initconst = {
439 .sched_clock = xen_clocksource_read, 439 .sched_clock = xen_clocksource_read,
440}; 440};
441 441
442static __init void xen_time_init(void) 442static void __init xen_time_init(void)
443{ 443{
444 int cpu = smp_processor_id(); 444 int cpu = smp_processor_id();
445 struct timespec tp; 445 struct timespec tp;
446 446
447 clocksource_register(&xen_clocksource); 447 clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
448 448
449 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { 449 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
450 /* Successfully turned off 100Hz tick, so we have the 450 /* Successfully turned off 100Hz tick, so we have the
@@ -464,7 +464,7 @@ static __init void xen_time_init(void)
464 xen_setup_cpu_clockevents(); 464 xen_setup_cpu_clockevents();
465} 465}
466 466
467__init void xen_init_time_ops(void) 467void __init xen_init_time_ops(void)
468{ 468{
469 pv_time_ops = xen_time_ops; 469 pv_time_ops = xen_time_ops;
470 470
@@ -486,7 +486,7 @@ static void xen_hvm_setup_cpu_clockevents(void)
486 xen_setup_cpu_clockevents(); 486 xen_setup_cpu_clockevents();
487} 487}
488 488
489__init void xen_hvm_init_time_ops(void) 489void __init xen_hvm_init_time_ops(void)
490{ 490{
491 /* vector callback is needed otherwise we cannot receive interrupts 491 /* vector callback is needed otherwise we cannot receive interrupts
492 * on cpu > 0 and at this point we don't know how many cpus are 492 * on cpu > 0 and at this point we don't know how many cpus are
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 1a5ff24e29c0..aaa7291c9259 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -28,9 +28,9 @@ ENTRY(startup_xen)
28 __FINIT 28 __FINIT
29 29
30.pushsection .text 30.pushsection .text
31 .align PAGE_SIZE_asm 31 .align PAGE_SIZE
32ENTRY(hypercall_page) 32ENTRY(hypercall_page)
33 .skip PAGE_SIZE_asm 33 .skip PAGE_SIZE
34.popsection 34.popsection
35 35
36 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") 36 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 7c8ab86163e9..97dfdc8757b3 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -30,6 +30,9 @@ void xen_setup_machphys_mapping(void);
30pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); 30pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
31void xen_ident_map_ISA(void); 31void xen_ident_map_ISA(void);
32void xen_reserve_top(void); 32void xen_reserve_top(void);
33extern unsigned long xen_max_p2m_pfn;
34
35void xen_set_pat(u64);
33 36
34char * __init xen_memory_setup(void); 37char * __init xen_memory_setup(void);
35void __init xen_arch_setup(void); 38void __init xen_arch_setup(void);
@@ -40,7 +43,7 @@ void xen_vcpu_restore(void);
40 43
41void xen_callback_vector(void); 44void xen_callback_vector(void);
42void xen_hvm_init_shared_info(void); 45void xen_hvm_init_shared_info(void);
43void __init xen_unplug_emulated_devices(void); 46void xen_unplug_emulated_devices(void);
44 47
45void __init xen_build_dynamic_phys_to_machine(void); 48void __init xen_build_dynamic_phys_to_machine(void);
46 49
@@ -61,15 +64,17 @@ void xen_setup_vcpu_info_placement(void);
61 64
62#ifdef CONFIG_SMP 65#ifdef CONFIG_SMP
63void xen_smp_init(void); 66void xen_smp_init(void);
67void __init xen_hvm_smp_init(void);
64 68
65extern cpumask_var_t xen_cpu_initialized_map; 69extern cpumask_var_t xen_cpu_initialized_map;
66#else 70#else
67static inline void xen_smp_init(void) {} 71static inline void xen_smp_init(void) {}
72static inline void xen_hvm_smp_init(void) {}
68#endif 73#endif
69 74
70#ifdef CONFIG_PARAVIRT_SPINLOCKS 75#ifdef CONFIG_PARAVIRT_SPINLOCKS
71void __init xen_init_spinlocks(void); 76void __init xen_init_spinlocks(void);
72__cpuinit void xen_init_lock_cpu(int cpu); 77void __cpuinit xen_init_lock_cpu(int cpu);
73void xen_uninit_lock_cpu(int cpu); 78void xen_uninit_lock_cpu(int cpu);
74#else 79#else
75static inline void xen_init_spinlocks(void) 80static inline void xen_init_spinlocks(void)