about | summary | refs | log | tree | commit | diff | stats
path: root/arch/x86/xen
diff options
context:
space:
mode:
author Jonathan Herman <hermanjl@cs.unc.edu> 2013-01-17 16:15:55 -0500
committer Jonathan Herman <hermanjl@cs.unc.edu> 2013-01-17 16:15:55 -0500
commit 8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
tree a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /arch/x86/xen
parent 406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'arch/x86/xen')
-rw-r--r-- arch/x86/xen/Kconfig 16
-rw-r--r-- arch/x86/xen/Makefile 2
-rw-r--r-- arch/x86/xen/apic.c 34
-rw-r--r-- arch/x86/xen/debugfs.c 104
-rw-r--r-- arch/x86/xen/debugfs.h 4
-rw-r--r-- arch/x86/xen/enlighten.c 409
-rw-r--r-- arch/x86/xen/grant-table.c 46
-rw-r--r-- arch/x86/xen/irq.c 9
-rw-r--r-- arch/x86/xen/mmu.c 433
-rw-r--r-- arch/x86/xen/multicalls.h 2
-rw-r--r-- arch/x86/xen/p2m.c 449
-rw-r--r-- arch/x86/xen/pci-swiotlb-xen.c 56
-rw-r--r-- arch/x86/xen/platform-pci-unplug.c 1
-rw-r--r-- arch/x86/xen/setup.c 442
-rw-r--r-- arch/x86/xen/smp.c 169
-rw-r--r-- arch/x86/xen/smp.h 12
-rw-r--r-- arch/x86/xen/spinlock.c 39
-rw-r--r-- arch/x86/xen/suspend.c 2
-rw-r--r-- arch/x86/xen/time.c 16
-rw-r--r-- arch/x86/xen/vga.c 7
-rw-r--r-- arch/x86/xen/xen-asm.S 2
-rw-r--r-- arch/x86/xen/xen-asm_32.S 6
-rw-r--r-- arch/x86/xen/xen-head.S 56
-rw-r--r-- arch/x86/xen/xen-ops.h 11
24 files changed, 653 insertions, 1674 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 131dacd2748..5cc821cb2e0 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,9 +6,8 @@ config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 select PARAVIRT_CLOCK 8 select PARAVIRT_CLOCK
9 select XEN_HAVE_PVMMU
10 depends on X86_64 || (X86_32 && X86_PAE && !X86_VISWS) 9 depends on X86_64 || (X86_32 && X86_PAE && !X86_VISWS)
11 depends on X86_TSC 10 depends on X86_CMPXCHG && X86_TSC
12 help 11 help
13 This is the Linux Xen port. Enabling this will allow the 12 This is the Linux Xen port. Enabling this will allow the
14 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
@@ -26,12 +25,12 @@ config XEN_PRIVILEGED_GUEST
26 25
27config XEN_PVHVM 26config XEN_PVHVM
28 def_bool y 27 def_bool y
29 depends on XEN && PCI && X86_LOCAL_APIC 28 depends on XEN
29 depends on X86_LOCAL_APIC
30 30
31config XEN_MAX_DOMAIN_MEMORY 31config XEN_MAX_DOMAIN_MEMORY
32 int 32 int
33 default 500 if X86_64 33 default 128
34 default 64 if X86_32
35 depends on XEN 34 depends on XEN
36 help 35 help
37 This only affects the sizing of some bss arrays, the unused 36 This only affects the sizing of some bss arrays, the unused
@@ -51,3 +50,10 @@ config XEN_DEBUG_FS
51 Enable statistics output and various tuning options in debugfs. 50 Enable statistics output and various tuning options in debugfs.
52 Enabling this option may incur a significant performance overhead. 51 Enabling this option may incur a significant performance overhead.
53 52
53config XEN_DEBUG
54 bool "Enable Xen debug checks"
55 depends on XEN
56 default n
57 help
58 Enable various WARN_ON checks in the Xen MMU code.
59 Enabling this option WILL incur a significant performance overhead.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 96ab2c09cb6..add2c2d729c 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -20,5 +20,5 @@ obj-$(CONFIG_EVENT_TRACING) += trace.o
20obj-$(CONFIG_SMP) += smp.o 20obj-$(CONFIG_SMP) += smp.o
21obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o 21obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
22obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o 22obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
23obj-$(CONFIG_XEN_DOM0) += apic.o vga.o 23obj-$(CONFIG_XEN_DOM0) += vga.o
24obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o 24obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
deleted file mode 100644
index 7005ced5d1a..00000000000
--- a/arch/x86/xen/apic.c
+++ /dev/null
@@ -1,34 +0,0 @@
1#include <linux/init.h>
2
3#include <asm/x86_init.h>
4#include <asm/apic.h>
5#include <asm/xen/hypercall.h>
6
7#include <xen/xen.h>
8#include <xen/interface/physdev.h>
9#include "xen-ops.h"
10
11static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
12{
13 struct physdev_apic apic_op;
14 int ret;
15
16 apic_op.apic_physbase = mpc_ioapic_addr(apic);
17 apic_op.reg = reg;
18 ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
19 if (!ret)
20 return apic_op.value;
21
22 /* fallback to return an emulated IO_APIC values */
23 if (reg == 0x1)
24 return 0x00170020;
25 else if (reg == 0x0)
26 return apic << 24;
27
28 return 0xfd;
29}
30
31void __init xen_init_apic(void)
32{
33 x86_io_apic_ops.read = xen_io_apic_read;
34}
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index c8377fb26cd..7c0fedd98ea 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -19,3 +19,107 @@ struct dentry * __init xen_init_debugfs(void)
19 return d_xen_debug; 19 return d_xen_debug;
20} 20}
21 21
22struct array_data
23{
24 void *array;
25 unsigned elements;
26};
27
28static int u32_array_open(struct inode *inode, struct file *file)
29{
30 file->private_data = NULL;
31 return nonseekable_open(inode, file);
32}
33
34static size_t format_array(char *buf, size_t bufsize, const char *fmt,
35 u32 *array, unsigned array_size)
36{
37 size_t ret = 0;
38 unsigned i;
39
40 for(i = 0; i < array_size; i++) {
41 size_t len;
42
43 len = snprintf(buf, bufsize, fmt, array[i]);
44 len++; /* ' ' or '\n' */
45 ret += len;
46
47 if (buf) {
48 buf += len;
49 bufsize -= len;
50 buf[-1] = (i == array_size-1) ? '\n' : ' ';
51 }
52 }
53
54 ret++; /* \0 */
55 if (buf)
56 *buf = '\0';
57
58 return ret;
59}
60
61static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size)
62{
63 size_t len = format_array(NULL, 0, fmt, array, array_size);
64 char *ret;
65
66 ret = kmalloc(len, GFP_KERNEL);
67 if (ret == NULL)
68 return NULL;
69
70 format_array(ret, len, fmt, array, array_size);
71 return ret;
72}
73
74static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
75 loff_t *ppos)
76{
77 struct inode *inode = file->f_path.dentry->d_inode;
78 struct array_data *data = inode->i_private;
79 size_t size;
80
81 if (*ppos == 0) {
82 if (file->private_data) {
83 kfree(file->private_data);
84 file->private_data = NULL;
85 }
86
87 file->private_data = format_array_alloc("%u", data->array, data->elements);
88 }
89
90 size = 0;
91 if (file->private_data)
92 size = strlen(file->private_data);
93
94 return simple_read_from_buffer(buf, len, ppos, file->private_data, size);
95}
96
97static int xen_array_release(struct inode *inode, struct file *file)
98{
99 kfree(file->private_data);
100
101 return 0;
102}
103
104static const struct file_operations u32_array_fops = {
105 .owner = THIS_MODULE,
106 .open = u32_array_open,
107 .release= xen_array_release,
108 .read = u32_array_read,
109 .llseek = no_llseek,
110};
111
112struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
113 struct dentry *parent,
114 u32 *array, unsigned elements)
115{
116 struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
117
118 if (data == NULL)
119 return NULL;
120
121 data->array = array;
122 data->elements = elements;
123
124 return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
125}
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
index 12ebf3325c7..e2813208483 100644
--- a/arch/x86/xen/debugfs.h
+++ b/arch/x86/xen/debugfs.h
@@ -3,4 +3,8 @@
3 3
4struct dentry * __init xen_init_debugfs(void); 4struct dentry * __init xen_init_debugfs(void);
5 5
6struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
7 struct dentry *parent,
8 u32 *array, unsigned elements);
9
6#endif /* _XEN_DEBUGFS_H */ 10#endif /* _XEN_DEBUGFS_H */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 138e5667409..46c8069ae98 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -33,18 +33,15 @@
33#include <linux/memblock.h> 33#include <linux/memblock.h>
34 34
35#include <xen/xen.h> 35#include <xen/xen.h>
36#include <xen/events.h>
37#include <xen/interface/xen.h> 36#include <xen/interface/xen.h>
38#include <xen/interface/version.h> 37#include <xen/interface/version.h>
39#include <xen/interface/physdev.h> 38#include <xen/interface/physdev.h>
40#include <xen/interface/vcpu.h> 39#include <xen/interface/vcpu.h>
41#include <xen/interface/memory.h> 40#include <xen/interface/memory.h>
42#include <xen/interface/xen-mca.h>
43#include <xen/features.h> 41#include <xen/features.h>
44#include <xen/page.h> 42#include <xen/page.h>
45#include <xen/hvm.h> 43#include <xen/hvm.h>
46#include <xen/hvc-console.h> 44#include <xen/hvc-console.h>
47#include <xen/acpi.h>
48 45
49#include <asm/paravirt.h> 46#include <asm/paravirt.h>
50#include <asm/apic.h> 47#include <asm/apic.h>
@@ -65,20 +62,9 @@
65#include <asm/reboot.h> 62#include <asm/reboot.h>
66#include <asm/stackprotector.h> 63#include <asm/stackprotector.h>
67#include <asm/hypervisor.h> 64#include <asm/hypervisor.h>
68#include <asm/mwait.h>
69#include <asm/pci_x86.h>
70
71#ifdef CONFIG_ACPI
72#include <linux/acpi.h>
73#include <asm/acpi.h>
74#include <acpi/pdc_intel.h>
75#include <acpi/processor.h>
76#include <xen/interface/platform.h>
77#endif
78 65
79#include "xen-ops.h" 66#include "xen-ops.h"
80#include "mmu.h" 67#include "mmu.h"
81#include "smp.h"
82#include "multicalls.h" 68#include "multicalls.h"
83 69
84EXPORT_SYMBOL_GPL(hypercall_page); 70EXPORT_SYMBOL_GPL(hypercall_page);
@@ -109,7 +95,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback);
109 * Point at some empty memory to start with. We map the real shared_info 95 * Point at some empty memory to start with. We map the real shared_info
110 * page as soon as fixmap is up and running. 96 * page as soon as fixmap is up and running.
111 */ 97 */
112struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info; 98struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
113 99
114/* 100/*
115 * Flag to determine whether vcpu info placement is available on all 101 * Flag to determine whether vcpu info placement is available on all
@@ -126,19 +112,6 @@ struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
126 */ 112 */
127static int have_vcpu_info_placement = 1; 113static int have_vcpu_info_placement = 1;
128 114
129struct tls_descs {
130 struct desc_struct desc[3];
131};
132
133/*
134 * Updating the 3 TLS descriptors in the GDT on every task switch is
135 * surprisingly expensive so we avoid updating them if they haven't
136 * changed. Since Xen writes different descriptors than the one
137 * passed in the update_descriptor hypercall we keep shadow copies to
138 * compare against.
139 */
140static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
141
142static void clamp_max_cpus(void) 115static void clamp_max_cpus(void)
143{ 116{
144#ifdef CONFIG_SMP 117#ifdef CONFIG_SMP
@@ -193,11 +166,10 @@ void xen_vcpu_restore(void)
193{ 166{
194 int cpu; 167 int cpu;
195 168
196 for_each_possible_cpu(cpu) { 169 for_each_online_cpu(cpu) {
197 bool other_cpu = (cpu != smp_processor_id()); 170 bool other_cpu = (cpu != smp_processor_id());
198 bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL);
199 171
200 if (other_cpu && is_up && 172 if (other_cpu &&
201 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL)) 173 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
202 BUG(); 174 BUG();
203 175
@@ -206,7 +178,7 @@ void xen_vcpu_restore(void)
206 if (have_vcpu_info_placement) 178 if (have_vcpu_info_placement)
207 xen_vcpu_setup(cpu); 179 xen_vcpu_setup(cpu);
208 180
209 if (other_cpu && is_up && 181 if (other_cpu &&
210 HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) 182 HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
211 BUG(); 183 BUG();
212 } 184 }
@@ -224,39 +196,17 @@ static void __init xen_banner(void)
224 version >> 16, version & 0xffff, extra.extraversion, 196 version >> 16, version & 0xffff, extra.extraversion,
225 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); 197 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
226} 198}
227/* Check if running on Xen version (major, minor) or later */
228bool
229xen_running_on_version_or_later(unsigned int major, unsigned int minor)
230{
231 unsigned int version;
232
233 if (!xen_domain())
234 return false;
235
236 version = HYPERVISOR_xen_version(XENVER_version, NULL);
237 if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) ||
238 ((version >> 16) > major))
239 return true;
240 return false;
241}
242
243#define CPUID_THERM_POWER_LEAF 6
244#define APERFMPERF_PRESENT 0
245 199
246static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0; 200static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
247static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; 201static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
248 202
249static __read_mostly unsigned int cpuid_leaf1_ecx_set_mask;
250static __read_mostly unsigned int cpuid_leaf5_ecx_val;
251static __read_mostly unsigned int cpuid_leaf5_edx_val;
252
253static void xen_cpuid(unsigned int *ax, unsigned int *bx, 203static void xen_cpuid(unsigned int *ax, unsigned int *bx,
254 unsigned int *cx, unsigned int *dx) 204 unsigned int *cx, unsigned int *dx)
255{ 205{
256 unsigned maskebx = ~0; 206 unsigned maskebx = ~0;
257 unsigned maskecx = ~0; 207 unsigned maskecx = ~0;
258 unsigned maskedx = ~0; 208 unsigned maskedx = ~0;
259 unsigned setecx = 0; 209
260 /* 210 /*
261 * Mask out inconvenient features, to try and disable as many 211 * Mask out inconvenient features, to try and disable as many
262 * unsupported kernel subsystems as possible. 212 * unsupported kernel subsystems as possible.
@@ -264,23 +214,9 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
264 switch (*ax) { 214 switch (*ax) {
265 case 1: 215 case 1:
266 maskecx = cpuid_leaf1_ecx_mask; 216 maskecx = cpuid_leaf1_ecx_mask;
267 setecx = cpuid_leaf1_ecx_set_mask;
268 maskedx = cpuid_leaf1_edx_mask; 217 maskedx = cpuid_leaf1_edx_mask;
269 break; 218 break;
270 219
271 case CPUID_MWAIT_LEAF:
272 /* Synthesize the values.. */
273 *ax = 0;
274 *bx = 0;
275 *cx = cpuid_leaf5_ecx_val;
276 *dx = cpuid_leaf5_edx_val;
277 return;
278
279 case CPUID_THERM_POWER_LEAF:
280 /* Disabling APERFMPERF for kernel usage */
281 maskecx = ~(1 << APERFMPERF_PRESENT);
282 break;
283
284 case 0xb: 220 case 0xb:
285 /* Suppress extended topology stuff */ 221 /* Suppress extended topology stuff */
286 maskebx = 0; 222 maskebx = 0;
@@ -296,89 +232,18 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
296 232
297 *bx &= maskebx; 233 *bx &= maskebx;
298 *cx &= maskecx; 234 *cx &= maskecx;
299 *cx |= setecx;
300 *dx &= maskedx; 235 *dx &= maskedx;
301
302} 236}
303 237
304static bool __init xen_check_mwait(void)
305{
306#ifdef CONFIG_ACPI
307 struct xen_platform_op op = {
308 .cmd = XENPF_set_processor_pminfo,
309 .u.set_pminfo.id = -1,
310 .u.set_pminfo.type = XEN_PM_PDC,
311 };
312 uint32_t buf[3];
313 unsigned int ax, bx, cx, dx;
314 unsigned int mwait_mask;
315
316 /* We need to determine whether it is OK to expose the MWAIT
317 * capability to the kernel to harvest deeper than C3 states from ACPI
318 * _CST using the processor_harvest_xen.c module. For this to work, we
319 * need to gather the MWAIT_LEAF values (which the cstate.c code
320 * checks against). The hypervisor won't expose the MWAIT flag because
321 * it would break backwards compatibility; so we will find out directly
322 * from the hardware and hypercall.
323 */
324 if (!xen_initial_domain())
325 return false;
326
327 /*
328 * When running under platform earlier than Xen4.2, do not expose
329 * mwait, to avoid the risk of loading native acpi pad driver
330 */
331 if (!xen_running_on_version_or_later(4, 2))
332 return false;
333
334 ax = 1;
335 cx = 0;
336
337 native_cpuid(&ax, &bx, &cx, &dx);
338
339 mwait_mask = (1 << (X86_FEATURE_EST % 32)) |
340 (1 << (X86_FEATURE_MWAIT % 32));
341
342 if ((cx & mwait_mask) != mwait_mask)
343 return false;
344
345 /* We need to emulate the MWAIT_LEAF and for that we need both
346 * ecx and edx. The hypercall provides only partial information.
347 */
348
349 ax = CPUID_MWAIT_LEAF;
350 bx = 0;
351 cx = 0;
352 dx = 0;
353
354 native_cpuid(&ax, &bx, &cx, &dx);
355
356 /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
357 * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
358 */
359 buf[0] = ACPI_PDC_REVISION_ID;
360 buf[1] = 1;
361 buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP);
362
363 set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
364
365 if ((HYPERVISOR_dom0_op(&op) == 0) &&
366 (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
367 cpuid_leaf5_ecx_val = cx;
368 cpuid_leaf5_edx_val = dx;
369 }
370 return true;
371#else
372 return false;
373#endif
374}
375static void __init xen_init_cpuid_mask(void) 238static void __init xen_init_cpuid_mask(void)
376{ 239{
377 unsigned int ax, bx, cx, dx; 240 unsigned int ax, bx, cx, dx;
378 unsigned int xsave_mask; 241 unsigned int xsave_mask;
379 242
380 cpuid_leaf1_edx_mask = 243 cpuid_leaf1_edx_mask =
381 ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */ 244 ~((1 << X86_FEATURE_MCE) | /* disable MCE */
245 (1 << X86_FEATURE_MCA) | /* disable MCA */
246 (1 << X86_FEATURE_MTRR) | /* disable MTRR */
382 (1 << X86_FEATURE_ACC)); /* thermal monitoring */ 247 (1 << X86_FEATURE_ACC)); /* thermal monitoring */
383 248
384 if (!xen_initial_domain()) 249 if (!xen_initial_domain())
@@ -386,7 +251,6 @@ static void __init xen_init_cpuid_mask(void)
386 ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ 251 ~((1 << X86_FEATURE_APIC) | /* disable local APIC */
387 (1 << X86_FEATURE_ACPI)); /* disable ACPI */ 252 (1 << X86_FEATURE_ACPI)); /* disable ACPI */
388 ax = 1; 253 ax = 1;
389 cx = 0;
390 xen_cpuid(&ax, &bx, &cx, &dx); 254 xen_cpuid(&ax, &bx, &cx, &dx);
391 255
392 xsave_mask = 256 xsave_mask =
@@ -396,8 +260,6 @@ static void __init xen_init_cpuid_mask(void)
396 /* Xen will set CR4.OSXSAVE if supported and not disabled by force */ 260 /* Xen will set CR4.OSXSAVE if supported and not disabled by force */
397 if ((cx & xsave_mask) != xsave_mask) 261 if ((cx & xsave_mask) != xsave_mask)
398 cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */ 262 cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
399 if (xen_check_mwait())
400 cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32));
401} 263}
402 264
403static void xen_set_debugreg(int reg, unsigned long val) 265static void xen_set_debugreg(int reg, unsigned long val)
@@ -575,28 +437,12 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
575 BUG(); 437 BUG();
576} 438}
577 439
578static inline bool desc_equal(const struct desc_struct *d1,
579 const struct desc_struct *d2)
580{
581 return d1->a == d2->a && d1->b == d2->b;
582}
583
584static void load_TLS_descriptor(struct thread_struct *t, 440static void load_TLS_descriptor(struct thread_struct *t,
585 unsigned int cpu, unsigned int i) 441 unsigned int cpu, unsigned int i)
586{ 442{
587 struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i]; 443 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
588 struct desc_struct *gdt; 444 xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
589 xmaddr_t maddr; 445 struct multicall_space mc = __xen_mc_entry(0);
590 struct multicall_space mc;
591
592 if (desc_equal(shadow, &t->tls_array[i]))
593 return;
594
595 *shadow = t->tls_array[i];
596
597 gdt = get_cpu_gdt_table(cpu);
598 maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
599 mc = __xen_mc_entry(0);
600 446
601 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); 447 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
602} 448}
@@ -678,8 +524,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
678 /* 524 /*
679 * Look for known traps using IST, and substitute them 525 * Look for known traps using IST, and substitute them
680 * appropriately. The debugger ones are the only ones we care 526 * appropriately. The debugger ones are the only ones we care
681 * about. Xen will handle faults like double_fault, 527 * about. Xen will handle faults like double_fault and
682 * so we should never see them. Warn if 528 * machine_check, so we should never see them. Warn if
683 * there's an unexpected IST-using fault handler. 529 * there's an unexpected IST-using fault handler.
684 */ 530 */
685 if (addr == (unsigned long)debug) 531 if (addr == (unsigned long)debug)
@@ -694,11 +540,7 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
694 return 0; 540 return 0;
695#ifdef CONFIG_X86_MCE 541#ifdef CONFIG_X86_MCE
696 } else if (addr == (unsigned long)machine_check) { 542 } else if (addr == (unsigned long)machine_check) {
697 /* 543 return 0;
698 * when xen hypervisor inject vMCE to guest,
699 * use native mce handler to handle it
700 */
701 ;
702#endif 544#endif
703 } else { 545 } else {
704 /* Some other trap using IST? */ 546 /* Some other trap using IST? */
@@ -875,40 +717,9 @@ static void xen_io_delay(void)
875} 717}
876 718
877#ifdef CONFIG_X86_LOCAL_APIC 719#ifdef CONFIG_X86_LOCAL_APIC
878static unsigned long xen_set_apic_id(unsigned int x)
879{
880 WARN_ON(1);
881 return x;
882}
883static unsigned int xen_get_apic_id(unsigned long x)
884{
885 return ((x)>>24) & 0xFFu;
886}
887static u32 xen_apic_read(u32 reg) 720static u32 xen_apic_read(u32 reg)
888{ 721{
889 struct xen_platform_op op = { 722 return 0;
890 .cmd = XENPF_get_cpuinfo,
891 .interface_version = XENPF_INTERFACE_VERSION,
892 .u.pcpu_info.xen_cpuid = 0,
893 };
894 int ret = 0;
895
896 /* Shouldn't need this as APIC is turned off for PV, and we only
897 * get called on the bootup processor. But just in case. */
898 if (!xen_initial_domain() || smp_processor_id())
899 return 0;
900
901 if (reg == APIC_LVR)
902 return 0x10;
903
904 if (reg != APIC_ID)
905 return 0;
906
907 ret = HYPERVISOR_dom0_op(&op);
908 if (ret)
909 return 0;
910
911 return op.u.pcpu_info.apic_id << 24;
912} 723}
913 724
914static void xen_apic_write(u32 reg, u32 val) 725static void xen_apic_write(u32 reg, u32 val)
@@ -946,16 +757,6 @@ static void set_xen_basic_apic_ops(void)
946 apic->icr_write = xen_apic_icr_write; 757 apic->icr_write = xen_apic_icr_write;
947 apic->wait_icr_idle = xen_apic_wait_icr_idle; 758 apic->wait_icr_idle = xen_apic_wait_icr_idle;
948 apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle; 759 apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
949 apic->set_apic_id = xen_set_apic_id;
950 apic->get_apic_id = xen_get_apic_id;
951
952#ifdef CONFIG_SMP
953 apic->send_IPI_allbutself = xen_send_IPI_allbutself;
954 apic->send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself;
955 apic->send_IPI_mask = xen_send_IPI_mask;
956 apic->send_IPI_all = xen_send_IPI_all;
957 apic->send_IPI_self = xen_send_IPI_self;
958#endif
959} 760}
960 761
961#endif 762#endif
@@ -975,11 +776,11 @@ static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
975 776
976static unsigned long xen_read_cr0(void) 777static unsigned long xen_read_cr0(void)
977{ 778{
978 unsigned long cr0 = this_cpu_read(xen_cr0_value); 779 unsigned long cr0 = percpu_read(xen_cr0_value);
979 780
980 if (unlikely(cr0 == 0)) { 781 if (unlikely(cr0 == 0)) {
981 cr0 = native_read_cr0(); 782 cr0 = native_read_cr0();
982 this_cpu_write(xen_cr0_value, cr0); 783 percpu_write(xen_cr0_value, cr0);
983 } 784 }
984 785
985 return cr0; 786 return cr0;
@@ -989,7 +790,7 @@ static void xen_write_cr0(unsigned long cr0)
989{ 790{
990 struct multicall_space mcs; 791 struct multicall_space mcs;
991 792
992 this_cpu_write(xen_cr0_value, cr0); 793 percpu_write(xen_cr0_value, cr0);
993 794
994 /* Only pay attention to cr0.TS; everything else is 795 /* Only pay attention to cr0.TS; everything else is
995 ignored. */ 796 ignored. */
@@ -1007,16 +808,7 @@ static void xen_write_cr4(unsigned long cr4)
1007 808
1008 native_write_cr4(cr4); 809 native_write_cr4(cr4);
1009} 810}
1010#ifdef CONFIG_X86_64 811
1011static inline unsigned long xen_read_cr8(void)
1012{
1013 return 0;
1014}
1015static inline void xen_write_cr8(unsigned long val)
1016{
1017 BUG_ON(val);
1018}
1019#endif
1020static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) 812static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
1021{ 813{
1022 int ret; 814 int ret;
@@ -1083,7 +875,7 @@ void xen_setup_shared_info(void)
1083 xen_setup_mfn_list_list(); 875 xen_setup_mfn_list_list();
1084} 876}
1085 877
1086/* This is called once we have the cpu_possible_mask */ 878/* This is called once we have the cpu_possible_map */
1087void xen_setup_vcpu_info_placement(void) 879void xen_setup_vcpu_info_placement(void)
1088{ 880{
1089 int cpu; 881 int cpu;
@@ -1185,21 +977,13 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
1185 .read_cr4_safe = native_read_cr4_safe, 977 .read_cr4_safe = native_read_cr4_safe,
1186 .write_cr4 = xen_write_cr4, 978 .write_cr4 = xen_write_cr4,
1187 979
1188#ifdef CONFIG_X86_64
1189 .read_cr8 = xen_read_cr8,
1190 .write_cr8 = xen_write_cr8,
1191#endif
1192
1193 .wbinvd = native_wbinvd, 980 .wbinvd = native_wbinvd,
1194 981
1195 .read_msr = native_read_msr_safe, 982 .read_msr = native_read_msr_safe,
1196 .write_msr = xen_write_msr_safe, 983 .write_msr = xen_write_msr_safe,
1197
1198 .read_tsc = native_read_tsc, 984 .read_tsc = native_read_tsc,
1199 .read_pmc = native_read_pmc, 985 .read_pmc = native_read_pmc,
1200 986
1201 .read_tscp = native_read_tscp,
1202
1203 .iret = xen_iret, 987 .iret = xen_iret,
1204 .irq_enable_sysexit = xen_sysexit, 988 .irq_enable_sysexit = xen_sysexit,
1205#ifdef CONFIG_X86_64 989#ifdef CONFIG_X86_64
@@ -1327,6 +1111,7 @@ asmlinkage void __init xen_start_kernel(void)
1327{ 1111{
1328 struct physdev_set_iopl set_iopl; 1112 struct physdev_set_iopl set_iopl;
1329 int rc; 1113 int rc;
1114 pgd_t *pgd;
1330 1115
1331 if (!xen_start_info) 1116 if (!xen_start_info)
1332 return; 1117 return;
@@ -1355,9 +1140,7 @@ asmlinkage void __init xen_start_kernel(void)
1355 1140
1356 /* Prevent unwanted bits from being set in PTEs. */ 1141 /* Prevent unwanted bits from being set in PTEs. */
1357 __supported_pte_mask &= ~_PAGE_GLOBAL; 1142 __supported_pte_mask &= ~_PAGE_GLOBAL;
1358#if 0
1359 if (!xen_initial_domain()) 1143 if (!xen_initial_domain())
1360#endif
1361 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); 1144 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1362 1145
1363 __supported_pte_mask |= _PAGE_IOMAP; 1146 __supported_pte_mask |= _PAGE_IOMAP;
@@ -1418,6 +1201,12 @@ asmlinkage void __init xen_start_kernel(void)
1418 acpi_numa = -1; 1201 acpi_numa = -1;
1419#endif 1202#endif
1420 1203
1204 pgd = (pgd_t *)xen_start_info->pt_base;
1205
1206 if (!xen_initial_domain())
1207 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1208
1209 __supported_pte_mask |= _PAGE_IOMAP;
1421 /* Don't do the full vcpu_info placement stuff until we have a 1210 /* Don't do the full vcpu_info placement stuff until we have a
1422 possible map and a non-dummy shared_info. */ 1211 possible map and a non-dummy shared_info. */
1423 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1212 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
@@ -1425,8 +1214,11 @@ asmlinkage void __init xen_start_kernel(void)
1425 local_irq_disable(); 1214 local_irq_disable();
1426 early_boot_irqs_disabled = true; 1215 early_boot_irqs_disabled = true;
1427 1216
1217 memblock_init();
1218
1428 xen_raw_console_write("mapping kernel into physical memory\n"); 1219 xen_raw_console_write("mapping kernel into physical memory\n");
1429 xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages); 1220 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
1221 xen_ident_map_ISA();
1430 1222
1431 /* Allocate and initialize top and mid mfn levels for p2m structure */ 1223 /* Allocate and initialize top and mid mfn levels for p2m structure */
1432 xen_build_mfn_list_list(); 1224 xen_build_mfn_list_list();
@@ -1477,34 +1269,16 @@ asmlinkage void __init xen_start_kernel(void)
1477 const struct dom0_vga_console_info *info = 1269 const struct dom0_vga_console_info *info =
1478 (void *)((char *)xen_start_info + 1270 (void *)((char *)xen_start_info +
1479 xen_start_info->console.dom0.info_off); 1271 xen_start_info->console.dom0.info_off);
1480 struct xen_platform_op op = {
1481 .cmd = XENPF_firmware_info,
1482 .interface_version = XENPF_INTERFACE_VERSION,
1483 .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS,
1484 };
1485 1272
1486 xen_init_vga(info, xen_start_info->console.dom0.info_size); 1273 xen_init_vga(info, xen_start_info->console.dom0.info_size);
1487 xen_start_info->console.domU.mfn = 0; 1274 xen_start_info->console.domU.mfn = 0;
1488 xen_start_info->console.domU.evtchn = 0; 1275 xen_start_info->console.domU.evtchn = 0;
1489 1276
1490 if (HYPERVISOR_dom0_op(&op) == 0)
1491 boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;
1492
1493 xen_init_apic();
1494
1495 /* Make sure ACS will be enabled */ 1277 /* Make sure ACS will be enabled */
1496 pci_request_acs(); 1278 pci_request_acs();
1497
1498 xen_acpi_sleep_register();
1499
1500 /* Avoid searching for BIOS MP tables */
1501 x86_init.mpparse.find_smp_config = x86_init_noop;
1502 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
1503 } 1279 }
1504#ifdef CONFIG_PCI 1280
1505 /* PCI BIOS service won't work from a PV guest. */ 1281
1506 pci_probe &= ~PCI_PROBE_BIOS;
1507#endif
1508 xen_raw_console_write("about to get started...\n"); 1282 xen_raw_console_write("about to get started...\n");
1509 1283
1510 xen_setup_runstate_info(0); 1284 xen_setup_runstate_info(0);
@@ -1517,84 +1291,64 @@ asmlinkage void __init xen_start_kernel(void)
1517#endif 1291#endif
1518} 1292}
1519 1293
1520#ifdef CONFIG_XEN_PVHVM 1294static int init_hvm_pv_info(int *major, int *minor)
1521#define HVM_SHARED_INFO_ADDR 0xFE700000UL 1295{
1522static struct shared_info *xen_hvm_shared_info; 1296 uint32_t eax, ebx, ecx, edx, pages, msr, base;
1523static unsigned long xen_hvm_sip_phys; 1297 u64 pfn;
1524static int xen_major, xen_minor; 1298
1299 base = xen_cpuid_base();
1300 cpuid(base + 1, &eax, &ebx, &ecx, &edx);
1301
1302 *major = eax >> 16;
1303 *minor = eax & 0xffff;
1304 printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
1305
1306 cpuid(base + 2, &pages, &msr, &ecx, &edx);
1307
1308 pfn = __pa(hypercall_page);
1309 wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
1310
1311 xen_setup_features();
1312
1313 pv_info.name = "Xen HVM";
1314
1315 xen_domain_type = XEN_HVM_DOMAIN;
1316
1317 return 0;
1318}
1525 1319
1526static void xen_hvm_connect_shared_info(unsigned long pfn) 1320void __ref xen_hvm_init_shared_info(void)
1527{ 1321{
1322 int cpu;
1528 struct xen_add_to_physmap xatp; 1323 struct xen_add_to_physmap xatp;
1324 static struct shared_info *shared_info_page = 0;
1529 1325
1326 if (!shared_info_page)
1327 shared_info_page = (struct shared_info *)
1328 extend_brk(PAGE_SIZE, PAGE_SIZE);
1530 xatp.domid = DOMID_SELF; 1329 xatp.domid = DOMID_SELF;
1531 xatp.idx = 0; 1330 xatp.idx = 0;
1532 xatp.space = XENMAPSPACE_shared_info; 1331 xatp.space = XENMAPSPACE_shared_info;
1533 xatp.gpfn = pfn; 1332 xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
1534 if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) 1333 if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
1535 BUG(); 1334 BUG();
1536 1335
1537} 1336 HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
1538static void __init xen_hvm_set_shared_info(struct shared_info *sip)
1539{
1540 int cpu;
1541
1542 HYPERVISOR_shared_info = sip;
1543 1337
1544 /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info 1338 /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
1545 * page, we use it in the event channel upcall and in some pvclock 1339 * page, we use it in the event channel upcall and in some pvclock
1546 * related functions. We don't need the vcpu_info placement 1340 * related functions. We don't need the vcpu_info placement
1547 * optimizations because we don't use any pv_mmu or pv_irq op on 1341 * optimizations because we don't use any pv_mmu or pv_irq op on
1548 * HVM. */ 1342 * HVM.
1549 for_each_online_cpu(cpu) 1343 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
1344 * online but xen_hvm_init_shared_info is run at resume time too and
1345 * in that case multiple vcpus might be online. */
1346 for_each_online_cpu(cpu) {
1550 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; 1347 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
1551}
1552
1553/* Reconnect the shared_info pfn to a (new) mfn */
1554void xen_hvm_resume_shared_info(void)
1555{
1556 xen_hvm_connect_shared_info(xen_hvm_sip_phys >> PAGE_SHIFT);
1557}
1558
1559/* Xen tools prior to Xen 4 do not provide a E820_Reserved area for guest usage.
1560 * On these old tools the shared info page will be placed in E820_Ram.
1561 * Xen 4 provides a E820_Reserved area at 0xFC000000, and this code expects
1562 * that nothing is mapped up to HVM_SHARED_INFO_ADDR.
1563 * Xen 4.3+ provides an explicit 1MB area at HVM_SHARED_INFO_ADDR which is used
1564 * here for the shared info page. */
1565static void __init xen_hvm_init_shared_info(void)
1566{
1567 if (xen_major < 4) {
1568 xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
1569 xen_hvm_sip_phys = __pa(xen_hvm_shared_info);
1570 } else {
1571 xen_hvm_sip_phys = HVM_SHARED_INFO_ADDR;
1572 set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_hvm_sip_phys);
1573 xen_hvm_shared_info =
1574 (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
1575 } 1348 }
1576 xen_hvm_connect_shared_info(xen_hvm_sip_phys >> PAGE_SHIFT);
1577 xen_hvm_set_shared_info(xen_hvm_shared_info);
1578}
1579
1580static void __init init_hvm_pv_info(void)
1581{
1582 uint32_t ecx, edx, pages, msr, base;
1583 u64 pfn;
1584
1585 base = xen_cpuid_base();
1586 cpuid(base + 2, &pages, &msr, &ecx, &edx);
1587
1588 pfn = __pa(hypercall_page);
1589 wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
1590
1591 xen_setup_features();
1592
1593 pv_info.name = "Xen HVM";
1594
1595 xen_domain_type = XEN_HVM_DOMAIN;
1596} 1349}
1597 1350
1351#ifdef CONFIG_XEN_PVHVM
1598static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, 1352static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
1599 unsigned long action, void *hcpu) 1353 unsigned long action, void *hcpu)
1600{ 1354{
@@ -1617,7 +1371,12 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
1617 1371
1618static void __init xen_hvm_guest_init(void) 1372static void __init xen_hvm_guest_init(void)
1619{ 1373{
1620 init_hvm_pv_info(); 1374 int r;
1375 int major, minor;
1376
1377 r = init_hvm_pv_info(&major, &minor);
1378 if (r < 0)
1379 return;
1621 1380
1622 xen_hvm_init_shared_info(); 1381 xen_hvm_init_shared_info();
1623 1382
@@ -1633,22 +1392,12 @@ static void __init xen_hvm_guest_init(void)
1633 1392
1634static bool __init xen_hvm_platform(void) 1393static bool __init xen_hvm_platform(void)
1635{ 1394{
1636 uint32_t eax, ebx, ecx, edx, base;
1637
1638 if (xen_pv_domain()) 1395 if (xen_pv_domain())
1639 return false; 1396 return false;
1640 1397
1641 base = xen_cpuid_base(); 1398 if (!xen_cpuid_base())
1642 if (!base)
1643 return false; 1399 return false;
1644 1400
1645 cpuid(base + 1, &eax, &ebx, &ecx, &edx);
1646
1647 xen_major = eax >> 16;
1648 xen_minor = eax & 0xffff;
1649
1650 printk(KERN_INFO "Xen version %d.%d.\n", xen_major, xen_minor);
1651
1652 return true; 1401 return true;
1653} 1402}
1654 1403
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index 3a5f55d5190..49ba9b5224d 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -54,20 +54,6 @@ static int map_pte_fn(pte_t *pte, struct page *pmd_page,
54 return 0; 54 return 0;
55} 55}
56 56
57/*
58 * This function is used to map shared frames to store grant status. It is
59 * different from map_pte_fn above, the frames type here is uint64_t.
60 */
61static int map_pte_fn_status(pte_t *pte, struct page *pmd_page,
62 unsigned long addr, void *data)
63{
64 uint64_t **frames = (uint64_t **)data;
65
66 set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
67 (*frames)++;
68 return 0;
69}
70
71static int unmap_pte_fn(pte_t *pte, struct page *pmd_page, 57static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
72 unsigned long addr, void *data) 58 unsigned long addr, void *data)
73{ 59{
@@ -78,14 +64,14 @@ static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
78 64
79int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, 65int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
80 unsigned long max_nr_gframes, 66 unsigned long max_nr_gframes,
81 void **__shared) 67 struct grant_entry **__shared)
82{ 68{
83 int rc; 69 int rc;
84 void *shared = *__shared; 70 struct grant_entry *shared = *__shared;
85 71
86 if (shared == NULL) { 72 if (shared == NULL) {
87 struct vm_struct *area = 73 struct vm_struct *area =
88 alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL); 74 xen_alloc_vm_area(PAGE_SIZE * max_nr_gframes);
89 BUG_ON(area == NULL); 75 BUG_ON(area == NULL);
90 shared = area->addr; 76 shared = area->addr;
91 *__shared = shared; 77 *__shared = shared;
@@ -97,30 +83,8 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
97 return rc; 83 return rc;
98} 84}
99 85
100int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes, 86void arch_gnttab_unmap_shared(struct grant_entry *shared,
101 unsigned long max_nr_gframes, 87 unsigned long nr_gframes)
102 grant_status_t **__shared)
103{
104 int rc;
105 grant_status_t *shared = *__shared;
106
107 if (shared == NULL) {
108 /* No need to pass in PTE as we are going to do it
109 * in apply_to_page_range anyhow. */
110 struct vm_struct *area =
111 alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
112 BUG_ON(area == NULL);
113 shared = area->addr;
114 *__shared = shared;
115 }
116
117 rc = apply_to_page_range(&init_mm, (unsigned long)shared,
118 PAGE_SIZE * nr_gframes,
119 map_pte_fn_status, &frames);
120 return rc;
121}
122
123void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
124{ 88{
125 apply_to_page_range(&init_mm, (unsigned long)shared, 89 apply_to_page_range(&init_mm, (unsigned long)shared,
126 PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL); 90 PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 01a4dc015ae..8bbb465b6f0 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -5,7 +5,6 @@
5#include <xen/interface/xen.h> 5#include <xen/interface/xen.h>
6#include <xen/interface/sched.h> 6#include <xen/interface/sched.h>
7#include <xen/interface/vcpu.h> 7#include <xen/interface/vcpu.h>
8#include <xen/events.h>
9 8
10#include <asm/xen/hypercall.h> 9#include <asm/xen/hypercall.h>
11#include <asm/xen/hypervisor.h> 10#include <asm/xen/hypervisor.h>
@@ -27,7 +26,7 @@ static unsigned long xen_save_fl(void)
27 struct vcpu_info *vcpu; 26 struct vcpu_info *vcpu;
28 unsigned long flags; 27 unsigned long flags;
29 28
30 vcpu = this_cpu_read(xen_vcpu); 29 vcpu = percpu_read(xen_vcpu);
31 30
32 /* flag has opposite sense of mask */ 31 /* flag has opposite sense of mask */
33 flags = !vcpu->evtchn_upcall_mask; 32 flags = !vcpu->evtchn_upcall_mask;
@@ -51,7 +50,7 @@ static void xen_restore_fl(unsigned long flags)
51 make sure we're don't switch CPUs between getting the vcpu 50 make sure we're don't switch CPUs between getting the vcpu
52 pointer and updating the mask. */ 51 pointer and updating the mask. */
53 preempt_disable(); 52 preempt_disable();
54 vcpu = this_cpu_read(xen_vcpu); 53 vcpu = percpu_read(xen_vcpu);
55 vcpu->evtchn_upcall_mask = flags; 54 vcpu->evtchn_upcall_mask = flags;
56 preempt_enable_no_resched(); 55 preempt_enable_no_resched();
57 56
@@ -73,7 +72,7 @@ static void xen_irq_disable(void)
73 make sure we're don't switch CPUs between getting the vcpu 72 make sure we're don't switch CPUs between getting the vcpu
74 pointer and updating the mask. */ 73 pointer and updating the mask. */
75 preempt_disable(); 74 preempt_disable();
76 this_cpu_read(xen_vcpu)->evtchn_upcall_mask = 1; 75 percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
77 preempt_enable_no_resched(); 76 preempt_enable_no_resched();
78} 77}
79PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable); 78PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
@@ -87,7 +86,7 @@ static void xen_irq_enable(void)
87 the caller is confused and is trying to re-enable interrupts 86 the caller is confused and is trying to re-enable interrupts
88 on an indeterminate processor. */ 87 on an indeterminate processor. */
89 88
90 vcpu = this_cpu_read(xen_vcpu); 89 vcpu = percpu_read(xen_vcpu);
91 vcpu->evtchn_upcall_mask = 0; 90 vcpu->evtchn_upcall_mask = 0;
92 91
93 /* Doesn't matter if we get preempted here, because any 92 /* Doesn't matter if we get preempted here, because any
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 01de35c7722..3dd53f997b1 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -47,7 +47,6 @@
47#include <linux/gfp.h> 47#include <linux/gfp.h>
48#include <linux/memblock.h> 48#include <linux/memblock.h>
49#include <linux/seq_file.h> 49#include <linux/seq_file.h>
50#include <linux/crash_dump.h>
51 50
52#include <trace/events/xen.h> 51#include <trace/events/xen.h>
53 52
@@ -85,7 +84,6 @@
85 */ 84 */
86DEFINE_SPINLOCK(xen_reservation_lock); 85DEFINE_SPINLOCK(xen_reservation_lock);
87 86
88#ifdef CONFIG_X86_32
89/* 87/*
90 * Identity map, in addition to plain kernel map. This needs to be 88 * Identity map, in addition to plain kernel map. This needs to be
91 * large enough to allocate page table pages to allocate the rest. 89 * large enough to allocate page table pages to allocate the rest.
@@ -93,7 +91,7 @@ DEFINE_SPINLOCK(xen_reservation_lock);
93 */ 91 */
94#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) 92#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4)
95static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); 93static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
96#endif 94
97#ifdef CONFIG_X86_64 95#ifdef CONFIG_X86_64
98/* l3 pud for userspace vsyscall mapping */ 96/* l3 pud for userspace vsyscall mapping */
99static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; 97static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
@@ -310,20 +308,8 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
310 308
311static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) 309static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
312{ 310{
313 if (!xen_batched_set_pte(ptep, pteval)) { 311 if (!xen_batched_set_pte(ptep, pteval))
314 /* 312 native_set_pte(ptep, pteval);
315 * Could call native_set_pte() here and trap and
316 * emulate the PTE write but with 32-bit guests this
317 * needs two traps (one for each of the two 32-bit
318 * words in the PTE) so do one hypercall directly
319 * instead.
320 */
321 struct mmu_update u;
322
323 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
324 u.val = pte_val_ma(pteval);
325 HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
326 }
327} 313}
328 314
329static void xen_set_pte(pte_t *ptep, pte_t pteval) 315static void xen_set_pte(pte_t *ptep, pte_t pteval)
@@ -367,13 +353,8 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)
367{ 353{
368 if (val & _PAGE_PRESENT) { 354 if (val & _PAGE_PRESENT) {
369 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; 355 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
370 unsigned long pfn = mfn_to_pfn(mfn);
371
372 pteval_t flags = val & PTE_FLAGS_MASK; 356 pteval_t flags = val & PTE_FLAGS_MASK;
373 if (unlikely(pfn == ~0)) 357 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
374 val = flags & ~_PAGE_PRESENT;
375 else
376 val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
377 } 358 }
378 359
379 return val; 360 return val;
@@ -434,13 +415,13 @@ static pteval_t iomap_pte(pteval_t val)
434static pteval_t xen_pte_val(pte_t pte) 415static pteval_t xen_pte_val(pte_t pte)
435{ 416{
436 pteval_t pteval = pte.pte; 417 pteval_t pteval = pte.pte;
437#if 0 418
438 /* If this is a WC pte, convert back from Xen WC to Linux WC */ 419 /* If this is a WC pte, convert back from Xen WC to Linux WC */
439 if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) { 420 if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
440 WARN_ON(!pat_enabled); 421 WARN_ON(!pat_enabled);
441 pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT; 422 pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
442 } 423 }
443#endif 424
444 if (xen_initial_domain() && (pteval & _PAGE_IOMAP)) 425 if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
445 return pteval; 426 return pteval;
446 427
@@ -482,7 +463,7 @@ void xen_set_pat(u64 pat)
482static pte_t xen_make_pte(pteval_t pte) 463static pte_t xen_make_pte(pteval_t pte)
483{ 464{
484 phys_addr_t addr = (pte & PTE_PFN_MASK); 465 phys_addr_t addr = (pte & PTE_PFN_MASK);
485#if 0 466
486 /* If Linux is trying to set a WC pte, then map to the Xen WC. 467 /* If Linux is trying to set a WC pte, then map to the Xen WC.
487 * If _PAGE_PAT is set, then it probably means it is really 468 * If _PAGE_PAT is set, then it probably means it is really
488 * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope 469 * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
@@ -495,7 +476,7 @@ static pte_t xen_make_pte(pteval_t pte)
495 if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT) 476 if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
496 pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT; 477 pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
497 } 478 }
498#endif 479
499 /* 480 /*
500 * Unprivileged domains are allowed to do IOMAPpings for 481 * Unprivileged domains are allowed to do IOMAPpings for
501 * PCI passthrough, but not map ISA space. The ISA 482 * PCI passthrough, but not map ISA space. The ISA
@@ -514,6 +495,41 @@ static pte_t xen_make_pte(pteval_t pte)
514} 495}
515PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); 496PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
516 497
498#ifdef CONFIG_XEN_DEBUG
499pte_t xen_make_pte_debug(pteval_t pte)
500{
501 phys_addr_t addr = (pte & PTE_PFN_MASK);
502 phys_addr_t other_addr;
503 bool io_page = false;
504 pte_t _pte;
505
506 if (pte & _PAGE_IOMAP)
507 io_page = true;
508
509 _pte = xen_make_pte(pte);
510
511 if (!addr)
512 return _pte;
513
514 if (io_page &&
515 (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
516 other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
517 WARN_ONCE(addr != other_addr,
518 "0x%lx is using VM_IO, but it is 0x%lx!\n",
519 (unsigned long)addr, (unsigned long)other_addr);
520 } else {
521 pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
522 other_addr = (_pte.pte & PTE_PFN_MASK);
523 WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set),
524 "0x%lx is missing VM_IO (and wasn't fixed)!\n",
525 (unsigned long)addr);
526 }
527
528 return _pte;
529}
530PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
531#endif
532
517static pgd_t xen_make_pgd(pgdval_t pgd) 533static pgd_t xen_make_pgd(pgdval_t pgd)
518{ 534{
519 pgd = pte_pfn_to_mfn(pgd); 535 pgd = pte_pfn_to_mfn(pgd);
@@ -1090,14 +1106,14 @@ static void drop_other_mm_ref(void *info)
1090 struct mm_struct *mm = info; 1106 struct mm_struct *mm = info;
1091 struct mm_struct *active_mm; 1107 struct mm_struct *active_mm;
1092 1108
1093 active_mm = this_cpu_read(cpu_tlbstate.active_mm); 1109 active_mm = percpu_read(cpu_tlbstate.active_mm);
1094 1110
1095 if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) 1111 if (active_mm == mm && percpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
1096 leave_mm(smp_processor_id()); 1112 leave_mm(smp_processor_id());
1097 1113
1098 /* If this cpu still has a stale cr3 reference, then make sure 1114 /* If this cpu still has a stale cr3 reference, then make sure
1099 it has been flushed. */ 1115 it has been flushed. */
1100 if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd)) 1116 if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
1101 load_cr3(swapper_pg_dir); 1117 load_cr3(swapper_pg_dir);
1102} 1118}
1103 1119
@@ -1176,7 +1192,9 @@ static void xen_exit_mmap(struct mm_struct *mm)
1176 spin_unlock(&mm->page_table_lock); 1192 spin_unlock(&mm->page_table_lock);
1177} 1193}
1178 1194
1179static void xen_post_allocator_init(void); 1195static void __init xen_pagetable_setup_start(pgd_t *base)
1196{
1197}
1180 1198
1181static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) 1199static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
1182{ 1200{
@@ -1192,121 +1210,29 @@ static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
1192 } 1210 }
1193} 1211}
1194 1212
1195#ifdef CONFIG_X86_64 1213static void xen_post_allocator_init(void);
1196static void __init xen_cleanhighmap(unsigned long vaddr,
1197 unsigned long vaddr_end)
1198{
1199 unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
1200 pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr);
1201 1214
1202 /* NOTE: The loop is more greedy than the cleanup_highmap variant. 1215static void __init xen_pagetable_setup_done(pgd_t *base)
1203 * We include the PMD passed in on _both_ boundaries. */
1204 for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PAGE_SIZE));
1205 pmd++, vaddr += PMD_SIZE) {
1206 if (pmd_none(*pmd))
1207 continue;
1208 if (vaddr < (unsigned long) _text || vaddr > kernel_end)
1209 set_pmd(pmd, __pmd(0));
1210 }
1211 /* In case we did something silly, we should crash in this function
1212 * instead of somewhere later and be confusing. */
1213 xen_mc_flush();
1214}
1215#endif
1216static void __init xen_pagetable_init(void)
1217{ 1216{
1218#ifdef CONFIG_X86_64
1219 unsigned long size;
1220 unsigned long addr;
1221#endif
1222 paging_init();
1223 xen_setup_shared_info(); 1217 xen_setup_shared_info();
1224#ifdef CONFIG_X86_64
1225 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1226 unsigned long new_mfn_list;
1227
1228 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
1229
1230 /* On 32-bit, we get zero so this never gets executed. */
1231 new_mfn_list = xen_revector_p2m_tree();
1232 if (new_mfn_list && new_mfn_list != xen_start_info->mfn_list) {
1233 /* using __ka address and sticking INVALID_P2M_ENTRY! */
1234 memset((void *)xen_start_info->mfn_list, 0xff, size);
1235
1236 /* We should be in __ka space. */
1237 BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
1238 addr = xen_start_info->mfn_list;
1239 /* We roundup to the PMD, which means that if anybody at this stage is
1240 * using the __ka address of xen_start_info or xen_start_info->shared_info
1241 * they are in going to crash. Fortunatly we have already revectored
1242 * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
1243 size = roundup(size, PMD_SIZE);
1244 xen_cleanhighmap(addr, addr + size);
1245
1246 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
1247 memblock_free(__pa(xen_start_info->mfn_list), size);
1248 /* And revector! Bye bye old array */
1249 xen_start_info->mfn_list = new_mfn_list;
1250 } else
1251 goto skip;
1252 }
1253 /* At this stage, cleanup_highmap has already cleaned __ka space
1254 * from _brk_limit way up to the max_pfn_mapped (which is the end of
1255 * the ramdisk). We continue on, erasing PMD entries that point to page
1256 * tables - do note that they are accessible at this stage via __va.
1257 * For good measure we also round up to the PMD - which means that if
1258 * anybody is using __ka address to the initial boot-stack - and try
1259 * to use it - they are going to crash. The xen_start_info has been
1260 * taken care of already in xen_setup_kernel_pagetable. */
1261 addr = xen_start_info->pt_base;
1262 size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE);
1263
1264 xen_cleanhighmap(addr, addr + size);
1265 xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
1266#ifdef DEBUG
1267 /* This is superflous and is not neccessary, but you know what
1268 * lets do it. The MODULES_VADDR -> MODULES_END should be clear of
1269 * anything at this stage. */
1270 xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
1271#endif
1272skip:
1273#endif
1274 xen_post_allocator_init(); 1218 xen_post_allocator_init();
1275} 1219}
1220
1276static void xen_write_cr2(unsigned long cr2) 1221static void xen_write_cr2(unsigned long cr2)
1277{ 1222{
1278 this_cpu_read(xen_vcpu)->arch.cr2 = cr2; 1223 percpu_read(xen_vcpu)->arch.cr2 = cr2;
1279} 1224}
1280 1225
1281static unsigned long xen_read_cr2(void) 1226static unsigned long xen_read_cr2(void)
1282{ 1227{
1283 return this_cpu_read(xen_vcpu)->arch.cr2; 1228 return percpu_read(xen_vcpu)->arch.cr2;
1284} 1229}
1285 1230
1286unsigned long xen_read_cr2_direct(void) 1231unsigned long xen_read_cr2_direct(void)
1287{ 1232{
1288 return this_cpu_read(xen_vcpu_info.arch.cr2); 1233 return percpu_read(xen_vcpu_info.arch.cr2);
1289} 1234}
1290 1235
1291void xen_flush_tlb_all(void)
1292{
1293 struct mmuext_op *op;
1294 struct multicall_space mcs;
1295
1296 trace_xen_mmu_flush_tlb_all(0);
1297
1298 preempt_disable();
1299
1300 mcs = xen_mc_entry(sizeof(*op));
1301
1302 op = mcs.args;
1303 op->cmd = MMUEXT_TLB_FLUSH_ALL;
1304 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1305
1306 xen_mc_issue(PARAVIRT_LAZY_MMU);
1307
1308 preempt_enable();
1309}
1310static void xen_flush_tlb(void) 1236static void xen_flush_tlb(void)
1311{ 1237{
1312 struct mmuext_op *op; 1238 struct mmuext_op *op;
@@ -1348,8 +1274,7 @@ static void xen_flush_tlb_single(unsigned long addr)
1348} 1274}
1349 1275
1350static void xen_flush_tlb_others(const struct cpumask *cpus, 1276static void xen_flush_tlb_others(const struct cpumask *cpus,
1351 struct mm_struct *mm, unsigned long start, 1277 struct mm_struct *mm, unsigned long va)
1352 unsigned long end)
1353{ 1278{
1354 struct { 1279 struct {
1355 struct mmuext_op op; 1280 struct mmuext_op op;
@@ -1361,7 +1286,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
1361 } *args; 1286 } *args;
1362 struct multicall_space mcs; 1287 struct multicall_space mcs;
1363 1288
1364 trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); 1289 trace_xen_mmu_flush_tlb_others(cpus, mm, va);
1365 1290
1366 if (cpumask_empty(cpus)) 1291 if (cpumask_empty(cpus))
1367 return; /* nothing to do */ 1292 return; /* nothing to do */
@@ -1374,10 +1299,11 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
1374 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); 1299 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1375 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); 1300 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1376 1301
1377 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; 1302 if (va == TLB_FLUSH_ALL) {
1378 if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { 1303 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1304 } else {
1379 args->op.cmd = MMUEXT_INVLPG_MULTI; 1305 args->op.cmd = MMUEXT_INVLPG_MULTI;
1380 args->op.arg1.linear_addr = start; 1306 args->op.arg1.linear_addr = va;
1381 } 1307 }
1382 1308
1383 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); 1309 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
@@ -1387,12 +1313,12 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
1387 1313
1388static unsigned long xen_read_cr3(void) 1314static unsigned long xen_read_cr3(void)
1389{ 1315{
1390 return this_cpu_read(xen_cr3); 1316 return percpu_read(xen_cr3);
1391} 1317}
1392 1318
1393static void set_current_cr3(void *v) 1319static void set_current_cr3(void *v)
1394{ 1320{
1395 this_cpu_write(xen_current_cr3, (unsigned long)v); 1321 percpu_write(xen_current_cr3, (unsigned long)v);
1396} 1322}
1397 1323
1398static void __xen_write_cr3(bool kernel, unsigned long cr3) 1324static void __xen_write_cr3(bool kernel, unsigned long cr3)
@@ -1415,7 +1341,7 @@ static void __xen_write_cr3(bool kernel, unsigned long cr3)
1415 xen_extend_mmuext_op(&op); 1341 xen_extend_mmuext_op(&op);
1416 1342
1417 if (kernel) { 1343 if (kernel) {
1418 this_cpu_write(xen_cr3, cr3); 1344 percpu_write(xen_cr3, cr3);
1419 1345
1420 /* Update xen_current_cr3 once the batch has actually 1346 /* Update xen_current_cr3 once the batch has actually
1421 been submitted. */ 1347 been submitted. */
@@ -1431,7 +1357,7 @@ static void xen_write_cr3(unsigned long cr3)
1431 1357
1432 /* Update while interrupts are disabled, so its atomic with 1358 /* Update while interrupts are disabled, so its atomic with
1433 respect to ipis */ 1359 respect to ipis */
1434 this_cpu_write(xen_cr3, cr3); 1360 percpu_write(xen_cr3, cr3);
1435 1361
1436 __xen_write_cr3(true, cr3); 1362 __xen_write_cr3(true, cr3);
1437 1363
@@ -1520,28 +1446,13 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1520} 1446}
1521#endif /* CONFIG_X86_64 */ 1447#endif /* CONFIG_X86_64 */
1522 1448
1523/* 1449/* Init-time set_pte while constructing initial pagetables, which
1524 * Init-time set_pte while constructing initial pagetables, which 1450 doesn't allow RO pagetable pages to be remapped RW */
1525 * doesn't allow RO page table pages to be remapped RW.
1526 *
1527 * If there is no MFN for this PFN then this page is initially
1528 * ballooned out so clear the PTE (as in decrease_reservation() in
1529 * drivers/xen/balloon.c).
1530 *
1531 * Many of these PTE updates are done on unpinned and writable pages
1532 * and doing a hypercall for these is unnecessary and expensive. At
1533 * this point it is not possible to tell if a page is pinned or not,
1534 * so always write the PTE directly and rely on Xen trapping and
1535 * emulating any updates as necessary.
1536 */
1537static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) 1451static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
1538{ 1452{
1539 if (pte_mfn(pte) != INVALID_P2M_ENTRY) 1453 pte = mask_rw_pte(ptep, pte);
1540 pte = mask_rw_pte(ptep, pte);
1541 else
1542 pte = __pte_ma(0);
1543 1454
1544 native_set_pte(ptep, pte); 1455 xen_set_pte(ptep, pte);
1545} 1456}
1546 1457
1547static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) 1458static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
@@ -1747,7 +1658,7 @@ static void set_page_prot(void *addr, pgprot_t prot)
1747 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) 1658 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1748 BUG(); 1659 BUG();
1749} 1660}
1750#ifdef CONFIG_X86_32 1661
1751static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) 1662static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1752{ 1663{
1753 unsigned pmdidx, pteidx; 1664 unsigned pmdidx, pteidx;
@@ -1798,7 +1709,7 @@ static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1798 1709
1799 set_page_prot(pmd, PAGE_KERNEL_RO); 1710 set_page_prot(pmd, PAGE_KERNEL_RO);
1800} 1711}
1801#endif 1712
1802void __init xen_setup_machphys_mapping(void) 1713void __init xen_setup_machphys_mapping(void)
1803{ 1714{
1804 struct xen_machphys_mapping mapping; 1715 struct xen_machphys_mapping mapping;
@@ -1826,20 +1737,7 @@ static void convert_pfn_mfn(void *v)
1826 for (i = 0; i < PTRS_PER_PTE; i++) 1737 for (i = 0; i < PTRS_PER_PTE; i++)
1827 pte[i] = xen_make_pte(pte[i].pte); 1738 pte[i] = xen_make_pte(pte[i].pte);
1828} 1739}
1829static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, 1740
1830 unsigned long addr)
1831{
1832 if (*pt_base == PFN_DOWN(__pa(addr))) {
1833 set_page_prot((void *)addr, PAGE_KERNEL);
1834 clear_page((void *)addr);
1835 (*pt_base)++;
1836 }
1837 if (*pt_end == PFN_DOWN(__pa(addr))) {
1838 set_page_prot((void *)addr, PAGE_KERNEL);
1839 clear_page((void *)addr);
1840 (*pt_end)--;
1841 }
1842}
1843/* 1741/*
1844 * Set up the initial kernel pagetable. 1742 * Set up the initial kernel pagetable.
1845 * 1743 *
@@ -1851,13 +1749,11 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
1851 * of the physical mapping once some sort of allocator has been set 1749 * of the physical mapping once some sort of allocator has been set
1852 * up. 1750 * up.
1853 */ 1751 */
1854void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) 1752pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
1753 unsigned long max_pfn)
1855{ 1754{
1856 pud_t *l3; 1755 pud_t *l3;
1857 pmd_t *l2; 1756 pmd_t *l2;
1858 unsigned long addr[3];
1859 unsigned long pt_base, pt_end;
1860 unsigned i;
1861 1757
1862 /* max_pfn_mapped is the last pfn mapped in the initial memory 1758 /* max_pfn_mapped is the last pfn mapped in the initial memory
1863 * mappings. Considering that on Xen after the kernel mappings we 1759 * mappings. Considering that on Xen after the kernel mappings we
@@ -1865,53 +1761,32 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1865 * set max_pfn_mapped to the last real pfn mapped. */ 1761 * set max_pfn_mapped to the last real pfn mapped. */
1866 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); 1762 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1867 1763
1868 pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
1869 pt_end = pt_base + xen_start_info->nr_pt_frames;
1870
1871 /* Zap identity mapping */ 1764 /* Zap identity mapping */
1872 init_level4_pgt[0] = __pgd(0); 1765 init_level4_pgt[0] = __pgd(0);
1873 1766
1874 /* Pre-constructed entries are in pfn, so convert to mfn */ 1767 /* Pre-constructed entries are in pfn, so convert to mfn */
1875 /* L4[272] -> level3_ident_pgt
1876 * L4[511] -> level3_kernel_pgt */
1877 convert_pfn_mfn(init_level4_pgt); 1768 convert_pfn_mfn(init_level4_pgt);
1878
1879 /* L3_i[0] -> level2_ident_pgt */
1880 convert_pfn_mfn(level3_ident_pgt); 1769 convert_pfn_mfn(level3_ident_pgt);
1881 /* L3_k[510] -> level2_kernel_pgt
1882 * L3_i[511] -> level2_fixmap_pgt */
1883 convert_pfn_mfn(level3_kernel_pgt); 1770 convert_pfn_mfn(level3_kernel_pgt);
1884 1771
1885 /* We get [511][511] and have Xen's version of level2_kernel_pgt */
1886 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); 1772 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1887 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); 1773 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1888 1774
1889 addr[0] = (unsigned long)pgd; 1775 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1890 addr[1] = (unsigned long)l3; 1776 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1891 addr[2] = (unsigned long)l2; 1777
1892 /* Graft it onto L4[272][0]. Note that we creating an aliasing problem:
1893 * Both L4[272][0] and L4[511][511] have entries that point to the same
1894 * L2 (PMD) tables. Meaning that if you modify it in __va space
1895 * it will be also modified in the __ka space! (But if you just
1896 * modify the PMD table to point to other PTE's or none, then you
1897 * are OK - which is what cleanup_highmap does) */
1898 copy_page(level2_ident_pgt, l2);
1899 /* Graft it onto L4[511][511] */
1900 copy_page(level2_kernel_pgt, l2);
1901
1902 /* Get [511][510] and graft that in level2_fixmap_pgt */
1903 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); 1778 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1904 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); 1779 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1905 copy_page(level2_fixmap_pgt, l2); 1780 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1906 /* Note that we don't do anything with level1_fixmap_pgt which 1781
1907 * we don't need. */ 1782 /* Set up identity map */
1783 xen_map_identity_early(level2_ident_pgt, max_pfn);
1908 1784
1909 /* Make pagetable pieces RO */ 1785 /* Make pagetable pieces RO */
1910 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); 1786 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1911 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); 1787 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1912 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); 1788 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1913 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); 1789 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1914 set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
1915 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); 1790 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1916 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); 1791 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1917 1792
@@ -1922,28 +1797,24 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1922 /* Unpin Xen-provided one */ 1797 /* Unpin Xen-provided one */
1923 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 1798 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1924 1799
1800 /* Switch over */
1801 pgd = init_level4_pgt;
1802
1925 /* 1803 /*
1926 * At this stage there can be no user pgd, and no page 1804 * At this stage there can be no user pgd, and no page
1927 * structure to attach it to, so make sure we just set kernel 1805 * structure to attach it to, so make sure we just set kernel
1928 * pgd. 1806 * pgd.
1929 */ 1807 */
1930 xen_mc_batch(); 1808 xen_mc_batch();
1931 __xen_write_cr3(true, __pa(init_level4_pgt)); 1809 __xen_write_cr3(true, __pa(pgd));
1932 xen_mc_issue(PARAVIRT_LAZY_CPU); 1810 xen_mc_issue(PARAVIRT_LAZY_CPU);
1933 1811
1934 /* We can't that easily rip out L3 and L2, as the Xen pagetables are 1812 memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
1935 * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for 1813 __pa(xen_start_info->pt_base +
1936 * the initial domain. For guests using the toolstack, they are in: 1814 xen_start_info->nr_pt_frames * PAGE_SIZE),
1937 * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only 1815 "XEN PAGETABLES");
1938 * rip out the [L4] (pgd), but for guests we shave off three pages.
1939 */
1940 for (i = 0; i < ARRAY_SIZE(addr); i++)
1941 check_pt_base(&pt_base, &pt_end, addr[i]);
1942 1816
1943 /* Our (by three pages) smaller Xen pagetable that we are using */ 1817 return pgd;
1944 memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE);
1945 /* Revector the xen_start_info */
1946 xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
1947} 1818}
1948#else /* !CONFIG_X86_64 */ 1819#else /* !CONFIG_X86_64 */
1949static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); 1820static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
@@ -1968,7 +1839,8 @@ static void __init xen_write_cr3_init(unsigned long cr3)
1968 */ 1839 */
1969 swapper_kernel_pmd = 1840 swapper_kernel_pmd =
1970 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); 1841 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1971 copy_page(swapper_kernel_pmd, initial_kernel_pmd); 1842 memcpy(swapper_kernel_pmd, initial_kernel_pmd,
1843 sizeof(pmd_t) * PTRS_PER_PMD);
1972 swapper_pg_dir[KERNEL_PGD_BOUNDARY] = 1844 swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
1973 __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); 1845 __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
1974 set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); 1846 set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
@@ -1985,7 +1857,8 @@ static void __init xen_write_cr3_init(unsigned long cr3)
1985 pv_mmu_ops.write_cr3 = &xen_write_cr3; 1857 pv_mmu_ops.write_cr3 = &xen_write_cr3;
1986} 1858}
1987 1859
1988void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) 1860pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
1861 unsigned long max_pfn)
1989{ 1862{
1990 pmd_t *kernel_pmd; 1863 pmd_t *kernel_pmd;
1991 1864
@@ -1997,11 +1870,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1997 512*1024); 1870 512*1024);
1998 1871
1999 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); 1872 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
2000 copy_page(initial_kernel_pmd, kernel_pmd); 1873 memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
2001 1874
2002 xen_map_identity_early(initial_kernel_pmd, max_pfn); 1875 xen_map_identity_early(initial_kernel_pmd, max_pfn);
2003 1876
2004 copy_page(initial_page_table, pgd); 1877 memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
2005 initial_page_table[KERNEL_PGD_BOUNDARY] = 1878 initial_page_table[KERNEL_PGD_BOUNDARY] =
2006 __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); 1879 __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
2007 1880
@@ -2015,8 +1888,12 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
2015 PFN_DOWN(__pa(initial_page_table))); 1888 PFN_DOWN(__pa(initial_page_table)));
2016 xen_write_cr3(__pa(initial_page_table)); 1889 xen_write_cr3(__pa(initial_page_table));
2017 1890
2018 memblock_reserve(__pa(xen_start_info->pt_base), 1891 memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
2019 xen_start_info->nr_pt_frames * PAGE_SIZE); 1892 __pa(xen_start_info->pt_base +
1893 xen_start_info->nr_pt_frames * PAGE_SIZE),
1894 "XEN PAGETABLES");
1895
1896 return initial_page_table;
2020} 1897}
2021#endif /* CONFIG_X86_64 */ 1898#endif /* CONFIG_X86_64 */
2022 1899
@@ -2090,8 +1967,34 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
2090#endif 1967#endif
2091} 1968}
2092 1969
1970void __init xen_ident_map_ISA(void)
1971{
1972 unsigned long pa;
1973
1974 /*
1975 * If we're dom0, then linear map the ISA machine addresses into
1976 * the kernel's address space.
1977 */
1978 if (!xen_initial_domain())
1979 return;
1980
1981 xen_raw_printk("Xen: setup ISA identity maps\n");
1982
1983 for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
1984 pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
1985
1986 if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0))
1987 BUG();
1988 }
1989
1990 xen_flush_tlb();
1991}
1992
2093static void __init xen_post_allocator_init(void) 1993static void __init xen_post_allocator_init(void)
2094{ 1994{
1995#ifdef CONFIG_XEN_DEBUG
1996 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
1997#endif
2095 pv_mmu_ops.set_pte = xen_set_pte; 1998 pv_mmu_ops.set_pte = xen_set_pte;
2096 pv_mmu_ops.set_pmd = xen_set_pmd; 1999 pv_mmu_ops.set_pmd = xen_set_pmd;
2097 pv_mmu_ops.set_pud = xen_set_pud; 2000 pv_mmu_ops.set_pud = xen_set_pud;
@@ -2198,7 +2101,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
2198void __init xen_init_mmu_ops(void) 2101void __init xen_init_mmu_ops(void)
2199{ 2102{
2200 x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; 2103 x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
2201 x86_init.paging.pagetable_init = xen_pagetable_init; 2104 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
2105 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2202 pv_mmu_ops = xen_mmu_ops; 2106 pv_mmu_ops = xen_mmu_ops;
2203 2107
2204 memset(dummy_mapping, 0xff, PAGE_SIZE); 2108 memset(dummy_mapping, 0xff, PAGE_SIZE);
@@ -2401,43 +2305,6 @@ void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
2401EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); 2305EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
2402 2306
2403#ifdef CONFIG_XEN_PVHVM 2307#ifdef CONFIG_XEN_PVHVM
2404#ifdef CONFIG_PROC_VMCORE
2405/*
2406 * This function is used in two contexts:
2407 * - the kdump kernel has to check whether a pfn of the crashed kernel
2408 * was a ballooned page. vmcore is using this function to decide
2409 * whether to access a pfn of the crashed kernel.
2410 * - the kexec kernel has to check whether a pfn was ballooned by the
2411 * previous kernel. If the pfn is ballooned, handle it properly.
2412 * Returns 0 if the pfn is not backed by a RAM page, the caller may
2413 * handle the pfn special in this case.
2414 */
2415static int xen_oldmem_pfn_is_ram(unsigned long pfn)
2416{
2417 struct xen_hvm_get_mem_type a = {
2418 .domid = DOMID_SELF,
2419 .pfn = pfn,
2420 };
2421 int ram;
2422
2423 if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a))
2424 return -ENXIO;
2425
2426 switch (a.mem_type) {
2427 case HVMMEM_mmio_dm:
2428 ram = 0;
2429 break;
2430 case HVMMEM_ram_rw:
2431 case HVMMEM_ram_ro:
2432 default:
2433 ram = 1;
2434 break;
2435 }
2436
2437 return ram;
2438}
2439#endif
2440
2441static void xen_hvm_exit_mmap(struct mm_struct *mm) 2308static void xen_hvm_exit_mmap(struct mm_struct *mm)
2442{ 2309{
2443 struct xen_hvm_pagetable_dying a; 2310 struct xen_hvm_pagetable_dying a;
@@ -2468,9 +2335,6 @@ void __init xen_hvm_init_mmu_ops(void)
2468{ 2335{
2469 if (is_pagetable_dying_supported()) 2336 if (is_pagetable_dying_supported())
2470 pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; 2337 pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2471#ifdef CONFIG_PROC_VMCORE
2472 register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram);
2473#endif
2474} 2338}
2475#endif 2339#endif
2476 2340
@@ -2497,10 +2361,8 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2497 2361
2498int xen_remap_domain_mfn_range(struct vm_area_struct *vma, 2362int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2499 unsigned long addr, 2363 unsigned long addr,
2500 xen_pfn_t mfn, int nr, 2364 unsigned long mfn, int nr,
2501 pgprot_t prot, unsigned domid, 2365 pgprot_t prot, unsigned domid)
2502 struct page **pages)
2503
2504{ 2366{
2505 struct remap_data rmd; 2367 struct remap_data rmd;
2506 struct mmu_update mmu_update[REMAP_BATCH_SIZE]; 2368 struct mmu_update mmu_update[REMAP_BATCH_SIZE];
@@ -2508,12 +2370,10 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2508 unsigned long range; 2370 unsigned long range;
2509 int err = 0; 2371 int err = 0;
2510 2372
2511 if (xen_feature(XENFEAT_auto_translated_physmap))
2512 return -EINVAL;
2513
2514 prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); 2373 prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
2515 2374
2516 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); 2375 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) ==
2376 (VM_PFNMAP | VM_RESERVED | VM_IO)));
2517 2377
2518 rmd.mfn = mfn; 2378 rmd.mfn = mfn;
2519 rmd.prot = prot; 2379 rmd.prot = prot;
@@ -2528,8 +2388,8 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2528 if (err) 2388 if (err)
2529 goto out; 2389 goto out;
2530 2390
2531 err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid); 2391 err = -EFAULT;
2532 if (err < 0) 2392 if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
2533 goto out; 2393 goto out;
2534 2394
2535 nr -= batch; 2395 nr -= batch;
@@ -2539,19 +2399,22 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2539 err = 0; 2399 err = 0;
2540out: 2400out:
2541 2401
2542 xen_flush_tlb_all(); 2402 flush_tlb_all();
2543 2403
2544 return err; 2404 return err;
2545} 2405}
2546EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); 2406EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
2547 2407
2548/* Returns: 0 success */ 2408#ifdef CONFIG_XEN_DEBUG_FS
2549int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, 2409static int p2m_dump_open(struct inode *inode, struct file *filp)
2550 int numpgs, struct page **pages)
2551{ 2410{
2552 if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) 2411 return single_open(filp, p2m_dump_show, NULL);
2553 return 0;
2554
2555 return -EINVAL;
2556} 2412}
2557EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range); 2413
2414static const struct file_operations p2m_dump_fops = {
2415 .open = p2m_dump_open,
2416 .read = seq_read,
2417 .llseek = seq_lseek,
2418 .release = single_release,
2419};
2420#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 9c2e74f9096..dee79b78a90 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -47,7 +47,7 @@ static inline void xen_mc_issue(unsigned mode)
47 xen_mc_flush(); 47 xen_mc_flush();
48 48
49 /* restore flags saved in xen_mc_batch */ 49 /* restore flags saved in xen_mc_batch */
50 local_irq_restore(this_cpu_read(xen_mc_irq_flags)); 50 local_irq_restore(percpu_read(xen_mc_irq_flags));
51} 51}
52 52
53/* Set up a callback to be called when the current batch is flushed */ 53/* Set up a callback to be called when the current batch is flushed */
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 95fb2aa5927..58efeb9d544 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -22,7 +22,7 @@
22 * 22 *
23 * P2M_PER_PAGE depends on the architecture, as a mfn is always 23 * P2M_PER_PAGE depends on the architecture, as a mfn is always
24 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to 24 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
25 * 512 and 1024 entries respectively. 25 * 512 and 1024 entries respectively.
26 * 26 *
27 * In short, these structures contain the Machine Frame Number (MFN) of the PFN. 27 * In short, these structures contain the Machine Frame Number (MFN) of the PFN.
28 * 28 *
@@ -139,11 +139,11 @@
139 * / | ~0, ~0, .... | 139 * / | ~0, ~0, .... |
140 * | \---------------/ 140 * | \---------------/
141 * | 141 * |
142 * p2m_mid_missing p2m_missing 142 * p2m_missing p2m_missing
143 * /-----------------\ /------------\ 143 * /------------------\ /------------\
144 * | [p2m_missing] +---->| ~0, ~0, ~0 | 144 * | [p2m_mid_missing]+---->| ~0, ~0, ~0 |
145 * | [p2m_missing] +---->| ..., ~0 | 145 * | [p2m_mid_missing]+---->| ..., ~0 |
146 * \-----------------/ \------------/ 146 * \------------------/ \------------/
147 * 147 *
148 * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT) 148 * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT)
149 */ 149 */
@@ -161,9 +161,7 @@
161#include <asm/xen/page.h> 161#include <asm/xen/page.h>
162#include <asm/xen/hypercall.h> 162#include <asm/xen/hypercall.h>
163#include <asm/xen/hypervisor.h> 163#include <asm/xen/hypervisor.h>
164#include <xen/grant_table.h>
165 164
166#include "multicalls.h"
167#include "xen-ops.h" 165#include "xen-ops.h"
168 166
169static void __init m2p_override_init(void); 167static void __init m2p_override_init(void);
@@ -194,13 +192,6 @@ RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID
194 * boundary violation will require three middle nodes. */ 192 * boundary violation will require three middle nodes. */
195RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3); 193RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3);
196 194
197/* When we populate back during bootup, the amount of pages can vary. The
198 * max we have is seen is 395979, but that does not mean it can't be more.
199 * Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle
200 * it can re-use Xen provided mfn_list array, so we only need to allocate at
201 * most three P2M top nodes. */
202RESERVE_BRK(p2m_populated, PAGE_SIZE * 3);
203
204static inline unsigned p2m_top_index(unsigned long pfn) 195static inline unsigned p2m_top_index(unsigned long pfn)
205{ 196{
206 BUG_ON(pfn >= MAX_P2M_PFN); 197 BUG_ON(pfn >= MAX_P2M_PFN);
@@ -396,85 +387,7 @@ void __init xen_build_dynamic_phys_to_machine(void)
396 387
397 m2p_override_init(); 388 m2p_override_init();
398} 389}
399#ifdef CONFIG_X86_64
400#include <linux/bootmem.h>
401unsigned long __init xen_revector_p2m_tree(void)
402{
403 unsigned long va_start;
404 unsigned long va_end;
405 unsigned long pfn;
406 unsigned long pfn_free = 0;
407 unsigned long *mfn_list = NULL;
408 unsigned long size;
409
410 va_start = xen_start_info->mfn_list;
411 /*We copy in increments of P2M_PER_PAGE * sizeof(unsigned long),
412 * so make sure it is rounded up to that */
413 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
414 va_end = va_start + size;
415
416 /* If we were revectored already, don't do it again. */
417 if (va_start <= __START_KERNEL_map && va_start >= __PAGE_OFFSET)
418 return 0;
419
420 mfn_list = alloc_bootmem_align(size, PAGE_SIZE);
421 if (!mfn_list) {
422 pr_warn("Could not allocate space for a new P2M tree!\n");
423 return xen_start_info->mfn_list;
424 }
425 /* Fill it out with INVALID_P2M_ENTRY value */
426 memset(mfn_list, 0xFF, size);
427
428 for (pfn = 0; pfn < ALIGN(MAX_DOMAIN_PAGES, P2M_PER_PAGE); pfn += P2M_PER_PAGE) {
429 unsigned topidx = p2m_top_index(pfn);
430 unsigned mididx;
431 unsigned long *mid_p;
432
433 if (!p2m_top[topidx])
434 continue;
435
436 if (p2m_top[topidx] == p2m_mid_missing)
437 continue;
438
439 mididx = p2m_mid_index(pfn);
440 mid_p = p2m_top[topidx][mididx];
441 if (!mid_p)
442 continue;
443 if ((mid_p == p2m_missing) || (mid_p == p2m_identity))
444 continue;
445
446 if ((unsigned long)mid_p == INVALID_P2M_ENTRY)
447 continue;
448
449 /* The old va. Rebase it on mfn_list */
450 if (mid_p >= (unsigned long *)va_start && mid_p <= (unsigned long *)va_end) {
451 unsigned long *new;
452 390
453 if (pfn_free > (size / sizeof(unsigned long))) {
454 WARN(1, "Only allocated for %ld pages, but we want %ld!\n",
455 size / sizeof(unsigned long), pfn_free);
456 return 0;
457 }
458 new = &mfn_list[pfn_free];
459
460 copy_page(new, mid_p);
461 p2m_top[topidx][mididx] = &mfn_list[pfn_free];
462 p2m_top_mfn_p[topidx][mididx] = virt_to_mfn(&mfn_list[pfn_free]);
463
464 pfn_free += P2M_PER_PAGE;
465
466 }
467 /* This should be the leafs allocated for identity from _brk. */
468 }
469 return (unsigned long)mfn_list;
470
471}
472#else
473unsigned long __init xen_revector_p2m_tree(void)
474{
475 return 0;
476}
477#endif
478unsigned long get_phys_to_machine(unsigned long pfn) 391unsigned long get_phys_to_machine(unsigned long pfn)
479{ 392{
480 unsigned topidx, mididx, idx; 393 unsigned topidx, mididx, idx;
@@ -508,7 +421,7 @@ static void free_p2m_page(void *p)
508 free_page((unsigned long)p); 421 free_page((unsigned long)p);
509} 422}
510 423
511/* 424/*
512 * Fully allocate the p2m structure for a given pfn. We need to check 425 * Fully allocate the p2m structure for a given pfn. We need to check
513 * that both the top and mid levels are allocated, and make sure the 426 * that both the top and mid levels are allocated, and make sure the
514 * parallel mfn tree is kept in sync. We may race with other cpus, so 427 * parallel mfn tree is kept in sync. We may race with other cpus, so
@@ -584,18 +497,16 @@ static bool alloc_p2m(unsigned long pfn)
584 return true; 497 return true;
585} 498}
586 499
587static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary) 500static bool __init __early_alloc_p2m(unsigned long pfn)
588{ 501{
589 unsigned topidx, mididx, idx; 502 unsigned topidx, mididx, idx;
590 unsigned long *p2m;
591 unsigned long *mid_mfn_p;
592 503
593 topidx = p2m_top_index(pfn); 504 topidx = p2m_top_index(pfn);
594 mididx = p2m_mid_index(pfn); 505 mididx = p2m_mid_index(pfn);
595 idx = p2m_index(pfn); 506 idx = p2m_index(pfn);
596 507
597 /* Pfff.. No boundary cross-over, lets get out. */ 508 /* Pfff.. No boundary cross-over, lets get out. */
598 if (!idx && check_boundary) 509 if (!idx)
599 return false; 510 return false;
600 511
601 WARN(p2m_top[topidx][mididx] == p2m_identity, 512 WARN(p2m_top[topidx][mididx] == p2m_identity,
@@ -609,153 +520,24 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary
609 return false; 520 return false;
610 521
611 /* Boundary cross-over for the edges: */ 522 /* Boundary cross-over for the edges: */
612 p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); 523 if (idx) {
613 524 unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
614 p2m_init(p2m); 525 unsigned long *mid_mfn_p;
615
616 p2m_top[topidx][mididx] = p2m;
617
618 /* For save/restore we need to MFN of the P2M saved */
619
620 mid_mfn_p = p2m_top_mfn_p[topidx];
621 WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing),
622 "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n",
623 topidx, mididx);
624 mid_mfn_p[mididx] = virt_to_mfn(p2m);
625
626 return true;
627}
628
629static bool __init early_alloc_p2m(unsigned long pfn)
630{
631 unsigned topidx = p2m_top_index(pfn);
632 unsigned long *mid_mfn_p;
633 unsigned long **mid;
634
635 mid = p2m_top[topidx];
636 mid_mfn_p = p2m_top_mfn_p[topidx];
637 if (mid == p2m_mid_missing) {
638 mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
639
640 p2m_mid_init(mid);
641
642 p2m_top[topidx] = mid;
643
644 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
645 }
646 /* And the save/restore P2M tables.. */
647 if (mid_mfn_p == p2m_mid_missing_mfn) {
648 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
649 p2m_mid_mfn_init(mid_mfn_p);
650
651 p2m_top_mfn_p[topidx] = mid_mfn_p;
652 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
653 /* Note: we don't set mid_mfn_p[midix] here,
654 * look in early_alloc_p2m_middle */
655 }
656 return true;
657}
658
659/*
660 * Skim over the P2M tree looking at pages that are either filled with
661 * INVALID_P2M_ENTRY or with 1:1 PFNs. If found, re-use that page and
662 * replace the P2M leaf with a p2m_missing or p2m_identity.
663 * Stick the old page in the new P2M tree location.
664 */
665bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_mfn)
666{
667 unsigned topidx;
668 unsigned mididx;
669 unsigned ident_pfns;
670 unsigned inv_pfns;
671 unsigned long *p2m;
672 unsigned long *mid_mfn_p;
673 unsigned idx;
674 unsigned long pfn;
675
676 /* We only look when this entails a P2M middle layer */
677 if (p2m_index(set_pfn))
678 return false;
679
680 for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) {
681 topidx = p2m_top_index(pfn);
682
683 if (!p2m_top[topidx])
684 continue;
685
686 if (p2m_top[topidx] == p2m_mid_missing)
687 continue;
688
689 mididx = p2m_mid_index(pfn);
690 p2m = p2m_top[topidx][mididx];
691 if (!p2m)
692 continue;
693
694 if ((p2m == p2m_missing) || (p2m == p2m_identity))
695 continue;
696
697 if ((unsigned long)p2m == INVALID_P2M_ENTRY)
698 continue;
699
700 ident_pfns = 0;
701 inv_pfns = 0;
702 for (idx = 0; idx < P2M_PER_PAGE; idx++) {
703 /* IDENTITY_PFNs are 1:1 */
704 if (p2m[idx] == IDENTITY_FRAME(pfn + idx))
705 ident_pfns++;
706 else if (p2m[idx] == INVALID_P2M_ENTRY)
707 inv_pfns++;
708 else
709 break;
710 }
711 if ((ident_pfns == P2M_PER_PAGE) || (inv_pfns == P2M_PER_PAGE))
712 goto found;
713 }
714 return false;
715found:
716 /* Found one, replace old with p2m_identity or p2m_missing */
717 p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing);
718 /* And the other for save/restore.. */
719 mid_mfn_p = p2m_top_mfn_p[topidx];
720 /* NOTE: Even if it is a p2m_identity it should still be point to
721 * a page filled with INVALID_P2M_ENTRY entries. */
722 mid_mfn_p[mididx] = virt_to_mfn(p2m_missing);
723
724 /* Reset where we want to stick the old page in. */
725 topidx = p2m_top_index(set_pfn);
726 mididx = p2m_mid_index(set_pfn);
727
728 /* This shouldn't happen */
729 if (WARN_ON(p2m_top[topidx] == p2m_mid_missing))
730 early_alloc_p2m(set_pfn);
731
732 if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing))
733 return false;
734
735 p2m_init(p2m);
736 p2m_top[topidx][mididx] = p2m;
737 mid_mfn_p = p2m_top_mfn_p[topidx];
738 mid_mfn_p[mididx] = virt_to_mfn(p2m);
739 526
740 return true; 527 p2m_init(p2m);
741}
742bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
743{
744 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
745 if (!early_alloc_p2m(pfn))
746 return false;
747 528
748 if (early_can_reuse_p2m_middle(pfn, mfn)) 529 p2m_top[topidx][mididx] = p2m;
749 return __set_phys_to_machine(pfn, mfn);
750 530
751 if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/)) 531 /* For save/restore we need to MFN of the P2M saved */
752 return false; 532
533 mid_mfn_p = p2m_top_mfn_p[topidx];
534 WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing),
535 "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n",
536 topidx, mididx);
537 mid_mfn_p[mididx] = virt_to_mfn(p2m);
753 538
754 if (!__set_phys_to_machine(pfn, mfn))
755 return false;
756 } 539 }
757 540 return idx != 0;
758 return true;
759} 541}
760unsigned long __init set_phys_range_identity(unsigned long pfn_s, 542unsigned long __init set_phys_range_identity(unsigned long pfn_s,
761 unsigned long pfn_e) 543 unsigned long pfn_e)
@@ -775,11 +557,35 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s,
775 pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE)); 557 pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE));
776 pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE) 558 pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
777 { 559 {
778 WARN_ON(!early_alloc_p2m(pfn)); 560 unsigned topidx = p2m_top_index(pfn);
561 unsigned long *mid_mfn_p;
562 unsigned long **mid;
563
564 mid = p2m_top[topidx];
565 mid_mfn_p = p2m_top_mfn_p[topidx];
566 if (mid == p2m_mid_missing) {
567 mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
568
569 p2m_mid_init(mid);
570
571 p2m_top[topidx] = mid;
572
573 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
574 }
575 /* And the save/restore P2M tables.. */
576 if (mid_mfn_p == p2m_mid_missing_mfn) {
577 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
578 p2m_mid_mfn_init(mid_mfn_p);
579
580 p2m_top_mfn_p[topidx] = mid_mfn_p;
581 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
582 /* Note: we don't set mid_mfn_p[midix] here,
583 * look in __early_alloc_p2m */
584 }
779 } 585 }
780 586
781 early_alloc_p2m_middle(pfn_s, true); 587 __early_alloc_p2m(pfn_s);
782 early_alloc_p2m_middle(pfn_e, true); 588 __early_alloc_p2m(pfn_e);
783 589
784 for (pfn = pfn_s; pfn < pfn_e; pfn++) 590 for (pfn = pfn_s; pfn < pfn_e; pfn++)
785 if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) 591 if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
@@ -870,15 +676,13 @@ static unsigned long mfn_hash(unsigned long mfn)
870} 676}
871 677
872/* Add an MFN override for a particular page */ 678/* Add an MFN override for a particular page */
873int m2p_add_override(unsigned long mfn, struct page *page, 679int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte)
874 struct gnttab_map_grant_ref *kmap_op)
875{ 680{
876 unsigned long flags; 681 unsigned long flags;
877 unsigned long pfn; 682 unsigned long pfn;
878 unsigned long uninitialized_var(address); 683 unsigned long uninitialized_var(address);
879 unsigned level; 684 unsigned level;
880 pte_t *ptep = NULL; 685 pte_t *ptep = NULL;
881 int ret = 0;
882 686
883 pfn = page_to_pfn(page); 687 pfn = page_to_pfn(page);
884 if (!PageHighMem(page)) { 688 if (!PageHighMem(page)) {
@@ -888,52 +692,24 @@ int m2p_add_override(unsigned long mfn, struct page *page,
888 "m2p_add_override: pfn %lx not mapped", pfn)) 692 "m2p_add_override: pfn %lx not mapped", pfn))
889 return -EINVAL; 693 return -EINVAL;
890 } 694 }
891 WARN_ON(PagePrivate(page)); 695
892 SetPagePrivate(page); 696 page->private = mfn;
893 set_page_private(page, mfn);
894 page->index = pfn_to_mfn(pfn); 697 page->index = pfn_to_mfn(pfn);
895 698
896 if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) 699 if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
897 return -ENOMEM; 700 return -ENOMEM;
898 701
899 if (kmap_op != NULL) { 702 if (clear_pte && !PageHighMem(page))
900 if (!PageHighMem(page)) { 703 /* Just zap old mapping for now */
901 struct multicall_space mcs = 704 pte_clear(&init_mm, address, ptep);
902 xen_mc_entry(sizeof(*kmap_op));
903
904 MULTI_grant_table_op(mcs.mc,
905 GNTTABOP_map_grant_ref, kmap_op, 1);
906
907 xen_mc_issue(PARAVIRT_LAZY_MMU);
908 }
909 }
910 spin_lock_irqsave(&m2p_override_lock, flags); 705 spin_lock_irqsave(&m2p_override_lock, flags);
911 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); 706 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
912 spin_unlock_irqrestore(&m2p_override_lock, flags); 707 spin_unlock_irqrestore(&m2p_override_lock, flags);
913 708
914 /* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in
915 * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other
916 * pfn so that the following mfn_to_pfn(mfn) calls will return the
917 * pfn from the m2p_override (the backend pfn) instead.
918 * We need to do this because the pages shared by the frontend
919 * (xen-blkfront) can be already locked (lock_page, called by
920 * do_read_cache_page); when the userspace backend tries to use them
921 * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so
922 * do_blockdev_direct_IO is going to try to lock the same pages
923 * again resulting in a deadlock.
924 * As a side effect get_user_pages_fast might not be safe on the
925 * frontend pages while they are being shared with the backend,
926 * because mfn_to_pfn (that ends up being called by GUPF) will
927 * return the backend pfn rather than the frontend pfn. */
928 ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
929 if (ret == 0 && get_phys_to_machine(pfn) == mfn)
930 set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
931
932 return 0; 709 return 0;
933} 710}
934EXPORT_SYMBOL_GPL(m2p_add_override); 711EXPORT_SYMBOL_GPL(m2p_add_override);
935int m2p_remove_override(struct page *page, 712int m2p_remove_override(struct page *page, bool clear_pte)
936 struct gnttab_map_grant_ref *kmap_op)
937{ 713{
938 unsigned long flags; 714 unsigned long flags;
939 unsigned long mfn; 715 unsigned long mfn;
@@ -941,7 +717,6 @@ int m2p_remove_override(struct page *page,
941 unsigned long uninitialized_var(address); 717 unsigned long uninitialized_var(address);
942 unsigned level; 718 unsigned level;
943 pte_t *ptep = NULL; 719 pte_t *ptep = NULL;
944 int ret = 0;
945 720
946 pfn = page_to_pfn(page); 721 pfn = page_to_pfn(page);
947 mfn = get_phys_to_machine(pfn); 722 mfn = get_phys_to_machine(pfn);
@@ -960,69 +735,13 @@ int m2p_remove_override(struct page *page,
960 spin_lock_irqsave(&m2p_override_lock, flags); 735 spin_lock_irqsave(&m2p_override_lock, flags);
961 list_del(&page->lru); 736 list_del(&page->lru);
962 spin_unlock_irqrestore(&m2p_override_lock, flags); 737 spin_unlock_irqrestore(&m2p_override_lock, flags);
963 WARN_ON(!PagePrivate(page));
964 ClearPagePrivate(page);
965
966 set_phys_to_machine(pfn, page->index); 738 set_phys_to_machine(pfn, page->index);
967 if (kmap_op != NULL) {
968 if (!PageHighMem(page)) {
969 struct multicall_space mcs;
970 struct gnttab_unmap_grant_ref *unmap_op;
971
972 /*
973 * It might be that we queued all the m2p grant table
974 * hypercalls in a multicall, then m2p_remove_override
975 * get called before the multicall has actually been
976 * issued. In this case handle is going to -1 because
977 * it hasn't been modified yet.
978 */
979 if (kmap_op->handle == -1)
980 xen_mc_flush();
981 /*
982 * Now if kmap_op->handle is negative it means that the
983 * hypercall actually returned an error.
984 */
985 if (kmap_op->handle == GNTST_general_error) {
986 printk(KERN_WARNING "m2p_remove_override: "
987 "pfn %lx mfn %lx, failed to modify kernel mappings",
988 pfn, mfn);
989 return -1;
990 }
991
992 mcs = xen_mc_entry(
993 sizeof(struct gnttab_unmap_grant_ref));
994 unmap_op = mcs.args;
995 unmap_op->host_addr = kmap_op->host_addr;
996 unmap_op->handle = kmap_op->handle;
997 unmap_op->dev_bus_addr = 0;
998
999 MULTI_grant_table_op(mcs.mc,
1000 GNTTABOP_unmap_grant_ref, unmap_op, 1);
1001
1002 xen_mc_issue(PARAVIRT_LAZY_MMU);
1003
1004 set_pte_at(&init_mm, address, ptep,
1005 pfn_pte(pfn, PAGE_KERNEL));
1006 __flush_tlb_single(address);
1007 kmap_op->host_addr = 0;
1008 }
1009 }
1010 739
1011 /* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present 740 if (clear_pte && !PageHighMem(page))
1012 * somewhere in this domain, even before being added to the 741 set_pte_at(&init_mm, address, ptep,
1013 * m2p_override (see comment above in m2p_add_override). 742 pfn_pte(pfn, PAGE_KERNEL));
1014 * If there are no other entries in the m2p_override corresponding 743 /* No tlb flush necessary because the caller already
1015 * to this mfn, then remove the FOREIGN_FRAME_BIT from the p2m for 744 * left the pte unmapped. */
1016 * the original pfn (the one shared by the frontend): the backend
1017 * cannot do any IO on this page anymore because it has been
1018 * unshared. Removing the FOREIGN_FRAME_BIT from the p2m entry of
1019 * the original pfn causes mfn_to_pfn(mfn) to return the frontend
1020 * pfn again. */
1021 mfn &= ~FOREIGN_FRAME_BIT;
1022 ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
1023 if (ret == 0 && get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) &&
1024 m2p_find_override(mfn) == NULL)
1025 set_phys_to_machine(pfn, mfn);
1026 745
1027 return 0; 746 return 0;
1028} 747}
@@ -1039,7 +758,7 @@ struct page *m2p_find_override(unsigned long mfn)
1039 spin_lock_irqsave(&m2p_override_lock, flags); 758 spin_lock_irqsave(&m2p_override_lock, flags);
1040 759
1041 list_for_each_entry(p, bucket, lru) { 760 list_for_each_entry(p, bucket, lru) {
1042 if (page_private(p) == mfn) { 761 if (p->private == mfn) {
1043 ret = p; 762 ret = p;
1044 break; 763 break;
1045 } 764 }
@@ -1063,21 +782,17 @@ unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
1063EXPORT_SYMBOL_GPL(m2p_find_override_pfn); 782EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
1064 783
1065#ifdef CONFIG_XEN_DEBUG_FS 784#ifdef CONFIG_XEN_DEBUG_FS
1066#include <linux/debugfs.h> 785
1067#include "debugfs.h" 786int p2m_dump_show(struct seq_file *m, void *v)
1068static int p2m_dump_show(struct seq_file *m, void *v)
1069{ 787{
1070 static const char * const level_name[] = { "top", "middle", 788 static const char * const level_name[] = { "top", "middle",
1071 "entry", "abnormal", "error"}; 789 "entry", "abnormal" };
790 static const char * const type_name[] = { "identity", "missing",
791 "pfn", "abnormal"};
1072#define TYPE_IDENTITY 0 792#define TYPE_IDENTITY 0
1073#define TYPE_MISSING 1 793#define TYPE_MISSING 1
1074#define TYPE_PFN 2 794#define TYPE_PFN 2
1075#define TYPE_UNKNOWN 3 795#define TYPE_UNKNOWN 3
1076 static const char * const type_name[] = {
1077 [TYPE_IDENTITY] = "identity",
1078 [TYPE_MISSING] = "missing",
1079 [TYPE_PFN] = "pfn",
1080 [TYPE_UNKNOWN] = "abnormal"};
1081 unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0; 796 unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0;
1082 unsigned int uninitialized_var(prev_level); 797 unsigned int uninitialized_var(prev_level);
1083 unsigned int uninitialized_var(prev_type); 798 unsigned int uninitialized_var(prev_type);
@@ -1141,32 +856,4 @@ static int p2m_dump_show(struct seq_file *m, void *v)
1141#undef TYPE_PFN 856#undef TYPE_PFN
1142#undef TYPE_UNKNOWN 857#undef TYPE_UNKNOWN
1143} 858}
1144 859#endif
1145static int p2m_dump_open(struct inode *inode, struct file *filp)
1146{
1147 return single_open(filp, p2m_dump_show, NULL);
1148}
1149
1150static const struct file_operations p2m_dump_fops = {
1151 .open = p2m_dump_open,
1152 .read = seq_read,
1153 .llseek = seq_lseek,
1154 .release = single_release,
1155};
1156
1157static struct dentry *d_mmu_debug;
1158
1159static int __init xen_p2m_debugfs(void)
1160{
1161 struct dentry *d_xen = xen_init_debugfs();
1162
1163 if (d_xen == NULL)
1164 return -ENOMEM;
1165
1166 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
1167
1168 debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
1169 return 0;
1170}
1171fs_initcall(xen_p2m_debugfs);
1172#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
index 969570491c3..b480d4207a4 100644
--- a/arch/x86/xen/pci-swiotlb-xen.c
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -8,20 +8,12 @@
8#include <xen/xen.h> 8#include <xen/xen.h>
9#include <asm/iommu_table.h> 9#include <asm/iommu_table.h>
10 10
11
12#include <asm/xen/swiotlb-xen.h>
13#ifdef CONFIG_X86_64
14#include <asm/iommu.h>
15#include <asm/dma.h>
16#endif
17#include <linux/export.h>
18
19int xen_swiotlb __read_mostly; 11int xen_swiotlb __read_mostly;
20 12
21static struct dma_map_ops xen_swiotlb_dma_ops = { 13static struct dma_map_ops xen_swiotlb_dma_ops = {
22 .mapping_error = xen_swiotlb_dma_mapping_error, 14 .mapping_error = xen_swiotlb_dma_mapping_error,
23 .alloc = xen_swiotlb_alloc_coherent, 15 .alloc_coherent = xen_swiotlb_alloc_coherent,
24 .free = xen_swiotlb_free_coherent, 16 .free_coherent = xen_swiotlb_free_coherent,
25 .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, 17 .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
26 .sync_single_for_device = xen_swiotlb_sync_single_for_device, 18 .sync_single_for_device = xen_swiotlb_sync_single_for_device,
27 .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu, 19 .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
@@ -42,64 +34,34 @@ static struct dma_map_ops xen_swiotlb_dma_ops = {
42int __init pci_xen_swiotlb_detect(void) 34int __init pci_xen_swiotlb_detect(void)
43{ 35{
44 36
45 if (!xen_pv_domain())
46 return 0;
47
48 /* If running as PV guest, either iommu=soft, or swiotlb=force will 37 /* If running as PV guest, either iommu=soft, or swiotlb=force will
49 * activate this IOMMU. If running as PV privileged, activate it 38 * activate this IOMMU. If running as PV privileged, activate it
50 * irregardless. 39 * irregardless.
51 */ 40 */
52 if ((xen_initial_domain() || swiotlb || swiotlb_force)) 41 if ((xen_initial_domain() || swiotlb || swiotlb_force) &&
42 (xen_pv_domain()))
53 xen_swiotlb = 1; 43 xen_swiotlb = 1;
54 44
55 /* If we are running under Xen, we MUST disable the native SWIOTLB. 45 /* If we are running under Xen, we MUST disable the native SWIOTLB.
56 * Don't worry about swiotlb_force flag activating the native, as 46 * Don't worry about swiotlb_force flag activating the native, as
57 * the 'swiotlb' flag is the only one turning it on. */ 47 * the 'swiotlb' flag is the only one turning it on. */
58 swiotlb = 0; 48 if (xen_pv_domain())
49 swiotlb = 0;
59 50
60#ifdef CONFIG_X86_64
61 /* pci_swiotlb_detect_4gb turns on native SWIOTLB if no_iommu == 0
62 * (so no iommu=X command line over-writes).
63 * Considering that PV guests do not want the *native SWIOTLB* but
64 * only Xen SWIOTLB it is not useful to us so set no_iommu=1 here.
65 */
66 if (max_pfn > MAX_DMA32_PFN)
67 no_iommu = 1;
68#endif
69 return xen_swiotlb; 51 return xen_swiotlb;
70} 52}
71 53
72void __init pci_xen_swiotlb_init(void) 54void __init pci_xen_swiotlb_init(void)
73{ 55{
74 if (xen_swiotlb) { 56 if (xen_swiotlb) {
75 xen_swiotlb_init(1, true /* early */); 57 xen_swiotlb_init(1);
76 dma_ops = &xen_swiotlb_dma_ops; 58 dma_ops = &xen_swiotlb_dma_ops;
77 59
78 /* Make sure ACS will be enabled */ 60 /* Make sure ACS will be enabled */
79 pci_request_acs(); 61 pci_request_acs();
80 } 62 }
81} 63}
82
83int pci_xen_swiotlb_init_late(void)
84{
85 int rc;
86
87 if (xen_swiotlb)
88 return 0;
89
90 rc = xen_swiotlb_init(1, false /* late */);
91 if (rc)
92 return rc;
93
94 dma_ops = &xen_swiotlb_dma_ops;
95 /* Make sure ACS will be enabled */
96 pci_request_acs();
97
98 return 0;
99}
100EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late);
101
102IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, 64IOMMU_INIT_FINISH(pci_xen_swiotlb_detect,
103 NULL, 65 0,
104 pci_xen_swiotlb_init, 66 pci_xen_swiotlb_init,
105 NULL); 67 0);
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 0a7852483ff..ffcf2615640 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -24,7 +24,6 @@
24#include <linux/module.h> 24#include <linux/module.h>
25 25
26#include <xen/platform_pci.h> 26#include <xen/platform_pci.h>
27#include "xen-ops.h"
28 27
29#define XEN_PLATFORM_ERR_MAGIC -1 28#define XEN_PLATFORM_ERR_MAGIC -1
30#define XEN_PLATFORM_ERR_PROTOCOL -2 29#define XEN_PLATFORM_ERR_PROTOCOL -2
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 8971a26d21a..e1913024687 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -10,14 +10,12 @@
10#include <linux/pm.h> 10#include <linux/pm.h>
11#include <linux/memblock.h> 11#include <linux/memblock.h>
12#include <linux/cpuidle.h> 12#include <linux/cpuidle.h>
13#include <linux/cpufreq.h>
14 13
15#include <asm/elf.h> 14#include <asm/elf.h>
16#include <asm/vdso.h> 15#include <asm/vdso.h>
17#include <asm/e820.h> 16#include <asm/e820.h>
18#include <asm/setup.h> 17#include <asm/setup.h>
19#include <asm/acpi.h> 18#include <asm/acpi.h>
20#include <asm/numa.h>
21#include <asm/xen/hypervisor.h> 19#include <asm/xen/hypervisor.h>
22#include <asm/xen/hypercall.h> 20#include <asm/xen/hypercall.h>
23 21
@@ -27,6 +25,7 @@
27#include <xen/interface/memory.h> 25#include <xen/interface/memory.h>
28#include <xen/interface/physdev.h> 26#include <xen/interface/physdev.h>
29#include <xen/features.h> 27#include <xen/features.h>
28
30#include "xen-ops.h" 29#include "xen-ops.h"
31#include "vdso.h" 30#include "vdso.h"
32 31
@@ -38,10 +37,7 @@ extern void xen_syscall_target(void);
38extern void xen_syscall32_target(void); 37extern void xen_syscall32_target(void);
39 38
40/* Amount of extra memory space we add to the e820 ranges */ 39/* Amount of extra memory space we add to the e820 ranges */
41struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; 40phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
42
43/* Number of pages released from the initial allocation. */
44unsigned long xen_released_pages;
45 41
46/* 42/*
47 * The maximum amount of extra memory compared to the base size. The 43 * The maximum amount of extra memory compared to the base size. The
@@ -55,225 +51,138 @@ unsigned long xen_released_pages;
55 */ 51 */
56#define EXTRA_MEM_RATIO (10) 52#define EXTRA_MEM_RATIO (10)
57 53
58static void __init xen_add_extra_mem(u64 start, u64 size) 54static void __init xen_add_extra_mem(unsigned long pages)
59{ 55{
60 unsigned long pfn; 56 unsigned long pfn;
61 int i;
62 57
63 for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { 58 u64 size = (u64)pages * PAGE_SIZE;
64 /* Add new region. */ 59 u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
65 if (xen_extra_mem[i].size == 0) {
66 xen_extra_mem[i].start = start;
67 xen_extra_mem[i].size = size;
68 break;
69 }
70 /* Append to existing region. */
71 if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
72 xen_extra_mem[i].size += size;
73 break;
74 }
75 }
76 if (i == XEN_EXTRA_MEM_MAX_REGIONS)
77 printk(KERN_WARNING "Warning: not enough extra memory regions\n");
78 60
79 memblock_reserve(start, size); 61 if (!pages)
62 return;
80 63
81 xen_max_p2m_pfn = PFN_DOWN(start + size); 64 e820_add_region(extra_start, size, E820_RAM);
82 for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { 65 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
83 unsigned long mfn = pfn_to_mfn(pfn);
84 66
85 if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn)) 67 memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA");
86 continue;
87 WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
88 pfn, mfn);
89 68
69 xen_extra_mem_size += size;
70
71 xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
72
73 for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++)
90 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); 74 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
91 }
92} 75}
93 76
94static unsigned long __init xen_do_chunk(unsigned long start, 77static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
95 unsigned long end, bool release) 78 phys_addr_t end_addr)
96{ 79{
97 struct xen_memory_reservation reservation = { 80 struct xen_memory_reservation reservation = {
98 .address_bits = 0, 81 .address_bits = 0,
99 .extent_order = 0, 82 .extent_order = 0,
100 .domid = DOMID_SELF 83 .domid = DOMID_SELF
101 }; 84 };
85 unsigned long start, end;
102 unsigned long len = 0; 86 unsigned long len = 0;
103 unsigned long pfn; 87 unsigned long pfn;
104 int ret; 88 int ret;
105 89
106 for (pfn = start; pfn < end; pfn++) { 90 start = PFN_UP(start_addr);
107 unsigned long frame; 91 end = PFN_DOWN(end_addr);
92
93 if (end <= start)
94 return 0;
95
96 for(pfn = start; pfn < end; pfn++) {
108 unsigned long mfn = pfn_to_mfn(pfn); 97 unsigned long mfn = pfn_to_mfn(pfn);
109 98
110 if (release) { 99 /* Make sure pfn exists to start with */
111 /* Make sure pfn exists to start with */ 100 if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
112 if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) 101 continue;
113 continue; 102
114 frame = mfn; 103 set_xen_guest_handle(reservation.extent_start, &mfn);
115 } else {
116 if (mfn != INVALID_P2M_ENTRY)
117 continue;
118 frame = pfn;
119 }
120 set_xen_guest_handle(reservation.extent_start, &frame);
121 reservation.nr_extents = 1; 104 reservation.nr_extents = 1;
122 105
123 ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap, 106 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
124 &reservation); 107 &reservation);
125 WARN(ret != 1, "Failed to %s pfn %lx err=%d\n", 108 WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
126 release ? "release" : "populate", pfn, ret);
127
128 if (ret == 1) { 109 if (ret == 1) {
129 if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) { 110 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
130 if (release)
131 break;
132 set_xen_guest_handle(reservation.extent_start, &frame);
133 reservation.nr_extents = 1;
134 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
135 &reservation);
136 break;
137 }
138 len++; 111 len++;
139 } else 112 }
140 break;
141 } 113 }
142 if (len) 114 printk(KERN_INFO "Freeing %lx-%lx pfn range: %lu pages freed\n",
143 printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n", 115 start, end, len);
144 release ? "Freeing" : "Populating",
145 start, end, len,
146 release ? "freed" : "added");
147 116
148 return len; 117 return len;
149} 118}
150 119
151static unsigned long __init xen_release_chunk(unsigned long start, 120static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
152 unsigned long end) 121 const struct e820map *e820)
153{ 122{
154 return xen_do_chunk(start, end, true); 123 phys_addr_t max_addr = PFN_PHYS(max_pfn);
155} 124 phys_addr_t last_end = ISA_END_ADDRESS;
156 125 unsigned long released = 0;
157static unsigned long __init xen_populate_chunk( 126 int i;
158 const struct e820entry *list, size_t map_size,
159 unsigned long max_pfn, unsigned long *last_pfn,
160 unsigned long credits_left)
161{
162 const struct e820entry *entry;
163 unsigned int i;
164 unsigned long done = 0;
165 unsigned long dest_pfn;
166
167 for (i = 0, entry = list; i < map_size; i++, entry++) {
168 unsigned long s_pfn;
169 unsigned long e_pfn;
170 unsigned long pfns;
171 long capacity;
172
173 if (credits_left <= 0)
174 break;
175
176 if (entry->type != E820_RAM)
177 continue;
178
179 e_pfn = PFN_DOWN(entry->addr + entry->size);
180
181 /* We only care about E820 after the xen_start_info->nr_pages */
182 if (e_pfn <= max_pfn)
183 continue;
184
185 s_pfn = PFN_UP(entry->addr);
186 /* If the E820 falls within the nr_pages, we want to start
187 * at the nr_pages PFN.
188 * If that would mean going past the E820 entry, skip it
189 */
190 if (s_pfn <= max_pfn) {
191 capacity = e_pfn - max_pfn;
192 dest_pfn = max_pfn;
193 } else {
194 capacity = e_pfn - s_pfn;
195 dest_pfn = s_pfn;
196 }
197 127
198 if (credits_left < capacity) 128 /* Free any unused memory above the low 1Mbyte. */
199 capacity = credits_left; 129 for (i = 0; i < e820->nr_map && last_end < max_addr; i++) {
130 phys_addr_t end = e820->map[i].addr;
131 end = min(max_addr, end);
200 132
201 pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false); 133 if (last_end < end)
202 done += pfns; 134 released += xen_release_chunk(last_end, end);
203 *last_pfn = (dest_pfn + pfns); 135 last_end = max(last_end, e820->map[i].addr + e820->map[i].size);
204 if (pfns < capacity)
205 break;
206 credits_left -= pfns;
207 } 136 }
208 return done;
209}
210 137
211static void __init xen_set_identity_and_release_chunk( 138 if (last_end < max_addr)
212 unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, 139 released += xen_release_chunk(last_end, max_addr);
213 unsigned long *released, unsigned long *identity)
214{
215 unsigned long pfn;
216 140
217 /* 141 printk(KERN_INFO "released %lu pages of unused memory\n", released);
218 * If the PFNs are currently mapped, the VA mapping also needs 142 return released;
219 * to be updated to be 1:1.
220 */
221 for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
222 (void)HYPERVISOR_update_va_mapping(
223 (unsigned long)__va(pfn << PAGE_SHIFT),
224 mfn_pte(pfn, PAGE_KERNEL_IO), 0);
225
226 if (start_pfn < nr_pages)
227 *released += xen_release_chunk(
228 start_pfn, min(end_pfn, nr_pages));
229
230 *identity += set_phys_range_identity(start_pfn, end_pfn);
231} 143}
232 144
233static unsigned long __init xen_set_identity_and_release( 145static unsigned long __init xen_set_identity(const struct e820entry *list,
234 const struct e820entry *list, size_t map_size, unsigned long nr_pages) 146 ssize_t map_size)
235{ 147{
236 phys_addr_t start = 0; 148 phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS;
237 unsigned long released = 0; 149 phys_addr_t start_pci = last;
238 unsigned long identity = 0;
239 const struct e820entry *entry; 150 const struct e820entry *entry;
151 unsigned long identity = 0;
240 int i; 152 int i;
241 153
242 /*
243 * Combine non-RAM regions and gaps until a RAM region (or the
244 * end of the map) is reached, then set the 1:1 map and
245 * release the pages (if available) in those non-RAM regions.
246 *
247 * The combined non-RAM regions are rounded to a whole number
248 * of pages so any partial pages are accessible via the 1:1
249 * mapping. This is needed for some BIOSes that put (for
250 * example) the DMI tables in a reserved region that begins on
251 * a non-page boundary.
252 */
253 for (i = 0, entry = list; i < map_size; i++, entry++) { 154 for (i = 0, entry = list; i < map_size; i++, entry++) {
254 phys_addr_t end = entry->addr + entry->size; 155 phys_addr_t start = entry->addr;
255 if (entry->type == E820_RAM || i == map_size - 1) { 156 phys_addr_t end = start + entry->size;
256 unsigned long start_pfn = PFN_DOWN(start);
257 unsigned long end_pfn = PFN_UP(end);
258 157
259 if (entry->type == E820_RAM) 158 if (start < last)
260 end_pfn = PFN_UP(entry->addr); 159 start = last;
261 160
262 if (start_pfn < end_pfn) 161 if (end <= start)
263 xen_set_identity_and_release_chunk( 162 continue;
264 start_pfn, end_pfn, nr_pages,
265 &released, &identity);
266 163
267 start = end; 164 /* Skip over the 1MB region. */
268 } 165 if (last > end)
269 } 166 continue;
270 167
271 if (released) 168 if ((entry->type == E820_RAM) || (entry->type == E820_UNUSABLE)) {
272 printk(KERN_INFO "Released %lu pages of unused memory\n", released); 169 if (start > start_pci)
273 if (identity) 170 identity += set_phys_range_identity(
274 printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity); 171 PFN_UP(start_pci), PFN_DOWN(start));
275 172
276 return released; 173 /* Without saving 'last' we would gooble RAM too
174 * at the end of the loop. */
175 last = end;
176 start_pci = end;
177 continue;
178 }
179 start_pci = min(start, start_pci);
180 last = end;
181 }
182 if (last > start_pci)
183 identity += set_phys_range_identity(
184 PFN_UP(start_pci), PFN_DOWN(last));
185 return identity;
277} 186}
278 187
279static unsigned long __init xen_get_max_pages(void) 188static unsigned long __init xen_get_max_pages(void)
@@ -300,34 +209,21 @@ static unsigned long __init xen_get_max_pages(void)
300 return min(max_pages, MAX_DOMAIN_PAGES); 209 return min(max_pages, MAX_DOMAIN_PAGES);
301} 210}
302 211
303static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
304{
305 u64 end = start + size;
306
307 /* Align RAM regions to page boundaries. */
308 if (type == E820_RAM) {
309 start = PAGE_ALIGN(start);
310 end &= ~((u64)PAGE_SIZE - 1);
311 }
312
313 e820_add_region(start, end - start, type);
314}
315
316/** 212/**
317 * machine_specific_memory_setup - Hook for machine specific memory setup. 213 * machine_specific_memory_setup - Hook for machine specific memory setup.
318 **/ 214 **/
319char * __init xen_memory_setup(void) 215char * __init xen_memory_setup(void)
320{ 216{
321 static struct e820entry map[E820MAX] __initdata; 217 static struct e820entry map[E820MAX] __initdata;
218 static struct e820entry map_raw[E820MAX] __initdata;
322 219
323 unsigned long max_pfn = xen_start_info->nr_pages; 220 unsigned long max_pfn = xen_start_info->nr_pages;
324 unsigned long long mem_end; 221 unsigned long long mem_end;
325 int rc; 222 int rc;
326 struct xen_memory_map memmap; 223 struct xen_memory_map memmap;
327 unsigned long max_pages;
328 unsigned long last_pfn = 0;
329 unsigned long extra_pages = 0; 224 unsigned long extra_pages = 0;
330 unsigned long populated; 225 unsigned long extra_limit;
226 unsigned long identity_pages = 0;
331 int i; 227 int i;
332 int op; 228 int op;
333 229
@@ -353,34 +249,84 @@ char * __init xen_memory_setup(void)
353 } 249 }
354 BUG_ON(rc); 250 BUG_ON(rc);
355 251
356 /* Make sure the Xen-supplied memory map is well-ordered. */ 252 memcpy(map_raw, map, sizeof(map));
357 sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); 253 e820.nr_map = 0;
254 xen_extra_mem_start = mem_end;
255 for (i = 0; i < memmap.nr_entries; i++) {
256 unsigned long long end;
257
258 /* Guard against non-page aligned E820 entries. */
259 if (map[i].type == E820_RAM)
260 map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE;
261
262 end = map[i].addr + map[i].size;
263 if (map[i].type == E820_RAM && end > mem_end) {
264 /* RAM off the end - may be partially included */
265 u64 delta = min(map[i].size, end - mem_end);
266
267 map[i].size -= delta;
268 end -= delta;
269
270 extra_pages += PFN_DOWN(delta);
271 /*
272 * Set RAM below 4GB that is not for us to be unusable.
273 * This prevents "System RAM" address space from being
274 * used as potential resource for I/O address (happens
275 * when 'allocate_resource' is called).
276 */
277 if (delta &&
278 (xen_initial_domain() && end < 0x100000000ULL))
279 e820_add_region(end, delta, E820_UNUSABLE);
280 }
281
282 if (map[i].size > 0 && end > xen_extra_mem_start)
283 xen_extra_mem_start = end;
358 284
359 max_pages = xen_get_max_pages(); 285 /* Add region if any remains */
360 if (max_pages > max_pfn) 286 if (map[i].size > 0)
361 extra_pages += max_pages - max_pfn; 287 e820_add_region(map[i].addr, map[i].size, map[i].type);
288 }
289 /* Align the balloon area so that max_low_pfn does not get set
290 * to be at the _end_ of the PCI gap at the far end (fee01000).
291 * Note that xen_extra_mem_start gets set in the loop above to be
292 * past the last E820 region. */
293 if (xen_initial_domain() && (xen_extra_mem_start < (1ULL<<32)))
294 xen_extra_mem_start = (1ULL<<32);
362 295
363 /* 296 /*
364 * Set P2M for all non-RAM pages and E820 gaps to be identity 297 * In domU, the ISA region is normal, usable memory, but we
365 * type PFNs. Any RAM pages that would be made inaccesible by 298 * reserve ISA memory anyway because too many things poke
366 * this are first released. 299 * about in there.
300 *
301 * In Dom0, the host E820 information can leave gaps in the
302 * ISA range, which would cause us to release those pages. To
303 * avoid this, we unconditionally reserve them here.
367 */ 304 */
368 xen_released_pages = xen_set_identity_and_release( 305 e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
369 map, memmap.nr_entries, max_pfn); 306 E820_RESERVED);
370 307
371 /* 308 /*
372 * Populate back the non-RAM pages and E820 gaps that had been 309 * Reserve Xen bits:
373 * released. */ 310 * - mfn_list
374 populated = xen_populate_chunk(map, memmap.nr_entries, 311 * - xen_start_info
375 max_pfn, &last_pfn, xen_released_pages); 312 * See comment above "struct start_info" in <xen/interface/xen.h>
313 */
314 memblock_x86_reserve_range(__pa(xen_start_info->mfn_list),
315 __pa(xen_start_info->pt_base),
316 "XEN START INFO");
376 317
377 xen_released_pages -= populated; 318 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
378 extra_pages += xen_released_pages;
379 319
380 if (last_pfn > max_pfn) { 320 extra_limit = xen_get_max_pages();
381 max_pfn = min(MAX_DOMAIN_PAGES, last_pfn); 321 if (max_pfn + extra_pages > extra_limit) {
382 mem_end = PFN_PHYS(max_pfn); 322 if (extra_limit > max_pfn)
323 extra_pages = extra_limit - max_pfn;
324 else
325 extra_pages = 0;
383 } 326 }
327
328 extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820);
329
384 /* 330 /*
385 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO 331 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
386 * factor the base size. On non-highmem systems, the base 332 * factor the base size. On non-highmem systems, the base
@@ -392,70 +338,23 @@ char * __init xen_memory_setup(void)
392 * the initial memory is also very large with respect to 338 * the initial memory is also very large with respect to
393 * lowmem, but we won't try to deal with that here. 339 * lowmem, but we won't try to deal with that here.
394 */ 340 */
395 extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), 341 extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
396 extra_pages); 342 max_pfn + extra_pages);
397 i = 0;
398 while (i < memmap.nr_entries) {
399 u64 addr = map[i].addr;
400 u64 size = map[i].size;
401 u32 type = map[i].type;
402
403 if (type == E820_RAM) {
404 if (addr < mem_end) {
405 size = min(size, mem_end - addr);
406 } else if (extra_pages) {
407 size = min(size, (u64)extra_pages * PAGE_SIZE);
408 extra_pages -= size / PAGE_SIZE;
409 xen_add_extra_mem(addr, size);
410 } else
411 type = E820_UNUSABLE;
412 }
413 343
414 xen_align_and_add_e820_region(addr, size, type); 344 if (extra_limit >= max_pfn)
345 extra_pages = extra_limit - max_pfn;
346 else
347 extra_pages = 0;
415 348
416 map[i].addr += size; 349 xen_add_extra_mem(extra_pages);
417 map[i].size -= size;
418 if (map[i].size == 0)
419 i++;
420 }
421 350
422 /* 351 /*
423 * In domU, the ISA region is normal, usable memory, but we 352 * Set P2M for all non-RAM pages and E820 gaps to be identity
424 * reserve ISA memory anyway because too many things poke 353 * type PFNs. We supply it with the non-sanitized version
425 * about in there. 354 * of the E820.
426 */
427 e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
428 E820_RESERVED);
429
430 /*
431 * Reserve Xen bits:
432 * - mfn_list
433 * - xen_start_info
434 * See comment above "struct start_info" in <xen/interface/xen.h>
435 * We tried to make the the memblock_reserve more selective so
436 * that it would be clear what region is reserved. Sadly we ran
437 * in the problem wherein on a 64-bit hypervisor with a 32-bit
438 * initial domain, the pt_base has the cr3 value which is not
439 * neccessarily where the pagetable starts! As Jan put it: "
440 * Actually, the adjustment turns out to be correct: The page
441 * tables for a 32-on-64 dom0 get allocated in the order "first L1",
442 * "first L2", "first L3", so the offset to the page table base is
443 * indeed 2. When reading xen/include/public/xen.h's comment
444 * very strictly, this is not a violation (since there nothing is said
445 * that the first thing in the page table space is pointed to by
446 * pt_base; I admit that this seems to be implied though, namely
447 * do I think that it is implied that the page table space is the
448 * range [pt_base, pt_base + nt_pt_frames), whereas that
449 * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
450 * which - without a priori knowledge - the kernel would have
451 * difficulty to figure out)." - so lets just fall back to the
452 * easy way and reserve the whole region.
453 */ 355 */
454 memblock_reserve(__pa(xen_start_info->mfn_list), 356 identity_pages = xen_set_identity(map_raw, memmap.nr_entries);
455 xen_start_info->pt_base - xen_start_info->mfn_list); 357 printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages);
456
457 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
458
459 return "Xen"; 358 return "Xen";
460} 359}
461 360
@@ -560,10 +459,7 @@ void __init xen_arch_setup(void)
560 boot_cpu_data.hlt_works_ok = 1; 459 boot_cpu_data.hlt_works_ok = 1;
561#endif 460#endif
562 disable_cpuidle(); 461 disable_cpuidle();
563 disable_cpufreq(); 462 boot_option_idle_override = IDLE_HALT;
564 WARN_ON(set_pm_idle_to_default()); 463 WARN_ON(set_pm_idle_to_default());
565 fiddle_vdso(); 464 fiddle_vdso();
566#ifdef CONFIG_NUMA
567 numa_off = 1;
568#endif
569} 465}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 4f7d2599b48..041d4fe9dfe 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -16,7 +16,6 @@
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <linux/irq_work.h>
20 19
21#include <asm/paravirt.h> 20#include <asm/paravirt.h>
22#include <asm/desc.h> 21#include <asm/desc.h>
@@ -42,12 +41,10 @@ cpumask_var_t xen_cpu_initialized_map;
42static DEFINE_PER_CPU(int, xen_resched_irq); 41static DEFINE_PER_CPU(int, xen_resched_irq);
43static DEFINE_PER_CPU(int, xen_callfunc_irq); 42static DEFINE_PER_CPU(int, xen_callfunc_irq);
44static DEFINE_PER_CPU(int, xen_callfuncsingle_irq); 43static DEFINE_PER_CPU(int, xen_callfuncsingle_irq);
45static DEFINE_PER_CPU(int, xen_irq_work);
46static DEFINE_PER_CPU(int, xen_debug_irq) = -1; 44static DEFINE_PER_CPU(int, xen_debug_irq) = -1;
47 45
48static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); 46static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
49static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); 47static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
50static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id);
51 48
52/* 49/*
53 * Reschedule call back. 50 * Reschedule call back.
@@ -62,7 +59,7 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
62 59
63static void __cpuinit cpu_bringup(void) 60static void __cpuinit cpu_bringup(void)
64{ 61{
65 int cpu; 62 int cpu = smp_processor_id();
66 63
67 cpu_init(); 64 cpu_init();
68 touch_softlockup_watchdog(); 65 touch_softlockup_watchdog();
@@ -78,12 +75,8 @@ static void __cpuinit cpu_bringup(void)
78 75
79 xen_setup_cpu_clockevents(); 76 xen_setup_cpu_clockevents();
80 77
81 notify_cpu_starting(cpu);
82
83 set_cpu_online(cpu, true); 78 set_cpu_online(cpu, true);
84 79 percpu_write(cpu_state, CPU_ONLINE);
85 this_cpu_write(cpu_state, CPU_ONLINE);
86
87 wmb(); 80 wmb();
88 81
89 /* We can take interrupts now: we're officially "up". */ 82 /* We can take interrupts now: we're officially "up". */
@@ -144,17 +137,6 @@ static int xen_smp_intr_init(unsigned int cpu)
144 goto fail; 137 goto fail;
145 per_cpu(xen_callfuncsingle_irq, cpu) = rc; 138 per_cpu(xen_callfuncsingle_irq, cpu) = rc;
146 139
147 callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu);
148 rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR,
149 cpu,
150 xen_irq_work_interrupt,
151 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
152 callfunc_name,
153 NULL);
154 if (rc < 0)
155 goto fail;
156 per_cpu(xen_irq_work, cpu) = rc;
157
158 return 0; 140 return 0;
159 141
160 fail: 142 fail:
@@ -167,8 +149,6 @@ static int xen_smp_intr_init(unsigned int cpu)
167 if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0) 149 if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0)
168 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), 150 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu),
169 NULL); 151 NULL);
170 if (per_cpu(xen_irq_work, cpu) >= 0)
171 unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL);
172 152
173 return rc; 153 return rc;
174} 154}
@@ -192,7 +172,6 @@ static void __init xen_fill_possible_map(void)
192static void __init xen_filter_cpu_maps(void) 172static void __init xen_filter_cpu_maps(void)
193{ 173{
194 int i, rc; 174 int i, rc;
195 unsigned int subtract = 0;
196 175
197 if (!xen_initial_domain()) 176 if (!xen_initial_domain())
198 return; 177 return;
@@ -207,22 +186,8 @@ static void __init xen_filter_cpu_maps(void)
207 } else { 186 } else {
208 set_cpu_possible(i, false); 187 set_cpu_possible(i, false);
209 set_cpu_present(i, false); 188 set_cpu_present(i, false);
210 subtract++;
211 } 189 }
212 } 190 }
213#ifdef CONFIG_HOTPLUG_CPU
214 /* This is akin to using 'nr_cpus' on the Linux command line.
215 * Which is OK as when we use 'dom0_max_vcpus=X' we can only
216 * have up to X, while nr_cpu_ids is greater than X. This
217 * normally is not a problem, except when CPU hotplugging
218 * is involved and then there might be more than X CPUs
219 * in the guest - which will not work as there is no
220 * hypercall to expand the max number of VCPUs an already
221 * running guest has. So cap it up to X. */
222 if (subtract)
223 nr_cpu_ids = nr_cpu_ids - subtract;
224#endif
225
226} 191}
227 192
228static void __init xen_smp_prepare_boot_cpu(void) 193static void __init xen_smp_prepare_boot_cpu(void)
@@ -254,7 +219,7 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
254 } 219 }
255 xen_init_lock_cpu(0); 220 xen_init_lock_cpu(0);
256 221
257 smp_store_boot_cpu_info(); 222 smp_store_cpu_info(0);
258 cpu_data(0).x86_max_cores = 1; 223 cpu_data(0).x86_max_cores = 1;
259 224
260 for_each_possible_cpu(i) { 225 for_each_possible_cpu(i) {
@@ -279,8 +244,18 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
279 set_cpu_possible(cpu, false); 244 set_cpu_possible(cpu, false);
280 } 245 }
281 246
282 for_each_possible_cpu(cpu) 247 for_each_possible_cpu (cpu) {
248 struct task_struct *idle;
249
250 if (cpu == 0)
251 continue;
252
253 idle = fork_idle(cpu);
254 if (IS_ERR(idle))
255 panic("failed fork for CPU %d", cpu);
256
283 set_cpu_present(cpu, true); 257 set_cpu_present(cpu, true);
258 }
284} 259}
285 260
286static int __cpuinit 261static int __cpuinit
@@ -350,8 +325,9 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
350 return 0; 325 return 0;
351} 326}
352 327
353static int __cpuinit xen_cpu_up(unsigned int cpu, struct task_struct *idle) 328static int __cpuinit xen_cpu_up(unsigned int cpu)
354{ 329{
330 struct task_struct *idle = idle_task(cpu);
355 int rc; 331 int rc;
356 332
357 per_cpu(current_task, cpu) = idle; 333 per_cpu(current_task, cpu) = idle;
@@ -377,8 +353,7 @@ static int __cpuinit xen_cpu_up(unsigned int cpu, struct task_struct *idle)
377 return rc; 353 return rc;
378 354
379 if (num_online_cpus() == 1) 355 if (num_online_cpus() == 1)
380 /* Just in case we booted with a single CPU. */ 356 alternatives_smp_switch(1);
381 alternatives_enable_smp();
382 357
383 rc = xen_smp_intr_init(cpu); 358 rc = xen_smp_intr_init(cpu);
384 if (rc) 359 if (rc)
@@ -422,9 +397,11 @@ static void xen_cpu_die(unsigned int cpu)
422 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); 397 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
423 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); 398 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
424 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); 399 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
425 unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL);
426 xen_uninit_lock_cpu(cpu); 400 xen_uninit_lock_cpu(cpu);
427 xen_teardown_timer(cpu); 401 xen_teardown_timer(cpu);
402
403 if (num_online_cpus() == 1)
404 alternatives_smp_switch(0);
428} 405}
429 406
430static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */ 407static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */
@@ -432,13 +409,6 @@ static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */
432 play_dead_common(); 409 play_dead_common();
433 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); 410 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
434 cpu_bringup(); 411 cpu_bringup();
435 /*
436 * Balance out the preempt calls - as we are running in cpu_idle
437 * loop which has been called at bootup from cpu_bringup_and_idle.
438 * The cpucpu_bringup_and_idle called cpu_bringup which made a
439 * preempt_disable() So this preempt_enable will balance it out.
440 */
441 preempt_enable();
442} 412}
443 413
444#else /* !CONFIG_HOTPLUG_CPU */ 414#else /* !CONFIG_HOTPLUG_CPU */
@@ -482,8 +452,8 @@ static void xen_smp_send_reschedule(int cpu)
482 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); 452 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
483} 453}
484 454
485static void __xen_send_IPI_mask(const struct cpumask *mask, 455static void xen_send_IPI_mask(const struct cpumask *mask,
486 int vector) 456 enum ipi_vector vector)
487{ 457{
488 unsigned cpu; 458 unsigned cpu;
489 459
@@ -495,7 +465,7 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
495{ 465{
496 int cpu; 466 int cpu;
497 467
498 __xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); 468 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
499 469
500 /* Make sure other vcpus get a chance to run if they need to. */ 470 /* Make sure other vcpus get a chance to run if they need to. */
501 for_each_cpu(cpu, mask) { 471 for_each_cpu(cpu, mask) {
@@ -508,86 +478,10 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
508 478
509static void xen_smp_send_call_function_single_ipi(int cpu) 479static void xen_smp_send_call_function_single_ipi(int cpu)
510{ 480{
511 __xen_send_IPI_mask(cpumask_of(cpu), 481 xen_send_IPI_mask(cpumask_of(cpu),
512 XEN_CALL_FUNCTION_SINGLE_VECTOR); 482 XEN_CALL_FUNCTION_SINGLE_VECTOR);
513} 483}
514 484
515static inline int xen_map_vector(int vector)
516{
517 int xen_vector;
518
519 switch (vector) {
520 case RESCHEDULE_VECTOR:
521 xen_vector = XEN_RESCHEDULE_VECTOR;
522 break;
523 case CALL_FUNCTION_VECTOR:
524 xen_vector = XEN_CALL_FUNCTION_VECTOR;
525 break;
526 case CALL_FUNCTION_SINGLE_VECTOR:
527 xen_vector = XEN_CALL_FUNCTION_SINGLE_VECTOR;
528 break;
529 case IRQ_WORK_VECTOR:
530 xen_vector = XEN_IRQ_WORK_VECTOR;
531 break;
532 default:
533 xen_vector = -1;
534 printk(KERN_ERR "xen: vector 0x%x is not implemented\n",
535 vector);
536 }
537
538 return xen_vector;
539}
540
541void xen_send_IPI_mask(const struct cpumask *mask,
542 int vector)
543{
544 int xen_vector = xen_map_vector(vector);
545
546 if (xen_vector >= 0)
547 __xen_send_IPI_mask(mask, xen_vector);
548}
549
550void xen_send_IPI_all(int vector)
551{
552 int xen_vector = xen_map_vector(vector);
553
554 if (xen_vector >= 0)
555 __xen_send_IPI_mask(cpu_online_mask, xen_vector);
556}
557
558void xen_send_IPI_self(int vector)
559{
560 int xen_vector = xen_map_vector(vector);
561
562 if (xen_vector >= 0)
563 xen_send_IPI_one(smp_processor_id(), xen_vector);
564}
565
566void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
567 int vector)
568{
569 unsigned cpu;
570 unsigned int this_cpu = smp_processor_id();
571
572 if (!(num_online_cpus() > 1))
573 return;
574
575 for_each_cpu_and(cpu, mask, cpu_online_mask) {
576 if (this_cpu == cpu)
577 continue;
578
579 xen_smp_send_call_function_single_ipi(cpu);
580 }
581}
582
583void xen_send_IPI_allbutself(int vector)
584{
585 int xen_vector = xen_map_vector(vector);
586
587 if (xen_vector >= 0)
588 xen_send_IPI_mask_allbutself(cpu_online_mask, xen_vector);
589}
590
591static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) 485static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
592{ 486{
593 irq_enter(); 487 irq_enter();
@@ -608,16 +502,6 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
608 return IRQ_HANDLED; 502 return IRQ_HANDLED;
609} 503}
610 504
611static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
612{
613 irq_enter();
614 irq_work_run();
615 inc_irq_stat(apic_irq_work_irqs);
616 irq_exit();
617
618 return IRQ_HANDLED;
619}
620
621static const struct smp_ops xen_smp_ops __initconst = { 505static const struct smp_ops xen_smp_ops __initconst = {
622 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, 506 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
623 .smp_prepare_cpus = xen_smp_prepare_cpus, 507 .smp_prepare_cpus = xen_smp_prepare_cpus,
@@ -650,10 +534,10 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
650 xen_init_lock_cpu(0); 534 xen_init_lock_cpu(0);
651} 535}
652 536
653static int __cpuinit xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) 537static int __cpuinit xen_hvm_cpu_up(unsigned int cpu)
654{ 538{
655 int rc; 539 int rc;
656 rc = native_cpu_up(cpu, tidle); 540 rc = native_cpu_up(cpu);
657 WARN_ON (xen_smp_intr_init(cpu)); 541 WARN_ON (xen_smp_intr_init(cpu));
658 return rc; 542 return rc;
659} 543}
@@ -664,7 +548,6 @@ static void xen_hvm_cpu_die(unsigned int cpu)
664 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); 548 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
665 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); 549 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
666 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); 550 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
667 unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL);
668 native_cpu_die(cpu); 551 native_cpu_die(cpu);
669} 552}
670 553
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
deleted file mode 100644
index 8981a76d081..00000000000
--- a/arch/x86/xen/smp.h
+++ /dev/null
@@ -1,12 +0,0 @@
1#ifndef _XEN_SMP_H
2
3extern void xen_send_IPI_mask(const struct cpumask *mask,
4 int vector);
5extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
6 int vector);
7extern void xen_send_IPI_allbutself(int vector);
8extern void physflat_send_IPI_allbutself(int vector);
9extern void xen_send_IPI_all(int vector);
10extern void xen_send_IPI_self(int vector);
11
12#endif
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 83e866d714c..cc9b1e182fc 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -116,26 +116,9 @@ static inline void spin_time_accum_blocked(u64 start)
116} 116}
117#endif /* CONFIG_XEN_DEBUG_FS */ 117#endif /* CONFIG_XEN_DEBUG_FS */
118 118
119/*
120 * Size struct xen_spinlock so it's the same as arch_spinlock_t.
121 */
122#if NR_CPUS < 256
123typedef u8 xen_spinners_t;
124# define inc_spinners(xl) \
125 asm(LOCK_PREFIX " incb %0" : "+m" ((xl)->spinners) : : "memory");
126# define dec_spinners(xl) \
127 asm(LOCK_PREFIX " decb %0" : "+m" ((xl)->spinners) : : "memory");
128#else
129typedef u16 xen_spinners_t;
130# define inc_spinners(xl) \
131 asm(LOCK_PREFIX " incw %0" : "+m" ((xl)->spinners) : : "memory");
132# define dec_spinners(xl) \
133 asm(LOCK_PREFIX " decw %0" : "+m" ((xl)->spinners) : : "memory");
134#endif
135
136struct xen_spinlock { 119struct xen_spinlock {
137 unsigned char lock; /* 0 -> free; 1 -> locked */ 120 unsigned char lock; /* 0 -> free; 1 -> locked */
138 xen_spinners_t spinners; /* count of waiting cpus */ 121 unsigned short spinners; /* count of waiting cpus */
139}; 122};
140 123
141static int xen_spin_is_locked(struct arch_spinlock *lock) 124static int xen_spin_is_locked(struct arch_spinlock *lock)
@@ -181,7 +164,8 @@ static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
181 164
182 wmb(); /* set lock of interest before count */ 165 wmb(); /* set lock of interest before count */
183 166
184 inc_spinners(xl); 167 asm(LOCK_PREFIX " incw %0"
168 : "+m" (xl->spinners) : : "memory");
185 169
186 return prev; 170 return prev;
187} 171}
@@ -192,7 +176,8 @@ static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
192 */ 176 */
193static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev) 177static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
194{ 178{
195 dec_spinners(xl); 179 asm(LOCK_PREFIX " decw %0"
180 : "+m" (xl->spinners) : : "memory");
196 wmb(); /* decrement count before restoring lock */ 181 wmb(); /* decrement count before restoring lock */
197 __this_cpu_write(lock_spinners, prev); 182 __this_cpu_write(lock_spinners, prev);
198} 183}
@@ -388,8 +373,6 @@ void xen_uninit_lock_cpu(int cpu)
388 373
389void __init xen_init_spinlocks(void) 374void __init xen_init_spinlocks(void)
390{ 375{
391 BUILD_BUG_ON(sizeof(struct xen_spinlock) > sizeof(arch_spinlock_t));
392
393 pv_lock_ops.spin_is_locked = xen_spin_is_locked; 376 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
394 pv_lock_ops.spin_is_contended = xen_spin_is_contended; 377 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
395 pv_lock_ops.spin_lock = xen_spin_lock; 378 pv_lock_ops.spin_lock = xen_spin_lock;
@@ -440,12 +423,12 @@ static int __init xen_spinlock_debugfs(void)
440 debugfs_create_u64("time_total", 0444, d_spin_debug, 423 debugfs_create_u64("time_total", 0444, d_spin_debug,
441 &spinlock_stats.time_total); 424 &spinlock_stats.time_total);
442 425
443 debugfs_create_u32_array("histo_total", 0444, d_spin_debug, 426 xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
444 spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1); 427 spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
445 debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, 428 xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
446 spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1); 429 spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
447 debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, 430 xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
448 spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); 431 spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
449 432
450 return 0; 433 return 0;
451} 434}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index ae8a00c39de..45329c8c226 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled)
30{ 30{
31#ifdef CONFIG_XEN_PVHVM 31#ifdef CONFIG_XEN_PVHVM
32 int cpu; 32 int cpu;
33 xen_hvm_resume_shared_info(); 33 xen_hvm_init_shared_info();
34 xen_callback_vector(); 34 xen_callback_vector();
35 xen_unplug_emulated_devices(); 35 xen_unplug_emulated_devices();
36 if (xen_feature(XENFEAT_hvm_safe_pvclock)) { 36 if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 0296a952250..163b4679556 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -201,22 +201,8 @@ static unsigned long xen_get_wallclock(void)
201 201
202static int xen_set_wallclock(unsigned long now) 202static int xen_set_wallclock(unsigned long now)
203{ 203{
204 struct xen_platform_op op;
205 int rc;
206
207 /* do nothing for domU */ 204 /* do nothing for domU */
208 if (!xen_initial_domain()) 205 return -1;
209 return -1;
210
211 op.cmd = XENPF_settime;
212 op.u.settime.secs = now;
213 op.u.settime.nsecs = 0;
214 op.u.settime.system_time = xen_clocksource_read();
215
216 rc = HYPERVISOR_dom0_op(&op);
217 WARN(rc != 0, "XENPF_settime failed: now=%ld\n", now);
218
219 return rc;
220} 206}
221 207
222static struct clocksource xen_clocksource __read_mostly = { 208static struct clocksource xen_clocksource __read_mostly = {
diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
index 6722e3733f0..1cd7f4d11e2 100644
--- a/arch/x86/xen/vga.c
+++ b/arch/x86/xen/vga.c
@@ -35,7 +35,6 @@ void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
35 info->u.text_mode_3.font_height; 35 info->u.text_mode_3.font_height;
36 break; 36 break;
37 37
38 case XEN_VGATYPE_EFI_LFB:
39 case XEN_VGATYPE_VESA_LFB: 38 case XEN_VGATYPE_VESA_LFB:
40 if (size < offsetof(struct dom0_vga_console_info, 39 if (size < offsetof(struct dom0_vga_console_info,
41 u.vesa_lfb.gbl_caps)) 40 u.vesa_lfb.gbl_caps))
@@ -55,12 +54,6 @@ void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
55 screen_info->blue_pos = info->u.vesa_lfb.blue_pos; 54 screen_info->blue_pos = info->u.vesa_lfb.blue_pos;
56 screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size; 55 screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size;
57 screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos; 56 screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos;
58
59 if (info->video_type == XEN_VGATYPE_EFI_LFB) {
60 screen_info->orig_video_isVGA = VIDEO_TYPE_EFI;
61 break;
62 }
63
64 if (size >= offsetof(struct dom0_vga_console_info, 57 if (size >= offsetof(struct dom0_vga_console_info,
65 u.vesa_lfb.gbl_caps) 58 u.vesa_lfb.gbl_caps)
66 + sizeof(info->u.vesa_lfb.gbl_caps)) 59 + sizeof(info->u.vesa_lfb.gbl_caps))
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 3e45aa00071..79d7362ad6d 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -96,7 +96,7 @@ ENTRY(xen_restore_fl_direct)
96 96
97 /* check for unmasked and pending */ 97 /* check for unmasked and pending */
98 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending 98 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
99 jnz 1f 99 jz 1f
1002: call check_events 1002: call check_events
1011: 1011:
102ENDPATCH(xen_restore_fl_direct) 102ENDPATCH(xen_restore_fl_direct)
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index f9643fc50de..b040b0e518c 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -14,7 +14,6 @@
14#include <asm/thread_info.h> 14#include <asm/thread_info.h>
15#include <asm/processor-flags.h> 15#include <asm/processor-flags.h>
16#include <asm/segment.h> 16#include <asm/segment.h>
17#include <asm/asm.h>
18 17
19#include <xen/interface/xen.h> 18#include <xen/interface/xen.h>
20 19
@@ -138,7 +137,10 @@ iret_restore_end:
138 137
1391: iret 1381: iret
140xen_iret_end_crit: 139xen_iret_end_crit:
141 _ASM_EXTABLE(1b, iret_exc) 140.section __ex_table, "a"
141 .align 4
142 .long 1b, iret_exc
143.previous
142 144
143hyper_iret: 145hyper_iret:
144 /* put this out of line since its very rarely used */ 146 /* put this out of line since its very rarely used */
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 7faed5869e5..aaa7291c925 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -28,61 +28,9 @@ ENTRY(startup_xen)
28 __FINIT 28 __FINIT
29 29
30.pushsection .text 30.pushsection .text
31 .balign PAGE_SIZE 31 .align PAGE_SIZE
32ENTRY(hypercall_page) 32ENTRY(hypercall_page)
33#define NEXT_HYPERCALL(x) \ 33 .skip PAGE_SIZE
34 ENTRY(xen_hypercall_##x) \
35 .skip 32
36
37NEXT_HYPERCALL(set_trap_table)
38NEXT_HYPERCALL(mmu_update)
39NEXT_HYPERCALL(set_gdt)
40NEXT_HYPERCALL(stack_switch)
41NEXT_HYPERCALL(set_callbacks)
42NEXT_HYPERCALL(fpu_taskswitch)
43NEXT_HYPERCALL(sched_op_compat)
44NEXT_HYPERCALL(platform_op)
45NEXT_HYPERCALL(set_debugreg)
46NEXT_HYPERCALL(get_debugreg)
47NEXT_HYPERCALL(update_descriptor)
48NEXT_HYPERCALL(ni)
49NEXT_HYPERCALL(memory_op)
50NEXT_HYPERCALL(multicall)
51NEXT_HYPERCALL(update_va_mapping)
52NEXT_HYPERCALL(set_timer_op)
53NEXT_HYPERCALL(event_channel_op_compat)
54NEXT_HYPERCALL(xen_version)
55NEXT_HYPERCALL(console_io)
56NEXT_HYPERCALL(physdev_op_compat)
57NEXT_HYPERCALL(grant_table_op)
58NEXT_HYPERCALL(vm_assist)
59NEXT_HYPERCALL(update_va_mapping_otherdomain)
60NEXT_HYPERCALL(iret)
61NEXT_HYPERCALL(vcpu_op)
62NEXT_HYPERCALL(set_segment_base)
63NEXT_HYPERCALL(mmuext_op)
64NEXT_HYPERCALL(xsm_op)
65NEXT_HYPERCALL(nmi_op)
66NEXT_HYPERCALL(sched_op)
67NEXT_HYPERCALL(callback_op)
68NEXT_HYPERCALL(xenoprof_op)
69NEXT_HYPERCALL(event_channel_op)
70NEXT_HYPERCALL(physdev_op)
71NEXT_HYPERCALL(hvm_op)
72NEXT_HYPERCALL(sysctl)
73NEXT_HYPERCALL(domctl)
74NEXT_HYPERCALL(kexec_op)
75NEXT_HYPERCALL(tmem_op) /* 38 */
76ENTRY(xen_hypercall_rsvr)
77 .skip 320
78NEXT_HYPERCALL(mca) /* 48 */
79NEXT_HYPERCALL(arch_1)
80NEXT_HYPERCALL(arch_2)
81NEXT_HYPERCALL(arch_3)
82NEXT_HYPERCALL(arch_4)
83NEXT_HYPERCALL(arch_5)
84NEXT_HYPERCALL(arch_6)
85 .balign PAGE_SIZE
86.popsection 34.popsection
87 35
88 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") 36 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index d2e73d19d36..b095739ccd4 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -27,7 +27,8 @@ void xen_setup_mfn_list_list(void);
27void xen_setup_shared_info(void); 27void xen_setup_shared_info(void);
28void xen_build_mfn_list_list(void); 28void xen_build_mfn_list_list(void);
29void xen_setup_machphys_mapping(void); 29void xen_setup_machphys_mapping(void);
30void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); 30pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
31void xen_ident_map_ISA(void);
31void xen_reserve_top(void); 32void xen_reserve_top(void);
32extern unsigned long xen_max_p2m_pfn; 33extern unsigned long xen_max_p2m_pfn;
33 34
@@ -35,16 +36,16 @@ void xen_set_pat(u64);
35 36
36char * __init xen_memory_setup(void); 37char * __init xen_memory_setup(void);
37void __init xen_arch_setup(void); 38void __init xen_arch_setup(void);
39void __init xen_init_IRQ(void);
38void xen_enable_sysenter(void); 40void xen_enable_sysenter(void);
39void xen_enable_syscall(void); 41void xen_enable_syscall(void);
40void xen_vcpu_restore(void); 42void xen_vcpu_restore(void);
41 43
42void xen_callback_vector(void); 44void xen_callback_vector(void);
43void xen_hvm_resume_shared_info(void); 45void xen_hvm_init_shared_info(void);
44void xen_unplug_emulated_devices(void); 46void xen_unplug_emulated_devices(void);
45 47
46void __init xen_build_dynamic_phys_to_machine(void); 48void __init xen_build_dynamic_phys_to_machine(void);
47unsigned long __init xen_revector_p2m_tree(void);
48 49
49void xen_init_irq_ops(void); 50void xen_init_irq_ops(void);
50void xen_setup_timer(int cpu); 51void xen_setup_timer(int cpu);
@@ -91,15 +92,11 @@ struct dom0_vga_console_info;
91 92
92#ifdef CONFIG_XEN_DOM0 93#ifdef CONFIG_XEN_DOM0
93void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); 94void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
94void __init xen_init_apic(void);
95#else 95#else
96static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, 96static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
97 size_t size) 97 size_t size)
98{ 98{
99} 99}
100static inline void __init xen_init_apic(void)
101{
102}
103#endif 100#endif
104 101
105/* Declare an asm function, along with symbols needed to make it 102/* Declare an asm function, along with symbols needed to make it