aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/xen
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/xen')
-rw-r--r--arch/x86/xen/Makefile2
-rw-r--r--arch/x86/xen/apic.c33
-rw-r--r--arch/x86/xen/debugfs.c104
-rw-r--r--arch/x86/xen/debugfs.h4
-rw-r--r--arch/x86/xen/enlighten.c294
-rw-r--r--arch/x86/xen/mmu.c85
-rw-r--r--arch/x86/xen/p2m.c140
-rw-r--r--arch/x86/xen/setup.c165
-rw-r--r--arch/x86/xen/smp.c148
-rw-r--r--arch/x86/xen/smp.h12
-rw-r--r--arch/x86/xen/spinlock.c12
-rw-r--r--arch/x86/xen/suspend.c2
-rw-r--r--arch/x86/xen/xen-asm.S2
-rw-r--r--arch/x86/xen/xen-asm_32.S6
-rw-r--r--arch/x86/xen/xen-ops.h7
15 files changed, 708 insertions, 308 deletions
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index add2c2d729ce..96ab2c09cb68 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -20,5 +20,5 @@ obj-$(CONFIG_EVENT_TRACING) += trace.o
20obj-$(CONFIG_SMP) += smp.o 20obj-$(CONFIG_SMP) += smp.o
21obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o 21obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
22obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o 22obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
23obj-$(CONFIG_XEN_DOM0) += vga.o 23obj-$(CONFIG_XEN_DOM0) += apic.o vga.o
24obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o 24obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
new file mode 100644
index 000000000000..ec57bd3818a4
--- /dev/null
+++ b/arch/x86/xen/apic.c
@@ -0,0 +1,33 @@
1#include <linux/init.h>
2
3#include <asm/x86_init.h>
4#include <asm/apic.h>
5#include <asm/xen/hypercall.h>
6
7#include <xen/xen.h>
8#include <xen/interface/physdev.h>
9
10unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
11{
12 struct physdev_apic apic_op;
13 int ret;
14
15 apic_op.apic_physbase = mpc_ioapic_addr(apic);
16 apic_op.reg = reg;
17 ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
18 if (!ret)
19 return apic_op.value;
20
21 /* fallback to return an emulated IO_APIC values */
22 if (reg == 0x1)
23 return 0x00170020;
24 else if (reg == 0x0)
25 return apic << 24;
26
27 return 0xfd;
28}
29
30void __init xen_init_apic(void)
31{
32 x86_io_apic_ops.read = xen_io_apic_read;
33}
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index ef1db1900d86..c8377fb26cdf 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -19,107 +19,3 @@ struct dentry * __init xen_init_debugfs(void)
19 return d_xen_debug; 19 return d_xen_debug;
20} 20}
21 21
22struct array_data
23{
24 void *array;
25 unsigned elements;
26};
27
28static int u32_array_open(struct inode *inode, struct file *file)
29{
30 file->private_data = NULL;
31 return nonseekable_open(inode, file);
32}
33
34static size_t format_array(char *buf, size_t bufsize, const char *fmt,
35 u32 *array, unsigned array_size)
36{
37 size_t ret = 0;
38 unsigned i;
39
40 for(i = 0; i < array_size; i++) {
41 size_t len;
42
43 len = snprintf(buf, bufsize, fmt, array[i]);
44 len++; /* ' ' or '\n' */
45 ret += len;
46
47 if (buf) {
48 buf += len;
49 bufsize -= len;
50 buf[-1] = (i == array_size-1) ? '\n' : ' ';
51 }
52 }
53
54 ret++; /* \0 */
55 if (buf)
56 *buf = '\0';
57
58 return ret;
59}
60
61static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size)
62{
63 size_t len = format_array(NULL, 0, fmt, array, array_size);
64 char *ret;
65
66 ret = kmalloc(len, GFP_KERNEL);
67 if (ret == NULL)
68 return NULL;
69
70 format_array(ret, len, fmt, array, array_size);
71 return ret;
72}
73
74static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
75 loff_t *ppos)
76{
77 struct inode *inode = file->f_path.dentry->d_inode;
78 struct array_data *data = inode->i_private;
79 size_t size;
80
81 if (*ppos == 0) {
82 if (file->private_data) {
83 kfree(file->private_data);
84 file->private_data = NULL;
85 }
86
87 file->private_data = format_array_alloc("%u", data->array, data->elements);
88 }
89
90 size = 0;
91 if (file->private_data)
92 size = strlen(file->private_data);
93
94 return simple_read_from_buffer(buf, len, ppos, file->private_data, size);
95}
96
97static int xen_array_release(struct inode *inode, struct file *file)
98{
99 kfree(file->private_data);
100
101 return 0;
102}
103
104static const struct file_operations u32_array_fops = {
105 .owner = THIS_MODULE,
106 .open = u32_array_open,
107 .release= xen_array_release,
108 .read = u32_array_read,
109 .llseek = no_llseek,
110};
111
112struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode,
113 struct dentry *parent,
114 u32 *array, unsigned elements)
115{
116 struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
117
118 if (data == NULL)
119 return NULL;
120
121 data->array = array;
122 data->elements = elements;
123
124 return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
125}
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
index 78d25499be5b..12ebf3325c7b 100644
--- a/arch/x86/xen/debugfs.h
+++ b/arch/x86/xen/debugfs.h
@@ -3,8 +3,4 @@
3 3
4struct dentry * __init xen_init_debugfs(void); 4struct dentry * __init xen_init_debugfs(void);
5 5
6struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode,
7 struct dentry *parent,
8 u32 *array, unsigned elements);
9
10#endif /* _XEN_DEBUGFS_H */ 6#endif /* _XEN_DEBUGFS_H */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 4f51bebac02c..bf4bda6d3e9a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -31,6 +31,7 @@
31#include <linux/pci.h> 31#include <linux/pci.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/memblock.h> 33#include <linux/memblock.h>
34#include <linux/syscore_ops.h>
34 35
35#include <xen/xen.h> 36#include <xen/xen.h>
36#include <xen/interface/xen.h> 37#include <xen/interface/xen.h>
@@ -38,10 +39,12 @@
38#include <xen/interface/physdev.h> 39#include <xen/interface/physdev.h>
39#include <xen/interface/vcpu.h> 40#include <xen/interface/vcpu.h>
40#include <xen/interface/memory.h> 41#include <xen/interface/memory.h>
42#include <xen/interface/xen-mca.h>
41#include <xen/features.h> 43#include <xen/features.h>
42#include <xen/page.h> 44#include <xen/page.h>
43#include <xen/hvm.h> 45#include <xen/hvm.h>
44#include <xen/hvc-console.h> 46#include <xen/hvc-console.h>
47#include <xen/acpi.h>
45 48
46#include <asm/paravirt.h> 49#include <asm/paravirt.h>
47#include <asm/apic.h> 50#include <asm/apic.h>
@@ -63,6 +66,7 @@
63#include <asm/stackprotector.h> 66#include <asm/stackprotector.h>
64#include <asm/hypervisor.h> 67#include <asm/hypervisor.h>
65#include <asm/mwait.h> 68#include <asm/mwait.h>
69#include <asm/pci_x86.h>
66 70
67#ifdef CONFIG_ACPI 71#ifdef CONFIG_ACPI
68#include <linux/acpi.h> 72#include <linux/acpi.h>
@@ -74,6 +78,7 @@
74 78
75#include "xen-ops.h" 79#include "xen-ops.h"
76#include "mmu.h" 80#include "mmu.h"
81#include "smp.h"
77#include "multicalls.h" 82#include "multicalls.h"
78 83
79EXPORT_SYMBOL_GPL(hypercall_page); 84EXPORT_SYMBOL_GPL(hypercall_page);
@@ -104,7 +109,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback);
104 * Point at some empty memory to start with. We map the real shared_info 109 * Point at some empty memory to start with. We map the real shared_info
105 * page as soon as fixmap is up and running. 110 * page as soon as fixmap is up and running.
106 */ 111 */
107struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; 112struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
108 113
109/* 114/*
110 * Flag to determine whether vcpu info placement is available on all 115 * Flag to determine whether vcpu info placement is available on all
@@ -121,6 +126,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
121 */ 126 */
122static int have_vcpu_info_placement = 1; 127static int have_vcpu_info_placement = 1;
123 128
129struct tls_descs {
130 struct desc_struct desc[3];
131};
132
133/*
134 * Updating the 3 TLS descriptors in the GDT on every task switch is
135 * surprisingly expensive so we avoid updating them if they haven't
136 * changed. Since Xen writes different descriptors than the one
137 * passed in the update_descriptor hypercall we keep shadow copies to
138 * compare against.
139 */
140static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
141
124static void clamp_max_cpus(void) 142static void clamp_max_cpus(void)
125{ 143{
126#ifdef CONFIG_SMP 144#ifdef CONFIG_SMP
@@ -206,6 +224,9 @@ static void __init xen_banner(void)
206 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); 224 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
207} 225}
208 226
227#define CPUID_THERM_POWER_LEAF 6
228#define APERFMPERF_PRESENT 0
229
209static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0; 230static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
210static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; 231static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
211 232
@@ -239,6 +260,11 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
239 *dx = cpuid_leaf5_edx_val; 260 *dx = cpuid_leaf5_edx_val;
240 return; 261 return;
241 262
263 case CPUID_THERM_POWER_LEAF:
264 /* Disabling APERFMPERF for kernel usage */
265 maskecx = ~(1 << APERFMPERF_PRESENT);
266 break;
267
242 case 0xb: 268 case 0xb:
243 /* Suppress extended topology stuff */ 269 /* Suppress extended topology stuff */
244 maskebx = 0; 270 maskebx = 0;
@@ -261,7 +287,8 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
261 287
262static bool __init xen_check_mwait(void) 288static bool __init xen_check_mwait(void)
263{ 289{
264#ifdef CONFIG_ACPI 290#if defined(CONFIG_ACPI) && !defined(CONFIG_ACPI_PROCESSOR_AGGREGATOR) && \
291 !defined(CONFIG_ACPI_PROCESSOR_AGGREGATOR_MODULE)
265 struct xen_platform_op op = { 292 struct xen_platform_op op = {
266 .cmd = XENPF_set_processor_pminfo, 293 .cmd = XENPF_set_processor_pminfo,
267 .u.set_pminfo.id = -1, 294 .u.set_pminfo.id = -1,
@@ -329,9 +356,7 @@ static void __init xen_init_cpuid_mask(void)
329 unsigned int xsave_mask; 356 unsigned int xsave_mask;
330 357
331 cpuid_leaf1_edx_mask = 358 cpuid_leaf1_edx_mask =
332 ~((1 << X86_FEATURE_MCE) | /* disable MCE */ 359 ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */
333 (1 << X86_FEATURE_MCA) | /* disable MCA */
334 (1 << X86_FEATURE_MTRR) | /* disable MTRR */
335 (1 << X86_FEATURE_ACC)); /* thermal monitoring */ 360 (1 << X86_FEATURE_ACC)); /* thermal monitoring */
336 361
337 if (!xen_initial_domain()) 362 if (!xen_initial_domain())
@@ -349,7 +374,6 @@ static void __init xen_init_cpuid_mask(void)
349 /* Xen will set CR4.OSXSAVE if supported and not disabled by force */ 374 /* Xen will set CR4.OSXSAVE if supported and not disabled by force */
350 if ((cx & xsave_mask) != xsave_mask) 375 if ((cx & xsave_mask) != xsave_mask)
351 cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */ 376 cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
352
353 if (xen_check_mwait()) 377 if (xen_check_mwait())
354 cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32)); 378 cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32));
355} 379}
@@ -529,12 +553,28 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
529 BUG(); 553 BUG();
530} 554}
531 555
556static inline bool desc_equal(const struct desc_struct *d1,
557 const struct desc_struct *d2)
558{
559 return d1->a == d2->a && d1->b == d2->b;
560}
561
532static void load_TLS_descriptor(struct thread_struct *t, 562static void load_TLS_descriptor(struct thread_struct *t,
533 unsigned int cpu, unsigned int i) 563 unsigned int cpu, unsigned int i)
534{ 564{
535 struct desc_struct *gdt = get_cpu_gdt_table(cpu); 565 struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
536 xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); 566 struct desc_struct *gdt;
537 struct multicall_space mc = __xen_mc_entry(0); 567 xmaddr_t maddr;
568 struct multicall_space mc;
569
570 if (desc_equal(shadow, &t->tls_array[i]))
571 return;
572
573 *shadow = t->tls_array[i];
574
575 gdt = get_cpu_gdt_table(cpu);
576 maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
577 mc = __xen_mc_entry(0);
538 578
539 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); 579 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
540} 580}
@@ -616,8 +656,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
616 /* 656 /*
617 * Look for known traps using IST, and substitute them 657 * Look for known traps using IST, and substitute them
618 * appropriately. The debugger ones are the only ones we care 658 * appropriately. The debugger ones are the only ones we care
619 * about. Xen will handle faults like double_fault and 659 * about. Xen will handle faults like double_fault,
620 * machine_check, so we should never see them. Warn if 660 * so we should never see them. Warn if
621 * there's an unexpected IST-using fault handler. 661 * there's an unexpected IST-using fault handler.
622 */ 662 */
623 if (addr == (unsigned long)debug) 663 if (addr == (unsigned long)debug)
@@ -632,7 +672,11 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
632 return 0; 672 return 0;
633#ifdef CONFIG_X86_MCE 673#ifdef CONFIG_X86_MCE
634 } else if (addr == (unsigned long)machine_check) { 674 } else if (addr == (unsigned long)machine_check) {
635 return 0; 675 /*
676 * when xen hypervisor inject vMCE to guest,
677 * use native mce handler to handle it
678 */
679 ;
636#endif 680#endif
637 } else { 681 } else {
638 /* Some other trap using IST? */ 682 /* Some other trap using IST? */
@@ -809,9 +853,40 @@ static void xen_io_delay(void)
809} 853}
810 854
811#ifdef CONFIG_X86_LOCAL_APIC 855#ifdef CONFIG_X86_LOCAL_APIC
856static unsigned long xen_set_apic_id(unsigned int x)
857{
858 WARN_ON(1);
859 return x;
860}
861static unsigned int xen_get_apic_id(unsigned long x)
862{
863 return ((x)>>24) & 0xFFu;
864}
812static u32 xen_apic_read(u32 reg) 865static u32 xen_apic_read(u32 reg)
813{ 866{
814 return 0; 867 struct xen_platform_op op = {
868 .cmd = XENPF_get_cpuinfo,
869 .interface_version = XENPF_INTERFACE_VERSION,
870 .u.pcpu_info.xen_cpuid = 0,
871 };
872 int ret = 0;
873
874 /* Shouldn't need this as APIC is turned off for PV, and we only
875 * get called on the bootup processor. But just in case. */
876 if (!xen_initial_domain() || smp_processor_id())
877 return 0;
878
879 if (reg == APIC_LVR)
880 return 0x10;
881
882 if (reg != APIC_ID)
883 return 0;
884
885 ret = HYPERVISOR_dom0_op(&op);
886 if (ret)
887 return 0;
888
889 return op.u.pcpu_info.apic_id << 24;
815} 890}
816 891
817static void xen_apic_write(u32 reg, u32 val) 892static void xen_apic_write(u32 reg, u32 val)
@@ -849,6 +924,16 @@ static void set_xen_basic_apic_ops(void)
849 apic->icr_write = xen_apic_icr_write; 924 apic->icr_write = xen_apic_icr_write;
850 apic->wait_icr_idle = xen_apic_wait_icr_idle; 925 apic->wait_icr_idle = xen_apic_wait_icr_idle;
851 apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle; 926 apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
927 apic->set_apic_id = xen_set_apic_id;
928 apic->get_apic_id = xen_get_apic_id;
929
930#ifdef CONFIG_SMP
931 apic->send_IPI_allbutself = xen_send_IPI_allbutself;
932 apic->send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself;
933 apic->send_IPI_mask = xen_send_IPI_mask;
934 apic->send_IPI_all = xen_send_IPI_all;
935 apic->send_IPI_self = xen_send_IPI_self;
936#endif
852} 937}
853 938
854#endif 939#endif
@@ -1073,6 +1158,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
1073 1158
1074 .read_msr = native_read_msr_safe, 1159 .read_msr = native_read_msr_safe,
1075 .write_msr = xen_write_msr_safe, 1160 .write_msr = xen_write_msr_safe,
1161
1076 .read_tsc = native_read_tsc, 1162 .read_tsc = native_read_tsc,
1077 .read_pmc = native_read_pmc, 1163 .read_pmc = native_read_pmc,
1078 1164
@@ -1306,7 +1392,6 @@ asmlinkage void __init xen_start_kernel(void)
1306 1392
1307 xen_raw_console_write("mapping kernel into physical memory\n"); 1393 xen_raw_console_write("mapping kernel into physical memory\n");
1308 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); 1394 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
1309 xen_ident_map_ISA();
1310 1395
1311 /* Allocate and initialize top and mid mfn levels for p2m structure */ 1396 /* Allocate and initialize top and mid mfn levels for p2m structure */
1312 xen_build_mfn_list_list(); 1397 xen_build_mfn_list_list();
@@ -1362,11 +1447,17 @@ asmlinkage void __init xen_start_kernel(void)
1362 xen_start_info->console.domU.mfn = 0; 1447 xen_start_info->console.domU.mfn = 0;
1363 xen_start_info->console.domU.evtchn = 0; 1448 xen_start_info->console.domU.evtchn = 0;
1364 1449
1450 xen_init_apic();
1451
1365 /* Make sure ACS will be enabled */ 1452 /* Make sure ACS will be enabled */
1366 pci_request_acs(); 1453 pci_request_acs();
1367 }
1368
1369 1454
1455 xen_acpi_sleep_register();
1456 }
1457#ifdef CONFIG_PCI
1458 /* PCI BIOS service won't work from a PV guest. */
1459 pci_probe &= ~PCI_PROBE_BIOS;
1460#endif
1370 xen_raw_console_write("about to get started...\n"); 1461 xen_raw_console_write("about to get started...\n");
1371 1462
1372 xen_setup_runstate_info(0); 1463 xen_setup_runstate_info(0);
@@ -1379,64 +1470,155 @@ asmlinkage void __init xen_start_kernel(void)
1379#endif 1470#endif
1380} 1471}
1381 1472
1382static int init_hvm_pv_info(int *major, int *minor) 1473#ifdef CONFIG_XEN_PVHVM
1383{ 1474/*
1384 uint32_t eax, ebx, ecx, edx, pages, msr, base; 1475 * The pfn containing the shared_info is located somewhere in RAM. This
1385 u64 pfn; 1476 * will cause trouble if the current kernel is doing a kexec boot into a
1386 1477 * new kernel. The new kernel (and its startup code) can not know where
1387 base = xen_cpuid_base(); 1478 * the pfn is, so it can not reserve the page. The hypervisor will
1388 cpuid(base + 1, &eax, &ebx, &ecx, &edx); 1479 * continue to update the pfn, and as a result memory corruption occours
1389 1480 * in the new kernel.
1390 *major = eax >> 16; 1481 *
1391 *minor = eax & 0xffff; 1482 * One way to work around this issue is to allocate a page in the
1392 printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor); 1483 * xen-platform pci device's BAR memory range. But pci init is done very
1393 1484 * late and the shared_info page is already in use very early to read
1394 cpuid(base + 2, &pages, &msr, &ecx, &edx); 1485 * the pvclock. So moving the pfn from RAM to MMIO is racy because some
1395 1486 * code paths on other vcpus could access the pfn during the small
1396 pfn = __pa(hypercall_page); 1487 * window when the old pfn is moved to the new pfn. There is even a
1397 wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); 1488 * small window were the old pfn is not backed by a mfn, and during that
1398 1489 * time all reads return -1.
1399 xen_setup_features(); 1490 *
1400 1491 * Because it is not known upfront where the MMIO region is located it
1401 pv_info.name = "Xen HVM"; 1492 * can not be used right from the start in xen_hvm_init_shared_info.
1402 1493 *
1403 xen_domain_type = XEN_HVM_DOMAIN; 1494 * To minimise trouble the move of the pfn is done shortly before kexec.
1495 * This does not eliminate the race because all vcpus are still online
1496 * when the syscore_ops will be called. But hopefully there is no work
1497 * pending at this point in time. Also the syscore_op is run last which
1498 * reduces the risk further.
1499 */
1404 1500
1405 return 0; 1501static struct shared_info *xen_hvm_shared_info;
1406}
1407 1502
1408void __ref xen_hvm_init_shared_info(void) 1503static void xen_hvm_connect_shared_info(unsigned long pfn)
1409{ 1504{
1410 int cpu;
1411 struct xen_add_to_physmap xatp; 1505 struct xen_add_to_physmap xatp;
1412 static struct shared_info *shared_info_page = 0;
1413 1506
1414 if (!shared_info_page)
1415 shared_info_page = (struct shared_info *)
1416 extend_brk(PAGE_SIZE, PAGE_SIZE);
1417 xatp.domid = DOMID_SELF; 1507 xatp.domid = DOMID_SELF;
1418 xatp.idx = 0; 1508 xatp.idx = 0;
1419 xatp.space = XENMAPSPACE_shared_info; 1509 xatp.space = XENMAPSPACE_shared_info;
1420 xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; 1510 xatp.gpfn = pfn;
1421 if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) 1511 if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
1422 BUG(); 1512 BUG();
1423 1513
1424 HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; 1514}
1515static void xen_hvm_set_shared_info(struct shared_info *sip)
1516{
1517 int cpu;
1518
1519 HYPERVISOR_shared_info = sip;
1425 1520
1426 /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info 1521 /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
1427 * page, we use it in the event channel upcall and in some pvclock 1522 * page, we use it in the event channel upcall and in some pvclock
1428 * related functions. We don't need the vcpu_info placement 1523 * related functions. We don't need the vcpu_info placement
1429 * optimizations because we don't use any pv_mmu or pv_irq op on 1524 * optimizations because we don't use any pv_mmu or pv_irq op on
1430 * HVM. 1525 * HVM.
1431 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is 1526 * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is
1432 * online but xen_hvm_init_shared_info is run at resume time too and 1527 * online but xen_hvm_set_shared_info is run at resume time too and
1433 * in that case multiple vcpus might be online. */ 1528 * in that case multiple vcpus might be online. */
1434 for_each_online_cpu(cpu) { 1529 for_each_online_cpu(cpu) {
1435 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; 1530 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
1436 } 1531 }
1437} 1532}
1438 1533
1439#ifdef CONFIG_XEN_PVHVM 1534/* Reconnect the shared_info pfn to a mfn */
1535void xen_hvm_resume_shared_info(void)
1536{
1537 xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
1538}
1539
1540#ifdef CONFIG_KEXEC
1541static struct shared_info *xen_hvm_shared_info_kexec;
1542static unsigned long xen_hvm_shared_info_pfn_kexec;
1543
1544/* Remember a pfn in MMIO space for kexec reboot */
1545void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn)
1546{
1547 xen_hvm_shared_info_kexec = sip;
1548 xen_hvm_shared_info_pfn_kexec = pfn;
1549}
1550
1551static void xen_hvm_syscore_shutdown(void)
1552{
1553 struct xen_memory_reservation reservation = {
1554 .domid = DOMID_SELF,
1555 .nr_extents = 1,
1556 };
1557 unsigned long prev_pfn;
1558 int rc;
1559
1560 if (!xen_hvm_shared_info_kexec)
1561 return;
1562
1563 prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;
1564 set_xen_guest_handle(reservation.extent_start, &prev_pfn);
1565
1566 /* Move pfn to MMIO, disconnects previous pfn from mfn */
1567 xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec);
1568
1569 /* Update pointers, following hypercall is also a memory barrier */
1570 xen_hvm_set_shared_info(xen_hvm_shared_info_kexec);
1571
1572 /* Allocate new mfn for previous pfn */
1573 do {
1574 rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
1575 if (rc == 0)
1576 msleep(123);
1577 } while (rc == 0);
1578
1579 /* Make sure the previous pfn is really connected to a (new) mfn */
1580 BUG_ON(rc != 1);
1581}
1582
1583static struct syscore_ops xen_hvm_syscore_ops = {
1584 .shutdown = xen_hvm_syscore_shutdown,
1585};
1586#endif
1587
1588/* Use a pfn in RAM, may move to MMIO before kexec. */
1589static void __init xen_hvm_init_shared_info(void)
1590{
1591 /* Remember pointer for resume */
1592 xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
1593 xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
1594 xen_hvm_set_shared_info(xen_hvm_shared_info);
1595}
1596
1597static void __init init_hvm_pv_info(void)
1598{
1599 int major, minor;
1600 uint32_t eax, ebx, ecx, edx, pages, msr, base;
1601 u64 pfn;
1602
1603 base = xen_cpuid_base();
1604 cpuid(base + 1, &eax, &ebx, &ecx, &edx);
1605
1606 major = eax >> 16;
1607 minor = eax & 0xffff;
1608 printk(KERN_INFO "Xen version %d.%d.\n", major, minor);
1609
1610 cpuid(base + 2, &pages, &msr, &ecx, &edx);
1611
1612 pfn = __pa(hypercall_page);
1613 wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
1614
1615 xen_setup_features();
1616
1617 pv_info.name = "Xen HVM";
1618
1619 xen_domain_type = XEN_HVM_DOMAIN;
1620}
1621
1440static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, 1622static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
1441 unsigned long action, void *hcpu) 1623 unsigned long action, void *hcpu)
1442{ 1624{
@@ -1459,14 +1641,12 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
1459 1641
1460static void __init xen_hvm_guest_init(void) 1642static void __init xen_hvm_guest_init(void)
1461{ 1643{
1462 int r; 1644 init_hvm_pv_info();
1463 int major, minor;
1464
1465 r = init_hvm_pv_info(&major, &minor);
1466 if (r < 0)
1467 return;
1468 1645
1469 xen_hvm_init_shared_info(); 1646 xen_hvm_init_shared_info();
1647#ifdef CONFIG_KEXEC
1648 register_syscore_ops(&xen_hvm_syscore_ops);
1649#endif
1470 1650
1471 if (xen_feature(XENFEAT_hvm_callback_vector)) 1651 if (xen_feature(XENFEAT_hvm_callback_vector))
1472 xen_have_vector_callback = 1; 1652 xen_have_vector_callback = 1;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index b8e279479a6b..b65a76133f4f 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -308,8 +308,20 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
308 308
309static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) 309static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
310{ 310{
311 if (!xen_batched_set_pte(ptep, pteval)) 311 if (!xen_batched_set_pte(ptep, pteval)) {
312 native_set_pte(ptep, pteval); 312 /*
313 * Could call native_set_pte() here and trap and
314 * emulate the PTE write but with 32-bit guests this
315 * needs two traps (one for each of the two 32-bit
316 * words in the PTE) so do one hypercall directly
317 * instead.
318 */
319 struct mmu_update u;
320
321 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
322 u.val = pte_val_ma(pteval);
323 HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
324 }
313} 325}
314 326
315static void xen_set_pte(pte_t *ptep, pte_t pteval) 327static void xen_set_pte(pte_t *ptep, pte_t pteval)
@@ -353,8 +365,13 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)
353{ 365{
354 if (val & _PAGE_PRESENT) { 366 if (val & _PAGE_PRESENT) {
355 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; 367 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
368 unsigned long pfn = mfn_to_pfn(mfn);
369
356 pteval_t flags = val & PTE_FLAGS_MASK; 370 pteval_t flags = val & PTE_FLAGS_MASK;
357 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; 371 if (unlikely(pfn == ~0))
372 val = flags & ~_PAGE_PRESENT;
373 else
374 val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
358 } 375 }
359 376
360 return val; 377 return val;
@@ -1239,7 +1256,8 @@ static void xen_flush_tlb_single(unsigned long addr)
1239} 1256}
1240 1257
1241static void xen_flush_tlb_others(const struct cpumask *cpus, 1258static void xen_flush_tlb_others(const struct cpumask *cpus,
1242 struct mm_struct *mm, unsigned long va) 1259 struct mm_struct *mm, unsigned long start,
1260 unsigned long end)
1243{ 1261{
1244 struct { 1262 struct {
1245 struct mmuext_op op; 1263 struct mmuext_op op;
@@ -1251,7 +1269,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
1251 } *args; 1269 } *args;
1252 struct multicall_space mcs; 1270 struct multicall_space mcs;
1253 1271
1254 trace_xen_mmu_flush_tlb_others(cpus, mm, va); 1272 trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
1255 1273
1256 if (cpumask_empty(cpus)) 1274 if (cpumask_empty(cpus))
1257 return; /* nothing to do */ 1275 return; /* nothing to do */
@@ -1264,11 +1282,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
1264 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); 1282 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1265 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); 1283 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1266 1284
1267 if (va == TLB_FLUSH_ALL) { 1285 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1268 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; 1286 if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
1269 } else {
1270 args->op.cmd = MMUEXT_INVLPG_MULTI; 1287 args->op.cmd = MMUEXT_INVLPG_MULTI;
1271 args->op.arg1.linear_addr = va; 1288 args->op.arg1.linear_addr = start;
1272 } 1289 }
1273 1290
1274 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); 1291 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
@@ -1411,13 +1428,28 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1411} 1428}
1412#endif /* CONFIG_X86_64 */ 1429#endif /* CONFIG_X86_64 */
1413 1430
1414/* Init-time set_pte while constructing initial pagetables, which 1431/*
1415 doesn't allow RO pagetable pages to be remapped RW */ 1432 * Init-time set_pte while constructing initial pagetables, which
1433 * doesn't allow RO page table pages to be remapped RW.
1434 *
1435 * If there is no MFN for this PFN then this page is initially
1436 * ballooned out so clear the PTE (as in decrease_reservation() in
1437 * drivers/xen/balloon.c).
1438 *
1439 * Many of these PTE updates are done on unpinned and writable pages
1440 * and doing a hypercall for these is unnecessary and expensive. At
1441 * this point it is not possible to tell if a page is pinned or not,
1442 * so always write the PTE directly and rely on Xen trapping and
1443 * emulating any updates as necessary.
1444 */
1416static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) 1445static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
1417{ 1446{
1418 pte = mask_rw_pte(ptep, pte); 1447 if (pte_mfn(pte) != INVALID_P2M_ENTRY)
1448 pte = mask_rw_pte(ptep, pte);
1449 else
1450 pte = __pte_ma(0);
1419 1451
1420 xen_set_pte(ptep, pte); 1452 native_set_pte(ptep, pte);
1421} 1453}
1422 1454
1423static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) 1455static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
@@ -1859,7 +1891,6 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
1859#endif /* CONFIG_X86_64 */ 1891#endif /* CONFIG_X86_64 */
1860 1892
1861static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; 1893static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
1862static unsigned char fake_ioapic_mapping[PAGE_SIZE] __page_aligned_bss;
1863 1894
1864static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) 1895static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1865{ 1896{
@@ -1900,7 +1931,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1900 * We just don't map the IO APIC - all access is via 1931 * We just don't map the IO APIC - all access is via
1901 * hypercalls. Keep the address in the pte for reference. 1932 * hypercalls. Keep the address in the pte for reference.
1902 */ 1933 */
1903 pte = pfn_pte(PFN_DOWN(__pa(fake_ioapic_mapping)), PAGE_KERNEL); 1934 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1904 break; 1935 break;
1905#endif 1936#endif
1906 1937
@@ -1929,29 +1960,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1929#endif 1960#endif
1930} 1961}
1931 1962
1932void __init xen_ident_map_ISA(void)
1933{
1934 unsigned long pa;
1935
1936 /*
1937 * If we're dom0, then linear map the ISA machine addresses into
1938 * the kernel's address space.
1939 */
1940 if (!xen_initial_domain())
1941 return;
1942
1943 xen_raw_printk("Xen: setup ISA identity maps\n");
1944
1945 for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
1946 pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
1947
1948 if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0))
1949 BUG();
1950 }
1951
1952 xen_flush_tlb();
1953}
1954
1955static void __init xen_post_allocator_init(void) 1963static void __init xen_post_allocator_init(void)
1956{ 1964{
1957 pv_mmu_ops.set_pte = xen_set_pte; 1965 pv_mmu_ops.set_pte = xen_set_pte;
@@ -2065,7 +2073,6 @@ void __init xen_init_mmu_ops(void)
2065 pv_mmu_ops = xen_mmu_ops; 2073 pv_mmu_ops = xen_mmu_ops;
2066 2074
2067 memset(dummy_mapping, 0xff, PAGE_SIZE); 2075 memset(dummy_mapping, 0xff, PAGE_SIZE);
2068 memset(fake_ioapic_mapping, 0xfd, PAGE_SIZE);
2069} 2076}
2070 2077
2071/* Protected by xen_reservation_lock. */ 2078/* Protected by xen_reservation_lock. */
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 1b267e75158d..64effdc6da94 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -499,16 +499,18 @@ static bool alloc_p2m(unsigned long pfn)
499 return true; 499 return true;
500} 500}
501 501
502static bool __init __early_alloc_p2m(unsigned long pfn) 502static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary)
503{ 503{
504 unsigned topidx, mididx, idx; 504 unsigned topidx, mididx, idx;
505 unsigned long *p2m;
506 unsigned long *mid_mfn_p;
505 507
506 topidx = p2m_top_index(pfn); 508 topidx = p2m_top_index(pfn);
507 mididx = p2m_mid_index(pfn); 509 mididx = p2m_mid_index(pfn);
508 idx = p2m_index(pfn); 510 idx = p2m_index(pfn);
509 511
510 /* Pfff.. No boundary cross-over, lets get out. */ 512 /* Pfff.. No boundary cross-over, lets get out. */
511 if (!idx) 513 if (!idx && check_boundary)
512 return false; 514 return false;
513 515
514 WARN(p2m_top[topidx][mididx] == p2m_identity, 516 WARN(p2m_top[topidx][mididx] == p2m_identity,
@@ -522,24 +524,66 @@ static bool __init __early_alloc_p2m(unsigned long pfn)
522 return false; 524 return false;
523 525
524 /* Boundary cross-over for the edges: */ 526 /* Boundary cross-over for the edges: */
525 if (idx) { 527 p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
526 unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
527 unsigned long *mid_mfn_p;
528 528
529 p2m_init(p2m); 529 p2m_init(p2m);
530 530
531 p2m_top[topidx][mididx] = p2m; 531 p2m_top[topidx][mididx] = p2m;
532 532
533 /* For save/restore we need to MFN of the P2M saved */ 533 /* For save/restore we need to MFN of the P2M saved */
534 534
535 mid_mfn_p = p2m_top_mfn_p[topidx]; 535 mid_mfn_p = p2m_top_mfn_p[topidx];
536 WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), 536 WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing),
537 "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", 537 "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n",
538 topidx, mididx); 538 topidx, mididx);
539 mid_mfn_p[mididx] = virt_to_mfn(p2m); 539 mid_mfn_p[mididx] = virt_to_mfn(p2m);
540
541 return true;
542}
543
544static bool __init early_alloc_p2m(unsigned long pfn)
545{
546 unsigned topidx = p2m_top_index(pfn);
547 unsigned long *mid_mfn_p;
548 unsigned long **mid;
549
550 mid = p2m_top[topidx];
551 mid_mfn_p = p2m_top_mfn_p[topidx];
552 if (mid == p2m_mid_missing) {
553 mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
554
555 p2m_mid_init(mid);
540 556
557 p2m_top[topidx] = mid;
558
559 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
560 }
561 /* And the save/restore P2M tables.. */
562 if (mid_mfn_p == p2m_mid_missing_mfn) {
563 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
564 p2m_mid_mfn_init(mid_mfn_p);
565
566 p2m_top_mfn_p[topidx] = mid_mfn_p;
567 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
568 /* Note: we don't set mid_mfn_p[midix] here,
569 * look in early_alloc_p2m_middle */
541 } 570 }
542 return idx != 0; 571 return true;
572}
573bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
574{
575 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
576 if (!early_alloc_p2m(pfn))
577 return false;
578
579 if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/))
580 return false;
581
582 if (!__set_phys_to_machine(pfn, mfn))
583 return false;
584 }
585
586 return true;
543} 587}
544unsigned long __init set_phys_range_identity(unsigned long pfn_s, 588unsigned long __init set_phys_range_identity(unsigned long pfn_s,
545 unsigned long pfn_e) 589 unsigned long pfn_e)
@@ -559,35 +603,11 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s,
559 pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE)); 603 pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE));
560 pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE) 604 pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
561 { 605 {
562 unsigned topidx = p2m_top_index(pfn); 606 WARN_ON(!early_alloc_p2m(pfn));
563 unsigned long *mid_mfn_p;
564 unsigned long **mid;
565
566 mid = p2m_top[topidx];
567 mid_mfn_p = p2m_top_mfn_p[topidx];
568 if (mid == p2m_mid_missing) {
569 mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
570
571 p2m_mid_init(mid);
572
573 p2m_top[topidx] = mid;
574
575 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
576 }
577 /* And the save/restore P2M tables.. */
578 if (mid_mfn_p == p2m_mid_missing_mfn) {
579 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
580 p2m_mid_mfn_init(mid_mfn_p);
581
582 p2m_top_mfn_p[topidx] = mid_mfn_p;
583 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
584 /* Note: we don't set mid_mfn_p[midix] here,
585 * look in __early_alloc_p2m */
586 }
587 } 607 }
588 608
589 __early_alloc_p2m(pfn_s); 609 early_alloc_p2m_middle(pfn_s, true);
590 __early_alloc_p2m(pfn_e); 610 early_alloc_p2m_middle(pfn_e, true);
591 611
592 for (pfn = pfn_s; pfn < pfn_e; pfn++) 612 for (pfn = pfn_s; pfn < pfn_e; pfn++)
593 if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) 613 if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
@@ -686,6 +706,7 @@ int m2p_add_override(unsigned long mfn, struct page *page,
686 unsigned long uninitialized_var(address); 706 unsigned long uninitialized_var(address);
687 unsigned level; 707 unsigned level;
688 pte_t *ptep = NULL; 708 pte_t *ptep = NULL;
709 int ret = 0;
689 710
690 pfn = page_to_pfn(page); 711 pfn = page_to_pfn(page);
691 if (!PageHighMem(page)) { 712 if (!PageHighMem(page)) {
@@ -721,6 +742,24 @@ int m2p_add_override(unsigned long mfn, struct page *page,
721 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); 742 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
722 spin_unlock_irqrestore(&m2p_override_lock, flags); 743 spin_unlock_irqrestore(&m2p_override_lock, flags);
723 744
745 /* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in
746 * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other
747 * pfn so that the following mfn_to_pfn(mfn) calls will return the
748 * pfn from the m2p_override (the backend pfn) instead.
749 * We need to do this because the pages shared by the frontend
750 * (xen-blkfront) can be already locked (lock_page, called by
751 * do_read_cache_page); when the userspace backend tries to use them
752 * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so
753 * do_blockdev_direct_IO is going to try to lock the same pages
754 * again resulting in a deadlock.
755 * As a side effect get_user_pages_fast might not be safe on the
756 * frontend pages while they are being shared with the backend,
757 * because mfn_to_pfn (that ends up being called by GUPF) will
758 * return the backend pfn rather than the frontend pfn. */
759 ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
760 if (ret == 0 && get_phys_to_machine(pfn) == mfn)
761 set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
762
724 return 0; 763 return 0;
725} 764}
726EXPORT_SYMBOL_GPL(m2p_add_override); 765EXPORT_SYMBOL_GPL(m2p_add_override);
@@ -732,6 +771,7 @@ int m2p_remove_override(struct page *page, bool clear_pte)
732 unsigned long uninitialized_var(address); 771 unsigned long uninitialized_var(address);
733 unsigned level; 772 unsigned level;
734 pte_t *ptep = NULL; 773 pte_t *ptep = NULL;
774 int ret = 0;
735 775
736 pfn = page_to_pfn(page); 776 pfn = page_to_pfn(page);
737 mfn = get_phys_to_machine(pfn); 777 mfn = get_phys_to_machine(pfn);
@@ -801,6 +841,22 @@ int m2p_remove_override(struct page *page, bool clear_pte)
801 } else 841 } else
802 set_phys_to_machine(pfn, page->index); 842 set_phys_to_machine(pfn, page->index);
803 843
844 /* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present
845 * somewhere in this domain, even before being added to the
846 * m2p_override (see comment above in m2p_add_override).
847 * If there are no other entries in the m2p_override corresponding
848 * to this mfn, then remove the FOREIGN_FRAME_BIT from the p2m for
849 * the original pfn (the one shared by the frontend): the backend
850 * cannot do any IO on this page anymore because it has been
851 * unshared. Removing the FOREIGN_FRAME_BIT from the p2m entry of
852 * the original pfn causes mfn_to_pfn(mfn) to return the frontend
853 * pfn again. */
854 mfn &= ~FOREIGN_FRAME_BIT;
855 ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
856 if (ret == 0 && get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) &&
857 m2p_find_override(mfn) == NULL)
858 set_phys_to_machine(pfn, mfn);
859
804 return 0; 860 return 0;
805} 861}
806EXPORT_SYMBOL_GPL(m2p_remove_override); 862EXPORT_SYMBOL_GPL(m2p_remove_override);
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 1ba8dff26753..ead85576d54a 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -26,7 +26,6 @@
26#include <xen/interface/memory.h> 26#include <xen/interface/memory.h>
27#include <xen/interface/physdev.h> 27#include <xen/interface/physdev.h>
28#include <xen/features.h> 28#include <xen/features.h>
29
30#include "xen-ops.h" 29#include "xen-ops.h"
31#include "vdso.h" 30#include "vdso.h"
32 31
@@ -84,8 +83,8 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
84 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); 83 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
85} 84}
86 85
87static unsigned long __init xen_release_chunk(unsigned long start, 86static unsigned long __init xen_do_chunk(unsigned long start,
88 unsigned long end) 87 unsigned long end, bool release)
89{ 88{
90 struct xen_memory_reservation reservation = { 89 struct xen_memory_reservation reservation = {
91 .address_bits = 0, 90 .address_bits = 0,
@@ -96,30 +95,133 @@ static unsigned long __init xen_release_chunk(unsigned long start,
96 unsigned long pfn; 95 unsigned long pfn;
97 int ret; 96 int ret;
98 97
99 for(pfn = start; pfn < end; pfn++) { 98 for (pfn = start; pfn < end; pfn++) {
99 unsigned long frame;
100 unsigned long mfn = pfn_to_mfn(pfn); 100 unsigned long mfn = pfn_to_mfn(pfn);
101 101
102 /* Make sure pfn exists to start with */ 102 if (release) {
103 if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) 103 /* Make sure pfn exists to start with */
104 continue; 104 if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
105 105 continue;
106 set_xen_guest_handle(reservation.extent_start, &mfn); 106 frame = mfn;
107 } else {
108 if (mfn != INVALID_P2M_ENTRY)
109 continue;
110 frame = pfn;
111 }
112 set_xen_guest_handle(reservation.extent_start, &frame);
107 reservation.nr_extents = 1; 113 reservation.nr_extents = 1;
108 114
109 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, 115 ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap,
110 &reservation); 116 &reservation);
111 WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret); 117 WARN(ret != 1, "Failed to %s pfn %lx err=%d\n",
118 release ? "release" : "populate", pfn, ret);
119
112 if (ret == 1) { 120 if (ret == 1) {
113 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); 121 if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) {
122 if (release)
123 break;
124 set_xen_guest_handle(reservation.extent_start, &frame);
125 reservation.nr_extents = 1;
126 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
127 &reservation);
128 break;
129 }
114 len++; 130 len++;
115 } 131 } else
132 break;
116 } 133 }
117 printk(KERN_INFO "Freeing %lx-%lx pfn range: %lu pages freed\n", 134 if (len)
118 start, end, len); 135 printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n",
136 release ? "Freeing" : "Populating",
137 start, end, len,
138 release ? "freed" : "added");
119 139
120 return len; 140 return len;
121} 141}
122 142
143static unsigned long __init xen_release_chunk(unsigned long start,
144 unsigned long end)
145{
146 return xen_do_chunk(start, end, true);
147}
148
149static unsigned long __init xen_populate_chunk(
150 const struct e820entry *list, size_t map_size,
151 unsigned long max_pfn, unsigned long *last_pfn,
152 unsigned long credits_left)
153{
154 const struct e820entry *entry;
155 unsigned int i;
156 unsigned long done = 0;
157 unsigned long dest_pfn;
158
159 for (i = 0, entry = list; i < map_size; i++, entry++) {
160 unsigned long s_pfn;
161 unsigned long e_pfn;
162 unsigned long pfns;
163 long capacity;
164
165 if (credits_left <= 0)
166 break;
167
168 if (entry->type != E820_RAM)
169 continue;
170
171 e_pfn = PFN_DOWN(entry->addr + entry->size);
172
173 /* We only care about E820 after the xen_start_info->nr_pages */
174 if (e_pfn <= max_pfn)
175 continue;
176
177 s_pfn = PFN_UP(entry->addr);
178 /* If the E820 falls within the nr_pages, we want to start
179 * at the nr_pages PFN.
180 * If that would mean going past the E820 entry, skip it
181 */
182 if (s_pfn <= max_pfn) {
183 capacity = e_pfn - max_pfn;
184 dest_pfn = max_pfn;
185 } else {
186 capacity = e_pfn - s_pfn;
187 dest_pfn = s_pfn;
188 }
189
190 if (credits_left < capacity)
191 capacity = credits_left;
192
193 pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
194 done += pfns;
195 *last_pfn = (dest_pfn + pfns);
196 if (pfns < capacity)
197 break;
198 credits_left -= pfns;
199 }
200 return done;
201}
202
203static void __init xen_set_identity_and_release_chunk(
204 unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
205 unsigned long *released, unsigned long *identity)
206{
207 unsigned long pfn;
208
209 /*
210 * If the PFNs are currently mapped, the VA mapping also needs
211 * to be updated to be 1:1.
212 */
213 for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
214 (void)HYPERVISOR_update_va_mapping(
215 (unsigned long)__va(pfn << PAGE_SHIFT),
216 mfn_pte(pfn, PAGE_KERNEL_IO), 0);
217
218 if (start_pfn < nr_pages)
219 *released += xen_release_chunk(
220 start_pfn, min(end_pfn, nr_pages));
221
222 *identity += set_phys_range_identity(start_pfn, end_pfn);
223}
224
123static unsigned long __init xen_set_identity_and_release( 225static unsigned long __init xen_set_identity_and_release(
124 const struct e820entry *list, size_t map_size, unsigned long nr_pages) 226 const struct e820entry *list, size_t map_size, unsigned long nr_pages)
125{ 227{
@@ -142,7 +244,6 @@ static unsigned long __init xen_set_identity_and_release(
142 */ 244 */
143 for (i = 0, entry = list; i < map_size; i++, entry++) { 245 for (i = 0, entry = list; i < map_size; i++, entry++) {
144 phys_addr_t end = entry->addr + entry->size; 246 phys_addr_t end = entry->addr + entry->size;
145
146 if (entry->type == E820_RAM || i == map_size - 1) { 247 if (entry->type == E820_RAM || i == map_size - 1) {
147 unsigned long start_pfn = PFN_DOWN(start); 248 unsigned long start_pfn = PFN_DOWN(start);
148 unsigned long end_pfn = PFN_UP(end); 249 unsigned long end_pfn = PFN_UP(end);
@@ -150,20 +251,19 @@ static unsigned long __init xen_set_identity_and_release(
150 if (entry->type == E820_RAM) 251 if (entry->type == E820_RAM)
151 end_pfn = PFN_UP(entry->addr); 252 end_pfn = PFN_UP(entry->addr);
152 253
153 if (start_pfn < end_pfn) { 254 if (start_pfn < end_pfn)
154 if (start_pfn < nr_pages) 255 xen_set_identity_and_release_chunk(
155 released += xen_release_chunk( 256 start_pfn, end_pfn, nr_pages,
156 start_pfn, min(end_pfn, nr_pages)); 257 &released, &identity);
157 258
158 identity += set_phys_range_identity(
159 start_pfn, end_pfn);
160 }
161 start = end; 259 start = end;
162 } 260 }
163 } 261 }
164 262
165 printk(KERN_INFO "Released %lu pages of unused memory\n", released); 263 if (released)
166 printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity); 264 printk(KERN_INFO "Released %lu pages of unused memory\n", released);
265 if (identity)
266 printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);
167 267
168 return released; 268 return released;
169} 269}
@@ -217,7 +317,9 @@ char * __init xen_memory_setup(void)
217 int rc; 317 int rc;
218 struct xen_memory_map memmap; 318 struct xen_memory_map memmap;
219 unsigned long max_pages; 319 unsigned long max_pages;
320 unsigned long last_pfn = 0;
220 unsigned long extra_pages = 0; 321 unsigned long extra_pages = 0;
322 unsigned long populated;
221 int i; 323 int i;
222 int op; 324 int op;
223 325
@@ -257,8 +359,20 @@ char * __init xen_memory_setup(void)
257 */ 359 */
258 xen_released_pages = xen_set_identity_and_release( 360 xen_released_pages = xen_set_identity_and_release(
259 map, memmap.nr_entries, max_pfn); 361 map, memmap.nr_entries, max_pfn);
362
363 /*
364 * Populate back the non-RAM pages and E820 gaps that had been
365 * released. */
366 populated = xen_populate_chunk(map, memmap.nr_entries,
367 max_pfn, &last_pfn, xen_released_pages);
368
369 xen_released_pages -= populated;
260 extra_pages += xen_released_pages; 370 extra_pages += xen_released_pages;
261 371
372 if (last_pfn > max_pfn) {
373 max_pfn = min(MAX_DOMAIN_PAGES, last_pfn);
374 mem_end = PFN_PHYS(max_pfn);
375 }
262 /* 376 /*
263 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO 377 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
264 * factor the base size. On non-highmem systems, the base 378 * factor the base size. On non-highmem systems, the base
@@ -272,7 +386,6 @@ char * __init xen_memory_setup(void)
272 */ 386 */
273 extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), 387 extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
274 extra_pages); 388 extra_pages);
275
276 i = 0; 389 i = 0;
277 while (i < memmap.nr_entries) { 390 while (i < memmap.nr_entries) {
278 u64 addr = map[i].addr; 391 u64 addr = map[i].addr;
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 5fac6919b957..f58dca7a6e52 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -16,6 +16,7 @@
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <linux/irq_work.h>
19 20
20#include <asm/paravirt.h> 21#include <asm/paravirt.h>
21#include <asm/desc.h> 22#include <asm/desc.h>
@@ -41,10 +42,12 @@ cpumask_var_t xen_cpu_initialized_map;
41static DEFINE_PER_CPU(int, xen_resched_irq); 42static DEFINE_PER_CPU(int, xen_resched_irq);
42static DEFINE_PER_CPU(int, xen_callfunc_irq); 43static DEFINE_PER_CPU(int, xen_callfunc_irq);
43static DEFINE_PER_CPU(int, xen_callfuncsingle_irq); 44static DEFINE_PER_CPU(int, xen_callfuncsingle_irq);
45static DEFINE_PER_CPU(int, xen_irq_work);
44static DEFINE_PER_CPU(int, xen_debug_irq) = -1; 46static DEFINE_PER_CPU(int, xen_debug_irq) = -1;
45 47
46static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); 48static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
47static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); 49static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
50static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id);
48 51
49/* 52/*
50 * Reschedule call back. 53 * Reschedule call back.
@@ -77,9 +80,7 @@ static void __cpuinit cpu_bringup(void)
77 80
78 notify_cpu_starting(cpu); 81 notify_cpu_starting(cpu);
79 82
80 ipi_call_lock();
81 set_cpu_online(cpu, true); 83 set_cpu_online(cpu, true);
82 ipi_call_unlock();
83 84
84 this_cpu_write(cpu_state, CPU_ONLINE); 85 this_cpu_write(cpu_state, CPU_ONLINE);
85 86
@@ -143,6 +144,17 @@ static int xen_smp_intr_init(unsigned int cpu)
143 goto fail; 144 goto fail;
144 per_cpu(xen_callfuncsingle_irq, cpu) = rc; 145 per_cpu(xen_callfuncsingle_irq, cpu) = rc;
145 146
147 callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu);
148 rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR,
149 cpu,
150 xen_irq_work_interrupt,
151 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
152 callfunc_name,
153 NULL);
154 if (rc < 0)
155 goto fail;
156 per_cpu(xen_irq_work, cpu) = rc;
157
146 return 0; 158 return 0;
147 159
148 fail: 160 fail:
@@ -155,6 +167,8 @@ static int xen_smp_intr_init(unsigned int cpu)
155 if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0) 167 if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0)
156 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), 168 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu),
157 NULL); 169 NULL);
170 if (per_cpu(xen_irq_work, cpu) >= 0)
171 unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL);
158 172
159 return rc; 173 return rc;
160} 174}
@@ -178,6 +192,7 @@ static void __init xen_fill_possible_map(void)
178static void __init xen_filter_cpu_maps(void) 192static void __init xen_filter_cpu_maps(void)
179{ 193{
180 int i, rc; 194 int i, rc;
195 unsigned int subtract = 0;
181 196
182 if (!xen_initial_domain()) 197 if (!xen_initial_domain())
183 return; 198 return;
@@ -192,8 +207,22 @@ static void __init xen_filter_cpu_maps(void)
192 } else { 207 } else {
193 set_cpu_possible(i, false); 208 set_cpu_possible(i, false);
194 set_cpu_present(i, false); 209 set_cpu_present(i, false);
210 subtract++;
195 } 211 }
196 } 212 }
213#ifdef CONFIG_HOTPLUG_CPU
214 /* This is akin to using 'nr_cpus' on the Linux command line.
215 * Which is OK as when we use 'dom0_max_vcpus=X' we can only
216 * have up to X, while nr_cpu_ids is greater than X. This
217 * normally is not a problem, except when CPU hotplugging
218 * is involved and then there might be more than X CPUs
219 * in the guest - which will not work as there is no
220 * hypercall to expand the max number of VCPUs an already
221 * running guest has. So cap it up to X. */
222 if (subtract)
223 nr_cpu_ids = nr_cpu_ids - subtract;
224#endif
225
197} 226}
198 227
199static void __init xen_smp_prepare_boot_cpu(void) 228static void __init xen_smp_prepare_boot_cpu(void)
@@ -250,18 +279,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
250 set_cpu_possible(cpu, false); 279 set_cpu_possible(cpu, false);
251 } 280 }
252 281
253 for_each_possible_cpu (cpu) { 282 for_each_possible_cpu(cpu)
254 struct task_struct *idle;
255
256 if (cpu == 0)
257 continue;
258
259 idle = fork_idle(cpu);
260 if (IS_ERR(idle))
261 panic("failed fork for CPU %d", cpu);
262
263 set_cpu_present(cpu, true); 283 set_cpu_present(cpu, true);
264 }
265} 284}
266 285
267static int __cpuinit 286static int __cpuinit
@@ -331,9 +350,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
331 return 0; 350 return 0;
332} 351}
333 352
334static int __cpuinit xen_cpu_up(unsigned int cpu) 353static int __cpuinit xen_cpu_up(unsigned int cpu, struct task_struct *idle)
335{ 354{
336 struct task_struct *idle = idle_task(cpu);
337 int rc; 355 int rc;
338 356
339 per_cpu(current_task, cpu) = idle; 357 per_cpu(current_task, cpu) = idle;
@@ -403,6 +421,7 @@ static void xen_cpu_die(unsigned int cpu)
403 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); 421 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
404 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); 422 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
405 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); 423 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
424 unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL);
406 xen_uninit_lock_cpu(cpu); 425 xen_uninit_lock_cpu(cpu);
407 xen_teardown_timer(cpu); 426 xen_teardown_timer(cpu);
408 427
@@ -465,8 +484,8 @@ static void xen_smp_send_reschedule(int cpu)
465 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); 484 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
466} 485}
467 486
468static void xen_send_IPI_mask(const struct cpumask *mask, 487static void __xen_send_IPI_mask(const struct cpumask *mask,
469 enum ipi_vector vector) 488 int vector)
470{ 489{
471 unsigned cpu; 490 unsigned cpu;
472 491
@@ -478,7 +497,7 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
478{ 497{
479 int cpu; 498 int cpu;
480 499
481 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); 500 __xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
482 501
483 /* Make sure other vcpus get a chance to run if they need to. */ 502 /* Make sure other vcpus get a chance to run if they need to. */
484 for_each_cpu(cpu, mask) { 503 for_each_cpu(cpu, mask) {
@@ -491,10 +510,86 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
491 510
492static void xen_smp_send_call_function_single_ipi(int cpu) 511static void xen_smp_send_call_function_single_ipi(int cpu)
493{ 512{
494 xen_send_IPI_mask(cpumask_of(cpu), 513 __xen_send_IPI_mask(cpumask_of(cpu),
495 XEN_CALL_FUNCTION_SINGLE_VECTOR); 514 XEN_CALL_FUNCTION_SINGLE_VECTOR);
496} 515}
497 516
517static inline int xen_map_vector(int vector)
518{
519 int xen_vector;
520
521 switch (vector) {
522 case RESCHEDULE_VECTOR:
523 xen_vector = XEN_RESCHEDULE_VECTOR;
524 break;
525 case CALL_FUNCTION_VECTOR:
526 xen_vector = XEN_CALL_FUNCTION_VECTOR;
527 break;
528 case CALL_FUNCTION_SINGLE_VECTOR:
529 xen_vector = XEN_CALL_FUNCTION_SINGLE_VECTOR;
530 break;
531 case IRQ_WORK_VECTOR:
532 xen_vector = XEN_IRQ_WORK_VECTOR;
533 break;
534 default:
535 xen_vector = -1;
536 printk(KERN_ERR "xen: vector 0x%x is not implemented\n",
537 vector);
538 }
539
540 return xen_vector;
541}
542
543void xen_send_IPI_mask(const struct cpumask *mask,
544 int vector)
545{
546 int xen_vector = xen_map_vector(vector);
547
548 if (xen_vector >= 0)
549 __xen_send_IPI_mask(mask, xen_vector);
550}
551
552void xen_send_IPI_all(int vector)
553{
554 int xen_vector = xen_map_vector(vector);
555
556 if (xen_vector >= 0)
557 __xen_send_IPI_mask(cpu_online_mask, xen_vector);
558}
559
560void xen_send_IPI_self(int vector)
561{
562 int xen_vector = xen_map_vector(vector);
563
564 if (xen_vector >= 0)
565 xen_send_IPI_one(smp_processor_id(), xen_vector);
566}
567
568void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
569 int vector)
570{
571 unsigned cpu;
572 unsigned int this_cpu = smp_processor_id();
573
574 if (!(num_online_cpus() > 1))
575 return;
576
577 for_each_cpu_and(cpu, mask, cpu_online_mask) {
578 if (this_cpu == cpu)
579 continue;
580
581 xen_smp_send_call_function_single_ipi(cpu);
582 }
583}
584
585void xen_send_IPI_allbutself(int vector)
586{
587 int xen_vector = xen_map_vector(vector);
588
589 if (xen_vector >= 0)
590 xen_send_IPI_mask_allbutself(cpu_online_mask, xen_vector);
591}
592
498static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) 593static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
499{ 594{
500 irq_enter(); 595 irq_enter();
@@ -515,6 +610,16 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
515 return IRQ_HANDLED; 610 return IRQ_HANDLED;
516} 611}
517 612
613static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
614{
615 irq_enter();
616 irq_work_run();
617 inc_irq_stat(apic_irq_work_irqs);
618 irq_exit();
619
620 return IRQ_HANDLED;
621}
622
518static const struct smp_ops xen_smp_ops __initconst = { 623static const struct smp_ops xen_smp_ops __initconst = {
519 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, 624 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
520 .smp_prepare_cpus = xen_smp_prepare_cpus, 625 .smp_prepare_cpus = xen_smp_prepare_cpus,
@@ -547,10 +652,10 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
547 xen_init_lock_cpu(0); 652 xen_init_lock_cpu(0);
548} 653}
549 654
550static int __cpuinit xen_hvm_cpu_up(unsigned int cpu) 655static int __cpuinit xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
551{ 656{
552 int rc; 657 int rc;
553 rc = native_cpu_up(cpu); 658 rc = native_cpu_up(cpu, tidle);
554 WARN_ON (xen_smp_intr_init(cpu)); 659 WARN_ON (xen_smp_intr_init(cpu));
555 return rc; 660 return rc;
556} 661}
@@ -561,6 +666,7 @@ static void xen_hvm_cpu_die(unsigned int cpu)
561 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); 666 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
562 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); 667 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
563 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); 668 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
669 unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL);
564 native_cpu_die(cpu); 670 native_cpu_die(cpu);
565} 671}
566 672
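The xen_send_IPI_* helpers added above deliberately mirror the send_IPI callback signatures of the x86 'struct apic'. A minimal sketch of how a setup path might route the kernel's generic IPIs through them follows; the xen_hook_ipi_ops name and the exact hook point are illustrative assumptions and are not part of this diff.

/* Sketch only: assumes the 'struct apic' callback layout from
 * <asm/apic.h> of this era; the hook point is hypothetical. */
#include <linux/init.h>
#include <asm/apic.h>

#include "smp.h"

static void __init xen_hook_ipi_ops(void)
{
	/* Send generic kernel IPIs over Xen event channels instead of
	 * the emulated local APIC; the signatures match 1:1. */
	apic->send_IPI_mask            = xen_send_IPI_mask;
	apic->send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself;
	apic->send_IPI_allbutself      = xen_send_IPI_allbutself;
	apic->send_IPI_all             = xen_send_IPI_all;
	apic->send_IPI_self            = xen_send_IPI_self;
}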
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
new file mode 100644
index 000000000000..8981a76d081a
--- /dev/null
+++ b/arch/x86/xen/smp.h
@@ -0,0 +1,12 @@
1#ifndef _XEN_SMP_H
2#define _XEN_SMP_H
3extern void xen_send_IPI_mask(const struct cpumask *mask,
4 int vector);
5extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
6 int vector);
7extern void xen_send_IPI_allbutself(int vector);
8extern void physflat_send_IPI_allbutself(int vector);
9extern void xen_send_IPI_all(int vector);
10extern void xen_send_IPI_self(int vector);
11
12#endif
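Callers of this header pass native x86 vectors; xen_map_vector() in smp.c translates them to the matching xen_ipi_vector before the event channel is signalled. A hypothetical caller, shown only to illustrate the intended use of the interface (the function name is made up):

#include <asm/irq_vectors.h>

#include "smp.h"

static void xen_example_raise_irq_work(void)
{
	/* Mapped to XEN_IRQ_WORK_VECTOR and handled on this CPU by
	 * xen_irq_work_interrupt(). */
	xen_send_IPI_self(IRQ_WORK_VECTOR);
}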
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index d69cc6c3f808..83e866d714ce 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -440,12 +440,12 @@ static int __init xen_spinlock_debugfs(void)
440 debugfs_create_u64("time_total", 0444, d_spin_debug, 440 debugfs_create_u64("time_total", 0444, d_spin_debug,
441 &spinlock_stats.time_total); 441 &spinlock_stats.time_total);
442 442
443 xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug, 443 debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
444 spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1); 444 spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
445 xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, 445 debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
446 spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1); 446 spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
447 xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, 447 debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
448 spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); 448 spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
449 449
450 return 0; 450 return 0;
451} 451}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 45329c8c226e..ae8a00c39de4 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled)
30{ 30{
31#ifdef CONFIG_XEN_PVHVM 31#ifdef CONFIG_XEN_PVHVM
32 int cpu; 32 int cpu;
33 xen_hvm_init_shared_info(); 33 xen_hvm_resume_shared_info();
34 xen_callback_vector(); 34 xen_callback_vector();
35 xen_unplug_emulated_devices(); 35 xen_unplug_emulated_devices();
36 if (xen_feature(XENFEAT_hvm_safe_pvclock)) { 36 if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 79d7362ad6d1..3e45aa000718 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -96,7 +96,7 @@ ENTRY(xen_restore_fl_direct)
96 96
97 /* check for unmasked and pending */ 97 /* check for unmasked and pending */
98 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending 98 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
99 jz 1f 99 jnz 1f
1002: call check_events 1002: call check_events
1011: 1011:
102ENDPATCH(xen_restore_fl_direct) 102ENDPATCH(xen_restore_fl_direct)
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index b040b0e518ca..f9643fc50de5 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -14,6 +14,7 @@
14#include <asm/thread_info.h> 14#include <asm/thread_info.h>
15#include <asm/processor-flags.h> 15#include <asm/processor-flags.h>
16#include <asm/segment.h> 16#include <asm/segment.h>
17#include <asm/asm.h>
17 18
18#include <xen/interface/xen.h> 19#include <xen/interface/xen.h>
19 20
@@ -137,10 +138,7 @@ iret_restore_end:
137 138
1381: iret 1391: iret
139xen_iret_end_crit: 140xen_iret_end_crit:
140.section __ex_table, "a" 141 _ASM_EXTABLE(1b, iret_exc)
141 .align 4
142 .long 1b, iret_exc
143.previous
144 142
145hyper_iret: 143hyper_iret:
146 /* put this out of line since its very rarely used */ 144 /* put this out of line since its very rarely used */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index b095739ccd4c..1e4329e04e0f 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -28,7 +28,6 @@ void xen_setup_shared_info(void);
28void xen_build_mfn_list_list(void); 28void xen_build_mfn_list_list(void);
29void xen_setup_machphys_mapping(void); 29void xen_setup_machphys_mapping(void);
30pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); 30pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
31void xen_ident_map_ISA(void);
32void xen_reserve_top(void); 31void xen_reserve_top(void);
33extern unsigned long xen_max_p2m_pfn; 32extern unsigned long xen_max_p2m_pfn;
34 33
@@ -42,7 +41,7 @@ void xen_enable_syscall(void);
42void xen_vcpu_restore(void); 41void xen_vcpu_restore(void);
43 42
44void xen_callback_vector(void); 43void xen_callback_vector(void);
45void xen_hvm_init_shared_info(void); 44void xen_hvm_resume_shared_info(void);
46void xen_unplug_emulated_devices(void); 45void xen_unplug_emulated_devices(void);
47 46
48void __init xen_build_dynamic_phys_to_machine(void); 47void __init xen_build_dynamic_phys_to_machine(void);
@@ -92,11 +91,15 @@ struct dom0_vga_console_info;
92 91
93#ifdef CONFIG_XEN_DOM0 92#ifdef CONFIG_XEN_DOM0
94void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); 93void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
94void __init xen_init_apic(void);
95#else 95#else
96static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, 96static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
97 size_t size) 97 size_t size)
98{ 98{
99} 99}
100static inline void __init xen_init_apic(void)
101{
102}
100#endif 103#endif
101 104
102/* Declare an asm function, along with symbols needed to make it 105/* Declare an asm function, along with symbols needed to make it