-rw-r--r--  arch/x86/ia32/ia32entry.S                                        |    1
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c                                 |   11
-rw-r--r--  arch/x86/kernel/cpu/amd_64.c                                     |    2
-rw-r--r--  arch/x86/kernel/cpu/common_64.c                                  |    7
-rw-r--r--  arch/x86/kernel/entry_64.S                                       |  106
-rw-r--r--  arch/x86/kernel/head64.c                                         |   11
-rw-r--r--  arch/x86/kernel/head_64.S                                        |    1
-rw-r--r--  arch/x86/kernel/irq_32.c                                         |    7
-rw-r--r--  arch/x86/kernel/paravirt.c                                       |    4
-rw-r--r--  arch/x86/kernel/process_64.c                                     |   56
-rw-r--r--  arch/x86/kernel/setup.c                                          |    3
-rw-r--r--  arch/x86/kernel/smpboot.c                                        |    2
-rw-r--r--  arch/x86/mm/init_32.c                                            |    2
-rw-r--r--  arch/x86/vdso/Makefile                                           |    2
-rw-r--r--  arch/x86/vdso/vdso32-setup.c                                     |   19
-rw-r--r--  arch/x86/vdso/vdso32.S                                           |   13
-rw-r--r--  arch/x86/xen/Kconfig                                             |   14
-rw-r--r--  arch/x86/xen/Makefile                                            |    2
-rw-r--r--  arch/x86/xen/enlighten.c                                         |  688
-rw-r--r--  arch/x86/xen/mmu.c                                               |  316
-rw-r--r--  arch/x86/xen/mmu.h                                               |   29
-rw-r--r--  arch/x86/xen/multicalls.c                                        |    1
-rw-r--r--  arch/x86/xen/setup.c                                             |   79
-rw-r--r--  arch/x86/xen/smp.c                                               |  137
-rw-r--r--  arch/x86/xen/suspend.c                                           |    5
-rw-r--r--  arch/x86/xen/xen-asm_32.S (renamed from arch/x86/xen/xen-asm.S)  |    0
-rw-r--r--  arch/x86/xen/xen-asm_64.S                                        |  271
-rw-r--r--  arch/x86/xen/xen-head.S                                          |   28
-rw-r--r--  arch/x86/xen/xen-ops.h                                           |   21
-rw-r--r--  drivers/net/xen-netfront.c                                       |   19
-rw-r--r--  drivers/xen/manage.c                                             |    6
-rw-r--r--  include/asm-x86/paravirt.h                                       |   26
-rw-r--r--  include/asm-x86/percpu.h                                         |   26
-rw-r--r--  include/asm-x86/pgtable.h                                        |   18
-rw-r--r--  include/asm-x86/pgtable_32.h                                     |   15
-rw-r--r--  include/asm-x86/pgtable_64.h                                     |    2
-rw-r--r--  include/asm-x86/setup.h                                          |    1
-rw-r--r--  include/asm-x86/smp.h                                            |    2
-rw-r--r--  include/asm-x86/vdso.h                                           |    8
-rw-r--r--  include/asm-x86/xen/hypercall.h                                  |  263
-rw-r--r--  include/asm-x86/xen/interface.h                                  |  139
-rw-r--r--  include/asm-x86/xen/interface_32.h                               |   97
-rw-r--r--  include/asm-x86/xen/interface_64.h                               |  159
-rw-r--r--  include/asm-x86/xen/page.h                                       |    6
-rw-r--r--  include/xen/hvc-console.h                                        |    7
-rw-r--r--  include/xen/interface/callback.h                                 |    6
-rw-r--r--  include/xen/xen-ops.h                                            |    3
-rw-r--r--  kernel/power/Kconfig                                             |    2
48 files changed, 2037 insertions(+), 606 deletions(-)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 20371d0635e4..0ae1e77eae50 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -321,6 +321,7 @@ ENTRY(ia32_syscall)
321 /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ 321 /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/
322 /*CFI_REL_OFFSET cs,CS-RIP*/ 322 /*CFI_REL_OFFSET cs,CS-RIP*/
323 CFI_REL_OFFSET rip,RIP-RIP 323 CFI_REL_OFFSET rip,RIP-RIP
324 PARAVIRT_ADJUST_EXCEPTION_FRAME
324 SWAPGS 325 SWAPGS
325 /* 326 /*
326 * No need to follow this irqs on/off section: the syscall 327 * No need to follow this irqs on/off section: the syscall
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index bacf5deeec2d..aa89387006fe 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -18,6 +18,8 @@
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/bootparam.h> 19#include <asm/bootparam.h>
20 20
21#include <xen/interface/xen.h>
22
21#define __NO_STUBS 1 23#define __NO_STUBS 1
22#undef __SYSCALL 24#undef __SYSCALL
23#undef _ASM_X86_64_UNISTD_H_ 25#undef _ASM_X86_64_UNISTD_H_
@@ -131,5 +133,14 @@ int main(void)
131 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 133 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
132 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 134 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
133 OFFSET(BP_version, boot_params, hdr.version); 135 OFFSET(BP_version, boot_params, hdr.version);
136
137 BLANK();
138 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
139#ifdef CONFIG_XEN
140 BLANK();
141 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
142 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
143#undef ENTRY
144#endif
134 return 0; 145 return 0;
135} 146}
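
Note: the OFFSET()/DEFINE() entries above feed the kbuild asm-offsets machinery, so 64-bit Xen assembly can refer to vcpu_info fields by symbolic byte offset. A minimal sketch of how those helpers are conventionally defined (the generic kbuild pattern, assumed here rather than part of this patch):

#include <linux/stddef.h>		/* offsetof() */

#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))

#define OFFSET(sym, str, mem) \
	DEFINE(sym, offsetof(struct str, mem))

/* OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask) emits a
 * "->XEN_vcpu_info_mask <offset>" marker that the build turns into a
 * #define, which xen-asm_64.S can then use to mask/unmask event
 * delivery relative to a vcpu_info pointer. */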
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
index 7c36fb8a28d4..d1692b2a41ff 100644
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -115,6 +115,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ 115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
116 if (c->x86_power & (1<<8)) 116 if (c->x86_power & (1<<8))
117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
118
119 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
118} 120}
119 121
120static void __cpuinit init_amd(struct cpuinfo_x86 *c) 122static void __cpuinit init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
index 7b8cc72feb40..736f50fa433d 100644
--- a/arch/x86/kernel/cpu/common_64.c
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -16,6 +16,7 @@
16#include <asm/i387.h> 16#include <asm/i387.h>
17#include <asm/msr.h> 17#include <asm/msr.h>
18#include <asm/io.h> 18#include <asm/io.h>
19#include <asm/linkage.h>
19#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
20#include <asm/mtrr.h> 21#include <asm/mtrr.h>
21#include <asm/mce.h> 22#include <asm/mce.h>
@@ -316,9 +317,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
316 c->x86_phys_bits = eax & 0xff; 317 c->x86_phys_bits = eax & 0xff;
317 } 318 }
318 319
319 /* Assume all 64-bit CPUs support 32-bit syscall */
320 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
321
322 if (c->x86_vendor != X86_VENDOR_UNKNOWN && 320 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
323 cpu_devs[c->x86_vendor]->c_early_init) 321 cpu_devs[c->x86_vendor]->c_early_init)
324 cpu_devs[c->x86_vendor]->c_early_init(c); 322 cpu_devs[c->x86_vendor]->c_early_init(c);
@@ -517,8 +515,7 @@ void pda_init(int cpu)
517} 515}
518 516
519char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + 517char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
520 DEBUG_STKSZ] 518 DEBUG_STKSZ] __page_aligned_bss;
521__attribute__((section(".bss.page_aligned")));
522 519
523extern asmlinkage void ignore_sysret(void); 520extern asmlinkage void ignore_sysret(void);
524 521
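
The exception-stack array now uses __page_aligned_bss (hence the new <asm/linkage.h> include) instead of an open-coded section attribute. A sketch of what such a helper conventionally expands to; the exact definition lives in the linkage headers and is only assumed here:

#define __page_aligned_bss \
	__attribute__((__section__(".bss.page_aligned"))) \
	__attribute__((aligned(4096)))	/* PAGE_SIZE */

char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
			   DEBUG_STKSZ] __page_aligned_bss;

/* Compared with the removed open-coded attribute, the object is both
 * placed in .bss.page_aligned and explicitly page aligned. */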
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index ae63e584c340..80d5663db3bc 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1189,6 +1189,7 @@ END(device_not_available)
1189 /* runs on exception stack */ 1189 /* runs on exception stack */
1190KPROBE_ENTRY(debug) 1190KPROBE_ENTRY(debug)
1191 INTR_FRAME 1191 INTR_FRAME
1192 PARAVIRT_ADJUST_EXCEPTION_FRAME
1192 pushq $0 1193 pushq $0
1193 CFI_ADJUST_CFA_OFFSET 8 1194 CFI_ADJUST_CFA_OFFSET 8
1194 paranoidentry do_debug, DEBUG_STACK 1195 paranoidentry do_debug, DEBUG_STACK
@@ -1198,6 +1199,7 @@ KPROBE_END(debug)
1198 /* runs on exception stack */ 1199 /* runs on exception stack */
1199KPROBE_ENTRY(nmi) 1200KPROBE_ENTRY(nmi)
1200 INTR_FRAME 1201 INTR_FRAME
1202 PARAVIRT_ADJUST_EXCEPTION_FRAME
1201 pushq $-1 1203 pushq $-1
1202 CFI_ADJUST_CFA_OFFSET 8 1204 CFI_ADJUST_CFA_OFFSET 8
1203 paranoidentry do_nmi, 0, 0 1205 paranoidentry do_nmi, 0, 0
@@ -1211,6 +1213,7 @@ KPROBE_END(nmi)
1211 1213
1212KPROBE_ENTRY(int3) 1214KPROBE_ENTRY(int3)
1213 INTR_FRAME 1215 INTR_FRAME
1216 PARAVIRT_ADJUST_EXCEPTION_FRAME
1214 pushq $0 1217 pushq $0
1215 CFI_ADJUST_CFA_OFFSET 8 1218 CFI_ADJUST_CFA_OFFSET 8
1216 paranoidentry do_int3, DEBUG_STACK 1219 paranoidentry do_int3, DEBUG_STACK
@@ -1237,6 +1240,7 @@ END(coprocessor_segment_overrun)
1237 /* runs on exception stack */ 1240 /* runs on exception stack */
1238ENTRY(double_fault) 1241ENTRY(double_fault)
1239 XCPT_FRAME 1242 XCPT_FRAME
1243 PARAVIRT_ADJUST_EXCEPTION_FRAME
1240 paranoidentry do_double_fault 1244 paranoidentry do_double_fault
1241 jmp paranoid_exit1 1245 jmp paranoid_exit1
1242 CFI_ENDPROC 1246 CFI_ENDPROC
@@ -1253,6 +1257,7 @@ END(segment_not_present)
1253 /* runs on exception stack */ 1257 /* runs on exception stack */
1254ENTRY(stack_segment) 1258ENTRY(stack_segment)
1255 XCPT_FRAME 1259 XCPT_FRAME
1260 PARAVIRT_ADJUST_EXCEPTION_FRAME
1256 paranoidentry do_stack_segment 1261 paranoidentry do_stack_segment
1257 jmp paranoid_exit1 1262 jmp paranoid_exit1
1258 CFI_ENDPROC 1263 CFI_ENDPROC
@@ -1278,6 +1283,7 @@ END(spurious_interrupt_bug)
1278 /* runs on exception stack */ 1283 /* runs on exception stack */
1279ENTRY(machine_check) 1284ENTRY(machine_check)
1280 INTR_FRAME 1285 INTR_FRAME
1286 PARAVIRT_ADJUST_EXCEPTION_FRAME
1281 pushq $0 1287 pushq $0
1282 CFI_ADJUST_CFA_OFFSET 8 1288 CFI_ADJUST_CFA_OFFSET 8
1283 paranoidentry do_machine_check 1289 paranoidentry do_machine_check
@@ -1312,3 +1318,103 @@ KPROBE_ENTRY(ignore_sysret)
1312 sysret 1318 sysret
1313 CFI_ENDPROC 1319 CFI_ENDPROC
1314ENDPROC(ignore_sysret) 1320ENDPROC(ignore_sysret)
1321
1322#ifdef CONFIG_XEN
1323ENTRY(xen_hypervisor_callback)
1324 zeroentry xen_do_hypervisor_callback
1325END(xen_hypervisor_callback)
1326
1327/*
1328# A note on the "critical region" in our callback handler.
1329# We want to avoid stacking callback handlers due to events occurring
1330# during handling of the last event. To do this, we keep events disabled
1331# until we've done all processing. HOWEVER, we must enable events before
1332# popping the stack frame (can't be done atomically) and so it would still
1333# be possible to get enough handler activations to overflow the stack.
1334# Although unlikely, bugs of that kind are hard to track down, so we'd
1335# like to avoid the possibility.
1336# So, on entry to the handler we detect whether we interrupted an
1337# existing activation in its critical region -- if so, we pop the current
1338# activation and restart the handler using the previous one.
1339*/
1340ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1341 CFI_STARTPROC
 1342/* Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
1343 see the correct pointer to the pt_regs */
1344 movq %rdi, %rsp # we don't return, adjust the stack frame
1345 CFI_ENDPROC
1346 CFI_DEFAULT_STACK
134711: incl %gs:pda_irqcount
1348 movq %rsp,%rbp
1349 CFI_DEF_CFA_REGISTER rbp
1350 cmovzq %gs:pda_irqstackptr,%rsp
1351 pushq %rbp # backlink for old unwinder
1352 call xen_evtchn_do_upcall
1353 popq %rsp
1354 CFI_DEF_CFA_REGISTER rsp
1355 decl %gs:pda_irqcount
1356 jmp error_exit
1357 CFI_ENDPROC
1358END(do_hypervisor_callback)
1359
1360/*
1361# Hypervisor uses this for application faults while it executes.
1362# We get here for two reasons:
1363# 1. Fault while reloading DS, ES, FS or GS
1364# 2. Fault while executing IRET
1365# Category 1 we do not need to fix up as Xen has already reloaded all segment
1366# registers that could be reloaded and zeroed the others.
1367# Category 2 we fix up by killing the current process. We cannot use the
1368# normal Linux return path in this case because if we use the IRET hypercall
1369# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1370# We distinguish between categories by comparing each saved segment register
 1371# with its current contents: any discrepancy means we are in category 1.
1372*/
1373ENTRY(xen_failsafe_callback)
1374 framesz = (RIP-0x30) /* workaround buggy gas */
1375 _frame framesz
1376 CFI_REL_OFFSET rcx, 0
1377 CFI_REL_OFFSET r11, 8
1378 movw %ds,%cx
1379 cmpw %cx,0x10(%rsp)
1380 CFI_REMEMBER_STATE
1381 jne 1f
1382 movw %es,%cx
1383 cmpw %cx,0x18(%rsp)
1384 jne 1f
1385 movw %fs,%cx
1386 cmpw %cx,0x20(%rsp)
1387 jne 1f
1388 movw %gs,%cx
1389 cmpw %cx,0x28(%rsp)
1390 jne 1f
1391 /* All segments match their saved values => Category 2 (Bad IRET). */
1392 movq (%rsp),%rcx
1393 CFI_RESTORE rcx
1394 movq 8(%rsp),%r11
1395 CFI_RESTORE r11
1396 addq $0x30,%rsp
1397 CFI_ADJUST_CFA_OFFSET -0x30
1398 pushq $0
1399 CFI_ADJUST_CFA_OFFSET 8
1400 pushq %r11
1401 CFI_ADJUST_CFA_OFFSET 8
1402 pushq %rcx
1403 CFI_ADJUST_CFA_OFFSET 8
1404 jmp general_protection
1405 CFI_RESTORE_STATE
14061: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
1407 movq (%rsp),%rcx
1408 CFI_RESTORE rcx
1409 movq 8(%rsp),%r11
1410 CFI_RESTORE r11
1411 addq $0x30,%rsp
1412 CFI_ADJUST_CFA_OFFSET -0x30
1413 pushq $0
1414 CFI_ADJUST_CFA_OFFSET 8
1415 SAVE_ALL
1416 jmp error_exit
1417 CFI_ENDPROC
1418END(xen_failsafe_callback)
1419
1420#endif /* CONFIG_XEN */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index c97819829146..1b318e903bf6 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
39static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; 39static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
40#endif 40#endif
41 41
42void __init x86_64_init_pda(void)
43{
44 _cpu_pda = __cpu_pda;
45 cpu_pda(0) = &_boot_cpu_pda;
46 pda_init(0);
47}
48
42static void __init zap_identity_mappings(void) 49static void __init zap_identity_mappings(void)
43{ 50{
44 pgd_t *pgd = pgd_offset_k(0UL); 51 pgd_t *pgd = pgd_offset_k(0UL);
@@ -102,9 +109,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
102 109
103 early_printk("Kernel alive\n"); 110 early_printk("Kernel alive\n");
104 111
105 _cpu_pda = __cpu_pda; 112 x86_64_init_pda();
106 cpu_pda(0) = &_boot_cpu_pda;
107 pda_init(0);
108 113
109 early_printk("Kernel really alive\n"); 114 early_printk("Kernel really alive\n");
110 115
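
Factoring the PDA bring-up into x86_64_init_pda() lets other early entry points reuse it before any %gs-relative per-cpu access. Later in this diff the Xen entry path does exactly that:

	/* from xen_start_kernel(), in the enlighten.c hunks below */
	have_vcpu_info_placement = 0;	/* no direct per-cpu access yet */
	x86_64_init_pda();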
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b07ac7b217cb..db3280afe886 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -407,6 +407,7 @@ ENTRY(phys_base)
407 /* This must match the first entry in level2_kernel_pgt */ 407 /* This must match the first entry in level2_kernel_pgt */
408 .quad 0x0000000000000000 408 .quad 0x0000000000000000
409 409
410#include "../../x86/xen/xen-head.S"
410 411
411 .section .bss, "aw", @nobits 412 .section .bss, "aw", @nobits
412 .align L1_CACHE_BYTES 413 .align L1_CACHE_BYTES
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 47a6f6f12478..1cf8c1fcc088 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -83,11 +83,8 @@ union irq_ctx {
83static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; 83static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
84static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; 84static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
85 85
86static char softirq_stack[NR_CPUS * THREAD_SIZE] 86static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
87 __attribute__((__section__(".bss.page_aligned"))); 87static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
88
89static char hardirq_stack[NR_CPUS * THREAD_SIZE]
90 __attribute__((__section__(".bss.page_aligned")));
91 88
92static void call_on_stack(void *func, void *stack) 89static void call_on_stack(void *func, void *stack)
93{ 90{
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e0f571d58c19..2963ab5d91ee 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -29,6 +29,7 @@
29#include <asm/desc.h> 29#include <asm/desc.h>
30#include <asm/setup.h> 30#include <asm/setup.h>
31#include <asm/arch_hooks.h> 31#include <asm/arch_hooks.h>
32#include <asm/pgtable.h>
32#include <asm/time.h> 33#include <asm/time.h>
33#include <asm/pgalloc.h> 34#include <asm/pgalloc.h>
34#include <asm/irq.h> 35#include <asm/irq.h>
@@ -373,6 +374,9 @@ struct pv_mmu_ops pv_mmu_ops = {
373#ifndef CONFIG_X86_64 374#ifndef CONFIG_X86_64
374 .pagetable_setup_start = native_pagetable_setup_start, 375 .pagetable_setup_start = native_pagetable_setup_start,
375 .pagetable_setup_done = native_pagetable_setup_done, 376 .pagetable_setup_done = native_pagetable_setup_done,
377#else
378 .pagetable_setup_start = paravirt_nop,
379 .pagetable_setup_done = paravirt_nop,
376#endif 380#endif
377 381
378 .read_cr2 = native_read_cr2, 382 .read_cr2 = native_read_cr2,
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index a8e53626ac9a..e8a8e1b99817 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -537,8 +537,8 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
537struct task_struct * 537struct task_struct *
538__switch_to(struct task_struct *prev_p, struct task_struct *next_p) 538__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
539{ 539{
540 struct thread_struct *prev = &prev_p->thread, 540 struct thread_struct *prev = &prev_p->thread;
541 *next = &next_p->thread; 541 struct thread_struct *next = &next_p->thread;
542 int cpu = smp_processor_id(); 542 int cpu = smp_processor_id();
543 struct tss_struct *tss = &per_cpu(init_tss, cpu); 543 struct tss_struct *tss = &per_cpu(init_tss, cpu);
544 unsigned fsindex, gsindex; 544 unsigned fsindex, gsindex;
@@ -586,35 +586,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
586 586
587 /* 587 /*
588 * Switch FS and GS. 588 * Switch FS and GS.
589 *
590 * Segment register != 0 always requires a reload. Also
591 * reload when it has changed. When prev process used 64bit
592 * base always reload to avoid an information leak.
589 */ 593 */
590 { 594 if (unlikely(fsindex | next->fsindex | prev->fs)) {
591 /* segment register != 0 always requires a reload. 595 loadsegment(fs, next->fsindex);
592 also reload when it has changed. 596 /*
593 when prev process used 64bit base always reload 597 * Check if the user used a selector != 0; if yes
594 to avoid an information leak. */ 598 * clear 64bit base, since overloaded base is always
595 if (unlikely(fsindex | next->fsindex | prev->fs)) { 599 * mapped to the Null selector
596 loadsegment(fs, next->fsindex); 600 */
597 /* check if the user used a selector != 0 601 if (fsindex)
598 * if yes clear 64bit base, since overloaded base
599 * is always mapped to the Null selector
600 */
601 if (fsindex)
602 prev->fs = 0; 602 prev->fs = 0;
603 } 603 }
604 /* when next process has a 64bit base use it */ 604 /* when next process has a 64bit base use it */
605 if (next->fs) 605 if (next->fs)
606 wrmsrl(MSR_FS_BASE, next->fs); 606 wrmsrl(MSR_FS_BASE, next->fs);
607 prev->fsindex = fsindex; 607 prev->fsindex = fsindex;
608 608
609 if (unlikely(gsindex | next->gsindex | prev->gs)) { 609 if (unlikely(gsindex | next->gsindex | prev->gs)) {
610 load_gs_index(next->gsindex); 610 load_gs_index(next->gsindex);
611 if (gsindex) 611 if (gsindex)
612 prev->gs = 0; 612 prev->gs = 0;
613 }
614 if (next->gs)
615 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
616 prev->gsindex = gsindex;
617 } 613 }
614 if (next->gs)
615 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
616 prev->gsindex = gsindex;
618 617
619 /* Must be after DS reload */ 618 /* Must be after DS reload */
620 unlazy_fpu(prev_p); 619 unlazy_fpu(prev_p);
@@ -627,7 +626,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
627 write_pda(pcurrent, next_p); 626 write_pda(pcurrent, next_p);
628 627
629 write_pda(kernelstack, 628 write_pda(kernelstack,
630 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); 629 (unsigned long)task_stack_page(next_p) +
630 THREAD_SIZE - PDA_STACKOFFSET);
631#ifdef CONFIG_CC_STACKPROTECTOR 631#ifdef CONFIG_CC_STACKPROTECTOR
632 write_pda(stack_canary, next_p->stack_canary); 632 write_pda(stack_canary, next_p->stack_canary);
633 /* 633 /*
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 531b55b8e81a..c9010f82141d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -824,7 +824,10 @@ void __init setup_arch(char **cmdline_p)
824 vmi_init(); 824 vmi_init();
825#endif 825#endif
826 826
827 paravirt_pagetable_setup_start(swapper_pg_dir);
827 paging_init(); 828 paging_init();
829 paravirt_pagetable_setup_done(swapper_pg_dir);
830 paravirt_post_allocator_init();
828 831
829#ifdef CONFIG_X86_64 832#ifdef CONFIG_X86_64
830 map_vsyscall(); 833 map_vsyscall();
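
setup_arch() now brackets paging_init() with the pagetable setup hooks on both 32- and 64-bit. A rough sketch of the wrappers being called, assuming they dispatch through pv_mmu_ops (with the paravirt_nop defaults added to paravirt.c above, this costs nothing on native 64-bit):

static inline void paravirt_pagetable_setup_start(pgd_t *base)
{
	pv_mmu_ops.pagetable_setup_start(base);
}

static inline void paravirt_pagetable_setup_done(pgd_t *base)
{
	pv_mmu_ops.pagetable_setup_done(base);
}

/* A Xen guest can hook these to fix up swapper_pg_dir around
 * paging_init(); see xen_pagetable_setup_start/_done below. */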
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 687376ab07e8..1deb3b624a79 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -768,7 +768,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
768 * 768 *
769 * Must be called after the _cpu_pda pointer table is initialized. 769 * Must be called after the _cpu_pda pointer table is initialized.
770 */ 770 */
771static int __cpuinit get_local_pda(int cpu) 771int __cpuinit get_local_pda(int cpu)
772{ 772{
773 struct x8664_pda *oldpda, *newpda; 773 struct x8664_pda *oldpda, *newpda;
774 unsigned long size = sizeof(struct x8664_pda); 774 unsigned long size = sizeof(struct x8664_pda);
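
get_local_pda() loses its static qualifier so the Xen CPU bring-up path can call it; the matching declaration is assumed to be part of the +2 lines in include/asm-x86/smp.h:

/* include/asm-x86/smp.h (sketch, assumed) */
extern int get_local_pda(int cpu);

/* arch/x86/xen/smp.c can then allocate a node-local PDA for a
 * secondary CPU before starting it, mirroring native do_boot_cpu(). */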
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9689a5138e64..7113acd8ac45 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -868,8 +868,6 @@ void __init paging_init(void)
868 */ 868 */
869 sparse_init(); 869 sparse_init();
870 zone_sizes_init(); 870 zone_sizes_init();
871
872 paravirt_post_allocator_init();
873} 871}
874 872
875/* 873/*
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index b7ad9f89d21f..4d6ef0a336d6 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -62,7 +62,7 @@ $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
62# Build multiple 32-bit vDSO images to choose from at boot time. 62# Build multiple 32-bit vDSO images to choose from at boot time.
63# 63#
64obj-$(VDSO32-y) += vdso32-syms.lds 64obj-$(VDSO32-y) += vdso32-syms.lds
65vdso32.so-$(CONFIG_X86_32) += int80 65vdso32.so-$(VDSO32-y) += int80
66vdso32.so-$(CONFIG_COMPAT) += syscall 66vdso32.so-$(CONFIG_COMPAT) += syscall
67vdso32.so-$(VDSO32-y) += sysenter 67vdso32.so-$(VDSO32-y) += sysenter
68 68
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 0bce5429a515..513f330c5832 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -193,17 +193,12 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
193 } 193 }
194} 194}
195 195
196/*
197 * These symbols are defined by vdso32.S to mark the bounds
198 * of the ELF DSO images included therein.
199 */
200extern const char vdso32_default_start, vdso32_default_end;
201extern const char vdso32_sysenter_start, vdso32_sysenter_end;
202static struct page *vdso32_pages[1]; 196static struct page *vdso32_pages[1];
203 197
204#ifdef CONFIG_X86_64 198#ifdef CONFIG_X86_64
205 199
206#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) 200#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
201#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
207 202
208/* May not be __init: called during resume */ 203/* May not be __init: called during resume */
209void syscall32_cpu_init(void) 204void syscall32_cpu_init(void)
@@ -226,6 +221,7 @@ static inline void map_compat_vdso(int map)
226#else /* CONFIG_X86_32 */ 221#else /* CONFIG_X86_32 */
227 222
228#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) 223#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
224#define vdso32_syscall() (0)
229 225
230void enable_sep_cpu(void) 226void enable_sep_cpu(void)
231{ 227{
@@ -296,12 +292,15 @@ int __init sysenter_setup(void)
296 gate_vma_init(); 292 gate_vma_init();
297#endif 293#endif
298 294
299 if (!vdso32_sysenter()) { 295 if (vdso32_syscall()) {
300 vsyscall = &vdso32_default_start; 296 vsyscall = &vdso32_syscall_start;
301 vsyscall_len = &vdso32_default_end - &vdso32_default_start; 297 vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
302 } else { 298 } else if (vdso32_sysenter()){
303 vsyscall = &vdso32_sysenter_start; 299 vsyscall = &vdso32_sysenter_start;
304 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start; 300 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
301 } else {
302 vsyscall = &vdso32_int80_start;
303 vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
305 } 304 }
306 305
307 memcpy(syscall_page, vsyscall, vsyscall_len); 306 memcpy(syscall_page, vsyscall, vsyscall_len);
diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S
index 1e36f72cab86..2ce5f82c333b 100644
--- a/arch/x86/vdso/vdso32.S
+++ b/arch/x86/vdso/vdso32.S
@@ -2,14 +2,17 @@
2 2
3__INITDATA 3__INITDATA
4 4
5 .globl vdso32_default_start, vdso32_default_end 5 .globl vdso32_int80_start, vdso32_int80_end
6vdso32_default_start: 6vdso32_int80_start:
7#ifdef CONFIG_X86_32
8 .incbin "arch/x86/vdso/vdso32-int80.so" 7 .incbin "arch/x86/vdso/vdso32-int80.so"
9#else 8vdso32_int80_end:
9
10 .globl vdso32_syscall_start, vdso32_syscall_end
11vdso32_syscall_start:
12#ifdef CONFIG_COMPAT
10 .incbin "arch/x86/vdso/vdso32-syscall.so" 13 .incbin "arch/x86/vdso/vdso32-syscall.so"
11#endif 14#endif
12vdso32_default_end: 15vdso32_syscall_end:
13 16
14 .globl vdso32_sysenter_start, vdso32_sysenter_end 17 .globl vdso32_sysenter_start, vdso32_sysenter_end
15vdso32_sysenter_start: 18vdso32_sysenter_start:
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index c2cc99580871..3815e425f470 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,8 +6,8 @@ config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 select PARAVIRT_CLOCK 8 select PARAVIRT_CLOCK
9 depends on X86_32 9 depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
10 depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) 10 depends on X86_CMPXCHG && X86_TSC
11 help 11 help
12 This is the Linux Xen port. Enabling this will allow the 12 This is the Linux Xen port. Enabling this will allow the
13 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
@@ -15,10 +15,16 @@ config XEN
15 15
16config XEN_MAX_DOMAIN_MEMORY 16config XEN_MAX_DOMAIN_MEMORY
17 int "Maximum allowed size of a domain in gigabytes" 17 int "Maximum allowed size of a domain in gigabytes"
18 default 8 18 default 8 if X86_32
19 default 32 if X86_64
19 depends on XEN 20 depends on XEN
20 help 21 help
21 The pseudo-physical to machine address array is sized 22 The pseudo-physical to machine address array is sized
22 according to the maximum possible memory size of a Xen 23 according to the maximum possible memory size of a Xen
23 domain. This array uses 1 page per gigabyte, so there's no 24 domain. This array uses 1 page per gigabyte, so there's no
24 need to be too stingy here. \ No newline at end of file 25 need to be too stingy here.
26
27config XEN_SAVE_RESTORE
28 bool
29 depends on PM
30 default y \ No newline at end of file
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 2ba2d1649131..59c1e539aed2 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
1obj-y := enlighten.o setup.o multicalls.o mmu.o \ 1obj-y := enlighten.o setup.o multicalls.o mmu.o \
2 time.o xen-asm.o grant-table.o suspend.o 2 time.o xen-asm_$(BITS).o grant-table.o suspend.o
3 3
4obj-$(CONFIG_SMP) += smp.o 4obj-$(CONFIG_SMP) += smp.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bb508456ef52..3da6acb7eafc 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -33,6 +33,7 @@
33#include <xen/interface/sched.h> 33#include <xen/interface/sched.h>
34#include <xen/features.h> 34#include <xen/features.h>
35#include <xen/page.h> 35#include <xen/page.h>
36#include <xen/hvc-console.h>
36 37
37#include <asm/paravirt.h> 38#include <asm/paravirt.h>
38#include <asm/page.h> 39#include <asm/page.h>
@@ -40,12 +41,12 @@
40#include <asm/xen/hypervisor.h> 41#include <asm/xen/hypervisor.h>
41#include <asm/fixmap.h> 42#include <asm/fixmap.h>
42#include <asm/processor.h> 43#include <asm/processor.h>
44#include <asm/msr-index.h>
43#include <asm/setup.h> 45#include <asm/setup.h>
44#include <asm/desc.h> 46#include <asm/desc.h>
45#include <asm/pgtable.h> 47#include <asm/pgtable.h>
46#include <asm/tlbflush.h> 48#include <asm/tlbflush.h>
47#include <asm/reboot.h> 49#include <asm/reboot.h>
48#include <asm/pgalloc.h>
49 50
50#include "xen-ops.h" 51#include "xen-ops.h"
51#include "mmu.h" 52#include "mmu.h"
@@ -57,6 +58,18 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
57DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); 58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
58 59
59/* 60/*
61 * Identity map, in addition to plain kernel map. This needs to be
62 * large enough to allocate page table pages to allocate the rest.
63 * Each page can map 2MB.
64 */
65static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
66
67#ifdef CONFIG_X86_64
68/* l3 pud for userspace vsyscall mapping */
69static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
70#endif /* CONFIG_X86_64 */
71
72/*
60 * Note about cr3 (pagetable base) values: 73 * Note about cr3 (pagetable base) values:
61 * 74 *
62 * xen_cr3 contains the current logical cr3 value; it contains the 75 * xen_cr3 contains the current logical cr3 value; it contains the
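
A quick check of the sizing comment above (a worked example, not part of the patch):

/* PTRS_PER_PTE * 4 = 512 * 4   = 2048 statically reserved ptes
 * 512 ptes/page * 4 KB/pte     = 2 MB mapped per pte page
 * 4 pte pages * 2 MB/page      = 8 MB of early identity mapping,
 * which bounds what xen_map_identity_early() can cover before a real
 * page allocator exists. */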
@@ -363,14 +376,6 @@ static void load_TLS_descriptor(struct thread_struct *t,
363 376
364static void xen_load_tls(struct thread_struct *t, unsigned int cpu) 377static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
365{ 378{
366 xen_mc_batch();
367
368 load_TLS_descriptor(t, cpu, 0);
369 load_TLS_descriptor(t, cpu, 1);
370 load_TLS_descriptor(t, cpu, 2);
371
372 xen_mc_issue(PARAVIRT_LAZY_CPU);
373
374 /* 379 /*
375 * XXX sleazy hack: If we're being called in a lazy-cpu zone, 380 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
376 * it means we're in a context switch, and %gs has just been 381 * it means we're in a context switch, and %gs has just been
@@ -379,10 +384,39 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
379 * Either way, it has been saved, and the new value will get 384 * Either way, it has been saved, and the new value will get
380 * loaded properly. This will go away as soon as Xen has been 385 * loaded properly. This will go away as soon as Xen has been
381 * modified to not save/restore %gs for normal hypercalls. 386 * modified to not save/restore %gs for normal hypercalls.
387 *
388 * On x86_64, this hack is not used for %gs, because gs points
389 * to KERNEL_GS_BASE (and uses it for PDA references), so we
390 * must not zero %gs on x86_64
391 *
392 * For x86_64, we need to zero %fs, otherwise we may get an
393 * exception between the new %fs descriptor being loaded and
394 * %fs being effectively cleared at __switch_to().
382 */ 395 */
383 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) 396 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
397#ifdef CONFIG_X86_32
384 loadsegment(gs, 0); 398 loadsegment(gs, 0);
399#else
400 loadsegment(fs, 0);
401#endif
402 }
403
404 xen_mc_batch();
405
406 load_TLS_descriptor(t, cpu, 0);
407 load_TLS_descriptor(t, cpu, 1);
408 load_TLS_descriptor(t, cpu, 2);
409
410 xen_mc_issue(PARAVIRT_LAZY_CPU);
411}
412
413#ifdef CONFIG_X86_64
414static void xen_load_gs_index(unsigned int idx)
415{
416 if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
417 BUG();
385} 418}
419#endif
386 420
387static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, 421static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
388 const void *ptr) 422 const void *ptr)
@@ -400,23 +434,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
400 preempt_enable(); 434 preempt_enable();
401} 435}
402 436
403static int cvt_gate_to_trap(int vector, u32 low, u32 high, 437static int cvt_gate_to_trap(int vector, const gate_desc *val,
404 struct trap_info *info) 438 struct trap_info *info)
405{ 439{
406 u8 type, dpl; 440 if (val->type != 0xf && val->type != 0xe)
407
408 type = (high >> 8) & 0x1f;
409 dpl = (high >> 13) & 3;
410
411 if (type != 0xf && type != 0xe)
412 return 0; 441 return 0;
413 442
414 info->vector = vector; 443 info->vector = vector;
415 info->address = (high & 0xffff0000) | (low & 0x0000ffff); 444 info->address = gate_offset(*val);
416 info->cs = low >> 16; 445 info->cs = gate_segment(*val);
417 info->flags = dpl; 446 info->flags = val->dpl;
418 /* interrupt gates clear IF */ 447 /* interrupt gates clear IF */
419 if (type == 0xe) 448 if (val->type == 0xe)
420 info->flags |= 4; 449 info->flags |= 4;
421 450
422 return 1; 451 return 1;
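
cvt_gate_to_trap() now takes a gate_desc instead of two raw u32 halves. The gate_offset()/gate_segment() accessors it relies on are assumed to look roughly like the following for a 64-bit IDT entry, whose handler address is split across three fields (the field names follow the usual desc_defs.h layout and are assumptions here):

static inline unsigned long gate_offset(gate_desc g)
{
	return g.offset_low |
	       ((unsigned long)g.offset_middle << 16) |
	       ((unsigned long)g.offset_high << 32);
}

static inline unsigned long gate_segment(gate_desc g)
{
	return g.segment;
}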
@@ -443,11 +472,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
443 472
444 if (p >= start && (p + 8) <= end) { 473 if (p >= start && (p + 8) <= end) {
445 struct trap_info info[2]; 474 struct trap_info info[2];
446 u32 *desc = (u32 *)g;
447 475
448 info[1].address = 0; 476 info[1].address = 0;
449 477
450 if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0])) 478 if (cvt_gate_to_trap(entrynum, g, &info[0]))
451 if (HYPERVISOR_set_trap_table(info)) 479 if (HYPERVISOR_set_trap_table(info))
452 BUG(); 480 BUG();
453 } 481 }
@@ -460,13 +488,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
460{ 488{
461 unsigned in, out, count; 489 unsigned in, out, count;
462 490
463 count = (desc->size+1) / 8; 491 count = (desc->size+1) / sizeof(gate_desc);
464 BUG_ON(count > 256); 492 BUG_ON(count > 256);
465 493
466 for (in = out = 0; in < count; in++) { 494 for (in = out = 0; in < count; in++) {
467 const u32 *entry = (u32 *)(desc->address + in * 8); 495 gate_desc *entry = (gate_desc*)(desc->address) + in;
468 496
469 if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) 497 if (cvt_gate_to_trap(in, entry, &traps[out]))
470 out++; 498 out++;
471 } 499 }
472 traps[out].address = 0; 500 traps[out].address = 0;
@@ -695,33 +723,89 @@ static void set_current_cr3(void *v)
695 x86_write_percpu(xen_current_cr3, (unsigned long)v); 723 x86_write_percpu(xen_current_cr3, (unsigned long)v);
696} 724}
697 725
698static void xen_write_cr3(unsigned long cr3) 726static void __xen_write_cr3(bool kernel, unsigned long cr3)
699{ 727{
700 struct mmuext_op *op; 728 struct mmuext_op *op;
701 struct multicall_space mcs; 729 struct multicall_space mcs;
702 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); 730 unsigned long mfn;
703 731
704 BUG_ON(preemptible()); 732 if (cr3)
733 mfn = pfn_to_mfn(PFN_DOWN(cr3));
734 else
735 mfn = 0;
705 736
706 mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ 737 WARN_ON(mfn == 0 && kernel);
707 738
708 /* Update while interrupts are disabled, so its atomic with 739 mcs = __xen_mc_entry(sizeof(*op));
709 respect to ipis */
710 x86_write_percpu(xen_cr3, cr3);
711 740
712 op = mcs.args; 741 op = mcs.args;
713 op->cmd = MMUEXT_NEW_BASEPTR; 742 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
714 op->arg1.mfn = mfn; 743 op->arg1.mfn = mfn;
715 744
716 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 745 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
717 746
718 /* Update xen_update_cr3 once the batch has actually 747 if (kernel) {
719 been submitted. */ 748 x86_write_percpu(xen_cr3, cr3);
720 xen_mc_callback(set_current_cr3, (void *)cr3); 749
750 /* Update xen_current_cr3 once the batch has actually
751 been submitted. */
752 xen_mc_callback(set_current_cr3, (void *)cr3);
753 }
754}
755
756static void xen_write_cr3(unsigned long cr3)
757{
758 BUG_ON(preemptible());
759
760 xen_mc_batch(); /* disables interrupts */
761
762 /* Update while interrupts are disabled, so its atomic with
763 respect to ipis */
764 x86_write_percpu(xen_cr3, cr3);
765
766 __xen_write_cr3(true, cr3);
767
768#ifdef CONFIG_X86_64
769 {
770 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
771 if (user_pgd)
772 __xen_write_cr3(false, __pa(user_pgd));
773 else
774 __xen_write_cr3(false, 0);
775 }
776#endif
721 777
722 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 778 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
723} 779}
724 780
781static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
782{
783 int ret;
784
785 ret = 0;
786
787 switch(msr) {
788#ifdef CONFIG_X86_64
789 unsigned which;
790 u64 base;
791
792 case MSR_FS_BASE: which = SEGBASE_FS; goto set;
793 case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
794 case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
795
796 set:
797 base = ((u64)high << 32) | low;
798 if (HYPERVISOR_set_segment_base(which, base) != 0)
799 ret = -EFAULT;
800 break;
801#endif
802 default:
803 ret = native_write_msr_safe(msr, low, high);
804 }
805
806 return ret;
807}
808
725/* Early in boot, while setting up the initial pagetable, assume 809/* Early in boot, while setting up the initial pagetable, assume
726 everything is pinned. */ 810 everything is pinned. */
727static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) 811static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
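
With .write_msr pointed at xen_write_msr_safe (see the pv_cpu_ops hunk further down), the unchanged context-switch code gets the right behaviour for free; a usage sketch:

/* wrmsrl(MSR_FS_BASE, next->fs) in __switch_to() is routed through
 * pv_cpu_ops.write_msr and becomes
 *
 *	HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs);
 *
 * while any MSR not handled by the switch above still falls back to
 * native_write_msr_safe(). */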
@@ -778,6 +862,48 @@ static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
778 xen_alloc_ptpage(mm, pfn, PT_PMD); 862 xen_alloc_ptpage(mm, pfn, PT_PMD);
779} 863}
780 864
865static int xen_pgd_alloc(struct mm_struct *mm)
866{
867 pgd_t *pgd = mm->pgd;
868 int ret = 0;
869
870 BUG_ON(PagePinned(virt_to_page(pgd)));
871
872#ifdef CONFIG_X86_64
873 {
874 struct page *page = virt_to_page(pgd);
875 pgd_t *user_pgd;
876
877 BUG_ON(page->private != 0);
878
879 ret = -ENOMEM;
880
881 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
882 page->private = (unsigned long)user_pgd;
883
884 if (user_pgd != NULL) {
885 user_pgd[pgd_index(VSYSCALL_START)] =
886 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
887 ret = 0;
888 }
889
890 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
891 }
892#endif
893
894 return ret;
895}
896
897static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
898{
899#ifdef CONFIG_X86_64
900 pgd_t *user_pgd = xen_get_user_pgd(pgd);
901
902 if (user_pgd)
903 free_page((unsigned long)user_pgd);
904#endif
905}
906
781/* This should never happen until we're OK to use struct page */ 907/* This should never happen until we're OK to use struct page */
782static void xen_release_ptpage(u32 pfn, unsigned level) 908static void xen_release_ptpage(u32 pfn, unsigned level)
783{ 909{
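
xen_pgd_alloc() stashes the per-mm user pagetable in page->private; the xen_get_user_pgd() helper used above (declared in xen/mmu.h in this series) presumably reads it back, along these lines. This is only a sketch: the bound named USER_LIMIT and the exact pointer arithmetic are assumptions.

static pgd_t *xen_get_user_pgd(pgd_t *pgd)
{
	pgd_t *pgd_page = (pgd_t *)((unsigned long)pgd & PAGE_MASK);
	unsigned offset = pgd - pgd_page;
	pgd_t *user_ptr = NULL;

	if (offset < pgd_index(USER_LIMIT)) {
		struct page *page = virt_to_page(pgd_page);

		user_ptr = (pgd_t *)page->private;
		if (user_ptr)
			user_ptr += offset;
	}

	return user_ptr;
}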
@@ -803,6 +929,18 @@ static void xen_release_pmd(u32 pfn)
803 xen_release_ptpage(pfn, PT_PMD); 929 xen_release_ptpage(pfn, PT_PMD);
804} 930}
805 931
932#if PAGETABLE_LEVELS == 4
933static void xen_alloc_pud(struct mm_struct *mm, u32 pfn)
934{
935 xen_alloc_ptpage(mm, pfn, PT_PUD);
936}
937
938static void xen_release_pud(u32 pfn)
939{
940 xen_release_ptpage(pfn, PT_PUD);
941}
942#endif
943
806#ifdef CONFIG_HIGHPTE 944#ifdef CONFIG_HIGHPTE
807static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) 945static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
808{ 946{
@@ -841,68 +979,16 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
841 979
842static __init void xen_pagetable_setup_start(pgd_t *base) 980static __init void xen_pagetable_setup_start(pgd_t *base)
843{ 981{
844 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
845 int i;
846
847 /* special set_pte for pagetable initialization */
848 pv_mmu_ops.set_pte = xen_set_pte_init;
849
850 init_mm.pgd = base;
851 /*
852 * copy top-level of Xen-supplied pagetable into place. This
853 * is a stand-in while we copy the pmd pages.
854 */
855 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
856
857 /*
858 * For PAE, need to allocate new pmds, rather than
859 * share Xen's, since Xen doesn't like pmd's being
860 * shared between address spaces.
861 */
862 for (i = 0; i < PTRS_PER_PGD; i++) {
863 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
864 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
865
866 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
867 PAGE_SIZE);
868
869 make_lowmem_page_readonly(pmd);
870
871 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
872 } else
873 pgd_clear(&base[i]);
874 }
875
876 /* make sure zero_page is mapped RO so we can use it in pagetables */
877 make_lowmem_page_readonly(empty_zero_page);
878 make_lowmem_page_readonly(base);
879 /*
880 * Switch to new pagetable. This is done before
881 * pagetable_init has done anything so that the new pages
882 * added to the table can be prepared properly for Xen.
883 */
884 xen_write_cr3(__pa(base));
885
886 /* Unpin initial Xen pagetable */
887 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
888 PFN_DOWN(__pa(xen_start_info->pt_base)));
889} 982}
890 983
891void xen_setup_shared_info(void) 984void xen_setup_shared_info(void)
892{ 985{
893 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 986 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
894 unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); 987 set_fixmap(FIX_PARAVIRT_BOOTMAP,
895 988 xen_start_info->shared_info);
896 /* 989
897 * Create a mapping for the shared info page. 990 HYPERVISOR_shared_info =
898 * Should be set_fixmap(), but shared_info is a machine 991 (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
899 * address with no corresponding pseudo-phys address.
900 */
901 set_pte_mfn(addr,
902 PFN_DOWN(xen_start_info->shared_info),
903 PAGE_KERNEL);
904
905 HYPERVISOR_shared_info = (struct shared_info *)addr;
906 } else 992 } else
907 HYPERVISOR_shared_info = 993 HYPERVISOR_shared_info =
908 (struct shared_info *)__va(xen_start_info->shared_info); 994 (struct shared_info *)__va(xen_start_info->shared_info);
@@ -917,26 +1003,32 @@ void xen_setup_shared_info(void)
917 1003
918static __init void xen_pagetable_setup_done(pgd_t *base) 1004static __init void xen_pagetable_setup_done(pgd_t *base)
919{ 1005{
920 /* This will work as long as patching hasn't happened yet
921 (which it hasn't) */
922 pv_mmu_ops.alloc_pte = xen_alloc_pte;
923 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
924 pv_mmu_ops.release_pte = xen_release_pte;
925 pv_mmu_ops.release_pmd = xen_release_pmd;
926 pv_mmu_ops.set_pte = xen_set_pte;
927
928 xen_setup_shared_info(); 1006 xen_setup_shared_info();
929
930 /* Actually pin the pagetable down, but we can't set PG_pinned
931 yet because the page structures don't exist yet. */
932 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
933} 1007}
934 1008
935static __init void xen_post_allocator_init(void) 1009static __init void xen_post_allocator_init(void)
936{ 1010{
1011 pv_mmu_ops.set_pte = xen_set_pte;
937 pv_mmu_ops.set_pmd = xen_set_pmd; 1012 pv_mmu_ops.set_pmd = xen_set_pmd;
938 pv_mmu_ops.set_pud = xen_set_pud; 1013 pv_mmu_ops.set_pud = xen_set_pud;
1014#if PAGETABLE_LEVELS == 4
1015 pv_mmu_ops.set_pgd = xen_set_pgd;
1016#endif
1017
1018 /* This will work as long as patching hasn't happened yet
1019 (which it hasn't) */
1020 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1021 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1022 pv_mmu_ops.release_pte = xen_release_pte;
1023 pv_mmu_ops.release_pmd = xen_release_pmd;
1024#if PAGETABLE_LEVELS == 4
1025 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1026 pv_mmu_ops.release_pud = xen_release_pud;
1027#endif
939 1028
1029#ifdef CONFIG_X86_64
1030 SetPagePinned(virt_to_page(level3_user_vsyscall));
1031#endif
940 xen_mark_init_mm_pinned(); 1032 xen_mark_init_mm_pinned();
941} 1033}
942 1034
@@ -950,6 +1042,7 @@ void xen_setup_vcpu_info_placement(void)
950 1042
951 /* xen_vcpu_setup managed to place the vcpu_info within the 1043 /* xen_vcpu_setup managed to place the vcpu_info within the
952 percpu area for all cpus, so make use of it */ 1044 percpu area for all cpus, so make use of it */
1045#ifdef CONFIG_X86_32
953 if (have_vcpu_info_placement) { 1046 if (have_vcpu_info_placement) {
954 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 1047 printk(KERN_INFO "Xen: using vcpu_info placement\n");
955 1048
@@ -959,6 +1052,7 @@ void xen_setup_vcpu_info_placement(void)
959 pv_irq_ops.irq_enable = xen_irq_enable_direct; 1052 pv_irq_ops.irq_enable = xen_irq_enable_direct;
960 pv_mmu_ops.read_cr2 = xen_read_cr2_direct; 1053 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
961 } 1054 }
1055#endif
962} 1056}
963 1057
964static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, 1058static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -979,10 +1073,12 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
979 goto patch_site 1073 goto patch_site
980 1074
981 switch (type) { 1075 switch (type) {
1076#ifdef CONFIG_X86_32
982 SITE(pv_irq_ops, irq_enable); 1077 SITE(pv_irq_ops, irq_enable);
983 SITE(pv_irq_ops, irq_disable); 1078 SITE(pv_irq_ops, irq_disable);
984 SITE(pv_irq_ops, save_fl); 1079 SITE(pv_irq_ops, save_fl);
985 SITE(pv_irq_ops, restore_fl); 1080 SITE(pv_irq_ops, restore_fl);
1081#endif /* CONFIG_X86_32 */
986#undef SITE 1082#undef SITE
987 1083
988 patch_site: 1084 patch_site:
@@ -1025,8 +1121,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1025#ifdef CONFIG_X86_F00F_BUG 1121#ifdef CONFIG_X86_F00F_BUG
1026 case FIX_F00F_IDT: 1122 case FIX_F00F_IDT:
1027#endif 1123#endif
1124#ifdef CONFIG_X86_32
1028 case FIX_WP_TEST: 1125 case FIX_WP_TEST:
1029 case FIX_VDSO: 1126 case FIX_VDSO:
1127# ifdef CONFIG_HIGHMEM
1128 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1129# endif
1130#else
1131 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1132#endif
1030#ifdef CONFIG_X86_LOCAL_APIC 1133#ifdef CONFIG_X86_LOCAL_APIC
1031 case FIX_APIC_BASE: /* maps dummy local APIC */ 1134 case FIX_APIC_BASE: /* maps dummy local APIC */
1032#endif 1135#endif
@@ -1039,6 +1142,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1039 } 1142 }
1040 1143
1041 __native_set_fixmap(idx, pte); 1144 __native_set_fixmap(idx, pte);
1145
1146#ifdef CONFIG_X86_64
1147 /* Replicate changes to map the vsyscall page into the user
1148 pagetable vsyscall mapping. */
1149 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1150 unsigned long vaddr = __fix_to_virt(idx);
1151 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1152 }
1153#endif
1042} 1154}
1043 1155
1044static const struct pv_info xen_info __initdata = { 1156static const struct pv_info xen_info __initdata = {
@@ -1084,18 +1196,25 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1084 .wbinvd = native_wbinvd, 1196 .wbinvd = native_wbinvd,
1085 1197
1086 .read_msr = native_read_msr_safe, 1198 .read_msr = native_read_msr_safe,
1087 .write_msr = native_write_msr_safe, 1199 .write_msr = xen_write_msr_safe,
1088 .read_tsc = native_read_tsc, 1200 .read_tsc = native_read_tsc,
1089 .read_pmc = native_read_pmc, 1201 .read_pmc = native_read_pmc,
1090 1202
1091 .iret = xen_iret, 1203 .iret = xen_iret,
1092 .irq_enable_sysexit = xen_sysexit, 1204 .irq_enable_sysexit = xen_sysexit,
1205#ifdef CONFIG_X86_64
1206 .usergs_sysret32 = xen_sysret32,
1207 .usergs_sysret64 = xen_sysret64,
1208#endif
1093 1209
1094 .load_tr_desc = paravirt_nop, 1210 .load_tr_desc = paravirt_nop,
1095 .set_ldt = xen_set_ldt, 1211 .set_ldt = xen_set_ldt,
1096 .load_gdt = xen_load_gdt, 1212 .load_gdt = xen_load_gdt,
1097 .load_idt = xen_load_idt, 1213 .load_idt = xen_load_idt,
1098 .load_tls = xen_load_tls, 1214 .load_tls = xen_load_tls,
1215#ifdef CONFIG_X86_64
1216 .load_gs_index = xen_load_gs_index,
1217#endif
1099 1218
1100 .store_gdt = native_store_gdt, 1219 .store_gdt = native_store_gdt,
1101 .store_idt = native_store_idt, 1220 .store_idt = native_store_idt,
@@ -1109,14 +1228,34 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1109 .set_iopl_mask = xen_set_iopl_mask, 1228 .set_iopl_mask = xen_set_iopl_mask,
1110 .io_delay = xen_io_delay, 1229 .io_delay = xen_io_delay,
1111 1230
1231 /* Xen takes care of %gs when switching to usermode for us */
1232 .swapgs = paravirt_nop,
1233
1112 .lazy_mode = { 1234 .lazy_mode = {
1113 .enter = paravirt_enter_lazy_cpu, 1235 .enter = paravirt_enter_lazy_cpu,
1114 .leave = xen_leave_lazy, 1236 .leave = xen_leave_lazy,
1115 }, 1237 },
1116}; 1238};
1117 1239
1240static void __init __xen_init_IRQ(void)
1241{
1242#ifdef CONFIG_X86_64
1243 int i;
1244
1245 /* Create identity vector->irq map */
1246 for(i = 0; i < NR_VECTORS; i++) {
1247 int cpu;
1248
1249 for_each_possible_cpu(cpu)
1250 per_cpu(vector_irq, cpu)[i] = i;
1251 }
1252#endif /* CONFIG_X86_64 */
1253
1254 xen_init_IRQ();
1255}
1256
1118static const struct pv_irq_ops xen_irq_ops __initdata = { 1257static const struct pv_irq_ops xen_irq_ops __initdata = {
1119 .init_IRQ = xen_init_IRQ, 1258 .init_IRQ = __xen_init_IRQ,
1120 .save_fl = xen_save_fl, 1259 .save_fl = xen_save_fl,
1121 .restore_fl = xen_restore_fl, 1260 .restore_fl = xen_restore_fl,
1122 .irq_disable = xen_irq_disable, 1261 .irq_disable = xen_irq_disable,
@@ -1124,7 +1263,7 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
1124 .safe_halt = xen_safe_halt, 1263 .safe_halt = xen_safe_halt,
1125 .halt = xen_halt, 1264 .halt = xen_halt,
1126#ifdef CONFIG_X86_64 1265#ifdef CONFIG_X86_64
1127 .adjust_exception_frame = paravirt_nop, 1266 .adjust_exception_frame = xen_adjust_exception_frame,
1128#endif 1267#endif
1129}; 1268};
1130 1269
@@ -1157,8 +1296,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1157 .pte_update = paravirt_nop, 1296 .pte_update = paravirt_nop,
1158 .pte_update_defer = paravirt_nop, 1297 .pte_update_defer = paravirt_nop,
1159 1298
1160 .pgd_alloc = __paravirt_pgd_alloc, 1299 .pgd_alloc = xen_pgd_alloc,
1161 .pgd_free = paravirt_nop, 1300 .pgd_free = xen_pgd_free,
1162 1301
1163 .alloc_pte = xen_alloc_pte_init, 1302 .alloc_pte = xen_alloc_pte_init,
1164 .release_pte = xen_release_pte_init, 1303 .release_pte = xen_release_pte_init,
@@ -1170,7 +1309,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1170 .kmap_atomic_pte = xen_kmap_atomic_pte, 1309 .kmap_atomic_pte = xen_kmap_atomic_pte,
1171#endif 1310#endif
1172 1311
1173 .set_pte = NULL, /* see xen_pagetable_setup_* */ 1312#ifdef CONFIG_X86_64
1313 .set_pte = xen_set_pte,
1314#else
1315 .set_pte = xen_set_pte_init,
1316#endif
1174 .set_pte_at = xen_set_pte_at, 1317 .set_pte_at = xen_set_pte_at,
1175 .set_pmd = xen_set_pmd_hyper, 1318 .set_pmd = xen_set_pmd_hyper,
1176 1319
@@ -1184,15 +1327,26 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1184 .make_pte = xen_make_pte, 1327 .make_pte = xen_make_pte,
1185 .make_pgd = xen_make_pgd, 1328 .make_pgd = xen_make_pgd,
1186 1329
1330#ifdef CONFIG_X86_PAE
1187 .set_pte_atomic = xen_set_pte_atomic, 1331 .set_pte_atomic = xen_set_pte_atomic,
1188 .set_pte_present = xen_set_pte_at, 1332 .set_pte_present = xen_set_pte_at,
1189 .set_pud = xen_set_pud_hyper,
1190 .pte_clear = xen_pte_clear, 1333 .pte_clear = xen_pte_clear,
1191 .pmd_clear = xen_pmd_clear, 1334 .pmd_clear = xen_pmd_clear,
1335#endif /* CONFIG_X86_PAE */
1336 .set_pud = xen_set_pud_hyper,
1192 1337
1193 .make_pmd = xen_make_pmd, 1338 .make_pmd = xen_make_pmd,
1194 .pmd_val = xen_pmd_val, 1339 .pmd_val = xen_pmd_val,
1195 1340
1341#if PAGETABLE_LEVELS == 4
1342 .pud_val = xen_pud_val,
1343 .make_pud = xen_make_pud,
1344 .set_pgd = xen_set_pgd_hyper,
1345
1346 .alloc_pud = xen_alloc_pte_init,
1347 .release_pud = xen_release_pte_init,
1348#endif /* PAGETABLE_LEVELS == 4 */
1349
1196 .activate_mm = xen_activate_mm, 1350 .activate_mm = xen_activate_mm,
1197 .dup_mmap = xen_dup_mmap, 1351 .dup_mmap = xen_dup_mmap,
1198 .exit_mmap = xen_exit_mmap, 1352 .exit_mmap = xen_exit_mmap,
@@ -1205,21 +1359,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1205 .set_fixmap = xen_set_fixmap, 1359 .set_fixmap = xen_set_fixmap,
1206}; 1360};
1207 1361
1208#ifdef CONFIG_SMP
1209static const struct smp_ops xen_smp_ops __initdata = {
1210 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
1211 .smp_prepare_cpus = xen_smp_prepare_cpus,
1212 .cpu_up = xen_cpu_up,
1213 .smp_cpus_done = xen_smp_cpus_done,
1214
1215 .smp_send_stop = xen_smp_send_stop,
1216 .smp_send_reschedule = xen_smp_send_reschedule,
1217
1218 .send_call_func_ipi = xen_smp_send_call_function_ipi,
1219 .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
1220};
1221#endif /* CONFIG_SMP */
1222
1223static void xen_reboot(int reason) 1362static void xen_reboot(int reason)
1224{ 1363{
1225 struct sched_shutdown r = { .reason = reason }; 1364 struct sched_shutdown r = { .reason = reason };
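
The static xen_smp_ops table moves out of enlighten.c; xen_start_kernel() now calls xen_smp_init() instead (see the final hunk of this file), which is assumed to live in arch/x86/xen/smp.c and register the ops itself, roughly:

/* arch/x86/xen/smp.c (sketch, assumed) */
void __init xen_smp_init(void)
{
	smp_ops = xen_smp_ops;		/* same ops table, now local */
	xen_fill_possible_map();
}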
@@ -1264,6 +1403,7 @@ static const struct machine_ops __initdata xen_machine_ops = {
1264 1403
1265static void __init xen_reserve_top(void) 1404static void __init xen_reserve_top(void)
1266{ 1405{
1406#ifdef CONFIG_X86_32
1267 unsigned long top = HYPERVISOR_VIRT_START; 1407 unsigned long top = HYPERVISOR_VIRT_START;
1268 struct xen_platform_parameters pp; 1408 struct xen_platform_parameters pp;
1269 1409
@@ -1271,7 +1411,247 @@ static void __init xen_reserve_top(void)
1271 top = pp.virt_start; 1411 top = pp.virt_start;
1272 1412
1273 reserve_top_address(-top + 2 * PAGE_SIZE); 1413 reserve_top_address(-top + 2 * PAGE_SIZE);
1414#endif /* CONFIG_X86_32 */
1415}
1416
1417/*
1418 * Like __va(), but returns address in the kernel mapping (which is
1419 * all we have until the physical memory mapping has been set up.
1420 */
1421static void *__ka(phys_addr_t paddr)
1422{
1423#ifdef CONFIG_X86_64
1424 return (void *)(paddr + __START_KERNEL_map);
1425#else
1426 return __va(paddr);
1427#endif
1428}
1429
1430/* Convert a machine address to physical address */
1431static unsigned long m2p(phys_addr_t maddr)
1432{
1433 phys_addr_t paddr;
1434
1435 maddr &= PTE_MASK;
1436 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1437
1438 return paddr;
1439}
1440
1441/* Convert a machine address to kernel virtual */
1442static void *m2v(phys_addr_t maddr)
1443{
1444 return __ka(m2p(maddr));
1445}
1446
1447#ifdef CONFIG_X86_64
1448static void walk(pgd_t *pgd, unsigned long addr)
1449{
1450 unsigned l4idx = pgd_index(addr);
1451 unsigned l3idx = pud_index(addr);
1452 unsigned l2idx = pmd_index(addr);
1453 unsigned l1idx = pte_index(addr);
1454 pgd_t l4;
1455 pud_t l3;
1456 pmd_t l2;
1457 pte_t l1;
1458
1459 xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
1460 pgd, addr, l4idx, l3idx, l2idx, l1idx);
1461
1462 l4 = pgd[l4idx];
1463 xen_raw_printk(" l4: %016lx\n", l4.pgd);
1464 xen_raw_printk(" %016lx\n", pgd_val(l4));
1465
1466 l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
1467 xen_raw_printk(" l3: %016lx\n", l3.pud);
1468 xen_raw_printk(" %016lx\n", pud_val(l3));
1469
1470 l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
1471 xen_raw_printk(" l2: %016lx\n", l2.pmd);
1472 xen_raw_printk(" %016lx\n", pmd_val(l2));
1473
1474 l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
1475 xen_raw_printk(" l1: %016lx\n", l1.pte);
1476 xen_raw_printk(" %016lx\n", pte_val(l1));
1477}
1478#endif
1479
1480static void set_page_prot(void *addr, pgprot_t prot)
1481{
1482 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1483 pte_t pte = pfn_pte(pfn, prot);
1484
1485 xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
1486 addr, pfn, get_phys_to_machine(pfn),
1487 pgprot_val(prot), pte.pte);
1488
1489 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1490 BUG();
1491}
1492
1493static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1494{
1495 unsigned pmdidx, pteidx;
1496 unsigned ident_pte;
1497 unsigned long pfn;
1498
1499 ident_pte = 0;
1500 pfn = 0;
1501 for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1502 pte_t *pte_page;
1503
1504 /* Reuse or allocate a page of ptes */
1505 if (pmd_present(pmd[pmdidx]))
1506 pte_page = m2v(pmd[pmdidx].pmd);
1507 else {
1508 /* Check for free pte pages */
1509 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1510 break;
1511
1512 pte_page = &level1_ident_pgt[ident_pte];
1513 ident_pte += PTRS_PER_PTE;
1514
1515 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1516 }
1517
1518 /* Install mappings */
1519 for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1520 pte_t pte;
1521
1522 if (pfn > max_pfn_mapped)
1523 max_pfn_mapped = pfn;
1524
1525 if (!pte_none(pte_page[pteidx]))
1526 continue;
1527
1528 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1529 pte_page[pteidx] = pte;
1530 }
1531 }
1532
1533 for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1534 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1535
1536 set_page_prot(pmd, PAGE_KERNEL_RO);
1537}
1538
1539#ifdef CONFIG_X86_64
1540static void convert_pfn_mfn(void *v)
1541{
1542 pte_t *pte = v;
1543 int i;
1544
1545 /* All levels are converted the same way, so just treat them
1546 as ptes. */
1547 for(i = 0; i < PTRS_PER_PTE; i++)
1548 pte[i] = xen_make_pte(pte[i].pte);
1549}
1550
1551/*
 1552 * Set up the initial kernel pagetable.
1553 *
1554 * We can construct this by grafting the Xen provided pagetable into
1555 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1556 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1557 * means that only the kernel has a physical mapping to start with -
1558 * but that's enough to get __va working. We need to fill in the rest
1559 * of the physical mapping once some sort of allocator has been set
1560 * up.
1561 */
1562static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1563{
1564 pud_t *l3;
1565 pmd_t *l2;
1566
1567 /* Zap identity mapping */
1568 init_level4_pgt[0] = __pgd(0);
1569
1570 /* Pre-constructed entries are in pfn, so convert to mfn */
1571 convert_pfn_mfn(init_level4_pgt);
1572 convert_pfn_mfn(level3_ident_pgt);
1573 convert_pfn_mfn(level3_kernel_pgt);
1574
1575 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1576 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1577
1578 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1579 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1580
1581 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1582 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1583 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1584
1585 /* Set up identity map */
1586 xen_map_identity_early(level2_ident_pgt, max_pfn);
1587
1588 /* Make pagetable pieces RO */
1589 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1590 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1591 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1592 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1593 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1594 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1595
1596 /* Pin down new L4 */
1597 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1598 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1599
1600 /* Unpin Xen-provided one */
1601 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1602
1603 /* Switch over */
1604 pgd = init_level4_pgt;
1605
1606 /*
1607 * At this stage there can be no user pgd, and no page
1608 * structure to attach it to, so make sure we just set kernel
1609 * pgd.
1610 */
1611 xen_mc_batch();
1612 __xen_write_cr3(true, __pa(pgd));
1613 xen_mc_issue(PARAVIRT_LAZY_CPU);
1614
1615 reserve_early(__pa(xen_start_info->pt_base),
1616 __pa(xen_start_info->pt_base +
1617 xen_start_info->nr_pt_frames * PAGE_SIZE),
1618 "XEN PAGETABLES");
1619
1620 return pgd;
1621}
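The grafting works because the whole kernel text mapping lives under a single top-level slot: with the conventional __START_KERNEL_map value the pgd index is 511 and the pud index 510, so copying one L3 page and a couple of L2 pages covers it. A quick check of that arithmetic (userspace C; the address constant is assumed for illustration, not read from this tree):

#include <stdio.h>
#include <stdint.h>

#define START_KERNEL_MAP 0xffffffff80000000ULL   /* conventional value, sketch */

int main(void)
{
        uint64_t va = START_KERNEL_MAP;

        printf("pgd index %llu, pud index %llu\n",
               (unsigned long long)((va >> 39) & 0x1ff),
               (unsigned long long)((va >> 30) & 0x1ff));
        /* Prints 511 and 510: the kernel mapping sits at the very top of
         * the address space, which is why a single L3/L2 graft is enough. */
        return 0;
}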
1622#else /* !CONFIG_X86_64 */
1623static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1624
1625static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1626{
1627 pmd_t *kernel_pmd;
1628
1629 init_pg_tables_start = __pa(pgd);
1630 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1631 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1632
1633 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1634 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1635
1636 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1637
1638 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1639 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1640 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1641
1642 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1643 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1644 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1645
1646 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1647
1648 xen_write_cr3(__pa(swapper_pg_dir));
1649
1650 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1651
1652 return swapper_pg_dir;
1274} 1653}
1654#endif /* CONFIG_X86_64 */
1275 1655
1276/* First C function to be called on Xen boot */ 1656/* First C function to be called on Xen boot */
1277asmlinkage void __init xen_start_kernel(void) 1657asmlinkage void __init xen_start_kernel(void)
@@ -1301,53 +1681,56 @@ asmlinkage void __init xen_start_kernel(void)
1301 1681
1302 machine_ops = xen_machine_ops; 1682 machine_ops = xen_machine_ops;
1303 1683
1304#ifdef CONFIG_SMP 1684#ifdef CONFIG_X86_64
1305 smp_ops = xen_smp_ops; 1685 /* Disable until direct per-cpu data access. */
1686 have_vcpu_info_placement = 0;
1687 x86_64_init_pda();
1306#endif 1688#endif
1307 1689
1690 xen_smp_init();
1691
1308 /* Get mfn list */ 1692 /* Get mfn list */
1309 if (!xen_feature(XENFEAT_auto_translated_physmap)) 1693 if (!xen_feature(XENFEAT_auto_translated_physmap))
1310 xen_build_dynamic_phys_to_machine(); 1694 xen_build_dynamic_phys_to_machine();
1311 1695
1312 pgd = (pgd_t *)xen_start_info->pt_base; 1696 pgd = (pgd_t *)xen_start_info->pt_base;
1313 1697
1314 init_pg_tables_start = __pa(pgd); 1698 /* Prevent unwanted bits from being set in PTEs. */
1315 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; 1699 __supported_pte_mask &= ~_PAGE_GLOBAL;
1316 max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT; 1700 if (!is_initial_xendomain())
1317 1701 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1318 init_mm.pgd = pgd; /* use the Xen pagetables to start */
1319
1320 /* keep using Xen gdt for now; no urgent need to change it */
1321
1322 x86_write_percpu(xen_cr3, __pa(pgd));
1323 x86_write_percpu(xen_current_cr3, __pa(pgd));
1324 1702
1325 /* Don't do the full vcpu_info placement stuff until we have a 1703 /* Don't do the full vcpu_info placement stuff until we have a
1326 possible map and a non-dummy shared_info. */ 1704 possible map and a non-dummy shared_info. */
1327 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1705 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1328 1706
1707 xen_raw_console_write("mapping kernel into physical memory\n");
1708 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
1709
1710 init_mm.pgd = pgd;
1711
1712 /* keep using Xen gdt for now; no urgent need to change it */
1713
1329 pv_info.kernel_rpl = 1; 1714 pv_info.kernel_rpl = 1;
1330 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1715 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1331 pv_info.kernel_rpl = 0; 1716 pv_info.kernel_rpl = 0;
1332 1717
1333 /* Prevent unwanted bits from being set in PTEs. */
1334 __supported_pte_mask &= ~_PAGE_GLOBAL;
1335 if (!is_initial_xendomain())
1336 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1337
1338 /* set the limit of our address space */ 1718 /* set the limit of our address space */
1339 xen_reserve_top(); 1719 xen_reserve_top();
1340 1720
1721#ifdef CONFIG_X86_32
1341 /* set up basic CPUID stuff */ 1722 /* set up basic CPUID stuff */
1342 cpu_detect(&new_cpu_data); 1723 cpu_detect(&new_cpu_data);
1343 new_cpu_data.hard_math = 1; 1724 new_cpu_data.hard_math = 1;
1344 new_cpu_data.x86_capability[0] = cpuid_edx(1); 1725 new_cpu_data.x86_capability[0] = cpuid_edx(1);
1726#endif
1345 1727
1346 /* Poke various useful things into boot_params */ 1728 /* Poke various useful things into boot_params */
1347 boot_params.hdr.type_of_loader = (9 << 4) | 0; 1729 boot_params.hdr.type_of_loader = (9 << 4) | 0;
1348 boot_params.hdr.ramdisk_image = xen_start_info->mod_start 1730 boot_params.hdr.ramdisk_image = xen_start_info->mod_start
1349 ? __pa(xen_start_info->mod_start) : 0; 1731 ? __pa(xen_start_info->mod_start) : 0;
1350 boot_params.hdr.ramdisk_size = xen_start_info->mod_len; 1732 boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1733 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1351 1734
1352 if (!is_initial_xendomain()) { 1735 if (!is_initial_xendomain()) {
1353 add_preferred_console("xenboot", 0, NULL); 1736 add_preferred_console("xenboot", 0, NULL);
@@ -1355,6 +1738,21 @@ asmlinkage void __init xen_start_kernel(void)
1355 add_preferred_console("hvc", 0, NULL); 1738 add_preferred_console("hvc", 0, NULL);
1356 } 1739 }
1357 1740
1741 xen_raw_console_write("about to get started...\n");
1742
1743#if 0
1744 xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
1745 &boot_params, __pa_symbol(&boot_params),
1746 __va(__pa_symbol(&boot_params)));
1747
1748 walk(pgd, &boot_params);
1749 walk(pgd, __va(__pa(&boot_params)));
1750#endif
1751
1358 /* Start the world */ 1752 /* Start the world */
1753#ifdef CONFIG_X86_32
1359 i386_start_kernel(); 1754 i386_start_kernel();
1755#else
1756 x86_64_start_reservations((char *)__pa_symbol(&boot_params));
1757#endif
1360} 1758}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index ff0aa74afaa1..a44d56e38bd1 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -44,8 +44,10 @@
44 44
45#include <asm/pgtable.h> 45#include <asm/pgtable.h>
46#include <asm/tlbflush.h> 46#include <asm/tlbflush.h>
47#include <asm/fixmap.h>
47#include <asm/mmu_context.h> 48#include <asm/mmu_context.h>
48#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/linkage.h>
49 51
50#include <asm/xen/hypercall.h> 52#include <asm/xen/hypercall.h>
51#include <asm/xen/hypervisor.h> 53#include <asm/xen/hypervisor.h>
@@ -56,26 +58,29 @@
56#include "multicalls.h" 58#include "multicalls.h"
57#include "mmu.h" 59#include "mmu.h"
58 60
61/*
62 * Just beyond the highest usermode address. STACK_TOP_MAX has a
63 * redzone above it, so round it up to a PGD boundary.
64 */
65#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
66
67
59#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) 68#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
60#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) 69#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
61 70
62/* Placeholder for holes in the address space */ 71/* Placeholder for holes in the address space */
63static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] 72static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
64 __attribute__((section(".data.page_aligned"))) =
65 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; 73 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
66 74
67 /* Array of pointers to pages containing p2m entries */ 75 /* Array of pointers to pages containing p2m entries */
68static unsigned long *p2m_top[TOP_ENTRIES] 76static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
69 __attribute__((section(".data.page_aligned"))) =
70 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; 77 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
71 78
72/* Arrays of p2m arrays expressed in mfns used for save/restore */ 79/* Arrays of p2m arrays expressed in mfns used for save/restore */
73static unsigned long p2m_top_mfn[TOP_ENTRIES] 80static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
74 __attribute__((section(".bss.page_aligned")));
75 81
76static unsigned long p2m_top_mfn_list[ 82static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
77 PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)] 83 __page_aligned_bss;
78 __attribute__((section(".bss.page_aligned")));
79 84
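The p2m structures declared above form a two-level table: p2m_top[] points at pages of machine frame numbers, and every unpopulated slot shares p2m_missing. A standalone model of the lookup (userspace C with GCC range initializers; the sizes are shrunk and the names mirror the kernel's only for readability):

#include <stdio.h>

#define P2M_ENTRIES_PER_PAGE 1024      /* 4096 / sizeof(unsigned long), 32-bit sketch */
#define TOP_ENTRIES          4

static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] =
        { [0 ... P2M_ENTRIES_PER_PAGE - 1] = ~0UL };
static unsigned long *p2m_top[TOP_ENTRIES] =
        { [0 ... TOP_ENTRIES - 1] = p2m_missing };

static unsigned long get_phys_to_machine(unsigned long pfn)
{
        return p2m_top[pfn / P2M_ENTRIES_PER_PAGE][pfn % P2M_ENTRIES_PER_PAGE];
}

int main(void)
{
        static unsigned long page0[P2M_ENTRIES_PER_PAGE];

        page0[5] = 1234;               /* pretend pfn 5 maps to mfn 1234 */
        p2m_top[0] = page0;

        printf("pfn 5 -> mfn %lu, pfn 2000 -> %lx (missing)\n",
               get_phys_to_machine(5), get_phys_to_machine(2000));
        return 0;
}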
80static inline unsigned p2m_top_index(unsigned long pfn) 85static inline unsigned p2m_top_index(unsigned long pfn)
81{ 86{
@@ -181,15 +186,16 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
181 p2m_top[topidx][idx] = mfn; 186 p2m_top[topidx][idx] = mfn;
182} 187}
183 188
184xmaddr_t arbitrary_virt_to_machine(unsigned long address) 189xmaddr_t arbitrary_virt_to_machine(void *vaddr)
185{ 190{
191 unsigned long address = (unsigned long)vaddr;
186 unsigned int level; 192 unsigned int level;
187 pte_t *pte = lookup_address(address, &level); 193 pte_t *pte = lookup_address(address, &level);
188 unsigned offset = address & ~PAGE_MASK; 194 unsigned offset = address & ~PAGE_MASK;
189 195
190 BUG_ON(pte == NULL); 196 BUG_ON(pte == NULL);
191 197
192 return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); 198 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
193} 199}
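The phys_addr_t cast added to arbitrary_virt_to_machine() matters on 32-bit PAE, where an mfn can exceed 2^20 and shifting it left by PAGE_SHIFT in an unsigned long would truncate the machine address. A small demonstration of the overflow (userspace C; the mfn value is an arbitrary example):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t mfn = 0x150000;              /* a frame above 4 GiB: mfn >= 2^20 */
        unsigned offset = 0x123;

        uint32_t wrong = (mfn << 12) + offset;           /* truncated in 32 bits */
        uint64_t right = ((uint64_t)mfn << 12) + offset; /* phys_addr_t-style cast */

        printf("truncated: %#x, correct: %#llx\n",
               wrong, (unsigned long long)right);
        return 0;
}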
194 200
195void make_lowmem_page_readonly(void *vaddr) 201void make_lowmem_page_readonly(void *vaddr)
@@ -256,7 +262,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
256 262
257 xen_mc_batch(); 263 xen_mc_batch();
258 264
259 u.ptr = virt_to_machine(ptr).maddr; 265 /* ptr may be ioremapped for 64-bit pagetable setup */
266 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
260 u.val = pmd_val_ma(val); 267 u.val = pmd_val_ma(val);
261 extend_mmu_update(&u); 268 extend_mmu_update(&u);
262 269
@@ -283,35 +290,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
283 */ 290 */
284void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) 291void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
285{ 292{
286 pgd_t *pgd; 293 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
287 pud_t *pud;
288 pmd_t *pmd;
289 pte_t *pte;
290
291 pgd = swapper_pg_dir + pgd_index(vaddr);
292 if (pgd_none(*pgd)) {
293 BUG();
294 return;
295 }
296 pud = pud_offset(pgd, vaddr);
297 if (pud_none(*pud)) {
298 BUG();
299 return;
300 }
301 pmd = pmd_offset(pud, vaddr);
302 if (pmd_none(*pmd)) {
303 BUG();
304 return;
305 }
306 pte = pte_offset_kernel(pmd, vaddr);
307 /* <mfn,flags> stored as-is, to permit clearing entries */
308 xen_set_pte(pte, mfn_pte(mfn, flags));
309
310 /*
311 * It's enough to flush this one mapping.
312 * (PGE mappings get flushed as well)
313 */
314 __flush_tlb_one(vaddr);
315} 294}
316 295
317void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 296void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -418,7 +397,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
418 397
419 xen_mc_batch(); 398 xen_mc_batch();
420 399
421 u.ptr = virt_to_machine(ptr).maddr; 400 /* ptr may be ioremapped for 64-bit pagetable setup */
401 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
422 u.val = pud_val_ma(val); 402 u.val = pud_val_ma(val);
423 extend_mmu_update(&u); 403 extend_mmu_update(&u);
424 404
@@ -441,14 +421,19 @@ void xen_set_pud(pud_t *ptr, pud_t val)
441 421
442void xen_set_pte(pte_t *ptep, pte_t pte) 422void xen_set_pte(pte_t *ptep, pte_t pte)
443{ 423{
424#ifdef CONFIG_X86_PAE
444 ptep->pte_high = pte.pte_high; 425 ptep->pte_high = pte.pte_high;
445 smp_wmb(); 426 smp_wmb();
446 ptep->pte_low = pte.pte_low; 427 ptep->pte_low = pte.pte_low;
428#else
429 *ptep = pte;
430#endif
447} 431}
448 432
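The PAE branch above stores the two 32-bit halves separately, high word first, with a write barrier in between: for an entry that was previously clear, the present bit lives in the low half, so the mapping only becomes visible once the final word lands; entries that may already be live go through xen_set_pte_atomic()/set_64bit() instead. A standalone sketch of the ordering (userspace C; barrier() here is only a compiler barrier standing in for smp_wmb()):

#include <stdint.h>
#include <stdio.h>

struct pae_pte {
        uint32_t pte_low;
        uint32_t pte_high;
};

#define barrier() __asm__ __volatile__("" ::: "memory")

static void set_pte_split(volatile struct pae_pte *ptep, uint64_t val)
{
        /* High word first: while only the high half has been updated, a
         * previously-clear entry still has present=0 in its low half, so
         * no intermediate value is ever a visible mapping. */
        ptep->pte_high = val >> 32;
        barrier();
        ptep->pte_low = (uint32_t)val;
}

int main(void)
{
        struct pae_pte pte = { 0, 0 };

        set_pte_split(&pte, 0x0000000180000063ULL);
        printf("low=%#x high=%#x\n", pte.pte_low, pte.pte_high);
        return 0;
}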
433#ifdef CONFIG_X86_PAE
449void xen_set_pte_atomic(pte_t *ptep, pte_t pte) 434void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
450{ 435{
451 set_64bit((u64 *)ptep, pte_val_ma(pte)); 436 set_64bit((u64 *)ptep, native_pte_val(pte));
452} 437}
453 438
454void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 439void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -462,6 +447,7 @@ void xen_pmd_clear(pmd_t *pmdp)
462{ 447{
463 set_pmd(pmdp, __pmd(0)); 448 set_pmd(pmdp, __pmd(0));
464} 449}
450#endif /* CONFIG_X86_PAE */
465 451
466pmd_t xen_make_pmd(pmdval_t pmd) 452pmd_t xen_make_pmd(pmdval_t pmd)
467{ 453{
@@ -469,78 +455,189 @@ pmd_t xen_make_pmd(pmdval_t pmd)
469 return native_make_pmd(pmd); 455 return native_make_pmd(pmd);
470} 456}
471 457
458#if PAGETABLE_LEVELS == 4
459pudval_t xen_pud_val(pud_t pud)
460{
461 return pte_mfn_to_pfn(pud.pud);
462}
463
464pud_t xen_make_pud(pudval_t pud)
465{
466 pud = pte_pfn_to_mfn(pud);
467
468 return native_make_pud(pud);
469}
470
471pgd_t *xen_get_user_pgd(pgd_t *pgd)
472{
473 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
474 unsigned offset = pgd - pgd_page;
475 pgd_t *user_ptr = NULL;
476
477 if (offset < pgd_index(USER_LIMIT)) {
478 struct page *page = virt_to_page(pgd_page);
479 user_ptr = (pgd_t *)page->private;
480 if (user_ptr)
481 user_ptr += offset;
482 }
483
484 return user_ptr;
485}
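xen_get_user_pgd() recovers the slot number by masking the pointer down to its page and taking the pointer difference; the user pgd, if any, lives at the same slot of the page stashed in page->private. A minimal model of the mask-and-subtract arithmetic (userspace C; 512 eight-byte entries per page assumed):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define PAGE_SIZE  4096UL
#define PAGE_MASK  (~(PAGE_SIZE - 1))

typedef struct { uint64_t val; } pgd_t;

int main(void)
{
        /* A page-aligned "pgd page" of 512 entries. */
        pgd_t *pgd_page = aligned_alloc(PAGE_SIZE, PAGE_SIZE);
        pgd_t *entry = &pgd_page[137];

        pgd_t *base = (pgd_t *)((uintptr_t)entry & PAGE_MASK);
        unsigned offset = entry - base;        /* pointer difference in entries */

        printf("recovered base %p (orig %p), slot %u\n",
               (void *)base, (void *)pgd_page, offset);
        free(pgd_page);
        return 0;
}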
486
487static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
488{
489 struct mmu_update u;
490
491 u.ptr = virt_to_machine(ptr).maddr;
492 u.val = pgd_val_ma(val);
493 extend_mmu_update(&u);
494}
495
496/*
497 * Raw hypercall-based set_pgd, intended for in early boot before
498 * there's a page structure. This implies:
499 * 1. The only existing pagetable is the kernel's
500 * 2. It is always pinned
501 * 3. It has no user pagetable attached to it
502 */
503void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
504{
505 preempt_disable();
506
507 xen_mc_batch();
508
509 __xen_set_pgd_hyper(ptr, val);
510
511 xen_mc_issue(PARAVIRT_LAZY_MMU);
512
513 preempt_enable();
514}
515
516void xen_set_pgd(pgd_t *ptr, pgd_t val)
517{
518 pgd_t *user_ptr = xen_get_user_pgd(ptr);
519
520 /* If page is not pinned, we can just update the entry
521 directly */
522 if (!page_pinned(ptr)) {
523 *ptr = val;
524 if (user_ptr) {
525 WARN_ON(page_pinned(user_ptr));
526 *user_ptr = val;
527 }
528 return;
529 }
530
531 /* If it's pinned, then we can at least batch the kernel and
532 user updates together. */
533 xen_mc_batch();
534
535 __xen_set_pgd_hyper(ptr, val);
536 if (user_ptr)
537 __xen_set_pgd_hyper(user_ptr, val);
538
539 xen_mc_issue(PARAVIRT_LAZY_MMU);
540}
541#endif /* PAGETABLE_LEVELS == 4 */
542
472/* 543/*
473 (Yet another) pagetable walker. This one is intended for pinning a 544 * (Yet another) pagetable walker. This one is intended for pinning a
474 pagetable. This means that it walks a pagetable and calls the 545 * pagetable. This means that it walks a pagetable and calls the
475 callback function on each page it finds making up the page table, 546 * callback function on each page it finds making up the page table,
476 at every level. It walks the entire pagetable, but it only bothers 547 * at every level. It walks the entire pagetable, but it only bothers
477 pinning pte pages which are below pte_limit. In the normal case 548 * pinning pte pages which are below limit. In the normal case this
478 this will be TASK_SIZE, but at boot we need to pin up to 549 * will be STACK_TOP_MAX, but at boot we need to pin up to
479 FIXADDR_TOP. But the important bit is that we don't pin beyond 550 * FIXADDR_TOP.
480 there, because then we start getting into Xen's ptes. 551 *
481*/ 552 * For 32-bit the important bit is that we don't pin beyond there,
482static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), 553 * because then we start getting into Xen's ptes.
554 *
555 * For 64-bit, we must skip the Xen hole in the middle of the address
556 * space, just after the big x86-64 virtual hole.
557 */
558static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
483 unsigned long limit) 559 unsigned long limit)
484{ 560{
485 pgd_t *pgd = pgd_base;
486 int flush = 0; 561 int flush = 0;
487 unsigned long addr = 0; 562 unsigned hole_low, hole_high;
488 unsigned long pgd_next; 563 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
564 unsigned pgdidx, pudidx, pmdidx;
489 565
490 BUG_ON(limit > FIXADDR_TOP); 566 /* The limit is the last byte to be touched */
567 limit--;
568 BUG_ON(limit >= FIXADDR_TOP);
491 569
492 if (xen_feature(XENFEAT_auto_translated_physmap)) 570 if (xen_feature(XENFEAT_auto_translated_physmap))
493 return 0; 571 return 0;
494 572
495 for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { 573 /*
574 * 64-bit has a great big hole in the middle of the address
575	 * space, which contains the Xen mappings. On 32-bit this
576	 * ends up being a zero-sized hole, so the skip is a no-op.
577 */
578 hole_low = pgd_index(USER_LIMIT);
579 hole_high = pgd_index(PAGE_OFFSET);
580
581 pgdidx_limit = pgd_index(limit);
582#if PTRS_PER_PUD > 1
583 pudidx_limit = pud_index(limit);
584#else
585 pudidx_limit = 0;
586#endif
587#if PTRS_PER_PMD > 1
588 pmdidx_limit = pmd_index(limit);
589#else
590 pmdidx_limit = 0;
591#endif
592
593 flush |= (*func)(virt_to_page(pgd), PT_PGD);
594
595 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
496 pud_t *pud; 596 pud_t *pud;
497 unsigned long pud_limit, pud_next;
498 597
499 pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); 598 if (pgdidx >= hole_low && pgdidx < hole_high)
599 continue;
500 600
501 if (!pgd_val(*pgd)) 601 if (!pgd_val(pgd[pgdidx]))
502 continue; 602 continue;
503 603
504 pud = pud_offset(pgd, 0); 604 pud = pud_offset(&pgd[pgdidx], 0);
505 605
506 if (PTRS_PER_PUD > 1) /* not folded */ 606 if (PTRS_PER_PUD > 1) /* not folded */
507 flush |= (*func)(virt_to_page(pud), PT_PUD); 607 flush |= (*func)(virt_to_page(pud), PT_PUD);
508 608
509 for (; addr != pud_limit; pud++, addr = pud_next) { 609 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
510 pmd_t *pmd; 610 pmd_t *pmd;
511 unsigned long pmd_limit;
512 611
513 pud_next = pud_addr_end(addr, pud_limit); 612 if (pgdidx == pgdidx_limit &&
514 613 pudidx > pudidx_limit)
515 if (pud_next < limit) 614 goto out;
516 pmd_limit = pud_next;
517 else
518 pmd_limit = limit;
519 615
520 if (pud_none(*pud)) 616 if (pud_none(pud[pudidx]))
521 continue; 617 continue;
522 618
523 pmd = pmd_offset(pud, 0); 619 pmd = pmd_offset(&pud[pudidx], 0);
524 620
525 if (PTRS_PER_PMD > 1) /* not folded */ 621 if (PTRS_PER_PMD > 1) /* not folded */
526 flush |= (*func)(virt_to_page(pmd), PT_PMD); 622 flush |= (*func)(virt_to_page(pmd), PT_PMD);
527 623
528 for (; addr != pmd_limit; pmd++) { 624 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
529 addr += (PAGE_SIZE * PTRS_PER_PTE); 625 struct page *pte;
530 if ((pmd_limit-1) < (addr-1)) { 626
531 addr = pmd_limit; 627 if (pgdidx == pgdidx_limit &&
532 break; 628 pudidx == pudidx_limit &&
533 } 629 pmdidx > pmdidx_limit)
630 goto out;
534 631
535 if (pmd_none(*pmd)) 632 if (pmd_none(pmd[pmdidx]))
536 continue; 633 continue;
537 634
538 flush |= (*func)(pmd_page(*pmd), PT_PTE); 635 pte = pmd_page(pmd[pmdidx]);
636 flush |= (*func)(pte, PT_PTE);
539 } 637 }
540 } 638 }
541 } 639 }
542 640out:
543 flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
544 641
545 return flush; 642 return flush;
546} 643}
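The rewritten walker iterates by pgd/pud/pmd index and skips the slots between USER_LIMIT and PAGE_OFFSET, which on x86-64 is the hypervisor hole and on 32-bit collapses to an empty range. A sketch of the hole computation (userspace C; the two virtual-address constants are conventional 64-bit values assumed for illustration, not read from this tree):

#include <stdio.h>
#include <stdint.h>

#define PGD_SHIFT     39
#define PGD_ENTRIES   512
#define pgd_index(va) (((va) >> PGD_SHIFT) & (PGD_ENTRIES - 1))

int main(void)
{
        /* Conventional 64-bit values, used here purely for illustration. */
        uint64_t user_limit  = 0x0000800000000000ULL; /* top of user space, rounded up */
        uint64_t page_offset = 0xffff880000000000ULL; /* start of the direct map */

        unsigned hole_low  = pgd_index(user_limit);
        unsigned hole_high = pgd_index(page_offset);

        printf("skip pgd indices [%u, %u)\n", hole_low, hole_high);
        return 0;
}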
@@ -622,14 +719,31 @@ void xen_pgd_pin(pgd_t *pgd)
622{ 719{
623 xen_mc_batch(); 720 xen_mc_batch();
624 721
625 if (pgd_walk(pgd, pin_page, TASK_SIZE)) { 722 if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
626 /* re-enable interrupts for kmap_flush_unused */ 723 /* re-enable interrupts for kmap_flush_unused */
627 xen_mc_issue(0); 724 xen_mc_issue(0);
628 kmap_flush_unused(); 725 kmap_flush_unused();
629 xen_mc_batch(); 726 xen_mc_batch();
630 } 727 }
631 728
729#ifdef CONFIG_X86_64
730 {
731 pgd_t *user_pgd = xen_get_user_pgd(pgd);
732
733 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
734
735 if (user_pgd) {
736 pin_page(virt_to_page(user_pgd), PT_PGD);
737 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
738 }
739 }
740#else /* CONFIG_X86_32 */
741#ifdef CONFIG_X86_PAE
742 /* Need to make sure unshared kernel PMD is pinnable */
743 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
744#endif
632 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 745 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
746#endif /* CONFIG_X86_64 */
633 xen_mc_issue(0); 747 xen_mc_issue(0);
634} 748}
635 749
@@ -656,9 +770,11 @@ void xen_mm_pin_all(void)
656 spin_unlock_irqrestore(&pgd_lock, flags); 770 spin_unlock_irqrestore(&pgd_lock, flags);
657} 771}
658 772
659/* The init_mm pagetable is really pinned as soon as its created, but 773/*
660 that's before we have page structures to store the bits. So do all 774 * The init_mm pagetable is really pinned as soon as its created, but
661 the book-keeping now. */ 775 * that's before we have page structures to store the bits. So do all
776 * the book-keeping now.
777 */
662static __init int mark_pinned(struct page *page, enum pt_level level) 778static __init int mark_pinned(struct page *page, enum pt_level level)
663{ 779{
664 SetPagePinned(page); 780 SetPagePinned(page);
@@ -708,7 +824,23 @@ static void xen_pgd_unpin(pgd_t *pgd)
708 824
709 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 825 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
710 826
711 pgd_walk(pgd, unpin_page, TASK_SIZE); 827#ifdef CONFIG_X86_64
828 {
829 pgd_t *user_pgd = xen_get_user_pgd(pgd);
830
831 if (user_pgd) {
832 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
833 unpin_page(virt_to_page(user_pgd), PT_PGD);
834 }
835 }
836#endif
837
838#ifdef CONFIG_X86_PAE
839 /* Need to make sure unshared kernel PMD is unpinned */
840	unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
841#endif
842
843 pgd_walk(pgd, unpin_page, USER_LIMIT);
712 844
713 xen_mc_issue(0); 845 xen_mc_issue(0);
714} 846}
@@ -727,7 +859,6 @@ void xen_mm_unpin_all(void)
727 list_for_each_entry(page, &pgd_list, lru) { 859 list_for_each_entry(page, &pgd_list, lru) {
728 if (PageSavePinned(page)) { 860 if (PageSavePinned(page)) {
729 BUG_ON(!PagePinned(page)); 861 BUG_ON(!PagePinned(page));
730 printk("unpinning pinned %p\n", page_address(page));
731 xen_pgd_unpin((pgd_t *)page_address(page)); 862 xen_pgd_unpin((pgd_t *)page_address(page));
732 ClearPageSavePinned(page); 863 ClearPageSavePinned(page);
733 } 864 }
@@ -757,8 +888,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
757static void drop_other_mm_ref(void *info) 888static void drop_other_mm_ref(void *info)
758{ 889{
759 struct mm_struct *mm = info; 890 struct mm_struct *mm = info;
891 struct mm_struct *active_mm;
892
893#ifdef CONFIG_X86_64
894 active_mm = read_pda(active_mm);
895#else
896 active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
897#endif
760 898
761 if (__get_cpu_var(cpu_tlbstate).active_mm == mm) 899 if (active_mm == mm)
762 leave_mm(smp_processor_id()); 900 leave_mm(smp_processor_id());
763 901
764 /* If this cpu still has a stale cr3 reference, then make sure 902 /* If this cpu still has a stale cr3 reference, then make sure
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 297bf9f5b8bc..0f59bd03f9e3 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -10,18 +10,6 @@ enum pt_level {
10 PT_PTE 10 PT_PTE
11}; 11};
12 12
13/*
14 * Page-directory addresses above 4GB do not fit into architectural %cr3.
15 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
16 * must use the following accessor macros to pack/unpack valid MFNs.
17 *
18 * Note that Xen is using the fact that the pagetable base is always
19 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
20 * of cr3.
21 */
22#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
23#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
24
25 13
26void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 14void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
27 15
@@ -44,13 +32,26 @@ pgd_t xen_make_pgd(pgdval_t);
44void xen_set_pte(pte_t *ptep, pte_t pteval); 32void xen_set_pte(pte_t *ptep, pte_t pteval);
45void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 33void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
46 pte_t *ptep, pte_t pteval); 34 pte_t *ptep, pte_t pteval);
35
36#ifdef CONFIG_X86_PAE
47void xen_set_pte_atomic(pte_t *ptep, pte_t pte); 37void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
38void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
39void xen_pmd_clear(pmd_t *pmdp);
40#endif /* CONFIG_X86_PAE */
41
48void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); 42void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
49void xen_set_pud(pud_t *ptr, pud_t val); 43void xen_set_pud(pud_t *ptr, pud_t val);
50void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval); 44void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
51void xen_set_pud_hyper(pud_t *ptr, pud_t val); 45void xen_set_pud_hyper(pud_t *ptr, pud_t val);
52void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 46
53void xen_pmd_clear(pmd_t *pmdp); 47#if PAGETABLE_LEVELS == 4
48pudval_t xen_pud_val(pud_t pud);
49pud_t xen_make_pud(pudval_t pudval);
50void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
51void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
52#endif
53
54pgd_t *xen_get_user_pgd(pgd_t *pgd);
54 55
55pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 56pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
56void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 57void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 3c63c4da7ed1..9efd1c6c9776 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -76,6 +76,7 @@ void xen_mc_flush(void)
76 if (ret) { 76 if (ret) {
77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", 77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
78 ret, smp_processor_id()); 78 ret, smp_processor_id());
79 dump_stack();
79 for (i = 0; i < b->mcidx; i++) { 80 for (i = 0; i < b->mcidx; i++) {
80 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", 81 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
81 i+1, b->mcidx, 82 i+1, b->mcidx,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index e0a39595bde3..b6acc3a0af46 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -83,30 +83,72 @@ static void xen_idle(void)
83 83
84/* 84/*
85 * Set the bit indicating "nosegneg" library variants should be used. 85 * Set the bit indicating "nosegneg" library variants should be used.
86 * We only need to bother in pure 32-bit mode; compat 32-bit processes
87 * can have un-truncated segments, so wrapping around is allowed.
86 */ 88 */
87static void __init fiddle_vdso(void) 89static void __init fiddle_vdso(void)
88{ 90{
89 extern const char vdso32_default_start; 91#ifdef CONFIG_X86_32
90 u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); 92 u32 *mask;
93 mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
91 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; 94 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
95 mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
96 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
97#endif
92} 98}
93 99
94void xen_enable_sysenter(void) 100static __cpuinit int register_callback(unsigned type, const void *func)
95{ 101{
96 int cpu = smp_processor_id(); 102 struct callback_register callback = {
97 extern void xen_sysenter_target(void); 103 .type = type,
98 /* Mask events on entry, even though they get enabled immediately */ 104 .address = XEN_CALLBACK(__KERNEL_CS, func),
99 static struct callback_register sysenter = {
100 .type = CALLBACKTYPE_sysenter,
101 .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
102 .flags = CALLBACKF_mask_events, 105 .flags = CALLBACKF_mask_events,
103 }; 106 };
104 107
105 if (!boot_cpu_has(X86_FEATURE_SEP) || 108 return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
106 HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) { 109}
107 clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); 110
108 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); 111void __cpuinit xen_enable_sysenter(void)
112{
113 extern void xen_sysenter_target(void);
114 int ret;
115 unsigned sysenter_feature;
116
117#ifdef CONFIG_X86_32
118 sysenter_feature = X86_FEATURE_SEP;
119#else
120 sysenter_feature = X86_FEATURE_SYSENTER32;
121#endif
122
123 if (!boot_cpu_has(sysenter_feature))
124 return;
125
126 ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
127	if (ret != 0)
128 setup_clear_cpu_cap(sysenter_feature);
129}
130
131void __cpuinit xen_enable_syscall(void)
132{
133#ifdef CONFIG_X86_64
134 int ret;
135 extern void xen_syscall_target(void);
136 extern void xen_syscall32_target(void);
137
138 ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
139 if (ret != 0) {
140 printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
141 /* Pretty fatal; 64-bit userspace has no other
142 mechanism for syscalls. */
109 } 143 }
144
145 if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
146 ret = register_callback(CALLBACKTYPE_syscall32,
147 xen_syscall32_target);
148 if (ret != 0)
149 setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
150 }
151#endif /* CONFIG_X86_64 */
110} 152}
111 153
112void __init xen_arch_setup(void) 154void __init xen_arch_setup(void)
@@ -120,10 +162,12 @@ void __init xen_arch_setup(void)
120 if (!xen_feature(XENFEAT_auto_translated_physmap)) 162 if (!xen_feature(XENFEAT_auto_translated_physmap))
121 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); 163 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
122 164
123 HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, 165 if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
124 __KERNEL_CS, (unsigned long)xen_failsafe_callback); 166 register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
167 BUG();
125 168
126 xen_enable_sysenter(); 169 xen_enable_sysenter();
170 xen_enable_syscall();
127 171
128 set_iopl.iopl = 1; 172 set_iopl.iopl = 1;
129 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 173 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
@@ -143,11 +187,6 @@ void __init xen_arch_setup(void)
143 187
144 pm_idle = xen_idle; 188 pm_idle = xen_idle;
145 189
146#ifdef CONFIG_SMP
147 /* fill cpus_possible with all available cpus */
148 xen_fill_possible_map();
149#endif
150
151 paravirt_disable_iospace(); 190 paravirt_disable_iospace();
152 191
153 fiddle_vdso(); 192 fiddle_vdso();
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 233156f39b7f..f702199312a5 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -66,13 +66,22 @@ static __cpuinit void cpu_bringup_and_idle(void)
66 int cpu = smp_processor_id(); 66 int cpu = smp_processor_id();
67 67
68 cpu_init(); 68 cpu_init();
69 preempt_disable();
70
69 xen_enable_sysenter(); 71 xen_enable_sysenter();
72 xen_enable_syscall();
70 73
71 preempt_disable(); 74 cpu = smp_processor_id();
72 per_cpu(cpu_state, cpu) = CPU_ONLINE; 75 smp_store_cpu_info(cpu);
76 cpu_data(cpu).x86_max_cores = 1;
77 set_cpu_sibling_map(cpu);
73 78
74 xen_setup_cpu_clockevents(); 79 xen_setup_cpu_clockevents();
75 80
81 cpu_set(cpu, cpu_online_map);
82 x86_write_percpu(cpu_state, CPU_ONLINE);
83 wmb();
84
76 /* We can take interrupts now: we're officially "up". */ 85 /* We can take interrupts now: we're officially "up". */
77 local_irq_enable(); 86 local_irq_enable();
78 87
@@ -141,56 +150,37 @@ static int xen_smp_intr_init(unsigned int cpu)
141 return rc; 150 return rc;
142} 151}
143 152
144void __init xen_fill_possible_map(void) 153static void __init xen_fill_possible_map(void)
145{ 154{
146 int i, rc; 155 int i, rc;
147 156
148 for (i = 0; i < NR_CPUS; i++) { 157 for (i = 0; i < NR_CPUS; i++) {
149 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 158 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
150 if (rc >= 0) 159 if (rc >= 0) {
160 num_processors++;
151 cpu_set(i, cpu_possible_map); 161 cpu_set(i, cpu_possible_map);
162 }
152 } 163 }
153} 164}
154 165
155void __init xen_smp_prepare_boot_cpu(void) 166static void __init xen_smp_prepare_boot_cpu(void)
156{ 167{
157 int cpu;
158
159 BUG_ON(smp_processor_id() != 0); 168 BUG_ON(smp_processor_id() != 0);
160 native_smp_prepare_boot_cpu(); 169 native_smp_prepare_boot_cpu();
161 170
162 /* We've switched to the "real" per-cpu gdt, so make sure the 171 /* We've switched to the "real" per-cpu gdt, so make sure the
163 old memory can be recycled */ 172 old memory can be recycled */
164 make_lowmem_page_readwrite(&per_cpu__gdt_page); 173 make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
165
166 for_each_possible_cpu(cpu) {
167 cpus_clear(per_cpu(cpu_sibling_map, cpu));
168 /*
169 * cpu_core_map lives in a per cpu area that is cleared
170 * when the per cpu array is allocated.
171 *
172 * cpus_clear(per_cpu(cpu_core_map, cpu));
173 */
174 }
175 174
176 xen_setup_vcpu_info_placement(); 175 xen_setup_vcpu_info_placement();
177} 176}
178 177
179void __init xen_smp_prepare_cpus(unsigned int max_cpus) 178static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
180{ 179{
181 unsigned cpu; 180 unsigned cpu;
182 181
183 for_each_possible_cpu(cpu) {
184 cpus_clear(per_cpu(cpu_sibling_map, cpu));
185 /*
186 * cpu_core_ map will be zeroed when the per
187 * cpu area is allocated.
188 *
189 * cpus_clear(per_cpu(cpu_core_map, cpu));
190 */
191 }
192
193 smp_store_cpu_info(0); 182 smp_store_cpu_info(0);
183 cpu_data(0).x86_max_cores = 1;
194 set_cpu_sibling_map(0); 184 set_cpu_sibling_map(0);
195 185
196 if (xen_smp_intr_init(0)) 186 if (xen_smp_intr_init(0))
@@ -225,7 +215,7 @@ static __cpuinit int
225cpu_initialize_context(unsigned int cpu, struct task_struct *idle) 215cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
226{ 216{
227 struct vcpu_guest_context *ctxt; 217 struct vcpu_guest_context *ctxt;
228 struct gdt_page *gdt = &per_cpu(gdt_page, cpu); 218 struct desc_struct *gdt;
229 219
230 if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) 220 if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
231 return 0; 221 return 0;
@@ -234,12 +224,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
234 if (ctxt == NULL) 224 if (ctxt == NULL)
235 return -ENOMEM; 225 return -ENOMEM;
236 226
227 gdt = get_cpu_gdt_table(cpu);
228
237 ctxt->flags = VGCF_IN_KERNEL; 229 ctxt->flags = VGCF_IN_KERNEL;
238 ctxt->user_regs.ds = __USER_DS; 230 ctxt->user_regs.ds = __USER_DS;
239 ctxt->user_regs.es = __USER_DS; 231 ctxt->user_regs.es = __USER_DS;
240 ctxt->user_regs.fs = __KERNEL_PERCPU;
241 ctxt->user_regs.gs = 0;
242 ctxt->user_regs.ss = __KERNEL_DS; 232 ctxt->user_regs.ss = __KERNEL_DS;
233#ifdef CONFIG_X86_32
234 ctxt->user_regs.fs = __KERNEL_PERCPU;
235#endif
243 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; 236 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
244 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ 237 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
245 238
@@ -249,11 +242,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
249 242
250 ctxt->ldt_ents = 0; 243 ctxt->ldt_ents = 0;
251 244
252 BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); 245 BUG_ON((unsigned long)gdt & ~PAGE_MASK);
253 make_lowmem_page_readonly(gdt->gdt); 246 make_lowmem_page_readonly(gdt);
254 247
255 ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); 248 ctxt->gdt_frames[0] = virt_to_mfn(gdt);
256 ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); 249 ctxt->gdt_ents = GDT_ENTRIES;
257 250
258 ctxt->user_regs.cs = __KERNEL_CS; 251 ctxt->user_regs.cs = __KERNEL_CS;
259 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); 252 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
@@ -261,9 +254,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
261 ctxt->kernel_ss = __KERNEL_DS; 254 ctxt->kernel_ss = __KERNEL_DS;
262 ctxt->kernel_sp = idle->thread.sp0; 255 ctxt->kernel_sp = idle->thread.sp0;
263 256
257#ifdef CONFIG_X86_32
264 ctxt->event_callback_cs = __KERNEL_CS; 258 ctxt->event_callback_cs = __KERNEL_CS;
265 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
266 ctxt->failsafe_callback_cs = __KERNEL_CS; 259 ctxt->failsafe_callback_cs = __KERNEL_CS;
260#endif
261 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
267 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; 262 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
268 263
269 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); 264 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
@@ -276,7 +271,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
276 return 0; 271 return 0;
277} 272}
278 273
279int __cpuinit xen_cpu_up(unsigned int cpu) 274static int __cpuinit xen_cpu_up(unsigned int cpu)
280{ 275{
281 struct task_struct *idle = idle_task(cpu); 276 struct task_struct *idle = idle_task(cpu);
282 int rc; 277 int rc;
@@ -287,11 +282,28 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
287 return rc; 282 return rc;
288#endif 283#endif
289 284
285#ifdef CONFIG_X86_64
286 /* Allocate node local memory for AP pdas */
287 WARN_ON(cpu == 0);
288 if (cpu > 0) {
289 rc = get_local_pda(cpu);
290 if (rc)
291 return rc;
292 }
293#endif
294
295#ifdef CONFIG_X86_32
290 init_gdt(cpu); 296 init_gdt(cpu);
291 per_cpu(current_task, cpu) = idle; 297 per_cpu(current_task, cpu) = idle;
292 irq_ctx_init(cpu); 298 irq_ctx_init(cpu);
299#else
300 cpu_pda(cpu)->pcurrent = idle;
301 clear_tsk_thread_flag(idle, TIF_FORK);
302#endif
293 xen_setup_timer(cpu); 303 xen_setup_timer(cpu);
294 304
305 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
306
295 /* make sure interrupts start blocked */ 307 /* make sure interrupts start blocked */
296 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; 308 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
297 309
@@ -306,20 +318,18 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
306 if (rc) 318 if (rc)
307 return rc; 319 return rc;
308 320
309 smp_store_cpu_info(cpu);
310 set_cpu_sibling_map(cpu);
311 /* This must be done before setting cpu_online_map */
312 wmb();
313
314 cpu_set(cpu, cpu_online_map);
315
316 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); 321 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
317 BUG_ON(rc); 322 BUG_ON(rc);
318 323
324	while (per_cpu(cpu_state, cpu) != CPU_ONLINE) {
325 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
326 barrier();
327 }
328
319 return 0; 329 return 0;
320} 330}
321 331
322void xen_smp_cpus_done(unsigned int max_cpus) 332static void xen_smp_cpus_done(unsigned int max_cpus)
323{ 333{
324} 334}
325 335
@@ -335,12 +345,12 @@ static void stop_self(void *v)
335 BUG(); 345 BUG();
336} 346}
337 347
338void xen_smp_send_stop(void) 348static void xen_smp_send_stop(void)
339{ 349{
340 smp_call_function(stop_self, NULL, 0); 350 smp_call_function(stop_self, NULL, 0);
341} 351}
342 352
343void xen_smp_send_reschedule(int cpu) 353static void xen_smp_send_reschedule(int cpu)
344{ 354{
345 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); 355 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
346} 356}
@@ -355,7 +365,7 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
355 xen_send_IPI_one(cpu, vector); 365 xen_send_IPI_one(cpu, vector);
356} 366}
357 367
358void xen_smp_send_call_function_ipi(cpumask_t mask) 368static void xen_smp_send_call_function_ipi(cpumask_t mask)
359{ 369{
360 int cpu; 370 int cpu;
361 371
@@ -370,7 +380,7 @@ void xen_smp_send_call_function_ipi(cpumask_t mask)
370 } 380 }
371} 381}
372 382
373void xen_smp_send_call_function_single_ipi(int cpu) 383static void xen_smp_send_call_function_single_ipi(int cpu)
374{ 384{
375 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); 385 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
376} 386}
@@ -379,7 +389,11 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
379{ 389{
380 irq_enter(); 390 irq_enter();
381 generic_smp_call_function_interrupt(); 391 generic_smp_call_function_interrupt();
392#ifdef CONFIG_X86_32
382 __get_cpu_var(irq_stat).irq_call_count++; 393 __get_cpu_var(irq_stat).irq_call_count++;
394#else
395 add_pda(irq_call_count, 1);
396#endif
383 irq_exit(); 397 irq_exit();
384 398
385 return IRQ_HANDLED; 399 return IRQ_HANDLED;
@@ -389,8 +403,31 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
389{ 403{
390 irq_enter(); 404 irq_enter();
391 generic_smp_call_function_single_interrupt(); 405 generic_smp_call_function_single_interrupt();
406#ifdef CONFIG_X86_32
392 __get_cpu_var(irq_stat).irq_call_count++; 407 __get_cpu_var(irq_stat).irq_call_count++;
408#else
409 add_pda(irq_call_count, 1);
410#endif
393 irq_exit(); 411 irq_exit();
394 412
395 return IRQ_HANDLED; 413 return IRQ_HANDLED;
396} 414}
415
416static const struct smp_ops xen_smp_ops __initdata = {
417 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
418 .smp_prepare_cpus = xen_smp_prepare_cpus,
419 .cpu_up = xen_cpu_up,
420 .smp_cpus_done = xen_smp_cpus_done,
421
422 .smp_send_stop = xen_smp_send_stop,
423 .smp_send_reschedule = xen_smp_send_reschedule,
424
425 .send_call_func_ipi = xen_smp_send_call_function_ipi,
426 .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
427};
428
429void __init xen_smp_init(void)
430{
431 smp_ops = xen_smp_ops;
432 xen_fill_possible_map();
433}
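xen_smp_init() replaces the old pattern of assigning smp_ops directly in xen_start_kernel() and calling xen_fill_possible_map() from xen_arch_setup(): the backend now installs its hook table in one place. A minimal model of that ops-structure pattern (userspace C; all names here are invented for the sketch and are not kernel symbols):

#include <stdio.h>

struct smp_ops_model {
        void (*prepare_cpus)(unsigned max_cpus);
        int  (*cpu_up)(unsigned cpu);
};

static void model_prepare(unsigned max_cpus) { printf("prepare %u\n", max_cpus); }
static int  model_cpu_up(unsigned cpu)       { printf("up %u\n", cpu); return 0; }

static struct smp_ops_model smp_ops;   /* the generic layer's hook table */

/* Analogue of xen_smp_init(): install the backend's ops in one place. */
static void backend_smp_init(void)
{
        static const struct smp_ops_model backend_ops = {
                .prepare_cpus = model_prepare,
                .cpu_up       = model_cpu_up,
        };
        smp_ops = backend_ops;
}

int main(void)
{
        backend_smp_init();
        smp_ops.prepare_cpus(4);
        return smp_ops.cpu_up(1);
}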
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 251669a932d4..2a234db5949b 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -38,8 +38,11 @@ void xen_post_suspend(int suspend_cancelled)
38 xen_cpu_initialized_map = cpu_online_map; 38 xen_cpu_initialized_map = cpu_online_map;
39#endif 39#endif
40 xen_vcpu_restore(); 40 xen_vcpu_restore();
41 xen_timer_resume();
42 } 41 }
43 42
44} 43}
45 44
45void xen_arch_resume(void)
46{
47 /* nothing */
48}
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41de..2497a30f41de 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm_32.S
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
new file mode 100644
index 000000000000..4038cbfe3331
--- /dev/null
+++ b/arch/x86/xen/xen-asm_64.S
@@ -0,0 +1,271 @@
1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
5
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
8
9 We only bother with direct forms (ie, vcpu in pda) of the operations
10 here; the indirect forms are better handled in C, since they're
11 generally too large to inline anyway.
12 */
13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/processor-flags.h>
18#include <asm/errno.h>
19#include <asm/segment.h>
20
21#include <xen/interface/xen.h>
22
23#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
24#define ENDPATCH(x) .globl x##_end; x##_end=.
25
26/* Pseudo-flag used for virtual NMI, which we don't implement yet */
27#define XEN_EFLAGS_NMI 0x80000000
28
29#if 0
30#include <asm/percpu.h>
31
32/*
33 Enable events. This clears the event mask and tests the pending
34 event status with one and operation. If there are pending
35 events, then enter the hypervisor to get them handled.
36 */
37ENTRY(xen_irq_enable_direct)
38 /* Unmask events */
39 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
40
41 /* Preempt here doesn't matter because that will deal with
42 any pending interrupts. The pending check may end up being
43 run on the wrong CPU, but that doesn't hurt. */
44
45 /* Test for pending */
46 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
47 jz 1f
48
492: call check_events
501:
51ENDPATCH(xen_irq_enable_direct)
52 ret
53 ENDPROC(xen_irq_enable_direct)
54 RELOC(xen_irq_enable_direct, 2b+1)
55
56/*
57 Disabling events is simply a matter of making the event mask
58 non-zero.
59 */
60ENTRY(xen_irq_disable_direct)
61 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
62ENDPATCH(xen_irq_disable_direct)
63 ret
64 ENDPROC(xen_irq_disable_direct)
65 RELOC(xen_irq_disable_direct, 0)
66
67/*
68 (xen_)save_fl is used to get the current interrupt enable status.
69 Callers expect the status to be in X86_EFLAGS_IF, and other bits
70 may be set in the return value. We take advantage of this by
71 making sure that X86_EFLAGS_IF has the right value (and other bits
72 in that byte are 0), but other bits in the return value are
73 undefined. We need to toggle the state of the bit, because
74 Xen and x86 use opposite senses (mask vs enable).
75 */
76ENTRY(xen_save_fl_direct)
77 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
78 setz %ah
79 addb %ah,%ah
80ENDPATCH(xen_save_fl_direct)
81 ret
82 ENDPROC(xen_save_fl_direct)
83 RELOC(xen_save_fl_direct, 0)
84
85/*
86 In principle the caller should be passing us a value return
87 from xen_save_fl_direct, but for robustness sake we test only
88 the X86_EFLAGS_IF flag rather than the whole byte. After
89 setting the interrupt mask state, it checks for unmasked
90 pending events and enters the hypervisor to get them delivered
91 if so.
92 */
93ENTRY(xen_restore_fl_direct)
94 testb $X86_EFLAGS_IF>>8, %ah
95 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
96 /* Preempt here doesn't matter because that will deal with
97 any pending interrupts. The pending check may end up being
98 run on the wrong CPU, but that doesn't hurt. */
99
100 /* check for unmasked and pending */
101 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
102 jz 1f
1032: call check_events
1041:
105ENDPATCH(xen_restore_fl_direct)
106 ret
107 ENDPROC(xen_restore_fl_direct)
108 RELOC(xen_restore_fl_direct, 2b+1)
109
110
111/*
112 Force an event check by making a hypercall,
113 but preserve regs before making the call.
114 */
115check_events:
116 push %rax
117 push %rcx
118 push %rdx
119 push %rsi
120 push %rdi
121 push %r8
122 push %r9
123 push %r10
124 push %r11
125 call force_evtchn_callback
126 pop %r11
127 pop %r10
128 pop %r9
129 pop %r8
130 pop %rdi
131 pop %rsi
132 pop %rdx
133 pop %rcx
134 pop %rax
135 ret
136#endif
137
138ENTRY(xen_adjust_exception_frame)
139 mov 8+0(%rsp),%rcx
140 mov 8+8(%rsp),%r11
141 ret $16
142
143hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
144/*
145 Xen64 iret frame:
146
147 ss
148 rsp
149 rflags
150 cs
151 rip <-- standard iret frame
152
153 flags
154
155 rcx }
156 r11 }<-- pushed by hypercall page
157rsp -> rax }
158 */
159ENTRY(xen_iret)
160 pushq $0
1611: jmp hypercall_iret
162ENDPATCH(xen_iret)
163RELOC(xen_iret, 1b+1)
164
165/*
166 sysexit is not used for 64-bit processes, so it's
167 only ever used to return to 32-bit compat userspace.
168 */
169ENTRY(xen_sysexit)
170 pushq $__USER32_DS
171 pushq %rcx
172 pushq $X86_EFLAGS_IF
173 pushq $__USER32_CS
174 pushq %rdx
175
176 pushq $VGCF_in_syscall
1771: jmp hypercall_iret
178ENDPATCH(xen_sysexit)
179RELOC(xen_sysexit, 1b+1)
180
181ENTRY(xen_sysret64)
182 /* We're already on the usermode stack at this point, but still
183 with the kernel gs, so we can easily switch back */
184 movq %rsp, %gs:pda_oldrsp
185 movq %gs:pda_kernelstack,%rsp
186
187 pushq $__USER_DS
188 pushq %gs:pda_oldrsp
189 pushq %r11
190 pushq $__USER_CS
191 pushq %rcx
192
193 pushq $VGCF_in_syscall
1941: jmp hypercall_iret
195ENDPATCH(xen_sysret64)
196RELOC(xen_sysret64, 1b+1)
197
198ENTRY(xen_sysret32)
199 /* We're already on the usermode stack at this point, but still
200 with the kernel gs, so we can easily switch back */
201 movq %rsp, %gs:pda_oldrsp
202 movq %gs:pda_kernelstack, %rsp
203
204 pushq $__USER32_DS
205 pushq %gs:pda_oldrsp
206 pushq %r11
207 pushq $__USER32_CS
208 pushq %rcx
209
210 pushq $VGCF_in_syscall
2111: jmp hypercall_iret
212ENDPATCH(xen_sysret32)
213RELOC(xen_sysret32, 1b+1)
214
215/*
216 Xen handles syscall callbacks much like ordinary exceptions,
217 which means we have:
218 - kernel gs
219 - kernel rsp
220 - an iret-like stack frame on the stack (including rcx and r11):
221 ss
222 rsp
223 rflags
224 cs
225 rip
226 r11
227 rsp-> rcx
228
229 In all the entrypoints, we undo all that to make it look
230 like a CPU-generated syscall/sysenter and jump to the normal
231 entrypoint.
232 */
233
234.macro undo_xen_syscall
235 mov 0*8(%rsp),%rcx
236 mov 1*8(%rsp),%r11
237 mov 5*8(%rsp),%rsp
238.endm
239
240/* Normal 64-bit system call target */
241ENTRY(xen_syscall_target)
242 undo_xen_syscall
243 jmp system_call_after_swapgs
244ENDPROC(xen_syscall_target)
245
246#ifdef CONFIG_IA32_EMULATION
247
248/* 32-bit compat syscall target */
249ENTRY(xen_syscall32_target)
250 undo_xen_syscall
251 jmp ia32_cstar_target
252ENDPROC(xen_syscall32_target)
253
254/* 32-bit compat sysenter target */
255ENTRY(xen_sysenter_target)
256 undo_xen_syscall
257 jmp ia32_sysenter_target
258ENDPROC(xen_sysenter_target)
259
260#else /* !CONFIG_IA32_EMULATION */
261
262ENTRY(xen_syscall32_target)
263ENTRY(xen_sysenter_target)
264 lea 16(%rsp), %rsp /* strip %rcx,%r11 */
265 mov $-ENOSYS, %rax
266 pushq $VGCF_in_syscall
267 jmp hypercall_iret
268ENDPROC(xen_syscall32_target)
269ENDPROC(xen_sysenter_target)
270
271#endif /* CONFIG_IA32_EMULATION */
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 7c0cf6320a0a..63d49a523ed3 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -5,15 +5,24 @@
5 5
6#include <linux/elfnote.h> 6#include <linux/elfnote.h>
7#include <linux/init.h> 7#include <linux/init.h>
8
8#include <asm/boot.h> 9#include <asm/boot.h>
10#include <asm/asm.h>
11#include <asm/page.h>
12
9#include <xen/interface/elfnote.h> 13#include <xen/interface/elfnote.h>
10#include <asm/xen/interface.h> 14#include <asm/xen/interface.h>
11 15
12 __INIT 16 __INIT
13ENTRY(startup_xen) 17ENTRY(startup_xen)
14 movl %esi,xen_start_info
15 cld 18 cld
16 movl $(init_thread_union+THREAD_SIZE),%esp 19#ifdef CONFIG_X86_32
20 mov %esi,xen_start_info
21 mov $init_thread_union+THREAD_SIZE,%esp
22#else
23 mov %rsi,xen_start_info
24 mov $init_thread_union+THREAD_SIZE,%rsp
25#endif
17 jmp xen_start_kernel 26 jmp xen_start_kernel
18 27
19 __FINIT 28 __FINIT
@@ -21,21 +30,26 @@ ENTRY(startup_xen)
21.pushsection .text 30.pushsection .text
22 .align PAGE_SIZE_asm 31 .align PAGE_SIZE_asm
23ENTRY(hypercall_page) 32ENTRY(hypercall_page)
24 .skip 0x1000 33 .skip PAGE_SIZE_asm
25.popsection 34.popsection
26 35
27 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") 36 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
28 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") 37 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
29 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") 38 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
30 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) 39#ifdef CONFIG_X86_32
31 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) 40 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
32 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) 41#else
42 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
43#endif
44 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
45 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
33 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") 46 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
34 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") 47 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
35 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") 48 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
36 ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, 49 ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
37 .quad _PAGE_PRESENT; .quad _PAGE_PRESENT) 50 .quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
38 ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) 51 ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
39 ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long __HYPERVISOR_VIRT_START) 52 ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
53 ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
40 54
41#endif /*CONFIG_XEN */ 55#endif /*CONFIG_XEN */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 6f4b1045c1c2..dd3c23152a2e 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -26,6 +26,7 @@ char * __init xen_memory_setup(void);
26void __init xen_arch_setup(void); 26void __init xen_arch_setup(void);
27void __init xen_init_IRQ(void); 27void __init xen_init_IRQ(void);
28void xen_enable_sysenter(void); 28void xen_enable_sysenter(void);
29void xen_enable_syscall(void);
29void xen_vcpu_restore(void); 30void xen_vcpu_restore(void);
30 31
31void __init xen_build_dynamic_phys_to_machine(void); 32void __init xen_build_dynamic_phys_to_machine(void);
@@ -37,7 +38,6 @@ void __init xen_time_init(void);
37unsigned long xen_get_wallclock(void); 38unsigned long xen_get_wallclock(void);
38int xen_set_wallclock(unsigned long time); 39int xen_set_wallclock(unsigned long time);
39unsigned long long xen_sched_clock(void); 40unsigned long long xen_sched_clock(void);
40void xen_timer_resume(void);
41 41
42irqreturn_t xen_debug_interrupt(int irq, void *dev_id); 42irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
43 43
@@ -45,20 +45,15 @@ bool xen_vcpu_stolen(int vcpu);
45 45
46void xen_mark_init_mm_pinned(void); 46void xen_mark_init_mm_pinned(void);
47 47
48void __init xen_fill_possible_map(void);
49
50void __init xen_setup_vcpu_info_placement(void); 48void __init xen_setup_vcpu_info_placement(void);
51void xen_smp_prepare_boot_cpu(void);
52void xen_smp_prepare_cpus(unsigned int max_cpus);
53int xen_cpu_up(unsigned int cpu);
54void xen_smp_cpus_done(unsigned int max_cpus);
55 49
56void xen_smp_send_stop(void); 50#ifdef CONFIG_SMP
57void xen_smp_send_reschedule(int cpu); 51void xen_smp_init(void);
58void xen_smp_send_call_function_ipi(cpumask_t mask);
59void xen_smp_send_call_function_single_ipi(int cpu);
60 52
61extern cpumask_t xen_cpu_initialized_map; 53extern cpumask_t xen_cpu_initialized_map;
54#else
55static inline void xen_smp_init(void) {}
56#endif
62 57
63 58
64/* Declare an asm function, along with symbols needed to make it 59/* Declare an asm function, along with symbols needed to make it
@@ -73,7 +68,11 @@ DECL_ASM(void, xen_irq_disable_direct, void);
73DECL_ASM(unsigned long, xen_save_fl_direct, void); 68DECL_ASM(unsigned long, xen_save_fl_direct, void);
74DECL_ASM(void, xen_restore_fl_direct, unsigned long); 69DECL_ASM(void, xen_restore_fl_direct, unsigned long);
75 70
71/* These are not functions, and cannot be called normally */
76void xen_iret(void); 72void xen_iret(void);
77void xen_sysexit(void); 73void xen_sysexit(void);
74void xen_sysret32(void);
75void xen_sysret64(void);
76void xen_adjust_exception_frame(void);
78 77
79#endif /* XEN_OPS_H */ 78#endif /* XEN_OPS_H */
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index ef671d1a3bf0..902bbe788215 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -92,7 +92,7 @@ struct netfront_info {
92 */ 92 */
93 union skb_entry { 93 union skb_entry {
94 struct sk_buff *skb; 94 struct sk_buff *skb;
95 unsigned link; 95 unsigned long link;
96 } tx_skbs[NET_TX_RING_SIZE]; 96 } tx_skbs[NET_TX_RING_SIZE];
97 grant_ref_t gref_tx_head; 97 grant_ref_t gref_tx_head;
98 grant_ref_t grant_tx_ref[NET_TX_RING_SIZE]; 98 grant_ref_t grant_tx_ref[NET_TX_RING_SIZE];
@@ -125,6 +125,17 @@ struct netfront_rx_info {
125 struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1]; 125 struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
126}; 126};
127 127
128static void skb_entry_set_link(union skb_entry *list, unsigned short id)
129{
130 list->link = id;
131}
132
133static int skb_entry_is_link(const union skb_entry *list)
134{
135 BUILD_BUG_ON(sizeof(list->skb) != sizeof(list->link));
136 return ((unsigned long)list->skb < PAGE_OFFSET);
137}
138
128/* 139/*
129 * Access macros for acquiring/freeing slots in tx_skbs[]. 140 * Access macros for acquiring/freeing slots in tx_skbs[].
130 */ 141 */
@@ -132,7 +143,7 @@ struct netfront_rx_info {
132static void add_id_to_freelist(unsigned *head, union skb_entry *list, 143static void add_id_to_freelist(unsigned *head, union skb_entry *list,
133 unsigned short id) 144 unsigned short id)
134{ 145{
135 list[id].link = *head; 146 skb_entry_set_link(&list[id], *head);
136 *head = id; 147 *head = id;
137} 148}
138 149
@@ -993,7 +1004,7 @@ static void xennet_release_tx_bufs(struct netfront_info *np)
993 1004
994 for (i = 0; i < NET_TX_RING_SIZE; i++) { 1005 for (i = 0; i < NET_TX_RING_SIZE; i++) {
995 /* Skip over entries which are actually freelist references */ 1006 /* Skip over entries which are actually freelist references */
996 if ((unsigned long)np->tx_skbs[i].skb < PAGE_OFFSET) 1007 if (skb_entry_is_link(&np->tx_skbs[i]))
997 continue; 1008 continue;
998 1009
999 skb = np->tx_skbs[i].skb; 1010 skb = np->tx_skbs[i].skb;
@@ -1123,7 +1134,7 @@ static struct net_device * __devinit xennet_create_dev(struct xenbus_device *dev
1123 /* Initialise tx_skbs as a free chain containing every entry. */ 1134 /* Initialise tx_skbs as a free chain containing every entry. */
1124 np->tx_skb_freelist = 0; 1135 np->tx_skb_freelist = 0;
1125 for (i = 0; i < NET_TX_RING_SIZE; i++) { 1136 for (i = 0; i < NET_TX_RING_SIZE; i++) {
1126 np->tx_skbs[i].link = i+1; 1137 skb_entry_set_link(&np->tx_skbs[i], i+1);
1127 np->grant_tx_ref[i] = GRANT_INVALID_REF; 1138 np->grant_tx_ref[i] = GRANT_INVALID_REF;
1128 } 1139 }
1129 1140
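
The skb_entry helpers introduced above rely on a single invariant: a free-list index is a small integer, while a real sk_buff pointer is a kernel virtual address at or above PAGE_OFFSET, so the two uses of the union can be told apart by value alone (the BUILD_BUG_ON guards the size assumption). A minimal stand-alone sketch of that disambiguation trick, with invented names and a stand-in PAGE_OFFSET, not the driver code itself:

#include <stdio.h>

#define DEMO_PAGE_OFFSET 0xc0000000UL	/* stand-in for the kernel's PAGE_OFFSET */

union demo_entry {
	void *ptr;		/* a real object pointer: always >= PAGE_OFFSET in the kernel */
	unsigned long link;	/* or a small free-list index: always < PAGE_OFFSET */
};

static int demo_entry_is_link(const union demo_entry *e)
{
	/* indices are tiny, kernel pointers are huge, so the value alone decides */
	return (unsigned long)e->ptr < DEMO_PAGE_OFFSET;
}

int main(void)
{
	union demo_entry e;

	e.link = 5;					/* chained into the free list */
	printf("is link: %d\n", demo_entry_is_link(&e));	/* 1 */

	e.ptr = (void *)(DEMO_PAGE_OFFSET + 0x1000);	/* shaped like a kernel pointer */
	printf("is link: %d\n", demo_entry_is_link(&e));	/* 0 */
	return 0;
}
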
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 5b546e365f00..2bb268e4ac56 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -68,6 +68,7 @@ static int xen_suspend(void *data)
68 if (!*cancelled) { 68 if (!*cancelled) {
69 xen_irq_resume(); 69 xen_irq_resume();
70 xen_console_resume(); 70 xen_console_resume();
71 xen_timer_resume();
71 } 72 }
72 73
73 return 0; 74 return 0;
@@ -107,9 +108,10 @@ static void do_suspend(void)
107 goto out; 108 goto out;
108 } 109 }
109 110
110 if (!cancelled) 111 if (!cancelled) {
112 xen_arch_resume();
111 xenbus_resume(); 113 xenbus_resume();
112 else 114 } else
113 xenbus_suspend_cancel(); 115 xenbus_suspend_cancel();
114 116
115 device_resume(); 117 device_resume();
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
index ef5e8ec6a6ab..eef8095a09dc 100644
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -1396,8 +1396,8 @@ extern struct paravirt_patch_site __parainstructions[],
1396 * caller saved registers but the argument parameter */ 1396 * caller saved registers but the argument parameter */
1397#define PV_SAVE_REGS "pushq %%rdi;" 1397#define PV_SAVE_REGS "pushq %%rdi;"
1398#define PV_RESTORE_REGS "popq %%rdi;" 1398#define PV_RESTORE_REGS "popq %%rdi;"
1399#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx" 1399#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx", "rsi"
1400#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx" 1400#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx", "rsi"
1401#define PV_FLAGS_ARG "D" 1401#define PV_FLAGS_ARG "D"
1402#endif 1402#endif
1403 1403
@@ -1489,8 +1489,26 @@ static inline unsigned long __raw_local_irq_save(void)
1489 1489
1490 1490
1491#ifdef CONFIG_X86_64 1491#ifdef CONFIG_X86_64
1492#define PV_SAVE_REGS pushq %rax; pushq %rdi; pushq %rcx; pushq %rdx 1492#define PV_SAVE_REGS \
1493#define PV_RESTORE_REGS popq %rdx; popq %rcx; popq %rdi; popq %rax 1493 push %rax; \
1494 push %rcx; \
1495 push %rdx; \
1496 push %rsi; \
1497 push %rdi; \
1498 push %r8; \
1499 push %r9; \
1500 push %r10; \
1501 push %r11
1502#define PV_RESTORE_REGS \
1503 pop %r11; \
1504 pop %r10; \
1505 pop %r9; \
1506 pop %r8; \
1507 pop %rdi; \
1508 pop %rsi; \
1509 pop %rdx; \
1510 pop %rcx; \
1511 pop %rax
1494#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8) 1512#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8)
1495#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8) 1513#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
1496#define PARA_INDIRECT(addr) *addr(%rip) 1514#define PARA_INDIRECT(addr) *addr(%rip)
diff --git a/include/asm-x86/percpu.h b/include/asm-x86/percpu.h
index 912a3a17b9db..4e91ee1e37aa 100644
--- a/include/asm-x86/percpu.h
+++ b/include/asm-x86/percpu.h
@@ -22,6 +22,32 @@
22 22
23DECLARE_PER_CPU(struct x8664_pda, pda); 23DECLARE_PER_CPU(struct x8664_pda, pda);
24 24
25/*
26 * These are supposed to be implemented as a single instruction which
27 * operates on the per-cpu data base segment. x86-64 doesn't have
28 * that yet, so this is a fairly inefficient workaround for the
29 * meantime. The single instruction is atomic with respect to
30 * preemption and interrupts, so ideally we would disable
31 * interrupts here to get the same effect. However, because these
32 * accessors can be used from within interrupt-disable/enable pairs,
33 * we can't actually disable interrupts; disabling preemption is enough.
34 */
35#define x86_read_percpu(var) \
36 ({ \
37 typeof(per_cpu_var(var)) __tmp; \
38 preempt_disable(); \
39 __tmp = __get_cpu_var(var); \
40 preempt_enable(); \
41 __tmp; \
42 })
43
44#define x86_write_percpu(var, val) \
45 do { \
46 preempt_disable(); \
47 __get_cpu_var(var) = (val); \
48 preempt_enable(); \
49 } while(0)
50
25#else /* CONFIG_X86_64 */ 51#else /* CONFIG_X86_64 */
26 52
27#ifdef __ASSEMBLY__ 53#ifdef __ASSEMBLY__
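
As the new comment explains, the 64-bit x86_read_percpu()/x86_write_percpu() are, for now, built from __get_cpu_var() bracketed by preempt_disable()/preempt_enable() rather than a single %gs-relative instruction. Each accessor is therefore safe against preemption on its own, but a read-modify-write sequence is not atomic. A hedged sketch of a caller, assuming kernel context; the per-cpu variable name is invented:

#include <linux/percpu.h>

/* Illustrative only: assumes a counter defined elsewhere with
 * DEFINE_PER_CPU(unsigned long, demo_events); the name is invented. */
DECLARE_PER_CPU(unsigned long, demo_events);

static void demo_note_event(void)
{
	/* Each accessor is preemption-safe by itself, but this read-modify-write
	 * is not atomic as a whole; callers needing that must disable preemption
	 * (or interrupts) around the pair themselves. */
	unsigned long n = x86_read_percpu(demo_events);

	x86_write_percpu(demo_events, n + 1);
}
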
diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h
index 49cbd76b9547..96aa76e691d8 100644
--- a/include/asm-x86/pgtable.h
+++ b/include/asm-x86/pgtable.h
@@ -302,6 +302,14 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
302/* Install a pte for a particular vaddr in kernel space. */ 302/* Install a pte for a particular vaddr in kernel space. */
303void set_pte_vaddr(unsigned long vaddr, pte_t pte); 303void set_pte_vaddr(unsigned long vaddr, pte_t pte);
304 304
305#ifdef CONFIG_X86_32
306extern void native_pagetable_setup_start(pgd_t *base);
307extern void native_pagetable_setup_done(pgd_t *base);
308#else
309static inline void native_pagetable_setup_start(pgd_t *base) {}
310static inline void native_pagetable_setup_done(pgd_t *base) {}
311#endif
312
305#ifdef CONFIG_PARAVIRT 313#ifdef CONFIG_PARAVIRT
306#include <asm/paravirt.h> 314#include <asm/paravirt.h>
307#else /* !CONFIG_PARAVIRT */ 315#else /* !CONFIG_PARAVIRT */
@@ -333,6 +341,16 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte);
333 341
334#define pte_update(mm, addr, ptep) do { } while (0) 342#define pte_update(mm, addr, ptep) do { } while (0)
335#define pte_update_defer(mm, addr, ptep) do { } while (0) 343#define pte_update_defer(mm, addr, ptep) do { } while (0)
344
345static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
346{
347 native_pagetable_setup_start(base);
348}
349
350static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
351{
352 native_pagetable_setup_done(base);
353}
336#endif /* CONFIG_PARAVIRT */ 354#endif /* CONFIG_PARAVIRT */
337 355
338#endif /* __ASSEMBLY__ */ 356#endif /* __ASSEMBLY__ */
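
Moving the declarations here lets both 32- and 64-bit builds use the same pattern: with CONFIG_PARAVIRT the paravirt_pagetable_setup_start()/done() calls dispatch through pv_mmu_ops so a hypervisor port can interpose on early pagetable construction, and without it they fall straight through to the native helpers (empty stubs on 64-bit). A rough sketch of the interposition side; the pv_mmu_ops hook names are an assumption based on how this series wires up the Xen port, not a stable API:

#include <asm/paravirt.h>
#include <asm/pgtable.h>

static void __init demo_pagetable_setup_start(pgd_t *base)
{
	/* a guest port could pin or fix up the boot pagetables here
	 * before generic setup populates them ... */
	native_pagetable_setup_start(base);
}

static void __init demo_install_pagetable_hooks(void)
{
	pv_mmu_ops.pagetable_setup_start = demo_pagetable_setup_start;
	pv_mmu_ops.pagetable_setup_done  = native_pagetable_setup_done;
}
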
diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h
index ec871c420d7e..0611abf96a5e 100644
--- a/include/asm-x86/pgtable_32.h
+++ b/include/asm-x86/pgtable_32.h
@@ -171,21 +171,6 @@ do { \
171 */ 171 */
172#define update_mmu_cache(vma, address, pte) do { } while (0) 172#define update_mmu_cache(vma, address, pte) do { } while (0)
173 173
174extern void native_pagetable_setup_start(pgd_t *base);
175extern void native_pagetable_setup_done(pgd_t *base);
176
177#ifndef CONFIG_PARAVIRT
178static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
179{
180 native_pagetable_setup_start(base);
181}
182
183static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
184{
185 native_pagetable_setup_done(base);
186}
187#endif /* !CONFIG_PARAVIRT */
188
189#endif /* !__ASSEMBLY__ */ 174#endif /* !__ASSEMBLY__ */
190 175
191/* 176/*
diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h
index fa7208b483ca..805d3128bfc4 100644
--- a/include/asm-x86/pgtable_64.h
+++ b/include/asm-x86/pgtable_64.h
@@ -16,6 +16,8 @@
16extern pud_t level3_kernel_pgt[512]; 16extern pud_t level3_kernel_pgt[512];
17extern pud_t level3_ident_pgt[512]; 17extern pud_t level3_ident_pgt[512];
18extern pmd_t level2_kernel_pgt[512]; 18extern pmd_t level2_kernel_pgt[512];
19extern pmd_t level2_fixmap_pgt[512];
20extern pmd_t level2_ident_pgt[512];
19extern pgd_t init_level4_pgt[]; 21extern pgd_t init_level4_pgt[];
20 22
21#define swapper_pg_dir init_level4_pgt 23#define swapper_pg_dir init_level4_pgt
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h
index 90ab2225e71b..659492624e74 100644
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -76,6 +76,7 @@ extern unsigned long init_pg_tables_start;
76extern unsigned long init_pg_tables_end; 76extern unsigned long init_pg_tables_end;
77 77
78#else 78#else
79void __init x86_64_init_pda(void);
79void __init x86_64_start_kernel(char *real_mode); 80void __init x86_64_start_kernel(char *real_mode);
80void __init x86_64_start_reservations(char *real_mode_data); 81void __init x86_64_start_reservations(char *real_mode_data);
81 82
diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h
index c2784b3e0b77..3c877f74f279 100644
--- a/include/asm-x86/smp.h
+++ b/include/asm-x86/smp.h
@@ -25,6 +25,8 @@ extern cpumask_t cpu_callin_map;
25extern void (*mtrr_hook)(void); 25extern void (*mtrr_hook)(void);
26extern void zap_low_mappings(void); 26extern void zap_low_mappings(void);
27 27
28extern int __cpuinit get_local_pda(int cpu);
29
28extern int smp_num_siblings; 30extern int smp_num_siblings;
29extern unsigned int num_processors; 31extern unsigned int num_processors;
30extern cpumask_t cpu_initialized; 32extern cpumask_t cpu_initialized;
diff --git a/include/asm-x86/vdso.h b/include/asm-x86/vdso.h
index 86e085e003d2..8e18fb80f5e6 100644
--- a/include/asm-x86/vdso.h
+++ b/include/asm-x86/vdso.h
@@ -36,4 +36,12 @@ extern const char VDSO32_PRELINK[];
36extern void __user __kernel_sigreturn; 36extern void __user __kernel_sigreturn;
37extern void __user __kernel_rt_sigreturn; 37extern void __user __kernel_rt_sigreturn;
38 38
39/*
40 * These symbols are defined by vdso32.S to mark the bounds
41 * of the ELF DSO images included therein.
42 */
43extern const char vdso32_int80_start, vdso32_int80_end;
44extern const char vdso32_syscall_start, vdso32_syscall_end;
45extern const char vdso32_sysenter_start, vdso32_sysenter_end;
46
39#endif /* asm-x86/vdso.h */ 47#endif /* asm-x86/vdso.h */
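
With start/end symbols for each embedded DSO image, C code can treat the images built by vdso32.S as plain byte ranges and copy whichever flavour (int80, syscall or sysenter) the CPU should use into the vDSO page. A hedged sketch of that selection-and-copy step; the destination pointer and chooser flag are invented, and this is not the real vdso32-setup.c code:

#include <linux/string.h>
#include <asm/vdso.h>

static void demo_install_vdso32(void *vdso_page, int use_sysenter)
{
	const char *start = use_sysenter ? &vdso32_sysenter_start
					 : &vdso32_int80_start;
	const char *end   = use_sysenter ? &vdso32_sysenter_end
					 : &vdso32_int80_end;

	/* copy the chosen DSO image into the page mapped for the vDSO */
	memcpy(vdso_page, start, end - start);
}
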
diff --git a/include/asm-x86/xen/hypercall.h b/include/asm-x86/xen/hypercall.h
index 2a4f9b41d684..91cb7fd5c123 100644
--- a/include/asm-x86/xen/hypercall.h
+++ b/include/asm-x86/xen/hypercall.h
@@ -40,83 +40,157 @@
40#include <xen/interface/sched.h> 40#include <xen/interface/sched.h>
41#include <xen/interface/physdev.h> 41#include <xen/interface/physdev.h>
42 42
43/*
44 * The hypercall asms have to meet several constraints:
45 * - Work on 32- and 64-bit.
46 * The two architectures put their arguments in different sets of
47 * registers.
48 *
49 * - Work around asm syntax quirks
50 * It isn't possible to specify one of the rNN registers in a
51 * constraint, so we use explicit register variables to get the
52 * args into the right place.
53 *
54 * - Mark all registers as potentially clobbered
55 * Even unused parameters can be clobbered by the hypervisor, so we
56 * need to make sure gcc knows it.
57 *
58 * - Avoid compiler bugs.
59 * This is the tricky part. Because x86_32 has such a constrained
60 * register set, gcc versions below 4.3 have trouble generating
61 * code when all the arg registers and memory are trashed by the
62 * asm. There are syntactically simpler ways of achieving the
63 * semantics below, but they cause the compiler to crash.
64 *
65 * The only combination I found which works is:
66 * - assign the __argX variables first
67 * - list all actually used parameters as "+r" (__argX)
68 * - clobber the rest
69 *
70 * The result certainly isn't pretty, and it really shows up cpp's
 71 * weakness as a macro language. Sorry. (But let's just give thanks
72 * there aren't more than 5 arguments...)
73 */
74
43extern struct { char _entry[32]; } hypercall_page[]; 75extern struct { char _entry[32]; } hypercall_page[];
44 76
77#define __HYPERCALL "call hypercall_page+%c[offset]"
78#define __HYPERCALL_ENTRY(x) \
79 [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0]))
80
81#ifdef CONFIG_X86_32
82#define __HYPERCALL_RETREG "eax"
83#define __HYPERCALL_ARG1REG "ebx"
84#define __HYPERCALL_ARG2REG "ecx"
85#define __HYPERCALL_ARG3REG "edx"
86#define __HYPERCALL_ARG4REG "esi"
87#define __HYPERCALL_ARG5REG "edi"
88#else
89#define __HYPERCALL_RETREG "rax"
90#define __HYPERCALL_ARG1REG "rdi"
91#define __HYPERCALL_ARG2REG "rsi"
92#define __HYPERCALL_ARG3REG "rdx"
93#define __HYPERCALL_ARG4REG "r10"
94#define __HYPERCALL_ARG5REG "r8"
95#endif
96
97#define __HYPERCALL_DECLS \
98 register unsigned long __res asm(__HYPERCALL_RETREG); \
99 register unsigned long __arg1 asm(__HYPERCALL_ARG1REG) = __arg1; \
100 register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \
101 register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \
102 register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \
103 register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5;
104
105#define __HYPERCALL_0PARAM "=r" (__res)
106#define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1)
107#define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2)
108#define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3)
109#define __HYPERCALL_4PARAM __HYPERCALL_3PARAM, "+r" (__arg4)
110#define __HYPERCALL_5PARAM __HYPERCALL_4PARAM, "+r" (__arg5)
111
112#define __HYPERCALL_0ARG()
113#define __HYPERCALL_1ARG(a1) \
114 __HYPERCALL_0ARG() __arg1 = (unsigned long)(a1);
115#define __HYPERCALL_2ARG(a1,a2) \
116 __HYPERCALL_1ARG(a1) __arg2 = (unsigned long)(a2);
117#define __HYPERCALL_3ARG(a1,a2,a3) \
118 __HYPERCALL_2ARG(a1,a2) __arg3 = (unsigned long)(a3);
119#define __HYPERCALL_4ARG(a1,a2,a3,a4) \
120 __HYPERCALL_3ARG(a1,a2,a3) __arg4 = (unsigned long)(a4);
121#define __HYPERCALL_5ARG(a1,a2,a3,a4,a5) \
122 __HYPERCALL_4ARG(a1,a2,a3,a4) __arg5 = (unsigned long)(a5);
123
124#define __HYPERCALL_CLOBBER5 "memory"
125#define __HYPERCALL_CLOBBER4 __HYPERCALL_CLOBBER5, __HYPERCALL_ARG5REG
126#define __HYPERCALL_CLOBBER3 __HYPERCALL_CLOBBER4, __HYPERCALL_ARG4REG
127#define __HYPERCALL_CLOBBER2 __HYPERCALL_CLOBBER3, __HYPERCALL_ARG3REG
128#define __HYPERCALL_CLOBBER1 __HYPERCALL_CLOBBER2, __HYPERCALL_ARG2REG
129#define __HYPERCALL_CLOBBER0 __HYPERCALL_CLOBBER1, __HYPERCALL_ARG1REG
130
45#define _hypercall0(type, name) \ 131#define _hypercall0(type, name) \
46({ \ 132({ \
47 long __res; \ 133 __HYPERCALL_DECLS; \
48 asm volatile ( \ 134 __HYPERCALL_0ARG(); \
49 "call %[call]" \ 135 asm volatile (__HYPERCALL \
50 : "=a" (__res) \ 136 : __HYPERCALL_0PARAM \
51 : [call] "m" (hypercall_page[__HYPERVISOR_##name]) \ 137 : __HYPERCALL_ENTRY(name) \
52 : "memory" ); \ 138 : __HYPERCALL_CLOBBER0); \
53 (type)__res; \ 139 (type)__res; \
54}) 140})
55 141
56#define _hypercall1(type, name, a1) \ 142#define _hypercall1(type, name, a1) \
57({ \ 143({ \
58 long __res, __ign1; \ 144 __HYPERCALL_DECLS; \
59 asm volatile ( \ 145 __HYPERCALL_1ARG(a1); \
60 "call %[call]" \ 146 asm volatile (__HYPERCALL \
61 : "=a" (__res), "=b" (__ign1) \ 147 : __HYPERCALL_1PARAM \
62 : "1" ((long)(a1)), \ 148 : __HYPERCALL_ENTRY(name) \
63 [call] "m" (hypercall_page[__HYPERVISOR_##name]) \ 149 : __HYPERCALL_CLOBBER1); \
64 : "memory" ); \
65 (type)__res; \ 150 (type)__res; \
66}) 151})
67 152
68#define _hypercall2(type, name, a1, a2) \ 153#define _hypercall2(type, name, a1, a2) \
69({ \ 154({ \
70 long __res, __ign1, __ign2; \ 155 __HYPERCALL_DECLS; \
71 asm volatile ( \ 156 __HYPERCALL_2ARG(a1, a2); \
72 "call %[call]" \ 157 asm volatile (__HYPERCALL \
73 : "=a" (__res), "=b" (__ign1), "=c" (__ign2) \ 158 : __HYPERCALL_2PARAM \
74 : "1" ((long)(a1)), "2" ((long)(a2)), \ 159 : __HYPERCALL_ENTRY(name) \
75 [call] "m" (hypercall_page[__HYPERVISOR_##name]) \ 160 : __HYPERCALL_CLOBBER2); \
76 : "memory" ); \
77 (type)__res; \ 161 (type)__res; \
78}) 162})
79 163
80#define _hypercall3(type, name, a1, a2, a3) \ 164#define _hypercall3(type, name, a1, a2, a3) \
81({ \ 165({ \
82 long __res, __ign1, __ign2, __ign3; \ 166 __HYPERCALL_DECLS; \
83 asm volatile ( \ 167 __HYPERCALL_3ARG(a1, a2, a3); \
84 "call %[call]" \ 168 asm volatile (__HYPERCALL \
85 : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ 169 : __HYPERCALL_3PARAM \
86 "=d" (__ign3) \ 170 : __HYPERCALL_ENTRY(name) \
87 : "1" ((long)(a1)), "2" ((long)(a2)), \ 171 : __HYPERCALL_CLOBBER3); \
88 "3" ((long)(a3)), \
89 [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
90 : "memory" ); \
91 (type)__res; \ 172 (type)__res; \
92}) 173})
93 174
94#define _hypercall4(type, name, a1, a2, a3, a4) \ 175#define _hypercall4(type, name, a1, a2, a3, a4) \
95({ \ 176({ \
96 long __res, __ign1, __ign2, __ign3, __ign4; \ 177 __HYPERCALL_DECLS; \
97 asm volatile ( \ 178 __HYPERCALL_4ARG(a1, a2, a3, a4); \
98 "call %[call]" \ 179 asm volatile (__HYPERCALL \
99 : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ 180 : __HYPERCALL_4PARAM \
100 "=d" (__ign3), "=S" (__ign4) \ 181 : __HYPERCALL_ENTRY(name) \
101 : "1" ((long)(a1)), "2" ((long)(a2)), \ 182 : __HYPERCALL_CLOBBER4); \
102 "3" ((long)(a3)), "4" ((long)(a4)), \
103 [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
104 : "memory" ); \
105 (type)__res; \ 183 (type)__res; \
106}) 184})
107 185
108#define _hypercall5(type, name, a1, a2, a3, a4, a5) \ 186#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
109({ \ 187({ \
110 long __res, __ign1, __ign2, __ign3, __ign4, __ign5; \ 188 __HYPERCALL_DECLS; \
111 asm volatile ( \ 189 __HYPERCALL_5ARG(a1, a2, a3, a4, a5); \
112 "call %[call]" \ 190 asm volatile (__HYPERCALL \
113 : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ 191 : __HYPERCALL_5PARAM \
114 "=d" (__ign3), "=S" (__ign4), "=D" (__ign5) \ 192 : __HYPERCALL_ENTRY(name) \
115 : "1" ((long)(a1)), "2" ((long)(a2)), \ 193 : __HYPERCALL_CLOBBER5); \
116 "3" ((long)(a3)), "4" ((long)(a4)), \
117 "5" ((long)(a5)), \
118 [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
119 : "memory" ); \
120 (type)__res; \ 194 (type)__res; \
121}) 195})
122 196
@@ -152,6 +226,7 @@ HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp)
152 return _hypercall2(int, stack_switch, ss, esp); 226 return _hypercall2(int, stack_switch, ss, esp);
153} 227}
154 228
229#ifdef CONFIG_X86_32
155static inline int 230static inline int
156HYPERVISOR_set_callbacks(unsigned long event_selector, 231HYPERVISOR_set_callbacks(unsigned long event_selector,
157 unsigned long event_address, 232 unsigned long event_address,
@@ -162,6 +237,17 @@ HYPERVISOR_set_callbacks(unsigned long event_selector,
162 event_selector, event_address, 237 event_selector, event_address,
163 failsafe_selector, failsafe_address); 238 failsafe_selector, failsafe_address);
164} 239}
240#else /* CONFIG_X86_64 */
241static inline int
242HYPERVISOR_set_callbacks(unsigned long event_address,
243 unsigned long failsafe_address,
244 unsigned long syscall_address)
245{
246 return _hypercall3(int, set_callbacks,
247 event_address, failsafe_address,
248 syscall_address);
249}
250#endif /* CONFIG_X86_{32,64} */
165 251
166static inline int 252static inline int
167HYPERVISOR_callback_op(int cmd, void *arg) 253HYPERVISOR_callback_op(int cmd, void *arg)
@@ -223,12 +309,12 @@ static inline int
223HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val, 309HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
224 unsigned long flags) 310 unsigned long flags)
225{ 311{
226 unsigned long pte_hi = 0; 312 if (sizeof(new_val) == sizeof(long))
227#ifdef CONFIG_X86_PAE 313 return _hypercall3(int, update_va_mapping, va,
228 pte_hi = new_val.pte_high; 314 new_val.pte, flags);
229#endif 315 else
230 return _hypercall4(int, update_va_mapping, va, 316 return _hypercall4(int, update_va_mapping, va,
231 new_val.pte_low, pte_hi, flags); 317 new_val.pte, new_val.pte >> 32, flags);
232} 318}
233 319
234static inline int 320static inline int
@@ -281,12 +367,13 @@ static inline int
281HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val, 367HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val,
282 unsigned long flags, domid_t domid) 368 unsigned long flags, domid_t domid)
283{ 369{
284 unsigned long pte_hi = 0; 370 if (sizeof(new_val) == sizeof(long))
285#ifdef CONFIG_X86_PAE 371 return _hypercall4(int, update_va_mapping_otherdomain, va,
286 pte_hi = new_val.pte_high; 372 new_val.pte, flags, domid);
287#endif 373 else
288 return _hypercall5(int, update_va_mapping_otherdomain, va, 374 return _hypercall5(int, update_va_mapping_otherdomain, va,
289 new_val.pte_low, pte_hi, flags, domid); 375 new_val.pte, new_val.pte >> 32,
376 flags, domid);
290} 377}
291 378
292static inline int 379static inline int
@@ -301,6 +388,14 @@ HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args)
301 return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args); 388 return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
302} 389}
303 390
391#ifdef CONFIG_X86_64
392static inline int
393HYPERVISOR_set_segment_base(int reg, unsigned long value)
394{
395 return _hypercall2(int, set_segment_base, reg, value);
396}
397#endif
398
304static inline int 399static inline int
305HYPERVISOR_suspend(unsigned long srec) 400HYPERVISOR_suspend(unsigned long srec)
306{ 401{
@@ -327,14 +422,14 @@ MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
327{ 422{
328 mcl->op = __HYPERVISOR_update_va_mapping; 423 mcl->op = __HYPERVISOR_update_va_mapping;
329 mcl->args[0] = va; 424 mcl->args[0] = va;
330#ifdef CONFIG_X86_PAE 425 if (sizeof(new_val) == sizeof(long)) {
331 mcl->args[1] = new_val.pte_low; 426 mcl->args[1] = new_val.pte;
332 mcl->args[2] = new_val.pte_high; 427 mcl->args[2] = flags;
333#else 428 } else {
334 mcl->args[1] = new_val.pte_low; 429 mcl->args[1] = new_val.pte;
335 mcl->args[2] = 0; 430 mcl->args[2] = new_val.pte >> 32;
336#endif 431 mcl->args[3] = flags;
337 mcl->args[3] = flags; 432 }
338} 433}
339 434
340static inline void 435static inline void
@@ -354,15 +449,16 @@ MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long v
354{ 449{
355 mcl->op = __HYPERVISOR_update_va_mapping_otherdomain; 450 mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
356 mcl->args[0] = va; 451 mcl->args[0] = va;
357#ifdef CONFIG_X86_PAE 452 if (sizeof(new_val) == sizeof(long)) {
358 mcl->args[1] = new_val.pte_low; 453 mcl->args[1] = new_val.pte;
359 mcl->args[2] = new_val.pte_high; 454 mcl->args[2] = flags;
360#else 455 mcl->args[3] = domid;
361 mcl->args[1] = new_val.pte_low; 456 } else {
362 mcl->args[2] = 0; 457 mcl->args[1] = new_val.pte;
363#endif 458 mcl->args[2] = new_val.pte >> 32;
364 mcl->args[3] = flags; 459 mcl->args[3] = flags;
365 mcl->args[4] = domid; 460 mcl->args[4] = domid;
461 }
366} 462}
367 463
368static inline void 464static inline void
@@ -370,10 +466,15 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
370 struct desc_struct desc) 466 struct desc_struct desc)
371{ 467{
372 mcl->op = __HYPERVISOR_update_descriptor; 468 mcl->op = __HYPERVISOR_update_descriptor;
373 mcl->args[0] = maddr; 469 if (sizeof(maddr) == sizeof(long)) {
374 mcl->args[1] = maddr >> 32; 470 mcl->args[0] = maddr;
375 mcl->args[2] = desc.a; 471 mcl->args[1] = *(unsigned long *)&desc;
376 mcl->args[3] = desc.b; 472 } else {
473 mcl->args[0] = maddr;
474 mcl->args[1] = maddr >> 32;
475 mcl->args[2] = desc.a;
476 mcl->args[3] = desc.b;
477 }
377} 478}
378 479
379static inline void 480static inline void
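
The heart of the rewritten _hypercallN macros is the technique the comment block describes: pin each argument to an explicit register variable, list only the arguments actually used as "+r" operands, and clobber memory plus the unused argument registers. A stand-alone sketch of that constraint pattern, specialised by hand for a two-argument call on x86-64; 'entry' stands in for the fixed hypercall_page slot, so this is an illustration of the pattern, not the kernel macro itself:

/* x86-64 passes hypercall arguments in %rdi/%rsi/%rdx/%r10/%r8 and
 * returns the result in %rax, per the register table above. */
static inline long demo_hypercall2(void *entry, unsigned long a1, unsigned long a2)
{
	register unsigned long __res  asm("rax");
	register unsigned long __arg1 asm("rdi") = a1;
	register unsigned long __arg2 asm("rsi") = a2;

	asm volatile("call *%[entry]"
		     : "=r" (__res), "+r" (__arg1), "+r" (__arg2)
		     : [entry] "r" (entry)
		     /* the hypervisor may trash the unused argument registers too */
		     : "memory", "rdx", "r10", "r8");

	return (long)__res;
}
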
diff --git a/include/asm-x86/xen/interface.h b/include/asm-x86/xen/interface.h
index 6227000a1e84..9d810f2538a2 100644
--- a/include/asm-x86/xen/interface.h
+++ b/include/asm-x86/xen/interface.h
@@ -1,13 +1,13 @@
1/****************************************************************************** 1/******************************************************************************
2 * arch-x86_32.h 2 * arch-x86_32.h
3 * 3 *
4 * Guest OS interface to x86 32-bit Xen. 4 * Guest OS interface to x86 Xen.
5 * 5 *
6 * Copyright (c) 2004, K A Fraser 6 * Copyright (c) 2004, K A Fraser
7 */ 7 */
8 8
9#ifndef __XEN_PUBLIC_ARCH_X86_32_H__ 9#ifndef __ASM_X86_XEN_INTERFACE_H
10#define __XEN_PUBLIC_ARCH_X86_32_H__ 10#define __ASM_X86_XEN_INTERFACE_H
11 11
12#ifdef __XEN__ 12#ifdef __XEN__
13#define __DEFINE_GUEST_HANDLE(name, type) \ 13#define __DEFINE_GUEST_HANDLE(name, type) \
@@ -57,6 +57,17 @@ DEFINE_GUEST_HANDLE(long);
57DEFINE_GUEST_HANDLE(void); 57DEFINE_GUEST_HANDLE(void);
58#endif 58#endif
59 59
60#ifndef HYPERVISOR_VIRT_START
61#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
62#endif
63
64#ifndef machine_to_phys_mapping
65#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
66#endif
67
68/* Maximum number of virtual CPUs in multi-processor guests. */
69#define MAX_VIRT_CPUS 32
70
60/* 71/*
61 * SEGMENT DESCRIPTOR TABLES 72 * SEGMENT DESCRIPTOR TABLES
62 */ 73 */
@@ -71,58 +82,21 @@ DEFINE_GUEST_HANDLE(void);
71#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8) 82#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
72 83
73/* 84/*
74 * These flat segments are in the Xen-private section of every GDT. Since these
75 * are also present in the initial GDT, many OSes will be able to avoid
76 * installing their own GDT.
77 */
78#define FLAT_RING1_CS 0xe019 /* GDT index 259 */
79#define FLAT_RING1_DS 0xe021 /* GDT index 260 */
80#define FLAT_RING1_SS 0xe021 /* GDT index 260 */
81#define FLAT_RING3_CS 0xe02b /* GDT index 261 */
82#define FLAT_RING3_DS 0xe033 /* GDT index 262 */
83#define FLAT_RING3_SS 0xe033 /* GDT index 262 */
84
85#define FLAT_KERNEL_CS FLAT_RING1_CS
86#define FLAT_KERNEL_DS FLAT_RING1_DS
87#define FLAT_KERNEL_SS FLAT_RING1_SS
88#define FLAT_USER_CS FLAT_RING3_CS
89#define FLAT_USER_DS FLAT_RING3_DS
90#define FLAT_USER_SS FLAT_RING3_SS
91
92/* And the trap vector is... */
93#define TRAP_INSTR "int $0x82"
94
95/*
96 * Virtual addresses beyond this are not modifiable by guest OSes. The
97 * machine->physical mapping table starts at this address, read-only.
98 */
99#ifdef CONFIG_X86_PAE
100#define __HYPERVISOR_VIRT_START 0xF5800000
101#else
102#define __HYPERVISOR_VIRT_START 0xFC000000
103#endif
104
105#ifndef HYPERVISOR_VIRT_START
106#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
107#endif
108
109#ifndef machine_to_phys_mapping
110#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
111#endif
112
113/* Maximum number of virtual CPUs in multi-processor guests. */
114#define MAX_VIRT_CPUS 32
115
116#ifndef __ASSEMBLY__
117
118/*
119 * Send an array of these to HYPERVISOR_set_trap_table() 85 * Send an array of these to HYPERVISOR_set_trap_table()
86 * The privilege level specifies which modes may enter a trap via a software
87 * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
88 * privilege levels as follows:
 89 * Level == 0: No one may enter
90 * Level == 1: Kernel may enter
91 * Level == 2: Kernel may enter
92 * Level == 3: Everyone may enter
120 */ 93 */
121#define TI_GET_DPL(_ti) ((_ti)->flags & 3) 94#define TI_GET_DPL(_ti) ((_ti)->flags & 3)
122#define TI_GET_IF(_ti) ((_ti)->flags & 4) 95#define TI_GET_IF(_ti) ((_ti)->flags & 4)
123#define TI_SET_DPL(_ti, _dpl) ((_ti)->flags |= (_dpl)) 96#define TI_SET_DPL(_ti, _dpl) ((_ti)->flags |= (_dpl))
124#define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2)) 97#define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2))
125 98
99#ifndef __ASSEMBLY__
126struct trap_info { 100struct trap_info {
127 uint8_t vector; /* exception vector */ 101 uint8_t vector; /* exception vector */
128 uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */ 102 uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */
@@ -131,32 +105,21 @@ struct trap_info {
131}; 105};
132DEFINE_GUEST_HANDLE_STRUCT(trap_info); 106DEFINE_GUEST_HANDLE_STRUCT(trap_info);
133 107
134struct cpu_user_regs { 108struct arch_shared_info {
135 uint32_t ebx; 109 unsigned long max_pfn; /* max pfn that appears in table */
136 uint32_t ecx; 110 /* Frame containing list of mfns containing list of mfns containing p2m. */
137 uint32_t edx; 111 unsigned long pfn_to_mfn_frame_list_list;
138 uint32_t esi; 112 unsigned long nmi_reason;
139 uint32_t edi;
140 uint32_t ebp;
141 uint32_t eax;
142 uint16_t error_code; /* private */
143 uint16_t entry_vector; /* private */
144 uint32_t eip;
145 uint16_t cs;
146 uint8_t saved_upcall_mask;
147 uint8_t _pad0;
148 uint32_t eflags; /* eflags.IF == !saved_upcall_mask */
149 uint32_t esp;
150 uint16_t ss, _pad1;
151 uint16_t es, _pad2;
152 uint16_t ds, _pad3;
153 uint16_t fs, _pad4;
154 uint16_t gs, _pad5;
155}; 113};
156DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs); 114#endif /* !__ASSEMBLY__ */
157 115
158typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */ 116#ifdef CONFIG_X86_32
117#include "interface_32.h"
118#else
119#include "interface_64.h"
120#endif
159 121
122#ifndef __ASSEMBLY__
160/* 123/*
161 * The following is all CPU context. Note that the fpu_ctxt block is filled 124 * The following is all CPU context. Note that the fpu_ctxt block is filled
162 * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. 125 * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
@@ -173,33 +136,29 @@ struct vcpu_guest_context {
173 unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */ 136 unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */
174 unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */ 137 unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
175 unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */ 138 unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */
139 /* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */
176 unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */ 140 unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */
177 unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */ 141 unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */
142#ifdef __i386__
178 unsigned long event_callback_cs; /* CS:EIP of event callback */ 143 unsigned long event_callback_cs; /* CS:EIP of event callback */
179 unsigned long event_callback_eip; 144 unsigned long event_callback_eip;
180 unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */ 145 unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */
181 unsigned long failsafe_callback_eip; 146 unsigned long failsafe_callback_eip;
147#else
148 unsigned long event_callback_eip;
149 unsigned long failsafe_callback_eip;
150 unsigned long syscall_callback_eip;
151#endif
182 unsigned long vm_assist; /* VMASST_TYPE_* bitmap */ 152 unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
153#ifdef __x86_64__
154 /* Segment base addresses. */
155 uint64_t fs_base;
156 uint64_t gs_base_kernel;
157 uint64_t gs_base_user;
158#endif
183}; 159};
184DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context); 160DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
185 161#endif /* !__ASSEMBLY__ */
186struct arch_shared_info {
187 unsigned long max_pfn; /* max pfn that appears in table */
188 /* Frame containing list of mfns containing list of mfns containing p2m. */
189 unsigned long pfn_to_mfn_frame_list_list;
190 unsigned long nmi_reason;
191};
192
193struct arch_vcpu_info {
194 unsigned long cr2;
195 unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
196};
197
198struct xen_callback {
199 unsigned long cs;
200 unsigned long eip;
201};
202#endif /* !__ASSEMBLY__ */
203 162
204/* 163/*
205 * Prefix forces emulation of some non-trapping instructions. 164 * Prefix forces emulation of some non-trapping instructions.
@@ -213,4 +172,4 @@ struct xen_callback {
213#define XEN_CPUID XEN_EMULATE_PREFIX "cpuid" 172#define XEN_CPUID XEN_EMULATE_PREFIX "cpuid"
214#endif 173#endif
215 174
216#endif 175#endif /* __ASM_X86_XEN_INTERFACE_H */
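
The privilege-level scheme described above lives in trap_info.flags: bits 0-1 hold the DPL that may enter the trap via software interrupt, and bit 2 asks for event delivery to be masked on entry, which is exactly what the TI_* helpers manipulate. A quick user-space restatement of those helpers; demo_trap_info mirrors only the two fields involved:

#include <stdint.h>
#include <stdio.h>

struct demo_trap_info { uint8_t vector; uint8_t flags; };

#define TI_GET_DPL(_ti)      ((_ti)->flags & 3)
#define TI_GET_IF(_ti)       ((_ti)->flags & 4)
#define TI_SET_DPL(_ti, _d)  ((_ti)->flags |= (_d))
#define TI_SET_IF(_ti, _if)  ((_ti)->flags |= ((!!(_if)) << 2))

int main(void)
{
	struct demo_trap_info ti = { .vector = 3, .flags = 0 };

	TI_SET_DPL(&ti, 3);	/* everyone (ring 3 included) may enter this trap */
	TI_SET_IF(&ti, 1);	/* enter the handler with event delivery masked */

	printf("dpl=%d mask-events=%d\n", TI_GET_DPL(&ti), TI_GET_IF(&ti) ? 1 : 0);
	return 0;
}
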
diff --git a/include/asm-x86/xen/interface_32.h b/include/asm-x86/xen/interface_32.h
new file mode 100644
index 000000000000..d8ac41d5db86
--- /dev/null
+++ b/include/asm-x86/xen/interface_32.h
@@ -0,0 +1,97 @@
1/******************************************************************************
2 * arch-x86_32.h
3 *
4 * Guest OS interface to x86 32-bit Xen.
5 *
6 * Copyright (c) 2004, K A Fraser
7 */
8
9#ifndef __ASM_X86_XEN_INTERFACE_32_H
10#define __ASM_X86_XEN_INTERFACE_32_H
11
12
13/*
14 * These flat segments are in the Xen-private section of every GDT. Since these
15 * are also present in the initial GDT, many OSes will be able to avoid
16 * installing their own GDT.
17 */
18#define FLAT_RING1_CS 0xe019 /* GDT index 259 */
19#define FLAT_RING1_DS 0xe021 /* GDT index 260 */
20#define FLAT_RING1_SS 0xe021 /* GDT index 260 */
21#define FLAT_RING3_CS 0xe02b /* GDT index 261 */
22#define FLAT_RING3_DS 0xe033 /* GDT index 262 */
23#define FLAT_RING3_SS 0xe033 /* GDT index 262 */
24
25#define FLAT_KERNEL_CS FLAT_RING1_CS
26#define FLAT_KERNEL_DS FLAT_RING1_DS
27#define FLAT_KERNEL_SS FLAT_RING1_SS
28#define FLAT_USER_CS FLAT_RING3_CS
29#define FLAT_USER_DS FLAT_RING3_DS
30#define FLAT_USER_SS FLAT_RING3_SS
31
32/* And the trap vector is... */
33#define TRAP_INSTR "int $0x82"
34
35/*
36 * Virtual addresses beyond this are not modifiable by guest OSes. The
37 * machine->physical mapping table starts at this address, read-only.
38 */
39#define __HYPERVISOR_VIRT_START 0xF5800000
40
41#ifndef __ASSEMBLY__
42
43struct cpu_user_regs {
44 uint32_t ebx;
45 uint32_t ecx;
46 uint32_t edx;
47 uint32_t esi;
48 uint32_t edi;
49 uint32_t ebp;
50 uint32_t eax;
51 uint16_t error_code; /* private */
52 uint16_t entry_vector; /* private */
53 uint32_t eip;
54 uint16_t cs;
55 uint8_t saved_upcall_mask;
56 uint8_t _pad0;
57 uint32_t eflags; /* eflags.IF == !saved_upcall_mask */
58 uint32_t esp;
59 uint16_t ss, _pad1;
60 uint16_t es, _pad2;
61 uint16_t ds, _pad3;
62 uint16_t fs, _pad4;
63 uint16_t gs, _pad5;
64};
65DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
66
67typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
68
69struct arch_vcpu_info {
70 unsigned long cr2;
71 unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
72};
73
74struct xen_callback {
75 unsigned long cs;
76 unsigned long eip;
77};
78typedef struct xen_callback xen_callback_t;
79
80#define XEN_CALLBACK(__cs, __eip) \
81 ((struct xen_callback){ .cs = (__cs), .eip = (unsigned long)(__eip) })
82#endif /* !__ASSEMBLY__ */
83
84
85/*
86 * Page-directory addresses above 4GB do not fit into architectural %cr3.
87 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
88 * must use the following accessor macros to pack/unpack valid MFNs.
89 *
90 * Note that Xen is using the fact that the pagetable base is always
91 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
92 * of cr3.
93 */
94#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
95#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
96
97#endif /* __ASM_X86_XEN_INTERFACE_32_H */
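
The cr3 packing macros at the end of this header implement the rotation described in the comment: a PAE pagetable base is page-aligned, so its low 12 bits are free, and the top 12 bits of an above-4GB address can be rotated down into them. A tiny worked round trip, as a user-space demo with an invented frame number:

#include <stdio.h>

/* same packing as xen_pfn_to_cr3()/xen_cr3_to_pfn() above */
#define demo_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
#define demo_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))

int main(void)
{
	unsigned pfn = 0x00123456;		/* frame above the 4GB boundary */
	unsigned cr3 = demo_pfn_to_cr3(pfn);	/* 0x23456001: top 12 bits rotated low */

	printf("pfn %#x -> cr3 %#x -> pfn %#x\n",
	       pfn, cr3, demo_cr3_to_pfn(cr3));	/* round-trips back to 0x123456 */
	return 0;
}
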
diff --git a/include/asm-x86/xen/interface_64.h b/include/asm-x86/xen/interface_64.h
new file mode 100644
index 000000000000..842266ce96e6
--- /dev/null
+++ b/include/asm-x86/xen/interface_64.h
@@ -0,0 +1,159 @@
1#ifndef __ASM_X86_XEN_INTERFACE_64_H
2#define __ASM_X86_XEN_INTERFACE_64_H
3
4/*
5 * 64-bit segment selectors
6 * These flat segments are in the Xen-private section of every GDT. Since these
7 * are also present in the initial GDT, many OSes will be able to avoid
8 * installing their own GDT.
9 */
10
11#define FLAT_RING3_CS32 0xe023 /* GDT index 260 */
12#define FLAT_RING3_CS64 0xe033 /* GDT index 261 */
13#define FLAT_RING3_DS32 0xe02b /* GDT index 262 */
14#define FLAT_RING3_DS64 0x0000 /* NULL selector */
15#define FLAT_RING3_SS32 0xe02b /* GDT index 262 */
16#define FLAT_RING3_SS64 0xe02b /* GDT index 262 */
17
18#define FLAT_KERNEL_DS64 FLAT_RING3_DS64
19#define FLAT_KERNEL_DS32 FLAT_RING3_DS32
20#define FLAT_KERNEL_DS FLAT_KERNEL_DS64
21#define FLAT_KERNEL_CS64 FLAT_RING3_CS64
22#define FLAT_KERNEL_CS32 FLAT_RING3_CS32
23#define FLAT_KERNEL_CS FLAT_KERNEL_CS64
24#define FLAT_KERNEL_SS64 FLAT_RING3_SS64
25#define FLAT_KERNEL_SS32 FLAT_RING3_SS32
26#define FLAT_KERNEL_SS FLAT_KERNEL_SS64
27
28#define FLAT_USER_DS64 FLAT_RING3_DS64
29#define FLAT_USER_DS32 FLAT_RING3_DS32
30#define FLAT_USER_DS FLAT_USER_DS64
31#define FLAT_USER_CS64 FLAT_RING3_CS64
32#define FLAT_USER_CS32 FLAT_RING3_CS32
33#define FLAT_USER_CS FLAT_USER_CS64
34#define FLAT_USER_SS64 FLAT_RING3_SS64
35#define FLAT_USER_SS32 FLAT_RING3_SS32
36#define FLAT_USER_SS FLAT_USER_SS64
37
38#define __HYPERVISOR_VIRT_START 0xFFFF800000000000
39#define __HYPERVISOR_VIRT_END 0xFFFF880000000000
40#define __MACH2PHYS_VIRT_START 0xFFFF800000000000
41#define __MACH2PHYS_VIRT_END 0xFFFF804000000000
42
43#ifndef HYPERVISOR_VIRT_START
44#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
45#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END)
46#endif
47
48#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
49#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
50#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
51#ifndef machine_to_phys_mapping
52#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
53#endif
54
55/*
56 * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
57 * @which == SEGBASE_* ; @base == 64-bit base address
58 * Returns 0 on success.
59 */
60#define SEGBASE_FS 0
61#define SEGBASE_GS_USER 1
62#define SEGBASE_GS_KERNEL 2
63#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */
64
65/*
66 * int HYPERVISOR_iret(void)
67 * All arguments are on the kernel stack, in the following format.
68 * Never returns if successful. Current kernel context is lost.
69 * The saved CS is mapped as follows:
70 * RING0 -> RING3 kernel mode.
71 * RING1 -> RING3 kernel mode.
72 * RING2 -> RING3 kernel mode.
73 * RING3 -> RING3 user mode.
 74 * However RING0 indicates that the guest kernel should return to itself
75 * directly with
76 * orb $3,1*8(%rsp)
77 * iretq
78 * If flags contains VGCF_in_syscall:
79 * Restore RAX, RIP, RFLAGS, RSP.
80 * Discard R11, RCX, CS, SS.
81 * Otherwise:
82 * Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP.
83 * All other registers are saved on hypercall entry and restored to user.
84 */
85/* Guest exited in SYSCALL context? Return to guest with SYSRET? */
86#define _VGCF_in_syscall 8
87#define VGCF_in_syscall (1<<_VGCF_in_syscall)
88#define VGCF_IN_SYSCALL VGCF_in_syscall
89
90#ifndef __ASSEMBLY__
91
92struct iret_context {
93 /* Top of stack (%rsp at point of hypercall). */
94 uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
95 /* Bottom of iret stack frame. */
96};
97
98#if defined(__GNUC__) && !defined(__STRICT_ANSI__)
99/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */
100#define __DECL_REG(name) union { \
101 uint64_t r ## name, e ## name; \
102 uint32_t _e ## name; \
103}
104#else
105/* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */
106#define __DECL_REG(name) uint64_t r ## name
107#endif
108
109struct cpu_user_regs {
110 uint64_t r15;
111 uint64_t r14;
112 uint64_t r13;
113 uint64_t r12;
114 __DECL_REG(bp);
115 __DECL_REG(bx);
116 uint64_t r11;
117 uint64_t r10;
118 uint64_t r9;
119 uint64_t r8;
120 __DECL_REG(ax);
121 __DECL_REG(cx);
122 __DECL_REG(dx);
123 __DECL_REG(si);
124 __DECL_REG(di);
125 uint32_t error_code; /* private */
126 uint32_t entry_vector; /* private */
127 __DECL_REG(ip);
128 uint16_t cs, _pad0[1];
129 uint8_t saved_upcall_mask;
130 uint8_t _pad1[3];
131 __DECL_REG(flags); /* rflags.IF == !saved_upcall_mask */
132 __DECL_REG(sp);
133 uint16_t ss, _pad2[3];
134 uint16_t es, _pad3[3];
135 uint16_t ds, _pad4[3];
136 uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */
137 uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
138};
139DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
140
141#undef __DECL_REG
142
143#define xen_pfn_to_cr3(pfn) ((unsigned long)(pfn) << 12)
144#define xen_cr3_to_pfn(cr3) ((unsigned long)(cr3) >> 12)
145
146struct arch_vcpu_info {
147 unsigned long cr2;
148 unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
149};
150
151typedef unsigned long xen_callback_t;
152
153#define XEN_CALLBACK(__cs, __rip) \
154 ((unsigned long)(__rip))
155
156#endif /* !__ASSEMBLY__ */
157
158
159#endif /* __ASM_X86_XEN_INTERFACE_64_H */
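
The __DECL_REG union in the gcc branch simply gives one 64-bit slot both its 64-bit and 32-bit spellings (rax/eax) plus a genuinely 32-bit view (_eax). A stand-alone illustration, assuming a little-endian build as on x86:

#include <stdint.h>
#include <stdio.h>

/* same shape as the gcc branch of __DECL_REG above, reproduced for a demo */
#define DEMO_DECL_REG(name) union { uint64_t r##name, e##name; uint32_t _e##name; }

struct demo_regs {
	DEMO_DECL_REG(ax);
};

int main(void)
{
	struct demo_regs r;

	r.rax = 0x1122334455667788ULL;
	/* rax and eax name the same 64-bit member; _eax is the low 32 bits
	 * because x86 is little-endian */
	printf("rax=%#llx eax=%#llx _eax=%#x\n",
	       (unsigned long long)r.rax,
	       (unsigned long long)r.eax,
	       r._eax);
	return 0;
}
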
diff --git a/include/asm-x86/xen/page.h b/include/asm-x86/xen/page.h
index 377c04591c15..05e678a86628 100644
--- a/include/asm-x86/xen/page.h
+++ b/include/asm-x86/xen/page.h
@@ -148,13 +148,17 @@ static inline pte_t __pte_ma(pteval_t x)
148} 148}
149 149
150#define pmd_val_ma(v) ((v).pmd) 150#define pmd_val_ma(v) ((v).pmd)
151#ifdef __PAGETABLE_PUD_FOLDED
151#define pud_val_ma(v) ((v).pgd.pgd) 152#define pud_val_ma(v) ((v).pgd.pgd)
153#else
154#define pud_val_ma(v) ((v).pud)
155#endif
152#define __pmd_ma(x) ((pmd_t) { (x) } ) 156#define __pmd_ma(x) ((pmd_t) { (x) } )
153 157
154#define pgd_val_ma(x) ((x).pgd) 158#define pgd_val_ma(x) ((x).pgd)
155 159
156 160
157xmaddr_t arbitrary_virt_to_machine(unsigned long address); 161xmaddr_t arbitrary_virt_to_machine(void *address);
158void make_lowmem_page_readonly(void *vaddr); 162void make_lowmem_page_readonly(void *vaddr);
159void make_lowmem_page_readwrite(void *vaddr); 163void make_lowmem_page_readwrite(void *vaddr);
160 164
diff --git a/include/xen/hvc-console.h b/include/xen/hvc-console.h
index 98b79bc404dd..c3adde32669b 100644
--- a/include/xen/hvc-console.h
+++ b/include/xen/hvc-console.h
@@ -5,11 +5,12 @@ extern struct console xenboot_console;
5 5
6#ifdef CONFIG_HVC_XEN 6#ifdef CONFIG_HVC_XEN
7void xen_console_resume(void); 7void xen_console_resume(void);
8void xen_raw_console_write(const char *str);
9void xen_raw_printk(const char *fmt, ...);
8#else 10#else
9static inline void xen_console_resume(void) { } 11static inline void xen_console_resume(void) { }
12static inline void xen_raw_console_write(const char *str) { }
13static inline void xen_raw_printk(const char *fmt, ...) { }
10#endif 14#endif
11 15
12void xen_raw_console_write(const char *str);
13void xen_raw_printk(const char *fmt, ...);
14
15#endif /* XEN_HVC_CONSOLE_H */ 16#endif /* XEN_HVC_CONSOLE_H */
diff --git a/include/xen/interface/callback.h b/include/xen/interface/callback.h
index 4aadcba31af9..2ae3cd243264 100644
--- a/include/xen/interface/callback.h
+++ b/include/xen/interface/callback.h
@@ -82,9 +82,9 @@
82 */ 82 */
83#define CALLBACKOP_register 0 83#define CALLBACKOP_register 0
84struct callback_register { 84struct callback_register {
85 uint16_t type; 85 uint16_t type;
86 uint16_t flags; 86 uint16_t flags;
87 struct xen_callback address; 87 xen_callback_t address;
88}; 88};
89 89
90/* 90/*
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index a706d6a78960..883a21bba24b 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -11,4 +11,7 @@ void xen_post_suspend(int suspend_cancelled);
11void xen_mm_pin_all(void); 11void xen_mm_pin_all(void);
12void xen_mm_unpin_all(void); 12void xen_mm_unpin_all(void);
13 13
14void xen_timer_resume(void);
15void xen_arch_resume(void);
16
14#endif /* INCLUDE_XEN_OPS_H */ 17#endif /* INCLUDE_XEN_OPS_H */
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index b45da40e8d25..59dfdf1e1d20 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -82,7 +82,7 @@ config PM_SLEEP_SMP
82 82
83config PM_SLEEP 83config PM_SLEEP
84 bool 84 bool
85 depends on SUSPEND || HIBERNATION 85 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
86 default y 86 default y
87 87
88config SUSPEND 88config SUSPEND