aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/x86/ia32/ia32entry.S1
-rw-r--r--arch/x86/kernel/Makefile3
-rw-r--r--arch/x86/kernel/asm-offsets_64.c11
-rw-r--r--arch/x86/kernel/cpu/amd_64.c2
-rw-r--r--arch/x86/kernel/cpu/common_64.c7
-rw-r--r--arch/x86/kernel/entry_64.S106
-rw-r--r--arch/x86/kernel/head64.c11
-rw-r--r--arch/x86/kernel/head_64.S1
-rw-r--r--arch/x86/kernel/irq_32.c7
-rw-r--r--arch/x86/kernel/paravirt.c28
-rw-r--r--arch/x86/kernel/process_64.c56
-rw-r--r--arch/x86/kernel/setup.c3
-rw-r--r--arch/x86/kernel/smpboot.c2
-rw-r--r--arch/x86/mm/init_32.c2
-rw-r--r--arch/x86/vdso/Makefile2
-rw-r--r--arch/x86/vdso/vdso32-setup.c19
-rw-r--r--arch/x86/vdso/vdso32.S13
-rw-r--r--arch/x86/xen/Kconfig9
-rw-r--r--arch/x86/xen/Makefile2
-rw-r--r--arch/x86/xen/enlighten.c688
-rw-r--r--arch/x86/xen/mmu.c316
-rw-r--r--arch/x86/xen/mmu.h29
-rw-r--r--arch/x86/xen/multicalls.c1
-rw-r--r--arch/x86/xen/setup.c79
-rw-r--r--arch/x86/xen/smp.c306
-rw-r--r--arch/x86/xen/suspend.c5
-rw-r--r--arch/x86/xen/xen-asm_32.S (renamed from arch/x86/xen/xen-asm.S)0
-rw-r--r--arch/x86/xen/xen-asm_64.S271
-rw-r--r--arch/x86/xen/xen-head.S28
-rw-r--r--arch/x86/xen/xen-ops.h21
30 files changed, 1627 insertions, 402 deletions
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 20371d0635e4..0ae1e77eae50 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -321,6 +321,7 @@ ENTRY(ia32_syscall)
321 /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ 321 /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/
322 /*CFI_REL_OFFSET cs,CS-RIP*/ 322 /*CFI_REL_OFFSET cs,CS-RIP*/
323 CFI_REL_OFFSET rip,RIP-RIP 323 CFI_REL_OFFSET rip,RIP-RIP
324 PARAVIRT_ADJUST_EXCEPTION_FRAME
324 SWAPGS 325 SWAPGS
325 /* 326 /*
326 * No need to follow this irqs on/off section: the syscall 327 * No need to follow this irqs on/off section: the syscall
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index da140611bb57..058c5594f493 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -7,9 +7,10 @@ extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu
7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) 7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
8 8
9ifdef CONFIG_FTRACE 9ifdef CONFIG_FTRACE
10# Do not profile debug utilities 10# Do not profile debug and lowlevel utilities
11CFLAGS_REMOVE_tsc.o = -pg 11CFLAGS_REMOVE_tsc.o = -pg
12CFLAGS_REMOVE_rtc.o = -pg 12CFLAGS_REMOVE_rtc.o = -pg
13CFLAGS_REMOVE_paravirt.o = -pg
13endif 14endif
14 15
15# 16#
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index bacf5deeec2d..aa89387006fe 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -18,6 +18,8 @@
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/bootparam.h> 19#include <asm/bootparam.h>
20 20
21#include <xen/interface/xen.h>
22
21#define __NO_STUBS 1 23#define __NO_STUBS 1
22#undef __SYSCALL 24#undef __SYSCALL
23#undef _ASM_X86_64_UNISTD_H_ 25#undef _ASM_X86_64_UNISTD_H_
@@ -131,5 +133,14 @@ int main(void)
131 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 133 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
132 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 134 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
133 OFFSET(BP_version, boot_params, hdr.version); 135 OFFSET(BP_version, boot_params, hdr.version);
136
137 BLANK();
138 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
139#ifdef CONFIG_XEN
140 BLANK();
141 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
142 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
143#undef ENTRY
144#endif
134 return 0; 145 return 0;
135} 146}
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
index 7c36fb8a28d4..d1692b2a41ff 100644
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -115,6 +115,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ 115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
116 if (c->x86_power & (1<<8)) 116 if (c->x86_power & (1<<8))
117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
118
119 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
118} 120}
119 121
120static void __cpuinit init_amd(struct cpuinfo_x86 *c) 122static void __cpuinit init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
index 7b8cc72feb40..736f50fa433d 100644
--- a/arch/x86/kernel/cpu/common_64.c
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -16,6 +16,7 @@
16#include <asm/i387.h> 16#include <asm/i387.h>
17#include <asm/msr.h> 17#include <asm/msr.h>
18#include <asm/io.h> 18#include <asm/io.h>
19#include <asm/linkage.h>
19#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
20#include <asm/mtrr.h> 21#include <asm/mtrr.h>
21#include <asm/mce.h> 22#include <asm/mce.h>
@@ -316,9 +317,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
316 c->x86_phys_bits = eax & 0xff; 317 c->x86_phys_bits = eax & 0xff;
317 } 318 }
318 319
319 /* Assume all 64-bit CPUs support 32-bit syscall */
320 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
321
322 if (c->x86_vendor != X86_VENDOR_UNKNOWN && 320 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
323 cpu_devs[c->x86_vendor]->c_early_init) 321 cpu_devs[c->x86_vendor]->c_early_init)
324 cpu_devs[c->x86_vendor]->c_early_init(c); 322 cpu_devs[c->x86_vendor]->c_early_init(c);
@@ -517,8 +515,7 @@ void pda_init(int cpu)
517} 515}
518 516
519char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + 517char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
520 DEBUG_STKSZ] 518 DEBUG_STKSZ] __page_aligned_bss;
521__attribute__((section(".bss.page_aligned")));
522 519
523extern asmlinkage void ignore_sysret(void); 520extern asmlinkage void ignore_sysret(void);
524 521
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index ae63e584c340..80d5663db3bc 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1189,6 +1189,7 @@ END(device_not_available)
1189 /* runs on exception stack */ 1189 /* runs on exception stack */
1190KPROBE_ENTRY(debug) 1190KPROBE_ENTRY(debug)
1191 INTR_FRAME 1191 INTR_FRAME
1192 PARAVIRT_ADJUST_EXCEPTION_FRAME
1192 pushq $0 1193 pushq $0
1193 CFI_ADJUST_CFA_OFFSET 8 1194 CFI_ADJUST_CFA_OFFSET 8
1194 paranoidentry do_debug, DEBUG_STACK 1195 paranoidentry do_debug, DEBUG_STACK
@@ -1198,6 +1199,7 @@ KPROBE_END(debug)
1198 /* runs on exception stack */ 1199 /* runs on exception stack */
1199KPROBE_ENTRY(nmi) 1200KPROBE_ENTRY(nmi)
1200 INTR_FRAME 1201 INTR_FRAME
1202 PARAVIRT_ADJUST_EXCEPTION_FRAME
1201 pushq $-1 1203 pushq $-1
1202 CFI_ADJUST_CFA_OFFSET 8 1204 CFI_ADJUST_CFA_OFFSET 8
1203 paranoidentry do_nmi, 0, 0 1205 paranoidentry do_nmi, 0, 0
@@ -1211,6 +1213,7 @@ KPROBE_END(nmi)
1211 1213
1212KPROBE_ENTRY(int3) 1214KPROBE_ENTRY(int3)
1213 INTR_FRAME 1215 INTR_FRAME
1216 PARAVIRT_ADJUST_EXCEPTION_FRAME
1214 pushq $0 1217 pushq $0
1215 CFI_ADJUST_CFA_OFFSET 8 1218 CFI_ADJUST_CFA_OFFSET 8
1216 paranoidentry do_int3, DEBUG_STACK 1219 paranoidentry do_int3, DEBUG_STACK
@@ -1237,6 +1240,7 @@ END(coprocessor_segment_overrun)
1237 /* runs on exception stack */ 1240 /* runs on exception stack */
1238ENTRY(double_fault) 1241ENTRY(double_fault)
1239 XCPT_FRAME 1242 XCPT_FRAME
1243 PARAVIRT_ADJUST_EXCEPTION_FRAME
1240 paranoidentry do_double_fault 1244 paranoidentry do_double_fault
1241 jmp paranoid_exit1 1245 jmp paranoid_exit1
1242 CFI_ENDPROC 1246 CFI_ENDPROC
@@ -1253,6 +1257,7 @@ END(segment_not_present)
1253 /* runs on exception stack */ 1257 /* runs on exception stack */
1254ENTRY(stack_segment) 1258ENTRY(stack_segment)
1255 XCPT_FRAME 1259 XCPT_FRAME
1260 PARAVIRT_ADJUST_EXCEPTION_FRAME
1256 paranoidentry do_stack_segment 1261 paranoidentry do_stack_segment
1257 jmp paranoid_exit1 1262 jmp paranoid_exit1
1258 CFI_ENDPROC 1263 CFI_ENDPROC
@@ -1278,6 +1283,7 @@ END(spurious_interrupt_bug)
1278 /* runs on exception stack */ 1283 /* runs on exception stack */
1279ENTRY(machine_check) 1284ENTRY(machine_check)
1280 INTR_FRAME 1285 INTR_FRAME
1286 PARAVIRT_ADJUST_EXCEPTION_FRAME
1281 pushq $0 1287 pushq $0
1282 CFI_ADJUST_CFA_OFFSET 8 1288 CFI_ADJUST_CFA_OFFSET 8
1283 paranoidentry do_machine_check 1289 paranoidentry do_machine_check
@@ -1312,3 +1318,103 @@ KPROBE_ENTRY(ignore_sysret)
1312 sysret 1318 sysret
1313 CFI_ENDPROC 1319 CFI_ENDPROC
1314ENDPROC(ignore_sysret) 1320ENDPROC(ignore_sysret)
1321
1322#ifdef CONFIG_XEN
1323ENTRY(xen_hypervisor_callback)
1324 zeroentry xen_do_hypervisor_callback
1325END(xen_hypervisor_callback)
1326
1327/*
1328# A note on the "critical region" in our callback handler.
1329# We want to avoid stacking callback handlers due to events occurring
1330# during handling of the last event. To do this, we keep events disabled
1331# until we've done all processing. HOWEVER, we must enable events before
1332# popping the stack frame (can't be done atomically) and so it would still
1333# be possible to get enough handler activations to overflow the stack.
1334# Although unlikely, bugs of that kind are hard to track down, so we'd
1335# like to avoid the possibility.
1336# So, on entry to the handler we detect whether we interrupted an
1337# existing activation in its critical region -- if so, we pop the current
1338# activation and restart the handler using the previous one.
1339*/
1340ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1341 CFI_STARTPROC
1342/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
1343 see the correct pointer to the pt_regs */
1344 movq %rdi, %rsp # we don't return, adjust the stack frame
1345 CFI_ENDPROC
1346 CFI_DEFAULT_STACK
134711: incl %gs:pda_irqcount
1348 movq %rsp,%rbp
1349 CFI_DEF_CFA_REGISTER rbp
1350 cmovzq %gs:pda_irqstackptr,%rsp
1351 pushq %rbp # backlink for old unwinder
1352 call xen_evtchn_do_upcall
1353 popq %rsp
1354 CFI_DEF_CFA_REGISTER rsp
1355 decl %gs:pda_irqcount
1356 jmp error_exit
1357 CFI_ENDPROC
1358END(do_hypervisor_callback)
1359
1360/*
1361# Hypervisor uses this for application faults while it executes.
1362# We get here for two reasons:
1363# 1. Fault while reloading DS, ES, FS or GS
1364# 2. Fault while executing IRET
1365# Category 1 we do not need to fix up as Xen has already reloaded all segment
1366# registers that could be reloaded and zeroed the others.
1367# Category 2 we fix up by killing the current process. We cannot use the
1368# normal Linux return path in this case because if we use the IRET hypercall
1369# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1370# We distinguish between categories by comparing each saved segment register
1371# with its current contents: any discrepancy means we in category 1.
1372*/
1373ENTRY(xen_failsafe_callback)
1374 framesz = (RIP-0x30) /* workaround buggy gas */
1375 _frame framesz
1376 CFI_REL_OFFSET rcx, 0
1377 CFI_REL_OFFSET r11, 8
1378 movw %ds,%cx
1379 cmpw %cx,0x10(%rsp)
1380 CFI_REMEMBER_STATE
1381 jne 1f
1382 movw %es,%cx
1383 cmpw %cx,0x18(%rsp)
1384 jne 1f
1385 movw %fs,%cx
1386 cmpw %cx,0x20(%rsp)
1387 jne 1f
1388 movw %gs,%cx
1389 cmpw %cx,0x28(%rsp)
1390 jne 1f
1391 /* All segments match their saved values => Category 2 (Bad IRET). */
1392 movq (%rsp),%rcx
1393 CFI_RESTORE rcx
1394 movq 8(%rsp),%r11
1395 CFI_RESTORE r11
1396 addq $0x30,%rsp
1397 CFI_ADJUST_CFA_OFFSET -0x30
1398 pushq $0
1399 CFI_ADJUST_CFA_OFFSET 8
1400 pushq %r11
1401 CFI_ADJUST_CFA_OFFSET 8
1402 pushq %rcx
1403 CFI_ADJUST_CFA_OFFSET 8
1404 jmp general_protection
1405 CFI_RESTORE_STATE
14061: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
1407 movq (%rsp),%rcx
1408 CFI_RESTORE rcx
1409 movq 8(%rsp),%r11
1410 CFI_RESTORE r11
1411 addq $0x30,%rsp
1412 CFI_ADJUST_CFA_OFFSET -0x30
1413 pushq $0
1414 CFI_ADJUST_CFA_OFFSET 8
1415 SAVE_ALL
1416 jmp error_exit
1417 CFI_ENDPROC
1418END(xen_failsafe_callback)
1419
1420#endif /* CONFIG_XEN */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index c97819829146..1b318e903bf6 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
39static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; 39static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
40#endif 40#endif
41 41
42void __init x86_64_init_pda(void)
43{
44 _cpu_pda = __cpu_pda;
45 cpu_pda(0) = &_boot_cpu_pda;
46 pda_init(0);
47}
48
42static void __init zap_identity_mappings(void) 49static void __init zap_identity_mappings(void)
43{ 50{
44 pgd_t *pgd = pgd_offset_k(0UL); 51 pgd_t *pgd = pgd_offset_k(0UL);
@@ -102,9 +109,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
102 109
103 early_printk("Kernel alive\n"); 110 early_printk("Kernel alive\n");
104 111
105 _cpu_pda = __cpu_pda; 112 x86_64_init_pda();
106 cpu_pda(0) = &_boot_cpu_pda;
107 pda_init(0);
108 113
109 early_printk("Kernel really alive\n"); 114 early_printk("Kernel really alive\n");
110 115
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b07ac7b217cb..db3280afe886 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -407,6 +407,7 @@ ENTRY(phys_base)
407 /* This must match the first entry in level2_kernel_pgt */ 407 /* This must match the first entry in level2_kernel_pgt */
408 .quad 0x0000000000000000 408 .quad 0x0000000000000000
409 409
410#include "../../x86/xen/xen-head.S"
410 411
411 .section .bss, "aw", @nobits 412 .section .bss, "aw", @nobits
412 .align L1_CACHE_BYTES 413 .align L1_CACHE_BYTES
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 47a6f6f12478..1cf8c1fcc088 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -83,11 +83,8 @@ union irq_ctx {
83static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; 83static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
84static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; 84static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
85 85
86static char softirq_stack[NR_CPUS * THREAD_SIZE] 86static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
87 __attribute__((__section__(".bss.page_aligned"))); 87static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
88
89static char hardirq_stack[NR_CPUS * THREAD_SIZE]
90 __attribute__((__section__(".bss.page_aligned")));
91 88
92static void call_on_stack(void *func, void *stack) 89static void call_on_stack(void *func, void *stack)
93{ 90{
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e0f571d58c19..3edfd7af22ae 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -29,6 +29,7 @@
29#include <asm/desc.h> 29#include <asm/desc.h>
30#include <asm/setup.h> 30#include <asm/setup.h>
31#include <asm/arch_hooks.h> 31#include <asm/arch_hooks.h>
32#include <asm/pgtable.h>
32#include <asm/time.h> 33#include <asm/time.h>
33#include <asm/pgalloc.h> 34#include <asm/pgalloc.h>
34#include <asm/irq.h> 35#include <asm/irq.h>
@@ -123,6 +124,7 @@ static void *get_call_destination(u8 type)
123 .pv_irq_ops = pv_irq_ops, 124 .pv_irq_ops = pv_irq_ops,
124 .pv_apic_ops = pv_apic_ops, 125 .pv_apic_ops = pv_apic_ops,
125 .pv_mmu_ops = pv_mmu_ops, 126 .pv_mmu_ops = pv_mmu_ops,
127 .pv_lock_ops = pv_lock_ops,
126 }; 128 };
127 return *((void **)&tmpl + type); 129 return *((void **)&tmpl + type);
128} 130}
@@ -266,6 +268,17 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
266 return __get_cpu_var(paravirt_lazy_mode); 268 return __get_cpu_var(paravirt_lazy_mode);
267} 269}
268 270
271void __init paravirt_use_bytelocks(void)
272{
273#ifdef CONFIG_SMP
274 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
275 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
276 pv_lock_ops.spin_lock = __byte_spin_lock;
277 pv_lock_ops.spin_trylock = __byte_spin_trylock;
278 pv_lock_ops.spin_unlock = __byte_spin_unlock;
279#endif
280}
281
269struct pv_info pv_info = { 282struct pv_info pv_info = {
270 .name = "bare hardware", 283 .name = "bare hardware",
271 .paravirt_enabled = 0, 284 .paravirt_enabled = 0,
@@ -373,6 +386,9 @@ struct pv_mmu_ops pv_mmu_ops = {
373#ifndef CONFIG_X86_64 386#ifndef CONFIG_X86_64
374 .pagetable_setup_start = native_pagetable_setup_start, 387 .pagetable_setup_start = native_pagetable_setup_start,
375 .pagetable_setup_done = native_pagetable_setup_done, 388 .pagetable_setup_done = native_pagetable_setup_done,
389#else
390 .pagetable_setup_start = paravirt_nop,
391 .pagetable_setup_done = paravirt_nop,
376#endif 392#endif
377 393
378 .read_cr2 = native_read_cr2, 394 .read_cr2 = native_read_cr2,
@@ -446,6 +462,18 @@ struct pv_mmu_ops pv_mmu_ops = {
446 .set_fixmap = native_set_fixmap, 462 .set_fixmap = native_set_fixmap,
447}; 463};
448 464
465struct pv_lock_ops pv_lock_ops = {
466#ifdef CONFIG_SMP
467 .spin_is_locked = __ticket_spin_is_locked,
468 .spin_is_contended = __ticket_spin_is_contended,
469
470 .spin_lock = __ticket_spin_lock,
471 .spin_trylock = __ticket_spin_trylock,
472 .spin_unlock = __ticket_spin_unlock,
473#endif
474};
475EXPORT_SYMBOL_GPL(pv_lock_ops);
476
449EXPORT_SYMBOL_GPL(pv_time_ops); 477EXPORT_SYMBOL_GPL(pv_time_ops);
450EXPORT_SYMBOL (pv_cpu_ops); 478EXPORT_SYMBOL (pv_cpu_ops);
451EXPORT_SYMBOL (pv_mmu_ops); 479EXPORT_SYMBOL (pv_mmu_ops);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index a8e53626ac9a..e8a8e1b99817 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -537,8 +537,8 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
537struct task_struct * 537struct task_struct *
538__switch_to(struct task_struct *prev_p, struct task_struct *next_p) 538__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
539{ 539{
540 struct thread_struct *prev = &prev_p->thread, 540 struct thread_struct *prev = &prev_p->thread;
541 *next = &next_p->thread; 541 struct thread_struct *next = &next_p->thread;
542 int cpu = smp_processor_id(); 542 int cpu = smp_processor_id();
543 struct tss_struct *tss = &per_cpu(init_tss, cpu); 543 struct tss_struct *tss = &per_cpu(init_tss, cpu);
544 unsigned fsindex, gsindex; 544 unsigned fsindex, gsindex;
@@ -586,35 +586,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
586 586
587 /* 587 /*
588 * Switch FS and GS. 588 * Switch FS and GS.
589 *
590 * Segment register != 0 always requires a reload. Also
591 * reload when it has changed. When prev process used 64bit
592 * base always reload to avoid an information leak.
589 */ 593 */
590 { 594 if (unlikely(fsindex | next->fsindex | prev->fs)) {
591 /* segment register != 0 always requires a reload. 595 loadsegment(fs, next->fsindex);
592 also reload when it has changed. 596 /*
593 when prev process used 64bit base always reload 597 * Check if the user used a selector != 0; if yes
594 to avoid an information leak. */ 598 * clear 64bit base, since overloaded base is always
595 if (unlikely(fsindex | next->fsindex | prev->fs)) { 599 * mapped to the Null selector
596 loadsegment(fs, next->fsindex); 600 */
597 /* check if the user used a selector != 0 601 if (fsindex)
598 * if yes clear 64bit base, since overloaded base
599 * is always mapped to the Null selector
600 */
601 if (fsindex)
602 prev->fs = 0; 602 prev->fs = 0;
603 } 603 }
604 /* when next process has a 64bit base use it */ 604 /* when next process has a 64bit base use it */
605 if (next->fs) 605 if (next->fs)
606 wrmsrl(MSR_FS_BASE, next->fs); 606 wrmsrl(MSR_FS_BASE, next->fs);
607 prev->fsindex = fsindex; 607 prev->fsindex = fsindex;
608 608
609 if (unlikely(gsindex | next->gsindex | prev->gs)) { 609 if (unlikely(gsindex | next->gsindex | prev->gs)) {
610 load_gs_index(next->gsindex); 610 load_gs_index(next->gsindex);
611 if (gsindex) 611 if (gsindex)
612 prev->gs = 0; 612 prev->gs = 0;
613 }
614 if (next->gs)
615 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
616 prev->gsindex = gsindex;
617 } 613 }
614 if (next->gs)
615 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
616 prev->gsindex = gsindex;
618 617
619 /* Must be after DS reload */ 618 /* Must be after DS reload */
620 unlazy_fpu(prev_p); 619 unlazy_fpu(prev_p);
@@ -627,7 +626,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
627 write_pda(pcurrent, next_p); 626 write_pda(pcurrent, next_p);
628 627
629 write_pda(kernelstack, 628 write_pda(kernelstack,
630 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); 629 (unsigned long)task_stack_page(next_p) +
630 THREAD_SIZE - PDA_STACKOFFSET);
631#ifdef CONFIG_CC_STACKPROTECTOR 631#ifdef CONFIG_CC_STACKPROTECTOR
632 write_pda(stack_canary, next_p->stack_canary); 632 write_pda(stack_canary, next_p->stack_canary);
633 /* 633 /*
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 531b55b8e81a..c9010f82141d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -824,7 +824,10 @@ void __init setup_arch(char **cmdline_p)
824 vmi_init(); 824 vmi_init();
825#endif 825#endif
826 826
827 paravirt_pagetable_setup_start(swapper_pg_dir);
827 paging_init(); 828 paging_init();
829 paravirt_pagetable_setup_done(swapper_pg_dir);
830 paravirt_post_allocator_init();
828 831
829#ifdef CONFIG_X86_64 832#ifdef CONFIG_X86_64
830 map_vsyscall(); 833 map_vsyscall();
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 687376ab07e8..1deb3b624a79 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -768,7 +768,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
768 * 768 *
769 * Must be called after the _cpu_pda pointer table is initialized. 769 * Must be called after the _cpu_pda pointer table is initialized.
770 */ 770 */
771static int __cpuinit get_local_pda(int cpu) 771int __cpuinit get_local_pda(int cpu)
772{ 772{
773 struct x8664_pda *oldpda, *newpda; 773 struct x8664_pda *oldpda, *newpda;
774 unsigned long size = sizeof(struct x8664_pda); 774 unsigned long size = sizeof(struct x8664_pda);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9689a5138e64..7113acd8ac45 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -868,8 +868,6 @@ void __init paging_init(void)
868 */ 868 */
869 sparse_init(); 869 sparse_init();
870 zone_sizes_init(); 870 zone_sizes_init();
871
872 paravirt_post_allocator_init();
873} 871}
874 872
875/* 873/*
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index b7ad9f89d21f..4d6ef0a336d6 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -62,7 +62,7 @@ $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
62# Build multiple 32-bit vDSO images to choose from at boot time. 62# Build multiple 32-bit vDSO images to choose from at boot time.
63# 63#
64obj-$(VDSO32-y) += vdso32-syms.lds 64obj-$(VDSO32-y) += vdso32-syms.lds
65vdso32.so-$(CONFIG_X86_32) += int80 65vdso32.so-$(VDSO32-y) += int80
66vdso32.so-$(CONFIG_COMPAT) += syscall 66vdso32.so-$(CONFIG_COMPAT) += syscall
67vdso32.so-$(VDSO32-y) += sysenter 67vdso32.so-$(VDSO32-y) += sysenter
68 68
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 0bce5429a515..513f330c5832 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -193,17 +193,12 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
193 } 193 }
194} 194}
195 195
196/*
197 * These symbols are defined by vdso32.S to mark the bounds
198 * of the ELF DSO images included therein.
199 */
200extern const char vdso32_default_start, vdso32_default_end;
201extern const char vdso32_sysenter_start, vdso32_sysenter_end;
202static struct page *vdso32_pages[1]; 196static struct page *vdso32_pages[1];
203 197
204#ifdef CONFIG_X86_64 198#ifdef CONFIG_X86_64
205 199
206#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) 200#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
201#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
207 202
208/* May not be __init: called during resume */ 203/* May not be __init: called during resume */
209void syscall32_cpu_init(void) 204void syscall32_cpu_init(void)
@@ -226,6 +221,7 @@ static inline void map_compat_vdso(int map)
226#else /* CONFIG_X86_32 */ 221#else /* CONFIG_X86_32 */
227 222
228#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) 223#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
224#define vdso32_syscall() (0)
229 225
230void enable_sep_cpu(void) 226void enable_sep_cpu(void)
231{ 227{
@@ -296,12 +292,15 @@ int __init sysenter_setup(void)
296 gate_vma_init(); 292 gate_vma_init();
297#endif 293#endif
298 294
299 if (!vdso32_sysenter()) { 295 if (vdso32_syscall()) {
300 vsyscall = &vdso32_default_start; 296 vsyscall = &vdso32_syscall_start;
301 vsyscall_len = &vdso32_default_end - &vdso32_default_start; 297 vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
302 } else { 298 } else if (vdso32_sysenter()){
303 vsyscall = &vdso32_sysenter_start; 299 vsyscall = &vdso32_sysenter_start;
304 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start; 300 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
301 } else {
302 vsyscall = &vdso32_int80_start;
303 vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
305 } 304 }
306 305
307 memcpy(syscall_page, vsyscall, vsyscall_len); 306 memcpy(syscall_page, vsyscall, vsyscall_len);
diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S
index 1e36f72cab86..2ce5f82c333b 100644
--- a/arch/x86/vdso/vdso32.S
+++ b/arch/x86/vdso/vdso32.S
@@ -2,14 +2,17 @@
2 2
3__INITDATA 3__INITDATA
4 4
5 .globl vdso32_default_start, vdso32_default_end 5 .globl vdso32_int80_start, vdso32_int80_end
6vdso32_default_start: 6vdso32_int80_start:
7#ifdef CONFIG_X86_32
8 .incbin "arch/x86/vdso/vdso32-int80.so" 7 .incbin "arch/x86/vdso/vdso32-int80.so"
9#else 8vdso32_int80_end:
9
10 .globl vdso32_syscall_start, vdso32_syscall_end
11vdso32_syscall_start:
12#ifdef CONFIG_COMPAT
10 .incbin "arch/x86/vdso/vdso32-syscall.so" 13 .incbin "arch/x86/vdso/vdso32-syscall.so"
11#endif 14#endif
12vdso32_default_end: 15vdso32_syscall_end:
13 16
14 .globl vdso32_sysenter_start, vdso32_sysenter_end 17 .globl vdso32_sysenter_start, vdso32_sysenter_end
15vdso32_sysenter_start: 18vdso32_sysenter_start:
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index c2cc99580871..20b49729bed5 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,8 +6,8 @@ config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 select PARAVIRT_CLOCK 8 select PARAVIRT_CLOCK
9 depends on X86_32 9 depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
10 depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) 10 depends on X86_CMPXCHG && X86_TSC
11 help 11 help
12 This is the Linux Xen port. Enabling this will allow the 12 This is the Linux Xen port. Enabling this will allow the
13 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
@@ -15,10 +15,11 @@ config XEN
15 15
16config XEN_MAX_DOMAIN_MEMORY 16config XEN_MAX_DOMAIN_MEMORY
17 int "Maximum allowed size of a domain in gigabytes" 17 int "Maximum allowed size of a domain in gigabytes"
18 default 8 18 default 8 if X86_32
19 default 32 if X86_64
19 depends on XEN 20 depends on XEN
20 help 21 help
21 The pseudo-physical to machine address array is sized 22 The pseudo-physical to machine address array is sized
22 according to the maximum possible memory size of a Xen 23 according to the maximum possible memory size of a Xen
23 domain. This array uses 1 page per gigabyte, so there's no 24 domain. This array uses 1 page per gigabyte, so there's no
24 need to be too stingy here. \ No newline at end of file 25 need to be too stingy here.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 2ba2d1649131..59c1e539aed2 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
1obj-y := enlighten.o setup.o multicalls.o mmu.o \ 1obj-y := enlighten.o setup.o multicalls.o mmu.o \
2 time.o xen-asm.o grant-table.o suspend.o 2 time.o xen-asm_$(BITS).o grant-table.o suspend.o
3 3
4obj-$(CONFIG_SMP) += smp.o 4obj-$(CONFIG_SMP) += smp.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bb508456ef52..3da6acb7eafc 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -33,6 +33,7 @@
33#include <xen/interface/sched.h> 33#include <xen/interface/sched.h>
34#include <xen/features.h> 34#include <xen/features.h>
35#include <xen/page.h> 35#include <xen/page.h>
36#include <xen/hvc-console.h>
36 37
37#include <asm/paravirt.h> 38#include <asm/paravirt.h>
38#include <asm/page.h> 39#include <asm/page.h>
@@ -40,12 +41,12 @@
40#include <asm/xen/hypervisor.h> 41#include <asm/xen/hypervisor.h>
41#include <asm/fixmap.h> 42#include <asm/fixmap.h>
42#include <asm/processor.h> 43#include <asm/processor.h>
44#include <asm/msr-index.h>
43#include <asm/setup.h> 45#include <asm/setup.h>
44#include <asm/desc.h> 46#include <asm/desc.h>
45#include <asm/pgtable.h> 47#include <asm/pgtable.h>
46#include <asm/tlbflush.h> 48#include <asm/tlbflush.h>
47#include <asm/reboot.h> 49#include <asm/reboot.h>
48#include <asm/pgalloc.h>
49 50
50#include "xen-ops.h" 51#include "xen-ops.h"
51#include "mmu.h" 52#include "mmu.h"
@@ -57,6 +58,18 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
57DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); 58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
58 59
59/* 60/*
61 * Identity map, in addition to plain kernel map. This needs to be
62 * large enough to allocate page table pages to allocate the rest.
63 * Each page can map 2MB.
64 */
65static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
66
67#ifdef CONFIG_X86_64
68/* l3 pud for userspace vsyscall mapping */
69static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
70#endif /* CONFIG_X86_64 */
71
72/*
60 * Note about cr3 (pagetable base) values: 73 * Note about cr3 (pagetable base) values:
61 * 74 *
62 * xen_cr3 contains the current logical cr3 value; it contains the 75 * xen_cr3 contains the current logical cr3 value; it contains the
@@ -363,14 +376,6 @@ static void load_TLS_descriptor(struct thread_struct *t,
363 376
364static void xen_load_tls(struct thread_struct *t, unsigned int cpu) 377static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
365{ 378{
366 xen_mc_batch();
367
368 load_TLS_descriptor(t, cpu, 0);
369 load_TLS_descriptor(t, cpu, 1);
370 load_TLS_descriptor(t, cpu, 2);
371
372 xen_mc_issue(PARAVIRT_LAZY_CPU);
373
374 /* 379 /*
375 * XXX sleazy hack: If we're being called in a lazy-cpu zone, 380 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
376 * it means we're in a context switch, and %gs has just been 381 * it means we're in a context switch, and %gs has just been
@@ -379,10 +384,39 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
379 * Either way, it has been saved, and the new value will get 384 * Either way, it has been saved, and the new value will get
380 * loaded properly. This will go away as soon as Xen has been 385 * loaded properly. This will go away as soon as Xen has been
381 * modified to not save/restore %gs for normal hypercalls. 386 * modified to not save/restore %gs for normal hypercalls.
387 *
388 * On x86_64, this hack is not used for %gs, because gs points
389 * to KERNEL_GS_BASE (and uses it for PDA references), so we
390 * must not zero %gs on x86_64
391 *
392 * For x86_64, we need to zero %fs, otherwise we may get an
393 * exception between the new %fs descriptor being loaded and
394 * %fs being effectively cleared at __switch_to().
382 */ 395 */
383 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) 396 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
397#ifdef CONFIG_X86_32
384 loadsegment(gs, 0); 398 loadsegment(gs, 0);
399#else
400 loadsegment(fs, 0);
401#endif
402 }
403
404 xen_mc_batch();
405
406 load_TLS_descriptor(t, cpu, 0);
407 load_TLS_descriptor(t, cpu, 1);
408 load_TLS_descriptor(t, cpu, 2);
409
410 xen_mc_issue(PARAVIRT_LAZY_CPU);
411}
412
413#ifdef CONFIG_X86_64
414static void xen_load_gs_index(unsigned int idx)
415{
416 if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
417 BUG();
385} 418}
419#endif
386 420
387static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, 421static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
388 const void *ptr) 422 const void *ptr)
@@ -400,23 +434,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
400 preempt_enable(); 434 preempt_enable();
401} 435}
402 436
403static int cvt_gate_to_trap(int vector, u32 low, u32 high, 437static int cvt_gate_to_trap(int vector, const gate_desc *val,
404 struct trap_info *info) 438 struct trap_info *info)
405{ 439{
406 u8 type, dpl; 440 if (val->type != 0xf && val->type != 0xe)
407
408 type = (high >> 8) & 0x1f;
409 dpl = (high >> 13) & 3;
410
411 if (type != 0xf && type != 0xe)
412 return 0; 441 return 0;
413 442
414 info->vector = vector; 443 info->vector = vector;
415 info->address = (high & 0xffff0000) | (low & 0x0000ffff); 444 info->address = gate_offset(*val);
416 info->cs = low >> 16; 445 info->cs = gate_segment(*val);
417 info->flags = dpl; 446 info->flags = val->dpl;
418 /* interrupt gates clear IF */ 447 /* interrupt gates clear IF */
419 if (type == 0xe) 448 if (val->type == 0xe)
420 info->flags |= 4; 449 info->flags |= 4;
421 450
422 return 1; 451 return 1;
@@ -443,11 +472,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
443 472
444 if (p >= start && (p + 8) <= end) { 473 if (p >= start && (p + 8) <= end) {
445 struct trap_info info[2]; 474 struct trap_info info[2];
446 u32 *desc = (u32 *)g;
447 475
448 info[1].address = 0; 476 info[1].address = 0;
449 477
450 if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0])) 478 if (cvt_gate_to_trap(entrynum, g, &info[0]))
451 if (HYPERVISOR_set_trap_table(info)) 479 if (HYPERVISOR_set_trap_table(info))
452 BUG(); 480 BUG();
453 } 481 }
@@ -460,13 +488,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
460{ 488{
461 unsigned in, out, count; 489 unsigned in, out, count;
462 490
463 count = (desc->size+1) / 8; 491 count = (desc->size+1) / sizeof(gate_desc);
464 BUG_ON(count > 256); 492 BUG_ON(count > 256);
465 493
466 for (in = out = 0; in < count; in++) { 494 for (in = out = 0; in < count; in++) {
467 const u32 *entry = (u32 *)(desc->address + in * 8); 495 gate_desc *entry = (gate_desc*)(desc->address) + in;
468 496
469 if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) 497 if (cvt_gate_to_trap(in, entry, &traps[out]))
470 out++; 498 out++;
471 } 499 }
472 traps[out].address = 0; 500 traps[out].address = 0;
@@ -695,33 +723,89 @@ static void set_current_cr3(void *v)
695 x86_write_percpu(xen_current_cr3, (unsigned long)v); 723 x86_write_percpu(xen_current_cr3, (unsigned long)v);
696} 724}
697 725
698static void xen_write_cr3(unsigned long cr3) 726static void __xen_write_cr3(bool kernel, unsigned long cr3)
699{ 727{
700 struct mmuext_op *op; 728 struct mmuext_op *op;
701 struct multicall_space mcs; 729 struct multicall_space mcs;
702 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); 730 unsigned long mfn;
703 731
704 BUG_ON(preemptible()); 732 if (cr3)
733 mfn = pfn_to_mfn(PFN_DOWN(cr3));
734 else
735 mfn = 0;
705 736
706 mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ 737 WARN_ON(mfn == 0 && kernel);
707 738
708 /* Update while interrupts are disabled, so its atomic with 739 mcs = __xen_mc_entry(sizeof(*op));
709 respect to ipis */
710 x86_write_percpu(xen_cr3, cr3);
711 740
712 op = mcs.args; 741 op = mcs.args;
713 op->cmd = MMUEXT_NEW_BASEPTR; 742 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
714 op->arg1.mfn = mfn; 743 op->arg1.mfn = mfn;
715 744
716 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 745 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
717 746
718 /* Update xen_update_cr3 once the batch has actually 747 if (kernel) {
719 been submitted. */ 748 x86_write_percpu(xen_cr3, cr3);
720 xen_mc_callback(set_current_cr3, (void *)cr3); 749
750 /* Update xen_current_cr3 once the batch has actually
751 been submitted. */
752 xen_mc_callback(set_current_cr3, (void *)cr3);
753 }
754}
755
756static void xen_write_cr3(unsigned long cr3)
757{
758 BUG_ON(preemptible());
759
760 xen_mc_batch(); /* disables interrupts */
761
762 /* Update while interrupts are disabled, so it's atomic with
763 respect to ipis */
764 x86_write_percpu(xen_cr3, cr3);
765
766 __xen_write_cr3(true, cr3);
767
768#ifdef CONFIG_X86_64
769 {
770 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
771 if (user_pgd)
772 __xen_write_cr3(false, __pa(user_pgd));
773 else
774 __xen_write_cr3(false, 0);
775 }
776#endif
721 777
722 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 778 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
723} 779}
724 780
781static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
782{
783 int ret;
784
785 ret = 0;
786
787 switch(msr) {
788#ifdef CONFIG_X86_64
789 unsigned which;
790 u64 base;
791
792 case MSR_FS_BASE: which = SEGBASE_FS; goto set;
793 case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
794 case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
795
796 set:
797 base = ((u64)high << 32) | low;
798 if (HYPERVISOR_set_segment_base(which, base) != 0)
799 ret = -EFAULT;
800 break;
801#endif
802 default:
803 ret = native_write_msr_safe(msr, low, high);
804 }
805
806 return ret;
807}
808
725/* Early in boot, while setting up the initial pagetable, assume 809/* Early in boot, while setting up the initial pagetable, assume
726 everything is pinned. */ 810 everything is pinned. */
727static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) 811static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
@@ -778,6 +862,48 @@ static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
778 xen_alloc_ptpage(mm, pfn, PT_PMD); 862 xen_alloc_ptpage(mm, pfn, PT_PMD);
779} 863}
780 864
865static int xen_pgd_alloc(struct mm_struct *mm)
866{
867 pgd_t *pgd = mm->pgd;
868 int ret = 0;
869
870 BUG_ON(PagePinned(virt_to_page(pgd)));
871
872#ifdef CONFIG_X86_64
873 {
874 struct page *page = virt_to_page(pgd);
875 pgd_t *user_pgd;
876
877 BUG_ON(page->private != 0);
878
879 ret = -ENOMEM;
880
881 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
882 page->private = (unsigned long)user_pgd;
883
884 if (user_pgd != NULL) {
885 user_pgd[pgd_index(VSYSCALL_START)] =
886 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
887 ret = 0;
888 }
889
890 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
891 }
892#endif
893
894 return ret;
895}
896
897static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
898{
899#ifdef CONFIG_X86_64
900 pgd_t *user_pgd = xen_get_user_pgd(pgd);
901
902 if (user_pgd)
903 free_page((unsigned long)user_pgd);
904#endif
905}
906
781/* This should never happen until we're OK to use struct page */ 907/* This should never happen until we're OK to use struct page */
782static void xen_release_ptpage(u32 pfn, unsigned level) 908static void xen_release_ptpage(u32 pfn, unsigned level)
783{ 909{
@@ -803,6 +929,18 @@ static void xen_release_pmd(u32 pfn)
803 xen_release_ptpage(pfn, PT_PMD); 929 xen_release_ptpage(pfn, PT_PMD);
804} 930}
805 931
932#if PAGETABLE_LEVELS == 4
933static void xen_alloc_pud(struct mm_struct *mm, u32 pfn)
934{
935 xen_alloc_ptpage(mm, pfn, PT_PUD);
936}
937
938static void xen_release_pud(u32 pfn)
939{
940 xen_release_ptpage(pfn, PT_PUD);
941}
942#endif
943
806#ifdef CONFIG_HIGHPTE 944#ifdef CONFIG_HIGHPTE
807static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) 945static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
808{ 946{
@@ -841,68 +979,16 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
841 979
842static __init void xen_pagetable_setup_start(pgd_t *base) 980static __init void xen_pagetable_setup_start(pgd_t *base)
843{ 981{
844 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
845 int i;
846
847 /* special set_pte for pagetable initialization */
848 pv_mmu_ops.set_pte = xen_set_pte_init;
849
850 init_mm.pgd = base;
851 /*
852 * copy top-level of Xen-supplied pagetable into place. This
853 * is a stand-in while we copy the pmd pages.
854 */
855 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
856
857 /*
858 * For PAE, need to allocate new pmds, rather than
859 * share Xen's, since Xen doesn't like pmd's being
860 * shared between address spaces.
861 */
862 for (i = 0; i < PTRS_PER_PGD; i++) {
863 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
864 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
865
866 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
867 PAGE_SIZE);
868
869 make_lowmem_page_readonly(pmd);
870
871 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
872 } else
873 pgd_clear(&base[i]);
874 }
875
876 /* make sure zero_page is mapped RO so we can use it in pagetables */
877 make_lowmem_page_readonly(empty_zero_page);
878 make_lowmem_page_readonly(base);
879 /*
880 * Switch to new pagetable. This is done before
881 * pagetable_init has done anything so that the new pages
882 * added to the table can be prepared properly for Xen.
883 */
884 xen_write_cr3(__pa(base));
885
886 /* Unpin initial Xen pagetable */
887 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
888 PFN_DOWN(__pa(xen_start_info->pt_base)));
889} 982}
890 983
891void xen_setup_shared_info(void) 984void xen_setup_shared_info(void)
892{ 985{
893 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 986 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
894 unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); 987 set_fixmap(FIX_PARAVIRT_BOOTMAP,
895 988 xen_start_info->shared_info);
896 /* 989
897 * Create a mapping for the shared info page. 990 HYPERVISOR_shared_info =
898 * Should be set_fixmap(), but shared_info is a machine 991 (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
899 * address with no corresponding pseudo-phys address.
900 */
901 set_pte_mfn(addr,
902 PFN_DOWN(xen_start_info->shared_info),
903 PAGE_KERNEL);
904
905 HYPERVISOR_shared_info = (struct shared_info *)addr;
906 } else 992 } else
907 HYPERVISOR_shared_info = 993 HYPERVISOR_shared_info =
908 (struct shared_info *)__va(xen_start_info->shared_info); 994 (struct shared_info *)__va(xen_start_info->shared_info);
@@ -917,26 +1003,32 @@ void xen_setup_shared_info(void)
917 1003
918static __init void xen_pagetable_setup_done(pgd_t *base) 1004static __init void xen_pagetable_setup_done(pgd_t *base)
919{ 1005{
920 /* This will work as long as patching hasn't happened yet
921 (which it hasn't) */
922 pv_mmu_ops.alloc_pte = xen_alloc_pte;
923 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
924 pv_mmu_ops.release_pte = xen_release_pte;
925 pv_mmu_ops.release_pmd = xen_release_pmd;
926 pv_mmu_ops.set_pte = xen_set_pte;
927
928 xen_setup_shared_info(); 1006 xen_setup_shared_info();
929
930 /* Actually pin the pagetable down, but we can't set PG_pinned
931 yet because the page structures don't exist yet. */
932 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
933} 1007}
934 1008
935static __init void xen_post_allocator_init(void) 1009static __init void xen_post_allocator_init(void)
936{ 1010{
1011 pv_mmu_ops.set_pte = xen_set_pte;
937 pv_mmu_ops.set_pmd = xen_set_pmd; 1012 pv_mmu_ops.set_pmd = xen_set_pmd;
938 pv_mmu_ops.set_pud = xen_set_pud; 1013 pv_mmu_ops.set_pud = xen_set_pud;
1014#if PAGETABLE_LEVELS == 4
1015 pv_mmu_ops.set_pgd = xen_set_pgd;
1016#endif
1017
1018 /* This will work as long as patching hasn't happened yet
1019 (which it hasn't) */
1020 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1021 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1022 pv_mmu_ops.release_pte = xen_release_pte;
1023 pv_mmu_ops.release_pmd = xen_release_pmd;
1024#if PAGETABLE_LEVELS == 4
1025 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1026 pv_mmu_ops.release_pud = xen_release_pud;
1027#endif
939 1028
1029#ifdef CONFIG_X86_64
1030 SetPagePinned(virt_to_page(level3_user_vsyscall));
1031#endif
940 xen_mark_init_mm_pinned(); 1032 xen_mark_init_mm_pinned();
941} 1033}
942 1034
@@ -950,6 +1042,7 @@ void xen_setup_vcpu_info_placement(void)
950 1042
951 /* xen_vcpu_setup managed to place the vcpu_info within the 1043 /* xen_vcpu_setup managed to place the vcpu_info within the
952 percpu area for all cpus, so make use of it */ 1044 percpu area for all cpus, so make use of it */
1045#ifdef CONFIG_X86_32
953 if (have_vcpu_info_placement) { 1046 if (have_vcpu_info_placement) {
954 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 1047 printk(KERN_INFO "Xen: using vcpu_info placement\n");
955 1048
@@ -959,6 +1052,7 @@ void xen_setup_vcpu_info_placement(void)
959 pv_irq_ops.irq_enable = xen_irq_enable_direct; 1052 pv_irq_ops.irq_enable = xen_irq_enable_direct;
960 pv_mmu_ops.read_cr2 = xen_read_cr2_direct; 1053 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
961 } 1054 }
1055#endif
962} 1056}
963 1057
964static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, 1058static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -979,10 +1073,12 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
979 goto patch_site 1073 goto patch_site
980 1074
981 switch (type) { 1075 switch (type) {
1076#ifdef CONFIG_X86_32
982 SITE(pv_irq_ops, irq_enable); 1077 SITE(pv_irq_ops, irq_enable);
983 SITE(pv_irq_ops, irq_disable); 1078 SITE(pv_irq_ops, irq_disable);
984 SITE(pv_irq_ops, save_fl); 1079 SITE(pv_irq_ops, save_fl);
985 SITE(pv_irq_ops, restore_fl); 1080 SITE(pv_irq_ops, restore_fl);
1081#endif /* CONFIG_X86_32 */
986#undef SITE 1082#undef SITE
987 1083
988 patch_site: 1084 patch_site:
@@ -1025,8 +1121,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1025#ifdef CONFIG_X86_F00F_BUG 1121#ifdef CONFIG_X86_F00F_BUG
1026 case FIX_F00F_IDT: 1122 case FIX_F00F_IDT:
1027#endif 1123#endif
1124#ifdef CONFIG_X86_32
1028 case FIX_WP_TEST: 1125 case FIX_WP_TEST:
1029 case FIX_VDSO: 1126 case FIX_VDSO:
1127# ifdef CONFIG_HIGHMEM
1128 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1129# endif
1130#else
1131 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1132#endif
1030#ifdef CONFIG_X86_LOCAL_APIC 1133#ifdef CONFIG_X86_LOCAL_APIC
1031 case FIX_APIC_BASE: /* maps dummy local APIC */ 1134 case FIX_APIC_BASE: /* maps dummy local APIC */
1032#endif 1135#endif
@@ -1039,6 +1142,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1039 } 1142 }
1040 1143
1041 __native_set_fixmap(idx, pte); 1144 __native_set_fixmap(idx, pte);
1145
1146#ifdef CONFIG_X86_64
1147 /* Replicate changes to map the vsyscall page into the user
1148 pagetable vsyscall mapping. */
1149 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1150 unsigned long vaddr = __fix_to_virt(idx);
1151 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1152 }
1153#endif
1042} 1154}
1043 1155
1044static const struct pv_info xen_info __initdata = { 1156static const struct pv_info xen_info __initdata = {
@@ -1084,18 +1196,25 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1084 .wbinvd = native_wbinvd, 1196 .wbinvd = native_wbinvd,
1085 1197
1086 .read_msr = native_read_msr_safe, 1198 .read_msr = native_read_msr_safe,
1087 .write_msr = native_write_msr_safe, 1199 .write_msr = xen_write_msr_safe,
1088 .read_tsc = native_read_tsc, 1200 .read_tsc = native_read_tsc,
1089 .read_pmc = native_read_pmc, 1201 .read_pmc = native_read_pmc,
1090 1202
1091 .iret = xen_iret, 1203 .iret = xen_iret,
1092 .irq_enable_sysexit = xen_sysexit, 1204 .irq_enable_sysexit = xen_sysexit,
1205#ifdef CONFIG_X86_64
1206 .usergs_sysret32 = xen_sysret32,
1207 .usergs_sysret64 = xen_sysret64,
1208#endif
1093 1209
1094 .load_tr_desc = paravirt_nop, 1210 .load_tr_desc = paravirt_nop,
1095 .set_ldt = xen_set_ldt, 1211 .set_ldt = xen_set_ldt,
1096 .load_gdt = xen_load_gdt, 1212 .load_gdt = xen_load_gdt,
1097 .load_idt = xen_load_idt, 1213 .load_idt = xen_load_idt,
1098 .load_tls = xen_load_tls, 1214 .load_tls = xen_load_tls,
1215#ifdef CONFIG_X86_64
1216 .load_gs_index = xen_load_gs_index,
1217#endif
1099 1218
1100 .store_gdt = native_store_gdt, 1219 .store_gdt = native_store_gdt,
1101 .store_idt = native_store_idt, 1220 .store_idt = native_store_idt,
@@ -1109,14 +1228,34 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1109 .set_iopl_mask = xen_set_iopl_mask, 1228 .set_iopl_mask = xen_set_iopl_mask,
1110 .io_delay = xen_io_delay, 1229 .io_delay = xen_io_delay,
1111 1230
1231 /* Xen takes care of %gs when switching to usermode for us */
1232 .swapgs = paravirt_nop,
1233
1112 .lazy_mode = { 1234 .lazy_mode = {
1113 .enter = paravirt_enter_lazy_cpu, 1235 .enter = paravirt_enter_lazy_cpu,
1114 .leave = xen_leave_lazy, 1236 .leave = xen_leave_lazy,
1115 }, 1237 },
1116}; 1238};
1117 1239
1240static void __init __xen_init_IRQ(void)
1241{
1242#ifdef CONFIG_X86_64
1243 int i;
1244
1245 /* Create identity vector->irq map */
1246 for(i = 0; i < NR_VECTORS; i++) {
1247 int cpu;
1248
1249 for_each_possible_cpu(cpu)
1250 per_cpu(vector_irq, cpu)[i] = i;
1251 }
1252#endif /* CONFIG_X86_64 */
1253
1254 xen_init_IRQ();
1255}
1256
1118static const struct pv_irq_ops xen_irq_ops __initdata = { 1257static const struct pv_irq_ops xen_irq_ops __initdata = {
1119 .init_IRQ = xen_init_IRQ, 1258 .init_IRQ = __xen_init_IRQ,
1120 .save_fl = xen_save_fl, 1259 .save_fl = xen_save_fl,
1121 .restore_fl = xen_restore_fl, 1260 .restore_fl = xen_restore_fl,
1122 .irq_disable = xen_irq_disable, 1261 .irq_disable = xen_irq_disable,
@@ -1124,7 +1263,7 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
1124 .safe_halt = xen_safe_halt, 1263 .safe_halt = xen_safe_halt,
1125 .halt = xen_halt, 1264 .halt = xen_halt,
1126#ifdef CONFIG_X86_64 1265#ifdef CONFIG_X86_64
1127 .adjust_exception_frame = paravirt_nop, 1266 .adjust_exception_frame = xen_adjust_exception_frame,
1128#endif 1267#endif
1129}; 1268};
1130 1269
@@ -1157,8 +1296,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1157 .pte_update = paravirt_nop, 1296 .pte_update = paravirt_nop,
1158 .pte_update_defer = paravirt_nop, 1297 .pte_update_defer = paravirt_nop,
1159 1298
1160 .pgd_alloc = __paravirt_pgd_alloc, 1299 .pgd_alloc = xen_pgd_alloc,
1161 .pgd_free = paravirt_nop, 1300 .pgd_free = xen_pgd_free,
1162 1301
1163 .alloc_pte = xen_alloc_pte_init, 1302 .alloc_pte = xen_alloc_pte_init,
1164 .release_pte = xen_release_pte_init, 1303 .release_pte = xen_release_pte_init,
@@ -1170,7 +1309,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1170 .kmap_atomic_pte = xen_kmap_atomic_pte, 1309 .kmap_atomic_pte = xen_kmap_atomic_pte,
1171#endif 1310#endif
1172 1311
1173 .set_pte = NULL, /* see xen_pagetable_setup_* */ 1312#ifdef CONFIG_X86_64
1313 .set_pte = xen_set_pte,
1314#else
1315 .set_pte = xen_set_pte_init,
1316#endif
1174 .set_pte_at = xen_set_pte_at, 1317 .set_pte_at = xen_set_pte_at,
1175 .set_pmd = xen_set_pmd_hyper, 1318 .set_pmd = xen_set_pmd_hyper,
1176 1319
@@ -1184,15 +1327,26 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1184 .make_pte = xen_make_pte, 1327 .make_pte = xen_make_pte,
1185 .make_pgd = xen_make_pgd, 1328 .make_pgd = xen_make_pgd,
1186 1329
1330#ifdef CONFIG_X86_PAE
1187 .set_pte_atomic = xen_set_pte_atomic, 1331 .set_pte_atomic = xen_set_pte_atomic,
1188 .set_pte_present = xen_set_pte_at, 1332 .set_pte_present = xen_set_pte_at,
1189 .set_pud = xen_set_pud_hyper,
1190 .pte_clear = xen_pte_clear, 1333 .pte_clear = xen_pte_clear,
1191 .pmd_clear = xen_pmd_clear, 1334 .pmd_clear = xen_pmd_clear,
1335#endif /* CONFIG_X86_PAE */
1336 .set_pud = xen_set_pud_hyper,
1192 1337
1193 .make_pmd = xen_make_pmd, 1338 .make_pmd = xen_make_pmd,
1194 .pmd_val = xen_pmd_val, 1339 .pmd_val = xen_pmd_val,
1195 1340
1341#if PAGETABLE_LEVELS == 4
1342 .pud_val = xen_pud_val,
1343 .make_pud = xen_make_pud,
1344 .set_pgd = xen_set_pgd_hyper,
1345
1346 .alloc_pud = xen_alloc_pte_init,
1347 .release_pud = xen_release_pte_init,
1348#endif /* PAGETABLE_LEVELS == 4 */
1349
1196 .activate_mm = xen_activate_mm, 1350 .activate_mm = xen_activate_mm,
1197 .dup_mmap = xen_dup_mmap, 1351 .dup_mmap = xen_dup_mmap,
1198 .exit_mmap = xen_exit_mmap, 1352 .exit_mmap = xen_exit_mmap,
@@ -1205,21 +1359,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1205 .set_fixmap = xen_set_fixmap, 1359 .set_fixmap = xen_set_fixmap,
1206}; 1360};
1207 1361
1208#ifdef CONFIG_SMP
1209static const struct smp_ops xen_smp_ops __initdata = {
1210 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
1211 .smp_prepare_cpus = xen_smp_prepare_cpus,
1212 .cpu_up = xen_cpu_up,
1213 .smp_cpus_done = xen_smp_cpus_done,
1214
1215 .smp_send_stop = xen_smp_send_stop,
1216 .smp_send_reschedule = xen_smp_send_reschedule,
1217
1218 .send_call_func_ipi = xen_smp_send_call_function_ipi,
1219 .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
1220};
1221#endif /* CONFIG_SMP */
1222
1223static void xen_reboot(int reason) 1362static void xen_reboot(int reason)
1224{ 1363{
1225 struct sched_shutdown r = { .reason = reason }; 1364 struct sched_shutdown r = { .reason = reason };
@@ -1264,6 +1403,7 @@ static const struct machine_ops __initdata xen_machine_ops = {
1264 1403
1265static void __init xen_reserve_top(void) 1404static void __init xen_reserve_top(void)
1266{ 1405{
1406#ifdef CONFIG_X86_32
1267 unsigned long top = HYPERVISOR_VIRT_START; 1407 unsigned long top = HYPERVISOR_VIRT_START;
1268 struct xen_platform_parameters pp; 1408 struct xen_platform_parameters pp;
1269 1409
@@ -1271,7 +1411,247 @@ static void __init xen_reserve_top(void)
1271 top = pp.virt_start; 1411 top = pp.virt_start;
1272 1412
1273 reserve_top_address(-top + 2 * PAGE_SIZE); 1413 reserve_top_address(-top + 2 * PAGE_SIZE);
1414#endif /* CONFIG_X86_32 */
1415}
1416
1417/*
1418 * Like __va(), but returns address in the kernel mapping (which is
1419 * all we have until the physical memory mapping has been set up.
1420 */
1421static void *__ka(phys_addr_t paddr)
1422{
1423#ifdef CONFIG_X86_64
1424 return (void *)(paddr + __START_KERNEL_map);
1425#else
1426 return __va(paddr);
1427#endif
1428}
1429
1430/* Convert a machine address to physical address */
1431static unsigned long m2p(phys_addr_t maddr)
1432{
1433 phys_addr_t paddr;
1434
1435 maddr &= PTE_MASK;
1436 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1437
1438 return paddr;
1439}
1440
1441/* Convert a machine address to kernel virtual */
1442static void *m2v(phys_addr_t maddr)
1443{
1444 return __ka(m2p(maddr));
1445}
1446
1447#ifdef CONFIG_X86_64
1448static void walk(pgd_t *pgd, unsigned long addr)
1449{
1450 unsigned l4idx = pgd_index(addr);
1451 unsigned l3idx = pud_index(addr);
1452 unsigned l2idx = pmd_index(addr);
1453 unsigned l1idx = pte_index(addr);
1454 pgd_t l4;
1455 pud_t l3;
1456 pmd_t l2;
1457 pte_t l1;
1458
1459 xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
1460 pgd, addr, l4idx, l3idx, l2idx, l1idx);
1461
1462 l4 = pgd[l4idx];
1463 xen_raw_printk(" l4: %016lx\n", l4.pgd);
1464 xen_raw_printk(" %016lx\n", pgd_val(l4));
1465
1466 l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
1467 xen_raw_printk(" l3: %016lx\n", l3.pud);
1468 xen_raw_printk(" %016lx\n", pud_val(l3));
1469
1470 l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
1471 xen_raw_printk(" l2: %016lx\n", l2.pmd);
1472 xen_raw_printk(" %016lx\n", pmd_val(l2));
1473
1474 l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
1475 xen_raw_printk(" l1: %016lx\n", l1.pte);
1476 xen_raw_printk(" %016lx\n", pte_val(l1));
1477}
1478#endif
1479
1480static void set_page_prot(void *addr, pgprot_t prot)
1481{
1482 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1483 pte_t pte = pfn_pte(pfn, prot);
1484
1485 xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
1486 addr, pfn, get_phys_to_machine(pfn),
1487 pgprot_val(prot), pte.pte);
1488
1489 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1490 BUG();
1491}
1492
1493static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1494{
1495 unsigned pmdidx, pteidx;
1496 unsigned ident_pte;
1497 unsigned long pfn;
1498
1499 ident_pte = 0;
1500 pfn = 0;
1501 for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1502 pte_t *pte_page;
1503
1504 /* Reuse or allocate a page of ptes */
1505 if (pmd_present(pmd[pmdidx]))
1506 pte_page = m2v(pmd[pmdidx].pmd);
1507 else {
1508 /* Check for free pte pages */
1509 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1510 break;
1511
1512 pte_page = &level1_ident_pgt[ident_pte];
1513 ident_pte += PTRS_PER_PTE;
1514
1515 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1516 }
1517
1518 /* Install mappings */
1519 for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1520 pte_t pte;
1521
1522 if (pfn > max_pfn_mapped)
1523 max_pfn_mapped = pfn;
1524
1525 if (!pte_none(pte_page[pteidx]))
1526 continue;
1527
1528 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1529 pte_page[pteidx] = pte;
1530 }
1531 }
1532
1533 for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1534 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1535
1536 set_page_prot(pmd, PAGE_KERNEL_RO);
1537}
1538
1539#ifdef CONFIG_X86_64
1540static void convert_pfn_mfn(void *v)
1541{
1542 pte_t *pte = v;
1543 int i;
1544
1545 /* All levels are converted the same way, so just treat them
1546 as ptes. */
1547 for(i = 0; i < PTRS_PER_PTE; i++)
1548 pte[i] = xen_make_pte(pte[i].pte);
1549}
1550
1551/*
1552 * Set up the initial kernel pagetable.
1553 *
1554 * We can construct this by grafting the Xen provided pagetable into
1555 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1556 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1557 * means that only the kernel has a physical mapping to start with -
1558 * but that's enough to get __va working. We need to fill in the rest
1559 * of the physical mapping once some sort of allocator has been set
1560 * up.
1561 */
1562static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1563{
1564 pud_t *l3;
1565 pmd_t *l2;
1566
1567 /* Zap identity mapping */
1568 init_level4_pgt[0] = __pgd(0);
1569
1570 /* Pre-constructed entries are in pfn, so convert to mfn */
1571 convert_pfn_mfn(init_level4_pgt);
1572 convert_pfn_mfn(level3_ident_pgt);
1573 convert_pfn_mfn(level3_kernel_pgt);
1574
1575 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1576 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1577
1578 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1579 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1580
1581 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1582 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1583 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1584
1585 /* Set up identity map */
1586 xen_map_identity_early(level2_ident_pgt, max_pfn);
1587
1588 /* Make pagetable pieces RO */
1589 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1590 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1591 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1592 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1593 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1594 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1595
1596 /* Pin down new L4 */
1597 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1598 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1599
1600 /* Unpin Xen-provided one */
1601 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1602
1603 /* Switch over */
1604 pgd = init_level4_pgt;
1605
1606 /*
1607 * At this stage there can be no user pgd, and no page
1608 * structure to attach it to, so make sure we just set kernel
1609 * pgd.
1610 */
1611 xen_mc_batch();
1612 __xen_write_cr3(true, __pa(pgd));
1613 xen_mc_issue(PARAVIRT_LAZY_CPU);
1614
1615 reserve_early(__pa(xen_start_info->pt_base),
1616 __pa(xen_start_info->pt_base +
1617 xen_start_info->nr_pt_frames * PAGE_SIZE),
1618 "XEN PAGETABLES");
1619
1620 return pgd;
1621}
1622#else /* !CONFIG_X86_64 */
1623static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1624
1625static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1626{
1627 pmd_t *kernel_pmd;
1628
1629 init_pg_tables_start = __pa(pgd);
1630 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1631 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1632
1633 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1634 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1635
1636 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1637
1638 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1639 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1640 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1641
1642 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1643 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1644 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1645
1646 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1647
1648 xen_write_cr3(__pa(swapper_pg_dir));
1649
1650 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1651
1652 return swapper_pg_dir;
1274} 1653}
1654#endif /* CONFIG_X86_64 */
1275 1655
1276/* First C function to be called on Xen boot */ 1656/* First C function to be called on Xen boot */
1277asmlinkage void __init xen_start_kernel(void) 1657asmlinkage void __init xen_start_kernel(void)
@@ -1301,53 +1681,56 @@ asmlinkage void __init xen_start_kernel(void)
1301 1681
1302 machine_ops = xen_machine_ops; 1682 machine_ops = xen_machine_ops;
1303 1683
1304#ifdef CONFIG_SMP 1684#ifdef CONFIG_X86_64
1305 smp_ops = xen_smp_ops; 1685 /* Disable until direct per-cpu data access. */
1686 have_vcpu_info_placement = 0;
1687 x86_64_init_pda();
1306#endif 1688#endif
1307 1689
1690 xen_smp_init();
1691
1308 /* Get mfn list */ 1692 /* Get mfn list */
1309 if (!xen_feature(XENFEAT_auto_translated_physmap)) 1693 if (!xen_feature(XENFEAT_auto_translated_physmap))
1310 xen_build_dynamic_phys_to_machine(); 1694 xen_build_dynamic_phys_to_machine();
1311 1695
1312 pgd = (pgd_t *)xen_start_info->pt_base; 1696 pgd = (pgd_t *)xen_start_info->pt_base;
1313 1697
1314 init_pg_tables_start = __pa(pgd); 1698 /* Prevent unwanted bits from being set in PTEs. */
1315 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; 1699 __supported_pte_mask &= ~_PAGE_GLOBAL;
1316 max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT; 1700 if (!is_initial_xendomain())
1317 1701 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1318 init_mm.pgd = pgd; /* use the Xen pagetables to start */
1319
1320 /* keep using Xen gdt for now; no urgent need to change it */
1321
1322 x86_write_percpu(xen_cr3, __pa(pgd));
1323 x86_write_percpu(xen_current_cr3, __pa(pgd));
1324 1702
1325 /* Don't do the full vcpu_info placement stuff until we have a 1703 /* Don't do the full vcpu_info placement stuff until we have a
1326 possible map and a non-dummy shared_info. */ 1704 possible map and a non-dummy shared_info. */
1327 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1705 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1328 1706
1707 xen_raw_console_write("mapping kernel into physical memory\n");
1708 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
1709
1710 init_mm.pgd = pgd;
1711
1712 /* keep using Xen gdt for now; no urgent need to change it */
1713
1329 pv_info.kernel_rpl = 1; 1714 pv_info.kernel_rpl = 1;
1330 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1715 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1331 pv_info.kernel_rpl = 0; 1716 pv_info.kernel_rpl = 0;
1332 1717
1333 /* Prevent unwanted bits from being set in PTEs. */
1334 __supported_pte_mask &= ~_PAGE_GLOBAL;
1335 if (!is_initial_xendomain())
1336 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1337
1338 /* set the limit of our address space */ 1718 /* set the limit of our address space */
1339 xen_reserve_top(); 1719 xen_reserve_top();
1340 1720
1721#ifdef CONFIG_X86_32
1341 /* set up basic CPUID stuff */ 1722 /* set up basic CPUID stuff */
1342 cpu_detect(&new_cpu_data); 1723 cpu_detect(&new_cpu_data);
1343 new_cpu_data.hard_math = 1; 1724 new_cpu_data.hard_math = 1;
1344 new_cpu_data.x86_capability[0] = cpuid_edx(1); 1725 new_cpu_data.x86_capability[0] = cpuid_edx(1);
1726#endif
1345 1727
1346 /* Poke various useful things into boot_params */ 1728 /* Poke various useful things into boot_params */
1347 boot_params.hdr.type_of_loader = (9 << 4) | 0; 1729 boot_params.hdr.type_of_loader = (9 << 4) | 0;
1348 boot_params.hdr.ramdisk_image = xen_start_info->mod_start 1730 boot_params.hdr.ramdisk_image = xen_start_info->mod_start
1349 ? __pa(xen_start_info->mod_start) : 0; 1731 ? __pa(xen_start_info->mod_start) : 0;
1350 boot_params.hdr.ramdisk_size = xen_start_info->mod_len; 1732 boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1733 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1351 1734
1352 if (!is_initial_xendomain()) { 1735 if (!is_initial_xendomain()) {
1353 add_preferred_console("xenboot", 0, NULL); 1736 add_preferred_console("xenboot", 0, NULL);
@@ -1355,6 +1738,21 @@ asmlinkage void __init xen_start_kernel(void)
1355 add_preferred_console("hvc", 0, NULL); 1738 add_preferred_console("hvc", 0, NULL);
1356 } 1739 }
1357 1740
1741 xen_raw_console_write("about to get started...\n");
1742
1743#if 0
1744 xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
1745 &boot_params, __pa_symbol(&boot_params),
1746 __va(__pa_symbol(&boot_params)));
1747
1748 walk(pgd, &boot_params);
1749 walk(pgd, __va(__pa(&boot_params)));
1750#endif
1751
1358 /* Start the world */ 1752 /* Start the world */
1753#ifdef CONFIG_X86_32
1359 i386_start_kernel(); 1754 i386_start_kernel();
1755#else
1756 x86_64_start_reservations((char *)__pa_symbol(&boot_params));
1757#endif
1360} 1758}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index ff0aa74afaa1..a44d56e38bd1 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -44,8 +44,10 @@
44 44
45#include <asm/pgtable.h> 45#include <asm/pgtable.h>
46#include <asm/tlbflush.h> 46#include <asm/tlbflush.h>
47#include <asm/fixmap.h>
47#include <asm/mmu_context.h> 48#include <asm/mmu_context.h>
48#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/linkage.h>
49 51
50#include <asm/xen/hypercall.h> 52#include <asm/xen/hypercall.h>
51#include <asm/xen/hypervisor.h> 53#include <asm/xen/hypervisor.h>
@@ -56,26 +58,29 @@
56#include "multicalls.h" 58#include "multicalls.h"
57#include "mmu.h" 59#include "mmu.h"
58 60
61/*
62 * Just beyond the highest usermode address. STACK_TOP_MAX has a
63 * redzone above it, so round it up to a PGD boundary.
64 */
65#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
66
67
59#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) 68#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
60#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) 69#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
61 70
62/* Placeholder for holes in the address space */ 71/* Placeholder for holes in the address space */
63static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] 72static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
64 __attribute__((section(".data.page_aligned"))) =
65 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; 73 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
66 74
67 /* Array of pointers to pages containing p2m entries */ 75 /* Array of pointers to pages containing p2m entries */
68static unsigned long *p2m_top[TOP_ENTRIES] 76static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
69 __attribute__((section(".data.page_aligned"))) =
70 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; 77 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
71 78
72/* Arrays of p2m arrays expressed in mfns used for save/restore */ 79/* Arrays of p2m arrays expressed in mfns used for save/restore */
73static unsigned long p2m_top_mfn[TOP_ENTRIES] 80static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
74 __attribute__((section(".bss.page_aligned")));
75 81
76static unsigned long p2m_top_mfn_list[ 82static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
77 PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)] 83 __page_aligned_bss;
78 __attribute__((section(".bss.page_aligned")));
79 84
80static inline unsigned p2m_top_index(unsigned long pfn) 85static inline unsigned p2m_top_index(unsigned long pfn)
81{ 86{
@@ -181,15 +186,16 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
181 p2m_top[topidx][idx] = mfn; 186 p2m_top[topidx][idx] = mfn;
182} 187}
183 188
184xmaddr_t arbitrary_virt_to_machine(unsigned long address) 189xmaddr_t arbitrary_virt_to_machine(void *vaddr)
185{ 190{
191 unsigned long address = (unsigned long)vaddr;
186 unsigned int level; 192 unsigned int level;
187 pte_t *pte = lookup_address(address, &level); 193 pte_t *pte = lookup_address(address, &level);
188 unsigned offset = address & ~PAGE_MASK; 194 unsigned offset = address & ~PAGE_MASK;
189 195
190 BUG_ON(pte == NULL); 196 BUG_ON(pte == NULL);
191 197
192 return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); 198 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
193} 199}
194 200
195void make_lowmem_page_readonly(void *vaddr) 201void make_lowmem_page_readonly(void *vaddr)
@@ -256,7 +262,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
256 262
257 xen_mc_batch(); 263 xen_mc_batch();
258 264
259 u.ptr = virt_to_machine(ptr).maddr; 265 /* ptr may be ioremapped for 64-bit pagetable setup */
266 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
260 u.val = pmd_val_ma(val); 267 u.val = pmd_val_ma(val);
261 extend_mmu_update(&u); 268 extend_mmu_update(&u);
262 269
@@ -283,35 +290,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
283 */ 290 */
284void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) 291void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
285{ 292{
286 pgd_t *pgd; 293 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
287 pud_t *pud;
288 pmd_t *pmd;
289 pte_t *pte;
290
291 pgd = swapper_pg_dir + pgd_index(vaddr);
292 if (pgd_none(*pgd)) {
293 BUG();
294 return;
295 }
296 pud = pud_offset(pgd, vaddr);
297 if (pud_none(*pud)) {
298 BUG();
299 return;
300 }
301 pmd = pmd_offset(pud, vaddr);
302 if (pmd_none(*pmd)) {
303 BUG();
304 return;
305 }
306 pte = pte_offset_kernel(pmd, vaddr);
307 /* <mfn,flags> stored as-is, to permit clearing entries */
308 xen_set_pte(pte, mfn_pte(mfn, flags));
309
310 /*
311 * It's enough to flush this one mapping.
312 * (PGE mappings get flushed as well)
313 */
314 __flush_tlb_one(vaddr);
315} 294}
316 295
317void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 296void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -418,7 +397,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
418 397
419 xen_mc_batch(); 398 xen_mc_batch();
420 399
421 u.ptr = virt_to_machine(ptr).maddr; 400 /* ptr may be ioremapped for 64-bit pagetable setup */
401 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
422 u.val = pud_val_ma(val); 402 u.val = pud_val_ma(val);
423 extend_mmu_update(&u); 403 extend_mmu_update(&u);
424 404
@@ -441,14 +421,19 @@ void xen_set_pud(pud_t *ptr, pud_t val)
441 421
442void xen_set_pte(pte_t *ptep, pte_t pte) 422void xen_set_pte(pte_t *ptep, pte_t pte)
443{ 423{
424#ifdef CONFIG_X86_PAE
444 ptep->pte_high = pte.pte_high; 425 ptep->pte_high = pte.pte_high;
445 smp_wmb(); 426 smp_wmb();
446 ptep->pte_low = pte.pte_low; 427 ptep->pte_low = pte.pte_low;
428#else
429 *ptep = pte;
430#endif
447} 431}
448 432
433#ifdef CONFIG_X86_PAE
449void xen_set_pte_atomic(pte_t *ptep, pte_t pte) 434void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
450{ 435{
451 set_64bit((u64 *)ptep, pte_val_ma(pte)); 436 set_64bit((u64 *)ptep, native_pte_val(pte));
452} 437}
453 438
454void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 439void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -462,6 +447,7 @@ void xen_pmd_clear(pmd_t *pmdp)
462{ 447{
463 set_pmd(pmdp, __pmd(0)); 448 set_pmd(pmdp, __pmd(0));
464} 449}
450#endif /* CONFIG_X86_PAE */
465 451
466pmd_t xen_make_pmd(pmdval_t pmd) 452pmd_t xen_make_pmd(pmdval_t pmd)
467{ 453{
@@ -469,78 +455,189 @@ pmd_t xen_make_pmd(pmdval_t pmd)
469 return native_make_pmd(pmd); 455 return native_make_pmd(pmd);
470} 456}
471 457
458#if PAGETABLE_LEVELS == 4
459pudval_t xen_pud_val(pud_t pud)
460{
461 return pte_mfn_to_pfn(pud.pud);
462}
463
464pud_t xen_make_pud(pudval_t pud)
465{
466 pud = pte_pfn_to_mfn(pud);
467
468 return native_make_pud(pud);
469}
470
471pgd_t *xen_get_user_pgd(pgd_t *pgd)
472{
473 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
474 unsigned offset = pgd - pgd_page;
475 pgd_t *user_ptr = NULL;
476
477 if (offset < pgd_index(USER_LIMIT)) {
478 struct page *page = virt_to_page(pgd_page);
479 user_ptr = (pgd_t *)page->private;
480 if (user_ptr)
481 user_ptr += offset;
482 }
483
484 return user_ptr;
485}
486
487static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
488{
489 struct mmu_update u;
490
491 u.ptr = virt_to_machine(ptr).maddr;
492 u.val = pgd_val_ma(val);
493 extend_mmu_update(&u);
494}
495
496/*
497 * Raw hypercall-based set_pgd, intended for use in early boot before
498 * there's a page structure. This implies:
499 * 1. The only existing pagetable is the kernel's
500 * 2. It is always pinned
501 * 3. It has no user pagetable attached to it
502 */
503void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
504{
505 preempt_disable();
506
507 xen_mc_batch();
508
509 __xen_set_pgd_hyper(ptr, val);
510
511 xen_mc_issue(PARAVIRT_LAZY_MMU);
512
513 preempt_enable();
514}
515
516void xen_set_pgd(pgd_t *ptr, pgd_t val)
517{
518 pgd_t *user_ptr = xen_get_user_pgd(ptr);
519
520 /* If page is not pinned, we can just update the entry
521 directly */
522 if (!page_pinned(ptr)) {
523 *ptr = val;
524 if (user_ptr) {
525 WARN_ON(page_pinned(user_ptr));
526 *user_ptr = val;
527 }
528 return;
529 }
530
531 /* If it's pinned, then we can at least batch the kernel and
532 user updates together. */
533 xen_mc_batch();
534
535 __xen_set_pgd_hyper(ptr, val);
536 if (user_ptr)
537 __xen_set_pgd_hyper(user_ptr, val);
538
539 xen_mc_issue(PARAVIRT_LAZY_MMU);
540}
541#endif /* PAGETABLE_LEVELS == 4 */
542
472/* 543/*
473 (Yet another) pagetable walker. This one is intended for pinning a 544 * (Yet another) pagetable walker. This one is intended for pinning a
474 pagetable. This means that it walks a pagetable and calls the 545 * pagetable. This means that it walks a pagetable and calls the
475 callback function on each page it finds making up the page table, 546 * callback function on each page it finds making up the page table,
476 at every level. It walks the entire pagetable, but it only bothers 547 * at every level. It walks the entire pagetable, but it only bothers
477 pinning pte pages which are below pte_limit. In the normal case 548 * pinning pte pages which are below limit. In the normal case this
478 this will be TASK_SIZE, but at boot we need to pin up to 549 * will be STACK_TOP_MAX, but at boot we need to pin up to
479 FIXADDR_TOP. But the important bit is that we don't pin beyond 550 * FIXADDR_TOP.
480 there, because then we start getting into Xen's ptes. 551 *
481*/ 552 * For 32-bit the important bit is that we don't pin beyond there,
482static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), 553 * because then we start getting into Xen's ptes.
554 *
555 * For 64-bit, we must skip the Xen hole in the middle of the address
556 * space, just after the big x86-64 virtual hole.
557 */
558static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
483 unsigned long limit) 559 unsigned long limit)
484{ 560{
485 pgd_t *pgd = pgd_base;
486 int flush = 0; 561 int flush = 0;
487 unsigned long addr = 0; 562 unsigned hole_low, hole_high;
488 unsigned long pgd_next; 563 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
564 unsigned pgdidx, pudidx, pmdidx;
489 565
490 BUG_ON(limit > FIXADDR_TOP); 566 /* The limit is the last byte to be touched */
567 limit--;
568 BUG_ON(limit >= FIXADDR_TOP);
491 569
492 if (xen_feature(XENFEAT_auto_translated_physmap)) 570 if (xen_feature(XENFEAT_auto_translated_physmap))
493 return 0; 571 return 0;
494 572
495 for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { 573 /*
574 * 64-bit has a great big hole in the middle of the address
575 * space, which contains the Xen mappings. On 32-bit these
576 * will end up making a zero-sized hole, so skipping it is a no-op.
577 */
578 hole_low = pgd_index(USER_LIMIT);
579 hole_high = pgd_index(PAGE_OFFSET);
580
581 pgdidx_limit = pgd_index(limit);
582#if PTRS_PER_PUD > 1
583 pudidx_limit = pud_index(limit);
584#else
585 pudidx_limit = 0;
586#endif
587#if PTRS_PER_PMD > 1
588 pmdidx_limit = pmd_index(limit);
589#else
590 pmdidx_limit = 0;
591#endif
592
593 flush |= (*func)(virt_to_page(pgd), PT_PGD);
594
595 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
496 pud_t *pud; 596 pud_t *pud;
497 unsigned long pud_limit, pud_next;
498 597
499 pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); 598 if (pgdidx >= hole_low && pgdidx < hole_high)
599 continue;
500 600
501 if (!pgd_val(*pgd)) 601 if (!pgd_val(pgd[pgdidx]))
502 continue; 602 continue;
503 603
504 pud = pud_offset(pgd, 0); 604 pud = pud_offset(&pgd[pgdidx], 0);
505 605
506 if (PTRS_PER_PUD > 1) /* not folded */ 606 if (PTRS_PER_PUD > 1) /* not folded */
507 flush |= (*func)(virt_to_page(pud), PT_PUD); 607 flush |= (*func)(virt_to_page(pud), PT_PUD);
508 608
509 for (; addr != pud_limit; pud++, addr = pud_next) { 609 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
510 pmd_t *pmd; 610 pmd_t *pmd;
511 unsigned long pmd_limit;
512 611
513 pud_next = pud_addr_end(addr, pud_limit); 612 if (pgdidx == pgdidx_limit &&
514 613 pudidx > pudidx_limit)
515 if (pud_next < limit) 614 goto out;
516 pmd_limit = pud_next;
517 else
518 pmd_limit = limit;
519 615
520 if (pud_none(*pud)) 616 if (pud_none(pud[pudidx]))
521 continue; 617 continue;
522 618
523 pmd = pmd_offset(pud, 0); 619 pmd = pmd_offset(&pud[pudidx], 0);
524 620
525 if (PTRS_PER_PMD > 1) /* not folded */ 621 if (PTRS_PER_PMD > 1) /* not folded */
526 flush |= (*func)(virt_to_page(pmd), PT_PMD); 622 flush |= (*func)(virt_to_page(pmd), PT_PMD);
527 623
528 for (; addr != pmd_limit; pmd++) { 624 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
529 addr += (PAGE_SIZE * PTRS_PER_PTE); 625 struct page *pte;
530 if ((pmd_limit-1) < (addr-1)) { 626
531 addr = pmd_limit; 627 if (pgdidx == pgdidx_limit &&
532 break; 628 pudidx == pudidx_limit &&
533 } 629 pmdidx > pmdidx_limit)
630 goto out;
534 631
535 if (pmd_none(*pmd)) 632 if (pmd_none(pmd[pmdidx]))
536 continue; 633 continue;
537 634
538 flush |= (*func)(pmd_page(*pmd), PT_PTE); 635 pte = pmd_page(pmd[pmdidx]);
636 flush |= (*func)(pte, PT_PTE);
539 } 637 }
540 } 638 }
541 } 639 }
542 640out:
543 flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
544 641
545 return flush; 642 return flush;
546} 643}
@@ -622,14 +719,31 @@ void xen_pgd_pin(pgd_t *pgd)
622{ 719{
623 xen_mc_batch(); 720 xen_mc_batch();
624 721
625 if (pgd_walk(pgd, pin_page, TASK_SIZE)) { 722 if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
626 /* re-enable interrupts for kmap_flush_unused */ 723 /* re-enable interrupts for kmap_flush_unused */
627 xen_mc_issue(0); 724 xen_mc_issue(0);
628 kmap_flush_unused(); 725 kmap_flush_unused();
629 xen_mc_batch(); 726 xen_mc_batch();
630 } 727 }
631 728
729#ifdef CONFIG_X86_64
730 {
731 pgd_t *user_pgd = xen_get_user_pgd(pgd);
732
733 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
734
735 if (user_pgd) {
736 pin_page(virt_to_page(user_pgd), PT_PGD);
737 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
738 }
739 }
740#else /* CONFIG_X86_32 */
741#ifdef CONFIG_X86_PAE
742 /* Need to make sure unshared kernel PMD is pinnable */
743 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
744#endif
632 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 745 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
746#endif /* CONFIG_X86_64 */
633 xen_mc_issue(0); 747 xen_mc_issue(0);
634} 748}
635 749
@@ -656,9 +770,11 @@ void xen_mm_pin_all(void)
656 spin_unlock_irqrestore(&pgd_lock, flags); 770 spin_unlock_irqrestore(&pgd_lock, flags);
657} 771}
658 772
659/* The init_mm pagetable is really pinned as soon as its created, but 773/*
660 that's before we have page structures to store the bits. So do all 774 * The init_mm pagetable is really pinned as soon as it's created, but
661 the book-keeping now. */ 775 * that's before we have page structures to store the bits. So do all
776 * the book-keeping now.
777 */
662static __init int mark_pinned(struct page *page, enum pt_level level) 778static __init int mark_pinned(struct page *page, enum pt_level level)
663{ 779{
664 SetPagePinned(page); 780 SetPagePinned(page);
@@ -708,7 +824,23 @@ static void xen_pgd_unpin(pgd_t *pgd)
708 824
709 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 825 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
710 826
711 pgd_walk(pgd, unpin_page, TASK_SIZE); 827#ifdef CONFIG_X86_64
828 {
829 pgd_t *user_pgd = xen_get_user_pgd(pgd);
830
831 if (user_pgd) {
832 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
833 unpin_page(virt_to_page(user_pgd), PT_PGD);
834 }
835 }
836#endif
837
838#ifdef CONFIG_X86_PAE
839 /* Need to make sure unshared kernel PMD is unpinned */
840 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
841#endif
842
843 pgd_walk(pgd, unpin_page, USER_LIMIT);
712 844
713 xen_mc_issue(0); 845 xen_mc_issue(0);
714} 846}
@@ -727,7 +859,6 @@ void xen_mm_unpin_all(void)
727 list_for_each_entry(page, &pgd_list, lru) { 859 list_for_each_entry(page, &pgd_list, lru) {
728 if (PageSavePinned(page)) { 860 if (PageSavePinned(page)) {
729 BUG_ON(!PagePinned(page)); 861 BUG_ON(!PagePinned(page));
730 printk("unpinning pinned %p\n", page_address(page));
731 xen_pgd_unpin((pgd_t *)page_address(page)); 862 xen_pgd_unpin((pgd_t *)page_address(page));
732 ClearPageSavePinned(page); 863 ClearPageSavePinned(page);
733 } 864 }
@@ -757,8 +888,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
757static void drop_other_mm_ref(void *info) 888static void drop_other_mm_ref(void *info)
758{ 889{
759 struct mm_struct *mm = info; 890 struct mm_struct *mm = info;
891 struct mm_struct *active_mm;
892
893#ifdef CONFIG_X86_64
894 active_mm = read_pda(active_mm);
895#else
896 active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
897#endif
760 898
761 if (__get_cpu_var(cpu_tlbstate).active_mm == mm) 899 if (active_mm == mm)
762 leave_mm(smp_processor_id()); 900 leave_mm(smp_processor_id());
763 901
764 /* If this cpu still has a stale cr3 reference, then make sure 902 /* If this cpu still has a stale cr3 reference, then make sure
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 297bf9f5b8bc..0f59bd03f9e3 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -10,18 +10,6 @@ enum pt_level {
10 PT_PTE 10 PT_PTE
11}; 11};
12 12
13/*
14 * Page-directory addresses above 4GB do not fit into architectural %cr3.
15 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
16 * must use the following accessor macros to pack/unpack valid MFNs.
17 *
18 * Note that Xen is using the fact that the pagetable base is always
19 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
20 * of cr3.
21 */
22#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
23#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
24
25 13
26void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 14void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
27 15
@@ -44,13 +32,26 @@ pgd_t xen_make_pgd(pgdval_t);
44void xen_set_pte(pte_t *ptep, pte_t pteval); 32void xen_set_pte(pte_t *ptep, pte_t pteval);
45void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 33void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
46 pte_t *ptep, pte_t pteval); 34 pte_t *ptep, pte_t pteval);
35
36#ifdef CONFIG_X86_PAE
47void xen_set_pte_atomic(pte_t *ptep, pte_t pte); 37void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
38void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
39void xen_pmd_clear(pmd_t *pmdp);
40#endif /* CONFIG_X86_PAE */
41
48void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); 42void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
49void xen_set_pud(pud_t *ptr, pud_t val); 43void xen_set_pud(pud_t *ptr, pud_t val);
50void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval); 44void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
51void xen_set_pud_hyper(pud_t *ptr, pud_t val); 45void xen_set_pud_hyper(pud_t *ptr, pud_t val);
52void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 46
53void xen_pmd_clear(pmd_t *pmdp); 47#if PAGETABLE_LEVELS == 4
48pudval_t xen_pud_val(pud_t pud);
49pud_t xen_make_pud(pudval_t pudval);
50void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
51void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
52#endif
53
54pgd_t *xen_get_user_pgd(pgd_t *pgd);
54 55
55pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 56pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
56void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 57void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 3c63c4da7ed1..9efd1c6c9776 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -76,6 +76,7 @@ void xen_mc_flush(void)
76 if (ret) { 76 if (ret) {
77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", 77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
78 ret, smp_processor_id()); 78 ret, smp_processor_id());
79 dump_stack();
79 for (i = 0; i < b->mcidx; i++) { 80 for (i = 0; i < b->mcidx; i++) {
80 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", 81 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
81 i+1, b->mcidx, 82 i+1, b->mcidx,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index e0a39595bde3..b6acc3a0af46 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -83,30 +83,72 @@ static void xen_idle(void)
83 83
84/* 84/*
85 * Set the bit indicating "nosegneg" library variants should be used. 85 * Set the bit indicating "nosegneg" library variants should be used.
86 * We only need to bother in pure 32-bit mode; compat 32-bit processes
87 * can have un-truncated segments, so wrapping around is allowed.
86 */ 88 */
87static void __init fiddle_vdso(void) 89static void __init fiddle_vdso(void)
88{ 90{
89 extern const char vdso32_default_start; 91#ifdef CONFIG_X86_32
90 u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); 92 u32 *mask;
93 mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
91 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; 94 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
95 mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
96 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
97#endif
92} 98}
93 99
94void xen_enable_sysenter(void) 100static __cpuinit int register_callback(unsigned type, const void *func)
95{ 101{
96 int cpu = smp_processor_id(); 102 struct callback_register callback = {
97 extern void xen_sysenter_target(void); 103 .type = type,
98 /* Mask events on entry, even though they get enabled immediately */ 104 .address = XEN_CALLBACK(__KERNEL_CS, func),
99 static struct callback_register sysenter = {
100 .type = CALLBACKTYPE_sysenter,
101 .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
102 .flags = CALLBACKF_mask_events, 105 .flags = CALLBACKF_mask_events,
103 }; 106 };
104 107
105 if (!boot_cpu_has(X86_FEATURE_SEP) || 108 return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
106 HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) { 109}
107 clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); 110
108 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); 111void __cpuinit xen_enable_sysenter(void)
112{
113 extern void xen_sysenter_target(void);
114 int ret;
115 unsigned sysenter_feature;
116
117#ifdef CONFIG_X86_32
118 sysenter_feature = X86_FEATURE_SEP;
119#else
120 sysenter_feature = X86_FEATURE_SYSENTER32;
121#endif
122
123 if (!boot_cpu_has(sysenter_feature))
124 return;
125
126 ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
127 if(ret != 0)
128 setup_clear_cpu_cap(sysenter_feature);
129}
130
131void __cpuinit xen_enable_syscall(void)
132{
133#ifdef CONFIG_X86_64
134 int ret;
135 extern void xen_syscall_target(void);
136 extern void xen_syscall32_target(void);
137
138 ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
139 if (ret != 0) {
140 printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
141 /* Pretty fatal; 64-bit userspace has no other
142 mechanism for syscalls. */
109 } 143 }
144
145 if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
146 ret = register_callback(CALLBACKTYPE_syscall32,
147 xen_syscall32_target);
148 if (ret != 0)
149 setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
150 }
151#endif /* CONFIG_X86_64 */
110} 152}
111 153
112void __init xen_arch_setup(void) 154void __init xen_arch_setup(void)
@@ -120,10 +162,12 @@ void __init xen_arch_setup(void)
120 if (!xen_feature(XENFEAT_auto_translated_physmap)) 162 if (!xen_feature(XENFEAT_auto_translated_physmap))
121 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); 163 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
122 164
123 HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, 165 if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
124 __KERNEL_CS, (unsigned long)xen_failsafe_callback); 166 register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
167 BUG();
125 168
126 xen_enable_sysenter(); 169 xen_enable_sysenter();
170 xen_enable_syscall();
127 171
128 set_iopl.iopl = 1; 172 set_iopl.iopl = 1;
129 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 173 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
@@ -143,11 +187,6 @@ void __init xen_arch_setup(void)
143 187
144 pm_idle = xen_idle; 188 pm_idle = xen_idle;
145 189
146#ifdef CONFIG_SMP
147 /* fill cpus_possible with all available cpus */
148 xen_fill_possible_map();
149#endif
150
151 paravirt_disable_iospace(); 190 paravirt_disable_iospace();
152 191
153 fiddle_vdso(); 192 fiddle_vdso();
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 233156f39b7f..e693812ac59a 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -15,6 +15,7 @@
15 * This does not handle HOTPLUG_CPU yet. 15 * This does not handle HOTPLUG_CPU yet.
16 */ 16 */
17#include <linux/sched.h> 17#include <linux/sched.h>
18#include <linux/kernel_stat.h>
18#include <linux/err.h> 19#include <linux/err.h>
19#include <linux/smp.h> 20#include <linux/smp.h>
20 21
@@ -35,6 +36,8 @@
35#include "xen-ops.h" 36#include "xen-ops.h"
36#include "mmu.h" 37#include "mmu.h"
37 38
39static void __cpuinit xen_init_lock_cpu(int cpu);
40
38cpumask_t xen_cpu_initialized_map; 41cpumask_t xen_cpu_initialized_map;
39 42
40static DEFINE_PER_CPU(int, resched_irq); 43static DEFINE_PER_CPU(int, resched_irq);
@@ -66,13 +69,22 @@ static __cpuinit void cpu_bringup_and_idle(void)
66 int cpu = smp_processor_id(); 69 int cpu = smp_processor_id();
67 70
68 cpu_init(); 71 cpu_init();
72 preempt_disable();
73
69 xen_enable_sysenter(); 74 xen_enable_sysenter();
75 xen_enable_syscall();
70 76
71 preempt_disable(); 77 cpu = smp_processor_id();
72 per_cpu(cpu_state, cpu) = CPU_ONLINE; 78 smp_store_cpu_info(cpu);
79 cpu_data(cpu).x86_max_cores = 1;
80 set_cpu_sibling_map(cpu);
73 81
74 xen_setup_cpu_clockevents(); 82 xen_setup_cpu_clockevents();
75 83
84 cpu_set(cpu, cpu_online_map);
85 x86_write_percpu(cpu_state, CPU_ONLINE);
86 wmb();
87
76 /* We can take interrupts now: we're officially "up". */ 88 /* We can take interrupts now: we're officially "up". */
77 local_irq_enable(); 89 local_irq_enable();
78 90
@@ -141,56 +153,39 @@ static int xen_smp_intr_init(unsigned int cpu)
141 return rc; 153 return rc;
142} 154}
143 155
144void __init xen_fill_possible_map(void) 156static void __init xen_fill_possible_map(void)
145{ 157{
146 int i, rc; 158 int i, rc;
147 159
148 for (i = 0; i < NR_CPUS; i++) { 160 for (i = 0; i < NR_CPUS; i++) {
149 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 161 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
150 if (rc >= 0) 162 if (rc >= 0) {
163 num_processors++;
151 cpu_set(i, cpu_possible_map); 164 cpu_set(i, cpu_possible_map);
165 }
152 } 166 }
153} 167}
154 168
155void __init xen_smp_prepare_boot_cpu(void) 169static void __init xen_smp_prepare_boot_cpu(void)
156{ 170{
157 int cpu;
158
159 BUG_ON(smp_processor_id() != 0); 171 BUG_ON(smp_processor_id() != 0);
160 native_smp_prepare_boot_cpu(); 172 native_smp_prepare_boot_cpu();
161 173
162 /* We've switched to the "real" per-cpu gdt, so make sure the 174 /* We've switched to the "real" per-cpu gdt, so make sure the
163 old memory can be recycled */ 175 old memory can be recycled */
164 make_lowmem_page_readwrite(&per_cpu__gdt_page); 176 make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
165
166 for_each_possible_cpu(cpu) {
167 cpus_clear(per_cpu(cpu_sibling_map, cpu));
168 /*
169 * cpu_core_map lives in a per cpu area that is cleared
170 * when the per cpu array is allocated.
171 *
172 * cpus_clear(per_cpu(cpu_core_map, cpu));
173 */
174 }
175 177
176 xen_setup_vcpu_info_placement(); 178 xen_setup_vcpu_info_placement();
177} 179}
178 180
179void __init xen_smp_prepare_cpus(unsigned int max_cpus) 181static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
180{ 182{
181 unsigned cpu; 183 unsigned cpu;
182 184
183 for_each_possible_cpu(cpu) { 185 xen_init_lock_cpu(0);
184 cpus_clear(per_cpu(cpu_sibling_map, cpu));
185 /*
186 * cpu_core_ map will be zeroed when the per
187 * cpu area is allocated.
188 *
189 * cpus_clear(per_cpu(cpu_core_map, cpu));
190 */
191 }
192 186
193 smp_store_cpu_info(0); 187 smp_store_cpu_info(0);
188 cpu_data(0).x86_max_cores = 1;
194 set_cpu_sibling_map(0); 189 set_cpu_sibling_map(0);
195 190
196 if (xen_smp_intr_init(0)) 191 if (xen_smp_intr_init(0))
@@ -225,7 +220,7 @@ static __cpuinit int
225cpu_initialize_context(unsigned int cpu, struct task_struct *idle) 220cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
226{ 221{
227 struct vcpu_guest_context *ctxt; 222 struct vcpu_guest_context *ctxt;
228 struct gdt_page *gdt = &per_cpu(gdt_page, cpu); 223 struct desc_struct *gdt;
229 224
230 if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) 225 if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
231 return 0; 226 return 0;
@@ -234,12 +229,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
234 if (ctxt == NULL) 229 if (ctxt == NULL)
235 return -ENOMEM; 230 return -ENOMEM;
236 231
232 gdt = get_cpu_gdt_table(cpu);
233
237 ctxt->flags = VGCF_IN_KERNEL; 234 ctxt->flags = VGCF_IN_KERNEL;
238 ctxt->user_regs.ds = __USER_DS; 235 ctxt->user_regs.ds = __USER_DS;
239 ctxt->user_regs.es = __USER_DS; 236 ctxt->user_regs.es = __USER_DS;
240 ctxt->user_regs.fs = __KERNEL_PERCPU;
241 ctxt->user_regs.gs = 0;
242 ctxt->user_regs.ss = __KERNEL_DS; 237 ctxt->user_regs.ss = __KERNEL_DS;
238#ifdef CONFIG_X86_32
239 ctxt->user_regs.fs = __KERNEL_PERCPU;
240#endif
243 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; 241 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
244 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ 242 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
245 243
@@ -249,11 +247,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
249 247
250 ctxt->ldt_ents = 0; 248 ctxt->ldt_ents = 0;
251 249
252 BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); 250 BUG_ON((unsigned long)gdt & ~PAGE_MASK);
253 make_lowmem_page_readonly(gdt->gdt); 251 make_lowmem_page_readonly(gdt);
254 252
255 ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); 253 ctxt->gdt_frames[0] = virt_to_mfn(gdt);
256 ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); 254 ctxt->gdt_ents = GDT_ENTRIES;
257 255
258 ctxt->user_regs.cs = __KERNEL_CS; 256 ctxt->user_regs.cs = __KERNEL_CS;
259 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); 257 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
@@ -261,9 +259,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
261 ctxt->kernel_ss = __KERNEL_DS; 259 ctxt->kernel_ss = __KERNEL_DS;
262 ctxt->kernel_sp = idle->thread.sp0; 260 ctxt->kernel_sp = idle->thread.sp0;
263 261
262#ifdef CONFIG_X86_32
264 ctxt->event_callback_cs = __KERNEL_CS; 263 ctxt->event_callback_cs = __KERNEL_CS;
265 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
266 ctxt->failsafe_callback_cs = __KERNEL_CS; 264 ctxt->failsafe_callback_cs = __KERNEL_CS;
265#endif
266 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
267 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; 267 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
268 268
269 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); 269 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
@@ -276,7 +276,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
276 return 0; 276 return 0;
277} 277}
278 278
279int __cpuinit xen_cpu_up(unsigned int cpu) 279static int __cpuinit xen_cpu_up(unsigned int cpu)
280{ 280{
281 struct task_struct *idle = idle_task(cpu); 281 struct task_struct *idle = idle_task(cpu);
282 int rc; 282 int rc;
@@ -287,10 +287,28 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
287 return rc; 287 return rc;
288#endif 288#endif
289 289
290#ifdef CONFIG_X86_64
291 /* Allocate node local memory for AP pdas */
292 WARN_ON(cpu == 0);
293 if (cpu > 0) {
294 rc = get_local_pda(cpu);
295 if (rc)
296 return rc;
297 }
298#endif
299
300#ifdef CONFIG_X86_32
290 init_gdt(cpu); 301 init_gdt(cpu);
291 per_cpu(current_task, cpu) = idle; 302 per_cpu(current_task, cpu) = idle;
292 irq_ctx_init(cpu); 303 irq_ctx_init(cpu);
304#else
305 cpu_pda(cpu)->pcurrent = idle;
306 clear_tsk_thread_flag(idle, TIF_FORK);
307#endif
293 xen_setup_timer(cpu); 308 xen_setup_timer(cpu);
309 xen_init_lock_cpu(cpu);
310
311 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
294 312
295 /* make sure interrupts start blocked */ 313 /* make sure interrupts start blocked */
296 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; 314 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -306,20 +324,18 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
306 if (rc) 324 if (rc)
307 return rc; 325 return rc;
308 326
309 smp_store_cpu_info(cpu);
310 set_cpu_sibling_map(cpu);
311 /* This must be done before setting cpu_online_map */
312 wmb();
313
314 cpu_set(cpu, cpu_online_map);
315
316 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); 327 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
317 BUG_ON(rc); 328 BUG_ON(rc);
318 329
330 while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
331 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
332 barrier();
333 }
334
319 return 0; 335 return 0;
320} 336}
321 337
322void xen_smp_cpus_done(unsigned int max_cpus) 338static void xen_smp_cpus_done(unsigned int max_cpus)
323{ 339{
324} 340}
325 341
@@ -335,12 +351,12 @@ static void stop_self(void *v)
335 BUG(); 351 BUG();
336} 352}
337 353
338void xen_smp_send_stop(void) 354static void xen_smp_send_stop(void)
339{ 355{
340 smp_call_function(stop_self, NULL, 0); 356 smp_call_function(stop_self, NULL, 0);
341} 357}
342 358
343void xen_smp_send_reschedule(int cpu) 359static void xen_smp_send_reschedule(int cpu)
344{ 360{
345 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); 361 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
346} 362}
@@ -355,7 +371,7 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
355 xen_send_IPI_one(cpu, vector); 371 xen_send_IPI_one(cpu, vector);
356} 372}
357 373
358void xen_smp_send_call_function_ipi(cpumask_t mask) 374static void xen_smp_send_call_function_ipi(cpumask_t mask)
359{ 375{
360 int cpu; 376 int cpu;
361 377
@@ -370,7 +386,7 @@ void xen_smp_send_call_function_ipi(cpumask_t mask)
370 } 386 }
371} 387}
372 388
373void xen_smp_send_call_function_single_ipi(int cpu) 389static void xen_smp_send_call_function_single_ipi(int cpu)
374{ 390{
375 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); 391 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
376} 392}
@@ -379,7 +395,11 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
379{ 395{
380 irq_enter(); 396 irq_enter();
381 generic_smp_call_function_interrupt(); 397 generic_smp_call_function_interrupt();
398#ifdef CONFIG_X86_32
382 __get_cpu_var(irq_stat).irq_call_count++; 399 __get_cpu_var(irq_stat).irq_call_count++;
400#else
401 add_pda(irq_call_count, 1);
402#endif
383 irq_exit(); 403 irq_exit();
384 404
385 return IRQ_HANDLED; 405 return IRQ_HANDLED;
@@ -389,8 +409,196 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
389{ 409{
390 irq_enter(); 410 irq_enter();
391 generic_smp_call_function_single_interrupt(); 411 generic_smp_call_function_single_interrupt();
412#ifdef CONFIG_X86_32
392 __get_cpu_var(irq_stat).irq_call_count++; 413 __get_cpu_var(irq_stat).irq_call_count++;
414#else
415 add_pda(irq_call_count, 1);
416#endif
393 irq_exit(); 417 irq_exit();
394 418
395 return IRQ_HANDLED; 419 return IRQ_HANDLED;
396} 420}
421
422struct xen_spinlock {
423 unsigned char lock; /* 0 -> free; 1 -> locked */
424 unsigned short spinners; /* count of waiting cpus */
425};
426
427static int xen_spin_is_locked(struct raw_spinlock *lock)
428{
429 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
430
431 return xl->lock != 0;
432}
433
434static int xen_spin_is_contended(struct raw_spinlock *lock)
435{
436 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
437
438 /* Not strictly true; this is only the count of contended
439 lock-takers entering the slow path. */
440 return xl->spinners != 0;
441}
442
443static int xen_spin_trylock(struct raw_spinlock *lock)
444{
445 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
446 u8 old = 1;
447
448 asm("xchgb %b0,%1"
449 : "+q" (old), "+m" (xl->lock) : : "memory");
450
451 return old == 0;
452}
453
454static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
455static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
456
457static inline void spinning_lock(struct xen_spinlock *xl)
458{
459 __get_cpu_var(lock_spinners) = xl;
460 wmb(); /* set lock of interest before count */
461 asm(LOCK_PREFIX " incw %0"
462 : "+m" (xl->spinners) : : "memory");
463}
464
465static inline void unspinning_lock(struct xen_spinlock *xl)
466{
467 asm(LOCK_PREFIX " decw %0"
468 : "+m" (xl->spinners) : : "memory");
469 wmb(); /* decrement count before clearing lock */
470 __get_cpu_var(lock_spinners) = NULL;
471}
472
473static noinline int xen_spin_lock_slow(struct raw_spinlock *lock)
474{
475 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
476 int irq = __get_cpu_var(lock_kicker_irq);
477 int ret;
478
479 /* If kicker interrupts not initialized yet, just spin */
480 if (irq == -1)
481 return 0;
482
483 /* announce we're spinning */
484 spinning_lock(xl);
485
486 /* clear pending */
487 xen_clear_irq_pending(irq);
488
489 /* check again make sure it didn't become free while
490 we weren't looking */
491 ret = xen_spin_trylock(lock);
492 if (ret)
493 goto out;
494
495 /* block until irq becomes pending */
496 xen_poll_irq(irq);
497 kstat_this_cpu.irqs[irq]++;
498
499out:
500 unspinning_lock(xl);
501 return ret;
502}
503
504static void xen_spin_lock(struct raw_spinlock *lock)
505{
506 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
507 int timeout;
508 u8 oldval;
509
510 do {
511 timeout = 1 << 10;
512
513 asm("1: xchgb %1,%0\n"
514 " testb %1,%1\n"
515 " jz 3f\n"
516 "2: rep;nop\n"
517 " cmpb $0,%0\n"
518 " je 1b\n"
519 " dec %2\n"
520 " jnz 2b\n"
521 "3:\n"
522 : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
523 : "1" (1)
524 : "memory");
525
526 } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock)));
527}
528
529static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
530{
531 int cpu;
532
533 for_each_online_cpu(cpu) {
534 /* XXX should mix up next cpu selection */
535 if (per_cpu(lock_spinners, cpu) == xl) {
536 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
537 break;
538 }
539 }
540}
541
542static void xen_spin_unlock(struct raw_spinlock *lock)
543{
544 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
545
546 smp_wmb(); /* make sure no writes get moved after unlock */
547 xl->lock = 0; /* release lock */
548
549 /* make sure unlock happens before kick */
550 barrier();
551
552 if (unlikely(xl->spinners))
553 xen_spin_unlock_slow(xl);
554}
555
556static __cpuinit void xen_init_lock_cpu(int cpu)
557{
558 int irq;
559 const char *name;
560
561 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
562 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
563 cpu,
564 xen_reschedule_interrupt,
565 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
566 name,
567 NULL);
568
569 if (irq >= 0) {
570 disable_irq(irq); /* make sure it's never delivered */
571 per_cpu(lock_kicker_irq, cpu) = irq;
572 }
573
574 printk("cpu %d spinlock event irq %d\n", cpu, irq);
575}
576
577static void __init xen_init_spinlocks(void)
578{
579 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
580 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
581 pv_lock_ops.spin_lock = xen_spin_lock;
582 pv_lock_ops.spin_trylock = xen_spin_trylock;
583 pv_lock_ops.spin_unlock = xen_spin_unlock;
584}
585
586static const struct smp_ops xen_smp_ops __initdata = {
587 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
588 .smp_prepare_cpus = xen_smp_prepare_cpus,
589 .cpu_up = xen_cpu_up,
590 .smp_cpus_done = xen_smp_cpus_done,
591
592 .smp_send_stop = xen_smp_send_stop,
593 .smp_send_reschedule = xen_smp_send_reschedule,
594
595 .send_call_func_ipi = xen_smp_send_call_function_ipi,
596 .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
597};
598
599void __init xen_smp_init(void)
600{
601 smp_ops = xen_smp_ops;
602 xen_fill_possible_map();
603 xen_init_spinlocks();
604}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 251669a932d4..2a234db5949b 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -38,8 +38,11 @@ void xen_post_suspend(int suspend_cancelled)
38 xen_cpu_initialized_map = cpu_online_map; 38 xen_cpu_initialized_map = cpu_online_map;
39#endif 39#endif
40 xen_vcpu_restore(); 40 xen_vcpu_restore();
41 xen_timer_resume();
42 } 41 }
43 42
44} 43}
45 44
45void xen_arch_resume(void)
46{
47 /* nothing */
48}
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41de..2497a30f41de 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm_32.S
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
new file mode 100644
index 000000000000..4038cbfe3331
--- /dev/null
+++ b/arch/x86/xen/xen-asm_64.S
@@ -0,0 +1,271 @@
1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
5
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
8
9 We only bother with direct forms (ie, vcpu in pda) of the operations
10 here; the indirect forms are better handled in C, since they're
11 generally too large to inline anyway.
12 */
13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/processor-flags.h>
18#include <asm/errno.h>
19#include <asm/segment.h>
20
21#include <xen/interface/xen.h>
22
23#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
24#define ENDPATCH(x) .globl x##_end; x##_end=.
25
26/* Pseudo-flag used for virtual NMI, which we don't implement yet */
27#define XEN_EFLAGS_NMI 0x80000000
28
29#if 0
30#include <asm/percpu.h>
31
32/*
33 Enable events. This clears the event mask and tests the pending
34 event status with one and operation. If there are pending
35 events, then enter the hypervisor to get them handled.
36 */
37ENTRY(xen_irq_enable_direct)
38 /* Unmask events */
39 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
40
41 /* Preempt here doesn't matter because that will deal with
42 any pending interrupts. The pending check may end up being
43 run on the wrong CPU, but that doesn't hurt. */
44
45 /* Test for pending */
46 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
47 jz 1f
48
492: call check_events
501:
51ENDPATCH(xen_irq_enable_direct)
52 ret
53 ENDPROC(xen_irq_enable_direct)
54 RELOC(xen_irq_enable_direct, 2b+1)
55
56/*
57 Disabling events is simply a matter of making the event mask
58 non-zero.
59 */
60ENTRY(xen_irq_disable_direct)
61 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
62ENDPATCH(xen_irq_disable_direct)
63 ret
64 ENDPROC(xen_irq_disable_direct)
65 RELOC(xen_irq_disable_direct, 0)
66
67/*
68 (xen_)save_fl is used to get the current interrupt enable status.
69 Callers expect the status to be in X86_EFLAGS_IF, and other bits
70 may be set in the return value. We take advantage of this by
71 making sure that X86_EFLAGS_IF has the right value (and other bits
72 in that byte are 0), but other bits in the return value are
73 undefined. We need to toggle the state of the bit, because
74 Xen and x86 use opposite senses (mask vs enable).
75 */
76ENTRY(xen_save_fl_direct)
77 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
78 setz %ah
79 addb %ah,%ah
80ENDPATCH(xen_save_fl_direct)
81 ret
82 ENDPROC(xen_save_fl_direct)
83 RELOC(xen_save_fl_direct, 0)
84
85/*
86 In principle the caller should be passing us a value return
87 from xen_save_fl_direct, but for robustness sake we test only
88 the X86_EFLAGS_IF flag rather than the whole byte. After
89 setting the interrupt mask state, it checks for unmasked
90 pending events and enters the hypervisor to get them delivered
91 if so.
92 */
93ENTRY(xen_restore_fl_direct)
94 testb $X86_EFLAGS_IF>>8, %ah
95 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
96 /* Preempt here doesn't matter because that will deal with
97 any pending interrupts. The pending check may end up being
98 run on the wrong CPU, but that doesn't hurt. */
99
100 /* check for unmasked and pending */
101 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
102 jz 1f
1032: call check_events
1041:
105ENDPATCH(xen_restore_fl_direct)
106 ret
107 ENDPROC(xen_restore_fl_direct)
108 RELOC(xen_restore_fl_direct, 2b+1)
109
110
111/*
112 Force an event check by making a hypercall,
113 but preserve regs before making the call.
114 */
115check_events:
116 push %rax
117 push %rcx
118 push %rdx
119 push %rsi
120 push %rdi
121 push %r8
122 push %r9
123 push %r10
124 push %r11
125 call force_evtchn_callback
126 pop %r11
127 pop %r10
128 pop %r9
129 pop %r8
130 pop %rdi
131 pop %rsi
132 pop %rdx
133 pop %rcx
134 pop %rax
135 ret
136#endif
137
138ENTRY(xen_adjust_exception_frame)
139 mov 8+0(%rsp),%rcx
140 mov 8+8(%rsp),%r11
141 ret $16
142
143hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
144/*
145 Xen64 iret frame:
146
147 ss
148 rsp
149 rflags
150 cs
151 rip <-- standard iret frame
152
153 flags
154
155 rcx }
156 r11 }<-- pushed by hypercall page
157rsp -> rax }
158 */
159ENTRY(xen_iret)
160 pushq $0
1611: jmp hypercall_iret
162ENDPATCH(xen_iret)
163RELOC(xen_iret, 1b+1)
164
165/*
166 sysexit is not used for 64-bit processes, so it's
167 only ever used to return to 32-bit compat userspace.
168 */
169ENTRY(xen_sysexit)
170 pushq $__USER32_DS
171 pushq %rcx
172 pushq $X86_EFLAGS_IF
173 pushq $__USER32_CS
174 pushq %rdx
175
176 pushq $VGCF_in_syscall
1771: jmp hypercall_iret
178ENDPATCH(xen_sysexit)
179RELOC(xen_sysexit, 1b+1)
180
181ENTRY(xen_sysret64)
182 /* We're already on the usermode stack at this point, but still
183 with the kernel gs, so we can easily switch back */
184 movq %rsp, %gs:pda_oldrsp
185 movq %gs:pda_kernelstack,%rsp
186
187 pushq $__USER_DS
188 pushq %gs:pda_oldrsp
189 pushq %r11
190 pushq $__USER_CS
191 pushq %rcx
192
193 pushq $VGCF_in_syscall
1941: jmp hypercall_iret
195ENDPATCH(xen_sysret64)
196RELOC(xen_sysret64, 1b+1)
197
198ENTRY(xen_sysret32)
199 /* We're already on the usermode stack at this point, but still
200 with the kernel gs, so we can easily switch back */
201 movq %rsp, %gs:pda_oldrsp
202 movq %gs:pda_kernelstack, %rsp
203
204 pushq $__USER32_DS
205 pushq %gs:pda_oldrsp
206 pushq %r11
207 pushq $__USER32_CS
208 pushq %rcx
209
210 pushq $VGCF_in_syscall
2111: jmp hypercall_iret
212ENDPATCH(xen_sysret32)
213RELOC(xen_sysret32, 1b+1)
214
215/*
216 Xen handles syscall callbacks much like ordinary exceptions,
217 which means we have:
218 - kernel gs
219 - kernel rsp
220 - an iret-like stack frame on the stack (including rcx and r11):
221 ss
222 rsp
223 rflags
224 cs
225 rip
226 r11
227 rsp-> rcx
228
229 In all the entrypoints, we undo all that to make it look
230 like a CPU-generated syscall/sysenter and jump to the normal
231 entrypoint.
232 */
233
234.macro undo_xen_syscall
235 mov 0*8(%rsp),%rcx
236 mov 1*8(%rsp),%r11
237 mov 5*8(%rsp),%rsp
238.endm
239
240/* Normal 64-bit system call target */
241ENTRY(xen_syscall_target)
242 undo_xen_syscall
243 jmp system_call_after_swapgs
244ENDPROC(xen_syscall_target)
245
246#ifdef CONFIG_IA32_EMULATION
247
248/* 32-bit compat syscall target */
249ENTRY(xen_syscall32_target)
250 undo_xen_syscall
251 jmp ia32_cstar_target
252ENDPROC(xen_syscall32_target)
253
254/* 32-bit compat sysenter target */
255ENTRY(xen_sysenter_target)
256 undo_xen_syscall
257 jmp ia32_sysenter_target
258ENDPROC(xen_sysenter_target)
259
260#else /* !CONFIG_IA32_EMULATION */
261
262ENTRY(xen_syscall32_target)
263ENTRY(xen_sysenter_target)
264 lea 16(%rsp), %rsp /* strip %rcx,%r11 */
265 mov $-ENOSYS, %rax
266 pushq $VGCF_in_syscall
267 jmp hypercall_iret
268ENDPROC(xen_syscall32_target)
269ENDPROC(xen_sysenter_target)
270
271#endif /* CONFIG_IA32_EMULATION */
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 7c0cf6320a0a..63d49a523ed3 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -5,15 +5,24 @@
5 5
6#include <linux/elfnote.h> 6#include <linux/elfnote.h>
7#include <linux/init.h> 7#include <linux/init.h>
8
8#include <asm/boot.h> 9#include <asm/boot.h>
10#include <asm/asm.h>
11#include <asm/page.h>
12
9#include <xen/interface/elfnote.h> 13#include <xen/interface/elfnote.h>
10#include <asm/xen/interface.h> 14#include <asm/xen/interface.h>
11 15
12 __INIT 16 __INIT
13ENTRY(startup_xen) 17ENTRY(startup_xen)
14 movl %esi,xen_start_info
15 cld 18 cld
16 movl $(init_thread_union+THREAD_SIZE),%esp 19#ifdef CONFIG_X86_32
20 mov %esi,xen_start_info
21 mov $init_thread_union+THREAD_SIZE,%esp
22#else
23 mov %rsi,xen_start_info
24 mov $init_thread_union+THREAD_SIZE,%rsp
25#endif
17 jmp xen_start_kernel 26 jmp xen_start_kernel
18 27
19 __FINIT 28 __FINIT
@@ -21,21 +30,26 @@ ENTRY(startup_xen)
21.pushsection .text 30.pushsection .text
22 .align PAGE_SIZE_asm 31 .align PAGE_SIZE_asm
23ENTRY(hypercall_page) 32ENTRY(hypercall_page)
24 .skip 0x1000 33 .skip PAGE_SIZE_asm
25.popsection 34.popsection
26 35
27 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") 36 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
28 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") 37 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
29 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") 38 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
30 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) 39#ifdef CONFIG_X86_32
31 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) 40 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
32 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) 41#else
42 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
43#endif
44 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
45 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
33 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") 46 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
34 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") 47 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
35 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") 48 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
36 ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, 49 ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
37 .quad _PAGE_PRESENT; .quad _PAGE_PRESENT) 50 .quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
38 ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) 51 ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
39 ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long __HYPERVISOR_VIRT_START) 52 ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
53 ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
40 54
41#endif /*CONFIG_XEN */ 55#endif /*CONFIG_XEN */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 6f4b1045c1c2..dd3c23152a2e 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -26,6 +26,7 @@ char * __init xen_memory_setup(void);
26void __init xen_arch_setup(void); 26void __init xen_arch_setup(void);
27void __init xen_init_IRQ(void); 27void __init xen_init_IRQ(void);
28void xen_enable_sysenter(void); 28void xen_enable_sysenter(void);
29void xen_enable_syscall(void);
29void xen_vcpu_restore(void); 30void xen_vcpu_restore(void);
30 31
31void __init xen_build_dynamic_phys_to_machine(void); 32void __init xen_build_dynamic_phys_to_machine(void);
@@ -37,7 +38,6 @@ void __init xen_time_init(void);
37unsigned long xen_get_wallclock(void); 38unsigned long xen_get_wallclock(void);
38int xen_set_wallclock(unsigned long time); 39int xen_set_wallclock(unsigned long time);
39unsigned long long xen_sched_clock(void); 40unsigned long long xen_sched_clock(void);
40void xen_timer_resume(void);
41 41
42irqreturn_t xen_debug_interrupt(int irq, void *dev_id); 42irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
43 43
@@ -45,20 +45,15 @@ bool xen_vcpu_stolen(int vcpu);
45 45
46void xen_mark_init_mm_pinned(void); 46void xen_mark_init_mm_pinned(void);
47 47
48void __init xen_fill_possible_map(void);
49
50void __init xen_setup_vcpu_info_placement(void); 48void __init xen_setup_vcpu_info_placement(void);
51void xen_smp_prepare_boot_cpu(void);
52void xen_smp_prepare_cpus(unsigned int max_cpus);
53int xen_cpu_up(unsigned int cpu);
54void xen_smp_cpus_done(unsigned int max_cpus);
55 49
56void xen_smp_send_stop(void); 50#ifdef CONFIG_SMP
57void xen_smp_send_reschedule(int cpu); 51void xen_smp_init(void);
58void xen_smp_send_call_function_ipi(cpumask_t mask);
59void xen_smp_send_call_function_single_ipi(int cpu);
60 52
61extern cpumask_t xen_cpu_initialized_map; 53extern cpumask_t xen_cpu_initialized_map;
54#else
55static inline void xen_smp_init(void) {}
56#endif
62 57
63 58
64/* Declare an asm function, along with symbols needed to make it 59/* Declare an asm function, along with symbols needed to make it
@@ -73,7 +68,11 @@ DECL_ASM(void, xen_irq_disable_direct, void);
73DECL_ASM(unsigned long, xen_save_fl_direct, void); 68DECL_ASM(unsigned long, xen_save_fl_direct, void);
74DECL_ASM(void, xen_restore_fl_direct, unsigned long); 69DECL_ASM(void, xen_restore_fl_direct, unsigned long);
75 70
71/* These are not functions, and cannot be called normally */
76void xen_iret(void); 72void xen_iret(void);
77void xen_sysexit(void); 73void xen_sysexit(void);
74void xen_sysret32(void);
75void xen_sysret64(void);
76void xen_adjust_exception_frame(void);
78 77
79#endif /* XEN_OPS_H */ 78#endif /* XEN_OPS_H */