author	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-17 14:10:11 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-17 14:10:11 -0400
commit	fb9fc395174138983a49f2da982ed14caabbe741 (patch)
tree	5d5d3643ee6853a899205613da272cc343fdc1a4 /arch/x86
parent	0eafaae84e21ac033815cc9f33c3ae889cd7ccfe (diff)
parent	ace2e92e193126711cb3a83a3752b2c5b8396950 (diff)
Merge branch 'xen-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen
* 'xen-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen:
  xfs: eagerly remove vmap mappings to avoid upsetting Xen
  xen: add some debug output for failed multicalls
  xen: fix incorrect vcpu_register_vcpu_info hypercall argument
  xen: ask the hypervisor how much space it needs reserved
  xen: lock pte pages while pinning/unpinning
  xen: deal with stale cr3 values when unpinning pagetables
  xen: add batch completion callbacks
  xen: yield to IPI target if necessary
  Clean up duplicate includes in arch/i386/xen/
  remove dead code in pgtable_cache_init
  paravirt: clean up lazy mode handling
  paravirt: refactor struct paravirt_ops into smaller pv_*_ops
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/kernel/alternative.c     |   4
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c  |  14
-rw-r--r--  arch/x86/kernel/entry_32.S        |   2
-rw-r--r--  arch/x86/kernel/paravirt_32.c     | 224
-rw-r--r--  arch/x86/kernel/vmi_32.c          | 201
-rw-r--r--  arch/x86/mm/init_32.c             |  22
-rw-r--r--  arch/x86/xen/enlighten.c          | 233
-rw-r--r--  arch/x86/xen/mmu.c                | 145
-rw-r--r--  arch/x86/xen/multicalls.c         |  52
-rw-r--r--  arch/x86/xen/multicalls.h         |   5
-rw-r--r--  arch/x86/xen/smp.c                |  14
-rw-r--r--  arch/x86/xen/time.c               |   6
-rw-r--r--  arch/x86/xen/xen-ops.h            |  10
13 files changed, 596 insertions(+), 336 deletions(-)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 11b03d3c6fda..42421437ded3 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -369,8 +369,8 @@ void apply_paravirt(struct paravirt_patch_site *start,
369 BUG_ON(p->len > MAX_PATCH_LEN); 369 BUG_ON(p->len > MAX_PATCH_LEN);
370 /* prep the buffer with the original instructions */ 370 /* prep the buffer with the original instructions */
371 memcpy(insnbuf, p->instr, p->len); 371 memcpy(insnbuf, p->instr, p->len);
372 used = paravirt_ops.patch(p->instrtype, p->clobbers, insnbuf, 372 used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
373 (unsigned long)p->instr, p->len); 373 (unsigned long)p->instr, p->len);
374 374
375 BUG_ON(used > p->len); 375 BUG_ON(used > p->len);
376 376
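For context on the hunk above: apply_paravirt() walks the .parainstructions section and lets pv_init_ops.patch() rewrite each recorded call site in place. A minimal sketch of that loop with a simplified site descriptor (the *_sketch names are illustrative, not the kernel's exact definitions):

struct patch_site_sketch {
	u8 *instr;      /* address of the patchable call site */
	u8  instrtype;  /* PARAVIRT_PATCH() index identifying the op */
	u8  len;        /* bytes available at the site */
	u16 clobbers;   /* registers the site is allowed to clobber */
};

static void apply_paravirt_sketch(struct patch_site_sketch *start,
				  struct patch_site_sketch *end)
{
	char insnbuf[MAX_PATCH_LEN];
	struct patch_site_sketch *p;

	for (p = start; p < end; p++) {
		unsigned used;

		/* start from the original instructions ... */
		memcpy(insnbuf, p->instr, p->len);
		/* ... and let the backend emit a replacement in place */
		used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
					 (unsigned long)p->instr, p->len);
		BUG_ON(used > p->len);
		/* the remaining bytes are padded with NOPs and the buffer is
		   copied back to p->instr (omitted here) */
	}
}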
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 8029742c0fc1..f1b7cdda82b3 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -116,12 +116,14 @@ void foo(void)
116 116
117#ifdef CONFIG_PARAVIRT 117#ifdef CONFIG_PARAVIRT
118 BLANK(); 118 BLANK();
119 OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled); 119 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
120 OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable); 120 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
121 OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable); 121 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
122 OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit); 122 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
123 OFFSET(PARAVIRT_iret, paravirt_ops, iret); 123 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
124 OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); 124 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
125 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
126 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
125#endif 127#endif
126 128
127#ifdef CONFIG_XEN 129#ifdef CONFIG_XEN
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 8099fea0a72f..dc7f938e5015 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -437,7 +437,7 @@ ldt_ss:
437 * is still available to implement the setting of the high 437 * is still available to implement the setting of the high
438 * 16-bits in the INTERRUPT_RETURN paravirt-op. 438 * 16-bits in the INTERRUPT_RETURN paravirt-op.
439 */ 439 */
440 cmpl $0, paravirt_ops+PARAVIRT_enabled 440 cmpl $0, pv_info+PARAVIRT_enabled
441 jne restore_nocheck 441 jne restore_nocheck
442#endif 442#endif
443 443
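The asm-offsets_32.c and entry_32.S hunks go together: the OFFSET() entries become assembler-visible constants at build time, which is how the assembly can address fields of pv_info and pv_cpu_ops symbolically. Roughly (the exact DEFINE plumbing lives in the kbuild helpers; this is a sketch):

#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
#define OFFSET(sym, str, mem) \
	DEFINE(sym, offsetof(struct str, mem))

/* OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled) ends up as a
 * #define in asm-offsets.h, so entry_32.S can test the flag directly:
 *
 *	cmpl $0, pv_info+PARAVIRT_enabled
 *	jne restore_nocheck
 */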
diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt_32.c
index 739cfb207dd7..6a80d67c2121 100644
--- a/arch/x86/kernel/paravirt_32.c
+++ b/arch/x86/kernel/paravirt_32.c
@@ -42,32 +42,33 @@ void _paravirt_nop(void)
42static void __init default_banner(void) 42static void __init default_banner(void)
43{ 43{
44 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 44 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
45 paravirt_ops.name); 45 pv_info.name);
46} 46}
47 47
48char *memory_setup(void) 48char *memory_setup(void)
49{ 49{
50 return paravirt_ops.memory_setup(); 50 return pv_init_ops.memory_setup();
51} 51}
52 52
53/* Simple instruction patching code. */ 53/* Simple instruction patching code. */
54#define DEF_NATIVE(name, code) \ 54#define DEF_NATIVE(ops, name, code) \
55 extern const char start_##name[], end_##name[]; \ 55 extern const char start_##ops##_##name[], end_##ops##_##name[]; \
56 asm("start_" #name ": " code "; end_" #name ":") 56 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
57 57
58DEF_NATIVE(irq_disable, "cli"); 58DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
59DEF_NATIVE(irq_enable, "sti"); 59DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
60DEF_NATIVE(restore_fl, "push %eax; popf"); 60DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
61DEF_NATIVE(save_fl, "pushf; pop %eax"); 61DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
62DEF_NATIVE(iret, "iret"); 62DEF_NATIVE(pv_cpu_ops, iret, "iret");
63DEF_NATIVE(irq_enable_sysexit, "sti; sysexit"); 63DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
64DEF_NATIVE(read_cr2, "mov %cr2, %eax"); 64DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
65DEF_NATIVE(write_cr3, "mov %eax, %cr3"); 65DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
66DEF_NATIVE(read_cr3, "mov %cr3, %eax"); 66DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
67DEF_NATIVE(clts, "clts"); 67DEF_NATIVE(pv_cpu_ops, clts, "clts");
68DEF_NATIVE(read_tsc, "rdtsc"); 68DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
69 69
70DEF_NATIVE(ud2a, "ud2a"); 70/* Undefined instruction for dealing with missing ops pointers. */
71static const unsigned char ud2a[] = { 0x0f, 0x0b };
71 72
72static unsigned native_patch(u8 type, u16 clobbers, void *ibuf, 73static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
73 unsigned long addr, unsigned len) 74 unsigned long addr, unsigned len)
@@ -76,37 +77,29 @@ static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
76 unsigned ret; 77 unsigned ret;
77 78
78 switch(type) { 79 switch(type) {
79#define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site 80#define SITE(ops, x) \
80 SITE(irq_disable); 81 case PARAVIRT_PATCH(ops.x): \
81 SITE(irq_enable); 82 start = start_##ops##_##x; \
82 SITE(restore_fl); 83 end = end_##ops##_##x; \
83 SITE(save_fl); 84 goto patch_site
84 SITE(iret); 85
85 SITE(irq_enable_sysexit); 86 SITE(pv_irq_ops, irq_disable);
86 SITE(read_cr2); 87 SITE(pv_irq_ops, irq_enable);
87 SITE(read_cr3); 88 SITE(pv_irq_ops, restore_fl);
88 SITE(write_cr3); 89 SITE(pv_irq_ops, save_fl);
89 SITE(clts); 90 SITE(pv_cpu_ops, iret);
90 SITE(read_tsc); 91 SITE(pv_cpu_ops, irq_enable_sysexit);
92 SITE(pv_mmu_ops, read_cr2);
93 SITE(pv_mmu_ops, read_cr3);
94 SITE(pv_mmu_ops, write_cr3);
95 SITE(pv_cpu_ops, clts);
96 SITE(pv_cpu_ops, read_tsc);
91#undef SITE 97#undef SITE
92 98
93 patch_site: 99 patch_site:
94 ret = paravirt_patch_insns(ibuf, len, start, end); 100 ret = paravirt_patch_insns(ibuf, len, start, end);
95 break; 101 break;
96 102
97 case PARAVIRT_PATCH(make_pgd):
98 case PARAVIRT_PATCH(make_pte):
99 case PARAVIRT_PATCH(pgd_val):
100 case PARAVIRT_PATCH(pte_val):
101#ifdef CONFIG_X86_PAE
102 case PARAVIRT_PATCH(make_pmd):
103 case PARAVIRT_PATCH(pmd_val):
104#endif
105 /* These functions end up returning exactly what
106 they're passed, in the same registers. */
107 ret = paravirt_patch_nop();
108 break;
109
110 default: 103 default:
111 ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); 104 ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
112 break; 105 break;
@@ -150,7 +143,7 @@ unsigned paravirt_patch_call(void *insnbuf,
150 return 5; 143 return 5;
151} 144}
152 145
153unsigned paravirt_patch_jmp(const void *target, void *insnbuf, 146unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
154 unsigned long addr, unsigned len) 147 unsigned long addr, unsigned len)
155{ 148{
156 struct branch *b = insnbuf; 149 struct branch *b = insnbuf;
@@ -165,22 +158,37 @@ unsigned paravirt_patch_jmp(const void *target, void *insnbuf,
165 return 5; 158 return 5;
166} 159}
167 160
161/* Neat trick to map patch type back to the call within the
162 * corresponding structure. */
163static void *get_call_destination(u8 type)
164{
165 struct paravirt_patch_template tmpl = {
166 .pv_init_ops = pv_init_ops,
167 .pv_time_ops = pv_time_ops,
168 .pv_cpu_ops = pv_cpu_ops,
169 .pv_irq_ops = pv_irq_ops,
170 .pv_apic_ops = pv_apic_ops,
171 .pv_mmu_ops = pv_mmu_ops,
172 };
173 return *((void **)&tmpl + type);
174}
175
168unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, 176unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
169 unsigned long addr, unsigned len) 177 unsigned long addr, unsigned len)
170{ 178{
171 void *opfunc = *((void **)&paravirt_ops + type); 179 void *opfunc = get_call_destination(type);
172 unsigned ret; 180 unsigned ret;
173 181
174 if (opfunc == NULL) 182 if (opfunc == NULL)
175 /* If there's no function, patch it with a ud2a (BUG) */ 183 /* If there's no function, patch it with a ud2a (BUG) */
176 ret = paravirt_patch_insns(insnbuf, len, start_ud2a, end_ud2a); 184 ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
177 else if (opfunc == paravirt_nop) 185 else if (opfunc == paravirt_nop)
178 /* If the operation is a nop, then nop the callsite */ 186 /* If the operation is a nop, then nop the callsite */
179 ret = paravirt_patch_nop(); 187 ret = paravirt_patch_nop();
180 else if (type == PARAVIRT_PATCH(iret) || 188 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
181 type == PARAVIRT_PATCH(irq_enable_sysexit)) 189 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit))
182 /* If operation requires a jmp, then jmp */ 190 /* If operation requires a jmp, then jmp */
183 ret = paravirt_patch_jmp(opfunc, insnbuf, addr, len); 191 ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
184 else 192 else
185 /* Otherwise call the function; assume target could 193 /* Otherwise call the function; assume target could
186 clobber any caller-save reg */ 194 clobber any caller-save reg */
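The get_call_destination() helper above relies on the patch "type" being a pointer-sized slot index into a template structure that aggregates all of the pv_*_ops structs in a fixed order. A sketch of the matching PARAVIRT_PATCH() side (assumed form; the real macro is defined in the paravirt headers):

/* turn a member of the combined template into a pointer-sized slot index */
#define PARAVIRT_PATCH_SKETCH(x) \
	(offsetof(struct paravirt_patch_template, x) / sizeof(void *))

/*
 * Example: PARAVIRT_PATCH(pv_cpu_ops.iret) yields the slot number of the
 * iret pointer, and get_call_destination() of that same number walks an
 * identical layout to return whatever function is currently installed
 * there (native_iret, xen_iret_direct, ...).
 */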
@@ -205,7 +213,7 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
205 213
206void init_IRQ(void) 214void init_IRQ(void)
207{ 215{
208 paravirt_ops.init_IRQ(); 216 pv_irq_ops.init_IRQ();
209} 217}
210 218
211static void native_flush_tlb(void) 219static void native_flush_tlb(void)
@@ -233,7 +241,7 @@ extern void native_irq_enable_sysexit(void);
233 241
234static int __init print_banner(void) 242static int __init print_banner(void)
235{ 243{
236 paravirt_ops.banner(); 244 pv_init_ops.banner();
237 return 0; 245 return 0;
238} 246}
239core_initcall(print_banner); 247core_initcall(print_banner);
@@ -273,47 +281,96 @@ int paravirt_disable_iospace(void)
273 return ret; 281 return ret;
274} 282}
275 283
276struct paravirt_ops paravirt_ops = { 284static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
285
286static inline void enter_lazy(enum paravirt_lazy_mode mode)
287{
288 BUG_ON(x86_read_percpu(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
289 BUG_ON(preemptible());
290
291 x86_write_percpu(paravirt_lazy_mode, mode);
292}
293
294void paravirt_leave_lazy(enum paravirt_lazy_mode mode)
295{
296 BUG_ON(x86_read_percpu(paravirt_lazy_mode) != mode);
297 BUG_ON(preemptible());
298
299 x86_write_percpu(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
300}
301
302void paravirt_enter_lazy_mmu(void)
303{
304 enter_lazy(PARAVIRT_LAZY_MMU);
305}
306
307void paravirt_leave_lazy_mmu(void)
308{
309 paravirt_leave_lazy(PARAVIRT_LAZY_MMU);
310}
311
312void paravirt_enter_lazy_cpu(void)
313{
314 enter_lazy(PARAVIRT_LAZY_CPU);
315}
316
317void paravirt_leave_lazy_cpu(void)
318{
319 paravirt_leave_lazy(PARAVIRT_LAZY_CPU);
320}
321
322enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
323{
324 return x86_read_percpu(paravirt_lazy_mode);
325}
326
327struct pv_info pv_info = {
277 .name = "bare hardware", 328 .name = "bare hardware",
278 .paravirt_enabled = 0, 329 .paravirt_enabled = 0,
279 .kernel_rpl = 0, 330 .kernel_rpl = 0,
280 .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ 331 .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
332};
281 333
282 .patch = native_patch, 334struct pv_init_ops pv_init_ops = {
335 .patch = native_patch,
283 .banner = default_banner, 336 .banner = default_banner,
284 .arch_setup = paravirt_nop, 337 .arch_setup = paravirt_nop,
285 .memory_setup = machine_specific_memory_setup, 338 .memory_setup = machine_specific_memory_setup,
339};
340
341struct pv_time_ops pv_time_ops = {
342 .time_init = hpet_time_init,
286 .get_wallclock = native_get_wallclock, 343 .get_wallclock = native_get_wallclock,
287 .set_wallclock = native_set_wallclock, 344 .set_wallclock = native_set_wallclock,
288 .time_init = hpet_time_init, 345 .sched_clock = native_sched_clock,
346 .get_cpu_khz = native_calculate_cpu_khz,
347};
348
349struct pv_irq_ops pv_irq_ops = {
289 .init_IRQ = native_init_IRQ, 350 .init_IRQ = native_init_IRQ,
351 .save_fl = native_save_fl,
352 .restore_fl = native_restore_fl,
353 .irq_disable = native_irq_disable,
354 .irq_enable = native_irq_enable,
355 .safe_halt = native_safe_halt,
356 .halt = native_halt,
357};
290 358
359struct pv_cpu_ops pv_cpu_ops = {
291 .cpuid = native_cpuid, 360 .cpuid = native_cpuid,
292 .get_debugreg = native_get_debugreg, 361 .get_debugreg = native_get_debugreg,
293 .set_debugreg = native_set_debugreg, 362 .set_debugreg = native_set_debugreg,
294 .clts = native_clts, 363 .clts = native_clts,
295 .read_cr0 = native_read_cr0, 364 .read_cr0 = native_read_cr0,
296 .write_cr0 = native_write_cr0, 365 .write_cr0 = native_write_cr0,
297 .read_cr2 = native_read_cr2,
298 .write_cr2 = native_write_cr2,
299 .read_cr3 = native_read_cr3,
300 .write_cr3 = native_write_cr3,
301 .read_cr4 = native_read_cr4, 366 .read_cr4 = native_read_cr4,
302 .read_cr4_safe = native_read_cr4_safe, 367 .read_cr4_safe = native_read_cr4_safe,
303 .write_cr4 = native_write_cr4, 368 .write_cr4 = native_write_cr4,
304 .save_fl = native_save_fl,
305 .restore_fl = native_restore_fl,
306 .irq_disable = native_irq_disable,
307 .irq_enable = native_irq_enable,
308 .safe_halt = native_safe_halt,
309 .halt = native_halt,
310 .wbinvd = native_wbinvd, 369 .wbinvd = native_wbinvd,
311 .read_msr = native_read_msr_safe, 370 .read_msr = native_read_msr_safe,
312 .write_msr = native_write_msr_safe, 371 .write_msr = native_write_msr_safe,
313 .read_tsc = native_read_tsc, 372 .read_tsc = native_read_tsc,
314 .read_pmc = native_read_pmc, 373 .read_pmc = native_read_pmc,
315 .sched_clock = native_sched_clock,
316 .get_cpu_khz = native_calculate_cpu_khz,
317 .load_tr_desc = native_load_tr_desc, 374 .load_tr_desc = native_load_tr_desc,
318 .set_ldt = native_set_ldt, 375 .set_ldt = native_set_ldt,
319 .load_gdt = native_load_gdt, 376 .load_gdt = native_load_gdt,
@@ -327,9 +384,19 @@ struct paravirt_ops paravirt_ops = {
327 .write_idt_entry = write_dt_entry, 384 .write_idt_entry = write_dt_entry,
328 .load_esp0 = native_load_esp0, 385 .load_esp0 = native_load_esp0,
329 386
387 .irq_enable_sysexit = native_irq_enable_sysexit,
388 .iret = native_iret,
389
330 .set_iopl_mask = native_set_iopl_mask, 390 .set_iopl_mask = native_set_iopl_mask,
331 .io_delay = native_io_delay, 391 .io_delay = native_io_delay,
332 392
393 .lazy_mode = {
394 .enter = paravirt_nop,
395 .leave = paravirt_nop,
396 },
397};
398
399struct pv_apic_ops pv_apic_ops = {
333#ifdef CONFIG_X86_LOCAL_APIC 400#ifdef CONFIG_X86_LOCAL_APIC
334 .apic_write = native_apic_write, 401 .apic_write = native_apic_write,
335 .apic_write_atomic = native_apic_write_atomic, 402 .apic_write_atomic = native_apic_write_atomic,
@@ -338,11 +405,17 @@ struct paravirt_ops paravirt_ops = {
338 .setup_secondary_clock = setup_secondary_APIC_clock, 405 .setup_secondary_clock = setup_secondary_APIC_clock,
339 .startup_ipi_hook = paravirt_nop, 406 .startup_ipi_hook = paravirt_nop,
340#endif 407#endif
341 .set_lazy_mode = paravirt_nop, 408};
342 409
410struct pv_mmu_ops pv_mmu_ops = {
343 .pagetable_setup_start = native_pagetable_setup_start, 411 .pagetable_setup_start = native_pagetable_setup_start,
344 .pagetable_setup_done = native_pagetable_setup_done, 412 .pagetable_setup_done = native_pagetable_setup_done,
345 413
414 .read_cr2 = native_read_cr2,
415 .write_cr2 = native_write_cr2,
416 .read_cr3 = native_read_cr3,
417 .write_cr3 = native_write_cr3,
418
346 .flush_tlb_user = native_flush_tlb, 419 .flush_tlb_user = native_flush_tlb,
347 .flush_tlb_kernel = native_flush_tlb_global, 420 .flush_tlb_kernel = native_flush_tlb_global,
348 .flush_tlb_single = native_flush_tlb_single, 421 .flush_tlb_single = native_flush_tlb_single,
@@ -381,12 +454,19 @@ struct paravirt_ops paravirt_ops = {
381 .make_pte = native_make_pte, 454 .make_pte = native_make_pte,
382 .make_pgd = native_make_pgd, 455 .make_pgd = native_make_pgd,
383 456
384 .irq_enable_sysexit = native_irq_enable_sysexit,
385 .iret = native_iret,
386
387 .dup_mmap = paravirt_nop, 457 .dup_mmap = paravirt_nop,
388 .exit_mmap = paravirt_nop, 458 .exit_mmap = paravirt_nop,
389 .activate_mm = paravirt_nop, 459 .activate_mm = paravirt_nop,
460
461 .lazy_mode = {
462 .enter = paravirt_nop,
463 .leave = paravirt_nop,
464 },
390}; 465};
391 466
392EXPORT_SYMBOL(paravirt_ops); 467EXPORT_SYMBOL_GPL(pv_time_ops);
468EXPORT_SYMBOL_GPL(pv_cpu_ops);
469EXPORT_SYMBOL_GPL(pv_mmu_ops);
470EXPORT_SYMBOL_GPL(pv_apic_ops);
471EXPORT_SYMBOL_GPL(pv_info);
472EXPORT_SYMBOL (pv_irq_ops);
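With the per-cpu lazy-mode bookkeeping now shared in paravirt_32.c, a backend only supplies enter/leave hooks through the new .lazy_mode members instead of one set_lazy_mode() callback. A sketch of how a hypothetical backend would wire itself up (the example_* names are illustrative):

static void example_enter_lazy_mmu(void)
{
	paravirt_enter_lazy_mmu();	/* generic: per-cpu state -> LAZY_MMU */
	/* backend-specific: start batching pagetable updates */
}

static void example_leave_lazy_mmu(void)
{
	paravirt_leave_lazy(paravirt_get_lazy_mode());	/* back to LAZY_NONE */
	/* backend-specific: flush whatever is still queued */
}

/* installed through the split ops structure:
 *	pv_mmu_ops.lazy_mode.enter = example_enter_lazy_mmu;
 *	pv_mmu_ops.lazy_mode.leave = example_leave_lazy_mmu;
 */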
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 18673e0f193b..f02bad68abaa 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -134,21 +134,21 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
134 unsigned long eip, unsigned len) 134 unsigned long eip, unsigned len)
135{ 135{
136 switch (type) { 136 switch (type) {
137 case PARAVIRT_PATCH(irq_disable): 137 case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
138 return patch_internal(VMI_CALL_DisableInterrupts, len, 138 return patch_internal(VMI_CALL_DisableInterrupts, len,
139 insns, eip); 139 insns, eip);
140 case PARAVIRT_PATCH(irq_enable): 140 case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
141 return patch_internal(VMI_CALL_EnableInterrupts, len, 141 return patch_internal(VMI_CALL_EnableInterrupts, len,
142 insns, eip); 142 insns, eip);
143 case PARAVIRT_PATCH(restore_fl): 143 case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
144 return patch_internal(VMI_CALL_SetInterruptMask, len, 144 return patch_internal(VMI_CALL_SetInterruptMask, len,
145 insns, eip); 145 insns, eip);
146 case PARAVIRT_PATCH(save_fl): 146 case PARAVIRT_PATCH(pv_irq_ops.save_fl):
147 return patch_internal(VMI_CALL_GetInterruptMask, len, 147 return patch_internal(VMI_CALL_GetInterruptMask, len,
148 insns, eip); 148 insns, eip);
149 case PARAVIRT_PATCH(iret): 149 case PARAVIRT_PATCH(pv_cpu_ops.iret):
150 return patch_internal(VMI_CALL_IRET, len, insns, eip); 150 return patch_internal(VMI_CALL_IRET, len, insns, eip);
151 case PARAVIRT_PATCH(irq_enable_sysexit): 151 case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
152 return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip); 152 return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip);
153 default: 153 default:
154 break; 154 break;
@@ -552,24 +552,22 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
552} 552}
553#endif 553#endif
554 554
555static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode) 555static void vmi_enter_lazy_cpu(void)
556{ 556{
557 static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode); 557 paravirt_enter_lazy_cpu();
558 558 vmi_ops.set_lazy_mode(2);
559 if (!vmi_ops.set_lazy_mode) 559}
560 return;
561 560
562 /* Modes should never nest or overlap */ 561static void vmi_enter_lazy_mmu(void)
563 BUG_ON(__get_cpu_var(lazy_mode) && !(mode == PARAVIRT_LAZY_NONE || 562{
564 mode == PARAVIRT_LAZY_FLUSH)); 563 paravirt_enter_lazy_mmu();
564 vmi_ops.set_lazy_mode(1);
565}
565 566
566 if (mode == PARAVIRT_LAZY_FLUSH) { 567static void vmi_leave_lazy(void)
567 vmi_ops.set_lazy_mode(0); 568{
568 vmi_ops.set_lazy_mode(__get_cpu_var(lazy_mode)); 569 paravirt_leave_lazy(paravirt_get_lazy_mode());
569 } else { 570 vmi_ops.set_lazy_mode(0);
570 vmi_ops.set_lazy_mode(mode);
571 __get_cpu_var(lazy_mode) = mode;
572 }
573} 571}
574 572
575static inline int __init check_vmi_rom(struct vrom_header *rom) 573static inline int __init check_vmi_rom(struct vrom_header *rom)
@@ -690,9 +688,9 @@ do { \
690 reloc = call_vrom_long_func(vmi_rom, get_reloc, \ 688 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
691 VMI_CALL_##vmicall); \ 689 VMI_CALL_##vmicall); \
692 if (rel->type == VMI_RELOCATION_CALL_REL) \ 690 if (rel->type == VMI_RELOCATION_CALL_REL) \
693 paravirt_ops.opname = (void *)rel->eip; \ 691 opname = (void *)rel->eip; \
694 else if (rel->type == VMI_RELOCATION_NOP) \ 692 else if (rel->type == VMI_RELOCATION_NOP) \
695 paravirt_ops.opname = (void *)vmi_nop; \ 693 opname = (void *)vmi_nop; \
696 else if (rel->type != VMI_RELOCATION_NONE) \ 694 else if (rel->type != VMI_RELOCATION_NONE) \
697 printk(KERN_WARNING "VMI: Unknown relocation " \ 695 printk(KERN_WARNING "VMI: Unknown relocation " \
698 "type %d for " #vmicall"\n",\ 696 "type %d for " #vmicall"\n",\
@@ -712,7 +710,7 @@ do { \
712 VMI_CALL_##vmicall); \ 710 VMI_CALL_##vmicall); \
713 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \ 711 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \
714 if (rel->type == VMI_RELOCATION_CALL_REL) { \ 712 if (rel->type == VMI_RELOCATION_CALL_REL) { \
715 paravirt_ops.opname = wrapper; \ 713 opname = wrapper; \
716 vmi_ops.cache = (void *)rel->eip; \ 714 vmi_ops.cache = (void *)rel->eip; \
717 } \ 715 } \
718} while (0) 716} while (0)
@@ -732,11 +730,11 @@ static inline int __init activate_vmi(void)
732 } 730 }
733 savesegment(cs, kernel_cs); 731 savesegment(cs, kernel_cs);
734 732
735 paravirt_ops.paravirt_enabled = 1; 733 pv_info.paravirt_enabled = 1;
736 paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; 734 pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
735 pv_info.name = "vmi";
737 736
738 paravirt_ops.patch = vmi_patch; 737 pv_init_ops.patch = vmi_patch;
739 paravirt_ops.name = "vmi";
740 738
741 /* 739 /*
742 * Many of these operations are ABI compatible with VMI. 740 * Many of these operations are ABI compatible with VMI.
@@ -754,26 +752,26 @@ static inline int __init activate_vmi(void)
754 */ 752 */
755 753
756 /* CPUID is special, so very special it gets wrapped like a present */ 754 /* CPUID is special, so very special it gets wrapped like a present */
757 para_wrap(cpuid, vmi_cpuid, cpuid, CPUID); 755 para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID);
758 756
759 para_fill(clts, CLTS); 757 para_fill(pv_cpu_ops.clts, CLTS);
760 para_fill(get_debugreg, GetDR); 758 para_fill(pv_cpu_ops.get_debugreg, GetDR);
761 para_fill(set_debugreg, SetDR); 759 para_fill(pv_cpu_ops.set_debugreg, SetDR);
762 para_fill(read_cr0, GetCR0); 760 para_fill(pv_cpu_ops.read_cr0, GetCR0);
763 para_fill(read_cr2, GetCR2); 761 para_fill(pv_mmu_ops.read_cr2, GetCR2);
764 para_fill(read_cr3, GetCR3); 762 para_fill(pv_mmu_ops.read_cr3, GetCR3);
765 para_fill(read_cr4, GetCR4); 763 para_fill(pv_cpu_ops.read_cr4, GetCR4);
766 para_fill(write_cr0, SetCR0); 764 para_fill(pv_cpu_ops.write_cr0, SetCR0);
767 para_fill(write_cr2, SetCR2); 765 para_fill(pv_mmu_ops.write_cr2, SetCR2);
768 para_fill(write_cr3, SetCR3); 766 para_fill(pv_mmu_ops.write_cr3, SetCR3);
769 para_fill(write_cr4, SetCR4); 767 para_fill(pv_cpu_ops.write_cr4, SetCR4);
770 para_fill(save_fl, GetInterruptMask); 768 para_fill(pv_irq_ops.save_fl, GetInterruptMask);
771 para_fill(restore_fl, SetInterruptMask); 769 para_fill(pv_irq_ops.restore_fl, SetInterruptMask);
772 para_fill(irq_disable, DisableInterrupts); 770 para_fill(pv_irq_ops.irq_disable, DisableInterrupts);
773 para_fill(irq_enable, EnableInterrupts); 771 para_fill(pv_irq_ops.irq_enable, EnableInterrupts);
774 772
775 para_fill(wbinvd, WBINVD); 773 para_fill(pv_cpu_ops.wbinvd, WBINVD);
776 para_fill(read_tsc, RDTSC); 774 para_fill(pv_cpu_ops.read_tsc, RDTSC);
777 775
778 /* The following we emulate with trap and emulate for now */ 776 /* The following we emulate with trap and emulate for now */
779 /* paravirt_ops.read_msr = vmi_rdmsr */ 777 /* paravirt_ops.read_msr = vmi_rdmsr */
@@ -781,29 +779,38 @@ static inline int __init activate_vmi(void)
781 /* paravirt_ops.rdpmc = vmi_rdpmc */ 779 /* paravirt_ops.rdpmc = vmi_rdpmc */
782 780
783 /* TR interface doesn't pass TR value, wrap */ 781 /* TR interface doesn't pass TR value, wrap */
784 para_wrap(load_tr_desc, vmi_set_tr, set_tr, SetTR); 782 para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR);
785 783
786 /* LDT is special, too */ 784 /* LDT is special, too */
787 para_wrap(set_ldt, vmi_set_ldt, _set_ldt, SetLDT); 785 para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
788 786
789 para_fill(load_gdt, SetGDT); 787 para_fill(pv_cpu_ops.load_gdt, SetGDT);
790 para_fill(load_idt, SetIDT); 788 para_fill(pv_cpu_ops.load_idt, SetIDT);
791 para_fill(store_gdt, GetGDT); 789 para_fill(pv_cpu_ops.store_gdt, GetGDT);
792 para_fill(store_idt, GetIDT); 790 para_fill(pv_cpu_ops.store_idt, GetIDT);
793 para_fill(store_tr, GetTR); 791 para_fill(pv_cpu_ops.store_tr, GetTR);
794 paravirt_ops.load_tls = vmi_load_tls; 792 pv_cpu_ops.load_tls = vmi_load_tls;
795 para_fill(write_ldt_entry, WriteLDTEntry); 793 para_fill(pv_cpu_ops.write_ldt_entry, WriteLDTEntry);
796 para_fill(write_gdt_entry, WriteGDTEntry); 794 para_fill(pv_cpu_ops.write_gdt_entry, WriteGDTEntry);
797 para_fill(write_idt_entry, WriteIDTEntry); 795 para_fill(pv_cpu_ops.write_idt_entry, WriteIDTEntry);
798 para_wrap(load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack); 796 para_wrap(pv_cpu_ops.load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack);
799 para_fill(set_iopl_mask, SetIOPLMask); 797 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
800 para_fill(io_delay, IODelay); 798 para_fill(pv_cpu_ops.io_delay, IODelay);
801 para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode); 799
800 para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu,
801 set_lazy_mode, SetLazyMode);
802 para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy,
803 set_lazy_mode, SetLazyMode);
804
805 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
806 set_lazy_mode, SetLazyMode);
807 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy,
808 set_lazy_mode, SetLazyMode);
802 809
803 /* user and kernel flush are just handled with different flags to FlushTLB */ 810 /* user and kernel flush are just handled with different flags to FlushTLB */
804 para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB); 811 para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
805 para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB); 812 para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
806 para_fill(flush_tlb_single, InvalPage); 813 para_fill(pv_mmu_ops.flush_tlb_single, InvalPage);
807 814
808 /* 815 /*
809 * Until a standard flag format can be agreed on, we need to 816 * Until a standard flag format can be agreed on, we need to
@@ -819,41 +826,41 @@ static inline int __init activate_vmi(void)
819#endif 826#endif
820 827
821 if (vmi_ops.set_pte) { 828 if (vmi_ops.set_pte) {
822 paravirt_ops.set_pte = vmi_set_pte; 829 pv_mmu_ops.set_pte = vmi_set_pte;
823 paravirt_ops.set_pte_at = vmi_set_pte_at; 830 pv_mmu_ops.set_pte_at = vmi_set_pte_at;
824 paravirt_ops.set_pmd = vmi_set_pmd; 831 pv_mmu_ops.set_pmd = vmi_set_pmd;
825#ifdef CONFIG_X86_PAE 832#ifdef CONFIG_X86_PAE
826 paravirt_ops.set_pte_atomic = vmi_set_pte_atomic; 833 pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
827 paravirt_ops.set_pte_present = vmi_set_pte_present; 834 pv_mmu_ops.set_pte_present = vmi_set_pte_present;
828 paravirt_ops.set_pud = vmi_set_pud; 835 pv_mmu_ops.set_pud = vmi_set_pud;
829 paravirt_ops.pte_clear = vmi_pte_clear; 836 pv_mmu_ops.pte_clear = vmi_pte_clear;
830 paravirt_ops.pmd_clear = vmi_pmd_clear; 837 pv_mmu_ops.pmd_clear = vmi_pmd_clear;
831#endif 838#endif
832 } 839 }
833 840
834 if (vmi_ops.update_pte) { 841 if (vmi_ops.update_pte) {
835 paravirt_ops.pte_update = vmi_update_pte; 842 pv_mmu_ops.pte_update = vmi_update_pte;
836 paravirt_ops.pte_update_defer = vmi_update_pte_defer; 843 pv_mmu_ops.pte_update_defer = vmi_update_pte_defer;
837 } 844 }
838 845
839 vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); 846 vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
840 if (vmi_ops.allocate_page) { 847 if (vmi_ops.allocate_page) {
841 paravirt_ops.alloc_pt = vmi_allocate_pt; 848 pv_mmu_ops.alloc_pt = vmi_allocate_pt;
842 paravirt_ops.alloc_pd = vmi_allocate_pd; 849 pv_mmu_ops.alloc_pd = vmi_allocate_pd;
843 paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone; 850 pv_mmu_ops.alloc_pd_clone = vmi_allocate_pd_clone;
844 } 851 }
845 852
846 vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); 853 vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
847 if (vmi_ops.release_page) { 854 if (vmi_ops.release_page) {
848 paravirt_ops.release_pt = vmi_release_pt; 855 pv_mmu_ops.release_pt = vmi_release_pt;
849 paravirt_ops.release_pd = vmi_release_pd; 856 pv_mmu_ops.release_pd = vmi_release_pd;
850 } 857 }
851 858
852 /* Set linear is needed in all cases */ 859 /* Set linear is needed in all cases */
853 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); 860 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
854#ifdef CONFIG_HIGHPTE 861#ifdef CONFIG_HIGHPTE
855 if (vmi_ops.set_linear_mapping) 862 if (vmi_ops.set_linear_mapping)
856 paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; 863 pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
857#endif 864#endif
858 865
859 /* 866 /*
@@ -863,17 +870,17 @@ static inline int __init activate_vmi(void)
863 * the backend. They are performance critical anyway, so requiring 870 * the backend. They are performance critical anyway, so requiring
864 * a patch is not a big problem. 871 * a patch is not a big problem.
865 */ 872 */
866 paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0; 873 pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
867 paravirt_ops.iret = (void *)0xbadbab0; 874 pv_cpu_ops.iret = (void *)0xbadbab0;
868 875
869#ifdef CONFIG_SMP 876#ifdef CONFIG_SMP
870 para_wrap(startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState); 877 para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
871#endif 878#endif
872 879
873#ifdef CONFIG_X86_LOCAL_APIC 880#ifdef CONFIG_X86_LOCAL_APIC
874 para_fill(apic_read, APICRead); 881 para_fill(pv_apic_ops.apic_read, APICRead);
875 para_fill(apic_write, APICWrite); 882 para_fill(pv_apic_ops.apic_write, APICWrite);
876 para_fill(apic_write_atomic, APICWrite); 883 para_fill(pv_apic_ops.apic_write_atomic, APICWrite);
877#endif 884#endif
878 885
879 /* 886 /*
@@ -891,15 +898,15 @@ static inline int __init activate_vmi(void)
891 vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); 898 vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
892 vmi_timer_ops.cancel_alarm = 899 vmi_timer_ops.cancel_alarm =
893 vmi_get_function(VMI_CALL_CancelAlarm); 900 vmi_get_function(VMI_CALL_CancelAlarm);
894 paravirt_ops.time_init = vmi_time_init; 901 pv_time_ops.time_init = vmi_time_init;
895 paravirt_ops.get_wallclock = vmi_get_wallclock; 902 pv_time_ops.get_wallclock = vmi_get_wallclock;
896 paravirt_ops.set_wallclock = vmi_set_wallclock; 903 pv_time_ops.set_wallclock = vmi_set_wallclock;
897#ifdef CONFIG_X86_LOCAL_APIC 904#ifdef CONFIG_X86_LOCAL_APIC
898 paravirt_ops.setup_boot_clock = vmi_time_bsp_init; 905 pv_apic_ops.setup_boot_clock = vmi_time_bsp_init;
899 paravirt_ops.setup_secondary_clock = vmi_time_ap_init; 906 pv_apic_ops.setup_secondary_clock = vmi_time_ap_init;
900#endif 907#endif
901 paravirt_ops.sched_clock = vmi_sched_clock; 908 pv_time_ops.sched_clock = vmi_sched_clock;
902 paravirt_ops.get_cpu_khz = vmi_cpu_khz; 909 pv_time_ops.get_cpu_khz = vmi_cpu_khz;
903 910
904 /* We have true wallclock functions; disable CMOS clock sync */ 911 /* We have true wallclock functions; disable CMOS clock sync */
905 no_sync_cmos_clock = 1; 912 no_sync_cmos_clock = 1;
@@ -908,7 +915,7 @@ static inline int __init activate_vmi(void)
908 disable_vmi_timer = 1; 915 disable_vmi_timer = 1;
909 } 916 }
910 917
911 para_fill(safe_halt, Halt); 918 para_fill(pv_irq_ops.safe_halt, Halt);
912 919
913 /* 920 /*
914 * Alternative instruction rewriting doesn't happen soon enough 921 * Alternative instruction rewriting doesn't happen soon enough
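Most of the churn in vmi_32.c is mechanical: para_fill()/para_wrap() now take the full lvalue (pv_cpu_ops.clts, pv_mmu_ops.read_cr2, ...) instead of a bare member name, so the same macros can target any of the split structures. A condensed sketch of para_fill() based on the hunk above (the real macro also handles VMI_RELOCATION_NONE and warns about unknown relocation types):

#define para_fill_sketch(opname, vmicall)				\
do {									\
	reloc = call_vrom_long_func(vmi_rom, get_reloc,			\
				    VMI_CALL_##vmicall);		\
	if (rel->type == VMI_RELOCATION_CALL_REL)			\
		opname = (void *)rel->eip;     /* call into the ROM */	\
	else if (rel->type == VMI_RELOCATION_NOP)			\
		opname = (void *)vmi_nop;      /* hypervisor: skip it */\
} while (0)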
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index dda4e83649a0..33d367a3432e 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -741,24 +741,12 @@ struct kmem_cache *pmd_cache;
741 741
742void __init pgtable_cache_init(void) 742void __init pgtable_cache_init(void)
743{ 743{
744 size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t); 744 if (PTRS_PER_PMD > 1)
745
746 if (PTRS_PER_PMD > 1) {
747 pmd_cache = kmem_cache_create("pmd", 745 pmd_cache = kmem_cache_create("pmd",
748 PTRS_PER_PMD*sizeof(pmd_t), 746 PTRS_PER_PMD*sizeof(pmd_t),
749 PTRS_PER_PMD*sizeof(pmd_t), 747 PTRS_PER_PMD*sizeof(pmd_t),
750 SLAB_PANIC, 748 SLAB_PANIC,
751 pmd_ctor); 749 pmd_ctor);
752 if (!SHARED_KERNEL_PMD) {
753 /* If we're in PAE mode and have a non-shared
754 kernel pmd, then the pgd size must be a
755 page size. This is because the pgd_list
756 links through the page structure, so there
757 can only be one pgd per page for this to
758 work. */
759 pgd_size = PAGE_SIZE;
760 }
761 }
762} 750}
763 751
764/* 752/*
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 493a083f6886..94c39aaf695f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -25,7 +25,6 @@
25#include <linux/mm.h> 25#include <linux/mm.h>
26#include <linux/page-flags.h> 26#include <linux/page-flags.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/smp.h>
29 28
30#include <xen/interface/xen.h> 29#include <xen/interface/xen.h>
31#include <xen/interface/physdev.h> 30#include <xen/interface/physdev.h>
@@ -52,11 +51,25 @@
52 51
53EXPORT_SYMBOL_GPL(hypercall_page); 52EXPORT_SYMBOL_GPL(hypercall_page);
54 53
55DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
56
57DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); 54DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); 55DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
59DEFINE_PER_CPU(unsigned long, xen_cr3); 56
57/*
58 * Note about cr3 (pagetable base) values:
59 *
60 * xen_cr3 contains the current logical cr3 value; it contains the
61 * last set cr3. This may not be the current effective cr3, because
62 * its update may be being lazily deferred. However, a vcpu looking
63 * at its own cr3 can use this value knowing that it everything will
64 * be self-consistent.
65 *
66 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
67 * hypercall to set the vcpu cr3 is complete (so it may be a little
68 * out of date, but it will never be set early). If one vcpu is
69 * looking at another vcpu's cr3 value, it should use this variable.
70 */
71DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
72DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
60 73
61struct start_info *xen_start_info; 74struct start_info *xen_start_info;
62EXPORT_SYMBOL_GPL(xen_start_info); 75EXPORT_SYMBOL_GPL(xen_start_info);
@@ -100,7 +113,7 @@ static void __init xen_vcpu_setup(int cpu)
100 info.mfn = virt_to_mfn(vcpup); 113 info.mfn = virt_to_mfn(vcpup);
101 info.offset = offset_in_page(vcpup); 114 info.offset = offset_in_page(vcpup);
102 115
103 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n", 116 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
104 cpu, vcpup, info.mfn, info.offset); 117 cpu, vcpup, info.mfn, info.offset);
105 118
106 /* Check to see if the hypervisor will put the vcpu_info 119 /* Check to see if the hypervisor will put the vcpu_info
@@ -124,7 +137,7 @@ static void __init xen_vcpu_setup(int cpu)
124static void __init xen_banner(void) 137static void __init xen_banner(void)
125{ 138{
126 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 139 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
127 paravirt_ops.name); 140 pv_info.name);
128 printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); 141 printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
129} 142}
130 143
@@ -249,29 +262,10 @@ static void xen_halt(void)
249 xen_safe_halt(); 262 xen_safe_halt();
250} 263}
251 264
252static void xen_set_lazy_mode(enum paravirt_lazy_mode mode) 265static void xen_leave_lazy(void)
253{ 266{
254 BUG_ON(preemptible()); 267 paravirt_leave_lazy(paravirt_get_lazy_mode());
255
256 switch (mode) {
257 case PARAVIRT_LAZY_NONE:
258 BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
259 break;
260
261 case PARAVIRT_LAZY_MMU:
262 case PARAVIRT_LAZY_CPU:
263 BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
264 break;
265
266 case PARAVIRT_LAZY_FLUSH:
267 /* flush if necessary, but don't change state */
268 if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
269 xen_mc_flush();
270 return;
271 }
272
273 xen_mc_flush(); 268 xen_mc_flush();
274 x86_write_percpu(xen_lazy_mode, mode);
275} 269}
276 270
277static unsigned long xen_store_tr(void) 271static unsigned long xen_store_tr(void)
@@ -358,7 +352,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
358 * loaded properly. This will go away as soon as Xen has been 352 * loaded properly. This will go away as soon as Xen has been
359 * modified to not save/restore %gs for normal hypercalls. 353 * modified to not save/restore %gs for normal hypercalls.
360 */ 354 */
361 if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU) 355 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
362 loadsegment(gs, 0); 356 loadsegment(gs, 0);
363} 357}
364 358
@@ -632,32 +626,36 @@ static unsigned long xen_read_cr3(void)
632 return x86_read_percpu(xen_cr3); 626 return x86_read_percpu(xen_cr3);
633} 627}
634 628
629static void set_current_cr3(void *v)
630{
631 x86_write_percpu(xen_current_cr3, (unsigned long)v);
632}
633
635static void xen_write_cr3(unsigned long cr3) 634static void xen_write_cr3(unsigned long cr3)
636{ 635{
636 struct mmuext_op *op;
637 struct multicall_space mcs;
638 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
639
637 BUG_ON(preemptible()); 640 BUG_ON(preemptible());
638 641
639 if (cr3 == x86_read_percpu(xen_cr3)) { 642 mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */
640 /* just a simple tlb flush */
641 xen_flush_tlb();
642 return;
643 }
644 643
644 /* Update while interrupts are disabled, so its atomic with
645 respect to ipis */
645 x86_write_percpu(xen_cr3, cr3); 646 x86_write_percpu(xen_cr3, cr3);
646 647
648 op = mcs.args;
649 op->cmd = MMUEXT_NEW_BASEPTR;
650 op->arg1.mfn = mfn;
647 651
648 { 652 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
649 struct mmuext_op *op;
650 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
651 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
652
653 op = mcs.args;
654 op->cmd = MMUEXT_NEW_BASEPTR;
655 op->arg1.mfn = mfn;
656 653
 657 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 654 /* Update xen_current_cr3 once the batch has actually
655 been submitted. */
656 xen_mc_callback(set_current_cr3, (void *)cr3);
658 657
659 xen_mc_issue(PARAVIRT_LAZY_CPU); 658 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
660 }
661} 659}
662 660
663/* Early in boot, while setting up the initial pagetable, assume 661/* Early in boot, while setting up the initial pagetable, assume
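The rewritten xen_write_cr3() above queues the MMUEXT_NEW_BASEPTR operation into the current multicall batch and updates xen_current_cr3 only from a completion callback, so other vcpus never see a cr3 value that has not actually reached the hypervisor. A conceptual model of the callback mechanism (the real code lives in arch/x86/xen/multicalls.c, which is not part of the hunks shown here; the *_sketch names and MAX_CB constant are illustrative):

struct mc_callback_sketch {
	void (*fn)(void *);
	void *data;
};

#define MAX_CB 32	/* illustrative only */
static struct mc_callback_sketch pending_cb[MAX_CB];
static unsigned nr_cb;

/* remember a callback alongside the pending batch */
static void xen_mc_callback_sketch(void (*fn)(void *), void *data)
{
	pending_cb[nr_cb].fn = fn;
	pending_cb[nr_cb].data = data;
	nr_cb++;
}

/* run the callbacks only after xen_mc_flush() has issued the batch */
static void run_callbacks_after_flush_sketch(void)
{
	unsigned i;

	for (i = 0; i < nr_cb; i++)
		pending_cb[i].fn(pending_cb[i].data);
	nr_cb = 0;
}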
@@ -668,6 +666,15 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
668 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 666 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
669} 667}
670 668
669static void pin_pagetable_pfn(unsigned level, unsigned long pfn)
670{
671 struct mmuext_op op;
672 op.cmd = level;
673 op.arg1.mfn = pfn_to_mfn(pfn);
674 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
675 BUG();
676}
677
671/* This needs to make sure the new pte page is pinned iff its being 678/* This needs to make sure the new pte page is pinned iff its being
672 attached to a pinned pagetable. */ 679 attached to a pinned pagetable. */
673static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) 680static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
@@ -677,9 +684,10 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
677 if (PagePinned(virt_to_page(mm->pgd))) { 684 if (PagePinned(virt_to_page(mm->pgd))) {
678 SetPagePinned(page); 685 SetPagePinned(page);
679 686
680 if (!PageHighMem(page)) 687 if (!PageHighMem(page)) {
681 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 688 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
682 else 689 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
690 } else
683 /* make sure there are no stray mappings of 691 /* make sure there are no stray mappings of
684 this page */ 692 this page */
685 kmap_flush_unused(); 693 kmap_flush_unused();
@@ -692,8 +700,10 @@ static void xen_release_pt(u32 pfn)
692 struct page *page = pfn_to_page(pfn); 700 struct page *page = pfn_to_page(pfn);
693 701
694 if (PagePinned(page)) { 702 if (PagePinned(page)) {
695 if (!PageHighMem(page)) 703 if (!PageHighMem(page)) {
704 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
696 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 705 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
706 }
697 } 707 }
698} 708}
699 709
@@ -738,7 +748,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
738 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; 748 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
739 749
740 /* special set_pte for pagetable initialization */ 750 /* special set_pte for pagetable initialization */
741 paravirt_ops.set_pte = xen_set_pte_init; 751 pv_mmu_ops.set_pte = xen_set_pte_init;
742 752
743 init_mm.pgd = base; 753 init_mm.pgd = base;
744 /* 754 /*
@@ -785,8 +795,8 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
785{ 795{
786 /* This will work as long as patching hasn't happened yet 796 /* This will work as long as patching hasn't happened yet
787 (which it hasn't) */ 797 (which it hasn't) */
788 paravirt_ops.alloc_pt = xen_alloc_pt; 798 pv_mmu_ops.alloc_pt = xen_alloc_pt;
789 paravirt_ops.set_pte = xen_set_pte; 799 pv_mmu_ops.set_pte = xen_set_pte;
790 800
791 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 801 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
792 /* 802 /*
@@ -808,15 +818,15 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
808 /* Actually pin the pagetable down, but we can't set PG_pinned 818 /* Actually pin the pagetable down, but we can't set PG_pinned
809 yet because the page structures don't exist yet. */ 819 yet because the page structures don't exist yet. */
810 { 820 {
811 struct mmuext_op op; 821 unsigned level;
822
812#ifdef CONFIG_X86_PAE 823#ifdef CONFIG_X86_PAE
813 op.cmd = MMUEXT_PIN_L3_TABLE; 824 level = MMUEXT_PIN_L3_TABLE;
814#else 825#else
815 op.cmd = MMUEXT_PIN_L3_TABLE; 826 level = MMUEXT_PIN_L2_TABLE;
816#endif 827#endif
817 op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base))); 828
818 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) 829 pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
819 BUG();
820 } 830 }
821} 831}
822 832
@@ -833,12 +843,12 @@ void __init xen_setup_vcpu_info_placement(void)
833 if (have_vcpu_info_placement) { 843 if (have_vcpu_info_placement) {
834 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 844 printk(KERN_INFO "Xen: using vcpu_info placement\n");
835 845
836 paravirt_ops.save_fl = xen_save_fl_direct; 846 pv_irq_ops.save_fl = xen_save_fl_direct;
837 paravirt_ops.restore_fl = xen_restore_fl_direct; 847 pv_irq_ops.restore_fl = xen_restore_fl_direct;
838 paravirt_ops.irq_disable = xen_irq_disable_direct; 848 pv_irq_ops.irq_disable = xen_irq_disable_direct;
839 paravirt_ops.irq_enable = xen_irq_enable_direct; 849 pv_irq_ops.irq_enable = xen_irq_enable_direct;
840 paravirt_ops.read_cr2 = xen_read_cr2_direct; 850 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
841 paravirt_ops.iret = xen_iret_direct; 851 pv_cpu_ops.iret = xen_iret_direct;
842 } 852 }
843} 853}
844 854
@@ -850,8 +860,8 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
850 860
851 start = end = reloc = NULL; 861 start = end = reloc = NULL;
852 862
853#define SITE(x) \ 863#define SITE(op, x) \
854 case PARAVIRT_PATCH(x): \ 864 case PARAVIRT_PATCH(op.x): \
855 if (have_vcpu_info_placement) { \ 865 if (have_vcpu_info_placement) { \
856 start = (char *)xen_##x##_direct; \ 866 start = (char *)xen_##x##_direct; \
857 end = xen_##x##_direct_end; \ 867 end = xen_##x##_direct_end; \
@@ -860,10 +870,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
860 goto patch_site 870 goto patch_site
861 871
862 switch (type) { 872 switch (type) {
863 SITE(irq_enable); 873 SITE(pv_irq_ops, irq_enable);
864 SITE(irq_disable); 874 SITE(pv_irq_ops, irq_disable);
865 SITE(save_fl); 875 SITE(pv_irq_ops, save_fl);
866 SITE(restore_fl); 876 SITE(pv_irq_ops, restore_fl);
867#undef SITE 877#undef SITE
868 878
869 patch_site: 879 patch_site:
@@ -895,26 +905,32 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
895 return ret; 905 return ret;
896} 906}
897 907
898static const struct paravirt_ops xen_paravirt_ops __initdata = { 908static const struct pv_info xen_info __initdata = {
899 .paravirt_enabled = 1, 909 .paravirt_enabled = 1,
900 .shared_kernel_pmd = 0, 910 .shared_kernel_pmd = 0,
901 911
902 .name = "Xen", 912 .name = "Xen",
903 .banner = xen_banner, 913};
904 914
915static const struct pv_init_ops xen_init_ops __initdata = {
905 .patch = xen_patch, 916 .patch = xen_patch,
906 917
918 .banner = xen_banner,
907 .memory_setup = xen_memory_setup, 919 .memory_setup = xen_memory_setup,
908 .arch_setup = xen_arch_setup, 920 .arch_setup = xen_arch_setup,
909 .init_IRQ = xen_init_IRQ,
910 .post_allocator_init = xen_mark_init_mm_pinned, 921 .post_allocator_init = xen_mark_init_mm_pinned,
922};
911 923
924static const struct pv_time_ops xen_time_ops __initdata = {
912 .time_init = xen_time_init, 925 .time_init = xen_time_init,
926
913 .set_wallclock = xen_set_wallclock, 927 .set_wallclock = xen_set_wallclock,
914 .get_wallclock = xen_get_wallclock, 928 .get_wallclock = xen_get_wallclock,
915 .get_cpu_khz = xen_cpu_khz, 929 .get_cpu_khz = xen_cpu_khz,
916 .sched_clock = xen_sched_clock, 930 .sched_clock = xen_sched_clock,
931};
917 932
933static const struct pv_cpu_ops xen_cpu_ops __initdata = {
918 .cpuid = xen_cpuid, 934 .cpuid = xen_cpuid,
919 935
920 .set_debugreg = xen_set_debugreg, 936 .set_debugreg = xen_set_debugreg,
@@ -925,22 +941,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
925 .read_cr0 = native_read_cr0, 941 .read_cr0 = native_read_cr0,
926 .write_cr0 = native_write_cr0, 942 .write_cr0 = native_write_cr0,
927 943
928 .read_cr2 = xen_read_cr2,
929 .write_cr2 = xen_write_cr2,
930
931 .read_cr3 = xen_read_cr3,
932 .write_cr3 = xen_write_cr3,
933
934 .read_cr4 = native_read_cr4, 944 .read_cr4 = native_read_cr4,
935 .read_cr4_safe = native_read_cr4_safe, 945 .read_cr4_safe = native_read_cr4_safe,
936 .write_cr4 = xen_write_cr4, 946 .write_cr4 = xen_write_cr4,
937 947
938 .save_fl = xen_save_fl,
939 .restore_fl = xen_restore_fl,
940 .irq_disable = xen_irq_disable,
941 .irq_enable = xen_irq_enable,
942 .safe_halt = xen_safe_halt,
943 .halt = xen_halt,
944 .wbinvd = native_wbinvd, 948 .wbinvd = native_wbinvd,
945 949
946 .read_msr = native_read_msr_safe, 950 .read_msr = native_read_msr_safe,
@@ -969,6 +973,23 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
969 .set_iopl_mask = xen_set_iopl_mask, 973 .set_iopl_mask = xen_set_iopl_mask,
970 .io_delay = xen_io_delay, 974 .io_delay = xen_io_delay,
971 975
976 .lazy_mode = {
977 .enter = paravirt_enter_lazy_cpu,
978 .leave = xen_leave_lazy,
979 },
980};
981
982static const struct pv_irq_ops xen_irq_ops __initdata = {
983 .init_IRQ = xen_init_IRQ,
984 .save_fl = xen_save_fl,
985 .restore_fl = xen_restore_fl,
986 .irq_disable = xen_irq_disable,
987 .irq_enable = xen_irq_enable,
988 .safe_halt = xen_safe_halt,
989 .halt = xen_halt,
990};
991
992static const struct pv_apic_ops xen_apic_ops __initdata = {
972#ifdef CONFIG_X86_LOCAL_APIC 993#ifdef CONFIG_X86_LOCAL_APIC
973 .apic_write = xen_apic_write, 994 .apic_write = xen_apic_write,
974 .apic_write_atomic = xen_apic_write, 995 .apic_write_atomic = xen_apic_write,
@@ -977,6 +998,17 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
977 .setup_secondary_clock = paravirt_nop, 998 .setup_secondary_clock = paravirt_nop,
978 .startup_ipi_hook = paravirt_nop, 999 .startup_ipi_hook = paravirt_nop,
979#endif 1000#endif
1001};
1002
1003static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1004 .pagetable_setup_start = xen_pagetable_setup_start,
1005 .pagetable_setup_done = xen_pagetable_setup_done,
1006
1007 .read_cr2 = xen_read_cr2,
1008 .write_cr2 = xen_write_cr2,
1009
1010 .read_cr3 = xen_read_cr3,
1011 .write_cr3 = xen_write_cr3,
980 1012
981 .flush_tlb_user = xen_flush_tlb, 1013 .flush_tlb_user = xen_flush_tlb,
982 .flush_tlb_kernel = xen_flush_tlb, 1014 .flush_tlb_kernel = xen_flush_tlb,
@@ -986,9 +1018,6 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
986 .pte_update = paravirt_nop, 1018 .pte_update = paravirt_nop,
987 .pte_update_defer = paravirt_nop, 1019 .pte_update_defer = paravirt_nop,
988 1020
989 .pagetable_setup_start = xen_pagetable_setup_start,
990 .pagetable_setup_done = xen_pagetable_setup_done,
991
992 .alloc_pt = xen_alloc_pt_init, 1021 .alloc_pt = xen_alloc_pt_init,
993 .release_pt = xen_release_pt, 1022 .release_pt = xen_release_pt,
994 .alloc_pd = paravirt_nop, 1023 .alloc_pd = paravirt_nop,
@@ -1024,7 +1053,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
1024 .dup_mmap = xen_dup_mmap, 1053 .dup_mmap = xen_dup_mmap,
1025 .exit_mmap = xen_exit_mmap, 1054 .exit_mmap = xen_exit_mmap,
1026 1055
1027 .set_lazy_mode = xen_set_lazy_mode, 1056 .lazy_mode = {
1057 .enter = paravirt_enter_lazy_mmu,
1058 .leave = xen_leave_lazy,
1059 },
1028}; 1060};
1029 1061
1030#ifdef CONFIG_SMP 1062#ifdef CONFIG_SMP
@@ -1080,6 +1112,17 @@ static const struct machine_ops __initdata xen_machine_ops = {
1080}; 1112};
1081 1113
1082 1114
1115static void __init xen_reserve_top(void)
1116{
1117 unsigned long top = HYPERVISOR_VIRT_START;
1118 struct xen_platform_parameters pp;
1119
1120 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1121 top = pp.virt_start;
1122
1123 reserve_top_address(-top + 2 * PAGE_SIZE);
1124}
1125
1083/* First C function to be called on Xen boot */ 1126/* First C function to be called on Xen boot */
1084asmlinkage void __init xen_start_kernel(void) 1127asmlinkage void __init xen_start_kernel(void)
1085{ 1128{
@@ -1091,7 +1134,14 @@ asmlinkage void __init xen_start_kernel(void)
1091 BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0); 1134 BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
1092 1135
1093 /* Install Xen paravirt ops */ 1136 /* Install Xen paravirt ops */
1094 paravirt_ops = xen_paravirt_ops; 1137 pv_info = xen_info;
1138 pv_init_ops = xen_init_ops;
1139 pv_time_ops = xen_time_ops;
1140 pv_cpu_ops = xen_cpu_ops;
1141 pv_irq_ops = xen_irq_ops;
1142 pv_apic_ops = xen_apic_ops;
1143 pv_mmu_ops = xen_mmu_ops;
1144
1095 machine_ops = xen_machine_ops; 1145 machine_ops = xen_machine_ops;
1096 1146
1097#ifdef CONFIG_SMP 1147#ifdef CONFIG_SMP
@@ -1113,6 +1163,7 @@ asmlinkage void __init xen_start_kernel(void)
1113 /* keep using Xen gdt for now; no urgent need to change it */ 1163 /* keep using Xen gdt for now; no urgent need to change it */
1114 1164
1115 x86_write_percpu(xen_cr3, __pa(pgd)); 1165 x86_write_percpu(xen_cr3, __pa(pgd));
1166 x86_write_percpu(xen_current_cr3, __pa(pgd));
1116 1167
1117#ifdef CONFIG_SMP 1168#ifdef CONFIG_SMP
1118 /* Don't do the full vcpu_info placement stuff until we have a 1169 /* Don't do the full vcpu_info placement stuff until we have a
@@ -1124,12 +1175,12 @@ asmlinkage void __init xen_start_kernel(void)
1124 xen_setup_vcpu_info_placement(); 1175 xen_setup_vcpu_info_placement();
1125#endif 1176#endif
1126 1177
1127 paravirt_ops.kernel_rpl = 1; 1178 pv_info.kernel_rpl = 1;
1128 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1179 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1129 paravirt_ops.kernel_rpl = 0; 1180 pv_info.kernel_rpl = 0;
1130 1181
1131 /* set the limit of our address space */ 1182 /* set the limit of our address space */
1132 reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE); 1183 xen_reserve_top();
1133 1184
1134 /* set up basic CPUID stuff */ 1185 /* set up basic CPUID stuff */
1135 cpu_detect(&new_cpu_data); 1186 cpu_detect(&new_cpu_data);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 874db0cd1d2a..b2e32f9d0071 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -41,7 +41,6 @@
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/bug.h> 43#include <linux/bug.h>
44#include <linux/sched.h>
45 44
46#include <asm/pgtable.h> 45#include <asm/pgtable.h>
47#include <asm/tlbflush.h> 46#include <asm/tlbflush.h>
@@ -155,7 +154,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
155 pte_t *ptep, pte_t pteval) 154 pte_t *ptep, pte_t pteval)
156{ 155{
157 if (mm == current->mm || mm == &init_mm) { 156 if (mm == current->mm || mm == &init_mm) {
158 if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 157 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
159 struct multicall_space mcs; 158 struct multicall_space mcs;
160 mcs = xen_mc_entry(0); 159 mcs = xen_mc_entry(0);
161 160
@@ -304,7 +303,12 @@ pgd_t xen_make_pgd(unsigned long pgd)
304} 303}
305#endif /* CONFIG_X86_PAE */ 304#endif /* CONFIG_X86_PAE */
306 305
307 306enum pt_level {
307 PT_PGD,
308 PT_PUD,
309 PT_PMD,
310 PT_PTE
311};
308 312
309/* 313/*
310 (Yet another) pagetable walker. This one is intended for pinning a 314 (Yet another) pagetable walker. This one is intended for pinning a
@@ -316,7 +320,7 @@ pgd_t xen_make_pgd(unsigned long pgd)
316 FIXADDR_TOP. But the important bit is that we don't pin beyond 320 FIXADDR_TOP. But the important bit is that we don't pin beyond
317 there, because then we start getting into Xen's ptes. 321 there, because then we start getting into Xen's ptes.
318*/ 322*/
319static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), 323static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
320 unsigned long limit) 324 unsigned long limit)
321{ 325{
322 pgd_t *pgd = pgd_base; 326 pgd_t *pgd = pgd_base;
@@ -341,7 +345,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
341 pud = pud_offset(pgd, 0); 345 pud = pud_offset(pgd, 0);
342 346
343 if (PTRS_PER_PUD > 1) /* not folded */ 347 if (PTRS_PER_PUD > 1) /* not folded */
344 flush |= (*func)(virt_to_page(pud), 0); 348 flush |= (*func)(virt_to_page(pud), PT_PUD);
345 349
346 for (; addr != pud_limit; pud++, addr = pud_next) { 350 for (; addr != pud_limit; pud++, addr = pud_next) {
347 pmd_t *pmd; 351 pmd_t *pmd;
@@ -360,7 +364,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
360 pmd = pmd_offset(pud, 0); 364 pmd = pmd_offset(pud, 0);
361 365
362 if (PTRS_PER_PMD > 1) /* not folded */ 366 if (PTRS_PER_PMD > 1) /* not folded */
363 flush |= (*func)(virt_to_page(pmd), 0); 367 flush |= (*func)(virt_to_page(pmd), PT_PMD);
364 368
365 for (; addr != pmd_limit; pmd++) { 369 for (; addr != pmd_limit; pmd++) {
366 addr += (PAGE_SIZE * PTRS_PER_PTE); 370 addr += (PAGE_SIZE * PTRS_PER_PTE);
@@ -372,17 +376,47 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
372 if (pmd_none(*pmd)) 376 if (pmd_none(*pmd))
373 continue; 377 continue;
374 378
375 flush |= (*func)(pmd_page(*pmd), 0); 379 flush |= (*func)(pmd_page(*pmd), PT_PTE);
376 } 380 }
377 } 381 }
378 } 382 }
379 383
380 flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH); 384 flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
381 385
382 return flush; 386 return flush;
383} 387}
384 388
385static int pin_page(struct page *page, unsigned flags) 389static spinlock_t *lock_pte(struct page *page)
390{
391 spinlock_t *ptl = NULL;
392
393#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
394 ptl = __pte_lockptr(page);
395 spin_lock(ptl);
396#endif
397
398 return ptl;
399}
400
401static void do_unlock(void *v)
402{
403 spinlock_t *ptl = v;
404 spin_unlock(ptl);
405}
406
407static void xen_do_pin(unsigned level, unsigned long pfn)
408{
409 struct mmuext_op *op;
410 struct multicall_space mcs;
411
412 mcs = __xen_mc_entry(sizeof(*op));
413 op = mcs.args;
414 op->cmd = level;
415 op->arg1.mfn = pfn_to_mfn(pfn);
416 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
417}
418
419static int pin_page(struct page *page, enum pt_level level)
386{ 420{
387 unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags); 421 unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
388 int flush; 422 int flush;
@@ -397,12 +431,26 @@ static int pin_page(struct page *page, unsigned flags)
397 void *pt = lowmem_page_address(page); 431 void *pt = lowmem_page_address(page);
398 unsigned long pfn = page_to_pfn(page); 432 unsigned long pfn = page_to_pfn(page);
399 struct multicall_space mcs = __xen_mc_entry(0); 433 struct multicall_space mcs = __xen_mc_entry(0);
434 spinlock_t *ptl;
400 435
401 flush = 0; 436 flush = 0;
402 437
438 ptl = NULL;
439 if (level == PT_PTE)
440 ptl = lock_pte(page);
441
403 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 442 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
404 pfn_pte(pfn, PAGE_KERNEL_RO), 443 pfn_pte(pfn, PAGE_KERNEL_RO),
405 flags); 444 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
445
446 if (level == PT_PTE)
447 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
448
449 if (ptl) {
450 /* Queue a deferred unlock for when this batch
451 is completed. */
452 xen_mc_callback(do_unlock, ptl);
453 }
406 } 454 }
407 455
408 return flush; 456 return flush;
@@ -413,8 +461,7 @@ static int pin_page(struct page *page, unsigned flags)
413 read-only, and can be pinned. */ 461 read-only, and can be pinned. */
414void xen_pgd_pin(pgd_t *pgd) 462void xen_pgd_pin(pgd_t *pgd)
415{ 463{
416 struct multicall_space mcs; 464 unsigned level;
417 struct mmuext_op *op;
418 465
419 xen_mc_batch(); 466 xen_mc_batch();
420 467
@@ -425,16 +472,13 @@ void xen_pgd_pin(pgd_t *pgd)
425 xen_mc_batch(); 472 xen_mc_batch();
426 } 473 }
427 474
428 mcs = __xen_mc_entry(sizeof(*op));
429 op = mcs.args;
430
431#ifdef CONFIG_X86_PAE 475#ifdef CONFIG_X86_PAE
432 op->cmd = MMUEXT_PIN_L3_TABLE; 476 level = MMUEXT_PIN_L3_TABLE;
433#else 477#else
434 op->cmd = MMUEXT_PIN_L2_TABLE; 478 level = MMUEXT_PIN_L2_TABLE;
435#endif 479#endif
436 op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); 480
437 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 481 xen_do_pin(level, PFN_DOWN(__pa(pgd)));
438 482
439 xen_mc_issue(0); 483 xen_mc_issue(0);
440} 484}
@@ -442,7 +486,7 @@ void xen_pgd_pin(pgd_t *pgd)
442/* The init_mm pagetable is really pinned as soon as its created, but 486/* The init_mm pagetable is really pinned as soon as its created, but
443 that's before we have page structures to store the bits. So do all 487 that's before we have page structures to store the bits. So do all
444 the book-keeping now. */ 488 the book-keeping now. */
445static __init int mark_pinned(struct page *page, unsigned flags) 489static __init int mark_pinned(struct page *page, enum pt_level level)
446{ 490{
447 SetPagePinned(page); 491 SetPagePinned(page);
448 return 0; 492 return 0;
@@ -453,18 +497,32 @@ void __init xen_mark_init_mm_pinned(void)
453 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); 497 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
454} 498}
455 499
456static int unpin_page(struct page *page, unsigned flags) 500static int unpin_page(struct page *page, enum pt_level level)
457{ 501{
458 unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); 502 unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
459 503
460 if (pgfl && !PageHighMem(page)) { 504 if (pgfl && !PageHighMem(page)) {
461 void *pt = lowmem_page_address(page); 505 void *pt = lowmem_page_address(page);
462 unsigned long pfn = page_to_pfn(page); 506 unsigned long pfn = page_to_pfn(page);
463 struct multicall_space mcs = __xen_mc_entry(0); 507 spinlock_t *ptl = NULL;
508 struct multicall_space mcs;
509
510 if (level == PT_PTE) {
511 ptl = lock_pte(page);
512
513 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
514 }
515
516 mcs = __xen_mc_entry(0);
464 517
465 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 518 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
466 pfn_pte(pfn, PAGE_KERNEL), 519 pfn_pte(pfn, PAGE_KERNEL),
467 flags); 520 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
521
522 if (ptl) {
523 /* unlock when batch completed */
524 xen_mc_callback(do_unlock, ptl);
525 }
468 } 526 }
469 527
470 return 0; /* never need to flush on unpin */ 528 return 0; /* never need to flush on unpin */
@@ -473,18 +531,9 @@ static int unpin_page(struct page *page, unsigned flags)
473/* Release a pagetables pages back as normal RW */ 531/* Release a pagetables pages back as normal RW */
474static void xen_pgd_unpin(pgd_t *pgd) 532static void xen_pgd_unpin(pgd_t *pgd)
475{ 533{
476 struct mmuext_op *op;
477 struct multicall_space mcs;
478
479 xen_mc_batch(); 534 xen_mc_batch();
480 535
481 mcs = __xen_mc_entry(sizeof(*op)); 536 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
482
483 op = mcs.args;
484 op->cmd = MMUEXT_UNPIN_TABLE;
485 op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
486
487 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
488 537
489 pgd_walk(pgd, unpin_page, TASK_SIZE); 538 pgd_walk(pgd, unpin_page, TASK_SIZE);
490 539
@@ -515,20 +564,43 @@ static void drop_other_mm_ref(void *info)
515 564
516 if (__get_cpu_var(cpu_tlbstate).active_mm == mm) 565 if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
517 leave_mm(smp_processor_id()); 566 leave_mm(smp_processor_id());
567
568 /* If this cpu still has a stale cr3 reference, then make sure
569 it has been flushed. */
570 if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
571 load_cr3(swapper_pg_dir);
572 arch_flush_lazy_cpu_mode();
573 }
518} 574}
519 575
520static void drop_mm_ref(struct mm_struct *mm) 576static void drop_mm_ref(struct mm_struct *mm)
521{ 577{
578 cpumask_t mask;
579 unsigned cpu;
580
522 if (current->active_mm == mm) { 581 if (current->active_mm == mm) {
523 if (current->mm == mm) 582 if (current->mm == mm)
524 load_cr3(swapper_pg_dir); 583 load_cr3(swapper_pg_dir);
525 else 584 else
526 leave_mm(smp_processor_id()); 585 leave_mm(smp_processor_id());
586 arch_flush_lazy_cpu_mode();
527 } 587 }
528 588
529 if (!cpus_empty(mm->cpu_vm_mask)) 589 /* Get the "official" set of cpus referring to our pagetable. */
530 xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref, 590 mask = mm->cpu_vm_mask;
531 mm, 1); 591
592 /* It's possible that a vcpu may have a stale reference to our
593 cr3, because its in lazy mode, and it hasn't yet flushed
594 its set of pending hypercalls yet. In this case, we can
595 look at its actual current cr3 value, and force it to flush
596 if needed. */
597 for_each_online_cpu(cpu) {
598 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
599 cpu_set(cpu, mask);
600 }
601
602 if (!cpus_empty(mask))
603 xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
532} 604}
533#else 605#else
534static void drop_mm_ref(struct mm_struct *mm) 606static void drop_mm_ref(struct mm_struct *mm)
@@ -563,5 +635,6 @@ void xen_exit_mmap(struct mm_struct *mm)
563 /* pgd may not be pinned in the error exit path of execve */ 635 /* pgd may not be pinned in the error exit path of execve */
564 if (PagePinned(virt_to_page(mm->pgd))) 636 if (PagePinned(virt_to_page(mm->pgd)))
565 xen_pgd_unpin(mm->pgd); 637 xen_pgd_unpin(mm->pgd);
638
566 spin_unlock(&mm->page_table_lock); 639 spin_unlock(&mm->page_table_lock);
567} 640}
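The drop_mm_ref() change above widens the cross-call mask: besides the cpus in mm->cpu_vm_mask, any cpu whose per-cpu xen_current_cr3 still points at this pagetable gets the IPI, because a vcpu in lazy mode can hold a stale cr3 reference until its pending hypercalls are flushed. Below is a small standalone model of that mask-building step only; it is not kernel code, and the cpu_state struct and the plain bitmask stand in for cpu_vm_mask and the per-cpu variable.

#include <stdbool.h>
#include <stdio.h>

#define NCPUS 4

struct cpu_state {
    bool in_vm_mask;          /* models a bit in mm->cpu_vm_mask */
    unsigned long cached_cr3; /* models per-cpu xen_current_cr3 */
};

static unsigned build_drop_mask(const struct cpu_state cpus[], unsigned long mm_cr3)
{
    unsigned mask = 0;

    for (int cpu = 0; cpu < NCPUS; cpu++) {
        /* the "official" users of the pagetable */
        if (cpus[cpu].in_vm_mask)
            mask |= 1u << cpu;
        /* lazy vcpus that still hold a stale cr3 reference */
        if (cpus[cpu].cached_cr3 == mm_cr3)
            mask |= 1u << cpu;
    }
    return mask;
}

int main(void)
{
    struct cpu_state cpus[NCPUS] = {
        { .in_vm_mask = true,  .cached_cr3 = 0x1000 },
        { .in_vm_mask = false, .cached_cr3 = 0x1000 }, /* stale reference */
        { .in_vm_mask = false, .cached_cr3 = 0x2000 },
        { .in_vm_mask = false, .cached_cr3 = 0x3000 },
    };

    printf("cross-call mask: %#x\n", build_drop_mask(cpus, 0x1000));
    return 0;
}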
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index c837e8e463db..5e6f36f6d876 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -26,13 +26,22 @@
26 26
27#include "multicalls.h" 27#include "multicalls.h"
28 28
29#define MC_DEBUG 1
30
29#define MC_BATCH 32 31#define MC_BATCH 32
30#define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) 32#define MC_ARGS (MC_BATCH * 16 / sizeof(u64))
31 33
32struct mc_buffer { 34struct mc_buffer {
33 struct multicall_entry entries[MC_BATCH]; 35 struct multicall_entry entries[MC_BATCH];
36#if MC_DEBUG
37 struct multicall_entry debug[MC_BATCH];
38#endif
34 u64 args[MC_ARGS]; 39 u64 args[MC_ARGS];
35 unsigned mcidx, argidx; 40 struct callback {
41 void (*fn)(void *);
42 void *data;
43 } callbacks[MC_BATCH];
44 unsigned mcidx, argidx, cbidx;
36}; 45};
37 46
38static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); 47static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
@@ -43,6 +52,7 @@ void xen_mc_flush(void)
43 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 52 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
44 int ret = 0; 53 int ret = 0;
45 unsigned long flags; 54 unsigned long flags;
55 int i;
46 56
47 BUG_ON(preemptible()); 57 BUG_ON(preemptible());
48 58
@@ -51,13 +61,31 @@ void xen_mc_flush(void)
51 local_irq_save(flags); 61 local_irq_save(flags);
52 62
53 if (b->mcidx) { 63 if (b->mcidx) {
54 int i; 64#if MC_DEBUG
65 memcpy(b->debug, b->entries,
66 b->mcidx * sizeof(struct multicall_entry));
67#endif
55 68
56 if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0) 69 if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
57 BUG(); 70 BUG();
58 for (i = 0; i < b->mcidx; i++) 71 for (i = 0; i < b->mcidx; i++)
59 if (b->entries[i].result < 0) 72 if (b->entries[i].result < 0)
60 ret++; 73 ret++;
74
75#if MC_DEBUG
76 if (ret) {
77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
78 ret, smp_processor_id());
79 for(i = 0; i < b->mcidx; i++) {
80 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
81 i+1, b->mcidx,
82 b->debug[i].op,
83 b->debug[i].args[0],
84 b->entries[i].result);
85 }
86 }
87#endif
88
61 b->mcidx = 0; 89 b->mcidx = 0;
62 b->argidx = 0; 90 b->argidx = 0;
63 } else 91 } else
@@ -65,6 +93,13 @@ void xen_mc_flush(void)
65 93
66 local_irq_restore(flags); 94 local_irq_restore(flags);
67 95
96 for(i = 0; i < b->cbidx; i++) {
97 struct callback *cb = &b->callbacks[i];
98
99 (*cb->fn)(cb->data);
100 }
101 b->cbidx = 0;
102
68 BUG_ON(ret); 103 BUG_ON(ret);
69} 104}
70 105
@@ -88,3 +123,16 @@ struct multicall_space __xen_mc_entry(size_t args)
88 123
89 return ret; 124 return ret;
90} 125}
126
127void xen_mc_callback(void (*fn)(void *), void *data)
128{
129 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
130 struct callback *cb;
131
132 if (b->cbidx == MC_BATCH)
133 xen_mc_flush();
134
135 cb = &b->callbacks[b->cbidx++];
136 cb->fn = fn;
137 cb->data = data;
138}
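multicalls.c now keeps a per-cpu array of completion callbacks alongside the batched entries: xen_mc_callback() queues a (fn, data) pair and xen_mc_flush() runs the queue once the batch has been submitted, which is what lets pin_page()/unpin_page() defer their spin_unlock() until the remapping hypercalls have actually been issued. The sketch below is a standalone model of that mechanism, not kernel code; the names mirror the patch, but the buffer here is a plain global rather than a per-cpu variable and the hypercall is elided.

#include <stdio.h>

#define MC_BATCH 32

struct callback {
    void (*fn)(void *);
    void *data;
};

static struct callback callbacks[MC_BATCH];
static unsigned cbidx;

static void mc_flush(void)
{
    /* ...the real code issues HYPERVISOR_multicall() here... */

    /* run the deferred completion callbacks, then reset the queue */
    for (unsigned i = 0; i < cbidx; i++)
        callbacks[i].fn(callbacks[i].data);
    cbidx = 0;
}

static void mc_callback(void (*fn)(void *), void *data)
{
    if (cbidx == MC_BATCH)   /* no room left: flush, which also runs callbacks */
        mc_flush();
    callbacks[cbidx].fn = fn;
    callbacks[cbidx].data = data;
    cbidx++;
}

static void do_unlock(void *v)
{
    printf("deferred unlock of %s\n", (const char *)v);
}

int main(void)
{
    mc_callback(do_unlock, "pte page A");
    mc_callback(do_unlock, "pte page B");
    mc_flush();   /* the unlocks run only once the batch completes */
    return 0;
}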
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index e6f7530b156c..8bae996d99a3 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -35,11 +35,14 @@ void xen_mc_flush(void);
35/* Issue a multicall if we're not in a lazy mode */ 35/* Issue a multicall if we're not in a lazy mode */
36static inline void xen_mc_issue(unsigned mode) 36static inline void xen_mc_issue(unsigned mode)
37{ 37{
38 if ((xen_get_lazy_mode() & mode) == 0) 38 if ((paravirt_get_lazy_mode() & mode) == 0)
39 xen_mc_flush(); 39 xen_mc_flush();
40 40
41 /* restore flags saved in xen_mc_batch */ 41 /* restore flags saved in xen_mc_batch */
42 local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); 42 local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
43} 43}
44 44
45/* Set up a callback to be called when the current batch is flushed */
46void xen_mc_callback(void (*fn)(void *), void *data);
47
45#endif /* _XEN_MULTICALLS_H */ 48#endif /* _XEN_MULTICALLS_H */
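xen_mc_issue() keeps its behaviour across this cleanup: flush the batch immediately unless the cpu is inside a matching lazy section; only the query changes from the Xen-private xen_get_lazy_mode() to the generic paravirt_get_lazy_mode(). A minimal standalone model of that test follows; it is not kernel code, and the mode values are illustrative bit flags rather than the real paravirt_lazy_mode enum.

#include <stdio.h>

enum lazy_mode {
    LAZY_NONE = 0,
    LAZY_MMU  = 1,
    LAZY_CPU  = 2,
};

static enum lazy_mode current_lazy_mode = LAZY_NONE;

static void mc_flush(void) { printf("flushing batched hypercalls\n"); }

/* Mirror of the xen_mc_issue() test: flush now unless we are inside a lazy
   section of the same kind, in which case the batch keeps growing. */
static void mc_issue(enum lazy_mode mode)
{
    if ((current_lazy_mode & mode) == 0)
        mc_flush();
    else
        printf("deferring: still in lazy mode %d\n", current_lazy_mode);
}

int main(void)
{
    mc_issue(LAZY_MMU);          /* not lazy: flushes immediately */

    current_lazy_mode = LAZY_MMU;
    mc_issue(LAZY_MMU);          /* inside a lazy MMU section: deferred */

    current_lazy_mode = LAZY_NONE;
    mc_issue(LAZY_MMU);          /* lazy section over: flush again */
    return 0;
}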
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 4fa33c27ccb6..d53bf9d8a72d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -370,7 +370,8 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
370 void *info, int wait) 370 void *info, int wait)
371{ 371{
372 struct call_data_struct data; 372 struct call_data_struct data;
373 int cpus; 373 int cpus, cpu;
374 bool yield;
374 375
375 /* Holding any lock stops cpus from going down. */ 376 /* Holding any lock stops cpus from going down. */
376 spin_lock(&call_lock); 377 spin_lock(&call_lock);
@@ -399,9 +400,14 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
399 /* Send a message to other CPUs and wait for them to respond */ 400 /* Send a message to other CPUs and wait for them to respond */
400 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); 401 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
401 402
402 /* Make sure other vcpus get a chance to run. 403 /* Make sure other vcpus get a chance to run if they need to. */
403 XXX too severe? Maybe we should check the other CPU's states? */ 404 yield = false;
404 HYPERVISOR_sched_op(SCHEDOP_yield, 0); 405 for_each_cpu_mask(cpu, mask)
406 if (xen_vcpu_stolen(cpu))
407 yield = true;
408
409 if (yield)
410 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
405 411
406 /* Wait for response */ 412 /* Wait for response */
407 while (atomic_read(&data.started) != cpus || 413 while (atomic_read(&data.started) != cpus ||
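The smp.c hunk replaces the unconditional SCHEDOP_yield after sending the cross-call IPI with a targeted one: yield only if at least one destination vcpu is "stolen", i.e. runnable but not currently on a physical cpu, as reported by the xen_vcpu_stolen() helper added to time.c below. This is a standalone model of that decision only, not kernel code; the runstate enum is simplified from Xen's runstate states.

#include <stdbool.h>
#include <stdio.h>

enum runstate { RUNNING, RUNNABLE, BLOCKED };

/* RUNNABLE means the vcpu could run but has no physical cpu to run on. */
static bool vcpu_stolen(enum runstate s)
{
    return s == RUNNABLE;
}

/* Yield the sending vcpu only if some IPI target actually needs the time. */
static bool should_yield(const enum runstate targets[], int n)
{
    for (int i = 0; i < n; i++)
        if (vcpu_stolen(targets[i]))
            return true;
    return false;
}

int main(void)
{
    enum runstate targets[] = { RUNNING, BLOCKED, RUNNABLE };

    printf("yield: %s\n", should_yield(targets, 3) ? "yes" : "no");
    return 0;
}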
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index dfd6db69ead5..d083ff5ef088 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -105,6 +105,12 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
105 } while (get64(&state->state_entry_time) != state_time); 105 } while (get64(&state->state_entry_time) != state_time);
106} 106}
107 107
108/* return true when a vcpu could run but has no real cpu to run on */
109bool xen_vcpu_stolen(int vcpu)
110{
111 return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
112}
113
108static void setup_runstate_info(int cpu) 114static void setup_runstate_info(int cpu)
109{ 115{
110 struct vcpu_register_runstate_memory_area area; 116 struct vcpu_register_runstate_memory_area area;
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index b9aaea45f07f..b02a909bfd4c 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -11,6 +11,7 @@ void xen_copy_trap_info(struct trap_info *traps);
11 11
12DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); 12DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
13DECLARE_PER_CPU(unsigned long, xen_cr3); 13DECLARE_PER_CPU(unsigned long, xen_cr3);
14DECLARE_PER_CPU(unsigned long, xen_current_cr3);
14 15
15extern struct start_info *xen_start_info; 16extern struct start_info *xen_start_info;
16extern struct shared_info *HYPERVISOR_shared_info; 17extern struct shared_info *HYPERVISOR_shared_info;
@@ -27,14 +28,9 @@ unsigned long xen_get_wallclock(void);
27int xen_set_wallclock(unsigned long time); 28int xen_set_wallclock(unsigned long time);
28unsigned long long xen_sched_clock(void); 29unsigned long long xen_sched_clock(void);
29 30
30void xen_mark_init_mm_pinned(void); 31bool xen_vcpu_stolen(int vcpu);
31
32DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
33 32
34static inline unsigned xen_get_lazy_mode(void) 33void xen_mark_init_mm_pinned(void);
35{
36 return x86_read_percpu(xen_lazy_mode);
37}
38 34
39void __init xen_fill_possible_map(void); 35void __init xen_fill_possible_map(void);
40 36