aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@woody.linux-foundation.org>2007-10-17 14:10:11 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-10-17 14:10:11 -0400
commitfb9fc395174138983a49f2da982ed14caabbe741 (patch)
tree5d5d3643ee6853a899205613da272cc343fdc1a4
parent0eafaae84e21ac033815cc9f33c3ae889cd7ccfe (diff)
parentace2e92e193126711cb3a83a3752b2c5b8396950 (diff)
Merge branch 'xen-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen
* 'xen-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen: xfs: eagerly remove vmap mappings to avoid upsetting Xen xen: add some debug output for failed multicalls xen: fix incorrect vcpu_register_vcpu_info hypercall argument xen: ask the hypervisor how much space it needs reserved xen: lock pte pages while pinning/unpinning xen: deal with stale cr3 values when unpinning pagetables xen: add batch completion callbacks xen: yield to IPI target if necessary Clean up duplicate includes in arch/i386/xen/ remove dead code in pgtable_cache_init paravirt: clean up lazy mode handling paravirt: refactor struct paravirt_ops into smaller pv_*_ops
-rw-r--r--arch/x86/kernel/alternative.c4
-rw-r--r--arch/x86/kernel/asm-offsets_32.c14
-rw-r--r--arch/x86/kernel/entry_32.S2
-rw-r--r--arch/x86/kernel/paravirt_32.c224
-rw-r--r--arch/x86/kernel/vmi_32.c201
-rw-r--r--arch/x86/mm/init_32.c22
-rw-r--r--arch/x86/xen/enlighten.c233
-rw-r--r--arch/x86/xen/mmu.c145
-rw-r--r--arch/x86/xen/multicalls.c52
-rw-r--r--arch/x86/xen/multicalls.h5
-rw-r--r--arch/x86/xen/smp.c14
-rw-r--r--arch/x86/xen/time.c6
-rw-r--r--arch/x86/xen/xen-ops.h10
-rw-r--r--drivers/char/hvc_lguest.c2
-rw-r--r--drivers/lguest/core.c6
-rw-r--r--drivers/lguest/lguest.c152
-rw-r--r--drivers/lguest/lguest_bus.c2
-rw-r--r--include/asm-x86/paravirt.h487
-rw-r--r--include/asm-x86/pgtable-3level-defs.h2
-rw-r--r--include/xen/interface/vcpu.h5
-rw-r--r--mm/Kconfig1
21 files changed, 960 insertions, 629 deletions
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 11b03d3c6fda..42421437ded3 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -369,8 +369,8 @@ void apply_paravirt(struct paravirt_patch_site *start,
369 BUG_ON(p->len > MAX_PATCH_LEN); 369 BUG_ON(p->len > MAX_PATCH_LEN);
370 /* prep the buffer with the original instructions */ 370 /* prep the buffer with the original instructions */
371 memcpy(insnbuf, p->instr, p->len); 371 memcpy(insnbuf, p->instr, p->len);
372 used = paravirt_ops.patch(p->instrtype, p->clobbers, insnbuf, 372 used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
373 (unsigned long)p->instr, p->len); 373 (unsigned long)p->instr, p->len);
374 374
375 BUG_ON(used > p->len); 375 BUG_ON(used > p->len);
376 376
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 8029742c0fc1..f1b7cdda82b3 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -116,12 +116,14 @@ void foo(void)
116 116
117#ifdef CONFIG_PARAVIRT 117#ifdef CONFIG_PARAVIRT
118 BLANK(); 118 BLANK();
119 OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled); 119 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
120 OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable); 120 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
121 OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable); 121 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
122 OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit); 122 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
123 OFFSET(PARAVIRT_iret, paravirt_ops, iret); 123 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
124 OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); 124 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
125 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
126 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
125#endif 127#endif
126 128
127#ifdef CONFIG_XEN 129#ifdef CONFIG_XEN
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 8099fea0a72f..dc7f938e5015 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -437,7 +437,7 @@ ldt_ss:
437 * is still available to implement the setting of the high 437 * is still available to implement the setting of the high
438 * 16-bits in the INTERRUPT_RETURN paravirt-op. 438 * 16-bits in the INTERRUPT_RETURN paravirt-op.
439 */ 439 */
440 cmpl $0, paravirt_ops+PARAVIRT_enabled 440 cmpl $0, pv_info+PARAVIRT_enabled
441 jne restore_nocheck 441 jne restore_nocheck
442#endif 442#endif
443 443
diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt_32.c
index 739cfb207dd7..6a80d67c2121 100644
--- a/arch/x86/kernel/paravirt_32.c
+++ b/arch/x86/kernel/paravirt_32.c
@@ -42,32 +42,33 @@ void _paravirt_nop(void)
42static void __init default_banner(void) 42static void __init default_banner(void)
43{ 43{
44 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 44 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
45 paravirt_ops.name); 45 pv_info.name);
46} 46}
47 47
48char *memory_setup(void) 48char *memory_setup(void)
49{ 49{
50 return paravirt_ops.memory_setup(); 50 return pv_init_ops.memory_setup();
51} 51}
52 52
53/* Simple instruction patching code. */ 53/* Simple instruction patching code. */
54#define DEF_NATIVE(name, code) \ 54#define DEF_NATIVE(ops, name, code) \
55 extern const char start_##name[], end_##name[]; \ 55 extern const char start_##ops##_##name[], end_##ops##_##name[]; \
56 asm("start_" #name ": " code "; end_" #name ":") 56 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
57 57
58DEF_NATIVE(irq_disable, "cli"); 58DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
59DEF_NATIVE(irq_enable, "sti"); 59DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
60DEF_NATIVE(restore_fl, "push %eax; popf"); 60DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
61DEF_NATIVE(save_fl, "pushf; pop %eax"); 61DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
62DEF_NATIVE(iret, "iret"); 62DEF_NATIVE(pv_cpu_ops, iret, "iret");
63DEF_NATIVE(irq_enable_sysexit, "sti; sysexit"); 63DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
64DEF_NATIVE(read_cr2, "mov %cr2, %eax"); 64DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
65DEF_NATIVE(write_cr3, "mov %eax, %cr3"); 65DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
66DEF_NATIVE(read_cr3, "mov %cr3, %eax"); 66DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
67DEF_NATIVE(clts, "clts"); 67DEF_NATIVE(pv_cpu_ops, clts, "clts");
68DEF_NATIVE(read_tsc, "rdtsc"); 68DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
69 69
70DEF_NATIVE(ud2a, "ud2a"); 70/* Undefined instruction for dealing with missing ops pointers. */
71static const unsigned char ud2a[] = { 0x0f, 0x0b };
71 72
72static unsigned native_patch(u8 type, u16 clobbers, void *ibuf, 73static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
73 unsigned long addr, unsigned len) 74 unsigned long addr, unsigned len)
@@ -76,37 +77,29 @@ static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
76 unsigned ret; 77 unsigned ret;
77 78
78 switch(type) { 79 switch(type) {
79#define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site 80#define SITE(ops, x) \
80 SITE(irq_disable); 81 case PARAVIRT_PATCH(ops.x): \
81 SITE(irq_enable); 82 start = start_##ops##_##x; \
82 SITE(restore_fl); 83 end = end_##ops##_##x; \
83 SITE(save_fl); 84 goto patch_site
84 SITE(iret); 85
85 SITE(irq_enable_sysexit); 86 SITE(pv_irq_ops, irq_disable);
86 SITE(read_cr2); 87 SITE(pv_irq_ops, irq_enable);
87 SITE(read_cr3); 88 SITE(pv_irq_ops, restore_fl);
88 SITE(write_cr3); 89 SITE(pv_irq_ops, save_fl);
89 SITE(clts); 90 SITE(pv_cpu_ops, iret);
90 SITE(read_tsc); 91 SITE(pv_cpu_ops, irq_enable_sysexit);
92 SITE(pv_mmu_ops, read_cr2);
93 SITE(pv_mmu_ops, read_cr3);
94 SITE(pv_mmu_ops, write_cr3);
95 SITE(pv_cpu_ops, clts);
96 SITE(pv_cpu_ops, read_tsc);
91#undef SITE 97#undef SITE
92 98
93 patch_site: 99 patch_site:
94 ret = paravirt_patch_insns(ibuf, len, start, end); 100 ret = paravirt_patch_insns(ibuf, len, start, end);
95 break; 101 break;
96 102
97 case PARAVIRT_PATCH(make_pgd):
98 case PARAVIRT_PATCH(make_pte):
99 case PARAVIRT_PATCH(pgd_val):
100 case PARAVIRT_PATCH(pte_val):
101#ifdef CONFIG_X86_PAE
102 case PARAVIRT_PATCH(make_pmd):
103 case PARAVIRT_PATCH(pmd_val):
104#endif
105 /* These functions end up returning exactly what
106 they're passed, in the same registers. */
107 ret = paravirt_patch_nop();
108 break;
109
110 default: 103 default:
111 ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); 104 ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
112 break; 105 break;
@@ -150,7 +143,7 @@ unsigned paravirt_patch_call(void *insnbuf,
150 return 5; 143 return 5;
151} 144}
152 145
153unsigned paravirt_patch_jmp(const void *target, void *insnbuf, 146unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
154 unsigned long addr, unsigned len) 147 unsigned long addr, unsigned len)
155{ 148{
156 struct branch *b = insnbuf; 149 struct branch *b = insnbuf;
@@ -165,22 +158,37 @@ unsigned paravirt_patch_jmp(const void *target, void *insnbuf,
165 return 5; 158 return 5;
166} 159}
167 160
161/* Neat trick to map patch type back to the call within the
162 * corresponding structure. */
163static void *get_call_destination(u8 type)
164{
165 struct paravirt_patch_template tmpl = {
166 .pv_init_ops = pv_init_ops,
167 .pv_time_ops = pv_time_ops,
168 .pv_cpu_ops = pv_cpu_ops,
169 .pv_irq_ops = pv_irq_ops,
170 .pv_apic_ops = pv_apic_ops,
171 .pv_mmu_ops = pv_mmu_ops,
172 };
173 return *((void **)&tmpl + type);
174}
175
168unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, 176unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
169 unsigned long addr, unsigned len) 177 unsigned long addr, unsigned len)
170{ 178{
171 void *opfunc = *((void **)&paravirt_ops + type); 179 void *opfunc = get_call_destination(type);
172 unsigned ret; 180 unsigned ret;
173 181
174 if (opfunc == NULL) 182 if (opfunc == NULL)
175 /* If there's no function, patch it with a ud2a (BUG) */ 183 /* If there's no function, patch it with a ud2a (BUG) */
176 ret = paravirt_patch_insns(insnbuf, len, start_ud2a, end_ud2a); 184 ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
177 else if (opfunc == paravirt_nop) 185 else if (opfunc == paravirt_nop)
178 /* If the operation is a nop, then nop the callsite */ 186 /* If the operation is a nop, then nop the callsite */
179 ret = paravirt_patch_nop(); 187 ret = paravirt_patch_nop();
180 else if (type == PARAVIRT_PATCH(iret) || 188 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
181 type == PARAVIRT_PATCH(irq_enable_sysexit)) 189 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit))
182 /* If operation requires a jmp, then jmp */ 190 /* If operation requires a jmp, then jmp */
183 ret = paravirt_patch_jmp(opfunc, insnbuf, addr, len); 191 ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
184 else 192 else
185 /* Otherwise call the function; assume target could 193 /* Otherwise call the function; assume target could
186 clobber any caller-save reg */ 194 clobber any caller-save reg */
@@ -205,7 +213,7 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
205 213
206void init_IRQ(void) 214void init_IRQ(void)
207{ 215{
208 paravirt_ops.init_IRQ(); 216 pv_irq_ops.init_IRQ();
209} 217}
210 218
211static void native_flush_tlb(void) 219static void native_flush_tlb(void)
@@ -233,7 +241,7 @@ extern void native_irq_enable_sysexit(void);
233 241
234static int __init print_banner(void) 242static int __init print_banner(void)
235{ 243{
236 paravirt_ops.banner(); 244 pv_init_ops.banner();
237 return 0; 245 return 0;
238} 246}
239core_initcall(print_banner); 247core_initcall(print_banner);
@@ -273,47 +281,96 @@ int paravirt_disable_iospace(void)
273 return ret; 281 return ret;
274} 282}
275 283
276struct paravirt_ops paravirt_ops = { 284static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
285
286static inline void enter_lazy(enum paravirt_lazy_mode mode)
287{
288 BUG_ON(x86_read_percpu(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
289 BUG_ON(preemptible());
290
291 x86_write_percpu(paravirt_lazy_mode, mode);
292}
293
294void paravirt_leave_lazy(enum paravirt_lazy_mode mode)
295{
296 BUG_ON(x86_read_percpu(paravirt_lazy_mode) != mode);
297 BUG_ON(preemptible());
298
299 x86_write_percpu(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
300}
301
302void paravirt_enter_lazy_mmu(void)
303{
304 enter_lazy(PARAVIRT_LAZY_MMU);
305}
306
307void paravirt_leave_lazy_mmu(void)
308{
309 paravirt_leave_lazy(PARAVIRT_LAZY_MMU);
310}
311
312void paravirt_enter_lazy_cpu(void)
313{
314 enter_lazy(PARAVIRT_LAZY_CPU);
315}
316
317void paravirt_leave_lazy_cpu(void)
318{
319 paravirt_leave_lazy(PARAVIRT_LAZY_CPU);
320}
321
322enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
323{
324 return x86_read_percpu(paravirt_lazy_mode);
325}
326
327struct pv_info pv_info = {
277 .name = "bare hardware", 328 .name = "bare hardware",
278 .paravirt_enabled = 0, 329 .paravirt_enabled = 0,
279 .kernel_rpl = 0, 330 .kernel_rpl = 0,
280 .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ 331 .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
332};
281 333
282 .patch = native_patch, 334struct pv_init_ops pv_init_ops = {
335 .patch = native_patch,
283 .banner = default_banner, 336 .banner = default_banner,
284 .arch_setup = paravirt_nop, 337 .arch_setup = paravirt_nop,
285 .memory_setup = machine_specific_memory_setup, 338 .memory_setup = machine_specific_memory_setup,
339};
340
341struct pv_time_ops pv_time_ops = {
342 .time_init = hpet_time_init,
286 .get_wallclock = native_get_wallclock, 343 .get_wallclock = native_get_wallclock,
287 .set_wallclock = native_set_wallclock, 344 .set_wallclock = native_set_wallclock,
288 .time_init = hpet_time_init, 345 .sched_clock = native_sched_clock,
346 .get_cpu_khz = native_calculate_cpu_khz,
347};
348
349struct pv_irq_ops pv_irq_ops = {
289 .init_IRQ = native_init_IRQ, 350 .init_IRQ = native_init_IRQ,
351 .save_fl = native_save_fl,
352 .restore_fl = native_restore_fl,
353 .irq_disable = native_irq_disable,
354 .irq_enable = native_irq_enable,
355 .safe_halt = native_safe_halt,
356 .halt = native_halt,
357};
290 358
359struct pv_cpu_ops pv_cpu_ops = {
291 .cpuid = native_cpuid, 360 .cpuid = native_cpuid,
292 .get_debugreg = native_get_debugreg, 361 .get_debugreg = native_get_debugreg,
293 .set_debugreg = native_set_debugreg, 362 .set_debugreg = native_set_debugreg,
294 .clts = native_clts, 363 .clts = native_clts,
295 .read_cr0 = native_read_cr0, 364 .read_cr0 = native_read_cr0,
296 .write_cr0 = native_write_cr0, 365 .write_cr0 = native_write_cr0,
297 .read_cr2 = native_read_cr2,
298 .write_cr2 = native_write_cr2,
299 .read_cr3 = native_read_cr3,
300 .write_cr3 = native_write_cr3,
301 .read_cr4 = native_read_cr4, 366 .read_cr4 = native_read_cr4,
302 .read_cr4_safe = native_read_cr4_safe, 367 .read_cr4_safe = native_read_cr4_safe,
303 .write_cr4 = native_write_cr4, 368 .write_cr4 = native_write_cr4,
304 .save_fl = native_save_fl,
305 .restore_fl = native_restore_fl,
306 .irq_disable = native_irq_disable,
307 .irq_enable = native_irq_enable,
308 .safe_halt = native_safe_halt,
309 .halt = native_halt,
310 .wbinvd = native_wbinvd, 369 .wbinvd = native_wbinvd,
311 .read_msr = native_read_msr_safe, 370 .read_msr = native_read_msr_safe,
312 .write_msr = native_write_msr_safe, 371 .write_msr = native_write_msr_safe,
313 .read_tsc = native_read_tsc, 372 .read_tsc = native_read_tsc,
314 .read_pmc = native_read_pmc, 373 .read_pmc = native_read_pmc,
315 .sched_clock = native_sched_clock,
316 .get_cpu_khz = native_calculate_cpu_khz,
317 .load_tr_desc = native_load_tr_desc, 374 .load_tr_desc = native_load_tr_desc,
318 .set_ldt = native_set_ldt, 375 .set_ldt = native_set_ldt,
319 .load_gdt = native_load_gdt, 376 .load_gdt = native_load_gdt,
@@ -327,9 +384,19 @@ struct paravirt_ops paravirt_ops = {
327 .write_idt_entry = write_dt_entry, 384 .write_idt_entry = write_dt_entry,
328 .load_esp0 = native_load_esp0, 385 .load_esp0 = native_load_esp0,
329 386
387 .irq_enable_sysexit = native_irq_enable_sysexit,
388 .iret = native_iret,
389
330 .set_iopl_mask = native_set_iopl_mask, 390 .set_iopl_mask = native_set_iopl_mask,
331 .io_delay = native_io_delay, 391 .io_delay = native_io_delay,
332 392
393 .lazy_mode = {
394 .enter = paravirt_nop,
395 .leave = paravirt_nop,
396 },
397};
398
399struct pv_apic_ops pv_apic_ops = {
333#ifdef CONFIG_X86_LOCAL_APIC 400#ifdef CONFIG_X86_LOCAL_APIC
334 .apic_write = native_apic_write, 401 .apic_write = native_apic_write,
335 .apic_write_atomic = native_apic_write_atomic, 402 .apic_write_atomic = native_apic_write_atomic,
@@ -338,11 +405,17 @@ struct paravirt_ops paravirt_ops = {
338 .setup_secondary_clock = setup_secondary_APIC_clock, 405 .setup_secondary_clock = setup_secondary_APIC_clock,
339 .startup_ipi_hook = paravirt_nop, 406 .startup_ipi_hook = paravirt_nop,
340#endif 407#endif
341 .set_lazy_mode = paravirt_nop, 408};
342 409
410struct pv_mmu_ops pv_mmu_ops = {
343 .pagetable_setup_start = native_pagetable_setup_start, 411 .pagetable_setup_start = native_pagetable_setup_start,
344 .pagetable_setup_done = native_pagetable_setup_done, 412 .pagetable_setup_done = native_pagetable_setup_done,
345 413
414 .read_cr2 = native_read_cr2,
415 .write_cr2 = native_write_cr2,
416 .read_cr3 = native_read_cr3,
417 .write_cr3 = native_write_cr3,
418
346 .flush_tlb_user = native_flush_tlb, 419 .flush_tlb_user = native_flush_tlb,
347 .flush_tlb_kernel = native_flush_tlb_global, 420 .flush_tlb_kernel = native_flush_tlb_global,
348 .flush_tlb_single = native_flush_tlb_single, 421 .flush_tlb_single = native_flush_tlb_single,
@@ -381,12 +454,19 @@ struct paravirt_ops paravirt_ops = {
381 .make_pte = native_make_pte, 454 .make_pte = native_make_pte,
382 .make_pgd = native_make_pgd, 455 .make_pgd = native_make_pgd,
383 456
384 .irq_enable_sysexit = native_irq_enable_sysexit,
385 .iret = native_iret,
386
387 .dup_mmap = paravirt_nop, 457 .dup_mmap = paravirt_nop,
388 .exit_mmap = paravirt_nop, 458 .exit_mmap = paravirt_nop,
389 .activate_mm = paravirt_nop, 459 .activate_mm = paravirt_nop,
460
461 .lazy_mode = {
462 .enter = paravirt_nop,
463 .leave = paravirt_nop,
464 },
390}; 465};
391 466
392EXPORT_SYMBOL(paravirt_ops); 467EXPORT_SYMBOL_GPL(pv_time_ops);
468EXPORT_SYMBOL_GPL(pv_cpu_ops);
469EXPORT_SYMBOL_GPL(pv_mmu_ops);
470EXPORT_SYMBOL_GPL(pv_apic_ops);
471EXPORT_SYMBOL_GPL(pv_info);
472EXPORT_SYMBOL (pv_irq_ops);
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 18673e0f193b..f02bad68abaa 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -134,21 +134,21 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
134 unsigned long eip, unsigned len) 134 unsigned long eip, unsigned len)
135{ 135{
136 switch (type) { 136 switch (type) {
137 case PARAVIRT_PATCH(irq_disable): 137 case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
138 return patch_internal(VMI_CALL_DisableInterrupts, len, 138 return patch_internal(VMI_CALL_DisableInterrupts, len,
139 insns, eip); 139 insns, eip);
140 case PARAVIRT_PATCH(irq_enable): 140 case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
141 return patch_internal(VMI_CALL_EnableInterrupts, len, 141 return patch_internal(VMI_CALL_EnableInterrupts, len,
142 insns, eip); 142 insns, eip);
143 case PARAVIRT_PATCH(restore_fl): 143 case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
144 return patch_internal(VMI_CALL_SetInterruptMask, len, 144 return patch_internal(VMI_CALL_SetInterruptMask, len,
145 insns, eip); 145 insns, eip);
146 case PARAVIRT_PATCH(save_fl): 146 case PARAVIRT_PATCH(pv_irq_ops.save_fl):
147 return patch_internal(VMI_CALL_GetInterruptMask, len, 147 return patch_internal(VMI_CALL_GetInterruptMask, len,
148 insns, eip); 148 insns, eip);
149 case PARAVIRT_PATCH(iret): 149 case PARAVIRT_PATCH(pv_cpu_ops.iret):
150 return patch_internal(VMI_CALL_IRET, len, insns, eip); 150 return patch_internal(VMI_CALL_IRET, len, insns, eip);
151 case PARAVIRT_PATCH(irq_enable_sysexit): 151 case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
152 return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip); 152 return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip);
153 default: 153 default:
154 break; 154 break;
@@ -552,24 +552,22 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
552} 552}
553#endif 553#endif
554 554
555static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode) 555static void vmi_enter_lazy_cpu(void)
556{ 556{
557 static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode); 557 paravirt_enter_lazy_cpu();
558 558 vmi_ops.set_lazy_mode(2);
559 if (!vmi_ops.set_lazy_mode) 559}
560 return;
561 560
562 /* Modes should never nest or overlap */ 561static void vmi_enter_lazy_mmu(void)
563 BUG_ON(__get_cpu_var(lazy_mode) && !(mode == PARAVIRT_LAZY_NONE || 562{
564 mode == PARAVIRT_LAZY_FLUSH)); 563 paravirt_enter_lazy_mmu();
564 vmi_ops.set_lazy_mode(1);
565}
565 566
566 if (mode == PARAVIRT_LAZY_FLUSH) { 567static void vmi_leave_lazy(void)
567 vmi_ops.set_lazy_mode(0); 568{
568 vmi_ops.set_lazy_mode(__get_cpu_var(lazy_mode)); 569 paravirt_leave_lazy(paravirt_get_lazy_mode());
569 } else { 570 vmi_ops.set_lazy_mode(0);
570 vmi_ops.set_lazy_mode(mode);
571 __get_cpu_var(lazy_mode) = mode;
572 }
573} 571}
574 572
575static inline int __init check_vmi_rom(struct vrom_header *rom) 573static inline int __init check_vmi_rom(struct vrom_header *rom)
@@ -690,9 +688,9 @@ do { \
690 reloc = call_vrom_long_func(vmi_rom, get_reloc, \ 688 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
691 VMI_CALL_##vmicall); \ 689 VMI_CALL_##vmicall); \
692 if (rel->type == VMI_RELOCATION_CALL_REL) \ 690 if (rel->type == VMI_RELOCATION_CALL_REL) \
693 paravirt_ops.opname = (void *)rel->eip; \ 691 opname = (void *)rel->eip; \
694 else if (rel->type == VMI_RELOCATION_NOP) \ 692 else if (rel->type == VMI_RELOCATION_NOP) \
695 paravirt_ops.opname = (void *)vmi_nop; \ 693 opname = (void *)vmi_nop; \
696 else if (rel->type != VMI_RELOCATION_NONE) \ 694 else if (rel->type != VMI_RELOCATION_NONE) \
697 printk(KERN_WARNING "VMI: Unknown relocation " \ 695 printk(KERN_WARNING "VMI: Unknown relocation " \
698 "type %d for " #vmicall"\n",\ 696 "type %d for " #vmicall"\n",\
@@ -712,7 +710,7 @@ do { \
712 VMI_CALL_##vmicall); \ 710 VMI_CALL_##vmicall); \
713 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \ 711 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \
714 if (rel->type == VMI_RELOCATION_CALL_REL) { \ 712 if (rel->type == VMI_RELOCATION_CALL_REL) { \
715 paravirt_ops.opname = wrapper; \ 713 opname = wrapper; \
716 vmi_ops.cache = (void *)rel->eip; \ 714 vmi_ops.cache = (void *)rel->eip; \
717 } \ 715 } \
718} while (0) 716} while (0)
@@ -732,11 +730,11 @@ static inline int __init activate_vmi(void)
732 } 730 }
733 savesegment(cs, kernel_cs); 731 savesegment(cs, kernel_cs);
734 732
735 paravirt_ops.paravirt_enabled = 1; 733 pv_info.paravirt_enabled = 1;
736 paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; 734 pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
735 pv_info.name = "vmi";
737 736
738 paravirt_ops.patch = vmi_patch; 737 pv_init_ops.patch = vmi_patch;
739 paravirt_ops.name = "vmi";
740 738
741 /* 739 /*
742 * Many of these operations are ABI compatible with VMI. 740 * Many of these operations are ABI compatible with VMI.
@@ -754,26 +752,26 @@ static inline int __init activate_vmi(void)
754 */ 752 */
755 753
756 /* CPUID is special, so very special it gets wrapped like a present */ 754 /* CPUID is special, so very special it gets wrapped like a present */
757 para_wrap(cpuid, vmi_cpuid, cpuid, CPUID); 755 para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID);
758 756
759 para_fill(clts, CLTS); 757 para_fill(pv_cpu_ops.clts, CLTS);
760 para_fill(get_debugreg, GetDR); 758 para_fill(pv_cpu_ops.get_debugreg, GetDR);
761 para_fill(set_debugreg, SetDR); 759 para_fill(pv_cpu_ops.set_debugreg, SetDR);
762 para_fill(read_cr0, GetCR0); 760 para_fill(pv_cpu_ops.read_cr0, GetCR0);
763 para_fill(read_cr2, GetCR2); 761 para_fill(pv_mmu_ops.read_cr2, GetCR2);
764 para_fill(read_cr3, GetCR3); 762 para_fill(pv_mmu_ops.read_cr3, GetCR3);
765 para_fill(read_cr4, GetCR4); 763 para_fill(pv_cpu_ops.read_cr4, GetCR4);
766 para_fill(write_cr0, SetCR0); 764 para_fill(pv_cpu_ops.write_cr0, SetCR0);
767 para_fill(write_cr2, SetCR2); 765 para_fill(pv_mmu_ops.write_cr2, SetCR2);
768 para_fill(write_cr3, SetCR3); 766 para_fill(pv_mmu_ops.write_cr3, SetCR3);
769 para_fill(write_cr4, SetCR4); 767 para_fill(pv_cpu_ops.write_cr4, SetCR4);
770 para_fill(save_fl, GetInterruptMask); 768 para_fill(pv_irq_ops.save_fl, GetInterruptMask);
771 para_fill(restore_fl, SetInterruptMask); 769 para_fill(pv_irq_ops.restore_fl, SetInterruptMask);
772 para_fill(irq_disable, DisableInterrupts); 770 para_fill(pv_irq_ops.irq_disable, DisableInterrupts);
773 para_fill(irq_enable, EnableInterrupts); 771 para_fill(pv_irq_ops.irq_enable, EnableInterrupts);
774 772
775 para_fill(wbinvd, WBINVD); 773 para_fill(pv_cpu_ops.wbinvd, WBINVD);
776 para_fill(read_tsc, RDTSC); 774 para_fill(pv_cpu_ops.read_tsc, RDTSC);
777 775
778 /* The following we emulate with trap and emulate for now */ 776 /* The following we emulate with trap and emulate for now */
779 /* paravirt_ops.read_msr = vmi_rdmsr */ 777 /* paravirt_ops.read_msr = vmi_rdmsr */
@@ -781,29 +779,38 @@ static inline int __init activate_vmi(void)
781 /* paravirt_ops.rdpmc = vmi_rdpmc */ 779 /* paravirt_ops.rdpmc = vmi_rdpmc */
782 780
783 /* TR interface doesn't pass TR value, wrap */ 781 /* TR interface doesn't pass TR value, wrap */
784 para_wrap(load_tr_desc, vmi_set_tr, set_tr, SetTR); 782 para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR);
785 783
786 /* LDT is special, too */ 784 /* LDT is special, too */
787 para_wrap(set_ldt, vmi_set_ldt, _set_ldt, SetLDT); 785 para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
788 786
789 para_fill(load_gdt, SetGDT); 787 para_fill(pv_cpu_ops.load_gdt, SetGDT);
790 para_fill(load_idt, SetIDT); 788 para_fill(pv_cpu_ops.load_idt, SetIDT);
791 para_fill(store_gdt, GetGDT); 789 para_fill(pv_cpu_ops.store_gdt, GetGDT);
792 para_fill(store_idt, GetIDT); 790 para_fill(pv_cpu_ops.store_idt, GetIDT);
793 para_fill(store_tr, GetTR); 791 para_fill(pv_cpu_ops.store_tr, GetTR);
794 paravirt_ops.load_tls = vmi_load_tls; 792 pv_cpu_ops.load_tls = vmi_load_tls;
795 para_fill(write_ldt_entry, WriteLDTEntry); 793 para_fill(pv_cpu_ops.write_ldt_entry, WriteLDTEntry);
796 para_fill(write_gdt_entry, WriteGDTEntry); 794 para_fill(pv_cpu_ops.write_gdt_entry, WriteGDTEntry);
797 para_fill(write_idt_entry, WriteIDTEntry); 795 para_fill(pv_cpu_ops.write_idt_entry, WriteIDTEntry);
798 para_wrap(load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack); 796 para_wrap(pv_cpu_ops.load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack);
799 para_fill(set_iopl_mask, SetIOPLMask); 797 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
800 para_fill(io_delay, IODelay); 798 para_fill(pv_cpu_ops.io_delay, IODelay);
801 para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode); 799
800 para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu,
801 set_lazy_mode, SetLazyMode);
802 para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy,
803 set_lazy_mode, SetLazyMode);
804
805 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
806 set_lazy_mode, SetLazyMode);
807 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy,
808 set_lazy_mode, SetLazyMode);
802 809
803 /* user and kernel flush are just handled with different flags to FlushTLB */ 810 /* user and kernel flush are just handled with different flags to FlushTLB */
804 para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB); 811 para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
805 para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB); 812 para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
806 para_fill(flush_tlb_single, InvalPage); 813 para_fill(pv_mmu_ops.flush_tlb_single, InvalPage);
807 814
808 /* 815 /*
809 * Until a standard flag format can be agreed on, we need to 816 * Until a standard flag format can be agreed on, we need to
@@ -819,41 +826,41 @@ static inline int __init activate_vmi(void)
819#endif 826#endif
820 827
821 if (vmi_ops.set_pte) { 828 if (vmi_ops.set_pte) {
822 paravirt_ops.set_pte = vmi_set_pte; 829 pv_mmu_ops.set_pte = vmi_set_pte;
823 paravirt_ops.set_pte_at = vmi_set_pte_at; 830 pv_mmu_ops.set_pte_at = vmi_set_pte_at;
824 paravirt_ops.set_pmd = vmi_set_pmd; 831 pv_mmu_ops.set_pmd = vmi_set_pmd;
825#ifdef CONFIG_X86_PAE 832#ifdef CONFIG_X86_PAE
826 paravirt_ops.set_pte_atomic = vmi_set_pte_atomic; 833 pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
827 paravirt_ops.set_pte_present = vmi_set_pte_present; 834 pv_mmu_ops.set_pte_present = vmi_set_pte_present;
828 paravirt_ops.set_pud = vmi_set_pud; 835 pv_mmu_ops.set_pud = vmi_set_pud;
829 paravirt_ops.pte_clear = vmi_pte_clear; 836 pv_mmu_ops.pte_clear = vmi_pte_clear;
830 paravirt_ops.pmd_clear = vmi_pmd_clear; 837 pv_mmu_ops.pmd_clear = vmi_pmd_clear;
831#endif 838#endif
832 } 839 }
833 840
834 if (vmi_ops.update_pte) { 841 if (vmi_ops.update_pte) {
835 paravirt_ops.pte_update = vmi_update_pte; 842 pv_mmu_ops.pte_update = vmi_update_pte;
836 paravirt_ops.pte_update_defer = vmi_update_pte_defer; 843 pv_mmu_ops.pte_update_defer = vmi_update_pte_defer;
837 } 844 }
838 845
839 vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); 846 vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
840 if (vmi_ops.allocate_page) { 847 if (vmi_ops.allocate_page) {
841 paravirt_ops.alloc_pt = vmi_allocate_pt; 848 pv_mmu_ops.alloc_pt = vmi_allocate_pt;
842 paravirt_ops.alloc_pd = vmi_allocate_pd; 849 pv_mmu_ops.alloc_pd = vmi_allocate_pd;
843 paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone; 850 pv_mmu_ops.alloc_pd_clone = vmi_allocate_pd_clone;
844 } 851 }
845 852
846 vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); 853 vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
847 if (vmi_ops.release_page) { 854 if (vmi_ops.release_page) {
848 paravirt_ops.release_pt = vmi_release_pt; 855 pv_mmu_ops.release_pt = vmi_release_pt;
849 paravirt_ops.release_pd = vmi_release_pd; 856 pv_mmu_ops.release_pd = vmi_release_pd;
850 } 857 }
851 858
852 /* Set linear is needed in all cases */ 859 /* Set linear is needed in all cases */
853 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); 860 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
854#ifdef CONFIG_HIGHPTE 861#ifdef CONFIG_HIGHPTE
855 if (vmi_ops.set_linear_mapping) 862 if (vmi_ops.set_linear_mapping)
856 paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; 863 pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
857#endif 864#endif
858 865
859 /* 866 /*
@@ -863,17 +870,17 @@ static inline int __init activate_vmi(void)
863 * the backend. They are performance critical anyway, so requiring 870 * the backend. They are performance critical anyway, so requiring
864 * a patch is not a big problem. 871 * a patch is not a big problem.
865 */ 872 */
866 paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0; 873 pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
867 paravirt_ops.iret = (void *)0xbadbab0; 874 pv_cpu_ops.iret = (void *)0xbadbab0;
868 875
869#ifdef CONFIG_SMP 876#ifdef CONFIG_SMP
870 para_wrap(startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState); 877 para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
871#endif 878#endif
872 879
873#ifdef CONFIG_X86_LOCAL_APIC 880#ifdef CONFIG_X86_LOCAL_APIC
874 para_fill(apic_read, APICRead); 881 para_fill(pv_apic_ops.apic_read, APICRead);
875 para_fill(apic_write, APICWrite); 882 para_fill(pv_apic_ops.apic_write, APICWrite);
876 para_fill(apic_write_atomic, APICWrite); 883 para_fill(pv_apic_ops.apic_write_atomic, APICWrite);
877#endif 884#endif
878 885
879 /* 886 /*
@@ -891,15 +898,15 @@ static inline int __init activate_vmi(void)
891 vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); 898 vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
892 vmi_timer_ops.cancel_alarm = 899 vmi_timer_ops.cancel_alarm =
893 vmi_get_function(VMI_CALL_CancelAlarm); 900 vmi_get_function(VMI_CALL_CancelAlarm);
894 paravirt_ops.time_init = vmi_time_init; 901 pv_time_ops.time_init = vmi_time_init;
895 paravirt_ops.get_wallclock = vmi_get_wallclock; 902 pv_time_ops.get_wallclock = vmi_get_wallclock;
896 paravirt_ops.set_wallclock = vmi_set_wallclock; 903 pv_time_ops.set_wallclock = vmi_set_wallclock;
897#ifdef CONFIG_X86_LOCAL_APIC 904#ifdef CONFIG_X86_LOCAL_APIC
898 paravirt_ops.setup_boot_clock = vmi_time_bsp_init; 905 pv_apic_ops.setup_boot_clock = vmi_time_bsp_init;
899 paravirt_ops.setup_secondary_clock = vmi_time_ap_init; 906 pv_apic_ops.setup_secondary_clock = vmi_time_ap_init;
900#endif 907#endif
901 paravirt_ops.sched_clock = vmi_sched_clock; 908 pv_time_ops.sched_clock = vmi_sched_clock;
902 paravirt_ops.get_cpu_khz = vmi_cpu_khz; 909 pv_time_ops.get_cpu_khz = vmi_cpu_khz;
903 910
904 /* We have true wallclock functions; disable CMOS clock sync */ 911 /* We have true wallclock functions; disable CMOS clock sync */
905 no_sync_cmos_clock = 1; 912 no_sync_cmos_clock = 1;
@@ -908,7 +915,7 @@ static inline int __init activate_vmi(void)
908 disable_vmi_timer = 1; 915 disable_vmi_timer = 1;
909 } 916 }
910 917
911 para_fill(safe_halt, Halt); 918 para_fill(pv_irq_ops.safe_halt, Halt);
912 919
913 /* 920 /*
914 * Alternative instruction rewriting doesn't happen soon enough 921 * Alternative instruction rewriting doesn't happen soon enough
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index dda4e83649a0..33d367a3432e 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -741,24 +741,12 @@ struct kmem_cache *pmd_cache;
741 741
742void __init pgtable_cache_init(void) 742void __init pgtable_cache_init(void)
743{ 743{
744 size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t); 744 if (PTRS_PER_PMD > 1)
745
746 if (PTRS_PER_PMD > 1) {
747 pmd_cache = kmem_cache_create("pmd", 745 pmd_cache = kmem_cache_create("pmd",
748 PTRS_PER_PMD*sizeof(pmd_t), 746 PTRS_PER_PMD*sizeof(pmd_t),
749 PTRS_PER_PMD*sizeof(pmd_t), 747 PTRS_PER_PMD*sizeof(pmd_t),
750 SLAB_PANIC, 748 SLAB_PANIC,
751 pmd_ctor); 749 pmd_ctor);
752 if (!SHARED_KERNEL_PMD) {
753 /* If we're in PAE mode and have a non-shared
754 kernel pmd, then the pgd size must be a
755 page size. This is because the pgd_list
756 links through the page structure, so there
757 can only be one pgd per page for this to
758 work. */
759 pgd_size = PAGE_SIZE;
760 }
761 }
762} 750}
763 751
764/* 752/*
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 493a083f6886..94c39aaf695f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -25,7 +25,6 @@
25#include <linux/mm.h> 25#include <linux/mm.h>
26#include <linux/page-flags.h> 26#include <linux/page-flags.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/smp.h>
29 28
30#include <xen/interface/xen.h> 29#include <xen/interface/xen.h>
31#include <xen/interface/physdev.h> 30#include <xen/interface/physdev.h>
@@ -52,11 +51,25 @@
52 51
53EXPORT_SYMBOL_GPL(hypercall_page); 52EXPORT_SYMBOL_GPL(hypercall_page);
54 53
55DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
56
57DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); 54DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); 55DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
59DEFINE_PER_CPU(unsigned long, xen_cr3); 56
57/*
58 * Note about cr3 (pagetable base) values:
59 *
60 * xen_cr3 contains the current logical cr3 value; it contains the
61 * last set cr3. This may not be the current effective cr3, because
62 * its update may be being lazily deferred. However, a vcpu looking
63 * at its own cr3 can use this value knowing that it everything will
64 * be self-consistent.
65 *
66 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
67 * hypercall to set the vcpu cr3 is complete (so it may be a little
68 * out of date, but it will never be set early). If one vcpu is
69 * looking at another vcpu's cr3 value, it should use this variable.
70 */
71DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
72DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
60 73
61struct start_info *xen_start_info; 74struct start_info *xen_start_info;
62EXPORT_SYMBOL_GPL(xen_start_info); 75EXPORT_SYMBOL_GPL(xen_start_info);
@@ -100,7 +113,7 @@ static void __init xen_vcpu_setup(int cpu)
100 info.mfn = virt_to_mfn(vcpup); 113 info.mfn = virt_to_mfn(vcpup);
101 info.offset = offset_in_page(vcpup); 114 info.offset = offset_in_page(vcpup);
102 115
103 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n", 116 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
104 cpu, vcpup, info.mfn, info.offset); 117 cpu, vcpup, info.mfn, info.offset);
105 118
106 /* Check to see if the hypervisor will put the vcpu_info 119 /* Check to see if the hypervisor will put the vcpu_info
@@ -124,7 +137,7 @@ static void __init xen_vcpu_setup(int cpu)
124static void __init xen_banner(void) 137static void __init xen_banner(void)
125{ 138{
126 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 139 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
127 paravirt_ops.name); 140 pv_info.name);
128 printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); 141 printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
129} 142}
130 143
@@ -249,29 +262,10 @@ static void xen_halt(void)
249 xen_safe_halt(); 262 xen_safe_halt();
250} 263}
251 264
252static void xen_set_lazy_mode(enum paravirt_lazy_mode mode) 265static void xen_leave_lazy(void)
253{ 266{
254 BUG_ON(preemptible()); 267 paravirt_leave_lazy(paravirt_get_lazy_mode());
255
256 switch (mode) {
257 case PARAVIRT_LAZY_NONE:
258 BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
259 break;
260
261 case PARAVIRT_LAZY_MMU:
262 case PARAVIRT_LAZY_CPU:
263 BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
264 break;
265
266 case PARAVIRT_LAZY_FLUSH:
267 /* flush if necessary, but don't change state */
268 if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
269 xen_mc_flush();
270 return;
271 }
272
273 xen_mc_flush(); 268 xen_mc_flush();
274 x86_write_percpu(xen_lazy_mode, mode);
275} 269}
276 270
277static unsigned long xen_store_tr(void) 271static unsigned long xen_store_tr(void)
@@ -358,7 +352,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
358 * loaded properly. This will go away as soon as Xen has been 352 * loaded properly. This will go away as soon as Xen has been
359 * modified to not save/restore %gs for normal hypercalls. 353 * modified to not save/restore %gs for normal hypercalls.
360 */ 354 */
361 if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU) 355 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
362 loadsegment(gs, 0); 356 loadsegment(gs, 0);
363} 357}
364 358
@@ -632,32 +626,36 @@ static unsigned long xen_read_cr3(void)
632 return x86_read_percpu(xen_cr3); 626 return x86_read_percpu(xen_cr3);
633} 627}
634 628
629static void set_current_cr3(void *v)
630{
631 x86_write_percpu(xen_current_cr3, (unsigned long)v);
632}
633
635static void xen_write_cr3(unsigned long cr3) 634static void xen_write_cr3(unsigned long cr3)
636{ 635{
636 struct mmuext_op *op;
637 struct multicall_space mcs;
638 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
639
637 BUG_ON(preemptible()); 640 BUG_ON(preemptible());
638 641
639 if (cr3 == x86_read_percpu(xen_cr3)) { 642 mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */
640 /* just a simple tlb flush */
641 xen_flush_tlb();
642 return;
643 }
644 643
644 /* Update while interrupts are disabled, so its atomic with
645 respect to ipis */
645 x86_write_percpu(xen_cr3, cr3); 646 x86_write_percpu(xen_cr3, cr3);
646 647
648 op = mcs.args;
649 op->cmd = MMUEXT_NEW_BASEPTR;
650 op->arg1.mfn = mfn;
647 651
648 { 652 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
649 struct mmuext_op *op;
650 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
651 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
652
653 op = mcs.args;
654 op->cmd = MMUEXT_NEW_BASEPTR;
655 op->arg1.mfn = mfn;
656 653
657 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 654 /* Update xen_update_cr3 once the batch has actually
655 been submitted. */
656 xen_mc_callback(set_current_cr3, (void *)cr3);
658 657
659 xen_mc_issue(PARAVIRT_LAZY_CPU); 658 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
660 }
661} 659}
662 660
663/* Early in boot, while setting up the initial pagetable, assume 661/* Early in boot, while setting up the initial pagetable, assume
@@ -668,6 +666,15 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
668 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 666 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
669} 667}
670 668
669static void pin_pagetable_pfn(unsigned level, unsigned long pfn)
670{
671 struct mmuext_op op;
672 op.cmd = level;
673 op.arg1.mfn = pfn_to_mfn(pfn);
674 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
675 BUG();
676}
677
671/* This needs to make sure the new pte page is pinned iff its being 678/* This needs to make sure the new pte page is pinned iff its being
672 attached to a pinned pagetable. */ 679 attached to a pinned pagetable. */
673static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) 680static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
@@ -677,9 +684,10 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
677 if (PagePinned(virt_to_page(mm->pgd))) { 684 if (PagePinned(virt_to_page(mm->pgd))) {
678 SetPagePinned(page); 685 SetPagePinned(page);
679 686
680 if (!PageHighMem(page)) 687 if (!PageHighMem(page)) {
681 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 688 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
682 else 689 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
690 } else
683 /* make sure there are no stray mappings of 691 /* make sure there are no stray mappings of
684 this page */ 692 this page */
685 kmap_flush_unused(); 693 kmap_flush_unused();
@@ -692,8 +700,10 @@ static void xen_release_pt(u32 pfn)
692 struct page *page = pfn_to_page(pfn); 700 struct page *page = pfn_to_page(pfn);
693 701
694 if (PagePinned(page)) { 702 if (PagePinned(page)) {
695 if (!PageHighMem(page)) 703 if (!PageHighMem(page)) {
704 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
696 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 705 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
706 }
697 } 707 }
698} 708}
699 709
@@ -738,7 +748,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
738 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; 748 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
739 749
740 /* special set_pte for pagetable initialization */ 750 /* special set_pte for pagetable initialization */
741 paravirt_ops.set_pte = xen_set_pte_init; 751 pv_mmu_ops.set_pte = xen_set_pte_init;
742 752
743 init_mm.pgd = base; 753 init_mm.pgd = base;
744 /* 754 /*
@@ -785,8 +795,8 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
785{ 795{
786 /* This will work as long as patching hasn't happened yet 796 /* This will work as long as patching hasn't happened yet
787 (which it hasn't) */ 797 (which it hasn't) */
788 paravirt_ops.alloc_pt = xen_alloc_pt; 798 pv_mmu_ops.alloc_pt = xen_alloc_pt;
789 paravirt_ops.set_pte = xen_set_pte; 799 pv_mmu_ops.set_pte = xen_set_pte;
790 800
791 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 801 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
792 /* 802 /*
@@ -808,15 +818,15 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
808 /* Actually pin the pagetable down, but we can't set PG_pinned 818 /* Actually pin the pagetable down, but we can't set PG_pinned
809 yet because the page structures don't exist yet. */ 819 yet because the page structures don't exist yet. */
810 { 820 {
811 struct mmuext_op op; 821 unsigned level;
822
812#ifdef CONFIG_X86_PAE 823#ifdef CONFIG_X86_PAE
813 op.cmd = MMUEXT_PIN_L3_TABLE; 824 level = MMUEXT_PIN_L3_TABLE;
814#else 825#else
815 op.cmd = MMUEXT_PIN_L3_TABLE; 826 level = MMUEXT_PIN_L2_TABLE;
816#endif 827#endif
817 op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base))); 828
818 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) 829 pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
819 BUG();
820 } 830 }
821} 831}
822 832
@@ -833,12 +843,12 @@ void __init xen_setup_vcpu_info_placement(void)
833 if (have_vcpu_info_placement) { 843 if (have_vcpu_info_placement) {
834 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 844 printk(KERN_INFO "Xen: using vcpu_info placement\n");
835 845
836 paravirt_ops.save_fl = xen_save_fl_direct; 846 pv_irq_ops.save_fl = xen_save_fl_direct;
837 paravirt_ops.restore_fl = xen_restore_fl_direct; 847 pv_irq_ops.restore_fl = xen_restore_fl_direct;
838 paravirt_ops.irq_disable = xen_irq_disable_direct; 848 pv_irq_ops.irq_disable = xen_irq_disable_direct;
839 paravirt_ops.irq_enable = xen_irq_enable_direct; 849 pv_irq_ops.irq_enable = xen_irq_enable_direct;
840 paravirt_ops.read_cr2 = xen_read_cr2_direct; 850 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
841 paravirt_ops.iret = xen_iret_direct; 851 pv_cpu_ops.iret = xen_iret_direct;
842 } 852 }
843} 853}
844 854
@@ -850,8 +860,8 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
850 860
851 start = end = reloc = NULL; 861 start = end = reloc = NULL;
852 862
853#define SITE(x) \ 863#define SITE(op, x) \
854 case PARAVIRT_PATCH(x): \ 864 case PARAVIRT_PATCH(op.x): \
855 if (have_vcpu_info_placement) { \ 865 if (have_vcpu_info_placement) { \
856 start = (char *)xen_##x##_direct; \ 866 start = (char *)xen_##x##_direct; \
857 end = xen_##x##_direct_end; \ 867 end = xen_##x##_direct_end; \
@@ -860,10 +870,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
860 goto patch_site 870 goto patch_site
861 871
862 switch (type) { 872 switch (type) {
863 SITE(irq_enable); 873 SITE(pv_irq_ops, irq_enable);
864 SITE(irq_disable); 874 SITE(pv_irq_ops, irq_disable);
865 SITE(save_fl); 875 SITE(pv_irq_ops, save_fl);
866 SITE(restore_fl); 876 SITE(pv_irq_ops, restore_fl);
867#undef SITE 877#undef SITE
868 878
869 patch_site: 879 patch_site:
@@ -895,26 +905,32 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
895 return ret; 905 return ret;
896} 906}
897 907
898static const struct paravirt_ops xen_paravirt_ops __initdata = { 908static const struct pv_info xen_info __initdata = {
899 .paravirt_enabled = 1, 909 .paravirt_enabled = 1,
900 .shared_kernel_pmd = 0, 910 .shared_kernel_pmd = 0,
901 911
902 .name = "Xen", 912 .name = "Xen",
903 .banner = xen_banner, 913};
904 914
915static const struct pv_init_ops xen_init_ops __initdata = {
905 .patch = xen_patch, 916 .patch = xen_patch,
906 917
918 .banner = xen_banner,
907 .memory_setup = xen_memory_setup, 919 .memory_setup = xen_memory_setup,
908 .arch_setup = xen_arch_setup, 920 .arch_setup = xen_arch_setup,
909 .init_IRQ = xen_init_IRQ,
910 .post_allocator_init = xen_mark_init_mm_pinned, 921 .post_allocator_init = xen_mark_init_mm_pinned,
922};
911 923
924static const struct pv_time_ops xen_time_ops __initdata = {
912 .time_init = xen_time_init, 925 .time_init = xen_time_init,
926
913 .set_wallclock = xen_set_wallclock, 927 .set_wallclock = xen_set_wallclock,
914 .get_wallclock = xen_get_wallclock, 928 .get_wallclock = xen_get_wallclock,
915 .get_cpu_khz = xen_cpu_khz, 929 .get_cpu_khz = xen_cpu_khz,
916 .sched_clock = xen_sched_clock, 930 .sched_clock = xen_sched_clock,
931};
917 932
933static const struct pv_cpu_ops xen_cpu_ops __initdata = {
918 .cpuid = xen_cpuid, 934 .cpuid = xen_cpuid,
919 935
920 .set_debugreg = xen_set_debugreg, 936 .set_debugreg = xen_set_debugreg,
@@ -925,22 +941,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
925 .read_cr0 = native_read_cr0, 941 .read_cr0 = native_read_cr0,
926 .write_cr0 = native_write_cr0, 942 .write_cr0 = native_write_cr0,
927 943
928 .read_cr2 = xen_read_cr2,
929 .write_cr2 = xen_write_cr2,
930
931 .read_cr3 = xen_read_cr3,
932 .write_cr3 = xen_write_cr3,
933
934 .read_cr4 = native_read_cr4, 944 .read_cr4 = native_read_cr4,
935 .read_cr4_safe = native_read_cr4_safe, 945 .read_cr4_safe = native_read_cr4_safe,
936 .write_cr4 = xen_write_cr4, 946 .write_cr4 = xen_write_cr4,
937 947
938 .save_fl = xen_save_fl,
939 .restore_fl = xen_restore_fl,
940 .irq_disable = xen_irq_disable,
941 .irq_enable = xen_irq_enable,
942 .safe_halt = xen_safe_halt,
943 .halt = xen_halt,
944 .wbinvd = native_wbinvd, 948 .wbinvd = native_wbinvd,
945 949
946 .read_msr = native_read_msr_safe, 950 .read_msr = native_read_msr_safe,
@@ -969,6 +973,23 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
969 .set_iopl_mask = xen_set_iopl_mask, 973 .set_iopl_mask = xen_set_iopl_mask,
970 .io_delay = xen_io_delay, 974 .io_delay = xen_io_delay,
971 975
976 .lazy_mode = {
977 .enter = paravirt_enter_lazy_cpu,
978 .leave = xen_leave_lazy,
979 },
980};
981
982static const struct pv_irq_ops xen_irq_ops __initdata = {
983 .init_IRQ = xen_init_IRQ,
984 .save_fl = xen_save_fl,
985 .restore_fl = xen_restore_fl,
986 .irq_disable = xen_irq_disable,
987 .irq_enable = xen_irq_enable,
988 .safe_halt = xen_safe_halt,
989 .halt = xen_halt,
990};
991
992static const struct pv_apic_ops xen_apic_ops __initdata = {
972#ifdef CONFIG_X86_LOCAL_APIC 993#ifdef CONFIG_X86_LOCAL_APIC
973 .apic_write = xen_apic_write, 994 .apic_write = xen_apic_write,
974 .apic_write_atomic = xen_apic_write, 995 .apic_write_atomic = xen_apic_write,
@@ -977,6 +998,17 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
977 .setup_secondary_clock = paravirt_nop, 998 .setup_secondary_clock = paravirt_nop,
978 .startup_ipi_hook = paravirt_nop, 999 .startup_ipi_hook = paravirt_nop,
979#endif 1000#endif
1001};
1002
1003static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1004 .pagetable_setup_start = xen_pagetable_setup_start,
1005 .pagetable_setup_done = xen_pagetable_setup_done,
1006
1007 .read_cr2 = xen_read_cr2,
1008 .write_cr2 = xen_write_cr2,
1009
1010 .read_cr3 = xen_read_cr3,
1011 .write_cr3 = xen_write_cr3,
980 1012
981 .flush_tlb_user = xen_flush_tlb, 1013 .flush_tlb_user = xen_flush_tlb,
982 .flush_tlb_kernel = xen_flush_tlb, 1014 .flush_tlb_kernel = xen_flush_tlb,
@@ -986,9 +1018,6 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
986 .pte_update = paravirt_nop, 1018 .pte_update = paravirt_nop,
987 .pte_update_defer = paravirt_nop, 1019 .pte_update_defer = paravirt_nop,
988 1020
989 .pagetable_setup_start = xen_pagetable_setup_start,
990 .pagetable_setup_done = xen_pagetable_setup_done,
991
992 .alloc_pt = xen_alloc_pt_init, 1021 .alloc_pt = xen_alloc_pt_init,
993 .release_pt = xen_release_pt, 1022 .release_pt = xen_release_pt,
994 .alloc_pd = paravirt_nop, 1023 .alloc_pd = paravirt_nop,
@@ -1024,7 +1053,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
1024 .dup_mmap = xen_dup_mmap, 1053 .dup_mmap = xen_dup_mmap,
1025 .exit_mmap = xen_exit_mmap, 1054 .exit_mmap = xen_exit_mmap,
1026 1055
1027 .set_lazy_mode = xen_set_lazy_mode, 1056 .lazy_mode = {
1057 .enter = paravirt_enter_lazy_mmu,
1058 .leave = xen_leave_lazy,
1059 },
1028}; 1060};
1029 1061
1030#ifdef CONFIG_SMP 1062#ifdef CONFIG_SMP
@@ -1080,6 +1112,17 @@ static const struct machine_ops __initdata xen_machine_ops = {
1080}; 1112};
1081 1113
1082 1114
1115static void __init xen_reserve_top(void)
1116{
1117 unsigned long top = HYPERVISOR_VIRT_START;
1118 struct xen_platform_parameters pp;
1119
1120 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1121 top = pp.virt_start;
1122
1123 reserve_top_address(-top + 2 * PAGE_SIZE);
1124}
1125
1083/* First C function to be called on Xen boot */ 1126/* First C function to be called on Xen boot */
1084asmlinkage void __init xen_start_kernel(void) 1127asmlinkage void __init xen_start_kernel(void)
1085{ 1128{
@@ -1091,7 +1134,14 @@ asmlinkage void __init xen_start_kernel(void)
1091 BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0); 1134 BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
1092 1135
1093 /* Install Xen paravirt ops */ 1136 /* Install Xen paravirt ops */
1094 paravirt_ops = xen_paravirt_ops; 1137 pv_info = xen_info;
1138 pv_init_ops = xen_init_ops;
1139 pv_time_ops = xen_time_ops;
1140 pv_cpu_ops = xen_cpu_ops;
1141 pv_irq_ops = xen_irq_ops;
1142 pv_apic_ops = xen_apic_ops;
1143 pv_mmu_ops = xen_mmu_ops;
1144
1095 machine_ops = xen_machine_ops; 1145 machine_ops = xen_machine_ops;
1096 1146
1097#ifdef CONFIG_SMP 1147#ifdef CONFIG_SMP
@@ -1113,6 +1163,7 @@ asmlinkage void __init xen_start_kernel(void)
1113 /* keep using Xen gdt for now; no urgent need to change it */ 1163 /* keep using Xen gdt for now; no urgent need to change it */
1114 1164
1115 x86_write_percpu(xen_cr3, __pa(pgd)); 1165 x86_write_percpu(xen_cr3, __pa(pgd));
1166 x86_write_percpu(xen_current_cr3, __pa(pgd));
1116 1167
1117#ifdef CONFIG_SMP 1168#ifdef CONFIG_SMP
1118 /* Don't do the full vcpu_info placement stuff until we have a 1169 /* Don't do the full vcpu_info placement stuff until we have a
@@ -1124,12 +1175,12 @@ asmlinkage void __init xen_start_kernel(void)
1124 xen_setup_vcpu_info_placement(); 1175 xen_setup_vcpu_info_placement();
1125#endif 1176#endif
1126 1177
1127 paravirt_ops.kernel_rpl = 1; 1178 pv_info.kernel_rpl = 1;
1128 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1179 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1129 paravirt_ops.kernel_rpl = 0; 1180 pv_info.kernel_rpl = 0;
1130 1181
1131 /* set the limit of our address space */ 1182 /* set the limit of our address space */
1132 reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE); 1183 xen_reserve_top();
1133 1184
1134 /* set up basic CPUID stuff */ 1185 /* set up basic CPUID stuff */
1135 cpu_detect(&new_cpu_data); 1186 cpu_detect(&new_cpu_data);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 874db0cd1d2a..b2e32f9d0071 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -41,7 +41,6 @@
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/bug.h> 43#include <linux/bug.h>
44#include <linux/sched.h>
45 44
46#include <asm/pgtable.h> 45#include <asm/pgtable.h>
47#include <asm/tlbflush.h> 46#include <asm/tlbflush.h>
@@ -155,7 +154,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
155 pte_t *ptep, pte_t pteval) 154 pte_t *ptep, pte_t pteval)
156{ 155{
157 if (mm == current->mm || mm == &init_mm) { 156 if (mm == current->mm || mm == &init_mm) {
158 if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 157 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
159 struct multicall_space mcs; 158 struct multicall_space mcs;
160 mcs = xen_mc_entry(0); 159 mcs = xen_mc_entry(0);
161 160
@@ -304,7 +303,12 @@ pgd_t xen_make_pgd(unsigned long pgd)
304} 303}
305#endif /* CONFIG_X86_PAE */ 304#endif /* CONFIG_X86_PAE */
306 305
307 306enum pt_level {
307 PT_PGD,
308 PT_PUD,
309 PT_PMD,
310 PT_PTE
311};
308 312
309/* 313/*
310 (Yet another) pagetable walker. This one is intended for pinning a 314 (Yet another) pagetable walker. This one is intended for pinning a
@@ -316,7 +320,7 @@ pgd_t xen_make_pgd(unsigned long pgd)
316 FIXADDR_TOP. But the important bit is that we don't pin beyond 320 FIXADDR_TOP. But the important bit is that we don't pin beyond
317 there, because then we start getting into Xen's ptes. 321 there, because then we start getting into Xen's ptes.
318*/ 322*/
319static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), 323static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
320 unsigned long limit) 324 unsigned long limit)
321{ 325{
322 pgd_t *pgd = pgd_base; 326 pgd_t *pgd = pgd_base;
@@ -341,7 +345,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
341 pud = pud_offset(pgd, 0); 345 pud = pud_offset(pgd, 0);
342 346
343 if (PTRS_PER_PUD > 1) /* not folded */ 347 if (PTRS_PER_PUD > 1) /* not folded */
344 flush |= (*func)(virt_to_page(pud), 0); 348 flush |= (*func)(virt_to_page(pud), PT_PUD);
345 349
346 for (; addr != pud_limit; pud++, addr = pud_next) { 350 for (; addr != pud_limit; pud++, addr = pud_next) {
347 pmd_t *pmd; 351 pmd_t *pmd;
@@ -360,7 +364,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
360 pmd = pmd_offset(pud, 0); 364 pmd = pmd_offset(pud, 0);
361 365
362 if (PTRS_PER_PMD > 1) /* not folded */ 366 if (PTRS_PER_PMD > 1) /* not folded */
363 flush |= (*func)(virt_to_page(pmd), 0); 367 flush |= (*func)(virt_to_page(pmd), PT_PMD);
364 368
365 for (; addr != pmd_limit; pmd++) { 369 for (; addr != pmd_limit; pmd++) {
366 addr += (PAGE_SIZE * PTRS_PER_PTE); 370 addr += (PAGE_SIZE * PTRS_PER_PTE);
@@ -372,17 +376,47 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
372 if (pmd_none(*pmd)) 376 if (pmd_none(*pmd))
373 continue; 377 continue;
374 378
375 flush |= (*func)(pmd_page(*pmd), 0); 379 flush |= (*func)(pmd_page(*pmd), PT_PTE);
376 } 380 }
377 } 381 }
378 } 382 }
379 383
380 flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH); 384 flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
381 385
382 return flush; 386 return flush;
383} 387}
384 388
385static int pin_page(struct page *page, unsigned flags) 389static spinlock_t *lock_pte(struct page *page)
390{
391 spinlock_t *ptl = NULL;
392
393#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
394 ptl = __pte_lockptr(page);
395 spin_lock(ptl);
396#endif
397
398 return ptl;
399}
400
401static void do_unlock(void *v)
402{
403 spinlock_t *ptl = v;
404 spin_unlock(ptl);
405}
406
407static void xen_do_pin(unsigned level, unsigned long pfn)
408{
409 struct mmuext_op *op;
410 struct multicall_space mcs;
411
412 mcs = __xen_mc_entry(sizeof(*op));
413 op = mcs.args;
414 op->cmd = level;
415 op->arg1.mfn = pfn_to_mfn(pfn);
416 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
417}
418
419static int pin_page(struct page *page, enum pt_level level)
386{ 420{
387 unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags); 421 unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
388 int flush; 422 int flush;
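
The new enum pt_level tags each pagetable page with its level as the walker hands it to the callback, so pin_page() can pick the matching pin hypercall and only take the pte lock on leaf pages. A toy standalone walk in plain C, included only to illustrate the level-tagging idea; the arrays and the visit() callback are made up, not the kernel walker:

/* Toy illustration of the level-tagged walk: visit each "pagetable
 * page" with an explicit pt_level so the callback can decide what to
 * do with it (only the pgd asks for a TLB flush here).  Plain arrays
 * stand in for real pagetables. */
#include <stdio.h>

enum pt_level { PT_PGD, PT_PMD, PT_PTE };

static int visit(int page, enum pt_level level)
{
        static const char *name[] = { "pgd", "pmd", "pte" };

        printf("pin page %d as %s\n", page, name[level]);
        return level == PT_PGD;         /* only the top level needs a flush */
}

int main(void)
{
        int pmd_pages[] = { 1, 2 };
        int pte_pages[] = { 3, 4, 5 };
        int flush = 0;
        unsigned i;

        for (i = 0; i < sizeof(pmd_pages) / sizeof(*pmd_pages); i++)
                flush |= visit(pmd_pages[i], PT_PMD);
        for (i = 0; i < sizeof(pte_pages) / sizeof(*pte_pages); i++)
                flush |= visit(pte_pages[i], PT_PTE);
        flush |= visit(0, PT_PGD);      /* the pgd page itself comes last */

        printf("flush needed: %d\n", flush);
        return 0;
}
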
@@ -397,12 +431,26 @@ static int pin_page(struct page *page, unsigned flags)
397 void *pt = lowmem_page_address(page); 431 void *pt = lowmem_page_address(page);
398 unsigned long pfn = page_to_pfn(page); 432 unsigned long pfn = page_to_pfn(page);
399 struct multicall_space mcs = __xen_mc_entry(0); 433 struct multicall_space mcs = __xen_mc_entry(0);
434 spinlock_t *ptl;
400 435
401 flush = 0; 436 flush = 0;
402 437
438 ptl = NULL;
439 if (level == PT_PTE)
440 ptl = lock_pte(page);
441
403 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 442 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
404 pfn_pte(pfn, PAGE_KERNEL_RO), 443 pfn_pte(pfn, PAGE_KERNEL_RO),
405 flags); 444 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
445
446 if (level == PT_PTE)
447 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
448
449 if (ptl) {
450 /* Queue a deferred unlock for when this batch
451 is completed. */
452 xen_mc_callback(do_unlock, ptl);
453 }
406 } 454 }
407 455
408 return flush; 456 return flush;
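
pin_page() now takes the split pte lock before queueing the read-only remap and the pin hypercall, and hands the unlock to xen_mc_callback() so it only runs once the whole multicall batch has been flushed. A minimal userspace sketch of that lock-now, unlock-at-flush pattern; a pthread mutex stands in for the pte lock and a small array stands in for the multicall machinery (build with -pthread, all names here are illustrative):

/* Minimal model of "lock now, unlock when the batch is flushed".
 * Not kernel code: a pthread mutex plays the role of the pte lock and
 * a plain array plays the role of the multicall callback queue. */
#include <pthread.h>

#define BATCH_MAX 8

struct callback {
        void (*fn)(void *);
        void *data;
};

static struct callback callbacks[BATCH_MAX];
static unsigned cbidx;

static void do_unlock(void *v)                  /* deferred unlock */
{
        pthread_mutex_unlock(v);
}

static void batch_callback(void (*fn)(void *), void *data)
{
        callbacks[cbidx].fn = fn;               /* queued, not run yet */
        callbacks[cbidx].data = data;
        cbidx++;
}

static void batch_flush(void)
{
        unsigned i;

        /* ...submit the queued "hypercalls" here... */
        for (i = 0; i < cbidx; i++)             /* then run deferred callbacks */
                callbacks[i].fn(callbacks[i].data);
        cbidx = 0;
}

int main(void)
{
        pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;

        pthread_mutex_lock(&ptl);               /* like lock_pte() before queueing */
        /* queue the remap and pin operations for this page here */
        batch_callback(do_unlock, &ptl);        /* unlock only once the batch runs */
        batch_flush();
        return 0;
}
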
@@ -413,8 +461,7 @@ static int pin_page(struct page *page, unsigned flags)
413 read-only, and can be pinned. */ 461 read-only, and can be pinned. */
414void xen_pgd_pin(pgd_t *pgd) 462void xen_pgd_pin(pgd_t *pgd)
415{ 463{
416 struct multicall_space mcs; 464 unsigned level;
417 struct mmuext_op *op;
418 465
419 xen_mc_batch(); 466 xen_mc_batch();
420 467
@@ -425,16 +472,13 @@ void xen_pgd_pin(pgd_t *pgd)
425 xen_mc_batch(); 472 xen_mc_batch();
426 } 473 }
427 474
428 mcs = __xen_mc_entry(sizeof(*op));
429 op = mcs.args;
430
431#ifdef CONFIG_X86_PAE 475#ifdef CONFIG_X86_PAE
432 op->cmd = MMUEXT_PIN_L3_TABLE; 476 level = MMUEXT_PIN_L3_TABLE;
433#else 477#else
434 op->cmd = MMUEXT_PIN_L2_TABLE; 478 level = MMUEXT_PIN_L2_TABLE;
435#endif 479#endif
436 op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); 480
437 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 481 xen_do_pin(level, PFN_DOWN(__pa(pgd)));
438 482
439 xen_mc_issue(0); 483 xen_mc_issue(0);
440} 484}
@@ -442,7 +486,7 @@ void xen_pgd_pin(pgd_t *pgd)
442/* The init_mm pagetable is really pinned as soon as it's created, but 486/* The init_mm pagetable is really pinned as soon as it's created, but
443 that's before we have page structures to store the bits. So do all 487 that's before we have page structures to store the bits. So do all
444 the book-keeping now. */ 488 the book-keeping now. */
445static __init int mark_pinned(struct page *page, unsigned flags) 489static __init int mark_pinned(struct page *page, enum pt_level level)
446{ 490{
447 SetPagePinned(page); 491 SetPagePinned(page);
448 return 0; 492 return 0;
@@ -453,18 +497,32 @@ void __init xen_mark_init_mm_pinned(void)
453 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); 497 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
454} 498}
455 499
456static int unpin_page(struct page *page, unsigned flags) 500static int unpin_page(struct page *page, enum pt_level level)
457{ 501{
458 unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); 502 unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
459 503
460 if (pgfl && !PageHighMem(page)) { 504 if (pgfl && !PageHighMem(page)) {
461 void *pt = lowmem_page_address(page); 505 void *pt = lowmem_page_address(page);
462 unsigned long pfn = page_to_pfn(page); 506 unsigned long pfn = page_to_pfn(page);
463 struct multicall_space mcs = __xen_mc_entry(0); 507 spinlock_t *ptl = NULL;
508 struct multicall_space mcs;
509
510 if (level == PT_PTE) {
511 ptl = lock_pte(page);
512
513 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
514 }
515
516 mcs = __xen_mc_entry(0);
464 517
465 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 518 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
466 pfn_pte(pfn, PAGE_KERNEL), 519 pfn_pte(pfn, PAGE_KERNEL),
467 flags); 520 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
521
522 if (ptl) {
523 /* unlock when batch completed */
524 xen_mc_callback(do_unlock, ptl);
525 }
468 } 526 }
469 527
470 return 0; /* never need to flush on unpin */ 528 return 0; /* never need to flush on unpin */
@@ -473,18 +531,9 @@ static int unpin_page(struct page *page, unsigned flags)
473/* Release a pagetable's pages back as normal RW */ 531/* Release a pagetable's pages back as normal RW */
474static void xen_pgd_unpin(pgd_t *pgd) 532static void xen_pgd_unpin(pgd_t *pgd)
475{ 533{
476 struct mmuext_op *op;
477 struct multicall_space mcs;
478
479 xen_mc_batch(); 534 xen_mc_batch();
480 535
481 mcs = __xen_mc_entry(sizeof(*op)); 536 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
482
483 op = mcs.args;
484 op->cmd = MMUEXT_UNPIN_TABLE;
485 op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
486
487 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
488 537
489 pgd_walk(pgd, unpin_page, TASK_SIZE); 538 pgd_walk(pgd, unpin_page, TASK_SIZE);
490 539
@@ -515,20 +564,43 @@ static void drop_other_mm_ref(void *info)
515 564
516 if (__get_cpu_var(cpu_tlbstate).active_mm == mm) 565 if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
517 leave_mm(smp_processor_id()); 566 leave_mm(smp_processor_id());
567
568 /* If this cpu still has a stale cr3 reference, then make sure
569 it has been flushed. */
570 if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
571 load_cr3(swapper_pg_dir);
572 arch_flush_lazy_cpu_mode();
573 }
518} 574}
519 575
520static void drop_mm_ref(struct mm_struct *mm) 576static void drop_mm_ref(struct mm_struct *mm)
521{ 577{
578 cpumask_t mask;
579 unsigned cpu;
580
522 if (current->active_mm == mm) { 581 if (current->active_mm == mm) {
523 if (current->mm == mm) 582 if (current->mm == mm)
524 load_cr3(swapper_pg_dir); 583 load_cr3(swapper_pg_dir);
525 else 584 else
526 leave_mm(smp_processor_id()); 585 leave_mm(smp_processor_id());
586 arch_flush_lazy_cpu_mode();
527 } 587 }
528 588
529 if (!cpus_empty(mm->cpu_vm_mask)) 589 /* Get the "official" set of cpus referring to our pagetable. */
530 xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref, 590 mask = mm->cpu_vm_mask;
531 mm, 1); 591
592 /* It's possible that a vcpu may have a stale reference to our
 593 cr3, because it's in lazy mode and hasn't yet flushed
 594 its set of pending hypercalls. In this case, we can
595 look at its actual current cr3 value, and force it to flush
596 if needed. */
597 for_each_online_cpu(cpu) {
598 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
599 cpu_set(cpu, mask);
600 }
601
602 if (!cpus_empty(mask))
603 xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
532} 604}
533#else 605#else
534static void drop_mm_ref(struct mm_struct *mm) 606static void drop_mm_ref(struct mm_struct *mm)
@@ -563,5 +635,6 @@ void xen_exit_mmap(struct mm_struct *mm)
563 /* pgd may not be pinned in the error exit path of execve */ 635 /* pgd may not be pinned in the error exit path of execve */
564 if (PagePinned(virt_to_page(mm->pgd))) 636 if (PagePinned(virt_to_page(mm->pgd)))
565 xen_pgd_unpin(mm->pgd); 637 xen_pgd_unpin(mm->pgd);
638
566 spin_unlock(&mm->page_table_lock); 639 spin_unlock(&mm->page_table_lock);
567} 640}
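
The reworked drop_mm_ref() widens the set of cpus that get the drop_other_mm_ref() callback: besides mm->cpu_vm_mask it also includes any cpu whose per-cpu xen_current_cr3 still points at the dying pagetable, since a vcpu sitting in lazy mode may not yet have issued the batched hypercall that switches cr3. A rough standalone model of that mask-building step; the arrays and the bitmask stand in for the per-cpu variables and cpumask helpers:

/* Simplified model of the extra scan in drop_mm_ref(): any cpu whose
 * current cr3 still references the dying pagetable is added to the IPI
 * mask, even if it has already left the mm's cpu mask.  Plain C, not
 * the kernel interfaces. */
#include <stdio.h>
#include <stdint.h>

#define NCPUS 4

int main(void)
{
        uint64_t pgd_pa = 0x1000;               /* pagetable being torn down */
        uint64_t xen_current_cr3[NCPUS] = { 0x1000, 0x2000, 0x1000, 0x3000 };
        unsigned long mask = 0x1;               /* mm->cpu_vm_mask analogue */
        int cpu;

        for (cpu = 0; cpu < NCPUS; cpu++)
                if (xen_current_cr3[cpu] == pgd_pa)
                        mask |= 1UL << cpu;     /* stale reference: add the cpu */

        for (cpu = 0; cpu < NCPUS; cpu++)
                if (mask & (1UL << cpu))
                        printf("cpu %d must drop its reference\n", cpu);
        return 0;
}
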
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index c837e8e463db..5e6f36f6d876 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -26,13 +26,22 @@
26 26
27#include "multicalls.h" 27#include "multicalls.h"
28 28
29#define MC_DEBUG 1
30
29#define MC_BATCH 32 31#define MC_BATCH 32
30#define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) 32#define MC_ARGS (MC_BATCH * 16 / sizeof(u64))
31 33
32struct mc_buffer { 34struct mc_buffer {
33 struct multicall_entry entries[MC_BATCH]; 35 struct multicall_entry entries[MC_BATCH];
36#if MC_DEBUG
37 struct multicall_entry debug[MC_BATCH];
38#endif
34 u64 args[MC_ARGS]; 39 u64 args[MC_ARGS];
35 unsigned mcidx, argidx; 40 struct callback {
41 void (*fn)(void *);
42 void *data;
43 } callbacks[MC_BATCH];
44 unsigned mcidx, argidx, cbidx;
36}; 45};
37 46
38static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); 47static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
@@ -43,6 +52,7 @@ void xen_mc_flush(void)
43 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 52 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
44 int ret = 0; 53 int ret = 0;
45 unsigned long flags; 54 unsigned long flags;
55 int i;
46 56
47 BUG_ON(preemptible()); 57 BUG_ON(preemptible());
48 58
@@ -51,13 +61,31 @@ void xen_mc_flush(void)
51 local_irq_save(flags); 61 local_irq_save(flags);
52 62
53 if (b->mcidx) { 63 if (b->mcidx) {
54 int i; 64#if MC_DEBUG
65 memcpy(b->debug, b->entries,
66 b->mcidx * sizeof(struct multicall_entry));
67#endif
55 68
56 if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0) 69 if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
57 BUG(); 70 BUG();
58 for (i = 0; i < b->mcidx; i++) 71 for (i = 0; i < b->mcidx; i++)
59 if (b->entries[i].result < 0) 72 if (b->entries[i].result < 0)
60 ret++; 73 ret++;
74
75#if MC_DEBUG
76 if (ret) {
77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
78 ret, smp_processor_id());
79 for(i = 0; i < b->mcidx; i++) {
80 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
81 i+1, b->mcidx,
82 b->debug[i].op,
83 b->debug[i].args[0],
84 b->entries[i].result);
85 }
86 }
87#endif
88
61 b->mcidx = 0; 89 b->mcidx = 0;
62 b->argidx = 0; 90 b->argidx = 0;
63 } else 91 } else
@@ -65,6 +93,13 @@ void xen_mc_flush(void)
65 93
66 local_irq_restore(flags); 94 local_irq_restore(flags);
67 95
96 for(i = 0; i < b->cbidx; i++) {
97 struct callback *cb = &b->callbacks[i];
98
99 (*cb->fn)(cb->data);
100 }
101 b->cbidx = 0;
102
68 BUG_ON(ret); 103 BUG_ON(ret);
69} 104}
70 105
@@ -88,3 +123,16 @@ struct multicall_space __xen_mc_entry(size_t args)
88 123
89 return ret; 124 return ret;
90} 125}
126
127void xen_mc_callback(void (*fn)(void *), void *data)
128{
129 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
130 struct callback *cb;
131
132 if (b->cbidx == MC_BATCH)
133 xen_mc_flush();
134
135 cb = &b->callbacks[b->cbidx++];
136 cb->fn = fn;
137 cb->data = data;
138}
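
Under MC_DEBUG, xen_mc_flush() now keeps a copy of the batch taken before it is handed to the hypervisor, so failed entries can be reported with the op and arguments they were queued with even if the hypervisor has overwritten them. A self-contained model of that snapshot-and-report idea in ordinary C; fake_hypervisor_multicall() and the cut-down entry layout are stand-ins, not the Xen interface:

/* Standalone model of the MC_DEBUG path: snapshot the batch before
 * issuing it, then report any failing entries using the saved copy.
 * The "hypervisor" here is a local function that clobbers the args. */
#include <stdio.h>
#include <string.h>

#define MC_BATCH 4

struct multicall_entry {
        unsigned long op;
        unsigned long args[2];
        long result;
};

static struct multicall_entry entries[MC_BATCH], debug[MC_BATCH];
static unsigned mcidx;

static void fake_hypervisor_multicall(struct multicall_entry *e, unsigned n)
{
        unsigned i;

        for (i = 0; i < n; i++) {
                e[i].result = (e[i].op == 0) ? -22 : 0; /* op 0 "fails" */
                e[i].args[0] = 0;                       /* args get clobbered */
        }
}

int main(void)
{
        unsigned i;
        int ret = 0;

        entries[mcidx++] = (struct multicall_entry){ .op = 1, .args = { 0x10 } };
        entries[mcidx++] = (struct multicall_entry){ .op = 0, .args = { 0x20 } };

        memcpy(debug, entries, mcidx * sizeof(entries[0]));     /* snapshot */
        fake_hypervisor_multicall(entries, mcidx);

        for (i = 0; i < mcidx; i++)
                if (entries[i].result < 0)
                        ret++;

        if (ret)
                for (i = 0; i < mcidx; i++)
                        printf("call %u/%u: op=%lu arg=[%lx] result=%ld\n",
                               i + 1, mcidx, debug[i].op, debug[i].args[0],
                               entries[i].result);
        return 0;
}
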
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index e6f7530b156c..8bae996d99a3 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -35,11 +35,14 @@ void xen_mc_flush(void);
35/* Issue a multicall if we're not in a lazy mode */ 35/* Issue a multicall if we're not in a lazy mode */
36static inline void xen_mc_issue(unsigned mode) 36static inline void xen_mc_issue(unsigned mode)
37{ 37{
38 if ((xen_get_lazy_mode() & mode) == 0) 38 if ((paravirt_get_lazy_mode() & mode) == 0)
39 xen_mc_flush(); 39 xen_mc_flush();
40 40
41 /* restore flags saved in xen_mc_batch */ 41 /* restore flags saved in xen_mc_batch */
42 local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); 42 local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
43} 43}
44 44
45/* Set up a callback to be called when the current batch is flushed */
46void xen_mc_callback(void (*fn)(void *), void *data);
47
45#endif /* _XEN_MULTICALLS_H */ 48#endif /* _XEN_MULTICALLS_H */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 4fa33c27ccb6..d53bf9d8a72d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -370,7 +370,8 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
370 void *info, int wait) 370 void *info, int wait)
371{ 371{
372 struct call_data_struct data; 372 struct call_data_struct data;
373 int cpus; 373 int cpus, cpu;
374 bool yield;
374 375
375 /* Holding any lock stops cpus from going down. */ 376 /* Holding any lock stops cpus from going down. */
376 spin_lock(&call_lock); 377 spin_lock(&call_lock);
@@ -399,9 +400,14 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
399 /* Send a message to other CPUs and wait for them to respond */ 400 /* Send a message to other CPUs and wait for them to respond */
400 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); 401 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
401 402
402 /* Make sure other vcpus get a chance to run. 403 /* Make sure other vcpus get a chance to run if they need to. */
403 XXX too severe? Maybe we should check the other CPU's states? */ 404 yield = false;
404 HYPERVISOR_sched_op(SCHEDOP_yield, 0); 405 for_each_cpu_mask(cpu, mask)
406 if (xen_vcpu_stolen(cpu))
407 yield = true;
408
409 if (yield)
410 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
405 411
406 /* Wait for response */ 412 /* Wait for response */
407 while (atomic_read(&data.started) != cpus || 413 while (atomic_read(&data.started) != cpus ||
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index dfd6db69ead5..d083ff5ef088 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -105,6 +105,12 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
105 } while (get64(&state->state_entry_time) != state_time); 105 } while (get64(&state->state_entry_time) != state_time);
106} 106}
107 107
108/* return true when a vcpu could run but has no real cpu to run on */
109bool xen_vcpu_stolen(int vcpu)
110{
111 return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
112}
113
108static void setup_runstate_info(int cpu) 114static void setup_runstate_info(int cpu)
109{ 115{
110 struct vcpu_register_runstate_memory_area area; 116 struct vcpu_register_runstate_memory_area area;
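
xen_vcpu_stolen() reports a vcpu that is runnable but currently has no physical cpu under it, and the smp.c hunk above uses it to yield only when at least one IPI target is in that state instead of yielding unconditionally. A small standalone model of that decision; the runstate table and the target mask are invented for illustration:

/* Model of the "yield only if a target vcpu is runnable but not
 * actually running" check; the runstate values stand in for Xen's. */
#include <stdbool.h>
#include <stdio.h>

enum runstate { RUNSTATE_running, RUNSTATE_runnable, RUNSTATE_blocked };

#define NCPUS 4

static enum runstate runstate[NCPUS] = {
        RUNSTATE_running, RUNSTATE_runnable, RUNSTATE_blocked, RUNSTATE_running
};

static bool vcpu_stolen(int vcpu)       /* mirrors xen_vcpu_stolen() */
{
        return runstate[vcpu] == RUNSTATE_runnable;
}

int main(void)
{
        unsigned long mask = 0x6;       /* IPI targets: cpus 1 and 2 */
        bool yield = false;
        int cpu;

        for (cpu = 0; cpu < NCPUS; cpu++)
                if ((mask & (1UL << cpu)) && vcpu_stolen(cpu))
                        yield = true;

        puts(yield ? "yield to the hypervisor" : "no yield needed");
        return 0;
}
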
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index b9aaea45f07f..b02a909bfd4c 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -11,6 +11,7 @@ void xen_copy_trap_info(struct trap_info *traps);
11 11
12DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); 12DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
13DECLARE_PER_CPU(unsigned long, xen_cr3); 13DECLARE_PER_CPU(unsigned long, xen_cr3);
14DECLARE_PER_CPU(unsigned long, xen_current_cr3);
14 15
15extern struct start_info *xen_start_info; 16extern struct start_info *xen_start_info;
16extern struct shared_info *HYPERVISOR_shared_info; 17extern struct shared_info *HYPERVISOR_shared_info;
@@ -27,14 +28,9 @@ unsigned long xen_get_wallclock(void);
27int xen_set_wallclock(unsigned long time); 28int xen_set_wallclock(unsigned long time);
28unsigned long long xen_sched_clock(void); 29unsigned long long xen_sched_clock(void);
29 30
30void xen_mark_init_mm_pinned(void); 31bool xen_vcpu_stolen(int vcpu);
31
32DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
33 32
34static inline unsigned xen_get_lazy_mode(void) 33void xen_mark_init_mm_pinned(void);
35{
36 return x86_read_percpu(xen_lazy_mode);
37}
38 34
39void __init xen_fill_possible_map(void); 35void __init xen_fill_possible_map(void);
40 36
diff --git a/drivers/char/hvc_lguest.c b/drivers/char/hvc_lguest.c
index 3d6bd0baa56d..efccb2155830 100644
--- a/drivers/char/hvc_lguest.c
+++ b/drivers/char/hvc_lguest.c
@@ -115,7 +115,7 @@ static struct hv_ops lguest_cons = {
115 * (0), and the struct hv_ops containing the put_chars() function. */ 115 * (0), and the struct hv_ops containing the put_chars() function. */
116static int __init cons_init(void) 116static int __init cons_init(void)
117{ 117{
118 if (strcmp(paravirt_ops.name, "lguest") != 0) 118 if (strcmp(pv_info.name, "lguest") != 0)
119 return 0; 119 return 0;
120 120
121 return hvc_instantiate(0, 0, &lguest_cons); 121 return hvc_instantiate(0, 0, &lguest_cons);
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 4a315f08a567..a0788c12b392 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -248,8 +248,8 @@ static void unmap_switcher(void)
248} 248}
249 249
250/*H:130 Our Guest is usually so well behaved; it never tries to do things it 250/*H:130 Our Guest is usually so well behaved; it never tries to do things it
251 * isn't allowed to. Unfortunately, "struct paravirt_ops" isn't quite 251 * isn't allowed to. Unfortunately, Linux's paravirtual infrastructure isn't
252 * complete, because it doesn't contain replacements for the Intel I/O 252 * quite complete, because it doesn't contain replacements for the Intel I/O
253 * instructions. As a result, the Guest sometimes fumbles across one during 253 * instructions. As a result, the Guest sometimes fumbles across one during
254 * the boot process as it probes for various things which are usually attached 254 * the boot process as it probes for various things which are usually attached
255 * to a PC. 255 * to a PC.
@@ -694,7 +694,7 @@ static int __init init(void)
694 694
695 /* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */ 695 /* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */
696 if (paravirt_enabled()) { 696 if (paravirt_enabled()) {
697 printk("lguest is afraid of %s\n", paravirt_ops.name); 697 printk("lguest is afraid of %s\n", pv_info.name);
698 return -EPERM; 698 return -EPERM;
699 } 699 }
700 700
diff --git a/drivers/lguest/lguest.c b/drivers/lguest/lguest.c
index 4a579c840301..3ba337dde857 100644
--- a/drivers/lguest/lguest.c
+++ b/drivers/lguest/lguest.c
@@ -23,7 +23,7 @@
23 * 23 *
24 * So how does the kernel know it's a Guest? The Guest starts at a special 24 * So how does the kernel know it's a Guest? The Guest starts at a special
25 * entry point marked with a magic string, which sets up a few things then 25 * entry point marked with a magic string, which sets up a few things then
 26 * calls here. We replace the native functions in "struct paravirt_ops" 26 * calls here. We replace the native functions in the various "paravirt" structures
27 * with our Guest versions, then boot like normal. :*/ 27 * with our Guest versions, then boot like normal. :*/
28 28
29/* 29/*
@@ -97,29 +97,17 @@ static cycle_t clock_base;
97 * them as a batch when lazy_mode is eventually turned off. Because hypercalls 97 * them as a batch when lazy_mode is eventually turned off. Because hypercalls
98 * are reasonably expensive, batching them up makes sense. For example, a 98 * are reasonably expensive, batching them up makes sense. For example, a
99 * large mmap might update dozens of page table entries: that code calls 99 * large mmap might update dozens of page table entries: that code calls
100 * lguest_lazy_mode(PARAVIRT_LAZY_MMU), does the dozen updates, then calls 100 * paravirt_enter_lazy_mmu(), does the dozen updates, then calls
101 * lguest_lazy_mode(PARAVIRT_LAZY_NONE). 101 * lguest_leave_lazy_mode().
102 * 102 *
103 * So, when we're in lazy mode, we call async_hypercall() to store the call for 103 * So, when we're in lazy mode, we call async_hypercall() to store the call for
104 * future processing. When lazy mode is turned off we issue a hypercall to 104 * future processing. When lazy mode is turned off we issue a hypercall to
105 * flush the stored calls. 105 * flush the stored calls.
106 * 106 */
107 * There's also a hack where "mode" is set to "PARAVIRT_LAZY_FLUSH" which 107static void lguest_leave_lazy_mode(void)
108 * indicates we're to flush any outstanding calls immediately. This is used
109 * when an interrupt handler does a kmap_atomic(): the page table changes must
110 * happen immediately even if we're in the middle of a batch. Usually we're
111 * not, though, so there's nothing to do. */
112static enum paravirt_lazy_mode lazy_mode; /* Note: not SMP-safe! */
113static void lguest_lazy_mode(enum paravirt_lazy_mode mode)
114{ 108{
115 if (mode == PARAVIRT_LAZY_FLUSH) { 109 paravirt_leave_lazy(paravirt_get_lazy_mode());
116 if (unlikely(lazy_mode != PARAVIRT_LAZY_NONE)) 110 hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
117 hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
118 } else {
119 lazy_mode = mode;
120 if (mode == PARAVIRT_LAZY_NONE)
121 hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
122 }
123} 111}
124 112
125static void lazy_hcall(unsigned long call, 113static void lazy_hcall(unsigned long call,
@@ -127,7 +115,7 @@ static void lazy_hcall(unsigned long call,
127 unsigned long arg2, 115 unsigned long arg2,
128 unsigned long arg3) 116 unsigned long arg3)
129{ 117{
130 if (lazy_mode == PARAVIRT_LAZY_NONE) 118 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
131 hcall(call, arg1, arg2, arg3); 119 hcall(call, arg1, arg2, arg3);
132 else 120 else
133 async_hcall(call, arg1, arg2, arg3); 121 async_hcall(call, arg1, arg2, arg3);
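
With the lazy-mode state now shared through paravirt_get_lazy_mode(), lazy_hcall() issues a hypercall immediately outside lazy mode, queues it otherwise, and lguest_leave_lazy_mode() flushes whatever was queued. A toy model of that defer-or-issue behaviour in plain C; hcall(), async_hcall() and the queue are simplified placeholders for the lguest versions:

/* Toy model of lazy hypercall batching: immediate outside lazy mode,
 * queued inside it, flushed when lazy mode is left. */
#include <stdio.h>

enum lazy_mode { LAZY_NONE, LAZY_MMU };

static enum lazy_mode mode = LAZY_NONE;
static unsigned long queued[16];
static unsigned nqueued;

static void hcall(unsigned long call)           { printf("hcall %lu\n", call); }
static void async_hcall(unsigned long call)     { queued[nqueued++] = call; }

static void lazy_hcall(unsigned long call)
{
        if (mode == LAZY_NONE)
                hcall(call);                    /* issue immediately */
        else
                async_hcall(call);              /* batch for later */
}

static void leave_lazy_mode(void)               /* LHCALL_FLUSH_ASYNC analogue */
{
        unsigned i;

        mode = LAZY_NONE;
        for (i = 0; i < nqueued; i++)
                hcall(queued[i]);
        nqueued = 0;
}

int main(void)
{
        lazy_hcall(1);                          /* goes out at once */
        mode = LAZY_MMU;                        /* enter_lazy_mmu() analogue */
        lazy_hcall(2);
        lazy_hcall(3);
        leave_lazy_mode();                      /* 2 and 3 flushed here */
        return 0;
}
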
@@ -331,7 +319,7 @@ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
331} 319}
332 320
333/*G:038 That's enough excitement for now, back to ploughing through each of 321/*G:038 That's enough excitement for now, back to ploughing through each of
334 * the paravirt_ops (we're about 1/3 of the way through). 322 * the different pv_ops structures (we're about 1/3 of the way through).
335 * 323 *
336 * This is the Local Descriptor Table, another weird Intel thingy. Linux only 324 * This is the Local Descriptor Table, another weird Intel thingy. Linux only
337 * uses this for some strange applications like Wine. We don't do anything 325 * uses this for some strange applications like Wine. We don't do anything
@@ -558,7 +546,7 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
558 lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); 546 lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
559} 547}
560 548
561/* Unfortunately for Lguest, the paravirt_ops for page tables were based on 549/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
562 * native page table operations. On native hardware you can set a new page 550 * native page table operations. On native hardware you can set a new page
563 * table entry whenever you want, but if you want to remove one you have to do 551 * table entry whenever you want, but if you want to remove one you have to do
564 * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). 552 * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
@@ -782,7 +770,7 @@ static void lguest_time_init(void)
782 clocksource_register(&lguest_clock); 770 clocksource_register(&lguest_clock);
783 771
784 /* Now we've set up our clock, we can use it as the scheduler clock */ 772 /* Now we've set up our clock, we can use it as the scheduler clock */
785 paravirt_ops.sched_clock = lguest_sched_clock; 773 pv_time_ops.sched_clock = lguest_sched_clock;
786 774
787 /* We can't set cpumask in the initializer: damn C limitations! Set it 775 /* We can't set cpumask in the initializer: damn C limitations! Set it
788 * here and register our timer device. */ 776 * here and register our timer device. */
@@ -904,7 +892,7 @@ static __init char *lguest_memory_setup(void)
904/*G:050 892/*G:050
905 * Patching (Powerfully Placating Performance Pedants) 893 * Patching (Powerfully Placating Performance Pedants)
906 * 894 *
907 * We have already seen that "struct paravirt_ops" lets us replace simple 895 * We have already seen that pv_ops structures let us replace simple
908 * native instructions with calls to the appropriate back end all throughout 896 * native instructions with calls to the appropriate back end all throughout
909 * the kernel. This allows the same kernel to run as a Guest and as a native 897 * the kernel. This allows the same kernel to run as a Guest and as a native
910 * kernel, but it's slow because of all the indirect branches. 898 * kernel, but it's slow because of all the indirect branches.
@@ -929,10 +917,10 @@ static const struct lguest_insns
929{ 917{
930 const char *start, *end; 918 const char *start, *end;
931} lguest_insns[] = { 919} lguest_insns[] = {
932 [PARAVIRT_PATCH(irq_disable)] = { lgstart_cli, lgend_cli }, 920 [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
933 [PARAVIRT_PATCH(irq_enable)] = { lgstart_sti, lgend_sti }, 921 [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti },
934 [PARAVIRT_PATCH(restore_fl)] = { lgstart_popf, lgend_popf }, 922 [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf },
935 [PARAVIRT_PATCH(save_fl)] = { lgstart_pushf, lgend_pushf }, 923 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
936}; 924};
937 925
938/* Now our patch routine is fairly simple (based on the native one in 926/* Now our patch routine is fairly simple (based on the native one in
@@ -959,9 +947,9 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
959 return insn_len; 947 return insn_len;
960} 948}
961 949
962/*G:030 Once we get to lguest_init(), we know we're a Guest. The paravirt_ops 950/*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops
963 * structure in the kernel provides a single point for (almost) every routine 951 * structures in the kernel provide points for (almost) every routine we have
964 * we have to override to avoid privileged instructions. */ 952 * to override to avoid privileged instructions. */
965__init void lguest_init(void *boot) 953__init void lguest_init(void *boot)
966{ 954{
967 /* Copy boot parameters first: the Launcher put the physical location 955 /* Copy boot parameters first: the Launcher put the physical location
@@ -976,54 +964,70 @@ __init void lguest_init(void *boot)
976 964
977 /* We're under lguest, paravirt is enabled, and we're running at 965 /* We're under lguest, paravirt is enabled, and we're running at
978 * privilege level 1, not 0 as normal. */ 966 * privilege level 1, not 0 as normal. */
979 paravirt_ops.name = "lguest"; 967 pv_info.name = "lguest";
980 paravirt_ops.paravirt_enabled = 1; 968 pv_info.paravirt_enabled = 1;
981 paravirt_ops.kernel_rpl = 1; 969 pv_info.kernel_rpl = 1;
982 970
983 /* We set up all the lguest overrides for sensitive operations. These 971 /* We set up all the lguest overrides for sensitive operations. These
984 * are detailed with the operations themselves. */ 972 * are detailed with the operations themselves. */
985 paravirt_ops.save_fl = save_fl; 973
986 paravirt_ops.restore_fl = restore_fl; 974 /* interrupt-related operations */
987 paravirt_ops.irq_disable = irq_disable; 975 pv_irq_ops.init_IRQ = lguest_init_IRQ;
988 paravirt_ops.irq_enable = irq_enable; 976 pv_irq_ops.save_fl = save_fl;
989 paravirt_ops.load_gdt = lguest_load_gdt; 977 pv_irq_ops.restore_fl = restore_fl;
990 paravirt_ops.memory_setup = lguest_memory_setup; 978 pv_irq_ops.irq_disable = irq_disable;
991 paravirt_ops.cpuid = lguest_cpuid; 979 pv_irq_ops.irq_enable = irq_enable;
992 paravirt_ops.write_cr3 = lguest_write_cr3; 980 pv_irq_ops.safe_halt = lguest_safe_halt;
993 paravirt_ops.flush_tlb_user = lguest_flush_tlb_user; 981
994 paravirt_ops.flush_tlb_single = lguest_flush_tlb_single; 982 /* init-time operations */
995 paravirt_ops.flush_tlb_kernel = lguest_flush_tlb_kernel; 983 pv_init_ops.memory_setup = lguest_memory_setup;
996 paravirt_ops.set_pte = lguest_set_pte; 984 pv_init_ops.patch = lguest_patch;
997 paravirt_ops.set_pte_at = lguest_set_pte_at; 985
998 paravirt_ops.set_pmd = lguest_set_pmd; 986 /* Intercepts of various cpu instructions */
987 pv_cpu_ops.load_gdt = lguest_load_gdt;
988 pv_cpu_ops.cpuid = lguest_cpuid;
989 pv_cpu_ops.load_idt = lguest_load_idt;
990 pv_cpu_ops.iret = lguest_iret;
991 pv_cpu_ops.load_esp0 = lguest_load_esp0;
992 pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
993 pv_cpu_ops.set_ldt = lguest_set_ldt;
994 pv_cpu_ops.load_tls = lguest_load_tls;
995 pv_cpu_ops.set_debugreg = lguest_set_debugreg;
996 pv_cpu_ops.clts = lguest_clts;
997 pv_cpu_ops.read_cr0 = lguest_read_cr0;
998 pv_cpu_ops.write_cr0 = lguest_write_cr0;
999 pv_cpu_ops.read_cr4 = lguest_read_cr4;
1000 pv_cpu_ops.write_cr4 = lguest_write_cr4;
1001 pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
1002 pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
1003 pv_cpu_ops.wbinvd = lguest_wbinvd;
1004 pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu;
1005 pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
1006
1007 /* pagetable management */
1008 pv_mmu_ops.write_cr3 = lguest_write_cr3;
1009 pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
1010 pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
1011 pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
1012 pv_mmu_ops.set_pte = lguest_set_pte;
1013 pv_mmu_ops.set_pte_at = lguest_set_pte_at;
1014 pv_mmu_ops.set_pmd = lguest_set_pmd;
1015 pv_mmu_ops.read_cr2 = lguest_read_cr2;
1016 pv_mmu_ops.read_cr3 = lguest_read_cr3;
1017 pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
1018 pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
1019
999#ifdef CONFIG_X86_LOCAL_APIC 1020#ifdef CONFIG_X86_LOCAL_APIC
1000 paravirt_ops.apic_write = lguest_apic_write; 1021 /* apic read/write intercepts */
1001 paravirt_ops.apic_write_atomic = lguest_apic_write; 1022 pv_apic_ops.apic_write = lguest_apic_write;
1002 paravirt_ops.apic_read = lguest_apic_read; 1023 pv_apic_ops.apic_write_atomic = lguest_apic_write;
1024 pv_apic_ops.apic_read = lguest_apic_read;
1003#endif 1025#endif
1004 paravirt_ops.load_idt = lguest_load_idt; 1026
1005 paravirt_ops.iret = lguest_iret; 1027 /* time operations */
1006 paravirt_ops.load_esp0 = lguest_load_esp0; 1028 pv_time_ops.get_wallclock = lguest_get_wallclock;
1007 paravirt_ops.load_tr_desc = lguest_load_tr_desc; 1029 pv_time_ops.time_init = lguest_time_init;
1008 paravirt_ops.set_ldt = lguest_set_ldt; 1030
1009 paravirt_ops.load_tls = lguest_load_tls;
1010 paravirt_ops.set_debugreg = lguest_set_debugreg;
1011 paravirt_ops.clts = lguest_clts;
1012 paravirt_ops.read_cr0 = lguest_read_cr0;
1013 paravirt_ops.write_cr0 = lguest_write_cr0;
1014 paravirt_ops.init_IRQ = lguest_init_IRQ;
1015 paravirt_ops.read_cr2 = lguest_read_cr2;
1016 paravirt_ops.read_cr3 = lguest_read_cr3;
1017 paravirt_ops.read_cr4 = lguest_read_cr4;
1018 paravirt_ops.write_cr4 = lguest_write_cr4;
1019 paravirt_ops.write_gdt_entry = lguest_write_gdt_entry;
1020 paravirt_ops.write_idt_entry = lguest_write_idt_entry;
1021 paravirt_ops.patch = lguest_patch;
1022 paravirt_ops.safe_halt = lguest_safe_halt;
1023 paravirt_ops.get_wallclock = lguest_get_wallclock;
1024 paravirt_ops.time_init = lguest_time_init;
1025 paravirt_ops.set_lazy_mode = lguest_lazy_mode;
1026 paravirt_ops.wbinvd = lguest_wbinvd;
1027 /* Now is a good time to look at the implementations of these functions 1031 /* Now is a good time to look at the implementations of these functions
1028 * before returning to the rest of lguest_init(). */ 1032 * before returning to the rest of lguest_init(). */
1029 1033
diff --git a/drivers/lguest/lguest_bus.c b/drivers/lguest/lguest_bus.c
index 9e7752cc8002..57329788f8a7 100644
--- a/drivers/lguest/lguest_bus.c
+++ b/drivers/lguest/lguest_bus.c
@@ -201,7 +201,7 @@ static void scan_devices(void)
201 * "struct lguest_device_desc" array. */ 201 * "struct lguest_device_desc" array. */
202static int __init lguest_bus_init(void) 202static int __init lguest_bus_init(void)
203{ 203{
204 if (strcmp(paravirt_ops.name, "lguest") != 0) 204 if (strcmp(pv_info.name, "lguest") != 0)
205 return 0; 205 return 0;
206 206
207 /* Devices are in a single page above top of "normal" mem */ 207 /* Devices are in a single page above top of "normal" mem */
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
index 9fa3fa9e62d1..f59d370c5df4 100644
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -25,27 +25,22 @@ struct tss_struct;
25struct mm_struct; 25struct mm_struct;
26struct desc_struct; 26struct desc_struct;
27 27
28/* Lazy mode for batching updates / context switch */ 28/* general info */
29enum paravirt_lazy_mode { 29struct pv_info {
30 PARAVIRT_LAZY_NONE = 0,
31 PARAVIRT_LAZY_MMU = 1,
32 PARAVIRT_LAZY_CPU = 2,
33 PARAVIRT_LAZY_FLUSH = 3,
34};
35
36struct paravirt_ops
37{
38 unsigned int kernel_rpl; 30 unsigned int kernel_rpl;
39 int shared_kernel_pmd; 31 int shared_kernel_pmd;
40 int paravirt_enabled; 32 int paravirt_enabled;
41 const char *name; 33 const char *name;
34};
42 35
36struct pv_init_ops {
43 /* 37 /*
44 * Patch may replace one of the defined code sequences with arbitrary 38 * Patch may replace one of the defined code sequences with
45 * code, subject to the same register constraints. This generally 39 * arbitrary code, subject to the same register constraints.
46 * means the code is not free to clobber any registers other than EAX. 40 * This generally means the code is not free to clobber any
47 * The patch function should return the number of bytes of code 41 * registers other than EAX. The patch function should return
48 * generated, as we nop pad the rest in generic code. 42 * the number of bytes of code generated, as we nop pad the
43 * rest in generic code.
49 */ 44 */
50 unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, 45 unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
51 unsigned long addr, unsigned len); 46 unsigned long addr, unsigned len);
@@ -55,29 +50,29 @@ struct paravirt_ops
55 char *(*memory_setup)(void); 50 char *(*memory_setup)(void);
56 void (*post_allocator_init)(void); 51 void (*post_allocator_init)(void);
57 52
58 void (*init_IRQ)(void);
59 void (*time_init)(void);
60
61 /*
62 * Called before/after init_mm pagetable setup. setup_start
63 * may reset %cr3, and may pre-install parts of the pagetable;
64 * pagetable setup is expected to preserve any existing
65 * mapping.
66 */
67 void (*pagetable_setup_start)(pgd_t *pgd_base);
68 void (*pagetable_setup_done)(pgd_t *pgd_base);
69
70 /* Print a banner to identify the environment */ 53 /* Print a banner to identify the environment */
71 void (*banner)(void); 54 void (*banner)(void);
55};
56
57
58struct pv_lazy_ops {
59 /* Set deferred update mode, used for batching operations. */
60 void (*enter)(void);
61 void (*leave)(void);
62};
63
64struct pv_time_ops {
65 void (*time_init)(void);
72 66
 73 /* Get and set time of day */ 67 /* Get and set time of day */
74 unsigned long (*get_wallclock)(void); 68 unsigned long (*get_wallclock)(void);
75 int (*set_wallclock)(unsigned long); 69 int (*set_wallclock)(unsigned long);
76 70
77 /* cpuid emulation, mostly so that caps bits can be disabled */ 71 unsigned long long (*sched_clock)(void);
78 void (*cpuid)(unsigned int *eax, unsigned int *ebx, 72 unsigned long (*get_cpu_khz)(void);
79 unsigned int *ecx, unsigned int *edx); 73};
80 74
75struct pv_cpu_ops {
81 /* hooks for various privileged instructions */ 76 /* hooks for various privileged instructions */
82 unsigned long (*get_debugreg)(int regno); 77 unsigned long (*get_debugreg)(int regno);
83 void (*set_debugreg)(int regno, unsigned long value); 78 void (*set_debugreg)(int regno, unsigned long value);
@@ -87,41 +82,10 @@ struct paravirt_ops
87 unsigned long (*read_cr0)(void); 82 unsigned long (*read_cr0)(void);
88 void (*write_cr0)(unsigned long); 83 void (*write_cr0)(unsigned long);
89 84
90 unsigned long (*read_cr2)(void);
91 void (*write_cr2)(unsigned long);
92
93 unsigned long (*read_cr3)(void);
94 void (*write_cr3)(unsigned long);
95
96 unsigned long (*read_cr4_safe)(void); 85 unsigned long (*read_cr4_safe)(void);
97 unsigned long (*read_cr4)(void); 86 unsigned long (*read_cr4)(void);
98 void (*write_cr4)(unsigned long); 87 void (*write_cr4)(unsigned long);
99 88
100 /*
101 * Get/set interrupt state. save_fl and restore_fl are only
102 * expected to use X86_EFLAGS_IF; all other bits
103 * returned from save_fl are undefined, and may be ignored by
104 * restore_fl.
105 */
106 unsigned long (*save_fl)(void);
107 void (*restore_fl)(unsigned long);
108 void (*irq_disable)(void);
109 void (*irq_enable)(void);
110 void (*safe_halt)(void);
111 void (*halt)(void);
112
113 void (*wbinvd)(void);
114
115 /* MSR, PMC and TSR operations.
116 err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */
117 u64 (*read_msr)(unsigned int msr, int *err);
118 int (*write_msr)(unsigned int msr, u64 val);
119
120 u64 (*read_tsc)(void);
121 u64 (*read_pmc)(void);
122 unsigned long long (*sched_clock)(void);
123 unsigned long (*get_cpu_khz)(void);
124
125 /* Segment descriptor handling */ 89 /* Segment descriptor handling */
126 void (*load_tr_desc)(void); 90 void (*load_tr_desc)(void);
127 void (*load_gdt)(const struct Xgt_desc_struct *); 91 void (*load_gdt)(const struct Xgt_desc_struct *);
@@ -140,18 +104,47 @@ struct paravirt_ops
140 void (*load_esp0)(struct tss_struct *tss, struct thread_struct *t); 104 void (*load_esp0)(struct tss_struct *tss, struct thread_struct *t);
141 105
142 void (*set_iopl_mask)(unsigned mask); 106 void (*set_iopl_mask)(unsigned mask);
107
108 void (*wbinvd)(void);
143 void (*io_delay)(void); 109 void (*io_delay)(void);
144 110
111 /* cpuid emulation, mostly so that caps bits can be disabled */
112 void (*cpuid)(unsigned int *eax, unsigned int *ebx,
113 unsigned int *ecx, unsigned int *edx);
114
115 /* MSR, PMC and TSR operations.
116 err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */
117 u64 (*read_msr)(unsigned int msr, int *err);
118 int (*write_msr)(unsigned int msr, u64 val);
119
120 u64 (*read_tsc)(void);
121 u64 (*read_pmc)(void);
122
123 /* These two are jmp to, not actually called. */
124 void (*irq_enable_sysexit)(void);
125 void (*iret)(void);
126
127 struct pv_lazy_ops lazy_mode;
128};
129
130struct pv_irq_ops {
131 void (*init_IRQ)(void);
132
145 /* 133 /*
146 * Hooks for intercepting the creation/use/destruction of an 134 * Get/set interrupt state. save_fl and restore_fl are only
147 * mm_struct. 135 * expected to use X86_EFLAGS_IF; all other bits
136 * returned from save_fl are undefined, and may be ignored by
137 * restore_fl.
148 */ 138 */
149 void (*activate_mm)(struct mm_struct *prev, 139 unsigned long (*save_fl)(void);
150 struct mm_struct *next); 140 void (*restore_fl)(unsigned long);
151 void (*dup_mmap)(struct mm_struct *oldmm, 141 void (*irq_disable)(void);
152 struct mm_struct *mm); 142 void (*irq_enable)(void);
153 void (*exit_mmap)(struct mm_struct *mm); 143 void (*safe_halt)(void);
144 void (*halt)(void);
145};
154 146
147struct pv_apic_ops {
155#ifdef CONFIG_X86_LOCAL_APIC 148#ifdef CONFIG_X86_LOCAL_APIC
156 /* 149 /*
157 * Direct APIC operations, principally for VMI. Ideally 150 * Direct APIC operations, principally for VMI. Ideally
@@ -167,6 +160,34 @@ struct paravirt_ops
167 unsigned long start_eip, 160 unsigned long start_eip,
168 unsigned long start_esp); 161 unsigned long start_esp);
169#endif 162#endif
163};
164
165struct pv_mmu_ops {
166 /*
167 * Called before/after init_mm pagetable setup. setup_start
168 * may reset %cr3, and may pre-install parts of the pagetable;
169 * pagetable setup is expected to preserve any existing
170 * mapping.
171 */
172 void (*pagetable_setup_start)(pgd_t *pgd_base);
173 void (*pagetable_setup_done)(pgd_t *pgd_base);
174
175 unsigned long (*read_cr2)(void);
176 void (*write_cr2)(unsigned long);
177
178 unsigned long (*read_cr3)(void);
179 void (*write_cr3)(unsigned long);
180
181 /*
182 * Hooks for intercepting the creation/use/destruction of an
183 * mm_struct.
184 */
185 void (*activate_mm)(struct mm_struct *prev,
186 struct mm_struct *next);
187 void (*dup_mmap)(struct mm_struct *oldmm,
188 struct mm_struct *mm);
189 void (*exit_mmap)(struct mm_struct *mm);
190
170 191
171 /* TLB operations */ 192 /* TLB operations */
172 void (*flush_tlb_user)(void); 193 void (*flush_tlb_user)(void);
@@ -191,15 +212,12 @@ struct paravirt_ops
191 void (*pte_update_defer)(struct mm_struct *mm, 212 void (*pte_update_defer)(struct mm_struct *mm,
192 unsigned long addr, pte_t *ptep); 213 unsigned long addr, pte_t *ptep);
193 214
194#ifdef CONFIG_HIGHPTE
195 void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
196#endif
197
198#ifdef CONFIG_X86_PAE 215#ifdef CONFIG_X86_PAE
199 void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); 216 void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
200 void (*set_pte_present)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); 217 void (*set_pte_present)(struct mm_struct *mm, unsigned long addr,
218 pte_t *ptep, pte_t pte);
201 void (*set_pud)(pud_t *pudp, pud_t pudval); 219 void (*set_pud)(pud_t *pudp, pud_t pudval);
202 void (*pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 220 void (*pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
203 void (*pmd_clear)(pmd_t *pmdp); 221 void (*pmd_clear)(pmd_t *pmdp);
204 222
205 unsigned long long (*pte_val)(pte_t); 223 unsigned long long (*pte_val)(pte_t);
@@ -217,21 +235,40 @@ struct paravirt_ops
217 pgd_t (*make_pgd)(unsigned long pgd); 235 pgd_t (*make_pgd)(unsigned long pgd);
218#endif 236#endif
219 237
220 /* Set deferred update mode, used for batching operations. */ 238#ifdef CONFIG_HIGHPTE
221 void (*set_lazy_mode)(enum paravirt_lazy_mode mode); 239 void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
240#endif
222 241
223 /* These two are jmp to, not actually called. */ 242 struct pv_lazy_ops lazy_mode;
224 void (*irq_enable_sysexit)(void);
225 void (*iret)(void);
226}; 243};
227 244
228extern struct paravirt_ops paravirt_ops; 245/* This contains all the paravirt structures: we get a convenient
246 * number for each function using the offset which we use to indicate
247 * what to patch. */
248struct paravirt_patch_template
249{
250 struct pv_init_ops pv_init_ops;
251 struct pv_time_ops pv_time_ops;
252 struct pv_cpu_ops pv_cpu_ops;
253 struct pv_irq_ops pv_irq_ops;
254 struct pv_apic_ops pv_apic_ops;
255 struct pv_mmu_ops pv_mmu_ops;
256};
257
258extern struct pv_info pv_info;
259extern struct pv_init_ops pv_init_ops;
260extern struct pv_time_ops pv_time_ops;
261extern struct pv_cpu_ops pv_cpu_ops;
262extern struct pv_irq_ops pv_irq_ops;
263extern struct pv_apic_ops pv_apic_ops;
264extern struct pv_mmu_ops pv_mmu_ops;
229 265
230#define PARAVIRT_PATCH(x) \ 266#define PARAVIRT_PATCH(x) \
231 (offsetof(struct paravirt_ops, x) / sizeof(void *)) 267 (offsetof(struct paravirt_patch_template, x) / sizeof(void *))
232 268
233#define paravirt_type(type) \ 269#define paravirt_type(op) \
234 [paravirt_typenum] "i" (PARAVIRT_PATCH(type)) 270 [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
271 [paravirt_opptr] "m" (op)
235#define paravirt_clobber(clobber) \ 272#define paravirt_clobber(clobber) \
236 [paravirt_clobber] "i" (clobber) 273 [paravirt_clobber] "i" (clobber)
237 274
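
PARAVIRT_PATCH() now derives its patch-site type number from a member's offset inside paravirt_patch_template rather than the old monolithic struct, so an op such as pv_irq_ops.irq_disable still maps to a unique small integer. A standalone illustration of that offsetof arithmetic with a cut-down template; the struct contents here are toy stand-ins, and the nested member designator is used the same way the kernel macro uses it:

/* How PARAVIRT_PATCH turns a template member into a small index:
 * offsetof() divided by the pointer size.  Toy struct layout. */
#include <stdio.h>
#include <stddef.h>

struct pv_cpu_ops { void (*cpuid)(void); void (*wbinvd)(void); };
struct pv_irq_ops { void (*irq_disable)(void); void (*irq_enable)(void); };

struct paravirt_patch_template {
        struct pv_cpu_ops pv_cpu_ops;
        struct pv_irq_ops pv_irq_ops;
};

#define PARAVIRT_PATCH(x) \
        (offsetof(struct paravirt_patch_template, x) / sizeof(void *))

int main(void)
{
        printf("pv_cpu_ops.cpuid       -> %zu\n", PARAVIRT_PATCH(pv_cpu_ops.cpuid));
        printf("pv_irq_ops.irq_disable -> %zu\n", PARAVIRT_PATCH(pv_irq_ops.irq_disable));
        printf("pv_irq_ops.irq_enable  -> %zu\n", PARAVIRT_PATCH(pv_irq_ops.irq_enable));
        return 0;
}
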
@@ -258,7 +295,7 @@ unsigned paravirt_patch_call(void *insnbuf,
258 const void *target, u16 tgt_clobbers, 295 const void *target, u16 tgt_clobbers,
259 unsigned long addr, u16 site_clobbers, 296 unsigned long addr, u16 site_clobbers,
260 unsigned len); 297 unsigned len);
261unsigned paravirt_patch_jmp(const void *target, void *insnbuf, 298unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
262 unsigned long addr, unsigned len); 299 unsigned long addr, unsigned len);
263unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, 300unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
264 unsigned long addr, unsigned len); 301 unsigned long addr, unsigned len);
@@ -271,14 +308,14 @@ int paravirt_disable_iospace(void);
271/* 308/*
272 * This generates an indirect call based on the operation type number. 309 * This generates an indirect call based on the operation type number.
273 * The type number, computed in PARAVIRT_PATCH, is derived from the 310 * The type number, computed in PARAVIRT_PATCH, is derived from the
274 * offset into the paravirt_ops structure, and can therefore be freely 311 * offset into the paravirt_patch_template structure, and can therefore be
275 * converted back into a structure offset. 312 * freely converted back into a structure offset.
276 */ 313 */
277#define PARAVIRT_CALL "call *(paravirt_ops+%c[paravirt_typenum]*4);" 314#define PARAVIRT_CALL "call *%[paravirt_opptr];"
278 315
279/* 316/*
280 * These macros are intended to wrap calls into a paravirt_ops 317 * These macros are intended to wrap calls through one of the paravirt
281 * operation, so that they can be later identified and patched at 318 * ops structs, so that they can be later identified and patched at
282 * runtime. 319 * runtime.
283 * 320 *
284 * Normally, a call to a pv_op function is a simple indirect call: 321 * Normally, a call to a pv_op function is a simple indirect call:
@@ -301,7 +338,7 @@ int paravirt_disable_iospace(void);
301 * The call instruction itself is marked by placing its start address 338 * The call instruction itself is marked by placing its start address
302 * and size into the .parainstructions section, so that 339 * and size into the .parainstructions section, so that
303 * apply_paravirt() in arch/i386/kernel/alternative.c can do the 340 * apply_paravirt() in arch/i386/kernel/alternative.c can do the
304 * appropriate patching under the control of the backend paravirt_ops 341 * appropriate patching under the control of the backend pv_init_ops
305 * implementation. 342 * implementation.
306 * 343 *
307 * Unfortunately there's no way to get gcc to generate the args setup 344 * Unfortunately there's no way to get gcc to generate the args setup
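
Until a call site is patched, PARAVIRT_CALL is simply an indirect call through the relevant pv_*_ops member, which is also what lets a backend take over by overwriting the pointer at boot. A toy userspace equivalent of that indirection; the native/xen stand-in functions are invented for the example:

/* What an unpatched pv_op call amounts to: an indirect call through a
 * per-group function pointer that a backend can replace.  Toy code. */
#include <stdio.h>

struct pv_irq_ops {
        void (*irq_disable)(void);
};

static void native_irq_disable(void)    { puts("cli (pretend)"); }
static void xen_irq_disable(void)       { puts("disable-irq hypercall (pretend)"); }

static struct pv_irq_ops pv_irq_ops = { .irq_disable = native_irq_disable };

int main(void)
{
        pv_irq_ops.irq_disable();               /* indirect call, patchable site */

        pv_irq_ops.irq_disable = xen_irq_disable;   /* backend override */
        pv_irq_ops.irq_disable();
        return 0;
}
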
@@ -409,36 +446,36 @@ int paravirt_disable_iospace(void);
409 446
410static inline int paravirt_enabled(void) 447static inline int paravirt_enabled(void)
411{ 448{
412 return paravirt_ops.paravirt_enabled; 449 return pv_info.paravirt_enabled;
413} 450}
414 451
415static inline void load_esp0(struct tss_struct *tss, 452static inline void load_esp0(struct tss_struct *tss,
416 struct thread_struct *thread) 453 struct thread_struct *thread)
417{ 454{
418 PVOP_VCALL2(load_esp0, tss, thread); 455 PVOP_VCALL2(pv_cpu_ops.load_esp0, tss, thread);
419} 456}
420 457
421#define ARCH_SETUP paravirt_ops.arch_setup(); 458#define ARCH_SETUP pv_init_ops.arch_setup();
422static inline unsigned long get_wallclock(void) 459static inline unsigned long get_wallclock(void)
423{ 460{
424 return PVOP_CALL0(unsigned long, get_wallclock); 461 return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock);
425} 462}
426 463
427static inline int set_wallclock(unsigned long nowtime) 464static inline int set_wallclock(unsigned long nowtime)
428{ 465{
429 return PVOP_CALL1(int, set_wallclock, nowtime); 466 return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime);
430} 467}
431 468
432static inline void (*choose_time_init(void))(void) 469static inline void (*choose_time_init(void))(void)
433{ 470{
434 return paravirt_ops.time_init; 471 return pv_time_ops.time_init;
435} 472}
436 473
437/* The paravirtualized CPUID instruction. */ 474/* The paravirtualized CPUID instruction. */
438static inline void __cpuid(unsigned int *eax, unsigned int *ebx, 475static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
439 unsigned int *ecx, unsigned int *edx) 476 unsigned int *ecx, unsigned int *edx)
440{ 477{
441 PVOP_VCALL4(cpuid, eax, ebx, ecx, edx); 478 PVOP_VCALL4(pv_cpu_ops.cpuid, eax, ebx, ecx, edx);
442} 479}
443 480
444/* 481/*
@@ -446,87 +483,87 @@ static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
446 */ 483 */
447static inline unsigned long paravirt_get_debugreg(int reg) 484static inline unsigned long paravirt_get_debugreg(int reg)
448{ 485{
449 return PVOP_CALL1(unsigned long, get_debugreg, reg); 486 return PVOP_CALL1(unsigned long, pv_cpu_ops.get_debugreg, reg);
450} 487}
451#define get_debugreg(var, reg) var = paravirt_get_debugreg(reg) 488#define get_debugreg(var, reg) var = paravirt_get_debugreg(reg)
452static inline void set_debugreg(unsigned long val, int reg) 489static inline void set_debugreg(unsigned long val, int reg)
453{ 490{
454 PVOP_VCALL2(set_debugreg, reg, val); 491 PVOP_VCALL2(pv_cpu_ops.set_debugreg, reg, val);
455} 492}
456 493
457static inline void clts(void) 494static inline void clts(void)
458{ 495{
459 PVOP_VCALL0(clts); 496 PVOP_VCALL0(pv_cpu_ops.clts);
460} 497}
461 498
462static inline unsigned long read_cr0(void) 499static inline unsigned long read_cr0(void)
463{ 500{
464 return PVOP_CALL0(unsigned long, read_cr0); 501 return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr0);
465} 502}
466 503
467static inline void write_cr0(unsigned long x) 504static inline void write_cr0(unsigned long x)
468{ 505{
469 PVOP_VCALL1(write_cr0, x); 506 PVOP_VCALL1(pv_cpu_ops.write_cr0, x);
470} 507}
471 508
472static inline unsigned long read_cr2(void) 509static inline unsigned long read_cr2(void)
473{ 510{
474 return PVOP_CALL0(unsigned long, read_cr2); 511 return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr2);
475} 512}
476 513
477static inline void write_cr2(unsigned long x) 514static inline void write_cr2(unsigned long x)
478{ 515{
479 PVOP_VCALL1(write_cr2, x); 516 PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
480} 517}
481 518
482static inline unsigned long read_cr3(void) 519static inline unsigned long read_cr3(void)
483{ 520{
484 return PVOP_CALL0(unsigned long, read_cr3); 521 return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
485} 522}
486 523
487static inline void write_cr3(unsigned long x) 524static inline void write_cr3(unsigned long x)
488{ 525{
489 PVOP_VCALL1(write_cr3, x); 526 PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
490} 527}
491 528
492static inline unsigned long read_cr4(void) 529static inline unsigned long read_cr4(void)
493{ 530{
494 return PVOP_CALL0(unsigned long, read_cr4); 531 return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
495} 532}
496static inline unsigned long read_cr4_safe(void) 533static inline unsigned long read_cr4_safe(void)
497{ 534{
498 return PVOP_CALL0(unsigned long, read_cr4_safe); 535 return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
499} 536}
500 537
501static inline void write_cr4(unsigned long x) 538static inline void write_cr4(unsigned long x)
502{ 539{
503 PVOP_VCALL1(write_cr4, x); 540 PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
504} 541}
505 542
506static inline void raw_safe_halt(void) 543static inline void raw_safe_halt(void)
507{ 544{
508 PVOP_VCALL0(safe_halt); 545 PVOP_VCALL0(pv_irq_ops.safe_halt);
509} 546}
510 547
511static inline void halt(void) 548static inline void halt(void)
512{ 549{
513 PVOP_VCALL0(safe_halt); 550 PVOP_VCALL0(pv_irq_ops.safe_halt);
514} 551}
515 552
516static inline void wbinvd(void) 553static inline void wbinvd(void)
517{ 554{
518 PVOP_VCALL0(wbinvd); 555 PVOP_VCALL0(pv_cpu_ops.wbinvd);
519} 556}
520 557
521#define get_kernel_rpl() (paravirt_ops.kernel_rpl) 558#define get_kernel_rpl() (pv_info.kernel_rpl)
522 559
523static inline u64 paravirt_read_msr(unsigned msr, int *err) 560static inline u64 paravirt_read_msr(unsigned msr, int *err)
524{ 561{
525 return PVOP_CALL2(u64, read_msr, msr, err); 562 return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);
526} 563}
527static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high) 564static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
528{ 565{
529 return PVOP_CALL3(int, write_msr, msr, low, high); 566 return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);
530} 567}
531 568
532/* These should all do BUG_ON(_err), but our headers are too tangled. */ 569/* These should all do BUG_ON(_err), but our headers are too tangled. */
@@ -560,7 +597,7 @@ static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
560 597
561static inline u64 paravirt_read_tsc(void) 598static inline u64 paravirt_read_tsc(void)
562{ 599{
563 return PVOP_CALL0(u64, read_tsc); 600 return PVOP_CALL0(u64, pv_cpu_ops.read_tsc);
564} 601}
565 602
566#define rdtscl(low) do { \ 603#define rdtscl(low) do { \
@@ -572,15 +609,15 @@ static inline u64 paravirt_read_tsc(void)
572 609
573static inline unsigned long long paravirt_sched_clock(void) 610static inline unsigned long long paravirt_sched_clock(void)
574{ 611{
575 return PVOP_CALL0(unsigned long long, sched_clock); 612 return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
576} 613}
577#define calculate_cpu_khz() (paravirt_ops.get_cpu_khz()) 614#define calculate_cpu_khz() (pv_time_ops.get_cpu_khz())
578 615
579#define write_tsc(val1,val2) wrmsr(0x10, val1, val2) 616#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
580 617
581static inline unsigned long long paravirt_read_pmc(int counter) 618static inline unsigned long long paravirt_read_pmc(int counter)
582{ 619{
583 return PVOP_CALL1(u64, read_pmc, counter); 620 return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
584} 621}
585 622
586#define rdpmc(counter,low,high) do { \ 623#define rdpmc(counter,low,high) do { \
@@ -591,61 +628,61 @@ static inline unsigned long long paravirt_read_pmc(int counter)
591 628
592static inline void load_TR_desc(void) 629static inline void load_TR_desc(void)
593{ 630{
594 PVOP_VCALL0(load_tr_desc); 631 PVOP_VCALL0(pv_cpu_ops.load_tr_desc);
595} 632}
596static inline void load_gdt(const struct Xgt_desc_struct *dtr) 633static inline void load_gdt(const struct Xgt_desc_struct *dtr)
597{ 634{
598 PVOP_VCALL1(load_gdt, dtr); 635 PVOP_VCALL1(pv_cpu_ops.load_gdt, dtr);
599} 636}
600static inline void load_idt(const struct Xgt_desc_struct *dtr) 637static inline void load_idt(const struct Xgt_desc_struct *dtr)
601{ 638{
602 PVOP_VCALL1(load_idt, dtr); 639 PVOP_VCALL1(pv_cpu_ops.load_idt, dtr);
603} 640}
604static inline void set_ldt(const void *addr, unsigned entries) 641static inline void set_ldt(const void *addr, unsigned entries)
605{ 642{
606 PVOP_VCALL2(set_ldt, addr, entries); 643 PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
607} 644}
608static inline void store_gdt(struct Xgt_desc_struct *dtr) 645static inline void store_gdt(struct Xgt_desc_struct *dtr)
609{ 646{
610 PVOP_VCALL1(store_gdt, dtr); 647 PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr);
611} 648}
612static inline void store_idt(struct Xgt_desc_struct *dtr) 649static inline void store_idt(struct Xgt_desc_struct *dtr)
613{ 650{
614 PVOP_VCALL1(store_idt, dtr); 651 PVOP_VCALL1(pv_cpu_ops.store_idt, dtr);
615} 652}
616static inline unsigned long paravirt_store_tr(void) 653static inline unsigned long paravirt_store_tr(void)
617{ 654{
618 return PVOP_CALL0(unsigned long, store_tr); 655 return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr);
619} 656}
620#define store_tr(tr) ((tr) = paravirt_store_tr()) 657#define store_tr(tr) ((tr) = paravirt_store_tr())
621static inline void load_TLS(struct thread_struct *t, unsigned cpu) 658static inline void load_TLS(struct thread_struct *t, unsigned cpu)
622{ 659{
623 PVOP_VCALL2(load_tls, t, cpu); 660 PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu);
624} 661}
625static inline void write_ldt_entry(void *dt, int entry, u32 low, u32 high) 662static inline void write_ldt_entry(void *dt, int entry, u32 low, u32 high)
626{ 663{
627 PVOP_VCALL4(write_ldt_entry, dt, entry, low, high); 664 PVOP_VCALL4(pv_cpu_ops.write_ldt_entry, dt, entry, low, high);
628} 665}
629static inline void write_gdt_entry(void *dt, int entry, u32 low, u32 high) 666static inline void write_gdt_entry(void *dt, int entry, u32 low, u32 high)
630{ 667{
631 PVOP_VCALL4(write_gdt_entry, dt, entry, low, high); 668 PVOP_VCALL4(pv_cpu_ops.write_gdt_entry, dt, entry, low, high);
632} 669}
633static inline void write_idt_entry(void *dt, int entry, u32 low, u32 high) 670static inline void write_idt_entry(void *dt, int entry, u32 low, u32 high)
634{ 671{
635 PVOP_VCALL4(write_idt_entry, dt, entry, low, high); 672 PVOP_VCALL4(pv_cpu_ops.write_idt_entry, dt, entry, low, high);
636} 673}
637static inline void set_iopl_mask(unsigned mask) 674static inline void set_iopl_mask(unsigned mask)
638{ 675{
639 PVOP_VCALL1(set_iopl_mask, mask); 676 PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
640} 677}
641 678
642/* The paravirtualized I/O functions */ 679/* The paravirtualized I/O functions */
643static inline void slow_down_io(void) { 680static inline void slow_down_io(void) {
644 paravirt_ops.io_delay(); 681 pv_cpu_ops.io_delay();
645#ifdef REALLY_SLOW_IO 682#ifdef REALLY_SLOW_IO
646 paravirt_ops.io_delay(); 683 pv_cpu_ops.io_delay();
647 paravirt_ops.io_delay(); 684 pv_cpu_ops.io_delay();
648 paravirt_ops.io_delay(); 685 pv_cpu_ops.io_delay();
649#endif 686#endif
650} 687}
651 688
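Every conversion in the hunks above follows one mechanical pattern: the op name handed to the PVOP_CALL*/PVOP_VCALL* macros now carries its group prefix (pv_cpu_ops.load_gdt rather than load_gdt), so the same macro resolves both the patch-site slot and the function pointer inside the right sub-structure. A toy, standalone sketch of that idea follows; it is not the kernel macro (which also emits patch-site metadata and clobber handling), and all toy_* names are hypothetical.

    #include <stdio.h>

    struct toy_cpu_ops { void (*load_gdt)(const void *dtr); };
    struct toy_mmu_ops { void (*flush_tlb_user)(void); };

    static void toy_native_load_gdt(const void *dtr) { printf("load_gdt(%p)\n", (void *)dtr); }
    static void toy_native_flush_tlb(void)           { printf("flush_tlb_user\n"); }

    static struct toy_cpu_ops toy_cpu_ops = { .load_gdt       = toy_native_load_gdt };
    static struct toy_mmu_ops toy_mmu_ops = { .flush_tlb_user = toy_native_flush_tlb };

    /* hypothetical stand-ins for PVOP_VCALL0/PVOP_VCALL1: plain indirect calls */
    #define TOY_VCALL0(op)     ((op)())
    #define TOY_VCALL1(op, a)  ((op)(a))

    int main(void)
    {
        int gdt_desc = 0;
        TOY_VCALL1(toy_cpu_ops.load_gdt, &gdt_desc);  /* cf. PVOP_VCALL1(pv_cpu_ops.load_gdt, dtr) */
        TOY_VCALL0(toy_mmu_ops.flush_tlb_user);       /* cf. PVOP_VCALL0(pv_mmu_ops.flush_tlb_user) */
        return 0;
    }

Grouping the pointers this way is what lets later hunks patch, replace, or inspect one class of operations (CPU, IRQ, APIC, MMU) without touching the others.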
@@ -655,121 +692,120 @@ static inline void slow_down_io(void) {
655 */ 692 */
656static inline void apic_write(unsigned long reg, unsigned long v) 693static inline void apic_write(unsigned long reg, unsigned long v)
657{ 694{
658 PVOP_VCALL2(apic_write, reg, v); 695 PVOP_VCALL2(pv_apic_ops.apic_write, reg, v);
659} 696}
660 697
661static inline void apic_write_atomic(unsigned long reg, unsigned long v) 698static inline void apic_write_atomic(unsigned long reg, unsigned long v)
662{ 699{
663 PVOP_VCALL2(apic_write_atomic, reg, v); 700 PVOP_VCALL2(pv_apic_ops.apic_write_atomic, reg, v);
664} 701}
665 702
666static inline unsigned long apic_read(unsigned long reg) 703static inline unsigned long apic_read(unsigned long reg)
667{ 704{
668 return PVOP_CALL1(unsigned long, apic_read, reg); 705 return PVOP_CALL1(unsigned long, pv_apic_ops.apic_read, reg);
669} 706}
670 707
671static inline void setup_boot_clock(void) 708static inline void setup_boot_clock(void)
672{ 709{
673 PVOP_VCALL0(setup_boot_clock); 710 PVOP_VCALL0(pv_apic_ops.setup_boot_clock);
674} 711}
675 712
676static inline void setup_secondary_clock(void) 713static inline void setup_secondary_clock(void)
677{ 714{
678 PVOP_VCALL0(setup_secondary_clock); 715 PVOP_VCALL0(pv_apic_ops.setup_secondary_clock);
679} 716}
680#endif 717#endif
681 718
682static inline void paravirt_post_allocator_init(void) 719static inline void paravirt_post_allocator_init(void)
683{ 720{
684 if (paravirt_ops.post_allocator_init) 721 if (pv_init_ops.post_allocator_init)
685 (*paravirt_ops.post_allocator_init)(); 722 (*pv_init_ops.post_allocator_init)();
686} 723}
687 724
688static inline void paravirt_pagetable_setup_start(pgd_t *base) 725static inline void paravirt_pagetable_setup_start(pgd_t *base)
689{ 726{
690 if (paravirt_ops.pagetable_setup_start) 727 (*pv_mmu_ops.pagetable_setup_start)(base);
691 (*paravirt_ops.pagetable_setup_start)(base);
692} 728}
693 729
694static inline void paravirt_pagetable_setup_done(pgd_t *base) 730static inline void paravirt_pagetable_setup_done(pgd_t *base)
695{ 731{
696 if (paravirt_ops.pagetable_setup_done) 732 (*pv_mmu_ops.pagetable_setup_done)(base);
697 (*paravirt_ops.pagetable_setup_done)(base);
698} 733}
699 734
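Note that the two pagetable setup wrappers above drop their NULL checks: after the refactor the pv_mmu_ops slots are expected to always hold a function (a native or no-op default), so the indirect call can be made unconditionally. A minimal standalone sketch of that pattern, with hypothetical demo_* names:

    #include <stddef.h>

    static void demo_default_setup_start(void *base) { (void)base; /* nop default */ }

    static struct {
        void (*pagetable_setup_start)(void *base);
    } demo_mmu_ops = {
        .pagetable_setup_start = demo_default_setup_start,
    };

    static void demo_pagetable_setup_start(void *base)
    {
        (*demo_mmu_ops.pagetable_setup_start)(base);  /* unconditional, as in the new code */
    }

    int main(void)
    {
        demo_pagetable_setup_start(NULL);
        return 0;
    }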
700#ifdef CONFIG_SMP 735#ifdef CONFIG_SMP
701static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, 736static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
702 unsigned long start_esp) 737 unsigned long start_esp)
703{ 738{
704 PVOP_VCALL3(startup_ipi_hook, phys_apicid, start_eip, start_esp); 739 PVOP_VCALL3(pv_apic_ops.startup_ipi_hook,
740 phys_apicid, start_eip, start_esp);
705} 741}
706#endif 742#endif
707 743
708static inline void paravirt_activate_mm(struct mm_struct *prev, 744static inline void paravirt_activate_mm(struct mm_struct *prev,
709 struct mm_struct *next) 745 struct mm_struct *next)
710{ 746{
711 PVOP_VCALL2(activate_mm, prev, next); 747 PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next);
712} 748}
713 749
714static inline void arch_dup_mmap(struct mm_struct *oldmm, 750static inline void arch_dup_mmap(struct mm_struct *oldmm,
715 struct mm_struct *mm) 751 struct mm_struct *mm)
716{ 752{
717 PVOP_VCALL2(dup_mmap, oldmm, mm); 753 PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm);
718} 754}
719 755
720static inline void arch_exit_mmap(struct mm_struct *mm) 756static inline void arch_exit_mmap(struct mm_struct *mm)
721{ 757{
722 PVOP_VCALL1(exit_mmap, mm); 758 PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm);
723} 759}
724 760
725static inline void __flush_tlb(void) 761static inline void __flush_tlb(void)
726{ 762{
727 PVOP_VCALL0(flush_tlb_user); 763 PVOP_VCALL0(pv_mmu_ops.flush_tlb_user);
728} 764}
729static inline void __flush_tlb_global(void) 765static inline void __flush_tlb_global(void)
730{ 766{
731 PVOP_VCALL0(flush_tlb_kernel); 767 PVOP_VCALL0(pv_mmu_ops.flush_tlb_kernel);
732} 768}
733static inline void __flush_tlb_single(unsigned long addr) 769static inline void __flush_tlb_single(unsigned long addr)
734{ 770{
735 PVOP_VCALL1(flush_tlb_single, addr); 771 PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr);
736} 772}
737 773
738static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, 774static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
739 unsigned long va) 775 unsigned long va)
740{ 776{
741 PVOP_VCALL3(flush_tlb_others, &cpumask, mm, va); 777 PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va);
742} 778}
743 779
744static inline void paravirt_alloc_pt(struct mm_struct *mm, unsigned pfn) 780static inline void paravirt_alloc_pt(struct mm_struct *mm, unsigned pfn)
745{ 781{
746 PVOP_VCALL2(alloc_pt, mm, pfn); 782 PVOP_VCALL2(pv_mmu_ops.alloc_pt, mm, pfn);
747} 783}
748static inline void paravirt_release_pt(unsigned pfn) 784static inline void paravirt_release_pt(unsigned pfn)
749{ 785{
750 PVOP_VCALL1(release_pt, pfn); 786 PVOP_VCALL1(pv_mmu_ops.release_pt, pfn);
751} 787}
752 788
753static inline void paravirt_alloc_pd(unsigned pfn) 789static inline void paravirt_alloc_pd(unsigned pfn)
754{ 790{
755 PVOP_VCALL1(alloc_pd, pfn); 791 PVOP_VCALL1(pv_mmu_ops.alloc_pd, pfn);
756} 792}
757 793
758static inline void paravirt_alloc_pd_clone(unsigned pfn, unsigned clonepfn, 794static inline void paravirt_alloc_pd_clone(unsigned pfn, unsigned clonepfn,
759 unsigned start, unsigned count) 795 unsigned start, unsigned count)
760{ 796{
761 PVOP_VCALL4(alloc_pd_clone, pfn, clonepfn, start, count); 797 PVOP_VCALL4(pv_mmu_ops.alloc_pd_clone, pfn, clonepfn, start, count);
762} 798}
763static inline void paravirt_release_pd(unsigned pfn) 799static inline void paravirt_release_pd(unsigned pfn)
764{ 800{
765 PVOP_VCALL1(release_pd, pfn); 801 PVOP_VCALL1(pv_mmu_ops.release_pd, pfn);
766} 802}
767 803
768#ifdef CONFIG_HIGHPTE 804#ifdef CONFIG_HIGHPTE
769static inline void *kmap_atomic_pte(struct page *page, enum km_type type) 805static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
770{ 806{
771 unsigned long ret; 807 unsigned long ret;
772 ret = PVOP_CALL2(unsigned long, kmap_atomic_pte, page, type); 808 ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type);
773 return (void *)ret; 809 return (void *)ret;
774} 810}
775#endif 811#endif
@@ -777,162 +813,191 @@ static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
777static inline void pte_update(struct mm_struct *mm, unsigned long addr, 813static inline void pte_update(struct mm_struct *mm, unsigned long addr,
778 pte_t *ptep) 814 pte_t *ptep)
779{ 815{
780 PVOP_VCALL3(pte_update, mm, addr, ptep); 816 PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
781} 817}
782 818
783static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, 819static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
784 pte_t *ptep) 820 pte_t *ptep)
785{ 821{
786 PVOP_VCALL3(pte_update_defer, mm, addr, ptep); 822 PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
787} 823}
788 824
789#ifdef CONFIG_X86_PAE 825#ifdef CONFIG_X86_PAE
790static inline pte_t __pte(unsigned long long val) 826static inline pte_t __pte(unsigned long long val)
791{ 827{
792 unsigned long long ret = PVOP_CALL2(unsigned long long, make_pte, 828 unsigned long long ret = PVOP_CALL2(unsigned long long,
829 pv_mmu_ops.make_pte,
793 val, val >> 32); 830 val, val >> 32);
794 return (pte_t) { ret, ret >> 32 }; 831 return (pte_t) { ret, ret >> 32 };
795} 832}
796 833
797static inline pmd_t __pmd(unsigned long long val) 834static inline pmd_t __pmd(unsigned long long val)
798{ 835{
799 return (pmd_t) { PVOP_CALL2(unsigned long long, make_pmd, val, val >> 32) }; 836 return (pmd_t) { PVOP_CALL2(unsigned long long, pv_mmu_ops.make_pmd,
837 val, val >> 32) };
800} 838}
801 839
802static inline pgd_t __pgd(unsigned long long val) 840static inline pgd_t __pgd(unsigned long long val)
803{ 841{
804 return (pgd_t) { PVOP_CALL2(unsigned long long, make_pgd, val, val >> 32) }; 842 return (pgd_t) { PVOP_CALL2(unsigned long long, pv_mmu_ops.make_pgd,
843 val, val >> 32) };
805} 844}
806 845
807static inline unsigned long long pte_val(pte_t x) 846static inline unsigned long long pte_val(pte_t x)
808{ 847{
809 return PVOP_CALL2(unsigned long long, pte_val, x.pte_low, x.pte_high); 848 return PVOP_CALL2(unsigned long long, pv_mmu_ops.pte_val,
849 x.pte_low, x.pte_high);
810} 850}
811 851
812static inline unsigned long long pmd_val(pmd_t x) 852static inline unsigned long long pmd_val(pmd_t x)
813{ 853{
814 return PVOP_CALL2(unsigned long long, pmd_val, x.pmd, x.pmd >> 32); 854 return PVOP_CALL2(unsigned long long, pv_mmu_ops.pmd_val,
855 x.pmd, x.pmd >> 32);
815} 856}
816 857
817static inline unsigned long long pgd_val(pgd_t x) 858static inline unsigned long long pgd_val(pgd_t x)
818{ 859{
819 return PVOP_CALL2(unsigned long long, pgd_val, x.pgd, x.pgd >> 32); 860 return PVOP_CALL2(unsigned long long, pv_mmu_ops.pgd_val,
861 x.pgd, x.pgd >> 32);
820} 862}
821 863
822static inline void set_pte(pte_t *ptep, pte_t pteval) 864static inline void set_pte(pte_t *ptep, pte_t pteval)
823{ 865{
824 PVOP_VCALL3(set_pte, ptep, pteval.pte_low, pteval.pte_high); 866 PVOP_VCALL3(pv_mmu_ops.set_pte, ptep, pteval.pte_low, pteval.pte_high);
825} 867}
826 868
827static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, 869static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
828 pte_t *ptep, pte_t pteval) 870 pte_t *ptep, pte_t pteval)
829{ 871{
830 /* 5 arg words */ 872 /* 5 arg words */
831 paravirt_ops.set_pte_at(mm, addr, ptep, pteval); 873 pv_mmu_ops.set_pte_at(mm, addr, ptep, pteval);
832} 874}
833 875
834static inline void set_pte_atomic(pte_t *ptep, pte_t pteval) 876static inline void set_pte_atomic(pte_t *ptep, pte_t pteval)
835{ 877{
836 PVOP_VCALL3(set_pte_atomic, ptep, pteval.pte_low, pteval.pte_high); 878 PVOP_VCALL3(pv_mmu_ops.set_pte_atomic, ptep,
879 pteval.pte_low, pteval.pte_high);
837} 880}
838 881
839static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, 882static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
840 pte_t *ptep, pte_t pte) 883 pte_t *ptep, pte_t pte)
841{ 884{
842 /* 5 arg words */ 885 /* 5 arg words */
843 paravirt_ops.set_pte_present(mm, addr, ptep, pte); 886 pv_mmu_ops.set_pte_present(mm, addr, ptep, pte);
844} 887}
845 888
846static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval) 889static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval)
847{ 890{
848 PVOP_VCALL3(set_pmd, pmdp, pmdval.pmd, pmdval.pmd >> 32); 891 PVOP_VCALL3(pv_mmu_ops.set_pmd, pmdp,
892 pmdval.pmd, pmdval.pmd >> 32);
849} 893}
850 894
851static inline void set_pud(pud_t *pudp, pud_t pudval) 895static inline void set_pud(pud_t *pudp, pud_t pudval)
852{ 896{
853 PVOP_VCALL3(set_pud, pudp, pudval.pgd.pgd, pudval.pgd.pgd >> 32); 897 PVOP_VCALL3(pv_mmu_ops.set_pud, pudp,
898 pudval.pgd.pgd, pudval.pgd.pgd >> 32);
854} 899}
855 900
856static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 901static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
857{ 902{
858 PVOP_VCALL3(pte_clear, mm, addr, ptep); 903 PVOP_VCALL3(pv_mmu_ops.pte_clear, mm, addr, ptep);
859} 904}
860 905
861static inline void pmd_clear(pmd_t *pmdp) 906static inline void pmd_clear(pmd_t *pmdp)
862{ 907{
863 PVOP_VCALL1(pmd_clear, pmdp); 908 PVOP_VCALL1(pv_mmu_ops.pmd_clear, pmdp);
864} 909}
865 910
866#else /* !CONFIG_X86_PAE */ 911#else /* !CONFIG_X86_PAE */
867 912
868static inline pte_t __pte(unsigned long val) 913static inline pte_t __pte(unsigned long val)
869{ 914{
870 return (pte_t) { PVOP_CALL1(unsigned long, make_pte, val) }; 915 return (pte_t) { PVOP_CALL1(unsigned long, pv_mmu_ops.make_pte, val) };
871} 916}
872 917
873static inline pgd_t __pgd(unsigned long val) 918static inline pgd_t __pgd(unsigned long val)
874{ 919{
875 return (pgd_t) { PVOP_CALL1(unsigned long, make_pgd, val) }; 920 return (pgd_t) { PVOP_CALL1(unsigned long, pv_mmu_ops.make_pgd, val) };
876} 921}
877 922
878static inline unsigned long pte_val(pte_t x) 923static inline unsigned long pte_val(pte_t x)
879{ 924{
880 return PVOP_CALL1(unsigned long, pte_val, x.pte_low); 925 return PVOP_CALL1(unsigned long, pv_mmu_ops.pte_val, x.pte_low);
881} 926}
882 927
883static inline unsigned long pgd_val(pgd_t x) 928static inline unsigned long pgd_val(pgd_t x)
884{ 929{
885 return PVOP_CALL1(unsigned long, pgd_val, x.pgd); 930 return PVOP_CALL1(unsigned long, pv_mmu_ops.pgd_val, x.pgd);
886} 931}
887 932
888static inline void set_pte(pte_t *ptep, pte_t pteval) 933static inline void set_pte(pte_t *ptep, pte_t pteval)
889{ 934{
890 PVOP_VCALL2(set_pte, ptep, pteval.pte_low); 935 PVOP_VCALL2(pv_mmu_ops.set_pte, ptep, pteval.pte_low);
891} 936}
892 937
893static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, 938static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
894 pte_t *ptep, pte_t pteval) 939 pte_t *ptep, pte_t pteval)
895{ 940{
896 PVOP_VCALL4(set_pte_at, mm, addr, ptep, pteval.pte_low); 941 PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pteval.pte_low);
897} 942}
898 943
899static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval) 944static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval)
900{ 945{
901 PVOP_VCALL2(set_pmd, pmdp, pmdval.pud.pgd.pgd); 946 PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, pmdval.pud.pgd.pgd);
902} 947}
903#endif /* CONFIG_X86_PAE */ 948#endif /* CONFIG_X86_PAE */
904 949
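In the PAE branch above, 64-bit pte/pmd/pgd values still cross the 32-bit paravirt call interface as two 32-bit halves: callers pass (val, val >> 32) and rebuild the result from the low and high words of the returned value. A standalone sketch of that round trip, independent of the kernel macros (all names hypothetical):

    #include <stdint.h>
    #include <assert.h>

    /* Mimics the PAE __pte()/pte_val() path: the callee only ever sees two
     * 32-bit arguments and hands back a 64-bit result (edx:eax on x86-32). */
    static uint64_t demo_make_pte(uint32_t lo, uint32_t hi)
    {
        return ((uint64_t)hi << 32) | lo;          /* identity transform in this sketch */
    }

    int main(void)
    {
        uint64_t val = 0x123456789abcdef0ULL;
        uint64_t ret = demo_make_pte((uint32_t)val, (uint32_t)(val >> 32));
        uint32_t pte_low  = (uint32_t)ret;          /* cf. (pte_t) { ret, ret >> 32 } */
        uint32_t pte_high = (uint32_t)(ret >> 32);
        assert((((uint64_t)pte_high << 32) | pte_low) == val);
        return 0;
    }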
950/* Lazy mode for batching updates / context switch */
951enum paravirt_lazy_mode {
952 PARAVIRT_LAZY_NONE,
953 PARAVIRT_LAZY_MMU,
954 PARAVIRT_LAZY_CPU,
955};
956
957enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
958void paravirt_enter_lazy_cpu(void);
959void paravirt_leave_lazy_cpu(void);
960void paravirt_enter_lazy_mmu(void);
961void paravirt_leave_lazy_mmu(void);
962void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
963
905#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE 964#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE
906static inline void arch_enter_lazy_cpu_mode(void) 965static inline void arch_enter_lazy_cpu_mode(void)
907{ 966{
908 PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_CPU); 967 PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter);
909} 968}
910 969
911static inline void arch_leave_lazy_cpu_mode(void) 970static inline void arch_leave_lazy_cpu_mode(void)
912{ 971{
913 PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_NONE); 972 PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
914} 973}
915 974
916static inline void arch_flush_lazy_cpu_mode(void) 975static inline void arch_flush_lazy_cpu_mode(void)
917{ 976{
918 PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_FLUSH); 977 if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)) {
978 arch_leave_lazy_cpu_mode();
979 arch_enter_lazy_cpu_mode();
980 }
919} 981}
920 982
921 983
922#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE 984#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
923static inline void arch_enter_lazy_mmu_mode(void) 985static inline void arch_enter_lazy_mmu_mode(void)
924{ 986{
925 PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_MMU); 987 PVOP_VCALL0(pv_mmu_ops.lazy_mode.enter);
926} 988}
927 989
928static inline void arch_leave_lazy_mmu_mode(void) 990static inline void arch_leave_lazy_mmu_mode(void)
929{ 991{
930 PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_NONE); 992 PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave);
931} 993}
932 994
933static inline void arch_flush_lazy_mmu_mode(void) 995static inline void arch_flush_lazy_mmu_mode(void)
934{ 996{
935 PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_FLUSH); 997 if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU)) {
998 arch_leave_lazy_mmu_mode();
999 arch_enter_lazy_mmu_mode();
1000 }
936} 1001}
937 1002
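The lazy-mode rework above replaces the old set_lazy_mode(PARAVIRT_LAZY_FLUSH) call with explicit state tracking: enter/leave become per-group hooks (pv_cpu_ops.lazy_mode, pv_mmu_ops.lazy_mode), and a flush is performed as leave-then-re-enter, but only when the CPU really is in that lazy mode. A minimal standalone model of the flush logic, with hypothetical names:

    #include <stdio.h>

    enum demo_lazy_mode { DEMO_LAZY_NONE, DEMO_LAZY_MMU, DEMO_LAZY_CPU };
    static enum demo_lazy_mode demo_cur = DEMO_LAZY_NONE;

    static void demo_enter_lazy_mmu(void) { demo_cur = DEMO_LAZY_MMU;  puts("start batching"); }
    static void demo_leave_lazy_mmu(void) { demo_cur = DEMO_LAZY_NONE; puts("flush + stop batching"); }

    /* Mirrors the new arch_flush_lazy_mmu_mode(): flushing only makes sense
     * while batching, and is expressed as leave followed by re-enter. */
    static void demo_flush_lazy_mmu(void)
    {
        if (demo_cur == DEMO_LAZY_MMU) {
            demo_leave_lazy_mmu();
            demo_enter_lazy_mmu();
        }
    }

    int main(void)
    {
        demo_flush_lazy_mmu();   /* no-op: not batching */
        demo_enter_lazy_mmu();
        demo_flush_lazy_mmu();   /* pushes pending updates out, stays in lazy mode */
        demo_leave_lazy_mmu();
        return 0;
    }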
938void _paravirt_nop(void); 1003void _paravirt_nop(void);
@@ -957,7 +1022,7 @@ static inline unsigned long __raw_local_save_flags(void)
957 PARAVIRT_CALL 1022 PARAVIRT_CALL
958 "popl %%edx; popl %%ecx") 1023 "popl %%edx; popl %%ecx")
959 : "=a"(f) 1024 : "=a"(f)
960 : paravirt_type(save_fl), 1025 : paravirt_type(pv_irq_ops.save_fl),
961 paravirt_clobber(CLBR_EAX) 1026 paravirt_clobber(CLBR_EAX)
962 : "memory", "cc"); 1027 : "memory", "cc");
963 return f; 1028 return f;
@@ -970,7 +1035,7 @@ static inline void raw_local_irq_restore(unsigned long f)
970 "popl %%edx; popl %%ecx") 1035 "popl %%edx; popl %%ecx")
971 : "=a"(f) 1036 : "=a"(f)
972 : "0"(f), 1037 : "0"(f),
973 paravirt_type(restore_fl), 1038 paravirt_type(pv_irq_ops.restore_fl),
974 paravirt_clobber(CLBR_EAX) 1039 paravirt_clobber(CLBR_EAX)
975 : "memory", "cc"); 1040 : "memory", "cc");
976} 1041}
@@ -981,7 +1046,7 @@ static inline void raw_local_irq_disable(void)
981 PARAVIRT_CALL 1046 PARAVIRT_CALL
982 "popl %%edx; popl %%ecx") 1047 "popl %%edx; popl %%ecx")
983 : 1048 :
984 : paravirt_type(irq_disable), 1049 : paravirt_type(pv_irq_ops.irq_disable),
985 paravirt_clobber(CLBR_EAX) 1050 paravirt_clobber(CLBR_EAX)
986 : "memory", "eax", "cc"); 1051 : "memory", "eax", "cc");
987} 1052}
@@ -992,7 +1057,7 @@ static inline void raw_local_irq_enable(void)
992 PARAVIRT_CALL 1057 PARAVIRT_CALL
993 "popl %%edx; popl %%ecx") 1058 "popl %%edx; popl %%ecx")
994 : 1059 :
995 : paravirt_type(irq_enable), 1060 : paravirt_type(pv_irq_ops.irq_enable),
996 paravirt_clobber(CLBR_EAX) 1061 paravirt_clobber(CLBR_EAX)
997 : "memory", "eax", "cc"); 1062 : "memory", "eax", "cc");
998} 1063}
@@ -1008,21 +1073,23 @@ static inline unsigned long __raw_local_irq_save(void)
1008 1073
1009#define CLI_STRING \ 1074#define CLI_STRING \
1010 _paravirt_alt("pushl %%ecx; pushl %%edx;" \ 1075 _paravirt_alt("pushl %%ecx; pushl %%edx;" \
1011 "call *paravirt_ops+%c[paravirt_cli_type]*4;" \ 1076 "call *%[paravirt_cli_opptr];" \
1012 "popl %%edx; popl %%ecx", \ 1077 "popl %%edx; popl %%ecx", \
1013 "%c[paravirt_cli_type]", "%c[paravirt_clobber]") 1078 "%c[paravirt_cli_type]", "%c[paravirt_clobber]")
1014 1079
1015#define STI_STRING \ 1080#define STI_STRING \
1016 _paravirt_alt("pushl %%ecx; pushl %%edx;" \ 1081 _paravirt_alt("pushl %%ecx; pushl %%edx;" \
1017 "call *paravirt_ops+%c[paravirt_sti_type]*4;" \ 1082 "call *%[paravirt_sti_opptr];" \
1018 "popl %%edx; popl %%ecx", \ 1083 "popl %%edx; popl %%ecx", \
1019 "%c[paravirt_sti_type]", "%c[paravirt_clobber]") 1084 "%c[paravirt_sti_type]", "%c[paravirt_clobber]")
1020 1085
1021#define CLI_STI_CLOBBERS , "%eax" 1086#define CLI_STI_CLOBBERS , "%eax"
1022#define CLI_STI_INPUT_ARGS \ 1087#define CLI_STI_INPUT_ARGS \
1023 , \ 1088 , \
1024 [paravirt_cli_type] "i" (PARAVIRT_PATCH(irq_disable)), \ 1089 [paravirt_cli_type] "i" (PARAVIRT_PATCH(pv_irq_ops.irq_disable)), \
1025 [paravirt_sti_type] "i" (PARAVIRT_PATCH(irq_enable)), \ 1090 [paravirt_cli_opptr] "m" (pv_irq_ops.irq_disable), \
1091 [paravirt_sti_type] "i" (PARAVIRT_PATCH(pv_irq_ops.irq_enable)), \
1092 [paravirt_sti_opptr] "m" (pv_irq_ops.irq_enable), \
1026 paravirt_clobber(CLBR_EAX) 1093 paravirt_clobber(CLBR_EAX)
1027 1094
1028/* Make sure as little as possible of this mess escapes. */ 1095/* Make sure as little as possible of this mess escapes. */
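CLI_STRING/STI_STRING above no longer compute paravirt_ops + type*4 by hand: the call target is passed to the asm as an "m" operand naming the pv_irq_ops field, so the assembler resolves the address and the asm string only keeps the register save/restore. A simplified, hypothetical illustration of calling through a struct field given as a memory operand (x86-32, build with gcc -m32; it mirrors the kernel's push/pop of %ecx/%edx and the declared %eax clobber):

    static struct {
        void (*irq_disable)(void);
    } demo_irq_ops;

    static void demo_native_cli(void) { /* the native case would execute "cli" */ }

    static inline void demo_raw_irq_disable(void)
    {
        asm volatile("pushl %%ecx; pushl %%edx;"
                     "call *%[op];"
                     "popl %%edx; popl %%ecx"
                     : /* no outputs */
                     : [op] "m" (demo_irq_ops.irq_disable)
                     : "memory", "eax", "cc");
    }

    int main(void)
    {
        demo_irq_ops.irq_disable = demo_native_cli;
        demo_raw_irq_disable();
        return 0;
    }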
@@ -1042,7 +1109,7 @@ static inline unsigned long __raw_local_irq_save(void)
1042 1109
1043#else /* __ASSEMBLY__ */ 1110#else /* __ASSEMBLY__ */
1044 1111
1045#define PARA_PATCH(off) ((off) / 4) 1112#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4)
1046 1113
1047#define PARA_SITE(ptype, clobbers, ops) \ 1114#define PARA_SITE(ptype, clobbers, ops) \
1048771:; \ 1115771:; \
@@ -1055,29 +1122,29 @@ static inline unsigned long __raw_local_irq_save(void)
1055 .short clobbers; \ 1122 .short clobbers; \
1056 .popsection 1123 .popsection
1057 1124
1058#define INTERRUPT_RETURN \ 1125#define INTERRUPT_RETURN \
1059 PARA_SITE(PARA_PATCH(PARAVIRT_iret), CLBR_NONE, \ 1126 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \
1060 jmp *%cs:paravirt_ops+PARAVIRT_iret) 1127 jmp *%cs:pv_cpu_ops+PV_CPU_iret)
1061 1128
1062#define DISABLE_INTERRUPTS(clobbers) \ 1129#define DISABLE_INTERRUPTS(clobbers) \
1063 PARA_SITE(PARA_PATCH(PARAVIRT_irq_disable), clobbers, \ 1130 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
1064 pushl %eax; pushl %ecx; pushl %edx; \ 1131 pushl %eax; pushl %ecx; pushl %edx; \
1065 call *%cs:paravirt_ops+PARAVIRT_irq_disable; \ 1132 call *%cs:pv_irq_ops+PV_IRQ_irq_disable; \
1066 popl %edx; popl %ecx; popl %eax) \ 1133 popl %edx; popl %ecx; popl %eax) \
1067 1134
1068#define ENABLE_INTERRUPTS(clobbers) \ 1135#define ENABLE_INTERRUPTS(clobbers) \
1069 PARA_SITE(PARA_PATCH(PARAVIRT_irq_enable), clobbers, \ 1136 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \
1070 pushl %eax; pushl %ecx; pushl %edx; \ 1137 pushl %eax; pushl %ecx; pushl %edx; \
1071 call *%cs:paravirt_ops+PARAVIRT_irq_enable; \ 1138 call *%cs:pv_irq_ops+PV_IRQ_irq_enable; \
1072 popl %edx; popl %ecx; popl %eax) 1139 popl %edx; popl %ecx; popl %eax)
1073 1140
1074#define ENABLE_INTERRUPTS_SYSEXIT \ 1141#define ENABLE_INTERRUPTS_SYSEXIT \
1075 PARA_SITE(PARA_PATCH(PARAVIRT_irq_enable_sysexit), CLBR_NONE, \ 1142 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), CLBR_NONE,\
1076 jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit) 1143 jmp *%cs:pv_cpu_ops+PV_CPU_irq_enable_sysexit)
1077 1144
1078#define GET_CR0_INTO_EAX \ 1145#define GET_CR0_INTO_EAX \
1079 push %ecx; push %edx; \ 1146 push %ecx; push %edx; \
1080 call *paravirt_ops+PARAVIRT_read_cr0; \ 1147 call *pv_cpu_ops+PV_CPU_read_cr0; \
1081 pop %edx; pop %ecx 1148 pop %edx; pop %ecx
1082 1149
1083#endif /* __ASSEMBLY__ */ 1150#endif /* __ASSEMBLY__ */
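On the assembly side, PARA_PATCH now takes the ops group as well as the field offset: the patch-site type becomes the word index of the field within the combined patch template (PARAVIRT_PATCH_<group> + offset, divided by 4), and the call/jmp itself goes through the group variable (pv_cpu_ops+PV_CPU_iret, pv_irq_ops+PV_IRQ_irq_disable, and so on). A small sketch of how such a word index can be computed, with hypothetical structures standing in for the asm-offsets constants; the kernel divides by 4 (the 32-bit pointer size), while sizeof(void *) keeps this sketch portable:

    #include <stddef.h>
    #include <stdio.h>

    /* Hypothetical stand-ins for the asm-offsets values
     * (PARAVIRT_PATCH_pv_irq_ops, PV_IRQ_irq_disable, ...). */
    struct demo_irq_ops { void *save_fl, *restore_fl, *irq_disable, *irq_enable; };
    struct demo_patch_template {
        struct { void *slot[4]; } cpu_ops;   /* pretend pv_cpu_ops comes first */
        struct demo_irq_ops      irq_ops;
    };

    #define DEMO_PATCH_pv_irq_ops  offsetof(struct demo_patch_template, irq_ops)
    #define DEMO_IRQ_irq_disable   offsetof(struct demo_irq_ops, irq_disable)

    /* cf. #define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4) */
    #define DEMO_PARA_PATCH(grp, off)  ((DEMO_PATCH_##grp + (off)) / sizeof(void *))

    int main(void)
    {
        printf("patch type for irq_disable = %zu\n",
               DEMO_PARA_PATCH(pv_irq_ops, DEMO_IRQ_irq_disable));   /* prints 6 here */
        return 0;
    }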
diff --git a/include/asm-x86/pgtable-3level-defs.h b/include/asm-x86/pgtable-3level-defs.h
index c0df89f66e8b..448ac9516314 100644
--- a/include/asm-x86/pgtable-3level-defs.h
+++ b/include/asm-x86/pgtable-3level-defs.h
@@ -2,7 +2,7 @@
2#define _I386_PGTABLE_3LEVEL_DEFS_H 2#define _I386_PGTABLE_3LEVEL_DEFS_H
3 3
4#ifdef CONFIG_PARAVIRT 4#ifdef CONFIG_PARAVIRT
5#define SHARED_KERNEL_PMD (paravirt_ops.shared_kernel_pmd) 5#define SHARED_KERNEL_PMD (pv_info.shared_kernel_pmd)
6#else 6#else
7#define SHARED_KERNEL_PMD 1 7#define SHARED_KERNEL_PMD 1
8#endif 8#endif
diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h
index ff61ea365997..b05d8a6d9143 100644
--- a/include/xen/interface/vcpu.h
+++ b/include/xen/interface/vcpu.h
@@ -160,8 +160,9 @@ struct vcpu_set_singleshot_timer {
160 */ 160 */
161#define VCPUOP_register_vcpu_info 10 /* arg == struct vcpu_info */ 161#define VCPUOP_register_vcpu_info 10 /* arg == struct vcpu_info */
162struct vcpu_register_vcpu_info { 162struct vcpu_register_vcpu_info {
163 uint32_t mfn; /* mfn of page to place vcpu_info */ 163 uint64_t mfn; /* mfn of page to place vcpu_info */
164 uint32_t offset; /* offset within page */ 164 uint32_t offset; /* offset within page */
165 uint32_t rsvd; /* unused */
165}; 166};
166 167
167#endif /* __XEN_PUBLIC_VCPU_H__ */ 168#endif /* __XEN_PUBLIC_VCPU_H__ */
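The vcpu.h hunk widens the mfn field to 64 bits and adds an explicit rsvd pad, giving the structure the same 16-byte layout for 32-bit and 64-bit guests when it is handed to the hypervisor for VCPUOP_register_vcpu_info. A small sketch of how the argument might be assembled (hypothetical demo_* names; the real code passes it via roughly HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info)):

    #include <stdint.h>

    /* Layout after the change above: 8-byte mfn, 4-byte offset, 4 reserved bytes. */
    struct demo_vcpu_register_vcpu_info {
        uint64_t mfn;      /* machine frame of the page holding vcpu_info */
        uint32_t offset;   /* byte offset of vcpu_info within that page */
        uint32_t rsvd;     /* unused */
    };

    /* Hypothetical helper: build the hypercall argument for a vcpu_info that
     * lives at byte offset 'off' inside the page with machine frame 'mfn'. */
    struct demo_vcpu_register_vcpu_info
    demo_make_vcpu_info_arg(uint64_t mfn, uint32_t off)
    {
        struct demo_vcpu_register_vcpu_info info = {
            .mfn    = mfn,
            .offset = off,
            .rsvd   = 0,
        };
        return info;
    }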
diff --git a/mm/Kconfig b/mm/Kconfig
index 1cc6cada2bbf..b1f03b0eb7f1 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -155,7 +155,6 @@ config SPLIT_PTLOCK_CPUS
155 int 155 int
156 default "4096" if ARM && !CPU_CACHE_VIPT 156 default "4096" if ARM && !CPU_CACHE_VIPT
157 default "4096" if PARISC && !PA20 157 default "4096" if PARISC && !PA20
158 default "4096" if XEN
159 default "4" 158 default "4"
160 159
161# 160#