Diffstat (limited to 'arch')
-rw-r--r--   arch/x86/kernel/alternative.c     |   4
-rw-r--r--   arch/x86/kernel/asm-offsets_32.c  |  14
-rw-r--r--   arch/x86/kernel/entry_32.S        |   2
-rw-r--r--   arch/x86/kernel/paravirt_32.c     | 224
-rw-r--r--   arch/x86/kernel/vmi_32.c          | 201
-rw-r--r--   arch/x86/mm/init_32.c             |  22
-rw-r--r--   arch/x86/xen/enlighten.c          | 233
-rw-r--r--   arch/x86/xen/mmu.c                | 145
-rw-r--r--   arch/x86/xen/multicalls.c         |  52
-rw-r--r--   arch/x86/xen/multicalls.h         |   5
-rw-r--r--   arch/x86/xen/smp.c                |  14
-rw-r--r--   arch/x86/xen/time.c               |   6
-rw-r--r--   arch/x86/xen/xen-ops.h            |  10
13 files changed, 596 insertions(+), 336 deletions(-)
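For orientation: the commit splits the old monolithic struct paravirt_ops into a pv_info descriptor plus per-area ops structures. A rough sketch of the resulting shape, abbreviated from the hunks below (member lists trimmed, so this is not the full header):

    /* Sketch only -- member lists abbreviated from the hunks below. */
    struct pv_info {
            int paravirt_enabled;
            int kernel_rpl;
            int shared_kernel_pmd;
            const char *name;
    };

    struct pv_init_ops {
            unsigned (*patch)(u8 type, u16 clobbers, void *insnbuf,
                              unsigned long addr, unsigned len);
            void (*banner)(void);
            /* arch_setup, memory_setup, post_allocator_init, ... */
    };

    /* pv_time_ops, pv_cpu_ops, pv_irq_ops, pv_apic_ops and pv_mmu_ops follow
     * the same pattern; struct paravirt_patch_template bundles pv_init_ops,
     * pv_time_ops, pv_cpu_ops, pv_irq_ops, pv_apic_ops and pv_mmu_ops in that
     * order for the patching machinery (see get_call_destination() below). */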
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 11b03d3c6f..42421437de 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c | |||
| @@ -369,8 +369,8 @@ void apply_paravirt(struct paravirt_patch_site *start, | |||
| 369 | BUG_ON(p->len > MAX_PATCH_LEN); | 369 | BUG_ON(p->len > MAX_PATCH_LEN); |
| 370 | /* prep the buffer with the original instructions */ | 370 | /* prep the buffer with the original instructions */ |
| 371 | memcpy(insnbuf, p->instr, p->len); | 371 | memcpy(insnbuf, p->instr, p->len); |
| 372 | used = paravirt_ops.patch(p->instrtype, p->clobbers, insnbuf, | 372 | used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf, |
| 373 | (unsigned long)p->instr, p->len); | 373 | (unsigned long)p->instr, p->len); |
| 374 | 374 | ||
| 375 | BUG_ON(used > p->len); | 375 | BUG_ON(used > p->len); |
| 376 | 376 | ||
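The hunk above is the generic patcher: apply_paravirt() walks the array of patch-site records emitted into the .parainstructions section and now routes them through pv_init_ops.patch. A minimal sketch of the record it walks, with field names taken from the code above and the exact layout assumed:

    struct paravirt_patch_site {
            u8  *instr;     /* original, patchable instruction sequence */
            u8   instrtype; /* slot index in the ops template (p->instrtype) */
            u8   len;       /* length of the patchable region (p->len) */
            u16  clobbers;  /* registers the replacement may clobber */
    };

Each backend's .patch hook (native_patch, vmi_patch and xen_patch below) receives exactly these fields.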
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 8029742c0f..f1b7cdda82 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c | |||
| @@ -116,12 +116,14 @@ void foo(void) | |||
| 116 | 116 | ||
| 117 | #ifdef CONFIG_PARAVIRT | 117 | #ifdef CONFIG_PARAVIRT |
| 118 | BLANK(); | 118 | BLANK(); |
| 119 | OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled); | 119 | OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); |
| 120 | OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable); | 120 | OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); |
| 121 | OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable); | 121 | OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); |
| 122 | OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit); | 122 | OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); |
| 123 | OFFSET(PARAVIRT_iret, paravirt_ops, iret); | 123 | OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); |
| 124 | OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); | 124 | OFFSET(PV_CPU_iret, pv_cpu_ops, iret); |
| 125 | OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); | ||
| 126 | OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); | ||
| 125 | #endif | 127 | #endif |
| 126 | 128 | ||
| 127 | #ifdef CONFIG_XEN | 129 | #ifdef CONFIG_XEN |
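These OFFSET() lines exist so assembly code can address the new structures symbolically; entry_32.S in the next hunk uses PARAVIRT_enabled that way. A minimal sketch of the asm-offsets mechanism, assuming the stock kbuild macros:

    #include <linux/stddef.h>

    /* asm-offsets.c is compiled to assembly; a script scrapes the "->" markers
     * into asm-offsets.h, turning C struct offsets into assembler constants. */
    #define DEFINE(sym, val) \
            asm volatile("\n->" #sym " %0 " #val : : "i" (val))
    #define OFFSET(sym, str, mem)  DEFINE(sym, offsetof(struct str, mem))

    /* So OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled) lets entry_32.S
     * write:  cmpl $0, pv_info+PARAVIRT_enabled  */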
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 8099fea0a7..dc7f938e50 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S | |||
| @@ -437,7 +437,7 @@ ldt_ss: | |||
| 437 | * is still available to implement the setting of the high | 437 | * is still available to implement the setting of the high |
| 438 | * 16-bits in the INTERRUPT_RETURN paravirt-op. | 438 | * 16-bits in the INTERRUPT_RETURN paravirt-op. |
| 439 | */ | 439 | */ |
| 440 | cmpl $0, paravirt_ops+PARAVIRT_enabled | 440 | cmpl $0, pv_info+PARAVIRT_enabled |
| 441 | jne restore_nocheck | 441 | jne restore_nocheck |
| 442 | #endif | 442 | #endif |
| 443 | 443 | ||
diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt_32.c index 739cfb207d..6a80d67c21 100644 --- a/arch/x86/kernel/paravirt_32.c +++ b/arch/x86/kernel/paravirt_32.c | |||
| @@ -42,32 +42,33 @@ void _paravirt_nop(void) | |||
| 42 | static void __init default_banner(void) | 42 | static void __init default_banner(void) |
| 43 | { | 43 | { |
| 44 | printk(KERN_INFO "Booting paravirtualized kernel on %s\n", | 44 | printk(KERN_INFO "Booting paravirtualized kernel on %s\n", |
| 45 | paravirt_ops.name); | 45 | pv_info.name); |
| 46 | } | 46 | } |
| 47 | 47 | ||
| 48 | char *memory_setup(void) | 48 | char *memory_setup(void) |
| 49 | { | 49 | { |
| 50 | return paravirt_ops.memory_setup(); | 50 | return pv_init_ops.memory_setup(); |
| 51 | } | 51 | } |
| 52 | 52 | ||
| 53 | /* Simple instruction patching code. */ | 53 | /* Simple instruction patching code. */ |
| 54 | #define DEF_NATIVE(name, code) \ | 54 | #define DEF_NATIVE(ops, name, code) \ |
| 55 | extern const char start_##name[], end_##name[]; \ | 55 | extern const char start_##ops##_##name[], end_##ops##_##name[]; \ |
| 56 | asm("start_" #name ": " code "; end_" #name ":") | 56 | asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") |
| 57 | 57 | ||
| 58 | DEF_NATIVE(irq_disable, "cli"); | 58 | DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); |
| 59 | DEF_NATIVE(irq_enable, "sti"); | 59 | DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); |
| 60 | DEF_NATIVE(restore_fl, "push %eax; popf"); | 60 | DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); |
| 61 | DEF_NATIVE(save_fl, "pushf; pop %eax"); | 61 | DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); |
| 62 | DEF_NATIVE(iret, "iret"); | 62 | DEF_NATIVE(pv_cpu_ops, iret, "iret"); |
| 63 | DEF_NATIVE(irq_enable_sysexit, "sti; sysexit"); | 63 | DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit"); |
| 64 | DEF_NATIVE(read_cr2, "mov %cr2, %eax"); | 64 | DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); |
| 65 | DEF_NATIVE(write_cr3, "mov %eax, %cr3"); | 65 | DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); |
| 66 | DEF_NATIVE(read_cr3, "mov %cr3, %eax"); | 66 | DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); |
| 67 | DEF_NATIVE(clts, "clts"); | 67 | DEF_NATIVE(pv_cpu_ops, clts, "clts"); |
| 68 | DEF_NATIVE(read_tsc, "rdtsc"); | 68 | DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); |
| 69 | 69 | ||
| 70 | DEF_NATIVE(ud2a, "ud2a"); | 70 | /* Undefined instruction for dealing with missing ops pointers. */ |
| 71 | static const unsigned char ud2a[] = { 0x0f, 0x0b }; | ||
| 71 | 72 | ||
| 72 | static unsigned native_patch(u8 type, u16 clobbers, void *ibuf, | 73 | static unsigned native_patch(u8 type, u16 clobbers, void *ibuf, |
| 73 | unsigned long addr, unsigned len) | 74 | unsigned long addr, unsigned len) |
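The switch inside native_patch() continues in the next hunk; for reference, the reworked DEF_NATIVE() above simply folds the owning structure into the symbol names, so its first user expands roughly to:

    extern const char start_pv_irq_ops_irq_disable[], end_pv_irq_ops_irq_disable[];
    asm("start_pv_irq_ops_irq_disable: cli; end_pv_irq_ops_irq_disable:");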
| @@ -76,37 +77,29 @@ static unsigned native_patch(u8 type, u16 clobbers, void *ibuf, | |||
| 76 | unsigned ret; | 77 | unsigned ret; |
| 77 | 78 | ||
| 78 | switch(type) { | 79 | switch(type) { |
| 79 | #define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site | 80 | #define SITE(ops, x) \ |
| 80 | SITE(irq_disable); | 81 | case PARAVIRT_PATCH(ops.x): \ |
| 81 | SITE(irq_enable); | 82 | start = start_##ops##_##x; \ |
| 82 | SITE(restore_fl); | 83 | end = end_##ops##_##x; \ |
| 83 | SITE(save_fl); | 84 | goto patch_site |
| 84 | SITE(iret); | 85 | |
| 85 | SITE(irq_enable_sysexit); | 86 | SITE(pv_irq_ops, irq_disable); |
| 86 | SITE(read_cr2); | 87 | SITE(pv_irq_ops, irq_enable); |
| 87 | SITE(read_cr3); | 88 | SITE(pv_irq_ops, restore_fl); |
| 88 | SITE(write_cr3); | 89 | SITE(pv_irq_ops, save_fl); |
| 89 | SITE(clts); | 90 | SITE(pv_cpu_ops, iret); |
| 90 | SITE(read_tsc); | 91 | SITE(pv_cpu_ops, irq_enable_sysexit); |
| 92 | SITE(pv_mmu_ops, read_cr2); | ||
| 93 | SITE(pv_mmu_ops, read_cr3); | ||
| 94 | SITE(pv_mmu_ops, write_cr3); | ||
| 95 | SITE(pv_cpu_ops, clts); | ||
| 96 | SITE(pv_cpu_ops, read_tsc); | ||
| 91 | #undef SITE | 97 | #undef SITE |
| 92 | 98 | ||
| 93 | patch_site: | 99 | patch_site: |
| 94 | ret = paravirt_patch_insns(ibuf, len, start, end); | 100 | ret = paravirt_patch_insns(ibuf, len, start, end); |
| 95 | break; | 101 | break; |
| 96 | 102 | ||
| 97 | case PARAVIRT_PATCH(make_pgd): | ||
| 98 | case PARAVIRT_PATCH(make_pte): | ||
| 99 | case PARAVIRT_PATCH(pgd_val): | ||
| 100 | case PARAVIRT_PATCH(pte_val): | ||
| 101 | #ifdef CONFIG_X86_PAE | ||
| 102 | case PARAVIRT_PATCH(make_pmd): | ||
| 103 | case PARAVIRT_PATCH(pmd_val): | ||
| 104 | #endif | ||
| 105 | /* These functions end up returning exactly what | ||
| 106 | they're passed, in the same registers. */ | ||
| 107 | ret = paravirt_patch_nop(); | ||
| 108 | break; | ||
| 109 | |||
| 110 | default: | 103 | default: |
| 111 | ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); | 104 | ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); |
| 112 | break; | 105 | break; |
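The SITE() cases (and vmi_patch()/xen_patch() later in this diff) all switch on the value produced by PARAVIRT_PATCH(). A sketch of that indexing, assuming the usual paravirt.h definition:

    /* Assumed definition: the slot number of a function pointer inside the
     * combined template of all ops structures. */
    #define PARAVIRT_PATCH(x) \
            (offsetof(struct paravirt_patch_template, x) / sizeof(void *))

    /* e.g. PARAVIRT_PATCH(pv_irq_ops.irq_disable) is the index stored in a
     * patch site's instrtype field, and get_call_destination() below maps it
     * back to the live function pointer. */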
| @@ -150,7 +143,7 @@ unsigned paravirt_patch_call(void *insnbuf, | |||
| 150 | return 5; | 143 | return 5; |
| 151 | } | 144 | } |
| 152 | 145 | ||
| 153 | unsigned paravirt_patch_jmp(const void *target, void *insnbuf, | 146 | unsigned paravirt_patch_jmp(void *insnbuf, const void *target, |
| 154 | unsigned long addr, unsigned len) | 147 | unsigned long addr, unsigned len) |
| 155 | { | 148 | { |
| 156 | struct branch *b = insnbuf; | 149 | struct branch *b = insnbuf; |
| @@ -165,22 +158,37 @@ unsigned paravirt_patch_jmp(const void *target, void *insnbuf, | |||
| 165 | return 5; | 158 | return 5; |
| 166 | } | 159 | } |
| 167 | 160 | ||
| 161 | /* Neat trick to map patch type back to the call within the | ||
| 162 | * corresponding structure. */ | ||
| 163 | static void *get_call_destination(u8 type) | ||
| 164 | { | ||
| 165 | struct paravirt_patch_template tmpl = { | ||
| 166 | .pv_init_ops = pv_init_ops, | ||
| 167 | .pv_time_ops = pv_time_ops, | ||
| 168 | .pv_cpu_ops = pv_cpu_ops, | ||
| 169 | .pv_irq_ops = pv_irq_ops, | ||
| 170 | .pv_apic_ops = pv_apic_ops, | ||
| 171 | .pv_mmu_ops = pv_mmu_ops, | ||
| 172 | }; | ||
| 173 | return *((void **)&tmpl + type); | ||
| 174 | } | ||
| 175 | |||
| 168 | unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, | 176 | unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, |
| 169 | unsigned long addr, unsigned len) | 177 | unsigned long addr, unsigned len) |
| 170 | { | 178 | { |
| 171 | void *opfunc = *((void **)¶virt_ops + type); | 179 | void *opfunc = get_call_destination(type); |
| 172 | unsigned ret; | 180 | unsigned ret; |
| 173 | 181 | ||
| 174 | if (opfunc == NULL) | 182 | if (opfunc == NULL) |
| 175 | /* If there's no function, patch it with a ud2a (BUG) */ | 183 | /* If there's no function, patch it with a ud2a (BUG) */ |
| 176 | ret = paravirt_patch_insns(insnbuf, len, start_ud2a, end_ud2a); | 184 | ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a)); |
| 177 | else if (opfunc == paravirt_nop) | 185 | else if (opfunc == paravirt_nop) |
| 178 | /* If the operation is a nop, then nop the callsite */ | 186 | /* If the operation is a nop, then nop the callsite */ |
| 179 | ret = paravirt_patch_nop(); | 187 | ret = paravirt_patch_nop(); |
| 180 | else if (type == PARAVIRT_PATCH(iret) || | 188 | else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || |
| 181 | type == PARAVIRT_PATCH(irq_enable_sysexit)) | 189 | type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit)) |
| 182 | /* If operation requires a jmp, then jmp */ | 190 | /* If operation requires a jmp, then jmp */ |
| 183 | ret = paravirt_patch_jmp(opfunc, insnbuf, addr, len); | 191 | ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); |
| 184 | else | 192 | else |
| 185 | /* Otherwise call the function; assume target could | 193 | /* Otherwise call the function; assume target could |
| 186 | clobber any caller-save reg */ | 194 | clobber any caller-save reg */ |
| @@ -205,7 +213,7 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len, | |||
| 205 | 213 | ||
| 206 | void init_IRQ(void) | 214 | void init_IRQ(void) |
| 207 | { | 215 | { |
| 208 | paravirt_ops.init_IRQ(); | 216 | pv_irq_ops.init_IRQ(); |
| 209 | } | 217 | } |
| 210 | 218 | ||
| 211 | static void native_flush_tlb(void) | 219 | static void native_flush_tlb(void) |
| @@ -233,7 +241,7 @@ extern void native_irq_enable_sysexit(void); | |||
| 233 | 241 | ||
| 234 | static int __init print_banner(void) | 242 | static int __init print_banner(void) |
| 235 | { | 243 | { |
| 236 | paravirt_ops.banner(); | 244 | pv_init_ops.banner(); |
| 237 | return 0; | 245 | return 0; |
| 238 | } | 246 | } |
| 239 | core_initcall(print_banner); | 247 | core_initcall(print_banner); |
| @@ -273,47 +281,96 @@ int paravirt_disable_iospace(void) | |||
| 273 | return ret; | 281 | return ret; |
| 274 | } | 282 | } |
| 275 | 283 | ||
| 276 | struct paravirt_ops paravirt_ops = { | 284 | static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; |
| 285 | |||
| 286 | static inline void enter_lazy(enum paravirt_lazy_mode mode) | ||
| 287 | { | ||
| 288 | BUG_ON(x86_read_percpu(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); | ||
| 289 | BUG_ON(preemptible()); | ||
| 290 | |||
| 291 | x86_write_percpu(paravirt_lazy_mode, mode); | ||
| 292 | } | ||
| 293 | |||
| 294 | void paravirt_leave_lazy(enum paravirt_lazy_mode mode) | ||
| 295 | { | ||
| 296 | BUG_ON(x86_read_percpu(paravirt_lazy_mode) != mode); | ||
| 297 | BUG_ON(preemptible()); | ||
| 298 | |||
| 299 | x86_write_percpu(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); | ||
| 300 | } | ||
| 301 | |||
| 302 | void paravirt_enter_lazy_mmu(void) | ||
| 303 | { | ||
| 304 | enter_lazy(PARAVIRT_LAZY_MMU); | ||
| 305 | } | ||
| 306 | |||
| 307 | void paravirt_leave_lazy_mmu(void) | ||
| 308 | { | ||
| 309 | paravirt_leave_lazy(PARAVIRT_LAZY_MMU); | ||
| 310 | } | ||
| 311 | |||
| 312 | void paravirt_enter_lazy_cpu(void) | ||
| 313 | { | ||
| 314 | enter_lazy(PARAVIRT_LAZY_CPU); | ||
| 315 | } | ||
| 316 | |||
| 317 | void paravirt_leave_lazy_cpu(void) | ||
| 318 | { | ||
| 319 | paravirt_leave_lazy(PARAVIRT_LAZY_CPU); | ||
| 320 | } | ||
| 321 | |||
| 322 | enum paravirt_lazy_mode paravirt_get_lazy_mode(void) | ||
| 323 | { | ||
| 324 | return x86_read_percpu(paravirt_lazy_mode); | ||
| 325 | } | ||
| 326 | |||
| 327 | struct pv_info pv_info = { | ||
| 277 | .name = "bare hardware", | 328 | .name = "bare hardware", |
| 278 | .paravirt_enabled = 0, | 329 | .paravirt_enabled = 0, |
| 279 | .kernel_rpl = 0, | 330 | .kernel_rpl = 0, |
| 280 | .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ | 331 | .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ |
| 332 | }; | ||
| 281 | 333 | ||
| 282 | .patch = native_patch, | 334 | struct pv_init_ops pv_init_ops = { |
| 335 | .patch = native_patch, | ||
| 283 | .banner = default_banner, | 336 | .banner = default_banner, |
| 284 | .arch_setup = paravirt_nop, | 337 | .arch_setup = paravirt_nop, |
| 285 | .memory_setup = machine_specific_memory_setup, | 338 | .memory_setup = machine_specific_memory_setup, |
| 339 | }; | ||
| 340 | |||
| 341 | struct pv_time_ops pv_time_ops = { | ||
| 342 | .time_init = hpet_time_init, | ||
| 286 | .get_wallclock = native_get_wallclock, | 343 | .get_wallclock = native_get_wallclock, |
| 287 | .set_wallclock = native_set_wallclock, | 344 | .set_wallclock = native_set_wallclock, |
| 288 | .time_init = hpet_time_init, | 345 | .sched_clock = native_sched_clock, |
| 346 | .get_cpu_khz = native_calculate_cpu_khz, | ||
| 347 | }; | ||
| 348 | |||
| 349 | struct pv_irq_ops pv_irq_ops = { | ||
| 289 | .init_IRQ = native_init_IRQ, | 350 | .init_IRQ = native_init_IRQ, |
| 351 | .save_fl = native_save_fl, | ||
| 352 | .restore_fl = native_restore_fl, | ||
| 353 | .irq_disable = native_irq_disable, | ||
| 354 | .irq_enable = native_irq_enable, | ||
| 355 | .safe_halt = native_safe_halt, | ||
| 356 | .halt = native_halt, | ||
| 357 | }; | ||
| 290 | 358 | ||
| 359 | struct pv_cpu_ops pv_cpu_ops = { | ||
| 291 | .cpuid = native_cpuid, | 360 | .cpuid = native_cpuid, |
| 292 | .get_debugreg = native_get_debugreg, | 361 | .get_debugreg = native_get_debugreg, |
| 293 | .set_debugreg = native_set_debugreg, | 362 | .set_debugreg = native_set_debugreg, |
| 294 | .clts = native_clts, | 363 | .clts = native_clts, |
| 295 | .read_cr0 = native_read_cr0, | 364 | .read_cr0 = native_read_cr0, |
| 296 | .write_cr0 = native_write_cr0, | 365 | .write_cr0 = native_write_cr0, |
| 297 | .read_cr2 = native_read_cr2, | ||
| 298 | .write_cr2 = native_write_cr2, | ||
| 299 | .read_cr3 = native_read_cr3, | ||
| 300 | .write_cr3 = native_write_cr3, | ||
| 301 | .read_cr4 = native_read_cr4, | 366 | .read_cr4 = native_read_cr4, |
| 302 | .read_cr4_safe = native_read_cr4_safe, | 367 | .read_cr4_safe = native_read_cr4_safe, |
| 303 | .write_cr4 = native_write_cr4, | 368 | .write_cr4 = native_write_cr4, |
| 304 | .save_fl = native_save_fl, | ||
| 305 | .restore_fl = native_restore_fl, | ||
| 306 | .irq_disable = native_irq_disable, | ||
| 307 | .irq_enable = native_irq_enable, | ||
| 308 | .safe_halt = native_safe_halt, | ||
| 309 | .halt = native_halt, | ||
| 310 | .wbinvd = native_wbinvd, | 369 | .wbinvd = native_wbinvd, |
| 311 | .read_msr = native_read_msr_safe, | 370 | .read_msr = native_read_msr_safe, |
| 312 | .write_msr = native_write_msr_safe, | 371 | .write_msr = native_write_msr_safe, |
| 313 | .read_tsc = native_read_tsc, | 372 | .read_tsc = native_read_tsc, |
| 314 | .read_pmc = native_read_pmc, | 373 | .read_pmc = native_read_pmc, |
| 315 | .sched_clock = native_sched_clock, | ||
| 316 | .get_cpu_khz = native_calculate_cpu_khz, | ||
| 317 | .load_tr_desc = native_load_tr_desc, | 374 | .load_tr_desc = native_load_tr_desc, |
| 318 | .set_ldt = native_set_ldt, | 375 | .set_ldt = native_set_ldt, |
| 319 | .load_gdt = native_load_gdt, | 376 | .load_gdt = native_load_gdt, |
| @@ -327,9 +384,19 @@ struct paravirt_ops paravirt_ops = { | |||
| 327 | .write_idt_entry = write_dt_entry, | 384 | .write_idt_entry = write_dt_entry, |
| 328 | .load_esp0 = native_load_esp0, | 385 | .load_esp0 = native_load_esp0, |
| 329 | 386 | ||
| 387 | .irq_enable_sysexit = native_irq_enable_sysexit, | ||
| 388 | .iret = native_iret, | ||
| 389 | |||
| 330 | .set_iopl_mask = native_set_iopl_mask, | 390 | .set_iopl_mask = native_set_iopl_mask, |
| 331 | .io_delay = native_io_delay, | 391 | .io_delay = native_io_delay, |
| 332 | 392 | ||
| 393 | .lazy_mode = { | ||
| 394 | .enter = paravirt_nop, | ||
| 395 | .leave = paravirt_nop, | ||
| 396 | }, | ||
| 397 | }; | ||
| 398 | |||
| 399 | struct pv_apic_ops pv_apic_ops = { | ||
| 333 | #ifdef CONFIG_X86_LOCAL_APIC | 400 | #ifdef CONFIG_X86_LOCAL_APIC |
| 334 | .apic_write = native_apic_write, | 401 | .apic_write = native_apic_write, |
| 335 | .apic_write_atomic = native_apic_write_atomic, | 402 | .apic_write_atomic = native_apic_write_atomic, |
| @@ -338,11 +405,17 @@ struct paravirt_ops paravirt_ops = { | |||
| 338 | .setup_secondary_clock = setup_secondary_APIC_clock, | 405 | .setup_secondary_clock = setup_secondary_APIC_clock, |
| 339 | .startup_ipi_hook = paravirt_nop, | 406 | .startup_ipi_hook = paravirt_nop, |
| 340 | #endif | 407 | #endif |
| 341 | .set_lazy_mode = paravirt_nop, | 408 | }; |
| 342 | 409 | ||
| 410 | struct pv_mmu_ops pv_mmu_ops = { | ||
| 343 | .pagetable_setup_start = native_pagetable_setup_start, | 411 | .pagetable_setup_start = native_pagetable_setup_start, |
| 344 | .pagetable_setup_done = native_pagetable_setup_done, | 412 | .pagetable_setup_done = native_pagetable_setup_done, |
| 345 | 413 | ||
| 414 | .read_cr2 = native_read_cr2, | ||
| 415 | .write_cr2 = native_write_cr2, | ||
| 416 | .read_cr3 = native_read_cr3, | ||
| 417 | .write_cr3 = native_write_cr3, | ||
| 418 | |||
| 346 | .flush_tlb_user = native_flush_tlb, | 419 | .flush_tlb_user = native_flush_tlb, |
| 347 | .flush_tlb_kernel = native_flush_tlb_global, | 420 | .flush_tlb_kernel = native_flush_tlb_global, |
| 348 | .flush_tlb_single = native_flush_tlb_single, | 421 | .flush_tlb_single = native_flush_tlb_single, |
| @@ -381,12 +454,19 @@ struct paravirt_ops paravirt_ops = { | |||
| 381 | .make_pte = native_make_pte, | 454 | .make_pte = native_make_pte, |
| 382 | .make_pgd = native_make_pgd, | 455 | .make_pgd = native_make_pgd, |
| 383 | 456 | ||
| 384 | .irq_enable_sysexit = native_irq_enable_sysexit, | ||
| 385 | .iret = native_iret, | ||
| 386 | |||
| 387 | .dup_mmap = paravirt_nop, | 457 | .dup_mmap = paravirt_nop, |
| 388 | .exit_mmap = paravirt_nop, | 458 | .exit_mmap = paravirt_nop, |
| 389 | .activate_mm = paravirt_nop, | 459 | .activate_mm = paravirt_nop, |
| 460 | |||
| 461 | .lazy_mode = { | ||
| 462 | .enter = paravirt_nop, | ||
| 463 | .leave = paravirt_nop, | ||
| 464 | }, | ||
| 390 | }; | 465 | }; |
| 391 | 466 | ||
| 392 | EXPORT_SYMBOL(paravirt_ops); | 467 | EXPORT_SYMBOL_GPL(pv_time_ops); |
| 468 | EXPORT_SYMBOL_GPL(pv_cpu_ops); | ||
| 469 | EXPORT_SYMBOL_GPL(pv_mmu_ops); | ||
| 470 | EXPORT_SYMBOL_GPL(pv_apic_ops); | ||
| 471 | EXPORT_SYMBOL_GPL(pv_info); | ||
| 472 | EXPORT_SYMBOL (pv_irq_ops); | ||
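With set_lazy_mode gone, a backend now wires the lazy_mode.enter/leave hooks to thin wrappers around the generic helpers introduced above; the VMI and Xen hunks below do exactly this. A hypothetical minimal backend (hv_flush_batch() is a made-up stand-in for the hypervisor-specific flush):

    static void hv_enter_lazy_mmu(void)
    {
            paravirt_enter_lazy_mmu();      /* generic per-cpu mode tracking */
    }

    static void hv_leave_lazy_mmu(void)
    {
            paravirt_leave_lazy(paravirt_get_lazy_mode());
            hv_flush_batch();               /* push any batched MMU updates */
    }

    /* wired up as:
     *   pv_mmu_ops.lazy_mode.enter = hv_enter_lazy_mmu;
     *   pv_mmu_ops.lazy_mode.leave = hv_leave_lazy_mmu;
     */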
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 18673e0f19..f02bad68ab 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c | |||
| @@ -134,21 +134,21 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, | |||
| 134 | unsigned long eip, unsigned len) | 134 | unsigned long eip, unsigned len) |
| 135 | { | 135 | { |
| 136 | switch (type) { | 136 | switch (type) { |
| 137 | case PARAVIRT_PATCH(irq_disable): | 137 | case PARAVIRT_PATCH(pv_irq_ops.irq_disable): |
| 138 | return patch_internal(VMI_CALL_DisableInterrupts, len, | 138 | return patch_internal(VMI_CALL_DisableInterrupts, len, |
| 139 | insns, eip); | 139 | insns, eip); |
| 140 | case PARAVIRT_PATCH(irq_enable): | 140 | case PARAVIRT_PATCH(pv_irq_ops.irq_enable): |
| 141 | return patch_internal(VMI_CALL_EnableInterrupts, len, | 141 | return patch_internal(VMI_CALL_EnableInterrupts, len, |
| 142 | insns, eip); | 142 | insns, eip); |
| 143 | case PARAVIRT_PATCH(restore_fl): | 143 | case PARAVIRT_PATCH(pv_irq_ops.restore_fl): |
| 144 | return patch_internal(VMI_CALL_SetInterruptMask, len, | 144 | return patch_internal(VMI_CALL_SetInterruptMask, len, |
| 145 | insns, eip); | 145 | insns, eip); |
| 146 | case PARAVIRT_PATCH(save_fl): | 146 | case PARAVIRT_PATCH(pv_irq_ops.save_fl): |
| 147 | return patch_internal(VMI_CALL_GetInterruptMask, len, | 147 | return patch_internal(VMI_CALL_GetInterruptMask, len, |
| 148 | insns, eip); | 148 | insns, eip); |
| 149 | case PARAVIRT_PATCH(iret): | 149 | case PARAVIRT_PATCH(pv_cpu_ops.iret): |
| 150 | return patch_internal(VMI_CALL_IRET, len, insns, eip); | 150 | return patch_internal(VMI_CALL_IRET, len, insns, eip); |
| 151 | case PARAVIRT_PATCH(irq_enable_sysexit): | 151 | case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit): |
| 152 | return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip); | 152 | return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip); |
| 153 | default: | 153 | default: |
| 154 | break; | 154 | break; |
| @@ -552,24 +552,22 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, | |||
| 552 | } | 552 | } |
| 553 | #endif | 553 | #endif |
| 554 | 554 | ||
| 555 | static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode) | 555 | static void vmi_enter_lazy_cpu(void) |
| 556 | { | 556 | { |
| 557 | static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode); | 557 | paravirt_enter_lazy_cpu(); |
| 558 | 558 | vmi_ops.set_lazy_mode(2); | |
| 559 | if (!vmi_ops.set_lazy_mode) | 559 | } |
| 560 | return; | ||
| 561 | 560 | ||
| 562 | /* Modes should never nest or overlap */ | 561 | static void vmi_enter_lazy_mmu(void) |
| 563 | BUG_ON(__get_cpu_var(lazy_mode) && !(mode == PARAVIRT_LAZY_NONE || | 562 | { |
| 564 | mode == PARAVIRT_LAZY_FLUSH)); | 563 | paravirt_enter_lazy_mmu(); |
| 564 | vmi_ops.set_lazy_mode(1); | ||
| 565 | } | ||
| 565 | 566 | ||
| 566 | if (mode == PARAVIRT_LAZY_FLUSH) { | 567 | static void vmi_leave_lazy(void) |
| 567 | vmi_ops.set_lazy_mode(0); | 568 | { |
| 568 | vmi_ops.set_lazy_mode(__get_cpu_var(lazy_mode)); | 569 | paravirt_leave_lazy(paravirt_get_lazy_mode()); |
| 569 | } else { | 570 | vmi_ops.set_lazy_mode(0); |
| 570 | vmi_ops.set_lazy_mode(mode); | ||
| 571 | __get_cpu_var(lazy_mode) = mode; | ||
| 572 | } | ||
| 573 | } | 571 | } |
| 574 | 572 | ||
| 575 | static inline int __init check_vmi_rom(struct vrom_header *rom) | 573 | static inline int __init check_vmi_rom(struct vrom_header *rom) |
| @@ -690,9 +688,9 @@ do { \ | |||
| 690 | reloc = call_vrom_long_func(vmi_rom, get_reloc, \ | 688 | reloc = call_vrom_long_func(vmi_rom, get_reloc, \ |
| 691 | VMI_CALL_##vmicall); \ | 689 | VMI_CALL_##vmicall); \ |
| 692 | if (rel->type == VMI_RELOCATION_CALL_REL) \ | 690 | if (rel->type == VMI_RELOCATION_CALL_REL) \ |
| 693 | paravirt_ops.opname = (void *)rel->eip; \ | 691 | opname = (void *)rel->eip; \ |
| 694 | else if (rel->type == VMI_RELOCATION_NOP) \ | 692 | else if (rel->type == VMI_RELOCATION_NOP) \ |
| 695 | paravirt_ops.opname = (void *)vmi_nop; \ | 693 | opname = (void *)vmi_nop; \ |
| 696 | else if (rel->type != VMI_RELOCATION_NONE) \ | 694 | else if (rel->type != VMI_RELOCATION_NONE) \ |
| 697 | printk(KERN_WARNING "VMI: Unknown relocation " \ | 695 | printk(KERN_WARNING "VMI: Unknown relocation " \ |
| 698 | "type %d for " #vmicall"\n",\ | 696 | "type %d for " #vmicall"\n",\ |
| @@ -712,7 +710,7 @@ do { \ | |||
| 712 | VMI_CALL_##vmicall); \ | 710 | VMI_CALL_##vmicall); \ |
| 713 | BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \ | 711 | BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \ |
| 714 | if (rel->type == VMI_RELOCATION_CALL_REL) { \ | 712 | if (rel->type == VMI_RELOCATION_CALL_REL) { \ |
| 715 | paravirt_ops.opname = wrapper; \ | 713 | opname = wrapper; \ |
| 716 | vmi_ops.cache = (void *)rel->eip; \ | 714 | vmi_ops.cache = (void *)rel->eip; \ |
| 717 | } \ | 715 | } \ |
| 718 | } while (0) | 716 | } while (0) |
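Because para_fill()/para_wrap() now take the full lvalue (e.g. pv_cpu_ops.clts rather than a bare member name), the macro bodies above can assign through opname directly; activate_vmi() below uses them that way. Expanding one call by hand, under the para_fill() body shown in this hunk (warning branch omitted):

    /* para_fill(pv_cpu_ops.clts, CLTS) becomes roughly: */
    reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_CLTS);
    if (rel->type == VMI_RELOCATION_CALL_REL)
            pv_cpu_ops.clts = (void *)rel->eip;
    else if (rel->type == VMI_RELOCATION_NOP)
            pv_cpu_ops.clts = (void *)vmi_nop;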
| @@ -732,11 +730,11 @@ static inline int __init activate_vmi(void) | |||
| 732 | } | 730 | } |
| 733 | savesegment(cs, kernel_cs); | 731 | savesegment(cs, kernel_cs); |
| 734 | 732 | ||
| 735 | paravirt_ops.paravirt_enabled = 1; | 733 | pv_info.paravirt_enabled = 1; |
| 736 | paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; | 734 | pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; |
| 735 | pv_info.name = "vmi"; | ||
| 737 | 736 | ||
| 738 | paravirt_ops.patch = vmi_patch; | 737 | pv_init_ops.patch = vmi_patch; |
| 739 | paravirt_ops.name = "vmi"; | ||
| 740 | 738 | ||
| 741 | /* | 739 | /* |
| 742 | * Many of these operations are ABI compatible with VMI. | 740 | * Many of these operations are ABI compatible with VMI. |
| @@ -754,26 +752,26 @@ static inline int __init activate_vmi(void) | |||
| 754 | */ | 752 | */ |
| 755 | 753 | ||
| 756 | /* CPUID is special, so very special it gets wrapped like a present */ | 754 | /* CPUID is special, so very special it gets wrapped like a present */ |
| 757 | para_wrap(cpuid, vmi_cpuid, cpuid, CPUID); | 755 | para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID); |
| 758 | 756 | ||
| 759 | para_fill(clts, CLTS); | 757 | para_fill(pv_cpu_ops.clts, CLTS); |
| 760 | para_fill(get_debugreg, GetDR); | 758 | para_fill(pv_cpu_ops.get_debugreg, GetDR); |
| 761 | para_fill(set_debugreg, SetDR); | 759 | para_fill(pv_cpu_ops.set_debugreg, SetDR); |
| 762 | para_fill(read_cr0, GetCR0); | 760 | para_fill(pv_cpu_ops.read_cr0, GetCR0); |
| 763 | para_fill(read_cr2, GetCR2); | 761 | para_fill(pv_mmu_ops.read_cr2, GetCR2); |
| 764 | para_fill(read_cr3, GetCR3); | 762 | para_fill(pv_mmu_ops.read_cr3, GetCR3); |
| 765 | para_fill(read_cr4, GetCR4); | 763 | para_fill(pv_cpu_ops.read_cr4, GetCR4); |
| 766 | para_fill(write_cr0, SetCR0); | 764 | para_fill(pv_cpu_ops.write_cr0, SetCR0); |
| 767 | para_fill(write_cr2, SetCR2); | 765 | para_fill(pv_mmu_ops.write_cr2, SetCR2); |
| 768 | para_fill(write_cr3, SetCR3); | 766 | para_fill(pv_mmu_ops.write_cr3, SetCR3); |
| 769 | para_fill(write_cr4, SetCR4); | 767 | para_fill(pv_cpu_ops.write_cr4, SetCR4); |
| 770 | para_fill(save_fl, GetInterruptMask); | 768 | para_fill(pv_irq_ops.save_fl, GetInterruptMask); |
| 771 | para_fill(restore_fl, SetInterruptMask); | 769 | para_fill(pv_irq_ops.restore_fl, SetInterruptMask); |
| 772 | para_fill(irq_disable, DisableInterrupts); | 770 | para_fill(pv_irq_ops.irq_disable, DisableInterrupts); |
| 773 | para_fill(irq_enable, EnableInterrupts); | 771 | para_fill(pv_irq_ops.irq_enable, EnableInterrupts); |
| 774 | 772 | ||
| 775 | para_fill(wbinvd, WBINVD); | 773 | para_fill(pv_cpu_ops.wbinvd, WBINVD); |
| 776 | para_fill(read_tsc, RDTSC); | 774 | para_fill(pv_cpu_ops.read_tsc, RDTSC); |
| 777 | 775 | ||
| 778 | /* The following we emulate with trap and emulate for now */ | 776 | /* The following we emulate with trap and emulate for now */ |
| 779 | /* paravirt_ops.read_msr = vmi_rdmsr */ | 777 | /* paravirt_ops.read_msr = vmi_rdmsr */ |
| @@ -781,29 +779,38 @@ static inline int __init activate_vmi(void) | |||
| 781 | /* paravirt_ops.rdpmc = vmi_rdpmc */ | 779 | /* paravirt_ops.rdpmc = vmi_rdpmc */ |
| 782 | 780 | ||
| 783 | /* TR interface doesn't pass TR value, wrap */ | 781 | /* TR interface doesn't pass TR value, wrap */ |
| 784 | para_wrap(load_tr_desc, vmi_set_tr, set_tr, SetTR); | 782 | para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR); |
| 785 | 783 | ||
| 786 | /* LDT is special, too */ | 784 | /* LDT is special, too */ |
| 787 | para_wrap(set_ldt, vmi_set_ldt, _set_ldt, SetLDT); | 785 | para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT); |
| 788 | 786 | ||
| 789 | para_fill(load_gdt, SetGDT); | 787 | para_fill(pv_cpu_ops.load_gdt, SetGDT); |
| 790 | para_fill(load_idt, SetIDT); | 788 | para_fill(pv_cpu_ops.load_idt, SetIDT); |
| 791 | para_fill(store_gdt, GetGDT); | 789 | para_fill(pv_cpu_ops.store_gdt, GetGDT); |
| 792 | para_fill(store_idt, GetIDT); | 790 | para_fill(pv_cpu_ops.store_idt, GetIDT); |
| 793 | para_fill(store_tr, GetTR); | 791 | para_fill(pv_cpu_ops.store_tr, GetTR); |
| 794 | paravirt_ops.load_tls = vmi_load_tls; | 792 | pv_cpu_ops.load_tls = vmi_load_tls; |
| 795 | para_fill(write_ldt_entry, WriteLDTEntry); | 793 | para_fill(pv_cpu_ops.write_ldt_entry, WriteLDTEntry); |
| 796 | para_fill(write_gdt_entry, WriteGDTEntry); | 794 | para_fill(pv_cpu_ops.write_gdt_entry, WriteGDTEntry); |
| 797 | para_fill(write_idt_entry, WriteIDTEntry); | 795 | para_fill(pv_cpu_ops.write_idt_entry, WriteIDTEntry); |
| 798 | para_wrap(load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack); | 796 | para_wrap(pv_cpu_ops.load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack); |
| 799 | para_fill(set_iopl_mask, SetIOPLMask); | 797 | para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); |
| 800 | para_fill(io_delay, IODelay); | 798 | para_fill(pv_cpu_ops.io_delay, IODelay); |
| 801 | para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode); | 799 | |
| 800 | para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, | ||
| 801 | set_lazy_mode, SetLazyMode); | ||
| 802 | para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy, | ||
| 803 | set_lazy_mode, SetLazyMode); | ||
| 804 | |||
| 805 | para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, | ||
| 806 | set_lazy_mode, SetLazyMode); | ||
| 807 | para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy, | ||
| 808 | set_lazy_mode, SetLazyMode); | ||
| 802 | 809 | ||
| 803 | /* user and kernel flush are just handled with different flags to FlushTLB */ | 810 | /* user and kernel flush are just handled with different flags to FlushTLB */ |
| 804 | para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB); | 811 | para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB); |
| 805 | para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB); | 812 | para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB); |
| 806 | para_fill(flush_tlb_single, InvalPage); | 813 | para_fill(pv_mmu_ops.flush_tlb_single, InvalPage); |
| 807 | 814 | ||
| 808 | /* | 815 | /* |
| 809 | * Until a standard flag format can be agreed on, we need to | 816 | * Until a standard flag format can be agreed on, we need to |
| @@ -819,41 +826,41 @@ static inline int __init activate_vmi(void) | |||
| 819 | #endif | 826 | #endif |
| 820 | 827 | ||
| 821 | if (vmi_ops.set_pte) { | 828 | if (vmi_ops.set_pte) { |
| 822 | paravirt_ops.set_pte = vmi_set_pte; | 829 | pv_mmu_ops.set_pte = vmi_set_pte; |
| 823 | paravirt_ops.set_pte_at = vmi_set_pte_at; | 830 | pv_mmu_ops.set_pte_at = vmi_set_pte_at; |
| 824 | paravirt_ops.set_pmd = vmi_set_pmd; | 831 | pv_mmu_ops.set_pmd = vmi_set_pmd; |
| 825 | #ifdef CONFIG_X86_PAE | 832 | #ifdef CONFIG_X86_PAE |
| 826 | paravirt_ops.set_pte_atomic = vmi_set_pte_atomic; | 833 | pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic; |
| 827 | paravirt_ops.set_pte_present = vmi_set_pte_present; | 834 | pv_mmu_ops.set_pte_present = vmi_set_pte_present; |
| 828 | paravirt_ops.set_pud = vmi_set_pud; | 835 | pv_mmu_ops.set_pud = vmi_set_pud; |
| 829 | paravirt_ops.pte_clear = vmi_pte_clear; | 836 | pv_mmu_ops.pte_clear = vmi_pte_clear; |
| 830 | paravirt_ops.pmd_clear = vmi_pmd_clear; | 837 | pv_mmu_ops.pmd_clear = vmi_pmd_clear; |
| 831 | #endif | 838 | #endif |
| 832 | } | 839 | } |
| 833 | 840 | ||
| 834 | if (vmi_ops.update_pte) { | 841 | if (vmi_ops.update_pte) { |
| 835 | paravirt_ops.pte_update = vmi_update_pte; | 842 | pv_mmu_ops.pte_update = vmi_update_pte; |
| 836 | paravirt_ops.pte_update_defer = vmi_update_pte_defer; | 843 | pv_mmu_ops.pte_update_defer = vmi_update_pte_defer; |
| 837 | } | 844 | } |
| 838 | 845 | ||
| 839 | vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); | 846 | vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); |
| 840 | if (vmi_ops.allocate_page) { | 847 | if (vmi_ops.allocate_page) { |
| 841 | paravirt_ops.alloc_pt = vmi_allocate_pt; | 848 | pv_mmu_ops.alloc_pt = vmi_allocate_pt; |
| 842 | paravirt_ops.alloc_pd = vmi_allocate_pd; | 849 | pv_mmu_ops.alloc_pd = vmi_allocate_pd; |
| 843 | paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone; | 850 | pv_mmu_ops.alloc_pd_clone = vmi_allocate_pd_clone; |
| 844 | } | 851 | } |
| 845 | 852 | ||
| 846 | vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); | 853 | vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); |
| 847 | if (vmi_ops.release_page) { | 854 | if (vmi_ops.release_page) { |
| 848 | paravirt_ops.release_pt = vmi_release_pt; | 855 | pv_mmu_ops.release_pt = vmi_release_pt; |
| 849 | paravirt_ops.release_pd = vmi_release_pd; | 856 | pv_mmu_ops.release_pd = vmi_release_pd; |
| 850 | } | 857 | } |
| 851 | 858 | ||
| 852 | /* Set linear is needed in all cases */ | 859 | /* Set linear is needed in all cases */ |
| 853 | vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); | 860 | vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); |
| 854 | #ifdef CONFIG_HIGHPTE | 861 | #ifdef CONFIG_HIGHPTE |
| 855 | if (vmi_ops.set_linear_mapping) | 862 | if (vmi_ops.set_linear_mapping) |
| 856 | paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; | 863 | pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; |
| 857 | #endif | 864 | #endif |
| 858 | 865 | ||
| 859 | /* | 866 | /* |
| @@ -863,17 +870,17 @@ static inline int __init activate_vmi(void) | |||
| 863 | * the backend. They are performance critical anyway, so requiring | 870 | * the backend. They are performance critical anyway, so requiring |
| 864 | * a patch is not a big problem. | 871 | * a patch is not a big problem. |
| 865 | */ | 872 | */ |
| 866 | paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0; | 873 | pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0; |
| 867 | paravirt_ops.iret = (void *)0xbadbab0; | 874 | pv_cpu_ops.iret = (void *)0xbadbab0; |
| 868 | 875 | ||
| 869 | #ifdef CONFIG_SMP | 876 | #ifdef CONFIG_SMP |
| 870 | para_wrap(startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState); | 877 | para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState); |
| 871 | #endif | 878 | #endif |
| 872 | 879 | ||
| 873 | #ifdef CONFIG_X86_LOCAL_APIC | 880 | #ifdef CONFIG_X86_LOCAL_APIC |
| 874 | para_fill(apic_read, APICRead); | 881 | para_fill(pv_apic_ops.apic_read, APICRead); |
| 875 | para_fill(apic_write, APICWrite); | 882 | para_fill(pv_apic_ops.apic_write, APICWrite); |
| 876 | para_fill(apic_write_atomic, APICWrite); | 883 | para_fill(pv_apic_ops.apic_write_atomic, APICWrite); |
| 877 | #endif | 884 | #endif |
| 878 | 885 | ||
| 879 | /* | 886 | /* |
| @@ -891,15 +898,15 @@ static inline int __init activate_vmi(void) | |||
| 891 | vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); | 898 | vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); |
| 892 | vmi_timer_ops.cancel_alarm = | 899 | vmi_timer_ops.cancel_alarm = |
| 893 | vmi_get_function(VMI_CALL_CancelAlarm); | 900 | vmi_get_function(VMI_CALL_CancelAlarm); |
| 894 | paravirt_ops.time_init = vmi_time_init; | 901 | pv_time_ops.time_init = vmi_time_init; |
| 895 | paravirt_ops.get_wallclock = vmi_get_wallclock; | 902 | pv_time_ops.get_wallclock = vmi_get_wallclock; |
| 896 | paravirt_ops.set_wallclock = vmi_set_wallclock; | 903 | pv_time_ops.set_wallclock = vmi_set_wallclock; |
| 897 | #ifdef CONFIG_X86_LOCAL_APIC | 904 | #ifdef CONFIG_X86_LOCAL_APIC |
| 898 | paravirt_ops.setup_boot_clock = vmi_time_bsp_init; | 905 | pv_apic_ops.setup_boot_clock = vmi_time_bsp_init; |
| 899 | paravirt_ops.setup_secondary_clock = vmi_time_ap_init; | 906 | pv_apic_ops.setup_secondary_clock = vmi_time_ap_init; |
| 900 | #endif | 907 | #endif |
| 901 | paravirt_ops.sched_clock = vmi_sched_clock; | 908 | pv_time_ops.sched_clock = vmi_sched_clock; |
| 902 | paravirt_ops.get_cpu_khz = vmi_cpu_khz; | 909 | pv_time_ops.get_cpu_khz = vmi_cpu_khz; |
| 903 | 910 | ||
| 904 | /* We have true wallclock functions; disable CMOS clock sync */ | 911 | /* We have true wallclock functions; disable CMOS clock sync */ |
| 905 | no_sync_cmos_clock = 1; | 912 | no_sync_cmos_clock = 1; |
| @@ -908,7 +915,7 @@ static inline int __init activate_vmi(void) | |||
| 908 | disable_vmi_timer = 1; | 915 | disable_vmi_timer = 1; |
| 909 | } | 916 | } |
| 910 | 917 | ||
| 911 | para_fill(safe_halt, Halt); | 918 | para_fill(pv_irq_ops.safe_halt, Halt); |
| 912 | 919 | ||
| 913 | /* | 920 | /* |
| 914 | * Alternative instruction rewriting doesn't happen soon enough | 921 | * Alternative instruction rewriting doesn't happen soon enough |
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index dda4e83649..33d367a343 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c | |||
| @@ -741,24 +741,12 @@ struct kmem_cache *pmd_cache; | |||
| 741 | 741 | ||
| 742 | void __init pgtable_cache_init(void) | 742 | void __init pgtable_cache_init(void) |
| 743 | { | 743 | { |
| 744 | size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t); | 744 | if (PTRS_PER_PMD > 1) |
| 745 | |||
| 746 | if (PTRS_PER_PMD > 1) { | ||
| 747 | pmd_cache = kmem_cache_create("pmd", | 745 | pmd_cache = kmem_cache_create("pmd", |
| 748 | PTRS_PER_PMD*sizeof(pmd_t), | 746 | PTRS_PER_PMD*sizeof(pmd_t), |
| 749 | PTRS_PER_PMD*sizeof(pmd_t), | 747 | PTRS_PER_PMD*sizeof(pmd_t), |
| 750 | SLAB_PANIC, | 748 | SLAB_PANIC, |
| 751 | pmd_ctor); | 749 | pmd_ctor); |
| 752 | if (!SHARED_KERNEL_PMD) { | ||
| 753 | /* If we're in PAE mode and have a non-shared | ||
| 754 | kernel pmd, then the pgd size must be a | ||
| 755 | page size. This is because the pgd_list | ||
| 756 | links through the page structure, so there | ||
| 757 | can only be one pgd per page for this to | ||
| 758 | work. */ | ||
| 759 | pgd_size = PAGE_SIZE; | ||
| 760 | } | ||
| 761 | } | ||
| 762 | } | 750 | } |
| 763 | 751 | ||
| 764 | /* | 752 | /* |
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 493a083f68..94c39aaf69 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c | |||
| @@ -25,7 +25,6 @@ | |||
| 25 | #include <linux/mm.h> | 25 | #include <linux/mm.h> |
| 26 | #include <linux/page-flags.h> | 26 | #include <linux/page-flags.h> |
| 27 | #include <linux/highmem.h> | 27 | #include <linux/highmem.h> |
| 28 | #include <linux/smp.h> | ||
| 29 | 28 | ||
| 30 | #include <xen/interface/xen.h> | 29 | #include <xen/interface/xen.h> |
| 31 | #include <xen/interface/physdev.h> | 30 | #include <xen/interface/physdev.h> |
| @@ -52,11 +51,25 @@ | |||
| 52 | 51 | ||
| 53 | EXPORT_SYMBOL_GPL(hypercall_page); | 52 | EXPORT_SYMBOL_GPL(hypercall_page); |
| 54 | 53 | ||
| 55 | DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); | ||
| 56 | |||
| 57 | DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); | 54 | DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); |
| 58 | DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); | 55 | DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); |
| 59 | DEFINE_PER_CPU(unsigned long, xen_cr3); | 56 | |
| 57 | /* | ||
| 58 | * Note about cr3 (pagetable base) values: | ||
| 59 | * | ||
| 60 | * xen_cr3 contains the current logical cr3 value; it contains the | ||
| 61 | * last set cr3. This may not be the current effective cr3, because | ||
| 62 | * its update may be being lazily deferred. However, a vcpu looking | ||
| 63 | * at its own cr3 can use this value knowing that everything will | ||
| 64 | * be self-consistent. | ||
| 65 | * | ||
| 66 | * xen_current_cr3 contains the actual vcpu cr3; it is set once the | ||
| 67 | * hypercall to set the vcpu cr3 is complete (so it may be a little | ||
| 68 | * out of date, but it will never be set early). If one vcpu is | ||
| 69 | * looking at another vcpu's cr3 value, it should use this variable. | ||
| 70 | */ | ||
| 71 | DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ | ||
| 72 | DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ | ||
| 60 | 73 | ||
| 61 | struct start_info *xen_start_info; | 74 | struct start_info *xen_start_info; |
| 62 | EXPORT_SYMBOL_GPL(xen_start_info); | 75 | EXPORT_SYMBOL_GPL(xen_start_info); |
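The note above on xen_cr3 vs. xen_current_cr3 matters whenever one CPU inspects another's pagetable base, for instance to decide whether a remote flush or unpin must wait. A purely illustrative check (the helper name is invented here, not part of this patch):

    /* Illustrative only: a remote reader must use xen_current_cr3, which is
     * updated only after the NEW_BASEPTR hypercall has really completed. */
    static int vcpu_uses_pgd(unsigned int cpu, unsigned long pgd_phys)
    {
            return per_cpu(xen_current_cr3, cpu) == pgd_phys;
    }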
| @@ -100,7 +113,7 @@ static void __init xen_vcpu_setup(int cpu) | |||
| 100 | info.mfn = virt_to_mfn(vcpup); | 113 | info.mfn = virt_to_mfn(vcpup); |
| 101 | info.offset = offset_in_page(vcpup); | 114 | info.offset = offset_in_page(vcpup); |
| 102 | 115 | ||
| 103 | printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n", | 116 | printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", |
| 104 | cpu, vcpup, info.mfn, info.offset); | 117 | cpu, vcpup, info.mfn, info.offset); |
| 105 | 118 | ||
| 106 | /* Check to see if the hypervisor will put the vcpu_info | 119 | /* Check to see if the hypervisor will put the vcpu_info |
| @@ -124,7 +137,7 @@ static void __init xen_vcpu_setup(int cpu) | |||
| 124 | static void __init xen_banner(void) | 137 | static void __init xen_banner(void) |
| 125 | { | 138 | { |
| 126 | printk(KERN_INFO "Booting paravirtualized kernel on %s\n", | 139 | printk(KERN_INFO "Booting paravirtualized kernel on %s\n", |
| 127 | paravirt_ops.name); | 140 | pv_info.name); |
| 128 | printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); | 141 | printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); |
| 129 | } | 142 | } |
| 130 | 143 | ||
| @@ -249,29 +262,10 @@ static void xen_halt(void) | |||
| 249 | xen_safe_halt(); | 262 | xen_safe_halt(); |
| 250 | } | 263 | } |
| 251 | 264 | ||
| 252 | static void xen_set_lazy_mode(enum paravirt_lazy_mode mode) | 265 | static void xen_leave_lazy(void) |
| 253 | { | 266 | { |
| 254 | BUG_ON(preemptible()); | 267 | paravirt_leave_lazy(paravirt_get_lazy_mode()); |
| 255 | |||
| 256 | switch (mode) { | ||
| 257 | case PARAVIRT_LAZY_NONE: | ||
| 258 | BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE); | ||
| 259 | break; | ||
| 260 | |||
| 261 | case PARAVIRT_LAZY_MMU: | ||
| 262 | case PARAVIRT_LAZY_CPU: | ||
| 263 | BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE); | ||
| 264 | break; | ||
| 265 | |||
| 266 | case PARAVIRT_LAZY_FLUSH: | ||
| 267 | /* flush if necessary, but don't change state */ | ||
| 268 | if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE) | ||
| 269 | xen_mc_flush(); | ||
| 270 | return; | ||
| 271 | } | ||
| 272 | |||
| 273 | xen_mc_flush(); | 268 | xen_mc_flush(); |
| 274 | x86_write_percpu(xen_lazy_mode, mode); | ||
| 275 | } | 269 | } |
| 276 | 270 | ||
| 277 | static unsigned long xen_store_tr(void) | 271 | static unsigned long xen_store_tr(void) |
| @@ -358,7 +352,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) | |||
| 358 | * loaded properly. This will go away as soon as Xen has been | 352 | * loaded properly. This will go away as soon as Xen has been |
| 359 | * modified to not save/restore %gs for normal hypercalls. | 353 | * modified to not save/restore %gs for normal hypercalls. |
| 360 | */ | 354 | */ |
| 361 | if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU) | 355 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) |
| 362 | loadsegment(gs, 0); | 356 | loadsegment(gs, 0); |
| 363 | } | 357 | } |
| 364 | 358 | ||
| @@ -632,32 +626,36 @@ static unsigned long xen_read_cr3(void) | |||
| 632 | return x86_read_percpu(xen_cr3); | 626 | return x86_read_percpu(xen_cr3); |
| 633 | } | 627 | } |
| 634 | 628 | ||
| 629 | static void set_current_cr3(void *v) | ||
| 630 | { | ||
| 631 | x86_write_percpu(xen_current_cr3, (unsigned long)v); | ||
| 632 | } | ||
| 633 | |||
| 635 | static void xen_write_cr3(unsigned long cr3) | 634 | static void xen_write_cr3(unsigned long cr3) |
| 636 | { | 635 | { |
| 636 | struct mmuext_op *op; | ||
| 637 | struct multicall_space mcs; | ||
| 638 | unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); | ||
| 639 | |||
| 637 | BUG_ON(preemptible()); | 640 | BUG_ON(preemptible()); |
| 638 | 641 | ||
| 639 | if (cr3 == x86_read_percpu(xen_cr3)) { | 642 | mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ |
| 640 | /* just a simple tlb flush */ | ||
| 641 | xen_flush_tlb(); | ||
| 642 | return; | ||
| 643 | } | ||
| 644 | 643 | ||
| 644 | /* Update while interrupts are disabled, so it's atomic with | ||
| 645 | respect to ipis */ | ||
| 645 | x86_write_percpu(xen_cr3, cr3); | 646 | x86_write_percpu(xen_cr3, cr3); |
| 646 | 647 | ||
| 648 | op = mcs.args; | ||
| 649 | op->cmd = MMUEXT_NEW_BASEPTR; | ||
| 650 | op->arg1.mfn = mfn; | ||
| 647 | 651 | ||
| 648 | { | 652 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); |
| 649 | struct mmuext_op *op; | ||
| 650 | struct multicall_space mcs = xen_mc_entry(sizeof(*op)); | ||
| 651 | unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); | ||
| 652 | |||
| 653 | op = mcs.args; | ||
| 654 | op->cmd = MMUEXT_NEW_BASEPTR; | ||
| 655 | op->arg1.mfn = mfn; | ||
| 656 | 653 | ||
| 657 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | 654 | /* Update xen_current_cr3 once the batch has actually
| 655 | been submitted. */ | ||
| 656 | xen_mc_callback(set_current_cr3, (void *)cr3); | ||
| 658 | 657 | ||
| 659 | xen_mc_issue(PARAVIRT_LAZY_CPU); | 658 | xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ |
| 660 | } | ||
| 661 | } | 659 | } |
| 662 | 660 | ||
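The rewritten xen_write_cr3() above follows the standard multicall pattern: reserve space with xen_mc_entry(), fill in the MMUEXT_NEW_BASEPTR op, register a completion callback, then xen_mc_issue() either flushes immediately or leaves the work queued if we are in lazy CPU mode. The callback interface used here (declared in multicalls.h; signature inferred from the call site) is:

    /* Inferred from the call site above: run fn(data) once the batch that is
     * currently being built has actually been submitted to the hypervisor. */
    void xen_mc_callback(void (*fn)(void *), void *data);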
| 663 | /* Early in boot, while setting up the initial pagetable, assume | 661 | /* Early in boot, while setting up the initial pagetable, assume |
| @@ -668,6 +666,15 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) | |||
| 668 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); | 666 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); |
| 669 | } | 667 | } |
| 670 | 668 | ||
| 669 | static void pin_pagetable_pfn(unsigned level, unsigned long pfn) | ||
| 670 | { | ||
| 671 | struct mmuext_op op; | ||
| 672 | op.cmd = level; | ||
| 673 | op.arg1.mfn = pfn_to_mfn(pfn); | ||
| 674 | if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) | ||
| 675 | BUG(); | ||
| 676 | } | ||
| 677 | |||
| 671 | /* This needs to make sure the new pte page is pinned iff it's being | 678 | /* This needs to make sure the new pte page is pinned iff it's being |
| 672 | attached to a pinned pagetable. */ | 679 | attached to a pinned pagetable. */ |
| 673 | static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) | 680 | static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) |
| @@ -677,9 +684,10 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) | |||
| 677 | if (PagePinned(virt_to_page(mm->pgd))) { | 684 | if (PagePinned(virt_to_page(mm->pgd))) { |
| 678 | SetPagePinned(page); | 685 | SetPagePinned(page); |
| 679 | 686 | ||
| 680 | if (!PageHighMem(page)) | 687 | if (!PageHighMem(page)) { |
| 681 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); | 688 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); |
| 682 | else | 689 | pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); |
| 690 | } else | ||
| 683 | /* make sure there are no stray mappings of | 691 | /* make sure there are no stray mappings of |
| 684 | this page */ | 692 | this page */ |
| 685 | kmap_flush_unused(); | 693 | kmap_flush_unused(); |
| @@ -692,8 +700,10 @@ static void xen_release_pt(u32 pfn) | |||
| 692 | struct page *page = pfn_to_page(pfn); | 700 | struct page *page = pfn_to_page(pfn); |
| 693 | 701 | ||
| 694 | if (PagePinned(page)) { | 702 | if (PagePinned(page)) { |
| 695 | if (!PageHighMem(page)) | 703 | if (!PageHighMem(page)) { |
| 704 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); | ||
| 696 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); | 705 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); |
| 706 | } | ||
| 697 | } | 707 | } |
| 698 | } | 708 | } |
| 699 | 709 | ||
| @@ -738,7 +748,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base) | |||
| 738 | pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; | 748 | pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; |
| 739 | 749 | ||
| 740 | /* special set_pte for pagetable initialization */ | 750 | /* special set_pte for pagetable initialization */ |
| 741 | paravirt_ops.set_pte = xen_set_pte_init; | 751 | pv_mmu_ops.set_pte = xen_set_pte_init; |
| 742 | 752 | ||
| 743 | init_mm.pgd = base; | 753 | init_mm.pgd = base; |
| 744 | /* | 754 | /* |
| @@ -785,8 +795,8 @@ static __init void xen_pagetable_setup_done(pgd_t *base) | |||
| 785 | { | 795 | { |
| 786 | /* This will work as long as patching hasn't happened yet | 796 | /* This will work as long as patching hasn't happened yet |
| 787 | (which it hasn't) */ | 797 | (which it hasn't) */ |
| 788 | paravirt_ops.alloc_pt = xen_alloc_pt; | 798 | pv_mmu_ops.alloc_pt = xen_alloc_pt; |
| 789 | paravirt_ops.set_pte = xen_set_pte; | 799 | pv_mmu_ops.set_pte = xen_set_pte; |
| 790 | 800 | ||
| 791 | if (!xen_feature(XENFEAT_auto_translated_physmap)) { | 801 | if (!xen_feature(XENFEAT_auto_translated_physmap)) { |
| 792 | /* | 802 | /* |
| @@ -808,15 +818,15 @@ static __init void xen_pagetable_setup_done(pgd_t *base) | |||
| 808 | /* Actually pin the pagetable down, but we can't set PG_pinned | 818 | /* Actually pin the pagetable down, but we can't set PG_pinned |
| 809 | yet because the page structures don't exist yet. */ | 819 | yet because the page structures don't exist yet. */ |
| 810 | { | 820 | { |
| 811 | struct mmuext_op op; | 821 | unsigned level; |
| 822 | |||
| 812 | #ifdef CONFIG_X86_PAE | 823 | #ifdef CONFIG_X86_PAE |
| 813 | op.cmd = MMUEXT_PIN_L3_TABLE; | 824 | level = MMUEXT_PIN_L3_TABLE; |
| 814 | #else | 825 | #else |
| 815 | op.cmd = MMUEXT_PIN_L3_TABLE; | 826 | level = MMUEXT_PIN_L2_TABLE; |
| 816 | #endif | 827 | #endif |
| 817 | op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base))); | 828 | |
| 818 | if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) | 829 | pin_pagetable_pfn(level, PFN_DOWN(__pa(base))); |
| 819 | BUG(); | ||
| 820 | } | 830 | } |
| 821 | } | 831 | } |
| 822 | 832 | ||
| @@ -833,12 +843,12 @@ void __init xen_setup_vcpu_info_placement(void) | |||
| 833 | if (have_vcpu_info_placement) { | 843 | if (have_vcpu_info_placement) { |
| 834 | printk(KERN_INFO "Xen: using vcpu_info placement\n"); | 844 | printk(KERN_INFO "Xen: using vcpu_info placement\n"); |
| 835 | 845 | ||
| 836 | paravirt_ops.save_fl = xen_save_fl_direct; | 846 | pv_irq_ops.save_fl = xen_save_fl_direct; |
| 837 | paravirt_ops.restore_fl = xen_restore_fl_direct; | 847 | pv_irq_ops.restore_fl = xen_restore_fl_direct; |
| 838 | paravirt_ops.irq_disable = xen_irq_disable_direct; | 848 | pv_irq_ops.irq_disable = xen_irq_disable_direct; |
| 839 | paravirt_ops.irq_enable = xen_irq_enable_direct; | 849 | pv_irq_ops.irq_enable = xen_irq_enable_direct; |
| 840 | paravirt_ops.read_cr2 = xen_read_cr2_direct; | 850 | pv_mmu_ops.read_cr2 = xen_read_cr2_direct; |
| 841 | paravirt_ops.iret = xen_iret_direct; | 851 | pv_cpu_ops.iret = xen_iret_direct; |
| 842 | } | 852 | } |
| 843 | } | 853 | } |
| 844 | 854 | ||
| @@ -850,8 +860,8 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, | |||
| 850 | 860 | ||
| 851 | start = end = reloc = NULL; | 861 | start = end = reloc = NULL; |
| 852 | 862 | ||
| 853 | #define SITE(x) \ | 863 | #define SITE(op, x) \ |
| 854 | case PARAVIRT_PATCH(x): \ | 864 | case PARAVIRT_PATCH(op.x): \ |
| 855 | if (have_vcpu_info_placement) { \ | 865 | if (have_vcpu_info_placement) { \ |
| 856 | start = (char *)xen_##x##_direct; \ | 866 | start = (char *)xen_##x##_direct; \ |
| 857 | end = xen_##x##_direct_end; \ | 867 | end = xen_##x##_direct_end; \ |
| @@ -860,10 +870,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, | |||
| 860 | goto patch_site | 870 | goto patch_site |
| 861 | 871 | ||
| 862 | switch (type) { | 872 | switch (type) { |
| 863 | SITE(irq_enable); | 873 | SITE(pv_irq_ops, irq_enable); |
| 864 | SITE(irq_disable); | 874 | SITE(pv_irq_ops, irq_disable); |
| 865 | SITE(save_fl); | 875 | SITE(pv_irq_ops, save_fl); |
| 866 | SITE(restore_fl); | 876 | SITE(pv_irq_ops, restore_fl); |
| 867 | #undef SITE | 877 | #undef SITE |
| 868 | 878 | ||
| 869 | patch_site: | 879 | patch_site: |
| @@ -895,26 +905,32 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, | |||
| 895 | return ret; | 905 | return ret; |
| 896 | } | 906 | } |
| 897 | 907 | ||
| 898 | static const struct paravirt_ops xen_paravirt_ops __initdata = { | 908 | static const struct pv_info xen_info __initdata = { |
| 899 | .paravirt_enabled = 1, | 909 | .paravirt_enabled = 1, |
| 900 | .shared_kernel_pmd = 0, | 910 | .shared_kernel_pmd = 0, |
| 901 | 911 | ||
| 902 | .name = "Xen", | 912 | .name = "Xen", |
| 903 | .banner = xen_banner, | 913 | }; |
| 904 | 914 | ||
| 915 | static const struct pv_init_ops xen_init_ops __initdata = { | ||
| 905 | .patch = xen_patch, | 916 | .patch = xen_patch, |
| 906 | 917 | ||
| 918 | .banner = xen_banner, | ||
| 907 | .memory_setup = xen_memory_setup, | 919 | .memory_setup = xen_memory_setup, |
| 908 | .arch_setup = xen_arch_setup, | 920 | .arch_setup = xen_arch_setup, |
| 909 | .init_IRQ = xen_init_IRQ, | ||
| 910 | .post_allocator_init = xen_mark_init_mm_pinned, | 921 | .post_allocator_init = xen_mark_init_mm_pinned, |
| 922 | }; | ||
| 911 | 923 | ||
| 924 | static const struct pv_time_ops xen_time_ops __initdata = { | ||
| 912 | .time_init = xen_time_init, | 925 | .time_init = xen_time_init, |
| 926 | |||
| 913 | .set_wallclock = xen_set_wallclock, | 927 | .set_wallclock = xen_set_wallclock, |
| 914 | .get_wallclock = xen_get_wallclock, | 928 | .get_wallclock = xen_get_wallclock, |
| 915 | .get_cpu_khz = xen_cpu_khz, | 929 | .get_cpu_khz = xen_cpu_khz, |
| 916 | .sched_clock = xen_sched_clock, | 930 | .sched_clock = xen_sched_clock, |
| 931 | }; | ||
| 917 | 932 | ||
| 933 | static const struct pv_cpu_ops xen_cpu_ops __initdata = { | ||
| 918 | .cpuid = xen_cpuid, | 934 | .cpuid = xen_cpuid, |
| 919 | 935 | ||
| 920 | .set_debugreg = xen_set_debugreg, | 936 | .set_debugreg = xen_set_debugreg, |
| @@ -925,22 +941,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { | |||
| 925 | .read_cr0 = native_read_cr0, | 941 | .read_cr0 = native_read_cr0, |
| 926 | .write_cr0 = native_write_cr0, | 942 | .write_cr0 = native_write_cr0, |
| 927 | 943 | ||
| 928 | .read_cr2 = xen_read_cr2, | ||
| 929 | .write_cr2 = xen_write_cr2, | ||
| 930 | |||
| 931 | .read_cr3 = xen_read_cr3, | ||
| 932 | .write_cr3 = xen_write_cr3, | ||
| 933 | |||
| 934 | .read_cr4 = native_read_cr4, | 944 | .read_cr4 = native_read_cr4, |
| 935 | .read_cr4_safe = native_read_cr4_safe, | 945 | .read_cr4_safe = native_read_cr4_safe, |
| 936 | .write_cr4 = xen_write_cr4, | 946 | .write_cr4 = xen_write_cr4, |
| 937 | 947 | ||
| 938 | .save_fl = xen_save_fl, | ||
| 939 | .restore_fl = xen_restore_fl, | ||
| 940 | .irq_disable = xen_irq_disable, | ||
| 941 | .irq_enable = xen_irq_enable, | ||
| 942 | .safe_halt = xen_safe_halt, | ||
| 943 | .halt = xen_halt, | ||
| 944 | .wbinvd = native_wbinvd, | 948 | .wbinvd = native_wbinvd, |
| 945 | 949 | ||
| 946 | .read_msr = native_read_msr_safe, | 950 | .read_msr = native_read_msr_safe, |
| @@ -969,6 +973,23 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { | |||
| 969 | .set_iopl_mask = xen_set_iopl_mask, | 973 | .set_iopl_mask = xen_set_iopl_mask, |
| 970 | .io_delay = xen_io_delay, | 974 | .io_delay = xen_io_delay, |
| 971 | 975 | ||
| 976 | .lazy_mode = { | ||
| 977 | .enter = paravirt_enter_lazy_cpu, | ||
| 978 | .leave = xen_leave_lazy, | ||
| 979 | }, | ||
| 980 | }; | ||
| 981 | |||
| 982 | static const struct pv_irq_ops xen_irq_ops __initdata = { | ||
| 983 | .init_IRQ = xen_init_IRQ, | ||
| 984 | .save_fl = xen_save_fl, | ||
| 985 | .restore_fl = xen_restore_fl, | ||
| 986 | .irq_disable = xen_irq_disable, | ||
| 987 | .irq_enable = xen_irq_enable, | ||
| 988 | .safe_halt = xen_safe_halt, | ||
| 989 | .halt = xen_halt, | ||
| 990 | }; | ||
| 991 | |||
| 992 | static const struct pv_apic_ops xen_apic_ops __initdata = { | ||
| 972 | #ifdef CONFIG_X86_LOCAL_APIC | 993 | #ifdef CONFIG_X86_LOCAL_APIC |
| 973 | .apic_write = xen_apic_write, | 994 | .apic_write = xen_apic_write, |
| 974 | .apic_write_atomic = xen_apic_write, | 995 | .apic_write_atomic = xen_apic_write, |
| @@ -977,6 +998,17 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { | |||
| 977 | .setup_secondary_clock = paravirt_nop, | 998 | .setup_secondary_clock = paravirt_nop, |
| 978 | .startup_ipi_hook = paravirt_nop, | 999 | .startup_ipi_hook = paravirt_nop, |
| 979 | #endif | 1000 | #endif |
| 1001 | }; | ||
| 1002 | |||
| 1003 | static const struct pv_mmu_ops xen_mmu_ops __initdata = { | ||
| 1004 | .pagetable_setup_start = xen_pagetable_setup_start, | ||
| 1005 | .pagetable_setup_done = xen_pagetable_setup_done, | ||
| 1006 | |||
| 1007 | .read_cr2 = xen_read_cr2, | ||
| 1008 | .write_cr2 = xen_write_cr2, | ||
| 1009 | |||
| 1010 | .read_cr3 = xen_read_cr3, | ||
| 1011 | .write_cr3 = xen_write_cr3, | ||
| 980 | 1012 | ||
| 981 | .flush_tlb_user = xen_flush_tlb, | 1013 | .flush_tlb_user = xen_flush_tlb, |
| 982 | .flush_tlb_kernel = xen_flush_tlb, | 1014 | .flush_tlb_kernel = xen_flush_tlb, |
| @@ -986,9 +1018,6 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { | |||
| 986 | .pte_update = paravirt_nop, | 1018 | .pte_update = paravirt_nop, |
| 987 | .pte_update_defer = paravirt_nop, | 1019 | .pte_update_defer = paravirt_nop, |
| 988 | 1020 | ||
| 989 | .pagetable_setup_start = xen_pagetable_setup_start, | ||
| 990 | .pagetable_setup_done = xen_pagetable_setup_done, | ||
| 991 | |||
| 992 | .alloc_pt = xen_alloc_pt_init, | 1021 | .alloc_pt = xen_alloc_pt_init, |
| 993 | .release_pt = xen_release_pt, | 1022 | .release_pt = xen_release_pt, |
| 994 | .alloc_pd = paravirt_nop, | 1023 | .alloc_pd = paravirt_nop, |
| @@ -1024,7 +1053,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { | |||
| 1024 | .dup_mmap = xen_dup_mmap, | 1053 | .dup_mmap = xen_dup_mmap, |
| 1025 | .exit_mmap = xen_exit_mmap, | 1054 | .exit_mmap = xen_exit_mmap, |
| 1026 | 1055 | ||
| 1027 | .set_lazy_mode = xen_set_lazy_mode, | 1056 | .lazy_mode = { |
| 1057 | .enter = paravirt_enter_lazy_mmu, | ||
| 1058 | .leave = xen_leave_lazy, | ||
| 1059 | }, | ||
| 1028 | }; | 1060 | }; |
| 1029 | 1061 | ||
| 1030 | #ifdef CONFIG_SMP | 1062 | #ifdef CONFIG_SMP |
| @@ -1080,6 +1112,17 @@ static const struct machine_ops __initdata xen_machine_ops = { | |||
| 1080 | }; | 1112 | }; |
| 1081 | 1113 | ||
| 1082 | 1114 | ||
| 1115 | static void __init xen_reserve_top(void) | ||
| 1116 | { | ||
| 1117 | unsigned long top = HYPERVISOR_VIRT_START; | ||
| 1118 | struct xen_platform_parameters pp; | ||
| 1119 | |||
| 1120 | if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) | ||
| 1121 | top = pp.virt_start; | ||
| 1122 | |||
| 1123 | reserve_top_address(-top + 2 * PAGE_SIZE); | ||
| 1124 | } | ||
| 1125 | |||
| 1083 | /* First C function to be called on Xen boot */ | 1126 | /* First C function to be called on Xen boot */ |
| 1084 | asmlinkage void __init xen_start_kernel(void) | 1127 | asmlinkage void __init xen_start_kernel(void) |
| 1085 | { | 1128 | { |
| @@ -1091,7 +1134,14 @@ asmlinkage void __init xen_start_kernel(void) | |||
| 1091 | BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0); | 1134 | BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0); |
| 1092 | 1135 | ||
| 1093 | /* Install Xen paravirt ops */ | 1136 | /* Install Xen paravirt ops */ |
| 1094 | paravirt_ops = xen_paravirt_ops; | 1137 | pv_info = xen_info; |
| 1138 | pv_init_ops = xen_init_ops; | ||
| 1139 | pv_time_ops = xen_time_ops; | ||
| 1140 | pv_cpu_ops = xen_cpu_ops; | ||
| 1141 | pv_irq_ops = xen_irq_ops; | ||
| 1142 | pv_apic_ops = xen_apic_ops; | ||
| 1143 | pv_mmu_ops = xen_mmu_ops; | ||
| 1144 | |||
| 1095 | machine_ops = xen_machine_ops; | 1145 | machine_ops = xen_machine_ops; |
| 1096 | 1146 | ||
| 1097 | #ifdef CONFIG_SMP | 1147 | #ifdef CONFIG_SMP |
| @@ -1113,6 +1163,7 @@ asmlinkage void __init xen_start_kernel(void) | |||
| 1113 | /* keep using Xen gdt for now; no urgent need to change it */ | 1163 | /* keep using Xen gdt for now; no urgent need to change it */ |
| 1114 | 1164 | ||
| 1115 | x86_write_percpu(xen_cr3, __pa(pgd)); | 1165 | x86_write_percpu(xen_cr3, __pa(pgd)); |
| 1166 | x86_write_percpu(xen_current_cr3, __pa(pgd)); | ||
| 1116 | 1167 | ||
| 1117 | #ifdef CONFIG_SMP | 1168 | #ifdef CONFIG_SMP |
| 1118 | /* Don't do the full vcpu_info placement stuff until we have a | 1169 | /* Don't do the full vcpu_info placement stuff until we have a |
| @@ -1124,12 +1175,12 @@ asmlinkage void __init xen_start_kernel(void) | |||
| 1124 | xen_setup_vcpu_info_placement(); | 1175 | xen_setup_vcpu_info_placement(); |
| 1125 | #endif | 1176 | #endif |
| 1126 | 1177 | ||
| 1127 | paravirt_ops.kernel_rpl = 1; | 1178 | pv_info.kernel_rpl = 1; |
| 1128 | if (xen_feature(XENFEAT_supervisor_mode_kernel)) | 1179 | if (xen_feature(XENFEAT_supervisor_mode_kernel)) |
| 1129 | paravirt_ops.kernel_rpl = 0; | 1180 | pv_info.kernel_rpl = 0; |
| 1130 | 1181 | ||
| 1131 | /* set the limit of our address space */ | 1182 | /* set the limit of our address space */ |
| 1132 | reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE); | 1183 | xen_reserve_top(); |
| 1133 | 1184 | ||
| 1134 | /* set up basic CPUID stuff */ | 1185 | /* set up basic CPUID stuff */ |
| 1135 | cpu_detect(&new_cpu_data); | 1186 | cpu_detect(&new_cpu_data); |
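The enlighten.c hunks above are the backend side of splitting the old monolithic paravirt_ops into per-area structures (pv_info, pv_init_ops, pv_time_ops, pv_cpu_ops, pv_irq_ops, pv_apic_ops, pv_mmu_ops): Xen fills in one small struct per area and copies each over the native defaults in xen_start_kernel(). A minimal sketch of the same pattern for a hypothetical backend follows; the demo_* names are illustrative only, and it assumes the native_* helpers that back the default ops in paravirt_32.c.

#include <asm/paravirt.h>

static void demo_halt(void)
{
	/* a backend-specific idle/halt would go here; fall back to native */
	native_halt();
}

static const struct pv_irq_ops demo_irq_ops __initdata = {
	.init_IRQ	= native_init_IRQ,
	.save_fl	= native_save_fl,
	.restore_fl	= native_restore_fl,
	.irq_disable	= native_irq_disable,
	.irq_enable	= native_irq_enable,
	.safe_halt	= native_safe_halt,
	.halt		= demo_halt,
};

static void __init demo_install(void)
{
	/* each group is replaced independently; groups left untouched
	   keep their native implementations */
	pv_irq_ops = demo_irq_ops;
	pv_info.name = "demo";
	pv_info.paravirt_enabled = 1;
}

This is only a sketch; the actual Xen installation is the series of pv_* assignments shown in xen_start_kernel() above.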
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 874db0cd1d..b2e32f9d00 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c | |||
| @@ -41,7 +41,6 @@ | |||
| 41 | #include <linux/sched.h> | 41 | #include <linux/sched.h> |
| 42 | #include <linux/highmem.h> | 42 | #include <linux/highmem.h> |
| 43 | #include <linux/bug.h> | 43 | #include <linux/bug.h> |
| 44 | #include <linux/sched.h> | ||
| 45 | 44 | ||
| 46 | #include <asm/pgtable.h> | 45 | #include <asm/pgtable.h> |
| 47 | #include <asm/tlbflush.h> | 46 | #include <asm/tlbflush.h> |
| @@ -155,7 +154,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, | |||
| 155 | pte_t *ptep, pte_t pteval) | 154 | pte_t *ptep, pte_t pteval) |
| 156 | { | 155 | { |
| 157 | if (mm == current->mm || mm == &init_mm) { | 156 | if (mm == current->mm || mm == &init_mm) { |
| 158 | if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) { | 157 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { |
| 159 | struct multicall_space mcs; | 158 | struct multicall_space mcs; |
| 160 | mcs = xen_mc_entry(0); | 159 | mcs = xen_mc_entry(0); |
| 161 | 160 | ||
| @@ -304,7 +303,12 @@ pgd_t xen_make_pgd(unsigned long pgd) | |||
| 304 | } | 303 | } |
| 305 | #endif /* CONFIG_X86_PAE */ | 304 | #endif /* CONFIG_X86_PAE */ |
| 306 | 305 | ||
| 307 | 306 | enum pt_level { | |
| 307 | PT_PGD, | ||
| 308 | PT_PUD, | ||
| 309 | PT_PMD, | ||
| 310 | PT_PTE | ||
| 311 | }; | ||
| 308 | 312 | ||
| 309 | /* | 313 | /* |
| 310 | (Yet another) pagetable walker. This one is intended for pinning a | 314 | (Yet another) pagetable walker. This one is intended for pinning a |
| @@ -316,7 +320,7 @@ pgd_t xen_make_pgd(unsigned long pgd) | |||
| 316 | FIXADDR_TOP. But the important bit is that we don't pin beyond | 320 | FIXADDR_TOP. But the important bit is that we don't pin beyond |
| 317 | there, because then we start getting into Xen's ptes. | 321 | there, because then we start getting into Xen's ptes. |
| 318 | */ | 322 | */ |
| 319 | static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), | 323 | static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), |
| 320 | unsigned long limit) | 324 | unsigned long limit) |
| 321 | { | 325 | { |
| 322 | pgd_t *pgd = pgd_base; | 326 | pgd_t *pgd = pgd_base; |
| @@ -341,7 +345,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), | |||
| 341 | pud = pud_offset(pgd, 0); | 345 | pud = pud_offset(pgd, 0); |
| 342 | 346 | ||
| 343 | if (PTRS_PER_PUD > 1) /* not folded */ | 347 | if (PTRS_PER_PUD > 1) /* not folded */ |
| 344 | flush |= (*func)(virt_to_page(pud), 0); | 348 | flush |= (*func)(virt_to_page(pud), PT_PUD); |
| 345 | 349 | ||
| 346 | for (; addr != pud_limit; pud++, addr = pud_next) { | 350 | for (; addr != pud_limit; pud++, addr = pud_next) { |
| 347 | pmd_t *pmd; | 351 | pmd_t *pmd; |
| @@ -360,7 +364,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), | |||
| 360 | pmd = pmd_offset(pud, 0); | 364 | pmd = pmd_offset(pud, 0); |
| 361 | 365 | ||
| 362 | if (PTRS_PER_PMD > 1) /* not folded */ | 366 | if (PTRS_PER_PMD > 1) /* not folded */ |
| 363 | flush |= (*func)(virt_to_page(pmd), 0); | 367 | flush |= (*func)(virt_to_page(pmd), PT_PMD); |
| 364 | 368 | ||
| 365 | for (; addr != pmd_limit; pmd++) { | 369 | for (; addr != pmd_limit; pmd++) { |
| 366 | addr += (PAGE_SIZE * PTRS_PER_PTE); | 370 | addr += (PAGE_SIZE * PTRS_PER_PTE); |
| @@ -372,17 +376,47 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), | |||
| 372 | if (pmd_none(*pmd)) | 376 | if (pmd_none(*pmd)) |
| 373 | continue; | 377 | continue; |
| 374 | 378 | ||
| 375 | flush |= (*func)(pmd_page(*pmd), 0); | 379 | flush |= (*func)(pmd_page(*pmd), PT_PTE); |
| 376 | } | 380 | } |
| 377 | } | 381 | } |
| 378 | } | 382 | } |
| 379 | 383 | ||
| 380 | flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH); | 384 | flush |= (*func)(virt_to_page(pgd_base), PT_PGD); |
| 381 | 385 | ||
| 382 | return flush; | 386 | return flush; |
| 383 | } | 387 | } |
| 384 | 388 | ||
| 385 | static int pin_page(struct page *page, unsigned flags) | 389 | static spinlock_t *lock_pte(struct page *page) |
| 390 | { | ||
| 391 | spinlock_t *ptl = NULL; | ||
| 392 | |||
| 393 | #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS | ||
| 394 | ptl = __pte_lockptr(page); | ||
| 395 | spin_lock(ptl); | ||
| 396 | #endif | ||
| 397 | |||
| 398 | return ptl; | ||
| 399 | } | ||
| 400 | |||
| 401 | static void do_unlock(void *v) | ||
| 402 | { | ||
| 403 | spinlock_t *ptl = v; | ||
| 404 | spin_unlock(ptl); | ||
| 405 | } | ||
| 406 | |||
| 407 | static void xen_do_pin(unsigned level, unsigned long pfn) | ||
| 408 | { | ||
| 409 | struct mmuext_op *op; | ||
| 410 | struct multicall_space mcs; | ||
| 411 | |||
| 412 | mcs = __xen_mc_entry(sizeof(*op)); | ||
| 413 | op = mcs.args; | ||
| 414 | op->cmd = level; | ||
| 415 | op->arg1.mfn = pfn_to_mfn(pfn); | ||
| 416 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | ||
| 417 | } | ||
| 418 | |||
| 419 | static int pin_page(struct page *page, enum pt_level level) | ||
| 386 | { | 420 | { |
| 387 | unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags); | 421 | unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags); |
| 388 | int flush; | 422 | int flush; |
| @@ -397,12 +431,26 @@ static int pin_page(struct page *page, unsigned flags) | |||
| 397 | void *pt = lowmem_page_address(page); | 431 | void *pt = lowmem_page_address(page); |
| 398 | unsigned long pfn = page_to_pfn(page); | 432 | unsigned long pfn = page_to_pfn(page); |
| 399 | struct multicall_space mcs = __xen_mc_entry(0); | 433 | struct multicall_space mcs = __xen_mc_entry(0); |
| 434 | spinlock_t *ptl; | ||
| 400 | 435 | ||
| 401 | flush = 0; | 436 | flush = 0; |
| 402 | 437 | ||
| 438 | ptl = NULL; | ||
| 439 | if (level == PT_PTE) | ||
| 440 | ptl = lock_pte(page); | ||
| 441 | |||
| 403 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, | 442 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, |
| 404 | pfn_pte(pfn, PAGE_KERNEL_RO), | 443 | pfn_pte(pfn, PAGE_KERNEL_RO), |
| 405 | flags); | 444 | level == PT_PGD ? UVMF_TLB_FLUSH : 0); |
| 445 | |||
| 446 | if (level == PT_PTE) | ||
| 447 | xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); | ||
| 448 | |||
| 449 | if (ptl) { | ||
| 450 | /* Queue a deferred unlock for when this batch | ||
| 451 | is completed. */ | ||
| 452 | xen_mc_callback(do_unlock, ptl); | ||
| 453 | } | ||
| 406 | } | 454 | } |
| 407 | 455 | ||
| 408 | return flush; | 456 | return flush; |
| @@ -413,8 +461,7 @@ static int pin_page(struct page *page, unsigned flags) | |||
| 413 | read-only, and can be pinned. */ | 461 | read-only, and can be pinned. */ |
| 414 | void xen_pgd_pin(pgd_t *pgd) | 462 | void xen_pgd_pin(pgd_t *pgd) |
| 415 | { | 463 | { |
| 416 | struct multicall_space mcs; | 464 | unsigned level; |
| 417 | struct mmuext_op *op; | ||
| 418 | 465 | ||
| 419 | xen_mc_batch(); | 466 | xen_mc_batch(); |
| 420 | 467 | ||
| @@ -425,16 +472,13 @@ void xen_pgd_pin(pgd_t *pgd) | |||
| 425 | xen_mc_batch(); | 472 | xen_mc_batch(); |
| 426 | } | 473 | } |
| 427 | 474 | ||
| 428 | mcs = __xen_mc_entry(sizeof(*op)); | ||
| 429 | op = mcs.args; | ||
| 430 | |||
| 431 | #ifdef CONFIG_X86_PAE | 475 | #ifdef CONFIG_X86_PAE |
| 432 | op->cmd = MMUEXT_PIN_L3_TABLE; | 476 | level = MMUEXT_PIN_L3_TABLE; |
| 433 | #else | 477 | #else |
| 434 | op->cmd = MMUEXT_PIN_L2_TABLE; | 478 | level = MMUEXT_PIN_L2_TABLE; |
| 435 | #endif | 479 | #endif |
| 436 | op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); | 480 | |
| 437 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | 481 | xen_do_pin(level, PFN_DOWN(__pa(pgd))); |
| 438 | 482 | ||
| 439 | xen_mc_issue(0); | 483 | xen_mc_issue(0); |
| 440 | } | 484 | } |
| @@ -442,7 +486,7 @@ void xen_pgd_pin(pgd_t *pgd) | |||
| 442 | /* The init_mm pagetable is really pinned as soon as it's created, but | 486 | /* The init_mm pagetable is really pinned as soon as it's created, but |
| 443 | that's before we have page structures to store the bits. So do all | 487 | that's before we have page structures to store the bits. So do all |
| 444 | the book-keeping now. */ | 488 | the book-keeping now. */ |
| 445 | static __init int mark_pinned(struct page *page, unsigned flags) | 489 | static __init int mark_pinned(struct page *page, enum pt_level level) |
| 446 | { | 490 | { |
| 447 | SetPagePinned(page); | 491 | SetPagePinned(page); |
| 448 | return 0; | 492 | return 0; |
| @@ -453,18 +497,32 @@ void __init xen_mark_init_mm_pinned(void) | |||
| 453 | pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); | 497 | pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); |
| 454 | } | 498 | } |
| 455 | 499 | ||
| 456 | static int unpin_page(struct page *page, unsigned flags) | 500 | static int unpin_page(struct page *page, enum pt_level level) |
| 457 | { | 501 | { |
| 458 | unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); | 502 | unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); |
| 459 | 503 | ||
| 460 | if (pgfl && !PageHighMem(page)) { | 504 | if (pgfl && !PageHighMem(page)) { |
| 461 | void *pt = lowmem_page_address(page); | 505 | void *pt = lowmem_page_address(page); |
| 462 | unsigned long pfn = page_to_pfn(page); | 506 | unsigned long pfn = page_to_pfn(page); |
| 463 | struct multicall_space mcs = __xen_mc_entry(0); | 507 | spinlock_t *ptl = NULL; |
| 508 | struct multicall_space mcs; | ||
| 509 | |||
| 510 | if (level == PT_PTE) { | ||
| 511 | ptl = lock_pte(page); | ||
| 512 | |||
| 513 | xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); | ||
| 514 | } | ||
| 515 | |||
| 516 | mcs = __xen_mc_entry(0); | ||
| 464 | 517 | ||
| 465 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, | 518 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, |
| 466 | pfn_pte(pfn, PAGE_KERNEL), | 519 | pfn_pte(pfn, PAGE_KERNEL), |
| 467 | flags); | 520 | level == PT_PGD ? UVMF_TLB_FLUSH : 0); |
| 521 | |||
| 522 | if (ptl) { | ||
| 523 | /* unlock when batch completed */ | ||
| 524 | xen_mc_callback(do_unlock, ptl); | ||
| 525 | } | ||
| 468 | } | 526 | } |
| 469 | 527 | ||
| 470 | return 0; /* never need to flush on unpin */ | 528 | return 0; /* never need to flush on unpin */ |
| @@ -473,18 +531,9 @@ static int unpin_page(struct page *page, unsigned flags) | |||
| 473 | /* Release a pagetable's pages back as normal RW */ | 531 | /* Release a pagetable's pages back as normal RW */ |
| 474 | static void xen_pgd_unpin(pgd_t *pgd) | 532 | static void xen_pgd_unpin(pgd_t *pgd) |
| 475 | { | 533 | { |
| 476 | struct mmuext_op *op; | ||
| 477 | struct multicall_space mcs; | ||
| 478 | |||
| 479 | xen_mc_batch(); | 534 | xen_mc_batch(); |
| 480 | 535 | ||
| 481 | mcs = __xen_mc_entry(sizeof(*op)); | 536 | xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); |
| 482 | |||
| 483 | op = mcs.args; | ||
| 484 | op->cmd = MMUEXT_UNPIN_TABLE; | ||
| 485 | op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); | ||
| 486 | |||
| 487 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | ||
| 488 | 537 | ||
| 489 | pgd_walk(pgd, unpin_page, TASK_SIZE); | 538 | pgd_walk(pgd, unpin_page, TASK_SIZE); |
| 490 | 539 | ||
| @@ -515,20 +564,43 @@ static void drop_other_mm_ref(void *info) | |||
| 515 | 564 | ||
| 516 | if (__get_cpu_var(cpu_tlbstate).active_mm == mm) | 565 | if (__get_cpu_var(cpu_tlbstate).active_mm == mm) |
| 517 | leave_mm(smp_processor_id()); | 566 | leave_mm(smp_processor_id()); |
| 567 | |||
| 568 | /* If this cpu still has a stale cr3 reference, then make sure | ||
| 569 | it has been flushed. */ | ||
| 570 | if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) { | ||
| 571 | load_cr3(swapper_pg_dir); | ||
| 572 | arch_flush_lazy_cpu_mode(); | ||
| 573 | } | ||
| 518 | } | 574 | } |
| 519 | 575 | ||
| 520 | static void drop_mm_ref(struct mm_struct *mm) | 576 | static void drop_mm_ref(struct mm_struct *mm) |
| 521 | { | 577 | { |
| 578 | cpumask_t mask; | ||
| 579 | unsigned cpu; | ||
| 580 | |||
| 522 | if (current->active_mm == mm) { | 581 | if (current->active_mm == mm) { |
| 523 | if (current->mm == mm) | 582 | if (current->mm == mm) |
| 524 | load_cr3(swapper_pg_dir); | 583 | load_cr3(swapper_pg_dir); |
| 525 | else | 584 | else |
| 526 | leave_mm(smp_processor_id()); | 585 | leave_mm(smp_processor_id()); |
| 586 | arch_flush_lazy_cpu_mode(); | ||
| 527 | } | 587 | } |
| 528 | 588 | ||
| 529 | if (!cpus_empty(mm->cpu_vm_mask)) | 589 | /* Get the "official" set of cpus referring to our pagetable. */ |
| 530 | xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref, | 590 | mask = mm->cpu_vm_mask; |
| 531 | mm, 1); | 591 | |
| 592 | /* It's possible that a vcpu may have a stale reference to our | ||
| 593 | cr3, because its in lazy mode, and it hasn't yet flushed | ||
| 594 | its set of pending hypercalls yet. In this case, we can | ||
| 595 | look at its actual current cr3 value, and force it to flush | ||
| 596 | if needed. */ | ||
| 597 | for_each_online_cpu(cpu) { | ||
| 598 | if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) | ||
| 599 | cpu_set(cpu, mask); | ||
| 600 | } | ||
| 601 | |||
| 602 | if (!cpus_empty(mask)) | ||
| 603 | xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); | ||
| 532 | } | 604 | } |
| 533 | #else | 605 | #else |
| 534 | static void drop_mm_ref(struct mm_struct *mm) | 606 | static void drop_mm_ref(struct mm_struct *mm) |
| @@ -563,5 +635,6 @@ void xen_exit_mmap(struct mm_struct *mm) | |||
| 563 | /* pgd may not be pinned in the error exit path of execve */ | 635 | /* pgd may not be pinned in the error exit path of execve */ |
| 564 | if (PagePinned(virt_to_page(mm->pgd))) | 636 | if (PagePinned(virt_to_page(mm->pgd))) |
| 565 | xen_pgd_unpin(mm->pgd); | 637 | xen_pgd_unpin(mm->pgd); |
| 638 | |||
| 566 | spin_unlock(&mm->page_table_lock); | 639 | spin_unlock(&mm->page_table_lock); |
| 567 | } | 640 | } |
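The pinning changes above interleave the split pte lock with the multicall batch: pin_page() takes the lock before queueing the read-only remap and the L1 pin, and only drops it from a flush callback, so no other CPU can modify a pte page while its pin is still pending. A condensed sketch of that ordering, reusing the helpers introduced in this patch (lock_pte, xen_do_pin, do_unlock, xen_mc_callback) and assuming a lowmem pte page; illustrative only, not part of the patch:

static void demo_pin_pte_page(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	void *pt = lowmem_page_address(page);	/* assumes !PageHighMem(page) */
	struct multicall_space mcs;
	spinlock_t *ptl;

	xen_mc_batch();

	ptl = lock_pte(page);			/* split pte lock, if configured */

	mcs = __xen_mc_entry(0);
	MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
				pfn_pte(pfn, PAGE_KERNEL_RO), 0);
	xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);	/* queued in the same batch */

	if (ptl)
		xen_mc_callback(do_unlock, ptl);	/* unlock after the flush */

	xen_mc_issue(0);
}

The unpin side mirrors this, except that the UNPIN operation is queued before the page is remapped read-write, as unpin_page() above does.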
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index c837e8e463..5e6f36f6d8 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c | |||
| @@ -26,13 +26,22 @@ | |||
| 26 | 26 | ||
| 27 | #include "multicalls.h" | 27 | #include "multicalls.h" |
| 28 | 28 | ||
| 29 | #define MC_DEBUG 1 | ||
| 30 | |||
| 29 | #define MC_BATCH 32 | 31 | #define MC_BATCH 32 |
| 30 | #define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) | 32 | #define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) |
| 31 | 33 | ||
| 32 | struct mc_buffer { | 34 | struct mc_buffer { |
| 33 | struct multicall_entry entries[MC_BATCH]; | 35 | struct multicall_entry entries[MC_BATCH]; |
| 36 | #if MC_DEBUG | ||
| 37 | struct multicall_entry debug[MC_BATCH]; | ||
| 38 | #endif | ||
| 34 | u64 args[MC_ARGS]; | 39 | u64 args[MC_ARGS]; |
| 35 | unsigned mcidx, argidx; | 40 | struct callback { |
| 41 | void (*fn)(void *); | ||
| 42 | void *data; | ||
| 43 | } callbacks[MC_BATCH]; | ||
| 44 | unsigned mcidx, argidx, cbidx; | ||
| 36 | }; | 45 | }; |
| 37 | 46 | ||
| 38 | static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); | 47 | static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); |
| @@ -43,6 +52,7 @@ void xen_mc_flush(void) | |||
| 43 | struct mc_buffer *b = &__get_cpu_var(mc_buffer); | 52 | struct mc_buffer *b = &__get_cpu_var(mc_buffer); |
| 44 | int ret = 0; | 53 | int ret = 0; |
| 45 | unsigned long flags; | 54 | unsigned long flags; |
| 55 | int i; | ||
| 46 | 56 | ||
| 47 | BUG_ON(preemptible()); | 57 | BUG_ON(preemptible()); |
| 48 | 58 | ||
| @@ -51,13 +61,31 @@ void xen_mc_flush(void) | |||
| 51 | local_irq_save(flags); | 61 | local_irq_save(flags); |
| 52 | 62 | ||
| 53 | if (b->mcidx) { | 63 | if (b->mcidx) { |
| 54 | int i; | 64 | #if MC_DEBUG |
| 65 | memcpy(b->debug, b->entries, | ||
| 66 | b->mcidx * sizeof(struct multicall_entry)); | ||
| 67 | #endif | ||
| 55 | 68 | ||
| 56 | if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0) | 69 | if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0) |
| 57 | BUG(); | 70 | BUG(); |
| 58 | for (i = 0; i < b->mcidx; i++) | 71 | for (i = 0; i < b->mcidx; i++) |
| 59 | if (b->entries[i].result < 0) | 72 | if (b->entries[i].result < 0) |
| 60 | ret++; | 73 | ret++; |
| 74 | |||
| 75 | #if MC_DEBUG | ||
| 76 | if (ret) { | ||
| 77 | printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", | ||
| 78 | ret, smp_processor_id()); | ||
| 79 | for(i = 0; i < b->mcidx; i++) { | ||
| 80 | printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", | ||
| 81 | i+1, b->mcidx, | ||
| 82 | b->debug[i].op, | ||
| 83 | b->debug[i].args[0], | ||
| 84 | b->entries[i].result); | ||
| 85 | } | ||
| 86 | } | ||
| 87 | #endif | ||
| 88 | |||
| 61 | b->mcidx = 0; | 89 | b->mcidx = 0; |
| 62 | b->argidx = 0; | 90 | b->argidx = 0; |
| 63 | } else | 91 | } else |
| @@ -65,6 +93,13 @@ void xen_mc_flush(void) | |||
| 65 | 93 | ||
| 66 | local_irq_restore(flags); | 94 | local_irq_restore(flags); |
| 67 | 95 | ||
| 96 | for(i = 0; i < b->cbidx; i++) { | ||
| 97 | struct callback *cb = &b->callbacks[i]; | ||
| 98 | |||
| 99 | (*cb->fn)(cb->data); | ||
| 100 | } | ||
| 101 | b->cbidx = 0; | ||
| 102 | |||
| 68 | BUG_ON(ret); | 103 | BUG_ON(ret); |
| 69 | } | 104 | } |
| 70 | 105 | ||
| @@ -88,3 +123,16 @@ struct multicall_space __xen_mc_entry(size_t args) | |||
| 88 | 123 | ||
| 89 | return ret; | 124 | return ret; |
| 90 | } | 125 | } |
| 126 | |||
| 127 | void xen_mc_callback(void (*fn)(void *), void *data) | ||
| 128 | { | ||
| 129 | struct mc_buffer *b = &__get_cpu_var(mc_buffer); | ||
| 130 | struct callback *cb; | ||
| 131 | |||
| 132 | if (b->cbidx == MC_BATCH) | ||
| 133 | xen_mc_flush(); | ||
| 134 | |||
| 135 | cb = &b->callbacks[b->cbidx++]; | ||
| 136 | cb->fn = fn; | ||
| 137 | cb->data = data; | ||
| 138 | } | ||
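xen_mc_callback() gives batch users a completion hook: the callbacks run once the current batch has actually been flushed to the hypervisor, and a full callback array forces an early flush. One hypothetical use is releasing a temporary argument buffer only after the queued hypercall that references it has been issued; the demo_* names and the kmalloc'd buffer are illustrative, not taken from this patch, and the usual slab and Xen hypercall headers are assumed.

static void demo_free_op(void *v)
{
	kfree(v);
}

static void demo_queue_pin(unsigned long pfn)
{
	struct mmuext_op *op = kmalloc(sizeof(*op), GFP_ATOMIC);
	struct multicall_space mcs;

	if (op == NULL)
		return;

	xen_mc_batch();

	mcs = __xen_mc_entry(0);
	op->cmd = MMUEXT_PIN_L1_TABLE;
	op->arg1.mfn = pfn_to_mfn(pfn);
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	/* op must stay valid until the batch is flushed */
	xen_mc_callback(demo_free_op, op);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}

The only user added by this patch is the deferred pte-lock unlock in mmu.c above.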
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h index e6f7530b15..8bae996d99 100644 --- a/arch/x86/xen/multicalls.h +++ b/arch/x86/xen/multicalls.h | |||
| @@ -35,11 +35,14 @@ void xen_mc_flush(void); | |||
| 35 | /* Issue a multicall if we're not in a lazy mode */ | 35 | /* Issue a multicall if we're not in a lazy mode */ |
| 36 | static inline void xen_mc_issue(unsigned mode) | 36 | static inline void xen_mc_issue(unsigned mode) |
| 37 | { | 37 | { |
| 38 | if ((xen_get_lazy_mode() & mode) == 0) | 38 | if ((paravirt_get_lazy_mode() & mode) == 0) |
| 39 | xen_mc_flush(); | 39 | xen_mc_flush(); |
| 40 | 40 | ||
| 41 | /* restore flags saved in xen_mc_batch */ | 41 | /* restore flags saved in xen_mc_batch */ |
| 42 | local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); | 42 | local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); |
| 43 | } | 43 | } |
| 44 | 44 | ||
| 45 | /* Set up a callback to be called when the current batch is flushed */ | ||
| 46 | void xen_mc_callback(void (*fn)(void *), void *data); | ||
| 47 | |||
| 45 | #endif /* _XEN_MULTICALLS_H */ | 48 | #endif /* _XEN_MULTICALLS_H */ |
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 4fa33c27cc..d53bf9d8a7 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c | |||
| @@ -370,7 +370,8 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), | |||
| 370 | void *info, int wait) | 370 | void *info, int wait) |
| 371 | { | 371 | { |
| 372 | struct call_data_struct data; | 372 | struct call_data_struct data; |
| 373 | int cpus; | 373 | int cpus, cpu; |
| 374 | bool yield; | ||
| 374 | 375 | ||
| 375 | /* Holding any lock stops cpus from going down. */ | 376 | /* Holding any lock stops cpus from going down. */ |
| 376 | spin_lock(&call_lock); | 377 | spin_lock(&call_lock); |
| @@ -399,9 +400,14 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), | |||
| 399 | /* Send a message to other CPUs and wait for them to respond */ | 400 | /* Send a message to other CPUs and wait for them to respond */ |
| 400 | xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); | 401 | xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); |
| 401 | 402 | ||
| 402 | /* Make sure other vcpus get a chance to run. | 403 | /* Make sure other vcpus get a chance to run if they need to. */ |
| 403 | XXX too severe? Maybe we should check the other CPU's states? */ | 404 | yield = false; |
| 404 | HYPERVISOR_sched_op(SCHEDOP_yield, 0); | 405 | for_each_cpu_mask(cpu, mask) |
| 406 | if (xen_vcpu_stolen(cpu)) | ||
| 407 | yield = true; | ||
| 408 | |||
| 409 | if (yield) | ||
| 410 | HYPERVISOR_sched_op(SCHEDOP_yield, 0); | ||
| 405 | 411 | ||
| 406 | /* Wait for response */ | 412 | /* Wait for response */ |
| 407 | while (atomic_read(&data.started) != cpus || | 413 | while (atomic_read(&data.started) != cpus || |
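The smp.c hunk replaces the unconditional SCHEDOP_yield after sending the call-function IPI with a targeted one: only yield if some vcpu in the destination mask is runnable but currently preempted, as reported by the new xen_vcpu_stolen() in time.c. Factored into a helper for clarity (illustrative sketch; the demo_* name is not in the patch):

static bool demo_any_vcpu_stolen(cpumask_t mask)
{
	int cpu;

	for_each_cpu_mask(cpu, mask)
		if (xen_vcpu_stolen(cpu))
			return true;

	return false;
}

	/* in xen_smp_call_function_mask(), after xen_send_IPI_mask(): */
	if (demo_any_vcpu_stolen(mask))
		HYPERVISOR_sched_op(SCHEDOP_yield, 0);

This avoids giving up the physical cpu when every target vcpu is already running and will see the IPI promptly.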
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index dfd6db69ea..d083ff5ef0 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c | |||
| @@ -105,6 +105,12 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res) | |||
| 105 | } while (get64(&state->state_entry_time) != state_time); | 105 | } while (get64(&state->state_entry_time) != state_time); |
| 106 | } | 106 | } |
| 107 | 107 | ||
| 108 | /* return true when a vcpu could run but has no real cpu to run on */ | ||
| 109 | bool xen_vcpu_stolen(int vcpu) | ||
| 110 | { | ||
| 111 | return per_cpu(runstate, vcpu).state == RUNSTATE_runnable; | ||
| 112 | } | ||
| 113 | |||
| 108 | static void setup_runstate_info(int cpu) | 114 | static void setup_runstate_info(int cpu) |
| 109 | { | 115 | { |
| 110 | struct vcpu_register_runstate_memory_area area; | 116 | struct vcpu_register_runstate_memory_area area; |
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index b9aaea45f0..b02a909bfd 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h | |||
| @@ -11,6 +11,7 @@ void xen_copy_trap_info(struct trap_info *traps); | |||
| 11 | 11 | ||
| 12 | DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); | 12 | DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); |
| 13 | DECLARE_PER_CPU(unsigned long, xen_cr3); | 13 | DECLARE_PER_CPU(unsigned long, xen_cr3); |
| 14 | DECLARE_PER_CPU(unsigned long, xen_current_cr3); | ||
| 14 | 15 | ||
| 15 | extern struct start_info *xen_start_info; | 16 | extern struct start_info *xen_start_info; |
| 16 | extern struct shared_info *HYPERVISOR_shared_info; | 17 | extern struct shared_info *HYPERVISOR_shared_info; |
| @@ -27,14 +28,9 @@ unsigned long xen_get_wallclock(void); | |||
| 27 | int xen_set_wallclock(unsigned long time); | 28 | int xen_set_wallclock(unsigned long time); |
| 28 | unsigned long long xen_sched_clock(void); | 29 | unsigned long long xen_sched_clock(void); |
| 29 | 30 | ||
| 30 | void xen_mark_init_mm_pinned(void); | 31 | bool xen_vcpu_stolen(int vcpu); |
| 31 | |||
| 32 | DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); | ||
| 33 | 32 | ||
| 34 | static inline unsigned xen_get_lazy_mode(void) | 33 | void xen_mark_init_mm_pinned(void); |
| 35 | { | ||
| 36 | return x86_read_percpu(xen_lazy_mode); | ||
| 37 | } | ||
| 38 | 34 | ||
| 39 | void __init xen_fill_possible_map(void); | 35 | void __init xen_fill_possible_map(void); |
| 40 | 36 | ||
