Diffstat (limited to 'arch/x86/xen/mmu.c')
-rw-r--r--  arch/x86/xen/mmu.c | 145
1 file changed, 109 insertions, 36 deletions
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 874db0cd1d2a..b2e32f9d0071 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -41,7 +41,6 @@
 #include <linux/sched.h>
 #include <linux/highmem.h>
 #include <linux/bug.h>
-#include <linux/sched.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -155,7 +154,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 		    pte_t *ptep, pte_t pteval)
 {
 	if (mm == current->mm || mm == &init_mm) {
-		if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
+		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
 			struct multicall_space mcs;
 			mcs = xen_mc_entry(0);
 
@@ -304,7 +303,12 @@ pgd_t xen_make_pgd(unsigned long pgd)
 }
 #endif	/* CONFIG_X86_PAE */
 
-
+enum pt_level {
+	PT_PGD,
+	PT_PUD,
+	PT_PMD,
+	PT_PTE
+};
 
 /*
   (Yet another) pagetable walker.  This one is intended for pinning a
@@ -316,7 +320,7 @@ pgd_t xen_make_pgd(unsigned long pgd)
   FIXADDR_TOP.  But the important bit is that we don't pin beyond
   there, because then we start getting into Xen's ptes.
 */
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
+static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
 		    unsigned long limit)
 {
 	pgd_t *pgd = pgd_base;
@@ -341,7 +345,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 		pud = pud_offset(pgd, 0);
 
 		if (PTRS_PER_PUD > 1) /* not folded */
-			flush |= (*func)(virt_to_page(pud), 0);
+			flush |= (*func)(virt_to_page(pud), PT_PUD);
 
 		for (; addr != pud_limit; pud++, addr = pud_next) {
 			pmd_t *pmd;
@@ -360,7 +364,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 			pmd = pmd_offset(pud, 0);
 
 			if (PTRS_PER_PMD > 1) /* not folded */
-				flush |= (*func)(virt_to_page(pmd), 0);
+				flush |= (*func)(virt_to_page(pmd), PT_PMD);
 
 			for (; addr != pmd_limit; pmd++) {
 				addr += (PAGE_SIZE * PTRS_PER_PTE);
@@ -372,17 +376,47 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 				if (pmd_none(*pmd))
 					continue;
 
-				flush |= (*func)(pmd_page(*pmd), 0);
+				flush |= (*func)(pmd_page(*pmd), PT_PTE);
 			}
 		}
 	}
 
-	flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
+	flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
 
 	return flush;
 }
 
-static int pin_page(struct page *page, unsigned flags)
+static spinlock_t *lock_pte(struct page *page)
+{
+	spinlock_t *ptl = NULL;
+
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+	ptl = __pte_lockptr(page);
+	spin_lock(ptl);
+#endif
+
+	return ptl;
+}
+
+static void do_unlock(void *v)
+{
+	spinlock_t *ptl = v;
+	spin_unlock(ptl);
+}
+
+static void xen_do_pin(unsigned level, unsigned long pfn)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+
+	mcs = __xen_mc_entry(sizeof(*op));
+	op = mcs.args;
+	op->cmd = level;
+	op->arg1.mfn = pfn_to_mfn(pfn);
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+}
+
+static int pin_page(struct page *page, enum pt_level level)
 {
 	unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
 	int flush;
@@ -397,12 +431,26 @@ static int pin_page(struct page *page, unsigned flags)
 		void *pt = lowmem_page_address(page);
 		unsigned long pfn = page_to_pfn(page);
 		struct multicall_space mcs = __xen_mc_entry(0);
+		spinlock_t *ptl;
 
 		flush = 0;
 
+		ptl = NULL;
+		if (level == PT_PTE)
+			ptl = lock_pte(page);
+
 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 					pfn_pte(pfn, PAGE_KERNEL_RO),
-					flags);
+					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+		if (level == PT_PTE)
+			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
+
+		if (ptl) {
+			/* Queue a deferred unlock for when this batch
+			   is completed. */
+			xen_mc_callback(do_unlock, ptl);
+		}
 	}
 
 	return flush;
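The pin path above queues its hypercalls into a multicall batch and only drops the pte lock once the whole batch has been flushed, which is what xen_mc_callback(do_unlock, ptl) arranges. Below is a minimal userspace sketch of that pattern: queue operations, register completion callbacks, and run the callbacks only after the flush. The struct batch type and every name in it are illustrative stand-ins, not the kernel's multicall API.

#include <stdio.h>

/* Illustrative stand-ins for the multicall machinery: a fixed-size
 * batch of pending operations plus a list of completion callbacks
 * that run only after the whole batch has been flushed. */
#define MAX_OPS 16
#define MAX_CBS 16

struct batch {
	void (*ops[MAX_OPS])(void);		/* queued "hypercalls" */
	int nops;
	void (*cbs[MAX_CBS])(void *);		/* deferred callbacks */
	void *cb_args[MAX_CBS];
	int ncbs;
};

static void batch_add_op(struct batch *b, void (*op)(void))
{
	b->ops[b->nops++] = op;
}

/* Same idea as xen_mc_callback(): remember work to do once the
 * batch is issued, e.g. dropping a lock taken earlier. */
static void batch_add_callback(struct batch *b, void (*cb)(void *), void *arg)
{
	b->cbs[b->ncbs] = cb;
	b->cb_args[b->ncbs] = arg;
	b->ncbs++;
}

static void batch_flush(struct batch *b)
{
	int i;

	for (i = 0; i < b->nops; i++)		/* issue all queued ops */
		b->ops[i]();
	for (i = 0; i < b->ncbs; i++)		/* then run completions */
		b->cbs[i](b->cb_args[i]);
	b->nops = b->ncbs = 0;
}

static void make_readonly(void) { printf("remap page read-only\n"); }
static void pin_l1(void)        { printf("pin L1 table\n"); }
static void unlock(void *name)  { printf("unlock %s\n", (char *)name); }

int main(void)
{
	struct batch b = { .nops = 0, .ncbs = 0 };

	/* queue the "pin" work for one pte page and defer its unlock */
	batch_add_op(&b, make_readonly);
	batch_add_op(&b, pin_l1);
	batch_add_callback(&b, unlock, "pte page lock");

	batch_flush(&b);	/* ops run first, unlock only after the batch */
	return 0;
}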
@@ -413,8 +461,7 @@ static int pin_page(struct page *page, unsigned flags)
    read-only, and can be pinned. */
 void xen_pgd_pin(pgd_t *pgd)
 {
-	struct multicall_space mcs;
-	struct mmuext_op *op;
+	unsigned level;
 
 	xen_mc_batch();
 
@@ -425,16 +472,13 @@ void xen_pgd_pin(pgd_t *pgd)
 		xen_mc_batch();
 	}
 
-	mcs = __xen_mc_entry(sizeof(*op));
-	op = mcs.args;
-
 #ifdef CONFIG_X86_PAE
-	op->cmd = MMUEXT_PIN_L3_TABLE;
+	level = MMUEXT_PIN_L3_TABLE;
 #else
-	op->cmd = MMUEXT_PIN_L2_TABLE;
+	level = MMUEXT_PIN_L2_TABLE;
 #endif
-	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_do_pin(level, PFN_DOWN(__pa(pgd)));
 
 	xen_mc_issue(0);
 }
@@ -442,7 +486,7 @@ void xen_pgd_pin(pgd_t *pgd)
 /* The init_mm pagetable is really pinned as soon as its created, but
    that's before we have page structures to store the bits.  So do all
    the book-keeping now. */
-static __init int mark_pinned(struct page *page, unsigned flags)
+static __init int mark_pinned(struct page *page, enum pt_level level)
 {
 	SetPagePinned(page);
 	return 0;
@@ -453,18 +497,32 @@ void __init xen_mark_init_mm_pinned(void)
 	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
 }
 
-static int unpin_page(struct page *page, unsigned flags)
+static int unpin_page(struct page *page, enum pt_level level)
 {
 	unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
 
 	if (pgfl && !PageHighMem(page)) {
 		void *pt = lowmem_page_address(page);
 		unsigned long pfn = page_to_pfn(page);
-		struct multicall_space mcs = __xen_mc_entry(0);
+		spinlock_t *ptl = NULL;
+		struct multicall_space mcs;
+
+		if (level == PT_PTE) {
+			ptl = lock_pte(page);
+
+			xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
+		}
+
+		mcs = __xen_mc_entry(0);
 
 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 					pfn_pte(pfn, PAGE_KERNEL),
-					flags);
+					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+		if (ptl) {
+			/* unlock when batch completed */
+			xen_mc_callback(do_unlock, ptl);
+		}
 	}
 
 	return 0;		/* never need to flush on unpin */
@@ -473,18 +531,9 @@ static int unpin_page(struct page *page, unsigned flags)
 /* Release a pagetables pages back as normal RW */
 static void xen_pgd_unpin(pgd_t *pgd)
 {
-	struct mmuext_op *op;
-	struct multicall_space mcs;
-
 	xen_mc_batch();
 
-	mcs = __xen_mc_entry(sizeof(*op));
-
-	op = mcs.args;
-	op->cmd = MMUEXT_UNPIN_TABLE;
-	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 
 	pgd_walk(pgd, unpin_page, TASK_SIZE);
 
@@ -515,20 +564,43 @@ static void drop_other_mm_ref(void *info)
 
 	if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
 		leave_mm(smp_processor_id());
+
+	/* If this cpu still has a stale cr3 reference, then make sure
+	   it has been flushed. */
+	if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
+		load_cr3(swapper_pg_dir);
+		arch_flush_lazy_cpu_mode();
+	}
 }
 
 static void drop_mm_ref(struct mm_struct *mm)
 {
+	cpumask_t mask;
+	unsigned cpu;
+
 	if (current->active_mm == mm) {
 		if (current->mm == mm)
 			load_cr3(swapper_pg_dir);
 		else
 			leave_mm(smp_processor_id());
+		arch_flush_lazy_cpu_mode();
 	}
 
-	if (!cpus_empty(mm->cpu_vm_mask))
-		xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
-					   mm, 1);
+	/* Get the "official" set of cpus referring to our pagetable. */
+	mask = mm->cpu_vm_mask;
+
+	/* It's possible that a vcpu may have a stale reference to our
+	   cr3, because its in lazy mode, and it hasn't yet flushed
+	   its set of pending hypercalls yet.  In this case, we can
+	   look at its actual current cr3 value, and force it to flush
+	   if needed. */
+	for_each_online_cpu(cpu) {
+		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
+			cpu_set(cpu, mask);
+	}
+
+	if (!cpus_empty(mask))
+		xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
 }
 #else
 static void drop_mm_ref(struct mm_struct *mm)
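The drop_mm_ref() change above widens the notification mask: besides mm->cpu_vm_mask it scans each cpu's cached xen_current_cr3, so vcpus that still hold a lazy, stale reference to the pagetable are also told to drop it. Below is a small self-contained sketch of that idea, scanning cached per-cpu state to extend the set of cpus to notify; the cached_cr3 array and all helper names are invented for illustration and are not kernel APIs.

#include <stdio.h>
#include <stdbool.h>

#define NCPUS 4

/* Invented stand-ins: each "cpu" caches the pagetable root it last
 * loaded, which may lag behind the official users bitmap while the
 * cpu sits in lazy mode. */
static unsigned long cached_cr3[NCPUS] = { 0x1000, 0x2000, 0x1000, 0x3000 };
static bool official_users[NCPUS]      = { true, false, false, false };

static void notify_drop(int cpu, unsigned long root)
{
	printf("cpu%d: drop references to root %#lx\n", cpu, root);
}

/* Mirror of the drop_mm_ref() logic: start from the official user set,
 * add any cpu whose cached root still points at the pagetable being
 * torn down, and notify the combined set. */
static void drop_root_refs(unsigned long root)
{
	bool mask[NCPUS];
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		mask[cpu] = official_users[cpu];
		if (cached_cr3[cpu] == root)	/* stale lazy reference */
			mask[cpu] = true;
	}

	for (cpu = 0; cpu < NCPUS; cpu++)
		if (mask[cpu])
			notify_drop(cpu, root);
}

int main(void)
{
	drop_root_refs(0x1000);	/* cpu0 (official) and cpu2 (stale) are notified */
	return 0;
}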
@@ -563,5 +635,6 @@ void xen_exit_mmap(struct mm_struct *mm)
 	/* pgd may not be pinned in the error exit path of execve */
 	if (PagePinned(virt_to_page(mm->pgd)))
 		xen_pgd_unpin(mm->pgd);
+
 	spin_unlock(&mm->page_table_lock);
 }