Diffstat (limited to 'arch/x86/xen/mmu.c')
 arch/x86/xen/mmu.c | 145 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 109 insertions(+), 36 deletions(-)
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 874db0cd1d2a..b2e32f9d0071 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -41,7 +41,6 @@
 #include <linux/sched.h>
 #include <linux/highmem.h>
 #include <linux/bug.h>
-#include <linux/sched.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -155,7 +154,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 		    pte_t *ptep, pte_t pteval)
 {
 	if (mm == current->mm || mm == &init_mm) {
-		if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
+		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
 			struct multicall_space mcs;
 			mcs = xen_mc_entry(0);
 
@@ -304,7 +303,12 @@ pgd_t xen_make_pgd(unsigned long pgd)
 }
 #endif	/* CONFIG_X86_PAE */
 
-
+enum pt_level {
+	PT_PGD,
+	PT_PUD,
+	PT_PMD,
+	PT_PTE
+};
 
 /*
   (Yet another) pagetable walker.  This one is intended for pinning a
@@ -316,7 +320,7 @@ pgd_t xen_make_pgd(unsigned long pgd)
   FIXADDR_TOP.  But the important bit is that we don't pin beyond
   there, because then we start getting into Xen's ptes.
 */
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
+static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
 		    unsigned long limit)
 {
 	pgd_t *pgd = pgd_base;
@@ -341,7 +345,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 		pud = pud_offset(pgd, 0);
 
 		if (PTRS_PER_PUD > 1) /* not folded */
-			flush |= (*func)(virt_to_page(pud), 0);
+			flush |= (*func)(virt_to_page(pud), PT_PUD);
 
 		for (; addr != pud_limit; pud++, addr = pud_next) {
 			pmd_t *pmd;
@@ -360,7 +364,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 			pmd = pmd_offset(pud, 0);
 
 			if (PTRS_PER_PMD > 1) /* not folded */
-				flush |= (*func)(virt_to_page(pmd), 0);
+				flush |= (*func)(virt_to_page(pmd), PT_PMD);
 
 			for (; addr != pmd_limit; pmd++) {
 				addr += (PAGE_SIZE * PTRS_PER_PTE);
@@ -372,17 +376,47 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 				if (pmd_none(*pmd))
 					continue;
 
-				flush |= (*func)(pmd_page(*pmd), 0);
+				flush |= (*func)(pmd_page(*pmd), PT_PTE);
 			}
 		}
 	}
 
-	flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
+	flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
 
 	return flush;
 }
 
-static int pin_page(struct page *page, unsigned flags)
+static spinlock_t *lock_pte(struct page *page)
+{
+	spinlock_t *ptl = NULL;
+
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+	ptl = __pte_lockptr(page);
+	spin_lock(ptl);
+#endif
+
+	return ptl;
+}
+
+static void do_unlock(void *v)
+{
+	spinlock_t *ptl = v;
+	spin_unlock(ptl);
+}
+
+static void xen_do_pin(unsigned level, unsigned long pfn)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+
+	mcs = __xen_mc_entry(sizeof(*op));
+	op = mcs.args;
+	op->cmd = level;
+	op->arg1.mfn = pfn_to_mfn(pfn);
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+}
+
+static int pin_page(struct page *page, enum pt_level level)
 {
 	unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
 	int flush;
@@ -397,12 +431,26 @@ static int pin_page(struct page *page, unsigned flags)
 		void *pt = lowmem_page_address(page);
 		unsigned long pfn = page_to_pfn(page);
 		struct multicall_space mcs = __xen_mc_entry(0);
+		spinlock_t *ptl;
 
 		flush = 0;
 
+		ptl = NULL;
+		if (level == PT_PTE)
+			ptl = lock_pte(page);
+
 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 					pfn_pte(pfn, PAGE_KERNEL_RO),
-					flags);
+					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+		if (level == PT_PTE)
+			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
+
+		if (ptl) {
+			/* Queue a deferred unlock for when this batch
+			   is completed. */
+			xen_mc_callback(do_unlock, ptl);
+		}
 	}
 
 	return flush;
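
The hunk above is the heart of the locking change: pin_page() now takes the pte page's split lock before queueing the remap and pin hypercalls, and releases it only once the whole multicall batch has been issued, via xen_mc_callback(). What follows is a minimal standalone sketch of that deferred-unlock pattern; the batch_* helpers and the pthread mutex are stand-ins for the real xen_mc_* multicall API and kernel spinlocks, not part of the patch.

#include <stdio.h>
#include <pthread.h>

typedef void (*callback_fn)(void *);

struct batch {
	callback_fn cb[8];		/* deferred callbacks */
	void *arg[8];
	int ncb;
};

/* like xen_mc_callback(): remember work to run after the batch */
static void batch_callback(struct batch *b, callback_fn fn, void *arg)
{
	b->cb[b->ncb] = fn;
	b->arg[b->ncb] = arg;
	b->ncb++;
}

/* like xen_mc_issue(): flush queued ops, then run deferred callbacks */
static void batch_flush(struct batch *b)
{
	/* ...the queued hypercalls would be issued here... */
	for (int i = 0; i < b->ncb; i++)
		b->cb[i](b->arg[i]);
	b->ncb = 0;
}

/* mirrors do_unlock() in the patch */
static void do_unlock(void *v)
{
	pthread_mutex_unlock(v);
	puts("pte lock released after batch completion");
}

int main(void)
{
	static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;
	struct batch b = { .ncb = 0 };

	pthread_mutex_lock(&ptl);		/* like lock_pte() */
	/* ...MULTI_update_va_mapping and the L1 pin would be queued here... */
	batch_callback(&b, do_unlock, &ptl);

	batch_flush(&b);			/* lock held until here */
	return 0;
}

The design point: the pte contents must not change between the read-only remap and the pin hypercall, so the lock has to outlive the function that queued the operations, and only the batch-completion callback may drop it.
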
@@ -413,8 +461,7 @@ static int pin_page(struct page *page, unsigned flags)
    read-only, and can be pinned. */
 void xen_pgd_pin(pgd_t *pgd)
 {
-	struct multicall_space mcs;
-	struct mmuext_op *op;
+	unsigned level;
 
 	xen_mc_batch();
 
@@ -425,16 +472,13 @@ void xen_pgd_pin(pgd_t *pgd)
 		xen_mc_batch();
 	}
 
-	mcs = __xen_mc_entry(sizeof(*op));
-	op = mcs.args;
-
 #ifdef CONFIG_X86_PAE
-	op->cmd = MMUEXT_PIN_L3_TABLE;
+	level = MMUEXT_PIN_L3_TABLE;
 #else
-	op->cmd = MMUEXT_PIN_L2_TABLE;
+	level = MMUEXT_PIN_L2_TABLE;
 #endif
-	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_do_pin(level, PFN_DOWN(__pa(pgd)));
 
 	xen_mc_issue(0);
 }
@@ -442,7 +486,7 @@ void xen_pgd_pin(pgd_t *pgd)
 /* The init_mm pagetable is really pinned as soon as its created, but
    that's before we have page structures to store the bits.  So do all
    the book-keeping now. */
-static __init int mark_pinned(struct page *page, unsigned flags)
+static __init int mark_pinned(struct page *page, enum pt_level level)
 {
 	SetPagePinned(page);
 	return 0;
@@ -453,18 +497,32 @@ void __init xen_mark_init_mm_pinned(void)
 	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
 }
 
-static int unpin_page(struct page *page, unsigned flags)
+static int unpin_page(struct page *page, enum pt_level level)
 {
 	unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
 
 	if (pgfl && !PageHighMem(page)) {
 		void *pt = lowmem_page_address(page);
 		unsigned long pfn = page_to_pfn(page);
-		struct multicall_space mcs = __xen_mc_entry(0);
+		spinlock_t *ptl = NULL;
+		struct multicall_space mcs;
+
+		if (level == PT_PTE) {
+			ptl = lock_pte(page);
+
+			xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
+		}
+
+		mcs = __xen_mc_entry(0);
 
 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 					pfn_pte(pfn, PAGE_KERNEL),
-					flags);
+					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+		if (ptl) {
+			/* unlock when batch completed */
+			xen_mc_callback(do_unlock, ptl);
+		}
 	}
 
 	return 0;		/* never need to flush on unpin */
@@ -473,18 +531,9 @@ static int unpin_page(struct page *page, unsigned flags)
 /* Release a pagetables pages back as normal RW */
 static void xen_pgd_unpin(pgd_t *pgd)
 {
-	struct mmuext_op *op;
-	struct multicall_space mcs;
-
 	xen_mc_batch();
 
-	mcs = __xen_mc_entry(sizeof(*op));
-
-	op = mcs.args;
-	op->cmd = MMUEXT_UNPIN_TABLE;
-	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 
 	pgd_walk(pgd, unpin_page, TASK_SIZE);
 
@@ -515,20 +564,43 @@ static void drop_other_mm_ref(void *info)
 
 	if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
 		leave_mm(smp_processor_id());
+
+	/* If this cpu still has a stale cr3 reference, then make sure
+	   it has been flushed. */
+	if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
+		load_cr3(swapper_pg_dir);
+		arch_flush_lazy_cpu_mode();
+	}
 }
 
 static void drop_mm_ref(struct mm_struct *mm)
 {
+	cpumask_t mask;
+	unsigned cpu;
+
 	if (current->active_mm == mm) {
 		if (current->mm == mm)
 			load_cr3(swapper_pg_dir);
 		else
 			leave_mm(smp_processor_id());
+		arch_flush_lazy_cpu_mode();
 	}
 
-	if (!cpus_empty(mm->cpu_vm_mask))
-		xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
-					   mm, 1);
+	/* Get the "official" set of cpus referring to our pagetable. */
+	mask = mm->cpu_vm_mask;
+
+	/* It's possible that a vcpu may have a stale reference to our
+	   cr3, because its in lazy mode, and it hasn't yet flushed
+	   its set of pending hypercalls yet.  In this case, we can
+	   look at its actual current cr3 value, and force it to flush
+	   if needed. */
+	for_each_online_cpu(cpu) {
+		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
+			cpu_set(cpu, mask);
+	}
+
+	if (!cpus_empty(mask))
+		xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
 }
 #else
 static void drop_mm_ref(struct mm_struct *mm)
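
The hunk above closes a subtle race: a vcpu in lazy MMU mode can still hold a reference to the dying pagetable in its cr3 even after dropping out of mm->cpu_vm_mask, so drop_mm_ref() now widens the IPI target set by scanning each cpu's recorded xen_current_cr3. Below is a minimal sketch of that mask-widening step; the plain array stands in for the per-cpu xen_current_cr3 variable, and the 8-cpu bitmask and sample addresses are illustrative only.

#include <stdio.h>

#define NCPUS 8	/* illustrative, not from the patch */

/* stands in for per_cpu(xen_current_cr3): the last cr3 each vcpu loaded */
static unsigned long xen_current_cr3[NCPUS];

/* mirror of the mask-widening loop in drop_mm_ref() */
static unsigned build_drop_mask(unsigned cpu_vm_mask, unsigned long pgd_pa)
{
	unsigned mask = cpu_vm_mask;

	for (unsigned cpu = 0; cpu < NCPUS; cpu++)
		if (xen_current_cr3[cpu] == pgd_pa)
			mask |= 1u << cpu;	/* stale lazy reference */

	return mask;
}

int main(void)
{
	unsigned long pgd_pa = 0x1000;

	xen_current_cr3[3] = pgd_pa;	/* cpu 3 is lazy and unflushed */

	/* cpu_vm_mask says only cpu 0 uses the mm, but cpu 3 must be
	   IPI'd too before the pagetable can be unpinned and freed */
	printf("drop mask: %#x\n", build_drop_mask(0x1u, pgd_pa));
	return 0;
}
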
@@ -563,5 +635,6 @@ void xen_exit_mmap(struct mm_struct *mm)
 	/* pgd may not be pinned in the error exit path of execve */
 	if (PagePinned(virt_to_page(mm->pgd)))
 		xen_pgd_unpin(mm->pgd);
+
 	spin_unlock(&mm->page_table_lock);
 }
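
Taken together, the patch replaces pgd_walk()'s opaque flags argument with an explicit enum pt_level, so each callback (pin_page, unpin_page, mark_pinned) can decide per level whether it needs the pte lock or a TLB flush. The following is a minimal sketch of that walker-plus-level-callback shape; a flat page array stands in for the real four-level pgd/pud/pmd/pte traversal.

#include <stdio.h>

enum pt_level { PT_PGD, PT_PUD, PT_PMD, PT_PTE };

struct page { int id; };

/* the walker passes the level down, as the reworked pgd_walk() does */
static int walk(struct page *pages, int n,
		int (*func)(struct page *, enum pt_level))
{
	int flush = 0;

	for (int i = 0; i < n - 1; i++)
		flush |= func(&pages[i], PT_PTE);	/* leaf pages first */
	flush |= func(&pages[n - 1], PT_PGD);		/* root page last */

	return flush;
}

static int pin_one(struct page *page, enum pt_level level)
{
	/* per-level policy: only leaves take the pte lock, and only the
	   root's remapping asks for UVMF_TLB_FLUSH */
	printf("pin page %d, level %d%s\n", page->id, level,
	       level == PT_PGD ? " (UVMF_TLB_FLUSH)" : "");
	return level == PT_PGD;
}

int main(void)
{
	struct page pages[3] = { {0}, {1}, {2} };

	if (walk(pages, 3, pin_one))
		puts("walker requested a TLB flush");
	return 0;
}
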