author		Sagi Grimberg <sagig@mellanox.com>	2012-10-08 19:33:33 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-10-09 03:22:58 -0400
commit		2ec74c3ef2d8c58d71e0e00336fb6b891192155a (patch)
tree		512b591504cdbee278c27afc50a7e3a558b4851a /mm/hugetlb.c
parent		36e4f20af833d1ce196e6a4ade05dc26c44652d1 (diff)
mm: move all mmu notifier invocations to be done outside the PT lock
In order to allow sleeping during mmu notifier calls, we need to avoid invoking them under the page table spinlock.  This patch solves the problem by calling the invalidate_page notification after releasing the lock (but before freeing the page itself), or by wrapping the page invalidation with calls to invalidate_range_begin and invalidate_range_end.

To prevent accidental changes to the invalidate_range_end arguments after the call to invalidate_range_begin, the patch introduces a convention of saving the arguments in consistently named locals:

	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	...

	mmun_start = ...
	mmun_end = ...
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	...

	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

The patch changes code to use this convention for all calls to mmu_notifier_invalidate_range_start/end, except those where the calls are close enough together that anyone glancing at the code can see the values aren't changing.

This patchset is a preliminary step towards the on-demand paging design to be added to the RDMA stack.

Why do we want on-demand paging for Infiniband?

  Applications register memory with an RDMA adapter using system calls, and subsequently post IO operations that refer to the corresponding virtual addresses directly to HW.  Until now, this was achieved by pinning the memory during the registration calls.  The goal of on-demand paging is to avoid pinning the pages of registered memory regions (MRs).  This will allow users the same flexibility they get when swapping any other part of their process's address space.  Instead of requiring the entire MR to fit in physical memory, we can allow the MR to be larger, and only fit the current working set in physical memory.

Why should anyone care?  What problems are users currently experiencing?

  This can make programming with RDMA much simpler.  Today, developers working with more data than their RAM can hold need either to deregister and reregister memory regions throughout their process's life, or to keep a single memory region and copy the data into it.  On-demand paging will allow these developers to register a single MR at the beginning of their process's life, and let the operating system manage which pages need to be fetched at a given time.  In the future, we might be able to provide a single memory access key for each process that would expose the entire process's address space as one large memory region, and developers wouldn't need to register memory regions at all.

Is there any prospect that any other subsystems will utilise these infrastructural changes?  If so, which and how, etc?

  As for other subsystems, I understand that XPMEM wanted to sleep in MMU notifiers, as Christoph Lameter wrote at http://lkml.indiana.edu/hypermail/linux/kernel/0802.1/0460.html and perhaps Andrea knows about other use cases.

  Scheduling in mmu notifications is required since we need to sync the hardware with the secondary page table changes.  A TLB flush of an IO device is inherently slower than a CPU TLB flush, so our design works by sending the invalidation request to the device and waiting for an interrupt before exiting the mmu notifier handler.  (An illustrative sketch of such a notifier follows the tags below.)

  Avi said:

    kvm may be a buyer.  kvm::mmu_lock, which serializes guest page faults, also protects long operations such as destroying large ranges.  It would be good to convert it into a mutex, but as it is used inside mmu notifiers, this cannot be done.

    (There are alternatives, such as keeping the spinlock and using a generation counter to do the teardown in O(1), which is what the "may" is doing up there.)

[akpm@linux-foundation.org: possible speed tweak in hugetlb_cow(), cleanups]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Or Gerlitz <ogerlitz@mellanox.com>
Cc: Haggai Eran <haggaie@mellanox.com>
Cc: Shachar Raindel <raindel@mellanox.com>
Cc: Liran Liss <liranl@mellanox.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Avi Kivity <avi@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
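
Illustration (editorial note, not part of the patch): a minimal sketch of the kind of secondary-MMU notifier this change enables, one that forwards an invalidation to a device and sleeps until the device's interrupt handler signals completion.  All demo_* names (struct demo_mmu, demo_post_invalidate(), demo_mmu_attach()) are hypothetical stand-ins invented for this example; only the mmu_notifier API calls are real kernel interfaces of that era.

	#include <linux/kernel.h>
	#include <linux/completion.h>
	#include <linux/mmu_notifier.h>

	/* Hypothetical per-registration state for an imaginary driver. */
	struct demo_mmu {
		struct mmu_notifier	mn;
		struct completion	inval_done;
	};

	/* Stand-in for a device-specific "drop translations for [start, end)" request. */
	static void demo_post_invalidate(struct demo_mmu *d,
					 unsigned long start, unsigned long end)
	{
		/* e.g. post a command to the device and return immediately */
	}

	static void demo_invalidate_range_start(struct mmu_notifier *mn,
						struct mm_struct *mm,
						unsigned long start,
						unsigned long end)
	{
		struct demo_mmu *d = container_of(mn, struct demo_mmu, mn);

		/*
		 * Post the invalidation to the device, then sleep until its
		 * IRQ handler calls complete(&d->inval_done).  Sleeping here
		 * is only safe because this series moves the notifier calls
		 * outside the page table spinlock.
		 */
		demo_post_invalidate(d, start, end);
		wait_for_completion(&d->inval_done);
	}

	static const struct mmu_notifier_ops demo_mmu_ops = {
		.invalidate_range_start	= demo_invalidate_range_start,
	};

	/* Hypothetical setup, e.g. when an on-demand-paging MR is created. */
	static int demo_mmu_attach(struct demo_mmu *d, struct mm_struct *mm)
	{
		init_completion(&d->inval_done);
		d->mn.ops = &demo_mmu_ops;
		return mmu_notifier_register(&d->mn, mm);
	}

The sleeping wait in invalidate_range_start is the part that depends on this patch; before it, the notifier could be invoked with mm->page_table_lock held, where sleeping is forbidden.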
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--  mm/hugetlb.c  |  21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index de5d1dcf34fe..993f7c1820a8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2355,13 +2355,15 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	struct page *page;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
+	const unsigned long mmun_start = start;	/* For mmu_notifiers */
+	const unsigned long mmun_end = end;	/* For mmu_notifiers */
 
 	WARN_ON(!is_vm_hugetlb_page(vma));
 	BUG_ON(start & ~huge_page_mask(h));
 	BUG_ON(end & ~huge_page_mask(h));
 
 	tlb_start_vma(tlb, vma);
-	mmu_notifier_invalidate_range_start(mm, start, end);
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 again:
 	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += sz) {
@@ -2425,7 +2427,7 @@ again:
 		if (address < end && !ref_page)
 			goto again;
 	}
-	mmu_notifier_invalidate_range_end(mm, start, end);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	tlb_end_vma(tlb, vma);
 }
 
@@ -2525,6 +2527,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *old_page, *new_page;
 	int avoidcopy;
 	int outside_reserve = 0;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
 
 	old_page = pte_page(pte);
 
@@ -2611,6 +2615,9 @@ retry_avoidcopy:
 			    pages_per_huge_page(h));
 	__SetPageUptodate(new_page);
 
+	mmun_start = address & huge_page_mask(h);
+	mmun_end = mmun_start + huge_page_size(h);
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	/*
 	 * Retake the page_table_lock to check for racing updates
 	 * before the page tables are altered
@@ -2619,9 +2626,6 @@ retry_avoidcopy:
 	ptep = huge_pte_offset(mm, address & huge_page_mask(h));
 	if (likely(pte_same(huge_ptep_get(ptep), pte))) {
 		/* Break COW */
-		mmu_notifier_invalidate_range_start(mm,
-			address & huge_page_mask(h),
-			(address & huge_page_mask(h)) + huge_page_size(h));
 		huge_ptep_clear_flush(vma, address, ptep);
 		set_huge_pte_at(mm, address, ptep,
 				make_huge_pte(vma, new_page, 1));
@@ -2629,10 +2633,11 @@ retry_avoidcopy:
 		hugepage_add_new_anon_rmap(new_page, vma, address);
 		/* Make the old page be freed below */
 		new_page = old_page;
-		mmu_notifier_invalidate_range_end(mm,
-			address & huge_page_mask(h),
-			(address & huge_page_mask(h)) + huge_page_size(h));
 	}
+	spin_unlock(&mm->page_table_lock);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+	/* Caller expects lock to be held */
+	spin_lock(&mm->page_table_lock);
 	page_cache_release(new_page);
 	page_cache_release(old_page);
 	return 0;