path: root/mm/rmap.c
author		Sagi Grimberg <sagig@mellanox.com>	2012-10-08 19:33:33 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-10-09 03:22:58 -0400
commit		2ec74c3ef2d8c58d71e0e00336fb6b891192155a (patch)
tree		512b591504cdbee278c27afc50a7e3a558b4851a /mm/rmap.c
parent		36e4f20af833d1ce196e6a4ade05dc26c44652d1 (diff)
mm: move all mmu notifier invocations to be done outside the PT lock
In order to allow sleeping during mmu notifier calls, we need to avoid invoking them under the page table spinlock. This patch solves the problem by calling the invalidate_page notification after releasing the lock (but before freeing the page itself), or by wrapping the page invalidation with calls to invalidate_range_start and invalidate_range_end.

To prevent accidental changes to the invalidate_range_end arguments after the call to invalidate_range_start, the patch introduces a convention of saving the arguments in consistently named locals:

	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	...

	mmun_start = ...
	mmun_end = ...
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	...

	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

The patch changes the code to use this convention for all calls to mmu_notifier_invalidate_range_start/end, except those where the calls are close enough together that anyone who glances at the code can see the values aren't changing.

This patchset is a preliminary step towards the on-demand paging design to be added to the RDMA stack.

Why do we want on-demand paging for InfiniBand?

Applications register memory with an RDMA adapter using system calls, and subsequently post IO operations that refer to the corresponding virtual addresses directly to HW. Until now, this was achieved by pinning the memory during the registration calls. The goal of on-demand paging is to avoid pinning the pages of registered memory regions (MRs). This will allow users the same flexibility they get when swapping any other part of their processes' address spaces. Instead of requiring the entire MR to fit in physical memory, we can allow the MR to be larger, and only fit the current working set in physical memory.

Why should anyone care? What problems are users currently experiencing?

This can make programming with RDMA much simpler. Today, developers who are working with more data than their RAM can hold need either to deregister and reregister memory regions throughout their process's life, or to keep a single memory region and copy the data into it. On-demand paging will allow these developers to register a single MR at the beginning of their process's life, and let the operating system manage which pages need to be fetched at a given time. In the future, we might be able to provide a single memory access key for each process that would expose the entire process's address space as one large memory region, and developers wouldn't need to register memory regions at all.

Is there any prospect that any other subsystems will utilise these infrastructural changes? If so, which and how, etc?

As for other subsystems, I understand that XPMEM wanted to sleep in MMU notifiers, as Christoph Lameter wrote at http://lkml.indiana.edu/hypermail/linux/kernel/0802.1/0460.html and perhaps Andrea knows about other use cases.

Scheduling in mmu notifiers is required since we need to sync the hardware with changes to the secondary page tables. A TLB flush of an IO device is inherently slower than a CPU TLB flush, so our design works by sending the invalidation request to the device and waiting for an interrupt before exiting the mmu notifier handler.

Avi said:

  kvm may be a buyer. kvm::mmu_lock, which serializes guest page faults, also protects long operations such as destroying large ranges. It would be good to convert it into a spinlock, but as it is used inside mmu notifiers, this cannot be done.

  (there are alternatives, such as keeping the spinlock and using a generation counter to do the teardown in O(1), which is what the "may" is doing up there).

[akpm@linux-foundation.org: possible speed tweak in hugetlb_cow(), cleanups]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Or Gerlitz <ogerlitz@mellanox.com>
Cc: Haggai Eran <haggaie@mellanox.com>
Cc: Shachar Raindel <raindel@mellanox.com>
Cc: Liran Liss <liranl@mellanox.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Avi Kivity <avi@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
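For illustration, here is a minimal sketch of the convention described above, modelled on the try_to_unmap_cluster() change in the diff below. some_zap_range() is a made-up helper, not part of this patch; the page-table and notifier calls it uses are the real ones, and the point is that both notifier invocations sit outside the page table spinlock:

#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>

/* Hypothetical helper: tear down a range of PTEs in one pte page. */
static void some_zap_range(struct vm_area_struct *vma, pmd_t *pmd,
                           unsigned long start, unsigned long end)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long mmun_start;       /* For mmu_notifiers */
        unsigned long mmun_end;         /* For mmu_notifiers */
        unsigned long address;
        spinlock_t *ptl;
        pte_t *pte;

        mmun_start = start;
        mmun_end   = end;
        /* May sleep: issued before the PT spinlock is taken. */
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

        pte = pte_offset_map_lock(mm, pmd, start, &ptl);
        for (address = start; address < end; address += PAGE_SIZE, pte++) {
                if (!pte_present(*pte))
                        continue;
                flush_cache_page(vma, address, pte_pfn(*pte));
                /* Plain clear+flush: no _notify variant under the lock. */
                ptep_clear_flush(vma, address, pte);
        }
        pte_unmap_unlock(pte - 1, ptl);

        /* May sleep: issued after the PT spinlock has been dropped. */
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
}

Note that only the plain ptep_clear_flush() is used under the lock; the _notify variant would invoke the notifier while the spinlock is held, which is exactly what this patch removes.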
Diffstat (limited to 'mm/rmap.c')
-rw-r--r--	mm/rmap.c	18
1 file changed, 15 insertions, 3 deletions
diff --git a/mm/rmap.c b/mm/rmap.c
index bf03149f495c..7df7984d476c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -884,7 +884,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
 		pte_t entry;
 
 		flush_cache_page(vma, address, pte_pfn(*pte));
-		entry = ptep_clear_flush_notify(vma, address, pte);
+		entry = ptep_clear_flush(vma, address, pte);
 		entry = pte_wrprotect(entry);
 		entry = pte_mkclean(entry);
 		set_pte_at(mm, address, pte, entry);
@@ -892,6 +892,9 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
 	}
 
 	pte_unmap_unlock(pte, ptl);
+
+	if (ret)
+		mmu_notifier_invalidate_page(mm, address);
 out:
 	return ret;
 }
@@ -1212,7 +1215,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
 	/* Nuke the page table entry. */
 	flush_cache_page(vma, address, page_to_pfn(page));
-	pteval = ptep_clear_flush_notify(vma, address, pte);
+	pteval = ptep_clear_flush(vma, address, pte);
 
 	/* Move the dirty bit to the physical page now the pte is gone. */
 	if (pte_dirty(pteval))
@@ -1274,6 +1277,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
+	if (ret != SWAP_FAIL)
+		mmu_notifier_invalidate_page(mm, address);
 out:
 	return ret;
 
@@ -1338,6 +1343,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
 	spinlock_t *ptl;
 	struct page *page;
 	unsigned long address;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
 	unsigned long end;
 	int ret = SWAP_AGAIN;
 	int locked_vma = 0;
@@ -1361,6 +1368,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
 	if (!pmd_present(*pmd))
 		return ret;
 
+	mmun_start = address;
+	mmun_end   = end;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
 	/*
 	 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
 	 * keep the sem while scanning the cluster for mlocking pages.
@@ -1394,7 +1405,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
 
 	/* Nuke the page table entry. */
 	flush_cache_page(vma, address, pte_pfn(*pte));
-	pteval = ptep_clear_flush_notify(vma, address, pte);
+	pteval = ptep_clear_flush(vma, address, pte);
 
 	/* If nonlinear, store the file page offset in the pte. */
 	if (page->index != linear_page_index(vma, address))
@@ -1410,6 +1421,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
 		(*mapcount)--;
 	}
 	pte_unmap_unlock(pte - 1, ptl);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	if (locked_vma)
 		up_read(&vma->vm_mm->mmap_sem);
 	return ret;