aboutsummaryrefslogtreecommitdiffstats
path: root/mm/huge_memory.c
diff options
context:
space:
mode:
author: Sagi Grimberg <sagig@mellanox.com> 2012-10-08 19:33:33 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2012-10-09 03:22:58 -0400
commit: 2ec74c3ef2d8c58d71e0e00336fb6b891192155a (patch)
tree: 512b591504cdbee278c27afc50a7e3a558b4851a /mm/huge_memory.c
parent: 36e4f20af833d1ce196e6a4ade05dc26c44652d1 (diff)
mm: move all mmu notifier invocations to be done outside the PT lock
In order to allow sleeping during mmu notifier calls, we need to avoid invoking them under the page table spinlock. This patch solves the problem by calling the invalidate_page notification after releasing the lock (but before freeing the page itself), or by wrapping the page invalidation with calls to invalidate_range_start and invalidate_range_end. To prevent accidental changes to the invalidate_range_end arguments after the call to invalidate_range_start, the patch introduces a convention of saving the arguments in consistently named locals:

    unsigned long mmun_start; /* For mmu_notifiers */
    unsigned long mmun_end;   /* For mmu_notifiers */
    ...
    mmun_start = ...
    mmun_end = ...
    mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
    ...
    mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

The patch changes code to use this convention for all calls to mmu_notifier_invalidate_range_start/end, except those where the calls are close enough so that anyone who glances at the code can see the values aren't changing. This patchset is a preliminary step towards an on-demand paging design to be added to the RDMA stack. Why do we want on-demand paging for Infiniband? Applications register memory with an RDMA adapter using system calls, and subsequently post IO operations that refer to the corresponding virtual addresses directly to HW. Until now, this was achieved by pinning the memory during the registration calls. The goal of on-demand paging is to avoid pinning the pages of registered memory regions (MRs). This will allow users the same flexibility they get when swapping any other part of their processes' address spaces. Instead of requiring the entire MR to fit in physical memory, we can allow the MR to be larger, and only fit the current working set in physical memory. Why should anyone care? What problems are users currently experiencing? This can make programming with RDMA much simpler. 
Today, developers that are working with more data than their RAM can hold need either to deregister and reregister memory regions throughout their process's life, or keep a single memory region and copy the data to it. On-demand paging will allow these developers to register a single MR at the beginning of their process's life, and let the operating system manage which pages need to be fetched at a given time. In the future, we might be able to provide a single memory access key for each process that would provide the entire process's address space as one large memory region, and the developers wouldn't need to register memory regions at all. Is there any prospect that any other subsystems will utilise these infrastructural changes? If so, which and how, etc? As for other subsystems, I understand that XPMEM wanted to sleep in MMU notifiers, as Christoph Lameter wrote at http://lkml.indiana.edu/hypermail/linux/kernel/0802.1/0460.html and perhaps Andrea knows about other use cases. Scheduling in mmu notifications is required since we need to sync the hardware with the secondary page tables change. A TLB flush of an IO device is inherently slower than a CPU TLB flush, so our design works by sending the invalidation request to the device, and waiting for an interrupt before exiting the mmu notifier handler. Avi said: kvm may be a buyer. kvm::mmu_lock, which serializes guest page faults, also protects long operations such as destroying large ranges. It would be good to convert it into a spinlock, but as it is used inside mmu notifiers, this cannot be done. (there are alternatives, such as keeping the spinlock and using a generation counter to do the teardown in O(1), which is what the "may" is doing up there). 
[akpm@linux-foundation.org: possible speed tweak in hugetlb_cow(), cleanups] Signed-off-by: Andrea Arcangeli <andrea@qumranet.com> Signed-off-by: Sagi Grimberg <sagig@mellanox.com> Signed-off-by: Haggai Eran <haggaie@mellanox.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com> Cc: Or Gerlitz <ogerlitz@mellanox.com> Cc: Haggai Eran <haggaie@mellanox.com> Cc: Shachar Raindel <raindel@mellanox.com> Cc: Liran Liss <liranl@mellanox.com> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Avi Kivity <avi@redhat.com> Cc: Hugh Dickins <hughd@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r-- mm/huge_memory.c 42
1 files changed, 36 insertions, 6 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0e7740923fb9..08a943b9cf95 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -787,6 +787,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
787 pmd_t _pmd; 787 pmd_t _pmd;
788 int ret = 0, i; 788 int ret = 0, i;
789 struct page **pages; 789 struct page **pages;
790 unsigned long mmun_start; /* For mmu_notifiers */
791 unsigned long mmun_end; /* For mmu_notifiers */
790 792
791 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, 793 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
792 GFP_KERNEL); 794 GFP_KERNEL);
@@ -823,12 +825,16 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
823 cond_resched(); 825 cond_resched();
824 } 826 }
825 827
828 mmun_start = haddr;
829 mmun_end = haddr + HPAGE_PMD_SIZE;
830 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
831
826 spin_lock(&mm->page_table_lock); 832 spin_lock(&mm->page_table_lock);
827 if (unlikely(!pmd_same(*pmd, orig_pmd))) 833 if (unlikely(!pmd_same(*pmd, orig_pmd)))
828 goto out_free_pages; 834 goto out_free_pages;
829 VM_BUG_ON(!PageHead(page)); 835 VM_BUG_ON(!PageHead(page));
830 836
831 pmdp_clear_flush_notify(vma, haddr, pmd); 837 pmdp_clear_flush(vma, haddr, pmd);
832 /* leave pmd empty until pte is filled */ 838 /* leave pmd empty until pte is filled */
833 839
834 pgtable = pgtable_trans_huge_withdraw(mm); 840 pgtable = pgtable_trans_huge_withdraw(mm);
@@ -851,6 +857,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
851 page_remove_rmap(page); 857 page_remove_rmap(page);
852 spin_unlock(&mm->page_table_lock); 858 spin_unlock(&mm->page_table_lock);
853 859
860 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
861
854 ret |= VM_FAULT_WRITE; 862 ret |= VM_FAULT_WRITE;
855 put_page(page); 863 put_page(page);
856 864
@@ -859,6 +867,7 @@ out:
859 867
860out_free_pages: 868out_free_pages:
861 spin_unlock(&mm->page_table_lock); 869 spin_unlock(&mm->page_table_lock);
870 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
862 mem_cgroup_uncharge_start(); 871 mem_cgroup_uncharge_start();
863 for (i = 0; i < HPAGE_PMD_NR; i++) { 872 for (i = 0; i < HPAGE_PMD_NR; i++) {
864 mem_cgroup_uncharge_page(pages[i]); 873 mem_cgroup_uncharge_page(pages[i]);
@@ -875,6 +884,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
875 int ret = 0; 884 int ret = 0;
876 struct page *page, *new_page; 885 struct page *page, *new_page;
877 unsigned long haddr; 886 unsigned long haddr;
887 unsigned long mmun_start; /* For mmu_notifiers */
888 unsigned long mmun_end; /* For mmu_notifiers */
878 889
879 VM_BUG_ON(!vma->anon_vma); 890 VM_BUG_ON(!vma->anon_vma);
880 spin_lock(&mm->page_table_lock); 891 spin_lock(&mm->page_table_lock);
@@ -925,20 +936,24 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
925 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); 936 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
926 __SetPageUptodate(new_page); 937 __SetPageUptodate(new_page);
927 938
939 mmun_start = haddr;
940 mmun_end = haddr + HPAGE_PMD_SIZE;
941 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
942
928 spin_lock(&mm->page_table_lock); 943 spin_lock(&mm->page_table_lock);
929 put_page(page); 944 put_page(page);
930 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 945 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
931 spin_unlock(&mm->page_table_lock); 946 spin_unlock(&mm->page_table_lock);
932 mem_cgroup_uncharge_page(new_page); 947 mem_cgroup_uncharge_page(new_page);
933 put_page(new_page); 948 put_page(new_page);
934 goto out; 949 goto out_mn;
935 } else { 950 } else {
936 pmd_t entry; 951 pmd_t entry;
937 VM_BUG_ON(!PageHead(page)); 952 VM_BUG_ON(!PageHead(page));
938 entry = mk_pmd(new_page, vma->vm_page_prot); 953 entry = mk_pmd(new_page, vma->vm_page_prot);
939 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 954 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
940 entry = pmd_mkhuge(entry); 955 entry = pmd_mkhuge(entry);
941 pmdp_clear_flush_notify(vma, haddr, pmd); 956 pmdp_clear_flush(vma, haddr, pmd);
942 page_add_new_anon_rmap(new_page, vma, haddr); 957 page_add_new_anon_rmap(new_page, vma, haddr);
943 set_pmd_at(mm, haddr, pmd, entry); 958 set_pmd_at(mm, haddr, pmd, entry);
944 update_mmu_cache(vma, address, pmd); 959 update_mmu_cache(vma, address, pmd);
@@ -946,10 +961,14 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
946 put_page(page); 961 put_page(page);
947 ret |= VM_FAULT_WRITE; 962 ret |= VM_FAULT_WRITE;
948 } 963 }
949out_unlock:
950 spin_unlock(&mm->page_table_lock); 964 spin_unlock(&mm->page_table_lock);
965out_mn:
966 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
951out: 967out:
952 return ret; 968 return ret;
969out_unlock:
970 spin_unlock(&mm->page_table_lock);
971 return ret;
953} 972}
954 973
955struct page *follow_trans_huge_pmd(struct mm_struct *mm, 974struct page *follow_trans_huge_pmd(struct mm_struct *mm,
@@ -1162,7 +1181,11 @@ static int __split_huge_page_splitting(struct page *page,
1162 struct mm_struct *mm = vma->vm_mm; 1181 struct mm_struct *mm = vma->vm_mm;
1163 pmd_t *pmd; 1182 pmd_t *pmd;
1164 int ret = 0; 1183 int ret = 0;
1184 /* For mmu_notifiers */
1185 const unsigned long mmun_start = address;
1186 const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
1165 1187
1188 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1166 spin_lock(&mm->page_table_lock); 1189 spin_lock(&mm->page_table_lock);
1167 pmd = page_check_address_pmd(page, mm, address, 1190 pmd = page_check_address_pmd(page, mm, address,
1168 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); 1191 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
@@ -1174,10 +1197,11 @@ static int __split_huge_page_splitting(struct page *page,
1174 * and it won't wait on the anon_vma->root->mutex to 1197 * and it won't wait on the anon_vma->root->mutex to
1175 * serialize against split_huge_page*. 1198 * serialize against split_huge_page*.
1176 */ 1199 */
1177 pmdp_splitting_flush_notify(vma, address, pmd); 1200 pmdp_splitting_flush(vma, address, pmd);
1178 ret = 1; 1201 ret = 1;
1179 } 1202 }
1180 spin_unlock(&mm->page_table_lock); 1203 spin_unlock(&mm->page_table_lock);
1204 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1181 1205
1182 return ret; 1206 return ret;
1183} 1207}
@@ -1898,6 +1922,8 @@ static void collapse_huge_page(struct mm_struct *mm,
1898 spinlock_t *ptl; 1922 spinlock_t *ptl;
1899 int isolated; 1923 int isolated;
1900 unsigned long hstart, hend; 1924 unsigned long hstart, hend;
1925 unsigned long mmun_start; /* For mmu_notifiers */
1926 unsigned long mmun_end; /* For mmu_notifiers */
1901 1927
1902 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 1928 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1903 1929
@@ -1952,6 +1978,9 @@ static void collapse_huge_page(struct mm_struct *mm,
1952 pte = pte_offset_map(pmd, address); 1978 pte = pte_offset_map(pmd, address);
1953 ptl = pte_lockptr(mm, pmd); 1979 ptl = pte_lockptr(mm, pmd);
1954 1980
1981 mmun_start = address;
1982 mmun_end = address + HPAGE_PMD_SIZE;
1983 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1955 spin_lock(&mm->page_table_lock); /* probably unnecessary */ 1984 spin_lock(&mm->page_table_lock); /* probably unnecessary */
1956 /* 1985 /*
1957 * After this gup_fast can't run anymore. This also removes 1986 * After this gup_fast can't run anymore. This also removes
@@ -1959,8 +1988,9 @@ static void collapse_huge_page(struct mm_struct *mm,
1959 * huge and small TLB entries for the same virtual address 1988 * huge and small TLB entries for the same virtual address
1960 * to avoid the risk of CPU bugs in that area. 1989 * to avoid the risk of CPU bugs in that area.
1961 */ 1990 */
1962 _pmd = pmdp_clear_flush_notify(vma, address, pmd); 1991 _pmd = pmdp_clear_flush(vma, address, pmd);
1963 spin_unlock(&mm->page_table_lock); 1992 spin_unlock(&mm->page_table_lock);
1993 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1964 1994
1965 spin_lock(ptl); 1995 spin_lock(ptl);
1966 isolated = __collapse_huge_page_isolate(vma, address, pte); 1996 isolated = __collapse_huge_page_isolate(vma, address, pte);