author     Sagi Grimberg <sagig@mellanox.com>              2012-10-08 19:33:33 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-10-09 03:22:58 -0400
commit     2ec74c3ef2d8c58d71e0e00336fb6b891192155a (patch)
tree       512b591504cdbee278c27afc50a7e3a558b4851a
parent     36e4f20af833d1ce196e6a4ade05dc26c44652d1 (diff)
mm: move all mmu notifier invocations to be done outside the PT lock
In order to allow sleeping during mmu notifier calls, we need to avoid
invoking them under the page table spinlock.  This patch solves the problem
by calling invalidate_page notification after releasing the lock (but before
freeing the page itself), or by wrapping the page invalidation with calls to
invalidate_range_begin and invalidate_range_end.

To prevent accidental changes to the invalidate_range_end arguments after the
call to invalidate_range_begin, the patch introduces a convention of saving
the arguments in consistently named locals:

        unsigned long mmun_start;       /* For mmu_notifiers */
        unsigned long mmun_end;         /* For mmu_notifiers */

        ...

        mmun_start = ...
        mmun_end = ...
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

        ...

        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

The patch changes code to use this convention for all calls to
mmu_notifier_invalidate_range_start/end, except those where the calls are
close enough so that anyone who glances at the code can see the values aren't
changing.

This patchset is a preliminary step towards the on-demand paging design to be
added to the RDMA stack.

Why do we want on-demand paging for Infiniband?

  Applications register memory with an RDMA adapter using system calls, and
  subsequently post IO operations that refer to the corresponding virtual
  addresses directly to HW.  Until now, this was achieved by pinning the
  memory during the registration calls.  The goal of on-demand paging is to
  avoid pinning the pages of registered memory regions (MRs).  This will
  allow users the same flexibility they get when swapping any other part of
  their processes' address spaces.  Instead of requiring the entire MR to fit
  in physical memory, we can allow the MR to be larger, and only fit the
  current working set in physical memory.

Why should anyone care?  What problems are users currently experiencing?

  This can make programming with RDMA much simpler.  Today, developers that
  are working with more data than their RAM can hold need either to
  deregister and reregister memory regions throughout their process's life,
  or keep a single memory region and copy the data to it.  On-demand paging
  will allow these developers to register a single MR at the beginning of
  their process's life, and let the operating system manage which pages need
  to be fetched at a given time.  In the future, we might be able to provide
  a single memory access key for each process that would provide the entire
  process's address space as one large memory region, and the developers
  wouldn't need to register memory regions at all.

Is there any prospect that any other subsystems will utilise these
infrastructural changes?  If so, which and how, etc?

  As for other subsystems, I understand that XPMEM wanted to sleep in MMU
  notifiers, as Christoph Lameter wrote at
  http://lkml.indiana.edu/hypermail/linux/kernel/0802.1/0460.html and perhaps
  Andrea knows about other use cases.

  Scheduling in mmu notifications is required since we need to sync the
  hardware with the secondary page tables change.  A TLB flush of an IO
  device is inherently slower than a CPU TLB flush, so our design works by
  sending the invalidation request to the device, and waiting for an
  interrupt before exiting the mmu notifier handler.

Avi said:

  kvm may be a buyer.  kvm::mmu_lock, which serializes guest page faults,
  also protects long operations such as destroying large ranges.  It would be
  good to convert it into a mutex, but as it is used inside mmu notifiers,
  this cannot be done.

  (there are alternatives, such as keeping the spinlock and using a
  generation counter to do the teardown in O(1), which is what the "may" is
  doing up there).

[akpm@linux-foundation.org: possible speed tweak in hugetlb_cow(), cleanups]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Or Gerlitz <ogerlitz@mellanox.com>
Cc: Haggai Eran <haggaie@mellanox.com>
Cc: Shachar Raindel <raindel@mellanox.com>
Cc: Liran Liss <liranl@mellanox.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Avi Kivity <avi@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
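To make the convention concrete, here is a minimal, hypothetical sketch of a
caller following it.  The function example_clear_range and its parameters are
illustrative only and are not part of this patch; only the
mmu_notifier_invalidate_range_start/end calls and mm->page_table_lock are the
existing kernel interfaces the convention is built around:

        #include <linux/mm.h>
        #include <linux/mmu_notifier.h>
        #include <linux/spinlock.h>

        /* Hypothetical caller following the mmun_start/mmun_end convention. */
        static void example_clear_range(struct mm_struct *mm,
                                        unsigned long start, unsigned long end)
        {
                unsigned long mmun_start;       /* For mmu_notifiers */
                unsigned long mmun_end;         /* For mmu_notifiers */

                mmun_start = start;
                mmun_end   = end;
                /* Notify secondary MMUs before the page tables are touched... */
                mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

                spin_lock(&mm->page_table_lock);
                /*
                 * ...clear or modify the entries under the PT lock, with no
                 * notifier calls (and hence no sleeping) inside the critical
                 * section...
                 */
                spin_unlock(&mm->page_table_lock);

                /* ...and complete the notification after the lock is dropped. */
                mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
        }

The point of the saved locals is that the end call is guaranteed to cover
exactly the range announced by the start call, even if start/end are
recomputed or clobbered in between.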
-rw-r--r--  include/linux/mmu_notifier.h  47
-rw-r--r--  mm/filemap_xip.c               4
-rw-r--r--  mm/huge_memory.c              42
-rw-r--r--  mm/hugetlb.c                  21
-rw-r--r--  mm/memory.c                   28
-rw-r--r--  mm/mremap.c                    8
-rw-r--r--  mm/rmap.c                     18
7 files changed, 92 insertions(+), 76 deletions(-)
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 4b7183e9806..bc823c4c028 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -246,50 +246,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
                 __mmu_notifier_mm_destroy(mm);
 }
 
-/*
- * These two macros will sometime replace ptep_clear_flush.
- * ptep_clear_flush is implemented as macro itself, so this also is
- * implemented as a macro until ptep_clear_flush will converted to an
- * inline function, to diminish the risk of compilation failure. The
- * invalidate_page method over time can be moved outside the PT lock
- * and these two macros can be later removed.
- */
-#define ptep_clear_flush_notify(__vma, __address, __ptep)        \
-({        \
-        pte_t __pte;        \
-        struct vm_area_struct *___vma = __vma;        \
-        unsigned long ___address = __address;        \
-        __pte = ptep_clear_flush(___vma, ___address, __ptep);        \
-        mmu_notifier_invalidate_page(___vma->vm_mm, ___address);        \
-        __pte;        \
-})
-
-#define pmdp_clear_flush_notify(__vma, __address, __pmdp)        \
-({        \
-        pmd_t __pmd;        \
-        struct vm_area_struct *___vma = __vma;        \
-        unsigned long ___address = __address;        \
-        VM_BUG_ON(__address & ~HPAGE_PMD_MASK);        \
-        mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address,        \
-                                            (__address)+HPAGE_PMD_SIZE);\
-        __pmd = pmdp_clear_flush(___vma, ___address, __pmdp);        \
-        mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address,        \
-                                          (__address)+HPAGE_PMD_SIZE);        \
-        __pmd;        \
-})
-
-#define pmdp_splitting_flush_notify(__vma, __address, __pmdp)        \
-({        \
-        struct vm_area_struct *___vma = __vma;        \
-        unsigned long ___address = __address;        \
-        VM_BUG_ON(__address & ~HPAGE_PMD_MASK);        \
-        mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address,        \
-                                            (__address)+HPAGE_PMD_SIZE);\
-        pmdp_splitting_flush(___vma, ___address, __pmdp);        \
-        mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address,        \
-                                          (__address)+HPAGE_PMD_SIZE);        \
-})
-
 #define ptep_clear_flush_young_notify(__vma, __address, __ptep)        \
 ({        \
         int __young;        \
@@ -380,9 +336,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 
 #define ptep_clear_flush_young_notify ptep_clear_flush_young
 #define pmdp_clear_flush_young_notify pmdp_clear_flush_young
-#define ptep_clear_flush_notify ptep_clear_flush
-#define pmdp_clear_flush_notify pmdp_clear_flush
-#define pmdp_splitting_flush_notify pmdp_splitting_flush
 #define set_pte_at_notify set_pte_at
 
 #endif /* CONFIG_MMU_NOTIFIER */
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index a52daee11d3..a912da6ddfd 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -192,11 +192,13 @@ retry:
                 if (pte) {
                         /* Nuke the page table entry. */
                         flush_cache_page(vma, address, pte_pfn(*pte));
-                        pteval = ptep_clear_flush_notify(vma, address, pte);
+                        pteval = ptep_clear_flush(vma, address, pte);
                         page_remove_rmap(page);
                         dec_mm_counter(mm, MM_FILEPAGES);
                         BUG_ON(pte_dirty(pteval));
                         pte_unmap_unlock(pte, ptl);
+                        /* must invalidate_page _before_ freeing the page */
+                        mmu_notifier_invalidate_page(mm, address);
                         page_cache_release(page);
                 }
         }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0e7740923fb..08a943b9cf9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -787,6 +787,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
         pmd_t _pmd;
         int ret = 0, i;
         struct page **pages;
+        unsigned long mmun_start;        /* For mmu_notifiers */
+        unsigned long mmun_end;          /* For mmu_notifiers */
 
         pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
                         GFP_KERNEL);
@@ -823,12 +825,16 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
                 cond_resched();
         }
 
+        mmun_start = haddr;
+        mmun_end = haddr + HPAGE_PMD_SIZE;
+        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
         spin_lock(&mm->page_table_lock);
         if (unlikely(!pmd_same(*pmd, orig_pmd)))
                 goto out_free_pages;
         VM_BUG_ON(!PageHead(page));
 
-        pmdp_clear_flush_notify(vma, haddr, pmd);
+        pmdp_clear_flush(vma, haddr, pmd);
         /* leave pmd empty until pte is filled */
 
         pgtable = pgtable_trans_huge_withdraw(mm);
@@ -851,6 +857,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
         page_remove_rmap(page);
         spin_unlock(&mm->page_table_lock);
 
+        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
         ret |= VM_FAULT_WRITE;
         put_page(page);
 
@@ -859,6 +867,7 @@ out:
 
 out_free_pages:
         spin_unlock(&mm->page_table_lock);
+        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
         mem_cgroup_uncharge_start();
         for (i = 0; i < HPAGE_PMD_NR; i++) {
                 mem_cgroup_uncharge_page(pages[i]);
@@ -875,6 +884,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
         int ret = 0;
         struct page *page, *new_page;
         unsigned long haddr;
+        unsigned long mmun_start;        /* For mmu_notifiers */
+        unsigned long mmun_end;          /* For mmu_notifiers */
 
         VM_BUG_ON(!vma->anon_vma);
         spin_lock(&mm->page_table_lock);
@@ -925,20 +936,24 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
         copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
         __SetPageUptodate(new_page);
 
+        mmun_start = haddr;
+        mmun_end = haddr + HPAGE_PMD_SIZE;
+        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
         spin_lock(&mm->page_table_lock);
         put_page(page);
         if (unlikely(!pmd_same(*pmd, orig_pmd))) {
                 spin_unlock(&mm->page_table_lock);
                 mem_cgroup_uncharge_page(new_page);
                 put_page(new_page);
-                goto out;
+                goto out_mn;
         } else {
                 pmd_t entry;
                 VM_BUG_ON(!PageHead(page));
                 entry = mk_pmd(new_page, vma->vm_page_prot);
                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                 entry = pmd_mkhuge(entry);
-                pmdp_clear_flush_notify(vma, haddr, pmd);
+                pmdp_clear_flush(vma, haddr, pmd);
                 page_add_new_anon_rmap(new_page, vma, haddr);
                 set_pmd_at(mm, haddr, pmd, entry);
                 update_mmu_cache(vma, address, pmd);
@@ -946,10 +961,14 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 put_page(page);
                 ret |= VM_FAULT_WRITE;
         }
-out_unlock:
         spin_unlock(&mm->page_table_lock);
+out_mn:
+        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 out:
         return ret;
+out_unlock:
+        spin_unlock(&mm->page_table_lock);
+        return ret;
 }
 
 struct page *follow_trans_huge_pmd(struct mm_struct *mm,
@@ -1162,7 +1181,11 @@ static int __split_huge_page_splitting(struct page *page,
         struct mm_struct *mm = vma->vm_mm;
         pmd_t *pmd;
         int ret = 0;
+        /* For mmu_notifiers */
+        const unsigned long mmun_start = address;
+        const unsigned long mmun_end   = address + HPAGE_PMD_SIZE;
 
+        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
         spin_lock(&mm->page_table_lock);
         pmd = page_check_address_pmd(page, mm, address,
                                      PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
@@ -1174,10 +1197,11 @@ static int __split_huge_page_splitting(struct page *page,
                  * and it won't wait on the anon_vma->root->mutex to
                  * serialize against split_huge_page*.
                  */
-                pmdp_splitting_flush_notify(vma, address, pmd);
+                pmdp_splitting_flush(vma, address, pmd);
                 ret = 1;
         }
         spin_unlock(&mm->page_table_lock);
+        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
         return ret;
 }
@@ -1898,6 +1922,8 @@ static void collapse_huge_page(struct mm_struct *mm,
         spinlock_t *ptl;
         int isolated;
         unsigned long hstart, hend;
+        unsigned long mmun_start;        /* For mmu_notifiers */
+        unsigned long mmun_end;          /* For mmu_notifiers */
 
         VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -1952,6 +1978,9 @@ static void collapse_huge_page(struct mm_struct *mm,
         pte = pte_offset_map(pmd, address);
         ptl = pte_lockptr(mm, pmd);
 
+        mmun_start = address;
+        mmun_end = address + HPAGE_PMD_SIZE;
+        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
         spin_lock(&mm->page_table_lock); /* probably unnecessary */
         /*
          * After this gup_fast can't run anymore. This also removes
@@ -1959,8 +1988,9 @@ static void collapse_huge_page(struct mm_struct *mm,
          * huge and small TLB entries for the same virtual address
          * to avoid the risk of CPU bugs in that area.
          */
-        _pmd = pmdp_clear_flush_notify(vma, address, pmd);
+        _pmd = pmdp_clear_flush(vma, address, pmd);
         spin_unlock(&mm->page_table_lock);
+        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
         spin_lock(ptl);
         isolated = __collapse_huge_page_isolate(vma, address, pte);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index de5d1dcf34f..993f7c1820a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2355,13 +2355,15 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
         struct page *page;
         struct hstate *h = hstate_vma(vma);
         unsigned long sz = huge_page_size(h);
+        const unsigned long mmun_start = start;        /* For mmu_notifiers */
+        const unsigned long mmun_end = end;            /* For mmu_notifiers */
 
         WARN_ON(!is_vm_hugetlb_page(vma));
         BUG_ON(start & ~huge_page_mask(h));
         BUG_ON(end & ~huge_page_mask(h));
 
         tlb_start_vma(tlb, vma);
-        mmu_notifier_invalidate_range_start(mm, start, end);
+        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 again:
         spin_lock(&mm->page_table_lock);
         for (address = start; address < end; address += sz) {
@@ -2425,7 +2427,7 @@ again:
                 if (address < end && !ref_page)
                         goto again;
         }
-        mmu_notifier_invalidate_range_end(mm, start, end);
+        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
         tlb_end_vma(tlb, vma);
 }
 
@@ -2525,6 +2527,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
         struct page *old_page, *new_page;
         int avoidcopy;
         int outside_reserve = 0;
+        unsigned long mmun_start;        /* For mmu_notifiers */
+        unsigned long mmun_end;          /* For mmu_notifiers */
 
         old_page = pte_page(pte);
 
@@ -2611,6 +2615,9 @@ retry_avoidcopy:
                             pages_per_huge_page(h));
         __SetPageUptodate(new_page);
 
+        mmun_start = address & huge_page_mask(h);
+        mmun_end = mmun_start + huge_page_size(h);
+        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
         /*
          * Retake the page_table_lock to check for racing updates
          * before the page tables are altered
@@ -2619,9 +2626,6 @@ retry_avoidcopy:
         ptep = huge_pte_offset(mm, address & huge_page_mask(h));
         if (likely(pte_same(huge_ptep_get(ptep), pte))) {
                 /* Break COW */
-                mmu_notifier_invalidate_range_start(mm,
-                        address & huge_page_mask(h),
-                        (address & huge_page_mask(h)) + huge_page_size(h));
                 huge_ptep_clear_flush(vma, address, ptep);
                 set_huge_pte_at(mm, address, ptep,
                                 make_huge_pte(vma, new_page, 1));
@@ -2629,10 +2633,11 @@ retry_avoidcopy:
                 hugepage_add_new_anon_rmap(new_page, vma, address);
                 /* Make the old page be freed below */
                 new_page = old_page;
-                mmu_notifier_invalidate_range_end(mm,
-                        address & huge_page_mask(h),
-                        (address & huge_page_mask(h)) + huge_page_size(h));
         }
+        spin_unlock(&mm->page_table_lock);
+        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+        /* Caller expects lock to be held */
+        spin_lock(&mm->page_table_lock);
         page_cache_release(new_page);
         page_cache_release(old_page);
         return 0;
diff --git a/mm/memory.c b/mm/memory.c
index 5f5d1f039bf..b03a4a21c1d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -712,7 +712,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
         add_taint(TAINT_BAD_PAGE);
 }
 
-static inline int is_cow_mapping(vm_flags_t flags)
+static inline bool is_cow_mapping(vm_flags_t flags)
 {
         return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 }
@@ -1039,6 +1039,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         unsigned long next;
         unsigned long addr = vma->vm_start;
         unsigned long end = vma->vm_end;
+        unsigned long mmun_start;        /* For mmu_notifiers */
+        unsigned long mmun_end;          /* For mmu_notifiers */
+        bool is_cow;
         int ret;
 
         /*
@@ -1072,8 +1075,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
          * parent mm. And a permission downgrade will only happen if
          * is_cow_mapping() returns true.
          */
-        if (is_cow_mapping(vma->vm_flags))
-                mmu_notifier_invalidate_range_start(src_mm, addr, end);
+        is_cow = is_cow_mapping(vma->vm_flags);
+        mmun_start = addr;
+        mmun_end = end;
+        if (is_cow)
+                mmu_notifier_invalidate_range_start(src_mm, mmun_start,
+                                                    mmun_end);
 
         ret = 0;
         dst_pgd = pgd_offset(dst_mm, addr);
@@ -1089,9 +1096,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                 }
         } while (dst_pgd++, src_pgd++, addr = next, addr != end);
 
-        if (is_cow_mapping(vma->vm_flags))
-                mmu_notifier_invalidate_range_end(src_mm,
-                                                  vma->vm_start, end);
+        if (is_cow)
+                mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
         return ret;
 }
 
@@ -2516,7 +2522,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 spinlock_t *ptl, pte_t orig_pte)
         __releases(ptl)
 {
-        struct page *old_page, *new_page;
+        struct page *old_page, *new_page = NULL;
         pte_t entry;
         int ret = 0;
         int page_mkwrite = 0;
@@ -2760,10 +2766,14 @@ gotten:
         } else
                 mem_cgroup_uncharge_page(new_page);
 
-        if (new_page)
-                page_cache_release(new_page);
 unlock:
         pte_unmap_unlock(page_table, ptl);
+        if (new_page) {
+                if (new_page == old_page)
+                        /* cow happened, notify before releasing old_page */
+                        mmu_notifier_invalidate_page(mm, address);
+                page_cache_release(new_page);
+        }
         if (old_page) {
                 /*
                  * Don't let another task, with possibly unlocked vma,
diff --git a/mm/mremap.c b/mm/mremap.c
index 3b639a4b26b..1b61c2d3307 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -149,11 +149,15 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
         unsigned long extent, next, old_end;
         pmd_t *old_pmd, *new_pmd;
         bool need_flush = false;
+        unsigned long mmun_start;        /* For mmu_notifiers */
+        unsigned long mmun_end;          /* For mmu_notifiers */
 
         old_end = old_addr + len;
         flush_cache_range(vma, old_addr, old_end);
 
-        mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end);
+        mmun_start = old_addr;
+        mmun_end = old_end;
+        mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
 
         for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
                 cond_resched();
@@ -197,7 +201,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
         if (likely(need_flush))
                 flush_tlb_range(vma, old_end-len, old_addr);
 
-        mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end);
+        mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
 
         return len + old_addr - old_end;        /* how much done */
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index bf03149f495..7df7984d476 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -884,7 +884,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
                 pte_t entry;
 
                 flush_cache_page(vma, address, pte_pfn(*pte));
-                entry = ptep_clear_flush_notify(vma, address, pte);
+                entry = ptep_clear_flush(vma, address, pte);
                 entry = pte_wrprotect(entry);
                 entry = pte_mkclean(entry);
                 set_pte_at(mm, address, pte, entry);
@@ -892,6 +892,9 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
         }
 
         pte_unmap_unlock(pte, ptl);
+
+        if (ret)
+                mmu_notifier_invalidate_page(mm, address);
 out:
         return ret;
 }
@@ -1212,7 +1215,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
         /* Nuke the page table entry. */
         flush_cache_page(vma, address, page_to_pfn(page));
-        pteval = ptep_clear_flush_notify(vma, address, pte);
+        pteval = ptep_clear_flush(vma, address, pte);
 
         /* Move the dirty bit to the physical page now the pte is gone. */
         if (pte_dirty(pteval))
@@ -1274,6 +1277,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
 out_unmap:
         pte_unmap_unlock(pte, ptl);
+        if (ret != SWAP_FAIL)
+                mmu_notifier_invalidate_page(mm, address);
 out:
         return ret;
 
@@ -1338,6 +1343,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
         spinlock_t *ptl;
         struct page *page;
         unsigned long address;
+        unsigned long mmun_start;        /* For mmu_notifiers */
+        unsigned long mmun_end;          /* For mmu_notifiers */
         unsigned long end;
         int ret = SWAP_AGAIN;
         int locked_vma = 0;
@@ -1361,6 +1368,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
         if (!pmd_present(*pmd))
                 return ret;
 
+        mmun_start = address;
+        mmun_end = end;
+        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
         /*
          * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
          * keep the sem while scanning the cluster for mlocking pages.
@@ -1394,7 +1405,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
 
                 /* Nuke the page table entry. */
                 flush_cache_page(vma, address, pte_pfn(*pte));
-                pteval = ptep_clear_flush_notify(vma, address, pte);
+                pteval = ptep_clear_flush(vma, address, pte);
 
                 /* If nonlinear, store the file page offset in the pte. */
                 if (page->index != linear_page_index(vma, address))
@@ -1410,6 +1421,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
                 (*mapcount)--;
         }
         pte_unmap_unlock(pte - 1, ptl);
+        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
         if (locked_vma)
                 up_read(&vma->vm_mm->mmap_sem);
         return ret;