mm: Remove i_mmap_lock lockbreak

Hugh says:
 "The only significant loser, I think, would be page reclaim (when
  concurrent with truncation): could spin for a long time waiting for
  the i_mmap_mutex it expects would soon be dropped? "

Counter points:
 - cpu contention makes the spin stop (need_resched())
 - zap pages should be freeing pages at a higher rate than reclaim
   ever can

I think the simplification of the truncate code is definitely worth it.

Effectively reverts: 2aa15890f3c ("mm: prevent concurrent
unmap_mapping_range() on the same inode") and takes out the code that
caused its problem.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Miller <davem@davemloft.net>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Peter Zijlstra <a.p.zijlstra@chello.nl> 2011-05-24 20:12:04 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2011-05-25 11:39:17 -0400
commit: 97a894136f29802da19a15541de3c019e1ca147e (patch)
tree: 1fd3f92ba92a37d5d8527a1f41458091d0a944dc /mm/memory.c
parent: e4c70a6629f9c74c4b0de258a3951890e9047c82 (diff)
1 files changed, 27 insertions, 168 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 17193d74f302..18655878b9f8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -986,13 +986,13 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pmd_t *pmd,
                                unsigned long addr, unsigned long end,
-                                long *zap_work, struct zap_details *details)
+                                struct zap_details *details)
 {
        struct mm_struct *mm = tlb->mm;
        int force_flush = 0;
-        pte_t *pte;
-        spinlock_t *ptl;
        int rss[NR_MM_COUNTERS];
+        spinlock_t *ptl;
+        pte_t *pte;
 again:
        init_rss_vec(rss);
@@ -1001,12 +1001,9 @@ again:
        do {
                pte_t ptent = *pte;
                if (pte_none(ptent)) {
-                        (*zap_work)--;
                        continue;
                }
-                (*zap_work) -= PAGE_SIZE;
                if (pte_present(ptent)) {
                        struct page *page;
@@ -1075,7 +1072,7 @@ again:
                                print_bad_pte(vma, addr, ptent, NULL);
                }
                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
-        } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
+        } while (pte++, addr += PAGE_SIZE, addr != end);
        add_mm_rss_vec(mm, rss);
        arch_leave_lazy_mmu_mode();
@@ -1099,7 +1096,7 @@ again:
 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pud_t *pud,
                                unsigned long addr, unsigned long end,
-                                long *zap_work, struct zap_details *details)
+                                struct zap_details *details)
 {
        pmd_t *pmd;
        unsigned long next;
@@ -1111,19 +1108,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                        if (next-addr != HPAGE_PMD_SIZE) {
                                VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
                                split_huge_page_pmd(vma->vm_mm, pmd);
-                        } else if (zap_huge_pmd(tlb, vma, pmd)) {
+                        } else if (zap_huge_pmd(tlb, vma, pmd))
-                                (*zap_work)--;
                                continue;
-                        }
                        /* fall through */
                }
-                if (pmd_none_or_clear_bad(pmd)) {
+                if (pmd_none_or_clear_bad(pmd))
-                        (*zap_work)--;
                        continue;
-                }
+                next = zap_pte_range(tlb, vma, pmd, addr, next, details);
-                next = zap_pte_range(tlb, vma, pmd, addr, next,
+                cond_resched();
-                                                zap_work, details);
+        } while (pmd++, addr = next, addr != end);
-        } while (pmd++, addr = next, (addr != end && *zap_work > 0));
        return addr;
 }
@@ -1131,7 +1124,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pgd_t *pgd,
                                unsigned long addr, unsigned long end,
-                                long *zap_work, struct zap_details *details)
+                                struct zap_details *details)
 {
        pud_t *pud;
        unsigned long next;
@@ -1139,13 +1132,10 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
-                if (pud_none_or_clear_bad(pud)) {
+                if (pud_none_or_clear_bad(pud))
-                        (*zap_work)--;
                        continue;
-                }
+                next = zap_pmd_range(tlb, vma, pud, addr, next, details);
-                next = zap_pmd_range(tlb, vma, pud, addr, next,
+        } while (pud++, addr = next, addr != end);
-                                                zap_work, details);
-        } while (pud++, addr = next, (addr != end && *zap_work > 0));
        return addr;
 }
@@ -1153,7 +1143,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
 static unsigned long unmap_page_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma,
                                unsigned long addr, unsigned long end,
-                                long *zap_work, struct zap_details *details)
+                                struct zap_details *details)
 {
        pgd_t *pgd;
        unsigned long next;
@@ -1167,13 +1157,10 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
        pgd = pgd_offset(vma->vm_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
-                if (pgd_none_or_clear_bad(pgd)) {
+                if (pgd_none_or_clear_bad(pgd))
-                        (*zap_work)--;
                        continue;
-                }
+                next = zap_pud_range(tlb, vma, pgd, addr, next, details);
-                next = zap_pud_range(tlb, vma, pgd, addr, next,
+        } while (pgd++, addr = next, addr != end);
-                                                zap_work, details);
-        } while (pgd++, addr = next, (addr != end && *zap_work > 0));
        tlb_end_vma(tlb, vma);
        mem_cgroup_uncharge_end();
@@ -1218,9 +1205,7 @@ unsigned long unmap_vmas(struct mmu_gather *tlb,
                unsigned long end_addr, unsigned long *nr_accounted,
                struct zap_details *details)
 {
-        long zap_work = ZAP_BLOCK_SIZE;
        unsigned long start = start_addr;
-        spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
        struct mm_struct *mm = vma->vm_mm;
        mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
@@ -1253,33 +1238,15 @@ unsigned long unmap_vmas(struct mmu_gather *tlb,
                                 * Since no pte has actually been setup, it is
                                 * safe to do nothing in this case.
                                 */
-                                if (vma->vm_file) {
+                                if (vma->vm_file)
                                        unmap_hugepage_range(vma, start, end, NULL);
-                                        zap_work -= (end - start) /
-                                        pages_per_huge_page(hstate_vma(vma));
-                                }
                                start = end;
                        } else
-                                start = unmap_page_range(tlb, vma,
+                                start = unmap_page_range(tlb, vma, start, end, details);
-                                                start, end, &zap_work, details);
-                        if (zap_work > 0) {
-                                BUG_ON(start != end);
-                                break;
-                        }
-                        if (need_resched() ||
-                                (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
-                                if (i_mmap_lock)
-                                        goto out;
-                                cond_resched();
-                        }
-                        zap_work = ZAP_BLOCK_SIZE;
                }
        }
-out:
        mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
        return start;   /* which is now the end (or restart) address */
 }
@@ -2612,96 +2579,11 @@ unwritable_page:
        return ret;
 }
-/*
+static void unmap_mapping_range_vma(struct vm_area_struct *vma,
- * Helper functions for unmap_mapping_range().
- *
- * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
- *
- * We have to restart searching the prio_tree whenever we drop the lock,
- * since the iterator is only valid while the lock is held, and anyway
- * a later vma might be split and reinserted earlier while lock dropped.
- *
- * The list of nonlinear vmas could be handled more efficiently, using
- * a placeholder, but handle it in the same way until a need is shown.
- * It is important to search the prio_tree before nonlinear list: a vma
- * may become nonlinear and be shifted from prio_tree to nonlinear list
- * while the lock is dropped; but never shifted from list to prio_tree.
- *
- * In order to make forward progress despite restarting the search,
- * vm_truncate_count is used to mark a vma as now dealt with, so we can
- * quickly skip it next time around.  Since the prio_tree search only
- * shows us those vmas affected by unmapping the range in question, we
- * can't efficiently keep all vmas in step with mapping->truncate_count:
- * so instead reset them all whenever it wraps back to 0 (then go to 1).
- * mapping->truncate_count and vma->vm_truncate_count are protected by
- * i_mmap_lock.
- *
- * In order to make forward progress despite repeatedly restarting some
- * large vma, note the restart_addr from unmap_vmas when it breaks out:
- * and restart from that address when we reach that vma again.  It might
- * have been split or merged, shrunk or extended, but never shifted: so
- * restart_addr remains valid so long as it remains in the vma's range.
- * unmap_mapping_range forces truncate_count to leap over page-aligned
- * values so we can save vma's restart_addr in its truncate_count field.
- */
-#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
-static void reset_vma_truncate_counts(struct address_space *mapping)
-{
-        struct vm_area_struct *vma;
-        struct prio_tree_iter iter;
-        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
-                vma->vm_truncate_count = 0;
-        list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
-                vma->vm_truncate_count = 0;
-}
-static int unmap_mapping_range_vma(struct vm_area_struct *vma,
                unsigned long start_addr, unsigned long end_addr,
                struct zap_details *details)
 {
-        unsigned long restart_addr;
+        zap_page_range(vma, start_addr, end_addr - start_addr, details);
-        int need_break;
-        /*
-         * files that support invalidating or truncating portions of the
-         * file from under mmaped areas must have their ->fault function
-         * return a locked page (and set VM_FAULT_LOCKED in the return).
-         * This provides synchronisation against concurrent unmapping here.
-         */
-again:
-        restart_addr = vma->vm_truncate_count;
-        if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
-                start_addr = restart_addr;
-                if (start_addr >= end_addr) {
-                        /* Top of vma has been split off since last time */
-                        vma->vm_truncate_count = details->truncate_count;
-                        return 0;
-                }
-        }
-        restart_addr = zap_page_range(vma, start_addr,
-                                        end_addr - start_addr, details);
-        need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
-        if (restart_addr >= end_addr) {
-                /* We have now completed this vma: mark it so */
-                vma->vm_truncate_count = details->truncate_count;
-                if (!need_break)
-                        return 0;
-        } else {
-                /* Note restart_addr in vma's truncate_count field */
-                vma->vm_truncate_count = restart_addr;
-                if (!need_break)
-                        goto again;
-        }
-        spin_unlock(details->i_mmap_lock);
-        cond_resched();
-        spin_lock(details->i_mmap_lock);
-        return -EINTR;
 }
 static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
@@ -2711,12 +2593,8 @@ static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
        struct prio_tree_iter iter;
        pgoff_t vba, vea, zba, zea;
-restart:
        vma_prio_tree_foreach(vma, &iter, root,
                        details->first_index, details->last_index) {
-                /* Skip quickly over those we have already dealt with */
-                if (vma->vm_truncate_count == details->truncate_count)
-                        continue;
                vba = vma->vm_pgoff;
                vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
@@ -2728,11 +2606,10 @@ restart:
                if (zea > vea)
                        zea = vea;
-                if (unmap_mapping_range_vma(vma,
+                unmap_mapping_range_vma(vma,
                        ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
                        ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
-                                details) < 0)
+                                details);
-                        goto restart;
        }
 }
@@ -2747,15 +2624,9 @@ static inline void unmap_mapping_range_list(struct list_head *head,
         * across *all* the pages in each nonlinear VMA, not just the pages
         * whose virtual address lies outside the file truncation point.
         */
-restart:
        list_for_each_entry(vma, head, shared.vm_set.list) {
-                /* Skip quickly over those we have already dealt with */
-                if (vma->vm_truncate_count == details->truncate_count)
-                        continue;
                details->nonlinear_vma = vma;
-                if (unmap_mapping_range_vma(vma, vma->vm_start,
+                unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
-                                        vma->vm_end, details) < 0)
-                        goto restart;
        }
 }
@@ -2794,26 +2665,14 @@ void unmap_mapping_range(struct address_space *mapping,
        details.last_index = hba + hlen - 1;
        if (details.last_index < details.first_index)
                details.last_index = ULONG_MAX;
-        details.i_mmap_lock = &mapping->i_mmap_lock;
-        mutex_lock(&mapping->unmap_mutex);
-        spin_lock(&mapping->i_mmap_lock);
-        /* Protect against endless unmapping loops */
-        mapping->truncate_count++;
-        if (unlikely(is_restart_addr(mapping->truncate_count))) {
-                if (mapping->truncate_count == 0)
-                        reset_vma_truncate_counts(mapping);
-                mapping->truncate_count++;
-        }
-        details.truncate_count = mapping->truncate_count;
+        spin_lock(&mapping->i_mmap_lock);
        if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
                unmap_mapping_range_tree(&mapping->i_mmap, &details);
        if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
                unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
        spin_unlock(&mapping->i_mmap_lock);
-        mutex_unlock(&mapping->unmap_mutex);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
author	Peter Zijlstra <a.p.zijlstra@chello.nl>	2011-05-24 20:12:04 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-05-25 11:39:17 -0400
commit	97a894136f29802da19a15541de3c019e1ca147e (patch)
tree	1fd3f92ba92a37d5d8527a1f41458091d0a944dc /mm/memory.c
parent	e4c70a6629f9c74c4b0de258a3951890e9047c82 (diff)

diff --git a/mm/memory.c b/mm/memory.c
index 17193d74f302..18655878b9f8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -986,13 +986,13 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
986	static unsigned long zap_pte_range(struct mmu_gather *tlb,	986	static unsigned long zap_pte_range(struct mmu_gather *tlb,
987	struct vm_area_struct *vma, pmd_t *pmd,	987	struct vm_area_struct *vma, pmd_t *pmd,
988	unsigned long addr, unsigned long end,	988	unsigned long addr, unsigned long end,
989	long *zap_work, struct zap_details *details)	989	struct zap_details *details)
990	{	990	{
991	struct mm_struct *mm = tlb->mm;	991	struct mm_struct *mm = tlb->mm;
992	int force_flush = 0;	992	int force_flush = 0;
993	pte_t *pte;
994	spinlock_t *ptl;
995	int rss[NR_MM_COUNTERS];	993	int rss[NR_MM_COUNTERS];
		994	spinlock_t *ptl;
		995	pte_t *pte;
996		996
997	again:	997	again:
998	init_rss_vec(rss);	998	init_rss_vec(rss);
@@ -1001,12 +1001,9 @@ again:
1001	do {	1001	do {
1002	pte_t ptent = *pte;	1002	pte_t ptent = *pte;
1003	if (pte_none(ptent)) {	1003	if (pte_none(ptent)) {
1004	(*zap_work)--;
1005	continue;	1004	continue;
1006	}	1005	}
1007		1006
1008	(*zap_work) -= PAGE_SIZE;
1009
1010	if (pte_present(ptent)) {	1007	if (pte_present(ptent)) {
1011	struct page *page;	1008	struct page *page;
1012		1009
@@ -1075,7 +1072,7 @@ again:
1075	print_bad_pte(vma, addr, ptent, NULL);	1072	print_bad_pte(vma, addr, ptent, NULL);
1076	}	1073	}
1077	pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);	1074	pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1078	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));	1075	} while (pte++, addr += PAGE_SIZE, addr != end);
1079		1076
1080	add_mm_rss_vec(mm, rss);	1077	add_mm_rss_vec(mm, rss);
1081	arch_leave_lazy_mmu_mode();	1078	arch_leave_lazy_mmu_mode();
@@ -1099,7 +1096,7 @@ again:
1099	static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,	1096	static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1100	struct vm_area_struct *vma, pud_t *pud,	1097	struct vm_area_struct *vma, pud_t *pud,
1101	unsigned long addr, unsigned long end,	1098	unsigned long addr, unsigned long end,
1102	long *zap_work, struct zap_details *details)	1099	struct zap_details *details)
1103	{	1100	{
1104	pmd_t *pmd;	1101	pmd_t *pmd;
1105	unsigned long next;	1102	unsigned long next;
@@ -1111,19 +1108,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1111	if (next-addr != HPAGE_PMD_SIZE) {	1108	if (next-addr != HPAGE_PMD_SIZE) {
1112	VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));	1109	VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1113	split_huge_page_pmd(vma->vm_mm, pmd);	1110	split_huge_page_pmd(vma->vm_mm, pmd);
1114	} else if (zap_huge_pmd(tlb, vma, pmd)) {	1111	} else if (zap_huge_pmd(tlb, vma, pmd))
1115	(*zap_work)--;
1116	continue;	1112	continue;
1117	}
1118	/* fall through */	1113	/* fall through */
1119	}	1114	}
1120	if (pmd_none_or_clear_bad(pmd)) {	1115	if (pmd_none_or_clear_bad(pmd))
1121	(*zap_work)--;
1122	continue;	1116	continue;
1123	}	1117	next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1124	next = zap_pte_range(tlb, vma, pmd, addr, next,	1118	cond_resched();
1125	zap_work, details);	1119	} while (pmd++, addr = next, addr != end);
1126	} while (pmd++, addr = next, (addr != end && *zap_work > 0));
1127		1120
1128	return addr;	1121	return addr;
1129	}	1122	}
@@ -1131,7 +1124,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1131	static inline unsigned long zap_pud_range(struct mmu_gather *tlb,	1124	static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1132	struct vm_area_struct *vma, pgd_t *pgd,	1125	struct vm_area_struct *vma, pgd_t *pgd,
1133	unsigned long addr, unsigned long end,	1126	unsigned long addr, unsigned long end,
1134	long *zap_work, struct zap_details *details)	1127	struct zap_details *details)
1135	{	1128	{
1136	pud_t *pud;	1129	pud_t *pud;
1137	unsigned long next;	1130	unsigned long next;
@@ -1139,13 +1132,10 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1139	pud = pud_offset(pgd, addr);	1132	pud = pud_offset(pgd, addr);
1140	do {	1133	do {
1141	next = pud_addr_end(addr, end);	1134	next = pud_addr_end(addr, end);
1142	if (pud_none_or_clear_bad(pud)) {	1135	if (pud_none_or_clear_bad(pud))
1143	(*zap_work)--;
1144	continue;	1136	continue;
1145	}	1137	next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1146	next = zap_pmd_range(tlb, vma, pud, addr, next,	1138	} while (pud++, addr = next, addr != end);
1147	zap_work, details);
1148	} while (pud++, addr = next, (addr != end && *zap_work > 0));
1149		1139
1150	return addr;	1140	return addr;
1151	}	1141	}
@@ -1153,7 +1143,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1153	static unsigned long unmap_page_range(struct mmu_gather *tlb,	1143	static unsigned long unmap_page_range(struct mmu_gather *tlb,
1154	struct vm_area_struct *vma,	1144	struct vm_area_struct *vma,
1155	unsigned long addr, unsigned long end,	1145	unsigned long addr, unsigned long end,
1156	long *zap_work, struct zap_details *details)	1146	struct zap_details *details)
1157	{	1147	{
1158	pgd_t *pgd;	1148	pgd_t *pgd;
1159	unsigned long next;	1149	unsigned long next;
@@ -1167,13 +1157,10 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1167	pgd = pgd_offset(vma->vm_mm, addr);	1157	pgd = pgd_offset(vma->vm_mm, addr);
1168	do {	1158	do {
1169	next = pgd_addr_end(addr, end);	1159	next = pgd_addr_end(addr, end);
1170	if (pgd_none_or_clear_bad(pgd)) {	1160	if (pgd_none_or_clear_bad(pgd))
1171	(*zap_work)--;
1172	continue;	1161	continue;
1173	}	1162	next = zap_pud_range(tlb, vma, pgd, addr, next, details);
1174	next = zap_pud_range(tlb, vma, pgd, addr, next,	1163	} while (pgd++, addr = next, addr != end);
1175	zap_work, details);
1176	} while (pgd++, addr = next, (addr != end && *zap_work > 0));
1177	tlb_end_vma(tlb, vma);	1164	tlb_end_vma(tlb, vma);
1178	mem_cgroup_uncharge_end();	1165	mem_cgroup_uncharge_end();
1179		1166
@@ -1218,9 +1205,7 @@ unsigned long unmap_vmas(struct mmu_gather *tlb,
1218	unsigned long end_addr, unsigned long *nr_accounted,	1205	unsigned long end_addr, unsigned long *nr_accounted,
1219	struct zap_details *details)	1206	struct zap_details *details)
1220	{	1207	{
1221	long zap_work = ZAP_BLOCK_SIZE;
1222	unsigned long start = start_addr;	1208	unsigned long start = start_addr;
1223	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
1224	struct mm_struct *mm = vma->vm_mm;	1209	struct mm_struct *mm = vma->vm_mm;
1225		1210
1226	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);	1211	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
@@ -1253,33 +1238,15 @@ unsigned long unmap_vmas(struct mmu_gather *tlb,
1253	* Since no pte has actually been setup, it is	1238	* Since no pte has actually been setup, it is
1254	* safe to do nothing in this case.	1239	* safe to do nothing in this case.
1255	*/	1240	*/
1256	if (vma->vm_file) {	1241	if (vma->vm_file)
1257	unmap_hugepage_range(vma, start, end, NULL);	1242	unmap_hugepage_range(vma, start, end, NULL);
1258	zap_work -= (end - start) /
1259	pages_per_huge_page(hstate_vma(vma));
1260	}
1261		1243
1262	start = end;	1244	start = end;
1263	} else	1245	} else
1264	start = unmap_page_range(tlb, vma,	1246	start = unmap_page_range(tlb, vma, start, end, details);
1265	start, end, &zap_work, details);
1266
1267	if (zap_work > 0) {
1268	BUG_ON(start != end);
1269	break;
1270	}
1271
1272	if (need_resched() ||
1273	(i_mmap_lock && spin_needbreak(i_mmap_lock))) {
1274	if (i_mmap_lock)
1275	goto out;
1276	cond_resched();
1277	}
1278
1279	zap_work = ZAP_BLOCK_SIZE;
1280	}	1247	}
1281	}	1248	}
1282	out:	1249
1283	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);	1250	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1284	return start; /* which is now the end (or restart) address */	1251	return start; /* which is now the end (or restart) address */
1285	}	1252	}
@@ -2612,96 +2579,11 @@ unwritable_page:
2612	return ret;	2579	return ret;
2613	}	2580	}
2614		2581
2615	/*	2582	static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2616	* Helper functions for unmap_mapping_range().
2617	*
2618	* __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
2619	*
2620	* We have to restart searching the prio_tree whenever we drop the lock,
2621	* since the iterator is only valid while the lock is held, and anyway
2622	* a later vma might be split and reinserted earlier while lock dropped.
2623	*
2624	* The list of nonlinear vmas could be handled more efficiently, using
2625	* a placeholder, but handle it in the same way until a need is shown.
2626	* It is important to search the prio_tree before nonlinear list: a vma
2627	* may become nonlinear and be shifted from prio_tree to nonlinear list
2628	* while the lock is dropped; but never shifted from list to prio_tree.
2629	*
2630	* In order to make forward progress despite restarting the search,
2631	* vm_truncate_count is used to mark a vma as now dealt with, so we can
2632	* quickly skip it next time around. Since the prio_tree search only
2633	* shows us those vmas affected by unmapping the range in question, we
2634	* can't efficiently keep all vmas in step with mapping->truncate_count:
2635	* so instead reset them all whenever it wraps back to 0 (then go to 1).
2636	* mapping->truncate_count and vma->vm_truncate_count are protected by
2637	* i_mmap_lock.
2638	*
2639	* In order to make forward progress despite repeatedly restarting some
2640	* large vma, note the restart_addr from unmap_vmas when it breaks out:
2641	* and restart from that address when we reach that vma again. It might
2642	* have been split or merged, shrunk or extended, but never shifted: so
2643	* restart_addr remains valid so long as it remains in the vma's range.
2644	* unmap_mapping_range forces truncate_count to leap over page-aligned
2645	* values so we can save vma's restart_addr in its truncate_count field.
2646	*/
2647	#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
2648
2649	static void reset_vma_truncate_counts(struct address_space *mapping)
2650	{
2651	struct vm_area_struct *vma;
2652	struct prio_tree_iter iter;
2653
2654	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
2655	vma->vm_truncate_count = 0;
2656	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
2657	vma->vm_truncate_count = 0;
2658	}
2659
2660	static int unmap_mapping_range_vma(struct vm_area_struct *vma,
2661	unsigned long start_addr, unsigned long end_addr,	2583	unsigned long start_addr, unsigned long end_addr,
2662	struct zap_details *details)	2584	struct zap_details *details)
2663	{	2585	{
2664	unsigned long restart_addr;	2586	zap_page_range(vma, start_addr, end_addr - start_addr, details);
2665	int need_break;
2666
2667	/*
2668	* files that support invalidating or truncating portions of the
2669	* file from under mmaped areas must have their ->fault function
2670	* return a locked page (and set VM_FAULT_LOCKED in the return).
2671	* This provides synchronisation against concurrent unmapping here.
2672	*/
2673
2674	again:
2675	restart_addr = vma->vm_truncate_count;
2676	if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
2677	start_addr = restart_addr;
2678	if (start_addr >= end_addr) {
2679	/* Top of vma has been split off since last time */
2680	vma->vm_truncate_count = details->truncate_count;
2681	return 0;
2682	}
2683	}
2684
2685	restart_addr = zap_page_range(vma, start_addr,
2686	end_addr - start_addr, details);
2687	need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
2688
2689	if (restart_addr >= end_addr) {
2690	/* We have now completed this vma: mark it so */
2691	vma->vm_truncate_count = details->truncate_count;
2692	if (!need_break)
2693	return 0;
2694	} else {
2695	/* Note restart_addr in vma's truncate_count field */
2696	vma->vm_truncate_count = restart_addr;
2697	if (!need_break)
2698	goto again;
2699	}
2700
2701	spin_unlock(details->i_mmap_lock);
2702	cond_resched();
2703	spin_lock(details->i_mmap_lock);
2704	return -EINTR;
2705	}	2587	}
2706		2588
2707	static inline void unmap_mapping_range_tree(struct prio_tree_root *root,	2589	static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
@@ -2711,12 +2593,8 @@ static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
2711	struct prio_tree_iter iter;	2593	struct prio_tree_iter iter;
2712	pgoff_t vba, vea, zba, zea;	2594	pgoff_t vba, vea, zba, zea;
2713		2595
2714	restart:
2715	vma_prio_tree_foreach(vma, &iter, root,	2596	vma_prio_tree_foreach(vma, &iter, root,
2716	details->first_index, details->last_index) {	2597	details->first_index, details->last_index) {
2717	/* Skip quickly over those we have already dealt with */
2718	if (vma->vm_truncate_count == details->truncate_count)
2719	continue;
2720		2598
2721	vba = vma->vm_pgoff;	2599	vba = vma->vm_pgoff;
2722	vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;	2600	vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
@@ -2728,11 +2606,10 @@ restart:
2728	if (zea > vea)	2606	if (zea > vea)
2729	zea = vea;	2607	zea = vea;
2730		2608
2731	if (unmap_mapping_range_vma(vma,	2609	unmap_mapping_range_vma(vma,
2732	((zba - vba) << PAGE_SHIFT) + vma->vm_start,	2610	((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2733	((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,	2611	((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2734	details) < 0)	2612	details);
2735	goto restart;
2736	}	2613	}
2737	}	2614	}
2738		2615
@@ -2747,15 +2624,9 @@ static inline void unmap_mapping_range_list(struct list_head *head,
2747	* across *all* the pages in each nonlinear VMA, not just the pages	2624	* across *all* the pages in each nonlinear VMA, not just the pages
2748	* whose virtual address lies outside the file truncation point.	2625	* whose virtual address lies outside the file truncation point.
2749	*/	2626	*/
2750	restart:
2751	list_for_each_entry(vma, head, shared.vm_set.list) {	2627	list_for_each_entry(vma, head, shared.vm_set.list) {
2752	/* Skip quickly over those we have already dealt with */
2753	if (vma->vm_truncate_count == details->truncate_count)
2754	continue;
2755	details->nonlinear_vma = vma;	2628	details->nonlinear_vma = vma;
2756	if (unmap_mapping_range_vma(vma, vma->vm_start,	2629	unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2757	vma->vm_end, details) < 0)
2758	goto restart;
2759	}	2630	}
2760	}	2631	}
2761		2632
@@ -2794,26 +2665,14 @@ void unmap_mapping_range(struct address_space *mapping,
2794	details.last_index = hba + hlen - 1;	2665	details.last_index = hba + hlen - 1;
2795	if (details.last_index < details.first_index)	2666	if (details.last_index < details.first_index)
2796	details.last_index = ULONG_MAX;	2667	details.last_index = ULONG_MAX;
2797	details.i_mmap_lock = &mapping->i_mmap_lock;
2798		2668
2799	mutex_lock(&mapping->unmap_mutex);
2800	spin_lock(&mapping->i_mmap_lock);
2801
2802	/* Protect against endless unmapping loops */
2803	mapping->truncate_count++;
2804	if (unlikely(is_restart_addr(mapping->truncate_count))) {
2805	if (mapping->truncate_count == 0)
2806	reset_vma_truncate_counts(mapping);
2807	mapping->truncate_count++;
2808	}
2809	details.truncate_count = mapping->truncate_count;
2810		2669
		2670	spin_lock(&mapping->i_mmap_lock);
2811	if (unlikely(!prio_tree_empty(&mapping->i_mmap)))	2671	if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
2812	unmap_mapping_range_tree(&mapping->i_mmap, &details);	2672	unmap_mapping_range_tree(&mapping->i_mmap, &details);
2813	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))	2673	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2814	unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);	2674	unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2815	spin_unlock(&mapping->i_mmap_lock);	2675	spin_unlock(&mapping->i_mmap_lock);
2816	mutex_unlock(&mapping->unmap_mutex);
2817	}	2676	}
2818	EXPORT_SYMBOL(unmap_mapping_range);	2677	EXPORT_SYMBOL(unmap_mapping_range);
2819		2678