aboutsummaryrefslogtreecommitdiffstats
path: root/mm/memory.c
diff options
context:
space:
mode:
authorPeter Zijlstra <a.p.zijlstra@chello.nl>2011-05-24 20:12:04 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2011-05-25 11:39:17 -0400
commit97a894136f29802da19a15541de3c019e1ca147e (patch)
tree1fd3f92ba92a37d5d8527a1f41458091d0a944dc /mm/memory.c
parente4c70a6629f9c74c4b0de258a3951890e9047c82 (diff)
mm: Remove i_mmap_lock lockbreak
Hugh says: "The only significant loser, I think, would be page reclaim (when concurrent with truncation): could spin for a long time waiting for the i_mmap_mutex it expects would soon be dropped? " Counter points: - cpu contention makes the spin stop (need_resched()) - zap pages should be freeing pages at a higher rate than reclaim ever can I think the simplification of the truncate code is definitely worth it. Effectively reverts: 2aa15890f3c ("mm: prevent concurrent unmap_mapping_range() on the same inode") and takes out the code that caused its problem. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Hugh Dickins <hughd@google.com> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: David Miller <davem@davemloft.net> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Russell King <rmk@arm.linux.org.uk> Cc: Paul Mundt <lethal@linux-sh.org> Cc: Jeff Dike <jdike@addtoit.com> Cc: Richard Weinberger <richard@nod.at> Cc: Tony Luck <tony.luck@intel.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Nick Piggin <npiggin@kernel.dk> Cc: Namhyung Kim <namhyung@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memory.c')
-rw-r--r--mm/memory.c195
1 files changed, 27 insertions, 168 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 17193d74f302..18655878b9f8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -986,13 +986,13 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
986static unsigned long zap_pte_range(struct mmu_gather *tlb, 986static unsigned long zap_pte_range(struct mmu_gather *tlb,
987 struct vm_area_struct *vma, pmd_t *pmd, 987 struct vm_area_struct *vma, pmd_t *pmd,
988 unsigned long addr, unsigned long end, 988 unsigned long addr, unsigned long end,
989 long *zap_work, struct zap_details *details) 989 struct zap_details *details)
990{ 990{
991 struct mm_struct *mm = tlb->mm; 991 struct mm_struct *mm = tlb->mm;
992 int force_flush = 0; 992 int force_flush = 0;
993 pte_t *pte;
994 spinlock_t *ptl;
995 int rss[NR_MM_COUNTERS]; 993 int rss[NR_MM_COUNTERS];
994 spinlock_t *ptl;
995 pte_t *pte;
996 996
997again: 997again:
998 init_rss_vec(rss); 998 init_rss_vec(rss);
@@ -1001,12 +1001,9 @@ again:
1001 do { 1001 do {
1002 pte_t ptent = *pte; 1002 pte_t ptent = *pte;
1003 if (pte_none(ptent)) { 1003 if (pte_none(ptent)) {
1004 (*zap_work)--;
1005 continue; 1004 continue;
1006 } 1005 }
1007 1006
1008 (*zap_work) -= PAGE_SIZE;
1009
1010 if (pte_present(ptent)) { 1007 if (pte_present(ptent)) {
1011 struct page *page; 1008 struct page *page;
1012 1009
@@ -1075,7 +1072,7 @@ again:
1075 print_bad_pte(vma, addr, ptent, NULL); 1072 print_bad_pte(vma, addr, ptent, NULL);
1076 } 1073 }
1077 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 1074 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1078 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); 1075 } while (pte++, addr += PAGE_SIZE, addr != end);
1079 1076
1080 add_mm_rss_vec(mm, rss); 1077 add_mm_rss_vec(mm, rss);
1081 arch_leave_lazy_mmu_mode(); 1078 arch_leave_lazy_mmu_mode();
@@ -1099,7 +1096,7 @@ again:
1099static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, 1096static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1100 struct vm_area_struct *vma, pud_t *pud, 1097 struct vm_area_struct *vma, pud_t *pud,
1101 unsigned long addr, unsigned long end, 1098 unsigned long addr, unsigned long end,
1102 long *zap_work, struct zap_details *details) 1099 struct zap_details *details)
1103{ 1100{
1104 pmd_t *pmd; 1101 pmd_t *pmd;
1105 unsigned long next; 1102 unsigned long next;
@@ -1111,19 +1108,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1111 if (next-addr != HPAGE_PMD_SIZE) { 1108 if (next-addr != HPAGE_PMD_SIZE) {
1112 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); 1109 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1113 split_huge_page_pmd(vma->vm_mm, pmd); 1110 split_huge_page_pmd(vma->vm_mm, pmd);
1114 } else if (zap_huge_pmd(tlb, vma, pmd)) { 1111 } else if (zap_huge_pmd(tlb, vma, pmd))
1115 (*zap_work)--;
1116 continue; 1112 continue;
1117 }
1118 /* fall through */ 1113 /* fall through */
1119 } 1114 }
1120 if (pmd_none_or_clear_bad(pmd)) { 1115 if (pmd_none_or_clear_bad(pmd))
1121 (*zap_work)--;
1122 continue; 1116 continue;
1123 } 1117 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1124 next = zap_pte_range(tlb, vma, pmd, addr, next, 1118 cond_resched();
1125 zap_work, details); 1119 } while (pmd++, addr = next, addr != end);
1126 } while (pmd++, addr = next, (addr != end && *zap_work > 0));
1127 1120
1128 return addr; 1121 return addr;
1129} 1122}
@@ -1131,7 +1124,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1131static inline unsigned long zap_pud_range(struct mmu_gather *tlb, 1124static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1132 struct vm_area_struct *vma, pgd_t *pgd, 1125 struct vm_area_struct *vma, pgd_t *pgd,
1133 unsigned long addr, unsigned long end, 1126 unsigned long addr, unsigned long end,
1134 long *zap_work, struct zap_details *details) 1127 struct zap_details *details)
1135{ 1128{
1136 pud_t *pud; 1129 pud_t *pud;
1137 unsigned long next; 1130 unsigned long next;
@@ -1139,13 +1132,10 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1139 pud = pud_offset(pgd, addr); 1132 pud = pud_offset(pgd, addr);
1140 do { 1133 do {
1141 next = pud_addr_end(addr, end); 1134 next = pud_addr_end(addr, end);
1142 if (pud_none_or_clear_bad(pud)) { 1135 if (pud_none_or_clear_bad(pud))
1143 (*zap_work)--;
1144 continue; 1136 continue;
1145 } 1137 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1146 next = zap_pmd_range(tlb, vma, pud, addr, next, 1138 } while (pud++, addr = next, addr != end);
1147 zap_work, details);
1148 } while (pud++, addr = next, (addr != end && *zap_work > 0));
1149 1139
1150 return addr; 1140 return addr;
1151} 1141}
@@ -1153,7 +1143,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1153static unsigned long unmap_page_range(struct mmu_gather *tlb, 1143static unsigned long unmap_page_range(struct mmu_gather *tlb,
1154 struct vm_area_struct *vma, 1144 struct vm_area_struct *vma,
1155 unsigned long addr, unsigned long end, 1145 unsigned long addr, unsigned long end,
1156 long *zap_work, struct zap_details *details) 1146 struct zap_details *details)
1157{ 1147{
1158 pgd_t *pgd; 1148 pgd_t *pgd;
1159 unsigned long next; 1149 unsigned long next;
@@ -1167,13 +1157,10 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1167 pgd = pgd_offset(vma->vm_mm, addr); 1157 pgd = pgd_offset(vma->vm_mm, addr);
1168 do { 1158 do {
1169 next = pgd_addr_end(addr, end); 1159 next = pgd_addr_end(addr, end);
1170 if (pgd_none_or_clear_bad(pgd)) { 1160 if (pgd_none_or_clear_bad(pgd))
1171 (*zap_work)--;
1172 continue; 1161 continue;
1173 } 1162 next = zap_pud_range(tlb, vma, pgd, addr, next, details);
1174 next = zap_pud_range(tlb, vma, pgd, addr, next, 1163 } while (pgd++, addr = next, addr != end);
1175 zap_work, details);
1176 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
1177 tlb_end_vma(tlb, vma); 1164 tlb_end_vma(tlb, vma);
1178 mem_cgroup_uncharge_end(); 1165 mem_cgroup_uncharge_end();
1179 1166
@@ -1218,9 +1205,7 @@ unsigned long unmap_vmas(struct mmu_gather *tlb,
1218 unsigned long end_addr, unsigned long *nr_accounted, 1205 unsigned long end_addr, unsigned long *nr_accounted,
1219 struct zap_details *details) 1206 struct zap_details *details)
1220{ 1207{
1221 long zap_work = ZAP_BLOCK_SIZE;
1222 unsigned long start = start_addr; 1208 unsigned long start = start_addr;
1223 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
1224 struct mm_struct *mm = vma->vm_mm; 1209 struct mm_struct *mm = vma->vm_mm;
1225 1210
1226 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); 1211 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
@@ -1253,33 +1238,15 @@ unsigned long unmap_vmas(struct mmu_gather *tlb,
1253 * Since no pte has actually been setup, it is 1238 * Since no pte has actually been setup, it is
1254 * safe to do nothing in this case. 1239 * safe to do nothing in this case.
1255 */ 1240 */
1256 if (vma->vm_file) { 1241 if (vma->vm_file)
1257 unmap_hugepage_range(vma, start, end, NULL); 1242 unmap_hugepage_range(vma, start, end, NULL);
1258 zap_work -= (end - start) /
1259 pages_per_huge_page(hstate_vma(vma));
1260 }
1261 1243
1262 start = end; 1244 start = end;
1263 } else 1245 } else
1264 start = unmap_page_range(tlb, vma, 1246 start = unmap_page_range(tlb, vma, start, end, details);
1265 start, end, &zap_work, details);
1266
1267 if (zap_work > 0) {
1268 BUG_ON(start != end);
1269 break;
1270 }
1271
1272 if (need_resched() ||
1273 (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
1274 if (i_mmap_lock)
1275 goto out;
1276 cond_resched();
1277 }
1278
1279 zap_work = ZAP_BLOCK_SIZE;
1280 } 1247 }
1281 } 1248 }
1282out: 1249
1283 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); 1250 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1284 return start; /* which is now the end (or restart) address */ 1251 return start; /* which is now the end (or restart) address */
1285} 1252}
@@ -2612,96 +2579,11 @@ unwritable_page:
2612 return ret; 2579 return ret;
2613} 2580}
2614 2581
2615/* 2582static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2616 * Helper functions for unmap_mapping_range().
2617 *
2618 * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
2619 *
2620 * We have to restart searching the prio_tree whenever we drop the lock,
2621 * since the iterator is only valid while the lock is held, and anyway
2622 * a later vma might be split and reinserted earlier while lock dropped.
2623 *
2624 * The list of nonlinear vmas could be handled more efficiently, using
2625 * a placeholder, but handle it in the same way until a need is shown.
2626 * It is important to search the prio_tree before nonlinear list: a vma
2627 * may become nonlinear and be shifted from prio_tree to nonlinear list
2628 * while the lock is dropped; but never shifted from list to prio_tree.
2629 *
2630 * In order to make forward progress despite restarting the search,
2631 * vm_truncate_count is used to mark a vma as now dealt with, so we can
2632 * quickly skip it next time around. Since the prio_tree search only
2633 * shows us those vmas affected by unmapping the range in question, we
2634 * can't efficiently keep all vmas in step with mapping->truncate_count:
2635 * so instead reset them all whenever it wraps back to 0 (then go to 1).
2636 * mapping->truncate_count and vma->vm_truncate_count are protected by
2637 * i_mmap_lock.
2638 *
2639 * In order to make forward progress despite repeatedly restarting some
2640 * large vma, note the restart_addr from unmap_vmas when it breaks out:
2641 * and restart from that address when we reach that vma again. It might
2642 * have been split or merged, shrunk or extended, but never shifted: so
2643 * restart_addr remains valid so long as it remains in the vma's range.
2644 * unmap_mapping_range forces truncate_count to leap over page-aligned
2645 * values so we can save vma's restart_addr in its truncate_count field.
2646 */
2647#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
2648
2649static void reset_vma_truncate_counts(struct address_space *mapping)
2650{
2651 struct vm_area_struct *vma;
2652 struct prio_tree_iter iter;
2653
2654 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
2655 vma->vm_truncate_count = 0;
2656 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
2657 vma->vm_truncate_count = 0;
2658}
2659
2660static int unmap_mapping_range_vma(struct vm_area_struct *vma,
2661 unsigned long start_addr, unsigned long end_addr, 2583 unsigned long start_addr, unsigned long end_addr,
2662 struct zap_details *details) 2584 struct zap_details *details)
2663{ 2585{
2664 unsigned long restart_addr; 2586 zap_page_range(vma, start_addr, end_addr - start_addr, details);
2665 int need_break;
2666
2667 /*
2668 * files that support invalidating or truncating portions of the
2669 * file from under mmaped areas must have their ->fault function
2670 * return a locked page (and set VM_FAULT_LOCKED in the return).
2671 * This provides synchronisation against concurrent unmapping here.
2672 */
2673
2674again:
2675 restart_addr = vma->vm_truncate_count;
2676 if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
2677 start_addr = restart_addr;
2678 if (start_addr >= end_addr) {
2679 /* Top of vma has been split off since last time */
2680 vma->vm_truncate_count = details->truncate_count;
2681 return 0;
2682 }
2683 }
2684
2685 restart_addr = zap_page_range(vma, start_addr,
2686 end_addr - start_addr, details);
2687 need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
2688
2689 if (restart_addr >= end_addr) {
2690 /* We have now completed this vma: mark it so */
2691 vma->vm_truncate_count = details->truncate_count;
2692 if (!need_break)
2693 return 0;
2694 } else {
2695 /* Note restart_addr in vma's truncate_count field */
2696 vma->vm_truncate_count = restart_addr;
2697 if (!need_break)
2698 goto again;
2699 }
2700
2701 spin_unlock(details->i_mmap_lock);
2702 cond_resched();
2703 spin_lock(details->i_mmap_lock);
2704 return -EINTR;
2705} 2587}
2706 2588
2707static inline void unmap_mapping_range_tree(struct prio_tree_root *root, 2589static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
@@ -2711,12 +2593,8 @@ static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
2711 struct prio_tree_iter iter; 2593 struct prio_tree_iter iter;
2712 pgoff_t vba, vea, zba, zea; 2594 pgoff_t vba, vea, zba, zea;
2713 2595
2714restart:
2715 vma_prio_tree_foreach(vma, &iter, root, 2596 vma_prio_tree_foreach(vma, &iter, root,
2716 details->first_index, details->last_index) { 2597 details->first_index, details->last_index) {
2717 /* Skip quickly over those we have already dealt with */
2718 if (vma->vm_truncate_count == details->truncate_count)
2719 continue;
2720 2598
2721 vba = vma->vm_pgoff; 2599 vba = vma->vm_pgoff;
2722 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; 2600 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
@@ -2728,11 +2606,10 @@ restart:
2728 if (zea > vea) 2606 if (zea > vea)
2729 zea = vea; 2607 zea = vea;
2730 2608
2731 if (unmap_mapping_range_vma(vma, 2609 unmap_mapping_range_vma(vma,
2732 ((zba - vba) << PAGE_SHIFT) + vma->vm_start, 2610 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2733 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, 2611 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2734 details) < 0) 2612 details);
2735 goto restart;
2736 } 2613 }
2737} 2614}
2738 2615
@@ -2747,15 +2624,9 @@ static inline void unmap_mapping_range_list(struct list_head *head,
2747 * across *all* the pages in each nonlinear VMA, not just the pages 2624 * across *all* the pages in each nonlinear VMA, not just the pages
2748 * whose virtual address lies outside the file truncation point. 2625 * whose virtual address lies outside the file truncation point.
2749 */ 2626 */
2750restart:
2751 list_for_each_entry(vma, head, shared.vm_set.list) { 2627 list_for_each_entry(vma, head, shared.vm_set.list) {
2752 /* Skip quickly over those we have already dealt with */
2753 if (vma->vm_truncate_count == details->truncate_count)
2754 continue;
2755 details->nonlinear_vma = vma; 2628 details->nonlinear_vma = vma;
2756 if (unmap_mapping_range_vma(vma, vma->vm_start, 2629 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2757 vma->vm_end, details) < 0)
2758 goto restart;
2759 } 2630 }
2760} 2631}
2761 2632
@@ -2794,26 +2665,14 @@ void unmap_mapping_range(struct address_space *mapping,
2794 details.last_index = hba + hlen - 1; 2665 details.last_index = hba + hlen - 1;
2795 if (details.last_index < details.first_index) 2666 if (details.last_index < details.first_index)
2796 details.last_index = ULONG_MAX; 2667 details.last_index = ULONG_MAX;
2797 details.i_mmap_lock = &mapping->i_mmap_lock;
2798 2668
2799 mutex_lock(&mapping->unmap_mutex);
2800 spin_lock(&mapping->i_mmap_lock);
2801
2802 /* Protect against endless unmapping loops */
2803 mapping->truncate_count++;
2804 if (unlikely(is_restart_addr(mapping->truncate_count))) {
2805 if (mapping->truncate_count == 0)
2806 reset_vma_truncate_counts(mapping);
2807 mapping->truncate_count++;
2808 }
2809 details.truncate_count = mapping->truncate_count;
2810 2669
2670 spin_lock(&mapping->i_mmap_lock);
2811 if (unlikely(!prio_tree_empty(&mapping->i_mmap))) 2671 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
2812 unmap_mapping_range_tree(&mapping->i_mmap, &details); 2672 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2813 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 2673 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2814 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); 2674 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2815 spin_unlock(&mapping->i_mmap_lock); 2675 spin_unlock(&mapping->i_mmap_lock);
2816 mutex_unlock(&mapping->unmap_mutex);
2817} 2676}
2818EXPORT_SYMBOL(unmap_mapping_range); 2677EXPORT_SYMBOL(unmap_mapping_range);
2819 2678