mm: mmu_gather rework

Rework the existing mmu_gather infrastructure. The direct purpose of these patches was to allow preemptible mmu_gather, but even without that I think these patches provide an improvement to the status quo. The first 9 patches rework the mmu_gather infrastructure. For review purposes I've split them into generic and per-arch patches, with the last of those a generic cleanup. The next patch provides generic RCU page-table freeing, and the followup is a patch converting s390 to use this. I've also got 4 patches from DaveM lined up (not included in this series) that use this to implement gup_fast() for sparc64. Then there is one patch that extends the generic mmu_gather batching. After that follow the mm preemptibility patches; these make part of the mm a lot more preemptible. They convert i_mmap_lock and anon_vma->lock to mutexes, which together with the mmu_gather rework makes mmu_gather preemptible as well. Making i_mmap_lock a mutex also enables a clean-up of the truncate code. This also allows for preemptible mmu_notifiers, something that I think XPMEM wants. Furthermore, it removes the new and universally detested unmap_mutex. This patch: Remove the first obstacle towards a fully preemptible mmu_gather. The current scheme assumes mmu_gather is always done with preemption disabled and uses per-cpu storage for the page batches. Change this to try to allocate a page for batching and, in case of failure, use a small on-stack array to make some progress. Preemptible mmu_gather is desired in general and usable once i_mmap_lock becomes a mutex. Doing it before the mutex conversion saves us from having to rework the code by moving the mmu_gather bits inside the pte_lock. Also avoid flushing the tlb batches from under the pte lock; this is useful even without the i_mmap_lock conversion as it significantly reduces pte lock hold times. 
[akpm@linux-foundation.org: fix comment tpyo] Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: David Miller <davem@davemloft.net> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Russell King <rmk@arm.linux.org.uk> Cc: Paul Mundt <lethal@linux-sh.org> Cc: Jeff Dike <jdike@addtoit.com> Cc: Richard Weinberger <richard@nod.at> Cc: Tony Luck <tony.luck@intel.com> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Acked-by: Hugh Dickins <hughd@google.com> Acked-by: Mel Gorman <mel@csn.ul.ie> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Nick Piggin <npiggin@kernel.dk> Cc: Namhyung Kim <namhyung@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Peter Zijlstra <a.p.zijlstra@chello.nl> 2011-05-24 20:11:45 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2011-05-25 11:39:12 -0400
commit: d16dfc550f5326a4000f3322582a7c05dec91d7a (patch)
tree: 8ee963542705cbf2187777f1d3f2b209cbda827a /mm
parent: d05f3169c0fbca16132ec7c2be71685c6de638b5 (diff)
2 files changed, 32 insertions, 32 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 4c6ea10f3d18..19b2d44de9f0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -912,12 +912,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                                long *zap_work, struct zap_details *details)
 {
        struct mm_struct *mm = tlb->mm;
+        int force_flush = 0;
        pte_t *pte;
        spinlock_t *ptl;
        int rss[NR_MM_COUNTERS];
        init_rss_vec(rss);
+again:
        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        arch_enter_lazy_mmu_mode();
        do {
@@ -974,7 +975,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                        page_remove_rmap(page);
                        if (unlikely(page_mapcount(page) < 0))
                                print_bad_pte(vma, addr, ptent, page);
-                        tlb_remove_page(tlb, page);
+                        force_flush = !__tlb_remove_page(tlb, page);
+                        if (force_flush)
+                                break;
                        continue;
                }
                /*
@@ -1001,6 +1004,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(pte - 1, ptl);
+        /*
+         * mmu_gather ran out of room to batch pages, we break out of
+         * the PTE lock to avoid doing the potential expensive TLB invalidate
+         * and page-free while holding it.
+         */
+        if (force_flush) {
+                force_flush = 0;
+                tlb_flush_mmu(tlb);
+                if (addr != end)
+                        goto again;
+        }
        return addr;
 }
@@ -1121,17 +1136,14 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
-unsigned long unmap_vmas(struct mmu_gather **tlbp,
+unsigned long unmap_vmas(struct mmu_gather *tlb,
                struct vm_area_struct *vma, unsigned long start_addr,
                unsigned long end_addr, unsigned long *nr_accounted,
                struct zap_details *details)
 {
        long zap_work = ZAP_BLOCK_SIZE;
-        unsigned long tlb_start = 0;    /* For tlb_finish_mmu */
-        int tlb_start_valid = 0;
        unsigned long start = start_addr;
        spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
-        int fullmm = (*tlbp)->fullmm;
        struct mm_struct *mm = vma->vm_mm;
        mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
@@ -1152,11 +1164,6 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
                        untrack_pfn_vma(vma, 0, 0);
                while (start != end) {
-                        if (!tlb_start_valid) {
-                                tlb_start = start;
-                                tlb_start_valid = 1;
-                        }
                        if (unlikely(is_vm_hugetlb_page(vma))) {
                                /*
                                 * It is undesirable to test vma->vm_file as it
@@ -1177,7 +1184,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
                                start = end;
                        } else
-                                start = unmap_page_range(*tlbp, vma,
+                                start = unmap_page_range(tlb, vma,
                                                start, end, &zap_work, details);
                        if (zap_work > 0) {
@@ -1185,19 +1192,13 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
                                break;
                        }
-                        tlb_finish_mmu(*tlbp, tlb_start, start);
                        if (need_resched() ||
                                (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
-                                if (i_mmap_lock) {
+                                if (i_mmap_lock)
-                                        *tlbp = NULL;
                                        goto out;
-                                }
                                cond_resched();
                        }
-                        *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
-                        tlb_start_valid = 0;
                        zap_work = ZAP_BLOCK_SIZE;
                }
        }
@@ -1217,16 +1218,15 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
                unsigned long size, struct zap_details *details)
 {
        struct mm_struct *mm = vma->vm_mm;
-        struct mmu_gather *tlb;
+        struct mmu_gather tlb;
        unsigned long end = address + size;
        unsigned long nr_accounted = 0;
        lru_add_drain();
-        tlb = tlb_gather_mmu(mm, 0);
+        tlb_gather_mmu(&tlb, mm, 0);
        update_hiwater_rss(mm);
        end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
-        if (tlb)
+        tlb_finish_mmu(&tlb, address, end);
-                tlb_finish_mmu(tlb, address, end);
        return end;
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index adb12527fd0e..40d49986e714 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1903,17 +1903,17 @@ static void unmap_region(struct mm_struct *mm,
                unsigned long start, unsigned long end)
 {
        struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
-        struct mmu_gather *tlb;
+        struct mmu_gather tlb;
        unsigned long nr_accounted = 0;
        lru_add_drain();
-        tlb = tlb_gather_mmu(mm, 0);
+        tlb_gather_mmu(&tlb, mm, 0);
        update_hiwater_rss(mm);
        unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
        vm_unacct_memory(nr_accounted);
-        free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
+        free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
-                                 next? next->vm_start: 0);
+                                 next ? next->vm_start : 0);
-        tlb_finish_mmu(tlb, start, end);
+        tlb_finish_mmu(&tlb, start, end);
 }
 /*
@@ -2255,7 +2255,7 @@ EXPORT_SYMBOL(do_brk);
 /* Release all mmaps. */
 void exit_mmap(struct mm_struct *mm)
 {
-        struct mmu_gather *tlb;
+        struct mmu_gather tlb;
        struct vm_area_struct *vma;
        unsigned long nr_accounted = 0;
        unsigned long end;
@@ -2280,14 +2280,14 @@ void exit_mmap(struct mm_struct *mm)
        lru_add_drain();
        flush_cache_mm(mm);
-        tlb = tlb_gather_mmu(mm, 1);
+        tlb_gather_mmu(&tlb, mm, 1);
        /* update_hiwater_rss(mm) here? but nobody should be looking */
        /* Use -1 here to ensure all VMAs in the mm are unmapped */
        end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
        vm_unacct_memory(nr_accounted);
-        free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
+        free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
-        tlb_finish_mmu(tlb, 0, end);
+        tlb_finish_mmu(&tlb, 0, end);
        /*
         * Walk the list again, actually closing and freeing it,
author	Peter Zijlstra <a.p.zijlstra@chello.nl>	2011-05-24 20:11:45 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-05-25 11:39:12 -0400
commit	d16dfc550f5326a4000f3322582a7c05dec91d7a (patch)
tree	8ee963542705cbf2187777f1d3f2b209cbda827a /mm
parent	d05f3169c0fbca16132ec7c2be71685c6de638b5 (diff)

diff --git a/mm/memory.c b/mm/memory.c index 4c6ea10f3d18..19b2d44de9f0 100644 --- a/mm/memory.c +++ b/mm/memory.c
@@ -912,12 +912,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
912	long zap_work, struct zap_details details)	912	long zap_work, struct zap_details details)
913	{	913	{
914	struct mm_struct *mm = tlb->mm;	914	struct mm_struct *mm = tlb->mm;
		915	int force_flush = 0;
915	pte_t *pte;	916	pte_t *pte;
916	spinlock_t *ptl;	917	spinlock_t *ptl;
917	int rss[NR_MM_COUNTERS];	918	int rss[NR_MM_COUNTERS];
918		919
919	init_rss_vec(rss);	920	init_rss_vec(rss);
920		921	again:
921	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);	922	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
922	arch_enter_lazy_mmu_mode();	923	arch_enter_lazy_mmu_mode();
923	do {	924	do {
@@ -974,7 +975,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
974	page_remove_rmap(page);	975	page_remove_rmap(page);
975	if (unlikely(page_mapcount(page) < 0))	976	if (unlikely(page_mapcount(page) < 0))
976	print_bad_pte(vma, addr, ptent, page);	977	print_bad_pte(vma, addr, ptent, page);
977	tlb_remove_page(tlb, page);	978	force_flush = !__tlb_remove_page(tlb, page);
		979	if (force_flush)
		980	break;
978	continue;	981	continue;
979	}	982	}
980	/*	983	/*
@@ -1001,6 +1004,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
1001	arch_leave_lazy_mmu_mode();	1004	arch_leave_lazy_mmu_mode();
1002	pte_unmap_unlock(pte - 1, ptl);	1005	pte_unmap_unlock(pte - 1, ptl);
1003		1006
		1007	/*
		1008	* mmu_gather ran out of room to batch pages, we break out of
		1009	* the PTE lock to avoid doing the potential expensive TLB invalidate
		1010	* and page-free while holding it.
		1011	*/
		1012	if (force_flush) {
		1013	force_flush = 0;
		1014	tlb_flush_mmu(tlb);
		1015	if (addr != end)
		1016	goto again;
		1017	}
		1018
1004	return addr;	1019	return addr;
1005	}	1020	}
1006		1021
@@ -1121,17 +1136,14 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1121	* ensure that any thus-far unmapped pages are flushed before unmap_vmas()	1136	* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
1122	* drops the lock and schedules.	1137	* drops the lock and schedules.
1123	*/	1138	*/
1124	unsigned long unmap_vmas(struct mmu_gather **tlbp,	1139	unsigned long unmap_vmas(struct mmu_gather *tlb,
1125	struct vm_area_struct *vma, unsigned long start_addr,	1140	struct vm_area_struct *vma, unsigned long start_addr,
1126	unsigned long end_addr, unsigned long *nr_accounted,	1141	unsigned long end_addr, unsigned long *nr_accounted,
1127	struct zap_details *details)	1142	struct zap_details *details)
1128	{	1143	{
1129	long zap_work = ZAP_BLOCK_SIZE;	1144	long zap_work = ZAP_BLOCK_SIZE;
1130	unsigned long tlb_start = 0; /* For tlb_finish_mmu */
1131	int tlb_start_valid = 0;
1132	unsigned long start = start_addr;	1145	unsigned long start = start_addr;
1133	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;	1146	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
1134	int fullmm = (*tlbp)->fullmm;
1135	struct mm_struct *mm = vma->vm_mm;	1147	struct mm_struct *mm = vma->vm_mm;
1136		1148
1137	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);	1149	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
@@ -1152,11 +1164,6 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
1152	untrack_pfn_vma(vma, 0, 0);	1164	untrack_pfn_vma(vma, 0, 0);
1153		1165
1154	while (start != end) {	1166	while (start != end) {
1155	if (!tlb_start_valid) {
1156	tlb_start = start;
1157	tlb_start_valid = 1;
1158	}
1159
1160	if (unlikely(is_vm_hugetlb_page(vma))) {	1167	if (unlikely(is_vm_hugetlb_page(vma))) {
1161	/*	1168	/*
1162	* It is undesirable to test vma->vm_file as it	1169	* It is undesirable to test vma->vm_file as it
@@ -1177,7 +1184,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
1177		1184
1178	start = end;	1185	start = end;
1179	} else	1186	} else
1180	start = unmap_page_range(*tlbp, vma,	1187	start = unmap_page_range(tlb, vma,
1181	start, end, &zap_work, details);	1188	start, end, &zap_work, details);
1182		1189
1183	if (zap_work > 0) {	1190	if (zap_work > 0) {
@@ -1185,19 +1192,13 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
1185	break;	1192	break;
1186	}	1193	}
1187		1194
1188	tlb_finish_mmu(*tlbp, tlb_start, start);
1189
1190	if (need_resched() \|\|	1195	if (need_resched() \|\|
1191	(i_mmap_lock && spin_needbreak(i_mmap_lock))) {	1196	(i_mmap_lock && spin_needbreak(i_mmap_lock))) {
1192	if (i_mmap_lock) {	1197	if (i_mmap_lock)
1193	*tlbp = NULL;
1194	goto out;	1198	goto out;
1195	}
1196	cond_resched();	1199	cond_resched();
1197	}	1200	}
1198		1201
1199	*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
1200	tlb_start_valid = 0;
1201	zap_work = ZAP_BLOCK_SIZE;	1202	zap_work = ZAP_BLOCK_SIZE;
1202	}	1203	}
1203	}	1204	}
@@ -1217,16 +1218,15 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
1217	unsigned long size, struct zap_details *details)	1218	unsigned long size, struct zap_details *details)
1218	{	1219	{
1219	struct mm_struct *mm = vma->vm_mm;	1220	struct mm_struct *mm = vma->vm_mm;
1220	struct mmu_gather *tlb;	1221	struct mmu_gather tlb;
1221	unsigned long end = address + size;	1222	unsigned long end = address + size;
1222	unsigned long nr_accounted = 0;	1223	unsigned long nr_accounted = 0;
1223		1224
1224	lru_add_drain();	1225	lru_add_drain();
1225	tlb = tlb_gather_mmu(mm, 0);	1226	tlb_gather_mmu(&tlb, mm, 0);
1226	update_hiwater_rss(mm);	1227	update_hiwater_rss(mm);
1227	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);	1228	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
1228	if (tlb)	1229	tlb_finish_mmu(&tlb, address, end);
1229	tlb_finish_mmu(tlb, address, end);
1230	return end;	1230	return end;
1231	}	1231	}
1232		1232


diff --git a/mm/mmap.c b/mm/mmap.c index adb12527fd0e..40d49986e714 100644 --- a/mm/mmap.c +++ b/mm/mmap.c
@@ -1903,17 +1903,17 @@ static void unmap_region(struct mm_struct *mm,
1903	unsigned long start, unsigned long end)	1903	unsigned long start, unsigned long end)
1904	{	1904	{
1905	struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;	1905	struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
1906	struct mmu_gather *tlb;	1906	struct mmu_gather tlb;
1907	unsigned long nr_accounted = 0;	1907	unsigned long nr_accounted = 0;
1908		1908
1909	lru_add_drain();	1909	lru_add_drain();
1910	tlb = tlb_gather_mmu(mm, 0);	1910	tlb_gather_mmu(&tlb, mm, 0);
1911	update_hiwater_rss(mm);	1911	update_hiwater_rss(mm);
1912	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);	1912	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
1913	vm_unacct_memory(nr_accounted);	1913	vm_unacct_memory(nr_accounted);
1914	free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,	1914	free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
1915	next? next->vm_start: 0);	1915	next ? next->vm_start : 0);
1916	tlb_finish_mmu(tlb, start, end);	1916	tlb_finish_mmu(&tlb, start, end);
1917	}	1917	}
1918		1918
1919	/*	1919	/*
@@ -2255,7 +2255,7 @@ EXPORT_SYMBOL(do_brk);
2255	/* Release all mmaps. */	2255	/* Release all mmaps. */
2256	void exit_mmap(struct mm_struct *mm)	2256	void exit_mmap(struct mm_struct *mm)
2257	{	2257	{
2258	struct mmu_gather *tlb;	2258	struct mmu_gather tlb;
2259	struct vm_area_struct *vma;	2259	struct vm_area_struct *vma;
2260	unsigned long nr_accounted = 0;	2260	unsigned long nr_accounted = 0;
2261	unsigned long end;	2261	unsigned long end;
@@ -2280,14 +2280,14 @@ void exit_mmap(struct mm_struct *mm)
2280		2280
2281	lru_add_drain();	2281	lru_add_drain();
2282	flush_cache_mm(mm);	2282	flush_cache_mm(mm);
2283	tlb = tlb_gather_mmu(mm, 1);	2283	tlb_gather_mmu(&tlb, mm, 1);
2284	/* update_hiwater_rss(mm) here? but nobody should be looking */	2284	/* update_hiwater_rss(mm) here? but nobody should be looking */
2285	/* Use -1 here to ensure all VMAs in the mm are unmapped */	2285	/* Use -1 here to ensure all VMAs in the mm are unmapped */
2286	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);	2286	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2287	vm_unacct_memory(nr_accounted);	2287	vm_unacct_memory(nr_accounted);
2288		2288
2289	free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);	2289	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
2290	tlb_finish_mmu(tlb, 0, end);	2290	tlb_finish_mmu(&tlb, 0, end);
2291		2291
2292	/*	2292	/*
2293	* Walk the list again, actually closing and freeing it,	2293	* Walk the list again, actually closing and freeing it,