author		Peter Zijlstra <a.p.zijlstra@chello.nl>	2011-05-24 20:11:45 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-05-25 11:39:12 -0400
commit		d16dfc550f5326a4000f3322582a7c05dec91d7a
tree		8ee963542705cbf2187777f1d3f2b209cbda827a /include/asm-generic
parent		d05f3169c0fbca16132ec7c2be71685c6de638b5
mm: mmu_gather rework
Rework the existing mmu_gather infrastructure.
The direct purpose of these patches was to allow preemptible mmu_gather,
but even without that I think these patches provide an improvement to the
status quo.
The first 9 patches rework the mmu_gather infrastructure. For review
purposes I've split them into generic and per-arch patches, with the last
of those being a generic cleanup.
The next patch provides generic RCU page-table freeing, and the follow-up
is a patch converting s390 to use this. I've also got 4 patches from
DaveM lined up (not included in this series) that use this to implement
gup_fast() for sparc64.
Then there is one patch that extends the generic mmu_gather batching.
After that follow the mm preemptibility patches; these make parts of the
mm a lot more preemptible. They convert i_mmap_lock and anon_vma->lock to
mutexes, which together with the mmu_gather rework makes mmu_gather
preemptible as well.
Making i_mmap_lock a mutex also enables a clean-up of the truncate code.
This also allows for preemptible mmu_notifiers, something that I think
XPMEM wants.
Furthermore, it removes the new and universally detested unmap_mutex.
This patch:
Remove the first obstacle towards a fully preemptible mmu_gather.
The current scheme assumes mmu_gather is always done with preemption
disabled and uses per-cpu storage for the page batches. Change this to
try to allocate a page for batching and, in case of failure, use a small
on-stack array to make some progress.
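
The shape of the new scheme, condensed from the asm-generic/tlb.h hunk
below (the need_flush/fullmm bookkeeping and the arch hook are left out
for brevity):

  struct mmu_gather {
  	struct mm_struct	*mm;
  	unsigned int		nr;	/* pages queued so far, ~0U means fast mode */
  	unsigned int		max;	/* capacity of ->pages */
  	struct page		**pages;	/* points at local[] or an allocated page */
  	struct page		*local[MMU_GATHER_BUNDLE];	/* small on-stack fallback */
  };

  static inline void __tlb_alloc_page(struct mmu_gather *tlb)
  {
  	unsigned long addr = __get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);

  	/* Allocation failure is fine; we simply keep batching into local[]. */
  	if (addr) {
  		tlb->pages = (void *)addr;
  		tlb->max = PAGE_SIZE / sizeof(struct page *);
  	}
  }

tlb_gather_mmu() starts out with ->pages pointing at local[] and only
upgrades to a full page of pointers when __tlb_alloc_page() succeeds, so
forward progress never depends on the allocation.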
Preemptible mmu_gather is desired in general and usable once i_mmap_lock
becomes a mutex. Doing it before the mutex conversion saves us from
having to rework the code by moving the mmu_gather bits inside the
pte_lock.
Also avoid flushing the TLB batches from under the pte lock; this is
useful even without the i_mmap_lock conversion as it significantly
reduces pte lock hold times.
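
To illustrate how a caller can use this, the sketch below batches pages
under the pte lock and defers tlb_flush_mmu() to after the lock is
dropped. Only __tlb_remove_page() and tlb_flush_mmu() are from this
patch; the example_* helpers and the loop itself are hypothetical:

  /* Hypothetical unmap loop; the example_* helpers are illustrative only. */
  static void example_zap_range(struct mmu_gather *tlb, pte_t *ptep,
  			      unsigned long nr_ptes, spinlock_t *ptl)
  {
  	unsigned long i;
  	int batch_full = 0;

  	spin_lock(ptl);
  	for (i = 0; i < nr_ptes; i++) {
  		struct page *page = example_clear_pte(ptep + i);

  		if (!page)
  			continue;
  		/* __tlb_remove_page() returns the free batch slots left. */
  		if (!__tlb_remove_page(tlb, page)) {
  			batch_full = 1;
  			break;
  		}
  	}
  	spin_unlock(ptl);

  	/* Flush (and free the batched pages) with the pte lock dropped. */
  	if (batch_full)
  		tlb_flush_mmu(tlb);
  }

A real caller would restart the loop after the flush to finish the
remaining ptes; that bookkeeping is left out here.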
[akpm@linux-foundation.org: fix comment typo]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Miller <davem@davemloft.net>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Tony Luck <tony.luck@intel.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Hugh Dickins <hughd@google.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'include/asm-generic')
-rw-r--r--	include/asm-generic/tlb.h	96
1 file changed, 69 insertions(+), 27 deletions(-)
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index e43f9766259f..2d3547c84235 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -5,6 +5,8 @@
  * Copyright 2001 Red Hat, Inc.
  * Based on code from mm/memory.c Copyright Linus Torvalds and others.
  *
+ * Copyright 2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version
@@ -22,51 +24,71 @@
  * and page free order so much..
  */
 #ifdef CONFIG_SMP
-  #ifdef ARCH_FREE_PTR_NR
-    #define FREE_PTR_NR	ARCH_FREE_PTR_NR
-  #else
-    #define FREE_PTE_NR	506
-  #endif
   #define tlb_fast_mode(tlb) ((tlb)->nr == ~0U)
 #else
-  #define FREE_PTE_NR	1
   #define tlb_fast_mode(tlb) 1
 #endif
 
+/*
+ * If we can't allocate a page to make a big batch of page pointers
+ * to work on, then just handle a few from the on-stack structure.
+ */
+#define MMU_GATHER_BUNDLE	8
+
 /* struct mmu_gather is an opaque type used by the mm code for passing around
  * any data needed by arch specific code for tlb_remove_page.
  */
 struct mmu_gather {
 	struct mm_struct	*mm;
 	unsigned int		nr;	/* set to ~0U means fast mode */
+	unsigned int		max;	/* nr < max */
 	unsigned int		need_flush;/* Really unmapped some ptes? */
 	unsigned int		fullmm; /* non-zero means full mm flush */
-	struct page *		pages[FREE_PTE_NR];
+#ifdef HAVE_ARCH_MMU_GATHER
+	struct arch_mmu_gather	arch;
+#endif
+	struct page		**pages;
+	struct page		*local[MMU_GATHER_BUNDLE];
 };
 
-/* Users of the generic TLB shootdown code must declare this storage space. */
-DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
+static inline void __tlb_alloc_page(struct mmu_gather *tlb)
+{
+	unsigned long addr = __get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
+
+	if (addr) {
+		tlb->pages = (void *)addr;
+		tlb->max = PAGE_SIZE / sizeof(struct page *);
+	}
+}
 
 /* tlb_gather_mmu
- *	Return a pointer to an initialized struct mmu_gather.
+ *	Called to initialize an (on-stack) mmu_gather structure for page-table
+ *	tear-down from @mm. The @fullmm argument is used when @mm is without
+ *	users and we're going to destroy the full address space (exit/execve).
  */
-static inline struct mmu_gather *
-tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
+static inline void
+tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
 {
-	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
-
 	tlb->mm = mm;
 
-	/* Use fast mode if only one CPU is online */
-	tlb->nr = num_online_cpus() > 1 ? 0U : ~0U;
+	tlb->max = ARRAY_SIZE(tlb->local);
+	tlb->pages = tlb->local;
+
+	if (num_online_cpus() > 1) {
+		tlb->nr = 0;
+		__tlb_alloc_page(tlb);
+	} else /* Use fast mode if only one CPU is online */
+		tlb->nr = ~0U;
 
-	tlb->fullmm = full_mm_flush;
+	tlb->fullmm = fullmm;
 
-	return tlb;
+#ifdef HAVE_ARCH_MMU_GATHER
+	tlb->arch = ARCH_MMU_GATHER_INIT;
+#endif
 }
 
 static inline void
-tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+tlb_flush_mmu(struct mmu_gather *tlb)
 {
 	if (!tlb->need_flush)
 		return;
@@ -75,6 +97,13 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 	if (!tlb_fast_mode(tlb)) {
 		free_pages_and_swap_cache(tlb->pages, tlb->nr);
 		tlb->nr = 0;
+		/*
+		 * If we are using the local on-stack array of pages for MMU
+		 * gather, try allocating an off-stack array again as we have
+		 * recently freed pages.
+		 */
+		if (tlb->pages == tlb->local)
+			__tlb_alloc_page(tlb);
 	}
 }
 
@@ -85,29 +114,42 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
-	tlb_flush_mmu(tlb, start, end);
+	tlb_flush_mmu(tlb);
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
 
-	put_cpu_var(mmu_gathers);
+	if (tlb->pages != tlb->local)
+		free_pages((unsigned long)tlb->pages, 0);
 }
 
-/* tlb_remove_page
+/* __tlb_remove_page
  *	Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
  *	handling the additional races in SMP caused by other CPUs caching valid
- *	mappings in their TLBs.
+ *	mappings in their TLBs. Returns the number of free page slots left.
+ *	When out of page slots we must call tlb_flush_mmu().
  */
-static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 {
 	tlb->need_flush = 1;
 	if (tlb_fast_mode(tlb)) {
 		free_page_and_swap_cache(page);
-		return;
+		return 1; /* avoid calling tlb_flush_mmu() */
 	}
 	tlb->pages[tlb->nr++] = page;
-	if (tlb->nr >= FREE_PTE_NR)
-		tlb_flush_mmu(tlb, 0, 0);
+	VM_BUG_ON(tlb->nr > tlb->max);
+
+	return tlb->max - tlb->nr;
+}
+
+/* tlb_remove_page
+ *	Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when
+ *	required.
+ */
+static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+	if (!__tlb_remove_page(tlb, page))
+		tlb_flush_mmu(tlb);
 }
 
 /**