aboutsummaryrefslogtreecommitdiffstats
path: root/mm/rmap.c
diff options
context:
space:
mode:
authorRik van Riel <riel@redhat.com>2010-03-05 16:42:07 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2010-03-06 14:26:26 -0500
commit5beb49305251e5669852ed541e8e2f2f7696c53e (patch)
tree46457450a22f23938b24904aeba5d4ada2f53b20 /mm/rmap.c
parent648bcc771145172a14bc35eeb849ed08f6aa4f1e (diff)
mm: change anon_vma linking to fix multi-process server scalability issue
The old anon_vma code can lead to scalability issues with heavily forking workloads. Specifically, each anon_vma will be shared between the parent process and all its child processes. In a workload with 1000 child processes and a VMA with 1000 anonymous pages per process that get COWed, this leads to a system with a million anonymous pages in the same anon_vma, each of which is mapped in just one of the 1000 processes. However, the current rmap code needs to walk them all, leading to O(N) scanning complexity for each page. This can result in systems where one CPU is walking the page tables of 1000 processes in page_referenced_one, while all other CPUs are stuck on the anon_vma lock. This leads to catastrophic failure for a benchmark like AIM7, where the total number of processes can reach in the tens of thousands. Real workloads are still a factor 10 less process intensive than AIM7, but they are catching up. This patch changes the way anon_vmas and VMAs are linked, which allows us to associate multiple anon_vmas with a VMA. At fork time, each child process gets its own anon_vmas, in which its COWed pages will be instantiated. The parents' anon_vma is also linked to the VMA, because non-COWed pages could be present in any of the children. This reduces rmap scanning complexity to O(1) for the pages of the 1000 child processes, with O(N) complexity for at most 1/N pages in the system. This reduces the average scanning cost in heavily forking workloads from O(N) to 2. The only real complexity in this patch stems from the fact that linking a VMA to anon_vmas now involves memory allocations. This means vma_adjust can fail, if it needs to attach a VMA to anon_vma structures. This in turn means error handling needs to be added to the calling functions. A second source of complexity is that, because there can be multiple anon_vmas, the anon_vma linking in vma_adjust can no longer be done under "the" anon_vma lock. To prevent the rmap code from walking up an incomplete VMA, this patch introduces the VM_LOCK_RMAP VMA flag. This bit flag uses the same slot as the NOMMU VM_MAPPED_COPY, with an ifdef in mm.h to make sure it is impossible to compile a kernel that needs both symbolic values for the same bitflag. Some test results: Without the anon_vma changes, when AIM7 hits around 9.7k users (on a test box with 16GB RAM and not quite enough IO), the system ends up running >99% in system time, with every CPU on the same anon_vma lock in the pageout code. With these changes, AIM7 hits the cross-over point around 29.7k users. This happens with ~99% IO wait time, there never seems to be any spike in system time. The anon_vma lock contention appears to be resolved. [akpm@linux-foundation.org: cleanups] Signed-off-by: Rik van Riel <riel@redhat.com> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Larry Woodman <lwoodman@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Cc: Minchan Kim <minchan.kim@gmail.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/rmap.c')
-rw-r--r--mm/rmap.c156
1 files changed, 130 insertions, 26 deletions
diff --git a/mm/rmap.c b/mm/rmap.c
index 5cb47111f79e..be34094e4595 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -62,6 +62,7 @@
62#include "internal.h" 62#include "internal.h"
63 63
64static struct kmem_cache *anon_vma_cachep; 64static struct kmem_cache *anon_vma_cachep;
65static struct kmem_cache *anon_vma_chain_cachep;
65 66
66static inline struct anon_vma *anon_vma_alloc(void) 67static inline struct anon_vma *anon_vma_alloc(void)
67{ 68{
@@ -73,6 +74,16 @@ void anon_vma_free(struct anon_vma *anon_vma)
73 kmem_cache_free(anon_vma_cachep, anon_vma); 74 kmem_cache_free(anon_vma_cachep, anon_vma);
74} 75}
75 76
77static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
78{
79 return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
80}
81
82void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
83{
84 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
85}
86
76/** 87/**
77 * anon_vma_prepare - attach an anon_vma to a memory region 88 * anon_vma_prepare - attach an anon_vma to a memory region
78 * @vma: the memory region in question 89 * @vma: the memory region in question
@@ -103,18 +114,23 @@ void anon_vma_free(struct anon_vma *anon_vma)
103int anon_vma_prepare(struct vm_area_struct *vma) 114int anon_vma_prepare(struct vm_area_struct *vma)
104{ 115{
105 struct anon_vma *anon_vma = vma->anon_vma; 116 struct anon_vma *anon_vma = vma->anon_vma;
117 struct anon_vma_chain *avc;
106 118
107 might_sleep(); 119 might_sleep();
108 if (unlikely(!anon_vma)) { 120 if (unlikely(!anon_vma)) {
109 struct mm_struct *mm = vma->vm_mm; 121 struct mm_struct *mm = vma->vm_mm;
110 struct anon_vma *allocated; 122 struct anon_vma *allocated;
111 123
124 avc = anon_vma_chain_alloc();
125 if (!avc)
126 goto out_enomem;
127
112 anon_vma = find_mergeable_anon_vma(vma); 128 anon_vma = find_mergeable_anon_vma(vma);
113 allocated = NULL; 129 allocated = NULL;
114 if (!anon_vma) { 130 if (!anon_vma) {
115 anon_vma = anon_vma_alloc(); 131 anon_vma = anon_vma_alloc();
116 if (unlikely(!anon_vma)) 132 if (unlikely(!anon_vma))
117 return -ENOMEM; 133 goto out_enomem_free_avc;
118 allocated = anon_vma; 134 allocated = anon_vma;
119 } 135 }
120 spin_lock(&anon_vma->lock); 136 spin_lock(&anon_vma->lock);
@@ -123,53 +139,113 @@ int anon_vma_prepare(struct vm_area_struct *vma)
123 spin_lock(&mm->page_table_lock); 139 spin_lock(&mm->page_table_lock);
124 if (likely(!vma->anon_vma)) { 140 if (likely(!vma->anon_vma)) {
125 vma->anon_vma = anon_vma; 141 vma->anon_vma = anon_vma;
126 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 142 avc->anon_vma = anon_vma;
143 avc->vma = vma;
144 list_add(&avc->same_vma, &vma->anon_vma_chain);
145 list_add(&avc->same_anon_vma, &anon_vma->head);
127 allocated = NULL; 146 allocated = NULL;
128 } 147 }
129 spin_unlock(&mm->page_table_lock); 148 spin_unlock(&mm->page_table_lock);
130 149
131 spin_unlock(&anon_vma->lock); 150 spin_unlock(&anon_vma->lock);
132 if (unlikely(allocated)) 151 if (unlikely(allocated)) {
133 anon_vma_free(allocated); 152 anon_vma_free(allocated);
153 anon_vma_chain_free(avc);
154 }
134 } 155 }
135 return 0; 156 return 0;
157
158 out_enomem_free_avc:
159 anon_vma_chain_free(avc);
160 out_enomem:
161 return -ENOMEM;
136} 162}
137 163
138void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) 164static void anon_vma_chain_link(struct vm_area_struct *vma,
165 struct anon_vma_chain *avc,
166 struct anon_vma *anon_vma)
139{ 167{
140 BUG_ON(vma->anon_vma != next->anon_vma); 168 avc->vma = vma;
141 list_del(&next->anon_vma_node); 169 avc->anon_vma = anon_vma;
170 list_add(&avc->same_vma, &vma->anon_vma_chain);
171
172 spin_lock(&anon_vma->lock);
173 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
174 spin_unlock(&anon_vma->lock);
142} 175}
143 176
144void __anon_vma_link(struct vm_area_struct *vma) 177/*
178 * Attach the anon_vmas from src to dst.
179 * Returns 0 on success, -ENOMEM on failure.
180 */
181int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
145{ 182{
146 struct anon_vma *anon_vma = vma->anon_vma; 183 struct anon_vma_chain *avc, *pavc;
184
185 list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) {
186 avc = anon_vma_chain_alloc();
187 if (!avc)
188 goto enomem_failure;
189 anon_vma_chain_link(dst, avc, pavc->anon_vma);
190 }
191 return 0;
147 192
148 if (anon_vma) 193 enomem_failure:
149 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 194 unlink_anon_vmas(dst);
195 return -ENOMEM;
150} 196}
151 197
152void anon_vma_link(struct vm_area_struct *vma) 198/*
199 * Attach vma to its own anon_vma, as well as to the anon_vmas that
200 * the corresponding VMA in the parent process is attached to.
201 * Returns 0 on success, non-zero on failure.
202 */
203int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
153{ 204{
154 struct anon_vma *anon_vma = vma->anon_vma; 205 struct anon_vma_chain *avc;
206 struct anon_vma *anon_vma;
155 207
156 if (anon_vma) { 208 /* Don't bother if the parent process has no anon_vma here. */
157 spin_lock(&anon_vma->lock); 209 if (!pvma->anon_vma)
158 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 210 return 0;
159 spin_unlock(&anon_vma->lock); 211
160 } 212 /*
213 * First, attach the new VMA to the parent VMA's anon_vmas,
214 * so rmap can find non-COWed pages in child processes.
215 */
216 if (anon_vma_clone(vma, pvma))
217 return -ENOMEM;
218
219 /* Then add our own anon_vma. */
220 anon_vma = anon_vma_alloc();
221 if (!anon_vma)
222 goto out_error;
223 avc = anon_vma_chain_alloc();
224 if (!avc)
225 goto out_error_free_anon_vma;
226 anon_vma_chain_link(vma, avc, anon_vma);
227 /* Mark this anon_vma as the one where our new (COWed) pages go. */
228 vma->anon_vma = anon_vma;
229
230 return 0;
231
232 out_error_free_anon_vma:
233 anon_vma_free(anon_vma);
234 out_error:
235 return -ENOMEM;
161} 236}
162 237
163void anon_vma_unlink(struct vm_area_struct *vma) 238static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
164{ 239{
165 struct anon_vma *anon_vma = vma->anon_vma; 240 struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
166 int empty; 241 int empty;
167 242
243 /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
168 if (!anon_vma) 244 if (!anon_vma)
169 return; 245 return;
170 246
171 spin_lock(&anon_vma->lock); 247 spin_lock(&anon_vma->lock);
172 list_del(&vma->anon_vma_node); 248 list_del(&anon_vma_chain->same_anon_vma);
173 249
174 /* We must garbage collect the anon_vma if it's empty */ 250 /* We must garbage collect the anon_vma if it's empty */
175 empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); 251 empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
@@ -179,6 +255,18 @@ void anon_vma_unlink(struct vm_area_struct *vma)
179 anon_vma_free(anon_vma); 255 anon_vma_free(anon_vma);
180} 256}
181 257
258void unlink_anon_vmas(struct vm_area_struct *vma)
259{
260 struct anon_vma_chain *avc, *next;
261
262 /* Unlink each anon_vma chained to the VMA. */
263 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
264 anon_vma_unlink(avc);
265 list_del(&avc->same_vma);
266 anon_vma_chain_free(avc);
267 }
268}
269
182static void anon_vma_ctor(void *data) 270static void anon_vma_ctor(void *data)
183{ 271{
184 struct anon_vma *anon_vma = data; 272 struct anon_vma *anon_vma = data;
@@ -192,6 +280,7 @@ void __init anon_vma_init(void)
192{ 280{
193 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 281 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
194 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); 282 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
283 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
195} 284}
196 285
197/* 286/*
@@ -240,6 +329,18 @@ vma_address(struct page *page, struct vm_area_struct *vma)
240 /* page should be within @vma mapping range */ 329 /* page should be within @vma mapping range */
241 return -EFAULT; 330 return -EFAULT;
242 } 331 }
332 if (unlikely(vma->vm_flags & VM_LOCK_RMAP)) {
333 /*
334 * This VMA is being unlinked or is not yet linked into the
335 * VMA tree. Do not try to follow this rmap. This race
336 * condition can result in page_referenced() ignoring a
337 * reference or in try_to_unmap() failing to unmap a page.
338 * The VMA cannot be freed under us because we hold the
339 * anon_vma->lock, which the munmap code takes while
340 * unlinking the anon_vmas from the VMA.
341 */
342 return -EFAULT;
343 }
243 return address; 344 return address;
244} 345}
245 346
@@ -396,7 +497,7 @@ static int page_referenced_anon(struct page *page,
396{ 497{
397 unsigned int mapcount; 498 unsigned int mapcount;
398 struct anon_vma *anon_vma; 499 struct anon_vma *anon_vma;
399 struct vm_area_struct *vma; 500 struct anon_vma_chain *avc;
400 int referenced = 0; 501 int referenced = 0;
401 502
402 anon_vma = page_lock_anon_vma(page); 503 anon_vma = page_lock_anon_vma(page);
@@ -404,7 +505,8 @@ static int page_referenced_anon(struct page *page,
404 return referenced; 505 return referenced;
405 506
406 mapcount = page_mapcount(page); 507 mapcount = page_mapcount(page);
407 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 508 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
509 struct vm_area_struct *vma = avc->vma;
408 unsigned long address = vma_address(page, vma); 510 unsigned long address = vma_address(page, vma);
409 if (address == -EFAULT) 511 if (address == -EFAULT)
410 continue; 512 continue;
@@ -1025,14 +1127,15 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1025static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) 1127static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1026{ 1128{
1027 struct anon_vma *anon_vma; 1129 struct anon_vma *anon_vma;
1028 struct vm_area_struct *vma; 1130 struct anon_vma_chain *avc;
1029 int ret = SWAP_AGAIN; 1131 int ret = SWAP_AGAIN;
1030 1132
1031 anon_vma = page_lock_anon_vma(page); 1133 anon_vma = page_lock_anon_vma(page);
1032 if (!anon_vma) 1134 if (!anon_vma)
1033 return ret; 1135 return ret;
1034 1136
1035 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1137 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1138 struct vm_area_struct *vma = avc->vma;
1036 unsigned long address = vma_address(page, vma); 1139 unsigned long address = vma_address(page, vma);
1037 if (address == -EFAULT) 1140 if (address == -EFAULT)
1038 continue; 1141 continue;
@@ -1223,7 +1326,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1223 struct vm_area_struct *, unsigned long, void *), void *arg) 1326 struct vm_area_struct *, unsigned long, void *), void *arg)
1224{ 1327{
1225 struct anon_vma *anon_vma; 1328 struct anon_vma *anon_vma;
1226 struct vm_area_struct *vma; 1329 struct anon_vma_chain *avc;
1227 int ret = SWAP_AGAIN; 1330 int ret = SWAP_AGAIN;
1228 1331
1229 /* 1332 /*
@@ -1238,7 +1341,8 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1238 if (!anon_vma) 1341 if (!anon_vma)
1239 return ret; 1342 return ret;
1240 spin_lock(&anon_vma->lock); 1343 spin_lock(&anon_vma->lock);
1241 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1344 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1345 struct vm_area_struct *vma = avc->vma;
1242 unsigned long address = vma_address(page, vma); 1346 unsigned long address = vma_address(page, vma);
1243 if (address == -EFAULT) 1347 if (address == -EFAULT)
1244 continue; 1348 continue;