author     Christoph Lameter <clameter@sgi.com>        2006-01-08 04:00:50 -0500
committer  Linus Torvalds <torvalds@g5.osdl.org>       2006-01-08 23:12:41 -0500
commit     dc9aa5b9d65fd11b1f5246b46ec610ee8b83c6dd (patch)
tree       808da06f0bc8ab5189f1c315a2b99c85a33ed74c /mm/mempolicy.c
parent     7cbe34cf86c673503b177ff47cfa2c7030dabb50 (diff)
[PATCH] Swap Migration V5: MPOL_MF_MOVE interface
Add page migration support via swap to the NUMA policy layer.

This patch adds page migration support to the NUMA policy layer.  An
additional flag, MPOL_MF_MOVE, is introduced for mbind().  If MPOL_MF_MOVE
is specified, pages that do not conform to the memory policy are evicted
from memory (pushed out to swap).  When they are later faulted back in,
the new pages are allocated following the NUMA policy.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
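For context, here is a minimal userspace sketch of the interface this patch enables; it is not part of the patch. It assumes libnuma's <numaif.h> exposes the mbind() wrapper and the MPOL_BIND / MPOL_MF_MOVE constants (older headers may need the MPOL_MF_* values taken from the kernel's <linux/mempolicy.h>).

/*
 * Hypothetical usage sketch, not part of this commit: bind a buffer to
 * node 0 and ask the kernel to migrate any pages already resident on
 * other nodes.  Assumes libnuma's <numaif.h> provides mbind() and the
 * MPOL_* / MPOL_MF_* constants.
 */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	size_t len = 64 * 4096;
	unsigned long nodemask = 1UL << 0;	/* node 0 only */
	void *buf;

	if (posix_memalign(&buf, 4096, len))
		return 1;

	/* Fault the pages in first so there is something to migrate. */
	memset(buf, 0, len);

	/*
	 * MPOL_MF_MOVE: pages in [buf, buf+len) that do not conform to the
	 * new policy are pushed out to swap; they are reallocated on node 0
	 * when they are next touched.
	 */
	if (mbind(buf, len, MPOL_BIND, &nodemask,
		  sizeof(nodemask) * 8, MPOL_MF_MOVE))
		perror("mbind");

	free(buf);
	return 0;
}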
Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--    mm/mempolicy.c    155
1 files changed, 135 insertions, 20 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0f1d2b8a952b..9cc6d962831d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -83,9 +83,14 @@
 #include <linux/init.h>
 #include <linux/compat.h>
 #include <linux/mempolicy.h>
+#include <linux/swap.h>
+
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
 
+/* Internal MPOL_MF_xxx flags */
+#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
+
 static kmem_cache_t *policy_cache;
 static kmem_cache_t *sn_cache;
 
@@ -174,9 +179,59 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 	return policy;
 }
 
+/* Check if we are the only process mapping the page in question */
+static inline int single_mm_mapping(struct mm_struct *mm,
+		struct address_space *mapping)
+{
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int rc = 1;
+
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
+		if (mm != vma->vm_mm) {
+			rc = 0;
+			goto out;
+		}
+	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
+		if (mm != vma->vm_mm) {
+			rc = 0;
+			goto out;
+		}
+out:
+	spin_unlock(&mapping->i_mmap_lock);
+	return rc;
+}
+
+/*
+ * Add a page to be migrated to the pagelist
+ */
+static void migrate_page_add(struct vm_area_struct *vma,
+	struct page *page, struct list_head *pagelist, unsigned long flags)
+{
+	/*
+	 * Avoid migrating a page that is shared by others and not writable.
+	 */
+	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
+	    mapping_writably_mapped(page->mapping) ||
+	    single_mm_mapping(vma->vm_mm, page->mapping)) {
+		int rc = isolate_lru_page(page);
+
+		if (rc == 1)
+			list_add(&page->lru, pagelist);
+		/*
+		 * If the isolate attempt was not successful then we just
+		 * encountered an unswappable page. Something must be wrong.
+		 */
+		WARN_ON(rc == 0);
+	}
+}
+
 /* Ensure all existing pages follow the policy. */
 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pte_t *orig_pte;
 	pte_t *pte;
@@ -193,15 +248,21 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		if (!page)
 			continue;
 		nid = page_to_nid(page);
-		if (!node_isset(nid, *nodes))
-			break;
+		if (!node_isset(nid, *nodes)) {
+			if (pagelist)
+				migrate_page_add(vma, page, pagelist, flags);
+			else
+				break;
+		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(orig_pte, ptl);
 	return addr != end;
 }
 
 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -211,14 +272,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (check_pte_range(vma, pmd, addr, next, nodes))
+		if (check_pte_range(vma, pmd, addr, next, nodes,
+				    flags, pagelist))
 			return -EIO;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -228,14 +292,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		if (check_pmd_range(vma, pud, addr, next, nodes))
+		if (check_pmd_range(vma, pud, addr, next, nodes,
+				    flags, pagelist))
 			return -EIO;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int check_pgd_range(struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -245,16 +312,31 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		if (check_pud_range(vma, pgd, addr, next, nodes))
+		if (check_pud_range(vma, pgd, addr, next, nodes,
+				    flags, pagelist))
 			return -EIO;
 	} while (pgd++, addr = next, addr != end);
 	return 0;
 }
 
-/* Step 1: check the range */
+/* Check if a vma is migratable */
+static inline int vma_migratable(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & (
+		VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
+		return 0;
+	return 1;
+}
+
+/*
+ * Check if all pages in a range are on a set of nodes.
+ * If pagelist != NULL then isolate pages from the LRU and
+ * put them on the pagelist.
+ */
 static struct vm_area_struct *
 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
-		nodemask_t *nodes, unsigned long flags)
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
@@ -264,17 +346,24 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		return ERR_PTR(-EFAULT);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
-		if (!vma->vm_next && vma->vm_end < end)
-			return ERR_PTR(-EFAULT);
-		if (prev && prev->vm_end < vma->vm_start)
-			return ERR_PTR(-EFAULT);
-		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
+		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
+			if (!vma->vm_next && vma->vm_end < end)
+				return ERR_PTR(-EFAULT);
+			if (prev && prev->vm_end < vma->vm_start)
+				return ERR_PTR(-EFAULT);
+		}
+		if (!is_vm_hugetlb_page(vma) &&
+		    ((flags & MPOL_MF_STRICT) ||
+		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
+				vma_migratable(vma)))) {
 			unsigned long endvma = vma->vm_end;
+
 			if (endvma > end)
 				endvma = end;
 			if (vma->vm_start > start)
 				start = vma->vm_start;
-			err = check_pgd_range(vma, start, endvma, nodes);
+			err = check_pgd_range(vma, start, endvma, nodes,
+						flags, pagelist);
 			if (err) {
 				first = ERR_PTR(err);
 				break;
@@ -348,33 +437,59 @@ long do_mbind(unsigned long start, unsigned long len,
 	struct mempolicy *new;
 	unsigned long end;
 	int err;
+	LIST_HEAD(pagelist);
 
-	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
+	if ((flags & ~(unsigned long)(MPOL_MF_STRICT|MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
+	    || mode > MPOL_MAX)
 		return -EINVAL;
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
 	if (start & ~PAGE_MASK)
 		return -EINVAL;
+
 	if (mode == MPOL_DEFAULT)
 		flags &= ~MPOL_MF_STRICT;
+
 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 	end = start + len;
+
 	if (end < start)
 		return -EINVAL;
 	if (end == start)
 		return 0;
+
 	if (mpol_check_policy(mode, nmask))
 		return -EINVAL;
+
 	new = mpol_new(mode, nmask);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
+	/*
+	 * If we are using the default policy then operation
+	 * on discontinuous address spaces is okay after all
+	 */
+	if (!new)
+		flags |= MPOL_MF_DISCONTIG_OK;
+
 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 			mode,nodes_addr(nodes)[0]);
 
 	down_write(&mm->mmap_sem);
-	vma = check_range(mm, start, end, nmask, flags);
+	vma = check_range(mm, start, end, nmask, flags,
+	      (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL);
 	err = PTR_ERR(vma);
-	if (!IS_ERR(vma))
+	if (!IS_ERR(vma)) {
 		err = mbind_range(vma, start, end, new);
+		if (!list_empty(&pagelist))
+			migrate_pages(&pagelist, NULL);
+		if (!err && !list_empty(&pagelist) && (flags & MPOL_MF_STRICT))
+			err = -EIO;
+	}
+	if (!list_empty(&pagelist))
+		putback_lru_pages(&pagelist);
+
 	up_write(&mm->mmap_sem);
 	mpol_free(new);
 	return err;
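A related hedged sketch, again not part of this commit: after an mbind() call with MPOL_MF_MOVE, userspace can check where an individual page ended up with get_mempolicy() and the MPOL_F_NODE | MPOL_F_ADDR flags, which report the node currently backing the page at a given address. The constants and wrapper are assumed to come from the same <numaif.h>.

#include <numaif.h>
#include <stdio.h>

/* Print which node currently backs the (already faulted-in) page at 'addr'. */
static void print_node_of(void *addr)
{
	int node = -1;

	if (get_mempolicy(&node, NULL, 0, addr,
			  MPOL_F_NODE | MPOL_F_ADDR) == 0)
		printf("%p is on node %d\n", addr, node);
	else
		perror("get_mempolicy");
}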