-rw-r--r--	include/linux/mempolicy.h	  3
-rw-r--r--	mm/mempolicy.c	155
2 files changed, 138 insertions, 20 deletions
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index ed00b278cb93..05443a766cb8 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -22,6 +22,9 @@
 
 /* Flags for mbind */
 #define MPOL_MF_STRICT	(1<<0)	/* Verify existing pages in the mapping */
+#define MPOL_MF_MOVE	(1<<1)	/* Move pages owned by this process to conform to mapping */
+#define MPOL_MF_MOVE_ALL (1<<2)	/* Move every page to conform to mapping */
+#define MPOL_MF_INTERNAL (1<<3)	/* Internal flags start here */
 
 #ifdef __KERNEL__
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0f1d2b8a952b..9cc6d962831d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -83,9 +83,14 @@
 #include <linux/init.h>
 #include <linux/compat.h>
 #include <linux/mempolicy.h>
+#include <linux/swap.h>
+
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
 
+/* Internal MPOL_MF_xxx flags */
+#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
+
 static kmem_cache_t *policy_cache;
 static kmem_cache_t *sn_cache;
 
@@ -174,9 +179,59 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 	return policy;
 }
 
+/* Check if we are the only process mapping the page in question */
+static inline int single_mm_mapping(struct mm_struct *mm,
+			struct address_space *mapping)
+{
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int rc = 1;
+
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
+		if (mm != vma->vm_mm) {
+			rc = 0;
+			goto out;
+		}
+	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
+		if (mm != vma->vm_mm) {
+			rc = 0;
+			goto out;
+		}
+out:
+	spin_unlock(&mapping->i_mmap_lock);
+	return rc;
+}
+
+/*
+ * Add a page to be migrated to the pagelist
+ */
+static void migrate_page_add(struct vm_area_struct *vma,
+	struct page *page, struct list_head *pagelist, unsigned long flags)
+{
+	/*
+	 * Avoid migrating a page that is shared by others and not writable.
+	 */
+	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
+	    mapping_writably_mapped(page->mapping) ||
+	    single_mm_mapping(vma->vm_mm, page->mapping)) {
+		int rc = isolate_lru_page(page);
+
+		if (rc == 1)
+			list_add(&page->lru, pagelist);
+		/*
+		 * If the isolate attempt was not successful then we just
+		 * encountered an unswappable page. Something must be wrong.
+		 */
+		WARN_ON(rc == 0);
+	}
+}
+
 /* Ensure all existing pages follow the policy. */
 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pte_t *orig_pte;
 	pte_t *pte;
@@ -193,15 +248,21 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		if (!page)
 			continue;
 		nid = page_to_nid(page);
-		if (!node_isset(nid, *nodes))
-			break;
+		if (!node_isset(nid, *nodes)) {
+			if (pagelist)
+				migrate_page_add(vma, page, pagelist, flags);
+			else
+				break;
+		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(orig_pte, ptl);
 	return addr != end;
 }
 
 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -211,14 +272,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (check_pte_range(vma, pmd, addr, next, nodes))
+		if (check_pte_range(vma, pmd, addr, next, nodes,
+				    flags, pagelist))
 			return -EIO;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -228,14 +292,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		if (check_pmd_range(vma, pud, addr, next, nodes))
+		if (check_pmd_range(vma, pud, addr, next, nodes,
+				    flags, pagelist))
 			return -EIO;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int check_pgd_range(struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -245,16 +312,31 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		if (check_pud_range(vma, pgd, addr, next, nodes))
+		if (check_pud_range(vma, pgd, addr, next, nodes,
+				    flags, pagelist))
 			return -EIO;
 	} while (pgd++, addr = next, addr != end);
 	return 0;
 }
 
-/* Step 1: check the range */
+/* Check if a vma is migratable */
+static inline int vma_migratable(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & (
+		VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
+		return 0;
+	return 1;
+}
+
+/*
+ * Check if all pages in a range are on a set of nodes.
+ * If pagelist != NULL then isolate pages from the LRU and
+ * put them on the pagelist.
+ */
 static struct vm_area_struct *
 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
-		nodemask_t *nodes, unsigned long flags)
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
@@ -264,17 +346,24 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		return ERR_PTR(-EFAULT);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
-		if (!vma->vm_next && vma->vm_end < end)
-			return ERR_PTR(-EFAULT);
-		if (prev && prev->vm_end < vma->vm_start)
-			return ERR_PTR(-EFAULT);
-		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
+		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
+			if (!vma->vm_next && vma->vm_end < end)
+				return ERR_PTR(-EFAULT);
+			if (prev && prev->vm_end < vma->vm_start)
+				return ERR_PTR(-EFAULT);
+		}
+		if (!is_vm_hugetlb_page(vma) &&
+		    ((flags & MPOL_MF_STRICT) ||
+		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
+				vma_migratable(vma)))) {
 			unsigned long endvma = vma->vm_end;
+
 			if (endvma > end)
 				endvma = end;
 			if (vma->vm_start > start)
 				start = vma->vm_start;
-			err = check_pgd_range(vma, start, endvma, nodes);
+			err = check_pgd_range(vma, start, endvma, nodes,
+						flags, pagelist);
 			if (err) {
 				first = ERR_PTR(err);
 				break;
@@ -348,33 +437,59 @@ long do_mbind(unsigned long start, unsigned long len,
 	struct mempolicy *new;
 	unsigned long end;
 	int err;
+	LIST_HEAD(pagelist);
 
-	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
+	if ((flags & ~(unsigned long)(MPOL_MF_STRICT|MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
+	    || mode > MPOL_MAX)
 		return -EINVAL;
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
 	if (start & ~PAGE_MASK)
 		return -EINVAL;
+
 	if (mode == MPOL_DEFAULT)
 		flags &= ~MPOL_MF_STRICT;
+
 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 	end = start + len;
+
 	if (end < start)
 		return -EINVAL;
 	if (end == start)
 		return 0;
+
 	if (mpol_check_policy(mode, nmask))
 		return -EINVAL;
+
 	new = mpol_new(mode, nmask);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
+	/*
+	 * If we are using the default policy then operation
+	 * on discontinuous address spaces is okay after all
+	 */
+	if (!new)
+		flags |= MPOL_MF_DISCONTIG_OK;
+
 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 			mode,nodes_addr(nodes)[0]);
 
 	down_write(&mm->mmap_sem);
-	vma = check_range(mm, start, end, nmask, flags);
+	vma = check_range(mm, start, end, nmask, flags,
+	      (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL);
 	err = PTR_ERR(vma);
-	if (!IS_ERR(vma))
+	if (!IS_ERR(vma)) {
 		err = mbind_range(vma, start, end, new);
+		if (!list_empty(&pagelist))
+			migrate_pages(&pagelist, NULL);
+		if (!err && !list_empty(&pagelist) && (flags & MPOL_MF_STRICT))
+			err = -EIO;
+	}
+	if (!list_empty(&pagelist))
+		putback_lru_pages(&pagelist);
+
 	up_write(&mm->mmap_sem);
 	mpol_free(new);
 	return err;
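
For context (not part of the patch): a minimal userspace sketch of how the new MPOL_MF_MOVE flag can be exercised through mbind(2), here via libnuma's <numaif.h> declarations. The node number, region size, and combination with MPOL_MF_STRICT are illustrative assumptions, not something the patch mandates.

/*
 * Illustrative only -- assumes a NUMA kernel with the MPOL_MF_MOVE support
 * added above and libnuma installed (compile with -lnuma).
 */
#include <numaif.h>		/* mbind(), MPOL_BIND, MPOL_MF_MOVE, MPOL_MF_STRICT */
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	size_t len = 16 * sysconf(_SC_PAGESIZE);

	/* Map and touch an anonymous region so pages actually get allocated. */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 0, len);

	/*
	 * Bind the range to node 0 and ask the kernel to migrate pages that
	 * were already allocated on other nodes (MPOL_MF_MOVE). With
	 * MPOL_MF_STRICT also set, do_mbind() above returns -EIO if some
	 * pages could not be moved.
	 */
	unsigned long nodemask = 1UL << 0;	/* node 0 */
	if (mbind(buf, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask),
		  MPOL_MF_MOVE | MPOL_MF_STRICT) != 0)
		perror("mbind");

	munmap(buf, len);
	return 0;
}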