Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r-- | mm/mempolicy.c | 561
1 files changed, 482 insertions, 79 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0f1d2b8a952b..1850d0aef4ac 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -83,9 +83,18 @@ | |||
83 | #include <linux/init.h> | 83 | #include <linux/init.h> |
84 | #include <linux/compat.h> | 84 | #include <linux/compat.h> |
85 | #include <linux/mempolicy.h> | 85 | #include <linux/mempolicy.h> |
86 | #include <linux/swap.h> | ||
87 | #include <linux/seq_file.h> | ||
88 | #include <linux/proc_fs.h> | ||
89 | |||
86 | #include <asm/tlbflush.h> | 90 | #include <asm/tlbflush.h> |
87 | #include <asm/uaccess.h> | 91 | #include <asm/uaccess.h> |
88 | 92 | ||
93 | /* Internal flags */ | ||
94 | #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ | ||
95 | #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ | ||
96 | #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ | ||
97 | |||
89 | static kmem_cache_t *policy_cache; | 98 | static kmem_cache_t *policy_cache; |
90 | static kmem_cache_t *sn_cache; | 99 | static kmem_cache_t *sn_cache; |
91 | 100 | ||
@@ -171,12 +180,19 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) | |||
171 | break; | 180 | break; |
172 | } | 181 | } |
173 | policy->policy = mode; | 182 | policy->policy = mode; |
183 | policy->cpuset_mems_allowed = cpuset_mems_allowed(current); | ||
174 | return policy; | 184 | return policy; |
175 | } | 185 | } |
176 | 186 | ||
177 | /* Ensure all existing pages follow the policy. */ | 187 | static void gather_stats(struct page *, void *); |
188 | static void migrate_page_add(struct vm_area_struct *vma, | ||
189 | struct page *page, struct list_head *pagelist, unsigned long flags); | ||
190 | |||
191 | /* Scan through pages checking if pages follow certain conditions. */ | ||
178 | static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 192 | static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
179 | unsigned long addr, unsigned long end, nodemask_t *nodes) | 193 | unsigned long addr, unsigned long end, |
194 | const nodemask_t *nodes, unsigned long flags, | ||
195 | void *private) | ||
180 | { | 196 | { |
181 | pte_t *orig_pte; | 197 | pte_t *orig_pte; |
182 | pte_t *pte; | 198 | pte_t *pte; |
@@ -193,7 +209,17 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
193 | if (!page) | 209 | if (!page) |
194 | continue; | 210 | continue; |
195 | nid = page_to_nid(page); | 211 | nid = page_to_nid(page); |
196 | if (!node_isset(nid, *nodes)) | 212 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) |
213 | continue; | ||
214 | |||
215 | if (flags & MPOL_MF_STATS) | ||
216 | gather_stats(page, private); | ||
217 | else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { | ||
218 | spin_unlock(ptl); | ||
219 | migrate_page_add(vma, page, private, flags); | ||
220 | spin_lock(ptl); | ||
221 | } | ||
222 | else | ||
197 | break; | 223 | break; |
198 | } while (pte++, addr += PAGE_SIZE, addr != end); | 224 | } while (pte++, addr += PAGE_SIZE, addr != end); |
199 | pte_unmap_unlock(orig_pte, ptl); | 225 | pte_unmap_unlock(orig_pte, ptl); |
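The new skip test in check_pte_range() is worth unpacking, since the comparison against the inverted flag is easy to misread: without MPOL_MF_INVERT the walk acts on pages whose node is in *nodes (the do_migrate_pages() case below), and with MPOL_MF_INVERT it acts on pages whose node is not in *nodes (the mbind() policy-violation case). A minimal sketch in plain C, not part of the patch:

	#include <stdbool.h>

	/* in_mask stands for node_isset(nid, *nodes),
	 * invert for (flags & MPOL_MF_INVERT) != 0. */
	static bool skip_page(bool in_mask, bool invert)
	{
		/*
		 * in_mask  invert  -> skip?
		 *    0        0       yes   page outside the node set of interest
		 *    1        0       no    act on pages inside the set
		 *    0        1       no    INVERT: act on pages outside the set
		 *    1        1       yes   page already on an allowed node
		 */
		return in_mask == invert;
	}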
@@ -201,7 +227,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
201 | } | 227 | } |
202 | 228 | ||
203 | static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 229 | static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
204 | unsigned long addr, unsigned long end, nodemask_t *nodes) | 230 | unsigned long addr, unsigned long end, |
231 | const nodemask_t *nodes, unsigned long flags, | ||
232 | void *private) | ||
205 | { | 233 | { |
206 | pmd_t *pmd; | 234 | pmd_t *pmd; |
207 | unsigned long next; | 235 | unsigned long next; |
@@ -211,14 +239,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
211 | next = pmd_addr_end(addr, end); | 239 | next = pmd_addr_end(addr, end); |
212 | if (pmd_none_or_clear_bad(pmd)) | 240 | if (pmd_none_or_clear_bad(pmd)) |
213 | continue; | 241 | continue; |
214 | if (check_pte_range(vma, pmd, addr, next, nodes)) | 242 | if (check_pte_range(vma, pmd, addr, next, nodes, |
243 | flags, private)) | ||
215 | return -EIO; | 244 | return -EIO; |
216 | } while (pmd++, addr = next, addr != end); | 245 | } while (pmd++, addr = next, addr != end); |
217 | return 0; | 246 | return 0; |
218 | } | 247 | } |
219 | 248 | ||
220 | static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | 249 | static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, |
221 | unsigned long addr, unsigned long end, nodemask_t *nodes) | 250 | unsigned long addr, unsigned long end, |
251 | const nodemask_t *nodes, unsigned long flags, | ||
252 | void *private) | ||
222 | { | 253 | { |
223 | pud_t *pud; | 254 | pud_t *pud; |
224 | unsigned long next; | 255 | unsigned long next; |
@@ -228,14 +259,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | |||
228 | next = pud_addr_end(addr, end); | 259 | next = pud_addr_end(addr, end); |
229 | if (pud_none_or_clear_bad(pud)) | 260 | if (pud_none_or_clear_bad(pud)) |
230 | continue; | 261 | continue; |
231 | if (check_pmd_range(vma, pud, addr, next, nodes)) | 262 | if (check_pmd_range(vma, pud, addr, next, nodes, |
263 | flags, private)) | ||
232 | return -EIO; | 264 | return -EIO; |
233 | } while (pud++, addr = next, addr != end); | 265 | } while (pud++, addr = next, addr != end); |
234 | return 0; | 266 | return 0; |
235 | } | 267 | } |
236 | 268 | ||
237 | static inline int check_pgd_range(struct vm_area_struct *vma, | 269 | static inline int check_pgd_range(struct vm_area_struct *vma, |
238 | unsigned long addr, unsigned long end, nodemask_t *nodes) | 270 | unsigned long addr, unsigned long end, |
271 | const nodemask_t *nodes, unsigned long flags, | ||
272 | void *private) | ||
239 | { | 273 | { |
240 | pgd_t *pgd; | 274 | pgd_t *pgd; |
241 | unsigned long next; | 275 | unsigned long next; |
@@ -245,16 +279,30 @@ static inline int check_pgd_range(struct vm_area_struct *vma, | |||
245 | next = pgd_addr_end(addr, end); | 279 | next = pgd_addr_end(addr, end); |
246 | if (pgd_none_or_clear_bad(pgd)) | 280 | if (pgd_none_or_clear_bad(pgd)) |
247 | continue; | 281 | continue; |
248 | if (check_pud_range(vma, pgd, addr, next, nodes)) | 282 | if (check_pud_range(vma, pgd, addr, next, nodes, |
283 | flags, private)) | ||
249 | return -EIO; | 284 | return -EIO; |
250 | } while (pgd++, addr = next, addr != end); | 285 | } while (pgd++, addr = next, addr != end); |
251 | return 0; | 286 | return 0; |
252 | } | 287 | } |
253 | 288 | ||
254 | /* Step 1: check the range */ | 289 | /* Check if a vma is migratable */ |
290 | static inline int vma_migratable(struct vm_area_struct *vma) | ||
291 | { | ||
292 | if (vma->vm_flags & ( | ||
293 | VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP)) | ||
294 | return 0; | ||
295 | return 1; | ||
296 | } | ||
297 | |||
298 | /* | ||
299 | * Check if all pages in a range are on a set of nodes. | ||
300 | * If pagelist != NULL then isolate pages from the LRU and | ||
301 | * put them on the pagelist. | ||
302 | */ | ||
255 | static struct vm_area_struct * | 303 | static struct vm_area_struct * |
256 | check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | 304 | check_range(struct mm_struct *mm, unsigned long start, unsigned long end, |
257 | nodemask_t *nodes, unsigned long flags) | 305 | const nodemask_t *nodes, unsigned long flags, void *private) |
258 | { | 306 | { |
259 | int err; | 307 | int err; |
260 | struct vm_area_struct *first, *vma, *prev; | 308 | struct vm_area_struct *first, *vma, *prev; |
@@ -264,17 +312,24 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
264 | return ERR_PTR(-EFAULT); | 312 | return ERR_PTR(-EFAULT); |
265 | prev = NULL; | 313 | prev = NULL; |
266 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { | 314 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { |
267 | if (!vma->vm_next && vma->vm_end < end) | 315 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { |
268 | return ERR_PTR(-EFAULT); | 316 | if (!vma->vm_next && vma->vm_end < end) |
269 | if (prev && prev->vm_end < vma->vm_start) | 317 | return ERR_PTR(-EFAULT); |
270 | return ERR_PTR(-EFAULT); | 318 | if (prev && prev->vm_end < vma->vm_start) |
271 | if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { | 319 | return ERR_PTR(-EFAULT); |
320 | } | ||
321 | if (!is_vm_hugetlb_page(vma) && | ||
322 | ((flags & MPOL_MF_STRICT) || | ||
323 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && | ||
324 | vma_migratable(vma)))) { | ||
272 | unsigned long endvma = vma->vm_end; | 325 | unsigned long endvma = vma->vm_end; |
326 | |||
273 | if (endvma > end) | 327 | if (endvma > end) |
274 | endvma = end; | 328 | endvma = end; |
275 | if (vma->vm_start > start) | 329 | if (vma->vm_start > start) |
276 | start = vma->vm_start; | 330 | start = vma->vm_start; |
277 | err = check_pgd_range(vma, start, endvma, nodes); | 331 | err = check_pgd_range(vma, start, endvma, nodes, |
332 | flags, private); | ||
278 | if (err) { | 333 | if (err) { |
279 | first = ERR_PTR(err); | 334 | first = ERR_PTR(err); |
280 | break; | 335 | break; |
@@ -333,51 +388,10 @@ static int contextualize_policy(int mode, nodemask_t *nodes) | |||
333 | if (!nodes) | 388 | if (!nodes) |
334 | return 0; | 389 | return 0; |
335 | 390 | ||
336 | /* Update current mems_allowed */ | 391 | cpuset_update_task_memory_state(); |
337 | cpuset_update_current_mems_allowed(); | 392 | if (!cpuset_nodes_subset_current_mems_allowed(*nodes)) |
338 | /* Ignore nodes not set in current->mems_allowed */ | ||
339 | cpuset_restrict_to_mems_allowed(nodes->bits); | ||
340 | return mpol_check_policy(mode, nodes); | ||
341 | } | ||
342 | |||
343 | long do_mbind(unsigned long start, unsigned long len, | ||
344 | unsigned long mode, nodemask_t *nmask, unsigned long flags) | ||
345 | { | ||
346 | struct vm_area_struct *vma; | ||
347 | struct mm_struct *mm = current->mm; | ||
348 | struct mempolicy *new; | ||
349 | unsigned long end; | ||
350 | int err; | ||
351 | |||
352 | if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) | ||
353 | return -EINVAL; | ||
354 | if (start & ~PAGE_MASK) | ||
355 | return -EINVAL; | ||
356 | if (mode == MPOL_DEFAULT) | ||
357 | flags &= ~MPOL_MF_STRICT; | ||
358 | len = (len + PAGE_SIZE - 1) & PAGE_MASK; | ||
359 | end = start + len; | ||
360 | if (end < start) | ||
361 | return -EINVAL; | 393 | return -EINVAL; |
362 | if (end == start) | 394 | return mpol_check_policy(mode, nodes); |
363 | return 0; | ||
364 | if (mpol_check_policy(mode, nmask)) | ||
365 | return -EINVAL; | ||
366 | new = mpol_new(mode, nmask); | ||
367 | if (IS_ERR(new)) | ||
368 | return PTR_ERR(new); | ||
369 | |||
370 | PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, | ||
371 | mode,nodes_addr(nodes)[0]); | ||
372 | |||
373 | down_write(&mm->mmap_sem); | ||
374 | vma = check_range(mm, start, end, nmask, flags); | ||
375 | err = PTR_ERR(vma); | ||
376 | if (!IS_ERR(vma)) | ||
377 | err = mbind_range(vma, start, end, new); | ||
378 | up_write(&mm->mmap_sem); | ||
379 | mpol_free(new); | ||
380 | return err; | ||
381 | } | 395 | } |
382 | 396 | ||
383 | /* Set the process memory policy */ | 397 | /* Set the process memory policy */ |
@@ -448,7 +462,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
448 | struct vm_area_struct *vma = NULL; | 462 | struct vm_area_struct *vma = NULL; |
449 | struct mempolicy *pol = current->mempolicy; | 463 | struct mempolicy *pol = current->mempolicy; |
450 | 464 | ||
451 | cpuset_update_current_mems_allowed(); | 465 | cpuset_update_task_memory_state(); |
452 | if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) | 466 | if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) |
453 | return -EINVAL; | 467 | return -EINVAL; |
454 | if (flags & MPOL_F_ADDR) { | 468 | if (flags & MPOL_F_ADDR) { |
@@ -500,11 +514,177 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
500 | } | 514 | } |
501 | 515 | ||
502 | /* | 516 | /* |
517 | * page migration | ||
518 | */ | ||
519 | |||
520 | /* Check if we are the only process mapping the page in question */ | ||
521 | static inline int single_mm_mapping(struct mm_struct *mm, | ||
522 | struct address_space *mapping) | ||
523 | { | ||
524 | struct vm_area_struct *vma; | ||
525 | struct prio_tree_iter iter; | ||
526 | int rc = 1; | ||
527 | |||
528 | spin_lock(&mapping->i_mmap_lock); | ||
529 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) | ||
530 | if (mm != vma->vm_mm) { | ||
531 | rc = 0; | ||
532 | goto out; | ||
533 | } | ||
534 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) | ||
535 | if (mm != vma->vm_mm) { | ||
536 | rc = 0; | ||
537 | goto out; | ||
538 | } | ||
539 | out: | ||
540 | spin_unlock(&mapping->i_mmap_lock); | ||
541 | return rc; | ||
542 | } | ||
543 | |||
544 | /* | ||
545 | * Add a page to be migrated to the pagelist | ||
546 | */ | ||
547 | static void migrate_page_add(struct vm_area_struct *vma, | ||
548 | struct page *page, struct list_head *pagelist, unsigned long flags) | ||
549 | { | ||
550 | /* | ||
551 | * Avoid migrating a page that is shared by others and not writable. | ||
552 | */ | ||
553 | if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) || | ||
554 | mapping_writably_mapped(page->mapping) || | ||
555 | single_mm_mapping(vma->vm_mm, page->mapping)) { | ||
556 | int rc = isolate_lru_page(page); | ||
557 | |||
558 | if (rc == 1) | ||
559 | list_add(&page->lru, pagelist); | ||
560 | /* | ||
561 | * If the isolate attempt was not successful then we just | ||
562 | * encountered an unswappable page. Something must be wrong. | ||
563 | */ | ||
564 | WARN_ON(rc == 0); | ||
565 | } | ||
566 | } | ||
567 | |||
568 | static int swap_pages(struct list_head *pagelist) | ||
569 | { | ||
570 | LIST_HEAD(moved); | ||
571 | LIST_HEAD(failed); | ||
572 | int n; | ||
573 | |||
574 | n = migrate_pages(pagelist, NULL, &moved, &failed); | ||
575 | putback_lru_pages(&failed); | ||
576 | putback_lru_pages(&moved); | ||
577 | |||
578 | return n; | ||
579 | } | ||
580 | |||
581 | /* | ||
582 | * For now migrate_pages simply swaps out the pages from nodes that are in | ||
583 | * the source set but not in the target set. In the future, we would | ||
584 | * want a function that moves pages between the two nodesets in such | ||
585 | * a way as to preserve the physical layout as much as possible. | ||
586 | * | ||
587 | * Returns the number of pages that could not be moved. | ||
588 | */ | ||
589 | int do_migrate_pages(struct mm_struct *mm, | ||
590 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) | ||
591 | { | ||
592 | LIST_HEAD(pagelist); | ||
593 | int count = 0; | ||
594 | nodemask_t nodes; | ||
595 | |||
596 | nodes_andnot(nodes, *from_nodes, *to_nodes); | ||
597 | |||
598 | down_read(&mm->mmap_sem); | ||
599 | check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes, | ||
600 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | ||
601 | |||
602 | if (!list_empty(&pagelist)) { | ||
603 | count = swap_pages(&pagelist); | ||
604 | putback_lru_pages(&pagelist); | ||
605 | } | ||
606 | |||
607 | up_read(&mm->mmap_sem); | ||
608 | return count; | ||
609 | } | ||
610 | |||
611 | long do_mbind(unsigned long start, unsigned long len, | ||
612 | unsigned long mode, nodemask_t *nmask, unsigned long flags) | ||
613 | { | ||
614 | struct vm_area_struct *vma; | ||
615 | struct mm_struct *mm = current->mm; | ||
616 | struct mempolicy *new; | ||
617 | unsigned long end; | ||
618 | int err; | ||
619 | LIST_HEAD(pagelist); | ||
620 | |||
621 | if ((flags & ~(unsigned long)(MPOL_MF_STRICT | | ||
622 | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | ||
623 | || mode > MPOL_MAX) | ||
624 | return -EINVAL; | ||
625 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) | ||
626 | return -EPERM; | ||
627 | |||
628 | if (start & ~PAGE_MASK) | ||
629 | return -EINVAL; | ||
630 | |||
631 | if (mode == MPOL_DEFAULT) | ||
632 | flags &= ~MPOL_MF_STRICT; | ||
633 | |||
634 | len = (len + PAGE_SIZE - 1) & PAGE_MASK; | ||
635 | end = start + len; | ||
636 | |||
637 | if (end < start) | ||
638 | return -EINVAL; | ||
639 | if (end == start) | ||
640 | return 0; | ||
641 | |||
642 | if (mpol_check_policy(mode, nmask)) | ||
643 | return -EINVAL; | ||
644 | |||
645 | new = mpol_new(mode, nmask); | ||
646 | if (IS_ERR(new)) | ||
647 | return PTR_ERR(new); | ||
648 | |||
649 | /* | ||
650 | * If we are using the default policy then operation | ||
651 | * on discontinuous address spaces is okay after all | ||
652 | */ | ||
653 | if (!new) | ||
654 | flags |= MPOL_MF_DISCONTIG_OK; | ||
655 | |||
656 | PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, | ||
657 | mode,nodes_addr(nodes)[0]); | ||
658 | |||
659 | down_write(&mm->mmap_sem); | ||
660 | vma = check_range(mm, start, end, nmask, | ||
661 | flags | MPOL_MF_INVERT, &pagelist); | ||
662 | |||
663 | err = PTR_ERR(vma); | ||
664 | if (!IS_ERR(vma)) { | ||
665 | int nr_failed = 0; | ||
666 | |||
667 | err = mbind_range(vma, start, end, new); | ||
668 | if (!list_empty(&pagelist)) | ||
669 | nr_failed = swap_pages(&pagelist); | ||
670 | |||
671 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | ||
672 | err = -EIO; | ||
673 | } | ||
674 | if (!list_empty(&pagelist)) | ||
675 | putback_lru_pages(&pagelist); | ||
676 | |||
677 | up_write(&mm->mmap_sem); | ||
678 | mpol_free(new); | ||
679 | return err; | ||
680 | } | ||
681 | |||
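With MPOL_MF_MOVE and MPOL_MF_MOVE_ALL now accepted by do_mbind(), user space can ask for existing pages to be migrated when a range is (re)bound, and MPOL_MF_STRICT turns failed migrations into -EIO, matching the nr_failed handling above. A hedged user-space sketch, assuming the mbind() wrapper and flag definitions from libnuma's <numaif.h> and a machine that has a node 1:

	/* Sketch only: bind a buffer to node 1 and ask the kernel to move
	 * any pages already allocated elsewhere.  Error handling elided. */
	#include <numaif.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	int main(void)
	{
		size_t len = 16 * 4096;
		void *buf = aligned_alloc(4096, len);	/* mbind() needs page alignment */
		unsigned long nodemask = 1UL << 1;	/* node 1 only */

		memset(buf, 0, len);			/* fault the pages in first */
		if (mbind(buf, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8,
			  MPOL_MF_MOVE | MPOL_MF_STRICT) != 0)
			perror("mbind");
		return 0;
	}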
682 | /* | ||
503 | * User space interface with variable sized bitmaps for nodelists. | 683 | * User space interface with variable sized bitmaps for nodelists. |
504 | */ | 684 | */ |
505 | 685 | ||
506 | /* Copy a node mask from user space. */ | 686 | /* Copy a node mask from user space. */ |
507 | static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, | 687 | static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, |
508 | unsigned long maxnode) | 688 | unsigned long maxnode) |
509 | { | 689 | { |
510 | unsigned long k; | 690 | unsigned long k; |
@@ -593,6 +773,65 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, | |||
593 | return do_set_mempolicy(mode, &nodes); | 773 | return do_set_mempolicy(mode, &nodes); |
594 | } | 774 | } |
595 | 775 | ||
776 | asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, | ||
777 | const unsigned long __user *old_nodes, | ||
778 | const unsigned long __user *new_nodes) | ||
779 | { | ||
780 | struct mm_struct *mm; | ||
781 | struct task_struct *task; | ||
782 | nodemask_t old; | ||
783 | nodemask_t new; | ||
784 | nodemask_t task_nodes; | ||
785 | int err; | ||
786 | |||
787 | err = get_nodes(&old, old_nodes, maxnode); | ||
788 | if (err) | ||
789 | return err; | ||
790 | |||
791 | err = get_nodes(&new, new_nodes, maxnode); | ||
792 | if (err) | ||
793 | return err; | ||
794 | |||
795 | /* Find the mm_struct */ | ||
796 | read_lock(&tasklist_lock); | ||
797 | task = pid ? find_task_by_pid(pid) : current; | ||
798 | if (!task) { | ||
799 | read_unlock(&tasklist_lock); | ||
800 | return -ESRCH; | ||
801 | } | ||
802 | mm = get_task_mm(task); | ||
803 | read_unlock(&tasklist_lock); | ||
804 | |||
805 | if (!mm) | ||
806 | return -EINVAL; | ||
807 | |||
808 | /* | ||
809 | * Check if this process has the right to modify the specified | ||
810 | * process. The right exists if the process has administrative | ||
811 | * capabilities, superuser privileges or the same | ||
812 | * userid as the target process. | ||
813 | */ | ||
814 | if ((current->euid != task->suid) && (current->euid != task->uid) && | ||
815 | (current->uid != task->suid) && (current->uid != task->uid) && | ||
816 | !capable(CAP_SYS_ADMIN)) { | ||
817 | err = -EPERM; | ||
818 | goto out; | ||
819 | } | ||
820 | |||
821 | task_nodes = cpuset_mems_allowed(task); | ||
822 | /* Is the user allowed to access the target nodes? */ | ||
823 | if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) { | ||
824 | err = -EPERM; | ||
825 | goto out; | ||
826 | } | ||
827 | |||
828 | err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE); | ||
829 | out: | ||
830 | mmput(mm); | ||
831 | return err; | ||
832 | } | ||
833 | |||
834 | |||
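The new sys_migrate_pages() entry point can be exercised directly once a syscall number is wired up for the architecture. A hedged sketch using a raw syscall (assuming __NR_migrate_pages is defined on the target; libnuma's numa_migrate_pages() wraps the same call). Per the comment on do_migrate_pages(), a non-negative return is the number of pages that could not be moved:

	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		pid_t pid = (argc > 1) ? atoi(argv[1]) : 0;	/* 0 means current task */
		unsigned long old_nodes = 1UL << 0;		/* migrate from node 0 */
		unsigned long new_nodes = 1UL << 1;		/* ... to node 1 */
		long ret;

		ret = syscall(__NR_migrate_pages, pid, sizeof(old_nodes) * 8,
			      &old_nodes, &new_nodes);
		if (ret < 0)
			perror("migrate_pages");
		else
			printf("%ld pages could not be moved\n", ret);
		return 0;
	}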
596 | /* Retrieve NUMA policy */ | 835 | /* Retrieve NUMA policy */ |
597 | asmlinkage long sys_get_mempolicy(int __user *policy, | 836 | asmlinkage long sys_get_mempolicy(int __user *policy, |
598 | unsigned long __user *nmask, | 837 | unsigned long __user *nmask, |
@@ -699,8 +938,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, | |||
699 | #endif | 938 | #endif |
700 | 939 | ||
701 | /* Return effective policy for a VMA */ | 940 | /* Return effective policy for a VMA */ |
702 | struct mempolicy * | 941 | static struct mempolicy * get_vma_policy(struct task_struct *task, |
703 | get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) | 942 | struct vm_area_struct *vma, unsigned long addr) |
704 | { | 943 | { |
705 | struct mempolicy *pol = task->mempolicy; | 944 | struct mempolicy *pol = task->mempolicy; |
706 | 945 | ||
@@ -848,7 +1087,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
848 | { | 1087 | { |
849 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | 1088 | struct mempolicy *pol = get_vma_policy(current, vma, addr); |
850 | 1089 | ||
851 | cpuset_update_current_mems_allowed(); | 1090 | cpuset_update_task_memory_state(); |
852 | 1091 | ||
853 | if (unlikely(pol->policy == MPOL_INTERLEAVE)) { | 1092 | if (unlikely(pol->policy == MPOL_INTERLEAVE)) { |
854 | unsigned nid; | 1093 | unsigned nid; |
@@ -874,7 +1113,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
874 | * interrupt context and apply the current process NUMA policy. | 1113 | * interrupt context and apply the current process NUMA policy. |
875 | * Returns NULL when no page can be allocated. | 1114 | * Returns NULL when no page can be allocated. |
876 | * | 1115 | * |
877 | * Don't call cpuset_update_current_mems_allowed() unless | 1116 | * Don't call cpuset_update_task_memory_state() unless |
878 | * 1) it's ok to take cpuset_sem (can WAIT), and | 1117 | * 1) it's ok to take cpuset_sem (can WAIT), and |
879 | * 2) allocating for current task (not interrupt). | 1118 | * 2) allocating for current task (not interrupt). |
880 | */ | 1119 | */ |
@@ -883,7 +1122,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
883 | struct mempolicy *pol = current->mempolicy; | 1122 | struct mempolicy *pol = current->mempolicy; |
884 | 1123 | ||
885 | if ((gfp & __GFP_WAIT) && !in_interrupt()) | 1124 | if ((gfp & __GFP_WAIT) && !in_interrupt()) |
886 | cpuset_update_current_mems_allowed(); | 1125 | cpuset_update_task_memory_state(); |
887 | if (!pol || in_interrupt()) | 1126 | if (!pol || in_interrupt()) |
888 | pol = &default_policy; | 1127 | pol = &default_policy; |
889 | if (pol->policy == MPOL_INTERLEAVE) | 1128 | if (pol->policy == MPOL_INTERLEAVE) |
@@ -892,6 +1131,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
892 | } | 1131 | } |
893 | EXPORT_SYMBOL(alloc_pages_current); | 1132 | EXPORT_SYMBOL(alloc_pages_current); |
894 | 1133 | ||
1134 | /* | ||
1135 | * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it | ||
1136 | * rebinds the mempolicy it is copying by calling mpol_rebind_policy() | ||
1137 | * with the mems_allowed returned by cpuset_mems_allowed(). This | ||
1138 | * keeps mempolicies cpuset relative after its cpuset moves. See | ||
1139 | * further kernel/cpuset.c update_nodemask(). | ||
1140 | */ | ||
1141 | void *cpuset_being_rebound; | ||
1142 | |||
895 | /* Slow path of a mempolicy copy */ | 1143 | /* Slow path of a mempolicy copy */ |
896 | struct mempolicy *__mpol_copy(struct mempolicy *old) | 1144 | struct mempolicy *__mpol_copy(struct mempolicy *old) |
897 | { | 1145 | { |
@@ -899,6 +1147,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) | |||
899 | 1147 | ||
900 | if (!new) | 1148 | if (!new) |
901 | return ERR_PTR(-ENOMEM); | 1149 | return ERR_PTR(-ENOMEM); |
1150 | if (current_cpuset_is_being_rebound()) { | ||
1151 | nodemask_t mems = cpuset_mems_allowed(current); | ||
1152 | mpol_rebind_policy(old, &mems); | ||
1153 | } | ||
902 | *new = *old; | 1154 | *new = *old; |
903 | atomic_set(&new->refcnt, 1); | 1155 | atomic_set(&new->refcnt, 1); |
904 | if (new->policy == MPOL_BIND) { | 1156 | if (new->policy == MPOL_BIND) { |
@@ -1173,25 +1425,31 @@ void numa_default_policy(void) | |||
1173 | } | 1425 | } |
1174 | 1426 | ||
1175 | /* Migrate a policy to a different set of nodes */ | 1427 | /* Migrate a policy to a different set of nodes */ |
1176 | static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, | 1428 | void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) |
1177 | const nodemask_t *new) | ||
1178 | { | 1429 | { |
1430 | nodemask_t *mpolmask; | ||
1179 | nodemask_t tmp; | 1431 | nodemask_t tmp; |
1180 | 1432 | ||
1181 | if (!pol) | 1433 | if (!pol) |
1182 | return; | 1434 | return; |
1435 | mpolmask = &pol->cpuset_mems_allowed; | ||
1436 | if (nodes_equal(*mpolmask, *newmask)) | ||
1437 | return; | ||
1183 | 1438 | ||
1184 | switch (pol->policy) { | 1439 | switch (pol->policy) { |
1185 | case MPOL_DEFAULT: | 1440 | case MPOL_DEFAULT: |
1186 | break; | 1441 | break; |
1187 | case MPOL_INTERLEAVE: | 1442 | case MPOL_INTERLEAVE: |
1188 | nodes_remap(tmp, pol->v.nodes, *old, *new); | 1443 | nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); |
1189 | pol->v.nodes = tmp; | 1444 | pol->v.nodes = tmp; |
1190 | current->il_next = node_remap(current->il_next, *old, *new); | 1445 | *mpolmask = *newmask; |
1446 | current->il_next = node_remap(current->il_next, | ||
1447 | *mpolmask, *newmask); | ||
1191 | break; | 1448 | break; |
1192 | case MPOL_PREFERRED: | 1449 | case MPOL_PREFERRED: |
1193 | pol->v.preferred_node = node_remap(pol->v.preferred_node, | 1450 | pol->v.preferred_node = node_remap(pol->v.preferred_node, |
1194 | *old, *new); | 1451 | *mpolmask, *newmask); |
1452 | *mpolmask = *newmask; | ||
1195 | break; | 1453 | break; |
1196 | case MPOL_BIND: { | 1454 | case MPOL_BIND: { |
1197 | nodemask_t nodes; | 1455 | nodemask_t nodes; |
@@ -1201,7 +1459,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, | |||
1201 | nodes_clear(nodes); | 1459 | nodes_clear(nodes); |
1202 | for (z = pol->v.zonelist->zones; *z; z++) | 1460 | for (z = pol->v.zonelist->zones; *z; z++) |
1203 | node_set((*z)->zone_pgdat->node_id, nodes); | 1461 | node_set((*z)->zone_pgdat->node_id, nodes); |
1204 | nodes_remap(tmp, nodes, *old, *new); | 1462 | nodes_remap(tmp, nodes, *mpolmask, *newmask); |
1205 | nodes = tmp; | 1463 | nodes = tmp; |
1206 | 1464 | ||
1207 | zonelist = bind_zonelist(&nodes); | 1465 | zonelist = bind_zonelist(&nodes); |
@@ -1216,6 +1474,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, | |||
1216 | kfree(pol->v.zonelist); | 1474 | kfree(pol->v.zonelist); |
1217 | pol->v.zonelist = zonelist; | 1475 | pol->v.zonelist = zonelist; |
1218 | } | 1476 | } |
1477 | *mpolmask = *newmask; | ||
1219 | break; | 1478 | break; |
1220 | } | 1479 | } |
1221 | default: | 1480 | default: |
@@ -1225,12 +1484,156 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, | |||
1225 | } | 1484 | } |
1226 | 1485 | ||
1227 | /* | 1486 | /* |
1228 | * Someone moved this task to different nodes. Fixup mempolicies. | 1487 | * Wrapper for mpol_rebind_policy() that just requires task |
1488 | * pointer, and updates task mempolicy. | ||
1489 | */ | ||
1490 | |||
1491 | void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) | ||
1492 | { | ||
1493 | mpol_rebind_policy(tsk->mempolicy, new); | ||
1494 | } | ||
1495 | |||
1496 | /* | ||
1497 | * Rebind each vma in mm to new nodemask. | ||
1229 | * | 1498 | * |
1230 | * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well, | 1499 | * Call holding a reference to mm. Takes mm->mmap_sem during call. |
1231 | * once we have a cpuset mechanism to mark which cpuset subtree is migrating. | ||
1232 | */ | 1500 | */ |
1233 | void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new) | 1501 | |
1502 | void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) | ||
1234 | { | 1503 | { |
1235 | rebind_policy(current->mempolicy, old, new); | 1504 | struct vm_area_struct *vma; |
1505 | |||
1506 | down_write(&mm->mmap_sem); | ||
1507 | for (vma = mm->mmap; vma; vma = vma->vm_next) | ||
1508 | mpol_rebind_policy(vma->vm_policy, new); | ||
1509 | up_write(&mm->mmap_sem); | ||
1236 | } | 1510 | } |
1511 | |||
1512 | /* | ||
1513 | * Display pages allocated per node and memory policy via /proc. | ||
1514 | */ | ||
1515 | |||
1516 | static const char *policy_types[] = { "default", "prefer", "bind", | ||
1517 | "interleave" }; | ||
1518 | |||
1519 | /* | ||
1520 | * Convert a mempolicy into a string. | ||
1521 | * Returns the number of characters in buffer (if positive) | ||
1522 | * or an error (negative) | ||
1523 | */ | ||
1524 | static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) | ||
1525 | { | ||
1526 | char *p = buffer; | ||
1527 | int l; | ||
1528 | nodemask_t nodes; | ||
1529 | int mode = pol ? pol->policy : MPOL_DEFAULT; | ||
1530 | |||
1531 | switch (mode) { | ||
1532 | case MPOL_DEFAULT: | ||
1533 | nodes_clear(nodes); | ||
1534 | break; | ||
1535 | |||
1536 | case MPOL_PREFERRED: | ||
1537 | nodes_clear(nodes); | ||
1538 | node_set(pol->v.preferred_node, nodes); | ||
1539 | break; | ||
1540 | |||
1541 | case MPOL_BIND: | ||
1542 | get_zonemask(pol, &nodes); | ||
1543 | break; | ||
1544 | |||
1545 | case MPOL_INTERLEAVE: | ||
1546 | nodes = pol->v.nodes; | ||
1547 | break; | ||
1548 | |||
1549 | default: | ||
1550 | BUG(); | ||
1551 | return -EFAULT; | ||
1552 | } | ||
1553 | |||
1554 | l = strlen(policy_types[mode]); | ||
1555 | if (buffer + maxlen < p + l + 1) | ||
1556 | return -ENOSPC; | ||
1557 | |||
1558 | strcpy(p, policy_types[mode]); | ||
1559 | p += l; | ||
1560 | |||
1561 | if (!nodes_empty(nodes)) { | ||
1562 | if (buffer + maxlen < p + 2) | ||
1563 | return -ENOSPC; | ||
1564 | *p++ = '='; | ||
1565 | p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); | ||
1566 | } | ||
1567 | return p - buffer; | ||
1568 | } | ||
1569 | |||
1570 | struct numa_maps { | ||
1571 | unsigned long pages; | ||
1572 | unsigned long anon; | ||
1573 | unsigned long mapped; | ||
1574 | unsigned long mapcount_max; | ||
1575 | unsigned long node[MAX_NUMNODES]; | ||
1576 | }; | ||
1577 | |||
1578 | static void gather_stats(struct page *page, void *private) | ||
1579 | { | ||
1580 | struct numa_maps *md = private; | ||
1581 | int count = page_mapcount(page); | ||
1582 | |||
1583 | if (count) | ||
1584 | md->mapped++; | ||
1585 | |||
1586 | if (count > md->mapcount_max) | ||
1587 | md->mapcount_max = count; | ||
1588 | |||
1589 | md->pages++; | ||
1590 | |||
1591 | if (PageAnon(page)) | ||
1592 | md->anon++; | ||
1593 | |||
1594 | md->node[page_to_nid(page)]++; | ||
1595 | cond_resched(); | ||
1596 | } | ||
1597 | |||
1598 | int show_numa_map(struct seq_file *m, void *v) | ||
1599 | { | ||
1600 | struct task_struct *task = m->private; | ||
1601 | struct vm_area_struct *vma = v; | ||
1602 | struct numa_maps *md; | ||
1603 | int n; | ||
1604 | char buffer[50]; | ||
1605 | |||
1606 | if (!vma->vm_mm) | ||
1607 | return 0; | ||
1608 | |||
1609 | md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); | ||
1610 | if (!md) | ||
1611 | return 0; | ||
1612 | |||
1613 | check_pgd_range(vma, vma->vm_start, vma->vm_end, | ||
1614 | &node_online_map, MPOL_MF_STATS, md); | ||
1615 | |||
1616 | if (md->pages) { | ||
1617 | mpol_to_str(buffer, sizeof(buffer), | ||
1618 | get_vma_policy(task, vma, vma->vm_start)); | ||
1619 | |||
1620 | seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu", | ||
1621 | vma->vm_start, buffer, md->pages, | ||
1622 | md->mapped, md->mapcount_max); | ||
1623 | |||
1624 | if (md->anon) | ||
1625 | seq_printf(m," anon=%lu",md->anon); | ||
1626 | |||
1627 | for_each_online_node(n) | ||
1628 | if (md->node[n]) | ||
1629 | seq_printf(m, " N%d=%lu", n, md->node[n]); | ||
1630 | |||
1631 | seq_putc(m, '\n'); | ||
1632 | } | ||
1633 | kfree(md); | ||
1634 | |||
1635 | if (m->count < m->size) | ||
1636 | m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; | ||
1637 | return 0; | ||
1638 | } | ||
1639 | |||
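Finally, the show_numa_map() output added here can be inspected from user space; each line has the shape produced by the seq_printf() calls above, e.g. something like "00400000 prefer=1 pages=12 mapped=12 maxref=1 anon=3 N1=12" (start address, policy string from mpol_to_str(), then the per-vma counters from gather_stats()). A small hedged reader, assuming the file this series exposes is /proc/<pid>/numa_maps:

	#include <stdio.h>

	int main(int argc, char **argv)
	{
		char path[64], line[256];
		FILE *f;

		snprintf(path, sizeof(path), "/proc/%s/numa_maps",
			 (argc > 1) ? argv[1] : "self");
		f = fopen(path, "r");
		if (!f) {
			perror(path);
			return 1;
		}
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
		return 0;
	}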