path: root/mm/mempolicy.c
Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--  mm/mempolicy.c  561
1 file changed, 482 insertions(+), 79 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0f1d2b8a952b..1850d0aef4ac 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -83,9 +83,18 @@
 #include <linux/init.h>
 #include <linux/compat.h>
 #include <linux/mempolicy.h>
+#include <linux/swap.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
 
+/* Internal flags */
+#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
+#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
+#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
+
 static kmem_cache_t *policy_cache;
 static kmem_cache_t *sn_cache;
 
@@ -171,12 +180,19 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 		break;
 	}
 	policy->policy = mode;
+	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 	return policy;
 }
 
-/* Ensure all existing pages follow the policy. */
+static void gather_stats(struct page *, void *);
+static void migrate_page_add(struct vm_area_struct *vma,
+	struct page *page, struct list_head *pagelist, unsigned long flags);
+
+/* Scan through pages checking if pages follow certain conditions. */
 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		void *private)
 {
 	pte_t *orig_pte;
 	pte_t *pte;
@@ -193,7 +209,17 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		if (!page)
 			continue;
 		nid = page_to_nid(page);
-		if (!node_isset(nid, *nodes))
+		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+			continue;
+
+		if (flags & MPOL_MF_STATS)
+			gather_stats(page, private);
+		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+			spin_unlock(ptl);
+			migrate_page_add(vma, page, private, flags);
+			spin_lock(ptl);
+		}
+		else
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(orig_pte, ptl);
@@ -201,7 +227,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 }
 
 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		void *private)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -211,14 +239,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (check_pte_range(vma, pmd, addr, next, nodes))
+		if (check_pte_range(vma, pmd, addr, next, nodes,
+				    flags, private))
 			return -EIO;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		void *private)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -228,14 +259,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		if (check_pmd_range(vma, pud, addr, next, nodes))
+		if (check_pmd_range(vma, pud, addr, next, nodes,
+				    flags, private))
 			return -EIO;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int check_pgd_range(struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		void *private)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -245,16 +279,30 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		if (check_pud_range(vma, pgd, addr, next, nodes))
+		if (check_pud_range(vma, pgd, addr, next, nodes,
+				    flags, private))
 			return -EIO;
 	} while (pgd++, addr = next, addr != end);
 	return 0;
 }
 
-/* Step 1: check the range */
+/* Check if a vma is migratable */
+static inline int vma_migratable(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & (
+		VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
+		return 0;
+	return 1;
+}
+
+/*
+ * Check if all pages in a range are on a set of nodes.
+ * If pagelist != NULL then isolate pages from the LRU and
+ * put them on the pagelist.
+ */
 static struct vm_area_struct *
 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
-		nodemask_t *nodes, unsigned long flags)
+		const nodemask_t *nodes, unsigned long flags, void *private)
 {
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
@@ -264,17 +312,24 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		return ERR_PTR(-EFAULT);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
-		if (!vma->vm_next && vma->vm_end < end)
-			return ERR_PTR(-EFAULT);
-		if (prev && prev->vm_end < vma->vm_start)
-			return ERR_PTR(-EFAULT);
-		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
+		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
+			if (!vma->vm_next && vma->vm_end < end)
+				return ERR_PTR(-EFAULT);
+			if (prev && prev->vm_end < vma->vm_start)
+				return ERR_PTR(-EFAULT);
+		}
+		if (!is_vm_hugetlb_page(vma) &&
+		    ((flags & MPOL_MF_STRICT) ||
+		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
+				vma_migratable(vma)))) {
 			unsigned long endvma = vma->vm_end;
+
 			if (endvma > end)
 				endvma = end;
 			if (vma->vm_start > start)
 				start = vma->vm_start;
-			err = check_pgd_range(vma, start, endvma, nodes);
+			err = check_pgd_range(vma, start, endvma, nodes,
+						flags, private);
 			if (err) {
 				first = ERR_PTR(err);
 				break;
@@ -333,51 +388,10 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
 	if (!nodes)
 		return 0;
 
-	/* Update current mems_allowed */
-	cpuset_update_current_mems_allowed();
-	/* Ignore nodes not set in current->mems_allowed */
-	cpuset_restrict_to_mems_allowed(nodes->bits);
-	return mpol_check_policy(mode, nodes);
-}
-
-long do_mbind(unsigned long start, unsigned long len,
-		unsigned long mode, nodemask_t *nmask, unsigned long flags)
-{
-	struct vm_area_struct *vma;
-	struct mm_struct *mm = current->mm;
-	struct mempolicy *new;
-	unsigned long end;
-	int err;
-
-	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
-		return -EINVAL;
-	if (start & ~PAGE_MASK)
-		return -EINVAL;
-	if (mode == MPOL_DEFAULT)
-		flags &= ~MPOL_MF_STRICT;
-	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
-	end = start + len;
-	if (end < start)
+	cpuset_update_task_memory_state();
+	if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 		return -EINVAL;
-	if (end == start)
-		return 0;
-	if (mpol_check_policy(mode, nmask))
-		return -EINVAL;
-	new = mpol_new(mode, nmask);
-	if (IS_ERR(new))
-		return PTR_ERR(new);
-
-	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
-		 mode,nodes_addr(nodes)[0]);
-
-	down_write(&mm->mmap_sem);
-	vma = check_range(mm, start, end, nmask, flags);
-	err = PTR_ERR(vma);
-	if (!IS_ERR(vma))
-		err = mbind_range(vma, start, end, new);
-	up_write(&mm->mmap_sem);
-	mpol_free(new);
-	return err;
-}
+	return mpol_check_policy(mode, nodes);
 }
 
 /* Set the process memory policy */
@@ -448,7 +462,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	struct vm_area_struct *vma = NULL;
 	struct mempolicy *pol = current->mempolicy;
 
-	cpuset_update_current_mems_allowed();
+	cpuset_update_task_memory_state();
 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 		return -EINVAL;
 	if (flags & MPOL_F_ADDR) {
@@ -500,11 +514,177 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
 }
 
 /*
+ * page migration
+ */
+
+/* Check if we are the only process mapping the page in question */
+static inline int single_mm_mapping(struct mm_struct *mm,
+	struct address_space *mapping)
+{
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int rc = 1;
+
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
+		if (mm != vma->vm_mm) {
+			rc = 0;
+			goto out;
+		}
+	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
+		if (mm != vma->vm_mm) {
+			rc = 0;
+			goto out;
+		}
+out:
+	spin_unlock(&mapping->i_mmap_lock);
+	return rc;
+}
+
+/*
+ * Add a page to be migrated to the pagelist
+ */
+static void migrate_page_add(struct vm_area_struct *vma,
+	struct page *page, struct list_head *pagelist, unsigned long flags)
+{
+	/*
+	 * Avoid migrating a page that is shared by others and not writable.
+	 */
+	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
+	    mapping_writably_mapped(page->mapping) ||
+	    single_mm_mapping(vma->vm_mm, page->mapping)) {
+		int rc = isolate_lru_page(page);
+
+		if (rc == 1)
+			list_add(&page->lru, pagelist);
+		/*
+		 * If the isolate attempt was not successful then we just
+		 * encountered an unswappable page. Something must be wrong.
+		 */
+		WARN_ON(rc == 0);
+	}
+}
+
+static int swap_pages(struct list_head *pagelist)
+{
+	LIST_HEAD(moved);
+	LIST_HEAD(failed);
+	int n;
+
+	n = migrate_pages(pagelist, NULL, &moved, &failed);
+	putback_lru_pages(&failed);
+	putback_lru_pages(&moved);
+
+	return n;
+}
+
+/*
+ * For now migrate_pages simply swaps out the pages from nodes that are in
+ * the source set but not in the target set. In the future, we would
+ * want a function that moves pages between the two nodesets in such
+ * a way as to preserve the physical layout as much as possible.
+ *
+ * Returns the number of page that could not be moved.
+ */
+int do_migrate_pages(struct mm_struct *mm,
+	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+	LIST_HEAD(pagelist);
+	int count = 0;
+	nodemask_t nodes;
+
+	nodes_andnot(nodes, *from_nodes, *to_nodes);
+
+	down_read(&mm->mmap_sem);
+	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
+			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+
+	if (!list_empty(&pagelist)) {
+		count = swap_pages(&pagelist);
+		putback_lru_pages(&pagelist);
+	}
+
+	up_read(&mm->mmap_sem);
+	return count;
+}
+
+long do_mbind(unsigned long start, unsigned long len,
+		unsigned long mode, nodemask_t *nmask, unsigned long flags)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm = current->mm;
+	struct mempolicy *new;
+	unsigned long end;
+	int err;
+	LIST_HEAD(pagelist);
+
+	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
+				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+	    || mode > MPOL_MAX)
+		return -EINVAL;
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
+	if (start & ~PAGE_MASK)
+		return -EINVAL;
+
+	if (mode == MPOL_DEFAULT)
+		flags &= ~MPOL_MF_STRICT;
+
+	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
+	end = start + len;
+
+	if (end < start)
+		return -EINVAL;
+	if (end == start)
+		return 0;
+
+	if (mpol_check_policy(mode, nmask))
+		return -EINVAL;
+
+	new = mpol_new(mode, nmask);
+	if (IS_ERR(new))
+		return PTR_ERR(new);
+
+	/*
+	 * If we are using the default policy then operation
+	 * on discontinuous address spaces is okay after all
+	 */
+	if (!new)
+		flags |= MPOL_MF_DISCONTIG_OK;
+
+	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
+			mode,nodes_addr(nodes)[0]);
+
+	down_write(&mm->mmap_sem);
+	vma = check_range(mm, start, end, nmask,
+			  flags | MPOL_MF_INVERT, &pagelist);
+
+	err = PTR_ERR(vma);
+	if (!IS_ERR(vma)) {
+		int nr_failed = 0;
+
+		err = mbind_range(vma, start, end, new);
+		if (!list_empty(&pagelist))
+			nr_failed = swap_pages(&pagelist);
+
+		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
+			err = -EIO;
+	}
+	if (!list_empty(&pagelist))
+		putback_lru_pages(&pagelist);
+
+	up_write(&mm->mmap_sem);
+	mpol_free(new);
+	return err;
+}
+
+/*
  * User space interface with variable sized bitmaps for nodelists.
  */
 
 /* Copy a node mask from user space. */
-static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
+static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 		     unsigned long maxnode)
 {
 	unsigned long k;
@@ -593,6 +773,65 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 	return do_set_mempolicy(mode, &nodes);
 }
 
+asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
+		const unsigned long __user *old_nodes,
+		const unsigned long __user *new_nodes)
+{
+	struct mm_struct *mm;
+	struct task_struct *task;
+	nodemask_t old;
+	nodemask_t new;
+	nodemask_t task_nodes;
+	int err;
+
+	err = get_nodes(&old, old_nodes, maxnode);
+	if (err)
+		return err;
+
+	err = get_nodes(&new, new_nodes, maxnode);
+	if (err)
+		return err;
+
+	/* Find the mm_struct */
+	read_lock(&tasklist_lock);
+	task = pid ? find_task_by_pid(pid) : current;
+	if (!task) {
+		read_unlock(&tasklist_lock);
+		return -ESRCH;
+	}
+	mm = get_task_mm(task);
+	read_unlock(&tasklist_lock);
+
+	if (!mm)
+		return -EINVAL;
+
+	/*
+	 * Check if this process has the right to modify the specified
+	 * process. The right exists if the process has administrative
+	 * capabilities, superuser priviledges or the same
+	 * userid as the target process.
+	 */
+	if ((current->euid != task->suid) && (current->euid != task->uid) &&
+	    (current->uid != task->suid) && (current->uid != task->uid) &&
+	    !capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	task_nodes = cpuset_mems_allowed(task);
+	/* Is the user allowed to access the target nodes? */
+	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
+out:
+	mmput(mm);
+	return err;
+}
+
+
 /* Retrieve NUMA policy */
 asmlinkage long sys_get_mempolicy(int __user *policy,
 		unsigned long __user *nmask,
@@ -699,8 +938,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 #endif
 
 /* Return effective policy for a VMA */
-struct mempolicy *
-get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
+static struct mempolicy * get_vma_policy(struct task_struct *task,
+		struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = task->mempolicy;
 
@@ -848,7 +1087,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 
-	cpuset_update_current_mems_allowed();
+	cpuset_update_task_memory_state();
 
 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
 		unsigned nid;
@@ -874,7 +1113,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
  * interrupt context and apply the current process NUMA policy.
  * Returns NULL when no page can be allocated.
  *
- * Don't call cpuset_update_current_mems_allowed() unless
+ * Don't call cpuset_update_task_memory_state() unless
  * 1) it's ok to take cpuset_sem (can WAIT), and
  * 2) allocating for current task (not interrupt).
  */
@@ -883,7 +1122,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 	struct mempolicy *pol = current->mempolicy;
 
 	if ((gfp & __GFP_WAIT) && !in_interrupt())
-		cpuset_update_current_mems_allowed();
+		cpuset_update_task_memory_state();
 	if (!pol || in_interrupt())
 		pol = &default_policy;
 	if (pol->policy == MPOL_INTERLEAVE)
@@ -892,6 +1131,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 }
 EXPORT_SYMBOL(alloc_pages_current);
 
+/*
+ * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
+ * rebinds the mempolicy its copying by calling mpol_rebind_policy()
+ * with the mems_allowed returned by cpuset_mems_allowed().  This
+ * keeps mempolicies cpuset relative after its cpuset moves.  See
+ * further kernel/cpuset.c update_nodemask().
+ */
+void *cpuset_being_rebound;
+
 /* Slow path of a mempolicy copy */
 struct mempolicy *__mpol_copy(struct mempolicy *old)
 {
@@ -899,6 +1147,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
 
 	if (!new)
 		return ERR_PTR(-ENOMEM);
+	if (current_cpuset_is_being_rebound()) {
+		nodemask_t mems = cpuset_mems_allowed(current);
+		mpol_rebind_policy(old, &mems);
+	}
 	*new = *old;
 	atomic_set(&new->refcnt, 1);
 	if (new->policy == MPOL_BIND) {
@@ -1173,25 +1425,31 @@ void numa_default_policy(void)
 }
 
 /* Migrate a policy to a different set of nodes */
-static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
-			  const nodemask_t *new)
+void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 {
+	nodemask_t *mpolmask;
 	nodemask_t tmp;
 
 	if (!pol)
 		return;
+	mpolmask = &pol->cpuset_mems_allowed;
+	if (nodes_equal(*mpolmask, *newmask))
+		return;
 
 	switch (pol->policy) {
 	case MPOL_DEFAULT:
 		break;
 	case MPOL_INTERLEAVE:
-		nodes_remap(tmp, pol->v.nodes, *old, *new);
+		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
 		pol->v.nodes = tmp;
-		current->il_next = node_remap(current->il_next, *old, *new);
+		*mpolmask = *newmask;
+		current->il_next = node_remap(current->il_next,
+					*mpolmask, *newmask);
 		break;
 	case MPOL_PREFERRED:
 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
-						*old, *new);
+						*mpolmask, *newmask);
+		*mpolmask = *newmask;
 		break;
 	case MPOL_BIND: {
 		nodemask_t nodes;
@@ -1201,7 +1459,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
 		nodes_clear(nodes);
 		for (z = pol->v.zonelist->zones; *z; z++)
 			node_set((*z)->zone_pgdat->node_id, nodes);
-		nodes_remap(tmp, nodes, *old, *new);
+		nodes_remap(tmp, nodes, *mpolmask, *newmask);
 		nodes = tmp;
 
 		zonelist = bind_zonelist(&nodes);
@@ -1216,6 +1474,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
 			kfree(pol->v.zonelist);
 			pol->v.zonelist = zonelist;
 		}
+		*mpolmask = *newmask;
 		break;
 	}
 	default:
@@ -1225,12 +1484,156 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
 }
 
 /*
- * Someone moved this task to different nodes. Fixup mempolicies.
+ * Wrapper for mpol_rebind_policy() that just requires task
+ * pointer, and updates task mempolicy.
+ */
+
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
+{
+	mpol_rebind_policy(tsk->mempolicy, new);
+}
+
+/*
+ * Rebind each vma in mm to new nodemask.
  *
- * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
- * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
+ * Call holding a reference to mm.  Takes mm->mmap_sem during call.
  */
-void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
+
+void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 {
-	rebind_policy(current->mempolicy, old, new);
+	struct vm_area_struct *vma;
+
+	down_write(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next)
+		mpol_rebind_policy(vma->vm_policy, new);
+	up_write(&mm->mmap_sem);
 }
+
+/*
+ * Display pages allocated per node and memory policy via /proc.
+ */
+
+static const char *policy_types[] = { "default", "prefer", "bind",
+				      "interleave" };
+
+/*
+ * Convert a mempolicy into a string.
+ * Returns the number of characters in buffer (if positive)
+ * or an error (negative)
+ */
+static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+{
+	char *p = buffer;
+	int l;
+	nodemask_t nodes;
+	int mode = pol ? pol->policy : MPOL_DEFAULT;
+
+	switch (mode) {
+	case MPOL_DEFAULT:
+		nodes_clear(nodes);
+		break;
+
+	case MPOL_PREFERRED:
+		nodes_clear(nodes);
+		node_set(pol->v.preferred_node, nodes);
+		break;
+
+	case MPOL_BIND:
+		get_zonemask(pol, &nodes);
+		break;
+
+	case MPOL_INTERLEAVE:
+		nodes = pol->v.nodes;
+		break;
+
+	default:
+		BUG();
+		return -EFAULT;
+	}
+
+	l = strlen(policy_types[mode]);
+	if (buffer + maxlen < p + l + 1)
+		return -ENOSPC;
+
+	strcpy(p, policy_types[mode]);
+	p += l;
+
+	if (!nodes_empty(nodes)) {
+		if (buffer + maxlen < p + 2)
+			return -ENOSPC;
+		*p++ = '=';
+		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
+	}
+	return p - buffer;
+}
+
+struct numa_maps {
+	unsigned long pages;
+	unsigned long anon;
+	unsigned long mapped;
+	unsigned long mapcount_max;
+	unsigned long node[MAX_NUMNODES];
+};
+
+static void gather_stats(struct page *page, void *private)
+{
+	struct numa_maps *md = private;
+	int count = page_mapcount(page);
+
+	if (count)
+		md->mapped++;
+
+	if (count > md->mapcount_max)
+		md->mapcount_max = count;
+
+	md->pages++;
+
+	if (PageAnon(page))
+		md->anon++;
+
+	md->node[page_to_nid(page)]++;
+	cond_resched();
+}
+
+int show_numa_map(struct seq_file *m, void *v)
+{
+	struct task_struct *task = m->private;
+	struct vm_area_struct *vma = v;
+	struct numa_maps *md;
+	int n;
+	char buffer[50];
+
+	if (!vma->vm_mm)
+		return 0;
+
+	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
+	if (!md)
+		return 0;
+
+	check_pgd_range(vma, vma->vm_start, vma->vm_end,
+		    &node_online_map, MPOL_MF_STATS, md);
+
+	if (md->pages) {
+		mpol_to_str(buffer, sizeof(buffer),
+			    get_vma_policy(task, vma, vma->vm_start));
+
+		seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
+			vma->vm_start, buffer, md->pages,
+			md->mapped, md->mapcount_max);
+
+		if (md->anon)
+			seq_printf(m," anon=%lu",md->anon);
+
+		for_each_online_node(n)
+			if (md->node[n])
+				seq_printf(m, " N%d=%lu", n, md->node[n]);
+
+		seq_putc(m, '\n');
+	}
+	kfree(md);
+
+	if (m->count < m->size)
+		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
+	return 0;
+}
+
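
The hunks above add two user-visible entry points: mbind() accepts the new MPOL_MF_MOVE / MPOL_MF_MOVE_ALL flags, and sys_migrate_pages() moves a task's pages from one node set to another. The sketch below is not part of the patch; it is a hypothetical userspace illustration of those interfaces, assuming a kernel carrying this change, a libnuma-style <numaif.h> (link with -lnuma), an architecture where __NR_migrate_pages is wired up, and nodes 0 and 1 being online.

/*
 * Hypothetical example, not from the patch: exercise mbind(MPOL_MF_MOVE)
 * and the new migrate_pages syscall from userspace.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <numaif.h>	/* mbind(), MPOL_BIND, MPOL_MF_MOVE, MPOL_MF_STRICT */

int main(void)
{
	/* Nodemasks are plain bitmaps of node numbers, as in get_nodes(). */
	unsigned long to_node1 = 1UL << 1;	/* target node set: {1} */
	unsigned long from_node0 = 1UL << 0;	/* source node set: {0} */
	size_t len = 64 * 4096;
	void *buf;

	if (posix_memalign(&buf, 4096, len))
		return 1;
	memset(buf, 0, len);	/* fault the pages in on some node */

	/*
	 * MPOL_MF_MOVE asks the kernel to migrate already-allocated pages
	 * that violate the new policy; with MPOL_MF_STRICT, pages that
	 * could not be moved make the call fail.
	 */
	if (mbind(buf, len, MPOL_BIND, &to_node1, 8 * sizeof(to_node1),
		  MPOL_MF_MOVE | MPOL_MF_STRICT))
		perror("mbind");

	/*
	 * sys_migrate_pages(pid, maxnode, old_nodes, new_nodes): pid 0 means
	 * the calling task; move its pages off node 0 onto node 1.
	 * __NR_migrate_pages is assumed to be defined for this architecture.
	 */
	if (syscall(__NR_migrate_pages, 0, 8 * sizeof(unsigned long),
		    &from_node0, &to_node1) < 0)
		perror("migrate_pages");

	return 0;
}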