path: root/mm/mempolicy.c
Diffstat (limited to 'mm/mempolicy.c')
 mm/mempolicy.c | 708
 1 file changed, 552 insertions(+), 156 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5abc57c2b8bd..73790188b0eb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -83,9 +83,18 @@
 #include <linux/init.h>
 #include <linux/compat.h>
 #include <linux/mempolicy.h>
+#include <linux/swap.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
 
+/* Internal flags */
+#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
+#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
+#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
+
 static kmem_cache_t *policy_cache;
 static kmem_cache_t *sn_cache;
 
@@ -93,7 +102,7 @@ static kmem_cache_t *sn_cache;
 
 /* Highest zone. An specific allocation for a zone below that is not
    policied. */
-static int policy_zone;
+int policy_zone = ZONE_DMA;
 
 struct mempolicy default_policy = {
 	.refcnt = ATOMIC_INIT(1), /* never free it */
@@ -131,17 +140,8 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 	if (!zl)
 		return NULL;
 	num = 0;
-	for_each_node_mask(nd, *nodes) {
-		int k;
-		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
-			struct zone *z = &NODE_DATA(nd)->node_zones[k];
-			if (!z->present_pages)
-				continue;
-			zl->zones[num++] = z;
-			if (k > policy_zone)
-				policy_zone = k;
-		}
-	}
+	for_each_node_mask(nd, *nodes)
+		zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
 	zl->zones[num] = NULL;
 	return zl;
 }
@@ -161,6 +161,10 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 	switch (mode) {
 	case MPOL_INTERLEAVE:
 		policy->v.nodes = *nodes;
+		if (nodes_weight(*nodes) == 0) {
+			kmem_cache_free(policy_cache, policy);
+			return ERR_PTR(-EINVAL);
+		}
 		break;
 	case MPOL_PREFERRED:
 		policy->v.preferred_node = first_node(*nodes);
@@ -176,12 +180,19 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 		break;
 	}
 	policy->policy = mode;
+	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 	return policy;
 }
 
-/* Ensure all existing pages follow the policy. */
+static void gather_stats(struct page *, void *);
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+				unsigned long flags);
+
+/* Scan through pages checking if pages follow certain conditions. */
 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		void *private)
 {
 	pte_t *orig_pte;
 	pte_t *pte;
@@ -189,18 +200,36 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
-		unsigned long pfn;
+		struct page *page;
 		unsigned int nid;
 
 		if (!pte_present(*pte))
 			continue;
-		pfn = pte_pfn(*pte);
-		if (!pfn_valid(pfn)) {
-			print_bad_pte(vma, *pte, addr);
+		page = vm_normal_page(vma, addr, *pte);
+		if (!page)
 			continue;
-		}
-		nid = pfn_to_nid(pfn);
-		if (!node_isset(nid, *nodes))
+		/*
+		 * The check for PageReserved here is important to avoid
+		 * handling zero pages and other pages that may have been
+		 * marked special by the system.
+		 *
+		 * If the PageReserved would not be checked here then f.e.
+		 * the location of the zero page could have an influence
+		 * on MPOL_MF_STRICT, zero pages would be counted for
+		 * the per node stats, and there would be useless attempts
+		 * to put zero pages on the migration list.
+		 */
+		if (PageReserved(page))
+			continue;
+		nid = page_to_nid(page);
+		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+			continue;
+
+		if (flags & MPOL_MF_STATS)
+			gather_stats(page, private);
+		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+			migrate_page_add(page, private, flags);
+		else
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(orig_pte, ptl);
@@ -208,7 +237,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 }
 
 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		void *private)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -218,14 +249,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (check_pte_range(vma, pmd, addr, next, nodes))
+		if (check_pte_range(vma, pmd, addr, next, nodes,
+				    flags, private))
 			return -EIO;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		void *private)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -235,14 +269,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		if (check_pmd_range(vma, pud, addr, next, nodes))
+		if (check_pmd_range(vma, pud, addr, next, nodes,
+				    flags, private))
 			return -EIO;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int check_pgd_range(struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		void *private)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -252,38 +289,61 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		if (check_pud_range(vma, pgd, addr, next, nodes))
+		if (check_pud_range(vma, pgd, addr, next, nodes,
+				    flags, private))
 			return -EIO;
 	} while (pgd++, addr = next, addr != end);
 	return 0;
 }
 
-/* Step 1: check the range */
+/* Check if a vma is migratable */
+static inline int vma_migratable(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & (
+		VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
+		return 0;
+	return 1;
+}
+
+/*
+ * Check if all pages in a range are on a set of nodes.
+ * If pagelist != NULL then isolate pages from the LRU and
+ * put them on the pagelist.
+ */
 static struct vm_area_struct *
 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
-		nodemask_t *nodes, unsigned long flags)
+		const nodemask_t *nodes, unsigned long flags, void *private)
 {
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
 
+	/* Clear the LRU lists so pages can be isolated */
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+		lru_add_drain_all();
+
 	first = find_vma(mm, start);
 	if (!first)
 		return ERR_PTR(-EFAULT);
-	if (first->vm_flags & VM_RESERVED)
-		return ERR_PTR(-EACCES);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
-		if (!vma->vm_next && vma->vm_end < end)
-			return ERR_PTR(-EFAULT);
-		if (prev && prev->vm_end < vma->vm_start)
-			return ERR_PTR(-EFAULT);
-		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
+		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
+			if (!vma->vm_next && vma->vm_end < end)
+				return ERR_PTR(-EFAULT);
+			if (prev && prev->vm_end < vma->vm_start)
+				return ERR_PTR(-EFAULT);
+		}
+		if (!is_vm_hugetlb_page(vma) &&
+		    ((flags & MPOL_MF_STRICT) ||
+		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
+			vma_migratable(vma)))) {
 			unsigned long endvma = vma->vm_end;
+
 			if (endvma > end)
 				endvma = end;
 			if (vma->vm_start > start)
 				start = vma->vm_start;
-			err = check_pgd_range(vma, start, endvma, nodes);
+			err = check_pgd_range(vma, start, endvma, nodes,
+						flags, private);
 			if (err) {
 				first = ERR_PTR(err);
 				break;
@@ -342,51 +402,10 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
 	if (!nodes)
 		return 0;
 
-	/* Update current mems_allowed */
-	cpuset_update_current_mems_allowed();
-	/* Ignore nodes not set in current->mems_allowed */
-	cpuset_restrict_to_mems_allowed(nodes->bits);
-	return mpol_check_policy(mode, nodes);
-}
-
-long do_mbind(unsigned long start, unsigned long len,
-		unsigned long mode, nodemask_t *nmask, unsigned long flags)
-{
-	struct vm_area_struct *vma;
-	struct mm_struct *mm = current->mm;
-	struct mempolicy *new;
-	unsigned long end;
-	int err;
-
-	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
-		return -EINVAL;
-	if (start & ~PAGE_MASK)
-		return -EINVAL;
-	if (mode == MPOL_DEFAULT)
-		flags &= ~MPOL_MF_STRICT;
-	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
-	end = start + len;
-	if (end < start)
-		return -EINVAL;
-	if (end == start)
-		return 0;
-	if (mpol_check_policy(mode, nmask))
+	cpuset_update_task_memory_state();
+	if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 		return -EINVAL;
-	new = mpol_new(mode, nmask);
-	if (IS_ERR(new))
-		return PTR_ERR(new);
-
-	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
-		mode,nodes_addr(nodes)[0]);
-
-	down_write(&mm->mmap_sem);
-	vma = check_range(mm, start, end, nmask, flags);
-	err = PTR_ERR(vma);
-	if (!IS_ERR(vma))
-		err = mbind_range(vma, start, end, new);
-	up_write(&mm->mmap_sem);
-	mpol_free(new);
-	return err;
+	return mpol_check_policy(mode, nodes);
 }
 
 /* Set the process memory policy */
@@ -457,7 +476,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	struct vm_area_struct *vma = NULL;
 	struct mempolicy *pol = current->mempolicy;
 
-	cpuset_update_current_mems_allowed();
+	cpuset_update_task_memory_state();
 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 		return -EINVAL;
 	if (flags & MPOL_F_ADDR) {
@@ -509,11 +528,141 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
 }
 
 /*
+ * page migration
+ */
+
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+				unsigned long flags)
+{
+	/*
+	 * Avoid migrating a page that is shared with others.
+	 */
+	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
+		if (isolate_lru_page(page))
+			list_add(&page->lru, pagelist);
+	}
+}
+
+static int swap_pages(struct list_head *pagelist)
+{
+	LIST_HEAD(moved);
+	LIST_HEAD(failed);
+	int n;
+
+	n = migrate_pages(pagelist, NULL, &moved, &failed);
+	putback_lru_pages(&failed);
+	putback_lru_pages(&moved);
+
+	return n;
+}
+
+/*
+ * For now migrate_pages simply swaps out the pages from nodes that are in
+ * the source set but not in the target set. In the future, we would
+ * want a function that moves pages between the two nodesets in such
+ * a way as to preserve the physical layout as much as possible.
+ *
+ * Returns the number of page that could not be moved.
+ */
+int do_migrate_pages(struct mm_struct *mm,
+	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+	LIST_HEAD(pagelist);
+	int count = 0;
+	nodemask_t nodes;
+
+	nodes_andnot(nodes, *from_nodes, *to_nodes);
+
+	down_read(&mm->mmap_sem);
+	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
+			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+
+	if (!list_empty(&pagelist)) {
+		count = swap_pages(&pagelist);
+		putback_lru_pages(&pagelist);
+	}
+
+	up_read(&mm->mmap_sem);
+	return count;
+}
+
+long do_mbind(unsigned long start, unsigned long len,
+		unsigned long mode, nodemask_t *nmask, unsigned long flags)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm = current->mm;
+	struct mempolicy *new;
+	unsigned long end;
+	int err;
+	LIST_HEAD(pagelist);
+
+	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
+				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+	    || mode > MPOL_MAX)
+		return -EINVAL;
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
+	if (start & ~PAGE_MASK)
+		return -EINVAL;
+
+	if (mode == MPOL_DEFAULT)
+		flags &= ~MPOL_MF_STRICT;
+
+	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
+	end = start + len;
+
+	if (end < start)
+		return -EINVAL;
+	if (end == start)
+		return 0;
+
+	if (mpol_check_policy(mode, nmask))
+		return -EINVAL;
+
+	new = mpol_new(mode, nmask);
+	if (IS_ERR(new))
+		return PTR_ERR(new);
+
+	/*
+	 * If we are using the default policy then operation
+	 * on discontinuous address spaces is okay after all
+	 */
+	if (!new)
+		flags |= MPOL_MF_DISCONTIG_OK;
+
+	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
+			mode,nodes_addr(nodes)[0]);
+
+	down_write(&mm->mmap_sem);
+	vma = check_range(mm, start, end, nmask,
+			  flags | MPOL_MF_INVERT, &pagelist);
+
+	err = PTR_ERR(vma);
+	if (!IS_ERR(vma)) {
+		int nr_failed = 0;
+
+		err = mbind_range(vma, start, end, new);
+		if (!list_empty(&pagelist))
+			nr_failed = swap_pages(&pagelist);
+
+		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
+			err = -EIO;
+	}
+	if (!list_empty(&pagelist))
+		putback_lru_pages(&pagelist);
+
+	up_write(&mm->mmap_sem);
+	mpol_free(new);
+	return err;
+}
+
+/*
  * User space interface with variable sized bitmaps for nodelists.
  */
 
 /* Copy a node mask from user space. */
-static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
+static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 		     unsigned long maxnode)
 {
 	unsigned long k;
@@ -602,6 +751,65 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 	return do_set_mempolicy(mode, &nodes);
 }
 
+asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
+		const unsigned long __user *old_nodes,
+		const unsigned long __user *new_nodes)
+{
+	struct mm_struct *mm;
+	struct task_struct *task;
+	nodemask_t old;
+	nodemask_t new;
+	nodemask_t task_nodes;
+	int err;
+
+	err = get_nodes(&old, old_nodes, maxnode);
+	if (err)
+		return err;
+
+	err = get_nodes(&new, new_nodes, maxnode);
+	if (err)
+		return err;
+
+	/* Find the mm_struct */
+	read_lock(&tasklist_lock);
+	task = pid ? find_task_by_pid(pid) : current;
+	if (!task) {
+		read_unlock(&tasklist_lock);
+		return -ESRCH;
+	}
+	mm = get_task_mm(task);
+	read_unlock(&tasklist_lock);
+
+	if (!mm)
+		return -EINVAL;
+
+	/*
+	 * Check if this process has the right to modify the specified
+	 * process. The right exists if the process has administrative
+	 * capabilities, superuser priviledges or the same
+	 * userid as the target process.
+	 */
+	if ((current->euid != task->suid) && (current->euid != task->uid) &&
+	    (current->uid != task->suid) && (current->uid != task->uid) &&
+	    !capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	task_nodes = cpuset_mems_allowed(task);
+	/* Is the user allowed to access the target nodes? */
+	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
+out:
+	mmput(mm);
+	return err;
+}
+
+
 /* Retrieve NUMA policy */
 asmlinkage long sys_get_mempolicy(int __user *policy,
 				  unsigned long __user *nmask,
@@ -708,8 +916,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 #endif
 
 /* Return effective policy for a VMA */
-struct mempolicy *
-get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
+static struct mempolicy * get_vma_policy(struct task_struct *task,
+		struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = task->mempolicy;
 
@@ -768,6 +976,33 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 	return nid;
 }
 
+/*
+ * Depending on the memory policy provide a node from which to allocate the
+ * next slab entry.
+ */
+unsigned slab_node(struct mempolicy *policy)
+{
+	switch (policy->policy) {
+	case MPOL_INTERLEAVE:
+		return interleave_nodes(policy);
+
+	case MPOL_BIND:
+		/*
+		 * Follow bind policy behavior and start allocation at the
+		 * first node.
+		 */
+		return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
+
+	case MPOL_PREFERRED:
+		if (policy->v.preferred_node >= 0)
+			return policy->v.preferred_node;
+		/* Fall through */
+
+	default:
+		return numa_node_id();
+	}
+}
+
 /* Do static interleaving for a VMA with known offset. */
 static unsigned offset_il_node(struct mempolicy *pol,
 		struct vm_area_struct *vma, unsigned long off)
@@ -785,6 +1020,34 @@ static unsigned offset_il_node(struct mempolicy *pol,
 	return nid;
 }
 
+/* Determine a node number for interleave */
+static inline unsigned interleave_nid(struct mempolicy *pol,
+		 struct vm_area_struct *vma, unsigned long addr, int shift)
+{
+	if (vma) {
+		unsigned long off;
+
+		off = vma->vm_pgoff;
+		off += (addr - vma->vm_start) >> shift;
+		return offset_il_node(pol, vma, off);
+	} else
+		return interleave_nodes(pol);
+}
+
+/* Return a zonelist suitable for a huge page allocation. */
+struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
+{
+	struct mempolicy *pol = get_vma_policy(current, vma, addr);
+
+	if (pol->policy == MPOL_INTERLEAVE) {
+		unsigned nid;
+
+		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
+		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
+	}
+	return zonelist_policy(GFP_HIGHUSER, pol);
+}
+
 /* Allocate a page in interleaved policy.
    Own path because it needs to do special accounting. */
 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
@@ -829,19 +1092,12 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 
-	cpuset_update_current_mems_allowed();
+	cpuset_update_task_memory_state();
 
 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
 		unsigned nid;
-		if (vma) {
-			unsigned long off;
-			off = vma->vm_pgoff;
-			off += (addr - vma->vm_start) >> PAGE_SHIFT;
-			nid = offset_il_node(pol, vma, off);
-		} else {
-			/* fall back to process interleaving */
-			nid = interleave_nodes(pol);
-		}
+
+		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
 		return alloc_page_interleave(gfp, 0, nid);
 	}
 	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
@@ -862,7 +1118,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
  * interrupt context and apply the current process NUMA policy.
  * Returns NULL when no page can be allocated.
  *
- * Don't call cpuset_update_current_mems_allowed() unless
+ * Don't call cpuset_update_task_memory_state() unless
  * 1) it's ok to take cpuset_sem (can WAIT), and
  * 2) allocating for current task (not interrupt).
  */
@@ -871,7 +1127,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 	struct mempolicy *pol = current->mempolicy;
 
 	if ((gfp & __GFP_WAIT) && !in_interrupt())
-		cpuset_update_current_mems_allowed();
+		cpuset_update_task_memory_state();
 	if (!pol || in_interrupt())
 		pol = &default_policy;
 	if (pol->policy == MPOL_INTERLEAVE)
@@ -880,6 +1136,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 }
 EXPORT_SYMBOL(alloc_pages_current);
 
+/*
+ * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
+ * rebinds the mempolicy its copying by calling mpol_rebind_policy()
+ * with the mems_allowed returned by cpuset_mems_allowed().  This
+ * keeps mempolicies cpuset relative after its cpuset moves.  See
+ * further kernel/cpuset.c update_nodemask().
+ */
+void *cpuset_being_rebound;
+
 /* Slow path of a mempolicy copy */
 struct mempolicy *__mpol_copy(struct mempolicy *old)
 {
@@ -887,6 +1152,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
 
 	if (!new)
 		return ERR_PTR(-ENOMEM);
+	if (current_cpuset_is_being_rebound()) {
+		nodemask_t mems = cpuset_mems_allowed(current);
+		mpol_rebind_policy(old, &mems);
+	}
 	*new = *old;
 	atomic_set(&new->refcnt, 1);
 	if (new->policy == MPOL_BIND) {
@@ -940,54 +1209,6 @@ void __mpol_free(struct mempolicy *p)
 }
 
 /*
- * Hugetlb policy. Same as above, just works with node numbers instead of
- * zonelists.
- */
-
-/* Find first node suitable for an allocation */
-int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
-{
-	struct mempolicy *pol = get_vma_policy(current, vma, addr);
-
-	switch (pol->policy) {
-	case MPOL_DEFAULT:
-		return numa_node_id();
-	case MPOL_BIND:
-		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
-	case MPOL_INTERLEAVE:
-		return interleave_nodes(pol);
-	case MPOL_PREFERRED:
-		return pol->v.preferred_node >= 0 ?
-				pol->v.preferred_node : numa_node_id();
-	}
-	BUG();
-	return 0;
-}
-
-/* Find secondary valid nodes for an allocation */
-int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
-{
-	struct mempolicy *pol = get_vma_policy(current, vma, addr);
-
-	switch (pol->policy) {
-	case MPOL_PREFERRED:
-	case MPOL_DEFAULT:
-	case MPOL_INTERLEAVE:
-		return 1;
-	case MPOL_BIND: {
-		struct zone **z;
-		for (z = pol->v.zonelist->zones; *z; z++)
-			if ((*z)->zone_pgdat->node_id == nid)
-				return 1;
-		return 0;
-	}
-	default:
-		BUG();
-		return 0;
-	}
-}
-
-/*
  * Shared memory backing store policy support.
  *
  * Remember policies even when nobody has shared memory mapped.
@@ -1141,6 +1362,30 @@ restart:
 	return 0;
 }
 
+void mpol_shared_policy_init(struct shared_policy *info, int policy,
+				nodemask_t *policy_nodes)
+{
+	info->root = RB_ROOT;
+	spin_lock_init(&info->lock);
+
+	if (policy != MPOL_DEFAULT) {
+		struct mempolicy *newpol;
+
+		/* Falls back to MPOL_DEFAULT on any error */
+		newpol = mpol_new(policy, policy_nodes);
+		if (!IS_ERR(newpol)) {
+			/* Create pseudo-vma that contains just the policy */
+			struct vm_area_struct pvma;
+
+			memset(&pvma, 0, sizeof(struct vm_area_struct));
+			/* Policy covers entire file */
+			pvma.vm_end = TASK_SIZE;
+			mpol_set_shared_policy(info, &pvma, newpol);
+			mpol_free(newpol);
+		}
+	}
+}
+
 int mpol_set_shared_policy(struct shared_policy *info,
 			struct vm_area_struct *vma, struct mempolicy *npol)
 {
@@ -1209,25 +1454,31 @@ void numa_default_policy(void)
 }
 
 /* Migrate a policy to a different set of nodes */
-static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
-			  const nodemask_t *new)
+void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 {
+	nodemask_t *mpolmask;
 	nodemask_t tmp;
 
 	if (!pol)
 		return;
+	mpolmask = &pol->cpuset_mems_allowed;
+	if (nodes_equal(*mpolmask, *newmask))
+		return;
 
 	switch (pol->policy) {
 	case MPOL_DEFAULT:
 		break;
 	case MPOL_INTERLEAVE:
-		nodes_remap(tmp, pol->v.nodes, *old, *new);
+		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
 		pol->v.nodes = tmp;
-		current->il_next = node_remap(current->il_next, *old, *new);
+		*mpolmask = *newmask;
+		current->il_next = node_remap(current->il_next,
+						*mpolmask, *newmask);
 		break;
 	case MPOL_PREFERRED:
 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
-						*old, *new);
+						*mpolmask, *newmask);
+		*mpolmask = *newmask;
 		break;
 	case MPOL_BIND: {
 		nodemask_t nodes;
@@ -1237,7 +1488,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
 		nodes_clear(nodes);
 		for (z = pol->v.zonelist->zones; *z; z++)
 			node_set((*z)->zone_pgdat->node_id, nodes);
-		nodes_remap(tmp, nodes, *old, *new);
+		nodes_remap(tmp, nodes, *mpolmask, *newmask);
 		nodes = tmp;
 
 		zonelist = bind_zonelist(&nodes);
@@ -1252,6 +1503,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
 			kfree(pol->v.zonelist);
 			pol->v.zonelist = zonelist;
 		}
+		*mpolmask = *newmask;
 		break;
 	}
 	default:
@@ -1261,12 +1513,156 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
 }
 
 /*
- * Someone moved this task to different nodes.  Fixup mempolicies.
+ * Wrapper for mpol_rebind_policy() that just requires task
+ * pointer, and updates task mempolicy.
+ */
+
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
+{
+	mpol_rebind_policy(tsk->mempolicy, new);
+}
+
+/*
+ * Rebind each vma in mm to new nodemask.
  *
- * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
- * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
+ * Call holding a reference to mm.  Takes mm->mmap_sem during call.
  */
-void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
+
+void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 {
-	rebind_policy(current->mempolicy, old, new);
+	struct vm_area_struct *vma;
+
+	down_write(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next)
+		mpol_rebind_policy(vma->vm_policy, new);
+	up_write(&mm->mmap_sem);
 }
+
+/*
+ * Display pages allocated per node and memory policy via /proc.
+ */
+
+static const char *policy_types[] = { "default", "prefer", "bind",
+				      "interleave" };
+
+/*
+ * Convert a mempolicy into a string.
+ * Returns the number of characters in buffer (if positive)
+ * or an error (negative)
+ */
+static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+{
+	char *p = buffer;
+	int l;
+	nodemask_t nodes;
+	int mode = pol ? pol->policy : MPOL_DEFAULT;
+
+	switch (mode) {
+	case MPOL_DEFAULT:
+		nodes_clear(nodes);
+		break;
+
+	case MPOL_PREFERRED:
+		nodes_clear(nodes);
+		node_set(pol->v.preferred_node, nodes);
+		break;
+
+	case MPOL_BIND:
+		get_zonemask(pol, &nodes);
+		break;
+
+	case MPOL_INTERLEAVE:
+		nodes = pol->v.nodes;
+		break;
+
+	default:
+		BUG();
+		return -EFAULT;
+	}
+
+	l = strlen(policy_types[mode]);
+	if (buffer + maxlen < p + l + 1)
+		return -ENOSPC;
+
+	strcpy(p, policy_types[mode]);
+	p += l;
+
+	if (!nodes_empty(nodes)) {
+		if (buffer + maxlen < p + 2)
+			return -ENOSPC;
+		*p++ = '=';
+		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
+	}
+	return p - buffer;
+}
+
+struct numa_maps {
+	unsigned long pages;
+	unsigned long anon;
+	unsigned long mapped;
+	unsigned long mapcount_max;
+	unsigned long node[MAX_NUMNODES];
+};
+
+static void gather_stats(struct page *page, void *private)
+{
+	struct numa_maps *md = private;
+	int count = page_mapcount(page);
+
+	if (count)
+		md->mapped++;
+
+	if (count > md->mapcount_max)
+		md->mapcount_max = count;
+
+	md->pages++;
+
+	if (PageAnon(page))
+		md->anon++;
+
+	md->node[page_to_nid(page)]++;
+	cond_resched();
+}
+
+int show_numa_map(struct seq_file *m, void *v)
+{
+	struct task_struct *task = m->private;
+	struct vm_area_struct *vma = v;
+	struct numa_maps *md;
+	int n;
+	char buffer[50];
+
+	if (!vma->vm_mm)
+		return 0;
+
+	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
+	if (!md)
+		return 0;
+
+	check_pgd_range(vma, vma->vm_start, vma->vm_end,
+		    &node_online_map, MPOL_MF_STATS, md);
+
+	if (md->pages) {
+		mpol_to_str(buffer, sizeof(buffer),
+			    get_vma_policy(task, vma, vma->vm_start));
+
+		seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
+			vma->vm_start, buffer, md->pages,
+			md->mapped, md->mapcount_max);
+
+		if (md->anon)
+			seq_printf(m," anon=%lu",md->anon);
+
+		for_each_online_node(n)
+			if (md->node[n])
+				seq_printf(m, " N%d=%lu", n, md->node[n]);
+
+		seq_putc(m, '\n');
+	}
+	kfree(md);
+
+	if (m->count < m->size)
+		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
+	return 0;
+}
+
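
Usage note (not part of the diff above): the hunks above add a userspace-visible interface, namely the MPOL_MF_MOVE/MPOL_MF_MOVE_ALL flags accepted by do_mbind() and the new sys_migrate_pages() syscall backed by do_migrate_pages(). The sketch below is an illustrative guess at how that interface would be exercised from userspace; it assumes the libnuma wrappers declared in <numaif.h> (mbind(), migrate_pages(); link with -lnuma) and a kernel carrying these changes. The node numbers 0 and 1 and the 16-page test mapping are placeholders, not values taken from the commit.

/*
 * Hedged userspace sketch: bind an anonymous mapping to node 0 and ask the
 * kernel to move already-resident pages (MPOL_MF_MOVE), then migrate this
 * process's node-0 pages to node 1 via migrate_pages().  Assumes libnuma's
 * <numaif.h> wrappers; node numbers are placeholders.
 */
#include <numaif.h>	/* mbind(), migrate_pages(), MPOL_* constants */
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	unsigned long node0 = 1UL << 0;	/* nodemask with only node 0 set */
	unsigned long node1 = 1UL << 1;	/* nodemask with only node 1 set */
	size_t len = 16 * (size_t)sysconf(_SC_PAGESIZE);
	void *buf;

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 0, len);	/* fault the pages in wherever they land */

	/*
	 * Bind the range to node 0 and move misplaced pages (MPOL_MF_MOVE).
	 * With MPOL_MF_STRICT, pages that could not be moved make the call
	 * fail with EIO, mirroring the nr_failed handling in do_mbind().
	 */
	if (mbind(buf, len, MPOL_BIND, &node0, 8 * sizeof(node0),
		  MPOL_MF_MOVE | MPOL_MF_STRICT) != 0)
		perror("mbind(MPOL_MF_MOVE)");

	/*
	 * Whole-process migration: move this process's pages that sit on
	 * node 0 over to node 1, as sys_migrate_pages() -> do_migrate_pages()
	 * does above.  pid 0 means the calling process.
	 */
	if (migrate_pages(0, 8 * sizeof(unsigned long), &node0, &node1) < 0)
		perror("migrate_pages");

	munmap(buf, len);
	return 0;
}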