Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r-- | mm/mempolicy.c | 708 |
1 files changed, 552 insertions, 156 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5abc57c2b8bd..73790188b0eb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -83,9 +83,18 @@ | |||
83 | #include <linux/init.h> | 83 | #include <linux/init.h> |
84 | #include <linux/compat.h> | 84 | #include <linux/compat.h> |
85 | #include <linux/mempolicy.h> | 85 | #include <linux/mempolicy.h> |
86 | #include <linux/swap.h> | ||
87 | #include <linux/seq_file.h> | ||
88 | #include <linux/proc_fs.h> | ||
89 | |||
86 | #include <asm/tlbflush.h> | 90 | #include <asm/tlbflush.h> |
87 | #include <asm/uaccess.h> | 91 | #include <asm/uaccess.h> |
88 | 92 | ||
93 | /* Internal flags */ | ||
94 | #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ | ||
95 | #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ | ||
96 | #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ | ||
97 | |||
89 | static kmem_cache_t *policy_cache; | 98 | static kmem_cache_t *policy_cache; |
90 | static kmem_cache_t *sn_cache; | 99 | static kmem_cache_t *sn_cache; |
91 | 100 | ||
@@ -93,7 +102,7 @@ static kmem_cache_t *sn_cache; | |||
93 | 102 | ||
94 | /* Highest zone. An specific allocation for a zone below that is not | 103 | /* Highest zone. An specific allocation for a zone below that is not |
95 | policied. */ | 104 | policied. */ |
96 | static int policy_zone; | 105 | int policy_zone = ZONE_DMA; |
97 | 106 | ||
98 | struct mempolicy default_policy = { | 107 | struct mempolicy default_policy = { |
99 | .refcnt = ATOMIC_INIT(1), /* never free it */ | 108 | .refcnt = ATOMIC_INIT(1), /* never free it */ |
@@ -131,17 +140,8 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) | |||
131 | if (!zl) | 140 | if (!zl) |
132 | return NULL; | 141 | return NULL; |
133 | num = 0; | 142 | num = 0; |
134 | for_each_node_mask(nd, *nodes) { | 143 | for_each_node_mask(nd, *nodes) |
135 | int k; | 144 | zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone]; |
136 | for (k = MAX_NR_ZONES-1; k >= 0; k--) { | ||
137 | struct zone *z = &NODE_DATA(nd)->node_zones[k]; | ||
138 | if (!z->present_pages) | ||
139 | continue; | ||
140 | zl->zones[num++] = z; | ||
141 | if (k > policy_zone) | ||
142 | policy_zone = k; | ||
143 | } | ||
144 | } | ||
145 | zl->zones[num] = NULL; | 145 | zl->zones[num] = NULL; |
146 | return zl; | 146 | return zl; |
147 | } | 147 | } |
@@ -161,6 +161,10 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) | |||
161 | switch (mode) { | 161 | switch (mode) { |
162 | case MPOL_INTERLEAVE: | 162 | case MPOL_INTERLEAVE: |
163 | policy->v.nodes = *nodes; | 163 | policy->v.nodes = *nodes; |
164 | if (nodes_weight(*nodes) == 0) { | ||
165 | kmem_cache_free(policy_cache, policy); | ||
166 | return ERR_PTR(-EINVAL); | ||
167 | } | ||
164 | break; | 168 | break; |
165 | case MPOL_PREFERRED: | 169 | case MPOL_PREFERRED: |
166 | policy->v.preferred_node = first_node(*nodes); | 170 | policy->v.preferred_node = first_node(*nodes); |
@@ -176,12 +180,19 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) | |||
176 | break; | 180 | break; |
177 | } | 181 | } |
178 | policy->policy = mode; | 182 | policy->policy = mode; |
183 | policy->cpuset_mems_allowed = cpuset_mems_allowed(current); | ||
179 | return policy; | 184 | return policy; |
180 | } | 185 | } |
181 | 186 | ||
182 | /* Ensure all existing pages follow the policy. */ | 187 | static void gather_stats(struct page *, void *); |
188 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | ||
189 | unsigned long flags); | ||
190 | |||
191 | /* Scan through pages checking if pages follow certain conditions. */ | ||
183 | static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 192 | static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
184 | unsigned long addr, unsigned long end, nodemask_t *nodes) | 193 | unsigned long addr, unsigned long end, |
194 | const nodemask_t *nodes, unsigned long flags, | ||
195 | void *private) | ||
185 | { | 196 | { |
186 | pte_t *orig_pte; | 197 | pte_t *orig_pte; |
187 | pte_t *pte; | 198 | pte_t *pte; |
@@ -189,18 +200,36 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
189 | 200 | ||
190 | orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 201 | orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
191 | do { | 202 | do { |
192 | unsigned long pfn; | 203 | struct page *page; |
193 | unsigned int nid; | 204 | unsigned int nid; |
194 | 205 | ||
195 | if (!pte_present(*pte)) | 206 | if (!pte_present(*pte)) |
196 | continue; | 207 | continue; |
197 | pfn = pte_pfn(*pte); | 208 | page = vm_normal_page(vma, addr, *pte); |
198 | if (!pfn_valid(pfn)) { | 209 | if (!page) |
199 | print_bad_pte(vma, *pte, addr); | ||
200 | continue; | 210 | continue; |
201 | } | 211 | /* |
202 | nid = pfn_to_nid(pfn); | 212 | * The check for PageReserved here is important to avoid |
203 | if (!node_isset(nid, *nodes)) | 213 | * handling zero pages and other pages that may have been |
214 | * marked special by the system. | ||
215 | * | ||
216 | * If the PageReserved would not be checked here then f.e. | ||
217 | * the location of the zero page could have an influence | ||
218 | * on MPOL_MF_STRICT, zero pages would be counted for | ||
219 | * the per node stats, and there would be useless attempts | ||
220 | * to put zero pages on the migration list. | ||
221 | */ | ||
222 | if (PageReserved(page)) | ||
223 | continue; | ||
224 | nid = page_to_nid(page); | ||
225 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) | ||
226 | continue; | ||
227 | |||
228 | if (flags & MPOL_MF_STATS) | ||
229 | gather_stats(page, private); | ||
230 | else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | ||
231 | migrate_page_add(page, private, flags); | ||
232 | else | ||
204 | break; | 233 | break; |
205 | } while (pte++, addr += PAGE_SIZE, addr != end); | 234 | } while (pte++, addr += PAGE_SIZE, addr != end); |
206 | pte_unmap_unlock(orig_pte, ptl); | 235 | pte_unmap_unlock(orig_pte, ptl); |
@@ -208,7 +237,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
208 | } | 237 | } |
209 | 238 | ||
210 | static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 239 | static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
211 | unsigned long addr, unsigned long end, nodemask_t *nodes) | 240 | unsigned long addr, unsigned long end, |
241 | const nodemask_t *nodes, unsigned long flags, | ||
242 | void *private) | ||
212 | { | 243 | { |
213 | pmd_t *pmd; | 244 | pmd_t *pmd; |
214 | unsigned long next; | 245 | unsigned long next; |
@@ -218,14 +249,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
218 | next = pmd_addr_end(addr, end); | 249 | next = pmd_addr_end(addr, end); |
219 | if (pmd_none_or_clear_bad(pmd)) | 250 | if (pmd_none_or_clear_bad(pmd)) |
220 | continue; | 251 | continue; |
221 | if (check_pte_range(vma, pmd, addr, next, nodes)) | 252 | if (check_pte_range(vma, pmd, addr, next, nodes, |
253 | flags, private)) | ||
222 | return -EIO; | 254 | return -EIO; |
223 | } while (pmd++, addr = next, addr != end); | 255 | } while (pmd++, addr = next, addr != end); |
224 | return 0; | 256 | return 0; |
225 | } | 257 | } |
226 | 258 | ||
227 | static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | 259 | static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, |
228 | unsigned long addr, unsigned long end, nodemask_t *nodes) | 260 | unsigned long addr, unsigned long end, |
261 | const nodemask_t *nodes, unsigned long flags, | ||
262 | void *private) | ||
229 | { | 263 | { |
230 | pud_t *pud; | 264 | pud_t *pud; |
231 | unsigned long next; | 265 | unsigned long next; |
@@ -235,14 +269,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | |||
235 | next = pud_addr_end(addr, end); | 269 | next = pud_addr_end(addr, end); |
236 | if (pud_none_or_clear_bad(pud)) | 270 | if (pud_none_or_clear_bad(pud)) |
237 | continue; | 271 | continue; |
238 | if (check_pmd_range(vma, pud, addr, next, nodes)) | 272 | if (check_pmd_range(vma, pud, addr, next, nodes, |
273 | flags, private)) | ||
239 | return -EIO; | 274 | return -EIO; |
240 | } while (pud++, addr = next, addr != end); | 275 | } while (pud++, addr = next, addr != end); |
241 | return 0; | 276 | return 0; |
242 | } | 277 | } |
243 | 278 | ||
244 | static inline int check_pgd_range(struct vm_area_struct *vma, | 279 | static inline int check_pgd_range(struct vm_area_struct *vma, |
245 | unsigned long addr, unsigned long end, nodemask_t *nodes) | 280 | unsigned long addr, unsigned long end, |
281 | const nodemask_t *nodes, unsigned long flags, | ||
282 | void *private) | ||
246 | { | 283 | { |
247 | pgd_t *pgd; | 284 | pgd_t *pgd; |
248 | unsigned long next; | 285 | unsigned long next; |
@@ -252,38 +289,61 @@ static inline int check_pgd_range(struct vm_area_struct *vma, | |||
252 | next = pgd_addr_end(addr, end); | 289 | next = pgd_addr_end(addr, end); |
253 | if (pgd_none_or_clear_bad(pgd)) | 290 | if (pgd_none_or_clear_bad(pgd)) |
254 | continue; | 291 | continue; |
255 | if (check_pud_range(vma, pgd, addr, next, nodes)) | 292 | if (check_pud_range(vma, pgd, addr, next, nodes, |
293 | flags, private)) | ||
256 | return -EIO; | 294 | return -EIO; |
257 | } while (pgd++, addr = next, addr != end); | 295 | } while (pgd++, addr = next, addr != end); |
258 | return 0; | 296 | return 0; |
259 | } | 297 | } |
260 | 298 | ||
261 | /* Step 1: check the range */ | 299 | /* Check if a vma is migratable */ |
300 | static inline int vma_migratable(struct vm_area_struct *vma) | ||
301 | { | ||
302 | if (vma->vm_flags & ( | ||
303 | VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED)) | ||
304 | return 0; | ||
305 | return 1; | ||
306 | } | ||
307 | |||
308 | /* | ||
309 | * Check if all pages in a range are on a set of nodes. | ||
310 | * If pagelist != NULL then isolate pages from the LRU and | ||
311 | * put them on the pagelist. | ||
312 | */ | ||
262 | static struct vm_area_struct * | 313 | static struct vm_area_struct * |
263 | check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | 314 | check_range(struct mm_struct *mm, unsigned long start, unsigned long end, |
264 | nodemask_t *nodes, unsigned long flags) | 315 | const nodemask_t *nodes, unsigned long flags, void *private) |
265 | { | 316 | { |
266 | int err; | 317 | int err; |
267 | struct vm_area_struct *first, *vma, *prev; | 318 | struct vm_area_struct *first, *vma, *prev; |
268 | 319 | ||
320 | /* Clear the LRU lists so pages can be isolated */ | ||
321 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | ||
322 | lru_add_drain_all(); | ||
323 | |||
269 | first = find_vma(mm, start); | 324 | first = find_vma(mm, start); |
270 | if (!first) | 325 | if (!first) |
271 | return ERR_PTR(-EFAULT); | 326 | return ERR_PTR(-EFAULT); |
272 | if (first->vm_flags & VM_RESERVED) | ||
273 | return ERR_PTR(-EACCES); | ||
274 | prev = NULL; | 327 | prev = NULL; |
275 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { | 328 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { |
276 | if (!vma->vm_next && vma->vm_end < end) | 329 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { |
277 | return ERR_PTR(-EFAULT); | 330 | if (!vma->vm_next && vma->vm_end < end) |
278 | if (prev && prev->vm_end < vma->vm_start) | 331 | return ERR_PTR(-EFAULT); |
279 | return ERR_PTR(-EFAULT); | 332 | if (prev && prev->vm_end < vma->vm_start) |
280 | if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { | 333 | return ERR_PTR(-EFAULT); |
334 | } | ||
335 | if (!is_vm_hugetlb_page(vma) && | ||
336 | ((flags & MPOL_MF_STRICT) || | ||
337 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && | ||
338 | vma_migratable(vma)))) { | ||
281 | unsigned long endvma = vma->vm_end; | 339 | unsigned long endvma = vma->vm_end; |
340 | |||
282 | if (endvma > end) | 341 | if (endvma > end) |
283 | endvma = end; | 342 | endvma = end; |
284 | if (vma->vm_start > start) | 343 | if (vma->vm_start > start) |
285 | start = vma->vm_start; | 344 | start = vma->vm_start; |
286 | err = check_pgd_range(vma, start, endvma, nodes); | 345 | err = check_pgd_range(vma, start, endvma, nodes, |
346 | flags, private); | ||
287 | if (err) { | 347 | if (err) { |
288 | first = ERR_PTR(err); | 348 | first = ERR_PTR(err); |
289 | break; | 349 | break; |
@@ -342,51 +402,10 @@ static int contextualize_policy(int mode, nodemask_t *nodes) | |||
342 | if (!nodes) | 402 | if (!nodes) |
343 | return 0; | 403 | return 0; |
344 | 404 | ||
345 | /* Update current mems_allowed */ | 405 | cpuset_update_task_memory_state(); |
346 | cpuset_update_current_mems_allowed(); | 406 | if (!cpuset_nodes_subset_current_mems_allowed(*nodes)) |
347 | /* Ignore nodes not set in current->mems_allowed */ | ||
348 | cpuset_restrict_to_mems_allowed(nodes->bits); | ||
349 | return mpol_check_policy(mode, nodes); | ||
350 | } | ||
351 | |||
352 | long do_mbind(unsigned long start, unsigned long len, | ||
353 | unsigned long mode, nodemask_t *nmask, unsigned long flags) | ||
354 | { | ||
355 | struct vm_area_struct *vma; | ||
356 | struct mm_struct *mm = current->mm; | ||
357 | struct mempolicy *new; | ||
358 | unsigned long end; | ||
359 | int err; | ||
360 | |||
361 | if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) | ||
362 | return -EINVAL; | ||
363 | if (start & ~PAGE_MASK) | ||
364 | return -EINVAL; | ||
365 | if (mode == MPOL_DEFAULT) | ||
366 | flags &= ~MPOL_MF_STRICT; | ||
367 | len = (len + PAGE_SIZE - 1) & PAGE_MASK; | ||
368 | end = start + len; | ||
369 | if (end < start) | ||
370 | return -EINVAL; | ||
371 | if (end == start) | ||
372 | return 0; | ||
373 | if (mpol_check_policy(mode, nmask)) | ||
374 | return -EINVAL; | 407 | return -EINVAL; |
375 | new = mpol_new(mode, nmask); | 408 | return mpol_check_policy(mode, nodes); |
376 | if (IS_ERR(new)) | ||
377 | return PTR_ERR(new); | ||
378 | |||
379 | PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, | ||
380 | mode,nodes_addr(nodes)[0]); | ||
381 | |||
382 | down_write(&mm->mmap_sem); | ||
383 | vma = check_range(mm, start, end, nmask, flags); | ||
384 | err = PTR_ERR(vma); | ||
385 | if (!IS_ERR(vma)) | ||
386 | err = mbind_range(vma, start, end, new); | ||
387 | up_write(&mm->mmap_sem); | ||
388 | mpol_free(new); | ||
389 | return err; | ||
390 | } | 409 | } |
391 | 410 | ||
392 | /* Set the process memory policy */ | 411 | /* Set the process memory policy */ |
@@ -457,7 +476,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
457 | struct vm_area_struct *vma = NULL; | 476 | struct vm_area_struct *vma = NULL; |
458 | struct mempolicy *pol = current->mempolicy; | 477 | struct mempolicy *pol = current->mempolicy; |
459 | 478 | ||
460 | cpuset_update_current_mems_allowed(); | 479 | cpuset_update_task_memory_state(); |
461 | if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) | 480 | if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) |
462 | return -EINVAL; | 481 | return -EINVAL; |
463 | if (flags & MPOL_F_ADDR) { | 482 | if (flags & MPOL_F_ADDR) { |
@@ -509,11 +528,141 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
509 | } | 528 | } |
510 | 529 | ||
511 | /* | 530 | /* |
531 | * page migration | ||
532 | */ | ||
533 | |||
534 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | ||
535 | unsigned long flags) | ||
536 | { | ||
537 | /* | ||
538 | * Avoid migrating a page that is shared with others. | ||
539 | */ | ||
540 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { | ||
541 | if (isolate_lru_page(page)) | ||
542 | list_add(&page->lru, pagelist); | ||
543 | } | ||
544 | } | ||
545 | |||
546 | static int swap_pages(struct list_head *pagelist) | ||
547 | { | ||
548 | LIST_HEAD(moved); | ||
549 | LIST_HEAD(failed); | ||
550 | int n; | ||
551 | |||
552 | n = migrate_pages(pagelist, NULL, &moved, &failed); | ||
553 | putback_lru_pages(&failed); | ||
554 | putback_lru_pages(&moved); | ||
555 | |||
556 | return n; | ||
557 | } | ||
558 | |||
559 | /* | ||
560 | * For now migrate_pages simply swaps out the pages from nodes that are in | ||
561 | * the source set but not in the target set. In the future, we would | ||
562 | * want a function that moves pages between the two nodesets in such | ||
563 | * a way as to preserve the physical layout as much as possible. | ||
564 | * | ||
565 | * Returns the number of page that could not be moved. | ||
566 | */ | ||
567 | int do_migrate_pages(struct mm_struct *mm, | ||
568 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) | ||
569 | { | ||
570 | LIST_HEAD(pagelist); | ||
571 | int count = 0; | ||
572 | nodemask_t nodes; | ||
573 | |||
574 | nodes_andnot(nodes, *from_nodes, *to_nodes); | ||
575 | |||
576 | down_read(&mm->mmap_sem); | ||
577 | check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes, | ||
578 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | ||
579 | |||
580 | if (!list_empty(&pagelist)) { | ||
581 | count = swap_pages(&pagelist); | ||
582 | putback_lru_pages(&pagelist); | ||
583 | } | ||
584 | |||
585 | up_read(&mm->mmap_sem); | ||
586 | return count; | ||
587 | } | ||
588 | |||
589 | long do_mbind(unsigned long start, unsigned long len, | ||
590 | unsigned long mode, nodemask_t *nmask, unsigned long flags) | ||
591 | { | ||
592 | struct vm_area_struct *vma; | ||
593 | struct mm_struct *mm = current->mm; | ||
594 | struct mempolicy *new; | ||
595 | unsigned long end; | ||
596 | int err; | ||
597 | LIST_HEAD(pagelist); | ||
598 | |||
599 | if ((flags & ~(unsigned long)(MPOL_MF_STRICT | | ||
600 | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | ||
601 | || mode > MPOL_MAX) | ||
602 | return -EINVAL; | ||
603 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) | ||
604 | return -EPERM; | ||
605 | |||
606 | if (start & ~PAGE_MASK) | ||
607 | return -EINVAL; | ||
608 | |||
609 | if (mode == MPOL_DEFAULT) | ||
610 | flags &= ~MPOL_MF_STRICT; | ||
611 | |||
612 | len = (len + PAGE_SIZE - 1) & PAGE_MASK; | ||
613 | end = start + len; | ||
614 | |||
615 | if (end < start) | ||
616 | return -EINVAL; | ||
617 | if (end == start) | ||
618 | return 0; | ||
619 | |||
620 | if (mpol_check_policy(mode, nmask)) | ||
621 | return -EINVAL; | ||
622 | |||
623 | new = mpol_new(mode, nmask); | ||
624 | if (IS_ERR(new)) | ||
625 | return PTR_ERR(new); | ||
626 | |||
627 | /* | ||
628 | * If we are using the default policy then operation | ||
629 | * on discontinuous address spaces is okay after all | ||
630 | */ | ||
631 | if (!new) | ||
632 | flags |= MPOL_MF_DISCONTIG_OK; | ||
633 | |||
634 | PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, | ||
635 | mode,nodes_addr(nodes)[0]); | ||
636 | |||
637 | down_write(&mm->mmap_sem); | ||
638 | vma = check_range(mm, start, end, nmask, | ||
639 | flags | MPOL_MF_INVERT, &pagelist); | ||
640 | |||
641 | err = PTR_ERR(vma); | ||
642 | if (!IS_ERR(vma)) { | ||
643 | int nr_failed = 0; | ||
644 | |||
645 | err = mbind_range(vma, start, end, new); | ||
646 | if (!list_empty(&pagelist)) | ||
647 | nr_failed = swap_pages(&pagelist); | ||
648 | |||
649 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | ||
650 | err = -EIO; | ||
651 | } | ||
652 | if (!list_empty(&pagelist)) | ||
653 | putback_lru_pages(&pagelist); | ||
654 | |||
655 | up_write(&mm->mmap_sem); | ||
656 | mpol_free(new); | ||
657 | return err; | ||
658 | } | ||
659 | |||
660 | /* | ||
512 | * User space interface with variable sized bitmaps for nodelists. | 661 | * User space interface with variable sized bitmaps for nodelists. |
513 | */ | 662 | */ |
514 | 663 | ||
515 | /* Copy a node mask from user space. */ | 664 | /* Copy a node mask from user space. */ |
516 | static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, | 665 | static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, |
517 | unsigned long maxnode) | 666 | unsigned long maxnode) |
518 | { | 667 | { |
519 | unsigned long k; | 668 | unsigned long k; |
@@ -602,6 +751,65 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, | |||
602 | return do_set_mempolicy(mode, &nodes); | 751 | return do_set_mempolicy(mode, &nodes); |
603 | } | 752 | } |
604 | 753 | ||
754 | asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, | ||
755 | const unsigned long __user *old_nodes, | ||
756 | const unsigned long __user *new_nodes) | ||
757 | { | ||
758 | struct mm_struct *mm; | ||
759 | struct task_struct *task; | ||
760 | nodemask_t old; | ||
761 | nodemask_t new; | ||
762 | nodemask_t task_nodes; | ||
763 | int err; | ||
764 | |||
765 | err = get_nodes(&old, old_nodes, maxnode); | ||
766 | if (err) | ||
767 | return err; | ||
768 | |||
769 | err = get_nodes(&new, new_nodes, maxnode); | ||
770 | if (err) | ||
771 | return err; | ||
772 | |||
773 | /* Find the mm_struct */ | ||
774 | read_lock(&tasklist_lock); | ||
775 | task = pid ? find_task_by_pid(pid) : current; | ||
776 | if (!task) { | ||
777 | read_unlock(&tasklist_lock); | ||
778 | return -ESRCH; | ||
779 | } | ||
780 | mm = get_task_mm(task); | ||
781 | read_unlock(&tasklist_lock); | ||
782 | |||
783 | if (!mm) | ||
784 | return -EINVAL; | ||
785 | |||
786 | /* | ||
787 | * Check if this process has the right to modify the specified | ||
788 | * process. The right exists if the process has administrative | ||
789 | * capabilities, superuser priviledges or the same | ||
790 | * userid as the target process. | ||
791 | */ | ||
792 | if ((current->euid != task->suid) && (current->euid != task->uid) && | ||
793 | (current->uid != task->suid) && (current->uid != task->uid) && | ||
794 | !capable(CAP_SYS_ADMIN)) { | ||
795 | err = -EPERM; | ||
796 | goto out; | ||
797 | } | ||
798 | |||
799 | task_nodes = cpuset_mems_allowed(task); | ||
800 | /* Is the user allowed to access the target nodes? */ | ||
801 | if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) { | ||
802 | err = -EPERM; | ||
803 | goto out; | ||
804 | } | ||
805 | |||
806 | err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE); | ||
807 | out: | ||
808 | mmput(mm); | ||
809 | return err; | ||
810 | } | ||
811 | |||
812 | |||
605 | /* Retrieve NUMA policy */ | 813 | /* Retrieve NUMA policy */ |
606 | asmlinkage long sys_get_mempolicy(int __user *policy, | 814 | asmlinkage long sys_get_mempolicy(int __user *policy, |
607 | unsigned long __user *nmask, | 815 | unsigned long __user *nmask, |
@@ -708,8 +916,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, | |||
708 | #endif | 916 | #endif |
709 | 917 | ||
710 | /* Return effective policy for a VMA */ | 918 | /* Return effective policy for a VMA */ |
711 | struct mempolicy * | 919 | static struct mempolicy * get_vma_policy(struct task_struct *task, |
712 | get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) | 920 | struct vm_area_struct *vma, unsigned long addr) |
713 | { | 921 | { |
714 | struct mempolicy *pol = task->mempolicy; | 922 | struct mempolicy *pol = task->mempolicy; |
715 | 923 | ||
@@ -768,6 +976,33 @@ static unsigned interleave_nodes(struct mempolicy *policy) | |||
768 | return nid; | 976 | return nid; |
769 | } | 977 | } |
770 | 978 | ||
979 | /* | ||
980 | * Depending on the memory policy provide a node from which to allocate the | ||
981 | * next slab entry. | ||
982 | */ | ||
983 | unsigned slab_node(struct mempolicy *policy) | ||
984 | { | ||
985 | switch (policy->policy) { | ||
986 | case MPOL_INTERLEAVE: | ||
987 | return interleave_nodes(policy); | ||
988 | |||
989 | case MPOL_BIND: | ||
990 | /* | ||
991 | * Follow bind policy behavior and start allocation at the | ||
992 | * first node. | ||
993 | */ | ||
994 | return policy->v.zonelist->zones[0]->zone_pgdat->node_id; | ||
995 | |||
996 | case MPOL_PREFERRED: | ||
997 | if (policy->v.preferred_node >= 0) | ||
998 | return policy->v.preferred_node; | ||
999 | /* Fall through */ | ||
1000 | |||
1001 | default: | ||
1002 | return numa_node_id(); | ||
1003 | } | ||
1004 | } | ||
1005 | |||
771 | /* Do static interleaving for a VMA with known offset. */ | 1006 | /* Do static interleaving for a VMA with known offset. */ |
772 | static unsigned offset_il_node(struct mempolicy *pol, | 1007 | static unsigned offset_il_node(struct mempolicy *pol, |
773 | struct vm_area_struct *vma, unsigned long off) | 1008 | struct vm_area_struct *vma, unsigned long off) |
@@ -785,6 +1020,34 @@ static unsigned offset_il_node(struct mempolicy *pol, | |||
785 | return nid; | 1020 | return nid; |
786 | } | 1021 | } |
787 | 1022 | ||
1023 | /* Determine a node number for interleave */ | ||
1024 | static inline unsigned interleave_nid(struct mempolicy *pol, | ||
1025 | struct vm_area_struct *vma, unsigned long addr, int shift) | ||
1026 | { | ||
1027 | if (vma) { | ||
1028 | unsigned long off; | ||
1029 | |||
1030 | off = vma->vm_pgoff; | ||
1031 | off += (addr - vma->vm_start) >> shift; | ||
1032 | return offset_il_node(pol, vma, off); | ||
1033 | } else | ||
1034 | return interleave_nodes(pol); | ||
1035 | } | ||
1036 | |||
1037 | /* Return a zonelist suitable for a huge page allocation. */ | ||
1038 | struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) | ||
1039 | { | ||
1040 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | ||
1041 | |||
1042 | if (pol->policy == MPOL_INTERLEAVE) { | ||
1043 | unsigned nid; | ||
1044 | |||
1045 | nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); | ||
1046 | return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER); | ||
1047 | } | ||
1048 | return zonelist_policy(GFP_HIGHUSER, pol); | ||
1049 | } | ||
1050 | |||
788 | /* Allocate a page in interleaved policy. | 1051 | /* Allocate a page in interleaved policy. |
789 | Own path because it needs to do special accounting. */ | 1052 | Own path because it needs to do special accounting. */ |
790 | static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | 1053 | static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, |
@@ -829,19 +1092,12 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
829 | { | 1092 | { |
830 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | 1093 | struct mempolicy *pol = get_vma_policy(current, vma, addr); |
831 | 1094 | ||
832 | cpuset_update_current_mems_allowed(); | 1095 | cpuset_update_task_memory_state(); |
833 | 1096 | ||
834 | if (unlikely(pol->policy == MPOL_INTERLEAVE)) { | 1097 | if (unlikely(pol->policy == MPOL_INTERLEAVE)) { |
835 | unsigned nid; | 1098 | unsigned nid; |
836 | if (vma) { | 1099 | |
837 | unsigned long off; | 1100 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); |
838 | off = vma->vm_pgoff; | ||
839 | off += (addr - vma->vm_start) >> PAGE_SHIFT; | ||
840 | nid = offset_il_node(pol, vma, off); | ||
841 | } else { | ||
842 | /* fall back to process interleaving */ | ||
843 | nid = interleave_nodes(pol); | ||
844 | } | ||
845 | return alloc_page_interleave(gfp, 0, nid); | 1101 | return alloc_page_interleave(gfp, 0, nid); |
846 | } | 1102 | } |
847 | return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); | 1103 | return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); |
@@ -862,7 +1118,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
862 | * interrupt context and apply the current process NUMA policy. | 1118 | * interrupt context and apply the current process NUMA policy. |
863 | * Returns NULL when no page can be allocated. | 1119 | * Returns NULL when no page can be allocated. |
864 | * | 1120 | * |
865 | * Don't call cpuset_update_current_mems_allowed() unless | 1121 | * Don't call cpuset_update_task_memory_state() unless |
866 | * 1) it's ok to take cpuset_sem (can WAIT), and | 1122 | * 1) it's ok to take cpuset_sem (can WAIT), and |
867 | * 2) allocating for current task (not interrupt). | 1123 | * 2) allocating for current task (not interrupt). |
868 | */ | 1124 | */ |
@@ -871,7 +1127,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
871 | struct mempolicy *pol = current->mempolicy; | 1127 | struct mempolicy *pol = current->mempolicy; |
872 | 1128 | ||
873 | if ((gfp & __GFP_WAIT) && !in_interrupt()) | 1129 | if ((gfp & __GFP_WAIT) && !in_interrupt()) |
874 | cpuset_update_current_mems_allowed(); | 1130 | cpuset_update_task_memory_state(); |
875 | if (!pol || in_interrupt()) | 1131 | if (!pol || in_interrupt()) |
876 | pol = &default_policy; | 1132 | pol = &default_policy; |
877 | if (pol->policy == MPOL_INTERLEAVE) | 1133 | if (pol->policy == MPOL_INTERLEAVE) |
@@ -880,6 +1136,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
880 | } | 1136 | } |
881 | EXPORT_SYMBOL(alloc_pages_current); | 1137 | EXPORT_SYMBOL(alloc_pages_current); |
882 | 1138 | ||
1139 | /* | ||
1140 | * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it | ||
1141 | * rebinds the mempolicy its copying by calling mpol_rebind_policy() | ||
1142 | * with the mems_allowed returned by cpuset_mems_allowed(). This | ||
1143 | * keeps mempolicies cpuset relative after its cpuset moves. See | ||
1144 | * further kernel/cpuset.c update_nodemask(). | ||
1145 | */ | ||
1146 | void *cpuset_being_rebound; | ||
1147 | |||
883 | /* Slow path of a mempolicy copy */ | 1148 | /* Slow path of a mempolicy copy */ |
884 | struct mempolicy *__mpol_copy(struct mempolicy *old) | 1149 | struct mempolicy *__mpol_copy(struct mempolicy *old) |
885 | { | 1150 | { |
@@ -887,6 +1152,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) | |||
887 | 1152 | ||
888 | if (!new) | 1153 | if (!new) |
889 | return ERR_PTR(-ENOMEM); | 1154 | return ERR_PTR(-ENOMEM); |
1155 | if (current_cpuset_is_being_rebound()) { | ||
1156 | nodemask_t mems = cpuset_mems_allowed(current); | ||
1157 | mpol_rebind_policy(old, &mems); | ||
1158 | } | ||
890 | *new = *old; | 1159 | *new = *old; |
891 | atomic_set(&new->refcnt, 1); | 1160 | atomic_set(&new->refcnt, 1); |
892 | if (new->policy == MPOL_BIND) { | 1161 | if (new->policy == MPOL_BIND) { |
@@ -940,54 +1209,6 @@ void __mpol_free(struct mempolicy *p) | |||
940 | } | 1209 | } |
941 | 1210 | ||
942 | /* | 1211 | /* |
943 | * Hugetlb policy. Same as above, just works with node numbers instead of | ||
944 | * zonelists. | ||
945 | */ | ||
946 | |||
947 | /* Find first node suitable for an allocation */ | ||
948 | int mpol_first_node(struct vm_area_struct *vma, unsigned long addr) | ||
949 | { | ||
950 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | ||
951 | |||
952 | switch (pol->policy) { | ||
953 | case MPOL_DEFAULT: | ||
954 | return numa_node_id(); | ||
955 | case MPOL_BIND: | ||
956 | return pol->v.zonelist->zones[0]->zone_pgdat->node_id; | ||
957 | case MPOL_INTERLEAVE: | ||
958 | return interleave_nodes(pol); | ||
959 | case MPOL_PREFERRED: | ||
960 | return pol->v.preferred_node >= 0 ? | ||
961 | pol->v.preferred_node : numa_node_id(); | ||
962 | } | ||
963 | BUG(); | ||
964 | return 0; | ||
965 | } | ||
966 | |||
967 | /* Find secondary valid nodes for an allocation */ | ||
968 | int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr) | ||
969 | { | ||
970 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | ||
971 | |||
972 | switch (pol->policy) { | ||
973 | case MPOL_PREFERRED: | ||
974 | case MPOL_DEFAULT: | ||
975 | case MPOL_INTERLEAVE: | ||
976 | return 1; | ||
977 | case MPOL_BIND: { | ||
978 | struct zone **z; | ||
979 | for (z = pol->v.zonelist->zones; *z; z++) | ||
980 | if ((*z)->zone_pgdat->node_id == nid) | ||
981 | return 1; | ||
982 | return 0; | ||
983 | } | ||
984 | default: | ||
985 | BUG(); | ||
986 | return 0; | ||
987 | } | ||
988 | } | ||
989 | |||
990 | /* | ||
991 | * Shared memory backing store policy support. | 1212 | * Shared memory backing store policy support. |
992 | * | 1213 | * |
993 | * Remember policies even when nobody has shared memory mapped. | 1214 | * Remember policies even when nobody has shared memory mapped. |
@@ -1141,6 +1362,30 @@ restart: | |||
1141 | return 0; | 1362 | return 0; |
1142 | } | 1363 | } |
1143 | 1364 | ||
1365 | void mpol_shared_policy_init(struct shared_policy *info, int policy, | ||
1366 | nodemask_t *policy_nodes) | ||
1367 | { | ||
1368 | info->root = RB_ROOT; | ||
1369 | spin_lock_init(&info->lock); | ||
1370 | |||
1371 | if (policy != MPOL_DEFAULT) { | ||
1372 | struct mempolicy *newpol; | ||
1373 | |||
1374 | /* Falls back to MPOL_DEFAULT on any error */ | ||
1375 | newpol = mpol_new(policy, policy_nodes); | ||
1376 | if (!IS_ERR(newpol)) { | ||
1377 | /* Create pseudo-vma that contains just the policy */ | ||
1378 | struct vm_area_struct pvma; | ||
1379 | |||
1380 | memset(&pvma, 0, sizeof(struct vm_area_struct)); | ||
1381 | /* Policy covers entire file */ | ||
1382 | pvma.vm_end = TASK_SIZE; | ||
1383 | mpol_set_shared_policy(info, &pvma, newpol); | ||
1384 | mpol_free(newpol); | ||
1385 | } | ||
1386 | } | ||
1387 | } | ||
1388 | |||
1144 | int mpol_set_shared_policy(struct shared_policy *info, | 1389 | int mpol_set_shared_policy(struct shared_policy *info, |
1145 | struct vm_area_struct *vma, struct mempolicy *npol) | 1390 | struct vm_area_struct *vma, struct mempolicy *npol) |
1146 | { | 1391 | { |
@@ -1209,25 +1454,31 @@ void numa_default_policy(void) | |||
1209 | } | 1454 | } |
1210 | 1455 | ||
1211 | /* Migrate a policy to a different set of nodes */ | 1456 | /* Migrate a policy to a different set of nodes */ |
1212 | static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, | 1457 | void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) |
1213 | const nodemask_t *new) | ||
1214 | { | 1458 | { |
1459 | nodemask_t *mpolmask; | ||
1215 | nodemask_t tmp; | 1460 | nodemask_t tmp; |
1216 | 1461 | ||
1217 | if (!pol) | 1462 | if (!pol) |
1218 | return; | 1463 | return; |
1464 | mpolmask = &pol->cpuset_mems_allowed; | ||
1465 | if (nodes_equal(*mpolmask, *newmask)) | ||
1466 | return; | ||
1219 | 1467 | ||
1220 | switch (pol->policy) { | 1468 | switch (pol->policy) { |
1221 | case MPOL_DEFAULT: | 1469 | case MPOL_DEFAULT: |
1222 | break; | 1470 | break; |
1223 | case MPOL_INTERLEAVE: | 1471 | case MPOL_INTERLEAVE: |
1224 | nodes_remap(tmp, pol->v.nodes, *old, *new); | 1472 | nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); |
1225 | pol->v.nodes = tmp; | 1473 | pol->v.nodes = tmp; |
1226 | current->il_next = node_remap(current->il_next, *old, *new); | 1474 | *mpolmask = *newmask; |
1475 | current->il_next = node_remap(current->il_next, | ||
1476 | *mpolmask, *newmask); | ||
1227 | break; | 1477 | break; |
1228 | case MPOL_PREFERRED: | 1478 | case MPOL_PREFERRED: |
1229 | pol->v.preferred_node = node_remap(pol->v.preferred_node, | 1479 | pol->v.preferred_node = node_remap(pol->v.preferred_node, |
1230 | *old, *new); | 1480 | *mpolmask, *newmask); |
1481 | *mpolmask = *newmask; | ||
1231 | break; | 1482 | break; |
1232 | case MPOL_BIND: { | 1483 | case MPOL_BIND: { |
1233 | nodemask_t nodes; | 1484 | nodemask_t nodes; |
@@ -1237,7 +1488,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, | |||
1237 | nodes_clear(nodes); | 1488 | nodes_clear(nodes); |
1238 | for (z = pol->v.zonelist->zones; *z; z++) | 1489 | for (z = pol->v.zonelist->zones; *z; z++) |
1239 | node_set((*z)->zone_pgdat->node_id, nodes); | 1490 | node_set((*z)->zone_pgdat->node_id, nodes); |
1240 | nodes_remap(tmp, nodes, *old, *new); | 1491 | nodes_remap(tmp, nodes, *mpolmask, *newmask); |
1241 | nodes = tmp; | 1492 | nodes = tmp; |
1242 | 1493 | ||
1243 | zonelist = bind_zonelist(&nodes); | 1494 | zonelist = bind_zonelist(&nodes); |
@@ -1252,6 +1503,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, | |||
1252 | kfree(pol->v.zonelist); | 1503 | kfree(pol->v.zonelist); |
1253 | pol->v.zonelist = zonelist; | 1504 | pol->v.zonelist = zonelist; |
1254 | } | 1505 | } |
1506 | *mpolmask = *newmask; | ||
1255 | break; | 1507 | break; |
1256 | } | 1508 | } |
1257 | default: | 1509 | default: |
@@ -1261,12 +1513,156 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, | |||
1261 | } | 1513 | } |
1262 | 1514 | ||
1263 | /* | 1515 | /* |
1264 | * Someone moved this task to different nodes. Fixup mempolicies. | 1516 | * Wrapper for mpol_rebind_policy() that just requires task |
1517 | * pointer, and updates task mempolicy. | ||
1518 | */ | ||
1519 | |||
1520 | void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) | ||
1521 | { | ||
1522 | mpol_rebind_policy(tsk->mempolicy, new); | ||
1523 | } | ||
1524 | |||
1525 | /* | ||
1526 | * Rebind each vma in mm to new nodemask. | ||
1265 | * | 1527 | * |
1266 | * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well, | 1528 | * Call holding a reference to mm. Takes mm->mmap_sem during call. |
1267 | * once we have a cpuset mechanism to mark which cpuset subtree is migrating. | ||
1268 | */ | 1529 | */ |
1269 | void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new) | 1530 | |
1531 | void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) | ||
1270 | { | 1532 | { |
1271 | rebind_policy(current->mempolicy, old, new); | 1533 | struct vm_area_struct *vma; |
1534 | |||
1535 | down_write(&mm->mmap_sem); | ||
1536 | for (vma = mm->mmap; vma; vma = vma->vm_next) | ||
1537 | mpol_rebind_policy(vma->vm_policy, new); | ||
1538 | up_write(&mm->mmap_sem); | ||
1272 | } | 1539 | } |
1540 | |||
1541 | /* | ||
1542 | * Display pages allocated per node and memory policy via /proc. | ||
1543 | */ | ||
1544 | |||
1545 | static const char *policy_types[] = { "default", "prefer", "bind", | ||
1546 | "interleave" }; | ||
1547 | |||
1548 | /* | ||
1549 | * Convert a mempolicy into a string. | ||
1550 | * Returns the number of characters in buffer (if positive) | ||
1551 | * or an error (negative) | ||
1552 | */ | ||
1553 | static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) | ||
1554 | { | ||
1555 | char *p = buffer; | ||
1556 | int l; | ||
1557 | nodemask_t nodes; | ||
1558 | int mode = pol ? pol->policy : MPOL_DEFAULT; | ||
1559 | |||
1560 | switch (mode) { | ||
1561 | case MPOL_DEFAULT: | ||
1562 | nodes_clear(nodes); | ||
1563 | break; | ||
1564 | |||
1565 | case MPOL_PREFERRED: | ||
1566 | nodes_clear(nodes); | ||
1567 | node_set(pol->v.preferred_node, nodes); | ||
1568 | break; | ||
1569 | |||
1570 | case MPOL_BIND: | ||
1571 | get_zonemask(pol, &nodes); | ||
1572 | break; | ||
1573 | |||
1574 | case MPOL_INTERLEAVE: | ||
1575 | nodes = pol->v.nodes; | ||
1576 | break; | ||
1577 | |||
1578 | default: | ||
1579 | BUG(); | ||
1580 | return -EFAULT; | ||
1581 | } | ||
1582 | |||
1583 | l = strlen(policy_types[mode]); | ||
1584 | if (buffer + maxlen < p + l + 1) | ||
1585 | return -ENOSPC; | ||
1586 | |||
1587 | strcpy(p, policy_types[mode]); | ||
1588 | p += l; | ||
1589 | |||
1590 | if (!nodes_empty(nodes)) { | ||
1591 | if (buffer + maxlen < p + 2) | ||
1592 | return -ENOSPC; | ||
1593 | *p++ = '='; | ||
1594 | p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); | ||
1595 | } | ||
1596 | return p - buffer; | ||
1597 | } | ||
1598 | |||
1599 | struct numa_maps { | ||
1600 | unsigned long pages; | ||
1601 | unsigned long anon; | ||
1602 | unsigned long mapped; | ||
1603 | unsigned long mapcount_max; | ||
1604 | unsigned long node[MAX_NUMNODES]; | ||
1605 | }; | ||
1606 | |||
1607 | static void gather_stats(struct page *page, void *private) | ||
1608 | { | ||
1609 | struct numa_maps *md = private; | ||
1610 | int count = page_mapcount(page); | ||
1611 | |||
1612 | if (count) | ||
1613 | md->mapped++; | ||
1614 | |||
1615 | if (count > md->mapcount_max) | ||
1616 | md->mapcount_max = count; | ||
1617 | |||
1618 | md->pages++; | ||
1619 | |||
1620 | if (PageAnon(page)) | ||
1621 | md->anon++; | ||
1622 | |||
1623 | md->node[page_to_nid(page)]++; | ||
1624 | cond_resched(); | ||
1625 | } | ||
1626 | |||
1627 | int show_numa_map(struct seq_file *m, void *v) | ||
1628 | { | ||
1629 | struct task_struct *task = m->private; | ||
1630 | struct vm_area_struct *vma = v; | ||
1631 | struct numa_maps *md; | ||
1632 | int n; | ||
1633 | char buffer[50]; | ||
1634 | |||
1635 | if (!vma->vm_mm) | ||
1636 | return 0; | ||
1637 | |||
1638 | md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); | ||
1639 | if (!md) | ||
1640 | return 0; | ||
1641 | |||
1642 | check_pgd_range(vma, vma->vm_start, vma->vm_end, | ||
1643 | &node_online_map, MPOL_MF_STATS, md); | ||
1644 | |||
1645 | if (md->pages) { | ||
1646 | mpol_to_str(buffer, sizeof(buffer), | ||
1647 | get_vma_policy(task, vma, vma->vm_start)); | ||
1648 | |||
1649 | seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu", | ||
1650 | vma->vm_start, buffer, md->pages, | ||
1651 | md->mapped, md->mapcount_max); | ||
1652 | |||
1653 | if (md->anon) | ||
1654 | seq_printf(m," anon=%lu",md->anon); | ||
1655 | |||
1656 | for_each_online_node(n) | ||
1657 | if (md->node[n]) | ||
1658 | seq_printf(m, " N%d=%lu", n, md->node[n]); | ||
1659 | |||
1660 | seq_putc(m, '\n'); | ||
1661 | } | ||
1662 | kfree(md); | ||
1663 | |||
1664 | if (m->count < m->size) | ||
1665 | m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; | ||
1666 | return 0; | ||
1667 | } | ||
1668 | |||
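
For reference, a minimal userspace sketch (not part of the patch) of the interface this change introduces: the MPOL_MF_MOVE flag for mbind(2) and the new sys_migrate_pages() entry point. It assumes a libnuma new enough to expose the mbind() and migrate_pages() wrappers in <numaif.h> (link with -lnuma) and that nodes 0 and 1 exist on the machine; node numbers are illustrative only.

/*
 * Sketch: exercise the MPOL_MF_MOVE flag added to mbind(2) and the
 * new migrate_pages(2) syscall. Assumes <numaif.h> wrappers from
 * libnuma (link with -lnuma) and that nodes 0 and 1 are online.
 */
#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 16 * 4096;
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 0, len);	/* fault the pages in so there is something to move */

	/*
	 * Bind the range to node 1 and ask the kernel to migrate any
	 * already-allocated pages that violate the policy (MPOL_MF_MOVE);
	 * with MPOL_MF_STRICT, pages that could not be moved make the
	 * call fail with EIO.
	 */
	unsigned long node1 = 1UL << 1;
	if (mbind(buf, len, MPOL_BIND, &node1, sizeof(node1) * 8,
		  MPOL_MF_MOVE | MPOL_MF_STRICT) != 0)
		perror("mbind");

	/*
	 * Move all of this process's pages that sit on node 0 but not
	 * on node 1 (at this point in the series the kernel swaps them
	 * out rather than copying them node to node).
	 */
	unsigned long from = 1UL << 0, to = 1UL << 1;
	if (migrate_pages(getpid(), sizeof(from) * 8, &from, &to) < 0)
		perror("migrate_pages");

	munmap(buf, len);
	return 0;
}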