 include/linux/mempolicy.h |   3
 mm/mempolicy.c            | 155
 2 files changed, 138 insertions(+), 20 deletions(-)
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index ed00b278cb93..05443a766cb8 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -22,6 +22,9 @@
 
 /* Flags for mbind */
 #define MPOL_MF_STRICT	(1<<0)	/* Verify existing pages in the mapping */
+#define MPOL_MF_MOVE	(1<<1)	/* Move pages owned by this process to conform to mapping */
+#define MPOL_MF_MOVE_ALL (1<<2)	/* Move every page to conform to mapping */
+#define MPOL_MF_INTERNAL (1<<3)	/* Internal flags start here */
 
 #ifdef __KERNEL__
 
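
The two new user-visible flags are passed through mbind(2). A minimal
userspace sketch (assuming libnuma's mbind() wrapper from <numaif.h>;
the buffer size and target node here are arbitrary):

	#include <numaif.h>	/* mbind(), MPOL_*; link with -lnuma */
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	int main(void)
	{
		size_t len = 16 * 4096;
		unsigned long nodemask = 1UL << 0;	/* node 0 only */
		void *buf;

		if (posix_memalign(&buf, 4096, len))
			return 1;
		memset(buf, 0, len);	/* fault pages in, possibly off-node */

		/*
		 * Bind the range to node 0 and ask the kernel to migrate
		 * pages this process owns that sit on other nodes.
		 */
		if (mbind(buf, len, MPOL_BIND, &nodemask,
			  sizeof(nodemask) * 8, MPOL_MF_MOVE) != 0)
			perror("mbind");
		return 0;
	}

MPOL_MF_MOVE_ALL additionally moves pages shared with other processes
and, per the do_mbind() check below, requires CAP_SYS_RESOURCE.
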
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0f1d2b8a952b..9cc6d962831d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -83,9 +83,14 @@
 #include <linux/init.h>
 #include <linux/compat.h>
 #include <linux/mempolicy.h>
+#include <linux/swap.h>
+
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
 
+/* Internal MPOL_MF_xxx flags */
+#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
+
 static kmem_cache_t *policy_cache;
 static kmem_cache_t *sn_cache;
 
@@ -174,9 +179,59 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 	return policy;
 }
 
+/* Check if we are the only process mapping the page in question */
+static inline int single_mm_mapping(struct mm_struct *mm,
+		struct address_space *mapping)
+{
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int rc = 1;
+
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
+		if (mm != vma->vm_mm) {
+			rc = 0;
+			goto out;
+		}
+	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
+		if (mm != vma->vm_mm) {
+			rc = 0;
+			goto out;
+		}
+out:
+	spin_unlock(&mapping->i_mmap_lock);
+	return rc;
+}
+
+/*
+ * Add a page to be migrated to the pagelist
+ */
+static void migrate_page_add(struct vm_area_struct *vma,
+	struct page *page, struct list_head *pagelist, unsigned long flags)
+{
+	/*
+	 * Avoid migrating a page that is shared by others and not writable.
+	 */
+	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
+	    mapping_writably_mapped(page->mapping) ||
+	    single_mm_mapping(vma->vm_mm, page->mapping)) {
+		int rc = isolate_lru_page(page);
+
+		if (rc == 1)
+			list_add(&page->lru, pagelist);
+		/*
+		 * If the isolate attempt was not successful then we just
+		 * encountered an unswappable page. Something must be wrong.
+		 */
+		WARN_ON(rc == 0);
+	}
+}
+
 /* Ensure all existing pages follow the policy. */
 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pte_t *orig_pte;
 	pte_t *pte;
@@ -193,15 +248,21 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		if (!page)
 			continue;
 		nid = page_to_nid(page);
-		if (!node_isset(nid, *nodes))
-			break;
+		if (!node_isset(nid, *nodes)) {
+			if (pagelist)
+				migrate_page_add(vma, page, pagelist, flags);
+			else
+				break;
+		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(orig_pte, ptl);
 	return addr != end;
 }
 
 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -211,14 +272,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (check_pte_range(vma, pmd, addr, next, nodes))
+		if (check_pte_range(vma, pmd, addr, next, nodes,
+				    flags, pagelist))
 			return -EIO;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -228,14 +292,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		if (check_pmd_range(vma, pud, addr, next, nodes))
+		if (check_pmd_range(vma, pud, addr, next, nodes,
+				    flags, pagelist))
 			return -EIO;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int check_pgd_range(struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -245,16 +312,31 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		if (check_pud_range(vma, pgd, addr, next, nodes))
+		if (check_pud_range(vma, pgd, addr, next, nodes,
+				    flags, pagelist))
 			return -EIO;
 	} while (pgd++, addr = next, addr != end);
 	return 0;
 }
 
-/* Step 1: check the range */
+/* Check if a vma is migratable */
+static inline int vma_migratable(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & (
+		VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
+		return 0;
+	return 1;
+}
+
+/*
+ * Check if all pages in a range are on a set of nodes.
+ * If pagelist != NULL then isolate pages from the LRU and
+ * put them on the pagelist.
+ */
 static struct vm_area_struct *
 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
-	    nodemask_t *nodes, unsigned long flags)
+	    const nodemask_t *nodes, unsigned long flags,
+	    struct list_head *pagelist)
 {
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
@@ -264,17 +346,24 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		return ERR_PTR(-EFAULT);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
-		if (!vma->vm_next && vma->vm_end < end)
-			return ERR_PTR(-EFAULT);
-		if (prev && prev->vm_end < vma->vm_start)
-			return ERR_PTR(-EFAULT);
-		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
+		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
+			if (!vma->vm_next && vma->vm_end < end)
+				return ERR_PTR(-EFAULT);
+			if (prev && prev->vm_end < vma->vm_start)
+				return ERR_PTR(-EFAULT);
+		}
+		if (!is_vm_hugetlb_page(vma) &&
+		    ((flags & MPOL_MF_STRICT) ||
+		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
+			vma_migratable(vma)))) {
 			unsigned long endvma = vma->vm_end;
+
 			if (endvma > end)
 				endvma = end;
 			if (vma->vm_start > start)
 				start = vma->vm_start;
-			err = check_pgd_range(vma, start, endvma, nodes);
+			err = check_pgd_range(vma, start, endvma, nodes,
+						flags, pagelist);
 			if (err) {
 				first = ERR_PTR(err);
 				break;
@@ -348,33 +437,59 @@ long do_mbind(unsigned long start, unsigned long len,
 	struct mempolicy *new;
 	unsigned long end;
 	int err;
+	LIST_HEAD(pagelist);
 
-	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
+	if ((flags & ~(unsigned long)(MPOL_MF_STRICT|MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
+	    || mode > MPOL_MAX)
 		return -EINVAL;
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
 	if (start & ~PAGE_MASK)
 		return -EINVAL;
+
 	if (mode == MPOL_DEFAULT)
 		flags &= ~MPOL_MF_STRICT;
+
 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 	end = start + len;
+
 	if (end < start)
 		return -EINVAL;
 	if (end == start)
 		return 0;
+
 	if (mpol_check_policy(mode, nmask))
 		return -EINVAL;
+
 	new = mpol_new(mode, nmask);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
+	/*
+	 * If we are using the default policy then operation
+	 * on discontinuous address spaces is okay after all
+	 */
+	if (!new)
+		flags |= MPOL_MF_DISCONTIG_OK;
+
 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 			mode,nodes_addr(nodes)[0]);
 
 	down_write(&mm->mmap_sem);
-	vma = check_range(mm, start, end, nmask, flags);
+	vma = check_range(mm, start, end, nmask, flags,
+	      (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL);
 	err = PTR_ERR(vma);
-	if (!IS_ERR(vma))
+	if (!IS_ERR(vma)) {
 		err = mbind_range(vma, start, end, new);
+		if (!list_empty(&pagelist))
+			migrate_pages(&pagelist, NULL);
+		if (!err && !list_empty(&pagelist) && (flags & MPOL_MF_STRICT))
+			err = -EIO;
+	}
+	if (!list_empty(&pagelist))
+		putback_lru_pages(&pagelist);
+
 	up_write(&mm->mmap_sem);
 	mpol_free(new);
 	return err;
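
Taken together, the do_mbind() tail above gives the new flags their
user-visible semantics: pages collected on the pagelist are handed to
migrate_pages(), whatever could not be migrated is put back on the LRU,
and a non-empty pagelist combined with MPOL_MF_STRICT surfaces as -EIO.
A sketch of how a caller might decode the error cases (the wrapper name
is ours, not part of any API; again assuming libnuma's mbind() wrapper):

	#include <errno.h>
	#include <numaif.h>
	#include <stdio.h>

	/* Illustrative only: map mbind(2) failures to the checks above. */
	static int mbind_or_explain(void *addr, unsigned long len, int mode,
				    const unsigned long *nodes,
				    unsigned long maxnode, unsigned flags)
	{
		if (mbind(addr, len, mode, nodes, maxnode, flags) == 0)
			return 0;
		switch (errno) {
		case EPERM:	/* MPOL_MF_MOVE_ALL without CAP_SYS_RESOURCE */
			fprintf(stderr, "MOVE_ALL needs CAP_SYS_RESOURCE\n");
			break;
		case EIO:	/* MPOL_MF_STRICT: pages remain off-node */
			fprintf(stderr, "some pages were not migrated\n");
			break;
		case EFAULT:	/* hole in range, no MPOL_MF_DISCONTIG_OK */
			fprintf(stderr, "range has unmapped areas\n");
			break;
		default:
			perror("mbind");
		}
		return -1;
	}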