diff options
-rw-r--r-- | Documentation/cgroups/memory.txt | 56 | ||||
-rw-r--r-- | mm/memcontrol.c | 97 |
2 files changed, 147 insertions, 6 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index b871f2552b45..e726fb0df719 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt | |||
@@ -262,10 +262,12 @@ some of the pages cached in the cgroup (page cache pages). | |||
262 | 4.2 Task migration | 262 | 4.2 Task migration |
263 | 263 | ||
264 | When a task migrates from one cgroup to another, its charge is not | 264 | When a task migrates from one cgroup to another, its charge is not |
265 | carried forward. The pages allocated from the original cgroup still | 265 | carried forward by default. The pages allocated from the original cgroup still |
266 | remain charged to it, the charge is dropped when the page is freed or | 266 | remain charged to it, the charge is dropped when the page is freed or |
267 | reclaimed. | 267 | reclaimed. |
268 | 268 | ||
269 | Note: You can move charges of a task along with task migration. See 8. | ||
270 | |||
269 | 4.3 Removing a cgroup | 271 | 4.3 Removing a cgroup |
270 | 272 | ||
271 | A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a | 273 | A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a |
@@ -414,7 +416,57 @@ NOTE1: Soft limits take effect over a long period of time, since they involve | |||
414 | NOTE2: It is recommended to set the soft limit always below the hard limit, | 416 | NOTE2: It is recommended to set the soft limit always below the hard limit, |
415 | otherwise the hard limit will take precedence. | 417 | otherwise the hard limit will take precedence. |
416 | 418 | ||
417 | 8. TODO | 419 | 8. Move charges at task migration |
420 | |||
421 | Users can move charges associated with a task along with task migration, that | ||
422 | is, uncharge task's pages from the old cgroup and charge them to the new cgroup. | ||
423 | |||
424 | 8.1 Interface | ||
425 | |||
426 | This feature is disabled by default. It can be enabled (and disabled again) by | ||
427 | writing to memory.move_charge_at_immigrate of the destination cgroup. | ||
428 | |||
429 | If you want to enable it: | ||
430 | |||
431 | # echo (some positive value) > memory.move_charge_at_immigrate | ||
432 | |||
433 | Note: Each bit of move_charge_at_immigrate has its own meaning about what type | ||
434 | of charges should be moved. See 8.2 for details. | ||
435 | Note: Charges are moved only when you move mm->owner, IOW, a leader of a thread | ||
436 | group. | ||
437 | Note: If we cannot find enough space for the task in the destination cgroup, we | ||
438 | try to make space by reclaiming memory. Task migration may fail if we | ||
439 | cannot make enough space. | ||
440 | Note: It can take several seconds if you move gigabytes of charges. | ||
441 | |||
442 | And if you want to disable it again: | ||
443 | |||
444 | # echo 0 > memory.move_charge_at_immigrate | ||
445 | |||
446 | 8.2 Types of charges which can be moved | ||
447 | |||
448 | Each bit of move_charge_at_immigrate has its own meaning about what type of | ||
449 | charges should be moved. | ||
450 | |||
451 | bit | what type of charges would be moved ? | ||
452 | -----+------------------------------------------------------------------------ | ||
453 | 0 | A charge of an anonymous page (or swap of it) used by the target task. | ||
454 | | Those pages and swaps must be used only by the target task. You must | ||
455 | | enable Swap Extension(see 2.4) to enable move of swap charges. | ||
456 | |||
457 | Note: Those pages and swaps must be charged to the old cgroup. | ||
458 | Note: More types of pages (e.g. file cache, shmem) will be supported by other | ||
459 | bits in future. | ||
460 | |||
461 | 8.3 TODO | ||
462 | |||
463 | - Add support for other types of pages (e.g. file cache, shmem, etc.). | ||
464 | - Implement madvise(2) to let users decide the vma to be moved or not to be | ||
465 | moved. | ||
466 | - All charge-moving operations are done under cgroup_mutex. It's not good | ||
467 | behavior to hold the mutex too long, so we may need some trick. | ||
468 | |||
469 | 9. TODO | ||
418 | 470 | ||
419 | 1. Add support for accounting huge pages (as a separate controller) | 471 | 1. Add support for accounting huge pages (as a separate controller) |
420 | 2. Make per-cgroup scanner reclaim not-shared pages first | 472 | 2. Make per-cgroup scanner reclaim not-shared pages first |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d813823ab08f..59ffaf511d77 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -226,11 +226,26 @@ struct mem_cgroup { | |||
226 | bool memsw_is_minimum; | 226 | bool memsw_is_minimum; |
227 | 227 | ||
228 | /* | 228 | /* |
229 | * Should we move charges of a task when a task is moved into this | ||
230 | * mem_cgroup ? And what type of charges should we move ? | ||
231 | */ | ||
232 | unsigned long move_charge_at_immigrate; | ||
233 | |||
234 | /* | ||
229 | * statistics. This must be placed at the end of memcg. | 235 | * statistics. This must be placed at the end of memcg. |
230 | */ | 236 | */ |
231 | struct mem_cgroup_stat stat; | 237 | struct mem_cgroup_stat stat; |
232 | }; | 238 | }; |
233 | 239 | ||
240 | /* Stuff for moving charges at task migration. */ | ||
241 | /* | ||
242 | * Types of charges to be moved. "move_charge_at_immigrate" is treated as a | ||
243 | * left-shifted bitmap of these types. | ||
244 | */ | ||
245 | enum move_type { | ||
246 | NR_MOVE_TYPE, | ||
247 | }; | ||
248 | |||
234 | /* | 249 | /* |
235 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft | 250 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft |
236 | * limit reclaim to prevent infinite loops, if they ever occur. | 251 | * limit reclaim to prevent infinite loops, if they ever occur. |
@@ -2865,6 +2880,31 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
2865 | return 0; | 2880 | return 0; |
2866 | } | 2881 | } |
2867 | 2882 | ||
2883 | static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, | ||
2884 | struct cftype *cft) | ||
2885 | { | ||
2886 | return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; | ||
2887 | } | ||
2888 | |||
2889 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | ||
2890 | struct cftype *cft, u64 val) | ||
2891 | { | ||
2892 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
2893 | |||
2894 | if (val >= (1 << NR_MOVE_TYPE)) | ||
2895 | return -EINVAL; | ||
2896 | /* | ||
2897 | * We check this value several times in both can_attach() and | ||
2898 | * attach(), so we need cgroup lock to prevent this value from being | ||
2899 | * inconsistent. | ||
2900 | */ | ||
2901 | cgroup_lock(); | ||
2902 | mem->move_charge_at_immigrate = val; | ||
2903 | cgroup_unlock(); | ||
2904 | |||
2905 | return 0; | ||
2906 | } | ||
2907 | |||
2868 | 2908 | ||
2869 | /* For read statistics */ | 2909 | /* For read statistics */ |
2870 | enum { | 2910 | enum { |
@@ -3098,6 +3138,11 @@ static struct cftype mem_cgroup_files[] = { | |||
3098 | .read_u64 = mem_cgroup_swappiness_read, | 3138 | .read_u64 = mem_cgroup_swappiness_read, |
3099 | .write_u64 = mem_cgroup_swappiness_write, | 3139 | .write_u64 = mem_cgroup_swappiness_write, |
3100 | }, | 3140 | }, |
3141 | { | ||
3142 | .name = "move_charge_at_immigrate", | ||
3143 | .read_u64 = mem_cgroup_move_charge_read, | ||
3144 | .write_u64 = mem_cgroup_move_charge_write, | ||
3145 | }, | ||
3101 | }; | 3146 | }; |
3102 | 3147 | ||
3103 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 3148 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
@@ -3345,6 +3390,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3345 | if (parent) | 3390 | if (parent) |
3346 | mem->swappiness = get_swappiness(parent); | 3391 | mem->swappiness = get_swappiness(parent); |
3347 | atomic_set(&mem->refcnt, 1); | 3392 | atomic_set(&mem->refcnt, 1); |
3393 | mem->move_charge_at_immigrate = 0; | ||
3348 | return &mem->css; | 3394 | return &mem->css; |
3349 | free_out: | 3395 | free_out: |
3350 | __mem_cgroup_free(mem); | 3396 | __mem_cgroup_free(mem); |
@@ -3381,16 +3427,57 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, | |||
3381 | return ret; | 3427 | return ret; |
3382 | } | 3428 | } |
3383 | 3429 | ||
3430 | /* Handlers for move charge at task migration. */ | ||
3431 | static int mem_cgroup_can_move_charge(void) | ||
3432 | { | ||
3433 | return 0; | ||
3434 | } | ||
3435 | |||
3436 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | ||
3437 | struct cgroup *cgroup, | ||
3438 | struct task_struct *p, | ||
3439 | bool threadgroup) | ||
3440 | { | ||
3441 | int ret = 0; | ||
3442 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); | ||
3443 | |||
3444 | if (mem->move_charge_at_immigrate) { | ||
3445 | struct mm_struct *mm; | ||
3446 | struct mem_cgroup *from = mem_cgroup_from_task(p); | ||
3447 | |||
3448 | VM_BUG_ON(from == mem); | ||
3449 | |||
3450 | mm = get_task_mm(p); | ||
3451 | if (!mm) | ||
3452 | return 0; | ||
3453 | |||
3454 | /* We move charges only when we move the owner of the mm */ | ||
3455 | if (mm->owner == p) | ||
3456 | ret = mem_cgroup_can_move_charge(); | ||
3457 | |||
3458 | mmput(mm); | ||
3459 | } | ||
3460 | return ret; | ||
3461 | } | ||
3462 | |||
3463 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | ||
3464 | struct cgroup *cgroup, | ||
3465 | struct task_struct *p, | ||
3466 | bool threadgroup) | ||
3467 | { | ||
3468 | } | ||
3469 | |||
3470 | static void mem_cgroup_move_charge(void) | ||
3471 | { | ||
3472 | } | ||
3473 | |||
3384 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 3474 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
3385 | struct cgroup *cont, | 3475 | struct cgroup *cont, |
3386 | struct cgroup *old_cont, | 3476 | struct cgroup *old_cont, |
3387 | struct task_struct *p, | 3477 | struct task_struct *p, |
3388 | bool threadgroup) | 3478 | bool threadgroup) |
3389 | { | 3479 | { |
3390 | /* | 3480 | mem_cgroup_move_charge(); |
3391 | * FIXME: It's better to move charges of this process from old | ||
3392 | * memcg to new memcg. But it's just on TODO-List now. | ||
3393 | */ | ||
3394 | } | 3481 | } |
3395 | 3482 | ||
3396 | struct cgroup_subsys mem_cgroup_subsys = { | 3483 | struct cgroup_subsys mem_cgroup_subsys = { |
@@ -3400,6 +3487,8 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
3400 | .pre_destroy = mem_cgroup_pre_destroy, | 3487 | .pre_destroy = mem_cgroup_pre_destroy, |
3401 | .destroy = mem_cgroup_destroy, | 3488 | .destroy = mem_cgroup_destroy, |
3402 | .populate = mem_cgroup_populate, | 3489 | .populate = mem_cgroup_populate, |
3490 | .can_attach = mem_cgroup_can_attach, | ||
3491 | .cancel_attach = mem_cgroup_cancel_attach, | ||
3403 | .attach = mem_cgroup_move_task, | 3492 | .attach = mem_cgroup_move_task, |
3404 | .early_init = 0, | 3493 | .early_init = 0, |
3405 | .use_id = 1, | 3494 | .use_id = 1, |