diff options
| -rw-r--r-- | Documentation/cgroups/memory.txt | 56 | ||||
| -rw-r--r-- | mm/memcontrol.c | 97 |
2 files changed, 147 insertions, 6 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index b871f2552b45..e726fb0df719 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt | |||
| @@ -262,10 +262,12 @@ some of the pages cached in the cgroup (page cache pages). | |||
| 262 | 4.2 Task migration | 262 | 4.2 Task migration |
| 263 | 263 | ||
| 264 | When a task migrates from one cgroup to another, its charge is not | 264 | When a task migrates from one cgroup to another, its charge is not |
| 265 | carried forward. The pages allocated from the original cgroup still | 265 | carried forward by default. The pages allocated from the original cgroup still |
| 266 | remain charged to it, the charge is dropped when the page is freed or | 266 | remain charged to it, the charge is dropped when the page is freed or |
| 267 | reclaimed. | 267 | reclaimed. |
| 268 | 268 | ||
| 269 | Note: You can move charges of a task along with task migration. See 8. | ||
| 270 | |||
| 269 | 4.3 Removing a cgroup | 271 | 4.3 Removing a cgroup |
| 270 | 272 | ||
| 271 | A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a | 273 | A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a |
| @@ -414,7 +416,57 @@ NOTE1: Soft limits take effect over a long period of time, since they involve | |||
| 414 | NOTE2: It is recommended to set the soft limit always below the hard limit, | 416 | NOTE2: It is recommended to set the soft limit always below the hard limit, |
| 415 | otherwise the hard limit will take precedence. | 417 | otherwise the hard limit will take precedence. |
| 416 | 418 | ||
| 417 | 8. TODO | 419 | 8. Move charges at task migration |
| 420 | |||
| 421 | Users can move charges associated with a task along with task migration, that | ||
| 422 | is, uncharge task's pages from the old cgroup and charge them to the new cgroup. | ||
| 423 | |||
| 424 | 8.1 Interface | ||
| 425 | |||
| 426 | This feature is disabled by default. It can be enabled (and disabled again) by | ||
| 427 | writing to memory.move_charge_at_immigrate of the destination cgroup. | ||
| 428 | |||
| 429 | If you want to enable it: | ||
| 430 | |||
| 431 | # echo (some positive value) > memory.move_charge_at_immigrate | ||
| 432 | |||
| 433 | Note: Each bit of move_charge_at_immigrate has its own meaning about what type | ||
| 434 | of charges should be moved. See 8.2 for details. | ||
| 435 | Note: Charges are moved only when you move mm->owner, IOW, a leader of a thread | ||
| 436 | group. | ||
| 437 | Note: If we cannot find enough space for the task in the destination cgroup, we | ||
| 438 | try to make space by reclaiming memory. Task migration may fail if we | ||
| 439 | cannot make enough space. | ||
| 440 | Note: It can take several seconds if you move charges on the order of gigabytes. | ||
| 441 | |||
| 442 | And if you want to disable it again: | ||
| 443 | |||
| 444 | # echo 0 > memory.move_charge_at_immigrate | ||
| 445 | |||
| 446 | 8.2 Type of charges which can be moved | ||
| 447 | |||
| 448 | Each bit of move_charge_at_immigrate has its own meaning about what type of | ||
| 449 | charges should be moved. | ||
| 450 | |||
| 451 | bit | what type of charges would be moved ? | ||
| 452 | -----+------------------------------------------------------------------------ | ||
| 453 | 0 | A charge of an anonymous page(or swap of it) used by the target task. | ||
| 454 | | Those pages and swaps must be used only by the target task. You must | ||
| 455 | | enable Swap Extension(see 2.4) to enable move of swap charges. | ||
| 456 | |||
| 457 | Note: Those pages and swaps must be charged to the old cgroup. | ||
| 458 | Note: More types of pages (e.g. file cache, shmem) will be supported by other | ||
| 459 | bits in the future. | ||
| 460 | |||
| 461 | 8.3 TODO | ||
| 462 | |||
| 463 | - Add support for other types of pages(e.g. file cache, shmem, etc.). | ||
| 464 | - Implement madvise(2) to let users decide the vma to be moved or not to be | ||
| 465 | moved. | ||
| 466 | - All charge-moving operations are done under cgroup_mutex. It's not good | ||
| 467 | behavior to hold the mutex too long, so we may need some trick. | ||
| 468 | |||
| 469 | 9. TODO | ||
| 418 | 470 | ||
| 419 | 1. Add support for accounting huge pages (as a separate controller) | 471 | 1. Add support for accounting huge pages (as a separate controller) |
| 420 | 2. Make per-cgroup scanner reclaim not-shared pages first | 472 | 2. Make per-cgroup scanner reclaim not-shared pages first |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d813823ab08f..59ffaf511d77 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -226,11 +226,26 @@ struct mem_cgroup { | |||
| 226 | bool memsw_is_minimum; | 226 | bool memsw_is_minimum; |
| 227 | 227 | ||
| 228 | /* | 228 | /* |
| 229 | * Should we move charges of a task when a task is moved into this | ||
| 230 | * mem_cgroup ? And what type of charges should we move ? | ||
| 231 | */ | ||
| 232 | unsigned long move_charge_at_immigrate; | ||
| 233 | |||
| 234 | /* | ||
| 229 | * statistics. This must be placed at the end of memcg. | 235 | * statistics. This must be placed at the end of memcg. |
| 230 | */ | 236 | */ |
| 231 | struct mem_cgroup_stat stat; | 237 | struct mem_cgroup_stat stat; |
| 232 | }; | 238 | }; |
| 233 | 239 | ||
| 240 | /* Stuff for moving charges at task migration. */ | ||
| 241 | /* | ||
| 242 | * Types of charges to be moved. "move_charge_at_immigrate" is treated as a | ||
| 243 | * left-shifted bitmap of these types. | ||
| 244 | */ | ||
| 245 | enum move_type { | ||
| 246 | NR_MOVE_TYPE, | ||
| 247 | }; | ||
| 248 | |||
| 234 | /* | 249 | /* |
| 235 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft | 250 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft |
| 236 | * limit reclaim to prevent infinite loops, if they ever occur. | 251 | * limit reclaim to prevent infinite loops, if they ever occur. |
| @@ -2865,6 +2880,31 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
| 2865 | return 0; | 2880 | return 0; |
| 2866 | } | 2881 | } |
| 2867 | 2882 | ||
| 2883 | static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, | ||
| 2884 | struct cftype *cft) | ||
| 2885 | { | ||
| 2886 | return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; | ||
| 2887 | } | ||
| 2888 | |||
| 2889 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | ||
| 2890 | struct cftype *cft, u64 val) | ||
| 2891 | { | ||
| 2892 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
| 2893 | |||
| 2894 | if (val >= (1 << NR_MOVE_TYPE)) | ||
| 2895 | return -EINVAL; | ||
| 2896 | /* | ||
| 2897 | * We check this value several times in both in can_attach() and | ||
| 2898 | * attach(), so we need cgroup lock to prevent this value from being | ||
| 2899 | * inconsistent. | ||
| 2900 | */ | ||
| 2901 | cgroup_lock(); | ||
| 2902 | mem->move_charge_at_immigrate = val; | ||
| 2903 | cgroup_unlock(); | ||
| 2904 | |||
| 2905 | return 0; | ||
| 2906 | } | ||
| 2907 | |||
| 2868 | 2908 | ||
| 2869 | /* For read statistics */ | 2909 | /* For read statistics */ |
| 2870 | enum { | 2910 | enum { |
| @@ -3098,6 +3138,11 @@ static struct cftype mem_cgroup_files[] = { | |||
| 3098 | .read_u64 = mem_cgroup_swappiness_read, | 3138 | .read_u64 = mem_cgroup_swappiness_read, |
| 3099 | .write_u64 = mem_cgroup_swappiness_write, | 3139 | .write_u64 = mem_cgroup_swappiness_write, |
| 3100 | }, | 3140 | }, |
| 3141 | { | ||
| 3142 | .name = "move_charge_at_immigrate", | ||
| 3143 | .read_u64 = mem_cgroup_move_charge_read, | ||
| 3144 | .write_u64 = mem_cgroup_move_charge_write, | ||
| 3145 | }, | ||
| 3101 | }; | 3146 | }; |
| 3102 | 3147 | ||
| 3103 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 3148 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
| @@ -3345,6 +3390,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 3345 | if (parent) | 3390 | if (parent) |
| 3346 | mem->swappiness = get_swappiness(parent); | 3391 | mem->swappiness = get_swappiness(parent); |
| 3347 | atomic_set(&mem->refcnt, 1); | 3392 | atomic_set(&mem->refcnt, 1); |
| 3393 | mem->move_charge_at_immigrate = 0; | ||
| 3348 | return &mem->css; | 3394 | return &mem->css; |
| 3349 | free_out: | 3395 | free_out: |
| 3350 | __mem_cgroup_free(mem); | 3396 | __mem_cgroup_free(mem); |
| @@ -3381,16 +3427,57 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, | |||
| 3381 | return ret; | 3427 | return ret; |
| 3382 | } | 3428 | } |
| 3383 | 3429 | ||
| 3430 | /* Handlers for move charge at task migration. */ | ||
| 3431 | static int mem_cgroup_can_move_charge(void) | ||
| 3432 | { | ||
| 3433 | return 0; | ||
| 3434 | } | ||
| 3435 | |||
| 3436 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | ||
| 3437 | struct cgroup *cgroup, | ||
| 3438 | struct task_struct *p, | ||
| 3439 | bool threadgroup) | ||
| 3440 | { | ||
| 3441 | int ret = 0; | ||
| 3442 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); | ||
| 3443 | |||
| 3444 | if (mem->move_charge_at_immigrate) { | ||
| 3445 | struct mm_struct *mm; | ||
| 3446 | struct mem_cgroup *from = mem_cgroup_from_task(p); | ||
| 3447 | |||
| 3448 | VM_BUG_ON(from == mem); | ||
| 3449 | |||
| 3450 | mm = get_task_mm(p); | ||
| 3451 | if (!mm) | ||
| 3452 | return 0; | ||
| 3453 | |||
| 3454 | /* We move charges only when we move a owner of the mm */ | ||
| 3455 | if (mm->owner == p) | ||
| 3456 | ret = mem_cgroup_can_move_charge(); | ||
| 3457 | |||
| 3458 | mmput(mm); | ||
| 3459 | } | ||
| 3460 | return ret; | ||
| 3461 | } | ||
| 3462 | |||
| 3463 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | ||
| 3464 | struct cgroup *cgroup, | ||
| 3465 | struct task_struct *p, | ||
| 3466 | bool threadgroup) | ||
| 3467 | { | ||
| 3468 | } | ||
| 3469 | |||
| 3470 | static void mem_cgroup_move_charge(void) | ||
| 3471 | { | ||
| 3472 | } | ||
| 3473 | |||
| 3384 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 3474 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
| 3385 | struct cgroup *cont, | 3475 | struct cgroup *cont, |
| 3386 | struct cgroup *old_cont, | 3476 | struct cgroup *old_cont, |
| 3387 | struct task_struct *p, | 3477 | struct task_struct *p, |
| 3388 | bool threadgroup) | 3478 | bool threadgroup) |
| 3389 | { | 3479 | { |
| 3390 | /* | 3480 | mem_cgroup_move_charge(); |
| 3391 | * FIXME: It's better to move charges of this process from old | ||
| 3392 | * memcg to new memcg. But it's just on TODO-List now. | ||
| 3393 | */ | ||
| 3394 | } | 3481 | } |
| 3395 | 3482 | ||
| 3396 | struct cgroup_subsys mem_cgroup_subsys = { | 3483 | struct cgroup_subsys mem_cgroup_subsys = { |
| @@ -3400,6 +3487,8 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
| 3400 | .pre_destroy = mem_cgroup_pre_destroy, | 3487 | .pre_destroy = mem_cgroup_pre_destroy, |
| 3401 | .destroy = mem_cgroup_destroy, | 3488 | .destroy = mem_cgroup_destroy, |
| 3402 | .populate = mem_cgroup_populate, | 3489 | .populate = mem_cgroup_populate, |
| 3490 | .can_attach = mem_cgroup_can_attach, | ||
| 3491 | .cancel_attach = mem_cgroup_cancel_attach, | ||
| 3403 | .attach = mem_cgroup_move_task, | 3492 | .attach = mem_cgroup_move_task, |
| 3404 | .early_init = 0, | 3493 | .early_init = 0, |
| 3405 | .use_id = 1, | 3494 | .use_id = 1, |
