diff options
-rw-r--r-- | Documentation/cgroups/memory.txt | 20 | ||||
-rw-r--r-- | mm/memcontrol.c | 100 |
2 files changed, 111 insertions, 9 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 6cab1f29da4c..eac22d3b2f7b 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt | |||
@@ -184,6 +184,9 @@ limits on the root cgroup. | |||
184 | 184 | ||
185 | Note2: When panic_on_oom is set to "2", the whole system will panic. | 185 | Note2: When panic_on_oom is set to "2", the whole system will panic. |
186 | 186 | ||
187 | When an oom event notifier is registered, an event will be delivered. | ||
188 | (See oom_control section) | ||
189 | |||
187 | 2. Locking | 190 | 2. Locking |
188 | 191 | ||
189 | The memory controller uses the following hierarchy | 192 | The memory controller uses the following hierarchy |
@@ -488,7 +491,22 @@ threshold in any direction. | |||
488 | 491 | ||
489 | It's applicable for root and non-root cgroup. | 492 | It's applicable for root and non-root cgroup. |
490 | 493 | ||
491 | 10. TODO | 494 | 10. OOM Control |
495 | |||
496 | The memory controller implements an oom notifier using the cgroup notification | ||
497 | API (see cgroups.txt). It allows registering multiple oom notification | ||
498 | deliveries, each of which gets a notification when oom happens. | ||
499 | |||
500 | To register a notifier, an application needs to: | ||
501 | - create an eventfd using eventfd(2) | ||
502 | - open the memory.oom_control file | ||
503 | - write a string like "<event_fd> <fd of memory.oom_control>" to cgroup.event_control | ||
504 | |||
505 | The application will be notified through the eventfd when oom happens. | ||
506 | OOM notification doesn't work for the root cgroup. | ||
507 | |||
508 | |||
509 | 11. TODO | ||
492 | 510 | ||
493 | 1. Add support for accounting huge pages (as a separate controller) | 511 | 1. Add support for accounting huge pages (as a separate controller) |
494 | 2. Make per-cgroup scanner reclaim not-shared pages first | 512 | 2. Make per-cgroup scanner reclaim not-shared pages first |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 94ac208b1490..da2ed3913316 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -149,6 +149,7 @@ struct mem_cgroup_threshold { | |||
149 | u64 threshold; | 149 | u64 threshold; |
150 | }; | 150 | }; |
151 | 151 | ||
152 | /* For threshold */ | ||
152 | struct mem_cgroup_threshold_ary { | 153 | struct mem_cgroup_threshold_ary { |
153 | /* An array index points to threshold just below usage. */ | 154 | /* An array index points to threshold just below usage. */ |
154 | atomic_t current_threshold; | 155 | atomic_t current_threshold; |
@@ -157,8 +158,14 @@ struct mem_cgroup_threshold_ary { | |||
157 | /* Array of thresholds */ | 158 | /* Array of thresholds */ |
158 | struct mem_cgroup_threshold entries[0]; | 159 | struct mem_cgroup_threshold entries[0]; |
159 | }; | 160 | }; |
161 | /* for OOM */ | ||
162 | struct mem_cgroup_eventfd_list { | ||
163 | struct list_head list; | ||
164 | struct eventfd_ctx *eventfd; | ||
165 | }; | ||
160 | 166 | ||
161 | static void mem_cgroup_threshold(struct mem_cgroup *mem); | 167 | static void mem_cgroup_threshold(struct mem_cgroup *mem); |
168 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem); | ||
162 | 169 | ||
163 | /* | 170 | /* |
164 | * The memory controller data structure. The memory controller controls both | 171 | * The memory controller data structure. The memory controller controls both |
@@ -220,6 +227,9 @@ struct mem_cgroup { | |||
220 | /* thresholds for mem+swap usage. RCU-protected */ | 227 | /* thresholds for mem+swap usage. RCU-protected */ |
221 | struct mem_cgroup_threshold_ary *memsw_thresholds; | 228 | struct mem_cgroup_threshold_ary *memsw_thresholds; |
222 | 229 | ||
230 | /* For oom notifier event fd */ | ||
231 | struct list_head oom_notify; | ||
232 | |||
223 | /* | 233 | /* |
224 | * Should we move charges of a task when a task is moved into this | 234 | * Should we move charges of a task when a task is moved into this |
225 | * mem_cgroup ? And what type of charges should we move ? | 235 | * mem_cgroup ? And what type of charges should we move ? |
@@ -282,9 +292,12 @@ enum charge_type { | |||
282 | /* for encoding cft->private value on file */ | 292 | /* for encoding cft->private value on file */ |
283 | #define _MEM (0) | 293 | #define _MEM (0) |
284 | #define _MEMSWAP (1) | 294 | #define _MEMSWAP (1) |
295 | #define _OOM_TYPE (2) | ||
285 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) | 296 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) |
286 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) | 297 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) |
287 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | 298 | #define MEMFILE_ATTR(val) ((val) & 0xffff) |
299 | /* Used for OOM notifier */ | ||
300 | #define OOM_CONTROL (0) | ||
288 | 301 | ||
289 | /* | 302 | /* |
290 | * Reclaim flags for mem_cgroup_hierarchical_reclaim | 303 | * Reclaim flags for mem_cgroup_hierarchical_reclaim |
@@ -1353,6 +1366,8 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | |||
1353 | */ | 1366 | */ |
1354 | if (!locked) | 1367 | if (!locked) |
1355 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); | 1368 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); |
1369 | else | ||
1370 | mem_cgroup_oom_notify(mem); | ||
1356 | mutex_unlock(&memcg_oom_mutex); | 1371 | mutex_unlock(&memcg_oom_mutex); |
1357 | 1372 | ||
1358 | if (locked) | 1373 | if (locked) |
@@ -3398,8 +3413,22 @@ static int compare_thresholds(const void *a, const void *b) | |||
3398 | return _a->threshold - _b->threshold; | 3413 | return _a->threshold - _b->threshold; |
3399 | } | 3414 | } |
3400 | 3415 | ||
3401 | static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, | 3416 | static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) |
3402 | struct eventfd_ctx *eventfd, const char *args) | 3417 | { |
3418 | struct mem_cgroup_eventfd_list *ev; | ||
3419 | |||
3420 | list_for_each_entry(ev, &mem->oom_notify, list) | ||
3421 | eventfd_signal(ev->eventfd, 1); | ||
3422 | return 0; | ||
3423 | } | ||
3424 | |||
3425 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem) | ||
3426 | { | ||
3427 | mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); | ||
3428 | } | ||
3429 | |||
3430 | static int mem_cgroup_usage_register_event(struct cgroup *cgrp, | ||
3431 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | ||
3403 | { | 3432 | { |
3404 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 3433 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
3405 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | 3434 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; |
@@ -3483,8 +3512,8 @@ unlock: | |||
3483 | return ret; | 3512 | return ret; |
3484 | } | 3513 | } |
3485 | 3514 | ||
3486 | static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, | 3515 | static int mem_cgroup_usage_unregister_event(struct cgroup *cgrp, |
3487 | struct eventfd_ctx *eventfd) | 3516 | struct cftype *cft, struct eventfd_ctx *eventfd) |
3488 | { | 3517 | { |
3489 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 3518 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
3490 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | 3519 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; |
@@ -3568,13 +3597,61 @@ unlock: | |||
3568 | return ret; | 3597 | return ret; |
3569 | } | 3598 | } |
3570 | 3599 | ||
3600 | static int mem_cgroup_oom_register_event(struct cgroup *cgrp, | ||
3601 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | ||
3602 | { | ||
3603 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
3604 | struct mem_cgroup_eventfd_list *event; | ||
3605 | int type = MEMFILE_TYPE(cft->private); | ||
3606 | |||
3607 | BUG_ON(type != _OOM_TYPE); | ||
3608 | event = kmalloc(sizeof(*event), GFP_KERNEL); | ||
3609 | if (!event) | ||
3610 | return -ENOMEM; | ||
3611 | |||
3612 | mutex_lock(&memcg_oom_mutex); | ||
3613 | |||
3614 | event->eventfd = eventfd; | ||
3615 | list_add(&event->list, &memcg->oom_notify); | ||
3616 | |||
3617 | /* already in OOM ? */ | ||
3618 | if (atomic_read(&memcg->oom_lock)) | ||
3619 | eventfd_signal(eventfd, 1); | ||
3620 | mutex_unlock(&memcg_oom_mutex); | ||
3621 | |||
3622 | return 0; | ||
3623 | } | ||
3624 | |||
3625 | static int mem_cgroup_oom_unregister_event(struct cgroup *cgrp, | ||
3626 | struct cftype *cft, struct eventfd_ctx *eventfd) | ||
3627 | { | ||
3628 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
3629 | struct mem_cgroup_eventfd_list *ev, *tmp; | ||
3630 | int type = MEMFILE_TYPE(cft->private); | ||
3631 | |||
3632 | BUG_ON(type != _OOM_TYPE); | ||
3633 | |||
3634 | mutex_lock(&memcg_oom_mutex); | ||
3635 | |||
3636 | list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { | ||
3637 | if (ev->eventfd == eventfd) { | ||
3638 | list_del(&ev->list); | ||
3639 | kfree(ev); | ||
3640 | } | ||
3641 | } | ||
3642 | |||
3643 | mutex_unlock(&memcg_oom_mutex); | ||
3644 | |||
3645 | return 0; | ||
3646 | } | ||
3647 | |||
3571 | static struct cftype mem_cgroup_files[] = { | 3648 | static struct cftype mem_cgroup_files[] = { |
3572 | { | 3649 | { |
3573 | .name = "usage_in_bytes", | 3650 | .name = "usage_in_bytes", |
3574 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 3651 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
3575 | .read_u64 = mem_cgroup_read, | 3652 | .read_u64 = mem_cgroup_read, |
3576 | .register_event = mem_cgroup_register_event, | 3653 | .register_event = mem_cgroup_usage_register_event, |
3577 | .unregister_event = mem_cgroup_unregister_event, | 3654 | .unregister_event = mem_cgroup_usage_unregister_event, |
3578 | }, | 3655 | }, |
3579 | { | 3656 | { |
3580 | .name = "max_usage_in_bytes", | 3657 | .name = "max_usage_in_bytes", |
@@ -3623,6 +3700,12 @@ static struct cftype mem_cgroup_files[] = { | |||
3623 | .read_u64 = mem_cgroup_move_charge_read, | 3700 | .read_u64 = mem_cgroup_move_charge_read, |
3624 | .write_u64 = mem_cgroup_move_charge_write, | 3701 | .write_u64 = mem_cgroup_move_charge_write, |
3625 | }, | 3702 | }, |
3703 | { | ||
3704 | .name = "oom_control", | ||
3705 | .register_event = mem_cgroup_oom_register_event, | ||
3706 | .unregister_event = mem_cgroup_oom_unregister_event, | ||
3707 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), | ||
3708 | }, | ||
3626 | }; | 3709 | }; |
3627 | 3710 | ||
3628 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 3711 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
@@ -3631,8 +3714,8 @@ static struct cftype memsw_cgroup_files[] = { | |||
3631 | .name = "memsw.usage_in_bytes", | 3714 | .name = "memsw.usage_in_bytes", |
3632 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 3715 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
3633 | .read_u64 = mem_cgroup_read, | 3716 | .read_u64 = mem_cgroup_read, |
3634 | .register_event = mem_cgroup_register_event, | 3717 | .register_event = mem_cgroup_usage_register_event, |
3635 | .unregister_event = mem_cgroup_unregister_event, | 3718 | .unregister_event = mem_cgroup_usage_unregister_event, |
3636 | }, | 3719 | }, |
3637 | { | 3720 | { |
3638 | .name = "memsw.max_usage_in_bytes", | 3721 | .name = "memsw.max_usage_in_bytes", |
@@ -3878,6 +3961,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3878 | } | 3961 | } |
3879 | mem->last_scanned_child = 0; | 3962 | mem->last_scanned_child = 0; |
3880 | spin_lock_init(&mem->reclaim_param_lock); | 3963 | spin_lock_init(&mem->reclaim_param_lock); |
3964 | INIT_LIST_HEAD(&mem->oom_notify); | ||
3881 | 3965 | ||
3882 | if (parent) | 3966 | if (parent) |
3883 | mem->swappiness = get_swappiness(parent); | 3967 | mem->swappiness = get_swappiness(parent); |