diff options
author | KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> | 2010-05-26 17:42:36 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-05-27 12:12:43 -0400 |
commit | 9490ff275606da012d5b373342a49610ad61cb81 (patch) | |
tree | 037993e807654da633776066129b001f8bae7d1d | |
parent | dc98df5a1b7be402a0e1c71f1b89ccf249ac15ee (diff) |
memcg: oom notifier
Considering containers or other resource management software in userland,
event notification of OOM in memcg should be implemented. Now that memcg has
a "threshold" notifier which uses eventfd, we can make use of it for OOM
notification.
This patch adds an OOM notification eventfd callback for memcg. The usage is
very similar to the threshold notifier, but the control file is
memory.oom_control and no argument other than the eventfd is required.
% cgroup_event_notifier /cgroup/A/memory.oom_control dummy
(About cgroup_event_notifier, see Documentation/cgroups/)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: David Rientjes <rientjes@google.com>
Cc: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | Documentation/cgroups/memory.txt | 20 | ||||
-rw-r--r-- | mm/memcontrol.c | 100 |
2 files changed, 111 insertions, 9 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 6cab1f29da4c..eac22d3b2f7b 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt | |||
@@ -184,6 +184,9 @@ limits on the root cgroup. | |||
184 | 184 | ||
185 | Note2: When panic_on_oom is set to "2", the whole system will panic. | 185 | Note2: When panic_on_oom is set to "2", the whole system will panic. |
186 | 186 | ||
187 | When an oom event notifier is registered, an event will be delivered. | ||
188 | (See oom_control section) | ||
189 | |||
187 | 2. Locking | 190 | 2. Locking |
188 | 191 | ||
189 | The memory controller uses the following hierarchy | 192 | The memory controller uses the following hierarchy |
@@ -488,7 +491,22 @@ threshold in any direction. | |||
488 | 491 | ||
489 | It's applicable for root and non-root cgroup. | 492 | It's applicable for root and non-root cgroup. |
490 | 493 | ||
491 | 10. TODO | 494 | 10. OOM Control |
495 | |||
496 | Memory controller implements oom notifier using cgroup notification | ||
497 | API (See cgroups.txt). It allows registering multiple oom notification | ||
498 | deliveries and getting a notification when oom happens. | ||
499 | |||
500 | To register a notifier, an application needs to: | ||
501 | - create an eventfd using eventfd(2) | ||
502 | - open memory.oom_control file | ||
503 | - write string like "<event_fd> <memory.oom_control>" to cgroup.event_control | ||
504 | |||
505 | Application will be notified through eventfd when oom happens. | ||
506 | OOM notification doesn't work for root cgroup. | ||
507 | |||
508 | |||
509 | 11. TODO | ||
492 | 510 | ||
493 | 1. Add support for accounting huge pages (as a separate controller) | 511 | 1. Add support for accounting huge pages (as a separate controller) |
494 | 2. Make per-cgroup scanner reclaim not-shared pages first | 512 | 2. Make per-cgroup scanner reclaim not-shared pages first |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 94ac208b1490..da2ed3913316 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -149,6 +149,7 @@ struct mem_cgroup_threshold { | |||
149 | u64 threshold; | 149 | u64 threshold; |
150 | }; | 150 | }; |
151 | 151 | ||
152 | /* For threshold */ | ||
152 | struct mem_cgroup_threshold_ary { | 153 | struct mem_cgroup_threshold_ary { |
153 | /* An array index points to threshold just below usage. */ | 154 | /* An array index points to threshold just below usage. */ |
154 | atomic_t current_threshold; | 155 | atomic_t current_threshold; |
@@ -157,8 +158,14 @@ struct mem_cgroup_threshold_ary { | |||
157 | /* Array of thresholds */ | 158 | /* Array of thresholds */ |
158 | struct mem_cgroup_threshold entries[0]; | 159 | struct mem_cgroup_threshold entries[0]; |
159 | }; | 160 | }; |
161 | /* for OOM */ | ||
162 | struct mem_cgroup_eventfd_list { | ||
163 | struct list_head list; | ||
164 | struct eventfd_ctx *eventfd; | ||
165 | }; | ||
160 | 166 | ||
161 | static void mem_cgroup_threshold(struct mem_cgroup *mem); | 167 | static void mem_cgroup_threshold(struct mem_cgroup *mem); |
168 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem); | ||
162 | 169 | ||
163 | /* | 170 | /* |
164 | * The memory controller data structure. The memory controller controls both | 171 | * The memory controller data structure. The memory controller controls both |
@@ -220,6 +227,9 @@ struct mem_cgroup { | |||
220 | /* thresholds for mem+swap usage. RCU-protected */ | 227 | /* thresholds for mem+swap usage. RCU-protected */ |
221 | struct mem_cgroup_threshold_ary *memsw_thresholds; | 228 | struct mem_cgroup_threshold_ary *memsw_thresholds; |
222 | 229 | ||
230 | /* For oom notifier event fd */ | ||
231 | struct list_head oom_notify; | ||
232 | |||
223 | /* | 233 | /* |
224 | * Should we move charges of a task when a task is moved into this | 234 | * Should we move charges of a task when a task is moved into this |
225 | * mem_cgroup ? And what type of charges should we move ? | 235 | * mem_cgroup ? And what type of charges should we move ? |
@@ -282,9 +292,12 @@ enum charge_type { | |||
282 | /* for encoding cft->private value on file */ | 292 | /* for encoding cft->private value on file */ |
283 | #define _MEM (0) | 293 | #define _MEM (0) |
284 | #define _MEMSWAP (1) | 294 | #define _MEMSWAP (1) |
295 | #define _OOM_TYPE (2) | ||
285 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) | 296 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) |
286 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) | 297 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) |
287 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | 298 | #define MEMFILE_ATTR(val) ((val) & 0xffff) |
299 | /* Used for OOM notifier */ | ||
300 | #define OOM_CONTROL (0) | ||
288 | 301 | ||
289 | /* | 302 | /* |
290 | * Reclaim flags for mem_cgroup_hierarchical_reclaim | 303 | * Reclaim flags for mem_cgroup_hierarchical_reclaim |
@@ -1353,6 +1366,8 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | |||
1353 | */ | 1366 | */ |
1354 | if (!locked) | 1367 | if (!locked) |
1355 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); | 1368 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); |
1369 | else | ||
1370 | mem_cgroup_oom_notify(mem); | ||
1356 | mutex_unlock(&memcg_oom_mutex); | 1371 | mutex_unlock(&memcg_oom_mutex); |
1357 | 1372 | ||
1358 | if (locked) | 1373 | if (locked) |
@@ -3398,8 +3413,22 @@ static int compare_thresholds(const void *a, const void *b) | |||
3398 | return _a->threshold - _b->threshold; | 3413 | return _a->threshold - _b->threshold; |
3399 | } | 3414 | } |
3400 | 3415 | ||
3401 | static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, | 3416 | static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) |
3402 | struct eventfd_ctx *eventfd, const char *args) | 3417 | { |
3418 | struct mem_cgroup_eventfd_list *ev; | ||
3419 | |||
3420 | list_for_each_entry(ev, &mem->oom_notify, list) | ||
3421 | eventfd_signal(ev->eventfd, 1); | ||
3422 | return 0; | ||
3423 | } | ||
3424 | |||
3425 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem) | ||
3426 | { | ||
3427 | mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); | ||
3428 | } | ||
3429 | |||
3430 | static int mem_cgroup_usage_register_event(struct cgroup *cgrp, | ||
3431 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | ||
3403 | { | 3432 | { |
3404 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 3433 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
3405 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | 3434 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; |
@@ -3483,8 +3512,8 @@ unlock: | |||
3483 | return ret; | 3512 | return ret; |
3484 | } | 3513 | } |
3485 | 3514 | ||
3486 | static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, | 3515 | static int mem_cgroup_usage_unregister_event(struct cgroup *cgrp, |
3487 | struct eventfd_ctx *eventfd) | 3516 | struct cftype *cft, struct eventfd_ctx *eventfd) |
3488 | { | 3517 | { |
3489 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 3518 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
3490 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | 3519 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; |
@@ -3568,13 +3597,61 @@ unlock: | |||
3568 | return ret; | 3597 | return ret; |
3569 | } | 3598 | } |
3570 | 3599 | ||
3600 | static int mem_cgroup_oom_register_event(struct cgroup *cgrp, | ||
3601 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | ||
3602 | { | ||
3603 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
3604 | struct mem_cgroup_eventfd_list *event; | ||
3605 | int type = MEMFILE_TYPE(cft->private); | ||
3606 | |||
3607 | BUG_ON(type != _OOM_TYPE); | ||
3608 | event = kmalloc(sizeof(*event), GFP_KERNEL); | ||
3609 | if (!event) | ||
3610 | return -ENOMEM; | ||
3611 | |||
3612 | mutex_lock(&memcg_oom_mutex); | ||
3613 | |||
3614 | event->eventfd = eventfd; | ||
3615 | list_add(&event->list, &memcg->oom_notify); | ||
3616 | |||
3617 | /* already in OOM ? */ | ||
3618 | if (atomic_read(&memcg->oom_lock)) | ||
3619 | eventfd_signal(eventfd, 1); | ||
3620 | mutex_unlock(&memcg_oom_mutex); | ||
3621 | |||
3622 | return 0; | ||
3623 | } | ||
3624 | |||
3625 | static int mem_cgroup_oom_unregister_event(struct cgroup *cgrp, | ||
3626 | struct cftype *cft, struct eventfd_ctx *eventfd) | ||
3627 | { | ||
3628 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
3629 | struct mem_cgroup_eventfd_list *ev, *tmp; | ||
3630 | int type = MEMFILE_TYPE(cft->private); | ||
3631 | |||
3632 | BUG_ON(type != _OOM_TYPE); | ||
3633 | |||
3634 | mutex_lock(&memcg_oom_mutex); | ||
3635 | |||
3636 | list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { | ||
3637 | if (ev->eventfd == eventfd) { | ||
3638 | list_del(&ev->list); | ||
3639 | kfree(ev); | ||
3640 | } | ||
3641 | } | ||
3642 | |||
3643 | mutex_unlock(&memcg_oom_mutex); | ||
3644 | |||
3645 | return 0; | ||
3646 | } | ||
3647 | |||
3571 | static struct cftype mem_cgroup_files[] = { | 3648 | static struct cftype mem_cgroup_files[] = { |
3572 | { | 3649 | { |
3573 | .name = "usage_in_bytes", | 3650 | .name = "usage_in_bytes", |
3574 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 3651 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
3575 | .read_u64 = mem_cgroup_read, | 3652 | .read_u64 = mem_cgroup_read, |
3576 | .register_event = mem_cgroup_register_event, | 3653 | .register_event = mem_cgroup_usage_register_event, |
3577 | .unregister_event = mem_cgroup_unregister_event, | 3654 | .unregister_event = mem_cgroup_usage_unregister_event, |
3578 | }, | 3655 | }, |
3579 | { | 3656 | { |
3580 | .name = "max_usage_in_bytes", | 3657 | .name = "max_usage_in_bytes", |
@@ -3623,6 +3700,12 @@ static struct cftype mem_cgroup_files[] = { | |||
3623 | .read_u64 = mem_cgroup_move_charge_read, | 3700 | .read_u64 = mem_cgroup_move_charge_read, |
3624 | .write_u64 = mem_cgroup_move_charge_write, | 3701 | .write_u64 = mem_cgroup_move_charge_write, |
3625 | }, | 3702 | }, |
3703 | { | ||
3704 | .name = "oom_control", | ||
3705 | .register_event = mem_cgroup_oom_register_event, | ||
3706 | .unregister_event = mem_cgroup_oom_unregister_event, | ||
3707 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), | ||
3708 | }, | ||
3626 | }; | 3709 | }; |
3627 | 3710 | ||
3628 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 3711 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
@@ -3631,8 +3714,8 @@ static struct cftype memsw_cgroup_files[] = { | |||
3631 | .name = "memsw.usage_in_bytes", | 3714 | .name = "memsw.usage_in_bytes", |
3632 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 3715 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
3633 | .read_u64 = mem_cgroup_read, | 3716 | .read_u64 = mem_cgroup_read, |
3634 | .register_event = mem_cgroup_register_event, | 3717 | .register_event = mem_cgroup_usage_register_event, |
3635 | .unregister_event = mem_cgroup_unregister_event, | 3718 | .unregister_event = mem_cgroup_usage_unregister_event, |
3636 | }, | 3719 | }, |
3637 | { | 3720 | { |
3638 | .name = "memsw.max_usage_in_bytes", | 3721 | .name = "memsw.max_usage_in_bytes", |
@@ -3878,6 +3961,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3878 | } | 3961 | } |
3879 | mem->last_scanned_child = 0; | 3962 | mem->last_scanned_child = 0; |
3880 | spin_lock_init(&mem->reclaim_param_lock); | 3963 | spin_lock_init(&mem->reclaim_param_lock); |
3964 | INIT_LIST_HEAD(&mem->oom_notify); | ||
3881 | 3965 | ||
3882 | if (parent) | 3966 | if (parent) |
3883 | mem->swappiness = get_swappiness(parent); | 3967 | mem->swappiness = get_swappiness(parent); |