aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>2010-05-26 17:42:36 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2010-05-27 12:12:43 -0400
commit9490ff275606da012d5b373342a49610ad61cb81 (patch)
tree037993e807654da633776066129b001f8bae7d1d
parentdc98df5a1b7be402a0e1c71f1b89ccf249ac15ee (diff)
memcg: oom notifier
Considering containers or other resource management software in userland, event notification of OOM in memcg should be implemented. Now that memcg has a "threshold" notifier which uses eventfd, we can make use of it for oom notification. This patch adds an oom notification eventfd callback for memcg. The usage is very similar to the threshold notifier, but the control file is memory.oom_control and no argument other than the eventfd is required. % cgroup_event_notifier /cgroup/A/memory.oom_control dummy (About cgroup_event_notifier, see Documentation/cgroup/) Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Cc: David Rientjes <rientjes@google.com> Cc: Davide Libenzi <davidel@xmailserver.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--Documentation/cgroups/memory.txt20
-rw-r--r--mm/memcontrol.c100
2 files changed, 111 insertions, 9 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 6cab1f29da4c..eac22d3b2f7b 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -184,6 +184,9 @@ limits on the root cgroup.
184 184
185Note2: When panic_on_oom is set to "2", the whole system will panic. 185Note2: When panic_on_oom is set to "2", the whole system will panic.
186 186
187When an oom event notifier is registered, the event will be delivered.
188(See oom_control section)
189
1872. Locking 1902. Locking
188 191
189The memory controller uses the following hierarchy 192The memory controller uses the following hierarchy
@@ -488,7 +491,22 @@ threshold in any direction.
488 491
489It's applicable for root and non-root cgroup. 492It's applicable for root and non-root cgroup.
490 493
49110. TODO 49410. OOM Control
495
496Memory controller implements oom notifier using cgroup notification
497API (See cgroups.txt). It allows registering multiple oom notification
498deliveries and getting notified when oom happens.
499
500To register a notifier, an application needs to:
501 - create an eventfd using eventfd(2)
502 - open memory.oom_control file
503 - write string like "<event_fd> <fd of memory.oom_control>" to cgroup.event_control
504
505Application will be notified through eventfd when oom happens.
506OOM notification doesn't work for root cgroup.
507
508
50911. TODO
492 510
4931. Add support for accounting huge pages (as a separate controller) 5111. Add support for accounting huge pages (as a separate controller)
4942. Make per-cgroup scanner reclaim not-shared pages first 5122. Make per-cgroup scanner reclaim not-shared pages first
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 94ac208b1490..da2ed3913316 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -149,6 +149,7 @@ struct mem_cgroup_threshold {
149 u64 threshold; 149 u64 threshold;
150}; 150};
151 151
152/* For threshold */
152struct mem_cgroup_threshold_ary { 153struct mem_cgroup_threshold_ary {
153 /* An array index points to threshold just below usage. */ 154 /* An array index points to threshold just below usage. */
154 atomic_t current_threshold; 155 atomic_t current_threshold;
@@ -157,8 +158,14 @@ struct mem_cgroup_threshold_ary {
157 /* Array of thresholds */ 158 /* Array of thresholds */
158 struct mem_cgroup_threshold entries[0]; 159 struct mem_cgroup_threshold entries[0];
159}; 160};
161/* for OOM */
162struct mem_cgroup_eventfd_list {
163 struct list_head list;
164 struct eventfd_ctx *eventfd;
165};
160 166
161static void mem_cgroup_threshold(struct mem_cgroup *mem); 167static void mem_cgroup_threshold(struct mem_cgroup *mem);
168static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
162 169
163/* 170/*
164 * The memory controller data structure. The memory controller controls both 171 * The memory controller data structure. The memory controller controls both
@@ -220,6 +227,9 @@ struct mem_cgroup {
220 /* thresholds for mem+swap usage. RCU-protected */ 227 /* thresholds for mem+swap usage. RCU-protected */
221 struct mem_cgroup_threshold_ary *memsw_thresholds; 228 struct mem_cgroup_threshold_ary *memsw_thresholds;
222 229
230 /* For oom notifier event fd */
231 struct list_head oom_notify;
232
223 /* 233 /*
224 * Should we move charges of a task when a task is moved into this 234 * Should we move charges of a task when a task is moved into this
225 * mem_cgroup ? And what type of charges should we move ? 235 * mem_cgroup ? And what type of charges should we move ?
@@ -282,9 +292,12 @@ enum charge_type {
282/* for encoding cft->private value on file */ 292/* for encoding cft->private value on file */
283#define _MEM (0) 293#define _MEM (0)
284#define _MEMSWAP (1) 294#define _MEMSWAP (1)
295#define _OOM_TYPE (2)
285#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 296#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
286#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 297#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
287#define MEMFILE_ATTR(val) ((val) & 0xffff) 298#define MEMFILE_ATTR(val) ((val) & 0xffff)
299/* Used for OOM notifier */
300#define OOM_CONTROL (0)
288 301
289/* 302/*
290 * Reclaim flags for mem_cgroup_hierarchical_reclaim 303 * Reclaim flags for mem_cgroup_hierarchical_reclaim
@@ -1353,6 +1366,8 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1353 */ 1366 */
1354 if (!locked) 1367 if (!locked)
1355 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1368 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1369 else
1370 mem_cgroup_oom_notify(mem);
1356 mutex_unlock(&memcg_oom_mutex); 1371 mutex_unlock(&memcg_oom_mutex);
1357 1372
1358 if (locked) 1373 if (locked)
@@ -3398,8 +3413,22 @@ static int compare_thresholds(const void *a, const void *b)
3398 return _a->threshold - _b->threshold; 3413 return _a->threshold - _b->threshold;
3399} 3414}
3400 3415
3401static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, 3416static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
3402 struct eventfd_ctx *eventfd, const char *args) 3417{
3418 struct mem_cgroup_eventfd_list *ev;
3419
3420 list_for_each_entry(ev, &mem->oom_notify, list)
3421 eventfd_signal(ev->eventfd, 1);
3422 return 0;
3423}
3424
3425static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
3426{
3427 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb);
3428}
3429
3430static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
3431 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
3403{ 3432{
3404 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3433 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3405 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; 3434 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
@@ -3483,8 +3512,8 @@ unlock:
3483 return ret; 3512 return ret;
3484} 3513}
3485 3514
3486static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, 3515static int mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
3487 struct eventfd_ctx *eventfd) 3516 struct cftype *cft, struct eventfd_ctx *eventfd)
3488{ 3517{
3489 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3518 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3490 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; 3519 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
@@ -3568,13 +3597,61 @@ unlock:
3568 return ret; 3597 return ret;
3569} 3598}
3570 3599
3600static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
3601 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
3602{
3603 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3604 struct mem_cgroup_eventfd_list *event;
3605 int type = MEMFILE_TYPE(cft->private);
3606
3607 BUG_ON(type != _OOM_TYPE);
3608 event = kmalloc(sizeof(*event), GFP_KERNEL);
3609 if (!event)
3610 return -ENOMEM;
3611
3612 mutex_lock(&memcg_oom_mutex);
3613
3614 event->eventfd = eventfd;
3615 list_add(&event->list, &memcg->oom_notify);
3616
3617 /* already in OOM ? */
3618 if (atomic_read(&memcg->oom_lock))
3619 eventfd_signal(eventfd, 1);
3620 mutex_unlock(&memcg_oom_mutex);
3621
3622 return 0;
3623}
3624
3625static int mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
3626 struct cftype *cft, struct eventfd_ctx *eventfd)
3627{
3628 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3629 struct mem_cgroup_eventfd_list *ev, *tmp;
3630 int type = MEMFILE_TYPE(cft->private);
3631
3632 BUG_ON(type != _OOM_TYPE);
3633
3634 mutex_lock(&memcg_oom_mutex);
3635
3636 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
3637 if (ev->eventfd == eventfd) {
3638 list_del(&ev->list);
3639 kfree(ev);
3640 }
3641 }
3642
3643 mutex_unlock(&memcg_oom_mutex);
3644
3645 return 0;
3646}
3647
3571static struct cftype mem_cgroup_files[] = { 3648static struct cftype mem_cgroup_files[] = {
3572 { 3649 {
3573 .name = "usage_in_bytes", 3650 .name = "usage_in_bytes",
3574 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 3651 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
3575 .read_u64 = mem_cgroup_read, 3652 .read_u64 = mem_cgroup_read,
3576 .register_event = mem_cgroup_register_event, 3653 .register_event = mem_cgroup_usage_register_event,
3577 .unregister_event = mem_cgroup_unregister_event, 3654 .unregister_event = mem_cgroup_usage_unregister_event,
3578 }, 3655 },
3579 { 3656 {
3580 .name = "max_usage_in_bytes", 3657 .name = "max_usage_in_bytes",
@@ -3623,6 +3700,12 @@ static struct cftype mem_cgroup_files[] = {
3623 .read_u64 = mem_cgroup_move_charge_read, 3700 .read_u64 = mem_cgroup_move_charge_read,
3624 .write_u64 = mem_cgroup_move_charge_write, 3701 .write_u64 = mem_cgroup_move_charge_write,
3625 }, 3702 },
3703 {
3704 .name = "oom_control",
3705 .register_event = mem_cgroup_oom_register_event,
3706 .unregister_event = mem_cgroup_oom_unregister_event,
3707 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
3708 },
3626}; 3709};
3627 3710
3628#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3711#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -3631,8 +3714,8 @@ static struct cftype memsw_cgroup_files[] = {
3631 .name = "memsw.usage_in_bytes", 3714 .name = "memsw.usage_in_bytes",
3632 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 3715 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
3633 .read_u64 = mem_cgroup_read, 3716 .read_u64 = mem_cgroup_read,
3634 .register_event = mem_cgroup_register_event, 3717 .register_event = mem_cgroup_usage_register_event,
3635 .unregister_event = mem_cgroup_unregister_event, 3718 .unregister_event = mem_cgroup_usage_unregister_event,
3636 }, 3719 },
3637 { 3720 {
3638 .name = "memsw.max_usage_in_bytes", 3721 .name = "memsw.max_usage_in_bytes",
@@ -3878,6 +3961,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
3878 } 3961 }
3879 mem->last_scanned_child = 0; 3962 mem->last_scanned_child = 0;
3880 spin_lock_init(&mem->reclaim_param_lock); 3963 spin_lock_init(&mem->reclaim_param_lock);
3964 INIT_LIST_HEAD(&mem->oom_notify);
3881 3965
3882 if (parent) 3966 if (parent)
3883 mem->swappiness = get_swappiness(parent); 3967 mem->swappiness = get_swappiness(parent);