aboutsummaryrefslogtreecommitdiffstats
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
authorGlauber Costa <glommer@parallels.com>2012-12-18 17:21:47 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2012-12-18 18:02:12 -0500
commit510fc4e11b772fd60f2c545c64d4c55abd07ce36 (patch)
treed4e5b773b87e35d91b6a569bf43dc3424ac1968d /mm/memcontrol.c
parent86ae53e1a138a3295c04ae69bf404be00244a381 (diff)
memcg: kmem accounting basic infrastructure
Add the basic infrastructure for the accounting of kernel memory. To control that, the following files are created: * memory.kmem.usage_in_bytes * memory.kmem.limit_in_bytes * memory.kmem.failcnt * memory.kmem.max_usage_in_bytes They have the same meaning as their user memory counterparts. They reflect the state of the "kmem" res_counter. Per cgroup kmem memory accounting is not enabled until a limit is set for the group. Once the limit is set, the accounting cannot be disabled for that group. This means that after the patch is applied, no behavioral changes exist for whoever is still using memcg to control their memory usage, until memory.kmem.limit_in_bytes is set for the first time. We always account to both user and kernel resource_counters. This effectively means that an independent kernel limit is in place when the limit is set to a lower value than the user memory. An equal or higher value means that the user limit will always hit first, meaning that kmem is effectively unlimited. People who want to track kernel memory but not limit it can set this limit to a very high number (like RESOURCE_MAX - 1 page - that no one will ever hit, or equal to the user memory) [akpm@linux-foundation.org: MEMCG_MMEM only works with slab and slub] Signed-off-by: Glauber Costa <glommer@parallels.com> Acked-by: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Acked-by: Michal Hocko <mhocko@suse.cz> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Tejun Heo <tj@kernel.org> Cc: Christoph Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Frederic Weisbecker <fweisbec@redhat.com> Cc: Greg Thelen <gthelen@google.com> Cc: JoonSoo Kim <js1304@gmail.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: Rik van Riel <riel@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c126
1 files changed, 123 insertions, 3 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c7b0b1b803a5..bba1cb4bbb82 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -268,6 +268,10 @@ struct mem_cgroup {
268 }; 268 };
269 269
270 /* 270 /*
271 * the counter to account for kernel memory usage.
272 */
273 struct res_counter kmem;
274 /*
271 * Per cgroup active and inactive list, similar to the 275 * Per cgroup active and inactive list, similar to the
272 * per zone LRU lists. 276 * per zone LRU lists.
273 */ 277 */
@@ -282,6 +286,7 @@ struct mem_cgroup {
282 * Should the accounting and control be hierarchical, per subtree? 286 * Should the accounting and control be hierarchical, per subtree?
283 */ 287 */
284 bool use_hierarchy; 288 bool use_hierarchy;
289 unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
285 290
286 bool oom_lock; 291 bool oom_lock;
287 atomic_t under_oom; 292 atomic_t under_oom;
@@ -334,6 +339,20 @@ struct mem_cgroup {
334#endif 339#endif
335}; 340};
336 341
342/* internal only representation about the status of kmem accounting. */
343enum {
344 KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
345};
346
347#define KMEM_ACCOUNTED_MASK (1 << KMEM_ACCOUNTED_ACTIVE)
348
349#ifdef CONFIG_MEMCG_KMEM
350static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
351{
352 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
353}
354#endif
355
337/* Stuffs for move charges at task migration. */ 356/* Stuffs for move charges at task migration. */
338/* 357/*
339 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a 358 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
@@ -392,6 +411,7 @@ enum res_type {
392 _MEM, 411 _MEM,
393 _MEMSWAP, 412 _MEMSWAP,
394 _OOM_TYPE, 413 _OOM_TYPE,
414 _KMEM,
395}; 415};
396 416
397#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 417#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
@@ -1456,6 +1476,10 @@ done:
1456 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1476 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1457 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1477 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1458 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1478 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1479 printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n",
1480 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1481 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1482 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1459} 1483}
1460 1484
1461/* 1485/*
@@ -3977,6 +4001,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3977 else 4001 else
3978 val = res_counter_read_u64(&memcg->memsw, name); 4002 val = res_counter_read_u64(&memcg->memsw, name);
3979 break; 4003 break;
4004 case _KMEM:
4005 val = res_counter_read_u64(&memcg->kmem, name);
4006 break;
3980 default: 4007 default:
3981 BUG(); 4008 BUG();
3982 } 4009 }
@@ -3984,6 +4011,59 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3984 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); 4011 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
3985 return simple_read_from_buffer(buf, nbytes, ppos, str, len); 4012 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
3986} 4013}
4014
4015static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4016{
4017 int ret = -EINVAL;
4018#ifdef CONFIG_MEMCG_KMEM
4019 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4020 /*
4021 * For simplicity, we won't allow this to be disabled. It also can't
4022 * be changed if the cgroup has children already, or if tasks had
4023 * already joined.
4024 *
4025 * If tasks join before we set the limit, a person looking at
4026 * kmem.usage_in_bytes will have no way to determine when it took
4027 * place, which makes the value quite meaningless.
4028 *
4029 * After it first became limited, changes in the value of the limit are
4030 * of course permitted.
4031 *
4032 * Taking the cgroup_lock is really offensive, but it is so far the only
4033 * way to guarantee that no children will appear. There are plenty of
4034 * other offenders, and they should all go away. Fine grained locking
4035 * is probably the way to go here. When we are fully hierarchical, we
4036 * can also get rid of the use_hierarchy check.
4037 */
4038 cgroup_lock();
4039 mutex_lock(&set_limit_mutex);
4040 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
4041 if (cgroup_task_count(cont) || (memcg->use_hierarchy &&
4042 !list_empty(&cont->children))) {
4043 ret = -EBUSY;
4044 goto out;
4045 }
4046 ret = res_counter_set_limit(&memcg->kmem, val);
4047 VM_BUG_ON(ret);
4048
4049 memcg_kmem_set_active(memcg);
4050 } else
4051 ret = res_counter_set_limit(&memcg->kmem, val);
4052out:
4053 mutex_unlock(&set_limit_mutex);
4054 cgroup_unlock();
4055#endif
4056 return ret;
4057}
4058
4059static void memcg_propagate_kmem(struct mem_cgroup *memcg)
4060{
4061 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4062 if (!parent)
4063 return;
4064 memcg->kmem_account_flags = parent->kmem_account_flags;
4065}
4066
3987/* 4067/*
3988 * The user of this function is... 4068 * The user of this function is...
3989 * RES_LIMIT. 4069 * RES_LIMIT.
@@ -4015,8 +4095,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
4015 break; 4095 break;
4016 if (type == _MEM) 4096 if (type == _MEM)
4017 ret = mem_cgroup_resize_limit(memcg, val); 4097 ret = mem_cgroup_resize_limit(memcg, val);
4018 else 4098 else if (type == _MEMSWAP)
4019 ret = mem_cgroup_resize_memsw_limit(memcg, val); 4099 ret = mem_cgroup_resize_memsw_limit(memcg, val);
4100 else if (type == _KMEM)
4101 ret = memcg_update_kmem_limit(cont, val);
4102 else
4103 return -EINVAL;
4020 break; 4104 break;
4021 case RES_SOFT_LIMIT: 4105 case RES_SOFT_LIMIT:
4022 ret = res_counter_memparse_write_strategy(buffer, &val); 4106 ret = res_counter_memparse_write_strategy(buffer, &val);
@@ -4082,14 +4166,22 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4082 case RES_MAX_USAGE: 4166 case RES_MAX_USAGE:
4083 if (type == _MEM) 4167 if (type == _MEM)
4084 res_counter_reset_max(&memcg->res); 4168 res_counter_reset_max(&memcg->res);
4085 else 4169 else if (type == _MEMSWAP)
4086 res_counter_reset_max(&memcg->memsw); 4170 res_counter_reset_max(&memcg->memsw);
4171 else if (type == _KMEM)
4172 res_counter_reset_max(&memcg->kmem);
4173 else
4174 return -EINVAL;
4087 break; 4175 break;
4088 case RES_FAILCNT: 4176 case RES_FAILCNT:
4089 if (type == _MEM) 4177 if (type == _MEM)
4090 res_counter_reset_failcnt(&memcg->res); 4178 res_counter_reset_failcnt(&memcg->res);
4091 else 4179 else if (type == _MEMSWAP)
4092 res_counter_reset_failcnt(&memcg->memsw); 4180 res_counter_reset_failcnt(&memcg->memsw);
4181 else if (type == _KMEM)
4182 res_counter_reset_failcnt(&memcg->kmem);
4183 else
4184 return -EINVAL;
4093 break; 4185 break;
4094 } 4186 }
4095 4187
@@ -4651,6 +4743,7 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4651#ifdef CONFIG_MEMCG_KMEM 4743#ifdef CONFIG_MEMCG_KMEM
4652static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4744static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4653{ 4745{
4746 memcg_propagate_kmem(memcg);
4654 return mem_cgroup_sockets_init(memcg, ss); 4747 return mem_cgroup_sockets_init(memcg, ss);
4655}; 4748};
4656 4749
@@ -4765,6 +4858,31 @@ static struct cftype mem_cgroup_files[] = {
4765 .read = mem_cgroup_read, 4858 .read = mem_cgroup_read,
4766 }, 4859 },
4767#endif 4860#endif
4861#ifdef CONFIG_MEMCG_KMEM
4862 {
4863 .name = "kmem.limit_in_bytes",
4864 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
4865 .write_string = mem_cgroup_write,
4866 .read = mem_cgroup_read,
4867 },
4868 {
4869 .name = "kmem.usage_in_bytes",
4870 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
4871 .read = mem_cgroup_read,
4872 },
4873 {
4874 .name = "kmem.failcnt",
4875 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
4876 .trigger = mem_cgroup_reset,
4877 .read = mem_cgroup_read,
4878 },
4879 {
4880 .name = "kmem.max_usage_in_bytes",
4881 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
4882 .trigger = mem_cgroup_reset,
4883 .read = mem_cgroup_read,
4884 },
4885#endif
4768 { }, /* terminate */ 4886 { }, /* terminate */
4769}; 4887};
4770 4888
@@ -5010,6 +5128,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
5010 if (parent && parent->use_hierarchy) { 5128 if (parent && parent->use_hierarchy) {
5011 res_counter_init(&memcg->res, &parent->res); 5129 res_counter_init(&memcg->res, &parent->res);
5012 res_counter_init(&memcg->memsw, &parent->memsw); 5130 res_counter_init(&memcg->memsw, &parent->memsw);
5131 res_counter_init(&memcg->kmem, &parent->kmem);
5013 /* 5132 /*
5014 * We increment refcnt of the parent to ensure that we can 5133 * We increment refcnt of the parent to ensure that we can
5015 * safely access it on res_counter_charge/uncharge. 5134 * safely access it on res_counter_charge/uncharge.
@@ -5020,6 +5139,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
5020 } else { 5139 } else {
5021 res_counter_init(&memcg->res, NULL); 5140 res_counter_init(&memcg->res, NULL);
5022 res_counter_init(&memcg->memsw, NULL); 5141 res_counter_init(&memcg->memsw, NULL);
5142 res_counter_init(&memcg->kmem, NULL);
5023 /* 5143 /*
5024 * Deeper hierachy with use_hierarchy == false doesn't make 5144 * Deeper hierachy with use_hierarchy == false doesn't make
5025 * much sense so let cgroup subsystem know about this 5145 * much sense so let cgroup subsystem know about this