aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGlauber Costa <glommer@parallels.com>2011-12-11 16:47:01 -0500
committerDavid S. Miller <davem@davemloft.net>2011-12-12 19:03:55 -0500
commite5671dfae59b165e2adfd4dfbdeab11ac8db5bda (patch)
treedaf5570e8da71934970daa4b2044c6f13ee98f9d
parent08e34eb14fe4cfd934b5c169a7682a969457c4ea (diff)
Basic kernel memory functionality for the Memory Controller
This patch lays down the foundation for the kernel memory component of the Memory Controller. As of today, I am only laying down the following files: * memory.independent_kmem_limit * memory.kmem.limit_in_bytes (currently ignored) * memory.kmem.usage_in_bytes (always zero) Signed-off-by: Glauber Costa <glommer@parallels.com> CC: Kirill A. Shutemov <kirill@shutemov.name> CC: Paul Menage <paul@paulmenage.org> CC: Greg Thelen <gthelen@google.com> CC: Johannes Weiner <jweiner@redhat.com> CC: Michal Hocko <mhocko@suse.cz> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--Documentation/cgroups/memory.txt40
-rw-r--r--init/Kconfig11
-rw-r--r--mm/memcontrol.c105
3 files changed, 149 insertions, 7 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index cc0ebc5241b3..f2453241142b 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -44,8 +44,9 @@ Features:
44 - oom-killer disable knob and oom-notifier 44 - oom-killer disable knob and oom-notifier
45 - Root cgroup has no limit controls. 45 - Root cgroup has no limit controls.
46 46
47 Kernel memory and Hugepages are not under control yet. We just manage 47 Hugepages are not under control yet. We just manage pages on LRU. To add more
48 pages on LRU. To add more controls, we have to take care of performance. 48 controls, we have to take care of performance. Kernel memory support is a work
49 in progress, and the current version provides basic functionality.
49 50
50Brief summary of control files. 51Brief summary of control files.
51 52
@@ -56,8 +57,11 @@ Brief summary of control files.
56 (See 5.5 for details) 57 (See 5.5 for details)
57 memory.memsw.usage_in_bytes # show current res_counter usage for memory+Swap 58 memory.memsw.usage_in_bytes # show current res_counter usage for memory+Swap
58 (See 5.5 for details) 59 (See 5.5 for details)
60 memory.kmem.usage_in_bytes # show current res_counter usage for kmem only.
61 (See 2.7 for details)
59 memory.limit_in_bytes # set/show limit of memory usage 62 memory.limit_in_bytes # set/show limit of memory usage
60 memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage 63 memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage
64 memory.kmem.limit_in_bytes # if allowed, set/show limit of kernel memory
61 memory.failcnt # show the number of memory usage hits limits 65 memory.failcnt # show the number of memory usage hits limits
62 memory.memsw.failcnt # show the number of memory+Swap hits limits 66 memory.memsw.failcnt # show the number of memory+Swap hits limits
63 memory.max_usage_in_bytes # show max memory usage recorded 67 memory.max_usage_in_bytes # show max memory usage recorded
@@ -72,6 +76,9 @@ Brief summary of control files.
72 memory.oom_control # set/show oom controls. 76 memory.oom_control # set/show oom controls.
73 memory.numa_stat # show the number of memory usage per numa node 77 memory.numa_stat # show the number of memory usage per numa node
74 78
79 memory.independent_kmem_limit # select whether or not kernel memory limits are
80 independent of user limits
81
751. History 821. History
76 83
77The memory controller has a long history. A request for comments for the memory 84The memory controller has a long history. A request for comments for the memory
@@ -255,6 +262,35 @@ When oom event notifier is registered, event will be delivered.
255 per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by 262 per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
256 zone->lru_lock, it has no lock of its own. 263 zone->lru_lock, it has no lock of its own.
257 264
2652.7 Kernel Memory Extension (CONFIG_CGROUP_MEM_RES_CTLR_KMEM)
266
267With the Kernel memory extension, the Memory Controller is able to limit
268the amount of kernel memory used by the system. Kernel memory is fundamentally
269different from user memory, since it can't be swapped out, which makes it
270possible to DoS the system by consuming too much of this precious resource.
271
272Some kernel memory resources may be accounted and limited separately from the
273main "kmem" resource. For instance, a slab cache that is considered important
274enough to be limited separately may have its own knobs.
275
276Kernel memory limits are not imposed for the root cgroup. Usage for the root
277cgroup may or may not be accounted.
278
279Memory limits as specified by the standard Memory Controller may or may not
280take kernel memory into consideration. This is achieved through the file
281memory.independent_kmem_limit. A value other than 0 will allow for kernel
282memory to be controlled separately.
283
284When kernel memory limits are not independent, the limit values set in
285memory.kmem files are ignored.
286
287Currently no soft limit is implemented for kernel memory. It is future work
288to trigger slab reclaim when those limits are reached.
289
2902.7.1 Current Kernel Memory resources accounted
291
292None
293
2583. User Interface 2943. User Interface
259 295
2600. Configuration 2960. Configuration
diff --git a/init/Kconfig b/init/Kconfig
index 43298f9810fb..b8930d5a8325 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -689,6 +689,17 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
689 For those who want to have the feature enabled by default should 689 For those who want to have the feature enabled by default should
690 select this option (if, for some reason, they need to disable it 690 select this option (if, for some reason, they need to disable it
691 then swapaccount=0 does the trick). 691 then swapaccount=0 does the trick).
692config CGROUP_MEM_RES_CTLR_KMEM
693 bool "Memory Resource Controller Kernel Memory accounting (EXPERIMENTAL)"
694 depends on CGROUP_MEM_RES_CTLR && EXPERIMENTAL
695 default n
696 help
697 The Kernel Memory extension for Memory Resource Controller can limit
698 the amount of memory used by kernel objects in the system. Those are
699 fundamentally different from the entities handled by the standard
700 Memory Controller, which are page-based, and can be swapped. Users of
701 the kmem extension can use it to guarantee that no group of processes
702 will ever exhaust kernel resources alone.
692 703
693config CGROUP_PERF 704config CGROUP_PERF
694 bool "Enable perf_event per-cpu per-container group (cgroup) monitoring" 705 bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6aff93c98aca..9fbcff71245e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -227,6 +227,10 @@ struct mem_cgroup {
227 */ 227 */
228 struct res_counter memsw; 228 struct res_counter memsw;
229 /* 229 /*
230 * the counter to account for kmem usage.
231 */
232 struct res_counter kmem;
233 /*
230 * Per cgroup active and inactive list, similar to the 234 * Per cgroup active and inactive list, similar to the
231 * per zone LRU lists. 235 * per zone LRU lists.
232 */ 236 */
@@ -277,6 +281,11 @@ struct mem_cgroup {
277 */ 281 */
278 unsigned long move_charge_at_immigrate; 282 unsigned long move_charge_at_immigrate;
279 /* 283 /*
284 * Should kernel memory limits be established independently
285 * from user memory?
286 */
287 int kmem_independent_accounting;
288 /*
280 * percpu counter. 289 * percpu counter.
281 */ 290 */
282 struct mem_cgroup_stat_cpu *stat; 291 struct mem_cgroup_stat_cpu *stat;
@@ -344,9 +353,14 @@ enum charge_type {
344}; 353};
345 354
346/* for encoding cft->private value on file */ 355/* for encoding cft->private value on file */
347#define _MEM (0) 356
348#define _MEMSWAP (1) 357enum mem_type {
349#define _OOM_TYPE (2) 358 _MEM = 0,
359 _MEMSWAP,
360 _OOM_TYPE,
361 _KMEM,
362};
363
350#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 364#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
351#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 365#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
352#define MEMFILE_ATTR(val) ((val) & 0xffff) 366#define MEMFILE_ATTR(val) ((val) & 0xffff)
@@ -3848,10 +3862,17 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3848 u64 val; 3862 u64 val;
3849 3863
3850 if (!mem_cgroup_is_root(memcg)) { 3864 if (!mem_cgroup_is_root(memcg)) {
3865 val = 0;
3866#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
3867 if (!memcg->kmem_independent_accounting)
3868 val = res_counter_read_u64(&memcg->kmem, RES_USAGE);
3869#endif
3851 if (!swap) 3870 if (!swap)
3852 return res_counter_read_u64(&memcg->res, RES_USAGE); 3871 val += res_counter_read_u64(&memcg->res, RES_USAGE);
3853 else 3872 else
3854 return res_counter_read_u64(&memcg->memsw, RES_USAGE); 3873 val += res_counter_read_u64(&memcg->memsw, RES_USAGE);
3874
3875 return val;
3855 } 3876 }
3856 3877
3857 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); 3878 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
@@ -3884,6 +3905,11 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3884 else 3905 else
3885 val = res_counter_read_u64(&memcg->memsw, name); 3906 val = res_counter_read_u64(&memcg->memsw, name);
3886 break; 3907 break;
3908#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
3909 case _KMEM:
3910 val = res_counter_read_u64(&memcg->kmem, name);
3911 break;
3912#endif
3887 default: 3913 default:
3888 BUG(); 3914 BUG();
3889 break; 3915 break;
@@ -4612,6 +4638,69 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
4612} 4638}
4613#endif /* CONFIG_NUMA */ 4639#endif /* CONFIG_NUMA */
4614 4640
4641#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
4642static u64 kmem_limit_independent_read(struct cgroup *cgroup, struct cftype *cft)
4643{
4644 return mem_cgroup_from_cont(cgroup)->kmem_independent_accounting;
4645}
4646
4647static int kmem_limit_independent_write(struct cgroup *cgroup, struct cftype *cft,
4648 u64 val)
4649{
4650 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
4651 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4652
4653 val = !!val;
4654
4655 /*
4656 * This follows the same hierarchy restrictions as
4657 * mem_cgroup_hierarchy_write()
4658 */
4659 if (!parent || !parent->use_hierarchy) {
4660 if (list_empty(&cgroup->children))
4661 memcg->kmem_independent_accounting = val;
4662 else
4663 return -EBUSY;
4664 }
4665 else
4666 return -EINVAL;
4667
4668 return 0;
4669}
4670static struct cftype kmem_cgroup_files[] = {
4671 {
4672 .name = "independent_kmem_limit",
4673 .read_u64 = kmem_limit_independent_read,
4674 .write_u64 = kmem_limit_independent_write,
4675 },
4676 {
4677 .name = "kmem.usage_in_bytes",
4678 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
4679 .read_u64 = mem_cgroup_read,
4680 },
4681 {
4682 .name = "kmem.limit_in_bytes",
4683 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
4684 .read_u64 = mem_cgroup_read,
4685 },
4686};
4687
4688static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
4689{
4690 int ret = 0;
4691
4692 ret = cgroup_add_files(cont, ss, kmem_cgroup_files,
4693 ARRAY_SIZE(kmem_cgroup_files));
4694 return ret;
4695};
4696
4697#else
4698static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
4699{
4700 return 0;
4701}
4702#endif
4703
4615static struct cftype mem_cgroup_files[] = { 4704static struct cftype mem_cgroup_files[] = {
4616 { 4705 {
4617 .name = "usage_in_bytes", 4706 .name = "usage_in_bytes",
@@ -4925,6 +5014,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4925 if (parent && parent->use_hierarchy) { 5014 if (parent && parent->use_hierarchy) {
4926 res_counter_init(&memcg->res, &parent->res); 5015 res_counter_init(&memcg->res, &parent->res);
4927 res_counter_init(&memcg->memsw, &parent->memsw); 5016 res_counter_init(&memcg->memsw, &parent->memsw);
5017 res_counter_init(&memcg->kmem, &parent->kmem);
4928 /* 5018 /*
4929 * We increment refcnt of the parent to ensure that we can 5019 * We increment refcnt of the parent to ensure that we can
4930 * safely access it on res_counter_charge/uncharge. 5020 * safely access it on res_counter_charge/uncharge.
@@ -4935,6 +5025,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4935 } else { 5025 } else {
4936 res_counter_init(&memcg->res, NULL); 5026 res_counter_init(&memcg->res, NULL);
4937 res_counter_init(&memcg->memsw, NULL); 5027 res_counter_init(&memcg->memsw, NULL);
5028 res_counter_init(&memcg->kmem, NULL);
4938 } 5029 }
4939 memcg->last_scanned_child = 0; 5030 memcg->last_scanned_child = 0;
4940 memcg->last_scanned_node = MAX_NUMNODES; 5031 memcg->last_scanned_node = MAX_NUMNODES;
@@ -4978,6 +5069,10 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
4978 5069
4979 if (!ret) 5070 if (!ret)
4980 ret = register_memsw_files(cont, ss); 5071 ret = register_memsw_files(cont, ss);
5072
5073 if (!ret)
5074 ret = register_kmem_files(cont, ss);
5075
4981 return ret; 5076 return ret;
4982} 5077}
4983 5078