diff options
author | Glauber Costa <glommer@parallels.com> | 2011-12-11 16:47:01 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2011-12-12 19:03:55 -0500 |
commit | e5671dfae59b165e2adfd4dfbdeab11ac8db5bda (patch) | |
tree | daf5570e8da71934970daa4b2044c6f13ee98f9d | |
parent | 08e34eb14fe4cfd934b5c169a7682a969457c4ea (diff) |
Basic kernel memory functionality for the Memory Controller
This patch lays down the foundation for the kernel memory component
of the Memory Controller.
As of today, I am only laying down the following files:
* memory.independent_kmem_limit
* memory.kmem.limit_in_bytes (currently ignored)
* memory.kmem.usage_in_bytes (always zero)
Signed-off-by: Glauber Costa <glommer@parallels.com>
CC: Kirill A. Shutemov <kirill@shutemov.name>
CC: Paul Menage <paul@paulmenage.org>
CC: Greg Thelen <gthelen@google.com>
CC: Johannes Weiner <jweiner@redhat.com>
CC: Michal Hocko <mhocko@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | Documentation/cgroups/memory.txt | 40 | ||||
-rw-r--r-- | init/Kconfig | 11 | ||||
-rw-r--r-- | mm/memcontrol.c | 105 |
3 files changed, 149 insertions, 7 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index cc0ebc5241b3..f2453241142b 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt | |||
@@ -44,8 +44,9 @@ Features: | |||
44 | - oom-killer disable knob and oom-notifier | 44 | - oom-killer disable knob and oom-notifier |
45 | - Root cgroup has no limit controls. | 45 | - Root cgroup has no limit controls. |
46 | 46 | ||
47 | Kernel memory and Hugepages are not under control yet. We just manage | 47 | Hugepages are not under control yet. We just manage pages on LRU. To add more |
48 | pages on LRU. To add more controls, we have to take care of performance. | 48 | controls, we have to take care of performance. Kernel memory support is work |
49 | in progress, and the current version provides basic functionality. | ||
49 | 50 | ||
50 | Brief summary of control files. | 51 | Brief summary of control files. |
51 | 52 | ||
@@ -56,8 +57,11 @@ Brief summary of control files. | |||
56 | (See 5.5 for details) | 57 | (See 5.5 for details) |
57 | memory.memsw.usage_in_bytes # show current res_counter usage for memory+Swap | 58 | memory.memsw.usage_in_bytes # show current res_counter usage for memory+Swap |
58 | (See 5.5 for details) | 59 | (See 5.5 for details) |
60 | memory.kmem.usage_in_bytes # show current res_counter usage for kmem only. | ||
61 | (See 2.7 for details) | ||
59 | memory.limit_in_bytes # set/show limit of memory usage | 62 | memory.limit_in_bytes # set/show limit of memory usage |
60 | memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage | 63 | memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage |
64 | memory.kmem.limit_in_bytes # if allowed, set/show limit of kernel memory | ||
61 | memory.failcnt # show the number of memory usage hits limits | 65 | memory.failcnt # show the number of memory usage hits limits |
62 | memory.memsw.failcnt # show the number of memory+Swap hits limits | 66 | memory.memsw.failcnt # show the number of memory+Swap hits limits |
63 | memory.max_usage_in_bytes # show max memory usage recorded | 67 | memory.max_usage_in_bytes # show max memory usage recorded |
@@ -72,6 +76,9 @@ Brief summary of control files. | |||
72 | memory.oom_control # set/show oom controls. | 76 | memory.oom_control # set/show oom controls. |
73 | memory.numa_stat # show the number of memory usage per numa node | 77 | memory.numa_stat # show the number of memory usage per numa node |
74 | 78 | ||
79 | memory.independent_kmem_limit # select whether or not kernel memory limits are | ||
80 | independent of user limits | ||
81 | |||
75 | 1. History | 82 | 1. History |
76 | 83 | ||
77 | The memory controller has a long history. A request for comments for the memory | 84 | The memory controller has a long history. A request for comments for the memory |
@@ -255,6 +262,35 @@ When oom event notifier is registered, event will be delivered. | |||
255 | per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by | 262 | per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by |
256 | zone->lru_lock, it has no lock of its own. | 263 | zone->lru_lock, it has no lock of its own. |
257 | 264 | ||
265 | 2.7 Kernel Memory Extension (CONFIG_CGROUP_MEM_RES_CTLR_KMEM) | ||
266 | |||
267 | With the Kernel memory extension, the Memory Controller is able to limit | ||
268 | the amount of kernel memory used by the system. Kernel memory is fundamentally | ||
269 | different than user memory, since it can't be swapped out, which makes it | ||
270 | possible to DoS the system by consuming too much of this precious resource. | ||
271 | |||
272 | Some kernel memory resources may be accounted and limited separately from the | ||
273 | main "kmem" resource. For instance, a slab cache that is considered important | ||
274 | enough to be limited separately may have its own knobs. | ||
275 | |||
276 | Kernel memory limits are not imposed for the root cgroup. Usage for the root | ||
277 | cgroup may or may not be accounted. | ||
278 | |||
279 | Memory limits as specified by the standard Memory Controller may or may not | ||
280 | take kernel memory into consideration. This is achieved through the file | ||
281 | memory.independent_kmem_limit. A value different from 0 will allow for kernel | ||
282 | memory to be controlled separately. | ||
283 | |||
284 | When kernel memory limits are not independent, the limit values set in | ||
285 | memory.kmem files are ignored. | ||
286 | |||
287 | Currently no soft limit is implemented for kernel memory. It is future work | ||
288 | to trigger slab reclaim when those limits are reached. | ||
289 | |||
290 | 2.7.1 Current Kernel Memory resources accounted | ||
291 | |||
292 | None | ||
293 | |||
258 | 3. User Interface | 294 | 3. User Interface |
259 | 295 | ||
260 | 0. Configuration | 296 | 0. Configuration |
diff --git a/init/Kconfig b/init/Kconfig index 43298f9810fb..b8930d5a8325 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -689,6 +689,17 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED | |||
689 | For those who want to have the feature enabled by default should | 689 | For those who want to have the feature enabled by default should |
690 | select this option (if, for some reason, they need to disable it | 690 | select this option (if, for some reason, they need to disable it |
691 | then swapaccount=0 does the trick). | 691 | then swapaccount=0 does the trick). |
692 | config CGROUP_MEM_RES_CTLR_KMEM | ||
693 | bool "Memory Resource Controller Kernel Memory accounting (EXPERIMENTAL)" | ||
694 | depends on CGROUP_MEM_RES_CTLR && EXPERIMENTAL | ||
695 | default n | ||
696 | help | ||
697 | The Kernel Memory extension for Memory Resource Controller can limit | ||
698 | the amount of memory used by kernel objects in the system. Those are | ||
699 | fundamentally different from the entities handled by the standard | ||
700 | Memory Controller, which are page-based, and can be swapped. Users of | ||
701 | the kmem extension can use it to guarantee that no group of processes | ||
702 | will ever exhaust kernel resources alone. | ||
692 | 703 | ||
693 | config CGROUP_PERF | 704 | config CGROUP_PERF |
694 | bool "Enable perf_event per-cpu per-container group (cgroup) monitoring" | 705 | bool "Enable perf_event per-cpu per-container group (cgroup) monitoring" |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6aff93c98aca..9fbcff71245e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -227,6 +227,10 @@ struct mem_cgroup { | |||
227 | */ | 227 | */ |
228 | struct res_counter memsw; | 228 | struct res_counter memsw; |
229 | /* | 229 | /* |
230 | * the counter to account for kmem usage. | ||
231 | */ | ||
232 | struct res_counter kmem; | ||
233 | /* | ||
230 | * Per cgroup active and inactive list, similar to the | 234 | * Per cgroup active and inactive list, similar to the |
231 | * per zone LRU lists. | 235 | * per zone LRU lists. |
232 | */ | 236 | */ |
@@ -277,6 +281,11 @@ struct mem_cgroup { | |||
277 | */ | 281 | */ |
278 | unsigned long move_charge_at_immigrate; | 282 | unsigned long move_charge_at_immigrate; |
279 | /* | 283 | /* |
284 | * Should kernel memory limits be established independently | ||
285 | * from user memory ? | ||
286 | */ | ||
287 | int kmem_independent_accounting; | ||
288 | /* | ||
280 | * percpu counter. | 289 | * percpu counter. |
281 | */ | 290 | */ |
282 | struct mem_cgroup_stat_cpu *stat; | 291 | struct mem_cgroup_stat_cpu *stat; |
@@ -344,9 +353,14 @@ enum charge_type { | |||
344 | }; | 353 | }; |
345 | 354 | ||
346 | /* for encoding cft->private value on file */ | 355 | /* for encoding cft->private value on file */ |
347 | #define _MEM (0) | 356 | |
348 | #define _MEMSWAP (1) | 357 | enum mem_type { |
349 | #define _OOM_TYPE (2) | 358 | _MEM = 0, |
359 | _MEMSWAP, | ||
360 | _OOM_TYPE, | ||
361 | _KMEM, | ||
362 | }; | ||
363 | |||
350 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) | 364 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) |
351 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) | 365 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) |
352 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | 366 | #define MEMFILE_ATTR(val) ((val) & 0xffff) |
@@ -3848,10 +3862,17 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | |||
3848 | u64 val; | 3862 | u64 val; |
3849 | 3863 | ||
3850 | if (!mem_cgroup_is_root(memcg)) { | 3864 | if (!mem_cgroup_is_root(memcg)) { |
3865 | val = 0; | ||
3866 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | ||
3867 | if (!memcg->kmem_independent_accounting) | ||
3868 | val = res_counter_read_u64(&memcg->kmem, RES_USAGE); | ||
3869 | #endif | ||
3851 | if (!swap) | 3870 | if (!swap) |
3852 | return res_counter_read_u64(&memcg->res, RES_USAGE); | 3871 | val += res_counter_read_u64(&memcg->res, RES_USAGE); |
3853 | else | 3872 | else |
3854 | return res_counter_read_u64(&memcg->memsw, RES_USAGE); | 3873 | val += res_counter_read_u64(&memcg->memsw, RES_USAGE); |
3874 | |||
3875 | return val; | ||
3855 | } | 3876 | } |
3856 | 3877 | ||
3857 | val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); | 3878 | val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); |
@@ -3884,6 +3905,11 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | |||
3884 | else | 3905 | else |
3885 | val = res_counter_read_u64(&memcg->memsw, name); | 3906 | val = res_counter_read_u64(&memcg->memsw, name); |
3886 | break; | 3907 | break; |
3908 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | ||
3909 | case _KMEM: | ||
3910 | val = res_counter_read_u64(&memcg->kmem, name); | ||
3911 | break; | ||
3912 | #endif | ||
3887 | default: | 3913 | default: |
3888 | BUG(); | 3914 | BUG(); |
3889 | break; | 3915 | break; |
@@ -4612,6 +4638,69 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file) | |||
4612 | } | 4638 | } |
4613 | #endif /* CONFIG_NUMA */ | 4639 | #endif /* CONFIG_NUMA */ |
4614 | 4640 | ||
4641 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | ||
4642 | static u64 kmem_limit_independent_read(struct cgroup *cgroup, struct cftype *cft) | ||
4643 | { | ||
4644 | return mem_cgroup_from_cont(cgroup)->kmem_independent_accounting; | ||
4645 | } | ||
4646 | |||
4647 | static int kmem_limit_independent_write(struct cgroup *cgroup, struct cftype *cft, | ||
4648 | u64 val) | ||
4649 | { | ||
4650 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); | ||
4651 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); | ||
4652 | |||
4653 | val = !!val; | ||
4654 | |||
4655 | /* | ||
4656 | * This follows the same hierarchy restrictions as | ||
4657 | * mem_cgroup_hierarchy_write() | ||
4658 | */ | ||
4659 | if (!parent || !parent->use_hierarchy) { | ||
4660 | if (list_empty(&cgroup->children)) | ||
4661 | memcg->kmem_independent_accounting = val; | ||
4662 | else | ||
4663 | return -EBUSY; | ||
4664 | } | ||
4665 | else | ||
4666 | return -EINVAL; | ||
4667 | |||
4668 | return 0; | ||
4669 | } | ||
4670 | static struct cftype kmem_cgroup_files[] = { | ||
4671 | { | ||
4672 | .name = "independent_kmem_limit", | ||
4673 | .read_u64 = kmem_limit_independent_read, | ||
4674 | .write_u64 = kmem_limit_independent_write, | ||
4675 | }, | ||
4676 | { | ||
4677 | .name = "kmem.usage_in_bytes", | ||
4678 | .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), | ||
4679 | .read_u64 = mem_cgroup_read, | ||
4680 | }, | ||
4681 | { | ||
4682 | .name = "kmem.limit_in_bytes", | ||
4683 | .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), | ||
4684 | .read_u64 = mem_cgroup_read, | ||
4685 | }, | ||
4686 | }; | ||
4687 | |||
4688 | static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) | ||
4689 | { | ||
4690 | int ret = 0; | ||
4691 | |||
4692 | ret = cgroup_add_files(cont, ss, kmem_cgroup_files, | ||
4693 | ARRAY_SIZE(kmem_cgroup_files)); | ||
4694 | return ret; | ||
4695 | }; | ||
4696 | |||
4697 | #else | ||
4698 | static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) | ||
4699 | { | ||
4700 | return 0; | ||
4701 | } | ||
4702 | #endif | ||
4703 | |||
4615 | static struct cftype mem_cgroup_files[] = { | 4704 | static struct cftype mem_cgroup_files[] = { |
4616 | { | 4705 | { |
4617 | .name = "usage_in_bytes", | 4706 | .name = "usage_in_bytes", |
@@ -4925,6 +5014,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
4925 | if (parent && parent->use_hierarchy) { | 5014 | if (parent && parent->use_hierarchy) { |
4926 | res_counter_init(&memcg->res, &parent->res); | 5015 | res_counter_init(&memcg->res, &parent->res); |
4927 | res_counter_init(&memcg->memsw, &parent->memsw); | 5016 | res_counter_init(&memcg->memsw, &parent->memsw); |
5017 | res_counter_init(&memcg->kmem, &parent->kmem); | ||
4928 | /* | 5018 | /* |
4929 | * We increment refcnt of the parent to ensure that we can | 5019 | * We increment refcnt of the parent to ensure that we can |
4930 | * safely access it on res_counter_charge/uncharge. | 5020 | * safely access it on res_counter_charge/uncharge. |
@@ -4935,6 +5025,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
4935 | } else { | 5025 | } else { |
4936 | res_counter_init(&memcg->res, NULL); | 5026 | res_counter_init(&memcg->res, NULL); |
4937 | res_counter_init(&memcg->memsw, NULL); | 5027 | res_counter_init(&memcg->memsw, NULL); |
5028 | res_counter_init(&memcg->kmem, NULL); | ||
4938 | } | 5029 | } |
4939 | memcg->last_scanned_child = 0; | 5030 | memcg->last_scanned_child = 0; |
4940 | memcg->last_scanned_node = MAX_NUMNODES; | 5031 | memcg->last_scanned_node = MAX_NUMNODES; |
@@ -4978,6 +5069,10 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, | |||
4978 | 5069 | ||
4979 | if (!ret) | 5070 | if (!ret) |
4980 | ret = register_memsw_files(cont, ss); | 5071 | ret = register_memsw_files(cont, ss); |
5072 | |||
5073 | if (!ret) | ||
5074 | ret = register_kmem_files(cont, ss); | ||
5075 | |||
4981 | return ret; | 5076 | return ret; |
4982 | } | 5077 | } |
4983 | 5078 | ||