diff options
| author | Glauber Costa <glommer@parallels.com> | 2011-12-11 16:47:01 -0500 |
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2011-12-12 19:03:55 -0500 |
| commit | e5671dfae59b165e2adfd4dfbdeab11ac8db5bda (patch) | |
| tree | daf5570e8da71934970daa4b2044c6f13ee98f9d | |
| parent | 08e34eb14fe4cfd934b5c169a7682a969457c4ea (diff) | |
Basic kernel memory functionality for the Memory Controller
This patch lays down the foundation for the kernel memory component
of the Memory Controller.
As of today, I am only laying down the following files:
* memory.independent_kmem_limit
* memory.kmem.limit_in_bytes (currently ignored)
* memory.kmem.usage_in_bytes (always zero)
Signed-off-by: Glauber Costa <glommer@parallels.com>
CC: Kirill A. Shutemov <kirill@shutemov.name>
CC: Paul Menage <paul@paulmenage.org>
CC: Greg Thelen <gthelen@google.com>
CC: Johannes Weiner <jweiner@redhat.com>
CC: Michal Hocko <mhocko@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
| -rw-r--r-- | Documentation/cgroups/memory.txt | 40 | ||||
| -rw-r--r-- | init/Kconfig | 11 | ||||
| -rw-r--r-- | mm/memcontrol.c | 105 |
3 files changed, 149 insertions, 7 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index cc0ebc5241b3..f2453241142b 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt | |||
| @@ -44,8 +44,9 @@ Features: | |||
| 44 | - oom-killer disable knob and oom-notifier | 44 | - oom-killer disable knob and oom-notifier |
| 45 | - Root cgroup has no limit controls. | 45 | - Root cgroup has no limit controls. |
| 46 | 46 | ||
| 47 | Kernel memory and Hugepages are not under control yet. We just manage | 47 | Hugepages is not under control yet. We just manage pages on LRU. To add more |
| 48 | pages on LRU. To add more controls, we have to take care of performance. | 48 | controls, we have to take care of performance. Kernel memory support is work |
| 49 | in progress, and the current version provides basic functionality. | ||
| 49 | 50 | ||
| 50 | Brief summary of control files. | 51 | Brief summary of control files. |
| 51 | 52 | ||
| @@ -56,8 +57,11 @@ Brief summary of control files. | |||
| 56 | (See 5.5 for details) | 57 | (See 5.5 for details) |
| 57 | memory.memsw.usage_in_bytes # show current res_counter usage for memory+Swap | 58 | memory.memsw.usage_in_bytes # show current res_counter usage for memory+Swap |
| 58 | (See 5.5 for details) | 59 | (See 5.5 for details) |
| 60 | memory.kmem.usage_in_bytes # show current res_counter usage for kmem only. | ||
| 61 | (See 2.7 for details) | ||
| 59 | memory.limit_in_bytes # set/show limit of memory usage | 62 | memory.limit_in_bytes # set/show limit of memory usage |
| 60 | memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage | 63 | memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage |
| 64 | memory.kmem.limit_in_bytes # if allowed, set/show limit of kernel memory | ||
| 61 | memory.failcnt # show the number of memory usage hits limits | 65 | memory.failcnt # show the number of memory usage hits limits |
| 62 | memory.memsw.failcnt # show the number of memory+Swap hits limits | 66 | memory.memsw.failcnt # show the number of memory+Swap hits limits |
| 63 | memory.max_usage_in_bytes # show max memory usage recorded | 67 | memory.max_usage_in_bytes # show max memory usage recorded |
| @@ -72,6 +76,9 @@ Brief summary of control files. | |||
| 72 | memory.oom_control # set/show oom controls. | 76 | memory.oom_control # set/show oom controls. |
| 73 | memory.numa_stat # show the number of memory usage per numa node | 77 | memory.numa_stat # show the number of memory usage per numa node |
| 74 | 78 | ||
| 79 | memory.independent_kmem_limit # select whether or not kernel memory limits are | ||
| 80 | independent of user limits | ||
| 81 | |||
| 75 | 1. History | 82 | 1. History |
| 76 | 83 | ||
| 77 | The memory controller has a long history. A request for comments for the memory | 84 | The memory controller has a long history. A request for comments for the memory |
| @@ -255,6 +262,35 @@ When oom event notifier is registered, event will be delivered. | |||
| 255 | per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by | 262 | per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by |
| 256 | zone->lru_lock, it has no lock of its own. | 263 | zone->lru_lock, it has no lock of its own. |
| 257 | 264 | ||
| 265 | 2.7 Kernel Memory Extension (CONFIG_CGROUP_MEM_RES_CTLR_KMEM) | ||
| 266 | |||
| 267 | With the Kernel memory extension, the Memory Controller is able to limit | ||
| 268 | the amount of kernel memory used by the system. Kernel memory is fundamentally | ||
| 269 | different than user memory, since it can't be swapped out, which makes it | ||
| 270 | possible to DoS the system by consuming too much of this precious resource. | ||
| 271 | |||
| 272 | Some kernel memory resources may be accounted and limited separately from the | ||
| 273 | main "kmem" resource. For instance, a slab cache that is considered important | ||
| 274 | enough to be limited separately may have its own knobs. | ||
| 275 | |||
| 276 | Kernel memory limits are not imposed for the root cgroup. Usage for the root | ||
| 277 | cgroup may or may not be accounted. | ||
| 278 | |||
| 279 | Memory limits as specified by the standard Memory Controller may or may not | ||
| 280 | take kernel memory into consideration. This is achieved through the file | ||
| 281 | memory.independent_kmem_limit. A value different from 0 will allow for kernel | ||
| 282 | memory to be controlled separately. | ||
| 283 | |||
| 284 | When kernel memory limits are not independent, the limit values set in | ||
| 285 | memory.kmem files are ignored. | ||
| 286 | |||
| 287 | Currently no soft limit is implemented for kernel memory. It is future work | ||
| 288 | to trigger slab reclaim when those limits are reached. | ||
| 289 | |||
| 290 | 2.7.1 Current Kernel Memory resources accounted | ||
| 291 | |||
| 292 | None | ||
| 293 | |||
| 258 | 3. User Interface | 294 | 3. User Interface |
| 259 | 295 | ||
| 260 | 0. Configuration | 296 | 0. Configuration |
diff --git a/init/Kconfig b/init/Kconfig index 43298f9810fb..b8930d5a8325 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
| @@ -689,6 +689,17 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED | |||
| 689 | For those who want to have the feature enabled by default should | 689 | For those who want to have the feature enabled by default should |
| 690 | select this option (if, for some reason, they need to disable it | 690 | select this option (if, for some reason, they need to disable it |
| 691 | then swapaccount=0 does the trick). | 691 | then swapaccount=0 does the trick). |
| 692 | config CGROUP_MEM_RES_CTLR_KMEM | ||
| 693 | bool "Memory Resource Controller Kernel Memory accounting (EXPERIMENTAL)" | ||
| 694 | depends on CGROUP_MEM_RES_CTLR && EXPERIMENTAL | ||
| 695 | default n | ||
| 696 | help | ||
| 697 | The Kernel Memory extension for Memory Resource Controller can limit | ||
| 698 | the amount of memory used by kernel objects in the system. Those are | ||
| 699 | fundamentally different from the entities handled by the standard | ||
| 700 | Memory Controller, which are page-based, and can be swapped. Users of | ||
| 701 | the kmem extension can use it to guarantee that no group of processes | ||
| 702 | will ever exhaust kernel resources alone. | ||
| 692 | 703 | ||
| 693 | config CGROUP_PERF | 704 | config CGROUP_PERF |
| 694 | bool "Enable perf_event per-cpu per-container group (cgroup) monitoring" | 705 | bool "Enable perf_event per-cpu per-container group (cgroup) monitoring" |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6aff93c98aca..9fbcff71245e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -227,6 +227,10 @@ struct mem_cgroup { | |||
| 227 | */ | 227 | */ |
| 228 | struct res_counter memsw; | 228 | struct res_counter memsw; |
| 229 | /* | 229 | /* |
| 230 | * the counter to account for kmem usage. | ||
| 231 | */ | ||
| 232 | struct res_counter kmem; | ||
| 233 | /* | ||
| 230 | * Per cgroup active and inactive list, similar to the | 234 | * Per cgroup active and inactive list, similar to the |
| 231 | * per zone LRU lists. | 235 | * per zone LRU lists. |
| 232 | */ | 236 | */ |
| @@ -277,6 +281,11 @@ struct mem_cgroup { | |||
| 277 | */ | 281 | */ |
| 278 | unsigned long move_charge_at_immigrate; | 282 | unsigned long move_charge_at_immigrate; |
| 279 | /* | 283 | /* |
| 284 | * Should kernel memory limits be established independently | ||
| 285 | * from user memory ? | ||
| 286 | */ | ||
| 287 | int kmem_independent_accounting; | ||
| 288 | /* | ||
| 280 | * percpu counter. | 289 | * percpu counter. |
| 281 | */ | 290 | */ |
| 282 | struct mem_cgroup_stat_cpu *stat; | 291 | struct mem_cgroup_stat_cpu *stat; |
| @@ -344,9 +353,14 @@ enum charge_type { | |||
| 344 | }; | 353 | }; |
| 345 | 354 | ||
| 346 | /* for encoding cft->private value on file */ | 355 | /* for encoding cft->private value on file */ |
| 347 | #define _MEM (0) | 356 | |
| 348 | #define _MEMSWAP (1) | 357 | enum mem_type { |
| 349 | #define _OOM_TYPE (2) | 358 | _MEM = 0, |
| 359 | _MEMSWAP, | ||
| 360 | _OOM_TYPE, | ||
| 361 | _KMEM, | ||
| 362 | }; | ||
| 363 | |||
| 350 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) | 364 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) |
| 351 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) | 365 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) |
| 352 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | 366 | #define MEMFILE_ATTR(val) ((val) & 0xffff) |
| @@ -3848,10 +3862,17 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | |||
| 3848 | u64 val; | 3862 | u64 val; |
| 3849 | 3863 | ||
| 3850 | if (!mem_cgroup_is_root(memcg)) { | 3864 | if (!mem_cgroup_is_root(memcg)) { |
| 3865 | val = 0; | ||
| 3866 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | ||
| 3867 | if (!memcg->kmem_independent_accounting) | ||
| 3868 | val = res_counter_read_u64(&memcg->kmem, RES_USAGE); | ||
| 3869 | #endif | ||
| 3851 | if (!swap) | 3870 | if (!swap) |
| 3852 | return res_counter_read_u64(&memcg->res, RES_USAGE); | 3871 | val += res_counter_read_u64(&memcg->res, RES_USAGE); |
| 3853 | else | 3872 | else |
| 3854 | return res_counter_read_u64(&memcg->memsw, RES_USAGE); | 3873 | val += res_counter_read_u64(&memcg->memsw, RES_USAGE); |
| 3874 | |||
| 3875 | return val; | ||
| 3855 | } | 3876 | } |
| 3856 | 3877 | ||
| 3857 | val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); | 3878 | val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); |
| @@ -3884,6 +3905,11 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | |||
| 3884 | else | 3905 | else |
| 3885 | val = res_counter_read_u64(&memcg->memsw, name); | 3906 | val = res_counter_read_u64(&memcg->memsw, name); |
| 3886 | break; | 3907 | break; |
| 3908 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | ||
| 3909 | case _KMEM: | ||
| 3910 | val = res_counter_read_u64(&memcg->kmem, name); | ||
| 3911 | break; | ||
| 3912 | #endif | ||
| 3887 | default: | 3913 | default: |
| 3888 | BUG(); | 3914 | BUG(); |
| 3889 | break; | 3915 | break; |
| @@ -4612,6 +4638,69 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file) | |||
| 4612 | } | 4638 | } |
| 4613 | #endif /* CONFIG_NUMA */ | 4639 | #endif /* CONFIG_NUMA */ |
| 4614 | 4640 | ||
| 4641 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | ||
| 4642 | static u64 kmem_limit_independent_read(struct cgroup *cgroup, struct cftype *cft) | ||
| 4643 | { | ||
| 4644 | return mem_cgroup_from_cont(cgroup)->kmem_independent_accounting; | ||
| 4645 | } | ||
| 4646 | |||
| 4647 | static int kmem_limit_independent_write(struct cgroup *cgroup, struct cftype *cft, | ||
| 4648 | u64 val) | ||
| 4649 | { | ||
| 4650 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); | ||
| 4651 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); | ||
| 4652 | |||
| 4653 | val = !!val; | ||
| 4654 | |||
| 4655 | /* | ||
| 4656 | * This follows the same hierarchy restrictions as | ||
| 4657 | * mem_cgroup_hierarchy_write() | ||
| 4658 | */ | ||
| 4659 | if (!parent || !parent->use_hierarchy) { | ||
| 4660 | if (list_empty(&cgroup->children)) | ||
| 4661 | memcg->kmem_independent_accounting = val; | ||
| 4662 | else | ||
| 4663 | return -EBUSY; | ||
| 4664 | } | ||
| 4665 | else | ||
| 4666 | return -EINVAL; | ||
| 4667 | |||
| 4668 | return 0; | ||
| 4669 | } | ||
| 4670 | static struct cftype kmem_cgroup_files[] = { | ||
| 4671 | { | ||
| 4672 | .name = "independent_kmem_limit", | ||
| 4673 | .read_u64 = kmem_limit_independent_read, | ||
| 4674 | .write_u64 = kmem_limit_independent_write, | ||
| 4675 | }, | ||
| 4676 | { | ||
| 4677 | .name = "kmem.usage_in_bytes", | ||
| 4678 | .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), | ||
| 4679 | .read_u64 = mem_cgroup_read, | ||
| 4680 | }, | ||
| 4681 | { | ||
| 4682 | .name = "kmem.limit_in_bytes", | ||
| 4683 | .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), | ||
| 4684 | .read_u64 = mem_cgroup_read, | ||
| 4685 | }, | ||
| 4686 | }; | ||
| 4687 | |||
| 4688 | static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) | ||
| 4689 | { | ||
| 4690 | int ret = 0; | ||
| 4691 | |||
| 4692 | ret = cgroup_add_files(cont, ss, kmem_cgroup_files, | ||
| 4693 | ARRAY_SIZE(kmem_cgroup_files)); | ||
| 4694 | return ret; | ||
| 4695 | }; | ||
| 4696 | |||
| 4697 | #else | ||
| 4698 | static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) | ||
| 4699 | { | ||
| 4700 | return 0; | ||
| 4701 | } | ||
| 4702 | #endif | ||
| 4703 | |||
| 4615 | static struct cftype mem_cgroup_files[] = { | 4704 | static struct cftype mem_cgroup_files[] = { |
| 4616 | { | 4705 | { |
| 4617 | .name = "usage_in_bytes", | 4706 | .name = "usage_in_bytes", |
| @@ -4925,6 +5014,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 4925 | if (parent && parent->use_hierarchy) { | 5014 | if (parent && parent->use_hierarchy) { |
| 4926 | res_counter_init(&memcg->res, &parent->res); | 5015 | res_counter_init(&memcg->res, &parent->res); |
| 4927 | res_counter_init(&memcg->memsw, &parent->memsw); | 5016 | res_counter_init(&memcg->memsw, &parent->memsw); |
| 5017 | res_counter_init(&memcg->kmem, &parent->kmem); | ||
| 4928 | /* | 5018 | /* |
| 4929 | * We increment refcnt of the parent to ensure that we can | 5019 | * We increment refcnt of the parent to ensure that we can |
| 4930 | * safely access it on res_counter_charge/uncharge. | 5020 | * safely access it on res_counter_charge/uncharge. |
| @@ -4935,6 +5025,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 4935 | } else { | 5025 | } else { |
| 4936 | res_counter_init(&memcg->res, NULL); | 5026 | res_counter_init(&memcg->res, NULL); |
| 4937 | res_counter_init(&memcg->memsw, NULL); | 5027 | res_counter_init(&memcg->memsw, NULL); |
| 5028 | res_counter_init(&memcg->kmem, NULL); | ||
| 4938 | } | 5029 | } |
| 4939 | memcg->last_scanned_child = 0; | 5030 | memcg->last_scanned_child = 0; |
| 4940 | memcg->last_scanned_node = MAX_NUMNODES; | 5031 | memcg->last_scanned_node = MAX_NUMNODES; |
| @@ -4978,6 +5069,10 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, | |||
| 4978 | 5069 | ||
| 4979 | if (!ret) | 5070 | if (!ret) |
| 4980 | ret = register_memsw_files(cont, ss); | 5071 | ret = register_memsw_files(cont, ss); |
| 5072 | |||
| 5073 | if (!ret) | ||
| 5074 | ret = register_kmem_files(cont, ss); | ||
| 5075 | |||
| 4981 | return ret; | 5076 | return ret; |
| 4982 | } | 5077 | } |
| 4983 | 5078 | ||
