aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Chris Down <chris@chrisdown.name> 2019-06-01 01:30:22 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2019-06-01 18:51:31 -0400
commit 9852ae3fe5293264f01c49f2571ef7688f7823ce (patch)
tree 6e9847c13762ce77ab3eb1e7a604be7b9cf6c09a
parent bc81426f5beef7da863d3365bc9d45e820448745 (diff)
mm, memcg: consider subtrees in memory.events
memory.stat and other files already consider subtrees in their output, and we should too in order to not present an inconsistent interface. The current situation is fairly confusing, because people interacting with cgroups expect hierarchical behaviour in the vein of memory.stat, cgroup.events, and other files. For example, this causes confusion when debugging reclaim events under low, as currently these always read "0" at non-leaf memcg nodes, which frequently causes people to misdiagnose breach behaviour. The same confusion applies to other counters in this file when debugging issues. Aggregation is done at write time instead of at read-time since these counters aren't hot (unlike memory.stat which is per-page, so it does it at read time), and it makes sense to bundle this with the file notifications. After this patch, events are propagated up the hierarchy: [root@ktst ~]# cat /sys/fs/cgroup/system.slice/memory.events low 0 high 0 max 0 oom 0 oom_kill 0 [root@ktst ~]# systemd-run -p MemoryMax=1 true Running as unit: run-r251162a189fb4562b9dabfdc9b0422f5.service [root@ktst ~]# cat /sys/fs/cgroup/system.slice/memory.events low 0 high 0 max 7 oom 1 oom_kill 1 As this is a change in behaviour, this can be reverted to the old behaviour by mounting with the `memory_localevents' flag set. However, we use the new behaviour by default as there's a lack of evidence that there are any current users of memory.events that would find this change undesirable. akpm: this is a behaviour change, so Cc:stable. This is so that forthcoming distros which use cgroup v2 are more likely to pick up the revised behaviour. 
Link: http://lkml.kernel.org/r/20190208224419.GA24772@chrisdown.name Signed-off-by: Chris Down <chris@chrisdown.name> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Shakeel Butt <shakeelb@google.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Tejun Heo <tj@kernel.org> Cc: Roman Gushchin <guro@fb.com> Cc: Dennis Zhou <dennis@kernel.org> Cc: Suren Baghdasaryan <surenb@google.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- Documentation/admin-guide/cgroup-v2.rst 9
-rw-r--r-- include/linux/cgroup-defs.h 5
-rw-r--r-- include/linux/memcontrol.h 10
-rw-r--r-- kernel/cgroup/cgroup.c 16
4 files changed, 36 insertions, 4 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 88e746074252..cf88c1f98270 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -177,6 +177,15 @@ cgroup v2 currently supports the following mount options.
177 ignored on non-init namespace mounts. Please refer to the 177 ignored on non-init namespace mounts. Please refer to the
178 Delegation section for details. 178 Delegation section for details.
179 179
180 memory_localevents
181
182 Only populate memory.events with data for the current cgroup,
183 and not any subtrees. This is legacy behaviour, the default
184 behaviour without this option is to include subtree counts.
185 This option is system wide and can only be set on mount or
186 modified through remount from the init namespace. The mount
187 option is ignored on non-init namespace mounts.
188
180 189
181Organizing Processes and Threads 190Organizing Processes and Threads
182-------------------------------- 191--------------------------------
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 77258d276f93..11e215d7937e 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -89,6 +89,11 @@ enum {
89 * Enable cpuset controller in v1 cgroup to use v2 behavior. 89 * Enable cpuset controller in v1 cgroup to use v2 behavior.
90 */ 90 */
91 CGRP_ROOT_CPUSET_V2_MODE = (1 << 4), 91 CGRP_ROOT_CPUSET_V2_MODE = (1 << 4),
92
93 /*
94 * Enable legacy local memory.events.
95 */
96 CGRP_ROOT_MEMORY_LOCAL_EVENTS = (1 << 5),
92}; 97};
93 98
94/* cftype->flags */ 99/* cftype->flags */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 73fe0a700911..edf9e8f32d70 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -737,8 +737,14 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
737static inline void memcg_memory_event(struct mem_cgroup *memcg, 737static inline void memcg_memory_event(struct mem_cgroup *memcg,
738 enum memcg_memory_event event) 738 enum memcg_memory_event event)
739{ 739{
740 atomic_long_inc(&memcg->memory_events[event]); 740 do {
741 cgroup_file_notify(&memcg->events_file); 741 atomic_long_inc(&memcg->memory_events[event]);
742 cgroup_file_notify(&memcg->events_file);
743
744 if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
745 break;
746 } while ((memcg = parent_mem_cgroup(memcg)) &&
747 !mem_cgroup_is_root(memcg));
742} 748}
743 749
744static inline void memcg_memory_event_mm(struct mm_struct *mm, 750static inline void memcg_memory_event_mm(struct mm_struct *mm,
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 217cec4e22c6..426a0026225c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1810,11 +1810,13 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
1810 1810
1811enum cgroup2_param { 1811enum cgroup2_param {
1812 Opt_nsdelegate, 1812 Opt_nsdelegate,
1813 Opt_memory_localevents,
1813 nr__cgroup2_params 1814 nr__cgroup2_params
1814}; 1815};
1815 1816
1816static const struct fs_parameter_spec cgroup2_param_specs[] = { 1817static const struct fs_parameter_spec cgroup2_param_specs[] = {
1817 fsparam_flag ("nsdelegate", Opt_nsdelegate), 1818 fsparam_flag("nsdelegate", Opt_nsdelegate),
1819 fsparam_flag("memory_localevents", Opt_memory_localevents),
1818 {} 1820 {}
1819}; 1821};
1820 1822
@@ -1837,6 +1839,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
1837 case Opt_nsdelegate: 1839 case Opt_nsdelegate:
1838 ctx->flags |= CGRP_ROOT_NS_DELEGATE; 1840 ctx->flags |= CGRP_ROOT_NS_DELEGATE;
1839 return 0; 1841 return 0;
1842 case Opt_memory_localevents:
1843 ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1844 return 0;
1840 } 1845 }
1841 return -EINVAL; 1846 return -EINVAL;
1842} 1847}
@@ -1848,6 +1853,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
1848 cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; 1853 cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
1849 else 1854 else
1850 cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; 1855 cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
1856
1857 if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1858 cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1859 else
1860 cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1851 } 1861 }
1852} 1862}
1853 1863
@@ -1855,6 +1865,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
1855{ 1865{
1856 if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) 1866 if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
1857 seq_puts(seq, ",nsdelegate"); 1867 seq_puts(seq, ",nsdelegate");
1868 if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1869 seq_puts(seq, ",memory_localevents");
1858 return 0; 1870 return 0;
1859} 1871}
1860 1872
@@ -6325,7 +6337,7 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
6325static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, 6337static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
6326 char *buf) 6338 char *buf)
6327{ 6339{
6328 return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); 6340 return snprintf(buf, PAGE_SIZE, "nsdelegate\nmemory_localevents\n");
6329} 6341}
6330static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); 6342static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
6331 6343