aboutsummaryrefslogtreecommitdiffstats
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2013-11-22 18:20:42 -0500
committerTejun Heo <tj@kernel.org>2013-11-22 18:20:42 -0500
commit79bd9814e5ec9a288d6599f53aeac0b548fdfe52 (patch)
tree3eea32952c52e0d32de841156c4b68e7b8278053 /mm/memcontrol.c
parent5e01dc7b26d9f24f39abace5da98ccbd6a5ceb52 (diff)
cgroup, memcg: move cgroup_event implementation to memcg
cgroup_event is way over-designed and tries to build a generic flexible event mechanism into cgroup - fully customizable event specification for each user of the interface. This is utterly unnecessary and overboard especially in the light of the planned unified hierarchy as there's going to be a single agent. Simply generating events at fixed points, or if that's too restrictive, configurable cadence or a single set of configurable points should be enough. Thankfully, memcg is the only user and gets to keep it. Replacing it with something simpler on sane_behavior is strongly recommended. This patch moves cgroup_event and "cgroup.event_control" implementation to mm/memcontrol.c. Clearing of events on cgroup destruction is moved from cgroup_destroy_locked() to mem_cgroup_css_offline(), which shouldn't make any noticeable difference. cgroup_css() and __file_cft() are exported to enable the move; however, this will soon be reverted once the event code is updated to be memcg specific. Note that "cgroup.event_control" will now exist only on the hierarchy with memcg attached to it. While this change is visible to userland, it is unlikely to be noticeable as the file has never been meaningful outside memcg. Aside from the above change, this is pure code relocation. v2: Per Li Zefan's comments, init/Kconfig updated accordingly and poll.h inclusion moved from cgroup.c to memcontrol.c. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Li Zefan <lizefan@huawei.com> Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.cz> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Balbir Singh <bsingharora@gmail.com>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c248
1 files changed, 248 insertions, 0 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 13b9d0f221b8..02dae3292668 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -45,6 +45,7 @@
45#include <linux/swapops.h> 45#include <linux/swapops.h>
46#include <linux/spinlock.h> 46#include <linux/spinlock.h>
47#include <linux/eventfd.h> 47#include <linux/eventfd.h>
48#include <linux/poll.h>
48#include <linux/sort.h> 49#include <linux/sort.h>
49#include <linux/fs.h> 50#include <linux/fs.h>
50#include <linux/seq_file.h> 51#include <linux/seq_file.h>
@@ -55,6 +56,7 @@
55#include <linux/cpu.h> 56#include <linux/cpu.h>
56#include <linux/oom.h> 57#include <linux/oom.h>
57#include <linux/lockdep.h> 58#include <linux/lockdep.h>
59#include <linux/file.h>
58#include "internal.h" 60#include "internal.h"
59#include <net/sock.h> 61#include <net/sock.h>
60#include <net/ip.h> 62#include <net/ip.h>
@@ -226,6 +228,36 @@ struct mem_cgroup_eventfd_list {
226 struct eventfd_ctx *eventfd; 228 struct eventfd_ctx *eventfd;
227}; 229};
228 230
231/*
232 * cgroup_event represents events which userspace want to receive.
233 */
234struct cgroup_event {
235 /*
236 * css which the event belongs to.
237 */
238 struct cgroup_subsys_state *css;
239 /*
240 * Control file which the event associated.
241 */
242 struct cftype *cft;
243 /*
244 * eventfd to signal userspace about the event.
245 */
246 struct eventfd_ctx *eventfd;
247 /*
248 * Each of these stored in a list by the cgroup.
249 */
250 struct list_head list;
251 /*
252 * All fields below needed to unregister event when
253 * userspace closes eventfd.
254 */
255 poll_table pt;
256 wait_queue_head_t *wqh;
257 wait_queue_t wait;
258 struct work_struct remove;
259};
260
229static void mem_cgroup_threshold(struct mem_cgroup *memcg); 261static void mem_cgroup_threshold(struct mem_cgroup *memcg);
230static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 262static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
231 263
@@ -5947,6 +5979,202 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5947} 5979}
5948#endif 5980#endif
5949 5981
5982/*
5983 * Unregister event and free resources.
5984 *
5985 * Gets called from workqueue.
5986 */
5987static void cgroup_event_remove(struct work_struct *work)
5988{
5989 struct cgroup_event *event = container_of(work, struct cgroup_event,
5990 remove);
5991 struct cgroup_subsys_state *css = event->css;
5992
5993 remove_wait_queue(event->wqh, &event->wait);
5994
5995 event->cft->unregister_event(css, event->cft, event->eventfd);
5996
5997 /* Notify userspace the event is going away. */
5998 eventfd_signal(event->eventfd, 1);
5999
6000 eventfd_ctx_put(event->eventfd);
6001 kfree(event);
6002 css_put(css);
6003}
6004
6005/*
6006 * Gets called on POLLHUP on eventfd when user closes it.
6007 *
6008 * Called with wqh->lock held and interrupts disabled.
6009 */
6010static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
6011 int sync, void *key)
6012{
6013 struct cgroup_event *event = container_of(wait,
6014 struct cgroup_event, wait);
6015 struct cgroup *cgrp = event->css->cgroup;
6016 unsigned long flags = (unsigned long)key;
6017
6018 if (flags & POLLHUP) {
6019 /*
6020 * If the event has been detached at cgroup removal, we
6021 * can simply return knowing the other side will cleanup
6022 * for us.
6023 *
6024 * We can't race against event freeing since the other
6025 * side will require wqh->lock via remove_wait_queue(),
6026 * which we hold.
6027 */
6028 spin_lock(&cgrp->event_list_lock);
6029 if (!list_empty(&event->list)) {
6030 list_del_init(&event->list);
6031 /*
6032 * We are in atomic context, but cgroup_event_remove()
6033 * may sleep, so we have to call it in workqueue.
6034 */
6035 schedule_work(&event->remove);
6036 }
6037 spin_unlock(&cgrp->event_list_lock);
6038 }
6039
6040 return 0;
6041}
6042
6043static void cgroup_event_ptable_queue_proc(struct file *file,
6044 wait_queue_head_t *wqh, poll_table *pt)
6045{
6046 struct cgroup_event *event = container_of(pt,
6047 struct cgroup_event, pt);
6048
6049 event->wqh = wqh;
6050 add_wait_queue(wqh, &event->wait);
6051}
6052
6053/*
6054 * Parse input and register new cgroup event handler.
6055 *
6056 * Input must be in format '<event_fd> <control_fd> <args>'.
6057 * Interpretation of args is defined by control file implementation.
6058 */
6059static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
6060 struct cftype *cft, const char *buffer)
6061{
6062 struct cgroup *cgrp = dummy_css->cgroup;
6063 struct cgroup_event *event;
6064 struct cgroup_subsys_state *cfile_css;
6065 unsigned int efd, cfd;
6066 struct fd efile;
6067 struct fd cfile;
6068 char *endp;
6069 int ret;
6070
6071 efd = simple_strtoul(buffer, &endp, 10);
6072 if (*endp != ' ')
6073 return -EINVAL;
6074 buffer = endp + 1;
6075
6076 cfd = simple_strtoul(buffer, &endp, 10);
6077 if ((*endp != ' ') && (*endp != '\0'))
6078 return -EINVAL;
6079 buffer = endp + 1;
6080
6081 event = kzalloc(sizeof(*event), GFP_KERNEL);
6082 if (!event)
6083 return -ENOMEM;
6084
6085 INIT_LIST_HEAD(&event->list);
6086 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
6087 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
6088 INIT_WORK(&event->remove, cgroup_event_remove);
6089
6090 efile = fdget(efd);
6091 if (!efile.file) {
6092 ret = -EBADF;
6093 goto out_kfree;
6094 }
6095
6096 event->eventfd = eventfd_ctx_fileget(efile.file);
6097 if (IS_ERR(event->eventfd)) {
6098 ret = PTR_ERR(event->eventfd);
6099 goto out_put_efile;
6100 }
6101
6102 cfile = fdget(cfd);
6103 if (!cfile.file) {
6104 ret = -EBADF;
6105 goto out_put_eventfd;
6106 }
6107
6108 /* the process need read permission on control file */
6109 /* AV: shouldn't we check that it's been opened for read instead? */
6110 ret = inode_permission(file_inode(cfile.file), MAY_READ);
6111 if (ret < 0)
6112 goto out_put_cfile;
6113
6114 event->cft = __file_cft(cfile.file);
6115 if (IS_ERR(event->cft)) {
6116 ret = PTR_ERR(event->cft);
6117 goto out_put_cfile;
6118 }
6119
6120 if (!event->cft->ss) {
6121 ret = -EBADF;
6122 goto out_put_cfile;
6123 }
6124
6125 /*
6126 * Determine the css of @cfile, verify it belongs to the same
6127 * cgroup as cgroup.event_control, and associate @event with it.
6128 * Remaining events are automatically removed on cgroup destruction
6129 * but the removal is asynchronous, so take an extra ref.
6130 */
6131 rcu_read_lock();
6132
6133 ret = -EINVAL;
6134 event->css = cgroup_css(cgrp, event->cft->ss);
6135 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
6136 if (event->css && event->css == cfile_css && css_tryget(event->css))
6137 ret = 0;
6138
6139 rcu_read_unlock();
6140 if (ret)
6141 goto out_put_cfile;
6142
6143 if (!event->cft->register_event || !event->cft->unregister_event) {
6144 ret = -EINVAL;
6145 goto out_put_css;
6146 }
6147
6148 ret = event->cft->register_event(event->css, event->cft,
6149 event->eventfd, buffer);
6150 if (ret)
6151 goto out_put_css;
6152
6153 efile.file->f_op->poll(efile.file, &event->pt);
6154
6155 spin_lock(&cgrp->event_list_lock);
6156 list_add(&event->list, &cgrp->event_list);
6157 spin_unlock(&cgrp->event_list_lock);
6158
6159 fdput(cfile);
6160 fdput(efile);
6161
6162 return 0;
6163
6164out_put_css:
6165 css_put(event->css);
6166out_put_cfile:
6167 fdput(cfile);
6168out_put_eventfd:
6169 eventfd_ctx_put(event->eventfd);
6170out_put_efile:
6171 fdput(efile);
6172out_kfree:
6173 kfree(event);
6174
6175 return ret;
6176}
6177
5950static struct cftype mem_cgroup_files[] = { 6178static struct cftype mem_cgroup_files[] = {
5951 { 6179 {
5952 .name = "usage_in_bytes", 6180 .name = "usage_in_bytes",
@@ -5994,6 +6222,12 @@ static struct cftype mem_cgroup_files[] = {
5994 .read_u64 = mem_cgroup_hierarchy_read, 6222 .read_u64 = mem_cgroup_hierarchy_read,
5995 }, 6223 },
5996 { 6224 {
6225 .name = "cgroup.event_control",
6226 .write_string = cgroup_write_event_control,
6227 .flags = CFTYPE_NO_PREFIX,
6228 .mode = S_IWUGO,
6229 },
6230 {
5997 .name = "swappiness", 6231 .name = "swappiness",
5998 .read_u64 = mem_cgroup_swappiness_read, 6232 .read_u64 = mem_cgroup_swappiness_read,
5999 .write_u64 = mem_cgroup_swappiness_write, 6233 .write_u64 = mem_cgroup_swappiness_write,
@@ -6326,6 +6560,20 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6326static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 6560static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6327{ 6561{
6328 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6562 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6563 struct cgroup *cgrp = css->cgroup;
6564 struct cgroup_event *event, *tmp;
6565
6566 /*
6567 * Unregister events and notify userspace.
6568 * Notify userspace about cgroup removing only after rmdir of cgroup
6569 * directory to avoid race between userspace and kernelspace.
6570 */
6571 spin_lock(&cgrp->event_list_lock);
6572 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
6573 list_del_init(&event->list);
6574 schedule_work(&event->remove);
6575 }
6576 spin_unlock(&cgrp->event_list_lock);
6329 6577
6330 kmem_cgroup_css_offline(memcg); 6578 kmem_cgroup_css_offline(memcg);
6331 6579