aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
author: Tejun Heo <tj@kernel.org> 2013-11-22 18:20:42 -0500
committer: Tejun Heo <tj@kernel.org> 2013-11-22 18:20:42 -0500
commit: 79bd9814e5ec9a288d6599f53aeac0b548fdfe52 (patch)
tree: 3eea32952c52e0d32de841156c4b68e7b8278053 /kernel
parent: 5e01dc7b26d9f24f39abace5da98ccbd6a5ceb52 (diff)
cgroup, memcg: move cgroup_event implementation to memcg
cgroup_event is way over-designed and tries to build a generic flexible event mechanism into cgroup - fully customizable event specification for each user of the interface. This is utterly unnecessary and overboard especially in light of the planned unified hierarchy as there's going to be a single agent. Simply generating events at fixed points, or if that's too restrictive, a configurable cadence or a single set of configurable points should be enough. Thankfully, memcg is the only user and gets to keep it. Replacing it with something simpler on sane_behavior is strongly recommended. This patch moves cgroup_event and "cgroup.event_control" implementation to mm/memcontrol.c. Clearing of events on cgroup destruction is moved from cgroup_destroy_locked() to mem_cgroup_css_offline(), which shouldn't make any noticeable difference. cgroup_css() and __file_cft() are exported to enable the move; however, this will soon be reverted once the event code is updated to be memcg specific. Note that "cgroup.event_control" will now exist only on the hierarchy with memcg attached to it. While this change is visible to userland, it is unlikely to be noticeable as the file has never been meaningful outside memcg. Aside from the above change, this is pure code relocation. v2: Per Li Zefan's comments, init/Kconfig updated accordingly and poll.h inclusion moved from cgroup.c to memcontrol.c. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Li Zefan <lizefan@huawei.com> Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.cz> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Balbir Singh <bsingharora@gmail.com>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/cgroup.c 253
1 files changed, 3 insertions, 250 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8bd9cfdc70d7..4bccaa7dda35 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -56,11 +56,8 @@
56#include <linux/pid_namespace.h> 56#include <linux/pid_namespace.h>
57#include <linux/idr.h> 57#include <linux/idr.h>
58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
59#include <linux/eventfd.h>
60#include <linux/poll.h>
61#include <linux/flex_array.h> /* used in cgroup_attach_task */ 59#include <linux/flex_array.h> /* used in cgroup_attach_task */
62#include <linux/kthread.h> 60#include <linux/kthread.h>
63#include <linux/file.h>
64 61
65#include <linux/atomic.h> 62#include <linux/atomic.h>
66 63
@@ -156,36 +153,6 @@ struct css_id {
156 unsigned short stack[0]; /* Array of Length (depth+1) */ 153 unsigned short stack[0]; /* Array of Length (depth+1) */
157}; 154};
158 155
159/*
160 * cgroup_event represents events which userspace want to receive.
161 */
162struct cgroup_event {
163 /*
164 * css which the event belongs to.
165 */
166 struct cgroup_subsys_state *css;
167 /*
168 * Control file which the event associated.
169 */
170 struct cftype *cft;
171 /*
172 * eventfd to signal userspace about the event.
173 */
174 struct eventfd_ctx *eventfd;
175 /*
176 * Each of these stored in a list by the cgroup.
177 */
178 struct list_head list;
179 /*
180 * All fields below needed to unregister event when
181 * userspace closes eventfd.
182 */
183 poll_table pt;
184 wait_queue_head_t *wqh;
185 wait_queue_t wait;
186 struct work_struct remove;
187};
188
189/* The list of hierarchy roots */ 156/* The list of hierarchy roots */
190 157
191static LIST_HEAD(cgroup_roots); 158static LIST_HEAD(cgroup_roots);
@@ -235,8 +202,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
235 * keep accessing it outside the said locks. This function may return 202 * keep accessing it outside the said locks. This function may return
236 * %NULL if @cgrp doesn't have @subsys_id enabled. 203 * %NULL if @cgrp doesn't have @subsys_id enabled.
237 */ 204 */
238static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, 205struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
239 struct cgroup_subsys *ss) 206 struct cgroup_subsys *ss)
240{ 207{
241 if (ss) 208 if (ss)
242 return rcu_dereference_check(cgrp->subsys[ss->subsys_id], 209 return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
@@ -2663,7 +2630,7 @@ static const struct inode_operations cgroup_dir_inode_operations = {
2663/* 2630/*
2664 * Check if a file is a control file 2631 * Check if a file is a control file
2665 */ 2632 */
2666static inline struct cftype *__file_cft(struct file *file) 2633struct cftype *__file_cft(struct file *file)
2667{ 2634{
2668 if (file_inode(file)->i_fop != &cgroup_file_operations) 2635 if (file_inode(file)->i_fop != &cgroup_file_operations)
2669 return ERR_PTR(-EINVAL); 2636 return ERR_PTR(-EINVAL);
@@ -3949,202 +3916,6 @@ static void cgroup_dput(struct cgroup *cgrp)
3949 deactivate_super(sb); 3916 deactivate_super(sb);
3950} 3917}
3951 3918
3952/*
3953 * Unregister event and free resources.
3954 *
3955 * Gets called from workqueue.
3956 */
3957static void cgroup_event_remove(struct work_struct *work)
3958{
3959 struct cgroup_event *event = container_of(work, struct cgroup_event,
3960 remove);
3961 struct cgroup_subsys_state *css = event->css;
3962
3963 remove_wait_queue(event->wqh, &event->wait);
3964
3965 event->cft->unregister_event(css, event->cft, event->eventfd);
3966
3967 /* Notify userspace the event is going away. */
3968 eventfd_signal(event->eventfd, 1);
3969
3970 eventfd_ctx_put(event->eventfd);
3971 kfree(event);
3972 css_put(css);
3973}
3974
3975/*
3976 * Gets called on POLLHUP on eventfd when user closes it.
3977 *
3978 * Called with wqh->lock held and interrupts disabled.
3979 */
3980static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3981 int sync, void *key)
3982{
3983 struct cgroup_event *event = container_of(wait,
3984 struct cgroup_event, wait);
3985 struct cgroup *cgrp = event->css->cgroup;
3986 unsigned long flags = (unsigned long)key;
3987
3988 if (flags & POLLHUP) {
3989 /*
3990 * If the event has been detached at cgroup removal, we
3991 * can simply return knowing the other side will cleanup
3992 * for us.
3993 *
3994 * We can't race against event freeing since the other
3995 * side will require wqh->lock via remove_wait_queue(),
3996 * which we hold.
3997 */
3998 spin_lock(&cgrp->event_list_lock);
3999 if (!list_empty(&event->list)) {
4000 list_del_init(&event->list);
4001 /*
4002 * We are in atomic context, but cgroup_event_remove()
4003 * may sleep, so we have to call it in workqueue.
4004 */
4005 schedule_work(&event->remove);
4006 }
4007 spin_unlock(&cgrp->event_list_lock);
4008 }
4009
4010 return 0;
4011}
4012
4013static void cgroup_event_ptable_queue_proc(struct file *file,
4014 wait_queue_head_t *wqh, poll_table *pt)
4015{
4016 struct cgroup_event *event = container_of(pt,
4017 struct cgroup_event, pt);
4018
4019 event->wqh = wqh;
4020 add_wait_queue(wqh, &event->wait);
4021}
4022
4023/*
4024 * Parse input and register new cgroup event handler.
4025 *
4026 * Input must be in format '<event_fd> <control_fd> <args>'.
4027 * Interpretation of args is defined by control file implementation.
4028 */
4029static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
4030 struct cftype *cft, const char *buffer)
4031{
4032 struct cgroup *cgrp = dummy_css->cgroup;
4033 struct cgroup_event *event;
4034 struct cgroup_subsys_state *cfile_css;
4035 unsigned int efd, cfd;
4036 struct fd efile;
4037 struct fd cfile;
4038 char *endp;
4039 int ret;
4040
4041 efd = simple_strtoul(buffer, &endp, 10);
4042 if (*endp != ' ')
4043 return -EINVAL;
4044 buffer = endp + 1;
4045
4046 cfd = simple_strtoul(buffer, &endp, 10);
4047 if ((*endp != ' ') && (*endp != '\0'))
4048 return -EINVAL;
4049 buffer = endp + 1;
4050
4051 event = kzalloc(sizeof(*event), GFP_KERNEL);
4052 if (!event)
4053 return -ENOMEM;
4054
4055 INIT_LIST_HEAD(&event->list);
4056 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
4057 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
4058 INIT_WORK(&event->remove, cgroup_event_remove);
4059
4060 efile = fdget(efd);
4061 if (!efile.file) {
4062 ret = -EBADF;
4063 goto out_kfree;
4064 }
4065
4066 event->eventfd = eventfd_ctx_fileget(efile.file);
4067 if (IS_ERR(event->eventfd)) {
4068 ret = PTR_ERR(event->eventfd);
4069 goto out_put_efile;
4070 }
4071
4072 cfile = fdget(cfd);
4073 if (!cfile.file) {
4074 ret = -EBADF;
4075 goto out_put_eventfd;
4076 }
4077
4078 /* the process need read permission on control file */
4079 /* AV: shouldn't we check that it's been opened for read instead? */
4080 ret = inode_permission(file_inode(cfile.file), MAY_READ);
4081 if (ret < 0)
4082 goto out_put_cfile;
4083
4084 event->cft = __file_cft(cfile.file);
4085 if (IS_ERR(event->cft)) {
4086 ret = PTR_ERR(event->cft);
4087 goto out_put_cfile;
4088 }
4089
4090 if (!event->cft->ss) {
4091 ret = -EBADF;
4092 goto out_put_cfile;
4093 }
4094
4095 /*
4096 * Determine the css of @cfile, verify it belongs to the same
4097 * cgroup as cgroup.event_control, and associate @event with it.
4098 * Remaining events are automatically removed on cgroup destruction
4099 * but the removal is asynchronous, so take an extra ref.
4100 */
4101 rcu_read_lock();
4102
4103 ret = -EINVAL;
4104 event->css = cgroup_css(cgrp, event->cft->ss);
4105 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
4106 if (event->css && event->css == cfile_css && css_tryget(event->css))
4107 ret = 0;
4108
4109 rcu_read_unlock();
4110 if (ret)
4111 goto out_put_cfile;
4112
4113 if (!event->cft->register_event || !event->cft->unregister_event) {
4114 ret = -EINVAL;
4115 goto out_put_css;
4116 }
4117
4118 ret = event->cft->register_event(event->css, event->cft,
4119 event->eventfd, buffer);
4120 if (ret)
4121 goto out_put_css;
4122
4123 efile.file->f_op->poll(efile.file, &event->pt);
4124
4125 spin_lock(&cgrp->event_list_lock);
4126 list_add(&event->list, &cgrp->event_list);
4127 spin_unlock(&cgrp->event_list_lock);
4128
4129 fdput(cfile);
4130 fdput(efile);
4131
4132 return 0;
4133
4134out_put_css:
4135 css_put(event->css);
4136out_put_cfile:
4137 fdput(cfile);
4138out_put_eventfd:
4139 eventfd_ctx_put(event->eventfd);
4140out_put_efile:
4141 fdput(efile);
4142out_kfree:
4143 kfree(event);
4144
4145 return ret;
4146}
4147
4148static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, 3919static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4149 struct cftype *cft) 3920 struct cftype *cft)
4150{ 3921{
@@ -4170,11 +3941,6 @@ static struct cftype cgroup_base_files[] = {
4170 .mode = S_IRUGO | S_IWUSR, 3941 .mode = S_IRUGO | S_IWUSR,
4171 }, 3942 },
4172 { 3943 {
4173 .name = "cgroup.event_control",
4174 .write_string = cgroup_write_event_control,
4175 .mode = S_IWUGO,
4176 },
4177 {
4178 .name = "cgroup.clone_children", 3944 .name = "cgroup.clone_children",
4179 .flags = CFTYPE_INSANE, 3945 .flags = CFTYPE_INSANE,
4180 .read_u64 = cgroup_clone_children_read, 3946 .read_u64 = cgroup_clone_children_read,
@@ -4666,7 +4432,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4666 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4432 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4667{ 4433{
4668 struct dentry *d = cgrp->dentry; 4434 struct dentry *d = cgrp->dentry;
4669 struct cgroup_event *event, *tmp;
4670 struct cgroup_subsys *ss; 4435 struct cgroup_subsys *ss;
4671 struct cgroup *child; 4436 struct cgroup *child;
4672 bool empty; 4437 bool empty;
@@ -4741,18 +4506,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4741 dget(d); 4506 dget(d);
4742 cgroup_d_remove_dir(d); 4507 cgroup_d_remove_dir(d);
4743 4508
4744 /*
4745 * Unregister events and notify userspace.
4746 * Notify userspace about cgroup removing only after rmdir of cgroup
4747 * directory to avoid race between userspace and kernelspace.
4748 */
4749 spin_lock(&cgrp->event_list_lock);
4750 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
4751 list_del_init(&event->list);
4752 schedule_work(&event->remove);
4753 }
4754 spin_unlock(&cgrp->event_list_lock);
4755
4756 return 0; 4509 return 0;
4757}; 4510};
4758 4511