author      Tejun Heo <tj@kernel.org>    2013-11-22 18:20:42 -0500
committer   Tejun Heo <tj@kernel.org>    2013-11-22 18:20:42 -0500
commit      79bd9814e5ec9a288d6599f53aeac0b548fdfe52 (patch)
tree        3eea32952c52e0d32de841156c4b68e7b8278053
parent      5e01dc7b26d9f24f39abace5da98ccbd6a5ceb52 (diff)
cgroup, memcg: move cgroup_event implementation to memcg

cgroup_event is way over-designed and tries to build a generic flexible
event mechanism into cgroup - fully customizable event specification for
each user of the interface.  This is utterly unnecessary and overboard,
especially in light of the planned unified hierarchy, as there's gonna be
a single agent.  Simply generating events at fixed points, or, if that's
too restrictive, a configurable cadence or a single set of configurable
points should be enough.

Thankfully, memcg is the only user and gets to keep it.  Replacing it with
something simpler on sane_behavior is strongly recommended.

This patch moves the cgroup_event and "cgroup.event_control"
implementation to mm/memcontrol.c.  Clearing of events on cgroup
destruction is moved from cgroup_destroy_locked() to
mem_cgroup_css_offline(), which shouldn't make any noticeable difference.

cgroup_css() and __file_cft() are exported to enable the move; however,
this will soon be reverted once the event code is updated to be memcg
specific.

Note that "cgroup.event_control" will now exist only on the hierarchy
with memcg attached to it.  While this change is visible to userland, it
is unlikely to be noticeable as the file has never been meaningful
outside memcg.

Aside from the above change, this is pure code relocation.

v2: Per Li Zefan's comments, init/Kconfig updated accordingly and poll.h
    inclusion moved from cgroup.c to memcontrol.c.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Balbir Singh <bsingharora@gmail.com>
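
[Editor's note: for reference, this is roughly how userspace drives the interface
being moved.  A minimal sketch, not part of the patch; the mount point
/sys/fs/cgroup/memory, the group name "mygroup", and the 64M threshold are
illustrative assumptions.  The write format '<event_fd> <control_fd> <args>'
comes from the comment above cgroup_write_event_control(); for
memory.usage_in_bytes the argument is a usage threshold in bytes.]

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
        /* assumed v1-style mount point of the memory controller */
        const char *cg = "/sys/fs/cgroup/memory/mygroup";
        char buf[256];
        int efd, cfd, ecfd;
        uint64_t count;

        efd = eventfd(0, 0);                    /* event_fd to be signalled */

        snprintf(buf, sizeof(buf), "%s/memory.usage_in_bytes", cg);
        cfd = open(buf, O_RDONLY);              /* control_fd being watched */

        snprintf(buf, sizeof(buf), "%s/cgroup.event_control", cg);
        ecfd = open(buf, O_WRONLY);
        if (efd < 0 || cfd < 0 || ecfd < 0) {
                perror("eventfd/open");
                return 1;
        }

        /* "<event_fd> <control_fd> <args>"; args here is a byte threshold */
        snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd, 64ULL << 20);
        if (write(ecfd, buf, strlen(buf)) < 0) {
                perror("write cgroup.event_control");
                return 1;
        }

        /* blocks until the threshold fires or the cgroup goes away */
        if (read(efd, &count, sizeof(count)) == sizeof(count))
                printf("memory.usage_in_bytes crossed the threshold\n");
        return 0;
}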
-rw-r--r--  include/linux/cgroup.h  |   5
-rw-r--r--  init/Kconfig            |   3
-rw-r--r--  kernel/cgroup.c         | 253
-rw-r--r--  mm/memcontrol.c         | 248
4 files changed, 257 insertions, 252 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 3561d305b1e0..40c2427806c9 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -907,6 +907,11 @@ unsigned short css_id(struct cgroup_subsys_state *css);
 struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
                                          struct cgroup_subsys *ss);
 
+/* XXX: temporary */
+struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
+                                       struct cgroup_subsys *ss);
+struct cftype *__file_cft(struct file *file);
+
 #else /* !CONFIG_CGROUPS */
 
 static inline int cgroup_init_early(void) { return 0; }
diff --git a/init/Kconfig b/init/Kconfig
index 3ecd8a1178f1..3ca5b8110b0c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -861,7 +861,6 @@ config NUMA_BALANCING
 
 menuconfig CGROUPS
         boolean "Control Group support"
-        depends on EVENTFD
         help
           This option adds support for grouping sets of processes together, for
           use with process control subsystems such as Cpusets, CFS, memory
@@ -928,6 +927,7 @@ config MEMCG
         bool "Memory Resource Controller for Control Groups"
         depends on RESOURCE_COUNTERS
         select MM_OWNER
+        select EVENTFD
         help
           Provides a memory resource controller that manages both anonymous
           memory and page cache. (See Documentation/cgroups/memory.txt)
@@ -1167,7 +1167,6 @@ config UIDGID_STRICT_TYPE_CHECKS
 
 config SCHED_AUTOGROUP
         bool "Automatic process group scheduling"
-        select EVENTFD
         select CGROUPS
         select CGROUP_SCHED
         select FAIR_GROUP_SCHED
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8bd9cfdc70d7..4bccaa7dda35 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -56,11 +56,8 @@
 #include <linux/pid_namespace.h>
 #include <linux/idr.h>
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
-#include <linux/eventfd.h>
-#include <linux/poll.h>
 #include <linux/flex_array.h> /* used in cgroup_attach_task */
 #include <linux/kthread.h>
-#include <linux/file.h>
 
 #include <linux/atomic.h>
 
@@ -156,36 +153,6 @@ struct css_id {
         unsigned short stack[0]; /* Array of Length (depth+1) */
 };
 
-/*
- * cgroup_event represents events which userspace want to receive.
- */
-struct cgroup_event {
-        /*
-         * css which the event belongs to.
-         */
-        struct cgroup_subsys_state *css;
-        /*
-         * Control file which the event associated.
-         */
-        struct cftype *cft;
-        /*
-         * eventfd to signal userspace about the event.
-         */
-        struct eventfd_ctx *eventfd;
-        /*
-         * Each of these stored in a list by the cgroup.
-         */
-        struct list_head list;
-        /*
-         * All fields below needed to unregister event when
-         * userspace closes eventfd.
-         */
-        poll_table pt;
-        wait_queue_head_t *wqh;
-        wait_queue_t wait;
-        struct work_struct remove;
-};
-
 /* The list of hierarchy roots */
 
 static LIST_HEAD(cgroup_roots);
@@ -235,8 +202,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
  * keep accessing it outside the said locks. This function may return
  * %NULL if @cgrp doesn't have @subsys_id enabled.
  */
-static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
-                                              struct cgroup_subsys *ss)
+struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
+                                       struct cgroup_subsys *ss)
 {
         if (ss)
                 return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
@@ -2663,7 +2630,7 @@ static const struct inode_operations cgroup_dir_inode_operations = {
 /*
  * Check if a file is a control file
  */
-static inline struct cftype *__file_cft(struct file *file)
+struct cftype *__file_cft(struct file *file)
 {
         if (file_inode(file)->i_fop != &cgroup_file_operations)
                 return ERR_PTR(-EINVAL);
@@ -3949,202 +3916,6 @@ static void cgroup_dput(struct cgroup *cgrp)
         deactivate_super(sb);
 }
 
-/*
- * Unregister event and free resources.
- *
- * Gets called from workqueue.
- */
-static void cgroup_event_remove(struct work_struct *work)
-{
-        struct cgroup_event *event = container_of(work, struct cgroup_event,
-                        remove);
-        struct cgroup_subsys_state *css = event->css;
-
-        remove_wait_queue(event->wqh, &event->wait);
-
-        event->cft->unregister_event(css, event->cft, event->eventfd);
-
-        /* Notify userspace the event is going away. */
-        eventfd_signal(event->eventfd, 1);
-
-        eventfd_ctx_put(event->eventfd);
-        kfree(event);
-        css_put(css);
-}
-
-/*
- * Gets called on POLLHUP on eventfd when user closes it.
- *
- * Called with wqh->lock held and interrupts disabled.
- */
-static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
-                int sync, void *key)
-{
-        struct cgroup_event *event = container_of(wait,
-                        struct cgroup_event, wait);
-        struct cgroup *cgrp = event->css->cgroup;
-        unsigned long flags = (unsigned long)key;
-
-        if (flags & POLLHUP) {
-                /*
-                 * If the event has been detached at cgroup removal, we
-                 * can simply return knowing the other side will cleanup
-                 * for us.
-                 *
-                 * We can't race against event freeing since the other
-                 * side will require wqh->lock via remove_wait_queue(),
-                 * which we hold.
-                 */
-                spin_lock(&cgrp->event_list_lock);
-                if (!list_empty(&event->list)) {
-                        list_del_init(&event->list);
-                        /*
-                         * We are in atomic context, but cgroup_event_remove()
-                         * may sleep, so we have to call it in workqueue.
-                         */
-                        schedule_work(&event->remove);
-                }
-                spin_unlock(&cgrp->event_list_lock);
-        }
-
-        return 0;
-}
-
-static void cgroup_event_ptable_queue_proc(struct file *file,
-                wait_queue_head_t *wqh, poll_table *pt)
-{
-        struct cgroup_event *event = container_of(pt,
-                        struct cgroup_event, pt);
-
-        event->wqh = wqh;
-        add_wait_queue(wqh, &event->wait);
-}
-
-/*
- * Parse input and register new cgroup event handler.
- *
- * Input must be in format '<event_fd> <control_fd> <args>'.
- * Interpretation of args is defined by control file implementation.
- */
-static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
-                                      struct cftype *cft, const char *buffer)
-{
-        struct cgroup *cgrp = dummy_css->cgroup;
-        struct cgroup_event *event;
-        struct cgroup_subsys_state *cfile_css;
-        unsigned int efd, cfd;
-        struct fd efile;
-        struct fd cfile;
-        char *endp;
-        int ret;
-
-        efd = simple_strtoul(buffer, &endp, 10);
-        if (*endp != ' ')
-                return -EINVAL;
-        buffer = endp + 1;
-
-        cfd = simple_strtoul(buffer, &endp, 10);
-        if ((*endp != ' ') && (*endp != '\0'))
-                return -EINVAL;
-        buffer = endp + 1;
-
-        event = kzalloc(sizeof(*event), GFP_KERNEL);
-        if (!event)
-                return -ENOMEM;
-
-        INIT_LIST_HEAD(&event->list);
-        init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
-        init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
-        INIT_WORK(&event->remove, cgroup_event_remove);
-
-        efile = fdget(efd);
-        if (!efile.file) {
-                ret = -EBADF;
-                goto out_kfree;
-        }
-
-        event->eventfd = eventfd_ctx_fileget(efile.file);
-        if (IS_ERR(event->eventfd)) {
-                ret = PTR_ERR(event->eventfd);
-                goto out_put_efile;
-        }
-
-        cfile = fdget(cfd);
-        if (!cfile.file) {
-                ret = -EBADF;
-                goto out_put_eventfd;
-        }
-
-        /* the process need read permission on control file */
-        /* AV: shouldn't we check that it's been opened for read instead? */
-        ret = inode_permission(file_inode(cfile.file), MAY_READ);
-        if (ret < 0)
-                goto out_put_cfile;
-
-        event->cft = __file_cft(cfile.file);
-        if (IS_ERR(event->cft)) {
-                ret = PTR_ERR(event->cft);
-                goto out_put_cfile;
-        }
-
-        if (!event->cft->ss) {
-                ret = -EBADF;
-                goto out_put_cfile;
-        }
-
-        /*
-         * Determine the css of @cfile, verify it belongs to the same
-         * cgroup as cgroup.event_control, and associate @event with it.
-         * Remaining events are automatically removed on cgroup destruction
-         * but the removal is asynchronous, so take an extra ref.
-         */
-        rcu_read_lock();
-
-        ret = -EINVAL;
-        event->css = cgroup_css(cgrp, event->cft->ss);
-        cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
-        if (event->css && event->css == cfile_css && css_tryget(event->css))
-                ret = 0;
-
-        rcu_read_unlock();
-        if (ret)
-                goto out_put_cfile;
-
-        if (!event->cft->register_event || !event->cft->unregister_event) {
-                ret = -EINVAL;
-                goto out_put_css;
-        }
-
-        ret = event->cft->register_event(event->css, event->cft,
-                        event->eventfd, buffer);
-        if (ret)
-                goto out_put_css;
-
-        efile.file->f_op->poll(efile.file, &event->pt);
-
-        spin_lock(&cgrp->event_list_lock);
-        list_add(&event->list, &cgrp->event_list);
-        spin_unlock(&cgrp->event_list_lock);
-
-        fdput(cfile);
-        fdput(efile);
-
-        return 0;
-
-out_put_css:
-        css_put(event->css);
-out_put_cfile:
-        fdput(cfile);
-out_put_eventfd:
-        eventfd_ctx_put(event->eventfd);
-out_put_efile:
-        fdput(efile);
-out_kfree:
-        kfree(event);
-
-        return ret;
-}
-
 static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
                                       struct cftype *cft)
 {
@@ -4170,11 +3941,6 @@ static struct cftype cgroup_base_files[] = {
                 .mode = S_IRUGO | S_IWUSR,
         },
         {
-                .name = "cgroup.event_control",
-                .write_string = cgroup_write_event_control,
-                .mode = S_IWUGO,
-        },
-        {
                 .name = "cgroup.clone_children",
                 .flags = CFTYPE_INSANE,
                 .read_u64 = cgroup_clone_children_read,
@@ -4666,7 +4432,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
         __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
         struct dentry *d = cgrp->dentry;
-        struct cgroup_event *event, *tmp;
         struct cgroup_subsys *ss;
         struct cgroup *child;
         bool empty;
@@ -4741,18 +4506,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
         dget(d);
         cgroup_d_remove_dir(d);
 
-        /*
-         * Unregister events and notify userspace.
-         * Notify userspace about cgroup removing only after rmdir of cgroup
-         * directory to avoid race between userspace and kernelspace.
-         */
-        spin_lock(&cgrp->event_list_lock);
-        list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
-                list_del_init(&event->list);
-                schedule_work(&event->remove);
-        }
-        spin_unlock(&cgrp->event_list_lock);
-
         return 0;
 };
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 13b9d0f221b8..02dae3292668 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -45,6 +45,7 @@
 #include <linux/swapops.h>
 #include <linux/spinlock.h>
 #include <linux/eventfd.h>
+#include <linux/poll.h>
 #include <linux/sort.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
@@ -55,6 +56,7 @@
 #include <linux/cpu.h>
 #include <linux/oom.h>
 #include <linux/lockdep.h>
+#include <linux/file.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -226,6 +228,36 @@ struct mem_cgroup_eventfd_list {
         struct eventfd_ctx *eventfd;
 };
 
+/*
+ * cgroup_event represents events which userspace want to receive.
+ */
+struct cgroup_event {
+        /*
+         * css which the event belongs to.
+         */
+        struct cgroup_subsys_state *css;
+        /*
+         * Control file which the event associated.
+         */
+        struct cftype *cft;
+        /*
+         * eventfd to signal userspace about the event.
+         */
+        struct eventfd_ctx *eventfd;
+        /*
+         * Each of these stored in a list by the cgroup.
+         */
+        struct list_head list;
+        /*
+         * All fields below needed to unregister event when
+         * userspace closes eventfd.
+         */
+        poll_table pt;
+        wait_queue_head_t *wqh;
+        wait_queue_t wait;
+        struct work_struct remove;
+};
+
 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 
@@ -5947,6 +5979,202 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
 }
 #endif
 
+/*
+ * Unregister event and free resources.
+ *
+ * Gets called from workqueue.
+ */
+static void cgroup_event_remove(struct work_struct *work)
+{
+        struct cgroup_event *event = container_of(work, struct cgroup_event,
+                        remove);
+        struct cgroup_subsys_state *css = event->css;
+
+        remove_wait_queue(event->wqh, &event->wait);
+
+        event->cft->unregister_event(css, event->cft, event->eventfd);
+
+        /* Notify userspace the event is going away. */
+        eventfd_signal(event->eventfd, 1);
+
+        eventfd_ctx_put(event->eventfd);
+        kfree(event);
+        css_put(css);
+}
+
+/*
+ * Gets called on POLLHUP on eventfd when user closes it.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ */
+static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
+                int sync, void *key)
+{
+        struct cgroup_event *event = container_of(wait,
+                        struct cgroup_event, wait);
+        struct cgroup *cgrp = event->css->cgroup;
+        unsigned long flags = (unsigned long)key;
+
+        if (flags & POLLHUP) {
+                /*
+                 * If the event has been detached at cgroup removal, we
+                 * can simply return knowing the other side will cleanup
+                 * for us.
+                 *
+                 * We can't race against event freeing since the other
+                 * side will require wqh->lock via remove_wait_queue(),
+                 * which we hold.
+                 */
+                spin_lock(&cgrp->event_list_lock);
+                if (!list_empty(&event->list)) {
+                        list_del_init(&event->list);
+                        /*
+                         * We are in atomic context, but cgroup_event_remove()
+                         * may sleep, so we have to call it in workqueue.
+                         */
+                        schedule_work(&event->remove);
+                }
+                spin_unlock(&cgrp->event_list_lock);
+        }
+
+        return 0;
+}
+
+static void cgroup_event_ptable_queue_proc(struct file *file,
+                wait_queue_head_t *wqh, poll_table *pt)
+{
+        struct cgroup_event *event = container_of(pt,
+                        struct cgroup_event, pt);
+
+        event->wqh = wqh;
+        add_wait_queue(wqh, &event->wait);
+}
+
+/*
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation.
+ */
+static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
+                                      struct cftype *cft, const char *buffer)
+{
+        struct cgroup *cgrp = dummy_css->cgroup;
+        struct cgroup_event *event;
+        struct cgroup_subsys_state *cfile_css;
+        unsigned int efd, cfd;
+        struct fd efile;
+        struct fd cfile;
+        char *endp;
+        int ret;
+
+        efd = simple_strtoul(buffer, &endp, 10);
+        if (*endp != ' ')
+                return -EINVAL;
+        buffer = endp + 1;
+
+        cfd = simple_strtoul(buffer, &endp, 10);
+        if ((*endp != ' ') && (*endp != '\0'))
+                return -EINVAL;
+        buffer = endp + 1;
+
+        event = kzalloc(sizeof(*event), GFP_KERNEL);
+        if (!event)
+                return -ENOMEM;
+
+        INIT_LIST_HEAD(&event->list);
+        init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
+        init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
+        INIT_WORK(&event->remove, cgroup_event_remove);
+
+        efile = fdget(efd);
+        if (!efile.file) {
+                ret = -EBADF;
+                goto out_kfree;
+        }
+
+        event->eventfd = eventfd_ctx_fileget(efile.file);
+        if (IS_ERR(event->eventfd)) {
+                ret = PTR_ERR(event->eventfd);
+                goto out_put_efile;
+        }
+
+        cfile = fdget(cfd);
+        if (!cfile.file) {
+                ret = -EBADF;
+                goto out_put_eventfd;
+        }
+
+        /* the process need read permission on control file */
+        /* AV: shouldn't we check that it's been opened for read instead? */
+        ret = inode_permission(file_inode(cfile.file), MAY_READ);
+        if (ret < 0)
+                goto out_put_cfile;
+
+        event->cft = __file_cft(cfile.file);
+        if (IS_ERR(event->cft)) {
+                ret = PTR_ERR(event->cft);
+                goto out_put_cfile;
+        }
+
+        if (!event->cft->ss) {
+                ret = -EBADF;
+                goto out_put_cfile;
+        }
+
+        /*
+         * Determine the css of @cfile, verify it belongs to the same
+         * cgroup as cgroup.event_control, and associate @event with it.
+         * Remaining events are automatically removed on cgroup destruction
+         * but the removal is asynchronous, so take an extra ref.
+         */
+        rcu_read_lock();
+
+        ret = -EINVAL;
+        event->css = cgroup_css(cgrp, event->cft->ss);
+        cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
+        if (event->css && event->css == cfile_css && css_tryget(event->css))
+                ret = 0;
+
+        rcu_read_unlock();
+        if (ret)
+                goto out_put_cfile;
+
+        if (!event->cft->register_event || !event->cft->unregister_event) {
+                ret = -EINVAL;
+                goto out_put_css;
+        }
+
+        ret = event->cft->register_event(event->css, event->cft,
+                        event->eventfd, buffer);
+        if (ret)
+                goto out_put_css;
+
+        efile.file->f_op->poll(efile.file, &event->pt);
+
+        spin_lock(&cgrp->event_list_lock);
+        list_add(&event->list, &cgrp->event_list);
+        spin_unlock(&cgrp->event_list_lock);
+
+        fdput(cfile);
+        fdput(efile);
+
+        return 0;
+
+out_put_css:
+        css_put(event->css);
+out_put_cfile:
+        fdput(cfile);
+out_put_eventfd:
+        eventfd_ctx_put(event->eventfd);
+out_put_efile:
+        fdput(efile);
+out_kfree:
+        kfree(event);
+
+        return ret;
+}
+
 static struct cftype mem_cgroup_files[] = {
         {
                 .name = "usage_in_bytes",
@@ -5994,6 +6222,12 @@ static struct cftype mem_cgroup_files[] = {
                 .read_u64 = mem_cgroup_hierarchy_read,
         },
         {
+                .name = "cgroup.event_control",
+                .write_string = cgroup_write_event_control,
+                .flags = CFTYPE_NO_PREFIX,
+                .mode = S_IWUGO,
+        },
+        {
                 .name = "swappiness",
                 .read_u64 = mem_cgroup_swappiness_read,
                 .write_u64 = mem_cgroup_swappiness_write,
@@ -6326,6 +6560,20 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 {
         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+        struct cgroup *cgrp = css->cgroup;
+        struct cgroup_event *event, *tmp;
+
+        /*
+         * Unregister events and notify userspace.
+         * Notify userspace about cgroup removing only after rmdir of cgroup
+         * directory to avoid race between userspace and kernelspace.
+         */
+        spin_lock(&cgrp->event_list_lock);
+        list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
+                list_del_init(&event->list);
+                schedule_work(&event->remove);
+        }
+        spin_unlock(&cgrp->event_list_lock);
 
         kmem_cgroup_css_offline(memcg);
 