aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/cgroups/cgroups.txt20
-rw-r--r--include/linux/cgroup.h24
-rw-r--r--include/linux/vmpressure.h8
-rw-r--r--init/Kconfig3
-rw-r--r--kernel/cgroup.c259
-rw-r--r--mm/memcontrol.c353
-rw-r--r--mm/vmpressure.c26
7 files changed, 334 insertions, 359 deletions
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 638bf17ff869..821de56d1580 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -24,7 +24,6 @@ CONTENTS:
24 2.1 Basic Usage 24 2.1 Basic Usage
25 2.2 Attaching processes 25 2.2 Attaching processes
26 2.3 Mounting hierarchies by name 26 2.3 Mounting hierarchies by name
27 2.4 Notification API
283. Kernel API 273. Kernel API
29 3.1 Overview 28 3.1 Overview
30 3.2 Synchronization 29 3.2 Synchronization
@@ -472,25 +471,6 @@ you give a subsystem a name.
472The name of the subsystem appears as part of the hierarchy description 471The name of the subsystem appears as part of the hierarchy description
473in /proc/mounts and /proc/<pid>/cgroups. 472in /proc/mounts and /proc/<pid>/cgroups.
474 473
4752.4 Notification API
476--------------------
477
478There is mechanism which allows to get notifications about changing
479status of a cgroup.
480
481To register a new notification handler you need to:
482 - create a file descriptor for event notification using eventfd(2);
483 - open a control file to be monitored (e.g. memory.usage_in_bytes);
484 - write "<event_fd> <control_fd> <args>" to cgroup.event_control.
485 Interpretation of args is defined by control file implementation;
486
487eventfd will be woken up by control file implementation or when the
488cgroup is removed.
489
490To unregister a notification handler just close eventfd.
491
492NOTE: Support of notifications should be implemented for the control
493file. See documentation for the subsystem.
494 474
4953. Kernel API 4753. Kernel API
496============= 476=============
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 39c1d9469677..492fa01ec2d3 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -29,7 +29,6 @@ struct cgroup_subsys;
29struct inode; 29struct inode;
30struct cgroup; 30struct cgroup;
31struct css_id; 31struct css_id;
32struct eventfd_ctx;
33 32
34extern int cgroup_init_early(void); 33extern int cgroup_init_early(void);
35extern int cgroup_init(void); 34extern int cgroup_init(void);
@@ -239,10 +238,6 @@ struct cgroup {
239 struct rcu_head rcu_head; 238 struct rcu_head rcu_head;
240 struct work_struct destroy_work; 239 struct work_struct destroy_work;
241 240
242 /* List of events which userspace want to receive */
243 struct list_head event_list;
244 spinlock_t event_list_lock;
245
246 /* directory xattrs */ 241 /* directory xattrs */
247 struct simple_xattrs xattrs; 242 struct simple_xattrs xattrs;
248}; 243};
@@ -506,25 +501,6 @@ struct cftype {
506 int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); 501 int (*trigger)(struct cgroup_subsys_state *css, unsigned int event);
507 502
508 int (*release)(struct inode *inode, struct file *file); 503 int (*release)(struct inode *inode, struct file *file);
509
510 /*
511 * register_event() callback will be used to add new userspace
512 * waiter for changes related to the cftype. Implement it if
513 * you want to provide this functionality. Use eventfd_signal()
514 * on eventfd to send notification to userspace.
515 */
516 int (*register_event)(struct cgroup_subsys_state *css,
517 struct cftype *cft, struct eventfd_ctx *eventfd,
518 const char *args);
519 /*
520 * unregister_event() callback will be called when userspace
521 * closes the eventfd or on cgroup removing.
522 * This callback must be implemented, if you want provide
523 * notification functionality.
524 */
525 void (*unregister_event)(struct cgroup_subsys_state *css,
526 struct cftype *cft,
527 struct eventfd_ctx *eventfd);
528}; 504};
529 505
530/* 506/*
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
index 3f3788d49362..3e4535876d37 100644
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -7,6 +7,7 @@
7#include <linux/gfp.h> 7#include <linux/gfp.h>
8#include <linux/types.h> 8#include <linux/types.h>
9#include <linux/cgroup.h> 9#include <linux/cgroup.h>
10#include <linux/eventfd.h>
10 11
11struct vmpressure { 12struct vmpressure {
12 unsigned long scanned; 13 unsigned long scanned;
@@ -33,13 +34,10 @@ extern void vmpressure_init(struct vmpressure *vmpr);
33extern void vmpressure_cleanup(struct vmpressure *vmpr); 34extern void vmpressure_cleanup(struct vmpressure *vmpr);
34extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); 35extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg);
35extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); 36extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr);
36extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css); 37extern int vmpressure_register_event(struct mem_cgroup *memcg,
37extern int vmpressure_register_event(struct cgroup_subsys_state *css,
38 struct cftype *cft,
39 struct eventfd_ctx *eventfd, 38 struct eventfd_ctx *eventfd,
40 const char *args); 39 const char *args);
41extern void vmpressure_unregister_event(struct cgroup_subsys_state *css, 40extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
42 struct cftype *cft,
43 struct eventfd_ctx *eventfd); 41 struct eventfd_ctx *eventfd);
44#else 42#else
45static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, 43static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
diff --git a/init/Kconfig b/init/Kconfig
index 79383d3aa5dc..93f344337172 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -848,7 +848,6 @@ config NUMA_BALANCING
848 848
849menuconfig CGROUPS 849menuconfig CGROUPS
850 boolean "Control Group support" 850 boolean "Control Group support"
851 depends on EVENTFD
852 help 851 help
853 This option adds support for grouping sets of processes together, for 852 This option adds support for grouping sets of processes together, for
854 use with process control subsystems such as Cpusets, CFS, memory 853 use with process control subsystems such as Cpusets, CFS, memory
@@ -915,6 +914,7 @@ config MEMCG
915 bool "Memory Resource Controller for Control Groups" 914 bool "Memory Resource Controller for Control Groups"
916 depends on RESOURCE_COUNTERS 915 depends on RESOURCE_COUNTERS
917 select MM_OWNER 916 select MM_OWNER
917 select EVENTFD
918 help 918 help
919 Provides a memory resource controller that manages both anonymous 919 Provides a memory resource controller that manages both anonymous
920 memory and page cache. (See Documentation/cgroups/memory.txt) 920 memory and page cache. (See Documentation/cgroups/memory.txt)
@@ -1154,7 +1154,6 @@ config UIDGID_STRICT_TYPE_CHECKS
1154 1154
1155config SCHED_AUTOGROUP 1155config SCHED_AUTOGROUP
1156 bool "Automatic process group scheduling" 1156 bool "Automatic process group scheduling"
1157 select EVENTFD
1158 select CGROUPS 1157 select CGROUPS
1159 select CGROUP_SCHED 1158 select CGROUP_SCHED
1160 select FAIR_GROUP_SCHED 1159 select FAIR_GROUP_SCHED
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a7b98ee35ef7..be42967f4f1a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -56,11 +56,8 @@
56#include <linux/pid_namespace.h> 56#include <linux/pid_namespace.h>
57#include <linux/idr.h> 57#include <linux/idr.h>
58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
59#include <linux/eventfd.h>
60#include <linux/poll.h>
61#include <linux/flex_array.h> /* used in cgroup_attach_task */ 59#include <linux/flex_array.h> /* used in cgroup_attach_task */
62#include <linux/kthread.h> 60#include <linux/kthread.h>
63#include <linux/file.h>
64 61
65#include <linux/atomic.h> 62#include <linux/atomic.h>
66 63
@@ -132,36 +129,6 @@ struct cfent {
132 struct simple_xattrs xattrs; 129 struct simple_xattrs xattrs;
133}; 130};
134 131
135/*
136 * cgroup_event represents events which userspace want to receive.
137 */
138struct cgroup_event {
139 /*
140 * css which the event belongs to.
141 */
142 struct cgroup_subsys_state *css;
143 /*
144 * Control file which the event associated.
145 */
146 struct cftype *cft;
147 /*
148 * eventfd to signal userspace about the event.
149 */
150 struct eventfd_ctx *eventfd;
151 /*
152 * Each of these stored in a list by the cgroup.
153 */
154 struct list_head list;
155 /*
156 * All fields below needed to unregister event when
157 * userspace closes eventfd.
158 */
159 poll_table pt;
160 wait_queue_head_t *wqh;
161 wait_queue_t wait;
162 struct work_struct remove;
163};
164
165/* The list of hierarchy roots */ 132/* The list of hierarchy roots */
166 133
167static LIST_HEAD(cgroup_roots); 134static LIST_HEAD(cgroup_roots);
@@ -1351,8 +1318,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1351 INIT_LIST_HEAD(&cgrp->pidlists); 1318 INIT_LIST_HEAD(&cgrp->pidlists);
1352 mutex_init(&cgrp->pidlist_mutex); 1319 mutex_init(&cgrp->pidlist_mutex);
1353 cgrp->dummy_css.cgroup = cgrp; 1320 cgrp->dummy_css.cgroup = cgrp;
1354 INIT_LIST_HEAD(&cgrp->event_list);
1355 spin_lock_init(&cgrp->event_list_lock);
1356 simple_xattrs_init(&cgrp->xattrs); 1321 simple_xattrs_init(&cgrp->xattrs);
1357} 1322}
1358 1323
@@ -2626,16 +2591,6 @@ static const struct inode_operations cgroup_dir_inode_operations = {
2626 .removexattr = cgroup_removexattr, 2591 .removexattr = cgroup_removexattr,
2627}; 2592};
2628 2593
2629/*
2630 * Check if a file is a control file
2631 */
2632static inline struct cftype *__file_cft(struct file *file)
2633{
2634 if (file_inode(file)->i_fop != &cgroup_file_operations)
2635 return ERR_PTR(-EINVAL);
2636 return __d_cft(file->f_dentry);
2637}
2638
2639static int cgroup_create_file(struct dentry *dentry, umode_t mode, 2594static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2640 struct super_block *sb) 2595 struct super_block *sb)
2641{ 2596{
@@ -3915,202 +3870,6 @@ static void cgroup_dput(struct cgroup *cgrp)
3915 deactivate_super(sb); 3870 deactivate_super(sb);
3916} 3871}
3917 3872
3918/*
3919 * Unregister event and free resources.
3920 *
3921 * Gets called from workqueue.
3922 */
3923static void cgroup_event_remove(struct work_struct *work)
3924{
3925 struct cgroup_event *event = container_of(work, struct cgroup_event,
3926 remove);
3927 struct cgroup_subsys_state *css = event->css;
3928
3929 remove_wait_queue(event->wqh, &event->wait);
3930
3931 event->cft->unregister_event(css, event->cft, event->eventfd);
3932
3933 /* Notify userspace the event is going away. */
3934 eventfd_signal(event->eventfd, 1);
3935
3936 eventfd_ctx_put(event->eventfd);
3937 kfree(event);
3938 css_put(css);
3939}
3940
3941/*
3942 * Gets called on POLLHUP on eventfd when user closes it.
3943 *
3944 * Called with wqh->lock held and interrupts disabled.
3945 */
3946static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3947 int sync, void *key)
3948{
3949 struct cgroup_event *event = container_of(wait,
3950 struct cgroup_event, wait);
3951 struct cgroup *cgrp = event->css->cgroup;
3952 unsigned long flags = (unsigned long)key;
3953
3954 if (flags & POLLHUP) {
3955 /*
3956 * If the event has been detached at cgroup removal, we
3957 * can simply return knowing the other side will cleanup
3958 * for us.
3959 *
3960 * We can't race against event freeing since the other
3961 * side will require wqh->lock via remove_wait_queue(),
3962 * which we hold.
3963 */
3964 spin_lock(&cgrp->event_list_lock);
3965 if (!list_empty(&event->list)) {
3966 list_del_init(&event->list);
3967 /*
3968 * We are in atomic context, but cgroup_event_remove()
3969 * may sleep, so we have to call it in workqueue.
3970 */
3971 schedule_work(&event->remove);
3972 }
3973 spin_unlock(&cgrp->event_list_lock);
3974 }
3975
3976 return 0;
3977}
3978
3979static void cgroup_event_ptable_queue_proc(struct file *file,
3980 wait_queue_head_t *wqh, poll_table *pt)
3981{
3982 struct cgroup_event *event = container_of(pt,
3983 struct cgroup_event, pt);
3984
3985 event->wqh = wqh;
3986 add_wait_queue(wqh, &event->wait);
3987}
3988
3989/*
3990 * Parse input and register new cgroup event handler.
3991 *
3992 * Input must be in format '<event_fd> <control_fd> <args>'.
3993 * Interpretation of args is defined by control file implementation.
3994 */
3995static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
3996 struct cftype *cft, const char *buffer)
3997{
3998 struct cgroup *cgrp = dummy_css->cgroup;
3999 struct cgroup_event *event;
4000 struct cgroup_subsys_state *cfile_css;
4001 unsigned int efd, cfd;
4002 struct fd efile;
4003 struct fd cfile;
4004 char *endp;
4005 int ret;
4006
4007 efd = simple_strtoul(buffer, &endp, 10);
4008 if (*endp != ' ')
4009 return -EINVAL;
4010 buffer = endp + 1;
4011
4012 cfd = simple_strtoul(buffer, &endp, 10);
4013 if ((*endp != ' ') && (*endp != '\0'))
4014 return -EINVAL;
4015 buffer = endp + 1;
4016
4017 event = kzalloc(sizeof(*event), GFP_KERNEL);
4018 if (!event)
4019 return -ENOMEM;
4020
4021 INIT_LIST_HEAD(&event->list);
4022 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
4023 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
4024 INIT_WORK(&event->remove, cgroup_event_remove);
4025
4026 efile = fdget(efd);
4027 if (!efile.file) {
4028 ret = -EBADF;
4029 goto out_kfree;
4030 }
4031
4032 event->eventfd = eventfd_ctx_fileget(efile.file);
4033 if (IS_ERR(event->eventfd)) {
4034 ret = PTR_ERR(event->eventfd);
4035 goto out_put_efile;
4036 }
4037
4038 cfile = fdget(cfd);
4039 if (!cfile.file) {
4040 ret = -EBADF;
4041 goto out_put_eventfd;
4042 }
4043
4044 /* the process need read permission on control file */
4045 /* AV: shouldn't we check that it's been opened for read instead? */
4046 ret = inode_permission(file_inode(cfile.file), MAY_READ);
4047 if (ret < 0)
4048 goto out_put_cfile;
4049
4050 event->cft = __file_cft(cfile.file);
4051 if (IS_ERR(event->cft)) {
4052 ret = PTR_ERR(event->cft);
4053 goto out_put_cfile;
4054 }
4055
4056 if (!event->cft->ss) {
4057 ret = -EBADF;
4058 goto out_put_cfile;
4059 }
4060
4061 /*
4062 * Determine the css of @cfile, verify it belongs to the same
4063 * cgroup as cgroup.event_control, and associate @event with it.
4064 * Remaining events are automatically removed on cgroup destruction
4065 * but the removal is asynchronous, so take an extra ref.
4066 */
4067 rcu_read_lock();
4068
4069 ret = -EINVAL;
4070 event->css = cgroup_css(cgrp, event->cft->ss);
4071 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
4072 if (event->css && event->css == cfile_css && css_tryget(event->css))
4073 ret = 0;
4074
4075 rcu_read_unlock();
4076 if (ret)
4077 goto out_put_cfile;
4078
4079 if (!event->cft->register_event || !event->cft->unregister_event) {
4080 ret = -EINVAL;
4081 goto out_put_css;
4082 }
4083
4084 ret = event->cft->register_event(event->css, event->cft,
4085 event->eventfd, buffer);
4086 if (ret)
4087 goto out_put_css;
4088
4089 efile.file->f_op->poll(efile.file, &event->pt);
4090
4091 spin_lock(&cgrp->event_list_lock);
4092 list_add(&event->list, &cgrp->event_list);
4093 spin_unlock(&cgrp->event_list_lock);
4094
4095 fdput(cfile);
4096 fdput(efile);
4097
4098 return 0;
4099
4100out_put_css:
4101 css_put(event->css);
4102out_put_cfile:
4103 fdput(cfile);
4104out_put_eventfd:
4105 eventfd_ctx_put(event->eventfd);
4106out_put_efile:
4107 fdput(efile);
4108out_kfree:
4109 kfree(event);
4110
4111 return ret;
4112}
4113
4114static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, 3873static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4115 struct cftype *cft) 3874 struct cftype *cft)
4116{ 3875{
@@ -4136,11 +3895,6 @@ static struct cftype cgroup_base_files[] = {
4136 .mode = S_IRUGO | S_IWUSR, 3895 .mode = S_IRUGO | S_IWUSR,
4137 }, 3896 },
4138 { 3897 {
4139 .name = "cgroup.event_control",
4140 .write_string = cgroup_write_event_control,
4141 .mode = S_IWUGO,
4142 },
4143 {
4144 .name = "cgroup.clone_children", 3898 .name = "cgroup.clone_children",
4145 .flags = CFTYPE_INSANE, 3899 .flags = CFTYPE_INSANE,
4146 .read_u64 = cgroup_clone_children_read, 3900 .read_u64 = cgroup_clone_children_read,
@@ -4610,7 +4364,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4610 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4364 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4611{ 4365{
4612 struct dentry *d = cgrp->dentry; 4366 struct dentry *d = cgrp->dentry;
4613 struct cgroup_event *event, *tmp;
4614 struct cgroup_subsys *ss; 4367 struct cgroup_subsys *ss;
4615 struct cgroup *child; 4368 struct cgroup *child;
4616 bool empty; 4369 bool empty;
@@ -4685,18 +4438,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4685 dget(d); 4438 dget(d);
4686 cgroup_d_remove_dir(d); 4439 cgroup_d_remove_dir(d);
4687 4440
4688 /*
4689 * Unregister events and notify userspace.
4690 * Notify userspace about cgroup removing only after rmdir of cgroup
4691 * directory to avoid race between userspace and kernelspace.
4692 */
4693 spin_lock(&cgrp->event_list_lock);
4694 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
4695 list_del_init(&event->list);
4696 schedule_work(&event->remove);
4697 }
4698 spin_unlock(&cgrp->event_list_lock);
4699
4700 return 0; 4441 return 0;
4701}; 4442};
4702 4443
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f1a0ae6e11b8..7aa0d405b148 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -45,6 +45,7 @@
45#include <linux/swapops.h> 45#include <linux/swapops.h>
46#include <linux/spinlock.h> 46#include <linux/spinlock.h>
47#include <linux/eventfd.h> 47#include <linux/eventfd.h>
48#include <linux/poll.h>
48#include <linux/sort.h> 49#include <linux/sort.h>
49#include <linux/fs.h> 50#include <linux/fs.h>
50#include <linux/seq_file.h> 51#include <linux/seq_file.h>
@@ -55,6 +56,7 @@
55#include <linux/cpu.h> 56#include <linux/cpu.h>
56#include <linux/oom.h> 57#include <linux/oom.h>
57#include <linux/lockdep.h> 58#include <linux/lockdep.h>
59#include <linux/file.h>
58#include "internal.h" 60#include "internal.h"
59#include <net/sock.h> 61#include <net/sock.h>
60#include <net/ip.h> 62#include <net/ip.h>
@@ -227,6 +229,46 @@ struct mem_cgroup_eventfd_list {
227 struct eventfd_ctx *eventfd; 229 struct eventfd_ctx *eventfd;
228}; 230};
229 231
232/*
233 * cgroup_event represents events which userspace want to receive.
234 */
235struct mem_cgroup_event {
236 /*
237 * memcg which the event belongs to.
238 */
239 struct mem_cgroup *memcg;
240 /*
241 * eventfd to signal userspace about the event.
242 */
243 struct eventfd_ctx *eventfd;
244 /*
245 * Each of these stored in a list by the cgroup.
246 */
247 struct list_head list;
248 /*
249 * register_event() callback will be used to add new userspace
250 * waiter for changes related to this event. Use eventfd_signal()
251 * on eventfd to send notification to userspace.
252 */
253 int (*register_event)(struct mem_cgroup *memcg,
254 struct eventfd_ctx *eventfd, const char *args);
255 /*
256 * unregister_event() callback will be called when userspace closes
257 * the eventfd or on cgroup removing. This callback must be set,
258 * if you want provide notification functionality.
259 */
260 void (*unregister_event)(struct mem_cgroup *memcg,
261 struct eventfd_ctx *eventfd);
262 /*
263 * All fields below needed to unregister event when
264 * userspace closes eventfd.
265 */
266 poll_table pt;
267 wait_queue_head_t *wqh;
268 wait_queue_t wait;
269 struct work_struct remove;
270};
271
230static void mem_cgroup_threshold(struct mem_cgroup *memcg); 272static void mem_cgroup_threshold(struct mem_cgroup *memcg);
231static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 273static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
232 274
@@ -331,6 +373,10 @@ struct mem_cgroup {
331 atomic_t numainfo_updating; 373 atomic_t numainfo_updating;
332#endif 374#endif
333 375
376 /* List of events which userspace want to receive */
377 struct list_head event_list;
378 spinlock_t event_list_lock;
379
334 struct mem_cgroup_per_node *nodeinfo[0]; 380 struct mem_cgroup_per_node *nodeinfo[0];
335 /* WARNING: nodeinfo must be the last member here */ 381 /* WARNING: nodeinfo must be the last member here */
336}; 382};
@@ -490,11 +536,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
490 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 536 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
491} 537}
492 538
493struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
494{
495 return &mem_cgroup_from_css(css)->vmpressure;
496}
497
498static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 539static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
499{ 540{
500 return (memcg == root_mem_cgroup); 541 return (memcg == root_mem_cgroup);
@@ -5648,13 +5689,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
5648 mem_cgroup_oom_notify_cb(iter); 5689 mem_cgroup_oom_notify_cb(iter);
5649} 5690}
5650 5691
5651static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, 5692static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5652 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5693 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
5653{ 5694{
5654 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5655 struct mem_cgroup_thresholds *thresholds; 5695 struct mem_cgroup_thresholds *thresholds;
5656 struct mem_cgroup_threshold_ary *new; 5696 struct mem_cgroup_threshold_ary *new;
5657 enum res_type type = MEMFILE_TYPE(cft->private);
5658 u64 threshold, usage; 5697 u64 threshold, usage;
5659 int i, size, ret; 5698 int i, size, ret;
5660 5699
@@ -5731,13 +5770,23 @@ unlock:
5731 return ret; 5770 return ret;
5732} 5771}
5733 5772
5734static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, 5773static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5735 struct cftype *cft, struct eventfd_ctx *eventfd) 5774 struct eventfd_ctx *eventfd, const char *args)
5775{
5776 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
5777}
5778
5779static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
5780 struct eventfd_ctx *eventfd, const char *args)
5781{
5782 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
5783}
5784
5785static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5786 struct eventfd_ctx *eventfd, enum res_type type)
5736{ 5787{
5737 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5738 struct mem_cgroup_thresholds *thresholds; 5788 struct mem_cgroup_thresholds *thresholds;
5739 struct mem_cgroup_threshold_ary *new; 5789 struct mem_cgroup_threshold_ary *new;
5740 enum res_type type = MEMFILE_TYPE(cft->private);
5741 u64 usage; 5790 u64 usage;
5742 int i, j, size; 5791 int i, j, size;
5743 5792
@@ -5810,14 +5859,23 @@ unlock:
5810 mutex_unlock(&memcg->thresholds_lock); 5859 mutex_unlock(&memcg->thresholds_lock);
5811} 5860}
5812 5861
5813static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, 5862static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5814 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5863 struct eventfd_ctx *eventfd)
5864{
5865 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
5866}
5867
5868static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5869 struct eventfd_ctx *eventfd)
5870{
5871 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
5872}
5873
5874static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
5875 struct eventfd_ctx *eventfd, const char *args)
5815{ 5876{
5816 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5817 struct mem_cgroup_eventfd_list *event; 5877 struct mem_cgroup_eventfd_list *event;
5818 enum res_type type = MEMFILE_TYPE(cft->private);
5819 5878
5820 BUG_ON(type != _OOM_TYPE);
5821 event = kmalloc(sizeof(*event), GFP_KERNEL); 5879 event = kmalloc(sizeof(*event), GFP_KERNEL);
5822 if (!event) 5880 if (!event)
5823 return -ENOMEM; 5881 return -ENOMEM;
@@ -5835,14 +5893,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
5835 return 0; 5893 return 0;
5836} 5894}
5837 5895
5838static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, 5896static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
5839 struct cftype *cft, struct eventfd_ctx *eventfd) 5897 struct eventfd_ctx *eventfd)
5840{ 5898{
5841 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5842 struct mem_cgroup_eventfd_list *ev, *tmp; 5899 struct mem_cgroup_eventfd_list *ev, *tmp;
5843 enum res_type type = MEMFILE_TYPE(cft->private);
5844
5845 BUG_ON(type != _OOM_TYPE);
5846 5900
5847 spin_lock(&memcg_oom_lock); 5901 spin_lock(&memcg_oom_lock);
5848 5902
@@ -5959,13 +6013,233 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5959} 6013}
5960#endif 6014#endif
5961 6015
6016/*
6017 * DO NOT USE IN NEW FILES.
6018 *
6019 * "cgroup.event_control" implementation.
6020 *
6021 * This is way over-engineered. It tries to support fully configurable
6022 * events for each user. Such level of flexibility is completely
6023 * unnecessary especially in the light of the planned unified hierarchy.
6024 *
6025 * Please deprecate this and replace with something simpler if at all
6026 * possible.
6027 */
6028
6029/*
6030 * Unregister event and free resources.
6031 *
6032 * Gets called from workqueue.
6033 */
6034static void memcg_event_remove(struct work_struct *work)
6035{
6036 struct mem_cgroup_event *event =
6037 container_of(work, struct mem_cgroup_event, remove);
6038 struct mem_cgroup *memcg = event->memcg;
6039
6040 remove_wait_queue(event->wqh, &event->wait);
6041
6042 event->unregister_event(memcg, event->eventfd);
6043
6044 /* Notify userspace the event is going away. */
6045 eventfd_signal(event->eventfd, 1);
6046
6047 eventfd_ctx_put(event->eventfd);
6048 kfree(event);
6049 css_put(&memcg->css);
6050}
6051
6052/*
6053 * Gets called on POLLHUP on eventfd when user closes it.
6054 *
6055 * Called with wqh->lock held and interrupts disabled.
6056 */
6057static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
6058 int sync, void *key)
6059{
6060 struct mem_cgroup_event *event =
6061 container_of(wait, struct mem_cgroup_event, wait);
6062 struct mem_cgroup *memcg = event->memcg;
6063 unsigned long flags = (unsigned long)key;
6064
6065 if (flags & POLLHUP) {
6066 /*
6067 * If the event has been detached at cgroup removal, we
6068 * can simply return knowing the other side will cleanup
6069 * for us.
6070 *
6071 * We can't race against event freeing since the other
6072 * side will require wqh->lock via remove_wait_queue(),
6073 * which we hold.
6074 */
6075 spin_lock(&memcg->event_list_lock);
6076 if (!list_empty(&event->list)) {
6077 list_del_init(&event->list);
6078 /*
6079 * We are in atomic context, but cgroup_event_remove()
6080 * may sleep, so we have to call it in workqueue.
6081 */
6082 schedule_work(&event->remove);
6083 }
6084 spin_unlock(&memcg->event_list_lock);
6085 }
6086
6087 return 0;
6088}
6089
6090static void memcg_event_ptable_queue_proc(struct file *file,
6091 wait_queue_head_t *wqh, poll_table *pt)
6092{
6093 struct mem_cgroup_event *event =
6094 container_of(pt, struct mem_cgroup_event, pt);
6095
6096 event->wqh = wqh;
6097 add_wait_queue(wqh, &event->wait);
6098}
6099
6100/*
6101 * DO NOT USE IN NEW FILES.
6102 *
6103 * Parse input and register new cgroup event handler.
6104 *
6105 * Input must be in format '<event_fd> <control_fd> <args>'.
6106 * Interpretation of args is defined by control file implementation.
6107 */
6108static int memcg_write_event_control(struct cgroup_subsys_state *css,
6109 struct cftype *cft, const char *buffer)
6110{
6111 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6112 struct mem_cgroup_event *event;
6113 struct cgroup_subsys_state *cfile_css;
6114 unsigned int efd, cfd;
6115 struct fd efile;
6116 struct fd cfile;
6117 const char *name;
6118 char *endp;
6119 int ret;
6120
6121 efd = simple_strtoul(buffer, &endp, 10);
6122 if (*endp != ' ')
6123 return -EINVAL;
6124 buffer = endp + 1;
6125
6126 cfd = simple_strtoul(buffer, &endp, 10);
6127 if ((*endp != ' ') && (*endp != '\0'))
6128 return -EINVAL;
6129 buffer = endp + 1;
6130
6131 event = kzalloc(sizeof(*event), GFP_KERNEL);
6132 if (!event)
6133 return -ENOMEM;
6134
6135 event->memcg = memcg;
6136 INIT_LIST_HEAD(&event->list);
6137 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
6138 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
6139 INIT_WORK(&event->remove, memcg_event_remove);
6140
6141 efile = fdget(efd);
6142 if (!efile.file) {
6143 ret = -EBADF;
6144 goto out_kfree;
6145 }
6146
6147 event->eventfd = eventfd_ctx_fileget(efile.file);
6148 if (IS_ERR(event->eventfd)) {
6149 ret = PTR_ERR(event->eventfd);
6150 goto out_put_efile;
6151 }
6152
6153 cfile = fdget(cfd);
6154 if (!cfile.file) {
6155 ret = -EBADF;
6156 goto out_put_eventfd;
6157 }
6158
6159 /* the process need read permission on control file */
6160 /* AV: shouldn't we check that it's been opened for read instead? */
6161 ret = inode_permission(file_inode(cfile.file), MAY_READ);
6162 if (ret < 0)
6163 goto out_put_cfile;
6164
6165 /*
6166 * Determine the event callbacks and set them in @event. This used
6167 * to be done via struct cftype but cgroup core no longer knows
6168 * about these events. The following is crude but the whole thing
6169 * is for compatibility anyway.
6170 *
6171 * DO NOT ADD NEW FILES.
6172 */
6173 name = cfile.file->f_dentry->d_name.name;
6174
6175 if (!strcmp(name, "memory.usage_in_bytes")) {
6176 event->register_event = mem_cgroup_usage_register_event;
6177 event->unregister_event = mem_cgroup_usage_unregister_event;
6178 } else if (!strcmp(name, "memory.oom_control")) {
6179 event->register_event = mem_cgroup_oom_register_event;
6180 event->unregister_event = mem_cgroup_oom_unregister_event;
6181 } else if (!strcmp(name, "memory.pressure_level")) {
6182 event->register_event = vmpressure_register_event;
6183 event->unregister_event = vmpressure_unregister_event;
6184 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
6185 event->register_event = memsw_cgroup_usage_register_event;
6186 event->unregister_event = memsw_cgroup_usage_unregister_event;
6187 } else {
6188 ret = -EINVAL;
6189 goto out_put_cfile;
6190 }
6191
6192 /*
6193 * Verify @cfile should belong to @css. Also, remaining events are
6194 * automatically removed on cgroup destruction but the removal is
6195 * asynchronous, so take an extra ref on @css.
6196 */
6197 rcu_read_lock();
6198
6199 ret = -EINVAL;
6200 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent,
6201 &mem_cgroup_subsys);
6202 if (cfile_css == css && css_tryget(css))
6203 ret = 0;
6204
6205 rcu_read_unlock();
6206 if (ret)
6207 goto out_put_cfile;
6208
6209 ret = event->register_event(memcg, event->eventfd, buffer);
6210 if (ret)
6211 goto out_put_css;
6212
6213 efile.file->f_op->poll(efile.file, &event->pt);
6214
6215 spin_lock(&memcg->event_list_lock);
6216 list_add(&event->list, &memcg->event_list);
6217 spin_unlock(&memcg->event_list_lock);
6218
6219 fdput(cfile);
6220 fdput(efile);
6221
6222 return 0;
6223
6224out_put_css:
6225 css_put(css);
6226out_put_cfile:
6227 fdput(cfile);
6228out_put_eventfd:
6229 eventfd_ctx_put(event->eventfd);
6230out_put_efile:
6231 fdput(efile);
6232out_kfree:
6233 kfree(event);
6234
6235 return ret;
6236}
6237
5962static struct cftype mem_cgroup_files[] = { 6238static struct cftype mem_cgroup_files[] = {
5963 { 6239 {
5964 .name = "usage_in_bytes", 6240 .name = "usage_in_bytes",
5965 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 6241 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
5966 .read = mem_cgroup_read, 6242 .read = mem_cgroup_read,
5967 .register_event = mem_cgroup_usage_register_event,
5968 .unregister_event = mem_cgroup_usage_unregister_event,
5969 }, 6243 },
5970 { 6244 {
5971 .name = "max_usage_in_bytes", 6245 .name = "max_usage_in_bytes",
@@ -6006,6 +6280,12 @@ static struct cftype mem_cgroup_files[] = {
6006 .read_u64 = mem_cgroup_hierarchy_read, 6280 .read_u64 = mem_cgroup_hierarchy_read,
6007 }, 6281 },
6008 { 6282 {
6283 .name = "cgroup.event_control", /* XXX: for compat */
6284 .write_string = memcg_write_event_control,
6285 .flags = CFTYPE_NO_PREFIX,
6286 .mode = S_IWUGO,
6287 },
6288 {
6009 .name = "swappiness", 6289 .name = "swappiness",
6010 .read_u64 = mem_cgroup_swappiness_read, 6290 .read_u64 = mem_cgroup_swappiness_read,
6011 .write_u64 = mem_cgroup_swappiness_write, 6291 .write_u64 = mem_cgroup_swappiness_write,
@@ -6019,14 +6299,10 @@ static struct cftype mem_cgroup_files[] = {
6019 .name = "oom_control", 6299 .name = "oom_control",
6020 .read_map = mem_cgroup_oom_control_read, 6300 .read_map = mem_cgroup_oom_control_read,
6021 .write_u64 = mem_cgroup_oom_control_write, 6301 .write_u64 = mem_cgroup_oom_control_write,
6022 .register_event = mem_cgroup_oom_register_event,
6023 .unregister_event = mem_cgroup_oom_unregister_event,
6024 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 6302 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
6025 }, 6303 },
6026 { 6304 {
6027 .name = "pressure_level", 6305 .name = "pressure_level",
6028 .register_event = vmpressure_register_event,
6029 .unregister_event = vmpressure_unregister_event,
6030 }, 6306 },
6031#ifdef CONFIG_NUMA 6307#ifdef CONFIG_NUMA
6032 { 6308 {
@@ -6074,8 +6350,6 @@ static struct cftype memsw_cgroup_files[] = {
6074 .name = "memsw.usage_in_bytes", 6350 .name = "memsw.usage_in_bytes",
6075 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 6351 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
6076 .read = mem_cgroup_read, 6352 .read = mem_cgroup_read,
6077 .register_event = mem_cgroup_usage_register_event,
6078 .unregister_event = mem_cgroup_usage_unregister_event,
6079 }, 6353 },
6080 { 6354 {
6081 .name = "memsw.max_usage_in_bytes", 6355 .name = "memsw.max_usage_in_bytes",
@@ -6265,6 +6539,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6265 mutex_init(&memcg->thresholds_lock); 6539 mutex_init(&memcg->thresholds_lock);
6266 spin_lock_init(&memcg->move_lock); 6540 spin_lock_init(&memcg->move_lock);
6267 vmpressure_init(&memcg->vmpressure); 6541 vmpressure_init(&memcg->vmpressure);
6542 INIT_LIST_HEAD(&memcg->event_list);
6543 spin_lock_init(&memcg->event_list_lock);
6268 6544
6269 return &memcg->css; 6545 return &memcg->css;
6270 6546
@@ -6340,6 +6616,19 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6340static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 6616static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6341{ 6617{
6342 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6618 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6619 struct mem_cgroup_event *event, *tmp;
6620
6621 /*
6622 * Unregister events and notify userspace.
6623 * Notify userspace about cgroup removing only after rmdir of cgroup
6624 * directory to avoid race between userspace and kernelspace.
6625 */
6626 spin_lock(&memcg->event_list_lock);
6627 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
6628 list_del_init(&event->list);
6629 schedule_work(&event->remove);
6630 }
6631 spin_unlock(&memcg->event_list_lock);
6343 6632
6344 kmem_cgroup_css_offline(memcg); 6633 kmem_cgroup_css_offline(memcg);
6345 6634
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index e0f62837c3f4..196970a4541f 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -278,8 +278,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
278 278
279/** 279/**
280 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd 280 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
281 * @css: css that is interested in vmpressure notifications 281 * @memcg: memcg that is interested in vmpressure notifications
282 * @cft: cgroup control files handle
283 * @eventfd: eventfd context to link notifications with 282 * @eventfd: eventfd context to link notifications with
284 * @args: event arguments (used to set up a pressure level threshold) 283 * @args: event arguments (used to set up a pressure level threshold)
285 * 284 *
@@ -289,15 +288,12 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
289 * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or 288 * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
290 * "critical"). 289 * "critical").
291 * 290 *
292 * This function should not be used directly, just pass it to (struct 291 * To be used as memcg event method.
293 * cftype).register_event, and then cgroup core will handle everything by
294 * itself.
295 */ 292 */
296int vmpressure_register_event(struct cgroup_subsys_state *css, 293int vmpressure_register_event(struct mem_cgroup *memcg,
297 struct cftype *cft, struct eventfd_ctx *eventfd, 294 struct eventfd_ctx *eventfd, const char *args)
298 const char *args)
299{ 295{
300 struct vmpressure *vmpr = css_to_vmpressure(css); 296 struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
301 struct vmpressure_event *ev; 297 struct vmpressure_event *ev;
302 int level; 298 int level;
303 299
@@ -325,23 +321,19 @@ int vmpressure_register_event(struct cgroup_subsys_state *css,
325 321
326/** 322/**
327 * vmpressure_unregister_event() - Unbind eventfd from vmpressure 323 * vmpressure_unregister_event() - Unbind eventfd from vmpressure
328 * @css: css handle 324 * @memcg: memcg handle
329 * @cft: cgroup control files handle
330 * @eventfd: eventfd context that was used to link vmpressure with the @cg 325 * @eventfd: eventfd context that was used to link vmpressure with the @memcg
331 * 326 *
332 * This function does internal manipulations to detach the @eventfd from 327 * This function does internal manipulations to detach the @eventfd from
333 * the vmpressure notifications, and then frees internal resources 328 * the vmpressure notifications, and then frees internal resources
334 * associated with the @eventfd (but the @eventfd itself is not freed). 329 * associated with the @eventfd (but the @eventfd itself is not freed).
335 * 330 *
336 * This function should not be used directly, just pass it to (struct 331 * To be used as memcg event method.
337 * cftype).unregister_event, and then cgroup core will handle everything
338 * by itself.
339 */ 332 */
340void vmpressure_unregister_event(struct cgroup_subsys_state *css, 333void vmpressure_unregister_event(struct mem_cgroup *memcg,
341 struct cftype *cft,
342 struct eventfd_ctx *eventfd) 334 struct eventfd_ctx *eventfd)
343{ 335{
344 struct vmpressure *vmpr = css_to_vmpressure(css); 336 struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
345 struct vmpressure_event *ev; 337 struct vmpressure_event *ev;
346 338
347 mutex_lock(&vmpr->events_lock); 339 mutex_lock(&vmpr->events_lock);