diff options
-rw-r--r-- | Documentation/cgroups/cgroups.txt | 20 | ||||
-rw-r--r-- | include/linux/cgroup.h | 24 | ||||
-rw-r--r-- | include/linux/vmpressure.h | 8 | ||||
-rw-r--r-- | init/Kconfig | 3 | ||||
-rw-r--r-- | kernel/cgroup.c | 259 | ||||
-rw-r--r-- | mm/memcontrol.c | 353 | ||||
-rw-r--r-- | mm/vmpressure.c | 26 |
7 files changed, 334 insertions, 359 deletions
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 638bf17ff869..821de56d1580 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt | |||
@@ -24,7 +24,6 @@ CONTENTS: | |||
24 | 2.1 Basic Usage | 24 | 2.1 Basic Usage |
25 | 2.2 Attaching processes | 25 | 2.2 Attaching processes |
26 | 2.3 Mounting hierarchies by name | 26 | 2.3 Mounting hierarchies by name |
27 | 2.4 Notification API | ||
28 | 3. Kernel API | 27 | 3. Kernel API |
29 | 3.1 Overview | 28 | 3.1 Overview |
30 | 3.2 Synchronization | 29 | 3.2 Synchronization |
@@ -472,25 +471,6 @@ you give a subsystem a name. | |||
472 | The name of the subsystem appears as part of the hierarchy description | 471 | The name of the subsystem appears as part of the hierarchy description |
473 | in /proc/mounts and /proc/<pid>/cgroups. | 472 | in /proc/mounts and /proc/<pid>/cgroups. |
474 | 473 | ||
475 | 2.4 Notification API | ||
476 | -------------------- | ||
477 | |||
478 | There is mechanism which allows to get notifications about changing | ||
479 | status of a cgroup. | ||
480 | |||
481 | To register a new notification handler you need to: | ||
482 | - create a file descriptor for event notification using eventfd(2); | ||
483 | - open a control file to be monitored (e.g. memory.usage_in_bytes); | ||
484 | - write "<event_fd> <control_fd> <args>" to cgroup.event_control. | ||
485 | Interpretation of args is defined by control file implementation; | ||
486 | |||
487 | eventfd will be woken up by control file implementation or when the | ||
488 | cgroup is removed. | ||
489 | |||
490 | To unregister a notification handler just close eventfd. | ||
491 | |||
492 | NOTE: Support of notifications should be implemented for the control | ||
493 | file. See documentation for the subsystem. | ||
494 | 474 | ||
495 | 3. Kernel API | 475 | 3. Kernel API |
496 | ============= | 476 | ============= |
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 39c1d9469677..492fa01ec2d3 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h | |||
@@ -29,7 +29,6 @@ struct cgroup_subsys; | |||
29 | struct inode; | 29 | struct inode; |
30 | struct cgroup; | 30 | struct cgroup; |
31 | struct css_id; | 31 | struct css_id; |
32 | struct eventfd_ctx; | ||
33 | 32 | ||
34 | extern int cgroup_init_early(void); | 33 | extern int cgroup_init_early(void); |
35 | extern int cgroup_init(void); | 34 | extern int cgroup_init(void); |
@@ -239,10 +238,6 @@ struct cgroup { | |||
239 | struct rcu_head rcu_head; | 238 | struct rcu_head rcu_head; |
240 | struct work_struct destroy_work; | 239 | struct work_struct destroy_work; |
241 | 240 | ||
242 | /* List of events which userspace want to receive */ | ||
243 | struct list_head event_list; | ||
244 | spinlock_t event_list_lock; | ||
245 | |||
246 | /* directory xattrs */ | 241 | /* directory xattrs */ |
247 | struct simple_xattrs xattrs; | 242 | struct simple_xattrs xattrs; |
248 | }; | 243 | }; |
@@ -506,25 +501,6 @@ struct cftype { | |||
506 | int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); | 501 | int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); |
507 | 502 | ||
508 | int (*release)(struct inode *inode, struct file *file); | 503 | int (*release)(struct inode *inode, struct file *file); |
509 | |||
510 | /* | ||
511 | * register_event() callback will be used to add new userspace | ||
512 | * waiter for changes related to the cftype. Implement it if | ||
513 | * you want to provide this functionality. Use eventfd_signal() | ||
514 | * on eventfd to send notification to userspace. | ||
515 | */ | ||
516 | int (*register_event)(struct cgroup_subsys_state *css, | ||
517 | struct cftype *cft, struct eventfd_ctx *eventfd, | ||
518 | const char *args); | ||
519 | /* | ||
520 | * unregister_event() callback will be called when userspace | ||
521 | * closes the eventfd or on cgroup removing. | ||
522 | * This callback must be implemented, if you want provide | ||
523 | * notification functionality. | ||
524 | */ | ||
525 | void (*unregister_event)(struct cgroup_subsys_state *css, | ||
526 | struct cftype *cft, | ||
527 | struct eventfd_ctx *eventfd); | ||
528 | }; | 504 | }; |
529 | 505 | ||
530 | /* | 506 | /* |
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 3f3788d49362..3e4535876d37 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h | |||
@@ -7,6 +7,7 @@ | |||
7 | #include <linux/gfp.h> | 7 | #include <linux/gfp.h> |
8 | #include <linux/types.h> | 8 | #include <linux/types.h> |
9 | #include <linux/cgroup.h> | 9 | #include <linux/cgroup.h> |
10 | #include <linux/eventfd.h> | ||
10 | 11 | ||
11 | struct vmpressure { | 12 | struct vmpressure { |
12 | unsigned long scanned; | 13 | unsigned long scanned; |
@@ -33,13 +34,10 @@ extern void vmpressure_init(struct vmpressure *vmpr); | |||
33 | extern void vmpressure_cleanup(struct vmpressure *vmpr); | 34 | extern void vmpressure_cleanup(struct vmpressure *vmpr); |
34 | extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); | 35 | extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); |
35 | extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); | 36 | extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); |
36 | extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css); | 37 | extern int vmpressure_register_event(struct mem_cgroup *memcg, |
37 | extern int vmpressure_register_event(struct cgroup_subsys_state *css, | ||
38 | struct cftype *cft, | ||
39 | struct eventfd_ctx *eventfd, | 38 | struct eventfd_ctx *eventfd, |
40 | const char *args); | 39 | const char *args); |
41 | extern void vmpressure_unregister_event(struct cgroup_subsys_state *css, | 40 | extern void vmpressure_unregister_event(struct mem_cgroup *memcg, |
42 | struct cftype *cft, | ||
43 | struct eventfd_ctx *eventfd); | 41 | struct eventfd_ctx *eventfd); |
44 | #else | 42 | #else |
45 | static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, | 43 | static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, |
diff --git a/init/Kconfig b/init/Kconfig index 79383d3aa5dc..93f344337172 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -848,7 +848,6 @@ config NUMA_BALANCING | |||
848 | 848 | ||
849 | menuconfig CGROUPS | 849 | menuconfig CGROUPS |
850 | boolean "Control Group support" | 850 | boolean "Control Group support" |
851 | depends on EVENTFD | ||
852 | help | 851 | help |
853 | This option adds support for grouping sets of processes together, for | 852 | This option adds support for grouping sets of processes together, for |
854 | use with process control subsystems such as Cpusets, CFS, memory | 853 | use with process control subsystems such as Cpusets, CFS, memory |
@@ -915,6 +914,7 @@ config MEMCG | |||
915 | bool "Memory Resource Controller for Control Groups" | 914 | bool "Memory Resource Controller for Control Groups" |
916 | depends on RESOURCE_COUNTERS | 915 | depends on RESOURCE_COUNTERS |
917 | select MM_OWNER | 916 | select MM_OWNER |
917 | select EVENTFD | ||
918 | help | 918 | help |
919 | Provides a memory resource controller that manages both anonymous | 919 | Provides a memory resource controller that manages both anonymous |
920 | memory and page cache. (See Documentation/cgroups/memory.txt) | 920 | memory and page cache. (See Documentation/cgroups/memory.txt) |
@@ -1154,7 +1154,6 @@ config UIDGID_STRICT_TYPE_CHECKS | |||
1154 | 1154 | ||
1155 | config SCHED_AUTOGROUP | 1155 | config SCHED_AUTOGROUP |
1156 | bool "Automatic process group scheduling" | 1156 | bool "Automatic process group scheduling" |
1157 | select EVENTFD | ||
1158 | select CGROUPS | 1157 | select CGROUPS |
1159 | select CGROUP_SCHED | 1158 | select CGROUP_SCHED |
1160 | select FAIR_GROUP_SCHED | 1159 | select FAIR_GROUP_SCHED |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a7b98ee35ef7..be42967f4f1a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -56,11 +56,8 @@ | |||
56 | #include <linux/pid_namespace.h> | 56 | #include <linux/pid_namespace.h> |
57 | #include <linux/idr.h> | 57 | #include <linux/idr.h> |
58 | #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ | 58 | #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ |
59 | #include <linux/eventfd.h> | ||
60 | #include <linux/poll.h> | ||
61 | #include <linux/flex_array.h> /* used in cgroup_attach_task */ | 59 | #include <linux/flex_array.h> /* used in cgroup_attach_task */ |
62 | #include <linux/kthread.h> | 60 | #include <linux/kthread.h> |
63 | #include <linux/file.h> | ||
64 | 61 | ||
65 | #include <linux/atomic.h> | 62 | #include <linux/atomic.h> |
66 | 63 | ||
@@ -132,36 +129,6 @@ struct cfent { | |||
132 | struct simple_xattrs xattrs; | 129 | struct simple_xattrs xattrs; |
133 | }; | 130 | }; |
134 | 131 | ||
135 | /* | ||
136 | * cgroup_event represents events which userspace want to receive. | ||
137 | */ | ||
138 | struct cgroup_event { | ||
139 | /* | ||
140 | * css which the event belongs to. | ||
141 | */ | ||
142 | struct cgroup_subsys_state *css; | ||
143 | /* | ||
144 | * Control file which the event associated. | ||
145 | */ | ||
146 | struct cftype *cft; | ||
147 | /* | ||
148 | * eventfd to signal userspace about the event. | ||
149 | */ | ||
150 | struct eventfd_ctx *eventfd; | ||
151 | /* | ||
152 | * Each of these stored in a list by the cgroup. | ||
153 | */ | ||
154 | struct list_head list; | ||
155 | /* | ||
156 | * All fields below needed to unregister event when | ||
157 | * userspace closes eventfd. | ||
158 | */ | ||
159 | poll_table pt; | ||
160 | wait_queue_head_t *wqh; | ||
161 | wait_queue_t wait; | ||
162 | struct work_struct remove; | ||
163 | }; | ||
164 | |||
165 | /* The list of hierarchy roots */ | 132 | /* The list of hierarchy roots */ |
166 | 133 | ||
167 | static LIST_HEAD(cgroup_roots); | 134 | static LIST_HEAD(cgroup_roots); |
@@ -1351,8 +1318,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1351 | INIT_LIST_HEAD(&cgrp->pidlists); | 1318 | INIT_LIST_HEAD(&cgrp->pidlists); |
1352 | mutex_init(&cgrp->pidlist_mutex); | 1319 | mutex_init(&cgrp->pidlist_mutex); |
1353 | cgrp->dummy_css.cgroup = cgrp; | 1320 | cgrp->dummy_css.cgroup = cgrp; |
1354 | INIT_LIST_HEAD(&cgrp->event_list); | ||
1355 | spin_lock_init(&cgrp->event_list_lock); | ||
1356 | simple_xattrs_init(&cgrp->xattrs); | 1321 | simple_xattrs_init(&cgrp->xattrs); |
1357 | } | 1322 | } |
1358 | 1323 | ||
@@ -2626,16 +2591,6 @@ static const struct inode_operations cgroup_dir_inode_operations = { | |||
2626 | .removexattr = cgroup_removexattr, | 2591 | .removexattr = cgroup_removexattr, |
2627 | }; | 2592 | }; |
2628 | 2593 | ||
2629 | /* | ||
2630 | * Check if a file is a control file | ||
2631 | */ | ||
2632 | static inline struct cftype *__file_cft(struct file *file) | ||
2633 | { | ||
2634 | if (file_inode(file)->i_fop != &cgroup_file_operations) | ||
2635 | return ERR_PTR(-EINVAL); | ||
2636 | return __d_cft(file->f_dentry); | ||
2637 | } | ||
2638 | |||
2639 | static int cgroup_create_file(struct dentry *dentry, umode_t mode, | 2594 | static int cgroup_create_file(struct dentry *dentry, umode_t mode, |
2640 | struct super_block *sb) | 2595 | struct super_block *sb) |
2641 | { | 2596 | { |
@@ -3915,202 +3870,6 @@ static void cgroup_dput(struct cgroup *cgrp) | |||
3915 | deactivate_super(sb); | 3870 | deactivate_super(sb); |
3916 | } | 3871 | } |
3917 | 3872 | ||
3918 | /* | ||
3919 | * Unregister event and free resources. | ||
3920 | * | ||
3921 | * Gets called from workqueue. | ||
3922 | */ | ||
3923 | static void cgroup_event_remove(struct work_struct *work) | ||
3924 | { | ||
3925 | struct cgroup_event *event = container_of(work, struct cgroup_event, | ||
3926 | remove); | ||
3927 | struct cgroup_subsys_state *css = event->css; | ||
3928 | |||
3929 | remove_wait_queue(event->wqh, &event->wait); | ||
3930 | |||
3931 | event->cft->unregister_event(css, event->cft, event->eventfd); | ||
3932 | |||
3933 | /* Notify userspace the event is going away. */ | ||
3934 | eventfd_signal(event->eventfd, 1); | ||
3935 | |||
3936 | eventfd_ctx_put(event->eventfd); | ||
3937 | kfree(event); | ||
3938 | css_put(css); | ||
3939 | } | ||
3940 | |||
3941 | /* | ||
3942 | * Gets called on POLLHUP on eventfd when user closes it. | ||
3943 | * | ||
3944 | * Called with wqh->lock held and interrupts disabled. | ||
3945 | */ | ||
3946 | static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, | ||
3947 | int sync, void *key) | ||
3948 | { | ||
3949 | struct cgroup_event *event = container_of(wait, | ||
3950 | struct cgroup_event, wait); | ||
3951 | struct cgroup *cgrp = event->css->cgroup; | ||
3952 | unsigned long flags = (unsigned long)key; | ||
3953 | |||
3954 | if (flags & POLLHUP) { | ||
3955 | /* | ||
3956 | * If the event has been detached at cgroup removal, we | ||
3957 | * can simply return knowing the other side will cleanup | ||
3958 | * for us. | ||
3959 | * | ||
3960 | * We can't race against event freeing since the other | ||
3961 | * side will require wqh->lock via remove_wait_queue(), | ||
3962 | * which we hold. | ||
3963 | */ | ||
3964 | spin_lock(&cgrp->event_list_lock); | ||
3965 | if (!list_empty(&event->list)) { | ||
3966 | list_del_init(&event->list); | ||
3967 | /* | ||
3968 | * We are in atomic context, but cgroup_event_remove() | ||
3969 | * may sleep, so we have to call it in workqueue. | ||
3970 | */ | ||
3971 | schedule_work(&event->remove); | ||
3972 | } | ||
3973 | spin_unlock(&cgrp->event_list_lock); | ||
3974 | } | ||
3975 | |||
3976 | return 0; | ||
3977 | } | ||
3978 | |||
3979 | static void cgroup_event_ptable_queue_proc(struct file *file, | ||
3980 | wait_queue_head_t *wqh, poll_table *pt) | ||
3981 | { | ||
3982 | struct cgroup_event *event = container_of(pt, | ||
3983 | struct cgroup_event, pt); | ||
3984 | |||
3985 | event->wqh = wqh; | ||
3986 | add_wait_queue(wqh, &event->wait); | ||
3987 | } | ||
3988 | |||
3989 | /* | ||
3990 | * Parse input and register new cgroup event handler. | ||
3991 | * | ||
3992 | * Input must be in format '<event_fd> <control_fd> <args>'. | ||
3993 | * Interpretation of args is defined by control file implementation. | ||
3994 | */ | ||
3995 | static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, | ||
3996 | struct cftype *cft, const char *buffer) | ||
3997 | { | ||
3998 | struct cgroup *cgrp = dummy_css->cgroup; | ||
3999 | struct cgroup_event *event; | ||
4000 | struct cgroup_subsys_state *cfile_css; | ||
4001 | unsigned int efd, cfd; | ||
4002 | struct fd efile; | ||
4003 | struct fd cfile; | ||
4004 | char *endp; | ||
4005 | int ret; | ||
4006 | |||
4007 | efd = simple_strtoul(buffer, &endp, 10); | ||
4008 | if (*endp != ' ') | ||
4009 | return -EINVAL; | ||
4010 | buffer = endp + 1; | ||
4011 | |||
4012 | cfd = simple_strtoul(buffer, &endp, 10); | ||
4013 | if ((*endp != ' ') && (*endp != '\0')) | ||
4014 | return -EINVAL; | ||
4015 | buffer = endp + 1; | ||
4016 | |||
4017 | event = kzalloc(sizeof(*event), GFP_KERNEL); | ||
4018 | if (!event) | ||
4019 | return -ENOMEM; | ||
4020 | |||
4021 | INIT_LIST_HEAD(&event->list); | ||
4022 | init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); | ||
4023 | init_waitqueue_func_entry(&event->wait, cgroup_event_wake); | ||
4024 | INIT_WORK(&event->remove, cgroup_event_remove); | ||
4025 | |||
4026 | efile = fdget(efd); | ||
4027 | if (!efile.file) { | ||
4028 | ret = -EBADF; | ||
4029 | goto out_kfree; | ||
4030 | } | ||
4031 | |||
4032 | event->eventfd = eventfd_ctx_fileget(efile.file); | ||
4033 | if (IS_ERR(event->eventfd)) { | ||
4034 | ret = PTR_ERR(event->eventfd); | ||
4035 | goto out_put_efile; | ||
4036 | } | ||
4037 | |||
4038 | cfile = fdget(cfd); | ||
4039 | if (!cfile.file) { | ||
4040 | ret = -EBADF; | ||
4041 | goto out_put_eventfd; | ||
4042 | } | ||
4043 | |||
4044 | /* the process need read permission on control file */ | ||
4045 | /* AV: shouldn't we check that it's been opened for read instead? */ | ||
4046 | ret = inode_permission(file_inode(cfile.file), MAY_READ); | ||
4047 | if (ret < 0) | ||
4048 | goto out_put_cfile; | ||
4049 | |||
4050 | event->cft = __file_cft(cfile.file); | ||
4051 | if (IS_ERR(event->cft)) { | ||
4052 | ret = PTR_ERR(event->cft); | ||
4053 | goto out_put_cfile; | ||
4054 | } | ||
4055 | |||
4056 | if (!event->cft->ss) { | ||
4057 | ret = -EBADF; | ||
4058 | goto out_put_cfile; | ||
4059 | } | ||
4060 | |||
4061 | /* | ||
4062 | * Determine the css of @cfile, verify it belongs to the same | ||
4063 | * cgroup as cgroup.event_control, and associate @event with it. | ||
4064 | * Remaining events are automatically removed on cgroup destruction | ||
4065 | * but the removal is asynchronous, so take an extra ref. | ||
4066 | */ | ||
4067 | rcu_read_lock(); | ||
4068 | |||
4069 | ret = -EINVAL; | ||
4070 | event->css = cgroup_css(cgrp, event->cft->ss); | ||
4071 | cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); | ||
4072 | if (event->css && event->css == cfile_css && css_tryget(event->css)) | ||
4073 | ret = 0; | ||
4074 | |||
4075 | rcu_read_unlock(); | ||
4076 | if (ret) | ||
4077 | goto out_put_cfile; | ||
4078 | |||
4079 | if (!event->cft->register_event || !event->cft->unregister_event) { | ||
4080 | ret = -EINVAL; | ||
4081 | goto out_put_css; | ||
4082 | } | ||
4083 | |||
4084 | ret = event->cft->register_event(event->css, event->cft, | ||
4085 | event->eventfd, buffer); | ||
4086 | if (ret) | ||
4087 | goto out_put_css; | ||
4088 | |||
4089 | efile.file->f_op->poll(efile.file, &event->pt); | ||
4090 | |||
4091 | spin_lock(&cgrp->event_list_lock); | ||
4092 | list_add(&event->list, &cgrp->event_list); | ||
4093 | spin_unlock(&cgrp->event_list_lock); | ||
4094 | |||
4095 | fdput(cfile); | ||
4096 | fdput(efile); | ||
4097 | |||
4098 | return 0; | ||
4099 | |||
4100 | out_put_css: | ||
4101 | css_put(event->css); | ||
4102 | out_put_cfile: | ||
4103 | fdput(cfile); | ||
4104 | out_put_eventfd: | ||
4105 | eventfd_ctx_put(event->eventfd); | ||
4106 | out_put_efile: | ||
4107 | fdput(efile); | ||
4108 | out_kfree: | ||
4109 | kfree(event); | ||
4110 | |||
4111 | return ret; | ||
4112 | } | ||
4113 | |||
4114 | static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, | 3873 | static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, |
4115 | struct cftype *cft) | 3874 | struct cftype *cft) |
4116 | { | 3875 | { |
@@ -4136,11 +3895,6 @@ static struct cftype cgroup_base_files[] = { | |||
4136 | .mode = S_IRUGO | S_IWUSR, | 3895 | .mode = S_IRUGO | S_IWUSR, |
4137 | }, | 3896 | }, |
4138 | { | 3897 | { |
4139 | .name = "cgroup.event_control", | ||
4140 | .write_string = cgroup_write_event_control, | ||
4141 | .mode = S_IWUGO, | ||
4142 | }, | ||
4143 | { | ||
4144 | .name = "cgroup.clone_children", | 3898 | .name = "cgroup.clone_children", |
4145 | .flags = CFTYPE_INSANE, | 3899 | .flags = CFTYPE_INSANE, |
4146 | .read_u64 = cgroup_clone_children_read, | 3900 | .read_u64 = cgroup_clone_children_read, |
@@ -4610,7 +4364,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4610 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) | 4364 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) |
4611 | { | 4365 | { |
4612 | struct dentry *d = cgrp->dentry; | 4366 | struct dentry *d = cgrp->dentry; |
4613 | struct cgroup_event *event, *tmp; | ||
4614 | struct cgroup_subsys *ss; | 4367 | struct cgroup_subsys *ss; |
4615 | struct cgroup *child; | 4368 | struct cgroup *child; |
4616 | bool empty; | 4369 | bool empty; |
@@ -4685,18 +4438,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4685 | dget(d); | 4438 | dget(d); |
4686 | cgroup_d_remove_dir(d); | 4439 | cgroup_d_remove_dir(d); |
4687 | 4440 | ||
4688 | /* | ||
4689 | * Unregister events and notify userspace. | ||
4690 | * Notify userspace about cgroup removing only after rmdir of cgroup | ||
4691 | * directory to avoid race between userspace and kernelspace. | ||
4692 | */ | ||
4693 | spin_lock(&cgrp->event_list_lock); | ||
4694 | list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { | ||
4695 | list_del_init(&event->list); | ||
4696 | schedule_work(&event->remove); | ||
4697 | } | ||
4698 | spin_unlock(&cgrp->event_list_lock); | ||
4699 | |||
4700 | return 0; | 4441 | return 0; |
4701 | }; | 4442 | }; |
4702 | 4443 | ||
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f1a0ae6e11b8..7aa0d405b148 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -45,6 +45,7 @@ | |||
45 | #include <linux/swapops.h> | 45 | #include <linux/swapops.h> |
46 | #include <linux/spinlock.h> | 46 | #include <linux/spinlock.h> |
47 | #include <linux/eventfd.h> | 47 | #include <linux/eventfd.h> |
48 | #include <linux/poll.h> | ||
48 | #include <linux/sort.h> | 49 | #include <linux/sort.h> |
49 | #include <linux/fs.h> | 50 | #include <linux/fs.h> |
50 | #include <linux/seq_file.h> | 51 | #include <linux/seq_file.h> |
@@ -55,6 +56,7 @@ | |||
55 | #include <linux/cpu.h> | 56 | #include <linux/cpu.h> |
56 | #include <linux/oom.h> | 57 | #include <linux/oom.h> |
57 | #include <linux/lockdep.h> | 58 | #include <linux/lockdep.h> |
59 | #include <linux/file.h> | ||
58 | #include "internal.h" | 60 | #include "internal.h" |
59 | #include <net/sock.h> | 61 | #include <net/sock.h> |
60 | #include <net/ip.h> | 62 | #include <net/ip.h> |
@@ -227,6 +229,46 @@ struct mem_cgroup_eventfd_list { | |||
227 | struct eventfd_ctx *eventfd; | 229 | struct eventfd_ctx *eventfd; |
228 | }; | 230 | }; |
229 | 231 | ||
232 | /* | ||
233 | * cgroup_event represents events which userspace want to receive. | ||
234 | */ | ||
235 | struct mem_cgroup_event { | ||
236 | /* | ||
237 | * memcg which the event belongs to. | ||
238 | */ | ||
239 | struct mem_cgroup *memcg; | ||
240 | /* | ||
241 | * eventfd to signal userspace about the event. | ||
242 | */ | ||
243 | struct eventfd_ctx *eventfd; | ||
244 | /* | ||
245 | * Each of these stored in a list by the cgroup. | ||
246 | */ | ||
247 | struct list_head list; | ||
248 | /* | ||
249 | * register_event() callback will be used to add new userspace | ||
250 | * waiter for changes related to this event. Use eventfd_signal() | ||
251 | * on eventfd to send notification to userspace. | ||
252 | */ | ||
253 | int (*register_event)(struct mem_cgroup *memcg, | ||
254 | struct eventfd_ctx *eventfd, const char *args); | ||
255 | /* | ||
256 | * unregister_event() callback will be called when userspace closes | ||
257 | * the eventfd or on cgroup removing. This callback must be set, | ||
258 | * if you want provide notification functionality. | ||
259 | */ | ||
260 | void (*unregister_event)(struct mem_cgroup *memcg, | ||
261 | struct eventfd_ctx *eventfd); | ||
262 | /* | ||
263 | * All fields below needed to unregister event when | ||
264 | * userspace closes eventfd. | ||
265 | */ | ||
266 | poll_table pt; | ||
267 | wait_queue_head_t *wqh; | ||
268 | wait_queue_t wait; | ||
269 | struct work_struct remove; | ||
270 | }; | ||
271 | |||
230 | static void mem_cgroup_threshold(struct mem_cgroup *memcg); | 272 | static void mem_cgroup_threshold(struct mem_cgroup *memcg); |
231 | static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); | 273 | static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); |
232 | 274 | ||
@@ -331,6 +373,10 @@ struct mem_cgroup { | |||
331 | atomic_t numainfo_updating; | 373 | atomic_t numainfo_updating; |
332 | #endif | 374 | #endif |
333 | 375 | ||
376 | /* List of events which userspace want to receive */ | ||
377 | struct list_head event_list; | ||
378 | spinlock_t event_list_lock; | ||
379 | |||
334 | struct mem_cgroup_per_node *nodeinfo[0]; | 380 | struct mem_cgroup_per_node *nodeinfo[0]; |
335 | /* WARNING: nodeinfo must be the last member here */ | 381 | /* WARNING: nodeinfo must be the last member here */ |
336 | }; | 382 | }; |
@@ -490,11 +536,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) | |||
490 | return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; | 536 | return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; |
491 | } | 537 | } |
492 | 538 | ||
493 | struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css) | ||
494 | { | ||
495 | return &mem_cgroup_from_css(css)->vmpressure; | ||
496 | } | ||
497 | |||
498 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) | 539 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) |
499 | { | 540 | { |
500 | return (memcg == root_mem_cgroup); | 541 | return (memcg == root_mem_cgroup); |
@@ -5648,13 +5689,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) | |||
5648 | mem_cgroup_oom_notify_cb(iter); | 5689 | mem_cgroup_oom_notify_cb(iter); |
5649 | } | 5690 | } |
5650 | 5691 | ||
5651 | static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, | 5692 | static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, |
5652 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | 5693 | struct eventfd_ctx *eventfd, const char *args, enum res_type type) |
5653 | { | 5694 | { |
5654 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||
5655 | struct mem_cgroup_thresholds *thresholds; | 5695 | struct mem_cgroup_thresholds *thresholds; |
5656 | struct mem_cgroup_threshold_ary *new; | 5696 | struct mem_cgroup_threshold_ary *new; |
5657 | enum res_type type = MEMFILE_TYPE(cft->private); | ||
5658 | u64 threshold, usage; | 5697 | u64 threshold, usage; |
5659 | int i, size, ret; | 5698 | int i, size, ret; |
5660 | 5699 | ||
@@ -5731,13 +5770,23 @@ unlock: | |||
5731 | return ret; | 5770 | return ret; |
5732 | } | 5771 | } |
5733 | 5772 | ||
5734 | static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, | 5773 | static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, |
5735 | struct cftype *cft, struct eventfd_ctx *eventfd) | 5774 | struct eventfd_ctx *eventfd, const char *args) |
5775 | { | ||
5776 | return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); | ||
5777 | } | ||
5778 | |||
5779 | static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, | ||
5780 | struct eventfd_ctx *eventfd, const char *args) | ||
5781 | { | ||
5782 | return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); | ||
5783 | } | ||
5784 | |||
5785 | static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | ||
5786 | struct eventfd_ctx *eventfd, enum res_type type) | ||
5736 | { | 5787 | { |
5737 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||
5738 | struct mem_cgroup_thresholds *thresholds; | 5788 | struct mem_cgroup_thresholds *thresholds; |
5739 | struct mem_cgroup_threshold_ary *new; | 5789 | struct mem_cgroup_threshold_ary *new; |
5740 | enum res_type type = MEMFILE_TYPE(cft->private); | ||
5741 | u64 usage; | 5790 | u64 usage; |
5742 | int i, j, size; | 5791 | int i, j, size; |
5743 | 5792 | ||
@@ -5810,14 +5859,23 @@ unlock: | |||
5810 | mutex_unlock(&memcg->thresholds_lock); | 5859 | mutex_unlock(&memcg->thresholds_lock); |
5811 | } | 5860 | } |
5812 | 5861 | ||
5813 | static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, | 5862 | static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, |
5814 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | 5863 | struct eventfd_ctx *eventfd) |
5864 | { | ||
5865 | return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); | ||
5866 | } | ||
5867 | |||
5868 | static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | ||
5869 | struct eventfd_ctx *eventfd) | ||
5870 | { | ||
5871 | return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); | ||
5872 | } | ||
5873 | |||
5874 | static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, | ||
5875 | struct eventfd_ctx *eventfd, const char *args) | ||
5815 | { | 5876 | { |
5816 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||
5817 | struct mem_cgroup_eventfd_list *event; | 5877 | struct mem_cgroup_eventfd_list *event; |
5818 | enum res_type type = MEMFILE_TYPE(cft->private); | ||
5819 | 5878 | ||
5820 | BUG_ON(type != _OOM_TYPE); | ||
5821 | event = kmalloc(sizeof(*event), GFP_KERNEL); | 5879 | event = kmalloc(sizeof(*event), GFP_KERNEL); |
5822 | if (!event) | 5880 | if (!event) |
5823 | return -ENOMEM; | 5881 | return -ENOMEM; |
@@ -5835,14 +5893,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, | |||
5835 | return 0; | 5893 | return 0; |
5836 | } | 5894 | } |
5837 | 5895 | ||
5838 | static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, | 5896 | static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, |
5839 | struct cftype *cft, struct eventfd_ctx *eventfd) | 5897 | struct eventfd_ctx *eventfd) |
5840 | { | 5898 | { |
5841 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||
5842 | struct mem_cgroup_eventfd_list *ev, *tmp; | 5899 | struct mem_cgroup_eventfd_list *ev, *tmp; |
5843 | enum res_type type = MEMFILE_TYPE(cft->private); | ||
5844 | |||
5845 | BUG_ON(type != _OOM_TYPE); | ||
5846 | 5900 | ||
5847 | spin_lock(&memcg_oom_lock); | 5901 | spin_lock(&memcg_oom_lock); |
5848 | 5902 | ||
@@ -5959,13 +6013,233 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) | |||
5959 | } | 6013 | } |
5960 | #endif | 6014 | #endif |
5961 | 6015 | ||
6016 | /* | ||
6017 | * DO NOT USE IN NEW FILES. | ||
6018 | * | ||
6019 | * "cgroup.event_control" implementation. | ||
6020 | * | ||
6021 | * This is way over-engineered. It tries to support fully configurable | ||
6022 | * events for each user. Such a level of flexibility is completely | ||
6023 | * unnecessary especially in the light of the planned unified hierarchy. | ||
6024 | * | ||
6025 | * Please deprecate this and replace with something simpler if at all | ||
6026 | * possible. | ||
6027 | */ | ||
6028 | |||
6029 | /* | ||
6030 | * Unregister event and free resources. | ||
6031 | * | ||
6032 | * Gets called from workqueue. | ||
6033 | */ | ||
6034 | static void memcg_event_remove(struct work_struct *work) | ||
6035 | { | ||
6036 | struct mem_cgroup_event *event = | ||
6037 | container_of(work, struct mem_cgroup_event, remove); | ||
6038 | struct mem_cgroup *memcg = event->memcg; | ||
6039 | |||
6040 | remove_wait_queue(event->wqh, &event->wait); | ||
6041 | |||
6042 | event->unregister_event(memcg, event->eventfd); | ||
6043 | |||
6044 | /* Notify userspace the event is going away. */ | ||
6045 | eventfd_signal(event->eventfd, 1); | ||
6046 | |||
6047 | eventfd_ctx_put(event->eventfd); | ||
6048 | kfree(event); | ||
6049 | css_put(&memcg->css); | ||
6050 | } | ||
6051 | |||
6052 | /* | ||
6053 | * Gets called on POLLHUP on eventfd when user closes it. | ||
6054 | * | ||
6055 | * Called with wqh->lock held and interrupts disabled. | ||
6056 | */ | ||
6057 | static int memcg_event_wake(wait_queue_t *wait, unsigned mode, | ||
6058 | int sync, void *key) | ||
6059 | { | ||
6060 | struct mem_cgroup_event *event = | ||
6061 | container_of(wait, struct mem_cgroup_event, wait); | ||
6062 | struct mem_cgroup *memcg = event->memcg; | ||
6063 | unsigned long flags = (unsigned long)key; | ||
6064 | |||
6065 | if (flags & POLLHUP) { | ||
6066 | /* | ||
6067 | * If the event has been detached at cgroup removal, we | ||
6068 | * can simply return knowing the other side will clean up | ||
6069 | * for us. | ||
6070 | * | ||
6071 | * We can't race against event freeing since the other | ||
6072 | * side will require wqh->lock via remove_wait_queue(), | ||
6073 | * which we hold. | ||
6074 | */ | ||
6075 | spin_lock(&memcg->event_list_lock); | ||
6076 | if (!list_empty(&event->list)) { | ||
6077 | list_del_init(&event->list); | ||
6078 | /* | ||
6079 | * We are in atomic context, but memcg_event_remove() | ||
6080 | * may sleep, so we have to call it in workqueue. | ||
6081 | */ | ||
6082 | schedule_work(&event->remove); | ||
6083 | } | ||
6084 | spin_unlock(&memcg->event_list_lock); | ||
6085 | } | ||
6086 | |||
6087 | return 0; | ||
6088 | } | ||
6089 | |||
6090 | static void memcg_event_ptable_queue_proc(struct file *file, | ||
6091 | wait_queue_head_t *wqh, poll_table *pt) | ||
6092 | { | ||
6093 | struct mem_cgroup_event *event = | ||
6094 | container_of(pt, struct mem_cgroup_event, pt); | ||
6095 | |||
6096 | event->wqh = wqh; | ||
6097 | add_wait_queue(wqh, &event->wait); | ||
6098 | } | ||
6099 | |||
6100 | /* | ||
6101 | * DO NOT USE IN NEW FILES. | ||
6102 | * | ||
6103 | * Parse input and register new cgroup event handler. | ||
6104 | * | ||
6105 | * Input must be in format '<event_fd> <control_fd> <args>'. | ||
6106 | * Interpretation of args is defined by control file implementation. | ||
6107 | */ | ||
6108 | static int memcg_write_event_control(struct cgroup_subsys_state *css, | ||
6109 | struct cftype *cft, const char *buffer) | ||
6110 | { | ||
6111 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||
6112 | struct mem_cgroup_event *event; | ||
6113 | struct cgroup_subsys_state *cfile_css; | ||
6114 | unsigned int efd, cfd; | ||
6115 | struct fd efile; | ||
6116 | struct fd cfile; | ||
6117 | const char *name; | ||
6118 | char *endp; | ||
6119 | int ret; | ||
6120 | |||
6121 | efd = simple_strtoul(buffer, &endp, 10); | ||
6122 | if (*endp != ' ') | ||
6123 | return -EINVAL; | ||
6124 | buffer = endp + 1; | ||
6125 | |||
6126 | cfd = simple_strtoul(buffer, &endp, 10); | ||
6127 | if ((*endp != ' ') && (*endp != '\0')) | ||
6128 | return -EINVAL; | ||
6129 | buffer = endp + 1; | ||
6130 | |||
6131 | event = kzalloc(sizeof(*event), GFP_KERNEL); | ||
6132 | if (!event) | ||
6133 | return -ENOMEM; | ||
6134 | |||
6135 | event->memcg = memcg; | ||
6136 | INIT_LIST_HEAD(&event->list); | ||
6137 | init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); | ||
6138 | init_waitqueue_func_entry(&event->wait, memcg_event_wake); | ||
6139 | INIT_WORK(&event->remove, memcg_event_remove); | ||
6140 | |||
6141 | efile = fdget(efd); | ||
6142 | if (!efile.file) { | ||
6143 | ret = -EBADF; | ||
6144 | goto out_kfree; | ||
6145 | } | ||
6146 | |||
6147 | event->eventfd = eventfd_ctx_fileget(efile.file); | ||
6148 | if (IS_ERR(event->eventfd)) { | ||
6149 | ret = PTR_ERR(event->eventfd); | ||
6150 | goto out_put_efile; | ||
6151 | } | ||
6152 | |||
6153 | cfile = fdget(cfd); | ||
6154 | if (!cfile.file) { | ||
6155 | ret = -EBADF; | ||
6156 | goto out_put_eventfd; | ||
6157 | } | ||
6158 | |||
6159 | /* the process needs read permission on the control file */ | ||
6160 | /* AV: shouldn't we check that it's been opened for read instead? */ | ||
6161 | ret = inode_permission(file_inode(cfile.file), MAY_READ); | ||
6162 | if (ret < 0) | ||
6163 | goto out_put_cfile; | ||
6164 | |||
6165 | /* | ||
6166 | * Determine the event callbacks and set them in @event. This used | ||
6167 | * to be done via struct cftype but cgroup core no longer knows | ||
6168 | * about these events. The following is crude but the whole thing | ||
6169 | * is for compatibility anyway. | ||
6170 | * | ||
6171 | * DO NOT ADD NEW FILES. | ||
6172 | */ | ||
6173 | name = cfile.file->f_dentry->d_name.name; | ||
6174 | |||
6175 | if (!strcmp(name, "memory.usage_in_bytes")) { | ||
6176 | event->register_event = mem_cgroup_usage_register_event; | ||
6177 | event->unregister_event = mem_cgroup_usage_unregister_event; | ||
6178 | } else if (!strcmp(name, "memory.oom_control")) { | ||
6179 | event->register_event = mem_cgroup_oom_register_event; | ||
6180 | event->unregister_event = mem_cgroup_oom_unregister_event; | ||
6181 | } else if (!strcmp(name, "memory.pressure_level")) { | ||
6182 | event->register_event = vmpressure_register_event; | ||
6183 | event->unregister_event = vmpressure_unregister_event; | ||
6184 | } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { | ||
6185 | event->register_event = memsw_cgroup_usage_register_event; | ||
6186 | event->unregister_event = memsw_cgroup_usage_unregister_event; | ||
6187 | } else { | ||
6188 | ret = -EINVAL; | ||
6189 | goto out_put_cfile; | ||
6190 | } | ||
6191 | |||
6192 | /* | ||
6193 | * Verify that @cfile belongs to @css. Also, remaining events are | ||
6194 | * automatically removed on cgroup destruction but the removal is | ||
6195 | * asynchronous, so take an extra ref on @css. | ||
6196 | */ | ||
6197 | rcu_read_lock(); | ||
6198 | |||
6199 | ret = -EINVAL; | ||
6200 | cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, | ||
6201 | &mem_cgroup_subsys); | ||
6202 | if (cfile_css == css && css_tryget(css)) | ||
6203 | ret = 0; | ||
6204 | |||
6205 | rcu_read_unlock(); | ||
6206 | if (ret) | ||
6207 | goto out_put_cfile; | ||
6208 | |||
6209 | ret = event->register_event(memcg, event->eventfd, buffer); | ||
6210 | if (ret) | ||
6211 | goto out_put_css; | ||
6212 | |||
6213 | efile.file->f_op->poll(efile.file, &event->pt); | ||
6214 | |||
6215 | spin_lock(&memcg->event_list_lock); | ||
6216 | list_add(&event->list, &memcg->event_list); | ||
6217 | spin_unlock(&memcg->event_list_lock); | ||
6218 | |||
6219 | fdput(cfile); | ||
6220 | fdput(efile); | ||
6221 | |||
6222 | return 0; | ||
6223 | |||
6224 | out_put_css: | ||
6225 | css_put(css); | ||
6226 | out_put_cfile: | ||
6227 | fdput(cfile); | ||
6228 | out_put_eventfd: | ||
6229 | eventfd_ctx_put(event->eventfd); | ||
6230 | out_put_efile: | ||
6231 | fdput(efile); | ||
6232 | out_kfree: | ||
6233 | kfree(event); | ||
6234 | |||
6235 | return ret; | ||
6236 | } | ||
6237 | |||
5962 | static struct cftype mem_cgroup_files[] = { | 6238 | static struct cftype mem_cgroup_files[] = { |
5963 | { | 6239 | { |
5964 | .name = "usage_in_bytes", | 6240 | .name = "usage_in_bytes", |
5965 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 6241 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
5966 | .read = mem_cgroup_read, | 6242 | .read = mem_cgroup_read, |
5967 | .register_event = mem_cgroup_usage_register_event, | ||
5968 | .unregister_event = mem_cgroup_usage_unregister_event, | ||
5969 | }, | 6243 | }, |
5970 | { | 6244 | { |
5971 | .name = "max_usage_in_bytes", | 6245 | .name = "max_usage_in_bytes", |
@@ -6006,6 +6280,12 @@ static struct cftype mem_cgroup_files[] = { | |||
6006 | .read_u64 = mem_cgroup_hierarchy_read, | 6280 | .read_u64 = mem_cgroup_hierarchy_read, |
6007 | }, | 6281 | }, |
6008 | { | 6282 | { |
6283 | .name = "cgroup.event_control", /* XXX: for compat */ | ||
6284 | .write_string = memcg_write_event_control, | ||
6285 | .flags = CFTYPE_NO_PREFIX, | ||
6286 | .mode = S_IWUGO, | ||
6287 | }, | ||
6288 | { | ||
6009 | .name = "swappiness", | 6289 | .name = "swappiness", |
6010 | .read_u64 = mem_cgroup_swappiness_read, | 6290 | .read_u64 = mem_cgroup_swappiness_read, |
6011 | .write_u64 = mem_cgroup_swappiness_write, | 6291 | .write_u64 = mem_cgroup_swappiness_write, |
@@ -6019,14 +6299,10 @@ static struct cftype mem_cgroup_files[] = { | |||
6019 | .name = "oom_control", | 6299 | .name = "oom_control", |
6020 | .read_map = mem_cgroup_oom_control_read, | 6300 | .read_map = mem_cgroup_oom_control_read, |
6021 | .write_u64 = mem_cgroup_oom_control_write, | 6301 | .write_u64 = mem_cgroup_oom_control_write, |
6022 | .register_event = mem_cgroup_oom_register_event, | ||
6023 | .unregister_event = mem_cgroup_oom_unregister_event, | ||
6024 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), | 6302 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), |
6025 | }, | 6303 | }, |
6026 | { | 6304 | { |
6027 | .name = "pressure_level", | 6305 | .name = "pressure_level", |
6028 | .register_event = vmpressure_register_event, | ||
6029 | .unregister_event = vmpressure_unregister_event, | ||
6030 | }, | 6306 | }, |
6031 | #ifdef CONFIG_NUMA | 6307 | #ifdef CONFIG_NUMA |
6032 | { | 6308 | { |
@@ -6074,8 +6350,6 @@ static struct cftype memsw_cgroup_files[] = { | |||
6074 | .name = "memsw.usage_in_bytes", | 6350 | .name = "memsw.usage_in_bytes", |
6075 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 6351 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
6076 | .read = mem_cgroup_read, | 6352 | .read = mem_cgroup_read, |
6077 | .register_event = mem_cgroup_usage_register_event, | ||
6078 | .unregister_event = mem_cgroup_usage_unregister_event, | ||
6079 | }, | 6353 | }, |
6080 | { | 6354 | { |
6081 | .name = "memsw.max_usage_in_bytes", | 6355 | .name = "memsw.max_usage_in_bytes", |
@@ -6265,6 +6539,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
6265 | mutex_init(&memcg->thresholds_lock); | 6539 | mutex_init(&memcg->thresholds_lock); |
6266 | spin_lock_init(&memcg->move_lock); | 6540 | spin_lock_init(&memcg->move_lock); |
6267 | vmpressure_init(&memcg->vmpressure); | 6541 | vmpressure_init(&memcg->vmpressure); |
6542 | INIT_LIST_HEAD(&memcg->event_list); | ||
6543 | spin_lock_init(&memcg->event_list_lock); | ||
6268 | 6544 | ||
6269 | return &memcg->css; | 6545 | return &memcg->css; |
6270 | 6546 | ||
@@ -6340,6 +6616,19 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) | |||
6340 | static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | 6616 | static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) |
6341 | { | 6617 | { |
6342 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 6618 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
6619 | struct mem_cgroup_event *event, *tmp; | ||
6620 | |||
6621 | /* | ||
6622 | * Unregister events and notify userspace. | ||
6623 | * Notify userspace about cgroup removing only after rmdir of cgroup | ||
6624 | * directory to avoid race between userspace and kernelspace. | ||
6625 | */ | ||
6626 | spin_lock(&memcg->event_list_lock); | ||
6627 | list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { | ||
6628 | list_del_init(&event->list); | ||
6629 | schedule_work(&event->remove); | ||
6630 | } | ||
6631 | spin_unlock(&memcg->event_list_lock); | ||
6343 | 6632 | ||
6344 | kmem_cgroup_css_offline(memcg); | 6633 | kmem_cgroup_css_offline(memcg); |
6345 | 6634 | ||
diff --git a/mm/vmpressure.c b/mm/vmpressure.c index e0f62837c3f4..196970a4541f 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c | |||
@@ -278,8 +278,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) | |||
278 | 278 | ||
279 | /** | 279 | /** |
280 | * vmpressure_register_event() - Bind vmpressure notifications to an eventfd | 280 | * vmpressure_register_event() - Bind vmpressure notifications to an eventfd |
281 | * @css: css that is interested in vmpressure notifications | 281 | * @memcg: memcg that is interested in vmpressure notifications |
282 | * @cft: cgroup control files handle | ||
283 | * @eventfd: eventfd context to link notifications with | 282 | * @eventfd: eventfd context to link notifications with |
284 | * @args: event arguments (used to set up a pressure level threshold) | 283 | * @args: event arguments (used to set up a pressure level threshold) |
285 | * | 284 | * |
@@ -289,15 +288,12 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) | |||
289 | * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or | 288 | * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or |
290 | * "critical"). | 289 | * "critical"). |
291 | * | 290 | * |
292 | * This function should not be used directly, just pass it to (struct | 291 | * To be used as memcg event method. |
293 | * cftype).register_event, and then cgroup core will handle everything by | ||
294 | * itself. | ||
295 | */ | 292 | */ |
296 | int vmpressure_register_event(struct cgroup_subsys_state *css, | 293 | int vmpressure_register_event(struct mem_cgroup *memcg, |
297 | struct cftype *cft, struct eventfd_ctx *eventfd, | 294 | struct eventfd_ctx *eventfd, const char *args) |
298 | const char *args) | ||
299 | { | 295 | { |
300 | struct vmpressure *vmpr = css_to_vmpressure(css); | 296 | struct vmpressure *vmpr = memcg_to_vmpressure(memcg); |
301 | struct vmpressure_event *ev; | 297 | struct vmpressure_event *ev; |
302 | int level; | 298 | int level; |
303 | 299 | ||
@@ -325,23 +321,19 @@ int vmpressure_register_event(struct cgroup_subsys_state *css, | |||
325 | 321 | ||
326 | /** | 322 | /** |
327 | * vmpressure_unregister_event() - Unbind eventfd from vmpressure | 323 | * vmpressure_unregister_event() - Unbind eventfd from vmpressure |
328 | * @css: css handle | 324 | * @memcg: memcg handle |
329 | * @cft: cgroup control files handle | ||
330 | * @eventfd: eventfd context that was used to link vmpressure with the @cg | 325 | * @eventfd: eventfd context that was used to link vmpressure with the @cg |
331 | * | 326 | * |
332 | * This function does internal manipulations to detach the @eventfd from | 327 | * This function does internal manipulations to detach the @eventfd from |
333 | * the vmpressure notifications, and then frees internal resources | 328 | * the vmpressure notifications, and then frees internal resources |
334 | * associated with the @eventfd (but the @eventfd itself is not freed). | 329 | * associated with the @eventfd (but the @eventfd itself is not freed). |
335 | * | 330 | * |
336 | * This function should not be used directly, just pass it to (struct | 331 | * To be used as memcg event method. |
337 | * cftype).unregister_event, and then cgroup core will handle everything | ||
338 | * by itself. | ||
339 | */ | 332 | */ |
340 | void vmpressure_unregister_event(struct cgroup_subsys_state *css, | 333 | void vmpressure_unregister_event(struct mem_cgroup *memcg, |
341 | struct cftype *cft, | ||
342 | struct eventfd_ctx *eventfd) | 334 | struct eventfd_ctx *eventfd) |
343 | { | 335 | { |
344 | struct vmpressure *vmpr = css_to_vmpressure(css); | 336 | struct vmpressure *vmpr = memcg_to_vmpressure(memcg); |
345 | struct vmpressure_event *ev; | 337 | struct vmpressure_event *ev; |
346 | 338 | ||
347 | mutex_lock(&vmpr->events_lock); | 339 | mutex_lock(&vmpr->events_lock); |