diff options
| author | Tejun Heo <tj@kernel.org> | 2013-11-22 18:32:25 -0500 |
|---|---|---|
| committer | Tejun Heo <tj@kernel.org> | 2013-11-22 18:32:25 -0500 |
| commit | edab95103d3a1eb5e3faf977eae4ad0b5bf5669c (patch) | |
| tree | 812c111f94b0ae31bf88b49e7c37a7f5ba353eef | |
| parent | e5fca243abae1445afbfceebda5f08462ef869d3 (diff) | |
| parent | b36824c75c7855585d6476eef2b234f6e0e68872 (diff) | |
cgroup: Merge branch 'memcg_event' into for-3.14
Merge the v3.12-based patch series that moves the cgroup_event
implementation to memcg into for-3.14. The following two commits cause
a conflict in kernel/cgroup.c:
2ff2a7d03bbe4 ("cgroup: kill css_id")
79bd9814e5ec9 ("cgroup, memcg: move cgroup_event implementation to memcg")
Each patch removes a struct definition from kernel/cgroup.c. As the
two definitions are adjacent, the removals cause a context conflict,
which is easily resolved by removing both structs.
Signed-off-by: Tejun Heo <tj@kernel.org>
| -rw-r--r-- | Documentation/cgroups/cgroups.txt | 20 | ||||
| -rw-r--r-- | include/linux/cgroup.h | 24 | ||||
| -rw-r--r-- | include/linux/vmpressure.h | 8 | ||||
| -rw-r--r-- | init/Kconfig | 3 | ||||
| -rw-r--r-- | kernel/cgroup.c | 259 | ||||
| -rw-r--r-- | mm/memcontrol.c | 353 | ||||
| -rw-r--r-- | mm/vmpressure.c | 26 |
7 files changed, 334 insertions, 359 deletions
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 638bf17ff869..821de56d1580 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt | |||
| @@ -24,7 +24,6 @@ CONTENTS: | |||
| 24 | 2.1 Basic Usage | 24 | 2.1 Basic Usage |
| 25 | 2.2 Attaching processes | 25 | 2.2 Attaching processes |
| 26 | 2.3 Mounting hierarchies by name | 26 | 2.3 Mounting hierarchies by name |
| 27 | 2.4 Notification API | ||
| 28 | 3. Kernel API | 27 | 3. Kernel API |
| 29 | 3.1 Overview | 28 | 3.1 Overview |
| 30 | 3.2 Synchronization | 29 | 3.2 Synchronization |
| @@ -472,25 +471,6 @@ you give a subsystem a name. | |||
| 472 | The name of the subsystem appears as part of the hierarchy description | 471 | The name of the subsystem appears as part of the hierarchy description |
| 473 | in /proc/mounts and /proc/<pid>/cgroups. | 472 | in /proc/mounts and /proc/<pid>/cgroups. |
| 474 | 473 | ||
| 475 | 2.4 Notification API | ||
| 476 | -------------------- | ||
| 477 | |||
| 478 | There is mechanism which allows to get notifications about changing | ||
| 479 | status of a cgroup. | ||
| 480 | |||
| 481 | To register a new notification handler you need to: | ||
| 482 | - create a file descriptor for event notification using eventfd(2); | ||
| 483 | - open a control file to be monitored (e.g. memory.usage_in_bytes); | ||
| 484 | - write "<event_fd> <control_fd> <args>" to cgroup.event_control. | ||
| 485 | Interpretation of args is defined by control file implementation; | ||
| 486 | |||
| 487 | eventfd will be woken up by control file implementation or when the | ||
| 488 | cgroup is removed. | ||
| 489 | |||
| 490 | To unregister a notification handler just close eventfd. | ||
| 491 | |||
| 492 | NOTE: Support of notifications should be implemented for the control | ||
| 493 | file. See documentation for the subsystem. | ||
| 494 | 474 | ||
| 495 | 3. Kernel API | 475 | 3. Kernel API |
| 496 | ============= | 476 | ============= |
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 39c1d9469677..492fa01ec2d3 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h | |||
| @@ -29,7 +29,6 @@ struct cgroup_subsys; | |||
| 29 | struct inode; | 29 | struct inode; |
| 30 | struct cgroup; | 30 | struct cgroup; |
| 31 | struct css_id; | 31 | struct css_id; |
| 32 | struct eventfd_ctx; | ||
| 33 | 32 | ||
| 34 | extern int cgroup_init_early(void); | 33 | extern int cgroup_init_early(void); |
| 35 | extern int cgroup_init(void); | 34 | extern int cgroup_init(void); |
| @@ -239,10 +238,6 @@ struct cgroup { | |||
| 239 | struct rcu_head rcu_head; | 238 | struct rcu_head rcu_head; |
| 240 | struct work_struct destroy_work; | 239 | struct work_struct destroy_work; |
| 241 | 240 | ||
| 242 | /* List of events which userspace want to receive */ | ||
| 243 | struct list_head event_list; | ||
| 244 | spinlock_t event_list_lock; | ||
| 245 | |||
| 246 | /* directory xattrs */ | 241 | /* directory xattrs */ |
| 247 | struct simple_xattrs xattrs; | 242 | struct simple_xattrs xattrs; |
| 248 | }; | 243 | }; |
| @@ -506,25 +501,6 @@ struct cftype { | |||
| 506 | int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); | 501 | int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); |
| 507 | 502 | ||
| 508 | int (*release)(struct inode *inode, struct file *file); | 503 | int (*release)(struct inode *inode, struct file *file); |
| 509 | |||
| 510 | /* | ||
| 511 | * register_event() callback will be used to add new userspace | ||
| 512 | * waiter for changes related to the cftype. Implement it if | ||
| 513 | * you want to provide this functionality. Use eventfd_signal() | ||
| 514 | * on eventfd to send notification to userspace. | ||
| 515 | */ | ||
| 516 | int (*register_event)(struct cgroup_subsys_state *css, | ||
| 517 | struct cftype *cft, struct eventfd_ctx *eventfd, | ||
| 518 | const char *args); | ||
| 519 | /* | ||
| 520 | * unregister_event() callback will be called when userspace | ||
| 521 | * closes the eventfd or on cgroup removing. | ||
| 522 | * This callback must be implemented, if you want provide | ||
| 523 | * notification functionality. | ||
| 524 | */ | ||
| 525 | void (*unregister_event)(struct cgroup_subsys_state *css, | ||
| 526 | struct cftype *cft, | ||
| 527 | struct eventfd_ctx *eventfd); | ||
| 528 | }; | 504 | }; |
| 529 | 505 | ||
| 530 | /* | 506 | /* |
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 3f3788d49362..3e4535876d37 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h | |||
| @@ -7,6 +7,7 @@ | |||
| 7 | #include <linux/gfp.h> | 7 | #include <linux/gfp.h> |
| 8 | #include <linux/types.h> | 8 | #include <linux/types.h> |
| 9 | #include <linux/cgroup.h> | 9 | #include <linux/cgroup.h> |
| 10 | #include <linux/eventfd.h> | ||
| 10 | 11 | ||
| 11 | struct vmpressure { | 12 | struct vmpressure { |
| 12 | unsigned long scanned; | 13 | unsigned long scanned; |
| @@ -33,13 +34,10 @@ extern void vmpressure_init(struct vmpressure *vmpr); | |||
| 33 | extern void vmpressure_cleanup(struct vmpressure *vmpr); | 34 | extern void vmpressure_cleanup(struct vmpressure *vmpr); |
| 34 | extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); | 35 | extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); |
| 35 | extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); | 36 | extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); |
| 36 | extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css); | 37 | extern int vmpressure_register_event(struct mem_cgroup *memcg, |
| 37 | extern int vmpressure_register_event(struct cgroup_subsys_state *css, | ||
| 38 | struct cftype *cft, | ||
| 39 | struct eventfd_ctx *eventfd, | 38 | struct eventfd_ctx *eventfd, |
| 40 | const char *args); | 39 | const char *args); |
| 41 | extern void vmpressure_unregister_event(struct cgroup_subsys_state *css, | 40 | extern void vmpressure_unregister_event(struct mem_cgroup *memcg, |
| 42 | struct cftype *cft, | ||
| 43 | struct eventfd_ctx *eventfd); | 41 | struct eventfd_ctx *eventfd); |
| 44 | #else | 42 | #else |
| 45 | static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, | 43 | static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, |
diff --git a/init/Kconfig b/init/Kconfig index 79383d3aa5dc..93f344337172 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
| @@ -848,7 +848,6 @@ config NUMA_BALANCING | |||
| 848 | 848 | ||
| 849 | menuconfig CGROUPS | 849 | menuconfig CGROUPS |
| 850 | boolean "Control Group support" | 850 | boolean "Control Group support" |
| 851 | depends on EVENTFD | ||
| 852 | help | 851 | help |
| 853 | This option adds support for grouping sets of processes together, for | 852 | This option adds support for grouping sets of processes together, for |
| 854 | use with process control subsystems such as Cpusets, CFS, memory | 853 | use with process control subsystems such as Cpusets, CFS, memory |
| @@ -915,6 +914,7 @@ config MEMCG | |||
| 915 | bool "Memory Resource Controller for Control Groups" | 914 | bool "Memory Resource Controller for Control Groups" |
| 916 | depends on RESOURCE_COUNTERS | 915 | depends on RESOURCE_COUNTERS |
| 917 | select MM_OWNER | 916 | select MM_OWNER |
| 917 | select EVENTFD | ||
| 918 | help | 918 | help |
| 919 | Provides a memory resource controller that manages both anonymous | 919 | Provides a memory resource controller that manages both anonymous |
| 920 | memory and page cache. (See Documentation/cgroups/memory.txt) | 920 | memory and page cache. (See Documentation/cgroups/memory.txt) |
| @@ -1154,7 +1154,6 @@ config UIDGID_STRICT_TYPE_CHECKS | |||
| 1154 | 1154 | ||
| 1155 | config SCHED_AUTOGROUP | 1155 | config SCHED_AUTOGROUP |
| 1156 | bool "Automatic process group scheduling" | 1156 | bool "Automatic process group scheduling" |
| 1157 | select EVENTFD | ||
| 1158 | select CGROUPS | 1157 | select CGROUPS |
| 1159 | select CGROUP_SCHED | 1158 | select CGROUP_SCHED |
| 1160 | select FAIR_GROUP_SCHED | 1159 | select FAIR_GROUP_SCHED |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a7b98ee35ef7..be42967f4f1a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
| @@ -56,11 +56,8 @@ | |||
| 56 | #include <linux/pid_namespace.h> | 56 | #include <linux/pid_namespace.h> |
| 57 | #include <linux/idr.h> | 57 | #include <linux/idr.h> |
| 58 | #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ | 58 | #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ |
| 59 | #include <linux/eventfd.h> | ||
| 60 | #include <linux/poll.h> | ||
| 61 | #include <linux/flex_array.h> /* used in cgroup_attach_task */ | 59 | #include <linux/flex_array.h> /* used in cgroup_attach_task */ |
| 62 | #include <linux/kthread.h> | 60 | #include <linux/kthread.h> |
| 63 | #include <linux/file.h> | ||
| 64 | 61 | ||
| 65 | #include <linux/atomic.h> | 62 | #include <linux/atomic.h> |
| 66 | 63 | ||
| @@ -132,36 +129,6 @@ struct cfent { | |||
| 132 | struct simple_xattrs xattrs; | 129 | struct simple_xattrs xattrs; |
| 133 | }; | 130 | }; |
| 134 | 131 | ||
| 135 | /* | ||
| 136 | * cgroup_event represents events which userspace want to receive. | ||
| 137 | */ | ||
| 138 | struct cgroup_event { | ||
| 139 | /* | ||
| 140 | * css which the event belongs to. | ||
| 141 | */ | ||
| 142 | struct cgroup_subsys_state *css; | ||
| 143 | /* | ||
| 144 | * Control file which the event associated. | ||
| 145 | */ | ||
| 146 | struct cftype *cft; | ||
| 147 | /* | ||
| 148 | * eventfd to signal userspace about the event. | ||
| 149 | */ | ||
| 150 | struct eventfd_ctx *eventfd; | ||
| 151 | /* | ||
| 152 | * Each of these stored in a list by the cgroup. | ||
| 153 | */ | ||
| 154 | struct list_head list; | ||
| 155 | /* | ||
| 156 | * All fields below needed to unregister event when | ||
| 157 | * userspace closes eventfd. | ||
| 158 | */ | ||
| 159 | poll_table pt; | ||
| 160 | wait_queue_head_t *wqh; | ||
| 161 | wait_queue_t wait; | ||
| 162 | struct work_struct remove; | ||
| 163 | }; | ||
| 164 | |||
| 165 | /* The list of hierarchy roots */ | 132 | /* The list of hierarchy roots */ |
| 166 | 133 | ||
| 167 | static LIST_HEAD(cgroup_roots); | 134 | static LIST_HEAD(cgroup_roots); |
| @@ -1351,8 +1318,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
| 1351 | INIT_LIST_HEAD(&cgrp->pidlists); | 1318 | INIT_LIST_HEAD(&cgrp->pidlists); |
| 1352 | mutex_init(&cgrp->pidlist_mutex); | 1319 | mutex_init(&cgrp->pidlist_mutex); |
| 1353 | cgrp->dummy_css.cgroup = cgrp; | 1320 | cgrp->dummy_css.cgroup = cgrp; |
| 1354 | INIT_LIST_HEAD(&cgrp->event_list); | ||
| 1355 | spin_lock_init(&cgrp->event_list_lock); | ||
| 1356 | simple_xattrs_init(&cgrp->xattrs); | 1321 | simple_xattrs_init(&cgrp->xattrs); |
| 1357 | } | 1322 | } |
| 1358 | 1323 | ||
| @@ -2626,16 +2591,6 @@ static const struct inode_operations cgroup_dir_inode_operations = { | |||
| 2626 | .removexattr = cgroup_removexattr, | 2591 | .removexattr = cgroup_removexattr, |
| 2627 | }; | 2592 | }; |
| 2628 | 2593 | ||
| 2629 | /* | ||
| 2630 | * Check if a file is a control file | ||
| 2631 | */ | ||
| 2632 | static inline struct cftype *__file_cft(struct file *file) | ||
| 2633 | { | ||
| 2634 | if (file_inode(file)->i_fop != &cgroup_file_operations) | ||
| 2635 | return ERR_PTR(-EINVAL); | ||
| 2636 | return __d_cft(file->f_dentry); | ||
| 2637 | } | ||
| 2638 | |||
| 2639 | static int cgroup_create_file(struct dentry *dentry, umode_t mode, | 2594 | static int cgroup_create_file(struct dentry *dentry, umode_t mode, |
| 2640 | struct super_block *sb) | 2595 | struct super_block *sb) |
| 2641 | { | 2596 | { |
| @@ -3915,202 +3870,6 @@ static void cgroup_dput(struct cgroup *cgrp) | |||
| 3915 | deactivate_super(sb); | 3870 | deactivate_super(sb); |
| 3916 | } | 3871 | } |
| 3917 | 3872 | ||
| 3918 | /* | ||
| 3919 | * Unregister event and free resources. | ||
| 3920 | * | ||
| 3921 | * Gets called from workqueue. | ||
| 3922 | */ | ||
| 3923 | static void cgroup_event_remove(struct work_struct *work) | ||
| 3924 | { | ||
| 3925 | struct cgroup_event *event = container_of(work, struct cgroup_event, | ||
| 3926 | remove); | ||
| 3927 | struct cgroup_subsys_state *css = event->css; | ||
| 3928 | |||
| 3929 | remove_wait_queue(event->wqh, &event->wait); | ||
| 3930 | |||
| 3931 | event->cft->unregister_event(css, event->cft, event->eventfd); | ||
| 3932 | |||
| 3933 | /* Notify userspace the event is going away. */ | ||
| 3934 | eventfd_signal(event->eventfd, 1); | ||
| 3935 | |||
| 3936 | eventfd_ctx_put(event->eventfd); | ||
| 3937 | kfree(event); | ||
| 3938 | css_put(css); | ||
| 3939 | } | ||
| 3940 | |||
| 3941 | /* | ||
| 3942 | * Gets called on POLLHUP on eventfd when user closes it. | ||
| 3943 | * | ||
| 3944 | * Called with wqh->lock held and interrupts disabled. | ||
| 3945 | */ | ||
| 3946 | static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, | ||
| 3947 | int sync, void *key) | ||
| 3948 | { | ||
| 3949 | struct cgroup_event *event = container_of(wait, | ||
| 3950 | struct cgroup_event, wait); | ||
| 3951 | struct cgroup *cgrp = event->css->cgroup; | ||
| 3952 | unsigned long flags = (unsigned long)key; | ||
| 3953 | |||
| 3954 | if (flags & POLLHUP) { | ||
| 3955 | /* | ||
| 3956 | * If the event has been detached at cgroup removal, we | ||
| 3957 | * can simply return knowing the other side will cleanup | ||
| 3958 | * for us. | ||
| 3959 | * | ||
| 3960 | * We can't race against event freeing since the other | ||
| 3961 | * side will require wqh->lock via remove_wait_queue(), | ||
| 3962 | * which we hold. | ||
| 3963 | */ | ||
| 3964 | spin_lock(&cgrp->event_list_lock); | ||
| 3965 | if (!list_empty(&event->list)) { | ||
| 3966 | list_del_init(&event->list); | ||
| 3967 | /* | ||
| 3968 | * We are in atomic context, but cgroup_event_remove() | ||
| 3969 | * may sleep, so we have to call it in workqueue. | ||
| 3970 | */ | ||
| 3971 | schedule_work(&event->remove); | ||
| 3972 | } | ||
| 3973 | spin_unlock(&cgrp->event_list_lock); | ||
| 3974 | } | ||
| 3975 | |||
| 3976 | return 0; | ||
| 3977 | } | ||
| 3978 | |||
| 3979 | static void cgroup_event_ptable_queue_proc(struct file *file, | ||
| 3980 | wait_queue_head_t *wqh, poll_table *pt) | ||
| 3981 | { | ||
| 3982 | struct cgroup_event *event = container_of(pt, | ||
| 3983 | struct cgroup_event, pt); | ||
| 3984 | |||
| 3985 | event->wqh = wqh; | ||
| 3986 | add_wait_queue(wqh, &event->wait); | ||
| 3987 | } | ||
| 3988 | |||
| 3989 | /* | ||
| 3990 | * Parse input and register new cgroup event handler. | ||
| 3991 | * | ||
| 3992 | * Input must be in format '<event_fd> <control_fd> <args>'. | ||
| 3993 | * Interpretation of args is defined by control file implementation. | ||
| 3994 | */ | ||
| 3995 | static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, | ||
| 3996 | struct cftype *cft, const char *buffer) | ||
| 3997 | { | ||
| 3998 | struct cgroup *cgrp = dummy_css->cgroup; | ||
| 3999 | struct cgroup_event *event; | ||
| 4000 | struct cgroup_subsys_state *cfile_css; | ||
| 4001 | unsigned int efd, cfd; | ||
| 4002 | struct fd efile; | ||
| 4003 | struct fd cfile; | ||
| 4004 | char *endp; | ||
| 4005 | int ret; | ||
| 4006 | |||
| 4007 | efd = simple_strtoul(buffer, &endp, 10); | ||
| 4008 | if (*endp != ' ') | ||
| 4009 | return -EINVAL; | ||
| 4010 | buffer = endp + 1; | ||
| 4011 | |||
| 4012 | cfd = simple_strtoul(buffer, &endp, 10); | ||
| 4013 | if ((*endp != ' ') && (*endp != '\0')) | ||
| 4014 | return -EINVAL; | ||
| 4015 | buffer = endp + 1; | ||
| 4016 | |||
| 4017 | event = kzalloc(sizeof(*event), GFP_KERNEL); | ||
| 4018 | if (!event) | ||
| 4019 | return -ENOMEM; | ||
| 4020 | |||
| 4021 | INIT_LIST_HEAD(&event->list); | ||
| 4022 | init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); | ||
| 4023 | init_waitqueue_func_entry(&event->wait, cgroup_event_wake); | ||
| 4024 | INIT_WORK(&event->remove, cgroup_event_remove); | ||
| 4025 | |||
| 4026 | efile = fdget(efd); | ||
| 4027 | if (!efile.file) { | ||
| 4028 | ret = -EBADF; | ||
| 4029 | goto out_kfree; | ||
| 4030 | } | ||
| 4031 | |||
| 4032 | event->eventfd = eventfd_ctx_fileget(efile.file); | ||
| 4033 | if (IS_ERR(event->eventfd)) { | ||
| 4034 | ret = PTR_ERR(event->eventfd); | ||
| 4035 | goto out_put_efile; | ||
| 4036 | } | ||
| 4037 | |||
| 4038 | cfile = fdget(cfd); | ||
| 4039 | if (!cfile.file) { | ||
| 4040 | ret = -EBADF; | ||
| 4041 | goto out_put_eventfd; | ||
| 4042 | } | ||
| 4043 | |||
| 4044 | /* the process need read permission on control file */ | ||
| 4045 | /* AV: shouldn't we check that it's been opened for read instead? */ | ||
| 4046 | ret = inode_permission(file_inode(cfile.file), MAY_READ); | ||
| 4047 | if (ret < 0) | ||
| 4048 | goto out_put_cfile; | ||
| 4049 | |||
| 4050 | event->cft = __file_cft(cfile.file); | ||
| 4051 | if (IS_ERR(event->cft)) { | ||
| 4052 | ret = PTR_ERR(event->cft); | ||
| 4053 | goto out_put_cfile; | ||
| 4054 | } | ||
| 4055 | |||
| 4056 | if (!event->cft->ss) { | ||
| 4057 | ret = -EBADF; | ||
| 4058 | goto out_put_cfile; | ||
| 4059 | } | ||
| 4060 | |||
| 4061 | /* | ||
| 4062 | * Determine the css of @cfile, verify it belongs to the same | ||
| 4063 | * cgroup as cgroup.event_control, and associate @event with it. | ||
| 4064 | * Remaining events are automatically removed on cgroup destruction | ||
| 4065 | * but the removal is asynchronous, so take an extra ref. | ||
| 4066 | */ | ||
| 4067 | rcu_read_lock(); | ||
| 4068 | |||
| 4069 | ret = -EINVAL; | ||
| 4070 | event->css = cgroup_css(cgrp, event->cft->ss); | ||
| 4071 | cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); | ||
| 4072 | if (event->css && event->css == cfile_css && css_tryget(event->css)) | ||
| 4073 | ret = 0; | ||
| 4074 | |||
| 4075 | rcu_read_unlock(); | ||
| 4076 | if (ret) | ||
| 4077 | goto out_put_cfile; | ||
| 4078 | |||
| 4079 | if (!event->cft->register_event || !event->cft->unregister_event) { | ||
| 4080 | ret = -EINVAL; | ||
| 4081 | goto out_put_css; | ||
| 4082 | } | ||
| 4083 | |||
| 4084 | ret = event->cft->register_event(event->css, event->cft, | ||
| 4085 | event->eventfd, buffer); | ||
| 4086 | if (ret) | ||
| 4087 | goto out_put_css; | ||
| 4088 | |||
| 4089 | efile.file->f_op->poll(efile.file, &event->pt); | ||
| 4090 | |||
| 4091 | spin_lock(&cgrp->event_list_lock); | ||
| 4092 | list_add(&event->list, &cgrp->event_list); | ||
| 4093 | spin_unlock(&cgrp->event_list_lock); | ||
| 4094 | |||
| 4095 | fdput(cfile); | ||
| 4096 | fdput(efile); | ||
| 4097 | |||
| 4098 | return 0; | ||
| 4099 | |||
| 4100 | out_put_css: | ||
| 4101 | css_put(event->css); | ||
| 4102 | out_put_cfile: | ||
| 4103 | fdput(cfile); | ||
| 4104 | out_put_eventfd: | ||
| 4105 | eventfd_ctx_put(event->eventfd); | ||
| 4106 | out_put_efile: | ||
| 4107 | fdput(efile); | ||
| 4108 | out_kfree: | ||
| 4109 | kfree(event); | ||
| 4110 | |||
| 4111 | return ret; | ||
| 4112 | } | ||
| 4113 | |||
| 4114 | static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, | 3873 | static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, |
| 4115 | struct cftype *cft) | 3874 | struct cftype *cft) |
| 4116 | { | 3875 | { |
| @@ -4136,11 +3895,6 @@ static struct cftype cgroup_base_files[] = { | |||
| 4136 | .mode = S_IRUGO | S_IWUSR, | 3895 | .mode = S_IRUGO | S_IWUSR, |
| 4137 | }, | 3896 | }, |
| 4138 | { | 3897 | { |
| 4139 | .name = "cgroup.event_control", | ||
| 4140 | .write_string = cgroup_write_event_control, | ||
| 4141 | .mode = S_IWUGO, | ||
| 4142 | }, | ||
| 4143 | { | ||
| 4144 | .name = "cgroup.clone_children", | 3898 | .name = "cgroup.clone_children", |
| 4145 | .flags = CFTYPE_INSANE, | 3899 | .flags = CFTYPE_INSANE, |
| 4146 | .read_u64 = cgroup_clone_children_read, | 3900 | .read_u64 = cgroup_clone_children_read, |
| @@ -4610,7 +4364,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
| 4610 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) | 4364 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) |
| 4611 | { | 4365 | { |
| 4612 | struct dentry *d = cgrp->dentry; | 4366 | struct dentry *d = cgrp->dentry; |
| 4613 | struct cgroup_event *event, *tmp; | ||
| 4614 | struct cgroup_subsys *ss; | 4367 | struct cgroup_subsys *ss; |
| 4615 | struct cgroup *child; | 4368 | struct cgroup *child; |
| 4616 | bool empty; | 4369 | bool empty; |
| @@ -4685,18 +4438,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
| 4685 | dget(d); | 4438 | dget(d); |
| 4686 | cgroup_d_remove_dir(d); | 4439 | cgroup_d_remove_dir(d); |
| 4687 | 4440 | ||
| 4688 | /* | ||
| 4689 | * Unregister events and notify userspace. | ||
| 4690 | * Notify userspace about cgroup removing only after rmdir of cgroup | ||
| 4691 | * directory to avoid race between userspace and kernelspace. | ||
| 4692 | */ | ||
| 4693 | spin_lock(&cgrp->event_list_lock); | ||
| 4694 | list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { | ||
| 4695 | list_del_init(&event->list); | ||
| 4696 | schedule_work(&event->remove); | ||
| 4697 | } | ||
| 4698 | spin_unlock(&cgrp->event_list_lock); | ||
| 4699 | |||
| 4700 | return 0; | 4441 | return 0; |
| 4701 | }; | 4442 | }; |
| 4702 | 4443 | ||
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f1a0ae6e11b8..7aa0d405b148 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -45,6 +45,7 @@ | |||
| 45 | #include <linux/swapops.h> | 45 | #include <linux/swapops.h> |
| 46 | #include <linux/spinlock.h> | 46 | #include <linux/spinlock.h> |
| 47 | #include <linux/eventfd.h> | 47 | #include <linux/eventfd.h> |
| 48 | #include <linux/poll.h> | ||
| 48 | #include <linux/sort.h> | 49 | #include <linux/sort.h> |
| 49 | #include <linux/fs.h> | 50 | #include <linux/fs.h> |
| 50 | #include <linux/seq_file.h> | 51 | #include <linux/seq_file.h> |
| @@ -55,6 +56,7 @@ | |||
| 55 | #include <linux/cpu.h> | 56 | #include <linux/cpu.h> |
| 56 | #include <linux/oom.h> | 57 | #include <linux/oom.h> |
| 57 | #include <linux/lockdep.h> | 58 | #include <linux/lockdep.h> |
| 59 | #include <linux/file.h> | ||
| 58 | #include "internal.h" | 60 | #include "internal.h" |
| 59 | #include <net/sock.h> | 61 | #include <net/sock.h> |
| 60 | #include <net/ip.h> | 62 | #include <net/ip.h> |
| @@ -227,6 +229,46 @@ struct mem_cgroup_eventfd_list { | |||
| 227 | struct eventfd_ctx *eventfd; | 229 | struct eventfd_ctx *eventfd; |
| 228 | }; | 230 | }; |
| 229 | 231 | ||
| 232 | /* | ||
| 233 | * cgroup_event represents events which userspace want to receive. | ||
| 234 | */ | ||
| 235 | struct mem_cgroup_event { | ||
| 236 | /* | ||
| 237 | * memcg which the event belongs to. | ||
| 238 | */ | ||
| 239 | struct mem_cgroup *memcg; | ||
| 240 | /* | ||
| 241 | * eventfd to signal userspace about the event. | ||
| 242 | */ | ||
| 243 | struct eventfd_ctx *eventfd; | ||
| 244 | /* | ||
| 245 | * Each of these stored in a list by the cgroup. | ||
| 246 | */ | ||
| 247 | struct list_head list; | ||
| 248 | /* | ||
| 249 | * register_event() callback will be used to add new userspace | ||
| 250 | * waiter for changes related to this event. Use eventfd_signal() | ||
| 251 | * on eventfd to send notification to userspace. | ||
| 252 | */ | ||
| 253 | int (*register_event)(struct mem_cgroup *memcg, | ||
| 254 | struct eventfd_ctx *eventfd, const char *args); | ||
| 255 | /* | ||
| 256 | * unregister_event() callback will be called when userspace closes | ||
| 257 | * the eventfd or on cgroup removing. This callback must be set, | ||
| 258 | * if you want provide notification functionality. | ||
| 259 | */ | ||
| 260 | void (*unregister_event)(struct mem_cgroup *memcg, | ||
| 261 | struct eventfd_ctx *eventfd); | ||
| 262 | /* | ||
| 263 | * All fields below needed to unregister event when | ||
| 264 | * userspace closes eventfd. | ||
| 265 | */ | ||
| 266 | poll_table pt; | ||
| 267 | wait_queue_head_t *wqh; | ||
| 268 | wait_queue_t wait; | ||
| 269 | struct work_struct remove; | ||
| 270 | }; | ||
| 271 | |||
| 230 | static void mem_cgroup_threshold(struct mem_cgroup *memcg); | 272 | static void mem_cgroup_threshold(struct mem_cgroup *memcg); |
| 231 | static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); | 273 | static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); |
| 232 | 274 | ||
| @@ -331,6 +373,10 @@ struct mem_cgroup { | |||
| 331 | atomic_t numainfo_updating; | 373 | atomic_t numainfo_updating; |
| 332 | #endif | 374 | #endif |
| 333 | 375 | ||
| 376 | /* List of events which userspace want to receive */ | ||
| 377 | struct list_head event_list; | ||
| 378 | spinlock_t event_list_lock; | ||
| 379 | |||
| 334 | struct mem_cgroup_per_node *nodeinfo[0]; | 380 | struct mem_cgroup_per_node *nodeinfo[0]; |
| 335 | /* WARNING: nodeinfo must be the last member here */ | 381 | /* WARNING: nodeinfo must be the last member here */ |
| 336 | }; | 382 | }; |
| @@ -490,11 +536,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) | |||
| 490 | return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; | 536 | return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; |
| 491 | } | 537 | } |
| 492 | 538 | ||
| 493 | struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css) | ||
| 494 | { | ||
| 495 | return &mem_cgroup_from_css(css)->vmpressure; | ||
| 496 | } | ||
| 497 | |||
| 498 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) | 539 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) |
| 499 | { | 540 | { |
| 500 | return (memcg == root_mem_cgroup); | 541 | return (memcg == root_mem_cgroup); |
| @@ -5648,13 +5689,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) | |||
| 5648 | mem_cgroup_oom_notify_cb(iter); | 5689 | mem_cgroup_oom_notify_cb(iter); |
| 5649 | } | 5690 | } |
| 5650 | 5691 | ||
| 5651 | static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, | 5692 | static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, |
| 5652 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | 5693 | struct eventfd_ctx *eventfd, const char *args, enum res_type type) |
| 5653 | { | 5694 | { |
| 5654 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||
| 5655 | struct mem_cgroup_thresholds *thresholds; | 5695 | struct mem_cgroup_thresholds *thresholds; |
| 5656 | struct mem_cgroup_threshold_ary *new; | 5696 | struct mem_cgroup_threshold_ary *new; |
| 5657 | enum res_type type = MEMFILE_TYPE(cft->private); | ||
| 5658 | u64 threshold, usage; | 5697 | u64 threshold, usage; |
| 5659 | int i, size, ret; | 5698 | int i, size, ret; |
| 5660 | 5699 | ||
| @@ -5731,13 +5770,23 @@ unlock: | |||
| 5731 | return ret; | 5770 | return ret; |
| 5732 | } | 5771 | } |
| 5733 | 5772 | ||
| 5734 | static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, | 5773 | static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, |
| 5735 | struct cftype *cft, struct eventfd_ctx *eventfd) | 5774 | struct eventfd_ctx *eventfd, const char *args) |
| 5775 | { | ||
| 5776 | return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); | ||
| 5777 | } | ||
| 5778 | |||
| 5779 | static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, | ||
| 5780 | struct eventfd_ctx *eventfd, const char *args) | ||
| 5781 | { | ||
| 5782 | return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); | ||
| 5783 | } | ||
| 5784 | |||
| 5785 | static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | ||
| 5786 | struct eventfd_ctx *eventfd, enum res_type type) | ||
| 5736 | { | 5787 | { |
| 5737 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||
| 5738 | struct mem_cgroup_thresholds *thresholds; | 5788 | struct mem_cgroup_thresholds *thresholds; |
| 5739 | struct mem_cgroup_threshold_ary *new; | 5789 | struct mem_cgroup_threshold_ary *new; |
| 5740 | enum res_type type = MEMFILE_TYPE(cft->private); | ||
| 5741 | u64 usage; | 5790 | u64 usage; |
| 5742 | int i, j, size; | 5791 | int i, j, size; |
| 5743 | 5792 | ||
| @@ -5810,14 +5859,23 @@ unlock: | |||
| 5810 | mutex_unlock(&memcg->thresholds_lock); | 5859 | mutex_unlock(&memcg->thresholds_lock); |
| 5811 | } | 5860 | } |
| 5812 | 5861 | ||
| 5813 | static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, | 5862 | static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, |
| 5814 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | 5863 | struct eventfd_ctx *eventfd) |
| 5864 | { | ||
| 5865 | return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); | ||
| 5866 | } | ||
| 5867 | |||
| 5868 | static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | ||
| 5869 | struct eventfd_ctx *eventfd) | ||
| 5870 | { | ||
| 5871 | return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); | ||
| 5872 | } | ||
| 5873 | |||
| 5874 | static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, | ||
| 5875 | struct eventfd_ctx *eventfd, const char *args) | ||
| 5815 | { | 5876 | { |
| 5816 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||
| 5817 | struct mem_cgroup_eventfd_list *event; | 5877 | struct mem_cgroup_eventfd_list *event; |
| 5818 | enum res_type type = MEMFILE_TYPE(cft->private); | ||
| 5819 | 5878 | ||
| 5820 | BUG_ON(type != _OOM_TYPE); | ||
| 5821 | event = kmalloc(sizeof(*event), GFP_KERNEL); | 5879 | event = kmalloc(sizeof(*event), GFP_KERNEL); |
| 5822 | if (!event) | 5880 | if (!event) |
| 5823 | return -ENOMEM; | 5881 | return -ENOMEM; |
| @@ -5835,14 +5893,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, | |||
| 5835 | return 0; | 5893 | return 0; |
| 5836 | } | 5894 | } |
| 5837 | 5895 | ||
| 5838 | static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, | 5896 | static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, |
| 5839 | struct cftype *cft, struct eventfd_ctx *eventfd) | 5897 | struct eventfd_ctx *eventfd) |
| 5840 | { | 5898 | { |
| 5841 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||
| 5842 | struct mem_cgroup_eventfd_list *ev, *tmp; | 5899 | struct mem_cgroup_eventfd_list *ev, *tmp; |
| 5843 | enum res_type type = MEMFILE_TYPE(cft->private); | ||
| 5844 | |||
| 5845 | BUG_ON(type != _OOM_TYPE); | ||
| 5846 | 5900 | ||
| 5847 | spin_lock(&memcg_oom_lock); | 5901 | spin_lock(&memcg_oom_lock); |
| 5848 | 5902 | ||
| @@ -5959,13 +6013,233 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) | |||
| 5959 | } | 6013 | } |
| 5960 | #endif | 6014 | #endif |
| 5961 | 6015 | ||
| 6016 | /* | ||
| 6017 | * DO NOT USE IN NEW FILES. | ||
| 6018 | * | ||
| 6019 | * "cgroup.event_control" implementation. | ||
| 6020 | * | ||
| 6021 | * This is way over-engineered. It tries to support fully configurable | ||
| 6022 | * events for each user. Such a level of flexibility is completely | ||
| 6023 | * unnecessary especially in the light of the planned unified hierarchy. | ||
| 6024 | * | ||
| 6025 | * Please deprecate this and replace with something simpler if at all | ||
| 6026 | * possible. | ||
| 6027 | */ | ||
| 6028 | |||
| 6029 | /* | ||
| 6030 | * Unregister event and free resources. | ||
| 6031 | * | ||
| 6032 | * Gets called from workqueue. | ||
| 6033 | */ | ||
| 6034 | static void memcg_event_remove(struct work_struct *work) | ||
| 6035 | { | ||
| 6036 | struct mem_cgroup_event *event = | ||
| 6037 | container_of(work, struct mem_cgroup_event, remove); | ||
| 6038 | struct mem_cgroup *memcg = event->memcg; | ||
| 6039 | |||
| 6040 | remove_wait_queue(event->wqh, &event->wait); | ||
| 6041 | |||
| 6042 | event->unregister_event(memcg, event->eventfd); | ||
| 6043 | |||
| 6044 | /* Notify userspace the event is going away. */ | ||
| 6045 | eventfd_signal(event->eventfd, 1); | ||
| 6046 | |||
| 6047 | eventfd_ctx_put(event->eventfd); | ||
| 6048 | kfree(event); | ||
| 6049 | css_put(&memcg->css); | ||
| 6050 | } | ||
| 6051 | |||
| 6052 | /* | ||
| 6053 | * Gets called on POLLHUP on eventfd when user closes it. | ||
| 6054 | * | ||
| 6055 | * Called with wqh->lock held and interrupts disabled. | ||
| 6056 | */ | ||
| 6057 | static int memcg_event_wake(wait_queue_t *wait, unsigned mode, | ||
| 6058 | int sync, void *key) | ||
| 6059 | { | ||
| 6060 | struct mem_cgroup_event *event = | ||
| 6061 | container_of(wait, struct mem_cgroup_event, wait); | ||
| 6062 | struct mem_cgroup *memcg = event->memcg; | ||
| 6063 | unsigned long flags = (unsigned long)key; | ||
| 6064 | |||
| 6065 | if (flags & POLLHUP) { | ||
| 6066 | /* | ||
| 6067 | * If the event has been detached at cgroup removal, we | ||
| 6068 | * can simply return knowing the other side will cleanup | ||
| 6069 | * for us. | ||
| 6070 | * | ||
| 6071 | * We can't race against event freeing since the other | ||
| 6072 | * side will require wqh->lock via remove_wait_queue(), | ||
| 6073 | * which we hold. | ||
| 6074 | */ | ||
| 6075 | spin_lock(&memcg->event_list_lock); | ||
| 6076 | if (!list_empty(&event->list)) { | ||
| 6077 | list_del_init(&event->list); | ||
| 6078 | /* | ||
| 6079 | * We are in atomic context, but memcg_event_remove() | ||
| 6080 | * may sleep, so we have to call it in workqueue. | ||
| 6081 | */ | ||
| 6082 | schedule_work(&event->remove); | ||
| 6083 | } | ||
| 6084 | spin_unlock(&memcg->event_list_lock); | ||
| 6085 | } | ||
| 6086 | |||
| 6087 | return 0; | ||
| 6088 | } | ||
| 6089 | |||
| 6090 | static void memcg_event_ptable_queue_proc(struct file *file, | ||
| 6091 | wait_queue_head_t *wqh, poll_table *pt) | ||
| 6092 | { | ||
| 6093 | struct mem_cgroup_event *event = | ||
| 6094 | container_of(pt, struct mem_cgroup_event, pt); | ||
| 6095 | |||
| 6096 | event->wqh = wqh; | ||
| 6097 | add_wait_queue(wqh, &event->wait); | ||
| 6098 | } | ||
| 6099 | |||
| 6100 | /* | ||
| 6101 | * DO NOT USE IN NEW FILES. | ||
| 6102 | * | ||
| 6103 | * Parse input and register new cgroup event handler. | ||
| 6104 | * | ||
| 6105 | * Input must be in format '<event_fd> <control_fd> <args>'. | ||
| 6106 | * Interpretation of args is defined by control file implementation. | ||
| 6107 | */ | ||
| 6108 | static int memcg_write_event_control(struct cgroup_subsys_state *css, | ||
| 6109 | struct cftype *cft, const char *buffer) | ||
| 6110 | { | ||
| 6111 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||
| 6112 | struct mem_cgroup_event *event; | ||
| 6113 | struct cgroup_subsys_state *cfile_css; | ||
| 6114 | unsigned int efd, cfd; | ||
| 6115 | struct fd efile; | ||
| 6116 | struct fd cfile; | ||
| 6117 | const char *name; | ||
| 6118 | char *endp; | ||
| 6119 | int ret; | ||
| 6120 | |||
| 6121 | efd = simple_strtoul(buffer, &endp, 10); | ||
| 6122 | if (*endp != ' ') | ||
| 6123 | return -EINVAL; | ||
| 6124 | buffer = endp + 1; | ||
| 6125 | |||
| 6126 | cfd = simple_strtoul(buffer, &endp, 10); | ||
| 6127 | if ((*endp != ' ') && (*endp != '\0')) | ||
| 6128 | return -EINVAL; | ||
| 6129 | buffer = endp + 1; | ||
| 6130 | |||
| 6131 | event = kzalloc(sizeof(*event), GFP_KERNEL); | ||
| 6132 | if (!event) | ||
| 6133 | return -ENOMEM; | ||
| 6134 | |||
| 6135 | event->memcg = memcg; | ||
| 6136 | INIT_LIST_HEAD(&event->list); | ||
| 6137 | init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); | ||
| 6138 | init_waitqueue_func_entry(&event->wait, memcg_event_wake); | ||
| 6139 | INIT_WORK(&event->remove, memcg_event_remove); | ||
| 6140 | |||
| 6141 | efile = fdget(efd); | ||
| 6142 | if (!efile.file) { | ||
| 6143 | ret = -EBADF; | ||
| 6144 | goto out_kfree; | ||
| 6145 | } | ||
| 6146 | |||
| 6147 | event->eventfd = eventfd_ctx_fileget(efile.file); | ||
| 6148 | if (IS_ERR(event->eventfd)) { | ||
| 6149 | ret = PTR_ERR(event->eventfd); | ||
| 6150 | goto out_put_efile; | ||
| 6151 | } | ||
| 6152 | |||
| 6153 | cfile = fdget(cfd); | ||
| 6154 | if (!cfile.file) { | ||
| 6155 | ret = -EBADF; | ||
| 6156 | goto out_put_eventfd; | ||
| 6157 | } | ||
| 6158 | |||
| 6159 | /* the process needs read permission on the control file */ | ||
| 6160 | /* AV: shouldn't we check that it's been opened for read instead? */ | ||
| 6161 | ret = inode_permission(file_inode(cfile.file), MAY_READ); | ||
| 6162 | if (ret < 0) | ||
| 6163 | goto out_put_cfile; | ||
| 6164 | |||
| 6165 | /* | ||
| 6166 | * Determine the event callbacks and set them in @event. This used | ||
| 6167 | * to be done via struct cftype but cgroup core no longer knows | ||
| 6168 | * about these events. The following is crude but the whole thing | ||
| 6169 | * is for compatibility anyway. | ||
| 6170 | * | ||
| 6171 | * DO NOT ADD NEW FILES. | ||
| 6172 | */ | ||
| 6173 | name = cfile.file->f_dentry->d_name.name; | ||
| 6174 | |||
| 6175 | if (!strcmp(name, "memory.usage_in_bytes")) { | ||
| 6176 | event->register_event = mem_cgroup_usage_register_event; | ||
| 6177 | event->unregister_event = mem_cgroup_usage_unregister_event; | ||
| 6178 | } else if (!strcmp(name, "memory.oom_control")) { | ||
| 6179 | event->register_event = mem_cgroup_oom_register_event; | ||
| 6180 | event->unregister_event = mem_cgroup_oom_unregister_event; | ||
| 6181 | } else if (!strcmp(name, "memory.pressure_level")) { | ||
| 6182 | event->register_event = vmpressure_register_event; | ||
| 6183 | event->unregister_event = vmpressure_unregister_event; | ||
| 6184 | } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { | ||
| 6185 | event->register_event = memsw_cgroup_usage_register_event; | ||
| 6186 | event->unregister_event = memsw_cgroup_usage_unregister_event; | ||
| 6187 | } else { | ||
| 6188 | ret = -EINVAL; | ||
| 6189 | goto out_put_cfile; | ||
| 6190 | } | ||
| 6191 | |||
| 6192 | /* | ||
| 6193 | * Verify that @cfile belongs to @css. Also, remaining events are | ||
| 6194 | * automatically removed on cgroup destruction but the removal is | ||
| 6195 | * asynchronous, so take an extra ref on @css. | ||
| 6196 | */ | ||
| 6197 | rcu_read_lock(); | ||
| 6198 | |||
| 6199 | ret = -EINVAL; | ||
| 6200 | cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, | ||
| 6201 | &mem_cgroup_subsys); | ||
| 6202 | if (cfile_css == css && css_tryget(css)) | ||
| 6203 | ret = 0; | ||
| 6204 | |||
| 6205 | rcu_read_unlock(); | ||
| 6206 | if (ret) | ||
| 6207 | goto out_put_cfile; | ||
| 6208 | |||
| 6209 | ret = event->register_event(memcg, event->eventfd, buffer); | ||
| 6210 | if (ret) | ||
| 6211 | goto out_put_css; | ||
| 6212 | |||
| 6213 | efile.file->f_op->poll(efile.file, &event->pt); | ||
| 6214 | |||
| 6215 | spin_lock(&memcg->event_list_lock); | ||
| 6216 | list_add(&event->list, &memcg->event_list); | ||
| 6217 | spin_unlock(&memcg->event_list_lock); | ||
| 6218 | |||
| 6219 | fdput(cfile); | ||
| 6220 | fdput(efile); | ||
| 6221 | |||
| 6222 | return 0; | ||
| 6223 | |||
| 6224 | out_put_css: | ||
| 6225 | css_put(css); | ||
| 6226 | out_put_cfile: | ||
| 6227 | fdput(cfile); | ||
| 6228 | out_put_eventfd: | ||
| 6229 | eventfd_ctx_put(event->eventfd); | ||
| 6230 | out_put_efile: | ||
| 6231 | fdput(efile); | ||
| 6232 | out_kfree: | ||
| 6233 | kfree(event); | ||
| 6234 | |||
| 6235 | return ret; | ||
| 6236 | } | ||
| 6237 | |||
| 5962 | static struct cftype mem_cgroup_files[] = { | 6238 | static struct cftype mem_cgroup_files[] = { |
| 5963 | { | 6239 | { |
| 5964 | .name = "usage_in_bytes", | 6240 | .name = "usage_in_bytes", |
| 5965 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 6241 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
| 5966 | .read = mem_cgroup_read, | 6242 | .read = mem_cgroup_read, |
| 5967 | .register_event = mem_cgroup_usage_register_event, | ||
| 5968 | .unregister_event = mem_cgroup_usage_unregister_event, | ||
| 5969 | }, | 6243 | }, |
| 5970 | { | 6244 | { |
| 5971 | .name = "max_usage_in_bytes", | 6245 | .name = "max_usage_in_bytes", |
| @@ -6006,6 +6280,12 @@ static struct cftype mem_cgroup_files[] = { | |||
| 6006 | .read_u64 = mem_cgroup_hierarchy_read, | 6280 | .read_u64 = mem_cgroup_hierarchy_read, |
| 6007 | }, | 6281 | }, |
| 6008 | { | 6282 | { |
| 6283 | .name = "cgroup.event_control", /* XXX: for compat */ | ||
| 6284 | .write_string = memcg_write_event_control, | ||
| 6285 | .flags = CFTYPE_NO_PREFIX, | ||
| 6286 | .mode = S_IWUGO, | ||
| 6287 | }, | ||
| 6288 | { | ||
| 6009 | .name = "swappiness", | 6289 | .name = "swappiness", |
| 6010 | .read_u64 = mem_cgroup_swappiness_read, | 6290 | .read_u64 = mem_cgroup_swappiness_read, |
| 6011 | .write_u64 = mem_cgroup_swappiness_write, | 6291 | .write_u64 = mem_cgroup_swappiness_write, |
| @@ -6019,14 +6299,10 @@ static struct cftype mem_cgroup_files[] = { | |||
| 6019 | .name = "oom_control", | 6299 | .name = "oom_control", |
| 6020 | .read_map = mem_cgroup_oom_control_read, | 6300 | .read_map = mem_cgroup_oom_control_read, |
| 6021 | .write_u64 = mem_cgroup_oom_control_write, | 6301 | .write_u64 = mem_cgroup_oom_control_write, |
| 6022 | .register_event = mem_cgroup_oom_register_event, | ||
| 6023 | .unregister_event = mem_cgroup_oom_unregister_event, | ||
| 6024 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), | 6302 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), |
| 6025 | }, | 6303 | }, |
| 6026 | { | 6304 | { |
| 6027 | .name = "pressure_level", | 6305 | .name = "pressure_level", |
| 6028 | .register_event = vmpressure_register_event, | ||
| 6029 | .unregister_event = vmpressure_unregister_event, | ||
| 6030 | }, | 6306 | }, |
| 6031 | #ifdef CONFIG_NUMA | 6307 | #ifdef CONFIG_NUMA |
| 6032 | { | 6308 | { |
| @@ -6074,8 +6350,6 @@ static struct cftype memsw_cgroup_files[] = { | |||
| 6074 | .name = "memsw.usage_in_bytes", | 6350 | .name = "memsw.usage_in_bytes", |
| 6075 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 6351 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
| 6076 | .read = mem_cgroup_read, | 6352 | .read = mem_cgroup_read, |
| 6077 | .register_event = mem_cgroup_usage_register_event, | ||
| 6078 | .unregister_event = mem_cgroup_usage_unregister_event, | ||
| 6079 | }, | 6353 | }, |
| 6080 | { | 6354 | { |
| 6081 | .name = "memsw.max_usage_in_bytes", | 6355 | .name = "memsw.max_usage_in_bytes", |
| @@ -6265,6 +6539,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
| 6265 | mutex_init(&memcg->thresholds_lock); | 6539 | mutex_init(&memcg->thresholds_lock); |
| 6266 | spin_lock_init(&memcg->move_lock); | 6540 | spin_lock_init(&memcg->move_lock); |
| 6267 | vmpressure_init(&memcg->vmpressure); | 6541 | vmpressure_init(&memcg->vmpressure); |
| 6542 | INIT_LIST_HEAD(&memcg->event_list); | ||
| 6543 | spin_lock_init(&memcg->event_list_lock); | ||
| 6268 | 6544 | ||
| 6269 | return &memcg->css; | 6545 | return &memcg->css; |
| 6270 | 6546 | ||
| @@ -6340,6 +6616,19 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) | |||
| 6340 | static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | 6616 | static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) |
| 6341 | { | 6617 | { |
| 6342 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 6618 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
| 6619 | struct mem_cgroup_event *event, *tmp; | ||
| 6620 | |||
| 6621 | /* | ||
| 6622 | * Unregister events and notify userspace. | ||
| 6623 | * Notify userspace about cgroup removing only after rmdir of cgroup | ||
| 6624 | * directory to avoid race between userspace and kernelspace. | ||
| 6625 | */ | ||
| 6626 | spin_lock(&memcg->event_list_lock); | ||
| 6627 | list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { | ||
| 6628 | list_del_init(&event->list); | ||
| 6629 | schedule_work(&event->remove); | ||
| 6630 | } | ||
| 6631 | spin_unlock(&memcg->event_list_lock); | ||
| 6343 | 6632 | ||
| 6344 | kmem_cgroup_css_offline(memcg); | 6633 | kmem_cgroup_css_offline(memcg); |
| 6345 | 6634 | ||
diff --git a/mm/vmpressure.c b/mm/vmpressure.c index e0f62837c3f4..196970a4541f 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c | |||
| @@ -278,8 +278,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) | |||
| 278 | 278 | ||
| 279 | /** | 279 | /** |
| 280 | * vmpressure_register_event() - Bind vmpressure notifications to an eventfd | 280 | * vmpressure_register_event() - Bind vmpressure notifications to an eventfd |
| 281 | * @css: css that is interested in vmpressure notifications | 281 | * @memcg: memcg that is interested in vmpressure notifications |
| 282 | * @cft: cgroup control files handle | ||
| 283 | * @eventfd: eventfd context to link notifications with | 282 | * @eventfd: eventfd context to link notifications with |
| 284 | * @args: event arguments (used to set up a pressure level threshold) | 283 | * @args: event arguments (used to set up a pressure level threshold) |
| 285 | * | 284 | * |
| @@ -289,15 +288,12 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) | |||
| 289 | * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or | 288 | * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or |
| 290 | * "critical"). | 289 | * "critical"). |
| 291 | * | 290 | * |
| 292 | * This function should not be used directly, just pass it to (struct | 291 | * To be used as memcg event method. |
| 293 | * cftype).register_event, and then cgroup core will handle everything by | ||
| 294 | * itself. | ||
| 295 | */ | 292 | */ |
| 296 | int vmpressure_register_event(struct cgroup_subsys_state *css, | 293 | int vmpressure_register_event(struct mem_cgroup *memcg, |
| 297 | struct cftype *cft, struct eventfd_ctx *eventfd, | 294 | struct eventfd_ctx *eventfd, const char *args) |
| 298 | const char *args) | ||
| 299 | { | 295 | { |
| 300 | struct vmpressure *vmpr = css_to_vmpressure(css); | 296 | struct vmpressure *vmpr = memcg_to_vmpressure(memcg); |
| 301 | struct vmpressure_event *ev; | 297 | struct vmpressure_event *ev; |
| 302 | int level; | 298 | int level; |
| 303 | 299 | ||
| @@ -325,23 +321,19 @@ int vmpressure_register_event(struct cgroup_subsys_state *css, | |||
| 325 | 321 | ||
| 326 | /** | 322 | /** |
| 327 | * vmpressure_unregister_event() - Unbind eventfd from vmpressure | 323 | * vmpressure_unregister_event() - Unbind eventfd from vmpressure |
| 328 | * @css: css handle | 324 | * @memcg: memcg handle |
| 329 | * @cft: cgroup control files handle | ||
| 330 | * @eventfd: eventfd context that was used to link vmpressure with the @cg | 325 | * @eventfd: eventfd context that was used to link vmpressure with the @cg |
| 331 | * | 326 | * |
| 332 | * This function does internal manipulations to detach the @eventfd from | 327 | * This function does internal manipulations to detach the @eventfd from |
| 333 | * the vmpressure notifications, and then frees internal resources | 328 | * the vmpressure notifications, and then frees internal resources |
| 334 | * associated with the @eventfd (but the @eventfd itself is not freed). | 329 | * associated with the @eventfd (but the @eventfd itself is not freed). |
| 335 | * | 330 | * |
| 336 | * This function should not be used directly, just pass it to (struct | 331 | * To be used as memcg event method. |
| 337 | * cftype).unregister_event, and then cgroup core will handle everything | ||
| 338 | * by itself. | ||
| 339 | */ | 332 | */ |
| 340 | void vmpressure_unregister_event(struct cgroup_subsys_state *css, | 333 | void vmpressure_unregister_event(struct mem_cgroup *memcg, |
| 341 | struct cftype *cft, | ||
| 342 | struct eventfd_ctx *eventfd) | 334 | struct eventfd_ctx *eventfd) |
| 343 | { | 335 | { |
| 344 | struct vmpressure *vmpr = css_to_vmpressure(css); | 336 | struct vmpressure *vmpr = memcg_to_vmpressure(memcg); |
| 345 | struct vmpressure_event *ev; | 337 | struct vmpressure_event *ev; |
| 346 | 338 | ||
| 347 | mutex_lock(&vmpr->events_lock); | 339 | mutex_lock(&vmpr->events_lock); |
