aboutsummaryrefslogtreecommitdiffstats
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c353
1 files changed, 321 insertions, 32 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f1a0ae6e11b8..7aa0d405b148 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -45,6 +45,7 @@
45#include <linux/swapops.h> 45#include <linux/swapops.h>
46#include <linux/spinlock.h> 46#include <linux/spinlock.h>
47#include <linux/eventfd.h> 47#include <linux/eventfd.h>
48#include <linux/poll.h>
48#include <linux/sort.h> 49#include <linux/sort.h>
49#include <linux/fs.h> 50#include <linux/fs.h>
50#include <linux/seq_file.h> 51#include <linux/seq_file.h>
@@ -55,6 +56,7 @@
55#include <linux/cpu.h> 56#include <linux/cpu.h>
56#include <linux/oom.h> 57#include <linux/oom.h>
57#include <linux/lockdep.h> 58#include <linux/lockdep.h>
59#include <linux/file.h>
58#include "internal.h" 60#include "internal.h"
59#include <net/sock.h> 61#include <net/sock.h>
60#include <net/ip.h> 62#include <net/ip.h>
@@ -227,6 +229,46 @@ struct mem_cgroup_eventfd_list {
227 struct eventfd_ctx *eventfd; 229 struct eventfd_ctx *eventfd;
228}; 230};
229 231
232/*
233 * mem_cgroup_event represents events which userspace wants to receive.
234 */
235struct mem_cgroup_event {
236 /*
237 * memcg which the event belongs to.
238 */
239 struct mem_cgroup *memcg;
240 /*
241 * eventfd to signal userspace about the event.
242 */
243 struct eventfd_ctx *eventfd;
244 /*
245 * Each of these is stored in a list by the cgroup.
246 */
247 struct list_head list;
248 /*
249 * register_event() callback will be used to add a new userspace
250 * waiter for changes related to this event. Use eventfd_signal()
251 * on eventfd to send notification to userspace.
252 */
253 int (*register_event)(struct mem_cgroup *memcg,
254 struct eventfd_ctx *eventfd, const char *args);
255 /*
256 * unregister_event() callback will be called when userspace closes
257 * the eventfd or on cgroup removal. This callback must be set
258 * if you want to provide notification functionality.
259 */
260 void (*unregister_event)(struct mem_cgroup *memcg,
261 struct eventfd_ctx *eventfd);
262 /*
263 * All fields below are needed to unregister the event when
264 * userspace closes the eventfd.
265 */
266 poll_table pt;
267 wait_queue_head_t *wqh;
268 wait_queue_t wait;
269 struct work_struct remove;
270};
271
230static void mem_cgroup_threshold(struct mem_cgroup *memcg); 272static void mem_cgroup_threshold(struct mem_cgroup *memcg);
231static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 273static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
232 274
@@ -331,6 +373,10 @@ struct mem_cgroup {
331 atomic_t numainfo_updating; 373 atomic_t numainfo_updating;
332#endif 374#endif
333 375
376 /* List of events which userspace wants to receive */
377 struct list_head event_list;
378 spinlock_t event_list_lock;
379
334 struct mem_cgroup_per_node *nodeinfo[0]; 380 struct mem_cgroup_per_node *nodeinfo[0];
335 /* WARNING: nodeinfo must be the last member here */ 381 /* WARNING: nodeinfo must be the last member here */
336}; 382};
@@ -490,11 +536,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
490 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 536 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
491} 537}
492 538
493struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
494{
495 return &mem_cgroup_from_css(css)->vmpressure;
496}
497
498static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 539static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
499{ 540{
500 return (memcg == root_mem_cgroup); 541 return (memcg == root_mem_cgroup);
@@ -5648,13 +5689,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
5648 mem_cgroup_oom_notify_cb(iter); 5689 mem_cgroup_oom_notify_cb(iter);
5649} 5690}
5650 5691
5651static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, 5692static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5652 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5693 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
5653{ 5694{
5654 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5655 struct mem_cgroup_thresholds *thresholds; 5695 struct mem_cgroup_thresholds *thresholds;
5656 struct mem_cgroup_threshold_ary *new; 5696 struct mem_cgroup_threshold_ary *new;
5657 enum res_type type = MEMFILE_TYPE(cft->private);
5658 u64 threshold, usage; 5697 u64 threshold, usage;
5659 int i, size, ret; 5698 int i, size, ret;
5660 5699
@@ -5731,13 +5770,23 @@ unlock:
5731 return ret; 5770 return ret;
5732} 5771}
5733 5772
5734static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, 5773static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5735 struct cftype *cft, struct eventfd_ctx *eventfd) 5774 struct eventfd_ctx *eventfd, const char *args)
5775{
5776 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
5777}
5778
5779static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
5780 struct eventfd_ctx *eventfd, const char *args)
5781{
5782 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
5783}
5784
5785static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5786 struct eventfd_ctx *eventfd, enum res_type type)
5736{ 5787{
5737 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5738 struct mem_cgroup_thresholds *thresholds; 5788 struct mem_cgroup_thresholds *thresholds;
5739 struct mem_cgroup_threshold_ary *new; 5789 struct mem_cgroup_threshold_ary *new;
5740 enum res_type type = MEMFILE_TYPE(cft->private);
5741 u64 usage; 5790 u64 usage;
5742 int i, j, size; 5791 int i, j, size;
5743 5792
@@ -5810,14 +5859,23 @@ unlock:
5810 mutex_unlock(&memcg->thresholds_lock); 5859 mutex_unlock(&memcg->thresholds_lock);
5811} 5860}
5812 5861
5813static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, 5862static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5814 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5863 struct eventfd_ctx *eventfd)
5864{
5865 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
5866}
5867
5868static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5869 struct eventfd_ctx *eventfd)
5870{
5871 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
5872}
5873
5874static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
5875 struct eventfd_ctx *eventfd, const char *args)
5815{ 5876{
5816 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5817 struct mem_cgroup_eventfd_list *event; 5877 struct mem_cgroup_eventfd_list *event;
5818 enum res_type type = MEMFILE_TYPE(cft->private);
5819 5878
5820 BUG_ON(type != _OOM_TYPE);
5821 event = kmalloc(sizeof(*event), GFP_KERNEL); 5879 event = kmalloc(sizeof(*event), GFP_KERNEL);
5822 if (!event) 5880 if (!event)
5823 return -ENOMEM; 5881 return -ENOMEM;
@@ -5835,14 +5893,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
5835 return 0; 5893 return 0;
5836} 5894}
5837 5895
5838static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, 5896static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
5839 struct cftype *cft, struct eventfd_ctx *eventfd) 5897 struct eventfd_ctx *eventfd)
5840{ 5898{
5841 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5842 struct mem_cgroup_eventfd_list *ev, *tmp; 5899 struct mem_cgroup_eventfd_list *ev, *tmp;
5843 enum res_type type = MEMFILE_TYPE(cft->private);
5844
5845 BUG_ON(type != _OOM_TYPE);
5846 5900
5847 spin_lock(&memcg_oom_lock); 5901 spin_lock(&memcg_oom_lock);
5848 5902
@@ -5959,13 +6013,233 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5959} 6013}
5960#endif 6014#endif
5961 6015
6016/*
6017 * DO NOT USE IN NEW FILES.
6018 *
6019 * "cgroup.event_control" implementation.
6020 *
6021 * This is way over-engineered. It tries to support fully configurable
6022 * events for each user. Such level of flexibility is completely
6023 * unnecessary especially in the light of the planned unified hierarchy.
6024 *
6025 * Please deprecate this and replace with something simpler if at all
6026 * possible.
6027 */
6028
6029/*
6030 * Unregister event and free resources.
6031 *
6032 * Gets called from workqueue.
6033 */
6034static void memcg_event_remove(struct work_struct *work)
6035{
6036 struct mem_cgroup_event *event =
6037 container_of(work, struct mem_cgroup_event, remove);
6038 struct mem_cgroup *memcg = event->memcg;
6039
6040 remove_wait_queue(event->wqh, &event->wait);
6041
6042 event->unregister_event(memcg, event->eventfd);
6043
6044 /* Notify userspace the event is going away. */
6045 eventfd_signal(event->eventfd, 1);
6046
6047 eventfd_ctx_put(event->eventfd);
6048 kfree(event);
6049 css_put(&memcg->css);
6050}
6051
6052/*
6053 * Gets called on POLLHUP on eventfd when user closes it.
6054 *
6055 * Called with wqh->lock held and interrupts disabled.
6056 */
6057static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
6058 int sync, void *key)
6059{
6060 struct mem_cgroup_event *event =
6061 container_of(wait, struct mem_cgroup_event, wait);
6062 struct mem_cgroup *memcg = event->memcg;
6063 unsigned long flags = (unsigned long)key;
6064
6065 if (flags & POLLHUP) {
6066 /*
6067 * If the event has been detached at cgroup removal, we
6068 * can simply return knowing the other side will cleanup
6069 * for us.
6070 *
6071 * We can't race against event freeing since the other
6072 * side will require wqh->lock via remove_wait_queue(),
6073 * which we hold.
6074 */
6075 spin_lock(&memcg->event_list_lock);
6076 if (!list_empty(&event->list)) {
6077 list_del_init(&event->list);
6078 /*
6079 * We are in atomic context, but memcg_event_remove()
6080 * may sleep, so we have to call it in workqueue.
6081 */
6082 schedule_work(&event->remove);
6083 }
6084 spin_unlock(&memcg->event_list_lock);
6085 }
6086
6087 return 0;
6088}
6089
6090static void memcg_event_ptable_queue_proc(struct file *file,
6091 wait_queue_head_t *wqh, poll_table *pt)
6092{
6093 struct mem_cgroup_event *event =
6094 container_of(pt, struct mem_cgroup_event, pt);
6095
6096 event->wqh = wqh;
6097 add_wait_queue(wqh, &event->wait);
6098}
6099
6100/*
6101 * DO NOT USE IN NEW FILES.
6102 *
6103 * Parse input and register new cgroup event handler.
6104 *
6105 * Input must be in format '<event_fd> <control_fd> <args>'.
6106 * Interpretation of args is defined by control file implementation.
6107 */
6108static int memcg_write_event_control(struct cgroup_subsys_state *css,
6109 struct cftype *cft, const char *buffer)
6110{
6111 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6112 struct mem_cgroup_event *event;
6113 struct cgroup_subsys_state *cfile_css;
6114 unsigned int efd, cfd;
6115 struct fd efile;
6116 struct fd cfile;
6117 const char *name;
6118 char *endp;
6119 int ret;
6120
6121 efd = simple_strtoul(buffer, &endp, 10);
6122 if (*endp != ' ')
6123 return -EINVAL;
6124 buffer = endp + 1;
6125
6126 cfd = simple_strtoul(buffer, &endp, 10);
6127 if ((*endp != ' ') && (*endp != '\0'))
6128 return -EINVAL;
6129 buffer = endp + 1;
6130
6131 event = kzalloc(sizeof(*event), GFP_KERNEL);
6132 if (!event)
6133 return -ENOMEM;
6134
6135 event->memcg = memcg;
6136 INIT_LIST_HEAD(&event->list);
6137 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
6138 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
6139 INIT_WORK(&event->remove, memcg_event_remove);
6140
6141 efile = fdget(efd);
6142 if (!efile.file) {
6143 ret = -EBADF;
6144 goto out_kfree;
6145 }
6146
6147 event->eventfd = eventfd_ctx_fileget(efile.file);
6148 if (IS_ERR(event->eventfd)) {
6149 ret = PTR_ERR(event->eventfd);
6150 goto out_put_efile;
6151 }
6152
6153 cfile = fdget(cfd);
6154 if (!cfile.file) {
6155 ret = -EBADF;
6156 goto out_put_eventfd;
6157 }
6158
6159 /* the process needs read permission on the control file */
6160 /* AV: shouldn't we check that it's been opened for read instead? */
6161 ret = inode_permission(file_inode(cfile.file), MAY_READ);
6162 if (ret < 0)
6163 goto out_put_cfile;
6164
6165 /*
6166 * Determine the event callbacks and set them in @event. This used
6167 * to be done via struct cftype but cgroup core no longer knows
6168 * about these events. The following is crude but the whole thing
6169 * is for compatibility anyway.
6170 *
6171 * DO NOT ADD NEW FILES.
6172 */
6173 name = cfile.file->f_dentry->d_name.name;
6174
6175 if (!strcmp(name, "memory.usage_in_bytes")) {
6176 event->register_event = mem_cgroup_usage_register_event;
6177 event->unregister_event = mem_cgroup_usage_unregister_event;
6178 } else if (!strcmp(name, "memory.oom_control")) {
6179 event->register_event = mem_cgroup_oom_register_event;
6180 event->unregister_event = mem_cgroup_oom_unregister_event;
6181 } else if (!strcmp(name, "memory.pressure_level")) {
6182 event->register_event = vmpressure_register_event;
6183 event->unregister_event = vmpressure_unregister_event;
6184 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
6185 event->register_event = memsw_cgroup_usage_register_event;
6186 event->unregister_event = memsw_cgroup_usage_unregister_event;
6187 } else {
6188 ret = -EINVAL;
6189 goto out_put_cfile;
6190 }
6191
6192 /*
6193 * Verify that @cfile belongs to @css. Also, remaining events are
6194 * automatically removed on cgroup destruction but the removal is
6195 * asynchronous, so take an extra ref on @css.
6196 */
6197 rcu_read_lock();
6198
6199 ret = -EINVAL;
6200 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent,
6201 &mem_cgroup_subsys);
6202 if (cfile_css == css && css_tryget(css))
6203 ret = 0;
6204
6205 rcu_read_unlock();
6206 if (ret)
6207 goto out_put_cfile;
6208
6209 ret = event->register_event(memcg, event->eventfd, buffer);
6210 if (ret)
6211 goto out_put_css;
6212
6213 efile.file->f_op->poll(efile.file, &event->pt);
6214
6215 spin_lock(&memcg->event_list_lock);
6216 list_add(&event->list, &memcg->event_list);
6217 spin_unlock(&memcg->event_list_lock);
6218
6219 fdput(cfile);
6220 fdput(efile);
6221
6222 return 0;
6223
6224out_put_css:
6225 css_put(css);
6226out_put_cfile:
6227 fdput(cfile);
6228out_put_eventfd:
6229 eventfd_ctx_put(event->eventfd);
6230out_put_efile:
6231 fdput(efile);
6232out_kfree:
6233 kfree(event);
6234
6235 return ret;
6236}
6237
5962static struct cftype mem_cgroup_files[] = { 6238static struct cftype mem_cgroup_files[] = {
5963 { 6239 {
5964 .name = "usage_in_bytes", 6240 .name = "usage_in_bytes",
5965 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 6241 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
5966 .read = mem_cgroup_read, 6242 .read = mem_cgroup_read,
5967 .register_event = mem_cgroup_usage_register_event,
5968 .unregister_event = mem_cgroup_usage_unregister_event,
5969 }, 6243 },
5970 { 6244 {
5971 .name = "max_usage_in_bytes", 6245 .name = "max_usage_in_bytes",
@@ -6006,6 +6280,12 @@ static struct cftype mem_cgroup_files[] = {
6006 .read_u64 = mem_cgroup_hierarchy_read, 6280 .read_u64 = mem_cgroup_hierarchy_read,
6007 }, 6281 },
6008 { 6282 {
6283 .name = "cgroup.event_control", /* XXX: for compat */
6284 .write_string = memcg_write_event_control,
6285 .flags = CFTYPE_NO_PREFIX,
6286 .mode = S_IWUGO,
6287 },
6288 {
6009 .name = "swappiness", 6289 .name = "swappiness",
6010 .read_u64 = mem_cgroup_swappiness_read, 6290 .read_u64 = mem_cgroup_swappiness_read,
6011 .write_u64 = mem_cgroup_swappiness_write, 6291 .write_u64 = mem_cgroup_swappiness_write,
@@ -6019,14 +6299,10 @@ static struct cftype mem_cgroup_files[] = {
6019 .name = "oom_control", 6299 .name = "oom_control",
6020 .read_map = mem_cgroup_oom_control_read, 6300 .read_map = mem_cgroup_oom_control_read,
6021 .write_u64 = mem_cgroup_oom_control_write, 6301 .write_u64 = mem_cgroup_oom_control_write,
6022 .register_event = mem_cgroup_oom_register_event,
6023 .unregister_event = mem_cgroup_oom_unregister_event,
6024 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 6302 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
6025 }, 6303 },
6026 { 6304 {
6027 .name = "pressure_level", 6305 .name = "pressure_level",
6028 .register_event = vmpressure_register_event,
6029 .unregister_event = vmpressure_unregister_event,
6030 }, 6306 },
6031#ifdef CONFIG_NUMA 6307#ifdef CONFIG_NUMA
6032 { 6308 {
@@ -6074,8 +6350,6 @@ static struct cftype memsw_cgroup_files[] = {
6074 .name = "memsw.usage_in_bytes", 6350 .name = "memsw.usage_in_bytes",
6075 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 6351 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
6076 .read = mem_cgroup_read, 6352 .read = mem_cgroup_read,
6077 .register_event = mem_cgroup_usage_register_event,
6078 .unregister_event = mem_cgroup_usage_unregister_event,
6079 }, 6353 },
6080 { 6354 {
6081 .name = "memsw.max_usage_in_bytes", 6355 .name = "memsw.max_usage_in_bytes",
@@ -6265,6 +6539,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6265 mutex_init(&memcg->thresholds_lock); 6539 mutex_init(&memcg->thresholds_lock);
6266 spin_lock_init(&memcg->move_lock); 6540 spin_lock_init(&memcg->move_lock);
6267 vmpressure_init(&memcg->vmpressure); 6541 vmpressure_init(&memcg->vmpressure);
6542 INIT_LIST_HEAD(&memcg->event_list);
6543 spin_lock_init(&memcg->event_list_lock);
6268 6544
6269 return &memcg->css; 6545 return &memcg->css;
6270 6546
@@ -6340,6 +6616,19 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6340static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 6616static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6341{ 6617{
6342 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6618 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6619 struct mem_cgroup_event *event, *tmp;
6620
6621 /*
6622 * Unregister events and notify userspace.
6623 * Notify userspace about cgroup removal only after rmdir of the cgroup
6624 * directory to avoid a race between userspace and kernelspace.
6625 */
6626 spin_lock(&memcg->event_list_lock);
6627 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
6628 list_del_init(&event->list);
6629 schedule_work(&event->remove);
6630 }
6631 spin_unlock(&memcg->event_list_lock);
6343 6632
6344 kmem_cgroup_css_offline(memcg); 6633 kmem_cgroup_css_offline(memcg);
6345 6634