cgroup: implement eventfd-based generic API for notifications

This patchset introduces an eventfd-based API for notifications in cgroups and implements memory notifications on top of it. It uses statistics in the memory controller to track memory usage. Output of time(1) on building kernel on tmpfs: Root cgroup before changes: make -j2 506.37 user 60.93s system 193% cpu 4:52.77 total Non-root cgroup before changes: make -j2 507.14 user 62.66s system 193% cpu 4:54.74 total Root cgroup after changes (0 thresholds): make -j2 507.13 user 62.20s system 193% cpu 4:53.55 total Non-root cgroup after changes (0 thresholds): make -j2 507.70 user 64.20s system 193% cpu 4:55.70 total Root cgroup after changes (1 threshold, never crossed): make -j2 506.97 user 62.20s system 193% cpu 4:53.90 total Non-root cgroup after changes (1 threshold, never crossed): make -j2 507.55 user 64.08s system 193% cpu 4:55.63 total This patch: Introduce the write-only file "cgroup.event_control" in every cgroup. To register a new notification handler you need to: - create an eventfd; - open a control file to be monitored. Callbacks register_event() and unregister_event() must be defined for the control file; - write "<event_fd> <control_fd> <args>" to cgroup.event_control. Interpretation of args is defined by control file implementation; eventfd will be woken up by control file implementation or when the cgroup is removed. To unregister a notification handler, just close the eventfd. If you need notification functionality for a control file you have to implement callbacks register_event() and unregister_event() in the struct cftype. [kamezawa.hiroyu@jp.fujitsu.com: Kconfig fix] Signed-off-by: Kirill A. 
Shutemov <kirill@shutemov.name> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Paul Menage <menage@google.com> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Pavel Emelyanov <xemul@openvz.org> Cc: Dan Malek <dan@embeddedalley.com> Cc: Vladislav Buzov <vbuzov@embeddedalley.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Cc: Alexander Shishkin <virtuoso@slind.org> Cc: Davide Libenzi <davidel@xmailserver.org> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Kirill A. Shutemov <kirill@shutemov.name> 2010-03-10 18:22:20 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2010-03-12 18:52:37 -0500
commit: 0dea116876eefc9c7ca9c5d74fe665481e499fa3 (patch)
tree: 446ef64c99a234cf076b6d43efe42c8b48a928c7
parent: 483c30b514bd3037fa3f19fa42327c94c10f51c8 (diff)
4 files changed, 272 insertions, 1 deletions
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index c0358c30c64f..fd588ff0e296 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -23,6 +23,7 @@ CONTENTS:
  2.1 Basic Usage
  2.2 Attaching processes
  2.3 Mounting hierarchies by name
+  2.4 Notification API
 3. Kernel API
  3.1 Overview
  3.2 Synchronization
@@ -435,6 +436,25 @@ you give a subsystem a name.
 The name of the subsystem appears as part of the hierarchy description
 in /proc/mounts and /proc/<pid>/cgroups.
+2.4 Notification API
+--------------------
+There is a mechanism which allows userspace to get notifications about
+changes in the status of a cgroup.
+To register a new notification handler you need to:
+ - create a file descriptor for event notification using eventfd(2);
+ - open a control file to be monitored (e.g. memory.usage_in_bytes);
+ - write "<event_fd> <control_fd> <args>" to cgroup.event_control.
+   Interpretation of args is defined by control file implementation;
+eventfd will be woken up by control file implementation or when the
+cgroup is removed.
+To unregister a notification handler, just close the eventfd.
+NOTE: Support of notifications should be implemented for the control
+file. See documentation for the subsystem.
 3. Kernel API
 =============
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 2a59d3101e5d..b4f2201321cd 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -235,6 +235,10 @@ struct cgroup {
        /* For RCU-protected deletion */
        struct rcu_head rcu_head;
+        /* List of events which userspace wants to receive */
+        struct list_head event_list;
+        spinlock_t event_list_lock;
 };
 /*
@@ -378,6 +382,26 @@ struct cftype {
        int (*trigger)(struct cgroup *cgrp, unsigned int event);
        int (*release)(struct inode *inode, struct file *file);
+        /*
+         * register_event() callback will be used to add new userspace
+         * waiter for changes related to the cftype. Implement it if
+         * you want to provide this functionality. Use eventfd_signal()
+         * on eventfd to send notification to userspace.
+         */
+        int (*register_event)(struct cgroup *cgrp, struct cftype *cft,
+                        struct eventfd_ctx *eventfd, const char *args);
+        /*
+         * unregister_event() callback will be called when userspace
+         * closes the eventfd or when the cgroup is removed.
+         * This callback must be implemented if you want to provide
+         * notification functionality.
+         *
+         * Be careful. It can be called after destroy(), so you have
+         * to keep all necessary data until all events are removed.
+         */
+        int (*unregister_event)(struct cgroup *cgrp, struct cftype *cft,
+                        struct eventfd_ctx *eventfd);
 };
 struct cgroup_scanner {
diff --git a/init/Kconfig b/init/Kconfig
index 089a230e5652..eb77e8ccde1c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -463,6 +463,7 @@ config HAVE_UNSTABLE_SCHED_CLOCK
 menuconfig CGROUPS
        boolean "Control Group support"
+        depends on EVENTFD
        help
          This option adds support for grouping sets of processes together, for
          use with process control subsystems such as Cpusets, CFS, memory
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1bf4d6db54ab..ea94984a3895 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
 *  Based originally on the cpuset system, extracted by Paul Menage
 *  Copyright (C) 2006 Google, Inc
 *
+ *  Notifications support
+ *  Copyright (C) 2009 Nokia Corporation
+ *  Author: Kirill A. Shutemov
+ *
 *  Copyright notices from the original cpuset code:
 *  --------------------------------------------------
 *  Copyright (C) 2003 BULL SA.
@@ -53,6 +57,8 @@
 #include <linux/pid_namespace.h>
 #include <linux/idr.h>
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
+#include <linux/eventfd.h>
+#include <linux/poll.h>
 #include <asm/atomic.h>
@@ -152,6 +158,35 @@ struct css_id {
        unsigned short stack[0]; /* Array of Length (depth+1) */
 };
+/*
+ * cgroup_event represents an event which userspace wants to receive.
+ */
+struct cgroup_event {
+        /*
+         * Cgroup which the event belongs to.
+         */
+        struct cgroup *cgrp;
+        /*
+         * Control file which the event is associated with.
+         */
+        struct cftype *cft;
+        /*
+         * eventfd to signal userspace about the event.
+         */
+        struct eventfd_ctx *eventfd;
+        /*
+         * Each of these is stored in a list by the cgroup.
+         */
+        struct list_head list;
+        /*
+         * All fields below are needed to unregister the event when
+         * userspace closes the eventfd:
+         *  pt     - poll table handed to the eventfd's poll();
+         *  wqh    - wait queue head recorded by the poll table callback;
+         *  wait   - wait queue entry whose callback is cgroup_event_wake();
+         *  remove - work item that runs cgroup_event_remove().
+         */
+        poll_table pt;
+        wait_queue_head_t *wqh;
+        wait_queue_t wait;
+        struct work_struct remove;
+};
 /* The list of hierarchy roots */
@@ -760,14 +795,28 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
 static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 {
        struct cgroup_subsys *ss;
+        struct cgroup_event *event, *tmp;
        int ret = 0;
        for_each_subsys(cgrp->root, ss)
                if (ss->pre_destroy) {
                        ret = ss->pre_destroy(ss, cgrp);
                        if (ret)
-                                break;
+                                goto out;
                }
+        /*
+         * Unregister events and notify userspace.
+         *
+         * Only reached when no pre_destroy() callback failed. Under
+         * event_list_lock every event is unlinked from the cgroup's
+         * list, its eventfd is signalled, and the teardown itself
+         * (event->remove) is handed to a workqueue.
+         */
+        spin_lock(&cgrp->event_list_lock);
+        list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
+                list_del(&event->list);
+                eventfd_signal(event->eventfd, 1);
+                schedule_work(&event->remove);
+        }
+        spin_unlock(&cgrp->event_list_lock);
+out:
        return ret;
 }
@@ -1239,6 +1288,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
        INIT_LIST_HEAD(&cgrp->release_list);
        INIT_LIST_HEAD(&cgrp->pidlists);
        mutex_init(&cgrp->pidlist_mutex);
+        INIT_LIST_HEAD(&cgrp->event_list);
+        spin_lock_init(&cgrp->event_list_lock);
 }
 static void init_cgroup_root(struct cgroupfs_root *root)
@@ -2077,6 +2128,16 @@ static const struct inode_operations cgroup_dir_inode_operations = {
        .rename = cgroup_rename,
 };
+/*
+ * Map an open file to its cftype.
+ *
+ * Returns ERR_PTR(-EINVAL) when the file's inode does not use
+ * cgroup_file_operations, i.e. when it is not a cgroup control file.
+ */
+static inline struct cftype *__file_cft(struct file *file)
+{
+        struct dentry *dentry = file->f_dentry;
+        if (dentry->d_inode->i_fop == &cgroup_file_operations)
+                return __d_cft(dentry);
+        return ERR_PTR(-EINVAL);
+}
 static int cgroup_create_file(struct dentry *dentry, mode_t mode,
                                struct super_block *sb)
 {
@@ -2931,6 +2992,166 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
 }
 /*
+ * Unregister event and free resources.
+ *
+ * Gets called from workqueue.
+ */
+static void cgroup_event_remove(struct work_struct *work)
+{
+        struct cgroup_event *event = container_of(work, struct cgroup_event,
+                        remove);
+        struct cgroup *cgrp = event->cgrp;
+        /* TODO: check return code */
+        event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+        /*
+         * Leave the wait queue before dropping our eventfd reference.
+         * event->wqh was handed to us by the eventfd's poll(), so it
+         * presumably lives inside the eventfd context; if ours is the
+         * last reference, eventfd_ctx_put() may free it and a later
+         * remove_wait_queue() would touch freed memory.
+         */
+        remove_wait_queue(event->wqh, &event->wait);
+        eventfd_ctx_put(event->eventfd);
+        kfree(event);
+}
+/*
+ * Gets called on POLLHUP on eventfd when user closes it.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ */
+static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
+                int sync, void *key)
+{
+        struct cgroup_event *ev;
+        struct cgroup *owner;
+        /* Nothing to do unless the wakeup carries POLLHUP. */
+        if (!((unsigned long)key & POLLHUP))
+                return 0;
+        ev = container_of(wait, struct cgroup_event, wait);
+        owner = ev->cgrp;
+        /* Take the event off the cgroup's list under the list lock. */
+        spin_lock(&owner->event_list_lock);
+        list_del(&ev->list);
+        spin_unlock(&owner->event_list_lock);
+        /*
+         * This runs in atomic context while cgroup_event_remove() may
+         * sleep, so the actual teardown is deferred to a workqueue.
+         */
+        schedule_work(&ev->remove);
+        return 0;
+}
+/*
+ * poll_table callback: record the wait queue head in the event and
+ * enqueue the event's wait entry on it.
+ */
+static void cgroup_event_ptable_queue_proc(struct file *file,
+                wait_queue_head_t *wqh, poll_table *pt)
+{
+        struct cgroup_event *ev;
+        ev = container_of(pt, struct cgroup_event, pt);
+        /* Saved so that the entry can be taken off this queue later. */
+        ev->wqh = wqh;
+        add_wait_queue(wqh, &ev->wait);
+}
+/*
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation;
+ * args is passed on as an empty string when nothing follows <control_fd>.
+ *
+ * @cgrp:   cgroup the event is registered for.
+ * @cft:    cftype of the file being written to; not used here.
+ * @buffer: NUL-terminated input string.
+ *
+ * Returns 0 on success, and also when the eventfd already reports
+ * POLLHUP (in which case nothing stays registered). Returns -EINVAL on
+ * malformed input or when the control file lacks the event callbacks,
+ * -ENOMEM, -EBADF, or the error reported by the eventfd lookup, the
+ * permission check or register_event().
+ */
+static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
+                                      const char *buffer)
+{
+        struct cgroup_event *event = NULL;
+        unsigned int efd, cfd;
+        struct file *efile = NULL;
+        struct file *cfile = NULL;
+        char *endp;
+        int ret;
+        efd = simple_strtoul(buffer, &endp, 10);
+        if (*endp != ' ')
+                return -EINVAL;
+        buffer = endp + 1;
+        cfd = simple_strtoul(buffer, &endp, 10);
+        if ((*endp != ' ') && (*endp != '\0'))
+                return -EINVAL;
+        /*
+         * Skip the separator only if there is one: when the input ends
+         * right after <control_fd>, args must stay on the terminating
+         * NUL instead of pointing past the end of the buffer.
+         */
+        buffer = (*endp == ' ') ? endp + 1 : endp;
+        event = kzalloc(sizeof(*event), GFP_KERNEL);
+        if (!event)
+                return -ENOMEM;
+        event->cgrp = cgrp;
+        INIT_LIST_HEAD(&event->list);
+        init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
+        init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
+        INIT_WORK(&event->remove, cgroup_event_remove);
+        efile = eventfd_fget(efd);
+        if (IS_ERR(efile)) {
+                ret = PTR_ERR(efile);
+                goto fail;
+        }
+        event->eventfd = eventfd_ctx_fileget(efile);
+        if (IS_ERR(event->eventfd)) {
+                ret = PTR_ERR(event->eventfd);
+                goto fail;
+        }
+        cfile = fget(cfd);
+        if (!cfile) {
+                ret = -EBADF;
+                goto fail;
+        }
+        /* the process need read permission on control file */
+        ret = file_permission(cfile, MAY_READ);
+        if (ret < 0)
+                goto fail;
+        event->cft = __file_cft(cfile);
+        if (IS_ERR(event->cft)) {
+                ret = PTR_ERR(event->cft);
+                goto fail;
+        }
+        if (!event->cft->register_event || !event->cft->unregister_event) {
+                ret = -EINVAL;
+                goto fail;
+        }
+        ret = event->cft->register_event(cgrp, event->cft,
+                        event->eventfd, buffer);
+        if (ret)
+                goto fail;
+        /*
+         * poll() hooks event->wait into the eventfd wait queue through
+         * cgroup_event_ptable_queue_proc(). If the eventfd already
+         * reports POLLHUP there is nobody left to notify: undo the
+         * registration and report success.
+         */
+        if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
+                event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+                ret = 0;
+                goto fail;
+        }
+        spin_lock(&cgrp->event_list_lock);
+        list_add(&event->list, &cgrp->event_list);
+        spin_unlock(&cgrp->event_list_lock);
+        /* The file references are dropped only after the event is listed. */
+        fput(cfile);
+        fput(efile);
+        return 0;
+fail:
+        /*
+         * event->wqh is set only once poll() has queued event->wait.
+         * Unhook the entry before the event is freed and while our
+         * eventfd references are still held, so that no dangling entry
+         * is left on the wait queue.
+         */
+        if (event->wqh)
+                remove_wait_queue(event->wqh, &event->wait);
+        if (cfile)
+                fput(cfile);
+        if (event->eventfd && !IS_ERR(event->eventfd))
+                eventfd_ctx_put(event->eventfd);
+        if (!IS_ERR_OR_NULL(efile))
+                fput(efile);
+        kfree(event);
+        return ret;
+}
+/*
 * for the common functions, 'private' gives the type of file
 */
 /* for hysterical raisins, we can't put this on the older files */
@@ -2955,6 +3176,11 @@ static struct cftype files[] = {
                .read_u64 = cgroup_read_notify_on_release,
                .write_u64 = cgroup_write_notify_on_release,
        },
+        {
+                .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
+                .write_string = cgroup_write_event_control,
+                .mode = S_IWUGO,
+        },
 };
 static struct cftype cft_release_agent = {
author	Kirill A. Shutemov <kirill@shutemov.name>	2010-03-10 18:22:20 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-03-12 18:52:37 -0500
commit	0dea116876eefc9c7ca9c5d74fe665481e499fa3 (patch)
tree	446ef64c99a234cf076b6d43efe42c8b48a928c7
parent	483c30b514bd3037fa3f19fa42327c94c10f51c8 (diff)

diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index c0358c30c64f..fd588ff0e296 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt
@@ -23,6 +23,7 @@ CONTENTS:
23	2.1 Basic Usage	23	2.1 Basic Usage
24	2.2 Attaching processes	24	2.2 Attaching processes
25	2.3 Mounting hierarchies by name	25	2.3 Mounting hierarchies by name
		26	2.4 Notification API
26	3. Kernel API	27	3. Kernel API
27	3.1 Overview	28	3.1 Overview
28	3.2 Synchronization	29	3.2 Synchronization
@@ -435,6 +436,25 @@ you give a subsystem a name.
435	The name of the subsystem appears as part of the hierarchy description	436	The name of the subsystem appears as part of the hierarchy description
436	in /proc/mounts and /proc/<pid>/cgroups.	437	in /proc/mounts and /proc/<pid>/cgroups.
437		438
		439	2.4 Notification API
		440	--------------------
		441
		442	There is mechanism which allows to get notifications about changing
		443	status of a cgroup.
		444
		445	To register new notification handler you need:
		446	- create a file descriptor for event notification using eventfd(2);
		447	- open a control file to be monitored (e.g. memory.usage_in_bytes);
		448	- write "<event_fd> <control_fd> <args>" to cgroup.event_control.
		449	Interpretation of args is defined by control file implementation;
		450
		451	eventfd will be woken up by control file implementation or when the
		452	cgroup is removed.
		453
		454	To unregister notification handler just close eventfd.
		455
		456	NOTE: Support of notifications should be implemented for the control
		457	file. See documentation for the subsystem.
438		458
439	3. Kernel API	459	3. Kernel API
440	=============	460	=============


diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 2a59d3101e5d..b4f2201321cd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h
@@ -235,6 +235,10 @@ struct cgroup {
235		235
236	/* For RCU-protected deletion */	236	/* For RCU-protected deletion */
237	struct rcu_head rcu_head;	237	struct rcu_head rcu_head;
		238
		239	/* List of events which userspace want to recieve */
		240	struct list_head event_list;
		241	spinlock_t event_list_lock;
238	};	242	};
239		243
240	/*	244	/*
@@ -378,6 +382,26 @@ struct cftype {
378	int (trigger)(struct cgroup cgrp, unsigned int event);	382	int (trigger)(struct cgroup cgrp, unsigned int event);
379		383
380	int (release)(struct inode inode, struct file *file);	384	int (release)(struct inode inode, struct file *file);
		385
		386	/*
		387	* register_event() callback will be used to add new userspace
		388	* waiter for changes related to the cftype. Implement it if
		389	* you want to provide this functionality. Use eventfd_signal()
		390	* on eventfd to send notification to userspace.
		391	*/
		392	int (register_event)(struct cgroup cgrp, struct cftype *cft,
		393	struct eventfd_ctx eventfd, const char args);
		394	/*
		395	* unregister_event() callback will be called when userspace
		396	* closes the eventfd or on cgroup removing.
		397	* This callback must be implemented, if you want provide
		398	* notification functionality.
		399	*
		400	* Be careful. It can be called after destroy(), so you have
		401	* to keep all nesessary data, until all events are removed.
		402	*/
		403	int (unregister_event)(struct cgroup cgrp, struct cftype *cft,
		404	struct eventfd_ctx *eventfd);
381	};	405	};
382		406
383	struct cgroup_scanner {	407	struct cgroup_scanner {


diff --git a/init/Kconfig b/init/Kconfig index 089a230e5652..eb77e8ccde1c 100644 --- a/init/Kconfig +++ b/init/Kconfig
@@ -463,6 +463,7 @@ config HAVE_UNSTABLE_SCHED_CLOCK
463		463
464	menuconfig CGROUPS	464	menuconfig CGROUPS
465	boolean "Control Group support"	465	boolean "Control Group support"
		466	depends on EVENTFD
466	help	467	help
467	This option adds support for grouping sets of processes together, for	468	This option adds support for grouping sets of processes together, for
468	use with process control subsystems such as Cpusets, CFS, memory	469	use with process control subsystems such as Cpusets, CFS, memory


diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1bf4d6db54ab..ea94984a3895 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
4	* Based originally on the cpuset system, extracted by Paul Menage	4	* Based originally on the cpuset system, extracted by Paul Menage
5	* Copyright (C) 2006 Google, Inc	5	* Copyright (C) 2006 Google, Inc
6	*	6	*
		7	* Notifications support
		8	* Copyright (C) 2009 Nokia Corporation
		9	* Author: Kirill A. Shutemov
		10	*
7	* Copyright notices from the original cpuset code:	11	* Copyright notices from the original cpuset code:
8	* --------------------------------------------------	12	* --------------------------------------------------
9	* Copyright (C) 2003 BULL SA.	13	* Copyright (C) 2003 BULL SA.
@@ -53,6 +57,8 @@
53	#include <linux/pid_namespace.h>	57	#include <linux/pid_namespace.h>
54	#include <linux/idr.h>	58	#include <linux/idr.h>
55	#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */	59	#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
		60	#include <linux/eventfd.h>
		61	#include <linux/poll.h>
56		62
57	#include <asm/atomic.h>	63	#include <asm/atomic.h>
58		64
@@ -152,6 +158,35 @@ struct css_id {
152	unsigned short stack[0]; /* Array of Length (depth+1) */	158	unsigned short stack[0]; /* Array of Length (depth+1) */
153	};	159	};
154		160
		161	/*
		162	* cgroup_event represents events which userspace want to recieve.
		163	*/
		164	struct cgroup_event {
		165	/*
		166	* Cgroup which the event belongs to.
		167	*/
		168	struct cgroup *cgrp;
		169	/*
		170	* Control file which the event associated.
		171	*/
		172	struct cftype *cft;
		173	/*
		174	* eventfd to signal userspace about the event.
		175	*/
		176	struct eventfd_ctx *eventfd;
		177	/*
		178	* Each of these stored in a list by the cgroup.
		179	*/
		180	struct list_head list;
		181	/*
		182	* All fields below needed to unregister event when
		183	* userspace closes eventfd.
		184	*/
		185	poll_table pt;
		186	wait_queue_head_t *wqh;
		187	wait_queue_t wait;
		188	struct work_struct remove;
		189	};
155		190
156	/* The list of hierarchy roots */	191	/* The list of hierarchy roots */
157		192
@@ -760,14 +795,28 @@ static struct inode cgroup_new_inode(mode_t mode, struct super_block sb)
760	static int cgroup_call_pre_destroy(struct cgroup *cgrp)	795	static int cgroup_call_pre_destroy(struct cgroup *cgrp)
761	{	796	{
762	struct cgroup_subsys *ss;	797	struct cgroup_subsys *ss;
		798	struct cgroup_event event, tmp;
763	int ret = 0;	799	int ret = 0;
764		800
765	for_each_subsys(cgrp->root, ss)	801	for_each_subsys(cgrp->root, ss)
766	if (ss->pre_destroy) {	802	if (ss->pre_destroy) {
767	ret = ss->pre_destroy(ss, cgrp);	803	ret = ss->pre_destroy(ss, cgrp);
768	if (ret)	804	if (ret)
769	break;	805	goto out;
770	}	806	}
		807
		808	/*
		809	* Unregister events and notify userspace.
		810	*/
		811	spin_lock(&cgrp->event_list_lock);
		812	list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
		813	list_del(&event->list);
		814	eventfd_signal(event->eventfd, 1);
		815	schedule_work(&event->remove);
		816	}
		817	spin_unlock(&cgrp->event_list_lock);
		818
		819	out:
771	return ret;	820	return ret;
772	}	821	}
773		822
@@ -1239,6 +1288,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1239	INIT_LIST_HEAD(&cgrp->release_list);	1288	INIT_LIST_HEAD(&cgrp->release_list);
1240	INIT_LIST_HEAD(&cgrp->pidlists);	1289	INIT_LIST_HEAD(&cgrp->pidlists);
1241	mutex_init(&cgrp->pidlist_mutex);	1290	mutex_init(&cgrp->pidlist_mutex);
		1291	INIT_LIST_HEAD(&cgrp->event_list);
		1292	spin_lock_init(&cgrp->event_list_lock);
1242	}	1293	}
1243		1294
1244	static void init_cgroup_root(struct cgroupfs_root *root)	1295	static void init_cgroup_root(struct cgroupfs_root *root)
@@ -2077,6 +2128,16 @@ static const struct inode_operations cgroup_dir_inode_operations = {
2077	.rename = cgroup_rename,	2128	.rename = cgroup_rename,
2078	};	2129	};
2079		2130
		2131	/*
		2132	* Check if a file is a control file
		2133	*/
		2134	static inline struct cftype __file_cft(struct file file)
		2135	{
		2136	if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
		2137	return ERR_PTR(-EINVAL);
		2138	return __d_cft(file->f_dentry);
		2139	}
		2140
2080	static int cgroup_create_file(struct dentry *dentry, mode_t mode,	2141	static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2081	struct super_block *sb)	2142	struct super_block *sb)
2082	{	2143	{
@@ -2931,6 +2992,166 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2931	}	2992	}
2932		2993
2933	/*	2994	/*
		2995	* Unregister event and free resources.
		2996	*
		2997	* Gets called from workqueue.
		2998	*/
		2999	static void cgroup_event_remove(struct work_struct *work)
		3000	{
		3001	struct cgroup_event *event = container_of(work, struct cgroup_event,
		3002	remove);
		3003	struct cgroup *cgrp = event->cgrp;
		3004
		3005	/* TODO: check return code */
		3006	event->cft->unregister_event(cgrp, event->cft, event->eventfd);
		3007
		3008	eventfd_ctx_put(event->eventfd);
		3009	remove_wait_queue(event->wqh, &event->wait);
		3010	kfree(event);
		3011	}
		3012
		3013	/*
		3014	* Gets called on POLLHUP on eventfd when user closes it.
		3015	*
		3016	* Called with wqh->lock held and interrupts disabled.
		3017	*/
		3018	static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
		3019	int sync, void *key)
		3020	{
		3021	struct cgroup_event *event = container_of(wait,
		3022	struct cgroup_event, wait);
		3023	struct cgroup *cgrp = event->cgrp;
		3024	unsigned long flags = (unsigned long)key;
		3025
		3026	if (flags & POLLHUP) {
		3027	spin_lock(&cgrp->event_list_lock);
		3028	list_del(&event->list);
		3029	spin_unlock(&cgrp->event_list_lock);
		3030	/*
		3031	* We are in atomic context, but cgroup_event_remove() may
		3032	* sleep, so we have to call it in workqueue.
		3033	*/
		3034	schedule_work(&event->remove);
		3035	}
		3036
		3037	return 0;
		3038	}
		3039
		3040	static void cgroup_event_ptable_queue_proc(struct file *file,
		3041	wait_queue_head_t wqh, poll_table pt)
		3042	{
		3043	struct cgroup_event *event = container_of(pt,
		3044	struct cgroup_event, pt);
		3045
		3046	event->wqh = wqh;
		3047	add_wait_queue(wqh, &event->wait);
		3048	}
		3049
		3050	/*
		3051	* Parse input and register new cgroup event handler.
		3052	*
		3053	* Input must be in format '<event_fd> <control_fd> <args>'.
		3054	* Interpretation of args is defined by control file implementation.
		3055	*/
		3056	static int cgroup_write_event_control(struct cgroup cgrp, struct cftype cft,
		3057	const char *buffer)
		3058	{
		3059	struct cgroup_event *event = NULL;
		3060	unsigned int efd, cfd;
		3061	struct file *efile = NULL;
		3062	struct file *cfile = NULL;
		3063	char *endp;
		3064	int ret;
		3065
		3066	efd = simple_strtoul(buffer, &endp, 10);
		3067	if (*endp != ' ')
		3068	return -EINVAL;
		3069	buffer = endp + 1;
		3070
		3071	cfd = simple_strtoul(buffer, &endp, 10);
		3072	if ((endp != ' ') && (endp != '\0'))
		3073	return -EINVAL;
		3074	buffer = endp + 1;
		3075
		3076	event = kzalloc(sizeof(*event), GFP_KERNEL);
		3077	if (!event)
		3078	return -ENOMEM;
		3079	event->cgrp = cgrp;
		3080	INIT_LIST_HEAD(&event->list);
		3081	init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
		3082	init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
		3083	INIT_WORK(&event->remove, cgroup_event_remove);
		3084
		3085	efile = eventfd_fget(efd);
		3086	if (IS_ERR(efile)) {
		3087	ret = PTR_ERR(efile);
		3088	goto fail;
		3089	}
		3090
		3091	event->eventfd = eventfd_ctx_fileget(efile);
		3092	if (IS_ERR(event->eventfd)) {
		3093	ret = PTR_ERR(event->eventfd);
		3094	goto fail;
		3095	}
		3096
		3097	cfile = fget(cfd);
		3098	if (!cfile) {
		3099	ret = -EBADF;
		3100	goto fail;
		3101	}
		3102
		3103	/* the process need read permission on control file */
		3104	ret = file_permission(cfile, MAY_READ);
		3105	if (ret < 0)
		3106	goto fail;
		3107
		3108	event->cft = __file_cft(cfile);
		3109	if (IS_ERR(event->cft)) {
		3110	ret = PTR_ERR(event->cft);
		3111	goto fail;
		3112	}
		3113
		3114	if (!event->cft->register_event \|\| !event->cft->unregister_event) {
		3115	ret = -EINVAL;
		3116	goto fail;
		3117	}
		3118
		3119	ret = event->cft->register_event(cgrp, event->cft,
		3120	event->eventfd, buffer);
		3121	if (ret)
		3122	goto fail;
		3123
		3124	if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
		3125	event->cft->unregister_event(cgrp, event->cft, event->eventfd);
		3126	ret = 0;
		3127	goto fail;
		3128	}
		3129
		3130	spin_lock(&cgrp->event_list_lock);
		3131	list_add(&event->list, &cgrp->event_list);
		3132	spin_unlock(&cgrp->event_list_lock);
		3133
		3134	fput(cfile);
		3135	fput(efile);
		3136
		3137	return 0;
		3138
		3139	fail:
		3140	if (cfile)
		3141	fput(cfile);
		3142
		3143	if (event && event->eventfd && !IS_ERR(event->eventfd))
		3144	eventfd_ctx_put(event->eventfd);
		3145
		3146	if (!IS_ERR_OR_NULL(efile))
		3147	fput(efile);
		3148
		3149	kfree(event);
		3150
		3151	return ret;
		3152	}
		3153
		3154	/*
2934	* for the common functions, 'private' gives the type of file	3155	* for the common functions, 'private' gives the type of file
2935	*/	3156	*/
2936	/* for hysterical raisins, we can't put this on the older files */	3157	/* for hysterical raisins, we can't put this on the older files */
@@ -2955,6 +3176,11 @@ static struct cftype files[] = {
2955	.read_u64 = cgroup_read_notify_on_release,	3176	.read_u64 = cgroup_read_notify_on_release,
2956	.write_u64 = cgroup_write_notify_on_release,	3177	.write_u64 = cgroup_write_notify_on_release,
2957	},	3178	},
		3179	{
		3180	.name = CGROUP_FILE_GENERIC_PREFIX "event_control",
		3181	.write_string = cgroup_write_event_control,
		3182	.mode = S_IWUGO,
		3183	},
2958	};	3184	};
2959		3185
2960	static struct cftype cft_release_agent = {	3186	static struct cftype cft_release_agent = {