aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKirill A. Shutemov <kirill@shutemov.name>2010-03-10 18:22:20 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2010-03-12 18:52:37 -0500
commit0dea116876eefc9c7ca9c5d74fe665481e499fa3 (patch)
tree446ef64c99a234cf076b6d43efe42c8b48a928c7
parent483c30b514bd3037fa3f19fa42327c94c10f51c8 (diff)
cgroup: implement eventfd-based generic API for notifications
This patchset introduces an eventfd-based API for notifications in cgroups and implements memory notifications on top of it. It uses statistics in the memory controller to track memory usage. Output of time(1) on building kernel on tmpfs: Root cgroup before changes: make -j2 506.37 user 60.93s system 193% cpu 4:52.77 total Non-root cgroup before changes: make -j2 507.14 user 62.66s system 193% cpu 4:54.74 total Root cgroup after changes (0 thresholds): make -j2 507.13 user 62.20s system 193% cpu 4:53.55 total Non-root cgroup after changes (0 thresholds): make -j2 507.70 user 64.20s system 193% cpu 4:55.70 total Root cgroup after changes (1 thresholds, never crossed): make -j2 506.97 user 62.20s system 193% cpu 4:53.90 total Non-root cgroup after changes (1 thresholds, never crossed): make -j2 507.55 user 64.08s system 193% cpu 4:55.63 total This patch: Introduce the write-only file "cgroup.event_control" in every cgroup. To register new notification handler you need: - create an eventfd; - open a control file to be monitored. Callbacks register_event() and unregister_event() must be defined for the control file; - write "<event_fd> <control_fd> <args>" to cgroup.event_control. Interpretation of args is defined by control file implementation; eventfd will be woken up by control file implementation or when the cgroup is removed. To unregister notification handler just close eventfd. If you need notification functionality for a control file you have to implement callbacks register_event() and unregister_event() in the struct cftype. [kamezawa.hiroyu@jp.fujitsu.com: Kconfig fix] Signed-off-by: Kirill A. 
Shutemov <kirill@shutemov.name> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Paul Menage <menage@google.com> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Pavel Emelyanov <xemul@openvz.org> Cc: Dan Malek <dan@embeddedalley.com> Cc: Vladislav Buzov <vbuzov@embeddedalley.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Cc: Alexander Shishkin <virtuoso@slind.org> Cc: Davide Libenzi <davidel@xmailserver.org> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--Documentation/cgroups/cgroups.txt20
-rw-r--r--include/linux/cgroup.h24
-rw-r--r--init/Kconfig1
-rw-r--r--kernel/cgroup.c228
4 files changed, 272 insertions, 1 deletions
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index c0358c30c64f..fd588ff0e296 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -23,6 +23,7 @@ CONTENTS:
23 2.1 Basic Usage 23 2.1 Basic Usage
24 2.2 Attaching processes 24 2.2 Attaching processes
25 2.3 Mounting hierarchies by name 25 2.3 Mounting hierarchies by name
26 2.4 Notification API
263. Kernel API 273. Kernel API
27 3.1 Overview 28 3.1 Overview
28 3.2 Synchronization 29 3.2 Synchronization
@@ -435,6 +436,25 @@ you give a subsystem a name.
435The name of the subsystem appears as part of the hierarchy description 436The name of the subsystem appears as part of the hierarchy description
436in /proc/mounts and /proc/<pid>/cgroups. 437in /proc/mounts and /proc/<pid>/cgroups.
437 438
4392.4 Notification API
440--------------------
441
442There is a mechanism which allows userspace to get notifications about
443the changing status of a cgroup.
444
445To register new notification handler you need:
446 - create a file descriptor for event notification using eventfd(2);
447 - open a control file to be monitored (e.g. memory.usage_in_bytes);
448 - write "<event_fd> <control_fd> <args>" to cgroup.event_control.
449 Interpretation of args is defined by control file implementation;
450
451eventfd will be woken up by control file implementation or when the
452cgroup is removed.
453
454To unregister notification handler just close eventfd.
455
456NOTE: Support of notifications should be implemented for the control
457file. See documentation for the subsystem.
438 458
4393. Kernel API 4593. Kernel API
440============= 460=============
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 2a59d3101e5d..b4f2201321cd 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -235,6 +235,10 @@ struct cgroup {
235 235
236 /* For RCU-protected deletion */ 236 /* For RCU-protected deletion */
237 struct rcu_head rcu_head; 237 struct rcu_head rcu_head;
238
239 /* List of events which userspace wants to receive */
240 struct list_head event_list;
241 spinlock_t event_list_lock;
238}; 242};
239 243
240/* 244/*
@@ -378,6 +382,26 @@ struct cftype {
378 int (*trigger)(struct cgroup *cgrp, unsigned int event); 382 int (*trigger)(struct cgroup *cgrp, unsigned int event);
379 383
380 int (*release)(struct inode *inode, struct file *file); 384 int (*release)(struct inode *inode, struct file *file);
385
386 /*
387 * register_event() callback will be used to add new userspace
388 * waiter for changes related to the cftype. Implement it if
389 * you want to provide this functionality. Use eventfd_signal()
390 * on eventfd to send notification to userspace.
391 */
392 int (*register_event)(struct cgroup *cgrp, struct cftype *cft,
393 struct eventfd_ctx *eventfd, const char *args);
394 /*
395 * unregister_event() callback will be called when userspace
396 * closes the eventfd or when the cgroup is removed.
397 * This callback must be implemented if you want to provide
398 * notification functionality.
399 *
400 * Be careful. It can be called after destroy(), so you have
401 * to keep all necessary data until all events are removed.
402 */
403 int (*unregister_event)(struct cgroup *cgrp, struct cftype *cft,
404 struct eventfd_ctx *eventfd);
381}; 405};
382 406
383struct cgroup_scanner { 407struct cgroup_scanner {
diff --git a/init/Kconfig b/init/Kconfig
index 089a230e5652..eb77e8ccde1c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -463,6 +463,7 @@ config HAVE_UNSTABLE_SCHED_CLOCK
463 463
464menuconfig CGROUPS 464menuconfig CGROUPS
465 boolean "Control Group support" 465 boolean "Control Group support"
466 depends on EVENTFD
466 help 467 help
467 This option adds support for grouping sets of processes together, for 468 This option adds support for grouping sets of processes together, for
468 use with process control subsystems such as Cpusets, CFS, memory 469 use with process control subsystems such as Cpusets, CFS, memory
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1bf4d6db54ab..ea94984a3895 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
4 * Based originally on the cpuset system, extracted by Paul Menage 4 * Based originally on the cpuset system, extracted by Paul Menage
5 * Copyright (C) 2006 Google, Inc 5 * Copyright (C) 2006 Google, Inc
6 * 6 *
7 * Notifications support
8 * Copyright (C) 2009 Nokia Corporation
9 * Author: Kirill A. Shutemov
10 *
7 * Copyright notices from the original cpuset code: 11 * Copyright notices from the original cpuset code:
8 * -------------------------------------------------- 12 * --------------------------------------------------
9 * Copyright (C) 2003 BULL SA. 13 * Copyright (C) 2003 BULL SA.
@@ -53,6 +57,8 @@
53#include <linux/pid_namespace.h> 57#include <linux/pid_namespace.h>
54#include <linux/idr.h> 58#include <linux/idr.h>
55#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 59#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
60#include <linux/eventfd.h>
61#include <linux/poll.h>
56 62
57#include <asm/atomic.h> 63#include <asm/atomic.h>
58 64
@@ -152,6 +158,35 @@ struct css_id {
152 unsigned short stack[0]; /* Array of Length (depth+1) */ 158 unsigned short stack[0]; /* Array of Length (depth+1) */
153}; 159};
154 160
161/*
162 * cgroup_event represents events which userspace want to recieve.
163 */
164struct cgroup_event {
165 /*
166 * Cgroup which the event belongs to.
167 */
168 struct cgroup *cgrp;
169 /*
170 * Control file which the event associated.
171 */
172 struct cftype *cft;
173 /*
174 * eventfd to signal userspace about the event.
175 */
176 struct eventfd_ctx *eventfd;
177 /*
178 * Each of these stored in a list by the cgroup.
179 */
180 struct list_head list;
181 /*
182 * All fields below needed to unregister event when
183 * userspace closes eventfd.
184 */
185 poll_table pt;
186 wait_queue_head_t *wqh;
187 wait_queue_t wait;
188 struct work_struct remove;
189};
155 190
156/* The list of hierarchy roots */ 191/* The list of hierarchy roots */
157 192
@@ -760,14 +795,28 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
760static int cgroup_call_pre_destroy(struct cgroup *cgrp) 795static int cgroup_call_pre_destroy(struct cgroup *cgrp)
761{ 796{
762 struct cgroup_subsys *ss; 797 struct cgroup_subsys *ss;
798 struct cgroup_event *event, *tmp;
763 int ret = 0; 799 int ret = 0;
764 800
765 for_each_subsys(cgrp->root, ss) 801 for_each_subsys(cgrp->root, ss)
766 if (ss->pre_destroy) { 802 if (ss->pre_destroy) {
767 ret = ss->pre_destroy(ss, cgrp); 803 ret = ss->pre_destroy(ss, cgrp);
768 if (ret) 804 if (ret)
769 break; 805 goto out;
770 } 806 }
807
808 /*
809 * Unregister events and notify userspace.
810 */
811 spin_lock(&cgrp->event_list_lock);
812 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
813 list_del(&event->list);
814 eventfd_signal(event->eventfd, 1);
815 schedule_work(&event->remove);
816 }
817 spin_unlock(&cgrp->event_list_lock);
818
819out:
771 return ret; 820 return ret;
772} 821}
773 822
@@ -1239,6 +1288,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1239 INIT_LIST_HEAD(&cgrp->release_list); 1288 INIT_LIST_HEAD(&cgrp->release_list);
1240 INIT_LIST_HEAD(&cgrp->pidlists); 1289 INIT_LIST_HEAD(&cgrp->pidlists);
1241 mutex_init(&cgrp->pidlist_mutex); 1290 mutex_init(&cgrp->pidlist_mutex);
1291 INIT_LIST_HEAD(&cgrp->event_list);
1292 spin_lock_init(&cgrp->event_list_lock);
1242} 1293}
1243 1294
1244static void init_cgroup_root(struct cgroupfs_root *root) 1295static void init_cgroup_root(struct cgroupfs_root *root)
@@ -2077,6 +2128,16 @@ static const struct inode_operations cgroup_dir_inode_operations = {
2077 .rename = cgroup_rename, 2128 .rename = cgroup_rename,
2078}; 2129};
2079 2130
2131/*
2132 * Check if a file is a control file
2133 */
2134static inline struct cftype *__file_cft(struct file *file)
2135{
2136 if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
2137 return ERR_PTR(-EINVAL);
2138 return __d_cft(file->f_dentry);
2139}
2140
2080static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2141static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2081 struct super_block *sb) 2142 struct super_block *sb)
2082{ 2143{
@@ -2931,6 +2992,166 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2931} 2992}
2932 2993
2933/* 2994/*
2995 * Unregister event and free resources.
2996 *
2997 * Gets called from workqueue.
2998 */
2999static void cgroup_event_remove(struct work_struct *work)
3000{
3001 struct cgroup_event *event = container_of(work, struct cgroup_event,
3002 remove);
3003 struct cgroup *cgrp = event->cgrp;
3004
3005 /* TODO: check return code */
3006 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3007
3008 eventfd_ctx_put(event->eventfd);
3009 remove_wait_queue(event->wqh, &event->wait);
3010 kfree(event);
3011}
3012
3013/*
3014 * Gets called on POLLHUP on eventfd when user closes it.
3015 *
3016 * Called with wqh->lock held and interrupts disabled.
3017 */
3018static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3019 int sync, void *key)
3020{
3021 struct cgroup_event *event = container_of(wait,
3022 struct cgroup_event, wait);
3023 struct cgroup *cgrp = event->cgrp;
3024 unsigned long flags = (unsigned long)key;
3025
3026 if (flags & POLLHUP) {
3027 spin_lock(&cgrp->event_list_lock);
3028 list_del(&event->list);
3029 spin_unlock(&cgrp->event_list_lock);
3030 /*
3031 * We are in atomic context, but cgroup_event_remove() may
3032 * sleep, so we have to call it in workqueue.
3033 */
3034 schedule_work(&event->remove);
3035 }
3036
3037 return 0;
3038}
3039
3040static void cgroup_event_ptable_queue_proc(struct file *file,
3041 wait_queue_head_t *wqh, poll_table *pt)
3042{
3043 struct cgroup_event *event = container_of(pt,
3044 struct cgroup_event, pt);
3045
3046 event->wqh = wqh;
3047 add_wait_queue(wqh, &event->wait);
3048}
3049
3050/*
3051 * Parse input and register new cgroup event handler.
3052 *
3053 * Input must be in format '<event_fd> <control_fd> <args>'.
3054 * Interpretation of args is defined by control file implementation.
3055 */
3056static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3057 const char *buffer)
3058{
3059 struct cgroup_event *event = NULL;
3060 unsigned int efd, cfd;
3061 struct file *efile = NULL;
3062 struct file *cfile = NULL;
3063 char *endp;
3064 int ret;
3065
3066 efd = simple_strtoul(buffer, &endp, 10);
3067 if (*endp != ' ')
3068 return -EINVAL;
3069 buffer = endp + 1;
3070
3071 cfd = simple_strtoul(buffer, &endp, 10);
3072 if ((*endp != ' ') && (*endp != '\0'))
3073 return -EINVAL;
3074 buffer = endp + 1;
3075
3076 event = kzalloc(sizeof(*event), GFP_KERNEL);
3077 if (!event)
3078 return -ENOMEM;
3079 event->cgrp = cgrp;
3080 INIT_LIST_HEAD(&event->list);
3081 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3082 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
3083 INIT_WORK(&event->remove, cgroup_event_remove);
3084
3085 efile = eventfd_fget(efd);
3086 if (IS_ERR(efile)) {
3087 ret = PTR_ERR(efile);
3088 goto fail;
3089 }
3090
3091 event->eventfd = eventfd_ctx_fileget(efile);
3092 if (IS_ERR(event->eventfd)) {
3093 ret = PTR_ERR(event->eventfd);
3094 goto fail;
3095 }
3096
3097 cfile = fget(cfd);
3098 if (!cfile) {
3099 ret = -EBADF;
3100 goto fail;
3101 }
3102
3103 /* the process need read permission on control file */
3104 ret = file_permission(cfile, MAY_READ);
3105 if (ret < 0)
3106 goto fail;
3107
3108 event->cft = __file_cft(cfile);
3109 if (IS_ERR(event->cft)) {
3110 ret = PTR_ERR(event->cft);
3111 goto fail;
3112 }
3113
3114 if (!event->cft->register_event || !event->cft->unregister_event) {
3115 ret = -EINVAL;
3116 goto fail;
3117 }
3118
3119 ret = event->cft->register_event(cgrp, event->cft,
3120 event->eventfd, buffer);
3121 if (ret)
3122 goto fail;
3123
3124 if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
3125 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3126 ret = 0;
3127 goto fail;
3128 }
3129
3130 spin_lock(&cgrp->event_list_lock);
3131 list_add(&event->list, &cgrp->event_list);
3132 spin_unlock(&cgrp->event_list_lock);
3133
3134 fput(cfile);
3135 fput(efile);
3136
3137 return 0;
3138
3139fail:
3140 if (cfile)
3141 fput(cfile);
3142
3143 if (event && event->eventfd && !IS_ERR(event->eventfd))
3144 eventfd_ctx_put(event->eventfd);
3145
3146 if (!IS_ERR_OR_NULL(efile))
3147 fput(efile);
3148
3149 kfree(event);
3150
3151 return ret;
3152}
3153
3154/*
2934 * for the common functions, 'private' gives the type of file 3155 * for the common functions, 'private' gives the type of file
2935 */ 3156 */
2936/* for hysterical raisins, we can't put this on the older files */ 3157/* for hysterical raisins, we can't put this on the older files */
@@ -2955,6 +3176,11 @@ static struct cftype files[] = {
2955 .read_u64 = cgroup_read_notify_on_release, 3176 .read_u64 = cgroup_read_notify_on_release,
2956 .write_u64 = cgroup_write_notify_on_release, 3177 .write_u64 = cgroup_write_notify_on_release,
2957 }, 3178 },
3179 {
3180 .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3181 .write_string = cgroup_write_event_control,
3182 .mode = S_IWUGO,
3183 },
2958}; 3184};
2959 3185
2960static struct cftype cft_release_agent = { 3186static struct cftype cft_release_agent = {