diff options
-rw-r--r-- | Documentation/cgroups/cgroups.txt | 20 | ||||
-rw-r--r-- | include/linux/cgroup.h | 24 | ||||
-rw-r--r-- | init/Kconfig | 1 | ||||
-rw-r--r-- | kernel/cgroup.c | 228 |
4 files changed, 272 insertions, 1 deletions
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index c0358c30c64f..fd588ff0e296 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt | |||
@@ -23,6 +23,7 @@ CONTENTS: | |||
23 | 2.1 Basic Usage | 23 | 2.1 Basic Usage |
24 | 2.2 Attaching processes | 24 | 2.2 Attaching processes |
25 | 2.3 Mounting hierarchies by name | 25 | 2.3 Mounting hierarchies by name |
26 | 2.4 Notification API | ||
26 | 3. Kernel API | 27 | 3. Kernel API |
27 | 3.1 Overview | 28 | 3.1 Overview |
28 | 3.2 Synchronization | 29 | 3.2 Synchronization |
@@ -435,6 +436,25 @@ you give a subsystem a name. | |||
435 | The name of the subsystem appears as part of the hierarchy description | 436 | The name of the subsystem appears as part of the hierarchy description |
436 | in /proc/mounts and /proc/<pid>/cgroups. | 437 | in /proc/mounts and /proc/<pid>/cgroups. |
437 | 438 | ||
439 | 2.4 Notification API | ||
440 | -------------------- | ||
441 | |||
442 | There is a mechanism which allows getting notifications about the | ||
443 | changing status of a cgroup. | ||
444 | |||
445 | To register a new notification handler you need to: | ||
446 | - create a file descriptor for event notification using eventfd(2); | ||
447 | - open a control file to be monitored (e.g. memory.usage_in_bytes); | ||
448 | - write "<event_fd> <control_fd> <args>" to cgroup.event_control. | ||
449 | Interpretation of args is defined by the control file implementation. | ||
450 | |||
451 | The eventfd will be woken up by the control file implementation or when | ||
452 | the cgroup is removed. | ||
453 | |||
454 | To unregister a notification handler, just close the eventfd. | ||
455 | |||
456 | NOTE: Support for notifications must be implemented by the control | ||
457 | file. See the documentation for the subsystem. | ||
438 | 458 | ||
439 | 3. Kernel API | 459 | 3. Kernel API |
440 | ============= | 460 | ============= |
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 2a59d3101e5d..b4f2201321cd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h | |||
@@ -235,6 +235,10 @@ struct cgroup { | |||
235 | 235 | ||
236 | /* For RCU-protected deletion */ | 236 | /* For RCU-protected deletion */ |
237 | struct rcu_head rcu_head; | 237 | struct rcu_head rcu_head; |
238 | |||
239 | /* List of events which userspace wants to receive */ | ||
240 | struct list_head event_list; | ||
241 | spinlock_t event_list_lock; | ||
238 | }; | 242 | }; |
239 | 243 | ||
240 | /* | 244 | /* |
@@ -378,6 +382,26 @@ struct cftype { | |||
378 | int (*trigger)(struct cgroup *cgrp, unsigned int event); | 382 | int (*trigger)(struct cgroup *cgrp, unsigned int event); |
379 | 383 | ||
380 | int (*release)(struct inode *inode, struct file *file); | 384 | int (*release)(struct inode *inode, struct file *file); |
385 | |||
386 | /* | ||
387 | * register_event() callback will be used to add new userspace | ||
388 | * waiter for changes related to the cftype. Implement it if | ||
389 | * you want to provide this functionality. Use eventfd_signal() | ||
390 | * on eventfd to send notification to userspace. | ||
391 | */ | ||
392 | int (*register_event)(struct cgroup *cgrp, struct cftype *cft, | ||
393 | struct eventfd_ctx *eventfd, const char *args); | ||
394 | /* | ||
395 | * unregister_event() callback will be called when userspace | ||
396 | * closes the eventfd or on cgroup removal. | ||
397 | * This callback must be implemented if you want to provide | ||
398 | * notification functionality. | ||
399 | * | ||
400 | * Be careful. It can be called after destroy(), so you have | ||
401 | * to keep all necessary data until all events are removed. | ||
402 | */ | ||
403 | int (*unregister_event)(struct cgroup *cgrp, struct cftype *cft, | ||
404 | struct eventfd_ctx *eventfd); | ||
381 | }; | 405 | }; |
382 | 406 | ||
383 | struct cgroup_scanner { | 407 | struct cgroup_scanner { |
diff --git a/init/Kconfig b/init/Kconfig index 089a230e5652..eb77e8ccde1c 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -463,6 +463,7 @@ config HAVE_UNSTABLE_SCHED_CLOCK | |||
463 | 463 | ||
464 | menuconfig CGROUPS | 464 | menuconfig CGROUPS |
465 | boolean "Control Group support" | 465 | boolean "Control Group support" |
466 | depends on EVENTFD | ||
466 | help | 467 | help |
467 | This option adds support for grouping sets of processes together, for | 468 | This option adds support for grouping sets of processes together, for |
468 | use with process control subsystems such as Cpusets, CFS, memory | 469 | use with process control subsystems such as Cpusets, CFS, memory |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1bf4d6db54ab..ea94984a3895 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -4,6 +4,10 @@ | |||
4 | * Based originally on the cpuset system, extracted by Paul Menage | 4 | * Based originally on the cpuset system, extracted by Paul Menage |
5 | * Copyright (C) 2006 Google, Inc | 5 | * Copyright (C) 2006 Google, Inc |
6 | * | 6 | * |
7 | * Notifications support | ||
8 | * Copyright (C) 2009 Nokia Corporation | ||
9 | * Author: Kirill A. Shutemov | ||
10 | * | ||
7 | * Copyright notices from the original cpuset code: | 11 | * Copyright notices from the original cpuset code: |
8 | * -------------------------------------------------- | 12 | * -------------------------------------------------- |
9 | * Copyright (C) 2003 BULL SA. | 13 | * Copyright (C) 2003 BULL SA. |
@@ -53,6 +57,8 @@ | |||
53 | #include <linux/pid_namespace.h> | 57 | #include <linux/pid_namespace.h> |
54 | #include <linux/idr.h> | 58 | #include <linux/idr.h> |
55 | #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ | 59 | #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ |
60 | #include <linux/eventfd.h> | ||
61 | #include <linux/poll.h> | ||
56 | 62 | ||
57 | #include <asm/atomic.h> | 63 | #include <asm/atomic.h> |
58 | 64 | ||
@@ -152,6 +158,35 @@ struct css_id { | |||
152 | unsigned short stack[0]; /* Array of Length (depth+1) */ | 158 | unsigned short stack[0]; /* Array of Length (depth+1) */ |
153 | }; | 159 | }; |
154 | 160 | ||
161 | /* | ||
162 | * cgroup_event represents events which userspace wants to receive. | ||
163 | */ | ||
164 | struct cgroup_event { | ||
165 | /* | ||
166 | * Cgroup which the event belongs to. | ||
167 | */ | ||
168 | struct cgroup *cgrp; | ||
169 | /* | ||
170 | * Control file which the event is associated with. | ||
171 | */ | ||
172 | struct cftype *cft; | ||
173 | /* | ||
174 | * eventfd to signal userspace about the event. | ||
175 | */ | ||
176 | struct eventfd_ctx *eventfd; | ||
177 | /* | ||
178 | * Each of these is stored in a list by the cgroup. | ||
179 | */ | ||
180 | struct list_head list; | ||
181 | /* | ||
182 | * All fields below are needed to unregister the event when | ||
183 | * userspace closes the eventfd. | ||
184 | */ | ||
185 | poll_table pt; | ||
186 | wait_queue_head_t *wqh; | ||
187 | wait_queue_t wait; | ||
188 | struct work_struct remove; | ||
189 | }; | ||
155 | 190 | ||
156 | /* The list of hierarchy roots */ | 191 | /* The list of hierarchy roots */ |
157 | 192 | ||
@@ -760,14 +795,28 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) | |||
760 | static int cgroup_call_pre_destroy(struct cgroup *cgrp) | 795 | static int cgroup_call_pre_destroy(struct cgroup *cgrp) |
761 | { | 796 | { |
762 | struct cgroup_subsys *ss; | 797 | struct cgroup_subsys *ss; |
798 | struct cgroup_event *event, *tmp; | ||
763 | int ret = 0; | 799 | int ret = 0; |
764 | 800 | ||
765 | for_each_subsys(cgrp->root, ss) | 801 | for_each_subsys(cgrp->root, ss) |
766 | if (ss->pre_destroy) { | 802 | if (ss->pre_destroy) { |
767 | ret = ss->pre_destroy(ss, cgrp); | 803 | ret = ss->pre_destroy(ss, cgrp); |
768 | if (ret) | 804 | if (ret) |
769 | break; | 805 | goto out; |
770 | } | 806 | } |
807 | |||
808 | /* | ||
809 | * Unregister events and notify userspace. | ||
810 | */ | ||
811 | spin_lock(&cgrp->event_list_lock); | ||
812 | list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { | ||
813 | list_del(&event->list); | ||
814 | eventfd_signal(event->eventfd, 1); | ||
815 | schedule_work(&event->remove); | ||
816 | } | ||
817 | spin_unlock(&cgrp->event_list_lock); | ||
818 | |||
819 | out: | ||
771 | return ret; | 820 | return ret; |
772 | } | 821 | } |
773 | 822 | ||
@@ -1239,6 +1288,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1239 | INIT_LIST_HEAD(&cgrp->release_list); | 1288 | INIT_LIST_HEAD(&cgrp->release_list); |
1240 | INIT_LIST_HEAD(&cgrp->pidlists); | 1289 | INIT_LIST_HEAD(&cgrp->pidlists); |
1241 | mutex_init(&cgrp->pidlist_mutex); | 1290 | mutex_init(&cgrp->pidlist_mutex); |
1291 | INIT_LIST_HEAD(&cgrp->event_list); | ||
1292 | spin_lock_init(&cgrp->event_list_lock); | ||
1242 | } | 1293 | } |
1243 | 1294 | ||
1244 | static void init_cgroup_root(struct cgroupfs_root *root) | 1295 | static void init_cgroup_root(struct cgroupfs_root *root) |
@@ -2077,6 +2128,16 @@ static const struct inode_operations cgroup_dir_inode_operations = { | |||
2077 | .rename = cgroup_rename, | 2128 | .rename = cgroup_rename, |
2078 | }; | 2129 | }; |
2079 | 2130 | ||
2131 | /* | ||
2132 | * Check if a file is a control file | ||
2133 | */ | ||
2134 | static inline struct cftype *__file_cft(struct file *file) | ||
2135 | { | ||
2136 | if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) | ||
2137 | return ERR_PTR(-EINVAL); | ||
2138 | return __d_cft(file->f_dentry); | ||
2139 | } | ||
2140 | |||
2080 | static int cgroup_create_file(struct dentry *dentry, mode_t mode, | 2141 | static int cgroup_create_file(struct dentry *dentry, mode_t mode, |
2081 | struct super_block *sb) | 2142 | struct super_block *sb) |
2082 | { | 2143 | { |
@@ -2931,6 +2992,166 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp, | |||
2931 | } | 2992 | } |
2932 | 2993 | ||
2933 | /* | 2994 | /* |
2995 | * Unregister event and free resources. | ||
2996 | * | ||
2997 | * Gets called from workqueue. | ||
2998 | */ | ||
2999 | static void cgroup_event_remove(struct work_struct *work) | ||
3000 | { | ||
3001 | struct cgroup_event *event = container_of(work, struct cgroup_event, | ||
3002 | remove); | ||
3003 | struct cgroup *cgrp = event->cgrp; | ||
3004 | |||
3005 | /* TODO: check return code */ | ||
3006 | event->cft->unregister_event(cgrp, event->cft, event->eventfd); | ||
3007 | |||
3008 | eventfd_ctx_put(event->eventfd); | ||
3009 | remove_wait_queue(event->wqh, &event->wait); | ||
3010 | kfree(event); | ||
3011 | } | ||
3012 | |||
3013 | /* | ||
3014 | * Gets called on POLLHUP on eventfd when user closes it. | ||
3015 | * | ||
3016 | * Called with wqh->lock held and interrupts disabled. | ||
3017 | */ | ||
3018 | static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, | ||
3019 | int sync, void *key) | ||
3020 | { | ||
3021 | struct cgroup_event *event = container_of(wait, | ||
3022 | struct cgroup_event, wait); | ||
3023 | struct cgroup *cgrp = event->cgrp; | ||
3024 | unsigned long flags = (unsigned long)key; | ||
3025 | |||
3026 | if (flags & POLLHUP) { | ||
3027 | spin_lock(&cgrp->event_list_lock); | ||
3028 | list_del(&event->list); | ||
3029 | spin_unlock(&cgrp->event_list_lock); | ||
3030 | /* | ||
3031 | * We are in atomic context, but cgroup_event_remove() may | ||
3032 | * sleep, so we have to call it in workqueue. | ||
3033 | */ | ||
3034 | schedule_work(&event->remove); | ||
3035 | } | ||
3036 | |||
3037 | return 0; | ||
3038 | } | ||
3039 | |||
3040 | static void cgroup_event_ptable_queue_proc(struct file *file, | ||
3041 | wait_queue_head_t *wqh, poll_table *pt) | ||
3042 | { | ||
3043 | struct cgroup_event *event = container_of(pt, | ||
3044 | struct cgroup_event, pt); | ||
3045 | |||
3046 | event->wqh = wqh; | ||
3047 | add_wait_queue(wqh, &event->wait); | ||
3048 | } | ||
3049 | |||
3050 | /* | ||
3051 | * Parse input and register new cgroup event handler. | ||
3052 | * | ||
3053 | * Input must be in format '<event_fd> <control_fd> <args>'. | ||
3054 | * Interpretation of args is defined by control file implementation. | ||
3055 | */ | ||
3056 | static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | ||
3057 | const char *buffer) | ||
3058 | { | ||
3059 | struct cgroup_event *event = NULL; | ||
3060 | unsigned int efd, cfd; | ||
3061 | struct file *efile = NULL; | ||
3062 | struct file *cfile = NULL; | ||
3063 | char *endp; | ||
3064 | int ret; | ||
3065 | |||
3066 | efd = simple_strtoul(buffer, &endp, 10); | ||
3067 | if (*endp != ' ') | ||
3068 | return -EINVAL; | ||
3069 | buffer = endp + 1; | ||
3070 | |||
3071 | cfd = simple_strtoul(buffer, &endp, 10); | ||
3072 | if ((*endp != ' ') && (*endp != '\0')) | ||
3073 | return -EINVAL; | ||
3074 | buffer = endp + 1; | ||
3075 | |||
3076 | event = kzalloc(sizeof(*event), GFP_KERNEL); | ||
3077 | if (!event) | ||
3078 | return -ENOMEM; | ||
3079 | event->cgrp = cgrp; | ||
3080 | INIT_LIST_HEAD(&event->list); | ||
3081 | init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); | ||
3082 | init_waitqueue_func_entry(&event->wait, cgroup_event_wake); | ||
3083 | INIT_WORK(&event->remove, cgroup_event_remove); | ||
3084 | |||
3085 | efile = eventfd_fget(efd); | ||
3086 | if (IS_ERR(efile)) { | ||
3087 | ret = PTR_ERR(efile); | ||
3088 | goto fail; | ||
3089 | } | ||
3090 | |||
3091 | event->eventfd = eventfd_ctx_fileget(efile); | ||
3092 | if (IS_ERR(event->eventfd)) { | ||
3093 | ret = PTR_ERR(event->eventfd); | ||
3094 | goto fail; | ||
3095 | } | ||
3096 | |||
3097 | cfile = fget(cfd); | ||
3098 | if (!cfile) { | ||
3099 | ret = -EBADF; | ||
3100 | goto fail; | ||
3101 | } | ||
3102 | |||
3103 | /* the process needs read permission on the control file */ | ||
3104 | ret = file_permission(cfile, MAY_READ); | ||
3105 | if (ret < 0) | ||
3106 | goto fail; | ||
3107 | |||
3108 | event->cft = __file_cft(cfile); | ||
3109 | if (IS_ERR(event->cft)) { | ||
3110 | ret = PTR_ERR(event->cft); | ||
3111 | goto fail; | ||
3112 | } | ||
3113 | |||
3114 | if (!event->cft->register_event || !event->cft->unregister_event) { | ||
3115 | ret = -EINVAL; | ||
3116 | goto fail; | ||
3117 | } | ||
3118 | |||
3119 | ret = event->cft->register_event(cgrp, event->cft, | ||
3120 | event->eventfd, buffer); | ||
3121 | if (ret) | ||
3122 | goto fail; | ||
3123 | |||
3124 | if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { | ||
3125 | event->cft->unregister_event(cgrp, event->cft, event->eventfd); | ||
3126 | ret = 0; | ||
3127 | goto fail; | ||
3128 | } | ||
3129 | |||
3130 | spin_lock(&cgrp->event_list_lock); | ||
3131 | list_add(&event->list, &cgrp->event_list); | ||
3132 | spin_unlock(&cgrp->event_list_lock); | ||
3133 | |||
3134 | fput(cfile); | ||
3135 | fput(efile); | ||
3136 | |||
3137 | return 0; | ||
3138 | |||
3139 | fail: | ||
3140 | if (cfile) | ||
3141 | fput(cfile); | ||
3142 | |||
3143 | if (event && event->eventfd && !IS_ERR(event->eventfd)) | ||
3144 | eventfd_ctx_put(event->eventfd); | ||
3145 | |||
3146 | if (!IS_ERR_OR_NULL(efile)) | ||
3147 | fput(efile); | ||
3148 | |||
3149 | kfree(event); | ||
3150 | |||
3151 | return ret; | ||
3152 | } | ||
3153 | |||
3154 | /* | ||
2934 | * for the common functions, 'private' gives the type of file | 3155 | * for the common functions, 'private' gives the type of file |
2935 | */ | 3156 | */ |
2936 | /* for hysterical raisins, we can't put this on the older files */ | 3157 | /* for hysterical raisins, we can't put this on the older files */ |
@@ -2955,6 +3176,11 @@ static struct cftype files[] = { | |||
2955 | .read_u64 = cgroup_read_notify_on_release, | 3176 | .read_u64 = cgroup_read_notify_on_release, |
2956 | .write_u64 = cgroup_write_notify_on_release, | 3177 | .write_u64 = cgroup_write_notify_on_release, |
2957 | }, | 3178 | }, |
3179 | { | ||
3180 | .name = CGROUP_FILE_GENERIC_PREFIX "event_control", | ||
3181 | .write_string = cgroup_write_event_control, | ||
3182 | .mode = S_IWUGO, | ||
3183 | }, | ||
2958 | }; | 3184 | }; |
2959 | 3185 | ||
2960 | static struct cftype cft_release_agent = { | 3186 | static struct cftype cft_release_agent = { |