Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--  kernel/cgroup.c  265
1 file changed, 159 insertions(+), 106 deletions(-)
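
The change below converts the css_set reference count from a kref to a bare atomic_t so the final put can be serialized against css_set_lock: __put_css_set() drops the count locklessly while it cannot reach zero, and only then takes the write lock and re-checks before unlinking. As a rough illustration of that pattern, here is a userspace sketch (not kernel code): obj, obj_put() and unlink_from_table() are hypothetical names, and C11 atomics plus a pthread rwlock stand in for atomic_add_unless() and css_set_lock.

/*
 * Userspace sketch of the "decrement locklessly unless the count could
 * hit zero, otherwise lock and re-check" pattern used by __put_css_set().
 * All names here are illustrative, not part of the kernel patch.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct obj {
	atomic_int refcount;
	pthread_rwlock_t *table_lock;	/* lock readers hold while looking objects up */
	/* ... payload and lookup-table linkage ... */
};

static void obj_put(struct obj *o)
{
	int old = atomic_load(&o->refcount);

	/* Fast path: drop the reference without the lock while count > 1. */
	while (old > 1) {
		if (atomic_compare_exchange_weak(&o->refcount, &old, old - 1))
			return;
	}

	/*
	 * Slow path: the count may reach zero.  Take the lock that guards
	 * the lookup structure, then decrement and re-check, because a
	 * reader may have taken a new reference in the meantime.
	 */
	pthread_rwlock_wrlock(o->table_lock);
	if (atomic_fetch_sub(&o->refcount, 1) != 1) {
		pthread_rwlock_unlock(o->table_lock);
		return;
	}
	/* unlink_from_table(o);  -- hypothetical stand-in for unlink_css_set() */
	pthread_rwlock_unlock(o->table_lock);
	free(o);
}
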
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8c6e1c17e6d3..046c1609606b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -241,7 +241,6 @@ static void unlink_css_set(struct css_set *cg)
 	struct cg_cgroup_link *link;
 	struct cg_cgroup_link *saved_link;
 
-	write_lock(&css_set_lock);
 	hlist_del(&cg->hlist);
 	css_set_count--;
 
@@ -251,16 +250,25 @@ static void unlink_css_set(struct css_set *cg)
 		list_del(&link->cgrp_link_list);
 		kfree(link);
 	}
-
-	write_unlock(&css_set_lock);
 }
 
-static void __release_css_set(struct kref *k, int taskexit)
+static void __put_css_set(struct css_set *cg, int taskexit)
 {
 	int i;
-	struct css_set *cg = container_of(k, struct css_set, ref);
-
+	/*
+	 * Ensure that the refcount doesn't hit zero while any readers
+	 * can see it. Similar to atomic_dec_and_lock(), but for an
+	 * rwlock
+	 */
+	if (atomic_add_unless(&cg->refcount, -1, 1))
+		return;
+	write_lock(&css_set_lock);
+	if (!atomic_dec_and_test(&cg->refcount)) {
+		write_unlock(&css_set_lock);
+		return;
+	}
 	unlink_css_set(cg);
+	write_unlock(&css_set_lock);
 
 	rcu_read_lock();
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
@@ -276,32 +284,22 @@ static void __release_css_set(struct kref *k, int taskexit)
 	kfree(cg);
 }
 
-static void release_css_set(struct kref *k)
-{
-	__release_css_set(k, 0);
-}
-
-static void release_css_set_taskexit(struct kref *k)
-{
-	__release_css_set(k, 1);
-}
-
 /*
  * refcounted get/put for css_set objects
  */
 static inline void get_css_set(struct css_set *cg)
 {
-	kref_get(&cg->ref);
+	atomic_inc(&cg->refcount);
 }
 
 static inline void put_css_set(struct css_set *cg)
 {
-	kref_put(&cg->ref, release_css_set);
+	__put_css_set(cg, 0);
 }
 
 static inline void put_css_set_taskexit(struct css_set *cg)
 {
-	kref_put(&cg->ref, release_css_set_taskexit);
+	__put_css_set(cg, 1);
 }
 
 /*
@@ -427,7 +425,7 @@ static struct css_set *find_css_set(
 		return NULL;
 	}
 
-	kref_init(&res->ref);
+	atomic_set(&res->refcount, 1);
 	INIT_LIST_HEAD(&res->cg_links);
 	INIT_LIST_HEAD(&res->tasks);
 	INIT_HLIST_NODE(&res->hlist);
@@ -870,6 +868,14 @@ static struct super_operations cgroup_ops = {
 	.remount_fs = cgroup_remount,
 };
 
+static void init_cgroup_housekeeping(struct cgroup *cgrp)
+{
+	INIT_LIST_HEAD(&cgrp->sibling);
+	INIT_LIST_HEAD(&cgrp->children);
+	INIT_LIST_HEAD(&cgrp->css_sets);
+	INIT_LIST_HEAD(&cgrp->release_list);
+	init_rwsem(&cgrp->pids_mutex);
+}
 static void init_cgroup_root(struct cgroupfs_root *root)
 {
 	struct cgroup *cgrp = &root->top_cgroup;
@@ -878,10 +884,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
 	cgrp->top_cgroup = cgrp;
-	INIT_LIST_HEAD(&cgrp->sibling);
-	INIT_LIST_HEAD(&cgrp->children);
-	INIT_LIST_HEAD(&cgrp->css_sets);
-	INIT_LIST_HEAD(&cgrp->release_list);
+	init_cgroup_housekeeping(cgrp);
 }
 
 static int cgroup_test_super(struct super_block *sb, void *data)
@@ -1728,7 +1731,7 @@ int cgroup_task_count(const struct cgroup *cgrp)
 
 	read_lock(&css_set_lock);
 	list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
-		count += atomic_read(&link->cg->ref.refcount);
+		count += atomic_read(&link->cg->refcount);
 	}
 	read_unlock(&css_set_lock);
 	return count;
@@ -1997,16 +2000,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
  * but we cannot guarantee that the information we produce is correct
  * unless we produce it entirely atomically.
  *
- * Upon tasks file open(), a struct ctr_struct is allocated, that
- * will have a pointer to an array (also allocated here). The struct
- * ctr_struct * is stored in file->private_data. Its resources will
- * be freed by release() when the file is closed. The array is used
- * to sprintf the PIDs and then used by read().
  */
-struct ctr_struct {
-	char *buf;
-	int bufsz;
-};
 
 /*
  * Load into 'pidarray' up to 'npids' of the tasks using cgroup
@@ -2088,42 +2082,132 @@ static int cmppid(const void *a, const void *b)
 	return *(pid_t *)a - *(pid_t *)b;
 }
 
+
 /*
- * Convert array 'a' of 'npids' pid_t's to a string of newline separated
- * decimal pids in 'buf'. Don't write more than 'sz' chars, but return
- * count 'cnt' of how many chars would be written if buf were large enough.
+ * seq_file methods for the "tasks" file. The seq_file position is the
+ * next pid to display; the seq_file iterator is a pointer to the pid
+ * in the cgroup->tasks_pids array.
  */
-static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
+
+static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
 {
-	int cnt = 0;
-	int i;
+	/*
+	 * Initially we receive a position value that corresponds to
+	 * one more than the last pid shown (or 0 on the first call or
+	 * after a seek to the start). Use a binary-search to find the
+	 * next pid to display, if any
+	 */
+	struct cgroup *cgrp = s->private;
+	int index = 0, pid = *pos;
+	int *iter;
 
-	for (i = 0; i < npids; i++)
-		cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
-	return cnt;
+	down_read(&cgrp->pids_mutex);
+	if (pid) {
+		int end = cgrp->pids_length;
+		int i;
+		while (index < end) {
+			int mid = (index + end) / 2;
+			if (cgrp->tasks_pids[mid] == pid) {
+				index = mid;
+				break;
+			} else if (cgrp->tasks_pids[mid] <= pid)
+				index = mid + 1;
+			else
+				end = mid;
+		}
+	}
+	/* If we're off the end of the array, we're done */
+	if (index >= cgrp->pids_length)
+		return NULL;
+	/* Update the abstract position to be the actual pid that we found */
+	iter = cgrp->tasks_pids + index;
+	*pos = *iter;
+	return iter;
+}
+
+static void cgroup_tasks_stop(struct seq_file *s, void *v)
+{
+	struct cgroup *cgrp = s->private;
+	up_read(&cgrp->pids_mutex);
 }
 
+static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct cgroup *cgrp = s->private;
+	int *p = v;
+	int *end = cgrp->tasks_pids + cgrp->pids_length;
+
+	/*
+	 * Advance to the next pid in the array. If this goes off the
+	 * end, we're done
+	 */
+	p++;
+	if (p >= end) {
+		return NULL;
+	} else {
+		*pos = *p;
+		return p;
+	}
+}
+
+static int cgroup_tasks_show(struct seq_file *s, void *v)
+{
+	return seq_printf(s, "%d\n", *(int *)v);
+}
+
+static struct seq_operations cgroup_tasks_seq_operations = {
+	.start = cgroup_tasks_start,
+	.stop = cgroup_tasks_stop,
+	.next = cgroup_tasks_next,
+	.show = cgroup_tasks_show,
+};
+
+static void release_cgroup_pid_array(struct cgroup *cgrp)
+{
+	down_write(&cgrp->pids_mutex);
+	BUG_ON(!cgrp->pids_use_count);
+	if (!--cgrp->pids_use_count) {
+		kfree(cgrp->tasks_pids);
+		cgrp->tasks_pids = NULL;
+		cgrp->pids_length = 0;
+	}
+	up_write(&cgrp->pids_mutex);
+}
+
+static int cgroup_tasks_release(struct inode *inode, struct file *file)
+{
+	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+
+	if (!(file->f_mode & FMODE_READ))
+		return 0;
+
+	release_cgroup_pid_array(cgrp);
+	return seq_release(inode, file);
+}
+
+static struct file_operations cgroup_tasks_operations = {
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.write = cgroup_file_write,
+	.release = cgroup_tasks_release,
+};
+
 /*
- * Handle an open on 'tasks' file. Prepare a buffer listing the
+ * Handle an open on 'tasks' file. Prepare an array containing the
  * process id's of tasks currently attached to the cgroup being opened.
- *
- * Does not require any specific cgroup mutexes, and does not take any.
  */
+
 static int cgroup_tasks_open(struct inode *unused, struct file *file)
 {
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-	struct ctr_struct *ctr;
 	pid_t *pidarray;
 	int npids;
-	char c;
+	int retval;
 
+	/* Nothing to do for write-only files */
 	if (!(file->f_mode & FMODE_READ))
 		return 0;
 
-	ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
-	if (!ctr)
-		goto err0;
-
 	/*
 	 * If cgroup gets more users after we read count, we won't have
 	 * enough space - tough. This race is indistinguishable to the
@@ -2131,57 +2215,31 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
  * show up until sometime later on.
  */
 	npids = cgroup_task_count(cgrp);
-	if (npids) {
-		pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
-		if (!pidarray)
-			goto err1;
-
-		npids = pid_array_load(pidarray, npids, cgrp);
-		sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
-
-		/* Call pid_array_to_buf() twice, first just to get bufsz */
-		ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
-		ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
-		if (!ctr->buf)
-			goto err2;
-		ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
-
-		kfree(pidarray);
-	} else {
-		ctr->buf = NULL;
-		ctr->bufsz = 0;
-	}
-	file->private_data = ctr;
-	return 0;
-
-err2:
-	kfree(pidarray);
-err1:
-	kfree(ctr);
-err0:
-	return -ENOMEM;
-}
+	pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
+	if (!pidarray)
+		return -ENOMEM;
+	npids = pid_array_load(pidarray, npids, cgrp);
+	sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
 
-static ssize_t cgroup_tasks_read(struct cgroup *cgrp,
-				 struct cftype *cft,
-				 struct file *file, char __user *buf,
-				 size_t nbytes, loff_t *ppos)
-{
-	struct ctr_struct *ctr = file->private_data;
+	/*
+	 * Store the array in the cgroup, freeing the old
+	 * array if necessary
+	 */
+	down_write(&cgrp->pids_mutex);
+	kfree(cgrp->tasks_pids);
+	cgrp->tasks_pids = pidarray;
+	cgrp->pids_length = npids;
+	cgrp->pids_use_count++;
+	up_write(&cgrp->pids_mutex);
 
-	return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
-}
+	file->f_op = &cgroup_tasks_operations;
 
-static int cgroup_tasks_release(struct inode *unused_inode,
-				struct file *file)
-{
-	struct ctr_struct *ctr;
-
-	if (file->f_mode & FMODE_READ) {
-		ctr = file->private_data;
-		kfree(ctr->buf);
-		kfree(ctr);
+	retval = seq_open(file, &cgroup_tasks_seq_operations);
+	if (retval) {
+		release_cgroup_pid_array(cgrp);
+		return retval;
 	}
+	((struct seq_file *)file->private_data)->private = cgrp;
 	return 0;
 }
 
@@ -2210,7 +2268,6 @@ static struct cftype files[] = {
 	{
 		.name = "tasks",
 		.open = cgroup_tasks_open,
-		.read = cgroup_tasks_read,
 		.write_u64 = cgroup_tasks_write,
 		.release = cgroup_tasks_release,
 		.private = FILE_TASKLIST,
@@ -2300,10 +2357,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
 	mutex_lock(&cgroup_mutex);
 
-	INIT_LIST_HEAD(&cgrp->sibling);
-	INIT_LIST_HEAD(&cgrp->children);
-	INIT_LIST_HEAD(&cgrp->css_sets);
-	INIT_LIST_HEAD(&cgrp->release_list);
+	init_cgroup_housekeeping(cgrp);
 
 	cgrp->parent = parent;
 	cgrp->root = parent->root;
@@ -2495,8 +2549,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 int __init cgroup_init_early(void)
 {
 	int i;
-	kref_init(&init_css_set.ref);
-	kref_get(&init_css_set.ref);
+	atomic_set(&init_css_set.refcount, 1);
 	INIT_LIST_HEAD(&init_css_set.cg_links);
 	INIT_LIST_HEAD(&init_css_set.tasks);
 	INIT_HLIST_NODE(&init_css_set.hlist);
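
For reference, the new "tasks" seq_file keys its position on the pid value rather than an array index: cgroup_tasks_start() re-derives the iterator with a binary search under pids_mutex on every restart, so a reader that resumes after a partial read still lands on the next pid even if another open has rebuilt cgrp->tasks_pids in between. Below is a standalone sketch of that iteration contract, using hypothetical pids_start()/pids_next() names and plain arrays instead of the seq_file API; it is illustrative only, not kernel code.

/*
 * Userspace sketch of the "position is the next pid to show" iteration
 * used by the tasks seq_file.  Names are illustrative only.
 */
#include <stdio.h>

/* start(): binary-search the first pid >= *pos, update *pos to that pid. */
static const int *pids_start(const int *pids, int len, int *pos)
{
	int lo = 0, hi = len;

	while (lo < hi) {
		int mid = (lo + hi) / 2;
		if (pids[mid] < *pos)
			lo = mid + 1;
		else
			hi = mid;
	}
	if (lo >= len)
		return NULL;	/* off the end of the array: done */
	*pos = pids[lo];
	return &pids[lo];
}

/* next(): advance; record the value of the next pid as the new position. */
static const int *pids_next(const int *pids, int len, const int *p, int *pos)
{
	if (++p >= pids + len)
		return NULL;
	*pos = *p;
	return p;
}

int main(void)
{
	int pids[] = { 3, 7, 42, 108, 4096 };	/* sorted, as the patch sorts pidarray */
	int len = sizeof(pids) / sizeof(pids[0]);
	int pos = 0;	/* 0 = start from the beginning */
	const int *p;

	for (p = pids_start(pids, len, &pos); p; p = pids_next(pids, len, p, &pos))
		printf("%d\n", *p);	/* the show() step */
	return 0;
}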