aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/cgroup.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--kernel/cgroup.c265
1 files changed, 159 insertions, 106 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8c6e1c17e6d3..046c1609606b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -241,7 +241,6 @@ static void unlink_css_set(struct css_set *cg)
241 struct cg_cgroup_link *link; 241 struct cg_cgroup_link *link;
242 struct cg_cgroup_link *saved_link; 242 struct cg_cgroup_link *saved_link;
243 243
244 write_lock(&css_set_lock);
245 hlist_del(&cg->hlist); 244 hlist_del(&cg->hlist);
246 css_set_count--; 245 css_set_count--;
247 246
@@ -251,16 +250,25 @@ static void unlink_css_set(struct css_set *cg)
251 list_del(&link->cgrp_link_list); 250 list_del(&link->cgrp_link_list);
252 kfree(link); 251 kfree(link);
253 } 252 }
254
255 write_unlock(&css_set_lock);
256} 253}
257 254
258static void __release_css_set(struct kref *k, int taskexit) 255static void __put_css_set(struct css_set *cg, int taskexit)
259{ 256{
260 int i; 257 int i;
261 struct css_set *cg = container_of(k, struct css_set, ref); 258 /*
262 259 * Ensure that the refcount doesn't hit zero while any readers
260 * can see it. Similar to atomic_dec_and_lock(), but for an
261 * rwlock
262 */
263 if (atomic_add_unless(&cg->refcount, -1, 1))
264 return;
265 write_lock(&css_set_lock);
266 if (!atomic_dec_and_test(&cg->refcount)) {
267 write_unlock(&css_set_lock);
268 return;
269 }
263 unlink_css_set(cg); 270 unlink_css_set(cg);
271 write_unlock(&css_set_lock);
264 272
265 rcu_read_lock(); 273 rcu_read_lock();
266 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 274 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
@@ -276,32 +284,22 @@ static void __release_css_set(struct kref *k, int taskexit)
276 kfree(cg); 284 kfree(cg);
277} 285}
278 286
279static void release_css_set(struct kref *k)
280{
281 __release_css_set(k, 0);
282}
283
284static void release_css_set_taskexit(struct kref *k)
285{
286 __release_css_set(k, 1);
287}
288
289/* 287/*
290 * refcounted get/put for css_set objects 288 * refcounted get/put for css_set objects
291 */ 289 */
292static inline void get_css_set(struct css_set *cg) 290static inline void get_css_set(struct css_set *cg)
293{ 291{
294 kref_get(&cg->ref); 292 atomic_inc(&cg->refcount);
295} 293}
296 294
297static inline void put_css_set(struct css_set *cg) 295static inline void put_css_set(struct css_set *cg)
298{ 296{
299 kref_put(&cg->ref, release_css_set); 297 __put_css_set(cg, 0);
300} 298}
301 299
302static inline void put_css_set_taskexit(struct css_set *cg) 300static inline void put_css_set_taskexit(struct css_set *cg)
303{ 301{
304 kref_put(&cg->ref, release_css_set_taskexit); 302 __put_css_set(cg, 1);
305} 303}
306 304
307/* 305/*
@@ -427,7 +425,7 @@ static struct css_set *find_css_set(
427 return NULL; 425 return NULL;
428 } 426 }
429 427
430 kref_init(&res->ref); 428 atomic_set(&res->refcount, 1);
431 INIT_LIST_HEAD(&res->cg_links); 429 INIT_LIST_HEAD(&res->cg_links);
432 INIT_LIST_HEAD(&res->tasks); 430 INIT_LIST_HEAD(&res->tasks);
433 INIT_HLIST_NODE(&res->hlist); 431 INIT_HLIST_NODE(&res->hlist);
@@ -870,6 +868,14 @@ static struct super_operations cgroup_ops = {
870 .remount_fs = cgroup_remount, 868 .remount_fs = cgroup_remount,
871}; 869};
872 870
871static void init_cgroup_housekeeping(struct cgroup *cgrp)
872{
873 INIT_LIST_HEAD(&cgrp->sibling);
874 INIT_LIST_HEAD(&cgrp->children);
875 INIT_LIST_HEAD(&cgrp->css_sets);
876 INIT_LIST_HEAD(&cgrp->release_list);
877 init_rwsem(&cgrp->pids_mutex);
878}
873static void init_cgroup_root(struct cgroupfs_root *root) 879static void init_cgroup_root(struct cgroupfs_root *root)
874{ 880{
875 struct cgroup *cgrp = &root->top_cgroup; 881 struct cgroup *cgrp = &root->top_cgroup;
@@ -878,10 +884,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
878 root->number_of_cgroups = 1; 884 root->number_of_cgroups = 1;
879 cgrp->root = root; 885 cgrp->root = root;
880 cgrp->top_cgroup = cgrp; 886 cgrp->top_cgroup = cgrp;
881 INIT_LIST_HEAD(&cgrp->sibling); 887 init_cgroup_housekeeping(cgrp);
882 INIT_LIST_HEAD(&cgrp->children);
883 INIT_LIST_HEAD(&cgrp->css_sets);
884 INIT_LIST_HEAD(&cgrp->release_list);
885} 888}
886 889
887static int cgroup_test_super(struct super_block *sb, void *data) 890static int cgroup_test_super(struct super_block *sb, void *data)
@@ -1728,7 +1731,7 @@ int cgroup_task_count(const struct cgroup *cgrp)
1728 1731
1729 read_lock(&css_set_lock); 1732 read_lock(&css_set_lock);
1730 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { 1733 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
1731 count += atomic_read(&link->cg->ref.refcount); 1734 count += atomic_read(&link->cg->refcount);
1732 } 1735 }
1733 read_unlock(&css_set_lock); 1736 read_unlock(&css_set_lock);
1734 return count; 1737 return count;
@@ -1997,16 +2000,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
1997 * but we cannot guarantee that the information we produce is correct 2000 * but we cannot guarantee that the information we produce is correct
1998 * unless we produce it entirely atomically. 2001 * unless we produce it entirely atomically.
1999 * 2002 *
2000 * Upon tasks file open(), a struct ctr_struct is allocated, that
2001 * will have a pointer to an array (also allocated here). The struct
2002 * ctr_struct * is stored in file->private_data. Its resources will
2003 * be freed by release() when the file is closed. The array is used
2004 * to sprintf the PIDs and then used by read().
2005 */ 2003 */
2006struct ctr_struct {
2007 char *buf;
2008 int bufsz;
2009};
2010 2004
2011/* 2005/*
2012 * Load into 'pidarray' up to 'npids' of the tasks using cgroup 2006 * Load into 'pidarray' up to 'npids' of the tasks using cgroup
@@ -2088,42 +2082,132 @@ static int cmppid(const void *a, const void *b)
2088 return *(pid_t *)a - *(pid_t *)b; 2082 return *(pid_t *)a - *(pid_t *)b;
2089} 2083}
2090 2084
2085
2091/* 2086/*
2092 * Convert array 'a' of 'npids' pid_t's to a string of newline separated 2087 * seq_file methods for the "tasks" file. The seq_file position is the
2093 * decimal pids in 'buf'. Don't write more than 'sz' chars, but return 2088 * next pid to display; the seq_file iterator is a pointer to the pid
2094 * count 'cnt' of how many chars would be written if buf were large enough. 2089 * in the cgroup->tasks_pids array.
2095 */ 2090 */
2096static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) 2091
2092static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2097{ 2093{
2098 int cnt = 0; 2094 /*
2099 int i; 2095 * Initially we receive a position value that corresponds to
2096 * one more than the last pid shown (or 0 on the first call or
2097 * after a seek to the start). Use a binary-search to find the
2098 * next pid to display, if any
2099 */
2100 struct cgroup *cgrp = s->private;
2101 int index = 0, pid = *pos;
2102 int *iter;
2100 2103
2101 for (i = 0; i < npids; i++) 2104 down_read(&cgrp->pids_mutex);
2102 cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]); 2105 if (pid) {
2103 return cnt; 2106 int end = cgrp->pids_length;
2107 int i;
2108 while (index < end) {
2109 int mid = (index + end) / 2;
2110 if (cgrp->tasks_pids[mid] == pid) {
2111 index = mid;
2112 break;
2113 } else if (cgrp->tasks_pids[mid] <= pid)
2114 index = mid + 1;
2115 else
2116 end = mid;
2117 }
2118 }
2119 /* If we're off the end of the array, we're done */
2120 if (index >= cgrp->pids_length)
2121 return NULL;
2122 /* Update the abstract position to be the actual pid that we found */
2123 iter = cgrp->tasks_pids + index;
2124 *pos = *iter;
2125 return iter;
2126}
2127
2128static void cgroup_tasks_stop(struct seq_file *s, void *v)
2129{
2130 struct cgroup *cgrp = s->private;
2131 up_read(&cgrp->pids_mutex);
2104} 2132}
2105 2133
2134static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2135{
2136 struct cgroup *cgrp = s->private;
2137 int *p = v;
2138 int *end = cgrp->tasks_pids + cgrp->pids_length;
2139
2140 /*
2141 * Advance to the next pid in the array. If this goes off the
2142 * end, we're done
2143 */
2144 p++;
2145 if (p >= end) {
2146 return NULL;
2147 } else {
2148 *pos = *p;
2149 return p;
2150 }
2151}
2152
2153static int cgroup_tasks_show(struct seq_file *s, void *v)
2154{
2155 return seq_printf(s, "%d\n", *(int *)v);
2156}
2157
2158static struct seq_operations cgroup_tasks_seq_operations = {
2159 .start = cgroup_tasks_start,
2160 .stop = cgroup_tasks_stop,
2161 .next = cgroup_tasks_next,
2162 .show = cgroup_tasks_show,
2163};
2164
2165static void release_cgroup_pid_array(struct cgroup *cgrp)
2166{
2167 down_write(&cgrp->pids_mutex);
2168 BUG_ON(!cgrp->pids_use_count);
2169 if (!--cgrp->pids_use_count) {
2170 kfree(cgrp->tasks_pids);
2171 cgrp->tasks_pids = NULL;
2172 cgrp->pids_length = 0;
2173 }
2174 up_write(&cgrp->pids_mutex);
2175}
2176
2177static int cgroup_tasks_release(struct inode *inode, struct file *file)
2178{
2179 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2180
2181 if (!(file->f_mode & FMODE_READ))
2182 return 0;
2183
2184 release_cgroup_pid_array(cgrp);
2185 return seq_release(inode, file);
2186}
2187
2188static struct file_operations cgroup_tasks_operations = {
2189 .read = seq_read,
2190 .llseek = seq_lseek,
2191 .write = cgroup_file_write,
2192 .release = cgroup_tasks_release,
2193};
2194
2106/* 2195/*
2107 * Handle an open on 'tasks' file. Prepare a buffer listing the 2196 * Handle an open on 'tasks' file. Prepare an array containing the
2108 * process id's of tasks currently attached to the cgroup being opened. 2197 * process id's of tasks currently attached to the cgroup being opened.
2109 *
2110 * Does not require any specific cgroup mutexes, and does not take any.
2111 */ 2198 */
2199
2112static int cgroup_tasks_open(struct inode *unused, struct file *file) 2200static int cgroup_tasks_open(struct inode *unused, struct file *file)
2113{ 2201{
2114 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2202 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2115 struct ctr_struct *ctr;
2116 pid_t *pidarray; 2203 pid_t *pidarray;
2117 int npids; 2204 int npids;
2118 char c; 2205 int retval;
2119 2206
2207 /* Nothing to do for write-only files */
2120 if (!(file->f_mode & FMODE_READ)) 2208 if (!(file->f_mode & FMODE_READ))
2121 return 0; 2209 return 0;
2122 2210
2123 ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
2124 if (!ctr)
2125 goto err0;
2126
2127 /* 2211 /*
2128 * If cgroup gets more users after we read count, we won't have 2212 * If cgroup gets more users after we read count, we won't have
2129 * enough space - tough. This race is indistinguishable to the 2213 * enough space - tough. This race is indistinguishable to the
@@ -2131,57 +2215,31 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
2131 * show up until sometime later on. 2215 * show up until sometime later on.
2132 */ 2216 */
2133 npids = cgroup_task_count(cgrp); 2217 npids = cgroup_task_count(cgrp);
2134 if (npids) { 2218 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
2135 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); 2219 if (!pidarray)
2136 if (!pidarray) 2220 return -ENOMEM;
2137 goto err1; 2221 npids = pid_array_load(pidarray, npids, cgrp);
2138 2222 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
2139 npids = pid_array_load(pidarray, npids, cgrp);
2140 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
2141
2142 /* Call pid_array_to_buf() twice, first just to get bufsz */
2143 ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
2144 ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
2145 if (!ctr->buf)
2146 goto err2;
2147 ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
2148
2149 kfree(pidarray);
2150 } else {
2151 ctr->buf = NULL;
2152 ctr->bufsz = 0;
2153 }
2154 file->private_data = ctr;
2155 return 0;
2156
2157err2:
2158 kfree(pidarray);
2159err1:
2160 kfree(ctr);
2161err0:
2162 return -ENOMEM;
2163}
2164 2223
2165static ssize_t cgroup_tasks_read(struct cgroup *cgrp, 2224 /*
2166 struct cftype *cft, 2225 * Store the array in the cgroup, freeing the old
2167 struct file *file, char __user *buf, 2226 * array if necessary
2168 size_t nbytes, loff_t *ppos) 2227 */
2169{ 2228 down_write(&cgrp->pids_mutex);
2170 struct ctr_struct *ctr = file->private_data; 2229 kfree(cgrp->tasks_pids);
2230 cgrp->tasks_pids = pidarray;
2231 cgrp->pids_length = npids;
2232 cgrp->pids_use_count++;
2233 up_write(&cgrp->pids_mutex);
2171 2234
2172 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); 2235 file->f_op = &cgroup_tasks_operations;
2173}
2174 2236
2175static int cgroup_tasks_release(struct inode *unused_inode, 2237 retval = seq_open(file, &cgroup_tasks_seq_operations);
2176 struct file *file) 2238 if (retval) {
2177{ 2239 release_cgroup_pid_array(cgrp);
2178 struct ctr_struct *ctr; 2240 return retval;
2179
2180 if (file->f_mode & FMODE_READ) {
2181 ctr = file->private_data;
2182 kfree(ctr->buf);
2183 kfree(ctr);
2184 } 2241 }
2242 ((struct seq_file *)file->private_data)->private = cgrp;
2185 return 0; 2243 return 0;
2186} 2244}
2187 2245
@@ -2210,7 +2268,6 @@ static struct cftype files[] = {
2210 { 2268 {
2211 .name = "tasks", 2269 .name = "tasks",
2212 .open = cgroup_tasks_open, 2270 .open = cgroup_tasks_open,
2213 .read = cgroup_tasks_read,
2214 .write_u64 = cgroup_tasks_write, 2271 .write_u64 = cgroup_tasks_write,
2215 .release = cgroup_tasks_release, 2272 .release = cgroup_tasks_release,
2216 .private = FILE_TASKLIST, 2273 .private = FILE_TASKLIST,
@@ -2300,10 +2357,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2300 2357
2301 mutex_lock(&cgroup_mutex); 2358 mutex_lock(&cgroup_mutex);
2302 2359
2303 INIT_LIST_HEAD(&cgrp->sibling); 2360 init_cgroup_housekeeping(cgrp);
2304 INIT_LIST_HEAD(&cgrp->children);
2305 INIT_LIST_HEAD(&cgrp->css_sets);
2306 INIT_LIST_HEAD(&cgrp->release_list);
2307 2361
2308 cgrp->parent = parent; 2362 cgrp->parent = parent;
2309 cgrp->root = parent->root; 2363 cgrp->root = parent->root;
@@ -2495,8 +2549,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2495int __init cgroup_init_early(void) 2549int __init cgroup_init_early(void)
2496{ 2550{
2497 int i; 2551 int i;
2498 kref_init(&init_css_set.ref); 2552 atomic_set(&init_css_set.refcount, 1);
2499 kref_get(&init_css_set.ref);
2500 INIT_LIST_HEAD(&init_css_set.cg_links); 2553 INIT_LIST_HEAD(&init_css_set.cg_links);
2501 INIT_LIST_HEAD(&init_css_set.tasks); 2554 INIT_LIST_HEAD(&init_css_set.tasks);
2502 INIT_HLIST_NODE(&init_css_set.hlist); 2555 INIT_HLIST_NODE(&init_css_set.hlist);