aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/cgroup.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--kernel/cgroup.c165
1 files changed, 124 insertions, 41 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3fb789f6df94..b6eadfe30e7b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -47,6 +47,7 @@
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/namei.h> 48#include <linux/namei.h>
49#include <linux/smp_lock.h> 49#include <linux/smp_lock.h>
50#include <linux/pid_namespace.h>
50 51
51#include <asm/atomic.h> 52#include <asm/atomic.h>
52 53
@@ -734,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
734 * reference to css->refcnt. In general, this refcnt is expected to go down 735 * reference to css->refcnt. In general, this refcnt is expected to go down
735 * to zero, soon. 736 * to zero, soon.
736 * 737 *
737 * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; 738 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
738 */ 739 */
739DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); 740DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
740 741
741static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) 742static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
742{ 743{
743 if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) 744 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
744 wake_up_all(&cgroup_rmdir_waitq); 745 wake_up_all(&cgroup_rmdir_waitq);
745} 746}
746 747
748void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
749{
750 css_get(css);
751}
752
753void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
754{
755 cgroup_wakeup_rmdir_waiter(css->cgroup);
756 css_put(css);
757}
758
759
747static int rebind_subsystems(struct cgroupfs_root *root, 760static int rebind_subsystems(struct cgroupfs_root *root,
748 unsigned long final_bits) 761 unsigned long final_bits)
749{ 762{
@@ -843,6 +856,11 @@ static int parse_cgroupfs_options(char *data,
843 struct cgroup_sb_opts *opts) 856 struct cgroup_sb_opts *opts)
844{ 857{
845 char *token, *o = data ?: "all"; 858 char *token, *o = data ?: "all";
859 unsigned long mask = (unsigned long)-1;
860
861#ifdef CONFIG_CPUSETS
862 mask = ~(1UL << cpuset_subsys_id);
863#endif
846 864
847 opts->subsys_bits = 0; 865 opts->subsys_bits = 0;
848 opts->flags = 0; 866 opts->flags = 0;
@@ -887,6 +905,15 @@ static int parse_cgroupfs_options(char *data,
887 } 905 }
888 } 906 }
889 907
908 /*
909 * Option noprefix was introduced just for backward compatibility
910 * with the old cpuset, so we allow noprefix only if mounting just
911 * the cpuset subsystem.
912 */
913 if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
914 (opts->subsys_bits & mask))
915 return -EINVAL;
916
890 /* We can't have an empty hierarchy */ 917 /* We can't have an empty hierarchy */
891 if (!opts->subsys_bits) 918 if (!opts->subsys_bits)
892 return -EINVAL; 919 return -EINVAL;
@@ -946,6 +973,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
946 INIT_LIST_HEAD(&cgrp->children); 973 INIT_LIST_HEAD(&cgrp->children);
947 INIT_LIST_HEAD(&cgrp->css_sets); 974 INIT_LIST_HEAD(&cgrp->css_sets);
948 INIT_LIST_HEAD(&cgrp->release_list); 975 INIT_LIST_HEAD(&cgrp->release_list);
976 INIT_LIST_HEAD(&cgrp->pids_list);
949 init_rwsem(&cgrp->pids_mutex); 977 init_rwsem(&cgrp->pids_mutex);
950} 978}
951static void init_cgroup_root(struct cgroupfs_root *root) 979static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1343,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1343 * wake up rmdir() waiter. the rmdir should fail since the cgroup 1371 * wake up rmdir() waiter. the rmdir should fail since the cgroup
1344 * is no longer empty. 1372 * is no longer empty.
1345 */ 1373 */
1346 cgroup_wakeup_rmdir_waiters(cgrp); 1374 cgroup_wakeup_rmdir_waiter(cgrp);
1347 return 0; 1375 return 0;
1348} 1376}
1349 1377
@@ -2187,12 +2215,30 @@ err:
2187 return ret; 2215 return ret;
2188} 2216}
2189 2217
2218/*
2219 * Cache pids for all threads in the same pid namespace that are
2220 * opening the same "tasks" file.
2221 */
2222struct cgroup_pids {
2223 /* The node in cgrp->pids_list */
2224 struct list_head list;
2225 /* The cgroup those pids belong to */
2226 struct cgroup *cgrp;
2227 /* The namespace those pids belong to */
2228 struct pid_namespace *ns;
2229 /* Array of process ids in the cgroup */
2230 pid_t *tasks_pids;
2231 /* How many files are using this tasks_pids array */
2232 int use_count;
2233 /* Length of the current tasks_pids array */
2234 int length;
2235};
2236
2190static int cmppid(const void *a, const void *b) 2237static int cmppid(const void *a, const void *b)
2191{ 2238{
2192 return *(pid_t *)a - *(pid_t *)b; 2239 return *(pid_t *)a - *(pid_t *)b;
2193} 2240}
2194 2241
2195
2196/* 2242/*
2197 * seq_file methods for the "tasks" file. The seq_file position is the 2243 * seq_file methods for the "tasks" file. The seq_file position is the
2198 * next pid to display; the seq_file iterator is a pointer to the pid 2244 * next pid to display; the seq_file iterator is a pointer to the pid
@@ -2207,45 +2253,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2207 * after a seek to the start). Use a binary-search to find the 2253 * after a seek to the start). Use a binary-search to find the
2208 * next pid to display, if any 2254 * next pid to display, if any
2209 */ 2255 */
2210 struct cgroup *cgrp = s->private; 2256 struct cgroup_pids *cp = s->private;
2257 struct cgroup *cgrp = cp->cgrp;
2211 int index = 0, pid = *pos; 2258 int index = 0, pid = *pos;
2212 int *iter; 2259 int *iter;
2213 2260
2214 down_read(&cgrp->pids_mutex); 2261 down_read(&cgrp->pids_mutex);
2215 if (pid) { 2262 if (pid) {
2216 int end = cgrp->pids_length; 2263 int end = cp->length;
2217 2264
2218 while (index < end) { 2265 while (index < end) {
2219 int mid = (index + end) / 2; 2266 int mid = (index + end) / 2;
2220 if (cgrp->tasks_pids[mid] == pid) { 2267 if (cp->tasks_pids[mid] == pid) {
2221 index = mid; 2268 index = mid;
2222 break; 2269 break;
2223 } else if (cgrp->tasks_pids[mid] <= pid) 2270 } else if (cp->tasks_pids[mid] <= pid)
2224 index = mid + 1; 2271 index = mid + 1;
2225 else 2272 else
2226 end = mid; 2273 end = mid;
2227 } 2274 }
2228 } 2275 }
2229 /* If we're off the end of the array, we're done */ 2276 /* If we're off the end of the array, we're done */
2230 if (index >= cgrp->pids_length) 2277 if (index >= cp->length)
2231 return NULL; 2278 return NULL;
2232 /* Update the abstract position to be the actual pid that we found */ 2279 /* Update the abstract position to be the actual pid that we found */
2233 iter = cgrp->tasks_pids + index; 2280 iter = cp->tasks_pids + index;
2234 *pos = *iter; 2281 *pos = *iter;
2235 return iter; 2282 return iter;
2236} 2283}
2237 2284
2238static void cgroup_tasks_stop(struct seq_file *s, void *v) 2285static void cgroup_tasks_stop(struct seq_file *s, void *v)
2239{ 2286{
2240 struct cgroup *cgrp = s->private; 2287 struct cgroup_pids *cp = s->private;
2288 struct cgroup *cgrp = cp->cgrp;
2241 up_read(&cgrp->pids_mutex); 2289 up_read(&cgrp->pids_mutex);
2242} 2290}
2243 2291
2244static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) 2292static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2245{ 2293{
2246 struct cgroup *cgrp = s->private; 2294 struct cgroup_pids *cp = s->private;
2247 int *p = v; 2295 int *p = v;
2248 int *end = cgrp->tasks_pids + cgrp->pids_length; 2296 int *end = cp->tasks_pids + cp->length;
2249 2297
2250 /* 2298 /*
2251 * Advance to the next pid in the array. If this goes off the 2299 * Advance to the next pid in the array. If this goes off the
@@ -2272,26 +2320,33 @@ static struct seq_operations cgroup_tasks_seq_operations = {
2272 .show = cgroup_tasks_show, 2320 .show = cgroup_tasks_show,
2273}; 2321};
2274 2322
2275static void release_cgroup_pid_array(struct cgroup *cgrp) 2323static void release_cgroup_pid_array(struct cgroup_pids *cp)
2276{ 2324{
2325 struct cgroup *cgrp = cp->cgrp;
2326
2277 down_write(&cgrp->pids_mutex); 2327 down_write(&cgrp->pids_mutex);
2278 BUG_ON(!cgrp->pids_use_count); 2328 BUG_ON(!cp->use_count);
2279 if (!--cgrp->pids_use_count) { 2329 if (!--cp->use_count) {
2280 kfree(cgrp->tasks_pids); 2330 list_del(&cp->list);
2281 cgrp->tasks_pids = NULL; 2331 put_pid_ns(cp->ns);
2282 cgrp->pids_length = 0; 2332 kfree(cp->tasks_pids);
2333 kfree(cp);
2283 } 2334 }
2284 up_write(&cgrp->pids_mutex); 2335 up_write(&cgrp->pids_mutex);
2285} 2336}
2286 2337
2287static int cgroup_tasks_release(struct inode *inode, struct file *file) 2338static int cgroup_tasks_release(struct inode *inode, struct file *file)
2288{ 2339{
2289 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2340 struct seq_file *seq;
2341 struct cgroup_pids *cp;
2290 2342
2291 if (!(file->f_mode & FMODE_READ)) 2343 if (!(file->f_mode & FMODE_READ))
2292 return 0; 2344 return 0;
2293 2345
2294 release_cgroup_pid_array(cgrp); 2346 seq = file->private_data;
2347 cp = seq->private;
2348
2349 release_cgroup_pid_array(cp);
2295 return seq_release(inode, file); 2350 return seq_release(inode, file);
2296} 2351}
2297 2352
@@ -2310,6 +2365,8 @@ static struct file_operations cgroup_tasks_operations = {
2310static int cgroup_tasks_open(struct inode *unused, struct file *file) 2365static int cgroup_tasks_open(struct inode *unused, struct file *file)
2311{ 2366{
2312 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2367 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2368 struct pid_namespace *ns = current->nsproxy->pid_ns;
2369 struct cgroup_pids *cp;
2313 pid_t *pidarray; 2370 pid_t *pidarray;
2314 int npids; 2371 int npids;
2315 int retval; 2372 int retval;
@@ -2336,20 +2393,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
2336 * array if necessary 2393 * array if necessary
2337 */ 2394 */
2338 down_write(&cgrp->pids_mutex); 2395 down_write(&cgrp->pids_mutex);
2339 kfree(cgrp->tasks_pids); 2396
2340 cgrp->tasks_pids = pidarray; 2397 list_for_each_entry(cp, &cgrp->pids_list, list) {
2341 cgrp->pids_length = npids; 2398 if (ns == cp->ns)
2342 cgrp->pids_use_count++; 2399 goto found;
2400 }
2401
2402 cp = kzalloc(sizeof(*cp), GFP_KERNEL);
2403 if (!cp) {
2404 up_write(&cgrp->pids_mutex);
2405 kfree(pidarray);
2406 return -ENOMEM;
2407 }
2408 cp->cgrp = cgrp;
2409 cp->ns = ns;
2410 get_pid_ns(ns);
2411 list_add(&cp->list, &cgrp->pids_list);
2412found:
2413 kfree(cp->tasks_pids);
2414 cp->tasks_pids = pidarray;
2415 cp->length = npids;
2416 cp->use_count++;
2343 up_write(&cgrp->pids_mutex); 2417 up_write(&cgrp->pids_mutex);
2344 2418
2345 file->f_op = &cgroup_tasks_operations; 2419 file->f_op = &cgroup_tasks_operations;
2346 2420
2347 retval = seq_open(file, &cgroup_tasks_seq_operations); 2421 retval = seq_open(file, &cgroup_tasks_seq_operations);
2348 if (retval) { 2422 if (retval) {
2349 release_cgroup_pid_array(cgrp); 2423 release_cgroup_pid_array(cp);
2350 return retval; 2424 return retval;
2351 } 2425 }
2352 ((struct seq_file *)file->private_data)->private = cgrp; 2426 ((struct seq_file *)file->private_data)->private = cp;
2353 return 0; 2427 return 0;
2354} 2428}
2355 2429
@@ -2682,33 +2756,42 @@ again:
2682 mutex_unlock(&cgroup_mutex); 2756 mutex_unlock(&cgroup_mutex);
2683 2757
2684 /* 2758 /*
2759 * In general, subsystem has no css->refcnt after pre_destroy(). But
2760 * in racy cases, subsystem may have to get css->refcnt after
2761 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
2762 * makes rmdir return -EBUSY too often. To avoid that, we use waitqueue
2763 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
2764 * and subsystem's reference count handling. Please see css_get/put
2765 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
2766 */
2767 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2768
2769 /*
2685 * Call pre_destroy handlers of subsys. Notify subsystems 2770 * Call pre_destroy handlers of subsys. Notify subsystems
2686 * that rmdir() request comes. 2771 * that rmdir() request comes.
2687 */ 2772 */
2688 ret = cgroup_call_pre_destroy(cgrp); 2773 ret = cgroup_call_pre_destroy(cgrp);
2689 if (ret) 2774 if (ret) {
2775 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2690 return ret; 2776 return ret;
2777 }
2691 2778
2692 mutex_lock(&cgroup_mutex); 2779 mutex_lock(&cgroup_mutex);
2693 parent = cgrp->parent; 2780 parent = cgrp->parent;
2694 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { 2781 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
2782 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2695 mutex_unlock(&cgroup_mutex); 2783 mutex_unlock(&cgroup_mutex);
2696 return -EBUSY; 2784 return -EBUSY;
2697 } 2785 }
2698 /*
2699 * css_put/get is provided for subsys to grab refcnt to css. In typical
2700 * case, subsystem has no reference after pre_destroy(). But, under
2701 * hierarchy management, some *temporal* refcnt can be hold.
2702 * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
2703 * is really busy, it should return -EBUSY at pre_destroy(). wake_up
2704 * is called when css_put() is called and refcnt goes down to 0.
2705 */
2706 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2707 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); 2786 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
2708
2709 if (!cgroup_clear_css_refs(cgrp)) { 2787 if (!cgroup_clear_css_refs(cgrp)) {
2710 mutex_unlock(&cgroup_mutex); 2788 mutex_unlock(&cgroup_mutex);
2711 schedule(); 2789 /*
2790 * Because someone may call cgroup_wakeup_rmdir_waiter() before
2791 * prepare_to_wait(), we need to check this flag.
2792 */
2793 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
2794 schedule();
2712 finish_wait(&cgroup_rmdir_waitq, &wait); 2795 finish_wait(&cgroup_rmdir_waitq, &wait);
2713 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 2796 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2714 if (signal_pending(current)) 2797 if (signal_pending(current))
@@ -3280,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css)
3280 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3363 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3281 check_for_release(cgrp); 3364 check_for_release(cgrp);
3282 } 3365 }
3283 cgroup_wakeup_rmdir_waiters(cgrp); 3366 cgroup_wakeup_rmdir_waiter(cgrp);
3284 } 3367 }
3285 rcu_read_unlock(); 3368 rcu_read_unlock();
3286} 3369}