Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--	kernel/cgroup.c | 165
1 file changed, 124 insertions, 41 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3fb789f6df94..b6eadfe30e7b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -47,6 +47,7 @@
 #include <linux/hash.h>
 #include <linux/namei.h>
 #include <linux/smp_lock.h>
+#include <linux/pid_namespace.h>
 
 #include <asm/atomic.h>
 
@@ -734,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
  * reference to css->refcnt. In general, this refcnt is expected to goes down
  * to zero, soon.
  *
- * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
+ * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
  */
 DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
 
-static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
+static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
 {
-        if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
+        if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
                 wake_up_all(&cgroup_rmdir_waitq);
 }
 
+void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
+{
+        css_get(css);
+}
+
+void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
+{
+        cgroup_wakeup_rmdir_waiter(css->cgroup);
+        css_put(css);
+}
+
+
 static int rebind_subsystems(struct cgroupfs_root *root,
                              unsigned long final_bits)
 {
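The pair of helpers added above gives a subsystem a way to pin a css across an operation that can otherwise race with rmdir(). As a rough illustration of the intended caller-side pairing (the callback below is hypothetical and not part of this patch; only cgroup_exclude_rmdir() and cgroup_release_and_wakeup_rmdir() come from the change itself):

/*
 * Illustrative sketch only: a controller brackets work that temporarily
 * pins a css, so a concurrent rmdir() waits on cgroup_rmdir_waitq instead
 * of failing with -EBUSY. The function name is hypothetical.
 */
static int example_subsys_callback(struct cgroup_subsys_state *css)
{
        int ret = 0;

        cgroup_exclude_rmdir(css);              /* css_get(): hold off rmdir completion */

        /* ... work that may hold the css reference for a while ... */

        cgroup_release_and_wakeup_rmdir(css);   /* css_put() + wake any rmdir waiter */
        return ret;
}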
@@ -843,6 +856,11 @@ static int parse_cgroupfs_options(char *data,
                              struct cgroup_sb_opts *opts)
 {
         char *token, *o = data ?: "all";
+        unsigned long mask = (unsigned long)-1;
+
+#ifdef CONFIG_CPUSETS
+        mask = ~(1UL << cpuset_subsys_id);
+#endif
 
         opts->subsys_bits = 0;
         opts->flags = 0;
@@ -887,6 +905,15 @@ static int parse_cgroupfs_options(char *data,
                 }
         }
 
+        /*
+         * Option noprefix was introduced just for backward compatibility
+         * with the old cpuset, so we allow noprefix only if mounting just
+         * the cpuset subsystem.
+         */
+        if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
+            (opts->subsys_bits & mask))
+                return -EINVAL;
+
         /* We can't have an empty hierarchy */
         if (!opts->subsys_bits)
                 return -EINVAL;
@@ -946,6 +973,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
         INIT_LIST_HEAD(&cgrp->children);
         INIT_LIST_HEAD(&cgrp->css_sets);
         INIT_LIST_HEAD(&cgrp->release_list);
+        INIT_LIST_HEAD(&cgrp->pids_list);
         init_rwsem(&cgrp->pids_mutex);
 }
 static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1343,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
         * wake up rmdir() waiter. the rmdir should fail since the cgroup
         * is no longer empty.
         */
-        cgroup_wakeup_rmdir_waiters(cgrp);
+        cgroup_wakeup_rmdir_waiter(cgrp);
         return 0;
 }
 
@@ -2187,12 +2215,30 @@ err:
         return ret;
 }
 
+/*
+ * Cache pids for all threads in the same pid namespace that are
+ * opening the same "tasks" file.
+ */
+struct cgroup_pids {
+        /* The node in cgrp->pids_list */
+        struct list_head list;
+        /* The cgroup those pids belong to */
+        struct cgroup *cgrp;
+        /* The namespace those pids belong to */
+        struct pid_namespace *ns;
+        /* Array of process ids in the cgroup */
+        pid_t *tasks_pids;
+        /* How many files are using this tasks_pids array */
+        int use_count;
+        /* Length of the current tasks_pids array */
+        int length;
+};
+
 static int cmppid(const void *a, const void *b)
 {
         return *(pid_t *)a - *(pid_t *)b;
 }
 
-
 /*
  * seq_file methods for the "tasks" file. The seq_file position is the
  * next pid to display; the seq_file iterator is a pointer to the pid
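Later hunks in cgroup_tasks_open() open-code the lookup of this per-namespace cache; as a sketch, the search over cgrp->pids_list amounts to something like the following hypothetical helper (not part of the patch; the caller is assumed to hold cgrp->pids_mutex):

/*
 * Illustrative only: find the cached pid array for a given pid namespace.
 * Caller must hold cgrp->pids_mutex.
 */
static struct cgroup_pids *cgroup_pids_find(struct cgroup *cgrp,
                                            struct pid_namespace *ns)
{
        struct cgroup_pids *cp;

        list_for_each_entry(cp, &cgrp->pids_list, list)
                if (cp->ns == ns)
                        return cp;
        return NULL;
}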
@@ -2207,45 +2253,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
         * after a seek to the start). Use a binary-search to find the
         * next pid to display, if any
         */
-        struct cgroup *cgrp = s->private;
+        struct cgroup_pids *cp = s->private;
+        struct cgroup *cgrp = cp->cgrp;
         int index = 0, pid = *pos;
         int *iter;
 
         down_read(&cgrp->pids_mutex);
         if (pid) {
-                int end = cgrp->pids_length;
+                int end = cp->length;
 
                 while (index < end) {
                         int mid = (index + end) / 2;
-                        if (cgrp->tasks_pids[mid] == pid) {
+                        if (cp->tasks_pids[mid] == pid) {
                                 index = mid;
                                 break;
-                        } else if (cgrp->tasks_pids[mid] <= pid)
+                        } else if (cp->tasks_pids[mid] <= pid)
                                 index = mid + 1;
                         else
                                 end = mid;
                 }
         }
         /* If we're off the end of the array, we're done */
-        if (index >= cgrp->pids_length)
+        if (index >= cp->length)
                 return NULL;
         /* Update the abstract position to be the actual pid that we found */
-        iter = cgrp->tasks_pids + index;
+        iter = cp->tasks_pids + index;
         *pos = *iter;
         return iter;
 }
 
 static void cgroup_tasks_stop(struct seq_file *s, void *v)
 {
-        struct cgroup *cgrp = s->private;
+        struct cgroup_pids *cp = s->private;
+        struct cgroup *cgrp = cp->cgrp;
         up_read(&cgrp->pids_mutex);
 }
 
 static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
 {
-        struct cgroup *cgrp = s->private;
+        struct cgroup_pids *cp = s->private;
         int *p = v;
-        int *end = cgrp->tasks_pids + cgrp->pids_length;
+        int *end = cp->tasks_pids + cp->length;
 
         /*
          * Advance to the next pid in the array. If this goes off the
@@ -2272,26 +2320,33 @@ static struct seq_operations cgroup_tasks_seq_operations = {
         .show = cgroup_tasks_show,
 };
 
-static void release_cgroup_pid_array(struct cgroup *cgrp)
+static void release_cgroup_pid_array(struct cgroup_pids *cp)
 {
+        struct cgroup *cgrp = cp->cgrp;
+
         down_write(&cgrp->pids_mutex);
-        BUG_ON(!cgrp->pids_use_count);
-        if (!--cgrp->pids_use_count) {
-                kfree(cgrp->tasks_pids);
-                cgrp->tasks_pids = NULL;
-                cgrp->pids_length = 0;
+        BUG_ON(!cp->use_count);
+        if (!--cp->use_count) {
+                list_del(&cp->list);
+                put_pid_ns(cp->ns);
+                kfree(cp->tasks_pids);
+                kfree(cp);
         }
         up_write(&cgrp->pids_mutex);
 }
 
 static int cgroup_tasks_release(struct inode *inode, struct file *file)
 {
-        struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+        struct seq_file *seq;
+        struct cgroup_pids *cp;
 
         if (!(file->f_mode & FMODE_READ))
                 return 0;
 
-        release_cgroup_pid_array(cgrp);
+        seq = file->private_data;
+        cp = seq->private;
+
+        release_cgroup_pid_array(cp);
         return seq_release(inode, file);
 }
 
@@ -2310,6 +2365,8 @@ static struct file_operations cgroup_tasks_operations = {
 static int cgroup_tasks_open(struct inode *unused, struct file *file)
 {
         struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+        struct pid_namespace *ns = current->nsproxy->pid_ns;
+        struct cgroup_pids *cp;
         pid_t *pidarray;
         int npids;
         int retval;
@@ -2336,20 +2393,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
         * array if necessary
         */
        down_write(&cgrp->pids_mutex);
-        kfree(cgrp->tasks_pids);
-        cgrp->tasks_pids = pidarray;
-        cgrp->pids_length = npids;
-        cgrp->pids_use_count++;
+
+        list_for_each_entry(cp, &cgrp->pids_list, list) {
+                if (ns == cp->ns)
+                        goto found;
+        }
+
+        cp = kzalloc(sizeof(*cp), GFP_KERNEL);
+        if (!cp) {
+                up_write(&cgrp->pids_mutex);
+                kfree(pidarray);
+                return -ENOMEM;
+        }
+        cp->cgrp = cgrp;
+        cp->ns = ns;
+        get_pid_ns(ns);
+        list_add(&cp->list, &cgrp->pids_list);
+found:
+        kfree(cp->tasks_pids);
+        cp->tasks_pids = pidarray;
+        cp->length = npids;
+        cp->use_count++;
         up_write(&cgrp->pids_mutex);
 
         file->f_op = &cgroup_tasks_operations;
 
         retval = seq_open(file, &cgroup_tasks_seq_operations);
         if (retval) {
-                release_cgroup_pid_array(cgrp);
+                release_cgroup_pid_array(cp);
                 return retval;
         }
-        ((struct seq_file *)file->private_data)->private = cgrp;
+        ((struct seq_file *)file->private_data)->private = cp;
         return 0;
 }
 
@@ -2682,33 +2756,42 @@ again:
         mutex_unlock(&cgroup_mutex);
 
         /*
+         * In general, subsystem has no css->refcnt after pre_destroy(). But
+         * in racy cases, subsystem may have to get css->refcnt after
+         * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
+         * makes rmdir return -EBUSY too often. To avoid that, we use waitqueue
+         * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
+         * and subsystem's reference count handling. Please see css_get/put
+         * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
+         */
+        set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+
+        /*
         * Call pre_destroy handlers of subsys. Notify subsystems
         * that rmdir() request comes.
         */
        ret = cgroup_call_pre_destroy(cgrp);
-        if (ret)
+        if (ret) {
+                clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
                 return ret;
+        }
 
         mutex_lock(&cgroup_mutex);
         parent = cgrp->parent;
         if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
+                clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
                 mutex_unlock(&cgroup_mutex);
                 return -EBUSY;
         }
-        /*
-         * css_put/get is provided for subsys to grab refcnt to css. In typical
-         * case, subsystem has no reference after pre_destroy(). But, under
-         * hierarchy management, some *temporal* refcnt can be hold.
-         * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
-         * is really busy, it should return -EBUSY at pre_destroy(). wake_up
-         * is called when css_put() is called and refcnt goes down to 0.
-         */
-        set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
         prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
-
         if (!cgroup_clear_css_refs(cgrp)) {
                 mutex_unlock(&cgroup_mutex);
-                schedule();
+                /*
+                 * Because someone may call cgroup_wakeup_rmdir_waiter() before
+                 * prepare_to_wait(), we need to check this flag.
+                 */
+                if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
+                        schedule();
                 finish_wait(&cgroup_rmdir_waitq, &wait);
                 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
                 if (signal_pending(current))
@@ -3280,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css)
                         set_bit(CGRP_RELEASABLE, &cgrp->flags);
                         check_for_release(cgrp);
                 }
-                cgroup_wakeup_rmdir_waiters(cgrp);
+                cgroup_wakeup_rmdir_waiter(cgrp);
         }
         rcu_read_unlock();
 }