diff options
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r-- | kernel/cgroup.c | 151 |
1 files changed, 110 insertions, 41 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3737a682cdf5..b6eadfe30e7b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/hash.h> | 47 | #include <linux/hash.h> |
48 | #include <linux/namei.h> | 48 | #include <linux/namei.h> |
49 | #include <linux/smp_lock.h> | 49 | #include <linux/smp_lock.h> |
50 | #include <linux/pid_namespace.h> | ||
50 | 51 | ||
51 | #include <asm/atomic.h> | 52 | #include <asm/atomic.h> |
52 | 53 | ||
@@ -734,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry) | |||
734 | * reference to css->refcnt. In general, this refcnt is expected to go down | 735 | * reference to css->refcnt. In general, this refcnt is expected to go down |
735 | * to zero, soon. | 736 | * to zero, soon. |
736 | * | 737 | * |
737 | * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; | 738 | * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; |
738 | */ | 739 | */ |
739 | DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); | 740 | DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); |
740 | 741 | ||
741 | static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) | 742 | static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) |
742 | { | 743 | { |
743 | if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) | 744 | if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) |
744 | wake_up_all(&cgroup_rmdir_waitq); | 745 | wake_up_all(&cgroup_rmdir_waitq); |
745 | } | 746 | } |
746 | 747 | ||
748 | void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) | ||
749 | { | ||
750 | css_get(css); | ||
751 | } | ||
752 | |||
753 | void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) | ||
754 | { | ||
755 | cgroup_wakeup_rmdir_waiter(css->cgroup); | ||
756 | css_put(css); | ||
757 | } | ||
758 | |||
759 | |||
747 | static int rebind_subsystems(struct cgroupfs_root *root, | 760 | static int rebind_subsystems(struct cgroupfs_root *root, |
748 | unsigned long final_bits) | 761 | unsigned long final_bits) |
749 | { | 762 | { |
@@ -960,6 +973,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
960 | INIT_LIST_HEAD(&cgrp->children); | 973 | INIT_LIST_HEAD(&cgrp->children); |
961 | INIT_LIST_HEAD(&cgrp->css_sets); | 974 | INIT_LIST_HEAD(&cgrp->css_sets); |
962 | INIT_LIST_HEAD(&cgrp->release_list); | 975 | INIT_LIST_HEAD(&cgrp->release_list); |
976 | INIT_LIST_HEAD(&cgrp->pids_list); | ||
963 | init_rwsem(&cgrp->pids_mutex); | 977 | init_rwsem(&cgrp->pids_mutex); |
964 | } | 978 | } |
965 | static void init_cgroup_root(struct cgroupfs_root *root) | 979 | static void init_cgroup_root(struct cgroupfs_root *root) |
@@ -1357,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1357 | * wake up rmdir() waiter. the rmdir should fail since the cgroup | 1371 | * wake up rmdir() waiter. the rmdir should fail since the cgroup |
1358 | * is no longer empty. | 1372 | * is no longer empty. |
1359 | */ | 1373 | */ |
1360 | cgroup_wakeup_rmdir_waiters(cgrp); | 1374 | cgroup_wakeup_rmdir_waiter(cgrp); |
1361 | return 0; | 1375 | return 0; |
1362 | } | 1376 | } |
1363 | 1377 | ||
@@ -2201,12 +2215,30 @@ err: | |||
2201 | return ret; | 2215 | return ret; |
2202 | } | 2216 | } |
2203 | 2217 | ||
2218 | /* | ||
2219 | * Cache pids for all threads in the same pid namespace that are | ||
2220 | * opening the same "tasks" file. | ||
2221 | */ | ||
2222 | struct cgroup_pids { | ||
2223 | /* The node in cgrp->pids_list */ | ||
2224 | struct list_head list; | ||
2225 | /* The cgroup those pids belong to */ | ||
2226 | struct cgroup *cgrp; | ||
2227 | /* The namespace those pids belong to */ | |
2228 | struct pid_namespace *ns; | ||
2229 | /* Array of process ids in the cgroup */ | ||
2230 | pid_t *tasks_pids; | ||
2231 | /* How many files are using this tasks_pids array */ | |
2232 | int use_count; | ||
2233 | /* Length of the current tasks_pids array */ | ||
2234 | int length; | ||
2235 | }; | ||
2236 | |||
2204 | static int cmppid(const void *a, const void *b) | 2237 | static int cmppid(const void *a, const void *b) |
2205 | { | 2238 | { |
2206 | return *(pid_t *)a - *(pid_t *)b; | 2239 | return *(pid_t *)a - *(pid_t *)b; |
2207 | } | 2240 | } |
2208 | 2241 | ||
2209 | |||
2210 | /* | 2242 | /* |
2211 | * seq_file methods for the "tasks" file. The seq_file position is the | 2243 | * seq_file methods for the "tasks" file. The seq_file position is the |
2212 | * next pid to display; the seq_file iterator is a pointer to the pid | 2244 | * next pid to display; the seq_file iterator is a pointer to the pid |
@@ -2221,45 +2253,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) | |||
2221 | * after a seek to the start). Use a binary-search to find the | 2253 | * after a seek to the start). Use a binary-search to find the |
2222 | * next pid to display, if any | 2254 | * next pid to display, if any |
2223 | */ | 2255 | */ |
2224 | struct cgroup *cgrp = s->private; | 2256 | struct cgroup_pids *cp = s->private; |
2257 | struct cgroup *cgrp = cp->cgrp; | ||
2225 | int index = 0, pid = *pos; | 2258 | int index = 0, pid = *pos; |
2226 | int *iter; | 2259 | int *iter; |
2227 | 2260 | ||
2228 | down_read(&cgrp->pids_mutex); | 2261 | down_read(&cgrp->pids_mutex); |
2229 | if (pid) { | 2262 | if (pid) { |
2230 | int end = cgrp->pids_length; | 2263 | int end = cp->length; |
2231 | 2264 | ||
2232 | while (index < end) { | 2265 | while (index < end) { |
2233 | int mid = (index + end) / 2; | 2266 | int mid = (index + end) / 2; |
2234 | if (cgrp->tasks_pids[mid] == pid) { | 2267 | if (cp->tasks_pids[mid] == pid) { |
2235 | index = mid; | 2268 | index = mid; |
2236 | break; | 2269 | break; |
2237 | } else if (cgrp->tasks_pids[mid] <= pid) | 2270 | } else if (cp->tasks_pids[mid] <= pid) |
2238 | index = mid + 1; | 2271 | index = mid + 1; |
2239 | else | 2272 | else |
2240 | end = mid; | 2273 | end = mid; |
2241 | } | 2274 | } |
2242 | } | 2275 | } |
2243 | /* If we're off the end of the array, we're done */ | 2276 | /* If we're off the end of the array, we're done */ |
2244 | if (index >= cgrp->pids_length) | 2277 | if (index >= cp->length) |
2245 | return NULL; | 2278 | return NULL; |
2246 | /* Update the abstract position to be the actual pid that we found */ | 2279 | /* Update the abstract position to be the actual pid that we found */ |
2247 | iter = cgrp->tasks_pids + index; | 2280 | iter = cp->tasks_pids + index; |
2248 | *pos = *iter; | 2281 | *pos = *iter; |
2249 | return iter; | 2282 | return iter; |
2250 | } | 2283 | } |
2251 | 2284 | ||
2252 | static void cgroup_tasks_stop(struct seq_file *s, void *v) | 2285 | static void cgroup_tasks_stop(struct seq_file *s, void *v) |
2253 | { | 2286 | { |
2254 | struct cgroup *cgrp = s->private; | 2287 | struct cgroup_pids *cp = s->private; |
2288 | struct cgroup *cgrp = cp->cgrp; | ||
2255 | up_read(&cgrp->pids_mutex); | 2289 | up_read(&cgrp->pids_mutex); |
2256 | } | 2290 | } |
2257 | 2291 | ||
2258 | static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) | 2292 | static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) |
2259 | { | 2293 | { |
2260 | struct cgroup *cgrp = s->private; | 2294 | struct cgroup_pids *cp = s->private; |
2261 | int *p = v; | 2295 | int *p = v; |
2262 | int *end = cgrp->tasks_pids + cgrp->pids_length; | 2296 | int *end = cp->tasks_pids + cp->length; |
2263 | 2297 | ||
2264 | /* | 2298 | /* |
2265 | * Advance to the next pid in the array. If this goes off the | 2299 | * Advance to the next pid in the array. If this goes off the |
@@ -2286,26 +2320,33 @@ static struct seq_operations cgroup_tasks_seq_operations = { | |||
2286 | .show = cgroup_tasks_show, | 2320 | .show = cgroup_tasks_show, |
2287 | }; | 2321 | }; |
2288 | 2322 | ||
2289 | static void release_cgroup_pid_array(struct cgroup *cgrp) | 2323 | static void release_cgroup_pid_array(struct cgroup_pids *cp) |
2290 | { | 2324 | { |
2325 | struct cgroup *cgrp = cp->cgrp; | ||
2326 | |||
2291 | down_write(&cgrp->pids_mutex); | 2327 | down_write(&cgrp->pids_mutex); |
2292 | BUG_ON(!cgrp->pids_use_count); | 2328 | BUG_ON(!cp->use_count); |
2293 | if (!--cgrp->pids_use_count) { | 2329 | if (!--cp->use_count) { |
2294 | kfree(cgrp->tasks_pids); | 2330 | list_del(&cp->list); |
2295 | cgrp->tasks_pids = NULL; | 2331 | put_pid_ns(cp->ns); |
2296 | cgrp->pids_length = 0; | 2332 | kfree(cp->tasks_pids); |
2333 | kfree(cp); | ||
2297 | } | 2334 | } |
2298 | up_write(&cgrp->pids_mutex); | 2335 | up_write(&cgrp->pids_mutex); |
2299 | } | 2336 | } |
2300 | 2337 | ||
2301 | static int cgroup_tasks_release(struct inode *inode, struct file *file) | 2338 | static int cgroup_tasks_release(struct inode *inode, struct file *file) |
2302 | { | 2339 | { |
2303 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 2340 | struct seq_file *seq; |
2341 | struct cgroup_pids *cp; | ||
2304 | 2342 | ||
2305 | if (!(file->f_mode & FMODE_READ)) | 2343 | if (!(file->f_mode & FMODE_READ)) |
2306 | return 0; | 2344 | return 0; |
2307 | 2345 | ||
2308 | release_cgroup_pid_array(cgrp); | 2346 | seq = file->private_data; |
2347 | cp = seq->private; | ||
2348 | |||
2349 | release_cgroup_pid_array(cp); | ||
2309 | return seq_release(inode, file); | 2350 | return seq_release(inode, file); |
2310 | } | 2351 | } |
2311 | 2352 | ||
@@ -2324,6 +2365,8 @@ static struct file_operations cgroup_tasks_operations = { | |||
2324 | static int cgroup_tasks_open(struct inode *unused, struct file *file) | 2365 | static int cgroup_tasks_open(struct inode *unused, struct file *file) |
2325 | { | 2366 | { |
2326 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 2367 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); |
2368 | struct pid_namespace *ns = current->nsproxy->pid_ns; | ||
2369 | struct cgroup_pids *cp; | ||
2327 | pid_t *pidarray; | 2370 | pid_t *pidarray; |
2328 | int npids; | 2371 | int npids; |
2329 | int retval; | 2372 | int retval; |
@@ -2350,20 +2393,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file) | |||
2350 | * array if necessary | 2393 | * array if necessary |
2351 | */ | 2394 | */ |
2352 | down_write(&cgrp->pids_mutex); | 2395 | down_write(&cgrp->pids_mutex); |
2353 | kfree(cgrp->tasks_pids); | 2396 | |
2354 | cgrp->tasks_pids = pidarray; | 2397 | list_for_each_entry(cp, &cgrp->pids_list, list) { |
2355 | cgrp->pids_length = npids; | 2398 | if (ns == cp->ns) |
2356 | cgrp->pids_use_count++; | 2399 | goto found; |
2400 | } | ||
2401 | |||
2402 | cp = kzalloc(sizeof(*cp), GFP_KERNEL); | ||
2403 | if (!cp) { | ||
2404 | up_write(&cgrp->pids_mutex); | ||
2405 | kfree(pidarray); | ||
2406 | return -ENOMEM; | ||
2407 | } | ||
2408 | cp->cgrp = cgrp; | ||
2409 | cp->ns = ns; | ||
2410 | get_pid_ns(ns); | ||
2411 | list_add(&cp->list, &cgrp->pids_list); | ||
2412 | found: | ||
2413 | kfree(cp->tasks_pids); | ||
2414 | cp->tasks_pids = pidarray; | ||
2415 | cp->length = npids; | ||
2416 | cp->use_count++; | ||
2357 | up_write(&cgrp->pids_mutex); | 2417 | up_write(&cgrp->pids_mutex); |
2358 | 2418 | ||
2359 | file->f_op = &cgroup_tasks_operations; | 2419 | file->f_op = &cgroup_tasks_operations; |
2360 | 2420 | ||
2361 | retval = seq_open(file, &cgroup_tasks_seq_operations); | 2421 | retval = seq_open(file, &cgroup_tasks_seq_operations); |
2362 | if (retval) { | 2422 | if (retval) { |
2363 | release_cgroup_pid_array(cgrp); | 2423 | release_cgroup_pid_array(cp); |
2364 | return retval; | 2424 | return retval; |
2365 | } | 2425 | } |
2366 | ((struct seq_file *)file->private_data)->private = cgrp; | 2426 | ((struct seq_file *)file->private_data)->private = cp; |
2367 | return 0; | 2427 | return 0; |
2368 | } | 2428 | } |
2369 | 2429 | ||
@@ -2696,33 +2756,42 @@ again: | |||
2696 | mutex_unlock(&cgroup_mutex); | 2756 | mutex_unlock(&cgroup_mutex); |
2697 | 2757 | ||
2698 | /* | 2758 | /* |
2759 | * In general, subsystem has no css->refcnt after pre_destroy(). But | ||
2760 | * in racy cases, subsystem may have to get css->refcnt after | ||
2761 | * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes | ||
2762 | * makes rmdir return -EBUSY too often. To avoid that, we use waitqueue | |
2763 | * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir | ||
2764 | * and subsystem's reference count handling. Please see css_get/put | ||
2765 | * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. | ||
2766 | */ | ||
2767 | set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
2768 | |||
2769 | /* | ||
2699 | * Call pre_destroy handlers of subsys. Notify subsystems | 2770 | * Call pre_destroy handlers of subsys. Notify subsystems |
2700 | * that rmdir() request comes. | 2771 | * that rmdir() request comes. |
2701 | */ | 2772 | */ |
2702 | ret = cgroup_call_pre_destroy(cgrp); | 2773 | ret = cgroup_call_pre_destroy(cgrp); |
2703 | if (ret) | 2774 | if (ret) { |
2775 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
2704 | return ret; | 2776 | return ret; |
2777 | } | ||
2705 | 2778 | ||
2706 | mutex_lock(&cgroup_mutex); | 2779 | mutex_lock(&cgroup_mutex); |
2707 | parent = cgrp->parent; | 2780 | parent = cgrp->parent; |
2708 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { | 2781 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { |
2782 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
2709 | mutex_unlock(&cgroup_mutex); | 2783 | mutex_unlock(&cgroup_mutex); |
2710 | return -EBUSY; | 2784 | return -EBUSY; |
2711 | } | 2785 | } |
2712 | /* | ||
2713 | * css_put/get is provided for subsys to grab refcnt to css. In typical | ||
2714 | * case, subsystem has no reference after pre_destroy(). But, under | ||
2715 | * hierarchy management, some *temporal* refcnt can be hold. | ||
2716 | * To avoid returning -EBUSY to a user, waitqueue is used. If subsys | ||
2717 | * is really busy, it should return -EBUSY at pre_destroy(). wake_up | ||
2718 | * is called when css_put() is called and refcnt goes down to 0. | ||
2719 | */ | ||
2720 | set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
2721 | prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); | 2786 | prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); |
2722 | |||
2723 | if (!cgroup_clear_css_refs(cgrp)) { | 2787 | if (!cgroup_clear_css_refs(cgrp)) { |
2724 | mutex_unlock(&cgroup_mutex); | 2788 | mutex_unlock(&cgroup_mutex); |
2725 | schedule(); | 2789 | /* |
2790 | * Because someone may call cgroup_wakeup_rmdir_waiter() before | ||
2791 | * prepare_to_wait(), we need to check this flag. | ||
2792 | */ | ||
2793 | if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) | ||
2794 | schedule(); | ||
2726 | finish_wait(&cgroup_rmdir_waitq, &wait); | 2795 | finish_wait(&cgroup_rmdir_waitq, &wait); |
2727 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | 2796 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); |
2728 | if (signal_pending(current)) | 2797 | if (signal_pending(current)) |
@@ -3294,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css) | |||
3294 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 3363 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
3295 | check_for_release(cgrp); | 3364 | check_for_release(cgrp); |
3296 | } | 3365 | } |
3297 | cgroup_wakeup_rmdir_waiters(cgrp); | 3366 | cgroup_wakeup_rmdir_waiter(cgrp); |
3298 | } | 3367 | } |
3299 | rcu_read_unlock(); | 3368 | rcu_read_unlock(); |
3300 | } | 3369 | } |