diff options
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r-- | kernel/cgroup.c | 152 |
1 files changed, 111 insertions, 41 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3737a682cdf5..c7ece8f027f2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/hash.h> | 47 | #include <linux/hash.h> |
48 | #include <linux/namei.h> | 48 | #include <linux/namei.h> |
49 | #include <linux/smp_lock.h> | 49 | #include <linux/smp_lock.h> |
50 | #include <linux/pid_namespace.h> | ||
50 | 51 | ||
51 | #include <asm/atomic.h> | 52 | #include <asm/atomic.h> |
52 | 53 | ||
@@ -599,6 +600,7 @@ static struct inode_operations cgroup_dir_inode_operations; | |||
599 | static struct file_operations proc_cgroupstats_operations; | 600 | static struct file_operations proc_cgroupstats_operations; |
600 | 601 | ||
601 | static struct backing_dev_info cgroup_backing_dev_info = { | 602 | static struct backing_dev_info cgroup_backing_dev_info = { |
603 | .name = "cgroup", | ||
602 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, | 604 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, |
603 | }; | 605 | }; |
604 | 606 | ||
@@ -734,16 +736,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry) | |||
734 | * reference to css->refcnt. In general, this refcnt is expected to go down | 736 | * reference to css->refcnt. In general, this refcnt is expected to go down |
735 | * to zero, soon. | 737 | * to zero, soon. |
736 | * | 738 | * |
737 | * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; | 739 | * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; |
738 | */ | 740 | */ |
739 | DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); | 741 | DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); |
740 | 742 | ||
741 | static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) | 743 | static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) |
742 | { | 744 | { |
743 | if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) | 745 | if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) |
744 | wake_up_all(&cgroup_rmdir_waitq); | 746 | wake_up_all(&cgroup_rmdir_waitq); |
745 | } | 747 | } |
746 | 748 | ||
749 | void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) | ||
750 | { | ||
751 | css_get(css); | ||
752 | } | ||
753 | |||
754 | void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) | ||
755 | { | ||
756 | cgroup_wakeup_rmdir_waiter(css->cgroup); | ||
757 | css_put(css); | ||
758 | } | ||
759 | |||
760 | |||
747 | static int rebind_subsystems(struct cgroupfs_root *root, | 761 | static int rebind_subsystems(struct cgroupfs_root *root, |
748 | unsigned long final_bits) | 762 | unsigned long final_bits) |
749 | { | 763 | { |
@@ -960,6 +974,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
960 | INIT_LIST_HEAD(&cgrp->children); | 974 | INIT_LIST_HEAD(&cgrp->children); |
961 | INIT_LIST_HEAD(&cgrp->css_sets); | 975 | INIT_LIST_HEAD(&cgrp->css_sets); |
962 | INIT_LIST_HEAD(&cgrp->release_list); | 976 | INIT_LIST_HEAD(&cgrp->release_list); |
977 | INIT_LIST_HEAD(&cgrp->pids_list); | ||
963 | init_rwsem(&cgrp->pids_mutex); | 978 | init_rwsem(&cgrp->pids_mutex); |
964 | } | 979 | } |
965 | static void init_cgroup_root(struct cgroupfs_root *root) | 980 | static void init_cgroup_root(struct cgroupfs_root *root) |
@@ -1357,7 +1372,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1357 | * wake up rmdir() waiter. the rmdir should fail since the cgroup | 1372 | * wake up rmdir() waiter. the rmdir should fail since the cgroup |
1358 | * is no longer empty. | 1373 | * is no longer empty. |
1359 | */ | 1374 | */ |
1360 | cgroup_wakeup_rmdir_waiters(cgrp); | 1375 | cgroup_wakeup_rmdir_waiter(cgrp); |
1361 | return 0; | 1376 | return 0; |
1362 | } | 1377 | } |
1363 | 1378 | ||
@@ -2201,12 +2216,30 @@ err: | |||
2201 | return ret; | 2216 | return ret; |
2202 | } | 2217 | } |
2203 | 2218 | ||
2219 | /* | ||
2220 | * Cache pids for all threads in the same pid namespace that are | ||
2221 | * opening the same "tasks" file. | ||
2222 | */ | ||
2223 | struct cgroup_pids { | ||
2224 | /* The node in cgrp->pids_list */ | ||
2225 | struct list_head list; | ||
2226 | /* The cgroup those pids belong to */ | ||
2227 | struct cgroup *cgrp; | ||
2228 | /* The namespace those pids belong to */ ||
2229 | struct pid_namespace *ns; | ||
2230 | /* Array of process ids in the cgroup */ | ||
2231 | pid_t *tasks_pids; | ||
2232 | /* How many files are using this tasks_pids array */ ||
2233 | int use_count; | ||
2234 | /* Length of the current tasks_pids array */ | ||
2235 | int length; | ||
2236 | }; | ||
2237 | |||
2204 | static int cmppid(const void *a, const void *b) | 2238 | static int cmppid(const void *a, const void *b) |
2205 | { | 2239 | { |
2206 | return *(pid_t *)a - *(pid_t *)b; | 2240 | return *(pid_t *)a - *(pid_t *)b; |
2207 | } | 2241 | } |
2208 | 2242 | ||
2209 | |||
2210 | /* | 2243 | /* |
2211 | * seq_file methods for the "tasks" file. The seq_file position is the | 2244 | * seq_file methods for the "tasks" file. The seq_file position is the |
2212 | * next pid to display; the seq_file iterator is a pointer to the pid | 2245 | * next pid to display; the seq_file iterator is a pointer to the pid |
@@ -2221,45 +2254,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) | |||
2221 | * after a seek to the start). Use a binary-search to find the | 2254 | * after a seek to the start). Use a binary-search to find the |
2222 | * next pid to display, if any | 2255 | * next pid to display, if any |
2223 | */ | 2256 | */ |
2224 | struct cgroup *cgrp = s->private; | 2257 | struct cgroup_pids *cp = s->private; |
2258 | struct cgroup *cgrp = cp->cgrp; | ||
2225 | int index = 0, pid = *pos; | 2259 | int index = 0, pid = *pos; |
2226 | int *iter; | 2260 | int *iter; |
2227 | 2261 | ||
2228 | down_read(&cgrp->pids_mutex); | 2262 | down_read(&cgrp->pids_mutex); |
2229 | if (pid) { | 2263 | if (pid) { |
2230 | int end = cgrp->pids_length; | 2264 | int end = cp->length; |
2231 | 2265 | ||
2232 | while (index < end) { | 2266 | while (index < end) { |
2233 | int mid = (index + end) / 2; | 2267 | int mid = (index + end) / 2; |
2234 | if (cgrp->tasks_pids[mid] == pid) { | 2268 | if (cp->tasks_pids[mid] == pid) { |
2235 | index = mid; | 2269 | index = mid; |
2236 | break; | 2270 | break; |
2237 | } else if (cgrp->tasks_pids[mid] <= pid) | 2271 | } else if (cp->tasks_pids[mid] <= pid) |
2238 | index = mid + 1; | 2272 | index = mid + 1; |
2239 | else | 2273 | else |
2240 | end = mid; | 2274 | end = mid; |
2241 | } | 2275 | } |
2242 | } | 2276 | } |
2243 | /* If we're off the end of the array, we're done */ | 2277 | /* If we're off the end of the array, we're done */ |
2244 | if (index >= cgrp->pids_length) | 2278 | if (index >= cp->length) |
2245 | return NULL; | 2279 | return NULL; |
2246 | /* Update the abstract position to be the actual pid that we found */ | 2280 | /* Update the abstract position to be the actual pid that we found */ |
2247 | iter = cgrp->tasks_pids + index; | 2281 | iter = cp->tasks_pids + index; |
2248 | *pos = *iter; | 2282 | *pos = *iter; |
2249 | return iter; | 2283 | return iter; |
2250 | } | 2284 | } |
2251 | 2285 | ||
2252 | static void cgroup_tasks_stop(struct seq_file *s, void *v) | 2286 | static void cgroup_tasks_stop(struct seq_file *s, void *v) |
2253 | { | 2287 | { |
2254 | struct cgroup *cgrp = s->private; | 2288 | struct cgroup_pids *cp = s->private; |
2289 | struct cgroup *cgrp = cp->cgrp; | ||
2255 | up_read(&cgrp->pids_mutex); | 2290 | up_read(&cgrp->pids_mutex); |
2256 | } | 2291 | } |
2257 | 2292 | ||
2258 | static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) | 2293 | static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) |
2259 | { | 2294 | { |
2260 | struct cgroup *cgrp = s->private; | 2295 | struct cgroup_pids *cp = s->private; |
2261 | int *p = v; | 2296 | int *p = v; |
2262 | int *end = cgrp->tasks_pids + cgrp->pids_length; | 2297 | int *end = cp->tasks_pids + cp->length; |
2263 | 2298 | ||
2264 | /* | 2299 | /* |
2265 | * Advance to the next pid in the array. If this goes off the | 2300 | * Advance to the next pid in the array. If this goes off the |
@@ -2286,26 +2321,33 @@ static struct seq_operations cgroup_tasks_seq_operations = { | |||
2286 | .show = cgroup_tasks_show, | 2321 | .show = cgroup_tasks_show, |
2287 | }; | 2322 | }; |
2288 | 2323 | ||
2289 | static void release_cgroup_pid_array(struct cgroup *cgrp) | 2324 | static void release_cgroup_pid_array(struct cgroup_pids *cp) |
2290 | { | 2325 | { |
2326 | struct cgroup *cgrp = cp->cgrp; | ||
2327 | |||
2291 | down_write(&cgrp->pids_mutex); | 2328 | down_write(&cgrp->pids_mutex); |
2292 | BUG_ON(!cgrp->pids_use_count); | 2329 | BUG_ON(!cp->use_count); |
2293 | if (!--cgrp->pids_use_count) { | 2330 | if (!--cp->use_count) { |
2294 | kfree(cgrp->tasks_pids); | 2331 | list_del(&cp->list); |
2295 | cgrp->tasks_pids = NULL; | 2332 | put_pid_ns(cp->ns); |
2296 | cgrp->pids_length = 0; | 2333 | kfree(cp->tasks_pids); |
2334 | kfree(cp); | ||
2297 | } | 2335 | } |
2298 | up_write(&cgrp->pids_mutex); | 2336 | up_write(&cgrp->pids_mutex); |
2299 | } | 2337 | } |
2300 | 2338 | ||
2301 | static int cgroup_tasks_release(struct inode *inode, struct file *file) | 2339 | static int cgroup_tasks_release(struct inode *inode, struct file *file) |
2302 | { | 2340 | { |
2303 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 2341 | struct seq_file *seq; |
2342 | struct cgroup_pids *cp; | ||
2304 | 2343 | ||
2305 | if (!(file->f_mode & FMODE_READ)) | 2344 | if (!(file->f_mode & FMODE_READ)) |
2306 | return 0; | 2345 | return 0; |
2307 | 2346 | ||
2308 | release_cgroup_pid_array(cgrp); | 2347 | seq = file->private_data; |
2348 | cp = seq->private; | ||
2349 | |||
2350 | release_cgroup_pid_array(cp); | ||
2309 | return seq_release(inode, file); | 2351 | return seq_release(inode, file); |
2310 | } | 2352 | } |
2311 | 2353 | ||
@@ -2324,6 +2366,8 @@ static struct file_operations cgroup_tasks_operations = { | |||
2324 | static int cgroup_tasks_open(struct inode *unused, struct file *file) | 2366 | static int cgroup_tasks_open(struct inode *unused, struct file *file) |
2325 | { | 2367 | { |
2326 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 2368 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); |
2369 | struct pid_namespace *ns = current->nsproxy->pid_ns; | ||
2370 | struct cgroup_pids *cp; | ||
2327 | pid_t *pidarray; | 2371 | pid_t *pidarray; |
2328 | int npids; | 2372 | int npids; |
2329 | int retval; | 2373 | int retval; |
@@ -2350,20 +2394,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file) | |||
2350 | * array if necessary | 2394 | * array if necessary |
2351 | */ | 2395 | */ |
2352 | down_write(&cgrp->pids_mutex); | 2396 | down_write(&cgrp->pids_mutex); |
2353 | kfree(cgrp->tasks_pids); | 2397 | |
2354 | cgrp->tasks_pids = pidarray; | 2398 | list_for_each_entry(cp, &cgrp->pids_list, list) { |
2355 | cgrp->pids_length = npids; | 2399 | if (ns == cp->ns) |
2356 | cgrp->pids_use_count++; | 2400 | goto found; |
2401 | } | ||
2402 | |||
2403 | cp = kzalloc(sizeof(*cp), GFP_KERNEL); | ||
2404 | if (!cp) { | ||
2405 | up_write(&cgrp->pids_mutex); | ||
2406 | kfree(pidarray); | ||
2407 | return -ENOMEM; | ||
2408 | } | ||
2409 | cp->cgrp = cgrp; | ||
2410 | cp->ns = ns; | ||
2411 | get_pid_ns(ns); | ||
2412 | list_add(&cp->list, &cgrp->pids_list); | ||
2413 | found: | ||
2414 | kfree(cp->tasks_pids); | ||
2415 | cp->tasks_pids = pidarray; | ||
2416 | cp->length = npids; | ||
2417 | cp->use_count++; | ||
2357 | up_write(&cgrp->pids_mutex); | 2418 | up_write(&cgrp->pids_mutex); |
2358 | 2419 | ||
2359 | file->f_op = &cgroup_tasks_operations; | 2420 | file->f_op = &cgroup_tasks_operations; |
2360 | 2421 | ||
2361 | retval = seq_open(file, &cgroup_tasks_seq_operations); | 2422 | retval = seq_open(file, &cgroup_tasks_seq_operations); |
2362 | if (retval) { | 2423 | if (retval) { |
2363 | release_cgroup_pid_array(cgrp); | 2424 | release_cgroup_pid_array(cp); |
2364 | return retval; | 2425 | return retval; |
2365 | } | 2426 | } |
2366 | ((struct seq_file *)file->private_data)->private = cgrp; | 2427 | ((struct seq_file *)file->private_data)->private = cp; |
2367 | return 0; | 2428 | return 0; |
2368 | } | 2429 | } |
2369 | 2430 | ||
@@ -2696,33 +2757,42 @@ again: | |||
2696 | mutex_unlock(&cgroup_mutex); | 2757 | mutex_unlock(&cgroup_mutex); |
2697 | 2758 | ||
2698 | /* | 2759 | /* |
2760 | * In general, subsystem has no css->refcnt after pre_destroy(). But | ||
2761 | * in racy cases, subsystem may have to get css->refcnt after | ||
2762 | * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes | ||
2763 | * make rmdir return -EBUSY too often. To avoid that, we use waitqueue | ||
2764 | * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir | ||
2765 | * and subsystem's reference count handling. Please see css_get/put | ||
2766 | * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. | ||
2767 | */ | ||
2768 | set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
2769 | |||
2770 | /* | ||
2699 | * Call pre_destroy handlers of subsys. Notify subsystems | 2771 | * Call pre_destroy handlers of subsys. Notify subsystems |
2700 | * that rmdir() request comes. | 2772 | * that rmdir() request comes. |
2701 | */ | 2773 | */ |
2702 | ret = cgroup_call_pre_destroy(cgrp); | 2774 | ret = cgroup_call_pre_destroy(cgrp); |
2703 | if (ret) | 2775 | if (ret) { |
2776 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
2704 | return ret; | 2777 | return ret; |
2778 | } | ||
2705 | 2779 | ||
2706 | mutex_lock(&cgroup_mutex); | 2780 | mutex_lock(&cgroup_mutex); |
2707 | parent = cgrp->parent; | 2781 | parent = cgrp->parent; |
2708 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { | 2782 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { |
2783 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
2709 | mutex_unlock(&cgroup_mutex); | 2784 | mutex_unlock(&cgroup_mutex); |
2710 | return -EBUSY; | 2785 | return -EBUSY; |
2711 | } | 2786 | } |
2712 | /* | ||
2713 | * css_put/get is provided for subsys to grab refcnt to css. In typical | ||
2714 | * case, subsystem has no reference after pre_destroy(). But, under | ||
2715 | * hierarchy management, some *temporal* refcnt can be hold. | ||
2716 | * To avoid returning -EBUSY to a user, waitqueue is used. If subsys | ||
2717 | * is really busy, it should return -EBUSY at pre_destroy(). wake_up | ||
2718 | * is called when css_put() is called and refcnt goes down to 0. | ||
2719 | */ | ||
2720 | set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
2721 | prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); | 2787 | prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); |
2722 | |||
2723 | if (!cgroup_clear_css_refs(cgrp)) { | 2788 | if (!cgroup_clear_css_refs(cgrp)) { |
2724 | mutex_unlock(&cgroup_mutex); | 2789 | mutex_unlock(&cgroup_mutex); |
2725 | schedule(); | 2790 | /* |
2791 | * Because someone may call cgroup_wakeup_rmdir_waiter() before | ||
2792 | * prepare_to_wait(), we need to check this flag. | ||
2793 | */ | ||
2794 | if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) | ||
2795 | schedule(); | ||
2726 | finish_wait(&cgroup_rmdir_waitq, &wait); | 2796 | finish_wait(&cgroup_rmdir_waitq, &wait); |
2727 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | 2797 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); |
2728 | if (signal_pending(current)) | 2798 | if (signal_pending(current)) |
@@ -3294,7 +3364,7 @@ void __css_put(struct cgroup_subsys_state *css) | |||
3294 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 3364 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
3295 | check_for_release(cgrp); | 3365 | check_for_release(cgrp); |
3296 | } | 3366 | } |
3297 | cgroup_wakeup_rmdir_waiters(cgrp); | 3367 | cgroup_wakeup_rmdir_waiter(cgrp); |
3298 | } | 3368 | } |
3299 | rcu_read_unlock(); | 3369 | rcu_read_unlock(); |
3300 | } | 3370 | } |