about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Ben Blum <bblum@google.com> 2009-09-23 18:56:27 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2009-09-24 10:20:58 -0400
commit 72a8cb30d10d4041c455a7054607a7d519167c87 (patch)
tree 9b499f9c7f4de011ba5c8282df0b2280b7c21f0b
parent 102a775e3647628727ae83a9a6abf0564c3ca7cb (diff)
cgroups: ensure correct concurrent opening/reading of pidlists across pid namespaces
Previously, two processes from different pid namespaces reading the tasks or procs file could result in one process seeing results from the other's namespace.

Rather than one pidlist for each file in a cgroup, we now keep a list of pidlists keyed by namespace and file type (tasks versus procs), in which entries are placed on demand. Each pidlist has its own lock, and because the pidlists themselves are passed around in the seq_file's private pointer, we don't have to touch the cgroup or its master list except when creating and destroying entries.

Signed-off-by: Ben Blum <bblum@google.com>
Signed-off-by: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- include/linux/cgroup.h 34
-rw-r--r-- kernel/cgroup.c 107
2 files changed, 119 insertions, 22 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 2357733a0a80..88e863460726 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -141,15 +141,36 @@ enum {
141 CGRP_WAIT_ON_RMDIR, 141 CGRP_WAIT_ON_RMDIR,
142}; 142};
143 143
144/* which pidlist file are we talking about? */
145enum cgroup_filetype {
146 CGROUP_FILE_PROCS,
147 CGROUP_FILE_TASKS,
148};
149
150/*
151 * A pidlist is a list of pids that virtually represents the contents of one
152 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
153 * a pair (one each for procs, tasks) for each pid namespace that's relevant
154 * to the cgroup.
155 */
144struct cgroup_pidlist { 156struct cgroup_pidlist {
145 /* protects the other fields */ 157 /*
146 struct rw_semaphore mutex; 158 * used to find which pidlist is wanted. doesn't change as long as
159 * this particular list stays in the list.
160 */
161 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
147 /* array of xids */ 162 /* array of xids */
148 pid_t *list; 163 pid_t *list;
149 /* how many elements the above list has */ 164 /* how many elements the above list has */
150 int length; 165 int length;
151 /* how many files are using the current array */ 166 /* how many files are using the current array */
152 int use_count; 167 int use_count;
168 /* each of these stored in a list by its cgroup */
169 struct list_head links;
170 /* pointer to the cgroup we belong to, for list removal purposes */
171 struct cgroup *owner;
172 /* protects the other fields */
173 struct rw_semaphore mutex;
153}; 174};
154 175
155struct cgroup { 176struct cgroup {
@@ -190,9 +211,12 @@ struct cgroup {
190 */ 211 */
191 struct list_head release_list; 212 struct list_head release_list;
192 213
193 /* we will have two separate pidlists, one for pids (the tasks file) 214 /*
194 * and one for tgids (the procs file). */ 215 * list of pidlists, up to two for each namespace (one for procs, one
195 struct cgroup_pidlist tasks, procs; 216 * for tasks); created on demand.
217 */
218 struct list_head pidlists;
219 struct mutex pidlist_mutex;
196 220
197 /* For RCU-protected deletion */ 221 /* For RCU-protected deletion */
198 struct rcu_head rcu_head; 222 struct rcu_head rcu_head;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a9433f50e53d..97194ba12014 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -776,6 +776,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
776 */ 776 */
777 deactivate_super(cgrp->root->sb); 777 deactivate_super(cgrp->root->sb);
778 778
779 /*
780 * if we're getting rid of the cgroup, refcount should ensure
781 * that there are no pidlists left.
782 */
783 BUG_ON(!list_empty(&cgrp->pidlists));
784
779 call_rcu(&cgrp->rcu_head, free_cgroup_rcu); 785 call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
780 } 786 }
781 iput(inode); 787 iput(inode);
@@ -1121,8 +1127,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1121 INIT_LIST_HEAD(&cgrp->children); 1127 INIT_LIST_HEAD(&cgrp->children);
1122 INIT_LIST_HEAD(&cgrp->css_sets); 1128 INIT_LIST_HEAD(&cgrp->css_sets);
1123 INIT_LIST_HEAD(&cgrp->release_list); 1129 INIT_LIST_HEAD(&cgrp->release_list);
1124 init_rwsem(&(cgrp->tasks.mutex)); 1130 INIT_LIST_HEAD(&cgrp->pidlists);
1125 init_rwsem(&(cgrp->procs.mutex)); 1131 mutex_init(&cgrp->pidlist_mutex);
1126} 1132}
1127 1133
1128static void init_cgroup_root(struct cgroupfs_root *root) 1134static void init_cgroup_root(struct cgroupfs_root *root)
@@ -2396,9 +2402,59 @@ static int cmppid(const void *a, const void *b)
2396} 2402}
2397 2403
2398/* 2404/*
2405 * find the appropriate pidlist for our purpose (given procs vs tasks)
2406 * returns with the lock on that pidlist already held, and takes care
2407 * of the use count, or returns NULL with no locks held if we're out of
2408 * memory.
2409 */
2410static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2411 enum cgroup_filetype type)
2412{
2413 struct cgroup_pidlist *l;
2414 /* don't need task_nsproxy() if we're looking at ourself */
2415 struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
2416 /*
2417 * We can't drop the pidlist_mutex before taking the l->mutex in case
2418 * the last ref-holder is trying to remove l from the list at the same
2419 * time. Holding the pidlist_mutex precludes somebody taking whichever
2420 * list we find out from under us - compare release_pid_array().
2421 */
2422 mutex_lock(&cgrp->pidlist_mutex);
2423 list_for_each_entry(l, &cgrp->pidlists, links) {
2424 if (l->key.type == type && l->key.ns == ns) {
2425 /* found a matching list - drop the extra refcount */
2426 put_pid_ns(ns);
2427 /* make sure l doesn't vanish out from under us */
2428 down_write(&l->mutex);
2429 mutex_unlock(&cgrp->pidlist_mutex);
2430 l->use_count++;
2431 return l;
2432 }
2433 }
2434 /* entry not found; create a new one */
2435 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
2436 if (!l) {
2437 mutex_unlock(&cgrp->pidlist_mutex);
2438 put_pid_ns(ns);
2439 return l;
2440 }
2441 init_rwsem(&l->mutex);
2442 down_write(&l->mutex);
2443 l->key.type = type;
2444 l->key.ns = ns;
2445 l->use_count = 0; /* don't increment here */
2446 l->list = NULL;
2447 l->owner = cgrp;
2448 list_add(&l->links, &cgrp->pidlists);
2449 mutex_unlock(&cgrp->pidlist_mutex);
2450 return l;
2451}
2452
2453/*
2399 * Load a cgroup's pidarray with either procs' tgids or tasks' pids 2454 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
2400 */ 2455 */
2401static int pidlist_array_load(struct cgroup *cgrp, bool procs) 2456static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
2457 struct cgroup_pidlist **lp)
2402{ 2458{
2403 pid_t *array; 2459 pid_t *array;
2404 int length; 2460 int length;
@@ -2423,7 +2479,10 @@ static int pidlist_array_load(struct cgroup *cgrp, bool procs)
2423 if (unlikely(n == length)) 2479 if (unlikely(n == length))
2424 break; 2480 break;
2425 /* get tgid or pid for procs or tasks file respectively */ 2481 /* get tgid or pid for procs or tasks file respectively */
2426 pid = (procs ? task_tgid_vnr(tsk) : task_pid_vnr(tsk)); 2482 if (type == CGROUP_FILE_PROCS)
2483 pid = task_tgid_vnr(tsk);
2484 else
2485 pid = task_pid_vnr(tsk);
2427 if (pid > 0) /* make sure to only use valid results */ 2486 if (pid > 0) /* make sure to only use valid results */
2428 array[n++] = pid; 2487 array[n++] = pid;
2429 } 2488 }
@@ -2431,19 +2490,20 @@ static int pidlist_array_load(struct cgroup *cgrp, bool procs)
2431 length = n; 2490 length = n;
2432 /* now sort & (if procs) strip out duplicates */ 2491 /* now sort & (if procs) strip out duplicates */
2433 sort(array, length, sizeof(pid_t), cmppid, NULL); 2492 sort(array, length, sizeof(pid_t), cmppid, NULL);
2434 if (procs) { 2493 if (type == CGROUP_FILE_PROCS)
2435 length = pidlist_uniq(&array, length); 2494 length = pidlist_uniq(&array, length);
2436 l = &(cgrp->procs); 2495 l = cgroup_pidlist_find(cgrp, type);
2437 } else { 2496 if (!l) {
2438 l = &(cgrp->tasks); 2497 kfree(array);
2498 return -ENOMEM;
2439 } 2499 }
2440 /* store array in cgroup, freeing old if necessary */ 2500 /* store array, freeing old if necessary - lock already held */
2441 down_write(&l->mutex);
2442 kfree(l->list); 2501 kfree(l->list);
2443 l->list = array; 2502 l->list = array;
2444 l->length = length; 2503 l->length = length;
2445 l->use_count++; 2504 l->use_count++;
2446 up_write(&l->mutex); 2505 up_write(&l->mutex);
2506 *lp = l;
2447 return 0; 2507 return 0;
2448} 2508}
2449 2509
@@ -2586,13 +2646,26 @@ static const struct seq_operations cgroup_pidlist_seq_operations = {
2586 2646
2587static void cgroup_release_pid_array(struct cgroup_pidlist *l) 2647static void cgroup_release_pid_array(struct cgroup_pidlist *l)
2588{ 2648{
2649 /*
2650 * the case where we're the last user of this particular pidlist will
2651 * have us remove it from the cgroup's list, which entails taking the
2652 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
2653 * pidlist_mutex, we have to take pidlist_mutex first.
2654 */
2655 mutex_lock(&l->owner->pidlist_mutex);
2589 down_write(&l->mutex); 2656 down_write(&l->mutex);
2590 BUG_ON(!l->use_count); 2657 BUG_ON(!l->use_count);
2591 if (!--l->use_count) { 2658 if (!--l->use_count) {
2659 /* we're the last user if refcount is 0; remove and free */
2660 list_del(&l->links);
2661 mutex_unlock(&l->owner->pidlist_mutex);
2592 kfree(l->list); 2662 kfree(l->list);
2593 l->list = NULL; 2663 put_pid_ns(l->key.ns);
2594 l->length = 0; 2664 up_write(&l->mutex);
2665 kfree(l);
2666 return;
2595 } 2667 }
2668 mutex_unlock(&l->owner->pidlist_mutex);
2596 up_write(&l->mutex); 2669 up_write(&l->mutex);
2597} 2670}
2598 2671
@@ -2623,10 +2696,10 @@ static const struct file_operations cgroup_pidlist_operations = {
2623 * in the cgroup. 2696 * in the cgroup.
2624 */ 2697 */
2625/* helper function for the two below it */ 2698/* helper function for the two below it */
2626static int cgroup_pidlist_open(struct file *file, bool procs) 2699static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
2627{ 2700{
2628 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2701 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2629 struct cgroup_pidlist *l = (procs ? &cgrp->procs : &cgrp->tasks); 2702 struct cgroup_pidlist *l;
2630 int retval; 2703 int retval;
2631 2704
2632 /* Nothing to do for write-only files */ 2705 /* Nothing to do for write-only files */
@@ -2634,7 +2707,7 @@ static int cgroup_pidlist_open(struct file *file, bool procs)
2634 return 0; 2707 return 0;
2635 2708
2636 /* have the array populated */ 2709 /* have the array populated */
2637 retval = pidlist_array_load(cgrp, procs); 2710 retval = pidlist_array_load(cgrp, type, &l);
2638 if (retval) 2711 if (retval)
2639 return retval; 2712 return retval;
2640 /* configure file information */ 2713 /* configure file information */
@@ -2650,11 +2723,11 @@ static int cgroup_pidlist_open(struct file *file, bool procs)
2650} 2723}
2651static int cgroup_tasks_open(struct inode *unused, struct file *file) 2724static int cgroup_tasks_open(struct inode *unused, struct file *file)
2652{ 2725{
2653 return cgroup_pidlist_open(file, false); 2726 return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
2654} 2727}
2655static int cgroup_procs_open(struct inode *unused, struct file *file) 2728static int cgroup_procs_open(struct inode *unused, struct file *file)
2656{ 2729{
2657 return cgroup_pidlist_open(file, true); 2730 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
2658} 2731}
2659 2732
2660static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 2733static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,