author     Paul Menage <menage@google.com>                2009-09-23 18:56:25 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org> 2009-09-24 10:20:58 -0400
commit     8f3ff20862cfcb85500a2bb55ee64622bd59fd0c (patch)
tree       ccf408f52ee23b1c0130520a84c21bbd8f4077e7
parent     2c6ab6d200827e1c41dc71fff3a2ac7473f51777 (diff)
cgroups: revert "cgroups: fix pid namespace bug"
The following series adds a "cgroup.procs" file to each cgroup that reports unique tgids rather than pids, and allows all threads in a threadgroup to be atomically moved to a new cgroup.

The subsystem "attach" interface is modified to support attaching whole threadgroups at a time, which could introduce problems if any subsystem were to need to access the old cgroup of every thread being moved. The attach interface may need to be revised if that becomes the case.

Also added is functionality for read/write locking all CLONE_THREAD fork()ing within a threadgroup, by means of an rwsem that lives in the sighand_struct, both to make it per-threadgroup and to share a cacheline with the sighand's atomic count. This scheme should introduce no extra overhead in the fork path when there's no contention.

The final patch reveals a potential race when forking before a subsystem's attach function is called - one possible solution, should any subsystem hit this problem, is to hold the group's fork mutex across the attach() calls, though no subsystem yet demonstrates the need for such an extended critical section.

This patch:

Revert commit 096b7fe012d66ed55e98bc8022405ede0cc80e96

    Author:     Li Zefan <lizf@cn.fujitsu.com>
    AuthorDate: Wed Jul 29 15:04:04 2009 -0700
    Commit:     Linus Torvalds <torvalds@linux-foundation.org>
    CommitDate: Wed Jul 29 19:10:35 2009 -0700

        cgroups: fix pid namespace bug

This is in preparation for some clashing cgroups changes that subsume the original commit's functionality.

The original commit fixed a pid namespace bug which Ben Blum fixed independently (in the same way, but with different code) as part of a series of patches. I played around with trying to reconcile Ben's patch series with Li's patch, but concluded that it was simpler to just revert Li's, given that Ben's patch series contained essentially the same fix.

Signed-off-by: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  include/linux/cgroup.h | 11
-rw-r--r--  kernel/cgroup.c        | 95
2 files changed, 31 insertions, 75 deletions
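The series description above mentions read/write locking of all CLONE_THREAD fork()ing via an rwsem that lives in sighand_struct, so that a whole threadgroup can be frozen and attached atomically. The userspace sketch below is only a rough analogue of that scheme, not code from the series: a pthread rwlock stands in for the kernel rwsem, and every identifier here (group_fork_gate, spawn_worker, attach_threadgroup) plus the cgroup path are invented for the example.

/*
 * Userspace analogue of the threadgroup fork gate described above.
 * Thread creation takes the lock for reading (shared, cheap); the
 * "attach whole threadgroup" operation takes it for writing, so no new
 * threads can appear while the group is being moved.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t group_fork_gate = PTHREAD_RWLOCK_INITIALIZER;

static void *worker(void *arg)
{
	(void)arg;
	return NULL;
}

/* Analogue of a CLONE_THREAD fork: may run concurrently with other forks. */
static int spawn_worker(pthread_t *tid)
{
	int ret;

	pthread_rwlock_rdlock(&group_fork_gate);
	ret = pthread_create(tid, NULL, worker, NULL);
	pthread_rwlock_unlock(&group_fork_gate);
	return ret;
}

/* Analogue of moving every thread in the group to a new cgroup at once. */
static void attach_threadgroup(const char *new_cgroup)
{
	pthread_rwlock_wrlock(&group_fork_gate);
	/* ... walk the (now frozen) set of threads and move each one ... */
	printf("atomically attaching all threads to %s\n", new_cgroup);
	pthread_rwlock_unlock(&group_fork_gate);
}

int main(void)
{
	pthread_t tid;

	if (spawn_worker(&tid) == 0)
		pthread_join(tid, NULL);
	attach_threadgroup("/sys/fs/cgroup/example");	/* illustrative path */
	return 0;
}

In the uncontended case the read lock is essentially free, which is the point made above about adding no extra overhead to the fork path.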
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 90bba9e62286..c833d6f23672 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -179,11 +179,14 @@ struct cgroup {
	 */
	struct list_head release_list;
 
-	/* pids_mutex protects pids_list and cached pid arrays. */
+	/* pids_mutex protects the fields below */
	struct rw_semaphore pids_mutex;
-
-	/* Linked list of struct cgroup_pids */
-	struct list_head pids_list;
+	/* Array of process ids in the cgroup */
+	pid_t *tasks_pids;
+	/* How many files are using the current tasks_pids array */
+	int pids_use_count;
+	/* Length of the current tasks_pids array */
+	int pids_length;
 
	/* For RCU-protected deletion */
	struct rcu_head rcu_head;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 14efffed72c8..22db0a7cf1fa 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1121,7 +1121,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
	INIT_LIST_HEAD(&cgrp->children);
	INIT_LIST_HEAD(&cgrp->css_sets);
	INIT_LIST_HEAD(&cgrp->release_list);
-	INIT_LIST_HEAD(&cgrp->pids_list);
	init_rwsem(&cgrp->pids_mutex);
 }
 
@@ -2431,30 +2430,12 @@ err:
	return ret;
 }
 
-/*
- * Cache pids for all threads in the same pid namespace that are
- * opening the same "tasks" file.
- */
-struct cgroup_pids {
-	/* The node in cgrp->pids_list */
-	struct list_head list;
-	/* The cgroup those pids belong to */
-	struct cgroup *cgrp;
-	/* The namepsace those pids belong to */
-	struct pid_namespace *ns;
-	/* Array of process ids in the cgroup */
-	pid_t *tasks_pids;
-	/* How many files are using the this tasks_pids array */
-	int use_count;
-	/* Length of the current tasks_pids array */
-	int length;
-};
-
 static int cmppid(const void *a, const void *b)
 {
	return *(pid_t *)a - *(pid_t *)b;
 }
 
+
 /*
  * seq_file methods for the "tasks" file. The seq_file position is the
  * next pid to display; the seq_file iterator is a pointer to the pid
@@ -2469,47 +2450,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
	 * after a seek to the start). Use a binary-search to find the
	 * next pid to display, if any
	 */
-	struct cgroup_pids *cp = s->private;
-	struct cgroup *cgrp = cp->cgrp;
+	struct cgroup *cgrp = s->private;
	int index = 0, pid = *pos;
	int *iter;
 
	down_read(&cgrp->pids_mutex);
	if (pid) {
-		int end = cp->length;
+		int end = cgrp->pids_length;
 
		while (index < end) {
			int mid = (index + end) / 2;
-			if (cp->tasks_pids[mid] == pid) {
+			if (cgrp->tasks_pids[mid] == pid) {
				index = mid;
				break;
-			} else if (cp->tasks_pids[mid] <= pid)
+			} else if (cgrp->tasks_pids[mid] <= pid)
				index = mid + 1;
			else
				end = mid;
		}
	}
	/* If we're off the end of the array, we're done */
-	if (index >= cp->length)
+	if (index >= cgrp->pids_length)
		return NULL;
	/* Update the abstract position to be the actual pid that we found */
-	iter = cp->tasks_pids + index;
+	iter = cgrp->tasks_pids + index;
	*pos = *iter;
	return iter;
 }
 
 static void cgroup_tasks_stop(struct seq_file *s, void *v)
 {
-	struct cgroup_pids *cp = s->private;
-	struct cgroup *cgrp = cp->cgrp;
+	struct cgroup *cgrp = s->private;
	up_read(&cgrp->pids_mutex);
 }
 
 static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
 {
-	struct cgroup_pids *cp = s->private;
+	struct cgroup *cgrp = s->private;
	int *p = v;
-	int *end = cp->tasks_pids + cp->length;
+	int *end = cgrp->tasks_pids + cgrp->pids_length;
 
	/*
	 * Advance to the next pid in the array. If this goes off the
@@ -2536,33 +2515,26 @@ static const struct seq_operations cgroup_tasks_seq_operations = {
	.show = cgroup_tasks_show,
 };
 
-static void release_cgroup_pid_array(struct cgroup_pids *cp)
+static void release_cgroup_pid_array(struct cgroup *cgrp)
 {
-	struct cgroup *cgrp = cp->cgrp;
-
	down_write(&cgrp->pids_mutex);
-	BUG_ON(!cp->use_count);
-	if (!--cp->use_count) {
-		list_del(&cp->list);
-		put_pid_ns(cp->ns);
-		kfree(cp->tasks_pids);
-		kfree(cp);
+	BUG_ON(!cgrp->pids_use_count);
+	if (!--cgrp->pids_use_count) {
+		kfree(cgrp->tasks_pids);
+		cgrp->tasks_pids = NULL;
+		cgrp->pids_length = 0;
	}
	up_write(&cgrp->pids_mutex);
 }
 
 static int cgroup_tasks_release(struct inode *inode, struct file *file)
 {
-	struct seq_file *seq;
-	struct cgroup_pids *cp;
+	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
 
	if (!(file->f_mode & FMODE_READ))
		return 0;
 
-	seq = file->private_data;
-	cp = seq->private;
-
-	release_cgroup_pid_array(cp);
+	release_cgroup_pid_array(cgrp);
	return seq_release(inode, file);
 }
 
@@ -2581,8 +2553,6 @@ static struct file_operations cgroup_tasks_operations = {
 static int cgroup_tasks_open(struct inode *unused, struct file *file)
 {
	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-	struct pid_namespace *ns = current->nsproxy->pid_ns;
-	struct cgroup_pids *cp;
	pid_t *pidarray;
	int npids;
	int retval;
@@ -2609,37 +2579,20 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
	 * array if necessary
	 */
	down_write(&cgrp->pids_mutex);
-
-	list_for_each_entry(cp, &cgrp->pids_list, list) {
-		if (ns == cp->ns)
-			goto found;
-	}
-
-	cp = kzalloc(sizeof(*cp), GFP_KERNEL);
-	if (!cp) {
-		up_write(&cgrp->pids_mutex);
-		kfree(pidarray);
-		return -ENOMEM;
-	}
-	cp->cgrp = cgrp;
-	cp->ns = ns;
-	get_pid_ns(ns);
-	list_add(&cp->list, &cgrp->pids_list);
-found:
-	kfree(cp->tasks_pids);
-	cp->tasks_pids = pidarray;
-	cp->length = npids;
-	cp->use_count++;
+	kfree(cgrp->tasks_pids);
+	cgrp->tasks_pids = pidarray;
+	cgrp->pids_length = npids;
+	cgrp->pids_use_count++;
	up_write(&cgrp->pids_mutex);
 
	file->f_op = &cgroup_tasks_operations;
 
	retval = seq_open(file, &cgroup_tasks_seq_operations);
	if (retval) {
-		release_cgroup_pid_array(cp);
+		release_cgroup_pid_array(cgrp);
		return retval;
	}
-	((struct seq_file *)file->private_data)->private = cp;
+	((struct seq_file *)file->private_data)->private = cgrp;
	return 0;
 }
 
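For reference, the binary search that the restored cgroup_tasks_start() (hunk at -2469 above) uses to resume a "tasks" read at the seq_file position can be exercised on its own. The sketch below is only an illustration: the seq_file and pids_mutex machinery is omitted, and next_pid_index() is a helper name invented here, not a kernel function.

/*
 * Standalone exercise of the lower-bound style search used by
 * cgroup_tasks_start(): given a sorted pid array and the seq_file
 * position (the next pid to display), find the index to resume from.
 */
#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>

static int cmppid(const void *a, const void *b)
{
	return *(pid_t *)a - *(pid_t *)b;
}

static int next_pid_index(const pid_t *pids, int length, pid_t pos)
{
	int index = 0, end = length;

	if (pos) {
		while (index < end) {
			int mid = (index + end) / 2;

			if (pids[mid] == pos) {
				index = mid;
				break;
			} else if (pids[mid] <= pos)
				index = mid + 1;
			else
				end = mid;
		}
	}
	return index;	/* may equal length: reader is off the end */
}

int main(void)
{
	pid_t pids[] = { 412, 97, 1024, 7, 219 };
	int n = sizeof(pids) / sizeof(pids[0]);
	int idx;

	qsort(pids, n, sizeof(pid_t), cmppid);

	/* Position 300 no longer exists (that task exited); resume at 412. */
	idx = next_pid_index(pids, n, 300);
	if (idx < n)
		printf("resume at pid %d\n", (int)pids[idx]);
	return 0;
}

If the exact pid at the stored position has exited, the search falls through to the first larger pid, which is why a reader that seeks back into the middle of the file still sees a consistent, sorted continuation.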