aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorOleg Nesterov <oleg@tv-sign.ru>2007-10-19 02:40:00 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-10-19 14:53:38 -0400
commit762a24beed3f3ab93224bd447710e6c36fcf1968 (patch)
tree5753a796e11e53173194199292fdfcaae04efb47
parentd4c5e41f3f1b0c19448fcf2d259bdab1ede75e2e (diff)
pid namespaces: rework forget_original_parent()
A pid namespace is a "view" of a particular set of tasks on the system. They work in a similar way to filesystem namespaces. A file (or a process) can be accessed in multiple namespaces, but it may have a different name in each. In a filesystem, this name might be /etc/passwd in one namespace, but /chroot/etc/passwd in another. For processes, a process may have pid 1234 in one namespace, but be pid 1 in another. This allows new pid namespaces to have basically arbitrary pids, and not have to worry about what pids exist in other namespaces. This is essential for checkpoint/restart, where a restarted process's pid might collide with the pid of an existing process on the system. In this particular implementation, pid namespaces have a parent-child relationship, just like processes. A process in a pid namespace may see all of the processes in the same namespace, as well as all of the processes in all of the namespaces which are children of its namespace. Processes may not, however, see others which are in their parent's namespace, but not in their own. The same goes for sibling namespaces. The known issue to be solved in the near future is signal handling at the namespace boundary. That is, currently the namespace's init is treated like an ordinary task that can be killed from within a namespace. Ideally, the signal handling by the namespace's init should have two sides: when signaling the init from its namespace, the init should look like a real init task, i.e. receive only those signals that it explicitly wants to; when signaling the init from one of the parent namespaces, init should look like an ordinary task, i.e. receive any signal, only taking the general permissions into account. The pid namespace was developed by Pavel Emelyanov and Sukadev Bhattiprolu and we eventually came to almost the same implementation, which differed in some details. This set is based on Pavel's patches, but it includes comments and patches from Sukadev. 
Many thanks to Oleg, who reviewed the patches, pointed out many bugs and gave valuable advice on how to make this set cleaner. This patch: We have to call exit_task_namespaces() only after the exiting task has reparented all its children and is sure that no other threads will reparent theirs for it. Why this is needed is explained in the appropriate patch. This one only reworks the forget_original_parent() so that after calling this a task cannot be/become the parent of any other task. We check PF_EXITING instead of ->exit_state while choosing the new parent. Note that tasklist_lock acts as a barrier: everyone who takes tasklist after us (when forget_original_parent() drops it) must see PF_EXITING. The other changes are just cleanups. They just move some code from exit_notify to forget_original_parent(). It is a bit silly to declare ptrace_dead in exit_notify(), take tasklist, pass ptrace_dead to forget_original_parent(), unlock-lock-unlock tasklist, and then use ptrace_dead. Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru> Signed-off-by: Pavel Emelyanov <xemul@openvz.org> Cc: Sukadev Bhattiprolu <sukadev@us.ibm.com> Cc: Paul Menage <menage@google.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--kernel/exit.c39
1 files changed, 21 insertions, 18 deletions
diff --git a/kernel/exit.c b/kernel/exit.c
index 179ac74bf911..3f2182ccf187 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -666,10 +666,14 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
666 * the child reaper process (ie "init") in our pid 666 * the child reaper process (ie "init") in our pid
667 * space. 667 * space.
668 */ 668 */
669static void 669static void forget_original_parent(struct task_struct *father)
670forget_original_parent(struct task_struct *father, struct list_head *to_release)
671{ 670{
672 struct task_struct *p, *n, *reaper = father; 671 struct task_struct *p, *n, *reaper = father;
672 struct list_head ptrace_dead;
673
674 INIT_LIST_HEAD(&ptrace_dead);
675
676 write_lock_irq(&tasklist_lock);
673 677
674 do { 678 do {
675 reaper = next_thread(reaper); 679 reaper = next_thread(reaper);
@@ -677,7 +681,7 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release)
677 reaper = task_child_reaper(father); 681 reaper = task_child_reaper(father);
678 break; 682 break;
679 } 683 }
680 } while (reaper->exit_state); 684 } while (reaper->flags & PF_EXITING);
681 685
682 /* 686 /*
683 * There are only two places where our children can be: 687 * There are only two places where our children can be:
@@ -714,12 +718,23 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release)
714 * while it was being traced by us, to be able to see it in wait4. 718 * while it was being traced by us, to be able to see it in wait4.
715 */ 719 */
716 if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1)) 720 if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1))
717 list_add(&p->ptrace_list, to_release); 721 list_add(&p->ptrace_list, &ptrace_dead);
718 } 722 }
723
719 list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) { 724 list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) {
720 p->real_parent = reaper; 725 p->real_parent = reaper;
721 reparent_thread(p, father, 1); 726 reparent_thread(p, father, 1);
722 } 727 }
728
729 write_unlock_irq(&tasklist_lock);
730 BUG_ON(!list_empty(&father->children));
731 BUG_ON(!list_empty(&father->ptrace_children));
732
733 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) {
734 list_del_init(&p->ptrace_list);
735 release_task(p);
736 }
737
723} 738}
724 739
725/* 740/*
@@ -730,7 +745,6 @@ static void exit_notify(struct task_struct *tsk)
730{ 745{
731 int state; 746 int state;
732 struct task_struct *t; 747 struct task_struct *t;
733 struct list_head ptrace_dead, *_p, *_n;
734 struct pid *pgrp; 748 struct pid *pgrp;
735 749
736 if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT) 750 if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT)
@@ -751,8 +765,6 @@ static void exit_notify(struct task_struct *tsk)
751 spin_unlock_irq(&tsk->sighand->siglock); 765 spin_unlock_irq(&tsk->sighand->siglock);
752 } 766 }
753 767
754 write_lock_irq(&tasklist_lock);
755
756 /* 768 /*
757 * This does two things: 769 * This does two things:
758 * 770 *
@@ -761,12 +773,9 @@ static void exit_notify(struct task_struct *tsk)
761 * as a result of our exiting, and if they have any stopped 773 * as a result of our exiting, and if they have any stopped
762 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) 774 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
763 */ 775 */
776 forget_original_parent(tsk);
764 777
765 INIT_LIST_HEAD(&ptrace_dead); 778 write_lock_irq(&tasklist_lock);
766 forget_original_parent(tsk, &ptrace_dead);
767 BUG_ON(!list_empty(&tsk->children));
768 BUG_ON(!list_empty(&tsk->ptrace_children));
769
770 /* 779 /*
771 * Check to see if any process groups have become orphaned 780 * Check to see if any process groups have become orphaned
772 * as a result of our exiting, and if they have any stopped 781 * as a result of our exiting, and if they have any stopped
@@ -831,12 +840,6 @@ static void exit_notify(struct task_struct *tsk)
831 840
832 write_unlock_irq(&tasklist_lock); 841 write_unlock_irq(&tasklist_lock);
833 842
834 list_for_each_safe(_p, _n, &ptrace_dead) {
835 list_del_init(_p);
836 t = list_entry(_p, struct task_struct, ptrace_list);
837 release_task(t);
838 }
839
840 /* If the process is dead, release it - nobody will wait for it */ 843 /* If the process is dead, release it - nobody will wait for it */
841 if (state == EXIT_DEAD) 844 if (state == EXIT_DEAD)
842 release_task(tsk); 845 release_task(tsk);