about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Lennart Poettering <lennart@poettering.net> 2012-03-23 18:01:54 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2012-03-23 19:58:32 -0400
commit: ebec18a6d3aa1e7d84aab16225e87fd25170ec2b (patch)
tree: fd329dcd5173c252fc7aed64cab9c2a51575dcec
parent: 953326cb60c1dff1bd3458d6468d16d75f2bcd61 (diff)
prctl: add PR_{SET,GET}_CHILD_SUBREAPER to allow simple process supervision
Userspace service managers/supervisors need to track their started
services. Many services daemonize by double-forking and get implicitly
re-parented to PID 1. The service manager will no longer be able to
receive the SIGCHLD signals for them, and is no longer in charge of
reaping the children with wait(). All information about the children is
lost at the moment PID 1 cleans up the re-parented processes.

With this prctl, a service manager process can mark itself as a sort of
'sub-init', able to stay as the parent for all orphaned processes
created by the started services. All SIGCHLD signals will be delivered
to the service manager.

For a service manager, receiving SIGCHLD and doing wait() is much
preferred over any possible asynchronous notification about specific
PIDs, because the service manager has full access to the child process
data in /proc, and the PID cannot be re-used until the wait(), which the
service manager itself is in charge of, has happened.

As a side effect, the relevant parent PID information does not get lost
by a double-fork, which results in a more elaborate process tree and
'ps' output:

before:
  # ps afx
  253 ?        Ss     0:00 /bin/dbus-daemon --system --nofork
  294 ?        Sl     0:00 /usr/libexec/polkit-1/polkitd
  328 ?        S      0:00 /usr/sbin/modem-manager
  608 ?        Sl     0:00 /usr/libexec/colord
  658 ?        Sl     0:00 /usr/libexec/upowerd
  819 ?        Sl     0:00 /usr/libexec/imsettings-daemon
  916 ?        Sl     0:00 /usr/libexec/udisks-daemon
  917 ?        S      0:00  \_ udisks-daemon: not polling any devices

after:
  # ps afx
  294 ?        Ss     0:00 /bin/dbus-daemon --system --nofork
  426 ?        Sl     0:00  \_ /usr/libexec/polkit-1/polkitd
  449 ?        S      0:00  \_ /usr/sbin/modem-manager
  635 ?        Sl     0:00  \_ /usr/libexec/colord
  705 ?        Sl     0:00  \_ /usr/libexec/upowerd
  959 ?        Sl     0:00  \_ /usr/libexec/udisks-daemon
  960 ?        S      0:00  |   \_ udisks-daemon: not polling any devices
  977 ?        Sl     0:00  \_ /usr/libexec/packagekitd

This prctl is orthogonal to PID namespaces. PID namespaces are isolated
from each other, while a service management process usually requires the
services to live in the same namespace, to be able to talk to each
other.

Users of this will be the systemd per-user instance, which provides
init-like functionality for the user's login session and D-Bus, which
activates bus services on-demand. Both need init-like capabilities to
be able to properly keep track of the services they start.

Many thanks to Oleg for several rounds of review and insights.

[akpm@linux-foundation.org: fix comment layout and spelling]
[akpm@linux-foundation.org: add lengthy code comment from Oleg]
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Lennart Poettering <lennart@poettering.net>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Acked-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  include/linux/prctl.h   3
-rw-r--r--  include/linux/sched.h  12
-rw-r--r--  kernel/exit.c          33
-rw-r--r--  kernel/fork.c           3
-rw-r--r--  kernel/sys.c            8
5 files changed, 54 insertions, 5 deletions
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a0413ac3abe8..e0cfec2490aa 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -121,4 +121,7 @@
121#define PR_SET_PTRACER 0x59616d61 121#define PR_SET_PTRACER 0x59616d61
122# define PR_SET_PTRACER_ANY ((unsigned long)-1) 122# define PR_SET_PTRACER_ANY ((unsigned long)-1)
123 123
124#define PR_SET_CHILD_SUBREAPER 36
125#define PR_GET_CHILD_SUBREAPER 37
126
124#endif /* _LINUX_PRCTL_H */ 127#endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0c147a4260a5..0c3854b0d4b1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -553,6 +553,18 @@ struct signal_struct {
553 int group_stop_count; 553 int group_stop_count;
554 unsigned int flags; /* see SIGNAL_* flags below */ 554 unsigned int flags; /* see SIGNAL_* flags below */
555 555
556 /*
557 * PR_SET_CHILD_SUBREAPER marks a process, like a service
558 * manager, to re-parent orphan (double-forking) child processes
559 * to this process instead of 'init'. The service manager is
560 * able to receive SIGCHLD signals and is able to investigate
561 * the process until it calls wait(). All children of this
562 * process will inherit a flag if they should look for a
563 * child_subreaper process at exit.
564 */
565 unsigned int is_child_subreaper:1;
566 unsigned int has_child_subreaper:1;
567
556 /* POSIX.1b Interval Timers */ 568 /* POSIX.1b Interval Timers */
557 struct list_head posix_timers; 569 struct list_head posix_timers;
558 570
diff --git a/kernel/exit.c b/kernel/exit.c
index 16b07bfac224..456329fd4ea3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -687,11 +687,11 @@ static void exit_mm(struct task_struct * tsk)
687} 687}
688 688
689/* 689/*
690 * When we die, we re-parent all our children. 690 * When we die, we re-parent all our children, and try to:
691 * Try to give them to another thread in our thread 691 * 1. give them to another thread in our thread group, if such a member exists
692 * group, and if no such member exists, give it to 692 * 2. give it to the first ancestor process which prctl'd itself as a
693 * the child reaper process (ie "init") in our pid 693 * child_subreaper for its children (like a service manager)
694 * space. 694 * 3. give it to the init process (PID 1) in our pid namespace
695 */ 695 */
696static struct task_struct *find_new_reaper(struct task_struct *father) 696static struct task_struct *find_new_reaper(struct task_struct *father)
697 __releases(&tasklist_lock) 697 __releases(&tasklist_lock)
@@ -722,6 +722,29 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
722 * forget_original_parent() must move them somewhere. 722 * forget_original_parent() must move them somewhere.
723 */ 723 */
724 pid_ns->child_reaper = init_pid_ns.child_reaper; 724 pid_ns->child_reaper = init_pid_ns.child_reaper;
725 } else if (father->signal->has_child_subreaper) {
726 struct task_struct *reaper;
727
728 /*
729 * Find the first ancestor marked as child_subreaper.
730 * Note that the code below checks same_thread_group(reaper,
731 * pid_ns->child_reaper). This is what we need to DTRT in a
732 * PID namespace. However we still need the check above, see
733 * http://marc.info/?l=linux-kernel&m=131385460420380
734 */
735 for (reaper = father->real_parent;
736 reaper != &init_task;
737 reaper = reaper->real_parent) {
738 if (same_thread_group(reaper, pid_ns->child_reaper))
739 break;
740 if (!reaper->signal->is_child_subreaper)
741 continue;
742 thread = reaper;
743 do {
744 if (!(thread->flags & PF_EXITING))
745 return reaper;
746 } while_each_thread(reaper, thread);
747 }
725 } 748 }
726 749
727 return pid_ns->child_reaper; 750 return pid_ns->child_reaper;
diff --git a/kernel/fork.c b/kernel/fork.c
index 37674ec55cde..b9372a0bff18 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1051,6 +1051,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1051 sig->oom_score_adj = current->signal->oom_score_adj; 1051 sig->oom_score_adj = current->signal->oom_score_adj;
1052 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 1052 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1053 1053
1054 sig->has_child_subreaper = current->signal->has_child_subreaper ||
1055 current->signal->is_child_subreaper;
1056
1054 mutex_init(&sig->cred_guard_mutex); 1057 mutex_init(&sig->cred_guard_mutex);
1055 1058
1056 return 0; 1059 return 0;
diff --git a/kernel/sys.c b/kernel/sys.c
index 888d227fd195..9eb7fcab8df6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1962,6 +1962,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1962 case PR_SET_MM: 1962 case PR_SET_MM:
1963 error = prctl_set_mm(arg2, arg3, arg4, arg5); 1963 error = prctl_set_mm(arg2, arg3, arg4, arg5);
1964 break; 1964 break;
1965 case PR_SET_CHILD_SUBREAPER:
1966 me->signal->is_child_subreaper = !!arg2;
1967 error = 0;
1968 break;
1969 case PR_GET_CHILD_SUBREAPER:
1970 error = put_user(me->signal->is_child_subreaper,
1971 (int __user *) arg2);
1972 break;
1965 default: 1973 default:
1966 error = -EINVAL; 1974 error = -EINVAL;
1967 break; 1975 break;