aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
author: Oleg Nesterov <oleg@redhat.com> 2016-05-23 19:23:50 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2016-05-23 20:04:14 -0400
commit: bf959931ddb88c4e4366e96dd22e68fa0db9527c (patch)
tree: 357173e0564d7b62bccdd50170ef703964a9d41f /kernel
parent: 076a378ba6e6b6ddd5f2336aa0876349b7d36409 (diff)
wait/ptrace: assume __WALL if the child is traced
The following program (a simplified version of one generated by syzkaller)

	#include <pthread.h>
	#include <unistd.h>
	#include <sys/ptrace.h>
	#include <stdio.h>
	#include <signal.h>

	void *thread_func(void *arg)
	{
		ptrace(PTRACE_TRACEME, 0,0,0);
		return 0;
	}

	int main(void)
	{
		pthread_t thread;

		if (fork())
			return 0;

		while (getppid() != 1)
			;

		pthread_create(&thread, NULL, thread_func, NULL);
		pthread_join(thread, NULL);
		return 0;
	}

creates an unreapable zombie if /sbin/init doesn't use __WALL.

This is not a kernel bug, at least in a sense that everything works as expected: debugger should reap a traced sub-thread before it can reap the leader, but without __WALL/__WCLONE do_wait() ignores sub-threads.

Unfortunately, it seems that /sbin/init in most (all?) distributions doesn't use it and we have to change the kernel to avoid the problem. Note also that most init's use sys_waitid() which doesn't allow __WALL, so the necessary user-space fix is not that trivial.

This patch just adds the "ptrace" check into eligible_child(). To some degree this matches the "tsk->ptrace" in exit_notify(), ->exit_signal is mostly ignored when the tracee reports to debugger. Or WSTOPPED, the tracer doesn't need to set this flag to wait for the stopped tracee.

This obviously means the user-visible change: __WCLONE and __WALL no longer have any meaning for debugger. And I can only hope that this won't break something, but at least strace/gdb won't suffer.

We could make a more conservative change. Say, we can take __WCLONE into account, or !thread_group_leader(). But it would be nice to not complicate these historical/confusing checks.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Cc: "Michael Kerrisk (man-pages)" <mtk.manpages@gmail.com>
Cc: Pedro Alves <palves@redhat.com>
Cc: Roland McGrath <roland@hack.frob.com>
Cc: <syzkaller@googlegroups.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/exit.c 29
1 files changed, 20 insertions, 9 deletions
diff --git a/kernel/exit.c b/kernel/exit.c
index 75b34fe835b2..44fbe6edd7fe 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -918,17 +918,28 @@ static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
918 task_pid_type(p, wo->wo_type) == wo->wo_pid; 918 task_pid_type(p, wo->wo_type) == wo->wo_pid;
919} 919}
920 920
921static int eligible_child(struct wait_opts *wo, struct task_struct *p) 921static int
922eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
922{ 923{
923 if (!eligible_pid(wo, p)) 924 if (!eligible_pid(wo, p))
924 return 0; 925 return 0;
925 /* Wait for all children (clone and not) if __WALL is set; 926
926 * otherwise, wait for clone children *only* if __WCLONE is 927 /*
927 * set; otherwise, wait for non-clone children *only*. (Note: 928 * Wait for all children (clone and not) if __WALL is set or
928 * A "clone" child here is one that reports to its parent 929 * if it is traced by us.
929 * using a signal other than SIGCHLD.) */ 930 */
930 if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) 931 if (ptrace || (wo->wo_flags & __WALL))
931 && !(wo->wo_flags & __WALL)) 932 return 1;
933
934 /*
935 * Otherwise, wait for clone children *only* if __WCLONE is set;
936 * otherwise, wait for non-clone children *only*.
937 *
938 * Note: a "clone" child here is one that reports to its parent
939 * using a signal other than SIGCHLD, or a non-leader thread which
940 * we can only see if it is traced by us.
941 */
942 if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
932 return 0; 943 return 0;
933 944
934 return 1; 945 return 1;
@@ -1300,7 +1311,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1300 if (unlikely(exit_state == EXIT_DEAD)) 1311 if (unlikely(exit_state == EXIT_DEAD))
1301 return 0; 1312 return 0;
1302 1313
1303 ret = eligible_child(wo, p); 1314 ret = eligible_child(wo, ptrace, p);
1304 if (!ret) 1315 if (!ret)
1305 return ret; 1316 return ret;
1306 1317