aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/exit.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/exit.c')
-rw-r--r--kernel/exit.c614
1 files changed, 358 insertions, 256 deletions
diff --git a/kernel/exit.c b/kernel/exit.c
index fb8de6cbf2c7..c8d0485578be 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -13,6 +13,7 @@
13#include <linux/personality.h> 13#include <linux/personality.h>
14#include <linux/tty.h> 14#include <linux/tty.h>
15#include <linux/mnt_namespace.h> 15#include <linux/mnt_namespace.h>
16#include <linux/iocontext.h>
16#include <linux/key.h> 17#include <linux/key.h>
17#include <linux/security.h> 18#include <linux/security.h>
18#include <linux/cpu.h> 19#include <linux/cpu.h>
@@ -45,6 +46,7 @@
45#include <linux/resource.h> 46#include <linux/resource.h>
46#include <linux/blkdev.h> 47#include <linux/blkdev.h>
47#include <linux/task_io_accounting_ops.h> 48#include <linux/task_io_accounting_ops.h>
49#include <linux/tracehook.h>
48 50
49#include <asm/uaccess.h> 51#include <asm/uaccess.h>
50#include <asm/unistd.h> 52#include <asm/unistd.h>
@@ -70,7 +72,7 @@ static void __unhash_process(struct task_struct *p)
70 __get_cpu_var(process_counts)--; 72 __get_cpu_var(process_counts)--;
71 } 73 }
72 list_del_rcu(&p->thread_group); 74 list_del_rcu(&p->thread_group);
73 remove_parent(p); 75 list_del_init(&p->sibling);
74} 76}
75 77
76/* 78/*
@@ -84,7 +86,6 @@ static void __exit_signal(struct task_struct *tsk)
84 BUG_ON(!sig); 86 BUG_ON(!sig);
85 BUG_ON(!atomic_read(&sig->count)); 87 BUG_ON(!atomic_read(&sig->count));
86 88
87 rcu_read_lock();
88 sighand = rcu_dereference(tsk->sighand); 89 sighand = rcu_dereference(tsk->sighand);
89 spin_lock(&sighand->siglock); 90 spin_lock(&sighand->siglock);
90 91
@@ -111,15 +112,16 @@ static void __exit_signal(struct task_struct *tsk)
111 * We won't ever get here for the group leader, since it 112 * We won't ever get here for the group leader, since it
112 * will have been the last reference on the signal_struct. 113 * will have been the last reference on the signal_struct.
113 */ 114 */
114 sig->utime = cputime_add(sig->utime, tsk->utime); 115 sig->utime = cputime_add(sig->utime, task_utime(tsk));
115 sig->stime = cputime_add(sig->stime, tsk->stime); 116 sig->stime = cputime_add(sig->stime, task_stime(tsk));
116 sig->gtime = cputime_add(sig->gtime, tsk->gtime); 117 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
117 sig->min_flt += tsk->min_flt; 118 sig->min_flt += tsk->min_flt;
118 sig->maj_flt += tsk->maj_flt; 119 sig->maj_flt += tsk->maj_flt;
119 sig->nvcsw += tsk->nvcsw; 120 sig->nvcsw += tsk->nvcsw;
120 sig->nivcsw += tsk->nivcsw; 121 sig->nivcsw += tsk->nivcsw;
121 sig->inblock += task_io_get_inblock(tsk); 122 sig->inblock += task_io_get_inblock(tsk);
122 sig->oublock += task_io_get_oublock(tsk); 123 sig->oublock += task_io_get_oublock(tsk);
124 task_io_accounting_add(&sig->ioac, &tsk->ioac);
123 sig->sum_sched_runtime += tsk->se.sum_exec_runtime; 125 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
124 sig = NULL; /* Marker for below. */ 126 sig = NULL; /* Marker for below. */
125 } 127 }
@@ -135,7 +137,6 @@ static void __exit_signal(struct task_struct *tsk)
135 tsk->signal = NULL; 137 tsk->signal = NULL;
136 tsk->sighand = NULL; 138 tsk->sighand = NULL;
137 spin_unlock(&sighand->siglock); 139 spin_unlock(&sighand->siglock);
138 rcu_read_unlock();
139 140
140 __cleanup_sighand(sighand); 141 __cleanup_sighand(sighand);
141 clear_tsk_thread_flag(tsk,TIF_SIGPENDING); 142 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
@@ -151,16 +152,17 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
151 put_task_struct(container_of(rhp, struct task_struct, rcu)); 152 put_task_struct(container_of(rhp, struct task_struct, rcu));
152} 153}
153 154
155
154void release_task(struct task_struct * p) 156void release_task(struct task_struct * p)
155{ 157{
156 struct task_struct *leader; 158 struct task_struct *leader;
157 int zap_leader; 159 int zap_leader;
158repeat: 160repeat:
161 tracehook_prepare_release_task(p);
159 atomic_dec(&p->user->processes); 162 atomic_dec(&p->user->processes);
160 proc_flush_task(p); 163 proc_flush_task(p);
161 write_lock_irq(&tasklist_lock); 164 write_lock_irq(&tasklist_lock);
162 ptrace_unlink(p); 165 tracehook_finish_release_task(p);
163 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
164 __exit_signal(p); 166 __exit_signal(p);
165 167
166 /* 168 /*
@@ -182,6 +184,13 @@ repeat:
182 * that case. 184 * that case.
183 */ 185 */
184 zap_leader = task_detached(leader); 186 zap_leader = task_detached(leader);
187
188 /*
189 * This maintains the invariant that release_task()
190 * only runs on a task in EXIT_DEAD, just for sanity.
191 */
192 if (zap_leader)
193 leader->exit_state = EXIT_DEAD;
185 } 194 }
186 195
187 write_unlock_irq(&tasklist_lock); 196 write_unlock_irq(&tasklist_lock);
@@ -314,9 +323,8 @@ static void reparent_to_kthreadd(void)
314 323
315 ptrace_unlink(current); 324 ptrace_unlink(current);
316 /* Reparent to init */ 325 /* Reparent to init */
317 remove_parent(current);
318 current->real_parent = current->parent = kthreadd_task; 326 current->real_parent = current->parent = kthreadd_task;
319 add_parent(current); 327 list_move_tail(&current->sibling, &current->real_parent->children);
320 328
321 /* Set the exit signal to SIGCHLD so we signal init on exit */ 329 /* Set the exit signal to SIGCHLD so we signal init on exit */
322 current->exit_signal = SIGCHLD; 330 current->exit_signal = SIGCHLD;
@@ -421,7 +429,7 @@ void daemonize(const char *name, ...)
421 * We don't want to have TIF_FREEZE set if the system-wide hibernation 429 * We don't want to have TIF_FREEZE set if the system-wide hibernation
422 * or suspend transition begins right now. 430 * or suspend transition begins right now.
423 */ 431 */
424 current->flags |= PF_NOFREEZE; 432 current->flags |= (PF_NOFREEZE | PF_KTHREAD);
425 433
426 if (current->nsproxy != &init_nsproxy) { 434 if (current->nsproxy != &init_nsproxy) {
427 get_nsproxy(&init_nsproxy); 435 get_nsproxy(&init_nsproxy);
@@ -546,8 +554,6 @@ void put_fs_struct(struct fs_struct *fs)
546 if (atomic_dec_and_test(&fs->count)) { 554 if (atomic_dec_and_test(&fs->count)) {
547 path_put(&fs->root); 555 path_put(&fs->root);
548 path_put(&fs->pwd); 556 path_put(&fs->pwd);
549 if (fs->altroot.dentry)
550 path_put(&fs->altroot);
551 kmem_cache_free(fs_cachep, fs); 557 kmem_cache_free(fs_cachep, fs);
552 } 558 }
553} 559}
@@ -577,8 +583,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
577 * If there are other users of the mm and the owner (us) is exiting 583 * If there are other users of the mm and the owner (us) is exiting
578 * we need to find a new owner to take on the responsibility. 584 * we need to find a new owner to take on the responsibility.
579 */ 585 */
580 if (!mm)
581 return 0;
582 if (atomic_read(&mm->mm_users) <= 1) 586 if (atomic_read(&mm->mm_users) <= 1)
583 return 0; 587 return 0;
584 if (mm->owner != p) 588 if (mm->owner != p)
@@ -621,6 +625,16 @@ retry:
621 } while_each_thread(g, c); 625 } while_each_thread(g, c);
622 626
623 read_unlock(&tasklist_lock); 627 read_unlock(&tasklist_lock);
628 /*
629 * We found no owner yet mm_users > 1: this implies that we are
630 * most likely racing with swapoff (try_to_unuse()) or /proc or
631 * ptrace or page migration (get_task_mm()). Mark owner as NULL,
632 * so that subsystems can understand the callback and take action.
633 */
634 down_write(&mm->mmap_sem);
635 cgroup_mm_owner_callbacks(mm->owner, NULL);
636 mm->owner = NULL;
637 up_write(&mm->mmap_sem);
624 return; 638 return;
625 639
626assign_new_owner: 640assign_new_owner:
@@ -655,26 +669,40 @@ assign_new_owner:
655static void exit_mm(struct task_struct * tsk) 669static void exit_mm(struct task_struct * tsk)
656{ 670{
657 struct mm_struct *mm = tsk->mm; 671 struct mm_struct *mm = tsk->mm;
672 struct core_state *core_state;
658 673
659 mm_release(tsk, mm); 674 mm_release(tsk, mm);
660 if (!mm) 675 if (!mm)
661 return; 676 return;
662 /* 677 /*
663 * Serialize with any possible pending coredump. 678 * Serialize with any possible pending coredump.
664 * We must hold mmap_sem around checking core_waiters 679 * We must hold mmap_sem around checking core_state
665 * and clearing tsk->mm. The core-inducing thread 680 * and clearing tsk->mm. The core-inducing thread
666 * will increment core_waiters for each thread in the 681 * will increment ->nr_threads for each thread in the
667 * group with ->mm != NULL. 682 * group with ->mm != NULL.
668 */ 683 */
669 down_read(&mm->mmap_sem); 684 down_read(&mm->mmap_sem);
670 if (mm->core_waiters) { 685 core_state = mm->core_state;
686 if (core_state) {
687 struct core_thread self;
671 up_read(&mm->mmap_sem); 688 up_read(&mm->mmap_sem);
672 down_write(&mm->mmap_sem);
673 if (!--mm->core_waiters)
674 complete(mm->core_startup_done);
675 up_write(&mm->mmap_sem);
676 689
677 wait_for_completion(&mm->core_done); 690 self.task = tsk;
691 self.next = xchg(&core_state->dumper.next, &self);
692 /*
693 * Implies mb(), the result of xchg() must be visible
694 * to core_state->dumper.
695 */
696 if (atomic_dec_and_test(&core_state->nr_threads))
697 complete(&core_state->startup);
698
699 for (;;) {
700 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
701 if (!self.task) /* see coredump_finish() */
702 break;
703 schedule();
704 }
705 __set_task_state(tsk, TASK_RUNNING);
678 down_read(&mm->mmap_sem); 706 down_read(&mm->mmap_sem);
679 } 707 }
680 atomic_inc(&mm->mm_count); 708 atomic_inc(&mm->mm_count);
@@ -691,37 +719,97 @@ static void exit_mm(struct task_struct * tsk)
691 mmput(mm); 719 mmput(mm);
692} 720}
693 721
694static void 722/*
695reparent_thread(struct task_struct *p, struct task_struct *father, int traced) 723 * Return nonzero if @parent's children should reap themselves.
724 *
725 * Called with write_lock_irq(&tasklist_lock) held.
726 */
727static int ignoring_children(struct task_struct *parent)
696{ 728{
697 if (p->pdeath_signal) 729 int ret;
698 /* We already hold the tasklist_lock here. */ 730 struct sighand_struct *psig = parent->sighand;
699 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); 731 unsigned long flags;
732 spin_lock_irqsave(&psig->siglock, flags);
733 ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
734 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT));
735 spin_unlock_irqrestore(&psig->siglock, flags);
736 return ret;
737}
700 738
701 /* Move the child from its dying parent to the new one. */ 739/*
702 if (unlikely(traced)) { 740 * Detach all tasks we were using ptrace on.
703 /* Preserve ptrace links if someone else is tracing this child. */ 741 * Any that need to be release_task'd are put on the @dead list.
704 list_del_init(&p->ptrace_list); 742 *
705 if (ptrace_reparented(p)) 743 * Called with write_lock(&tasklist_lock) held.
706 list_add(&p->ptrace_list, &p->real_parent->ptrace_children); 744 */
707 } else { 745static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
708 /* If this child is being traced, then we're the one tracing it 746{
709 * anyway, so let go of it. 747 struct task_struct *p, *n;
748 int ign = -1;
749
750 list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
751 __ptrace_unlink(p);
752
753 if (p->exit_state != EXIT_ZOMBIE)
754 continue;
755
756 /*
757 * If it's a zombie, our attachedness prevented normal
758 * parent notification or self-reaping. Do notification
759 * now if it would have happened earlier. If it should
760 * reap itself, add it to the @dead list. We can't call
761 * release_task() here because we already hold tasklist_lock.
762 *
763 * If it's our own child, there is no notification to do.
764 * But if our normal children self-reap, then this child
765 * was prevented by ptrace and we must reap it now.
710 */ 766 */
711 p->ptrace = 0; 767 if (!task_detached(p) && thread_group_empty(p)) {
712 remove_parent(p); 768 if (!same_thread_group(p->real_parent, parent))
713 p->parent = p->real_parent; 769 do_notify_parent(p, p->exit_signal);
714 add_parent(p); 770 else {
771 if (ign < 0)
772 ign = ignoring_children(parent);
773 if (ign)
774 p->exit_signal = -1;
775 }
776 }
715 777
716 if (task_is_traced(p)) { 778 if (task_detached(p)) {
717 /* 779 /*
718 * If it was at a trace stop, turn it into 780 * Mark it as in the process of being reaped.
719 * a normal stop since it's no longer being
720 * traced.
721 */ 781 */
722 ptrace_untrace(p); 782 p->exit_state = EXIT_DEAD;
783 list_add(&p->ptrace_entry, dead);
723 } 784 }
724 } 785 }
786}
787
788/*
789 * Finish up exit-time ptrace cleanup.
790 *
791 * Called without locks.
792 */
793static void ptrace_exit_finish(struct task_struct *parent,
794 struct list_head *dead)
795{
796 struct task_struct *p, *n;
797
798 BUG_ON(!list_empty(&parent->ptraced));
799
800 list_for_each_entry_safe(p, n, dead, ptrace_entry) {
801 list_del_init(&p->ptrace_entry);
802 release_task(p);
803 }
804}
805
806static void reparent_thread(struct task_struct *p, struct task_struct *father)
807{
808 if (p->pdeath_signal)
809 /* We already hold the tasklist_lock here. */
810 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
811
812 list_move_tail(&p->sibling, &p->real_parent->children);
725 813
726 /* If this is a threaded reparent there is no need to 814 /* If this is a threaded reparent there is no need to
727 * notify anyone anything has happened. 815 * notify anyone anything has happened.
@@ -736,7 +824,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
736 /* If we'd notified the old parent about this child's death, 824 /* If we'd notified the old parent about this child's death,
737 * also notify the new parent. 825 * also notify the new parent.
738 */ 826 */
739 if (!traced && p->exit_state == EXIT_ZOMBIE && 827 if (!ptrace_reparented(p) &&
828 p->exit_state == EXIT_ZOMBIE &&
740 !task_detached(p) && thread_group_empty(p)) 829 !task_detached(p) && thread_group_empty(p))
741 do_notify_parent(p, p->exit_signal); 830 do_notify_parent(p, p->exit_signal);
742 831
@@ -750,75 +839,63 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
750 * the child reaper process (ie "init") in our pid 839 * the child reaper process (ie "init") in our pid
751 * space. 840 * space.
752 */ 841 */
753static void forget_original_parent(struct task_struct *father) 842static struct task_struct *find_new_reaper(struct task_struct *father)
754{ 843{
755 struct task_struct *p, *n, *reaper = father; 844 struct pid_namespace *pid_ns = task_active_pid_ns(father);
756 struct list_head ptrace_dead; 845 struct task_struct *thread;
757
758 INIT_LIST_HEAD(&ptrace_dead);
759
760 write_lock_irq(&tasklist_lock);
761
762 do {
763 reaper = next_thread(reaper);
764 if (reaper == father) {
765 reaper = task_child_reaper(father);
766 break;
767 }
768 } while (reaper->flags & PF_EXITING);
769 846
770 /* 847 thread = father;
771 * There are only two places where our children can be: 848 while_each_thread(father, thread) {
772 * 849 if (thread->flags & PF_EXITING)
773 * - in our child list 850 continue;
774 * - in our ptraced child list 851 if (unlikely(pid_ns->child_reaper == father))
775 * 852 pid_ns->child_reaper = thread;
776 * Search them and reparent children. 853 return thread;
777 */ 854 }
778 list_for_each_entry_safe(p, n, &father->children, sibling) {
779 int ptrace;
780
781 ptrace = p->ptrace;
782
783 /* if father isn't the real parent, then ptrace must be enabled */
784 BUG_ON(father != p->real_parent && !ptrace);
785 855
786 if (father == p->real_parent) { 856 if (unlikely(pid_ns->child_reaper == father)) {
787 /* reparent with a reaper, real father it's us */ 857 write_unlock_irq(&tasklist_lock);
788 p->real_parent = reaper; 858 if (unlikely(pid_ns == &init_pid_ns))
789 reparent_thread(p, father, 0); 859 panic("Attempted to kill init!");
790 } else {
791 /* reparent ptraced task to its real parent */
792 __ptrace_unlink (p);
793 if (p->exit_state == EXIT_ZOMBIE && !task_detached(p) &&
794 thread_group_empty(p))
795 do_notify_parent(p, p->exit_signal);
796 }
797 860
861 zap_pid_ns_processes(pid_ns);
862 write_lock_irq(&tasklist_lock);
798 /* 863 /*
799 * if the ptraced child is a detached zombie we must collect 864 * We can not clear ->child_reaper or leave it alone.
800 * it before we exit, or it will remain zombie forever since 865 * There may by stealth EXIT_DEAD tasks on ->children,
801 * we prevented it from self-reap itself while it was being 866 * forget_original_parent() must move them somewhere.
802 * traced by us, to be able to see it in wait4.
803 */ 867 */
804 if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && task_detached(p))) 868 pid_ns->child_reaper = init_pid_ns.child_reaper;
805 list_add(&p->ptrace_list, &ptrace_dead);
806 } 869 }
807 870
808 list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) { 871 return pid_ns->child_reaper;
872}
873
874static void forget_original_parent(struct task_struct *father)
875{
876 struct task_struct *p, *n, *reaper;
877 LIST_HEAD(ptrace_dead);
878
879 write_lock_irq(&tasklist_lock);
880 reaper = find_new_reaper(father);
881 /*
882 * First clean up ptrace if we were using it.
883 */
884 ptrace_exit(father, &ptrace_dead);
885
886 list_for_each_entry_safe(p, n, &father->children, sibling) {
809 p->real_parent = reaper; 887 p->real_parent = reaper;
810 reparent_thread(p, father, 1); 888 if (p->parent == father) {
889 BUG_ON(p->ptrace);
890 p->parent = p->real_parent;
891 }
892 reparent_thread(p, father);
811 } 893 }
812 894
813 write_unlock_irq(&tasklist_lock); 895 write_unlock_irq(&tasklist_lock);
814 BUG_ON(!list_empty(&father->children)); 896 BUG_ON(!list_empty(&father->children));
815 BUG_ON(!list_empty(&father->ptrace_children));
816
817 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) {
818 list_del_init(&p->ptrace_list);
819 release_task(p);
820 }
821 897
898 ptrace_exit_finish(father, &ptrace_dead);
822} 899}
823 900
824/* 901/*
@@ -827,7 +904,8 @@ static void forget_original_parent(struct task_struct *father)
827 */ 904 */
828static void exit_notify(struct task_struct *tsk, int group_dead) 905static void exit_notify(struct task_struct *tsk, int group_dead)
829{ 906{
830 int state; 907 int signal;
908 void *cookie;
831 909
832 /* 910 /*
833 * This does two things: 911 * This does two things:
@@ -864,33 +942,24 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
864 !capable(CAP_KILL)) 942 !capable(CAP_KILL))
865 tsk->exit_signal = SIGCHLD; 943 tsk->exit_signal = SIGCHLD;
866 944
867 /* If something other than our normal parent is ptracing us, then 945 signal = tracehook_notify_death(tsk, &cookie, group_dead);
868 * send it a SIGCHLD instead of honoring exit_signal. exit_signal 946 if (signal >= 0)
869 * only has special meaning to our real parent. 947 signal = do_notify_parent(tsk, signal);
870 */
871 if (!task_detached(tsk) && thread_group_empty(tsk)) {
872 int signal = ptrace_reparented(tsk) ?
873 SIGCHLD : tsk->exit_signal;
874 do_notify_parent(tsk, signal);
875 } else if (tsk->ptrace) {
876 do_notify_parent(tsk, SIGCHLD);
877 }
878 948
879 state = EXIT_ZOMBIE; 949 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
880 if (task_detached(tsk) && likely(!tsk->ptrace))
881 state = EXIT_DEAD;
882 tsk->exit_state = state;
883 950
884 /* mt-exec, de_thread() is waiting for us */ 951 /* mt-exec, de_thread() is waiting for us */
885 if (thread_group_leader(tsk) && 952 if (thread_group_leader(tsk) &&
886 tsk->signal->notify_count < 0 && 953 tsk->signal->group_exit_task &&
887 tsk->signal->group_exit_task) 954 tsk->signal->notify_count < 0)
888 wake_up_process(tsk->signal->group_exit_task); 955 wake_up_process(tsk->signal->group_exit_task);
889 956
890 write_unlock_irq(&tasklist_lock); 957 write_unlock_irq(&tasklist_lock);
891 958
959 tracehook_report_death(tsk, signal, cookie, group_dead);
960
892 /* If the process is dead, release it - nobody will wait for it */ 961 /* If the process is dead, release it - nobody will wait for it */
893 if (state == EXIT_DEAD) 962 if (signal == DEATH_REAP)
894 release_task(tsk); 963 release_task(tsk);
895} 964}
896 965
@@ -919,39 +988,6 @@ static void check_stack_usage(void)
919static inline void check_stack_usage(void) {} 988static inline void check_stack_usage(void) {}
920#endif 989#endif
921 990
922static inline void exit_child_reaper(struct task_struct *tsk)
923{
924 if (likely(tsk->group_leader != task_child_reaper(tsk)))
925 return;
926
927 if (tsk->nsproxy->pid_ns == &init_pid_ns)
928 panic("Attempted to kill init!");
929
930 /*
931 * @tsk is the last thread in the 'cgroup-init' and is exiting.
932 * Terminate all remaining processes in the namespace and reap them
933 * before exiting @tsk.
934 *
935 * Note that @tsk (last thread of cgroup-init) may not necessarily
936 * be the child-reaper (i.e main thread of cgroup-init) of the
937 * namespace i.e the child_reaper may have already exited.
938 *
939 * Even after a child_reaper exits, we let it inherit orphaned children,
940 * because, pid_ns->child_reaper remains valid as long as there is
941 * at least one living sub-thread in the cgroup init.
942
943 * This living sub-thread of the cgroup-init will be notified when
944 * a child inherited by the 'child-reaper' exits (do_notify_parent()
945 * uses __group_send_sig_info()). Further, when reaping child processes,
946 * do_wait() iterates over children of all living sub threads.
947
948 * i.e even though 'child_reaper' thread is listed as the parent of the
949 * orphaned children, any living sub-thread in the cgroup-init can
950 * perform the role of the child_reaper.
951 */
952 zap_pid_ns_processes(tsk->nsproxy->pid_ns);
953}
954
955NORET_TYPE void do_exit(long code) 991NORET_TYPE void do_exit(long code)
956{ 992{
957 struct task_struct *tsk = current; 993 struct task_struct *tsk = current;
@@ -966,10 +1002,7 @@ NORET_TYPE void do_exit(long code)
966 if (unlikely(!tsk->pid)) 1002 if (unlikely(!tsk->pid))
967 panic("Attempted to kill the idle task!"); 1003 panic("Attempted to kill the idle task!");
968 1004
969 if (unlikely(current->ptrace & PT_TRACE_EXIT)) { 1005 tracehook_report_exit(&code);
970 current->ptrace_message = code;
971 ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
972 }
973 1006
974 /* 1007 /*
975 * We're taking recursive faults here in do_exit. Safest is to just 1008 * We're taking recursive faults here in do_exit. Safest is to just
@@ -1014,7 +1047,6 @@ NORET_TYPE void do_exit(long code)
1014 } 1047 }
1015 group_dead = atomic_dec_and_test(&tsk->signal->live); 1048 group_dead = atomic_dec_and_test(&tsk->signal->live);
1016 if (group_dead) { 1049 if (group_dead) {
1017 exit_child_reaper(tsk);
1018 hrtimer_cancel(&tsk->signal->real_timer); 1050 hrtimer_cancel(&tsk->signal->real_timer);
1019 exit_itimers(tsk->signal); 1051 exit_itimers(tsk->signal);
1020 } 1052 }
@@ -1176,13 +1208,6 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
1176 return 0; 1208 return 0;
1177 } 1209 }
1178 1210
1179 /*
1180 * Do not consider detached threads that are
1181 * not ptraced:
1182 */
1183 if (task_detached(p) && !p->ptrace)
1184 return 0;
1185
1186 /* Wait for all children (clone and not) if __WALL is set; 1211 /* Wait for all children (clone and not) if __WALL is set;
1187 * otherwise, wait for clone children *only* if __WCLONE is 1212 * otherwise, wait for clone children *only* if __WCLONE is
1188 * set; otherwise, wait for non-clone children *only*. (Note: 1213 * set; otherwise, wait for non-clone children *only*. (Note:
@@ -1193,14 +1218,10 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
1193 return 0; 1218 return 0;
1194 1219
1195 err = security_task_wait(p); 1220 err = security_task_wait(p);
1196 if (likely(!err)) 1221 if (err)
1197 return 1; 1222 return err;
1198 1223
1199 if (type != PIDTYPE_PID) 1224 return 1;
1200 return 0;
1201 /* This child was explicitly requested, abort */
1202 read_unlock(&tasklist_lock);
1203 return err;
1204} 1225}
1205 1226
1206static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, 1227static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
@@ -1234,7 +1255,7 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
1234 * the lock and this task is uninteresting. If we return nonzero, we have 1255 * the lock and this task is uninteresting. If we return nonzero, we have
1235 * released the lock and the system call should return. 1256 * released the lock and the system call should return.
1236 */ 1257 */
1237static int wait_task_zombie(struct task_struct *p, int noreap, 1258static int wait_task_zombie(struct task_struct *p, int options,
1238 struct siginfo __user *infop, 1259 struct siginfo __user *infop,
1239 int __user *stat_addr, struct rusage __user *ru) 1260 int __user *stat_addr, struct rusage __user *ru)
1240{ 1261{
@@ -1242,7 +1263,10 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1242 int retval, status, traced; 1263 int retval, status, traced;
1243 pid_t pid = task_pid_vnr(p); 1264 pid_t pid = task_pid_vnr(p);
1244 1265
1245 if (unlikely(noreap)) { 1266 if (!likely(options & WEXITED))
1267 return 0;
1268
1269 if (unlikely(options & WNOWAIT)) {
1246 uid_t uid = p->uid; 1270 uid_t uid = p->uid;
1247 int exit_code = p->exit_code; 1271 int exit_code = p->exit_code;
1248 int why, status; 1272 int why, status;
@@ -1323,6 +1347,8 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1323 psig->coublock += 1347 psig->coublock +=
1324 task_io_get_oublock(p) + 1348 task_io_get_oublock(p) +
1325 sig->oublock + sig->coublock; 1349 sig->oublock + sig->coublock;
1350 task_io_accounting_add(&psig->ioac, &p->ioac);
1351 task_io_accounting_add(&psig->ioac, &sig->ioac);
1326 spin_unlock_irq(&p->parent->sighand->siglock); 1352 spin_unlock_irq(&p->parent->sighand->siglock);
1327 } 1353 }
1328 1354
@@ -1392,21 +1418,24 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1392 * the lock and this task is uninteresting. If we return nonzero, we have 1418 * the lock and this task is uninteresting. If we return nonzero, we have
1393 * released the lock and the system call should return. 1419 * released the lock and the system call should return.
1394 */ 1420 */
1395static int wait_task_stopped(struct task_struct *p, 1421static int wait_task_stopped(int ptrace, struct task_struct *p,
1396 int noreap, struct siginfo __user *infop, 1422 int options, struct siginfo __user *infop,
1397 int __user *stat_addr, struct rusage __user *ru) 1423 int __user *stat_addr, struct rusage __user *ru)
1398{ 1424{
1399 int retval, exit_code, why; 1425 int retval, exit_code, why;
1400 uid_t uid = 0; /* unneeded, required by compiler */ 1426 uid_t uid = 0; /* unneeded, required by compiler */
1401 pid_t pid; 1427 pid_t pid;
1402 1428
1429 if (!(options & WUNTRACED))
1430 return 0;
1431
1403 exit_code = 0; 1432 exit_code = 0;
1404 spin_lock_irq(&p->sighand->siglock); 1433 spin_lock_irq(&p->sighand->siglock);
1405 1434
1406 if (unlikely(!task_is_stopped_or_traced(p))) 1435 if (unlikely(!task_is_stopped_or_traced(p)))
1407 goto unlock_sig; 1436 goto unlock_sig;
1408 1437
1409 if (!(p->ptrace & PT_PTRACED) && p->signal->group_stop_count > 0) 1438 if (!ptrace && p->signal->group_stop_count > 0)
1410 /* 1439 /*
1411 * A group stop is in progress and this is the group leader. 1440 * A group stop is in progress and this is the group leader.
1412 * We won't report until all threads have stopped. 1441 * We won't report until all threads have stopped.
@@ -1417,7 +1446,7 @@ static int wait_task_stopped(struct task_struct *p,
1417 if (!exit_code) 1446 if (!exit_code)
1418 goto unlock_sig; 1447 goto unlock_sig;
1419 1448
1420 if (!noreap) 1449 if (!unlikely(options & WNOWAIT))
1421 p->exit_code = 0; 1450 p->exit_code = 0;
1422 1451
1423 uid = p->uid; 1452 uid = p->uid;
@@ -1435,10 +1464,10 @@ unlock_sig:
1435 */ 1464 */
1436 get_task_struct(p); 1465 get_task_struct(p);
1437 pid = task_pid_vnr(p); 1466 pid = task_pid_vnr(p);
1438 why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; 1467 why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1439 read_unlock(&tasklist_lock); 1468 read_unlock(&tasklist_lock);
1440 1469
1441 if (unlikely(noreap)) 1470 if (unlikely(options & WNOWAIT))
1442 return wait_noreap_copyout(p, pid, uid, 1471 return wait_noreap_copyout(p, pid, uid,
1443 why, exit_code, 1472 why, exit_code,
1444 infop, ru); 1473 infop, ru);
@@ -1472,7 +1501,7 @@ unlock_sig:
1472 * the lock and this task is uninteresting. If we return nonzero, we have 1501 * the lock and this task is uninteresting. If we return nonzero, we have
1473 * released the lock and the system call should return. 1502 * released the lock and the system call should return.
1474 */ 1503 */
1475static int wait_task_continued(struct task_struct *p, int noreap, 1504static int wait_task_continued(struct task_struct *p, int options,
1476 struct siginfo __user *infop, 1505 struct siginfo __user *infop,
1477 int __user *stat_addr, struct rusage __user *ru) 1506 int __user *stat_addr, struct rusage __user *ru)
1478{ 1507{
@@ -1480,6 +1509,9 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1480 pid_t pid; 1509 pid_t pid;
1481 uid_t uid; 1510 uid_t uid;
1482 1511
1512 if (!unlikely(options & WCONTINUED))
1513 return 0;
1514
1483 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) 1515 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1484 return 0; 1516 return 0;
1485 1517
@@ -1489,7 +1521,7 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1489 spin_unlock_irq(&p->sighand->siglock); 1521 spin_unlock_irq(&p->sighand->siglock);
1490 return 0; 1522 return 0;
1491 } 1523 }
1492 if (!noreap) 1524 if (!unlikely(options & WNOWAIT))
1493 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1525 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1494 spin_unlock_irq(&p->sighand->siglock); 1526 spin_unlock_irq(&p->sighand->siglock);
1495 1527
@@ -1515,89 +1547,161 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1515 return retval; 1547 return retval;
1516} 1548}
1517 1549
1550/*
1551 * Consider @p for a wait by @parent.
1552 *
1553 * -ECHILD should be in *@notask_error before the first call.
1554 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1555 * Returns zero if the search for a child should continue;
1556 * then *@notask_error is 0 if @p is an eligible child,
1557 * or another error from security_task_wait(), or still -ECHILD.
1558 */
1559static int wait_consider_task(struct task_struct *parent, int ptrace,
1560 struct task_struct *p, int *notask_error,
1561 enum pid_type type, struct pid *pid, int options,
1562 struct siginfo __user *infop,
1563 int __user *stat_addr, struct rusage __user *ru)
1564{
1565 int ret = eligible_child(type, pid, options, p);
1566 if (!ret)
1567 return ret;
1568
1569 if (unlikely(ret < 0)) {
1570 /*
1571 * If we have not yet seen any eligible child,
1572 * then let this error code replace -ECHILD.
1573 * A permission error will give the user a clue
1574 * to look for security policy problems, rather
1575 * than for mysterious wait bugs.
1576 */
1577 if (*notask_error)
1578 *notask_error = ret;
1579 }
1580
1581 if (likely(!ptrace) && unlikely(p->ptrace)) {
1582 /*
1583 * This child is hidden by ptrace.
1584 * We aren't allowed to see it now, but eventually we will.
1585 */
1586 *notask_error = 0;
1587 return 0;
1588 }
1589
1590 if (p->exit_state == EXIT_DEAD)
1591 return 0;
1592
1593 /*
1594 * We don't reap group leaders with subthreads.
1595 */
1596 if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
1597 return wait_task_zombie(p, options, infop, stat_addr, ru);
1598
1599 /*
1600 * It's stopped or running now, so it might
1601 * later continue, exit, or stop again.
1602 */
1603 *notask_error = 0;
1604
1605 if (task_is_stopped_or_traced(p))
1606 return wait_task_stopped(ptrace, p, options,
1607 infop, stat_addr, ru);
1608
1609 return wait_task_continued(p, options, infop, stat_addr, ru);
1610}
1611
1612/*
1613 * Do the work of do_wait() for one thread in the group, @tsk.
1614 *
1615 * -ECHILD should be in *@notask_error before the first call.
1616 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1617 * Returns zero if the search for a child should continue; then
1618 * *@notask_error is 0 if there were any eligible children,
1619 * or another error from security_task_wait(), or still -ECHILD.
1620 */
1621static int do_wait_thread(struct task_struct *tsk, int *notask_error,
1622 enum pid_type type, struct pid *pid, int options,
1623 struct siginfo __user *infop, int __user *stat_addr,
1624 struct rusage __user *ru)
1625{
1626 struct task_struct *p;
1627
1628 list_for_each_entry(p, &tsk->children, sibling) {
1629 /*
1630 * Do not consider detached threads.
1631 */
1632 if (!task_detached(p)) {
1633 int ret = wait_consider_task(tsk, 0, p, notask_error,
1634 type, pid, options,
1635 infop, stat_addr, ru);
1636 if (ret)
1637 return ret;
1638 }
1639 }
1640
1641 return 0;
1642}
1643
1644static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
1645 enum pid_type type, struct pid *pid, int options,
1646 struct siginfo __user *infop, int __user *stat_addr,
1647 struct rusage __user *ru)
1648{
1649 struct task_struct *p;
1650
1651 /*
1652 * Traditionally we see ptrace'd stopped tasks regardless of options.
1653 */
1654 options |= WUNTRACED;
1655
1656 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1657 int ret = wait_consider_task(tsk, 1, p, notask_error,
1658 type, pid, options,
1659 infop, stat_addr, ru);
1660 if (ret)
1661 return ret;
1662 }
1663
1664 return 0;
1665}
1666
1518static long do_wait(enum pid_type type, struct pid *pid, int options, 1667static long do_wait(enum pid_type type, struct pid *pid, int options,
1519 struct siginfo __user *infop, int __user *stat_addr, 1668 struct siginfo __user *infop, int __user *stat_addr,
1520 struct rusage __user *ru) 1669 struct rusage __user *ru)
1521{ 1670{
1522 DECLARE_WAITQUEUE(wait, current); 1671 DECLARE_WAITQUEUE(wait, current);
1523 struct task_struct *tsk; 1672 struct task_struct *tsk;
1524 int flag, retval; 1673 int retval;
1525 1674
1526 add_wait_queue(&current->signal->wait_chldexit,&wait); 1675 add_wait_queue(&current->signal->wait_chldexit,&wait);
1527repeat: 1676repeat:
1528 /* If there is nothing that can match our critier just get out */ 1677 /*
1678 * If there is nothing that can match our criteria just get out.
1679 * We will clear @retval to zero if we see any child that might later
1680 * match our criteria, even if we are not able to reap it yet.
1681 */
1529 retval = -ECHILD; 1682 retval = -ECHILD;
1530 if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type]))) 1683 if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type])))
1531 goto end; 1684 goto end;
1532 1685
1533 /*
1534 * We will set this flag if we see any child that might later
1535 * match our criteria, even if we are not able to reap it yet.
1536 */
1537 flag = retval = 0;
1538 current->state = TASK_INTERRUPTIBLE; 1686 current->state = TASK_INTERRUPTIBLE;
1539 read_lock(&tasklist_lock); 1687 read_lock(&tasklist_lock);
1540 tsk = current; 1688 tsk = current;
1541 do { 1689 do {
1542 struct task_struct *p; 1690 int tsk_result = do_wait_thread(tsk, &retval,
1543 1691 type, pid, options,
1544 list_for_each_entry(p, &tsk->children, sibling) { 1692 infop, stat_addr, ru);
1545 int ret = eligible_child(type, pid, options, p); 1693 if (!tsk_result)
1546 if (!ret) 1694 tsk_result = ptrace_do_wait(tsk, &retval,
1547 continue; 1695 type, pid, options,
1548 1696 infop, stat_addr, ru);
1549 if (unlikely(ret < 0)) { 1697 if (tsk_result) {
1550 retval = ret; 1698 /*
1551 } else if (task_is_stopped_or_traced(p)) { 1699 * tasklist_lock is unlocked and we have a final result.
1552 /* 1700 */
1553 * It's stopped now, so it might later 1701 retval = tsk_result;
1554 * continue, exit, or stop again. 1702 goto end;
1555 */
1556 flag = 1;
1557 if (!(p->ptrace & PT_PTRACED) &&
1558 !(options & WUNTRACED))
1559 continue;
1560
1561 retval = wait_task_stopped(p,
1562 (options & WNOWAIT), infop,
1563 stat_addr, ru);
1564 } else if (p->exit_state == EXIT_ZOMBIE &&
1565 !delay_group_leader(p)) {
1566 /*
1567 * We don't reap group leaders with subthreads.
1568 */
1569 if (!likely(options & WEXITED))
1570 continue;
1571 retval = wait_task_zombie(p,
1572 (options & WNOWAIT), infop,
1573 stat_addr, ru);
1574 } else if (p->exit_state != EXIT_DEAD) {
1575 /*
1576 * It's running now, so it might later
1577 * exit, stop, or stop and then continue.
1578 */
1579 flag = 1;
1580 if (!unlikely(options & WCONTINUED))
1581 continue;
1582 retval = wait_task_continued(p,
1583 (options & WNOWAIT), infop,
1584 stat_addr, ru);
1585 }
1586 if (retval != 0) /* tasklist_lock released */
1587 goto end;
1588 }
1589 if (!flag) {
1590 list_for_each_entry(p, &tsk->ptrace_children,
1591 ptrace_list) {
1592 flag = eligible_child(type, pid, options, p);
1593 if (!flag)
1594 continue;
1595 if (likely(flag > 0))
1596 break;
1597 retval = flag;
1598 goto end;
1599 }
1600 } 1703 }
1704
1601 if (options & __WNOTHREAD) 1705 if (options & __WNOTHREAD)
1602 break; 1706 break;
1603 tsk = next_thread(tsk); 1707 tsk = next_thread(tsk);
@@ -1605,16 +1709,14 @@ repeat:
1605 } while (tsk != current); 1709 } while (tsk != current);
1606 read_unlock(&tasklist_lock); 1710 read_unlock(&tasklist_lock);
1607 1711
1608 if (flag) { 1712 if (!retval && !(options & WNOHANG)) {
1609 if (options & WNOHANG)
1610 goto end;
1611 retval = -ERESTARTSYS; 1713 retval = -ERESTARTSYS;
1612 if (signal_pending(current)) 1714 if (!signal_pending(current)) {
1613 goto end; 1715 schedule();
1614 schedule(); 1716 goto repeat;
1615 goto repeat; 1717 }
1616 } 1718 }
1617 retval = -ECHILD; 1719
1618end: 1720end:
1619 current->state = TASK_RUNNING; 1721 current->state = TASK_RUNNING;
1620 remove_wait_queue(&current->signal->wait_chldexit,&wait); 1722 remove_wait_queue(&current->signal->wait_chldexit,&wait);