Diffstat (limited to 'kernel/exit.c')
-rw-r--r--   kernel/exit.c   188
1 file changed, 139 insertions, 49 deletions
diff --git a/kernel/exit.c b/kernel/exit.c
index b9d3bc6c21ec..64879bdff921 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -50,6 +50,7 @@
 #include <linux/perf_event.h>
 #include <trace/events/sched.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/oom.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -70,7 +71,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
-		__get_cpu_var(process_counts)--;
+		__this_cpu_dec(process_counts);
 	}
 	list_del_rcu(&p->thread_group);
 }
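
For readers unfamiliar with the per-cpu accessors in the hunk above: __this_cpu_dec() is the "__" (caller-guarantees-no-migration) form of the this_cpu operations, which on most architectures compile to a single instruction on the local CPU's copy of the variable, whereas the older __get_cpu_var(x)-- expands to an address calculation plus an open-coded read-modify-write. A minimal sketch of the new style, using a hypothetical counter rather than the real process_counts:

#include <linux/percpu.h>

/* hypothetical per-cpu counter, analogous to process_counts in kernel/fork.c */
static DEFINE_PER_CPU(unsigned long, demo_counts);

static void demo_fork_hook(void)
{
	/* __ variant: caller must prevent migration, e.g. preemption disabled */
	__this_cpu_inc(demo_counts);
}

static void demo_exit_hook(void)
{
	__this_cpu_dec(demo_counts);	/* replaces __get_cpu_var(demo_counts)-- */
}
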
@@ -97,6 +98,14 @@ static void __exit_signal(struct task_struct *tsk)
 		sig->tty = NULL;
 	} else {
 		/*
+		 * This can only happen if the caller is de_thread().
+		 * FIXME: this is the temporary hack, we should teach
+		 *        posix-cpu-timers to handle this case correctly.
+		 */
+		if (unlikely(has_group_leader_pid(tsk)))
+			posix_cpu_timers_exit_group(tsk);
+
+		/*
 		 * If there is any task waiting for the group exit
 		 * then notify it:
 		 */
@@ -151,9 +160,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 {
 	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 
-#ifdef CONFIG_PERF_EVENTS
-	WARN_ON_ONCE(tsk->perf_event_ctxp);
-#endif
+	perf_event_delayed_put(tsk);
 	trace_sched_process_free(tsk);
 	put_task_struct(tsk);
 }
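
The hunk above replaces an open-coded #ifdef CONFIG_PERF_EVENTS block with a call to perf_event_delayed_put(). The usual way such a call stays ifdef-free at the call site is a pair of declarations in the subsystem header, roughly like the sketch below (illustrative of the pattern, not a quote of include/linux/perf_event.h):

/* sketch of the usual CONFIG-stub pattern; the real declarations live in the perf header */
#ifdef CONFIG_PERF_EVENTS
extern void perf_event_delayed_put(struct task_struct *task);
#else
static inline void perf_event_delayed_put(struct task_struct *task)
{
	/* perf disabled: nothing to check or release */
}
#endif
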
@@ -556,29 +563,28 @@ void exit_files(struct task_struct *tsk)
 
 #ifdef CONFIG_MM_OWNER
 /*
- * Task p is exiting and it owned mm, lets find a new owner for it
+ * A task is exiting.  If it owned this mm, find a new owner for the mm.
  */
-static inline int
-mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
-{
-	/*
-	 * If there are other users of the mm and the owner (us) is exiting
-	 * we need to find a new owner to take on the responsibility.
-	 */
-	if (atomic_read(&mm->mm_users) <= 1)
-		return 0;
-	if (mm->owner != p)
-		return 0;
-	return 1;
-}
-
 void mm_update_next_owner(struct mm_struct *mm)
 {
 	struct task_struct *c, *g, *p = current;
 
 retry:
-	if (!mm_need_new_owner(mm, p))
+	/*
+	 * If the exiting or execing task is not the owner, it's
+	 * someone else's problem.
+	 */
+	if (mm->owner != p)
+		return;
+	/*
+	 * The current owner is exiting/execing and there are no other
+	 * candidates.  Do not leave the mm pointing to a possibly
+	 * freed task structure.
+	 */
+	if (atomic_read(&mm->mm_users) <= 1) {
+		mm->owner = NULL;
 		return;
+	}
 
 	read_lock(&tasklist_lock);
 	/*
@@ -691,6 +697,8 @@ static void exit_mm(struct task_struct * tsk)
 	enter_lazy_tlb(mm, current);
 	/* We don't want this task to be frozen prematurely */
 	clear_freeze_flag(tsk);
+	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+		atomic_dec(&mm->oom_disable_count);
 	task_unlock(tsk);
 	mm_update_next_owner(mm);
 	mmput(mm);
@@ -704,6 +712,8 @@ static void exit_mm(struct task_struct * tsk)
  * space.
  */
 static struct task_struct *find_new_reaper(struct task_struct *father)
+	__releases(&tasklist_lock)
+	__acquires(&tasklist_lock)
 {
 	struct pid_namespace *pid_ns = task_active_pid_ns(father);
 	struct task_struct *thread;
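
The __releases()/__acquires() markers added above are sparse lock-context annotations: they compile to nothing in a normal build and only tell sparse (make C=1) that the function temporarily drops and re-takes tasklist_lock, which find_new_reaper() can do while tearing down a pid namespace. A sketch of the annotation pattern, with a made-up function name:

/* illustrative only: shows the annotation pattern, not the real find_new_reaper() */
static struct task_struct *demo_pick_reaper(struct task_struct *father)
	__releases(&tasklist_lock)
	__acquires(&tasklist_lock)
{
	/* entered with tasklist_lock held for writing */
	write_unlock_irq(&tasklist_lock);

	/* ... work that must run without the lock held ... */

	write_lock_irq(&tasklist_lock);
	return father;		/* returns with the lock held again */
}
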
@@ -832,7 +842,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	/* Let father know we died
 	 *
 	 * Thread signals are configurable, but you aren't going to use
-	 * that to send signals to arbitary processes.
+	 * that to send signals to arbitrary processes.
 	 * That stops right now.
 	 *
 	 * If the parent exec id doesn't match the exec id we saved
@@ -899,12 +909,22 @@ NORET_TYPE void do_exit(long code)
 	profile_task_exit(tsk);
 
 	WARN_ON(atomic_read(&tsk->fs_excl));
+	WARN_ON(blk_needs_flush_plug(tsk));
 
 	if (unlikely(in_interrupt()))
 		panic("Aiee, killing interrupt handler!");
 	if (unlikely(!tsk->pid))
 		panic("Attempted to kill the idle task!");
 
+	/*
+	 * If do_exit is called because this processes oopsed, it's possible
+	 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
+	 * continuing. Amongst other possible reasons, this is to prevent
+	 * mm_release()->clear_child_tid() from writing to a user-controlled
+	 * kernel address.
+	 */
+	set_fs(USER_DS);
+
 	tracehook_report_exit(&code);
 
 	validate_creds_for_do_exit(tsk);
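
The set_fs(USER_DS) added above resets the thread's uaccess address limit. The hazard it guards against is sketched below: kernel code on pre-set_fs-removal kernels sometimes widens the limit with set_fs(KERNEL_DS) and restores it afterwards, and if an oops fires inside that window, do_exit() would otherwise run with the user-pointer checks effectively disabled. A minimal sketch of that pattern, with a hypothetical helper name:

#include <asm/uaccess.h>

/* sketch only: the classic temporary-KERNEL_DS pattern this change defends against */
static void demo_widen_uaccess_window(void)
{
	mm_segment_t old_fs = get_fs();

	set_fs(KERNEL_DS);	/* put_user()/copy_to_user() now accept kernel pointers */

	/* ... call code that performs uaccess on a kernel buffer ... */

	set_fs(old_fs);		/* an oops before this line leaves KERNEL_DS in place */
}
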
@@ -978,6 +998,15 @@ NORET_TYPE void do_exit(long code)
 	exit_fs(tsk);
 	check_stack_usage();
 	exit_thread();
+
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 *
+	 * because of cgroup mode, must be called before cgroup_exit()
+	 */
+	perf_event_exit_task(tsk);
+
 	cgroup_exit(tsk, 1);
 
 	if (group_dead)
@@ -990,12 +1019,7 @@ NORET_TYPE void do_exit(long code)
 	/*
 	 * FIXME: do that only when needed, using sched_exit tracepoint
 	 */
-	flush_ptrace_hw_breakpoint(tsk);
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 */
-	perf_event_exit_task(tsk);
+	ptrace_put_breakpoints(tsk);
 
 	exit_notify(tsk, group_dead);
 #ifdef CONFIG_NUMA
@@ -1356,11 +1380,23 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
 	return NULL;
 }
 
-/*
- * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold
- * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
- * the lock and this task is uninteresting. If we return nonzero, we have
- * released the lock and the system call should return.
+/**
+ * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
+ * @wo: wait options
+ * @ptrace: is the wait for ptrace
+ * @p: task to wait for
+ *
+ * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
+ *
+ * CONTEXT:
+ * read_lock(&tasklist_lock), which is released if return value is
+ * non-zero.  Also, grabs and releases @p->sighand->siglock.
+ *
+ * RETURNS:
+ * 0 if wait condition didn't exist and search for other wait conditions
+ * should continue.  Non-zero return, -errno on failure and @p's pid on
+ * success, implies that tasklist_lock is released and wait condition
+ * search should terminate.
  */
 static int wait_task_stopped(struct wait_opts *wo,
 			     int ptrace, struct task_struct *p)
@@ -1376,6 +1412,9 @@ static int wait_task_stopped(struct wait_opts *wo,
 	if (!ptrace && !(wo->wo_flags & WUNTRACED))
 		return 0;
 
+	if (!task_stopped_code(p, ptrace))
+		return 0;
+
 	exit_code = 0;
 	spin_lock_irq(&p->sighand->siglock);
 
@@ -1517,33 +1556,84 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		return 0;
 	}
 
-	if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+	/* dead body doesn't have much to contribute */
+	if (p->exit_state == EXIT_DEAD)
+		return 0;
+
+	/* slay zombie? */
+	if (p->exit_state == EXIT_ZOMBIE) {
 		/*
-		 * This child is hidden by ptrace.
-		 * We aren't allowed to see it now, but eventually we will.
+		 * A zombie ptracee is only visible to its ptracer.
+		 * Notification and reaping will be cascaded to the real
+		 * parent when the ptracer detaches.
+		 */
+		if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+			/* it will become visible, clear notask_error */
+			wo->notask_error = 0;
+			return 0;
+		}
+
+		/* we don't reap group leaders with subthreads */
+		if (!delay_group_leader(p))
+			return wait_task_zombie(wo, p);
+
+		/*
+		 * Allow access to stopped/continued state via zombie by
+		 * falling through.  Clearing of notask_error is complex.
+		 *
+		 * When !@ptrace:
+		 *
+		 * If WEXITED is set, notask_error should naturally be
+		 * cleared.  If not, subset of WSTOPPED|WCONTINUED is set,
+		 * so, if there are live subthreads, there are events to
+		 * wait for.  If all subthreads are dead, it's still safe
+		 * to clear - this function will be called again in finite
+		 * amount time once all the subthreads are released and
+		 * will then return without clearing.
+		 *
+		 * When @ptrace:
+		 *
+		 * Stopped state is per-task and thus can't change once the
+		 * target task dies.  Only continued and exited can happen.
+		 * Clear notask_error if WCONTINUED | WEXITED.
+		 */
+		if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
+			wo->notask_error = 0;
+	} else {
+		/*
+		 * If @p is ptraced by a task in its real parent's group,
+		 * hide group stop/continued state when looking at @p as
+		 * the real parent; otherwise, a single stop can be
+		 * reported twice as group and ptrace stops.
+		 *
+		 * If a ptracer wants to distinguish the two events for its
+		 * own children, it should create a separate process which
+		 * takes the role of real parent.
+		 */
+		if (likely(!ptrace) && task_ptrace(p) &&
+		    same_thread_group(p->parent, p->real_parent))
+			return 0;
+
+		/*
+		 * @p is alive and it's gonna stop, continue or exit, so
+		 * there always is something to wait for.
 		 */
 		wo->notask_error = 0;
-		return 0;
 	}
 
-	if (p->exit_state == EXIT_DEAD)
-		return 0;
-
 	/*
-	 * We don't reap group leaders with subthreads.
+	 * Wait for stopped.  Depending on @ptrace, different stopped state
+	 * is used and the two don't interact with each other.
 	 */
-	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
-		return wait_task_zombie(wo, p);
+	ret = wait_task_stopped(wo, ptrace, p);
+	if (ret)
+		return ret;
 
 	/*
-	 * It's stopped or running now, so it might
-	 * later continue, exit, or stop again.
+	 * Wait for continued.  There's only one continued state and the
+	 * ptracer can consume it which can confuse the real parent.  Don't
+	 * use WCONTINUED from ptracer.  You don't need or want it.
 	 */
-	wo->notask_error = 0;
-
-	if (task_stopped_code(p, ptrace))
-		return wait_task_stopped(wo, ptrace, p);
-
 	return wait_task_continued(wo, p);
 }
 
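
The notask_error comments in the hunk above are about which wait conditions (exit, stop, continue) a waiter has asked for via WEXITED, WSTOPPED/WUNTRACED and WCONTINUED. For orientation, a small userspace sketch of those conditions as seen through waitpid(); it is ordinary example code, not part of the patch:

/* userspace example, not kernel code: observing stop, continue and exit */
#include <signal.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int status;
	pid_t child = fork();

	if (child == 0) {
		raise(SIGSTOP);			/* child stops itself */
		return 0;			/* then exits once resumed */
	}

	/* WUNTRACED reports the stop, WCONTINUED reports the SIGCONT resume */
	while (waitpid(child, &status, WUNTRACED | WCONTINUED) > 0) {
		if (WIFSTOPPED(status)) {
			printf("child stopped by signal %d\n", WSTOPSIG(status));
			kill(child, SIGCONT);	/* resume it */
		} else if (WIFCONTINUED(status)) {
			printf("child continued\n");
		} else if (WIFEXITED(status)) {
			printf("child exited with status %d\n", WEXITSTATUS(status));
			break;
		}
	}
	return 0;
}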