Diffstat (limited to 'kernel/exit.c')
-rw-r--r--   kernel/exit.c   188
1 file changed, 139 insertions, 49 deletions
diff --git a/kernel/exit.c b/kernel/exit.c
index b9d3bc6c21ec..64879bdff921 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -50,6 +50,7 @@
 #include <linux/perf_event.h>
 #include <trace/events/sched.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/oom.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -70,7 +71,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
-		__get_cpu_var(process_counts)--;
+		__this_cpu_dec(process_counts);
 	}
 	list_del_rcu(&p->thread_group);
 }
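
For readers unfamiliar with the per-cpu accessors in the hunk above: __this_cpu_dec() is the "__" (caller-guarantees-no-migration) form of the this_cpu operations, which on most architectures compile to a single instruction on the local CPU's copy of the variable, whereas the older __get_cpu_var(x)-- expands to an address calculation plus an open-coded read-modify-write. A minimal sketch of the new style, using a hypothetical counter rather than the real process_counts:

#include <linux/percpu.h>

/* hypothetical per-cpu counter, analogous to process_counts in kernel/fork.c */
static DEFINE_PER_CPU(unsigned long, demo_counts);

static void demo_fork_hook(void)
{
	/* __ variant: caller must prevent migration, e.g. preemption disabled */
	__this_cpu_inc(demo_counts);
}

static void demo_exit_hook(void)
{
	__this_cpu_dec(demo_counts);	/* replaces __get_cpu_var(demo_counts)-- */
}
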
@@ -97,6 +98,14 @@ static void __exit_signal(struct task_struct *tsk)
 		sig->tty = NULL;
 	} else {
 		/*
+		 * This can only happen if the caller is de_thread().
+		 * FIXME: this is the temporary hack, we should teach
+		 *        posix-cpu-timers to handle this case correctly.
+		 */
+		if (unlikely(has_group_leader_pid(tsk)))
+			posix_cpu_timers_exit_group(tsk);
+
+		/*
 		 * If there is any task waiting for the group exit
 		 * then notify it:
 		 */
@@ -151,9 +160,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 {
 	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 
-#ifdef CONFIG_PERF_EVENTS
-	WARN_ON_ONCE(tsk->perf_event_ctxp);
-#endif
+	perf_event_delayed_put(tsk);
 	trace_sched_process_free(tsk);
 	put_task_struct(tsk);
 }
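
The hunk above replaces an open-coded #ifdef CONFIG_PERF_EVENTS block with a call to perf_event_delayed_put(). The usual way such a call stays ifdef-free at the call site is a pair of declarations in the subsystem header, roughly like the sketch below (illustrative of the pattern, not a quote of include/linux/perf_event.h):

/* sketch of the usual CONFIG-stub pattern; the real declarations live in the perf header */
#ifdef CONFIG_PERF_EVENTS
extern void perf_event_delayed_put(struct task_struct *task);
#else
static inline void perf_event_delayed_put(struct task_struct *task)
{
	/* perf disabled: nothing to check or release */
}
#endif
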
@@ -556,29 +563,28 @@ void exit_files(struct task_struct *tsk)
 
 #ifdef CONFIG_MM_OWNER
 /*
- * Task p is exiting and it owned mm, lets find a new owner for it
+ * A task is exiting.  If it owned this mm, find a new owner for the mm.
  */
-static inline int
-mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
-{
-	/*
-	 * If there are other users of the mm and the owner (us) is exiting
-	 * we need to find a new owner to take on the responsibility.
-	 */
-	if (atomic_read(&mm->mm_users) <= 1)
-		return 0;
-	if (mm->owner != p)
-		return 0;
-	return 1;
-}
-
 void mm_update_next_owner(struct mm_struct *mm)
 {
 	struct task_struct *c, *g, *p = current;
 
 retry:
-	if (!mm_need_new_owner(mm, p))
+	/*
+	 * If the exiting or execing task is not the owner, it's
+	 * someone else's problem.
+	 */
+	if (mm->owner != p)
+		return;
+	/*
+	 * The current owner is exiting/execing and there are no other
+	 * candidates.  Do not leave the mm pointing to a possibly
+	 * freed task structure.
+	 */
+	if (atomic_read(&mm->mm_users) <= 1) {
+		mm->owner = NULL;
 		return;
+	}
 
 	read_lock(&tasklist_lock);
 	/*
@@ -691,6 +697,8 @@ static void exit_mm(struct task_struct * tsk)
 	enter_lazy_tlb(mm, current);
 	/* We don't want this task to be frozen prematurely */
 	clear_freeze_flag(tsk);
+	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+		atomic_dec(&mm->oom_disable_count);
 	task_unlock(tsk);
 	mm_update_next_owner(mm);
 	mmput(mm);
@@ -704,6 +712,8 @@ static void exit_mm(struct task_struct * tsk)
  * space.
  */
 static struct task_struct *find_new_reaper(struct task_struct *father)
+	__releases(&tasklist_lock)
+	__acquires(&tasklist_lock)
 {
 	struct pid_namespace *pid_ns = task_active_pid_ns(father);
 	struct task_struct *thread;
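
The __releases()/__acquires() markers added above are sparse lock-context annotations: they compile to nothing in a normal build and only tell sparse (make C=1) that the function temporarily drops and re-takes tasklist_lock, which find_new_reaper() can do while tearing down a pid namespace. A sketch of the annotation pattern, with a made-up function name:

/* illustrative only: shows the annotation pattern, not the real find_new_reaper() */
static struct task_struct *demo_pick_reaper(struct task_struct *father)
	__releases(&tasklist_lock)
	__acquires(&tasklist_lock)
{
	/* entered with tasklist_lock held for writing */
	write_unlock_irq(&tasklist_lock);

	/* ... work that must run without the lock held ... */

	write_lock_irq(&tasklist_lock);
	return father;		/* returns with the lock held again */
}
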
@@ -832,7 +842,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	/* Let father know we died
 	 *
 	 * Thread signals are configurable, but you aren't going to use
-	 * that to send signals to arbitary processes.
+	 * that to send signals to arbitrary processes.
 	 * That stops right now.
 	 *
 	 * If the parent exec id doesn't match the exec id we saved
@@ -899,12 +909,22 @@ NORET_TYPE void do_exit(long code)
 	profile_task_exit(tsk);
 
 	WARN_ON(atomic_read(&tsk->fs_excl));
+	WARN_ON(blk_needs_flush_plug(tsk));
 
 	if (unlikely(in_interrupt()))
 		panic("Aiee, killing interrupt handler!");
 	if (unlikely(!tsk->pid))
 		panic("Attempted to kill the idle task!");
 
+	/*
+	 * If do_exit is called because this processes oopsed, it's possible
+	 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
+	 * continuing. Amongst other possible reasons, this is to prevent
+	 * mm_release()->clear_child_tid() from writing to a user-controlled
+	 * kernel address.
+	 */
+	set_fs(USER_DS);
+
 	tracehook_report_exit(&code);
 
 	validate_creds_for_do_exit(tsk);
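
The set_fs(USER_DS) added above resets the thread's uaccess address limit. The hazard it guards against is sketched below: kernel code on pre-set_fs-removal kernels sometimes widens the limit with set_fs(KERNEL_DS) and restores it afterwards, and if an oops fires inside that window, do_exit() would otherwise run with the user-pointer checks effectively disabled. A minimal sketch of that pattern, with a hypothetical helper name:

#include <asm/uaccess.h>

/* sketch only: the classic temporary-KERNEL_DS pattern this change defends against */
static void demo_widen_uaccess_window(void)
{
	mm_segment_t old_fs = get_fs();

	set_fs(KERNEL_DS);	/* put_user()/copy_to_user() now accept kernel pointers */

	/* ... call code that performs uaccess on a kernel buffer ... */

	set_fs(old_fs);		/* an oops before this line leaves KERNEL_DS in place */
}
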
@@ -978,6 +998,15 @@ NORET_TYPE void do_exit(long code)
 	exit_fs(tsk);
 	check_stack_usage();
 	exit_thread();
+
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 *
+	 * because of cgroup mode, must be called before cgroup_exit()
+	 */
+	perf_event_exit_task(tsk);
+
 	cgroup_exit(tsk, 1);
 
 	if (group_dead)
@@ -990,12 +1019,7 @@ NORET_TYPE void do_exit(long code)
 	/*
 	 * FIXME: do that only when needed, using sched_exit tracepoint
 	 */
-	flush_ptrace_hw_breakpoint(tsk);
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 */
-	perf_event_exit_task(tsk);
+	ptrace_put_breakpoints(tsk);
 
 	exit_notify(tsk, group_dead);
 #ifdef CONFIG_NUMA
@@ -1356,11 +1380,23 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
 	return NULL;
 }
 
-/*
- * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold
- * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
- * the lock and this task is uninteresting. If we return nonzero, we have
- * released the lock and the system call should return.
+/**
+ * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
+ * @wo: wait options
+ * @ptrace: is the wait for ptrace
+ * @p: task to wait for
+ *
+ * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
+ *
+ * CONTEXT:
+ * read_lock(&tasklist_lock), which is released if return value is
+ * non-zero.  Also, grabs and releases @p->sighand->siglock.
+ *
+ * RETURNS:
+ * 0 if wait condition didn't exist and search for other wait conditions
+ * should continue.  Non-zero return, -errno on failure and @p's pid on
+ * success, implies that tasklist_lock is released and wait condition
+ * search should terminate.
  */
 static int wait_task_stopped(struct wait_opts *wo,
 			     int ptrace, struct task_struct *p)
@@ -1376,6 +1412,9 @@ static int wait_task_stopped(struct wait_opts *wo,
 	if (!ptrace && !(wo->wo_flags & WUNTRACED))
 		return 0;
 
+	if (!task_stopped_code(p, ptrace))
+		return 0;
+
 	exit_code = 0;
 	spin_lock_irq(&p->sighand->siglock);
 
@@ -1517,33 +1556,84 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		return 0;
 	}
 
-	if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+	/* dead body doesn't have much to contribute */
+	if (p->exit_state == EXIT_DEAD)
+		return 0;
+
+	/* slay zombie? */
+	if (p->exit_state == EXIT_ZOMBIE) {
 		/*
-		 * This child is hidden by ptrace.
-		 * We aren't allowed to see it now, but eventually we will.
+		 * A zombie ptracee is only visible to its ptracer.
+		 * Notification and reaping will be cascaded to the real
+		 * parent when the ptracer detaches.
+		 */
+		if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+			/* it will become visible, clear notask_error */
+			wo->notask_error = 0;
+			return 0;
+		}
+
+		/* we don't reap group leaders with subthreads */
+		if (!delay_group_leader(p))
+			return wait_task_zombie(wo, p);
+
+		/*
+		 * Allow access to stopped/continued state via zombie by
+		 * falling through.  Clearing of notask_error is complex.
+		 *
+		 * When !@ptrace:
+		 *
+		 * If WEXITED is set, notask_error should naturally be
+		 * cleared.  If not, subset of WSTOPPED|WCONTINUED is set,
+		 * so, if there are live subthreads, there are events to
+		 * wait for.  If all subthreads are dead, it's still safe
+		 * to clear - this function will be called again in finite
+		 * amount time once all the subthreads are released and
+		 * will then return without clearing.
+		 *
+		 * When @ptrace:
+		 *
+		 * Stopped state is per-task and thus can't change once the
+		 * target task dies.  Only continued and exited can happen.
+		 * Clear notask_error if WCONTINUED | WEXITED.
+		 */
+		if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
+			wo->notask_error = 0;
+	} else {
+		/*
+		 * If @p is ptraced by a task in its real parent's group,
+		 * hide group stop/continued state when looking at @p as
+		 * the real parent; otherwise, a single stop can be
+		 * reported twice as group and ptrace stops.
+		 *
+		 * If a ptracer wants to distinguish the two events for its
+		 * own children, it should create a separate process which
+		 * takes the role of real parent.
+		 */
+		if (likely(!ptrace) && task_ptrace(p) &&
+		    same_thread_group(p->parent, p->real_parent))
+			return 0;
+
+		/*
+		 * @p is alive and it's gonna stop, continue or exit, so
+		 * there always is something to wait for.
 		 */
 		wo->notask_error = 0;
-		return 0;
 	}
 
-	if (p->exit_state == EXIT_DEAD)
-		return 0;
-
 	/*
-	 * We don't reap group leaders with subthreads.
+	 * Wait for stopped.  Depending on @ptrace, different stopped state
+	 * is used and the two don't interact with each other.
 	 */
-	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
-		return wait_task_zombie(wo, p);
+	ret = wait_task_stopped(wo, ptrace, p);
+	if (ret)
+		return ret;
 
 	/*
-	 * It's stopped or running now, so it might
-	 * later continue, exit, or stop again.
+	 * Wait for continued.  There's only one continued state and the
+	 * ptracer can consume it which can confuse the real parent.  Don't
+	 * use WCONTINUED from ptracer.  You don't need or want it.
 	 */
-	wo->notask_error = 0;
-
-	if (task_stopped_code(p, ptrace))
-		return wait_task_stopped(wo, ptrace, p);
-
 	return wait_task_continued(wo, p);
 }
 
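
The notask_error comments in the hunk above are about which wait conditions (exit, stop, continue) a waiter has asked for via WEXITED, WSTOPPED/WUNTRACED and WCONTINUED. For orientation, a small userspace sketch of those conditions as seen through waitpid(); it is ordinary example code, not part of the patch:

/* userspace example, not kernel code: observing stop, continue and exit */
#include <signal.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int status;
	pid_t child = fork();

	if (child == 0) {
		raise(SIGSTOP);			/* child stops itself */
		return 0;			/* then exits once resumed */
	}

	/* WUNTRACED reports the stop, WCONTINUED reports the SIGCONT resume */
	while (waitpid(child, &status, WUNTRACED | WCONTINUED) > 0) {
		if (WIFSTOPPED(status)) {
			printf("child stopped by signal %d\n", WSTOPSIG(status));
			kill(child, SIGCONT);	/* resume it */
		} else if (WIFCONTINUED(status)) {
			printf("child continued\n");
		} else if (WIFEXITED(status)) {
			printf("child exited with status %d\n", WEXITSTATUS(status));
			break;
		}
	}
	return 0;
}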