From 3854a771821c970065e3203a0b40ddc4101538cc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:29 -0700 Subject: __exit_signal: don't take rcu lock There is no reason for rcu_read_lock() in __exit_signal(). tsk->sighand can only be changed if tsk does exec, obviously this is not possible. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 93d2711b9381..a7799d8a6404 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -85,7 +85,6 @@ static void __exit_signal(struct task_struct *tsk) BUG_ON(!sig); BUG_ON(!atomic_read(&sig->count)); - rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); spin_lock(&sighand->siglock); @@ -136,7 +135,6 @@ static void __exit_signal(struct task_struct *tsk) tsk->signal = NULL; tsk->sighand = NULL; spin_unlock(&sighand->siglock); - rcu_read_unlock(); __cleanup_sighand(sighand); clear_tsk_thread_flag(tsk,TIF_SIGPENDING); -- cgit v1.2.2 From 7b34e4283c685f5cc6ba6d30e939906eee0d4bcf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:37 -0700 Subject: introduce PF_KTHREAD flag Introduce the new PF_KTHREAD flag to mark the kernel threads. It is set by INIT_TASK() and copied to the forked childs (we could set it in kthreadd() along with PF_NOFREEZE instead). daemonize() was changed as well. In that case testing of PF_KTHREAD is racy, but daemonize() is hopeless anyway. This flag is cleared in do_execve(), before search_binary_handler(). Probably not the best place, we can do this in exec_mmap() or in start_thread(), or clear it along with PF_FORKNOEXEC. But I think this doesn't matter in practice, and if do_execve() fails kthread should die soon. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index a7799d8a6404..28a44a2612dc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -430,7 +430,7 @@ void daemonize(const char *name, ...) * We don't want to have TIF_FREEZE set if the system-wide hibernation * or suspend transition begins right now. */ - current->flags |= PF_NOFREEZE; + current->flags |= (PF_NOFREEZE | PF_KTHREAD); if (current->nsproxy != &init_nsproxy) { get_nsproxy(&init_nsproxy); -- cgit v1.2.2 From 32ecb1f26dd50eeaac4e3f4dea4541c97848e459 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:41 -0700 Subject: coredump: turn mm->core_startup_done into the pointer to struct core_state mm->core_startup_done points to "struct completion startup_done" allocated on the coredump_wait()'s stack. Introduce the new structure, core_state, which holds this "struct completion". This way we can add more info visible to the threads participating in coredump without enlarging mm_struct. No changes in affected .o files. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 28a44a2612dc..f7fa21dbced4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -680,7 +680,7 @@ static void exit_mm(struct task_struct * tsk) up_read(&mm->mmap_sem); down_write(&mm->mmap_sem); if (!--mm->core_waiters) - complete(mm->core_startup_done); + complete(&mm->core_state->startup); up_write(&mm->mmap_sem); wait_for_completion(&mm->core_done); -- cgit v1.2.2 From 999d9fc1670bc082928b93b11d1f2e0e417d973c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:41 -0700 Subject: coredump: move mm->core_waiters into struct core_state Move mm->core_waiters into "struct core_state" allocated on stack. This shrinks mm_struct a little bit and allows further changes. This patch mostly does s/core_waiters/core_state. The only essential change is that coredump_wait() must clear mm->core_state before return. The coredump_wait()'s path is uglified and .text grows by 30 bytes, this is fixed by the next patch. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index f7fa21dbced4..988e232254e9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -670,16 +670,16 @@ static void exit_mm(struct task_struct * tsk) return; /* * Serialize with any possible pending coredump. - * We must hold mmap_sem around checking core_waiters + * We must hold mmap_sem around checking core_state * and clearing tsk->mm. The core-inducing thread - * will increment core_waiters for each thread in the + * will increment ->nr_threads for each thread in the * group with ->mm != NULL. */ down_read(&mm->mmap_sem); - if (mm->core_waiters) { + if (mm->core_state) { up_read(&mm->mmap_sem); down_write(&mm->mmap_sem); - if (!--mm->core_waiters) + if (!--mm->core_state->nr_threads) complete(&mm->core_state->startup); up_write(&mm->mmap_sem); -- cgit v1.2.2 From c5f1cc8c1828486a61ab3e575da6e2c62b34d399 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:42 -0700 Subject: coredump: turn core_state->nr_threads into atomic_t Turn core_state->nr_threads into atomic_t and kill now unneeded down_write(&mm->mmap_sem) in exit_mm(). Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 988e232254e9..63d82957baae 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -678,10 +678,9 @@ static void exit_mm(struct task_struct * tsk) down_read(&mm->mmap_sem); if (mm->core_state) { up_read(&mm->mmap_sem); - down_write(&mm->mmap_sem); - if (!--mm->core_state->nr_threads) + + if (atomic_dec_and_test(&mm->core_state->nr_threads)) complete(&mm->core_state->startup); - up_write(&mm->mmap_sem); wait_for_completion(&mm->core_done); down_read(&mm->mmap_sem); -- cgit v1.2.2 From b564daf806d492dd4f7afe9b6c83b8d35d137669 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:44 -0700 Subject: coredump: construct the list of coredumping threads at startup time binfmt->core_dump() has to iterate over the all threads in system in order to find the coredumping threads and construct the list using the GFP_ATOMIC allocations. With this patch each thread allocates the list node on exit_mm()'s stack and adds itself to the list. This allows us to do further changes: - simplify ->core_dump() - change exit_mm() to clear ->mm first, then wait for ->core_done. this makes the coredumping process visible to oom_kill - kill mm->core_done Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 63d82957baae..b66f0d55c791 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -664,6 +664,7 @@ assign_new_owner: static void exit_mm(struct task_struct * tsk) { struct mm_struct *mm = tsk->mm; + struct core_state *core_state; mm_release(tsk, mm); if (!mm) @@ -676,11 +677,19 @@ static void exit_mm(struct task_struct * tsk) * group with ->mm != NULL. */ down_read(&mm->mmap_sem); - if (mm->core_state) { + core_state = mm->core_state; + if (core_state) { + struct core_thread self; up_read(&mm->mmap_sem); - if (atomic_dec_and_test(&mm->core_state->nr_threads)) - complete(&mm->core_state->startup); + self.task = tsk; + self.next = xchg(&core_state->dumper.next, &self); + /* + * Implies mb(), the result of xchg() must be visible + * to core_state->dumper. + */ + if (atomic_dec_and_test(&core_state->nr_threads)) + complete(&core_state->startup); wait_for_completion(&mm->core_done); down_read(&mm->mmap_sem); -- cgit v1.2.2 From a94e2d408eaedbd85aae259621d46fafc10479a2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:46 -0700 Subject: coredump: kill mm->core_done Now that we have core_state->dumper list we can use it to wake up the sub-threads waiting for the coredump completion. This uglifies the code and .text grows by 47 bytes, but otoh mm_struct lessens by sizeof(struct completion). Also, with this change we can decouple exit_mm() from the coredumping code. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index b66f0d55c791..8a4d4d12e294 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -691,7 +691,13 @@ static void exit_mm(struct task_struct * tsk) if (atomic_dec_and_test(&core_state->nr_threads)) complete(&core_state->startup); - wait_for_completion(&mm->core_done); + for (;;) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!self.task) /* see coredump_finish() */ + break; + schedule(); + } + __set_task_state(tsk, TASK_RUNNING); down_read(&mm->mmap_sem); } atomic_inc(&mm->mm_count); -- cgit v1.2.2 From 297c5d92634c809cef23d73e7b2556f2528ff7e2 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 25 Jul 2008 01:48:49 -0700 Subject: task IO accounting: provide distinct tgid/tid I/O statistics Report per-thread I/O statistics in /proc/pid/task/tid/io and aggregate parent I/O statistics in /proc/pid/io. This approach follows the same model used to account per-process and per-thread CPU times. As a practial application, this allows for example to quickly find the top I/O consumer when a process spawns many child threads that perform the actual I/O work, because the aggregated I/O statistics can always be found in /proc/pid/io. [ Oleg Nesterov points out that we should check that the task is still alive before we iterate over the threads, but also says that we can do that fixup on top of this later. - Linus ] Acked-by: Balbir Singh Signed-off-by: Andrea Righi Cc: Matt Heaton Cc: Shailabh Nagar Acked-by-with-comments: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 8a4d4d12e294..ad933bb29ec7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -120,6 +120,18 @@ static void __exit_signal(struct task_struct *tsk) sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); +#ifdef CONFIG_TASK_XACCT + sig->rchar += tsk->rchar; + sig->wchar += tsk->wchar; + sig->syscr += tsk->syscr; + sig->syscw += tsk->syscw; +#endif /* CONFIG_TASK_XACCT */ +#ifdef CONFIG_TASK_IO_ACCOUNTING + sig->ioac.read_bytes += tsk->ioac.read_bytes; + sig->ioac.write_bytes += tsk->ioac.write_bytes; + sig->ioac.cancelled_write_bytes += + tsk->ioac.cancelled_write_bytes; +#endif /* CONFIG_TASK_IO_ACCOUNTING */ sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@ -1366,6 +1378,21 @@ static int wait_task_zombie(struct task_struct *p, int options, psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; +#ifdef CONFIG_TASK_XACCT + psig->rchar += p->rchar + sig->rchar; + psig->wchar += p->wchar + sig->wchar; + psig->syscr += p->syscr + sig->syscr; + psig->syscw += p->syscw + sig->syscw; +#endif /* CONFIG_TASK_XACCT */ +#ifdef CONFIG_TASK_IO_ACCOUNTING + psig->ioac.read_bytes += + p->ioac.read_bytes + sig->ioac.read_bytes; + psig->ioac.write_bytes += + p->ioac.write_bytes + sig->ioac.write_bytes; + psig->ioac.cancelled_write_bytes += + p->ioac.cancelled_write_bytes + + sig->ioac.cancelled_write_bytes; +#endif /* CONFIG_TASK_IO_ACCOUNTING */ spin_unlock_irq(&p->parent->sighand->siglock); } -- cgit v1.2.2