From 3d5992d2ac7dc09aed8ab537cba074589f0f0a52 Mon Sep 17 00:00:00 2001 From: Ying Han Date: Tue, 26 Oct 2010 14:21:23 -0700 Subject: oom: add per-mm oom disable count It's pointless to kill a task if another thread sharing its mm cannot be killed to allow future memory freeing. A subsequent patch will prevent kills in such cases, but first it's necessary to have a way to flag a task that shares memory with an OOM_DISABLE task that doesn't incur an additional tasklist scan, which would make select_bad_process() an O(n^2) function. This patch adds an atomic counter to struct mm_struct that follows how many threads attached to it have an oom_score_adj of OOM_SCORE_ADJ_MIN. They cannot be killed by the kernel, so their memory cannot be freed in oom conditions. This only requires task_lock() on the task that we're operating on, it does not require mm->mmap_sem since task_lock() pins the mm and the operation is atomic. [rientjes@google.com: changelog and sys_unshare() code] [rientjes@google.com: protect oom_disable_count with task_lock in fork] [rientjes@google.com: use old_mm for oom_disable_count in exec] Signed-off-by: Ying Han Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index c445f8cc408d..e87aaaaf5131 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -488,6 +489,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); + atomic_set(&mm->oom_disable_count, 0); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -741,6 +743,8 @@ good_mm: /* Initializing for Swap token stuff */ mm->token_priority = 0; mm->last_interval = 0; + if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_inc(&mm->oom_disable_count); tsk->mm = mm; tsk->active_mm = mm; @@ -1299,8 +1303,13 @@ bad_fork_cleanup_io: bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) + if (p->mm) { + task_lock(p); + if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&p->mm->oom_disable_count); + task_unlock(p); mmput(p->mm); + } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); @@ -1693,6 +1702,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) active_mm = current->active_mm; current->mm = new_mm; current->active_mm = new_mm; + if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + atomic_dec(&mm->oom_disable_count); + atomic_inc(&new_mm->oom_disable_count); + } activate_mm(active_mm, new_mm); new_mm = mm; } -- cgit v1.2.2 From 9b1bf12d5d51bca178dea21b04a0805e29d60cf1 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 27 Oct 2010 15:34:08 -0700 Subject: signals: move cred_guard_mutex from task_struct to signal_struct Oleg Nesterov pointed out we have to prevent multiple-threads-inside-exec itself and we can reuse ->cred_guard_mutex for it. Yes, concurrent execve() has no worth. Let's move ->cred_guard_mutex from task_struct to signal_struct. It naturally prevent multiple-threads-inside-exec. Signed-off-by: KOSAKI Motohiro Reviewed-by: Oleg Nesterov Acked-by: Roland McGrath Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index e87aaaaf5131..3b159c5991b7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -908,6 +908,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; + mutex_init(&sig->cred_guard_mutex); + return 0; } -- cgit v1.2.2 From 5091faa449ee0b7d73bc296a93bca9540fc51d0a Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 30 Nov 2010 14:18:03 +0100 Subject: sched: Add 'autogroup' scheduling feature: automated per session task groups A recurring complaint from CFS users is that parallel kbuild has a negative impact on desktop interactivity. This patch implements an idea from Linus, to automatically create task groups. Currently, only per session autogroups are implemented, but the patch leaves the way open for enhancement. Implementation: each task's signal struct contains an inherited pointer to a refcounted autogroup struct containing a task group pointer, the default for all tasks pointing to the init_task_group. When a task calls setsid(), a new task group is created, the process is moved into the new task group, and a reference to the preveious task group is dropped. Child processes inherit this task group thereafter, and increase it's refcount. When the last thread of a process exits, the process's reference is dropped, such that when the last process referencing an autogroup exits, the autogroup is destroyed. At runqueue selection time, IFF a task has no cgroup assignment, its current autogroup is used. Autogroup bandwidth is controllable via setting it's nice level through the proc filesystem: cat /proc//autogroup Displays the task's group and the group's nice level. echo > /proc//autogroup Sets the task group's shares to the weight of nice task. Setting nice level is rate limited for !admin users due to the abuse risk of task group locking. The feature is enabled from boot by default if CONFIG_SCHED_AUTOGROUP=y is selected, but can be disabled via the boot option noautogroup, and can also be turned on/off on the fly via: echo [01] > /proc/sys/kernel/sched_autogroup_enabled ... which will automatically move tasks to/from the root task group. Signed-off-by: Mike Galbraith Acked-by: Linus Torvalds Acked-by: Peter Zijlstra Cc: Markus Trippelsdorf Cc: Mathieu Desnoyers Cc: Paul Turner Cc: Oleg Nesterov [ Removed the task_group_path() debug code, and fixed !EVENTFD build failure. ] Signed-off-by: Ingo Molnar LKML-Reference: <1290281700.28711.9.camel@maggy.simson.net> Signed-off-by: Ingo Molnar --- kernel/fork.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 3b159c5991b7..b6f2475f1e83 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -174,8 +174,10 @@ static inline void free_signal_struct(struct signal_struct *sig) static inline void put_signal_struct(struct signal_struct *sig) { - if (atomic_dec_and_test(&sig->sigcnt)) + if (atomic_dec_and_test(&sig->sigcnt)) { + sched_autogroup_exit(sig); free_signal_struct(sig); + } } void __put_task_struct(struct task_struct *tsk) @@ -904,6 +906,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) posix_cpu_timers_init_group(sig); tty_audit_fork(sig); + sched_autogroup_fork(sig); sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; -- cgit v1.2.2 From f26f9aff6aaf67e9a430d16c266f91b13a5bff64 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 8 Dec 2010 11:05:42 +0100 Subject: Sched: fix skip_clock_update optimization idle_balance() drops/retakes rq->lock, leaving the previous task vulnerable to set_tsk_need_resched(). Clear it after we return from balancing instead, and in setup_thread_stack() as well, so no successfully descheduled or never scheduled task has it set. Need resched confused the skip_clock_update logic, which assumes that the next call to update_rq_clock() will come nearly immediately after being set. Make the optimization robust against the waking a sleeper before it sucessfully deschedules case by checking that the current task has not been dequeued before setting the flag, since it is that useless clock update we're trying to save, and clear unconditionally in schedule() proper instead of conditionally in put_prev_task(). Signed-off-by: Mike Galbraith Reported-by: Bjoern B. Brandenburg Tested-by: Yong Zhang Signed-off-by: Peter Zijlstra Cc: stable@kernel.org LKML-Reference: <1291802742.1417.9.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 3b159c5991b7..5447dc7defa9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -273,6 +273,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); + clear_tsk_need_resched(tsk); stackend = end_of_stack(tsk); *stackend = STACK_END_MAGIC; /* for overflow detection */ -- cgit v1.2.2 From 909ea96468096b07fbb41aaf69be060d92bd9271 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 8 Dec 2010 16:22:55 +0100 Subject: core: Replace __get_cpu_var with __this_cpu_read if not used for an address. __get_cpu_var() can be replaced with this_cpu_read and will then use a single read instruction with implied address calculation to access the correct per cpu instance. However, the address of a per cpu variable passed to __this_cpu_read() cannot be determined (since it's an implied address conversion through segment prefixes). Therefore apply this only to uses of __get_cpu_var where the address of the variable is not used. Cc: Pekka Enberg Cc: Hugh Dickins Cc: Thomas Gleixner Acked-by: H. Peter Anvin Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 3b159c5991b7..e05e27de67df 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1282,7 +1282,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); - __get_cpu_var(process_counts)++; + __this_cpu_inc(process_counts); } attach_pid(p, PIDTYPE_PID, pid); nr_threads++; -- cgit v1.2.2 From 101e5f77bf35679809586e250b6c62193d2ed179 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 31 Dec 2010 09:32:30 +0100 Subject: sched, autogroup: Fix reference leak The cgroup exit mess also uncovered a struct autogroup reference leak. copy_process() was simply freeing vs putting the signal_struct, stranding a reference. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Oleg Nesterov LKML-Reference: <1293784350.6839.2.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index b6f2475f1e83..067244495966 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1317,7 +1317,7 @@ bad_fork_cleanup_mm: } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) - free_signal_struct(p->signal); + put_signal_struct(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: -- cgit v1.2.2 From 1c5354de90c900b369e2ebd36c3a065ede29eb93 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 5 Jan 2011 11:16:04 +0100 Subject: sched: Move sched_autogroup_exit() to free_signal_struct() Per Oleg's suggestion, undo fork failure free/put_signal_struct change, and move sched_autogroup_exit() to free_signal_struct() instead. Signed-off-by: Mike Galbraith Reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: <1294222564.8369.6.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/fork.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 7d164e25b0f0..dc1a8bbcea7b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -169,15 +169,14 @@ EXPORT_SYMBOL(free_task); static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); + sched_autogroup_exit(sig); kmem_cache_free(signal_cachep, sig); } static inline void put_signal_struct(struct signal_struct *sig) { - if (atomic_dec_and_test(&sig->sigcnt)) { - sched_autogroup_exit(sig); + if (atomic_dec_and_test(&sig->sigcnt)) free_signal_struct(sig); - } } void __put_task_struct(struct task_struct *tsk) @@ -1318,7 +1317,7 @@ bad_fork_cleanup_mm: } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) - put_signal_struct(p->signal); + free_signal_struct(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: -- cgit v1.2.2 From 43bb40c9e3aa51a3b038c9df2c9afb4d4685614d Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 13 Jan 2011 15:45:40 -0800 Subject: sched: remove long deprecated CLONE_STOPPED flag This warning was added in commit bdff746a3915 ("clone: prepare to recycle CLONE_STOPPED") three years ago. 2.6.26 came and went. As far as I know, no-one is actually using CLONE_STOPPED. Signed-off-by: Dave Jones Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index d9b44f20b6b0..76a1fdd80bdf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1409,23 +1409,6 @@ long do_fork(unsigned long clone_flags, return -EPERM; } - /* - * We hope to recycle these flags after 2.6.26 - */ - if (unlikely(clone_flags & CLONE_STOPPED)) { - static int __read_mostly count = 100; - - if (count > 0 && printk_ratelimit()) { - char comm[TASK_COMM_LEN]; - - count--; - printk(KERN_INFO "fork(): process `%s' used deprecated " - "clone flags 0x%lx\n", - get_task_comm(comm, current), - clone_flags & CLONE_STOPPED); - } - } - /* * When called from kernel_thread, don't do user tracing stuff. */ @@ -1464,16 +1447,7 @@ long do_fork(unsigned long clone_flags, */ p->flags &= ~PF_STARTING; - if (unlikely(clone_flags & CLONE_STOPPED)) { - /* - * We'll start up with an immediate SIGSTOP. - */ - sigaddset(&p->pending.signal, SIGSTOP); - set_tsk_thread_flag(p, TIF_SIGPENDING); - __set_task_state(p, TASK_STOPPED); - } else { - wake_up_new_task(p, clone_flags); - } + wake_up_new_task(p, clone_flags); tracehook_report_clone_complete(trace, regs, clone_flags, nr, p); -- cgit v1.2.2 From dabb16f639820267b3850d804571c70bd93d4e07 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Thu, 13 Jan 2011 15:46:05 -0800 Subject: oom: allow a non-CAP_SYS_RESOURCE proces to oom_score_adj down We'd like to be able to oom_score_adj a process up/down as it enters/leaves the foreground. Currently, it is not possible to oom_adj down without CAP_SYS_RESOURCE. This patch allows a task to decrease its oom_score_adj back to the value that a CAP_SYS_RESOURCE thread set it to or its inherited value at fork. Assuming the thread that has forked it has oom_score_adj of 0, each process could decrease it back from 0 upon activation unless a CAP_SYS_RESOURCE thread elevated it to something higher. Alternative considered: * a setuid binary * a daemon with CAP_SYS_RESOURCE Since you don't wan't all processes to be able to reduce their oom_adj, a setuid or daemon implementation would be complex. The alternatives also have much higher overhead. This patch updated from original patch based on feedback from David Rientjes. Signed-off-by: Mandeep Singh Baines Acked-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 76a1fdd80bdf..1499607e4da2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -910,6 +910,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; + sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); -- cgit v1.2.2 From e7a00c45f29c0155007aa150bf231a70fa470365 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:45 -0800 Subject: thp: add pmd_huge_pte to mm_struct This increase the size of the mm struct a bit but it is needed to preallocate one pte for each hugepage so that split_huge_page will not require a fail path. Guarantee of success is a fundamental property of split_huge_page to avoid decrasing swapping reliability and to avoid adding -ENOMEM fail paths that would otherwise force the hugepage-unaware VM code to learn rolling back in the middle of its pte mangling operations (if something we need it to learn handling pmd_trans_huge natively rather being capable of rollback). When split_huge_page runs a pte is needed to succeed the split, to map the newly splitted regular pages with a regular pte. This way all existing VM code remains backwards compatible by just adding a split_huge_page* one liner. The memory waste of those preallocated ptes is negligible and so it is worth it. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 1499607e4da2..f78f50ba6cb2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -529,6 +529,9 @@ void __mmdrop(struct mm_struct *mm) mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(mm->pmd_huge_pte); +#endif free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -669,6 +672,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk) mm->token_priority = 0; mm->last_interval = 0; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + mm->pmd_huge_pte = NULL; +#endif + if (!mm_init(mm, tsk)) goto fail_nomem; -- cgit v1.2.2 From ba76149f47d8c939efa0acc07a191237af900471 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:58 -0800 Subject: thp: khugepaged Add khugepaged to relocate fragmented pages into hugepages if new hugepages become available. (this is indipendent of the defrag logic that will have to make new hugepages available) The fundamental reason why khugepaged is unavoidable, is that some memory can be fragmented and not everything can be relocated. So when a virtual machine quits and releases gigabytes of hugepages, we want to use those freely available hugepages to create huge-pmd in the other virtual machines that may be running on fragmented memory, to maximize the CPU efficiency at all times. The scan is slow, it takes nearly zero cpu time, except when it copies data (in which case it means we definitely want to pay for that cpu time) so it seems a good tradeoff. In addition to the hugepages being released by other process releasing memory, we have the strong suspicion that the performance impact of potentially defragmenting hugepages during or before each page fault could lead to more performance inconsistency than allocating small pages at first and having them collapsed into large pages later... if they prove themselfs to be long lived mappings (khugepaged scan is slow so short lived mappings have low probability to run into khugepaged if compared to long lived mappings). Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index f78f50ba6cb2..25e429152ddc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -328,6 +329,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) rb_parent = NULL; pprev = &mm->mmap; retval = ksm_fork(mm, oldmm); + if (retval) + goto out; + retval = khugepaged_fork(mm, oldmm); if (retval) goto out; @@ -546,6 +550,7 @@ void mmput(struct mm_struct *mm) if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { -- cgit v1.2.2 From 73c101011926c5832e6e141682180c4debe2cf45 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 8 Mar 2011 13:19:51 +0100 Subject: block: initial patch for on-stack per-task plugging This patch adds support for creating a queuing context outside of the queue itself. This enables us to batch up pieces of IO before grabbing the block device queue lock and submitting them to the IO scheduler. The context is created on the stack of the process and assigned in the task structure, so that we can auto-unplug it if we hit a schedule event. The current queue plugging happens implicitly if IO is submitted to an empty device, yet callers have to remember to unplug that IO when they are going to wait for it. This is an ugly API and has caused bugs in the past. Additionally, it requires hacks in the vm (->sync_page() callback) to handle that logic. By switching to an explicit plugging scheme we make the API a lot nicer and can get rid of the ->sync_page() hack in the vm. Signed-off-by: Jens Axboe --- kernel/fork.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 25e429152ddc..027c80e5162f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1204,6 +1204,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, * Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; +#ifdef CONFIG_BLOCK + p->plug = NULL; +#endif #ifdef CONFIG_FUTEX p->robust_list = NULL; #ifdef CONFIG_COMPAT -- cgit v1.2.2 From 77c100c83e84316ced2507c5799f79c2c80bc6b9 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 1 Feb 2011 09:51:46 -0500 Subject: export pid symbols needed for kvm_vcpu_on_spin Export the symbols required for a race-free kvm_vcpu_on_spin. Signed-off-by: Rik van Riel Signed-off-by: Avi Kivity --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 25e429152ddc..05b92c457010 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -193,6 +193,7 @@ void __put_task_struct(struct task_struct *tsk) if (!profile_handoff_task(tsk)) free_task(tsk); } +EXPORT_SYMBOL_GPL(__put_task_struct); /* * macro override instead of weak attribute alias, to workaround -- cgit v1.2.2 From 504f52b5439aaf26d3e2c1d45ec10fce38c8dd27 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Mar 2011 16:30:41 -0700 Subject: mm: NUMA aware alloc_task_struct_node() All kthreads being created from a single helper task, they all use memory from a single node for their kernel stack and task struct. This patch suite creates kthread_create_on_cpu(), adding a 'cpu' parameter to parameters already used by kthread_create(). This parameter serves in allocating memory for the new kthread on its memory node if available. Users of this new function are : ksoftirqd, kworker, migration, pktgend... This patch: Add a node parameter to alloc_task_struct(), and change its name to alloc_task_struct_node() This change is needed to allow NUMA aware kthread_create_on_cpu() Signed-off-by: Eric Dumazet Acked-by: David S. Miller Reviewed-by: Andi Kleen Acked-by: Rusty Russell Cc: Tejun Heo Cc: Tony Luck Cc: Fenghua Yu Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 05b92c457010..cffbe8a4e1fc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -109,8 +109,10 @@ int nr_processes(void) } #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR -# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) -# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) +# define alloc_task_struct_node(node) \ + kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) +# define free_task_struct(tsk) \ + kmem_cache_free(task_struct_cachep, (tsk)) static struct kmem_cache *task_struct_cachep; #endif @@ -249,12 +251,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) struct task_struct *tsk; struct thread_info *ti; unsigned long *stackend; - + int node = numa_node_id(); int err; prepare_to_copy(orig); - tsk = alloc_task_struct(); + tsk = alloc_task_struct_node(node); if (!tsk) return NULL; -- cgit v1.2.2 From b6a84016bd2598e35ead635147fa53619982648d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Mar 2011 16:30:42 -0700 Subject: mm: NUMA aware alloc_thread_info_node() Add a node parameter to alloc_thread_info(), and change its name to alloc_thread_info_node() This change is needed to allow NUMA aware kthread_create_on_cpu() Signed-off-by: Eric Dumazet Acked-by: David S. Miller Reviewed-by: Andi Kleen Acked-by: Rusty Russell Cc: Tejun Heo Cc: Tony Luck Cc: Fenghua Yu Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index cffbe8a4e1fc..cbc6adc6e891 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -117,14 +117,17 @@ static struct kmem_cache *task_struct_cachep; #endif #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR -static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) +static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, + int node) { #ifdef CONFIG_DEBUG_STACK_USAGE gfp_t mask = GFP_KERNEL | __GFP_ZERO; #else gfp_t mask = GFP_KERNEL; #endif - return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); + struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); + + return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) @@ -260,7 +263,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) if (!tsk) return NULL; - ti = alloc_thread_info(tsk); + ti = alloc_thread_info_node(tsk, node); if (!ti) { free_task_struct(tsk); return NULL; -- cgit v1.2.2 From 207205a2ba2655652fe46a60b49838af6c16a919 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Mar 2011 16:30:44 -0700 Subject: kthread: NUMA aware kthread_create_on_node() All kthreads being created from a single helper task, they all use memory from a single node for their kernel stack and task struct. This patch suite creates kthread_create_on_node(), adding a 'cpu' parameter to parameters already used by kthread_create(). This parameter serves in allocating memory for the new kthread on its memory node if possible. Signed-off-by: Eric Dumazet Acked-by: David S. Miller Reviewed-by: Andi Kleen Acked-by: Rusty Russell Cc: Tejun Heo Cc: Tony Luck Cc: Fenghua Yu Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index cbc6adc6e891..a8f64f8ec7e1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -254,7 +255,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) struct task_struct *tsk; struct thread_info *ti; unsigned long *stackend; - int node = numa_node_id(); + int node = tsk_fork_get_node(orig); int err; prepare_to_copy(orig); -- cgit v1.2.2 From 9bfb23fc4a481650e60d22dbe84c0fd5a9d49bba Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 22 Mar 2011 16:34:09 -0700 Subject: sys_unshare: remove the dead CLONE_THREAD/SIGHAND/VM code Cleanup: kill the dead code which does nothing but complicates the code and confuses the reader. sys_unshare(CLONE_THREAD/SIGHAND/VM) is not really implemented, and I doubt very much it will ever work. At least, nobody even tried since the original 99d1419d96d7df9cfa56 ("unshare system call -v5: system call handler function") was applied more than 4 years ago. And the code is not consistent. unshare_thread() always fails unconditionally, while unshare_sighand() and unshare_vm() pretend to work if there is nothing to unshare. Remove unshare_thread(), unshare_sighand(), unshare_vm() helpers and related variables and add a simple CLONE_THREAD | CLONE_SIGHAND| CLONE_VM check into check_unshare_flags(). Also, move the "CLONE_NEWNS needs CLONE_FS" check from check_unshare_flags() to sys_unshare(). This looks more consistent and matches the similar do_sysvsem check in sys_unshare(). Note: with or without this patch "atomic_read(mm->mm_users) > 1" can give a false positive due to get_task_mm(). Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Janak Desai Cc: Daniel Lezcano Cc: "Eric W. Biederman" Cc: KOSAKI Motohiro Cc: Alexey Dobriyan Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 123 ++++++++++++---------------------------------------------- 1 file changed, 25 insertions(+), 98 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index a8f64f8ec7e1..f2b494d7c557 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1519,38 +1519,24 @@ void __init proc_caches_init(void) } /* - * Check constraints on flags passed to the unshare system call and - * force unsharing of additional process context as appropriate. + * Check constraints on flags passed to the unshare system call. */ -static void check_unshare_flags(unsigned long *flags_ptr) +static int check_unshare_flags(unsigned long unshare_flags) { + if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| + CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + return -EINVAL; /* - * If unsharing a thread from a thread group, must also - * unshare vm. - */ - if (*flags_ptr & CLONE_THREAD) - *flags_ptr |= CLONE_VM; - - /* - * If unsharing vm, must also unshare signal handlers. - */ - if (*flags_ptr & CLONE_VM) - *flags_ptr |= CLONE_SIGHAND; - - /* - * If unsharing namespace, must also unshare filesystem information. + * Not implemented, but pretend it works if there is nothing to + * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND + * needs to unshare vm. */ - if (*flags_ptr & CLONE_NEWNS) - *flags_ptr |= CLONE_FS; -} - -/* - * Unsharing of tasks created with CLONE_THREAD is not supported yet - */ -static int unshare_thread(unsigned long unshare_flags) -{ - if (unshare_flags & CLONE_THREAD) - return -EINVAL; + if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { + /* FIXME: get_task_mm() increments ->mm_users */ + if (atomic_read(¤t->mm->mm_users) > 1) + return -EINVAL; + } return 0; } @@ -1576,34 +1562,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) return 0; } -/* - * Unsharing of sighand is not supported yet - */ -static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) -{ - struct sighand_struct *sigh = current->sighand; - - if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) - return -EINVAL; - else - return 0; -} - -/* - * Unshare vm if it is being shared - */ -static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp) -{ - struct mm_struct *mm = current->mm; - - if ((unshare_flags & CLONE_VM) && - (mm && atomic_read(&mm->mm_users) > 1)) { - return -EINVAL; - } - - return 0; -} - /* * Unshare file descriptor table if it is being shared */ @@ -1632,23 +1590,21 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp */ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { - int err = 0; struct fs_struct *fs, *new_fs = NULL; - struct sighand_struct *new_sigh = NULL; - struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; + int err; - check_unshare_flags(&unshare_flags); - - /* Return -EINVAL for all unsupported flags */ - err = -EINVAL; - if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| - CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + err = check_unshare_flags(unshare_flags); + if (err) goto bad_unshare_out; + /* + * If unsharing namespace, must also unshare filesystem information. + */ + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old @@ -1656,21 +1612,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) */ if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) do_sysvsem = 1; - if ((err = unshare_thread(unshare_flags))) - goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) - goto bad_unshare_cleanup_thread; - if ((err = unshare_sighand(unshare_flags, &new_sigh))) - goto bad_unshare_cleanup_fs; - if ((err = unshare_vm(unshare_flags, &new_mm))) - goto bad_unshare_cleanup_sigh; + goto bad_unshare_out; if ((err = unshare_fd(unshare_flags, &new_fd))) - goto bad_unshare_cleanup_vm; + goto bad_unshare_cleanup_fs; if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs))) goto bad_unshare_cleanup_fd; - if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { + if (new_fs || new_fd || do_sysvsem || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). @@ -1696,19 +1646,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) spin_unlock(&fs->lock); } - if (new_mm) { - mm = current->mm; - active_mm = current->active_mm; - current->mm = new_mm; - current->active_mm = new_mm; - if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { - atomic_dec(&mm->oom_disable_count); - atomic_inc(&new_mm->oom_disable_count); - } - activate_mm(active_mm, new_mm); - new_mm = mm; - } - if (new_fd) { fd = current->files; current->files = new_fd; @@ -1725,20 +1662,10 @@ bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); -bad_unshare_cleanup_vm: - if (new_mm) - mmput(new_mm); - -bad_unshare_cleanup_sigh: - if (new_sigh) - if (atomic_dec_and_test(&new_sigh->count)) - kmem_cache_free(sighand_cachep, new_sigh); - bad_unshare_cleanup_fs: if (new_fs) free_fs_struct(new_fs); -bad_unshare_cleanup_thread: bad_unshare_out: return err; } -- cgit v1.2.2 From 45a68628d37222e655219febce9e91b6484789b2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 23 Mar 2011 16:43:12 -0700 Subject: pid: remove the child_reaper special case in init/main.c This patchset is a cleanup and a preparation to unshare the pid namespace. These prerequisites prepare for Eric's patchset to give a file descriptor to a namespace and join an existing namespace. This patch: It turns out that the existing assignment in copy_process of the child_reaper can handle the initial assignment of child_reaper we just need to generalize the test in kernel/fork.c Signed-off-by: Eric W. Biederman Signed-off-by: Daniel Lezcano Cc: Oleg Nesterov Cc: Alexey Dobriyan Acked-by: Serge E. Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index f2b494d7c557..17aed4378eda 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1296,7 +1296,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, tracehook_finish_clone(p, clone_flags, trace); if (thread_group_leader(p)) { - if (clone_flags & CLONE_NEWPID) + if (is_child_reaper(pid)) p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; -- cgit v1.2.2 From 4308eebbeb2026827d4492ce8c23d99f7f144a82 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 23 Mar 2011 16:43:13 -0700 Subject: pidns: call pid_ns_prepare_proc() from create_pid_namespace() Reorganize proc_get_sb() so it can be called before the struct pid of the first process is allocated. Signed-off-by: Eric W. Biederman Signed-off-by: Daniel Lezcano Cc: Oleg Nesterov Cc: Alexey Dobriyan Acked-by: Serge E. Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 17aed4378eda..457fff2e17e0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1187,12 +1187,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, pid = alloc_pid(p->nsproxy->pid_ns); if (!pid) goto bad_fork_cleanup_io; - - if (clone_flags & CLONE_NEWPID) { - retval = pid_ns_prepare_proc(p->nsproxy->pid_ns); - if (retval < 0) - goto bad_fork_free_pid; - } } p->pid = pid_nr(pid); -- cgit v1.2.2 From 625f2a378e5a10f45fdc37932fc9f8a21676de9e Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 22 Apr 2011 11:19:10 -0600 Subject: sched: Get rid of lock_depth Neil Brown pointed out that lock_depth somehow escaped the BKL removal work. Let's get rid of it now. Note that the perf scripting utilities still have a bunch of code for dealing with common_lock_depth in tracepoints; I have left that in place in case anybody wants to use that code with older kernels. Suggested-by: Neil Brown Signed-off-by: Jonathan Corbet Cc: Arnd Bergmann Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110422111910.456c0e84@bike.lwn.net Signed-off-by: Ingo Molnar --- kernel/fork.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index e7548dee636b..aca62871a4f9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1103,7 +1103,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, posix_cpu_timers_init(p); - p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); p->real_start_time = p->start_time; monotonic_to_bootbased(&p->real_start_time); -- cgit v1.2.2 From 3e51e3edfd81bfd9853ad7de91167e4ce33d0fe7 Mon Sep 17 00:00:00 2001 From: Samir Bellabes Date: Wed, 11 May 2011 18:18:05 +0200 Subject: sched: Remove unused parameters from sched_fork() and wake_up_new_task() sched_fork() and wake_up_new_task() are defined with a parameter 'unsigned long clone_flags', which is unused. This patch removes the parameters. Signed-off-by: Samir Bellabes Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1305130685-1047-1-git-send-email-sam@synack.fr Signed-off-by: Ingo Molnar --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index aca62871a4f9..2b44d82b8237 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1152,7 +1152,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif /* Perform scheduler related setup. Assign this task to a CPU. */ - sched_fork(p, clone_flags); + sched_fork(p); retval = perf_event_init_task(p); if (retval) @@ -1463,7 +1463,7 @@ long do_fork(unsigned long clone_flags, */ p->flags &= ~PF_STARTING; - wake_up_new_task(p, clone_flags); + wake_up_new_task(p); tracehook_report_clone_complete(trace, regs, clone_flags, nr, p); -- cgit v1.2.2 From 97a894136f29802da19a15541de3c019e1ca147e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 May 2011 17:12:04 -0700 Subject: mm: Remove i_mmap_lock lockbreak Hugh says: "The only significant loser, I think, would be page reclaim (when concurrent with truncation): could spin for a long time waiting for the i_mmap_mutex it expects would soon be dropped? " Counter points: - cpu contention makes the spin stop (need_resched()) - zap pages should be freeing pages at a higher rate than reclaim ever can I think the simplification of the truncate code is definitely worth it. Effectively reverts: 2aa15890f3c ("mm: prevent concurrent unmap_mapping_range() on the same inode") and takes out the code that caused its problem. Signed-off-by: Peter Zijlstra Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Benjamin Herrenschmidt Cc: David Miller Cc: Martin Schwidefsky Cc: Russell King Cc: Paul Mundt Cc: Jeff Dike Cc: Richard Weinberger Cc: Tony Luck Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 2b44d82b8237..4eef925477fc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -386,7 +386,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) spin_lock(&mapping->i_mmap_lock); if (tmp->vm_flags & VM_SHARED) mapping->i_mmap_writable++; - tmp->vm_truncate_count = mpnt->vm_truncate_count; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_prio_tree_add(tmp, mpnt); -- cgit v1.2.2 From 3d48ae45e72390ddf8cc5256ac32ed6f7a19cbea Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 May 2011 17:12:06 -0700 Subject: mm: Convert i_mmap_lock to a mutex Straightforward conversion of i_mmap_lock to a mutex. Signed-off-by: Peter Zijlstra Acked-by: Hugh Dickins Cc: Benjamin Herrenschmidt Cc: David Miller Cc: Martin Schwidefsky Cc: Russell King Cc: Paul Mundt Cc: Jeff Dike Cc: Richard Weinberger Cc: Tony Luck Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 4eef925477fc..927692734bcf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -383,14 +383,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); if (tmp->vm_flags & VM_SHARED) mapping->i_mmap_writable++; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_prio_tree_add(tmp, mpnt); flush_dcache_mmap_unlock(mapping); - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); } /* -- cgit v1.2.2 From de03c72cfce5b263a674d04348b58475ec50163c Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 24 May 2011 17:12:15 -0700 Subject: mm: convert mm->cpu_vm_cpumask into cpumask_var_t cpumask_t is very big struct and cpu_vm_mask is placed wrong position. It might lead to reduce cache hit ratio. This patch has two change. 1) Move the place of cpumask into last of mm_struct. Because usually cpumask is accessed only front bits when the system has cpu-hotplug capability 2) Convert cpu_vm_mask into cpumask_var_t. It may help to reduce memory footprint if cpumask_size() will use nr_cpumask_bits properly in future. In addition, this patch change the name of cpu_vm_mask with cpu_vm_mask_var. It may help to detect out of tree cpu_vm_mask users. This patch has no functional change. [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KOSAKI Motohiro Cc: David Howells Cc: Koichi Yasutake Cc: Hugh Dickins Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 927692734bcf..8e7e135d0817 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -485,6 +485,20 @@ static void mm_init_aio(struct mm_struct *mm) #endif } +int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm) +{ +#ifdef CONFIG_CPUMASK_OFFSTACK + if (!alloc_cpumask_var(&mm->cpu_vm_mask_var, GFP_KERNEL)) + return -ENOMEM; + + if (oldmm) + cpumask_copy(mm_cpumask(mm), mm_cpumask(oldmm)); + else + memset(mm_cpumask(mm), 0, cpumask_size()); +#endif + return 0; +} + static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) { atomic_set(&mm->mm_users, 1); @@ -521,10 +535,20 @@ struct mm_struct * mm_alloc(void) struct mm_struct * mm; mm = allocate_mm(); - if (mm) { - memset(mm, 0, sizeof(*mm)); - mm = mm_init(mm, current); + if (!mm) + return NULL; + + memset(mm, 0, sizeof(*mm)); + mm = mm_init(mm, current); + if (!mm) + return NULL; + + if (mm_init_cpumask(mm, NULL)) { + mm_free_pgd(mm); + free_mm(mm); + return NULL; } + return mm; } @@ -536,6 +560,7 @@ struct mm_struct * mm_alloc(void) void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); + free_cpumask_var(mm->cpu_vm_mask_var); mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); @@ -690,6 +715,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk) if (!mm_init(mm, tsk)) goto fail_nomem; + if (mm_init_cpumask(mm, oldmm)) + goto fail_nocpumask; + if (init_new_context(tsk, mm)) goto fail_nocontext; @@ -716,6 +744,9 @@ fail_nomem: return NULL; fail_nocontext: + free_cpumask_var(mm->cpu_vm_mask_var); + +fail_nocpumask: /* * If init_new_context() failed, we cannot use mmput() to free the mm * because it calls destroy_context() -- cgit v1.2.2 From 4714d1d32d97239fb5ae3e10521d3f133a899b66 Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Thu, 26 May 2011 16:25:18 -0700 Subject: cgroups: read-write lock CLONE_THREAD forking per threadgroup Adds functionality to read/write lock CLONE_THREAD fork()ing per-threadgroup Add an rwsem that lives in a threadgroup's signal_struct that's taken for reading in the fork path, under CONFIG_CGROUPS. If another part of the kernel later wants to use such a locking mechanism, the CONFIG_CGROUPS ifdefs should be changed to a higher-up flag that CGROUPS and the other system would both depend on. This is a pre-patch for cgroup-procs-write.patch. Signed-off-by: Ben Blum Cc: "Eric W. Biederman" Cc: Li Zefan Cc: Matt Helsley Reviewed-by: Paul Menage Cc: Oleg Nesterov Cc: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 8e7e135d0817..1fa9d940e301 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -957,6 +957,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sched_autogroup_fork(sig); +#ifdef CONFIG_CGROUPS + init_rwsem(&sig->threadgroup_fork_lock); +#endif + sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; @@ -1138,6 +1142,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, monotonic_to_bootbased(&p->real_start_time); p->io_context = NULL; p->audit_context = NULL; + if (clone_flags & CLONE_THREAD) + threadgroup_fork_read_lock(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1342,6 +1348,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); + if (clone_flags & CLONE_THREAD) + threadgroup_fork_read_unlock(current); perf_event_fork(p); return p; @@ -1380,6 +1388,8 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: #endif + if (clone_flags & CLONE_THREAD) + threadgroup_fork_read_unlock(current); cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); module_put(task_thread_info(p)->exec_domain->module); -- cgit v1.2.2 From a77aea92010acf54ad785047234418d5d68772e2 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 26 May 2011 16:25:23 -0700 Subject: cgroup: remove the ns_cgroup The ns_cgroup is an annoying cgroup at the namespace / cgroup frontier and leads to some problems: * cgroup creation is out-of-control * cgroup name can conflict when pids are looping * it is not possible to have a single process handling a lot of namespaces without falling in a exponential creation time * we may want to create a namespace without creating a cgroup The ns_cgroup was replaced by a compatibility flag 'clone_children', where a newly created cgroup will copy the parent cgroup values. The userspace has to manually create a cgroup and add a task to the 'tasks' file. This patch removes the ns_cgroup as suggested in the following thread: https://lists.linux-foundation.org/pipermail/containers/2009-June/018616.html The 'cgroup_clone' function is removed because it is no longer used. This is a userspace-visible change. Commit 45531757b45c ("cgroup: notify ns_cgroup deprecated") (merged into 2.6.27) caused the kernel to emit a printk warning users that the feature is planned for removal. Since that time we have heard from XXX users who were affected by this. Signed-off-by: Daniel Lezcano Signed-off-by: Serge E. Hallyn Cc: Eric W. Biederman Cc: Jamal Hadi Salim Reviewed-by: Li Zefan Acked-by: Paul Menage Acked-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 1fa9d940e301..1f84099ecce6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1229,12 +1229,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) p->tgid = current->tgid; - if (current->nsproxy != p->nsproxy) { - retval = ns_cgroup_clone(p, pid); - if (retval) - goto bad_fork_free_pid; - } - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? -- cgit v1.2.2 From 3864601387cf4196371e3c1897fdffa5228296f9 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 26 May 2011 16:25:46 -0700 Subject: mm: extract exe_file handling from procfs Setup and cleanup of mm_struct->exe_file is currently done in fs/proc/. This was because exe_file was needed only for /proc//exe. Since we will need the exe_file functionality also for core dumps (so core name can contain full binary path), built this functionality always into the kernel. To achieve that move that out of proc FS to the kernel/ where in fact it should belong. By doing that we can make dup_mm_exe_file static. Also we can drop linux/proc_fs.h inclusion in fs/exec.c and kernel/fork.c. Signed-off-by: Jiri Slaby Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 1f84099ecce6..ca406d916713 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -59,7 +59,6 @@ #include #include #include -#include #include #include #include @@ -597,6 +596,57 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); +/* + * We added or removed a vma mapping the executable. The vmas are only mapped + * during exec and are not mapped with the mmap system call. + * Callers must hold down_write() on the mm's mmap_sem for these + */ +void added_exe_file_vma(struct mm_struct *mm) +{ + mm->num_exe_file_vmas++; +} + +void removed_exe_file_vma(struct mm_struct *mm) +{ + mm->num_exe_file_vmas--; + if ((mm->num_exe_file_vmas == 0) && mm->exe_file){ + fput(mm->exe_file); + mm->exe_file = NULL; + } + +} + +void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) +{ + if (new_exe_file) + get_file(new_exe_file); + if (mm->exe_file) + fput(mm->exe_file); + mm->exe_file = new_exe_file; + mm->num_exe_file_vmas = 0; +} + +struct file *get_mm_exe_file(struct mm_struct *mm) +{ + struct file *exe_file; + + /* We need mmap_sem to protect against races with removal of + * VM_EXECUTABLE vmas */ + down_read(&mm->mmap_sem); + exe_file = mm->exe_file; + if (exe_file) + get_file(exe_file); + up_read(&mm->mmap_sem); + return exe_file; +} + +static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) +{ + /* It's safe to write the exe_file pointer without exe_file_lock because + * this is called during fork when the task is not yet in /proc */ + newmm->exe_file = get_mm_exe_file(oldmm); +} + /** * get_task_mm - acquire a reference to the task's mm * -- cgit v1.2.2 From 6345d24daf0c1fffe6642081d783cdf653ebaa5c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 29 May 2011 11:32:28 -0700 Subject: mm: Fix boot crash in mm_alloc() Thomas Gleixner reports that we now have a boot crash triggered by CONFIG_CPUMASK_OFFSTACK=y: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] find_next_bit+0x55/0xb0 Call Trace: [] cpumask_any_but+0x2a/0x70 [] flush_tlb_mm+0x2b/0x80 [] pud_populate+0x35/0x50 [] pgd_alloc+0x9a/0xf0 [] mm_init+0xec/0x120 [] mm_alloc+0x53/0xd0 which was introduced by commit de03c72cfce5 ("mm: convert mm->cpu_vm_cpumask into cpumask_var_t"), and is due to wrong ordering of mm_init() vs mm_init_cpumask Thomas wrote a patch to just fix the ordering of initialization, but I hate the new double allocation in the fork path, so I ended up instead doing some more radical surgery to clean it all up. Reported-by: Thomas Gleixner Reported-by: Ingo Molnar Cc: KOSAKI Motohiro Cc: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 42 ++++++++++-------------------------------- 1 file changed, 10 insertions(+), 32 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index ca406d916713..0276c30401a0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -484,20 +484,6 @@ static void mm_init_aio(struct mm_struct *mm) #endif } -int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm) -{ -#ifdef CONFIG_CPUMASK_OFFSTACK - if (!alloc_cpumask_var(&mm->cpu_vm_mask_var, GFP_KERNEL)) - return -ENOMEM; - - if (oldmm) - cpumask_copy(mm_cpumask(mm), mm_cpumask(oldmm)); - else - memset(mm_cpumask(mm), 0, cpumask_size()); -#endif - return 0; -} - static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) { atomic_set(&mm->mm_users, 1); @@ -538,17 +524,8 @@ struct mm_struct * mm_alloc(void) return NULL; memset(mm, 0, sizeof(*mm)); - mm = mm_init(mm, current); - if (!mm) - return NULL; - - if (mm_init_cpumask(mm, NULL)) { - mm_free_pgd(mm); - free_mm(mm); - return NULL; - } - - return mm; + mm_init_cpumask(mm); + return mm_init(mm, current); } /* @@ -559,7 +536,6 @@ struct mm_struct * mm_alloc(void) void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); - free_cpumask_var(mm->cpu_vm_mask_var); mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); @@ -753,6 +729,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk) goto fail_nomem; memcpy(mm, oldmm, sizeof(*mm)); + mm_init_cpumask(mm); /* Initializing for Swap token stuff */ mm->token_priority = 0; @@ -765,9 +742,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk) if (!mm_init(mm, tsk)) goto fail_nomem; - if (mm_init_cpumask(mm, oldmm)) - goto fail_nocpumask; - if (init_new_context(tsk, mm)) goto fail_nocontext; @@ -794,9 +768,6 @@ fail_nomem: return NULL; fail_nocontext: - free_cpumask_var(mm->cpu_vm_mask_var); - -fail_nocpumask: /* * If init_new_context() failed, we cannot use mmput() to free the mm * because it calls destroy_context() @@ -1591,6 +1562,13 @@ void __init proc_caches_init(void) fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + /* + * FIXME! The "sizeof(struct mm_struct)" currently includes the + * whole struct cpumask for the OFFSTACK case. We could change + * this to *only* allocate as much of it as required by the + * maximum number of CPU's we can ever have. The cpumask_allocation + * is at the end of the structure, exactly for that reason. + */ mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); -- cgit v1.2.2