From 3d48749d93a3dce732dd30a14002ab90ec4355f3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Mar 2012 13:15:25 -0800 Subject: block: ioc_task_link() can't fail ioc_task_link() is used to share %current's ioc on clone. If %current->io_context is set, %current is guaranteed to have refcount on the ioc and, thus, ioc_task_link() can't fail. Replace error checking in ioc_task_link() with WARN_ON_ONCE() and make it just increment refcount and nr_tasks. -v2: Description typo fix (Vivek). Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- kernel/fork.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index b77fd559c78e..a1b632713e43 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -901,9 +901,8 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) * Share io context with parent, if CLONE_IO is set */ if (clone_flags & CLONE_IO) { - tsk->io_context = ioc_task_link(ioc); - if (unlikely(!tsk->io_context)) - return -ENOMEM; + ioc_task_link(ioc); + tsk->io_context = ioc; } else if (ioprio_valid(ioc->ioprio)) { new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); if (unlikely(!new_ioc)) -- cgit v1.2.2 From 0326f5a94ddea33fa331b2519f4172f4fb387baa Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 13 Mar 2012 23:30:11 +0530 Subject: uprobes/core: Handle breakpoint and singlestep exceptions Uprobes uses exception notifiers to get to know if a thread hit a breakpoint or a singlestep exception. When a thread hits a uprobe or is singlestepping post a uprobe hit, the uprobe exception notifier sets its TIF_UPROBE bit, which will then be checked on its return to userspace path (do_notify_resume() ->uprobe_notify_resume()), where the consumers handlers are run (in task context) based on the defined filters. Uprobe hits are thread specific and hence we need to maintain information about if a task hit a uprobe, what uprobe was hit, the slot where the original instruction was copied for xol so that it can be singlestepped with appropriate fixups. In some cases, special care is needed for instructions that are executed out of line (xol). These are architecture specific artefacts, such as handling RIP relative instructions on x86_64. Since the instruction at which the uprobe was inserted is executed out of line, architecture specific fixups are added so that the thread continues normal execution in the presence of a uprobe. Postpone the signals until we execute the probed insn. post_xol() path does a recalc_sigpending() before return to user-mode, this ensures the signal can't be lost. Uprobes relies on DIE_DEBUG notification to notify if a singlestep is complete. Adds x86 specific uprobe exception notifiers and appropriate hooks needed to determine a uprobe hit and subsequent post processing. Add requisite x86 fixups for xol for uprobes. Specific cases needing fixups include relative jumps (x86_64), calls, etc. Where possible, we check and skip singlestepping the breakpointed instructions. For now we skip single byte as well as few multibyte nop instructions. However this can be extended to other instructions too. Credits to Oleg Nesterov for suggestions/patches related to signal, breakpoint, singlestep handling code. Signed-off-by: Srikar Dronamraju Cc: Linus Torvalds Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Linux-mm Cc: Oleg Nesterov Cc: Andi Kleen Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120313180011.29771.89027.sendpatchset@srdronam.in.ibm.com [ Performed various cleanliness edits ] Signed-off-by: Ingo Molnar --- kernel/fork.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index e2cd3e2a5ae8..eb7b63334009 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include @@ -701,6 +702,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) exit_pi_state_list(tsk); #endif + uprobe_free_utask(tsk); + /* Get rid of any cached register state */ deactivate_mm(tsk, mm); @@ -1295,6 +1298,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->pi_state_list); p->pi_state_cache = NULL; #endif + uprobe_copy_process(p); /* * sigaltstack should be cleared when sharing the same VM */ -- cgit v1.2.2 From d4b3b6384f98f8692ad0209891ccdbc7e78bbefe Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 30 Mar 2012 23:56:31 +0530 Subject: uprobes/core: Allocate XOL slots for uprobes use Uprobes executes the original instruction at a probed location out of line. For this, we allocate a page (per mm) upon the first uprobe hit, in the process user address space, divide it into slots that are used to store the actual instructions to be singlestepped. These slots are known as xol (execution out of line) slots. Care is taken to ensure that the allocation is in an unmapped area as close to the top of the user address space as possible, with appropriate permission settings to keep selinux like frameworks happy. Upon a uprobe hit, a free slot is acquired, and is released after the singlestep completes. Lots of improvements courtesy suggestions/inputs from Peter and Oleg. [ Folded a fix for build issue on powerpc fixed and reported by Stephen Rothwell. ] Signed-off-by: Srikar Dronamraju Cc: Linus Torvalds Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Linux-mm Cc: Oleg Nesterov Cc: Andi Kleen Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Masami Hiramatsu Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120330182631.10018.48175.sendpatchset@srdronam.in.ibm.com Signed-off-by: Ingo Molnar --- kernel/fork.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index eb7b63334009..3133b9da59d5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -554,6 +554,7 @@ void mmput(struct mm_struct *mm) might_sleep(); if (atomic_dec_and_test(&mm->mm_users)) { + uprobe_clear_state(mm); exit_aio(mm); ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ @@ -760,6 +761,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk) #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; #endif + uprobe_reset_state(mm); if (!mm_init(mm, tsk)) goto fail_nomem; -- cgit v1.2.2 From 682968e0c425c60f0dde37977e5beb2b12ddc4cc Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 30 Mar 2012 23:56:46 +0530 Subject: uprobes/core: Optimize probe hits with the help of a counter Maintain a per-mm counter: number of uprobes that are inserted on this process address space. This counter can be used at probe hit time to determine if we need a lookup in the uprobes rbtree. Everytime a probe gets inserted successfully, the probe count is incremented and everytime a probe gets removed, the probe count is decremented. The new uprobe_munmap hook ensures the count is correct on a unmap or remap of a region. We expect that once a uprobe_munmap() is called, the vma goes away. So uprobe_unregister() finding a probe to unregister would either mean unmap event hasnt occurred yet or a mmap event on the same executable file occured after a unmap event. Additionally, uprobe_mmap hook now also gets called: a. on every executable vma that is COWed at fork. b. a vma of interest is newly mapped; breakpoint insertion also happens at the required address. On process creation, make sure the probes count in the child is set correctly. Special cases that are taken care include: a. mremap b. VM_DONTCOPY vmas on fork() c. insertion/removal races in the parent during fork(). Signed-off-by: Srikar Dronamraju Cc: Linus Torvalds Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Linux-mm Cc: Oleg Nesterov Cc: Andi Kleen Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Masami Hiramatsu Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120330182646.10018.85805.sendpatchset@srdronam.in.ibm.com Signed-off-by: Ingo Molnar --- kernel/fork.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 3133b9da59d5..26a8f5c25805 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -421,6 +421,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (retval) goto out; + + if (file && uprobe_mmap(tmp)) + goto out; } /* a new mm has just been created */ arch_dup_mmap(oldmm, mm); -- cgit v1.2.2 From e2cfabdfd075648216f99c2c03821cf3f47c1727 Mon Sep 17 00:00:00 2001 From: Will Drewry Date: Thu, 12 Apr 2012 16:47:57 -0500 Subject: seccomp: add system call filtering using BPF [This patch depends on luto@mit.edu's no_new_privs patch: https://lkml.org/lkml/2012/1/30/264 The whole series including Andrew's patches can be found here: https://github.com/redpig/linux/tree/seccomp Complete diff here: https://github.com/redpig/linux/compare/1dc65fed...seccomp ] This patch adds support for seccomp mode 2. Mode 2 introduces the ability for unprivileged processes to install system call filtering policy expressed in terms of a Berkeley Packet Filter (BPF) program. This program will be evaluated in the kernel for each system call the task makes and computes a result based on data in the format of struct seccomp_data. A filter program may be installed by calling: struct sock_fprog fprog = { ... }; ... prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog); The return value of the filter program determines if the system call is allowed to proceed or denied. If the first filter program installed allows prctl(2) calls, then the above call may be made repeatedly by a task to further reduce its access to the kernel. All attached programs must be evaluated before a system call will be allowed to proceed. Filter programs will be inherited across fork/clone and execve. However, if the task attaching the filter is unprivileged (!CAP_SYS_ADMIN) the no_new_privs bit will be set on the task. This ensures that unprivileged tasks cannot attach filters that affect privileged tasks (e.g., setuid binary). There are a number of benefits to this approach. A few of which are as follows: - BPF has been exposed to userland for a long time - BPF optimization (and JIT'ing) are well understood - Userland already knows its ABI: system call numbers and desired arguments - No time-of-check-time-of-use vulnerable data accesses are possible. - system call arguments are loaded on access only to minimize copying required for system call policy decisions. Mode 2 support is restricted to architectures that enable HAVE_ARCH_SECCOMP_FILTER. In this patch, the primary dependency is on syscall_get_arguments(). The full desired scope of this feature will add a few minor additional requirements expressed later in this series. Based on discussion, SECCOMP_RET_ERRNO and SECCOMP_RET_TRACE seem to be the desired additional functionality. No architectures are enabled in this patch. Signed-off-by: Will Drewry Acked-by: Serge Hallyn Reviewed-by: Indan Zupancic Acked-by: Eric Paris Reviewed-by: Kees Cook v18: - rebase to v3.4-rc2 - s/chk/check/ (akpm@linux-foundation.org,jmorris@namei.org) - allocate with GFP_KERNEL|__GFP_NOWARN (indan@nul.nu) - add a comment for get_u32 regarding endianness (akpm@) - fix other typos, style mistakes (akpm@) - added acked-by v17: - properly guard seccomp filter needed headers (leann@ubuntu.com) - tighten return mask to 0x7fff0000 v16: - no change v15: - add a 4 instr penalty when counting a path to account for seccomp_filter size (indan@nul.nu) - drop the max insns to 256KB (indan@nul.nu) - return ENOMEM if the max insns limit has been hit (indan@nul.nu) - move IP checks after args (indan@nul.nu) - drop !user_filter check (indan@nul.nu) - only allow explicit bpf codes (indan@nul.nu) - exit_code -> exit_sig v14: - put/get_seccomp_filter takes struct task_struct (indan@nul.nu,keescook@chromium.org) - adds seccomp_chk_filter and drops general bpf_run/chk_filter user - add seccomp_bpf_load for use by net/core/filter.c - lower max per-process/per-hierarchy: 1MB - moved nnp/capability check prior to allocation (all of the above: indan@nul.nu) v13: - rebase on to 88ebdda6159ffc15699f204c33feb3e431bf9bdc v12: - added a maximum instruction count per path (indan@nul.nu,oleg@redhat.com) - removed copy_seccomp (keescook@chromium.org,indan@nul.nu) - reworded the prctl_set_seccomp comment (indan@nul.nu) v11: - reorder struct seccomp_data to allow future args expansion (hpa@zytor.com) - style clean up, @compat dropped, compat_sock_fprog32 (indan@nul.nu) - do_exit(SIGSYS) (keescook@chromium.org, luto@mit.edu) - pare down Kconfig doc reference. - extra comment clean up v10: - seccomp_data has changed again to be more aesthetically pleasing (hpa@zytor.com) - calling convention is noted in a new u32 field using syscall_get_arch. This allows for cross-calling convention tasks to use seccomp filters. (hpa@zytor.com) - lots of clean up (thanks, Indan!) v9: - n/a v8: - use bpf_chk_filter, bpf_run_filter. update load_fns - Lots of fixes courtesy of indan@nul.nu: -- fix up load behavior, compat fixups, and merge alloc code, -- renamed pc and dropped __packed, use bool compat. -- Added a hidden CONFIG_SECCOMP_FILTER to synthesize non-arch dependencies v7: (massive overhaul thanks to Indan, others) - added CONFIG_HAVE_ARCH_SECCOMP_FILTER - merged into seccomp.c - minimal seccomp_filter.h - no config option (part of seccomp) - no new prctl - doesn't break seccomp on systems without asm/syscall.h (works but arg access always fails) - dropped seccomp_init_task, extra free functions, ... - dropped the no-asm/syscall.h code paths - merges with network sk_run_filter and sk_chk_filter v6: - fix memory leak on attach compat check failure - require no_new_privs || CAP_SYS_ADMIN prior to filter installation. (luto@mit.edu) - s/seccomp_struct_/seccomp_/ for macros/functions (amwang@redhat.com) - cleaned up Kconfig (amwang@redhat.com) - on block, note if the call was compat (so the # means something) v5: - uses syscall_get_arguments (indan@nul.nu,oleg@redhat.com, mcgrathr@chromium.org) - uses union-based arg storage with hi/lo struct to handle endianness. Compromises between the two alternate proposals to minimize extra arg shuffling and account for endianness assuming userspace uses offsetof(). (mcgrathr@chromium.org, indan@nul.nu) - update Kconfig description - add include/seccomp_filter.h and add its installation - (naive) on-demand syscall argument loading - drop seccomp_t (eparis@redhat.com) v4: - adjusted prctl to make room for PR_[SG]ET_NO_NEW_PRIVS - now uses current->no_new_privs (luto@mit.edu,torvalds@linux-foundation.com) - assign names to seccomp modes (rdunlap@xenotime.net) - fix style issues (rdunlap@xenotime.net) - reworded Kconfig entry (rdunlap@xenotime.net) v3: - macros to inline (oleg@redhat.com) - init_task behavior fixed (oleg@redhat.com) - drop creator entry and extra NULL check (oleg@redhat.com) - alloc returns -EINVAL on bad sizing (serge.hallyn@canonical.com) - adds tentative use of "always_unprivileged" as per torvalds@linux-foundation.org and luto@mit.edu v2: - (patch 2 only) Signed-off-by: James Morris --- kernel/fork.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index b9372a0bff18..f7cf6fb107ec 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -170,6 +171,7 @@ void free_task(struct task_struct *tsk) free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); + put_seccomp_filter(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -1162,6 +1164,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto fork_out; ftrace_graph_init_task(p); + get_seccomp_filter(p); rt_mutex_init_task(p); -- cgit v1.2.2 From 6c0a9fa62feb7e9fdefa9720bcc03040c9b0b311 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 5 May 2012 15:05:40 +0000 Subject: fork: Remove the weak insanity We error out when compiling with gcc4.1.[01] as it miscompiles __weak. The workaround with magic defines is not longer necessary. Make it __weak again. Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20120505150141.306358267@linutronix.de --- kernel/fork.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index b9372a0bff18..a79b36e2e912 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -203,13 +203,7 @@ void __put_task_struct(struct task_struct *tsk) } EXPORT_SYMBOL_GPL(__put_task_struct); -/* - * macro override instead of weak attribute alias, to workaround - * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. - */ -#ifndef arch_task_cache_init -#define arch_task_cache_init() -#endif +void __init __weak arch_task_cache_init(void) { } void __init fork_init(unsigned long mempages) { -- cgit v1.2.2 From 2889f60814e15dea644782597d897cdba943564f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 5 May 2012 15:05:41 +0000 Subject: fork: Move thread info gfp flags to header These flags can be useful for extra allocations outside of the core code. Add __GFP_NOTRACK to them, so the archs which have kmemcheck do not have to provide extra allocators just for that reason. Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20120505150141.428211694@linutronix.de --- kernel/fork.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index a79b36e2e912..5d22b9b8cf7b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -123,12 +123,8 @@ static struct kmem_cache *task_struct_cachep; static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { -#ifdef CONFIG_DEBUG_STACK_USAGE - gfp_t mask = GFP_KERNEL | __GFP_ZERO; -#else - gfp_t mask = GFP_KERNEL; -#endif - struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); + struct page *page = alloc_pages_node(node, THREADINFO_GFP, + THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; } -- cgit v1.2.2 From 41101809a865dd0be1b56eff46c83fad321870b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 5 May 2012 15:05:41 +0000 Subject: fork: Provide weak arch_release_[task_struct|thread_info] functions These functions allow us to move most of the duplicated thread_info allocators to the core code. Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20120505150141.366461660@linutronix.de --- kernel/fork.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 5d22b9b8cf7b..2dfad0269674 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -112,14 +112,26 @@ int nr_processes(void) } #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR -# define alloc_task_struct_node(node) \ - kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) -# define free_task_struct(tsk) \ - kmem_cache_free(task_struct_cachep, (tsk)) static struct kmem_cache *task_struct_cachep; + +static inline struct task_struct *alloc_task_struct_node(int node) +{ + return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); +} + +void __weak arch_release_task_struct(struct task_struct *tsk) { } + +static inline void free_task_struct(struct task_struct *tsk) +{ + arch_release_task_struct(tsk); + kmem_cache_free(task_struct_cachep, tsk); +} #endif #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR + +void __weak arch_release_thread_info(struct thread_info *ti) { } + static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { @@ -131,6 +143,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, static inline void free_thread_info(struct thread_info *ti) { + arch_release_thread_info(ti); free_pages((unsigned long)ti, THREAD_SIZE_ORDER); } #endif -- cgit v1.2.2 From 0d15d74a1ead10673b5b1db66d4c90552769096c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 5 May 2012 15:05:41 +0000 Subject: fork: Provide kmemcache based thread_info allocator Several architectures have their own kmemcache based thread allocator because THREAD_SIZE is smaller than PAGE_SIZE. Add it to the core code conditionally on THREAD_SIZE < PAGE_SIZE so the private copies can go. Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20120505150141.491002124@linutronix.de --- kernel/fork.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 2dfad0269674..7590bd6e8dff 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -132,6 +132,11 @@ static inline void free_task_struct(struct task_struct *tsk) void __weak arch_release_thread_info(struct thread_info *ti) { } +/* + * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a + * kmemcache based allocator. + */ +# if THREAD_SIZE >= PAGE_SIZE static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { @@ -146,6 +151,28 @@ static inline void free_thread_info(struct thread_info *ti) arch_release_thread_info(ti); free_pages((unsigned long)ti, THREAD_SIZE_ORDER); } +# else +static struct kmem_cache *thread_info_cache; + +static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, + int node) +{ + return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); +} + +static void free_thread_info(struct thread_info *ti) +{ + arch_release_thread_info(ti); + kmem_cache_free(thread_info_cache, ti); +} + +void thread_info_cache_init(void) +{ + thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, + THREAD_SIZE, 0, NULL); + BUG_ON(thread_info_cache == NULL); +} +# endif #endif /* SLAB cache for signal_struct structures (tsk->signal) */ -- cgit v1.2.2 From f5e10287367dcffb5504d19c83e85ca041ca2596 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 5 May 2012 15:05:48 +0000 Subject: task_allocator: Use config switches instead of magic defines Replace __HAVE_ARCH_TASK_ALLOCATOR and __HAVE_ARCH_THREAD_ALLOCATOR with proper config switches. Signed-off-by: Thomas Gleixner Cc: Sam Ravnborg Cc: Tony Luck Link: http://lkml.kernel.org/r/20120505150142.371309416@linutronix.de --- kernel/fork.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 7590bd6e8dff..a1793e442b20 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -111,7 +111,7 @@ int nr_processes(void) return total; } -#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR +#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR static struct kmem_cache *task_struct_cachep; static inline struct task_struct *alloc_task_struct_node(int node) @@ -128,8 +128,7 @@ static inline void free_task_struct(struct task_struct *tsk) } #endif -#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR - +#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR void __weak arch_release_thread_info(struct thread_info *ti) { } /* @@ -243,7 +242,7 @@ void __init __weak arch_task_cache_init(void) { } void __init fork_init(unsigned long mempages) { -#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR +#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES #endif -- cgit v1.2.2 From 5e2bf0142231194d36fdc9596b36a261ed2b9fe7 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 10 May 2012 13:01:45 -0700 Subject: namespaces, pid_ns: fix leakage on fork() failure Fork() failure post namespace creation for a child cloned with CLONE_NEWPID leaks pid_namespace/mnt_cache due to proc being mounted during creation, but not unmounted during cleanup. Call pid_ns_release_proc() during cleanup. Signed-off-by: Mike Galbraith Acked-by: Oleg Nesterov Reviewed-by: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Cyrill Gorcunov Cc: Louis Rilling Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index b9372a0bff18..687a15d56243 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -1464,6 +1465,8 @@ bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: + if (unlikely(clone_flags & CLONE_NEWPID)) + pid_ns_release_proc(p->nsproxy->pid_ns); exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) -- cgit v1.2.2 From 55ccf3fe3f9a3441731aa79cf42a628fc4ecace9 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 16 May 2012 15:03:51 -0700 Subject: fork: move the real prepare_to_copy() users to arch_dup_task_struct() Historical prepare_to_copy() is mostly a no-op, duplicated for majority of the architectures and the rest following the x86 model of flushing the extended register state like fpu there. Remove it and use the arch_dup_task_struct() instead. Suggested-by: Oleg Nesterov Suggested-by: Linus Torvalds Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1336692811-30576-1-git-send-email-suresh.b.siddha@intel.com Acked-by: Benjamin Herrenschmidt Cc: David Howells Cc: Koichi Yasutake Cc: Paul Mackerras Cc: Paul Mundt Cc: Chris Zankel Cc: Richard Henderson Cc: Russell King Cc: Haavard Skinnemoen Cc: Mike Frysinger Cc: Mark Salter Cc: Aurelien Jacquiot Cc: Mikael Starvik Cc: Yoshinori Sato Cc: Richard Kuo Cc: Tony Luck Cc: Michal Simek Cc: Ralf Baechle Cc: Jonas Bonn Cc: James E.J. Bottomley Cc: Helge Deller Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Chen Liqin Cc: Lennox Wu Cc: David S. Miller Cc: Chris Metcalf Cc: Jeff Dike Cc: Richard Weinberger Cc: Guan Xuetao Signed-off-by: H. Peter Anvin --- kernel/fork.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 687a15d56243..7aed746ff47c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -261,8 +261,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) int node = tsk_fork_get_node(orig); int err; - prepare_to_copy(orig); - tsk = alloc_task_struct_node(node); if (!tsk) return NULL; -- cgit v1.2.2 From e73f8959af0439d114847eab5a8a5ce48f1217c4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 11 May 2012 10:59:07 +1000 Subject: task_work_add: generic process-context callbacks Provide a simple mechanism that allows running code in the (nonatomic) context of the arbitrary task. The caller does task_work_add(task, task_work) and this task executes task_work->func() either from do_notify_resume() or from do_exit(). The callback can rely on PF_EXITING to detect the latter case. "struct task_work" can be embedded in another struct, still it has "void *data" to handle the most common/simple case. This allows us to kill the ->replacement_session_keyring hack, and potentially this can have more users. Performance-wise, this adds 2 "unlikely(!hlist_empty())" checks into tracehook_notify_resume() and do_exit(). But at the same time we can remove the "replacement_session_keyring != NULL" checks from arch/*/signal.c and exit_creds(). Note: task_work_add/task_work_run abuses ->pi_lock. This is only because this lock is already used by lookup_pi_state() to synchronize with do_exit() setting PF_EXITING. Fortunately the scope of this lock in task_work.c is really tiny, and the code is unlikely anyway. Signed-off-by: Oleg Nesterov Acked-by: David Howells Cc: Thomas Gleixner Cc: Richard Kuo Cc: Linus Torvalds Cc: Alexander Gordeev Cc: Chris Zankel Cc: David Smith Cc: "Frank Ch. Eigler" Cc: Geert Uytterhoeven Cc: Larry Woodman Cc: Peter Zijlstra Cc: Tejun Heo Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 05c813dc9ecc..a46db217a589 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1411,6 +1411,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); + INIT_HLIST_HEAD(&p->task_works); /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible -- cgit v1.2.2 From e709ffd6169ccd259eb5874e853303e91e94e829 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 29 May 2012 15:06:18 -0700 Subject: mm: remove swap token code The swap token code no longer fits in with the current VM model. It does not play well with cgroups or the better NUMA placement code in development, since we have only one swap token globally. It also has the potential to mess with scalability of the system, by increasing the number of non-reclaimable pages on the active and inactive anon LRU lists. Last but not least, the swap token code has been broken for a year without complaints, as reported by Konstantin Khlebnikov. This suggests we no longer have much use for it. The days of sub-1G memory systems with heavy use of swap are over. If we ever need thrashing reducing code in the future, we will have to implement something that does scale. Signed-off-by: Rik van Riel Cc: Konstantin Khlebnikov Acked-by: Johannes Weiner Cc: Mel Gorman Cc: Hugh Dickins Acked-by: Bob Picco Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 47b4e4f379f9..5b13eea2e757 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -614,7 +614,6 @@ void mmput(struct mm_struct *mm) list_del(&mm->mmlist); spin_unlock(&mmlist_lock); } - put_swap_token(mm); if (mm->binfmt) module_put(mm->binfmt->module); mmdrop(mm); @@ -831,10 +830,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); mm_init_cpumask(mm); - /* Initializing for Swap token stuff */ - mm->token_priority = 0; - mm->last_interval = 0; - #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; #endif @@ -913,10 +908,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) goto fail_nomem; good_mm: - /* Initializing for Swap token stuff */ - mm->token_priority = 0; - mm->last_interval = 0; - tsk->mm = mm; tsk->active_mm = mm; return 0; -- cgit v1.2.2 From 7edc8b0ac16cbaed7cb4ea4c6b95ce98d2997e84 Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Tue, 29 May 2012 15:06:22 -0700 Subject: mm/fork: fix overflow in vma length when copying mmap on clone The vma length in dup_mmap is calculated and stored in a unsigned int, which is insufficient and hence overflows for very large maps (beyond 16TB). The following program demonstrates this: #include #include #include #define GIG 1024 * 1024 * 1024L #define EXTENT 16393 int main(void) { int i, r; void *m; char buf[1024]; for (i = 0; i < EXTENT; i++) { m = mmap(NULL, (size_t) 1 * 1024 * 1024 * 1024L, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); if (m == (void *)-1) printf("MMAP Failed: %d\n", m); else printf("%d : MMAP returned %p\n", i, m); r = fork(); if (r == 0) { printf("%d: successed\n", i); return 0; } else if (r < 0) printf("FORK Failed: %d\n", r); else if (r > 0) wait(NULL); } return 0; } Increase the storage size of the result to unsigned long, which is sufficient for storing the difference between addresses. Signed-off-by: Siddhesh Poyarekar Cc: Tejun Heo Cc: Oleg Nesterov Cc: Jens Axboe Cc: Peter Zijlstra Acked-by: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 5b13eea2e757..017fb23d5983 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -386,7 +386,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) } charge = 0; if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; + unsigned long len; + len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ goto fail_nomem; charge = len; -- cgit v1.2.2 From f7505d64f2db5da2d7d94873ddf2cd2524847061 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 31 May 2012 16:26:21 -0700 Subject: fork: call complete_vfork_done() after clearing child_tid and flushing rss-counters Child should wake up the parent from vfork() only after finishing all operations with shared mm. There is no sense in using CLONE_CHILD_CLEARTID together with CLONE_VFORK, but it looks more accurate now. Signed-off-by: Konstantin Khlebnikov Cc: Oleg Nesterov Cc: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Konstantin Khlebnikov Cc: Markus Trippelsdorf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 017fb23d5983..2254fbf23567 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -787,9 +787,6 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) /* Get rid of any cached register state */ deactivate_mm(tsk, mm); - if (tsk->vfork_done) - complete_vfork_done(tsk); - /* * If we're exiting normally, clear a user-space tid field if * requested. We leave this alone when dying by signal, to leave @@ -810,6 +807,13 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) } tsk->clear_child_tid = NULL; } + + /* + * All done, finally we can wake up parent and return this mm to him. + * Also kthread_stop() uses this completion for synchronization. + */ + if (tsk->vfork_done) + complete_vfork_done(tsk); } /* -- cgit v1.2.2