From 221af7f87b97431e3ee21ce4b0e77d5411cf1549 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 28 Jan 2010 22:14:42 -0800 Subject: Split 'flush_old_exec' into two functions 'flush_old_exec()' is the point of no return when doing an execve(), and it is pretty badly misnamed. It doesn't just flush the old executable environment, it also starts up the new one. Which is very inconvenient for things like setting up the new personality, because we want the new personality to affect the starting of the new environment, but at the same time we do _not_ want the new personality to take effect if flushing the old one fails. As a result, the x86-64 '32-bit' personality is actually done using this insane "I'm going to change the ABI, but I haven't done it yet" bit (TIF_ABI_PENDING), with SET_PERSONALITY() not actually setting the personality, but just the "pending" bit, so that "flush_thread()" can do the actual personality magic. This patch in no way changes any of that insanity, but it does split the 'flush_old_exec()' function up into a preparatory part that can fail (still called flush_old_exec()), and a new part that will actually set up the new exec environment (setup_new_exec()). All callers are changed to trivially comply with the new world order. Signed-off-by: H. Peter Anvin Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/exec.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 632b02e34ec7..675c3f44c2ea 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -941,9 +941,7 @@ void set_task_comm(struct task_struct *tsk, char *buf) int flush_old_exec(struct linux_binprm * bprm) { - char * name; - int i, ch, retval; - char tcomm[sizeof(current->comm)]; + int retval; /* * Make sure we have a private signal table and that @@ -963,6 +961,20 @@ int flush_old_exec(struct linux_binprm * bprm) goto out; bprm->mm = NULL; /* We're using it now */ + return 0; + +out: + return retval; +} +EXPORT_SYMBOL(flush_old_exec); + +void setup_new_exec(struct linux_binprm * bprm) +{ + int i, ch; + char * name; + char tcomm[sizeof(current->comm)]; + + arch_pick_mmap_layout(current->mm); /* This is the point of no return */ current->sas_ss_sp = current->sas_ss_size = 0; @@ -1019,14 +1031,8 @@ int flush_old_exec(struct linux_binprm * bprm) flush_signal_handlers(current, 0); flush_old_files(current->files); - - return 0; - -out: - return retval; } - -EXPORT_SYMBOL(flush_old_exec); +EXPORT_SYMBOL(setup_new_exec); /* * Prepare credentials and lock ->cred_guard_mutex. -- cgit v1.2.2 From 7ab02af428c2d312c0cf8fb0b01cc1eb21131a3d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 2 Feb 2010 12:37:44 -0800 Subject: Fix 'flush_old_exec()/setup_new_exec()' split Commit 221af7f87b9 ("Split 'flush_old_exec' into two functions") split the function at the point of no return - ie right where there were no more error cases to check. That made sense from a technical standpoint, but when we then also combined it with the actual personality setting going in between flush_old_exec() and setup_new_exec(), it needs to be a bit more careful. In particular, we need to make sure that we really flush the old personality bits in the 'flush' stage, rather than later in the 'setup' stage, since otherwise we might be flushing the _new_ personality state that we're just setting up. So this moves the flags and personality flushing (and 'flush_thread()', which is the arch-specific function that generally resets lazy FP state etc) of the old process into flush_old_exec(), so that it doesn't affect any state that execve() is setting up for the new process environment. This was reported by Michal Simek as breaking his Microblaze qemu environment. Reported-and-tested-by: Michal Simek Cc: Peter Anvin Signed-off-by: Linus Torvalds --- fs/exec.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 675c3f44c2ea..0790a107ff7e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -961,6 +961,11 @@ int flush_old_exec(struct linux_binprm * bprm) goto out; bprm->mm = NULL; /* We're using it now */ + + current->flags &= ~PF_RANDOMIZE; + flush_thread(); + current->personality &= ~bprm->per_clear; + return 0; out: @@ -997,9 +1002,6 @@ void setup_new_exec(struct linux_binprm * bprm) tcomm[i] = '\0'; set_task_comm(current, tcomm); - current->flags &= ~PF_RANDOMIZE; - flush_thread(); - /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on * some architectures like powerpc @@ -1015,8 +1017,6 @@ void setup_new_exec(struct linux_binprm * bprm) set_dumpable(current->mm, suid_dumpable); } - current->personality &= ~bprm->per_clear; - /* * Flush performance counters when crossing a * security domain: -- cgit v1.2.2 From 803bf5ec259941936262d10ecc84511b76a20921 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Wed, 10 Feb 2010 13:56:42 -0800 Subject: fs/exec.c: restrict initial stack space expansion to rlimit When reserving stack space for a new process, make sure we're not attempting to expand the stack by more than rlimit allows. This fixes a bug caused by b6a2fea39318e43fee84fa7b0b90d68bed92d2ba ("mm: variable length argument support") and unmasked by fc63cf237078c86214abcb2ee9926d8ad289da9b ("exec: setup_arg_pages() fails to return errors"). This bug means that when limiting the stack to less the 20*PAGE_SIZE (eg. 80K on 4K pages or 'ulimit -s 79') all processes will be killed before they start. This is particularly bad with 64K pages, where a ulimit below 1280K will kill every process. To test, do: 'ulimit -s 15; ls' before and after the patch is applied. Before it's applied, 'ls' should be killed. After the patch is applied, 'ls' should no longer be killed. A stack limit of 15KB since it's small enough to trigger 20*PAGE_SIZE. Also 15KB not a multiple of PAGE_SIZE, which is a trickier case to handle correctly with this code. 4K pages should be fine to test with. [kosaki.motohiro@jp.fujitsu.com: cleanup] [akpm@linux-foundation.org: cleanup cleanup] Signed-off-by: Michael Neuling Signed-off-by: KOSAKI Motohiro Cc: Americo Wang Cc: Anton Blanchard Cc: Oleg Nesterov Cc: James Morris Cc: Ingo Molnar Cc: Serge Hallyn Cc: Benjamin Herrenschmidt Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 0790a107ff7e..e95c692ef0e4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -571,6 +571,9 @@ int setup_arg_pages(struct linux_binprm *bprm, struct vm_area_struct *prev = NULL; unsigned long vm_flags; unsigned long stack_base; + unsigned long stack_size; + unsigned long stack_expand; + unsigned long rlim_stack; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size to 1GB */ @@ -627,10 +630,24 @@ int setup_arg_pages(struct linux_binprm *bprm, goto out_unlock; } + stack_expand = EXTRA_STACK_VM_PAGES * PAGE_SIZE; + stack_size = vma->vm_end - vma->vm_start; + /* + * Align this down to a page boundary as expand_stack + * will align it up. + */ + rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK; + rlim_stack = min(rlim_stack, stack_size); #ifdef CONFIG_STACK_GROWSUP - stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE; + if (stack_size + stack_expand > rlim_stack) + stack_base = vma->vm_start + rlim_stack; + else + stack_base = vma->vm_end + stack_expand; #else - stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE; + if (stack_size + stack_expand > rlim_stack) + stack_base = vma->vm_end - rlim_stack; + else + stack_base = vma->vm_start - stack_expand; #endif ret = expand_stack(vma, stack_base); if (ret) -- cgit v1.2.2