summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChristian Brauner <christian@brauner.io>2019-05-25 05:36:41 -0400
committerChristian Brauner <christian@brauner.io>2019-06-09 03:29:28 -0400
commit7f192e3cd316ba58c88dfa26796cf77789dd9872 (patch)
tree5393310e05423aa017a7fea1e80b7dc256c77239
parenta188339ca5a396acc588e5851ed7e19f66b0ebd9 (diff)
fork: add clone3
This adds the clone3 system call. As mentioned several times already (cf. [7], [8]) here's the promised patchset for clone3(). We recently merged the CLONE_PIDFD patchset (cf. [1]). It took the last free flag from clone(). Independent of the CLONE_PIDFD patchset a time namespace has been discussed at the Linux Plumbers Conference last year and has been sent out and reviewed (cf. [5]). It is expected that it will go upstream in the not too distant future. However, it relies on the addition of the CLONE_NEWTIME flag to clone(). The only other good candidate - CLONE_DETACHED - is currently not recyclable as we have identified at least two large or widely used codebases that currently pass this flag (cf. [2], [3], and [4]). Given that CLONE_PIDFD grabbed the last clone() flag the time namespace is effectively blocked. clone3() has the advantage that it will unblock this patchset again. In general, clone3() is extensible and allows for the implementation of new features. The idea is to keep clone3() very simple and close to the original clone(), specifically, to keep on supporting old clone()-based workloads. We know there have been various creative proposals for what a new process creation syscall or even API is supposed to look like. Some people even go so far as to argue that the traditional fork()+exec() split should be abandoned in favor of an in-kernel version of spawn(). Independent of whether or not we personally think spawn() is a good idea, this patchset does not have, and does not want to have, anything to do with this. One stance we take is that there's no real good alternative to clone()+exec() and we need and want to support this model going forward; independent of spawn(). 
The following requirements guided clone3(): - bump the number of available flags - move arguments that are currently passed as separate arguments in clone() into a dedicated struct clone_args - choose a struct layout that is easy to handle on 32 and on 64 bit - choose a struct layout that is extensible - give new flags that currently need to abuse another flag's dedicated return argument in clone() their own dedicated return argument (e.g. CLONE_PIDFD) - use a separate kernel internal struct kernel_clone_args that is properly typed according to current kernel conventions in fork.c and is different from the uapi struct clone_args - port _do_fork() to use kernel_clone_args so that all process creation syscalls such as fork(), vfork(), clone(), and clone3() behave identically (Arnd suggested that we can probably also port do_fork() itself in a separate patchset.) - ease of transition for userspace from clone() to clone3() This very much means that we do *not* remove functionality that userspace currently relies on as the latter is a good way of creating a syscall that won't be adopted. - do not try to be clever or complex: keep clone3() as dumb as possible In accordance with Linus' suggestions (cf. [11]), clone3() has the following signature: /* uapi */ struct clone_args { __aligned_u64 flags; __aligned_u64 pidfd; __aligned_u64 child_tid; __aligned_u64 parent_tid; __aligned_u64 exit_signal; __aligned_u64 stack; __aligned_u64 stack_size; __aligned_u64 tls; }; /* kernel internal */ struct kernel_clone_args { u64 flags; int __user *pidfd; int __user *child_tid; int __user *parent_tid; int exit_signal; unsigned long stack; unsigned long stack_size; unsigned long tls; }; long sys_clone3(struct clone_args __user *uargs, size_t size) clone3() cleanly supports all of the supported flags from clone() and thus all legacy workloads. The advantage of sticking close to the old clone() is the low cost for userspace to switch to this new api. Quite a lot of userspace apis (e.g. 
pthreads) are based on the clone() syscall. The new clone3() syscall supporting all of the old workloads and opening up the ability to add new features should make switching to it more appealing for userspace. In essence, glibc can just write a simple wrapper to switch from clone() to clone3(). There has been some interest in this patchset already. We have received a patch from the CRIU corner for clone3() that would set the PID/TID of a restored process without /proc/sys/kernel/ns_last_pid to eliminate a race. /* User visible differences to legacy clone() */ - CLONE_DETACHED will cause EINVAL with clone3() - CSIGNAL is deprecated It is superseded by a dedicated "exit_signal" argument in struct clone_args freeing up space for additional flags. This is based on a suggestion from Andrei and Linus (cf. [9] and [10]) /* References */ [1]: b3e5838252665ee4cfa76b82bdf1198dca81e5be [2]: https://dxr.mozilla.org/mozilla-central/source/security/sandbox/linux/SandboxFilter.cpp#343 [3]: https://git.musl-libc.org/cgit/musl/tree/src/thread/pthread_create.c#n233 [4]: https://sources.debian.org/src/blcr/0.8.5-2.3/cr_module/cr_dump_self.c/?hl=740#L740 [5]: https://lore.kernel.org/lkml/20190425161416.26600-1-dima@arista.com/ [6]: https://lore.kernel.org/lkml/20190425161416.26600-2-dima@arista.com/ [7]: https://lore.kernel.org/lkml/CAHrFyr5HxpGXA2YrKza-oB-GGwJCqwPfyhD-Y5wbktWZdt0sGQ@mail.gmail.com/ [8]: https://lore.kernel.org/lkml/20190524102756.qjsjxukuq2f4t6bo@brauner.io/ [9]: https://lore.kernel.org/lkml/20190529222414.GA6492@gmail.com/ [10]: https://lore.kernel.org/lkml/CAHk-=whQP-Ykxi=zSYaV9iXsHsENa+2fdj-zYKwyeyed63Lsfw@mail.gmail.com/ [11]: https://lore.kernel.org/lkml/CAHk-=wieuV4hGwznPsX-8E0G2FKhx3NjZ9X3dTKh5zKd+iqOBw@mail.gmail.com/ Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Christian Brauner <christian@brauner.io> Acked-by: Arnd Bergmann <arnd@arndb.de> Acked-by: Serge Hallyn <serge@hallyn.com> Cc: Kees Cook <keescook@chromium.org> 
Cc: Pavel Emelyanov <xemul@virtuozzo.com> Cc: Jann Horn <jannh@google.com> Cc: David Howells <dhowells@redhat.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Adrian Reber <adrian@lisas.de> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andrei Vagin <avagin@gmail.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Florian Weimer <fweimer@redhat.com> Cc: linux-api@vger.kernel.org
-rw-r--r--arch/x86/ia32/sys_ia32.c12
-rw-r--r--include/linux/sched/task.h17
-rw-r--r--include/linux/syscalls.h4
-rw-r--r--include/uapi/linux/sched.h16
-rw-r--r--kernel/fork.c201
5 files changed, 199 insertions, 51 deletions
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index a43212036257..64a6c952091e 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -237,6 +237,14 @@ COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags,
237 unsigned long, newsp, int __user *, parent_tidptr, 237 unsigned long, newsp, int __user *, parent_tidptr,
238 unsigned long, tls_val, int __user *, child_tidptr) 238 unsigned long, tls_val, int __user *, child_tidptr)
239{ 239{
240 return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, 240 struct kernel_clone_args args = {
241 tls_val); 241 .flags = (clone_flags & ~CSIGNAL),
242 .child_tid = child_tidptr,
243 .parent_tid = parent_tidptr,
244 .exit_signal = (clone_flags & CSIGNAL),
245 .stack = newsp,
246 .tls = tls_val,
247 };
248
249 return _do_fork(&args);
242} 250}
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index f1227f2c38a4..109a0df5af39 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -8,11 +8,26 @@
8 */ 8 */
9 9
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/uaccess.h>
11 12
12struct task_struct; 13struct task_struct;
13struct rusage; 14struct rusage;
14union thread_union; 15union thread_union;
15 16
17/* All the bits taken by the old clone syscall. */
18#define CLONE_LEGACY_FLAGS 0xffffffffULL
19
20struct kernel_clone_args {
21 u64 flags;
22 int __user *pidfd;
23 int __user *child_tid;
24 int __user *parent_tid;
25 int exit_signal;
26 unsigned long stack;
27 unsigned long stack_size;
28 unsigned long tls;
29};
30
16/* 31/*
17 * This serializes "schedule()" and also protects 32 * This serializes "schedule()" and also protects
18 * the run-queue from deletions/modifications (but 33 * the run-queue from deletions/modifications (but
@@ -73,7 +88,7 @@ extern void do_group_exit(int);
73extern void exit_files(struct task_struct *); 88extern void exit_files(struct task_struct *);
74extern void exit_itimers(struct signal_struct *); 89extern void exit_itimers(struct signal_struct *);
75 90
76extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long); 91extern long _do_fork(struct kernel_clone_args *kargs);
77extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); 92extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
78struct task_struct *fork_idle(int); 93struct task_struct *fork_idle(int);
79struct mm_struct *copy_init_mm(void); 94struct mm_struct *copy_init_mm(void);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e2870fe1be5b..60a81f374ca3 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -70,6 +70,7 @@ struct sigaltstack;
70struct rseq; 70struct rseq;
71union bpf_attr; 71union bpf_attr;
72struct io_uring_params; 72struct io_uring_params;
73struct clone_args;
73 74
74#include <linux/types.h> 75#include <linux/types.h>
75#include <linux/aio_abi.h> 76#include <linux/aio_abi.h>
@@ -852,6 +853,9 @@ asmlinkage long sys_clone(unsigned long, unsigned long, int __user *,
852 int __user *, unsigned long); 853 int __user *, unsigned long);
853#endif 854#endif
854#endif 855#endif
856
857asmlinkage long sys_clone3(struct clone_args __user *uargs, size_t size);
858
855asmlinkage long sys_execve(const char __user *filename, 859asmlinkage long sys_execve(const char __user *filename,
856 const char __user *const __user *argv, 860 const char __user *const __user *argv,
857 const char __user *const __user *envp); 861 const char __user *const __user *envp);
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index ed4ee170bee2..f5331dbdcaa2 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -2,6 +2,8 @@
2#ifndef _UAPI_LINUX_SCHED_H 2#ifndef _UAPI_LINUX_SCHED_H
3#define _UAPI_LINUX_SCHED_H 3#define _UAPI_LINUX_SCHED_H
4 4
5#include <linux/types.h>
6
5/* 7/*
6 * cloning flags: 8 * cloning flags:
7 */ 9 */
@@ -32,6 +34,20 @@
32#define CLONE_IO 0x80000000 /* Clone io context */ 34#define CLONE_IO 0x80000000 /* Clone io context */
33 35
34/* 36/*
37 * Arguments for the clone3 syscall
38 */
39struct clone_args {
40 __aligned_u64 flags;
41 __aligned_u64 pidfd;
42 __aligned_u64 child_tid;
43 __aligned_u64 parent_tid;
44 __aligned_u64 exit_signal;
45 __aligned_u64 stack;
46 __aligned_u64 stack_size;
47 __aligned_u64 tls;
48};
49
50/*
35 * Scheduling policies 51 * Scheduling policies
36 */ 52 */
37#define SCHED_NORMAL 0 53#define SCHED_NORMAL 0
diff --git a/kernel/fork.c b/kernel/fork.c
index b4cba953040a..08ff131f26b4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1760,19 +1760,15 @@ static __always_inline void delayed_free_task(struct task_struct *tsk)
1760 * flags). The actual kick-off is left to the caller. 1760 * flags). The actual kick-off is left to the caller.
1761 */ 1761 */
1762static __latent_entropy struct task_struct *copy_process( 1762static __latent_entropy struct task_struct *copy_process(
1763 unsigned long clone_flags,
1764 unsigned long stack_start,
1765 unsigned long stack_size,
1766 int __user *parent_tidptr,
1767 int __user *child_tidptr,
1768 struct pid *pid, 1763 struct pid *pid,
1769 int trace, 1764 int trace,
1770 unsigned long tls, 1765 int node,
1771 int node) 1766 struct kernel_clone_args *args)
1772{ 1767{
1773 int pidfd = -1, retval; 1768 int pidfd = -1, retval;
1774 struct task_struct *p; 1769 struct task_struct *p;
1775 struct multiprocess_signals delayed; 1770 struct multiprocess_signals delayed;
1771 u64 clone_flags = args->flags;
1776 1772
1777 /* 1773 /*
1778 * Don't allow sharing the root directory with processes in a different 1774 * Don't allow sharing the root directory with processes in a different
@@ -1821,27 +1817,12 @@ static __latent_entropy struct task_struct *copy_process(
1821 } 1817 }
1822 1818
1823 if (clone_flags & CLONE_PIDFD) { 1819 if (clone_flags & CLONE_PIDFD) {
1824 int reserved;
1825
1826 /* 1820 /*
1827 * - CLONE_PARENT_SETTID is useless for pidfds and also
1828 * parent_tidptr is used to return pidfds.
1829 * - CLONE_DETACHED is blocked so that we can potentially 1821 * - CLONE_DETACHED is blocked so that we can potentially
1830 * reuse it later for CLONE_PIDFD. 1822 * reuse it later for CLONE_PIDFD.
1831 * - CLONE_THREAD is blocked until someone really needs it. 1823 * - CLONE_THREAD is blocked until someone really needs it.
1832 */ 1824 */
1833 if (clone_flags & 1825 if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
1834 (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
1835 return ERR_PTR(-EINVAL);
1836
1837 /*
1838 * Verify that parent_tidptr is sane so we can potentially
1839 * reuse it later.
1840 */
1841 if (get_user(reserved, parent_tidptr))
1842 return ERR_PTR(-EFAULT);
1843
1844 if (reserved != 0)
1845 return ERR_PTR(-EINVAL); 1826 return ERR_PTR(-EINVAL);
1846 } 1827 }
1847 1828
@@ -1874,11 +1855,11 @@ static __latent_entropy struct task_struct *copy_process(
1874 * p->set_child_tid which is (ab)used as a kthread's data pointer for 1855 * p->set_child_tid which is (ab)used as a kthread's data pointer for
1875 * kernel threads (PF_KTHREAD). 1856 * kernel threads (PF_KTHREAD).
1876 */ 1857 */
1877 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1858 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
1878 /* 1859 /*
1879 * Clear TID on mm_release()? 1860 * Clear TID on mm_release()?
1880 */ 1861 */
1881 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; 1862 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
1882 1863
1883 ftrace_graph_init_task(p); 1864 ftrace_graph_init_task(p);
1884 1865
@@ -2037,7 +2018,8 @@ static __latent_entropy struct task_struct *copy_process(
2037 retval = copy_io(clone_flags, p); 2018 retval = copy_io(clone_flags, p);
2038 if (retval) 2019 if (retval)
2039 goto bad_fork_cleanup_namespaces; 2020 goto bad_fork_cleanup_namespaces;
2040 retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls); 2021 retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
2022 args->tls);
2041 if (retval) 2023 if (retval)
2042 goto bad_fork_cleanup_io; 2024 goto bad_fork_cleanup_io;
2043 2025
@@ -2062,7 +2044,7 @@ static __latent_entropy struct task_struct *copy_process(
2062 goto bad_fork_free_pid; 2044 goto bad_fork_free_pid;
2063 2045
2064 pidfd = retval; 2046 pidfd = retval;
2065 retval = put_user(pidfd, parent_tidptr); 2047 retval = put_user(pidfd, args->pidfd);
2066 if (retval) 2048 if (retval)
2067 goto bad_fork_put_pidfd; 2049 goto bad_fork_put_pidfd;
2068 } 2050 }
@@ -2105,7 +2087,7 @@ static __latent_entropy struct task_struct *copy_process(
2105 if (clone_flags & CLONE_PARENT) 2087 if (clone_flags & CLONE_PARENT)
2106 p->exit_signal = current->group_leader->exit_signal; 2088 p->exit_signal = current->group_leader->exit_signal;
2107 else 2089 else
2108 p->exit_signal = (clone_flags & CSIGNAL); 2090 p->exit_signal = args->exit_signal;
2109 p->group_leader = p; 2091 p->group_leader = p;
2110 p->tgid = p->pid; 2092 p->tgid = p->pid;
2111 } 2093 }
@@ -2313,8 +2295,11 @@ static inline void init_idle_pids(struct task_struct *idle)
2313struct task_struct *fork_idle(int cpu) 2295struct task_struct *fork_idle(int cpu)
2314{ 2296{
2315 struct task_struct *task; 2297 struct task_struct *task;
2316 task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0, 2298 struct kernel_clone_args args = {
2317 cpu_to_node(cpu)); 2299 .flags = CLONE_VM,
2300 };
2301
2302 task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
2318 if (!IS_ERR(task)) { 2303 if (!IS_ERR(task)) {
2319 init_idle_pids(task); 2304 init_idle_pids(task);
2320 init_idle(task, cpu); 2305 init_idle(task, cpu);
@@ -2334,13 +2319,9 @@ struct mm_struct *copy_init_mm(void)
2334 * It copies the process, and if successful kick-starts 2319 * It copies the process, and if successful kick-starts
2335 * it and waits for it to finish using the VM if required. 2320 * it and waits for it to finish using the VM if required.
2336 */ 2321 */
2337long _do_fork(unsigned long clone_flags, 2322long _do_fork(struct kernel_clone_args *args)
2338 unsigned long stack_start,
2339 unsigned long stack_size,
2340 int __user *parent_tidptr,
2341 int __user *child_tidptr,
2342 unsigned long tls)
2343{ 2323{
2324 u64 clone_flags = args->flags;
2344 struct completion vfork; 2325 struct completion vfork;
2345 struct pid *pid; 2326 struct pid *pid;
2346 struct task_struct *p; 2327 struct task_struct *p;
@@ -2356,7 +2337,7 @@ long _do_fork(unsigned long clone_flags,
2356 if (!(clone_flags & CLONE_UNTRACED)) { 2337 if (!(clone_flags & CLONE_UNTRACED)) {
2357 if (clone_flags & CLONE_VFORK) 2338 if (clone_flags & CLONE_VFORK)
2358 trace = PTRACE_EVENT_VFORK; 2339 trace = PTRACE_EVENT_VFORK;
2359 else if ((clone_flags & CSIGNAL) != SIGCHLD) 2340 else if (args->exit_signal != SIGCHLD)
2360 trace = PTRACE_EVENT_CLONE; 2341 trace = PTRACE_EVENT_CLONE;
2361 else 2342 else
2362 trace = PTRACE_EVENT_FORK; 2343 trace = PTRACE_EVENT_FORK;
@@ -2365,8 +2346,7 @@ long _do_fork(unsigned long clone_flags,
2365 trace = 0; 2346 trace = 0;
2366 } 2347 }
2367 2348
2368 p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr, 2349 p = copy_process(NULL, trace, NUMA_NO_NODE, args);
2369 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
2370 add_latent_entropy(); 2350 add_latent_entropy();
2371 2351
2372 if (IS_ERR(p)) 2352 if (IS_ERR(p))
@@ -2382,7 +2362,7 @@ long _do_fork(unsigned long clone_flags,
2382 nr = pid_vnr(pid); 2362 nr = pid_vnr(pid);
2383 2363
2384 if (clone_flags & CLONE_PARENT_SETTID) 2364 if (clone_flags & CLONE_PARENT_SETTID)
2385 put_user(nr, parent_tidptr); 2365 put_user(nr, args->parent_tid);
2386 2366
2387 if (clone_flags & CLONE_VFORK) { 2367 if (clone_flags & CLONE_VFORK) {
2388 p->vfork_done = &vfork; 2368 p->vfork_done = &vfork;
@@ -2414,8 +2394,16 @@ long do_fork(unsigned long clone_flags,
2414 int __user *parent_tidptr, 2394 int __user *parent_tidptr,
2415 int __user *child_tidptr) 2395 int __user *child_tidptr)
2416{ 2396{
2417 return _do_fork(clone_flags, stack_start, stack_size, 2397 struct kernel_clone_args args = {
2418 parent_tidptr, child_tidptr, 0); 2398 .flags = (clone_flags & ~CSIGNAL),
2399 .child_tid = child_tidptr,
2400 .parent_tid = parent_tidptr,
2401 .exit_signal = (clone_flags & CSIGNAL),
2402 .stack = stack_start,
2403 .stack_size = stack_size,
2404 };
2405
2406 return _do_fork(&args);
2419} 2407}
2420#endif 2408#endif
2421 2409
@@ -2424,15 +2412,25 @@ long do_fork(unsigned long clone_flags,
2424 */ 2412 */
2425pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) 2413pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
2426{ 2414{
2427 return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, 2415 struct kernel_clone_args args = {
2428 (unsigned long)arg, NULL, NULL, 0); 2416 .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
2417 .exit_signal = (flags & CSIGNAL),
2418 .stack = (unsigned long)fn,
2419 .stack_size = (unsigned long)arg,
2420 };
2421
2422 return _do_fork(&args);
2429} 2423}
2430 2424
2431#ifdef __ARCH_WANT_SYS_FORK 2425#ifdef __ARCH_WANT_SYS_FORK
2432SYSCALL_DEFINE0(fork) 2426SYSCALL_DEFINE0(fork)
2433{ 2427{
2434#ifdef CONFIG_MMU 2428#ifdef CONFIG_MMU
2435 return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0); 2429 struct kernel_clone_args args = {
2430 .exit_signal = SIGCHLD,
2431 };
2432
2433 return _do_fork(&args);
2436#else 2434#else
2437 /* can not support in nommu mode */ 2435 /* can not support in nommu mode */
2438 return -EINVAL; 2436 return -EINVAL;
@@ -2443,8 +2441,12 @@ SYSCALL_DEFINE0(fork)
2443#ifdef __ARCH_WANT_SYS_VFORK 2441#ifdef __ARCH_WANT_SYS_VFORK
2444SYSCALL_DEFINE0(vfork) 2442SYSCALL_DEFINE0(vfork)
2445{ 2443{
2446 return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, 2444 struct kernel_clone_args args = {
2447 0, NULL, NULL, 0); 2445 .flags = CLONE_VFORK | CLONE_VM,
2446 .exit_signal = SIGCHLD,
2447 };
2448
2449 return _do_fork(&args);
2448} 2450}
2449#endif 2451#endif
2450 2452
@@ -2472,7 +2474,110 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2472 unsigned long, tls) 2474 unsigned long, tls)
2473#endif 2475#endif
2474{ 2476{
2475 return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls); 2477 struct kernel_clone_args args = {
2478 .flags = (clone_flags & ~CSIGNAL),
2479 .pidfd = parent_tidptr,
2480 .child_tid = child_tidptr,
2481 .parent_tid = parent_tidptr,
2482 .exit_signal = (clone_flags & CSIGNAL),
2483 .stack = newsp,
2484 .tls = tls,
2485 };
2486
2487 /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
2488 if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID))
2489 return -EINVAL;
2490
2491 return _do_fork(&args);
2492}
2493
2494noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
2495 struct clone_args __user *uargs,
2496 size_t size)
2497{
2498 struct clone_args args;
2499
2500 if (unlikely(size > PAGE_SIZE))
2501 return -E2BIG;
2502
2503 if (unlikely(size < sizeof(struct clone_args)))
2504 return -EINVAL;
2505
2506 if (unlikely(!access_ok(uargs, size)))
2507 return -EFAULT;
2508
2509 if (size > sizeof(struct clone_args)) {
2510 unsigned char __user *addr;
2511 unsigned char __user *end;
2512 unsigned char val;
2513
2514 addr = (void __user *)uargs + sizeof(struct clone_args);
2515 end = (void __user *)uargs + size;
2516
2517 for (; addr < end; addr++) {
2518 if (get_user(val, addr))
2519 return -EFAULT;
2520 if (val)
2521 return -E2BIG;
2522 }
2523
2524 size = sizeof(struct clone_args);
2525 }
2526
2527 if (copy_from_user(&args, uargs, size))
2528 return -EFAULT;
2529
2530 *kargs = (struct kernel_clone_args){
2531 .flags = args.flags,
2532 .pidfd = u64_to_user_ptr(args.pidfd),
2533 .child_tid = u64_to_user_ptr(args.child_tid),
2534 .parent_tid = u64_to_user_ptr(args.parent_tid),
2535 .exit_signal = args.exit_signal,
2536 .stack = args.stack,
2537 .stack_size = args.stack_size,
2538 .tls = args.tls,
2539 };
2540
2541 return 0;
2542}
2543
2544static bool clone3_args_valid(const struct kernel_clone_args *kargs)
2545{
2546 /*
2547 * All lower bits of the flag word are taken.
2548 * Verify that no other unknown flags are passed along.
2549 */
2550 if (kargs->flags & ~CLONE_LEGACY_FLAGS)
2551 return false;
2552
2553 /*
2554 * - make the CLONE_DETACHED bit reuseable for clone3
2555 * - make the CSIGNAL bits reuseable for clone3
2556 */
2557 if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
2558 return false;
2559
2560 if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
2561 kargs->exit_signal)
2562 return false;
2563
2564 return true;
2565}
2566
2567SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
2568{
2569 int err;
2570
2571 struct kernel_clone_args kargs;
2572
2573 err = copy_clone_args_from_user(&kargs, uargs, size);
2574 if (err)
2575 return err;
2576
2577 if (!clone3_args_valid(&kargs))
2578 return -EINVAL;
2579
2580 return _do_fork(&kargs);
2476} 2581}
2477#endif 2582#endif
2478 2583