summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-07-11 13:09:44 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2019-07-11 13:09:44 -0400
commit8f6ccf6159aed1f04c6d179f61f6fb2691261e84 (patch)
tree449c6d9cddc6f94c6450a18885b6a06e8a67d845
parent5450e8a316a64cddcbc15f90733ebc78aa736545 (diff)
parentd68dbb0c9ac8b1ff52eb09aa58ce6358400fa939 (diff)
Merge tag 'clone3-v5.3' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux
Pull clone3 system call from Christian Brauner: "This adds the clone3 syscall which is an extensible successor to clone after we snagged the last flag with CLONE_PIDFD during the 5.2 merge window for clone(). It cleanly supports all of the flags from clone() and thus all legacy workloads. There are a few user-visible differences between clone3 and clone. First, CLONE_DETACHED will cause EINVAL with clone3 so we can reuse this flag. Second, the CSIGNAL flag is deprecated and will cause EINVAL to be reported. It is superseded by a dedicated "exit_signal" argument in struct clone_args thus freeing up even more flags. And third, clone3 gives CLONE_PIDFD a dedicated return argument in struct clone_args instead of abusing CLONE_PARENT_SETTID's parent_tidptr argument. The clone3 uapi is designed to be easy to handle on 32- and 64-bit: /* uapi */ struct clone_args { __aligned_u64 flags; __aligned_u64 pidfd; __aligned_u64 child_tid; __aligned_u64 parent_tid; __aligned_u64 exit_signal; __aligned_u64 stack; __aligned_u64 stack_size; __aligned_u64 tls; }; and a separate kernel struct is used that uses proper kernel typing: /* kernel internal */ struct kernel_clone_args { u64 flags; int __user *pidfd; int __user *child_tid; int __user *parent_tid; int exit_signal; unsigned long stack; unsigned long stack_size; unsigned long tls; }; The system call comes with a size argument which enables the kernel to detect what version of clone_args userspace is passing in. clone3 validates that any additional bytes a given kernel does not know about are set to zero and that the size never exceeds a page. A nice feature is that this patchset allowed us to clean up and simplify various core kernel codepaths in kernel/fork.c by making the internal _do_fork() function take struct kernel_clone_args even for legacy clone(). This patch also unblocks the time namespace patchset which wants to introduce a new CLONE_TIMENS flag. 
Note that clone3 has only been wired up for x86{_32,64}, arm{64}, and xtensa. These were the architectures that did not require special massaging. Other architectures treat fork-like system calls individually and after some back and forth neither Arnd nor I felt confident that we dared to add clone3 unconditionally to all architectures. We agreed to leave this up to individual architecture maintainers. This is why there's an additional patch that introduces __ARCH_WANT_SYS_CLONE3 which any architecture can set once it has implemented support for clone3. The patch also adds a cond_syscall(clone3) for architectures such as nios2 or h8300 that generate their syscall table by simply including asm-generic/unistd.h. The hope is to get rid of __ARCH_WANT_SYS_CLONE3 and cond_syscall() rather soon" * tag 'clone3-v5.3' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux: arch: handle arches who do not yet define clone3 arch: wire-up clone3() syscall fork: add clone3
-rw-r--r--arch/arm/include/asm/unistd.h1
-rw-r--r--arch/arm/tools/syscall.tbl1
-rw-r--r--arch/arm64/include/asm/unistd.h3
-rw-r--r--arch/arm64/include/asm/unistd32.h2
-rw-r--r--arch/microblaze/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--arch/x86/ia32/sys_ia32.c12
-rw-r--r--arch/x86/include/asm/unistd.h1
-rw-r--r--arch/xtensa/include/asm/unistd.h1
-rw-r--r--arch/xtensa/kernel/syscalls/syscall.tbl1
-rw-r--r--include/linux/sched/task.h17
-rw-r--r--include/linux/syscalls.h4
-rw-r--r--include/uapi/asm-generic/unistd.h4
-rw-r--r--include/uapi/linux/sched.h16
-rw-r--r--kernel/fork.c191
-rw-r--r--kernel/sys_ni.c2
17 files changed, 218 insertions, 41 deletions
diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h
index 9fb00973c608..3676e82cf95c 100644
--- a/arch/arm/include/asm/unistd.h
+++ b/arch/arm/include/asm/unistd.h
@@ -37,6 +37,7 @@
37#define __ARCH_WANT_SYS_FORK 37#define __ARCH_WANT_SYS_FORK
38#define __ARCH_WANT_SYS_VFORK 38#define __ARCH_WANT_SYS_VFORK
39#define __ARCH_WANT_SYS_CLONE 39#define __ARCH_WANT_SYS_CLONE
40#define __ARCH_WANT_SYS_CLONE3
40 41
41/* 42/*
42 * Unimplemented (or alternatively implemented) syscalls 43 * Unimplemented (or alternatively implemented) syscalls
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 81e6e1817c45..6da7dc4d79cc 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -448,3 +448,4 @@
448432 common fsmount sys_fsmount 448432 common fsmount sys_fsmount
449433 common fspick sys_fspick 449433 common fspick sys_fspick
450434 common pidfd_open sys_pidfd_open 450434 common pidfd_open sys_pidfd_open
451435 common clone3 sys_clone3
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index ede7b88d4f15..2629a68b8724 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -38,10 +38,11 @@
38#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) 38#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
39#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) 39#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
40 40
41#define __NR_compat_syscalls 435 41#define __NR_compat_syscalls 436
42#endif 42#endif
43 43
44#define __ARCH_WANT_SYS_CLONE 44#define __ARCH_WANT_SYS_CLONE
45#define __ARCH_WANT_SYS_CLONE3
45 46
46#ifndef __COMPAT_SYSCALL_NR 47#ifndef __COMPAT_SYSCALL_NR
47#include <uapi/asm/unistd.h> 48#include <uapi/asm/unistd.h>
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 52415923e08f..94ab29cf4f00 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -877,6 +877,8 @@ __SYSCALL(__NR_fsmount, sys_fsmount)
877__SYSCALL(__NR_fspick, sys_fspick) 877__SYSCALL(__NR_fspick, sys_fspick)
878#define __NR_pidfd_open 434 878#define __NR_pidfd_open 434
879__SYSCALL(__NR_pidfd_open, sys_pidfd_open) 879__SYSCALL(__NR_pidfd_open, sys_pidfd_open)
880#define __NR_clone3 435
881__SYSCALL(__NR_clone3, sys_clone3)
880 882
881/* 883/*
882 * Please add new compat syscalls above this comment and update 884 * Please add new compat syscalls above this comment and update
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index ad706f83c755..09b0cd7dab0a 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -440,3 +440,4 @@
440432 common fsmount sys_fsmount 440432 common fsmount sys_fsmount
441433 common fspick sys_fspick 441433 common fspick sys_fspick
442434 common pidfd_open sys_pidfd_open 442434 common pidfd_open sys_pidfd_open
443435 common clone3 sys_clone3
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 43e4429a5272..c00019abd076 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -439,3 +439,4 @@
439432 i386 fsmount sys_fsmount __ia32_sys_fsmount 439432 i386 fsmount sys_fsmount __ia32_sys_fsmount
440433 i386 fspick sys_fspick __ia32_sys_fspick 440433 i386 fspick sys_fspick __ia32_sys_fspick
441434 i386 pidfd_open sys_pidfd_open __ia32_sys_pidfd_open 441434 i386 pidfd_open sys_pidfd_open __ia32_sys_pidfd_open
442435 i386 clone3 sys_clone3 __ia32_sys_clone3
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 1bee0a77fdd3..c29976eca4a8 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -356,6 +356,7 @@
356432 common fsmount __x64_sys_fsmount 356432 common fsmount __x64_sys_fsmount
357433 common fspick __x64_sys_fspick 357433 common fspick __x64_sys_fspick
358434 common pidfd_open __x64_sys_pidfd_open 358434 common pidfd_open __x64_sys_pidfd_open
359435 common clone3 __x64_sys_clone3/ptregs
359 360
360# 361#
361# x32-specific system call numbers start at 512 to avoid cache impact 362# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index a43212036257..64a6c952091e 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -237,6 +237,14 @@ COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags,
237 unsigned long, newsp, int __user *, parent_tidptr, 237 unsigned long, newsp, int __user *, parent_tidptr,
238 unsigned long, tls_val, int __user *, child_tidptr) 238 unsigned long, tls_val, int __user *, child_tidptr)
239{ 239{
240 return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, 240 struct kernel_clone_args args = {
241 tls_val); 241 .flags = (clone_flags & ~CSIGNAL),
242 .child_tid = child_tidptr,
243 .parent_tid = parent_tidptr,
244 .exit_signal = (clone_flags & CSIGNAL),
245 .stack = newsp,
246 .tls = tls_val,
247 };
248
249 return _do_fork(&args);
242} 250}
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index 146859efd83c..097589753fec 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -54,5 +54,6 @@
54# define __ARCH_WANT_SYS_FORK 54# define __ARCH_WANT_SYS_FORK
55# define __ARCH_WANT_SYS_VFORK 55# define __ARCH_WANT_SYS_VFORK
56# define __ARCH_WANT_SYS_CLONE 56# define __ARCH_WANT_SYS_CLONE
57# define __ARCH_WANT_SYS_CLONE3
57 58
58#endif /* _ASM_X86_UNISTD_H */ 59#endif /* _ASM_X86_UNISTD_H */
diff --git a/arch/xtensa/include/asm/unistd.h b/arch/xtensa/include/asm/unistd.h
index 30af4dc3ce7b..b52236245e51 100644
--- a/arch/xtensa/include/asm/unistd.h
+++ b/arch/xtensa/include/asm/unistd.h
@@ -3,6 +3,7 @@
3#define _XTENSA_UNISTD_H 3#define _XTENSA_UNISTD_H
4 4
5#define __ARCH_WANT_SYS_CLONE 5#define __ARCH_WANT_SYS_CLONE
6#define __ARCH_WANT_SYS_CLONE3
6#include <uapi/asm/unistd.h> 7#include <uapi/asm/unistd.h>
7 8
8#define __ARCH_WANT_NEW_STAT 9#define __ARCH_WANT_NEW_STAT
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 782b81945ccc..25f4de729a6d 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -405,3 +405,4 @@
405432 common fsmount sys_fsmount 405432 common fsmount sys_fsmount
406433 common fspick sys_fspick 406433 common fspick sys_fspick
407434 common pidfd_open sys_pidfd_open 407434 common pidfd_open sys_pidfd_open
408435 common clone3 sys_clone3
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index f1227f2c38a4..109a0df5af39 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -8,11 +8,26 @@
8 */ 8 */
9 9
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/uaccess.h>
11 12
12struct task_struct; 13struct task_struct;
13struct rusage; 14struct rusage;
14union thread_union; 15union thread_union;
15 16
17/* All the bits taken by the old clone syscall. */
18#define CLONE_LEGACY_FLAGS 0xffffffffULL
19
20struct kernel_clone_args {
21 u64 flags;
22 int __user *pidfd;
23 int __user *child_tid;
24 int __user *parent_tid;
25 int exit_signal;
26 unsigned long stack;
27 unsigned long stack_size;
28 unsigned long tls;
29};
30
16/* 31/*
17 * This serializes "schedule()" and also protects 32 * This serializes "schedule()" and also protects
18 * the run-queue from deletions/modifications (but 33 * the run-queue from deletions/modifications (but
@@ -73,7 +88,7 @@ extern void do_group_exit(int);
73extern void exit_files(struct task_struct *); 88extern void exit_files(struct task_struct *);
74extern void exit_itimers(struct signal_struct *); 89extern void exit_itimers(struct signal_struct *);
75 90
76extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long); 91extern long _do_fork(struct kernel_clone_args *kargs);
77extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); 92extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
78struct task_struct *fork_idle(int); 93struct task_struct *fork_idle(int);
79struct mm_struct *copy_init_mm(void); 94struct mm_struct *copy_init_mm(void);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 699aed6674a0..b01d54a5732e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -68,6 +68,7 @@ struct sigaltstack;
68struct rseq; 68struct rseq;
69union bpf_attr; 69union bpf_attr;
70struct io_uring_params; 70struct io_uring_params;
71struct clone_args;
71 72
72#include <linux/types.h> 73#include <linux/types.h>
73#include <linux/aio_abi.h> 74#include <linux/aio_abi.h>
@@ -850,6 +851,9 @@ asmlinkage long sys_clone(unsigned long, unsigned long, int __user *,
850 int __user *, unsigned long); 851 int __user *, unsigned long);
851#endif 852#endif
852#endif 853#endif
854
855asmlinkage long sys_clone3(struct clone_args __user *uargs, size_t size);
856
853asmlinkage long sys_execve(const char __user *filename, 857asmlinkage long sys_execve(const char __user *filename,
854 const char __user *const __user *argv, 858 const char __user *const __user *argv,
855 const char __user *const __user *envp); 859 const char __user *const __user *envp);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index e5684a4512c0..9acfff0cd153 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -846,9 +846,11 @@ __SYSCALL(__NR_fsmount, sys_fsmount)
846__SYSCALL(__NR_fspick, sys_fspick) 846__SYSCALL(__NR_fspick, sys_fspick)
847#define __NR_pidfd_open 434 847#define __NR_pidfd_open 434
848__SYSCALL(__NR_pidfd_open, sys_pidfd_open) 848__SYSCALL(__NR_pidfd_open, sys_pidfd_open)
849#define __NR_clone3 435
850__SYSCALL(__NR_clone3, sys_clone3)
849 851
850#undef __NR_syscalls 852#undef __NR_syscalls
851#define __NR_syscalls 435 853#define __NR_syscalls 436
852 854
853/* 855/*
854 * 32 bit systems traditionally used different 856 * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 617bb59aa8ba..b3105ac1381a 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -2,6 +2,8 @@
2#ifndef _UAPI_LINUX_SCHED_H 2#ifndef _UAPI_LINUX_SCHED_H
3#define _UAPI_LINUX_SCHED_H 3#define _UAPI_LINUX_SCHED_H
4 4
5#include <linux/types.h>
6
5/* 7/*
6 * cloning flags: 8 * cloning flags:
7 */ 9 */
@@ -32,6 +34,20 @@
32#define CLONE_IO 0x80000000 /* Clone io context */ 34#define CLONE_IO 0x80000000 /* Clone io context */
33 35
34/* 36/*
37 * Arguments for the clone3 syscall
38 */
39struct clone_args {
40 __aligned_u64 flags;
41 __aligned_u64 pidfd;
42 __aligned_u64 child_tid;
43 __aligned_u64 parent_tid;
44 __aligned_u64 exit_signal;
45 __aligned_u64 stack;
46 __aligned_u64 stack_size;
47 __aligned_u64 tls;
48};
49
50/*
35 * Scheduling policies 51 * Scheduling policies
36 */ 52 */
37#define SCHED_NORMAL 0 53#define SCHED_NORMAL 0
diff --git a/kernel/fork.c b/kernel/fork.c
index 187c02ce534c..8f3e2d97d771 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1768,20 +1768,16 @@ static __always_inline void delayed_free_task(struct task_struct *tsk)
1768 * flags). The actual kick-off is left to the caller. 1768 * flags). The actual kick-off is left to the caller.
1769 */ 1769 */
1770static __latent_entropy struct task_struct *copy_process( 1770static __latent_entropy struct task_struct *copy_process(
1771 unsigned long clone_flags,
1772 unsigned long stack_start,
1773 unsigned long stack_size,
1774 int __user *parent_tidptr,
1775 int __user *child_tidptr,
1776 struct pid *pid, 1771 struct pid *pid,
1777 int trace, 1772 int trace,
1778 unsigned long tls, 1773 int node,
1779 int node) 1774 struct kernel_clone_args *args)
1780{ 1775{
1781 int pidfd = -1, retval; 1776 int pidfd = -1, retval;
1782 struct task_struct *p; 1777 struct task_struct *p;
1783 struct multiprocess_signals delayed; 1778 struct multiprocess_signals delayed;
1784 struct file *pidfile = NULL; 1779 struct file *pidfile = NULL;
1780 u64 clone_flags = args->flags;
1785 1781
1786 /* 1782 /*
1787 * Don't allow sharing the root directory with processes in a different 1783 * Don't allow sharing the root directory with processes in a different
@@ -1831,14 +1827,11 @@ static __latent_entropy struct task_struct *copy_process(
1831 1827
1832 if (clone_flags & CLONE_PIDFD) { 1828 if (clone_flags & CLONE_PIDFD) {
1833 /* 1829 /*
1834 * - CLONE_PARENT_SETTID is useless for pidfds and also
1835 * parent_tidptr is used to return pidfds.
1836 * - CLONE_DETACHED is blocked so that we can potentially 1830 * - CLONE_DETACHED is blocked so that we can potentially
1837 * reuse it later for CLONE_PIDFD. 1831 * reuse it later for CLONE_PIDFD.
1838 * - CLONE_THREAD is blocked until someone really needs it. 1832 * - CLONE_THREAD is blocked until someone really needs it.
1839 */ 1833 */
1840 if (clone_flags & 1834 if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
1841 (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
1842 return ERR_PTR(-EINVAL); 1835 return ERR_PTR(-EINVAL);
1843 } 1836 }
1844 1837
@@ -1871,11 +1864,11 @@ static __latent_entropy struct task_struct *copy_process(
1871 * p->set_child_tid which is (ab)used as a kthread's data pointer for 1864 * p->set_child_tid which is (ab)used as a kthread's data pointer for
1872 * kernel threads (PF_KTHREAD). 1865 * kernel threads (PF_KTHREAD).
1873 */ 1866 */
1874 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1867 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
1875 /* 1868 /*
1876 * Clear TID on mm_release()? 1869 * Clear TID on mm_release()?
1877 */ 1870 */
1878 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; 1871 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
1879 1872
1880 ftrace_graph_init_task(p); 1873 ftrace_graph_init_task(p);
1881 1874
@@ -2031,7 +2024,8 @@ static __latent_entropy struct task_struct *copy_process(
2031 retval = copy_io(clone_flags, p); 2024 retval = copy_io(clone_flags, p);
2032 if (retval) 2025 if (retval)
2033 goto bad_fork_cleanup_namespaces; 2026 goto bad_fork_cleanup_namespaces;
2034 retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls); 2027 retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
2028 args->tls);
2035 if (retval) 2029 if (retval)
2036 goto bad_fork_cleanup_io; 2030 goto bad_fork_cleanup_io;
2037 2031
@@ -2066,7 +2060,7 @@ static __latent_entropy struct task_struct *copy_process(
2066 } 2060 }
2067 get_pid(pid); /* held by pidfile now */ 2061 get_pid(pid); /* held by pidfile now */
2068 2062
2069 retval = put_user(pidfd, parent_tidptr); 2063 retval = put_user(pidfd, args->pidfd);
2070 if (retval) 2064 if (retval)
2071 goto bad_fork_put_pidfd; 2065 goto bad_fork_put_pidfd;
2072 } 2066 }
@@ -2109,7 +2103,7 @@ static __latent_entropy struct task_struct *copy_process(
2109 if (clone_flags & CLONE_PARENT) 2103 if (clone_flags & CLONE_PARENT)
2110 p->exit_signal = current->group_leader->exit_signal; 2104 p->exit_signal = current->group_leader->exit_signal;
2111 else 2105 else
2112 p->exit_signal = (clone_flags & CSIGNAL); 2106 p->exit_signal = args->exit_signal;
2113 p->group_leader = p; 2107 p->group_leader = p;
2114 p->tgid = p->pid; 2108 p->tgid = p->pid;
2115 } 2109 }
@@ -2322,8 +2316,11 @@ static inline void init_idle_pids(struct task_struct *idle)
2322struct task_struct *fork_idle(int cpu) 2316struct task_struct *fork_idle(int cpu)
2323{ 2317{
2324 struct task_struct *task; 2318 struct task_struct *task;
2325 task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0, 2319 struct kernel_clone_args args = {
2326 cpu_to_node(cpu)); 2320 .flags = CLONE_VM,
2321 };
2322
2323 task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
2327 if (!IS_ERR(task)) { 2324 if (!IS_ERR(task)) {
2328 init_idle_pids(task); 2325 init_idle_pids(task);
2329 init_idle(task, cpu); 2326 init_idle(task, cpu);
@@ -2343,13 +2340,9 @@ struct mm_struct *copy_init_mm(void)
2343 * It copies the process, and if successful kick-starts 2340 * It copies the process, and if successful kick-starts
2344 * it and waits for it to finish using the VM if required. 2341 * it and waits for it to finish using the VM if required.
2345 */ 2342 */
2346long _do_fork(unsigned long clone_flags, 2343long _do_fork(struct kernel_clone_args *args)
2347 unsigned long stack_start,
2348 unsigned long stack_size,
2349 int __user *parent_tidptr,
2350 int __user *child_tidptr,
2351 unsigned long tls)
2352{ 2344{
2345 u64 clone_flags = args->flags;
2353 struct completion vfork; 2346 struct completion vfork;
2354 struct pid *pid; 2347 struct pid *pid;
2355 struct task_struct *p; 2348 struct task_struct *p;
@@ -2365,7 +2358,7 @@ long _do_fork(unsigned long clone_flags,
2365 if (!(clone_flags & CLONE_UNTRACED)) { 2358 if (!(clone_flags & CLONE_UNTRACED)) {
2366 if (clone_flags & CLONE_VFORK) 2359 if (clone_flags & CLONE_VFORK)
2367 trace = PTRACE_EVENT_VFORK; 2360 trace = PTRACE_EVENT_VFORK;
2368 else if ((clone_flags & CSIGNAL) != SIGCHLD) 2361 else if (args->exit_signal != SIGCHLD)
2369 trace = PTRACE_EVENT_CLONE; 2362 trace = PTRACE_EVENT_CLONE;
2370 else 2363 else
2371 trace = PTRACE_EVENT_FORK; 2364 trace = PTRACE_EVENT_FORK;
@@ -2374,8 +2367,7 @@ long _do_fork(unsigned long clone_flags,
2374 trace = 0; 2367 trace = 0;
2375 } 2368 }
2376 2369
2377 p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr, 2370 p = copy_process(NULL, trace, NUMA_NO_NODE, args);
2378 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
2379 add_latent_entropy(); 2371 add_latent_entropy();
2380 2372
2381 if (IS_ERR(p)) 2373 if (IS_ERR(p))
@@ -2391,7 +2383,7 @@ long _do_fork(unsigned long clone_flags,
2391 nr = pid_vnr(pid); 2383 nr = pid_vnr(pid);
2392 2384
2393 if (clone_flags & CLONE_PARENT_SETTID) 2385 if (clone_flags & CLONE_PARENT_SETTID)
2394 put_user(nr, parent_tidptr); 2386 put_user(nr, args->parent_tid);
2395 2387
2396 if (clone_flags & CLONE_VFORK) { 2388 if (clone_flags & CLONE_VFORK) {
2397 p->vfork_done = &vfork; 2389 p->vfork_done = &vfork;
@@ -2423,8 +2415,16 @@ long do_fork(unsigned long clone_flags,
2423 int __user *parent_tidptr, 2415 int __user *parent_tidptr,
2424 int __user *child_tidptr) 2416 int __user *child_tidptr)
2425{ 2417{
2426 return _do_fork(clone_flags, stack_start, stack_size, 2418 struct kernel_clone_args args = {
2427 parent_tidptr, child_tidptr, 0); 2419 .flags = (clone_flags & ~CSIGNAL),
2420 .child_tid = child_tidptr,
2421 .parent_tid = parent_tidptr,
2422 .exit_signal = (clone_flags & CSIGNAL),
2423 .stack = stack_start,
2424 .stack_size = stack_size,
2425 };
2426
2427 return _do_fork(&args);
2428} 2428}
2429#endif 2429#endif
2430 2430
@@ -2433,15 +2433,25 @@ long do_fork(unsigned long clone_flags,
2433 */ 2433 */
2434pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) 2434pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
2435{ 2435{
2436 return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, 2436 struct kernel_clone_args args = {
2437 (unsigned long)arg, NULL, NULL, 0); 2437 .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
2438 .exit_signal = (flags & CSIGNAL),
2439 .stack = (unsigned long)fn,
2440 .stack_size = (unsigned long)arg,
2441 };
2442
2443 return _do_fork(&args);
2438} 2444}
2439 2445
2440#ifdef __ARCH_WANT_SYS_FORK 2446#ifdef __ARCH_WANT_SYS_FORK
2441SYSCALL_DEFINE0(fork) 2447SYSCALL_DEFINE0(fork)
2442{ 2448{
2443#ifdef CONFIG_MMU 2449#ifdef CONFIG_MMU
2444 return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0); 2450 struct kernel_clone_args args = {
2451 .exit_signal = SIGCHLD,
2452 };
2453
2454 return _do_fork(&args);
2445#else 2455#else
2446 /* can not support in nommu mode */ 2456 /* can not support in nommu mode */
2447 return -EINVAL; 2457 return -EINVAL;
@@ -2452,8 +2462,12 @@ SYSCALL_DEFINE0(fork)
2452#ifdef __ARCH_WANT_SYS_VFORK 2462#ifdef __ARCH_WANT_SYS_VFORK
2453SYSCALL_DEFINE0(vfork) 2463SYSCALL_DEFINE0(vfork)
2454{ 2464{
2455 return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, 2465 struct kernel_clone_args args = {
2456 0, NULL, NULL, 0); 2466 .flags = CLONE_VFORK | CLONE_VM,
2467 .exit_signal = SIGCHLD,
2468 };
2469
2470 return _do_fork(&args);
2457} 2471}
2458#endif 2472#endif
2459 2473
@@ -2481,7 +2495,112 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2481 unsigned long, tls) 2495 unsigned long, tls)
2482#endif 2496#endif
2483{ 2497{
2484 return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls); 2498 struct kernel_clone_args args = {
2499 .flags = (clone_flags & ~CSIGNAL),
2500 .pidfd = parent_tidptr,
2501 .child_tid = child_tidptr,
2502 .parent_tid = parent_tidptr,
2503 .exit_signal = (clone_flags & CSIGNAL),
2504 .stack = newsp,
2505 .tls = tls,
2506 };
2507
2508 /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
2509 if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID))
2510 return -EINVAL;
2511
2512 return _do_fork(&args);
2513}
2514#endif
2515
2516#ifdef __ARCH_WANT_SYS_CLONE3
2517noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
2518 struct clone_args __user *uargs,
2519 size_t size)
2520{
2521 struct clone_args args;
2522
2523 if (unlikely(size > PAGE_SIZE))
2524 return -E2BIG;
2525
2526 if (unlikely(size < sizeof(struct clone_args)))
2527 return -EINVAL;
2528
2529 if (unlikely(!access_ok(uargs, size)))
2530 return -EFAULT;
2531
2532 if (size > sizeof(struct clone_args)) {
2533 unsigned char __user *addr;
2534 unsigned char __user *end;
2535 unsigned char val;
2536
2537 addr = (void __user *)uargs + sizeof(struct clone_args);
2538 end = (void __user *)uargs + size;
2539
2540 for (; addr < end; addr++) {
2541 if (get_user(val, addr))
2542 return -EFAULT;
2543 if (val)
2544 return -E2BIG;
2545 }
2546
2547 size = sizeof(struct clone_args);
2548 }
2549
2550 if (copy_from_user(&args, uargs, size))
2551 return -EFAULT;
2552
2553 *kargs = (struct kernel_clone_args){
2554 .flags = args.flags,
2555 .pidfd = u64_to_user_ptr(args.pidfd),
2556 .child_tid = u64_to_user_ptr(args.child_tid),
2557 .parent_tid = u64_to_user_ptr(args.parent_tid),
2558 .exit_signal = args.exit_signal,
2559 .stack = args.stack,
2560 .stack_size = args.stack_size,
2561 .tls = args.tls,
2562 };
2563
2564 return 0;
2565}
2566
2567static bool clone3_args_valid(const struct kernel_clone_args *kargs)
2568{
2569 /*
2570 * All lower bits of the flag word are taken.
2571 * Verify that no other unknown flags are passed along.
2572 */
2573 if (kargs->flags & ~CLONE_LEGACY_FLAGS)
2574 return false;
2575
2576 /*
2577 * - make the CLONE_DETACHED bit reuseable for clone3
2578 * - make the CSIGNAL bits reuseable for clone3
2579 */
2580 if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
2581 return false;
2582
2583 if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
2584 kargs->exit_signal)
2585 return false;
2586
2587 return true;
2588}
2589
2590SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
2591{
2592 int err;
2593
2594 struct kernel_clone_args kargs;
2595
2596 err = copy_clone_args_from_user(&kargs, uargs, size);
2597 if (err)
2598 return err;
2599
2600 if (!clone3_args_valid(&kargs))
2601 return -EINVAL;
2602
2603 return _do_fork(&kargs);
2485} 2604}
2486#endif 2605#endif
2487 2606
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 4d9ae5ea6caf..34b76895b81e 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -137,6 +137,8 @@ COND_SYSCALL(capset);
137/* kernel/exit.c */ 137/* kernel/exit.c */
138 138
139/* kernel/fork.c */ 139/* kernel/fork.c */
140/* __ARCH_WANT_SYS_CLONE3 */
141COND_SYSCALL(clone3);
140 142
141/* kernel/futex.c */ 143/* kernel/futex.c */
142COND_SYSCALL(futex); 144COND_SYSCALL(futex);