summaryrefslogtreecommitdiffstats
path: root/kernel/fork.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-07-11 13:09:44 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2019-07-11 13:09:44 -0400
commit8f6ccf6159aed1f04c6d179f61f6fb2691261e84 (patch)
tree449c6d9cddc6f94c6450a18885b6a06e8a67d845 /kernel/fork.c
parent5450e8a316a64cddcbc15f90733ebc78aa736545 (diff)
parentd68dbb0c9ac8b1ff52eb09aa58ce6358400fa939 (diff)
Merge tag 'clone3-v5.3' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux
Pull clone3 system call from Christian Brauner: "This adds the clone3 syscall which is an extensible successor to clone after we snagged the last flag with CLONE_PIDFD during the 5.2 merge window for clone(). It cleanly supports all of the flags from clone() and thus all legacy workloads. There are a few user-visible differences between clone3 and clone. First, CLONE_DETACHED will cause EINVAL with clone3 so we can reuse this flag. Second, the CSIGNAL flag is deprecated and will cause EINVAL to be reported. It is superseded by a dedicated "exit_signal" argument in struct clone_args thus freeing up even more flags. And third, clone3 gives CLONE_PIDFD a dedicated return argument in struct clone_args instead of abusing CLONE_PARENT_SETTID's parent_tidptr argument. The clone3 uapi is designed to be easy to handle on 32- and 64-bit: /* uapi */ struct clone_args { __aligned_u64 flags; __aligned_u64 pidfd; __aligned_u64 child_tid; __aligned_u64 parent_tid; __aligned_u64 exit_signal; __aligned_u64 stack; __aligned_u64 stack_size; __aligned_u64 tls; }; and a separate kernel struct is used that uses proper kernel typing: /* kernel internal */ struct kernel_clone_args { u64 flags; int __user *pidfd; int __user *child_tid; int __user *parent_tid; int exit_signal; unsigned long stack; unsigned long stack_size; unsigned long tls; }; The system call comes with a size argument which enables the kernel to detect what version of clone_args userspace is passing in. clone3 validates that any additional bytes a given kernel does not know about are set to zero and that the size never exceeds a page. A nice feature is that this patchset allowed us to clean up and simplify various core kernel codepaths in kernel/fork.c by making the internal _do_fork() function take struct kernel_clone_args even for legacy clone(). This patch also unblocks the time namespace patchset which wants to introduce a new CLONE_TIMENS flag. 
Note that clone3 has only been wired up for x86{_32,64}, arm{64}, and xtensa. These were the architectures that did not require special massaging. Other architectures treat fork-like system calls individually and after some back and forth neither Arnd nor I felt confident that we dared to add clone3 unconditionally to all architectures. We agreed to leave this up to individual architecture maintainers. This is why there's an additional patch that introduces __ARCH_WANT_SYS_CLONE3 which any architecture can set once it has implemented support for clone3. The patch also adds a cond_syscall(clone3) for architectures such as nios2 or h8300 that generate their syscall table by simply including asm-generic/unistd.h. The hope is to get rid of __ARCH_WANT_SYS_CLONE3 and cond_syscall() rather soon" * tag 'clone3-v5.3' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux: arch: handle arches who do not yet define clone3 arch: wire-up clone3() syscall fork: add clone3
Diffstat (limited to 'kernel/fork.c')
-rw-r--r--kernel/fork.c191
1 files changed, 155 insertions, 36 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index 187c02ce534c..8f3e2d97d771 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1768,20 +1768,16 @@ static __always_inline void delayed_free_task(struct task_struct *tsk)
1768 * flags). The actual kick-off is left to the caller. 1768 * flags). The actual kick-off is left to the caller.
1769 */ 1769 */
1770static __latent_entropy struct task_struct *copy_process( 1770static __latent_entropy struct task_struct *copy_process(
1771 unsigned long clone_flags,
1772 unsigned long stack_start,
1773 unsigned long stack_size,
1774 int __user *parent_tidptr,
1775 int __user *child_tidptr,
1776 struct pid *pid, 1771 struct pid *pid,
1777 int trace, 1772 int trace,
1778 unsigned long tls, 1773 int node,
1779 int node) 1774 struct kernel_clone_args *args)
1780{ 1775{
1781 int pidfd = -1, retval; 1776 int pidfd = -1, retval;
1782 struct task_struct *p; 1777 struct task_struct *p;
1783 struct multiprocess_signals delayed; 1778 struct multiprocess_signals delayed;
1784 struct file *pidfile = NULL; 1779 struct file *pidfile = NULL;
1780 u64 clone_flags = args->flags;
1785 1781
1786 /* 1782 /*
1787 * Don't allow sharing the root directory with processes in a different 1783 * Don't allow sharing the root directory with processes in a different
@@ -1831,14 +1827,11 @@ static __latent_entropy struct task_struct *copy_process(
1831 1827
1832 if (clone_flags & CLONE_PIDFD) { 1828 if (clone_flags & CLONE_PIDFD) {
1833 /* 1829 /*
1834 * - CLONE_PARENT_SETTID is useless for pidfds and also
1835 * parent_tidptr is used to return pidfds.
1836 * - CLONE_DETACHED is blocked so that we can potentially 1830 * - CLONE_DETACHED is blocked so that we can potentially
1837 * reuse it later for CLONE_PIDFD. 1831 * reuse it later for CLONE_PIDFD.
1838 * - CLONE_THREAD is blocked until someone really needs it. 1832 * - CLONE_THREAD is blocked until someone really needs it.
1839 */ 1833 */
1840 if (clone_flags & 1834 if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
1841 (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
1842 return ERR_PTR(-EINVAL); 1835 return ERR_PTR(-EINVAL);
1843 } 1836 }
1844 1837
@@ -1871,11 +1864,11 @@ static __latent_entropy struct task_struct *copy_process(
1871 * p->set_child_tid which is (ab)used as a kthread's data pointer for 1864 * p->set_child_tid which is (ab)used as a kthread's data pointer for
1872 * kernel threads (PF_KTHREAD). 1865 * kernel threads (PF_KTHREAD).
1873 */ 1866 */
1874 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1867 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
1875 /* 1868 /*
1876 * Clear TID on mm_release()? 1869 * Clear TID on mm_release()?
1877 */ 1870 */
1878 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; 1871 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
1879 1872
1880 ftrace_graph_init_task(p); 1873 ftrace_graph_init_task(p);
1881 1874
@@ -2031,7 +2024,8 @@ static __latent_entropy struct task_struct *copy_process(
2031 retval = copy_io(clone_flags, p); 2024 retval = copy_io(clone_flags, p);
2032 if (retval) 2025 if (retval)
2033 goto bad_fork_cleanup_namespaces; 2026 goto bad_fork_cleanup_namespaces;
2034 retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls); 2027 retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
2028 args->tls);
2035 if (retval) 2029 if (retval)
2036 goto bad_fork_cleanup_io; 2030 goto bad_fork_cleanup_io;
2037 2031
@@ -2066,7 +2060,7 @@ static __latent_entropy struct task_struct *copy_process(
2066 } 2060 }
2067 get_pid(pid); /* held by pidfile now */ 2061 get_pid(pid); /* held by pidfile now */
2068 2062
2069 retval = put_user(pidfd, parent_tidptr); 2063 retval = put_user(pidfd, args->pidfd);
2070 if (retval) 2064 if (retval)
2071 goto bad_fork_put_pidfd; 2065 goto bad_fork_put_pidfd;
2072 } 2066 }
@@ -2109,7 +2103,7 @@ static __latent_entropy struct task_struct *copy_process(
2109 if (clone_flags & CLONE_PARENT) 2103 if (clone_flags & CLONE_PARENT)
2110 p->exit_signal = current->group_leader->exit_signal; 2104 p->exit_signal = current->group_leader->exit_signal;
2111 else 2105 else
2112 p->exit_signal = (clone_flags & CSIGNAL); 2106 p->exit_signal = args->exit_signal;
2113 p->group_leader = p; 2107 p->group_leader = p;
2114 p->tgid = p->pid; 2108 p->tgid = p->pid;
2115 } 2109 }
@@ -2322,8 +2316,11 @@ static inline void init_idle_pids(struct task_struct *idle)
2322struct task_struct *fork_idle(int cpu) 2316struct task_struct *fork_idle(int cpu)
2323{ 2317{
2324 struct task_struct *task; 2318 struct task_struct *task;
2325 task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0, 2319 struct kernel_clone_args args = {
2326 cpu_to_node(cpu)); 2320 .flags = CLONE_VM,
2321 };
2322
2323 task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
2327 if (!IS_ERR(task)) { 2324 if (!IS_ERR(task)) {
2328 init_idle_pids(task); 2325 init_idle_pids(task);
2329 init_idle(task, cpu); 2326 init_idle(task, cpu);
@@ -2343,13 +2340,9 @@ struct mm_struct *copy_init_mm(void)
2343 * It copies the process, and if successful kick-starts 2340 * It copies the process, and if successful kick-starts
2344 * it and waits for it to finish using the VM if required. 2341 * it and waits for it to finish using the VM if required.
2345 */ 2342 */
2346long _do_fork(unsigned long clone_flags, 2343long _do_fork(struct kernel_clone_args *args)
2347 unsigned long stack_start,
2348 unsigned long stack_size,
2349 int __user *parent_tidptr,
2350 int __user *child_tidptr,
2351 unsigned long tls)
2352{ 2344{
2345 u64 clone_flags = args->flags;
2353 struct completion vfork; 2346 struct completion vfork;
2354 struct pid *pid; 2347 struct pid *pid;
2355 struct task_struct *p; 2348 struct task_struct *p;
@@ -2365,7 +2358,7 @@ long _do_fork(unsigned long clone_flags,
2365 if (!(clone_flags & CLONE_UNTRACED)) { 2358 if (!(clone_flags & CLONE_UNTRACED)) {
2366 if (clone_flags & CLONE_VFORK) 2359 if (clone_flags & CLONE_VFORK)
2367 trace = PTRACE_EVENT_VFORK; 2360 trace = PTRACE_EVENT_VFORK;
2368 else if ((clone_flags & CSIGNAL) != SIGCHLD) 2361 else if (args->exit_signal != SIGCHLD)
2369 trace = PTRACE_EVENT_CLONE; 2362 trace = PTRACE_EVENT_CLONE;
2370 else 2363 else
2371 trace = PTRACE_EVENT_FORK; 2364 trace = PTRACE_EVENT_FORK;
@@ -2374,8 +2367,7 @@ long _do_fork(unsigned long clone_flags,
2374 trace = 0; 2367 trace = 0;
2375 } 2368 }
2376 2369
2377 p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr, 2370 p = copy_process(NULL, trace, NUMA_NO_NODE, args);
2378 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
2379 add_latent_entropy(); 2371 add_latent_entropy();
2380 2372
2381 if (IS_ERR(p)) 2373 if (IS_ERR(p))
@@ -2391,7 +2383,7 @@ long _do_fork(unsigned long clone_flags,
2391 nr = pid_vnr(pid); 2383 nr = pid_vnr(pid);
2392 2384
2393 if (clone_flags & CLONE_PARENT_SETTID) 2385 if (clone_flags & CLONE_PARENT_SETTID)
2394 put_user(nr, parent_tidptr); 2386 put_user(nr, args->parent_tid);
2395 2387
2396 if (clone_flags & CLONE_VFORK) { 2388 if (clone_flags & CLONE_VFORK) {
2397 p->vfork_done = &vfork; 2389 p->vfork_done = &vfork;
@@ -2423,8 +2415,16 @@ long do_fork(unsigned long clone_flags,
2423 int __user *parent_tidptr, 2415 int __user *parent_tidptr,
2424 int __user *child_tidptr) 2416 int __user *child_tidptr)
2425{ 2417{
2426 return _do_fork(clone_flags, stack_start, stack_size, 2418 struct kernel_clone_args args = {
2427 parent_tidptr, child_tidptr, 0); 2419 .flags = (clone_flags & ~CSIGNAL),
2420 .child_tid = child_tidptr,
2421 .parent_tid = parent_tidptr,
2422 .exit_signal = (clone_flags & CSIGNAL),
2423 .stack = stack_start,
2424 .stack_size = stack_size,
2425 };
2426
2427 return _do_fork(&args);
2428} 2428}
2429#endif 2429#endif
2430 2430
@@ -2433,15 +2433,25 @@ long do_fork(unsigned long clone_flags,
2433 */ 2433 */
2434pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) 2434pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
2435{ 2435{
2436 return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, 2436 struct kernel_clone_args args = {
2437 (unsigned long)arg, NULL, NULL, 0); 2437 .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
2438 .exit_signal = (flags & CSIGNAL),
2439 .stack = (unsigned long)fn,
2440 .stack_size = (unsigned long)arg,
2441 };
2442
2443 return _do_fork(&args);
2438} 2444}
2439 2445
2440#ifdef __ARCH_WANT_SYS_FORK 2446#ifdef __ARCH_WANT_SYS_FORK
2441SYSCALL_DEFINE0(fork) 2447SYSCALL_DEFINE0(fork)
2442{ 2448{
2443#ifdef CONFIG_MMU 2449#ifdef CONFIG_MMU
2444 return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0); 2450 struct kernel_clone_args args = {
2451 .exit_signal = SIGCHLD,
2452 };
2453
2454 return _do_fork(&args);
2445#else 2455#else
2446 /* can not support in nommu mode */ 2456 /* can not support in nommu mode */
2447 return -EINVAL; 2457 return -EINVAL;
@@ -2452,8 +2462,12 @@ SYSCALL_DEFINE0(fork)
2452#ifdef __ARCH_WANT_SYS_VFORK 2462#ifdef __ARCH_WANT_SYS_VFORK
2453SYSCALL_DEFINE0(vfork) 2463SYSCALL_DEFINE0(vfork)
2454{ 2464{
2455 return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, 2465 struct kernel_clone_args args = {
2456 0, NULL, NULL, 0); 2466 .flags = CLONE_VFORK | CLONE_VM,
2467 .exit_signal = SIGCHLD,
2468 };
2469
2470 return _do_fork(&args);
2457} 2471}
2458#endif 2472#endif
2459 2473
@@ -2481,7 +2495,112 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2481 unsigned long, tls) 2495 unsigned long, tls)
2482#endif 2496#endif
2483{ 2497{
2484 return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls); 2498 struct kernel_clone_args args = {
2499 .flags = (clone_flags & ~CSIGNAL),
2500 .pidfd = parent_tidptr,
2501 .child_tid = child_tidptr,
2502 .parent_tid = parent_tidptr,
2503 .exit_signal = (clone_flags & CSIGNAL),
2504 .stack = newsp,
2505 .tls = tls,
2506 };
2507
2508 /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
2509 if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID))
2510 return -EINVAL;
2511
2512 return _do_fork(&args);
2513}
2514#endif
2515
2516#ifdef __ARCH_WANT_SYS_CLONE3
2517noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
2518 struct clone_args __user *uargs,
2519 size_t size)
2520{
2521 struct clone_args args;
2522
2523 if (unlikely(size > PAGE_SIZE))
2524 return -E2BIG;
2525
2526 if (unlikely(size < sizeof(struct clone_args)))
2527 return -EINVAL;
2528
2529 if (unlikely(!access_ok(uargs, size)))
2530 return -EFAULT;
2531
2532 if (size > sizeof(struct clone_args)) {
2533 unsigned char __user *addr;
2534 unsigned char __user *end;
2535 unsigned char val;
2536
2537 addr = (void __user *)uargs + sizeof(struct clone_args);
2538 end = (void __user *)uargs + size;
2539
2540 for (; addr < end; addr++) {
2541 if (get_user(val, addr))
2542 return -EFAULT;
2543 if (val)
2544 return -E2BIG;
2545 }
2546
2547 size = sizeof(struct clone_args);
2548 }
2549
2550 if (copy_from_user(&args, uargs, size))
2551 return -EFAULT;
2552
2553 *kargs = (struct kernel_clone_args){
2554 .flags = args.flags,
2555 .pidfd = u64_to_user_ptr(args.pidfd),
2556 .child_tid = u64_to_user_ptr(args.child_tid),
2557 .parent_tid = u64_to_user_ptr(args.parent_tid),
2558 .exit_signal = args.exit_signal,
2559 .stack = args.stack,
2560 .stack_size = args.stack_size,
2561 .tls = args.tls,
2562 };
2563
2564 return 0;
2565}
2566
2567static bool clone3_args_valid(const struct kernel_clone_args *kargs)
2568{
2569 /*
2570 * All lower bits of the flag word are taken.
2571 * Verify that no other unknown flags are passed along.
2572 */
2573 if (kargs->flags & ~CLONE_LEGACY_FLAGS)
2574 return false;
2575
2576 /*
2577 * - make the CLONE_DETACHED bit reuseable for clone3
2578 * - make the CSIGNAL bits reuseable for clone3
2579 */
2580 if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
2581 return false;
2582
2583 if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
2584 kargs->exit_signal)
2585 return false;
2586
2587 return true;
2588}
2589
2590SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
2591{
2592 int err;
2593
2594 struct kernel_clone_args kargs;
2595
2596 err = copy_clone_args_from_user(&kargs, uargs, size);
2597 if (err)
2598 return err;
2599
2600 if (!clone3_args_valid(&kargs))
2601 return -EINVAL;
2602
2603 return _do_fork(&kargs);
2485} 2604}
2486#endif 2605#endif
2487 2606