Diffstat (limited to 'kernel/fork.c')
-rw-r--r--	kernel/fork.c | 315
1 file changed, 235 insertions(+), 80 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index 737db1828437..8f3e2d97d771 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/kernel/fork.c
  *
@@ -122,7 +123,7 @@
 unsigned long total_forks;	/* Handle normal Linux uptimes. */
 int nr_threads;			/* The idle threads do not count.. */
 
-int max_threads;		/* tunable limit on nr_threads */
+static int max_threads;		/* tunable limit on nr_threads */
 
 DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 
@@ -247,7 +248,11 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
 					     THREAD_SIZE_ORDER);
 
-	return page ? page_address(page) : NULL;
+	if (likely(page)) {
+		tsk->stack = page_address(page);
+		return tsk->stack;
+	}
+	return NULL;
 #endif
 }
 
@@ -893,6 +898,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 #ifdef CONFIG_STACKPROTECTOR
 	tsk->stack_canary = get_random_canary();
 #endif
+	if (orig->cpus_ptr == &orig->cpus_mask)
+		tsk->cpus_ptr = &tsk->cpus_mask;
 
 	/*
 	 * One for us, one for whoever does the "release_task()" (usually
@@ -955,6 +962,15 @@ static void mm_init_aio(struct mm_struct *mm)
 #endif
 }
 
+static __always_inline void mm_clear_owner(struct mm_struct *mm,
+					   struct task_struct *p)
+{
+#ifdef CONFIG_MEMCG
+	if (mm->owner == p)
+		WRITE_ONCE(mm->owner, NULL);
+#endif
+}
+
 static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
 {
 #ifdef CONFIG_MEMCG
@@ -1343,6 +1359,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk,
 free_pt:
 	/* don't put binfmt in mmput, we haven't got module yet */
 	mm->binfmt = NULL;
+	mm_init_owner(mm, NULL);
 	mmput(mm);
 
 fail_nomem:
@@ -1694,36 +1711,52 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 }
 #endif
 
+/*
+ * Poll support for process exit notification.
+ */
+static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts)
+{
+	struct task_struct *task;
+	struct pid *pid = file->private_data;
+	int poll_flags = 0;
+
+	poll_wait(file, &pid->wait_pidfd, pts);
+
+	rcu_read_lock();
+	task = pid_task(pid, PIDTYPE_PID);
+	/*
+	 * Inform pollers only when the whole thread group exits.
+	 * If the thread group leader exits before all other threads in the
+	 * group, then poll(2) should block, similar to the wait(2) family.
+	 */
+	if (!task || (task->exit_state && thread_group_empty(task)))
+		poll_flags = POLLIN | POLLRDNORM;
+	rcu_read_unlock();
+
+	return poll_flags;
+}
+
 const struct file_operations pidfd_fops = {
 	.release = pidfd_release,
+	.poll = pidfd_poll,
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo = pidfd_show_fdinfo,
 #endif
 };
 
-/**
- * pidfd_create() - Create a new pid file descriptor.
- *
- * @pid:  struct pid that the pidfd will reference
- *
- * This creates a new pid file descriptor with the O_CLOEXEC flag set.
- *
- * Note, that this function can only be called after the fd table has
- * been unshared to avoid leaking the pidfd to the new process.
- *
- * Return: On success, a cloexec pidfd is returned.
- *         On error, a negative errno number will be returned.
- */
-static int pidfd_create(struct pid *pid)
+static void __delayed_free_task(struct rcu_head *rhp)
 {
-	int fd;
+	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 
-	fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
-			      O_RDWR | O_CLOEXEC);
-	if (fd < 0)
-		put_pid(pid);
+	free_task(tsk);
+}
 
-	return fd;
+static __always_inline void delayed_free_task(struct task_struct *tsk)
+{
+	if (IS_ENABLED(CONFIG_MEMCG))
+		call_rcu(&tsk->rcu, __delayed_free_task);
+	else
+		free_task(tsk);
 }
 
 /*
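
The pidfd_poll() handler added above is what lets a pidfd be monitored with poll(2)/epoll(7): the descriptor becomes readable once the whole thread group has exited. A minimal userspace sketch of that flow, assuming the x86-64 raw-syscall argument order for clone and supplying a fallback CLONE_PIDFD definition for older headers (illustrative, not part of this patch):

#define _GNU_SOURCE
#include <poll.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef CLONE_PIDFD
#define CLONE_PIDFD 0x00001000	/* store a pidfd in the parent */
#endif

int main(void)
{
	int pidfd = -1;

	/*
	 * x86-64 order: clone(flags, newsp, parent_tidptr, child_tidptr, tls);
	 * with CLONE_PIDFD the kernel writes the new pidfd to *parent_tidptr.
	 */
	pid_t pid = syscall(SYS_clone, CLONE_PIDFD | SIGCHLD, 0, &pidfd, 0, 0);
	if (pid < 0) {
		perror("clone");
		exit(EXIT_FAILURE);
	}
	if (pid == 0)		/* child: exit immediately */
		_exit(0);

	struct pollfd fds = { .fd = pidfd, .events = POLLIN };

	/* pidfd_poll() reports POLLIN only once the whole thread group died */
	if (poll(&fds, 1, -1) == 1 && (fds.revents & POLLIN))
		printf("child %d exited\n", (int)pid);
	return 0;
}
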
@@ -1735,19 +1768,16 @@ static int pidfd_create(struct pid *pid)
  * flags). The actual kick-off is left to the caller.
  */
 static __latent_entropy struct task_struct *copy_process(
-					unsigned long clone_flags,
-					unsigned long stack_start,
-					unsigned long stack_size,
-					int __user *parent_tidptr,
-					int __user *child_tidptr,
 					struct pid *pid,
 					int trace,
-					unsigned long tls,
-					int node)
+					int node,
+					struct kernel_clone_args *args)
 {
 	int pidfd = -1, retval;
 	struct task_struct *p;
 	struct multiprocess_signals delayed;
+	struct file *pidfile = NULL;
+	u64 clone_flags = args->flags;
 
 	/*
 	 * Don't allow sharing the root directory with processes in a different
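
copy_process() now draws everything from a single argument block instead of a long parameter list. A sketch of struct kernel_clone_args as implied by the fields this patch dereferences (flags, pidfd, child_tid, parent_tid, exit_signal, stack, stack_size, tls); the authoritative definition accompanying this change lives in include/linux/sched/task.h:

struct kernel_clone_args {
	u64 flags;		/* CLONE_* bits, minus CSIGNAL */
	int __user *pidfd;	/* where to store the pidfd (CLONE_PIDFD) */
	int __user *child_tid;	/* CLONE_CHILD_SETTID/_CLEARTID target */
	int __user *parent_tid;	/* CLONE_PARENT_SETTID target */
	int exit_signal;	/* formerly clone_flags & CSIGNAL */
	unsigned long stack;
	unsigned long stack_size;
	unsigned long tls;
};
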
@@ -1796,27 +1826,12 @@ static __latent_entropy struct task_struct *copy_process(
 	}
 
 	if (clone_flags & CLONE_PIDFD) {
-		int reserved;
-
 		/*
-		 * - CLONE_PARENT_SETTID is useless for pidfds and also
-		 *   parent_tidptr is used to return pidfds.
 		 * - CLONE_DETACHED is blocked so that we can potentially
 		 *   reuse it later for CLONE_PIDFD.
 		 * - CLONE_THREAD is blocked until someone really needs it.
 		 */
-		if (clone_flags &
-		    (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
-			return ERR_PTR(-EINVAL);
-
-		/*
-		 * Verify that parent_tidptr is sane so we can potentially
-		 * reuse it later.
-		 */
-		if (get_user(reserved, parent_tidptr))
-			return ERR_PTR(-EFAULT);
-
-		if (reserved != 0)
+		if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
 			return ERR_PTR(-EINVAL);
 	}
 
@@ -1849,11 +1864,11 @@ static __latent_entropy struct task_struct *copy_process(
 	 * p->set_child_tid which is (ab)used as a kthread's data pointer for
 	 * kernel threads (PF_KTHREAD).
 	 */
-	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
+	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
 	/*
 	 * Clear TID on mm_release()?
 	 */
-	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
+	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
 
 	ftrace_graph_init_task(p);
 
@@ -1958,9 +1973,6 @@ static __latent_entropy struct task_struct *copy_process(
 	p->pagefault_disabled = 0;
 
 #ifdef CONFIG_LOCKDEP
-	p->lockdep_depth = 0; /* no locks held yet */
-	p->curr_chain_key = 0;
-	p->lockdep_recursion = 0;
 	lockdep_init_task(p);
 #endif
 
@@ -2012,7 +2024,8 @@ static __latent_entropy struct task_struct *copy_process(
 	retval = copy_io(clone_flags, p);
 	if (retval)
 		goto bad_fork_cleanup_namespaces;
-	retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
+	retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
+				 args->tls);
 	if (retval)
 		goto bad_fork_cleanup_io;
 
@@ -2032,12 +2045,22 @@ static __latent_entropy struct task_struct *copy_process(
 	 * if the fd table isn't shared).
 	 */
 	if (clone_flags & CLONE_PIDFD) {
-		retval = pidfd_create(pid);
+		retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
 		if (retval < 0)
 			goto bad_fork_free_pid;
 
 		pidfd = retval;
-		retval = put_user(pidfd, parent_tidptr);
+
+		pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
+					      O_RDWR | O_CLOEXEC);
+		if (IS_ERR(pidfile)) {
+			put_unused_fd(pidfd);
+			retval = PTR_ERR(pidfile);
+			goto bad_fork_free_pid;
+		}
+		get_pid(pid);	/* held by pidfile now */
+
+		retval = put_user(pidfd, args->pidfd);
 		if (retval)
 			goto bad_fork_put_pidfd;
 	}
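
The replacement for pidfd_create() above splits fd creation into the kernel's usual reserve-then-install sequence: reserve a descriptor number first so every later failure can still be unwound with put_unused_fd(), and call fd_install() only past the last point of failure, because an installed fd is instantly visible to userspace and cannot be quietly taken back. A generic sketch of that pattern (the example_* names are hypothetical):

static int example_create_fd(void *priv)
{
	struct file *file;
	int fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);

	if (fd < 0)
		return fd;

	file = anon_inode_getfile("[example]", &example_fops, priv,
				  O_RDWR | O_CLOEXEC);
	if (IS_ERR(file)) {
		put_unused_fd(fd);	/* nothing escaped; safe to unwind */
		return PTR_ERR(file);
	}

	fd_install(fd, file);		/* point of no return */
	return fd;
}
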
@@ -2068,7 +2091,7 @@ static __latent_entropy struct task_struct *copy_process(
 #ifdef TIF_SYSCALL_EMU
 	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
 #endif
-	clear_all_latency_tracing(p);
+	clear_tsk_latency_tracing(p);
 
 	/* ok, now we should be set up.. */
 	p->pid = pid_nr(pid);
@@ -2080,7 +2103,7 @@ static __latent_entropy struct task_struct *copy_process(
 		if (clone_flags & CLONE_PARENT)
 			p->exit_signal = current->group_leader->exit_signal;
 		else
-			p->exit_signal = (clone_flags & CSIGNAL);
+			p->exit_signal = args->exit_signal;
 		p->group_leader = p;
 		p->tgid = p->pid;
 	}
@@ -2113,7 +2136,7 @@ static __latent_entropy struct task_struct *copy_process(
 	 */
 
 	p->start_time = ktime_get_ns();
-	p->real_start_time = ktime_get_boot_ns();
+	p->real_start_time = ktime_get_boottime_ns();
 
 	/*
 	 * Make it visible to the rest of the system, but dont wake it up yet.
@@ -2154,6 +2177,9 @@ static __latent_entropy struct task_struct *copy_process(
 		goto bad_fork_cancel_cgroup;
 	}
 
+	/* past the last point of failure */
+	if (pidfile)
+		fd_install(pidfd, pidfile);
 
 	init_task_pid_links(p);
 	if (likely(p->pid)) {
@@ -2220,8 +2246,10 @@ bad_fork_cancel_cgroup:
 bad_fork_cgroup_threadgroup_change_end:
 	cgroup_threadgroup_change_end(current);
 bad_fork_put_pidfd:
-	if (clone_flags & CLONE_PIDFD)
-		ksys_close(pidfd);
+	if (clone_flags & CLONE_PIDFD) {
+		fput(pidfile);
+		put_unused_fd(pidfd);
+	}
 bad_fork_free_pid:
 	if (pid != &init_struct_pid)
 		free_pid(pid);
@@ -2233,8 +2261,10 @@ bad_fork_cleanup_io:
 bad_fork_cleanup_namespaces:
 	exit_task_namespaces(p);
 bad_fork_cleanup_mm:
-	if (p->mm)
+	if (p->mm) {
+		mm_clear_owner(p->mm, p);
 		mmput(p->mm);
+	}
 bad_fork_cleanup_signal:
 	if (!(clone_flags & CLONE_THREAD))
 		free_signal_struct(p->signal);
@@ -2265,7 +2295,7 @@ bad_fork_cleanup_count:
 bad_fork_free:
 	p->state = TASK_DEAD;
 	put_task_stack(p);
-	free_task(p);
+	delayed_free_task(p);
 fork_out:
 	spin_lock_irq(&current->sighand->siglock);
 	hlist_del_init(&delayed.node);
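
delayed_free_task(), now used in the bad_fork_free path above, defers the final free through an RCU grace period when CONFIG_MEMCG is enabled, since lockless readers may still dereference mm->owner under rcu_read_lock(). A generic sketch of the embedded-rcu_head idiom it relies on (the example_* names are hypothetical):

struct example_obj {
	int data;
	struct rcu_head rcu;	/* embedded callback head */
};

static void example_free_rcu(struct rcu_head *rhp)
{
	struct example_obj *obj = container_of(rhp, struct example_obj, rcu);

	kfree(obj);
}

static void example_release(struct example_obj *obj)
{
	/* the real kfree runs only after all current RCU readers finish */
	call_rcu(&obj->rcu, example_free_rcu);
}
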
@@ -2286,8 +2316,11 @@ static inline void init_idle_pids(struct task_struct *idle)
 struct task_struct *fork_idle(int cpu)
 {
 	struct task_struct *task;
-	task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0,
-			    cpu_to_node(cpu));
+	struct kernel_clone_args args = {
+		.flags = CLONE_VM,
+	};
+
+	task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
 	if (!IS_ERR(task)) {
 		init_idle_pids(task);
 		init_idle(task, cpu);
@@ -2307,13 +2340,9 @@ struct mm_struct *copy_init_mm(void)
  * It copies the process, and if successful kick-starts
  * it and waits for it to finish using the VM if required.
  */
-long _do_fork(unsigned long clone_flags,
-	      unsigned long stack_start,
-	      unsigned long stack_size,
-	      int __user *parent_tidptr,
-	      int __user *child_tidptr,
-	      unsigned long tls)
+long _do_fork(struct kernel_clone_args *args)
 {
+	u64 clone_flags = args->flags;
 	struct completion vfork;
 	struct pid *pid;
 	struct task_struct *p;
@@ -2329,7 +2358,7 @@ long _do_fork(unsigned long clone_flags,
 	if (!(clone_flags & CLONE_UNTRACED)) {
 		if (clone_flags & CLONE_VFORK)
 			trace = PTRACE_EVENT_VFORK;
-		else if ((clone_flags & CSIGNAL) != SIGCHLD)
+		else if (args->exit_signal != SIGCHLD)
 			trace = PTRACE_EVENT_CLONE;
 		else
 			trace = PTRACE_EVENT_FORK;
@@ -2338,8 +2367,7 @@ long _do_fork(unsigned long clone_flags,
 			trace = 0;
 	}
 
-	p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,
-			 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
+	p = copy_process(NULL, trace, NUMA_NO_NODE, args);
 	add_latent_entropy();
 
 	if (IS_ERR(p))
@@ -2355,7 +2383,7 @@ long _do_fork(unsigned long clone_flags,
 	nr = pid_vnr(pid);
 
 	if (clone_flags & CLONE_PARENT_SETTID)
-		put_user(nr, parent_tidptr);
+		put_user(nr, args->parent_tid);
 
 	if (clone_flags & CLONE_VFORK) {
 		p->vfork_done = &vfork;
@@ -2387,8 +2415,16 @@ long do_fork(unsigned long clone_flags,
 	      int __user *parent_tidptr,
 	      int __user *child_tidptr)
 {
-	return _do_fork(clone_flags, stack_start, stack_size,
-			parent_tidptr, child_tidptr, 0);
+	struct kernel_clone_args args = {
+		.flags		= (clone_flags & ~CSIGNAL),
+		.child_tid	= child_tidptr,
+		.parent_tid	= parent_tidptr,
+		.exit_signal	= (clone_flags & CSIGNAL),
+		.stack		= stack_start,
+		.stack_size	= stack_size,
+	};
+
+	return _do_fork(&args);
 }
 #endif
 
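
All of the legacy wrappers split the flag word the same way: CSIGNAL (0x000000ff) is the low byte of clone_flags, so the exit signal moves into its own kernel_clone_args field while the remaining bits stay pure clone flags. A worked example of the split (values illustrative):

	unsigned long clone_flags = CLONE_VM | CLONE_FS | SIGCHLD;

	struct kernel_clone_args args = {
		.flags		= (clone_flags & ~CSIGNAL),	/* CLONE_VM | CLONE_FS */
		.exit_signal	= (clone_flags & CSIGNAL),	/* SIGCHLD */
	};
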
@@ -2397,15 +2433,25 @@ long do_fork(unsigned long clone_flags,
  */
 pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 {
-	return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
-		(unsigned long)arg, NULL, NULL, 0);
+	struct kernel_clone_args args = {
+		.flags		= ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+		.exit_signal	= (flags & CSIGNAL),
+		.stack		= (unsigned long)fn,
+		.stack_size	= (unsigned long)arg,
+	};
+
+	return _do_fork(&args);
 }
 
 #ifdef __ARCH_WANT_SYS_FORK
 SYSCALL_DEFINE0(fork)
 {
 #ifdef CONFIG_MMU
-	return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
+	struct kernel_clone_args args = {
+		.exit_signal = SIGCHLD,
+	};
+
+	return _do_fork(&args);
 #else
 	/* can not support in nommu mode */
 	return -EINVAL;
@@ -2416,8 +2462,12 @@ SYSCALL_DEFINE0(fork)
 #ifdef __ARCH_WANT_SYS_VFORK
 SYSCALL_DEFINE0(vfork)
 {
-	return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
-			0, NULL, NULL, 0);
+	struct kernel_clone_args args = {
+		.flags		= CLONE_VFORK | CLONE_VM,
+		.exit_signal	= SIGCHLD,
+	};
+
+	return _do_fork(&args);
 }
 #endif
 
@@ -2445,7 +2495,112 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
 		 unsigned long, tls)
 #endif
 {
-	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
+	struct kernel_clone_args args = {
+		.flags		= (clone_flags & ~CSIGNAL),
+		.pidfd		= parent_tidptr,
+		.child_tid	= child_tidptr,
+		.parent_tid	= parent_tidptr,
+		.exit_signal	= (clone_flags & CSIGNAL),
+		.stack		= newsp,
+		.tls		= tls,
+	};
+
+	/* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
+	if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID))
+		return -EINVAL;
+
+	return _do_fork(&args);
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_CLONE3
+noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
+					      struct clone_args __user *uargs,
+					      size_t size)
+{
+	struct clone_args args;
+
+	if (unlikely(size > PAGE_SIZE))
+		return -E2BIG;
+
+	if (unlikely(size < sizeof(struct clone_args)))
+		return -EINVAL;
+
+	if (unlikely(!access_ok(uargs, size)))
+		return -EFAULT;
+
+	if (size > sizeof(struct clone_args)) {
+		unsigned char __user *addr;
+		unsigned char __user *end;
+		unsigned char val;
+
+		addr = (void __user *)uargs + sizeof(struct clone_args);
+		end = (void __user *)uargs + size;
+
+		for (; addr < end; addr++) {
+			if (get_user(val, addr))
+				return -EFAULT;
+			if (val)
+				return -E2BIG;
+		}
+
+		size = sizeof(struct clone_args);
+	}
+
+	if (copy_from_user(&args, uargs, size))
+		return -EFAULT;
+
+	*kargs = (struct kernel_clone_args){
+		.flags		= args.flags,
+		.pidfd		= u64_to_user_ptr(args.pidfd),
+		.child_tid	= u64_to_user_ptr(args.child_tid),
+		.parent_tid	= u64_to_user_ptr(args.parent_tid),
+		.exit_signal	= args.exit_signal,
+		.stack		= args.stack,
+		.stack_size	= args.stack_size,
+		.tls		= args.tls,
+	};
+
+	return 0;
+}
+
+static bool clone3_args_valid(const struct kernel_clone_args *kargs)
+{
+	/*
+	 * All lower 32bits of the flag word are taken.
+	 * Verify that no other unknown flags are passed along.
+	 */
+	if (kargs->flags & ~CLONE_LEGACY_FLAGS)
+		return false;
+
+	/*
+	 * - make the CLONE_DETACHED bit reuseable for clone3
+	 * - make the CSIGNAL bits reuseable for clone3
+	 */
+	if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
+		return false;
+
+	if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
+	    kargs->exit_signal)
+		return false;
+
+	return true;
+}
+
+SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
+{
+	int err;
+
+	struct kernel_clone_args kargs;
+
+	err = copy_clone_args_from_user(&kargs, uargs, size);
+	if (err)
+		return err;
+
+	if (!clone3_args_valid(&kargs))
+		return -EINVAL;
+
+	return _do_fork(&kargs);
 }
 #endif
 
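
The new clone3 syscall takes an explicit size precisely so the struct can grow later: copy_clone_args_from_user() accepts a larger struct only if all the extra tail bytes are zero. A minimal userspace sketch of calling it, where my_clone_args mirrors the UAPI layout this patch consumes (on a real system the struct comes from <linux/sched.h>, and __NR_clone3 is 435 where the syscall exists):

#define _GNU_SOURCE
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef __NR_clone3
#define __NR_clone3 435
#endif

/* mirrors the UAPI struct clone_args this patch copies in */
struct my_clone_args {
	uint64_t flags;
	uint64_t pidfd;
	uint64_t child_tid;
	uint64_t parent_tid;
	uint64_t exit_signal;
	uint64_t stack;
	uint64_t stack_size;
	uint64_t tls;
};

int main(void)
{
	struct my_clone_args args;

	memset(&args, 0, sizeof(args));
	args.exit_signal = SIGCHLD;	/* fork()-like child */

	/* the size parameter lets the kernel detect older/newer callers */
	pid_t pid = syscall(__NR_clone3, &args, sizeof(args));
	if (pid < 0) {
		perror("clone3");
		return 1;
	}
	if (pid == 0)		/* child */
		_exit(0);

	waitpid(pid, NULL, 0);
	return 0;
}
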