about summary refs log tree commit diff stats
path: root/kernel/fork.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/fork.c')
-rw-r--r-- kernel/fork.c 146
1 files changed, 86 insertions, 60 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index 467746b3f0aa..166b8c49257c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -17,7 +17,6 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/completion.h> 19#include <linux/completion.h>
20#include <linux/mnt_namespace.h>
21#include <linux/personality.h> 20#include <linux/personality.h>
22#include <linux/mempolicy.h> 21#include <linux/mempolicy.h>
23#include <linux/sem.h> 22#include <linux/sem.h>
@@ -50,6 +49,7 @@
50#include <linux/ftrace.h> 49#include <linux/ftrace.h>
51#include <linux/profile.h> 50#include <linux/profile.h>
52#include <linux/rmap.h> 51#include <linux/rmap.h>
52#include <linux/ksm.h>
53#include <linux/acct.h> 53#include <linux/acct.h>
54#include <linux/tsacct_kern.h> 54#include <linux/tsacct_kern.h>
55#include <linux/cn_proc.h> 55#include <linux/cn_proc.h>
@@ -62,7 +62,8 @@
62#include <linux/blkdev.h> 62#include <linux/blkdev.h>
63#include <linux/fs_struct.h> 63#include <linux/fs_struct.h>
64#include <linux/magic.h> 64#include <linux/magic.h>
65#include <linux/perf_counter.h> 65#include <linux/perf_event.h>
66#include <linux/posix-timers.h>
66 67
67#include <asm/pgtable.h> 68#include <asm/pgtable.h>
68#include <asm/pgalloc.h> 69#include <asm/pgalloc.h>
@@ -90,7 +91,7 @@ int nr_processes(void)
90 int cpu; 91 int cpu;
91 int total = 0; 92 int total = 0;
92 93
93 for_each_online_cpu(cpu) 94 for_each_possible_cpu(cpu)
94 total += per_cpu(process_counts, cpu); 95 total += per_cpu(process_counts, cpu);
95 96
96 return total; 97 return total;
@@ -137,9 +138,17 @@ struct kmem_cache *vm_area_cachep;
137/* SLAB cache for mm_struct structures (tsk->mm) */ 138/* SLAB cache for mm_struct structures (tsk->mm) */
138static struct kmem_cache *mm_cachep; 139static struct kmem_cache *mm_cachep;
139 140
141static void account_kernel_stack(struct thread_info *ti, int account)
142{
143 struct zone *zone = page_zone(virt_to_page(ti));
144
145 mod_zone_page_state(zone, NR_KERNEL_STACK, account);
146}
147
140void free_task(struct task_struct *tsk) 148void free_task(struct task_struct *tsk)
141{ 149{
142 prop_local_destroy_single(&tsk->dirties); 150 prop_local_destroy_single(&tsk->dirties);
151 account_kernel_stack(tsk->stack, -1);
143 free_thread_info(tsk->stack); 152 free_thread_info(tsk->stack);
144 rt_mutex_debug_task_free(tsk); 153 rt_mutex_debug_task_free(tsk);
145 ftrace_graph_exit_task(tsk); 154 ftrace_graph_exit_task(tsk);
@@ -153,8 +162,7 @@ void __put_task_struct(struct task_struct *tsk)
153 WARN_ON(atomic_read(&tsk->usage)); 162 WARN_ON(atomic_read(&tsk->usage));
154 WARN_ON(tsk == current); 163 WARN_ON(tsk == current);
155 164
156 put_cred(tsk->real_cred); 165 exit_creds(tsk);
157 put_cred(tsk->cred);
158 delayacct_tsk_free(tsk); 166 delayacct_tsk_free(tsk);
159 167
160 if (!profile_handoff_task(tsk)) 168 if (!profile_handoff_task(tsk))
@@ -255,6 +263,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
255 tsk->btrace_seq = 0; 263 tsk->btrace_seq = 0;
256#endif 264#endif
257 tsk->splice_pipe = NULL; 265 tsk->splice_pipe = NULL;
266
267 account_kernel_stack(ti, 1);
268
258 return tsk; 269 return tsk;
259 270
260out: 271out:
@@ -290,6 +301,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
290 rb_link = &mm->mm_rb.rb_node; 301 rb_link = &mm->mm_rb.rb_node;
291 rb_parent = NULL; 302 rb_parent = NULL;
292 pprev = &mm->mmap; 303 pprev = &mm->mmap;
304 retval = ksm_fork(mm, oldmm);
305 if (retval)
306 goto out;
293 307
294 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 308 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
295 struct file *file; 309 struct file *file;
@@ -420,22 +434,30 @@ __setup("coredump_filter=", coredump_filter_setup);
420 434
421#include <linux/init_task.h> 435#include <linux/init_task.h>
422 436
437static void mm_init_aio(struct mm_struct *mm)
438{
439#ifdef CONFIG_AIO
440 spin_lock_init(&mm->ioctx_lock);
441 INIT_HLIST_HEAD(&mm->ioctx_list);
442#endif
443}
444
423static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) 445static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
424{ 446{
425 atomic_set(&mm->mm_users, 1); 447 atomic_set(&mm->mm_users, 1);
426 atomic_set(&mm->mm_count, 1); 448 atomic_set(&mm->mm_count, 1);
427 init_rwsem(&mm->mmap_sem); 449 init_rwsem(&mm->mmap_sem);
428 INIT_LIST_HEAD(&mm->mmlist); 450 INIT_LIST_HEAD(&mm->mmlist);
429 mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; 451 mm->flags = (current->mm) ?
452 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
430 mm->core_state = NULL; 453 mm->core_state = NULL;
431 mm->nr_ptes = 0; 454 mm->nr_ptes = 0;
432 set_mm_counter(mm, file_rss, 0); 455 set_mm_counter(mm, file_rss, 0);
433 set_mm_counter(mm, anon_rss, 0); 456 set_mm_counter(mm, anon_rss, 0);
434 spin_lock_init(&mm->page_table_lock); 457 spin_lock_init(&mm->page_table_lock);
435 spin_lock_init(&mm->ioctx_lock);
436 INIT_HLIST_HEAD(&mm->ioctx_list);
437 mm->free_area_cache = TASK_UNMAPPED_BASE; 458 mm->free_area_cache = TASK_UNMAPPED_BASE;
438 mm->cached_hole_size = ~0UL; 459 mm->cached_hole_size = ~0UL;
460 mm_init_aio(mm);
439 mm_init_owner(mm, p); 461 mm_init_owner(mm, p);
440 462
441 if (likely(!mm_alloc_pgd(mm))) { 463 if (likely(!mm_alloc_pgd(mm))) {
@@ -487,6 +509,7 @@ void mmput(struct mm_struct *mm)
487 509
488 if (atomic_dec_and_test(&mm->mm_users)) { 510 if (atomic_dec_and_test(&mm->mm_users)) {
489 exit_aio(mm); 511 exit_aio(mm);
512 ksm_exit(mm);
490 exit_mmap(mm); 513 exit_mmap(mm);
491 set_mm_exe_file(mm, NULL); 514 set_mm_exe_file(mm, NULL);
492 if (!list_empty(&mm->mmlist)) { 515 if (!list_empty(&mm->mmlist)) {
@@ -495,6 +518,8 @@ void mmput(struct mm_struct *mm)
495 spin_unlock(&mmlist_lock); 518 spin_unlock(&mmlist_lock);
496 } 519 }
497 put_swap_token(mm); 520 put_swap_token(mm);
521 if (mm->binfmt)
522 module_put(mm->binfmt->module);
498 mmdrop(mm); 523 mmdrop(mm);
499 } 524 }
500} 525}
@@ -545,12 +570,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
545 570
546 /* Get rid of any futexes when releasing the mm */ 571 /* Get rid of any futexes when releasing the mm */
547#ifdef CONFIG_FUTEX 572#ifdef CONFIG_FUTEX
548 if (unlikely(tsk->robust_list)) 573 if (unlikely(tsk->robust_list)) {
549 exit_robust_list(tsk); 574 exit_robust_list(tsk);
575 tsk->robust_list = NULL;
576 }
550#ifdef CONFIG_COMPAT 577#ifdef CONFIG_COMPAT
551 if (unlikely(tsk->compat_robust_list)) 578 if (unlikely(tsk->compat_robust_list)) {
552 compat_exit_robust_list(tsk); 579 compat_exit_robust_list(tsk);
580 tsk->compat_robust_list = NULL;
581 }
553#endif 582#endif
583 if (unlikely(!list_empty(&tsk->pi_state_list)))
584 exit_pi_state_list(tsk);
554#endif 585#endif
555 586
556 /* Get rid of any cached register state */ 587 /* Get rid of any cached register state */
@@ -568,18 +599,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
568 * the value intact in a core dump, and to save the unnecessary 599 * the value intact in a core dump, and to save the unnecessary
569 * trouble otherwise. Userland only wants this done for a sys_exit. 600 * trouble otherwise. Userland only wants this done for a sys_exit.
570 */ 601 */
571 if (tsk->clear_child_tid 602 if (tsk->clear_child_tid) {
572 && !(tsk->flags & PF_SIGNALED) 603 if (!(tsk->flags & PF_SIGNALED) &&
573 && atomic_read(&mm->mm_users) > 1) { 604 atomic_read(&mm->mm_users) > 1) {
574 u32 __user * tidptr = tsk->clear_child_tid; 605 /*
606 * We don't check the error code - if userspace has
607 * not set up a proper pointer then tough luck.
608 */
609 put_user(0, tsk->clear_child_tid);
610 sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
611 1, NULL, NULL, 0);
612 }
575 tsk->clear_child_tid = NULL; 613 tsk->clear_child_tid = NULL;
576
577 /*
578 * We don't check the error code - if userspace has
579 * not set up a proper pointer then tough luck.
580 */
581 put_user(0, tidptr);
582 sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
583 } 614 }
584} 615}
585 616
@@ -620,9 +651,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
620 mm->hiwater_rss = get_mm_rss(mm); 651 mm->hiwater_rss = get_mm_rss(mm);
621 mm->hiwater_vm = mm->total_vm; 652 mm->hiwater_vm = mm->total_vm;
622 653
654 if (mm->binfmt && !try_module_get(mm->binfmt->module))
655 goto free_pt;
656
623 return mm; 657 return mm;
624 658
625free_pt: 659free_pt:
660 /* don't put binfmt in mmput, we haven't got module yet */
661 mm->binfmt = NULL;
626 mmput(mm); 662 mmput(mm);
627 663
628fail_nomem: 664fail_nomem:
@@ -790,10 +826,10 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
790 thread_group_cputime_init(sig); 826 thread_group_cputime_init(sig);
791 827
792 /* Expiration times and increments. */ 828 /* Expiration times and increments. */
793 sig->it_virt_expires = cputime_zero; 829 sig->it[CPUCLOCK_PROF].expires = cputime_zero;
794 sig->it_virt_incr = cputime_zero; 830 sig->it[CPUCLOCK_PROF].incr = cputime_zero;
795 sig->it_prof_expires = cputime_zero; 831 sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
796 sig->it_prof_incr = cputime_zero; 832 sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
797 833
798 /* Cached expiration times. */ 834 /* Cached expiration times. */
799 sig->cputime_expires.prof_exp = cputime_zero; 835 sig->cputime_expires.prof_exp = cputime_zero;
@@ -816,11 +852,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
816{ 852{
817 struct signal_struct *sig; 853 struct signal_struct *sig;
818 854
819 if (clone_flags & CLONE_THREAD) { 855 if (clone_flags & CLONE_THREAD)
820 atomic_inc(&current->signal->count);
821 atomic_inc(&current->signal->live);
822 return 0; 856 return 0;
823 }
824 857
825 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 858 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
826 tsk->signal = sig; 859 tsk->signal = sig;
@@ -854,6 +887,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
854 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 887 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
855 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 888 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
856 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 889 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
890 sig->maxrss = sig->cmaxrss = 0;
857 task_io_accounting_init(&sig->ioac); 891 task_io_accounting_init(&sig->ioac);
858 sig->sum_sched_runtime = 0; 892 sig->sum_sched_runtime = 0;
859 taskstats_tgid_init(sig); 893 taskstats_tgid_init(sig);
@@ -868,6 +902,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
868 902
869 tty_audit_fork(sig); 903 tty_audit_fork(sig);
870 904
905 sig->oom_adj = current->signal->oom_adj;
906
871 return 0; 907 return 0;
872} 908}
873 909
@@ -878,16 +914,6 @@ void __cleanup_signal(struct signal_struct *sig)
878 kmem_cache_free(signal_cachep, sig); 914 kmem_cache_free(signal_cachep, sig);
879} 915}
880 916
881static void cleanup_signal(struct task_struct *tsk)
882{
883 struct signal_struct *sig = tsk->signal;
884
885 atomic_dec(&sig->live);
886
887 if (atomic_dec_and_test(&sig->count))
888 __cleanup_signal(sig);
889}
890
891static void copy_flags(unsigned long clone_flags, struct task_struct *p) 917static void copy_flags(unsigned long clone_flags, struct task_struct *p)
892{ 918{
893 unsigned long new_flags = p->flags; 919 unsigned long new_flags = p->flags;
@@ -973,6 +999,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
973 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) 999 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
974 return ERR_PTR(-EINVAL); 1000 return ERR_PTR(-EINVAL);
975 1001
1002 /*
1003 * Siblings of global init remain as zombies on exit since they are
1004 * not reaped by their parent (swapper). To solve this and to avoid
1005 * multi-rooted process trees, prevent global and container-inits
1006 * from creating siblings.
1007 */
1008 if ((clone_flags & CLONE_PARENT) &&
1009 current->signal->flags & SIGNAL_UNKILLABLE)
1010 return ERR_PTR(-EINVAL);
1011
976 retval = security_task_create(clone_flags); 1012 retval = security_task_create(clone_flags);
977 if (retval) 1013 if (retval)
978 goto fork_out; 1014 goto fork_out;
@@ -1014,18 +1050,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1014 if (!try_module_get(task_thread_info(p)->exec_domain->module)) 1050 if (!try_module_get(task_thread_info(p)->exec_domain->module))
1015 goto bad_fork_cleanup_count; 1051 goto bad_fork_cleanup_count;
1016 1052
1017 if (p->binfmt && !try_module_get(p->binfmt->module))
1018 goto bad_fork_cleanup_put_domain;
1019
1020 p->did_exec = 0; 1053 p->did_exec = 0;
1021 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ 1054 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
1022 copy_flags(clone_flags, p); 1055 copy_flags(clone_flags, p);
1023 INIT_LIST_HEAD(&p->children); 1056 INIT_LIST_HEAD(&p->children);
1024 INIT_LIST_HEAD(&p->sibling); 1057 INIT_LIST_HEAD(&p->sibling);
1025#ifdef CONFIG_PREEMPT_RCU 1058 rcu_copy_process(p);
1026 p->rcu_read_lock_nesting = 0;
1027 p->rcu_flipctr_idx = 0;
1028#endif /* #ifdef CONFIG_PREEMPT_RCU */
1029 p->vfork_done = NULL; 1059 p->vfork_done = NULL;
1030 spin_lock_init(&p->alloc_lock); 1060 spin_lock_init(&p->alloc_lock);
1031 1061
@@ -1093,10 +1123,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1093 1123
1094 p->bts = NULL; 1124 p->bts = NULL;
1095 1125
1126 p->stack_start = stack_start;
1127
1096 /* Perform scheduler related setup. Assign this task to a CPU. */ 1128 /* Perform scheduler related setup. Assign this task to a CPU. */
1097 sched_fork(p, clone_flags); 1129 sched_fork(p, clone_flags);
1098 1130
1099 retval = perf_counter_init_task(p); 1131 retval = perf_event_init_task(p);
1100 if (retval) 1132 if (retval)
1101 goto bad_fork_cleanup_policy; 1133 goto bad_fork_cleanup_policy;
1102 1134
@@ -1240,6 +1272,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1240 } 1272 }
1241 1273
1242 if (clone_flags & CLONE_THREAD) { 1274 if (clone_flags & CLONE_THREAD) {
1275 atomic_inc(&current->signal->count);
1276 atomic_inc(&current->signal->live);
1243 p->group_leader = current->group_leader; 1277 p->group_leader = current->group_leader;
1244 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1278 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1245 } 1279 }
@@ -1269,6 +1303,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1269 write_unlock_irq(&tasklist_lock); 1303 write_unlock_irq(&tasklist_lock);
1270 proc_fork_connector(p); 1304 proc_fork_connector(p);
1271 cgroup_post_fork(p); 1305 cgroup_post_fork(p);
1306 perf_event_fork(p);
1272 return p; 1307 return p;
1273 1308
1274bad_fork_free_pid: 1309bad_fork_free_pid:
@@ -1282,7 +1317,8 @@ bad_fork_cleanup_mm:
1282 if (p->mm) 1317 if (p->mm)
1283 mmput(p->mm); 1318 mmput(p->mm);
1284bad_fork_cleanup_signal: 1319bad_fork_cleanup_signal:
1285 cleanup_signal(p); 1320 if (!(clone_flags & CLONE_THREAD))
1321 __cleanup_signal(p->signal);
1286bad_fork_cleanup_sighand: 1322bad_fork_cleanup_sighand:
1287 __cleanup_sighand(p->sighand); 1323 __cleanup_sighand(p->sighand);
1288bad_fork_cleanup_fs: 1324bad_fork_cleanup_fs:
@@ -1294,21 +1330,17 @@ bad_fork_cleanup_semundo:
1294bad_fork_cleanup_audit: 1330bad_fork_cleanup_audit:
1295 audit_free(p); 1331 audit_free(p);
1296bad_fork_cleanup_policy: 1332bad_fork_cleanup_policy:
1297 perf_counter_free_task(p); 1333 perf_event_free_task(p);
1298#ifdef CONFIG_NUMA 1334#ifdef CONFIG_NUMA
1299 mpol_put(p->mempolicy); 1335 mpol_put(p->mempolicy);
1300bad_fork_cleanup_cgroup: 1336bad_fork_cleanup_cgroup:
1301#endif 1337#endif
1302 cgroup_exit(p, cgroup_callbacks_done); 1338 cgroup_exit(p, cgroup_callbacks_done);
1303 delayacct_tsk_free(p); 1339 delayacct_tsk_free(p);
1304 if (p->binfmt)
1305 module_put(p->binfmt->module);
1306bad_fork_cleanup_put_domain:
1307 module_put(task_thread_info(p)->exec_domain->module); 1340 module_put(task_thread_info(p)->exec_domain->module);
1308bad_fork_cleanup_count: 1341bad_fork_cleanup_count:
1309 atomic_dec(&p->cred->user->processes); 1342 atomic_dec(&p->cred->user->processes);
1310 put_cred(p->real_cred); 1343 exit_creds(p);
1311 put_cred(p->cred);
1312bad_fork_free: 1344bad_fork_free:
1313 free_task(p); 1345 free_task(p);
1314fork_out: 1346fork_out:
@@ -1408,12 +1440,6 @@ long do_fork(unsigned long clone_flags,
1408 if (clone_flags & CLONE_VFORK) { 1440 if (clone_flags & CLONE_VFORK) {
1409 p->vfork_done = &vfork; 1441 p->vfork_done = &vfork;
1410 init_completion(&vfork); 1442 init_completion(&vfork);
1411 } else if (!(clone_flags & CLONE_VM)) {
1412 /*
1413 * vfork will do an exec which will call
1414 * set_task_comm()
1415 */
1416 perf_counter_fork(p);
1417 } 1443 }
1418 1444
1419 audit_finish_fork(p); 1445 audit_finish_fork(p);