author     Andrea Bastoni <bastoni@cs.unc.edu>   2011-08-27 09:43:54 -0400
committer  Andrea Bastoni <bastoni@cs.unc.edu>   2011-08-27 10:06:11 -0400
commit     7b1bb388bc879ffcc6c69b567816d5c354afe42b (patch)
tree       5a217fdfb0b5e5a327bdcd624506337c1ae1fe32 /kernel/fork.c
parent     7d754596756240fa918b94cd0c3011c77a638987 (diff)
parent     02f8c6aee8df3cdc935e9bdd4f2d020306035dbe (diff)
Merge 'Linux v3.0' into Litmus
Some notes:
* The Litmus^RT scheduling class is the topmost scheduling class
  (above stop_sched_class); see the first sketch below.
* The scheduler_ipi() function (e.g., called from
  smp_reschedule_interrupt()) may increase IPI latencies.
* Added a path into schedule() to quickly re-evaluate the scheduling
  decision without becoming preemptible again (second sketch below).
  Such a path used to exist as standard before the removal of the BKL.
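
To make the class-ordering note concrete, here is a minimal, self-contained
sketch in plain userspace C. The struct layout, names, and pick functions are
simplified stand-ins invented for this illustration (not the kernel's actual
sched_class or pick_next_task()); it only shows why a class linked in ahead of
stop_sched_class is always asked for a runnable task first.

    #include <stddef.h>
    #include <stdio.h>

    struct task { const char *comm; };

    /* Simplified stand-in for the kernel's priority-ordered chain of
     * scheduling classes, walked from the highest-priority class down. */
    struct sched_class {
        const char *name;
        const struct sched_class *next;
        struct task *(*pick_next)(void);
    };

    static struct task litmus_job = { "rt-task" };

    static struct task *pick_litmus(void) { return &litmus_job; }
    static struct task *pick_none(void)   { return NULL; }

    static const struct sched_class idle_class = { "idle", NULL,         pick_none };
    static const struct sched_class fair_class = { "fair", &idle_class,  pick_none };
    static const struct sched_class rt_class   = { "rt",   &fair_class,  pick_none };
    static const struct sched_class stop_class = { "stop", &rt_class,    pick_none };
    /* Per the note above: the Litmus^RT class sits ahead of stop_class,
     * so it is the first class asked for a runnable task. */
    static const struct sched_class litmus_class = { "litmus", &stop_class, pick_litmus };

    int main(void)
    {
        const struct sched_class *class;
        struct task *p;

        /* Rough analogue of pick_next_task(): ask each class in turn and
         * stop at the first one that has something to run. */
        for (class = &litmus_class; class != NULL; class = class->next) {
            p = class->pick_next();
            if (p) {
                printf("picked \"%s\" from the %s class\n", p->comm, class->name);
                break;
            }
        }
        return 0;
    }

Because the walk stops at the first class that returns a task, whatever the
topmost class manages always wins over the classes below it.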
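
The schedule() note is about control flow. The toy program below (plain C; the
label names echo the pre-3.0 need_resched / need_resched_nonpreemptible
structure of schedule(), everything else is invented for the example) sketches
the idea of looping back to the pick while preemption is still disabled,
instead of returning and waiting for another reschedule interrupt. It is a
sketch of the idea under those assumptions, not the Litmus^RT code.

    #include <stdbool.h>
    #include <stdio.h>

    static int  preempt_count;            /* > 0 means "not preemptible"   */
    static bool resched_pending = true;   /* pretend an IPI/timer set this */

    static void preempt_disable(void)           { preempt_count++; }
    static void preempt_enable_no_resched(void) { preempt_count--; }

    static void pick_next_and_switch(void)
    {
        static int stale_decisions = 1;   /* force exactly one quick re-pick */

        printf("pick_next_task() with preempt_count=%d\n", preempt_count);
        resched_pending = stale_decisions-- > 0;
    }

    int main(void)
    {
    need_resched:
        preempt_disable();

    need_resched_nonpreemptible:
        pick_next_and_switch();

        /* The added path from the note above: if the decision went stale
         * while picking, jump back and pick again while still
         * non-preemptible, much like the BKL-era schedule() allowed. */
        if (resched_pending)
            goto need_resched_nonpreemptible;

        preempt_enable_no_resched();
        if (resched_pending)
            goto need_resched;
        return 0;
    }

Running it prints the pick twice but toggles the preemption counter only once,
which is the latency-saving point of the note.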
Conflicts:
Makefile
arch/arm/kernel/calls.S
arch/arm/kernel/smp.c
arch/x86/include/asm/unistd_32.h
arch/x86/kernel/smp.c
arch/x86/kernel/syscall_table_32.S
include/linux/hrtimer.h
kernel/printk.c
kernel/sched.c
kernel/sched_fair.c
Diffstat (limited to 'kernel/fork.c')
-rw-r--r--   kernel/fork.c   305
1 file changed, 151 insertions, 154 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index ab7f29d906c7..25c6111fe3a6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@ | |||
40 | #include <linux/tracehook.h> | 40 | #include <linux/tracehook.h> |
41 | #include <linux/futex.h> | 41 | #include <linux/futex.h> |
42 | #include <linux/compat.h> | 42 | #include <linux/compat.h> |
43 | #include <linux/kthread.h> | ||
43 | #include <linux/task_io_accounting_ops.h> | 44 | #include <linux/task_io_accounting_ops.h> |
44 | #include <linux/rcupdate.h> | 45 | #include <linux/rcupdate.h> |
45 | #include <linux/ptrace.h> | 46 | #include <linux/ptrace.h> |
@@ -58,13 +59,14 @@ | |||
58 | #include <linux/taskstats_kern.h> | 59 | #include <linux/taskstats_kern.h> |
59 | #include <linux/random.h> | 60 | #include <linux/random.h> |
60 | #include <linux/tty.h> | 61 | #include <linux/tty.h> |
61 | #include <linux/proc_fs.h> | ||
62 | #include <linux/blkdev.h> | 62 | #include <linux/blkdev.h> |
63 | #include <linux/fs_struct.h> | 63 | #include <linux/fs_struct.h> |
64 | #include <linux/magic.h> | 64 | #include <linux/magic.h> |
65 | #include <linux/perf_event.h> | 65 | #include <linux/perf_event.h> |
66 | #include <linux/posix-timers.h> | 66 | #include <linux/posix-timers.h> |
67 | #include <linux/user-return-notifier.h> | 67 | #include <linux/user-return-notifier.h> |
68 | #include <linux/oom.h> | ||
69 | #include <linux/khugepaged.h> | ||
68 | 70 | ||
69 | #include <asm/pgtable.h> | 71 | #include <asm/pgtable.h> |
70 | #include <asm/pgalloc.h> | 72 | #include <asm/pgalloc.h> |
@@ -110,20 +112,25 @@ int nr_processes(void) | |||
110 | } | 112 | } |
111 | 113 | ||
112 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | 114 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR |
113 | # define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) | 115 | # define alloc_task_struct_node(node) \ |
114 | # define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) | 116 | kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) |
117 | # define free_task_struct(tsk) \ | ||
118 | kmem_cache_free(task_struct_cachep, (tsk)) | ||
115 | static struct kmem_cache *task_struct_cachep; | 119 | static struct kmem_cache *task_struct_cachep; |
116 | #endif | 120 | #endif |
117 | 121 | ||
118 | #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR | 122 | #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR |
119 | static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) | 123 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, |
124 | int node) | ||
120 | { | 125 | { |
121 | #ifdef CONFIG_DEBUG_STACK_USAGE | 126 | #ifdef CONFIG_DEBUG_STACK_USAGE |
122 | gfp_t mask = GFP_KERNEL | __GFP_ZERO; | 127 | gfp_t mask = GFP_KERNEL | __GFP_ZERO; |
123 | #else | 128 | #else |
124 | gfp_t mask = GFP_KERNEL; | 129 | gfp_t mask = GFP_KERNEL; |
125 | #endif | 130 | #endif |
126 | return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); | 131 | struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); |
132 | |||
133 | return page ? page_address(page) : NULL; | ||
127 | } | 134 | } |
128 | 135 | ||
129 | static inline void free_thread_info(struct thread_info *ti) | 136 | static inline void free_thread_info(struct thread_info *ti) |
@@ -171,6 +178,7 @@ EXPORT_SYMBOL(free_task); | |||
171 | static inline void free_signal_struct(struct signal_struct *sig) | 178 | static inline void free_signal_struct(struct signal_struct *sig) |
172 | { | 179 | { |
173 | taskstats_tgid_free(sig); | 180 | taskstats_tgid_free(sig); |
181 | sched_autogroup_exit(sig); | ||
174 | kmem_cache_free(signal_cachep, sig); | 182 | kmem_cache_free(signal_cachep, sig); |
175 | } | 183 | } |
176 | 184 | ||
@@ -194,6 +202,7 @@ void __put_task_struct(struct task_struct *tsk) | |||
194 | if (!profile_handoff_task(tsk)) | 202 | if (!profile_handoff_task(tsk)) |
195 | free_task(tsk); | 203 | free_task(tsk); |
196 | } | 204 | } |
205 | EXPORT_SYMBOL_GPL(__put_task_struct); | ||
197 | 206 | ||
198 | /* | 207 | /* |
199 | * macro override instead of weak attribute alias, to workaround | 208 | * macro override instead of weak attribute alias, to workaround |
@@ -249,16 +258,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
249 | struct task_struct *tsk; | 258 | struct task_struct *tsk; |
250 | struct thread_info *ti; | 259 | struct thread_info *ti; |
251 | unsigned long *stackend; | 260 | unsigned long *stackend; |
252 | 261 | int node = tsk_fork_get_node(orig); | |
253 | int err; | 262 | int err; |
254 | 263 | ||
255 | prepare_to_copy(orig); | 264 | prepare_to_copy(orig); |
256 | 265 | ||
257 | tsk = alloc_task_struct(); | 266 | tsk = alloc_task_struct_node(node); |
258 | if (!tsk) | 267 | if (!tsk) |
259 | return NULL; | 268 | return NULL; |
260 | 269 | ||
261 | ti = alloc_thread_info(tsk); | 270 | ti = alloc_thread_info_node(tsk, node); |
262 | if (!ti) { | 271 | if (!ti) { |
263 | free_task_struct(tsk); | 272 | free_task_struct(tsk); |
264 | return NULL; | 273 | return NULL; |
@@ -279,6 +288,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
279 | 288 | ||
280 | setup_thread_stack(tsk, orig); | 289 | setup_thread_stack(tsk, orig); |
281 | clear_user_return_notifier(tsk); | 290 | clear_user_return_notifier(tsk); |
291 | clear_tsk_need_resched(tsk); | ||
282 | stackend = end_of_stack(tsk); | 292 | stackend = end_of_stack(tsk); |
283 | *stackend = STACK_END_MAGIC; /* for overflow detection */ | 293 | *stackend = STACK_END_MAGIC; /* for overflow detection */ |
284 | 294 | ||
@@ -334,6 +344,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
334 | retval = ksm_fork(mm, oldmm); | 344 | retval = ksm_fork(mm, oldmm); |
335 | if (retval) | 345 | if (retval) |
336 | goto out; | 346 | goto out; |
347 | retval = khugepaged_fork(mm, oldmm); | ||
348 | if (retval) | ||
349 | goto out; | ||
337 | 350 | ||
338 | prev = NULL; | 351 | prev = NULL; |
339 | for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { | 352 | for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { |
@@ -376,15 +389,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
376 | get_file(file); | 389 | get_file(file); |
377 | if (tmp->vm_flags & VM_DENYWRITE) | 390 | if (tmp->vm_flags & VM_DENYWRITE) |
378 | atomic_dec(&inode->i_writecount); | 391 | atomic_dec(&inode->i_writecount); |
379 | spin_lock(&mapping->i_mmap_lock); | 392 | mutex_lock(&mapping->i_mmap_mutex); |
380 | if (tmp->vm_flags & VM_SHARED) | 393 | if (tmp->vm_flags & VM_SHARED) |
381 | mapping->i_mmap_writable++; | 394 | mapping->i_mmap_writable++; |
382 | tmp->vm_truncate_count = mpnt->vm_truncate_count; | ||
383 | flush_dcache_mmap_lock(mapping); | 395 | flush_dcache_mmap_lock(mapping); |
384 | /* insert tmp into the share list, just after mpnt */ | 396 | /* insert tmp into the share list, just after mpnt */ |
385 | vma_prio_tree_add(tmp, mpnt); | 397 | vma_prio_tree_add(tmp, mpnt); |
386 | flush_dcache_mmap_unlock(mapping); | 398 | flush_dcache_mmap_unlock(mapping); |
387 | spin_unlock(&mapping->i_mmap_lock); | 399 | mutex_unlock(&mapping->i_mmap_mutex); |
388 | } | 400 | } |
389 | 401 | ||
390 | /* | 402 | /* |
@@ -495,6 +507,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) | |||
495 | mm->cached_hole_size = ~0UL; | 507 | mm->cached_hole_size = ~0UL; |
496 | mm_init_aio(mm); | 508 | mm_init_aio(mm); |
497 | mm_init_owner(mm, p); | 509 | mm_init_owner(mm, p); |
510 | atomic_set(&mm->oom_disable_count, 0); | ||
498 | 511 | ||
499 | if (likely(!mm_alloc_pgd(mm))) { | 512 | if (likely(!mm_alloc_pgd(mm))) { |
500 | mm->def_flags = 0; | 513 | mm->def_flags = 0; |
@@ -514,11 +527,12 @@ struct mm_struct * mm_alloc(void) | |||
514 | struct mm_struct * mm; | 527 | struct mm_struct * mm; |
515 | 528 | ||
516 | mm = allocate_mm(); | 529 | mm = allocate_mm(); |
517 | if (mm) { | 530 | if (!mm) |
518 | memset(mm, 0, sizeof(*mm)); | 531 | return NULL; |
519 | mm = mm_init(mm, current); | 532 | |
520 | } | 533 | memset(mm, 0, sizeof(*mm)); |
521 | return mm; | 534 | mm_init_cpumask(mm); |
535 | return mm_init(mm, current); | ||
522 | } | 536 | } |
523 | 537 | ||
524 | /* | 538 | /* |
@@ -532,6 +546,9 @@ void __mmdrop(struct mm_struct *mm) | |||
532 | mm_free_pgd(mm); | 546 | mm_free_pgd(mm); |
533 | destroy_context(mm); | 547 | destroy_context(mm); |
534 | mmu_notifier_mm_destroy(mm); | 548 | mmu_notifier_mm_destroy(mm); |
549 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
550 | VM_BUG_ON(mm->pmd_huge_pte); | ||
551 | #endif | ||
535 | free_mm(mm); | 552 | free_mm(mm); |
536 | } | 553 | } |
537 | EXPORT_SYMBOL_GPL(__mmdrop); | 554 | EXPORT_SYMBOL_GPL(__mmdrop); |
@@ -546,6 +563,7 @@ void mmput(struct mm_struct *mm) | |||
546 | if (atomic_dec_and_test(&mm->mm_users)) { | 563 | if (atomic_dec_and_test(&mm->mm_users)) { |
547 | exit_aio(mm); | 564 | exit_aio(mm); |
548 | ksm_exit(mm); | 565 | ksm_exit(mm); |
566 | khugepaged_exit(mm); /* must run before exit_mmap */ | ||
549 | exit_mmap(mm); | 567 | exit_mmap(mm); |
550 | set_mm_exe_file(mm, NULL); | 568 | set_mm_exe_file(mm, NULL); |
551 | if (!list_empty(&mm->mmlist)) { | 569 | if (!list_empty(&mm->mmlist)) { |
@@ -561,6 +579,57 @@ void mmput(struct mm_struct *mm) | |||
561 | } | 579 | } |
562 | EXPORT_SYMBOL_GPL(mmput); | 580 | EXPORT_SYMBOL_GPL(mmput); |
563 | 581 | ||
582 | /* | ||
583 | * We added or removed a vma mapping the executable. The vmas are only mapped | ||
584 | * during exec and are not mapped with the mmap system call. | ||
585 | * Callers must hold down_write() on the mm's mmap_sem for these | ||
586 | */ | ||
587 | void added_exe_file_vma(struct mm_struct *mm) | ||
588 | { | ||
589 | mm->num_exe_file_vmas++; | ||
590 | } | ||
591 | |||
592 | void removed_exe_file_vma(struct mm_struct *mm) | ||
593 | { | ||
594 | mm->num_exe_file_vmas--; | ||
595 | if ((mm->num_exe_file_vmas == 0) && mm->exe_file){ | ||
596 | fput(mm->exe_file); | ||
597 | mm->exe_file = NULL; | ||
598 | } | ||
599 | |||
600 | } | ||
601 | |||
602 | void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) | ||
603 | { | ||
604 | if (new_exe_file) | ||
605 | get_file(new_exe_file); | ||
606 | if (mm->exe_file) | ||
607 | fput(mm->exe_file); | ||
608 | mm->exe_file = new_exe_file; | ||
609 | mm->num_exe_file_vmas = 0; | ||
610 | } | ||
611 | |||
612 | struct file *get_mm_exe_file(struct mm_struct *mm) | ||
613 | { | ||
614 | struct file *exe_file; | ||
615 | |||
616 | /* We need mmap_sem to protect against races with removal of | ||
617 | * VM_EXECUTABLE vmas */ | ||
618 | down_read(&mm->mmap_sem); | ||
619 | exe_file = mm->exe_file; | ||
620 | if (exe_file) | ||
621 | get_file(exe_file); | ||
622 | up_read(&mm->mmap_sem); | ||
623 | return exe_file; | ||
624 | } | ||
625 | |||
626 | static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) | ||
627 | { | ||
628 | /* It's safe to write the exe_file pointer without exe_file_lock because | ||
629 | * this is called during fork when the task is not yet in /proc */ | ||
630 | newmm->exe_file = get_mm_exe_file(oldmm); | ||
631 | } | ||
632 | |||
564 | /** | 633 | /** |
565 | * get_task_mm - acquire a reference to the task's mm | 634 | * get_task_mm - acquire a reference to the task's mm |
566 | * | 635 | * |
@@ -667,11 +736,16 @@ struct mm_struct *dup_mm(struct task_struct *tsk) | |||
667 | goto fail_nomem; | 736 | goto fail_nomem; |
668 | 737 | ||
669 | memcpy(mm, oldmm, sizeof(*mm)); | 738 | memcpy(mm, oldmm, sizeof(*mm)); |
739 | mm_init_cpumask(mm); | ||
670 | 740 | ||
671 | /* Initializing for Swap token stuff */ | 741 | /* Initializing for Swap token stuff */ |
672 | mm->token_priority = 0; | 742 | mm->token_priority = 0; |
673 | mm->last_interval = 0; | 743 | mm->last_interval = 0; |
674 | 744 | ||
745 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
746 | mm->pmd_huge_pte = NULL; | ||
747 | #endif | ||
748 | |||
675 | if (!mm_init(mm, tsk)) | 749 | if (!mm_init(mm, tsk)) |
676 | goto fail_nomem; | 750 | goto fail_nomem; |
677 | 751 | ||
@@ -748,6 +822,8 @@ good_mm: | |||
748 | /* Initializing for Swap token stuff */ | 822 | /* Initializing for Swap token stuff */ |
749 | mm->token_priority = 0; | 823 | mm->token_priority = 0; |
750 | mm->last_interval = 0; | 824 | mm->last_interval = 0; |
825 | if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
826 | atomic_inc(&mm->oom_disable_count); | ||
751 | 827 | ||
752 | tsk->mm = mm; | 828 | tsk->mm = mm; |
753 | tsk->active_mm = mm; | 829 | tsk->active_mm = mm; |
@@ -907,9 +983,17 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
907 | posix_cpu_timers_init_group(sig); | 983 | posix_cpu_timers_init_group(sig); |
908 | 984 | ||
909 | tty_audit_fork(sig); | 985 | tty_audit_fork(sig); |
986 | sched_autogroup_fork(sig); | ||
987 | |||
988 | #ifdef CONFIG_CGROUPS | ||
989 | init_rwsem(&sig->threadgroup_fork_lock); | ||
990 | #endif | ||
910 | 991 | ||
911 | sig->oom_adj = current->signal->oom_adj; | 992 | sig->oom_adj = current->signal->oom_adj; |
912 | sig->oom_score_adj = current->signal->oom_score_adj; | 993 | sig->oom_score_adj = current->signal->oom_score_adj; |
994 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; | ||
995 | |||
996 | mutex_init(&sig->cred_guard_mutex); | ||
913 | 997 | ||
914 | return 0; | 998 | return 0; |
915 | } | 999 | } |
@@ -1081,12 +1165,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1081 | 1165 | ||
1082 | posix_cpu_timers_init(p); | 1166 | posix_cpu_timers_init(p); |
1083 | 1167 | ||
1084 | p->lock_depth = -1; /* -1 = no lock */ | ||
1085 | do_posix_clock_monotonic_gettime(&p->start_time); | 1168 | do_posix_clock_monotonic_gettime(&p->start_time); |
1086 | p->real_start_time = p->start_time; | 1169 | p->real_start_time = p->start_time; |
1087 | monotonic_to_bootbased(&p->real_start_time); | 1170 | monotonic_to_bootbased(&p->real_start_time); |
1088 | p->io_context = NULL; | 1171 | p->io_context = NULL; |
1089 | p->audit_context = NULL; | 1172 | p->audit_context = NULL; |
1173 | if (clone_flags & CLONE_THREAD) | ||
1174 | threadgroup_fork_read_lock(current); | ||
1090 | cgroup_fork(p); | 1175 | cgroup_fork(p); |
1091 | #ifdef CONFIG_NUMA | 1176 | #ifdef CONFIG_NUMA |
1092 | p->mempolicy = mpol_dup(p->mempolicy); | 1177 | p->mempolicy = mpol_dup(p->mempolicy); |
@@ -1131,7 +1216,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1131 | #endif | 1216 | #endif |
1132 | 1217 | ||
1133 | /* Perform scheduler related setup. Assign this task to a CPU. */ | 1218 | /* Perform scheduler related setup. Assign this task to a CPU. */ |
1134 | sched_fork(p, clone_flags); | 1219 | sched_fork(p); |
1135 | 1220 | ||
1136 | retval = perf_event_init_task(p); | 1221 | retval = perf_event_init_task(p); |
1137 | if (retval) | 1222 | if (retval) |
@@ -1165,12 +1250,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1165 | pid = alloc_pid(p->nsproxy->pid_ns); | 1250 | pid = alloc_pid(p->nsproxy->pid_ns); |
1166 | if (!pid) | 1251 | if (!pid) |
1167 | goto bad_fork_cleanup_io; | 1252 | goto bad_fork_cleanup_io; |
1168 | |||
1169 | if (clone_flags & CLONE_NEWPID) { | ||
1170 | retval = pid_ns_prepare_proc(p->nsproxy->pid_ns); | ||
1171 | if (retval < 0) | ||
1172 | goto bad_fork_free_pid; | ||
1173 | } | ||
1174 | } | 1253 | } |
1175 | 1254 | ||
1176 | p->pid = pid_nr(pid); | 1255 | p->pid = pid_nr(pid); |
@@ -1178,17 +1257,14 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1178 | if (clone_flags & CLONE_THREAD) | 1257 | if (clone_flags & CLONE_THREAD) |
1179 | p->tgid = current->tgid; | 1258 | p->tgid = current->tgid; |
1180 | 1259 | ||
1181 | if (current->nsproxy != p->nsproxy) { | ||
1182 | retval = ns_cgroup_clone(p, pid); | ||
1183 | if (retval) | ||
1184 | goto bad_fork_free_pid; | ||
1185 | } | ||
1186 | |||
1187 | p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; | 1260 | p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; |
1188 | /* | 1261 | /* |
1189 | * Clear TID on mm_release()? | 1262 | * Clear TID on mm_release()? |
1190 | */ | 1263 | */ |
1191 | p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; | 1264 | p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; |
1265 | #ifdef CONFIG_BLOCK | ||
1266 | p->plug = NULL; | ||
1267 | #endif | ||
1192 | #ifdef CONFIG_FUTEX | 1268 | #ifdef CONFIG_FUTEX |
1193 | p->robust_list = NULL; | 1269 | p->robust_list = NULL; |
1194 | #ifdef CONFIG_COMPAT | 1270 | #ifdef CONFIG_COMPAT |
@@ -1274,7 +1350,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1274 | tracehook_finish_clone(p, clone_flags, trace); | 1350 | tracehook_finish_clone(p, clone_flags, trace); |
1275 | 1351 | ||
1276 | if (thread_group_leader(p)) { | 1352 | if (thread_group_leader(p)) { |
1277 | if (clone_flags & CLONE_NEWPID) | 1353 | if (is_child_reaper(pid)) |
1278 | p->nsproxy->pid_ns->child_reaper = p; | 1354 | p->nsproxy->pid_ns->child_reaper = p; |
1279 | 1355 | ||
1280 | p->signal->leader_pid = pid; | 1356 | p->signal->leader_pid = pid; |
@@ -1283,7 +1359,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1283 | attach_pid(p, PIDTYPE_SID, task_session(current)); | 1359 | attach_pid(p, PIDTYPE_SID, task_session(current)); |
1284 | list_add_tail(&p->sibling, &p->real_parent->children); | 1360 | list_add_tail(&p->sibling, &p->real_parent->children); |
1285 | list_add_tail_rcu(&p->tasks, &init_task.tasks); | 1361 | list_add_tail_rcu(&p->tasks, &init_task.tasks); |
1286 | __get_cpu_var(process_counts)++; | 1362 | __this_cpu_inc(process_counts); |
1287 | } | 1363 | } |
1288 | attach_pid(p, PIDTYPE_PID, pid); | 1364 | attach_pid(p, PIDTYPE_PID, pid); |
1289 | nr_threads++; | 1365 | nr_threads++; |
@@ -1294,6 +1370,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1294 | write_unlock_irq(&tasklist_lock); | 1370 | write_unlock_irq(&tasklist_lock); |
1295 | proc_fork_connector(p); | 1371 | proc_fork_connector(p); |
1296 | cgroup_post_fork(p); | 1372 | cgroup_post_fork(p); |
1373 | if (clone_flags & CLONE_THREAD) | ||
1374 | threadgroup_fork_read_unlock(current); | ||
1297 | perf_event_fork(p); | 1375 | perf_event_fork(p); |
1298 | return p; | 1376 | return p; |
1299 | 1377 | ||
@@ -1306,8 +1384,13 @@ bad_fork_cleanup_io: | |||
1306 | bad_fork_cleanup_namespaces: | 1384 | bad_fork_cleanup_namespaces: |
1307 | exit_task_namespaces(p); | 1385 | exit_task_namespaces(p); |
1308 | bad_fork_cleanup_mm: | 1386 | bad_fork_cleanup_mm: |
1309 | if (p->mm) | 1387 | if (p->mm) { |
1388 | task_lock(p); | ||
1389 | if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
1390 | atomic_dec(&p->mm->oom_disable_count); | ||
1391 | task_unlock(p); | ||
1310 | mmput(p->mm); | 1392 | mmput(p->mm); |
1393 | } | ||
1311 | bad_fork_cleanup_signal: | 1394 | bad_fork_cleanup_signal: |
1312 | if (!(clone_flags & CLONE_THREAD)) | 1395 | if (!(clone_flags & CLONE_THREAD)) |
1313 | free_signal_struct(p->signal); | 1396 | free_signal_struct(p->signal); |
@@ -1327,6 +1410,8 @@ bad_fork_cleanup_policy: | |||
1327 | mpol_put(p->mempolicy); | 1410 | mpol_put(p->mempolicy); |
1328 | bad_fork_cleanup_cgroup: | 1411 | bad_fork_cleanup_cgroup: |
1329 | #endif | 1412 | #endif |
1413 | if (clone_flags & CLONE_THREAD) | ||
1414 | threadgroup_fork_read_unlock(current); | ||
1330 | cgroup_exit(p, cgroup_callbacks_done); | 1415 | cgroup_exit(p, cgroup_callbacks_done); |
1331 | delayacct_tsk_free(p); | 1416 | delayacct_tsk_free(p); |
1332 | module_put(task_thread_info(p)->exec_domain->module); | 1417 | module_put(task_thread_info(p)->exec_domain->module); |
@@ -1403,23 +1488,6 @@ long do_fork(unsigned long clone_flags, | |||
1403 | } | 1488 | } |
1404 | 1489 | ||
1405 | /* | 1490 | /* |
1406 | * We hope to recycle these flags after 2.6.26 | ||
1407 | */ | ||
1408 | if (unlikely(clone_flags & CLONE_STOPPED)) { | ||
1409 | static int __read_mostly count = 100; | ||
1410 | |||
1411 | if (count > 0 && printk_ratelimit()) { | ||
1412 | char comm[TASK_COMM_LEN]; | ||
1413 | |||
1414 | count--; | ||
1415 | printk(KERN_INFO "fork(): process `%s' used deprecated " | ||
1416 | "clone flags 0x%lx\n", | ||
1417 | get_task_comm(comm, current), | ||
1418 | clone_flags & CLONE_STOPPED); | ||
1419 | } | ||
1420 | } | ||
1421 | |||
1422 | /* | ||
1423 | * When called from kernel_thread, don't do user tracing stuff. | 1491 | * When called from kernel_thread, don't do user tracing stuff. |
1424 | */ | 1492 | */ |
1425 | if (likely(user_mode(regs))) | 1493 | if (likely(user_mode(regs))) |
@@ -1457,16 +1525,7 @@ long do_fork(unsigned long clone_flags, | |||
1457 | */ | 1525 | */ |
1458 | p->flags &= ~PF_STARTING; | 1526 | p->flags &= ~PF_STARTING; |
1459 | 1527 | ||
1460 | if (unlikely(clone_flags & CLONE_STOPPED)) { | 1528 | wake_up_new_task(p); |
1461 | /* | ||
1462 | * We'll start up with an immediate SIGSTOP. | ||
1463 | */ | ||
1464 | sigaddset(&p->pending.signal, SIGSTOP); | ||
1465 | set_tsk_thread_flag(p, TIF_SIGPENDING); | ||
1466 | __set_task_state(p, TASK_STOPPED); | ||
1467 | } else { | ||
1468 | wake_up_new_task(p, clone_flags); | ||
1469 | } | ||
1470 | 1529 | ||
1471 | tracehook_report_clone_complete(trace, regs, | 1530 | tracehook_report_clone_complete(trace, regs, |
1472 | clone_flags, nr, p); | 1531 | clone_flags, nr, p); |
@@ -1510,6 +1569,13 @@ void __init proc_caches_init(void) | |||
1510 | fs_cachep = kmem_cache_create("fs_cache", | 1569 | fs_cachep = kmem_cache_create("fs_cache", |
1511 | sizeof(struct fs_struct), 0, | 1570 | sizeof(struct fs_struct), 0, |
1512 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); | 1571 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); |
1572 | /* | ||
1573 | * FIXME! The "sizeof(struct mm_struct)" currently includes the | ||
1574 | * whole struct cpumask for the OFFSTACK case. We could change | ||
1575 | * this to *only* allocate as much of it as required by the | ||
1576 | * maximum number of CPU's we can ever have. The cpumask_allocation | ||
1577 | * is at the end of the structure, exactly for that reason. | ||
1578 | */ | ||
1513 | mm_cachep = kmem_cache_create("mm_struct", | 1579 | mm_cachep = kmem_cache_create("mm_struct", |
1514 | sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, | 1580 | sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, |
1515 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); | 1581 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); |
@@ -1518,38 +1584,24 @@ void __init proc_caches_init(void) | |||
1518 | } | 1584 | } |
1519 | 1585 | ||
1520 | /* | 1586 | /* |
1521 | * Check constraints on flags passed to the unshare system call and | 1587 | * Check constraints on flags passed to the unshare system call. |
1522 | * force unsharing of additional process context as appropriate. | ||
1523 | */ | 1588 | */ |
1524 | static void check_unshare_flags(unsigned long *flags_ptr) | 1589 | static int check_unshare_flags(unsigned long unshare_flags) |
1525 | { | 1590 | { |
1591 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| | ||
1592 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| | ||
1593 | CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) | ||
1594 | return -EINVAL; | ||
1526 | /* | 1595 | /* |
1527 | * If unsharing a thread from a thread group, must also | 1596 | * Not implemented, but pretend it works if there is nothing to |
1528 | * unshare vm. | 1597 | * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND |
1529 | */ | 1598 | * needs to unshare vm. |
1530 | if (*flags_ptr & CLONE_THREAD) | ||
1531 | *flags_ptr |= CLONE_VM; | ||
1532 | |||
1533 | /* | ||
1534 | * If unsharing vm, must also unshare signal handlers. | ||
1535 | */ | ||
1536 | if (*flags_ptr & CLONE_VM) | ||
1537 | *flags_ptr |= CLONE_SIGHAND; | ||
1538 | |||
1539 | /* | ||
1540 | * If unsharing namespace, must also unshare filesystem information. | ||
1541 | */ | 1599 | */ |
1542 | if (*flags_ptr & CLONE_NEWNS) | 1600 | if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { |
1543 | *flags_ptr |= CLONE_FS; | 1601 | /* FIXME: get_task_mm() increments ->mm_users */ |
1544 | } | 1602 | if (atomic_read(¤t->mm->mm_users) > 1) |
1545 | 1603 | return -EINVAL; | |
1546 | /* | 1604 | } |
1547 | * Unsharing of tasks created with CLONE_THREAD is not supported yet | ||
1548 | */ | ||
1549 | static int unshare_thread(unsigned long unshare_flags) | ||
1550 | { | ||
1551 | if (unshare_flags & CLONE_THREAD) | ||
1552 | return -EINVAL; | ||
1553 | 1605 | ||
1554 | return 0; | 1606 | return 0; |
1555 | } | 1607 | } |
@@ -1576,34 +1628,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) | |||
1576 | } | 1628 | } |
1577 | 1629 | ||
1578 | /* | 1630 | /* |
1579 | * Unsharing of sighand is not supported yet | ||
1580 | */ | ||
1581 | static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) | ||
1582 | { | ||
1583 | struct sighand_struct *sigh = current->sighand; | ||
1584 | |||
1585 | if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) | ||
1586 | return -EINVAL; | ||
1587 | else | ||
1588 | return 0; | ||
1589 | } | ||
1590 | |||
1591 | /* | ||
1592 | * Unshare vm if it is being shared | ||
1593 | */ | ||
1594 | static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp) | ||
1595 | { | ||
1596 | struct mm_struct *mm = current->mm; | ||
1597 | |||
1598 | if ((unshare_flags & CLONE_VM) && | ||
1599 | (mm && atomic_read(&mm->mm_users) > 1)) { | ||
1600 | return -EINVAL; | ||
1601 | } | ||
1602 | |||
1603 | return 0; | ||
1604 | } | ||
1605 | |||
1606 | /* | ||
1607 | * Unshare file descriptor table if it is being shared | 1631 | * Unshare file descriptor table if it is being shared |
1608 | */ | 1632 | */ |
1609 | static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) | 1633 | static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) |
@@ -1631,45 +1655,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp | |||
1631 | */ | 1655 | */ |
1632 | SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | 1656 | SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) |
1633 | { | 1657 | { |
1634 | int err = 0; | ||
1635 | struct fs_struct *fs, *new_fs = NULL; | 1658 | struct fs_struct *fs, *new_fs = NULL; |
1636 | struct sighand_struct *new_sigh = NULL; | ||
1637 | struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; | ||
1638 | struct files_struct *fd, *new_fd = NULL; | 1659 | struct files_struct *fd, *new_fd = NULL; |
1639 | struct nsproxy *new_nsproxy = NULL; | 1660 | struct nsproxy *new_nsproxy = NULL; |
1640 | int do_sysvsem = 0; | 1661 | int do_sysvsem = 0; |
1662 | int err; | ||
1641 | 1663 | ||
1642 | check_unshare_flags(&unshare_flags); | 1664 | err = check_unshare_flags(unshare_flags); |
1643 | 1665 | if (err) | |
1644 | /* Return -EINVAL for all unsupported flags */ | ||
1645 | err = -EINVAL; | ||
1646 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| | ||
1647 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| | ||
1648 | CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) | ||
1649 | goto bad_unshare_out; | 1666 | goto bad_unshare_out; |
1650 | 1667 | ||
1651 | /* | 1668 | /* |
1669 | * If unsharing namespace, must also unshare filesystem information. | ||
1670 | */ | ||
1671 | if (unshare_flags & CLONE_NEWNS) | ||
1672 | unshare_flags |= CLONE_FS; | ||
1673 | /* | ||
1652 | * CLONE_NEWIPC must also detach from the undolist: after switching | 1674 | * CLONE_NEWIPC must also detach from the undolist: after switching |
1653 | * to a new ipc namespace, the semaphore arrays from the old | 1675 | * to a new ipc namespace, the semaphore arrays from the old |
1654 | * namespace are unreachable. | 1676 | * namespace are unreachable. |
1655 | */ | 1677 | */ |
1656 | if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) | 1678 | if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) |
1657 | do_sysvsem = 1; | 1679 | do_sysvsem = 1; |
1658 | if ((err = unshare_thread(unshare_flags))) | ||
1659 | goto bad_unshare_out; | ||
1660 | if ((err = unshare_fs(unshare_flags, &new_fs))) | 1680 | if ((err = unshare_fs(unshare_flags, &new_fs))) |
1661 | goto bad_unshare_cleanup_thread; | 1681 | goto bad_unshare_out; |
1662 | if ((err = unshare_sighand(unshare_flags, &new_sigh))) | ||
1663 | goto bad_unshare_cleanup_fs; | ||
1664 | if ((err = unshare_vm(unshare_flags, &new_mm))) | ||
1665 | goto bad_unshare_cleanup_sigh; | ||
1666 | if ((err = unshare_fd(unshare_flags, &new_fd))) | 1682 | if ((err = unshare_fd(unshare_flags, &new_fd))) |
1667 | goto bad_unshare_cleanup_vm; | 1683 | goto bad_unshare_cleanup_fs; |
1668 | if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, | 1684 | if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, |
1669 | new_fs))) | 1685 | new_fs))) |
1670 | goto bad_unshare_cleanup_fd; | 1686 | goto bad_unshare_cleanup_fd; |
1671 | 1687 | ||
1672 | if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { | 1688 | if (new_fs || new_fd || do_sysvsem || new_nsproxy) { |
1673 | if (do_sysvsem) { | 1689 | if (do_sysvsem) { |
1674 | /* | 1690 | /* |
1675 | * CLONE_SYSVSEM is equivalent to sys_exit(). | 1691 | * CLONE_SYSVSEM is equivalent to sys_exit(). |
@@ -1695,15 +1711,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
1695 | spin_unlock(&fs->lock); | 1711 | spin_unlock(&fs->lock); |
1696 | } | 1712 | } |
1697 | 1713 | ||
1698 | if (new_mm) { | ||
1699 | mm = current->mm; | ||
1700 | active_mm = current->active_mm; | ||
1701 | current->mm = new_mm; | ||
1702 | current->active_mm = new_mm; | ||
1703 | activate_mm(active_mm, new_mm); | ||
1704 | new_mm = mm; | ||
1705 | } | ||
1706 | |||
1707 | if (new_fd) { | 1714 | if (new_fd) { |
1708 | fd = current->files; | 1715 | fd = current->files; |
1709 | current->files = new_fd; | 1716 | current->files = new_fd; |
@@ -1720,20 +1727,10 @@ bad_unshare_cleanup_fd: | |||
1720 | if (new_fd) | 1727 | if (new_fd) |
1721 | put_files_struct(new_fd); | 1728 | put_files_struct(new_fd); |
1722 | 1729 | ||
1723 | bad_unshare_cleanup_vm: | ||
1724 | if (new_mm) | ||
1725 | mmput(new_mm); | ||
1726 | |||
1727 | bad_unshare_cleanup_sigh: | ||
1728 | if (new_sigh) | ||
1729 | if (atomic_dec_and_test(&new_sigh->count)) | ||
1730 | kmem_cache_free(sighand_cachep, new_sigh); | ||
1731 | |||
1732 | bad_unshare_cleanup_fs: | 1730 | bad_unshare_cleanup_fs: |
1733 | if (new_fs) | 1731 | if (new_fs) |
1734 | free_fs_struct(new_fs); | 1732 | free_fs_struct(new_fs); |
1735 | 1733 | ||
1736 | bad_unshare_cleanup_thread: | ||
1737 | bad_unshare_out: | 1734 | bad_unshare_out: |
1738 | return err; | 1735 | return err; |
1739 | } | 1736 | } |