author		Andrea Bastoni <bastoni@cs.unc.edu>	2011-08-27 09:43:54 -0400
committer	Andrea Bastoni <bastoni@cs.unc.edu>	2011-08-27 10:06:11 -0400
commit		7b1bb388bc879ffcc6c69b567816d5c354afe42b (patch)
tree		5a217fdfb0b5e5a327bdcd624506337c1ae1fe32 /kernel/fork.c
parent		7d754596756240fa918b94cd0c3011c77a638987 (diff)
parent		02f8c6aee8df3cdc935e9bdd4f2d020306035dbe (diff)
Merge 'Linux v3.0' into Litmus
Some notes:

  * Litmus^RT scheduling class is the topmost scheduling class
    (above stop_sched_class).
  * scheduler_ipi() function (e.g., in smp_reschedule_interrupt())
    may increase IPI latencies.
  * Added path into schedule() to quickly re-evaluate scheduling
    decision without becoming preemptive again. This used to be a
    standard path before the removal of BKL.

Conflicts:
	Makefile
	arch/arm/kernel/calls.S
	arch/arm/kernel/smp.c
	arch/x86/include/asm/unistd_32.h
	arch/x86/kernel/smp.c
	arch/x86/kernel/syscall_table_32.S
	include/linux/hrtimer.h
	kernel/printk.c
	kernel/sched.c
	kernel/sched_fair.c
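The first note above (placement of the Litmus^RT scheduling class) can be pictured with a small user-space model. This is only an illustrative sketch, not code from this merge: it assumes the Linux 3.0-era convention that scheduling classes form a priority-ordered chain through their ->next pointers, and the struct and class contents here are simplified stand-ins.

	#include <stdio.h>

	struct sched_class {
		const char *name;
		const struct sched_class *next;	/* next lower-priority class */
	};

	static const struct sched_class idle_sched_class   = { "idle",   NULL };
	static const struct sched_class fair_sched_class   = { "fair",   &idle_sched_class };
	static const struct sched_class rt_sched_class     = { "rt",     &fair_sched_class };
	static const struct sched_class stop_sched_class   = { "stop",   &rt_sched_class };
	/* per the merge notes, the Litmus^RT class sits above stop_sched_class */
	static const struct sched_class litmus_sched_class = { "litmus", &stop_sched_class };

	int main(void)
	{
		const struct sched_class *c;

		/* pick_next_task() walks the classes from highest to lowest priority */
		for (c = &litmus_sched_class; c; c = c->next)
			printf("%s\n", c->name);
		return 0;
	}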
Diffstat (limited to 'kernel/fork.c')
-rw-r--r--	kernel/fork.c	305
1 files changed, 151 insertions, 154 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index ab7f29d906c7..25c6111fe3a6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
 #include <linux/tracehook.h>
 #include <linux/futex.h>
 #include <linux/compat.h>
+#include <linux/kthread.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/rcupdate.h>
 #include <linux/ptrace.h>
@@ -58,13 +59,14 @@
 #include <linux/taskstats_kern.h>
 #include <linux/random.h>
 #include <linux/tty.h>
-#include <linux/proc_fs.h>
 #include <linux/blkdev.h>
 #include <linux/fs_struct.h>
 #include <linux/magic.h>
 #include <linux/perf_event.h>
 #include <linux/posix-timers.h>
 #include <linux/user-return-notifier.h>
+#include <linux/oom.h>
+#include <linux/khugepaged.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -110,20 +112,25 @@ int nr_processes(void)
 }
 
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
-# define alloc_task_struct()	kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
-# define free_task_struct(tsk)	kmem_cache_free(task_struct_cachep, (tsk))
+# define alloc_task_struct_node(node)		\
+		kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
+# define free_task_struct(tsk)			\
+		kmem_cache_free(task_struct_cachep, (tsk))
 static struct kmem_cache *task_struct_cachep;
 #endif
 
 #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
-static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
+static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+						  int node)
 {
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	gfp_t mask = GFP_KERNEL | __GFP_ZERO;
 #else
 	gfp_t mask = GFP_KERNEL;
 #endif
-	return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER);
+	struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
+
+	return page ? page_address(page) : NULL;
 }
 
 static inline void free_thread_info(struct thread_info *ti)
@@ -171,6 +178,7 @@ EXPORT_SYMBOL(free_task);
 static inline void free_signal_struct(struct signal_struct *sig)
 {
 	taskstats_tgid_free(sig);
+	sched_autogroup_exit(sig);
 	kmem_cache_free(signal_cachep, sig);
 }
 
@@ -194,6 +202,7 @@ void __put_task_struct(struct task_struct *tsk)
 	if (!profile_handoff_task(tsk))
 		free_task(tsk);
 }
+EXPORT_SYMBOL_GPL(__put_task_struct);
 
 /*
  * macro override instead of weak attribute alias, to workaround
@@ -249,16 +258,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	struct task_struct *tsk;
 	struct thread_info *ti;
 	unsigned long *stackend;
-
+	int node = tsk_fork_get_node(orig);
 	int err;
 
 	prepare_to_copy(orig);
 
-	tsk = alloc_task_struct();
+	tsk = alloc_task_struct_node(node);
 	if (!tsk)
 		return NULL;
 
-	ti = alloc_thread_info(tsk);
+	ti = alloc_thread_info_node(tsk, node);
 	if (!ti) {
 		free_task_struct(tsk);
 		return NULL;
@@ -279,6 +288,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 
 	setup_thread_stack(tsk, orig);
 	clear_user_return_notifier(tsk);
+	clear_tsk_need_resched(tsk);
 	stackend = end_of_stack(tsk);
 	*stackend = STACK_END_MAGIC;	/* for overflow detection */
 
@@ -334,6 +344,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 	retval = ksm_fork(mm, oldmm);
 	if (retval)
 		goto out;
+	retval = khugepaged_fork(mm, oldmm);
+	if (retval)
+		goto out;
 
 	prev = NULL;
 	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@ -376,15 +389,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 			get_file(file);
 			if (tmp->vm_flags & VM_DENYWRITE)
 				atomic_dec(&inode->i_writecount);
-			spin_lock(&mapping->i_mmap_lock);
+			mutex_lock(&mapping->i_mmap_mutex);
 			if (tmp->vm_flags & VM_SHARED)
 				mapping->i_mmap_writable++;
-			tmp->vm_truncate_count = mpnt->vm_truncate_count;
 			flush_dcache_mmap_lock(mapping);
 			/* insert tmp into the share list, just after mpnt */
 			vma_prio_tree_add(tmp, mpnt);
 			flush_dcache_mmap_unlock(mapping);
-			spin_unlock(&mapping->i_mmap_lock);
+			mutex_unlock(&mapping->i_mmap_mutex);
 		}
 
 		/*
@@ -495,6 +507,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 	mm->cached_hole_size = ~0UL;
 	mm_init_aio(mm);
 	mm_init_owner(mm, p);
+	atomic_set(&mm->oom_disable_count, 0);
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
@@ -514,11 +527,12 @@ struct mm_struct * mm_alloc(void)
 	struct mm_struct * mm;
 
 	mm = allocate_mm();
-	if (mm) {
-		memset(mm, 0, sizeof(*mm));
-		mm = mm_init(mm, current);
-	}
-	return mm;
+	if (!mm)
+		return NULL;
+
+	memset(mm, 0, sizeof(*mm));
+	mm_init_cpumask(mm);
+	return mm_init(mm, current);
 }
 
 /*
@@ -532,6 +546,9 @@ void __mmdrop(struct mm_struct *mm)
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_mm_destroy(mm);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	VM_BUG_ON(mm->pmd_huge_pte);
+#endif
 	free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -546,6 +563,7 @@ void mmput(struct mm_struct *mm)
 	if (atomic_dec_and_test(&mm->mm_users)) {
 		exit_aio(mm);
 		ksm_exit(mm);
+		khugepaged_exit(mm); /* must run before exit_mmap */
 		exit_mmap(mm);
 		set_mm_exe_file(mm, NULL);
 		if (!list_empty(&mm->mmlist)) {
@@ -561,6 +579,57 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
+/*
+ * We added or removed a vma mapping the executable. The vmas are only mapped
+ * during exec and are not mapped with the mmap system call.
+ * Callers must hold down_write() on the mm's mmap_sem for these
+ */
+void added_exe_file_vma(struct mm_struct *mm)
+{
+	mm->num_exe_file_vmas++;
+}
+
+void removed_exe_file_vma(struct mm_struct *mm)
+{
+	mm->num_exe_file_vmas--;
+	if ((mm->num_exe_file_vmas == 0) && mm->exe_file){
+		fput(mm->exe_file);
+		mm->exe_file = NULL;
+	}
+
+}
+
+void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+{
+	if (new_exe_file)
+		get_file(new_exe_file);
+	if (mm->exe_file)
+		fput(mm->exe_file);
+	mm->exe_file = new_exe_file;
+	mm->num_exe_file_vmas = 0;
+}
+
+struct file *get_mm_exe_file(struct mm_struct *mm)
+{
+	struct file *exe_file;
+
+	/* We need mmap_sem to protect against races with removal of
+	 * VM_EXECUTABLE vmas */
+	down_read(&mm->mmap_sem);
+	exe_file = mm->exe_file;
+	if (exe_file)
+		get_file(exe_file);
+	up_read(&mm->mmap_sem);
+	return exe_file;
+}
+
+static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
+{
+	/* It's safe to write the exe_file pointer without exe_file_lock because
+	 * this is called during fork when the task is not yet in /proc */
+	newmm->exe_file = get_mm_exe_file(oldmm);
+}
+
 /**
  * get_task_mm - acquire a reference to the task's mm
  *
@@ -667,11 +736,16 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 		goto fail_nomem;
 
 	memcpy(mm, oldmm, sizeof(*mm));
+	mm_init_cpumask(mm);
 
 	/* Initializing for Swap token stuff */
 	mm->token_priority = 0;
 	mm->last_interval = 0;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	mm->pmd_huge_pte = NULL;
+#endif
+
 	if (!mm_init(mm, tsk))
 		goto fail_nomem;
 
@@ -748,6 +822,8 @@ good_mm:
 	/* Initializing for Swap token stuff */
 	mm->token_priority = 0;
 	mm->last_interval = 0;
+	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+		atomic_inc(&mm->oom_disable_count);
 
 	tsk->mm = mm;
 	tsk->active_mm = mm;
@@ -907,9 +983,17 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	posix_cpu_timers_init_group(sig);
 
 	tty_audit_fork(sig);
+	sched_autogroup_fork(sig);
+
+#ifdef CONFIG_CGROUPS
+	init_rwsem(&sig->threadgroup_fork_lock);
+#endif
 
 	sig->oom_adj = current->signal->oom_adj;
 	sig->oom_score_adj = current->signal->oom_score_adj;
+	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
+
+	mutex_init(&sig->cred_guard_mutex);
 
 	return 0;
 }
@@ -1081,12 +1165,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	posix_cpu_timers_init(p);
 
-	p->lock_depth = -1;		/* -1 = no lock */
 	do_posix_clock_monotonic_gettime(&p->start_time);
 	p->real_start_time = p->start_time;
 	monotonic_to_bootbased(&p->real_start_time);
 	p->io_context = NULL;
 	p->audit_context = NULL;
+	if (clone_flags & CLONE_THREAD)
+		threadgroup_fork_read_lock(current);
 	cgroup_fork(p);
 #ifdef CONFIG_NUMA
 	p->mempolicy = mpol_dup(p->mempolicy);
@@ -1131,7 +1216,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #endif
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
-	sched_fork(p, clone_flags);
+	sched_fork(p);
 
 	retval = perf_event_init_task(p);
 	if (retval)
@@ -1165,12 +1250,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		pid = alloc_pid(p->nsproxy->pid_ns);
 		if (!pid)
 			goto bad_fork_cleanup_io;
-
-		if (clone_flags & CLONE_NEWPID) {
-			retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
-			if (retval < 0)
-				goto bad_fork_free_pid;
-		}
 	}
 
 	p->pid = pid_nr(pid);
@@ -1178,17 +1257,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (clone_flags & CLONE_THREAD)
 		p->tgid = current->tgid;
 
-	if (current->nsproxy != p->nsproxy) {
-		retval = ns_cgroup_clone(p, pid);
-		if (retval)
-			goto bad_fork_free_pid;
-	}
-
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
 	/*
 	 * Clear TID on mm_release()?
 	 */
 	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
+#ifdef CONFIG_BLOCK
+	p->plug = NULL;
+#endif
 #ifdef CONFIG_FUTEX
 	p->robust_list = NULL;
 #ifdef CONFIG_COMPAT
@@ -1274,7 +1350,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	tracehook_finish_clone(p, clone_flags, trace);
 
 	if (thread_group_leader(p)) {
-		if (clone_flags & CLONE_NEWPID)
+		if (is_child_reaper(pid))
 			p->nsproxy->pid_ns->child_reaper = p;
 
 		p->signal->leader_pid = pid;
@@ -1283,7 +1359,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		attach_pid(p, PIDTYPE_SID, task_session(current));
 		list_add_tail(&p->sibling, &p->real_parent->children);
 		list_add_tail_rcu(&p->tasks, &init_task.tasks);
-		__get_cpu_var(process_counts)++;
+		__this_cpu_inc(process_counts);
 	}
 	attach_pid(p, PIDTYPE_PID, pid);
 	nr_threads++;
@@ -1294,6 +1370,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	write_unlock_irq(&tasklist_lock);
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
+	if (clone_flags & CLONE_THREAD)
+		threadgroup_fork_read_unlock(current);
 	perf_event_fork(p);
 	return p;
 
@@ -1306,8 +1384,13 @@ bad_fork_cleanup_io:
 bad_fork_cleanup_namespaces:
 	exit_task_namespaces(p);
 bad_fork_cleanup_mm:
-	if (p->mm)
+	if (p->mm) {
+		task_lock(p);
+		if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+			atomic_dec(&p->mm->oom_disable_count);
+		task_unlock(p);
 		mmput(p->mm);
+	}
 bad_fork_cleanup_signal:
 	if (!(clone_flags & CLONE_THREAD))
 		free_signal_struct(p->signal);
@@ -1327,6 +1410,8 @@ bad_fork_cleanup_policy:
 	mpol_put(p->mempolicy);
 bad_fork_cleanup_cgroup:
 #endif
+	if (clone_flags & CLONE_THREAD)
+		threadgroup_fork_read_unlock(current);
 	cgroup_exit(p, cgroup_callbacks_done);
 	delayacct_tsk_free(p);
 	module_put(task_thread_info(p)->exec_domain->module);
@@ -1403,23 +1488,6 @@ long do_fork(unsigned long clone_flags,
 	}
 
 	/*
-	 * We hope to recycle these flags after 2.6.26
-	 */
-	if (unlikely(clone_flags & CLONE_STOPPED)) {
-		static int __read_mostly count = 100;
-
-		if (count > 0 && printk_ratelimit()) {
-			char comm[TASK_COMM_LEN];
-
-			count--;
-			printk(KERN_INFO "fork(): process `%s' used deprecated "
-					"clone flags 0x%lx\n",
-				get_task_comm(comm, current),
-				clone_flags & CLONE_STOPPED);
-		}
-	}
-
-	/*
 	 * When called from kernel_thread, don't do user tracing stuff.
 	 */
 	if (likely(user_mode(regs)))
@@ -1457,16 +1525,7 @@ long do_fork(unsigned long clone_flags,
 		 */
 		p->flags &= ~PF_STARTING;
 
-		if (unlikely(clone_flags & CLONE_STOPPED)) {
-			/*
-			 * We'll start up with an immediate SIGSTOP.
-			 */
-			sigaddset(&p->pending.signal, SIGSTOP);
-			set_tsk_thread_flag(p, TIF_SIGPENDING);
-			__set_task_state(p, TASK_STOPPED);
-		} else {
-			wake_up_new_task(p, clone_flags);
-		}
+		wake_up_new_task(p);
 
 		tracehook_report_clone_complete(trace, regs,
 						clone_flags, nr, p);
@@ -1510,6 +1569,13 @@ void __init proc_caches_init(void)
 	fs_cachep = kmem_cache_create("fs_cache",
 			sizeof(struct fs_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+	/*
+	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
+	 * whole struct cpumask for the OFFSTACK case. We could change
+	 * this to *only* allocate as much of it as required by the
+	 * maximum number of CPU's we can ever have. The cpumask_allocation
+	 * is at the end of the structure, exactly for that reason.
+	 */
 	mm_cachep = kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
@@ -1518,38 +1584,24 @@ void __init proc_caches_init(void)
 }
 
 /*
- * Check constraints on flags passed to the unshare system call and
- * force unsharing of additional process context as appropriate.
+ * Check constraints on flags passed to the unshare system call.
  */
-static void check_unshare_flags(unsigned long *flags_ptr)
+static int check_unshare_flags(unsigned long unshare_flags)
 {
+	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
+				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
+				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+		return -EINVAL;
 	/*
-	 * If unsharing a thread from a thread group, must also
-	 * unshare vm.
-	 */
-	if (*flags_ptr & CLONE_THREAD)
-		*flags_ptr |= CLONE_VM;
-
-	/*
-	 * If unsharing vm, must also unshare signal handlers.
-	 */
-	if (*flags_ptr & CLONE_VM)
-		*flags_ptr |= CLONE_SIGHAND;
-
-	/*
-	 * If unsharing namespace, must also unshare filesystem information.
+	 * Not implemented, but pretend it works if there is nothing to
+	 * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
+	 * needs to unshare vm.
 	 */
-	if (*flags_ptr & CLONE_NEWNS)
-		*flags_ptr |= CLONE_FS;
-}
-
-/*
- * Unsharing of tasks created with CLONE_THREAD is not supported yet
- */
-static int unshare_thread(unsigned long unshare_flags)
-{
-	if (unshare_flags & CLONE_THREAD)
-		return -EINVAL;
+	if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
+		/* FIXME: get_task_mm() increments ->mm_users */
+		if (atomic_read(&current->mm->mm_users) > 1)
+			return -EINVAL;
+	}
 
 	return 0;
 }
@@ -1576,34 +1628,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 }
 
 /*
- * Unsharing of sighand is not supported yet
- */
-static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
-{
-	struct sighand_struct *sigh = current->sighand;
-
-	if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
-		return -EINVAL;
-	else
-		return 0;
-}
-
-/*
- * Unshare vm if it is being shared
- */
-static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
-{
-	struct mm_struct *mm = current->mm;
-
-	if ((unshare_flags & CLONE_VM) &&
-	    (mm && atomic_read(&mm->mm_users) > 1)) {
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-/*
  * Unshare file descriptor table if it is being shared
  */
 static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
@@ -1631,45 +1655,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
  */
 SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 {
-	int err = 0;
 	struct fs_struct *fs, *new_fs = NULL;
-	struct sighand_struct *new_sigh = NULL;
-	struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
 	struct files_struct *fd, *new_fd = NULL;
 	struct nsproxy *new_nsproxy = NULL;
 	int do_sysvsem = 0;
+	int err;
 
-	check_unshare_flags(&unshare_flags);
-
-	/* Return -EINVAL for all unsupported flags */
-	err = -EINVAL;
-	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
-				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
-				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+	err = check_unshare_flags(unshare_flags);
+	if (err)
 		goto bad_unshare_out;
 
 	/*
+	 * If unsharing namespace, must also unshare filesystem information.
+	 */
+	if (unshare_flags & CLONE_NEWNS)
+		unshare_flags |= CLONE_FS;
+	/*
 	 * CLONE_NEWIPC must also detach from the undolist: after switching
 	 * to a new ipc namespace, the semaphore arrays from the old
 	 * namespace are unreachable.
 	 */
 	if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
 		do_sysvsem = 1;
-	if ((err = unshare_thread(unshare_flags)))
-		goto bad_unshare_out;
 	if ((err = unshare_fs(unshare_flags, &new_fs)))
-		goto bad_unshare_cleanup_thread;
-	if ((err = unshare_sighand(unshare_flags, &new_sigh)))
-		goto bad_unshare_cleanup_fs;
-	if ((err = unshare_vm(unshare_flags, &new_mm)))
-		goto bad_unshare_cleanup_sigh;
+		goto bad_unshare_out;
 	if ((err = unshare_fd(unshare_flags, &new_fd)))
-		goto bad_unshare_cleanup_vm;
+		goto bad_unshare_cleanup_fs;
 	if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
 				new_fs)))
 		goto bad_unshare_cleanup_fd;
 
-	if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) {
+	if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
 		if (do_sysvsem) {
 			/*
 			 * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1695,15 +1711,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 			spin_unlock(&fs->lock);
 		}
 
-		if (new_mm) {
-			mm = current->mm;
-			active_mm = current->active_mm;
-			current->mm = new_mm;
-			current->active_mm = new_mm;
-			activate_mm(active_mm, new_mm);
-			new_mm = mm;
-		}
-
 		if (new_fd) {
 			fd = current->files;
 			current->files = new_fd;
@@ -1720,20 +1727,10 @@ bad_unshare_cleanup_fd:
 	if (new_fd)
 		put_files_struct(new_fd);
 
-bad_unshare_cleanup_vm:
-	if (new_mm)
-		mmput(new_mm);
-
-bad_unshare_cleanup_sigh:
-	if (new_sigh)
-		if (atomic_dec_and_test(&new_sigh->count))
-			kmem_cache_free(sighand_cachep, new_sigh);
-
 bad_unshare_cleanup_fs:
 	if (new_fs)
 		free_fs_struct(new_fs);
 
-bad_unshare_cleanup_thread:
 bad_unshare_out:
 	return err;
 }