aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPavel Emelyanov <xemul@openvz.org>2007-10-19 02:40:10 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-10-19 14:53:39 -0400
commit30e49c263e36341b60b735cbef5ca37912549264 (patch)
tree103e74c41db97476ae38cdd4ffc18e4da03f28e8
parentb461cc03828c743aed6b3855b9ab0d39a9d54ec5 (diff)
pid namespaces: allow cloning of new namespace
When clone() is invoked with CLONE_NEWPID, create a new pid namespace and then create a new struct pid for the new process. Allocate pid_t's for the new process in the new pid namespace and all ancestor pid namespaces. Make the newly cloned process the session and process group leader. Since the active pid namespace is special and expected to be the first entry in pid->upid_list, preserve the order of pid namespaces. The size of 'struct pid' is dependent on the the number of pid namespaces the process exists in, so we use multiple pid-caches'. Only one pid cache is created during system startup and this used by processes that exist only in init_pid_ns. When a process clones its pid namespace, we create additional pid caches as necessary and use the pid cache to allocate 'struct pids' for that depth. Note, that with this patch the newly created namespace won't work, since the rest of the kernel still uses global pids, but this is to be fixed soon. Init pid namespace still works. [oleg@tv-sign.ru: merge fix] Signed-off-by: Pavel Emelyanov <xemul@openvz.org> Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com> Cc: Paul Menage <menage@google.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Oleg Nesterov <oleg@tv-sign.ru> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/sched.h1
-rw-r--r--kernel/fork.c43
-rw-r--r--kernel/nsproxy.c3
-rw-r--r--kernel/pid.c88
4 files changed, 113 insertions, 22 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b0bf326143a9..1301c0875370 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -25,6 +25,7 @@
25#define CLONE_NEWUTS 0x04000000 /* New utsname group? */ 25#define CLONE_NEWUTS 0x04000000 /* New utsname group? */
26#define CLONE_NEWIPC 0x08000000 /* New ipcs */ 26#define CLONE_NEWIPC 0x08000000 /* New ipcs */
27#define CLONE_NEWUSER 0x10000000 /* New user namespace */ 27#define CLONE_NEWUSER 0x10000000 /* New user namespace */
28#define CLONE_NEWPID 0x20000000 /* New pid namespace */
28#define CLONE_NEWNET 0x40000000 /* New network namespace */ 29#define CLONE_NEWNET 0x40000000 /* New network namespace */
29 30
30/* 31/*
diff --git a/kernel/fork.c b/kernel/fork.c
index bab34192799b..f252784f9330 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -973,7 +973,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
973 unsigned long stack_start, 973 unsigned long stack_start,
974 struct pt_regs *regs, 974 struct pt_regs *regs,
975 unsigned long stack_size, 975 unsigned long stack_size,
976 int __user *parent_tidptr,
977 int __user *child_tidptr, 976 int __user *child_tidptr,
978 struct pid *pid) 977 struct pid *pid)
979{ 978{
@@ -1043,11 +1042,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1043 p->did_exec = 0; 1042 p->did_exec = 0;
1044 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ 1043 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
1045 copy_flags(clone_flags, p); 1044 copy_flags(clone_flags, p);
1046 retval = -EFAULT;
1047 if (clone_flags & CLONE_PARENT_SETTID)
1048 if (put_user(p->pid, parent_tidptr))
1049 goto bad_fork_cleanup_delays_binfmt;
1050
1051 INIT_LIST_HEAD(&p->children); 1045 INIT_LIST_HEAD(&p->children);
1052 INIT_LIST_HEAD(&p->sibling); 1046 INIT_LIST_HEAD(&p->sibling);
1053 p->vfork_done = NULL; 1047 p->vfork_done = NULL;
@@ -1289,11 +1283,22 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1289 __ptrace_link(p, current->parent); 1283 __ptrace_link(p, current->parent);
1290 1284
1291 if (thread_group_leader(p)) { 1285 if (thread_group_leader(p)) {
1292 p->signal->tty = current->signal->tty; 1286 if (clone_flags & CLONE_NEWPID) {
1293 p->signal->pgrp = task_pgrp_nr(current); 1287 p->nsproxy->pid_ns->child_reaper = p;
1294 set_task_session(p, task_session_nr(current)); 1288 p->signal->tty = NULL;
1295 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1289 p->signal->pgrp = p->pid;
1296 attach_pid(p, PIDTYPE_SID, task_session(current)); 1290 set_task_session(p, p->pid);
1291 attach_pid(p, PIDTYPE_PGID, pid);
1292 attach_pid(p, PIDTYPE_SID, pid);
1293 } else {
1294 p->signal->tty = current->signal->tty;
1295 p->signal->pgrp = task_pgrp_nr(current);
1296 set_task_session(p, task_session_nr(current));
1297 attach_pid(p, PIDTYPE_PGID,
1298 task_pgrp(current));
1299 attach_pid(p, PIDTYPE_SID,
1300 task_session(current));
1301 }
1297 1302
1298 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1303 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1299 __get_cpu_var(process_counts)++; 1304 __get_cpu_var(process_counts)++;
@@ -1339,7 +1344,6 @@ bad_fork_cleanup_policy:
1339bad_fork_cleanup_cgroup: 1344bad_fork_cleanup_cgroup:
1340#endif 1345#endif
1341 cgroup_exit(p, cgroup_callbacks_done); 1346 cgroup_exit(p, cgroup_callbacks_done);
1342bad_fork_cleanup_delays_binfmt:
1343 delayacct_tsk_free(p); 1347 delayacct_tsk_free(p);
1344 if (p->binfmt) 1348 if (p->binfmt)
1345 module_put(p->binfmt->module); 1349 module_put(p->binfmt->module);
@@ -1366,7 +1370,7 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1366 struct task_struct *task; 1370 struct task_struct *task;
1367 struct pt_regs regs; 1371 struct pt_regs regs;
1368 1372
1369 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 1373 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1370 &init_struct_pid); 1374 &init_struct_pid);
1371 if (!IS_ERR(task)) 1375 if (!IS_ERR(task))
1372 init_idle(task, cpu); 1376 init_idle(task, cpu);
@@ -1414,7 +1418,7 @@ long do_fork(unsigned long clone_flags,
1414 } 1418 }
1415 1419
1416 p = copy_process(clone_flags, stack_start, regs, stack_size, 1420 p = copy_process(clone_flags, stack_start, regs, stack_size,
1417 parent_tidptr, child_tidptr, NULL); 1421 child_tidptr, NULL);
1418 /* 1422 /*
1419 * Do this prior waking up the new thread - the thread pointer 1423 * Do this prior waking up the new thread - the thread pointer
1420 * might get invalid after that point, if the thread exits quickly. 1424 * might get invalid after that point, if the thread exits quickly.
@@ -1422,7 +1426,16 @@ long do_fork(unsigned long clone_flags,
1422 if (!IS_ERR(p)) { 1426 if (!IS_ERR(p)) {
1423 struct completion vfork; 1427 struct completion vfork;
1424 1428
1425 nr = pid_nr(task_pid(p)); 1429 /*
1430 * this is enough to call pid_nr_ns here, but this if
1431 * improves optimisation of regular fork()
1432 */
1433 nr = (clone_flags & CLONE_NEWPID) ?
1434 task_pid_nr_ns(p, current->nsproxy->pid_ns) :
1435 task_pid_vnr(p);
1436
1437 if (clone_flags & CLONE_PARENT_SETTID)
1438 put_user(nr, parent_tidptr);
1426 1439
1427 if (clone_flags & CLONE_VFORK) { 1440 if (clone_flags & CLONE_VFORK) {
1428 p->vfork_done = &vfork; 1441 p->vfork_done = &vfork;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index c8ef7c2992ed..79f871bc0ef4 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -129,7 +129,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
129 129
130 get_nsproxy(old_ns); 130 get_nsproxy(old_ns);
131 131
132 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWNET))) 132 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
133 CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET)))
133 return 0; 134 return 0;
134 135
135 if (!capable(CAP_SYS_ADMIN)) { 136 if (!capable(CAP_SYS_ADMIN)) {
diff --git a/kernel/pid.c b/kernel/pid.c
index 4b17acdb862f..f76097c60475 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -18,6 +18,12 @@
18 * allocation scenario when all but one out of 1 million PIDs possible are 18 * allocation scenario when all but one out of 1 million PIDs possible are
19 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE 19 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE
20 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). 20 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
21 *
22 * Pid namespaces:
23 * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
24 * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
25 * Many thanks to Oleg Nesterov for comments and help
26 *
21 */ 27 */
22 28
23#include <linux/mm.h> 29#include <linux/mm.h>
@@ -456,8 +462,8 @@ static struct kmem_cache *create_pid_cachep(int nr_ids)
456 462
457 snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids); 463 snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
458 cachep = kmem_cache_create(pcache->name, 464 cachep = kmem_cache_create(pcache->name,
459 /* FIXME add numerical ids here */ 465 sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
460 sizeof(struct pid), 0, SLAB_HWCACHE_ALIGN, NULL); 466 0, SLAB_HWCACHE_ALIGN, NULL);
461 if (cachep == NULL) 467 if (cachep == NULL)
462 goto err_cachep; 468 goto err_cachep;
463 469
@@ -475,19 +481,89 @@ err_alloc:
475 return NULL; 481 return NULL;
476} 482}
477 483
484static struct pid_namespace *create_pid_namespace(int level)
485{
486 struct pid_namespace *ns;
487 int i;
488
489 ns = kmalloc(sizeof(struct pid_namespace), GFP_KERNEL);
490 if (ns == NULL)
491 goto out;
492
493 ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
494 if (!ns->pidmap[0].page)
495 goto out_free;
496
497 ns->pid_cachep = create_pid_cachep(level + 1);
498 if (ns->pid_cachep == NULL)
499 goto out_free_map;
500
501 kref_init(&ns->kref);
502 ns->last_pid = 0;
503 ns->child_reaper = NULL;
504 ns->level = level;
505
506 set_bit(0, ns->pidmap[0].page);
507 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
508
509 for (i = 1; i < PIDMAP_ENTRIES; i++) {
510 ns->pidmap[i].page = 0;
511 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
512 }
513
514 return ns;
515
516out_free_map:
517 kfree(ns->pidmap[0].page);
518out_free:
519 kfree(ns);
520out:
521 return ERR_PTR(-ENOMEM);
522}
523
524static void destroy_pid_namespace(struct pid_namespace *ns)
525{
526 int i;
527
528 for (i = 0; i < PIDMAP_ENTRIES; i++)
529 kfree(ns->pidmap[i].page);
530 kfree(ns);
531}
532
478struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) 533struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
479{ 534{
535 struct pid_namespace *new_ns;
536
480 BUG_ON(!old_ns); 537 BUG_ON(!old_ns);
481 get_pid_ns(old_ns); 538 new_ns = get_pid_ns(old_ns);
482 return old_ns; 539 if (!(flags & CLONE_NEWPID))
540 goto out;
541
542 new_ns = ERR_PTR(-EINVAL);
543 if (flags & CLONE_THREAD)
544 goto out_put;
545
546 new_ns = create_pid_namespace(old_ns->level + 1);
547 if (!IS_ERR(new_ns))
548 new_ns->parent = get_pid_ns(old_ns);
549
550out_put:
551 put_pid_ns(old_ns);
552out:
553 return new_ns;
483} 554}
484 555
485void free_pid_ns(struct kref *kref) 556void free_pid_ns(struct kref *kref)
486{ 557{
487 struct pid_namespace *ns; 558 struct pid_namespace *ns, *parent;
488 559
489 ns = container_of(kref, struct pid_namespace, kref); 560 ns = container_of(kref, struct pid_namespace, kref);
490 kfree(ns); 561
562 parent = ns->parent;
563 destroy_pid_namespace(ns);
564
565 if (parent != NULL)
566 put_pid_ns(parent);
491} 567}
492 568
493/* 569/*