aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/fork.c
diff options
context:
space:
mode:
author: Eric W. Biederman <ebiederm@xmission.com> 2010-03-02 18:41:50 -0500
committer: Eric W. Biederman <ebiederm@xmission.com> 2012-11-19 08:59:16 -0500
commit: 50804fe3737ca6a5942fdc2057a18a8141d00141 (patch)
tree: ae85d7ba1f24111f225f794e3310c39319d5a412 /kernel/fork.c
parent: 1c4042c29bd2e85aac4110552ca8ade763762e84 (diff)
pidns: Support unsharing the pid namespace.
Unsharing of the pid namespace, unlike unsharing of other namespaces, does not take effect immediately. Instead it affects the children created with fork and clone. The first of these children becomes the init process of the new pid namespace; the rest become oddball children of pid 0. From the point of view of the new pid namespace, the process that created it is pid 0, as its pid does not map.

A couple of different semantics were considered, but this one was settled on because it is easy to implement and it is usable from pam modules, which are the core reason for the existence of unshare.

I took a survey of the callers of pam modules and the following appears to be a representative sample of their logic:

{
	setup stuff include pam
	child = fork();
	if (!child) {
		setuid()
		exec /bin/bash
	}
	waitpid(child);

	pam and other cleanup
}

As you can see, there is a fork to create the unprivileged user space process, which means that the unprivileged user space process will appear as pid 1 in the new pid namespace. Further, most login processes do not cope with extraneous children, which means that shifting the duty of reaping extraneous child processes to the creator of those extraneous children makes the system more comprehensible.

The practical reason for this set of pid namespace semantics is that it is simple to implement and to verify that it works correctly, whereas an implementation that requires changing the struct pid on a process comes with a lot more races and pain, not the least of which is that glibc caches getpid().

These semantics are implemented by having two notions of the pid namespace of a process. There is task_active_pid_ns, which is the pid namespace the process was created with and the pid namespace that all pids are presented to that process in. The task_active_pid_ns is stored in the struct pid of the task.

Then there is the pid namespace that will be used for children; that pid namespace is stored in task->nsproxy->pid_ns.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Diffstat (limited to 'kernel/fork.c')
-rw-r--r--kernel/fork.c32
1 files changed, 25 insertions, 7 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index 0f2bbce311fc..811ffbad7889 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1565,9 +1565,11 @@ long do_fork(unsigned long clone_flags,
1565 * Do some preliminary argument and permissions checking before we 1565 * Do some preliminary argument and permissions checking before we
1566 * actually start allocating stuff 1566 * actually start allocating stuff
1567 */ 1567 */
1568 if (clone_flags & CLONE_NEWUSER) { 1568 if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
1569 if (clone_flags & CLONE_THREAD) 1569 if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
1570 return -EINVAL; 1570 return -EINVAL;
1571 }
1572 if (clone_flags & CLONE_NEWUSER) {
1571 /* hopefully this check will go away when userns support is 1573 /* hopefully this check will go away when userns support is
1572 * complete 1574 * complete
1573 */ 1575 */
@@ -1692,7 +1694,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
1692{ 1694{
1693 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1695 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1694 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 1696 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1695 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) 1697 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
1698 CLONE_NEWPID))
1696 return -EINVAL; 1699 return -EINVAL;
1697 /* 1700 /*
1698 * Not implemented, but pretend it works if there is nothing to 1701 * Not implemented, but pretend it works if there is nothing to
@@ -1763,15 +1766,30 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1763 int do_sysvsem = 0; 1766 int do_sysvsem = 0;
1764 int err; 1767 int err;
1765 1768
1766 err = check_unshare_flags(unshare_flags); 1769 /*
1767 if (err) 1770 * If unsharing a pid namespace must also unshare the thread.
1768 goto bad_unshare_out; 1771 */
1769 1772 if (unshare_flags & CLONE_NEWPID)
1773 unshare_flags |= CLONE_THREAD;
1774 /*
1775 * If unsharing a thread from a thread group, must also unshare vm.
1776 */
1777 if (unshare_flags & CLONE_THREAD)
1778 unshare_flags |= CLONE_VM;
1779 /*
1780 * If unsharing vm, must also unshare signal handlers.
1781 */
1782 if (unshare_flags & CLONE_VM)
1783 unshare_flags |= CLONE_SIGHAND;
1770 /* 1784 /*
1771 * If unsharing namespace, must also unshare filesystem information. 1785 * If unsharing namespace, must also unshare filesystem information.
1772 */ 1786 */
1773 if (unshare_flags & CLONE_NEWNS) 1787 if (unshare_flags & CLONE_NEWNS)
1774 unshare_flags |= CLONE_FS; 1788 unshare_flags |= CLONE_FS;
1789
1790 err = check_unshare_flags(unshare_flags);
1791 if (err)
1792 goto bad_unshare_out;
1775 /* 1793 /*
1776 * CLONE_NEWIPC must also detach from the undolist: after switching 1794 * CLONE_NEWIPC must also detach from the undolist: after switching
1777 * to a new ipc namespace, the semaphore arrays from the old 1795 * to a new ipc namespace, the semaphore arrays from the old