aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-12-17 18:44:47 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2012-12-17 18:44:47 -0500
commit6a2b60b17b3e48a418695a94bd2420f6ab32e519 (patch)
tree54b7792fa68b8890f710fa6398b6ba8626a039a8 /kernel
parent9228ff90387e276ad67b10c0eb525c9d6a57d5e9 (diff)
parent98f842e675f96ffac96e6c50315790912b2812be (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
Pull user namespace changes from Eric Biederman: "While small, this set of changes is very significant with respect to containers in general and user namespaces in particular. The user space interface is now complete. This set of changes adds support for unprivileged users to create user namespaces and as a user namespace root to create other namespaces. The tyranny of supporting suid root preventing unprivileged users from using cool new kernel features is broken. This set of changes completes the work on setns, adding support for the pid, user, and mount namespaces. This set of changes includes a bunch of basic pid namespace cleanups/simplifications. Of particular significance is the rework of the pid namespace cleanup so it no longer requires sending out tendrils into all kinds of unexpected cleanup paths for operation. At least one case of broken error handling is fixed by this cleanup. The files under /proc/<pid>/ns/ have been converted from regular files to magic symlinks which prevents incorrect caching by the VFS, ensuring the files always refer to the namespace the process is currently using and ensuring that the ptrace_may_access permission checks are always applied. The files under /proc/<pid>/ns/ have been given stable inode numbers so it is now possible to see if different processes share the same namespaces. Through David Miller's net tree are changes to relax many of the permission checks in the networking stack to allow the user namespace root to usefully use the networking stack. Similar changes for the mount namespace and the pid namespace are coming through my tree. Two small changes to add user namespace support were committed here and in David Miller's -net tree so that I could complete the work on the /proc/<pid>/ns/ files in this tree. 
Work remains to make it safe to build user namespaces and 9p, afs, ceph, cifs, coda, gfs2, ncpfs, nfs, nfsd, ocfs2, and xfs so the Kconfig guard remains in place preventing user namespaces from being built when any of those filesystems are enabled. Future design work remains to allow root users outside of the initial user namespace to mount more than just /proc and /sys." * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: (38 commits) proc: Usable inode numbers for the namespace file descriptors. proc: Fix the namespace inode permission checks. proc: Generalize proc inode allocation userns: Allow unprivilged mounts of proc and sysfs userns: For /proc/self/{uid,gid}_map derive the lower userns from the struct file procfs: Print task uids and gids in the userns that opened the proc file userns: Implement unshare of the user namespace userns: Implent proc namespace operations userns: Kill task_user_ns userns: Make create_new_namespaces take a user_ns parameter userns: Allow unprivileged use of setns. userns: Allow unprivileged users to create new namespaces userns: Allow setting a userns mapping to your current uid. userns: Allow chown and setgid preservation userns: Allow unprivileged users to create user namespaces. userns: Ignore suid and sgid on binaries if the uid or gid can not be mapped userns: fix return value on mntns_install() failure vfs: Allow unprivileged manipulation of the mount namespace. vfs: Only support slave subtrees across different user namespaces vfs: Add a user namespace reference from struct mnt_namespace ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup.c2
-rw-r--r--kernel/events/core.c2
-rw-r--r--kernel/exit.c12
-rw-r--r--kernel/fork.c69
-rw-r--r--kernel/nsproxy.c36
-rw-r--r--kernel/pid.c47
-rw-r--r--kernel/pid_namespace.c112
-rw-r--r--kernel/ptrace.c10
-rw-r--r--kernel/sched/core.c10
-rw-r--r--kernel/signal.c2
-rw-r--r--kernel/sysctl_binary.c2
-rw-r--r--kernel/user.c2
-rw-r--r--kernel/user_namespace.c147
-rw-r--r--kernel/utsname.c33
14 files changed, 371 insertions, 115 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f34c41bfaa37..9915ffe01372 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3409,7 +3409,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3409{ 3409{
3410 struct cgroup_pidlist *l; 3410 struct cgroup_pidlist *l;
3411 /* don't need task_nsproxy() if we're looking at ourself */ 3411 /* don't need task_nsproxy() if we're looking at ourself */
3412 struct pid_namespace *ns = current->nsproxy->pid_ns; 3412 struct pid_namespace *ns = task_active_pid_ns(current);
3413 3413
3414 /* 3414 /*
3415 * We can't drop the pidlist_mutex before taking the l->mutex in case 3415 * We can't drop the pidlist_mutex before taking the l->mutex in case
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f9ff5493171d..301079d06f24 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6155,7 +6155,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6155 6155
6156 event->parent = parent_event; 6156 event->parent = parent_event;
6157 6157
6158 event->ns = get_pid_ns(current->nsproxy->pid_ns); 6158 event->ns = get_pid_ns(task_active_pid_ns(current));
6159 event->id = atomic64_inc_return(&perf_event_id); 6159 event->id = atomic64_inc_return(&perf_event_id);
6160 6160
6161 event->state = PERF_EVENT_STATE_INACTIVE; 6161 event->state = PERF_EVENT_STATE_INACTIVE;
diff --git a/kernel/exit.c b/kernel/exit.c
index 50d2e93c36ea..b4df21937216 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,18 +72,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
72 list_del_rcu(&p->tasks); 72 list_del_rcu(&p->tasks);
73 list_del_init(&p->sibling); 73 list_del_init(&p->sibling);
74 __this_cpu_dec(process_counts); 74 __this_cpu_dec(process_counts);
75 /*
76 * If we are the last child process in a pid namespace to be
77 * reaped, notify the reaper sleeping zap_pid_ns_processes().
78 */
79 if (IS_ENABLED(CONFIG_PID_NS)) {
80 struct task_struct *parent = p->real_parent;
81
82 if ((task_active_pid_ns(parent)->child_reaper == parent) &&
83 list_empty(&parent->children) &&
84 (parent->flags & PF_EXITING))
85 wake_up_process(parent);
86 }
87 } 75 }
88 list_del_rcu(&p->thread_group); 76 list_del_rcu(&p->thread_group);
89} 77}
diff --git a/kernel/fork.c b/kernel/fork.c
index 115d6c2e4cca..c36c4e301efe 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1044,8 +1044,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1044 atomic_set(&sig->live, 1); 1044 atomic_set(&sig->live, 1);
1045 atomic_set(&sig->sigcnt, 1); 1045 atomic_set(&sig->sigcnt, 1);
1046 init_waitqueue_head(&sig->wait_chldexit); 1046 init_waitqueue_head(&sig->wait_chldexit);
1047 if (clone_flags & CLONE_NEWPID)
1048 sig->flags |= SIGNAL_UNKILLABLE;
1049 sig->curr_target = tsk; 1047 sig->curr_target = tsk;
1050 init_sigpending(&sig->shared_pending); 1048 init_sigpending(&sig->shared_pending);
1051 INIT_LIST_HEAD(&sig->posix_timers); 1049 INIT_LIST_HEAD(&sig->posix_timers);
@@ -1438,8 +1436,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1438 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); 1436 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
1439 1437
1440 if (thread_group_leader(p)) { 1438 if (thread_group_leader(p)) {
1441 if (is_child_reaper(pid)) 1439 if (is_child_reaper(pid)) {
1442 p->nsproxy->pid_ns->child_reaper = p; 1440 ns_of_pid(pid)->child_reaper = p;
1441 p->signal->flags |= SIGNAL_UNKILLABLE;
1442 }
1443 1443
1444 p->signal->leader_pid = pid; 1444 p->signal->leader_pid = pid;
1445 p->signal->tty = tty_kref_get(current->signal->tty); 1445 p->signal->tty = tty_kref_get(current->signal->tty);
@@ -1473,8 +1473,6 @@ bad_fork_cleanup_io:
1473 if (p->io_context) 1473 if (p->io_context)
1474 exit_io_context(p); 1474 exit_io_context(p);
1475bad_fork_cleanup_namespaces: 1475bad_fork_cleanup_namespaces:
1476 if (unlikely(clone_flags & CLONE_NEWPID))
1477 pid_ns_release_proc(p->nsproxy->pid_ns);
1478 exit_task_namespaces(p); 1476 exit_task_namespaces(p);
1479bad_fork_cleanup_mm: 1477bad_fork_cleanup_mm:
1480 if (p->mm) 1478 if (p->mm)
@@ -1554,15 +1552,9 @@ long do_fork(unsigned long clone_flags,
1554 * Do some preliminary argument and permissions checking before we 1552 * Do some preliminary argument and permissions checking before we
1555 * actually start allocating stuff 1553 * actually start allocating stuff
1556 */ 1554 */
1557 if (clone_flags & CLONE_NEWUSER) { 1555 if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
1558 if (clone_flags & CLONE_THREAD) 1556 if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
1559 return -EINVAL; 1557 return -EINVAL;
1560 /* hopefully this check will go away when userns support is
1561 * complete
1562 */
1563 if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
1564 !capable(CAP_SETGID))
1565 return -EPERM;
1566 } 1558 }
1567 1559
1568 /* 1560 /*
@@ -1724,7 +1716,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
1724{ 1716{
1725 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1717 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1726 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 1718 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1727 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) 1719 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
1720 CLONE_NEWUSER|CLONE_NEWPID))
1728 return -EINVAL; 1721 return -EINVAL;
1729 /* 1722 /*
1730 * Not implemented, but pretend it works if there is nothing to 1723 * Not implemented, but pretend it works if there is nothing to
@@ -1791,19 +1784,40 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1791{ 1784{
1792 struct fs_struct *fs, *new_fs = NULL; 1785 struct fs_struct *fs, *new_fs = NULL;
1793 struct files_struct *fd, *new_fd = NULL; 1786 struct files_struct *fd, *new_fd = NULL;
1787 struct cred *new_cred = NULL;
1794 struct nsproxy *new_nsproxy = NULL; 1788 struct nsproxy *new_nsproxy = NULL;
1795 int do_sysvsem = 0; 1789 int do_sysvsem = 0;
1796 int err; 1790 int err;
1797 1791
1798 err = check_unshare_flags(unshare_flags); 1792 /*
1799 if (err) 1793 * If unsharing a user namespace must also unshare the thread.
1800 goto bad_unshare_out; 1794 */
1801 1795 if (unshare_flags & CLONE_NEWUSER)
1796 unshare_flags |= CLONE_THREAD;
1797 /*
1798 * If unsharing a pid namespace must also unshare the thread.
1799 */
1800 if (unshare_flags & CLONE_NEWPID)
1801 unshare_flags |= CLONE_THREAD;
1802 /*
1803 * If unsharing a thread from a thread group, must also unshare vm.
1804 */
1805 if (unshare_flags & CLONE_THREAD)
1806 unshare_flags |= CLONE_VM;
1807 /*
1808 * If unsharing vm, must also unshare signal handlers.
1809 */
1810 if (unshare_flags & CLONE_VM)
1811 unshare_flags |= CLONE_SIGHAND;
1802 /* 1812 /*
1803 * If unsharing namespace, must also unshare filesystem information. 1813 * If unsharing namespace, must also unshare filesystem information.
1804 */ 1814 */
1805 if (unshare_flags & CLONE_NEWNS) 1815 if (unshare_flags & CLONE_NEWNS)
1806 unshare_flags |= CLONE_FS; 1816 unshare_flags |= CLONE_FS;
1817
1818 err = check_unshare_flags(unshare_flags);
1819 if (err)
1820 goto bad_unshare_out;
1807 /* 1821 /*
1808 * CLONE_NEWIPC must also detach from the undolist: after switching 1822 * CLONE_NEWIPC must also detach from the undolist: after switching
1809 * to a new ipc namespace, the semaphore arrays from the old 1823 * to a new ipc namespace, the semaphore arrays from the old
@@ -1817,11 +1831,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1817 err = unshare_fd(unshare_flags, &new_fd); 1831 err = unshare_fd(unshare_flags, &new_fd);
1818 if (err) 1832 if (err)
1819 goto bad_unshare_cleanup_fs; 1833 goto bad_unshare_cleanup_fs;
1820 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); 1834 err = unshare_userns(unshare_flags, &new_cred);
1821 if (err) 1835 if (err)
1822 goto bad_unshare_cleanup_fd; 1836 goto bad_unshare_cleanup_fd;
1837 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1838 new_cred, new_fs);
1839 if (err)
1840 goto bad_unshare_cleanup_cred;
1823 1841
1824 if (new_fs || new_fd || do_sysvsem || new_nsproxy) { 1842 if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
1825 if (do_sysvsem) { 1843 if (do_sysvsem) {
1826 /* 1844 /*
1827 * CLONE_SYSVSEM is equivalent to sys_exit(). 1845 * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1854,11 +1872,20 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1854 } 1872 }
1855 1873
1856 task_unlock(current); 1874 task_unlock(current);
1875
1876 if (new_cred) {
1877 /* Install the new user namespace */
1878 commit_creds(new_cred);
1879 new_cred = NULL;
1880 }
1857 } 1881 }
1858 1882
1859 if (new_nsproxy) 1883 if (new_nsproxy)
1860 put_nsproxy(new_nsproxy); 1884 put_nsproxy(new_nsproxy);
1861 1885
1886bad_unshare_cleanup_cred:
1887 if (new_cred)
1888 put_cred(new_cred);
1862bad_unshare_cleanup_fd: 1889bad_unshare_cleanup_fd:
1863 if (new_fd) 1890 if (new_fd)
1864 put_files_struct(new_fd); 1891 put_files_struct(new_fd);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 7e1c3de1ce45..78e2ecb20165 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -57,7 +57,8 @@ static inline struct nsproxy *create_nsproxy(void)
57 * leave it to the caller to do proper locking and attach it to task. 57 * leave it to the caller to do proper locking and attach it to task.
58 */ 58 */
59static struct nsproxy *create_new_namespaces(unsigned long flags, 59static struct nsproxy *create_new_namespaces(unsigned long flags,
60 struct task_struct *tsk, struct fs_struct *new_fs) 60 struct task_struct *tsk, struct user_namespace *user_ns,
61 struct fs_struct *new_fs)
61{ 62{
62 struct nsproxy *new_nsp; 63 struct nsproxy *new_nsp;
63 int err; 64 int err;
@@ -66,31 +67,31 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
66 if (!new_nsp) 67 if (!new_nsp)
67 return ERR_PTR(-ENOMEM); 68 return ERR_PTR(-ENOMEM);
68 69
69 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); 70 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
70 if (IS_ERR(new_nsp->mnt_ns)) { 71 if (IS_ERR(new_nsp->mnt_ns)) {
71 err = PTR_ERR(new_nsp->mnt_ns); 72 err = PTR_ERR(new_nsp->mnt_ns);
72 goto out_ns; 73 goto out_ns;
73 } 74 }
74 75
75 new_nsp->uts_ns = copy_utsname(flags, tsk); 76 new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
76 if (IS_ERR(new_nsp->uts_ns)) { 77 if (IS_ERR(new_nsp->uts_ns)) {
77 err = PTR_ERR(new_nsp->uts_ns); 78 err = PTR_ERR(new_nsp->uts_ns);
78 goto out_uts; 79 goto out_uts;
79 } 80 }
80 81
81 new_nsp->ipc_ns = copy_ipcs(flags, tsk); 82 new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
82 if (IS_ERR(new_nsp->ipc_ns)) { 83 if (IS_ERR(new_nsp->ipc_ns)) {
83 err = PTR_ERR(new_nsp->ipc_ns); 84 err = PTR_ERR(new_nsp->ipc_ns);
84 goto out_ipc; 85 goto out_ipc;
85 } 86 }
86 87
87 new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); 88 new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns);
88 if (IS_ERR(new_nsp->pid_ns)) { 89 if (IS_ERR(new_nsp->pid_ns)) {
89 err = PTR_ERR(new_nsp->pid_ns); 90 err = PTR_ERR(new_nsp->pid_ns);
90 goto out_pid; 91 goto out_pid;
91 } 92 }
92 93
93 new_nsp->net_ns = copy_net_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->net_ns); 94 new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
94 if (IS_ERR(new_nsp->net_ns)) { 95 if (IS_ERR(new_nsp->net_ns)) {
95 err = PTR_ERR(new_nsp->net_ns); 96 err = PTR_ERR(new_nsp->net_ns);
96 goto out_net; 97 goto out_net;
@@ -122,6 +123,7 @@ out_ns:
122int copy_namespaces(unsigned long flags, struct task_struct *tsk) 123int copy_namespaces(unsigned long flags, struct task_struct *tsk)
123{ 124{
124 struct nsproxy *old_ns = tsk->nsproxy; 125 struct nsproxy *old_ns = tsk->nsproxy;
126 struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
125 struct nsproxy *new_ns; 127 struct nsproxy *new_ns;
126 int err = 0; 128 int err = 0;
127 129
@@ -134,7 +136,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
134 CLONE_NEWPID | CLONE_NEWNET))) 136 CLONE_NEWPID | CLONE_NEWNET)))
135 return 0; 137 return 0;
136 138
137 if (!capable(CAP_SYS_ADMIN)) { 139 if (!ns_capable(user_ns, CAP_SYS_ADMIN)) {
138 err = -EPERM; 140 err = -EPERM;
139 goto out; 141 goto out;
140 } 142 }
@@ -151,7 +153,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
151 goto out; 153 goto out;
152 } 154 }
153 155
154 new_ns = create_new_namespaces(flags, tsk, tsk->fs); 156 new_ns = create_new_namespaces(flags, tsk,
157 task_cred_xxx(tsk, user_ns), tsk->fs);
155 if (IS_ERR(new_ns)) { 158 if (IS_ERR(new_ns)) {
156 err = PTR_ERR(new_ns); 159 err = PTR_ERR(new_ns);
157 goto out; 160 goto out;
@@ -183,19 +186,21 @@ void free_nsproxy(struct nsproxy *ns)
183 * On success, returns the new nsproxy. 186 * On success, returns the new nsproxy.
184 */ 187 */
185int unshare_nsproxy_namespaces(unsigned long unshare_flags, 188int unshare_nsproxy_namespaces(unsigned long unshare_flags,
186 struct nsproxy **new_nsp, struct fs_struct *new_fs) 189 struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
187{ 190{
191 struct user_namespace *user_ns;
188 int err = 0; 192 int err = 0;
189 193
190 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 194 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
191 CLONE_NEWNET))) 195 CLONE_NEWNET | CLONE_NEWPID)))
192 return 0; 196 return 0;
193 197
194 if (!capable(CAP_SYS_ADMIN)) 198 user_ns = new_cred ? new_cred->user_ns : current_user_ns();
199 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
195 return -EPERM; 200 return -EPERM;
196 201
197 *new_nsp = create_new_namespaces(unshare_flags, current, 202 *new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
198 new_fs ? new_fs : current->fs); 203 new_fs ? new_fs : current->fs);
199 if (IS_ERR(*new_nsp)) { 204 if (IS_ERR(*new_nsp)) {
200 err = PTR_ERR(*new_nsp); 205 err = PTR_ERR(*new_nsp);
201 goto out; 206 goto out;
@@ -241,9 +246,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
241 struct file *file; 246 struct file *file;
242 int err; 247 int err;
243 248
244 if (!capable(CAP_SYS_ADMIN))
245 return -EPERM;
246
247 file = proc_ns_fget(fd); 249 file = proc_ns_fget(fd);
248 if (IS_ERR(file)) 250 if (IS_ERR(file))
249 return PTR_ERR(file); 251 return PTR_ERR(file);
@@ -254,7 +256,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
254 if (nstype && (ops->type != nstype)) 256 if (nstype && (ops->type != nstype))
255 goto out; 257 goto out;
256 258
257 new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); 259 new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
258 if (IS_ERR(new_nsproxy)) { 260 if (IS_ERR(new_nsproxy)) {
259 err = PTR_ERR(new_nsproxy); 261 err = PTR_ERR(new_nsproxy);
260 goto out; 262 goto out;
diff --git a/kernel/pid.c b/kernel/pid.c
index fd996c1ed9f8..3e2cf8100acc 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -36,6 +36,7 @@
36#include <linux/pid_namespace.h> 36#include <linux/pid_namespace.h>
37#include <linux/init_task.h> 37#include <linux/init_task.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/proc_fs.h>
39 40
40#define pid_hashfn(nr, ns) \ 41#define pid_hashfn(nr, ns) \
41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) 42 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -78,6 +79,8 @@ struct pid_namespace init_pid_ns = {
78 .last_pid = 0, 79 .last_pid = 0,
79 .level = 0, 80 .level = 0,
80 .child_reaper = &init_task, 81 .child_reaper = &init_task,
82 .user_ns = &init_user_ns,
83 .proc_inum = PROC_PID_INIT_INO,
81}; 84};
82EXPORT_SYMBOL_GPL(init_pid_ns); 85EXPORT_SYMBOL_GPL(init_pid_ns);
83 86
@@ -269,8 +272,24 @@ void free_pid(struct pid *pid)
269 unsigned long flags; 272 unsigned long flags;
270 273
271 spin_lock_irqsave(&pidmap_lock, flags); 274 spin_lock_irqsave(&pidmap_lock, flags);
272 for (i = 0; i <= pid->level; i++) 275 for (i = 0; i <= pid->level; i++) {
273 hlist_del_rcu(&pid->numbers[i].pid_chain); 276 struct upid *upid = pid->numbers + i;
277 struct pid_namespace *ns = upid->ns;
278 hlist_del_rcu(&upid->pid_chain);
279 switch(--ns->nr_hashed) {
280 case 1:
281 /* When all that is left in the pid namespace
282 * is the reaper wake up the reaper. The reaper
283 * may be sleeping in zap_pid_ns_processes().
284 */
285 wake_up_process(ns->child_reaper);
286 break;
287 case 0:
288 ns->nr_hashed = -1;
289 schedule_work(&ns->proc_work);
290 break;
291 }
292 }
274 spin_unlock_irqrestore(&pidmap_lock, flags); 293 spin_unlock_irqrestore(&pidmap_lock, flags);
275 294
276 for (i = 0; i <= pid->level; i++) 295 for (i = 0; i <= pid->level; i++)
@@ -292,6 +311,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
292 goto out; 311 goto out;
293 312
294 tmp = ns; 313 tmp = ns;
314 pid->level = ns->level;
295 for (i = ns->level; i >= 0; i--) { 315 for (i = ns->level; i >= 0; i--) {
296 nr = alloc_pidmap(tmp); 316 nr = alloc_pidmap(tmp);
297 if (nr < 0) 317 if (nr < 0)
@@ -302,22 +322,32 @@ struct pid *alloc_pid(struct pid_namespace *ns)
302 tmp = tmp->parent; 322 tmp = tmp->parent;
303 } 323 }
304 324
325 if (unlikely(is_child_reaper(pid))) {
326 if (pid_ns_prepare_proc(ns))
327 goto out_free;
328 }
329
305 get_pid_ns(ns); 330 get_pid_ns(ns);
306 pid->level = ns->level;
307 atomic_set(&pid->count, 1); 331 atomic_set(&pid->count, 1);
308 for (type = 0; type < PIDTYPE_MAX; ++type) 332 for (type = 0; type < PIDTYPE_MAX; ++type)
309 INIT_HLIST_HEAD(&pid->tasks[type]); 333 INIT_HLIST_HEAD(&pid->tasks[type]);
310 334
311 upid = pid->numbers + ns->level; 335 upid = pid->numbers + ns->level;
312 spin_lock_irq(&pidmap_lock); 336 spin_lock_irq(&pidmap_lock);
313 for ( ; upid >= pid->numbers; --upid) 337 if (ns->nr_hashed < 0)
338 goto out_unlock;
339 for ( ; upid >= pid->numbers; --upid) {
314 hlist_add_head_rcu(&upid->pid_chain, 340 hlist_add_head_rcu(&upid->pid_chain,
315 &pid_hash[pid_hashfn(upid->nr, upid->ns)]); 341 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
342 upid->ns->nr_hashed++;
343 }
316 spin_unlock_irq(&pidmap_lock); 344 spin_unlock_irq(&pidmap_lock);
317 345
318out: 346out:
319 return pid; 347 return pid;
320 348
349out_unlock:
350 spin_unlock(&pidmap_lock);
321out_free: 351out_free:
322 while (++i <= ns->level) 352 while (++i <= ns->level)
323 free_pidmap(pid->numbers + i); 353 free_pidmap(pid->numbers + i);
@@ -344,7 +374,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns);
344 374
345struct pid *find_vpid(int nr) 375struct pid *find_vpid(int nr)
346{ 376{
347 return find_pid_ns(nr, current->nsproxy->pid_ns); 377 return find_pid_ns(nr, task_active_pid_ns(current));
348} 378}
349EXPORT_SYMBOL_GPL(find_vpid); 379EXPORT_SYMBOL_GPL(find_vpid);
350 380
@@ -428,7 +458,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
428 458
429struct task_struct *find_task_by_vpid(pid_t vnr) 459struct task_struct *find_task_by_vpid(pid_t vnr)
430{ 460{
431 return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); 461 return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
432} 462}
433 463
434struct pid *get_task_pid(struct task_struct *task, enum pid_type type) 464struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
@@ -483,7 +513,7 @@ EXPORT_SYMBOL_GPL(pid_nr_ns);
483 513
484pid_t pid_vnr(struct pid *pid) 514pid_t pid_vnr(struct pid *pid)
485{ 515{
486 return pid_nr_ns(pid, current->nsproxy->pid_ns); 516 return pid_nr_ns(pid, task_active_pid_ns(current));
487} 517}
488EXPORT_SYMBOL_GPL(pid_vnr); 518EXPORT_SYMBOL_GPL(pid_vnr);
489 519
@@ -494,7 +524,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
494 524
495 rcu_read_lock(); 525 rcu_read_lock();
496 if (!ns) 526 if (!ns)
497 ns = current->nsproxy->pid_ns; 527 ns = task_active_pid_ns(current);
498 if (likely(pid_alive(task))) { 528 if (likely(pid_alive(task))) {
499 if (type != PIDTYPE_PID) 529 if (type != PIDTYPE_PID)
500 task = task->group_leader; 530 task = task->group_leader;
@@ -569,6 +599,7 @@ void __init pidmap_init(void)
569 /* Reserve PID 0. We never call free_pidmap(0) */ 599 /* Reserve PID 0. We never call free_pidmap(0) */
570 set_bit(0, init_pid_ns.pidmap[0].page); 600 set_bit(0, init_pid_ns.pidmap[0].page);
571 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 601 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
602 init_pid_ns.nr_hashed = 1;
572 603
573 init_pid_ns.pid_cachep = KMEM_CACHE(pid, 604 init_pid_ns.pid_cachep = KMEM_CACHE(pid,
574 SLAB_HWCACHE_ALIGN | SLAB_PANIC); 605 SLAB_HWCACHE_ALIGN | SLAB_PANIC);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 7b07cc0dfb75..560da0dab230 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -10,6 +10,7 @@
10 10
11#include <linux/pid.h> 11#include <linux/pid.h>
12#include <linux/pid_namespace.h> 12#include <linux/pid_namespace.h>
13#include <linux/user_namespace.h>
13#include <linux/syscalls.h> 14#include <linux/syscalls.h>
14#include <linux/err.h> 15#include <linux/err.h>
15#include <linux/acct.h> 16#include <linux/acct.h>
@@ -71,10 +72,17 @@ err_alloc:
71 return NULL; 72 return NULL;
72} 73}
73 74
75static void proc_cleanup_work(struct work_struct *work)
76{
77 struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
78 pid_ns_release_proc(ns);
79}
80
74/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ 81/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
75#define MAX_PID_NS_LEVEL 32 82#define MAX_PID_NS_LEVEL 32
76 83
77static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) 84static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
85 struct pid_namespace *parent_pid_ns)
78{ 86{
79 struct pid_namespace *ns; 87 struct pid_namespace *ns;
80 unsigned int level = parent_pid_ns->level + 1; 88 unsigned int level = parent_pid_ns->level + 1;
@@ -99,9 +107,15 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
99 if (ns->pid_cachep == NULL) 107 if (ns->pid_cachep == NULL)
100 goto out_free_map; 108 goto out_free_map;
101 109
110 err = proc_alloc_inum(&ns->proc_inum);
111 if (err)
112 goto out_free_map;
113
102 kref_init(&ns->kref); 114 kref_init(&ns->kref);
103 ns->level = level; 115 ns->level = level;
104 ns->parent = get_pid_ns(parent_pid_ns); 116 ns->parent = get_pid_ns(parent_pid_ns);
117 ns->user_ns = get_user_ns(user_ns);
118 INIT_WORK(&ns->proc_work, proc_cleanup_work);
105 119
106 set_bit(0, ns->pidmap[0].page); 120 set_bit(0, ns->pidmap[0].page);
107 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 121 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -109,14 +123,8 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
109 for (i = 1; i < PIDMAP_ENTRIES; i++) 123 for (i = 1; i < PIDMAP_ENTRIES; i++)
110 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 124 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
111 125
112 err = pid_ns_prepare_proc(ns);
113 if (err)
114 goto out_put_parent_pid_ns;
115
116 return ns; 126 return ns;
117 127
118out_put_parent_pid_ns:
119 put_pid_ns(parent_pid_ns);
120out_free_map: 128out_free_map:
121 kfree(ns->pidmap[0].page); 129 kfree(ns->pidmap[0].page);
122out_free: 130out_free:
@@ -129,18 +137,21 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
129{ 137{
130 int i; 138 int i;
131 139
140 proc_free_inum(ns->proc_inum);
132 for (i = 0; i < PIDMAP_ENTRIES; i++) 141 for (i = 0; i < PIDMAP_ENTRIES; i++)
133 kfree(ns->pidmap[i].page); 142 kfree(ns->pidmap[i].page);
143 put_user_ns(ns->user_ns);
134 kmem_cache_free(pid_ns_cachep, ns); 144 kmem_cache_free(pid_ns_cachep, ns);
135} 145}
136 146
137struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) 147struct pid_namespace *copy_pid_ns(unsigned long flags,
148 struct user_namespace *user_ns, struct pid_namespace *old_ns)
138{ 149{
139 if (!(flags & CLONE_NEWPID)) 150 if (!(flags & CLONE_NEWPID))
140 return get_pid_ns(old_ns); 151 return get_pid_ns(old_ns);
141 if (flags & (CLONE_THREAD|CLONE_PARENT)) 152 if (task_active_pid_ns(current) != old_ns)
142 return ERR_PTR(-EINVAL); 153 return ERR_PTR(-EINVAL);
143 return create_pid_namespace(old_ns); 154 return create_pid_namespace(user_ns, old_ns);
144} 155}
145 156
146static void free_pid_ns(struct kref *kref) 157static void free_pid_ns(struct kref *kref)
@@ -211,22 +222,15 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
211 222
212 /* 223 /*
213 * sys_wait4() above can't reap the TASK_DEAD children. 224 * sys_wait4() above can't reap the TASK_DEAD children.
214 * Make sure they all go away, see __unhash_process(). 225 * Make sure they all go away, see free_pid().
215 */ 226 */
216 for (;;) { 227 for (;;) {
217 bool need_wait = false; 228 set_current_state(TASK_UNINTERRUPTIBLE);
218 229 if (pid_ns->nr_hashed == 1)
219 read_lock(&tasklist_lock);
220 if (!list_empty(&current->children)) {
221 __set_current_state(TASK_UNINTERRUPTIBLE);
222 need_wait = true;
223 }
224 read_unlock(&tasklist_lock);
225
226 if (!need_wait)
227 break; 230 break;
228 schedule(); 231 schedule();
229 } 232 }
233 __set_current_state(TASK_RUNNING);
230 234
231 if (pid_ns->reboot) 235 if (pid_ns->reboot)
232 current->signal->group_exit_code = pid_ns->reboot; 236 current->signal->group_exit_code = pid_ns->reboot;
@@ -239,9 +243,10 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
239static int pid_ns_ctl_handler(struct ctl_table *table, int write, 243static int pid_ns_ctl_handler(struct ctl_table *table, int write,
240 void __user *buffer, size_t *lenp, loff_t *ppos) 244 void __user *buffer, size_t *lenp, loff_t *ppos)
241{ 245{
246 struct pid_namespace *pid_ns = task_active_pid_ns(current);
242 struct ctl_table tmp = *table; 247 struct ctl_table tmp = *table;
243 248
244 if (write && !capable(CAP_SYS_ADMIN)) 249 if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
245 return -EPERM; 250 return -EPERM;
246 251
247 /* 252 /*
@@ -250,7 +255,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
250 * it should synchronize its usage with external means. 255 * it should synchronize its usage with external means.
251 */ 256 */
252 257
253 tmp.data = &current->nsproxy->pid_ns->last_pid; 258 tmp.data = &pid_ns->last_pid;
254 return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); 259 return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
255} 260}
256 261
@@ -299,6 +304,67 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
299 return 0; 304 return 0;
300} 305}
301 306
307static void *pidns_get(struct task_struct *task)
308{
309 struct pid_namespace *ns;
310
311 rcu_read_lock();
312 ns = get_pid_ns(task_active_pid_ns(task));
313 rcu_read_unlock();
314
315 return ns;
316}
317
318static void pidns_put(void *ns)
319{
320 put_pid_ns(ns);
321}
322
323static int pidns_install(struct nsproxy *nsproxy, void *ns)
324{
325 struct pid_namespace *active = task_active_pid_ns(current);
326 struct pid_namespace *ancestor, *new = ns;
327
328 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN))
329 return -EPERM;
330
331 /*
332 * Only allow entering the current active pid namespace
333 * or a child of the current active pid namespace.
334 *
335 * This is required for fork to return a usable pid value and
336 * this maintains the property that processes and their
337 * children can not escape their current pid namespace.
338 */
339 if (new->level < active->level)
340 return -EINVAL;
341
342 ancestor = new;
343 while (ancestor->level > active->level)
344 ancestor = ancestor->parent;
345 if (ancestor != active)
346 return -EINVAL;
347
348 put_pid_ns(nsproxy->pid_ns);
349 nsproxy->pid_ns = get_pid_ns(new);
350 return 0;
351}
352
353static unsigned int pidns_inum(void *ns)
354{
355 struct pid_namespace *pid_ns = ns;
356 return pid_ns->proc_inum;
357}
358
359const struct proc_ns_operations pidns_operations = {
360 .name = "pid",
361 .type = CLONE_NEWPID,
362 .get = pidns_get,
363 .put = pidns_put,
364 .install = pidns_install,
365 .inum = pidns_inum,
366};
367
302static __init int pid_namespaces_init(void) 368static __init int pid_namespaces_init(void)
303{ 369{
304 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 370 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1f5e55dda955..7b09b88862cc 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -215,8 +215,12 @@ ok:
215 smp_rmb(); 215 smp_rmb();
216 if (task->mm) 216 if (task->mm)
217 dumpable = get_dumpable(task->mm); 217 dumpable = get_dumpable(task->mm);
218 if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) 218 rcu_read_lock();
219 if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
220 rcu_read_unlock();
219 return -EPERM; 221 return -EPERM;
222 }
223 rcu_read_unlock();
220 224
221 return security_ptrace_access_check(task, mode); 225 return security_ptrace_access_check(task, mode);
222} 226}
@@ -280,8 +284,10 @@ static int ptrace_attach(struct task_struct *task, long request,
280 284
281 if (seize) 285 if (seize)
282 flags |= PT_SEIZED; 286 flags |= PT_SEIZED;
283 if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) 287 rcu_read_lock();
288 if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE))
284 flags |= PT_PTRACE_CAP; 289 flags |= PT_PTRACE_CAP;
290 rcu_read_unlock();
285 task->ptrace = flags; 291 task->ptrace = flags;
286 292
287 __ptrace_link(task, current); 293 __ptrace_link(task, current);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c1fb82104bfb..257002c13bb0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4097,8 +4097,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4097 goto out_free_cpus_allowed; 4097 goto out_free_cpus_allowed;
4098 } 4098 }
4099 retval = -EPERM; 4099 retval = -EPERM;
4100 if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) 4100 if (!check_same_owner(p)) {
4101 goto out_unlock; 4101 rcu_read_lock();
4102 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
4103 rcu_read_unlock();
4104 goto out_unlock;
4105 }
4106 rcu_read_unlock();
4107 }
4102 4108
4103 retval = security_task_setscheduler(p); 4109 retval = security_task_setscheduler(p);
4104 if (retval) 4110 if (retval)
diff --git a/kernel/signal.c b/kernel/signal.c
index a49c7f36ceb3..580a91e63471 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1753,7 +1753,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1753 * see comment in do_notify_parent() about the following 4 lines 1753 * see comment in do_notify_parent() about the following 4 lines
1754 */ 1754 */
1755 rcu_read_lock(); 1755 rcu_read_lock();
1756 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1756 info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
1757 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); 1757 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
1758 rcu_read_unlock(); 1758 rcu_read_unlock();
1759 1759
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 65bdcf198d4e..5a6384450501 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1344,7 +1344,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1344 goto out_putname; 1344 goto out_putname;
1345 } 1345 }
1346 1346
1347 mnt = current->nsproxy->pid_ns->proc_mnt; 1347 mnt = task_active_pid_ns(current)->proc_mnt;
1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags); 1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
1349 result = PTR_ERR(file); 1349 result = PTR_ERR(file);
1350 if (IS_ERR(file)) 1350 if (IS_ERR(file))
diff --git a/kernel/user.c b/kernel/user.c
index 750acffbe9ec..33acb5e53a5f 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/export.h> 17#include <linux/export.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include <linux/proc_fs.h>
19 20
20/* 21/*
21 * userns count is 1 for root user, 1 for init_uts_ns, 22 * userns count is 1 for root user, 1 for init_uts_ns,
@@ -51,6 +52,7 @@ struct user_namespace init_user_ns = {
51 }, 52 },
52 .owner = GLOBAL_ROOT_UID, 53 .owner = GLOBAL_ROOT_UID,
53 .group = GLOBAL_ROOT_GID, 54 .group = GLOBAL_ROOT_GID,
55 .proc_inum = PROC_USER_INIT_INO,
54}; 56};
55EXPORT_SYMBOL_GPL(init_user_ns); 57EXPORT_SYMBOL_GPL(init_user_ns);
56 58
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 456a6b9fba34..f5975ccf9348 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/proc_fs.h>
12#include <linux/highuid.h> 13#include <linux/highuid.h>
13#include <linux/cred.h> 14#include <linux/cred.h>
14#include <linux/securebits.h> 15#include <linux/securebits.h>
@@ -26,6 +27,24 @@ static struct kmem_cache *user_ns_cachep __read_mostly;
26static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 27static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
27 struct uid_gid_map *map); 28 struct uid_gid_map *map);
28 29
30static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
31{
32 /* Start with the same capabilities as init but useless for doing
33 * anything as the capabilities are bound to the new user namespace.
34 */
35 cred->securebits = SECUREBITS_DEFAULT;
36 cred->cap_inheritable = CAP_EMPTY_SET;
37 cred->cap_permitted = CAP_FULL_SET;
38 cred->cap_effective = CAP_FULL_SET;
39 cred->cap_bset = CAP_FULL_SET;
40#ifdef CONFIG_KEYS
41 key_put(cred->request_key_auth);
42 cred->request_key_auth = NULL;
43#endif
44 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
45 cred->user_ns = user_ns;
46}
47
29/* 48/*
30 * Create a new user namespace, deriving the creator from the user in the 49 * Create a new user namespace, deriving the creator from the user in the
31 * passed credentials, and replacing that user with the new root user for the 50 * passed credentials, and replacing that user with the new root user for the
@@ -39,6 +58,7 @@ int create_user_ns(struct cred *new)
39 struct user_namespace *ns, *parent_ns = new->user_ns; 58 struct user_namespace *ns, *parent_ns = new->user_ns;
40 kuid_t owner = new->euid; 59 kuid_t owner = new->euid;
41 kgid_t group = new->egid; 60 kgid_t group = new->egid;
61 int ret;
42 62
43 /* The creator needs a mapping in the parent user namespace 63 /* The creator needs a mapping in the parent user namespace
44 * or else we won't be able to reasonably tell userspace who 64 * or else we won't be able to reasonably tell userspace who
@@ -52,38 +72,45 @@ int create_user_ns(struct cred *new)
52 if (!ns) 72 if (!ns)
53 return -ENOMEM; 73 return -ENOMEM;
54 74
75 ret = proc_alloc_inum(&ns->proc_inum);
76 if (ret) {
77 kmem_cache_free(user_ns_cachep, ns);
78 return ret;
79 }
80
55 kref_init(&ns->kref); 81 kref_init(&ns->kref);
82 /* Leave the new->user_ns reference with the new user namespace. */
56 ns->parent = parent_ns; 83 ns->parent = parent_ns;
57 ns->owner = owner; 84 ns->owner = owner;
58 ns->group = group; 85 ns->group = group;
59 86
60 /* Start with the same capabilities as init but useless for doing 87 set_cred_user_ns(new, ns);
61 * anything as the capabilities are bound to the new user namespace.
62 */
63 new->securebits = SECUREBITS_DEFAULT;
64 new->cap_inheritable = CAP_EMPTY_SET;
65 new->cap_permitted = CAP_FULL_SET;
66 new->cap_effective = CAP_FULL_SET;
67 new->cap_bset = CAP_FULL_SET;
68#ifdef CONFIG_KEYS
69 key_put(new->request_key_auth);
70 new->request_key_auth = NULL;
71#endif
72 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
73
74 /* Leave the new->user_ns reference with the new user namespace. */
75 /* Leave the reference to our user_ns with the new cred. */
76 new->user_ns = ns;
77 88
78 return 0; 89 return 0;
79} 90}
80 91
92int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
93{
94 struct cred *cred;
95
96 if (!(unshare_flags & CLONE_NEWUSER))
97 return 0;
98
99 cred = prepare_creds();
100 if (!cred)
101 return -ENOMEM;
102
103 *new_cred = cred;
104 return create_user_ns(cred);
105}
106
81void free_user_ns(struct kref *kref) 107void free_user_ns(struct kref *kref)
82{ 108{
83 struct user_namespace *parent, *ns = 109 struct user_namespace *parent, *ns =
84 container_of(kref, struct user_namespace, kref); 110 container_of(kref, struct user_namespace, kref);
85 111
86 parent = ns->parent; 112 parent = ns->parent;
113 proc_free_inum(ns->proc_inum);
87 kmem_cache_free(user_ns_cachep, ns); 114 kmem_cache_free(user_ns_cachep, ns);
88 put_user_ns(parent); 115 put_user_ns(parent);
89} 116}
@@ -372,7 +399,7 @@ static int uid_m_show(struct seq_file *seq, void *v)
372 struct user_namespace *lower_ns; 399 struct user_namespace *lower_ns;
373 uid_t lower; 400 uid_t lower;
374 401
375 lower_ns = current_user_ns(); 402 lower_ns = seq_user_ns(seq);
376 if ((lower_ns == ns) && lower_ns->parent) 403 if ((lower_ns == ns) && lower_ns->parent)
377 lower_ns = lower_ns->parent; 404 lower_ns = lower_ns->parent;
378 405
@@ -393,7 +420,7 @@ static int gid_m_show(struct seq_file *seq, void *v)
393 struct user_namespace *lower_ns; 420 struct user_namespace *lower_ns;
394 gid_t lower; 421 gid_t lower;
395 422
396 lower_ns = current_user_ns(); 423 lower_ns = seq_user_ns(seq);
397 if ((lower_ns == ns) && lower_ns->parent) 424 if ((lower_ns == ns) && lower_ns->parent)
398 lower_ns = lower_ns->parent; 425 lower_ns = lower_ns->parent;
399 426
@@ -669,10 +696,14 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz
669{ 696{
670 struct seq_file *seq = file->private_data; 697 struct seq_file *seq = file->private_data;
671 struct user_namespace *ns = seq->private; 698 struct user_namespace *ns = seq->private;
699 struct user_namespace *seq_ns = seq_user_ns(seq);
672 700
673 if (!ns->parent) 701 if (!ns->parent)
674 return -EPERM; 702 return -EPERM;
675 703
704 if ((seq_ns != ns) && (seq_ns != ns->parent))
705 return -EPERM;
706
676 return map_write(file, buf, size, ppos, CAP_SETUID, 707 return map_write(file, buf, size, ppos, CAP_SETUID,
677 &ns->uid_map, &ns->parent->uid_map); 708 &ns->uid_map, &ns->parent->uid_map);
678} 709}
@@ -681,10 +712,14 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz
681{ 712{
682 struct seq_file *seq = file->private_data; 713 struct seq_file *seq = file->private_data;
683 struct user_namespace *ns = seq->private; 714 struct user_namespace *ns = seq->private;
715 struct user_namespace *seq_ns = seq_user_ns(seq);
684 716
685 if (!ns->parent) 717 if (!ns->parent)
686 return -EPERM; 718 return -EPERM;
687 719
720 if ((seq_ns != ns) && (seq_ns != ns->parent))
721 return -EPERM;
722
688 return map_write(file, buf, size, ppos, CAP_SETGID, 723 return map_write(file, buf, size, ppos, CAP_SETGID,
689 &ns->gid_map, &ns->parent->gid_map); 724 &ns->gid_map, &ns->parent->gid_map);
690} 725}
@@ -709,6 +744,21 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
709static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 744static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
710 struct uid_gid_map *new_map) 745 struct uid_gid_map *new_map)
711{ 746{
747 /* Allow mapping to your own filesystem ids */
748 if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) {
749 u32 id = new_map->extent[0].lower_first;
750 if (cap_setid == CAP_SETUID) {
751 kuid_t uid = make_kuid(ns->parent, id);
752 if (uid_eq(uid, current_fsuid()))
753 return true;
754 }
755 else if (cap_setid == CAP_SETGID) {
756 kgid_t gid = make_kgid(ns->parent, id);
757 if (gid_eq(gid, current_fsgid()))
758 return true;
759 }
760 }
761
712 /* Allow anyone to set a mapping that doesn't require privilege */ 762 /* Allow anyone to set a mapping that doesn't require privilege */
713 if (!cap_valid(cap_setid)) 763 if (!cap_valid(cap_setid))
714 return true; 764 return true;
@@ -722,6 +772,65 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
722 return false; 772 return false;
723} 773}
724 774
775static void *userns_get(struct task_struct *task)
776{
777 struct user_namespace *user_ns;
778
779 rcu_read_lock();
780 user_ns = get_user_ns(__task_cred(task)->user_ns);
781 rcu_read_unlock();
782
783 return user_ns;
784}
785
786static void userns_put(void *ns)
787{
788 put_user_ns(ns);
789}
790
791static int userns_install(struct nsproxy *nsproxy, void *ns)
792{
793 struct user_namespace *user_ns = ns;
794 struct cred *cred;
795
796 /* Don't allow gaining capabilities by reentering
797 * the same user namespace.
798 */
799 if (user_ns == current_user_ns())
800 return -EINVAL;
801
802 /* Threaded processes may not enter a different user namespace */
803 if (atomic_read(&current->mm->mm_users) > 1)
804 return -EINVAL;
805
806 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
807 return -EPERM;
808
809 cred = prepare_creds();
810 if (!cred)
811 return -ENOMEM;
812
813 put_user_ns(cred->user_ns);
814 set_cred_user_ns(cred, get_user_ns(user_ns));
815
816 return commit_creds(cred);
817}
818
819static unsigned int userns_inum(void *ns)
820{
821 struct user_namespace *user_ns = ns;
822 return user_ns->proc_inum;
823}
824
825const struct proc_ns_operations userns_operations = {
826 .name = "user",
827 .type = CLONE_NEWUSER,
828 .get = userns_get,
829 .put = userns_put,
830 .install = userns_install,
831 .inum = userns_inum,
832};
833
725static __init int user_namespaces_init(void) 834static __init int user_namespaces_init(void)
726{ 835{
727 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); 836 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 679d97a5d3fd..f6336d51d64c 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -32,18 +32,25 @@ static struct uts_namespace *create_uts_ns(void)
32 * @old_ns: namespace to clone 32 * @old_ns: namespace to clone
33 * Return an ERR_PTR() on error (allocation failure), new ns otherwise 33 * Return an ERR_PTR() on error (allocation failure), new ns otherwise
34 */ 34 */
35static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, 35static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
36 struct uts_namespace *old_ns) 36 struct uts_namespace *old_ns)
37{ 37{
38 struct uts_namespace *ns; 38 struct uts_namespace *ns;
39 int err;
39 40
40 ns = create_uts_ns(); 41 ns = create_uts_ns();
41 if (!ns) 42 if (!ns)
42 return ERR_PTR(-ENOMEM); 43 return ERR_PTR(-ENOMEM);
43 44
45 err = proc_alloc_inum(&ns->proc_inum);
46 if (err) {
47 kfree(ns);
48 return ERR_PTR(err);
49 }
50
44 down_read(&uts_sem); 51 down_read(&uts_sem);
45 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 52 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
46 ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); 53 ns->user_ns = get_user_ns(user_ns);
47 up_read(&uts_sem); 54 up_read(&uts_sem);
48 return ns; 55 return ns;
49} 56}
@@ -55,9 +62,8 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
55 * versa. 62 * versa.
56 */ 63 */
57struct uts_namespace *copy_utsname(unsigned long flags, 64struct uts_namespace *copy_utsname(unsigned long flags,
58 struct task_struct *tsk) 65 struct user_namespace *user_ns, struct uts_namespace *old_ns)
59{ 66{
60 struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
61 struct uts_namespace *new_ns; 67 struct uts_namespace *new_ns;
62 68
63 BUG_ON(!old_ns); 69 BUG_ON(!old_ns);
@@ -66,7 +72,7 @@ struct uts_namespace *copy_utsname(unsigned long flags,
66 if (!(flags & CLONE_NEWUTS)) 72 if (!(flags & CLONE_NEWUTS))
67 return old_ns; 73 return old_ns;
68 74
69 new_ns = clone_uts_ns(tsk, old_ns); 75 new_ns = clone_uts_ns(user_ns, old_ns);
70 76
71 put_uts_ns(old_ns); 77 put_uts_ns(old_ns);
72 return new_ns; 78 return new_ns;
@@ -78,6 +84,7 @@ void free_uts_ns(struct kref *kref)
78 84
79 ns = container_of(kref, struct uts_namespace, kref); 85 ns = container_of(kref, struct uts_namespace, kref);
80 put_user_ns(ns->user_ns); 86 put_user_ns(ns->user_ns);
87 proc_free_inum(ns->proc_inum);
81 kfree(ns); 88 kfree(ns);
82} 89}
83 90
@@ -102,19 +109,31 @@ static void utsns_put(void *ns)
102 put_uts_ns(ns); 109 put_uts_ns(ns);
103} 110}
104 111
105static int utsns_install(struct nsproxy *nsproxy, void *ns) 112static int utsns_install(struct nsproxy *nsproxy, void *new)
106{ 113{
114 struct uts_namespace *ns = new;
115
116 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
117 return -EPERM;
118
107 get_uts_ns(ns); 119 get_uts_ns(ns);
108 put_uts_ns(nsproxy->uts_ns); 120 put_uts_ns(nsproxy->uts_ns);
109 nsproxy->uts_ns = ns; 121 nsproxy->uts_ns = ns;
110 return 0; 122 return 0;
111} 123}
112 124
125static unsigned int utsns_inum(void *vp)
126{
127 struct uts_namespace *ns = vp;
128
129 return ns->proc_inum;
130}
131
113const struct proc_ns_operations utsns_operations = { 132const struct proc_ns_operations utsns_operations = {
114 .name = "uts", 133 .name = "uts",
115 .type = CLONE_NEWUTS, 134 .type = CLONE_NEWUTS,
116 .get = utsns_get, 135 .get = utsns_get,
117 .put = utsns_put, 136 .put = utsns_put,
118 .install = utsns_install, 137 .install = utsns_install,
138 .inum = utsns_inum,
119}; 139};
120