aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-12-17 18:44:47 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2012-12-17 18:44:47 -0500
commit6a2b60b17b3e48a418695a94bd2420f6ab32e519 (patch)
tree54b7792fa68b8890f710fa6398b6ba8626a039a8 /fs
parent9228ff90387e276ad67b10c0eb525c9d6a57d5e9 (diff)
parent98f842e675f96ffac96e6c50315790912b2812be (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
Pull user namespace changes from Eric Biederman: "While small this set of changes is very significant with respect to containers in general and user namespaces in particular. The user space interface is now complete. This set of changes adds support for unprivileged users to create user namespaces and as a user namespace root to create other namespaces. The tyranny of supporting suid root preventing unprivileged users from using cool new kernel features is broken. This set of changes completes the work on setns, adding support for the pid, user, mount namespaces. This set of changes includes a bunch of basic pid namespace cleanups/simplifications. Of particular significance is the rework of the pid namespace cleanup so it no longer requires sending out tendrils into all kinds of unexpected cleanup paths for operation. At least one case of broken error handling is fixed by this cleanup. The files under /proc/<pid>/ns/ have been converted from regular files to magic symlinks which prevents incorrect caching by the VFS, ensuring the files always refer to the namespace the process is currently using and ensuring that the ptrace_mayaccess permission checks are always applied. The files under /proc/<pid>/ns/ have been given stable inode numbers so it is now possible to see if different processes share the same namespaces. Through the David Miller's net tree are changes to relax many of the permission checks in the networking stack to allowing the user namespace root to usefully use the networking stack. Similar changes for the mount namespace and the pid namespace are coming through my tree. Two small changes to add user namespace support were commited here adn in David Miller's -net tree so that I could complete the work on the /proc/<pid>/ns/ files in this tree. Work remains to make it safe to build user namespaces and 9p, afs, ceph, cifs, coda, gfs2, ncpfs, nfs, nfsd, ocfs2, and xfs so the Kconfig guard remains in place preventing that user namespaces from being built when any of those filesystems are enabled. Future design work remains to allow root users outside of the initial user namespace to mount more than just /proc and /sys." * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: (38 commits) proc: Usable inode numbers for the namespace file descriptors. proc: Fix the namespace inode permission checks. proc: Generalize proc inode allocation userns: Allow unprivilged mounts of proc and sysfs userns: For /proc/self/{uid,gid}_map derive the lower userns from the struct file procfs: Print task uids and gids in the userns that opened the proc file userns: Implement unshare of the user namespace userns: Implent proc namespace operations userns: Kill task_user_ns userns: Make create_new_namespaces take a user_ns parameter userns: Allow unprivileged use of setns. userns: Allow unprivileged users to create new namespaces userns: Allow setting a userns mapping to your current uid. userns: Allow chown and setgid preservation userns: Allow unprivileged users to create user namespaces. userns: Ignore suid and sgid on binaries if the uid or gid can not be mapped userns: fix return value on mntns_install() failure vfs: Allow unprivileged manipulation of the mount namespace. vfs: Only support slave subtrees across different user namespaces vfs: Add a user namespace reference from struct mnt_namespace ...
Diffstat (limited to 'fs')
-rw-r--r--fs/attr.c11
-rw-r--r--fs/autofs4/autofs_i.h8
-rw-r--r--fs/autofs4/dev-ioctl.c4
-rw-r--r--fs/autofs4/inode.c24
-rw-r--r--fs/autofs4/waitq.c5
-rw-r--r--fs/exec.c9
-rw-r--r--fs/fuse/dev.c4
-rw-r--r--fs/fuse/dir.c20
-rw-r--r--fs/fuse/fuse_i.h4
-rw-r--r--fs/fuse/inode.c23
-rw-r--r--fs/hppfs/hppfs.c2
-rw-r--r--fs/mount.h3
-rw-r--r--fs/namespace.c211
-rw-r--r--fs/open.c2
-rw-r--r--fs/pnode.h1
-rw-r--r--fs/proc/Makefile1
-rw-r--r--fs/proc/array.c2
-rw-r--r--fs/proc/base.c169
-rw-r--r--fs/proc/generic.c26
-rw-r--r--fs/proc/inode.c6
-rw-r--r--fs/proc/internal.h1
-rw-r--r--fs/proc/namespaces.c185
-rw-r--r--fs/proc/root.c17
-rw-r--r--fs/proc/self.c59
-rw-r--r--fs/sysfs/mount.c1
25 files changed, 493 insertions, 305 deletions
diff --git a/fs/attr.c b/fs/attr.c
index cce7df53b694..1449adb14ef6 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -49,14 +49,15 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
49 /* Make sure a caller can chown. */ 49 /* Make sure a caller can chown. */
50 if ((ia_valid & ATTR_UID) && 50 if ((ia_valid & ATTR_UID) &&
51 (!uid_eq(current_fsuid(), inode->i_uid) || 51 (!uid_eq(current_fsuid(), inode->i_uid) ||
52 !uid_eq(attr->ia_uid, inode->i_uid)) && !capable(CAP_CHOWN)) 52 !uid_eq(attr->ia_uid, inode->i_uid)) &&
53 !inode_capable(inode, CAP_CHOWN))
53 return -EPERM; 54 return -EPERM;
54 55
55 /* Make sure caller can chgrp. */ 56 /* Make sure caller can chgrp. */
56 if ((ia_valid & ATTR_GID) && 57 if ((ia_valid & ATTR_GID) &&
57 (!uid_eq(current_fsuid(), inode->i_uid) || 58 (!uid_eq(current_fsuid(), inode->i_uid) ||
58 (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) && 59 (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) &&
59 !capable(CAP_CHOWN)) 60 !inode_capable(inode, CAP_CHOWN))
60 return -EPERM; 61 return -EPERM;
61 62
62 /* Make sure a caller can chmod. */ 63 /* Make sure a caller can chmod. */
@@ -65,7 +66,8 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
65 return -EPERM; 66 return -EPERM;
66 /* Also check the setgid bit! */ 67 /* Also check the setgid bit! */
67 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : 68 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
68 inode->i_gid) && !capable(CAP_FSETID)) 69 inode->i_gid) &&
70 !inode_capable(inode, CAP_FSETID))
69 attr->ia_mode &= ~S_ISGID; 71 attr->ia_mode &= ~S_ISGID;
70 } 72 }
71 73
@@ -157,7 +159,8 @@ void setattr_copy(struct inode *inode, const struct iattr *attr)
157 if (ia_valid & ATTR_MODE) { 159 if (ia_valid & ATTR_MODE) {
158 umode_t mode = attr->ia_mode; 160 umode_t mode = attr->ia_mode;
159 161
160 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) 162 if (!in_group_p(inode->i_gid) &&
163 !inode_capable(inode, CAP_FSETID))
161 mode &= ~S_ISGID; 164 mode &= ~S_ISGID;
162 inode->i_mode = mode; 165 inode->i_mode = mode;
163 } 166 }
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 908e18455413..b785e7707959 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -74,8 +74,8 @@ struct autofs_info {
74 unsigned long last_used; 74 unsigned long last_used;
75 atomic_t count; 75 atomic_t count;
76 76
77 uid_t uid; 77 kuid_t uid;
78 gid_t gid; 78 kgid_t gid;
79}; 79};
80 80
81#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ 81#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
@@ -89,8 +89,8 @@ struct autofs_wait_queue {
89 struct qstr name; 89 struct qstr name;
90 u32 dev; 90 u32 dev;
91 u64 ino; 91 u64 ino;
92 uid_t uid; 92 kuid_t uid;
93 gid_t gid; 93 kgid_t gid;
94 pid_t pid; 94 pid_t pid;
95 pid_t tgid; 95 pid_t tgid;
96 /* This is for status reporting upon return */ 96 /* This is for status reporting upon return */
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index a16214109d31..9f68a37bb2b2 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -437,8 +437,8 @@ static int autofs_dev_ioctl_requester(struct file *fp,
437 err = 0; 437 err = 0;
438 autofs4_expire_wait(path.dentry); 438 autofs4_expire_wait(path.dentry);
439 spin_lock(&sbi->fs_lock); 439 spin_lock(&sbi->fs_lock);
440 param->requester.uid = ino->uid; 440 param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid);
441 param->requester.gid = ino->gid; 441 param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid);
442 spin_unlock(&sbi->fs_lock); 442 spin_unlock(&sbi->fs_lock);
443 } 443 }
444 path_put(&path); 444 path_put(&path);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 8a4fed8ead30..b104726e2d0a 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -36,8 +36,8 @@ struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
36 36
37void autofs4_clean_ino(struct autofs_info *ino) 37void autofs4_clean_ino(struct autofs_info *ino)
38{ 38{
39 ino->uid = 0; 39 ino->uid = GLOBAL_ROOT_UID;
40 ino->gid = 0; 40 ino->gid = GLOBAL_ROOT_GID;
41 ino->last_used = jiffies; 41 ino->last_used = jiffies;
42} 42}
43 43
@@ -79,10 +79,12 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
79 return 0; 79 return 0;
80 80
81 seq_printf(m, ",fd=%d", sbi->pipefd); 81 seq_printf(m, ",fd=%d", sbi->pipefd);
82 if (root_inode->i_uid != 0) 82 if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID))
83 seq_printf(m, ",uid=%u", root_inode->i_uid); 83 seq_printf(m, ",uid=%u",
84 if (root_inode->i_gid != 0) 84 from_kuid_munged(&init_user_ns, root_inode->i_uid));
85 seq_printf(m, ",gid=%u", root_inode->i_gid); 85 if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID))
86 seq_printf(m, ",gid=%u",
87 from_kgid_munged(&init_user_ns, root_inode->i_gid));
86 seq_printf(m, ",pgrp=%d", sbi->oz_pgrp); 88 seq_printf(m, ",pgrp=%d", sbi->oz_pgrp);
87 seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); 89 seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ);
88 seq_printf(m, ",minproto=%d", sbi->min_proto); 90 seq_printf(m, ",minproto=%d", sbi->min_proto);
@@ -126,7 +128,7 @@ static const match_table_t tokens = {
126 {Opt_err, NULL} 128 {Opt_err, NULL}
127}; 129};
128 130
129static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, 131static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
130 pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto) 132 pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto)
131{ 133{
132 char *p; 134 char *p;
@@ -159,12 +161,16 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
159 case Opt_uid: 161 case Opt_uid:
160 if (match_int(args, &option)) 162 if (match_int(args, &option))
161 return 1; 163 return 1;
162 *uid = option; 164 *uid = make_kuid(current_user_ns(), option);
165 if (!uid_valid(*uid))
166 return 1;
163 break; 167 break;
164 case Opt_gid: 168 case Opt_gid:
165 if (match_int(args, &option)) 169 if (match_int(args, &option))
166 return 1; 170 return 1;
167 *gid = option; 171 *gid = make_kgid(current_user_ns(), option);
172 if (!gid_valid(*gid))
173 return 1;
168 break; 174 break;
169 case Opt_pgrp: 175 case Opt_pgrp:
170 if (match_int(args, &option)) 176 if (match_int(args, &option))
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index dce436e595c1..03bc1d347d8e 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -154,6 +154,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
154 case autofs_ptype_expire_direct: 154 case autofs_ptype_expire_direct:
155 { 155 {
156 struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; 156 struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
157 struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns;
157 158
158 pktsz = sizeof(*packet); 159 pktsz = sizeof(*packet);
159 160
@@ -163,8 +164,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
163 packet->name[wq->name.len] = '\0'; 164 packet->name[wq->name.len] = '\0';
164 packet->dev = wq->dev; 165 packet->dev = wq->dev;
165 packet->ino = wq->ino; 166 packet->ino = wq->ino;
166 packet->uid = wq->uid; 167 packet->uid = from_kuid_munged(user_ns, wq->uid);
167 packet->gid = wq->gid; 168 packet->gid = from_kgid_munged(user_ns, wq->gid);
168 packet->pid = wq->pid; 169 packet->pid = wq->pid;
169 packet->tgid = wq->tgid; 170 packet->tgid = wq->tgid;
170 break; 171 break;
diff --git a/fs/exec.c b/fs/exec.c
index 721a29929511..b71b08ce7120 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1266,14 +1266,13 @@ int prepare_binprm(struct linux_binprm *bprm)
1266 bprm->cred->egid = current_egid(); 1266 bprm->cred->egid = current_egid();
1267 1267
1268 if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) && 1268 if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
1269 !current->no_new_privs) { 1269 !current->no_new_privs &&
1270 kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) &&
1271 kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) {
1270 /* Set-uid? */ 1272 /* Set-uid? */
1271 if (mode & S_ISUID) { 1273 if (mode & S_ISUID) {
1272 if (!kuid_has_mapping(bprm->cred->user_ns, inode->i_uid))
1273 return -EPERM;
1274 bprm->per_clear |= PER_CLEAR_ON_SETID; 1274 bprm->per_clear |= PER_CLEAR_ON_SETID;
1275 bprm->cred->euid = inode->i_uid; 1275 bprm->cred->euid = inode->i_uid;
1276
1277 } 1276 }
1278 1277
1279 /* Set-gid? */ 1278 /* Set-gid? */
@@ -1283,8 +1282,6 @@ int prepare_binprm(struct linux_binprm *bprm)
1283 * executable. 1282 * executable.
1284 */ 1283 */
1285 if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { 1284 if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
1286 if (!kgid_has_mapping(bprm->cred->user_ns, inode->i_gid))
1287 return -EPERM;
1288 bprm->per_clear |= PER_CLEAR_ON_SETID; 1285 bprm->per_clear |= PER_CLEAR_ON_SETID;
1289 bprm->cred->egid = inode->i_gid; 1286 bprm->cred->egid = inode->i_gid;
1290 } 1287 }
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8c23fa7a91e6..c16335315e5d 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -92,8 +92,8 @@ static void __fuse_put_request(struct fuse_req *req)
92 92
93static void fuse_req_init_context(struct fuse_req *req) 93static void fuse_req_init_context(struct fuse_req *req)
94{ 94{
95 req->in.h.uid = current_fsuid(); 95 req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
96 req->in.h.gid = current_fsgid(); 96 req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
97 req->in.h.pid = current->pid; 97 req->in.h.pid = current->pid;
98} 98}
99 99
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 324bc0850534..b7c09f9eb40c 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -818,8 +818,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
818 stat->ino = attr->ino; 818 stat->ino = attr->ino;
819 stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); 819 stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
820 stat->nlink = attr->nlink; 820 stat->nlink = attr->nlink;
821 stat->uid = attr->uid; 821 stat->uid = make_kuid(&init_user_ns, attr->uid);
822 stat->gid = attr->gid; 822 stat->gid = make_kgid(&init_user_ns, attr->gid);
823 stat->rdev = inode->i_rdev; 823 stat->rdev = inode->i_rdev;
824 stat->atime.tv_sec = attr->atime; 824 stat->atime.tv_sec = attr->atime;
825 stat->atime.tv_nsec = attr->atimensec; 825 stat->atime.tv_nsec = attr->atimensec;
@@ -1007,12 +1007,12 @@ int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task)
1007 rcu_read_lock(); 1007 rcu_read_lock();
1008 ret = 0; 1008 ret = 0;
1009 cred = __task_cred(task); 1009 cred = __task_cred(task);
1010 if (cred->euid == fc->user_id && 1010 if (uid_eq(cred->euid, fc->user_id) &&
1011 cred->suid == fc->user_id && 1011 uid_eq(cred->suid, fc->user_id) &&
1012 cred->uid == fc->user_id && 1012 uid_eq(cred->uid, fc->user_id) &&
1013 cred->egid == fc->group_id && 1013 gid_eq(cred->egid, fc->group_id) &&
1014 cred->sgid == fc->group_id && 1014 gid_eq(cred->sgid, fc->group_id) &&
1015 cred->gid == fc->group_id) 1015 gid_eq(cred->gid, fc->group_id))
1016 ret = 1; 1016 ret = 1;
1017 rcu_read_unlock(); 1017 rcu_read_unlock();
1018 1018
@@ -1306,9 +1306,9 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
1306 if (ivalid & ATTR_MODE) 1306 if (ivalid & ATTR_MODE)
1307 arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; 1307 arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode;
1308 if (ivalid & ATTR_UID) 1308 if (ivalid & ATTR_UID)
1309 arg->valid |= FATTR_UID, arg->uid = iattr->ia_uid; 1309 arg->valid |= FATTR_UID, arg->uid = from_kuid(&init_user_ns, iattr->ia_uid);
1310 if (ivalid & ATTR_GID) 1310 if (ivalid & ATTR_GID)
1311 arg->valid |= FATTR_GID, arg->gid = iattr->ia_gid; 1311 arg->valid |= FATTR_GID, arg->gid = from_kgid(&init_user_ns, iattr->ia_gid);
1312 if (ivalid & ATTR_SIZE) 1312 if (ivalid & ATTR_SIZE)
1313 arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; 1313 arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size;
1314 if (ivalid & ATTR_ATIME) { 1314 if (ivalid & ATTR_ATIME) {
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index e24dd74e3068..e105a53fc72d 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -333,10 +333,10 @@ struct fuse_conn {
333 atomic_t count; 333 atomic_t count;
334 334
335 /** The user id for this mount */ 335 /** The user id for this mount */
336 uid_t user_id; 336 kuid_t user_id;
337 337
338 /** The group id for this mount */ 338 /** The group id for this mount */
339 gid_t group_id; 339 kgid_t group_id;
340 340
341 /** The fuse mount flags for this mount */ 341 /** The fuse mount flags for this mount */
342 unsigned flags; 342 unsigned flags;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f0eda124cffb..73ca6b72beaf 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -60,8 +60,8 @@ MODULE_PARM_DESC(max_user_congthresh,
60struct fuse_mount_data { 60struct fuse_mount_data {
61 int fd; 61 int fd;
62 unsigned rootmode; 62 unsigned rootmode;
63 unsigned user_id; 63 kuid_t user_id;
64 unsigned group_id; 64 kgid_t group_id;
65 unsigned fd_present:1; 65 unsigned fd_present:1;
66 unsigned rootmode_present:1; 66 unsigned rootmode_present:1;
67 unsigned user_id_present:1; 67 unsigned user_id_present:1;
@@ -164,8 +164,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
164 inode->i_ino = fuse_squash_ino(attr->ino); 164 inode->i_ino = fuse_squash_ino(attr->ino);
165 inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); 165 inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
166 set_nlink(inode, attr->nlink); 166 set_nlink(inode, attr->nlink);
167 inode->i_uid = attr->uid; 167 inode->i_uid = make_kuid(&init_user_ns, attr->uid);
168 inode->i_gid = attr->gid; 168 inode->i_gid = make_kgid(&init_user_ns, attr->gid);
169 inode->i_blocks = attr->blocks; 169 inode->i_blocks = attr->blocks;
170 inode->i_atime.tv_sec = attr->atime; 170 inode->i_atime.tv_sec = attr->atime;
171 inode->i_atime.tv_nsec = attr->atimensec; 171 inode->i_atime.tv_nsec = attr->atimensec;
@@ -492,14 +492,18 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
492 case OPT_USER_ID: 492 case OPT_USER_ID:
493 if (match_int(&args[0], &value)) 493 if (match_int(&args[0], &value))
494 return 0; 494 return 0;
495 d->user_id = value; 495 d->user_id = make_kuid(current_user_ns(), value);
496 if (!uid_valid(d->user_id))
497 return 0;
496 d->user_id_present = 1; 498 d->user_id_present = 1;
497 break; 499 break;
498 500
499 case OPT_GROUP_ID: 501 case OPT_GROUP_ID:
500 if (match_int(&args[0], &value)) 502 if (match_int(&args[0], &value))
501 return 0; 503 return 0;
502 d->group_id = value; 504 d->group_id = make_kgid(current_user_ns(), value);
505 if (!gid_valid(d->group_id))
506 return 0;
503 d->group_id_present = 1; 507 d->group_id_present = 1;
504 break; 508 break;
505 509
@@ -540,8 +544,8 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
540 struct super_block *sb = root->d_sb; 544 struct super_block *sb = root->d_sb;
541 struct fuse_conn *fc = get_fuse_conn_super(sb); 545 struct fuse_conn *fc = get_fuse_conn_super(sb);
542 546
543 seq_printf(m, ",user_id=%u", fc->user_id); 547 seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id));
544 seq_printf(m, ",group_id=%u", fc->group_id); 548 seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id));
545 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) 549 if (fc->flags & FUSE_DEFAULT_PERMISSIONS)
546 seq_puts(m, ",default_permissions"); 550 seq_puts(m, ",default_permissions");
547 if (fc->flags & FUSE_ALLOW_OTHER) 551 if (fc->flags & FUSE_ALLOW_OTHER)
@@ -989,7 +993,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
989 if (!file) 993 if (!file)
990 goto err; 994 goto err;
991 995
992 if (file->f_op != &fuse_dev_operations) 996 if ((file->f_op != &fuse_dev_operations) ||
997 (file->f_cred->user_ns != &init_user_ns))
993 goto err_fput; 998 goto err_fput;
994 999
995 fc = kmalloc(sizeof(*fc), GFP_KERNEL); 1000 fc = kmalloc(sizeof(*fc), GFP_KERNEL);
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 78f21f8dc2ec..43b315f2002b 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -710,7 +710,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
710 struct vfsmount *proc_mnt; 710 struct vfsmount *proc_mnt;
711 int err = -ENOENT; 711 int err = -ENOENT;
712 712
713 proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt); 713 proc_mnt = mntget(task_active_pid_ns(current)->proc_mnt);
714 if (IS_ERR(proc_mnt)) 714 if (IS_ERR(proc_mnt))
715 goto out; 715 goto out;
716 716
diff --git a/fs/mount.h b/fs/mount.h
index 4f291f9de641..cd5007980400 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -4,8 +4,11 @@
4 4
5struct mnt_namespace { 5struct mnt_namespace {
6 atomic_t count; 6 atomic_t count;
7 unsigned int proc_inum;
7 struct mount * root; 8 struct mount * root;
8 struct list_head list; 9 struct list_head list;
10 struct user_namespace *user_ns;
11 u64 seq; /* Sequence number to prevent loops */
9 wait_queue_head_t poll; 12 wait_queue_head_t poll;
10 int event; 13 int event;
11}; 14};
diff --git a/fs/namespace.c b/fs/namespace.c
index 24960626bb6b..c1bbe86f4920 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -12,6 +12,7 @@
12#include <linux/export.h> 12#include <linux/export.h>
13#include <linux/capability.h> 13#include <linux/capability.h>
14#include <linux/mnt_namespace.h> 14#include <linux/mnt_namespace.h>
15#include <linux/user_namespace.h>
15#include <linux/namei.h> 16#include <linux/namei.h>
16#include <linux/security.h> 17#include <linux/security.h>
17#include <linux/idr.h> 18#include <linux/idr.h>
@@ -20,6 +21,7 @@
20#include <linux/fs_struct.h> /* get_fs_root et.al. */ 21#include <linux/fs_struct.h> /* get_fs_root et.al. */
21#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ 22#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
22#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/proc_fs.h>
23#include "pnode.h" 25#include "pnode.h"
24#include "internal.h" 26#include "internal.h"
25 27
@@ -784,7 +786,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
784 if (!mnt) 786 if (!mnt)
785 return ERR_PTR(-ENOMEM); 787 return ERR_PTR(-ENOMEM);
786 788
787 if (flag & (CL_SLAVE | CL_PRIVATE)) 789 if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
788 mnt->mnt_group_id = 0; /* not a peer of original */ 790 mnt->mnt_group_id = 0; /* not a peer of original */
789 else 791 else
790 mnt->mnt_group_id = old->mnt_group_id; 792 mnt->mnt_group_id = old->mnt_group_id;
@@ -805,7 +807,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
805 list_add_tail(&mnt->mnt_instance, &sb->s_mounts); 807 list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
806 br_write_unlock(&vfsmount_lock); 808 br_write_unlock(&vfsmount_lock);
807 809
808 if (flag & CL_SLAVE) { 810 if ((flag & CL_SLAVE) ||
811 ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
809 list_add(&mnt->mnt_slave, &old->mnt_slave_list); 812 list_add(&mnt->mnt_slave, &old->mnt_slave_list);
810 mnt->mnt_master = old; 813 mnt->mnt_master = old;
811 CLEAR_MNT_SHARED(mnt); 814 CLEAR_MNT_SHARED(mnt);
@@ -1266,7 +1269,7 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
1266 goto dput_and_out; 1269 goto dput_and_out;
1267 1270
1268 retval = -EPERM; 1271 retval = -EPERM;
1269 if (!capable(CAP_SYS_ADMIN)) 1272 if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
1270 goto dput_and_out; 1273 goto dput_and_out;
1271 1274
1272 retval = do_umount(mnt, flags); 1275 retval = do_umount(mnt, flags);
@@ -1292,7 +1295,7 @@ SYSCALL_DEFINE1(oldumount, char __user *, name)
1292 1295
1293static int mount_is_safe(struct path *path) 1296static int mount_is_safe(struct path *path)
1294{ 1297{
1295 if (capable(CAP_SYS_ADMIN)) 1298 if (ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN))
1296 return 0; 1299 return 0;
1297 return -EPERM; 1300 return -EPERM;
1298#ifdef notyet 1301#ifdef notyet
@@ -1308,6 +1311,26 @@ static int mount_is_safe(struct path *path)
1308#endif 1311#endif
1309} 1312}
1310 1313
1314static bool mnt_ns_loop(struct path *path)
1315{
1316 /* Could bind mounting the mount namespace inode cause a
1317 * mount namespace loop?
1318 */
1319 struct inode *inode = path->dentry->d_inode;
1320 struct proc_inode *ei;
1321 struct mnt_namespace *mnt_ns;
1322
1323 if (!proc_ns_inode(inode))
1324 return false;
1325
1326 ei = PROC_I(inode);
1327 if (ei->ns_ops != &mntns_operations)
1328 return false;
1329
1330 mnt_ns = ei->ns;
1331 return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
1332}
1333
1311struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, 1334struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1312 int flag) 1335 int flag)
1313{ 1336{
@@ -1610,7 +1633,7 @@ static int do_change_type(struct path *path, int flag)
1610 int type; 1633 int type;
1611 int err = 0; 1634 int err = 0;
1612 1635
1613 if (!capable(CAP_SYS_ADMIN)) 1636 if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
1614 return -EPERM; 1637 return -EPERM;
1615 1638
1616 if (path->dentry != path->mnt->mnt_root) 1639 if (path->dentry != path->mnt->mnt_root)
@@ -1655,6 +1678,10 @@ static int do_loopback(struct path *path, const char *old_name,
1655 if (err) 1678 if (err)
1656 return err; 1679 return err;
1657 1680
1681 err = -EINVAL;
1682 if (mnt_ns_loop(&old_path))
1683 goto out;
1684
1658 err = lock_mount(path); 1685 err = lock_mount(path);
1659 if (err) 1686 if (err)
1660 goto out; 1687 goto out;
@@ -1770,7 +1797,7 @@ static int do_move_mount(struct path *path, const char *old_name)
1770 struct mount *p; 1797 struct mount *p;
1771 struct mount *old; 1798 struct mount *old;
1772 int err = 0; 1799 int err = 0;
1773 if (!capable(CAP_SYS_ADMIN)) 1800 if (!ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN))
1774 return -EPERM; 1801 return -EPERM;
1775 if (!old_name || !*old_name) 1802 if (!old_name || !*old_name)
1776 return -EINVAL; 1803 return -EINVAL;
@@ -1857,21 +1884,6 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
1857 return ERR_PTR(err); 1884 return ERR_PTR(err);
1858} 1885}
1859 1886
1860static struct vfsmount *
1861do_kern_mount(const char *fstype, int flags, const char *name, void *data)
1862{
1863 struct file_system_type *type = get_fs_type(fstype);
1864 struct vfsmount *mnt;
1865 if (!type)
1866 return ERR_PTR(-ENODEV);
1867 mnt = vfs_kern_mount(type, flags, name, data);
1868 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
1869 !mnt->mnt_sb->s_subtype)
1870 mnt = fs_set_subtype(mnt, fstype);
1871 put_filesystem(type);
1872 return mnt;
1873}
1874
1875/* 1887/*
1876 * add a mount into a namespace's mount tree 1888 * add a mount into a namespace's mount tree
1877 */ 1889 */
@@ -1917,20 +1929,46 @@ unlock:
1917 * create a new mount for userspace and request it to be added into the 1929 * create a new mount for userspace and request it to be added into the
1918 * namespace's tree 1930 * namespace's tree
1919 */ 1931 */
1920static int do_new_mount(struct path *path, const char *type, int flags, 1932static int do_new_mount(struct path *path, const char *fstype, int flags,
1921 int mnt_flags, const char *name, void *data) 1933 int mnt_flags, const char *name, void *data)
1922{ 1934{
1935 struct file_system_type *type;
1936 struct user_namespace *user_ns;
1923 struct vfsmount *mnt; 1937 struct vfsmount *mnt;
1924 int err; 1938 int err;
1925 1939
1926 if (!type) 1940 if (!fstype)
1927 return -EINVAL; 1941 return -EINVAL;
1928 1942
1929 /* we need capabilities... */ 1943 /* we need capabilities... */
1930 if (!capable(CAP_SYS_ADMIN)) 1944 user_ns = real_mount(path->mnt)->mnt_ns->user_ns;
1945 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
1931 return -EPERM; 1946 return -EPERM;
1932 1947
1933 mnt = do_kern_mount(type, flags, name, data); 1948 type = get_fs_type(fstype);
1949 if (!type)
1950 return -ENODEV;
1951
1952 if (user_ns != &init_user_ns) {
1953 if (!(type->fs_flags & FS_USERNS_MOUNT)) {
1954 put_filesystem(type);
1955 return -EPERM;
1956 }
1957 /* Only in special cases allow devices from mounts
1958 * created outside the initial user namespace.
1959 */
1960 if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
1961 flags |= MS_NODEV;
1962 mnt_flags |= MNT_NODEV;
1963 }
1964 }
1965
1966 mnt = vfs_kern_mount(type, flags, name, data);
1967 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
1968 !mnt->mnt_sb->s_subtype)
1969 mnt = fs_set_subtype(mnt, fstype);
1970
1971 put_filesystem(type);
1934 if (IS_ERR(mnt)) 1972 if (IS_ERR(mnt))
1935 return PTR_ERR(mnt); 1973 return PTR_ERR(mnt);
1936 1974
@@ -2261,18 +2299,42 @@ dput_out:
2261 return retval; 2299 return retval;
2262} 2300}
2263 2301
2264static struct mnt_namespace *alloc_mnt_ns(void) 2302static void free_mnt_ns(struct mnt_namespace *ns)
2303{
2304 proc_free_inum(ns->proc_inum);
2305 put_user_ns(ns->user_ns);
2306 kfree(ns);
2307}
2308
2309/*
2310 * Assign a sequence number so we can detect when we attempt to bind
2311 * mount a reference to an older mount namespace into the current
2312 * mount namespace, preventing reference counting loops. A 64bit
2313 * number incrementing at 10Ghz will take 12,427 years to wrap which
2314 * is effectively never, so we can ignore the possibility.
2315 */
2316static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
2317
2318static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
2265{ 2319{
2266 struct mnt_namespace *new_ns; 2320 struct mnt_namespace *new_ns;
2321 int ret;
2267 2322
2268 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); 2323 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
2269 if (!new_ns) 2324 if (!new_ns)
2270 return ERR_PTR(-ENOMEM); 2325 return ERR_PTR(-ENOMEM);
2326 ret = proc_alloc_inum(&new_ns->proc_inum);
2327 if (ret) {
2328 kfree(new_ns);
2329 return ERR_PTR(ret);
2330 }
2331 new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
2271 atomic_set(&new_ns->count, 1); 2332 atomic_set(&new_ns->count, 1);
2272 new_ns->root = NULL; 2333 new_ns->root = NULL;
2273 INIT_LIST_HEAD(&new_ns->list); 2334 INIT_LIST_HEAD(&new_ns->list);
2274 init_waitqueue_head(&new_ns->poll); 2335 init_waitqueue_head(&new_ns->poll);
2275 new_ns->event = 0; 2336 new_ns->event = 0;
2337 new_ns->user_ns = get_user_ns(user_ns);
2276 return new_ns; 2338 return new_ns;
2277} 2339}
2278 2340
@@ -2281,24 +2343,28 @@ static struct mnt_namespace *alloc_mnt_ns(void)
2281 * copied from the namespace of the passed in task structure. 2343 * copied from the namespace of the passed in task structure.
2282 */ 2344 */
2283static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, 2345static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2284 struct fs_struct *fs) 2346 struct user_namespace *user_ns, struct fs_struct *fs)
2285{ 2347{
2286 struct mnt_namespace *new_ns; 2348 struct mnt_namespace *new_ns;
2287 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; 2349 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
2288 struct mount *p, *q; 2350 struct mount *p, *q;
2289 struct mount *old = mnt_ns->root; 2351 struct mount *old = mnt_ns->root;
2290 struct mount *new; 2352 struct mount *new;
2353 int copy_flags;
2291 2354
2292 new_ns = alloc_mnt_ns(); 2355 new_ns = alloc_mnt_ns(user_ns);
2293 if (IS_ERR(new_ns)) 2356 if (IS_ERR(new_ns))
2294 return new_ns; 2357 return new_ns;
2295 2358
2296 down_write(&namespace_sem); 2359 down_write(&namespace_sem);
2297 /* First pass: copy the tree topology */ 2360 /* First pass: copy the tree topology */
2298 new = copy_tree(old, old->mnt.mnt_root, CL_COPY_ALL | CL_EXPIRE); 2361 copy_flags = CL_COPY_ALL | CL_EXPIRE;
2362 if (user_ns != mnt_ns->user_ns)
2363 copy_flags |= CL_SHARED_TO_SLAVE;
2364 new = copy_tree(old, old->mnt.mnt_root, copy_flags);
2299 if (IS_ERR(new)) { 2365 if (IS_ERR(new)) {
2300 up_write(&namespace_sem); 2366 up_write(&namespace_sem);
2301 kfree(new_ns); 2367 free_mnt_ns(new_ns);
2302 return ERR_CAST(new); 2368 return ERR_CAST(new);
2303 } 2369 }
2304 new_ns->root = new; 2370 new_ns->root = new;
@@ -2339,7 +2405,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2339} 2405}
2340 2406
2341struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, 2407struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
2342 struct fs_struct *new_fs) 2408 struct user_namespace *user_ns, struct fs_struct *new_fs)
2343{ 2409{
2344 struct mnt_namespace *new_ns; 2410 struct mnt_namespace *new_ns;
2345 2411
@@ -2349,7 +2415,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
2349 if (!(flags & CLONE_NEWNS)) 2415 if (!(flags & CLONE_NEWNS))
2350 return ns; 2416 return ns;
2351 2417
2352 new_ns = dup_mnt_ns(ns, new_fs); 2418 new_ns = dup_mnt_ns(ns, user_ns, new_fs);
2353 2419
2354 put_mnt_ns(ns); 2420 put_mnt_ns(ns);
2355 return new_ns; 2421 return new_ns;
@@ -2361,7 +2427,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
2361 */ 2427 */
2362static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) 2428static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
2363{ 2429{
2364 struct mnt_namespace *new_ns = alloc_mnt_ns(); 2430 struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
2365 if (!IS_ERR(new_ns)) { 2431 if (!IS_ERR(new_ns)) {
2366 struct mount *mnt = real_mount(m); 2432 struct mount *mnt = real_mount(m);
2367 mnt->mnt_ns = new_ns; 2433 mnt->mnt_ns = new_ns;
@@ -2501,7 +2567,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2501 struct mount *new_mnt, *root_mnt; 2567 struct mount *new_mnt, *root_mnt;
2502 int error; 2568 int error;
2503 2569
2504 if (!capable(CAP_SYS_ADMIN)) 2570 if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN))
2505 return -EPERM; 2571 return -EPERM;
2506 2572
2507 error = user_path_dir(new_root, &new); 2573 error = user_path_dir(new_root, &new);
@@ -2583,8 +2649,13 @@ static void __init init_mount_tree(void)
2583 struct vfsmount *mnt; 2649 struct vfsmount *mnt;
2584 struct mnt_namespace *ns; 2650 struct mnt_namespace *ns;
2585 struct path root; 2651 struct path root;
2652 struct file_system_type *type;
2586 2653
2587 mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); 2654 type = get_fs_type("rootfs");
2655 if (!type)
2656 panic("Can't find rootfs type");
2657 mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
2658 put_filesystem(type);
2588 if (IS_ERR(mnt)) 2659 if (IS_ERR(mnt))
2589 panic("Can't create rootfs"); 2660 panic("Can't create rootfs");
2590 2661
@@ -2647,7 +2718,7 @@ void put_mnt_ns(struct mnt_namespace *ns)
2647 br_write_unlock(&vfsmount_lock); 2718 br_write_unlock(&vfsmount_lock);
2648 up_write(&namespace_sem); 2719 up_write(&namespace_sem);
2649 release_mounts(&umount_list); 2720 release_mounts(&umount_list);
2650 kfree(ns); 2721 free_mnt_ns(ns);
2651} 2722}
2652 2723
2653struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) 2724struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
@@ -2681,3 +2752,71 @@ bool our_mnt(struct vfsmount *mnt)
2681{ 2752{
2682 return check_mnt(real_mount(mnt)); 2753 return check_mnt(real_mount(mnt));
2683} 2754}
2755
2756static void *mntns_get(struct task_struct *task)
2757{
2758 struct mnt_namespace *ns = NULL;
2759 struct nsproxy *nsproxy;
2760
2761 rcu_read_lock();
2762 nsproxy = task_nsproxy(task);
2763 if (nsproxy) {
2764 ns = nsproxy->mnt_ns;
2765 get_mnt_ns(ns);
2766 }
2767 rcu_read_unlock();
2768
2769 return ns;
2770}
2771
2772static void mntns_put(void *ns)
2773{
2774 put_mnt_ns(ns);
2775}
2776
2777static int mntns_install(struct nsproxy *nsproxy, void *ns)
2778{
2779 struct fs_struct *fs = current->fs;
2780 struct mnt_namespace *mnt_ns = ns;
2781 struct path root;
2782
2783 if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
2784 !nsown_capable(CAP_SYS_CHROOT))
2785 return -EPERM;
2786
2787 if (fs->users != 1)
2788 return -EINVAL;
2789
2790 get_mnt_ns(mnt_ns);
2791 put_mnt_ns(nsproxy->mnt_ns);
2792 nsproxy->mnt_ns = mnt_ns;
2793
2794 /* Find the root */
2795 root.mnt = &mnt_ns->root->mnt;
2796 root.dentry = mnt_ns->root->mnt.mnt_root;
2797 path_get(&root);
2798 while(d_mountpoint(root.dentry) && follow_down_one(&root))
2799 ;
2800
2801 /* Update the pwd and root */
2802 set_fs_pwd(fs, &root);
2803 set_fs_root(fs, &root);
2804
2805 path_put(&root);
2806 return 0;
2807}
2808
2809static unsigned int mntns_inum(void *ns)
2810{
2811 struct mnt_namespace *mnt_ns = ns;
2812 return mnt_ns->proc_inum;
2813}
2814
2815const struct proc_ns_operations mntns_operations = {
2816 .name = "mnt",
2817 .type = CLONE_NEWNS,
2818 .get = mntns_get,
2819 .put = mntns_put,
2820 .install = mntns_install,
2821 .inum = mntns_inum,
2822};
diff --git a/fs/open.c b/fs/open.c
index 59071f55bf7f..182d8667b7bd 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -435,7 +435,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
435 goto dput_and_out; 435 goto dput_and_out;
436 436
437 error = -EPERM; 437 error = -EPERM;
438 if (!capable(CAP_SYS_CHROOT)) 438 if (!nsown_capable(CAP_SYS_CHROOT))
439 goto dput_and_out; 439 goto dput_and_out;
440 error = security_path_chroot(&path); 440 error = security_path_chroot(&path);
441 if (error) 441 if (error)
diff --git a/fs/pnode.h b/fs/pnode.h
index 65c60979d541..19b853a3445c 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -22,6 +22,7 @@
22#define CL_COPY_ALL 0x04 22#define CL_COPY_ALL 0x04
23#define CL_MAKE_SHARED 0x08 23#define CL_MAKE_SHARED 0x08
24#define CL_PRIVATE 0x10 24#define CL_PRIVATE 0x10
25#define CL_SHARED_TO_SLAVE 0x20
25 26
26static inline void set_mnt_shared(struct mount *mnt) 27static inline void set_mnt_shared(struct mount *mnt)
27{ 28{
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 99349efbbc2b..981b05601931 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -21,6 +21,7 @@ proc-y += uptime.o
21proc-y += version.o 21proc-y += version.o
22proc-y += softirqs.o 22proc-y += softirqs.o
23proc-y += namespaces.o 23proc-y += namespaces.o
24proc-y += self.o
24proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o 25proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
25proc-$(CONFIG_NET) += proc_net.o 26proc-$(CONFIG_NET) += proc_net.o
26proc-$(CONFIG_PROC_KCORE) += kcore.o 27proc-$(CONFIG_PROC_KCORE) += kcore.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index d3696708fc1a..d66248a1919b 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -162,7 +162,7 @@ static inline const char *get_task_state(struct task_struct *tsk)
162static inline void task_state(struct seq_file *m, struct pid_namespace *ns, 162static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
163 struct pid *pid, struct task_struct *p) 163 struct pid *pid, struct task_struct *p)
164{ 164{
165 struct user_namespace *user_ns = current_user_ns(); 165 struct user_namespace *user_ns = seq_user_ns(m);
166 struct group_info *group_info; 166 struct group_info *group_info;
167 int g; 167 int g;
168 struct fdtable *fdt = NULL; 168 struct fdtable *fdt = NULL;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index aa63d25157b8..5a5a0be40e40 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2345,146 +2345,6 @@ static const struct file_operations proc_coredump_filter_operations = {
2345}; 2345};
2346#endif 2346#endif
2347 2347
2348/*
2349 * /proc/self:
2350 */
2351static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
2352 int buflen)
2353{
2354 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
2355 pid_t tgid = task_tgid_nr_ns(current, ns);
2356 char tmp[PROC_NUMBUF];
2357 if (!tgid)
2358 return -ENOENT;
2359 sprintf(tmp, "%d", tgid);
2360 return vfs_readlink(dentry,buffer,buflen,tmp);
2361}
2362
2363static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
2364{
2365 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
2366 pid_t tgid = task_tgid_nr_ns(current, ns);
2367 char *name = ERR_PTR(-ENOENT);
2368 if (tgid) {
2369 /* 11 for max length of signed int in decimal + NULL term */
2370 name = kmalloc(12, GFP_KERNEL);
2371 if (!name)
2372 name = ERR_PTR(-ENOMEM);
2373 else
2374 sprintf(name, "%d", tgid);
2375 }
2376 nd_set_link(nd, name);
2377 return NULL;
2378}
2379
2380static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
2381 void *cookie)
2382{
2383 char *s = nd_get_link(nd);
2384 if (!IS_ERR(s))
2385 kfree(s);
2386}
2387
2388static const struct inode_operations proc_self_inode_operations = {
2389 .readlink = proc_self_readlink,
2390 .follow_link = proc_self_follow_link,
2391 .put_link = proc_self_put_link,
2392};
2393
2394/*
2395 * proc base
2396 *
2397 * These are the directory entries in the root directory of /proc
2398 * that properly belong to the /proc filesystem, as they describe
2399 * describe something that is process related.
2400 */
2401static const struct pid_entry proc_base_stuff[] = {
2402 NOD("self", S_IFLNK|S_IRWXUGO,
2403 &proc_self_inode_operations, NULL, {}),
2404};
2405
2406static struct dentry *proc_base_instantiate(struct inode *dir,
2407 struct dentry *dentry, struct task_struct *task, const void *ptr)
2408{
2409 const struct pid_entry *p = ptr;
2410 struct inode *inode;
2411 struct proc_inode *ei;
2412 struct dentry *error;
2413
2414 /* Allocate the inode */
2415 error = ERR_PTR(-ENOMEM);
2416 inode = new_inode(dir->i_sb);
2417 if (!inode)
2418 goto out;
2419
2420 /* Initialize the inode */
2421 ei = PROC_I(inode);
2422 inode->i_ino = get_next_ino();
2423 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
2424
2425 /*
2426 * grab the reference to the task.
2427 */
2428 ei->pid = get_task_pid(task, PIDTYPE_PID);
2429 if (!ei->pid)
2430 goto out_iput;
2431
2432 inode->i_mode = p->mode;
2433 if (S_ISDIR(inode->i_mode))
2434 set_nlink(inode, 2);
2435 if (S_ISLNK(inode->i_mode))
2436 inode->i_size = 64;
2437 if (p->iop)
2438 inode->i_op = p->iop;
2439 if (p->fop)
2440 inode->i_fop = p->fop;
2441 ei->op = p->op;
2442 d_add(dentry, inode);
2443 error = NULL;
2444out:
2445 return error;
2446out_iput:
2447 iput(inode);
2448 goto out;
2449}
2450
2451static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry)
2452{
2453 struct dentry *error;
2454 struct task_struct *task = get_proc_task(dir);
2455 const struct pid_entry *p, *last;
2456
2457 error = ERR_PTR(-ENOENT);
2458
2459 if (!task)
2460 goto out_no_task;
2461
2462 /* Lookup the directory entry */
2463 last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1];
2464 for (p = proc_base_stuff; p <= last; p++) {
2465 if (p->len != dentry->d_name.len)
2466 continue;
2467 if (!memcmp(dentry->d_name.name, p->name, p->len))
2468 break;
2469 }
2470 if (p > last)
2471 goto out;
2472
2473 error = proc_base_instantiate(dir, dentry, task, p);
2474
2475out:
2476 put_task_struct(task);
2477out_no_task:
2478 return error;
2479}
2480
2481static int proc_base_fill_cache(struct file *filp, void *dirent,
2482 filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
2483{
2484 return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
2485 proc_base_instantiate, task, p);
2486}
2487
2488#ifdef CONFIG_TASK_IO_ACCOUNTING 2348#ifdef CONFIG_TASK_IO_ACCOUNTING
2489static int do_io_accounting(struct task_struct *task, char *buffer, int whole) 2349static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
2490{ 2350{
@@ -2839,10 +2699,6 @@ void proc_flush_task(struct task_struct *task)
2839 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, 2699 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
2840 tgid->numbers[i].nr); 2700 tgid->numbers[i].nr);
2841 } 2701 }
2842
2843 upid = &pid->numbers[pid->level];
2844 if (upid->nr == 1)
2845 pid_ns_release_proc(upid->ns);
2846} 2702}
2847 2703
2848static struct dentry *proc_pid_instantiate(struct inode *dir, 2704static struct dentry *proc_pid_instantiate(struct inode *dir,
@@ -2876,15 +2732,11 @@ out:
2876 2732
2877struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 2733struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
2878{ 2734{
2879 struct dentry *result; 2735 struct dentry *result = NULL;
2880 struct task_struct *task; 2736 struct task_struct *task;
2881 unsigned tgid; 2737 unsigned tgid;
2882 struct pid_namespace *ns; 2738 struct pid_namespace *ns;
2883 2739
2884 result = proc_base_lookup(dir, dentry);
2885 if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT)
2886 goto out;
2887
2888 tgid = name_to_int(dentry); 2740 tgid = name_to_int(dentry);
2889 if (tgid == ~0U) 2741 if (tgid == ~0U)
2890 goto out; 2742 goto out;
@@ -2947,7 +2799,7 @@ retry:
2947 return iter; 2799 return iter;
2948} 2800}
2949 2801
2950#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff)) 2802#define TGID_OFFSET (FIRST_PROCESS_ENTRY)
2951 2803
2952static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 2804static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
2953 struct tgid_iter iter) 2805 struct tgid_iter iter)
@@ -2967,25 +2819,12 @@ static int fake_filldir(void *buf, const char *name, int namelen,
2967/* for the /proc/ directory itself, after non-process stuff has been done */ 2819/* for the /proc/ directory itself, after non-process stuff has been done */
2968int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) 2820int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
2969{ 2821{
2970 unsigned int nr;
2971 struct task_struct *reaper;
2972 struct tgid_iter iter; 2822 struct tgid_iter iter;
2973 struct pid_namespace *ns; 2823 struct pid_namespace *ns;
2974 filldir_t __filldir; 2824 filldir_t __filldir;
2975 2825
2976 if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) 2826 if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
2977 goto out_no_task; 2827 goto out;
2978 nr = filp->f_pos - FIRST_PROCESS_ENTRY;
2979
2980 reaper = get_proc_task(filp->f_path.dentry->d_inode);
2981 if (!reaper)
2982 goto out_no_task;
2983
2984 for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) {
2985 const struct pid_entry *p = &proc_base_stuff[nr];
2986 if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0)
2987 goto out;
2988 }
2989 2828
2990 ns = filp->f_dentry->d_sb->s_fs_info; 2829 ns = filp->f_dentry->d_sb->s_fs_info;
2991 iter.task = NULL; 2830 iter.task = NULL;
@@ -3006,8 +2845,6 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
3006 } 2845 }
3007 filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; 2846 filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
3008out: 2847out:
3009 put_task_struct(reaper);
3010out_no_task:
3011 return 0; 2848 return 0;
3012} 2849}
3013 2850
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 0d80cef4cfb9..7b3ae3cc0ef9 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -350,14 +350,14 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
350 * Return an inode number between PROC_DYNAMIC_FIRST and 350 * Return an inode number between PROC_DYNAMIC_FIRST and
351 * 0xffffffff, or zero on failure. 351 * 0xffffffff, or zero on failure.
352 */ 352 */
353static unsigned int get_inode_number(void) 353int proc_alloc_inum(unsigned int *inum)
354{ 354{
355 unsigned int i; 355 unsigned int i;
356 int error; 356 int error;
357 357
358retry: 358retry:
359 if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0) 359 if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL))
360 return 0; 360 return -ENOMEM;
361 361
362 spin_lock(&proc_inum_lock); 362 spin_lock(&proc_inum_lock);
363 error = ida_get_new(&proc_inum_ida, &i); 363 error = ida_get_new(&proc_inum_ida, &i);
@@ -365,18 +365,19 @@ retry:
365 if (error == -EAGAIN) 365 if (error == -EAGAIN)
366 goto retry; 366 goto retry;
367 else if (error) 367 else if (error)
368 return 0; 368 return error;
369 369
370 if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { 370 if (i > UINT_MAX - PROC_DYNAMIC_FIRST) {
371 spin_lock(&proc_inum_lock); 371 spin_lock(&proc_inum_lock);
372 ida_remove(&proc_inum_ida, i); 372 ida_remove(&proc_inum_ida, i);
373 spin_unlock(&proc_inum_lock); 373 spin_unlock(&proc_inum_lock);
374 return 0; 374 return -ENOSPC;
375 } 375 }
376 return PROC_DYNAMIC_FIRST + i; 376 *inum = PROC_DYNAMIC_FIRST + i;
377 return 0;
377} 378}
378 379
379static void release_inode_number(unsigned int inum) 380void proc_free_inum(unsigned int inum)
380{ 381{
381 spin_lock(&proc_inum_lock); 382 spin_lock(&proc_inum_lock);
382 ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); 383 ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
@@ -554,13 +555,12 @@ static const struct inode_operations proc_dir_inode_operations = {
554 555
555static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) 556static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
556{ 557{
557 unsigned int i;
558 struct proc_dir_entry *tmp; 558 struct proc_dir_entry *tmp;
559 int ret;
559 560
560 i = get_inode_number(); 561 ret = proc_alloc_inum(&dp->low_ino);
561 if (i == 0) 562 if (ret)
562 return -EAGAIN; 563 return ret;
563 dp->low_ino = i;
564 564
565 if (S_ISDIR(dp->mode)) { 565 if (S_ISDIR(dp->mode)) {
566 if (dp->proc_iops == NULL) { 566 if (dp->proc_iops == NULL) {
@@ -764,7 +764,7 @@ EXPORT_SYMBOL(proc_create_data);
764 764
765static void free_proc_entry(struct proc_dir_entry *de) 765static void free_proc_entry(struct proc_dir_entry *de)
766{ 766{
767 release_inode_number(de->low_ino); 767 proc_free_inum(de->low_ino);
768 768
769 if (S_ISLNK(de->mode)) 769 if (S_ISLNK(de->mode))
770 kfree(de->data); 770 kfree(de->data);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 3b22bbdee9ec..439ae6886507 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -31,6 +31,7 @@ static void proc_evict_inode(struct inode *inode)
31 struct proc_dir_entry *de; 31 struct proc_dir_entry *de;
32 struct ctl_table_header *head; 32 struct ctl_table_header *head;
33 const struct proc_ns_operations *ns_ops; 33 const struct proc_ns_operations *ns_ops;
34 void *ns;
34 35
35 truncate_inode_pages(&inode->i_data, 0); 36 truncate_inode_pages(&inode->i_data, 0);
36 clear_inode(inode); 37 clear_inode(inode);
@@ -49,8 +50,9 @@ static void proc_evict_inode(struct inode *inode)
49 } 50 }
50 /* Release any associated namespace */ 51 /* Release any associated namespace */
51 ns_ops = PROC_I(inode)->ns_ops; 52 ns_ops = PROC_I(inode)->ns_ops;
52 if (ns_ops && ns_ops->put) 53 ns = PROC_I(inode)->ns;
53 ns_ops->put(PROC_I(inode)->ns); 54 if (ns_ops && ns)
55 ns_ops->put(ns);
54} 56}
55 57
56static struct kmem_cache * proc_inode_cachep; 58static struct kmem_cache * proc_inode_cachep;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 43973b084abf..252544c05207 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -15,6 +15,7 @@ struct ctl_table_header;
15struct mempolicy; 15struct mempolicy;
16 16
17extern struct proc_dir_entry proc_root; 17extern struct proc_dir_entry proc_root;
18extern void proc_self_init(void);
18#ifdef CONFIG_PROC_SYSCTL 19#ifdef CONFIG_PROC_SYSCTL
19extern int proc_sys_init(void); 20extern int proc_sys_init(void);
20extern void sysctl_head_put(struct ctl_table_header *head); 21extern void sysctl_head_put(struct ctl_table_header *head);
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index b178ed733c36..b7a47196c8c3 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -11,6 +11,7 @@
11#include <net/net_namespace.h> 11#include <net/net_namespace.h>
12#include <linux/ipc_namespace.h> 12#include <linux/ipc_namespace.h>
13#include <linux/pid_namespace.h> 13#include <linux/pid_namespace.h>
14#include <linux/user_namespace.h>
14#include "internal.h" 15#include "internal.h"
15 16
16 17
@@ -24,12 +25,168 @@ static const struct proc_ns_operations *ns_entries[] = {
24#ifdef CONFIG_IPC_NS 25#ifdef CONFIG_IPC_NS
25 &ipcns_operations, 26 &ipcns_operations,
26#endif 27#endif
28#ifdef CONFIG_PID_NS
29 &pidns_operations,
30#endif
31#ifdef CONFIG_USER_NS
32 &userns_operations,
33#endif
34 &mntns_operations,
27}; 35};
28 36
29static const struct file_operations ns_file_operations = { 37static const struct file_operations ns_file_operations = {
30 .llseek = no_llseek, 38 .llseek = no_llseek,
31}; 39};
32 40
41static const struct inode_operations ns_inode_operations = {
42 .setattr = proc_setattr,
43};
44
45static int ns_delete_dentry(const struct dentry *dentry)
46{
47 /* Don't cache namespace inodes when not in use */
48 return 1;
49}
50
51static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
52{
53 struct inode *inode = dentry->d_inode;
54 const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
55
56 return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
57 ns_ops->name, inode->i_ino);
58}
59
60const struct dentry_operations ns_dentry_operations =
61{
62 .d_delete = ns_delete_dentry,
63 .d_dname = ns_dname,
64};
65
66static struct dentry *proc_ns_get_dentry(struct super_block *sb,
67 struct task_struct *task, const struct proc_ns_operations *ns_ops)
68{
69 struct dentry *dentry, *result;
70 struct inode *inode;
71 struct proc_inode *ei;
72 struct qstr qname = { .name = "", };
73 void *ns;
74
75 ns = ns_ops->get(task);
76 if (!ns)
77 return ERR_PTR(-ENOENT);
78
79 dentry = d_alloc_pseudo(sb, &qname);
80 if (!dentry) {
81 ns_ops->put(ns);
82 return ERR_PTR(-ENOMEM);
83 }
84
85 inode = iget_locked(sb, ns_ops->inum(ns));
86 if (!inode) {
87 dput(dentry);
88 ns_ops->put(ns);
89 return ERR_PTR(-ENOMEM);
90 }
91
92 ei = PROC_I(inode);
93 if (inode->i_state & I_NEW) {
94 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
95 inode->i_op = &ns_inode_operations;
96 inode->i_mode = S_IFREG | S_IRUGO;
97 inode->i_fop = &ns_file_operations;
98 ei->ns_ops = ns_ops;
99 ei->ns = ns;
100 unlock_new_inode(inode);
101 } else {
102 ns_ops->put(ns);
103 }
104
105 d_set_d_op(dentry, &ns_dentry_operations);
106 result = d_instantiate_unique(dentry, inode);
107 if (result) {
108 dput(dentry);
109 dentry = result;
110 }
111
112 return dentry;
113}
114
115static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
116{
117 struct inode *inode = dentry->d_inode;
118 struct super_block *sb = inode->i_sb;
119 struct proc_inode *ei = PROC_I(inode);
120 struct task_struct *task;
121 struct dentry *ns_dentry;
122 void *error = ERR_PTR(-EACCES);
123
124 task = get_proc_task(inode);
125 if (!task)
126 goto out;
127
128 if (!ptrace_may_access(task, PTRACE_MODE_READ))
129 goto out_put_task;
130
131 ns_dentry = proc_ns_get_dentry(sb, task, ei->ns_ops);
132 if (IS_ERR(ns_dentry)) {
133 error = ERR_CAST(ns_dentry);
134 goto out_put_task;
135 }
136
137 dput(nd->path.dentry);
138 nd->path.dentry = ns_dentry;
139 error = NULL;
140
141out_put_task:
142 put_task_struct(task);
143out:
144 return error;
145}
146
147static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen)
148{
149 struct inode *inode = dentry->d_inode;
150 struct proc_inode *ei = PROC_I(inode);
151 const struct proc_ns_operations *ns_ops = ei->ns_ops;
152 struct task_struct *task;
153 void *ns;
154 char name[50];
155 int len = -EACCES;
156
157 task = get_proc_task(inode);
158 if (!task)
159 goto out;
160
161 if (!ptrace_may_access(task, PTRACE_MODE_READ))
162 goto out_put_task;
163
164 len = -ENOENT;
165 ns = ns_ops->get(task);
166 if (!ns)
167 goto out_put_task;
168
169 snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns));
170 len = strlen(name);
171
172 if (len > buflen)
173 len = buflen;
174 if (copy_to_user(buffer, name, len))
175 len = -EFAULT;
176
177 ns_ops->put(ns);
178out_put_task:
179 put_task_struct(task);
180out:
181 return len;
182}
183
184static const struct inode_operations proc_ns_link_inode_operations = {
185 .readlink = proc_ns_readlink,
186 .follow_link = proc_ns_follow_link,
187 .setattr = proc_setattr,
188};
189
33static struct dentry *proc_ns_instantiate(struct inode *dir, 190static struct dentry *proc_ns_instantiate(struct inode *dir,
34 struct dentry *dentry, struct task_struct *task, const void *ptr) 191 struct dentry *dentry, struct task_struct *task, const void *ptr)
35{ 192{
@@ -37,21 +194,15 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
37 struct inode *inode; 194 struct inode *inode;
38 struct proc_inode *ei; 195 struct proc_inode *ei;
39 struct dentry *error = ERR_PTR(-ENOENT); 196 struct dentry *error = ERR_PTR(-ENOENT);
40 void *ns;
41 197
42 inode = proc_pid_make_inode(dir->i_sb, task); 198 inode = proc_pid_make_inode(dir->i_sb, task);
43 if (!inode) 199 if (!inode)
44 goto out; 200 goto out;
45 201
46 ns = ns_ops->get(task);
47 if (!ns)
48 goto out_iput;
49
50 ei = PROC_I(inode); 202 ei = PROC_I(inode);
51 inode->i_mode = S_IFREG|S_IRUSR; 203 inode->i_mode = S_IFLNK|S_IRWXUGO;
52 inode->i_fop = &ns_file_operations; 204 inode->i_op = &proc_ns_link_inode_operations;
53 ei->ns_ops = ns_ops; 205 ei->ns_ops = ns_ops;
54 ei->ns = ns;
55 206
56 d_set_d_op(dentry, &pid_dentry_operations); 207 d_set_d_op(dentry, &pid_dentry_operations);
57 d_add(dentry, inode); 208 d_add(dentry, inode);
@@ -60,9 +211,6 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
60 error = NULL; 211 error = NULL;
61out: 212out:
62 return error; 213 return error;
63out_iput:
64 iput(inode);
65 goto out;
66} 214}
67 215
68static int proc_ns_fill_cache(struct file *filp, void *dirent, 216static int proc_ns_fill_cache(struct file *filp, void *dirent,
@@ -89,10 +237,6 @@ static int proc_ns_dir_readdir(struct file *filp, void *dirent,
89 if (!task) 237 if (!task)
90 goto out_no_task; 238 goto out_no_task;
91 239
92 ret = -EPERM;
93 if (!ptrace_may_access(task, PTRACE_MODE_READ))
94 goto out;
95
96 ret = 0; 240 ret = 0;
97 i = filp->f_pos; 241 i = filp->f_pos;
98 switch (i) { 242 switch (i) {
@@ -152,10 +296,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
152 if (!task) 296 if (!task)
153 goto out_no_task; 297 goto out_no_task;
154 298
155 error = ERR_PTR(-EPERM);
156 if (!ptrace_may_access(task, PTRACE_MODE_READ))
157 goto out;
158
159 last = &ns_entries[ARRAY_SIZE(ns_entries)]; 299 last = &ns_entries[ARRAY_SIZE(ns_entries)];
160 for (entry = ns_entries; entry < last; entry++) { 300 for (entry = ns_entries; entry < last; entry++) {
161 if (strlen((*entry)->name) != len) 301 if (strlen((*entry)->name) != len)
@@ -163,7 +303,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
163 if (!memcmp(dentry->d_name.name, (*entry)->name, len)) 303 if (!memcmp(dentry->d_name.name, (*entry)->name, len))
164 break; 304 break;
165 } 305 }
166 error = ERR_PTR(-ENOENT);
167 if (entry == last) 306 if (entry == last)
168 goto out; 307 goto out;
169 308
@@ -198,3 +337,7 @@ out_invalid:
198 return ERR_PTR(-EINVAL); 337 return ERR_PTR(-EINVAL);
199} 338}
200 339
340bool proc_ns_inode(struct inode *inode)
341{
342 return inode->i_fop == &ns_file_operations;
343}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 9889a92d2e01..c6e9fac26bac 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -100,14 +100,13 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
100 int err; 100 int err;
101 struct super_block *sb; 101 struct super_block *sb;
102 struct pid_namespace *ns; 102 struct pid_namespace *ns;
103 struct proc_inode *ei;
104 char *options; 103 char *options;
105 104
106 if (flags & MS_KERNMOUNT) { 105 if (flags & MS_KERNMOUNT) {
107 ns = (struct pid_namespace *)data; 106 ns = (struct pid_namespace *)data;
108 options = NULL; 107 options = NULL;
109 } else { 108 } else {
110 ns = current->nsproxy->pid_ns; 109 ns = task_active_pid_ns(current);
111 options = data; 110 options = data;
112 } 111 }
113 112
@@ -130,13 +129,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
130 sb->s_flags |= MS_ACTIVE; 129 sb->s_flags |= MS_ACTIVE;
131 } 130 }
132 131
133 ei = PROC_I(sb->s_root->d_inode);
134 if (!ei->pid) {
135 rcu_read_lock();
136 ei->pid = get_pid(find_pid_ns(1, ns));
137 rcu_read_unlock();
138 }
139
140 return dget(sb->s_root); 132 return dget(sb->s_root);
141} 133}
142 134
@@ -153,6 +145,7 @@ static struct file_system_type proc_fs_type = {
153 .name = "proc", 145 .name = "proc",
154 .mount = proc_mount, 146 .mount = proc_mount,
155 .kill_sb = proc_kill_sb, 147 .kill_sb = proc_kill_sb,
148 .fs_flags = FS_USERNS_MOUNT,
156}; 149};
157 150
158void __init proc_root_init(void) 151void __init proc_root_init(void)
@@ -163,12 +156,8 @@ void __init proc_root_init(void)
163 err = register_filesystem(&proc_fs_type); 156 err = register_filesystem(&proc_fs_type);
164 if (err) 157 if (err)
165 return; 158 return;
166 err = pid_ns_prepare_proc(&init_pid_ns);
167 if (err) {
168 unregister_filesystem(&proc_fs_type);
169 return;
170 }
171 159
160 proc_self_init();
172 proc_symlink("mounts", NULL, "self/mounts"); 161 proc_symlink("mounts", NULL, "self/mounts");
173 162
174 proc_net_init(); 163 proc_net_init();
diff --git a/fs/proc/self.c b/fs/proc/self.c
new file mode 100644
index 000000000000..aa5cc3bff140
--- /dev/null
+++ b/fs/proc/self.c
@@ -0,0 +1,59 @@
1#include <linux/proc_fs.h>
2#include <linux/sched.h>
3#include <linux/namei.h>
4
5/*
6 * /proc/self:
7 */
8static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
9 int buflen)
10{
11 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
12 pid_t tgid = task_tgid_nr_ns(current, ns);
13 char tmp[PROC_NUMBUF];
14 if (!tgid)
15 return -ENOENT;
16 sprintf(tmp, "%d", tgid);
17 return vfs_readlink(dentry,buffer,buflen,tmp);
18}
19
20static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
21{
22 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
23 pid_t tgid = task_tgid_nr_ns(current, ns);
24 char *name = ERR_PTR(-ENOENT);
25 if (tgid) {
26 /* 11 for max length of signed int in decimal + NULL term */
27 name = kmalloc(12, GFP_KERNEL);
28 if (!name)
29 name = ERR_PTR(-ENOMEM);
30 else
31 sprintf(name, "%d", tgid);
32 }
33 nd_set_link(nd, name);
34 return NULL;
35}
36
37static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
38 void *cookie)
39{
40 char *s = nd_get_link(nd);
41 if (!IS_ERR(s))
42 kfree(s);
43}
44
45static const struct inode_operations proc_self_inode_operations = {
46 .readlink = proc_self_readlink,
47 .follow_link = proc_self_follow_link,
48 .put_link = proc_self_put_link,
49};
50
51void __init proc_self_init(void)
52{
53 struct proc_dir_entry *proc_self_symlink;
54 mode_t mode;
55
56 mode = S_IFLNK | S_IRWXUGO;
57 proc_self_symlink = proc_create("self", mode, NULL, NULL );
58 proc_self_symlink->proc_iops = &proc_self_inode_operations;
59}
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 71eb7e253927..db940a9be045 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -149,6 +149,7 @@ static struct file_system_type sysfs_fs_type = {
149 .name = "sysfs", 149 .name = "sysfs",
150 .mount = sysfs_mount, 150 .mount = sysfs_mount,
151 .kill_sb = sysfs_kill_sb, 151 .kill_sb = sysfs_kill_sb,
152 .fs_flags = FS_USERNS_MOUNT,
152}; 153};
153 154
154int __init sysfs_init(void) 155int __init sysfs_init(void)