aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-02-23 23:33:51 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2017-02-23 23:33:51 -0500
commitf1ef09fde17f9b77ca1435a5b53a28b203afb81c (patch)
tree0efcd2c5b5da451a7ca780c8aa5e26d7ec712b85
parentef96152e6a36e0510387cb174178b7982c1ae879 (diff)
parentace0c791e6c3cf5ef37cad2df69f0d90ccc40ffb (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
Pull namespace updates from Eric Biederman: "There is a lot here. A lot of these changes result in subtle user visible differences in kernel behavior. I don't expect anything will care but I will revert/fix things immediately if any regressions show up. From Seth Forshee there is a continuation of the work to make the vfs ready for unprivileged mounts. We had thought the previous changes prevented the creation of files outside of s_user_ns of a filesystem, but it turns out we missed the O_CREAT path. Oops. Pavel Tikhomirov and Oleg Nesterov worked together to fix a long standing bug in the implementation of PR_SET_CHILD_SUBREAPER where only children that are forked after the prctl are considered and not children forked before the prctl. The only known user of this prctl, systemd, forks all children after the prctl. So no userspace regressions will occur. Holding earlier forked children to the same rules as later forked children creates a semantic that is sane enough to allow checkpointing of processes that use this feature. There is a long delayed change by Nikolay Borisov to limit inotify instances inside a user namespace. Michael Kerrisk extends the API for files used to manipulate namespaces with two new trivial ioctls to allow discovery of the hierarchy and properties of namespaces. Konstantin Khlebnikov, with the help of Al Viro, adds code that purges a network namespace's sysctl entries from the dcache when that namespace exits, as in some circumstances this could use a lot of memory. Vivek Goyal fixed a bug with stacked filesystems where the permissions on the wrong inode were being checked. I continue previous work on ptracing across exec, allowing a file to be setuid across exec while being ptraced if the tracer has enough credentials in the user namespace, and if the process has CAP_SETUID in its own namespace. Proc files for setuid or otherwise undumpable executables are now owned by the root in the user namespace of their mm. 
This allows debugging of setuid applications in containers to work better. A bug I introduced with permission checking and automount is now fixed. The big change is to mark the mounts that the kernel initiates as a result of an automount. This allows the permission checks in sget to be safely suppressed for this kind of mount, as the permission check happened when the original filesystem was mounted. Finally, a special case in the mount namespace is removed, preventing unbounded chains in the mount hash table and making the semantics simpler, which benefits CRIU. The vfs fix along with related work in ima and evm I believe makes us ready to finish developing and merge fully unprivileged mounts of the fuse filesystem. The cleanups of the mount namespace make it possible to discuss how to fix the worst-case complexity of umount. The stacked filesystem fixes pave the way for adding multiple mappings for the filesystem uids so that efficient and safer containers can be implemented" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: proc/sysctl: Don't grab i_lock under sysctl_lock. vfs: Use upper filesystem inode in bprm_fill_uid() proc/sysctl: prune stale dentries during unregistering mnt: Tuck mounts under others instead of creating shadow/side mounts. prctl: propagate has_child_subreaper flag to every descendant introduce the walk_process_tree() helper nsfs: Add an ioctl() to return owner UID of a userns fs: Better permission checking for submounts exit: fix the setns() && PR_SET_CHILD_SUBREAPER interaction vfs: open() with O_CREAT should not create inodes with unknown ids nsfs: Add an ioctl() to return the namespace type proc: Better ownership of files for non-dumpable tasks in user namespaces exec: Remove LSM_UNSAFE_PTRACE_CAP exec: Test the ptracer's saved cred to see if the tracee can gain caps exec: Don't reset euid and egid when the tracee has CAP_SETUID inotify: Convert to using per-namespace limits
-rw-r--r--fs/afs/mntpt.c2
-rw-r--r--fs/autofs4/waitq.c4
-rw-r--r--fs/cifs/cifs_dfs_ref.c7
-rw-r--r--fs/debugfs/inode.c8
-rw-r--r--fs/exec.c10
-rw-r--r--fs/mount.h1
-rw-r--r--fs/namei.c9
-rw-r--r--fs/namespace.c127
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/nfs4namespace.c2
-rw-r--r--fs/notify/inotify/inotify.h17
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c6
-rw-r--r--fs/notify/inotify/inotify_user.c34
-rw-r--r--fs/nsfs.c13
-rw-r--r--fs/pnode.c61
-rw-r--r--fs/pnode.h2
-rw-r--r--fs/proc/base.c102
-rw-r--r--fs/proc/fd.c12
-rw-r--r--fs/proc/inode.c3
-rw-r--r--fs/proc/internal.h23
-rw-r--r--fs/proc/proc_sysctl.c66
-rw-r--r--fs/super.c13
-rw-r--r--include/linux/debugfs.h3
-rw-r--r--include/linux/fsnotify_backend.h3
-rw-r--r--include/linux/mount.h3
-rw-r--r--include/linux/sched.h7
-rw-r--r--include/linux/security.h3
-rw-r--r--include/linux/sysctl.h1
-rw-r--r--include/linux/user_namespace.h4
-rw-r--r--include/uapi/linux/fs.h1
-rw-r--r--include/uapi/linux/nsfs.h9
-rw-r--r--kernel/exit.c13
-rw-r--r--kernel/fork.c42
-rw-r--r--kernel/sys.c22
-rw-r--r--kernel/trace/trace.c4
-rw-r--r--kernel/ucount.c6
-rw-r--r--security/apparmor/domain.c2
-rw-r--r--security/commoncap.c5
-rw-r--r--security/selinux/hooks.c3
-rw-r--r--security/smack/smack_lsm.c2
40 files changed, 431 insertions, 226 deletions
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 81dd075356b9..d4fb0afc0097 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -202,7 +202,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
202 202
203 /* try and do the mount */ 203 /* try and do the mount */
204 _debug("--- attempting mount %s -o %s ---", devname, options); 204 _debug("--- attempting mount %s -o %s ---", devname, options);
205 mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options); 205 mnt = vfs_submount(mntpt, &afs_fs_type, devname, options);
206 _debug("--- mount result %p ---", mnt); 206 _debug("--- mount result %p ---", mnt);
207 207
208 free_page((unsigned long) devname); 208 free_page((unsigned long) devname);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 1278335ce366..79fbd85db4ba 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -436,8 +436,8 @@ int autofs4_wait(struct autofs_sb_info *sbi,
436 memcpy(&wq->name, &qstr, sizeof(struct qstr)); 436 memcpy(&wq->name, &qstr, sizeof(struct qstr));
437 wq->dev = autofs4_get_dev(sbi); 437 wq->dev = autofs4_get_dev(sbi);
438 wq->ino = autofs4_get_ino(sbi); 438 wq->ino = autofs4_get_ino(sbi);
439 wq->uid = current_real_cred()->uid; 439 wq->uid = current_cred()->uid;
440 wq->gid = current_real_cred()->gid; 440 wq->gid = current_cred()->gid;
441 wq->pid = pid; 441 wq->pid = pid;
442 wq->tgid = tgid; 442 wq->tgid = tgid;
443 wq->status = -EINTR; /* Status return if interrupted */ 443 wq->status = -EINTR; /* Status return if interrupted */
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index ec9dbbcca3b9..9156be545b0f 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -245,7 +245,8 @@ compose_mount_options_err:
245 * @fullpath: full path in UNC format 245 * @fullpath: full path in UNC format
246 * @ref: server's referral 246 * @ref: server's referral
247 */ 247 */
248static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, 248static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt,
249 struct cifs_sb_info *cifs_sb,
249 const char *fullpath, const struct dfs_info3_param *ref) 250 const char *fullpath, const struct dfs_info3_param *ref)
250{ 251{
251 struct vfsmount *mnt; 252 struct vfsmount *mnt;
@@ -259,7 +260,7 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
259 if (IS_ERR(mountdata)) 260 if (IS_ERR(mountdata))
260 return (struct vfsmount *)mountdata; 261 return (struct vfsmount *)mountdata;
261 262
262 mnt = vfs_kern_mount(&cifs_fs_type, 0, devname, mountdata); 263 mnt = vfs_submount(mntpt, &cifs_fs_type, devname, mountdata);
263 kfree(mountdata); 264 kfree(mountdata);
264 kfree(devname); 265 kfree(devname);
265 return mnt; 266 return mnt;
@@ -334,7 +335,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
334 mnt = ERR_PTR(-EINVAL); 335 mnt = ERR_PTR(-EINVAL);
335 break; 336 break;
336 } 337 }
337 mnt = cifs_dfs_do_refmount(cifs_sb, 338 mnt = cifs_dfs_do_refmount(mntpt, cifs_sb,
338 full_path, referrals + i); 339 full_path, referrals + i);
339 cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n", 340 cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n",
340 __func__, referrals[i].node_name, mnt); 341 __func__, referrals[i].node_name, mnt);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 7fb1732a3630..7fd4ec4bb214 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -187,9 +187,9 @@ static const struct super_operations debugfs_super_operations = {
187 187
188static struct vfsmount *debugfs_automount(struct path *path) 188static struct vfsmount *debugfs_automount(struct path *path)
189{ 189{
190 struct vfsmount *(*f)(void *); 190 debugfs_automount_t f;
191 f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata; 191 f = (debugfs_automount_t)path->dentry->d_fsdata;
192 return f(d_inode(path->dentry)->i_private); 192 return f(path->dentry, d_inode(path->dentry)->i_private);
193} 193}
194 194
195static const struct dentry_operations debugfs_dops = { 195static const struct dentry_operations debugfs_dops = {
@@ -540,7 +540,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir);
540 */ 540 */
541struct dentry *debugfs_create_automount(const char *name, 541struct dentry *debugfs_create_automount(const char *name,
542 struct dentry *parent, 542 struct dentry *parent,
543 struct vfsmount *(*f)(void *), 543 debugfs_automount_t f,
544 void *data) 544 void *data)
545{ 545{
546 struct dentry *dentry = start_creating(name, parent); 546 struct dentry *dentry = start_creating(name, parent);
diff --git a/fs/exec.c b/fs/exec.c
index e57946610733..698a86094f76 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1426,12 +1426,8 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
1426 struct task_struct *p = current, *t; 1426 struct task_struct *p = current, *t;
1427 unsigned n_fs; 1427 unsigned n_fs;
1428 1428
1429 if (p->ptrace) { 1429 if (p->ptrace)
1430 if (ptracer_capable(p, current_user_ns())) 1430 bprm->unsafe |= LSM_UNSAFE_PTRACE;
1431 bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
1432 else
1433 bprm->unsafe |= LSM_UNSAFE_PTRACE;
1434 }
1435 1431
1436 /* 1432 /*
1437 * This isn't strictly necessary, but it makes it harder for LSMs to 1433 * This isn't strictly necessary, but it makes it harder for LSMs to
@@ -1479,7 +1475,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm)
1479 if (task_no_new_privs(current)) 1475 if (task_no_new_privs(current))
1480 return; 1476 return;
1481 1477
1482 inode = file_inode(bprm->file); 1478 inode = bprm->file->f_path.dentry->d_inode;
1483 mode = READ_ONCE(inode->i_mode); 1479 mode = READ_ONCE(inode->i_mode);
1484 if (!(mode & (S_ISUID|S_ISGID))) 1480 if (!(mode & (S_ISUID|S_ISGID)))
1485 return; 1481 return;
diff --git a/fs/mount.h b/fs/mount.h
index 2c856fc47ae3..2826543a131d 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -89,7 +89,6 @@ static inline int is_mounted(struct vfsmount *mnt)
89} 89}
90 90
91extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); 91extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
92extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
93 92
94extern int __legitimize_mnt(struct vfsmount *, unsigned); 93extern int __legitimize_mnt(struct vfsmount *, unsigned);
95extern bool legitimize_mnt(struct vfsmount *, unsigned); 94extern bool legitimize_mnt(struct vfsmount *, unsigned);
diff --git a/fs/namei.c b/fs/namei.c
index ad74877e1442..da689c9c005e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1100,7 +1100,6 @@ static int follow_automount(struct path *path, struct nameidata *nd,
1100 bool *need_mntput) 1100 bool *need_mntput)
1101{ 1101{
1102 struct vfsmount *mnt; 1102 struct vfsmount *mnt;
1103 const struct cred *old_cred;
1104 int err; 1103 int err;
1105 1104
1106 if (!path->dentry->d_op || !path->dentry->d_op->d_automount) 1105 if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
@@ -1129,9 +1128,7 @@ static int follow_automount(struct path *path, struct nameidata *nd,
1129 if (nd->total_link_count >= 40) 1128 if (nd->total_link_count >= 40)
1130 return -ELOOP; 1129 return -ELOOP;
1131 1130
1132 old_cred = override_creds(&init_cred);
1133 mnt = path->dentry->d_op->d_automount(path); 1131 mnt = path->dentry->d_op->d_automount(path);
1134 revert_creds(old_cred);
1135 if (IS_ERR(mnt)) { 1132 if (IS_ERR(mnt)) {
1136 /* 1133 /*
1137 * The filesystem is allowed to return -EISDIR here to indicate 1134 * The filesystem is allowed to return -EISDIR here to indicate
@@ -2941,10 +2938,16 @@ static inline int open_to_namei_flags(int flag)
2941 2938
2942static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode) 2939static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
2943{ 2940{
2941 struct user_namespace *s_user_ns;
2944 int error = security_path_mknod(dir, dentry, mode, 0); 2942 int error = security_path_mknod(dir, dentry, mode, 0);
2945 if (error) 2943 if (error)
2946 return error; 2944 return error;
2947 2945
2946 s_user_ns = dir->dentry->d_sb->s_user_ns;
2947 if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
2948 !kgid_has_mapping(s_user_ns, current_fsgid()))
2949 return -EOVERFLOW;
2950
2948 error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); 2951 error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
2949 if (error) 2952 if (error)
2950 return error; 2953 return error;
diff --git a/fs/namespace.c b/fs/namespace.c
index 487ba30bb5c6..8bfad42c1ccf 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -637,28 +637,6 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
637} 637}
638 638
639/* 639/*
640 * find the last mount at @dentry on vfsmount @mnt.
641 * mount_lock must be held.
642 */
643struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
644{
645 struct mount *p, *res = NULL;
646 p = __lookup_mnt(mnt, dentry);
647 if (!p)
648 goto out;
649 if (!(p->mnt.mnt_flags & MNT_UMOUNT))
650 res = p;
651 hlist_for_each_entry_continue(p, mnt_hash) {
652 if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
653 break;
654 if (!(p->mnt.mnt_flags & MNT_UMOUNT))
655 res = p;
656 }
657out:
658 return res;
659}
660
661/*
662 * lookup_mnt - Return the first child mount mounted at path 640 * lookup_mnt - Return the first child mount mounted at path
663 * 641 *
664 * "First" means first mounted chronologically. If you create the 642 * "First" means first mounted chronologically. If you create the
@@ -878,6 +856,13 @@ void mnt_set_mountpoint(struct mount *mnt,
878 hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); 856 hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
879} 857}
880 858
859static void __attach_mnt(struct mount *mnt, struct mount *parent)
860{
861 hlist_add_head_rcu(&mnt->mnt_hash,
862 m_hash(&parent->mnt, mnt->mnt_mountpoint));
863 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
864}
865
881/* 866/*
882 * vfsmount lock must be held for write 867 * vfsmount lock must be held for write
883 */ 868 */
@@ -886,28 +871,45 @@ static void attach_mnt(struct mount *mnt,
886 struct mountpoint *mp) 871 struct mountpoint *mp)
887{ 872{
888 mnt_set_mountpoint(parent, mp, mnt); 873 mnt_set_mountpoint(parent, mp, mnt);
889 hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); 874 __attach_mnt(mnt, parent);
890 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
891} 875}
892 876
893static void attach_shadowed(struct mount *mnt, 877void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
894 struct mount *parent,
895 struct mount *shadows)
896{ 878{
897 if (shadows) { 879 struct mountpoint *old_mp = mnt->mnt_mp;
898 hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); 880 struct dentry *old_mountpoint = mnt->mnt_mountpoint;
899 list_add(&mnt->mnt_child, &shadows->mnt_child); 881 struct mount *old_parent = mnt->mnt_parent;
900 } else { 882
901 hlist_add_head_rcu(&mnt->mnt_hash, 883 list_del_init(&mnt->mnt_child);
902 m_hash(&parent->mnt, mnt->mnt_mountpoint)); 884 hlist_del_init(&mnt->mnt_mp_list);
903 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); 885 hlist_del_init_rcu(&mnt->mnt_hash);
904 } 886
887 attach_mnt(mnt, parent, mp);
888
889 put_mountpoint(old_mp);
890
891 /*
892 * Safely avoid even the suggestion this code might sleep or
893 * lock the mount hash by taking advantage of the knowledge that
894 * mnt_change_mountpoint will not release the final reference
895 * to a mountpoint.
896 *
897 * During mounting, the mount passed in as the parent mount will
898 * continue to use the old mountpoint and during unmounting, the
899 * old mountpoint will continue to exist until namespace_unlock,
900 * which happens well after mnt_change_mountpoint.
901 */
902 spin_lock(&old_mountpoint->d_lock);
903 old_mountpoint->d_lockref.count--;
904 spin_unlock(&old_mountpoint->d_lock);
905
906 mnt_add_count(old_parent, -1);
905} 907}
906 908
907/* 909/*
908 * vfsmount lock must be held for write 910 * vfsmount lock must be held for write
909 */ 911 */
910static void commit_tree(struct mount *mnt, struct mount *shadows) 912static void commit_tree(struct mount *mnt)
911{ 913{
912 struct mount *parent = mnt->mnt_parent; 914 struct mount *parent = mnt->mnt_parent;
913 struct mount *m; 915 struct mount *m;
@@ -925,7 +927,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
925 n->mounts += n->pending_mounts; 927 n->mounts += n->pending_mounts;
926 n->pending_mounts = 0; 928 n->pending_mounts = 0;
927 929
928 attach_shadowed(mnt, parent, shadows); 930 __attach_mnt(mnt, parent);
929 touch_mnt_namespace(n); 931 touch_mnt_namespace(n);
930} 932}
931 933
@@ -989,6 +991,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
989} 991}
990EXPORT_SYMBOL_GPL(vfs_kern_mount); 992EXPORT_SYMBOL_GPL(vfs_kern_mount);
991 993
994struct vfsmount *
995vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
996 const char *name, void *data)
997{
998 /* Until it is worked out how to pass the user namespace
999 * through from the parent mount to the submount don't support
1000 * unprivileged mounts with submounts.
1001 */
1002 if (mountpoint->d_sb->s_user_ns != &init_user_ns)
1003 return ERR_PTR(-EPERM);
1004
1005 return vfs_kern_mount(type, MS_SUBMOUNT, name, data);
1006}
1007EXPORT_SYMBOL_GPL(vfs_submount);
1008
992static struct mount *clone_mnt(struct mount *old, struct dentry *root, 1009static struct mount *clone_mnt(struct mount *old, struct dentry *root,
993 int flag) 1010 int flag)
994{ 1011{
@@ -1764,7 +1781,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1764 continue; 1781 continue;
1765 1782
1766 for (s = r; s; s = next_mnt(s, r)) { 1783 for (s = r; s; s = next_mnt(s, r)) {
1767 struct mount *t = NULL;
1768 if (!(flag & CL_COPY_UNBINDABLE) && 1784 if (!(flag & CL_COPY_UNBINDABLE) &&
1769 IS_MNT_UNBINDABLE(s)) { 1785 IS_MNT_UNBINDABLE(s)) {
1770 s = skip_mnt_tree(s); 1786 s = skip_mnt_tree(s);
@@ -1786,14 +1802,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1786 goto out; 1802 goto out;
1787 lock_mount_hash(); 1803 lock_mount_hash();
1788 list_add_tail(&q->mnt_list, &res->mnt_list); 1804 list_add_tail(&q->mnt_list, &res->mnt_list);
1789 mnt_set_mountpoint(parent, p->mnt_mp, q); 1805 attach_mnt(q, parent, p->mnt_mp);
1790 if (!list_empty(&parent->mnt_mounts)) {
1791 t = list_last_entry(&parent->mnt_mounts,
1792 struct mount, mnt_child);
1793 if (t->mnt_mp != p->mnt_mp)
1794 t = NULL;
1795 }
1796 attach_shadowed(q, parent, t);
1797 unlock_mount_hash(); 1806 unlock_mount_hash();
1798 } 1807 }
1799 } 1808 }
@@ -1992,10 +2001,18 @@ static int attach_recursive_mnt(struct mount *source_mnt,
1992{ 2001{
1993 HLIST_HEAD(tree_list); 2002 HLIST_HEAD(tree_list);
1994 struct mnt_namespace *ns = dest_mnt->mnt_ns; 2003 struct mnt_namespace *ns = dest_mnt->mnt_ns;
2004 struct mountpoint *smp;
1995 struct mount *child, *p; 2005 struct mount *child, *p;
1996 struct hlist_node *n; 2006 struct hlist_node *n;
1997 int err; 2007 int err;
1998 2008
2009 /* Preallocate a mountpoint in case the new mounts need
2010 * to be tucked under other mounts.
2011 */
2012 smp = get_mountpoint(source_mnt->mnt.mnt_root);
2013 if (IS_ERR(smp))
2014 return PTR_ERR(smp);
2015
1999 /* Is there space to add these mounts to the mount namespace? */ 2016 /* Is there space to add these mounts to the mount namespace? */
2000 if (!parent_path) { 2017 if (!parent_path) {
2001 err = count_mounts(ns, source_mnt); 2018 err = count_mounts(ns, source_mnt);
@@ -2022,16 +2039,19 @@ static int attach_recursive_mnt(struct mount *source_mnt,
2022 touch_mnt_namespace(source_mnt->mnt_ns); 2039 touch_mnt_namespace(source_mnt->mnt_ns);
2023 } else { 2040 } else {
2024 mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); 2041 mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
2025 commit_tree(source_mnt, NULL); 2042 commit_tree(source_mnt);
2026 } 2043 }
2027 2044
2028 hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { 2045 hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
2029 struct mount *q; 2046 struct mount *q;
2030 hlist_del_init(&child->mnt_hash); 2047 hlist_del_init(&child->mnt_hash);
2031 q = __lookup_mnt_last(&child->mnt_parent->mnt, 2048 q = __lookup_mnt(&child->mnt_parent->mnt,
2032 child->mnt_mountpoint); 2049 child->mnt_mountpoint);
2033 commit_tree(child, q); 2050 if (q)
2051 mnt_change_mountpoint(child, smp, q);
2052 commit_tree(child);
2034 } 2053 }
2054 put_mountpoint(smp);
2035 unlock_mount_hash(); 2055 unlock_mount_hash();
2036 2056
2037 return 0; 2057 return 0;
@@ -2046,6 +2066,11 @@ static int attach_recursive_mnt(struct mount *source_mnt,
2046 cleanup_group_ids(source_mnt, NULL); 2066 cleanup_group_ids(source_mnt, NULL);
2047 out: 2067 out:
2048 ns->pending_mounts = 0; 2068 ns->pending_mounts = 0;
2069
2070 read_seqlock_excl(&mount_lock);
2071 put_mountpoint(smp);
2072 read_sequnlock_excl(&mount_lock);
2073
2049 return err; 2074 return err;
2050} 2075}
2051 2076
@@ -2794,7 +2819,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
2794 2819
2795 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | 2820 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
2796 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | 2821 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
2797 MS_STRICTATIME | MS_NOREMOTELOCK); 2822 MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT);
2798 2823
2799 if (flags & MS_REMOUNT) 2824 if (flags & MS_REMOUNT)
2800 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, 2825 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 5551e8ef67fd..e49d831c4e85 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -226,7 +226,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
226 const char *devname, 226 const char *devname,
227 struct nfs_clone_mount *mountdata) 227 struct nfs_clone_mount *mountdata)
228{ 228{
229 return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); 229 return vfs_submount(mountdata->dentry, &nfs_xdev_fs_type, devname, mountdata);
230} 230}
231 231
232/** 232/**
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index d21104912676..d8b040bd9814 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -279,7 +279,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
279 mountdata->hostname, 279 mountdata->hostname,
280 mountdata->mnt_path); 280 mountdata->mnt_path);
281 281
282 mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata); 282 mnt = vfs_submount(mountdata->dentry, &nfs4_referral_fs_type, page, mountdata);
283 if (!IS_ERR(mnt)) 283 if (!IS_ERR(mnt))
284 break; 284 break;
285 } 285 }
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index a6f5907a3fee..7c461fd49c4c 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -30,3 +30,20 @@ extern int inotify_handle_event(struct fsnotify_group *group,
30 const unsigned char *file_name, u32 cookie); 30 const unsigned char *file_name, u32 cookie);
31 31
32extern const struct fsnotify_ops inotify_fsnotify_ops; 32extern const struct fsnotify_ops inotify_fsnotify_ops;
33
34#ifdef CONFIG_INOTIFY_USER
35static inline void dec_inotify_instances(struct ucounts *ucounts)
36{
37 dec_ucount(ucounts, UCOUNT_INOTIFY_INSTANCES);
38}
39
40static inline struct ucounts *inc_inotify_watches(struct ucounts *ucounts)
41{
42 return inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_INOTIFY_WATCHES);
43}
44
45static inline void dec_inotify_watches(struct ucounts *ucounts)
46{
47 dec_ucount(ucounts, UCOUNT_INOTIFY_WATCHES);
48}
49#endif
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 19e7ec109a75..f36c29398de3 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -165,10 +165,8 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
165 /* ideally the idr is empty and we won't hit the BUG in the callback */ 165 /* ideally the idr is empty and we won't hit the BUG in the callback */
166 idr_for_each(&group->inotify_data.idr, idr_callback, group); 166 idr_for_each(&group->inotify_data.idr, idr_callback, group);
167 idr_destroy(&group->inotify_data.idr); 167 idr_destroy(&group->inotify_data.idr);
168 if (group->inotify_data.user) { 168 if (group->inotify_data.ucounts)
169 atomic_dec(&group->inotify_data.user->inotify_devs); 169 dec_inotify_instances(group->inotify_data.ucounts);
170 free_uid(group->inotify_data.user);
171 }
172} 170}
173 171
174static void inotify_free_event(struct fsnotify_event *fsn_event) 172static void inotify_free_event(struct fsnotify_event *fsn_event)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 69d1ea3d292a..1cf41c623be1 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -44,10 +44,8 @@
44 44
45#include <asm/ioctls.h> 45#include <asm/ioctls.h>
46 46
47/* these are configurable via /proc/sys/fs/inotify/ */ 47/* configurable via /proc/sys/fs/inotify/ */
48static int inotify_max_user_instances __read_mostly;
49static int inotify_max_queued_events __read_mostly; 48static int inotify_max_queued_events __read_mostly;
50static int inotify_max_user_watches __read_mostly;
51 49
52static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; 50static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
53 51
@@ -60,7 +58,7 @@ static int zero;
60struct ctl_table inotify_table[] = { 58struct ctl_table inotify_table[] = {
61 { 59 {
62 .procname = "max_user_instances", 60 .procname = "max_user_instances",
63 .data = &inotify_max_user_instances, 61 .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES],
64 .maxlen = sizeof(int), 62 .maxlen = sizeof(int),
65 .mode = 0644, 63 .mode = 0644,
66 .proc_handler = proc_dointvec_minmax, 64 .proc_handler = proc_dointvec_minmax,
@@ -68,7 +66,7 @@ struct ctl_table inotify_table[] = {
68 }, 66 },
69 { 67 {
70 .procname = "max_user_watches", 68 .procname = "max_user_watches",
71 .data = &inotify_max_user_watches, 69 .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES],
72 .maxlen = sizeof(int), 70 .maxlen = sizeof(int),
73 .mode = 0644, 71 .mode = 0644,
74 .proc_handler = proc_dointvec_minmax, 72 .proc_handler = proc_dointvec_minmax,
@@ -500,7 +498,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
500 /* remove this mark from the idr */ 498 /* remove this mark from the idr */
501 inotify_remove_from_idr(group, i_mark); 499 inotify_remove_from_idr(group, i_mark);
502 500
503 atomic_dec(&group->inotify_data.user->inotify_watches); 501 dec_inotify_watches(group->inotify_data.ucounts);
504} 502}
505 503
506/* ding dong the mark is dead */ 504/* ding dong the mark is dead */
@@ -584,14 +582,17 @@ static int inotify_new_watch(struct fsnotify_group *group,
584 tmp_i_mark->fsn_mark.mask = mask; 582 tmp_i_mark->fsn_mark.mask = mask;
585 tmp_i_mark->wd = -1; 583 tmp_i_mark->wd = -1;
586 584
587 ret = -ENOSPC;
588 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
589 goto out_err;
590
591 ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark); 585 ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark);
592 if (ret) 586 if (ret)
593 goto out_err; 587 goto out_err;
594 588
589 /* increment the number of watches the user has */
590 if (!inc_inotify_watches(group->inotify_data.ucounts)) {
591 inotify_remove_from_idr(group, tmp_i_mark);
592 ret = -ENOSPC;
593 goto out_err;
594 }
595
595 /* we are on the idr, now get on the inode */ 596 /* we are on the idr, now get on the inode */
596 ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode, 597 ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
597 NULL, 0); 598 NULL, 0);
@@ -601,8 +602,6 @@ static int inotify_new_watch(struct fsnotify_group *group,
601 goto out_err; 602 goto out_err;
602 } 603 }
603 604
604 /* increment the number of watches the user has */
605 atomic_inc(&group->inotify_data.user->inotify_watches);
606 605
607 /* return the watch descriptor for this new mark */ 606 /* return the watch descriptor for this new mark */
608 ret = tmp_i_mark->wd; 607 ret = tmp_i_mark->wd;
@@ -653,10 +652,11 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
653 652
654 spin_lock_init(&group->inotify_data.idr_lock); 653 spin_lock_init(&group->inotify_data.idr_lock);
655 idr_init(&group->inotify_data.idr); 654 idr_init(&group->inotify_data.idr);
656 group->inotify_data.user = get_current_user(); 655 group->inotify_data.ucounts = inc_ucount(current_user_ns(),
656 current_euid(),
657 UCOUNT_INOTIFY_INSTANCES);
657 658
658 if (atomic_inc_return(&group->inotify_data.user->inotify_devs) > 659 if (!group->inotify_data.ucounts) {
659 inotify_max_user_instances) {
660 fsnotify_destroy_group(group); 660 fsnotify_destroy_group(group);
661 return ERR_PTR(-EMFILE); 661 return ERR_PTR(-EMFILE);
662 } 662 }
@@ -819,8 +819,8 @@ static int __init inotify_user_setup(void)
819 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); 819 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
820 820
821 inotify_max_queued_events = 16384; 821 inotify_max_queued_events = 16384;
822 inotify_max_user_instances = 128; 822 init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128;
823 inotify_max_user_watches = 8192; 823 init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = 8192;
824 824
825 return 0; 825 return 0;
826} 826}
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 8c9fb29c6673..1656843e87d2 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -7,6 +7,7 @@
7#include <linux/seq_file.h> 7#include <linux/seq_file.h>
8#include <linux/user_namespace.h> 8#include <linux/user_namespace.h>
9#include <linux/nsfs.h> 9#include <linux/nsfs.h>
10#include <linux/uaccess.h>
10 11
11static struct vfsmount *nsfs_mnt; 12static struct vfsmount *nsfs_mnt;
12 13
@@ -163,7 +164,10 @@ int open_related_ns(struct ns_common *ns,
163static long ns_ioctl(struct file *filp, unsigned int ioctl, 164static long ns_ioctl(struct file *filp, unsigned int ioctl,
164 unsigned long arg) 165 unsigned long arg)
165{ 166{
167 struct user_namespace *user_ns;
166 struct ns_common *ns = get_proc_ns(file_inode(filp)); 168 struct ns_common *ns = get_proc_ns(file_inode(filp));
169 uid_t __user *argp;
170 uid_t uid;
167 171
168 switch (ioctl) { 172 switch (ioctl) {
169 case NS_GET_USERNS: 173 case NS_GET_USERNS:
@@ -172,6 +176,15 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
172 if (!ns->ops->get_parent) 176 if (!ns->ops->get_parent)
173 return -EINVAL; 177 return -EINVAL;
174 return open_related_ns(ns, ns->ops->get_parent); 178 return open_related_ns(ns, ns->ops->get_parent);
179 case NS_GET_NSTYPE:
180 return ns->ops->type;
181 case NS_GET_OWNER_UID:
182 if (ns->ops->type != CLONE_NEWUSER)
183 return -EINVAL;
184 user_ns = container_of(ns, struct user_namespace, ns);
185 argp = (uid_t __user *) arg;
186 uid = from_kuid_munged(current_user_ns(), user_ns->owner);
187 return put_user(uid, argp);
175 default: 188 default:
176 return -ENOTTY; 189 return -ENOTTY;
177 } 190 }
diff --git a/fs/pnode.c b/fs/pnode.c
index 06a793f4ae38..5bc7896d122a 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -322,6 +322,21 @@ out:
322 return ret; 322 return ret;
323} 323}
324 324
325static struct mount *find_topper(struct mount *mnt)
326{
327 /* If there is exactly one mount covering mnt completely return it. */
328 struct mount *child;
329
330 if (!list_is_singular(&mnt->mnt_mounts))
331 return NULL;
332
333 child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child);
334 if (child->mnt_mountpoint != mnt->mnt.mnt_root)
335 return NULL;
336
337 return child;
338}
339
325/* 340/*
326 * return true if the refcount is greater than count 341 * return true if the refcount is greater than count
327 */ 342 */
@@ -342,9 +357,8 @@ static inline int do_refcount_check(struct mount *mnt, int count)
342 */ 357 */
343int propagate_mount_busy(struct mount *mnt, int refcnt) 358int propagate_mount_busy(struct mount *mnt, int refcnt)
344{ 359{
345 struct mount *m, *child; 360 struct mount *m, *child, *topper;
346 struct mount *parent = mnt->mnt_parent; 361 struct mount *parent = mnt->mnt_parent;
347 int ret = 0;
348 362
349 if (mnt == parent) 363 if (mnt == parent)
350 return do_refcount_check(mnt, refcnt); 364 return do_refcount_check(mnt, refcnt);
@@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
359 373
360 for (m = propagation_next(parent, parent); m; 374 for (m = propagation_next(parent, parent); m;
361 m = propagation_next(m, parent)) { 375 m = propagation_next(m, parent)) {
362 child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); 376 int count = 1;
363 if (child && list_empty(&child->mnt_mounts) && 377 child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
364 (ret = do_refcount_check(child, 1))) 378 if (!child)
365 break; 379 continue;
380
381 /* Is there exactly one mount on the child that covers
382 * it completely whose reference should be ignored?
383 */
384 topper = find_topper(child);
385 if (topper)
386 count += 1;
387 else if (!list_empty(&child->mnt_mounts))
388 continue;
389
390 if (do_refcount_check(child, count))
391 return 1;
366 } 392 }
367 return ret; 393 return 0;
368} 394}
369 395
370/* 396/*
@@ -381,7 +407,7 @@ void propagate_mount_unlock(struct mount *mnt)
381 407
382 for (m = propagation_next(parent, parent); m; 408 for (m = propagation_next(parent, parent); m;
383 m = propagation_next(m, parent)) { 409 m = propagation_next(m, parent)) {
384 child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); 410 child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
385 if (child) 411 if (child)
386 child->mnt.mnt_flags &= ~MNT_LOCKED; 412 child->mnt.mnt_flags &= ~MNT_LOCKED;
387 } 413 }
@@ -399,9 +425,11 @@ static void mark_umount_candidates(struct mount *mnt)
399 425
400 for (m = propagation_next(parent, parent); m; 426 for (m = propagation_next(parent, parent); m;
401 m = propagation_next(m, parent)) { 427 m = propagation_next(m, parent)) {
402 struct mount *child = __lookup_mnt_last(&m->mnt, 428 struct mount *child = __lookup_mnt(&m->mnt,
403 mnt->mnt_mountpoint); 429 mnt->mnt_mountpoint);
404 if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) { 430 if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
431 continue;
432 if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
405 SET_MNT_MARK(child); 433 SET_MNT_MARK(child);
406 } 434 }
407 } 435 }
@@ -420,8 +448,8 @@ static void __propagate_umount(struct mount *mnt)
420 448
421 for (m = propagation_next(parent, parent); m; 449 for (m = propagation_next(parent, parent); m;
422 m = propagation_next(m, parent)) { 450 m = propagation_next(m, parent)) {
423 451 struct mount *topper;
424 struct mount *child = __lookup_mnt_last(&m->mnt, 452 struct mount *child = __lookup_mnt(&m->mnt,
425 mnt->mnt_mountpoint); 453 mnt->mnt_mountpoint);
426 /* 454 /*
427 * umount the child only if the child has no children 455 * umount the child only if the child has no children
@@ -430,6 +458,15 @@ static void __propagate_umount(struct mount *mnt)
430 if (!child || !IS_MNT_MARKED(child)) 458 if (!child || !IS_MNT_MARKED(child))
431 continue; 459 continue;
432 CLEAR_MNT_MARK(child); 460 CLEAR_MNT_MARK(child);
461
462 /* If there is exactly one mount covering all of child
463 * replace child with that mount.
464 */
465 topper = find_topper(child);
466 if (topper)
467 mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
468 topper);
469
433 if (list_empty(&child->mnt_mounts)) { 470 if (list_empty(&child->mnt_mounts)) {
434 list_del_init(&child->mnt_child); 471 list_del_init(&child->mnt_child);
435 child->mnt.mnt_flags |= MNT_UMOUNT; 472 child->mnt.mnt_flags |= MNT_UMOUNT;
diff --git a/fs/pnode.h b/fs/pnode.h
index 550f5a8b4fcf..dc87e65becd2 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -49,6 +49,8 @@ int get_dominating_id(struct mount *mnt, const struct path *root);
49unsigned int mnt_get_count(struct mount *mnt); 49unsigned int mnt_get_count(struct mount *mnt);
50void mnt_set_mountpoint(struct mount *, struct mountpoint *, 50void mnt_set_mountpoint(struct mount *, struct mountpoint *,
51 struct mount *); 51 struct mount *);
52void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
53 struct mount *mnt);
52struct mount *copy_tree(struct mount *, struct dentry *, int); 54struct mount *copy_tree(struct mount *, struct dentry *, int);
53bool is_path_reachable(struct mount *, struct dentry *, 55bool is_path_reachable(struct mount *, struct dentry *,
54 const struct path *root); 56 const struct path *root);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3d773eb9e144..b73b4de8fb36 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1667,12 +1667,63 @@ const struct inode_operations proc_pid_link_inode_operations = {
1667 1667
1668/* building an inode */ 1668/* building an inode */
1669 1669
1670void task_dump_owner(struct task_struct *task, mode_t mode,
1671 kuid_t *ruid, kgid_t *rgid)
1672{
1673 /* Depending on the state of dumpable compute who should own a
1674 * proc file for a task.
1675 */
1676 const struct cred *cred;
1677 kuid_t uid;
1678 kgid_t gid;
1679
1680 /* Default to the tasks effective ownership */
1681 rcu_read_lock();
1682 cred = __task_cred(task);
1683 uid = cred->euid;
1684 gid = cred->egid;
1685 rcu_read_unlock();
1686
1687 /*
1688 * Before the /proc/pid/status file was created the only way to read
1689 * the effective uid of a /process was to stat /proc/pid. Reading
1690 * /proc/pid/status is slow enough that procps and other packages
1691 * kept stating /proc/pid. To keep the rules in /proc simple I have
1692 * made this apply to all per process world readable and executable
1693 * directories.
1694 */
1695 if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
1696 struct mm_struct *mm;
1697 task_lock(task);
1698 mm = task->mm;
1699 /* Make non-dumpable tasks owned by some root */
1700 if (mm) {
1701 if (get_dumpable(mm) != SUID_DUMP_USER) {
1702 struct user_namespace *user_ns = mm->user_ns;
1703
1704 uid = make_kuid(user_ns, 0);
1705 if (!uid_valid(uid))
1706 uid = GLOBAL_ROOT_UID;
1707
1708 gid = make_kgid(user_ns, 0);
1709 if (!gid_valid(gid))
1710 gid = GLOBAL_ROOT_GID;
1711 }
1712 } else {
1713 uid = GLOBAL_ROOT_UID;
1714 gid = GLOBAL_ROOT_GID;
1715 }
1716 task_unlock(task);
1717 }
1718 *ruid = uid;
1719 *rgid = gid;
1720}
1721
1670struct inode *proc_pid_make_inode(struct super_block * sb, 1722struct inode *proc_pid_make_inode(struct super_block * sb,
1671 struct task_struct *task, umode_t mode) 1723 struct task_struct *task, umode_t mode)
1672{ 1724{
1673 struct inode * inode; 1725 struct inode * inode;
1674 struct proc_inode *ei; 1726 struct proc_inode *ei;
1675 const struct cred *cred;
1676 1727
1677 /* We need a new inode */ 1728 /* We need a new inode */
1678 1729
@@ -1694,13 +1745,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
1694 if (!ei->pid) 1745 if (!ei->pid)
1695 goto out_unlock; 1746 goto out_unlock;
1696 1747
1697 if (task_dumpable(task)) { 1748 task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
1698 rcu_read_lock();
1699 cred = __task_cred(task);
1700 inode->i_uid = cred->euid;
1701 inode->i_gid = cred->egid;
1702 rcu_read_unlock();
1703 }
1704 security_task_to_inode(task, inode); 1749 security_task_to_inode(task, inode);
1705 1750
1706out: 1751out:
@@ -1715,7 +1760,6 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1715{ 1760{
1716 struct inode *inode = d_inode(dentry); 1761 struct inode *inode = d_inode(dentry);
1717 struct task_struct *task; 1762 struct task_struct *task;
1718 const struct cred *cred;
1719 struct pid_namespace *pid = dentry->d_sb->s_fs_info; 1763 struct pid_namespace *pid = dentry->d_sb->s_fs_info;
1720 1764
1721 generic_fillattr(inode, stat); 1765 generic_fillattr(inode, stat);
@@ -1733,12 +1777,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1733 */ 1777 */
1734 return -ENOENT; 1778 return -ENOENT;
1735 } 1779 }
1736 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || 1780 task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid);
1737 task_dumpable(task)) {
1738 cred = __task_cred(task);
1739 stat->uid = cred->euid;
1740 stat->gid = cred->egid;
1741 }
1742 } 1781 }
1743 rcu_read_unlock(); 1782 rcu_read_unlock();
1744 return 0; 1783 return 0;
@@ -1754,18 +1793,11 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1754 * Rewrite the inode's ownerships here because the owning task may have 1793 * Rewrite the inode's ownerships here because the owning task may have
1755 * performed a setuid(), etc. 1794 * performed a setuid(), etc.
1756 * 1795 *
1757 * Before the /proc/pid/status file was created the only way to read
1758 * the effective uid of a /process was to stat /proc/pid. Reading
1759 * /proc/pid/status is slow enough that procps and other packages
1760 * kept stating /proc/pid. To keep the rules in /proc simple I have
1761 * made this apply to all per process world readable and executable
1762 * directories.
1763 */ 1796 */
1764int pid_revalidate(struct dentry *dentry, unsigned int flags) 1797int pid_revalidate(struct dentry *dentry, unsigned int flags)
1765{ 1798{
1766 struct inode *inode; 1799 struct inode *inode;
1767 struct task_struct *task; 1800 struct task_struct *task;
1768 const struct cred *cred;
1769 1801
1770 if (flags & LOOKUP_RCU) 1802 if (flags & LOOKUP_RCU)
1771 return -ECHILD; 1803 return -ECHILD;
@@ -1774,17 +1806,8 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
1774 task = get_proc_task(inode); 1806 task = get_proc_task(inode);
1775 1807
1776 if (task) { 1808 if (task) {
1777 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || 1809 task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
1778 task_dumpable(task)) { 1810
1779 rcu_read_lock();
1780 cred = __task_cred(task);
1781 inode->i_uid = cred->euid;
1782 inode->i_gid = cred->egid;
1783 rcu_read_unlock();
1784 } else {
1785 inode->i_uid = GLOBAL_ROOT_UID;
1786 inode->i_gid = GLOBAL_ROOT_GID;
1787 }
1788 inode->i_mode &= ~(S_ISUID | S_ISGID); 1811 inode->i_mode &= ~(S_ISUID | S_ISGID);
1789 security_task_to_inode(task, inode); 1812 security_task_to_inode(task, inode);
1790 put_task_struct(task); 1813 put_task_struct(task);
@@ -1881,7 +1904,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
1881 bool exact_vma_exists = false; 1904 bool exact_vma_exists = false;
1882 struct mm_struct *mm = NULL; 1905 struct mm_struct *mm = NULL;
1883 struct task_struct *task; 1906 struct task_struct *task;
1884 const struct cred *cred;
1885 struct inode *inode; 1907 struct inode *inode;
1886 int status = 0; 1908 int status = 0;
1887 1909
@@ -1906,16 +1928,8 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
1906 mmput(mm); 1928 mmput(mm);
1907 1929
1908 if (exact_vma_exists) { 1930 if (exact_vma_exists) {
1909 if (task_dumpable(task)) { 1931 task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
1910 rcu_read_lock(); 1932
1911 cred = __task_cred(task);
1912 inode->i_uid = cred->euid;
1913 inode->i_gid = cred->egid;
1914 rcu_read_unlock();
1915 } else {
1916 inode->i_uid = GLOBAL_ROOT_UID;
1917 inode->i_gid = GLOBAL_ROOT_GID;
1918 }
1919 security_task_to_inode(task, inode); 1933 security_task_to_inode(task, inode);
1920 status = 1; 1934 status = 1;
1921 } 1935 }
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 4274f83bf100..00ce1531b2f5 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -84,7 +84,6 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
84{ 84{
85 struct files_struct *files; 85 struct files_struct *files;
86 struct task_struct *task; 86 struct task_struct *task;
87 const struct cred *cred;
88 struct inode *inode; 87 struct inode *inode;
89 unsigned int fd; 88 unsigned int fd;
90 89
@@ -108,16 +107,7 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
108 rcu_read_unlock(); 107 rcu_read_unlock();
109 put_files_struct(files); 108 put_files_struct(files);
110 109
111 if (task_dumpable(task)) { 110 task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
112 rcu_read_lock();
113 cred = __task_cred(task);
114 inode->i_uid = cred->euid;
115 inode->i_gid = cred->egid;
116 rcu_read_unlock();
117 } else {
118 inode->i_uid = GLOBAL_ROOT_UID;
119 inode->i_gid = GLOBAL_ROOT_GID;
120 }
121 111
122 if (S_ISLNK(inode->i_mode)) { 112 if (S_ISLNK(inode->i_mode)) {
123 unsigned i_mode = S_IFLNK; 113 unsigned i_mode = S_IFLNK;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 842a5ff5b85c..7ad9ed7958af 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -43,10 +43,11 @@ static void proc_evict_inode(struct inode *inode)
43 de = PDE(inode); 43 de = PDE(inode);
44 if (de) 44 if (de)
45 pde_put(de); 45 pde_put(de);
46
46 head = PROC_I(inode)->sysctl; 47 head = PROC_I(inode)->sysctl;
47 if (head) { 48 if (head) {
48 RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); 49 RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
49 sysctl_head_put(head); 50 proc_sys_evict_inode(inode, head);
50 } 51 }
51} 52}
52 53
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 2de5194ba378..5d6960f5f1c0 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -65,6 +65,7 @@ struct proc_inode {
65 struct proc_dir_entry *pde; 65 struct proc_dir_entry *pde;
66 struct ctl_table_header *sysctl; 66 struct ctl_table_header *sysctl;
67 struct ctl_table *sysctl_entry; 67 struct ctl_table *sysctl_entry;
68 struct list_head sysctl_inodes;
68 const struct proc_ns_operations *ns_ops; 69 const struct proc_ns_operations *ns_ops;
69 struct inode vfs_inode; 70 struct inode vfs_inode;
70}; 71};
@@ -97,20 +98,8 @@ static inline struct task_struct *get_proc_task(struct inode *inode)
97 return get_pid_task(proc_pid(inode), PIDTYPE_PID); 98 return get_pid_task(proc_pid(inode), PIDTYPE_PID);
98} 99}
99 100
100static inline int task_dumpable(struct task_struct *task) 101void task_dump_owner(struct task_struct *task, mode_t mode,
101{ 102 kuid_t *ruid, kgid_t *rgid);
102 int dumpable = 0;
103 struct mm_struct *mm;
104
105 task_lock(task);
106 mm = task->mm;
107 if (mm)
108 dumpable = get_dumpable(mm);
109 task_unlock(task);
110 if (dumpable == SUID_DUMP_USER)
111 return 1;
112 return 0;
113}
114 103
115static inline unsigned name_to_int(const struct qstr *qstr) 104static inline unsigned name_to_int(const struct qstr *qstr)
116{ 105{
@@ -249,10 +238,12 @@ extern void proc_thread_self_init(void);
249 */ 238 */
250#ifdef CONFIG_PROC_SYSCTL 239#ifdef CONFIG_PROC_SYSCTL
251extern int proc_sys_init(void); 240extern int proc_sys_init(void);
252extern void sysctl_head_put(struct ctl_table_header *); 241extern void proc_sys_evict_inode(struct inode *inode,
242 struct ctl_table_header *head);
253#else 243#else
254static inline void proc_sys_init(void) { } 244static inline void proc_sys_init(void) { }
255static inline void sysctl_head_put(struct ctl_table_header *head) { } 245static inline void proc_sys_evict_inode(struct inode *inode,
246 struct ctl_table_header *head) { }
256#endif 247#endif
257 248
258/* 249/*
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index d4e37acd4821..3e64c6502dc8 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -190,6 +190,7 @@ static void init_header(struct ctl_table_header *head,
190 head->set = set; 190 head->set = set;
191 head->parent = NULL; 191 head->parent = NULL;
192 head->node = node; 192 head->node = node;
193 INIT_LIST_HEAD(&head->inodes);
193 if (node) { 194 if (node) {
194 struct ctl_table *entry; 195 struct ctl_table *entry;
195 for (entry = table; entry->procname; entry++, node++) 196 for (entry = table; entry->procname; entry++, node++)
@@ -259,6 +260,27 @@ static void unuse_table(struct ctl_table_header *p)
259 complete(p->unregistering); 260 complete(p->unregistering);
260} 261}
261 262
263/* called under sysctl_lock */
264static void proc_sys_prune_dcache(struct ctl_table_header *head)
265{
266 struct inode *inode, *prev = NULL;
267 struct proc_inode *ei;
268
269 rcu_read_lock();
270 list_for_each_entry_rcu(ei, &head->inodes, sysctl_inodes) {
271 inode = igrab(&ei->vfs_inode);
272 if (inode) {
273 rcu_read_unlock();
274 iput(prev);
275 prev = inode;
276 d_prune_aliases(inode);
277 rcu_read_lock();
278 }
279 }
280 rcu_read_unlock();
281 iput(prev);
282}
283
262/* called under sysctl_lock, will reacquire if has to wait */ 284/* called under sysctl_lock, will reacquire if has to wait */
263static void start_unregistering(struct ctl_table_header *p) 285static void start_unregistering(struct ctl_table_header *p)
264{ 286{
@@ -272,31 +294,22 @@ static void start_unregistering(struct ctl_table_header *p)
272 p->unregistering = &wait; 294 p->unregistering = &wait;
273 spin_unlock(&sysctl_lock); 295 spin_unlock(&sysctl_lock);
274 wait_for_completion(&wait); 296 wait_for_completion(&wait);
275 spin_lock(&sysctl_lock);
276 } else { 297 } else {
277 /* anything non-NULL; we'll never dereference it */ 298 /* anything non-NULL; we'll never dereference it */
278 p->unregistering = ERR_PTR(-EINVAL); 299 p->unregistering = ERR_PTR(-EINVAL);
300 spin_unlock(&sysctl_lock);
279 } 301 }
280 /* 302 /*
303 * Prune dentries for unregistered sysctls: namespaced sysctls
304 * can have duplicate names and contaminate dcache very badly.
305 */
306 proc_sys_prune_dcache(p);
307 /*
281 * do not remove from the list until nobody holds it; walking the 308 * do not remove from the list until nobody holds it; walking the
282 * list in do_sysctl() relies on that. 309 * list in do_sysctl() relies on that.
283 */ 310 */
284 erase_header(p);
285}
286
287static void sysctl_head_get(struct ctl_table_header *head)
288{
289 spin_lock(&sysctl_lock); 311 spin_lock(&sysctl_lock);
290 head->count++; 312 erase_header(p);
291 spin_unlock(&sysctl_lock);
292}
293
294void sysctl_head_put(struct ctl_table_header *head)
295{
296 spin_lock(&sysctl_lock);
297 if (!--head->count)
298 kfree_rcu(head, rcu);
299 spin_unlock(&sysctl_lock);
300} 313}
301 314
302static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) 315static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
@@ -440,10 +453,20 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
440 453
441 inode->i_ino = get_next_ino(); 454 inode->i_ino = get_next_ino();
442 455
443 sysctl_head_get(head);
444 ei = PROC_I(inode); 456 ei = PROC_I(inode);
457
458 spin_lock(&sysctl_lock);
459 if (unlikely(head->unregistering)) {
460 spin_unlock(&sysctl_lock);
461 iput(inode);
462 inode = NULL;
463 goto out;
464 }
445 ei->sysctl = head; 465 ei->sysctl = head;
446 ei->sysctl_entry = table; 466 ei->sysctl_entry = table;
467 list_add_rcu(&ei->sysctl_inodes, &head->inodes);
468 head->count++;
469 spin_unlock(&sysctl_lock);
447 470
448 inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); 471 inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
449 inode->i_mode = table->mode; 472 inode->i_mode = table->mode;
@@ -466,6 +489,15 @@ out:
466 return inode; 489 return inode;
467} 490}
468 491
492void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
493{
494 spin_lock(&sysctl_lock);
495 list_del_rcu(&PROC_I(inode)->sysctl_inodes);
496 if (!--head->count)
497 kfree_rcu(head, rcu);
498 spin_unlock(&sysctl_lock);
499}
500
469static struct ctl_table_header *grab_header(struct inode *inode) 501static struct ctl_table_header *grab_header(struct inode *inode)
470{ 502{
471 struct ctl_table_header *head = PROC_I(inode)->sysctl; 503 struct ctl_table_header *head = PROC_I(inode)->sysctl;
diff --git a/fs/super.c b/fs/super.c
index ea662b0e5e78..b8b6a086c03b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -469,7 +469,7 @@ struct super_block *sget_userns(struct file_system_type *type,
469 struct super_block *old; 469 struct super_block *old;
470 int err; 470 int err;
471 471
472 if (!(flags & MS_KERNMOUNT) && 472 if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) &&
473 !(type->fs_flags & FS_USERNS_MOUNT) && 473 !(type->fs_flags & FS_USERNS_MOUNT) &&
474 !capable(CAP_SYS_ADMIN)) 474 !capable(CAP_SYS_ADMIN))
475 return ERR_PTR(-EPERM); 475 return ERR_PTR(-EPERM);
@@ -499,7 +499,7 @@ retry:
499 } 499 }
500 if (!s) { 500 if (!s) {
501 spin_unlock(&sb_lock); 501 spin_unlock(&sb_lock);
502 s = alloc_super(type, flags, user_ns); 502 s = alloc_super(type, (flags & ~MS_SUBMOUNT), user_ns);
503 if (!s) 503 if (!s)
504 return ERR_PTR(-ENOMEM); 504 return ERR_PTR(-ENOMEM);
505 goto retry; 505 goto retry;
@@ -540,8 +540,15 @@ struct super_block *sget(struct file_system_type *type,
540{ 540{
541 struct user_namespace *user_ns = current_user_ns(); 541 struct user_namespace *user_ns = current_user_ns();
542 542
543 /* We don't yet pass the user namespace of the parent
544 * mount through to here so always use &init_user_ns
545 * until that changes.
546 */
547 if (flags & MS_SUBMOUNT)
548 user_ns = &init_user_ns;
549
543 /* Ensure the requestor has permissions over the target filesystem */ 550 /* Ensure the requestor has permissions over the target filesystem */
544 if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) 551 if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN))
545 return ERR_PTR(-EPERM); 552 return ERR_PTR(-EPERM);
546 553
547 return sget_userns(type, test, set, flags, user_ns, data); 554 return sget_userns(type, test, set, flags, user_ns, data);
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 9d571acd3a48..7dff776e6d16 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -98,9 +98,10 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent);
98struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, 98struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
99 const char *dest); 99 const char *dest);
100 100
101typedef struct vfsmount *(*debugfs_automount_t)(struct dentry *, void *);
101struct dentry *debugfs_create_automount(const char *name, 102struct dentry *debugfs_create_automount(const char *name,
102 struct dentry *parent, 103 struct dentry *parent,
103 struct vfsmount *(*f)(void *), 104 debugfs_automount_t f,
104 void *data); 105 void *data);
105 106
106void debugfs_remove(struct dentry *dentry); 107void debugfs_remove(struct dentry *dentry);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 487246546ebe..e6e689b5569e 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -16,6 +16,7 @@
16#include <linux/spinlock.h> 16#include <linux/spinlock.h>
17#include <linux/types.h> 17#include <linux/types.h>
18#include <linux/atomic.h> 18#include <linux/atomic.h>
19#include <linux/user_namespace.h>
19 20
20/* 21/*
21 * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily 22 * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily
@@ -170,7 +171,7 @@ struct fsnotify_group {
170 struct inotify_group_private_data { 171 struct inotify_group_private_data {
171 spinlock_t idr_lock; 172 spinlock_t idr_lock;
172 struct idr idr; 173 struct idr idr;
173 struct user_struct *user; 174 struct ucounts *ucounts;
174 } inotify_data; 175 } inotify_data;
175#endif 176#endif
176#ifdef CONFIG_FANOTIFY 177#ifdef CONFIG_FANOTIFY
diff --git a/include/linux/mount.h b/include/linux/mount.h
index c6f55158d5e5..8e0352af06b7 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -90,6 +90,9 @@ struct file_system_type;
90extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, 90extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
91 int flags, const char *name, 91 int flags, const char *name,
92 void *data); 92 void *data);
93extern struct vfsmount *vfs_submount(const struct dentry *mountpoint,
94 struct file_system_type *type,
95 const char *name, void *data);
93 96
94extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); 97extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list);
95extern void mark_mounts_for_expiry(struct list_head *mounts); 98extern void mark_mounts_for_expiry(struct list_head *mounts);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c8e519d0b4a3..451e241f32c5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -846,10 +846,6 @@ struct user_struct {
846 atomic_t __count; /* reference count */ 846 atomic_t __count; /* reference count */
847 atomic_t processes; /* How many processes does this user have? */ 847 atomic_t processes; /* How many processes does this user have? */
848 atomic_t sigpending; /* How many pending signals does this user have? */ 848 atomic_t sigpending; /* How many pending signals does this user have? */
849#ifdef CONFIG_INOTIFY_USER
850 atomic_t inotify_watches; /* How many inotify watches does this user have? */
851 atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
852#endif
853#ifdef CONFIG_FANOTIFY 849#ifdef CONFIG_FANOTIFY
854 atomic_t fanotify_listeners; 850 atomic_t fanotify_listeners;
855#endif 851#endif
@@ -3051,6 +3047,9 @@ extern bool current_is_single_threaded(void);
3051#define for_each_process_thread(p, t) \ 3047#define for_each_process_thread(p, t) \
3052 for_each_process(p) for_each_thread(p, t) 3048 for_each_process(p) for_each_thread(p, t)
3053 3049
3050typedef int (*proc_visitor)(struct task_struct *p, void *data);
3051void walk_process_tree(struct task_struct *top, proc_visitor, void *);
3052
3054static inline int get_nr_threads(struct task_struct *tsk) 3053static inline int get_nr_threads(struct task_struct *tsk)
3055{ 3054{
3056 return tsk->signal->nr_threads; 3055 return tsk->signal->nr_threads;
diff --git a/include/linux/security.h b/include/linux/security.h
index d3868f2ebada..96899fad7016 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -140,8 +140,7 @@ struct request_sock;
140/* bprm->unsafe reasons */ 140/* bprm->unsafe reasons */
141#define LSM_UNSAFE_SHARE 1 141#define LSM_UNSAFE_SHARE 1
142#define LSM_UNSAFE_PTRACE 2 142#define LSM_UNSAFE_PTRACE 2
143#define LSM_UNSAFE_PTRACE_CAP 4 143#define LSM_UNSAFE_NO_NEW_PRIVS 4
144#define LSM_UNSAFE_NO_NEW_PRIVS 8
145 144
146#ifdef CONFIG_MMU 145#ifdef CONFIG_MMU
147extern int mmap_min_addr_handler(struct ctl_table *table, int write, 146extern int mmap_min_addr_handler(struct ctl_table *table, int write,
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index adf4e51cf597..b7e82049fec7 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -143,6 +143,7 @@ struct ctl_table_header
143 struct ctl_table_set *set; 143 struct ctl_table_set *set;
144 struct ctl_dir *parent; 144 struct ctl_dir *parent;
145 struct ctl_node *node; 145 struct ctl_node *node;
146 struct list_head inodes; /* head for proc_inode->sysctl_inodes */
146}; 147};
147 148
148struct ctl_dir { 149struct ctl_dir {
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index eb209d4523f5..363e0e8082a9 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -32,6 +32,10 @@ enum ucount_type {
32 UCOUNT_NET_NAMESPACES, 32 UCOUNT_NET_NAMESPACES,
33 UCOUNT_MNT_NAMESPACES, 33 UCOUNT_MNT_NAMESPACES,
34 UCOUNT_CGROUP_NAMESPACES, 34 UCOUNT_CGROUP_NAMESPACES,
35#ifdef CONFIG_INOTIFY_USER
36 UCOUNT_INOTIFY_INSTANCES,
37 UCOUNT_INOTIFY_WATCHES,
38#endif
35 UCOUNT_COUNTS, 39 UCOUNT_COUNTS,
36}; 40};
37 41
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 36da93fbf188..048a85e9f017 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -132,6 +132,7 @@ struct inodes_stat_t {
132#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ 132#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
133 133
134/* These sb flags are internal to the kernel */ 134/* These sb flags are internal to the kernel */
135#define MS_SUBMOUNT (1<<26)
135#define MS_NOREMOTELOCK (1<<27) 136#define MS_NOREMOTELOCK (1<<27)
136#define MS_NOSEC (1<<28) 137#define MS_NOSEC (1<<28)
137#define MS_BORN (1<<29) 138#define MS_BORN (1<<29)
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index 3af617230d1b..1a3ca79f466b 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -6,8 +6,13 @@
6#define NSIO 0xb7 6#define NSIO 0xb7
7 7
8/* Returns a file descriptor that refers to an owning user namespace */ 8/* Returns a file descriptor that refers to an owning user namespace */
9#define NS_GET_USERNS _IO(NSIO, 0x1) 9#define NS_GET_USERNS _IO(NSIO, 0x1)
10/* Returns a file descriptor that refers to a parent namespace */ 10/* Returns a file descriptor that refers to a parent namespace */
11#define NS_GET_PARENT _IO(NSIO, 0x2) 11#define NS_GET_PARENT _IO(NSIO, 0x2)
12/* Returns the type of namespace (CLONE_NEW* value) referred to by
13 file descriptor */
14#define NS_GET_NSTYPE _IO(NSIO, 0x3)
15/* Get owner UID (in the caller's user namespace) for a user namespace */
16#define NS_GET_OWNER_UID _IO(NSIO, 0x4)
12 17
13#endif /* __LINUX_NSFS_H */ 18#endif /* __LINUX_NSFS_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index 580da79e38ee..9960accbf2ab 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -607,15 +607,18 @@ static struct task_struct *find_new_reaper(struct task_struct *father,
607 return thread; 607 return thread;
608 608
609 if (father->signal->has_child_subreaper) { 609 if (father->signal->has_child_subreaper) {
610 unsigned int ns_level = task_pid(father)->level;
610 /* 611 /*
611 * Find the first ->is_child_subreaper ancestor in our pid_ns. 612 * Find the first ->is_child_subreaper ancestor in our pid_ns.
612 * We start from father to ensure we can not look into another 613 * We can't check reaper != child_reaper to ensure we do not
613 * namespace, this is safe because all its threads are dead. 614 * cross the namespaces, the exiting parent could be injected
615 * by setns() + fork().
616 * We check pid->level, this is slightly more efficient than
617 * task_active_pid_ns(reaper) != task_active_pid_ns(father).
614 */ 618 */
615 for (reaper = father; 619 for (reaper = father->real_parent;
616 !same_thread_group(reaper, child_reaper); 620 task_pid(reaper)->level == ns_level;
617 reaper = reaper->real_parent) { 621 reaper = reaper->real_parent) {
618 /* call_usermodehelper() descendants need this check */
619 if (reaper == &init_task) 622 if (reaper == &init_task)
620 break; 623 break;
621 if (!reaper->signal->is_child_subreaper) 624 if (!reaper->signal->is_child_subreaper)
diff --git a/kernel/fork.c b/kernel/fork.c
index d12fcc4db8a3..348fe73155bc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1377,9 +1377,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1377 sig->oom_score_adj = current->signal->oom_score_adj; 1377 sig->oom_score_adj = current->signal->oom_score_adj;
1378 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 1378 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1379 1379
1380 sig->has_child_subreaper = current->signal->has_child_subreaper ||
1381 current->signal->is_child_subreaper;
1382
1383 mutex_init(&sig->cred_guard_mutex); 1380 mutex_init(&sig->cred_guard_mutex);
1384 1381
1385 return 0; 1382 return 0;
@@ -1814,6 +1811,13 @@ static __latent_entropy struct task_struct *copy_process(
1814 1811
1815 p->signal->leader_pid = pid; 1812 p->signal->leader_pid = pid;
1816 p->signal->tty = tty_kref_get(current->signal->tty); 1813 p->signal->tty = tty_kref_get(current->signal->tty);
1814 /*
1815 * Inherit has_child_subreaper flag under the same
1816 * tasklist_lock with adding child to the process tree
1817 * for propagate_has_child_subreaper optimization.
1818 */
1819 p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
1820 p->real_parent->signal->is_child_subreaper;
1817 list_add_tail(&p->sibling, &p->real_parent->children); 1821 list_add_tail(&p->sibling, &p->real_parent->children);
1818 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1822 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1819 attach_pid(p, PIDTYPE_PGID); 1823 attach_pid(p, PIDTYPE_PGID);
@@ -2067,6 +2071,38 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2067} 2071}
2068#endif 2072#endif
2069 2073
2074void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
2075{
2076 struct task_struct *leader, *parent, *child;
2077 int res;
2078
2079 read_lock(&tasklist_lock);
2080 leader = top = top->group_leader;
2081down:
2082 for_each_thread(leader, parent) {
2083 list_for_each_entry(child, &parent->children, sibling) {
2084 res = visitor(child, data);
2085 if (res) {
2086 if (res < 0)
2087 goto out;
2088 leader = child;
2089 goto down;
2090 }
2091up:
2092 ;
2093 }
2094 }
2095
2096 if (leader != top) {
2097 child = leader;
2098 parent = child->real_parent;
2099 leader = parent->group_leader;
2100 goto up;
2101 }
2102out:
2103 read_unlock(&tasklist_lock);
2104}
2105
2070#ifndef ARCH_MIN_MMSTRUCT_ALIGN 2106#ifndef ARCH_MIN_MMSTRUCT_ALIGN
2071#define ARCH_MIN_MMSTRUCT_ALIGN 0 2107#define ARCH_MIN_MMSTRUCT_ALIGN 0
2072#endif 2108#endif
diff --git a/kernel/sys.c b/kernel/sys.c
index 7d4a9a6df956..b07adca97ea3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2063,6 +2063,24 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
2063} 2063}
2064#endif 2064#endif
2065 2065
2066static int propagate_has_child_subreaper(struct task_struct *p, void *data)
2067{
2068 /*
2068 /*
2069 * If task has has_child_subreaper - all its descendants
2070 * already have this flag too and new descendants will
2071 * inherit it on fork, skip them.
2072 *
2073 * If we've found child_reaper - skip descendants in
2074 * its subtree as they will never get out of the pidns.
2075 */
2076 if (p->signal->has_child_subreaper ||
2077 is_child_reaper(task_pid(p)))
2078 return 0;
2079
2080 p->signal->has_child_subreaper = 1;
2081 return 1;
2082}
2083
2066SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, 2084SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2067 unsigned long, arg4, unsigned long, arg5) 2085 unsigned long, arg4, unsigned long, arg5)
2068{ 2086{
@@ -2214,6 +2232,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2214 break; 2232 break;
2215 case PR_SET_CHILD_SUBREAPER: 2233 case PR_SET_CHILD_SUBREAPER:
2216 me->signal->is_child_subreaper = !!arg2; 2234 me->signal->is_child_subreaper = !!arg2;
2235 if (!arg2)
2236 break;
2237
2238 walk_process_tree(me, propagate_has_child_subreaper, NULL);
2217 break; 2239 break;
2218 case PR_GET_CHILD_SUBREAPER: 2240 case PR_GET_CHILD_SUBREAPER:
2219 error = put_user(me->signal->is_child_subreaper, 2241 error = put_user(me->signal->is_child_subreaper,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d7449783987a..310f0ea0d1a2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7503,7 +7503,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
7503 ftrace_init_tracefs(tr, d_tracer); 7503 ftrace_init_tracefs(tr, d_tracer);
7504} 7504}
7505 7505
7506static struct vfsmount *trace_automount(void *ingore) 7506static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
7507{ 7507{
7508 struct vfsmount *mnt; 7508 struct vfsmount *mnt;
7509 struct file_system_type *type; 7509 struct file_system_type *type;
@@ -7516,7 +7516,7 @@ static struct vfsmount *trace_automount(void *ingore)
7516 type = get_fs_type("tracefs"); 7516 type = get_fs_type("tracefs");
7517 if (!type) 7517 if (!type)
7518 return NULL; 7518 return NULL;
7519 mnt = vfs_kern_mount(type, 0, "tracefs", NULL); 7519 mnt = vfs_submount(mntpt, type, "tracefs", NULL);
7520 put_filesystem(type); 7520 put_filesystem(type);
7521 if (IS_ERR(mnt)) 7521 if (IS_ERR(mnt))
7522 return NULL; 7522 return NULL;
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 95c6336fc2b3..8a11fc0cb459 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -57,7 +57,7 @@ static struct ctl_table_root set_root = {
57 57
58static int zero = 0; 58static int zero = 0;
59static int int_max = INT_MAX; 59static int int_max = INT_MAX;
60#define UCOUNT_ENTRY(name) \ 60#define UCOUNT_ENTRY(name) \
61 { \ 61 { \
62 .procname = name, \ 62 .procname = name, \
63 .maxlen = sizeof(int), \ 63 .maxlen = sizeof(int), \
@@ -74,6 +74,10 @@ static struct ctl_table user_table[] = {
74 UCOUNT_ENTRY("max_net_namespaces"), 74 UCOUNT_ENTRY("max_net_namespaces"),
75 UCOUNT_ENTRY("max_mnt_namespaces"), 75 UCOUNT_ENTRY("max_mnt_namespaces"),
76 UCOUNT_ENTRY("max_cgroup_namespaces"), 76 UCOUNT_ENTRY("max_cgroup_namespaces"),
77#ifdef CONFIG_INOTIFY_USER
78 UCOUNT_ENTRY("max_inotify_instances"),
79 UCOUNT_ENTRY("max_inotify_watches"),
80#endif
77 { } 81 { }
78}; 82};
79#endif /* CONFIG_SYSCTL */ 83#endif /* CONFIG_SYSCTL */
diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index ef4beef06e9d..001e133a3c8c 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -471,7 +471,7 @@ int apparmor_bprm_set_creds(struct linux_binprm *bprm)
471 ; 471 ;
472 } 472 }
473 473
474 if (bprm->unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) { 474 if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
475 error = may_change_ptraced_domain(new_profile); 475 error = may_change_ptraced_domain(new_profile);
476 if (error) 476 if (error)
477 goto audit; 477 goto audit;
diff --git a/security/commoncap.c b/security/commoncap.c
index 6d4d586b9356..78b37838a2d3 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -548,9 +548,10 @@ skip:
548 548
549 if ((is_setid || 549 if ((is_setid ||
550 !cap_issubset(new->cap_permitted, old->cap_permitted)) && 550 !cap_issubset(new->cap_permitted, old->cap_permitted)) &&
551 bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) { 551 ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) ||
552 !ptracer_capable(current, new->user_ns))) {
552 /* downgrade; they get no more than they had, and maybe less */ 553 /* downgrade; they get no more than they had, and maybe less */
553 if (!capable(CAP_SETUID) || 554 if (!ns_capable(new->user_ns, CAP_SETUID) ||
554 (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) { 555 (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
555 new->euid = new->uid; 556 new->euid = new->uid;
556 new->egid = new->gid; 557 new->egid = new->gid;
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index e6b1b7410321..9a8f12f8d5b7 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2399,8 +2399,7 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm)
2399 2399
2400 /* Make sure that anyone attempting to ptrace over a task that 2400 /* Make sure that anyone attempting to ptrace over a task that
2401 * changes its SID has the appropriate permit */ 2401 * changes its SID has the appropriate permit */
2402 if (bprm->unsafe & 2402 if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
2403 (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) {
2404 u32 ptsid = ptrace_parent_sid(); 2403 u32 ptsid = ptrace_parent_sid();
2405 if (ptsid != 0) { 2404 if (ptsid != 0) {
2406 rc = avc_has_perm(ptsid, new_tsec->sid, 2405 rc = avc_has_perm(ptsid, new_tsec->sid,
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 60b4217b9b68..fc8fb31fc24f 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -931,7 +931,7 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm)
931 isp->smk_task != sbsp->smk_root) 931 isp->smk_task != sbsp->smk_root)
932 return 0; 932 return 0;
933 933
934 if (bprm->unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) { 934 if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
935 struct task_struct *tracer; 935 struct task_struct *tracer;
936 rc = 0; 936 rc = 0;
937 937