summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/afs/mntpt.c2
-rw-r--r--fs/autofs4/waitq.c4
-rw-r--r--fs/cifs/cifs_dfs_ref.c7
-rw-r--r--fs/debugfs/inode.c8
-rw-r--r--fs/exec.c10
-rw-r--r--fs/mount.h1
-rw-r--r--fs/namei.c9
-rw-r--r--fs/namespace.c127
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/nfs4namespace.c2
-rw-r--r--fs/notify/inotify/inotify.h17
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c6
-rw-r--r--fs/notify/inotify/inotify_user.c34
-rw-r--r--fs/nsfs.c13
-rw-r--r--fs/pnode.c61
-rw-r--r--fs/pnode.h2
-rw-r--r--fs/proc/base.c102
-rw-r--r--fs/proc/fd.c12
-rw-r--r--fs/proc/inode.c3
-rw-r--r--fs/proc/internal.h23
-rw-r--r--fs/proc/proc_sysctl.c66
-rw-r--r--fs/super.c13
-rw-r--r--include/linux/debugfs.h3
-rw-r--r--include/linux/fsnotify_backend.h3
-rw-r--r--include/linux/mount.h3
-rw-r--r--include/linux/sched.h7
-rw-r--r--include/linux/security.h3
-rw-r--r--include/linux/sysctl.h1
-rw-r--r--include/linux/user_namespace.h4
-rw-r--r--include/uapi/linux/fs.h1
-rw-r--r--include/uapi/linux/nsfs.h9
-rw-r--r--kernel/exit.c13
-rw-r--r--kernel/fork.c42
-rw-r--r--kernel/sys.c22
-rw-r--r--kernel/trace/trace.c4
-rw-r--r--kernel/ucount.c6
-rw-r--r--security/apparmor/domain.c2
-rw-r--r--security/commoncap.c5
-rw-r--r--security/selinux/hooks.c3
-rw-r--r--security/smack/smack_lsm.c2
40 files changed, 431 insertions, 226 deletions
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 81dd075356b9..d4fb0afc0097 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -202,7 +202,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
202 202
203 /* try and do the mount */ 203 /* try and do the mount */
204 _debug("--- attempting mount %s -o %s ---", devname, options); 204 _debug("--- attempting mount %s -o %s ---", devname, options);
205 mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options); 205 mnt = vfs_submount(mntpt, &afs_fs_type, devname, options);
206 _debug("--- mount result %p ---", mnt); 206 _debug("--- mount result %p ---", mnt);
207 207
208 free_page((unsigned long) devname); 208 free_page((unsigned long) devname);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 1278335ce366..79fbd85db4ba 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -436,8 +436,8 @@ int autofs4_wait(struct autofs_sb_info *sbi,
436 memcpy(&wq->name, &qstr, sizeof(struct qstr)); 436 memcpy(&wq->name, &qstr, sizeof(struct qstr));
437 wq->dev = autofs4_get_dev(sbi); 437 wq->dev = autofs4_get_dev(sbi);
438 wq->ino = autofs4_get_ino(sbi); 438 wq->ino = autofs4_get_ino(sbi);
439 wq->uid = current_real_cred()->uid; 439 wq->uid = current_cred()->uid;
440 wq->gid = current_real_cred()->gid; 440 wq->gid = current_cred()->gid;
441 wq->pid = pid; 441 wq->pid = pid;
442 wq->tgid = tgid; 442 wq->tgid = tgid;
443 wq->status = -EINTR; /* Status return if interrupted */ 443 wq->status = -EINTR; /* Status return if interrupted */
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index ec9dbbcca3b9..9156be545b0f 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -245,7 +245,8 @@ compose_mount_options_err:
245 * @fullpath: full path in UNC format 245 * @fullpath: full path in UNC format
246 * @ref: server's referral 246 * @ref: server's referral
247 */ 247 */
248static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, 248static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt,
249 struct cifs_sb_info *cifs_sb,
249 const char *fullpath, const struct dfs_info3_param *ref) 250 const char *fullpath, const struct dfs_info3_param *ref)
250{ 251{
251 struct vfsmount *mnt; 252 struct vfsmount *mnt;
@@ -259,7 +260,7 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
259 if (IS_ERR(mountdata)) 260 if (IS_ERR(mountdata))
260 return (struct vfsmount *)mountdata; 261 return (struct vfsmount *)mountdata;
261 262
262 mnt = vfs_kern_mount(&cifs_fs_type, 0, devname, mountdata); 263 mnt = vfs_submount(mntpt, &cifs_fs_type, devname, mountdata);
263 kfree(mountdata); 264 kfree(mountdata);
264 kfree(devname); 265 kfree(devname);
265 return mnt; 266 return mnt;
@@ -334,7 +335,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
334 mnt = ERR_PTR(-EINVAL); 335 mnt = ERR_PTR(-EINVAL);
335 break; 336 break;
336 } 337 }
337 mnt = cifs_dfs_do_refmount(cifs_sb, 338 mnt = cifs_dfs_do_refmount(mntpt, cifs_sb,
338 full_path, referrals + i); 339 full_path, referrals + i);
339 cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n", 340 cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n",
340 __func__, referrals[i].node_name, mnt); 341 __func__, referrals[i].node_name, mnt);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 7fb1732a3630..7fd4ec4bb214 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -187,9 +187,9 @@ static const struct super_operations debugfs_super_operations = {
187 187
188static struct vfsmount *debugfs_automount(struct path *path) 188static struct vfsmount *debugfs_automount(struct path *path)
189{ 189{
190 struct vfsmount *(*f)(void *); 190 debugfs_automount_t f;
191 f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata; 191 f = (debugfs_automount_t)path->dentry->d_fsdata;
192 return f(d_inode(path->dentry)->i_private); 192 return f(path->dentry, d_inode(path->dentry)->i_private);
193} 193}
194 194
195static const struct dentry_operations debugfs_dops = { 195static const struct dentry_operations debugfs_dops = {
@@ -540,7 +540,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir);
540 */ 540 */
541struct dentry *debugfs_create_automount(const char *name, 541struct dentry *debugfs_create_automount(const char *name,
542 struct dentry *parent, 542 struct dentry *parent,
543 struct vfsmount *(*f)(void *), 543 debugfs_automount_t f,
544 void *data) 544 void *data)
545{ 545{
546 struct dentry *dentry = start_creating(name, parent); 546 struct dentry *dentry = start_creating(name, parent);
diff --git a/fs/exec.c b/fs/exec.c
index e57946610733..698a86094f76 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1426,12 +1426,8 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
1426 struct task_struct *p = current, *t; 1426 struct task_struct *p = current, *t;
1427 unsigned n_fs; 1427 unsigned n_fs;
1428 1428
1429 if (p->ptrace) { 1429 if (p->ptrace)
1430 if (ptracer_capable(p, current_user_ns())) 1430 bprm->unsafe |= LSM_UNSAFE_PTRACE;
1431 bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
1432 else
1433 bprm->unsafe |= LSM_UNSAFE_PTRACE;
1434 }
1435 1431
1436 /* 1432 /*
1437 * This isn't strictly necessary, but it makes it harder for LSMs to 1433 * This isn't strictly necessary, but it makes it harder for LSMs to
@@ -1479,7 +1475,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm)
1479 if (task_no_new_privs(current)) 1475 if (task_no_new_privs(current))
1480 return; 1476 return;
1481 1477
1482 inode = file_inode(bprm->file); 1478 inode = bprm->file->f_path.dentry->d_inode;
1483 mode = READ_ONCE(inode->i_mode); 1479 mode = READ_ONCE(inode->i_mode);
1484 if (!(mode & (S_ISUID|S_ISGID))) 1480 if (!(mode & (S_ISUID|S_ISGID)))
1485 return; 1481 return;
diff --git a/fs/mount.h b/fs/mount.h
index 2c856fc47ae3..2826543a131d 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -89,7 +89,6 @@ static inline int is_mounted(struct vfsmount *mnt)
89} 89}
90 90
91extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); 91extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
92extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
93 92
94extern int __legitimize_mnt(struct vfsmount *, unsigned); 93extern int __legitimize_mnt(struct vfsmount *, unsigned);
95extern bool legitimize_mnt(struct vfsmount *, unsigned); 94extern bool legitimize_mnt(struct vfsmount *, unsigned);
diff --git a/fs/namei.c b/fs/namei.c
index ad74877e1442..da689c9c005e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1100,7 +1100,6 @@ static int follow_automount(struct path *path, struct nameidata *nd,
1100 bool *need_mntput) 1100 bool *need_mntput)
1101{ 1101{
1102 struct vfsmount *mnt; 1102 struct vfsmount *mnt;
1103 const struct cred *old_cred;
1104 int err; 1103 int err;
1105 1104
1106 if (!path->dentry->d_op || !path->dentry->d_op->d_automount) 1105 if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
@@ -1129,9 +1128,7 @@ static int follow_automount(struct path *path, struct nameidata *nd,
1129 if (nd->total_link_count >= 40) 1128 if (nd->total_link_count >= 40)
1130 return -ELOOP; 1129 return -ELOOP;
1131 1130
1132 old_cred = override_creds(&init_cred);
1133 mnt = path->dentry->d_op->d_automount(path); 1131 mnt = path->dentry->d_op->d_automount(path);
1134 revert_creds(old_cred);
1135 if (IS_ERR(mnt)) { 1132 if (IS_ERR(mnt)) {
1136 /* 1133 /*
1137 * The filesystem is allowed to return -EISDIR here to indicate 1134 * The filesystem is allowed to return -EISDIR here to indicate
@@ -2941,10 +2938,16 @@ static inline int open_to_namei_flags(int flag)
2941 2938
2942static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode) 2939static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
2943{ 2940{
2941 struct user_namespace *s_user_ns;
2944 int error = security_path_mknod(dir, dentry, mode, 0); 2942 int error = security_path_mknod(dir, dentry, mode, 0);
2945 if (error) 2943 if (error)
2946 return error; 2944 return error;
2947 2945
2946 s_user_ns = dir->dentry->d_sb->s_user_ns;
2947 if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
2948 !kgid_has_mapping(s_user_ns, current_fsgid()))
2949 return -EOVERFLOW;
2950
2948 error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); 2951 error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
2949 if (error) 2952 if (error)
2950 return error; 2953 return error;
diff --git a/fs/namespace.c b/fs/namespace.c
index 487ba30bb5c6..8bfad42c1ccf 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -637,28 +637,6 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
637} 637}
638 638
639/* 639/*
640 * find the last mount at @dentry on vfsmount @mnt.
641 * mount_lock must be held.
642 */
643struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
644{
645 struct mount *p, *res = NULL;
646 p = __lookup_mnt(mnt, dentry);
647 if (!p)
648 goto out;
649 if (!(p->mnt.mnt_flags & MNT_UMOUNT))
650 res = p;
651 hlist_for_each_entry_continue(p, mnt_hash) {
652 if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
653 break;
654 if (!(p->mnt.mnt_flags & MNT_UMOUNT))
655 res = p;
656 }
657out:
658 return res;
659}
660
661/*
662 * lookup_mnt - Return the first child mount mounted at path 640 * lookup_mnt - Return the first child mount mounted at path
663 * 641 *
664 * "First" means first mounted chronologically. If you create the 642 * "First" means first mounted chronologically. If you create the
@@ -878,6 +856,13 @@ void mnt_set_mountpoint(struct mount *mnt,
878 hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); 856 hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
879} 857}
880 858
859static void __attach_mnt(struct mount *mnt, struct mount *parent)
860{
861 hlist_add_head_rcu(&mnt->mnt_hash,
862 m_hash(&parent->mnt, mnt->mnt_mountpoint));
863 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
864}
865
881/* 866/*
882 * vfsmount lock must be held for write 867 * vfsmount lock must be held for write
883 */ 868 */
@@ -886,28 +871,45 @@ static void attach_mnt(struct mount *mnt,
886 struct mountpoint *mp) 871 struct mountpoint *mp)
887{ 872{
888 mnt_set_mountpoint(parent, mp, mnt); 873 mnt_set_mountpoint(parent, mp, mnt);
889 hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); 874 __attach_mnt(mnt, parent);
890 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
891} 875}
892 876
893static void attach_shadowed(struct mount *mnt, 877void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
894 struct mount *parent,
895 struct mount *shadows)
896{ 878{
897 if (shadows) { 879 struct mountpoint *old_mp = mnt->mnt_mp;
898 hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); 880 struct dentry *old_mountpoint = mnt->mnt_mountpoint;
899 list_add(&mnt->mnt_child, &shadows->mnt_child); 881 struct mount *old_parent = mnt->mnt_parent;
900 } else { 882
901 hlist_add_head_rcu(&mnt->mnt_hash, 883 list_del_init(&mnt->mnt_child);
902 m_hash(&parent->mnt, mnt->mnt_mountpoint)); 884 hlist_del_init(&mnt->mnt_mp_list);
903 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); 885 hlist_del_init_rcu(&mnt->mnt_hash);
904 } 886
887 attach_mnt(mnt, parent, mp);
888
889 put_mountpoint(old_mp);
890
891 /*
892 * Safely avoid even the suggestion this code might sleep or
893 * lock the mount hash by taking advantage of the knowledge that
894 * mnt_change_mountpoint will not release the final reference
895 * to a mountpoint.
896 *
897 * During mounting, the mount passed in as the parent mount will
898 * continue to use the old mountpoint and during unmounting, the
899 * old mountpoint will continue to exist until namespace_unlock,
900 * which happens well after mnt_change_mountpoint.
901 */
902 spin_lock(&old_mountpoint->d_lock);
903 old_mountpoint->d_lockref.count--;
904 spin_unlock(&old_mountpoint->d_lock);
905
906 mnt_add_count(old_parent, -1);
905} 907}
906 908
907/* 909/*
908 * vfsmount lock must be held for write 910 * vfsmount lock must be held for write
909 */ 911 */
910static void commit_tree(struct mount *mnt, struct mount *shadows) 912static void commit_tree(struct mount *mnt)
911{ 913{
912 struct mount *parent = mnt->mnt_parent; 914 struct mount *parent = mnt->mnt_parent;
913 struct mount *m; 915 struct mount *m;
@@ -925,7 +927,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
925 n->mounts += n->pending_mounts; 927 n->mounts += n->pending_mounts;
926 n->pending_mounts = 0; 928 n->pending_mounts = 0;
927 929
928 attach_shadowed(mnt, parent, shadows); 930 __attach_mnt(mnt, parent);
929 touch_mnt_namespace(n); 931 touch_mnt_namespace(n);
930} 932}
931 933
@@ -989,6 +991,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
989} 991}
990EXPORT_SYMBOL_GPL(vfs_kern_mount); 992EXPORT_SYMBOL_GPL(vfs_kern_mount);
991 993
994struct vfsmount *
995vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
996 const char *name, void *data)
997{
998 /* Until it is worked out how to pass the user namespace
999 * through from the parent mount to the submount don't support
1000 * unprivileged mounts with submounts.
1001 */
1002 if (mountpoint->d_sb->s_user_ns != &init_user_ns)
1003 return ERR_PTR(-EPERM);
1004
1005 return vfs_kern_mount(type, MS_SUBMOUNT, name, data);
1006}
1007EXPORT_SYMBOL_GPL(vfs_submount);
1008
992static struct mount *clone_mnt(struct mount *old, struct dentry *root, 1009static struct mount *clone_mnt(struct mount *old, struct dentry *root,
993 int flag) 1010 int flag)
994{ 1011{
@@ -1764,7 +1781,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1764 continue; 1781 continue;
1765 1782
1766 for (s = r; s; s = next_mnt(s, r)) { 1783 for (s = r; s; s = next_mnt(s, r)) {
1767 struct mount *t = NULL;
1768 if (!(flag & CL_COPY_UNBINDABLE) && 1784 if (!(flag & CL_COPY_UNBINDABLE) &&
1769 IS_MNT_UNBINDABLE(s)) { 1785 IS_MNT_UNBINDABLE(s)) {
1770 s = skip_mnt_tree(s); 1786 s = skip_mnt_tree(s);
@@ -1786,14 +1802,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1786 goto out; 1802 goto out;
1787 lock_mount_hash(); 1803 lock_mount_hash();
1788 list_add_tail(&q->mnt_list, &res->mnt_list); 1804 list_add_tail(&q->mnt_list, &res->mnt_list);
1789 mnt_set_mountpoint(parent, p->mnt_mp, q); 1805 attach_mnt(q, parent, p->mnt_mp);
1790 if (!list_empty(&parent->mnt_mounts)) {
1791 t = list_last_entry(&parent->mnt_mounts,
1792 struct mount, mnt_child);
1793 if (t->mnt_mp != p->mnt_mp)
1794 t = NULL;
1795 }
1796 attach_shadowed(q, parent, t);
1797 unlock_mount_hash(); 1806 unlock_mount_hash();
1798 } 1807 }
1799 } 1808 }
@@ -1992,10 +2001,18 @@ static int attach_recursive_mnt(struct mount *source_mnt,
1992{ 2001{
1993 HLIST_HEAD(tree_list); 2002 HLIST_HEAD(tree_list);
1994 struct mnt_namespace *ns = dest_mnt->mnt_ns; 2003 struct mnt_namespace *ns = dest_mnt->mnt_ns;
2004 struct mountpoint *smp;
1995 struct mount *child, *p; 2005 struct mount *child, *p;
1996 struct hlist_node *n; 2006 struct hlist_node *n;
1997 int err; 2007 int err;
1998 2008
2009 /* Preallocate a mountpoint in case the new mounts need
2010 * to be tucked under other mounts.
2011 */
2012 smp = get_mountpoint(source_mnt->mnt.mnt_root);
2013 if (IS_ERR(smp))
2014 return PTR_ERR(smp);
2015
1999 /* Is there space to add these mounts to the mount namespace? */ 2016 /* Is there space to add these mounts to the mount namespace? */
2000 if (!parent_path) { 2017 if (!parent_path) {
2001 err = count_mounts(ns, source_mnt); 2018 err = count_mounts(ns, source_mnt);
@@ -2022,16 +2039,19 @@ static int attach_recursive_mnt(struct mount *source_mnt,
2022 touch_mnt_namespace(source_mnt->mnt_ns); 2039 touch_mnt_namespace(source_mnt->mnt_ns);
2023 } else { 2040 } else {
2024 mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); 2041 mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
2025 commit_tree(source_mnt, NULL); 2042 commit_tree(source_mnt);
2026 } 2043 }
2027 2044
2028 hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { 2045 hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
2029 struct mount *q; 2046 struct mount *q;
2030 hlist_del_init(&child->mnt_hash); 2047 hlist_del_init(&child->mnt_hash);
2031 q = __lookup_mnt_last(&child->mnt_parent->mnt, 2048 q = __lookup_mnt(&child->mnt_parent->mnt,
2032 child->mnt_mountpoint); 2049 child->mnt_mountpoint);
2033 commit_tree(child, q); 2050 if (q)
2051 mnt_change_mountpoint(child, smp, q);
2052 commit_tree(child);
2034 } 2053 }
2054 put_mountpoint(smp);
2035 unlock_mount_hash(); 2055 unlock_mount_hash();
2036 2056
2037 return 0; 2057 return 0;
@@ -2046,6 +2066,11 @@ static int attach_recursive_mnt(struct mount *source_mnt,
2046 cleanup_group_ids(source_mnt, NULL); 2066 cleanup_group_ids(source_mnt, NULL);
2047 out: 2067 out:
2048 ns->pending_mounts = 0; 2068 ns->pending_mounts = 0;
2069
2070 read_seqlock_excl(&mount_lock);
2071 put_mountpoint(smp);
2072 read_sequnlock_excl(&mount_lock);
2073
2049 return err; 2074 return err;
2050} 2075}
2051 2076
@@ -2794,7 +2819,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
2794 2819
2795 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | 2820 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
2796 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | 2821 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
2797 MS_STRICTATIME | MS_NOREMOTELOCK); 2822 MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT);
2798 2823
2799 if (flags & MS_REMOUNT) 2824 if (flags & MS_REMOUNT)
2800 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, 2825 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 5551e8ef67fd..e49d831c4e85 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -226,7 +226,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
226 const char *devname, 226 const char *devname,
227 struct nfs_clone_mount *mountdata) 227 struct nfs_clone_mount *mountdata)
228{ 228{
229 return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); 229 return vfs_submount(mountdata->dentry, &nfs_xdev_fs_type, devname, mountdata);
230} 230}
231 231
232/** 232/**
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index d21104912676..d8b040bd9814 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -279,7 +279,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
279 mountdata->hostname, 279 mountdata->hostname,
280 mountdata->mnt_path); 280 mountdata->mnt_path);
281 281
282 mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata); 282 mnt = vfs_submount(mountdata->dentry, &nfs4_referral_fs_type, page, mountdata);
283 if (!IS_ERR(mnt)) 283 if (!IS_ERR(mnt))
284 break; 284 break;
285 } 285 }
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index a6f5907a3fee..7c461fd49c4c 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -30,3 +30,20 @@ extern int inotify_handle_event(struct fsnotify_group *group,
30 const unsigned char *file_name, u32 cookie); 30 const unsigned char *file_name, u32 cookie);
31 31
32extern const struct fsnotify_ops inotify_fsnotify_ops; 32extern const struct fsnotify_ops inotify_fsnotify_ops;
33
34#ifdef CONFIG_INOTIFY_USER
35static inline void dec_inotify_instances(struct ucounts *ucounts)
36{
37 dec_ucount(ucounts, UCOUNT_INOTIFY_INSTANCES);
38}
39
40static inline struct ucounts *inc_inotify_watches(struct ucounts *ucounts)
41{
42 return inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_INOTIFY_WATCHES);
43}
44
45static inline void dec_inotify_watches(struct ucounts *ucounts)
46{
47 dec_ucount(ucounts, UCOUNT_INOTIFY_WATCHES);
48}
49#endif
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 19e7ec109a75..f36c29398de3 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -165,10 +165,8 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
165 /* ideally the idr is empty and we won't hit the BUG in the callback */ 165 /* ideally the idr is empty and we won't hit the BUG in the callback */
166 idr_for_each(&group->inotify_data.idr, idr_callback, group); 166 idr_for_each(&group->inotify_data.idr, idr_callback, group);
167 idr_destroy(&group->inotify_data.idr); 167 idr_destroy(&group->inotify_data.idr);
168 if (group->inotify_data.user) { 168 if (group->inotify_data.ucounts)
169 atomic_dec(&group->inotify_data.user->inotify_devs); 169 dec_inotify_instances(group->inotify_data.ucounts);
170 free_uid(group->inotify_data.user);
171 }
172} 170}
173 171
174static void inotify_free_event(struct fsnotify_event *fsn_event) 172static void inotify_free_event(struct fsnotify_event *fsn_event)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 69d1ea3d292a..1cf41c623be1 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -44,10 +44,8 @@
44 44
45#include <asm/ioctls.h> 45#include <asm/ioctls.h>
46 46
47/* these are configurable via /proc/sys/fs/inotify/ */ 47/* configurable via /proc/sys/fs/inotify/ */
48static int inotify_max_user_instances __read_mostly;
49static int inotify_max_queued_events __read_mostly; 48static int inotify_max_queued_events __read_mostly;
50static int inotify_max_user_watches __read_mostly;
51 49
52static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; 50static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
53 51
@@ -60,7 +58,7 @@ static int zero;
60struct ctl_table inotify_table[] = { 58struct ctl_table inotify_table[] = {
61 { 59 {
62 .procname = "max_user_instances", 60 .procname = "max_user_instances",
63 .data = &inotify_max_user_instances, 61 .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES],
64 .maxlen = sizeof(int), 62 .maxlen = sizeof(int),
65 .mode = 0644, 63 .mode = 0644,
66 .proc_handler = proc_dointvec_minmax, 64 .proc_handler = proc_dointvec_minmax,
@@ -68,7 +66,7 @@ struct ctl_table inotify_table[] = {
68 }, 66 },
69 { 67 {
70 .procname = "max_user_watches", 68 .procname = "max_user_watches",
71 .data = &inotify_max_user_watches, 69 .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES],
72 .maxlen = sizeof(int), 70 .maxlen = sizeof(int),
73 .mode = 0644, 71 .mode = 0644,
74 .proc_handler = proc_dointvec_minmax, 72 .proc_handler = proc_dointvec_minmax,
@@ -500,7 +498,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
500 /* remove this mark from the idr */ 498 /* remove this mark from the idr */
501 inotify_remove_from_idr(group, i_mark); 499 inotify_remove_from_idr(group, i_mark);
502 500
503 atomic_dec(&group->inotify_data.user->inotify_watches); 501 dec_inotify_watches(group->inotify_data.ucounts);
504} 502}
505 503
506/* ding dong the mark is dead */ 504/* ding dong the mark is dead */
@@ -584,14 +582,17 @@ static int inotify_new_watch(struct fsnotify_group *group,
584 tmp_i_mark->fsn_mark.mask = mask; 582 tmp_i_mark->fsn_mark.mask = mask;
585 tmp_i_mark->wd = -1; 583 tmp_i_mark->wd = -1;
586 584
587 ret = -ENOSPC;
588 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
589 goto out_err;
590
591 ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark); 585 ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark);
592 if (ret) 586 if (ret)
593 goto out_err; 587 goto out_err;
594 588
589 /* increment the number of watches the user has */
590 if (!inc_inotify_watches(group->inotify_data.ucounts)) {
591 inotify_remove_from_idr(group, tmp_i_mark);
592 ret = -ENOSPC;
593 goto out_err;
594 }
595
595 /* we are on the idr, now get on the inode */ 596 /* we are on the idr, now get on the inode */
596 ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode, 597 ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
597 NULL, 0); 598 NULL, 0);
@@ -601,8 +602,6 @@ static int inotify_new_watch(struct fsnotify_group *group,
601 goto out_err; 602 goto out_err;
602 } 603 }
603 604
604 /* increment the number of watches the user has */
605 atomic_inc(&group->inotify_data.user->inotify_watches);
606 605
607 /* return the watch descriptor for this new mark */ 606 /* return the watch descriptor for this new mark */
608 ret = tmp_i_mark->wd; 607 ret = tmp_i_mark->wd;
@@ -653,10 +652,11 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
653 652
654 spin_lock_init(&group->inotify_data.idr_lock); 653 spin_lock_init(&group->inotify_data.idr_lock);
655 idr_init(&group->inotify_data.idr); 654 idr_init(&group->inotify_data.idr);
656 group->inotify_data.user = get_current_user(); 655 group->inotify_data.ucounts = inc_ucount(current_user_ns(),
656 current_euid(),
657 UCOUNT_INOTIFY_INSTANCES);
657 658
658 if (atomic_inc_return(&group->inotify_data.user->inotify_devs) > 659 if (!group->inotify_data.ucounts) {
659 inotify_max_user_instances) {
660 fsnotify_destroy_group(group); 660 fsnotify_destroy_group(group);
661 return ERR_PTR(-EMFILE); 661 return ERR_PTR(-EMFILE);
662 } 662 }
@@ -819,8 +819,8 @@ static int __init inotify_user_setup(void)
819 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); 819 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
820 820
821 inotify_max_queued_events = 16384; 821 inotify_max_queued_events = 16384;
822 inotify_max_user_instances = 128; 822 init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128;
823 inotify_max_user_watches = 8192; 823 init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = 8192;
824 824
825 return 0; 825 return 0;
826} 826}
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 8c9fb29c6673..1656843e87d2 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -7,6 +7,7 @@
7#include <linux/seq_file.h> 7#include <linux/seq_file.h>
8#include <linux/user_namespace.h> 8#include <linux/user_namespace.h>
9#include <linux/nsfs.h> 9#include <linux/nsfs.h>
10#include <linux/uaccess.h>
10 11
11static struct vfsmount *nsfs_mnt; 12static struct vfsmount *nsfs_mnt;
12 13
@@ -163,7 +164,10 @@ int open_related_ns(struct ns_common *ns,
163static long ns_ioctl(struct file *filp, unsigned int ioctl, 164static long ns_ioctl(struct file *filp, unsigned int ioctl,
164 unsigned long arg) 165 unsigned long arg)
165{ 166{
167 struct user_namespace *user_ns;
166 struct ns_common *ns = get_proc_ns(file_inode(filp)); 168 struct ns_common *ns = get_proc_ns(file_inode(filp));
169 uid_t __user *argp;
170 uid_t uid;
167 171
168 switch (ioctl) { 172 switch (ioctl) {
169 case NS_GET_USERNS: 173 case NS_GET_USERNS:
@@ -172,6 +176,15 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
172 if (!ns->ops->get_parent) 176 if (!ns->ops->get_parent)
173 return -EINVAL; 177 return -EINVAL;
174 return open_related_ns(ns, ns->ops->get_parent); 178 return open_related_ns(ns, ns->ops->get_parent);
179 case NS_GET_NSTYPE:
180 return ns->ops->type;
181 case NS_GET_OWNER_UID:
182 if (ns->ops->type != CLONE_NEWUSER)
183 return -EINVAL;
184 user_ns = container_of(ns, struct user_namespace, ns);
185 argp = (uid_t __user *) arg;
186 uid = from_kuid_munged(current_user_ns(), user_ns->owner);
187 return put_user(uid, argp);
175 default: 188 default:
176 return -ENOTTY; 189 return -ENOTTY;
177 } 190 }
diff --git a/fs/pnode.c b/fs/pnode.c
index 06a793f4ae38..5bc7896d122a 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -322,6 +322,21 @@ out:
322 return ret; 322 return ret;
323} 323}
324 324
325static struct mount *find_topper(struct mount *mnt)
326{
327 /* If there is exactly one mount covering mnt completely return it. */
328 struct mount *child;
329
330 if (!list_is_singular(&mnt->mnt_mounts))
331 return NULL;
332
333 child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child);
334 if (child->mnt_mountpoint != mnt->mnt.mnt_root)
335 return NULL;
336
337 return child;
338}
339
325/* 340/*
326 * return true if the refcount is greater than count 341 * return true if the refcount is greater than count
327 */ 342 */
@@ -342,9 +357,8 @@ static inline int do_refcount_check(struct mount *mnt, int count)
342 */ 357 */
343int propagate_mount_busy(struct mount *mnt, int refcnt) 358int propagate_mount_busy(struct mount *mnt, int refcnt)
344{ 359{
345 struct mount *m, *child; 360 struct mount *m, *child, *topper;
346 struct mount *parent = mnt->mnt_parent; 361 struct mount *parent = mnt->mnt_parent;
347 int ret = 0;
348 362
349 if (mnt == parent) 363 if (mnt == parent)
350 return do_refcount_check(mnt, refcnt); 364 return do_refcount_check(mnt, refcnt);
@@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
359 373
360 for (m = propagation_next(parent, parent); m; 374 for (m = propagation_next(parent, parent); m;
361 m = propagation_next(m, parent)) { 375 m = propagation_next(m, parent)) {
362 child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); 376 int count = 1;
363 if (child && list_empty(&child->mnt_mounts) && 377 child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
364 (ret = do_refcount_check(child, 1))) 378 if (!child)
365 break; 379 continue;
380
381 /* Is there exactly one mount on the child that covers
382 * it completely whose reference should be ignored?
383 */
384 topper = find_topper(child);
385 if (topper)
386 count += 1;
387 else if (!list_empty(&child->mnt_mounts))
388 continue;
389
390 if (do_refcount_check(child, count))
391 return 1;
366 } 392 }
367 return ret; 393 return 0;
368} 394}
369 395
370/* 396/*
@@ -381,7 +407,7 @@ void propagate_mount_unlock(struct mount *mnt)
381 407
382 for (m = propagation_next(parent, parent); m; 408 for (m = propagation_next(parent, parent); m;
383 m = propagation_next(m, parent)) { 409 m = propagation_next(m, parent)) {
384 child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); 410 child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
385 if (child) 411 if (child)
386 child->mnt.mnt_flags &= ~MNT_LOCKED; 412 child->mnt.mnt_flags &= ~MNT_LOCKED;
387 } 413 }
@@ -399,9 +425,11 @@ static void mark_umount_candidates(struct mount *mnt)
399 425
400 for (m = propagation_next(parent, parent); m; 426 for (m = propagation_next(parent, parent); m;
401 m = propagation_next(m, parent)) { 427 m = propagation_next(m, parent)) {
402 struct mount *child = __lookup_mnt_last(&m->mnt, 428 struct mount *child = __lookup_mnt(&m->mnt,
403 mnt->mnt_mountpoint); 429 mnt->mnt_mountpoint);
404 if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) { 430 if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
431 continue;
432 if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
405 SET_MNT_MARK(child); 433 SET_MNT_MARK(child);
406 } 434 }
407 } 435 }
@@ -420,8 +448,8 @@ static void __propagate_umount(struct mount *mnt)
420 448
421 for (m = propagation_next(parent, parent); m; 449 for (m = propagation_next(parent, parent); m;
422 m = propagation_next(m, parent)) { 450 m = propagation_next(m, parent)) {
423 451 struct mount *topper;
424 struct mount *child = __lookup_mnt_last(&m->mnt, 452 struct mount *child = __lookup_mnt(&m->mnt,
425 mnt->mnt_mountpoint); 453 mnt->mnt_mountpoint);
426 /* 454 /*
427 * umount the child only if the child has no children 455 * umount the child only if the child has no children
@@ -430,6 +458,15 @@ static void __propagate_umount(struct mount *mnt)
430 if (!child || !IS_MNT_MARKED(child)) 458 if (!child || !IS_MNT_MARKED(child))
431 continue; 459 continue;
432 CLEAR_MNT_MARK(child); 460 CLEAR_MNT_MARK(child);
461
462 /* If there is exactly one mount covering all of child
463 * replace child with that mount.
464 */
465 topper = find_topper(child);
466 if (topper)
467 mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
468 topper);
469
433 if (list_empty(&child->mnt_mounts)) { 470 if (list_empty(&child->mnt_mounts)) {
434 list_del_init(&child->mnt_child); 471 list_del_init(&child->mnt_child);
435 child->mnt.mnt_flags |= MNT_UMOUNT; 472 child->mnt.mnt_flags |= MNT_UMOUNT;
diff --git a/fs/pnode.h b/fs/pnode.h
index 550f5a8b4fcf..dc87e65becd2 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -49,6 +49,8 @@ int get_dominating_id(struct mount *mnt, const struct path *root);
49unsigned int mnt_get_count(struct mount *mnt); 49unsigned int mnt_get_count(struct mount *mnt);
50void mnt_set_mountpoint(struct mount *, struct mountpoint *, 50void mnt_set_mountpoint(struct mount *, struct mountpoint *,
51 struct mount *); 51 struct mount *);
52void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
53 struct mount *mnt);
52struct mount *copy_tree(struct mount *, struct dentry *, int); 54struct mount *copy_tree(struct mount *, struct dentry *, int);
53bool is_path_reachable(struct mount *, struct dentry *, 55bool is_path_reachable(struct mount *, struct dentry *,
54 const struct path *root); 56 const struct path *root);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3d773eb9e144..b73b4de8fb36 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1667,12 +1667,63 @@ const struct inode_operations proc_pid_link_inode_operations = {
1667 1667
1668/* building an inode */ 1668/* building an inode */
1669 1669
1670void task_dump_owner(struct task_struct *task, mode_t mode,
1671 kuid_t *ruid, kgid_t *rgid)
1672{
1673 /* Depending on the state of dumpable compute who should own a
1674 * proc file for a task.
1675 */
1676 const struct cred *cred;
1677 kuid_t uid;
1678 kgid_t gid;
1679
1680 /* Default to the tasks effective ownership */
1681 rcu_read_lock();
1682 cred = __task_cred(task);
1683 uid = cred->euid;
1684 gid = cred->egid;
1685 rcu_read_unlock();
1686
1687 /*
1688 * Before the /proc/pid/status file was created the only way to read
1689 * the effective uid of a /process was to stat /proc/pid. Reading
1690 * /proc/pid/status is slow enough that procps and other packages
1691 * kept stating /proc/pid. To keep the rules in /proc simple I have
1692 * made this apply to all per process world readable and executable
1693 * directories.
1694 */
1695 if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
1696 struct mm_struct *mm;
1697 task_lock(task);
1698 mm = task->mm;
1699 /* Make non-dumpable tasks owned by some root */
1700 if (mm) {
1701 if (get_dumpable(mm) != SUID_DUMP_USER) {
1702 struct user_namespace *user_ns = mm->user_ns;
1703
1704 uid = make_kuid(user_ns, 0);
1705 if (!uid_valid(uid))
1706 uid = GLOBAL_ROOT_UID;
1707
1708 gid = make_kgid(user_ns, 0);
1709 if (!gid_valid(gid))
1710 gid = GLOBAL_ROOT_GID;
1711 }
1712 } else {
1713 uid = GLOBAL_ROOT_UID;
1714 gid = GLOBAL_ROOT_GID;
1715 }
1716 task_unlock(task);
1717 }
1718 *ruid = uid;
1719 *rgid = gid;
1720}
1721
1670struct inode *proc_pid_make_inode(struct super_block * sb, 1722struct inode *proc_pid_make_inode(struct super_block * sb,
1671 struct task_struct *task, umode_t mode) 1723 struct task_struct *task, umode_t mode)
1672{ 1724{
1673 struct inode * inode; 1725 struct inode * inode;
1674 struct proc_inode *ei; 1726 struct proc_inode *ei;
1675 const struct cred *cred;
1676 1727
1677 /* We need a new inode */ 1728 /* We need a new inode */
1678 1729
@@ -1694,13 +1745,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
1694 if (!ei->pid) 1745 if (!ei->pid)
1695 goto out_unlock; 1746 goto out_unlock;
1696 1747
1697 if (task_dumpable(task)) { 1748 task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
1698 rcu_read_lock();
1699 cred = __task_cred(task);
1700 inode->i_uid = cred->euid;
1701 inode->i_gid = cred->egid;
1702 rcu_read_unlock();
1703 }
1704 security_task_to_inode(task, inode); 1749 security_task_to_inode(task, inode);
1705 1750
1706out: 1751out:
@@ -1715,7 +1760,6 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1715{ 1760{
1716 struct inode *inode = d_inode(dentry); 1761 struct inode *inode = d_inode(dentry);
1717 struct task_struct *task; 1762 struct task_struct *task;
1718 const struct cred *cred;
1719 struct pid_namespace *pid = dentry->d_sb->s_fs_info; 1763 struct pid_namespace *pid = dentry->d_sb->s_fs_info;
1720 1764
1721 generic_fillattr(inode, stat); 1765 generic_fillattr(inode, stat);
@@ -1733,12 +1777,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1733 */ 1777 */
1734 return -ENOENT; 1778 return -ENOENT;
1735 } 1779 }
1736 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || 1780 task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid);
1737 task_dumpable(task)) {
1738 cred = __task_cred(task);
1739 stat->uid = cred->euid;
1740 stat->gid = cred->egid;
1741 }
1742 } 1781 }
1743 rcu_read_unlock(); 1782 rcu_read_unlock();
1744 return 0; 1783 return 0;
@@ -1754,18 +1793,11 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1754 * Rewrite the inode's ownerships here because the owning task may have 1793 * Rewrite the inode's ownerships here because the owning task may have
1755 * performed a setuid(), etc. 1794 * performed a setuid(), etc.
1756 * 1795 *
1757 * Before the /proc/pid/status file was created the only way to read
1758 * the effective uid of a /process was to stat /proc/pid. Reading
1759 * /proc/pid/status is slow enough that procps and other packages
1760 * kept stating /proc/pid. To keep the rules in /proc simple I have
1761 * made this apply to all per process world readable and executable
1762 * directories.
1763 */ 1796 */
1764int pid_revalidate(struct dentry *dentry, unsigned int flags) 1797int pid_revalidate(struct dentry *dentry, unsigned int flags)
1765{ 1798{
1766 struct inode *inode; 1799 struct inode *inode;
1767 struct task_struct *task; 1800 struct task_struct *task;
1768 const struct cred *cred;
1769 1801
1770 if (flags & LOOKUP_RCU) 1802 if (flags & LOOKUP_RCU)
1771 return -ECHILD; 1803 return -ECHILD;
@@ -1774,17 +1806,8 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
1774 task = get_proc_task(inode); 1806 task = get_proc_task(inode);
1775 1807
1776 if (task) { 1808 if (task) {
1777 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || 1809 task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
1778 task_dumpable(task)) { 1810
1779 rcu_read_lock();
1780 cred = __task_cred(task);
1781 inode->i_uid = cred->euid;
1782 inode->i_gid = cred->egid;
1783 rcu_read_unlock();
1784 } else {
1785 inode->i_uid = GLOBAL_ROOT_UID;
1786 inode->i_gid = GLOBAL_ROOT_GID;
1787 }
1788 inode->i_mode &= ~(S_ISUID | S_ISGID); 1811 inode->i_mode &= ~(S_ISUID | S_ISGID);
1789 security_task_to_inode(task, inode); 1812 security_task_to_inode(task, inode);
1790 put_task_struct(task); 1813 put_task_struct(task);
@@ -1881,7 +1904,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
1881 bool exact_vma_exists = false; 1904 bool exact_vma_exists = false;
1882 struct mm_struct *mm = NULL; 1905 struct mm_struct *mm = NULL;
1883 struct task_struct *task; 1906 struct task_struct *task;
1884 const struct cred *cred;
1885 struct inode *inode; 1907 struct inode *inode;
1886 int status = 0; 1908 int status = 0;
1887 1909
@@ -1906,16 +1928,8 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
1906 mmput(mm); 1928 mmput(mm);
1907 1929
1908 if (exact_vma_exists) { 1930 if (exact_vma_exists) {
1909 if (task_dumpable(task)) { 1931 task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
1910 rcu_read_lock(); 1932
1911 cred = __task_cred(task);
1912 inode->i_uid = cred->euid;
1913 inode->i_gid = cred->egid;
1914 rcu_read_unlock();
1915 } else {
1916 inode->i_uid = GLOBAL_ROOT_UID;
1917 inode->i_gid = GLOBAL_ROOT_GID;
1918 }
1919 security_task_to_inode(task, inode); 1933 security_task_to_inode(task, inode);
1920 status = 1; 1934 status = 1;
1921 } 1935 }
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 4274f83bf100..00ce1531b2f5 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -84,7 +84,6 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
84{ 84{
85 struct files_struct *files; 85 struct files_struct *files;
86 struct task_struct *task; 86 struct task_struct *task;
87 const struct cred *cred;
88 struct inode *inode; 87 struct inode *inode;
89 unsigned int fd; 88 unsigned int fd;
90 89
@@ -108,16 +107,7 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
108 rcu_read_unlock(); 107 rcu_read_unlock();
109 put_files_struct(files); 108 put_files_struct(files);
110 109
111 if (task_dumpable(task)) { 110 task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
112 rcu_read_lock();
113 cred = __task_cred(task);
114 inode->i_uid = cred->euid;
115 inode->i_gid = cred->egid;
116 rcu_read_unlock();
117 } else {
118 inode->i_uid = GLOBAL_ROOT_UID;
119 inode->i_gid = GLOBAL_ROOT_GID;
120 }
121 111
122 if (S_ISLNK(inode->i_mode)) { 112 if (S_ISLNK(inode->i_mode)) {
123 unsigned i_mode = S_IFLNK; 113 unsigned i_mode = S_IFLNK;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 842a5ff5b85c..7ad9ed7958af 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -43,10 +43,11 @@ static void proc_evict_inode(struct inode *inode)
43 de = PDE(inode); 43 de = PDE(inode);
44 if (de) 44 if (de)
45 pde_put(de); 45 pde_put(de);
46
46 head = PROC_I(inode)->sysctl; 47 head = PROC_I(inode)->sysctl;
47 if (head) { 48 if (head) {
48 RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); 49 RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
49 sysctl_head_put(head); 50 proc_sys_evict_inode(inode, head);
50 } 51 }
51} 52}
52 53
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 2de5194ba378..5d6960f5f1c0 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -65,6 +65,7 @@ struct proc_inode {
65 struct proc_dir_entry *pde; 65 struct proc_dir_entry *pde;
66 struct ctl_table_header *sysctl; 66 struct ctl_table_header *sysctl;
67 struct ctl_table *sysctl_entry; 67 struct ctl_table *sysctl_entry;
68 struct list_head sysctl_inodes;
68 const struct proc_ns_operations *ns_ops; 69 const struct proc_ns_operations *ns_ops;
69 struct inode vfs_inode; 70 struct inode vfs_inode;
70}; 71};
@@ -97,20 +98,8 @@ static inline struct task_struct *get_proc_task(struct inode *inode)
97 return get_pid_task(proc_pid(inode), PIDTYPE_PID); 98 return get_pid_task(proc_pid(inode), PIDTYPE_PID);
98} 99}
99 100
100static inline int task_dumpable(struct task_struct *task) 101void task_dump_owner(struct task_struct *task, mode_t mode,
101{ 102 kuid_t *ruid, kgid_t *rgid);
102 int dumpable = 0;
103 struct mm_struct *mm;
104
105 task_lock(task);
106 mm = task->mm;
107 if (mm)
108 dumpable = get_dumpable(mm);
109 task_unlock(task);
110 if (dumpable == SUID_DUMP_USER)
111 return 1;
112 return 0;
113}
114 103
115static inline unsigned name_to_int(const struct qstr *qstr) 104static inline unsigned name_to_int(const struct qstr *qstr)
116{ 105{
@@ -249,10 +238,12 @@ extern void proc_thread_self_init(void);
249 */ 238 */
250#ifdef CONFIG_PROC_SYSCTL 239#ifdef CONFIG_PROC_SYSCTL
251extern int proc_sys_init(void); 240extern int proc_sys_init(void);
252extern void sysctl_head_put(struct ctl_table_header *); 241extern void proc_sys_evict_inode(struct inode *inode,
242 struct ctl_table_header *head);
253#else 243#else
254static inline void proc_sys_init(void) { } 244static inline void proc_sys_init(void) { }
255static inline void sysctl_head_put(struct ctl_table_header *head) { } 245static inline void proc_sys_evict_inode(struct inode *inode,
246 struct ctl_table_header *head) { }
256#endif 247#endif
257 248
258/* 249/*
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index d4e37acd4821..3e64c6502dc8 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -190,6 +190,7 @@ static void init_header(struct ctl_table_header *head,
190 head->set = set; 190 head->set = set;
191 head->parent = NULL; 191 head->parent = NULL;
192 head->node = node; 192 head->node = node;
193 INIT_LIST_HEAD(&head->inodes);
193 if (node) { 194 if (node) {
194 struct ctl_table *entry; 195 struct ctl_table *entry;
195 for (entry = table; entry->procname; entry++, node++) 196 for (entry = table; entry->procname; entry++, node++)
@@ -259,6 +260,27 @@ static void unuse_table(struct ctl_table_header *p)
259 complete(p->unregistering); 260 complete(p->unregistering);
260} 261}
261 262
263/* called under sysctl_lock */
264static void proc_sys_prune_dcache(struct ctl_table_header *head)
265{
266 struct inode *inode, *prev = NULL;
267 struct proc_inode *ei;
268
269 rcu_read_lock();
270 list_for_each_entry_rcu(ei, &head->inodes, sysctl_inodes) {
271 inode = igrab(&ei->vfs_inode);
272 if (inode) {
273 rcu_read_unlock();
274 iput(prev);
275 prev = inode;
276 d_prune_aliases(inode);
277 rcu_read_lock();
278 }
279 }
280 rcu_read_unlock();
281 iput(prev);
282}
283
262/* called under sysctl_lock, will reacquire if has to wait */ 284/* called under sysctl_lock, will reacquire if has to wait */
263static void start_unregistering(struct ctl_table_header *p) 285static void start_unregistering(struct ctl_table_header *p)
264{ 286{
@@ -272,31 +294,22 @@ static void start_unregistering(struct ctl_table_header *p)
272 p->unregistering = &wait; 294 p->unregistering = &wait;
273 spin_unlock(&sysctl_lock); 295 spin_unlock(&sysctl_lock);
274 wait_for_completion(&wait); 296 wait_for_completion(&wait);
275 spin_lock(&sysctl_lock);
276 } else { 297 } else {
277 /* anything non-NULL; we'll never dereference it */ 298 /* anything non-NULL; we'll never dereference it */
278 p->unregistering = ERR_PTR(-EINVAL); 299 p->unregistering = ERR_PTR(-EINVAL);
300 spin_unlock(&sysctl_lock);
279 } 301 }
280 /* 302 /*
303 * Prune dentries for unregistered sysctls: namespaced sysctls
304 * can have duplicate names and contaminate dcache very badly.
305 */
306 proc_sys_prune_dcache(p);
307 /*
281 * do not remove from the list until nobody holds it; walking the 308 * do not remove from the list until nobody holds it; walking the
282 * list in do_sysctl() relies on that. 309 * list in do_sysctl() relies on that.
283 */ 310 */
284 erase_header(p);
285}
286
287static void sysctl_head_get(struct ctl_table_header *head)
288{
289 spin_lock(&sysctl_lock); 311 spin_lock(&sysctl_lock);
290 head->count++; 312 erase_header(p);
291 spin_unlock(&sysctl_lock);
292}
293
294void sysctl_head_put(struct ctl_table_header *head)
295{
296 spin_lock(&sysctl_lock);
297 if (!--head->count)
298 kfree_rcu(head, rcu);
299 spin_unlock(&sysctl_lock);
300} 313}
301 314
302static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) 315static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
@@ -440,10 +453,20 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
440 453
441 inode->i_ino = get_next_ino(); 454 inode->i_ino = get_next_ino();
442 455
443 sysctl_head_get(head);
444 ei = PROC_I(inode); 456 ei = PROC_I(inode);
457
458 spin_lock(&sysctl_lock);
459 if (unlikely(head->unregistering)) {
460 spin_unlock(&sysctl_lock);
461 iput(inode);
462 inode = NULL;
463 goto out;
464 }
445 ei->sysctl = head; 465 ei->sysctl = head;
446 ei->sysctl_entry = table; 466 ei->sysctl_entry = table;
467 list_add_rcu(&ei->sysctl_inodes, &head->inodes);
468 head->count++;
469 spin_unlock(&sysctl_lock);
447 470
448 inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); 471 inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
449 inode->i_mode = table->mode; 472 inode->i_mode = table->mode;
@@ -466,6 +489,15 @@ out:
466 return inode; 489 return inode;
467} 490}
468 491
492void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
493{
494 spin_lock(&sysctl_lock);
495 list_del_rcu(&PROC_I(inode)->sysctl_inodes);
496 if (!--head->count)
497 kfree_rcu(head, rcu);
498 spin_unlock(&sysctl_lock);
499}
500
469static struct ctl_table_header *grab_header(struct inode *inode) 501static struct ctl_table_header *grab_header(struct inode *inode)
470{ 502{
471 struct ctl_table_header *head = PROC_I(inode)->sysctl; 503 struct ctl_table_header *head = PROC_I(inode)->sysctl;
diff --git a/fs/super.c b/fs/super.c
index ea662b0e5e78..b8b6a086c03b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -469,7 +469,7 @@ struct super_block *sget_userns(struct file_system_type *type,
469 struct super_block *old; 469 struct super_block *old;
470 int err; 470 int err;
471 471
472 if (!(flags & MS_KERNMOUNT) && 472 if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) &&
473 !(type->fs_flags & FS_USERNS_MOUNT) && 473 !(type->fs_flags & FS_USERNS_MOUNT) &&
474 !capable(CAP_SYS_ADMIN)) 474 !capable(CAP_SYS_ADMIN))
475 return ERR_PTR(-EPERM); 475 return ERR_PTR(-EPERM);
@@ -499,7 +499,7 @@ retry:
499 } 499 }
500 if (!s) { 500 if (!s) {
501 spin_unlock(&sb_lock); 501 spin_unlock(&sb_lock);
502 s = alloc_super(type, flags, user_ns); 502 s = alloc_super(type, (flags & ~MS_SUBMOUNT), user_ns);
503 if (!s) 503 if (!s)
504 return ERR_PTR(-ENOMEM); 504 return ERR_PTR(-ENOMEM);
505 goto retry; 505 goto retry;
@@ -540,8 +540,15 @@ struct super_block *sget(struct file_system_type *type,
540{ 540{
541 struct user_namespace *user_ns = current_user_ns(); 541 struct user_namespace *user_ns = current_user_ns();
542 542
543 /* We don't yet pass the user namespace of the parent
544 * mount through to here so always use &init_user_ns
545 * until that changes.
546 */
547 if (flags & MS_SUBMOUNT)
548 user_ns = &init_user_ns;
549
543 /* Ensure the requestor has permissions over the target filesystem */ 550 /* Ensure the requestor has permissions over the target filesystem */
544 if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) 551 if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN))
545 return ERR_PTR(-EPERM); 552 return ERR_PTR(-EPERM);
546 553
547 return sget_userns(type, test, set, flags, user_ns, data); 554 return sget_userns(type, test, set, flags, user_ns, data);
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 9d571acd3a48..7dff776e6d16 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -98,9 +98,10 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent);
98struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, 98struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
99 const char *dest); 99 const char *dest);
100 100
101typedef struct vfsmount *(*debugfs_automount_t)(struct dentry *, void *);
101struct dentry *debugfs_create_automount(const char *name, 102struct dentry *debugfs_create_automount(const char *name,
102 struct dentry *parent, 103 struct dentry *parent,
103 struct vfsmount *(*f)(void *), 104 debugfs_automount_t f,
104 void *data); 105 void *data);
105 106
106void debugfs_remove(struct dentry *dentry); 107void debugfs_remove(struct dentry *dentry);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 487246546ebe..e6e689b5569e 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -16,6 +16,7 @@
16#include <linux/spinlock.h> 16#include <linux/spinlock.h>
17#include <linux/types.h> 17#include <linux/types.h>
18#include <linux/atomic.h> 18#include <linux/atomic.h>
19#include <linux/user_namespace.h>
19 20
20/* 21/*
21 * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily 22 * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily
@@ -170,7 +171,7 @@ struct fsnotify_group {
170 struct inotify_group_private_data { 171 struct inotify_group_private_data {
171 spinlock_t idr_lock; 172 spinlock_t idr_lock;
172 struct idr idr; 173 struct idr idr;
173 struct user_struct *user; 174 struct ucounts *ucounts;
174 } inotify_data; 175 } inotify_data;
175#endif 176#endif
176#ifdef CONFIG_FANOTIFY 177#ifdef CONFIG_FANOTIFY
diff --git a/include/linux/mount.h b/include/linux/mount.h
index c6f55158d5e5..8e0352af06b7 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -90,6 +90,9 @@ struct file_system_type;
90extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, 90extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
91 int flags, const char *name, 91 int flags, const char *name,
92 void *data); 92 void *data);
93extern struct vfsmount *vfs_submount(const struct dentry *mountpoint,
94 struct file_system_type *type,
95 const char *name, void *data);
93 96
94extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); 97extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list);
95extern void mark_mounts_for_expiry(struct list_head *mounts); 98extern void mark_mounts_for_expiry(struct list_head *mounts);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c8e519d0b4a3..451e241f32c5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -846,10 +846,6 @@ struct user_struct {
846 atomic_t __count; /* reference count */ 846 atomic_t __count; /* reference count */
847 atomic_t processes; /* How many processes does this user have? */ 847 atomic_t processes; /* How many processes does this user have? */
848 atomic_t sigpending; /* How many pending signals does this user have? */ 848 atomic_t sigpending; /* How many pending signals does this user have? */
849#ifdef CONFIG_INOTIFY_USER
850 atomic_t inotify_watches; /* How many inotify watches does this user have? */
851 atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
852#endif
853#ifdef CONFIG_FANOTIFY 849#ifdef CONFIG_FANOTIFY
854 atomic_t fanotify_listeners; 850 atomic_t fanotify_listeners;
855#endif 851#endif
@@ -3051,6 +3047,9 @@ extern bool current_is_single_threaded(void);
3051#define for_each_process_thread(p, t) \ 3047#define for_each_process_thread(p, t) \
3052 for_each_process(p) for_each_thread(p, t) 3048 for_each_process(p) for_each_thread(p, t)
3053 3049
3050typedef int (*proc_visitor)(struct task_struct *p, void *data);
3051void walk_process_tree(struct task_struct *top, proc_visitor, void *);
3052
3054static inline int get_nr_threads(struct task_struct *tsk) 3053static inline int get_nr_threads(struct task_struct *tsk)
3055{ 3054{
3056 return tsk->signal->nr_threads; 3055 return tsk->signal->nr_threads;
diff --git a/include/linux/security.h b/include/linux/security.h
index d3868f2ebada..96899fad7016 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -140,8 +140,7 @@ struct request_sock;
140/* bprm->unsafe reasons */ 140/* bprm->unsafe reasons */
141#define LSM_UNSAFE_SHARE 1 141#define LSM_UNSAFE_SHARE 1
142#define LSM_UNSAFE_PTRACE 2 142#define LSM_UNSAFE_PTRACE 2
143#define LSM_UNSAFE_PTRACE_CAP 4 143#define LSM_UNSAFE_NO_NEW_PRIVS 4
144#define LSM_UNSAFE_NO_NEW_PRIVS 8
145 144
146#ifdef CONFIG_MMU 145#ifdef CONFIG_MMU
147extern int mmap_min_addr_handler(struct ctl_table *table, int write, 146extern int mmap_min_addr_handler(struct ctl_table *table, int write,
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index adf4e51cf597..b7e82049fec7 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -143,6 +143,7 @@ struct ctl_table_header
143 struct ctl_table_set *set; 143 struct ctl_table_set *set;
144 struct ctl_dir *parent; 144 struct ctl_dir *parent;
145 struct ctl_node *node; 145 struct ctl_node *node;
146 struct list_head inodes; /* head for proc_inode->sysctl_inodes */
146}; 147};
147 148
148struct ctl_dir { 149struct ctl_dir {
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index eb209d4523f5..363e0e8082a9 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -32,6 +32,10 @@ enum ucount_type {
32 UCOUNT_NET_NAMESPACES, 32 UCOUNT_NET_NAMESPACES,
33 UCOUNT_MNT_NAMESPACES, 33 UCOUNT_MNT_NAMESPACES,
34 UCOUNT_CGROUP_NAMESPACES, 34 UCOUNT_CGROUP_NAMESPACES,
35#ifdef CONFIG_INOTIFY_USER
36 UCOUNT_INOTIFY_INSTANCES,
37 UCOUNT_INOTIFY_WATCHES,
38#endif
35 UCOUNT_COUNTS, 39 UCOUNT_COUNTS,
36}; 40};
37 41
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 36da93fbf188..048a85e9f017 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -132,6 +132,7 @@ struct inodes_stat_t {
132#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ 132#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
133 133
134/* These sb flags are internal to the kernel */ 134/* These sb flags are internal to the kernel */
135#define MS_SUBMOUNT (1<<26)
135#define MS_NOREMOTELOCK (1<<27) 136#define MS_NOREMOTELOCK (1<<27)
136#define MS_NOSEC (1<<28) 137#define MS_NOSEC (1<<28)
137#define MS_BORN (1<<29) 138#define MS_BORN (1<<29)
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index 3af617230d1b..1a3ca79f466b 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -6,8 +6,13 @@
6#define NSIO 0xb7 6#define NSIO 0xb7
7 7
8/* Returns a file descriptor that refers to an owning user namespace */ 8/* Returns a file descriptor that refers to an owning user namespace */
9#define NS_GET_USERNS _IO(NSIO, 0x1) 9#define NS_GET_USERNS _IO(NSIO, 0x1)
10/* Returns a file descriptor that refers to a parent namespace */ 10/* Returns a file descriptor that refers to a parent namespace */
11#define NS_GET_PARENT _IO(NSIO, 0x2) 11#define NS_GET_PARENT _IO(NSIO, 0x2)
12/* Returns the type of namespace (CLONE_NEW* value) referred to by
13 file descriptor */
14#define NS_GET_NSTYPE _IO(NSIO, 0x3)
15/* Get owner UID (in the caller's user namespace) for a user namespace */
16#define NS_GET_OWNER_UID _IO(NSIO, 0x4)
12 17
13#endif /* __LINUX_NSFS_H */ 18#endif /* __LINUX_NSFS_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index 580da79e38ee..9960accbf2ab 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -607,15 +607,18 @@ static struct task_struct *find_new_reaper(struct task_struct *father,
607 return thread; 607 return thread;
608 608
609 if (father->signal->has_child_subreaper) { 609 if (father->signal->has_child_subreaper) {
610 unsigned int ns_level = task_pid(father)->level;
610 /* 611 /*
611 * Find the first ->is_child_subreaper ancestor in our pid_ns. 612 * Find the first ->is_child_subreaper ancestor in our pid_ns.
612 * We start from father to ensure we can not look into another 613 * We can't check reaper != child_reaper to ensure we do not
613 * namespace, this is safe because all its threads are dead. 614 * cross the namespaces, the exiting parent could be injected
615 * by setns() + fork().
616 * We check pid->level, this is slightly more efficient than
617 * task_active_pid_ns(reaper) != task_active_pid_ns(father).
614 */ 618 */
615 for (reaper = father; 619 for (reaper = father->real_parent;
616 !same_thread_group(reaper, child_reaper); 620 task_pid(reaper)->level == ns_level;
617 reaper = reaper->real_parent) { 621 reaper = reaper->real_parent) {
618 /* call_usermodehelper() descendants need this check */
619 if (reaper == &init_task) 622 if (reaper == &init_task)
620 break; 623 break;
621 if (!reaper->signal->is_child_subreaper) 624 if (!reaper->signal->is_child_subreaper)
diff --git a/kernel/fork.c b/kernel/fork.c
index d12fcc4db8a3..348fe73155bc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1377,9 +1377,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1377 sig->oom_score_adj = current->signal->oom_score_adj; 1377 sig->oom_score_adj = current->signal->oom_score_adj;
1378 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 1378 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1379 1379
1380 sig->has_child_subreaper = current->signal->has_child_subreaper ||
1381 current->signal->is_child_subreaper;
1382
1383 mutex_init(&sig->cred_guard_mutex); 1380 mutex_init(&sig->cred_guard_mutex);
1384 1381
1385 return 0; 1382 return 0;
@@ -1814,6 +1811,13 @@ static __latent_entropy struct task_struct *copy_process(
1814 1811
1815 p->signal->leader_pid = pid; 1812 p->signal->leader_pid = pid;
1816 p->signal->tty = tty_kref_get(current->signal->tty); 1813 p->signal->tty = tty_kref_get(current->signal->tty);
1814 /*
1815 * Inherit has_child_subreaper flag under the same
1816 * tasklist_lock with adding child to the process tree
1817 * for propagate_has_child_subreaper optimization.
1818 */
1819 p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
1820 p->real_parent->signal->is_child_subreaper;
1817 list_add_tail(&p->sibling, &p->real_parent->children); 1821 list_add_tail(&p->sibling, &p->real_parent->children);
1818 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1822 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1819 attach_pid(p, PIDTYPE_PGID); 1823 attach_pid(p, PIDTYPE_PGID);
@@ -2067,6 +2071,38 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2067} 2071}
2068#endif 2072#endif
2069 2073
2074void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
2075{
2076 struct task_struct *leader, *parent, *child;
2077 int res;
2078
2079 read_lock(&tasklist_lock);
2080 leader = top = top->group_leader;
2081down:
2082 for_each_thread(leader, parent) {
2083 list_for_each_entry(child, &parent->children, sibling) {
2084 res = visitor(child, data);
2085 if (res) {
2086 if (res < 0)
2087 goto out;
2088 leader = child;
2089 goto down;
2090 }
2091up:
2092 ;
2093 }
2094 }
2095
2096 if (leader != top) {
2097 child = leader;
2098 parent = child->real_parent;
2099 leader = parent->group_leader;
2100 goto up;
2101 }
2102out:
2103 read_unlock(&tasklist_lock);
2104}
2105
2070#ifndef ARCH_MIN_MMSTRUCT_ALIGN 2106#ifndef ARCH_MIN_MMSTRUCT_ALIGN
2071#define ARCH_MIN_MMSTRUCT_ALIGN 0 2107#define ARCH_MIN_MMSTRUCT_ALIGN 0
2072#endif 2108#endif
diff --git a/kernel/sys.c b/kernel/sys.c
index 7d4a9a6df956..b07adca97ea3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2063,6 +2063,24 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
2063} 2063}
2064#endif 2064#endif
2065 2065
2066static int propagate_has_child_subreaper(struct task_struct *p, void *data)
2067{
2068 /*
 2069 * If task has has_child_subreaper - all its descendants
 2070 * already have this flag too and new descendants will
2071 * inherit it on fork, skip them.
2072 *
2073 * If we've found child_reaper - skip descendants in
 2074 * its subtree as they will never get out of the pidns.
2075 */
2076 if (p->signal->has_child_subreaper ||
2077 is_child_reaper(task_pid(p)))
2078 return 0;
2079
2080 p->signal->has_child_subreaper = 1;
2081 return 1;
2082}
2083
2066SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, 2084SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2067 unsigned long, arg4, unsigned long, arg5) 2085 unsigned long, arg4, unsigned long, arg5)
2068{ 2086{
@@ -2214,6 +2232,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2214 break; 2232 break;
2215 case PR_SET_CHILD_SUBREAPER: 2233 case PR_SET_CHILD_SUBREAPER:
2216 me->signal->is_child_subreaper = !!arg2; 2234 me->signal->is_child_subreaper = !!arg2;
2235 if (!arg2)
2236 break;
2237
2238 walk_process_tree(me, propagate_has_child_subreaper, NULL);
2217 break; 2239 break;
2218 case PR_GET_CHILD_SUBREAPER: 2240 case PR_GET_CHILD_SUBREAPER:
2219 error = put_user(me->signal->is_child_subreaper, 2241 error = put_user(me->signal->is_child_subreaper,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d7449783987a..310f0ea0d1a2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7503,7 +7503,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
7503 ftrace_init_tracefs(tr, d_tracer); 7503 ftrace_init_tracefs(tr, d_tracer);
7504} 7504}
7505 7505
7506static struct vfsmount *trace_automount(void *ingore) 7506static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
7507{ 7507{
7508 struct vfsmount *mnt; 7508 struct vfsmount *mnt;
7509 struct file_system_type *type; 7509 struct file_system_type *type;
@@ -7516,7 +7516,7 @@ static struct vfsmount *trace_automount(void *ingore)
7516 type = get_fs_type("tracefs"); 7516 type = get_fs_type("tracefs");
7517 if (!type) 7517 if (!type)
7518 return NULL; 7518 return NULL;
7519 mnt = vfs_kern_mount(type, 0, "tracefs", NULL); 7519 mnt = vfs_submount(mntpt, type, "tracefs", NULL);
7520 put_filesystem(type); 7520 put_filesystem(type);
7521 if (IS_ERR(mnt)) 7521 if (IS_ERR(mnt))
7522 return NULL; 7522 return NULL;
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 95c6336fc2b3..8a11fc0cb459 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -57,7 +57,7 @@ static struct ctl_table_root set_root = {
57 57
58static int zero = 0; 58static int zero = 0;
59static int int_max = INT_MAX; 59static int int_max = INT_MAX;
60#define UCOUNT_ENTRY(name) \ 60#define UCOUNT_ENTRY(name) \
61 { \ 61 { \
62 .procname = name, \ 62 .procname = name, \
63 .maxlen = sizeof(int), \ 63 .maxlen = sizeof(int), \
@@ -74,6 +74,10 @@ static struct ctl_table user_table[] = {
74 UCOUNT_ENTRY("max_net_namespaces"), 74 UCOUNT_ENTRY("max_net_namespaces"),
75 UCOUNT_ENTRY("max_mnt_namespaces"), 75 UCOUNT_ENTRY("max_mnt_namespaces"),
76 UCOUNT_ENTRY("max_cgroup_namespaces"), 76 UCOUNT_ENTRY("max_cgroup_namespaces"),
77#ifdef CONFIG_INOTIFY_USER
78 UCOUNT_ENTRY("max_inotify_instances"),
79 UCOUNT_ENTRY("max_inotify_watches"),
80#endif
77 { } 81 { }
78}; 82};
79#endif /* CONFIG_SYSCTL */ 83#endif /* CONFIG_SYSCTL */
diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index ef4beef06e9d..001e133a3c8c 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -471,7 +471,7 @@ int apparmor_bprm_set_creds(struct linux_binprm *bprm)
471 ; 471 ;
472 } 472 }
473 473
474 if (bprm->unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) { 474 if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
475 error = may_change_ptraced_domain(new_profile); 475 error = may_change_ptraced_domain(new_profile);
476 if (error) 476 if (error)
477 goto audit; 477 goto audit;
diff --git a/security/commoncap.c b/security/commoncap.c
index 6d4d586b9356..78b37838a2d3 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -548,9 +548,10 @@ skip:
548 548
549 if ((is_setid || 549 if ((is_setid ||
550 !cap_issubset(new->cap_permitted, old->cap_permitted)) && 550 !cap_issubset(new->cap_permitted, old->cap_permitted)) &&
551 bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) { 551 ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) ||
552 !ptracer_capable(current, new->user_ns))) {
552 /* downgrade; they get no more than they had, and maybe less */ 553 /* downgrade; they get no more than they had, and maybe less */
553 if (!capable(CAP_SETUID) || 554 if (!ns_capable(new->user_ns, CAP_SETUID) ||
554 (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) { 555 (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
555 new->euid = new->uid; 556 new->euid = new->uid;
556 new->egid = new->gid; 557 new->egid = new->gid;
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index e6b1b7410321..9a8f12f8d5b7 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2399,8 +2399,7 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm)
2399 2399
2400 /* Make sure that anyone attempting to ptrace over a task that 2400 /* Make sure that anyone attempting to ptrace over a task that
2401 * changes its SID has the appropriate permit */ 2401 * changes its SID has the appropriate permit */
2402 if (bprm->unsafe & 2402 if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
2403 (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) {
2404 u32 ptsid = ptrace_parent_sid(); 2403 u32 ptsid = ptrace_parent_sid();
2405 if (ptsid != 0) { 2404 if (ptsid != 0) {
2406 rc = avc_has_perm(ptsid, new_tsec->sid, 2405 rc = avc_has_perm(ptsid, new_tsec->sid,
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 60b4217b9b68..fc8fb31fc24f 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -931,7 +931,7 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm)
931 isp->smk_task != sbsp->smk_root) 931 isp->smk_task != sbsp->smk_root)
932 return 0; 932 return 0;
933 933
934 if (bprm->unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) { 934 if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
935 struct task_struct *tracer; 935 struct task_struct *tracer;
936 rc = 0; 936 rc = 0;
937 937