summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/afs/mntpt.c2
-rw-r--r--fs/autofs4/waitq.c4
-rw-r--r--fs/cifs/cifs_dfs_ref.c7
-rw-r--r--fs/debugfs/inode.c8
-rw-r--r--fs/exec.c10
-rw-r--r--fs/mount.h1
-rw-r--r--fs/namei.c9
-rw-r--r--fs/namespace.c127
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/nfs4namespace.c2
-rw-r--r--fs/notify/inotify/inotify.h17
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c6
-rw-r--r--fs/notify/inotify/inotify_user.c34
-rw-r--r--fs/nsfs.c13
-rw-r--r--fs/pnode.c61
-rw-r--r--fs/pnode.h2
-rw-r--r--fs/proc/base.c102
-rw-r--r--fs/proc/fd.c12
-rw-r--r--fs/proc/inode.c3
-rw-r--r--fs/proc/internal.h23
-rw-r--r--fs/proc/proc_sysctl.c66
-rw-r--r--fs/super.c13
-rw-r--r--include/linux/debugfs.h3
-rw-r--r--include/linux/fsnotify_backend.h3
-rw-r--r--include/linux/mount.h3
-rw-r--r--include/linux/sched.h7
-rw-r--r--include/linux/security.h3
-rw-r--r--include/linux/sysctl.h1
-rw-r--r--include/linux/user_namespace.h4
-rw-r--r--include/uapi/linux/fs.h1
-rw-r--r--include/uapi/linux/nsfs.h9
-rw-r--r--kernel/exit.c13
-rw-r--r--kernel/fork.c42
-rw-r--r--kernel/sys.c22
-rw-r--r--kernel/trace/trace.c4
-rw-r--r--kernel/ucount.c6
-rw-r--r--security/apparmor/domain.c2
-rw-r--r--security/commoncap.c5
-rw-r--r--security/selinux/hooks.c3
-rw-r--r--security/smack/smack_lsm.c2
40 files changed, 431 insertions, 226 deletions
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 81dd075356b9..d4fb0afc0097 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -202,7 +202,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
202 202
203 /* try and do the mount */ 203 /* try and do the mount */
204 _debug("--- attempting mount %s -o %s ---", devname, options); 204 _debug("--- attempting mount %s -o %s ---", devname, options);
205 mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options); 205 mnt = vfs_submount(mntpt, &afs_fs_type, devname, options);
206 _debug("--- mount result %p ---", mnt); 206 _debug("--- mount result %p ---", mnt);
207 207
208 free_page((unsigned long) devname); 208 free_page((unsigned long) devname);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 1278335ce366..79fbd85db4ba 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -436,8 +436,8 @@ int autofs4_wait(struct autofs_sb_info *sbi,
436 memcpy(&wq->name, &qstr, sizeof(struct qstr)); 436 memcpy(&wq->name, &qstr, sizeof(struct qstr));
437 wq->dev = autofs4_get_dev(sbi); 437 wq->dev = autofs4_get_dev(sbi);
438 wq->ino = autofs4_get_ino(sbi); 438 wq->ino = autofs4_get_ino(sbi);
439 wq->uid = current_real_cred()->uid; 439 wq->uid = current_cred()->uid;
440 wq->gid = current_real_cred()->gid; 440 wq->gid = current_cred()->gid;
441 wq->pid = pid; 441 wq->pid = pid;
442 wq->tgid = tgid; 442 wq->tgid = tgid;
443 wq->status = -EINTR; /* Status return if interrupted */ 443 wq->status = -EINTR; /* Status return if interrupted */
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index ec9dbbcca3b9..9156be545b0f 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -245,7 +245,8 @@ compose_mount_options_err:
245 * @fullpath: full path in UNC format 245 * @fullpath: full path in UNC format
246 * @ref: server's referral 246 * @ref: server's referral
247 */ 247 */
248static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, 248static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt,
249 struct cifs_sb_info *cifs_sb,
249 const char *fullpath, const struct dfs_info3_param *ref) 250 const char *fullpath, const struct dfs_info3_param *ref)
250{ 251{
251 struct vfsmount *mnt; 252 struct vfsmount *mnt;
@@ -259,7 +260,7 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
259 if (IS_ERR(mountdata)) 260 if (IS_ERR(mountdata))
260 return (struct vfsmount *)mountdata; 261 return (struct vfsmount *)mountdata;
261 262
262 mnt = vfs_kern_mount(&cifs_fs_type, 0, devname, mountdata); 263 mnt = vfs_submount(mntpt, &cifs_fs_type, devname, mountdata);
263 kfree(mountdata); 264 kfree(mountdata);
264 kfree(devname); 265 kfree(devname);
265 return mnt; 266 return mnt;
@@ -334,7 +335,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
334 mnt = ERR_PTR(-EINVAL); 335 mnt = ERR_PTR(-EINVAL);
335 break; 336 break;
336 } 337 }
337 mnt = cifs_dfs_do_refmount(cifs_sb, 338 mnt = cifs_dfs_do_refmount(mntpt, cifs_sb,
338 full_path, referrals + i); 339 full_path, referrals + i);
339 cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n", 340 cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n",
340 __func__, referrals[i].node_name, mnt); 341 __func__, referrals[i].node_name, mnt);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 7fb1732a3630..7fd4ec4bb214 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -187,9 +187,9 @@ static const struct super_operations debugfs_super_operations = {
187 187
188static struct vfsmount *debugfs_automount(struct path *path) 188static struct vfsmount *debugfs_automount(struct path *path)
189{ 189{
190 struct vfsmount *(*f)(void *); 190 debugfs_automount_t f;
191 f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata; 191 f = (debugfs_automount_t)path->dentry->d_fsdata;
192 return f(d_inode(path->dentry)->i_private); 192 return f(path->dentry, d_inode(path->dentry)->i_private);
193} 193}
194 194
195static const struct dentry_operations debugfs_dops = { 195static const struct dentry_operations debugfs_dops = {
@@ -540,7 +540,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir);
540 */ 540 */
541struct dentry *debugfs_create_automount(const char *name, 541struct dentry *debugfs_create_automount(const char *name,
542 struct dentry *parent, 542 struct dentry *parent,
543 struct vfsmount *(*f)(void *), 543 debugfs_automount_t f,
544 void *data) 544 void *data)
545{ 545{
546 struct dentry *dentry = start_creating(name, parent); 546 struct dentry *dentry = start_creating(name, parent);
diff --git a/fs/exec.c b/fs/exec.c
index e57946610733..698a86094f76 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1426,12 +1426,8 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
1426 struct task_struct *p = current, *t; 1426 struct task_struct *p = current, *t;
1427 unsigned n_fs; 1427 unsigned n_fs;
1428 1428
1429 if (p->ptrace) { 1429 if (p->ptrace)
1430 if (ptracer_capable(p, current_user_ns())) 1430 bprm->unsafe |= LSM_UNSAFE_PTRACE;
1431 bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
1432 else
1433 bprm->unsafe |= LSM_UNSAFE_PTRACE;
1434 }
1435 1431
1436 /* 1432 /*
1437 * This isn't strictly necessary, but it makes it harder for LSMs to 1433 * This isn't strictly necessary, but it makes it harder for LSMs to
@@ -1479,7 +1475,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm)
1479 if (task_no_new_privs(current)) 1475 if (task_no_new_privs(current))
1480 return; 1476 return;
1481 1477
1482 inode = file_inode(bprm->file); 1478 inode = bprm->file->f_path.dentry->d_inode;
1483 mode = READ_ONCE(inode->i_mode); 1479 mode = READ_ONCE(inode->i_mode);
1484 if (!(mode & (S_ISUID|S_ISGID))) 1480 if (!(mode & (S_ISUID|S_ISGID)))
1485 return; 1481 return;
diff --git a/fs/mount.h b/fs/mount.h
index 2c856fc47ae3..2826543a131d 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -89,7 +89,6 @@ static inline int is_mounted(struct vfsmount *mnt)
89} 89}
90 90
91extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); 91extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
92extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
93 92
94extern int __legitimize_mnt(struct vfsmount *, unsigned); 93extern int __legitimize_mnt(struct vfsmount *, unsigned);
95extern bool legitimize_mnt(struct vfsmount *, unsigned); 94extern bool legitimize_mnt(struct vfsmount *, unsigned);
diff --git a/fs/namei.c b/fs/namei.c
index ad74877e1442..da689c9c005e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1100,7 +1100,6 @@ static int follow_automount(struct path *path, struct nameidata *nd,
1100 bool *need_mntput) 1100 bool *need_mntput)
1101{ 1101{
1102 struct vfsmount *mnt; 1102 struct vfsmount *mnt;
1103 const struct cred *old_cred;
1104 int err; 1103 int err;
1105 1104
1106 if (!path->dentry->d_op || !path->dentry->d_op->d_automount) 1105 if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
@@ -1129,9 +1128,7 @@ static int follow_automount(struct path *path, struct nameidata *nd,
1129 if (nd->total_link_count >= 40) 1128 if (nd->total_link_count >= 40)
1130 return -ELOOP; 1129 return -ELOOP;
1131 1130
1132 old_cred = override_creds(&init_cred);
1133 mnt = path->dentry->d_op->d_automount(path); 1131 mnt = path->dentry->d_op->d_automount(path);
1134 revert_creds(old_cred);
1135 if (IS_ERR(mnt)) { 1132 if (IS_ERR(mnt)) {
1136 /* 1133 /*
1137 * The filesystem is allowed to return -EISDIR here to indicate 1134 * The filesystem is allowed to return -EISDIR here to indicate
@@ -2941,10 +2938,16 @@ static inline int open_to_namei_flags(int flag)
2941 2938
2942static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode) 2939static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
2943{ 2940{
2941 struct user_namespace *s_user_ns;
2944 int error = security_path_mknod(dir, dentry, mode, 0); 2942 int error = security_path_mknod(dir, dentry, mode, 0);
2945 if (error) 2943 if (error)
2946 return error; 2944 return error;
2947 2945
2946 s_user_ns = dir->dentry->d_sb->s_user_ns;
2947 if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
2948 !kgid_has_mapping(s_user_ns, current_fsgid()))
2949 return -EOVERFLOW;
2950
2948 error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); 2951 error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
2949 if (error) 2952 if (error)
2950 return error; 2953 return error;
diff --git a/fs/namespace.c b/fs/namespace.c
index 487ba30bb5c6..8bfad42c1ccf 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -637,28 +637,6 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
637} 637}
638 638
639/* 639/*
640 * find the last mount at @dentry on vfsmount @mnt.
641 * mount_lock must be held.
642 */
643struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
644{
645 struct mount *p, *res = NULL;
646 p = __lookup_mnt(mnt, dentry);
647 if (!p)
648 goto out;
649 if (!(p->mnt.mnt_flags & MNT_UMOUNT))
650 res = p;
651 hlist_for_each_entry_continue(p, mnt_hash) {
652 if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
653 break;
654 if (!(p->mnt.mnt_flags & MNT_UMOUNT))
655 res = p;
656 }
657out:
658 return res;
659}
660
661/*
662 * lookup_mnt - Return the first child mount mounted at path 640 * lookup_mnt - Return the first child mount mounted at path
663 * 641 *
664 * "First" means first mounted chronologically. If you create the 642 * "First" means first mounted chronologically. If you create the
@@ -878,6 +856,13 @@ void mnt_set_mountpoint(struct mount *mnt,
878 hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); 856 hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
879} 857}
880 858
859static void __attach_mnt(struct mount *mnt, struct mount *parent)
860{
861 hlist_add_head_rcu(&mnt->mnt_hash,
862 m_hash(&parent->mnt, mnt->mnt_mountpoint));
863 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
864}
865
881/* 866/*
882 * vfsmount lock must be held for write 867 * vfsmount lock must be held for write
883 */ 868 */
@@ -886,28 +871,45 @@ static void attach_mnt(struct mount *mnt,
886 struct mountpoint *mp) 871 struct mountpoint *mp)
887{ 872{
888 mnt_set_mountpoint(parent, mp, mnt); 873 mnt_set_mountpoint(parent, mp, mnt);
889 hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); 874 __attach_mnt(mnt, parent);
890 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
891} 875}
892 876
893static void attach_shadowed(struct mount *mnt, 877void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
894 struct mount *parent,
895 struct mount *shadows)
896{ 878{
897 if (shadows) { 879 struct mountpoint *old_mp = mnt->mnt_mp;
898 hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); 880 struct dentry *old_mountpoint = mnt->mnt_mountpoint;
899 list_add(&mnt->mnt_child, &shadows->mnt_child); 881 struct mount *old_parent = mnt->mnt_parent;
900 } else { 882
901 hlist_add_head_rcu(&mnt->mnt_hash, 883 list_del_init(&mnt->mnt_child);
902 m_hash(&parent->mnt, mnt->mnt_mountpoint)); 884 hlist_del_init(&mnt->mnt_mp_list);
903 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); 885 hlist_del_init_rcu(&mnt->mnt_hash);
904 } 886
887 attach_mnt(mnt, parent, mp);
888
889 put_mountpoint(old_mp);
890
891 /*
892 * Safely avoid even the suggestion this code might sleep or
893 * lock the mount hash by taking advantage of the knowledge that
894 * mnt_change_mountpoint will not release the final reference
895 * to a mountpoint.
896 *
897 * During mounting, the mount passed in as the parent mount will
898 * continue to use the old mountpoint and during unmounting, the
899 * old mountpoint will continue to exist until namespace_unlock,
900 * which happens well after mnt_change_mountpoint.
901 */
902 spin_lock(&old_mountpoint->d_lock);
903 old_mountpoint->d_lockref.count--;
904 spin_unlock(&old_mountpoint->d_lock);
905
906 mnt_add_count(old_parent, -1);
905} 907}
906 908
907/* 909/*
908 * vfsmount lock must be held for write 910 * vfsmount lock must be held for write
909 */ 911 */
910static void commit_tree(struct mount *mnt, struct mount *shadows) 912static void commit_tree(struct mount *mnt)
911{ 913{
912 struct mount *parent = mnt->mnt_parent; 914 struct mount *parent = mnt->mnt_parent;
913 struct mount *m; 915 struct mount *m;
@@ -925,7 +927,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
925 n->mounts += n->pending_mounts; 927 n->mounts += n->pending_mounts;
926 n->pending_mounts = 0; 928 n->pending_mounts = 0;
927 929
928 attach_shadowed(mnt, parent, shadows); 930 __attach_mnt(mnt, parent);
929 touch_mnt_namespace(n); 931 touch_mnt_namespace(n);
930} 932}
931 933
@@ -989,6 +991,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
989} 991}
990EXPORT_SYMBOL_GPL(vfs_kern_mount); 992EXPORT_SYMBOL_GPL(vfs_kern_mount);
991 993
994struct vfsmount *
995vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
996 const char *name, void *data)
997{
998 /* Until it is worked out how to pass the user namespace
999 * through from the parent mount to the submount don't support
1000 * unprivileged mounts with submounts.
1001 */
1002 if (mountpoint->d_sb->s_user_ns != &init_user_ns)
1003 return ERR_PTR(-EPERM);
1004
1005 return vfs_kern_mount(type, MS_SUBMOUNT, name, data);
1006}
1007EXPORT_SYMBOL_GPL(vfs_submount);
1008
992static struct mount *clone_mnt(struct mount *old, struct dentry *root, 1009static struct mount *clone_mnt(struct mount *old, struct dentry *root,
993 int flag) 1010 int flag)
994{ 1011{
@@ -1764,7 +1781,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1764 continue; 1781 continue;
1765 1782
1766 for (s = r; s; s = next_mnt(s, r)) { 1783 for (s = r; s; s = next_mnt(s, r)) {
1767 struct mount *t = NULL;
1768 if (!(flag & CL_COPY_UNBINDABLE) && 1784 if (!(flag & CL_COPY_UNBINDABLE) &&
1769 IS_MNT_UNBINDABLE(s)) { 1785 IS_MNT_UNBINDABLE(s)) {
1770 s = skip_mnt_tree(s); 1786 s = skip_mnt_tree(s);
@@ -1786,14 +1802,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1786 goto out; 1802 goto out;
1787 lock_mount_hash(); 1803 lock_mount_hash();
1788 list_add_tail(&q->mnt_list, &res->mnt_list); 1804 list_add_tail(&q->mnt_list, &res->mnt_list);
1789 mnt_set_mountpoint(parent, p->mnt_mp, q); 1805 attach_mnt(q, parent, p->mnt_mp);
1790 if (!list_empty(&parent->mnt_mounts)) {
1791 t = list_last_entry(&parent->mnt_mounts,
1792 struct mount, mnt_child);
1793 if (t->mnt_mp != p->mnt_mp)
1794 t = NULL;
1795 }
1796 attach_shadowed(q, parent, t);
1797 unlock_mount_hash(); 1806 unlock_mount_hash();
1798 } 1807 }
1799 } 1808 }
@@ -1992,10 +2001,18 @@ static int attach_recursive_mnt(struct mount *source_mnt,
1992{ 2001{
1993 HLIST_HEAD(tree_list); 2002 HLIST_HEAD(tree_list);
1994 struct mnt_namespace *ns = dest_mnt->mnt_ns; 2003 struct mnt_namespace *ns = dest_mnt->mnt_ns;
2004 struct mountpoint *smp;
1995 struct mount *child, *p; 2005 struct mount *child, *p;
1996 struct hlist_node *n; 2006 struct hlist_node *n;
1997 int err; 2007 int err;
1998 2008
2009 /* Preallocate a mountpoint in case the new mounts need
2010 * to be tucked under other mounts.
2011 */
2012 smp = get_mountpoint(source_mnt->mnt.mnt_root);
2013 if (IS_ERR(smp))
2014 return PTR_ERR(smp);
2015
1999 /* Is there space to add these mounts to the mount namespace? */ 2016 /* Is there space to add these mounts to the mount namespace? */
2000 if (!parent_path) { 2017 if (!parent_path) {
2001 err = count_mounts(ns, source_mnt); 2018 err = count_mounts(ns, source_mnt);
@@ -2022,16 +2039,19 @@ static int attach_recursive_mnt(struct mount *source_mnt,
2022 touch_mnt_namespace(source_mnt->mnt_ns); 2039 touch_mnt_namespace(source_mnt->mnt_ns);
2023 } else { 2040 } else {
2024 mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); 2041 mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
2025 commit_tree(source_mnt, NULL); 2042 commit_tree(source_mnt);
2026 } 2043 }
2027 2044
2028 hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { 2045 hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
2029 struct mount *q; 2046 struct mount *q;
2030 hlist_del_init(&child->mnt_hash); 2047 hlist_del_init(&child->mnt_hash);
2031 q = __lookup_mnt_last(&child->mnt_parent->mnt, 2048 q = __lookup_mnt(&child->mnt_parent->mnt,
2032 child->mnt_mountpoint); 2049 child->mnt_mountpoint);
2033 commit_tree(child, q); 2050 if (q)
2051 mnt_change_mountpoint(child, smp, q);
2052 commit_tree(child);
2034 } 2053 }
2054 put_mountpoint(smp);
2035 unlock_mount_hash(); 2055 unlock_mount_hash();
2036 2056
2037 return 0; 2057 return 0;
@@ -2046,6 +2066,11 @@ static int attach_recursive_mnt(struct mount *source_mnt,
2046 cleanup_group_ids(source_mnt, NULL); 2066 cleanup_group_ids(source_mnt, NULL);
2047 out: 2067 out:
2048 ns->pending_mounts = 0; 2068 ns->pending_mounts = 0;
2069
2070 read_seqlock_excl(&mount_lock);
2071 put_mountpoint(smp);
2072 read_sequnlock_excl(&mount_lock);
2073
2049 return err; 2074 return err;
2050} 2075}
2051 2076
@@ -2794,7 +2819,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
2794 2819
2795 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | 2820 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
2796 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | 2821 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
2797 MS_STRICTATIME | MS_NOREMOTELOCK); 2822 MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT);
2798 2823
2799 if (flags & MS_REMOUNT) 2824 if (flags & MS_REMOUNT)
2800 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, 2825 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 5551e8ef67fd..e49d831c4e85 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -226,7 +226,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
226 const char *devname, 226 const char *devname,
227 struct nfs_clone_mount *mountdata) 227 struct nfs_clone_mount *mountdata)
228{ 228{
229 return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); 229 return vfs_submount(mountdata->dentry, &nfs_xdev_fs_type, devname, mountdata);
230} 230}
231 231
232/** 232/**
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index d21104912676..d8b040bd9814 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -279,7 +279,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
279 mountdata->hostname, 279 mountdata->hostname,
280 mountdata->mnt_path); 280 mountdata->mnt_path);
281 281
282 mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata); 282 mnt = vfs_submount(mountdata->dentry, &nfs4_referral_fs_type, page, mountdata);
283 if (!IS_ERR(mnt)) 283 if (!IS_ERR(mnt))
284 break; 284 break;
285 } 285 }
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index a6f5907a3fee..7c461fd49c4c 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -30,3 +30,20 @@ extern int inotify_handle_event(struct fsnotify_group *group,
30 const unsigned char *file_name, u32 cookie); 30 const unsigned char *file_name, u32 cookie);
31 31
32extern const struct fsnotify_ops inotify_fsnotify_ops; 32extern const struct fsnotify_ops inotify_fsnotify_ops;
33
34#ifdef CONFIG_INOTIFY_USER
35static inline void dec_inotify_instances(struct ucounts *ucounts)
36{
37 dec_ucount(ucounts, UCOUNT_INOTIFY_INSTANCES);
38}
39
40static inline struct ucounts *inc_inotify_watches(struct ucounts *ucounts)
41{
42 return inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_INOTIFY_WATCHES);
43}
44
45static inline void dec_inotify_watches(struct ucounts *ucounts)
46{
47 dec_ucount(ucounts, UCOUNT_INOTIFY_WATCHES);
48}
49#endif
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 19e7ec109a75..f36c29398de3 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -165,10 +165,8 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
165 /* ideally the idr is empty and we won't hit the BUG in the callback */ 165 /* ideally the idr is empty and we won't hit the BUG in the callback */
166 idr_for_each(&group->inotify_data.idr, idr_callback, group); 166 idr_for_each(&group->inotify_data.idr, idr_callback, group);
167 idr_destroy(&group->inotify_data.idr); 167 idr_destroy(&group->inotify_data.idr);
168 if (group->inotify_data.user) { 168 if (group->inotify_data.ucounts)
169 atomic_dec(&group->inotify_data.user->inotify_devs); 169 dec_inotify_instances(group->inotify_data.ucounts);
170 free_uid(group->inotify_data.user);
171 }
172} 170}
173 171
174static void inotify_free_event(struct fsnotify_event *fsn_event) 172static void inotify_free_event(struct fsnotify_event *fsn_event)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 69d1ea3d292a..1cf41c623be1 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -44,10 +44,8 @@
44 44
45#include <asm/ioctls.h> 45#include <asm/ioctls.h>
46 46
47/* these are configurable via /proc/sys/fs/inotify/ */ 47/* configurable via /proc/sys/fs/inotify/ */
48static int inotify_max_user_instances __read_mostly;
49static int inotify_max_queued_events __read_mostly; 48static int inotify_max_queued_events __read_mostly;
50static int inotify_max_user_watches __read_mostly;
51 49
52static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; 50static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
53 51
@@ -60,7 +58,7 @@ static int zero;
60struct ctl_table inotify_table[] = { 58struct ctl_table inotify_table[] = {
61 { 59 {
62 .procname = "max_user_instances", 60 .procname = "max_user_instances",
63 .data = &inotify_max_user_instances, 61 .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES],
64 .maxlen = sizeof(int), 62 .maxlen = sizeof(int),
65 .mode = 0644, 63 .mode = 0644,
66 .proc_handler = proc_dointvec_minmax, 64 .proc_handler = proc_dointvec_minmax,
@@ -68,7 +66,7 @@ struct ctl_table inotify_table[] = {
68 }, 66 },
69 { 67 {
70 .procname = "max_user_watches", 68 .procname = "max_user_watches",
71 .data = &inotify_max_user_watches, 69 .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES],
72 .maxlen = sizeof(int), 70 .maxlen = sizeof(int),
73 .mode = 0644, 71 .mode = 0644,
74 .proc_handler = proc_dointvec_minmax, 72 .proc_handler = proc_dointvec_minmax,
@@ -500,7 +498,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
500 /* remove this mark from the idr */ 498 /* remove this mark from the idr */
501 inotify_remove_from_idr(group, i_mark); 499 inotify_remove_from_idr(group, i_mark);
502 500
503 atomic_dec(&group->inotify_data.user->inotify_watches); 501 dec_inotify_watches(group->inotify_data.ucounts);
504} 502}
505 503
506/* ding dong the mark is dead */ 504/* ding dong the mark is dead */
@@ -584,14 +582,17 @@ static int inotify_new_watch(struct fsnotify_group *group,
584 tmp_i_mark->fsn_mark.mask = mask; 582 tmp_i_mark->fsn_mark.mask = mask;
585 tmp_i_mark->wd = -1; 583 tmp_i_mark->wd = -1;
586 584
587 ret = -ENOSPC;
588 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
589 goto out_err;
590
591 ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark); 585 ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark);
592 if (ret) 586 if (ret)
593 goto out_err; 587 goto out_err;
594 588
589 /* increment the number of watches the user has */
590 if (!inc_inotify_watches(group->inotify_data.ucounts)) {
591 inotify_remove_from_idr(group, tmp_i_mark);
592 ret = -ENOSPC;
593 goto out_err;
594 }
595
595 /* we are on the idr, now get on the inode */ 596 /* we are on the idr, now get on the inode */
596 ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode, 597 ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
597 NULL, 0); 598 NULL, 0);
@@ -601,8 +602,6 @@ static int inotify_new_watch(struct fsnotify_group *group,
601 goto out_err; 602 goto out_err;
602 } 603 }
603 604
604 /* increment the number of watches the user has */
605 atomic_inc(&group->inotify_data.user->inotify_watches);
606 605
607 /* return the watch descriptor for this new mark */ 606 /* return the watch descriptor for this new mark */
608 ret = tmp_i_mark->wd; 607 ret = tmp_i_mark->wd;
@@ -653,10 +652,11 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
653 652
654 spin_lock_init(&group->inotify_data.idr_lock); 653 spin_lock_init(&group->inotify_data.idr_lock);
655 idr_init(&group->inotify_data.idr); 654 idr_init(&group->inotify_data.idr);
656 group->inotify_data.user = get_current_user(); 655 group->inotify_data.ucounts = inc_ucount(current_user_ns(),
656 current_euid(),
657 UCOUNT_INOTIFY_INSTANCES);
657 658
658 if (atomic_inc_return(&group->inotify_data.user->inotify_devs) > 659 if (!group->inotify_data.ucounts) {
659 inotify_max_user_instances) {
660 fsnotify_destroy_group(group); 660 fsnotify_destroy_group(group);
661 return ERR_PTR(-EMFILE); 661 return ERR_PTR(-EMFILE);
662 } 662 }
@@ -819,8 +819,8 @@ static int __init inotify_user_setup(void)
819 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); 819 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
820 820
821 inotify_max_queued_events = 16384; 821 inotify_max_queued_events = 16384;
822 inotify_max_user_instances = 128; 822 init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128;
823 inotify_max_user_watches = 8192; 823 init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = 8192;
824 824
825 return 0; 825 return 0;
826} 826}
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 8c9fb29c6673..1656843e87d2 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -7,6 +7,7 @@
7#include <linux/seq_file.h> 7#include <linux/seq_file.h>
8#include <linux/user_namespace.h> 8#include <linux/user_namespace.h>
9#include <linux/nsfs.h> 9#include <linux/nsfs.h>
10#include <linux/uaccess.h>
10 11
11static struct vfsmount *nsfs_mnt; 12static struct vfsmount *nsfs_mnt;
12 13
@@ -163,7 +164,10 @@ int open_related_ns(struct ns_common *ns,
163static long ns_ioctl(struct file *filp, unsigned int ioctl, 164static long ns_ioctl(struct file *filp, unsigned int ioctl,
164 unsigned long arg) 165 unsigned long arg)
165{ 166{
167 struct user_namespace *user_ns;
166 struct ns_common *ns = get_proc_ns(file_inode(filp)); 168 struct ns_common *ns = get_proc_ns(file_inode(filp));
169 uid_t __user *argp;
170 uid_t uid;
167 171
168 switch (ioctl) { 172 switch (ioctl) {
169 case NS_GET_USERNS: 173 case NS_GET_USERNS:
@@ -172,6 +176,15 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
172 if (!ns->ops->get_parent) 176 if (!ns->ops->get_parent)
173 return -EINVAL; 177 return -EINVAL;
174 return open_related_ns(ns, ns->ops->get_parent); 178 return open_related_ns(ns, ns->ops->get_parent);
179 case NS_GET_NSTYPE:
180 return ns->ops->type;
181 case NS_GET_OWNER_UID:
182 if (ns->ops->type != CLONE_NEWUSER)
183 return -EINVAL;
184 user_ns = container_of(ns, struct user_namespace, ns);
185 argp = (uid_t __user *) arg;
186 uid = from_kuid_munged(current_user_ns(), user_ns->owner);
187 return put_user(uid, argp);
175 default: 188 default:
176 return -ENOTTY; 189 return -ENOTTY;
177 } 190 }
diff --git a/fs/pnode.c b/fs/pnode.c
index 06a793f4ae38..5bc7896d122a 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -322,6 +322,21 @@ out:
322 return ret; 322 return ret;
323} 323}
324 324
325static struct mount *find_topper(struct mount *mnt)
326{
327 /* If there is exactly one mount covering mnt completely return it. */
328 struct mount *child;
329
330 if (!list_is_singular(&mnt->mnt_mounts))
331 return NULL;
332
333 child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child);
334 if (child->mnt_mountpoint != mnt->mnt.mnt_root)
335 return NULL;
336
337 return child;
338}
339
325/* 340/*
326 * return true if the refcount is greater than count 341 * return true if the refcount is greater than count
327 */ 342 */
@@ -342,9 +357,8 @@ static inline int do_refcount_check(struct mount *mnt, int count)
342 */ 357 */
343int propagate_mount_busy(struct mount *mnt, int refcnt) 358int propagate_mount_busy(struct mount *mnt, int refcnt)
344{ 359{
345 struct mount *m, *child; 360 struct mount *m, *child, *topper;
346 struct mount *parent = mnt->mnt_parent; 361 struct mount *parent = mnt->mnt_parent;
347 int ret = 0;
348 362
349 if (mnt == parent) 363 if (mnt == parent)
350 return do_refcount_check(mnt, refcnt); 364 return do_refcount_check(mnt, refcnt);
@@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
359 373
360 for (m = propagation_next(parent, parent); m; 374 for (m = propagation_next(parent, parent); m;
361 m = propagation_next(m, parent)) { 375 m = propagation_next(m, parent)) {
362 child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); 376 int count = 1;
363 if (child && list_empty(&child->mnt_mounts) && 377 child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
364 (ret = do_refcount_check(child, 1))) 378 if (!child)
365 break; 379 continue;
380
381 /* Is there exactly one mount on the child that covers
382 * it completely whose reference should be ignored?
383 */
384 topper = find_topper(child);
385 if (topper)
386 count += 1;
387 else if (!list_empty(&child->mnt_mounts))
388 continue;
389
390 if (do_refcount_check(child, count))
391 return 1;
366 } 392 }
367 return ret; 393 return 0;
368} 394}
369 395
370/* 396/*
@@ -381,7 +407,7 @@ void propagate_mount_unlock(struct mount *mnt)
381 407
382 for (m = propagation_next(parent, parent); m; 408 for (m = propagation_next(parent, parent); m;
383 m = propagation_next(m, parent)) { 409 m = propagation_next(m, parent)) {
384 child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); 410 child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
385 if (child) 411 if (child)
386 child->mnt.mnt_flags &= ~MNT_LOCKED; 412 child->mnt.mnt_flags &= ~MNT_LOCKED;
387 } 413 }
@@ -399,9 +425,11 @@ static void mark_umount_candidates(struct mount *mnt)
399 425
400 for (m = propagation_next(parent, parent); m; 426 for (m = propagation_next(parent, parent); m;
401 m = propagation_next(m, parent)) { 427 m = propagation_next(m, parent)) {
402 struct mount *child = __lookup_mnt_last(&m->mnt, 428 struct mount *child = __lookup_mnt(&m->mnt,
403 mnt->mnt_mountpoint); 429 mnt->mnt_mountpoint);
404 if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) { 430 if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
431 continue;
432 if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
405 SET_MNT_MARK(child); 433 SET_MNT_MARK(child);
406 } 434 }
407 } 435 }
@@ -420,8 +448,8 @@ static void __propagate_umount(struct mount *mnt)
420 448
421 for (m = propagation_next(parent, parent); m; 449 for (m = propagation_next(parent, parent); m;
422 m = propagation_next(m, parent)) { 450 m = propagation_next(m, parent)) {
423 451 struct mount *topper;
424 struct mount *child = __lookup_mnt_last(&m->mnt, 452 struct mount *child = __lookup_mnt(&m->mnt,
425 mnt->mnt_mountpoint); 453 mnt->mnt_mountpoint);
426 /* 454 /*
427 * umount the child only if the child has no children 455 * umount the child only if the child has no children
@@ -430,6 +458,15 @@ static void __propagate_umount(struct mount *mnt)
430 if (!child || !IS_MNT_MARKED(child)) 458 if (!child || !IS_MNT_MARKED(child))
431 continue; 459 continue;
432 CLEAR_MNT_MARK(child); 460 CLEAR_MNT_MARK(child);
461
462 /* If there is exactly one mount covering all of child
463 * replace child with that mount.
464 */
465 topper = find_topper(child);
466 if (topper)
467 mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
468 topper);
469
433 if (list_empty(&child->mnt_mounts)) { 470 if (list_empty(&child->mnt_mounts)) {
434 list_del_init(&child->mnt_child); 471 list_del_init(&child->mnt_child);
435 child->mnt.mnt_flags |= MNT_UMOUNT; 472 child->mnt.mnt_flags |= MNT_UMOUNT;
diff --git a/fs/pnode.h b/fs/pnode.h
index 550f5a8b4fcf..dc87e65becd2 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -49,6 +49,8 @@ int get_dominating_id(struct mount *mnt, const struct path *root);
49unsigned int mnt_get_count(struct mount *mnt); 49unsigned int mnt_get_count(struct mount *mnt);
50void mnt_set_mountpoint(struct mount *, struct mountpoint *, 50void mnt_set_mountpoint(struct mount *, struct mountpoint *,
51 struct mount *); 51 struct mount *);
52void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
53 struct mount *mnt);
52struct mount *copy_tree(struct mount *, struct dentry *, int); 54struct mount *copy_tree(struct mount *, struct dentry *, int);
53bool is_path_reachable(struct mount *, struct dentry *, 55bool is_path_reachable(struct mount *, struct dentry *,
54 const struct path *root); 56 const struct path *root);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3d773eb9e144..b73b4de8fb36 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1667,12 +1667,63 @@ const struct inode_operations proc_pid_link_inode_operations = {
1667 1667
1668/* building an inode */ 1668/* building an inode */
1669 1669
1670void task_dump_owner(struct task_struct *task, mode_t mode,
1671 kuid_t *ruid, kgid_t *rgid)
1672{
1673 /* Depending on the state of dumpable compute who should own a
1674 * proc file for a task.
1675 */
1676 const struct cred *cred;
1677 kuid_t uid;
1678 kgid_t gid;
1679
1680 /* Default to the tasks effective ownership */
1681 rcu_read_lock();
1682 cred = __task_cred(task);
1683 uid = cred->euid;
1684 gid = cred->egid;
1685 rcu_read_unlock();
1686
1687 /*
1688 * Before the /proc/pid/status file was created the only way to read
1689 * the effective uid of a /process was to stat /proc/pid. Reading
1690 * /proc/pid/status is slow enough that procps and other packages
1691 * kept stating /proc/pid. To keep the rules in /proc simple I have
1692 * made this apply to all per process world readable and executable
1693 * directories.
1694 */
1695 if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
1696 struct mm_struct *mm;
1697 task_lock(task);
1698 mm = task->mm;
1699 /* Make non-dumpable tasks owned by some root */
1700 if (mm) {
1701 if (get_dumpable(mm) != SUID_DUMP_USER) {
1702 struct user_namespace *user_ns = mm->user_ns;
1703
1704 uid = make_kuid(user_ns, 0);
1705 if (!uid_valid(uid))
1706 uid = GLOBAL_ROOT_UID;
1707
1708 gid = make_kgid(user_ns, 0);
1709 if (!gid_valid(gid))
1710 gid = GLOBAL_ROOT_GID;
1711 }
1712 } else {
1713 uid = GLOBAL_ROOT_UID;
1714 gid = GLOBAL_ROOT_GID;
1715 }
1716 task_unlock(task);
1717 }
1718 *ruid = uid;
1719 *rgid = gid;
1720}
1721
1670struct inode *proc_pid_make_inode(struct super_block * sb, 1722struct inode *proc_pid_make_inode(struct super_block * sb,
1671 struct task_struct *task, umode_t mode) 1723 struct task_struct *task, umode_t mode)
1672{ 1724{
1673 struct inode * inode; 1725 struct inode * inode;
1674 struct proc_inode *ei; 1726 struct proc_inode *ei;
1675 const struct cred *cred;
1676 1727
1677 /* We need a new inode */ 1728 /* We need a new inode */
1678 1729
@@ -1694,13 +1745,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
1694 if (!ei->pid) 1745 if (!ei->pid)
1695 goto out_unlock; 1746 goto out_unlock;
1696 1747
1697 if (task_dumpable(task)) { 1748 task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
1698 rcu_read_lock();
1699 cred = __task_cred(task);
1700 inode->i_uid = cred->euid;
1701 inode->i_gid = cred->egid;
1702 rcu_read_unlock();
1703 }
1704 security_task_to_inode(task, inode); 1749 security_task_to_inode(task, inode);
1705 1750
1706out: 1751out:
@@ -1715,7 +1760,6 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1715{ 1760{
1716 struct inode *inode = d_inode(dentry); 1761 struct inode *inode = d_inode(dentry);
1717 struct task_struct *task; 1762 struct task_struct *task;
1718 const struct cred *cred;
1719 struct pid_namespace *pid = dentry->d_sb->s_fs_info; 1763 struct pid_namespace *pid = dentry->d_sb->s_fs_info;
1720 1764
1721 generic_fillattr(inode, stat); 1765 generic_fillattr(inode, stat);
@@ -1733,12 +1777,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1733 */ 1777 */
1734 return -ENOENT; 1778 return -ENOENT;
1735 } 1779 }
1736 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || 1780 task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid);
1737 task_dumpable(task)) {
1738 cred = __task_cred(task);
1739 stat->uid = cred->euid;
1740 stat->gid = cred->egid;
1741 }
1742 } 1781 }
1743 rcu_read_unlock(); 1782 rcu_read_unlock();
1744 return 0; 1783 return 0;
@@ -1754,18 +1793,11 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1754 * Rewrite the inode's ownerships here because the owning task may have 1793 * Rewrite the inode's ownerships here because the owning task may have
1755 * performed a setuid(), etc. 1794 * performed a setuid(), etc.
1756 * 1795 *
1757 * Before the /proc/pid/status file was created the only way to read
1758 * the effective uid of a /process was to stat /proc/pid. Reading
1759 * /proc/pid/status is slow enough that procps and other packages
1760 * kept stating /proc/pid. To keep the rules in /proc simple I have
1761 * made this apply to all per process world readable and executable
1762 * directories.
1763 */ 1796 */
1764int pid_revalidate(struct dentry *dentry, unsigned int flags) 1797int pid_revalidate(struct dentry *dentry, unsigned int flags)
1765{ 1798{
1766 struct inode *inode; 1799 struct inode *inode;
1767 struct task_struct *task; 1800 struct task_struct *task;
1768 const struct cred *cred;
1769 1801
1770 if (flags & LOOKUP_RCU) 1802 if (flags & LOOKUP_RCU)
1771 return -ECHILD; 1803 return -ECHILD;
@@ -1774,17 +1806,8 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
1774 task = get_proc_task(inode); 1806 task = get_proc_task(inode);
1775 1807
1776 if (task) { 1808 if (task) {
1777 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || 1809 task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
1778 task_dumpable(task)) { 1810
1779 rcu_read_lock();
1780 cred = __task_cred(task);
1781 inode->i_uid = cred->euid;
1782 inode->i_gid = cred->egid;
1783 rcu_read_unlock();
1784 } else {
1785 inode->i_uid = GLOBAL_ROOT_UID;
1786 inode->i_gid = GLOBAL_ROOT_GID;
1787 }
1788 inode->i_mode &= ~(S_ISUID | S_ISGID); 1811 inode->i_mode &= ~(S_ISUID | S_ISGID);
1789 security_task_to_inode(task, inode); 1812 security_task_to_inode(task, inode);
1790 put_task_struct(task); 1813 put_task_struct(task);
@@ -1881,7 +1904,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
1881 bool exact_vma_exists = false; 1904 bool exact_vma_exists = false;
1882 struct mm_struct *mm = NULL; 1905 struct mm_struct *mm = NULL;
1883 struct task_struct *task; 1906 struct task_struct *task;
1884 const struct cred *cred;
1885 struct inode *inode; 1907 struct inode *inode;
1886 int status = 0; 1908 int status = 0;
1887 1909
@@ -1906,16 +1928,8 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
1906 mmput(mm); 1928 mmput(mm);
1907 1929
1908 if (exact_vma_exists) { 1930 if (exact_vma_exists) {
1909 if (task_dumpable(task)) { 1931 task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
1910 rcu_read_lock(); 1932
1911 cred = __task_cred(task);
1912 inode->i_uid = cred->euid;
1913 inode->i_gid = cred->egid;
1914 rcu_read_unlock();
1915 } else {
1916 inode->i_uid = GLOBAL_ROOT_UID;
1917 inode->i_gid = GLOBAL_ROOT_GID;
1918 }
1919 security_task_to_inode(task, inode); 1933 security_task_to_inode(task, inode);
1920 status = 1; 1934 status = 1;
1921 } 1935 }
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 4274f83bf100..00ce1531b2f5 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -84,7 +84,6 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
84{ 84{
85 struct files_struct *files; 85 struct files_struct *files;
86 struct task_struct *task; 86 struct task_struct *task;
87 const struct cred *cred;
88 struct inode *inode; 87 struct inode *inode;
89 unsigned int fd; 88 unsigned int fd;
90 89
@@ -108,16 +107,7 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
108 rcu_read_unlock(); 107 rcu_read_unlock();
109 put_files_struct(files); 108 put_files_struct(files);
110 109
111 if (task_dumpable(task)) { 110 task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
112 rcu_read_lock();
113 cred = __task_cred(task);
114 inode->i_uid = cred->euid;
115 inode->i_gid = cred->egid;
116 rcu_read_unlock();
117 } else {
118 inode->i_uid = GLOBAL_ROOT_UID;
119 inode->i_gid = GLOBAL_ROOT_GID;
120 }
121 111
122 if (S_ISLNK(inode->i_mode)) { 112 if (S_ISLNK(inode->i_mode)) {
123 unsigned i_mode = S_IFLNK; 113 unsigned i_mode = S_IFLNK;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 842a5ff5b85c..7ad9ed7958af 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -43,10 +43,11 @@ static void proc_evict_inode(struct inode *inode)
43 de = PDE(inode); 43 de = PDE(inode);
44 if (de) 44 if (de)
45 pde_put(de); 45 pde_put(de);
46
46 head = PROC_I(inode)->sysctl; 47 head = PROC_I(inode)->sysctl;
47 if (head) { 48 if (head) {
48 RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); 49 RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
49 sysctl_head_put(head); 50 proc_sys_evict_inode(inode, head);
50 } 51 }
51} 52}
52 53
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 2de5194ba378..5d6960f5f1c0 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -65,6 +65,7 @@ struct proc_inode {
65 struct proc_dir_entry *pde; 65 struct proc_dir_entry *pde;
66 struct ctl_table_header *sysctl; 66 struct ctl_table_header *sysctl;
67 struct ctl_table *sysctl_entry; 67 struct ctl_table *sysctl_entry;
68 struct list_head sysctl_inodes;
68 const struct proc_ns_operations *ns_ops; 69 const struct proc_ns_operations *ns_ops;
69 struct inode vfs_inode; 70 struct inode vfs_inode;
70}; 71};
@@ -97,20 +98,8 @@ static inline struct task_struct *get_proc_task(struct inode *inode)
97 return get_pid_task(proc_pid(inode), PIDTYPE_PID); 98 return get_pid_task(proc_pid(inode), PIDTYPE_PID);
98} 99}
99 100
100static inline int task_dumpable(struct task_struct *task) 101void task_dump_owner(struct task_struct *task, mode_t mode,
101{ 102 kuid_t *ruid, kgid_t *rgid);
102 int dumpable = 0;
103 struct mm_struct *mm;
104
105 task_lock(task);
106 mm = task->mm;
107 if (mm)
108 dumpable = get_dumpable(mm);
109 task_unlock(task);
110 if (dumpable == SUID_DUMP_USER)
111 return 1;
112 return 0;
113}
114 103
115static inline unsigned name_to_int(const struct qstr *qstr) 104static inline unsigned name_to_int(const struct qstr *qstr)
116{ 105{
@@ -249,10 +238,12 @@ extern void proc_thread_self_init(void);
249 */ 238 */
250#ifdef CONFIG_PROC_SYSCTL 239#ifdef CONFIG_PROC_SYSCTL
251extern int proc_sys_init(void); 240extern int proc_sys_init(void);
252extern void sysctl_head_put(struct ctl_table_header *); 241extern void proc_sys_evict_inode(struct inode *inode,
242 struct ctl_table_header *head);
253#else 243#else
254static inline void proc_sys_init(void) { } 244static inline void proc_sys_init(void) { }
255static inline void sysctl_head_put(struct ctl_table_header *head) { } 245static inline void proc_sys_evict_inode(struct inode *inode,
246 struct ctl_table_header *head) { }
256#endif 247#endif
257 248
258/* 249/*
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index d4e37acd4821..3e64c6502dc8 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -190,6 +190,7 @@ static void init_header(struct ctl_table_header *head,
190 head->set = set; 190 head->set = set;
191 head->parent = NULL; 191 head->parent = NULL;
192 head->node = node; 192 head->node = node;
193 INIT_LIST_HEAD(&head->inodes);
193 if (node) { 194 if (node) {
194 struct ctl_table *entry; 195 struct ctl_table *entry;
195 for (entry = table; entry->procname; entry++, node++) 196 for (entry = table; entry->procname; entry++, node++)
@@ -259,6 +260,27 @@ static void unuse_table(struct ctl_table_header *p)
259 complete(p->unregistering); 260 complete(p->unregistering);
260} 261}
261 262
263/* called under sysctl_lock */
264static void proc_sys_prune_dcache(struct ctl_table_header *head)
265{
266 struct inode *inode, *prev = NULL;
267 struct proc_inode *ei;
268
269 rcu_read_lock();
270 list_for_each_entry_rcu(ei, &head->inodes, sysctl_inodes) {
271 inode = igrab(&ei->vfs_inode);
272 if (inode) {
273 rcu_read_unlock();
274 iput(prev);
275 prev = inode;
276 d_prune_aliases(inode);
277 rcu_read_lock();
278 }
279 }
280 rcu_read_unlock();
281 iput(prev);
282}
283
262/* called under sysctl_lock, will reacquire if has to wait */ 284/* called under sysctl_lock, will reacquire if has to wait */
263static void start_unregistering(struct ctl_table_header *p) 285static void start_unregistering(struct ctl_table_header *p)
264{ 286{
@@ -272,31 +294,22 @@ static void start_unregistering(struct ctl_table_header *p)
272 p->unregistering = &wait; 294 p->unregistering = &wait;
273 spin_unlock(&sysctl_lock); 295 spin_unlock(&sysctl_lock);
274 wait_for_completion(&wait); 296 wait_for_completion(&wait);
275 spin_lock(&sysctl_lock);
276 } else { 297 } else {
277 /* anything non-NULL; we'll never dereference it */ 298 /* anything non-NULL; we'll never dereference it */
278 p->unregistering = ERR_PTR(-EINVAL); 299 p->unregistering = ERR_PTR(-EINVAL);
300 spin_unlock(&sysctl_lock);
279 } 301 }
280 /* 302 /*
303 * Prune dentries for unregistered sysctls: namespaced sysctls
304 * can have duplicate names and contaminate dcache very badly.
305 */
306 proc_sys_prune_dcache(p);
307 /*
281 * do not remove from the list until nobody holds it; walking the 308 * do not remove from the list until nobody holds it; walking the
282 * list in do_sysctl() relies on that. 309 * list in do_sysctl() relies on that.
283 */ 310 */
284 erase_header(p);
285}
286
287static void sysctl_head_get(struct ctl_table_header *head)
288{
289 spin_lock(&sysctl_lock); 311 spin_lock(&sysctl_lock);
290 head->count++; 312 erase_header(p);
291 spin_unlock(&sysctl_lock);
292}
293
294void sysctl_head_put(struct ctl_table_header *head)
295{
296 spin_lock(&sysctl_lock);
297 if (!--head->count)
298 kfree_rcu(head, rcu);
299 spin_unlock(&sysctl_lock);
300} 313}
301 314
302static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) 315static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
@@ -440,10 +453,20 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
440 453
441 inode->i_ino = get_next_ino(); 454 inode->i_ino = get_next_ino();
442 455
443 sysctl_head_get(head);
444 ei = PROC_I(inode); 456 ei = PROC_I(inode);
457
458 spin_lock(&sysctl_lock);
459 if (unlikely(head->unregistering)) {
460 spin_unlock(&sysctl_lock);
461 iput(inode);
462 inode = NULL;
463 goto out;
464 }
445 ei->sysctl = head; 465 ei->sysctl = head;
446 ei->sysctl_entry = table; 466 ei->sysctl_entry = table;
467 list_add_rcu(&ei->sysctl_inodes, &head->inodes);
468 head->count++;
469 spin_unlock(&sysctl_lock);
447 470
448 inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); 471 inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
449 inode->i_mode = table->mode; 472 inode->i_mode = table->mode;
@@ -466,6 +489,15 @@ out:
466 return inode; 489 return inode;
467} 490}
468 491
492void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
493{
494 spin_lock(&sysctl_lock);
495 list_del_rcu(&PROC_I(inode)->sysctl_inodes);
496 if (!--head->count)
497 kfree_rcu(head, rcu);
498 spin_unlock(&sysctl_lock);
499}
500
469static struct ctl_table_header *grab_header(struct inode *inode) 501static struct ctl_table_header *grab_header(struct inode *inode)
470{ 502{
471 struct ctl_table_header *head = PROC_I(inode)->sysctl; 503 struct ctl_table_header *head = PROC_I(inode)->sysctl;
diff --git a/fs/super.c b/fs/super.c
index ea662b0e5e78..b8b6a086c03b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -469,7 +469,7 @@ struct super_block *sget_userns(struct file_system_type *type,
469 struct super_block *old; 469 struct super_block *old;
470 int err; 470 int err;
471 471
472 if (!(flags & MS_KERNMOUNT) && 472 if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) &&
473 !(type->fs_flags & FS_USERNS_MOUNT) && 473 !(type->fs_flags & FS_USERNS_MOUNT) &&
474 !capable(CAP_SYS_ADMIN)) 474 !capable(CAP_SYS_ADMIN))
475 return ERR_PTR(-EPERM); 475 return ERR_PTR(-EPERM);
@@ -499,7 +499,7 @@ retry:
499 } 499 }
500 if (!s) { 500 if (!s) {
501 spin_unlock(&sb_lock); 501 spin_unlock(&sb_lock);
502 s = alloc_super(type, flags, user_ns); 502 s = alloc_super(type, (flags & ~MS_SUBMOUNT), user_ns);
503 if (!s) 503 if (!s)
504 return ERR_PTR(-ENOMEM); 504 return ERR_PTR(-ENOMEM);
505 goto retry; 505 goto retry;
@@ -540,8 +540,15 @@ struct super_block *sget(struct file_system_type *type,
540{ 540{
541 struct user_namespace *user_ns = current_user_ns(); 541 struct user_namespace *user_ns = current_user_ns();
542 542
543 /* We don't yet pass the user namespace of the parent
544 * mount through to here so always use &init_user_ns
545 * until that changes.
546 */
547 if (flags & MS_SUBMOUNT)
548 user_ns = &init_user_ns;
549
543 /* Ensure the requestor has permissions over the target filesystem */ 550 /* Ensure the requestor has permissions over the target filesystem */
544 if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) 551 if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN))
545 return ERR_PTR(-EPERM); 552 return ERR_PTR(-EPERM);
546 553
547 return sget_userns(type, test, set, flags, user_ns, data); 554 return sget_userns(type, test, set, flags, user_ns, data);
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 9d571acd3a48..7dff776e6d16 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -98,9 +98,10 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent);
98struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, 98struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
99 const char *dest); 99 const char *dest);
100 100
101typedef struct vfsmount *(*debugfs_automount_t)(struct dentry *, void *);
101struct dentry *debugfs_create_automount(const char *name, 102struct dentry *debugfs_create_automount(const char *name,
102 struct dentry *parent, 103 struct dentry *parent,
103 struct vfsmount *(*f)(void *), 104 debugfs_automount_t f,
104 void *data); 105 void *data);
105 106
106void debugfs_remove(struct dentry *dentry); 107void debugfs_remove(struct dentry *dentry);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 487246546ebe..e6e689b5569e 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -16,6 +16,7 @@
16#include <linux/spinlock.h> 16#include <linux/spinlock.h>
17#include <linux/types.h> 17#include <linux/types.h>
18#include <linux/atomic.h> 18#include <linux/atomic.h>
19#include <linux/user_namespace.h>
19 20
20/* 21/*
21 * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily 22 * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily
@@ -170,7 +171,7 @@ struct fsnotify_group {
170 struct inotify_group_private_data { 171 struct inotify_group_private_data {
171 spinlock_t idr_lock; 172 spinlock_t idr_lock;
172 struct idr idr; 173 struct idr idr;
173 struct user_struct *user; 174 struct ucounts *ucounts;
174 } inotify_data; 175 } inotify_data;
175#endif 176#endif
176#ifdef CONFIG_FANOTIFY 177#ifdef CONFIG_FANOTIFY
diff --git a/include/linux/mount.h b/include/linux/mount.h
index c6f55158d5e5..8e0352af06b7 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -90,6 +90,9 @@ struct file_system_type;
90extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, 90extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
91 int flags, const char *name, 91 int flags, const char *name,
92 void *data); 92 void *data);
93extern struct vfsmount *vfs_submount(const struct dentry *mountpoint,
94 struct file_system_type *type,
95 const char *name, void *data);
93 96
94extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); 97extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list);
95extern void mark_mounts_for_expiry(struct list_head *mounts); 98extern void mark_mounts_for_expiry(struct list_head *mounts);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c8e519d0b4a3..451e241f32c5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -846,10 +846,6 @@ struct user_struct {
846 atomic_t __count; /* reference count */ 846 atomic_t __count; /* reference count */
847 atomic_t processes; /* How many processes does this user have? */ 847 atomic_t processes; /* How many processes does this user have? */
848 atomic_t sigpending; /* How many pending signals does this user have? */ 848 atomic_t sigpending; /* How many pending signals does this user have? */
849#ifdef CONFIG_INOTIFY_USER
850 atomic_t inotify_watches; /* How many inotify watches does this user have? */
851 atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
852#endif
853#ifdef CONFIG_FANOTIFY 849#ifdef CONFIG_FANOTIFY
854 atomic_t fanotify_listeners; 850 atomic_t fanotify_listeners;
855#endif 851#endif
@@ -3051,6 +3047,9 @@ extern bool current_is_single_threaded(void);
3051#define for_each_process_thread(p, t) \ 3047#define for_each_process_thread(p, t) \
3052 for_each_process(p) for_each_thread(p, t) 3048 for_each_process(p) for_each_thread(p, t)
3053 3049
3050typedef int (*proc_visitor)(struct task_struct *p, void *data);
3051void walk_process_tree(struct task_struct *top, proc_visitor, void *);
3052
3054static inline int get_nr_threads(struct task_struct *tsk) 3053static inline int get_nr_threads(struct task_struct *tsk)
3055{ 3054{
3056 return tsk->signal->nr_threads; 3055 return tsk->signal->nr_threads;
diff --git a/include/linux/security.h b/include/linux/security.h
index d3868f2ebada..96899fad7016 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -140,8 +140,7 @@ struct request_sock;
140/* bprm->unsafe reasons */ 140/* bprm->unsafe reasons */
141#define LSM_UNSAFE_SHARE 1 141#define LSM_UNSAFE_SHARE 1
142#define LSM_UNSAFE_PTRACE 2 142#define LSM_UNSAFE_PTRACE 2
143#define LSM_UNSAFE_PTRACE_CAP 4 143#define LSM_UNSAFE_NO_NEW_PRIVS 4
144#define LSM_UNSAFE_NO_NEW_PRIVS 8
145 144
146#ifdef CONFIG_MMU 145#ifdef CONFIG_MMU
147extern int mmap_min_addr_handler(struct ctl_table *table, int write, 146extern int mmap_min_addr_handler(struct ctl_table *table, int write,
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index adf4e51cf597..b7e82049fec7 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -143,6 +143,7 @@ struct ctl_table_header
143 struct ctl_table_set *set; 143 struct ctl_table_set *set;
144 struct ctl_dir *parent; 144 struct ctl_dir *parent;
145 struct ctl_node *node; 145 struct ctl_node *node;
146 struct list_head inodes; /* head for proc_inode->sysctl_inodes */
146}; 147};
147 148
148struct ctl_dir { 149struct ctl_dir {
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index eb209d4523f5..363e0e8082a9 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -32,6 +32,10 @@ enum ucount_type {
32 UCOUNT_NET_NAMESPACES, 32 UCOUNT_NET_NAMESPACES,
33 UCOUNT_MNT_NAMESPACES, 33 UCOUNT_MNT_NAMESPACES,
34 UCOUNT_CGROUP_NAMESPACES, 34 UCOUNT_CGROUP_NAMESPACES,
35#ifdef CONFIG_INOTIFY_USER
36 UCOUNT_INOTIFY_INSTANCES,
37 UCOUNT_INOTIFY_WATCHES,
38#endif
35 UCOUNT_COUNTS, 39 UCOUNT_COUNTS,
36}; 40};
37 41
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 36da93fbf188..048a85e9f017 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -132,6 +132,7 @@ struct inodes_stat_t {
132#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ 132#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
133 133
134/* These sb flags are internal to the kernel */ 134/* These sb flags are internal to the kernel */
135#define MS_SUBMOUNT (1<<26)
135#define MS_NOREMOTELOCK (1<<27) 136#define MS_NOREMOTELOCK (1<<27)
136#define MS_NOSEC (1<<28) 137#define MS_NOSEC (1<<28)
137#define MS_BORN (1<<29) 138#define MS_BORN (1<<29)
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index 3af617230d1b..1a3ca79f466b 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -6,8 +6,13 @@
6#define NSIO 0xb7 6#define NSIO 0xb7
7 7
8/* Returns a file descriptor that refers to an owning user namespace */ 8/* Returns a file descriptor that refers to an owning user namespace */
9#define NS_GET_USERNS _IO(NSIO, 0x1) 9#define NS_GET_USERNS _IO(NSIO, 0x1)
10/* Returns a file descriptor that refers to a parent namespace */ 10/* Returns a file descriptor that refers to a parent namespace */
11#define NS_GET_PARENT _IO(NSIO, 0x2) 11#define NS_GET_PARENT _IO(NSIO, 0x2)
12/* Returns the type of namespace (CLONE_NEW* value) referred to by
13 file descriptor */
14#define NS_GET_NSTYPE _IO(NSIO, 0x3)
15/* Get owner UID (in the caller's user namespace) for a user namespace */
16#define NS_GET_OWNER_UID _IO(NSIO, 0x4)
12 17
13#endif /* __LINUX_NSFS_H */ 18#endif /* __LINUX_NSFS_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index 580da79e38ee..9960accbf2ab 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -607,15 +607,18 @@ static struct task_struct *find_new_reaper(struct task_struct *father,
607 return thread; 607 return thread;
608 608
609 if (father->signal->has_child_subreaper) { 609 if (father->signal->has_child_subreaper) {
610 unsigned int ns_level = task_pid(father)->level;
610 /* 611 /*
611 * Find the first ->is_child_subreaper ancestor in our pid_ns. 612 * Find the first ->is_child_subreaper ancestor in our pid_ns.
612 * We start from father to ensure we can not look into another 613 * We can't check reaper != child_reaper to ensure we do not
613 * namespace, this is safe because all its threads are dead. 614 * cross the namespaces, the exiting parent could be injected
615 * by setns() + fork().
616 * We check pid->level, this is slightly more efficient than
617 * task_active_pid_ns(reaper) != task_active_pid_ns(father).
614 */ 618 */
615 for (reaper = father; 619 for (reaper = father->real_parent;
616 !same_thread_group(reaper, child_reaper); 620 task_pid(reaper)->level == ns_level;
617 reaper = reaper->real_parent) { 621 reaper = reaper->real_parent) {
618 /* call_usermodehelper() descendants need this check */
619 if (reaper == &init_task) 622 if (reaper == &init_task)
620 break; 623 break;
621 if (!reaper->signal->is_child_subreaper) 624 if (!reaper->signal->is_child_subreaper)
diff --git a/kernel/fork.c b/kernel/fork.c
index d12fcc4db8a3..348fe73155bc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1377,9 +1377,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1377 sig->oom_score_adj = current->signal->oom_score_adj; 1377 sig->oom_score_adj = current->signal->oom_score_adj;
1378 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 1378 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1379 1379
1380 sig->has_child_subreaper = current->signal->has_child_subreaper ||
1381 current->signal->is_child_subreaper;
1382
1383 mutex_init(&sig->cred_guard_mutex); 1380 mutex_init(&sig->cred_guard_mutex);
1384 1381
1385 return 0; 1382 return 0;
@@ -1814,6 +1811,13 @@ static __latent_entropy struct task_struct *copy_process(
1814 1811
1815 p->signal->leader_pid = pid; 1812 p->signal->leader_pid = pid;
1816 p->signal->tty = tty_kref_get(current->signal->tty); 1813 p->signal->tty = tty_kref_get(current->signal->tty);
1814 /*
1815 * Inherit has_child_subreaper flag under the same
1816 * tasklist_lock with adding child to the process tree
1817 * for propagate_has_child_subreaper optimization.
1818 */
1819 p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
1820 p->real_parent->signal->is_child_subreaper;
1817 list_add_tail(&p->sibling, &p->real_parent->children); 1821 list_add_tail(&p->sibling, &p->real_parent->children);
1818 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1822 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1819 attach_pid(p, PIDTYPE_PGID); 1823 attach_pid(p, PIDTYPE_PGID);
@@ -2067,6 +2071,38 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2067} 2071}
2068#endif 2072#endif
2069 2073
2074void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
2075{
2076 struct task_struct *leader, *parent, *child;
2077 int res;
2078
2079 read_lock(&tasklist_lock);
2080 leader = top = top->group_leader;
2081down:
2082 for_each_thread(leader, parent) {
2083 list_for_each_entry(child, &parent->children, sibling) {
2084 res = visitor(child, data);
2085 if (res) {
2086 if (res < 0)
2087 goto out;
2088 leader = child;
2089 goto down;
2090 }
2091up:
2092 ;
2093 }
2094 }
2095
2096 if (leader != top) {
2097 child = leader;
2098 parent = child->real_parent;
2099 leader = parent->group_leader;
2100 goto up;
2101 }
2102out:
2103 read_unlock(&tasklist_lock);
2104}
2105
2070#ifndef ARCH_MIN_MMSTRUCT_ALIGN 2106#ifndef ARCH_MIN_MMSTRUCT_ALIGN
2071#define ARCH_MIN_MMSTRUCT_ALIGN 0 2107#define ARCH_MIN_MMSTRUCT_ALIGN 0
2072#endif 2108#endif
diff --git a/kernel/sys.c b/kernel/sys.c
index 7d4a9a6df956..b07adca97ea3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2063,6 +2063,24 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
2063} 2063}
2064#endif 2064#endif
2065 2065
2066static int propagate_has_child_subreaper(struct task_struct *p, void *data)
2067{
2068 /*
 2069 * If task has has_child_subreaper - all its descendants
 2070 * already have this flag too and new descendants will
2071 * inherit it on fork, skip them.
2072 *
2073 * If we've found child_reaper - skip descendants in
 2074 * its subtree as they will never get out of the pidns.
2075 */
2076 if (p->signal->has_child_subreaper ||
2077 is_child_reaper(task_pid(p)))
2078 return 0;
2079
2080 p->signal->has_child_subreaper = 1;
2081 return 1;
2082}
2083
2066SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, 2084SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2067 unsigned long, arg4, unsigned long, arg5) 2085 unsigned long, arg4, unsigned long, arg5)
2068{ 2086{
@@ -2214,6 +2232,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2214 break; 2232 break;
2215 case PR_SET_CHILD_SUBREAPER: 2233 case PR_SET_CHILD_SUBREAPER:
2216 me->signal->is_child_subreaper = !!arg2; 2234 me->signal->is_child_subreaper = !!arg2;
2235 if (!arg2)
2236 break;
2237
2238 walk_process_tree(me, propagate_has_child_subreaper, NULL);
2217 break; 2239 break;
2218 case PR_GET_CHILD_SUBREAPER: 2240 case PR_GET_CHILD_SUBREAPER:
2219 error = put_user(me->signal->is_child_subreaper, 2241 error = put_user(me->signal->is_child_subreaper,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d7449783987a..310f0ea0d1a2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7503,7 +7503,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
7503 ftrace_init_tracefs(tr, d_tracer); 7503 ftrace_init_tracefs(tr, d_tracer);
7504} 7504}
7505 7505
7506static struct vfsmount *trace_automount(void *ingore) 7506static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
7507{ 7507{
7508 struct vfsmount *mnt; 7508 struct vfsmount *mnt;
7509 struct file_system_type *type; 7509 struct file_system_type *type;
@@ -7516,7 +7516,7 @@ static struct vfsmount *trace_automount(void *ingore)
7516 type = get_fs_type("tracefs"); 7516 type = get_fs_type("tracefs");
7517 if (!type) 7517 if (!type)
7518 return NULL; 7518 return NULL;
7519 mnt = vfs_kern_mount(type, 0, "tracefs", NULL); 7519 mnt = vfs_submount(mntpt, type, "tracefs", NULL);
7520 put_filesystem(type); 7520 put_filesystem(type);
7521 if (IS_ERR(mnt)) 7521 if (IS_ERR(mnt))
7522 return NULL; 7522 return NULL;
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 95c6336fc2b3..8a11fc0cb459 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -57,7 +57,7 @@ static struct ctl_table_root set_root = {
57 57
58static int zero = 0; 58static int zero = 0;
59static int int_max = INT_MAX; 59static int int_max = INT_MAX;
60#define UCOUNT_ENTRY(name) \ 60#define UCOUNT_ENTRY(name) \
61 { \ 61 { \
62 .procname = name, \ 62 .procname = name, \
63 .maxlen = sizeof(int), \ 63 .maxlen = sizeof(int), \
@@ -74,6 +74,10 @@ static struct ctl_table user_table[] = {
74 UCOUNT_ENTRY("max_net_namespaces"), 74 UCOUNT_ENTRY("max_net_namespaces"),
75 UCOUNT_ENTRY("max_mnt_namespaces"), 75 UCOUNT_ENTRY("max_mnt_namespaces"),
76 UCOUNT_ENTRY("max_cgroup_namespaces"), 76 UCOUNT_ENTRY("max_cgroup_namespaces"),
77#ifdef CONFIG_INOTIFY_USER
78 UCOUNT_ENTRY("max_inotify_instances"),
79 UCOUNT_ENTRY("max_inotify_watches"),
80#endif
77 { } 81 { }
78}; 82};
79#endif /* CONFIG_SYSCTL */ 83#endif /* CONFIG_SYSCTL */
diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index ef4beef06e9d..001e133a3c8c 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -471,7 +471,7 @@ int apparmor_bprm_set_creds(struct linux_binprm *bprm)
471 ; 471 ;
472 } 472 }
473 473
474 if (bprm->unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) { 474 if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
475 error = may_change_ptraced_domain(new_profile); 475 error = may_change_ptraced_domain(new_profile);
476 if (error) 476 if (error)
477 goto audit; 477 goto audit;
diff --git a/security/commoncap.c b/security/commoncap.c
index 6d4d586b9356..78b37838a2d3 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -548,9 +548,10 @@ skip:
548 548
549 if ((is_setid || 549 if ((is_setid ||
550 !cap_issubset(new->cap_permitted, old->cap_permitted)) && 550 !cap_issubset(new->cap_permitted, old->cap_permitted)) &&
551 bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) { 551 ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) ||
552 !ptracer_capable(current, new->user_ns))) {
552 /* downgrade; they get no more than they had, and maybe less */ 553 /* downgrade; they get no more than they had, and maybe less */
553 if (!capable(CAP_SETUID) || 554 if (!ns_capable(new->user_ns, CAP_SETUID) ||
554 (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) { 555 (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
555 new->euid = new->uid; 556 new->euid = new->uid;
556 new->egid = new->gid; 557 new->egid = new->gid;
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index e6b1b7410321..9a8f12f8d5b7 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2399,8 +2399,7 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm)
2399 2399
2400 /* Make sure that anyone attempting to ptrace over a task that 2400 /* Make sure that anyone attempting to ptrace over a task that
2401 * changes its SID has the appropriate permit */ 2401 * changes its SID has the appropriate permit */
2402 if (bprm->unsafe & 2402 if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
2403 (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) {
2404 u32 ptsid = ptrace_parent_sid(); 2403 u32 ptsid = ptrace_parent_sid();
2405 if (ptsid != 0) { 2404 if (ptsid != 0) {
2406 rc = avc_has_perm(ptsid, new_tsec->sid, 2405 rc = avc_has_perm(ptsid, new_tsec->sid,
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 60b4217b9b68..fc8fb31fc24f 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -931,7 +931,7 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm)
931 isp->smk_task != sbsp->smk_root) 931 isp->smk_task != sbsp->smk_root)
932 return 0; 932 return 0;
933 933
934 if (bprm->unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) { 934 if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
935 struct task_struct *tracer; 935 struct task_struct *tracer;
936 rc = 0; 936 rc = 0;
937 937