summaryrefslogtreecommitdiffstats
path: root/fs/namespace.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-02-23 23:33:51 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2017-02-23 23:33:51 -0500
commitf1ef09fde17f9b77ca1435a5b53a28b203afb81c (patch)
tree0efcd2c5b5da451a7ca780c8aa5e26d7ec712b85 /fs/namespace.c
parentef96152e6a36e0510387cb174178b7982c1ae879 (diff)
parentace0c791e6c3cf5ef37cad2df69f0d90ccc40ffb (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
Pull namespace updates from Eric Biederman: "There is a lot here. A lot of these changes result in subtle user-visible differences in kernel behavior. I don't expect anything will care but I will revert/fix things immediately if any regressions show up. From Seth Forshee there is a continuation of the work to make the vfs ready for unprivileged mounts. We had thought the previous changes prevented the creation of files outside of s_user_ns of a filesystem, but it turns out we missed the O_CREAT path. Oops. Pavel Tikhomirov and Oleg Nesterov worked together to fix a long-standing bug in the implementation of PR_SET_CHILD_SUBREAPER where only children that are forked after the prctl are considered and not children forked before the prctl. The only known user of this prctl, systemd, forks all children after the prctl. So no userspace regressions will occur. Holding earlier forked children to the same rules as later forked children creates a semantic that is sane enough to allow checkpointing of processes that use this feature. There is a long delayed change by Nikolay Borisov to limit inotify instances inside a user namespace. Michael Kerrisk extends the API for files used to manipulate namespaces with two new trivial ioctls to allow discovery of the hierarchy and properties of namespaces. Konstantin Khlebnikov with the help of Al Viro adds code that when a network namespace exits purges its sysctl entries from the dcache. As in some circumstances this could use a lot of memory. Vivek Goyal fixed a bug with stacked filesystems where the permissions on the wrong inode were being checked. I continue previous work on ptracing across exec. Allowing a file to be setuid across exec while being ptraced if the tracer has enough credentials in the user namespace, and if the process has CAP_SETUID in its own namespace. Proc files for setuid or otherwise undumpable executables are now owned by the root in the user namespace of their mm.
This allows debugging of setuid applications in containers to work better. A bug I introduced with permission checking and automount is now fixed. The big change is to mark the mounts that the kernel initiates as a result of an automount. This allows the permission checks in sget to be safely suppressed for this kind of mount, as the permission check happened when the original filesystem was mounted. Finally, a special case in the mount namespace is removed, preventing unbounded chains in the mount hash table, and making the semantics simpler, which benefits CRIU. The vfs fix, along with related work in ima and evm, I believe makes us ready to finish developing and merge fully unprivileged mounts of the fuse filesystem. The cleanups of the mount namespace make it easier to discuss how to fix the worst-case complexity of umount. The stacked filesystem fixes pave the way for adding multiple mappings for the filesystem uids so that efficient and safer containers can be implemented" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: proc/sysctl: Don't grab i_lock under sysctl_lock. vfs: Use upper filesystem inode in bprm_fill_uid() proc/sysctl: prune stale dentries during unregistering mnt: Tuck mounts under others instead of creating shadow/side mounts. prctl: propagate has_child_subreaper flag to every descendant introduce the walk_process_tree() helper nsfs: Add an ioctl() to return owner UID of a userns fs: Better permission checking for submounts exit: fix the setns() && PR_SET_CHILD_SUBREAPER interaction vfs: open() with O_CREAT should not create inodes with unknown ids nsfs: Add an ioctl() to return the namespace type proc: Better ownership of files for non-dumpable tasks in user namespaces exec: Remove LSM_UNSAFE_PTRACE_CAP exec: Test the ptracer's saved cred to see if the tracee can gain caps exec: Don't reset euid and egid when the tracee has CAP_SETUID inotify: Convert to using per-namespace limits
Diffstat (limited to 'fs/namespace.c')
-rw-r--r--fs/namespace.c127
1 files changed, 76 insertions, 51 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index 487ba30bb5c6..8bfad42c1ccf 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -637,28 +637,6 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
637} 637}
638 638
639/* 639/*
640 * find the last mount at @dentry on vfsmount @mnt.
641 * mount_lock must be held.
642 */
643struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
644{
645 struct mount *p, *res = NULL;
646 p = __lookup_mnt(mnt, dentry);
647 if (!p)
648 goto out;
649 if (!(p->mnt.mnt_flags & MNT_UMOUNT))
650 res = p;
651 hlist_for_each_entry_continue(p, mnt_hash) {
652 if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
653 break;
654 if (!(p->mnt.mnt_flags & MNT_UMOUNT))
655 res = p;
656 }
657out:
658 return res;
659}
660
661/*
662 * lookup_mnt - Return the first child mount mounted at path 640 * lookup_mnt - Return the first child mount mounted at path
663 * 641 *
664 * "First" means first mounted chronologically. If you create the 642 * "First" means first mounted chronologically. If you create the
@@ -878,6 +856,13 @@ void mnt_set_mountpoint(struct mount *mnt,
878 hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); 856 hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
879} 857}
880 858
859static void __attach_mnt(struct mount *mnt, struct mount *parent)
860{
861 hlist_add_head_rcu(&mnt->mnt_hash,
862 m_hash(&parent->mnt, mnt->mnt_mountpoint));
863 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
864}
865
881/* 866/*
882 * vfsmount lock must be held for write 867 * vfsmount lock must be held for write
883 */ 868 */
@@ -886,28 +871,45 @@ static void attach_mnt(struct mount *mnt,
886 struct mountpoint *mp) 871 struct mountpoint *mp)
887{ 872{
888 mnt_set_mountpoint(parent, mp, mnt); 873 mnt_set_mountpoint(parent, mp, mnt);
889 hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); 874 __attach_mnt(mnt, parent);
890 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
891} 875}
892 876
893static void attach_shadowed(struct mount *mnt, 877void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
894 struct mount *parent,
895 struct mount *shadows)
896{ 878{
897 if (shadows) { 879 struct mountpoint *old_mp = mnt->mnt_mp;
898 hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); 880 struct dentry *old_mountpoint = mnt->mnt_mountpoint;
899 list_add(&mnt->mnt_child, &shadows->mnt_child); 881 struct mount *old_parent = mnt->mnt_parent;
900 } else { 882
901 hlist_add_head_rcu(&mnt->mnt_hash, 883 list_del_init(&mnt->mnt_child);
902 m_hash(&parent->mnt, mnt->mnt_mountpoint)); 884 hlist_del_init(&mnt->mnt_mp_list);
903 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); 885 hlist_del_init_rcu(&mnt->mnt_hash);
904 } 886
887 attach_mnt(mnt, parent, mp);
888
889 put_mountpoint(old_mp);
890
891 /*
892 * Safely avoid even the suggestion this code might sleep or
893 * lock the mount hash by taking advantage of the knowledge that
894 * mnt_change_mountpoint will not release the final reference
895 * to a mountpoint.
896 *
897 * During mounting, the mount passed in as the parent mount will
898 * continue to use the old mountpoint and during unmounting, the
899 * old mountpoint will continue to exist until namespace_unlock,
900 * which happens well after mnt_change_mountpoint.
901 */
902 spin_lock(&old_mountpoint->d_lock);
903 old_mountpoint->d_lockref.count--;
904 spin_unlock(&old_mountpoint->d_lock);
905
906 mnt_add_count(old_parent, -1);
905} 907}
906 908
907/* 909/*
908 * vfsmount lock must be held for write 910 * vfsmount lock must be held for write
909 */ 911 */
910static void commit_tree(struct mount *mnt, struct mount *shadows) 912static void commit_tree(struct mount *mnt)
911{ 913{
912 struct mount *parent = mnt->mnt_parent; 914 struct mount *parent = mnt->mnt_parent;
913 struct mount *m; 915 struct mount *m;
@@ -925,7 +927,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
925 n->mounts += n->pending_mounts; 927 n->mounts += n->pending_mounts;
926 n->pending_mounts = 0; 928 n->pending_mounts = 0;
927 929
928 attach_shadowed(mnt, parent, shadows); 930 __attach_mnt(mnt, parent);
929 touch_mnt_namespace(n); 931 touch_mnt_namespace(n);
930} 932}
931 933
@@ -989,6 +991,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
989} 991}
990EXPORT_SYMBOL_GPL(vfs_kern_mount); 992EXPORT_SYMBOL_GPL(vfs_kern_mount);
991 993
994struct vfsmount *
995vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
996 const char *name, void *data)
997{
998 /* Until it is worked out how to pass the user namespace
999 * through from the parent mount to the submount don't support
1000 * unprivileged mounts with submounts.
1001 */
1002 if (mountpoint->d_sb->s_user_ns != &init_user_ns)
1003 return ERR_PTR(-EPERM);
1004
1005 return vfs_kern_mount(type, MS_SUBMOUNT, name, data);
1006}
1007EXPORT_SYMBOL_GPL(vfs_submount);
1008
992static struct mount *clone_mnt(struct mount *old, struct dentry *root, 1009static struct mount *clone_mnt(struct mount *old, struct dentry *root,
993 int flag) 1010 int flag)
994{ 1011{
@@ -1764,7 +1781,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1764 continue; 1781 continue;
1765 1782
1766 for (s = r; s; s = next_mnt(s, r)) { 1783 for (s = r; s; s = next_mnt(s, r)) {
1767 struct mount *t = NULL;
1768 if (!(flag & CL_COPY_UNBINDABLE) && 1784 if (!(flag & CL_COPY_UNBINDABLE) &&
1769 IS_MNT_UNBINDABLE(s)) { 1785 IS_MNT_UNBINDABLE(s)) {
1770 s = skip_mnt_tree(s); 1786 s = skip_mnt_tree(s);
@@ -1786,14 +1802,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1786 goto out; 1802 goto out;
1787 lock_mount_hash(); 1803 lock_mount_hash();
1788 list_add_tail(&q->mnt_list, &res->mnt_list); 1804 list_add_tail(&q->mnt_list, &res->mnt_list);
1789 mnt_set_mountpoint(parent, p->mnt_mp, q); 1805 attach_mnt(q, parent, p->mnt_mp);
1790 if (!list_empty(&parent->mnt_mounts)) {
1791 t = list_last_entry(&parent->mnt_mounts,
1792 struct mount, mnt_child);
1793 if (t->mnt_mp != p->mnt_mp)
1794 t = NULL;
1795 }
1796 attach_shadowed(q, parent, t);
1797 unlock_mount_hash(); 1806 unlock_mount_hash();
1798 } 1807 }
1799 } 1808 }
@@ -1992,10 +2001,18 @@ static int attach_recursive_mnt(struct mount *source_mnt,
1992{ 2001{
1993 HLIST_HEAD(tree_list); 2002 HLIST_HEAD(tree_list);
1994 struct mnt_namespace *ns = dest_mnt->mnt_ns; 2003 struct mnt_namespace *ns = dest_mnt->mnt_ns;
2004 struct mountpoint *smp;
1995 struct mount *child, *p; 2005 struct mount *child, *p;
1996 struct hlist_node *n; 2006 struct hlist_node *n;
1997 int err; 2007 int err;
1998 2008
2009 /* Preallocate a mountpoint in case the new mounts need
2010 * to be tucked under other mounts.
2011 */
2012 smp = get_mountpoint(source_mnt->mnt.mnt_root);
2013 if (IS_ERR(smp))
2014 return PTR_ERR(smp);
2015
1999 /* Is there space to add these mounts to the mount namespace? */ 2016 /* Is there space to add these mounts to the mount namespace? */
2000 if (!parent_path) { 2017 if (!parent_path) {
2001 err = count_mounts(ns, source_mnt); 2018 err = count_mounts(ns, source_mnt);
@@ -2022,16 +2039,19 @@ static int attach_recursive_mnt(struct mount *source_mnt,
2022 touch_mnt_namespace(source_mnt->mnt_ns); 2039 touch_mnt_namespace(source_mnt->mnt_ns);
2023 } else { 2040 } else {
2024 mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); 2041 mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
2025 commit_tree(source_mnt, NULL); 2042 commit_tree(source_mnt);
2026 } 2043 }
2027 2044
2028 hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { 2045 hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
2029 struct mount *q; 2046 struct mount *q;
2030 hlist_del_init(&child->mnt_hash); 2047 hlist_del_init(&child->mnt_hash);
2031 q = __lookup_mnt_last(&child->mnt_parent->mnt, 2048 q = __lookup_mnt(&child->mnt_parent->mnt,
2032 child->mnt_mountpoint); 2049 child->mnt_mountpoint);
2033 commit_tree(child, q); 2050 if (q)
2051 mnt_change_mountpoint(child, smp, q);
2052 commit_tree(child);
2034 } 2053 }
2054 put_mountpoint(smp);
2035 unlock_mount_hash(); 2055 unlock_mount_hash();
2036 2056
2037 return 0; 2057 return 0;
@@ -2046,6 +2066,11 @@ static int attach_recursive_mnt(struct mount *source_mnt,
2046 cleanup_group_ids(source_mnt, NULL); 2066 cleanup_group_ids(source_mnt, NULL);
2047 out: 2067 out:
2048 ns->pending_mounts = 0; 2068 ns->pending_mounts = 0;
2069
2070 read_seqlock_excl(&mount_lock);
2071 put_mountpoint(smp);
2072 read_sequnlock_excl(&mount_lock);
2073
2049 return err; 2074 return err;
2050} 2075}
2051 2076
@@ -2794,7 +2819,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
2794 2819
2795 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | 2820 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
2796 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | 2821 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
2797 MS_STRICTATIME | MS_NOREMOTELOCK); 2822 MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT);
2798 2823
2799 if (flags & MS_REMOUNT) 2824 if (flags & MS_REMOUNT)
2800 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, 2825 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,