Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull namespace updates from Eric Biederman: "There is a lot here. A lot of these changes result in subtle user visible differences in kernel behavior. I don't expect anything will care but I will revert/fix things immediately if any regressions show up. From Seth Forshee there is a continuation of the work to make the vfs ready for unprivileged mounts. We had thought the previous changes prevented the creation of files outside of s_user_ns of a filesystem, but it turns out we missed the O_CREAT path. Oops. Pavel Tikhomirov and Oleg Nesterov worked together to fix a long standing bug in the implementation of PR_SET_CHILD_SUBREAPER where only children that are forked after the prctl are considered and not children forked before the prctl. The only known user of this prctl, systemd, forks all children after the prctl, so no userspace regressions will occur. Holding earlier forked children to the same rules as later forked children creates a semantic that is sane enough to allow checkpointing of processes that use this feature. There is a long delayed change by Nikolay Borisov to limit inotify instances inside a user namespace. Michael Kerrisk extends the API for files used to manipulate namespaces with two new trivial ioctls to allow discovery of the hierarchy and properties of namespaces. Konstantin Khlebnikov with the help of Al Viro adds code that, when a network namespace exits, purges its sysctl entries from the dcache, as in some circumstances this could use a lot of memory. Vivek Goyal fixed a bug with stacked filesystems where the permissions on the wrong inode were being checked. I continue previous work on ptracing across exec, allowing a file to be setuid across exec while being ptraced if the tracer has enough credentials in the user namespace, and if the process has CAP_SETUID in its own namespace. Proc files for setuid or otherwise undumpable executables are now owned by the root in the user namespace of their mm. 
This allows debugging of setuid applications in containers to work better. A bug I introduced with permission checking and automount is now fixed. The big change is to mark the mounts that the kernel initiates as a result of an automount. This allows the permission checks in sget to be safely suppressed for this kind of mount, as the permission check happened when the original filesystem was mounted. Finally a special case in the mount namespace is removed, preventing unbounded chains in the mount hash table and making the semantics simpler, which benefits CRIU. The vfs fix along with related work in ima and evm I believe makes us ready to finish developing and merge fully unprivileged mounts of the fuse filesystem. The cleanups of the mount namespace make it easier to discuss how to fix the worst case complexity of umount. The stacked filesystem fixes pave the way for adding multiple mappings for the filesystem uids so that efficient and safer containers can be implemented" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: proc/sysctl: Don't grab i_lock under sysctl_lock. vfs: Use upper filesystem inode in bprm_fill_uid() proc/sysctl: prune stale dentries during unregistering mnt: Tuck mounts under others instead of creating shadow/side mounts. prctl: propagate has_child_subreaper flag to every descendant introduce the walk_process_tree() helper nsfs: Add an ioctl() to return owner UID of a userns fs: Better permission checking for submounts exit: fix the setns() && PR_SET_CHILD_SUBREAPER interaction vfs: open() with O_CREAT should not create inodes with unknown ids nsfs: Add an ioctl() to return the namespace type proc: Better ownership of files for non-dumpable tasks in user namespaces exec: Remove LSM_UNSAFE_PTRACE_CAP exec: Test the ptracer's saved cred to see if the tracee can gain caps exec: Don't reset euid and egid when the tracee has CAP_SETUID inotify: Convert to using per-namespace limits
author: Linus Torvalds <torvalds@linux-foundation.org> 2017-02-23 23:33:51 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2017-02-23 23:33:51 -0500
commit: f1ef09fde17f9b77ca1435a5b53a28b203afb81c (patch)
tree: 0efcd2c5b5da451a7ca780c8aa5e26d7ec712b85 /fs/pnode.c
parent: ef96152e6a36e0510387cb174178b7982c1ae879 (diff)
parent: ace0c791e6c3cf5ef37cad2df69f0d90ccc40ffb (diff)
1 files changed, 49 insertions, 12 deletions
diff --git a/fs/pnode.c b/fs/pnode.c
index 06a793f4ae38..5bc7896d122a 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -322,6 +322,21 @@ out:
        return ret;
 }
+static struct mount *find_topper(struct mount *mnt)
+{
+        /* If there is exactly one mount covering mnt completely return it. */
+        struct mount *child;
+        if (!list_is_singular(&mnt->mnt_mounts))
+                return NULL;
+        child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child);
+        if (child->mnt_mountpoint != mnt->mnt.mnt_root)
+                return NULL;
+        return child;
+}
 /*
 * return true if the refcount is greater than count
 */
@@ -342,9 +357,8 @@ static inline int do_refcount_check(struct mount *mnt, int count)
 */
 int propagate_mount_busy(struct mount *mnt, int refcnt)
 {
-        struct mount *m, *child;
+        struct mount *m, *child, *topper;
        struct mount *parent = mnt->mnt_parent;
-        int ret = 0;
        if (mnt == parent)
                return do_refcount_check(mnt, refcnt);
@@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
        for (m = propagation_next(parent, parent); m;
                        m = propagation_next(m, parent)) {
-                child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
+                int count = 1;
-                if (child && list_empty(&child->mnt_mounts) &&
+                child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
-                    (ret = do_refcount_check(child, 1)))
+                if (!child)
-                        break;
+                        continue;
+                /* Is there exactly one mount on the child that covers
+                 * it completely whose reference should be ignored?
+                 */
+                topper = find_topper(child);
+                if (topper)
+                        count += 1;
+                else if (!list_empty(&child->mnt_mounts))
+                        continue;
+                if (do_refcount_check(child, count))
+                        return 1;
        }
-        return ret;
+        return 0;
 }
 /*
@@ -381,7 +407,7 @@ void propagate_mount_unlock(struct mount *mnt)
        for (m = propagation_next(parent, parent); m;
                        m = propagation_next(m, parent)) {
-                child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
+                child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
                if (child)
                        child->mnt.mnt_flags &= ~MNT_LOCKED;
        }
@@ -399,9 +425,11 @@ static void mark_umount_candidates(struct mount *mnt)
        for (m = propagation_next(parent, parent); m;
                        m = propagation_next(m, parent)) {
-                struct mount *child = __lookup_mnt_last(&m->mnt,
+                struct mount *child = __lookup_mnt(&m->mnt,
                                                mnt->mnt_mountpoint);
-                if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
+                if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
+                        continue;
+                if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
                        SET_MNT_MARK(child);
                }
        }
@@ -420,8 +448,8 @@ static void __propagate_umount(struct mount *mnt)
        for (m = propagation_next(parent, parent); m;
                        m = propagation_next(m, parent)) {
+                struct mount *topper;
-                struct mount *child = __lookup_mnt_last(&m->mnt,
+                struct mount *child = __lookup_mnt(&m->mnt,
                                                mnt->mnt_mountpoint);
                /*
                 * umount the child only if the child has no children
@@ -430,6 +458,15 @@ static void __propagate_umount(struct mount *mnt)
                if (!child || !IS_MNT_MARKED(child))
                        continue;
                CLEAR_MNT_MARK(child);
+                /* If there is exactly one mount covering all of child
+                 * replace child with that mount.
+                 */
+                topper = find_topper(child);
+                if (topper)
+                        mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
+                                              topper);
                if (list_empty(&child->mnt_mounts)) {
                        list_del_init(&child->mnt_child);
                        child->mnt.mnt_flags |= MNT_UMOUNT;
author	Linus Torvalds <torvalds@linux-foundation.org>	2017-02-23 23:33:51 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-02-23 23:33:51 -0500
commit	f1ef09fde17f9b77ca1435a5b53a28b203afb81c (patch)
tree	0efcd2c5b5da451a7ca780c8aa5e26d7ec712b85 /fs/pnode.c
parent	ef96152e6a36e0510387cb174178b7982c1ae879 (diff)
parent	ace0c791e6c3cf5ef37cad2df69f0d90ccc40ffb (diff)

diff --git a/fs/pnode.c b/fs/pnode.c index 06a793f4ae38..5bc7896d122a 100644 --- a/fs/pnode.c +++ b/fs/pnode.c
@@ -322,6 +322,21 @@ out:
322	return ret;	322	return ret;
323	}	323	}
324		324
		325	static struct mount *find_topper(struct mount *mnt)
		326	{
		327	/* If there is exactly one mount covering mnt completely return it. */
		328	struct mount *child;
		329
		330	if (!list_is_singular(&mnt->mnt_mounts))
		331	return NULL;
		332
		333	child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child);
		334	if (child->mnt_mountpoint != mnt->mnt.mnt_root)
		335	return NULL;
		336
		337	return child;
		338	}
		339
325	/*	340	/*
326	* return true if the refcount is greater than count	341	* return true if the refcount is greater than count
327	*/	342	*/
@@ -342,9 +357,8 @@ static inline int do_refcount_check(struct mount *mnt, int count)
342	*/	357	*/
343	int propagate_mount_busy(struct mount *mnt, int refcnt)	358	int propagate_mount_busy(struct mount *mnt, int refcnt)
344	{	359	{
345	struct mount *m, *child;	360	struct mount *m, *child, *topper;
346	struct mount *parent = mnt->mnt_parent;	361	struct mount *parent = mnt->mnt_parent;
347	int ret = 0;
348		362
349	if (mnt == parent)	363	if (mnt == parent)
350	return do_refcount_check(mnt, refcnt);	364	return do_refcount_check(mnt, refcnt);
@@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
359		373
360	for (m = propagation_next(parent, parent); m;	374	for (m = propagation_next(parent, parent); m;
361	m = propagation_next(m, parent)) {	375	m = propagation_next(m, parent)) {
362	child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);	376	int count = 1;
363	if (child && list_empty(&child->mnt_mounts) &&	377	child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
364	(ret = do_refcount_check(child, 1)))	378	if (!child)
365	break;	379	continue;
		380
		381	/* Is there exactly one mount on the child that covers
		382	* it completely whose reference should be ignored?
		383	*/
		384	topper = find_topper(child);
		385	if (topper)
		386	count += 1;
		387	else if (!list_empty(&child->mnt_mounts))
		388	continue;
		389
		390	if (do_refcount_check(child, count))
		391	return 1;
366	}	392	}
367	return ret;	393	return 0;
368	}	394	}
369		395
370	/*	396	/*
@@ -381,7 +407,7 @@ void propagate_mount_unlock(struct mount *mnt)
381		407
382	for (m = propagation_next(parent, parent); m;	408	for (m = propagation_next(parent, parent); m;
383	m = propagation_next(m, parent)) {	409	m = propagation_next(m, parent)) {
384	child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);	410	child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
385	if (child)	411	if (child)
386	child->mnt.mnt_flags &= ~MNT_LOCKED;	412	child->mnt.mnt_flags &= ~MNT_LOCKED;
387	}	413	}
@@ -399,9 +425,11 @@ static void mark_umount_candidates(struct mount *mnt)
399		425
400	for (m = propagation_next(parent, parent); m;	426	for (m = propagation_next(parent, parent); m;
401	m = propagation_next(m, parent)) {	427	m = propagation_next(m, parent)) {
402	struct mount *child = __lookup_mnt_last(&m->mnt,	428	struct mount *child = __lookup_mnt(&m->mnt,
403	mnt->mnt_mountpoint);	429	mnt->mnt_mountpoint);
404	if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {	430	if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
		431	continue;
		432	if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
405	SET_MNT_MARK(child);	433	SET_MNT_MARK(child);
406	}	434	}
407	}	435	}
@@ -420,8 +448,8 @@ static void __propagate_umount(struct mount *mnt)
420		448
421	for (m = propagation_next(parent, parent); m;	449	for (m = propagation_next(parent, parent); m;
422	m = propagation_next(m, parent)) {	450	m = propagation_next(m, parent)) {
423		451	struct mount *topper;
424	struct mount *child = __lookup_mnt_last(&m->mnt,	452	struct mount *child = __lookup_mnt(&m->mnt,
425	mnt->mnt_mountpoint);	453	mnt->mnt_mountpoint);
426	/*	454	/*
427	* umount the child only if the child has no children	455	* umount the child only if the child has no children
@@ -430,6 +458,15 @@ static void __propagate_umount(struct mount *mnt)
430	if (!child || !IS_MNT_MARKED(child))	458	if (!child || !IS_MNT_MARKED(child))
431	continue;	459	continue;
432	CLEAR_MNT_MARK(child);	460	CLEAR_MNT_MARK(child);
		461
		462	/* If there is exactly one mount covering all of child
		463	* replace child with that mount.
		464	*/
		465	topper = find_topper(child);
		466	if (topper)
		467	mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
		468	topper);
		469
433	if (list_empty(&child->mnt_mounts)) {	470	if (list_empty(&child->mnt_mounts)) {
434	list_del_init(&child->mnt_child);	471	list_del_init(&child->mnt_child);
435	child->mnt.mnt_flags |= MNT_UMOUNT;	472	child->mnt.mnt_flags |= MNT_UMOUNT;