author     Al Viro <viro@zeniv.linux.org.uk>  2018-08-09 17:51:32 -0400
committer  Al Viro <viro@zeniv.linux.org.uk>  2018-08-09 17:51:32 -0400
commit     119e1ef80ecfe0d1deb6378d4ab41f5b71519de1
tree       ab2f094205bde3da367a4a875d5102157a7f370b
parent     9ea0a46ca2c318fcc449c1e6b62a7230a17888f1
fix __legitimize_mnt()/mntput() race
__legitimize_mnt() has two problems. One is that, in case of success, the check of mount_lock is not ordered wrt the preceding increment of the refcount, making it possible to have a successful __legitimize_mnt() on one CPU just before the otherwise final mntput() on another, with __legitimize_mnt() not seeing mntput() taking the lock and mntput() not seeing the increment done by __legitimize_mnt(). Solved by a pair of barriers.

The other is that failure of __legitimize_mnt() on the second read_seqretry() leaves us with a reference that will need to be dropped by the caller; however, if that races with the final mntput() we can end up with the caller dropping rcu_read_lock() and doing mntput() to release that reference - with the first mntput() having freed the damn thing just as rcu_read_lock() had been dropped. Solution: in the "do mntput() yourself" failure case, grab mount_lock and check whether MNT_DOOMED has been set by a racing final mntput() that missed our increment; if it has, undo the increment and treat it as the "failure, caller doesn't need to drop anything" case.

It's not easy to hit - the final mntput() has to come right after the first read_seqretry() in __legitimize_mnt() *and* manage to miss the increment done by __legitimize_mnt() before the second read_seqretry() in there. The things that are almost impossible to hit on bare hardware are not impossible on SMP KVM, though...

Reported-by: Oleg Nesterov <oleg@redhat.com>
Fixes: 48a066e72d97 ("RCU'd vfsmounts")
Cc: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
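For the first problem, the pair of barriers works like the classic store-buffering pattern: __legitimize_mnt() increments the refcount, issues a full barrier, then re-checks mount_lock; mntput_no_expire() takes mount_lock, issues a full barrier, then re-reads the refcount. At least one side is then guaranteed to observe the other's store. Below is a minimal userland sketch of that pairing, using C11 atomics and made-up names (grab_ref(), drop_ref(), refcount, seq) rather than the real kernel primitives; it models only the barrier pairing, not the MNT_DOOMED handling.

/*
 * Simplified userland model of the barrier pairing described above --
 * NOT the kernel code.  Thread A plays __legitimize_mnt(), thread B
 * plays the final mntput_no_expire().
 */
#include <stdatomic.h>
#include <stdbool.h>

/* Stand-ins for the mount refcount and the mount_lock seqcount. */
static atomic_int  refcount = 1;        /* one reference held by the "owner" */
static atomic_uint seq;

/* Thread A: try to turn an RCU-protected pointer into a counted
 * reference.  old_seq was sampled earlier (like read_seqbegin()). */
static bool grab_ref(unsigned old_seq)
{
        atomic_fetch_add_explicit(&refcount, 1, memory_order_relaxed);
        /* pairs with the fence in drop_ref(); without it, the load of
         * seq below could be satisfied before our increment is visible */
        atomic_thread_fence(memory_order_seq_cst);
        if (atomic_load_explicit(&seq, memory_order_relaxed) == old_seq)
                return true;            /* seq unchanged: the reference is ours */
        atomic_fetch_sub_explicit(&refcount, 1, memory_order_relaxed);
        return false;                   /* lost the race, back off */
}

/* Thread B: drop what may be the last reference and decide whether
 * the object can be freed. */
static bool drop_ref(void)
{
        atomic_fetch_add_explicit(&seq, 1, memory_order_relaxed);  /* "take mount_lock" */
        /* pairs with the fence in grab_ref(): if A read the old seq
         * above, we are guaranteed to see A's increment here */
        atomic_thread_fence(memory_order_seq_cst);
        if (atomic_fetch_sub_explicit(&refcount, 1, memory_order_relaxed) > 1)
                return false;           /* someone else still holds a reference */
        return true;                    /* really the last reference: safe to free */
}

With either fence removed, the C11 memory model allows both loads to miss the other thread's store: grab_ref() succeeds while drop_ref() still sees a count of 1 and frees the object, which is the use-after-free window this commit closes.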
-rw-r--r--  fs/namespace.c  14
1 file changed, 14 insertions, 0 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index d46a951bd541..bd2f4c68506a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -659,12 +659,21 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
 		return 0;
 	mnt = real_mount(bastard);
 	mnt_add_count(mnt, 1);
+	smp_mb();			// see mntput_no_expire()
 	if (likely(!read_seqretry(&mount_lock, seq)))
 		return 0;
 	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
 		mnt_add_count(mnt, -1);
 		return 1;
 	}
+	lock_mount_hash();
+	if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
+		mnt_add_count(mnt, -1);
+		unlock_mount_hash();
+		return 1;
+	}
+	unlock_mount_hash();
+	/* caller will mntput() */
 	return -1;
 }
 
@@ -1210,6 +1219,11 @@ static void mntput_no_expire(struct mount *mnt)
 		return;
 	}
 	lock_mount_hash();
+	/*
+	 * make sure that if __legitimize_mnt() has not seen us grab
+	 * mount_lock, we'll see their refcount increment here.
+	 */
+	smp_mb();
 	mnt_add_count(mnt, -1);
 	if (mnt_get_count(mnt)) {
 		rcu_read_unlock();