summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2018-05-10 21:20:57 -0400
committerAl Viro <viro@zeniv.linux.org.uk>2018-05-11 15:37:57 -0400
commit79f546a696bff2590169fb5684e23d65f4d9f591 (patch)
tree308b4fe1af632ee7b22b62af94bba0b461cc315a
parent1e2e547a93a00ebc21582c06ca3c6cfea2a309ee (diff)
fs: don't scan the inode cache before SB_BORN is set
We recently had an oops reported on a 4.14 kernel in xfs_reclaim_inodes_count() where sb->s_fs_info pointed to garbage and so the m_perag_tree lookup walked into lala land. It produces an oops down this path during the failed mount: radix_tree_gang_lookup_tag+0xc4/0x130 xfs_perag_get_tag+0x37/0xf0 xfs_reclaim_inodes_count+0x32/0x40 xfs_fs_nr_cached_objects+0x11/0x20 super_cache_count+0x35/0xc0 shrink_slab.part.66+0xb1/0x370 shrink_node+0x7e/0x1a0 try_to_free_pages+0x199/0x470 __alloc_pages_slowpath+0x3a1/0xd20 __alloc_pages_nodemask+0x1c3/0x200 cache_grow_begin+0x20b/0x2e0 fallback_alloc+0x160/0x200 kmem_cache_alloc+0x111/0x4e0 The problem is that the superblock shrinker is running before the filesystem structures it depends on have been fully set up. i.e. the shrinker is registered in sget(), before ->fill_super() has been called, and the shrinker can call into the filesystem before fill_super() does its setup work. Essentially we are exposed to both use-after-free and use-before-initialisation bugs here. To fix this, add a check for the SB_BORN flag in super_cache_count. In general, this flag is not set until ->fs_mount() completes successfully, so we know that it is set after the filesystem setup has completed. This matches the trylock_super() behaviour which will not let super_cache_scan() run if SB_BORN is not set, and hence will not allow the superblock shrinker to enter the filesystem while it is being set up or after it has failed setup and is being torn down. Cc: stable@kernel.org Signed-Off-By: Dave Chinner <dchinner@redhat.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
-rw-r--r--fs/super.c30
1 files changed, 24 insertions, 6 deletions
diff --git a/fs/super.c b/fs/super.c
index 122c402049a2..4b5b562176d0 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -121,13 +121,23 @@ static unsigned long super_cache_count(struct shrinker *shrink,
121 sb = container_of(shrink, struct super_block, s_shrink); 121 sb = container_of(shrink, struct super_block, s_shrink);
122 122
123 /* 123 /*
124 * Don't call trylock_super as it is a potential 124 * We don't call trylock_super() here as it is a scalability bottleneck,
125 * scalability bottleneck. The counts could get updated 125 * so we're exposed to partial setup state. The shrinker rwsem does not
126 * between super_cache_count and super_cache_scan anyway. 126 * protect filesystem operations backing list_lru_shrink_count() or
127 * Call to super_cache_count with shrinker_rwsem held 127 * s_op->nr_cached_objects(). Counts can change between
128 * ensures the safety of call to list_lru_shrink_count() and 128 * super_cache_count and super_cache_scan, so we really don't need locks
129 * s_op->nr_cached_objects(). 129 * here.
130 *
131 * However, if we are currently mounting the superblock, the underlying
132 * filesystem might be in a state of partial construction and hence it
133 * is dangerous to access it. trylock_super() uses a SB_BORN check to
134 * avoid this situation, so do the same here. The memory barrier is
135 * matched with the one in mount_fs() as we don't hold locks here.
130 */ 136 */
137 if (!(sb->s_flags & SB_BORN))
138 return 0;
139 smp_rmb();
140
131 if (sb->s_op && sb->s_op->nr_cached_objects) 141 if (sb->s_op && sb->s_op->nr_cached_objects)
132 total_objects = sb->s_op->nr_cached_objects(sb, sc); 142 total_objects = sb->s_op->nr_cached_objects(sb, sc);
133 143
@@ -1272,6 +1282,14 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
1272 sb = root->d_sb; 1282 sb = root->d_sb;
1273 BUG_ON(!sb); 1283 BUG_ON(!sb);
1274 WARN_ON(!sb->s_bdi); 1284 WARN_ON(!sb->s_bdi);
1285
1286 /*
1287 * Write barrier is for super_cache_count(). We place it before setting
1288 * SB_BORN as the data dependency between the two functions is the
1289 * superblock structure contents that we just set up, not the SB_BORN
1290 * flag.
1291 */
1292 smp_wmb();
1275 sb->s_flags |= SB_BORN; 1293 sb->s_flags |= SB_BORN;
1276 1294
1277 error = security_sb_kern_mount(sb, flags, secdata); 1295 error = security_sb_kern_mount(sb, flags, secdata);