aboutsummaryrefslogtreecommitdiffstats
path: root/fs/super.c
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2012-08-01 13:26:23 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2012-08-01 13:26:23 -0400
commit a0e881b7c189fa2bd76c024dbff91e79511c971d (patch)
tree 0c801918565b08921d21aceee5b326f64d998f5f /fs/super.c
parent eff0d13f3823f35d70228cd151d2a2c89288ff32 (diff)
parent dbc6e0222d79e78925fe20733844a796a4b72cf9 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull second vfs pile from Al Viro:

 "The stuff in there: fsfreeze deadlock fixes by Jan (essentially, the deadlock reproduced by xfstests 068), symlink and hardlink restriction patches, plus assorted cleanups and fixes.

  Note that another fsfreeze deadlock (emergency thaw one) is *not* dealt with - the series by Fernando conflicts a lot with Jan's, breaks userland ABI (FIFREEZE semantics gets changed) and trades the deadlock for massive vfsmount leak; this is going to be handled next cycle. There probably will be another pull request, but that stuff won't be in it."

Fix up trivial conflicts due to unrelated changes next to each other in drivers/{staging/gdm72xx/usb_boot.c, usb/gadget/storage_common.c}

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (54 commits)
  delousing target_core_file a bit
  Documentation: Correct s_umount state for freeze_fs/unfreeze_fs
  fs: Remove old freezing mechanism
  ext2: Implement freezing
  btrfs: Convert to new freezing mechanism
  nilfs2: Convert to new freezing mechanism
  ntfs: Convert to new freezing mechanism
  fuse: Convert to new freezing mechanism
  gfs2: Convert to new freezing mechanism
  ocfs2: Convert to new freezing mechanism
  xfs: Convert to new freezing code
  ext4: Convert to new freezing mechanism
  fs: Protect write paths by sb_start_write - sb_end_write
  fs: Skip atime update on frozen filesystem
  fs: Add freezing handling to mnt_want_write() / mnt_drop_write()
  fs: Improve filesystem freezing handling
  switch the protection of percpu_counter list to spinlock
  nfsd: Push mnt_want_write() outside of i_mutex
  btrfs: Push mnt_want_write() outside of i_mutex
  fat: Push mnt_want_write() outside of i_mutex
  ...
Diffstat (limited to 'fs/super.c')
-rw-r--r-- fs/super.c 252
1 files changed, 230 insertions, 22 deletions
diff --git a/fs/super.c b/fs/super.c
index 4bf714459a4b..b05cf47463d0 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -33,12 +33,19 @@
33#include <linux/rculist_bl.h> 33#include <linux/rculist_bl.h>
34#include <linux/cleancache.h> 34#include <linux/cleancache.h>
35#include <linux/fsnotify.h> 35#include <linux/fsnotify.h>
36#include <linux/lockdep.h>
36#include "internal.h" 37#include "internal.h"
37 38
38 39
39LIST_HEAD(super_blocks); 40LIST_HEAD(super_blocks);
40DEFINE_SPINLOCK(sb_lock); 41DEFINE_SPINLOCK(sb_lock);
41 42
43static char *sb_writers_name[SB_FREEZE_LEVELS] = {
44 "sb_writers",
45 "sb_pagefaults",
46 "sb_internal",
47};
48
42/* 49/*
43 * One thing we have to be careful of with a per-sb shrinker is that we don't 50 * One thing we have to be careful of with a per-sb shrinker is that we don't
44 * drop the last active reference to the superblock from within the shrinker. 51 * drop the last active reference to the superblock from within the shrinker.
@@ -102,6 +109,35 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
102 return total_objects; 109 return total_objects;
103} 110}
104 111
112static int init_sb_writers(struct super_block *s, struct file_system_type *type)
113{
114 int err;
115 int i;
116
117 for (i = 0; i < SB_FREEZE_LEVELS; i++) {
118 err = percpu_counter_init(&s->s_writers.counter[i], 0);
119 if (err < 0)
120 goto err_out;
121 lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
122 &type->s_writers_key[i], 0);
123 }
124 init_waitqueue_head(&s->s_writers.wait);
125 init_waitqueue_head(&s->s_writers.wait_unfrozen);
126 return 0;
127err_out:
128 while (--i >= 0)
129 percpu_counter_destroy(&s->s_writers.counter[i]);
130 return err;
131}
132
133static void destroy_sb_writers(struct super_block *s)
134{
135 int i;
136
137 for (i = 0; i < SB_FREEZE_LEVELS; i++)
138 percpu_counter_destroy(&s->s_writers.counter[i]);
139}
140
105/** 141/**
106 * alloc_super - create new superblock 142 * alloc_super - create new superblock
107 * @type: filesystem type superblock should belong to 143 * @type: filesystem type superblock should belong to
@@ -117,18 +153,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
117 153
118 if (s) { 154 if (s) {
119 if (security_sb_alloc(s)) { 155 if (security_sb_alloc(s)) {
156 /*
157 * We cannot call security_sb_free() without
158 * security_sb_alloc() succeeding. So bail out manually
159 */
120 kfree(s); 160 kfree(s);
121 s = NULL; 161 s = NULL;
122 goto out; 162 goto out;
123 } 163 }
124#ifdef CONFIG_SMP 164#ifdef CONFIG_SMP
125 s->s_files = alloc_percpu(struct list_head); 165 s->s_files = alloc_percpu(struct list_head);
126 if (!s->s_files) { 166 if (!s->s_files)
127 security_sb_free(s); 167 goto err_out;
128 kfree(s); 168 else {
129 s = NULL;
130 goto out;
131 } else {
132 int i; 169 int i;
133 170
134 for_each_possible_cpu(i) 171 for_each_possible_cpu(i)
@@ -137,6 +174,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
137#else 174#else
138 INIT_LIST_HEAD(&s->s_files); 175 INIT_LIST_HEAD(&s->s_files);
139#endif 176#endif
177 if (init_sb_writers(s, type))
178 goto err_out;
140 s->s_flags = flags; 179 s->s_flags = flags;
141 s->s_bdi = &default_backing_dev_info; 180 s->s_bdi = &default_backing_dev_info;
142 INIT_HLIST_NODE(&s->s_instances); 181 INIT_HLIST_NODE(&s->s_instances);
@@ -178,7 +217,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
178 mutex_init(&s->s_dquot.dqio_mutex); 217 mutex_init(&s->s_dquot.dqio_mutex);
179 mutex_init(&s->s_dquot.dqonoff_mutex); 218 mutex_init(&s->s_dquot.dqonoff_mutex);
180 init_rwsem(&s->s_dquot.dqptr_sem); 219 init_rwsem(&s->s_dquot.dqptr_sem);
181 init_waitqueue_head(&s->s_wait_unfrozen);
182 s->s_maxbytes = MAX_NON_LFS; 220 s->s_maxbytes = MAX_NON_LFS;
183 s->s_op = &default_op; 221 s->s_op = &default_op;
184 s->s_time_gran = 1000000000; 222 s->s_time_gran = 1000000000;
@@ -190,6 +228,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
190 } 228 }
191out: 229out:
192 return s; 230 return s;
231err_out:
232 security_sb_free(s);
233#ifdef CONFIG_SMP
234 if (s->s_files)
235 free_percpu(s->s_files);
236#endif
237 destroy_sb_writers(s);
238 kfree(s);
239 s = NULL;
240 goto out;
193} 241}
194 242
195/** 243/**
@@ -203,6 +251,7 @@ static inline void destroy_super(struct super_block *s)
203#ifdef CONFIG_SMP 251#ifdef CONFIG_SMP
204 free_percpu(s->s_files); 252 free_percpu(s->s_files);
205#endif 253#endif
254 destroy_sb_writers(s);
206 security_sb_free(s); 255 security_sb_free(s);
207 WARN_ON(!list_empty(&s->s_mounts)); 256 WARN_ON(!list_empty(&s->s_mounts));
208 kfree(s->s_subtype); 257 kfree(s->s_subtype);
@@ -651,10 +700,11 @@ struct super_block *get_super_thawed(struct block_device *bdev)
651{ 700{
652 while (1) { 701 while (1) {
653 struct super_block *s = get_super(bdev); 702 struct super_block *s = get_super(bdev);
654 if (!s || s->s_frozen == SB_UNFROZEN) 703 if (!s || s->s_writers.frozen == SB_UNFROZEN)
655 return s; 704 return s;
656 up_read(&s->s_umount); 705 up_read(&s->s_umount);
657 vfs_check_frozen(s, SB_FREEZE_WRITE); 706 wait_event(s->s_writers.wait_unfrozen,
707 s->s_writers.frozen == SB_UNFROZEN);
658 put_super(s); 708 put_super(s);
659 } 709 }
660} 710}
@@ -732,7 +782,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
732 int retval; 782 int retval;
733 int remount_ro; 783 int remount_ro;
734 784
735 if (sb->s_frozen != SB_UNFROZEN) 785 if (sb->s_writers.frozen != SB_UNFROZEN)
736 return -EBUSY; 786 return -EBUSY;
737 787
738#ifdef CONFIG_BLOCK 788#ifdef CONFIG_BLOCK
@@ -1163,6 +1213,120 @@ out:
1163 return ERR_PTR(error); 1213 return ERR_PTR(error);
1164} 1214}
1165 1215
1216/*
1217 * This is an internal function, please use sb_end_{write,pagefault,intwrite}
1218 * instead.
1219 */
1220void __sb_end_write(struct super_block *sb, int level)
1221{
1222 percpu_counter_dec(&sb->s_writers.counter[level-1]);
1223 /*
1224 * Make sure s_writers are updated before we wake up waiters in
1225 * freeze_super().
1226 */
1227 smp_mb();
1228 if (waitqueue_active(&sb->s_writers.wait))
1229 wake_up(&sb->s_writers.wait);
1230 rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
1231}
1232EXPORT_SYMBOL(__sb_end_write);
1233
1234#ifdef CONFIG_LOCKDEP
1235/*
1236 * We want lockdep to tell us about possible deadlocks with freezing but
1237 * it's a bit tricky to properly instrument it. Getting a freeze protection
1238 * works as getting a read lock but there are subtle problems. XFS for example
1239 * gets freeze protection on internal level twice in some cases, which is OK
1240 * only because we already hold a freeze protection also on higher level. Due
1241 * to these cases we have to tell lockdep we are doing trylock when we
1242 * already hold a freeze protection for a higher freeze level.
1243 */
1244static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
1245 unsigned long ip)
1246{
1247 int i;
1248
1249 if (!trylock) {
1250 for (i = 0; i < level - 1; i++)
1251 if (lock_is_held(&sb->s_writers.lock_map[i])) {
1252 trylock = true;
1253 break;
1254 }
1255 }
1256 rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip);
1257}
1258#endif
1259
1260/*
1261 * This is an internal function, please use sb_start_{write,pagefault,intwrite}
1262 * instead.
1263 */
1264int __sb_start_write(struct super_block *sb, int level, bool wait)
1265{
1266retry:
1267 if (unlikely(sb->s_writers.frozen >= level)) {
1268 if (!wait)
1269 return 0;
1270 wait_event(sb->s_writers.wait_unfrozen,
1271 sb->s_writers.frozen < level);
1272 }
1273
1274#ifdef CONFIG_LOCKDEP
1275 acquire_freeze_lock(sb, level, !wait, _RET_IP_);
1276#endif
1277 percpu_counter_inc(&sb->s_writers.counter[level-1]);
1278 /*
1279 * Make sure counter is updated before we check for frozen.
1280 * freeze_super() first sets frozen and then checks the counter.
1281 */
1282 smp_mb();
1283 if (unlikely(sb->s_writers.frozen >= level)) {
1284 __sb_end_write(sb, level);
1285 goto retry;
1286 }
1287 return 1;
1288}
1289EXPORT_SYMBOL(__sb_start_write);
1290
1291/**
1292 * sb_wait_write - wait until all writers to given file system finish
1293 * @sb: the super for which we wait
1294 * @level: type of writers we wait for (normal vs page fault)
1295 *
1296 * This function waits until there are no writers of given type to given file
1297 * system. Caller of this function should make sure there can be no new writers
1298 * of type @level before calling this function. Otherwise this function can
1299 * livelock.
1300 */
1301static void sb_wait_write(struct super_block *sb, int level)
1302{
1303 s64 writers;
1304
1305 /*
1306 * We just cycle-through lockdep here so that it does not complain
1307 * about returning with lock to userspace
1308 */
1309 rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
1310 rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
1311
1312 do {
1313 DEFINE_WAIT(wait);
1314
1315 /*
1316 * We use a barrier in prepare_to_wait() to separate setting
1317 * of frozen and checking of the counter
1318 */
1319 prepare_to_wait(&sb->s_writers.wait, &wait,
1320 TASK_UNINTERRUPTIBLE);
1321
1322 writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
1323 if (writers)
1324 schedule();
1325
1326 finish_wait(&sb->s_writers.wait, &wait);
1327 } while (writers);
1328}
1329
1166/** 1330/**
1167 * freeze_super - lock the filesystem and force it into a consistent state 1331 * freeze_super - lock the filesystem and force it into a consistent state
1168 * @sb: the super to lock 1332 * @sb: the super to lock
@@ -1170,6 +1334,31 @@ out:
1170 * Syncs the super to make sure the filesystem is consistent and calls the fs's 1334 * Syncs the super to make sure the filesystem is consistent and calls the fs's
1171 * freeze_fs. Subsequent calls to this without first thawing the fs will return 1335 * freeze_fs. Subsequent calls to this without first thawing the fs will return
1172 * -EBUSY. 1336 * -EBUSY.
1337 *
1338 * During this function, sb->s_writers.frozen goes through these values:
1339 *
1340 * SB_UNFROZEN: File system is normal, all writes progress as usual.
1341 *
1342 * SB_FREEZE_WRITE: The file system is in the process of being frozen. New
1343 * writes should be blocked, though page faults are still allowed. We wait for
1344 * all writes to complete and then proceed to the next stage.
1345 *
1346 * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked
1347 * but internal fs threads can still modify the filesystem (although they
1348 * should not dirty new pages or inodes), writeback can run etc. After waiting
1349 * for all running page faults we sync the filesystem which will clean all
1350 * dirty pages and inodes (no new dirty pages or inodes can be created when
1351 * sync is running).
1352 *
1353 * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs
1354 * modification are blocked (e.g. XFS preallocation truncation on inode
1355 * reclaim). This is usually implemented by blocking new transactions for
1356 * filesystems that have them and need this additional guard. After all
1357 * internal writers are finished we call ->freeze_fs() to finish filesystem
1358 * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is
1359 * mostly auxiliary for filesystems to verify they do not modify frozen fs.
1360 *
1361 * sb->s_writers.frozen is protected by sb->s_umount.
1173 */ 1362 */
1174int freeze_super(struct super_block *sb) 1363int freeze_super(struct super_block *sb)
1175{ 1364{
@@ -1177,7 +1366,7 @@ int freeze_super(struct super_block *sb)
1177 1366
1178 atomic_inc(&sb->s_active); 1367 atomic_inc(&sb->s_active);
1179 down_write(&sb->s_umount); 1368 down_write(&sb->s_umount);
1180 if (sb->s_frozen) { 1369 if (sb->s_writers.frozen != SB_UNFROZEN) {
1181 deactivate_locked_super(sb); 1370 deactivate_locked_super(sb);
1182 return -EBUSY; 1371 return -EBUSY;
1183 } 1372 }
@@ -1188,33 +1377,53 @@ int freeze_super(struct super_block *sb)
1188 } 1377 }
1189 1378
1190 if (sb->s_flags & MS_RDONLY) { 1379 if (sb->s_flags & MS_RDONLY) {
1191 sb->s_frozen = SB_FREEZE_TRANS; 1380 /* Nothing to do really... */
1192 smp_wmb(); 1381 sb->s_writers.frozen = SB_FREEZE_COMPLETE;
1193 up_write(&sb->s_umount); 1382 up_write(&sb->s_umount);
1194 return 0; 1383 return 0;
1195 } 1384 }
1196 1385
1197 sb->s_frozen = SB_FREEZE_WRITE; 1386 /* From now on, no new normal writers can start */
1387 sb->s_writers.frozen = SB_FREEZE_WRITE;
1388 smp_wmb();
1389
1390 /* Release s_umount to preserve sb_start_write -> s_umount ordering */
1391 up_write(&sb->s_umount);
1392
1393 sb_wait_write(sb, SB_FREEZE_WRITE);
1394
1395 /* Now we go and block page faults... */
1396 down_write(&sb->s_umount);
1397 sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
1198 smp_wmb(); 1398 smp_wmb();
1199 1399
1400 sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
1401
1402 /* All writers are done so after syncing there won't be dirty data */
1200 sync_filesystem(sb); 1403 sync_filesystem(sb);
1201 1404
1202 sb->s_frozen = SB_FREEZE_TRANS; 1405 /* Now wait for internal filesystem counter */
1406 sb->s_writers.frozen = SB_FREEZE_FS;
1203 smp_wmb(); 1407 smp_wmb();
1408 sb_wait_write(sb, SB_FREEZE_FS);
1204 1409
1205 sync_blockdev(sb->s_bdev);
1206 if (sb->s_op->freeze_fs) { 1410 if (sb->s_op->freeze_fs) {
1207 ret = sb->s_op->freeze_fs(sb); 1411 ret = sb->s_op->freeze_fs(sb);
1208 if (ret) { 1412 if (ret) {
1209 printk(KERN_ERR 1413 printk(KERN_ERR
1210 "VFS:Filesystem freeze failed\n"); 1414 "VFS:Filesystem freeze failed\n");
1211 sb->s_frozen = SB_UNFROZEN; 1415 sb->s_writers.frozen = SB_UNFROZEN;
1212 smp_wmb(); 1416 smp_wmb();
1213 wake_up(&sb->s_wait_unfrozen); 1417 wake_up(&sb->s_writers.wait_unfrozen);
1214 deactivate_locked_super(sb); 1418 deactivate_locked_super(sb);
1215 return ret; 1419 return ret;
1216 } 1420 }
1217 } 1421 }
1422 /*
1423 * This is just for debugging purposes so that fs can warn if it
1424 * sees write activity when frozen is set to SB_FREEZE_COMPLETE.
1425 */
1426 sb->s_writers.frozen = SB_FREEZE_COMPLETE;
1218 up_write(&sb->s_umount); 1427 up_write(&sb->s_umount);
1219 return 0; 1428 return 0;
1220} 1429}
@@ -1231,7 +1440,7 @@ int thaw_super(struct super_block *sb)
1231 int error; 1440 int error;
1232 1441
1233 down_write(&sb->s_umount); 1442 down_write(&sb->s_umount);
1234 if (sb->s_frozen == SB_UNFROZEN) { 1443 if (sb->s_writers.frozen == SB_UNFROZEN) {
1235 up_write(&sb->s_umount); 1444 up_write(&sb->s_umount);
1236 return -EINVAL; 1445 return -EINVAL;
1237 } 1446 }
@@ -1244,16 +1453,15 @@ int thaw_super(struct super_block *sb)
1244 if (error) { 1453 if (error) {
1245 printk(KERN_ERR 1454 printk(KERN_ERR
1246 "VFS:Filesystem thaw failed\n"); 1455 "VFS:Filesystem thaw failed\n");
1247 sb->s_frozen = SB_FREEZE_TRANS;
1248 up_write(&sb->s_umount); 1456 up_write(&sb->s_umount);
1249 return error; 1457 return error;
1250 } 1458 }
1251 } 1459 }
1252 1460
1253out: 1461out:
1254 sb->s_frozen = SB_UNFROZEN; 1462 sb->s_writers.frozen = SB_UNFROZEN;
1255 smp_wmb(); 1463 smp_wmb();
1256 wake_up(&sb->s_wait_unfrozen); 1464 wake_up(&sb->s_writers.wait_unfrozen);
1257 deactivate_locked_super(sb); 1465 deactivate_locked_super(sb);
1258 1466
1259 return 0; 1467 return 0;