about summary refs log tree commit diff stats
path: root/fs/super.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/super.c')
-rw-r--r-- fs/super.c 252
1 files changed, 230 insertions, 22 deletions
diff --git a/fs/super.c b/fs/super.c
index 4bf714459a4b..b05cf47463d0 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -33,12 +33,19 @@
33#include <linux/rculist_bl.h> 33#include <linux/rculist_bl.h>
34#include <linux/cleancache.h> 34#include <linux/cleancache.h>
35#include <linux/fsnotify.h> 35#include <linux/fsnotify.h>
36#include <linux/lockdep.h>
36#include "internal.h" 37#include "internal.h"
37 38
38 39
39LIST_HEAD(super_blocks); 40LIST_HEAD(super_blocks);
40DEFINE_SPINLOCK(sb_lock); 41DEFINE_SPINLOCK(sb_lock);
41 42
43static char *sb_writers_name[SB_FREEZE_LEVELS] = {
44 "sb_writers",
45 "sb_pagefaults",
46 "sb_internal",
47};
48
42/* 49/*
43 * One thing we have to be careful of with a per-sb shrinker is that we don't 50 * One thing we have to be careful of with a per-sb shrinker is that we don't
44 * drop the last active reference to the superblock from within the shrinker. 51 * drop the last active reference to the superblock from within the shrinker.
@@ -102,6 +109,35 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
102 return total_objects; 109 return total_objects;
103} 110}
104 111
112static int init_sb_writers(struct super_block *s, struct file_system_type *type)
113{
114 int err;
115 int i;
116
117 for (i = 0; i < SB_FREEZE_LEVELS; i++) {
118 err = percpu_counter_init(&s->s_writers.counter[i], 0);
119 if (err < 0)
120 goto err_out;
121 lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
122 &type->s_writers_key[i], 0);
123 }
124 init_waitqueue_head(&s->s_writers.wait);
125 init_waitqueue_head(&s->s_writers.wait_unfrozen);
126 return 0;
127err_out:
128 while (--i >= 0)
129 percpu_counter_destroy(&s->s_writers.counter[i]);
130 return err;
131}
132
133static void destroy_sb_writers(struct super_block *s)
134{
135 int i;
136
137 for (i = 0; i < SB_FREEZE_LEVELS; i++)
138 percpu_counter_destroy(&s->s_writers.counter[i]);
139}
140
105/** 141/**
106 * alloc_super - create new superblock 142 * alloc_super - create new superblock
107 * @type: filesystem type superblock should belong to 143 * @type: filesystem type superblock should belong to
@@ -117,18 +153,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
117 153
118 if (s) { 154 if (s) {
119 if (security_sb_alloc(s)) { 155 if (security_sb_alloc(s)) {
156 /*
157 * security_sb_free() must not be called unless
158 * security_sb_alloc() succeeded, so bail out manually.
159 */
120 kfree(s); 160 kfree(s);
121 s = NULL; 161 s = NULL;
122 goto out; 162 goto out;
123 } 163 }
124#ifdef CONFIG_SMP 164#ifdef CONFIG_SMP
125 s->s_files = alloc_percpu(struct list_head); 165 s->s_files = alloc_percpu(struct list_head);
126 if (!s->s_files) { 166 if (!s->s_files)
127 security_sb_free(s); 167 goto err_out;
128 kfree(s); 168 else {
129 s = NULL;
130 goto out;
131 } else {
132 int i; 169 int i;
133 170
134 for_each_possible_cpu(i) 171 for_each_possible_cpu(i)
@@ -137,6 +174,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
137#else 174#else
138 INIT_LIST_HEAD(&s->s_files); 175 INIT_LIST_HEAD(&s->s_files);
139#endif 176#endif
177 if (init_sb_writers(s, type))
178 goto err_out;
140 s->s_flags = flags; 179 s->s_flags = flags;
141 s->s_bdi = &default_backing_dev_info; 180 s->s_bdi = &default_backing_dev_info;
142 INIT_HLIST_NODE(&s->s_instances); 181 INIT_HLIST_NODE(&s->s_instances);
@@ -178,7 +217,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
178 mutex_init(&s->s_dquot.dqio_mutex); 217 mutex_init(&s->s_dquot.dqio_mutex);
179 mutex_init(&s->s_dquot.dqonoff_mutex); 218 mutex_init(&s->s_dquot.dqonoff_mutex);
180 init_rwsem(&s->s_dquot.dqptr_sem); 219 init_rwsem(&s->s_dquot.dqptr_sem);
181 init_waitqueue_head(&s->s_wait_unfrozen);
182 s->s_maxbytes = MAX_NON_LFS; 220 s->s_maxbytes = MAX_NON_LFS;
183 s->s_op = &default_op; 221 s->s_op = &default_op;
184 s->s_time_gran = 1000000000; 222 s->s_time_gran = 1000000000;
@@ -190,6 +228,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
190 } 228 }
191out: 229out:
192 return s; 230 return s;
231err_out:
232 security_sb_free(s);
233#ifdef CONFIG_SMP
234 if (s->s_files)
235 free_percpu(s->s_files);
236#endif
237 destroy_sb_writers(s);
238 kfree(s);
239 s = NULL;
240 goto out;
193} 241}
194 242
195/** 243/**
@@ -203,6 +251,7 @@ static inline void destroy_super(struct super_block *s)
203#ifdef CONFIG_SMP 251#ifdef CONFIG_SMP
204 free_percpu(s->s_files); 252 free_percpu(s->s_files);
205#endif 253#endif
254 destroy_sb_writers(s);
206 security_sb_free(s); 255 security_sb_free(s);
207 WARN_ON(!list_empty(&s->s_mounts)); 256 WARN_ON(!list_empty(&s->s_mounts));
208 kfree(s->s_subtype); 257 kfree(s->s_subtype);
@@ -651,10 +700,11 @@ struct super_block *get_super_thawed(struct block_device *bdev)
651{ 700{
652 while (1) { 701 while (1) {
653 struct super_block *s = get_super(bdev); 702 struct super_block *s = get_super(bdev);
654 if (!s || s->s_frozen == SB_UNFROZEN) 703 if (!s || s->s_writers.frozen == SB_UNFROZEN)
655 return s; 704 return s;
656 up_read(&s->s_umount); 705 up_read(&s->s_umount);
657 vfs_check_frozen(s, SB_FREEZE_WRITE); 706 wait_event(s->s_writers.wait_unfrozen,
707 s->s_writers.frozen == SB_UNFROZEN);
658 put_super(s); 708 put_super(s);
659 } 709 }
660} 710}
@@ -732,7 +782,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
732 int retval; 782 int retval;
733 int remount_ro; 783 int remount_ro;
734 784
735 if (sb->s_frozen != SB_UNFROZEN) 785 if (sb->s_writers.frozen != SB_UNFROZEN)
736 return -EBUSY; 786 return -EBUSY;
737 787
738#ifdef CONFIG_BLOCK 788#ifdef CONFIG_BLOCK
@@ -1163,6 +1213,120 @@ out:
1163 return ERR_PTR(error); 1213 return ERR_PTR(error);
1164} 1214}
1165 1215
1216/*
1217 * This is an internal function; please use sb_end_{write,pagefault,intwrite}
1218 * instead.
1219 */
1220void __sb_end_write(struct super_block *sb, int level)
1221{
1222 percpu_counter_dec(&sb->s_writers.counter[level-1]);
1223 /*
1224 * Make sure s_writers are updated before we wake up waiters in
1225 * freeze_super().
1226 */
1227 smp_mb();
1228 if (waitqueue_active(&sb->s_writers.wait))
1229 wake_up(&sb->s_writers.wait);
1230 rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
1231}
1232EXPORT_SYMBOL(__sb_end_write);
1233
1234#ifdef CONFIG_LOCKDEP
1235/*
1236 * We want lockdep to tell us about possible deadlocks with freezing but
1237 * it's a bit tricky to properly instrument it. Getting a freeze protection
1238 * works as getting a read lock but there are subtle problems. XFS for example
1239 * gets freeze protection on internal level twice in some cases, which is OK
1240 * only because we already hold a freeze protection also on higher level. Due
1241 * to these cases we have to tell lockdep we are doing trylock when we
1242 * already hold a freeze protection for a higher freeze level.
1243 */
1244static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
1245 unsigned long ip)
1246{
1247 int i;
1248
1249 if (!trylock) {
1250 for (i = 0; i < level - 1; i++)
1251 if (lock_is_held(&sb->s_writers.lock_map[i])) {
1252 trylock = true;
1253 break;
1254 }
1255 }
1256 rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip);
1257}
1258#endif
1259
1260/*
1261 * This is an internal function; please use sb_start_{write,pagefault,intwrite}
1262 * instead.
1263 */
1264int __sb_start_write(struct super_block *sb, int level, bool wait)
1265{
1266retry:
1267 if (unlikely(sb->s_writers.frozen >= level)) {
1268 if (!wait)
1269 return 0;
1270 wait_event(sb->s_writers.wait_unfrozen,
1271 sb->s_writers.frozen < level);
1272 }
1273
1274#ifdef CONFIG_LOCKDEP
1275 acquire_freeze_lock(sb, level, !wait, _RET_IP_);
1276#endif
1277 percpu_counter_inc(&sb->s_writers.counter[level-1]);
1278 /*
1279 * Make sure counter is updated before we check for frozen.
1280 * freeze_super() first sets frozen and then checks the counter.
1281 */
1282 smp_mb();
1283 if (unlikely(sb->s_writers.frozen >= level)) {
1284 __sb_end_write(sb, level);
1285 goto retry;
1286 }
1287 return 1;
1288}
1289EXPORT_SYMBOL(__sb_start_write);
1290
1291/**
1292 * sb_wait_write - wait until all writers to given file system finish
1293 * @sb: the super for which we wait
1294 * @level: type of writers we wait for (normal, page fault, or internal)
1295 *
1296 * This function waits until there are no writers of the given type to the
1297 * given file system. The caller should make sure there can be no new writers
1298 * of type @level before calling this function; otherwise this function can
1299 * livelock.
1300 */
1301static void sb_wait_write(struct super_block *sb, int level)
1302{
1303 s64 writers;
1304
1305 /*
1306 * We just cycle through lockdep here so that it does not complain
1307 * about returning to userspace with the lock held.
1308 */
1309 rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
1310 rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
1311
1312 do {
1313 DEFINE_WAIT(wait);
1314
1315 /*
1316 * We use a barrier in prepare_to_wait() to separate setting
1317 * of frozen and checking of the counter
1318 */
1319 prepare_to_wait(&sb->s_writers.wait, &wait,
1320 TASK_UNINTERRUPTIBLE);
1321
1322 writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
1323 if (writers)
1324 schedule();
1325
1326 finish_wait(&sb->s_writers.wait, &wait);
1327 } while (writers);
1328}
1329
1166/** 1330/**
1167 * freeze_super - lock the filesystem and force it into a consistent state 1331 * freeze_super - lock the filesystem and force it into a consistent state
1168 * @sb: the super to lock 1332 * @sb: the super to lock
@@ -1170,6 +1334,31 @@ out:
1170 * Syncs the super to make sure the filesystem is consistent and calls the fs's 1334 * Syncs the super to make sure the filesystem is consistent and calls the fs's
1171 * freeze_fs. Subsequent calls to this without first thawing the fs will return 1335 * freeze_fs. Subsequent calls to this without first thawing the fs will return
1172 * -EBUSY. 1336 * -EBUSY.
1337 *
1338 * During this function, sb->s_writers.frozen goes through these values:
1339 *
1340 * SB_UNFROZEN: File system is normal, all writes progress as usual.
1341 *
1342 * SB_FREEZE_WRITE: The file system is in the process of being frozen. New
1343 * writes should be blocked, though page faults are still allowed. We wait for
1344 * all writes to complete and then proceed to the next stage.
1345 *
1346 * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked
1347 * but internal fs threads can still modify the filesystem (although they
1348 * should not dirty new pages or inodes), writeback can run etc. After waiting
1349 * for all running page faults we sync the filesystem which will clean all
1350 * dirty pages and inodes (no new dirty pages or inodes can be created when
1351 * sync is running).
1352 *
1353 * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs
1354 * modification are blocked (e.g. XFS preallocation truncation on inode
1355 * reclaim). This is usually implemented by blocking new transactions for
1356 * filesystems that have them and need this additional guard. After all
1357 * internal writers are finished we call ->freeze_fs() to finish filesystem
1358 * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is
1359 * mostly auxiliary for filesystems to verify they do not modify frozen fs.
1360 *
1361 * sb->s_writers.frozen is protected by sb->s_umount.
1173 */ 1362 */
1174int freeze_super(struct super_block *sb) 1363int freeze_super(struct super_block *sb)
1175{ 1364{
@@ -1177,7 +1366,7 @@ int freeze_super(struct super_block *sb)
1177 1366
1178 atomic_inc(&sb->s_active); 1367 atomic_inc(&sb->s_active);
1179 down_write(&sb->s_umount); 1368 down_write(&sb->s_umount);
1180 if (sb->s_frozen) { 1369 if (sb->s_writers.frozen != SB_UNFROZEN) {
1181 deactivate_locked_super(sb); 1370 deactivate_locked_super(sb);
1182 return -EBUSY; 1371 return -EBUSY;
1183 } 1372 }
@@ -1188,33 +1377,53 @@ int freeze_super(struct super_block *sb)
1188 } 1377 }
1189 1378
1190 if (sb->s_flags & MS_RDONLY) { 1379 if (sb->s_flags & MS_RDONLY) {
1191 sb->s_frozen = SB_FREEZE_TRANS; 1380 /* Nothing to do really... */
1192 smp_wmb(); 1381 sb->s_writers.frozen = SB_FREEZE_COMPLETE;
1193 up_write(&sb->s_umount); 1382 up_write(&sb->s_umount);
1194 return 0; 1383 return 0;
1195 } 1384 }
1196 1385
1197 sb->s_frozen = SB_FREEZE_WRITE; 1386 /* From now on, no new normal writers can start */
1387 sb->s_writers.frozen = SB_FREEZE_WRITE;
1388 smp_wmb();
1389
1390 /* Release s_umount to preserve sb_start_write -> s_umount ordering */
1391 up_write(&sb->s_umount);
1392
1393 sb_wait_write(sb, SB_FREEZE_WRITE);
1394
1395 /* Now we go and block page faults... */
1396 down_write(&sb->s_umount);
1397 sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
1198 smp_wmb(); 1398 smp_wmb();
1199 1399
1400 sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
1401
1402 /* All writers are done so after syncing there won't be dirty data */
1200 sync_filesystem(sb); 1403 sync_filesystem(sb);
1201 1404
1202 sb->s_frozen = SB_FREEZE_TRANS; 1405 /* Now wait for internal filesystem counter */
1406 sb->s_writers.frozen = SB_FREEZE_FS;
1203 smp_wmb(); 1407 smp_wmb();
1408 sb_wait_write(sb, SB_FREEZE_FS);
1204 1409
1205 sync_blockdev(sb->s_bdev);
1206 if (sb->s_op->freeze_fs) { 1410 if (sb->s_op->freeze_fs) {
1207 ret = sb->s_op->freeze_fs(sb); 1411 ret = sb->s_op->freeze_fs(sb);
1208 if (ret) { 1412 if (ret) {
1209 printk(KERN_ERR 1413 printk(KERN_ERR
1210 "VFS:Filesystem freeze failed\n"); 1414 "VFS:Filesystem freeze failed\n");
1211 sb->s_frozen = SB_UNFROZEN; 1415 sb->s_writers.frozen = SB_UNFROZEN;
1212 smp_wmb(); 1416 smp_wmb();
1213 wake_up(&sb->s_wait_unfrozen); 1417 wake_up(&sb->s_writers.wait_unfrozen);
1214 deactivate_locked_super(sb); 1418 deactivate_locked_super(sb);
1215 return ret; 1419 return ret;
1216 } 1420 }
1217 } 1421 }
1422 /*
1423 * This is just for debugging purposes so that fs can warn if it
1424 * sees write activity when frozen is set to SB_FREEZE_COMPLETE.
1425 */
1426 sb->s_writers.frozen = SB_FREEZE_COMPLETE;
1218 up_write(&sb->s_umount); 1427 up_write(&sb->s_umount);
1219 return 0; 1428 return 0;
1220} 1429}
@@ -1231,7 +1440,7 @@ int thaw_super(struct super_block *sb)
1231 int error; 1440 int error;
1232 1441
1233 down_write(&sb->s_umount); 1442 down_write(&sb->s_umount);
1234 if (sb->s_frozen == SB_UNFROZEN) { 1443 if (sb->s_writers.frozen == SB_UNFROZEN) {
1235 up_write(&sb->s_umount); 1444 up_write(&sb->s_umount);
1236 return -EINVAL; 1445 return -EINVAL;
1237 } 1446 }
@@ -1244,16 +1453,15 @@ int thaw_super(struct super_block *sb)
1244 if (error) { 1453 if (error) {
1245 printk(KERN_ERR 1454 printk(KERN_ERR
1246 "VFS:Filesystem thaw failed\n"); 1455 "VFS:Filesystem thaw failed\n");
1247 sb->s_frozen = SB_FREEZE_TRANS;
1248 up_write(&sb->s_umount); 1456 up_write(&sb->s_umount);
1249 return error; 1457 return error;
1250 } 1458 }
1251 } 1459 }
1252 1460
1253out: 1461out:
1254 sb->s_frozen = SB_UNFROZEN; 1462 sb->s_writers.frozen = SB_UNFROZEN;
1255 smp_wmb(); 1463 smp_wmb();
1256 wake_up(&sb->s_wait_unfrozen); 1464 wake_up(&sb->s_writers.wait_unfrozen);
1257 deactivate_locked_super(sb); 1465 deactivate_locked_super(sb);
1258 1466
1259 return 0; 1467 return 0;