diff options
Diffstat (limited to 'fs/super.c')
-rw-r--r-- | fs/super.c | 252 |
1 files changed, 230 insertions, 22 deletions
diff --git a/fs/super.c b/fs/super.c index 4bf714459a4b..b05cf47463d0 100644 --- a/fs/super.c +++ b/fs/super.c | |||
@@ -33,12 +33,19 @@ | |||
33 | #include <linux/rculist_bl.h> | 33 | #include <linux/rculist_bl.h> |
34 | #include <linux/cleancache.h> | 34 | #include <linux/cleancache.h> |
35 | #include <linux/fsnotify.h> | 35 | #include <linux/fsnotify.h> |
36 | #include <linux/lockdep.h> | ||
36 | #include "internal.h" | 37 | #include "internal.h" |
37 | 38 | ||
38 | 39 | ||
39 | LIST_HEAD(super_blocks); | 40 | LIST_HEAD(super_blocks); |
40 | DEFINE_SPINLOCK(sb_lock); | 41 | DEFINE_SPINLOCK(sb_lock); |
41 | 42 | ||
43 | static char *sb_writers_name[SB_FREEZE_LEVELS] = { | ||
44 | "sb_writers", | ||
45 | "sb_pagefaults", | ||
46 | "sb_internal", | ||
47 | }; | ||
48 | |||
42 | /* | 49 | /* |
43 | * One thing we have to be careful of with a per-sb shrinker is that we don't | 50 | * One thing we have to be careful of with a per-sb shrinker is that we don't |
44 | * drop the last active reference to the superblock from within the shrinker. | 51 | * drop the last active reference to the superblock from within the shrinker. |
@@ -102,6 +109,35 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc) | |||
102 | return total_objects; | 109 | return total_objects; |
103 | } | 110 | } |
104 | 111 | ||
112 | static int init_sb_writers(struct super_block *s, struct file_system_type *type) | ||
113 | { | ||
114 | int err; | ||
115 | int i; | ||
116 | |||
117 | for (i = 0; i < SB_FREEZE_LEVELS; i++) { | ||
118 | err = percpu_counter_init(&s->s_writers.counter[i], 0); | ||
119 | if (err < 0) | ||
120 | goto err_out; | ||
121 | lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], | ||
122 | &type->s_writers_key[i], 0); | ||
123 | } | ||
124 | init_waitqueue_head(&s->s_writers.wait); | ||
125 | init_waitqueue_head(&s->s_writers.wait_unfrozen); | ||
126 | return 0; | ||
127 | err_out: | ||
128 | while (--i >= 0) | ||
129 | percpu_counter_destroy(&s->s_writers.counter[i]); | ||
130 | return err; | ||
131 | } | ||
132 | |||
133 | static void destroy_sb_writers(struct super_block *s) | ||
134 | { | ||
135 | int i; | ||
136 | |||
137 | for (i = 0; i < SB_FREEZE_LEVELS; i++) | ||
138 | percpu_counter_destroy(&s->s_writers.counter[i]); | ||
139 | } | ||
140 | |||
105 | /** | 141 | /** |
106 | * alloc_super - create new superblock | 142 | * alloc_super - create new superblock |
107 | * @type: filesystem type superblock should belong to | 143 | * @type: filesystem type superblock should belong to |
@@ -117,18 +153,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) | |||
117 | 153 | ||
118 | if (s) { | 154 | if (s) { |
119 | if (security_sb_alloc(s)) { | 155 | if (security_sb_alloc(s)) { |
156 | /* | ||
157 | * We cannot call security_sb_free() without | ||
158 | * security_sb_alloc() succeeding. So bail out manually | ||
159 | */ | ||
120 | kfree(s); | 160 | kfree(s); |
121 | s = NULL; | 161 | s = NULL; |
122 | goto out; | 162 | goto out; |
123 | } | 163 | } |
124 | #ifdef CONFIG_SMP | 164 | #ifdef CONFIG_SMP |
125 | s->s_files = alloc_percpu(struct list_head); | 165 | s->s_files = alloc_percpu(struct list_head); |
126 | if (!s->s_files) { | 166 | if (!s->s_files) |
127 | security_sb_free(s); | 167 | goto err_out; |
128 | kfree(s); | 168 | else { |
129 | s = NULL; | ||
130 | goto out; | ||
131 | } else { | ||
132 | int i; | 169 | int i; |
133 | 170 | ||
134 | for_each_possible_cpu(i) | 171 | for_each_possible_cpu(i) |
@@ -137,6 +174,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) | |||
137 | #else | 174 | #else |
138 | INIT_LIST_HEAD(&s->s_files); | 175 | INIT_LIST_HEAD(&s->s_files); |
139 | #endif | 176 | #endif |
177 | if (init_sb_writers(s, type)) | ||
178 | goto err_out; | ||
140 | s->s_flags = flags; | 179 | s->s_flags = flags; |
141 | s->s_bdi = &default_backing_dev_info; | 180 | s->s_bdi = &default_backing_dev_info; |
142 | INIT_HLIST_NODE(&s->s_instances); | 181 | INIT_HLIST_NODE(&s->s_instances); |
@@ -178,7 +217,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) | |||
178 | mutex_init(&s->s_dquot.dqio_mutex); | 217 | mutex_init(&s->s_dquot.dqio_mutex); |
179 | mutex_init(&s->s_dquot.dqonoff_mutex); | 218 | mutex_init(&s->s_dquot.dqonoff_mutex); |
180 | init_rwsem(&s->s_dquot.dqptr_sem); | 219 | init_rwsem(&s->s_dquot.dqptr_sem); |
181 | init_waitqueue_head(&s->s_wait_unfrozen); | ||
182 | s->s_maxbytes = MAX_NON_LFS; | 220 | s->s_maxbytes = MAX_NON_LFS; |
183 | s->s_op = &default_op; | 221 | s->s_op = &default_op; |
184 | s->s_time_gran = 1000000000; | 222 | s->s_time_gran = 1000000000; |
@@ -190,6 +228,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) | |||
190 | } | 228 | } |
191 | out: | 229 | out: |
192 | return s; | 230 | return s; |
231 | err_out: | ||
232 | security_sb_free(s); | ||
233 | #ifdef CONFIG_SMP | ||
234 | if (s->s_files) | ||
235 | free_percpu(s->s_files); | ||
236 | #endif | ||
237 | destroy_sb_writers(s); | ||
238 | kfree(s); | ||
239 | s = NULL; | ||
240 | goto out; | ||
193 | } | 241 | } |
194 | 242 | ||
195 | /** | 243 | /** |
@@ -203,6 +251,7 @@ static inline void destroy_super(struct super_block *s) | |||
203 | #ifdef CONFIG_SMP | 251 | #ifdef CONFIG_SMP |
204 | free_percpu(s->s_files); | 252 | free_percpu(s->s_files); |
205 | #endif | 253 | #endif |
254 | destroy_sb_writers(s); | ||
206 | security_sb_free(s); | 255 | security_sb_free(s); |
207 | WARN_ON(!list_empty(&s->s_mounts)); | 256 | WARN_ON(!list_empty(&s->s_mounts)); |
208 | kfree(s->s_subtype); | 257 | kfree(s->s_subtype); |
@@ -651,10 +700,11 @@ struct super_block *get_super_thawed(struct block_device *bdev) | |||
651 | { | 700 | { |
652 | while (1) { | 701 | while (1) { |
653 | struct super_block *s = get_super(bdev); | 702 | struct super_block *s = get_super(bdev); |
654 | if (!s || s->s_frozen == SB_UNFROZEN) | 703 | if (!s || s->s_writers.frozen == SB_UNFROZEN) |
655 | return s; | 704 | return s; |
656 | up_read(&s->s_umount); | 705 | up_read(&s->s_umount); |
657 | vfs_check_frozen(s, SB_FREEZE_WRITE); | 706 | wait_event(s->s_writers.wait_unfrozen, |
707 | s->s_writers.frozen == SB_UNFROZEN); | ||
658 | put_super(s); | 708 | put_super(s); |
659 | } | 709 | } |
660 | } | 710 | } |
@@ -732,7 +782,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) | |||
732 | int retval; | 782 | int retval; |
733 | int remount_ro; | 783 | int remount_ro; |
734 | 784 | ||
735 | if (sb->s_frozen != SB_UNFROZEN) | 785 | if (sb->s_writers.frozen != SB_UNFROZEN) |
736 | return -EBUSY; | 786 | return -EBUSY; |
737 | 787 | ||
738 | #ifdef CONFIG_BLOCK | 788 | #ifdef CONFIG_BLOCK |
@@ -1163,6 +1213,120 @@ out: | |||
1163 | return ERR_PTR(error); | 1213 | return ERR_PTR(error); |
1164 | } | 1214 | } |
1165 | 1215 | ||
1216 | /* | ||
1217 | * This is an internal function, please use sb_end_{write,pagefault,intwrite} | ||
1218 | * instead. | ||
1219 | */ | ||
1220 | void __sb_end_write(struct super_block *sb, int level) | ||
1221 | { | ||
1222 | percpu_counter_dec(&sb->s_writers.counter[level-1]); | ||
1223 | /* | ||
1224 | * Make sure s_writers are updated before we wake up waiters in | ||
1225 | * freeze_super(). | ||
1226 | */ | ||
1227 | smp_mb(); | ||
1228 | if (waitqueue_active(&sb->s_writers.wait)) | ||
1229 | wake_up(&sb->s_writers.wait); | ||
1230 | rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_); | ||
1231 | } | ||
1232 | EXPORT_SYMBOL(__sb_end_write); | ||
1233 | |||
1234 | #ifdef CONFIG_LOCKDEP | ||
1235 | /* | ||
1236 | * We want lockdep to tell us about possible deadlocks with freezing but | ||
1237 | * it's a bit tricky to properly instrument it. Getting a freeze protection | ||
1238 | * works as getting a read lock but there are subtle problems. XFS for example | ||
1239 | * gets freeze protection on internal level twice in some cases, which is OK | ||
1240 | * only because we already hold a freeze protection also on higher level. Due | ||
1241 | * to these cases we have to tell lockdep we are doing trylock when we | ||
1242 | * already hold a freeze protection for a higher freeze level. | ||
1243 | */ | ||
1244 | static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock, | ||
1245 | unsigned long ip) | ||
1246 | { | ||
1247 | int i; | ||
1248 | |||
1249 | if (!trylock) { | ||
1250 | for (i = 0; i < level - 1; i++) | ||
1251 | if (lock_is_held(&sb->s_writers.lock_map[i])) { | ||
1252 | trylock = true; | ||
1253 | break; | ||
1254 | } | ||
1255 | } | ||
1256 | rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip); | ||
1257 | } | ||
1258 | #endif | ||
1259 | |||
1260 | /* | ||
1261 | * This is an internal function, please use sb_start_{write,pagefault,intwrite} | ||
1262 | * instead. | ||
1263 | */ | ||
1264 | int __sb_start_write(struct super_block *sb, int level, bool wait) | ||
1265 | { | ||
1266 | retry: | ||
1267 | if (unlikely(sb->s_writers.frozen >= level)) { | ||
1268 | if (!wait) | ||
1269 | return 0; | ||
1270 | wait_event(sb->s_writers.wait_unfrozen, | ||
1271 | sb->s_writers.frozen < level); | ||
1272 | } | ||
1273 | |||
1274 | #ifdef CONFIG_LOCKDEP | ||
1275 | acquire_freeze_lock(sb, level, !wait, _RET_IP_); | ||
1276 | #endif | ||
1277 | percpu_counter_inc(&sb->s_writers.counter[level-1]); | ||
1278 | /* | ||
1279 | * Make sure counter is updated before we check for frozen. | ||
1280 | * freeze_super() first sets frozen and then checks the counter. | ||
1281 | */ | ||
1282 | smp_mb(); | ||
1283 | if (unlikely(sb->s_writers.frozen >= level)) { | ||
1284 | __sb_end_write(sb, level); | ||
1285 | goto retry; | ||
1286 | } | ||
1287 | return 1; | ||
1288 | } | ||
1289 | EXPORT_SYMBOL(__sb_start_write); | ||
1290 | |||
1291 | /** | ||
1292 | * sb_wait_write - wait until all writers to given file system finish | ||
1293 | * @sb: the super for which we wait | ||
1294 | * @level: type of writers we wait for (normal, page fault, or internal) | ||
1295 | * | ||
1296 | * This function waits until there are no writers of given type to given file | ||
1297 | * system. Caller of this function should make sure there can be no new writers | ||
1298 | * of type @level before calling this function. Otherwise this function can | ||
1299 | * livelock. | ||
1300 | */ | ||
1301 | static void sb_wait_write(struct super_block *sb, int level) | ||
1302 | { | ||
1303 | s64 writers; | ||
1304 | |||
1305 | /* | ||
1306 | * We just cycle through lockdep here so that it does not complain | ||
1307 | * about returning with lock to userspace | ||
1308 | */ | ||
1309 | rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_); | ||
1310 | rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_); | ||
1311 | |||
1312 | do { | ||
1313 | DEFINE_WAIT(wait); | ||
1314 | |||
1315 | /* | ||
1316 | * We use a barrier in prepare_to_wait() to separate setting | ||
1317 | * of frozen and checking of the counter | ||
1318 | */ | ||
1319 | prepare_to_wait(&sb->s_writers.wait, &wait, | ||
1320 | TASK_UNINTERRUPTIBLE); | ||
1321 | |||
1322 | writers = percpu_counter_sum(&sb->s_writers.counter[level-1]); | ||
1323 | if (writers) | ||
1324 | schedule(); | ||
1325 | |||
1326 | finish_wait(&sb->s_writers.wait, &wait); | ||
1327 | } while (writers); | ||
1328 | } | ||
1329 | |||
1166 | /** | 1330 | /** |
1167 | * freeze_super - lock the filesystem and force it into a consistent state | 1331 | * freeze_super - lock the filesystem and force it into a consistent state |
1168 | * @sb: the super to lock | 1332 | * @sb: the super to lock |
@@ -1170,6 +1334,31 @@ out: | |||
1170 | * Syncs the super to make sure the filesystem is consistent and calls the fs's | 1334 | * Syncs the super to make sure the filesystem is consistent and calls the fs's |
1171 | * freeze_fs. Subsequent calls to this without first thawing the fs will return | 1335 | * freeze_fs. Subsequent calls to this without first thawing the fs will return |
1172 | * -EBUSY. | 1336 | * -EBUSY. |
1337 | * | ||
1338 | * During this function, sb->s_writers.frozen goes through these values: | ||
1339 | * | ||
1340 | * SB_UNFROZEN: File system is normal, all writes progress as usual. | ||
1341 | * | ||
1342 | * SB_FREEZE_WRITE: The file system is in the process of being frozen. New | ||
1343 | * writes should be blocked, though page faults are still allowed. We wait for | ||
1344 | * all writes to complete and then proceed to the next stage. | ||
1345 | * | ||
1346 | * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked | ||
1347 | * but internal fs threads can still modify the filesystem (although they | ||
1348 | * should not dirty new pages or inodes), writeback can run etc. After waiting | ||
1349 | * for all running page faults we sync the filesystem which will clean all | ||
1350 | * dirty pages and inodes (no new dirty pages or inodes can be created when | ||
1351 | * sync is running). | ||
1352 | * | ||
1353 | * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs | ||
1354 | * modification are blocked (e.g. XFS preallocation truncation on inode | ||
1355 | * reclaim). This is usually implemented by blocking new transactions for | ||
1356 | * filesystems that have them and need this additional guard. After all | ||
1357 | * internal writers are finished we call ->freeze_fs() to finish filesystem | ||
1358 | * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is | ||
1359 | * mostly auxiliary for filesystems to verify they do not modify frozen fs. | ||
1360 | * | ||
1361 | * sb->s_writers.frozen is protected by sb->s_umount. | ||
1173 | */ | 1362 | */ |
1174 | int freeze_super(struct super_block *sb) | 1363 | int freeze_super(struct super_block *sb) |
1175 | { | 1364 | { |
@@ -1177,7 +1366,7 @@ int freeze_super(struct super_block *sb) | |||
1177 | 1366 | ||
1178 | atomic_inc(&sb->s_active); | 1367 | atomic_inc(&sb->s_active); |
1179 | down_write(&sb->s_umount); | 1368 | down_write(&sb->s_umount); |
1180 | if (sb->s_frozen) { | 1369 | if (sb->s_writers.frozen != SB_UNFROZEN) { |
1181 | deactivate_locked_super(sb); | 1370 | deactivate_locked_super(sb); |
1182 | return -EBUSY; | 1371 | return -EBUSY; |
1183 | } | 1372 | } |
@@ -1188,33 +1377,53 @@ int freeze_super(struct super_block *sb) | |||
1188 | } | 1377 | } |
1189 | 1378 | ||
1190 | if (sb->s_flags & MS_RDONLY) { | 1379 | if (sb->s_flags & MS_RDONLY) { |
1191 | sb->s_frozen = SB_FREEZE_TRANS; | 1380 | /* Nothing to do really... */ |
1192 | smp_wmb(); | 1381 | sb->s_writers.frozen = SB_FREEZE_COMPLETE; |
1193 | up_write(&sb->s_umount); | 1382 | up_write(&sb->s_umount); |
1194 | return 0; | 1383 | return 0; |
1195 | } | 1384 | } |
1196 | 1385 | ||
1197 | sb->s_frozen = SB_FREEZE_WRITE; | 1386 | /* From now on, no new normal writers can start */ |
1387 | sb->s_writers.frozen = SB_FREEZE_WRITE; | ||
1388 | smp_wmb(); | ||
1389 | |||
1390 | /* Release s_umount to preserve sb_start_write -> s_umount ordering */ | ||
1391 | up_write(&sb->s_umount); | ||
1392 | |||
1393 | sb_wait_write(sb, SB_FREEZE_WRITE); | ||
1394 | |||
1395 | /* Now we go and block page faults... */ | ||
1396 | down_write(&sb->s_umount); | ||
1397 | sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; | ||
1198 | smp_wmb(); | 1398 | smp_wmb(); |
1199 | 1399 | ||
1400 | sb_wait_write(sb, SB_FREEZE_PAGEFAULT); | ||
1401 | |||
1402 | /* All writers are done so after syncing there won't be dirty data */ | ||
1200 | sync_filesystem(sb); | 1403 | sync_filesystem(sb); |
1201 | 1404 | ||
1202 | sb->s_frozen = SB_FREEZE_TRANS; | 1405 | /* Now wait for internal filesystem counter */ |
1406 | sb->s_writers.frozen = SB_FREEZE_FS; | ||
1203 | smp_wmb(); | 1407 | smp_wmb(); |
1408 | sb_wait_write(sb, SB_FREEZE_FS); | ||
1204 | 1409 | ||
1205 | sync_blockdev(sb->s_bdev); | ||
1206 | if (sb->s_op->freeze_fs) { | 1410 | if (sb->s_op->freeze_fs) { |
1207 | ret = sb->s_op->freeze_fs(sb); | 1411 | ret = sb->s_op->freeze_fs(sb); |
1208 | if (ret) { | 1412 | if (ret) { |
1209 | printk(KERN_ERR | 1413 | printk(KERN_ERR |
1210 | "VFS:Filesystem freeze failed\n"); | 1414 | "VFS:Filesystem freeze failed\n"); |
1211 | sb->s_frozen = SB_UNFROZEN; | 1415 | sb->s_writers.frozen = SB_UNFROZEN; |
1212 | smp_wmb(); | 1416 | smp_wmb(); |
1213 | wake_up(&sb->s_wait_unfrozen); | 1417 | wake_up(&sb->s_writers.wait_unfrozen); |
1214 | deactivate_locked_super(sb); | 1418 | deactivate_locked_super(sb); |
1215 | return ret; | 1419 | return ret; |
1216 | } | 1420 | } |
1217 | } | 1421 | } |
1422 | /* | ||
1423 | * This is just for debugging purposes so that fs can warn if it | ||
1424 | * sees write activity when frozen is set to SB_FREEZE_COMPLETE. | ||
1425 | */ | ||
1426 | sb->s_writers.frozen = SB_FREEZE_COMPLETE; | ||
1218 | up_write(&sb->s_umount); | 1427 | up_write(&sb->s_umount); |
1219 | return 0; | 1428 | return 0; |
1220 | } | 1429 | } |
@@ -1231,7 +1440,7 @@ int thaw_super(struct super_block *sb) | |||
1231 | int error; | 1440 | int error; |
1232 | 1441 | ||
1233 | down_write(&sb->s_umount); | 1442 | down_write(&sb->s_umount); |
1234 | if (sb->s_frozen == SB_UNFROZEN) { | 1443 | if (sb->s_writers.frozen == SB_UNFROZEN) { |
1235 | up_write(&sb->s_umount); | 1444 | up_write(&sb->s_umount); |
1236 | return -EINVAL; | 1445 | return -EINVAL; |
1237 | } | 1446 | } |
@@ -1244,16 +1453,15 @@ int thaw_super(struct super_block *sb) | |||
1244 | if (error) { | 1453 | if (error) { |
1245 | printk(KERN_ERR | 1454 | printk(KERN_ERR |
1246 | "VFS:Filesystem thaw failed\n"); | 1455 | "VFS:Filesystem thaw failed\n"); |
1247 | sb->s_frozen = SB_FREEZE_TRANS; | ||
1248 | up_write(&sb->s_umount); | 1456 | up_write(&sb->s_umount); |
1249 | return error; | 1457 | return error; |
1250 | } | 1458 | } |
1251 | } | 1459 | } |
1252 | 1460 | ||
1253 | out: | 1461 | out: |
1254 | sb->s_frozen = SB_UNFROZEN; | 1462 | sb->s_writers.frozen = SB_UNFROZEN; |
1255 | smp_wmb(); | 1463 | smp_wmb(); |
1256 | wake_up(&sb->s_wait_unfrozen); | 1464 | wake_up(&sb->s_writers.wait_unfrozen); |
1257 | deactivate_locked_super(sb); | 1465 | deactivate_locked_super(sb); |
1258 | 1466 | ||
1259 | return 0; | 1467 | return 0; |