aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorJan Kara <jack@suse.cz>2012-06-12 10:20:34 -0400
committerAl Viro <viro@zeniv.linux.org.uk>2012-07-31 01:30:13 -0400
commit5accdf82ba25cacefd6c1867f1704beb4d244cdd (patch)
tree7125b01d9bf0f23d5c5eaed0cbafa9a1cbe544d5 /fs
parentd87aae2f3c8e90bd0fe03f5309b4d066b712b8ec (diff)
fs: Improve filesystem freezing handling
vfs_check_frozen() tests are racy since the filesystem can be frozen just after the test is performed. Thus in write paths we can end up marking some pages or inodes dirty even though the file system is already frozen. This creates problems with flusher thread hanging on frozen filesystem. Another problem is that exclusion between ->page_mkwrite() and filesystem freezing has been handled by setting page dirty and then verifying s_frozen. This guaranteed that either the freezing code sees the faulted page, writes it, and writeprotects it again or we see s_frozen set and bail out of page fault. This works to protect from page being marked writeable while filesystem freezing is running but has an unpleasant artefact of leaving dirty (although unmodified and writeprotected) pages on frozen filesystem resulting in similar problems with flusher thread as the first problem. This patch aims at providing exclusion between write paths and filesystem freezing. We implement a writer-freeze read-write semaphore in the superblock. Actually, there are three such semaphores because of lock ranking reasons - one for page fault handlers (->page_mkwrite), one for all other writers, and one for internal filesystem purposes (used e.g. to track running transactions). Write paths which should block freezing (e.g. directory operations, ->aio_write(), ->page_mkwrite) hold reader side of the semaphore. Code freezing the filesystem takes the writer side. However, we don't really want to bounce cachelines of the semaphores between CPUs for each write happening. So we implement the reader side of the semaphore as a per-cpu counter and the writer side is implemented using s_writers.frozen superblock field. [AV: microoptimize sb_start_write(); we want it fast in normal case] BugLink: https://bugs.launchpad.net/bugs/897421 Tested-by: Kamal Mostafa <kamal@canonical.com> Tested-by: Peter M. 
Petrakis <peter.petrakis@canonical.com> Tested-by: Dann Frazier <dann.frazier@canonical.com> Tested-by: Massimo Morana <massimo.morana@canonical.com> Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Diffstat (limited to 'fs')
-rw-r--r--fs/super.c251
1 files changed, 230 insertions, 21 deletions
diff --git a/fs/super.c b/fs/super.c
index c743fb3be4b8..0f64ecb7b1bf 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -33,12 +33,19 @@
33#include <linux/rculist_bl.h> 33#include <linux/rculist_bl.h>
34#include <linux/cleancache.h> 34#include <linux/cleancache.h>
35#include <linux/fsnotify.h> 35#include <linux/fsnotify.h>
36#include <linux/lockdep.h>
36#include "internal.h" 37#include "internal.h"
37 38
38 39
39LIST_HEAD(super_blocks); 40LIST_HEAD(super_blocks);
40DEFINE_SPINLOCK(sb_lock); 41DEFINE_SPINLOCK(sb_lock);
41 42
43static char *sb_writers_name[SB_FREEZE_LEVELS] = {
44 "sb_writers",
45 "sb_pagefaults",
46 "sb_internal",
47};
48
42/* 49/*
43 * One thing we have to be careful of with a per-sb shrinker is that we don't 50 * One thing we have to be careful of with a per-sb shrinker is that we don't
44 * drop the last active reference to the superblock from within the shrinker. 51 * drop the last active reference to the superblock from within the shrinker.
@@ -102,6 +109,35 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
102 return total_objects; 109 return total_objects;
103} 110}
104 111
112static int init_sb_writers(struct super_block *s, struct file_system_type *type)
113{
114 int err;
115 int i;
116
117 for (i = 0; i < SB_FREEZE_LEVELS; i++) {
118 err = percpu_counter_init(&s->s_writers.counter[i], 0);
119 if (err < 0)
120 goto err_out;
121 lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
122 &type->s_writers_key[i], 0);
123 }
124 init_waitqueue_head(&s->s_writers.wait);
125 init_waitqueue_head(&s->s_writers.wait_unfrozen);
126 return 0;
127err_out:
128 while (--i >= 0)
129 percpu_counter_destroy(&s->s_writers.counter[i]);
130 return err;
131}
132
133static void destroy_sb_writers(struct super_block *s)
134{
135 int i;
136
137 for (i = 0; i < SB_FREEZE_LEVELS; i++)
138 percpu_counter_destroy(&s->s_writers.counter[i]);
139}
140
105/** 141/**
106 * alloc_super - create new superblock 142 * alloc_super - create new superblock
107 * @type: filesystem type superblock should belong to 143 * @type: filesystem type superblock should belong to
@@ -117,18 +153,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
117 153
118 if (s) { 154 if (s) {
119 if (security_sb_alloc(s)) { 155 if (security_sb_alloc(s)) {
156 /*
157 * We cannot call security_sb_free() without
158 * security_sb_alloc() succeeding. So bail out manually
159 */
120 kfree(s); 160 kfree(s);
121 s = NULL; 161 s = NULL;
122 goto out; 162 goto out;
123 } 163 }
124#ifdef CONFIG_SMP 164#ifdef CONFIG_SMP
125 s->s_files = alloc_percpu(struct list_head); 165 s->s_files = alloc_percpu(struct list_head);
126 if (!s->s_files) { 166 if (!s->s_files)
127 security_sb_free(s); 167 goto err_out;
128 kfree(s); 168 else {
129 s = NULL;
130 goto out;
131 } else {
132 int i; 169 int i;
133 170
134 for_each_possible_cpu(i) 171 for_each_possible_cpu(i)
@@ -137,6 +174,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
137#else 174#else
138 INIT_LIST_HEAD(&s->s_files); 175 INIT_LIST_HEAD(&s->s_files);
139#endif 176#endif
177 if (init_sb_writers(s, type))
178 goto err_out;
140 s->s_flags = flags; 179 s->s_flags = flags;
141 s->s_bdi = &default_backing_dev_info; 180 s->s_bdi = &default_backing_dev_info;
142 INIT_HLIST_NODE(&s->s_instances); 181 INIT_HLIST_NODE(&s->s_instances);
@@ -190,6 +229,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
190 } 229 }
191out: 230out:
192 return s; 231 return s;
232err_out:
233 security_sb_free(s);
234#ifdef CONFIG_SMP
235 if (s->s_files)
236 free_percpu(s->s_files);
237#endif
238 destroy_sb_writers(s);
239 kfree(s);
240 s = NULL;
241 goto out;
193} 242}
194 243
195/** 244/**
@@ -203,6 +252,7 @@ static inline void destroy_super(struct super_block *s)
203#ifdef CONFIG_SMP 252#ifdef CONFIG_SMP
204 free_percpu(s->s_files); 253 free_percpu(s->s_files);
205#endif 254#endif
255 destroy_sb_writers(s);
206 security_sb_free(s); 256 security_sb_free(s);
207 WARN_ON(!list_empty(&s->s_mounts)); 257 WARN_ON(!list_empty(&s->s_mounts));
208 kfree(s->s_subtype); 258 kfree(s->s_subtype);
@@ -651,10 +701,11 @@ struct super_block *get_super_thawed(struct block_device *bdev)
651{ 701{
652 while (1) { 702 while (1) {
653 struct super_block *s = get_super(bdev); 703 struct super_block *s = get_super(bdev);
654 if (!s || s->s_frozen == SB_UNFROZEN) 704 if (!s || s->s_writers.frozen == SB_UNFROZEN)
655 return s; 705 return s;
656 up_read(&s->s_umount); 706 up_read(&s->s_umount);
657 vfs_check_frozen(s, SB_FREEZE_WRITE); 707 wait_event(s->s_writers.wait_unfrozen,
708 s->s_writers.frozen == SB_UNFROZEN);
658 put_super(s); 709 put_super(s);
659 } 710 }
660} 711}
@@ -732,7 +783,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
732 int retval; 783 int retval;
733 int remount_ro; 784 int remount_ro;
734 785
735 if (sb->s_frozen != SB_UNFROZEN) 786 if (sb->s_writers.frozen != SB_UNFROZEN)
736 return -EBUSY; 787 return -EBUSY;
737 788
738#ifdef CONFIG_BLOCK 789#ifdef CONFIG_BLOCK
@@ -1163,6 +1214,120 @@ out:
1163 return ERR_PTR(error); 1214 return ERR_PTR(error);
1164} 1215}
1165 1216
1217/*
1218 * This is an internal function, please use sb_end_{write,pagefault,intwrite}
1219 * instead.
1220 */
1221void __sb_end_write(struct super_block *sb, int level)
1222{
1223 percpu_counter_dec(&sb->s_writers.counter[level-1]);
1224 /*
1225 * Make sure s_writers are updated before we wake up waiters in
1226 * freeze_super().
1227 */
1228 smp_mb();
1229 if (waitqueue_active(&sb->s_writers.wait))
1230 wake_up(&sb->s_writers.wait);
1231 rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
1232}
1233EXPORT_SYMBOL(__sb_end_write);
1234
1235#ifdef CONFIG_LOCKDEP
1236/*
1237 * We want lockdep to tell us about possible deadlocks with freezing but
1238 * it's a bit tricky to properly instrument it. Getting a freeze protection
1239 * works as getting a read lock but there are subtle problems. XFS for example
1240 * gets freeze protection on internal level twice in some cases, which is OK
1241 * only because we already hold a freeze protection also on higher level. Due
1242 * to these cases we have to tell lockdep we are doing trylock when we
1243 * already hold a freeze protection for a higher freeze level.
1244 */
1245static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
1246 unsigned long ip)
1247{
1248 int i;
1249
1250 if (!trylock) {
1251 for (i = 0; i < level - 1; i++)
1252 if (lock_is_held(&sb->s_writers.lock_map[i])) {
1253 trylock = true;
1254 break;
1255 }
1256 }
1257 rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip);
1258}
1259#endif
1260
1261/*
1262 * This is an internal function, please use sb_start_{write,pagefault,intwrite}
1263 * instead.
1264 */
1265int __sb_start_write(struct super_block *sb, int level, bool wait)
1266{
1267retry:
1268 if (unlikely(sb->s_writers.frozen >= level)) {
1269 if (!wait)
1270 return 0;
1271 wait_event(sb->s_writers.wait_unfrozen,
1272 sb->s_writers.frozen < level);
1273 }
1274
1275#ifdef CONFIG_LOCKDEP
1276 acquire_freeze_lock(sb, level, !wait, _RET_IP_);
1277#endif
1278 percpu_counter_inc(&sb->s_writers.counter[level-1]);
1279 /*
1280 * Make sure counter is updated before we check for frozen.
1281 * freeze_super() first sets frozen and then checks the counter.
1282 */
1283 smp_mb();
1284 if (unlikely(sb->s_writers.frozen >= level)) {
1285 __sb_end_write(sb, level);
1286 goto retry;
1287 }
1288 return 1;
1289}
1290EXPORT_SYMBOL(__sb_start_write);
1291
1292/**
1293 * sb_wait_write - wait until all writers to given file system finish
1294 * @sb: the super for which we wait
1295 * @level: type of writers we wait for (normal, page fault, or internal)
1296 *
1297 * This function waits until there are no writers of given type to given file
1298 * system. Caller of this function should make sure there can be no new writers
1299 * of type @level before calling this function. Otherwise this function can
1300 * livelock.
1301 */
1302static void sb_wait_write(struct super_block *sb, int level)
1303{
1304 s64 writers;
1305
1306 /*
1307 * We just cycle-through lockdep here so that it does not complain
1308 * about returning with lock to userspace
1309 */
1310 rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
1311 rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
1312
1313 do {
1314 DEFINE_WAIT(wait);
1315
1316 /*
1317 * We use a barrier in prepare_to_wait() to separate setting
1318 * of frozen and checking of the counter
1319 */
1320 prepare_to_wait(&sb->s_writers.wait, &wait,
1321 TASK_UNINTERRUPTIBLE);
1322
1323 writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
1324 if (writers)
1325 schedule();
1326
1327 finish_wait(&sb->s_writers.wait, &wait);
1328 } while (writers);
1329}
1330
1166/** 1331/**
1167 * freeze_super - lock the filesystem and force it into a consistent state 1332 * freeze_super - lock the filesystem and force it into a consistent state
1168 * @sb: the super to lock 1333 * @sb: the super to lock
@@ -1170,6 +1335,31 @@ out:
1170 * Syncs the super to make sure the filesystem is consistent and calls the fs's 1335 * Syncs the super to make sure the filesystem is consistent and calls the fs's
1171 * freeze_fs. Subsequent calls to this without first thawing the fs will return 1336 * freeze_fs. Subsequent calls to this without first thawing the fs will return
1172 * -EBUSY. 1337 * -EBUSY.
1338 *
1339 * During this function, sb->s_writers.frozen goes through these values:
1340 *
1341 * SB_UNFROZEN: File system is normal, all writes progress as usual.
1342 *
1343 * SB_FREEZE_WRITE: The file system is in the process of being frozen. New
1344 * writes should be blocked, though page faults are still allowed. We wait for
1345 * all writes to complete and then proceed to the next stage.
1346 *
1347 * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked
1348 * but internal fs threads can still modify the filesystem (although they
1349 * should not dirty new pages or inodes), writeback can run etc. After waiting
1350 * for all running page faults we sync the filesystem which will clean all
1351 * dirty pages and inodes (no new dirty pages or inodes can be created when
1352 * sync is running).
1353 *
1354 * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs
1355 * modification are blocked (e.g. XFS preallocation truncation on inode
1356 * reclaim). This is usually implemented by blocking new transactions for
1357 * filesystems that have them and need this additional guard. After all
1358 * internal writers are finished we call ->freeze_fs() to finish filesystem
1359 * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is
1360 * mostly auxiliary for filesystems to verify they do not modify frozen fs.
1361 *
1362 * sb->s_writers.frozen is protected by sb->s_umount.
1173 */ 1363 */
1174int freeze_super(struct super_block *sb) 1364int freeze_super(struct super_block *sb)
1175{ 1365{
@@ -1177,7 +1367,7 @@ int freeze_super(struct super_block *sb)
1177 1367
1178 atomic_inc(&sb->s_active); 1368 atomic_inc(&sb->s_active);
1179 down_write(&sb->s_umount); 1369 down_write(&sb->s_umount);
1180 if (sb->s_frozen) { 1370 if (sb->s_writers.frozen != SB_UNFROZEN) {
1181 deactivate_locked_super(sb); 1371 deactivate_locked_super(sb);
1182 return -EBUSY; 1372 return -EBUSY;
1183 } 1373 }
@@ -1188,33 +1378,53 @@ int freeze_super(struct super_block *sb)
1188 } 1378 }
1189 1379
1190 if (sb->s_flags & MS_RDONLY) { 1380 if (sb->s_flags & MS_RDONLY) {
1191 sb->s_frozen = SB_FREEZE_TRANS; 1381 /* Nothing to do really... */
1192 smp_wmb(); 1382 sb->s_writers.frozen = SB_FREEZE_COMPLETE;
1193 up_write(&sb->s_umount); 1383 up_write(&sb->s_umount);
1194 return 0; 1384 return 0;
1195 } 1385 }
1196 1386
1197 sb->s_frozen = SB_FREEZE_WRITE; 1387 /* From now on, no new normal writers can start */
1388 sb->s_writers.frozen = SB_FREEZE_WRITE;
1389 smp_wmb();
1390
1391 /* Release s_umount to preserve sb_start_write -> s_umount ordering */
1392 up_write(&sb->s_umount);
1393
1394 sb_wait_write(sb, SB_FREEZE_WRITE);
1395
1396 /* Now we go and block page faults... */
1397 down_write(&sb->s_umount);
1398 sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
1198 smp_wmb(); 1399 smp_wmb();
1199 1400
1401 sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
1402
1403 /* All writers are done so after syncing there won't be dirty data */
1200 sync_filesystem(sb); 1404 sync_filesystem(sb);
1201 1405
1202 sb->s_frozen = SB_FREEZE_TRANS; 1406 /* Now wait for internal filesystem counter */
1407 sb->s_writers.frozen = SB_FREEZE_FS;
1203 smp_wmb(); 1408 smp_wmb();
1409 sb_wait_write(sb, SB_FREEZE_FS);
1204 1410
1205 sync_blockdev(sb->s_bdev);
1206 if (sb->s_op->freeze_fs) { 1411 if (sb->s_op->freeze_fs) {
1207 ret = sb->s_op->freeze_fs(sb); 1412 ret = sb->s_op->freeze_fs(sb);
1208 if (ret) { 1413 if (ret) {
1209 printk(KERN_ERR 1414 printk(KERN_ERR
1210 "VFS:Filesystem freeze failed\n"); 1415 "VFS:Filesystem freeze failed\n");
1211 sb->s_frozen = SB_UNFROZEN; 1416 sb->s_writers.frozen = SB_UNFROZEN;
1212 smp_wmb(); 1417 smp_wmb();
1213 wake_up(&sb->s_wait_unfrozen); 1418 wake_up(&sb->s_writers.wait_unfrozen);
1214 deactivate_locked_super(sb); 1419 deactivate_locked_super(sb);
1215 return ret; 1420 return ret;
1216 } 1421 }
1217 } 1422 }
1423 /*
1424 * This is just for debugging purposes so that fs can warn if it
1425 * sees write activity when frozen is set to SB_FREEZE_COMPLETE.
1426 */
1427 sb->s_writers.frozen = SB_FREEZE_COMPLETE;
1218 up_write(&sb->s_umount); 1428 up_write(&sb->s_umount);
1219 return 0; 1429 return 0;
1220} 1430}
@@ -1231,7 +1441,7 @@ int thaw_super(struct super_block *sb)
1231 int error; 1441 int error;
1232 1442
1233 down_write(&sb->s_umount); 1443 down_write(&sb->s_umount);
1234 if (sb->s_frozen == SB_UNFROZEN) { 1444 if (sb->s_writers.frozen == SB_UNFROZEN) {
1235 up_write(&sb->s_umount); 1445 up_write(&sb->s_umount);
1236 return -EINVAL; 1446 return -EINVAL;
1237 } 1447 }
@@ -1244,16 +1454,15 @@ int thaw_super(struct super_block *sb)
1244 if (error) { 1454 if (error) {
1245 printk(KERN_ERR 1455 printk(KERN_ERR
1246 "VFS:Filesystem thaw failed\n"); 1456 "VFS:Filesystem thaw failed\n");
1247 sb->s_frozen = SB_FREEZE_TRANS;
1248 up_write(&sb->s_umount); 1457 up_write(&sb->s_umount);
1249 return error; 1458 return error;
1250 } 1459 }
1251 } 1460 }
1252 1461
1253out: 1462out:
1254 sb->s_frozen = SB_UNFROZEN; 1463 sb->s_writers.frozen = SB_UNFROZEN;
1255 smp_wmb(); 1464 smp_wmb();
1256 wake_up(&sb->s_wait_unfrozen); 1465 wake_up(&sb->s_writers.wait_unfrozen);
1257 deactivate_locked_super(sb); 1466 deactivate_locked_super(sb);
1258 1467
1259 return 0; 1468 return 0;