aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Jan Kara <jack@suse.cz> 2012-06-12 10:20:34 -0400
committer Al Viro <viro@zeniv.linux.org.uk> 2012-07-31 01:30:13 -0400
commit 5accdf82ba25cacefd6c1867f1704beb4d244cdd (patch)
tree 7125b01d9bf0f23d5c5eaed0cbafa9a1cbe544d5
parent d87aae2f3c8e90bd0fe03f5309b4d066b712b8ec (diff)
fs: Improve filesystem freezing handling
vfs_check_frozen() tests are racy since the filesystem can be frozen just after the test is performed. Thus in write paths we can end up marking some pages or inodes dirty even though the file system is already frozen. This creates problems with flusher thread hanging on frozen filesystem. Another problem is that exclusion between ->page_mkwrite() and filesystem freezing has been handled by setting page dirty and then verifying s_frozen. This guaranteed that either the freezing code sees the faulted page, writes it, and writeprotects it again or we see s_frozen set and bail out of page fault. This works to protect from page being marked writeable while filesystem freezing is running but has an unpleasant artefact of leaving dirty (although unmodified and writeprotected) pages on frozen filesystem resulting in similar problems with flusher thread as the first problem. This patch aims at providing exclusion between write paths and filesystem freezing. We implement a writer-freeze read-write semaphore in the superblock. Actually, there are three such semaphores because of lock ranking reasons - one for page fault handlers (->page_mkwrite), one for all other writers, and one for internal filesystem purposes (used e.g. to track running transactions). Write paths which should block freezing (e.g. directory operations, ->aio_write(), ->page_mkwrite) hold reader side of the semaphore. Code freezing the filesystem takes the writer side. However, we don't really want to bounce cachelines of the semaphores between CPUs for each write happening. So we implement the reader side of the semaphore as a per-cpu counter and the writer side is implemented using s_writers.frozen superblock field. [AV: microoptimize sb_start_write(); we want it fast in normal case] BugLink: https://bugs.launchpad.net/bugs/897421 Tested-by: Kamal Mostafa <kamal@canonical.com> Tested-by: Peter M. 
Petrakis <peter.petrakis@canonical.com> Tested-by: Dann Frazier <dann.frazier@canonical.com> Tested-by: Massimo Morana <massimo.morana@canonical.com> Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
-rw-r--r--fs/super.c251
-rw-r--r--include/linux/fs.h150
2 files changed, 373 insertions, 28 deletions
diff --git a/fs/super.c b/fs/super.c
index c743fb3be4b8..0f64ecb7b1bf 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -33,12 +33,19 @@
33#include <linux/rculist_bl.h> 33#include <linux/rculist_bl.h>
34#include <linux/cleancache.h> 34#include <linux/cleancache.h>
35#include <linux/fsnotify.h> 35#include <linux/fsnotify.h>
36#include <linux/lockdep.h>
36#include "internal.h" 37#include "internal.h"
37 38
38 39
39LIST_HEAD(super_blocks); 40LIST_HEAD(super_blocks);
40DEFINE_SPINLOCK(sb_lock); 41DEFINE_SPINLOCK(sb_lock);
41 42
43static char *sb_writers_name[SB_FREEZE_LEVELS] = {
44 "sb_writers",
45 "sb_pagefaults",
46 "sb_internal",
47};
48
42/* 49/*
43 * One thing we have to be careful of with a per-sb shrinker is that we don't 50 * One thing we have to be careful of with a per-sb shrinker is that we don't
44 * drop the last active reference to the superblock from within the shrinker. 51 * drop the last active reference to the superblock from within the shrinker.
@@ -102,6 +109,35 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
102 return total_objects; 109 return total_objects;
103} 110}
104 111
112static int init_sb_writers(struct super_block *s, struct file_system_type *type)
113{
114 int err;
115 int i;
116
117 for (i = 0; i < SB_FREEZE_LEVELS; i++) {
118 err = percpu_counter_init(&s->s_writers.counter[i], 0);
119 if (err < 0)
120 goto err_out;
121 lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
122 &type->s_writers_key[i], 0);
123 }
124 init_waitqueue_head(&s->s_writers.wait);
125 init_waitqueue_head(&s->s_writers.wait_unfrozen);
126 return 0;
127err_out:
128 while (--i >= 0)
129 percpu_counter_destroy(&s->s_writers.counter[i]);
130 return err;
131}
132
133static void destroy_sb_writers(struct super_block *s)
134{
135 int i;
136
137 for (i = 0; i < SB_FREEZE_LEVELS; i++)
138 percpu_counter_destroy(&s->s_writers.counter[i]);
139}
140
105/** 141/**
106 * alloc_super - create new superblock 142 * alloc_super - create new superblock
107 * @type: filesystem type superblock should belong to 143 * @type: filesystem type superblock should belong to
@@ -117,18 +153,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
117 153
118 if (s) { 154 if (s) {
119 if (security_sb_alloc(s)) { 155 if (security_sb_alloc(s)) {
156 /*
157 * We cannot call security_sb_free() without
158 * security_sb_alloc() succeeding. So bail out manually
159 */
120 kfree(s); 160 kfree(s);
121 s = NULL; 161 s = NULL;
122 goto out; 162 goto out;
123 } 163 }
124#ifdef CONFIG_SMP 164#ifdef CONFIG_SMP
125 s->s_files = alloc_percpu(struct list_head); 165 s->s_files = alloc_percpu(struct list_head);
126 if (!s->s_files) { 166 if (!s->s_files)
127 security_sb_free(s); 167 goto err_out;
128 kfree(s); 168 else {
129 s = NULL;
130 goto out;
131 } else {
132 int i; 169 int i;
133 170
134 for_each_possible_cpu(i) 171 for_each_possible_cpu(i)
@@ -137,6 +174,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
137#else 174#else
138 INIT_LIST_HEAD(&s->s_files); 175 INIT_LIST_HEAD(&s->s_files);
139#endif 176#endif
177 if (init_sb_writers(s, type))
178 goto err_out;
140 s->s_flags = flags; 179 s->s_flags = flags;
141 s->s_bdi = &default_backing_dev_info; 180 s->s_bdi = &default_backing_dev_info;
142 INIT_HLIST_NODE(&s->s_instances); 181 INIT_HLIST_NODE(&s->s_instances);
@@ -190,6 +229,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
190 } 229 }
191out: 230out:
192 return s; 231 return s;
232err_out:
233 security_sb_free(s);
234#ifdef CONFIG_SMP
235 if (s->s_files)
236 free_percpu(s->s_files);
237#endif
238 destroy_sb_writers(s);
239 kfree(s);
240 s = NULL;
241 goto out;
193} 242}
194 243
195/** 244/**
@@ -203,6 +252,7 @@ static inline void destroy_super(struct super_block *s)
203#ifdef CONFIG_SMP 252#ifdef CONFIG_SMP
204 free_percpu(s->s_files); 253 free_percpu(s->s_files);
205#endif 254#endif
255 destroy_sb_writers(s);
206 security_sb_free(s); 256 security_sb_free(s);
207 WARN_ON(!list_empty(&s->s_mounts)); 257 WARN_ON(!list_empty(&s->s_mounts));
208 kfree(s->s_subtype); 258 kfree(s->s_subtype);
@@ -651,10 +701,11 @@ struct super_block *get_super_thawed(struct block_device *bdev)
651{ 701{
652 while (1) { 702 while (1) {
653 struct super_block *s = get_super(bdev); 703 struct super_block *s = get_super(bdev);
654 if (!s || s->s_frozen == SB_UNFROZEN) 704 if (!s || s->s_writers.frozen == SB_UNFROZEN)
655 return s; 705 return s;
656 up_read(&s->s_umount); 706 up_read(&s->s_umount);
657 vfs_check_frozen(s, SB_FREEZE_WRITE); 707 wait_event(s->s_writers.wait_unfrozen,
708 s->s_writers.frozen == SB_UNFROZEN);
658 put_super(s); 709 put_super(s);
659 } 710 }
660} 711}
@@ -732,7 +783,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
732 int retval; 783 int retval;
733 int remount_ro; 784 int remount_ro;
734 785
735 if (sb->s_frozen != SB_UNFROZEN) 786 if (sb->s_writers.frozen != SB_UNFROZEN)
736 return -EBUSY; 787 return -EBUSY;
737 788
738#ifdef CONFIG_BLOCK 789#ifdef CONFIG_BLOCK
@@ -1163,6 +1214,120 @@ out:
1163 return ERR_PTR(error); 1214 return ERR_PTR(error);
1164} 1215}
1165 1216
1217/*
1218 * This is an internal function, please use sb_end_{write,pagefault,intwrite}
1219 * instead.
1220 */
1221void __sb_end_write(struct super_block *sb, int level)
1222{
1223 percpu_counter_dec(&sb->s_writers.counter[level-1]);
1224 /*
1225 * Make sure s_writers are updated before we wake up waiters in
1226 * freeze_super().
1227 */
1228 smp_mb();
1229 if (waitqueue_active(&sb->s_writers.wait))
1230 wake_up(&sb->s_writers.wait);
1231 rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
1232}
1233EXPORT_SYMBOL(__sb_end_write);
1234
1235#ifdef CONFIG_LOCKDEP
1236/*
1237 * We want lockdep to tell us about possible deadlocks with freezing but
1238 * it's a bit tricky to properly instrument it. Getting a freeze protection
1239 * works as getting a read lock but there are subtle problems. XFS for example
1240 * gets freeze protection on internal level twice in some cases, which is OK
1241 * only because we already hold a freeze protection also on higher level. Due
1242 * to these cases we have to tell lockdep we are doing trylock when we
1243 * already hold a freeze protection for a higher freeze level.
1244 */
1245static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
1246 unsigned long ip)
1247{
1248 int i;
1249
1250 if (!trylock) {
1251 for (i = 0; i < level - 1; i++)
1252 if (lock_is_held(&sb->s_writers.lock_map[i])) {
1253 trylock = true;
1254 break;
1255 }
1256 }
1257 rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip);
1258}
1259#endif
1260
1261/*
1262 * This is an internal function, please use sb_start_{write,pagefault,intwrite}
1263 * instead.
1264 */
1265int __sb_start_write(struct super_block *sb, int level, bool wait)
1266{
1267retry:
1268 if (unlikely(sb->s_writers.frozen >= level)) {
1269 if (!wait)
1270 return 0;
1271 wait_event(sb->s_writers.wait_unfrozen,
1272 sb->s_writers.frozen < level);
1273 }
1274
1275#ifdef CONFIG_LOCKDEP
1276 acquire_freeze_lock(sb, level, !wait, _RET_IP_);
1277#endif
1278 percpu_counter_inc(&sb->s_writers.counter[level-1]);
1279 /*
1280 * Make sure counter is updated before we check for frozen.
1281 * freeze_super() first sets frozen and then checks the counter.
1282 */
1283 smp_mb();
1284 if (unlikely(sb->s_writers.frozen >= level)) {
1285 __sb_end_write(sb, level);
1286 goto retry;
1287 }
1288 return 1;
1289}
1290EXPORT_SYMBOL(__sb_start_write);
1291
1292/**
1293 * sb_wait_write - wait until all writers to given file system finish
1294 * @sb: the super for which we wait
1295 * @level: type of writers we wait for (normal, page fault, or fs-internal)
1296 *
1297 * This function waits until there are no writers of given type to given file
1298 * system. Caller of this function should make sure there can be no new writers
1299 * of type @level before calling this function. Otherwise this function can
1300 * livelock.
1301 */
1302static void sb_wait_write(struct super_block *sb, int level)
1303{
1304 s64 writers;
1305
1306 /*
1307 * We just cycle-through lockdep here so that it does not complain
1308 * about returning with lock to userspace
1309 */
1310 rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
1311 rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
1312
1313 do {
1314 DEFINE_WAIT(wait);
1315
1316 /*
1317 * We use a barrier in prepare_to_wait() to separate setting
1318 * of frozen and checking of the counter
1319 */
1320 prepare_to_wait(&sb->s_writers.wait, &wait,
1321 TASK_UNINTERRUPTIBLE);
1322
1323 writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
1324 if (writers)
1325 schedule();
1326
1327 finish_wait(&sb->s_writers.wait, &wait);
1328 } while (writers);
1329}
1330
1166/** 1331/**
1167 * freeze_super - lock the filesystem and force it into a consistent state 1332 * freeze_super - lock the filesystem and force it into a consistent state
1168 * @sb: the super to lock 1333 * @sb: the super to lock
@@ -1170,6 +1335,31 @@ out:
1170 * Syncs the super to make sure the filesystem is consistent and calls the fs's 1335 * Syncs the super to make sure the filesystem is consistent and calls the fs's
1171 * freeze_fs. Subsequent calls to this without first thawing the fs will return 1336 * freeze_fs. Subsequent calls to this without first thawing the fs will return
1172 * -EBUSY. 1337 * -EBUSY.
1338 *
1339 * During this function, sb->s_writers.frozen goes through these values:
1340 *
1341 * SB_UNFROZEN: File system is normal, all writes progress as usual.
1342 *
1343 * SB_FREEZE_WRITE: The file system is in the process of being frozen. New
1344 * writes should be blocked, though page faults are still allowed. We wait for
1345 * all writes to complete and then proceed to the next stage.
1346 *
1347 * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked
1348 * but internal fs threads can still modify the filesystem (although they
1349 * should not dirty new pages or inodes), writeback can run etc. After waiting
1350 * for all running page faults we sync the filesystem which will clean all
1351 * dirty pages and inodes (no new dirty pages or inodes can be created when
1352 * sync is running).
1353 *
1354 * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs
1355 * modification are blocked (e.g. XFS preallocation truncation on inode
1356 * reclaim). This is usually implemented by blocking new transactions for
1357 * filesystems that have them and need this additional guard. After all
1358 * internal writers are finished we call ->freeze_fs() to finish filesystem
1359 * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is
1360 * mostly auxiliary for filesystems to verify they do not modify frozen fs.
1361 *
1362 * sb->s_writers.frozen is protected by sb->s_umount.
1173 */ 1363 */
1174int freeze_super(struct super_block *sb) 1364int freeze_super(struct super_block *sb)
1175{ 1365{
@@ -1177,7 +1367,7 @@ int freeze_super(struct super_block *sb)
1177 1367
1178 atomic_inc(&sb->s_active); 1368 atomic_inc(&sb->s_active);
1179 down_write(&sb->s_umount); 1369 down_write(&sb->s_umount);
1180 if (sb->s_frozen) { 1370 if (sb->s_writers.frozen != SB_UNFROZEN) {
1181 deactivate_locked_super(sb); 1371 deactivate_locked_super(sb);
1182 return -EBUSY; 1372 return -EBUSY;
1183 } 1373 }
@@ -1188,33 +1378,53 @@ int freeze_super(struct super_block *sb)
1188 } 1378 }
1189 1379
1190 if (sb->s_flags & MS_RDONLY) { 1380 if (sb->s_flags & MS_RDONLY) {
1191 sb->s_frozen = SB_FREEZE_TRANS; 1381 /* Nothing to do really... */
1192 smp_wmb(); 1382 sb->s_writers.frozen = SB_FREEZE_COMPLETE;
1193 up_write(&sb->s_umount); 1383 up_write(&sb->s_umount);
1194 return 0; 1384 return 0;
1195 } 1385 }
1196 1386
1197 sb->s_frozen = SB_FREEZE_WRITE; 1387 /* From now on, no new normal writers can start */
1388 sb->s_writers.frozen = SB_FREEZE_WRITE;
1389 smp_wmb();
1390
1391 /* Release s_umount to preserve sb_start_write -> s_umount ordering */
1392 up_write(&sb->s_umount);
1393
1394 sb_wait_write(sb, SB_FREEZE_WRITE);
1395
1396 /* Now we go and block page faults... */
1397 down_write(&sb->s_umount);
1398 sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
1198 smp_wmb(); 1399 smp_wmb();
1199 1400
1401 sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
1402
1403 /* All writers are done so after syncing there won't be dirty data */
1200 sync_filesystem(sb); 1404 sync_filesystem(sb);
1201 1405
1202 sb->s_frozen = SB_FREEZE_TRANS; 1406 /* Now wait for internal filesystem counter */
1407 sb->s_writers.frozen = SB_FREEZE_FS;
1203 smp_wmb(); 1408 smp_wmb();
1409 sb_wait_write(sb, SB_FREEZE_FS);
1204 1410
1205 sync_blockdev(sb->s_bdev);
1206 if (sb->s_op->freeze_fs) { 1411 if (sb->s_op->freeze_fs) {
1207 ret = sb->s_op->freeze_fs(sb); 1412 ret = sb->s_op->freeze_fs(sb);
1208 if (ret) { 1413 if (ret) {
1209 printk(KERN_ERR 1414 printk(KERN_ERR
1210 "VFS:Filesystem freeze failed\n"); 1415 "VFS:Filesystem freeze failed\n");
1211 sb->s_frozen = SB_UNFROZEN; 1416 sb->s_writers.frozen = SB_UNFROZEN;
1212 smp_wmb(); 1417 smp_wmb();
1213 wake_up(&sb->s_wait_unfrozen); 1418 wake_up(&sb->s_writers.wait_unfrozen);
1214 deactivate_locked_super(sb); 1419 deactivate_locked_super(sb);
1215 return ret; 1420 return ret;
1216 } 1421 }
1217 } 1422 }
1423 /*
1424 * This is just for debugging purposes so that fs can warn if it
1425 * sees write activity when frozen is set to SB_FREEZE_COMPLETE.
1426 */
1427 sb->s_writers.frozen = SB_FREEZE_COMPLETE;
1218 up_write(&sb->s_umount); 1428 up_write(&sb->s_umount);
1219 return 0; 1429 return 0;
1220} 1430}
@@ -1231,7 +1441,7 @@ int thaw_super(struct super_block *sb)
1231 int error; 1441 int error;
1232 1442
1233 down_write(&sb->s_umount); 1443 down_write(&sb->s_umount);
1234 if (sb->s_frozen == SB_UNFROZEN) { 1444 if (sb->s_writers.frozen == SB_UNFROZEN) {
1235 up_write(&sb->s_umount); 1445 up_write(&sb->s_umount);
1236 return -EINVAL; 1446 return -EINVAL;
1237 } 1447 }
@@ -1244,16 +1454,15 @@ int thaw_super(struct super_block *sb)
1244 if (error) { 1454 if (error) {
1245 printk(KERN_ERR 1455 printk(KERN_ERR
1246 "VFS:Filesystem thaw failed\n"); 1456 "VFS:Filesystem thaw failed\n");
1247 sb->s_frozen = SB_FREEZE_TRANS;
1248 up_write(&sb->s_umount); 1457 up_write(&sb->s_umount);
1249 return error; 1458 return error;
1250 } 1459 }
1251 } 1460 }
1252 1461
1253out: 1462out:
1254 sb->s_frozen = SB_UNFROZEN; 1463 sb->s_writers.frozen = SB_UNFROZEN;
1255 smp_wmb(); 1464 smp_wmb();
1256 wake_up(&sb->s_wait_unfrozen); 1465 wake_up(&sb->s_writers.wait_unfrozen);
1257 deactivate_locked_super(sb); 1466 deactivate_locked_super(sb);
1258 1467
1259 return 0; 1468 return 0;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 80c819cbe272..aefed9426b03 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -412,6 +412,7 @@ struct inodes_stat_t {
412#include <linux/shrinker.h> 412#include <linux/shrinker.h>
413#include <linux/migrate_mode.h> 413#include <linux/migrate_mode.h>
414#include <linux/uidgid.h> 414#include <linux/uidgid.h>
415#include <linux/lockdep.h>
415 416
416#include <asm/byteorder.h> 417#include <asm/byteorder.h>
417 418
@@ -1439,6 +1440,8 @@ extern void f_delown(struct file *filp);
1439extern pid_t f_getown(struct file *filp); 1440extern pid_t f_getown(struct file *filp);
1440extern int send_sigurg(struct fown_struct *fown); 1441extern int send_sigurg(struct fown_struct *fown);
1441 1442
1443struct mm_struct;
1444
1442/* 1445/*
1443 * Umount options 1446 * Umount options
1444 */ 1447 */
@@ -1452,6 +1455,32 @@ extern int send_sigurg(struct fown_struct *fown);
1452extern struct list_head super_blocks; 1455extern struct list_head super_blocks;
1453extern spinlock_t sb_lock; 1456extern spinlock_t sb_lock;
1454 1457
1458/* Possible states of 'frozen' field */
1459enum {
1460 SB_UNFROZEN = 0, /* FS is unfrozen */
1461 SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */
1462 SB_FREEZE_TRANS = 2,
1463 SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */
1464 SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop
1465 * internal threads if needed) */
1466 SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */
1467};
1468
1469#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)
1470
1471struct sb_writers {
1472 /* Counters for counting writers at each level */
1473 struct percpu_counter counter[SB_FREEZE_LEVELS];
1474 wait_queue_head_t wait; /* queue for waiting for
1475 writers / faults to finish */
1476 int frozen; /* Is sb frozen? */
1477 wait_queue_head_t wait_unfrozen; /* queue for waiting for
1478 sb to be thawed */
1479#ifdef CONFIG_DEBUG_LOCK_ALLOC
1480 struct lockdep_map lock_map[SB_FREEZE_LEVELS];
1481#endif
1482};
1483
1455struct super_block { 1484struct super_block {
1456 struct list_head s_list; /* Keep this first */ 1485 struct list_head s_list; /* Keep this first */
1457 dev_t s_dev; /* search index; _not_ kdev_t */ 1486 dev_t s_dev; /* search index; _not_ kdev_t */
@@ -1501,6 +1530,7 @@ struct super_block {
1501 1530
1502 int s_frozen; 1531 int s_frozen;
1503 wait_queue_head_t s_wait_unfrozen; 1532 wait_queue_head_t s_wait_unfrozen;
1533 struct sb_writers s_writers;
1504 1534
1505 char s_id[32]; /* Informational name */ 1535 char s_id[32]; /* Informational name */
1506 u8 s_uuid[16]; /* UUID */ 1536 u8 s_uuid[16]; /* UUID */
@@ -1555,14 +1585,119 @@ extern struct timespec current_fs_time(struct super_block *sb);
1555/* 1585/*
1556 * Snapshotting support. 1586 * Snapshotting support.
1557 */ 1587 */
1558enum { 1588/* Will go away when all users are converted */
1559 SB_UNFROZEN = 0, 1589#define vfs_check_frozen(sb, level) do { } while (0)
1560 SB_FREEZE_WRITE = 1, 1590
1561 SB_FREEZE_TRANS = 2, 1591void __sb_end_write(struct super_block *sb, int level);
1562}; 1592int __sb_start_write(struct super_block *sb, int level, bool wait);
1593
1594/**
1595 * sb_end_write - drop write access to a superblock
1596 * @sb: the super we wrote to
1597 *
1598 * Decrement number of writers to the filesystem. Wake up possible waiters
1599 * wanting to freeze the filesystem.
1600 */
1601static inline void sb_end_write(struct super_block *sb)
1602{
1603 __sb_end_write(sb, SB_FREEZE_WRITE);
1604}
1605
1606/**
1607 * sb_end_pagefault - drop write access to a superblock from a page fault
1608 * @sb: the super we wrote to
1609 *
1610 * Decrement number of processes handling write page fault to the filesystem.
1611 * Wake up possible waiters wanting to freeze the filesystem.
1612 */
1613static inline void sb_end_pagefault(struct super_block *sb)
1614{
1615 __sb_end_write(sb, SB_FREEZE_PAGEFAULT);
1616}
1617
1618/**
1619 * sb_end_intwrite - drop write access to a superblock for internal fs purposes
1620 * @sb: the super we wrote to
1621 *
1622 * Decrement fs-internal number of writers to the filesystem. Wake up possible
1623 * waiters wanting to freeze the filesystem.
1624 */
1625static inline void sb_end_intwrite(struct super_block *sb)
1626{
1627 __sb_end_write(sb, SB_FREEZE_FS);
1628}
1629
1630/**
1631 * sb_start_write - get write access to a superblock
1632 * @sb: the super we write to
1633 *
1634 * When a process wants to write data or metadata to a file system (i.e. dirty
1635 * a page or an inode), it should embed the operation in a sb_start_write() -
1636 * sb_end_write() pair to get exclusion against file system freezing. This
1637 * function increments number of writers preventing freezing. If the file
1638 * system is already frozen, the function waits until the file system is
1639 * thawed.
1640 *
1641 * Since freeze protection behaves as a lock, users have to preserve
1642 * ordering of freeze protection and other filesystem locks. Generally,
1643 * freeze protection should be the outermost lock. In particular, we have:
1644 *
1645 * sb_start_write
1646 * -> i_mutex (write path, truncate, directory ops, ...)
1647 * -> s_umount (freeze_super, thaw_super)
1648 */
1649static inline void sb_start_write(struct super_block *sb)
1650{
1651 __sb_start_write(sb, SB_FREEZE_WRITE, true);
1652}
1653
1654static inline int sb_start_write_trylock(struct super_block *sb)
1655{
1656 return __sb_start_write(sb, SB_FREEZE_WRITE, false);
1657}
1658
1659/**
1660 * sb_start_pagefault - get write access to a superblock from a page fault
1661 * @sb: the super we write to
1662 *
1663 * When a process starts handling write page fault, it should embed the
1664 * operation into sb_start_pagefault() - sb_end_pagefault() pair to get
1665 * exclusion against file system freezing. This is needed since the page fault
1666 * is going to dirty a page. This function increments number of running page
1667 * faults preventing freezing. If the file system is already frozen, the
1668 * function waits until the file system is thawed.
1669 *
1670 * Since page fault freeze protection behaves as a lock, users have to preserve
1671 * ordering of freeze protection and other filesystem locks. It is advised to
1672 * put sb_start_pagefault() close to mmap_sem in lock ordering. Page fault
1673 * handling code implies lock dependency:
1674 *
1675 * mmap_sem
1676 * -> sb_start_pagefault
1677 */
1678static inline void sb_start_pagefault(struct super_block *sb)
1679{
1680 __sb_start_write(sb, SB_FREEZE_PAGEFAULT, true);
1681}
1682
1683/**
1684 * sb_start_intwrite - get write access to a superblock for internal fs purposes
1685 * @sb: the super we write to
1686 *
1687 * This is the third level of protection against filesystem freezing. It is
1688 * free for use by a filesystem. The only requirement is that it must rank
1689 * below sb_start_pagefault.
1690 *
1691 * For example filesystem can call sb_start_intwrite() when starting a
1692 * transaction which somewhat eases handling of freezing for internal sources
1693 * of filesystem changes (internal fs threads, discarding preallocation on file
1694 * close, etc.).
1695 */
1696static inline void sb_start_intwrite(struct super_block *sb)
1697{
1698 __sb_start_write(sb, SB_FREEZE_FS, true);
1699}
1563 1700
1564#define vfs_check_frozen(sb, level) \
1565 wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))
1566 1701
1567extern bool inode_owner_or_capable(const struct inode *inode); 1702extern bool inode_owner_or_capable(const struct inode *inode);
1568 1703
@@ -1886,6 +2021,7 @@ struct file_system_type {
1886 struct lock_class_key s_lock_key; 2021 struct lock_class_key s_lock_key;
1887 struct lock_class_key s_umount_key; 2022 struct lock_class_key s_umount_key;
1888 struct lock_class_key s_vfs_rename_key; 2023 struct lock_class_key s_vfs_rename_key;
2024 struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];
1889 2025
1890 struct lock_class_key i_lock_key; 2026 struct lock_class_key i_lock_key;
1891 struct lock_class_key i_mutex_key; 2027 struct lock_class_key i_mutex_key;