about | summary | refs | log | tree | commit | diff | stats
path: root/fs
diff options
context:
space:
mode:
author: Jan Kara <jack@suse.cz>, 2009-04-27 10:43:51 -0400
committer: Al Viro <viro@zeniv.linux.org.uk>, 2009-06-11 21:36:03 -0400
commit: 5cee5815d1564bbbd505fea86f4550f1efdb5cd0 (patch)
tree: ba99c38932dc534bf56f0d7dcfeeca153f50b007 /fs
parent: 429479f031322a0cc5c921ffb2321a51718dc875 (diff)
vfs: Make sys_sync() use fsync_super() (version 4)
It is unnecessarily fragile to have two places (fsync_super() and do_sync()) doing a data integrity sync of the filesystem. Alter __fsync_super() to accommodate the needs of both callers and use it. So after this patch __fsync_super() is the only place where we gather all the calls needed to properly send all data on a filesystem to disk.

A nice bonus is that we get complete livelock avoidance, and write_supers() is now only used for periodic writeback of superblocks.

sync_blockdevs(), introduced a couple of patches ago, is gone now.

[build fixes folded]

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Diffstat (limited to 'fs')
-rw-r--r-- fs/block_dev.c | 15
-rw-r--r-- fs/fs-writeback.c | 49
-rw-r--r-- fs/internal.h | 16
-rw-r--r-- fs/super.c | 72
-rw-r--r-- fs/sync.c | 31
5 files changed, 50 insertions, 133 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index fe47f7227618..4b6a3b9d01ef 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -176,17 +176,22 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
176 iov, offset, nr_segs, blkdev_get_blocks, NULL); 176 iov, offset, nr_segs, blkdev_get_blocks, NULL);
177} 177}
178 178
179int __sync_blockdev(struct block_device *bdev, int wait)
180{
181 if (!bdev)
182 return 0;
183 if (!wait)
184 return filemap_flush(bdev->bd_inode->i_mapping);
185 return filemap_write_and_wait(bdev->bd_inode->i_mapping);
186}
187
179/* 188/*
180 * Write out and wait upon all the dirty data associated with a block 189 * Write out and wait upon all the dirty data associated with a block
181 * device via its mapping. Does not take the superblock lock. 190 * device via its mapping. Does not take the superblock lock.
182 */ 191 */
183int sync_blockdev(struct block_device *bdev) 192int sync_blockdev(struct block_device *bdev)
184{ 193{
185 int ret = 0; 194 return __sync_blockdev(bdev, 1);
186
187 if (bdev)
188 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
189 return ret;
190} 195}
191EXPORT_SYMBOL(sync_blockdev); 196EXPORT_SYMBOL(sync_blockdev);
192 197
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 91013ff7dd53..e0fb2e789598 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -679,55 +679,6 @@ void sync_inodes_sb(struct super_block *sb, int wait)
679} 679}
680 680
681/** 681/**
682 * sync_inodes - writes all inodes to disk
683 * @wait: wait for completion
684 *
685 * sync_inodes() goes through each super block's dirty inode list, writes the
686 * inodes out, waits on the writeout and puts the inodes back on the normal
687 * list.
688 *
689 * This is for sys_sync(). fsync_dev() uses the same algorithm. The subtle
690 * part of the sync functions is that the blockdev "superblock" is processed
691 * last. This is because the write_inode() function of a typical fs will
692 * perform no I/O, but will mark buffers in the blockdev mapping as dirty.
693 * What we want to do is to perform all that dirtying first, and then write
694 * back all those inode blocks via the blockdev mapping in one sweep. So the
695 * additional (somewhat redundant) sync_blockdev() calls here are to make
696 * sure that really happens. Because if we call sync_inodes_sb(wait=1) with
697 * outstanding dirty inodes, the writeback goes block-at-a-time within the
698 * filesystem's write_inode(). This is extremely slow.
699 */
700static void __sync_inodes(int wait)
701{
702 struct super_block *sb;
703
704 spin_lock(&sb_lock);
705restart:
706 list_for_each_entry(sb, &super_blocks, s_list) {
707 sb->s_count++;
708 spin_unlock(&sb_lock);
709 down_read(&sb->s_umount);
710 if (sb->s_root) {
711 sync_inodes_sb(sb, wait);
712 sync_blockdev(sb->s_bdev);
713 }
714 up_read(&sb->s_umount);
715 spin_lock(&sb_lock);
716 if (__put_super_and_need_restart(sb))
717 goto restart;
718 }
719 spin_unlock(&sb_lock);
720}
721
722void sync_inodes(int wait)
723{
724 __sync_inodes(0);
725
726 if (wait)
727 __sync_inodes(1);
728}
729
730/**
731 * write_inode_now - write an inode to disk 682 * write_inode_now - write an inode to disk
732 * @inode: inode to write to disk 683 * @inode: inode to write to disk
733 * @sync: whether the write should be synchronous or not 684 * @sync: whether the write should be synchronous or not
diff --git a/fs/internal.h b/fs/internal.h
index 343a537ab809..dbec3cc28338 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -25,6 +25,8 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
25 return sb == blockdev_superblock; 25 return sb == blockdev_superblock;
26} 26}
27 27
28extern int __sync_blockdev(struct block_device *bdev, int wait);
29
28#else 30#else
29static inline void bdev_cache_init(void) 31static inline void bdev_cache_init(void)
30{ 32{
@@ -34,6 +36,11 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
34{ 36{
35 return 0; 37 return 0;
36} 38}
39
40static inline int __sync_blockdev(struct block_device *bdev, int wait)
41{
42 return 0;
43}
37#endif 44#endif
38 45
39/* 46/*
@@ -71,12 +78,3 @@ extern void chroot_fs_refs(struct path *, struct path *);
71 * file_table.c 78 * file_table.c
72 */ 79 */
73extern void mark_files_ro(struct super_block *); 80extern void mark_files_ro(struct super_block *);
74
75/*
76 * super.c
77 */
78#ifdef CONFIG_BLOCK
79extern void sync_blockdevs(void);
80#else
81static inline void sync_blockdevs(void) { }
82#endif
diff --git a/fs/super.c b/fs/super.c
index 8dbe1ead9ddd..c8ce5ed04249 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -284,23 +284,23 @@ EXPORT_SYMBOL(lock_super);
284EXPORT_SYMBOL(unlock_super); 284EXPORT_SYMBOL(unlock_super);
285 285
286/* 286/*
287 * Write out and wait upon all dirty data associated with this 287 * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0)
288 * superblock. Filesystem data as well as the underlying block 288 * just dirties buffers with inodes so we have to submit IO for these buffers
289 * device. Takes the superblock lock. Requires a second blkdev 289 * via __sync_blockdev(). This also speeds up the wait == 1 case since in that
290 * flush by the caller to complete the operation. 290 * case write_inode() functions do sync_dirty_buffer() and thus effectively
291 * write one block at a time.
291 */ 292 */
292static int __fsync_super(struct super_block *sb) 293static int __fsync_super(struct super_block *sb, int wait)
293{ 294{
294 sync_inodes_sb(sb, 0);
295 vfs_dq_sync(sb); 295 vfs_dq_sync(sb);
296 sync_inodes_sb(sb, 1); 296 sync_inodes_sb(sb, wait);
297 lock_super(sb); 297 lock_super(sb);
298 if (sb->s_dirt && sb->s_op->write_super) 298 if (sb->s_dirt && sb->s_op->write_super)
299 sb->s_op->write_super(sb); 299 sb->s_op->write_super(sb);
300 unlock_super(sb); 300 unlock_super(sb);
301 if (sb->s_op->sync_fs) 301 if (sb->s_op->sync_fs)
302 sb->s_op->sync_fs(sb, 1); 302 sb->s_op->sync_fs(sb, wait);
303 return sync_blockdev(sb->s_bdev); 303 return __sync_blockdev(sb->s_bdev, wait);
304} 304}
305 305
306/* 306/*
@@ -310,7 +310,12 @@ static int __fsync_super(struct super_block *sb)
310 */ 310 */
311int fsync_super(struct super_block *sb) 311int fsync_super(struct super_block *sb)
312{ 312{
313 return __fsync_super(sb); 313 int ret;
314
315 ret = __fsync_super(sb, 0);
316 if (ret < 0)
317 return ret;
318 return __fsync_super(sb, 1);
314} 319}
315EXPORT_SYMBOL_GPL(fsync_super); 320EXPORT_SYMBOL_GPL(fsync_super);
316 321
@@ -469,20 +474,18 @@ restart:
469} 474}
470 475
471/* 476/*
472 * Call the ->sync_fs super_op against all filesystems which are r/w and 477 * Sync all the data for all the filesystems (called by sys_sync() and
473 * which implement it. 478 * emergency sync)
474 * 479 *
475 * This operation is careful to avoid the livelock which could easily happen 480 * This operation is careful to avoid the livelock which could easily happen
476 * if two or more filesystems are being continuously dirtied. s_need_sync_fs 481 * if two or more filesystems are being continuously dirtied. s_need_sync
477 * is used only here. We set it against all filesystems and then clear it as 482 * is used only here. We set it against all filesystems and then clear it as
478 * we sync them. So redirtied filesystems are skipped. 483 * we sync them. So redirtied filesystems are skipped.
479 * 484 *
480 * But if process A is currently running sync_filesystems and then process B 485 * But if process A is currently running sync_filesystems and then process B
481 * calls sync_filesystems as well, process B will set all the s_need_sync_fs 486 * calls sync_filesystems as well, process B will set all the s_need_sync
482 * flags again, which will cause process A to resync everything. Fix that with 487 * flags again, which will cause process A to resync everything. Fix that with
483 * a local mutex. 488 * a local mutex.
484 *
485 * (Fabian) Avoid sync_fs with clean fs & wait mode 0
486 */ 489 */
487void sync_filesystems(int wait) 490void sync_filesystems(int wait)
488{ 491{
@@ -492,25 +495,23 @@ void sync_filesystems(int wait)
492 mutex_lock(&mutex); /* Could be down_interruptible */ 495 mutex_lock(&mutex); /* Could be down_interruptible */
493 spin_lock(&sb_lock); 496 spin_lock(&sb_lock);
494 list_for_each_entry(sb, &super_blocks, s_list) { 497 list_for_each_entry(sb, &super_blocks, s_list) {
495 if (!sb->s_op->sync_fs)
496 continue;
497 if (sb->s_flags & MS_RDONLY) 498 if (sb->s_flags & MS_RDONLY)
498 continue; 499 continue;
499 sb->s_need_sync_fs = 1; 500 sb->s_need_sync = 1;
500 } 501 }
501 502
502restart: 503restart:
503 list_for_each_entry(sb, &super_blocks, s_list) { 504 list_for_each_entry(sb, &super_blocks, s_list) {
504 if (!sb->s_need_sync_fs) 505 if (!sb->s_need_sync)
505 continue; 506 continue;
506 sb->s_need_sync_fs = 0; 507 sb->s_need_sync = 0;
507 if (sb->s_flags & MS_RDONLY) 508 if (sb->s_flags & MS_RDONLY)
508 continue; /* hm. Was remounted r/o meanwhile */ 509 continue; /* hm. Was remounted r/o meanwhile */
509 sb->s_count++; 510 sb->s_count++;
510 spin_unlock(&sb_lock); 511 spin_unlock(&sb_lock);
511 down_read(&sb->s_umount); 512 down_read(&sb->s_umount);
512 if (sb->s_root) 513 if (sb->s_root)
513 sb->s_op->sync_fs(sb, wait); 514 __fsync_super(sb, wait);
514 up_read(&sb->s_umount); 515 up_read(&sb->s_umount);
515 /* restart only when sb is no longer on the list */ 516 /* restart only when sb is no longer on the list */
516 spin_lock(&sb_lock); 517 spin_lock(&sb_lock);
@@ -521,33 +522,6 @@ restart:
521 mutex_unlock(&mutex); 522 mutex_unlock(&mutex);
522} 523}
523 524
524#ifdef CONFIG_BLOCK
525/*
526 * Sync all block devices underlying some superblock
527 */
528void sync_blockdevs(void)
529{
530 struct super_block *sb;
531
532 spin_lock(&sb_lock);
533restart:
534 list_for_each_entry(sb, &super_blocks, s_list) {
535 if (!sb->s_bdev)
536 continue;
537 sb->s_count++;
538 spin_unlock(&sb_lock);
539 down_read(&sb->s_umount);
540 if (sb->s_root)
541 sync_blockdev(sb->s_bdev);
542 up_read(&sb->s_umount);
543 spin_lock(&sb_lock);
544 if (__put_super_and_need_restart(sb))
545 goto restart;
546 }
547 spin_unlock(&sb_lock);
548}
549#endif
550
551/** 525/**
552 * get_super - get the superblock of a device 526 * get_super - get the superblock of a device
553 * @bdev: device to get the superblock for 527 * @bdev: device to get the superblock for
diff --git a/fs/sync.c b/fs/sync.c
index 631fd5aece78..be0798cc33d7 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -18,35 +18,24 @@
18#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ 18#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
19 SYNC_FILE_RANGE_WAIT_AFTER) 19 SYNC_FILE_RANGE_WAIT_AFTER)
20 20
21/* 21SYSCALL_DEFINE0(sync)
22 * sync everything. Start out by waking pdflush, because that writes back
23 * all queues in parallel.
24 */
25static void do_sync(unsigned long wait)
26{ 22{
27 wakeup_pdflush(0); 23 sync_filesystems(0);
28 sync_inodes(0); /* All mappings, inodes and their blockdevs */ 24 sync_filesystems(1);
29 vfs_dq_sync(NULL);
30 sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */
31 sync_supers(); /* Write the superblocks */
32 sync_filesystems(0); /* Start syncing the filesystems */
33 sync_filesystems(wait); /* Waitingly sync the filesystems */
34 sync_blockdevs();
35 if (!wait)
36 printk("Emergency Sync complete\n");
37 if (unlikely(laptop_mode)) 25 if (unlikely(laptop_mode))
38 laptop_sync_completion(); 26 laptop_sync_completion();
39}
40
41SYSCALL_DEFINE0(sync)
42{
43 do_sync(1);
44 return 0; 27 return 0;
45} 28}
46 29
47static void do_sync_work(struct work_struct *work) 30static void do_sync_work(struct work_struct *work)
48{ 31{
49 do_sync(0); 32 /*
33 * Sync twice to reduce the possibility we skipped some inodes / pages
34 * because they were temporarily locked
35 */
36 sync_filesystems(0);
37 sync_filesystems(0);
38 printk("Emergency Sync complete\n");
50 kfree(work); 39 kfree(work);
51} 40}
52 41