diff options
author | Jan Kara <jack@suse.cz> | 2009-04-27 10:43:51 -0400 |
---|---|---|
committer | Al Viro <viro@zeniv.linux.org.uk> | 2009-06-11 21:36:03 -0400 |
commit | 5cee5815d1564bbbd505fea86f4550f1efdb5cd0 (patch) | |
tree | ba99c38932dc534bf56f0d7dcfeeca153f50b007 /fs | |
parent | 429479f031322a0cc5c921ffb2321a51718dc875 (diff) |
vfs: Make sys_sync() use fsync_super() (version 4)
It is unnecessarily fragile to have two places (fsync_super() and do_sync())
doing a data integrity sync of the filesystem. Alter __fsync_super() to
accommodate the needs of both callers and use it. So after this patch,
__fsync_super() is the only place where we gather all the calls needed to
properly send all data on a filesystem to disk.
A nice bonus is that we get complete livelock avoidance, and write_supers()
is now used only for periodic writeback of superblocks.
sync_blockdevs(), introduced a couple of patches ago, is gone now.
[build fixes folded]
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/block_dev.c | 15 | ||||
-rw-r--r-- | fs/fs-writeback.c | 49 | ||||
-rw-r--r-- | fs/internal.h | 16 | ||||
-rw-r--r-- | fs/super.c | 72 | ||||
-rw-r--r-- | fs/sync.c | 31 |
5 files changed, 50 insertions, 133 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c index fe47f7227618..4b6a3b9d01ef 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
@@ -176,17 +176,22 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | |||
176 | iov, offset, nr_segs, blkdev_get_blocks, NULL); | 176 | iov, offset, nr_segs, blkdev_get_blocks, NULL); |
177 | } | 177 | } |
178 | 178 | ||
179 | int __sync_blockdev(struct block_device *bdev, int wait) | ||
180 | { | ||
181 | if (!bdev) | ||
182 | return 0; | ||
183 | if (!wait) | ||
184 | return filemap_flush(bdev->bd_inode->i_mapping); | ||
185 | return filemap_write_and_wait(bdev->bd_inode->i_mapping); | ||
186 | } | ||
187 | |||
179 | /* | 188 | /* |
180 | * Write out and wait upon all the dirty data associated with a block | 189 | * Write out and wait upon all the dirty data associated with a block |
181 | * device via its mapping. Does not take the superblock lock. | 190 | * device via its mapping. Does not take the superblock lock. |
182 | */ | 191 | */ |
183 | int sync_blockdev(struct block_device *bdev) | 192 | int sync_blockdev(struct block_device *bdev) |
184 | { | 193 | { |
185 | int ret = 0; | 194 | return __sync_blockdev(bdev, 1); |
186 | |||
187 | if (bdev) | ||
188 | ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); | ||
189 | return ret; | ||
190 | } | 195 | } |
191 | EXPORT_SYMBOL(sync_blockdev); | 196 | EXPORT_SYMBOL(sync_blockdev); |
192 | 197 | ||
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 91013ff7dd53..e0fb2e789598 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -679,55 +679,6 @@ void sync_inodes_sb(struct super_block *sb, int wait) | |||
679 | } | 679 | } |
680 | 680 | ||
681 | /** | 681 | /** |
682 | * sync_inodes - writes all inodes to disk | ||
683 | * @wait: wait for completion | ||
684 | * | ||
685 | * sync_inodes() goes through each super block's dirty inode list, writes the | ||
686 | * inodes out, waits on the writeout and puts the inodes back on the normal | ||
687 | * list. | ||
688 | * | ||
689 | * This is for sys_sync(). fsync_dev() uses the same algorithm. The subtle | ||
690 | * part of the sync functions is that the blockdev "superblock" is processed | ||
691 | * last. This is because the write_inode() function of a typical fs will | ||
692 | * perform no I/O, but will mark buffers in the blockdev mapping as dirty. | ||
693 | * What we want to do is to perform all that dirtying first, and then write | ||
694 | * back all those inode blocks via the blockdev mapping in one sweep. So the | ||
695 | * additional (somewhat redundant) sync_blockdev() calls here are to make | ||
696 | * sure that really happens. Because if we call sync_inodes_sb(wait=1) with | ||
697 | * outstanding dirty inodes, the writeback goes block-at-a-time within the | ||
698 | * filesystem's write_inode(). This is extremely slow. | ||
699 | */ | ||
700 | static void __sync_inodes(int wait) | ||
701 | { | ||
702 | struct super_block *sb; | ||
703 | |||
704 | spin_lock(&sb_lock); | ||
705 | restart: | ||
706 | list_for_each_entry(sb, &super_blocks, s_list) { | ||
707 | sb->s_count++; | ||
708 | spin_unlock(&sb_lock); | ||
709 | down_read(&sb->s_umount); | ||
710 | if (sb->s_root) { | ||
711 | sync_inodes_sb(sb, wait); | ||
712 | sync_blockdev(sb->s_bdev); | ||
713 | } | ||
714 | up_read(&sb->s_umount); | ||
715 | spin_lock(&sb_lock); | ||
716 | if (__put_super_and_need_restart(sb)) | ||
717 | goto restart; | ||
718 | } | ||
719 | spin_unlock(&sb_lock); | ||
720 | } | ||
721 | |||
722 | void sync_inodes(int wait) | ||
723 | { | ||
724 | __sync_inodes(0); | ||
725 | |||
726 | if (wait) | ||
727 | __sync_inodes(1); | ||
728 | } | ||
729 | |||
730 | /** | ||
731 | * write_inode_now - write an inode to disk | 682 | * write_inode_now - write an inode to disk |
732 | * @inode: inode to write to disk | 683 | * @inode: inode to write to disk |
733 | * @sync: whether the write should be synchronous or not | 684 | * @sync: whether the write should be synchronous or not |
diff --git a/fs/internal.h b/fs/internal.h index 343a537ab809..dbec3cc28338 100644 --- a/fs/internal.h +++ b/fs/internal.h | |||
@@ -25,6 +25,8 @@ static inline int sb_is_blkdev_sb(struct super_block *sb) | |||
25 | return sb == blockdev_superblock; | 25 | return sb == blockdev_superblock; |
26 | } | 26 | } |
27 | 27 | ||
28 | extern int __sync_blockdev(struct block_device *bdev, int wait); | ||
29 | |||
28 | #else | 30 | #else |
29 | static inline void bdev_cache_init(void) | 31 | static inline void bdev_cache_init(void) |
30 | { | 32 | { |
@@ -34,6 +36,11 @@ static inline int sb_is_blkdev_sb(struct super_block *sb) | |||
34 | { | 36 | { |
35 | return 0; | 37 | return 0; |
36 | } | 38 | } |
39 | |||
40 | static inline int __sync_blockdev(struct block_device *bdev, int wait) | ||
41 | { | ||
42 | return 0; | ||
43 | } | ||
37 | #endif | 44 | #endif |
38 | 45 | ||
39 | /* | 46 | /* |
@@ -71,12 +78,3 @@ extern void chroot_fs_refs(struct path *, struct path *); | |||
71 | * file_table.c | 78 | * file_table.c |
72 | */ | 79 | */ |
73 | extern void mark_files_ro(struct super_block *); | 80 | extern void mark_files_ro(struct super_block *); |
74 | |||
75 | /* | ||
76 | * super.c | ||
77 | */ | ||
78 | #ifdef CONFIG_BLOCK | ||
79 | extern void sync_blockdevs(void); | ||
80 | #else | ||
81 | static inline void sync_blockdevs(void) { } | ||
82 | #endif | ||
diff --git a/fs/super.c b/fs/super.c index 8dbe1ead9ddd..c8ce5ed04249 100644 --- a/fs/super.c +++ b/fs/super.c | |||
@@ -284,23 +284,23 @@ EXPORT_SYMBOL(lock_super); | |||
284 | EXPORT_SYMBOL(unlock_super); | 284 | EXPORT_SYMBOL(unlock_super); |
285 | 285 | ||
286 | /* | 286 | /* |
287 | * Write out and wait upon all dirty data associated with this | 287 | * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0) |
288 | * superblock. Filesystem data as well as the underlying block | 288 | * just dirties buffers with inodes so we have to submit IO for these buffers |
289 | * device. Takes the superblock lock. Requires a second blkdev | 289 | * via __sync_blockdev(). This also speeds up the wait == 1 case since in that |
290 | * flush by the caller to complete the operation. | 290 | * case write_inode() functions do sync_dirty_buffer() and thus effectively |
291 | * write one block at a time. | ||
291 | */ | 292 | */ |
292 | static int __fsync_super(struct super_block *sb) | 293 | static int __fsync_super(struct super_block *sb, int wait) |
293 | { | 294 | { |
294 | sync_inodes_sb(sb, 0); | ||
295 | vfs_dq_sync(sb); | 295 | vfs_dq_sync(sb); |
296 | sync_inodes_sb(sb, 1); | 296 | sync_inodes_sb(sb, wait); |
297 | lock_super(sb); | 297 | lock_super(sb); |
298 | if (sb->s_dirt && sb->s_op->write_super) | 298 | if (sb->s_dirt && sb->s_op->write_super) |
299 | sb->s_op->write_super(sb); | 299 | sb->s_op->write_super(sb); |
300 | unlock_super(sb); | 300 | unlock_super(sb); |
301 | if (sb->s_op->sync_fs) | 301 | if (sb->s_op->sync_fs) |
302 | sb->s_op->sync_fs(sb, 1); | 302 | sb->s_op->sync_fs(sb, wait); |
303 | return sync_blockdev(sb->s_bdev); | 303 | return __sync_blockdev(sb->s_bdev, wait); |
304 | } | 304 | } |
305 | 305 | ||
306 | /* | 306 | /* |
@@ -310,7 +310,12 @@ static int __fsync_super(struct super_block *sb) | |||
310 | */ | 310 | */ |
311 | int fsync_super(struct super_block *sb) | 311 | int fsync_super(struct super_block *sb) |
312 | { | 312 | { |
313 | return __fsync_super(sb); | 313 | int ret; |
314 | |||
315 | ret = __fsync_super(sb, 0); | ||
316 | if (ret < 0) | ||
317 | return ret; | ||
318 | return __fsync_super(sb, 1); | ||
314 | } | 319 | } |
315 | EXPORT_SYMBOL_GPL(fsync_super); | 320 | EXPORT_SYMBOL_GPL(fsync_super); |
316 | 321 | ||
@@ -469,20 +474,18 @@ restart: | |||
469 | } | 474 | } |
470 | 475 | ||
471 | /* | 476 | /* |
472 | * Call the ->sync_fs super_op against all filesystems which are r/w and | 477 | * Sync all the data for all the filesystems (called by sys_sync() and |
473 | * which implement it. | 478 | * emergency sync) |
474 | * | 479 | * |
475 | * This operation is careful to avoid the livelock which could easily happen | 480 | * This operation is careful to avoid the livelock which could easily happen |
476 | * if two or more filesystems are being continuously dirtied. s_need_sync_fs | 481 | * if two or more filesystems are being continuously dirtied. s_need_sync |
477 | * is used only here. We set it against all filesystems and then clear it as | 482 | * is used only here. We set it against all filesystems and then clear it as |
478 | * we sync them. So redirtied filesystems are skipped. | 483 | * we sync them. So redirtied filesystems are skipped. |
479 | * | 484 | * |
480 | * But if process A is currently running sync_filesystems and then process B | 485 | * But if process A is currently running sync_filesystems and then process B |
481 | * calls sync_filesystems as well, process B will set all the s_need_sync_fs | 486 | * calls sync_filesystems as well, process B will set all the s_need_sync |
482 | * flags again, which will cause process A to resync everything. Fix that with | 487 | * flags again, which will cause process A to resync everything. Fix that with |
483 | * a local mutex. | 488 | * a local mutex. |
484 | * | ||
485 | * (Fabian) Avoid sync_fs with clean fs & wait mode 0 | ||
486 | */ | 489 | */ |
487 | void sync_filesystems(int wait) | 490 | void sync_filesystems(int wait) |
488 | { | 491 | { |
@@ -492,25 +495,23 @@ void sync_filesystems(int wait) | |||
492 | mutex_lock(&mutex); /* Could be down_interruptible */ | 495 | mutex_lock(&mutex); /* Could be down_interruptible */ |
493 | spin_lock(&sb_lock); | 496 | spin_lock(&sb_lock); |
494 | list_for_each_entry(sb, &super_blocks, s_list) { | 497 | list_for_each_entry(sb, &super_blocks, s_list) { |
495 | if (!sb->s_op->sync_fs) | ||
496 | continue; | ||
497 | if (sb->s_flags & MS_RDONLY) | 498 | if (sb->s_flags & MS_RDONLY) |
498 | continue; | 499 | continue; |
499 | sb->s_need_sync_fs = 1; | 500 | sb->s_need_sync = 1; |
500 | } | 501 | } |
501 | 502 | ||
502 | restart: | 503 | restart: |
503 | list_for_each_entry(sb, &super_blocks, s_list) { | 504 | list_for_each_entry(sb, &super_blocks, s_list) { |
504 | if (!sb->s_need_sync_fs) | 505 | if (!sb->s_need_sync) |
505 | continue; | 506 | continue; |
506 | sb->s_need_sync_fs = 0; | 507 | sb->s_need_sync = 0; |
507 | if (sb->s_flags & MS_RDONLY) | 508 | if (sb->s_flags & MS_RDONLY) |
508 | continue; /* hm. Was remounted r/o meanwhile */ | 509 | continue; /* hm. Was remounted r/o meanwhile */ |
509 | sb->s_count++; | 510 | sb->s_count++; |
510 | spin_unlock(&sb_lock); | 511 | spin_unlock(&sb_lock); |
511 | down_read(&sb->s_umount); | 512 | down_read(&sb->s_umount); |
512 | if (sb->s_root) | 513 | if (sb->s_root) |
513 | sb->s_op->sync_fs(sb, wait); | 514 | __fsync_super(sb, wait); |
514 | up_read(&sb->s_umount); | 515 | up_read(&sb->s_umount); |
515 | /* restart only when sb is no longer on the list */ | 516 | /* restart only when sb is no longer on the list */ |
516 | spin_lock(&sb_lock); | 517 | spin_lock(&sb_lock); |
@@ -521,33 +522,6 @@ restart: | |||
521 | mutex_unlock(&mutex); | 522 | mutex_unlock(&mutex); |
522 | } | 523 | } |
523 | 524 | ||
524 | #ifdef CONFIG_BLOCK | ||
525 | /* | ||
526 | * Sync all block devices underlying some superblock | ||
527 | */ | ||
528 | void sync_blockdevs(void) | ||
529 | { | ||
530 | struct super_block *sb; | ||
531 | |||
532 | spin_lock(&sb_lock); | ||
533 | restart: | ||
534 | list_for_each_entry(sb, &super_blocks, s_list) { | ||
535 | if (!sb->s_bdev) | ||
536 | continue; | ||
537 | sb->s_count++; | ||
538 | spin_unlock(&sb_lock); | ||
539 | down_read(&sb->s_umount); | ||
540 | if (sb->s_root) | ||
541 | sync_blockdev(sb->s_bdev); | ||
542 | up_read(&sb->s_umount); | ||
543 | spin_lock(&sb_lock); | ||
544 | if (__put_super_and_need_restart(sb)) | ||
545 | goto restart; | ||
546 | } | ||
547 | spin_unlock(&sb_lock); | ||
548 | } | ||
549 | #endif | ||
550 | |||
551 | /** | 525 | /** |
552 | * get_super - get the superblock of a device | 526 | * get_super - get the superblock of a device |
553 | * @bdev: device to get the superblock for | 527 | * @bdev: device to get the superblock for |
diff --git a/fs/sync.c b/fs/sync.c --- a/fs/sync.c +++ b/fs/sync.c | |||
@@ -18,35 +18,24 @@ | |||
18 | #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ | 18 | #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ |
19 | SYNC_FILE_RANGE_WAIT_AFTER) | 19 | SYNC_FILE_RANGE_WAIT_AFTER) |
20 | 20 | ||
21 | /* | 21 | SYSCALL_DEFINE0(sync) |
22 | * sync everything. Start out by waking pdflush, because that writes back | ||
23 | * all queues in parallel. | ||
24 | */ | ||
25 | static void do_sync(unsigned long wait) | ||
26 | { | 22 | { |
27 | wakeup_pdflush(0); | 23 | sync_filesystems(0); |
28 | sync_inodes(0); /* All mappings, inodes and their blockdevs */ | 24 | sync_filesystems(1); |
29 | vfs_dq_sync(NULL); | ||
30 | sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */ | ||
31 | sync_supers(); /* Write the superblocks */ | ||
32 | sync_filesystems(0); /* Start syncing the filesystems */ | ||
33 | sync_filesystems(wait); /* Waitingly sync the filesystems */ | ||
34 | sync_blockdevs(); | ||
35 | if (!wait) | ||
36 | printk("Emergency Sync complete\n"); | ||
37 | if (unlikely(laptop_mode)) | 25 | if (unlikely(laptop_mode)) |
38 | laptop_sync_completion(); | 26 | laptop_sync_completion(); |
39 | } | ||
40 | |||
41 | SYSCALL_DEFINE0(sync) | ||
42 | { | ||
43 | do_sync(1); | ||
44 | return 0; | 27 | return 0; |
45 | } | 28 | } |
46 | 29 | ||
47 | static void do_sync_work(struct work_struct *work) | 30 | static void do_sync_work(struct work_struct *work) |
48 | { | 31 | { |
49 | do_sync(0); | 32 | /* |
33 | * Sync twice to reduce the possibility we skipped some inodes / pages | ||
34 | * because they were temporarily locked | ||
35 | */ | ||
36 | sync_filesystems(0); | ||
37 | sync_filesystems(0); | ||
38 | printk("Emergency Sync complete\n"); | ||
50 | kfree(work); | 39 | kfree(work); |
51 | } | 40 | } |
52 | 41 | ||