diff options
| author | Jan Kara <jack@suse.cz> | 2009-10-16 13:26:15 -0400 |
|---|---|---|
| committer | Jan Kara <jack@suse.cz> | 2009-11-11 09:22:49 -0500 |
| commit | fe8bc91c4c30122b357d197117705cfd4fabaf28 (patch) | |
| tree | 33e2895847d812204209444db4093af9a2c3623d | |
| parent | ea0174a7137c8ca9f130ca681f3a99c872da6778 (diff) | |
ext3: Wait for proper transaction commit on fsync
We cannot rely on buffer dirty bits during fsync because pdflush can come
before fsync is called and clear dirty bits without forcing a transaction
commit. Instead, we track which transaction last changed the inode and
which transaction last changed its allocation, and on fsync or fdatasync
we force the corresponding transaction to disk.
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
| -rw-r--r-- | fs/ext3/fsync.c | 36 | ||||
| -rw-r--r-- | fs/ext3/inode.c | 32 | ||||
| -rw-r--r-- | fs/ext3/super.c | 2 | ||||
| -rw-r--r-- | include/linux/ext3_fs_i.h | 8 |
4 files changed, 57 insertions, 21 deletions
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index 451d166bbe93..8209f266e9ad 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c | |||
| @@ -46,19 +46,21 @@ | |||
| 46 | int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) | 46 | int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) |
| 47 | { | 47 | { |
| 48 | struct inode *inode = dentry->d_inode; | 48 | struct inode *inode = dentry->d_inode; |
| 49 | struct ext3_inode_info *ei = EXT3_I(inode); | ||
| 50 | journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; | ||
| 49 | int ret = 0; | 51 | int ret = 0; |
| 52 | tid_t commit_tid; | ||
| 53 | |||
| 54 | if (inode->i_sb->s_flags & MS_RDONLY) | ||
| 55 | return 0; | ||
| 50 | 56 | ||
| 51 | J_ASSERT(ext3_journal_current_handle() == NULL); | 57 | J_ASSERT(ext3_journal_current_handle() == NULL); |
| 52 | 58 | ||
| 53 | /* | 59 | /* |
| 54 | * data=writeback: | 60 | * data=writeback,ordered: |
| 55 | * The caller's filemap_fdatawrite()/wait will sync the data. | 61 | * The caller's filemap_fdatawrite()/wait will sync the data. |
| 56 | * sync_inode() will sync the metadata | 62 | * Metadata is in the journal, we wait for a proper transaction |
| 57 | * | 63 | * to commit here. |
| 58 | * data=ordered: | ||
| 59 | * The caller's filemap_fdatawrite() will write the data and | ||
| 60 | * sync_inode() will write the inode if it is dirty. Then the caller's | ||
| 61 | * filemap_fdatawait() will wait on the pages. | ||
| 62 | * | 64 | * |
| 63 | * data=journal: | 65 | * data=journal: |
| 64 | * filemap_fdatawrite won't do anything (the buffers are clean). | 66 | * filemap_fdatawrite won't do anything (the buffers are clean). |
| @@ -73,22 +75,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) | |||
| 73 | goto out; | 75 | goto out; |
| 74 | } | 76 | } |
| 75 | 77 | ||
| 76 | if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) | 78 | if (datasync) |
| 77 | goto flush; | 79 | commit_tid = atomic_read(&ei->i_datasync_tid); |
| 80 | else | ||
| 81 | commit_tid = atomic_read(&ei->i_sync_tid); | ||
| 78 | 82 | ||
| 79 | /* | 83 | if (log_start_commit(journal, commit_tid)) { |
| 80 | * The VFS has written the file data. If the inode is unaltered | 84 | log_wait_commit(journal, commit_tid); |
| 81 | * then we need not start a commit. | ||
| 82 | */ | ||
| 83 | if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { | ||
| 84 | struct writeback_control wbc = { | ||
| 85 | .sync_mode = WB_SYNC_ALL, | ||
| 86 | .nr_to_write = 0, /* sys_fsync did this */ | ||
| 87 | }; | ||
| 88 | ret = sync_inode(inode, &wbc); | ||
| 89 | goto out; | 85 | goto out; |
| 90 | } | 86 | } |
| 91 | flush: | 87 | |
| 92 | /* | 88 | /* |
| 93 | * In case we didn't commit a transaction, we have to flush | 89 | * In case we didn't commit a transaction, we have to flush |
| 94 | * disk caches manually so that data really is on persistent | 90 | * disk caches manually so that data really is on persistent |
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 069a163393b4..354ed3b47b30 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c | |||
| @@ -699,8 +699,9 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, | |||
| 699 | int err = 0; | 699 | int err = 0; |
| 700 | struct ext3_block_alloc_info *block_i; | 700 | struct ext3_block_alloc_info *block_i; |
| 701 | ext3_fsblk_t current_block; | 701 | ext3_fsblk_t current_block; |
| 702 | struct ext3_inode_info *ei = EXT3_I(inode); | ||
| 702 | 703 | ||
| 703 | block_i = EXT3_I(inode)->i_block_alloc_info; | 704 | block_i = ei->i_block_alloc_info; |
| 704 | /* | 705 | /* |
| 705 | * If we're splicing into a [td]indirect block (as opposed to the | 706 | * If we're splicing into a [td]indirect block (as opposed to the |
| 706 | * inode) then we need to get write access to the [td]indirect block | 707 | * inode) then we need to get write access to the [td]indirect block |
| @@ -741,6 +742,8 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, | |||
| 741 | 742 | ||
| 742 | inode->i_ctime = CURRENT_TIME_SEC; | 743 | inode->i_ctime = CURRENT_TIME_SEC; |
| 743 | ext3_mark_inode_dirty(handle, inode); | 744 | ext3_mark_inode_dirty(handle, inode); |
| 745 | /* ext3_mark_inode_dirty already updated i_sync_tid */ | ||
| 746 | atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); | ||
| 744 | 747 | ||
| 745 | /* had we spliced it onto indirect block? */ | 748 | /* had we spliced it onto indirect block? */ |
| 746 | if (where->bh) { | 749 | if (where->bh) { |
| @@ -2754,6 +2757,8 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) | |||
| 2754 | struct ext3_inode_info *ei; | 2757 | struct ext3_inode_info *ei; |
| 2755 | struct buffer_head *bh; | 2758 | struct buffer_head *bh; |
| 2756 | struct inode *inode; | 2759 | struct inode *inode; |
| 2760 | journal_t *journal = EXT3_SB(sb)->s_journal; | ||
| 2761 | transaction_t *transaction; | ||
| 2757 | long ret; | 2762 | long ret; |
| 2758 | int block; | 2763 | int block; |
| 2759 | 2764 | ||
| @@ -2831,6 +2836,30 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) | |||
| 2831 | ei->i_data[block] = raw_inode->i_block[block]; | 2836 | ei->i_data[block] = raw_inode->i_block[block]; |
| 2832 | INIT_LIST_HEAD(&ei->i_orphan); | 2837 | INIT_LIST_HEAD(&ei->i_orphan); |
| 2833 | 2838 | ||
| 2839 | /* | ||
| 2840 | * Set transaction id's of transactions that have to be committed | ||
| 2841 | * to finish f[data]sync. We set them to currently running transaction | ||
| 2842 | * as we cannot be sure that the inode or some of its metadata isn't | ||
| 2843 | * part of the transaction - the inode could have been reclaimed and | ||
| 2844 | * now it is reread from disk. | ||
| 2845 | */ | ||
| 2846 | if (journal) { | ||
| 2847 | tid_t tid; | ||
| 2848 | |||
| 2849 | spin_lock(&journal->j_state_lock); | ||
| 2850 | if (journal->j_running_transaction) | ||
| 2851 | transaction = journal->j_running_transaction; | ||
| 2852 | else | ||
| 2853 | transaction = journal->j_committing_transaction; | ||
| 2854 | if (transaction) | ||
| 2855 | tid = transaction->t_tid; | ||
| 2856 | else | ||
| 2857 | tid = journal->j_commit_sequence; | ||
| 2858 | spin_unlock(&journal->j_state_lock); | ||
| 2859 | atomic_set(&ei->i_sync_tid, tid); | ||
| 2860 | atomic_set(&ei->i_datasync_tid, tid); | ||
| 2861 | } | ||
| 2862 | |||
| 2834 | if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 && | 2863 | if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 && |
| 2835 | EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { | 2864 | EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { |
| 2836 | /* | 2865 | /* |
| @@ -3015,6 +3044,7 @@ again: | |||
| 3015 | err = rc; | 3044 | err = rc; |
| 3016 | ei->i_state &= ~EXT3_STATE_NEW; | 3045 | ei->i_state &= ~EXT3_STATE_NEW; |
| 3017 | 3046 | ||
| 3047 | atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); | ||
| 3018 | out_brelse: | 3048 | out_brelse: |
| 3019 | brelse (bh); | 3049 | brelse (bh); |
| 3020 | ext3_std_error(inode->i_sb, err); | 3050 | ext3_std_error(inode->i_sb, err); |
diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 7a520a862f49..427496c4767c 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c | |||
| @@ -466,6 +466,8 @@ static struct inode *ext3_alloc_inode(struct super_block *sb) | |||
| 466 | return NULL; | 466 | return NULL; |
| 467 | ei->i_block_alloc_info = NULL; | 467 | ei->i_block_alloc_info = NULL; |
| 468 | ei->vfs_inode.i_version = 1; | 468 | ei->vfs_inode.i_version = 1; |
| 469 | atomic_set(&ei->i_datasync_tid, 0); | ||
| 470 | atomic_set(&ei->i_sync_tid, 0); | ||
| 469 | return &ei->vfs_inode; | 471 | return &ei->vfs_inode; |
| 470 | } | 472 | } |
| 471 | 473 | ||
diff --git a/include/linux/ext3_fs_i.h b/include/linux/ext3_fs_i.h index ca1bfe90004f..93e7428156ba 100644 --- a/include/linux/ext3_fs_i.h +++ b/include/linux/ext3_fs_i.h | |||
| @@ -137,6 +137,14 @@ struct ext3_inode_info { | |||
| 137 | * by other means, so we have truncate_mutex. | 137 | * by other means, so we have truncate_mutex. |
| 138 | */ | 138 | */ |
| 139 | struct mutex truncate_mutex; | 139 | struct mutex truncate_mutex; |
| 140 | |||
| 141 | /* | ||
| 142 | * Transactions that contain inode's metadata needed to complete | ||
| 143 | * fsync and fdatasync, respectively. | ||
| 144 | */ | ||
| 145 | atomic_t i_sync_tid; | ||
| 146 | atomic_t i_datasync_tid; | ||
| 147 | |||
| 140 | struct inode vfs_inode; | 148 | struct inode vfs_inode; |
| 141 | }; | 149 | }; |
| 142 | 150 | ||
