diff options
author | Christoph Hellwig <hch@lst.de> | 2009-08-31 20:00:31 -0400 |
---|---|---|
committer | Felix Blyakher <felixb@sgi.com> | 2009-09-01 13:45:57 -0400 |
commit | 13e6d5cdde0e785aa943810f08b801cadd0935df (patch) | |
tree | 72b62d1e3e4b35f1613458b6e1dbbadd74534a92 | |
parent | bd169565993b39b9b4b102cdac8b13e0a259ce2f (diff) |
xfs: merge fsync and O_SYNC handling
The guarantees for O_SYNC are exactly the same as the ones we need to
make for an fsync call (and given that Linux O_SYNC is O_DSYNC the
equivalent is fdadatasync, but we treat both the same in XFS), except
with a range data writeout. Jan Kara has started unifying these two
path for filesystems using the generic helpers, and I've started to
look at XFS.
The actual transaction commited by xfs_fsync and xfs_write_sync_logforce
has a different transaction number, but actually is exactly the same.
We'll only use the fsync transaction going forward. One major difference
is that xfs_write_sync_logforce never issues a cache flush unless we
commit a transaction causing that as a side-effect, which is an obvious
bug in the O_SYNC handling. Second all the locking and i_update_size
vs i_update_core changes from 978b7237123d007b9fa983af6e0e2fa8f97f9934
never made it to xfs_write_sync_logforce, so we add them back.
To make xfs_fsync easily usable from the O_SYNC path, the filemap_fdatawait
call is moved up to xfs_file_fsync, so that we don't wait on the whole
file after we already waited for our portion in xfs_write.
We'll also use a plain call to filemap_write_and_wait_range instead
of the previous sync_page_rang which did it in two steps including
an half-hearted inode write out that doesn't help us.
Once we're done with this also remove the now useless i_update_size
tracking.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
-rw-r--r-- | fs/xfs/linux-2.6/xfs_aops.c | 1 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_file.c | 19 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_lrw.c | 7 | ||||
-rw-r--r-- | fs/xfs/xfs_iget.c | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.h | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_inode_item.c | 8 | ||||
-rw-r--r-- | fs/xfs/xfs_rw.c | 84 | ||||
-rw-r--r-- | fs/xfs/xfs_rw.h | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_trans.h | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_vnodeops.c | 11 |
10 files changed, 23 insertions, 112 deletions
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index aecf2519db76..d5e5559e31db 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c | |||
@@ -216,7 +216,6 @@ xfs_setfilesize( | |||
216 | if (ip->i_d.di_size < isize) { | 216 | if (ip->i_d.di_size < isize) { |
217 | ip->i_d.di_size = isize; | 217 | ip->i_d.di_size = isize; |
218 | ip->i_update_core = 1; | 218 | ip->i_update_core = 1; |
219 | ip->i_update_size = 1; | ||
220 | xfs_mark_inode_dirty_sync(ip); | 219 | xfs_mark_inode_dirty_sync(ip); |
221 | } | 220 | } |
222 | 221 | ||
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 0542fd507649..988d8f87bc0f 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c | |||
@@ -172,12 +172,21 @@ xfs_file_release( | |||
172 | */ | 172 | */ |
173 | STATIC int | 173 | STATIC int |
174 | xfs_file_fsync( | 174 | xfs_file_fsync( |
175 | struct file *filp, | 175 | struct file *file, |
176 | struct dentry *dentry, | 176 | struct dentry *dentry, |
177 | int datasync) | 177 | int datasync) |
178 | { | 178 | { |
179 | xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED); | 179 | struct inode *inode = dentry->d_inode; |
180 | return -xfs_fsync(XFS_I(dentry->d_inode)); | 180 | struct xfs_inode *ip = XFS_I(inode); |
181 | int error; | ||
182 | |||
183 | /* capture size updates in I/O completion before writing the inode. */ | ||
184 | error = filemap_fdatawait(inode->i_mapping); | ||
185 | if (error) | ||
186 | return error; | ||
187 | |||
188 | xfs_iflags_clear(ip, XFS_ITRUNCATED); | ||
189 | return -xfs_fsync(ip); | ||
181 | } | 190 | } |
182 | 191 | ||
183 | STATIC int | 192 | STATIC int |
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 7078974a6eee..49e4a6aea73c 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c | |||
@@ -812,18 +812,21 @@ write_retry: | |||
812 | 812 | ||
813 | /* Handle various SYNC-type writes */ | 813 | /* Handle various SYNC-type writes */ |
814 | if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { | 814 | if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { |
815 | loff_t end = pos + ret - 1; | ||
815 | int error2; | 816 | int error2; |
816 | 817 | ||
817 | xfs_iunlock(xip, iolock); | 818 | xfs_iunlock(xip, iolock); |
818 | if (need_i_mutex) | 819 | if (need_i_mutex) |
819 | mutex_unlock(&inode->i_mutex); | 820 | mutex_unlock(&inode->i_mutex); |
820 | error2 = sync_page_range(inode, mapping, pos, ret); | 821 | |
822 | error2 = filemap_write_and_wait_range(mapping, pos, end); | ||
821 | if (!error) | 823 | if (!error) |
822 | error = error2; | 824 | error = error2; |
823 | if (need_i_mutex) | 825 | if (need_i_mutex) |
824 | mutex_lock(&inode->i_mutex); | 826 | mutex_lock(&inode->i_mutex); |
825 | xfs_ilock(xip, iolock); | 827 | xfs_ilock(xip, iolock); |
826 | error2 = xfs_write_sync_logforce(mp, xip); | 828 | |
829 | error2 = xfs_fsync(xip); | ||
827 | if (!error) | 830 | if (!error) |
828 | error = error2; | 831 | error = error2; |
829 | } | 832 | } |
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index ecbf8b4d2e2e..3323826274b3 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c | |||
@@ -82,7 +82,6 @@ xfs_inode_alloc( | |||
82 | memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); | 82 | memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); |
83 | ip->i_flags = 0; | 83 | ip->i_flags = 0; |
84 | ip->i_update_core = 0; | 84 | ip->i_update_core = 0; |
85 | ip->i_update_size = 0; | ||
86 | ip->i_delayed_blks = 0; | 85 | ip->i_delayed_blks = 0; |
87 | memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); | 86 | memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); |
88 | ip->i_size = 0; | 87 | ip->i_size = 0; |
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 00e8505bde2d..ed566c248ae4 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h | |||
@@ -261,7 +261,6 @@ typedef struct xfs_inode { | |||
261 | /* Miscellaneous state. */ | 261 | /* Miscellaneous state. */ |
262 | unsigned short i_flags; /* see defined flags below */ | 262 | unsigned short i_flags; /* see defined flags below */ |
263 | unsigned char i_update_core; /* timestamps/size is dirty */ | 263 | unsigned char i_update_core; /* timestamps/size is dirty */ |
264 | unsigned char i_update_size; /* di_size field is dirty */ | ||
265 | unsigned int i_delayed_blks; /* count of delay alloc blks */ | 264 | unsigned int i_delayed_blks; /* count of delay alloc blks */ |
266 | 265 | ||
267 | xfs_icdinode_t i_d; /* most of ondisk inode */ | 266 | xfs_icdinode_t i_d; /* most of ondisk inode */ |
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 977c4aec587e..2e69412195e6 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c | |||
@@ -263,14 +263,6 @@ xfs_inode_item_format( | |||
263 | } | 263 | } |
264 | 264 | ||
265 | /* | 265 | /* |
266 | * We don't have to worry about re-ordering here because | ||
267 | * the update_size field is protected by the inode lock | ||
268 | * and we have that held in exclusive mode. | ||
269 | */ | ||
270 | if (ip->i_update_size) | ||
271 | ip->i_update_size = 0; | ||
272 | |||
273 | /* | ||
274 | * Make sure to get the latest atime from the Linux inode. | 266 | * Make sure to get the latest atime from the Linux inode. |
275 | */ | 267 | */ |
276 | xfs_synchronize_atime(ip); | 268 | xfs_synchronize_atime(ip); |
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index fea68615ed23..3f816ad7ff19 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c | |||
@@ -88,90 +88,6 @@ xfs_write_clear_setuid( | |||
88 | } | 88 | } |
89 | 89 | ||
90 | /* | 90 | /* |
91 | * Handle logging requirements of various synchronous types of write. | ||
92 | */ | ||
93 | int | ||
94 | xfs_write_sync_logforce( | ||
95 | xfs_mount_t *mp, | ||
96 | xfs_inode_t *ip) | ||
97 | { | ||
98 | int error = 0; | ||
99 | |||
100 | /* | ||
101 | * If we're treating this as O_DSYNC and we have not updated the | ||
102 | * size, force the log. | ||
103 | */ | ||
104 | if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) && | ||
105 | !(ip->i_update_size)) { | ||
106 | xfs_inode_log_item_t *iip = ip->i_itemp; | ||
107 | |||
108 | /* | ||
109 | * If an allocation transaction occurred | ||
110 | * without extending the size, then we have to force | ||
111 | * the log up the proper point to ensure that the | ||
112 | * allocation is permanent. We can't count on | ||
113 | * the fact that buffered writes lock out direct I/O | ||
114 | * writes - the direct I/O write could have extended | ||
115 | * the size nontransactionally, then finished before | ||
116 | * we started. xfs_write_file will think that the file | ||
117 | * didn't grow but the update isn't safe unless the | ||
118 | * size change is logged. | ||
119 | * | ||
120 | * Force the log if we've committed a transaction | ||
121 | * against the inode or if someone else has and | ||
122 | * the commit record hasn't gone to disk (e.g. | ||
123 | * the inode is pinned). This guarantees that | ||
124 | * all changes affecting the inode are permanent | ||
125 | * when we return. | ||
126 | */ | ||
127 | if (iip && iip->ili_last_lsn) { | ||
128 | error = _xfs_log_force(mp, iip->ili_last_lsn, | ||
129 | XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); | ||
130 | } else if (xfs_ipincount(ip) > 0) { | ||
131 | error = _xfs_log_force(mp, (xfs_lsn_t)0, | ||
132 | XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); | ||
133 | } | ||
134 | |||
135 | } else { | ||
136 | xfs_trans_t *tp; | ||
137 | |||
138 | /* | ||
139 | * O_SYNC or O_DSYNC _with_ a size update are handled | ||
140 | * the same way. | ||
141 | * | ||
142 | * If the write was synchronous then we need to make | ||
143 | * sure that the inode modification time is permanent. | ||
144 | * We'll have updated the timestamp above, so here | ||
145 | * we use a synchronous transaction to log the inode. | ||
146 | * It's not fast, but it's necessary. | ||
147 | * | ||
148 | * If this a dsync write and the size got changed | ||
149 | * non-transactionally, then we need to ensure that | ||
150 | * the size change gets logged in a synchronous | ||
151 | * transaction. | ||
152 | */ | ||
153 | tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC); | ||
154 | if ((error = xfs_trans_reserve(tp, 0, | ||
155 | XFS_SWRITE_LOG_RES(mp), | ||
156 | 0, 0, 0))) { | ||
157 | /* Transaction reserve failed */ | ||
158 | xfs_trans_cancel(tp, 0); | ||
159 | } else { | ||
160 | /* Transaction reserve successful */ | ||
161 | xfs_ilock(ip, XFS_ILOCK_EXCL); | ||
162 | xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); | ||
163 | xfs_trans_ihold(tp, ip); | ||
164 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | ||
165 | xfs_trans_set_sync(tp); | ||
166 | error = xfs_trans_commit(tp, 0); | ||
167 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
168 | } | ||
169 | } | ||
170 | |||
171 | return error; | ||
172 | } | ||
173 | |||
174 | /* | ||
175 | * Force a shutdown of the filesystem instantly while keeping | 91 | * Force a shutdown of the filesystem instantly while keeping |
176 | * the filesystem consistent. We don't do an unmount here; just shutdown | 92 | * the filesystem consistent. We don't do an unmount here; just shutdown |
177 | * the shop, make sure that absolutely nothing persistent happens to | 93 | * the shop, make sure that absolutely nothing persistent happens to |
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index ae65f0df87da..f5e4874c37d8 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h | |||
@@ -68,7 +68,6 @@ xfs_get_extsz_hint( | |||
68 | * Prototypes for functions in xfs_rw.c. | 68 | * Prototypes for functions in xfs_rw.c. |
69 | */ | 69 | */ |
70 | extern int xfs_write_clear_setuid(struct xfs_inode *ip); | 70 | extern int xfs_write_clear_setuid(struct xfs_inode *ip); |
71 | extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip); | ||
72 | extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); | 71 | extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); |
73 | extern int xfs_bioerror(struct xfs_buf *bp); | 72 | extern int xfs_bioerror(struct xfs_buf *bp); |
74 | extern int xfs_bioerror_relse(struct xfs_buf *bp); | 73 | extern int xfs_bioerror_relse(struct xfs_buf *bp); |
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 775249a54f6f..ed47fc77759c 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h | |||
@@ -68,7 +68,7 @@ typedef struct xfs_trans_header { | |||
68 | #define XFS_TRANS_GROWFS 14 | 68 | #define XFS_TRANS_GROWFS 14 |
69 | #define XFS_TRANS_STRAT_WRITE 15 | 69 | #define XFS_TRANS_STRAT_WRITE 15 |
70 | #define XFS_TRANS_DIOSTRAT 16 | 70 | #define XFS_TRANS_DIOSTRAT 16 |
71 | #define XFS_TRANS_WRITE_SYNC 17 | 71 | /* 17 was XFS_TRANS_WRITE_SYNC */ |
72 | #define XFS_TRANS_WRITEID 18 | 72 | #define XFS_TRANS_WRITEID 18 |
73 | #define XFS_TRANS_ADDAFORK 19 | 73 | #define XFS_TRANS_ADDAFORK 19 |
74 | #define XFS_TRANS_ATTRINVAL 20 | 74 | #define XFS_TRANS_ATTRINVAL 20 |
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index ceecafd1f9c1..03d3100559ac 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c | |||
@@ -611,7 +611,7 @@ xfs_fsync( | |||
611 | xfs_inode_t *ip) | 611 | xfs_inode_t *ip) |
612 | { | 612 | { |
613 | xfs_trans_t *tp; | 613 | xfs_trans_t *tp; |
614 | int error; | 614 | int error = 0; |
615 | int log_flushed = 0, changed = 1; | 615 | int log_flushed = 0, changed = 1; |
616 | 616 | ||
617 | xfs_itrace_entry(ip); | 617 | xfs_itrace_entry(ip); |
@@ -619,14 +619,9 @@ xfs_fsync( | |||
619 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) | 619 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) |
620 | return XFS_ERROR(EIO); | 620 | return XFS_ERROR(EIO); |
621 | 621 | ||
622 | /* capture size updates in I/O completion before writing the inode. */ | ||
623 | error = xfs_wait_on_pages(ip, 0, -1); | ||
624 | if (error) | ||
625 | return XFS_ERROR(error); | ||
626 | |||
627 | /* | 622 | /* |
628 | * We always need to make sure that the required inode state is safe on | 623 | * We always need to make sure that the required inode state is safe on |
629 | * disk. The vnode might be clean but we still might need to force the | 624 | * disk. The inode might be clean but we still might need to force the |
630 | * log because of committed transactions that haven't hit the disk yet. | 625 | * log because of committed transactions that haven't hit the disk yet. |
631 | * Likewise, there could be unflushed non-transactional changes to the | 626 | * Likewise, there could be unflushed non-transactional changes to the |
632 | * inode core that have to go to disk and this requires us to issue | 627 | * inode core that have to go to disk and this requires us to issue |
@@ -638,7 +633,7 @@ xfs_fsync( | |||
638 | */ | 633 | */ |
639 | xfs_ilock(ip, XFS_ILOCK_SHARED); | 634 | xfs_ilock(ip, XFS_ILOCK_SHARED); |
640 | 635 | ||
641 | if (!(ip->i_update_size || ip->i_update_core)) { | 636 | if (!ip->i_update_core) { |
642 | /* | 637 | /* |
643 | * Timestamps/size haven't changed since last inode flush or | 638 | * Timestamps/size haven't changed since last inode flush or |
644 | * inode transaction commit. That means either nothing got | 639 | * inode transaction commit. That means either nothing got |