aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChristoph Hellwig <hch@lst.de>2009-08-31 20:00:31 -0400
committerFelix Blyakher <felixb@sgi.com>2009-09-01 13:45:57 -0400
commit13e6d5cdde0e785aa943810f08b801cadd0935df (patch)
tree72b62d1e3e4b35f1613458b6e1dbbadd74534a92
parentbd169565993b39b9b4b102cdac8b13e0a259ce2f (diff)
xfs: merge fsync and O_SYNC handling
The guarantees for O_SYNC are exactly the same as the ones we need to make for an fsync call (and given that Linux O_SYNC is O_DSYNC the equivalent is fdatasync, but we treat both the same in XFS), except with a range data writeout. Jan Kara has started unifying these two paths for filesystems using the generic helpers, and I've started to look at XFS. The actual transaction committed by xfs_fsync and xfs_write_sync_logforce has a different transaction number, but actually is exactly the same. We'll only use the fsync transaction going forward. One major difference is that xfs_write_sync_logforce never issues a cache flush unless we commit a transaction causing that as a side-effect, which is an obvious bug in the O_SYNC handling. Second, all the locking and i_update_size vs i_update_core changes from 978b7237123d007b9fa983af6e0e2fa8f97f9934 never made it to xfs_write_sync_logforce, so we add them back. To make xfs_fsync easily usable from the O_SYNC path, the filemap_fdatawait call is moved up to xfs_file_fsync, so that we don't wait on the whole file after we already waited for our portion in xfs_write. We'll also use a plain call to filemap_write_and_wait_range instead of the previous sync_page_range which did it in two steps including a half-hearted inode write out that doesn't help us. Once we're done with this also remove the now useless i_update_size tracking. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Felix Blyakher <felixb@sgi.com> Signed-off-by: Felix Blyakher <felixb@sgi.com>
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c19
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c7
-rw-r--r--fs/xfs/xfs_iget.c1
-rw-r--r--fs/xfs/xfs_inode.h1
-rw-r--r--fs/xfs/xfs_inode_item.c8
-rw-r--r--fs/xfs/xfs_rw.c84
-rw-r--r--fs/xfs/xfs_rw.h1
-rw-r--r--fs/xfs/xfs_trans.h2
-rw-r--r--fs/xfs/xfs_vnodeops.c11
10 files changed, 23 insertions, 112 deletions
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index aecf2519db76..d5e5559e31db 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -216,7 +216,6 @@ xfs_setfilesize(
216 if (ip->i_d.di_size < isize) { 216 if (ip->i_d.di_size < isize) {
217 ip->i_d.di_size = isize; 217 ip->i_d.di_size = isize;
218 ip->i_update_core = 1; 218 ip->i_update_core = 1;
219 ip->i_update_size = 1;
220 xfs_mark_inode_dirty_sync(ip); 219 xfs_mark_inode_dirty_sync(ip);
221 } 220 }
222 221
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 0542fd507649..988d8f87bc0f 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -172,12 +172,21 @@ xfs_file_release(
172 */ 172 */
173STATIC int 173STATIC int
174xfs_file_fsync( 174xfs_file_fsync(
175 struct file *filp, 175 struct file *file,
176 struct dentry *dentry, 176 struct dentry *dentry,
177 int datasync) 177 int datasync)
178{ 178{
179 xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED); 179 struct inode *inode = dentry->d_inode;
180 return -xfs_fsync(XFS_I(dentry->d_inode)); 180 struct xfs_inode *ip = XFS_I(inode);
181 int error;
182
183 /* capture size updates in I/O completion before writing the inode. */
184 error = filemap_fdatawait(inode->i_mapping);
185 if (error)
186 return error;
187
188 xfs_iflags_clear(ip, XFS_ITRUNCATED);
189 return -xfs_fsync(ip);
181} 190}
182 191
183STATIC int 192STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 7078974a6eee..49e4a6aea73c 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -812,18 +812,21 @@ write_retry:
812 812
813 /* Handle various SYNC-type writes */ 813 /* Handle various SYNC-type writes */
814 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { 814 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
815 loff_t end = pos + ret - 1;
815 int error2; 816 int error2;
816 817
817 xfs_iunlock(xip, iolock); 818 xfs_iunlock(xip, iolock);
818 if (need_i_mutex) 819 if (need_i_mutex)
819 mutex_unlock(&inode->i_mutex); 820 mutex_unlock(&inode->i_mutex);
820 error2 = sync_page_range(inode, mapping, pos, ret); 821
822 error2 = filemap_write_and_wait_range(mapping, pos, end);
821 if (!error) 823 if (!error)
822 error = error2; 824 error = error2;
823 if (need_i_mutex) 825 if (need_i_mutex)
824 mutex_lock(&inode->i_mutex); 826 mutex_lock(&inode->i_mutex);
825 xfs_ilock(xip, iolock); 827 xfs_ilock(xip, iolock);
826 error2 = xfs_write_sync_logforce(mp, xip); 828
829 error2 = xfs_fsync(xip);
827 if (!error) 830 if (!error)
828 error = error2; 831 error = error2;
829 } 832 }
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index ecbf8b4d2e2e..3323826274b3 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -82,7 +82,6 @@ xfs_inode_alloc(
82 memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); 82 memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
83 ip->i_flags = 0; 83 ip->i_flags = 0;
84 ip->i_update_core = 0; 84 ip->i_update_core = 0;
85 ip->i_update_size = 0;
86 ip->i_delayed_blks = 0; 85 ip->i_delayed_blks = 0;
87 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); 86 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
88 ip->i_size = 0; 87 ip->i_size = 0;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 00e8505bde2d..ed566c248ae4 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -261,7 +261,6 @@ typedef struct xfs_inode {
261 /* Miscellaneous state. */ 261 /* Miscellaneous state. */
262 unsigned short i_flags; /* see defined flags below */ 262 unsigned short i_flags; /* see defined flags below */
263 unsigned char i_update_core; /* timestamps/size is dirty */ 263 unsigned char i_update_core; /* timestamps/size is dirty */
264 unsigned char i_update_size; /* di_size field is dirty */
265 unsigned int i_delayed_blks; /* count of delay alloc blks */ 264 unsigned int i_delayed_blks; /* count of delay alloc blks */
266 265
267 xfs_icdinode_t i_d; /* most of ondisk inode */ 266 xfs_icdinode_t i_d; /* most of ondisk inode */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 977c4aec587e..2e69412195e6 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -263,14 +263,6 @@ xfs_inode_item_format(
263 } 263 }
264 264
265 /* 265 /*
266 * We don't have to worry about re-ordering here because
267 * the update_size field is protected by the inode lock
268 * and we have that held in exclusive mode.
269 */
270 if (ip->i_update_size)
271 ip->i_update_size = 0;
272
273 /*
274 * Make sure to get the latest atime from the Linux inode. 266 * Make sure to get the latest atime from the Linux inode.
275 */ 267 */
276 xfs_synchronize_atime(ip); 268 xfs_synchronize_atime(ip);
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index fea68615ed23..3f816ad7ff19 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -88,90 +88,6 @@ xfs_write_clear_setuid(
88} 88}
89 89
90/* 90/*
91 * Handle logging requirements of various synchronous types of write.
92 */
93int
94xfs_write_sync_logforce(
95 xfs_mount_t *mp,
96 xfs_inode_t *ip)
97{
98 int error = 0;
99
100 /*
101 * If we're treating this as O_DSYNC and we have not updated the
102 * size, force the log.
103 */
104 if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
105 !(ip->i_update_size)) {
106 xfs_inode_log_item_t *iip = ip->i_itemp;
107
108 /*
109 * If an allocation transaction occurred
110 * without extending the size, then we have to force
111 * the log up the proper point to ensure that the
112 * allocation is permanent. We can't count on
113 * the fact that buffered writes lock out direct I/O
114 * writes - the direct I/O write could have extended
115 * the size nontransactionally, then finished before
116 * we started. xfs_write_file will think that the file
117 * didn't grow but the update isn't safe unless the
118 * size change is logged.
119 *
120 * Force the log if we've committed a transaction
121 * against the inode or if someone else has and
122 * the commit record hasn't gone to disk (e.g.
123 * the inode is pinned). This guarantees that
124 * all changes affecting the inode are permanent
125 * when we return.
126 */
127 if (iip && iip->ili_last_lsn) {
128 error = _xfs_log_force(mp, iip->ili_last_lsn,
129 XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
130 } else if (xfs_ipincount(ip) > 0) {
131 error = _xfs_log_force(mp, (xfs_lsn_t)0,
132 XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
133 }
134
135 } else {
136 xfs_trans_t *tp;
137
138 /*
139 * O_SYNC or O_DSYNC _with_ a size update are handled
140 * the same way.
141 *
142 * If the write was synchronous then we need to make
143 * sure that the inode modification time is permanent.
144 * We'll have updated the timestamp above, so here
145 * we use a synchronous transaction to log the inode.
146 * It's not fast, but it's necessary.
147 *
148 * If this a dsync write and the size got changed
149 * non-transactionally, then we need to ensure that
150 * the size change gets logged in a synchronous
151 * transaction.
152 */
153 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
154 if ((error = xfs_trans_reserve(tp, 0,
155 XFS_SWRITE_LOG_RES(mp),
156 0, 0, 0))) {
157 /* Transaction reserve failed */
158 xfs_trans_cancel(tp, 0);
159 } else {
160 /* Transaction reserve successful */
161 xfs_ilock(ip, XFS_ILOCK_EXCL);
162 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
163 xfs_trans_ihold(tp, ip);
164 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
165 xfs_trans_set_sync(tp);
166 error = xfs_trans_commit(tp, 0);
167 xfs_iunlock(ip, XFS_ILOCK_EXCL);
168 }
169 }
170
171 return error;
172}
173
174/*
175 * Force a shutdown of the filesystem instantly while keeping 91 * Force a shutdown of the filesystem instantly while keeping
176 * the filesystem consistent. We don't do an unmount here; just shutdown 92 * the filesystem consistent. We don't do an unmount here; just shutdown
177 * the shop, make sure that absolutely nothing persistent happens to 93 * the shop, make sure that absolutely nothing persistent happens to
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index ae65f0df87da..f5e4874c37d8 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -68,7 +68,6 @@ xfs_get_extsz_hint(
68 * Prototypes for functions in xfs_rw.c. 68 * Prototypes for functions in xfs_rw.c.
69 */ 69 */
70extern int xfs_write_clear_setuid(struct xfs_inode *ip); 70extern int xfs_write_clear_setuid(struct xfs_inode *ip);
71extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip);
72extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); 71extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
73extern int xfs_bioerror(struct xfs_buf *bp); 72extern int xfs_bioerror(struct xfs_buf *bp);
74extern int xfs_bioerror_relse(struct xfs_buf *bp); 73extern int xfs_bioerror_relse(struct xfs_buf *bp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 775249a54f6f..ed47fc77759c 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -68,7 +68,7 @@ typedef struct xfs_trans_header {
68#define XFS_TRANS_GROWFS 14 68#define XFS_TRANS_GROWFS 14
69#define XFS_TRANS_STRAT_WRITE 15 69#define XFS_TRANS_STRAT_WRITE 15
70#define XFS_TRANS_DIOSTRAT 16 70#define XFS_TRANS_DIOSTRAT 16
71#define XFS_TRANS_WRITE_SYNC 17 71/* 17 was XFS_TRANS_WRITE_SYNC */
72#define XFS_TRANS_WRITEID 18 72#define XFS_TRANS_WRITEID 18
73#define XFS_TRANS_ADDAFORK 19 73#define XFS_TRANS_ADDAFORK 19
74#define XFS_TRANS_ATTRINVAL 20 74#define XFS_TRANS_ATTRINVAL 20
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index ceecafd1f9c1..03d3100559ac 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -611,7 +611,7 @@ xfs_fsync(
611 xfs_inode_t *ip) 611 xfs_inode_t *ip)
612{ 612{
613 xfs_trans_t *tp; 613 xfs_trans_t *tp;
614 int error; 614 int error = 0;
615 int log_flushed = 0, changed = 1; 615 int log_flushed = 0, changed = 1;
616 616
617 xfs_itrace_entry(ip); 617 xfs_itrace_entry(ip);
@@ -619,14 +619,9 @@ xfs_fsync(
619 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 619 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
620 return XFS_ERROR(EIO); 620 return XFS_ERROR(EIO);
621 621
622 /* capture size updates in I/O completion before writing the inode. */
623 error = xfs_wait_on_pages(ip, 0, -1);
624 if (error)
625 return XFS_ERROR(error);
626
627 /* 622 /*
628 * We always need to make sure that the required inode state is safe on 623 * We always need to make sure that the required inode state is safe on
629 * disk. The vnode might be clean but we still might need to force the 624 * disk. The inode might be clean but we still might need to force the
630 * log because of committed transactions that haven't hit the disk yet. 625 * log because of committed transactions that haven't hit the disk yet.
631 * Likewise, there could be unflushed non-transactional changes to the 626 * Likewise, there could be unflushed non-transactional changes to the
632 * inode core that have to go to disk and this requires us to issue 627 * inode core that have to go to disk and this requires us to issue
@@ -638,7 +633,7 @@ xfs_fsync(
638 */ 633 */
639 xfs_ilock(ip, XFS_ILOCK_SHARED); 634 xfs_ilock(ip, XFS_ILOCK_SHARED);
640 635
641 if (!(ip->i_update_size || ip->i_update_core)) { 636 if (!ip->i_update_core) {
642 /* 637 /*
643 * Timestamps/size haven't changed since last inode flush or 638 * Timestamps/size haven't changed since last inode flush or
644 * inode transaction commit. That means either nothing got 639 * inode transaction commit. That means either nothing got