aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs/linux-2.6/xfs_aops.c
diff options
context:
space:
mode:
authorLachlan McIlroy <lachlan@sgi.com>2007-05-07 23:49:46 -0400
committerTim Shimmin <tes@sgi.com>2007-05-07 23:49:46 -0400
commitba87ea699ebd9dd577bf055ebc4a98200e337542 (patch)
tree713b7d32937372fd7c5b8647f14d0e7262fc7075 /fs/xfs/linux-2.6/xfs_aops.c
parent2a32963130aec5e157b58ff7dfa3dfa1afdf7ca1 (diff)
[XFS] Fix to prevent the notorious 'NULL files' problem after a crash.
The problem that has been addressed is that of synchronising updates of the file size with writes that extend a file. Without the fix, the update of a file's size, as a result of a write beyond eof, is independent of when the cached data is flushed to disk. Often the file size update would be written to the filesystem log before the data is flushed to disk. When a system crashes between these two events and the filesystem log is replayed on mount, the file's size will be set, but since the contents never made it to disk the file is full of holes. If some of the cached data was flushed to disk then it may just be a section of the file at the end that has holes. There are existing fixes to help alleviate this problem, particularly in the case where a file has been truncated, that force cached data to be flushed to disk when the file is closed. If the system crashes while the file(s) are still open then this flushing will never occur. The fix that we have implemented is to introduce a second file size, called the in-memory file size, that represents the current file size as viewed by the user. The existing file size, called the on-disk file size, is the one that gets written to the filesystem log and we only update it when it is safe to do so. When we write to a file beyond eof we only update the in-memory file size in the write operation. Later, when the I/O operation that flushes the cached data to disk completes, an I/O completion routine will update the on-disk file size. The on-disk file size will be updated to the maximum offset of the I/O or to the value of the in-memory file size if the I/O includes eof. SGI-PV: 958522 SGI-Modid: xfs-linux-melb:xfs-kern:28322a Signed-off-by: Lachlan McIlroy <lachlan@sgi.com> Signed-off-by: David Chinner <dgc@sgi.com> Signed-off-by: Tim Shimmin <tes@sgi.com>
Diffstat (limited to 'fs/xfs/linux-2.6/xfs_aops.c')
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c89
1 files changed, 77 insertions, 12 deletions
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 143ffc851c9d..4475588e973a 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -141,9 +141,46 @@ xfs_destroy_ioend(
141} 141}
142 142
143/* 143/*
144 * Update on-disk file size now that data has been written to disk.
145 * The current in-memory file size is i_size. If a write is beyond
146 * eof io_new_size will be the intended file size until i_size is
147 * updated. If this write does not extend all the way to the valid
148 * file size then restrict this update to the end of the write.
149 */
150STATIC void
151xfs_setfilesize(
152 xfs_ioend_t *ioend)
153{
154 xfs_inode_t *ip;
155 xfs_fsize_t isize;
156 xfs_fsize_t bsize;
157
158 ip = xfs_vtoi(ioend->io_vnode);
159
160 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
161 ASSERT(ioend->io_type != IOMAP_READ);
162
163 if (unlikely(ioend->io_error))
164 return;
165
166 bsize = ioend->io_offset + ioend->io_size;
167
168 xfs_ilock(ip, XFS_ILOCK_EXCL);
169
170 isize = MAX(ip->i_size, ip->i_iocore.io_new_size);
171 isize = MIN(isize, bsize);
172
173 if (ip->i_d.di_size < isize) {
174 ip->i_d.di_size = isize;
175 ip->i_update_core = 1;
176 ip->i_update_size = 1;
177 }
178
179 xfs_iunlock(ip, XFS_ILOCK_EXCL);
180}
181
182/*
144 * Buffered IO write completion for delayed allocate extents. 183 * Buffered IO write completion for delayed allocate extents.
145 * TODO: Update ondisk isize now that we know the file data
146 * has been flushed (i.e. the notorious "NULL file" problem).
147 */ 184 */
148STATIC void 185STATIC void
149xfs_end_bio_delalloc( 186xfs_end_bio_delalloc(
@@ -152,6 +189,7 @@ xfs_end_bio_delalloc(
152 xfs_ioend_t *ioend = 189 xfs_ioend_t *ioend =
153 container_of(work, xfs_ioend_t, io_work); 190 container_of(work, xfs_ioend_t, io_work);
154 191
192 xfs_setfilesize(ioend);
155 xfs_destroy_ioend(ioend); 193 xfs_destroy_ioend(ioend);
156} 194}
157 195
@@ -165,6 +203,7 @@ xfs_end_bio_written(
165 xfs_ioend_t *ioend = 203 xfs_ioend_t *ioend =
166 container_of(work, xfs_ioend_t, io_work); 204 container_of(work, xfs_ioend_t, io_work);
167 205
206 xfs_setfilesize(ioend);
168 xfs_destroy_ioend(ioend); 207 xfs_destroy_ioend(ioend);
169} 208}
170 209
@@ -184,8 +223,23 @@ xfs_end_bio_unwritten(
184 xfs_off_t offset = ioend->io_offset; 223 xfs_off_t offset = ioend->io_offset;
185 size_t size = ioend->io_size; 224 size_t size = ioend->io_size;
186 225
187 if (likely(!ioend->io_error)) 226 if (likely(!ioend->io_error)) {
188 bhv_vop_bmap(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL); 227 bhv_vop_bmap(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL);
228 xfs_setfilesize(ioend);
229 }
230 xfs_destroy_ioend(ioend);
231}
232
233/*
234 * IO read completion for regular, written extents.
235 */
236STATIC void
237xfs_end_bio_read(
238 struct work_struct *work)
239{
240 xfs_ioend_t *ioend =
241 container_of(work, xfs_ioend_t, io_work);
242
189 xfs_destroy_ioend(ioend); 243 xfs_destroy_ioend(ioend);
190} 244}
191 245
@@ -224,6 +278,8 @@ xfs_alloc_ioend(
224 INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten); 278 INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten);
225 else if (type == IOMAP_DELAY) 279 else if (type == IOMAP_DELAY)
226 INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc); 280 INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc);
281 else if (type == IOMAP_READ)
282 INIT_WORK(&ioend->io_work, xfs_end_bio_read);
227 else 283 else
228 INIT_WORK(&ioend->io_work, xfs_end_bio_written); 284 INIT_WORK(&ioend->io_work, xfs_end_bio_written);
229 285
@@ -913,7 +969,7 @@ xfs_page_state_convert(
913 bh = head = page_buffers(page); 969 bh = head = page_buffers(page);
914 offset = page_offset(page); 970 offset = page_offset(page);
915 flags = -1; 971 flags = -1;
916 type = 0; 972 type = IOMAP_READ;
917 973
918 /* TODO: cleanup count and page_dirty */ 974 /* TODO: cleanup count and page_dirty */
919 975
@@ -999,7 +1055,7 @@ xfs_page_state_convert(
999 * That means it must already have extents allocated 1055 * That means it must already have extents allocated
1000 * underneath it. Map the extent by reading it. 1056 * underneath it. Map the extent by reading it.
1001 */ 1057 */
1002 if (!iomap_valid || type != 0) { 1058 if (!iomap_valid || type != IOMAP_READ) {
1003 flags = BMAPI_READ; 1059 flags = BMAPI_READ;
1004 size = xfs_probe_cluster(inode, page, bh, 1060 size = xfs_probe_cluster(inode, page, bh,
1005 head, 1); 1061 head, 1);
@@ -1010,7 +1066,7 @@ xfs_page_state_convert(
1010 iomap_valid = xfs_iomap_valid(&iomap, offset); 1066 iomap_valid = xfs_iomap_valid(&iomap, offset);
1011 } 1067 }
1012 1068
1013 type = 0; 1069 type = IOMAP_READ;
1014 if (!test_and_set_bit(BH_Lock, &bh->b_state)) { 1070 if (!test_and_set_bit(BH_Lock, &bh->b_state)) {
1015 ASSERT(buffer_mapped(bh)); 1071 ASSERT(buffer_mapped(bh));
1016 if (iomap_valid) 1072 if (iomap_valid)
@@ -1356,12 +1412,21 @@ xfs_end_io_direct(
1356 * completion handler in the future, in which case all this can 1412 * completion handler in the future, in which case all this can
1357 * go away. 1413 * go away.
1358 */ 1414 */
1359 if (private && size > 0) { 1415 ioend->io_offset = offset;
1360 ioend->io_offset = offset; 1416 ioend->io_size = size;
1361 ioend->io_size = size; 1417 if (ioend->io_type == IOMAP_READ) {
1418 xfs_finish_ioend(ioend);
1419 } else if (private && size > 0) {
1362 xfs_finish_ioend(ioend); 1420 xfs_finish_ioend(ioend);
1363 } else { 1421 } else {
1364 xfs_destroy_ioend(ioend); 1422 /*
1423 * A direct I/O write ioend starts its life in unwritten
1424 * state in case they map an unwritten extent. This write
1425 * didn't map an unwritten extent so switch its completion
1426 * handler.
1427 */
1428 INIT_WORK(&ioend->io_work, xfs_end_bio_written);
1429 xfs_finish_ioend(ioend);
1365 } 1430 }
1366 1431
1367 /* 1432 /*
@@ -1392,15 +1457,15 @@ xfs_vm_direct_IO(
1392 if (error) 1457 if (error)
1393 return -error; 1458 return -error;
1394 1459
1395 iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
1396
1397 if (rw == WRITE) { 1460 if (rw == WRITE) {
1461 iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
1398 ret = blockdev_direct_IO_own_locking(rw, iocb, inode, 1462 ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
1399 iomap.iomap_target->bt_bdev, 1463 iomap.iomap_target->bt_bdev,
1400 iov, offset, nr_segs, 1464 iov, offset, nr_segs,
1401 xfs_get_blocks_direct, 1465 xfs_get_blocks_direct,
1402 xfs_end_io_direct); 1466 xfs_end_io_direct);
1403 } else { 1467 } else {
1468 iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
1404 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 1469 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
1405 iomap.iomap_target->bt_bdev, 1470 iomap.iomap_target->bt_bdev,
1406 iov, offset, nr_segs, 1471 iov, offset, nr_segs,