aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs/linux-2.6
diff options
context:
space:
mode:
authorLachlan McIlroy <lachlan@sgi.com>2007-05-07 23:49:46 -0400
committerTim Shimmin <tes@sgi.com>2007-05-07 23:49:46 -0400
commitba87ea699ebd9dd577bf055ebc4a98200e337542 (patch)
tree713b7d32937372fd7c5b8647f14d0e7262fc7075 /fs/xfs/linux-2.6
parent2a32963130aec5e157b58ff7dfa3dfa1afdf7ca1 (diff)
[XFS] Fix to prevent the notorious 'NULL files' problem after a crash.
The problem that has been addressed is that of synchronising updates of the file size with writes that extend a file. Without the fix the update of a file's size, as a result of a write beyond eof, is independent of when the cached data is flushed to disk. Often the file size update would be written to the filesystem log before the data is flushed to disk. When a system crashes between these two events and the filesystem log is replayed on mount the file's size will be set but since the contents never made it to disk the file is full of holes. If some of the cached data was flushed to disk then it may just be a section of the file at the end that has holes. There are existing fixes to help alleviate this problem, particularly in the case where a file has been truncated, that force cached data to be flushed to disk when the file is closed. If the system crashes while the file(s) are still open then this flushing will never occur. The fix that we have implemented is to introduce a second file size, called the in-memory file size, that represents the current file size as viewed by the user. The existing file size, called the on-disk file size, is the one that gets written to the filesystem log and we only update it when it is safe to do so. When we write to a file beyond eof we only update the in-memory file size in the write operation. Later, when the I/O operation that flushes the cached data to disk completes, an I/O completion routine will update the on-disk file size. The on-disk file size will be updated to the maximum offset of the I/O or to the value of the in-memory file size if the I/O includes eof. SGI-PV: 958522 SGI-Modid: xfs-linux-melb:xfs-kern:28322a Signed-off-by: Lachlan McIlroy <lachlan@sgi.com> Signed-off-by: David Chinner <dgc@sgi.com> Signed-off-by: Tim Shimmin <tes@sgi.com>
Diffstat (limited to 'fs/xfs/linux-2.6')
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c89
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c91
2 files changed, 132 insertions, 48 deletions
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 143ffc851c9d..4475588e973a 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -141,9 +141,46 @@ xfs_destroy_ioend(
141} 141}
142 142
143/* 143/*
144 * Update on-disk file size now that data has been written to disk.
145 * The current in-memory file size is i_size. If a write is beyond
146 * eof io_new_size will be the intended file size until i_size is
147 * updated. If this write does not extend all the way to the valid
148 * file size then restrict this update to the end of the write.
149 */
150STATIC void
151xfs_setfilesize(
152 xfs_ioend_t *ioend)
153{
154 xfs_inode_t *ip;
155 xfs_fsize_t isize;
156 xfs_fsize_t bsize;
157
158 ip = xfs_vtoi(ioend->io_vnode);
159
160 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
161 ASSERT(ioend->io_type != IOMAP_READ);
162
163 if (unlikely(ioend->io_error))
164 return;
165
166 bsize = ioend->io_offset + ioend->io_size;
167
168 xfs_ilock(ip, XFS_ILOCK_EXCL);
169
170 isize = MAX(ip->i_size, ip->i_iocore.io_new_size);
171 isize = MIN(isize, bsize);
172
173 if (ip->i_d.di_size < isize) {
174 ip->i_d.di_size = isize;
175 ip->i_update_core = 1;
176 ip->i_update_size = 1;
177 }
178
179 xfs_iunlock(ip, XFS_ILOCK_EXCL);
180}
181
182/*
144 * Buffered IO write completion for delayed allocate extents. 183 * Buffered IO write completion for delayed allocate extents.
145 * TODO: Update ondisk isize now that we know the file data
146 * has been flushed (i.e. the notorious "NULL file" problem).
147 */ 184 */
148STATIC void 185STATIC void
149xfs_end_bio_delalloc( 186xfs_end_bio_delalloc(
@@ -152,6 +189,7 @@ xfs_end_bio_delalloc(
152 xfs_ioend_t *ioend = 189 xfs_ioend_t *ioend =
153 container_of(work, xfs_ioend_t, io_work); 190 container_of(work, xfs_ioend_t, io_work);
154 191
192 xfs_setfilesize(ioend);
155 xfs_destroy_ioend(ioend); 193 xfs_destroy_ioend(ioend);
156} 194}
157 195
@@ -165,6 +203,7 @@ xfs_end_bio_written(
165 xfs_ioend_t *ioend = 203 xfs_ioend_t *ioend =
166 container_of(work, xfs_ioend_t, io_work); 204 container_of(work, xfs_ioend_t, io_work);
167 205
206 xfs_setfilesize(ioend);
168 xfs_destroy_ioend(ioend); 207 xfs_destroy_ioend(ioend);
169} 208}
170 209
@@ -184,8 +223,23 @@ xfs_end_bio_unwritten(
184 xfs_off_t offset = ioend->io_offset; 223 xfs_off_t offset = ioend->io_offset;
185 size_t size = ioend->io_size; 224 size_t size = ioend->io_size;
186 225
187 if (likely(!ioend->io_error)) 226 if (likely(!ioend->io_error)) {
188 bhv_vop_bmap(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL); 227 bhv_vop_bmap(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL);
228 xfs_setfilesize(ioend);
229 }
230 xfs_destroy_ioend(ioend);
231}
232
233/*
234 * IO read completion for regular, written extents.
235 */
236STATIC void
237xfs_end_bio_read(
238 struct work_struct *work)
239{
240 xfs_ioend_t *ioend =
241 container_of(work, xfs_ioend_t, io_work);
242
189 xfs_destroy_ioend(ioend); 243 xfs_destroy_ioend(ioend);
190} 244}
191 245
@@ -224,6 +278,8 @@ xfs_alloc_ioend(
224 INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten); 278 INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten);
225 else if (type == IOMAP_DELAY) 279 else if (type == IOMAP_DELAY)
226 INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc); 280 INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc);
281 else if (type == IOMAP_READ)
282 INIT_WORK(&ioend->io_work, xfs_end_bio_read);
227 else 283 else
228 INIT_WORK(&ioend->io_work, xfs_end_bio_written); 284 INIT_WORK(&ioend->io_work, xfs_end_bio_written);
229 285
@@ -913,7 +969,7 @@ xfs_page_state_convert(
913 bh = head = page_buffers(page); 969 bh = head = page_buffers(page);
914 offset = page_offset(page); 970 offset = page_offset(page);
915 flags = -1; 971 flags = -1;
916 type = 0; 972 type = IOMAP_READ;
917 973
918 /* TODO: cleanup count and page_dirty */ 974 /* TODO: cleanup count and page_dirty */
919 975
@@ -999,7 +1055,7 @@ xfs_page_state_convert(
999 * That means it must already have extents allocated 1055 * That means it must already have extents allocated
1000 * underneath it. Map the extent by reading it. 1056 * underneath it. Map the extent by reading it.
1001 */ 1057 */
1002 if (!iomap_valid || type != 0) { 1058 if (!iomap_valid || type != IOMAP_READ) {
1003 flags = BMAPI_READ; 1059 flags = BMAPI_READ;
1004 size = xfs_probe_cluster(inode, page, bh, 1060 size = xfs_probe_cluster(inode, page, bh,
1005 head, 1); 1061 head, 1);
@@ -1010,7 +1066,7 @@ xfs_page_state_convert(
1010 iomap_valid = xfs_iomap_valid(&iomap, offset); 1066 iomap_valid = xfs_iomap_valid(&iomap, offset);
1011 } 1067 }
1012 1068
1013 type = 0; 1069 type = IOMAP_READ;
1014 if (!test_and_set_bit(BH_Lock, &bh->b_state)) { 1070 if (!test_and_set_bit(BH_Lock, &bh->b_state)) {
1015 ASSERT(buffer_mapped(bh)); 1071 ASSERT(buffer_mapped(bh));
1016 if (iomap_valid) 1072 if (iomap_valid)
@@ -1356,12 +1412,21 @@ xfs_end_io_direct(
1356 * completion handler in the future, in which case all this can 1412 * completion handler in the future, in which case all this can
1357 * go away. 1413 * go away.
1358 */ 1414 */
1359 if (private && size > 0) { 1415 ioend->io_offset = offset;
1360 ioend->io_offset = offset; 1416 ioend->io_size = size;
1361 ioend->io_size = size; 1417 if (ioend->io_type == IOMAP_READ) {
1418 xfs_finish_ioend(ioend);
1419 } else if (private && size > 0) {
1362 xfs_finish_ioend(ioend); 1420 xfs_finish_ioend(ioend);
1363 } else { 1421 } else {
1364 xfs_destroy_ioend(ioend); 1422 /*
1423 * A direct I/O write ioend starts its life in unwritten
1424 * state in case they map an unwritten extent. This write
1425 * didn't map an unwritten extent so switch its completion
1426 * handler.
1427 */
1428 INIT_WORK(&ioend->io_work, xfs_end_bio_written);
1429 xfs_finish_ioend(ioend);
1365 } 1430 }
1366 1431
1367 /* 1432 /*
@@ -1392,15 +1457,15 @@ xfs_vm_direct_IO(
1392 if (error) 1457 if (error)
1393 return -error; 1458 return -error;
1394 1459
1395 iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
1396
1397 if (rw == WRITE) { 1460 if (rw == WRITE) {
1461 iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
1398 ret = blockdev_direct_IO_own_locking(rw, iocb, inode, 1462 ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
1399 iomap.iomap_target->bt_bdev, 1463 iomap.iomap_target->bt_bdev,
1400 iov, offset, nr_segs, 1464 iov, offset, nr_segs,
1401 xfs_get_blocks_direct, 1465 xfs_get_blocks_direct,
1402 xfs_end_io_direct); 1466 xfs_end_io_direct);
1403 } else { 1467 } else {
1468 iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
1404 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 1469 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
1405 iomap.iomap_target->bt_bdev, 1470 iomap.iomap_target->bt_bdev,
1406 iov, offset, nr_segs, 1471 iov, offset, nr_segs,
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 80fe31233471..82ab792c7fc9 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -224,7 +224,7 @@ xfs_read(
224 mp->m_rtdev_targp : mp->m_ddev_targp; 224 mp->m_rtdev_targp : mp->m_ddev_targp;
225 if ((*offset & target->bt_smask) || 225 if ((*offset & target->bt_smask) ||
226 (size & target->bt_smask)) { 226 (size & target->bt_smask)) {
227 if (*offset == ip->i_d.di_size) { 227 if (*offset == ip->i_size) {
228 return (0); 228 return (0);
229 } 229 }
230 return -XFS_ERROR(EINVAL); 230 return -XFS_ERROR(EINVAL);
@@ -387,9 +387,10 @@ xfs_splice_write(
387{ 387{
388 xfs_inode_t *ip = XFS_BHVTOI(bdp); 388 xfs_inode_t *ip = XFS_BHVTOI(bdp);
389 xfs_mount_t *mp = ip->i_mount; 389 xfs_mount_t *mp = ip->i_mount;
390 xfs_iocore_t *io = &ip->i_iocore;
390 ssize_t ret; 391 ssize_t ret;
391 struct inode *inode = outfilp->f_mapping->host; 392 struct inode *inode = outfilp->f_mapping->host;
392 xfs_fsize_t isize; 393 xfs_fsize_t isize, new_size;
393 394
394 XFS_STATS_INC(xs_write_calls); 395 XFS_STATS_INC(xs_write_calls);
395 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 396 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -410,6 +411,14 @@ xfs_splice_write(
410 return -error; 411 return -error;
411 } 412 }
412 } 413 }
414
415 new_size = *ppos + count;
416
417 xfs_ilock(ip, XFS_ILOCK_EXCL);
418 if (new_size > ip->i_size)
419 io->io_new_size = new_size;
420 xfs_iunlock(ip, XFS_ILOCK_EXCL);
421
413 xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore, 422 xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore,
414 pipe, count, *ppos, ioflags); 423 pipe, count, *ppos, ioflags);
415 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); 424 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
@@ -420,14 +429,18 @@ xfs_splice_write(
420 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) 429 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
421 *ppos = isize; 430 *ppos = isize;
422 431
423 if (*ppos > ip->i_d.di_size) { 432 if (*ppos > ip->i_size) {
424 xfs_ilock(ip, XFS_ILOCK_EXCL); 433 xfs_ilock(ip, XFS_ILOCK_EXCL);
425 if (*ppos > ip->i_d.di_size) { 434 if (*ppos > ip->i_size)
426 ip->i_d.di_size = *ppos; 435 ip->i_size = *ppos;
427 i_size_write(inode, *ppos); 436 xfs_iunlock(ip, XFS_ILOCK_EXCL);
428 ip->i_update_core = 1; 437 }
429 ip->i_update_size = 1; 438
430 } 439 if (io->io_new_size) {
440 xfs_ilock(ip, XFS_ILOCK_EXCL);
441 io->io_new_size = 0;
442 if (ip->i_d.di_size > ip->i_size)
443 ip->i_d.di_size = ip->i_size;
431 xfs_iunlock(ip, XFS_ILOCK_EXCL); 444 xfs_iunlock(ip, XFS_ILOCK_EXCL);
432 } 445 }
433 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 446 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -711,8 +724,6 @@ start:
711 goto out_unlock_mutex; 724 goto out_unlock_mutex;
712 } 725 }
713 726
714 isize = i_size_read(inode);
715
716 if (ioflags & IO_ISDIRECT) { 727 if (ioflags & IO_ISDIRECT) {
717 xfs_buftarg_t *target = 728 xfs_buftarg_t *target =
718 (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? 729 (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
@@ -723,7 +734,7 @@ start:
723 return XFS_ERROR(-EINVAL); 734 return XFS_ERROR(-EINVAL);
724 } 735 }
725 736
726 if (!need_i_mutex && (VN_CACHED(vp) || pos > isize)) { 737 if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) {
727 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); 738 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
728 iolock = XFS_IOLOCK_EXCL; 739 iolock = XFS_IOLOCK_EXCL;
729 locktype = VRWLOCK_WRITE; 740 locktype = VRWLOCK_WRITE;
@@ -735,7 +746,7 @@ start:
735 } 746 }
736 747
737 new_size = pos + count; 748 new_size = pos + count;
738 if (new_size > isize) 749 if (new_size > xip->i_size)
739 io->io_new_size = new_size; 750 io->io_new_size = new_size;
740 751
741 if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && 752 if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
@@ -751,8 +762,7 @@ start:
751 pos, count, 762 pos, count,
752 dmflags, &locktype); 763 dmflags, &locktype);
753 if (error) { 764 if (error) {
754 xfs_iunlock(xip, iolock); 765 goto out_unlock_internal;
755 goto out_unlock_mutex;
756 } 766 }
757 xfs_ilock(xip, XFS_ILOCK_EXCL); 767 xfs_ilock(xip, XFS_ILOCK_EXCL);
758 eventsent = 1; 768 eventsent = 1;
@@ -764,9 +774,8 @@ start:
764 * event prevents another call to XFS_SEND_DATA, which is 774 * event prevents another call to XFS_SEND_DATA, which is
765 * what allows the size to change in the first place. 775 * what allows the size to change in the first place.
766 */ 776 */
767 if ((file->f_flags & O_APPEND) && savedsize != isize) { 777 if ((file->f_flags & O_APPEND) && savedsize != xip->i_size)
768 goto start; 778 goto start;
769 }
770 } 779 }
771 780
772 if (likely(!(ioflags & IO_INVIS))) { 781 if (likely(!(ioflags & IO_INVIS))) {
@@ -784,11 +793,11 @@ start:
784 * to zero it out up to the new size. 793 * to zero it out up to the new size.
785 */ 794 */
786 795
787 if (pos > isize) { 796 if (pos > xip->i_size) {
788 error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, isize); 797 error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, xip->i_size);
789 if (error) { 798 if (error) {
790 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); 799 xfs_iunlock(xip, XFS_ILOCK_EXCL);
791 goto out_unlock_mutex; 800 goto out_unlock_internal;
792 } 801 }
793 } 802 }
794 xfs_iunlock(xip, XFS_ILOCK_EXCL); 803 xfs_iunlock(xip, XFS_ILOCK_EXCL);
@@ -808,8 +817,7 @@ start:
808 if (likely(!error)) 817 if (likely(!error))
809 error = -remove_suid(file->f_path.dentry); 818 error = -remove_suid(file->f_path.dentry);
810 if (unlikely(error)) { 819 if (unlikely(error)) {
811 xfs_iunlock(xip, iolock); 820 goto out_unlock_internal;
812 goto out_unlock_mutex;
813 } 821 }
814 } 822 }
815 823
@@ -879,12 +887,12 @@ retry:
879 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, 887 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
880 DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, 888 DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
881 0, 0, 0); /* Delay flag intentionally unused */ 889 0, 0, 0); /* Delay flag intentionally unused */
882 if (error)
883 goto out_nounlocks;
884 if (need_i_mutex) 890 if (need_i_mutex)
885 mutex_lock(&inode->i_mutex); 891 mutex_lock(&inode->i_mutex);
886 xfs_rwlock(bdp, locktype); 892 xfs_rwlock(bdp, locktype);
887 pos = xip->i_d.di_size; 893 if (error)
894 goto out_unlock_internal;
895 pos = xip->i_size;
888 ret = 0; 896 ret = 0;
889 goto retry; 897 goto retry;
890 } 898 }
@@ -893,14 +901,10 @@ retry:
893 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) 901 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
894 *offset = isize; 902 *offset = isize;
895 903
896 if (*offset > xip->i_d.di_size) { 904 if (*offset > xip->i_size) {
897 xfs_ilock(xip, XFS_ILOCK_EXCL); 905 xfs_ilock(xip, XFS_ILOCK_EXCL);
898 if (*offset > xip->i_d.di_size) { 906 if (*offset > xip->i_size)
899 xip->i_d.di_size = *offset; 907 xip->i_size = *offset;
900 i_size_write(inode, *offset);
901 xip->i_update_core = 1;
902 xip->i_update_size = 1;
903 }
904 xfs_iunlock(xip, XFS_ILOCK_EXCL); 908 xfs_iunlock(xip, XFS_ILOCK_EXCL);
905 } 909 }
906 910
@@ -922,16 +926,31 @@ retry:
922 926
923 error = sync_page_range(inode, mapping, pos, ret); 927 error = sync_page_range(inode, mapping, pos, ret);
924 if (!error) 928 if (!error)
925 error = ret; 929 error = -ret;
926 return error; 930 if (need_i_mutex)
931 mutex_lock(&inode->i_mutex);
932 xfs_rwlock(bdp, locktype);
927 } 933 }
928 934
929 out_unlock_internal: 935 out_unlock_internal:
936 if (io->io_new_size) {
937 xfs_ilock(xip, XFS_ILOCK_EXCL);
938 io->io_new_size = 0;
939 /*
940 * If this was a direct or synchronous I/O that failed (such
941 * as ENOSPC) then part of the I/O may have been written to
942 * disk before the error occured. In this case the on-disk
943 * file size may have been adjusted beyond the in-memory file
944 * size and now needs to be truncated back.
945 */
946 if (xip->i_d.di_size > xip->i_size)
947 xip->i_d.di_size = xip->i_size;
948 xfs_iunlock(xip, XFS_ILOCK_EXCL);
949 }
930 xfs_rwunlock(bdp, locktype); 950 xfs_rwunlock(bdp, locktype);
931 out_unlock_mutex: 951 out_unlock_mutex:
932 if (need_i_mutex) 952 if (need_i_mutex)
933 mutex_unlock(&inode->i_mutex); 953 mutex_unlock(&inode->i_mutex);
934 out_nounlocks:
935 return -error; 954 return -error;
936} 955}
937 956