aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs/linux-2.6/xfs_lrw.c
diff options
context:
space:
mode:
authorLachlan McIlroy <lachlan@sgi.com>2007-05-07 23:49:46 -0400
committerTim Shimmin <tes@sgi.com>2007-05-07 23:49:46 -0400
commitba87ea699ebd9dd577bf055ebc4a98200e337542 (patch)
tree713b7d32937372fd7c5b8647f14d0e7262fc7075 /fs/xfs/linux-2.6/xfs_lrw.c
parent2a32963130aec5e157b58ff7dfa3dfa1afdf7ca1 (diff)
[XFS] Fix to prevent the notorious 'NULL files' problem after a crash.
The problem that has been addressed is that of synchronising updates of the file size with writes that extend a file. Without the fix, the update of a file's size, as a result of a write beyond eof, is independent of when the cached data is flushed to disk. Often the file size update would be written to the filesystem log before the data is flushed to disk. When a system crashes between these two events and the filesystem log is replayed on mount, the file's size will be set, but since the contents never made it to disk the file is full of holes. If some of the cached data was flushed to disk then it may just be a section of the file at the end that has holes. There are existing fixes to help alleviate this problem, particularly in the case where a file has been truncated, that force cached data to be flushed to disk when the file is closed. If the system crashes while the file(s) are still open then this flushing will never occur. The fix that we have implemented is to introduce a second file size, called the in-memory file size, that represents the current file size as viewed by the user. The existing file size, called the on-disk file size, is the one that gets written to the filesystem log, and we only update it when it is safe to do so. When we write to a file beyond eof we only update the in-memory file size in the write operation. Later, when the I/O operation that flushes the cached data to disk completes, an I/O completion routine will update the on-disk file size. The on-disk file size will be updated to the maximum offset of the I/O or to the value of the in-memory file size if the I/O includes eof. SGI-PV: 958522 SGI-Modid: xfs-linux-melb:xfs-kern:28322a Signed-off-by: Lachlan McIlroy <lachlan@sgi.com> Signed-off-by: David Chinner <dgc@sgi.com> Signed-off-by: Tim Shimmin <tes@sgi.com>
Diffstat (limited to 'fs/xfs/linux-2.6/xfs_lrw.c')
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c91
1 files changed, 55 insertions, 36 deletions
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 80fe31233471..82ab792c7fc9 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -224,7 +224,7 @@ xfs_read(
224 mp->m_rtdev_targp : mp->m_ddev_targp; 224 mp->m_rtdev_targp : mp->m_ddev_targp;
225 if ((*offset & target->bt_smask) || 225 if ((*offset & target->bt_smask) ||
226 (size & target->bt_smask)) { 226 (size & target->bt_smask)) {
227 if (*offset == ip->i_d.di_size) { 227 if (*offset == ip->i_size) {
228 return (0); 228 return (0);
229 } 229 }
230 return -XFS_ERROR(EINVAL); 230 return -XFS_ERROR(EINVAL);
@@ -387,9 +387,10 @@ xfs_splice_write(
387{ 387{
388 xfs_inode_t *ip = XFS_BHVTOI(bdp); 388 xfs_inode_t *ip = XFS_BHVTOI(bdp);
389 xfs_mount_t *mp = ip->i_mount; 389 xfs_mount_t *mp = ip->i_mount;
390 xfs_iocore_t *io = &ip->i_iocore;
390 ssize_t ret; 391 ssize_t ret;
391 struct inode *inode = outfilp->f_mapping->host; 392 struct inode *inode = outfilp->f_mapping->host;
392 xfs_fsize_t isize; 393 xfs_fsize_t isize, new_size;
393 394
394 XFS_STATS_INC(xs_write_calls); 395 XFS_STATS_INC(xs_write_calls);
395 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 396 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -410,6 +411,14 @@ xfs_splice_write(
410 return -error; 411 return -error;
411 } 412 }
412 } 413 }
414
415 new_size = *ppos + count;
416
417 xfs_ilock(ip, XFS_ILOCK_EXCL);
418 if (new_size > ip->i_size)
419 io->io_new_size = new_size;
420 xfs_iunlock(ip, XFS_ILOCK_EXCL);
421
413 xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore, 422 xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore,
414 pipe, count, *ppos, ioflags); 423 pipe, count, *ppos, ioflags);
415 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); 424 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
@@ -420,14 +429,18 @@ xfs_splice_write(
420 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) 429 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
421 *ppos = isize; 430 *ppos = isize;
422 431
423 if (*ppos > ip->i_d.di_size) { 432 if (*ppos > ip->i_size) {
424 xfs_ilock(ip, XFS_ILOCK_EXCL); 433 xfs_ilock(ip, XFS_ILOCK_EXCL);
425 if (*ppos > ip->i_d.di_size) { 434 if (*ppos > ip->i_size)
426 ip->i_d.di_size = *ppos; 435 ip->i_size = *ppos;
427 i_size_write(inode, *ppos); 436 xfs_iunlock(ip, XFS_ILOCK_EXCL);
428 ip->i_update_core = 1; 437 }
429 ip->i_update_size = 1; 438
430 } 439 if (io->io_new_size) {
440 xfs_ilock(ip, XFS_ILOCK_EXCL);
441 io->io_new_size = 0;
442 if (ip->i_d.di_size > ip->i_size)
443 ip->i_d.di_size = ip->i_size;
431 xfs_iunlock(ip, XFS_ILOCK_EXCL); 444 xfs_iunlock(ip, XFS_ILOCK_EXCL);
432 } 445 }
433 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 446 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -711,8 +724,6 @@ start:
711 goto out_unlock_mutex; 724 goto out_unlock_mutex;
712 } 725 }
713 726
714 isize = i_size_read(inode);
715
716 if (ioflags & IO_ISDIRECT) { 727 if (ioflags & IO_ISDIRECT) {
717 xfs_buftarg_t *target = 728 xfs_buftarg_t *target =
718 (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? 729 (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
@@ -723,7 +734,7 @@ start:
723 return XFS_ERROR(-EINVAL); 734 return XFS_ERROR(-EINVAL);
724 } 735 }
725 736
726 if (!need_i_mutex && (VN_CACHED(vp) || pos > isize)) { 737 if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) {
727 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); 738 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
728 iolock = XFS_IOLOCK_EXCL; 739 iolock = XFS_IOLOCK_EXCL;
729 locktype = VRWLOCK_WRITE; 740 locktype = VRWLOCK_WRITE;
@@ -735,7 +746,7 @@ start:
735 } 746 }
736 747
737 new_size = pos + count; 748 new_size = pos + count;
738 if (new_size > isize) 749 if (new_size > xip->i_size)
739 io->io_new_size = new_size; 750 io->io_new_size = new_size;
740 751
741 if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && 752 if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
@@ -751,8 +762,7 @@ start:
751 pos, count, 762 pos, count,
752 dmflags, &locktype); 763 dmflags, &locktype);
753 if (error) { 764 if (error) {
754 xfs_iunlock(xip, iolock); 765 goto out_unlock_internal;
755 goto out_unlock_mutex;
756 } 766 }
757 xfs_ilock(xip, XFS_ILOCK_EXCL); 767 xfs_ilock(xip, XFS_ILOCK_EXCL);
758 eventsent = 1; 768 eventsent = 1;
@@ -764,9 +774,8 @@ start:
764 * event prevents another call to XFS_SEND_DATA, which is 774 * event prevents another call to XFS_SEND_DATA, which is
765 * what allows the size to change in the first place. 775 * what allows the size to change in the first place.
766 */ 776 */
767 if ((file->f_flags & O_APPEND) && savedsize != isize) { 777 if ((file->f_flags & O_APPEND) && savedsize != xip->i_size)
768 goto start; 778 goto start;
769 }
770 } 779 }
771 780
772 if (likely(!(ioflags & IO_INVIS))) { 781 if (likely(!(ioflags & IO_INVIS))) {
@@ -784,11 +793,11 @@ start:
784 * to zero it out up to the new size. 793 * to zero it out up to the new size.
785 */ 794 */
786 795
787 if (pos > isize) { 796 if (pos > xip->i_size) {
788 error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, isize); 797 error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, xip->i_size);
789 if (error) { 798 if (error) {
790 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); 799 xfs_iunlock(xip, XFS_ILOCK_EXCL);
791 goto out_unlock_mutex; 800 goto out_unlock_internal;
792 } 801 }
793 } 802 }
794 xfs_iunlock(xip, XFS_ILOCK_EXCL); 803 xfs_iunlock(xip, XFS_ILOCK_EXCL);
@@ -808,8 +817,7 @@ start:
808 if (likely(!error)) 817 if (likely(!error))
809 error = -remove_suid(file->f_path.dentry); 818 error = -remove_suid(file->f_path.dentry);
810 if (unlikely(error)) { 819 if (unlikely(error)) {
811 xfs_iunlock(xip, iolock); 820 goto out_unlock_internal;
812 goto out_unlock_mutex;
813 } 821 }
814 } 822 }
815 823
@@ -879,12 +887,12 @@ retry:
879 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, 887 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
880 DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, 888 DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
881 0, 0, 0); /* Delay flag intentionally unused */ 889 0, 0, 0); /* Delay flag intentionally unused */
882 if (error)
883 goto out_nounlocks;
884 if (need_i_mutex) 890 if (need_i_mutex)
885 mutex_lock(&inode->i_mutex); 891 mutex_lock(&inode->i_mutex);
886 xfs_rwlock(bdp, locktype); 892 xfs_rwlock(bdp, locktype);
887 pos = xip->i_d.di_size; 893 if (error)
894 goto out_unlock_internal;
895 pos = xip->i_size;
888 ret = 0; 896 ret = 0;
889 goto retry; 897 goto retry;
890 } 898 }
@@ -893,14 +901,10 @@ retry:
893 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) 901 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
894 *offset = isize; 902 *offset = isize;
895 903
896 if (*offset > xip->i_d.di_size) { 904 if (*offset > xip->i_size) {
897 xfs_ilock(xip, XFS_ILOCK_EXCL); 905 xfs_ilock(xip, XFS_ILOCK_EXCL);
898 if (*offset > xip->i_d.di_size) { 906 if (*offset > xip->i_size)
899 xip->i_d.di_size = *offset; 907 xip->i_size = *offset;
900 i_size_write(inode, *offset);
901 xip->i_update_core = 1;
902 xip->i_update_size = 1;
903 }
904 xfs_iunlock(xip, XFS_ILOCK_EXCL); 908 xfs_iunlock(xip, XFS_ILOCK_EXCL);
905 } 909 }
906 910
@@ -922,16 +926,31 @@ retry:
922 926
923 error = sync_page_range(inode, mapping, pos, ret); 927 error = sync_page_range(inode, mapping, pos, ret);
924 if (!error) 928 if (!error)
925 error = ret; 929 error = -ret;
926 return error; 930 if (need_i_mutex)
931 mutex_lock(&inode->i_mutex);
932 xfs_rwlock(bdp, locktype);
927 } 933 }
928 934
929 out_unlock_internal: 935 out_unlock_internal:
936 if (io->io_new_size) {
937 xfs_ilock(xip, XFS_ILOCK_EXCL);
938 io->io_new_size = 0;
939 /*
940 * If this was a direct or synchronous I/O that failed (such
941 * as ENOSPC) then part of the I/O may have been written to
942 * disk before the error occurred. In this case the on-disk
943 * file size may have been adjusted beyond the in-memory file
944 * size and now needs to be truncated back.
945 */
946 if (xip->i_d.di_size > xip->i_size)
947 xip->i_d.di_size = xip->i_size;
948 xfs_iunlock(xip, XFS_ILOCK_EXCL);
949 }
930 xfs_rwunlock(bdp, locktype); 950 xfs_rwunlock(bdp, locktype);
931 out_unlock_mutex: 951 out_unlock_mutex:
932 if (need_i_mutex) 952 if (need_i_mutex)
933 mutex_unlock(&inode->i_mutex); 953 mutex_unlock(&inode->i_mutex);
934 out_nounlocks:
935 return -error; 954 return -error;
936} 955}
937 956