diff options
author | Lachlan McIlroy <lachlan@sgi.com> | 2007-05-07 23:49:46 -0400 |
---|---|---|
committer | Tim Shimmin <tes@sgi.com> | 2007-05-07 23:49:46 -0400 |
commit | ba87ea699ebd9dd577bf055ebc4a98200e337542 (patch) | |
tree | 713b7d32937372fd7c5b8647f14d0e7262fc7075 /fs/xfs/linux-2.6/xfs_lrw.c | |
parent | 2a32963130aec5e157b58ff7dfa3dfa1afdf7ca1 (diff) |
[XFS] Fix to prevent the notorious 'NULL files' problem after a crash.
The problem that has been addressed is that of synchronising updates of
the file size with writes that extend a file. Without the fix the update
of a file's size, as a result of a write beyond eof, is independent of
when the cached data is flushed to disk. Often the file size update would
be written to the filesystem log before the data is flushed to disk. When
a system crashes between these two events and the filesystem log is
replayed on mount, the file's size will be set, but since the contents never
made it to disk, the file is full of holes. If some of the cached data was
flushed to disk then it may just be a section of the file at the end that
has holes.
There are existing fixes to help alleviate this problem, particularly in
the case where a file has been truncated, that force cached data to be
flushed to disk when the file is closed. If the system crashes while the
file(s) are still open then this flushing will never occur.
The fix that we have implemented is to introduce a second file size,
called the in-memory file size, that represents the current file size as
viewed by the user. The existing file size, called the on-disk file size,
is the one that gets written to the filesystem log and we only update it
when it is safe to do so. When we write to a file beyond eof we only
update the in-memory file size in the write operation. Later, when the I/O
operation that flushes the cached data to disk completes, an I/O
completion routine will update the on-disk file size. The on-disk file
size will be updated to the maximum offset of the I/O or to the value of
the in-memory file size if the I/O includes eof.
SGI-PV: 958522
SGI-Modid: xfs-linux-melb:xfs-kern:28322a
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
Diffstat (limited to 'fs/xfs/linux-2.6/xfs_lrw.c')
-rw-r--r-- | fs/xfs/linux-2.6/xfs_lrw.c | 91 |
1 files changed, 55 insertions, 36 deletions
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 80fe31233471..82ab792c7fc9 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c | |||
@@ -224,7 +224,7 @@ xfs_read( | |||
224 | mp->m_rtdev_targp : mp->m_ddev_targp; | 224 | mp->m_rtdev_targp : mp->m_ddev_targp; |
225 | if ((*offset & target->bt_smask) || | 225 | if ((*offset & target->bt_smask) || |
226 | (size & target->bt_smask)) { | 226 | (size & target->bt_smask)) { |
227 | if (*offset == ip->i_d.di_size) { | 227 | if (*offset == ip->i_size) { |
228 | return (0); | 228 | return (0); |
229 | } | 229 | } |
230 | return -XFS_ERROR(EINVAL); | 230 | return -XFS_ERROR(EINVAL); |
@@ -387,9 +387,10 @@ xfs_splice_write( | |||
387 | { | 387 | { |
388 | xfs_inode_t *ip = XFS_BHVTOI(bdp); | 388 | xfs_inode_t *ip = XFS_BHVTOI(bdp); |
389 | xfs_mount_t *mp = ip->i_mount; | 389 | xfs_mount_t *mp = ip->i_mount; |
390 | xfs_iocore_t *io = &ip->i_iocore; | ||
390 | ssize_t ret; | 391 | ssize_t ret; |
391 | struct inode *inode = outfilp->f_mapping->host; | 392 | struct inode *inode = outfilp->f_mapping->host; |
392 | xfs_fsize_t isize; | 393 | xfs_fsize_t isize, new_size; |
393 | 394 | ||
394 | XFS_STATS_INC(xs_write_calls); | 395 | XFS_STATS_INC(xs_write_calls); |
395 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) | 396 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) |
@@ -410,6 +411,14 @@ xfs_splice_write( | |||
410 | return -error; | 411 | return -error; |
411 | } | 412 | } |
412 | } | 413 | } |
414 | |||
415 | new_size = *ppos + count; | ||
416 | |||
417 | xfs_ilock(ip, XFS_ILOCK_EXCL); | ||
418 | if (new_size > ip->i_size) | ||
419 | io->io_new_size = new_size; | ||
420 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
421 | |||
413 | xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore, | 422 | xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore, |
414 | pipe, count, *ppos, ioflags); | 423 | pipe, count, *ppos, ioflags); |
415 | ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); | 424 | ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); |
@@ -420,14 +429,18 @@ xfs_splice_write( | |||
420 | if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) | 429 | if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) |
421 | *ppos = isize; | 430 | *ppos = isize; |
422 | 431 | ||
423 | if (*ppos > ip->i_d.di_size) { | 432 | if (*ppos > ip->i_size) { |
424 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 433 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
425 | if (*ppos > ip->i_d.di_size) { | 434 | if (*ppos > ip->i_size) |
426 | ip->i_d.di_size = *ppos; | 435 | ip->i_size = *ppos; |
427 | i_size_write(inode, *ppos); | 436 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
428 | ip->i_update_core = 1; | 437 | } |
429 | ip->i_update_size = 1; | 438 | |
430 | } | 439 | if (io->io_new_size) { |
440 | xfs_ilock(ip, XFS_ILOCK_EXCL); | ||
441 | io->io_new_size = 0; | ||
442 | if (ip->i_d.di_size > ip->i_size) | ||
443 | ip->i_d.di_size = ip->i_size; | ||
431 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 444 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
432 | } | 445 | } |
433 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); | 446 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); |
@@ -711,8 +724,6 @@ start: | |||
711 | goto out_unlock_mutex; | 724 | goto out_unlock_mutex; |
712 | } | 725 | } |
713 | 726 | ||
714 | isize = i_size_read(inode); | ||
715 | |||
716 | if (ioflags & IO_ISDIRECT) { | 727 | if (ioflags & IO_ISDIRECT) { |
717 | xfs_buftarg_t *target = | 728 | xfs_buftarg_t *target = |
718 | (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? | 729 | (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? |
@@ -723,7 +734,7 @@ start: | |||
723 | return XFS_ERROR(-EINVAL); | 734 | return XFS_ERROR(-EINVAL); |
724 | } | 735 | } |
725 | 736 | ||
726 | if (!need_i_mutex && (VN_CACHED(vp) || pos > isize)) { | 737 | if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) { |
727 | xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); | 738 | xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); |
728 | iolock = XFS_IOLOCK_EXCL; | 739 | iolock = XFS_IOLOCK_EXCL; |
729 | locktype = VRWLOCK_WRITE; | 740 | locktype = VRWLOCK_WRITE; |
@@ -735,7 +746,7 @@ start: | |||
735 | } | 746 | } |
736 | 747 | ||
737 | new_size = pos + count; | 748 | new_size = pos + count; |
738 | if (new_size > isize) | 749 | if (new_size > xip->i_size) |
739 | io->io_new_size = new_size; | 750 | io->io_new_size = new_size; |
740 | 751 | ||
741 | if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && | 752 | if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && |
@@ -751,8 +762,7 @@ start: | |||
751 | pos, count, | 762 | pos, count, |
752 | dmflags, &locktype); | 763 | dmflags, &locktype); |
753 | if (error) { | 764 | if (error) { |
754 | xfs_iunlock(xip, iolock); | 765 | goto out_unlock_internal; |
755 | goto out_unlock_mutex; | ||
756 | } | 766 | } |
757 | xfs_ilock(xip, XFS_ILOCK_EXCL); | 767 | xfs_ilock(xip, XFS_ILOCK_EXCL); |
758 | eventsent = 1; | 768 | eventsent = 1; |
@@ -764,9 +774,8 @@ start: | |||
764 | * event prevents another call to XFS_SEND_DATA, which is | 774 | * event prevents another call to XFS_SEND_DATA, which is |
765 | * what allows the size to change in the first place. | 775 | * what allows the size to change in the first place. |
766 | */ | 776 | */ |
767 | if ((file->f_flags & O_APPEND) && savedsize != isize) { | 777 | if ((file->f_flags & O_APPEND) && savedsize != xip->i_size) |
768 | goto start; | 778 | goto start; |
769 | } | ||
770 | } | 779 | } |
771 | 780 | ||
772 | if (likely(!(ioflags & IO_INVIS))) { | 781 | if (likely(!(ioflags & IO_INVIS))) { |
@@ -784,11 +793,11 @@ start: | |||
784 | * to zero it out up to the new size. | 793 | * to zero it out up to the new size. |
785 | */ | 794 | */ |
786 | 795 | ||
787 | if (pos > isize) { | 796 | if (pos > xip->i_size) { |
788 | error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, isize); | 797 | error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, xip->i_size); |
789 | if (error) { | 798 | if (error) { |
790 | xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); | 799 | xfs_iunlock(xip, XFS_ILOCK_EXCL); |
791 | goto out_unlock_mutex; | 800 | goto out_unlock_internal; |
792 | } | 801 | } |
793 | } | 802 | } |
794 | xfs_iunlock(xip, XFS_ILOCK_EXCL); | 803 | xfs_iunlock(xip, XFS_ILOCK_EXCL); |
@@ -808,8 +817,7 @@ start: | |||
808 | if (likely(!error)) | 817 | if (likely(!error)) |
809 | error = -remove_suid(file->f_path.dentry); | 818 | error = -remove_suid(file->f_path.dentry); |
810 | if (unlikely(error)) { | 819 | if (unlikely(error)) { |
811 | xfs_iunlock(xip, iolock); | 820 | goto out_unlock_internal; |
812 | goto out_unlock_mutex; | ||
813 | } | 821 | } |
814 | } | 822 | } |
815 | 823 | ||
@@ -879,12 +887,12 @@ retry: | |||
879 | error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, | 887 | error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, |
880 | DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, | 888 | DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, |
881 | 0, 0, 0); /* Delay flag intentionally unused */ | 889 | 0, 0, 0); /* Delay flag intentionally unused */ |
882 | if (error) | ||
883 | goto out_nounlocks; | ||
884 | if (need_i_mutex) | 890 | if (need_i_mutex) |
885 | mutex_lock(&inode->i_mutex); | 891 | mutex_lock(&inode->i_mutex); |
886 | xfs_rwlock(bdp, locktype); | 892 | xfs_rwlock(bdp, locktype); |
887 | pos = xip->i_d.di_size; | 893 | if (error) |
894 | goto out_unlock_internal; | ||
895 | pos = xip->i_size; | ||
888 | ret = 0; | 896 | ret = 0; |
889 | goto retry; | 897 | goto retry; |
890 | } | 898 | } |
@@ -893,14 +901,10 @@ retry: | |||
893 | if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) | 901 | if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) |
894 | *offset = isize; | 902 | *offset = isize; |
895 | 903 | ||
896 | if (*offset > xip->i_d.di_size) { | 904 | if (*offset > xip->i_size) { |
897 | xfs_ilock(xip, XFS_ILOCK_EXCL); | 905 | xfs_ilock(xip, XFS_ILOCK_EXCL); |
898 | if (*offset > xip->i_d.di_size) { | 906 | if (*offset > xip->i_size) |
899 | xip->i_d.di_size = *offset; | 907 | xip->i_size = *offset; |
900 | i_size_write(inode, *offset); | ||
901 | xip->i_update_core = 1; | ||
902 | xip->i_update_size = 1; | ||
903 | } | ||
904 | xfs_iunlock(xip, XFS_ILOCK_EXCL); | 908 | xfs_iunlock(xip, XFS_ILOCK_EXCL); |
905 | } | 909 | } |
906 | 910 | ||
@@ -922,16 +926,31 @@ retry: | |||
922 | 926 | ||
923 | error = sync_page_range(inode, mapping, pos, ret); | 927 | error = sync_page_range(inode, mapping, pos, ret); |
924 | if (!error) | 928 | if (!error) |
925 | error = ret; | 929 | error = -ret; |
926 | return error; | 930 | if (need_i_mutex) |
931 | mutex_lock(&inode->i_mutex); | ||
932 | xfs_rwlock(bdp, locktype); | ||
927 | } | 933 | } |
928 | 934 | ||
929 | out_unlock_internal: | 935 | out_unlock_internal: |
936 | if (io->io_new_size) { | ||
937 | xfs_ilock(xip, XFS_ILOCK_EXCL); | ||
938 | io->io_new_size = 0; | ||
939 | /* | ||
940 | * If this was a direct or synchronous I/O that failed (such | ||
941 | * as ENOSPC) then part of the I/O may have been written to | ||
942 | * disk before the error occured. In this case the on-disk | ||
943 | * file size may have been adjusted beyond the in-memory file | ||
944 | * size and now needs to be truncated back. | ||
945 | */ | ||
946 | if (xip->i_d.di_size > xip->i_size) | ||
947 | xip->i_d.di_size = xip->i_size; | ||
948 | xfs_iunlock(xip, XFS_ILOCK_EXCL); | ||
949 | } | ||
930 | xfs_rwunlock(bdp, locktype); | 950 | xfs_rwunlock(bdp, locktype); |
931 | out_unlock_mutex: | 951 | out_unlock_mutex: |
932 | if (need_i_mutex) | 952 | if (need_i_mutex) |
933 | mutex_unlock(&inode->i_mutex); | 953 | mutex_unlock(&inode->i_mutex); |
934 | out_nounlocks: | ||
935 | return -error; | 954 | return -error; |
936 | } | 955 | } |
937 | 956 | ||