about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorBrian Foster <bfoster@redhat.com>2017-03-08 12:58:08 -0500
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2017-04-08 03:30:32 -0400
commitda617af8f0c6fa9cd2694440529f5edf99c0c6d1 (patch)
treec786239646bcfb00f48fdfd20b318e82079ab7fa
parent77aedb0cbe6aa45338a6e59afa995fde37133bf0 (diff)
xfs: use iomap new flag for newly allocated delalloc blocks
commit f65e6fad293b3a5793b7fa2044800506490e7a2e upstream.

Commit fa7f138 ("xfs: clear delalloc and cache on buffered write failure") fixed one regression in the iomap error handling code and exposed another. The fundamental problem is that if a buffered write is a rewrite of preexisting delalloc blocks and the write fails, the failure handling code can punch out preexisting blocks with valid file data.

This was reproduced directly by sub-block writes in the LTP kernel/syscalls/write/write03 test. A first 100 byte write allocates a single block in a file. A subsequent 100 byte write fails and punches out the block, including the data successfully written by the previous write.

To address this problem, update the ->iomap_begin() handler to distinguish newly allocated delalloc blocks from preexisting delalloc blocks via the IOMAP_F_NEW flag. Use this flag in the ->iomap_end() handler to decide when a failed or short write should punch out delalloc blocks.

This introduces the subtle requirement that ->iomap_begin() should never combine newly allocated delalloc blocks with existing blocks in the resulting iomap descriptor. This can occur when a new delalloc reservation merges with a neighboring extent that is part of the current write, for example. Therefore, drop the post-allocation extent lookup from xfs_bmapi_reserve_delalloc() and just return the record inserted into the fork. This ensures only new blocks are returned and thus that preexisting delalloc blocks are always handled as "found" blocks and not punched out on a failed rewrite.

Reported-by: Xiong Zhou <xzhou@redhat.com>
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c24
-rw-r--r--fs/xfs/xfs_iomap.c16
2 files changed, 25 insertions(+), 15 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index a4322526ea5f..ec93395eccdc 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4253,6 +4253,19 @@ xfs_bmapi_read(
4253 return 0; 4253 return 0;
4254} 4254}
4255 4255
4256/*
4257 * Add a delayed allocation extent to an inode. Blocks are reserved from the
4258 * global pool and the extent inserted into the inode in-core extent tree.
4259 *
4260 * On entry, got refers to the first extent beyond the offset of the extent to
4261 * allocate or eof is specified if no such extent exists. On return, got refers
4262 * to the extent record that was inserted to the inode fork.
4263 *
4264 * Note that the allocated extent may have been merged with contiguous extents
4265 * during insertion into the inode fork. Thus, got does not reflect the current
4266 * state of the inode fork on return. If necessary, the caller can use lastx to
4267 * look up the updated record in the inode fork.
4268 */
4256int 4269int
4257xfs_bmapi_reserve_delalloc( 4270xfs_bmapi_reserve_delalloc(
4258 struct xfs_inode *ip, 4271 struct xfs_inode *ip,
@@ -4339,13 +4352,8 @@ xfs_bmapi_reserve_delalloc(
4339 got->br_startblock = nullstartblock(indlen); 4352 got->br_startblock = nullstartblock(indlen);
4340 got->br_blockcount = alen; 4353 got->br_blockcount = alen;
4341 got->br_state = XFS_EXT_NORM; 4354 got->br_state = XFS_EXT_NORM;
4342 xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got);
4343 4355
4344 /* 4356 xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got);
4345 * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay
4346 * might have merged it into one of the neighbouring ones.
4347 */
4348 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got);
4349 4357
4350 /* 4358 /*
4351 * Tag the inode if blocks were preallocated. Note that COW fork 4359 * Tag the inode if blocks were preallocated. Note that COW fork
@@ -4357,10 +4365,6 @@ xfs_bmapi_reserve_delalloc(
4357 if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) 4365 if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
4358 xfs_inode_set_cowblocks_tag(ip); 4366 xfs_inode_set_cowblocks_tag(ip);
4359 4367
4360 ASSERT(got->br_startoff <= aoff);
4361 ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen);
4362 ASSERT(isnullstartblock(got->br_startblock));
4363 ASSERT(got->br_state == XFS_EXT_NORM);
4364 return 0; 4368 return 0;
4365 4369
4366out_unreserve_blocks: 4370out_unreserve_blocks:
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 5211887cbcd2..360562484e7b 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -637,6 +637,11 @@ retry:
637 goto out_unlock; 637 goto out_unlock;
638 } 638 }
639 639
640 /*
641 * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
642 * them out if the write happens to fail.
643 */
644 iomap->flags = IOMAP_F_NEW;
640 trace_xfs_iomap_alloc(ip, offset, count, 0, &got); 645 trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
641done: 646done:
642 if (isnullstartblock(got.br_startblock)) 647 if (isnullstartblock(got.br_startblock))
@@ -1061,7 +1066,8 @@ xfs_file_iomap_end_delalloc(
1061 struct xfs_inode *ip, 1066 struct xfs_inode *ip,
1062 loff_t offset, 1067 loff_t offset,
1063 loff_t length, 1068 loff_t length,
1064 ssize_t written) 1069 ssize_t written,
1070 struct iomap *iomap)
1065{ 1071{
1066 struct xfs_mount *mp = ip->i_mount; 1072 struct xfs_mount *mp = ip->i_mount;
1067 xfs_fileoff_t start_fsb; 1073 xfs_fileoff_t start_fsb;
@@ -1080,14 +1086,14 @@ xfs_file_iomap_end_delalloc(
1080 end_fsb = XFS_B_TO_FSB(mp, offset + length); 1086 end_fsb = XFS_B_TO_FSB(mp, offset + length);
1081 1087
1082 /* 1088 /*
1083 * Trim back delalloc blocks if we didn't manage to write the whole 1089 * Trim delalloc blocks if they were allocated by this write and we
1084 * range reserved. 1090 * didn't manage to write the whole range.
1085 * 1091 *
1086 * We don't need to care about racing delalloc as we hold i_mutex 1092 * We don't need to care about racing delalloc as we hold i_mutex
1087 * across the reserve/allocate/unreserve calls. If there are delalloc 1093 * across the reserve/allocate/unreserve calls. If there are delalloc
1088 * blocks in the range, they are ours. 1094 * blocks in the range, they are ours.
1089 */ 1095 */
1090 if (start_fsb < end_fsb) { 1096 if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) {
1091 truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), 1097 truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
1092 XFS_FSB_TO_B(mp, end_fsb) - 1); 1098 XFS_FSB_TO_B(mp, end_fsb) - 1);
1093 1099
@@ -1117,7 +1123,7 @@ xfs_file_iomap_end(
1117{ 1123{
1118 if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) 1124 if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
1119 return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, 1125 return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
1120 length, written); 1126 length, written, iomap);
1121 return 0; 1127 return 0;
1122} 1128}
1123 1129