about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2018-11-24 12:11:52 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2018-11-24 12:11:52 -0500
commit: abe72ff4134028ff2189d29629c40a40bee0a989 (patch)
tree: 131d5cb0b4f091ac963c8a444098bec2025577cd
parent: 7c98a42618271210c60b79128b220107d35938d9 (diff)
parent: 8c110d43c6bca4b24dd13272a9d4e0ba6f2ec957 (diff)
Merge tag 'xfs-4.20-fixes-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
Pull xfs fixes from Darrick Wong:
 "Dave and I have continued our work fixing corruption problems that can
  be found when running long-term burn-in exercisers on xfs. Here are
  some patches fixing most of the problems, but there will likely be
  more. :/

   - Numerous corruption fixes for copy on write

   - Numerous corruption fixes for blocksize < pagesize writes

   - Don't miscalculate AG reservations for small final AGs

   - Fix page cache truncation to work properly for reflink and extent
     shifting

   - Fix use-after-free when retrying failed inode/dquot buffer logging

   - Fix corruptions seen when using copy_file_range in directio mode"

* tag 'xfs-4.20-fixes-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
  iomap: readpages doesn't zero page tail beyond EOF
  vfs: vfs_dedupe_file_range() doesn't return EOPNOTSUPP
  iomap: dio data corruption and spurious errors when pipes fill
  iomap: sub-block dio needs to zeroout beyond EOF
  iomap: FUA is wrong for DIO O_DSYNC writes into unwritten extents
  xfs: delalloc -> unwritten COW fork allocation can go wrong
  xfs: flush removing page cache in xfs_reflink_remap_prep
  xfs: extent shifting doesn't fully invalidate page cache
  xfs: finobt AG reserves don't consider last AG can be a runt
  xfs: fix transient reference count error in xfs_buf_resubmit_failed_buffers
  xfs: uncached buffer tracing needs to print bno
  xfs: make xfs_file_remap_range() static
  xfs: fix shared extent data corruption due to missing cow reservation
-rw-r--r-- fs/iomap.c 53
-rw-r--r-- fs/read_write.c 15
-rw-r--r-- fs/xfs/libxfs/xfs_bmap.c 5
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc_btree.c 11
-rw-r--r-- fs/xfs/xfs_bmap_util.c 10
-rw-r--r-- fs/xfs/xfs_bmap_util.h 3
-rw-r--r-- fs/xfs/xfs_buf_item.c 28
-rw-r--r-- fs/xfs/xfs_file.c 2
-rw-r--r-- fs/xfs/xfs_reflink.c 18
-rw-r--r-- fs/xfs/xfs_trace.h 5
10 files changed, 104 insertions, 46 deletions
diff --git a/fs/iomap.c b/fs/iomap.c
index 64ce240217a1..3ffb776fbebe 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -142,13 +142,14 @@ static void
142iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, 142iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
143 loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp) 143 loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
144{ 144{
145 loff_t orig_pos = *pos;
146 loff_t isize = i_size_read(inode);
145 unsigned block_bits = inode->i_blkbits; 147 unsigned block_bits = inode->i_blkbits;
146 unsigned block_size = (1 << block_bits); 148 unsigned block_size = (1 << block_bits);
147 unsigned poff = offset_in_page(*pos); 149 unsigned poff = offset_in_page(*pos);
148 unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length); 150 unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
149 unsigned first = poff >> block_bits; 151 unsigned first = poff >> block_bits;
150 unsigned last = (poff + plen - 1) >> block_bits; 152 unsigned last = (poff + plen - 1) >> block_bits;
151 unsigned end = offset_in_page(i_size_read(inode)) >> block_bits;
152 153
153 /* 154 /*
154 * If the block size is smaller than the page size we need to check the 155 * If the block size is smaller than the page size we need to check the
@@ -183,8 +184,12 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
183 * handle both halves separately so that we properly zero data in the 184 * handle both halves separately so that we properly zero data in the
184 * page cache for blocks that are entirely outside of i_size. 185 * page cache for blocks that are entirely outside of i_size.
185 */ 186 */
186 if (first <= end && last > end) 187 if (orig_pos <= isize && orig_pos + length > isize) {
187 plen -= (last - end) * block_size; 188 unsigned end = offset_in_page(isize - 1) >> block_bits;
189
190 if (first <= end && last > end)
191 plen -= (last - end) * block_size;
192 }
188 193
189 *offp = poff; 194 *offp = poff;
190 *lenp = plen; 195 *lenp = plen;
@@ -1580,7 +1585,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
1580 struct bio *bio; 1585 struct bio *bio;
1581 bool need_zeroout = false; 1586 bool need_zeroout = false;
1582 bool use_fua = false; 1587 bool use_fua = false;
1583 int nr_pages, ret; 1588 int nr_pages, ret = 0;
1584 size_t copied = 0; 1589 size_t copied = 0;
1585 1590
1586 if ((pos | length | align) & ((1 << blkbits) - 1)) 1591 if ((pos | length | align) & ((1 << blkbits) - 1))
@@ -1596,12 +1601,13 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
1596 1601
1597 if (iomap->flags & IOMAP_F_NEW) { 1602 if (iomap->flags & IOMAP_F_NEW) {
1598 need_zeroout = true; 1603 need_zeroout = true;
1599 } else { 1604 } else if (iomap->type == IOMAP_MAPPED) {
1600 /* 1605 /*
1601 * Use a FUA write if we need datasync semantics, this 1606 * Use a FUA write if we need datasync semantics, this is a pure
1602 * is a pure data IO that doesn't require any metadata 1607 * data IO that doesn't require any metadata updates (including
1603 * updates and the underlying device supports FUA. This 1608 * after IO completion such as unwritten extent conversion) and
1604 * allows us to avoid cache flushes on IO completion. 1609 * the underlying device supports FUA. This allows us to avoid
1610 * cache flushes on IO completion.
1605 */ 1611 */
1606 if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && 1612 if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
1607 (dio->flags & IOMAP_DIO_WRITE_FUA) && 1613 (dio->flags & IOMAP_DIO_WRITE_FUA) &&
@@ -1644,8 +1650,14 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
1644 1650
1645 ret = bio_iov_iter_get_pages(bio, &iter); 1651 ret = bio_iov_iter_get_pages(bio, &iter);
1646 if (unlikely(ret)) { 1652 if (unlikely(ret)) {
1653 /*
1654 * We have to stop part way through an IO. We must fall
1655 * through to the sub-block tail zeroing here, otherwise
1656 * this short IO may expose stale data in the tail of
1657 * the block we haven't written data to.
1658 */
1647 bio_put(bio); 1659 bio_put(bio);
1648 return copied ? copied : ret; 1660 goto zero_tail;
1649 } 1661 }
1650 1662
1651 n = bio->bi_iter.bi_size; 1663 n = bio->bi_iter.bi_size;
@@ -1676,13 +1688,21 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
1676 dio->submit.cookie = submit_bio(bio); 1688 dio->submit.cookie = submit_bio(bio);
1677 } while (nr_pages); 1689 } while (nr_pages);
1678 1690
1679 if (need_zeroout) { 1691 /*
1692 * We need to zeroout the tail of a sub-block write if the extent type
1693 * requires zeroing or the write extends beyond EOF. If we don't zero
1694 * the block tail in the latter case, we can expose stale data via mmap
1695 * reads of the EOF block.
1696 */
1697zero_tail:
1698 if (need_zeroout ||
1699 ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
1680 /* zero out from the end of the write to the end of the block */ 1700 /* zero out from the end of the write to the end of the block */
1681 pad = pos & (fs_block_size - 1); 1701 pad = pos & (fs_block_size - 1);
1682 if (pad) 1702 if (pad)
1683 iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); 1703 iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
1684 } 1704 }
1685 return copied; 1705 return copied ? copied : ret;
1686} 1706}
1687 1707
1688static loff_t 1708static loff_t
@@ -1857,6 +1877,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
1857 dio->wait_for_completion = true; 1877 dio->wait_for_completion = true;
1858 ret = 0; 1878 ret = 0;
1859 } 1879 }
1880
1881 /*
1882 * Splicing to pipes can fail on a full pipe. We have to
1883 * swallow this to make it look like a short IO
1884 * otherwise the higher splice layers will completely
1885 * mishandle the error and stop moving data.
1886 */
1887 if (ret == -EFAULT)
1888 ret = 0;
1860 break; 1889 break;
1861 } 1890 }
1862 pos += ret; 1891 pos += ret;
diff --git a/fs/read_write.c b/fs/read_write.c
index bfcb4ced5664..4dae0399c75a 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -2094,17 +2094,18 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
2094 off = same->src_offset; 2094 off = same->src_offset;
2095 len = same->src_length; 2095 len = same->src_length;
2096 2096
2097 ret = -EISDIR;
2098 if (S_ISDIR(src->i_mode)) 2097 if (S_ISDIR(src->i_mode))
2099 goto out; 2098 return -EISDIR;
2100 2099
2101 ret = -EINVAL;
2102 if (!S_ISREG(src->i_mode)) 2100 if (!S_ISREG(src->i_mode))
2103 goto out; 2101 return -EINVAL;
2102
2103 if (!file->f_op->remap_file_range)
2104 return -EOPNOTSUPP;
2104 2105
2105 ret = remap_verify_area(file, off, len, false); 2106 ret = remap_verify_area(file, off, len, false);
2106 if (ret < 0) 2107 if (ret < 0)
2107 goto out; 2108 return ret;
2108 ret = 0; 2109 ret = 0;
2109 2110
2110 if (off + len > i_size_read(src)) 2111 if (off + len > i_size_read(src))
@@ -2147,10 +2148,8 @@ next_fdput:
2147 fdput(dst_fd); 2148 fdput(dst_fd);
2148next_loop: 2149next_loop:
2149 if (fatal_signal_pending(current)) 2150 if (fatal_signal_pending(current))
2150 goto out; 2151 break;
2151 } 2152 }
2152
2153out:
2154 return ret; 2153 return ret;
2155} 2154}
2156EXPORT_SYMBOL(vfs_dedupe_file_range); 2155EXPORT_SYMBOL(vfs_dedupe_file_range);
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 74d7228e755b..19e921d1586f 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1694,10 +1694,13 @@ xfs_bmap_add_extent_delay_real(
1694 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: 1694 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
1695 /* 1695 /*
1696 * Filling in all of a previously delayed allocation extent. 1696 * Filling in all of a previously delayed allocation extent.
1697 * The right neighbor is contiguous, the left is not. 1697 * The right neighbor is contiguous, the left is not. Take care
1698 * with delay -> unwritten extent allocation here because the
1699 * delalloc record we are overwriting is always written.
1698 */ 1700 */
1699 PREV.br_startblock = new->br_startblock; 1701 PREV.br_startblock = new->br_startblock;
1700 PREV.br_blockcount += RIGHT.br_blockcount; 1702 PREV.br_blockcount += RIGHT.br_blockcount;
1703 PREV.br_state = new->br_state;
1701 1704
1702 xfs_iext_next(ifp, &bma->icur); 1705 xfs_iext_next(ifp, &bma->icur);
1703 xfs_iext_remove(bma->ip, &bma->icur, state); 1706 xfs_iext_remove(bma->ip, &bma->icur, state);
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 86c50208a143..7fbf8af0b159 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -538,15 +538,18 @@ xfs_inobt_rec_check_count(
538 538
539static xfs_extlen_t 539static xfs_extlen_t
540xfs_inobt_max_size( 540xfs_inobt_max_size(
541 struct xfs_mount *mp) 541 struct xfs_mount *mp,
542 xfs_agnumber_t agno)
542{ 543{
544 xfs_agblock_t agblocks = xfs_ag_block_count(mp, agno);
545
543 /* Bail out if we're uninitialized, which can happen in mkfs. */ 546 /* Bail out if we're uninitialized, which can happen in mkfs. */
544 if (mp->m_inobt_mxr[0] == 0) 547 if (mp->m_inobt_mxr[0] == 0)
545 return 0; 548 return 0;
546 549
547 return xfs_btree_calc_size(mp->m_inobt_mnr, 550 return xfs_btree_calc_size(mp->m_inobt_mnr,
548 (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock / 551 (uint64_t)agblocks * mp->m_sb.sb_inopblock /
549 XFS_INODES_PER_CHUNK); 552 XFS_INODES_PER_CHUNK);
550} 553}
551 554
552static int 555static int
@@ -594,7 +597,7 @@ xfs_finobt_calc_reserves(
594 if (error) 597 if (error)
595 return error; 598 return error;
596 599
597 *ask += xfs_inobt_max_size(mp); 600 *ask += xfs_inobt_max_size(mp, agno);
598 *used += tree_len; 601 *used += tree_len;
599 return 0; 602 return 0;
600} 603}
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 5d263dfdb3bc..404e581f1ea1 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1042,7 +1042,7 @@ out_trans_cancel:
1042 goto out_unlock; 1042 goto out_unlock;
1043} 1043}
1044 1044
1045static int 1045int
1046xfs_flush_unmap_range( 1046xfs_flush_unmap_range(
1047 struct xfs_inode *ip, 1047 struct xfs_inode *ip,
1048 xfs_off_t offset, 1048 xfs_off_t offset,
@@ -1195,13 +1195,7 @@ xfs_prepare_shift(
1195 * Writeback and invalidate cache for the remainder of the file as we're 1195 * Writeback and invalidate cache for the remainder of the file as we're
1196 * about to shift down every extent from offset to EOF. 1196 * about to shift down every extent from offset to EOF.
1197 */ 1197 */
1198 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, offset, -1); 1198 error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip));
1199 if (error)
1200 return error;
1201 error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
1202 offset >> PAGE_SHIFT, -1);
1203 if (error)
1204 return error;
1205 1199
1206 /* 1200 /*
1207 * Clean out anything hanging around in the cow fork now that 1201 * Clean out anything hanging around in the cow fork now that
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 87363d136bb6..7a78229cf1a7 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -80,4 +80,7 @@ int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
80 int whichfork, xfs_extnum_t *nextents, 80 int whichfork, xfs_extnum_t *nextents,
81 xfs_filblks_t *count); 81 xfs_filblks_t *count);
82 82
83int xfs_flush_unmap_range(struct xfs_inode *ip, xfs_off_t offset,
84 xfs_off_t len);
85
83#endif /* __XFS_BMAP_UTIL_H__ */ 86#endif /* __XFS_BMAP_UTIL_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 12d8455bfbb2..010db5f8fb00 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -1233,9 +1233,23 @@ xfs_buf_iodone(
1233} 1233}
1234 1234
1235/* 1235/*
1236 * Requeue a failed buffer for writeback 1236 * Requeue a failed buffer for writeback.
1237 * 1237 *
1238 * Return true if the buffer has been re-queued properly, false otherwise 1238 * We clear the log item failed state here as well, but we have to be careful
1239 * about reference counts because the only active reference counts on the buffer
1240 * may be the failed log items. Hence if we clear the log item failed state
1241 * before queuing the buffer for IO we can release all active references to
1242 * the buffer and free it, leading to use after free problems in
1243 * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which
1244 * order we process them in - the buffer is locked, and we own the buffer list
1245 * so nothing on them is going to change while we are performing this action.
1246 *
1247 * Hence we can safely queue the buffer for IO before we clear the failed log
1248 * item state, therefore always having an active reference to the buffer and
1249 * avoiding the transient zero-reference state that leads to use-after-free.
1250 *
1251 * Return true if the buffer was added to the buffer list, false if it was
1252 * already on the buffer list.
1239 */ 1253 */
1240bool 1254bool
1241xfs_buf_resubmit_failed_buffers( 1255xfs_buf_resubmit_failed_buffers(
@@ -1243,16 +1257,16 @@ xfs_buf_resubmit_failed_buffers(
1243 struct list_head *buffer_list) 1257 struct list_head *buffer_list)
1244{ 1258{
1245 struct xfs_log_item *lip; 1259 struct xfs_log_item *lip;
1260 bool ret;
1261
1262 ret = xfs_buf_delwri_queue(bp, buffer_list);
1246 1263
1247 /* 1264 /*
1248 * Clear XFS_LI_FAILED flag from all items before resubmit 1265 * XFS_LI_FAILED set/clear is protected by ail_lock, caller of this
1249 *
1250 * XFS_LI_FAILED set/clear is protected by ail_lock, caller this
1251 * function already have it acquired 1266 * function already have it acquired
1252 */ 1267 */
1253 list_for_each_entry(lip, &bp->b_li_list, li_bio_list) 1268 list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
1254 xfs_clear_li_failed(lip); 1269 xfs_clear_li_failed(lip);
1255 1270
1256 /* Add this buffer back to the delayed write list */ 1271 return ret;
1257 return xfs_buf_delwri_queue(bp, buffer_list);
1258} 1272}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 53c9ab8fb777..e47425071e65 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -920,7 +920,7 @@ out_unlock:
920} 920}
921 921
922 922
923loff_t 923STATIC loff_t
924xfs_file_remap_range( 924xfs_file_remap_range(
925 struct file *file_in, 925 struct file *file_in,
926 loff_t pos_in, 926 loff_t pos_in,
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index ecdb086bc23e..322a852ce284 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -296,6 +296,7 @@ xfs_reflink_reserve_cow(
296 if (error) 296 if (error)
297 return error; 297 return error;
298 298
299 xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
299 trace_xfs_reflink_cow_alloc(ip, &got); 300 trace_xfs_reflink_cow_alloc(ip, &got);
300 return 0; 301 return 0;
301} 302}
@@ -1351,10 +1352,19 @@ xfs_reflink_remap_prep(
1351 if (ret) 1352 if (ret)
1352 goto out_unlock; 1353 goto out_unlock;
1353 1354
1354 /* Zap any page cache for the destination file's range. */ 1355 /*
1355 truncate_inode_pages_range(&inode_out->i_data, 1356 * If pos_out > EOF, we may have dirtied blocks between EOF and
1356 round_down(pos_out, PAGE_SIZE), 1357 * pos_out. In that case, we need to extend the flush and unmap to cover
1357 round_up(pos_out + *len, PAGE_SIZE) - 1); 1358 * from EOF to the end of the copy length.
1359 */
1360 if (pos_out > XFS_ISIZE(dest)) {
1361 loff_t flen = *len + (pos_out - XFS_ISIZE(dest));
1362 ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
1363 } else {
1364 ret = xfs_flush_unmap_range(dest, pos_out, *len);
1365 }
1366 if (ret)
1367 goto out_unlock;
1358 1368
1359 return 1; 1369 return 1;
1360out_unlock: 1370out_unlock:
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 3043e5ed6495..8a6532aae779 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -280,7 +280,10 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
280 ), 280 ),
281 TP_fast_assign( 281 TP_fast_assign(
282 __entry->dev = bp->b_target->bt_dev; 282 __entry->dev = bp->b_target->bt_dev;
283 __entry->bno = bp->b_bn; 283 if (bp->b_bn == XFS_BUF_DADDR_NULL)
284 __entry->bno = bp->b_maps[0].bm_bn;
285 else
286 __entry->bno = bp->b_bn;
284 __entry->nblks = bp->b_length; 287 __entry->nblks = bp->b_length;
285 __entry->hold = atomic_read(&bp->b_hold); 288 __entry->hold = atomic_read(&bp->b_hold);
286 __entry->pincount = atomic_read(&bp->b_pin_count); 289 __entry->pincount = atomic_read(&bp->b_pin_count);