path: root/fs
author	Brian Foster <bfoster@redhat.com>	2019-02-01 12:14:24 -0500
committer	Darrick J. Wong <darrick.wong@oracle.com>	2019-02-11 19:07:01 -0500
commit	c2b3164320b51a535d7c7a6acdcee255edbb22cf (patch)
tree	29d264749af4414018655c5fbba5cf3d859eb15a /fs
parent	627209fbcc2f0d658a5417645859a1d3053ddb59 (diff)
xfs: use the latest extent at writeback delalloc conversion time
The writeback delalloc conversion code is racy with respect to changes in the
currently cached file mapping outside of the current page. This is because the
ilock is cycled between the time the caller originally looked up the mapping
and across each real allocation of the provided file range. This code has
collected various hacks over the years to help combat the symptoms of these
races (i.e., truncate race detection, allocation into hole detection, etc.),
but none address the fundamental problem that the imap may not be valid at
allocation time.

Rather than continue to use race detection hacks, update writeback delalloc
conversion to a model that explicitly converts the delalloc extent backing the
current file offset being processed. The current file offset is the only block
we can trust to remain once the ilock is dropped because any operation that
can remove the block (truncate, hole punch, etc.) must flush and discard
pagecache pages first.

Modify xfs_iomap_write_allocate() to use the xfs_bmapi_delalloc() mechanism to
request allocation of the entire delalloc extent backing the current offset
instead of assuming the extent passed by the caller is unchanged. Record the
range specified by the caller and apply it to the resulting allocated extent
so previous checks by the caller for COW fork overlap are not lost. Finally,
overload the bmapi delalloc flag with the range reval flag behavior since this
is the only use case for both.

This ensures that writeback always picks up the correct and current extent
associated with the page, regardless of races with other extent modifying
operations. If operating on a data fork and the COW overlap state has changed
since the ilock was cycled, the caller revalidates against the COW fork
sequence number before using the imap for the next block.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
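For readers skimming the patch, here is a condensed sketch of the new
allocation loop in xfs_iomap_write_allocate() as it appears in the xfs_iomap.c
hunk further below; error paths and the fsblock-zero sanity check are elided:

	/* Condensed sketch of the patched xfs_iomap_write_allocate() loop. */
	while (true) {
		/* Space was reserved when the delalloc extent was created. */
		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
					XFS_TRANS_RESERVE, &tp);
		...
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, ip, 0);

		/* Convert whatever delalloc extent currently backs offset_fsb. */
		error = xfs_bmapi_convert_delalloc(tp, ip, offset_fsb,
						   whichfork, imap);
		...
		error = xfs_trans_commit(tp);
		*seq = READ_ONCE(ifp->if_seq);	/* caller revalidates against this */
		xfs_iunlock(ip, XFS_ILOCK_EXCL);

		if (offset_fsb >= imap->br_startoff &&
		    offset_fsb < imap->br_startoff + imap->br_blockcount) {
			/* Trim back to the range the caller already validated. */
			xfs_trim_extent(imap, map_start_fsb, map_count_fsb);
			return 0;
		}
	}

The loop retries only because fragmented free space may require several
transactions to convert one contiguous delalloc extent; each pass re-derives
the extent from offset_fsb rather than trusting the caller's possibly stale
imap.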
Diffstat (limited to 'fs')
-rw-r--r--	fs/xfs/libxfs/xfs_bmap.c	16
-rw-r--r--	fs/xfs/libxfs/xfs_bmap.h	2
-rw-r--r--	fs/xfs/xfs_iomap.c	170
3 files changed, 64 insertions, 124 deletions
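The commit message's last paragraph leans on the fork sequence counter for
caller-side revalidation. As a minimal illustration only (this helper does not
exist in the patch; its name and signature are hypothetical), the check the
writeback caller performs against the *seq value returned here amounts to
something like:

	/*
	 * Hypothetical illustration only -- not part of this patch. 'seq' holds
	 * the value sampled from READ_ONCE(ifp->if_seq) under the ilock when
	 * the mapping was last looked up or allocated.
	 */
	static bool
	sketch_imap_still_valid(
		struct xfs_ifork	*ifp,
		xfs_fileoff_t		offset_fsb,
		struct xfs_bmbt_irec	*imap,
		unsigned int		seq)
	{
		if (offset_fsb < imap->br_startoff ||
		    offset_fsb >= imap->br_startoff + imap->br_blockcount)
			return false;	/* offset no longer covered by imap */
		/* any change to the fork bumps if_seq; force a fresh lookup */
		return READ_ONCE(ifp->if_seq) == seq;
	}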
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index c629004d9a4c..f4a65330a2a9 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4296,15 +4296,14 @@ xfs_bmapi_write(
 	bma.datatype = 0;
 
 	/*
-	 * The reval flag means the caller wants to allocate the entire delalloc
-	 * extent backing bno where bno may not necessarily match the startoff.
-	 * Now that we've looked up the extent, reset the range to map based on
-	 * the extent in the file. If we're in a hole, this may be an error so
-	 * don't adjust anything.
+	 * The delalloc flag means the caller wants to allocate the entire
+	 * delalloc extent backing bno where bno may not necessarily match the
+	 * startoff. Now that we've looked up the extent, reset the range to
+	 * map based on the extent in the file. If we're in a hole, this may be
+	 * an error so don't adjust anything.
 	 */
-	if ((flags & XFS_BMAPI_REVALRANGE) &&
+	if ((flags & XFS_BMAPI_DELALLOC) &&
 	    !eof && bno >= bma.got.br_startoff) {
-		ASSERT(flags & XFS_BMAPI_DELALLOC);
 		bno = bma.got.br_startoff;
 		len = bma.got.br_blockcount;
 #ifdef DEBUG
@@ -4495,10 +4494,9 @@ xfs_bmapi_convert_delalloc(
 		flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
 
 	/*
-	 * The reval flag means to allocate the entire extent; pass a dummy
+	 * The delalloc flag means to allocate the entire extent; pass a dummy
 	 * length of 1.
 	 */
-	flags |= XFS_BMAPI_REVALRANGE;
 	error = xfs_bmapi_write(tp, ip, offset_fsb, 1, flags, total, imap,
 			&nimaps);
 	if (!error && !nimaps)
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 75586d56f7a5..4dc7d1a02b35 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -107,8 +107,6 @@ struct xfs_extent_free_item
 /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */
 #define XFS_BMAPI_NORMAP	0x2000
 
-#define XFS_BMAPI_REVALRANGE	0x4000
-
 #define XFS_BMAPI_FLAGS \
 	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
 	{ XFS_BMAPI_METADATA,	"METADATA" }, \
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index ab69caa685b4..6af1d3ec0a9c 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -677,25 +677,19 @@ out_unlock:
  */
 int
 xfs_iomap_write_allocate(
-	xfs_inode_t	*ip,
-	int		whichfork,
-	xfs_off_t	offset,
-	xfs_bmbt_irec_t	*imap,
-	unsigned int	*seq)
+	struct xfs_inode	*ip,
+	int			whichfork,
+	xfs_off_t		offset,
+	struct xfs_bmbt_irec	*imap,
+	unsigned int		*seq)
 {
-	xfs_mount_t	*mp = ip->i_mount;
-	struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
-	xfs_fileoff_t	offset_fsb, last_block;
-	xfs_fileoff_t	end_fsb, map_start_fsb;
-	xfs_filblks_t	count_fsb;
-	xfs_trans_t	*tp;
-	int		nimaps;
-	int		error = 0;
-	int		flags = XFS_BMAPI_DELALLOC;
-	int		nres;
-
-	if (whichfork == XFS_COW_FORK)
-		flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	xfs_fileoff_t		offset_fsb;
+	xfs_fileoff_t		map_start_fsb;
+	xfs_extlen_t		map_count_fsb;
+	struct xfs_trans	*tp;
+	int			error = 0;
 
 	/*
 	 * Make sure that the dquots are there.
@@ -704,106 +698,60 @@ xfs_iomap_write_allocate(
 	if (error)
 		return error;
 
+	/*
+	 * Store the file range the caller is interested in because it encodes
+	 * state such as potential overlap with COW fork blocks. We must trim
+	 * the allocated extent down to this range to maintain consistency with
+	 * what the caller expects. Revalidation of the range itself is the
+	 * responsibility of the caller.
+	 */
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
-	count_fsb = imap->br_blockcount;
 	map_start_fsb = imap->br_startoff;
+	map_count_fsb = imap->br_blockcount;
 
-	XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
+	XFS_STATS_ADD(mp, xs_xstrat_bytes,
+		      XFS_FSB_TO_B(mp, imap->br_blockcount));
 
-	while (count_fsb != 0) {
+	while (true) {
 		/*
-		 * Set up a transaction with which to allocate the
-		 * backing store for the file. Do allocations in a
-		 * loop until we get some space in the range we are
-		 * interested in. The other space that might be allocated
-		 * is in the delayed allocation extent on which we sit
-		 * but before our buffer starts.
+		 * Allocate in a loop because it may take several attempts to
+		 * allocate real blocks for a contiguous delalloc extent if free
+		 * space is sufficiently fragmented. Note that space for the
+		 * extent and indirect blocks was reserved when the delalloc
+		 * extent was created so there's no need to do so here.
 		 */
-		nimaps = 0;
-		while (nimaps == 0) {
-			nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
-			/*
-			 * We have already reserved space for the extent and any
-			 * indirect blocks when creating the delalloc extent,
-			 * there is no need to reserve space in this transaction
-			 * again.
-			 */
-			error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0,
-					0, XFS_TRANS_RESERVE, &tp);
-			if (error)
-				return error;
-
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			xfs_trans_ijoin(tp, ip, 0);
-
-			/*
-			 * it is possible that the extents have changed since
-			 * we did the read call as we dropped the ilock for a
-			 * while. We have to be careful about truncates or hole
-			 * punchs here - we are not allowed to allocate
-			 * non-delalloc blocks here.
-			 *
-			 * The only protection against truncation is the pages
-			 * for the range we are being asked to convert are
-			 * locked and hence a truncate will block on them
-			 * first.
-			 *
-			 * As a result, if we go beyond the range we really
-			 * need and hit an delalloc extent boundary followed by
-			 * a hole while we have excess blocks in the map, we
-			 * will fill the hole incorrectly and overrun the
-			 * transaction reservation.
-			 *
-			 * Using a single map prevents this as we are forced to
-			 * check each map we look for overlap with the desired
-			 * range and abort as soon as we find it. Also, given
-			 * that we only return a single map, having one beyond
-			 * what we can return is probably a bit silly.
-			 *
-			 * We also need to check that we don't go beyond EOF;
-			 * this is a truncate optimisation as a truncate sets
-			 * the new file size before block on the pages we
-			 * currently have locked under writeback. Because they
-			 * are about to be tossed, we don't need to write them
-			 * back....
-			 */
-			nimaps = 1;
-			end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
-			error = xfs_bmap_last_offset(ip, &last_block,
-							XFS_DATA_FORK);
-			if (error)
-				goto trans_cancel;
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
+					XFS_TRANS_RESERVE, &tp);
+		if (error)
+			return error;
 
-			last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
-			if ((map_start_fsb + count_fsb) > last_block) {
-				count_fsb = last_block - map_start_fsb;
-				if (count_fsb == 0) {
-					error = -EAGAIN;
-					goto trans_cancel;
-				}
-			}
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, ip, 0);
 
-			/*
-			 * From this point onwards we overwrite the imap
-			 * pointer that the caller gave to us.
-			 */
-			error = xfs_bmapi_write(tp, ip, map_start_fsb,
-						count_fsb, flags, nres, imap,
-						&nimaps);
-			if (error)
-				goto trans_cancel;
+		/*
+		 * ilock was dropped since imap was populated which means it
+		 * might no longer be valid. The current page is held locked so
+		 * nothing could have removed the block backing offset_fsb.
+		 * Attempt to allocate whatever delalloc extent currently backs
+		 * offset_fsb and put the result in the imap pointer from the
+		 * caller. We'll trim it down to the caller's most recently
+		 * validated range before we return.
+		 */
+		error = xfs_bmapi_convert_delalloc(tp, ip, offset_fsb,
+						   whichfork, imap);
+		if (error)
+			goto trans_cancel;
 
-			error = xfs_trans_commit(tp);
-			if (error)
-				goto error0;
+		error = xfs_trans_commit(tp);
+		if (error)
+			goto error0;
 
-			*seq = READ_ONCE(ifp->if_seq);
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		}
+		*seq = READ_ONCE(ifp->if_seq);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
 		/*
-		 * See if we were able to allocate an extent that
-		 * covers at least part of the callers request
+		 * See if we were able to allocate an extent that covers at
+		 * least part of the callers request.
 		 */
 		if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
 			return xfs_alert_fsblock_zero(ip, imap);
@@ -812,15 +760,11 @@ xfs_iomap_write_allocate(
 		    (offset_fsb < (imap->br_startoff +
 				   imap->br_blockcount))) {
 			XFS_STATS_INC(mp, xs_xstrat_quick);
+			xfs_trim_extent(imap, map_start_fsb, map_count_fsb);
+			ASSERT(offset_fsb >= imap->br_startoff &&
+			       offset_fsb < imap->br_startoff + imap->br_blockcount);
 			return 0;
 		}
-
-		/*
-		 * So far we have not mapped the requested part of the
-		 * file, just surrounding data, try again.
-		 */
-		count_fsb -= imap->br_blockcount;
-		map_start_fsb = imap->br_startoff + imap->br_blockcount;
 	}
 
 trans_cancel: