aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDarrick J. Wong <darrick.wong@oracle.com>2017-02-02 18:14:02 -0500
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2017-04-08 03:30:31 -0400
commite02f0ff252f2cd402063636ccea812a35034d6d7 (patch)
treedd0a07dbf1192f1c1f51e2521f41cf8be2677683
parent8370826f7d3274fe64de32c58aa49a7384f0c9e9 (diff)
xfs: mark speculative prealloc CoW fork extents unwritten
commit 5eda43000064a69a39fb7869cc63c9571535ad29 upstream.

Christoph Hellwig pointed out that there's a potentially nasty race when
performing simultaneous nearby directio cow writes:

"Thread 1 writes a range from B to C

"                    B --------- C
                           p

"a little later thread 2 writes from A to B

"        A --------- B
               p

[editor's note: the 'p's denote cowextsize boundaries, which I added to
make this more clear]

"but the code preallocates beyond B into the range where thread
"1 has just written, but ->end_io hasn't been called yet.
"But once ->end_io is called thread 2 has already allocated
"up to the extent size hint into the write range of thread 1,
"so the end_io handler will splice the uninitialized blocks from
"that preallocation back into the file right after B."

We can avoid this race by ensuring that thread 1 cannot accidentally
remap the blocks that thread 2 allocated (as part of speculative
preallocation) as part of t2's write preparation in t1's end_io handler.
The way we make this happen is by taking advantage of the unwritten
extent flag as an intermediate step.

Recall that when we begin the process of writing data to shared blocks,
we create a delayed allocation extent in the CoW fork:

D: --RRRRRRSSSRRRRRRRR---
C: ------DDDDDDD---------

When a thread prepares to CoW some dirty data out to disk, it will now
convert the delalloc reservation into an /unwritten/ allocated extent in
the cow fork. The da conversion code tries to opportunistically
allocate as much of a (speculatively prealloc'd) extent as possible, so
we may end up allocating a larger extent than we're actually writing
out:

D: --RRRRRRSSSRRRRRRRR---
U: ------UUUUUUU---------

Next, we convert only the part of the extent that we're actively
planning to write to normal (i.e. not unwritten) status:

D: --RRRRRRSSSRRRRRRRR---
U: ------UURRUUU---------

If the write succeeds, the end_cow function will now scan the relevant
range of the CoW fork for real extents and remap only the real extents
into the data fork:

D: --RRRRRRRRSRRRRRRRR---
U: ------UU--UUU---------

This ensures that we never obliterate valid data fork extents with
unwritten blocks from the CoW fork.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r--fs/xfs/xfs_aops.c6
-rw-r--r--fs/xfs/xfs_iomap.c2
-rw-r--r--fs/xfs/xfs_reflink.c116
-rw-r--r--fs/xfs/xfs_reflink.h2
-rw-r--r--fs/xfs/xfs_trace.h8
5 files changed, 123 insertions, 11 deletions
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 06763f5cc701..6845ebfa3067 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -486,6 +486,12 @@ xfs_submit_ioend(
486 struct xfs_ioend *ioend, 486 struct xfs_ioend *ioend,
487 int status) 487 int status)
488{ 488{
489 /* Convert CoW extents to regular */
490 if (!status && ioend->io_type == XFS_IO_COW) {
491 status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
492 ioend->io_offset, ioend->io_size);
493 }
494
489 /* Reserve log space if we might write beyond the on-disk inode size. */ 495 /* Reserve log space if we might write beyond the on-disk inode size. */
490 if (!status && 496 if (!status &&
491 ioend->io_type != XFS_IO_UNWRITTEN && 497 ioend->io_type != XFS_IO_UNWRITTEN &&
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index e8889614cec3..5211887cbcd2 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -685,7 +685,7 @@ xfs_iomap_write_allocate(
685 int nres; 685 int nres;
686 686
687 if (whichfork == XFS_COW_FORK) 687 if (whichfork == XFS_COW_FORK)
688 flags |= XFS_BMAPI_COWFORK; 688 flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
689 689
690 /* 690 /*
691 * Make sure that the dquots are there. 691 * Make sure that the dquots are there.
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 4d3f74e3c5e1..539a612a02e5 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -82,11 +82,22 @@
82 * mappings are a reservation against the free space in the filesystem; 82 * mappings are a reservation against the free space in the filesystem;
83 * adjacent mappings can also be combined into fewer larger mappings. 83 * adjacent mappings can also be combined into fewer larger mappings.
84 * 84 *
85 * As an optimization, the CoW extent size hint (cowextsz) creates
86 * outsized aligned delalloc reservations in the hope of landing out of
87 * order nearby CoW writes in a single extent on disk, thereby reducing
88 * fragmentation and improving future performance.
89 *
90 * D: --RRRRRRSSSRRRRRRRR--- (data fork)
91 * C: ------DDDDDDD--------- (CoW fork)
92 *
85 * When dirty pages are being written out (typically in writepage), the 93 * When dirty pages are being written out (typically in writepage), the
86 * delalloc reservations are converted into real mappings by allocating 94 * delalloc reservations are converted into unwritten mappings by
87 * blocks and replacing the delalloc mapping with real ones. A delalloc 95 * allocating blocks and replacing the delalloc mapping with real ones.
88 * mapping can be replaced by several real ones if the free space is 96 * A delalloc mapping can be replaced by several unwritten ones if the
89 * fragmented. 97 * free space is fragmented.
98 *
99 * D: --RRRRRRSSSRRRRRRRR---
100 * C: ------UUUUUUU---------
90 * 101 *
91 * We want to adapt the delalloc mechanism for copy-on-write, since the 102 * We want to adapt the delalloc mechanism for copy-on-write, since the
92 * write paths are similar. The first two steps (creating the reservation 103 * write paths are similar. The first two steps (creating the reservation
@@ -101,13 +112,29 @@
101 * Block-aligned directio writes will use the same mechanism as buffered 112 * Block-aligned directio writes will use the same mechanism as buffered
102 * writes. 113 * writes.
103 * 114 *
115 * Just prior to submitting the actual disk write requests, we convert
116 * the extents representing the range of the file actually being written
117 * (as opposed to extra pieces created for the cowextsize hint) to real
118 * extents. This will become important in the next step:
119 *
120 * D: --RRRRRRSSSRRRRRRRR---
121 * C: ------UUrrUUU---------
122 *
104 * CoW remapping must be done after the data block write completes, 123 * CoW remapping must be done after the data block write completes,
105 * because we don't want to destroy the old data fork map until we're sure 124 * because we don't want to destroy the old data fork map until we're sure
106 * the new block has been written. Since the new mappings are kept in a 125 * the new block has been written. Since the new mappings are kept in a
107 * separate fork, we can simply iterate these mappings to find the ones 126 * separate fork, we can simply iterate these mappings to find the ones
108 * that cover the file blocks that we just CoW'd. For each extent, simply 127 * that cover the file blocks that we just CoW'd. For each extent, simply
109 * unmap the corresponding range in the data fork, map the new range into 128 * unmap the corresponding range in the data fork, map the new range into
110 * the data fork, and remove the extent from the CoW fork. 129 * the data fork, and remove the extent from the CoW fork. Because of
130 * the presence of the cowextsize hint, however, we must be careful
131 * only to remap the blocks that we've actually written out -- we must
132 * never remap delalloc reservations nor CoW staging blocks that have
133 * yet to be written. This corresponds exactly to the real extents in
134 * the CoW fork:
135 *
136 * D: --RRRRRRrrSRRRRRRRR---
137 * C: ------UU--UUU---------
111 * 138 *
112 * Since the remapping operation can be applied to an arbitrary file 139 * Since the remapping operation can be applied to an arbitrary file
113 * range, we record the need for the remap step as a flag in the ioend 140 * range, we record the need for the remap step as a flag in the ioend
@@ -296,6 +323,65 @@ xfs_reflink_reserve_cow(
296 return 0; 323 return 0;
297} 324}
298 325
326/* Convert part of an unwritten CoW extent to a real one. */
327STATIC int
328xfs_reflink_convert_cow_extent(
329 struct xfs_inode *ip,
330 struct xfs_bmbt_irec *imap,
331 xfs_fileoff_t offset_fsb,
332 xfs_filblks_t count_fsb,
333 struct xfs_defer_ops *dfops)
334{
335 struct xfs_bmbt_irec irec = *imap;
336 xfs_fsblock_t first_block;
337 int nimaps = 1;
338
339 if (imap->br_state == XFS_EXT_NORM)
340 return 0;
341
342 xfs_trim_extent(&irec, offset_fsb, count_fsb);
343 trace_xfs_reflink_convert_cow(ip, &irec);
344 if (irec.br_blockcount == 0)
345 return 0;
346 return xfs_bmapi_write(NULL, ip, irec.br_startoff, irec.br_blockcount,
347 XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, &first_block,
348 0, &irec, &nimaps, dfops);
349}
350
351/* Convert all of the unwritten CoW extents in a file's range to real ones. */
352int
353xfs_reflink_convert_cow(
354 struct xfs_inode *ip,
355 xfs_off_t offset,
356 xfs_off_t count)
357{
358 struct xfs_bmbt_irec got;
359 struct xfs_defer_ops dfops;
360 struct xfs_mount *mp = ip->i_mount;
361 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
362 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
363 xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
364 xfs_extnum_t idx;
365 bool found;
366 int error;
367
368 xfs_ilock(ip, XFS_ILOCK_EXCL);
369
370 /* Convert all the extents to real from unwritten. */
371 for (found = xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got);
372 found && got.br_startoff < end_fsb;
373 found = xfs_iext_get_extent(ifp, ++idx, &got)) {
374 error = xfs_reflink_convert_cow_extent(ip, &got, offset_fsb,
375 end_fsb - offset_fsb, &dfops);
376 if (error)
377 break;
378 }
379
380 /* Finish up. */
381 xfs_iunlock(ip, XFS_ILOCK_EXCL);
382 return error;
383}
384
299/* Allocate all CoW reservations covering a range of blocks in a file. */ 385/* Allocate all CoW reservations covering a range of blocks in a file. */
300static int 386static int
301__xfs_reflink_allocate_cow( 387__xfs_reflink_allocate_cow(
@@ -328,6 +414,7 @@ __xfs_reflink_allocate_cow(
328 goto out_unlock; 414 goto out_unlock;
329 ASSERT(nimaps == 1); 415 ASSERT(nimaps == 1);
330 416
417 /* Make sure there's a CoW reservation for it. */
331 error = xfs_reflink_reserve_cow(ip, &imap, &shared); 418 error = xfs_reflink_reserve_cow(ip, &imap, &shared);
332 if (error) 419 if (error)
333 goto out_trans_cancel; 420 goto out_trans_cancel;
@@ -337,14 +424,16 @@ __xfs_reflink_allocate_cow(
337 goto out_trans_cancel; 424 goto out_trans_cancel;
338 } 425 }
339 426
427 /* Allocate the entire reservation as unwritten blocks. */
340 xfs_trans_ijoin(tp, ip, 0); 428 xfs_trans_ijoin(tp, ip, 0);
341 error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount, 429 error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount,
342 XFS_BMAPI_COWFORK, &first_block, 430 XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, &first_block,
343 XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 431 XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK),
344 &imap, &nimaps, &dfops); 432 &imap, &nimaps, &dfops);
345 if (error) 433 if (error)
346 goto out_trans_cancel; 434 goto out_trans_cancel;
347 435
436 /* Finish up. */
348 error = xfs_defer_finish(&tp, &dfops, NULL); 437 error = xfs_defer_finish(&tp, &dfops, NULL);
349 if (error) 438 if (error)
350 goto out_trans_cancel; 439 goto out_trans_cancel;
@@ -389,11 +478,12 @@ xfs_reflink_allocate_cow_range(
389 if (error) { 478 if (error) {
390 trace_xfs_reflink_allocate_cow_range_error(ip, error, 479 trace_xfs_reflink_allocate_cow_range_error(ip, error,
391 _RET_IP_); 480 _RET_IP_);
392 break; 481 return error;
393 } 482 }
394 } 483 }
395 484
396 return error; 485 /* Convert the CoW extents to regular. */
486 return xfs_reflink_convert_cow(ip, offset, count);
397} 487}
398 488
399/* 489/*
@@ -669,6 +759,16 @@ xfs_reflink_end_cow(
669 759
670 ASSERT(!isnullstartblock(got.br_startblock)); 760 ASSERT(!isnullstartblock(got.br_startblock));
671 761
762 /*
763 * Don't remap unwritten extents; these are
764 * speculatively preallocated CoW extents that have been
765 * allocated but have not yet been involved in a write.
766 */
767 if (got.br_state == XFS_EXT_UNWRITTEN) {
768 idx--;
769 goto next_extent;
770 }
771
672 /* Unmap the old blocks in the data fork. */ 772 /* Unmap the old blocks in the data fork. */
673 xfs_defer_init(&dfops, &firstfsb); 773 xfs_defer_init(&dfops, &firstfsb);
674 rlen = del.br_blockcount; 774 rlen = del.br_blockcount;
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 97ea9b487884..523e06d88f43 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -30,6 +30,8 @@ extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
30 struct xfs_bmbt_irec *imap, bool *shared); 30 struct xfs_bmbt_irec *imap, bool *shared);
31extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, 31extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip,
32 xfs_off_t offset, xfs_off_t count); 32 xfs_off_t offset, xfs_off_t count);
33extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
34 xfs_off_t count);
33extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, 35extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset,
34 struct xfs_bmbt_irec *imap, bool *need_alloc); 36 struct xfs_bmbt_irec *imap, bool *need_alloc);
35extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, 37extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 0907752be62d..b62764064af6 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3183,6 +3183,7 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class,
3183 __field(xfs_fileoff_t, lblk) 3183 __field(xfs_fileoff_t, lblk)
3184 __field(xfs_extlen_t, len) 3184 __field(xfs_extlen_t, len)
3185 __field(xfs_fsblock_t, pblk) 3185 __field(xfs_fsblock_t, pblk)
3186 __field(int, state)
3186 ), 3187 ),
3187 TP_fast_assign( 3188 TP_fast_assign(
3188 __entry->dev = VFS_I(ip)->i_sb->s_dev; 3189 __entry->dev = VFS_I(ip)->i_sb->s_dev;
@@ -3190,13 +3191,15 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class,
3190 __entry->lblk = irec->br_startoff; 3191 __entry->lblk = irec->br_startoff;
3191 __entry->len = irec->br_blockcount; 3192 __entry->len = irec->br_blockcount;
3192 __entry->pblk = irec->br_startblock; 3193 __entry->pblk = irec->br_startblock;
3194 __entry->state = irec->br_state;
3193 ), 3195 ),
3194 TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu", 3196 TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu st %d",
3195 MAJOR(__entry->dev), MINOR(__entry->dev), 3197 MAJOR(__entry->dev), MINOR(__entry->dev),
3196 __entry->ino, 3198 __entry->ino,
3197 __entry->lblk, 3199 __entry->lblk,
3198 __entry->len, 3200 __entry->len,
3199 __entry->pblk) 3201 __entry->pblk,
3202 __entry->state)
3200); 3203);
3201#define DEFINE_INODE_IREC_EVENT(name) \ 3204#define DEFINE_INODE_IREC_EVENT(name) \
3202DEFINE_EVENT(xfs_inode_irec_class, name, \ 3205DEFINE_EVENT(xfs_inode_irec_class, name, \
@@ -3345,6 +3348,7 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
3345DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); 3348DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
3346DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); 3349DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
3347DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); 3350DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
3351DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
3348 3352
3349DEFINE_RW_EVENT(xfs_reflink_reserve_cow); 3353DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
3350DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range); 3354DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range);