aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDarrick J. Wong <darrick.wong@oracle.com>2017-02-02 18:14:02 -0500
committerDarrick J. Wong <darrick.wong@oracle.com>2017-02-02 18:14:02 -0500
commit5eda43000064a69a39fb7869cc63c9571535ad29 (patch)
treea0614f9fe159a22120a8ae26d787d2f554c9ecd2
parent05a630d76bd3f39baf0eecfa305bed2820796dee (diff)
xfs: mark speculative prealloc CoW fork extents unwritten
Christoph Hellwig pointed out that there's a potentially nasty race when
performing simultaneous nearby directio cow writes:

"Thread 1 writes a range from B to C

"                    B --------- C
                           p

"a little later thread 2 writes from A to B

"        A --------- B
               p

[editor's note: the 'p's denote cowextsize boundaries, which I added to
make this more clear]

"but the code preallocates beyond B into the range where thread
"1 has just written, but ->end_io hasn't been called yet.
"But once ->end_io is called thread 2 has already allocated
"up to the extent size hint into the write range of thread 1,
"so the end_io handler will splice the uninitialized blocks from
"that preallocation back into the file right after B."

We can avoid this race by ensuring that thread 1 cannot accidentally
remap the blocks that thread 2 allocated (as part of speculative
preallocation) as part of t2's write preparation in t1's end_io handler.
The way we make this happen is by taking advantage of the unwritten
extent flag as an intermediate step.

Recall that when we begin the process of writing data to shared blocks,
we create a delayed allocation extent in the CoW fork:

D: --RRRRRRSSSRRRRRRRR---
C: ------DDDDDDD---------

When a thread prepares to CoW some dirty data out to disk, it will now
convert the delalloc reservation into an /unwritten/ allocated extent
in the cow fork. The da conversion code tries to opportunistically
allocate as much of a (speculatively prealloc'd) extent as possible, so
we may end up allocating a larger extent than we're actually writing
out:

D: --RRRRRRSSSRRRRRRRR---
U: ------UUUUUUU---------

Next, we convert only the part of the extent that we're actively
planning to write to normal (i.e. not unwritten) status:

D: --RRRRRRSSSRRRRRRRR---
U: ------UURRUUU---------

If the write succeeds, the end_cow function will now scan the relevant
range of the CoW fork for real extents and remap only the real extents
into the data fork:

D: --RRRRRRRRSRRRRRRRR---
U: ------UU--UUU---------

This ensures that we never obliterate valid data fork extents with
unwritten blocks from the CoW fork.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
-rw-r--r--fs/xfs/xfs_aops.c6
-rw-r--r--fs/xfs/xfs_iomap.c2
-rw-r--r--fs/xfs/xfs_reflink.c116
-rw-r--r--fs/xfs/xfs_reflink.h2
-rw-r--r--fs/xfs/xfs_trace.h8
5 files changed, 123 insertions, 11 deletions
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 631e7c0e0a29..1ff9df7a3ce8 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -481,6 +481,12 @@ xfs_submit_ioend(
481 struct xfs_ioend *ioend, 481 struct xfs_ioend *ioend,
482 int status) 482 int status)
483{ 483{
484 /* Convert CoW extents to regular */
485 if (!status && ioend->io_type == XFS_IO_COW) {
486 status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
487 ioend->io_offset, ioend->io_size);
488 }
489
484 /* Reserve log space if we might write beyond the on-disk inode size. */ 490 /* Reserve log space if we might write beyond the on-disk inode size. */
485 if (!status && 491 if (!status &&
486 ioend->io_type != XFS_IO_UNWRITTEN && 492 ioend->io_type != XFS_IO_UNWRITTEN &&
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 25ed98324b27..84fb8788431b 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -685,7 +685,7 @@ xfs_iomap_write_allocate(
685 int nres; 685 int nres;
686 686
687 if (whichfork == XFS_COW_FORK) 687 if (whichfork == XFS_COW_FORK)
688 flags |= XFS_BMAPI_COWFORK; 688 flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
689 689
690 /* 690 /*
691 * Make sure that the dquots are there. 691 * Make sure that the dquots are there.
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 07593a362cd0..8c8c4f4676da 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -82,11 +82,22 @@
82 * mappings are a reservation against the free space in the filesystem; 82 * mappings are a reservation against the free space in the filesystem;
83 * adjacent mappings can also be combined into fewer larger mappings. 83 * adjacent mappings can also be combined into fewer larger mappings.
84 * 84 *
85 * As an optimization, the CoW extent size hint (cowextsz) creates
86 * outsized aligned delalloc reservations in the hope of landing out of
87 * order nearby CoW writes in a single extent on disk, thereby reducing
88 * fragmentation and improving future performance.
89 *
90 * D: --RRRRRRSSSRRRRRRRR--- (data fork)
91 * C: ------DDDDDDD--------- (CoW fork)
92 *
85 * When dirty pages are being written out (typically in writepage), the 93 * When dirty pages are being written out (typically in writepage), the
86 * delalloc reservations are converted into real mappings by allocating 94 * delalloc reservations are converted into unwritten mappings by
87 * blocks and replacing the delalloc mapping with real ones. A delalloc 95 * allocating blocks and replacing the delalloc mapping with real ones.
88 * mapping can be replaced by several real ones if the free space is 96 * A delalloc mapping can be replaced by several unwritten ones if the
89 * fragmented. 97 * free space is fragmented.
98 *
99 * D: --RRRRRRSSSRRRRRRRR---
100 * C: ------UUUUUUU---------
90 * 101 *
91 * We want to adapt the delalloc mechanism for copy-on-write, since the 102 * We want to adapt the delalloc mechanism for copy-on-write, since the
92 * write paths are similar. The first two steps (creating the reservation 103 * write paths are similar. The first two steps (creating the reservation
@@ -101,13 +112,29 @@
101 * Block-aligned directio writes will use the same mechanism as buffered 112 * Block-aligned directio writes will use the same mechanism as buffered
102 * writes. 113 * writes.
103 * 114 *
115 * Just prior to submitting the actual disk write requests, we convert
116 * the extents representing the range of the file actually being written
117 * (as opposed to extra pieces created for the cowextsize hint) to real
118 * extents. This will become important in the next step:
119 *
120 * D: --RRRRRRSSSRRRRRRRR---
121 * C: ------UUrrUUU---------
122 *
104 * CoW remapping must be done after the data block write completes, 123 * CoW remapping must be done after the data block write completes,
105 * because we don't want to destroy the old data fork map until we're sure 124 * because we don't want to destroy the old data fork map until we're sure
106 * the new block has been written. Since the new mappings are kept in a 125 * the new block has been written. Since the new mappings are kept in a
107 * separate fork, we can simply iterate these mappings to find the ones 126 * separate fork, we can simply iterate these mappings to find the ones
108 * that cover the file blocks that we just CoW'd. For each extent, simply 127 * that cover the file blocks that we just CoW'd. For each extent, simply
109 * unmap the corresponding range in the data fork, map the new range into 128 * unmap the corresponding range in the data fork, map the new range into
110 * the data fork, and remove the extent from the CoW fork. 129 * the data fork, and remove the extent from the CoW fork. Because of
130 * the presence of the cowextsize hint, however, we must be careful
131 * only to remap the blocks that we've actually written out -- we must
132 * never remap delalloc reservations nor CoW staging blocks that have
133 * yet to be written. This corresponds exactly to the real extents in
134 * the CoW fork:
135 *
136 * D: --RRRRRRrrSRRRRRRRR---
137 * C: ------UU--UUU---------
111 * 138 *
112 * Since the remapping operation can be applied to an arbitrary file 139 * Since the remapping operation can be applied to an arbitrary file
113 * range, we record the need for the remap step as a flag in the ioend 140 * range, we record the need for the remap step as a flag in the ioend
@@ -296,6 +323,65 @@ xfs_reflink_reserve_cow(
296 return 0; 323 return 0;
297} 324}
298 325
326/* Convert part of an unwritten CoW extent to a real one. */
327STATIC int
328xfs_reflink_convert_cow_extent(
329 struct xfs_inode *ip,
330 struct xfs_bmbt_irec *imap,
331 xfs_fileoff_t offset_fsb,
332 xfs_filblks_t count_fsb,
333 struct xfs_defer_ops *dfops)
334{
335 struct xfs_bmbt_irec irec = *imap;
336 xfs_fsblock_t first_block;
337 int nimaps = 1;
338
339 if (imap->br_state == XFS_EXT_NORM)
340 return 0;
341
342 xfs_trim_extent(&irec, offset_fsb, count_fsb);
343 trace_xfs_reflink_convert_cow(ip, &irec);
344 if (irec.br_blockcount == 0)
345 return 0;
346 return xfs_bmapi_write(NULL, ip, irec.br_startoff, irec.br_blockcount,
347 XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, &first_block,
348 0, &irec, &nimaps, dfops);
349}
350
351/* Convert all of the unwritten CoW extents in a file's range to real ones. */
352int
353xfs_reflink_convert_cow(
354 struct xfs_inode *ip,
355 xfs_off_t offset,
356 xfs_off_t count)
357{
358 struct xfs_bmbt_irec got;
359 struct xfs_defer_ops dfops;
360 struct xfs_mount *mp = ip->i_mount;
361 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
362 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
363 xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
364 xfs_extnum_t idx;
365 bool found;
366 int error;
367
368 xfs_ilock(ip, XFS_ILOCK_EXCL);
369
370 /* Convert all the extents to real from unwritten. */
371 for (found = xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got);
372 found && got.br_startoff < end_fsb;
373 found = xfs_iext_get_extent(ifp, ++idx, &got)) {
374 error = xfs_reflink_convert_cow_extent(ip, &got, offset_fsb,
375 end_fsb - offset_fsb, &dfops);
376 if (error)
377 break;
378 }
379
380 /* Finish up. */
381 xfs_iunlock(ip, XFS_ILOCK_EXCL);
382 return error;
383}
384
299/* Allocate all CoW reservations covering a range of blocks in a file. */ 385/* Allocate all CoW reservations covering a range of blocks in a file. */
300static int 386static int
301__xfs_reflink_allocate_cow( 387__xfs_reflink_allocate_cow(
@@ -328,6 +414,7 @@ __xfs_reflink_allocate_cow(
328 goto out_unlock; 414 goto out_unlock;
329 ASSERT(nimaps == 1); 415 ASSERT(nimaps == 1);
330 416
417 /* Make sure there's a CoW reservation for it. */
331 error = xfs_reflink_reserve_cow(ip, &imap, &shared); 418 error = xfs_reflink_reserve_cow(ip, &imap, &shared);
332 if (error) 419 if (error)
333 goto out_trans_cancel; 420 goto out_trans_cancel;
@@ -337,14 +424,16 @@ __xfs_reflink_allocate_cow(
337 goto out_trans_cancel; 424 goto out_trans_cancel;
338 } 425 }
339 426
427 /* Allocate the entire reservation as unwritten blocks. */
340 xfs_trans_ijoin(tp, ip, 0); 428 xfs_trans_ijoin(tp, ip, 0);
341 error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount, 429 error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount,
342 XFS_BMAPI_COWFORK, &first_block, 430 XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, &first_block,
343 XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 431 XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK),
344 &imap, &nimaps, &dfops); 432 &imap, &nimaps, &dfops);
345 if (error) 433 if (error)
346 goto out_trans_cancel; 434 goto out_trans_cancel;
347 435
436 /* Finish up. */
348 error = xfs_defer_finish(&tp, &dfops, NULL); 437 error = xfs_defer_finish(&tp, &dfops, NULL);
349 if (error) 438 if (error)
350 goto out_trans_cancel; 439 goto out_trans_cancel;
@@ -389,11 +478,12 @@ xfs_reflink_allocate_cow_range(
389 if (error) { 478 if (error) {
390 trace_xfs_reflink_allocate_cow_range_error(ip, error, 479 trace_xfs_reflink_allocate_cow_range_error(ip, error,
391 _RET_IP_); 480 _RET_IP_);
392 break; 481 return error;
393 } 482 }
394 } 483 }
395 484
396 return error; 485 /* Convert the CoW extents to regular. */
486 return xfs_reflink_convert_cow(ip, offset, count);
397} 487}
398 488
399/* 489/*
@@ -641,6 +731,16 @@ xfs_reflink_end_cow(
641 731
642 ASSERT(!isnullstartblock(got.br_startblock)); 732 ASSERT(!isnullstartblock(got.br_startblock));
643 733
734 /*
735 * Don't remap unwritten extents; these are
736 * speculatively preallocated CoW extents that have been
737 * allocated but have not yet been involved in a write.
738 */
739 if (got.br_state == XFS_EXT_UNWRITTEN) {
740 idx--;
741 goto next_extent;
742 }
743
644 /* Unmap the old blocks in the data fork. */ 744 /* Unmap the old blocks in the data fork. */
645 xfs_defer_init(&dfops, &firstfsb); 745 xfs_defer_init(&dfops, &firstfsb);
646 rlen = del.br_blockcount; 746 rlen = del.br_blockcount;
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index aa6a4d64bd35..1583c4727cb1 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -30,6 +30,8 @@ extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
30 struct xfs_bmbt_irec *imap, bool *shared); 30 struct xfs_bmbt_irec *imap, bool *shared);
31extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, 31extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip,
32 xfs_off_t offset, xfs_off_t count); 32 xfs_off_t offset, xfs_off_t count);
33extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
34 xfs_off_t count);
33extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, 35extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset,
34 struct xfs_bmbt_irec *imap); 36 struct xfs_bmbt_irec *imap);
35extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, 37extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 643222784c3b..9e9bb9538bb6 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3088,6 +3088,7 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class,
3088 __field(xfs_fileoff_t, lblk) 3088 __field(xfs_fileoff_t, lblk)
3089 __field(xfs_extlen_t, len) 3089 __field(xfs_extlen_t, len)
3090 __field(xfs_fsblock_t, pblk) 3090 __field(xfs_fsblock_t, pblk)
3091 __field(int, state)
3091 ), 3092 ),
3092 TP_fast_assign( 3093 TP_fast_assign(
3093 __entry->dev = VFS_I(ip)->i_sb->s_dev; 3094 __entry->dev = VFS_I(ip)->i_sb->s_dev;
@@ -3095,13 +3096,15 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class,
3095 __entry->lblk = irec->br_startoff; 3096 __entry->lblk = irec->br_startoff;
3096 __entry->len = irec->br_blockcount; 3097 __entry->len = irec->br_blockcount;
3097 __entry->pblk = irec->br_startblock; 3098 __entry->pblk = irec->br_startblock;
3099 __entry->state = irec->br_state;
3098 ), 3100 ),
3099 TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu", 3101 TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu st %d",
3100 MAJOR(__entry->dev), MINOR(__entry->dev), 3102 MAJOR(__entry->dev), MINOR(__entry->dev),
3101 __entry->ino, 3103 __entry->ino,
3102 __entry->lblk, 3104 __entry->lblk,
3103 __entry->len, 3105 __entry->len,
3104 __entry->pblk) 3106 __entry->pblk,
3107 __entry->state)
3105); 3108);
3106#define DEFINE_INODE_IREC_EVENT(name) \ 3109#define DEFINE_INODE_IREC_EVENT(name) \
3107DEFINE_EVENT(xfs_inode_irec_class, name, \ 3110DEFINE_EVENT(xfs_inode_irec_class, name, \
@@ -3241,6 +3244,7 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
3241DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); 3244DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
3242DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); 3245DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
3243DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); 3246DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
3247DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
3244 3248
3245DEFINE_RW_EVENT(xfs_reflink_reserve_cow); 3249DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
3246DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range); 3250DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range);