aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2015-11-02 20:37:00 -0500
committerDave Chinner <david@fromorbit.com>2015-11-02 20:37:00 -0500
commit1ca191576fc862b4766f58e41aa362b28a7c1866 (patch)
tree07b9e420aae07600a11da6bac81477e15194cdfc
parent3fbbbea34bac049c0b5938dc065f7d8ee1ef7e67 (diff)
xfs: Don't use unwritten extents for DAX
DAX has a page fault serialisation problem with block allocation. Because it allows concurrent page faults and does not have a page lock to serialise faults to the same page, it can get two concurrent faults to the page that race. When two read faults race, this isn't a huge problem as the data underlying the page is not changing and so "detect and drop" works just fine. The issues are to do with write faults. When two write faults occur, we serialise block allocation in get_blocks() so only one faul will allocate the extent. It will, however, be marked as an unwritten extent, and that is where the problem lies - the DAX fault code cannot differentiate between a block that was just allocated and a block that was preallocated and needs zeroing. The result is that both write faults end up zeroing the block and attempting to convert it back to written. The problem is that the first fault can zero and convert before the second fault starts zeroing, resulting in the zeroing for the second fault overwriting the data that the first fault wrote with zeros. The second fault then attempts to convert the unwritten extent, which is then a no-op because it's already written. Data loss occurs as a result of this race. Because there is no sane locking construct in the page fault code that we can use for serialisation across the page faults, we need to ensure block allocation and zeroing occurs atomically in the filesystem. This means we can still take concurrent page faults and the only time they will serialise is in the filesystem mapping/allocation callback. The page fault code will always see written, initialised extents, so we will be able to remove the unwritten extent handling from the DAX code when all filesystems are converted. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
-rw-r--r--fs/dax.c5
-rw-r--r--fs/xfs/xfs_aops.c13
-rw-r--r--fs/xfs/xfs_iomap.c23
3 files changed, 35 insertions, 6 deletions
diff --git a/fs/dax.c b/fs/dax.c
index 7ae6df7ea1d2..74033ad1bc92 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -29,6 +29,11 @@
29#include <linux/uio.h> 29#include <linux/uio.h>
30#include <linux/vmstat.h> 30#include <linux/vmstat.h>
31 31
32/*
33 * dax_clear_blocks() is called from within transaction context from XFS,
34 * and hence this means the stack from this point must follow GFP_NOFS
35 * semantics for all operations.
36 */
32int dax_clear_blocks(struct inode *inode, sector_t block, long size) 37int dax_clear_blocks(struct inode *inode, sector_t block, long size)
33{ 38{
34 struct block_device *bdev = inode->i_sb->s_bdev; 39 struct block_device *bdev = inode->i_sb->s_bdev;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e747d6ad5d18..df3dabd469b9 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1284,15 +1284,12 @@ xfs_map_direct(
1284 1284
1285 trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap); 1285 trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
1286 1286
1287 /* XXX: preparation for removing unwritten extents in DAX */
1288#if 0
1289 if (dax_fault) { 1287 if (dax_fault) {
1290 ASSERT(type == XFS_IO_OVERWRITE); 1288 ASSERT(type == XFS_IO_OVERWRITE);
1291 trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type, 1289 trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
1292 imap); 1290 imap);
1293 return; 1291 return;
1294 } 1292 }
1295#endif
1296 1293
1297 if (bh_result->b_private) { 1294 if (bh_result->b_private) {
1298 ioend = bh_result->b_private; 1295 ioend = bh_result->b_private;
@@ -1420,10 +1417,12 @@ __xfs_get_blocks(
1420 if (error) 1417 if (error)
1421 goto out_unlock; 1418 goto out_unlock;
1422 1419
1420 /* for DAX, we convert unwritten extents directly */
1423 if (create && 1421 if (create &&
1424 (!nimaps || 1422 (!nimaps ||
1425 (imap.br_startblock == HOLESTARTBLOCK || 1423 (imap.br_startblock == HOLESTARTBLOCK ||
1426 imap.br_startblock == DELAYSTARTBLOCK))) { 1424 imap.br_startblock == DELAYSTARTBLOCK) ||
1425 (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
1427 if (direct || xfs_get_extsz_hint(ip)) { 1426 if (direct || xfs_get_extsz_hint(ip)) {
1428 /* 1427 /*
1429 * Drop the ilock in preparation for starting the block 1428 * Drop the ilock in preparation for starting the block
@@ -1468,6 +1467,12 @@ __xfs_get_blocks(
1468 goto out_unlock; 1467 goto out_unlock;
1469 } 1468 }
1470 1469
1470 if (IS_DAX(inode) && create) {
1471 ASSERT(!ISUNWRITTEN(&imap));
1472 /* zeroing is not needed at a higher layer */
1473 new = 0;
1474 }
1475
1471 /* trim mapping down to size requested */ 1476 /* trim mapping down to size requested */
1472 if (direct || size > (1 << inode->i_blkbits)) 1477 if (direct || size > (1 << inode->i_blkbits))
1473 xfs_map_trim_size(inode, iblock, bh_result, 1478 xfs_map_trim_size(inode, iblock, bh_result,
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 1f86033171c8..b48c6b525e77 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -131,6 +131,7 @@ xfs_iomap_write_direct(
131 uint qblocks, resblks, resrtextents; 131 uint qblocks, resblks, resrtextents;
132 int committed; 132 int committed;
133 int error; 133 int error;
134 int bmapi_flags = XFS_BMAPI_PREALLOC;
134 135
135 error = xfs_qm_dqattach(ip, 0); 136 error = xfs_qm_dqattach(ip, 0);
136 if (error) 137 if (error)
@@ -177,6 +178,23 @@ xfs_iomap_write_direct(
177 * Allocate and setup the transaction 178 * Allocate and setup the transaction
178 */ 179 */
179 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); 180 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
181
182 /*
183 * For DAX, we do not allocate unwritten extents, but instead we zero
184 * the block before we commit the transaction. Ideally we'd like to do
185 * this outside the transaction context, but if we commit and then crash
186 * we may not have zeroed the blocks and this will be exposed on
187 * recovery of the allocation. Hence we must zero before commit.
188 * Further, if we are mapping unwritten extents here, we need to zero
189 * and convert them to written so that we don't need an unwritten extent
190 * callback for DAX. This also means that we need to be able to dip into
191 * the reserve block pool if there is no space left but we need to do
192 * unwritten extent conversion.
193 */
194 if (IS_DAX(VFS_I(ip))) {
195 bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
196 tp->t_flags |= XFS_TRANS_RESERVE;
197 }
180 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, 198 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
181 resblks, resrtextents); 199 resblks, resrtextents);
182 /* 200 /*
@@ -202,8 +220,8 @@ xfs_iomap_write_direct(
202 xfs_bmap_init(&free_list, &firstfsb); 220 xfs_bmap_init(&free_list, &firstfsb);
203 nimaps = 1; 221 nimaps = 1;
204 error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, 222 error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
205 XFS_BMAPI_PREALLOC, &firstfsb, 0, 223 bmapi_flags, &firstfsb, 0, imap,
206 imap, &nimaps, &free_list); 224 &nimaps, &free_list);
207 if (error) 225 if (error)
208 goto out_bmap_cancel; 226 goto out_bmap_cancel;
209 227
@@ -213,6 +231,7 @@ xfs_iomap_write_direct(
213 error = xfs_bmap_finish(&tp, &free_list, &committed); 231 error = xfs_bmap_finish(&tp, &free_list, &committed);
214 if (error) 232 if (error)
215 goto out_bmap_cancel; 233 goto out_bmap_cancel;
234
216 error = xfs_trans_commit(tp); 235 error = xfs_trans_commit(tp);
217 if (error) 236 if (error)
218 goto out_unlock; 237 goto out_unlock;