author    Dave Chinner <david@fromorbit.com>  2015-04-16 08:13:18 -0400
committer Dave Chinner <david@fromorbit.com>  2015-04-16 08:13:18 -0400
commit    542c311813d5cb2e6f0dfa9557f41c829b8fb6a0 (patch)
tree      573c5644eb966e44112016c9ae86e80251326223 /fs
parent    6a63ef064b2444883ce8b68b0779d0c739d27204 (diff)
parent    0cefb29e6a63727bc7606c47fc538467594ef112 (diff)
Merge branch 'xfs-dio-extend-fix' into for-next

Conflicts:
	fs/xfs/xfs_file.c
Diffstat (limited to 'fs')
-rw-r--r--	fs/xfs/xfs_aops.c	270
-rw-r--r--	fs/xfs/xfs_file.c	46
-rw-r--r--	fs/xfs/xfs_trace.h	5
3 files changed, 239 insertions(+), 82 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3a9b7a1b8704..598b259fda04 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1233,6 +1233,117 @@ xfs_vm_releasepage(
 	return try_to_free_buffers(page);
 }
 
+/*
+ * When we map a DIO buffer, we may need to attach an ioend that describes the
+ * type of write IO we are doing. This passes to the completion function the
+ * operations it needs to perform. If the mapping is for an overwrite wholly
+ * within the EOF then we don't need an ioend and so we don't allocate one.
+ * This avoids the unnecessary overhead of allocating and freeing ioends for
+ * workloads that don't require transactions on IO completion.
+ *
+ * If we get multiple mappings in a single IO, we might be mapping different
+ * types. But because the direct IO can only have a single private pointer, we
+ * need to ensure that:
+ *
+ * a) i) the ioend spans the entire region of unwritten mappings; or
+ *    ii) the ioend spans all the mappings that cross or are beyond EOF; and
+ * b) if it contains unwritten extents, it is *permanently* marked as such
+ *
+ * We could do this by chaining ioends like buffered IO does, but we only
+ * actually get one IO completion callback from the direct IO, and that spans
+ * the entire IO regardless of how many mappings and IOs are needed to complete
+ * the DIO. There is only going to be one reference to the ioend and its life
+ * cycle is constrained by the DIO completion code. hence we don't need
+ * reference counting here.
+ */
+static void
+xfs_map_direct(
+	struct inode		*inode,
+	struct buffer_head	*bh_result,
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset)
+{
+	struct xfs_ioend	*ioend;
+	xfs_off_t		size = bh_result->b_size;
+	int			type;
+
+	if (ISUNWRITTEN(imap))
+		type = XFS_IO_UNWRITTEN;
+	else
+		type = XFS_IO_OVERWRITE;
+
+	trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
+
+	if (bh_result->b_private) {
+		ioend = bh_result->b_private;
+		ASSERT(ioend->io_size > 0);
+		ASSERT(offset >= ioend->io_offset);
+		if (offset + size > ioend->io_offset + ioend->io_size)
+			ioend->io_size = offset - ioend->io_offset + size;
+
+		if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
+			ioend->io_type = XFS_IO_UNWRITTEN;
+
+		trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
+					      ioend->io_size, ioend->io_type,
+					      imap);
+	} else if (type == XFS_IO_UNWRITTEN ||
+		   offset + size > i_size_read(inode)) {
+		ioend = xfs_alloc_ioend(inode, type);
+		ioend->io_offset = offset;
+		ioend->io_size = size;
+
+		bh_result->b_private = ioend;
+		set_buffer_defer_completion(bh_result);
+
+		trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
+					   imap);
+	} else {
+		trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
+					    imap);
+	}
+}
+
+/*
+ * If this is O_DIRECT or the mpage code calling tell them how large the mapping
+ * is, so that we can avoid repeated get_blocks calls.
+ *
+ * If the mapping spans EOF, then we have to break the mapping up as the mapping
+ * for blocks beyond EOF must be marked new so that sub block regions can be
+ * correctly zeroed. We can't do this for mappings within EOF unless the mapping
+ * was just allocated or is unwritten, otherwise the callers would overwrite
+ * existing data with zeros. Hence we have to split the mapping into a range up
+ * to and including EOF, and a second mapping for beyond EOF.
+ */
+static void
+xfs_map_trim_size(
+	struct inode		*inode,
+	sector_t		iblock,
+	struct buffer_head	*bh_result,
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset,
+	ssize_t			size)
+{
+	xfs_off_t		mapping_size;
+
+	mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
+	mapping_size <<= inode->i_blkbits;
+
+	ASSERT(mapping_size > 0);
+	if (mapping_size > size)
+		mapping_size = size;
+	if (offset < i_size_read(inode) &&
+	    offset + mapping_size >= i_size_read(inode)) {
+		/* limit mapping to block that spans EOF */
+		mapping_size = roundup_64(i_size_read(inode) - offset,
+					  1 << inode->i_blkbits);
+	}
+	if (mapping_size > LONG_MAX)
+		mapping_size = LONG_MAX;
+
+	bh_result->b_size = mapping_size;
+}
+
 STATIC int
 __xfs_get_blocks(
 	struct inode		*inode,
@@ -1321,31 +1432,37 @@ __xfs_get_blocks(
 
 			xfs_iunlock(ip, lockmode);
 		}
-
-		trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
+		trace_xfs_get_blocks_alloc(ip, offset, size,
+				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
+						   : XFS_IO_DELALLOC, &imap);
 	} else if (nimaps) {
-		trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
+		trace_xfs_get_blocks_found(ip, offset, size,
+				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
+						   : XFS_IO_OVERWRITE, &imap);
 		xfs_iunlock(ip, lockmode);
 	} else {
 		trace_xfs_get_blocks_notfound(ip, offset, size);
 		goto out_unlock;
 	}
 
+	/* trim mapping down to size requested */
+	if (direct || size > (1 << inode->i_blkbits))
+		xfs_map_trim_size(inode, iblock, bh_result,
+				  &imap, offset, size);
+
+	/*
+	 * For unwritten extents do not report a disk address in the buffered
+	 * read case (treat as if we're reading into a hole).
+	 */
 	if (imap.br_startblock != HOLESTARTBLOCK &&
-	    imap.br_startblock != DELAYSTARTBLOCK) {
-		/*
-		 * For unwritten extents do not report a disk address on
-		 * the read case (treat as if we're reading into a hole).
-		 */
-		if (create || !ISUNWRITTEN(&imap))
-			xfs_map_buffer(inode, bh_result, &imap, offset);
-		if (create && ISUNWRITTEN(&imap)) {
-			if (direct) {
-				bh_result->b_private = inode;
-				set_buffer_defer_completion(bh_result);
-			}
-			set_buffer_unwritten(bh_result);
-		}
+	    imap.br_startblock != DELAYSTARTBLOCK &&
+	    (create || !ISUNWRITTEN(&imap))) {
+		xfs_map_buffer(inode, bh_result, &imap, offset);
+		if (ISUNWRITTEN(&imap))
+			set_buffer_unwritten(bh_result);
+		/* direct IO needs special help */
+		if (create && direct)
+			xfs_map_direct(inode, bh_result, &imap, offset);
 	}
 
 	/*
@@ -1378,39 +1495,6 @@ __xfs_get_blocks(
 		}
 	}
 
-	/*
-	 * If this is O_DIRECT or the mpage code calling tell them how large
-	 * the mapping is, so that we can avoid repeated get_blocks calls.
-	 *
-	 * If the mapping spans EOF, then we have to break the mapping up as the
-	 * mapping for blocks beyond EOF must be marked new so that sub block
-	 * regions can be correctly zeroed. We can't do this for mappings within
-	 * EOF unless the mapping was just allocated or is unwritten, otherwise
-	 * the callers would overwrite existing data with zeros. Hence we have
-	 * to split the mapping into a range up to and including EOF, and a
-	 * second mapping for beyond EOF.
-	 */
-	if (direct || size > (1 << inode->i_blkbits)) {
-		xfs_off_t		mapping_size;
-
-		mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
-		mapping_size <<= inode->i_blkbits;
-
-		ASSERT(mapping_size > 0);
-		if (mapping_size > size)
-			mapping_size = size;
-		if (offset < i_size_read(inode) &&
-		    offset + mapping_size >= i_size_read(inode)) {
-			/* limit mapping to block that spans EOF */
-			mapping_size = roundup_64(i_size_read(inode) - offset,
-						  1 << inode->i_blkbits);
-		}
-		if (mapping_size > LONG_MAX)
-			mapping_size = LONG_MAX;
-
-		bh_result->b_size = mapping_size;
-	}
-
 	return 0;
 
 out_unlock:
@@ -1441,9 +1525,11 @@ xfs_get_blocks_direct(
 /*
  * Complete a direct I/O write request.
  *
- * If the private argument is non-NULL __xfs_get_blocks signals us that we
- * need to issue a transaction to convert the range from unwritten to written
- * extents.
+ * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
+ * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
+ * wholly within the EOF and so there is nothing for us to do. Note that in this
+ * case the completion can be called in interrupt context, whereas if we have an
+ * ioend we will always be called in task context (i.e. from a workqueue).
  */
 STATIC void
 xfs_end_io_direct_write(
@@ -1455,43 +1541,71 @@ xfs_end_io_direct_write(
 	struct inode		*inode = file_inode(iocb->ki_filp);
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ioend	*ioend = private;
 
-	if (XFS_FORCED_SHUTDOWN(mp))
+	trace_xfs_gbmap_direct_endio(ip, offset, size,
+				     ioend ? ioend->io_type : 0, NULL);
+
+	if (!ioend) {
+		ASSERT(offset + size <= i_size_read(inode));
 		return;
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		goto out_end_io;
 
 	/*
-	 * While the generic direct I/O code updates the inode size, it does
-	 * so only after the end_io handler is called, which means our
-	 * end_io handler thinks the on-disk size is outside the in-core
-	 * size. To prevent this just update it a little bit earlier here.
+	 * dio completion end_io functions are only called on writes if more
+	 * than 0 bytes was written.
 	 */
+	ASSERT(size > 0);
+
+	/*
+	 * The ioend only maps whole blocks, while the IO may be sector aligned.
+	 * Hence the ioend offset/size may not match the IO offset/size exactly.
+	 * Because we don't map overwrites within EOF into the ioend, the offset
+	 * may not match, but only if the endio spans EOF. Either way, write
+	 * the IO sizes into the ioend so that completion processing does the
+	 * right thing.
+	 */
+	ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
+	ioend->io_size = size;
+	ioend->io_offset = offset;
+
+	/*
+	 * The ioend tells us whether we are doing unwritten extent conversion
+	 * or an append transaction that updates the on-disk file size. These
+	 * cases are the only cases where we should *potentially* be needing
+	 * to update the VFS inode size.
+	 *
+	 * We need to update the in-core inode size here so that we don't end up
+	 * with the on-disk inode size being outside the in-core inode size. We
+	 * have no other method of updating EOF for AIO, so always do it here
+	 * if necessary.
+	 *
+	 * We need to lock the test/set EOF update as we can be racing with
+	 * other IO completions here to update the EOF. Failing to serialise
+	 * here can result in EOF moving backwards and Bad Things Happen when
+	 * that occurs.
+	 */
+	spin_lock(&ip->i_flags_lock);
 	if (offset + size > i_size_read(inode))
 		i_size_write(inode, offset + size);
+	spin_unlock(&ip->i_flags_lock);
 
 	/*
-	 * For direct I/O we do not know if we need to allocate blocks or not,
-	 * so we can't preallocate an append transaction, as that results in
-	 * nested reservations and log space deadlocks. Hence allocate the
-	 * transaction here. While this is sub-optimal and can block IO
-	 * completion for some time, we're stuck with doing it this way until
-	 * we can pass the ioend to the direct IO allocation callbacks and
-	 * avoid nesting that way.
+	 * If we are doing an append IO that needs to update the EOF on disk,
+	 * do the transaction reserve now so we can use common end io
+	 * processing. Stashing the error (if there is one) in the ioend will
+	 * result in the ioend processing passing on the error if it is
+	 * possible as we can't return it from here.
 	 */
-	if (private && size > 0) {
-		xfs_iomap_write_unwritten(ip, offset, size);
-	} else if (offset + size > ip->i_d.di_size) {
-		struct xfs_trans *tp;
-		int error;
-
-		tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
-		if (error) {
-			xfs_trans_cancel(tp, 0);
-			return;
-		}
+	if (ioend->io_type == XFS_IO_OVERWRITE)
+		ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
 
-		xfs_setfilesize(ip, tp, offset, size);
-	}
+out_end_io:
+	xfs_end_io(&ioend->io_work);
+	return;
 }
 
 STATIC ssize_t
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c203839cd5be..3a5d305e60c9 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -569,20 +569,41 @@ restart:
 	 * write. If zeroing is needed and we are currently holding the
 	 * iolock shared, we need to update it to exclusive which implies
 	 * having to redo all checks before.
+	 *
+	 * We need to serialise against EOF updates that occur in IO
+	 * completions here. We want to make sure that nobody is changing the
+	 * size while we do this check until we have placed an IO barrier (i.e.
+	 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
+	 * The spinlock effectively forms a memory barrier once we have the
+	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
+	 * and hence be able to correctly determine if we need to run zeroing.
 	 */
+	spin_lock(&ip->i_flags_lock);
 	if (*pos > i_size_read(inode)) {
 		bool	zero = false;
 
+		spin_unlock(&ip->i_flags_lock);
 		if (*iolock == XFS_IOLOCK_SHARED) {
 			xfs_rw_iunlock(ip, *iolock);
 			*iolock = XFS_IOLOCK_EXCL;
 			xfs_rw_ilock(ip, *iolock);
+
+			/*
+			 * We now have an IO submission barrier in place, but
+			 * AIO can do EOF updates during IO completion and hence
+			 * we now need to wait for all of them to drain. Non-AIO
+			 * DIO will have drained before we are given the
+			 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
+			 * no-op.
+			 */
+			inode_dio_wait(inode);
 			goto restart;
 		}
 		error = xfs_zero_eof(ip, *pos, i_size_read(inode), &zero);
 		if (error)
 			return error;
-	}
+	} else
+		spin_unlock(&ip->i_flags_lock);
 
 	/*
 	 * Updating the timestamps will grab the ilock again from
@@ -644,6 +665,8 @@ xfs_file_dio_aio_write(
 	int			iolock;
 	size_t			count = iov_iter_count(from);
 	loff_t			pos = iocb->ki_pos;
+	loff_t			end;
+	struct iov_iter		data;
 	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
 					mp->m_rtdev_targp : mp->m_ddev_targp;
 
@@ -683,10 +706,11 @@ xfs_file_dio_aio_write(
 	if (ret)
 		goto out;
 	iov_iter_truncate(from, count);
+	end = pos + count - 1;
 
 	if (mapping->nrpages) {
 		ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-						    pos, pos + count - 1);
+						    pos, end);
 		if (ret)
 			goto out;
 		/*
@@ -696,7 +720,7 @@ xfs_file_dio_aio_write(
 		 */
 		ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
 					pos >> PAGE_CACHE_SHIFT,
-					(pos + count - 1) >> PAGE_CACHE_SHIFT);
+					end >> PAGE_CACHE_SHIFT);
 		WARN_ON_ONCE(ret);
 		ret = 0;
 	}
@@ -713,8 +737,22 @@ xfs_file_dio_aio_write(
 	}
 
 	trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
-	ret = generic_file_direct_write(iocb, from, pos);
 
+	data = *from;
+	ret = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos);
+
+	/* see generic_file_direct_write() for why this is necessary */
+	if (mapping->nrpages) {
+		invalidate_inode_pages2_range(mapping,
+					      pos >> PAGE_CACHE_SHIFT,
+					      end >> PAGE_CACHE_SHIFT);
+	}
+
+	if (ret > 0) {
+		pos += ret;
+		iov_iter_advance(from, ret);
+		iocb->ki_pos = pos;
+	}
 out:
 	xfs_rw_iunlock(ip, iolock);
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index b2a45cc9eceb..615781bf4ee5 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1221,6 +1221,11 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
 DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio);
 
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),