aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorBrian Foster <bfoster@redhat.com>2017-01-28 02:22:56 -0500
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2017-04-08 03:30:30 -0400
commit4d725d7474dfddea00dca26f14b052c40d3444b0 (patch)
treec7d7f16449923718ee3a5d4166142dff6247db6a /fs
parent798b1dc5cbdfbbb3ac0d45177a1fc1dd511e3469 (diff)
xfs: sync eofblocks scans under iolock are livelock prone
commit c3155097ad89a956579bc305856a1f2878494e52 upstream.

The xfs_eofblocks.eof_scan_owner field is an internal field to facilitate invoking eofb scans from the kernel while under the iolock. This is necessary because the eofb scan acquires the iolock of each inode. Synchronous scans are invoked on certain buffered write failures while under iolock. In such cases, the scan owner indicates that the context for the scan already owns the particular iolock and prevents a double lock deadlock.

eofblocks scans while under iolock are still livelock prone in the event of multiple parallel scans, however. If multiple buffered writes to different inodes fail and invoke eofblocks scans at the same time, each scan avoids a deadlock with its own inode by virtue of the eof_scan_owner field, but will never be able to acquire the iolock of the inode from the parallel scan. Because the low free space scans are invoked with SYNC_WAIT, the scan will not return until it has processed every tagged inode and thus both scans will spin indefinitely on the iolock being held across the opposite scan. This problem can be reproduced reliably by generic/224 on systems with higher cpu counts (x16).

To avoid this problem, simplify the semantics of eofblocks scans to never invoke a scan while under iolock. This means that the buffered write context must drop the iolock before the scan. It must reacquire the lock before the write retry and also repeat the initial write checks, as the original state might no longer be valid once the iolock was dropped.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'fs')
-rw-r--r--fs/xfs/xfs_file.c13
-rw-r--r--fs/xfs/xfs_icache.c45
-rw-r--r--fs/xfs/xfs_icache.h2
3 files changed, 16 insertions, 44 deletions
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 9a5d64b5f35a..81b7fee40b54 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -675,8 +675,10 @@ xfs_file_buffered_aio_write(
675 struct xfs_inode *ip = XFS_I(inode); 675 struct xfs_inode *ip = XFS_I(inode);
676 ssize_t ret; 676 ssize_t ret;
677 int enospc = 0; 677 int enospc = 0;
678 int iolock = XFS_IOLOCK_EXCL; 678 int iolock;
679 679
680write_retry:
681 iolock = XFS_IOLOCK_EXCL;
680 xfs_rw_ilock(ip, iolock); 682 xfs_rw_ilock(ip, iolock);
681 683
682 ret = xfs_file_aio_write_checks(iocb, from, &iolock); 684 ret = xfs_file_aio_write_checks(iocb, from, &iolock);
@@ -686,7 +688,6 @@ xfs_file_buffered_aio_write(
686 /* We can write back this queue in page reclaim */ 688 /* We can write back this queue in page reclaim */
687 current->backing_dev_info = inode_to_bdi(inode); 689 current->backing_dev_info = inode_to_bdi(inode);
688 690
689write_retry:
690 trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); 691 trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
691 ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops); 692 ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
692 if (likely(ret >= 0)) 693 if (likely(ret >= 0))
@@ -702,18 +703,21 @@ write_retry:
702 * running at the same time. 703 * running at the same time.
703 */ 704 */
704 if (ret == -EDQUOT && !enospc) { 705 if (ret == -EDQUOT && !enospc) {
706 xfs_rw_iunlock(ip, iolock);
705 enospc = xfs_inode_free_quota_eofblocks(ip); 707 enospc = xfs_inode_free_quota_eofblocks(ip);
706 if (enospc) 708 if (enospc)
707 goto write_retry; 709 goto write_retry;
708 enospc = xfs_inode_free_quota_cowblocks(ip); 710 enospc = xfs_inode_free_quota_cowblocks(ip);
709 if (enospc) 711 if (enospc)
710 goto write_retry; 712 goto write_retry;
713 iolock = 0;
711 } else if (ret == -ENOSPC && !enospc) { 714 } else if (ret == -ENOSPC && !enospc) {
712 struct xfs_eofblocks eofb = {0}; 715 struct xfs_eofblocks eofb = {0};
713 716
714 enospc = 1; 717 enospc = 1;
715 xfs_flush_inodes(ip->i_mount); 718 xfs_flush_inodes(ip->i_mount);
716 eofb.eof_scan_owner = ip->i_ino; /* for locking */ 719
720 xfs_rw_iunlock(ip, iolock);
717 eofb.eof_flags = XFS_EOF_FLAGS_SYNC; 721 eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
718 xfs_icache_free_eofblocks(ip->i_mount, &eofb); 722 xfs_icache_free_eofblocks(ip->i_mount, &eofb);
719 goto write_retry; 723 goto write_retry;
@@ -721,7 +725,8 @@ write_retry:
721 725
722 current->backing_dev_info = NULL; 726 current->backing_dev_info = NULL;
723out: 727out:
724 xfs_rw_iunlock(ip, iolock); 728 if (iolock)
729 xfs_rw_iunlock(ip, iolock);
725 return ret; 730 return ret;
726} 731}
727 732
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index e4b382aabf9f..78708d001a63 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1326,11 +1326,8 @@ xfs_inode_free_eofblocks(
1326{ 1326{
1327 int ret = 0; 1327 int ret = 0;
1328 struct xfs_eofblocks *eofb = args; 1328 struct xfs_eofblocks *eofb = args;
1329 bool need_iolock = true;
1330 int match; 1329 int match;
1331 1330
1332 ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
1333
1334 if (!xfs_can_free_eofblocks(ip, false)) { 1331 if (!xfs_can_free_eofblocks(ip, false)) {
1335 /* inode could be preallocated or append-only */ 1332 /* inode could be preallocated or append-only */
1336 trace_xfs_inode_free_eofblocks_invalid(ip); 1333 trace_xfs_inode_free_eofblocks_invalid(ip);
@@ -1358,27 +1355,19 @@ xfs_inode_free_eofblocks(
1358 if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && 1355 if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
1359 XFS_ISIZE(ip) < eofb->eof_min_file_size) 1356 XFS_ISIZE(ip) < eofb->eof_min_file_size)
1360 return 0; 1357 return 0;
1361
1362 /*
1363 * A scan owner implies we already hold the iolock. Skip it here
1364 * to avoid deadlock.
1365 */
1366 if (eofb->eof_scan_owner == ip->i_ino)
1367 need_iolock = false;
1368 } 1358 }
1369 1359
1370 /* 1360 /*
1371 * If the caller is waiting, return -EAGAIN to keep the background 1361 * If the caller is waiting, return -EAGAIN to keep the background
1372 * scanner moving and revisit the inode in a subsequent pass. 1362 * scanner moving and revisit the inode in a subsequent pass.
1373 */ 1363 */
1374 if (need_iolock && !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 1364 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1375 if (flags & SYNC_WAIT) 1365 if (flags & SYNC_WAIT)
1376 ret = -EAGAIN; 1366 ret = -EAGAIN;
1377 return ret; 1367 return ret;
1378 } 1368 }
1379 ret = xfs_free_eofblocks(ip); 1369 ret = xfs_free_eofblocks(ip);
1380 if (need_iolock) 1370 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1381 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1382 1371
1383 return ret; 1372 return ret;
1384} 1373}
@@ -1425,15 +1414,10 @@ __xfs_inode_free_quota_eofblocks(
1425 struct xfs_eofblocks eofb = {0}; 1414 struct xfs_eofblocks eofb = {0};
1426 struct xfs_dquot *dq; 1415 struct xfs_dquot *dq;
1427 1416
1428 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1429
1430 /* 1417 /*
1431 * Set the scan owner to avoid a potential livelock. Otherwise, the scan 1418 * Run a sync scan to increase effectiveness and use the union filter to
1432 * can repeatedly trylock on the inode we're currently processing. We
1433 * run a sync scan to increase effectiveness and use the union filter to
1434 * cover all applicable quotas in a single scan. 1419 * cover all applicable quotas in a single scan.
1435 */ 1420 */
1436 eofb.eof_scan_owner = ip->i_ino;
1437 eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; 1421 eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
1438 1422
1439 if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { 1423 if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
@@ -1585,12 +1569,9 @@ xfs_inode_free_cowblocks(
1585{ 1569{
1586 int ret; 1570 int ret;
1587 struct xfs_eofblocks *eofb = args; 1571 struct xfs_eofblocks *eofb = args;
1588 bool need_iolock = true;
1589 int match; 1572 int match;
1590 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); 1573 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
1591 1574
1592 ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
1593
1594 /* 1575 /*
1595 * Just clear the tag if we have an empty cow fork or none at all. It's 1576 * Just clear the tag if we have an empty cow fork or none at all. It's
1596 * possible the inode was fully unshared since it was originally tagged. 1577 * possible the inode was fully unshared since it was originally tagged.
@@ -1623,28 +1604,16 @@ xfs_inode_free_cowblocks(
1623 if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && 1604 if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
1624 XFS_ISIZE(ip) < eofb->eof_min_file_size) 1605 XFS_ISIZE(ip) < eofb->eof_min_file_size)
1625 return 0; 1606 return 0;
1626
1627 /*
1628 * A scan owner implies we already hold the iolock. Skip it in
1629 * xfs_free_eofblocks() to avoid deadlock. This also eliminates
1630 * the possibility of EAGAIN being returned.
1631 */
1632 if (eofb->eof_scan_owner == ip->i_ino)
1633 need_iolock = false;
1634 } 1607 }
1635 1608
1636 /* Free the CoW blocks */ 1609 /* Free the CoW blocks */
1637 if (need_iolock) { 1610 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1638 xfs_ilock(ip, XFS_IOLOCK_EXCL); 1611 xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
1639 xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
1640 }
1641 1612
1642 ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); 1613 ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
1643 1614
1644 if (need_iolock) { 1615 xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
1645 xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); 1616 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1646 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1647 }
1648 1617
1649 return ret; 1618 return ret;
1650} 1619}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index a1e02f4708ab..8a7c849b4dea 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -27,7 +27,6 @@ struct xfs_eofblocks {
27 kgid_t eof_gid; 27 kgid_t eof_gid;
28 prid_t eof_prid; 28 prid_t eof_prid;
29 __u64 eof_min_file_size; 29 __u64 eof_min_file_size;
30 xfs_ino_t eof_scan_owner;
31}; 30};
32 31
33#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ 32#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
@@ -102,7 +101,6 @@ xfs_fs_eofblocks_from_user(
102 dst->eof_flags = src->eof_flags; 101 dst->eof_flags = src->eof_flags;
103 dst->eof_prid = src->eof_prid; 102 dst->eof_prid = src->eof_prid;
104 dst->eof_min_file_size = src->eof_min_file_size; 103 dst->eof_min_file_size = src->eof_min_file_size;
105 dst->eof_scan_owner = NULLFSINO;
106 104
107 dst->eof_uid = INVALID_UID; 105 dst->eof_uid = INVALID_UID;
108 if (src->eof_flags & XFS_EOF_FLAGS_UID) { 106 if (src->eof_flags & XFS_EOF_FLAGS_UID) {