aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2013-10-29 07:11:44 -0400
committerBen Myers <bpm@sgi.com>2013-11-04 14:18:48 -0500
commit273203699f82667296e1f14344c5a5a6c4600470 (patch)
tree54f024ab7ad73e7165594d3a079bcb043caf49a6
parentbb86d21cba22a045b09d11b71decf5ca7c3d5def (diff)
xfs: xfs_remove deadlocks due to inverted AGF vs AGI lock ordering
Removing an inode from the namespace involves removing the directory entry and dropping the link count on the inode. Removing the directory entry can result in locking an AGF (directory blocks were freed) and removing a link count can result in placing the inode on an unlinked list which results in locking an AGI. The big problem here is that we have an ordering constraint on AGF and AGI locking - inode allocation locks the AGI, then can allocate a new extent for new inodes, locking the AGF after the AGI. Similarly, freeing the inode removes the inode from the unlinked list, requiring that we lock the AGI first, and then freeing the inode can result in an inode chunk being freed and hence freeing disk space requiring that we lock an AGF. Hence the ordering that is imposed by other parts of the code is AGI before AGF. This means we cannot remove the directory entry before we drop the inode reference count and put it on the unlinked list as this results in a lock order of AGF then AGI, and this can deadlock against inode allocation and freeing. Therefore we must drop the link counts before we remove the directory entry. This is still safe from a transactional point of view - it is not until we get to xfs_bmap_finish() that we have the possibility of multiple transactions in this operation. Hence as long as we remove the directory entry and drop the link count in the first transaction of the remove operation, there are no transactional constraints on the ordering here. Change the ordering of the operations in the xfs_remove() function to align the ordering of AGI and AGF locking to match that of the rest of the code. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Ben Myers <bpm@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
-rw-r--r--fs/xfs/xfs_inode.c72
1 files changed, 44 insertions, 28 deletions
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 326b94dbe159..001aa893ed59 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2404,6 +2404,33 @@ xfs_iunpin_wait(
2404 __xfs_iunpin_wait(ip); 2404 __xfs_iunpin_wait(ip);
2405} 2405}
2406 2406
2407/*
2408 * Removing an inode from the namespace involves removing the directory entry
2409 * and dropping the link count on the inode. Removing the directory entry can
2410 * result in locking an AGF (directory blocks were freed) and removing a link
2411 * count can result in placing the inode on an unlinked list which results in
2412 * locking an AGI.
2413 *
2414 * The big problem here is that we have an ordering constraint on AGF and AGI
2415 * locking - inode allocation locks the AGI, then can allocate a new extent for
2416 * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
2417 * removes the inode from the unlinked list, requiring that we lock the AGI
2418 * first, and then freeing the inode can result in an inode chunk being freed
2419 * and hence freeing disk space requiring that we lock an AGF.
2420 *
2421 * Hence the ordering that is imposed by other parts of the code is AGI before
2422 * AGF. This means we cannot remove the directory entry before we drop the inode
2423 * reference count and put it on the unlinked list as this results in a lock
2424 * order of AGF then AGI, and this can deadlock against inode allocation and
2425 * freeing. Therefore we must drop the link counts before we remove the
2426 * directory entry.
2427 *
2428 * This is still safe from a transactional point of view - it is not until we
2429 * get to xfs_bmap_finish() that we have the possibility of multiple
2430 * transactions in this operation. Hence as long as we remove the directory
2431 * entry and drop the link count in the first transaction of the remove
2432 * operation, there are no transactional constraints on the ordering here.
2433 */
2407int 2434int
2408xfs_remove( 2435xfs_remove(
2409 xfs_inode_t *dp, 2436 xfs_inode_t *dp,
@@ -2473,6 +2500,7 @@ xfs_remove(
2473 /* 2500 /*
2474 * If we're removing a directory perform some additional validation. 2501 * If we're removing a directory perform some additional validation.
2475 */ 2502 */
2503 cancel_flags |= XFS_TRANS_ABORT;
2476 if (is_dir) { 2504 if (is_dir) {
2477 ASSERT(ip->i_d.di_nlink >= 2); 2505 ASSERT(ip->i_d.di_nlink >= 2);
2478 if (ip->i_d.di_nlink != 2) { 2506 if (ip->i_d.di_nlink != 2) {
@@ -2483,31 +2511,16 @@ xfs_remove(
2483 error = XFS_ERROR(ENOTEMPTY); 2511 error = XFS_ERROR(ENOTEMPTY);
2484 goto out_trans_cancel; 2512 goto out_trans_cancel;
2485 } 2513 }
2486 }
2487 2514
2488 xfs_bmap_init(&free_list, &first_block); 2515 /* Drop the link from ip's "..". */
2489 error = xfs_dir_removename(tp, dp, name, ip->i_ino,
2490 &first_block, &free_list, resblks);
2491 if (error) {
2492 ASSERT(error != ENOENT);
2493 goto out_bmap_cancel;
2494 }
2495 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2496
2497 if (is_dir) {
2498 /*
2499 * Drop the link from ip's "..".
2500 */
2501 error = xfs_droplink(tp, dp); 2516 error = xfs_droplink(tp, dp);
2502 if (error) 2517 if (error)
2503 goto out_bmap_cancel; 2518 goto out_trans_cancel;
2504 2519
2505 /* 2520 /* Drop the "." link from ip to self. */
2506 * Drop the "." link from ip to self.
2507 */
2508 error = xfs_droplink(tp, ip); 2521 error = xfs_droplink(tp, ip);
2509 if (error) 2522 if (error)
2510 goto out_bmap_cancel; 2523 goto out_trans_cancel;
2511 } else { 2524 } else {
2512 /* 2525 /*
2513 * When removing a non-directory we need to log the parent 2526 * When removing a non-directory we need to log the parent
@@ -2516,20 +2529,24 @@ xfs_remove(
2516 */ 2529 */
2517 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 2530 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2518 } 2531 }
2532 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2519 2533
2520 /* 2534 /* Drop the link from dp to ip. */
2521 * Drop the link from dp to ip.
2522 */
2523 error = xfs_droplink(tp, ip); 2535 error = xfs_droplink(tp, ip);
2524 if (error) 2536 if (error)
2525 goto out_bmap_cancel; 2537 goto out_trans_cancel;
2526 2538
2527 /* 2539 /* Determine if this is the last link while the inode is locked */
2528 * Determine if this is the last link while
2529 * we are in the transaction.
2530 */
2531 link_zero = (ip->i_d.di_nlink == 0); 2540 link_zero = (ip->i_d.di_nlink == 0);
2532 2541
2542 xfs_bmap_init(&free_list, &first_block);
2543 error = xfs_dir_removename(tp, dp, name, ip->i_ino,
2544 &first_block, &free_list, resblks);
2545 if (error) {
2546 ASSERT(error != ENOENT);
2547 goto out_bmap_cancel;
2548 }
2549
2533 /* 2550 /*
2534 * If this is a synchronous mount, make sure that the 2551 * If this is a synchronous mount, make sure that the
2535 * remove transaction goes to disk before returning to 2552 * remove transaction goes to disk before returning to
@@ -2559,7 +2576,6 @@ xfs_remove(
2559 2576
2560 out_bmap_cancel: 2577 out_bmap_cancel:
2561 xfs_bmap_cancel(&free_list); 2578 xfs_bmap_cancel(&free_list);
2562 cancel_flags |= XFS_TRANS_ABORT;
2563 out_trans_cancel: 2579 out_trans_cancel:
2564 xfs_trans_cancel(tp, cancel_flags); 2580 xfs_trans_cancel(tp, cancel_flags);
2565 std_return: 2581 std_return: