diff options
author | Mark Fasheh <mark.fasheh@oracle.com> | 2006-02-22 20:35:08 -0500 |
---|---|---|
committer | Mark Fasheh <mark.fasheh@oracle.com> | 2006-03-01 14:32:41 -0500 |
commit | b4df6ed8db0c387d38292e31f00adc4cd297ed5a (patch) | |
tree | d7dcaeecfa55b3fd9d6c4844d90e07759182f845 /fs | |
parent | 895928b8380cc697ac56e9732cedf549c0a4f79c (diff) |
[PATCH] ocfs2: fix orphan recovery deadlock
Orphan dir recovery can deadlock with another process in
ocfs2_delete_inode() in some corner cases. Fix this by tracking recovery
state more closely and allowing it to handle inode wipes which might
deadlock.
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/ocfs2/heartbeat.c | 1 | ||||
-rw-r--r-- | fs/ocfs2/inode.c | 46 | ||||
-rw-r--r-- | fs/ocfs2/journal.c | 124 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2.h | 4 | ||||
-rw-r--r-- | fs/ocfs2/super.c | 11 |
5 files changed, 154 insertions, 32 deletions
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index 0bbd22f46c80..cbfd45a97a63 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c | |||
@@ -67,6 +67,7 @@ void ocfs2_init_node_maps(struct ocfs2_super *osb) | |||
67 | ocfs2_node_map_init(&osb->mounted_map); | 67 | ocfs2_node_map_init(&osb->mounted_map); |
68 | ocfs2_node_map_init(&osb->recovery_map); | 68 | ocfs2_node_map_init(&osb->recovery_map); |
69 | ocfs2_node_map_init(&osb->umount_map); | 69 | ocfs2_node_map_init(&osb->umount_map); |
70 | ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); | ||
70 | } | 71 | } |
71 | 72 | ||
72 | static void ocfs2_do_node_down(int node_num, | 73 | static void ocfs2_do_node_down(int node_num, |
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 8122489c5762..315472a5c192 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include "dlmglue.h" | 41 | #include "dlmglue.h" |
42 | #include "extent_map.h" | 42 | #include "extent_map.h" |
43 | #include "file.h" | 43 | #include "file.h" |
44 | #include "heartbeat.h" | ||
44 | #include "inode.h" | 45 | #include "inode.h" |
45 | #include "journal.h" | 46 | #include "journal.h" |
46 | #include "namei.h" | 47 | #include "namei.h" |
@@ -544,6 +545,42 @@ bail: | |||
544 | return status; | 545 | return status; |
545 | } | 546 | } |
546 | 547 | ||
548 | /* | ||
549 | * Serialize with orphan dir recovery. If the process doing | ||
550 | * recovery on this orphan dir does an iget() with the dir | ||
551 | * i_mutex held, we'll deadlock here. Instead we detect this | ||
552 | * and exit early - recovery will wipe this inode for us. | ||
553 | */ | ||
554 | static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb, | ||
555 | int slot) | ||
556 | { | ||
557 | int ret = 0; | ||
558 | |||
559 | spin_lock(&osb->osb_lock); | ||
560 | if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) { | ||
561 | mlog(0, "Recovery is happening on orphan dir %d, will skip " | ||
562 | "this inode\n", slot); | ||
563 | ret = -EDEADLK; | ||
564 | goto out; | ||
565 | } | ||
566 | /* This signals to the orphan recovery process that it should | ||
567 | * wait for us to handle the wipe. */ | ||
568 | osb->osb_orphan_wipes[slot]++; | ||
569 | out: | ||
570 | spin_unlock(&osb->osb_lock); | ||
571 | return ret; | ||
572 | } | ||
573 | |||
574 | static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb, | ||
575 | int slot) | ||
576 | { | ||
577 | spin_lock(&osb->osb_lock); | ||
578 | osb->osb_orphan_wipes[slot]--; | ||
579 | spin_unlock(&osb->osb_lock); | ||
580 | |||
581 | wake_up(&osb->osb_wipe_event); | ||
582 | } | ||
583 | |||
547 | static int ocfs2_wipe_inode(struct inode *inode, | 584 | static int ocfs2_wipe_inode(struct inode *inode, |
548 | struct buffer_head *di_bh) | 585 | struct buffer_head *di_bh) |
549 | { | 586 | { |
@@ -555,6 +592,11 @@ static int ocfs2_wipe_inode(struct inode *inode, | |||
555 | /* We've already voted on this so it should be readonly - no | 592 | /* We've already voted on this so it should be readonly - no |
556 | * spinlock needed. */ | 593 | * spinlock needed. */ |
557 | orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; | 594 | orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; |
595 | |||
596 | status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); | ||
597 | if (status) | ||
598 | return status; | ||
599 | |||
558 | orphan_dir_inode = ocfs2_get_system_file_inode(osb, | 600 | orphan_dir_inode = ocfs2_get_system_file_inode(osb, |
559 | ORPHAN_DIR_SYSTEM_INODE, | 601 | ORPHAN_DIR_SYSTEM_INODE, |
560 | orphaned_slot); | 602 | orphaned_slot); |
@@ -597,6 +639,7 @@ bail_unlock_dir: | |||
597 | brelse(orphan_dir_bh); | 639 | brelse(orphan_dir_bh); |
598 | bail: | 640 | bail: |
599 | iput(orphan_dir_inode); | 641 | iput(orphan_dir_inode); |
642 | ocfs2_signal_wipe_completion(osb, orphaned_slot); | ||
600 | 643 | ||
601 | return status; | 644 | return status; |
602 | } | 645 | } |
@@ -822,7 +865,8 @@ void ocfs2_delete_inode(struct inode *inode) | |||
822 | 865 | ||
823 | status = ocfs2_wipe_inode(inode, di_bh); | 866 | status = ocfs2_wipe_inode(inode, di_bh); |
824 | if (status < 0) { | 867 | if (status < 0) { |
825 | mlog_errno(status); | 868 | if (status != -EDEADLK) |
869 | mlog_errno(status); | ||
826 | goto bail_unlock_inode; | 870 | goto bail_unlock_inode; |
827 | } | 871 | } |
828 | 872 | ||
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index d329c9df90ae..4be801f4559b 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
@@ -1408,21 +1408,17 @@ bail: | |||
1408 | return status; | 1408 | return status; |
1409 | } | 1409 | } |
1410 | 1410 | ||
1411 | static int ocfs2_recover_orphans(struct ocfs2_super *osb, | 1411 | static int ocfs2_queue_orphans(struct ocfs2_super *osb, |
1412 | int slot) | 1412 | int slot, |
1413 | struct inode **head) | ||
1413 | { | 1414 | { |
1414 | int status = 0; | 1415 | int status; |
1415 | int have_disk_lock = 0; | ||
1416 | struct inode *inode = NULL; | ||
1417 | struct inode *iter; | ||
1418 | struct inode *orphan_dir_inode = NULL; | 1416 | struct inode *orphan_dir_inode = NULL; |
1417 | struct inode *iter; | ||
1419 | unsigned long offset, blk, local; | 1418 | unsigned long offset, blk, local; |
1420 | struct buffer_head *bh = NULL; | 1419 | struct buffer_head *bh = NULL; |
1421 | struct ocfs2_dir_entry *de; | 1420 | struct ocfs2_dir_entry *de; |
1422 | struct super_block *sb = osb->sb; | 1421 | struct super_block *sb = osb->sb; |
1423 | struct ocfs2_inode_info *oi; | ||
1424 | |||
1425 | mlog(0, "Recover inodes from orphan dir in slot %d\n", slot); | ||
1426 | 1422 | ||
1427 | orphan_dir_inode = ocfs2_get_system_file_inode(osb, | 1423 | orphan_dir_inode = ocfs2_get_system_file_inode(osb, |
1428 | ORPHAN_DIR_SYSTEM_INODE, | 1424 | ORPHAN_DIR_SYSTEM_INODE, |
@@ -1430,17 +1426,15 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, | |||
1430 | if (!orphan_dir_inode) { | 1426 | if (!orphan_dir_inode) { |
1431 | status = -ENOENT; | 1427 | status = -ENOENT; |
1432 | mlog_errno(status); | 1428 | mlog_errno(status); |
1433 | goto out; | 1429 | return status; |
1434 | } | 1430 | } |
1435 | 1431 | ||
1436 | mutex_lock(&orphan_dir_inode->i_mutex); | 1432 | mutex_lock(&orphan_dir_inode->i_mutex); |
1437 | status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0); | 1433 | status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0); |
1438 | if (status < 0) { | 1434 | if (status < 0) { |
1439 | mutex_unlock(&orphan_dir_inode->i_mutex); | ||
1440 | mlog_errno(status); | 1435 | mlog_errno(status); |
1441 | goto out; | 1436 | goto out; |
1442 | } | 1437 | } |
1443 | have_disk_lock = 1; | ||
1444 | 1438 | ||
1445 | offset = 0; | 1439 | offset = 0; |
1446 | iter = NULL; | 1440 | iter = NULL; |
@@ -1451,11 +1445,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, | |||
1451 | if (!bh) | 1445 | if (!bh) |
1452 | status = -EINVAL; | 1446 | status = -EINVAL; |
1453 | if (status < 0) { | 1447 | if (status < 0) { |
1454 | mutex_unlock(&orphan_dir_inode->i_mutex); | ||
1455 | if (bh) | 1448 | if (bh) |
1456 | brelse(bh); | 1449 | brelse(bh); |
1457 | mlog_errno(status); | 1450 | mlog_errno(status); |
1458 | goto out; | 1451 | goto out_unlock; |
1459 | } | 1452 | } |
1460 | 1453 | ||
1461 | local = 0; | 1454 | local = 0; |
@@ -1465,11 +1458,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, | |||
1465 | 1458 | ||
1466 | if (!ocfs2_check_dir_entry(orphan_dir_inode, | 1459 | if (!ocfs2_check_dir_entry(orphan_dir_inode, |
1467 | de, bh, local)) { | 1460 | de, bh, local)) { |
1468 | mutex_unlock(&orphan_dir_inode->i_mutex); | ||
1469 | status = -EINVAL; | 1461 | status = -EINVAL; |
1470 | mlog_errno(status); | 1462 | mlog_errno(status); |
1471 | brelse(bh); | 1463 | brelse(bh); |
1472 | goto out; | 1464 | goto out_unlock; |
1473 | } | 1465 | } |
1474 | 1466 | ||
1475 | local += le16_to_cpu(de->rec_len); | 1467 | local += le16_to_cpu(de->rec_len); |
@@ -1504,18 +1496,95 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, | |||
1504 | 1496 | ||
1505 | mlog(0, "queue orphan %"MLFu64"\n", | 1497 | mlog(0, "queue orphan %"MLFu64"\n", |
1506 | OCFS2_I(iter)->ip_blkno); | 1498 | OCFS2_I(iter)->ip_blkno); |
1507 | OCFS2_I(iter)->ip_next_orphan = inode; | 1499 | /* No locking is required for the next_orphan |
1508 | inode = iter; | 1500 | * queue as there is only ever a single |
1501 | * process doing orphan recovery. */ | ||
1502 | OCFS2_I(iter)->ip_next_orphan = *head; | ||
1503 | *head = iter; | ||
1509 | } | 1504 | } |
1510 | brelse(bh); | 1505 | brelse(bh); |
1511 | } | 1506 | } |
1512 | mutex_unlock(&orphan_dir_inode->i_mutex); | ||
1513 | 1507 | ||
1508 | out_unlock: | ||
1514 | ocfs2_meta_unlock(orphan_dir_inode, 0); | 1509 | ocfs2_meta_unlock(orphan_dir_inode, 0); |
1515 | have_disk_lock = 0; | 1510 | out: |
1516 | 1511 | mutex_unlock(&orphan_dir_inode->i_mutex); | |
1517 | iput(orphan_dir_inode); | 1512 | iput(orphan_dir_inode); |
1518 | orphan_dir_inode = NULL; | 1513 | return status; |
1514 | } | ||
1515 | |||
1516 | static int ocfs2_orphan_recovery_can_continue(struct ocfs2_super *osb, | ||
1517 | int slot) | ||
1518 | { | ||
1519 | int ret; | ||
1520 | |||
1521 | spin_lock(&osb->osb_lock); | ||
1522 | ret = !osb->osb_orphan_wipes[slot]; | ||
1523 | spin_unlock(&osb->osb_lock); | ||
1524 | return ret; | ||
1525 | } | ||
1526 | |||
1527 | static void ocfs2_mark_recovering_orphan_dir(struct ocfs2_super *osb, | ||
1528 | int slot) | ||
1529 | { | ||
1530 | spin_lock(&osb->osb_lock); | ||
1531 | /* Mark ourselves such that new processes in delete_inode() | ||
1532 | * know to quit early. */ | ||
1533 | ocfs2_node_map_set_bit(osb, &osb->osb_recovering_orphan_dirs, slot); | ||
1534 | while (osb->osb_orphan_wipes[slot]) { | ||
1535 | /* If any processes are already in the middle of an | ||
1536 | * orphan wipe on this dir, then we need to wait for | ||
1537 | * them. */ | ||
1538 | spin_unlock(&osb->osb_lock); | ||
1539 | wait_event_interruptible(osb->osb_wipe_event, | ||
1540 | ocfs2_orphan_recovery_can_continue(osb, slot)); | ||
1541 | spin_lock(&osb->osb_lock); | ||
1542 | } | ||
1543 | spin_unlock(&osb->osb_lock); | ||
1544 | } | ||
1545 | |||
1546 | static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb, | ||
1547 | int slot) | ||
1548 | { | ||
1549 | ocfs2_node_map_clear_bit(osb, &osb->osb_recovering_orphan_dirs, slot); | ||
1550 | } | ||
1551 | |||
1552 | /* | ||
1553 | * Orphan recovery. Each mounted node has it's own orphan dir which we | ||
1554 | * must run during recovery. Our strategy here is to build a list of | ||
1555 | * the inodes in the orphan dir and iget/iput them. The VFS does | ||
1556 | * (most) of the rest of the work. | ||
1557 | * | ||
1558 | * Orphan recovery can happen at any time, not just mount so we have a | ||
1559 | * couple of extra considerations. | ||
1560 | * | ||
1561 | * - We grab as many inodes as we can under the orphan dir lock - | ||
1562 | * doing iget() outside the orphan dir risks getting a reference on | ||
1563 | * an invalid inode. | ||
1564 | * - We must be sure not to deadlock with other processes on the | ||
1565 | * system wanting to run delete_inode(). This can happen when they go | ||
1566 | * to lock the orphan dir and the orphan recovery process attempts to | ||
1567 | * iget() inside the orphan dir lock. This can be avoided by | ||
1568 | * advertising our state to ocfs2_delete_inode(). | ||
1569 | */ | ||
1570 | static int ocfs2_recover_orphans(struct ocfs2_super *osb, | ||
1571 | int slot) | ||
1572 | { | ||
1573 | int ret = 0; | ||
1574 | struct inode *inode = NULL; | ||
1575 | struct inode *iter; | ||
1576 | struct ocfs2_inode_info *oi; | ||
1577 | |||
1578 | mlog(0, "Recover inodes from orphan dir in slot %d\n", slot); | ||
1579 | |||
1580 | ocfs2_mark_recovering_orphan_dir(osb, slot); | ||
1581 | ret = ocfs2_queue_orphans(osb, slot, &inode); | ||
1582 | ocfs2_clear_recovering_orphan_dir(osb, slot); | ||
1583 | |||
1584 | /* Error here should be noted, but we want to continue with as | ||
1585 | * many queued inodes as we've got. */ | ||
1586 | if (ret) | ||
1587 | mlog_errno(ret); | ||
1519 | 1588 | ||
1520 | while (inode) { | 1589 | while (inode) { |
1521 | oi = OCFS2_I(inode); | 1590 | oi = OCFS2_I(inode); |
@@ -1541,14 +1610,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, | |||
1541 | inode = iter; | 1610 | inode = iter; |
1542 | } | 1611 | } |
1543 | 1612 | ||
1544 | out: | 1613 | return ret; |
1545 | if (have_disk_lock) | ||
1546 | ocfs2_meta_unlock(orphan_dir_inode, 0); | ||
1547 | |||
1548 | if (orphan_dir_inode) | ||
1549 | iput(orphan_dir_inode); | ||
1550 | |||
1551 | return status; | ||
1552 | } | 1614 | } |
1553 | 1615 | ||
1554 | static int ocfs2_wait_on_mount(struct ocfs2_super *osb) | 1616 | static int ocfs2_wait_on_mount(struct ocfs2_super *osb) |
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 19360e3d842e..e89de9b6e491 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
@@ -287,6 +287,10 @@ struct ocfs2_super | |||
287 | struct inode *osb_tl_inode; | 287 | struct inode *osb_tl_inode; |
288 | struct buffer_head *osb_tl_bh; | 288 | struct buffer_head *osb_tl_bh; |
289 | struct work_struct osb_truncate_log_wq; | 289 | struct work_struct osb_truncate_log_wq; |
290 | |||
291 | struct ocfs2_node_map osb_recovering_orphan_dirs; | ||
292 | unsigned int *osb_orphan_wipes; | ||
293 | wait_queue_head_t osb_wipe_event; | ||
290 | }; | 294 | }; |
291 | 295 | ||
292 | #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) | 296 | #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) |
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 046824b6b625..8dd3aafec499 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
@@ -1325,6 +1325,16 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
1325 | } | 1325 | } |
1326 | mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots); | 1326 | mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots); |
1327 | 1327 | ||
1328 | init_waitqueue_head(&osb->osb_wipe_event); | ||
1329 | osb->osb_orphan_wipes = kcalloc(osb->max_slots, | ||
1330 | sizeof(*osb->osb_orphan_wipes), | ||
1331 | GFP_KERNEL); | ||
1332 | if (!osb->osb_orphan_wipes) { | ||
1333 | status = -ENOMEM; | ||
1334 | mlog_errno(status); | ||
1335 | goto bail; | ||
1336 | } | ||
1337 | |||
1328 | osb->s_feature_compat = | 1338 | osb->s_feature_compat = |
1329 | le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); | 1339 | le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); |
1330 | osb->s_feature_ro_compat = | 1340 | osb->s_feature_ro_compat = |
@@ -1638,6 +1648,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) | |||
1638 | if (osb->slot_info) | 1648 | if (osb->slot_info) |
1639 | ocfs2_free_slot_info(osb->slot_info); | 1649 | ocfs2_free_slot_info(osb->slot_info); |
1640 | 1650 | ||
1651 | kfree(osb->osb_orphan_wipes); | ||
1641 | /* FIXME | 1652 | /* FIXME |
1642 | * This belongs in journal shutdown, but because we have to | 1653 | * This belongs in journal shutdown, but because we have to |
1643 | * allocate osb->journal at the start of ocfs2_initalize_osb(), | 1654 | * allocate osb->journal at the start of ocfs2_initalize_osb(), |