diff options
Diffstat (limited to 'fs/ocfs2/journal.c')
-rw-r--r-- | fs/ocfs2/journal.c | 173 |
1 file changed, 133 insertions, 40 deletions
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index a8c19cb3cfdd..7a37240f7a31 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -57,7 +57,7 @@ static int __ocfs2_recovery_thread(void *arg); | |||
57 | static int ocfs2_commit_cache(struct ocfs2_super *osb); | 57 | static int ocfs2_commit_cache(struct ocfs2_super *osb); |
58 | static int ocfs2_wait_on_mount(struct ocfs2_super *osb); | 58 | static int ocfs2_wait_on_mount(struct ocfs2_super *osb); |
59 | static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, | 59 | static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, |
60 | int dirty); | 60 | int dirty, int replayed); |
61 | static int ocfs2_trylock_journal(struct ocfs2_super *osb, | 61 | static int ocfs2_trylock_journal(struct ocfs2_super *osb, |
62 | int slot_num); | 62 | int slot_num); |
63 | static int ocfs2_recover_orphans(struct ocfs2_super *osb, | 63 | static int ocfs2_recover_orphans(struct ocfs2_super *osb, |
@@ -562,8 +562,18 @@ done: | |||
562 | return status; | 562 | return status; |
563 | } | 563 | } |
564 | 564 | ||
565 | static void ocfs2_bump_recovery_generation(struct ocfs2_dinode *di) | ||
566 | { | ||
567 | le32_add_cpu(&(di->id1.journal1.ij_recovery_generation), 1); | ||
568 | } | ||
569 | |||
570 | static u32 ocfs2_get_recovery_generation(struct ocfs2_dinode *di) | ||
571 | { | ||
572 | return le32_to_cpu(di->id1.journal1.ij_recovery_generation); | ||
573 | } | ||
574 | |||
565 | static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, | 575 | static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, |
566 | int dirty) | 576 | int dirty, int replayed) |
567 | { | 577 | { |
568 | int status; | 578 | int status; |
569 | unsigned int flags; | 579 | unsigned int flags; |
@@ -593,6 +603,9 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, | |||
593 | flags &= ~OCFS2_JOURNAL_DIRTY_FL; | 603 | flags &= ~OCFS2_JOURNAL_DIRTY_FL; |
594 | fe->id1.journal1.ij_flags = cpu_to_le32(flags); | 604 | fe->id1.journal1.ij_flags = cpu_to_le32(flags); |
595 | 605 | ||
606 | if (replayed) | ||
607 | ocfs2_bump_recovery_generation(fe); | ||
608 | |||
596 | status = ocfs2_write_block(osb, bh, journal->j_inode); | 609 | status = ocfs2_write_block(osb, bh, journal->j_inode); |
597 | if (status < 0) | 610 | if (status < 0) |
598 | mlog_errno(status); | 611 | mlog_errno(status); |
@@ -667,7 +680,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) | |||
667 | * Do not toggle if flush was unsuccessful otherwise | 680 | * Do not toggle if flush was unsuccessful otherwise |
668 | * will leave dirty metadata in a "clean" journal | 681 | * will leave dirty metadata in a "clean" journal |
669 | */ | 682 | */ |
670 | status = ocfs2_journal_toggle_dirty(osb, 0); | 683 | status = ocfs2_journal_toggle_dirty(osb, 0, 0); |
671 | if (status < 0) | 684 | if (status < 0) |
672 | mlog_errno(status); | 685 | mlog_errno(status); |
673 | } | 686 | } |
@@ -710,7 +723,7 @@ static void ocfs2_clear_journal_error(struct super_block *sb, | |||
710 | } | 723 | } |
711 | } | 724 | } |
712 | 725 | ||
713 | int ocfs2_journal_load(struct ocfs2_journal *journal, int local) | 726 | int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) |
714 | { | 727 | { |
715 | int status = 0; | 728 | int status = 0; |
716 | struct ocfs2_super *osb; | 729 | struct ocfs2_super *osb; |
@@ -729,7 +742,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local) | |||
729 | 742 | ||
730 | ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num); | 743 | ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num); |
731 | 744 | ||
732 | status = ocfs2_journal_toggle_dirty(osb, 1); | 745 | status = ocfs2_journal_toggle_dirty(osb, 1, replayed); |
733 | if (status < 0) { | 746 | if (status < 0) { |
734 | mlog_errno(status); | 747 | mlog_errno(status); |
735 | goto done; | 748 | goto done; |
@@ -771,7 +784,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full) | |||
771 | goto bail; | 784 | goto bail; |
772 | } | 785 | } |
773 | 786 | ||
774 | status = ocfs2_journal_toggle_dirty(journal->j_osb, 0); | 787 | status = ocfs2_journal_toggle_dirty(journal->j_osb, 0, 0); |
775 | if (status < 0) | 788 | if (status < 0) |
776 | mlog_errno(status); | 789 | mlog_errno(status); |
777 | 790 | ||
@@ -1034,6 +1047,12 @@ restart: | |||
1034 | spin_unlock(&osb->osb_lock); | 1047 | spin_unlock(&osb->osb_lock); |
1035 | mlog(0, "All nodes recovered\n"); | 1048 | mlog(0, "All nodes recovered\n"); |
1036 | 1049 | ||
1050 | /* Refresh all journal recovery generations from disk */ | ||
1051 | status = ocfs2_check_journals_nolocks(osb); | ||
1052 | status = (status == -EROFS) ? 0 : status; | ||
1053 | if (status < 0) | ||
1054 | mlog_errno(status); | ||
1055 | |||
1037 | ocfs2_super_unlock(osb, 1); | 1056 | ocfs2_super_unlock(osb, 1); |
1038 | 1057 | ||
1039 | /* We always run recovery on our own orphan dir - the dead | 1058 | /* We always run recovery on our own orphan dir - the dead |
@@ -1096,6 +1115,42 @@ out: | |||
1096 | mlog_exit_void(); | 1115 | mlog_exit_void(); |
1097 | } | 1116 | } |
1098 | 1117 | ||
1118 | static int ocfs2_read_journal_inode(struct ocfs2_super *osb, | ||
1119 | int slot_num, | ||
1120 | struct buffer_head **bh, | ||
1121 | struct inode **ret_inode) | ||
1122 | { | ||
1123 | int status = -EACCES; | ||
1124 | struct inode *inode = NULL; | ||
1125 | |||
1126 | BUG_ON(slot_num >= osb->max_slots); | ||
1127 | |||
1128 | inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, | ||
1129 | slot_num); | ||
1130 | if (!inode || is_bad_inode(inode)) { | ||
1131 | mlog_errno(status); | ||
1132 | goto bail; | ||
1133 | } | ||
1134 | SET_INODE_JOURNAL(inode); | ||
1135 | |||
1136 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, bh, 0, inode); | ||
1137 | if (status < 0) { | ||
1138 | mlog_errno(status); | ||
1139 | goto bail; | ||
1140 | } | ||
1141 | |||
1142 | status = 0; | ||
1143 | |||
1144 | bail: | ||
1145 | if (inode) { | ||
1146 | if (status || !ret_inode) | ||
1147 | iput(inode); | ||
1148 | else | ||
1149 | *ret_inode = inode; | ||
1150 | } | ||
1151 | return status; | ||
1152 | } | ||
1153 | |||
1099 | /* Does the actual journal replay and marks the journal inode as | 1154 | /* Does the actual journal replay and marks the journal inode as |
1100 | * clean. Will only replay if the journal inode is marked dirty. */ | 1155 | * clean. Will only replay if the journal inode is marked dirty. */ |
1101 | static int ocfs2_replay_journal(struct ocfs2_super *osb, | 1156 | static int ocfs2_replay_journal(struct ocfs2_super *osb, |
@@ -1109,22 +1164,36 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, | |||
1109 | struct ocfs2_dinode *fe; | 1164 | struct ocfs2_dinode *fe; |
1110 | journal_t *journal = NULL; | 1165 | journal_t *journal = NULL; |
1111 | struct buffer_head *bh = NULL; | 1166 | struct buffer_head *bh = NULL; |
1167 | u32 slot_reco_gen; | ||
1112 | 1168 | ||
1113 | inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, | 1169 | status = ocfs2_read_journal_inode(osb, slot_num, &bh, &inode); |
1114 | slot_num); | 1170 | if (status) { |
1115 | if (inode == NULL) { | ||
1116 | status = -EACCES; | ||
1117 | mlog_errno(status); | 1171 | mlog_errno(status); |
1118 | goto done; | 1172 | goto done; |
1119 | } | 1173 | } |
1120 | if (is_bad_inode(inode)) { | 1174 | |
1121 | status = -EACCES; | 1175 | fe = (struct ocfs2_dinode *)bh->b_data; |
1122 | iput(inode); | 1176 | slot_reco_gen = ocfs2_get_recovery_generation(fe); |
1123 | inode = NULL; | 1177 | brelse(bh); |
1124 | mlog_errno(status); | 1178 | bh = NULL; |
1179 | |||
1180 | /* | ||
1181 | * As the fs recovery is asynchronous, there is a small chance that | ||
1182 | * another node mounted (and recovered) the slot before the recovery | ||
1183 | * thread could get the lock. To handle that, we dirty read the journal | ||
1184 | * inode for that slot to get the recovery generation. If it is | ||
1185 | * different than what we expected, the slot has been recovered. | ||
1186 | * If not, it needs recovery. | ||
1187 | */ | ||
1188 | if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) { | ||
1189 | mlog(0, "Slot %u already recovered (old/new=%u/%u)\n", slot_num, | ||
1190 | osb->slot_recovery_generations[slot_num], slot_reco_gen); | ||
1191 | osb->slot_recovery_generations[slot_num] = slot_reco_gen; | ||
1192 | status = -EBUSY; | ||
1125 | goto done; | 1193 | goto done; |
1126 | } | 1194 | } |
1127 | SET_INODE_JOURNAL(inode); | 1195 | |
1196 | /* Continue with recovery as the journal has not yet been recovered */ | ||
1128 | 1197 | ||
1129 | status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); | 1198 | status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); |
1130 | if (status < 0) { | 1199 | if (status < 0) { |
@@ -1138,9 +1207,12 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, | |||
1138 | fe = (struct ocfs2_dinode *) bh->b_data; | 1207 | fe = (struct ocfs2_dinode *) bh->b_data; |
1139 | 1208 | ||
1140 | flags = le32_to_cpu(fe->id1.journal1.ij_flags); | 1209 | flags = le32_to_cpu(fe->id1.journal1.ij_flags); |
1210 | slot_reco_gen = ocfs2_get_recovery_generation(fe); | ||
1141 | 1211 | ||
1142 | if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) { | 1212 | if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) { |
1143 | mlog(0, "No recovery required for node %d\n", node_num); | 1213 | mlog(0, "No recovery required for node %d\n", node_num); |
1214 | /* Refresh recovery generation for the slot */ | ||
1215 | osb->slot_recovery_generations[slot_num] = slot_reco_gen; | ||
1144 | goto done; | 1216 | goto done; |
1145 | } | 1217 | } |
1146 | 1218 | ||
@@ -1188,6 +1260,11 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, | |||
1188 | flags &= ~OCFS2_JOURNAL_DIRTY_FL; | 1260 | flags &= ~OCFS2_JOURNAL_DIRTY_FL; |
1189 | fe->id1.journal1.ij_flags = cpu_to_le32(flags); | 1261 | fe->id1.journal1.ij_flags = cpu_to_le32(flags); |
1190 | 1262 | ||
1263 | /* Increment recovery generation to indicate successful recovery */ | ||
1264 | ocfs2_bump_recovery_generation(fe); | ||
1265 | osb->slot_recovery_generations[slot_num] = | ||
1266 | ocfs2_get_recovery_generation(fe); | ||
1267 | |||
1191 | status = ocfs2_write_block(osb, bh, inode); | 1268 | status = ocfs2_write_block(osb, bh, inode); |
1192 | if (status < 0) | 1269 | if (status < 0) |
1193 | mlog_errno(status); | 1270 | mlog_errno(status); |
@@ -1252,6 +1329,13 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, | |||
1252 | 1329 | ||
1253 | status = ocfs2_replay_journal(osb, node_num, slot_num); | 1330 | status = ocfs2_replay_journal(osb, node_num, slot_num); |
1254 | if (status < 0) { | 1331 | if (status < 0) { |
1332 | if (status == -EBUSY) { | ||
1333 | mlog(0, "Skipping recovery for slot %u (node %u) " | ||
1334 | "as another node has recovered it\n", slot_num, | ||
1335 | node_num); | ||
1336 | status = 0; | ||
1337 | goto done; | ||
1338 | } | ||
1255 | mlog_errno(status); | 1339 | mlog_errno(status); |
1256 | goto done; | 1340 | goto done; |
1257 | } | 1341 | } |
@@ -1334,12 +1418,29 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) | |||
1334 | { | 1418 | { |
1335 | unsigned int node_num; | 1419 | unsigned int node_num; |
1336 | int status, i; | 1420 | int status, i; |
1421 | struct buffer_head *bh = NULL; | ||
1422 | struct ocfs2_dinode *di; | ||
1337 | 1423 | ||
1338 | /* This is called with the super block cluster lock, so we | 1424 | /* This is called with the super block cluster lock, so we |
1339 | * know that the slot map can't change underneath us. */ | 1425 | * know that the slot map can't change underneath us. */ |
1340 | 1426 | ||
1341 | spin_lock(&osb->osb_lock); | 1427 | spin_lock(&osb->osb_lock); |
1342 | for (i = 0; i < osb->max_slots; i++) { | 1428 | for (i = 0; i < osb->max_slots; i++) { |
1429 | /* Read journal inode to get the recovery generation */ | ||
1430 | status = ocfs2_read_journal_inode(osb, i, &bh, NULL); | ||
1431 | if (status) { | ||
1432 | mlog_errno(status); | ||
1433 | goto bail; | ||
1434 | } | ||
1435 | di = (struct ocfs2_dinode *)bh->b_data; | ||
1436 | osb->slot_recovery_generations[i] = | ||
1437 | ocfs2_get_recovery_generation(di); | ||
1438 | brelse(bh); | ||
1439 | bh = NULL; | ||
1440 | |||
1441 | mlog(0, "Slot %u recovery generation is %u\n", i, | ||
1442 | osb->slot_recovery_generations[i]); | ||
1443 | |||
1343 | if (i == osb->slot_num) | 1444 | if (i == osb->slot_num) |
1344 | continue; | 1445 | continue; |
1345 | 1446 | ||
@@ -1603,49 +1704,41 @@ static int ocfs2_commit_thread(void *arg) | |||
1603 | return 0; | 1704 | return 0; |
1604 | } | 1705 | } |
1605 | 1706 | ||
1606 | /* Look for a dirty journal without taking any cluster locks. Used for | 1707 | /* Reads all the journal inodes without taking any cluster locks. Used |
1607 | * hard readonly access to determine whether the file system journals | 1708 | * for hard readonly access to determine whether any journal requires |
1608 | * require recovery. */ | 1709 | * recovery. Also used to refresh the recovery generation numbers after |
1710 | * a journal has been recovered by another node. | ||
1711 | */ | ||
1609 | int ocfs2_check_journals_nolocks(struct ocfs2_super *osb) | 1712 | int ocfs2_check_journals_nolocks(struct ocfs2_super *osb) |
1610 | { | 1713 | { |
1611 | int ret = 0; | 1714 | int ret = 0; |
1612 | unsigned int slot; | 1715 | unsigned int slot; |
1613 | struct buffer_head *di_bh; | 1716 | struct buffer_head *di_bh = NULL; |
1614 | struct ocfs2_dinode *di; | 1717 | struct ocfs2_dinode *di; |
1615 | struct inode *journal = NULL; | 1718 | int journal_dirty = 0; |
1616 | 1719 | ||
1617 | for(slot = 0; slot < osb->max_slots; slot++) { | 1720 | for(slot = 0; slot < osb->max_slots; slot++) { |
1618 | journal = ocfs2_get_system_file_inode(osb, | 1721 | ret = ocfs2_read_journal_inode(osb, slot, &di_bh, NULL); |
1619 | JOURNAL_SYSTEM_INODE, | 1722 | if (ret) { |
1620 | slot); | ||
1621 | if (!journal || is_bad_inode(journal)) { | ||
1622 | ret = -EACCES; | ||
1623 | mlog_errno(ret); | ||
1624 | goto out; | ||
1625 | } | ||
1626 | |||
1627 | di_bh = NULL; | ||
1628 | ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh, | ||
1629 | 0, journal); | ||
1630 | if (ret < 0) { | ||
1631 | mlog_errno(ret); | 1723 | mlog_errno(ret); |
1632 | goto out; | 1724 | goto out; |
1633 | } | 1725 | } |
1634 | 1726 | ||
1635 | di = (struct ocfs2_dinode *) di_bh->b_data; | 1727 | di = (struct ocfs2_dinode *) di_bh->b_data; |
1636 | 1728 | ||
1729 | osb->slot_recovery_generations[slot] = | ||
1730 | ocfs2_get_recovery_generation(di); | ||
1731 | |||
1637 | if (le32_to_cpu(di->id1.journal1.ij_flags) & | 1732 | if (le32_to_cpu(di->id1.journal1.ij_flags) & |
1638 | OCFS2_JOURNAL_DIRTY_FL) | 1733 | OCFS2_JOURNAL_DIRTY_FL) |
1639 | ret = -EROFS; | 1734 | journal_dirty = 1; |
1640 | 1735 | ||
1641 | brelse(di_bh); | 1736 | brelse(di_bh); |
1642 | if (ret) | 1737 | di_bh = NULL; |
1643 | break; | ||
1644 | } | 1738 | } |
1645 | 1739 | ||
1646 | out: | 1740 | out: |
1647 | if (journal) | 1741 | if (journal_dirty) |
1648 | iput(journal); | 1742 | ret = -EROFS; |
1649 | |||
1650 | return ret; | 1743 | return ret; |
1651 | } | 1744 | } |