author     Artem Bityutskiy <Artem.Bityutskiy@nokia.com>   2011-05-26 01:58:19 -0400
committer  Artem Bityutskiy <Artem.Bityutskiy@nokia.com>   2011-06-01 05:29:06 -0400
commit     da8b94ea61c5d80aae0cc7b7541f1e0fa7459391
tree       5d0b7c64b7f5afd9c9b2c528ed08e7d8a0d97859
parent     efcfde54ca68091b164f9aec544c7233a9760aff
UBIFS: fix recovery broken by the previous recovery fix
Unfortunately, the recovery fix d1606a59b6be4ea392eabd40d1250aa1eeb19efb
(UBIFS: fix extremely rare mount failure) broke recovery. That commit made
UBIFS drop the last min. I/O unit in all journal heads, but this is needed
only for the GC head, and it does not work for non-GC heads. For example,
suppose we have min. I/O units A and B, where A contains a valid node X which
was fsynced, followed by a group of nodes Y which spans the rest of A and B.
In this case we would drop not only Y, but also X, which is obviously
incorrect.

This patch fixes the issue and additionally makes recovery drop the last min.
I/O unit only for the GC head, leaving things as they have been for ages for
the other heads - this is safer.
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
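
To make the example in the message concrete, here is a small standalone
sketch. It is not UBIFS code: the 2048-byte min. I/O unit and all node
offsets are invented for illustration.

```c
#include <stdio.h>

int main(void)
{
        const int min_io = 2048; /* assumed min. I/O unit size         */
        const int x_offs = 0;    /* node X, valid and fsynced, unit A  */
        const int y_offs = 512;  /* group Y spans the rest of A plus B */

        /* Dropping the incomplete group Y is correct; it rewinds the
         * recovery offset back into unit A: */
        int offs = y_offs;

        /* But then freeing the whole min. I/O unit that offs now sits
         * in, as the broken fix did for every journal head, would
         * discard the fsynced node X as well: */
        if (offs / min_io == x_offs / min_io)
                printf("node X at offset %d would be dropped too\n", x_offs);
        return 0;
}
```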
Diffstat (limited to 'fs')
-rw-r--r--  fs/ubifs/recovery.c  152
1 file changed, 87 insertions(+), 65 deletions(-)
```diff
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 6adb5328a016..783d8e0beb76 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -564,19 +564,15 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 }
 
 /**
- * drop_last_node - drop the last node or group of nodes.
+ * drop_last_group - drop the last group of nodes.
  * @sleb: scanned LEB information
  * @offs: offset of dropped nodes is returned here
- * @grouped: non-zero if whole group of nodes have to be dropped
  *
  * This is a helper function for 'ubifs_recover_leb()' which drops the last
- * node of the scanned LEB or the last group of nodes if @grouped is not zero.
- * This function returns %1 if a node was dropped and %0 otherwise.
+ * group of nodes of the scanned LEB.
  */
-static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
+static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs)
 {
-        int dropped = 0;
-
         while (!list_empty(&sleb->nodes)) {
                 struct ubifs_scan_node *snod;
                 struct ubifs_ch *ch;
@@ -585,17 +581,40 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
                                   list);
                 ch = snod->node;
                 if (ch->group_type != UBIFS_IN_NODE_GROUP)
-                        return dropped;
-                dbg_rcvry("dropping node at %d:%d", sleb->lnum, snod->offs);
+                        break;
+
+                dbg_rcvry("dropping grouped node at %d:%d",
+                          sleb->lnum, snod->offs);
+                *offs = snod->offs;
+                list_del(&snod->list);
+                kfree(snod);
+                sleb->nodes_cnt -= 1;
+        }
+}
+
+/**
+ * drop_last_node - drop the last node.
+ * @sleb: scanned LEB information
+ * @offs: offset of dropped nodes is returned here
+ * @grouped: non-zero if whole group of nodes have to be dropped
+ *
+ * This is a helper function for 'ubifs_recover_leb()' which drops the last
+ * node of the scanned LEB.
+ */
+static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs)
+{
+        struct ubifs_scan_node *snod;
+
+        if (!list_empty(&sleb->nodes)) {
+                snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
+                                  list);
+
+                dbg_rcvry("dropping last node at %d:%d", sleb->lnum, snod->offs);
                 *offs = snod->offs;
                 list_del(&snod->list);
                 kfree(snod);
                 sleb->nodes_cnt -= 1;
-                dropped = 1;
-                if (!grouped)
-                        break;
         }
-        return dropped;
 }
 
 /**
@@ -697,59 +716,62 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
                  * If nodes are grouped, always drop the incomplete group at
                  * the end.
                  */
-                drop_last_node(sleb, &offs, 1);
+                drop_last_group(sleb, &offs);
 
-        /*
-         * While we are in the middle of the same min. I/O unit keep dropping
-         * nodes. So basically, what we want is to make sure that the last min.
-         * I/O unit where we saw the corruption is dropped completely with all
-         * the uncorrupted nodes which may possibly sit there.
-         *
-         * In other words, let's name the min. I/O unit where the corruption
-         * starts B, and the previous min. I/O unit A. The below code tries to
-         * deal with a situation when half of B contains valid nodes or the end
-         * of a valid node, and the second half of B contains corrupted data or
-         * garbage. This means that UBIFS had been writing to B just before the
-         * power cut happened. I do not know how realistic is this scenario
-         * that half of the min. I/O unit had been written successfully and the
-         * other half not, but this is possible in our 'failure mode emulation'
-         * infrastructure at least.
-         *
-         * So what is the problem, why we need to drop those nodes? Whey can't
-         * we just clean-up the second half of B by putting a padding node
-         * there? We can, and this works fine with one exception which was
-         * reproduced with power cut emulation testing and happens extremely
-         * rarely. The description follows, but it is worth noting that that is
-         * only about the GC head, so we could do this trick only if the bud
-         * belongs to the GC head, but it does not seem to be worth an
-         * additional "if" statement.
-         *
-         * So, imagine the file-system is full, we run GC which is moving valid
-         * nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head
-         * LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X
-         * and will try to continue. Imagine that LEB X is currently the
-         * dirtiest LEB, and the amount of used space in LEB Y is exactly the
-         * same as amount of free space in LEB X.
-         *
-         * And a power cut happens when nodes are moved from LEB X to LEB Y. We
-         * are here trying to recover LEB Y which is the GC head LEB. We find
-         * the min. I/O unit B as described above. Then we clean-up LEB Y by
-         * padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function
-         * fails, because it cannot find a dirty LEB which could be GC'd into
-         * LEB Y! Even LEB X does not match because the amount of valid nodes
-         * there does not fit the free space in LEB Y any more! And this is
-         * because of the padding node which we added to LEB Y. The
-         * user-visible effect of this which I once observed and analysed is
-         * that we cannot mount the file-system with -ENOSPC error.
-         *
-         * So obviously, to make sure that situation does not happen we should
-         * free min. I/O unit B in LEB Y completely and the last used min. I/O
-         * unit in LEB Y should be A. This is basically what the below code
-         * tries to do.
-         */
-        while (min_io_unit == round_down(offs, c->min_io_size) &&
-               min_io_unit != offs &&
-               drop_last_node(sleb, &offs, grouped));
+        if (jhead == GCHD) {
+                /*
+                 * If this LEB belongs to the GC head then while we are in the
+                 * middle of the same min. I/O unit keep dropping nodes. So
+                 * basically, what we want is to make sure that the last min.
+                 * I/O unit where we saw the corruption is dropped completely
+                 * with all the uncorrupted nodes which may possibly sit there.
+                 *
+                 * In other words, let's name the min. I/O unit where the
+                 * corruption starts B, and the previous min. I/O unit A. The
+                 * below code tries to deal with a situation when half of B
+                 * contains valid nodes or the end of a valid node, and the
+                 * second half of B contains corrupted data or garbage. This
+                 * means that UBIFS had been writing to B just before the power
+                 * cut happened. I do not know how realistic is this scenario
+                 * that half of the min. I/O unit had been written successfully
+                 * and the other half not, but this is possible in our 'failure
+                 * mode emulation' infrastructure at least.
+                 *
+                 * So what is the problem, why we need to drop those nodes? Why
+                 * can't we just clean-up the second half of B by putting a
+                 * padding node there? We can, and this works fine with one
+                 * exception which was reproduced with power cut emulation
+                 * testing and happens extremely rarely.
+                 *
+                 * Imagine the file-system is full, we run GC which starts
+                 * moving valid nodes from LEB X to LEB Y (obviously, LEB Y is
+                 * the current GC head LEB). The @c->gc_lnum is -1, which means
+                 * that GC will retain LEB X and will try to continue. Imagine
+                 * that LEB X is currently the dirtiest LEB, and the amount of
+                 * used space in LEB Y is exactly the same as amount of free
+                 * space in LEB X.
+                 *
+                 * And a power cut happens when nodes are moved from LEB X to
+                 * LEB Y. We are here trying to recover LEB Y which is the GC
+                 * head LEB. We find the min. I/O unit B as described above.
+                 * Then we clean-up LEB Y by padding min. I/O unit. And later
+                 * 'ubifs_rcvry_gc_commit()' function fails, because it cannot
+                 * find a dirty LEB which could be GC'd into LEB Y! Even LEB X
+                 * does not match because the amount of valid nodes there does
+                 * not fit the free space in LEB Y any more! And this is
+                 * because of the padding node which we added to LEB Y. The
+                 * user-visible effect of this which I once observed and
+                 * analysed is that we cannot mount the file-system with
+                 * -ENOSPC error.
+                 *
+                 * So obviously, to make sure that situation does not happen we
+                 * should free min. I/O unit B in LEB Y completely and the last
+                 * used min. I/O unit in LEB Y should be A. This is basically
+                 * what the below code tries to do.
+                 */
+                while (offs > min_io_unit)
+                        drop_last_node(sleb, &offs);
+        }
 
         buf = sbuf + offs;
         len = c->leb_size - offs;
```
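
The new GC-head-only loop frees the corrupted min. I/O unit B completely, so
that the last used unit is A. The offset arithmetic can be sketched
standalone; the following is illustrative only, not UBIFS code, with an
invented 2048-byte min. I/O unit, made-up node offsets, and drop_last_node()
simulated by rewinding through an array:

```c
#include <stdio.h>

/* Same idea as the kernel's round_down() for a power-of-two size. */
#define round_down(x, y) ((x) & ~((y) - 1))

int main(void)
{
        const int min_io_size = 2048;                   /* assumed size */
        int node_offs[] = { 0, 512, 1024, 2048, 2304 }; /* invented     */
        int n = sizeof(node_offs) / sizeof(node_offs[0]);
        int offs = 2560;                 /* scan stopped here, in unit B */
        int min_io_unit = round_down(offs, min_io_size);     /* = 2048   */

        /* Mirrors "while (offs > min_io_unit) drop_last_node(sleb, &offs);"
         * from the hunk above: keep dropping the last node until unit B
         * is completely free. */
        while (offs > min_io_unit) {
                offs = node_offs[--n];  /* drop_last_node() rewinds offs */
                printf("dropped node at offset %d\n", offs);
        }
        printf("LEB now ends at %d; the last used min. I/O unit is A\n", offs);
        return 0;
}
```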