path: root/fs
author    Artem Bityutskiy <Artem.Bityutskiy@nokia.com>  2011-05-26 01:58:19 -0400
committer Artem Bityutskiy <Artem.Bityutskiy@nokia.com>  2011-06-01 05:29:06 -0400
commit    da8b94ea61c5d80aae0cc7b7541f1e0fa7459391 (patch)
tree      5d0b7c64b7f5afd9c9b2c528ed08e7d8a0d97859 /fs
parent    efcfde54ca68091b164f9aec544c7233a9760aff (diff)
UBIFS: fix recovery broken by the previous recovery fix
Unfortunately, the recovery fix d1606a59b6be4ea392eabd40d1250aa1eeb19efb (UBIFS: fix extremely rare mount failure) broke recovery. That commit made UBIFS drop the last min. I/O unit in all journal heads, but this is needed only for the GC head, and it does not work for the non-GC heads. For example, suppose we have min. I/O units A and B, where A contains a valid node X, which was fsynced, followed by a group of nodes Y which spans the rest of A and B. In this case we would drop not only Y, but also X, which is obviously incorrect.

This patch fixes the issue and additionally makes recovery drop the last min. I/O unit only for the GC head, leaving things as they have been for ages for the other heads, which is safer.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
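To make the scenario concrete, here is a small standalone C model of the layout described above. It is an illustration only, not UBIFS code; the 2048-byte min. I/O unit size and all node names, offsets, and lengths are invented for the example.

/* layout-model.c: cc -o layout-model layout-model.c && ./layout-model */
#include <stdio.h>

#define MIN_IO_SIZE 2048	/* assumed min. I/O unit size */

struct node {
	const char *name;
	int offs, len;		/* position inside the LEB */
};

int main(void)
{
	/* Unit A is [0, 2048), unit B is [2048, 4096). */
	struct node nodes[] = {
		{ "X (valid, fsynced)",       0,  512 },  /* start of A */
		{ "Y1 (incomplete group)",  512, 1536 },  /* rest of A  */
		{ "Y2 (incomplete group)", 2048, 1024 },  /* start of B */
	};
	int i;

	for (i = 0; i < 3; i++)
		printf("%-24s unit %c, offs %4d..%4d\n", nodes[i].name,
		       'A' + nodes[i].offs / MIN_IO_SIZE, nodes[i].offs,
		       nodes[i].offs + nodes[i].len);

	/*
	 * Correct recovery drops only the incomplete group Y1+Y2, so the
	 * recovered offset becomes 512 and the fsynced node X survives.
	 * Per the commit message above, the broken fix could drop X as
	 * well, losing data that fsync() had already committed.
	 */
	return 0;
}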
Diffstat (limited to 'fs')
-rw-r--r--  fs/ubifs/recovery.c | 152
1 file changed, 87 insertions(+), 65 deletions(-)
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 6adb5328a01..783d8e0beb7 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -564,19 +564,15 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 }
 
 /**
- * drop_last_node - drop the last node or group of nodes.
+ * drop_last_group - drop the last group of nodes.
  * @sleb: scanned LEB information
  * @offs: offset of dropped nodes is returned here
- * @grouped: non-zero if whole group of nodes have to be dropped
  *
  * This is a helper function for 'ubifs_recover_leb()' which drops the last
- * node of the scanned LEB or the last group of nodes if @grouped is not zero.
- * This function returns %1 if a node was dropped and %0 otherwise.
+ * group of nodes of the scanned LEB.
  */
-static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
+static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs)
 {
-	int dropped = 0;
-
 	while (!list_empty(&sleb->nodes)) {
 		struct ubifs_scan_node *snod;
 		struct ubifs_ch *ch;
@@ -585,17 +581,40 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
 			  list);
 		ch = snod->node;
 		if (ch->group_type != UBIFS_IN_NODE_GROUP)
-			return dropped;
-		dbg_rcvry("dropping node at %d:%d", sleb->lnum, snod->offs);
+			break;
+
+		dbg_rcvry("dropping grouped node at %d:%d",
+			  sleb->lnum, snod->offs);
+		*offs = snod->offs;
+		list_del(&snod->list);
+		kfree(snod);
+		sleb->nodes_cnt -= 1;
+	}
+}
+
+/**
+ * drop_last_node - drop the last node.
+ * @sleb: scanned LEB information
+ * @offs: offset of dropped nodes is returned here
+ * @grouped: non-zero if whole group of nodes have to be dropped
+ *
+ * This is a helper function for 'ubifs_recover_leb()' which drops the last
+ * node of the scanned LEB.
+ */
+static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs)
+{
+	struct ubifs_scan_node *snod;
+
+	if (!list_empty(&sleb->nodes)) {
+		snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
+				  list);
+
+		dbg_rcvry("dropping last node at %d:%d", sleb->lnum, snod->offs);
 		*offs = snod->offs;
 		list_del(&snod->list);
 		kfree(snod);
 		sleb->nodes_cnt -= 1;
-		dropped = 1;
-		if (!grouped)
-			break;
 	}
-	return dropped;
 }
 
 /**
@@ -697,59 +716,62 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
 		 * If nodes are grouped, always drop the incomplete group at
 		 * the end.
 		 */
-		drop_last_node(sleb, &offs, 1);
+		drop_last_group(sleb, &offs);
 
-	/*
-	 * While we are in the middle of the same min. I/O unit keep dropping
-	 * nodes. So basically, what we want is to make sure that the last min.
-	 * I/O unit where we saw the corruption is dropped completely with all
-	 * the uncorrupted nodes which may possibly sit there.
-	 *
-	 * In other words, let's name the min. I/O unit where the corruption
-	 * starts B, and the previous min. I/O unit A. The below code tries to
-	 * deal with a situation when half of B contains valid nodes or the end
-	 * of a valid node, and the second half of B contains corrupted data or
-	 * garbage. This means that UBIFS had been writing to B just before the
-	 * power cut happened. I do not know how realistic is this scenario
-	 * that half of the min. I/O unit had been written successfully and the
-	 * other half not, but this is possible in our 'failure mode emulation'
-	 * infrastructure at least.
-	 *
-	 * So what is the problem, why we need to drop those nodes? Whey can't
-	 * we just clean-up the second half of B by putting a padding node
-	 * there? We can, and this works fine with one exception which was
-	 * reproduced with power cut emulation testing and happens extremely
-	 * rarely. The description follows, but it is worth noting that that is
-	 * only about the GC head, so we could do this trick only if the bud
-	 * belongs to the GC head, but it does not seem to be worth an
-	 * additional "if" statement.
-	 *
-	 * So, imagine the file-system is full, we run GC which is moving valid
-	 * nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head
-	 * LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X
-	 * and will try to continue. Imagine that LEB X is currently the
-	 * dirtiest LEB, and the amount of used space in LEB Y is exactly the
-	 * same as amount of free space in LEB X.
-	 *
-	 * And a power cut happens when nodes are moved from LEB X to LEB Y. We
-	 * are here trying to recover LEB Y which is the GC head LEB. We find
-	 * the min. I/O unit B as described above. Then we clean-up LEB Y by
-	 * padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function
-	 * fails, because it cannot find a dirty LEB which could be GC'd into
-	 * LEB Y! Even LEB X does not match because the amount of valid nodes
-	 * there does not fit the free space in LEB Y any more! And this is
-	 * because of the padding node which we added to LEB Y. The
-	 * user-visible effect of this which I once observed and analysed is
-	 * that we cannot mount the file-system with -ENOSPC error.
-	 *
-	 * So obviously, to make sure that situation does not happen we should
-	 * free min. I/O unit B in LEB Y completely and the last used min. I/O
-	 * unit in LEB Y should be A. This is basically what the below code
-	 * tries to do.
-	 */
-	while (min_io_unit == round_down(offs, c->min_io_size) &&
-	       min_io_unit != offs &&
-	       drop_last_node(sleb, &offs, grouped));
+	if (jhead == GCHD) {
+		/*
+		 * If this LEB belongs to the GC head then while we are in the
+		 * middle of the same min. I/O unit keep dropping nodes. So
+		 * basically, what we want is to make sure that the last min.
+		 * I/O unit where we saw the corruption is dropped completely
+		 * with all the uncorrupted nodes which may possibly sit there.
+		 *
+		 * In other words, let's name the min. I/O unit where the
+		 * corruption starts B, and the previous min. I/O unit A. The
+		 * below code tries to deal with a situation when half of B
+		 * contains valid nodes or the end of a valid node, and the
+		 * second half of B contains corrupted data or garbage. This
+		 * means that UBIFS had been writing to B just before the power
+		 * cut happened. I do not know how realistic is this scenario
+		 * that half of the min. I/O unit had been written successfully
+		 * and the other half not, but this is possible in our 'failure
+		 * mode emulation' infrastructure at least.
+		 *
+		 * So what is the problem, why we need to drop those nodes? Why
+		 * can't we just clean-up the second half of B by putting a
+		 * padding node there? We can, and this works fine with one
+		 * exception which was reproduced with power cut emulation
+		 * testing and happens extremely rarely.
+		 *
+		 * Imagine the file-system is full, we run GC which starts
+		 * moving valid nodes from LEB X to LEB Y (obviously, LEB Y is
+		 * the current GC head LEB). The @c->gc_lnum is -1, which means
+		 * that GC will retain LEB X and will try to continue. Imagine
+		 * that LEB X is currently the dirtiest LEB, and the amount of
+		 * used space in LEB Y is exactly the same as amount of free
+		 * space in LEB X.
+		 *
+		 * And a power cut happens when nodes are moved from LEB X to
+		 * LEB Y. We are here trying to recover LEB Y which is the GC
+		 * head LEB. We find the min. I/O unit B as described above.
+		 * Then we clean-up LEB Y by padding min. I/O unit. And later
+		 * 'ubifs_rcvry_gc_commit()' function fails, because it cannot
+		 * find a dirty LEB which could be GC'd into LEB Y! Even LEB X
+		 * does not match because the amount of valid nodes there does
+		 * not fit the free space in LEB Y any more! And this is
+		 * because of the padding node which we added to LEB Y. The
+		 * user-visible effect of this which I once observed and
+		 * analysed is that we cannot mount the file-system with
+		 * -ENOSPC error.
+		 *
+		 * So obviously, to make sure that situation does not happen we
+		 * should free min. I/O unit B in LEB Y completely and the last
+		 * used min. I/O unit in LEB Y should be A. This is basically
+		 * what the below code tries to do.
+		 */
+		while (offs > min_io_unit)
+			drop_last_node(sleb, &offs);
+	}
 
 	buf = sbuf + offs;
 	len = c->leb_size - offs;
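As a rough illustration of the new GC-head logic, the following standalone sketch replays the "while (offs > min_io_unit) drop_last_node(sleb, &offs);" loop with plain integers. It is not UBIFS code; the min. I/O unit size and the node offsets are invented for the example.

/* gchd-loop-model.c: cc -o gchd-loop-model gchd-loop-model.c && ./gchd-loop-model */
#include <stdio.h>

#define MIN_IO_SIZE 2048	/* assumed min. I/O unit size */

int main(void)
{
	/* Start offsets of the trailing nodes in the GC head, last first. */
	int node_offs[] = { 3000, 2048 };
	int offs = 3500;	/* end of the last good node, inside unit B */
	int min_io_unit = offs - offs % MIN_IO_SIZE;	/* 2048, start of B */
	int i = 0;

	/* Same condition as in the patch: peel trailing nodes off the LEB. */
	while (offs > min_io_unit) {
		/* Each drop_last_node() call moves offs back to the start
		 * of the node it removed. */
		offs = node_offs[i++];
		printf("dropped node, offs is now %d\n", offs);
	}

	/*
	 * offs == 2048: min. I/O unit B is completely free and the last
	 * used unit is A, which is what the long comment in the patch
	 * requires for 'ubifs_rcvry_gc_commit()' to succeed.
	 */
	printf("recovered offs = %d\n", offs);
	return 0;
}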