1 files changed, 72 insertions, 10 deletions
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 74281f135b04..731d9e2e7b50 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -564,13 +564,16 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 }
 /**
- * drop_incomplete_group - drop nodes from an incomplete group.
+ * drop_last_node - drop the last node or group of nodes.
 * @sleb: scanned LEB information
 * @offs: offset of dropped nodes is returned here
+ * @grouped: non-zero if whole group of nodes have to be dropped
 *
- * This function returns %1 if nodes are dropped and %0 otherwise.
+ * This is a helper function for 'ubifs_recover_leb()' which drops the last
+ * node of the scanned LEB or the last group of nodes if @grouped is not zero.
+ * This function returns %1 if a node was dropped and %0 otherwise.
 */
-static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
+static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
 {
        int dropped = 0;
@@ -589,6 +592,8 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
                kfree(snod);
                sleb->nodes_cnt -= 1;
                dropped = 1;
+                if (!grouped)
+                        break;
        }
        return dropped;
 }
@@ -609,8 +614,7 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
 struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
                                         int offs, void *sbuf, int grouped)
 {
-        int ret = 0, err, len = c->leb_size - offs;
+        int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit;
-        int start = offs;
        struct ubifs_scan_leb *sleb;
        void *buf = sbuf + offs;
@@ -620,6 +624,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
        if (IS_ERR(sleb))
                return sleb;
+        ubifs_assert(len >= 8);
        while (len >= 8) {
                dbg_scan("look at LEB %d:%d (%d bytes left)",
                         lnum, offs, len);
@@ -684,11 +689,68 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
                }
        }
-        /* Drop nodes from incomplete group */
+        min_io_unit = round_down(offs, c->min_io_size);
-        if (grouped && drop_incomplete_group(sleb, &offs)) {
+        if (grouped)
-                buf = sbuf + offs;
+                /*
-                len = c->leb_size - offs;
+                 * If nodes are grouped, always drop the incomplete group at
-        }
+                 * the end.
+                 */
+                drop_last_node(sleb, &offs, 1);
+        /*
+         * While we are in the middle of the same min. I/O unit keep dropping
+         * nodes. So basically, what we want is to make sure that the last min.
+         * I/O unit where we saw the corruption is dropped completely with all
+         * the uncorrupted node which may possibly sit there.
+         *
+         * In other words, let's name the min. I/O unit where the corruption
+         * starts B, and the previous min. I/O unit A. The below code tries to
+         * deal with a situation when half of B contains valid nodes or the end
+         * of a valid node, and the second half of B contains corrupted data or
+         * garbage. This means that UBIFS had been writing to B just before the
+         * power cut happened. I do not know how realistic is this scenario
+         * that half of the min. I/O unit had been written successfully and the
+         * other half not, but this is possible in our 'failure mode emulation'
+         * infrastructure at least.
+         *
+         * So what is the problem, why we need to drop those nodes? Whey can't
+         * we just clean-up the second half of B by putting a padding node
+         * there? We can, and this works fine with one exception which was
+         * reproduced with power cut emulation testing and happens extremely
+         * rarely. The description follows, but it is worth noting that that is
+         * only about the GC head, so we could do this trick only if the bud
+         * belongs to the GC head, but it does not seem to be worth an
+         * additional "if" statement.
+         *
+         * So, imagine the file-system is full, we run GC which is moving valid
+         * nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head
+         * LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X
+         * and will try to continue. Imagine that LEB X is currently the
+         * dirtiest LEB, and the amount of used space in LEB Y is exactly the
+         * same as amount of free space in LEB X.
+         *
+         * And a power cut happens when nodes are moved from LEB X to LEB Y. We
+         * are here trying to recover LEB Y which is the GC head LEB. We find
+         * the min. I/O unit B as described above. Then we clean-up LEB Y by
+         * padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function
+         * fails, because it cannot find a dirty LEB which could be GC'd into
+         * LEB Y! Even LEB X does not match because the amount of valid nodes
+         * there does not fit the free space in LEB Y any more! And this is
+         * because of the padding node which we added to LEB Y. The
+         * user-visible effect of this which I once observed and analysed is
+         * that we cannot mount the file-system with -ENOSPC error.
+         *
+         * So obviously, to make sure that situation does not happen we should
+         * free min. I/O unit B in LEB Y completely and the last used min. I/O
+         * unit in LEB Y should be A. This is basically what the below code
+         * tries to do.
+         */
+        while (min_io_unit == round_down(offs, c->min_io_size) &&
+               min_io_unit != offs &&
+               drop_last_node(sleb, &offs, grouped));
+        buf = sbuf + offs;
+        len = c->leb_size - offs;
        clean_buf(c, &buf, lnum, &offs, &len);
        ubifs_end_scan(c, sleb, lnum, offs);

diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 74281f135b04..731d9e2e7b50 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c
@@ -564,13 +564,16 @@ static int fix_unclean_leb(struct ubifs_info c, struct ubifs_scan_leb sleb,
564	}	564	}
565		565
566	/**	566	/**
567	* drop_incomplete_group - drop nodes from an incomplete group.	567	* drop_last_node - drop the last node or group of nodes.
568	* @sleb: scanned LEB information	568	* @sleb: scanned LEB information
569	* @offs: offset of dropped nodes is returned here	569	* @offs: offset of dropped nodes is returned here
		570	* @grouped: non-zero if whole group of nodes have to be dropped
570	*	571	*
571	* This function returns %1 if nodes are dropped and %0 otherwise.	572	* This is a helper function for 'ubifs_recover_leb()' which drops the last
		573	* node of the scanned LEB or the last group of nodes if @grouped is not zero.
		574	* This function returns %1 if a node was dropped and %0 otherwise.
572	*/	575	*/
573	static int drop_incomplete_group(struct ubifs_scan_leb sleb, int offs)	576	static int drop_last_node(struct ubifs_scan_leb sleb, int offs, int grouped)
574	{	577	{
575	int dropped = 0;	578	int dropped = 0;
576		579
@@ -589,6 +592,8 @@ static int drop_incomplete_group(struct ubifs_scan_leb sleb, int offs)
589	kfree(snod);	592	kfree(snod);
590	sleb->nodes_cnt -= 1;	593	sleb->nodes_cnt -= 1;
591	dropped = 1;	594	dropped = 1;
		595	if (!grouped)
		596	break;
592	}	597	}
593	return dropped;	598	return dropped;
594	}	599	}
@@ -609,8 +614,7 @@ static int drop_incomplete_group(struct ubifs_scan_leb sleb, int offs)
609	struct ubifs_scan_leb ubifs_recover_leb(struct ubifs_info c, int lnum,	614	struct ubifs_scan_leb ubifs_recover_leb(struct ubifs_info c, int lnum,
610	int offs, void *sbuf, int grouped)	615	int offs, void *sbuf, int grouped)
611	{	616	{
612	int ret = 0, err, len = c->leb_size - offs;	617	int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit;
613	int start = offs;
614	struct ubifs_scan_leb *sleb;	618	struct ubifs_scan_leb *sleb;
615	void *buf = sbuf + offs;	619	void *buf = sbuf + offs;
616		620
@@ -620,6 +624,7 @@ struct ubifs_scan_leb ubifs_recover_leb(struct ubifs_info c, int lnum,
620	if (IS_ERR(sleb))	624	if (IS_ERR(sleb))
621	return sleb;	625	return sleb;
622		626
		627	ubifs_assert(len >= 8);
623	while (len >= 8) {	628	while (len >= 8) {
624	dbg_scan("look at LEB %d:%d (%d bytes left)",	629	dbg_scan("look at LEB %d:%d (%d bytes left)",
625	lnum, offs, len);	630	lnum, offs, len);
@@ -684,11 +689,68 @@ struct ubifs_scan_leb ubifs_recover_leb(struct ubifs_info c, int lnum,
684	}	689	}
685	}	690	}
686		691
687	/* Drop nodes from incomplete group */	692	min_io_unit = round_down(offs, c->min_io_size);
688	if (grouped && drop_incomplete_group(sleb, &offs)) {	693	if (grouped)
689	buf = sbuf + offs;	694	/*
690	len = c->leb_size - offs;	695	* If nodes are grouped, always drop the incomplete group at
691	}	696	* the end.
		697	*/
		698	drop_last_node(sleb, &offs, 1);
		699
		700	/*
		701	* While we are in the middle of the same min. I/O unit keep dropping
		702	* nodes. So basically, what we want is to make sure that the last min.
		703	* I/O unit where we saw the corruption is dropped completely with all
		704	* the uncorrupted node which may possibly sit there.
		705	*
		706	* In other words, let's name the min. I/O unit where the corruption
		707	* starts B, and the previous min. I/O unit A. The below code tries to
		708	* deal with a situation when half of B contains valid nodes or the end
		709	* of a valid node, and the second half of B contains corrupted data or
		710	* garbage. This means that UBIFS had been writing to B just before the
		711	* power cut happened. I do not know how realistic is this scenario
		712	* that half of the min. I/O unit had been written successfully and the
		713	* other half not, but this is possible in our 'failure mode emulation'
		714	* infrastructure at least.
		715	*
		716	* So what is the problem, why we need to drop those nodes? Whey can't
		717	* we just clean-up the second half of B by putting a padding node
		718	* there? We can, and this works fine with one exception which was
		719	* reproduced with power cut emulation testing and happens extremely
		720	* rarely. The description follows, but it is worth noting that that is
		721	* only about the GC head, so we could do this trick only if the bud
		722	* belongs to the GC head, but it does not seem to be worth an
		723	* additional "if" statement.
		724	*
		725	* So, imagine the file-system is full, we run GC which is moving valid
		726	* nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head
		727	* LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X
		728	* and will try to continue. Imagine that LEB X is currently the
		729	* dirtiest LEB, and the amount of used space in LEB Y is exactly the
		730	* same as amount of free space in LEB X.
		731	*
		732	* And a power cut happens when nodes are moved from LEB X to LEB Y. We
		733	* are here trying to recover LEB Y which is the GC head LEB. We find
		734	* the min. I/O unit B as described above. Then we clean-up LEB Y by
		735	* padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function
		736	* fails, because it cannot find a dirty LEB which could be GC'd into
		737	* LEB Y! Even LEB X does not match because the amount of valid nodes
		738	* there does not fit the free space in LEB Y any more! And this is
		739	* because of the padding node which we added to LEB Y. The
		740	* user-visible effect of this which I once observed and analysed is
		741	* that we cannot mount the file-system with -ENOSPC error.
		742	*
		743	* So obviously, to make sure that situation does not happen we should
		744	* free min. I/O unit B in LEB Y completely and the last used min. I/O
		745	* unit in LEB Y should be A. This is basically what the below code
		746	* tries to do.
		747	*/
		748	while (min_io_unit == round_down(offs, c->min_io_size) &&
		749	min_io_unit != offs &&
		750	drop_last_node(sleb, &offs, grouped));
		751
		752	buf = sbuf + offs;
		753	len = c->leb_size - offs;
692		754
693	clean_buf(c, &buf, lnum, &offs, &len);	755	clean_buf(c, &buf, lnum, &offs, &len);
694	ubifs_end_scan(c, sleb, lnum, offs);	756	ubifs_end_scan(c, sleb, lnum, offs);