author     Wang YanQing <udknight@gmail.com>            2015-10-29 12:36:33 -0400
committer  Brian Norris <computersforpeace@gmail.com>   2015-11-17 20:05:14 -0500
commit     fc5adbebac6ffab461492f7a415648b29b1a3b31 (patch)
tree       c71b2c7b93d4bea72af2d136dd4a903414b100ec
parent     1b15b1f5a01019524815a9ce5c575f3b2068e7f8 (diff)
Documentation: mtd: improve nand_ecc.txt for readability and correctness
This patch corrects some representation errors, adds a little
clarification in some places, and fixes indentation problems
in the pseudo code.
It also deletes one extra white space in one place.
Signed-off-by: Wang YanQing <udknight@gmail.com>
[Brian: a few tweaks]
Signed-off-by: Brian Norris <computersforpeace@gmail.com>
-rw-r--r-- | Documentation/mtd/nand_ecc.txt | 58
1 file changed, 29 insertions(+), 29 deletions(-)
diff --git a/Documentation/mtd/nand_ecc.txt b/Documentation/mtd/nand_ecc.txt
index e129b2479ea8..f8c3284bf6a7 100644
--- a/Documentation/mtd/nand_ecc.txt
+++ b/Documentation/mtd/nand_ecc.txt
@@ -107,7 +107,7 @@ for (i = 0; i < 256; i++)
107 | if (i & 0x01) | 107 | if (i & 0x01) |
108 | rp1 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp1; | 108 | rp1 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp1; |
109 | else | 109 | else |
110 | rp0 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp1; | 110 | rp0 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp0; |
111 | if (i & 0x02) | 111 | if (i & 0x02) |
112 | rp3 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp3; | 112 | rp3 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp3; |
113 | else | 113 | else |
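The fix in this hunk is the even-address branch of the bit-serial loop: the parity of the byte must fold back into rp0, not rp1. A minimal sketch of that step, with illustrative names (the data array and the inline parity fold are not from the document):

/* sketch only: fold the parity of byte i into rp0/rp1 by address bit 0 */
static void fold_rp0_rp1(const unsigned char *data,
                         unsigned char *rp0, unsigned char *rp1)
{
        int i;

        for (i = 0; i < 256; i++) {
                unsigned char p = data[i];

                p ^= p >> 4;            /* xor of bit7..bit0 ends up ... */
                p ^= p >> 2;
                p ^= p >> 1;
                p &= 1;                 /* ... in bit 0                  */

                if (i & 0x01)
                        *rp1 ^= p;      /* odd byte addresses            */
                else
                        *rp0 ^= p;      /* even byte addresses           */
        }
}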
@@ -127,7 +127,7 @@ for (i = 0; i < 256; i++)
127 | if (i & 0x20) | 127 | if (i & 0x20) |
128 | rp11 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp11; | 128 | rp11 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp11; |
129 | else | 129 | else |
130 | rp10 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp10; | 130 | rp10 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp10; |
131 | if (i & 0x40) | 131 | if (i & 0x40) |
132 | rp13 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp13; | 132 | rp13 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp13; |
133 | else | 133 | else |
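This hunk and the previous one follow one pattern: bit k of the byte address selects whether the byte's parity is accumulated into rp(2k) or rp(2k+1). A compact restatement of that rule, as a sketch rather than the document's code:

/* sketch: rp0..rp15, selected by the 8 address bits of a 256-byte block */
static void fold_row_parities(const unsigned char *data, unsigned char rp[16])
{
        int i, k;

        for (i = 0; i < 256; i++) {
                unsigned char p = data[i];

                p ^= p >> 4;
                p ^= p >> 2;
                p ^= p >> 1;
                p &= 1;                         /* parity of byte i */

                for (k = 0; k < 8; k++)
                        rp[2 * k + ((i >> k) & 1)] ^= p;
        }
}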
@@ -158,7 +158,7 @@ the values in any order. So instead of calculating all the bits
158 | individually, let us try to rearrange things. | 158 | individually, let us try to rearrange things. |
159 | For the column parity this is easy. We can just xor the bytes and in the | 159 | For the column parity this is easy. We can just xor the bytes and in the |
160 | end filter out the relevant bits. This is pretty nice as it will bring | 160 | end filter out the relevant bits. This is pretty nice as it will bring |
161 | all cp calculation out of the if loop. | 161 | all cp calculation out of the for loop. |
162 | | 162 | |
163 | Similarly we can first xor the bytes for the various rows. | 163 | Similarly we can first xor the bytes for the various rows. |
164 | This leads to: | 164 | This leads to: |
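The hunk stops before the rearranged code, but the idea whose wording it fixes (xor the bytes first, filter the cp bits afterwards) can be sketched as follows. The masks are an assumption based on the usual column-parity definitions (cp0 over even bit positions, cp1 over odd ones, and so on), and the names are illustrative:

/* sketch, assuming cp0 covers even bit positions, cp1 odd ones, etc. */
static unsigned char parity(unsigned char x)
{
        x ^= x >> 4;
        x ^= x >> 2;
        x ^= x >> 1;
        return x & 1;
}

static void column_parity(const unsigned char *data, unsigned char cp[6])
{
        unsigned char par = 0;
        int i;

        for (i = 0; i < 256; i++)
                par ^= data[i];             /* no cp work left in the loop */

        cp[0] = parity(par & 0x55);         /* bits 0,2,4,6 */
        cp[1] = parity(par & 0xaa);         /* bits 1,3,5,7 */
        cp[2] = parity(par & 0x33);         /* bits 0,1,4,5 */
        cp[3] = parity(par & 0xcc);         /* bits 2,3,6,7 */
        cp[4] = parity(par & 0x0f);         /* bits 0..3    */
        cp[5] = parity(par & 0xf0);         /* bits 4..7    */
}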
@@ -271,11 +271,11 @@ to write our code in such a way that we process data in 32 bit chunks.
271 | Of course this means some modification as the row parity is byte by | 271 | Of course this means some modification as the row parity is byte by |
272 | byte. A quick analysis: | 272 | byte. A quick analysis: |
273 | for the column parity we use the par variable. When extending to 32 bits | 273 | for the column parity we use the par variable. When extending to 32 bits |
274 | we can in the end easily calculate p0 and p1 from it. | 274 | we can in the end easily calculate rp0 and rp1 from it. |
275 | (because par now consists of 4 bytes, contributing to rp1, rp0, rp1, rp0 | 275 | (because par now consists of 4 bytes, contributing to rp1, rp0, rp1, rp0 |
276 | respectively) | 276 | respectively, from MSB to LSB) |
277 | also rp2 and rp3 can be easily retrieved from par as rp3 covers the | 277 | also rp2 and rp3 can be easily retrieved from par as rp3 covers the |
278 | first two bytes and rp2 the last two bytes. | 278 | first two MSBs and rp2 covers the last two LSBs. |
279 | | 279 | |
280 | Note that of course now the loop is executed only 64 times (256/4). | 280 | Note that of course now the loop is executed only 64 times (256/4). |
281 | And note that care must taken wrt byte ordering. The way bytes are | 281 | And note that care must taken wrt byte ordering. The way bytes are |
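A sketch of the byte-ordering point made here, assuming a little-endian load so that the first data byte of each 32-bit word ends up in the least significant byte of par; this is an illustrative fragment, not the document's final code:

/*
 * sketch: fold a 32-bit column-parity accumulator back into byte-wide
 * values, assuming little-endian loads (first data byte of each word in
 * the least significant byte of par)
 */
static void fold_par32(unsigned int par, unsigned char *rp0,
                       unsigned char *rp1, unsigned char *rp2,
                       unsigned char *rp3)
{
        unsigned int v;

        v = par >> 16;                      /* two most significant bytes  */
        *rp3 = (v ^ (v >> 8)) & 0xff;       /* byte offsets 2 and 3        */
        v = par & 0xffff;                   /* two least significant bytes */
        *rp2 = (v ^ (v >> 8)) & 0xff;       /* byte offsets 0 and 1        */

        v = par ^ (par >> 16);              /* fold upper half onto lower  */
        *rp1 = (v >> 8) & 0xff;             /* odd byte offsets (1 and 3)  */
        *rp0 = v & 0xff;                    /* even byte offsets (0 and 2) */
}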
@@ -387,11 +387,11 @@ Analysis 2
387 | | 387 | |
388 | The code (of course) works, and hurray: we are a little bit faster than | 388 | The code (of course) works, and hurray: we are a little bit faster than |
389 | the linux driver code (about 15%). But wait, don't cheer too quickly. | 389 | the linux driver code (about 15%). But wait, don't cheer too quickly. |
390 | THere is more to be gained. | 390 | There is more to be gained. |
391 | If we look at e.g. rp14 and rp15 we see that we either xor our data with | 391 | If we look at e.g. rp14 and rp15 we see that we either xor our data with |
392 | rp14 or with rp15. However we also have par which goes over all data. | 392 | rp14 or with rp15. However we also have par which goes over all data. |
393 | This means there is no need to calculate rp14 as it can be calculated from | 393 | This means there is no need to calculate rp14 as it can be calculated from |
394 | rp15 through rp14 = par ^ rp15; | 394 | rp15 through rp14 = par ^ rp15, because par = rp14 ^ rp15; |
395 | (or if desired we can avoid calculating rp15 and calculate it from | 395 | (or if desired we can avoid calculating rp15 and calculate it from |
396 | rp14). That is why some places refer to inverse parity. | 396 | rp14). That is why some places refer to inverse parity. |
397 | Of course the same thing holds for rp4/5, rp6/7, rp8/9, rp10/11 and rp12/13. | 397 | Of course the same thing holds for rp4/5, rp6/7, rp8/9, rp10/11 and rp12/13. |
@@ -419,12 +419,12 @@ with
419 | if (i & 0x20) rp15 ^= cur; | 419 | if (i & 0x20) rp15 ^= cur; |
420 | | 420 | |
421 | and outside the loop added: | 421 | and outside the loop added: |
422 | rp4 = par ^ rp5; | 422 | rp4 = par ^ rp5; |
423 | rp6 = par ^ rp7; | 423 | rp6 = par ^ rp7; |
424 | rp8 = par ^ rp9; | 424 | rp8 = par ^ rp9; |
425 | rp10 = par ^ rp11; | 425 | rp10 = par ^ rp11; |
426 | rp12 = par ^ rp13; | 426 | rp12 = par ^ rp13; |
427 | rp14 = par ^ rp15; | 427 | rp14 = par ^ rp15; |
428 | | 428 | |
429 | And after that the code takes about 30% more time, although the number of | 429 | And after that the code takes about 30% more time, although the number of |
430 | statements is reduced. This is also reflected in the assembly code. | 430 | statements is reduced. This is also reflected in the assembly code. |
@@ -524,12 +524,12 @@ THe code within the for loop was changed to:
524 | | 524 | |
525 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; | 525 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; |
526 | cur = *bp++; tmppar ^= cur; rp6 ^= cur; | 526 | cur = *bp++; tmppar ^= cur; rp6 ^= cur; |
527 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; | 527 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; |
528 | cur = *bp++; tmppar ^= cur; rp10 ^= tmppar; | 528 | cur = *bp++; tmppar ^= cur; rp10 ^= tmppar; |
529 | | 529 | |
530 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; rp8 ^= cur; | 530 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; rp8 ^= cur; |
531 | cur = *bp++; tmppar ^= cur; rp6 ^= cur; rp8 ^= cur; | 531 | cur = *bp++; tmppar ^= cur; rp6 ^= cur; rp8 ^= cur; |
532 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp8 ^= cur; | 532 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp8 ^= cur; |
533 | cur = *bp++; tmppar ^= cur; rp8 ^= cur; | 533 | cur = *bp++; tmppar ^= cur; rp8 ^= cur; |
534 | | 534 | |
535 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; | 535 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; |
@@ -537,7 +537,7 @@ THe code within the for loop was changed to:
537 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; | 537 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; |
538 | cur = *bp++; tmppar ^= cur; | 538 | cur = *bp++; tmppar ^= cur; |
539 | | 539 | |
540 | par ^= tmppar; | 540 | par ^= tmppar; |
541 | if ((i & 0x1) == 0) rp12 ^= tmppar; | 541 | if ((i & 0x1) == 0) rp12 ^= tmppar; |
542 | if ((i & 0x2) == 0) rp14 ^= tmppar; | 542 | if ((i & 0x2) == 0) rp14 ^= tmppar; |
543 | } | 543 | } |
@@ -548,8 +548,8 @@ to rp12 and rp14.
548 | | 548 | |
549 | While making the changes I also found that I could exploit that tmppar | 549 | While making the changes I also found that I could exploit that tmppar |
550 | contains the running parity for this iteration. So instead of having: | 550 | contains the running parity for this iteration. So instead of having: |
551 | rp4 ^= cur; rp6 = cur; | 551 | rp4 ^= cur; rp6 ^= cur; |
552 | I removed the rp6 = cur; statement and did rp6 ^= tmppar; on next | 552 | I removed the rp6 ^= cur; statement and did rp6 ^= tmppar; on next |
553 | statement. A similar change was done for rp8 and rp10 | 553 | statement. A similar change was done for rp8 and rp10 |
554 | | 554 | |
555 | | 555 | |
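The change described in this hunk trades per-word xors for a single xor with the running parity. A self-contained fragment, hypothetical rather than the document's loop, showing why the two are equivalent when tmppar covers exactly the words the row parity needs:

/* sketch: rp10 needs the xor of exactly these four words, so one xor with
 * the running parity replaces four per-word xors */
static unsigned long fold_four(const unsigned long *bp, unsigned long *rp10)
{
        unsigned long cur, tmppar;

        cur = *bp++; tmppar  = cur;         /* instead of rp10 ^= cur ...  */
        cur = *bp++; tmppar ^= cur;         /* ... on each of these lines  */
        cur = *bp++; tmppar ^= cur;
        cur = *bp++; tmppar ^= cur;
        *rp10 ^= tmppar;                    /* one xor picks up all four   */

        return tmppar;
}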
@@ -593,22 +593,22 @@ The new code now looks like:
593 | | 593 | |
594 | cur = *bp++; tmppar ^= cur; rp4_6 ^= cur; | 594 | cur = *bp++; tmppar ^= cur; rp4_6 ^= cur; |
595 | cur = *bp++; tmppar ^= cur; rp6 ^= cur; | 595 | cur = *bp++; tmppar ^= cur; rp6 ^= cur; |
596 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; | 596 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; |
597 | cur = *bp++; tmppar ^= cur; rp10 ^= tmppar; | 597 | cur = *bp++; tmppar ^= cur; rp10 ^= tmppar; |
598 | | 598 | |
599 | notrp8 = tmppar; | 599 | notrp8 = tmppar; |
600 | cur = *bp++; tmppar ^= cur; rp4_6 ^= cur; | 600 | cur = *bp++; tmppar ^= cur; rp4_6 ^= cur; |
601 | cur = *bp++; tmppar ^= cur; rp6 ^= cur; | 601 | cur = *bp++; tmppar ^= cur; rp6 ^= cur; |
602 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; | 602 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; |
603 | cur = *bp++; tmppar ^= cur; | 603 | cur = *bp++; tmppar ^= cur; |
604 | rp8 = rp8 ^ tmppar ^ notrp8; | 604 | rp8 = rp8 ^ tmppar ^ notrp8; |
605 | | 605 | |
606 | cur = *bp++; tmppar ^= cur; rp4_6 ^= cur; | 606 | cur = *bp++; tmppar ^= cur; rp4_6 ^= cur; |
607 | cur = *bp++; tmppar ^= cur; rp6 ^= cur; | 607 | cur = *bp++; tmppar ^= cur; rp6 ^= cur; |
608 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; | 608 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; |
609 | cur = *bp++; tmppar ^= cur; | 609 | cur = *bp++; tmppar ^= cur; |
610 | | 610 | |
611 | par ^= tmppar; | 611 | par ^= tmppar; |
612 | if ((i & 0x1) == 0) rp12 ^= tmppar; | 612 | if ((i & 0x1) == 0) rp12 ^= tmppar; |
613 | if ((i & 0x2) == 0) rp14 ^= tmppar; | 613 | if ((i & 0x2) == 0) rp14 ^= tmppar; |
614 | } | 614 | } |
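The notrp8 lines in this hunk take a snapshot of the running parity: xoring the snapshot against the later value of tmppar gives the parity of just the words read in between, which is what rp8 = rp8 ^ tmppar ^ notrp8 adds. A sketch of that idea with hypothetical names:

/* sketch: the xor of a snapshot against the later running parity is the
 * parity of just the words read in between (the role notrp8 plays above) */
static unsigned long run_parity(const unsigned long *bp, int nwords,
                                unsigned long *tmppar)
{
        unsigned long snapshot = *tmppar;
        int i;

        for (i = 0; i < nwords; i++)
                *tmppar ^= *bp++;

        return *tmppar ^ snapshot;          /* parity of these nwords words */
}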
@@ -700,7 +700,7 @@ Conclusion
700 | The gain when calculating the ecc is tremendous. Om my development hardware | 700 | The gain when calculating the ecc is tremendous. Om my development hardware |
701 | a speedup of a factor of 18 for ecc calculation was achieved. On a test on an | 701 | a speedup of a factor of 18 for ecc calculation was achieved. On a test on an |
702 | embedded system with a MIPS core a factor 7 was obtained. | 702 | embedded system with a MIPS core a factor 7 was obtained. |
703 | On a test with a Linksys NSLU2 (ARMv5TE processor) the speedup was a factor | 703 | On a test with a Linksys NSLU2 (ARMv5TE processor) the speedup was a factor |
704 | 5 (big endian mode, gcc 4.1.2, -O3) | 704 | 5 (big endian mode, gcc 4.1.2, -O3) |
705 | For correction not much gain could be obtained (as bitflips are rare). Then | 705 | For correction not much gain could be obtained (as bitflips are rare). Then |
706 | again there are also much less cycles spent there. | 706 | again there are also much less cycles spent there. |