diff options
-rw-r--r-- | Documentation/mtd/nand_ecc.txt | 714 | ||||
-rw-r--r-- | drivers/mtd/nand/nand_ecc.c | 496 |
2 files changed, 1086 insertions, 124 deletions
diff --git a/Documentation/mtd/nand_ecc.txt b/Documentation/mtd/nand_ecc.txt new file mode 100644 index 000000000000..bdf93b7f0f24 --- /dev/null +++ b/Documentation/mtd/nand_ecc.txt | |||
@@ -0,0 +1,714 @@ | |||
1 | Introduction | ||
2 | ============ | ||
3 | |||
4 | Having looked at the linux mtd/nand driver and more specific at nand_ecc.c | ||
5 | I felt there was room for optimisation. I bashed the code for a few hours | ||
6 | performing tricks like table lookup removing superfluous code etc. | ||
7 | After that the speed was increased by 35-40%. | ||
8 | Still I was not too happy as I felt there was additional room for improvement. | ||
9 | |||
10 | Bad! I was hooked. | ||
11 | I decided to annotate my steps in this file. Perhaps it is useful to someone | ||
12 | or someone learns something from it. | ||
13 | |||
14 | |||
15 | The problem | ||
16 | =========== | ||
17 | |||
18 | NAND flash (at least SLC one) typically has sectors of 256 bytes. | ||
19 | However NAND flash is not extremely reliable so some error detection | ||
20 | (and sometimes correction) is needed. | ||
21 | |||
22 | This is done by means of a Hamming code. I'll try to explain it in | ||
23 | laymans terms (and apologies to all the pro's in the field in case I do | ||
24 | not use the right terminology, my coding theory class was almost 30 | ||
25 | years ago, and I must admit it was not one of my favourites). | ||
26 | |||
27 | As I said before the ecc calculation is performed on sectors of 256 | ||
28 | bytes. This is done by calculating several parity bits over the rows and | ||
29 | columns. The parity used is even parity which means that the parity bit = 1 | ||
30 | if the data over which the parity is calculated is 1 and the parity bit = 0 | ||
31 | if the data over which the parity is calculated is 0. So the total | ||
32 | number of bits over the data over which the parity is calculated + the | ||
33 | parity bit is even. (see wikipedia if you can't follow this). | ||
34 | Parity is often calculated by means of an exclusive or operation, | ||
35 | sometimes also referred to as xor. In C the operator for xor is ^ | ||
36 | |||
37 | Back to ecc. | ||
38 | Let's give a small figure: | ||
39 | |||
40 | byte 0: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp0 rp2 rp4 ... rp14 | ||
41 | byte 1: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp1 rp2 rp4 ... rp14 | ||
42 | byte 2: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp0 rp3 rp4 ... rp14 | ||
43 | byte 3: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp1 rp3 rp4 ... rp14 | ||
44 | byte 4: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp0 rp2 rp5 ... rp14 | ||
45 | .... | ||
46 | byte 254: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp0 rp3 rp5 ... rp15 | ||
47 | byte 255: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp1 rp3 rp5 ... rp15 | ||
48 | cp1 cp0 cp1 cp0 cp1 cp0 cp1 cp0 | ||
49 | cp3 cp3 cp2 cp2 cp3 cp3 cp2 cp2 | ||
50 | cp5 cp5 cp5 cp5 cp4 cp4 cp4 cp4 | ||
51 | |||
52 | This figure represents a sector of 256 bytes. | ||
53 | cp is my abbreviaton for column parity, rp for row parity. | ||
54 | |||
55 | Let's start to explain column parity. | ||
56 | cp0 is the parity that belongs to all bit0, bit2, bit4, bit6. | ||
57 | so the sum of all bit0, bit2, bit4 and bit6 values + cp0 itself is even. | ||
58 | Similarly cp1 is the sum of all bit1, bit3, bit5 and bit7. | ||
59 | cp2 is the parity over bit0, bit1, bit4 and bit5 | ||
60 | cp3 is the parity over bit2, bit3, bit6 and bit7. | ||
61 | cp4 is the parity over bit0, bit1, bit2 and bit3. | ||
62 | cp5 is the parity over bit4, bit5, bit6 and bit7. | ||
63 | Note that each of cp0 .. cp5 is exactly one bit. | ||
64 | |||
65 | Row parity actually works almost the same. | ||
66 | rp0 is the parity of all even bytes (0, 2, 4, 6, ... 252, 254) | ||
67 | rp1 is the parity of all odd bytes (1, 3, 5, 7, ..., 253, 255) | ||
68 | rp2 is the parity of all bytes 0, 1, 4, 5, 8, 9, ... | ||
69 | (so handle two bytes, then skip 2 bytes). | ||
70 | rp3 is covers the half rp2 does not cover (bytes 2, 3, 6, 7, 10, 11, ...) | ||
71 | for rp4 the rule is cover 4 bytes, skip 4 bytes, cover 4 bytes, skip 4 etc. | ||
72 | so rp4 calculates parity over bytes 0, 1, 2, 3, 8, 9, 10, 11, 16, ...) | ||
73 | and rp5 covers the other half, so bytes 4, 5, 6, 7, 12, 13, 14, 15, 20, .. | ||
74 | The story now becomes quite boring. I guess you get the idea. | ||
75 | rp6 covers 8 bytes then skips 8 etc | ||
76 | rp7 skips 8 bytes then covers 8 etc | ||
77 | rp8 covers 16 bytes then skips 16 etc | ||
78 | rp9 skips 16 bytes then covers 16 etc | ||
79 | rp10 covers 32 bytes then skips 32 etc | ||
80 | rp11 skips 32 bytes then covers 32 etc | ||
81 | rp12 covers 64 bytes then skips 64 etc | ||
82 | rp13 skips 64 bytes then covers 64 etc | ||
83 | rp14 covers 128 bytes then skips 128 | ||
84 | rp15 skips 128 bytes then covers 128 | ||
85 | |||
86 | In the end the parity bits are grouped together in three bytes as | ||
87 | follows: | ||
88 | ECC Bit 7 Bit 6 Bit 5 Bit 4 Bit 3 Bit 2 Bit 1 Bit 0 | ||
89 | ECC 0 rp07 rp06 rp05 rp04 rp03 rp02 rp01 rp00 | ||
90 | ECC 1 rp15 rp14 rp13 rp12 rp11 rp10 rp09 rp08 | ||
91 | ECC 2 cp5 cp4 cp3 cp2 cp1 cp0 1 1 | ||
92 | |||
93 | I detected after writing this that ST application note AN1823 | ||
94 | (http://www.st.com/stonline/books/pdf/docs/10123.pdf) gives a much | ||
95 | nicer picture.(but they use line parity as term where I use row parity) | ||
96 | Oh well, I'm graphically challenged, so suffer with me for a moment :-) | ||
97 | And I could not reuse the ST picture anyway for copyright reasons. | ||
98 | |||
99 | |||
100 | Attempt 0 | ||
101 | ========= | ||
102 | |||
103 | Implementing the parity calculation is pretty simple. | ||
104 | In C pseudocode: | ||
105 | for (i = 0; i < 256; i++) | ||
106 | { | ||
107 | if (i & 0x01) | ||
108 | rp1 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp1; | ||
109 | else | ||
110 | rp0 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp1; | ||
111 | if (i & 0x02) | ||
112 | rp3 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp3; | ||
113 | else | ||
114 | rp2 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp2; | ||
115 | if (i & 0x04) | ||
116 | rp5 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp5; | ||
117 | else | ||
118 | rp4 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp4; | ||
119 | if (i & 0x08) | ||
120 | rp7 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp7; | ||
121 | else | ||
122 | rp6 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp6; | ||
123 | if (i & 0x10) | ||
124 | rp9 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp9; | ||
125 | else | ||
126 | rp8 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp8; | ||
127 | if (i & 0x20) | ||
128 | rp11 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp11; | ||
129 | else | ||
130 | rp10 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp10; | ||
131 | if (i & 0x40) | ||
132 | rp13 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp13; | ||
133 | else | ||
134 | rp12 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp12; | ||
135 | if (i & 0x80) | ||
136 | rp15 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp15; | ||
137 | else | ||
138 | rp14 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp14; | ||
139 | cp0 = bit6 ^ bit4 ^ bit2 ^ bit0 ^ cp0; | ||
140 | cp1 = bit7 ^ bit5 ^ bit3 ^ bit1 ^ cp1; | ||
141 | cp2 = bit5 ^ bit4 ^ bit1 ^ bit0 ^ cp2; | ||
142 | cp3 = bit7 ^ bit6 ^ bit3 ^ bit2 ^ cp3 | ||
143 | cp4 = bit3 ^ bit2 ^ bit1 ^ bit0 ^ cp4 | ||
144 | cp5 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ cp5 | ||
145 | } | ||
146 | |||
147 | |||
148 | Analysis 0 | ||
149 | ========== | ||
150 | |||
151 | C does have bitwise operators but not really operators to do the above | ||
152 | efficiently (and most hardware has no such instructions either). | ||
153 | Therefore without implementing this it was clear that the code above was | ||
154 | not going to bring me a Nobel prize :-) | ||
155 | |||
156 | Fortunately the exclusive or operation is commutative, so we can combine | ||
157 | the values in any order. So instead of calculating all the bits | ||
158 | individually, let us try to rearrange things. | ||
159 | For the column parity this is easy. We can just xor the bytes and in the | ||
160 | end filter out the relevant bits. This is pretty nice as it will bring | ||
161 | all cp calculation out of the if loop. | ||
162 | |||
163 | Similarly we can first xor the bytes for the various rows. | ||
164 | This leads to: | ||
165 | |||
166 | |||
167 | Attempt 1 | ||
168 | ========= | ||
169 | |||
170 | const char parity[256] = { | ||
171 | 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, | ||
172 | 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, | ||
173 | 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, | ||
174 | 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, | ||
175 | 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, | ||
176 | 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, | ||
177 | 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, | ||
178 | 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, | ||
179 | 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, | ||
180 | 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, | ||
181 | 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, | ||
182 | 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, | ||
183 | 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, | ||
184 | 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, | ||
185 | 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, | ||
186 | 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0 | ||
187 | }; | ||
188 | |||
189 | void ecc1(const unsigned char *buf, unsigned char *code) | ||
190 | { | ||
191 | int i; | ||
192 | const unsigned char *bp = buf; | ||
193 | unsigned char cur; | ||
194 | unsigned char rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7; | ||
195 | unsigned char rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15; | ||
196 | unsigned char par; | ||
197 | |||
198 | par = 0; | ||
199 | rp0 = 0; rp1 = 0; rp2 = 0; rp3 = 0; | ||
200 | rp4 = 0; rp5 = 0; rp6 = 0; rp7 = 0; | ||
201 | rp8 = 0; rp9 = 0; rp10 = 0; rp11 = 0; | ||
202 | rp12 = 0; rp13 = 0; rp14 = 0; rp15 = 0; | ||
203 | |||
204 | for (i = 0; i < 256; i++) | ||
205 | { | ||
206 | cur = *bp++; | ||
207 | par ^= cur; | ||
208 | if (i & 0x01) rp1 ^= cur; else rp0 ^= cur; | ||
209 | if (i & 0x02) rp3 ^= cur; else rp2 ^= cur; | ||
210 | if (i & 0x04) rp5 ^= cur; else rp4 ^= cur; | ||
211 | if (i & 0x08) rp7 ^= cur; else rp6 ^= cur; | ||
212 | if (i & 0x10) rp9 ^= cur; else rp8 ^= cur; | ||
213 | if (i & 0x20) rp11 ^= cur; else rp10 ^= cur; | ||
214 | if (i & 0x40) rp13 ^= cur; else rp12 ^= cur; | ||
215 | if (i & 0x80) rp15 ^= cur; else rp14 ^= cur; | ||
216 | } | ||
217 | code[0] = | ||
218 | (parity[rp7] << 7) | | ||
219 | (parity[rp6] << 6) | | ||
220 | (parity[rp5] << 5) | | ||
221 | (parity[rp4] << 4) | | ||
222 | (parity[rp3] << 3) | | ||
223 | (parity[rp2] << 2) | | ||
224 | (parity[rp1] << 1) | | ||
225 | (parity[rp0]); | ||
226 | code[1] = | ||
227 | (parity[rp15] << 7) | | ||
228 | (parity[rp14] << 6) | | ||
229 | (parity[rp13] << 5) | | ||
230 | (parity[rp12] << 4) | | ||
231 | (parity[rp11] << 3) | | ||
232 | (parity[rp10] << 2) | | ||
233 | (parity[rp9] << 1) | | ||
234 | (parity[rp8]); | ||
235 | code[2] = | ||
236 | (parity[par & 0xf0] << 7) | | ||
237 | (parity[par & 0x0f] << 6) | | ||
238 | (parity[par & 0xcc] << 5) | | ||
239 | (parity[par & 0x33] << 4) | | ||
240 | (parity[par & 0xaa] << 3) | | ||
241 | (parity[par & 0x55] << 2); | ||
242 | code[0] = ~code[0]; | ||
243 | code[1] = ~code[1]; | ||
244 | code[2] = ~code[2]; | ||
245 | } | ||
246 | |||
247 | Still pretty straightforward. The last three invert statements are there to | ||
248 | give a checksum of 0xff 0xff 0xff for an empty flash. In an empty flash | ||
249 | all data is 0xff, so the checksum then matches. | ||
250 | |||
251 | I also introduced the parity lookup. I expected this to be the fastest | ||
252 | way to calculate the parity, but I will investigate alternatives later | ||
253 | on. | ||
254 | |||
255 | |||
256 | Analysis 1 | ||
257 | ========== | ||
258 | |||
259 | The code works, but is not terribly efficient. On my system it took | ||
260 | almost 4 times as much time as the linux driver code. But hey, if it was | ||
261 | *that* easy this would have been done long before. | ||
262 | No pain. no gain. | ||
263 | |||
264 | Fortunately there is plenty of room for improvement. | ||
265 | |||
266 | In step 1 we moved from bit-wise calculation to byte-wise calculation. | ||
267 | However in C we can also use the unsigned long data type and virtually | ||
268 | every modern microprocessor supports 32 bit operations, so why not try | ||
269 | to write our code in such a way that we process data in 32 bit chunks. | ||
270 | |||
271 | Of course this means some modification as the row parity is byte by | ||
272 | byte. A quick analysis: | ||
273 | for the column parity we use the par variable. When extending to 32 bits | ||
274 | we can in the end easily calculate p0 and p1 from it. | ||
275 | (because par now consists of 4 bytes, contributing to rp1, rp0, rp1, rp0 | ||
276 | respectively) | ||
277 | also rp2 and rp3 can be easily retrieved from par as rp3 covers the | ||
278 | first two bytes and rp2 the last two bytes. | ||
279 | |||
280 | Note that of course now the loop is executed only 64 times (256/4). | ||
281 | And note that care must taken wrt byte ordering. The way bytes are | ||
282 | ordered in a long is machine dependent, and might affect us. | ||
283 | Anyway, if there is an issue: this code is developed on x86 (to be | ||
284 | precise: a DELL PC with a D920 Intel CPU) | ||
285 | |||
286 | And of course the performance might depend on alignment, but I expect | ||
287 | that the I/O buffers in the nand driver are aligned properly (and | ||
288 | otherwise that should be fixed to get maximum performance). | ||
289 | |||
290 | Let's give it a try... | ||
291 | |||
292 | |||
293 | Attempt 2 | ||
294 | ========= | ||
295 | |||
296 | extern const char parity[256]; | ||
297 | |||
298 | void ecc2(const unsigned char *buf, unsigned char *code) | ||
299 | { | ||
300 | int i; | ||
301 | const unsigned long *bp = (unsigned long *)buf; | ||
302 | unsigned long cur; | ||
303 | unsigned long rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7; | ||
304 | unsigned long rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15; | ||
305 | unsigned long par; | ||
306 | |||
307 | par = 0; | ||
308 | rp0 = 0; rp1 = 0; rp2 = 0; rp3 = 0; | ||
309 | rp4 = 0; rp5 = 0; rp6 = 0; rp7 = 0; | ||
310 | rp8 = 0; rp9 = 0; rp10 = 0; rp11 = 0; | ||
311 | rp12 = 0; rp13 = 0; rp14 = 0; rp15 = 0; | ||
312 | |||
313 | for (i = 0; i < 64; i++) | ||
314 | { | ||
315 | cur = *bp++; | ||
316 | par ^= cur; | ||
317 | if (i & 0x01) rp5 ^= cur; else rp4 ^= cur; | ||
318 | if (i & 0x02) rp7 ^= cur; else rp6 ^= cur; | ||
319 | if (i & 0x04) rp9 ^= cur; else rp8 ^= cur; | ||
320 | if (i & 0x08) rp11 ^= cur; else rp10 ^= cur; | ||
321 | if (i & 0x10) rp13 ^= cur; else rp12 ^= cur; | ||
322 | if (i & 0x20) rp15 ^= cur; else rp14 ^= cur; | ||
323 | } | ||
324 | /* | ||
325 | we need to adapt the code generation for the fact that rp vars are now | ||
326 | long; also the column parity calculation needs to be changed. | ||
327 | we'll bring rp4 to 15 back to single byte entities by shifting and | ||
328 | xoring | ||
329 | */ | ||
330 | rp4 ^= (rp4 >> 16); rp4 ^= (rp4 >> 8); rp4 &= 0xff; | ||
331 | rp5 ^= (rp5 >> 16); rp5 ^= (rp5 >> 8); rp5 &= 0xff; | ||
332 | rp6 ^= (rp6 >> 16); rp6 ^= (rp6 >> 8); rp6 &= 0xff; | ||
333 | rp7 ^= (rp7 >> 16); rp7 ^= (rp7 >> 8); rp7 &= 0xff; | ||
334 | rp8 ^= (rp8 >> 16); rp8 ^= (rp8 >> 8); rp8 &= 0xff; | ||
335 | rp9 ^= (rp9 >> 16); rp9 ^= (rp9 >> 8); rp9 &= 0xff; | ||
336 | rp10 ^= (rp10 >> 16); rp10 ^= (rp10 >> 8); rp10 &= 0xff; | ||
337 | rp11 ^= (rp11 >> 16); rp11 ^= (rp11 >> 8); rp11 &= 0xff; | ||
338 | rp12 ^= (rp12 >> 16); rp12 ^= (rp12 >> 8); rp12 &= 0xff; | ||
339 | rp13 ^= (rp13 >> 16); rp13 ^= (rp13 >> 8); rp13 &= 0xff; | ||
340 | rp14 ^= (rp14 >> 16); rp14 ^= (rp14 >> 8); rp14 &= 0xff; | ||
341 | rp15 ^= (rp15 >> 16); rp15 ^= (rp15 >> 8); rp15 &= 0xff; | ||
342 | rp3 = (par >> 16); rp3 ^= (rp3 >> 8); rp3 &= 0xff; | ||
343 | rp2 = par & 0xffff; rp2 ^= (rp2 >> 8); rp2 &= 0xff; | ||
344 | par ^= (par >> 16); | ||
345 | rp1 = (par >> 8); rp1 &= 0xff; | ||
346 | rp0 = (par & 0xff); | ||
347 | par ^= (par >> 8); par &= 0xff; | ||
348 | |||
349 | code[0] = | ||
350 | (parity[rp7] << 7) | | ||
351 | (parity[rp6] << 6) | | ||
352 | (parity[rp5] << 5) | | ||
353 | (parity[rp4] << 4) | | ||
354 | (parity[rp3] << 3) | | ||
355 | (parity[rp2] << 2) | | ||
356 | (parity[rp1] << 1) | | ||
357 | (parity[rp0]); | ||
358 | code[1] = | ||
359 | (parity[rp15] << 7) | | ||
360 | (parity[rp14] << 6) | | ||
361 | (parity[rp13] << 5) | | ||
362 | (parity[rp12] << 4) | | ||
363 | (parity[rp11] << 3) | | ||
364 | (parity[rp10] << 2) | | ||
365 | (parity[rp9] << 1) | | ||
366 | (parity[rp8]); | ||
367 | code[2] = | ||
368 | (parity[par & 0xf0] << 7) | | ||
369 | (parity[par & 0x0f] << 6) | | ||
370 | (parity[par & 0xcc] << 5) | | ||
371 | (parity[par & 0x33] << 4) | | ||
372 | (parity[par & 0xaa] << 3) | | ||
373 | (parity[par & 0x55] << 2); | ||
374 | code[0] = ~code[0]; | ||
375 | code[1] = ~code[1]; | ||
376 | code[2] = ~code[2]; | ||
377 | } | ||
378 | |||
379 | The parity array is not shown any more. Note also that for these | ||
380 | examples I kinda deviated from my regular programming style by allowing | ||
381 | multiple statements on a line, not using { } in then and else blocks | ||
382 | with only a single statement and by using operators like ^= | ||
383 | |||
384 | |||
385 | Analysis 2 | ||
386 | ========== | ||
387 | |||
388 | The code (of course) works, and hurray: we are a little bit faster than | ||
389 | the linux driver code (about 15%). But wait, don't cheer too quickly. | ||
390 | THere is more to be gained. | ||
391 | If we look at e.g. rp14 and rp15 we see that we either xor our data with | ||
392 | rp14 or with rp15. However we also have par which goes over all data. | ||
393 | This means there is no need to calculate rp14 as it can be calculated from | ||
394 | rp15 through rp14 = par ^ rp15; | ||
395 | (or if desired we can avoid calculating rp15 and calculate it from | ||
396 | rp14). That is why some places refer to inverse parity. | ||
397 | Of course the same thing holds for rp4/5, rp6/7, rp8/9, rp10/11 and rp12/13. | ||
398 | Effectively this means we can eliminate the else clause from the if | ||
399 | statements. Also we can optimise the calculation in the end a little bit | ||
400 | by going from long to byte first. Actually we can even avoid the table | ||
401 | lookups | ||
402 | |||
403 | Attempt 3 | ||
404 | ========= | ||
405 | |||
406 | Odd replaced: | ||
407 | if (i & 0x01) rp5 ^= cur; else rp4 ^= cur; | ||
408 | if (i & 0x02) rp7 ^= cur; else rp6 ^= cur; | ||
409 | if (i & 0x04) rp9 ^= cur; else rp8 ^= cur; | ||
410 | if (i & 0x08) rp11 ^= cur; else rp10 ^= cur; | ||
411 | if (i & 0x10) rp13 ^= cur; else rp12 ^= cur; | ||
412 | if (i & 0x20) rp15 ^= cur; else rp14 ^= cur; | ||
413 | with | ||
414 | if (i & 0x01) rp5 ^= cur; | ||
415 | if (i & 0x02) rp7 ^= cur; | ||
416 | if (i & 0x04) rp9 ^= cur; | ||
417 | if (i & 0x08) rp11 ^= cur; | ||
418 | if (i & 0x10) rp13 ^= cur; | ||
419 | if (i & 0x20) rp15 ^= cur; | ||
420 | |||
421 | and outside the loop added: | ||
422 | rp4 = par ^ rp5; | ||
423 | rp6 = par ^ rp7; | ||
424 | rp8 = par ^ rp9; | ||
425 | rp10 = par ^ rp11; | ||
426 | rp12 = par ^ rp13; | ||
427 | rp14 = par ^ rp15; | ||
428 | |||
429 | And after that the code takes about 30% more time, although the number of | ||
430 | statements is reduced. This is also reflected in the assembly code. | ||
431 | |||
432 | |||
433 | Analysis 3 | ||
434 | ========== | ||
435 | |||
436 | Very weird. Guess it has to do with caching or instruction parallellism | ||
437 | or so. I also tried on an eeePC (Celeron, clocked at 900 Mhz). Interesting | ||
438 | observation was that this one is only 30% slower (according to time) | ||
439 | executing the code as my 3Ghz D920 processor. | ||
440 | |||
441 | Well, it was expected not to be easy so maybe instead move to a | ||
442 | different track: let's move back to the code from attempt2 and do some | ||
443 | loop unrolling. This will eliminate a few if statements. I'll try | ||
444 | different amounts of unrolling to see what works best. | ||
445 | |||
446 | |||
447 | Attempt 4 | ||
448 | ========= | ||
449 | |||
450 | Unrolled the loop 1, 2, 3 and 4 times. | ||
451 | For 4 the code starts with: | ||
452 | |||
453 | for (i = 0; i < 4; i++) | ||
454 | { | ||
455 | cur = *bp++; | ||
456 | par ^= cur; | ||
457 | rp4 ^= cur; | ||
458 | rp6 ^= cur; | ||
459 | rp8 ^= cur; | ||
460 | rp10 ^= cur; | ||
461 | if (i & 0x1) rp13 ^= cur; else rp12 ^= cur; | ||
462 | if (i & 0x2) rp15 ^= cur; else rp14 ^= cur; | ||
463 | cur = *bp++; | ||
464 | par ^= cur; | ||
465 | rp5 ^= cur; | ||
466 | rp6 ^= cur; | ||
467 | ... | ||
468 | |||
469 | |||
470 | Analysis 4 | ||
471 | ========== | ||
472 | |||
473 | Unrolling once gains about 15% | ||
474 | Unrolling twice keeps the gain at about 15% | ||
475 | Unrolling three times gives a gain of 30% compared to attempt 2. | ||
476 | Unrolling four times gives a marginal improvement compared to unrolling | ||
477 | three times. | ||
478 | |||
479 | I decided to proceed with a four time unrolled loop anyway. It was my gut | ||
480 | feeling that in the next steps I would obtain additional gain from it. | ||
481 | |||
482 | The next step was triggered by the fact that par contains the xor of all | ||
483 | bytes and rp4 and rp5 each contain the xor of half of the bytes. | ||
484 | So in effect par = rp4 ^ rp5. But as xor is commutative we can also say | ||
485 | that rp5 = par ^ rp4. So no need to keep both rp4 and rp5 around. We can | ||
486 | eliminate rp5 (or rp4, but I already foresaw another optimisation). | ||
487 | The same holds for rp6/7, rp8/9, rp10/11 rp12/13 and rp14/15. | ||
488 | |||
489 | |||
490 | Attempt 5 | ||
491 | ========= | ||
492 | |||
493 | Effectively so all odd digit rp assignments in the loop were removed. | ||
494 | This included the else clause of the if statements. | ||
495 | Of course after the loop we need to correct things by adding code like: | ||
496 | rp5 = par ^ rp4; | ||
497 | Also the initial assignments (rp5 = 0; etc) could be removed. | ||
498 | Along the line I also removed the initialisation of rp0/1/2/3. | ||
499 | |||
500 | |||
501 | Analysis 5 | ||
502 | ========== | ||
503 | |||
504 | Measurements showed this was a good move. The run-time roughly halved | ||
505 | compared with attempt 4 with 4 times unrolled, and we only require 1/3rd | ||
506 | of the processor time compared to the current code in the linux kernel. | ||
507 | |||
508 | However, still I thought there was more. I didn't like all the if | ||
509 | statements. Why not keep a running parity and only keep the last if | ||
510 | statement. Time for yet another version! | ||
511 | |||
512 | |||
513 | Attempt 6 | ||
514 | ========= | ||
515 | |||
516 | THe code within the for loop was changed to: | ||
517 | |||
518 | for (i = 0; i < 4; i++) | ||
519 | { | ||
520 | cur = *bp++; tmppar = cur; rp4 ^= cur; | ||
521 | cur = *bp++; tmppar ^= cur; rp6 ^= tmppar; | ||
522 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; | ||
523 | cur = *bp++; tmppar ^= cur; rp8 ^= tmppar; | ||
524 | |||
525 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; | ||
526 | cur = *bp++; tmppar ^= cur; rp6 ^= cur; | ||
527 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; | ||
528 | cur = *bp++; tmppar ^= cur; rp10 ^= tmppar; | ||
529 | |||
530 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; rp8 ^= cur; | ||
531 | cur = *bp++; tmppar ^= cur; rp6 ^= cur; rp8 ^= cur; | ||
532 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp8 ^= cur; | ||
533 | cur = *bp++; tmppar ^= cur; rp8 ^= cur; | ||
534 | |||
535 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; | ||
536 | cur = *bp++; tmppar ^= cur; rp6 ^= cur; | ||
537 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; | ||
538 | cur = *bp++; tmppar ^= cur; | ||
539 | |||
540 | par ^= tmppar; | ||
541 | if ((i & 0x1) == 0) rp12 ^= tmppar; | ||
542 | if ((i & 0x2) == 0) rp14 ^= tmppar; | ||
543 | } | ||
544 | |||
545 | As you can see tmppar is used to accumulate the parity within a for | ||
546 | iteration. In the last 3 statements is is added to par and, if needed, | ||
547 | to rp12 and rp14. | ||
548 | |||
549 | While making the changes I also found that I could exploit that tmppar | ||
550 | contains the running parity for this iteration. So instead of having: | ||
551 | rp4 ^= cur; rp6 = cur; | ||
552 | I removed the rp6 = cur; statement and did rp6 ^= tmppar; on next | ||
553 | statement. A similar change was done for rp8 and rp10 | ||
554 | |||
555 | |||
556 | Analysis 6 | ||
557 | ========== | ||
558 | |||
559 | Measuring this code again showed big gain. When executing the original | ||
560 | linux code 1 million times, this took about 1 second on my system. | ||
561 | (using time to measure the performance). After this iteration I was back | ||
562 | to 0.075 sec. Actually I had to decide to start measuring over 10 | ||
563 | million interations in order not to loose too much accuracy. This one | ||
564 | definitely seemed to be the jackpot! | ||
565 | |||
566 | There is a little bit more room for improvement though. There are three | ||
567 | places with statements: | ||
568 | rp4 ^= cur; rp6 ^= cur; | ||
569 | It seems more efficient to also maintain a variable rp4_6 in the while | ||
570 | loop; This eliminates 3 statements per loop. Of course after the loop we | ||
571 | need to correct by adding: | ||
572 | rp4 ^= rp4_6; | ||
573 | rp6 ^= rp4_6 | ||
574 | Furthermore there are 4 sequential assingments to rp8. This can be | ||
575 | encoded slightly more efficient by saving tmppar before those 4 lines | ||
576 | and later do rp8 = rp8 ^ tmppar ^ notrp8; | ||
577 | (where notrp8 is the value of rp8 before those 4 lines). | ||
578 | Again a use of the commutative property of xor. | ||
579 | Time for a new test! | ||
580 | |||
581 | |||
582 | Attempt 7 | ||
583 | ========= | ||
584 | |||
585 | The new code now looks like: | ||
586 | |||
587 | for (i = 0; i < 4; i++) | ||
588 | { | ||
589 | cur = *bp++; tmppar = cur; rp4 ^= cur; | ||
590 | cur = *bp++; tmppar ^= cur; rp6 ^= tmppar; | ||
591 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; | ||
592 | cur = *bp++; tmppar ^= cur; rp8 ^= tmppar; | ||
593 | |||
594 | cur = *bp++; tmppar ^= cur; rp4_6 ^= cur; | ||
595 | cur = *bp++; tmppar ^= cur; rp6 ^= cur; | ||
596 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; | ||
597 | cur = *bp++; tmppar ^= cur; rp10 ^= tmppar; | ||
598 | |||
599 | notrp8 = tmppar; | ||
600 | cur = *bp++; tmppar ^= cur; rp4_6 ^= cur; | ||
601 | cur = *bp++; tmppar ^= cur; rp6 ^= cur; | ||
602 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; | ||
603 | cur = *bp++; tmppar ^= cur; | ||
604 | rp8 = rp8 ^ tmppar ^ notrp8; | ||
605 | |||
606 | cur = *bp++; tmppar ^= cur; rp4_6 ^= cur; | ||
607 | cur = *bp++; tmppar ^= cur; rp6 ^= cur; | ||
608 | cur = *bp++; tmppar ^= cur; rp4 ^= cur; | ||
609 | cur = *bp++; tmppar ^= cur; | ||
610 | |||
611 | par ^= tmppar; | ||
612 | if ((i & 0x1) == 0) rp12 ^= tmppar; | ||
613 | if ((i & 0x2) == 0) rp14 ^= tmppar; | ||
614 | } | ||
615 | rp4 ^= rp4_6; | ||
616 | rp6 ^= rp4_6; | ||
617 | |||
618 | |||
619 | Not a big change, but every penny counts :-) | ||
620 | |||
621 | |||
622 | Analysis 7 | ||
623 | ========== | ||
624 | |||
625 | Acutally this made things worse. Not very much, but I don't want to move | ||
626 | into the wrong direction. Maybe something to investigate later. Could | ||
627 | have to do with caching again. | ||
628 | |||
629 | Guess that is what there is to win within the loop. Maybe unrolling one | ||
630 | more time will help. I'll keep the optimisations from 7 for now. | ||
631 | |||
632 | |||
633 | Attempt 8 | ||
634 | ========= | ||
635 | |||
636 | Unrolled the loop one more time. | ||
637 | |||
638 | |||
639 | Analysis 8 | ||
640 | ========== | ||
641 | |||
642 | This makes things worse. Let's stick with attempt 6 and continue from there. | ||
643 | Although it seems that the code within the loop cannot be optimised | ||
644 | further there is still room to optimize the generation of the ecc codes. | ||
645 | We can simply calcualate the total parity. If this is 0 then rp4 = rp5 | ||
646 | etc. If the parity is 1, then rp4 = !rp5; | ||
647 | But if rp4 = rp5 we do not need rp5 etc. We can just write the even bits | ||
648 | in the result byte and then do something like | ||
649 | code[0] |= (code[0] << 1); | ||
650 | Lets test this. | ||
651 | |||
652 | |||
653 | Attempt 9 | ||
654 | ========= | ||
655 | |||
656 | Changed the code but again this slightly degrades performance. Tried all | ||
657 | kind of other things, like having dedicated parity arrays to avoid the | ||
658 | shift after parity[rp7] << 7; No gain. | ||
659 | Change the lookup using the parity array by using shift operators (e.g. | ||
660 | replace parity[rp7] << 7 with: | ||
661 | rp7 ^= (rp7 << 4); | ||
662 | rp7 ^= (rp7 << 2); | ||
663 | rp7 ^= (rp7 << 1); | ||
664 | rp7 &= 0x80; | ||
665 | No gain. | ||
666 | |||
667 | The only marginal change was inverting the parity bits, so we can remove | ||
668 | the last three invert statements. | ||
669 | |||
670 | Ah well, pity this does not deliver more. Then again 10 million | ||
671 | iterations using the linux driver code takes between 13 and 13.5 | ||
672 | seconds, whereas my code now takes about 0.73 seconds for those 10 | ||
673 | million iterations. So basically I've improved the performance by a | ||
674 | factor 18 on my system. Not that bad. Of course on different hardware | ||
675 | you will get different results. No warranties! | ||
676 | |||
677 | But of course there is no such thing as a free lunch. The codesize almost | ||
678 | tripled (from 562 bytes to 1434 bytes). Then again, it is not that much. | ||
679 | |||
680 | |||
681 | Correcting errors | ||
682 | ================= | ||
683 | |||
684 | For correcting errors I again used the ST application note as a starter, | ||
685 | but I also peeked at the existing code. | ||
686 | The algorithm itself is pretty straightforward. Just xor the given and | ||
687 | the calculated ecc. If all bytes are 0 there is no problem. If 11 bits | ||
688 | are 1 we have one correctable bit error. If there is 1 bit 1, we have an | ||
689 | error in the given ecc code. | ||
690 | It proved to be fastest to do some table lookups. Performance gain | ||
691 | introduced by this is about a factor 2 on my system when a repair had to | ||
692 | be done, and 1% or so if no repair had to be done. | ||
693 | Code size increased from 330 bytes to 686 bytes for this function. | ||
694 | (gcc 4.2, -O3) | ||
695 | |||
696 | |||
697 | Conclusion | ||
698 | ========== | ||
699 | |||
700 | The gain when calculating the ecc is tremendous. Om my development hardware | ||
701 | a speedup of a factor of 18 for ecc calculation was achieved. On a test on an | ||
702 | embedded system with a MIPS core a factor 7 was obtained. | ||
703 | On a test with a Linksys NSLU2 (ARMv5TE processor) the speedup was a factor | ||
704 | 5 (big endian mode, gcc 4.1.2, -O3) | ||
705 | For correction not much gain could be obtained (as bitflips are rare). Then | ||
706 | again there are also much less cycles spent there. | ||
707 | |||
708 | It seems there is not much more gain possible in this, at least when | ||
709 | programmed in C. Of course it might be possible to squeeze something more | ||
710 | out of it with an assembler program, but due to pipeline behaviour etc | ||
711 | this is very tricky (at least for intel hw). | ||
712 | |||
713 | Author: Frans Meulenbroeks | ||
714 | Copyright (C) 2008 Koninklijke Philips Electronics NV. | ||
diff --git a/drivers/mtd/nand/nand_ecc.c b/drivers/mtd/nand/nand_ecc.c index 918a806a8471..7129da51bb33 100644 --- a/drivers/mtd/nand/nand_ecc.c +++ b/drivers/mtd/nand/nand_ecc.c | |||
@@ -1,13 +1,18 @@ | |||
1 | /* | 1 | /* |
2 | * This file contains an ECC algorithm from Toshiba that detects and | 2 | * This file contains an ECC algorithm that detects and corrects 1 bit |
3 | * corrects 1 bit errors in a 256 byte block of data. | 3 | * errors in a 256 byte block of data. |
4 | * | 4 | * |
5 | * drivers/mtd/nand/nand_ecc.c | 5 | * drivers/mtd/nand/nand_ecc.c |
6 | * | 6 | * |
7 | * Copyright (C) 2000-2004 Steven J. Hill (sjhill@realitydiluted.com) | 7 | * Copyright (C) 2008 Koninklijke Philips Electronics NV. |
8 | * Toshiba America Electronics Components, Inc. | 8 | * Author: Frans Meulenbroeks |
9 | * | 9 | * |
10 | * Copyright (C) 2006 Thomas Gleixner <tglx@linutronix.de> | 10 | * Completely replaces the previous ECC implementation which was written by: |
11 | * Steven J. Hill (sjhill@realitydiluted.com) | ||
12 | * Thomas Gleixner (tglx@linutronix.de) | ||
13 | * | ||
14 | * Information on how this algorithm works and how it was developed | ||
15 | * can be found in Documentation/nand/ecc.txt | ||
11 | * | 16 | * |
12 | * This file is free software; you can redistribute it and/or modify it | 17 | * This file is free software; you can redistribute it and/or modify it |
13 | * under the terms of the GNU General Public License as published by the | 18 | * under the terms of the GNU General Public License as published by the |
@@ -23,174 +28,417 @@ | |||
23 | * with this file; if not, write to the Free Software Foundation, Inc., | 28 | * with this file; if not, write to the Free Software Foundation, Inc., |
24 | * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. | 29 | * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. |
25 | * | 30 | * |
26 | * As a special exception, if other files instantiate templates or use | ||
27 | * macros or inline functions from these files, or you compile these | ||
28 | * files and link them with other works to produce a work based on these | ||
29 | * files, these files do not by themselves cause the resulting work to be | ||
30 | * covered by the GNU General Public License. However the source code for | ||
31 | * these files must still be made available in accordance with section (3) | ||
32 | * of the GNU General Public License. | ||
33 | * | ||
34 | * This exception does not invalidate any other reasons why a work based on | ||
35 | * this file might be covered by the GNU General Public License. | ||
36 | */ | 31 | */ |
37 | 32 | ||
33 | /* | ||
34 | * The STANDALONE macro is useful when running the code outside the kernel | ||
35 | * e.g. when running the code in a testbed or a benchmark program. | ||
36 | * When STANDALONE is used, the module related macros are commented out | ||
37 | * as well as the linux include files. | ||
38 | * Instead a private definition of mtd_into is given to satisfy the compiler | ||
39 | * (the code does not use mtd_info, so the code does not care) | ||
40 | */ | ||
41 | #ifndef STANDALONE | ||
38 | #include <linux/types.h> | 42 | #include <linux/types.h> |
39 | #include <linux/kernel.h> | 43 | #include <linux/kernel.h> |
40 | #include <linux/module.h> | 44 | #include <linux/module.h> |
41 | #include <linux/mtd/nand_ecc.h> | 45 | #include <linux/mtd/nand_ecc.h> |
46 | #else | ||
47 | typedef uint32_t unsigned long | ||
48 | struct mtd_info { | ||
49 | int dummy; | ||
50 | }; | ||
51 | #define EXPORT_SYMBOL(x) /* x */ | ||
52 | |||
53 | #define MODULE_LICENSE(x) /* x */ | ||
54 | #define MODULE_AUTHOR(x) /* x */ | ||
55 | #define MODULE_DESCRIPTION(x) /* x */ | ||
56 | #endif | ||
57 | |||
58 | /* | ||
59 | * invparity is a 256 byte table that contains the odd parity | ||
60 | * for each byte. So if the number of bits in a byte is even, | ||
61 | * the array element is 1, and when the number of bits is odd | ||
62 | * the array eleemnt is 0. | ||
63 | */ | ||
64 | static const char invparity[256] = { | ||
65 | 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, | ||
66 | 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, | ||
67 | 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, | ||
68 | 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, | ||
69 | 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, | ||
70 | 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, | ||
71 | 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, | ||
72 | 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, | ||
73 | 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, | ||
74 | 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, | ||
75 | 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, | ||
76 | 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, | ||
77 | 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, | ||
78 | 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, | ||
79 | 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, | ||
80 | 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1 | ||
81 | }; | ||
42 | 82 | ||
43 | /* | 83 | /* |
44 | * Pre-calculated 256-way 1 byte column parity | 84 | * bitsperbyte contains the number of bits per byte |
85 | * this is only used for testing and repairing parity | ||
86 | * (a precalculated value slightly improves performance) | ||
45 | */ | 87 | */ |
46 | static const u_char nand_ecc_precalc_table[] = { | 88 | static const char bitsperbyte[256] = { |
47 | 0x00, 0x55, 0x56, 0x03, 0x59, 0x0c, 0x0f, 0x5a, 0x5a, 0x0f, 0x0c, 0x59, 0x03, 0x56, 0x55, 0x00, | 89 | 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, |
48 | 0x65, 0x30, 0x33, 0x66, 0x3c, 0x69, 0x6a, 0x3f, 0x3f, 0x6a, 0x69, 0x3c, 0x66, 0x33, 0x30, 0x65, | 90 | 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, |
49 | 0x66, 0x33, 0x30, 0x65, 0x3f, 0x6a, 0x69, 0x3c, 0x3c, 0x69, 0x6a, 0x3f, 0x65, 0x30, 0x33, 0x66, | 91 | 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, |
50 | 0x03, 0x56, 0x55, 0x00, 0x5a, 0x0f, 0x0c, 0x59, 0x59, 0x0c, 0x0f, 0x5a, 0x00, 0x55, 0x56, 0x03, | 92 | 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, |
51 | 0x69, 0x3c, 0x3f, 0x6a, 0x30, 0x65, 0x66, 0x33, 0x33, 0x66, 0x65, 0x30, 0x6a, 0x3f, 0x3c, 0x69, | 93 | 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, |
52 | 0x0c, 0x59, 0x5a, 0x0f, 0x55, 0x00, 0x03, 0x56, 0x56, 0x03, 0x00, 0x55, 0x0f, 0x5a, 0x59, 0x0c, | 94 | 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, |
53 | 0x0f, 0x5a, 0x59, 0x0c, 0x56, 0x03, 0x00, 0x55, 0x55, 0x00, 0x03, 0x56, 0x0c, 0x59, 0x5a, 0x0f, | 95 | 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, |
54 | 0x6a, 0x3f, 0x3c, 0x69, 0x33, 0x66, 0x65, 0x30, 0x30, 0x65, 0x66, 0x33, 0x69, 0x3c, 0x3f, 0x6a, | 96 | 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, |
55 | 0x6a, 0x3f, 0x3c, 0x69, 0x33, 0x66, 0x65, 0x30, 0x30, 0x65, 0x66, 0x33, 0x69, 0x3c, 0x3f, 0x6a, | 97 | 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, |
56 | 0x0f, 0x5a, 0x59, 0x0c, 0x56, 0x03, 0x00, 0x55, 0x55, 0x00, 0x03, 0x56, 0x0c, 0x59, 0x5a, 0x0f, | 98 | 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, |
57 | 0x0c, 0x59, 0x5a, 0x0f, 0x55, 0x00, 0x03, 0x56, 0x56, 0x03, 0x00, 0x55, 0x0f, 0x5a, 0x59, 0x0c, | 99 | 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, |
58 | 0x69, 0x3c, 0x3f, 0x6a, 0x30, 0x65, 0x66, 0x33, 0x33, 0x66, 0x65, 0x30, 0x6a, 0x3f, 0x3c, 0x69, | 100 | 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, |
59 | 0x03, 0x56, 0x55, 0x00, 0x5a, 0x0f, 0x0c, 0x59, 0x59, 0x0c, 0x0f, 0x5a, 0x00, 0x55, 0x56, 0x03, | 101 | 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, |
60 | 0x66, 0x33, 0x30, 0x65, 0x3f, 0x6a, 0x69, 0x3c, 0x3c, 0x69, 0x6a, 0x3f, 0x65, 0x30, 0x33, 0x66, | 102 | 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, |
61 | 0x65, 0x30, 0x33, 0x66, 0x3c, 0x69, 0x6a, 0x3f, 0x3f, 0x6a, 0x69, 0x3c, 0x66, 0x33, 0x30, 0x65, | 103 | 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, |
62 | 0x00, 0x55, 0x56, 0x03, 0x59, 0x0c, 0x0f, 0x5a, 0x5a, 0x0f, 0x0c, 0x59, 0x03, 0x56, 0x55, 0x00 | 104 | 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, |
105 | }; | ||
106 | |||
107 | /* | ||
108 | * addressbits is a lookup table to filter out the bits from the xor-ed | ||
109 | * ecc data that identify the faulty location. | ||
110 | * this is only used for repairing parity | ||
111 | * see the comments in nand_correct_data for more details | ||
112 | */ | ||
113 | static const char addressbits[256] = { | ||
114 | 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, | ||
115 | 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, | ||
116 | 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, | ||
117 | 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, | ||
118 | 0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05, | ||
119 | 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07, | ||
120 | 0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05, | ||
121 | 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07, | ||
122 | 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, | ||
123 | 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, | ||
124 | 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, | ||
125 | 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, | ||
126 | 0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05, | ||
127 | 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07, | ||
128 | 0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05, | ||
129 | 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07, | ||
130 | 0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09, | ||
131 | 0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b, | ||
132 | 0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09, | ||
133 | 0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b, | ||
134 | 0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d, | ||
135 | 0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f, | ||
136 | 0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d, | ||
137 | 0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f, | ||
138 | 0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09, | ||
139 | 0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b, | ||
140 | 0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09, | ||
141 | 0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b, | ||
142 | 0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d, | ||
143 | 0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f, | ||
144 | 0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d, | ||
145 | 0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f | ||
63 | }; | 146 | }; |
64 | 147 | ||
65 | /** | 148 | /** |
66 | * nand_calculate_ecc - [NAND Interface] Calculate 3-byte ECC for 256-byte block | 149 | * nand_calculate_ecc - [NAND Interface] Calculate 3-byte ECC for 256-byte block |
67 | * @mtd: MTD block structure | 150 | * @mtd: MTD block structure (unused) |
68 | * @dat: raw data | 151 | * @dat: raw data |
69 | * @ecc_code: buffer for ECC | 152 | * @ecc_code: buffer for ECC |
70 | */ | 153 | */ |
71 | int nand_calculate_ecc(struct mtd_info *mtd, const u_char *dat, | 154 | int nand_calculate_ecc(struct mtd_info *mtd, const unsigned char *buf, |
72 | u_char *ecc_code) | 155 | unsigned char *code) |
73 | { | 156 | { |
74 | uint8_t idx, reg1, reg2, reg3, tmp1, tmp2; | ||
75 | int i; | 157 | int i; |
158 | const uint32_t *bp = (uint32_t *)buf; | ||
159 | uint32_t cur; /* current value in buffer */ | ||
160 | /* rp0..rp15 are the various accumulated parities (per byte) */ | ||
161 | uint32_t rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7; | ||
162 | uint32_t rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15; | ||
163 | uint32_t par; /* the cumulative parity for all data */ | ||
164 | uint32_t tmppar; /* the cumulative parity for this iteration; | ||
165 | for rp12 and rp14 at the end of the loop */ | ||
166 | |||
167 | par = 0; | ||
168 | rp4 = 0; | ||
169 | rp6 = 0; | ||
170 | rp8 = 0; | ||
171 | rp10 = 0; | ||
172 | rp12 = 0; | ||
173 | rp14 = 0; | ||
174 | |||
175 | /* | ||
176 | * The loop is unrolled a number of times; | ||
177 | * This avoids if statements to decide on which rp value to update | ||
178 | * Also we process the data by longwords. | ||
179 | * Note: passing unaligned data might give a performance penalty. | ||
180 | * It is assumed that the buffers are aligned. | ||
181 | * tmppar is the cumulative sum of this iteration. | ||
182 | * needed for calculating rp12, rp14 and par | ||
183 | * also used as a performance improvement for rp6, rp8 and rp10 | ||
184 | */ | ||
185 | for (i = 0; i < 4; i++) { | ||
186 | cur = *bp++; | ||
187 | tmppar = cur; | ||
188 | rp4 ^= cur; | ||
189 | cur = *bp++; | ||
190 | tmppar ^= cur; | ||
191 | rp6 ^= tmppar; | ||
192 | cur = *bp++; | ||
193 | tmppar ^= cur; | ||
194 | rp4 ^= cur; | ||
195 | cur = *bp++; | ||
196 | tmppar ^= cur; | ||
197 | rp8 ^= tmppar; | ||
76 | 198 | ||
77 | /* Initialize variables */ | 199 | cur = *bp++; |
78 | reg1 = reg2 = reg3 = 0; | 200 | tmppar ^= cur; |
201 | rp4 ^= cur; | ||
202 | rp6 ^= cur; | ||
203 | cur = *bp++; | ||
204 | tmppar ^= cur; | ||
205 | rp6 ^= cur; | ||
206 | cur = *bp++; | ||
207 | tmppar ^= cur; | ||
208 | rp4 ^= cur; | ||
209 | cur = *bp++; | ||
210 | tmppar ^= cur; | ||
211 | rp10 ^= tmppar; | ||
79 | 212 | ||
80 | /* Build up column parity */ | 213 | cur = *bp++; |
81 | for(i = 0; i < 256; i++) { | 214 | tmppar ^= cur; |
82 | /* Get CP0 - CP5 from table */ | 215 | rp4 ^= cur; |
83 | idx = nand_ecc_precalc_table[*dat++]; | 216 | rp6 ^= cur; |
84 | reg1 ^= (idx & 0x3f); | 217 | rp8 ^= cur; |
218 | cur = *bp++; | ||
219 | tmppar ^= cur; | ||
220 | rp6 ^= cur; | ||
221 | rp8 ^= cur; | ||
222 | cur = *bp++; | ||
223 | tmppar ^= cur; | ||
224 | rp4 ^= cur; | ||
225 | rp8 ^= cur; | ||
226 | cur = *bp++; | ||
227 | tmppar ^= cur; | ||
228 | rp8 ^= cur; | ||
85 | 229 | ||
86 | /* All bit XOR = 1 ? */ | 230 | cur = *bp++; |
87 | if (idx & 0x40) { | 231 | tmppar ^= cur; |
88 | reg3 ^= (uint8_t) i; | 232 | rp4 ^= cur; |
89 | reg2 ^= ~((uint8_t) i); | 233 | rp6 ^= cur; |
90 | } | 234 | cur = *bp++; |
235 | tmppar ^= cur; | ||
236 | rp6 ^= cur; | ||
237 | cur = *bp++; | ||
238 | tmppar ^= cur; | ||
239 | rp4 ^= cur; | ||
240 | cur = *bp++; | ||
241 | tmppar ^= cur; | ||
242 | |||
243 | par ^= tmppar; | ||
244 | if ((i & 0x1) == 0) | ||
245 | rp12 ^= tmppar; | ||
246 | if ((i & 0x2) == 0) | ||
247 | rp14 ^= tmppar; | ||
91 | } | 248 | } |
92 | 249 | ||
93 | /* Create non-inverted ECC code from line parity */ | 250 | /* |
94 | tmp1 = (reg3 & 0x80) >> 0; /* B7 -> B7 */ | 251 | * handle the fact that we use longword operations |
95 | tmp1 |= (reg2 & 0x80) >> 1; /* B7 -> B6 */ | 252 | * we'll bring rp4..rp14 back to single byte entities by shifting and |
96 | tmp1 |= (reg3 & 0x40) >> 1; /* B6 -> B5 */ | 253 | * xoring first fold the upper and lower 16 bits, |
97 | tmp1 |= (reg2 & 0x40) >> 2; /* B6 -> B4 */ | 254 | * then the upper and lower 8 bits. |
98 | tmp1 |= (reg3 & 0x20) >> 2; /* B5 -> B3 */ | 255 | */ |
99 | tmp1 |= (reg2 & 0x20) >> 3; /* B5 -> B2 */ | 256 | rp4 ^= (rp4 >> 16); |
100 | tmp1 |= (reg3 & 0x10) >> 3; /* B4 -> B1 */ | 257 | rp4 ^= (rp4 >> 8); |
101 | tmp1 |= (reg2 & 0x10) >> 4; /* B4 -> B0 */ | 258 | rp4 &= 0xff; |
102 | 259 | rp6 ^= (rp6 >> 16); | |
103 | tmp2 = (reg3 & 0x08) << 4; /* B3 -> B7 */ | 260 | rp6 ^= (rp6 >> 8); |
104 | tmp2 |= (reg2 & 0x08) << 3; /* B3 -> B6 */ | 261 | rp6 &= 0xff; |
105 | tmp2 |= (reg3 & 0x04) << 3; /* B2 -> B5 */ | 262 | rp8 ^= (rp8 >> 16); |
106 | tmp2 |= (reg2 & 0x04) << 2; /* B2 -> B4 */ | 263 | rp8 ^= (rp8 >> 8); |
107 | tmp2 |= (reg3 & 0x02) << 2; /* B1 -> B3 */ | 264 | rp8 &= 0xff; |
108 | tmp2 |= (reg2 & 0x02) << 1; /* B1 -> B2 */ | 265 | rp10 ^= (rp10 >> 16); |
109 | tmp2 |= (reg3 & 0x01) << 1; /* B0 -> B1 */ | 266 | rp10 ^= (rp10 >> 8); |
110 | tmp2 |= (reg2 & 0x01) << 0; /* B7 -> B0 */ | 267 | rp10 &= 0xff; |
111 | 268 | rp12 ^= (rp12 >> 16); | |
112 | /* Calculate final ECC code */ | 269 | rp12 ^= (rp12 >> 8); |
270 | rp12 &= 0xff; | ||
271 | rp14 ^= (rp14 >> 16); | ||
272 | rp14 ^= (rp14 >> 8); | ||
273 | rp14 &= 0xff; | ||
274 | |||
275 | /* | ||
276 | * we also need to calculate the row parity for rp0..rp3 | ||
277 | * This is present in par, because par is now | ||
278 | * rp3 rp3 rp2 rp2 | ||
279 | * as well as | ||
280 | * rp1 rp0 rp1 rp0 | ||
281 | * First calculate rp2 and rp3 | ||
282 | * (and yes: rp2 = (par ^ rp3) & 0xff; but doing that did not | ||
283 | * give a performance improvement) | ||
284 | */ | ||
285 | rp3 = (par >> 16); | ||
286 | rp3 ^= (rp3 >> 8); | ||
287 | rp3 &= 0xff; | ||
288 | rp2 = par & 0xffff; | ||
289 | rp2 ^= (rp2 >> 8); | ||
290 | rp2 &= 0xff; | ||
291 | |||
292 | /* reduce par to 16 bits then calculate rp1 and rp0 */ | ||
293 | par ^= (par >> 16); | ||
294 | rp1 = (par >> 8) & 0xff; | ||
295 | rp0 = (par & 0xff); | ||
296 | |||
297 | /* finally reduce par to 8 bits */ | ||
298 | par ^= (par >> 8); | ||
299 | par &= 0xff; | ||
300 | |||
301 | /* | ||
302 | * and calculate rp5..rp15 | ||
303 | * note that par = rp4 ^ rp5 and due to the commutative property | ||
304 | * of the ^ operator we can say: | ||
305 | * rp5 = (par ^ rp4); | ||
306 | * The & 0xff seems superfluous, but benchmarking learned that | ||
307 | * leaving it out gives slightly worse results. No idea why, probably | ||
308 | * it has to do with the way the pipeline in pentium is organized. | ||
309 | */ | ||
310 | rp5 = (par ^ rp4) & 0xff; | ||
311 | rp7 = (par ^ rp6) & 0xff; | ||
312 | rp9 = (par ^ rp8) & 0xff; | ||
313 | rp11 = (par ^ rp10) & 0xff; | ||
314 | rp13 = (par ^ rp12) & 0xff; | ||
315 | rp15 = (par ^ rp14) & 0xff; | ||
316 | |||
317 | /* | ||
318 | * Finally calculate the ecc bits. | ||
319 | * Again here it might seem that there are performance optimisations | ||
320 | * possible, but benchmarks showed that on the system this is developed | ||
321 | * the code below is the fastest | ||
322 | */ | ||
113 | #ifdef CONFIG_MTD_NAND_ECC_SMC | 323 | #ifdef CONFIG_MTD_NAND_ECC_SMC |
114 | ecc_code[0] = ~tmp2; | 324 | code[0] = |
115 | ecc_code[1] = ~tmp1; | 325 | (invparity[rp7] << 7) | |
326 | (invparity[rp6] << 6) | | ||
327 | (invparity[rp5] << 5) | | ||
328 | (invparity[rp4] << 4) | | ||
329 | (invparity[rp3] << 3) | | ||
330 | (invparity[rp2] << 2) | | ||
331 | (invparity[rp1] << 1) | | ||
332 | (invparity[rp0]); | ||
333 | code[1] = | ||
334 | (invparity[rp15] << 7) | | ||
335 | (invparity[rp14] << 6) | | ||
336 | (invparity[rp13] << 5) | | ||
337 | (invparity[rp12] << 4) | | ||
338 | (invparity[rp11] << 3) | | ||
339 | (invparity[rp10] << 2) | | ||
340 | (invparity[rp9] << 1) | | ||
341 | (invparity[rp8]); | ||
116 | #else | 342 | #else |
117 | ecc_code[0] = ~tmp1; | 343 | code[1] = |
118 | ecc_code[1] = ~tmp2; | 344 | (invparity[rp7] << 7) | |
345 | (invparity[rp6] << 6) | | ||
346 | (invparity[rp5] << 5) | | ||
347 | (invparity[rp4] << 4) | | ||
348 | (invparity[rp3] << 3) | | ||
349 | (invparity[rp2] << 2) | | ||
350 | (invparity[rp1] << 1) | | ||
351 | (invparity[rp0]); | ||
352 | code[0] = | ||
353 | (invparity[rp15] << 7) | | ||
354 | (invparity[rp14] << 6) | | ||
355 | (invparity[rp13] << 5) | | ||
356 | (invparity[rp12] << 4) | | ||
357 | (invparity[rp11] << 3) | | ||
358 | (invparity[rp10] << 2) | | ||
359 | (invparity[rp9] << 1) | | ||
360 | (invparity[rp8]); | ||
119 | #endif | 361 | #endif |
120 | ecc_code[2] = ((~reg1) << 2) | 0x03; | 362 | code[2] = |
121 | 363 | (invparity[par & 0xf0] << 7) | | |
364 | (invparity[par & 0x0f] << 6) | | ||
365 | (invparity[par & 0xcc] << 5) | | ||
366 | (invparity[par & 0x33] << 4) | | ||
367 | (invparity[par & 0xaa] << 3) | | ||
368 | (invparity[par & 0x55] << 2) | | ||
369 | 3; | ||
122 | return 0; | 370 | return 0; |
123 | } | 371 | } |
124 | EXPORT_SYMBOL(nand_calculate_ecc); | 372 | EXPORT_SYMBOL(nand_calculate_ecc); |
125 | 373 | ||
126 | static inline int countbits(uint32_t byte) | ||
127 | { | ||
128 | int res = 0; | ||
129 | |||
130 | for (;byte; byte >>= 1) | ||
131 | res += byte & 0x01; | ||
132 | return res; | ||
133 | } | ||
134 | |||
135 | /** | 374 | /** |
136 | * nand_correct_data - [NAND Interface] Detect and correct bit error(s) | 375 | * nand_correct_data - [NAND Interface] Detect and correct bit error(s) |
137 | * @mtd: MTD block structure | 376 | * @mtd: MTD block structure (unused) |
138 | * @dat: raw data read from the chip | 377 | * @dat: raw data read from the chip |
139 | * @read_ecc: ECC from the chip | 378 | * @read_ecc: ECC from the chip |
140 | * @calc_ecc: the ECC calculated from raw data | 379 | * @calc_ecc: the ECC calculated from raw data |
141 | * | 380 | * |
142 | * Detect and correct a 1 bit error for 256 byte block | 381 | * Detect and correct a 1 bit error for 256 byte block |
143 | */ | 382 | */ |
144 | int nand_correct_data(struct mtd_info *mtd, u_char *dat, | 383 | int nand_correct_data(struct mtd_info *mtd, unsigned char *buf, |
145 | u_char *read_ecc, u_char *calc_ecc) | 384 | unsigned char *read_ecc, unsigned char *calc_ecc) |
146 | { | 385 | { |
147 | uint8_t s0, s1, s2; | 386 | int nr_bits; |
387 | unsigned char b0, b1, b2; | ||
388 | unsigned char byte_addr, bit_addr; | ||
148 | 389 | ||
390 | /* | ||
391 | * b0 to b2 indicate which bit is faulty (if any) | ||
392 | * we might need the xor result more than once, | ||
393 | * so keep them in a local var | ||
394 | */ | ||
149 | #ifdef CONFIG_MTD_NAND_ECC_SMC | 395 | #ifdef CONFIG_MTD_NAND_ECC_SMC |
150 | s0 = calc_ecc[0] ^ read_ecc[0]; | 396 | b0 = read_ecc[0] ^ calc_ecc[0]; |
151 | s1 = calc_ecc[1] ^ read_ecc[1]; | 397 | b1 = read_ecc[1] ^ calc_ecc[1]; |
152 | s2 = calc_ecc[2] ^ read_ecc[2]; | ||
153 | #else | 398 | #else |
154 | s1 = calc_ecc[0] ^ read_ecc[0]; | 399 | b0 = read_ecc[1] ^ calc_ecc[1]; |
155 | s0 = calc_ecc[1] ^ read_ecc[1]; | 400 | b1 = read_ecc[0] ^ calc_ecc[0]; |
156 | s2 = calc_ecc[2] ^ read_ecc[2]; | ||
157 | #endif | 401 | #endif |
158 | if ((s0 | s1 | s2) == 0) | 402 | b2 = read_ecc[2] ^ calc_ecc[2]; |
159 | return 0; | ||
160 | |||
161 | /* Check for a single bit error */ | ||
162 | if( ((s0 ^ (s0 >> 1)) & 0x55) == 0x55 && | ||
163 | ((s1 ^ (s1 >> 1)) & 0x55) == 0x55 && | ||
164 | ((s2 ^ (s2 >> 1)) & 0x54) == 0x54) { | ||
165 | |||
166 | uint32_t byteoffs, bitnum; | ||
167 | 403 | ||
168 | byteoffs = (s1 << 0) & 0x80; | 404 | /* check if there are any bitfaults */ |
169 | byteoffs |= (s1 << 1) & 0x40; | ||
170 | byteoffs |= (s1 << 2) & 0x20; | ||
171 | byteoffs |= (s1 << 3) & 0x10; | ||
172 | 405 | ||
173 | byteoffs |= (s0 >> 4) & 0x08; | 406 | /* count nr of bits; use table lookup, faster than calculating it */ |
174 | byteoffs |= (s0 >> 3) & 0x04; | 407 | nr_bits = bitsperbyte[b0] + bitsperbyte[b1] + bitsperbyte[b2]; |
175 | byteoffs |= (s0 >> 2) & 0x02; | ||
176 | byteoffs |= (s0 >> 1) & 0x01; | ||
177 | 408 | ||
178 | bitnum = (s2 >> 5) & 0x04; | 409 | /* repeated if statements are slightly more efficient than switch ... */ |
179 | bitnum |= (s2 >> 4) & 0x02; | 410 | /* ordered in order of likelihood */ |
180 | bitnum |= (s2 >> 3) & 0x01; | 411 | if (nr_bits == 0) |
181 | 412 | return (0); /* no error */ | |
182 | dat[byteoffs] ^= (1 << bitnum); | 413 | if (nr_bits == 11) { /* correctable error */ |
183 | 414 | /* | |
184 | return 1; | 415 | * rp15/13/11/9/7/5/3/1 indicate which byte is the faulty byte |
416 | * cp 5/3/1 indicate the faulty bit. | ||
417 | * A lookup table (called addressbits) is used to filter | ||
418 | * the bits from the byte they are in. | ||
419 | * A marginal optimisation is possible by having three | ||
420 | * different lookup tables. | ||
421 | * One as we have now (for b0), one for b2 | ||
422 | * (that would avoid the >> 1), and one for b1 (with all values | ||
423 | * << 4). However it was felt that introducing two more tables | ||
424 | * hardly justify the gain. | ||
425 | * | ||
426 | * The b2 shift is there to get rid of the lowest two bits. | ||
427 | * We could also do addressbits[b2] >> 1 but for the | ||
428 | * performace it does not make any difference | ||
429 | */ | ||
430 | byte_addr = (addressbits[b1] << 4) + addressbits[b0]; | ||
431 | bit_addr = addressbits[b2 >> 2]; | ||
432 | /* flip the bit */ | ||
433 | buf[byte_addr] ^= (1 << bit_addr); | ||
434 | return (1); | ||
185 | } | 435 | } |
186 | 436 | if (nr_bits == 1) | |
187 | if(countbits(s0 | ((uint32_t)s1 << 8) | ((uint32_t)s2 <<16)) == 1) | 437 | return (1); /* error in ecc data; no action needed */ |
188 | return 1; | 438 | return -1; |
189 | |||
190 | return -EBADMSG; | ||
191 | } | 439 | } |
192 | EXPORT_SYMBOL(nand_correct_data); | 440 | EXPORT_SYMBOL(nand_correct_data); |
193 | 441 | ||
194 | MODULE_LICENSE("GPL"); | 442 | MODULE_LICENSE("GPL"); |
195 | MODULE_AUTHOR("Steven J. Hill <sjhill@realitydiluted.com>"); | 443 | MODULE_AUTHOR("Frans Meulenbroeks <fransmeulenbroeks@gmail.com>"); |
196 | MODULE_DESCRIPTION("Generic NAND ECC support"); | 444 | MODULE_DESCRIPTION("Generic NAND ECC support"); |