Diffstat (limited to 'fs/logfs/gc.c')

-rw-r--r--  fs/logfs/gc.c | 730
1 file changed, 730 insertions, 0 deletions
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
new file mode 100644
index 000000000000..b3656c44190e
--- /dev/null
+++ b/fs/logfs/gc.c
@@ -0,0 +1,730 @@
/*
 * fs/logfs/gc.c - garbage collection code
 *
 * As should be obvious for Linux kernel code, license is GPLv2
 *
 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
 */
#include "logfs.h"
#include <linux/sched.h>

/*
 * Wear leveling needs to kick in when the difference between low erase
 * counts and high erase counts gets too big. A good value for "too big"
 * may be somewhat below 10% of maximum erase count for the device.
 * Why not 397, to pick a nice round number with no specific meaning? :)
 *
 * WL_RATELIMIT is the minimum time between two wear level events. A huge
 * number of segments may fulfil the requirements for wear leveling at the
 * same time. If that happens we don't want to cause a latency from hell,
 * but just gently pick one segment every so often and minimize overhead.
 */
#define WL_DELTA 397
#define WL_RATELIMIT 100
#define MAX_OBJ_ALIASES 2600
#define SCAN_RATIO 512	/* number of scanned segments per gc'd segment */
#define LIST_SIZE 64	/* base size of candidate lists */
#define SCAN_ROUNDS 128	/* maximum number of complete medium scans */
#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */

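/*
 * For illustration (device rating made up): on flash rated for roughly
 * 10,000 erase cycles, 10% of the maximum erase count would be 1,000, so
 * WL_DELTA at 397 stays comfortably below that guideline. WL_RATELIMIT
 * at 100 allows at most one wear-level event per 100 ticks of the global
 * erase counter.
 */
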
static int no_free_segments(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);

	return super->s_free_list.count;
}

/* journal has distance -1, top-most ifile layer distance 0 */
static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
{
	struct logfs_super *super = logfs_super(sb);
	u8 gc_level = (__force u8)__gc_level;

	switch (gc_level) {
	case 0: /* fall through */
	case 1: /* fall through */
	case 2: /* fall through */
	case 3:
		/* file data or indirect blocks */
		return super->s_ifile_levels + super->s_iblock_levels - gc_level;
	case 6: /* fall through */
	case 7: /* fall through */
	case 8: /* fall through */
	case 9:
		/* inode file data or indirect blocks */
		return super->s_ifile_levels - (gc_level - 6);
	default:
		printk(KERN_ERR "LOGFS: segment of unknown level %x found\n",
				gc_level);
		WARN_ON(1);
		return super->s_ifile_levels + super->s_iblock_levels;
	}
}

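/*
 * Worked example (level counts assumed): with s_ifile_levels == 3 and
 * s_iblock_levels == 4, plain file data (gc_level 0) sits at distance 7,
 * a level-3 indirect block at distance 4, and the top-most inode-file
 * level (gc_level 9 here) at distance 0, matching the comment above.
 */
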
static int segment_is_reserved(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area;
	void *reserved;
	int i;

	/* Some segments are reserved. Just pretend they were all valid */
	reserved = btree_lookup32(&super->s_reserved_segments, segno);
	if (reserved)
		return 1;

	/* Currently open segments */
	for_each_area(i) {
		area = super->s_area[i];
		if (area->a_is_open && area->a_segno == segno)
			return 1;
	}

	return 0;
}

/* Bad segment handling is not implemented; hitting one is currently fatal. */
static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
{
	BUG();
}

/*
 * Returns the bytes consumed by valid objects in this segment. Object headers
 * are counted, the segment header is not.
 */
static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
		gc_level_t *gc_level)
{
	struct logfs_segment_entry se;
	u32 ec_level;

	logfs_get_segment_entry(sb, segno, &se);
	if (se.ec_level == cpu_to_be32(BADSEG) ||
			se.valid == cpu_to_be32(RESERVED))
		return RESERVED;

	ec_level = be32_to_cpu(se.ec_level);
	*ec = ec_level >> 4;
	*gc_level = GC_LEVEL(ec_level & 0xf);
	return be32_to_cpu(se.valid);
}
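
/*
 * For illustration: ec_level packs the erase count into the upper 28 bits
 * and the gc level into the low 4 bits, so an ec_level of 0x1234 unpacks
 * to an erase count of 0x123 and gc level 4.
 */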

static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
		u64 bix, gc_level_t gc_level)
{
	struct inode *inode;
	int err, cookie;

	inode = logfs_safe_iget(sb, ino, &cookie);
	err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
	BUG_ON(err);
	logfs_safe_iput(inode, cookie);
}

static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_segment_header sh;
	struct logfs_object_header oh;
	u64 ofs, ino, bix;
	u32 seg_ofs, logical_segno, cleaned = 0;
	int err, len, valid;
	gc_level_t gc_level;

	LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);

	btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
	err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
	BUG_ON(err);
	gc_level = GC_LEVEL(sh.level);
	logical_segno = be32_to_cpu(sh.segno);
	if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
		logfs_mark_segment_bad(sb, segno);
		cleaned = -1;
		goto out;
	}

	for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
			seg_ofs + sizeof(oh) < super->s_segsize; ) {
		ofs = dev_ofs(sb, logical_segno, seg_ofs);
		err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
				&oh);
		BUG_ON(err);

		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
			break;

		if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
			logfs_mark_segment_bad(sb, segno);
			cleaned = super->s_segsize - 1;
			goto out;
		}

		ino = be64_to_cpu(oh.ino);
		bix = be64_to_cpu(oh.bix);
		len = sizeof(oh) + be16_to_cpu(oh.len);
		valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
		if (valid == 1) {
			logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
			cleaned += len;
		} else if (valid == 2) {
			/* Will be invalid upon journal commit */
			cleaned += len;
		}
		seg_ofs += len;
	}
out:
	btree_remove32(&super->s_reserved_segments, segno);
	return cleaned;
}
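
/*
 * For illustration: the loop above walks the segment object by object,
 * consuming sizeof(oh) + oh.len bytes per step, and stops early when it
 * finds an all-0xff header, i.e. flash that was never written after the
 * last erase.
 */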

/*
 * Insert a candidate into a bounded, sorted rb-tree. Returns NULL while
 * the list still has room, otherwise evicts and returns the worst (last)
 * candidate so the caller can try it on another list or free it.
 */
static struct gc_candidate *add_list(struct gc_candidate *cand,
		struct candidate_list *list)
{
	struct rb_node **p = &list->rb_tree.rb_node;
	struct rb_node *parent = NULL;
	struct gc_candidate *cur;
	int comp;

	cand->list = list;
	while (*p) {
		parent = *p;
		cur = rb_entry(parent, struct gc_candidate, rb_node);

		if (list->sort_by_ec)
			comp = cand->erase_count < cur->erase_count;
		else
			comp = cand->valid < cur->valid;

		if (comp)
			p = &parent->rb_left;
		else
			p = &parent->rb_right;
	}
	rb_link_node(&cand->rb_node, parent, p);
	rb_insert_color(&cand->rb_node, &list->rb_tree);

	if (list->count <= list->maxcount) {
		list->count++;
		return NULL;
	}
	cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
	rb_erase(&cand->rb_node, &list->rb_tree);
	cand->list = NULL;
	return cand;
}

static void remove_from_list(struct gc_candidate *cand)
{
	struct candidate_list *list = cand->list;

	rb_erase(&cand->rb_node, &list->rb_tree);
	list->count--;
}

static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);

	btree_remove32(&super->s_cand_tree, cand->segno);
	kfree(cand);
}

u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
{
	struct gc_candidate *cand;
	u32 segno;

	BUG_ON(list->count == 0);

	cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
	remove_from_list(cand);
	segno = cand->segno;
	if (ec)
		*ec = cand->erase_count;
	free_candidate(sb, cand);
	return segno;
}

/*
 * We have several lists to manage segments with. The reserve_list is used to
 * deal with bad blocks. We try to keep the best (lowest ec) segments on this
 * list.
 * The free_list contains free segments for normal usage. It usually gets the
 * second pick after the reserve_list. But when the free_list is running short
 * it is more important to keep the free_list full than to keep a reserve.
 *
 * Segments that are not free are put onto a per-level low_list. If we have
 * to run garbage collection, we pick a candidate from there. All segments on
 * those lists should have at least some free space so GC will make progress.
 *
 * And last we have the ec_list, which is used to pick segments for wear
 * leveling.
 *
 * If all appropriate lists are full, we simply free the candidate and forget
 * about that segment for a while. We have better candidates for each purpose.
 */
static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);
	u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;

	if (cand->valid == 0) {
		/* 100% free segments */
		log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
				cand->segno, cand->erase_count,
				dev_ofs(sb, cand->segno, 0));
		cand = add_list(cand, &super->s_reserve_list);
		if (cand) {
			log_gc_noisy("add free segment %x (ec %x) at %llx\n",
					cand->segno, cand->erase_count,
					dev_ofs(sb, cand->segno, 0));
			cand = add_list(cand, &super->s_free_list);
		}
	} else {
		/* good candidates for Garbage Collection */
		if (cand->valid < full)
			cand = add_list(cand, &super->s_low_list[cand->dist]);
		/* good candidates for wear leveling,
		 * segments that were recently written get ignored */
		if (cand)
			cand = add_list(cand, &super->s_ec_list);
	}
	if (cand)
		free_candidate(sb, cand);
}
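
/*
 * For illustration: a freshly erased segment whose erase count is too high
 * for the reserve list is handed back by add_list() and retried on the
 * free list; only when that one is full as well does the candidate get
 * freed and forgotten for a while.
 */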

static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
		u8 dist)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;

	cand = kmalloc(sizeof(*cand), GFP_NOFS);
	if (!cand)
		return -ENOMEM;

	cand->segno = segno;
	cand->valid = valid;
	cand->erase_count = ec;
	cand->dist = dist;

	btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
	__add_candidate(sb, cand);
	return 0;
}

static void remove_segment_from_lists(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;

	cand = btree_lookup32(&super->s_cand_tree, segno);
	if (cand) {
		remove_from_list(cand);
		free_candidate(sb, cand);
	}
}

static void scan_segment(struct super_block *sb, u32 segno)
{
	u32 valid, ec = 0;
	gc_level_t gc_level = 0;
	u8 dist;

	if (segment_is_reserved(sb, segno))
		return;

	remove_segment_from_lists(sb, segno);
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	if (valid == RESERVED)
		return;

	dist = root_distance(sb, gc_level);
	add_candidate(sb, segno, valid, ec, dist);
}

static struct gc_candidate *first_in_list(struct candidate_list *list)
{
	if (list->count == 0)
		return NULL;
	return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
}

/*
 * Find the best segment for garbage collection. Main criterion is
 * the segment requiring the least effort to clean. Secondary
 * criterion is to GC on the lowest level available.
 *
 * So we search the least effort segment on the lowest level first,
 * then move up and pick another segment iff it requires significantly
 * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
 */
static struct gc_candidate *get_candidate(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i, max_dist;
	struct gc_candidate *cand = NULL, *this;

	max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS);

	for (i = max_dist; i >= 0; i--) {
		this = first_in_list(&super->s_low_list[i]);
		if (!this)
			continue;
		if (!cand)
			cand = this;
		if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
			cand = this;
	}
	return cand;
}
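
/*
 * For illustration (byte counts made up): if the best file-data candidate
 * still holds 40,000 valid bytes, a segment closer to the root replaces it
 * only if it holds at most 40,000 - LOGFS_MAX_OBJECTSIZE valid bytes,
 * i.e. if cleaning it is cheaper by at least one maximal object.
 */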

static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);
	gc_level_t gc_level;
	u32 cleaned, valid, segno, ec;
	u8 dist;

	if (!cand) {
		log_gc("GC attempted, but no candidate found\n");
		return 0;
	}

	segno = cand->segno;
	dist = cand->dist;
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	free_candidate(sb, cand);
	log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
			segno, (u64)segno << super->s_segshift,
			dist, no_free_segments(sb), valid,
			super->s_free_bytes);
	cleaned = logfs_gc_segment(sb, segno, dist);
	log_gc("GC segment #%02x complete - now %x valid\n", segno,
			valid - cleaned);
	BUG_ON(cleaned != valid);
	return 1;
}

static int logfs_gc_once(struct super_block *sb)
{
	struct gc_candidate *cand;

	cand = get_candidate(sb);
	if (cand)
		remove_from_list(cand);
	return __logfs_gc_once(sb, cand);
}

/* returns 1 if a wrap occurs, 0 otherwise */
static int logfs_scan_some(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	u32 segno;
	int i, ret = 0;

	segno = super->s_sweeper;
	for (i = SCAN_RATIO; i > 0; i--) {
		segno++;
		if (segno >= super->s_no_segs) {
			segno = 0;
			ret = 1;
			/* Break out of the loop. That way the next
			 * invocation starts reading segment entries from
			 * the beginning again; if SCAN_RATIO is set to
			 * match the block size, each invocation then reads
			 * a single block.
			 */
			break;
		}

		scan_segment(sb, segno);
	}
	super->s_sweeper = segno;
	return ret;
}

/*
 * In principle, this function should loop forever, looking for GC candidates
 * and moving data. LogFS is designed in such a way that this loop is
 * guaranteed to terminate.
 *
 * Limiting the loop to some iterations serves purely to catch cases when
 * these guarantees have failed. An actual endless loop is an obvious bug
 * and should be reported as such.
 */
static void __logfs_gc_pass(struct super_block *sb, int target)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_block *block;
	int round, progress, last_progress = 0;

	if (no_free_segments(sb) >= target &&
			super->s_no_object_aliases < MAX_OBJ_ALIASES)
		return;

	log_gc("__logfs_gc_pass(%x)\n", target);
	for (round = 0; round < SCAN_ROUNDS; ) {
		if (no_free_segments(sb) >= target)
			goto write_alias;

		/* Sync in-memory state with on-medium state in case they
		 * diverged */
		logfs_write_anchor(super->s_master_inode);
		round += logfs_scan_some(sb);
		if (no_free_segments(sb) >= target)
			goto write_alias;
		progress = logfs_gc_once(sb);
		if (progress)
			last_progress = round;
		else if (round - last_progress > 2)
			break;
		continue;

		/*
		 * The goto logic is nasty, I just don't know a better way to
		 * code it. GC is supposed to ensure two things:
		 * 1. Enough free segments are available.
		 * 2. The number of aliases is bounded.
		 * When 1. is achieved, we take a look at 2. and write back
		 * some alias-containing blocks, if necessary. However, after
		 * each such write we need to go back to 1., as writes can
		 * consume free segments.
		 */
write_alias:
		if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
			return;
		if (list_empty(&super->s_object_alias)) {
			/* All aliases are still in btree */
			return;
		}
		log_gc("Write back one alias\n");
		block = list_entry(super->s_object_alias.next,
				struct logfs_block, alias_list);
		block->ops->write_block(block);
		/*
		 * To round off the nasty goto logic, we reset round here.
		 * round is a safety net against GC not making any progress,
		 * and is limited to something reasonably small. If we
		 * incremented it for every single alias written back, the
		 * loop could terminate rather quickly.
		 */
		round = 0;
	}
	LOGFS_BUG(sb);
}

static int wl_ratelimit(struct super_block *sb, u64 *next_event)
{
	struct logfs_super *super = logfs_super(sb);

	if (*next_event < super->s_gec) {
		*next_event = super->s_gec + WL_RATELIMIT;
		return 0;
	}
	return 1;
}
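
/*
 * For illustration: s_gec is the global erase counter, so with
 * WL_RATELIMIT 100 the gate above opens (returns 0) at most once per 100
 * segment erases; every call in between returns 1 and the caller backs off.
 */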

static void logfs_wl_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *wl_cand, *free_cand;

	if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
		return;

	wl_cand = first_in_list(&super->s_ec_list);
	if (!wl_cand)
		return;
	free_cand = first_in_list(&super->s_free_list);
	if (!free_cand)
		return;

	if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
		remove_from_list(wl_cand);
		__logfs_gc_once(sb, wl_cand);
	}
}

/*
 * The journal needs wear leveling as well. But moving the journal is an
 * expensive operation so we try to avoid it as much as possible. And if we
 * have to do it, we move the whole journal, not individual segments.
 *
 * Ratelimiting is not strictly necessary here; it mainly serves to avoid
 * the calculations below. First we check whether moving the journal would
 * be a significant improvement. That means that a) the current journal
 * segments have more wear than the future journal segments and b) the
 * current journal segments have more wear than normal ostore segments.
 * Rationale for b) is that we don't have to move the journal if it is aging
 * less than the ostore, even if the reserve segments age even less (they are
 * excluded from wear leveling, after all).
 * Next we check that the superblocks have less wear than the journal. Since
 * moving the journal requires writing the superblocks, we have to protect the
 * superblocks even more than the journal.
 *
 * Also we double the acceptable wear difference, compared to ostore wear
 * leveling. Journal data is read and rewritten comparatively rapidly, so
 * soft errors have much less time to accumulate and we allow the journal
 * to be a bit worse than the ostore.
 */
static void logfs_journal_wl_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;
	u32 min_journal_ec = -1, max_reserve_ec = 0;
	int i;

	if (wl_ratelimit(sb, &super->s_wl_gec_journal))
		return;

	if (super->s_reserve_list.count < super->s_no_journal_segs) {
		/* Reserve is not full enough to move complete journal */
		return;
	}

	journal_for_each(i)
		if (super->s_journal_seg[i])
			min_journal_ec = min(min_journal_ec,
					super->s_journal_ec[i]);
	cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
			struct gc_candidate, rb_node);
	max_reserve_ec = cand->erase_count;
	for (i = 0; i < 2; i++) {
		struct logfs_segment_entry se;
		u32 segno = seg_no(sb, super->s_sb_ofs[i]);
		u32 ec;

		logfs_get_segment_entry(sb, segno, &se);
		ec = be32_to_cpu(se.ec_level) >> 4;
		max_reserve_ec = max(max_reserve_ec, ec);
	}

	if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
		do_logfs_journal_wl_pass(sb);
	}
}
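
/*
 * For illustration: with WL_DELTA 397, the journal is only relocated once
 * even its least-worn segment exceeds the reference erase count computed
 * above (the free-list head and both superblock segments) by more than
 * 2 * WL_DELTA, i.e. 794 erases.
 */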

void logfs_gc_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);

	//BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
	/* Write journal before free space is getting saturated with dirty
	 * objects.
	 */
	if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
			+ LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
		logfs_write_anchor(super->s_master_inode);
	__logfs_gc_pass(sb, logfs_super(sb)->s_total_levels);
	logfs_wl_pass(sb);
	logfs_journal_wl_pass(sb);
}

/*
 * After a crash, an open area may contain objects written after the last
 * journal commit that are not yet reflected in a_used_bytes. Walk the
 * area forward from a_used_bytes and account for any valid objects found.
 */
static int check_area(struct super_block *sb, int i)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area = super->s_area[i];
	struct logfs_object_header oh;
	u32 segno = area->a_segno;
	u32 ofs = area->a_used_bytes;
	__be32 crc;
	int err;

	if (!area->a_is_open)
		return 0;

	for (ofs = area->a_used_bytes;
			ofs <= super->s_segsize - sizeof(oh);
			ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) {
		err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh);
		if (err)
			return err;

		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
			break;

		crc = logfs_crc32(&oh, sizeof(oh) - 4, 4);
		if (crc != oh.crc) {
			printk(KERN_INFO "interrupted header at %llx\n",
					dev_ofs(sb, segno, ofs));
			return 0;
		}
	}
	if (ofs != area->a_used_bytes) {
		printk(KERN_INFO "%x bytes unaccounted data found at %llx\n",
				ofs - area->a_used_bytes,
				dev_ofs(sb, segno, area->a_used_bytes));
		area->a_used_bytes = ofs;
	}
	return 0;
}

int logfs_check_areas(struct super_block *sb)
{
	int i, err;

	for_each_area(i) {
		err = check_area(sb, i);
		if (err)
			return err;
	}
	return 0;
}

static void logfs_init_candlist(struct candidate_list *list, int maxcount,
		int sort_by_ec)
{
	list->count = 0;
	list->maxcount = maxcount;
	list->sort_by_ec = sort_by_ec;
	list->rb_tree = RB_ROOT;
}

int logfs_init_gc(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
	logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
	logfs_init_candlist(&super->s_reserve_list,
			super->s_bad_seg_reserve, 1);
	for_each_area(i)
		logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
	logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
	return 0;
}

static void logfs_cleanup_list(struct super_block *sb,
		struct candidate_list *list)
{
	struct gc_candidate *cand;

	while (list->count) {
		cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
				rb_node);
		remove_from_list(cand);
		free_candidate(sb, cand);
	}
	BUG_ON(list->rb_tree.rb_node);
}

void logfs_cleanup_gc(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	if (!super->s_free_list.count)
		return;

	/*
	 * FIXME: The btree may still contain a single empty node. So we
	 * call the grim visitor to clean up that mess. Btree code should
	 * do it for us, really.
	 */
	btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
	logfs_cleanup_list(sb, &super->s_free_list);
	logfs_cleanup_list(sb, &super->s_reserve_list);
	for_each_area(i)
		logfs_cleanup_list(sb, &super->s_low_list[i]);
	logfs_cleanup_list(sb, &super->s_ec_list);
}