Diffstat (limited to 'fs/logfs/gc.c')
-rw-r--r--  fs/logfs/gc.c | 731
1 files changed, 731 insertions, 0 deletions
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
new file mode 100644
index 000000000000..84e36f52fe95
--- /dev/null
+++ b/fs/logfs/gc.c
@@ -0,0 +1,731 @@
1 | /* | ||
2 | * fs/logfs/gc.c - garbage collection code | ||
3 | * | ||
4 | * As should be obvious for Linux kernel code, license is GPLv2 | ||
5 | * | ||
6 | * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> | ||
7 | */ | ||
8 | #include "logfs.h" | ||
9 | #include <linux/sched.h> | ||
10 | #include <linux/slab.h> | ||
11 | |||
12 | /* | ||
13 | * Wear leveling needs to kick in when the difference between low erase | ||
14 | * counts and high erase counts gets too big. A good value for "too big" | ||
15 | * may be somewhat below 10% of maximum erase count for the device. | ||
16 | * Why not 397, to pick a nice round number with no specific meaning? :) | ||
17 | * | ||
18 | * WL_RATELIMIT is the minimum time between two wear level events. A huge | ||
19 | * number of segments may fulfil the requirements for wear leveling at the | ||
20 | * same time. If that happens we don't want to cause a latency from hell, | ||
21 | * but just gently pick one segment every so often and minimize overhead. | ||
22 | */ | ||
23 | #define WL_DELTA 397 | ||
24 | #define WL_RATELIMIT 100 | ||
25 | #define MAX_OBJ_ALIASES 2600 | ||
26 | #define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */ | ||
27 | #define LIST_SIZE 64 /* base size of candidate lists */ | ||
28 | #define SCAN_ROUNDS 128 /* maximum number of complete medium scans */ | ||
29 | #define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */ | ||
30 | |||
31 | static int no_free_segments(struct super_block *sb) | ||
32 | { | ||
33 | struct logfs_super *super = logfs_super(sb); | ||
34 | |||
35 | return super->s_free_list.count; | ||
36 | } | ||
37 | |||
38 | /* journal has distance -1, top-most ifile layer distance 0 */ | ||
39 | static u8 root_distance(struct super_block *sb, gc_level_t __gc_level) | ||
40 | { | ||
41 | struct logfs_super *super = logfs_super(sb); | ||
42 | u8 gc_level = (__force u8)__gc_level; | ||
43 | |||
44 | switch (gc_level) { | ||
45 | case 0: /* fall through */ | ||
46 | case 1: /* fall through */ | ||
47 | case 2: /* fall through */ | ||
48 | case 3: | ||
49 | /* file data or indirect blocks */ | ||
50 | return super->s_ifile_levels + super->s_iblock_levels - gc_level; | ||
51 | case 6: /* fall through */ | ||
52 | case 7: /* fall through */ | ||
53 | case 8: /* fall through */ | ||
54 | case 9: | ||
55 | /* inode file data or indirect blocks */ | ||
56 | return super->s_ifile_levels - (gc_level - 6); | ||
57 | default: | ||
58 | printk(KERN_ERR"LOGFS: segment of unknown level %x found\n", | ||
59 | gc_level); | ||
60 | WARN_ON(1); | ||
61 | return super->s_ifile_levels + super->s_iblock_levels; | ||
62 | } | ||
63 | } | ||
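For illustration only (the level counts below are invented, not values from gc.c): with three ifile levels and four iblock levels, the mapping above gives file data the largest root distance and the top-most ifile layer distance 0. A minimal standalone sketch:

/*
 * Illustrative sketch of root_distance()'s mapping with made-up level
 * counts (3 ifile levels, 4 iblock levels).  Levels 4, 5 and >9 are
 * invalid inputs and not handled here.
 */
#include <stdio.h>

static unsigned int example_distance(unsigned int gc_level)
{
	const unsigned int ifile_levels = 3, iblock_levels = 4;

	if (gc_level <= 3)	/* file data or indirect blocks */
		return ifile_levels + iblock_levels - gc_level;
	return ifile_levels - (gc_level - 6);	/* inode file, levels 6..9 */
}

int main(void)
{
	printf("level 0 -> distance %u\n", example_distance(0));	/* 7 */
	printf("level 3 -> distance %u\n", example_distance(3));	/* 4 */
	printf("level 6 -> distance %u\n", example_distance(6));	/* 3 */
	printf("level 9 -> distance %u\n", example_distance(9));	/* 0 */
	return 0;
}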
64 | |||
65 | static int segment_is_reserved(struct super_block *sb, u32 segno) | ||
66 | { | ||
67 | struct logfs_super *super = logfs_super(sb); | ||
68 | struct logfs_area *area; | ||
69 | void *reserved; | ||
70 | int i; | ||
71 | |||
72 | /* Some segments are reserved. Just pretend they were all valid */ | ||
73 | reserved = btree_lookup32(&super->s_reserved_segments, segno); | ||
74 | if (reserved) | ||
75 | return 1; | ||
76 | |||
77 | /* Currently open segments */ | ||
78 | for_each_area(i) { | ||
79 | area = super->s_area[i]; | ||
80 | if (area->a_is_open && area->a_segno == segno) | ||
81 | return 1; | ||
82 | } | ||
83 | |||
84 | return 0; | ||
85 | } | ||
86 | |||
87 | static void logfs_mark_segment_bad(struct super_block *sb, u32 segno) | ||
88 | { | ||
89 | BUG(); | ||
90 | } | ||
91 | |||
92 | /* | ||
93 | * Returns the bytes consumed by valid objects in this segment. Object headers | ||
94 | * are counted, the segment header is not. | ||
95 | */ | ||
96 | static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec, | ||
97 | gc_level_t *gc_level) | ||
98 | { | ||
99 | struct logfs_segment_entry se; | ||
100 | u32 ec_level; | ||
101 | |||
102 | logfs_get_segment_entry(sb, segno, &se); | ||
103 | if (se.ec_level == cpu_to_be32(BADSEG) || | ||
104 | se.valid == cpu_to_be32(RESERVED)) | ||
105 | return RESERVED; | ||
106 | |||
107 | ec_level = be32_to_cpu(se.ec_level); | ||
108 | *ec = ec_level >> 4; | ||
109 | *gc_level = GC_LEVEL(ec_level & 0xf); | ||
110 | return be32_to_cpu(se.valid); | ||
111 | } | ||
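An aside on the field decoded above: se.ec_level packs the erase count and the GC level into a single 32-bit value, erase count in the upper 28 bits and level in the low nibble - inferred from the ">> 4" and "& 0xf" here and in logfs_journal_wl_pass() further down. A minimal sketch of that packing:

/* Illustrative only: the ec_level packing implied by the code above. */
#include <stdint.h>

static inline uint32_t pack_ec_level(uint32_t ec, uint8_t level)
{
	return (ec << 4) | (level & 0xf);
}

static inline void unpack_ec_level(uint32_t ec_level,
				   uint32_t *ec, uint8_t *level)
{
	*ec = ec_level >> 4;		/* upper 28 bits: erase count */
	*level = ec_level & 0xf;	/* low nibble: gc level */
}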
112 | |||
113 | static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino, | ||
114 | u64 bix, gc_level_t gc_level) | ||
115 | { | ||
116 | struct inode *inode; | ||
117 | int err, cookie; | ||
118 | |||
119 | inode = logfs_safe_iget(sb, ino, &cookie); | ||
120 | err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0); | ||
121 | BUG_ON(err); | ||
122 | logfs_safe_iput(inode, cookie); | ||
123 | } | ||
124 | |||
125 | static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist) | ||
126 | { | ||
127 | struct logfs_super *super = logfs_super(sb); | ||
128 | struct logfs_segment_header sh; | ||
129 | struct logfs_object_header oh; | ||
130 | u64 ofs, ino, bix; | ||
131 | u32 seg_ofs, logical_segno, cleaned = 0; | ||
132 | int err, len, valid; | ||
133 | gc_level_t gc_level; | ||
134 | |||
135 | LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb); | ||
136 | |||
137 | btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS); | ||
138 | err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); | ||
139 | BUG_ON(err); | ||
140 | gc_level = GC_LEVEL(sh.level); | ||
141 | logical_segno = be32_to_cpu(sh.segno); | ||
142 | if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) { | ||
143 | logfs_mark_segment_bad(sb, segno); | ||
144 | cleaned = -1; | ||
145 | goto out; | ||
146 | } | ||
147 | |||
148 | for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE; | ||
149 | seg_ofs + sizeof(oh) < super->s_segsize; ) { | ||
150 | ofs = dev_ofs(sb, logical_segno, seg_ofs); | ||
151 | err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh), | ||
152 | &oh); | ||
153 | BUG_ON(err); | ||
154 | |||
155 | if (!memchr_inv(&oh, 0xff, sizeof(oh))) | ||
156 | break; | ||
157 | |||
158 | if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) { | ||
159 | logfs_mark_segment_bad(sb, segno); | ||
160 | cleaned = super->s_segsize - 1; | ||
161 | goto out; | ||
162 | } | ||
163 | |||
164 | ino = be64_to_cpu(oh.ino); | ||
165 | bix = be64_to_cpu(oh.bix); | ||
166 | len = sizeof(oh) + be16_to_cpu(oh.len); | ||
167 | valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level); | ||
168 | if (valid == 1) { | ||
169 | logfs_cleanse_block(sb, ofs, ino, bix, gc_level); | ||
170 | cleaned += len; | ||
171 | } else if (valid == 2) { | ||
172 | /* Will be invalid upon journal commit */ | ||
173 | cleaned += len; | ||
174 | } | ||
175 | seg_ofs += len; | ||
176 | } | ||
177 | out: | ||
178 | btree_remove32(&super->s_reserved_segments, segno); | ||
179 | return cleaned; | ||
180 | } | ||
181 | |||
182 | static struct gc_candidate *add_list(struct gc_candidate *cand, | ||
183 | struct candidate_list *list) | ||
184 | { | ||
185 | struct rb_node **p = &list->rb_tree.rb_node; | ||
186 | struct rb_node *parent = NULL; | ||
187 | struct gc_candidate *cur; | ||
188 | int comp; | ||
189 | |||
190 | cand->list = list; | ||
191 | while (*p) { | ||
192 | parent = *p; | ||
193 | cur = rb_entry(parent, struct gc_candidate, rb_node); | ||
194 | |||
195 | if (list->sort_by_ec) | ||
196 | comp = cand->erase_count < cur->erase_count; | ||
197 | else | ||
198 | comp = cand->valid < cur->valid; | ||
199 | |||
200 | if (comp) | ||
201 | p = &parent->rb_left; | ||
202 | else | ||
203 | p = &parent->rb_right; | ||
204 | } | ||
205 | rb_link_node(&cand->rb_node, parent, p); | ||
206 | rb_insert_color(&cand->rb_node, &list->rb_tree); | ||
207 | |||
208 | if (list->count <= list->maxcount) { | ||
209 | list->count++; | ||
210 | return NULL; | ||
211 | } | ||
212 | cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node); | ||
213 | rb_erase(&cand->rb_node, &list->rb_tree); | ||
214 | cand->list = NULL; | ||
215 | return cand; | ||
216 | } | ||
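add_list() keeps each candidate list bounded: the new candidate is linked into the rb-tree in sort order, and once the list exceeds its maxcount the worst entry (the tree's last node) is unlinked and handed back to the caller, which either routes it to another list or frees it. A simplified, self-contained model of that contract, using a sorted array instead of an rbtree (names and sizes invented for illustration):

#define MODEL_MAXCOUNT 4

struct model_list {
	int vals[MODEL_MAXCOUNT + 1];	/* room for one temporary overflow */
	int count;
};

/*
 * Insert 'val' keeping the array sorted ascending (best entry first).
 * Returns -1 if the value was kept, otherwise the evicted worst value,
 * mirroring add_list() returning NULL or the displaced candidate.
 */
static int model_add(struct model_list *l, int val)
{
	int i = l->count;

	while (i > 0 && l->vals[i - 1] > val) {
		l->vals[i] = l->vals[i - 1];
		i--;
	}
	l->vals[i] = val;

	if (l->count < MODEL_MAXCOUNT) {
		l->count++;
		return -1;
	}
	return l->vals[MODEL_MAXCOUNT];	/* worst entry falls off the end */
}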
217 | |||
218 | static void remove_from_list(struct gc_candidate *cand) | ||
219 | { | ||
220 | struct candidate_list *list = cand->list; | ||
221 | |||
222 | rb_erase(&cand->rb_node, &list->rb_tree); | ||
223 | list->count--; | ||
224 | } | ||
225 | |||
226 | static void free_candidate(struct super_block *sb, struct gc_candidate *cand) | ||
227 | { | ||
228 | struct logfs_super *super = logfs_super(sb); | ||
229 | |||
230 | btree_remove32(&super->s_cand_tree, cand->segno); | ||
231 | kfree(cand); | ||
232 | } | ||
233 | |||
234 | u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec) | ||
235 | { | ||
236 | struct gc_candidate *cand; | ||
237 | u32 segno; | ||
238 | |||
239 | BUG_ON(list->count == 0); | ||
240 | |||
241 | cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); | ||
242 | remove_from_list(cand); | ||
243 | segno = cand->segno; | ||
244 | if (ec) | ||
245 | *ec = cand->erase_count; | ||
246 | free_candidate(sb, cand); | ||
247 | return segno; | ||
248 | } | ||
249 | |||
250 | /* | ||
251 | * We have several lists to manage segments with. The reserve_list is used to | ||
252 | * deal with bad blocks. We try to keep the best (lowest ec) segments on this | ||
253 | * list. | ||
254 | * The free_list contains free segments for normal usage. It usually gets the | ||
255 | * second pick after the reserve_list. But when the free_list is running short | ||
256 | * it is more important to keep the free_list full than to keep a reserve. | ||
257 | * | ||
258 | * Segments that are not free are put onto a per-level low_list. If we have | ||
259 | * to run garbage collection, we pick a candidate from there. All segments on | ||
260 | * those lists should have at least some free space so GC will make progress. | ||
261 | * | ||
262 | * And last we have the ec_list, which is used to pick segments for wear | ||
263 | * leveling. | ||
264 | * | ||
265 | * If all appropriate lists are full, we simply free the candidate and forget | ||
266 | * about that segment for a while. We have better candidates for each purpose. | ||
267 | */ | ||
268 | static void __add_candidate(struct super_block *sb, struct gc_candidate *cand) | ||
269 | { | ||
270 | struct logfs_super *super = logfs_super(sb); | ||
271 | u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE; | ||
272 | |||
273 | if (cand->valid == 0) { | ||
274 | /* 100% free segments */ | ||
275 | log_gc_noisy("add reserve segment %x (ec %x) at %llx\n", | ||
276 | cand->segno, cand->erase_count, | ||
277 | dev_ofs(sb, cand->segno, 0)); | ||
278 | cand = add_list(cand, &super->s_reserve_list); | ||
279 | if (cand) { | ||
280 | log_gc_noisy("add free segment %x (ec %x) at %llx\n", | ||
281 | cand->segno, cand->erase_count, | ||
282 | dev_ofs(sb, cand->segno, 0)); | ||
283 | cand = add_list(cand, &super->s_free_list); | ||
284 | } | ||
285 | } else { | ||
286 | /* good candidates for Garbage Collection */ | ||
287 | if (cand->valid < full) | ||
288 | cand = add_list(cand, &super->s_low_list[cand->dist]); | ||
289 | /* good candidates for wear leveling, | ||
290 | * segments that were recently written get ignored */ | ||
291 | if (cand) | ||
292 | cand = add_list(cand, &super->s_ec_list); | ||
293 | } | ||
294 | if (cand) | ||
295 | free_candidate(sb, cand); | ||
296 | } | ||
297 | |||
298 | static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec, | ||
299 | u8 dist) | ||
300 | { | ||
301 | struct logfs_super *super = logfs_super(sb); | ||
302 | struct gc_candidate *cand; | ||
303 | |||
304 | cand = kmalloc(sizeof(*cand), GFP_NOFS); | ||
305 | if (!cand) | ||
306 | return -ENOMEM; | ||
307 | |||
308 | cand->segno = segno; | ||
309 | cand->valid = valid; | ||
310 | cand->erase_count = ec; | ||
311 | cand->dist = dist; | ||
312 | |||
313 | btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS); | ||
314 | __add_candidate(sb, cand); | ||
315 | return 0; | ||
316 | } | ||
317 | |||
318 | static void remove_segment_from_lists(struct super_block *sb, u32 segno) | ||
319 | { | ||
320 | struct logfs_super *super = logfs_super(sb); | ||
321 | struct gc_candidate *cand; | ||
322 | |||
323 | cand = btree_lookup32(&super->s_cand_tree, segno); | ||
324 | if (cand) { | ||
325 | remove_from_list(cand); | ||
326 | free_candidate(sb, cand); | ||
327 | } | ||
328 | } | ||
329 | |||
330 | static void scan_segment(struct super_block *sb, u32 segno) | ||
331 | { | ||
332 | u32 valid, ec = 0; | ||
333 | gc_level_t gc_level = 0; | ||
334 | u8 dist; | ||
335 | |||
336 | if (segment_is_reserved(sb, segno)) | ||
337 | return; | ||
338 | |||
339 | remove_segment_from_lists(sb, segno); | ||
340 | valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); | ||
341 | if (valid == RESERVED) | ||
342 | return; | ||
343 | |||
344 | dist = root_distance(sb, gc_level); | ||
345 | add_candidate(sb, segno, valid, ec, dist); | ||
346 | } | ||
347 | |||
348 | static struct gc_candidate *first_in_list(struct candidate_list *list) | ||
349 | { | ||
350 | if (list->count == 0) | ||
351 | return NULL; | ||
352 | return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); | ||
353 | } | ||
354 | |||
355 | /* | ||
356 | * Find the best segment for garbage collection. Main criterion is | ||
357 | * the segment requiring the least effort to clean. Secondary | ||
358 | * criterion is to GC on the lowest level available. | ||
359 | * | ||
360 | * So we search the least effort segment on the lowest level first, | ||
361 | * then move up and pick another segment iff it requires significantly | ||
362 | * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison. | ||
363 | */ | ||
364 | static struct gc_candidate *get_candidate(struct super_block *sb) | ||
365 | { | ||
366 | struct logfs_super *super = logfs_super(sb); | ||
367 | int i, max_dist; | ||
368 | struct gc_candidate *cand = NULL, *this; | ||
369 | |||
370 | max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS); | ||
371 | |||
372 | for (i = max_dist; i >= 0; i--) { | ||
373 | this = first_in_list(&super->s_low_list[i]); | ||
374 | if (!this) | ||
375 | continue; | ||
376 | if (!cand) | ||
377 | cand = this; | ||
378 | if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid) | ||
379 | cand = this; | ||
380 | } | ||
381 | return cand; | ||
382 | } | ||
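A worked example of the rule above, with invented numbers (assuming LOGFS_MAX_OBJECTSIZE is on the order of a few KiB):

/*
 * Suppose the scan finds these best-of-list candidates:
 *
 *   dist 2 (lower level, checked first):  64 KiB valid
 *   dist 0 (higher level, checked later): 62 KiB valid
 *
 * 62 KiB + LOGFS_MAX_OBJECTSIZE is not <= 64 KiB, so the dist-2
 * candidate is kept; saving 2 KiB of copying is not worth moving up.
 * Had the dist-0 segment held only 32 KiB of valid data, it would
 * replace the earlier pick despite sitting closer to the root.
 */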
383 | |||
384 | static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand) | ||
385 | { | ||
386 | struct logfs_super *super = logfs_super(sb); | ||
387 | gc_level_t gc_level; | ||
388 | u32 cleaned, valid, segno, ec; | ||
389 | u8 dist; | ||
390 | |||
391 | if (!cand) { | ||
392 | log_gc("GC attempted, but no candidate found\n"); | ||
393 | return 0; | ||
394 | } | ||
395 | |||
396 | segno = cand->segno; | ||
397 | dist = cand->dist; | ||
398 | valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); | ||
399 | free_candidate(sb, cand); | ||
400 | log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n", | ||
401 | segno, (u64)segno << super->s_segshift, | ||
402 | dist, no_free_segments(sb), valid, | ||
403 | super->s_free_bytes); | ||
404 | cleaned = logfs_gc_segment(sb, segno, dist); | ||
405 | log_gc("GC segment #%02x complete - now %x valid\n", segno, | ||
406 | valid - cleaned); | ||
407 | BUG_ON(cleaned != valid); | ||
408 | return 1; | ||
409 | } | ||
410 | |||
411 | static int logfs_gc_once(struct super_block *sb) | ||
412 | { | ||
413 | struct gc_candidate *cand; | ||
414 | |||
415 | cand = get_candidate(sb); | ||
416 | if (cand) | ||
417 | remove_from_list(cand); | ||
418 | return __logfs_gc_once(sb, cand); | ||
419 | } | ||
420 | |||
421 | /* returns 1 if a wrap occurs, 0 otherwise */ | ||
422 | static int logfs_scan_some(struct super_block *sb) | ||
423 | { | ||
424 | struct logfs_super *super = logfs_super(sb); | ||
425 | u32 segno; | ||
426 | int i, ret = 0; | ||
427 | |||
428 | segno = super->s_sweeper; | ||
429 | for (i = SCAN_RATIO; i > 0; i--) { | ||
430 | segno++; | ||
431 | if (segno >= super->s_no_segs) { | ||
432 | segno = 0; | ||
433 | ret = 1; | ||
434 | /* Break out of the loop. We want to read a single | ||
435 | * block from the segment size on next invocation if | ||
436 | * SCAN_RATIO is set to match block size | ||
437 | */ | ||
438 | break; | ||
439 | } | ||
440 | |||
441 | scan_segment(sb, segno); | ||
442 | } | ||
443 | super->s_sweeper = segno; | ||
444 | return ret; | ||
445 | } | ||
446 | |||
447 | /* | ||
448 | * In principle, this function should loop forever, looking for GC candidates | ||
449 | * and moving data. LogFS is designed in such a way that this loop is | ||
450 | * guaranteed to terminate. | ||
451 | * | ||
452 | * Limiting the loop to some iterations serves purely to catch cases when | ||
453 | * these guarantees have failed. An actual endless loop is an obvious bug | ||
454 | * and should be reported as such. | ||
455 | */ | ||
456 | static void __logfs_gc_pass(struct super_block *sb, int target) | ||
457 | { | ||
458 | struct logfs_super *super = logfs_super(sb); | ||
459 | struct logfs_block *block; | ||
460 | int round, progress, last_progress = 0; | ||
461 | |||
462 | if (no_free_segments(sb) >= target && | ||
463 | super->s_no_object_aliases < MAX_OBJ_ALIASES) | ||
464 | return; | ||
465 | |||
466 | log_gc("__logfs_gc_pass(%x)\n", target); | ||
467 | for (round = 0; round < SCAN_ROUNDS; ) { | ||
468 | if (no_free_segments(sb) >= target) | ||
469 | goto write_alias; | ||
470 | |||
471 | /* Sync in-memory state with on-medium state in case they | ||
472 | * diverged */ | ||
473 | logfs_write_anchor(sb); | ||
474 | round += logfs_scan_some(sb); | ||
475 | if (no_free_segments(sb) >= target) | ||
476 | goto write_alias; | ||
477 | progress = logfs_gc_once(sb); | ||
478 | if (progress) | ||
479 | last_progress = round; | ||
480 | else if (round - last_progress > 2) | ||
481 | break; | ||
482 | continue; | ||
483 | |||
484 | /* | ||
485 | * The goto logic is nasty, I just don't know a better way to | ||
486 | * code it. GC is supposed to ensure two things: | ||
487 | * 1. Enough free segments are available. | ||
488 | * 2. The number of aliases is bounded. | ||
489 | * When 1. is achieved, we take a look at 2. and write back | ||
490 | * some alias-containing blocks, if necessary. However, after | ||
491 | * each such write we need to go back to 1., as writes can | ||
492 | * consume free segments. | ||
493 | */ | ||
494 | write_alias: | ||
495 | if (super->s_no_object_aliases < MAX_OBJ_ALIASES) | ||
496 | return; | ||
497 | if (list_empty(&super->s_object_alias)) { | ||
498 | /* All aliases are still in btree */ | ||
499 | return; | ||
500 | } | ||
501 | log_gc("Write back one alias\n"); | ||
502 | block = list_entry(super->s_object_alias.next, | ||
503 | struct logfs_block, alias_list); | ||
504 | block->ops->write_block(block); | ||
505 | /* | ||
506 | * To round off the nasty goto logic, we reset round here. It | ||
507 | * is a safety-net for GC not making any progress and is limited | ||
508 | * to something reasonably small. If we incremented it for every | ||
509 | * single alias, the loop could terminate rather quickly. | ||
510 | */ | ||
511 | round = 0; | ||
512 | } | ||
513 | LOGFS_BUG(sb); | ||
514 | } | ||
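Stripped of the goto, the loop above has the following shape. This is a simplified, self-contained restatement with stub helpers standing in for the real routines (scan, GC, alias write-back), not a drop-in replacement:

#include <stdbool.h>

static int free_segments;	/* stand-in for no_free_segments(sb) */
static int aliases;		/* stand-in for super->s_no_object_aliases */

static bool gc_once(void) { free_segments++; return true; }
static void write_one_alias(void) { aliases--; free_segments--; }

static void gc_pass_shape(int target, int max_aliases)
{
	for (;;) {
		/* Goal 1: enough free segments. */
		while (free_segments < target)
			if (!gc_once())
				return;	/* no progress: give up (LOGFS_BUG) */

		/* Goal 2: the number of aliases stays bounded. */
		if (aliases < max_aliases)
			return;		/* both goals met */

		/*
		 * Writing an alias back may itself consume free segments,
		 * so goal 1 has to be rechecked; hence the outer loop.
		 */
		write_one_alias();
	}
}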
515 | |||
516 | static int wl_ratelimit(struct super_block *sb, u64 *next_event) | ||
517 | { | ||
518 | struct logfs_super *super = logfs_super(sb); | ||
519 | |||
520 | if (*next_event < super->s_gec) { | ||
521 | *next_event = super->s_gec + WL_RATELIMIT; | ||
522 | return 0; | ||
523 | } | ||
524 | return 1; | ||
525 | } | ||
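Since s_gec is the global erase count, this gate lets through at most one wear-leveling event per WL_RATELIMIT ticks of that counter. A small standalone illustration (invented numbers):

/* Illustrative only: one event is allowed per WL_RATELIMIT ticks. */
#include <stdio.h>
#include <stdint.h>

#define WL_RATELIMIT 100

static int ratelimited(uint64_t gec, uint64_t *next_event)
{
	if (*next_event < gec) {
		*next_event = gec + WL_RATELIMIT;
		return 0;	/* allowed */
	}
	return 1;		/* too soon */
}

int main(void)
{
	uint64_t next = 0, gec;

	for (gec = 1; gec <= 250; gec++)
		if (!ratelimited(gec, &next))
			printf("wear-level event at gec %llu\n",
			       (unsigned long long)gec);
	/* prints events at gec 1, 102 and 203 */
	return 0;
}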
526 | |||
527 | static void logfs_wl_pass(struct super_block *sb) | ||
528 | { | ||
529 | struct logfs_super *super = logfs_super(sb); | ||
530 | struct gc_candidate *wl_cand, *free_cand; | ||
531 | |||
532 | if (wl_ratelimit(sb, &super->s_wl_gec_ostore)) | ||
533 | return; | ||
534 | |||
535 | wl_cand = first_in_list(&super->s_ec_list); | ||
536 | if (!wl_cand) | ||
537 | return; | ||
538 | free_cand = first_in_list(&super->s_free_list); | ||
539 | if (!free_cand) | ||
540 | return; | ||
541 | |||
542 | if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) { | ||
543 | remove_from_list(wl_cand); | ||
544 | __logfs_gc_once(sb, wl_cand); | ||
545 | } | ||
546 | } | ||
547 | |||
548 | /* | ||
549 | * The journal needs wear leveling as well. But moving the journal is an | ||
550 | * expensive operation so we try to avoid it as much as possible. And if we | ||
551 | * have to do it, we move the whole journal, not individual segments. | ||
552 | * | ||
553 | * Ratelimiting is not strictly necessary here, it mainly serves to avoid the | ||
554 | * calculations. First we check whether moving the journal would be a | ||
555 | * significant improvement. That means that a) the current journal segments | ||
556 | * have more wear than the future journal segments and b) the current journal | ||
557 | * segments have more wear than normal ostore segments. | ||
558 | * Rationale for b) is that we don't have to move the journal if it is aging | ||
559 | * less than the ostore, even if the reserve segments age even less (they are | ||
560 | * excluded from wear leveling, after all). | ||
561 | * Next we check that the superblocks have less wear than the journal. Since | ||
562 | * moving the journal requires writing the superblocks, we have to protect the | ||
563 | * superblocks even more than the journal. | ||
564 | * | ||
565 | * Also we double the acceptable wear difference, compared to ostore wear | ||
566 | * leveling. Journal data is read and rewritten rapidly, comparatively. So | ||
567 | * soft errors have much less time to accumulate and we allow the journal to | ||
568 | * be a bit worse than the ostore. | ||
569 | */ | ||
570 | static void logfs_journal_wl_pass(struct super_block *sb) | ||
571 | { | ||
572 | struct logfs_super *super = logfs_super(sb); | ||
573 | struct gc_candidate *cand; | ||
574 | u32 min_journal_ec = -1, max_reserve_ec = 0; | ||
575 | int i; | ||
576 | |||
577 | if (wl_ratelimit(sb, &super->s_wl_gec_journal)) | ||
578 | return; | ||
579 | |||
580 | if (super->s_reserve_list.count < super->s_no_journal_segs) { | ||
581 | /* Reserve is not full enough to move complete journal */ | ||
582 | return; | ||
583 | } | ||
584 | |||
585 | journal_for_each(i) | ||
586 | if (super->s_journal_seg[i]) | ||
587 | min_journal_ec = min(min_journal_ec, | ||
588 | super->s_journal_ec[i]); | ||
589 | cand = rb_entry(rb_first(&super->s_free_list.rb_tree), | ||
590 | struct gc_candidate, rb_node); | ||
591 | max_reserve_ec = cand->erase_count; | ||
592 | for (i = 0; i < 2; i++) { | ||
593 | struct logfs_segment_entry se; | ||
594 | u32 segno = seg_no(sb, super->s_sb_ofs[i]); | ||
595 | u32 ec; | ||
596 | |||
597 | logfs_get_segment_entry(sb, segno, &se); | ||
598 | ec = be32_to_cpu(se.ec_level) >> 4; | ||
599 | max_reserve_ec = max(max_reserve_ec, ec); | ||
600 | } | ||
601 | |||
602 | if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) { | ||
603 | do_logfs_journal_wl_pass(sb); | ||
604 | } | ||
605 | } | ||
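A worked example of the trigger condition above, with invented erase counts:

/*
 * min_journal_ec            = 2000  (least-worn current journal segment)
 * best free-list segment ec = 1000
 * superblock segment ecs    = 1050 and 1100
 *
 * max_reserve_ec = max(1000, 1050, 1100) = 1100
 * 2000 > 1100 + 2 * WL_DELTA = 1894, so do_logfs_journal_wl_pass()
 * would run, provided the reserve list can hold all journal segments.
 */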
606 | |||
607 | void logfs_gc_pass(struct super_block *sb) | ||
608 | { | ||
609 | struct logfs_super *super = logfs_super(sb); | ||
610 | |||
611 | //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex)); | ||
612 | /* Write journal before free space is getting saturated with dirty | ||
613 | * objects. | ||
614 | */ | ||
615 | if (super->s_dirty_used_bytes + super->s_dirty_free_bytes | ||
616 | + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes) | ||
617 | logfs_write_anchor(sb); | ||
618 | __logfs_gc_pass(sb, super->s_total_levels); | ||
619 | logfs_wl_pass(sb); | ||
620 | logfs_journal_wl_pass(sb); | ||
621 | } | ||
622 | |||
623 | static int check_area(struct super_block *sb, int i) | ||
624 | { | ||
625 | struct logfs_super *super = logfs_super(sb); | ||
626 | struct logfs_area *area = super->s_area[i]; | ||
627 | struct logfs_object_header oh; | ||
628 | u32 segno = area->a_segno; | ||
629 | u32 ofs = area->a_used_bytes; | ||
630 | __be32 crc; | ||
631 | int err; | ||
632 | |||
633 | if (!area->a_is_open) | ||
634 | return 0; | ||
635 | |||
636 | for (ofs = area->a_used_bytes; | ||
637 | ofs <= super->s_segsize - sizeof(oh); | ||
638 | ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) { | ||
639 | err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh); | ||
640 | if (err) | ||
641 | return err; | ||
642 | |||
643 | if (!memchr_inv(&oh, 0xff, sizeof(oh))) | ||
644 | break; | ||
645 | |||
646 | crc = logfs_crc32(&oh, sizeof(oh) - 4, 4); | ||
647 | if (crc != oh.crc) { | ||
648 | printk(KERN_INFO "interrupted header at %llx\n", | ||
649 | dev_ofs(sb, segno, ofs)); | ||
650 | return 0; | ||
651 | } | ||
652 | } | ||
653 | if (ofs != area->a_used_bytes) { | ||
654 | printk(KERN_INFO "%x bytes unaccounted data found at %llx\n", | ||
655 | ofs - area->a_used_bytes, | ||
656 | dev_ofs(sb, segno, area->a_used_bytes)); | ||
657 | area->a_used_bytes = ofs; | ||
658 | } | ||
659 | return 0; | ||
660 | } | ||
661 | |||
662 | int logfs_check_areas(struct super_block *sb) | ||
663 | { | ||
664 | int i, err; | ||
665 | |||
666 | for_each_area(i) { | ||
667 | err = check_area(sb, i); | ||
668 | if (err) | ||
669 | return err; | ||
670 | } | ||
671 | return 0; | ||
672 | } | ||
673 | |||
674 | static void logfs_init_candlist(struct candidate_list *list, int maxcount, | ||
675 | int sort_by_ec) | ||
676 | { | ||
677 | list->count = 0; | ||
678 | list->maxcount = maxcount; | ||
679 | list->sort_by_ec = sort_by_ec; | ||
680 | list->rb_tree = RB_ROOT; | ||
681 | } | ||
682 | |||
683 | int logfs_init_gc(struct super_block *sb) | ||
684 | { | ||
685 | struct logfs_super *super = logfs_super(sb); | ||
686 | int i; | ||
687 | |||
688 | btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool); | ||
689 | logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1); | ||
690 | logfs_init_candlist(&super->s_reserve_list, | ||
691 | super->s_bad_seg_reserve, 1); | ||
692 | for_each_area(i) | ||
693 | logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0); | ||
694 | logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1); | ||
695 | return 0; | ||
696 | } | ||
697 | |||
698 | static void logfs_cleanup_list(struct super_block *sb, | ||
699 | struct candidate_list *list) | ||
700 | { | ||
701 | struct gc_candidate *cand; | ||
702 | |||
703 | while (list->count) { | ||
704 | cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate, | ||
705 | rb_node); | ||
706 | remove_from_list(cand); | ||
707 | free_candidate(sb, cand); | ||
708 | } | ||
709 | BUG_ON(list->rb_tree.rb_node); | ||
710 | } | ||
711 | |||
712 | void logfs_cleanup_gc(struct super_block *sb) | ||
713 | { | ||
714 | struct logfs_super *super = logfs_super(sb); | ||
715 | int i; | ||
716 | |||
717 | if (!super->s_free_list.count) | ||
718 | return; | ||
719 | |||
720 | /* | ||
721 | * FIXME: The btree may still contain a single empty node. So we | ||
722 | * call the grim visitor to clean up that mess. Btree code should | ||
723 | * do it for us, really. | ||
724 | */ | ||
725 | btree_grim_visitor32(&super->s_cand_tree, 0, NULL); | ||
726 | logfs_cleanup_list(sb, &super->s_free_list); | ||
727 | logfs_cleanup_list(sb, &super->s_reserve_list); | ||
728 | for_each_area(i) | ||
729 | logfs_cleanup_list(sb, &super->s_low_list[i]); | ||
730 | logfs_cleanup_list(sb, &super->s_ec_list); | ||
731 | } | ||