diff options
-rw-r--r-- | fs/ubifs/gc.c | 428 |
1 files changed, 296 insertions, 132 deletions
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index a711d33b3d3e..f0f5f15d384e 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c | |||
@@ -47,7 +47,7 @@ | |||
47 | * have to waste large pieces of free space at the end of LEB B, because nodes | 47 | * have to waste large pieces of free space at the end of LEB B, because nodes |
48 | * from LEB A would not fit. And the worst situation is when all nodes are of | 48 | * from LEB A would not fit. And the worst situation is when all nodes are of |
49 | * maximum size. So dark watermark is the amount of free + dirty space in LEB | 49 | * maximum size. So dark watermark is the amount of free + dirty space in LEB |
50 | * which are guaranteed to be reclaimable. If LEB has less space, the GC migh | 50 | * which are guaranteed to be reclaimable. If LEB has less space, the GC might |
51 | * be unable to reclaim it. So, LEBs with free + dirty greater than dark | 51 | * be unable to reclaim it. So, LEBs with free + dirty greater than dark |
52 | * watermark are "good" LEBs from GC's point of few. The other LEBs are not so | 52 | * watermark are "good" LEBs from GC's point of few. The other LEBs are not so |
53 | * good, and GC takes extra care when moving them. | 53 | * good, and GC takes extra care when moving them. |
@@ -57,14 +57,6 @@ | |||
57 | #include "ubifs.h" | 57 | #include "ubifs.h" |
58 | 58 | ||
59 | /* | 59 | /* |
60 | * GC tries to optimize the way it fit nodes to available space, and it sorts | ||
61 | * nodes a little. The below constants are watermarks which define "large", | ||
62 | * "medium", and "small" nodes. | ||
63 | */ | ||
64 | #define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4) | ||
65 | #define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ | ||
66 | |||
67 | /* | ||
68 | * GC may need to move more than one LEB to make progress. The below constants | 60 | * GC may need to move more than one LEB to make progress. The below constants |
69 | * define "soft" and "hard" limits on the number of LEBs the garbage collector | 61 | * define "soft" and "hard" limits on the number of LEBs the garbage collector |
70 | * may move. | 62 | * may move. |
@@ -116,83 +108,222 @@ static int switch_gc_head(struct ubifs_info *c) | |||
116 | } | 108 | } |
117 | 109 | ||
118 | /** | 110 | /** |
119 | * joinup - bring data nodes for an inode together. | 111 | * list_sort - sort a list. |
120 | * @c: UBIFS file-system description object | 112 | * @priv: private data, passed to @cmp |
121 | * @sleb: describes scanned LEB | 113 | * @head: the list to sort |
122 | * @inum: inode number | 114 | * @cmp: the elements comparison function |
123 | * @blk: block number | ||
124 | * @data: list to which to add data nodes | ||
125 | * | 115 | * |
126 | * This function looks at the first few nodes in the scanned LEB @sleb and adds | 116 | * This function has been implemented by Mark J Roberts <mjr@znex.org>. It |
127 | * them to @data if they are data nodes from @inum and have a larger block | 117 | * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted |
128 | * number than @blk. This function returns %0 on success and a negative error | 118 | * in ascending order. |
129 | * code on failure. | 119 | * |
120 | * The comparison function @cmp is supposed to return a negative value if @a is | ||
121 | * than @b, and a positive value if @a is greater than @b. If @a and @b are | ||
122 | * equivalent, then it does not matter what this function returns. | ||
130 | */ | 123 | */ |
131 | static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum, | 124 | static void list_sort(void *priv, struct list_head *head, |
132 | unsigned int blk, struct list_head *data) | 125 | int (*cmp)(void *priv, struct list_head *a, |
126 | struct list_head *b)) | ||
133 | { | 127 | { |
134 | int err, cnt = 6, lnum = sleb->lnum, offs; | 128 | struct list_head *p, *q, *e, *list, *tail, *oldhead; |
135 | struct ubifs_scan_node *snod, *tmp; | 129 | int insize, nmerges, psize, qsize, i; |
136 | union ubifs_key *key; | 130 | |
131 | if (list_empty(head)) | ||
132 | return; | ||
133 | |||
134 | list = head->next; | ||
135 | list_del(head); | ||
136 | insize = 1; | ||
137 | for (;;) { | ||
138 | p = oldhead = list; | ||
139 | list = tail = NULL; | ||
140 | nmerges = 0; | ||
141 | |||
142 | while (p) { | ||
143 | nmerges++; | ||
144 | q = p; | ||
145 | psize = 0; | ||
146 | for (i = 0; i < insize; i++) { | ||
147 | psize++; | ||
148 | q = q->next == oldhead ? NULL : q->next; | ||
149 | if (!q) | ||
150 | break; | ||
151 | } | ||
137 | 152 | ||
138 | list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { | 153 | qsize = insize; |
139 | key = &snod->key; | 154 | while (psize > 0 || (qsize > 0 && q)) { |
140 | if (key_inum(c, key) == inum && | 155 | if (!psize) { |
141 | key_type(c, key) == UBIFS_DATA_KEY && | 156 | e = q; |
142 | key_block(c, key) > blk) { | 157 | q = q->next; |
143 | offs = snod->offs; | 158 | qsize--; |
144 | err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0); | 159 | if (q == oldhead) |
145 | if (err < 0) | 160 | q = NULL; |
146 | return err; | 161 | } else if (!qsize || !q) { |
147 | list_del(&snod->list); | 162 | e = p; |
148 | if (err) { | 163 | p = p->next; |
149 | list_add_tail(&snod->list, data); | 164 | psize--; |
150 | blk = key_block(c, key); | 165 | if (p == oldhead) |
151 | } else | 166 | p = NULL; |
152 | kfree(snod); | 167 | } else if (cmp(priv, p, q) <= 0) { |
153 | cnt = 6; | 168 | e = p; |
154 | } else if (--cnt == 0) | 169 | p = p->next; |
170 | psize--; | ||
171 | if (p == oldhead) | ||
172 | p = NULL; | ||
173 | } else { | ||
174 | e = q; | ||
175 | q = q->next; | ||
176 | qsize--; | ||
177 | if (q == oldhead) | ||
178 | q = NULL; | ||
179 | } | ||
180 | if (tail) | ||
181 | tail->next = e; | ||
182 | else | ||
183 | list = e; | ||
184 | e->prev = tail; | ||
185 | tail = e; | ||
186 | } | ||
187 | p = q; | ||
188 | } | ||
189 | |||
190 | tail->next = list; | ||
191 | list->prev = tail; | ||
192 | |||
193 | if (nmerges <= 1) | ||
155 | break; | 194 | break; |
195 | |||
196 | insize *= 2; | ||
156 | } | 197 | } |
157 | return 0; | 198 | |
199 | head->next = list; | ||
200 | head->prev = list->prev; | ||
201 | list->prev->next = head; | ||
202 | list->prev = head; | ||
158 | } | 203 | } |
159 | 204 | ||
160 | /** | 205 | /** |
161 | * move_nodes - move nodes. | 206 | * data_nodes_cmp - compare 2 data nodes. |
207 | * @priv: UBIFS file-system description object | ||
208 | * @a: first data node | ||
209 | * @a: second data node | ||
210 | * | ||
211 | * This function compares data nodes @a and @b. Returns %1 if @a has greater | ||
212 | * inode or block number, and %-1 otherwise. | ||
213 | */ | ||
214 | int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) | ||
215 | { | ||
216 | ino_t inuma, inumb; | ||
217 | struct ubifs_info *c = priv; | ||
218 | struct ubifs_scan_node *sa, *sb; | ||
219 | |||
220 | cond_resched(); | ||
221 | sa = list_entry(a, struct ubifs_scan_node, list); | ||
222 | sb = list_entry(b, struct ubifs_scan_node, list); | ||
223 | ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY); | ||
224 | ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY); | ||
225 | |||
226 | inuma = key_inum(c, &sa->key); | ||
227 | inumb = key_inum(c, &sb->key); | ||
228 | |||
229 | if (inuma == inumb) { | ||
230 | unsigned int blka = key_block(c, &sa->key); | ||
231 | unsigned int blkb = key_block(c, &sb->key); | ||
232 | |||
233 | if (blka <= blkb) | ||
234 | return -1; | ||
235 | } else if (inuma <= inumb) | ||
236 | return -1; | ||
237 | |||
238 | return 1; | ||
239 | } | ||
240 | |||
241 | /* | ||
242 | * nondata_nodes_cmp - compare 2 non-data nodes. | ||
243 | * @priv: UBIFS file-system description object | ||
244 | * @a: first node | ||
245 | * @a: second node | ||
246 | * | ||
247 | * This function compares nodes @a and @b. It makes sure that inode nodes go | ||
248 | * first and sorted by length in descending order. Directory entry nodes go | ||
249 | * after inode nodes and are sorted in ascending hash valuer order. | ||
250 | */ | ||
251 | int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) | ||
252 | { | ||
253 | int typea, typeb; | ||
254 | ino_t inuma, inumb; | ||
255 | struct ubifs_info *c = priv; | ||
256 | struct ubifs_scan_node *sa, *sb; | ||
257 | |||
258 | cond_resched(); | ||
259 | sa = list_entry(a, struct ubifs_scan_node, list); | ||
260 | sb = list_entry(b, struct ubifs_scan_node, list); | ||
261 | typea = key_type(c, &sa->key); | ||
262 | typeb = key_type(c, &sb->key); | ||
263 | ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY); | ||
264 | |||
265 | /* Inodes go before directory entries */ | ||
266 | if (typea == UBIFS_INO_KEY) { | ||
267 | if (typeb == UBIFS_INO_KEY) | ||
268 | return sb->len - sa->len; | ||
269 | return -1; | ||
270 | } | ||
271 | if (typeb == UBIFS_INO_KEY) | ||
272 | return 1; | ||
273 | |||
274 | ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY); | ||
275 | inuma = key_inum(c, &sa->key); | ||
276 | inumb = key_inum(c, &sb->key); | ||
277 | |||
278 | if (inuma == inumb) { | ||
279 | uint32_t hasha = key_hash(c, &sa->key); | ||
280 | uint32_t hashb = key_hash(c, &sb->key); | ||
281 | |||
282 | if (hasha <= hashb) | ||
283 | return -1; | ||
284 | } else if (inuma <= inumb) | ||
285 | return -1; | ||
286 | |||
287 | return 1; | ||
288 | } | ||
289 | |||
290 | /** | ||
291 | * sort_nodes - sort nodes for GC. | ||
162 | * @c: UBIFS file-system description object | 292 | * @c: UBIFS file-system description object |
163 | * @sleb: describes nodes to move | 293 | * @sleb: describes nodes to sort and contains the result on exit |
294 | * @nondata: contains non-data nodes on exit | ||
295 | * @min: minimum node size is returned here | ||
164 | * | 296 | * |
165 | * This function moves valid nodes from data LEB described by @sleb to the GC | 297 | * This function sorts the list of inodes to garbage collect. First of all, it |
166 | * journal head. The obsolete nodes are dropped. | 298 | * kills obsolete nodes and separates data and non-data nodes to the |
299 | * @sleb->nodes and @nondata lists correspondingly. | ||
300 | * | ||
301 | * Data nodes are then sorted in block number order - this is important for | ||
302 | * bulk-read; data nodes with lower inode number go before data nodes with | ||
303 | * higher inode number, and data nodes with lower block number go before data | ||
304 | * nodes with higher block number; | ||
167 | * | 305 | * |
168 | * When moving nodes we have to deal with classical bin-packing problem: the | 306 | * Non-data nodes are sorted as follows. |
169 | * space in the current GC journal head LEB and in @c->gc_lnum are the "bins", | 307 | * o First go inode nodes - they are sorted in descending length order. |
170 | * where the nodes in the @sleb->nodes list are the elements which should be | 308 | * o Then go directory entry nodes - they are sorted in hash order, which |
171 | * fit optimally to the bins. This function uses the "first fit decreasing" | 309 | * should supposedly optimize 'readdir()'. Direntry nodes with lower parent |
172 | * strategy, although it does not really sort the nodes but just split them on | 310 | * inode number go before direntry nodes with higher parent inode number, |
173 | * 3 classes - large, medium, and small, so they are roughly sorted. | 311 | * and direntry nodes with lower name hash values go before direntry nodes |
312 | * with higher name hash values. | ||
174 | * | 313 | * |
175 | * This function returns zero in case of success, %-EAGAIN if commit is | 314 | * This function returns zero in case of success and a negative error code in |
176 | * required, and other negative error codes in case of other failures. | 315 | * case of failure. |
177 | */ | 316 | */ |
178 | static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) | 317 | static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb, |
318 | struct list_head *nondata, int *min) | ||
179 | { | 319 | { |
180 | struct ubifs_scan_node *snod, *tmp; | 320 | struct ubifs_scan_node *snod, *tmp; |
181 | struct list_head data, large, medium, small; | ||
182 | struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; | ||
183 | int avail, err, min = INT_MAX; | ||
184 | unsigned int blk = 0; | ||
185 | ino_t inum = 0; | ||
186 | 321 | ||
187 | INIT_LIST_HEAD(&data); | 322 | *min = INT_MAX; |
188 | INIT_LIST_HEAD(&large); | ||
189 | INIT_LIST_HEAD(&medium); | ||
190 | INIT_LIST_HEAD(&small); | ||
191 | 323 | ||
192 | while (!list_empty(&sleb->nodes)) { | 324 | /* Separate data nodes and non-data nodes */ |
193 | struct list_head *lst = sleb->nodes.next; | 325 | list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { |
194 | 326 | int err; | |
195 | snod = list_entry(lst, struct ubifs_scan_node, list); | ||
196 | 327 | ||
197 | ubifs_assert(snod->type != UBIFS_IDX_NODE); | 328 | ubifs_assert(snod->type != UBIFS_IDX_NODE); |
198 | ubifs_assert(snod->type != UBIFS_REF_NODE); | 329 | ubifs_assert(snod->type != UBIFS_REF_NODE); |
@@ -201,53 +332,72 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) | |||
201 | err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, | 332 | err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, |
202 | snod->offs, 0); | 333 | snod->offs, 0); |
203 | if (err < 0) | 334 | if (err < 0) |
204 | goto out; | 335 | return err; |
205 | 336 | ||
206 | list_del(lst); | ||
207 | if (!err) { | 337 | if (!err) { |
208 | /* The node is obsolete, remove it from the list */ | 338 | /* The node is obsolete, remove it from the list */ |
339 | list_del(&snod->list); | ||
209 | kfree(snod); | 340 | kfree(snod); |
210 | continue; | 341 | continue; |
211 | } | 342 | } |
212 | 343 | ||
213 | /* | 344 | if (snod->len < *min) |
214 | * Sort the list of nodes so that data nodes go first, large | 345 | *min = snod->len; |
215 | * nodes go second, and small nodes go last. | 346 | |
216 | */ | 347 | if (key_type(c, &snod->key) != UBIFS_DATA_KEY) |
217 | if (key_type(c, &snod->key) == UBIFS_DATA_KEY) { | 348 | list_move_tail(&snod->list, nondata); |
218 | if (inum != key_inum(c, &snod->key)) { | ||
219 | if (inum) { | ||
220 | /* | ||
221 | * Try to move data nodes from the same | ||
222 | * inode together. | ||
223 | */ | ||
224 | err = joinup(c, sleb, inum, blk, &data); | ||
225 | if (err) | ||
226 | goto out; | ||
227 | } | ||
228 | inum = key_inum(c, &snod->key); | ||
229 | blk = key_block(c, &snod->key); | ||
230 | } | ||
231 | list_add_tail(lst, &data); | ||
232 | } else if (snod->len > MEDIUM_NODE_WM) | ||
233 | list_add_tail(lst, &large); | ||
234 | else if (snod->len > SMALL_NODE_WM) | ||
235 | list_add_tail(lst, &medium); | ||
236 | else | ||
237 | list_add_tail(lst, &small); | ||
238 | |||
239 | /* And find the smallest node */ | ||
240 | if (snod->len < min) | ||
241 | min = snod->len; | ||
242 | } | 349 | } |
243 | 350 | ||
244 | /* | 351 | /* Sort data and non-data nodes */ |
245 | * Join the tree lists so that we'd have one roughly sorted list | 352 | list_sort(c, &sleb->nodes, &data_nodes_cmp); |
246 | * ('large' will be the head of the joined list). | 353 | list_sort(c, nondata, &nondata_nodes_cmp); |
247 | */ | 354 | return 0; |
248 | list_splice(&data, &large); | 355 | } |
249 | list_splice(&medium, large.prev); | 356 | |
250 | list_splice(&small, large.prev); | 357 | /** |
358 | * move_node - move a node. | ||
359 | * @c: UBIFS file-system description object | ||
360 | * @sleb: describes the LEB to move nodes from | ||
361 | * @snod: the mode to move | ||
362 | * @wbuf: write-buffer to move node to | ||
363 | * | ||
364 | * This function moves node @snod to @wbuf, changes TNC correspondingly, and | ||
365 | * destroys @snod. Returns zero in case of success and a negative error code in | ||
366 | * case of failure. | ||
367 | */ | ||
368 | static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb, | ||
369 | struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf) | ||
370 | { | ||
371 | int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used; | ||
372 | |||
373 | cond_resched(); | ||
374 | err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len); | ||
375 | if (err) | ||
376 | return err; | ||
377 | |||
378 | err = ubifs_tnc_replace(c, &snod->key, sleb->lnum, | ||
379 | snod->offs, new_lnum, new_offs, | ||
380 | snod->len); | ||
381 | list_del(&snod->list); | ||
382 | kfree(snod); | ||
383 | return err; | ||
384 | } | ||
385 | |||
386 | /** | ||
387 | * move_nodes - move nodes. | ||
388 | * @c: UBIFS file-system description object | ||
389 | * @sleb: describes the LEB to move nodes from | ||
390 | * | ||
391 | * This function moves valid nodes from data LEB described by @sleb to the GC | ||
392 | * journal head. This function returns zero in case of success, %-EAGAIN if | ||
393 | * commit is required, and other negative error codes in case of other | ||
394 | * failures. | ||
395 | */ | ||
396 | static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) | ||
397 | { | ||
398 | int err, min; | ||
399 | LIST_HEAD(nondata); | ||
400 | struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; | ||
251 | 401 | ||
252 | if (wbuf->lnum == -1) { | 402 | if (wbuf->lnum == -1) { |
253 | /* | 403 | /* |
@@ -256,42 +406,59 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) | |||
256 | */ | 406 | */ |
257 | err = switch_gc_head(c); | 407 | err = switch_gc_head(c); |
258 | if (err) | 408 | if (err) |
259 | goto out; | 409 | return err; |
260 | } | 410 | } |
261 | 411 | ||
412 | err = sort_nodes(c, sleb, &nondata, &min); | ||
413 | if (err) | ||
414 | goto out; | ||
415 | |||
262 | /* Write nodes to their new location. Use the first-fit strategy */ | 416 | /* Write nodes to their new location. Use the first-fit strategy */ |
263 | while (1) { | 417 | while (1) { |
264 | avail = c->leb_size - wbuf->offs - wbuf->used; | 418 | int avail; |
265 | list_for_each_entry_safe(snod, tmp, &large, list) { | 419 | struct ubifs_scan_node *snod, *tmp; |
266 | int new_lnum, new_offs; | 420 | |
421 | /* Move data nodes */ | ||
422 | list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { | ||
423 | avail = c->leb_size - wbuf->offs - wbuf->used; | ||
424 | if (snod->len > avail) | ||
425 | /* | ||
426 | * Do not skip data nodes in order to optimize | ||
427 | * bulk-read. | ||
428 | */ | ||
429 | break; | ||
430 | |||
431 | err = move_node(c, sleb, snod, wbuf); | ||
432 | if (err) | ||
433 | goto out; | ||
434 | } | ||
267 | 435 | ||
436 | /* Move non-data nodes */ | ||
437 | list_for_each_entry_safe(snod, tmp, &nondata, list) { | ||
438 | avail = c->leb_size - wbuf->offs - wbuf->used; | ||
268 | if (avail < min) | 439 | if (avail < min) |
269 | break; | 440 | break; |
270 | 441 | ||
271 | if (snod->len > avail) | 442 | if (snod->len > avail) { |
272 | /* This node does not fit */ | 443 | /* |
444 | * Keep going only if this is an inode with | ||
445 | * some data. Otherwise stop and switch the GC | ||
446 | * head. IOW, we assume that data-less inode | ||
447 | * nodes and direntry nodes are roughly of the | ||
448 | * same size. | ||
449 | */ | ||
450 | if (key_type(c, &snod->key) == UBIFS_DENT_KEY || | ||
451 | snod->len == UBIFS_INO_NODE_SZ) | ||
452 | break; | ||
273 | continue; | 453 | continue; |
454 | } | ||
274 | 455 | ||
275 | cond_resched(); | 456 | err = move_node(c, sleb, snod, wbuf); |
276 | |||
277 | new_lnum = wbuf->lnum; | ||
278 | new_offs = wbuf->offs + wbuf->used; | ||
279 | err = ubifs_wbuf_write_nolock(wbuf, snod->node, | ||
280 | snod->len); | ||
281 | if (err) | 457 | if (err) |
282 | goto out; | 458 | goto out; |
283 | err = ubifs_tnc_replace(c, &snod->key, sleb->lnum, | ||
284 | snod->offs, new_lnum, new_offs, | ||
285 | snod->len); | ||
286 | if (err) | ||
287 | goto out; | ||
288 | |||
289 | avail = c->leb_size - wbuf->offs - wbuf->used; | ||
290 | list_del(&snod->list); | ||
291 | kfree(snod); | ||
292 | } | 459 | } |
293 | 460 | ||
294 | if (list_empty(&large)) | 461 | if (list_empty(&sleb->nodes) && list_empty(&nondata)) |
295 | break; | 462 | break; |
296 | 463 | ||
297 | /* | 464 | /* |
@@ -306,10 +473,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) | |||
306 | return 0; | 473 | return 0; |
307 | 474 | ||
308 | out: | 475 | out: |
309 | list_for_each_entry_safe(snod, tmp, &large, list) { | 476 | list_splice_tail(&nondata, &sleb->nodes); |
310 | list_del(&snod->list); | ||
311 | kfree(snod); | ||
312 | } | ||
313 | return err; | 477 | return err; |
314 | } | 478 | } |
315 | 479 | ||