1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
|
/*
* fs/ext4/extents_status.c
*
* Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
* Modified by
* Allison Henderson <achender@linux.vnet.ibm.com>
* Hugh Dickins <hughd@google.com>
* Zheng Liu <wenqing.lz@taobao.com>
*
* Ext4 extents status tree core functions.
*/
#include <linux/rbtree.h>
#include "ext4.h"
#include "extents_status.h"
#include "ext4_extents.h"
#include <trace/events/ext4.h>
/*
* According to previous discussion in Ext4 Developer Workshop, we
* will introduce a new structure called io tree to track all extent
* status in order to solve some problems that we have met
* (e.g. Reservation space warning), and provide extent-level locking.
* Delay extent tree is the first step to achieve this goal. It is
* original built by Yongqiang Yang. At that time it is called delay
* extent tree, whose goal is only track delay extent in memory to
* simplify the implementation of fiemap and bigalloc, and introduce
* lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called
* delay extent tree at the following comment. But for better
* understand what it does, it has been rename to extent status tree.
*
* Currently the first step has been done. All delay extents are
* tracked in the tree. It maintains the delay extent when a delay
* allocation is issued, and the delay extent is written out or
* invalidated. Therefore the implementation of fiemap and bigalloc
* are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
*
* The following comment describes the implemenmtation of extent
* status tree and future works.
*/
/*
* extents status tree implementation for ext4.
*
*
* ==========================================================================
* Extents status encompass delayed extents and extent locks
*
* 1. Why delayed extent implementation ?
*
* Without delayed extent, ext4 identifies a delayed extent by looking
* up page cache, this has several deficiencies - complicated, buggy,
* and inefficient code.
*
* FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need
* to know if a block or a range of blocks are belonged to a delayed
* extent.
*
* Let us have a look at how they do without delayed extents implementation.
* -- FIEMAP
* FIEMAP looks up page cache to identify delayed allocations from holes.
*
* -- SEEK_HOLE/DATA
* SEEK_HOLE/DATA has the same problem as FIEMAP.
*
* -- bigalloc
* bigalloc looks up page cache to figure out if a block is
* already under delayed allocation or not to determine whether
* quota reserving is needed for the cluster.
*
* -- punch hole
* punch hole looks up page cache to identify a delayed extent.
*
* -- writeout
* Writeout looks up whole page cache to see if a buffer is
* mapped, If there are not very many delayed buffers, then it is
* time comsuming.
*
* With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA,
* bigalloc and writeout can figure out if a block or a range of
* blocks is under delayed allocation(belonged to a delayed extent) or
* not by searching the delayed extent tree.
*
*
* ==========================================================================
* 2. ext4 delayed extents impelmentation
*
* -- delayed extent
* A delayed extent is a range of blocks which are contiguous
* logically and under delayed allocation. Unlike extent in
* ext4, delayed extent in ext4 is a in-memory struct, there is
* no corresponding on-disk data. There is no limit on length of
* delayed extent, so a delayed extent can contain as many blocks
* as they are contiguous logically.
*
* -- delayed extent tree
* Every inode has a delayed extent tree and all under delayed
* allocation blocks are added to the tree as delayed extents.
* Delayed extents in the tree are ordered by logical block no.
*
* -- operations on a delayed extent tree
* There are three operations on a delayed extent tree: find next
* delayed extent, adding a space(a range of blocks) and removing
* a space.
*
* -- race on a delayed extent tree
* Delayed extent tree is protected inode->i_es_lock.
*
*
* ==========================================================================
* 3. performance analysis
* -- overhead
* 1. There is a cache extent for write access, so if writes are
* not very random, adding space operaions are in O(1) time.
*
* -- gain
* 2. Code is much simpler, more readable, more maintainable and
* more efficient.
*
*
* ==========================================================================
* 4. TODO list
* -- Track all extent status
*
* -- Improve get block process
*
* -- Extent-level locking
*/
static struct kmem_cache *ext4_es_cachep;
int __init ext4_init_es(void)
{
ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
if (ext4_es_cachep == NULL)
return -ENOMEM;
return 0;
}
void ext4_exit_es(void)
{
if (ext4_es_cachep)
kmem_cache_destroy(ext4_es_cachep);
}
void ext4_es_init_tree(struct ext4_es_tree *tree)
{
tree->root = RB_ROOT;
tree->cache_es = NULL;
}
#ifdef ES_DEBUG__
static void ext4_es_print_tree(struct inode *inode)
{
struct ext4_es_tree *tree;
struct rb_node *node;
printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
tree = &EXT4_I(inode)->i_es_tree;
node = rb_first(&tree->root);
while (node) {
struct extent_status *es;
es = rb_entry(node, struct extent_status, rb_node);
printk(KERN_DEBUG " [%u/%u)", es->start, es->len);
node = rb_next(node);
}
printk(KERN_DEBUG "\n");
}
#else
#define ext4_es_print_tree(inode)
#endif
static inline ext4_lblk_t extent_status_end(struct extent_status *es)
{
BUG_ON(es->start + es->len < es->start);
return es->start + es->len - 1;
}
/*
* search through the tree for an delayed extent with a given offset. If
* it can't be found, try to find next extent.
*/
static struct extent_status *__es_tree_search(struct rb_root *root,
ext4_lblk_t offset)
{
struct rb_node *node = root->rb_node;
struct extent_status *es = NULL;
while (node) {
es = rb_entry(node, struct extent_status, rb_node);
if (offset < es->start)
node = node->rb_left;
else if (offset > extent_status_end(es))
node = node->rb_right;
else
return es;
}
if (es && offset < es->start)
return es;
if (es && offset > extent_status_end(es)) {
node = rb_next(&es->rb_node);
return node ? rb_entry(node, struct extent_status, rb_node) :
NULL;
}
return NULL;
}
/*
* ext4_es_find_extent: find the 1st delayed extent covering @es->start
* if it exists, otherwise, the next extent after @es->start.
*
* @inode: the inode which owns delayed extents
* @es: delayed extent that we found
*
* Returns the first block of the next extent after es, otherwise
* EXT_MAX_BLOCKS if no delay extent is found.
* Delayed extent is returned via @es.
*/
ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
{
struct ext4_es_tree *tree = NULL;
struct extent_status *es1 = NULL;
struct rb_node *node;
ext4_lblk_t ret = EXT_MAX_BLOCKS;
trace_ext4_es_find_extent_enter(inode, es->start);
read_lock(&EXT4_I(inode)->i_es_lock);
tree = &EXT4_I(inode)->i_es_tree;
/* find delay extent in cache firstly */
if (tree->cache_es) {
es1 = tree->cache_es;
if (in_range(es->start, es1->start, es1->len)) {
es_debug("%u cached by [%u/%u)\n",
es->start, es1->start, es1->len);
goto out;
}
}
es->len = 0;
es1 = __es_tree_search(&tree->root, es->start);
out:
if (es1) {
tree->cache_es = es1;
es->start = es1->start;
es->len = es1->len;
node = rb_next(&es1->rb_node);
if (node) {
es1 = rb_entry(node, struct extent_status, rb_node);
ret = es1->start;
}
}
read_unlock(&EXT4_I(inode)->i_es_lock);
trace_ext4_es_find_extent_exit(inode, es, ret);
return ret;
}
static struct extent_status *
ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
{
struct extent_status *es;
es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
if (es == NULL)
return NULL;
es->start = start;
es->len = len;
return es;
}
static void ext4_es_free_extent(struct extent_status *es)
{
kmem_cache_free(ext4_es_cachep, es);
}
static struct extent_status *
ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
{
struct extent_status *es1;
struct rb_node *node;
node = rb_prev(&es->rb_node);
if (!node)
return es;
es1 = rb_entry(node, struct extent_status, rb_node);
if (es->start == extent_status_end(es1) + 1) {
es1->len += es->len;
rb_erase(&es->rb_node, &tree->root);
ext4_es_free_extent(es);
es = es1;
}
return es;
}
static struct extent_status *
ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
{
struct extent_status *es1;
struct rb_node *node;
node = rb_next(&es->rb_node);
if (!node)
return es;
es1 = rb_entry(node, struct extent_status, rb_node);
if (es1->start == extent_status_end(es) + 1) {
es->len += es1->len;
rb_erase(node, &tree->root);
ext4_es_free_extent(es1);
}
return es;
}
static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset,
ext4_lblk_t len)
{
struct rb_node **p = &tree->root.rb_node;
struct rb_node *parent = NULL;
struct extent_status *es;
ext4_lblk_t end = offset + len - 1;
BUG_ON(end < offset);
es = tree->cache_es;
if (es && offset == (extent_status_end(es) + 1)) {
es_debug("cached by [%u/%u)\n", es->start, es->len);
es->len += len;
es = ext4_es_try_to_merge_right(tree, es);
goto out;
} else if (es && es->start == end + 1) {
es_debug("cached by [%u/%u)\n", es->start, es->len);
es->start = offset;
es->len += len;
es = ext4_es_try_to_merge_left(tree, es);
goto out;
} else if (es && es->start <= offset &&
end <= extent_status_end(es)) {
es_debug("cached by [%u/%u)\n", es->start, es->len);
goto out;
}
while (*p) {
parent = *p;
es = rb_entry(parent, struct extent_status, rb_node);
if (offset < es->start) {
if (es->start == end + 1) {
es->start = offset;
es->len += len;
es = ext4_es_try_to_merge_left(tree, es);
goto out;
}
p = &(*p)->rb_left;
} else if (offset > extent_status_end(es)) {
if (offset == extent_status_end(es) + 1) {
es->len += len;
es = ext4_es_try_to_merge_right(tree, es);
goto out;
}
p = &(*p)->rb_right;
} else {
if (extent_status_end(es) <= end)
es->len = offset - es->start + len;
goto out;
}
}
es = ext4_es_alloc_extent(offset, len);
if (!es)
return -ENOMEM;
rb_link_node(&es->rb_node, parent, p);
rb_insert_color(&es->rb_node, &tree->root);
out:
tree->cache_es = es;
return 0;
}
/*
* ext4_es_insert_extent() adds a space to a delayed extent tree.
* Caller holds inode->i_es_lock.
*
* ext4_es_insert_extent is called by ext4_da_write_begin and
* ext4_es_remove_extent.
*
* Return 0 on success, error code on failure.
*/
int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset,
ext4_lblk_t len)
{
struct ext4_es_tree *tree;
int err = 0;
trace_ext4_es_insert_extent(inode, offset, len);
es_debug("add [%u/%u) to extent status tree of inode %lu\n",
offset, len, inode->i_ino);
write_lock(&EXT4_I(inode)->i_es_lock);
tree = &EXT4_I(inode)->i_es_tree;
err = __es_insert_extent(tree, offset, len);
write_unlock(&EXT4_I(inode)->i_es_lock);
ext4_es_print_tree(inode);
return err;
}
/*
* ext4_es_remove_extent() removes a space from a delayed extent tree.
* Caller holds inode->i_es_lock.
*
* Return 0 on success, error code on failure.
*/
int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
ext4_lblk_t len)
{
struct rb_node *node;
struct ext4_es_tree *tree;
struct extent_status *es;
struct extent_status orig_es;
ext4_lblk_t len1, len2, end;
int err = 0;
trace_ext4_es_remove_extent(inode, offset, len);
es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
offset, len, inode->i_ino);
end = offset + len - 1;
BUG_ON(end < offset);
write_lock(&EXT4_I(inode)->i_es_lock);
tree = &EXT4_I(inode)->i_es_tree;
es = __es_tree_search(&tree->root, offset);
if (!es)
goto out;
if (es->start > end)
goto out;
/* Simply invalidate cache_es. */
tree->cache_es = NULL;
orig_es.start = es->start;
orig_es.len = es->len;
len1 = offset > es->start ? offset - es->start : 0;
len2 = extent_status_end(es) > end ?
extent_status_end(es) - end : 0;
if (len1 > 0)
es->len = len1;
if (len2 > 0) {
if (len1 > 0) {
err = __es_insert_extent(tree, end + 1, len2);
if (err) {
es->start = orig_es.start;
es->len = orig_es.len;
goto out;
}
} else {
es->start = end + 1;
es->len = len2;
}
goto out;
}
if (len1 > 0) {
node = rb_next(&es->rb_node);
if (node)
es = rb_entry(node, struct extent_status, rb_node);
else
es = NULL;
}
while (es && extent_status_end(es) <= end) {
node = rb_next(&es->rb_node);
rb_erase(&es->rb_node, &tree->root);
ext4_es_free_extent(es);
if (!node) {
es = NULL;
break;
}
es = rb_entry(node, struct extent_status, rb_node);
}
if (es && es->start < end + 1) {
len1 = extent_status_end(es) - end;
es->start = end + 1;
es->len = len1;
}
out:
write_unlock(&EXT4_I(inode)->i_es_lock);
ext4_es_print_tree(inode);
return err;
}
|