aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorZheng Liu <wenqing.lz@taobao.com>2013-02-18 00:26:51 -0500
committerTheodore Ts'o <tytso@mit.edu>2013-02-18 00:26:51 -0500
commit06b0c886214a223dde7b21cbfc3008fd20a8ce16 (patch)
treec1d3c51622e2a885446e8d617b953405412fc3aa
parent0f70b40613ee14b0cadafeb461034cff81b4419a (diff)
ext4: refine extent status tree
This commit refines the extent status tree code. 1) A prefix 'es_' is added to to the extent status tree structure members. 2) Refactored es_remove_extent() so that __es_remove_extent() can be used by es_insert_extent() to remove the old extent entry(-ies) before inserting a new one. 3) Rename extent_status_end() to ext4_es_end() 4) ext4_es_can_be_merged() is define to check whether two extents can be merged or not. 5) Update and clarified comments. Signed-off-by: Zheng Liu <wenqing.lz@taobao.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Reviewed-by: Jan Kara <jack@suse.cz>
-rw-r--r--fs/ext4/extents.c21
-rw-r--r--fs/ext4/extents_status.c322
-rw-r--r--fs/ext4/extents_status.h8
-rw-r--r--fs/ext4/file.c12
-rw-r--r--include/trace/events/ext4.h40
5 files changed, 221 insertions, 182 deletions
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index b6b54d658dc2..37f94a751ad7 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3528,13 +3528,14 @@ static int ext4_find_delalloc_range(struct inode *inode,
3528{ 3528{
3529 struct extent_status es; 3529 struct extent_status es;
3530 3530
3531 es.start = lblk_start; 3531 es.es_lblk = lblk_start;
3532 ext4_es_find_extent(inode, &es); 3532 (void)ext4_es_find_extent(inode, &es);
3533 if (es.len == 0) 3533 if (es.es_len == 0)
3534 return 0; /* there is no delay extent in this tree */ 3534 return 0; /* there is no delay extent in this tree */
3535 else if (es.start <= lblk_start && lblk_start < es.start + es.len) 3535 else if (es.es_lblk <= lblk_start &&
3536 lblk_start < es.es_lblk + es.es_len)
3536 return 1; 3537 return 1;
3537 else if (lblk_start <= es.start && es.start <= lblk_end) 3538 else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
3538 return 1; 3539 return 1;
3539 else 3540 else
3540 return 0; 3541 return 0;
@@ -4569,7 +4570,7 @@ static int ext4_find_delayed_extent(struct inode *inode,
4569 struct extent_status es; 4570 struct extent_status es;
4570 ext4_lblk_t next_del; 4571 ext4_lblk_t next_del;
4571 4572
4572 es.start = newex->ec_block; 4573 es.es_lblk = newex->ec_block;
4573 next_del = ext4_es_find_extent(inode, &es); 4574 next_del = ext4_es_find_extent(inode, &es);
4574 4575
4575 if (newex->ec_start == 0) { 4576 if (newex->ec_start == 0) {
@@ -4577,18 +4578,18 @@ static int ext4_find_delayed_extent(struct inode *inode,
4577 * No extent in extent-tree contains block @newex->ec_start, 4578 * No extent in extent-tree contains block @newex->ec_start,
4578 * then the block may stay in 1)a hole or 2)delayed-extent. 4579 * then the block may stay in 1)a hole or 2)delayed-extent.
4579 */ 4580 */
4580 if (es.len == 0) 4581 if (es.es_len == 0)
4581 /* A hole found. */ 4582 /* A hole found. */
4582 return 0; 4583 return 0;
4583 4584
4584 if (es.start > newex->ec_block) { 4585 if (es.es_lblk > newex->ec_block) {
4585 /* A hole found. */ 4586 /* A hole found. */
4586 newex->ec_len = min(es.start - newex->ec_block, 4587 newex->ec_len = min(es.es_lblk - newex->ec_block,
4587 newex->ec_len); 4588 newex->ec_len);
4588 return 0; 4589 return 0;
4589 } 4590 }
4590 4591
4591 newex->ec_len = es.start + es.len - newex->ec_block; 4592 newex->ec_len = es.es_lblk + es.es_len - newex->ec_block;
4592 } 4593 }
4593 4594
4594 return next_del; 4595 return next_del;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 564d981a2fcc..c9921cf4f817 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -23,40 +23,53 @@
23 * (e.g. Reservation space warning), and provide extent-level locking. 23 * (e.g. Reservation space warning), and provide extent-level locking.
24 * Delay extent tree is the first step to achieve this goal. It is 24 * Delay extent tree is the first step to achieve this goal. It is
25 * original built by Yongqiang Yang. At that time it is called delay 25 * original built by Yongqiang Yang. At that time it is called delay
26 * extent tree, whose goal is only track delay extent in memory to 26 * extent tree, whose goal is only track delayed extents in memory to
27 * simplify the implementation of fiemap and bigalloc, and introduce 27 * simplify the implementation of fiemap and bigalloc, and introduce
28 * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called 28 * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called
29 * delay extent tree at the following comment. But for better 29 * delay extent tree at the first commit. But for better understand
30 * understand what it does, it has been rename to extent status tree. 30 * what it does, it has been rename to extent status tree.
31 * 31 *
32 * Currently the first step has been done. All delay extents are 32 * Step1:
33 * tracked in the tree. It maintains the delay extent when a delay 33 * Currently the first step has been done. All delayed extents are
34 * allocation is issued, and the delay extent is written out or 34 * tracked in the tree. It maintains the delayed extent when a delayed
35 * allocation is issued, and the delayed extent is written out or
35 * invalidated. Therefore the implementation of fiemap and bigalloc 36 * invalidated. Therefore the implementation of fiemap and bigalloc
36 * are simplified, and SEEK_DATA/SEEK_HOLE are introduced. 37 * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
37 * 38 *
38 * The following comment describes the implemenmtation of extent 39 * The following comment describes the implemenmtation of extent
39 * status tree and future works. 40 * status tree and future works.
41 *
42 * Step2:
43 * In this step all extent status are tracked by extent status tree.
44 * Thus, we can first try to lookup a block mapping in this tree before
45 * finding it in extent tree. Hence, single extent cache can be removed
46 * because extent status tree can do a better job. Extents in status
47 * tree are loaded on-demand. Therefore, the extent status tree may not
48 * contain all of the extents in a file. Meanwhile we define a shrinker
49 * to reclaim memory from extent status tree because fragmented extent
50 * tree will make status tree cost too much memory. written/unwritten/-
51 * hole extents in the tree will be reclaimed by this shrinker when we
52 * are under high memory pressure. Delayed extents will not be
53 * reclimed because fiemap, bigalloc, and seek_data/hole need it.
40 */ 54 */
41 55
42/* 56/*
43 * extents status tree implementation for ext4. 57 * Extent status tree implementation for ext4.
44 * 58 *
45 * 59 *
46 * ========================================================================== 60 * ==========================================================================
47 * Extents status encompass delayed extents and extent locks 61 * Extent status tree tracks all extent status.
48 * 62 *
49 * 1. Why delayed extent implementation ? 63 * 1. Why we need to implement extent status tree?
50 * 64 *
51 * Without delayed extent, ext4 identifies a delayed extent by looking 65 * Without extent status tree, ext4 identifies a delayed extent by looking
52 * up page cache, this has several deficiencies - complicated, buggy, 66 * up page cache, this has several deficiencies - complicated, buggy,
53 * and inefficient code. 67 * and inefficient code.
54 * 68 *
55 * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need 69 * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a
56 * to know if a block or a range of blocks are belonged to a delayed 70 * block or a range of blocks are belonged to a delayed extent.
57 * extent.
58 * 71 *
59 * Let us have a look at how they do without delayed extents implementation. 72 * Let us have a look at how they do without extent status tree.
60 * -- FIEMAP 73 * -- FIEMAP
61 * FIEMAP looks up page cache to identify delayed allocations from holes. 74 * FIEMAP looks up page cache to identify delayed allocations from holes.
62 * 75 *
@@ -68,47 +81,48 @@
68 * already under delayed allocation or not to determine whether 81 * already under delayed allocation or not to determine whether
69 * quota reserving is needed for the cluster. 82 * quota reserving is needed for the cluster.
70 * 83 *
71 * -- punch hole
72 * punch hole looks up page cache to identify a delayed extent.
73 *
74 * -- writeout 84 * -- writeout
75 * Writeout looks up whole page cache to see if a buffer is 85 * Writeout looks up whole page cache to see if a buffer is
76 * mapped, If there are not very many delayed buffers, then it is 86 * mapped, If there are not very many delayed buffers, then it is
77 * time comsuming. 87 * time comsuming.
78 * 88 *
79 * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA, 89 * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
80 * bigalloc and writeout can figure out if a block or a range of 90 * bigalloc and writeout can figure out if a block or a range of
81 * blocks is under delayed allocation(belonged to a delayed extent) or 91 * blocks is under delayed allocation(belonged to a delayed extent) or
82 * not by searching the delayed extent tree. 92 * not by searching the extent tree.
83 * 93 *
84 * 94 *
85 * ========================================================================== 95 * ==========================================================================
86 * 2. ext4 delayed extents impelmentation 96 * 2. Ext4 extent status tree impelmentation
97 *
98 * -- extent
99 * A extent is a range of blocks which are contiguous logically and
100 * physically. Unlike extent in extent tree, this extent in ext4 is
101 * a in-memory struct, there is no corresponding on-disk data. There
102 * is no limit on length of extent, so an extent can contain as many
103 * blocks as they are contiguous logically and physically.
87 * 104 *
88 * -- delayed extent 105 * -- extent status tree
89 * A delayed extent is a range of blocks which are contiguous 106 * Every inode has an extent status tree and all allocation blocks
90 * logically and under delayed allocation. Unlike extent in 107 * are added to the tree with different status. The extent in the
91 * ext4, delayed extent in ext4 is a in-memory struct, there is 108 * tree are ordered by logical block no.
92 * no corresponding on-disk data. There is no limit on length of
93 * delayed extent, so a delayed extent can contain as many blocks
94 * as they are contiguous logically.
95 * 109 *
96 * -- delayed extent tree 110 * -- operations on a extent status tree
97 * Every inode has a delayed extent tree and all under delayed 111 * There are three important operations on a delayed extent tree: find
98 * allocation blocks are added to the tree as delayed extents. 112 * next extent, adding a extent(a range of blocks) and removing a extent.
99 * Delayed extents in the tree are ordered by logical block no.
100 * 113 *
101 * -- operations on a delayed extent tree 114 * -- race on a extent status tree
102 * There are three operations on a delayed extent tree: find next 115 * Extent status tree is protected by inode->i_es_lock.
103 * delayed extent, adding a space(a range of blocks) and removing
104 * a space.
105 * 116 *
106 * -- race on a delayed extent tree 117 * -- memory consumption
107 * Delayed extent tree is protected inode->i_es_lock. 118 * Fragmented extent tree will make extent status tree cost too much
119 * memory. Hence, we will reclaim written/unwritten/hole extents from
120 * the tree under a heavy memory pressure.
108 * 121 *
109 * 122 *
110 * ========================================================================== 123 * ==========================================================================
111 * 3. performance analysis 124 * 3. Performance analysis
125 *
112 * -- overhead 126 * -- overhead
113 * 1. There is a cache extent for write access, so if writes are 127 * 1. There is a cache extent for write access, so if writes are
114 * not very random, adding space operaions are in O(1) time. 128 * not very random, adding space operaions are in O(1) time.
@@ -120,15 +134,19 @@
120 * 134 *
121 * ========================================================================== 135 * ==========================================================================
122 * 4. TODO list 136 * 4. TODO list
123 * -- Track all extent status
124 * 137 *
125 * -- Improve get block process 138 * -- Refactor delayed space reservation
126 * 139 *
127 * -- Extent-level locking 140 * -- Extent-level locking
128 */ 141 */
129 142
130static struct kmem_cache *ext4_es_cachep; 143static struct kmem_cache *ext4_es_cachep;
131 144
145static int __es_insert_extent(struct ext4_es_tree *tree,
146 struct extent_status *newes);
147static int __es_remove_extent(struct ext4_es_tree *tree, ext4_lblk_t lblk,
148 ext4_lblk_t end);
149
132int __init ext4_init_es(void) 150int __init ext4_init_es(void)
133{ 151{
134 ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT); 152 ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
@@ -161,7 +179,7 @@ static void ext4_es_print_tree(struct inode *inode)
161 while (node) { 179 while (node) {
162 struct extent_status *es; 180 struct extent_status *es;
163 es = rb_entry(node, struct extent_status, rb_node); 181 es = rb_entry(node, struct extent_status, rb_node);
164 printk(KERN_DEBUG " [%u/%u)", es->start, es->len); 182 printk(KERN_DEBUG " [%u/%u)", es->es_lblk, es->es_len);
165 node = rb_next(node); 183 node = rb_next(node);
166 } 184 }
167 printk(KERN_DEBUG "\n"); 185 printk(KERN_DEBUG "\n");
@@ -170,10 +188,10 @@ static void ext4_es_print_tree(struct inode *inode)
170#define ext4_es_print_tree(inode) 188#define ext4_es_print_tree(inode)
171#endif 189#endif
172 190
173static inline ext4_lblk_t extent_status_end(struct extent_status *es) 191static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
174{ 192{
175 BUG_ON(es->start + es->len < es->start); 193 BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
176 return es->start + es->len - 1; 194 return es->es_lblk + es->es_len - 1;
177} 195}
178 196
179/* 197/*
@@ -181,25 +199,25 @@ static inline ext4_lblk_t extent_status_end(struct extent_status *es)
181 * it can't be found, try to find next extent. 199 * it can't be found, try to find next extent.
182 */ 200 */
183static struct extent_status *__es_tree_search(struct rb_root *root, 201static struct extent_status *__es_tree_search(struct rb_root *root,
184 ext4_lblk_t offset) 202 ext4_lblk_t lblk)
185{ 203{
186 struct rb_node *node = root->rb_node; 204 struct rb_node *node = root->rb_node;
187 struct extent_status *es = NULL; 205 struct extent_status *es = NULL;
188 206
189 while (node) { 207 while (node) {
190 es = rb_entry(node, struct extent_status, rb_node); 208 es = rb_entry(node, struct extent_status, rb_node);
191 if (offset < es->start) 209 if (lblk < es->es_lblk)
192 node = node->rb_left; 210 node = node->rb_left;
193 else if (offset > extent_status_end(es)) 211 else if (lblk > ext4_es_end(es))
194 node = node->rb_right; 212 node = node->rb_right;
195 else 213 else
196 return es; 214 return es;
197 } 215 }
198 216
199 if (es && offset < es->start) 217 if (es && lblk < es->es_lblk)
200 return es; 218 return es;
201 219
202 if (es && offset > extent_status_end(es)) { 220 if (es && lblk > ext4_es_end(es)) {
203 node = rb_next(&es->rb_node); 221 node = rb_next(&es->rb_node);
204 return node ? rb_entry(node, struct extent_status, rb_node) : 222 return node ? rb_entry(node, struct extent_status, rb_node) :
205 NULL; 223 NULL;
@@ -209,8 +227,8 @@ static struct extent_status *__es_tree_search(struct rb_root *root,
209} 227}
210 228
211/* 229/*
212 * ext4_es_find_extent: find the 1st delayed extent covering @es->start 230 * ext4_es_find_extent: find the 1st delayed extent covering @es->lblk
213 * if it exists, otherwise, the next extent after @es->start. 231 * if it exists, otherwise, the next extent after @es->lblk.
214 * 232 *
215 * @inode: the inode which owns delayed extents 233 * @inode: the inode which owns delayed extents
216 * @es: delayed extent that we found 234 * @es: delayed extent that we found
@@ -226,7 +244,7 @@ ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
226 struct rb_node *node; 244 struct rb_node *node;
227 ext4_lblk_t ret = EXT_MAX_BLOCKS; 245 ext4_lblk_t ret = EXT_MAX_BLOCKS;
228 246
229 trace_ext4_es_find_extent_enter(inode, es->start); 247 trace_ext4_es_find_extent_enter(inode, es->es_lblk);
230 248
231 read_lock(&EXT4_I(inode)->i_es_lock); 249 read_lock(&EXT4_I(inode)->i_es_lock);
232 tree = &EXT4_I(inode)->i_es_tree; 250 tree = &EXT4_I(inode)->i_es_tree;
@@ -234,25 +252,25 @@ ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
234 /* find delay extent in cache firstly */ 252 /* find delay extent in cache firstly */
235 if (tree->cache_es) { 253 if (tree->cache_es) {
236 es1 = tree->cache_es; 254 es1 = tree->cache_es;
237 if (in_range(es->start, es1->start, es1->len)) { 255 if (in_range(es->es_lblk, es1->es_lblk, es1->es_len)) {
238 es_debug("%u cached by [%u/%u)\n", 256 es_debug("%u cached by [%u/%u)\n",
239 es->start, es1->start, es1->len); 257 es->es_lblk, es1->es_lblk, es1->es_len);
240 goto out; 258 goto out;
241 } 259 }
242 } 260 }
243 261
244 es->len = 0; 262 es->es_len = 0;
245 es1 = __es_tree_search(&tree->root, es->start); 263 es1 = __es_tree_search(&tree->root, es->es_lblk);
246 264
247out: 265out:
248 if (es1) { 266 if (es1) {
249 tree->cache_es = es1; 267 tree->cache_es = es1;
250 es->start = es1->start; 268 es->es_lblk = es1->es_lblk;
251 es->len = es1->len; 269 es->es_len = es1->es_len;
252 node = rb_next(&es1->rb_node); 270 node = rb_next(&es1->rb_node);
253 if (node) { 271 if (node) {
254 es1 = rb_entry(node, struct extent_status, rb_node); 272 es1 = rb_entry(node, struct extent_status, rb_node);
255 ret = es1->start; 273 ret = es1->es_lblk;
256 } 274 }
257 } 275 }
258 276
@@ -263,14 +281,14 @@ out:
263} 281}
264 282
265static struct extent_status * 283static struct extent_status *
266ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len) 284ext4_es_alloc_extent(ext4_lblk_t lblk, ext4_lblk_t len)
267{ 285{
268 struct extent_status *es; 286 struct extent_status *es;
269 es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); 287 es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
270 if (es == NULL) 288 if (es == NULL)
271 return NULL; 289 return NULL;
272 es->start = start; 290 es->es_lblk = lblk;
273 es->len = len; 291 es->es_len = len;
274 return es; 292 return es;
275} 293}
276 294
@@ -279,6 +297,20 @@ static void ext4_es_free_extent(struct extent_status *es)
279 kmem_cache_free(ext4_es_cachep, es); 297 kmem_cache_free(ext4_es_cachep, es);
280} 298}
281 299
300/*
301 * Check whether or not two extents can be merged
302 * Condition:
303 * - logical block number is contiguous
304 */
305static int ext4_es_can_be_merged(struct extent_status *es1,
306 struct extent_status *es2)
307{
308 if (es1->es_lblk + es1->es_len != es2->es_lblk)
309 return 0;
310
311 return 1;
312}
313
282static struct extent_status * 314static struct extent_status *
283ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es) 315ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
284{ 316{
@@ -290,8 +322,8 @@ ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
290 return es; 322 return es;
291 323
292 es1 = rb_entry(node, struct extent_status, rb_node); 324 es1 = rb_entry(node, struct extent_status, rb_node);
293 if (es->start == extent_status_end(es1) + 1) { 325 if (ext4_es_can_be_merged(es1, es)) {
294 es1->len += es->len; 326 es1->es_len += es->es_len;
295 rb_erase(&es->rb_node, &tree->root); 327 rb_erase(&es->rb_node, &tree->root);
296 ext4_es_free_extent(es); 328 ext4_es_free_extent(es);
297 es = es1; 329 es = es1;
@@ -311,8 +343,8 @@ ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
311 return es; 343 return es;
312 344
313 es1 = rb_entry(node, struct extent_status, rb_node); 345 es1 = rb_entry(node, struct extent_status, rb_node);
314 if (es1->start == extent_status_end(es) + 1) { 346 if (ext4_es_can_be_merged(es, es1)) {
315 es->len += es1->len; 347 es->es_len += es1->es_len;
316 rb_erase(node, &tree->root); 348 rb_erase(node, &tree->root);
317 ext4_es_free_extent(es1); 349 ext4_es_free_extent(es1);
318 } 350 }
@@ -320,60 +352,43 @@ ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
320 return es; 352 return es;
321} 353}
322 354
323static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset, 355static int __es_insert_extent(struct ext4_es_tree *tree,
324 ext4_lblk_t len) 356 struct extent_status *newes)
325{ 357{
326 struct rb_node **p = &tree->root.rb_node; 358 struct rb_node **p = &tree->root.rb_node;
327 struct rb_node *parent = NULL; 359 struct rb_node *parent = NULL;
328 struct extent_status *es; 360 struct extent_status *es;
329 ext4_lblk_t end = offset + len - 1;
330
331 BUG_ON(end < offset);
332 es = tree->cache_es;
333 if (es && offset == (extent_status_end(es) + 1)) {
334 es_debug("cached by [%u/%u)\n", es->start, es->len);
335 es->len += len;
336 es = ext4_es_try_to_merge_right(tree, es);
337 goto out;
338 } else if (es && es->start == end + 1) {
339 es_debug("cached by [%u/%u)\n", es->start, es->len);
340 es->start = offset;
341 es->len += len;
342 es = ext4_es_try_to_merge_left(tree, es);
343 goto out;
344 } else if (es && es->start <= offset &&
345 end <= extent_status_end(es)) {
346 es_debug("cached by [%u/%u)\n", es->start, es->len);
347 goto out;
348 }
349 361
350 while (*p) { 362 while (*p) {
351 parent = *p; 363 parent = *p;
352 es = rb_entry(parent, struct extent_status, rb_node); 364 es = rb_entry(parent, struct extent_status, rb_node);
353 365
354 if (offset < es->start) { 366 if (newes->es_lblk < es->es_lblk) {
355 if (es->start == end + 1) { 367 if (ext4_es_can_be_merged(newes, es)) {
356 es->start = offset; 368 /*
357 es->len += len; 369 * Here we can modify es_lblk directly
370 * because it isn't overlapped.
371 */
372 es->es_lblk = newes->es_lblk;
373 es->es_len += newes->es_len;
358 es = ext4_es_try_to_merge_left(tree, es); 374 es = ext4_es_try_to_merge_left(tree, es);
359 goto out; 375 goto out;
360 } 376 }
361 p = &(*p)->rb_left; 377 p = &(*p)->rb_left;
362 } else if (offset > extent_status_end(es)) { 378 } else if (newes->es_lblk > ext4_es_end(es)) {
363 if (offset == extent_status_end(es) + 1) { 379 if (ext4_es_can_be_merged(es, newes)) {
364 es->len += len; 380 es->es_len += newes->es_len;
365 es = ext4_es_try_to_merge_right(tree, es); 381 es = ext4_es_try_to_merge_right(tree, es);
366 goto out; 382 goto out;
367 } 383 }
368 p = &(*p)->rb_right; 384 p = &(*p)->rb_right;
369 } else { 385 } else {
370 if (extent_status_end(es) <= end) 386 BUG_ON(1);
371 es->len = offset - es->start + len; 387 return -EINVAL;
372 goto out;
373 } 388 }
374 } 389 }
375 390
376 es = ext4_es_alloc_extent(offset, len); 391 es = ext4_es_alloc_extent(newes->es_lblk, newes->es_len);
377 if (!es) 392 if (!es)
378 return -ENOMEM; 393 return -ENOMEM;
379 rb_link_node(&es->rb_node, parent, p); 394 rb_link_node(&es->rb_node, parent, p);
@@ -385,27 +400,38 @@ out:
385} 400}
386 401
387/* 402/*
388 * ext4_es_insert_extent() adds a space to a delayed extent tree. 403 * ext4_es_insert_extent() adds a space to a extent status tree.
389 * Caller holds inode->i_es_lock.
390 * 404 *
391 * ext4_es_insert_extent is called by ext4_da_write_begin and 405 * ext4_es_insert_extent is called by ext4_da_write_begin and
392 * ext4_es_remove_extent. 406 * ext4_es_remove_extent.
393 * 407 *
394 * Return 0 on success, error code on failure. 408 * Return 0 on success, error code on failure.
395 */ 409 */
396int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset, 410int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
397 ext4_lblk_t len) 411 ext4_lblk_t len)
398{ 412{
399 struct ext4_es_tree *tree; 413 struct ext4_es_tree *tree;
414 struct extent_status newes;
415 ext4_lblk_t end = lblk + len - 1;
400 int err = 0; 416 int err = 0;
401 417
402 trace_ext4_es_insert_extent(inode, offset, len); 418 trace_ext4_es_insert_extent(inode, lblk, len);
403 es_debug("add [%u/%u) to extent status tree of inode %lu\n", 419 es_debug("add [%u/%u) to extent status tree of inode %lu\n",
404 offset, len, inode->i_ino); 420 lblk, len, inode->i_ino);
421
422 BUG_ON(end < lblk);
423
424 newes.es_lblk = lblk;
425 newes.es_len = len;
405 426
406 write_lock(&EXT4_I(inode)->i_es_lock); 427 write_lock(&EXT4_I(inode)->i_es_lock);
407 tree = &EXT4_I(inode)->i_es_tree; 428 tree = &EXT4_I(inode)->i_es_tree;
408 err = __es_insert_extent(tree, offset, len); 429 err = __es_remove_extent(tree, lblk, end);
430 if (err != 0)
431 goto error;
432 err = __es_insert_extent(tree, &newes);
433
434error:
409 write_unlock(&EXT4_I(inode)->i_es_lock); 435 write_unlock(&EXT4_I(inode)->i_es_lock);
410 436
411 ext4_es_print_tree(inode); 437 ext4_es_print_tree(inode);
@@ -413,57 +439,45 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset,
413 return err; 439 return err;
414} 440}
415 441
416/* 442static int __es_remove_extent(struct ext4_es_tree *tree, ext4_lblk_t lblk,
417 * ext4_es_remove_extent() removes a space from a delayed extent tree. 443 ext4_lblk_t end)
418 * Caller holds inode->i_es_lock.
419 *
420 * Return 0 on success, error code on failure.
421 */
422int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
423 ext4_lblk_t len)
424{ 444{
425 struct rb_node *node; 445 struct rb_node *node;
426 struct ext4_es_tree *tree;
427 struct extent_status *es; 446 struct extent_status *es;
428 struct extent_status orig_es; 447 struct extent_status orig_es;
429 ext4_lblk_t len1, len2, end; 448 ext4_lblk_t len1, len2;
430 int err = 0; 449 int err = 0;
431 450
432 trace_ext4_es_remove_extent(inode, offset, len); 451 es = __es_tree_search(&tree->root, lblk);
433 es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
434 offset, len, inode->i_ino);
435
436 end = offset + len - 1;
437 BUG_ON(end < offset);
438 write_lock(&EXT4_I(inode)->i_es_lock);
439 tree = &EXT4_I(inode)->i_es_tree;
440 es = __es_tree_search(&tree->root, offset);
441 if (!es) 452 if (!es)
442 goto out; 453 goto out;
443 if (es->start > end) 454 if (es->es_lblk > end)
444 goto out; 455 goto out;
445 456
446 /* Simply invalidate cache_es. */ 457 /* Simply invalidate cache_es. */
447 tree->cache_es = NULL; 458 tree->cache_es = NULL;
448 459
449 orig_es.start = es->start; 460 orig_es.es_lblk = es->es_lblk;
450 orig_es.len = es->len; 461 orig_es.es_len = es->es_len;
451 len1 = offset > es->start ? offset - es->start : 0; 462 len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
452 len2 = extent_status_end(es) > end ? 463 len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
453 extent_status_end(es) - end : 0;
454 if (len1 > 0) 464 if (len1 > 0)
455 es->len = len1; 465 es->es_len = len1;
456 if (len2 > 0) { 466 if (len2 > 0) {
457 if (len1 > 0) { 467 if (len1 > 0) {
458 err = __es_insert_extent(tree, end + 1, len2); 468 struct extent_status newes;
469
470 newes.es_lblk = end + 1;
471 newes.es_len = len2;
472 err = __es_insert_extent(tree, &newes);
459 if (err) { 473 if (err) {
460 es->start = orig_es.start; 474 es->es_lblk = orig_es.es_lblk;
461 es->len = orig_es.len; 475 es->es_len = orig_es.es_len;
462 goto out; 476 goto out;
463 } 477 }
464 } else { 478 } else {
465 es->start = end + 1; 479 es->es_lblk = end + 1;
466 es->len = len2; 480 es->es_len = len2;
467 } 481 }
468 goto out; 482 goto out;
469 } 483 }
@@ -476,7 +490,7 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
476 es = NULL; 490 es = NULL;
477 } 491 }
478 492
479 while (es && extent_status_end(es) <= end) { 493 while (es && ext4_es_end(es) <= end) {
480 node = rb_next(&es->rb_node); 494 node = rb_next(&es->rb_node);
481 rb_erase(&es->rb_node, &tree->root); 495 rb_erase(&es->rb_node, &tree->root);
482 ext4_es_free_extent(es); 496 ext4_es_free_extent(es);
@@ -487,13 +501,39 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
487 es = rb_entry(node, struct extent_status, rb_node); 501 es = rb_entry(node, struct extent_status, rb_node);
488 } 502 }
489 503
490 if (es && es->start < end + 1) { 504 if (es && es->es_lblk < end + 1) {
491 len1 = extent_status_end(es) - end; 505 len1 = ext4_es_end(es) - end;
492 es->start = end + 1; 506 es->es_lblk = end + 1;
493 es->len = len1; 507 es->es_len = len1;
494 } 508 }
495 509
496out: 510out:
511 return err;
512}
513
514/*
515 * ext4_es_remove_extent() removes a space from a extent status tree.
516 *
517 * Return 0 on success, error code on failure.
518 */
519int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
520 ext4_lblk_t len)
521{
522 struct ext4_es_tree *tree;
523 ext4_lblk_t end;
524 int err = 0;
525
526 trace_ext4_es_remove_extent(inode, lblk, len);
527 es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
528 lblk, len, inode->i_ino);
529
530 end = lblk + len - 1;
531 BUG_ON(end < lblk);
532
533 tree = &EXT4_I(inode)->i_es_tree;
534
535 write_lock(&EXT4_I(inode)->i_es_lock);
536 err = __es_remove_extent(tree, lblk, end);
497 write_unlock(&EXT4_I(inode)->i_es_lock); 537 write_unlock(&EXT4_I(inode)->i_es_lock);
498 ext4_es_print_tree(inode); 538 ext4_es_print_tree(inode);
499 return err; 539 return err;
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 077f82db092a..81e9339f23f1 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -22,8 +22,8 @@
22 22
23struct extent_status { 23struct extent_status {
24 struct rb_node rb_node; 24 struct rb_node rb_node;
25 ext4_lblk_t start; /* first block extent covers */ 25 ext4_lblk_t es_lblk; /* first logical block extent covers */
26 ext4_lblk_t len; /* length of extent in block */ 26 ext4_lblk_t es_len; /* length of extent in block */
27}; 27};
28 28
29struct ext4_es_tree { 29struct ext4_es_tree {
@@ -35,9 +35,9 @@ extern int __init ext4_init_es(void);
35extern void ext4_exit_es(void); 35extern void ext4_exit_es(void);
36extern void ext4_es_init_tree(struct ext4_es_tree *tree); 36extern void ext4_es_init_tree(struct ext4_es_tree *tree);
37 37
38extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t start, 38extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
39 ext4_lblk_t len); 39 ext4_lblk_t len);
40extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t start, 40extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
41 ext4_lblk_t len); 41 ext4_lblk_t len);
42extern ext4_lblk_t ext4_es_find_extent(struct inode *inode, 42extern ext4_lblk_t ext4_es_find_extent(struct inode *inode,
43 struct extent_status *es); 43 struct extent_status *es);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2cf8ab810687..2df9354b105e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -464,10 +464,9 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
464 * If there is a delay extent at this offset, 464 * If there is a delay extent at this offset,
465 * it will be as a data. 465 * it will be as a data.
466 */ 466 */
467 es.start = last; 467 es.es_lblk = last;
468 (void)ext4_es_find_extent(inode, &es); 468 (void)ext4_es_find_extent(inode, &es);
469 if (last >= es.start && 469 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
470 last < es.start + es.len) {
471 if (last != start) 470 if (last != start)
472 dataoff = last << blkbits; 471 dataoff = last << blkbits;
473 break; 472 break;
@@ -549,11 +548,10 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
549 * If there is a delay extent at this offset, 548 * If there is a delay extent at this offset,
550 * we will skip this extent. 549 * we will skip this extent.
551 */ 550 */
552 es.start = last; 551 es.es_lblk = last;
553 (void)ext4_es_find_extent(inode, &es); 552 (void)ext4_es_find_extent(inode, &es);
554 if (last >= es.start && 553 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
555 last < es.start + es.len) { 554 last = es.es_lblk + es.es_len;
556 last = es.start + es.len;
557 holeoff = last << blkbits; 555 holeoff = last << blkbits;
558 continue; 556 continue;
559 } 557 }
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 6080ea1380b8..52c923851959 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -2093,75 +2093,75 @@ TRACE_EVENT(ext4_ext_remove_space_done,
2093); 2093);
2094 2094
2095TRACE_EVENT(ext4_es_insert_extent, 2095TRACE_EVENT(ext4_es_insert_extent,
2096 TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len), 2096 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len),
2097 2097
2098 TP_ARGS(inode, start, len), 2098 TP_ARGS(inode, lblk, len),
2099 2099
2100 TP_STRUCT__entry( 2100 TP_STRUCT__entry(
2101 __field( dev_t, dev ) 2101 __field( dev_t, dev )
2102 __field( ino_t, ino ) 2102 __field( ino_t, ino )
2103 __field( loff_t, start ) 2103 __field( loff_t, lblk )
2104 __field( loff_t, len ) 2104 __field( loff_t, len )
2105 ), 2105 ),
2106 2106
2107 TP_fast_assign( 2107 TP_fast_assign(
2108 __entry->dev = inode->i_sb->s_dev; 2108 __entry->dev = inode->i_sb->s_dev;
2109 __entry->ino = inode->i_ino; 2109 __entry->ino = inode->i_ino;
2110 __entry->start = start; 2110 __entry->lblk = lblk;
2111 __entry->len = len; 2111 __entry->len = len;
2112 ), 2112 ),
2113 2113
2114 TP_printk("dev %d,%d ino %lu es [%lld/%lld)", 2114 TP_printk("dev %d,%d ino %lu es [%lld/%lld)",
2115 MAJOR(__entry->dev), MINOR(__entry->dev), 2115 MAJOR(__entry->dev), MINOR(__entry->dev),
2116 (unsigned long) __entry->ino, 2116 (unsigned long) __entry->ino,
2117 __entry->start, __entry->len) 2117 __entry->lblk, __entry->len)
2118); 2118);
2119 2119
2120TRACE_EVENT(ext4_es_remove_extent, 2120TRACE_EVENT(ext4_es_remove_extent,
2121 TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len), 2121 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len),
2122 2122
2123 TP_ARGS(inode, start, len), 2123 TP_ARGS(inode, lblk, len),
2124 2124
2125 TP_STRUCT__entry( 2125 TP_STRUCT__entry(
2126 __field( dev_t, dev ) 2126 __field( dev_t, dev )
2127 __field( ino_t, ino ) 2127 __field( ino_t, ino )
2128 __field( loff_t, start ) 2128 __field( loff_t, lblk )
2129 __field( loff_t, len ) 2129 __field( loff_t, len )
2130 ), 2130 ),
2131 2131
2132 TP_fast_assign( 2132 TP_fast_assign(
2133 __entry->dev = inode->i_sb->s_dev; 2133 __entry->dev = inode->i_sb->s_dev;
2134 __entry->ino = inode->i_ino; 2134 __entry->ino = inode->i_ino;
2135 __entry->start = start; 2135 __entry->lblk = lblk;
2136 __entry->len = len; 2136 __entry->len = len;
2137 ), 2137 ),
2138 2138
2139 TP_printk("dev %d,%d ino %lu es [%lld/%lld)", 2139 TP_printk("dev %d,%d ino %lu es [%lld/%lld)",
2140 MAJOR(__entry->dev), MINOR(__entry->dev), 2140 MAJOR(__entry->dev), MINOR(__entry->dev),
2141 (unsigned long) __entry->ino, 2141 (unsigned long) __entry->ino,
2142 __entry->start, __entry->len) 2142 __entry->lblk, __entry->len)
2143); 2143);
2144 2144
2145TRACE_EVENT(ext4_es_find_extent_enter, 2145TRACE_EVENT(ext4_es_find_extent_enter,
2146 TP_PROTO(struct inode *inode, ext4_lblk_t start), 2146 TP_PROTO(struct inode *inode, ext4_lblk_t lblk),
2147 2147
2148 TP_ARGS(inode, start), 2148 TP_ARGS(inode, lblk),
2149 2149
2150 TP_STRUCT__entry( 2150 TP_STRUCT__entry(
2151 __field( dev_t, dev ) 2151 __field( dev_t, dev )
2152 __field( ino_t, ino ) 2152 __field( ino_t, ino )
2153 __field( ext4_lblk_t, start ) 2153 __field( ext4_lblk_t, lblk )
2154 ), 2154 ),
2155 2155
2156 TP_fast_assign( 2156 TP_fast_assign(
2157 __entry->dev = inode->i_sb->s_dev; 2157 __entry->dev = inode->i_sb->s_dev;
2158 __entry->ino = inode->i_ino; 2158 __entry->ino = inode->i_ino;
2159 __entry->start = start; 2159 __entry->lblk = lblk;
2160 ), 2160 ),
2161 2161
2162 TP_printk("dev %d,%d ino %lu start %u", 2162 TP_printk("dev %d,%d ino %lu lblk %u",
2163 MAJOR(__entry->dev), MINOR(__entry->dev), 2163 MAJOR(__entry->dev), MINOR(__entry->dev),
2164 (unsigned long) __entry->ino, __entry->start) 2164 (unsigned long) __entry->ino, __entry->lblk)
2165); 2165);
2166 2166
2167TRACE_EVENT(ext4_es_find_extent_exit, 2167TRACE_EVENT(ext4_es_find_extent_exit,
@@ -2173,7 +2173,7 @@ TRACE_EVENT(ext4_es_find_extent_exit,
2173 TP_STRUCT__entry( 2173 TP_STRUCT__entry(
2174 __field( dev_t, dev ) 2174 __field( dev_t, dev )
2175 __field( ino_t, ino ) 2175 __field( ino_t, ino )
2176 __field( ext4_lblk_t, start ) 2176 __field( ext4_lblk_t, lblk )
2177 __field( ext4_lblk_t, len ) 2177 __field( ext4_lblk_t, len )
2178 __field( ext4_lblk_t, ret ) 2178 __field( ext4_lblk_t, ret )
2179 ), 2179 ),
@@ -2181,15 +2181,15 @@ TRACE_EVENT(ext4_es_find_extent_exit,
2181 TP_fast_assign( 2181 TP_fast_assign(
2182 __entry->dev = inode->i_sb->s_dev; 2182 __entry->dev = inode->i_sb->s_dev;
2183 __entry->ino = inode->i_ino; 2183 __entry->ino = inode->i_ino;
2184 __entry->start = es->start; 2184 __entry->lblk = es->es_lblk;
2185 __entry->len = es->len; 2185 __entry->len = es->es_len;
2186 __entry->ret = ret; 2186 __entry->ret = ret;
2187 ), 2187 ),
2188 2188
2189 TP_printk("dev %d,%d ino %lu es [%u/%u) ret %u", 2189 TP_printk("dev %d,%d ino %lu es [%u/%u) ret %u",
2190 MAJOR(__entry->dev), MINOR(__entry->dev), 2190 MAJOR(__entry->dev), MINOR(__entry->dev),
2191 (unsigned long) __entry->ino, 2191 (unsigned long) __entry->ino,
2192 __entry->start, __entry->len, __entry->ret) 2192 __entry->lblk, __entry->len, __entry->ret)
2193); 2193);
2194 2194
2195#endif /* _TRACE_EXT4_H */ 2195#endif /* _TRACE_EXT4_H */