aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2/alloc.c
diff options
context:
space:
mode:
authorMark Fasheh <mark.fasheh@oracle.com>2007-01-16 14:32:23 -0500
committerMark Fasheh <mark.fasheh@oracle.com>2007-04-26 17:44:03 -0400
commitdcd0538ff4e854fa9d7f4630b359ca8fdb5cb5a8 (patch)
tree226d725f8199907cea2433d1d183b01e51d9bc55 /fs/ocfs2/alloc.c
parent6f16bf655c5795586dd2ac96a7c70e0b9a378746 (diff)
ocfs2: sparse b-tree support
Introduce tree rotations into the b-tree code. This will allow ocfs2 to support sparse files. Much of the added code is designed to be generic (in the ocfs2 sense) so that it can later be re-used to implement large extended attributes. This patch only adds the rotation code and does minimal updates to callers of the extent api. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Diffstat (limited to 'fs/ocfs2/alloc.c')
-rw-r--r--fs/ocfs2/alloc.c2446
1 files changed, 1963 insertions, 483 deletions
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f27e5378caf2..a96696867576 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -47,62 +47,230 @@
47 47
48#include "buffer_head_io.h" 48#include "buffer_head_io.h"
49 49
50static int ocfs2_extent_contig(struct inode *inode, 50static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
51 struct ocfs2_extent_rec *ext,
52 u64 blkno);
53 51
54static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, 52/*
55 handle_t *handle, 53 * Structures which describe a path through a btree, and functions to
56 struct inode *inode, 54 * manipulate them.
57 int wanted, 55 *
58 struct ocfs2_alloc_context *meta_ac, 56 * The idea here is to be as generic as possible with the tree
59 struct buffer_head *bhs[]); 57 * manipulation code.
58 */
59struct ocfs2_path_item {
60 struct buffer_head *bh;
61 struct ocfs2_extent_list *el;
62};
60 63
61static int ocfs2_add_branch(struct ocfs2_super *osb, 64#define OCFS2_MAX_PATH_DEPTH 5
62 handle_t *handle,
63 struct inode *inode,
64 struct buffer_head *fe_bh,
65 struct buffer_head *eb_bh,
66 struct buffer_head *last_eb_bh,
67 struct ocfs2_alloc_context *meta_ac);
68 65
69static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, 66struct ocfs2_path {
70 handle_t *handle, 67 int p_tree_depth;
71 struct inode *inode, 68 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
72 struct buffer_head *fe_bh, 69};
73 struct ocfs2_alloc_context *meta_ac,
74 struct buffer_head **ret_new_eb_bh);
75 70
76static int ocfs2_do_insert_extent(struct ocfs2_super *osb, 71#define path_root_bh(_path) ((_path)->p_node[0].bh)
77 handle_t *handle, 72#define path_root_el(_path) ((_path)->p_node[0].el)
78 struct inode *inode, 73#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
79 struct buffer_head *fe_bh, 74#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
80 u64 blkno, 75#define path_num_items(_path) ((_path)->p_tree_depth + 1)
81 u32 new_clusters);
82 76
83static int ocfs2_find_branch_target(struct ocfs2_super *osb, 77/*
84 struct inode *inode, 78 * Reset the actual path elements so that we can re-use the structure
85 struct buffer_head *fe_bh, 79 * to build another path. Generally, this involves freeing the buffer
86 struct buffer_head **target_bh); 80 * heads.
81 */
82static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
83{
84 int i, start = 0, depth = 0;
85 struct ocfs2_path_item *node;
87 86
88static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, 87 if (keep_root)
89 struct inode *inode, 88 start = 1;
90 struct ocfs2_dinode *fe,
91 unsigned int new_i_clusters,
92 struct buffer_head *old_last_eb,
93 struct buffer_head **new_last_eb);
94 89
95static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); 90 for(i = start; i < path_num_items(path); i++) {
91 node = &path->p_node[i];
96 92
97static int ocfs2_extent_contig(struct inode *inode, 93 brelse(node->bh);
98 struct ocfs2_extent_rec *ext, 94 node->bh = NULL;
99 u64 blkno) 95 node->el = NULL;
96 }
97
98 /*
99 * Tree depth may change during truncate, or insert. If we're
100 * keeping the root extent list, then make sure that our path
101 * structure reflects the proper depth.
102 */
103 if (keep_root)
104 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
105
106 path->p_tree_depth = depth;
107}
108
109static void ocfs2_free_path(struct ocfs2_path *path)
110{
111 if (path) {
112 ocfs2_reinit_path(path, 0);
113 kfree(path);
114 }
115}
116
117/*
118 * Make the *dest path the same as src and re-initialize src path to
119 * have a root only.
120 */
121static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
122{
123 int i;
124
125 BUG_ON(path_root_bh(dest) != path_root_bh(src));
126
127 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
128 brelse(dest->p_node[i].bh);
129
130 dest->p_node[i].bh = src->p_node[i].bh;
131 dest->p_node[i].el = src->p_node[i].el;
132
133 src->p_node[i].bh = NULL;
134 src->p_node[i].el = NULL;
135 }
136}
137
138/*
139 * Insert an extent block at given index.
140 *
141 * This will not take an additional reference on eb_bh.
142 */
143static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
144 struct buffer_head *eb_bh)
145{
146 struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
147
148 /*
149 * Right now, no root bh is an extent block, so this helps
150 * catch code errors with dinode trees. The assertion can be
151 * safely removed if we ever need to insert extent block
152 * structures at the root.
153 */
154 BUG_ON(index == 0);
155
156 path->p_node[index].bh = eb_bh;
157 path->p_node[index].el = &eb->h_list;
158}
159
160static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
161 struct ocfs2_extent_list *root_el)
162{
163 struct ocfs2_path *path;
164
165 BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
166
167 path = kzalloc(sizeof(*path), GFP_NOFS);
168 if (path) {
169 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
170 get_bh(root_bh);
171 path_root_bh(path) = root_bh;
172 path_root_el(path) = root_el;
173 }
174
175 return path;
176}
177
178/*
179 * Allocate and initialize a new path based on a disk inode tree.
180 */
181static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
182{
183 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
184 struct ocfs2_extent_list *el = &di->id2.i_list;
185
186 return ocfs2_new_path(di_bh, el);
187}
188
189/*
190 * Convenience function to journal all components in a path.
191 */
192static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
193 struct ocfs2_path *path)
194{
195 int i, ret = 0;
196
197 if (!path)
198 goto out;
199
200 for(i = 0; i < path_num_items(path); i++) {
201 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
202 OCFS2_JOURNAL_ACCESS_WRITE);
203 if (ret < 0) {
204 mlog_errno(ret);
205 goto out;
206 }
207 }
208
209out:
210 return ret;
211}
212
213enum ocfs2_contig_type {
214 CONTIG_NONE = 0,
215 CONTIG_LEFT,
216 CONTIG_RIGHT
217};
218
219static int ocfs2_block_extent_contig(struct super_block *sb,
220 struct ocfs2_extent_rec *ext,
221 u64 blkno)
100{ 222{
101 return blkno == (le64_to_cpu(ext->e_blkno) + 223 return blkno == (le64_to_cpu(ext->e_blkno) +
102 ocfs2_clusters_to_blocks(inode->i_sb, 224 ocfs2_clusters_to_blocks(sb,
103 le32_to_cpu(ext->e_clusters))); 225 le32_to_cpu(ext->e_clusters)));
104} 226}
105 227
228static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
229 struct ocfs2_extent_rec *right)
230{
231 return (le32_to_cpu(left->e_cpos) + le32_to_cpu(left->e_clusters) ==
232 le32_to_cpu(right->e_cpos));
233}
234
235static enum ocfs2_contig_type
236 ocfs2_extent_contig(struct inode *inode,
237 struct ocfs2_extent_rec *ext,
238 struct ocfs2_extent_rec *insert_rec)
239{
240 u64 blkno = le64_to_cpu(insert_rec->e_blkno);
241
242 if (ocfs2_extents_adjacent(ext, insert_rec) &&
243 ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
244 return CONTIG_RIGHT;
245
246 blkno = le64_to_cpu(ext->e_blkno);
247 if (ocfs2_extents_adjacent(insert_rec, ext) &&
248 ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
249 return CONTIG_LEFT;
250
251 return CONTIG_NONE;
252}
253
254/*
255 * NOTE: We can have pretty much any combination of contiguousness and
256 * appending.
257 *
258 * The usefulness of APPEND_TAIL is more in that it lets us know that
259 * we'll have to update the path to that leaf.
260 */
261enum ocfs2_append_type {
262 APPEND_NONE = 0,
263 APPEND_TAIL,
264};
265
266struct ocfs2_insert_type {
267 enum ocfs2_append_type ins_appending;
268 enum ocfs2_contig_type ins_contig;
269 int ins_contig_index;
270 int ins_free_records;
271 int ins_tree_depth;
272};
273
106/* 274/*
107 * How many free extents have we got before we need more meta data? 275 * How many free extents have we got before we need more meta data?
108 */ 276 */
@@ -242,6 +410,28 @@ bail:
242} 410}
243 411
244/* 412/*
413 * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
414 *
415 * Returns the sum of the rightmost extent rec logical offset and
416 * cluster count.
417 *
418 * ocfs2_add_branch() uses this to determine what logical cluster
419 * value should be populated into the leftmost new branch records.
420 *
421 * ocfs2_shift_tree_depth() uses this to determine the # clusters
422 * value for the new topmost tree record.
423 */
424static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
425{
426 int i;
427
428 i = le16_to_cpu(el->l_next_free_rec) - 1;
429
430 return le32_to_cpu(el->l_recs[i].e_cpos) +
431 le32_to_cpu(el->l_recs[i].e_clusters);
432}
433
434/*
245 * Add an entire tree branch to our inode. eb_bh is the extent block 435 * Add an entire tree branch to our inode. eb_bh is the extent block
246 * to start at, if we don't want to start the branch at the dinode 436 * to start at, if we don't want to start the branch at the dinode
247 * structure. 437 * structure.
@@ -268,6 +458,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
268 struct ocfs2_extent_block *eb; 458 struct ocfs2_extent_block *eb;
269 struct ocfs2_extent_list *eb_el; 459 struct ocfs2_extent_list *eb_el;
270 struct ocfs2_extent_list *el; 460 struct ocfs2_extent_list *el;
461 u32 new_cpos;
271 462
272 mlog_entry_void(); 463 mlog_entry_void();
273 464
@@ -302,6 +493,9 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
302 goto bail; 493 goto bail;
303 } 494 }
304 495
496 eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
497 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
498
305 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be 499 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
306 * linked with the rest of the tree. 500 * linked with the rest of the tree.
307 * conversly, new_eb_bhs[0] is the new bottommost leaf. 501 * conversly, new_eb_bhs[0] is the new bottommost leaf.
@@ -330,7 +524,11 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
330 eb->h_next_leaf_blk = 0; 524 eb->h_next_leaf_blk = 0;
331 eb_el->l_tree_depth = cpu_to_le16(i); 525 eb_el->l_tree_depth = cpu_to_le16(i);
332 eb_el->l_next_free_rec = cpu_to_le16(1); 526 eb_el->l_next_free_rec = cpu_to_le16(1);
333 eb_el->l_recs[0].e_cpos = fe->i_clusters; 527 /*
528 * This actually counts as an empty extent as
529 * c_clusters == 0
530 */
531 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
334 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); 532 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
335 eb_el->l_recs[0].e_clusters = cpu_to_le32(0); 533 eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
336 if (!eb_el->l_tree_depth) 534 if (!eb_el->l_tree_depth)
@@ -376,7 +574,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
376 * either be on the fe, or the extent block passed in. */ 574 * either be on the fe, or the extent block passed in. */
377 i = le16_to_cpu(el->l_next_free_rec); 575 i = le16_to_cpu(el->l_next_free_rec);
378 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); 576 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
379 el->l_recs[i].e_cpos = fe->i_clusters; 577 el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
380 el->l_recs[i].e_clusters = 0; 578 el->l_recs[i].e_clusters = 0;
381 le16_add_cpu(&el->l_next_free_rec, 1); 579 le16_add_cpu(&el->l_next_free_rec, 1);
382 580
@@ -425,6 +623,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
425 struct buffer_head **ret_new_eb_bh) 623 struct buffer_head **ret_new_eb_bh)
426{ 624{
427 int status, i; 625 int status, i;
626 u32 new_clusters;
428 struct buffer_head *new_eb_bh = NULL; 627 struct buffer_head *new_eb_bh = NULL;
429 struct ocfs2_dinode *fe; 628 struct ocfs2_dinode *fe;
430 struct ocfs2_extent_block *eb; 629 struct ocfs2_extent_block *eb;
@@ -480,11 +679,13 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
480 goto bail; 679 goto bail;
481 } 680 }
482 681
682 new_clusters = ocfs2_sum_rightmost_rec(eb_el);
683
483 /* update fe now */ 684 /* update fe now */
484 le16_add_cpu(&fe_el->l_tree_depth, 1); 685 le16_add_cpu(&fe_el->l_tree_depth, 1);
485 fe_el->l_recs[0].e_cpos = 0; 686 fe_el->l_recs[0].e_cpos = 0;
486 fe_el->l_recs[0].e_blkno = eb->h_blkno; 687 fe_el->l_recs[0].e_blkno = eb->h_blkno;
487 fe_el->l_recs[0].e_clusters = fe->i_clusters; 688 fe_el->l_recs[0].e_clusters = cpu_to_le32(new_clusters);
488 for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { 689 for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
489 fe_el->l_recs[i].e_cpos = 0; 690 fe_el->l_recs[i].e_cpos = 0;
490 fe_el->l_recs[i].e_clusters = 0; 691 fe_el->l_recs[i].e_clusters = 0;
@@ -515,199 +716,6 @@ bail:
515} 716}
516 717
517/* 718/*
518 * Expects the tree to already have room in the rightmost leaf for the
519 * extent. Updates all the extent blocks (and the dinode) on the way
520 * down.
521 */
522static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
523 handle_t *handle,
524 struct inode *inode,
525 struct buffer_head *fe_bh,
526 u64 start_blk,
527 u32 new_clusters)
528{
529 int status, i, num_bhs = 0;
530 u64 next_blkno;
531 u16 next_free;
532 struct buffer_head **eb_bhs = NULL;
533 struct ocfs2_dinode *fe;
534 struct ocfs2_extent_block *eb;
535 struct ocfs2_extent_list *el;
536
537 mlog_entry_void();
538
539 status = ocfs2_journal_access(handle, inode, fe_bh,
540 OCFS2_JOURNAL_ACCESS_WRITE);
541 if (status < 0) {
542 mlog_errno(status);
543 goto bail;
544 }
545
546 fe = (struct ocfs2_dinode *) fe_bh->b_data;
547 el = &fe->id2.i_list;
548 if (el->l_tree_depth) {
549 /* This is another operation where we want to be
550 * careful about our tree updates. An error here means
551 * none of the previous changes we made should roll
552 * forward. As a result, we have to record the buffers
553 * for this part of the tree in an array and reserve a
554 * journal write to them before making any changes. */
555 num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
556 eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
557 GFP_KERNEL);
558 if (!eb_bhs) {
559 status = -ENOMEM;
560 mlog_errno(status);
561 goto bail;
562 }
563
564 i = 0;
565 while(el->l_tree_depth) {
566 next_free = le16_to_cpu(el->l_next_free_rec);
567 if (next_free == 0) {
568 ocfs2_error(inode->i_sb,
569 "Dinode %llu has a bad extent list",
570 (unsigned long long)OCFS2_I(inode)->ip_blkno);
571 status = -EIO;
572 goto bail;
573 }
574 next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
575
576 BUG_ON(i >= num_bhs);
577 status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
578 OCFS2_BH_CACHED, inode);
579 if (status < 0) {
580 mlog_errno(status);
581 goto bail;
582 }
583 eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
584 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
585 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
586 eb);
587 status = -EIO;
588 goto bail;
589 }
590
591 status = ocfs2_journal_access(handle, inode, eb_bhs[i],
592 OCFS2_JOURNAL_ACCESS_WRITE);
593 if (status < 0) {
594 mlog_errno(status);
595 goto bail;
596 }
597
598 el = &eb->h_list;
599 i++;
600 /* When we leave this loop, eb_bhs[num_bhs - 1] will
601 * hold the bottom-most leaf extent block. */
602 }
603 BUG_ON(el->l_tree_depth);
604
605 el = &fe->id2.i_list;
606 /* If we have tree depth, then the fe update is
607 * trivial, and we want to switch el out for the
608 * bottom-most leaf in order to update it with the
609 * actual extent data below. */
610 next_free = le16_to_cpu(el->l_next_free_rec);
611 if (next_free == 0) {
612 ocfs2_error(inode->i_sb,
613 "Dinode %llu has a bad extent list",
614 (unsigned long long)OCFS2_I(inode)->ip_blkno);
615 status = -EIO;
616 goto bail;
617 }
618 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
619 new_clusters);
620 /* (num_bhs - 1) to avoid the leaf */
621 for(i = 0; i < (num_bhs - 1); i++) {
622 eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
623 el = &eb->h_list;
624
625 /* finally, make our actual change to the
626 * intermediate extent blocks. */
627 next_free = le16_to_cpu(el->l_next_free_rec);
628 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
629 new_clusters);
630
631 status = ocfs2_journal_dirty(handle, eb_bhs[i]);
632 if (status < 0)
633 mlog_errno(status);
634 }
635 BUG_ON(i != (num_bhs - 1));
636 /* note that the leaf block wasn't touched in
637 * the loop above */
638 eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
639 el = &eb->h_list;
640 BUG_ON(el->l_tree_depth);
641 }
642
643 /* yay, we can finally add the actual extent now! */
644 i = le16_to_cpu(el->l_next_free_rec) - 1;
645 if (le16_to_cpu(el->l_next_free_rec) &&
646 ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
647 le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
648 } else if (le16_to_cpu(el->l_next_free_rec) &&
649 (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
650 /* having an empty extent at eof is legal. */
651 if (el->l_recs[i].e_cpos != fe->i_clusters) {
652 ocfs2_error(inode->i_sb,
653 "Dinode %llu trailing extent is bad: "
654 "cpos (%u) != number of clusters (%u)",
655 (unsigned long long)OCFS2_I(inode)->ip_blkno,
656 le32_to_cpu(el->l_recs[i].e_cpos),
657 le32_to_cpu(fe->i_clusters));
658 status = -EIO;
659 goto bail;
660 }
661 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
662 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
663 } else {
664 /* No contiguous record, or no empty record at eof, so
665 * we add a new one. */
666
667 BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
668 le16_to_cpu(el->l_count));
669 i = le16_to_cpu(el->l_next_free_rec);
670
671 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
672 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
673 el->l_recs[i].e_cpos = fe->i_clusters;
674 le16_add_cpu(&el->l_next_free_rec, 1);
675 }
676
677 /*
678 * extent_map errors are not fatal, so they are ignored outside
679 * of flushing the thing.
680 */
681 status = ocfs2_extent_map_append(inode, &el->l_recs[i],
682 new_clusters);
683 if (status) {
684 mlog_errno(status);
685 ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
686 }
687
688 status = ocfs2_journal_dirty(handle, fe_bh);
689 if (status < 0)
690 mlog_errno(status);
691 if (fe->id2.i_list.l_tree_depth) {
692 status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
693 if (status < 0)
694 mlog_errno(status);
695 }
696
697 status = 0;
698bail:
699 if (eb_bhs) {
700 for (i = 0; i < num_bhs; i++)
701 if (eb_bhs[i])
702 brelse(eb_bhs[i]);
703 kfree(eb_bhs);
704 }
705
706 mlog_exit(status);
707 return status;
708}
709
710/*
711 * Should only be called when there is no space left in any of the 719 * Should only be called when there is no space left in any of the
712 * leaf nodes. What we want to do is find the lowest tree depth 720 * leaf nodes. What we want to do is find the lowest tree depth
713 * non-leaf extent block with room for new records. There are three 721 * non-leaf extent block with room for new records. There are three
@@ -807,53 +815,1523 @@ bail:
807 return status; 815 return status;
808} 816}
809 817
810/* the caller needs to update fe->i_clusters */ 818static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
811int ocfs2_insert_extent(struct ocfs2_super *osb,
812 handle_t *handle,
813 struct inode *inode,
814 struct buffer_head *fe_bh,
815 u64 start_blk,
816 u32 new_clusters,
817 struct ocfs2_alloc_context *meta_ac)
818{ 819{
819 int status, i, shift; 820 return !rec->e_clusters;
820 struct buffer_head *last_eb_bh = NULL; 821}
822
823/*
824 * This function will discard the rightmost extent record.
825 */
826static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
827{
828 int next_free = le16_to_cpu(el->l_next_free_rec);
829 int count = le16_to_cpu(el->l_count);
830 unsigned int num_bytes;
831
832 BUG_ON(!next_free);
833 /* This will cause us to go off the end of our extent list. */
834 BUG_ON(next_free >= count);
835
836 num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
837
838 memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
839}
840
841static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
842 struct ocfs2_extent_rec *insert_rec)
843{
844 int i, insert_index, next_free, has_empty, num_bytes;
845 u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
846 struct ocfs2_extent_rec *rec;
847
848 next_free = le16_to_cpu(el->l_next_free_rec);
849 has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
850
851 BUG_ON(!next_free);
852
853 /* The tree code before us didn't allow enough room in the leaf. */
854 if (el->l_next_free_rec == el->l_count && !has_empty)
855 BUG();
856
857 /*
858 * The easiest way to approach this is to just remove the
859 * empty extent and temporarily decrement next_free.
860 */
861 if (has_empty) {
862 /*
863 * If next_free was 1 (only an empty extent), this
864 * loop won't execute, which is fine. We still want
865 * the decrement above to happen.
866 */
867 for(i = 0; i < (next_free - 1); i++)
868 el->l_recs[i] = el->l_recs[i+1];
869
870 next_free--;
871 }
872
873 /*
874 * Figure out what the new record index should be.
875 */
876 for(i = 0; i < next_free; i++) {
877 rec = &el->l_recs[i];
878
879 if (insert_cpos < le32_to_cpu(rec->e_cpos))
880 break;
881 }
882 insert_index = i;
883
884 mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
885 insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
886
887 BUG_ON(insert_index < 0);
888 BUG_ON(insert_index >= le16_to_cpu(el->l_count));
889 BUG_ON(insert_index > next_free);
890
891 /*
892 * No need to memmove if we're just adding to the tail.
893 */
894 if (insert_index != next_free) {
895 BUG_ON(next_free >= le16_to_cpu(el->l_count));
896
897 num_bytes = next_free - insert_index;
898 num_bytes *= sizeof(struct ocfs2_extent_rec);
899 memmove(&el->l_recs[insert_index + 1],
900 &el->l_recs[insert_index],
901 num_bytes);
902 }
903
904 /*
905 * Either we had an empty extent, and need to re-increment or
906 * there was no empty extent on a non full rightmost leaf node,
907 * in which case we still need to increment.
908 */
909 next_free++;
910 el->l_next_free_rec = cpu_to_le16(next_free);
911 /*
912 * Make sure none of the math above just messed up our tree.
913 */
914 BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
915
916 el->l_recs[insert_index] = *insert_rec;
917
918}
919
920/*
921 * Create an empty extent record .
922 *
923 * l_next_free_rec may be updated.
924 *
925 * If an empty extent already exists do nothing.
926 */
927static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
928{
929 int next_free = le16_to_cpu(el->l_next_free_rec);
930
931 if (next_free == 0)
932 goto set_and_inc;
933
934 if (ocfs2_is_empty_extent(&el->l_recs[0]))
935 return;
936
937 mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
938 "Asked to create an empty extent in a full list:\n"
939 "count = %u, tree depth = %u",
940 le16_to_cpu(el->l_count),
941 le16_to_cpu(el->l_tree_depth));
942
943 ocfs2_shift_records_right(el);
944
945set_and_inc:
946 le16_add_cpu(&el->l_next_free_rec, 1);
947 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
948}
949
950/*
951 * For a rotation which involves two leaf nodes, the "root node" is
952 * the lowest level tree node which contains a path to both leafs. This
953 * resulting set of information can be used to form a complete "subtree"
954 *
955 * This function is passed two full paths from the dinode down to a
956 * pair of adjacent leaves. It's task is to figure out which path
957 * index contains the subtree root - this can be the root index itself
958 * in a worst-case rotation.
959 *
960 * The array index of the subtree root is passed back.
961 */
962static int ocfs2_find_subtree_root(struct inode *inode,
963 struct ocfs2_path *left,
964 struct ocfs2_path *right)
965{
966 int i = 0;
967
968 /*
969 * Check that the caller passed in two paths from the same tree.
970 */
971 BUG_ON(path_root_bh(left) != path_root_bh(right));
972
973 do {
974 i++;
975
976 /*
977 * The caller didn't pass two adjacent paths.
978 */
979 mlog_bug_on_msg(i > left->p_tree_depth,
980 "Inode %lu, left depth %u, right depth %u\n"
981 "left leaf blk %llu, right leaf blk %llu\n",
982 inode->i_ino, left->p_tree_depth,
983 right->p_tree_depth,
984 (unsigned long long)path_leaf_bh(left)->b_blocknr,
985 (unsigned long long)path_leaf_bh(right)->b_blocknr);
986 } while (left->p_node[i].bh->b_blocknr ==
987 right->p_node[i].bh->b_blocknr);
988
989 return i - 1;
990}
991
992typedef void (path_insert_t)(void *, struct buffer_head *);
993
994/*
995 * Traverse a btree path in search of cpos, starting at root_el.
996 *
997 * This code can be called with a cpos larger than the tree, in which
998 * case it will return the rightmost path.
999 */
1000static int __ocfs2_find_path(struct inode *inode,
1001 struct ocfs2_extent_list *root_el, u32 cpos,
1002 path_insert_t *func, void *data)
1003{
1004 int i, ret = 0;
1005 u32 range;
1006 u64 blkno;
821 struct buffer_head *bh = NULL; 1007 struct buffer_head *bh = NULL;
822 struct ocfs2_dinode *fe;
823 struct ocfs2_extent_block *eb; 1008 struct ocfs2_extent_block *eb;
824 struct ocfs2_extent_list *el; 1009 struct ocfs2_extent_list *el;
1010 struct ocfs2_extent_rec *rec;
1011 struct ocfs2_inode_info *oi = OCFS2_I(inode);
825 1012
826 mlog_entry_void(); 1013 el = root_el;
1014 while (el->l_tree_depth) {
1015 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1016 ocfs2_error(inode->i_sb,
1017 "Inode %llu has empty extent list at "
1018 "depth %u\n",
1019 (unsigned long long)oi->ip_blkno,
1020 le16_to_cpu(el->l_tree_depth));
1021 ret = -EROFS;
1022 goto out;
827 1023
828 mlog(0, "add %u clusters starting at block %llu to inode %llu\n", 1024 }
829 new_clusters, (unsigned long long)start_blk,
830 (unsigned long long)OCFS2_I(inode)->ip_blkno);
831 1025
832 fe = (struct ocfs2_dinode *) fe_bh->b_data; 1026 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
833 el = &fe->id2.i_list; 1027 rec = &el->l_recs[i];
1028
1029 /*
1030 * In the case that cpos is off the allocation
1031 * tree, this should just wind up returning the
1032 * rightmost record.
1033 */
1034 range = le32_to_cpu(rec->e_cpos) +
1035 le32_to_cpu(rec->e_clusters);
1036 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1037 break;
1038 }
834 1039
835 if (el->l_tree_depth) { 1040 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
836 /* jump to end of tree */ 1041 if (blkno == 0) {
837 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 1042 ocfs2_error(inode->i_sb,
838 &last_eb_bh, OCFS2_BH_CACHED, inode); 1043 "Inode %llu has bad blkno in extent list "
839 if (status < 0) { 1044 "at depth %u (index %d)\n",
840 mlog_exit(status); 1045 (unsigned long long)oi->ip_blkno,
841 goto bail; 1046 le16_to_cpu(el->l_tree_depth), i);
1047 ret = -EROFS;
1048 goto out;
842 } 1049 }
843 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 1050
1051 brelse(bh);
1052 bh = NULL;
1053 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
1054 &bh, OCFS2_BH_CACHED, inode);
1055 if (ret) {
1056 mlog_errno(ret);
1057 goto out;
1058 }
1059
1060 eb = (struct ocfs2_extent_block *) bh->b_data;
844 el = &eb->h_list; 1061 el = &eb->h_list;
1062 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1063 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1064 ret = -EIO;
1065 goto out;
1066 }
1067
1068 if (le16_to_cpu(el->l_next_free_rec) >
1069 le16_to_cpu(el->l_count)) {
1070 ocfs2_error(inode->i_sb,
1071 "Inode %llu has bad count in extent list "
1072 "at block %llu (next free=%u, count=%u)\n",
1073 (unsigned long long)oi->ip_blkno,
1074 (unsigned long long)bh->b_blocknr,
1075 le16_to_cpu(el->l_next_free_rec),
1076 le16_to_cpu(el->l_count));
1077 ret = -EROFS;
1078 goto out;
1079 }
1080
1081 if (func)
1082 func(data, bh);
1083 }
1084
1085out:
1086 /*
1087 * Catch any trailing bh that the loop didn't handle.
1088 */
1089 brelse(bh);
1090
1091 return ret;
1092}
1093
1094/*
1095 * Given an initialized path (that is, it has a valid root extent
1096 * list), this function will traverse the btree in search of the path
1097 * which would contain cpos.
1098 *
1099 * The path traveled is recorded in the path structure.
1100 *
1101 * Note that this will not do any comparisons on leaf node extent
1102 * records, so it will work fine in the case that we just added a tree
1103 * branch.
1104 */
1105struct find_path_data {
1106 int index;
1107 struct ocfs2_path *path;
1108};
1109static void find_path_ins(void *data, struct buffer_head *bh)
1110{
1111 struct find_path_data *fp = data;
1112
1113 get_bh(bh);
1114 ocfs2_path_insert_eb(fp->path, fp->index, bh);
1115 fp->index++;
1116}
1117static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1118 u32 cpos)
1119{
1120 struct find_path_data data;
1121
1122 data.index = 1;
1123 data.path = path;
1124 return __ocfs2_find_path(inode, path_root_el(path), cpos,
1125 find_path_ins, &data);
1126}
1127
1128static void find_leaf_ins(void *data, struct buffer_head *bh)
1129{
1130 struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1131 struct ocfs2_extent_list *el = &eb->h_list;
1132 struct buffer_head **ret = data;
1133
1134 /* We want to retain only the leaf block. */
1135 if (le16_to_cpu(el->l_tree_depth) == 0) {
1136 get_bh(bh);
1137 *ret = bh;
1138 }
1139}
1140/*
1141 * Find the leaf block in the tree which would contain cpos. No
1142 * checking of the actual leaf is done.
1143 *
1144 * Some paths want to call this instead of allocating a path structure
1145 * and calling ocfs2_find_path().
1146 *
1147 * This function doesn't handle non btree extent lists.
1148 */
1149static int ocfs2_find_leaf(struct inode *inode,
1150 struct ocfs2_extent_list *root_el, u32 cpos,
1151 struct buffer_head **leaf_bh)
1152{
1153 int ret;
1154 struct buffer_head *bh = NULL;
1155
1156 ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1157 if (ret) {
1158 mlog_errno(ret);
1159 goto out;
1160 }
1161
1162 *leaf_bh = bh;
1163out:
1164 return ret;
1165}
1166
1167/*
1168 * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1169 *
1170 * Basically, we've moved stuff around at the bottom of the tree and
1171 * we need to fix up the extent records above the changes to reflect
1172 * the new changes.
1173 *
1174 * left_rec: the record on the left.
1175 * left_child_el: is the child list pointed to by left_rec
1176 * right_rec: the record to the right of left_rec
1177 * right_child_el: is the child list pointed to by right_rec
1178 *
1179 * By definition, this only works on interior nodes.
1180 */
1181static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1182 struct ocfs2_extent_list *left_child_el,
1183 struct ocfs2_extent_rec *right_rec,
1184 struct ocfs2_extent_list *right_child_el)
1185{
1186 u32 left_clusters, right_end;
1187
1188 /*
1189 * Interior nodes never have holes. Their cpos is the cpos of
1190 * the leftmost record in their child list. Their cluster
1191 * count covers the full theoretical range of their child list
1192 * - the range between their cpos and the cpos of the record
1193 * immediately to their right.
1194 */
1195 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1196 left_clusters -= le32_to_cpu(left_rec->e_cpos);
1197 left_rec->e_clusters = cpu_to_le32(left_clusters);
1198
1199 /*
1200 * Calculate the rightmost cluster count boundary before
1201 * moving cpos - we will need to adjust e_clusters after
1202 * updating e_cpos to keep the same highest cluster count.
1203 */
1204 right_end = le32_to_cpu(right_rec->e_cpos);
1205 right_end += le32_to_cpu(right_rec->e_clusters);
1206
1207 right_rec->e_cpos = left_rec->e_cpos;
1208 le32_add_cpu(&right_rec->e_cpos, left_clusters);
1209
1210 right_end -= le32_to_cpu(right_rec->e_cpos);
1211 right_rec->e_clusters = cpu_to_le32(right_end);
1212}
1213
1214/*
1215 * Adjust the adjacent root node records involved in a
1216 * rotation. left_el_blkno is passed in as a key so that we can easily
1217 * find it's index in the root list.
1218 */
1219static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1220 struct ocfs2_extent_list *left_el,
1221 struct ocfs2_extent_list *right_el,
1222 u64 left_el_blkno)
1223{
1224 int i;
1225
1226 BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1227 le16_to_cpu(left_el->l_tree_depth));
1228
1229 for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1230 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1231 break;
1232 }
1233
1234 /*
1235 * The path walking code should have never returned a root and
1236 * two paths which are not adjacent.
1237 */
1238 BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1239
1240 ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1241 &root_el->l_recs[i + 1], right_el);
1242}
1243
1244/*
1245 * We've changed a leaf block (in right_path) and need to reflect that
1246 * change back up the subtree.
1247 *
1248 * This happens in multiple places:
1249 * - When we've moved an extent record from the left path leaf to the right
1250 * path leaf to make room for an empty extent in the left path leaf.
1251 * - When our insert into the right path leaf is at the leftmost edge
1252 * and requires an update of the path immediately to it's left. This
1253 * can occur at the end of some types of rotation and appending inserts.
1254 */
1255static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1256 struct ocfs2_path *left_path,
1257 struct ocfs2_path *right_path,
1258 int subtree_index)
1259{
1260 int ret, i, idx;
1261 struct ocfs2_extent_list *el, *left_el, *right_el;
1262 struct ocfs2_extent_rec *left_rec, *right_rec;
1263 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1264
1265 /*
1266 * Update the counts and position values within all the
1267 * interior nodes to reflect the leaf rotation we just did.
1268 *
1269 * The root node is handled below the loop.
1270 *
1271 * We begin the loop with right_el and left_el pointing to the
1272 * leaf lists and work our way up.
1273 *
1274 * NOTE: within this loop, left_el and right_el always refer
1275 * to the *child* lists.
1276 */
1277 left_el = path_leaf_el(left_path);
1278 right_el = path_leaf_el(right_path);
1279 for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1280 mlog(0, "Adjust records at index %u\n", i);
1281
1282 /*
1283 * One nice property of knowing that all of these
1284 * nodes are below the root is that we only deal with
1285 * the leftmost right node record and the rightmost
1286 * left node record.
1287 */
1288 el = left_path->p_node[i].el;
1289 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1290 left_rec = &el->l_recs[idx];
1291
1292 el = right_path->p_node[i].el;
1293 right_rec = &el->l_recs[0];
1294
1295 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1296 right_el);
1297
1298 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1299 if (ret)
1300 mlog_errno(ret);
1301
1302 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1303 if (ret)
1304 mlog_errno(ret);
1305
1306 /*
1307 * Setup our list pointers now so that the current
1308 * parents become children in the next iteration.
1309 */
1310 left_el = left_path->p_node[i].el;
1311 right_el = right_path->p_node[i].el;
1312 }
1313
1314 /*
1315 * At the root node, adjust the two adjacent records which
1316 * begin our path to the leaves.
1317 */
1318
1319 el = left_path->p_node[subtree_index].el;
1320 left_el = left_path->p_node[subtree_index + 1].el;
1321 right_el = right_path->p_node[subtree_index + 1].el;
1322
1323 ocfs2_adjust_root_records(el, left_el, right_el,
1324 left_path->p_node[subtree_index + 1].bh->b_blocknr);
1325
1326 root_bh = left_path->p_node[subtree_index].bh;
1327
1328 ret = ocfs2_journal_dirty(handle, root_bh);
1329 if (ret)
1330 mlog_errno(ret);
1331}
1332
1333static int ocfs2_rotate_subtree_right(struct inode *inode,
1334 handle_t *handle,
1335 struct ocfs2_path *left_path,
1336 struct ocfs2_path *right_path,
1337 int subtree_index)
1338{
1339 int ret, i;
1340 struct buffer_head *right_leaf_bh;
1341 struct buffer_head *left_leaf_bh = NULL;
1342 struct buffer_head *root_bh;
1343 struct ocfs2_extent_list *right_el, *left_el;
1344 struct ocfs2_extent_rec move_rec;
1345
1346 left_leaf_bh = path_leaf_bh(left_path);
1347 left_el = path_leaf_el(left_path);
1348
1349 if (left_el->l_next_free_rec != left_el->l_count) {
1350 ocfs2_error(inode->i_sb,
1351 "Inode %llu has non-full interior leaf node %llu"
1352 "(next free = %u)",
1353 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1354 (unsigned long long)left_leaf_bh->b_blocknr,
1355 le16_to_cpu(left_el->l_next_free_rec));
1356 return -EROFS;
1357 }
1358
1359 /*
1360 * This extent block may already have an empty record, so we
1361 * return early if so.
1362 */
1363 if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1364 return 0;
1365
1366 root_bh = left_path->p_node[subtree_index].bh;
1367 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1368
1369 ret = ocfs2_journal_access(handle, inode, root_bh,
1370 OCFS2_JOURNAL_ACCESS_WRITE);
1371 if (ret) {
1372 mlog_errno(ret);
1373 goto out;
1374 }
1375
1376 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1377 ret = ocfs2_journal_access(handle, inode,
1378 right_path->p_node[i].bh,
1379 OCFS2_JOURNAL_ACCESS_WRITE);
1380 if (ret) {
1381 mlog_errno(ret);
1382 goto out;
1383 }
1384
1385 ret = ocfs2_journal_access(handle, inode,
1386 left_path->p_node[i].bh,
1387 OCFS2_JOURNAL_ACCESS_WRITE);
1388 if (ret) {
1389 mlog_errno(ret);
1390 goto out;
1391 }
1392 }
1393
1394 right_leaf_bh = path_leaf_bh(right_path);
1395 right_el = path_leaf_el(right_path);
1396
1397 /* This is a code error, not a disk corruption. */
1398 mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1399 "because rightmost leaf block %llu is empty\n",
1400 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1401 (unsigned long long)right_leaf_bh->b_blocknr);
1402
1403 ocfs2_create_empty_extent(right_el);
1404
1405 ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1406 if (ret) {
1407 mlog_errno(ret);
1408 goto out;
1409 }
1410
1411 /* Do the copy now. */
1412 i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1413 move_rec = left_el->l_recs[i];
1414 right_el->l_recs[0] = move_rec;
1415
1416 /*
1417 * Clear out the record we just copied and shift everything
1418 * over, leaving an empty extent in the left leaf.
1419 *
1420 * We temporarily subtract from next_free_rec so that the
1421 * shift will lose the tail record (which is now defunct).
1422 */
1423 le16_add_cpu(&left_el->l_next_free_rec, -1);
1424 ocfs2_shift_records_right(left_el);
1425 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1426 le16_add_cpu(&left_el->l_next_free_rec, 1);
1427
1428 ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1429 if (ret) {
1430 mlog_errno(ret);
1431 goto out;
1432 }
1433
1434 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1435 subtree_index);
1436
1437out:
1438 return ret;
1439}
1440
1441/*
1442 * Given a full path, determine what cpos value would return us a path
1443 * containing the leaf immediately to the left of the current one.
1444 *
1445 * Will return zero if the path passed in is already the leftmost path.
1446 */
1447static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
1448 struct ocfs2_path *path, u32 *cpos)
1449{
1450 int i, j, ret = 0;
1451 u64 blkno;
1452 struct ocfs2_extent_list *el;
1453
1454 *cpos = 0;
1455
1456 blkno = path_leaf_bh(path)->b_blocknr;
1457
1458 /* Start at the tree node just above the leaf and work our way up. */
1459 i = path->p_tree_depth - 1;
1460 while (i >= 0) {
1461 el = path->p_node[i].el;
1462
1463 /*
1464 * Find the extent record just before the one in our
1465 * path.
1466 */
1467 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
1468 if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
1469 if (j == 0) {
1470 if (i == 0) {
1471 /*
1472 * We've determined that the
1473 * path specified is already
1474 * the leftmost one - return a
1475 * cpos of zero.
1476 */
1477 goto out;
1478 }
1479 /*
1480 * The leftmost record points to our
1481 * leaf - we need to travel up the
1482 * tree one level.
1483 */
1484 goto next_node;
1485 }
1486
1487 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
1488 *cpos = *cpos + le32_to_cpu(el->l_recs[j - 1].e_clusters) - 1;
1489 goto out;
1490 }
1491 }
1492
1493 /*
1494 * If we got here, we never found a valid node where
1495 * the tree indicated one should be.
1496 */
1497 ocfs2_error(sb,
1498 "Invalid extent tree at extent block %llu\n",
1499 (unsigned long long)blkno);
1500 ret = -EROFS;
1501 goto out;
1502
1503next_node:
1504 blkno = path->p_node[i].bh->b_blocknr;
1505 i--;
1506 }
1507
1508out:
1509 return ret;
1510}
1511
1512static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1513 struct ocfs2_path *path)
1514{
1515 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1;
1516
1517 if (handle->h_buffer_credits < credits)
1518 return ocfs2_extend_trans(handle, credits);
1519
1520 return 0;
1521}
1522
1523/*
1524 * Trap the case where we're inserting into the theoretical range past
1525 * the _actual_ left leaf range. Otherwise, we'll rotate a record
1526 * whose cpos is less than ours into the right leaf.
1527 *
1528 * It's only necessary to look at the rightmost record of the left
1529 * leaf because the logic that calls us should ensure that the
1530 * theoretical ranges in the path components above the leaves are
1531 * correct.
1532 */
1533static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1534 u32 insert_cpos)
1535{
1536 struct ocfs2_extent_list *left_el;
1537 struct ocfs2_extent_rec *rec;
1538 int next_free;
1539
1540 left_el = path_leaf_el(left_path);
1541 next_free = le16_to_cpu(left_el->l_next_free_rec);
1542 rec = &left_el->l_recs[next_free - 1];
1543
1544 if (insert_cpos > le32_to_cpu(rec->e_cpos))
1545 return 1;
1546 return 0;
1547}
1548
1549/*
1550 * Rotate all the records in a btree right one record, starting at insert_cpos.
1551 *
1552 * The path to the rightmost leaf should be passed in.
1553 *
1554 * The array is assumed to be large enough to hold an entire path (tree depth).
1555 *
1556 * Upon succesful return from this function:
1557 *
1558 * - The 'right_path' array will contain a path to the leaf block
1559 * whose range contains e_cpos.
1560 * - That leaf block will have a single empty extent in list index 0.
1561 * - In the case that the rotation requires a post-insert update,
1562 * *ret_left_path will contain a valid path which can be passed to
1563 * ocfs2_insert_path().
1564 */
1565static int ocfs2_rotate_tree_right(struct inode *inode,
1566 handle_t *handle,
1567 u32 insert_cpos,
1568 struct ocfs2_path *right_path,
1569 struct ocfs2_path **ret_left_path)
1570{
1571 int ret, start;
1572 u32 cpos;
1573 struct ocfs2_path *left_path = NULL;
1574
1575 *ret_left_path = NULL;
1576
1577 left_path = ocfs2_new_path(path_root_bh(right_path),
1578 path_root_el(right_path));
1579 if (!left_path) {
1580 ret = -ENOMEM;
1581 mlog_errno(ret);
1582 goto out;
1583 }
1584
1585 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
1586 if (ret) {
1587 mlog_errno(ret);
1588 goto out;
1589 }
1590
1591 mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
1592
1593 /*
1594 * What we want to do here is:
1595 *
1596 * 1) Start with the rightmost path.
1597 *
1598 * 2) Determine a path to the leaf block directly to the left
1599 * of that leaf.
1600 *
1601 * 3) Determine the 'subtree root' - the lowest level tree node
1602 * which contains a path to both leaves.
1603 *
1604 * 4) Rotate the subtree.
1605 *
1606 * 5) Find the next subtree by considering the left path to be
1607 * the new right path.
1608 *
1609 * The check at the top of this while loop also accepts
1610 * insert_cpos == cpos because cpos is only a _theoretical_
1611 * value to get us the left path - insert_cpos might very well
1612 * be filling that hole.
1613 *
1614 * Stop at a cpos of '0' because we either started at the
1615 * leftmost branch (i.e., a tree with one branch and a
1616 * rotation inside of it), or we've gone as far as we can in
1617 * rotating subtrees.
1618 */
1619 while (cpos && insert_cpos <= cpos) {
1620 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
1621 insert_cpos, cpos);
1622
1623 ret = ocfs2_find_path(inode, left_path, cpos);
1624 if (ret) {
1625 mlog_errno(ret);
1626 goto out;
1627 }
1628
1629 mlog_bug_on_msg(path_leaf_bh(left_path) ==
1630 path_leaf_bh(right_path),
1631 "Inode %lu: error during insert of %u "
1632 "(left path cpos %u) results in two identical "
1633 "paths ending at %llu\n",
1634 inode->i_ino, insert_cpos, cpos,
1635 (unsigned long long)
1636 path_leaf_bh(left_path)->b_blocknr);
1637
1638 if (ocfs2_rotate_requires_path_adjustment(left_path,
1639 insert_cpos)) {
1640 mlog(0, "Path adjustment required\n");
1641
1642 /*
1643 * We've rotated the tree as much as we
1644 * should. The rest is up to
1645 * ocfs2_insert_path() to complete, after the
1646 * record insertion. We indicate this
1647 * situation by returning the left path.
1648 *
1649 * The reason we don't adjust the records here
1650 * before the record insert is that an error
1651 * later might break the rule where a parent
1652 * record e_cpos will reflect the actual
1653 * e_cpos of the 1st nonempty record of the
1654 * child list.
1655 */
1656 *ret_left_path = left_path;
1657 goto out_ret_path;
1658 }
1659
1660 start = ocfs2_find_subtree_root(inode, left_path, right_path);
1661
1662 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
1663 start,
1664 (unsigned long long) right_path->p_node[start].bh->b_blocknr,
1665 right_path->p_tree_depth);
1666
1667 ret = ocfs2_extend_rotate_transaction(handle, start,
1668 right_path);
1669 if (ret) {
1670 mlog_errno(ret);
1671 goto out;
1672 }
1673
1674 ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
1675 right_path, start);
1676 if (ret) {
1677 mlog_errno(ret);
1678 goto out;
1679 }
1680
1681 /*
1682 * There is no need to re-read the next right path
1683 * as we know that it'll be our current left
1684 * path. Optimize by copying values instead.
1685 */
1686 ocfs2_mv_path(right_path, left_path);
1687
1688 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
1689 &cpos);
1690 if (ret) {
1691 mlog_errno(ret);
1692 goto out;
1693 }
1694 }
1695
1696out:
1697 ocfs2_free_path(left_path);
1698
1699out_ret_path:
1700 return ret;
1701}
1702
1703/*
1704 * Do the final bits of extent record insertion at the target leaf
1705 * list. If this leaf is part of an allocation tree, it is assumed
1706 * that the tree above has been prepared.
1707 */
1708static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1709 struct ocfs2_extent_list *el,
1710 struct ocfs2_insert_type *insert,
1711 struct inode *inode)
1712{
1713 int i = insert->ins_contig_index;
1714 unsigned int range;
1715 struct ocfs2_extent_rec *rec;
1716
1717 BUG_ON(el->l_tree_depth);
1718
1719 /*
1720 * Contiguous insert - either left or right.
1721 */
1722 if (insert->ins_contig != CONTIG_NONE) {
1723 rec = &el->l_recs[i];
1724 if (insert->ins_contig == CONTIG_LEFT) {
1725 rec->e_blkno = insert_rec->e_blkno;
1726 rec->e_cpos = insert_rec->e_cpos;
1727 }
1728 le32_add_cpu(&rec->e_clusters,
1729 le32_to_cpu(insert_rec->e_clusters));
1730 return;
1731 }
1732
1733 /*
1734 * Handle insert into an empty leaf.
1735 */
1736 if (le16_to_cpu(el->l_next_free_rec) == 0 ||
1737 ((le16_to_cpu(el->l_next_free_rec) == 1) &&
1738 ocfs2_is_empty_extent(&el->l_recs[0]))) {
1739 el->l_recs[0] = *insert_rec;
1740 el->l_next_free_rec = cpu_to_le16(1);
1741 return;
1742 }
1743
1744 /*
1745 * Appending insert.
1746 */
1747 if (insert->ins_appending == APPEND_TAIL) {
1748 i = le16_to_cpu(el->l_next_free_rec) - 1;
1749 rec = &el->l_recs[i];
1750 range = le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters);
1751 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
1752
1753 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
1754 le16_to_cpu(el->l_count),
1755 "inode %lu, depth %u, count %u, next free %u, "
1756 "rec.cpos %u, rec.clusters %u, "
1757 "insert.cpos %u, insert.clusters %u\n",
1758 inode->i_ino,
1759 le16_to_cpu(el->l_tree_depth),
1760 le16_to_cpu(el->l_count),
1761 le16_to_cpu(el->l_next_free_rec),
1762 le32_to_cpu(el->l_recs[i].e_cpos),
1763 le32_to_cpu(el->l_recs[i].e_clusters),
1764 le32_to_cpu(insert_rec->e_cpos),
1765 le32_to_cpu(insert_rec->e_clusters));
1766 i++;
1767 el->l_recs[i] = *insert_rec;
1768 le16_add_cpu(&el->l_next_free_rec, 1);
1769 return;
1770 }
1771
1772 /*
1773 * Ok, we have to rotate.
1774 *
1775 * At this point, it is safe to assume that inserting into an
1776 * empty leaf and appending to a leaf have both been handled
1777 * above.
1778 *
1779 * This leaf needs to have space, either by the empty 1st
1780 * extent record, or by virtue of an l_next_rec < l_count.
1781 */
1782 ocfs2_rotate_leaf(el, insert_rec);
1783}
1784
1785static inline void ocfs2_update_dinode_clusters(struct inode *inode,
1786 struct ocfs2_dinode *di,
1787 u32 clusters)
1788{
1789 le32_add_cpu(&di->i_clusters, clusters);
1790 spin_lock(&OCFS2_I(inode)->ip_lock);
1791 OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
1792 spin_unlock(&OCFS2_I(inode)->ip_lock);
1793}
1794
1795static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1796 struct ocfs2_extent_rec *insert_rec,
1797 struct ocfs2_path *right_path,
1798 struct ocfs2_path **ret_left_path)
1799{
1800 int ret, i, next_free;
1801 struct buffer_head *bh;
1802 struct ocfs2_extent_list *el;
1803 struct ocfs2_path *left_path = NULL;
1804
1805 *ret_left_path = NULL;
1806
1807 /*
1808 * If our appending insert is at the leftmost edge of a leaf,
1809 * then we might need to update the rightmost records of the
1810 * neighboring path.
1811 */
1812 el = path_leaf_el(right_path);
1813 next_free = le16_to_cpu(el->l_next_free_rec);
1814 if (next_free == 0 ||
1815 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
1816 u32 left_cpos;
1817
1818 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
1819 &left_cpos);
1820 if (ret) {
1821 mlog_errno(ret);
1822 goto out;
1823 }
1824
1825 mlog(0, "Append may need a left path update. cpos: %u, "
1826 "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
1827 left_cpos);
1828
1829 /*
1830 * No need to worry if the append is already in the
1831 * leftmost leaf.
1832 */
1833 if (left_cpos) {
1834 left_path = ocfs2_new_path(path_root_bh(right_path),
1835 path_root_el(right_path));
1836 if (!left_path) {
1837 ret = -ENOMEM;
1838 mlog_errno(ret);
1839 goto out;
1840 }
1841
1842 ret = ocfs2_find_path(inode, left_path, left_cpos);
1843 if (ret) {
1844 mlog_errno(ret);
1845 goto out;
1846 }
1847
1848 /*
1849 * ocfs2_insert_path() will pass the left_path to the
1850 * journal for us.
1851 */
1852 }
1853 }
1854
1855 ret = ocfs2_journal_access_path(inode, handle, right_path);
1856 if (ret) {
1857 mlog_errno(ret);
1858 goto out;
1859 }
1860
1861 el = path_root_el(right_path);
1862 bh = path_root_bh(right_path);
1863 i = 0;
1864 while (1) {
1865 next_free = le16_to_cpu(el->l_next_free_rec);
1866 if (next_free == 0) {
1867 ocfs2_error(inode->i_sb,
1868 "Dinode %llu has a bad extent list",
1869 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1870 ret = -EIO;
1871 goto out;
1872 }
1873
1874 el->l_recs[next_free - 1].e_clusters = insert_rec->e_cpos;
1875 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
1876 le32_to_cpu(insert_rec->e_clusters));
1877 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
1878 -le32_to_cpu(el->l_recs[next_free - 1].e_cpos));
1879
1880 ret = ocfs2_journal_dirty(handle, bh);
1881 if (ret)
1882 mlog_errno(ret);
1883
1884 if (++i >= right_path->p_tree_depth)
1885 break;
1886
1887 bh = right_path->p_node[i].bh;
1888 el = right_path->p_node[i].el;
1889 }
1890
1891 *ret_left_path = left_path;
1892 ret = 0;
1893out:
1894 if (ret != 0)
1895 ocfs2_free_path(left_path);
1896
1897 return ret;
1898}
1899
1900/*
1901 * This function only does inserts on an allocation b-tree. For dinode
1902 * lists, ocfs2_insert_at_leaf() is called directly.
1903 *
1904 * right_path is the path we want to do the actual insert
1905 * in. left_path should only be passed in if we need to update that
1906 * portion of the tree after an edge insert.
1907 */
1908static int ocfs2_insert_path(struct inode *inode,
1909 handle_t *handle,
1910 struct ocfs2_path *left_path,
1911 struct ocfs2_path *right_path,
1912 struct ocfs2_extent_rec *insert_rec,
1913 struct ocfs2_insert_type *insert)
1914{
1915 int ret, subtree_index;
1916 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
1917 struct ocfs2_extent_list *el;
1918
1919 /*
1920 * Pass both paths to the journal. The majority of inserts
1921 * will be touching all components anyway.
1922 */
1923 ret = ocfs2_journal_access_path(inode, handle, right_path);
1924 if (ret < 0) {
1925 mlog_errno(ret);
1926 goto out;
1927 }
1928
1929 if (left_path) {
1930 int credits = handle->h_buffer_credits;
1931
1932 /*
1933 * There's a chance that left_path got passed back to
1934 * us without being accounted for in the
1935 * journal. Extend our transaction here to be sure we
1936 * can change those blocks.
1937 */
1938 credits += left_path->p_tree_depth;
1939
1940 ret = ocfs2_extend_trans(handle, credits);
1941 if (ret < 0) {
1942 mlog_errno(ret);
1943 goto out;
1944 }
1945
1946 ret = ocfs2_journal_access_path(inode, handle, left_path);
1947 if (ret < 0) {
1948 mlog_errno(ret);
1949 goto out;
1950 }
1951 }
1952
1953 el = path_leaf_el(right_path);
1954
1955 ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
1956 ret = ocfs2_journal_dirty(handle, leaf_bh);
1957 if (ret)
1958 mlog_errno(ret);
1959
1960 if (left_path) {
1961 /*
1962 * The rotate code has indicated that we need to fix
1963 * up portions of the tree after the insert.
1964 *
1965 * XXX: Should we extend the transaction here?
1966 */
1967 subtree_index = ocfs2_find_subtree_root(inode, left_path,
1968 right_path);
1969 ocfs2_complete_edge_insert(inode, handle, left_path,
1970 right_path, subtree_index);
1971 }
1972
1973 ret = 0;
1974out:
1975 return ret;
1976}
1977
1978static int ocfs2_do_insert_extent(struct inode *inode,
1979 handle_t *handle,
1980 struct buffer_head *di_bh,
1981 struct ocfs2_extent_rec *insert_rec,
1982 struct ocfs2_insert_type *type)
1983{
1984 int ret, rotate = 0;
1985 u32 cpos;
1986 struct ocfs2_path *right_path = NULL;
1987 struct ocfs2_path *left_path = NULL;
1988 struct ocfs2_dinode *di;
1989 struct ocfs2_extent_list *el;
1990
1991 di = (struct ocfs2_dinode *) di_bh->b_data;
1992 el = &di->id2.i_list;
1993
1994 ret = ocfs2_journal_access(handle, inode, di_bh,
1995 OCFS2_JOURNAL_ACCESS_WRITE);
1996 if (ret) {
1997 mlog_errno(ret);
1998 goto out;
1999 }
2000
2001 if (le16_to_cpu(el->l_tree_depth) == 0) {
2002 ocfs2_insert_at_leaf(insert_rec, el, type, inode);
2003 goto out_update_clusters;
2004 }
2005
2006 right_path = ocfs2_new_inode_path(di_bh);
2007 if (!right_path) {
2008 ret = -ENOMEM;
2009 mlog_errno(ret);
2010 goto out;
2011 }
2012
2013 /*
2014 * Determine the path to start with. Rotations need the
2015 * rightmost path, everything else can go directly to the
2016 * target leaf.
2017 */
2018 cpos = le32_to_cpu(insert_rec->e_cpos);
2019 if (type->ins_appending == APPEND_NONE &&
2020 type->ins_contig == CONTIG_NONE) {
2021 rotate = 1;
2022 cpos = UINT_MAX;
2023 }
2024
2025 ret = ocfs2_find_path(inode, right_path, cpos);
2026 if (ret) {
2027 mlog_errno(ret);
2028 goto out;
2029 }
2030
2031 /*
2032 * Rotations and appends need special treatment - they modify
2033 * parts of the tree's above them.
2034 *
2035 * Both might pass back a path immediate to the left of the
2036 * one being inserted to. This will be cause
2037 * ocfs2_insert_path() to modify the rightmost records of
2038 * left_path to account for an edge insert.
2039 *
2040 * XXX: When modifying this code, keep in mind that an insert
2041 * can wind up skipping both of these two special cases...
2042 */
2043 if (rotate) {
2044 ret = ocfs2_rotate_tree_right(inode, handle,
2045 le32_to_cpu(insert_rec->e_cpos),
2046 right_path, &left_path);
2047 if (ret) {
2048 mlog_errno(ret);
2049 goto out;
2050 }
2051 } else if (type->ins_appending == APPEND_TAIL
2052 && type->ins_contig != CONTIG_LEFT) {
2053 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
2054 right_path, &left_path);
2055 if (ret) {
2056 mlog_errno(ret);
2057 goto out;
2058 }
2059 }
2060
2061 ret = ocfs2_insert_path(inode, handle, left_path, right_path,
2062 insert_rec, type);
2063 if (ret) {
2064 mlog_errno(ret);
2065 goto out;
2066 }
2067
2068out_update_clusters:
2069 ocfs2_update_dinode_clusters(inode, di,
2070 le32_to_cpu(insert_rec->e_clusters));
2071
2072 ret = ocfs2_journal_dirty(handle, di_bh);
2073 if (ret)
2074 mlog_errno(ret);
2075
2076out:
2077 ocfs2_free_path(left_path);
2078 ocfs2_free_path(right_path);
2079
2080 return ret;
2081}
2082
2083static void ocfs2_figure_contig_type(struct inode *inode,
2084 struct ocfs2_insert_type *insert,
2085 struct ocfs2_extent_list *el,
2086 struct ocfs2_extent_rec *insert_rec)
2087{
2088 int i;
2089 enum ocfs2_contig_type contig_type = CONTIG_NONE;
2090
2091 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
2092 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
2093 insert_rec);
2094 if (contig_type != CONTIG_NONE) {
2095 insert->ins_contig_index = i;
2096 break;
2097 }
2098 }
2099 insert->ins_contig = contig_type;
2100}
2101
2102/*
2103 * This should only be called against the righmost leaf extent list.
2104 *
2105 * ocfs2_figure_appending_type() will figure out whether we'll have to
2106 * insert at the tail of the rightmost leaf.
2107 *
2108 * This should also work against the dinode list for tree's with 0
2109 * depth. If we consider the dinode list to be the rightmost leaf node
2110 * then the logic here makes sense.
2111 */
2112static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
2113 struct ocfs2_extent_list *el,
2114 struct ocfs2_extent_rec *insert_rec)
2115{
2116 int i;
2117 u32 cpos = le32_to_cpu(insert_rec->e_cpos);
2118 struct ocfs2_extent_rec *rec;
2119
2120 insert->ins_appending = APPEND_NONE;
2121
2122 BUG_ON(el->l_tree_depth);
2123
2124 if (!el->l_next_free_rec)
2125 goto set_tail_append;
2126
2127 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
2128 /* Were all records empty? */
2129 if (le16_to_cpu(el->l_next_free_rec) == 1)
2130 goto set_tail_append;
845 } 2131 }
846 2132
847 /* Can we allocate without adding/shifting tree bits? */
848 i = le16_to_cpu(el->l_next_free_rec) - 1; 2133 i = le16_to_cpu(el->l_next_free_rec) - 1;
849 if (le16_to_cpu(el->l_next_free_rec) == 0 2134 rec = &el->l_recs[i];
850 || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count)) 2135
851 || le32_to_cpu(el->l_recs[i].e_clusters) == 0 2136 if (cpos >= (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)))
852 || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) 2137 goto set_tail_append;
853 goto out_add;
854 2138
855 mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing " 2139 return;
856 "tree now.\n"); 2140
2141set_tail_append:
2142 insert->ins_appending = APPEND_TAIL;
2143}
2144
2145/*
2146 * Helper function called at the begining of an insert.
2147 *
2148 * This computes a few things that are commonly used in the process of
2149 * inserting into the btree:
2150 * - Whether the new extent is contiguous with an existing one.
2151 * - The current tree depth.
2152 * - Whether the insert is an appending one.
2153 * - The total # of free records in the tree.
2154 *
2155 * All of the information is stored on the ocfs2_insert_type
2156 * structure.
2157 */
2158static int ocfs2_figure_insert_type(struct inode *inode,
2159 struct buffer_head *di_bh,
2160 struct buffer_head **last_eb_bh,
2161 struct ocfs2_extent_rec *insert_rec,
2162 struct ocfs2_insert_type *insert)
2163{
2164 int ret;
2165 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2166 struct ocfs2_extent_block *eb;
2167 struct ocfs2_extent_list *el;
2168 struct ocfs2_path *path = NULL;
2169 struct buffer_head *bh = NULL;
2170
2171 el = &di->id2.i_list;
2172 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
2173
2174 if (el->l_tree_depth) {
2175 /*
2176 * If we have tree depth, we read in the
2177 * rightmost extent block ahead of time as
2178 * ocfs2_figure_insert_type() and ocfs2_add_branch()
2179 * may want it later.
2180 */
2181 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
2182 le64_to_cpu(di->i_last_eb_blk), &bh,
2183 OCFS2_BH_CACHED, inode);
2184 if (ret) {
2185 mlog_exit(ret);
2186 goto out;
2187 }
2188 eb = (struct ocfs2_extent_block *) bh->b_data;
2189 el = &eb->h_list;
2190 }
2191
2192 /*
2193 * Unless we have a contiguous insert, we'll need to know if
2194 * there is room left in our allocation tree for another
2195 * extent record.
2196 *
2197 * XXX: This test is simplistic, we can search for empty
2198 * extent records too.
2199 */
2200 insert->ins_free_records = le16_to_cpu(el->l_count) -
2201 le16_to_cpu(el->l_next_free_rec);
2202
2203 if (!insert->ins_tree_depth) {
2204 ocfs2_figure_contig_type(inode, insert, el, insert_rec);
2205 ocfs2_figure_appending_type(insert, el, insert_rec);
2206 return 0;
2207 }
2208
2209 path = ocfs2_new_inode_path(di_bh);
2210 if (!path) {
2211 ret = -ENOMEM;
2212 mlog_errno(ret);
2213 goto out;
2214 }
2215
2216 /*
2217 * In the case that we're inserting past what the tree
2218 * currently accounts for, ocfs2_find_path() will return for
2219 * us the rightmost tree path. This is accounted for below in
2220 * the appending code.
2221 */
2222 ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
2223 if (ret) {
2224 mlog_errno(ret);
2225 goto out;
2226 }
2227
2228 el = path_leaf_el(path);
2229
2230 /*
2231 * Now that we have the path, there's two things we want to determine:
2232 * 1) Contiguousness (also set contig_index if this is so)
2233 *
2234 * 2) Are we doing an append? We can trivially break this up
2235 * into two types of appends: simple record append, or a
2236 * rotate inside the tail leaf.
2237 */
2238 ocfs2_figure_contig_type(inode, insert, el, insert_rec);
2239
2240 /*
2241 * The insert code isn't quite ready to deal with all cases of
2242 * left contiguousness. Specifically, if it's an insert into
2243 * the 1st record in a leaf, it will require the adjustment of
2244 * e_clusters on the last record of the path directly to it's
2245 * left. For now, just catch that case and fool the layers
2246 * above us. This works just fine for tree_depth == 0, which
2247 * is why we allow that above.
2248 */
2249 if (insert->ins_contig == CONTIG_LEFT &&
2250 insert->ins_contig_index == 0)
2251 insert->ins_contig = CONTIG_NONE;
2252
2253 /*
2254 * Ok, so we can simply compare against last_eb to figure out
2255 * whether the path doesn't exist. This will only happen in
2256 * the case that we're doing a tail append, so maybe we can
2257 * take advantage of that information somehow.
2258 */
2259 if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
2260 /*
2261 * Ok, ocfs2_find_path() returned us the rightmost
2262 * tree path. This might be an appending insert. There are
2263 * two cases:
2264 * 1) We're doing a true append at the tail:
2265 * -This might even be off the end of the leaf
2266 * 2) We're "appending" by rotating in the tail
2267 */
2268 ocfs2_figure_appending_type(insert, el, insert_rec);
2269 }
2270
2271out:
2272 ocfs2_free_path(path);
2273
2274 if (ret == 0)
2275 *last_eb_bh = bh;
2276 else
2277 brelse(bh);
2278 return ret;
2279}
2280
2281/*
2282 * Insert an extent into an inode btree.
2283 *
2284 * The caller needs to update fe->i_clusters
2285 */
2286int ocfs2_insert_extent(struct ocfs2_super *osb,
2287 handle_t *handle,
2288 struct inode *inode,
2289 struct buffer_head *fe_bh,
2290 u32 cpos,
2291 u64 start_blk,
2292 u32 new_clusters,
2293 struct ocfs2_alloc_context *meta_ac)
2294{
2295 int status, shift;
2296 struct buffer_head *last_eb_bh = NULL;
2297 struct buffer_head *bh = NULL;
2298 struct ocfs2_insert_type insert = {0, };
2299 struct ocfs2_extent_rec rec;
2300
2301 mlog(0, "add %u clusters at position %u to inode %llu\n",
2302 new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
2303
2304 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
2305 (OCFS2_I(inode)->ip_clusters != cpos),
2306 "Device %s, asking for sparse allocation: inode %llu, "
2307 "cpos %u, clusters %u\n",
2308 osb->dev_str,
2309 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
2310 OCFS2_I(inode)->ip_clusters);
2311
2312 rec.e_cpos = cpu_to_le32(cpos);
2313 rec.e_blkno = cpu_to_le64(start_blk);
2314 rec.e_clusters = cpu_to_le32(new_clusters);
2315
2316 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
2317 &insert);
2318 if (status < 0) {
2319 mlog_errno(status);
2320 goto bail;
2321 }
2322
2323 mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
2324 "Insert.contig_index: %d, Insert.free_records: %d, "
2325 "Insert.tree_depth: %d\n",
2326 insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
2327 insert.ins_free_records, insert.ins_tree_depth);
2328
2329 /*
2330 * Avoid growing the tree unless we're out of records and the
2331 * insert type requres one.
2332 */
2333 if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records)
2334 goto out_add;
857 2335
858 shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh); 2336 shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
859 if (shift < 0) { 2337 if (shift < 0) {
@@ -866,13 +2344,9 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
866 * and didn't find room for any more extents - we need to add 2344 * and didn't find room for any more extents - we need to add
867 * another tree level */ 2345 * another tree level */
868 if (shift) { 2346 if (shift) {
869 /* if we hit a leaf, we'd better be empty :) */
870 BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
871 le16_to_cpu(el->l_count));
872 BUG_ON(bh); 2347 BUG_ON(bh);
873 mlog(0, "ocfs2_allocate_extent: need to shift tree depth " 2348 mlog(0, "need to shift tree depth "
874 "(current = %u)\n", 2349 "(current = %d)\n", insert.ins_tree_depth);
875 le16_to_cpu(fe->id2.i_list.l_tree_depth));
876 2350
877 /* ocfs2_shift_tree_depth will return us a buffer with 2351 /* ocfs2_shift_tree_depth will return us a buffer with
878 * the new extent block (so we can pass that to 2352 * the new extent block (so we can pass that to
@@ -883,15 +2357,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
883 mlog_errno(status); 2357 mlog_errno(status);
884 goto bail; 2358 goto bail;
885 } 2359 }
2360 insert.ins_tree_depth++;
886 /* Special case: we have room now if we shifted from 2361 /* Special case: we have room now if we shifted from
887 * tree_depth 0 */ 2362 * tree_depth 0 */
888 if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1)) 2363 if (insert.ins_tree_depth == 1)
889 goto out_add; 2364 goto out_add;
890 } 2365 }
891 2366
892 /* call ocfs2_add_branch to add the final part of the tree with 2367 /* call ocfs2_add_branch to add the final part of the tree with
893 * the new data. */ 2368 * the new data. */
894 mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh); 2369 mlog(0, "add branch. bh = %p\n", bh);
895 status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, 2370 status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
896 meta_ac); 2371 meta_ac);
897 if (status < 0) { 2372 if (status < 0) {
@@ -900,9 +2375,8 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
900 } 2375 }
901 2376
902out_add: 2377out_add:
903 /* Finally, we can add clusters. */ 2378 /* Finally, we can add clusters. This might rotate the tree for us. */
904 status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh, 2379 status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
905 start_blk, new_clusters);
906 if (status < 0) 2380 if (status < 0)
907 mlog_errno(status); 2381 mlog_errno(status);
908 2382
@@ -1447,140 +2921,141 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
1447 * block will be deleted, and if it will, what the new last extent 2921 * block will be deleted, and if it will, what the new last extent
1448 * block will be so we can update his h_next_leaf_blk field, as well 2922 * block will be so we can update his h_next_leaf_blk field, as well
1449 * as the dinodes i_last_eb_blk */ 2923 * as the dinodes i_last_eb_blk */
1450static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, 2924static int ocfs2_find_new_last_ext_blk(struct inode *inode,
1451 struct inode *inode,
1452 struct ocfs2_dinode *fe,
1453 u32 new_i_clusters, 2925 u32 new_i_clusters,
1454 struct buffer_head *old_last_eb, 2926 struct ocfs2_path *path,
1455 struct buffer_head **new_last_eb) 2927 struct buffer_head **new_last_eb)
1456{ 2928{
1457 int i, status = 0; 2929 int ret = 0;
1458 u64 block = 0; 2930 u32 cpos;
1459 struct ocfs2_extent_block *eb; 2931 struct ocfs2_extent_block *eb;
1460 struct ocfs2_extent_list *el; 2932 struct ocfs2_extent_list *el;
1461 struct buffer_head *bh = NULL; 2933 struct buffer_head *bh = NULL;
1462 2934
1463 *new_last_eb = NULL; 2935 *new_last_eb = NULL;
1464 2936
1465 if (!OCFS2_IS_VALID_DINODE(fe)) {
1466 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
1467 status = -EIO;
1468 goto bail;
1469 }
1470
1471 /* we have no tree, so of course, no last_eb. */ 2937 /* we have no tree, so of course, no last_eb. */
1472 if (!fe->id2.i_list.l_tree_depth) 2938 if (!path->p_tree_depth)
1473 goto bail; 2939 goto out;
1474 2940
1475 /* trunc to zero special case - this makes tree_depth = 0 2941 /* trunc to zero special case - this makes tree_depth = 0
1476 * regardless of what it is. */ 2942 * regardless of what it is. */
1477 if (!new_i_clusters) 2943 if (!new_i_clusters)
1478 goto bail; 2944 goto out;
1479 2945
1480 eb = (struct ocfs2_extent_block *) old_last_eb->b_data; 2946 el = path_leaf_el(path);
1481 el = &(eb->h_list);
1482 BUG_ON(!el->l_next_free_rec); 2947 BUG_ON(!el->l_next_free_rec);
1483 2948
1484 /* Make sure that this guy will actually be empty after we 2949 /* Make sure that this guy will actually be empty after we
1485 * clear away the data. */ 2950 * clear away the data. */
1486 if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters) 2951 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1487 goto bail; 2952 if (le16_to_cpu(el->l_next_free_rec) > 1 &&
2953 le32_to_cpu(el->l_recs[1].e_cpos) < new_i_clusters)
2954 goto out;
2955 } else if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
2956 goto out;
1488 2957
1489 /* Ok, at this point, we know that last_eb will definitely 2958 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
1490 * change, so lets traverse the tree and find the second to 2959 if (ret) {
1491 * last extent block. */ 2960 mlog_errno(ret);
1492 el = &(fe->id2.i_list); 2961 goto out;
1493 /* go down the tree, */ 2962 }
1494 do {
1495 for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
1496 if (le32_to_cpu(el->l_recs[i].e_cpos) <
1497 new_i_clusters) {
1498 block = le64_to_cpu(el->l_recs[i].e_blkno);
1499 break;
1500 }
1501 }
1502 BUG_ON(i < 0);
1503 2963
1504 if (bh) { 2964 ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
1505 brelse(bh); 2965 if (ret) {
1506 bh = NULL; 2966 mlog_errno(ret);
1507 } 2967 goto out;
2968 }
1508 2969
1509 status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED, 2970 eb = (struct ocfs2_extent_block *) bh->b_data;
1510 inode); 2971 el = &eb->h_list;
1511 if (status < 0) { 2972 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1512 mlog_errno(status); 2973 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1513 goto bail; 2974 ret = -EROFS;
1514 } 2975 goto out;
1515 eb = (struct ocfs2_extent_block *) bh->b_data; 2976 }
1516 el = &eb->h_list;
1517 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1518 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1519 status = -EIO;
1520 goto bail;
1521 }
1522 } while (el->l_tree_depth);
1523 2977
1524 *new_last_eb = bh; 2978 *new_last_eb = bh;
1525 get_bh(*new_last_eb); 2979 get_bh(*new_last_eb);
1526 mlog(0, "returning block %llu\n", 2980 mlog(0, "returning block %llu, (cpos: %u)\n",
1527 (unsigned long long)le64_to_cpu(eb->h_blkno)); 2981 (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
1528bail: 2982out:
1529 if (bh) 2983 brelse(bh);
1530 brelse(bh);
1531 2984
1532 return status; 2985 return ret;
1533} 2986}
1534 2987
1535static int ocfs2_do_truncate(struct ocfs2_super *osb, 2988static int ocfs2_do_truncate(struct ocfs2_super *osb,
1536 unsigned int clusters_to_del, 2989 unsigned int clusters_to_del,
1537 struct inode *inode, 2990 struct inode *inode,
1538 struct buffer_head *fe_bh, 2991 struct buffer_head *fe_bh,
1539 struct buffer_head *old_last_eb_bh,
1540 handle_t *handle, 2992 handle_t *handle,
1541 struct ocfs2_truncate_context *tc) 2993 struct ocfs2_truncate_context *tc,
2994 struct ocfs2_path *path)
1542{ 2995{
1543 int status, i, depth; 2996 int status, i, index;
1544 struct ocfs2_dinode *fe; 2997 struct ocfs2_dinode *fe;
1545 struct ocfs2_extent_block *eb; 2998 struct ocfs2_extent_block *eb;
1546 struct ocfs2_extent_block *last_eb = NULL; 2999 struct ocfs2_extent_block *last_eb = NULL;
1547 struct ocfs2_extent_list *el; 3000 struct ocfs2_extent_list *el;
1548 struct buffer_head *eb_bh = NULL; 3001 struct buffer_head *eb_bh = NULL;
1549 struct buffer_head *last_eb_bh = NULL; 3002 struct buffer_head *last_eb_bh = NULL;
1550 u64 next_eb = 0;
1551 u64 delete_blk = 0; 3003 u64 delete_blk = 0;
1552 3004
1553 fe = (struct ocfs2_dinode *) fe_bh->b_data; 3005 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1554 3006
1555 status = ocfs2_find_new_last_ext_blk(osb, 3007 status = ocfs2_find_new_last_ext_blk(inode,
1556 inode,
1557 fe,
1558 le32_to_cpu(fe->i_clusters) - 3008 le32_to_cpu(fe->i_clusters) -
1559 clusters_to_del, 3009 clusters_to_del,
1560 old_last_eb_bh, 3010 path, &last_eb_bh);
1561 &last_eb_bh);
1562 if (status < 0) { 3011 if (status < 0) {
1563 mlog_errno(status); 3012 mlog_errno(status);
1564 goto bail; 3013 goto bail;
1565 } 3014 }
1566 if (last_eb_bh) 3015
3016 /*
3017 * Each component will be touched, so we might as well journal
3018 * here to avoid having to handle errors later.
3019 */
3020 for (i = 0; i < path_num_items(path); i++) {
3021 status = ocfs2_journal_access(handle, inode,
3022 path->p_node[i].bh,
3023 OCFS2_JOURNAL_ACCESS_WRITE);
3024 if (status < 0) {
3025 mlog_errno(status);
3026 goto bail;
3027 }
3028 }
3029
3030 if (last_eb_bh) {
3031 status = ocfs2_journal_access(handle, inode, last_eb_bh,
3032 OCFS2_JOURNAL_ACCESS_WRITE);
3033 if (status < 0) {
3034 mlog_errno(status);
3035 goto bail;
3036 }
3037
1567 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 3038 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
3039 }
1568 3040
1569 status = ocfs2_journal_access(handle, inode, fe_bh, 3041 el = &(fe->id2.i_list);
1570 OCFS2_JOURNAL_ACCESS_WRITE); 3042
1571 if (status < 0) { 3043 /*
1572 mlog_errno(status); 3044 * Lower levels depend on this never happening, but it's best
3045 * to check it up here before changing the tree.
3046 */
3047 if (el->l_tree_depth && ocfs2_is_empty_extent(&el->l_recs[0])) {
3048 ocfs2_error(inode->i_sb,
3049 "Inode %lu has an empty extent record, depth %u\n",
3050 inode->i_ino, le16_to_cpu(el->l_tree_depth));
1573 goto bail; 3051 goto bail;
1574 } 3052 }
1575 el = &(fe->id2.i_list);
1576 3053
1577 spin_lock(&OCFS2_I(inode)->ip_lock); 3054 spin_lock(&OCFS2_I(inode)->ip_lock);
1578 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - 3055 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
1579 clusters_to_del; 3056 clusters_to_del;
1580 spin_unlock(&OCFS2_I(inode)->ip_lock); 3057 spin_unlock(&OCFS2_I(inode)->ip_lock);
1581 le32_add_cpu(&fe->i_clusters, -clusters_to_del); 3058 le32_add_cpu(&fe->i_clusters, -clusters_to_del);
1582 fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
1583 fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
1584 3059
1585 i = le16_to_cpu(el->l_next_free_rec) - 1; 3060 i = le16_to_cpu(el->l_next_free_rec) - 1;
1586 3061
@@ -1589,26 +3064,34 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
1589 /* tree depth zero, we can just delete the clusters, otherwise 3064 /* tree depth zero, we can just delete the clusters, otherwise
1590 * we need to record the offset of the next level extent block 3065 * we need to record the offset of the next level extent block
1591 * as we may overwrite it. */ 3066 * as we may overwrite it. */
1592 if (!el->l_tree_depth) 3067 if (!el->l_tree_depth) {
1593 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) 3068 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
1594 + ocfs2_clusters_to_blocks(osb->sb, 3069 + ocfs2_clusters_to_blocks(osb->sb,
1595 le32_to_cpu(el->l_recs[i].e_clusters)); 3070 le32_to_cpu(el->l_recs[i].e_clusters));
1596 else
1597 next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
1598 3071
1599 if (!el->l_recs[i].e_clusters) { 3072 if (!el->l_recs[i].e_clusters) {
1600 /* if we deleted the whole extent record, then clear 3073 /* if we deleted the whole extent record, then clear
1601 * out the other fields and update the extent 3074 * out the other fields and update the extent
1602 * list. For depth > 0 trees, we've already recorded 3075 * list.
1603 * the extent block in 'next_eb' */ 3076 */
1604 el->l_recs[i].e_cpos = 0; 3077 el->l_recs[i].e_cpos = 0;
1605 el->l_recs[i].e_blkno = 0; 3078 el->l_recs[i].e_blkno = 0;
1606 BUG_ON(!el->l_next_free_rec); 3079 BUG_ON(!el->l_next_free_rec);
1607 le16_add_cpu(&el->l_next_free_rec, -1); 3080 le16_add_cpu(&el->l_next_free_rec, -1);
3081
3082 /*
3083 * The leftmost record might be an empty extent -
3084 * delete it here too.
3085 */
3086 if (i == 1 && ocfs2_is_empty_extent(&el->l_recs[0])) {
3087 el->l_recs[0].e_cpos = 0;
3088 el->l_recs[0].e_blkno = 0;
3089 el->l_next_free_rec = 0;
3090 }
3091 }
1608 } 3092 }
1609 3093
1610 depth = le16_to_cpu(el->l_tree_depth); 3094 if (le32_to_cpu(fe->i_clusters) == 0) {
1611 if (!fe->i_clusters) {
1612 /* trunc to zero is a special case. */ 3095 /* trunc to zero is a special case. */
1613 el->l_tree_depth = 0; 3096 el->l_tree_depth = 0;
1614 fe->i_last_eb_blk = 0; 3097 fe->i_last_eb_blk = 0;
@@ -1625,12 +3108,6 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
1625 /* If there will be a new last extent block, then by 3108 /* If there will be a new last extent block, then by
1626 * definition, there cannot be any leaves to the right of 3109 * definition, there cannot be any leaves to the right of
1627 * him. */ 3110 * him. */
1628 status = ocfs2_journal_access(handle, inode, last_eb_bh,
1629 OCFS2_JOURNAL_ACCESS_WRITE);
1630 if (status < 0) {
1631 mlog_errno(status);
1632 goto bail;
1633 }
1634 last_eb->h_next_leaf_blk = 0; 3111 last_eb->h_next_leaf_blk = 0;
1635 status = ocfs2_journal_dirty(handle, last_eb_bh); 3112 status = ocfs2_journal_dirty(handle, last_eb_bh);
1636 if (status < 0) { 3113 if (status < 0) {
@@ -1639,33 +3116,25 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
1639 } 3116 }
1640 } 3117 }
1641 3118
3119 index = 1;
1642 /* if our tree depth > 0, update all the tree blocks below us. */ 3120 /* if our tree depth > 0, update all the tree blocks below us. */
1643 while (depth) { 3121 while (index <= path->p_tree_depth) {
1644 mlog(0, "traveling tree (depth = %d, next_eb = %llu)\n", 3122 eb_bh = path->p_node[index].bh;
1645 depth, (unsigned long long)next_eb);
1646 status = ocfs2_read_block(osb, next_eb, &eb_bh,
1647 OCFS2_BH_CACHED, inode);
1648 if (status < 0) {
1649 mlog_errno(status);
1650 goto bail;
1651 }
1652 eb = (struct ocfs2_extent_block *)eb_bh->b_data; 3123 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
1653 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 3124 el = path->p_node[index].el;
1654 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1655 status = -EIO;
1656 goto bail;
1657 }
1658 el = &(eb->h_list);
1659 3125
1660 status = ocfs2_journal_access(handle, inode, eb_bh, 3126 mlog(0, "traveling tree (index = %d, extent block: %llu)\n",
1661 OCFS2_JOURNAL_ACCESS_WRITE); 3127 index, (unsigned long long)eb_bh->b_blocknr);
1662 if (status < 0) {
1663 mlog_errno(status);
1664 goto bail;
1665 }
1666 3128
1667 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); 3129 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
1668 BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1)); 3130 if (index !=
3131 (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
3132 ocfs2_error(inode->i_sb,
3133 "Inode %lu has invalid ext. block %llu\n",
3134 inode->i_ino,
3135 (unsigned long long)eb_bh->b_blocknr);
3136 goto bail;
3137 }
1669 3138
1670 i = le16_to_cpu(el->l_next_free_rec) - 1; 3139 i = le16_to_cpu(el->l_next_free_rec) - 1;
1671 3140
@@ -1680,7 +3149,6 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
1680 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); 3149 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
1681 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); 3150 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
1682 3151
1683 next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
1684 /* bottom-most block requires us to delete data.*/ 3152 /* bottom-most block requires us to delete data.*/
1685 if (!el->l_tree_depth) 3153 if (!el->l_tree_depth)
1686 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) 3154 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
@@ -1692,6 +3160,12 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
1692 BUG_ON(!el->l_next_free_rec); 3160 BUG_ON(!el->l_next_free_rec);
1693 le16_add_cpu(&el->l_next_free_rec, -1); 3161 le16_add_cpu(&el->l_next_free_rec, -1);
1694 } 3162 }
3163 if (i == 1 && ocfs2_is_empty_extent(&el->l_recs[0])) {
3164 el->l_recs[0].e_cpos = 0;
3165 el->l_recs[0].e_blkno = 0;
3166 el->l_next_free_rec = 0;
3167 }
3168
1695 mlog(0, "extent block %llu, after: record %d: " 3169 mlog(0, "extent block %llu, after: record %d: "
1696 "(%u, %u, %llu), next = %u\n", 3170 "(%u, %u, %llu), next = %u\n",
1697 (unsigned long long)le64_to_cpu(eb->h_blkno), i, 3171 (unsigned long long)le64_to_cpu(eb->h_blkno), i,
@@ -1714,6 +3188,22 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
1714 BUG_ON(el->l_recs[0].e_clusters); 3188 BUG_ON(el->l_recs[0].e_clusters);
1715 BUG_ON(el->l_recs[0].e_cpos); 3189 BUG_ON(el->l_recs[0].e_cpos);
1716 BUG_ON(el->l_recs[0].e_blkno); 3190 BUG_ON(el->l_recs[0].e_blkno);
3191
3192 /*
3193 * We need to remove this extent block from
3194 * the list above it.
3195 *
3196 * Since we've passed it already in this loop,
3197 * no need to worry about journaling.
3198 */
3199 el = path->p_node[index - 1].el;
3200 i = le16_to_cpu(el->l_next_free_rec) - 1;
3201 BUG_ON(i < 0);
3202 el->l_recs[i].e_cpos = 0;
3203 el->l_recs[i].e_clusters = 0;
3204 el->l_recs[i].e_blkno = 0;
3205 le16_add_cpu(&el->l_next_free_rec, -1);
3206
1717 if (eb->h_suballoc_slot == 0) { 3207 if (eb->h_suballoc_slot == 0) {
1718 /* 3208 /*
1719 * This code only understands how to 3209 * This code only understands how to
@@ -1736,9 +3226,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
1736 } 3226 }
1737 } 3227 }
1738 } 3228 }
1739 brelse(eb_bh); 3229 index++;
1740 eb_bh = NULL;
1741 depth--;
1742 } 3230 }
1743 3231
1744 BUG_ON(!delete_blk); 3232 BUG_ON(!delete_blk);
@@ -1750,10 +3238,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
1750 } 3238 }
1751 status = 0; 3239 status = 0;
1752bail: 3240bail:
1753 if (!status) 3241
1754 ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
1755 else
1756 ocfs2_extent_map_drop(inode, 0);
1757 mlog_exit(status); 3242 mlog_exit(status);
1758 return status; 3243 return status;
1759} 3244}
@@ -1770,82 +3255,70 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
1770 struct ocfs2_truncate_context *tc) 3255 struct ocfs2_truncate_context *tc)
1771{ 3256{
1772 int status, i, credits, tl_sem = 0; 3257 int status, i, credits, tl_sem = 0;
1773 u32 clusters_to_del, target_i_clusters; 3258 u32 clusters_to_del, new_highest_cpos, range;
1774 u64 last_eb = 0;
1775 struct ocfs2_dinode *fe;
1776 struct ocfs2_extent_block *eb;
1777 struct ocfs2_extent_list *el; 3259 struct ocfs2_extent_list *el;
1778 struct buffer_head *last_eb_bh;
1779 handle_t *handle = NULL; 3260 handle_t *handle = NULL;
1780 struct inode *tl_inode = osb->osb_tl_inode; 3261 struct inode *tl_inode = osb->osb_tl_inode;
3262 struct ocfs2_path *path = NULL;
1781 3263
1782 mlog_entry_void(); 3264 mlog_entry_void();
1783 3265
1784 down_write(&OCFS2_I(inode)->ip_alloc_sem); 3266 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1785 3267
1786 target_i_clusters = ocfs2_clusters_for_bytes(osb->sb, 3268 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
1787 i_size_read(inode)); 3269 i_size_read(inode));
1788 3270
1789 last_eb_bh = tc->tc_last_eb_bh; 3271 path = ocfs2_new_inode_path(fe_bh);
1790 tc->tc_last_eb_bh = NULL; 3272 if (!path) {
1791 3273 status = -ENOMEM;
1792 fe = (struct ocfs2_dinode *) fe_bh->b_data; 3274 mlog_errno(status);
1793 3275 goto bail;
1794 if (fe->id2.i_list.l_tree_depth) { 3276 }
1795 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1796 el = &eb->h_list;
1797 } else
1798 el = &fe->id2.i_list;
1799 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1800start: 3277start:
1801 mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, " 3278 /*
1802 "last_eb = %llu, fe->i_last_eb_blk = %llu, " 3279 * Truncate always works against the rightmost tree branch.
1803 "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n", 3280 */
1804 le32_to_cpu(fe->i_clusters), (unsigned long long)last_eb, 3281 status = ocfs2_find_path(inode, path, UINT_MAX);
1805 (unsigned long long)le64_to_cpu(fe->i_last_eb_blk), 3282 if (status) {
1806 le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh); 3283 mlog_errno(status);
1807 3284 goto bail;
1808 if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
1809 mlog(0, "last_eb changed!\n");
1810 BUG_ON(!fe->id2.i_list.l_tree_depth);
1811 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1812 /* i_last_eb_blk may have changed, read it if
1813 * necessary. We don't have to worry about the
1814 * truncate to zero case here (where there becomes no
1815 * last_eb) because we never loop back after our work
1816 * is done. */
1817 if (last_eb_bh) {
1818 brelse(last_eb_bh);
1819 last_eb_bh = NULL;
1820 }
1821
1822 status = ocfs2_read_block(osb, last_eb,
1823 &last_eb_bh, OCFS2_BH_CACHED,
1824 inode);
1825 if (status < 0) {
1826 mlog_errno(status);
1827 goto bail;
1828 }
1829 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1830 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1831 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1832 status = -EIO;
1833 goto bail;
1834 }
1835 el = &(eb->h_list);
1836 } 3285 }
1837 3286
1838 /* by now, el will point to the extent list on the bottom most 3287 mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
1839 * portion of this tree. */ 3288 OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
3289
3290 /*
3291 * By now, el will point to the extent list on the bottom most
3292 * portion of this tree. Only the tail record is considered in
3293 * each pass.
3294 *
3295 * We handle the following cases, in order:
3296 * - empty extent: delete the remaining branch
3297 * - remove the entire record
3298 * - remove a partial record
3299 * - no record needs to be removed (truncate has completed)
3300 */
3301 el = path_leaf_el(path);
1840 i = le16_to_cpu(el->l_next_free_rec) - 1; 3302 i = le16_to_cpu(el->l_next_free_rec) - 1;
1841 if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters) 3303 range = le32_to_cpu(el->l_recs[i].e_cpos) +
3304 le32_to_cpu(el->l_recs[i].e_clusters);
3305 if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
3306 clusters_to_del = 0;
3307 } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
1842 clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters); 3308 clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
1843 else 3309 } else if (range > new_highest_cpos) {
1844 clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) + 3310 clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
1845 le32_to_cpu(el->l_recs[i].e_cpos)) - 3311 le32_to_cpu(el->l_recs[i].e_cpos)) -
1846 target_i_clusters; 3312 new_highest_cpos;
3313 } else {
3314 status = 0;
3315 goto bail;
3316 }
1847 3317
1848 mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del); 3318 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
3319 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
3320
3321 BUG_ON(clusters_to_del == 0);
1849 3322
1850 mutex_lock(&tl_inode->i_mutex); 3323 mutex_lock(&tl_inode->i_mutex);
1851 tl_sem = 1; 3324 tl_sem = 1;
@@ -1861,7 +3334,8 @@ start:
1861 } 3334 }
1862 3335
1863 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, 3336 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
1864 fe, el); 3337 (struct ocfs2_dinode *)fe_bh->b_data,
3338 el);
1865 handle = ocfs2_start_trans(osb, credits); 3339 handle = ocfs2_start_trans(osb, credits);
1866 if (IS_ERR(handle)) { 3340 if (IS_ERR(handle)) {
1867 status = PTR_ERR(handle); 3341 status = PTR_ERR(handle);
@@ -1870,13 +3344,8 @@ start:
1870 goto bail; 3344 goto bail;
1871 } 3345 }
1872 3346
1873 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 3347 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
1874 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 3348 tc, path);
1875 if (status < 0)
1876 mlog_errno(status);
1877
1878 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
1879 last_eb_bh, handle, tc);
1880 if (status < 0) { 3349 if (status < 0) {
1881 mlog_errno(status); 3350 mlog_errno(status);
1882 goto bail; 3351 goto bail;
@@ -1888,8 +3357,12 @@ start:
1888 ocfs2_commit_trans(osb, handle); 3357 ocfs2_commit_trans(osb, handle);
1889 handle = NULL; 3358 handle = NULL;
1890 3359
1891 BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters); 3360 ocfs2_reinit_path(path, 1);
1892 if (le32_to_cpu(fe->i_clusters) > target_i_clusters) 3361
3362 /*
3363 * Only loop if we still have allocation.
3364 */
3365 if (OCFS2_I(inode)->ip_clusters)
1893 goto start; 3366 goto start;
1894bail: 3367bail:
1895 up_write(&OCFS2_I(inode)->ip_alloc_sem); 3368 up_write(&OCFS2_I(inode)->ip_alloc_sem);
@@ -1902,8 +3375,7 @@ bail:
1902 if (handle) 3375 if (handle)
1903 ocfs2_commit_trans(osb, handle); 3376 ocfs2_commit_trans(osb, handle);
1904 3377
1905 if (last_eb_bh) 3378 ocfs2_free_path(path);
1906 brelse(last_eb_bh);
1907 3379
1908 /* This will drop the ext_alloc cluster lock for us */ 3380 /* This will drop the ext_alloc cluster lock for us */
1909 ocfs2_free_truncate_context(tc); 3381 ocfs2_free_truncate_context(tc);
@@ -1912,7 +3384,6 @@ bail:
1912 return status; 3384 return status;
1913} 3385}
1914 3386
1915
1916/* 3387/*
1917 * Expects the inode to already be locked. This will figure out which 3388 * Expects the inode to already be locked. This will figure out which
1918 * inodes need to be locked and will put them on the returned truncate 3389 * inodes need to be locked and will put them on the returned truncate
@@ -1923,7 +3394,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1923 struct buffer_head *fe_bh, 3394 struct buffer_head *fe_bh,
1924 struct ocfs2_truncate_context **tc) 3395 struct ocfs2_truncate_context **tc)
1925{ 3396{
1926 int status, metadata_delete; 3397 int status, metadata_delete, i;
1927 unsigned int new_i_clusters; 3398 unsigned int new_i_clusters;
1928 struct ocfs2_dinode *fe; 3399 struct ocfs2_dinode *fe;
1929 struct ocfs2_extent_block *eb; 3400 struct ocfs2_extent_block *eb;
@@ -1944,7 +3415,8 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1944 "%llu\n", fe->i_clusters, new_i_clusters, 3415 "%llu\n", fe->i_clusters, new_i_clusters,
1945 (unsigned long long)fe->i_size); 3416 (unsigned long long)fe->i_size);
1946 3417
1947 if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) { 3418 if (!ocfs2_sparse_alloc(osb) &&
3419 le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
1948 ocfs2_error(inode->i_sb, "Dinode %llu has cluster count " 3420 ocfs2_error(inode->i_sb, "Dinode %llu has cluster count "
1949 "%u and size %llu whereas struct inode has " 3421 "%u and size %llu whereas struct inode has "
1950 "cluster count %u and size %llu which caused an " 3422 "cluster count %u and size %llu which caused an "
@@ -1986,7 +3458,15 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1986 goto bail; 3458 goto bail;
1987 } 3459 }
1988 el = &(eb->h_list); 3460 el = &(eb->h_list);
1989 if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters) 3461
3462 i = 0;
3463 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3464 i = 1;
3465 /*
3466 * XXX: Should we check that next_free_rec contains
3467 * the extent?
3468 */
3469 if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
1990 metadata_delete = 1; 3470 metadata_delete = 1;
1991 } 3471 }
1992 3472