Diffstat (limited to 'fs/ocfs2/alloc.c')
-rw-r--r--  fs/ocfs2/alloc.c  2676
1 file changed, 2444 insertions(+), 232 deletions(-)
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 19712a7d145f..f5e11f4fa952 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -50,6 +50,8 @@
50#include "buffer_head_io.h" 50#include "buffer_head_io.h"
51 51
52static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); 52static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
53static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
54 struct ocfs2_extent_block *eb);
53 55
54/* 56/*
55 * Structures which describe a path through a btree, and functions to 57 * Structures which describe a path through a btree, and functions to
@@ -117,6 +119,31 @@ static void ocfs2_free_path(struct ocfs2_path *path)
117} 119}
118 120
119/* 121/*
122 * Copy all the elements of src into dest. After this call, src could be freed
123 * without affecting dest.
124 *
125 * Both paths should have the same root. Any non-root elements of dest
126 * will be freed.
127 */
128static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
129{
130 int i;
131
132 BUG_ON(path_root_bh(dest) != path_root_bh(src));
133 BUG_ON(path_root_el(dest) != path_root_el(src));
134
135 ocfs2_reinit_path(dest, 1);
136
137 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
138 dest->p_node[i].bh = src->p_node[i].bh;
139 dest->p_node[i].el = src->p_node[i].el;
140
141 if (dest->p_node[i].bh)
142 get_bh(dest->p_node[i].bh);
143 }
144}
145
146/*
120 * Make the *dest path the same as src and re-initialize src path to 147 * Make the *dest path the same as src and re-initialize src path to
121 * have a root only. 148 * have a root only.
122 */ 149 */
@@ -212,10 +239,41 @@ out:
212 return ret; 239 return ret;
213} 240}
214 241
242/*
243 * Return the index of the extent record which contains cluster #v_cluster.
244 * -1 is returned if it was not found.
245 *
246 * Should work fine on interior and exterior nodes.
247 */
248int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
249{
250 int ret = -1;
251 int i;
252 struct ocfs2_extent_rec *rec;
253 u32 rec_end, rec_start, clusters;
254
255 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
256 rec = &el->l_recs[i];
257
258 rec_start = le32_to_cpu(rec->e_cpos);
259 clusters = ocfs2_rec_clusters(el, rec);
260
261 rec_end = rec_start + clusters;
262
263 if (v_cluster >= rec_start && v_cluster < rec_end) {
264 ret = i;
265 break;
266 }
267 }
268
269 return ret;
270}
271
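
ocfs2_search_extent_list() above is a plain linear scan over the used records, returning the index of the record whose [e_cpos, e_cpos + clusters) range covers v_cluster, or -1 when no record covers it. A minimal host-endian sketch of the same search; the demo_* names are illustrative only, and the real code reads the little-endian on-disk fields through le16_to_cpu()/le32_to_cpu() and ocfs2_rec_clusters():

#include <stdint.h>
#include <stdio.h>

struct demo_rec {
	uint32_t cpos;      /* first logical cluster covered */
	uint32_t clusters;  /* number of clusters in the record */
};

/* Return the index of the record containing v_cluster, or -1. */
static int demo_search(const struct demo_rec *recs, int used, uint32_t v_cluster)
{
	for (int i = 0; i < used; i++) {
		uint32_t start = recs[i].cpos;
		uint32_t end = start + recs[i].clusters;

		if (v_cluster >= start && v_cluster < end)
			return i;
	}
	return -1;
}

int main(void)
{
	struct demo_rec recs[] = { { 0, 8 }, { 8, 4 }, { 16, 16 } };

	printf("%d\n", demo_search(recs, 3, 10));  /* 1 */
	printf("%d\n", demo_search(recs, 3, 12));  /* -1: hole before cluster 16 */
	return 0;
}
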
215enum ocfs2_contig_type { 272enum ocfs2_contig_type {
216 CONTIG_NONE = 0, 273 CONTIG_NONE = 0,
217 CONTIG_LEFT, 274 CONTIG_LEFT,
218 CONTIG_RIGHT 275 CONTIG_RIGHT,
276 CONTIG_LEFTRIGHT,
219}; 277};
220 278
221 279
@@ -253,6 +311,14 @@ static enum ocfs2_contig_type
253{ 311{
254 u64 blkno = le64_to_cpu(insert_rec->e_blkno); 312 u64 blkno = le64_to_cpu(insert_rec->e_blkno);
255 313
314 /*
315 * Refuse to coalesce extent records with different flag
316 * fields - we don't want to mix unwritten extents with user
317 * data.
318 */
319 if (ext->e_flags != insert_rec->e_flags)
320 return CONTIG_NONE;
321
256 if (ocfs2_extents_adjacent(ext, insert_rec) && 322 if (ocfs2_extents_adjacent(ext, insert_rec) &&
257 ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) 323 ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
258 return CONTIG_RIGHT; 324 return CONTIG_RIGHT;
@@ -277,7 +343,14 @@ enum ocfs2_append_type {
277 APPEND_TAIL, 343 APPEND_TAIL,
278}; 344};
279 345
346enum ocfs2_split_type {
347 SPLIT_NONE = 0,
348 SPLIT_LEFT,
349 SPLIT_RIGHT,
350};
351
280struct ocfs2_insert_type { 352struct ocfs2_insert_type {
353 enum ocfs2_split_type ins_split;
281 enum ocfs2_append_type ins_appending; 354 enum ocfs2_append_type ins_appending;
282 enum ocfs2_contig_type ins_contig; 355 enum ocfs2_contig_type ins_contig;
283 int ins_contig_index; 356 int ins_contig_index;
@@ -285,6 +358,13 @@ struct ocfs2_insert_type {
285 int ins_tree_depth; 358 int ins_tree_depth;
286}; 359};
287 360
361struct ocfs2_merge_ctxt {
362 enum ocfs2_contig_type c_contig_type;
363 int c_has_empty_extent;
364 int c_split_covers_rec;
365 int c_used_tail_recs;
366};
367
288/* 368/*
289 * How many free extents have we got before we need more meta data? 369 * How many free extents have we got before we need more meta data?
290 */ 370 */
@@ -384,13 +464,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
384 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); 464 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
385 eb->h_blkno = cpu_to_le64(first_blkno); 465 eb->h_blkno = cpu_to_le64(first_blkno);
386 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 466 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
387
388#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
389 /* we always use slot zero's suballocator */
390 eb->h_suballoc_slot = 0;
391#else
392 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); 467 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
393#endif
394 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 468 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
395 eb->h_list.l_count = 469 eb->h_list.l_count =
396 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 470 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -461,7 +535,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
461 struct inode *inode, 535 struct inode *inode,
462 struct buffer_head *fe_bh, 536 struct buffer_head *fe_bh,
463 struct buffer_head *eb_bh, 537 struct buffer_head *eb_bh,
464 struct buffer_head *last_eb_bh, 538 struct buffer_head **last_eb_bh,
465 struct ocfs2_alloc_context *meta_ac) 539 struct ocfs2_alloc_context *meta_ac)
466{ 540{
467 int status, new_blocks, i; 541 int status, new_blocks, i;
@@ -476,7 +550,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
476 550
477 mlog_entry_void(); 551 mlog_entry_void();
478 552
479 BUG_ON(!last_eb_bh); 553 BUG_ON(!last_eb_bh || !*last_eb_bh);
480 554
481 fe = (struct ocfs2_dinode *) fe_bh->b_data; 555 fe = (struct ocfs2_dinode *) fe_bh->b_data;
482 556
@@ -507,7 +581,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
507 goto bail; 581 goto bail;
508 } 582 }
509 583
510 eb = (struct ocfs2_extent_block *)last_eb_bh->b_data; 584 eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
511 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); 585 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
512 586
513 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be 587 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
@@ -568,7 +642,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
568 * journal_dirty erroring as it won't unless we've aborted the 642 * journal_dirty erroring as it won't unless we've aborted the
569 * handle (in which case we would never be here) so reserving 643 * handle (in which case we would never be here) so reserving
570 * the write with journal_access is all we need to do. */ 644 * the write with journal_access is all we need to do. */
571 status = ocfs2_journal_access(handle, inode, last_eb_bh, 645 status = ocfs2_journal_access(handle, inode, *last_eb_bh,
572 OCFS2_JOURNAL_ACCESS_WRITE); 646 OCFS2_JOURNAL_ACCESS_WRITE);
573 if (status < 0) { 647 if (status < 0) {
574 mlog_errno(status); 648 mlog_errno(status);
@@ -601,10 +675,10 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
601 * next_leaf on the previously last-extent-block. */ 675 * next_leaf on the previously last-extent-block. */
602 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); 676 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
603 677
604 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 678 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
605 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 679 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
606 680
607 status = ocfs2_journal_dirty(handle, last_eb_bh); 681 status = ocfs2_journal_dirty(handle, *last_eb_bh);
608 if (status < 0) 682 if (status < 0)
609 mlog_errno(status); 683 mlog_errno(status);
610 status = ocfs2_journal_dirty(handle, fe_bh); 684 status = ocfs2_journal_dirty(handle, fe_bh);
@@ -616,6 +690,14 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
616 mlog_errno(status); 690 mlog_errno(status);
617 } 691 }
618 692
693 /*
694 * Some callers want to track the rightmost leaf so pass it
695 * back here.
696 */
697 brelse(*last_eb_bh);
698 get_bh(new_eb_bhs[0]);
699 *last_eb_bh = new_eb_bhs[0];
700
619 status = 0; 701 status = 0;
620bail: 702bail:
621 if (new_eb_bhs) { 703 if (new_eb_bhs) {
@@ -829,6 +911,87 @@ bail:
829} 911}
830 912
831/* 913/*
914 * Grow a b-tree so that it has more records.
915 *
916 * We might shift the tree depth in which case existing paths should
917 * be considered invalid.
918 *
919 * Tree depth after the grow is returned via *final_depth.
920 *
921 * *last_eb_bh will be updated by ocfs2_add_branch().
922 */
923static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
924 struct buffer_head *di_bh, int *final_depth,
925 struct buffer_head **last_eb_bh,
926 struct ocfs2_alloc_context *meta_ac)
927{
928 int ret, shift;
929 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
930 int depth = le16_to_cpu(di->id2.i_list.l_tree_depth);
931 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
932 struct buffer_head *bh = NULL;
933
934 BUG_ON(meta_ac == NULL);
935
936 shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh);
937 if (shift < 0) {
938 ret = shift;
939 mlog_errno(ret);
940 goto out;
941 }
942
943 /* We traveled all the way to the bottom of the allocation tree
944 * and didn't find room for any more extents - we need to add
945 * another tree level */
946 if (shift) {
947 BUG_ON(bh);
948 mlog(0, "need to shift tree depth (current = %d)\n", depth);
949
950 /* ocfs2_shift_tree_depth will return us a buffer with
951 * the new extent block (so we can pass that to
952 * ocfs2_add_branch). */
953 ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh,
954 meta_ac, &bh);
955 if (ret < 0) {
956 mlog_errno(ret);
957 goto out;
958 }
959 depth++;
960 if (depth == 1) {
961 /*
962 * Special case: we have room now if we shifted from
963 * tree_depth 0, so no more work needs to be done.
964 *
965 * We won't be calling add_branch, so pass
966 * back *last_eb_bh as the new leaf. At depth
967 * zero, it should always be null so there's
968 * no reason to brelse.
969 */
970 BUG_ON(*last_eb_bh);
971 get_bh(bh);
972 *last_eb_bh = bh;
973 goto out;
974 }
975 }
976
977 /* call ocfs2_add_branch to add the final part of the tree with
978 * the new data. */
979 mlog(0, "add branch. bh = %p\n", bh);
980 ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh,
981 meta_ac);
982 if (ret < 0) {
983 mlog_errno(ret);
984 goto out;
985 }
986
987out:
988 if (final_depth)
989 *final_depth = depth;
990 brelse(bh);
991 return ret;
992}
993
994/*
832 * This is only valid for leaf nodes, which are the only ones that can 995 * This is only valid for leaf nodes, which are the only ones that can
833 * have empty extents anyway. 996 * have empty extents anyway.
834 */ 997 */
@@ -934,6 +1097,22 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
934 1097
935} 1098}
936 1099
1100static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1101{
1102 int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1103
1104 BUG_ON(num_recs == 0);
1105
1106 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1107 num_recs--;
1108 size = num_recs * sizeof(struct ocfs2_extent_rec);
1109 memmove(&el->l_recs[0], &el->l_recs[1], size);
1110 memset(&el->l_recs[num_recs], 0,
1111 sizeof(struct ocfs2_extent_rec));
1112 el->l_next_free_rec = cpu_to_le16(num_recs);
1113 }
1114}
1115
937/* 1116/*
938 * Create an empty extent record . 1117 * Create an empty extent record .
939 * 1118 *
@@ -1211,6 +1390,10 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1211 * immediately to their right. 1390 * immediately to their right.
1212 */ 1391 */
1213 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); 1392 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1393 if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1394 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1395 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1396 }
1214 left_clusters -= le32_to_cpu(left_rec->e_cpos); 1397 left_clusters -= le32_to_cpu(left_rec->e_cpos);
1215 left_rec->e_int_clusters = cpu_to_le32(left_clusters); 1398 left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1216 1399
@@ -1531,10 +1714,16 @@ out:
1531 return ret; 1714 return ret;
1532} 1715}
1533 1716
1717/*
1718 * Extend the transaction by enough credits to complete the rotation,
1719 * and still leave at least the original number of credits allocated
1720 * to this transaction.
1721 */
1534static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, 1722static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1723 int op_credits,
1535 struct ocfs2_path *path) 1724 struct ocfs2_path *path)
1536{ 1725{
1537 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1; 1726 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
1538 1727
1539 if (handle->h_buffer_credits < credits) 1728 if (handle->h_buffer_credits < credits)
1540 return ocfs2_extend_trans(handle, credits); 1729 return ocfs2_extend_trans(handle, credits);
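
Roughly, the credit estimate above covers two buffers per tree level between the subtree root and the leaves (one for the left path and one for the right path), plus one for the shared subtree root, plus the op_credits the surrounding operation still needs after the rotation. A compile-and-run sketch of that arithmetic, as an editor's illustration rather than kernel code:

#include <stdio.h>

/* Rough rotation credit estimate: two paths of (tree_depth - subtree_depth)
 * blocks each below the shared subtree root, plus the root itself, plus
 * whatever the caller's operation still requires. */
static int rotate_credits(int tree_depth, int subtree_depth, int op_credits)
{
	return (tree_depth - subtree_depth) * 2 + 1 + op_credits;
}

int main(void)
{
	/* e.g. a depth-3 tree rotating below a depth-1 subtree root,
	 * with 4 credits reserved for the insert that follows */
	printf("%d\n", rotate_credits(3, 1, 4)); /* 9 */
	return 0;
}
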
@@ -1568,6 +1757,29 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1568 return 0; 1757 return 0;
1569} 1758}
1570 1759
1760static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
1761{
1762 int next_free = le16_to_cpu(el->l_next_free_rec);
1763 unsigned int range;
1764 struct ocfs2_extent_rec *rec;
1765
1766 if (next_free == 0)
1767 return 0;
1768
1769 rec = &el->l_recs[0];
1770 if (ocfs2_is_empty_extent(rec)) {
1771 /* Empty list. */
1772 if (next_free == 1)
1773 return 0;
1774 rec = &el->l_recs[1];
1775 }
1776
1777 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1778 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1779 return 1;
1780 return 0;
1781}
1782
1571/* 1783/*
1572 * Rotate all the records in a btree right one record, starting at insert_cpos. 1784 * Rotate all the records in a btree right one record, starting at insert_cpos.
1573 * 1785 *
@@ -1586,11 +1798,12 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1586 */ 1798 */
1587static int ocfs2_rotate_tree_right(struct inode *inode, 1799static int ocfs2_rotate_tree_right(struct inode *inode,
1588 handle_t *handle, 1800 handle_t *handle,
1801 enum ocfs2_split_type split,
1589 u32 insert_cpos, 1802 u32 insert_cpos,
1590 struct ocfs2_path *right_path, 1803 struct ocfs2_path *right_path,
1591 struct ocfs2_path **ret_left_path) 1804 struct ocfs2_path **ret_left_path)
1592{ 1805{
1593 int ret, start; 1806 int ret, start, orig_credits = handle->h_buffer_credits;
1594 u32 cpos; 1807 u32 cpos;
1595 struct ocfs2_path *left_path = NULL; 1808 struct ocfs2_path *left_path = NULL;
1596 1809
@@ -1657,9 +1870,9 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1657 (unsigned long long) 1870 (unsigned long long)
1658 path_leaf_bh(left_path)->b_blocknr); 1871 path_leaf_bh(left_path)->b_blocknr);
1659 1872
1660 if (ocfs2_rotate_requires_path_adjustment(left_path, 1873 if (split == SPLIT_NONE &&
1874 ocfs2_rotate_requires_path_adjustment(left_path,
1661 insert_cpos)) { 1875 insert_cpos)) {
1662 mlog(0, "Path adjustment required\n");
1663 1876
1664 /* 1877 /*
1665 * We've rotated the tree as much as we 1878 * We've rotated the tree as much as we
@@ -1687,7 +1900,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1687 right_path->p_tree_depth); 1900 right_path->p_tree_depth);
1688 1901
1689 ret = ocfs2_extend_rotate_transaction(handle, start, 1902 ret = ocfs2_extend_rotate_transaction(handle, start,
1690 right_path); 1903 orig_credits, right_path);
1691 if (ret) { 1904 if (ret) {
1692 mlog_errno(ret); 1905 mlog_errno(ret);
1693 goto out; 1906 goto out;
@@ -1700,6 +1913,24 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1700 goto out; 1913 goto out;
1701 } 1914 }
1702 1915
1916 if (split != SPLIT_NONE &&
1917 ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
1918 insert_cpos)) {
1919 /*
1920 * A rotate moves the rightmost left leaf
1921 * record over to the leftmost right leaf
1922 * slot. If we're doing an extent split
1923 * instead of a real insert, then we have to
1924 * check that the extent to be split wasn't
1925 * just moved over. If it was, then we can
1926 * exit here, passing left_path back -
1927 * ocfs2_split_extent() is smart enough to
1928 * search both leaves.
1929 */
1930 *ret_left_path = left_path;
1931 goto out_ret_path;
1932 }
1933
1703 /* 1934 /*
1704 * There is no need to re-read the next right path 1935 * There is no need to re-read the next right path
1705 * as we know that it'll be our current left 1936 * as we know that it'll be our current left
@@ -1722,6 +1953,1031 @@ out_ret_path:
1722 return ret; 1953 return ret;
1723} 1954}
1724 1955
1956static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
1957 struct ocfs2_path *path)
1958{
1959 int i, idx;
1960 struct ocfs2_extent_rec *rec;
1961 struct ocfs2_extent_list *el;
1962 struct ocfs2_extent_block *eb;
1963 u32 range;
1964
1965 /* Path should always be rightmost. */
1966 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
1967 BUG_ON(eb->h_next_leaf_blk != 0ULL);
1968
1969 el = &eb->h_list;
1970 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
1971 idx = le16_to_cpu(el->l_next_free_rec) - 1;
1972 rec = &el->l_recs[idx];
1973 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1974
1975 for (i = 0; i < path->p_tree_depth; i++) {
1976 el = path->p_node[i].el;
1977 idx = le16_to_cpu(el->l_next_free_rec) - 1;
1978 rec = &el->l_recs[idx];
1979
1980 rec->e_int_clusters = cpu_to_le32(range);
1981 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
1982
1983 ocfs2_journal_dirty(handle, path->p_node[i].bh);
1984 }
1985}
1986
1987static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
1988 struct ocfs2_cached_dealloc_ctxt *dealloc,
1989 struct ocfs2_path *path, int unlink_start)
1990{
1991 int ret, i;
1992 struct ocfs2_extent_block *eb;
1993 struct ocfs2_extent_list *el;
1994 struct buffer_head *bh;
1995
1996 for(i = unlink_start; i < path_num_items(path); i++) {
1997 bh = path->p_node[i].bh;
1998
1999 eb = (struct ocfs2_extent_block *)bh->b_data;
2000 /*
2001 * Not all nodes might have had their final count
2002 * decremented by the caller - handle this here.
2003 */
2004 el = &eb->h_list;
2005 if (le16_to_cpu(el->l_next_free_rec) > 1) {
2006 mlog(ML_ERROR,
2007 "Inode %llu, attempted to remove extent block "
2008 "%llu with %u records\n",
2009 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2010 (unsigned long long)le64_to_cpu(eb->h_blkno),
2011 le16_to_cpu(el->l_next_free_rec));
2012
2013 ocfs2_journal_dirty(handle, bh);
2014 ocfs2_remove_from_cache(inode, bh);
2015 continue;
2016 }
2017
2018 el->l_next_free_rec = 0;
2019 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2020
2021 ocfs2_journal_dirty(handle, bh);
2022
2023 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2024 if (ret)
2025 mlog_errno(ret);
2026
2027 ocfs2_remove_from_cache(inode, bh);
2028 }
2029}
2030
2031static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2032 struct ocfs2_path *left_path,
2033 struct ocfs2_path *right_path,
2034 int subtree_index,
2035 struct ocfs2_cached_dealloc_ctxt *dealloc)
2036{
2037 int i;
2038 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2039 struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2040 struct ocfs2_extent_list *el;
2041 struct ocfs2_extent_block *eb;
2042
2043 el = path_leaf_el(left_path);
2044
2045 eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2046
2047 for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2048 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2049 break;
2050
2051 BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2052
2053 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2054 le16_add_cpu(&root_el->l_next_free_rec, -1);
2055
2056 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2057 eb->h_next_leaf_blk = 0;
2058
2059 ocfs2_journal_dirty(handle, root_bh);
2060 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2061
2062 ocfs2_unlink_path(inode, handle, dealloc, right_path,
2063 subtree_index + 1);
2064}
2065
2066static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2067 struct ocfs2_path *left_path,
2068 struct ocfs2_path *right_path,
2069 int subtree_index,
2070 struct ocfs2_cached_dealloc_ctxt *dealloc,
2071 int *deleted)
2072{
2073 int ret, i, del_right_subtree = 0, right_has_empty = 0;
2074 struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
2075 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2076 struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2077 struct ocfs2_extent_block *eb;
2078
2079 *deleted = 0;
2080
2081 right_leaf_el = path_leaf_el(right_path);
2082 left_leaf_el = path_leaf_el(left_path);
2083 root_bh = left_path->p_node[subtree_index].bh;
2084 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2085
2086 if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2087 return 0;
2088
2089 eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2090 if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2091 /*
2092 * It's legal for us to proceed if the right leaf is
2093 * the rightmost one and it has an empty extent. There
2094 * are two cases to handle - whether the leaf will be
2095 * empty after removal or not. If the leaf isn't empty
2096 * then just remove the empty extent up front. The
2097 * next block will handle empty leaves by flagging
2098 * them for unlink.
2099 *
2100 * Non-rightmost leaves will return -EAGAIN and the
2101 * caller can manually move the subtree and retry.
2102 */
2103
2104 if (eb->h_next_leaf_blk != 0ULL)
2105 return -EAGAIN;
2106
2107 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2108 ret = ocfs2_journal_access(handle, inode,
2109 path_leaf_bh(right_path),
2110 OCFS2_JOURNAL_ACCESS_WRITE);
2111 if (ret) {
2112 mlog_errno(ret);
2113 goto out;
2114 }
2115
2116 ocfs2_remove_empty_extent(right_leaf_el);
2117 } else
2118 right_has_empty = 1;
2119 }
2120
2121 if (eb->h_next_leaf_blk == 0ULL &&
2122 le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2123 /*
2124 * We have to update i_last_eb_blk during the meta
2125 * data delete.
2126 */
2127 ret = ocfs2_journal_access(handle, inode, di_bh,
2128 OCFS2_JOURNAL_ACCESS_WRITE);
2129 if (ret) {
2130 mlog_errno(ret);
2131 goto out;
2132 }
2133
2134 del_right_subtree = 1;
2135 }
2136
2137 /*
2138 * Getting here with an empty extent in the right path implies
2139 * that it's the rightmost path and will be deleted.
2140 */
2141 BUG_ON(right_has_empty && !del_right_subtree);
2142
2143 ret = ocfs2_journal_access(handle, inode, root_bh,
2144 OCFS2_JOURNAL_ACCESS_WRITE);
2145 if (ret) {
2146 mlog_errno(ret);
2147 goto out;
2148 }
2149
2150 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2151 ret = ocfs2_journal_access(handle, inode,
2152 right_path->p_node[i].bh,
2153 OCFS2_JOURNAL_ACCESS_WRITE);
2154 if (ret) {
2155 mlog_errno(ret);
2156 goto out;
2157 }
2158
2159 ret = ocfs2_journal_access(handle, inode,
2160 left_path->p_node[i].bh,
2161 OCFS2_JOURNAL_ACCESS_WRITE);
2162 if (ret) {
2163 mlog_errno(ret);
2164 goto out;
2165 }
2166 }
2167
2168 if (!right_has_empty) {
2169 /*
2170 * Only do this if we're moving a real
2171 * record. Otherwise, the action is delayed until
2172 * after removal of the right path in which case we
2173 * can do a simple shift to remove the empty extent.
2174 */
2175 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2176 memset(&right_leaf_el->l_recs[0], 0,
2177 sizeof(struct ocfs2_extent_rec));
2178 }
2179 if (eb->h_next_leaf_blk == 0ULL) {
2180 /*
2181 * Move recs over to get rid of empty extent, decrease
2182 * next_free. This is allowed to remove the last
2183 * extent in our leaf (setting l_next_free_rec to
2184 * zero) - the delete code below won't care.
2185 */
2186 ocfs2_remove_empty_extent(right_leaf_el);
2187 }
2188
2189 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2190 if (ret)
2191 mlog_errno(ret);
2192 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2193 if (ret)
2194 mlog_errno(ret);
2195
2196 if (del_right_subtree) {
2197 ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2198 subtree_index, dealloc);
2199 ocfs2_update_edge_lengths(inode, handle, left_path);
2200
2201 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2202 di->i_last_eb_blk = eb->h_blkno;
2203
2204 /*
2205 * Removal of the extent in the left leaf was skipped
2206 * above so we could delete the right path
2207 * 1st.
2208 */
2209 if (right_has_empty)
2210 ocfs2_remove_empty_extent(left_leaf_el);
2211
2212 ret = ocfs2_journal_dirty(handle, di_bh);
2213 if (ret)
2214 mlog_errno(ret);
2215
2216 *deleted = 1;
2217 } else
2218 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2219 subtree_index);
2220
2221out:
2222 return ret;
2223}
2224
2225/*
2226 * Given a full path, determine what cpos value would return us a path
2227 * containing the leaf immediately to the right of the current one.
2228 *
2229 * Will return zero if the path passed in is already the rightmost path.
2230 *
2231 * This looks similar, but is subtly different to
2232 * ocfs2_find_cpos_for_left_leaf().
2233 */
2234static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2235 struct ocfs2_path *path, u32 *cpos)
2236{
2237 int i, j, ret = 0;
2238 u64 blkno;
2239 struct ocfs2_extent_list *el;
2240
2241 *cpos = 0;
2242
2243 if (path->p_tree_depth == 0)
2244 return 0;
2245
2246 blkno = path_leaf_bh(path)->b_blocknr;
2247
2248 /* Start at the tree node just above the leaf and work our way up. */
2249 i = path->p_tree_depth - 1;
2250 while (i >= 0) {
2251 int next_free;
2252
2253 el = path->p_node[i].el;
2254
2255 /*
2256 * Find the extent record just after the one in our
2257 * path.
2258 */
2259 next_free = le16_to_cpu(el->l_next_free_rec);
2260 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2261 if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2262 if (j == (next_free - 1)) {
2263 if (i == 0) {
2264 /*
2265 * We've determined that the
2266 * path specified is already
2267 * the rightmost one - return a
2268 * cpos of zero.
2269 */
2270 goto out;
2271 }
2272 /*
2273 * The rightmost record points to our
2274 * leaf - we need to travel up the
2275 * tree one level.
2276 */
2277 goto next_node;
2278 }
2279
2280 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2281 goto out;
2282 }
2283 }
2284
2285 /*
2286 * If we got here, we never found a valid node where
2287 * the tree indicated one should be.
2288 */
2289 ocfs2_error(sb,
2290 "Invalid extent tree at extent block %llu\n",
2291 (unsigned long long)blkno);
2292 ret = -EROFS;
2293 goto out;
2294
2295next_node:
2296 blkno = path->p_node[i].bh->b_blocknr;
2297 i--;
2298 }
2299
2300out:
2301 return ret;
2302}
2303
2304static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2305 handle_t *handle,
2306 struct buffer_head *bh,
2307 struct ocfs2_extent_list *el)
2308{
2309 int ret;
2310
2311 if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2312 return 0;
2313
2314 ret = ocfs2_journal_access(handle, inode, bh,
2315 OCFS2_JOURNAL_ACCESS_WRITE);
2316 if (ret) {
2317 mlog_errno(ret);
2318 goto out;
2319 }
2320
2321 ocfs2_remove_empty_extent(el);
2322
2323 ret = ocfs2_journal_dirty(handle, bh);
2324 if (ret)
2325 mlog_errno(ret);
2326
2327out:
2328 return ret;
2329}
2330
2331static int __ocfs2_rotate_tree_left(struct inode *inode,
2332 handle_t *handle, int orig_credits,
2333 struct ocfs2_path *path,
2334 struct ocfs2_cached_dealloc_ctxt *dealloc,
2335 struct ocfs2_path **empty_extent_path)
2336{
2337 int ret, subtree_root, deleted;
2338 u32 right_cpos;
2339 struct ocfs2_path *left_path = NULL;
2340 struct ocfs2_path *right_path = NULL;
2341
2342 BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2343
2344 *empty_extent_path = NULL;
2345
2346 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2347 &right_cpos);
2348 if (ret) {
2349 mlog_errno(ret);
2350 goto out;
2351 }
2352
2353 left_path = ocfs2_new_path(path_root_bh(path),
2354 path_root_el(path));
2355 if (!left_path) {
2356 ret = -ENOMEM;
2357 mlog_errno(ret);
2358 goto out;
2359 }
2360
2361 ocfs2_cp_path(left_path, path);
2362
2363 right_path = ocfs2_new_path(path_root_bh(path),
2364 path_root_el(path));
2365 if (!right_path) {
2366 ret = -ENOMEM;
2367 mlog_errno(ret);
2368 goto out;
2369 }
2370
2371 while (right_cpos) {
2372 ret = ocfs2_find_path(inode, right_path, right_cpos);
2373 if (ret) {
2374 mlog_errno(ret);
2375 goto out;
2376 }
2377
2378 subtree_root = ocfs2_find_subtree_root(inode, left_path,
2379 right_path);
2380
2381 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2382 subtree_root,
2383 (unsigned long long)
2384 right_path->p_node[subtree_root].bh->b_blocknr,
2385 right_path->p_tree_depth);
2386
2387 ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2388 orig_credits, left_path);
2389 if (ret) {
2390 mlog_errno(ret);
2391 goto out;
2392 }
2393
2394 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2395 right_path, subtree_root,
2396 dealloc, &deleted);
2397 if (ret == -EAGAIN) {
2398 /*
2399 * The rotation has to temporarily stop due to
2400 * the right subtree having an empty
2401 * extent. Pass it back to the caller for a
2402 * fixup.
2403 */
2404 *empty_extent_path = right_path;
2405 right_path = NULL;
2406 goto out;
2407 }
2408 if (ret) {
2409 mlog_errno(ret);
2410 goto out;
2411 }
2412
2413 /*
2414 * The subtree rotate might have removed records on
2415 * the rightmost edge. If so, then rotation is
2416 * complete.
2417 */
2418 if (deleted)
2419 break;
2420
2421 ocfs2_mv_path(left_path, right_path);
2422
2423 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2424 &right_cpos);
2425 if (ret) {
2426 mlog_errno(ret);
2427 goto out;
2428 }
2429 }
2430
2431out:
2432 ocfs2_free_path(right_path);
2433 ocfs2_free_path(left_path);
2434
2435 return ret;
2436}
2437
2438static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2439 struct ocfs2_path *path,
2440 struct ocfs2_cached_dealloc_ctxt *dealloc)
2441{
2442 int ret, subtree_index;
2443 u32 cpos;
2444 struct ocfs2_path *left_path = NULL;
2445 struct ocfs2_dinode *di;
2446 struct ocfs2_extent_block *eb;
2447 struct ocfs2_extent_list *el;
2448
2449 /*
2450 * XXX: This code assumes that the root is an inode, which is
2451 * true for now but may change as tree code gets generic.
2452 */
2453 di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
2454 if (!OCFS2_IS_VALID_DINODE(di)) {
2455 ret = -EIO;
2456 ocfs2_error(inode->i_sb,
2457 "Inode %llu has invalid path root",
2458 (unsigned long long)OCFS2_I(inode)->ip_blkno);
2459 goto out;
2460 }
2461
2462 /*
2463 * There are two ways we handle this, depending on
2464 * whether path is the only existing one.
2465 */
2466 ret = ocfs2_extend_rotate_transaction(handle, 0,
2467 handle->h_buffer_credits,
2468 path);
2469 if (ret) {
2470 mlog_errno(ret);
2471 goto out;
2472 }
2473
2474 ret = ocfs2_journal_access_path(inode, handle, path);
2475 if (ret) {
2476 mlog_errno(ret);
2477 goto out;
2478 }
2479
2480 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2481 if (ret) {
2482 mlog_errno(ret);
2483 goto out;
2484 }
2485
2486 if (cpos) {
2487 /*
2488 * We have a path to the left of this one - it needs
2489 * an update too.
2490 */
2491 left_path = ocfs2_new_path(path_root_bh(path),
2492 path_root_el(path));
2493 if (!left_path) {
2494 ret = -ENOMEM;
2495 mlog_errno(ret);
2496 goto out;
2497 }
2498
2499 ret = ocfs2_find_path(inode, left_path, cpos);
2500 if (ret) {
2501 mlog_errno(ret);
2502 goto out;
2503 }
2504
2505 ret = ocfs2_journal_access_path(inode, handle, left_path);
2506 if (ret) {
2507 mlog_errno(ret);
2508 goto out;
2509 }
2510
2511 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2512
2513 ocfs2_unlink_subtree(inode, handle, left_path, path,
2514 subtree_index, dealloc);
2515 ocfs2_update_edge_lengths(inode, handle, left_path);
2516
2517 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2518 di->i_last_eb_blk = eb->h_blkno;
2519 } else {
2520 /*
2521 * 'path' is also the leftmost path which
2522 * means it must be the only one. This gets
2523 * handled differently because we want to
2524 * revert the inode back to having extents
2525 * in-line.
2526 */
2527 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2528
2529 el = &di->id2.i_list;
2530 el->l_tree_depth = 0;
2531 el->l_next_free_rec = 0;
2532 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2533
2534 di->i_last_eb_blk = 0;
2535 }
2536
2537 ocfs2_journal_dirty(handle, path_root_bh(path));
2538
2539out:
2540 ocfs2_free_path(left_path);
2541 return ret;
2542}
2543
2544/*
2545 * Left rotation of btree records.
2546 *
2547 * In many ways, this is (unsurprisingly) the opposite of right
2548 * rotation. We start at some non-rightmost path containing an empty
2549 * extent in the leaf block. The code works its way to the rightmost
2550 * path by rotating records to the left in every subtree.
2551 *
2552 * This is used by any code which reduces the number of extent records
2553 * in a leaf. After removal, an empty record should be placed in the
2554 * leftmost list position.
2555 *
2556 * This won't handle a length update of the rightmost path records if
2557 * the rightmost tree leaf record is removed so the caller is
2558 * responsible for detecting and correcting that.
2559 */
2560static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2561 struct ocfs2_path *path,
2562 struct ocfs2_cached_dealloc_ctxt *dealloc)
2563{
2564 int ret, orig_credits = handle->h_buffer_credits;
2565 struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2566 struct ocfs2_extent_block *eb;
2567 struct ocfs2_extent_list *el;
2568
2569 el = path_leaf_el(path);
2570 if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2571 return 0;
2572
2573 if (path->p_tree_depth == 0) {
2574rightmost_no_delete:
2575 /*
2576 * In-inode extents. This is trivially handled, so do
2577 * it up front.
2578 */
2579 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2580 path_leaf_bh(path),
2581 path_leaf_el(path));
2582 if (ret)
2583 mlog_errno(ret);
2584 goto out;
2585 }
2586
2587 /*
2588 * Handle the rightmost branch now. There are several cases:
2589 * 1) simple rotation leaving records in there. That's trivial.
2590 * 2) rotation requiring a branch delete - there's no more
2591 * records left. Two cases of this:
2592 * a) There are branches to the left.
2593 * b) This is also the leftmost (the only) branch.
2594 *
2595 * 1) is handled via ocfs2_rotate_rightmost_leaf_left()
2596 * 2a) we need the left branch so that we can update it with the unlink
2597 * 2b) we need to bring the inode back to inline extents.
2598 */
2599
2600 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2601 el = &eb->h_list;
2602 if (eb->h_next_leaf_blk == 0) {
2603 /*
2604 * This gets a bit tricky if we're going to delete the
2605 * rightmost path. Get the other cases out of the way
2606 * 1st.
2607 */
2608 if (le16_to_cpu(el->l_next_free_rec) > 1)
2609 goto rightmost_no_delete;
2610
2611 if (le16_to_cpu(el->l_next_free_rec) == 0) {
2612 ret = -EIO;
2613 ocfs2_error(inode->i_sb,
2614 "Inode %llu has empty extent block at %llu",
2615 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2616 (unsigned long long)le64_to_cpu(eb->h_blkno));
2617 goto out;
2618 }
2619
2620 /*
2621 * XXX: The caller cannot trust "path" anymore after
2622 * this as it will have been deleted. What do we do?
2623 *
2624 * In theory the rotate-for-merge code will never get
2625 * here because it'll always ask for a rotate in a
2626 * nonempty list.
2627 */
2628
2629 ret = ocfs2_remove_rightmost_path(inode, handle, path,
2630 dealloc);
2631 if (ret)
2632 mlog_errno(ret);
2633 goto out;
2634 }
2635
2636 /*
2637 * Now we can loop, remembering the path we get from -EAGAIN
2638 * and restarting from there.
2639 */
2640try_rotate:
2641 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2642 dealloc, &restart_path);
2643 if (ret && ret != -EAGAIN) {
2644 mlog_errno(ret);
2645 goto out;
2646 }
2647
2648 while (ret == -EAGAIN) {
2649 tmp_path = restart_path;
2650 restart_path = NULL;
2651
2652 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2653 tmp_path, dealloc,
2654 &restart_path);
2655 if (ret && ret != -EAGAIN) {
2656 mlog_errno(ret);
2657 goto out;
2658 }
2659
2660 ocfs2_free_path(tmp_path);
2661 tmp_path = NULL;
2662
2663 if (ret == 0)
2664 goto try_rotate;
2665 }
2666
2667out:
2668 ocfs2_free_path(tmp_path);
2669 ocfs2_free_path(restart_path);
2670 return ret;
2671}
2672
2673static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2674 int index)
2675{
2676 struct ocfs2_extent_rec *rec = &el->l_recs[index];
2677 unsigned int size;
2678
2679 if (rec->e_leaf_clusters == 0) {
2680 /*
2681 * We consumed all of the merged-from record. An empty
2682 * extent cannot exist anywhere but the 1st array
2683 * position, so move things over if the merged-from
2684 * record doesn't occupy that position.
2685 *
2686 * This creates a new empty extent so the caller
2687 * should be smart enough to have removed any existing
2688 * ones.
2689 */
2690 if (index > 0) {
2691 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
2692 size = index * sizeof(struct ocfs2_extent_rec);
2693 memmove(&el->l_recs[1], &el->l_recs[0], size);
2694 }
2695
2696 /*
2697 * Always memset - the caller doesn't check whether it
2698 * created an empty extent, so there could be junk in
2699 * the other fields.
2700 */
2701 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2702 }
2703}
2704
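
The cleanup step above only matters when the merge consumed every cluster of the merged-from record: the now-empty record is shifted into slot 0 (the only position where an empty extent may live) and zeroed. A simplified host-endian sketch of that shift, using made-up demo_* names instead of the kernel structures:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct demo_rec {
	uint32_t cpos;
	uint16_t clusters;	/* 0 marks the empty extent */
};

static void demo_cleanup_merge(struct demo_rec *recs, int index)
{
	if (recs[index].clusters != 0)
		return;		/* record not fully consumed, nothing to do */

	/* Slide recs[0..index-1] up one slot so slot 0 becomes free... */
	if (index > 0)
		memmove(&recs[1], &recs[0], index * sizeof(struct demo_rec));

	/* ...then clear slot 0, creating the leaf's single empty extent. */
	memset(&recs[0], 0, sizeof(struct demo_rec));
}

int main(void)
{
	/* recs[1] was fully merged into recs[0] and is now zero length */
	struct demo_rec recs[3] = { { 0, 6 }, { 6, 0 }, { 6, 6 } };

	demo_cleanup_merge(recs, 1);
	/* slot 0 is the empty extent, slot 1 holds the merged record */
	printf("%u %u / %u %u\n", (unsigned)recs[0].cpos, (unsigned)recs[0].clusters,
	       (unsigned)recs[1].cpos, (unsigned)recs[1].clusters);
	return 0;
}
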
2705/*
2706 * Remove split_rec clusters from the record at index and merge them
2707 * onto the beginning of the record at index + 1.
2708 */
2709static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
2710 handle_t *handle,
2711 struct ocfs2_extent_rec *split_rec,
2712 struct ocfs2_extent_list *el, int index)
2713{
2714 int ret;
2715 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2716 struct ocfs2_extent_rec *left_rec;
2717 struct ocfs2_extent_rec *right_rec;
2718
2719 BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
2720
2721 left_rec = &el->l_recs[index];
2722 right_rec = &el->l_recs[index + 1];
2723
2724 ret = ocfs2_journal_access(handle, inode, bh,
2725 OCFS2_JOURNAL_ACCESS_WRITE);
2726 if (ret) {
2727 mlog_errno(ret);
2728 goto out;
2729 }
2730
2731 le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
2732
2733 le32_add_cpu(&right_rec->e_cpos, -split_clusters);
2734 le64_add_cpu(&right_rec->e_blkno,
2735 -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
2736 le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
2737
2738 ocfs2_cleanup_merge(el, index);
2739
2740 ret = ocfs2_journal_dirty(handle, bh);
2741 if (ret)
2742 mlog_errno(ret);
2743
2744out:
2745 return ret;
2746}
2747
2748/*
2749 * Remove split_rec clusters from the record at index and merge them
2750 * onto the tail of the record at index - 1.
2751 */
2752static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
2753 handle_t *handle,
2754 struct ocfs2_extent_rec *split_rec,
2755 struct ocfs2_extent_list *el, int index)
2756{
2757 int ret, has_empty_extent = 0;
2758 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2759 struct ocfs2_extent_rec *left_rec;
2760 struct ocfs2_extent_rec *right_rec;
2761
2762 BUG_ON(index <= 0);
2763
2764 left_rec = &el->l_recs[index - 1];
2765 right_rec = &el->l_recs[index];
2766 if (ocfs2_is_empty_extent(&el->l_recs[0]))
2767 has_empty_extent = 1;
2768
2769 ret = ocfs2_journal_access(handle, inode, bh,
2770 OCFS2_JOURNAL_ACCESS_WRITE);
2771 if (ret) {
2772 mlog_errno(ret);
2773 goto out;
2774 }
2775
2776 if (has_empty_extent && index == 1) {
2777 /*
2778 * The easy case - we can just plop the record right in.
2779 */
2780 *left_rec = *split_rec;
2781
2782 has_empty_extent = 0;
2783 } else {
2784 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
2785 }
2786
2787 le32_add_cpu(&right_rec->e_cpos, split_clusters);
2788 le64_add_cpu(&right_rec->e_blkno,
2789 ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
2790 le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
2791
2792 ocfs2_cleanup_merge(el, index);
2793
2794 ret = ocfs2_journal_dirty(handle, bh);
2795 if (ret)
2796 mlog_errno(ret);
2797
2798out:
2799 return ret;
2800}
2801
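
Both merge helpers above move split_rec's clusters between neighbouring records: ocfs2_merge_rec_right() trims them off the tail of the record at index and grows the head of the record at index + 1 (its e_cpos and e_blkno move back, its length grows), while ocfs2_merge_rec_left() is the mirror image. A simplified host-endian sketch of the right-merge arithmetic, with made-up demo_* names and an assumed fixed blocks-per-cluster ratio standing in for ocfs2_clusters_to_blocks() and the little-endian accessors:

#include <stdint.h>
#include <stdio.h>

#define BLOCKS_PER_CLUSTER 8	/* assumption for the example */

struct demo_rec {
	uint32_t cpos;		/* first logical cluster */
	uint64_t blkno;		/* first physical block */
	uint16_t clusters;	/* length in clusters */
};

/* Move 'split' clusters from the tail of left onto the head of right. */
static void demo_merge_right(struct demo_rec *left, struct demo_rec *right,
			     uint16_t split)
{
	left->clusters -= split;

	right->cpos -= split;
	right->blkno -= (uint64_t)split * BLOCKS_PER_CLUSTER;
	right->clusters += split;
}

int main(void)
{
	struct demo_rec l = { .cpos = 0,  .blkno = 1000, .clusters = 10 };
	struct demo_rec r = { .cpos = 10, .blkno = 2000, .clusters = 6 };

	demo_merge_right(&l, &r, 4);
	/* l: cpos 0, 6 clusters; r: cpos 6, blkno 1968, 10 clusters */
	printf("%u %u / %u %llu %u\n", (unsigned)l.cpos, (unsigned)l.clusters,
	       (unsigned)r.cpos, (unsigned long long)r.blkno,
	       (unsigned)r.clusters);
	return 0;
}
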
2802static int ocfs2_try_to_merge_extent(struct inode *inode,
2803 handle_t *handle,
2804 struct ocfs2_path *left_path,
2805 int split_index,
2806 struct ocfs2_extent_rec *split_rec,
2807 struct ocfs2_cached_dealloc_ctxt *dealloc,
2808 struct ocfs2_merge_ctxt *ctxt)
2809
2810{
2811 int ret = 0, delete_tail_recs = 0;
2812 struct ocfs2_extent_list *el = path_leaf_el(left_path);
2813 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
2814
2815 BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
2816
2817 if (ctxt->c_split_covers_rec) {
2818 delete_tail_recs++;
2819
2820 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT ||
2821 ctxt->c_has_empty_extent)
2822 delete_tail_recs++;
2823
2824 if (ctxt->c_has_empty_extent) {
2825 /*
2826 * The merge code will need to create an empty
2827 * extent to take the place of the newly
2828 * emptied slot. Remove any pre-existing empty
2829 * extents - having more than one in a leaf is
2830 * illegal.
2831 */
2832 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2833 dealloc);
2834 if (ret) {
2835 mlog_errno(ret);
2836 goto out;
2837 }
2838 split_index--;
2839 rec = &el->l_recs[split_index];
2840 }
2841 }
2842
2843 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
2844 /*
2845 * Left-right contig implies this.
2846 */
2847 BUG_ON(!ctxt->c_split_covers_rec);
2848 BUG_ON(split_index == 0);
2849
2850 /*
2851 * Since the leftright insert always covers the entire
2852 * extent, this call will delete the insert record
2853 * entirely, resulting in an empty extent record added to
2854 * the extent block.
2855 *
2856 * Since the adding of an empty extent shifts
2857 * everything back to the right, there's no need to
2858 * update split_index here.
2859 */
2860 ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path),
2861 handle, split_rec, el, split_index);
2862 if (ret) {
2863 mlog_errno(ret);
2864 goto out;
2865 }
2866
2867 /*
2868 * We can only get this from a logic error above.
2869 */
2870 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
2871
2872 /*
2873 * The left merge left us with an empty extent, remove
2874 * it.
2875 */
2876 ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc);
2877 if (ret) {
2878 mlog_errno(ret);
2879 goto out;
2880 }
2881 split_index--;
2882 rec = &el->l_recs[split_index];
2883
2884 /*
2885 * Note that we don't pass split_rec here on purpose -
2886 * we've merged it into the left side.
2887 */
2888 ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path),
2889 handle, rec, el, split_index);
2890 if (ret) {
2891 mlog_errno(ret);
2892 goto out;
2893 }
2894
2895 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
2896
2897 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2898 dealloc);
2899 /*
2900 * Error from this last rotate is not critical, so
2901 * print but don't bubble it up.
2902 */
2903 if (ret)
2904 mlog_errno(ret);
2905 ret = 0;
2906 } else {
2907 /*
2908 * Merge a record to the left or right.
2909 *
2910 * 'contig_type' is relative to the existing record,
2911 * so for example, if we're "right contig", it's to
2912 * the record on the left (hence the left merge).
2913 */
2914 if (ctxt->c_contig_type == CONTIG_RIGHT) {
2915 ret = ocfs2_merge_rec_left(inode,
2916 path_leaf_bh(left_path),
2917 handle, split_rec, el,
2918 split_index);
2919 if (ret) {
2920 mlog_errno(ret);
2921 goto out;
2922 }
2923 } else {
2924 ret = ocfs2_merge_rec_right(inode,
2925 path_leaf_bh(left_path),
2926 handle, split_rec, el,
2927 split_index);
2928 if (ret) {
2929 mlog_errno(ret);
2930 goto out;
2931 }
2932 }
2933
2934 if (ctxt->c_split_covers_rec) {
2935 /*
2936 * The merge may have left an empty extent in
2937 * our leaf. Try to rotate it away.
2938 */
2939 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2940 dealloc);
2941 if (ret)
2942 mlog_errno(ret);
2943 ret = 0;
2944 }
2945 }
2946
2947out:
2948 return ret;
2949}
2950
2951static void ocfs2_subtract_from_rec(struct super_block *sb,
2952 enum ocfs2_split_type split,
2953 struct ocfs2_extent_rec *rec,
2954 struct ocfs2_extent_rec *split_rec)
2955{
2956 u64 len_blocks;
2957
2958 len_blocks = ocfs2_clusters_to_blocks(sb,
2959 le16_to_cpu(split_rec->e_leaf_clusters));
2960
2961 if (split == SPLIT_LEFT) {
2962 /*
2963 * Region is on the left edge of the existing
2964 * record.
2965 */
2966 le32_add_cpu(&rec->e_cpos,
2967 le16_to_cpu(split_rec->e_leaf_clusters));
2968 le64_add_cpu(&rec->e_blkno, len_blocks);
2969 le16_add_cpu(&rec->e_leaf_clusters,
2970 -le16_to_cpu(split_rec->e_leaf_clusters));
2971 } else {
2972 /*
2973 * Region is on the right edge of the existing
2974 * record.
2975 */
2976 le16_add_cpu(&rec->e_leaf_clusters,
2977 -le16_to_cpu(split_rec->e_leaf_clusters));
2978 }
2979}
2980
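
ocfs2_subtract_from_rec() above shrinks the existing record by the split region: a SPLIT_LEFT trims the front, so e_cpos and e_blkno advance by the split length, while a SPLIT_RIGHT only trims the length off the tail. A simplified host-endian sketch with illustrative demo_* names and an assumed blocks-per-cluster ratio (the kernel uses ocfs2_clusters_to_blocks() and the le*_add_cpu() helpers):

#include <stdint.h>
#include <stdio.h>

#define BLOCKS_PER_CLUSTER 8	/* assumption for the example */

struct demo_rec {
	uint32_t cpos;		/* first logical cluster */
	uint64_t blkno;		/* first physical block */
	uint16_t clusters;	/* length in clusters */
};

enum demo_split { DEMO_SPLIT_LEFT, DEMO_SPLIT_RIGHT };

static void demo_subtract(struct demo_rec *rec, enum demo_split split,
			  uint16_t split_clusters)
{
	if (split == DEMO_SPLIT_LEFT) {
		/* Split region sits on the left edge: advance the start. */
		rec->cpos += split_clusters;
		rec->blkno += (uint64_t)split_clusters * BLOCKS_PER_CLUSTER;
	}
	/* Either way, the record loses split_clusters of length. */
	rec->clusters -= split_clusters;
}

int main(void)
{
	struct demo_rec rec = { .cpos = 20, .blkno = 500, .clusters = 10 };

	demo_subtract(&rec, DEMO_SPLIT_LEFT, 3);
	/* now: cpos 23, blkno 524, 7 clusters */
	printf("%u %llu %u\n", (unsigned)rec.cpos,
	       (unsigned long long)rec.blkno, (unsigned)rec.clusters);
	return 0;
}
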
1725/* 2981/*
1726 * Do the final bits of extent record insertion at the target leaf 2982 * Do the final bits of extent record insertion at the target leaf
1727 * list. If this leaf is part of an allocation tree, it is assumed 2983 * list. If this leaf is part of an allocation tree, it is assumed
@@ -1738,6 +2994,15 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1738 2994
1739 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 2995 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1740 2996
2997 if (insert->ins_split != SPLIT_NONE) {
2998 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
2999 BUG_ON(i == -1);
3000 rec = &el->l_recs[i];
3001 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3002 insert_rec);
3003 goto rotate;
3004 }
3005
1741 /* 3006 /*
1742 * Contiguous insert - either left or right. 3007 * Contiguous insert - either left or right.
1743 */ 3008 */
@@ -1792,6 +3057,7 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1792 return; 3057 return;
1793 } 3058 }
1794 3059
3060rotate:
1795 /* 3061 /*
1796 * Ok, we have to rotate. 3062 * Ok, we have to rotate.
1797 * 3063 *
@@ -1815,13 +3081,53 @@ static inline void ocfs2_update_dinode_clusters(struct inode *inode,
1815 spin_unlock(&OCFS2_I(inode)->ip_lock); 3081 spin_unlock(&OCFS2_I(inode)->ip_lock);
1816} 3082}
1817 3083
3084static void ocfs2_adjust_rightmost_records(struct inode *inode,
3085 handle_t *handle,
3086 struct ocfs2_path *path,
3087 struct ocfs2_extent_rec *insert_rec)
3088{
3089 int ret, i, next_free;
3090 struct buffer_head *bh;
3091 struct ocfs2_extent_list *el;
3092 struct ocfs2_extent_rec *rec;
3093
3094 /*
3095 * Update everything except the leaf block.
3096 */
3097 for (i = 0; i < path->p_tree_depth; i++) {
3098 bh = path->p_node[i].bh;
3099 el = path->p_node[i].el;
3100
3101 next_free = le16_to_cpu(el->l_next_free_rec);
3102 if (next_free == 0) {
3103 ocfs2_error(inode->i_sb,
3104 "Dinode %llu has a bad extent list",
3105 (unsigned long long)OCFS2_I(inode)->ip_blkno);
3106 ret = -EIO;
3107 return;
3108 }
3109
3110 rec = &el->l_recs[next_free - 1];
3111
3112 rec->e_int_clusters = insert_rec->e_cpos;
3113 le32_add_cpu(&rec->e_int_clusters,
3114 le16_to_cpu(insert_rec->e_leaf_clusters));
3115 le32_add_cpu(&rec->e_int_clusters,
3116 -le32_to_cpu(rec->e_cpos));
3117
3118 ret = ocfs2_journal_dirty(handle, bh);
3119 if (ret)
3120 mlog_errno(ret);
3121
3122 }
3123}
3124
1818static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, 3125static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1819 struct ocfs2_extent_rec *insert_rec, 3126 struct ocfs2_extent_rec *insert_rec,
1820 struct ocfs2_path *right_path, 3127 struct ocfs2_path *right_path,
1821 struct ocfs2_path **ret_left_path) 3128 struct ocfs2_path **ret_left_path)
1822{ 3129{
1823 int ret, i, next_free; 3130 int ret, next_free;
1824 struct buffer_head *bh;
1825 struct ocfs2_extent_list *el; 3131 struct ocfs2_extent_list *el;
1826 struct ocfs2_path *left_path = NULL; 3132 struct ocfs2_path *left_path = NULL;
1827 3133
@@ -1887,40 +3193,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1887 goto out; 3193 goto out;
1888 } 3194 }
1889 3195
1890 el = path_root_el(right_path); 3196 ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
1891 bh = path_root_bh(right_path);
1892 i = 0;
1893 while (1) {
1894 struct ocfs2_extent_rec *rec;
1895
1896 next_free = le16_to_cpu(el->l_next_free_rec);
1897 if (next_free == 0) {
1898 ocfs2_error(inode->i_sb,
1899 "Dinode %llu has a bad extent list",
1900 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1901 ret = -EIO;
1902 goto out;
1903 }
1904
1905 rec = &el->l_recs[next_free - 1];
1906
1907 rec->e_int_clusters = insert_rec->e_cpos;
1908 le32_add_cpu(&rec->e_int_clusters,
1909 le16_to_cpu(insert_rec->e_leaf_clusters));
1910 le32_add_cpu(&rec->e_int_clusters,
1911 -le32_to_cpu(rec->e_cpos));
1912
1913 ret = ocfs2_journal_dirty(handle, bh);
1914 if (ret)
1915 mlog_errno(ret);
1916
1917 /* Don't touch the leaf node */
1918 if (++i >= right_path->p_tree_depth)
1919 break;
1920
1921 bh = right_path->p_node[i].bh;
1922 el = right_path->p_node[i].el;
1923 }
1924 3197
1925 *ret_left_path = left_path; 3198 *ret_left_path = left_path;
1926 ret = 0; 3199 ret = 0;
@@ -1931,6 +3204,83 @@ out:
1931 return ret; 3204 return ret;
1932} 3205}
1933 3206
3207static void ocfs2_split_record(struct inode *inode,
3208 struct ocfs2_path *left_path,
3209 struct ocfs2_path *right_path,
3210 struct ocfs2_extent_rec *split_rec,
3211 enum ocfs2_split_type split)
3212{
3213 int index;
3214 u32 cpos = le32_to_cpu(split_rec->e_cpos);
3215 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3216 struct ocfs2_extent_rec *rec, *tmprec;
3217
3218 right_el = path_leaf_el(right_path);
3219 if (left_path)
3220 left_el = path_leaf_el(left_path);
3221
3222 el = right_el;
3223 insert_el = right_el;
3224 index = ocfs2_search_extent_list(el, cpos);
3225 if (index != -1) {
3226 if (index == 0 && left_path) {
3227 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3228
3229 /*
3230 * This typically means that the record
3231 * started in the left path but moved to the
3232 * right as a result of rotation. We either
3233 * move the existing record to the left, or we
3234 * do the later insert there.
3235 *
3236 * In this case, the left path should always
3237 * exist as the rotate code will have passed
3238 * it back for a post-insert update.
3239 */
3240
3241 if (split == SPLIT_LEFT) {
3242 /*
3243 * It's a left split. Since we know
3244 * that the rotate code gave us an
3245 * empty extent in the left path, we
3246 * can just do the insert there.
3247 */
3248 insert_el = left_el;
3249 } else {
3250 /*
3251 * Right split - we have to move the
3252 * existing record over to the left
3253 * leaf. The insert will be into the
3254 * newly created empty extent in the
3255 * right leaf.
3256 */
3257 tmprec = &right_el->l_recs[index];
3258 ocfs2_rotate_leaf(left_el, tmprec);
3259 el = left_el;
3260
3261 memset(tmprec, 0, sizeof(*tmprec));
3262 index = ocfs2_search_extent_list(left_el, cpos);
3263 BUG_ON(index == -1);
3264 }
3265 }
3266 } else {
3267 BUG_ON(!left_path);
3268 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3269 /*
3270 * Left path is easy - we can just allow the insert to
3271 * happen.
3272 */
3273 el = left_el;
3274 insert_el = left_el;
3275 index = ocfs2_search_extent_list(el, cpos);
3276 BUG_ON(index == -1);
3277 }
3278
3279 rec = &el->l_recs[index];
3280 ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3281 ocfs2_rotate_leaf(insert_el, split_rec);
3282}
3283
1934/* 3284/*
1935 * This function only does inserts on an allocation b-tree. For dinode 3285 * This function only does inserts on an allocation b-tree. For dinode
1936 * lists, ocfs2_insert_at_leaf() is called directly. 3286 * lists, ocfs2_insert_at_leaf() is called directly.
@@ -1948,7 +3298,6 @@ static int ocfs2_insert_path(struct inode *inode,
1948{ 3298{
1949 int ret, subtree_index; 3299 int ret, subtree_index;
1950 struct buffer_head *leaf_bh = path_leaf_bh(right_path); 3300 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
1951 struct ocfs2_extent_list *el;
1952 3301
1953 /* 3302 /*
1954 * Pass both paths to the journal. The majority of inserts 3303 * Pass both paths to the journal. The majority of inserts
@@ -1984,9 +3333,18 @@ static int ocfs2_insert_path(struct inode *inode,
1984 } 3333 }
1985 } 3334 }
1986 3335
1987 el = path_leaf_el(right_path); 3336 if (insert->ins_split != SPLIT_NONE) {
3337 /*
3338 * We could call ocfs2_insert_at_leaf() for some types
3339 * of splits, but it's easier to just let one separate
3340 * function sort it all out.
3341 */
3342 ocfs2_split_record(inode, left_path, right_path,
3343 insert_rec, insert->ins_split);
3344 } else
3345 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3346 insert, inode);
1988 3347
1989 ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
1990 ret = ocfs2_journal_dirty(handle, leaf_bh); 3348 ret = ocfs2_journal_dirty(handle, leaf_bh);
1991 if (ret) 3349 if (ret)
1992 mlog_errno(ret); 3350 mlog_errno(ret);
@@ -2075,7 +3433,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
2075 * can wind up skipping both of these two special cases... 3433 * can wind up skipping both of these two special cases...
2076 */ 3434 */
2077 if (rotate) { 3435 if (rotate) {
2078 ret = ocfs2_rotate_tree_right(inode, handle, 3436 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
2079 le32_to_cpu(insert_rec->e_cpos), 3437 le32_to_cpu(insert_rec->e_cpos),
2080 right_path, &left_path); 3438 right_path, &left_path);
2081 if (ret) { 3439 if (ret) {
@@ -2100,8 +3458,9 @@ static int ocfs2_do_insert_extent(struct inode *inode,
2100 } 3458 }
2101 3459
2102out_update_clusters: 3460out_update_clusters:
2103 ocfs2_update_dinode_clusters(inode, di, 3461 if (type->ins_split == SPLIT_NONE)
2104 le16_to_cpu(insert_rec->e_leaf_clusters)); 3462 ocfs2_update_dinode_clusters(inode, di,
3463 le16_to_cpu(insert_rec->e_leaf_clusters));
2105 3464
2106 ret = ocfs2_journal_dirty(handle, di_bh); 3465 ret = ocfs2_journal_dirty(handle, di_bh);
2107 if (ret) 3466 if (ret)
@@ -2114,6 +3473,44 @@ out:
2114 return ret; 3473 return ret;
2115} 3474}
2116 3475
3476static enum ocfs2_contig_type
3477ocfs2_figure_merge_contig_type(struct inode *inode,
3478 struct ocfs2_extent_list *el, int index,
3479 struct ocfs2_extent_rec *split_rec)
3480{
3481 struct ocfs2_extent_rec *rec;
3482 enum ocfs2_contig_type ret = CONTIG_NONE;
3483
3484 /*
3485 * We're careful to check for an empty extent record here -
3486 * the merge code will know what to do if it sees one.
3487 */
3488
3489 if (index > 0) {
3490 rec = &el->l_recs[index - 1];
3491 if (index == 1 && ocfs2_is_empty_extent(rec)) {
3492 if (split_rec->e_cpos == el->l_recs[index].e_cpos)
3493 ret = CONTIG_RIGHT;
3494 } else {
3495 ret = ocfs2_extent_contig(inode, rec, split_rec);
3496 }
3497 }
3498
3499 if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) {
3500 enum ocfs2_contig_type contig_type;
3501
3502 rec = &el->l_recs[index + 1];
3503 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
3504
3505 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
3506 ret = CONTIG_LEFTRIGHT;
3507 else if (ret == CONTIG_NONE)
3508 ret = contig_type;
3509 }
3510
3511 return ret;
3512}
3513
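
ocfs2_figure_merge_contig_type() above decides whether the split record can be glued to the record before it, after it, or both (CONTIG_LEFTRIGHT). A simplified host-endian sketch of the adjacency test, with made-up demo_* names and a fixed blocks-per-cluster ratio instead of ocfs2_extent_contig()'s little-endian checks; note that the kernel names the result relative to the existing record, so its CONTIG_RIGHT corresponds to merging with the previous record (DEMO_MERGE_PREV here):

#include <stdint.h>
#include <stdio.h>

#define BLOCKS_PER_CLUSTER 8	/* assumption for the example */

struct demo_rec {
	uint32_t cpos;
	uint64_t blkno;
	uint16_t clusters;
	uint8_t  flags;
};

enum demo_merge { DEMO_MERGE_NONE, DEMO_MERGE_PREV, DEMO_MERGE_NEXT, DEMO_MERGE_BOTH };

/* Does 'right' begin exactly where 'left' ends, both logically and on disk? */
static int demo_adjacent(const struct demo_rec *left, const struct demo_rec *right)
{
	return left->flags == right->flags &&
	       left->cpos + left->clusters == right->cpos &&
	       left->blkno + (uint64_t)left->clusters * BLOCKS_PER_CLUSTER ==
	       right->blkno;
}

static enum demo_merge demo_merge_contig(const struct demo_rec *prev,
					 const struct demo_rec *split,
					 const struct demo_rec *next)
{
	int merge_prev = prev && demo_adjacent(prev, split);
	int merge_next = next && demo_adjacent(split, next);

	if (merge_prev && merge_next)
		return DEMO_MERGE_BOTH;		/* the CONTIG_LEFTRIGHT case */
	if (merge_prev)
		return DEMO_MERGE_PREV;		/* split joins the tail of prev */
	if (merge_next)
		return DEMO_MERGE_NEXT;		/* split joins the head of next */
	return DEMO_MERGE_NONE;
}

int main(void)
{
	struct demo_rec prev  = { .cpos = 0, .blkno = 100, .clusters = 4, .flags = 0 };
	struct demo_rec split = { .cpos = 4, .blkno = 132, .clusters = 2, .flags = 0 };
	struct demo_rec next  = { .cpos = 6, .blkno = 148, .clusters = 8, .flags = 0 };

	printf("%d\n", demo_merge_contig(&prev, &split, &next)); /* DEMO_MERGE_BOTH */
	return 0;
}
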
2117static void ocfs2_figure_contig_type(struct inode *inode, 3514static void ocfs2_figure_contig_type(struct inode *inode,
2118 struct ocfs2_insert_type *insert, 3515 struct ocfs2_insert_type *insert,
2119 struct ocfs2_extent_list *el, 3516 struct ocfs2_extent_list *el,
@@ -2205,6 +3602,8 @@ static int ocfs2_figure_insert_type(struct inode *inode,
2205 struct ocfs2_path *path = NULL; 3602 struct ocfs2_path *path = NULL;
2206 struct buffer_head *bh = NULL; 3603 struct buffer_head *bh = NULL;
2207 3604
3605 insert->ins_split = SPLIT_NONE;
3606
2208 el = &di->id2.i_list; 3607 el = &di->id2.i_list;
2209 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); 3608 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
2210 3609
@@ -2327,9 +3726,10 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2327 u32 cpos, 3726 u32 cpos,
2328 u64 start_blk, 3727 u64 start_blk,
2329 u32 new_clusters, 3728 u32 new_clusters,
3729 u8 flags,
2330 struct ocfs2_alloc_context *meta_ac) 3730 struct ocfs2_alloc_context *meta_ac)
2331{ 3731{
2332 int status, shift; 3732 int status;
2333 struct buffer_head *last_eb_bh = NULL; 3733 struct buffer_head *last_eb_bh = NULL;
2334 struct buffer_head *bh = NULL; 3734 struct buffer_head *bh = NULL;
2335 struct ocfs2_insert_type insert = {0, }; 3735 struct ocfs2_insert_type insert = {0, };
@@ -2350,6 +3750,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2350 rec.e_cpos = cpu_to_le32(cpos); 3750 rec.e_cpos = cpu_to_le32(cpos);
2351 rec.e_blkno = cpu_to_le64(start_blk); 3751 rec.e_blkno = cpu_to_le64(start_blk);
2352 rec.e_leaf_clusters = cpu_to_le16(new_clusters); 3752 rec.e_leaf_clusters = cpu_to_le16(new_clusters);
3753 rec.e_flags = flags;
2353 3754
2354 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, 3755 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
2355 &insert); 3756 &insert);
@@ -2364,55 +3765,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2364 insert.ins_appending, insert.ins_contig, insert.ins_contig_index, 3765 insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
2365 insert.ins_free_records, insert.ins_tree_depth); 3766 insert.ins_free_records, insert.ins_tree_depth);
2366 3767
2367 /* 3768 if (insert.ins_contig == CONTIG_NONE && insert.ins_free_records == 0) {
2368 * Avoid growing the tree unless we're out of records and the 3769 status = ocfs2_grow_tree(inode, handle, fe_bh,
2369	 * insert type requires one. 3770				 &insert.ins_tree_depth, &last_eb_bh,
2370 */ 3771 meta_ac);
2371 if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records) 3772 if (status) {
2372 goto out_add;
2373
2374 shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
2375 if (shift < 0) {
2376 status = shift;
2377 mlog_errno(status);
2378 goto bail;
2379 }
2380
2381 /* We traveled all the way to the bottom of the allocation tree
2382 * and didn't find room for any more extents - we need to add
2383 * another tree level */
2384 if (shift) {
2385 BUG_ON(bh);
2386 mlog(0, "need to shift tree depth "
2387 "(current = %d)\n", insert.ins_tree_depth);
2388
2389 /* ocfs2_shift_tree_depth will return us a buffer with
2390 * the new extent block (so we can pass that to
2391 * ocfs2_add_branch). */
2392 status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
2393 meta_ac, &bh);
2394 if (status < 0) {
2395 mlog_errno(status); 3773 mlog_errno(status);
2396 goto bail; 3774 goto bail;
2397 } 3775 }
2398 insert.ins_tree_depth++;
2399 /* Special case: we have room now if we shifted from
2400 * tree_depth 0 */
2401 if (insert.ins_tree_depth == 1)
2402 goto out_add;
2403 }
2404
2405 /* call ocfs2_add_branch to add the final part of the tree with
2406 * the new data. */
2407 mlog(0, "add branch. bh = %p\n", bh);
2408 status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
2409 meta_ac);
2410 if (status < 0) {
2411 mlog_errno(status);
2412 goto bail;
2413 } 3776 }
2414 3777
2415out_add:
2416 /* Finally, we can add clusters. This might rotate the tree for us. */ 3778 /* Finally, we can add clusters. This might rotate the tree for us. */
2417 status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); 3779 status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
2418 if (status < 0) 3780 if (status < 0)
@@ -2431,7 +3793,720 @@ bail:
2431 return status; 3793 return status;
2432} 3794}
2433 3795
2434static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) 3796static void ocfs2_make_right_split_rec(struct super_block *sb,
3797 struct ocfs2_extent_rec *split_rec,
3798 u32 cpos,
3799 struct ocfs2_extent_rec *rec)
3800{
3801 u32 rec_cpos = le32_to_cpu(rec->e_cpos);
3802 u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
3803
3804 memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
3805
3806 split_rec->e_cpos = cpu_to_le32(cpos);
3807 split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
3808
3809 split_rec->e_blkno = rec->e_blkno;
3810 le64_add_cpu(&split_rec->e_blkno,
3811 ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
3812
3813 split_rec->e_flags = rec->e_flags;
3814}
3815
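
The arithmetic in ocfs2_make_right_split_rec() boils down to the following standalone sketch with made-up numbers; clusters_to_blocks() is a stand-in for ocfs2_clusters_to_blocks() assuming a hypothetical 4 blocks per cluster.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical conversion; the real code derives this from the superblock. */
static uint64_t clusters_to_blocks(uint32_t clusters) { return (uint64_t)clusters * 4; }

int main(void)
{
	/* Existing record: 8 clusters starting at cpos 100, physical block 5000. */
	uint32_t rec_cpos = 100, rec_clusters = 8, cpos = 103;
	uint64_t rec_blkno = 5000;
	uint32_t rec_range = rec_cpos + rec_clusters;		/* 108 */

	/* Right half covers [cpos, rec_range), offset into the same physical run. */
	uint32_t split_cpos = cpos;				/* 103 */
	uint16_t split_clusters = rec_range - cpos;		/* 5 */
	uint64_t split_blkno = rec_blkno +
			       clusters_to_blocks(cpos - rec_cpos); /* 5012 */

	printf("right split: cpos %u, %u clusters, blkno %llu\n",
	       split_cpos, split_clusters, (unsigned long long)split_blkno);
	return 0;
}
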
3816static int ocfs2_split_and_insert(struct inode *inode,
3817 handle_t *handle,
3818 struct ocfs2_path *path,
3819 struct buffer_head *di_bh,
3820 struct buffer_head **last_eb_bh,
3821 int split_index,
3822 struct ocfs2_extent_rec *orig_split_rec,
3823 struct ocfs2_alloc_context *meta_ac)
3824{
3825 int ret = 0, depth;
3826 unsigned int insert_range, rec_range, do_leftright = 0;
3827 struct ocfs2_extent_rec tmprec;
3828 struct ocfs2_extent_list *rightmost_el;
3829 struct ocfs2_extent_rec rec;
3830 struct ocfs2_extent_rec split_rec = *orig_split_rec;
3831 struct ocfs2_insert_type insert;
3832 struct ocfs2_extent_block *eb;
3833 struct ocfs2_dinode *di;
3834
3835leftright:
3836 /*
3837 * Store a copy of the record on the stack - it might move
3838 * around as the tree is manipulated below.
3839 */
3840 rec = path_leaf_el(path)->l_recs[split_index];
3841
3842 di = (struct ocfs2_dinode *)di_bh->b_data;
3843 rightmost_el = &di->id2.i_list;
3844
3845 depth = le16_to_cpu(rightmost_el->l_tree_depth);
3846 if (depth) {
3847 BUG_ON(!(*last_eb_bh));
3848 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
3849 rightmost_el = &eb->h_list;
3850 }
3851
3852 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
3853 le16_to_cpu(rightmost_el->l_count)) {
3854 int old_depth = depth;
3855
3856 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
3857 meta_ac);
3858 if (ret) {
3859 mlog_errno(ret);
3860 goto out;
3861 }
3862
3863 if (old_depth != depth) {
3864 eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
3865 rightmost_el = &eb->h_list;
3866 }
3867 }
3868
3869 memset(&insert, 0, sizeof(struct ocfs2_insert_type));
3870 insert.ins_appending = APPEND_NONE;
3871 insert.ins_contig = CONTIG_NONE;
3872 insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
3873 - le16_to_cpu(rightmost_el->l_next_free_rec);
3874 insert.ins_tree_depth = depth;
3875
3876 insert_range = le32_to_cpu(split_rec.e_cpos) +
3877 le16_to_cpu(split_rec.e_leaf_clusters);
3878 rec_range = le32_to_cpu(rec.e_cpos) +
3879 le16_to_cpu(rec.e_leaf_clusters);
3880
3881 if (split_rec.e_cpos == rec.e_cpos) {
3882 insert.ins_split = SPLIT_LEFT;
3883 } else if (insert_range == rec_range) {
3884 insert.ins_split = SPLIT_RIGHT;
3885 } else {
3886 /*
3887 * Left/right split. We fake this as a right split
3888 * first and then make a second pass as a left split.
3889 */
3890 insert.ins_split = SPLIT_RIGHT;
3891
3892 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
3893 &rec);
3894
3895 split_rec = tmprec;
3896
3897 BUG_ON(do_leftright);
3898 do_leftright = 1;
3899 }
3900
3901 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec,
3902 &insert);
3903 if (ret) {
3904 mlog_errno(ret);
3905 goto out;
3906 }
3907
3908 if (do_leftright == 1) {
3909 u32 cpos;
3910 struct ocfs2_extent_list *el;
3911
3912 do_leftright++;
3913 split_rec = *orig_split_rec;
3914
3915 ocfs2_reinit_path(path, 1);
3916
3917 cpos = le32_to_cpu(split_rec.e_cpos);
3918 ret = ocfs2_find_path(inode, path, cpos);
3919 if (ret) {
3920 mlog_errno(ret);
3921 goto out;
3922 }
3923
3924 el = path_leaf_el(path);
3925 split_index = ocfs2_search_extent_list(el, cpos);
3926 goto leftright;
3927 }
3928out:
3929
3930 return ret;
3931}
3932
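
How ocfs2_split_and_insert() chooses between the split types above can be shown with plain integers; this toy classifier is only a sketch of the three branches, not the kernel logic with its tree manipulation.

#include <stdint.h>
#include <stdio.h>

enum split { S_LEFT, S_RIGHT, S_BOTH };

/* rec covers [rec_cpos, rec_end); split covers [split_cpos, split_end),
 * with the split range assumed to lie inside the record. */
static enum split classify(uint32_t rec_cpos, uint32_t rec_end,
			   uint32_t split_cpos, uint32_t split_end)
{
	if (split_cpos == rec_cpos)
		return S_LEFT;		/* shares the record's left edge */
	if (split_end == rec_end)
		return S_RIGHT;		/* shares the record's right edge */
	return S_BOTH;			/* interior: right split first, then a
					 * second pass as a left split */
}

int main(void)
{
	printf("%d %d %d\n",
	       classify(0, 16, 0, 4),	/* 0: S_LEFT  */
	       classify(0, 16, 12, 16),	/* 1: S_RIGHT */
	       classify(0, 16, 4, 12));	/* 2: S_BOTH  */
	return 0;
}
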
3933/*
3934 * Mark part or all of the extent record at split_index in the leaf
3935 * pointed to by path as written. This removes the unwritten
3936 * extent flag.
3937 *
3938 * Care is taken to handle contiguousness so as to not grow the tree.
3939 *
3940 * meta_ac is not strictly necessary - we only truly need it if growth
3941 * of the tree is required. All other cases will degrade into a less
3942 * optimal tree layout.
3943 *
3944 * last_eb_bh should be the rightmost leaf block for any inode with a
3945 * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call.
3946 *
3947 * This code is optimized for readability - several passes might be
3948 * made over certain portions of the tree. All of those blocks will
3949 * have been brought into cache (and pinned via the journal), so the
3950 * extra overhead is not expressed in terms of disk reads.
3951 */
3952static int __ocfs2_mark_extent_written(struct inode *inode,
3953 struct buffer_head *di_bh,
3954 handle_t *handle,
3955 struct ocfs2_path *path,
3956 int split_index,
3957 struct ocfs2_extent_rec *split_rec,
3958 struct ocfs2_alloc_context *meta_ac,
3959 struct ocfs2_cached_dealloc_ctxt *dealloc)
3960{
3961 int ret = 0;
3962 struct ocfs2_extent_list *el = path_leaf_el(path);
3963 struct buffer_head *eb_bh, *last_eb_bh = NULL;
3964 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3965 struct ocfs2_merge_ctxt ctxt;
3966 struct ocfs2_extent_list *rightmost_el;
3967
3968	if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
3969 ret = -EIO;
3970 mlog_errno(ret);
3971 goto out;
3972 }
3973
3974 if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
3975 ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
3976 (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
3977 ret = -EIO;
3978 mlog_errno(ret);
3979 goto out;
3980 }
3981
3982 eb_bh = path_leaf_bh(path);
3983 ret = ocfs2_journal_access(handle, inode, eb_bh,
3984 OCFS2_JOURNAL_ACCESS_WRITE);
3985 if (ret) {
3986 mlog_errno(ret);
3987 goto out;
3988 }
3989
3990 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el,
3991 split_index,
3992 split_rec);
3993
3994 /*
3995 * The core merge / split code wants to know how much room is
3996	 * left in this inode's allocation tree, so we pass the
3997 * rightmost extent list.
3998 */
3999 if (path->p_tree_depth) {
4000 struct ocfs2_extent_block *eb;
4001 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4002
4003 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4004 le64_to_cpu(di->i_last_eb_blk),
4005 &last_eb_bh, OCFS2_BH_CACHED, inode);
4006 if (ret) {
4007 mlog_exit(ret);
4008 goto out;
4009 }
4010
4011 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4012 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4013 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4014 ret = -EROFS;
4015 goto out;
4016 }
4017
4018 rightmost_el = &eb->h_list;
4019 } else
4020 rightmost_el = path_root_el(path);
4021
4022 ctxt.c_used_tail_recs = le16_to_cpu(rightmost_el->l_next_free_rec);
4023 if (ctxt.c_used_tail_recs > 0 &&
4024 ocfs2_is_empty_extent(&rightmost_el->l_recs[0]))
4025 ctxt.c_used_tail_recs--;
4026
4027 if (rec->e_cpos == split_rec->e_cpos &&
4028 rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4029 ctxt.c_split_covers_rec = 1;
4030 else
4031 ctxt.c_split_covers_rec = 0;
4032
4033 ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4034
4035 mlog(0, "index: %d, contig: %u, used_tail_recs: %u, "
4036 "has_empty: %u, split_covers: %u\n", split_index,
4037 ctxt.c_contig_type, ctxt.c_used_tail_recs,
4038 ctxt.c_has_empty_extent, ctxt.c_split_covers_rec);
4039
4040 if (ctxt.c_contig_type == CONTIG_NONE) {
4041 if (ctxt.c_split_covers_rec)
4042 el->l_recs[split_index] = *split_rec;
4043 else
4044 ret = ocfs2_split_and_insert(inode, handle, path, di_bh,
4045 &last_eb_bh, split_index,
4046 split_rec, meta_ac);
4047 if (ret)
4048 mlog_errno(ret);
4049 } else {
4050 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4051 split_index, split_rec,
4052 dealloc, &ctxt);
4053 if (ret)
4054 mlog_errno(ret);
4055 }
4056
4057 ocfs2_journal_dirty(handle, eb_bh);
4058
4059out:
4060 brelse(last_eb_bh);
4061 return ret;
4062}
4063
4064/*
4065 * Mark the already-existing extent at cpos as written for len clusters.
4066 *
4067 * If the existing extent is larger than the request, initiate a
4068 * split. An attempt will be made at merging with adjacent extents.
4069 *
4070 * The caller is responsible for passing down meta_ac if we'll need it.
4071 */
4072int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
4073 handle_t *handle, u32 cpos, u32 len, u32 phys,
4074 struct ocfs2_alloc_context *meta_ac,
4075 struct ocfs2_cached_dealloc_ctxt *dealloc)
4076{
4077 int ret, index;
4078 u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4079 struct ocfs2_extent_rec split_rec;
4080 struct ocfs2_path *left_path = NULL;
4081 struct ocfs2_extent_list *el;
4082
4083 mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4084 inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4085
4086 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4087 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4088 "that are being written to, but the feature bit "
4089 "is not set in the super block.",
4090 (unsigned long long)OCFS2_I(inode)->ip_blkno);
4091 ret = -EROFS;
4092 goto out;
4093 }
4094
4095 /*
4096 * XXX: This should be fixed up so that we just re-insert the
4097 * next extent records.
4098 */
4099 ocfs2_extent_map_trunc(inode, 0);
4100
4101 left_path = ocfs2_new_inode_path(di_bh);
4102 if (!left_path) {
4103 ret = -ENOMEM;
4104 mlog_errno(ret);
4105 goto out;
4106 }
4107
4108 ret = ocfs2_find_path(inode, left_path, cpos);
4109 if (ret) {
4110 mlog_errno(ret);
4111 goto out;
4112 }
4113 el = path_leaf_el(left_path);
4114
4115 index = ocfs2_search_extent_list(el, cpos);
4116 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4117 ocfs2_error(inode->i_sb,
4118 "Inode %llu has an extent at cpos %u which can no "
4119 "longer be found.\n",
4120 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4121 ret = -EROFS;
4122 goto out;
4123 }
4124
4125 memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4126 split_rec.e_cpos = cpu_to_le32(cpos);
4127 split_rec.e_leaf_clusters = cpu_to_le16(len);
4128 split_rec.e_blkno = cpu_to_le64(start_blkno);
4129 split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4130 split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4131
4132 ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path,
4133 index, &split_rec, meta_ac, dealloc);
4134 if (ret)
4135 mlog_errno(ret);
4136
4137out:
4138 ocfs2_free_path(left_path);
4139 return ret;
4140}
4141
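
Conceptually, marking a sub-range of an unwritten extent as written turns one record into as many as three. Below is a userspace sketch of just the record bookkeeping, with simplified types and no tree, journal, or merging.

#include <stdint.h>
#include <stdio.h>

struct rec { uint32_t cpos, clusters; int unwritten; };

int main(void)
{
	struct rec orig = { .cpos = 0, .clusters = 16, .unwritten = 1 };
	uint32_t cpos = 4, len = 8;		/* range being marked written */
	struct rec out[3];
	int n = 0;

	if (cpos > orig.cpos)			/* left remainder stays unwritten */
		out[n++] = (struct rec){ orig.cpos, cpos - orig.cpos, 1 };
	out[n++] = (struct rec){ cpos, len, 0 };	/* the newly written range */
	if (cpos + len < orig.cpos + orig.clusters)	/* right remainder */
		out[n++] = (struct rec){ cpos + len,
					 orig.cpos + orig.clusters - (cpos + len), 1 };

	for (int i = 0; i < n; i++)
		printf("rec %d: cpos %u len %u %s\n", i, out[i].cpos,
		       out[i].clusters, out[i].unwritten ? "unwritten" : "written");
	return 0;
}
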
4142static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4143 handle_t *handle, struct ocfs2_path *path,
4144 int index, u32 new_range,
4145 struct ocfs2_alloc_context *meta_ac)
4146{
4147 int ret, depth, credits = handle->h_buffer_credits;
4148 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4149 struct buffer_head *last_eb_bh = NULL;
4150 struct ocfs2_extent_block *eb;
4151 struct ocfs2_extent_list *rightmost_el, *el;
4152 struct ocfs2_extent_rec split_rec;
4153 struct ocfs2_extent_rec *rec;
4154 struct ocfs2_insert_type insert;
4155
4156 /*
4157 * Setup the record to split before we grow the tree.
4158 */
4159 el = path_leaf_el(path);
4160 rec = &el->l_recs[index];
4161 ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
4162
4163 depth = path->p_tree_depth;
4164 if (depth > 0) {
4165 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4166 le64_to_cpu(di->i_last_eb_blk),
4167 &last_eb_bh, OCFS2_BH_CACHED, inode);
4168 if (ret < 0) {
4169 mlog_errno(ret);
4170 goto out;
4171 }
4172
4173 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4174 rightmost_el = &eb->h_list;
4175 } else
4176 rightmost_el = path_leaf_el(path);
4177
4178 credits += path->p_tree_depth + ocfs2_extend_meta_needed(di);
4179 ret = ocfs2_extend_trans(handle, credits);
4180 if (ret) {
4181 mlog_errno(ret);
4182 goto out;
4183 }
4184
4185 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4186 le16_to_cpu(rightmost_el->l_count)) {
4187 int old_depth = depth;
4188
4189 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
4190 meta_ac);
4191 if (ret) {
4192 mlog_errno(ret);
4193 goto out;
4194 }
4195
4196 if (old_depth != depth) {
4197 eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
4198 rightmost_el = &eb->h_list;
4199 }
4200 }
4201
4202 memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4203 insert.ins_appending = APPEND_NONE;
4204 insert.ins_contig = CONTIG_NONE;
4205 insert.ins_split = SPLIT_RIGHT;
4206 insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
4207 - le16_to_cpu(rightmost_el->l_next_free_rec);
4208 insert.ins_tree_depth = depth;
4209
4210 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
4211 if (ret)
4212 mlog_errno(ret);
4213
4214out:
4215 brelse(last_eb_bh);
4216 return ret;
4217}
4218
4219static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4220 struct ocfs2_path *path, int index,
4221 struct ocfs2_cached_dealloc_ctxt *dealloc,
4222 u32 cpos, u32 len)
4223{
4224 int ret;
4225 u32 left_cpos, rec_range, trunc_range;
4226 int wants_rotate = 0, is_rightmost_tree_rec = 0;
4227 struct super_block *sb = inode->i_sb;
4228 struct ocfs2_path *left_path = NULL;
4229 struct ocfs2_extent_list *el = path_leaf_el(path);
4230 struct ocfs2_extent_rec *rec;
4231 struct ocfs2_extent_block *eb;
4232
4233 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
4234 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
4235 if (ret) {
4236 mlog_errno(ret);
4237 goto out;
4238 }
4239
4240 index--;
4241 }
4242
4243 if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
4244 path->p_tree_depth) {
4245 /*
4246 * Check whether this is the rightmost tree record. If
4247 * we remove all of this record or part of its right
4248 * edge then an update of the record lengths above it
4249 * will be required.
4250 */
4251 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
4252 if (eb->h_next_leaf_blk == 0)
4253 is_rightmost_tree_rec = 1;
4254 }
4255
4256 rec = &el->l_recs[index];
4257 if (index == 0 && path->p_tree_depth &&
4258 le32_to_cpu(rec->e_cpos) == cpos) {
4259 /*
4260 * Changing the leftmost offset (via partial or whole
4261 * record truncate) of an interior (or rightmost) path
4262 * means we have to update the subtree that is formed
4263	 * by this leaf and the one to its left.
4264 *
4265 * There are two cases we can skip:
4266 * 1) Path is the leftmost one in our inode tree.
4267 * 2) The leaf is rightmost and will be empty after
4268 * we remove the extent record - the rotate code
4269 * knows how to update the newly formed edge.
4270 */
4271
4272 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
4273 &left_cpos);
4274 if (ret) {
4275 mlog_errno(ret);
4276 goto out;
4277 }
4278
4279 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
4280 left_path = ocfs2_new_path(path_root_bh(path),
4281 path_root_el(path));
4282 if (!left_path) {
4283 ret = -ENOMEM;
4284 mlog_errno(ret);
4285 goto out;
4286 }
4287
4288 ret = ocfs2_find_path(inode, left_path, left_cpos);
4289 if (ret) {
4290 mlog_errno(ret);
4291 goto out;
4292 }
4293 }
4294 }
4295
4296 ret = ocfs2_extend_rotate_transaction(handle, 0,
4297 handle->h_buffer_credits,
4298 path);
4299 if (ret) {
4300 mlog_errno(ret);
4301 goto out;
4302 }
4303
4304 ret = ocfs2_journal_access_path(inode, handle, path);
4305 if (ret) {
4306 mlog_errno(ret);
4307 goto out;
4308 }
4309
4310 ret = ocfs2_journal_access_path(inode, handle, left_path);
4311 if (ret) {
4312 mlog_errno(ret);
4313 goto out;
4314 }
4315
4316 rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
4317 trunc_range = cpos + len;
4318
4319 if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
4320 int next_free;
4321
4322 memset(rec, 0, sizeof(*rec));
4323 ocfs2_cleanup_merge(el, index);
4324 wants_rotate = 1;
4325
4326 next_free = le16_to_cpu(el->l_next_free_rec);
4327 if (is_rightmost_tree_rec && next_free > 1) {
4328 /*
4329 * We skip the edge update if this path will
4330 * be deleted by the rotate code.
4331 */
4332 rec = &el->l_recs[next_free - 1];
4333 ocfs2_adjust_rightmost_records(inode, handle, path,
4334 rec);
4335 }
4336 } else if (le32_to_cpu(rec->e_cpos) == cpos) {
4337 /* Remove leftmost portion of the record. */
4338 le32_add_cpu(&rec->e_cpos, len);
4339 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
4340 le16_add_cpu(&rec->e_leaf_clusters, -len);
4341 } else if (rec_range == trunc_range) {
4342 /* Remove rightmost portion of the record */
4343 le16_add_cpu(&rec->e_leaf_clusters, -len);
4344 if (is_rightmost_tree_rec)
4345 ocfs2_adjust_rightmost_records(inode, handle, path, rec);
4346 } else {
4347 /* Caller should have trapped this. */
4348 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
4349 "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
4350 le32_to_cpu(rec->e_cpos),
4351 le16_to_cpu(rec->e_leaf_clusters), cpos, len);
4352 BUG();
4353 }
4354
4355 if (left_path) {
4356 int subtree_index;
4357
4358 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
4359 ocfs2_complete_edge_insert(inode, handle, left_path, path,
4360 subtree_index);
4361 }
4362
4363 ocfs2_journal_dirty(handle, path_leaf_bh(path));
4364
4365 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
4366 if (ret) {
4367 mlog_errno(ret);
4368 goto out;
4369 }
4370
4371out:
4372 ocfs2_free_path(left_path);
4373 return ret;
4374}
4375
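
The three in-place adjustments in ocfs2_truncate_rec() are just record arithmetic; a sketch with plain integers, where a hypothetical 4-blocks-per-cluster conversion stands in for ocfs2_clusters_to_blocks().

#include <stdint.h>
#include <stdio.h>

struct rec { uint32_t cpos, clusters; uint64_t blkno; };

static uint64_t clusters_to_blocks(uint32_t c) { return (uint64_t)c * 4; }

int main(void)
{
	struct rec r = { .cpos = 200, .clusters = 10, .blkno = 8000 };
	uint32_t cpos = 200, len = 4;	/* truncating the leftmost 4 clusters */

	if (cpos == r.cpos && len == r.clusters) {
		r = (struct rec){ 0 };			/* whole record goes away */
	} else if (cpos == r.cpos) {
		r.cpos += len;				/* trim the left edge */
		r.blkno += clusters_to_blocks(len);
		r.clusters -= len;
	} else if (cpos + len == r.cpos + r.clusters) {
		r.clusters -= len;			/* trim the right edge */
	}

	printf("rec: cpos %u, %u clusters, blkno %llu\n",
	       r.cpos, r.clusters, (unsigned long long)r.blkno);
	return 0;
}
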
4376int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
4377 u32 cpos, u32 len, handle_t *handle,
4378 struct ocfs2_alloc_context *meta_ac,
4379 struct ocfs2_cached_dealloc_ctxt *dealloc)
4380{
4381 int ret, index;
4382 u32 rec_range, trunc_range;
4383 struct ocfs2_extent_rec *rec;
4384 struct ocfs2_extent_list *el;
4385 struct ocfs2_path *path;
4386
4387 ocfs2_extent_map_trunc(inode, 0);
4388
4389 path = ocfs2_new_inode_path(di_bh);
4390 if (!path) {
4391 ret = -ENOMEM;
4392 mlog_errno(ret);
4393 goto out;
4394 }
4395
4396 ret = ocfs2_find_path(inode, path, cpos);
4397 if (ret) {
4398 mlog_errno(ret);
4399 goto out;
4400 }
4401
4402 el = path_leaf_el(path);
4403 index = ocfs2_search_extent_list(el, cpos);
4404 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4405 ocfs2_error(inode->i_sb,
4406 "Inode %llu has an extent at cpos %u which can no "
4407 "longer be found.\n",
4408 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4409 ret = -EROFS;
4410 goto out;
4411 }
4412
4413 /*
4414 * We have 3 cases of extent removal:
4415 * 1) Range covers the entire extent rec
4416 * 2) Range begins or ends on one edge of the extent rec
4417 * 3) Range is in the middle of the extent rec (no shared edges)
4418 *
4419 * For case 1 we remove the extent rec and left rotate to
4420 * fill the hole.
4421 *
4422 * For case 2 we just shrink the existing extent rec, with a
4423 * tree update if the shrinking edge is also the edge of an
4424 * extent block.
4425 *
4426 * For case 3 we do a right split to turn the extent rec into
4427 * something case 2 can handle.
4428 */
4429 rec = &el->l_recs[index];
4430 rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
4431 trunc_range = cpos + len;
4432
4433 BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
4434
4435 mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
4436 "(cpos %u, len %u)\n",
4437 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
4438 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
4439
4440 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
4441 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4442 cpos, len);
4443 if (ret) {
4444 mlog_errno(ret);
4445 goto out;
4446 }
4447 } else {
4448 ret = ocfs2_split_tree(inode, di_bh, handle, path, index,
4449 trunc_range, meta_ac);
4450 if (ret) {
4451 mlog_errno(ret);
4452 goto out;
4453 }
4454
4455 /*
4456 * The split could have manipulated the tree enough to
4457 * move the record location, so we have to look for it again.
4458 */
4459 ocfs2_reinit_path(path, 1);
4460
4461 ret = ocfs2_find_path(inode, path, cpos);
4462 if (ret) {
4463 mlog_errno(ret);
4464 goto out;
4465 }
4466
4467 el = path_leaf_el(path);
4468 index = ocfs2_search_extent_list(el, cpos);
4469 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4470 ocfs2_error(inode->i_sb,
4471 "Inode %llu: split at cpos %u lost record.",
4472 (unsigned long long)OCFS2_I(inode)->ip_blkno,
4473 cpos);
4474 ret = -EROFS;
4475 goto out;
4476 }
4477
4478 /*
4479 * Double check our values here. If anything is fishy,
4480 * it's easier to catch it at the top level.
4481 */
4482 rec = &el->l_recs[index];
4483 rec_range = le32_to_cpu(rec->e_cpos) +
4484 ocfs2_rec_clusters(el, rec);
4485 if (rec_range != trunc_range) {
4486 ocfs2_error(inode->i_sb,
4487 "Inode %llu: error after split at cpos %u"
4488				    " trunc len %u, existing record is (%u,%u)",
4489 (unsigned long long)OCFS2_I(inode)->ip_blkno,
4490 cpos, len, le32_to_cpu(rec->e_cpos),
4491 ocfs2_rec_clusters(el, rec));
4492 ret = -EROFS;
4493 goto out;
4494 }
4495
4496 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4497 cpos, len);
4498 if (ret) {
4499 mlog_errno(ret);
4500 goto out;
4501 }
4502 }
4503
4504out:
4505 ocfs2_free_path(path);
4506 return ret;
4507}
4508
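
The three removal cases described in ocfs2_remove_extent() reduce to a simple range comparison. A standalone sketch of the dispatch only; the real code then calls ocfs2_truncate_rec() or ocfs2_split_tree().

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Existing record and the range being removed, in clusters. */
	uint32_t rec_cpos = 0, rec_clusters = 16;
	uint32_t cpos = 4, len = 8;
	uint32_t rec_range = rec_cpos + rec_clusters;
	uint32_t trunc_range = cpos + len;

	if (cpos == rec_cpos && trunc_range == rec_range)
		printf("case 1: remove the whole record, rotate left\n");
	else if (cpos == rec_cpos || trunc_range == rec_range)
		printf("case 2: shrink the record in place\n");
	else
		printf("case 3: split at %u first, then handle as case 2\n",
		       trunc_range);
	return 0;
}
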
4509int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
2435{ 4510{
2436 struct buffer_head *tl_bh = osb->osb_tl_bh; 4511 struct buffer_head *tl_bh = osb->osb_tl_bh;
2437 struct ocfs2_dinode *di; 4512 struct ocfs2_dinode *di;
@@ -2464,10 +4539,10 @@ static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
2464 return current_tail == new_start; 4539 return current_tail == new_start;
2465} 4540}
2466 4541
2467static int ocfs2_truncate_log_append(struct ocfs2_super *osb, 4542int ocfs2_truncate_log_append(struct ocfs2_super *osb,
2468 handle_t *handle, 4543 handle_t *handle,
2469 u64 start_blk, 4544 u64 start_blk,
2470 unsigned int num_clusters) 4545 unsigned int num_clusters)
2471{ 4546{
2472 int status, index; 4547 int status, index;
2473 unsigned int start_cluster, tl_count; 4548 unsigned int start_cluster, tl_count;
@@ -2623,7 +4698,7 @@ bail:
2623} 4698}
2624 4699
2625/* Expects you to already be holding tl_inode->i_mutex */ 4700/* Expects you to already be holding tl_inode->i_mutex */
2626static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) 4701int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
2627{ 4702{
2628 int status; 4703 int status;
2629 unsigned int num_to_flush; 4704 unsigned int num_to_flush;
@@ -2957,6 +5032,219 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
2957 return status; 5032 return status;
2958} 5033}
2959 5034
5035/*
5036 * Delayed de-allocation of suballocator blocks.
5037 *
5038 * Some sets of block de-allocations might involve multiple suballocator inodes.
5039 *
5040 * The locking for this can get extremely complicated, especially when
5041 * the suballocator inodes to delete from aren't known until deep
5042 * within an unrelated codepath.
5043 *
5044 * ocfs2_extent_block structures are a good example of this - an inode
5045 * btree could have been grown by any number of nodes each allocating
5046 * out of their own suballoc inode.
5047 *
5048 * These structures allow the delay of block de-allocation until a
5049 * later time, when locking of multiple cluster inodes won't cause
5050 * deadlock.
5051 */
5052
5053/*
5054 * Describes a single block free from a suballocator
5055 */
5056struct ocfs2_cached_block_free {
5057 struct ocfs2_cached_block_free *free_next;
5058 u64 free_blk;
5059 unsigned int free_bit;
5060};
5061
5062struct ocfs2_per_slot_free_list {
5063 struct ocfs2_per_slot_free_list *f_next_suballocator;
5064 int f_inode_type;
5065 int f_slot;
5066 struct ocfs2_cached_block_free *f_first;
5067};
5068
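
A userspace model of this bookkeeping: queued frees are grouped per slot and drained in one pass each. The names and types here are simplified stand-ins for the structures above, and printf takes the place of the journalled bitmap updates done under the suballocator inode's locks.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct bfree { struct bfree *next; uint64_t blk; unsigned bit; };
struct slotlist { struct slotlist *next; int slot; struct bfree *first; };

static struct slotlist *find_slot(struct slotlist **head, int slot)
{
	struct slotlist *l;

	for (l = *head; l; l = l->next)
		if (l->slot == slot)
			return l;
	l = calloc(1, sizeof(*l));		/* new per-slot list, pushed to front */
	if (l) {
		l->slot = slot;
		l->next = *head;
		*head = l;
	}
	return l;
}

static int queue_free(struct slotlist **head, int slot, uint64_t blk, unsigned bit)
{
	struct slotlist *l = find_slot(head, slot);
	struct bfree *b = l ? malloc(sizeof(*b)) : NULL;

	if (!b)
		return -1;
	b->blk = blk;
	b->bit = bit;
	b->next = l->first;
	l->first = b;
	return 0;
}

int main(void)
{
	struct slotlist *head = NULL, *l;

	queue_free(&head, 0, 5000, 3);	/* e.g. an extent block from slot 0 */
	queue_free(&head, 1, 9000, 7);
	queue_free(&head, 0, 5004, 4);

	/* "Run" the deallocations: one pass per suballocator slot. */
	while ((l = head) != NULL) {
		struct bfree *b;

		printf("slot %d:\n", l->slot);
		while ((b = l->first) != NULL) {
			printf("  free blk %llu bit %u\n",
			       (unsigned long long)b->blk, b->bit);
			l->first = b->next;
			free(b);
		}
		head = l->next;
		free(l);
	}
	return 0;
}
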
5069static int ocfs2_free_cached_items(struct ocfs2_super *osb,
5070 int sysfile_type,
5071 int slot,
5072 struct ocfs2_cached_block_free *head)
5073{
5074 int ret;
5075 u64 bg_blkno;
5076 handle_t *handle;
5077 struct inode *inode;
5078 struct buffer_head *di_bh = NULL;
5079 struct ocfs2_cached_block_free *tmp;
5080
5081 inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
5082 if (!inode) {
5083 ret = -EINVAL;
5084 mlog_errno(ret);
5085 goto out;
5086 }
5087
5088 mutex_lock(&inode->i_mutex);
5089
5090 ret = ocfs2_meta_lock(inode, &di_bh, 1);
5091 if (ret) {
5092 mlog_errno(ret);
5093 goto out_mutex;
5094 }
5095
5096 handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
5097 if (IS_ERR(handle)) {
5098 ret = PTR_ERR(handle);
5099 mlog_errno(ret);
5100 goto out_unlock;
5101 }
5102
5103 while (head) {
5104 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
5105 head->free_bit);
5106 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
5107 head->free_bit, (unsigned long long)head->free_blk);
5108
5109 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
5110 head->free_bit, bg_blkno, 1);
5111 if (ret) {
5112 mlog_errno(ret);
5113 goto out_journal;
5114 }
5115
5116 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
5117 if (ret) {
5118 mlog_errno(ret);
5119 goto out_journal;
5120 }
5121
5122 tmp = head;
5123 head = head->free_next;
5124 kfree(tmp);
5125 }
5126
5127out_journal:
5128 ocfs2_commit_trans(osb, handle);
5129
5130out_unlock:
5131 ocfs2_meta_unlock(inode, 1);
5132 brelse(di_bh);
5133out_mutex:
5134 mutex_unlock(&inode->i_mutex);
5135 iput(inode);
5136out:
5137 while(head) {
5138 /* Premature exit may have left some dangling items. */
5139 tmp = head;
5140 head = head->free_next;
5141 kfree(tmp);
5142 }
5143
5144 return ret;
5145}
5146
5147int ocfs2_run_deallocs(struct ocfs2_super *osb,
5148 struct ocfs2_cached_dealloc_ctxt *ctxt)
5149{
5150 int ret = 0, ret2;
5151 struct ocfs2_per_slot_free_list *fl;
5152
5153 if (!ctxt)
5154 return 0;
5155
5156 while (ctxt->c_first_suballocator) {
5157 fl = ctxt->c_first_suballocator;
5158
5159 if (fl->f_first) {
5160 mlog(0, "Free items: (type %u, slot %d)\n",
5161 fl->f_inode_type, fl->f_slot);
5162 ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
5163 fl->f_slot, fl->f_first);
5164 if (ret2)
5165 mlog_errno(ret2);
5166 if (!ret)
5167 ret = ret2;
5168 }
5169
5170 ctxt->c_first_suballocator = fl->f_next_suballocator;
5171 kfree(fl);
5172 }
5173
5174 return ret;
5175}
5176
5177static struct ocfs2_per_slot_free_list *
5178ocfs2_find_per_slot_free_list(int type,
5179 int slot,
5180 struct ocfs2_cached_dealloc_ctxt *ctxt)
5181{
5182 struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
5183
5184 while (fl) {
5185 if (fl->f_inode_type == type && fl->f_slot == slot)
5186 return fl;
5187
5188 fl = fl->f_next_suballocator;
5189 }
5190
5191 fl = kmalloc(sizeof(*fl), GFP_NOFS);
5192 if (fl) {
5193 fl->f_inode_type = type;
5194 fl->f_slot = slot;
5195 fl->f_first = NULL;
5196 fl->f_next_suballocator = ctxt->c_first_suballocator;
5197
5198 ctxt->c_first_suballocator = fl;
5199 }
5200 return fl;
5201}
5202
5203static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
5204 int type, int slot, u64 blkno,
5205 unsigned int bit)
5206{
5207 int ret;
5208 struct ocfs2_per_slot_free_list *fl;
5209 struct ocfs2_cached_block_free *item;
5210
5211 fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
5212 if (fl == NULL) {
5213 ret = -ENOMEM;
5214 mlog_errno(ret);
5215 goto out;
5216 }
5217
5218 item = kmalloc(sizeof(*item), GFP_NOFS);
5219 if (item == NULL) {
5220 ret = -ENOMEM;
5221 mlog_errno(ret);
5222 goto out;
5223 }
5224
5225 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
5226 type, slot, bit, (unsigned long long)blkno);
5227
5228 item->free_blk = blkno;
5229 item->free_bit = bit;
5230 item->free_next = fl->f_first;
5231
5232 fl->f_first = item;
5233
5234 ret = 0;
5235out:
5236 return ret;
5237}
5238
5239static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
5240 struct ocfs2_extent_block *eb)
5241{
5242 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
5243 le16_to_cpu(eb->h_suballoc_slot),
5244 le64_to_cpu(eb->h_blkno),
5245 le16_to_cpu(eb->h_suballoc_bit));
5246}
5247
2960/* This function will figure out whether the currently last extent 5248/* This function will figure out whether the currently last extent
2961 * block will be deleted, and if it will, what the new last extent 5249 * block will be deleted, and if it will, what the new last extent
2962 * block will be so we can update its h_next_leaf_blk field, as well 5250
@@ -3238,27 +5526,10 @@ delete:
3238 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); 5526 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
3239 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno)); 5527 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
3240 5528
3241 if (le16_to_cpu(eb->h_suballoc_slot) == 0) { 5529 ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
3242 /* 5530 /* An error here is not fatal. */
3243 * This code only understands how to 5531 if (ret < 0)
3244 * lock the suballocator in slot 0, 5532 mlog_errno(ret);
3245 * which is fine because allocation is
3246 * only ever done out of that
3247 * suballocator too. A future version
3248 * might change that however, so avoid
3249 * a free if we don't know how to
3250 * handle it. This way an fs incompat
3251 * bit will not be necessary.
3252 */
3253 ret = ocfs2_free_extent_block(handle,
3254 tc->tc_ext_alloc_inode,
3255 tc->tc_ext_alloc_bh,
3256 eb);
3257
3258 /* An error here is not fatal. */
3259 if (ret < 0)
3260 mlog_errno(ret);
3261 }
3262 } else { 5533 } else {
3263 deleted_eb = 0; 5534 deleted_eb = 0;
3264 } 5535 }
@@ -3397,9 +5668,9 @@ static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
3397 return ocfs2_journal_dirty_data(handle, bh); 5668 return ocfs2_journal_dirty_data(handle, bh);
3398} 5669}
3399 5670
3400static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize, 5671static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
3401 struct page **pages, int numpages, 5672 loff_t end, struct page **pages,
3402 u64 phys, handle_t *handle) 5673 int numpages, u64 phys, handle_t *handle)
3403{ 5674{
3404 int i, ret, partial = 0; 5675 int i, ret, partial = 0;
3405 void *kaddr; 5676 void *kaddr;
@@ -3412,26 +5683,14 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
3412 if (numpages == 0) 5683 if (numpages == 0)
3413 goto out; 5684 goto out;
3414 5685
3415 from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */ 5686 to = PAGE_CACHE_SIZE;
3416 if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
3417 /*
3418 * Since 'from' has been capped to a value below page
3419 * size, this calculation won't be able to overflow
3420 * 'to'
3421 */
3422 to = ocfs2_align_bytes_to_clusters(sb, from);
3423
3424 /*
3425 * The truncate tail in this case should never contain
3426 * more than one page at maximum. The loop below also
3427 * assumes this.
3428 */
3429 BUG_ON(numpages != 1);
3430 }
3431
3432 for(i = 0; i < numpages; i++) { 5687 for(i = 0; i < numpages; i++) {
3433 page = pages[i]; 5688 page = pages[i];
3434 5689
5690 from = start & (PAGE_CACHE_SIZE - 1);
5691 if ((end >> PAGE_CACHE_SHIFT) == page->index)
5692 to = end & (PAGE_CACHE_SIZE - 1);
5693
3435 BUG_ON(from > PAGE_CACHE_SIZE); 5694 BUG_ON(from > PAGE_CACHE_SIZE);
3436 BUG_ON(to > PAGE_CACHE_SIZE); 5695 BUG_ON(to > PAGE_CACHE_SIZE);
3437 5696
@@ -3468,10 +5727,7 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
3468 5727
3469 flush_dcache_page(page); 5728 flush_dcache_page(page);
3470 5729
3471 /* 5730 start = (page->index + 1) << PAGE_CACHE_SHIFT;
3472 * Every page after the 1st one should be completely zero'd.
3473 */
3474 from = 0;
3475 } 5731 }
3476out: 5732out:
3477 if (pages) { 5733 if (pages) {
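
The per-page bounds computed in the loop above follow a simple pattern: interior pages are zeroed edge to edge, while the first and last pages of the range are clipped. A sketch with an assumed 4096-byte page size and a hypothetical byte range.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL

int main(void)
{
	uint64_t start = 6000, end = 13000;	/* hypothetical zeroing range */

	for (uint64_t index = start / PAGE_SIZE;
	     index < (end + PAGE_SIZE - 1) / PAGE_SIZE; index++) {
		uint64_t from = 0, to = PAGE_SIZE;

		if (index == start / PAGE_SIZE)
			from = start & (PAGE_SIZE - 1);	/* clip the first page */
		if (index == end / PAGE_SIZE)
			to = end & (PAGE_SIZE - 1);	/* clip the last page */
		printf("page %llu: zero [%llu, %llu)\n",
		       (unsigned long long)index, (unsigned long long)from,
		       (unsigned long long)to);
	}
	return 0;
}
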
@@ -3484,24 +5740,26 @@ out:
3484 } 5740 }
3485} 5741}
3486 5742
3487static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages, 5743static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
3488 int *num, u64 *phys) 5744 struct page **pages, int *num, u64 *phys)
3489{ 5745{
3490 int i, numpages = 0, ret = 0; 5746 int i, numpages = 0, ret = 0;
3491 unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
3492 unsigned int ext_flags; 5747 unsigned int ext_flags;
3493 struct super_block *sb = inode->i_sb; 5748 struct super_block *sb = inode->i_sb;
3494 struct address_space *mapping = inode->i_mapping; 5749 struct address_space *mapping = inode->i_mapping;
3495 unsigned long index; 5750 unsigned long index;
3496 u64 next_cluster_bytes; 5751 loff_t last_page_bytes;
3497 5752
3498 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); 5753 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
5754 BUG_ON(start > end);
3499 5755
3500 /* Cluster boundary, so we don't need to grab any pages. */ 5756 if (start == end)
3501 if ((isize & (csize - 1)) == 0)
3502 goto out; 5757 goto out;
3503 5758
3504 ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits, 5759 BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
5760 (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
5761
5762 ret = ocfs2_extent_map_get_blocks(inode, start >> sb->s_blocksize_bits,
3505 phys, NULL, &ext_flags); 5763 phys, NULL, &ext_flags);
3506 if (ret) { 5764 if (ret) {
3507 mlog_errno(ret); 5765 mlog_errno(ret);
@@ -3517,8 +5775,8 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
3517 if (ext_flags & OCFS2_EXT_UNWRITTEN) 5775 if (ext_flags & OCFS2_EXT_UNWRITTEN)
3518 goto out; 5776 goto out;
3519 5777
3520 next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize); 5778 last_page_bytes = PAGE_ALIGN(end);
3521 index = isize >> PAGE_CACHE_SHIFT; 5779 index = start >> PAGE_CACHE_SHIFT;
3522 do { 5780 do {
3523 pages[numpages] = grab_cache_page(mapping, index); 5781 pages[numpages] = grab_cache_page(mapping, index);
3524 if (!pages[numpages]) { 5782 if (!pages[numpages]) {
@@ -3529,7 +5787,7 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
3529 5787
3530 numpages++; 5788 numpages++;
3531 index++; 5789 index++;
3532 } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT)); 5790 } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
3533 5791
3534out: 5792out:
3535 if (ret != 0) { 5793 if (ret != 0) {
@@ -3558,11 +5816,10 @@ out:
3558 * otherwise block_write_full_page() will skip writeout of pages past 5816 * otherwise block_write_full_page() will skip writeout of pages past
3559 * i_size. The new_i_size parameter is passed for this reason. 5817 * i_size. The new_i_size parameter is passed for this reason.
3560 */ 5818 */
3561int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, 5819int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
3562 u64 new_i_size) 5820 u64 range_start, u64 range_end)
3563{ 5821{
3564 int ret, numpages; 5822 int ret, numpages;
3565 loff_t endbyte;
3566 struct page **pages = NULL; 5823 struct page **pages = NULL;
3567 u64 phys; 5824 u64 phys;
3568 5825
@@ -3581,7 +5838,8 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
3581 goto out; 5838 goto out;
3582 } 5839 }
3583 5840
3584 ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys); 5841 ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
5842 &numpages, &phys);
3585 if (ret) { 5843 if (ret) {
3586 mlog_errno(ret); 5844 mlog_errno(ret);
3587 goto out; 5845 goto out;
@@ -3590,17 +5848,16 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
3590 if (numpages == 0) 5848 if (numpages == 0)
3591 goto out; 5849 goto out;
3592 5850
3593 ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys, 5851 ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
3594 handle); 5852 numpages, phys, handle);
3595 5853
3596 /* 5854 /*
3597 * Initiate writeout of the pages we zero'd here. We don't 5855 * Initiate writeout of the pages we zero'd here. We don't
3598 * wait on them - the truncate_inode_pages() call later will 5856 * wait on them - the truncate_inode_pages() call later will
3599 * do that for us. 5857 * do that for us.
3600 */ 5858 */
3601 endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); 5859 ret = do_sync_mapping_range(inode->i_mapping, range_start,
3602 ret = do_sync_mapping_range(inode->i_mapping, new_i_size, 5860 range_end - 1, SYNC_FILE_RANGE_WRITE);
3603 endbyte - 1, SYNC_FILE_RANGE_WRITE);
3604 if (ret) 5861 if (ret)
3605 mlog_errno(ret); 5862 mlog_errno(ret);
3606 5863
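
With the new interface the caller passes an explicit byte range; for the common truncate case that range would run from the new size up to the next cluster boundary. A sketch of that computation with an assumed power-of-two cluster size (the real value comes from the superblock).

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t csize = 32768;			/* assumed cluster size */
	uint64_t new_i_size = 10000;
	uint64_t range_start = new_i_size;
	uint64_t range_end = (new_i_size + csize - 1) & ~(csize - 1);

	if (range_start != range_end)		/* nothing to zero if aligned */
		printf("zero [%llu, %llu)\n",
		       (unsigned long long)range_start,
		       (unsigned long long)range_end);
	return 0;
}
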
@@ -3631,8 +5888,6 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
3631 5888
3632 mlog_entry_void(); 5889 mlog_entry_void();
3633 5890
3634 down_write(&OCFS2_I(inode)->ip_alloc_sem);
3635
3636 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 5891 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
3637 i_size_read(inode)); 5892 i_size_read(inode));
3638 5893
@@ -3754,7 +6009,6 @@ start:
3754 goto start; 6009 goto start;
3755 6010
3756bail: 6011bail:
3757 up_write(&OCFS2_I(inode)->ip_alloc_sem);
3758 6012
3759 ocfs2_schedule_truncate_log_flush(osb, 1); 6013 ocfs2_schedule_truncate_log_flush(osb, 1);
3760 6014
@@ -3764,6 +6018,8 @@ bail:
3764 if (handle) 6018 if (handle)
3765 ocfs2_commit_trans(osb, handle); 6019 ocfs2_commit_trans(osb, handle);
3766 6020
6021 ocfs2_run_deallocs(osb, &tc->tc_dealloc);
6022
3767 ocfs2_free_path(path); 6023 ocfs2_free_path(path);
3768 6024
3769 /* This will drop the ext_alloc cluster lock for us */ 6025 /* This will drop the ext_alloc cluster lock for us */
@@ -3774,23 +6030,18 @@ bail:
3774} 6030}
3775 6031
3776/* 6032/*
3777 * Expects the inode to already be locked. This will figure out which 6033 * Expects the inode to already be locked.
3778 * inodes need to be locked and will put them on the returned truncate
3779 * context.
3780 */ 6034 */
3781int ocfs2_prepare_truncate(struct ocfs2_super *osb, 6035int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3782 struct inode *inode, 6036 struct inode *inode,
3783 struct buffer_head *fe_bh, 6037 struct buffer_head *fe_bh,
3784 struct ocfs2_truncate_context **tc) 6038 struct ocfs2_truncate_context **tc)
3785{ 6039{
3786 int status, metadata_delete, i; 6040 int status;
3787 unsigned int new_i_clusters; 6041 unsigned int new_i_clusters;
3788 struct ocfs2_dinode *fe; 6042 struct ocfs2_dinode *fe;
3789 struct ocfs2_extent_block *eb; 6043 struct ocfs2_extent_block *eb;
3790 struct ocfs2_extent_list *el;
3791 struct buffer_head *last_eb_bh = NULL; 6044 struct buffer_head *last_eb_bh = NULL;
3792 struct inode *ext_alloc_inode = NULL;
3793 struct buffer_head *ext_alloc_bh = NULL;
3794 6045
3795 mlog_entry_void(); 6046 mlog_entry_void();
3796 6047
@@ -3810,12 +6061,9 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3810 mlog_errno(status); 6061 mlog_errno(status);
3811 goto bail; 6062 goto bail;
3812 } 6063 }
6064 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
3813 6065
3814 metadata_delete = 0;
3815 if (fe->id2.i_list.l_tree_depth) { 6066 if (fe->id2.i_list.l_tree_depth) {
3816 /* If we have a tree, then the truncate may result in
3817 * metadata deletes. Figure this out from the
3818 * rightmost leaf block.*/
3819 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 6067 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
3820 &last_eb_bh, OCFS2_BH_CACHED, inode); 6068 &last_eb_bh, OCFS2_BH_CACHED, inode);
3821 if (status < 0) { 6069 if (status < 0) {
@@ -3830,43 +6078,10 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3830 status = -EIO; 6078 status = -EIO;
3831 goto bail; 6079 goto bail;
3832 } 6080 }
3833 el = &(eb->h_list);
3834
3835 i = 0;
3836 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3837 i = 1;
3838 /*
3839 * XXX: Should we check that next_free_rec contains
3840 * the extent?
3841 */
3842 if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
3843 metadata_delete = 1;
3844 } 6081 }
3845 6082
3846 (*tc)->tc_last_eb_bh = last_eb_bh; 6083 (*tc)->tc_last_eb_bh = last_eb_bh;
3847 6084
3848 if (metadata_delete) {
3849 mlog(0, "Will have to delete metadata for this trunc. "
3850 "locking allocator.\n");
3851 ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
3852 if (!ext_alloc_inode) {
3853 status = -ENOMEM;
3854 mlog_errno(status);
3855 goto bail;
3856 }
3857
3858 mutex_lock(&ext_alloc_inode->i_mutex);
3859 (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
3860
3861 status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1);
3862 if (status < 0) {
3863 mlog_errno(status);
3864 goto bail;
3865 }
3866 (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
3867 (*tc)->tc_ext_alloc_locked = 1;
3868 }
3869
3870 status = 0; 6085 status = 0;
3871bail: 6086bail:
3872 if (status < 0) { 6087 if (status < 0) {
@@ -3880,16 +6095,13 @@ bail:
3880 6095
3881static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) 6096static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
3882{ 6097{
3883 if (tc->tc_ext_alloc_inode) { 6098 /*
3884 if (tc->tc_ext_alloc_locked) 6099 * The caller is responsible for completing deallocation
3885 ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1); 6100 * before freeing the context.
3886 6101 */
3887 mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex); 6102 if (tc->tc_dealloc.c_first_suballocator != NULL)
3888 iput(tc->tc_ext_alloc_inode); 6103 mlog(ML_NOTICE,
3889 } 6104 "Truncate completion has non-empty dealloc context\n");
3890
3891 if (tc->tc_ext_alloc_bh)
3892 brelse(tc->tc_ext_alloc_bh);
3893 6105
3894 if (tc->tc_last_eb_bh) 6106 if (tc->tc_last_eb_bh)
3895 brelse(tc->tc_last_eb_bh); 6107 brelse(tc->tc_last_eb_bh);