aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/ocfs2/alloc.c1799
-rw-r--r--fs/ocfs2/alloc.h6
-rw-r--r--fs/ocfs2/endian.h5
-rw-r--r--fs/ocfs2/extent_map.c31
-rw-r--r--fs/ocfs2/ocfs2.h13
-rw-r--r--fs/ocfs2/ocfs2_fs.h7
6 files changed, 1770 insertions, 91 deletions
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index a02d026cb0e4..0db6a1f724e1 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -119,6 +119,31 @@ static void ocfs2_free_path(struct ocfs2_path *path)
119} 119}
120 120
121/* 121/*
122 * All the elements of src into dest. After this call, src could be freed
123 * without affecting dest.
124 *
125 * Both paths should have the same root. Any non-root elements of dest
126 * will be freed.
127 */
128static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
129{
130 int i;
131
132 BUG_ON(path_root_bh(dest) != path_root_bh(src));
133 BUG_ON(path_root_el(dest) != path_root_el(src));
134
135 ocfs2_reinit_path(dest, 1);
136
137 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
138 dest->p_node[i].bh = src->p_node[i].bh;
139 dest->p_node[i].el = src->p_node[i].el;
140
141 if (dest->p_node[i].bh)
142 get_bh(dest->p_node[i].bh);
143 }
144}
145
146/*
122 * Make the *dest path the same as src and re-initialize src path to 147 * Make the *dest path the same as src and re-initialize src path to
123 * have a root only. 148 * have a root only.
124 */ 149 */
@@ -214,10 +239,41 @@ out:
214 return ret; 239 return ret;
215} 240}
216 241
242/*
243 * Return the index of the extent record which contains cluster #v_cluster.
244 * -1 is returned if it was not found.
245 *
246 * Should work fine on interior and exterior nodes.
247 */
248int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
249{
250 int ret = -1;
251 int i;
252 struct ocfs2_extent_rec *rec;
253 u32 rec_end, rec_start, clusters;
254
255 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
256 rec = &el->l_recs[i];
257
258 rec_start = le32_to_cpu(rec->e_cpos);
259 clusters = ocfs2_rec_clusters(el, rec);
260
261 rec_end = rec_start + clusters;
262
263 if (v_cluster >= rec_start && v_cluster < rec_end) {
264 ret = i;
265 break;
266 }
267 }
268
269 return ret;
270}
271
217enum ocfs2_contig_type { 272enum ocfs2_contig_type {
218 CONTIG_NONE = 0, 273 CONTIG_NONE = 0,
219 CONTIG_LEFT, 274 CONTIG_LEFT,
220 CONTIG_RIGHT 275 CONTIG_RIGHT,
276 CONTIG_LEFTRIGHT,
221}; 277};
222 278
223 279
@@ -255,6 +311,14 @@ static enum ocfs2_contig_type
255{ 311{
256 u64 blkno = le64_to_cpu(insert_rec->e_blkno); 312 u64 blkno = le64_to_cpu(insert_rec->e_blkno);
257 313
314 /*
315 * Refuse to coalesce extent records with different flag
316 * fields - we don't want to mix unwritten extents with user
317 * data.
318 */
319 if (ext->e_flags != insert_rec->e_flags)
320 return CONTIG_NONE;
321
258 if (ocfs2_extents_adjacent(ext, insert_rec) && 322 if (ocfs2_extents_adjacent(ext, insert_rec) &&
259 ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) 323 ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
260 return CONTIG_RIGHT; 324 return CONTIG_RIGHT;
@@ -279,7 +343,14 @@ enum ocfs2_append_type {
279 APPEND_TAIL, 343 APPEND_TAIL,
280}; 344};
281 345
346enum ocfs2_split_type {
347 SPLIT_NONE = 0,
348 SPLIT_LEFT,
349 SPLIT_RIGHT,
350};
351
282struct ocfs2_insert_type { 352struct ocfs2_insert_type {
353 enum ocfs2_split_type ins_split;
283 enum ocfs2_append_type ins_appending; 354 enum ocfs2_append_type ins_appending;
284 enum ocfs2_contig_type ins_contig; 355 enum ocfs2_contig_type ins_contig;
285 int ins_contig_index; 356 int ins_contig_index;
@@ -287,6 +358,13 @@ struct ocfs2_insert_type {
287 int ins_tree_depth; 358 int ins_tree_depth;
288}; 359};
289 360
361struct ocfs2_merge_ctxt {
362 enum ocfs2_contig_type c_contig_type;
363 int c_has_empty_extent;
364 int c_split_covers_rec;
365 int c_used_tail_recs;
366};
367
290/* 368/*
291 * How many free extents have we got before we need more meta data? 369 * How many free extents have we got before we need more meta data?
292 */ 370 */
@@ -457,7 +535,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
457 struct inode *inode, 535 struct inode *inode,
458 struct buffer_head *fe_bh, 536 struct buffer_head *fe_bh,
459 struct buffer_head *eb_bh, 537 struct buffer_head *eb_bh,
460 struct buffer_head *last_eb_bh, 538 struct buffer_head **last_eb_bh,
461 struct ocfs2_alloc_context *meta_ac) 539 struct ocfs2_alloc_context *meta_ac)
462{ 540{
463 int status, new_blocks, i; 541 int status, new_blocks, i;
@@ -472,7 +550,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
472 550
473 mlog_entry_void(); 551 mlog_entry_void();
474 552
475 BUG_ON(!last_eb_bh); 553 BUG_ON(!last_eb_bh || !*last_eb_bh);
476 554
477 fe = (struct ocfs2_dinode *) fe_bh->b_data; 555 fe = (struct ocfs2_dinode *) fe_bh->b_data;
478 556
@@ -503,7 +581,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
503 goto bail; 581 goto bail;
504 } 582 }
505 583
506 eb = (struct ocfs2_extent_block *)last_eb_bh->b_data; 584 eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
507 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); 585 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
508 586
509 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be 587 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
@@ -564,7 +642,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
564 * journal_dirty erroring as it won't unless we've aborted the 642 * journal_dirty erroring as it won't unless we've aborted the
565 * handle (in which case we would never be here) so reserving 643 * handle (in which case we would never be here) so reserving
566 * the write with journal_access is all we need to do. */ 644 * the write with journal_access is all we need to do. */
567 status = ocfs2_journal_access(handle, inode, last_eb_bh, 645 status = ocfs2_journal_access(handle, inode, *last_eb_bh,
568 OCFS2_JOURNAL_ACCESS_WRITE); 646 OCFS2_JOURNAL_ACCESS_WRITE);
569 if (status < 0) { 647 if (status < 0) {
570 mlog_errno(status); 648 mlog_errno(status);
@@ -597,10 +675,10 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
597 * next_leaf on the previously last-extent-block. */ 675 * next_leaf on the previously last-extent-block. */
598 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); 676 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
599 677
600 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 678 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
601 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 679 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
602 680
603 status = ocfs2_journal_dirty(handle, last_eb_bh); 681 status = ocfs2_journal_dirty(handle, *last_eb_bh);
604 if (status < 0) 682 if (status < 0)
605 mlog_errno(status); 683 mlog_errno(status);
606 status = ocfs2_journal_dirty(handle, fe_bh); 684 status = ocfs2_journal_dirty(handle, fe_bh);
@@ -612,6 +690,14 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
612 mlog_errno(status); 690 mlog_errno(status);
613 } 691 }
614 692
693 /*
694 * Some callers want to track the rightmost leaf so pass it
695 * back here.
696 */
697 brelse(*last_eb_bh);
698 get_bh(new_eb_bhs[0]);
699 *last_eb_bh = new_eb_bhs[0];
700
615 status = 0; 701 status = 0;
616bail: 702bail:
617 if (new_eb_bhs) { 703 if (new_eb_bhs) {
@@ -831,10 +917,12 @@ bail:
831 * be considered invalid. 917 * be considered invalid.
832 * 918 *
833 * Tree depth after the grow is returned via *final_depth. 919 * Tree depth after the grow is returned via *final_depth.
920 *
921 * *last_eb_bh will be updated by ocfs2_add_branch().
834 */ 922 */
835static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, 923static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
836 struct buffer_head *di_bh, int *final_depth, 924 struct buffer_head *di_bh, int *final_depth,
837 struct buffer_head *last_eb_bh, 925 struct buffer_head **last_eb_bh,
838 struct ocfs2_alloc_context *meta_ac) 926 struct ocfs2_alloc_context *meta_ac)
839{ 927{
840 int ret, shift; 928 int ret, shift;
@@ -869,10 +957,21 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
869 goto out; 957 goto out;
870 } 958 }
871 depth++; 959 depth++;
872 /* Special case: we have room now if we shifted from 960 if (depth == 1) {
873 * tree_depth 0 */ 961 /*
874 if (depth == 1) 962 * Special case: we have room now if we shifted from
963 * tree_depth 0, so no more work needs to be done.
964 *
965 * We won't be calling add_branch, so pass
966 * back *last_eb_bh as the new leaf. At depth
967 * zero, it should always be null so there's
968 * no reason to brelse.
969 */
970 BUG_ON(*last_eb_bh);
971 get_bh(bh);
972 *last_eb_bh = bh;
875 goto out; 973 goto out;
974 }
876 } 975 }
877 976
878 /* call ocfs2_add_branch to add the final part of the tree with 977 /* call ocfs2_add_branch to add the final part of the tree with
@@ -998,6 +1097,22 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
998 1097
999} 1098}
1000 1099
1100static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1101{
1102 int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1103
1104 BUG_ON(num_recs == 0);
1105
1106 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1107 num_recs--;
1108 size = num_recs * sizeof(struct ocfs2_extent_rec);
1109 memmove(&el->l_recs[0], &el->l_recs[1], size);
1110 memset(&el->l_recs[num_recs], 0,
1111 sizeof(struct ocfs2_extent_rec));
1112 el->l_next_free_rec = cpu_to_le16(num_recs);
1113 }
1114}
1115
1001/* 1116/*
1002 * Create an empty extent record . 1117 * Create an empty extent record .
1003 * 1118 *
@@ -1275,6 +1390,10 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1275 * immediately to their right. 1390 * immediately to their right.
1276 */ 1391 */
1277 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); 1392 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1393 if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1394 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1395 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1396 }
1278 left_clusters -= le32_to_cpu(left_rec->e_cpos); 1397 left_clusters -= le32_to_cpu(left_rec->e_cpos);
1279 left_rec->e_int_clusters = cpu_to_le32(left_clusters); 1398 left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1280 1399
@@ -1595,10 +1714,16 @@ out:
1595 return ret; 1714 return ret;
1596} 1715}
1597 1716
1717/*
1718 * Extend the transaction by enough credits to complete the rotation,
1719 * and still leave at least the original number of credits allocated
1720 * to this transaction.
1721 */
1598static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, 1722static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1723 int op_credits,
1599 struct ocfs2_path *path) 1724 struct ocfs2_path *path)
1600{ 1725{
1601 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1; 1726 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
1602 1727
1603 if (handle->h_buffer_credits < credits) 1728 if (handle->h_buffer_credits < credits)
1604 return ocfs2_extend_trans(handle, credits); 1729 return ocfs2_extend_trans(handle, credits);
@@ -1632,6 +1757,29 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1632 return 0; 1757 return 0;
1633} 1758}
1634 1759
1760static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
1761{
1762 int next_free = le16_to_cpu(el->l_next_free_rec);
1763 unsigned int range;
1764 struct ocfs2_extent_rec *rec;
1765
1766 if (next_free == 0)
1767 return 0;
1768
1769 rec = &el->l_recs[0];
1770 if (ocfs2_is_empty_extent(rec)) {
1771 /* Empty list. */
1772 if (next_free == 1)
1773 return 0;
1774 rec = &el->l_recs[1];
1775 }
1776
1777 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1778 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1779 return 1;
1780 return 0;
1781}
1782
1635/* 1783/*
1636 * Rotate all the records in a btree right one record, starting at insert_cpos. 1784 * Rotate all the records in a btree right one record, starting at insert_cpos.
1637 * 1785 *
@@ -1650,11 +1798,12 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1650 */ 1798 */
1651static int ocfs2_rotate_tree_right(struct inode *inode, 1799static int ocfs2_rotate_tree_right(struct inode *inode,
1652 handle_t *handle, 1800 handle_t *handle,
1801 enum ocfs2_split_type split,
1653 u32 insert_cpos, 1802 u32 insert_cpos,
1654 struct ocfs2_path *right_path, 1803 struct ocfs2_path *right_path,
1655 struct ocfs2_path **ret_left_path) 1804 struct ocfs2_path **ret_left_path)
1656{ 1805{
1657 int ret, start; 1806 int ret, start, orig_credits = handle->h_buffer_credits;
1658 u32 cpos; 1807 u32 cpos;
1659 struct ocfs2_path *left_path = NULL; 1808 struct ocfs2_path *left_path = NULL;
1660 1809
@@ -1721,9 +1870,9 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1721 (unsigned long long) 1870 (unsigned long long)
1722 path_leaf_bh(left_path)->b_blocknr); 1871 path_leaf_bh(left_path)->b_blocknr);
1723 1872
1724 if (ocfs2_rotate_requires_path_adjustment(left_path, 1873 if (split == SPLIT_NONE &&
1874 ocfs2_rotate_requires_path_adjustment(left_path,
1725 insert_cpos)) { 1875 insert_cpos)) {
1726 mlog(0, "Path adjustment required\n");
1727 1876
1728 /* 1877 /*
1729 * We've rotated the tree as much as we 1878 * We've rotated the tree as much as we
@@ -1751,7 +1900,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1751 right_path->p_tree_depth); 1900 right_path->p_tree_depth);
1752 1901
1753 ret = ocfs2_extend_rotate_transaction(handle, start, 1902 ret = ocfs2_extend_rotate_transaction(handle, start,
1754 right_path); 1903 orig_credits, right_path);
1755 if (ret) { 1904 if (ret) {
1756 mlog_errno(ret); 1905 mlog_errno(ret);
1757 goto out; 1906 goto out;
@@ -1764,6 +1913,24 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1764 goto out; 1913 goto out;
1765 } 1914 }
1766 1915
1916 if (split != SPLIT_NONE &&
1917 ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
1918 insert_cpos)) {
1919 /*
1920 * A rotate moves the rightmost left leaf
1921 * record over to the leftmost right leaf
1922 * slot. If we're doing an extent split
1923 * instead of a real insert, then we have to
1924 * check that the extent to be split wasn't
1925 * just moved over. If it was, then we can
1926 * exit here, passing left_path back -
1927 * ocfs2_split_extent() is smart enough to
1928 * search both leaves.
1929 */
1930 *ret_left_path = left_path;
1931 goto out_ret_path;
1932 }
1933
1767 /* 1934 /*
1768 * There is no need to re-read the next right path 1935 * There is no need to re-read the next right path
1769 * as we know that it'll be our current left 1936 * as we know that it'll be our current left
@@ -1786,6 +1953,1031 @@ out_ret_path:
1786 return ret; 1953 return ret;
1787} 1954}
1788 1955
1956static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
1957 struct ocfs2_path *path)
1958{
1959 int i, idx;
1960 struct ocfs2_extent_rec *rec;
1961 struct ocfs2_extent_list *el;
1962 struct ocfs2_extent_block *eb;
1963 u32 range;
1964
1965 /* Path should always be rightmost. */
1966 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
1967 BUG_ON(eb->h_next_leaf_blk != 0ULL);
1968
1969 el = &eb->h_list;
1970 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
1971 idx = le16_to_cpu(el->l_next_free_rec) - 1;
1972 rec = &el->l_recs[idx];
1973 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1974
1975 for (i = 0; i < path->p_tree_depth; i++) {
1976 el = path->p_node[i].el;
1977 idx = le16_to_cpu(el->l_next_free_rec) - 1;
1978 rec = &el->l_recs[idx];
1979
1980 rec->e_int_clusters = cpu_to_le32(range);
1981 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
1982
1983 ocfs2_journal_dirty(handle, path->p_node[i].bh);
1984 }
1985}
1986
1987static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
1988 struct ocfs2_cached_dealloc_ctxt *dealloc,
1989 struct ocfs2_path *path, int unlink_start)
1990{
1991 int ret, i;
1992 struct ocfs2_extent_block *eb;
1993 struct ocfs2_extent_list *el;
1994 struct buffer_head *bh;
1995
1996 for(i = unlink_start; i < path_num_items(path); i++) {
1997 bh = path->p_node[i].bh;
1998
1999 eb = (struct ocfs2_extent_block *)bh->b_data;
2000 /*
2001 * Not all nodes might have had their final count
2002 * decremented by the caller - handle this here.
2003 */
2004 el = &eb->h_list;
2005 if (le16_to_cpu(el->l_next_free_rec) > 1) {
2006 mlog(ML_ERROR,
2007 "Inode %llu, attempted to remove extent block "
2008 "%llu with %u records\n",
2009 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2010 (unsigned long long)le64_to_cpu(eb->h_blkno),
2011 le16_to_cpu(el->l_next_free_rec));
2012
2013 ocfs2_journal_dirty(handle, bh);
2014 ocfs2_remove_from_cache(inode, bh);
2015 continue;
2016 }
2017
2018 el->l_next_free_rec = 0;
2019 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2020
2021 ocfs2_journal_dirty(handle, bh);
2022
2023 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2024 if (ret)
2025 mlog_errno(ret);
2026
2027 ocfs2_remove_from_cache(inode, bh);
2028 }
2029}
2030
2031static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2032 struct ocfs2_path *left_path,
2033 struct ocfs2_path *right_path,
2034 int subtree_index,
2035 struct ocfs2_cached_dealloc_ctxt *dealloc)
2036{
2037 int i;
2038 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2039 struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2040 struct ocfs2_extent_list *el;
2041 struct ocfs2_extent_block *eb;
2042
2043 el = path_leaf_el(left_path);
2044
2045 eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2046
2047 for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2048 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2049 break;
2050
2051 BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2052
2053 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2054 le16_add_cpu(&root_el->l_next_free_rec, -1);
2055
2056 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2057 eb->h_next_leaf_blk = 0;
2058
2059 ocfs2_journal_dirty(handle, root_bh);
2060 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2061
2062 ocfs2_unlink_path(inode, handle, dealloc, right_path,
2063 subtree_index + 1);
2064}
2065
2066static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2067 struct ocfs2_path *left_path,
2068 struct ocfs2_path *right_path,
2069 int subtree_index,
2070 struct ocfs2_cached_dealloc_ctxt *dealloc,
2071 int *deleted)
2072{
2073 int ret, i, del_right_subtree = 0, right_has_empty = 0;
2074 struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
2075 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2076 struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2077 struct ocfs2_extent_block *eb;
2078
2079 *deleted = 0;
2080
2081 right_leaf_el = path_leaf_el(right_path);
2082 left_leaf_el = path_leaf_el(left_path);
2083 root_bh = left_path->p_node[subtree_index].bh;
2084 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2085
2086 if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2087 return 0;
2088
2089 eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2090 if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2091 /*
2092 * It's legal for us to proceed if the right leaf is
2093 * the rightmost one and it has an empty extent. There
2094 * are two cases to handle - whether the leaf will be
2095 * empty after removal or not. If the leaf isn't empty
2096 * then just remove the empty extent up front. The
2097 * next block will handle empty leaves by flagging
2098 * them for unlink.
2099 *
2100 * Non rightmost leaves will throw -EAGAIN and the
2101 * caller can manually move the subtree and retry.
2102 */
2103
2104 if (eb->h_next_leaf_blk != 0ULL)
2105 return -EAGAIN;
2106
2107 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2108 ret = ocfs2_journal_access(handle, inode,
2109 path_leaf_bh(right_path),
2110 OCFS2_JOURNAL_ACCESS_WRITE);
2111 if (ret) {
2112 mlog_errno(ret);
2113 goto out;
2114 }
2115
2116 ocfs2_remove_empty_extent(right_leaf_el);
2117 } else
2118 right_has_empty = 1;
2119 }
2120
2121 if (eb->h_next_leaf_blk == 0ULL &&
2122 le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2123 /*
2124 * We have to update i_last_eb_blk during the meta
2125 * data delete.
2126 */
2127 ret = ocfs2_journal_access(handle, inode, di_bh,
2128 OCFS2_JOURNAL_ACCESS_WRITE);
2129 if (ret) {
2130 mlog_errno(ret);
2131 goto out;
2132 }
2133
2134 del_right_subtree = 1;
2135 }
2136
2137 /*
2138 * Getting here with an empty extent in the right path implies
2139 * that it's the rightmost path and will be deleted.
2140 */
2141 BUG_ON(right_has_empty && !del_right_subtree);
2142
2143 ret = ocfs2_journal_access(handle, inode, root_bh,
2144 OCFS2_JOURNAL_ACCESS_WRITE);
2145 if (ret) {
2146 mlog_errno(ret);
2147 goto out;
2148 }
2149
2150 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2151 ret = ocfs2_journal_access(handle, inode,
2152 right_path->p_node[i].bh,
2153 OCFS2_JOURNAL_ACCESS_WRITE);
2154 if (ret) {
2155 mlog_errno(ret);
2156 goto out;
2157 }
2158
2159 ret = ocfs2_journal_access(handle, inode,
2160 left_path->p_node[i].bh,
2161 OCFS2_JOURNAL_ACCESS_WRITE);
2162 if (ret) {
2163 mlog_errno(ret);
2164 goto out;
2165 }
2166 }
2167
2168 if (!right_has_empty) {
2169 /*
2170 * Only do this if we're moving a real
2171 * record. Otherwise, the action is delayed until
2172 * after removal of the right path in which case we
2173 * can do a simple shift to remove the empty extent.
2174 */
2175 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2176 memset(&right_leaf_el->l_recs[0], 0,
2177 sizeof(struct ocfs2_extent_rec));
2178 }
2179 if (eb->h_next_leaf_blk == 0ULL) {
2180 /*
2181 * Move recs over to get rid of empty extent, decrease
2182 * next_free. This is allowed to remove the last
2183 * extent in our leaf (setting l_next_free_rec to
2184 * zero) - the delete code below won't care.
2185 */
2186 ocfs2_remove_empty_extent(right_leaf_el);
2187 }
2188
2189 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2190 if (ret)
2191 mlog_errno(ret);
2192 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2193 if (ret)
2194 mlog_errno(ret);
2195
2196 if (del_right_subtree) {
2197 ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2198 subtree_index, dealloc);
2199 ocfs2_update_edge_lengths(inode, handle, left_path);
2200
2201 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2202 di->i_last_eb_blk = eb->h_blkno;
2203
2204 /*
2205 * Removal of the extent in the left leaf was skipped
2206 * above so we could delete the right path
2207 * 1st.
2208 */
2209 if (right_has_empty)
2210 ocfs2_remove_empty_extent(left_leaf_el);
2211
2212 ret = ocfs2_journal_dirty(handle, di_bh);
2213 if (ret)
2214 mlog_errno(ret);
2215
2216 *deleted = 1;
2217 } else
2218 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2219 subtree_index);
2220
2221out:
2222 return ret;
2223}
2224
2225/*
2226 * Given a full path, determine what cpos value would return us a path
2227 * containing the leaf immediately to the right of the current one.
2228 *
2229 * Will return zero if the path passed in is already the rightmost path.
2230 *
2231 * This looks similar, but is subtly different to
2232 * ocfs2_find_cpos_for_left_leaf().
2233 */
2234static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2235 struct ocfs2_path *path, u32 *cpos)
2236{
2237 int i, j, ret = 0;
2238 u64 blkno;
2239 struct ocfs2_extent_list *el;
2240
2241 *cpos = 0;
2242
2243 if (path->p_tree_depth == 0)
2244 return 0;
2245
2246 blkno = path_leaf_bh(path)->b_blocknr;
2247
2248 /* Start at the tree node just above the leaf and work our way up. */
2249 i = path->p_tree_depth - 1;
2250 while (i >= 0) {
2251 int next_free;
2252
2253 el = path->p_node[i].el;
2254
2255 /*
2256 * Find the extent record just after the one in our
2257 * path.
2258 */
2259 next_free = le16_to_cpu(el->l_next_free_rec);
2260 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2261 if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2262 if (j == (next_free - 1)) {
2263 if (i == 0) {
2264 /*
2265 * We've determined that the
2266 * path specified is already
2267 * the rightmost one - return a
2268 * cpos of zero.
2269 */
2270 goto out;
2271 }
2272 /*
2273 * The rightmost record points to our
2274 * leaf - we need to travel up the
2275 * tree one level.
2276 */
2277 goto next_node;
2278 }
2279
2280 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2281 goto out;
2282 }
2283 }
2284
2285 /*
2286 * If we got here, we never found a valid node where
2287 * the tree indicated one should be.
2288 */
2289 ocfs2_error(sb,
2290 "Invalid extent tree at extent block %llu\n",
2291 (unsigned long long)blkno);
2292 ret = -EROFS;
2293 goto out;
2294
2295next_node:
2296 blkno = path->p_node[i].bh->b_blocknr;
2297 i--;
2298 }
2299
2300out:
2301 return ret;
2302}
2303
2304static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2305 handle_t *handle,
2306 struct buffer_head *bh,
2307 struct ocfs2_extent_list *el)
2308{
2309 int ret;
2310
2311 if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2312 return 0;
2313
2314 ret = ocfs2_journal_access(handle, inode, bh,
2315 OCFS2_JOURNAL_ACCESS_WRITE);
2316 if (ret) {
2317 mlog_errno(ret);
2318 goto out;
2319 }
2320
2321 ocfs2_remove_empty_extent(el);
2322
2323 ret = ocfs2_journal_dirty(handle, bh);
2324 if (ret)
2325 mlog_errno(ret);
2326
2327out:
2328 return ret;
2329}
2330
2331static int __ocfs2_rotate_tree_left(struct inode *inode,
2332 handle_t *handle, int orig_credits,
2333 struct ocfs2_path *path,
2334 struct ocfs2_cached_dealloc_ctxt *dealloc,
2335 struct ocfs2_path **empty_extent_path)
2336{
2337 int ret, subtree_root, deleted;
2338 u32 right_cpos;
2339 struct ocfs2_path *left_path = NULL;
2340 struct ocfs2_path *right_path = NULL;
2341
2342 BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2343
2344 *empty_extent_path = NULL;
2345
2346 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2347 &right_cpos);
2348 if (ret) {
2349 mlog_errno(ret);
2350 goto out;
2351 }
2352
2353 left_path = ocfs2_new_path(path_root_bh(path),
2354 path_root_el(path));
2355 if (!left_path) {
2356 ret = -ENOMEM;
2357 mlog_errno(ret);
2358 goto out;
2359 }
2360
2361 ocfs2_cp_path(left_path, path);
2362
2363 right_path = ocfs2_new_path(path_root_bh(path),
2364 path_root_el(path));
2365 if (!right_path) {
2366 ret = -ENOMEM;
2367 mlog_errno(ret);
2368 goto out;
2369 }
2370
2371 while (right_cpos) {
2372 ret = ocfs2_find_path(inode, right_path, right_cpos);
2373 if (ret) {
2374 mlog_errno(ret);
2375 goto out;
2376 }
2377
2378 subtree_root = ocfs2_find_subtree_root(inode, left_path,
2379 right_path);
2380
2381 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2382 subtree_root,
2383 (unsigned long long)
2384 right_path->p_node[subtree_root].bh->b_blocknr,
2385 right_path->p_tree_depth);
2386
2387 ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2388 orig_credits, left_path);
2389 if (ret) {
2390 mlog_errno(ret);
2391 goto out;
2392 }
2393
2394 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2395 right_path, subtree_root,
2396 dealloc, &deleted);
2397 if (ret == -EAGAIN) {
2398 /*
2399 * The rotation has to temporarily stop due to
2400 * the right subtree having an empty
2401 * extent. Pass it back to the caller for a
2402 * fixup.
2403 */
2404 *empty_extent_path = right_path;
2405 right_path = NULL;
2406 goto out;
2407 }
2408 if (ret) {
2409 mlog_errno(ret);
2410 goto out;
2411 }
2412
2413 /*
2414 * The subtree rotate might have removed records on
2415 * the rightmost edge. If so, then rotation is
2416 * complete.
2417 */
2418 if (deleted)
2419 break;
2420
2421 ocfs2_mv_path(left_path, right_path);
2422
2423 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2424 &right_cpos);
2425 if (ret) {
2426 mlog_errno(ret);
2427 goto out;
2428 }
2429 }
2430
2431out:
2432 ocfs2_free_path(right_path);
2433 ocfs2_free_path(left_path);
2434
2435 return ret;
2436}
2437
2438static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2439 struct ocfs2_path *path,
2440 struct ocfs2_cached_dealloc_ctxt *dealloc)
2441{
2442 int ret, subtree_index;
2443 u32 cpos;
2444 struct ocfs2_path *left_path = NULL;
2445 struct ocfs2_dinode *di;
2446 struct ocfs2_extent_block *eb;
2447 struct ocfs2_extent_list *el;
2448
2449 /*
2450 * XXX: This code assumes that the root is an inode, which is
2451 * true for now but may change as tree code gets generic.
2452 */
2453 di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
2454 if (!OCFS2_IS_VALID_DINODE(di)) {
2455 ret = -EIO;
2456 ocfs2_error(inode->i_sb,
2457 "Inode %llu has invalid path root",
2458 (unsigned long long)OCFS2_I(inode)->ip_blkno);
2459 goto out;
2460 }
2461
2462 /*
2463 * There's two ways we handle this depending on
2464 * whether path is the only existing one.
2465 */
2466 ret = ocfs2_extend_rotate_transaction(handle, 0,
2467 handle->h_buffer_credits,
2468 path);
2469 if (ret) {
2470 mlog_errno(ret);
2471 goto out;
2472 }
2473
2474 ret = ocfs2_journal_access_path(inode, handle, path);
2475 if (ret) {
2476 mlog_errno(ret);
2477 goto out;
2478 }
2479
2480 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2481 if (ret) {
2482 mlog_errno(ret);
2483 goto out;
2484 }
2485
2486 if (cpos) {
2487 /*
2488 * We have a path to the left of this one - it needs
2489 * an update too.
2490 */
2491 left_path = ocfs2_new_path(path_root_bh(path),
2492 path_root_el(path));
2493 if (!left_path) {
2494 ret = -ENOMEM;
2495 mlog_errno(ret);
2496 goto out;
2497 }
2498
2499 ret = ocfs2_find_path(inode, left_path, cpos);
2500 if (ret) {
2501 mlog_errno(ret);
2502 goto out;
2503 }
2504
2505 ret = ocfs2_journal_access_path(inode, handle, left_path);
2506 if (ret) {
2507 mlog_errno(ret);
2508 goto out;
2509 }
2510
2511 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2512
2513 ocfs2_unlink_subtree(inode, handle, left_path, path,
2514 subtree_index, dealloc);
2515 ocfs2_update_edge_lengths(inode, handle, left_path);
2516
2517 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2518 di->i_last_eb_blk = eb->h_blkno;
2519 } else {
2520 /*
2521 * 'path' is also the leftmost path which
2522 * means it must be the only one. This gets
2523 * handled differently because we want to
2524 * revert the inode back to having extents
2525 * in-line.
2526 */
2527 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2528
2529 el = &di->id2.i_list;
2530 el->l_tree_depth = 0;
2531 el->l_next_free_rec = 0;
2532 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2533
2534 di->i_last_eb_blk = 0;
2535 }
2536
2537 ocfs2_journal_dirty(handle, path_root_bh(path));
2538
2539out:
2540 ocfs2_free_path(left_path);
2541 return ret;
2542}
2543
2544/*
2545 * Left rotation of btree records.
2546 *
2547 * In many ways, this is (unsurprisingly) the opposite of right
2548 * rotation. We start at some non-rightmost path containing an empty
2549 * extent in the leaf block. The code works its way to the rightmost
2550 * path by rotating records to the left in every subtree.
2551 *
2552 * This is used by any code which reduces the number of extent records
2553 * in a leaf. After removal, an empty record should be placed in the
2554 * leftmost list position.
2555 *
2556 * This won't handle a length update of the rightmost path records if
2557 * the rightmost tree leaf record is removed so the caller is
2558 * responsible for detecting and correcting that.
2559 */
2560static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2561 struct ocfs2_path *path,
2562 struct ocfs2_cached_dealloc_ctxt *dealloc)
2563{
2564 int ret, orig_credits = handle->h_buffer_credits;
2565 struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2566 struct ocfs2_extent_block *eb;
2567 struct ocfs2_extent_list *el;
2568
2569 el = path_leaf_el(path);
2570 if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2571 return 0;
2572
2573 if (path->p_tree_depth == 0) {
2574rightmost_no_delete:
2575 /*
2576 * In-inode extents. This is trivially handled, so do
2577 * it up front.
2578 */
2579 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2580 path_leaf_bh(path),
2581 path_leaf_el(path));
2582 if (ret)
2583 mlog_errno(ret);
2584 goto out;
2585 }
2586
2587 /*
2588 * Handle rightmost branch now. There's several cases:
2589 * 1) simple rotation leaving records in there. That's trivial.
2590 * 2) rotation requiring a branch delete - there's no more
2591 * records left. Two cases of this:
2592 * a) There are branches to the left.
2593 * b) This is also the leftmost (the only) branch.
2594 *
2595 * 1) is handled via ocfs2_rotate_rightmost_leaf_left()
2596 * 2a) we need the left branch so that we can update it with the unlink
2597 * 2b) we need to bring the inode back to inline extents.
2598 */
2599
2600 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2601 el = &eb->h_list;
2602 if (eb->h_next_leaf_blk == 0) {
2603 /*
2604 * This gets a bit tricky if we're going to delete the
2605 * rightmost path. Get the other cases out of the way
2606 * 1st.
2607 */
2608 if (le16_to_cpu(el->l_next_free_rec) > 1)
2609 goto rightmost_no_delete;
2610
2611 if (le16_to_cpu(el->l_next_free_rec) == 0) {
2612 ret = -EIO;
2613 ocfs2_error(inode->i_sb,
2614 "Inode %llu has empty extent block at %llu",
2615 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2616 (unsigned long long)le64_to_cpu(eb->h_blkno));
2617 goto out;
2618 }
2619
2620 /*
2621 * XXX: The caller can not trust "path" any more after
2622 * this as it will have been deleted. What do we do?
2623 *
2624 * In theory the rotate-for-merge code will never get
2625 * here because it'll always ask for a rotate in a
2626 * nonempty list.
2627 */
2628
2629 ret = ocfs2_remove_rightmost_path(inode, handle, path,
2630 dealloc);
2631 if (ret)
2632 mlog_errno(ret);
2633 goto out;
2634 }
2635
2636 /*
2637 * Now we can loop, remembering the path we get from -EAGAIN
2638 * and restarting from there.
2639 */
2640try_rotate:
2641 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2642 dealloc, &restart_path);
2643 if (ret && ret != -EAGAIN) {
2644 mlog_errno(ret);
2645 goto out;
2646 }
2647
2648 while (ret == -EAGAIN) {
2649 tmp_path = restart_path;
2650 restart_path = NULL;
2651
2652 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2653 tmp_path, dealloc,
2654 &restart_path);
2655 if (ret && ret != -EAGAIN) {
2656 mlog_errno(ret);
2657 goto out;
2658 }
2659
2660 ocfs2_free_path(tmp_path);
2661 tmp_path = NULL;
2662
2663 if (ret == 0)
2664 goto try_rotate;
2665 }
2666
2667out:
2668 ocfs2_free_path(tmp_path);
2669 ocfs2_free_path(restart_path);
2670 return ret;
2671}
2672
2673static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2674 int index)
2675{
2676 struct ocfs2_extent_rec *rec = &el->l_recs[index];
2677 unsigned int size;
2678
2679 if (rec->e_leaf_clusters == 0) {
2680 /*
2681 * We consumed all of the merged-from record. An empty
2682 * extent cannot exist anywhere but the 1st array
2683 * position, so move things over if the merged-from
2684 * record doesn't occupy that position.
2685 *
2686 * This creates a new empty extent so the caller
2687 * should be smart enough to have removed any existing
2688 * ones.
2689 */
2690 if (index > 0) {
2691 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
2692 size = index * sizeof(struct ocfs2_extent_rec);
2693 memmove(&el->l_recs[1], &el->l_recs[0], size);
2694 }
2695
2696 /*
2697 * Always memset - the caller doesn't check whether it
2698 * created an empty extent, so there could be junk in
2699 * the other fields.
2700 */
2701 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2702 }
2703}
2704
2705/*
2706 * Remove split_rec clusters from the record at index and merge them
2707 * onto the beginning of the record at index + 1.
2708 */
2709static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
2710 handle_t *handle,
2711 struct ocfs2_extent_rec *split_rec,
2712 struct ocfs2_extent_list *el, int index)
2713{
2714 int ret;
2715 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2716 struct ocfs2_extent_rec *left_rec;
2717 struct ocfs2_extent_rec *right_rec;
2718
2719 BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
2720
2721 left_rec = &el->l_recs[index];
2722 right_rec = &el->l_recs[index + 1];
2723
2724 ret = ocfs2_journal_access(handle, inode, bh,
2725 OCFS2_JOURNAL_ACCESS_WRITE);
2726 if (ret) {
2727 mlog_errno(ret);
2728 goto out;
2729 }
2730
2731 le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
2732
2733 le32_add_cpu(&right_rec->e_cpos, -split_clusters);
2734 le64_add_cpu(&right_rec->e_blkno,
2735 -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
2736 le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
2737
2738 ocfs2_cleanup_merge(el, index);
2739
2740 ret = ocfs2_journal_dirty(handle, bh);
2741 if (ret)
2742 mlog_errno(ret);
2743
2744out:
2745 return ret;
2746}
2747
2748/*
2749 * Remove split_rec clusters from the record at index and merge them
2750 * onto the tail of the record at index - 1.
2751 */
2752static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
2753 handle_t *handle,
2754 struct ocfs2_extent_rec *split_rec,
2755 struct ocfs2_extent_list *el, int index)
2756{
2757 int ret, has_empty_extent = 0;
2758 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2759 struct ocfs2_extent_rec *left_rec;
2760 struct ocfs2_extent_rec *right_rec;
2761
2762 BUG_ON(index <= 0);
2763
2764 left_rec = &el->l_recs[index - 1];
2765 right_rec = &el->l_recs[index];
2766 if (ocfs2_is_empty_extent(&el->l_recs[0]))
2767 has_empty_extent = 1;
2768
2769 ret = ocfs2_journal_access(handle, inode, bh,
2770 OCFS2_JOURNAL_ACCESS_WRITE);
2771 if (ret) {
2772 mlog_errno(ret);
2773 goto out;
2774 }
2775
2776 if (has_empty_extent && index == 1) {
2777 /*
2778 * The easy case - we can just plop the record right in.
2779 */
2780 *left_rec = *split_rec;
2781
2782 has_empty_extent = 0;
2783 } else {
2784 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
2785 }
2786
2787 le32_add_cpu(&right_rec->e_cpos, split_clusters);
2788 le64_add_cpu(&right_rec->e_blkno,
2789 ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
2790 le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
2791
2792 ocfs2_cleanup_merge(el, index);
2793
2794 ret = ocfs2_journal_dirty(handle, bh);
2795 if (ret)
2796 mlog_errno(ret);
2797
2798out:
2799 return ret;
2800}
2801
2802static int ocfs2_try_to_merge_extent(struct inode *inode,
2803 handle_t *handle,
2804 struct ocfs2_path *left_path,
2805 int split_index,
2806 struct ocfs2_extent_rec *split_rec,
2807 struct ocfs2_cached_dealloc_ctxt *dealloc,
2808 struct ocfs2_merge_ctxt *ctxt)
2809
2810{
2811 int ret = 0, delete_tail_recs = 0;
2812 struct ocfs2_extent_list *el = path_leaf_el(left_path);
2813 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
2814
2815 BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
2816
2817 if (ctxt->c_split_covers_rec) {
2818 delete_tail_recs++;
2819
2820 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT ||
2821 ctxt->c_has_empty_extent)
2822 delete_tail_recs++;
2823
2824 if (ctxt->c_has_empty_extent) {
2825 /*
2826 * The merge code will need to create an empty
2827 * extent to take the place of the newly
2828 * emptied slot. Remove any pre-existing empty
2829 * extents - having more than one in a leaf is
2830 * illegal.
2831 */
2832 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2833 dealloc);
2834 if (ret) {
2835 mlog_errno(ret);
2836 goto out;
2837 }
2838 split_index--;
2839 rec = &el->l_recs[split_index];
2840 }
2841 }
2842
2843 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
2844 /*
2845 * Left-right contig implies this.
2846 */
2847 BUG_ON(!ctxt->c_split_covers_rec);
2848 BUG_ON(split_index == 0);
2849
2850 /*
2851 * Since the leftright insert always covers the entire
2852 * extent, this call will delete the insert record
2853 * entirely, resulting in an empty extent record added to
2854 * the extent block.
2855 *
2856 * Since the adding of an empty extent shifts
2857 * everything back to the right, there's no need to
2858 * update split_index here.
2859 */
2860 ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path),
2861 handle, split_rec, el, split_index);
2862 if (ret) {
2863 mlog_errno(ret);
2864 goto out;
2865 }
2866
2867 /*
2868 * We can only get this from logic error above.
2869 */
2870 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
2871
2872 /*
2873 * The left merge left us with an empty extent, remove
2874 * it.
2875 */
2876 ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc);
2877 if (ret) {
2878 mlog_errno(ret);
2879 goto out;
2880 }
2881 split_index--;
2882 rec = &el->l_recs[split_index];
2883
2884 /*
2885 * Note that we don't pass split_rec here on purpose -
2886 * we've merged it into the left side.
2887 */
2888 ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path),
2889 handle, rec, el, split_index);
2890 if (ret) {
2891 mlog_errno(ret);
2892 goto out;
2893 }
2894
2895 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
2896
2897 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2898 dealloc);
2899 /*
2900 * Error from this last rotate is not critical, so
2901 * print but don't bubble it up.
2902 */
2903 if (ret)
2904 mlog_errno(ret);
2905 ret = 0;
2906 } else {
2907 /*
2908 * Merge a record to the left or right.
2909 *
2910 * 'contig_type' is relative to the existing record,
2911 * so for example, if we're "right contig", it's to
2912 * the record on the left (hence the left merge).
2913 */
2914 if (ctxt->c_contig_type == CONTIG_RIGHT) {
2915 ret = ocfs2_merge_rec_left(inode,
2916 path_leaf_bh(left_path),
2917 handle, split_rec, el,
2918 split_index);
2919 if (ret) {
2920 mlog_errno(ret);
2921 goto out;
2922 }
2923 } else {
2924 ret = ocfs2_merge_rec_right(inode,
2925 path_leaf_bh(left_path),
2926 handle, split_rec, el,
2927 split_index);
2928 if (ret) {
2929 mlog_errno(ret);
2930 goto out;
2931 }
2932 }
2933
2934 if (ctxt->c_split_covers_rec) {
2935 /*
2936 * The merge may have left an empty extent in
2937 * our leaf. Try to rotate it away.
2938 */
2939 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2940 dealloc);
2941 if (ret)
2942 mlog_errno(ret);
2943 ret = 0;
2944 }
2945 }
2946
2947out:
2948 return ret;
2949}
2950
2951static void ocfs2_subtract_from_rec(struct super_block *sb,
2952 enum ocfs2_split_type split,
2953 struct ocfs2_extent_rec *rec,
2954 struct ocfs2_extent_rec *split_rec)
2955{
2956 u64 len_blocks;
2957
2958 len_blocks = ocfs2_clusters_to_blocks(sb,
2959 le16_to_cpu(split_rec->e_leaf_clusters));
2960
2961 if (split == SPLIT_LEFT) {
2962 /*
2963 * Region is on the left edge of the existing
2964 * record.
2965 */
2966 le32_add_cpu(&rec->e_cpos,
2967 le16_to_cpu(split_rec->e_leaf_clusters));
2968 le64_add_cpu(&rec->e_blkno, len_blocks);
2969 le16_add_cpu(&rec->e_leaf_clusters,
2970 -le16_to_cpu(split_rec->e_leaf_clusters));
2971 } else {
2972 /*
2973 * Region is on the right edge of the existing
2974 * record.
2975 */
2976 le16_add_cpu(&rec->e_leaf_clusters,
2977 -le16_to_cpu(split_rec->e_leaf_clusters));
2978 }
2979}
2980
1789/* 2981/*
1790 * Do the final bits of extent record insertion at the target leaf 2982 * Do the final bits of extent record insertion at the target leaf
1791 * list. If this leaf is part of an allocation tree, it is assumed 2983 * list. If this leaf is part of an allocation tree, it is assumed
@@ -1802,6 +2994,15 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1802 2994
1803 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 2995 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1804 2996
2997 if (insert->ins_split != SPLIT_NONE) {
2998 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
2999 BUG_ON(i == -1);
3000 rec = &el->l_recs[i];
3001 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3002 insert_rec);
3003 goto rotate;
3004 }
3005
1805 /* 3006 /*
1806 * Contiguous insert - either left or right. 3007 * Contiguous insert - either left or right.
1807 */ 3008 */
@@ -1856,6 +3057,7 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1856 return; 3057 return;
1857 } 3058 }
1858 3059
3060rotate:
1859 /* 3061 /*
1860 * Ok, we have to rotate. 3062 * Ok, we have to rotate.
1861 * 3063 *
@@ -1879,13 +3081,53 @@ static inline void ocfs2_update_dinode_clusters(struct inode *inode,
1879 spin_unlock(&OCFS2_I(inode)->ip_lock); 3081 spin_unlock(&OCFS2_I(inode)->ip_lock);
1880} 3082}
1881 3083
3084static void ocfs2_adjust_rightmost_records(struct inode *inode,
3085 handle_t *handle,
3086 struct ocfs2_path *path,
3087 struct ocfs2_extent_rec *insert_rec)
3088{
3089 int ret, i, next_free;
3090 struct buffer_head *bh;
3091 struct ocfs2_extent_list *el;
3092 struct ocfs2_extent_rec *rec;
3093
3094 /*
3095 * Update everything except the leaf block.
3096 */
3097 for (i = 0; i < path->p_tree_depth; i++) {
3098 bh = path->p_node[i].bh;
3099 el = path->p_node[i].el;
3100
3101 next_free = le16_to_cpu(el->l_next_free_rec);
3102 if (next_free == 0) {
3103 ocfs2_error(inode->i_sb,
3104 "Dinode %llu has a bad extent list",
3105 (unsigned long long)OCFS2_I(inode)->ip_blkno);
3106 ret = -EIO;
3107 return;
3108 }
3109
3110 rec = &el->l_recs[next_free - 1];
3111
3112 rec->e_int_clusters = insert_rec->e_cpos;
3113 le32_add_cpu(&rec->e_int_clusters,
3114 le16_to_cpu(insert_rec->e_leaf_clusters));
3115 le32_add_cpu(&rec->e_int_clusters,
3116 -le32_to_cpu(rec->e_cpos));
3117
3118 ret = ocfs2_journal_dirty(handle, bh);
3119 if (ret)
3120 mlog_errno(ret);
3121
3122 }
3123}
3124
1882static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, 3125static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1883 struct ocfs2_extent_rec *insert_rec, 3126 struct ocfs2_extent_rec *insert_rec,
1884 struct ocfs2_path *right_path, 3127 struct ocfs2_path *right_path,
1885 struct ocfs2_path **ret_left_path) 3128 struct ocfs2_path **ret_left_path)
1886{ 3129{
1887 int ret, i, next_free; 3130 int ret, next_free;
1888 struct buffer_head *bh;
1889 struct ocfs2_extent_list *el; 3131 struct ocfs2_extent_list *el;
1890 struct ocfs2_path *left_path = NULL; 3132 struct ocfs2_path *left_path = NULL;
1891 3133
@@ -1951,40 +3193,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1951 goto out; 3193 goto out;
1952 } 3194 }
1953 3195
1954 el = path_root_el(right_path); 3196 ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
1955 bh = path_root_bh(right_path);
1956 i = 0;
1957 while (1) {
1958 struct ocfs2_extent_rec *rec;
1959
1960 next_free = le16_to_cpu(el->l_next_free_rec);
1961 if (next_free == 0) {
1962 ocfs2_error(inode->i_sb,
1963 "Dinode %llu has a bad extent list",
1964 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1965 ret = -EIO;
1966 goto out;
1967 }
1968
1969 rec = &el->l_recs[next_free - 1];
1970
1971 rec->e_int_clusters = insert_rec->e_cpos;
1972 le32_add_cpu(&rec->e_int_clusters,
1973 le16_to_cpu(insert_rec->e_leaf_clusters));
1974 le32_add_cpu(&rec->e_int_clusters,
1975 -le32_to_cpu(rec->e_cpos));
1976
1977 ret = ocfs2_journal_dirty(handle, bh);
1978 if (ret)
1979 mlog_errno(ret);
1980
1981 /* Don't touch the leaf node */
1982 if (++i >= right_path->p_tree_depth)
1983 break;
1984
1985 bh = right_path->p_node[i].bh;
1986 el = right_path->p_node[i].el;
1987 }
1988 3197
1989 *ret_left_path = left_path; 3198 *ret_left_path = left_path;
1990 ret = 0; 3199 ret = 0;
@@ -1995,6 +3204,83 @@ out:
1995 return ret; 3204 return ret;
1996} 3205}
1997 3206
3207static void ocfs2_split_record(struct inode *inode,
3208 struct ocfs2_path *left_path,
3209 struct ocfs2_path *right_path,
3210 struct ocfs2_extent_rec *split_rec,
3211 enum ocfs2_split_type split)
3212{
3213 int index;
3214 u32 cpos = le32_to_cpu(split_rec->e_cpos);
3215 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3216 struct ocfs2_extent_rec *rec, *tmprec;
3217
3218 right_el = path_leaf_el(right_path);;
3219 if (left_path)
3220 left_el = path_leaf_el(left_path);
3221
3222 el = right_el;
3223 insert_el = right_el;
3224 index = ocfs2_search_extent_list(el, cpos);
3225 if (index != -1) {
3226 if (index == 0 && left_path) {
3227 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3228
3229 /*
3230 * This typically means that the record
3231 * started in the left path but moved to the
3232 * right as a result of rotation. We either
3233 * move the existing record to the left, or we
3234 * do the later insert there.
3235 *
3236 * In this case, the left path should always
3237 * exist as the rotate code will have passed
3238 * it back for a post-insert update.
3239 */
3240
3241 if (split == SPLIT_LEFT) {
3242 /*
3243 * It's a left split. Since we know
3244 * that the rotate code gave us an
3245 * empty extent in the left path, we
3246 * can just do the insert there.
3247 */
3248 insert_el = left_el;
3249 } else {
3250 /*
3251 * Right split - we have to move the
3252 * existing record over to the left
3253 * leaf. The insert will be into the
3254 * newly created empty extent in the
3255 * right leaf.
3256 */
3257 tmprec = &right_el->l_recs[index];
3258 ocfs2_rotate_leaf(left_el, tmprec);
3259 el = left_el;
3260
3261 memset(tmprec, 0, sizeof(*tmprec));
3262 index = ocfs2_search_extent_list(left_el, cpos);
3263 BUG_ON(index == -1);
3264 }
3265 }
3266 } else {
3267 BUG_ON(!left_path);
3268 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3269 /*
3270 * Left path is easy - we can just allow the insert to
3271 * happen.
3272 */
3273 el = left_el;
3274 insert_el = left_el;
3275 index = ocfs2_search_extent_list(el, cpos);
3276 BUG_ON(index == -1);
3277 }
3278
3279 rec = &el->l_recs[index];
3280 ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3281 ocfs2_rotate_leaf(insert_el, split_rec);
3282}
3283
1998/* 3284/*
1999 * This function only does inserts on an allocation b-tree. For dinode 3285 * This function only does inserts on an allocation b-tree. For dinode
2000 * lists, ocfs2_insert_at_leaf() is called directly. 3286 * lists, ocfs2_insert_at_leaf() is called directly.
@@ -2012,7 +3298,6 @@ static int ocfs2_insert_path(struct inode *inode,
2012{ 3298{
2013 int ret, subtree_index; 3299 int ret, subtree_index;
2014 struct buffer_head *leaf_bh = path_leaf_bh(right_path); 3300 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
2015 struct ocfs2_extent_list *el;
2016 3301
2017 /* 3302 /*
2018 * Pass both paths to the journal. The majority of inserts 3303 * Pass both paths to the journal. The majority of inserts
@@ -2048,9 +3333,18 @@ static int ocfs2_insert_path(struct inode *inode,
2048 } 3333 }
2049 } 3334 }
2050 3335
2051 el = path_leaf_el(right_path); 3336 if (insert->ins_split != SPLIT_NONE) {
3337 /*
3338 * We could call ocfs2_insert_at_leaf() for some types
3339 * of splits, but it's easier to just let one seperate
3340 * function sort it all out.
3341 */
3342 ocfs2_split_record(inode, left_path, right_path,
3343 insert_rec, insert->ins_split);
3344 } else
3345 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3346 insert, inode);
2052 3347
2053 ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
2054 ret = ocfs2_journal_dirty(handle, leaf_bh); 3348 ret = ocfs2_journal_dirty(handle, leaf_bh);
2055 if (ret) 3349 if (ret)
2056 mlog_errno(ret); 3350 mlog_errno(ret);
@@ -2139,7 +3433,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
2139 * can wind up skipping both of these two special cases... 3433 * can wind up skipping both of these two special cases...
2140 */ 3434 */
2141 if (rotate) { 3435 if (rotate) {
2142 ret = ocfs2_rotate_tree_right(inode, handle, 3436 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
2143 le32_to_cpu(insert_rec->e_cpos), 3437 le32_to_cpu(insert_rec->e_cpos),
2144 right_path, &left_path); 3438 right_path, &left_path);
2145 if (ret) { 3439 if (ret) {
@@ -2164,8 +3458,9 @@ static int ocfs2_do_insert_extent(struct inode *inode,
2164 } 3458 }
2165 3459
2166out_update_clusters: 3460out_update_clusters:
2167 ocfs2_update_dinode_clusters(inode, di, 3461 if (type->ins_split == SPLIT_NONE)
2168 le16_to_cpu(insert_rec->e_leaf_clusters)); 3462 ocfs2_update_dinode_clusters(inode, di,
3463 le16_to_cpu(insert_rec->e_leaf_clusters));
2169 3464
2170 ret = ocfs2_journal_dirty(handle, di_bh); 3465 ret = ocfs2_journal_dirty(handle, di_bh);
2171 if (ret) 3466 if (ret)
@@ -2178,6 +3473,44 @@ out:
2178 return ret; 3473 return ret;
2179} 3474}
2180 3475
3476static enum ocfs2_contig_type
3477ocfs2_figure_merge_contig_type(struct inode *inode,
3478 struct ocfs2_extent_list *el, int index,
3479 struct ocfs2_extent_rec *split_rec)
3480{
3481 struct ocfs2_extent_rec *rec;
3482 enum ocfs2_contig_type ret = CONTIG_NONE;
3483
3484 /*
3485 * We're careful to check for an empty extent record here -
3486 * the merge code will know what to do if it sees one.
3487 */
3488
3489 if (index > 0) {
3490 rec = &el->l_recs[index - 1];
3491 if (index == 1 && ocfs2_is_empty_extent(rec)) {
3492 if (split_rec->e_cpos == el->l_recs[index].e_cpos)
3493 ret = CONTIG_RIGHT;
3494 } else {
3495 ret = ocfs2_extent_contig(inode, rec, split_rec);
3496 }
3497 }
3498
3499 if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) {
3500 enum ocfs2_contig_type contig_type;
3501
3502 rec = &el->l_recs[index + 1];
3503 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
3504
3505 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
3506 ret = CONTIG_LEFTRIGHT;
3507 else if (ret == CONTIG_NONE)
3508 ret = contig_type;
3509 }
3510
3511 return ret;
3512}
3513
2181static void ocfs2_figure_contig_type(struct inode *inode, 3514static void ocfs2_figure_contig_type(struct inode *inode,
2182 struct ocfs2_insert_type *insert, 3515 struct ocfs2_insert_type *insert,
2183 struct ocfs2_extent_list *el, 3516 struct ocfs2_extent_list *el,
@@ -2269,6 +3602,8 @@ static int ocfs2_figure_insert_type(struct inode *inode,
2269 struct ocfs2_path *path = NULL; 3602 struct ocfs2_path *path = NULL;
2270 struct buffer_head *bh = NULL; 3603 struct buffer_head *bh = NULL;
2271 3604
3605 insert->ins_split = SPLIT_NONE;
3606
2272 el = &di->id2.i_list; 3607 el = &di->id2.i_list;
2273 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); 3608 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
2274 3609
@@ -2430,7 +3765,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2430 3765
2431 if (insert.ins_contig == CONTIG_NONE && insert.ins_free_records == 0) { 3766 if (insert.ins_contig == CONTIG_NONE && insert.ins_free_records == 0) {
2432 status = ocfs2_grow_tree(inode, handle, fe_bh, 3767 status = ocfs2_grow_tree(inode, handle, fe_bh,
2433 &insert.ins_tree_depth, last_eb_bh, 3768 &insert.ins_tree_depth, &last_eb_bh,
2434 meta_ac); 3769 meta_ac);
2435 if (status) { 3770 if (status) {
2436 mlog_errno(status); 3771 mlog_errno(status);
@@ -2456,6 +3791,352 @@ bail:
2456 return status; 3791 return status;
2457} 3792}
2458 3793
3794static void ocfs2_make_right_split_rec(struct super_block *sb,
3795 struct ocfs2_extent_rec *split_rec,
3796 u32 cpos,
3797 struct ocfs2_extent_rec *rec)
3798{
3799 u32 rec_cpos = le32_to_cpu(rec->e_cpos);
3800 u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
3801
3802 memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
3803
3804 split_rec->e_cpos = cpu_to_le32(cpos);
3805 split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
3806
3807 split_rec->e_blkno = rec->e_blkno;
3808 le64_add_cpu(&split_rec->e_blkno,
3809 ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
3810
3811 split_rec->e_flags = rec->e_flags;
3812}
3813
3814static int ocfs2_split_and_insert(struct inode *inode,
3815 handle_t *handle,
3816 struct ocfs2_path *path,
3817 struct buffer_head *di_bh,
3818 struct buffer_head **last_eb_bh,
3819 int split_index,
3820 struct ocfs2_extent_rec *orig_split_rec,
3821 struct ocfs2_alloc_context *meta_ac)
3822{
3823 int ret = 0, depth;
3824 unsigned int insert_range, rec_range, do_leftright = 0;
3825 struct ocfs2_extent_rec tmprec;
3826 struct ocfs2_extent_list *rightmost_el;
3827 struct ocfs2_extent_rec rec;
3828 struct ocfs2_extent_rec split_rec = *orig_split_rec;
3829 struct ocfs2_insert_type insert;
3830 struct ocfs2_extent_block *eb;
3831 struct ocfs2_dinode *di;
3832
3833leftright:
3834 /*
3835 * Store a copy of the record on the stack - it might move
3836 * around as the tree is manipulated below.
3837 */
3838 rec = path_leaf_el(path)->l_recs[split_index];
3839
3840 di = (struct ocfs2_dinode *)di_bh->b_data;
3841 rightmost_el = &di->id2.i_list;
3842
3843 depth = le16_to_cpu(rightmost_el->l_tree_depth);
3844 if (depth) {
3845 BUG_ON(!(*last_eb_bh));
3846 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
3847 rightmost_el = &eb->h_list;
3848 }
3849
3850 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
3851 le16_to_cpu(rightmost_el->l_count)) {
3852 int old_depth = depth;
3853
3854 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
3855 meta_ac);
3856 if (ret) {
3857 mlog_errno(ret);
3858 goto out;
3859 }
3860
3861 if (old_depth != depth) {
3862 eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
3863 rightmost_el = &eb->h_list;
3864 }
3865 }
3866
3867 memset(&insert, 0, sizeof(struct ocfs2_insert_type));
3868 insert.ins_appending = APPEND_NONE;
3869 insert.ins_contig = CONTIG_NONE;
3870 insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
3871 - le16_to_cpu(rightmost_el->l_next_free_rec);
3872 insert.ins_tree_depth = depth;
3873
3874 insert_range = le32_to_cpu(split_rec.e_cpos) +
3875 le16_to_cpu(split_rec.e_leaf_clusters);
3876 rec_range = le32_to_cpu(rec.e_cpos) +
3877 le16_to_cpu(rec.e_leaf_clusters);
3878
3879 if (split_rec.e_cpos == rec.e_cpos) {
3880 insert.ins_split = SPLIT_LEFT;
3881 } else if (insert_range == rec_range) {
3882 insert.ins_split = SPLIT_RIGHT;
3883 } else {
3884 /*
3885 * Left/right split. We fake this as a right split
3886 * first and then make a second pass as a left split.
3887 */
3888 insert.ins_split = SPLIT_RIGHT;
3889
3890 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
3891 &rec);
3892
3893 split_rec = tmprec;
3894
3895 BUG_ON(do_leftright);
3896 do_leftright = 1;
3897 }
3898
3899 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec,
3900 &insert);
3901 if (ret) {
3902 mlog_errno(ret);
3903 goto out;
3904 }
3905
3906 if (do_leftright == 1) {
3907 u32 cpos;
3908 struct ocfs2_extent_list *el;
3909
3910 do_leftright++;
3911 split_rec = *orig_split_rec;
3912
3913 ocfs2_reinit_path(path, 1);
3914
3915 cpos = le32_to_cpu(split_rec.e_cpos);
3916 ret = ocfs2_find_path(inode, path, cpos);
3917 if (ret) {
3918 mlog_errno(ret);
3919 goto out;
3920 }
3921
3922 el = path_leaf_el(path);
3923 split_index = ocfs2_search_extent_list(el, cpos);
3924 goto leftright;
3925 }
3926out:
3927
3928 return ret;
3929}
3930
3931/*
3932 * Mark part or all of the extent record at split_index in the leaf
3933 * pointed to by path as written. This removes the unwritten
3934 * extent flag.
3935 *
3936 * Care is taken to handle contiguousness so as to not grow the tree.
3937 *
3938 * meta_ac is not strictly necessary - we only truly need it if growth
3939 * of the tree is required. All other cases will degrade into a less
3940 * optimal tree layout.
3941 *
3942 * last_eb_bh should be the rightmost leaf block for any inode with a
3943 * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call.
3944 *
3945 * This code is optimized for readability - several passes might be
3946 * made over certain portions of the tree. All of those blocks will
3947 * have been brought into cache (and pinned via the journal), so the
3948 * extra overhead is not expressed in terms of disk reads.
3949 */
3950static int __ocfs2_mark_extent_written(struct inode *inode,
3951 struct buffer_head *di_bh,
3952 handle_t *handle,
3953 struct ocfs2_path *path,
3954 int split_index,
3955 struct ocfs2_extent_rec *split_rec,
3956 struct ocfs2_alloc_context *meta_ac,
3957 struct ocfs2_cached_dealloc_ctxt *dealloc)
3958{
3959 int ret = 0;
3960 struct ocfs2_extent_list *el = path_leaf_el(path);
3961 struct buffer_head *eb_bh, *last_eb_bh = NULL;
3962 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3963 struct ocfs2_merge_ctxt ctxt;
3964 struct ocfs2_extent_list *rightmost_el;
3965
3966 if (!rec->e_flags & OCFS2_EXT_UNWRITTEN) {
3967 ret = -EIO;
3968 mlog_errno(ret);
3969 goto out;
3970 }
3971
3972 if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
3973 ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
3974 (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
3975 ret = -EIO;
3976 mlog_errno(ret);
3977 goto out;
3978 }
3979
3980 eb_bh = path_leaf_bh(path);
3981 ret = ocfs2_journal_access(handle, inode, eb_bh,
3982 OCFS2_JOURNAL_ACCESS_WRITE);
3983 if (ret) {
3984 mlog_errno(ret);
3985 goto out;
3986 }
3987
3988 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el,
3989 split_index,
3990 split_rec);
3991
3992 /*
3993 * The core merge / split code wants to know how much room is
3994 * left in this inodes allocation tree, so we pass the
3995 * rightmost extent list.
3996 */
3997 if (path->p_tree_depth) {
3998 struct ocfs2_extent_block *eb;
3999 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4000
4001 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4002 le64_to_cpu(di->i_last_eb_blk),
4003 &last_eb_bh, OCFS2_BH_CACHED, inode);
4004 if (ret) {
4005 mlog_exit(ret);
4006 goto out;
4007 }
4008
4009 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4010 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4011 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4012 ret = -EROFS;
4013 goto out;
4014 }
4015
4016 rightmost_el = &eb->h_list;
4017 } else
4018 rightmost_el = path_root_el(path);
4019
4020 ctxt.c_used_tail_recs = le16_to_cpu(rightmost_el->l_next_free_rec);
4021 if (ctxt.c_used_tail_recs > 0 &&
4022 ocfs2_is_empty_extent(&rightmost_el->l_recs[0]))
4023 ctxt.c_used_tail_recs--;
4024
4025 if (rec->e_cpos == split_rec->e_cpos &&
4026 rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4027 ctxt.c_split_covers_rec = 1;
4028 else
4029 ctxt.c_split_covers_rec = 0;
4030
4031 ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4032
4033 mlog(0, "index: %d, contig: %u, used_tail_recs: %u, "
4034 "has_empty: %u, split_covers: %u\n", split_index,
4035 ctxt.c_contig_type, ctxt.c_used_tail_recs,
4036 ctxt.c_has_empty_extent, ctxt.c_split_covers_rec);
4037
4038 if (ctxt.c_contig_type == CONTIG_NONE) {
4039 if (ctxt.c_split_covers_rec)
4040 el->l_recs[split_index] = *split_rec;
4041 else
4042 ret = ocfs2_split_and_insert(inode, handle, path, di_bh,
4043 &last_eb_bh, split_index,
4044 split_rec, meta_ac);
4045 if (ret)
4046 mlog_errno(ret);
4047 } else {
4048 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4049 split_index, split_rec,
4050 dealloc, &ctxt);
4051 if (ret)
4052 mlog_errno(ret);
4053 }
4054
4055 ocfs2_journal_dirty(handle, eb_bh);
4056
4057out:
4058 brelse(last_eb_bh);
4059 return ret;
4060}
4061
4062/*
4063 * Mark the already-existing extent at cpos as written for len clusters.
4064 *
4065 * If the existing extent is larger than the request, initiate a
4066 * split. An attempt will be made at merging with adjacent extents.
4067 *
4068 * The caller is responsible for passing down meta_ac if we'll need it.
4069 */
4070int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
4071 handle_t *handle, u32 cpos, u32 len, u32 phys,
4072 struct ocfs2_alloc_context *meta_ac,
4073 struct ocfs2_cached_dealloc_ctxt *dealloc)
4074{
4075 int ret, index;
4076 u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4077 struct ocfs2_extent_rec split_rec;
4078 struct ocfs2_path *left_path = NULL;
4079 struct ocfs2_extent_list *el;
4080
4081 mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4082 inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4083
4084 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4085 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4086 "that are being written to, but the feature bit "
4087 "is not set in the super block.",
4088 (unsigned long long)OCFS2_I(inode)->ip_blkno);
4089 ret = -EROFS;
4090 goto out;
4091 }
4092
4093 /*
4094 * XXX: This should be fixed up so that we just re-insert the
4095 * next extent records.
4096 */
4097 ocfs2_extent_map_trunc(inode, 0);
4098
4099 left_path = ocfs2_new_inode_path(di_bh);
4100 if (!left_path) {
4101 ret = -ENOMEM;
4102 mlog_errno(ret);
4103 goto out;
4104 }
4105
4106 ret = ocfs2_find_path(inode, left_path, cpos);
4107 if (ret) {
4108 mlog_errno(ret);
4109 goto out;
4110 }
4111 el = path_leaf_el(left_path);
4112
4113 index = ocfs2_search_extent_list(el, cpos);
4114 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4115 ocfs2_error(inode->i_sb,
4116 "Inode %llu has an extent at cpos %u which can no "
4117 "longer be found.\n",
4118 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4119 ret = -EROFS;
4120 goto out;
4121 }
4122
4123 memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4124 split_rec.e_cpos = cpu_to_le32(cpos);
4125 split_rec.e_leaf_clusters = cpu_to_le16(len);
4126 split_rec.e_blkno = cpu_to_le64(start_blkno);
4127 split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4128 split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4129
4130 ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path,
4131 index, &split_rec, meta_ac, dealloc);
4132 if (ret)
4133 mlog_errno(ret);
4134
4135out:
4136 ocfs2_free_path(left_path);
4137 return ret;
4138}
4139
2459static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) 4140static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
2460{ 4141{
2461 struct buffer_head *tl_bh = osb->osb_tl_bh; 4142 struct buffer_head *tl_bh = osb->osb_tl_bh;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index cb02e53b593c..d3acf45225c2 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -35,6 +35,11 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
35 u64 start_blk, 35 u64 start_blk,
36 u32 new_clusters, 36 u32 new_clusters,
37 struct ocfs2_alloc_context *meta_ac); 37 struct ocfs2_alloc_context *meta_ac);
38struct ocfs2_cached_dealloc_ctxt;
39int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
40 handle_t *handle, u32 cpos, u32 len, u32 phys,
41 struct ocfs2_alloc_context *meta_ac,
42 struct ocfs2_cached_dealloc_ctxt *dealloc);
38int ocfs2_num_free_extents(struct ocfs2_super *osb, 43int ocfs2_num_free_extents(struct ocfs2_super *osb,
39 struct inode *inode, 44 struct inode *inode,
40 struct ocfs2_dinode *fe); 45 struct ocfs2_dinode *fe);
@@ -102,6 +107,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
102 107
103int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, 108int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
104 u32 cpos, struct buffer_head **leaf_bh); 109 u32 cpos, struct buffer_head **leaf_bh);
110int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
105 111
106/* 112/*
107 * Helper function to look at the # of clusters in an extent record. 113 * Helper function to look at the # of clusters in an extent record.
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
index f226b2207628..ff257628af16 100644
--- a/fs/ocfs2/endian.h
+++ b/fs/ocfs2/endian.h
@@ -32,6 +32,11 @@ static inline void le32_add_cpu(__le32 *var, u32 val)
32 *var = cpu_to_le32(le32_to_cpu(*var) + val); 32 *var = cpu_to_le32(le32_to_cpu(*var) + val);
33} 33}
34 34
35static inline void le64_add_cpu(__le64 *var, u64 val)
36{
37 *var = cpu_to_le64(le64_to_cpu(*var) + val);
38}
39
35static inline void le32_and_cpu(__le32 *var, u32 val) 40static inline void le32_and_cpu(__le32 *var, u32 val)
36{ 41{
37 *var = cpu_to_le32(le32_to_cpu(*var) & val); 42 *var = cpu_to_le32(le32_to_cpu(*var) & val);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index e23e416ca74c..03c1d365c78b 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -373,37 +373,6 @@ out:
373 return ret; 373 return ret;
374} 374}
375 375
376/*
377 * Return the index of the extent record which contains cluster #v_cluster.
378 * -1 is returned if it was not found.
379 *
380 * Should work fine on interior and exterior nodes.
381 */
382static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
383 u32 v_cluster)
384{
385 int ret = -1;
386 int i;
387 struct ocfs2_extent_rec *rec;
388 u32 rec_end, rec_start, clusters;
389
390 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
391 rec = &el->l_recs[i];
392
393 rec_start = le32_to_cpu(rec->e_cpos);
394 clusters = ocfs2_rec_clusters(el, rec);
395
396 rec_end = rec_start + clusters;
397
398 if (v_cluster >= rec_start && v_cluster < rec_end) {
399 ret = i;
400 break;
401 }
402 }
403
404 return ret;
405}
406
407int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, 376int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
408 u32 *p_cluster, u32 *num_clusters, 377 u32 *p_cluster, u32 *num_clusters,
409 unsigned int *extent_flags) 378 unsigned int *extent_flags)
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 648ef8e45eaa..5cc90a40b3c5 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -306,6 +306,19 @@ static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
306 return 0; 306 return 0;
307} 307}
308 308
309static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
310{
311 /*
312 * Support for sparse files is a pre-requisite
313 */
314 if (!ocfs2_sparse_alloc(osb))
315 return 0;
316
317 if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_UNWRITTEN)
318 return 1;
319 return 0;
320}
321
309/* set / clear functions because cluster events can make these happen 322/* set / clear functions because cluster events can make these happen
310 * in parallel so we want the transitions to be atomic. this also 323 * in parallel so we want the transitions to be atomic. this also
311 * means that any future flags osb_flags must be protected by spinlock 324 * means that any future flags osb_flags must be protected by spinlock
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index f0d9eb08547a..c20a74b99d87 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -88,7 +88,7 @@
88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB 88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ 89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) 90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
91#define OCFS2_FEATURE_RO_COMPAT_SUPP 0 91#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
92 92
93/* 93/*
94 * Heartbeat-only devices are missing journals and other files. The 94 * Heartbeat-only devices are missing journals and other files. The
@@ -116,6 +116,11 @@
116 */ 116 */
117#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001 117#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001
118 118
119/*
120 * Unwritten extents support.
121 */
122#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001
123
119/* The byte offset of the first backup block will be 1G. 124/* The byte offset of the first backup block will be 1G.
120 * The following will be 4G, 16G, 64G, 256G and 1T. 125 * The following will be 4G, 16G, 64G, 256G and 1T.
121 */ 126 */