aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2
diff options
context:
space:
mode:
authorDavid Woodhouse <dwmw2@infradead.org>2007-07-23 05:20:10 -0400
committerDavid Woodhouse <dwmw2@infradead.org>2007-07-23 05:20:10 -0400
commit39fe5434cb9de5da40510028b17b96bc4eb312b3 (patch)
tree7a02a317b9ad57da51ca99887c119e779ccf3f13 /fs/ocfs2
parent0fc72b81d3111d114ab378935b1cf07680ca1289 (diff)
parentf695baf2df9e0413d3521661070103711545207a (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'fs/ocfs2')
-rw-r--r--fs/ocfs2/alloc.c2676
-rw-r--r--fs/ocfs2/alloc.h43
-rw-r--r--fs/ocfs2/aops.c1017
-rw-r--r--fs/ocfs2/aops.h61
-rw-r--r--fs/ocfs2/cluster/heartbeat.c96
-rw-r--r--fs/ocfs2/cluster/heartbeat.h6
-rw-r--r--fs/ocfs2/cluster/masklog.c1
-rw-r--r--fs/ocfs2/cluster/nodemanager.c42
-rw-r--r--fs/ocfs2/cluster/nodemanager.h5
-rw-r--r--fs/ocfs2/cluster/tcp.c21
-rw-r--r--fs/ocfs2/dir.c2
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c8
-rw-r--r--fs/ocfs2/dlm/dlmfs.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c42
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c79
-rw-r--r--fs/ocfs2/dlmglue.c6
-rw-r--r--fs/ocfs2/endian.h5
-rw-r--r--fs/ocfs2/export.h2
-rw-r--r--fs/ocfs2/extent_map.c41
-rw-r--r--fs/ocfs2/file.c744
-rw-r--r--fs/ocfs2/file.h10
-rw-r--r--fs/ocfs2/heartbeat.c12
-rw-r--r--fs/ocfs2/ioctl.c17
-rw-r--r--fs/ocfs2/journal.c6
-rw-r--r--fs/ocfs2/journal.h2
-rw-r--r--fs/ocfs2/mmap.c183
-rw-r--r--fs/ocfs2/namei.c2
-rw-r--r--fs/ocfs2/ocfs2.h14
-rw-r--r--fs/ocfs2/ocfs2_fs.h33
-rw-r--r--fs/ocfs2/slot_map.c12
-rw-r--r--fs/ocfs2/suballoc.c46
-rw-r--r--fs/ocfs2/suballoc.h17
-rw-r--r--fs/ocfs2/super.c29
-rw-r--r--fs/ocfs2/super.h2
-rw-r--r--fs/ocfs2/uptodate.c2
35 files changed, 4289 insertions, 997 deletions
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 19712a7d145f..f5e11f4fa952 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -50,6 +50,8 @@
50#include "buffer_head_io.h" 50#include "buffer_head_io.h"
51 51
52static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); 52static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
53static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
54 struct ocfs2_extent_block *eb);
53 55
54/* 56/*
55 * Structures which describe a path through a btree, and functions to 57 * Structures which describe a path through a btree, and functions to
@@ -117,6 +119,31 @@ static void ocfs2_free_path(struct ocfs2_path *path)
117} 119}
118 120
119/* 121/*
122 * All the elements of src into dest. After this call, src could be freed
123 * without affecting dest.
124 *
125 * Both paths should have the same root. Any non-root elements of dest
126 * will be freed.
127 */
128static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
129{
130 int i;
131
132 BUG_ON(path_root_bh(dest) != path_root_bh(src));
133 BUG_ON(path_root_el(dest) != path_root_el(src));
134
135 ocfs2_reinit_path(dest, 1);
136
137 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
138 dest->p_node[i].bh = src->p_node[i].bh;
139 dest->p_node[i].el = src->p_node[i].el;
140
141 if (dest->p_node[i].bh)
142 get_bh(dest->p_node[i].bh);
143 }
144}
145
146/*
120 * Make the *dest path the same as src and re-initialize src path to 147 * Make the *dest path the same as src and re-initialize src path to
121 * have a root only. 148 * have a root only.
122 */ 149 */
@@ -212,10 +239,41 @@ out:
212 return ret; 239 return ret;
213} 240}
214 241
242/*
243 * Return the index of the extent record which contains cluster #v_cluster.
244 * -1 is returned if it was not found.
245 *
246 * Should work fine on interior and exterior nodes.
247 */
248int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
249{
250 int ret = -1;
251 int i;
252 struct ocfs2_extent_rec *rec;
253 u32 rec_end, rec_start, clusters;
254
255 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
256 rec = &el->l_recs[i];
257
258 rec_start = le32_to_cpu(rec->e_cpos);
259 clusters = ocfs2_rec_clusters(el, rec);
260
261 rec_end = rec_start + clusters;
262
263 if (v_cluster >= rec_start && v_cluster < rec_end) {
264 ret = i;
265 break;
266 }
267 }
268
269 return ret;
270}
271
215enum ocfs2_contig_type { 272enum ocfs2_contig_type {
216 CONTIG_NONE = 0, 273 CONTIG_NONE = 0,
217 CONTIG_LEFT, 274 CONTIG_LEFT,
218 CONTIG_RIGHT 275 CONTIG_RIGHT,
276 CONTIG_LEFTRIGHT,
219}; 277};
220 278
221 279
@@ -253,6 +311,14 @@ static enum ocfs2_contig_type
253{ 311{
254 u64 blkno = le64_to_cpu(insert_rec->e_blkno); 312 u64 blkno = le64_to_cpu(insert_rec->e_blkno);
255 313
314 /*
315 * Refuse to coalesce extent records with different flag
316 * fields - we don't want to mix unwritten extents with user
317 * data.
318 */
319 if (ext->e_flags != insert_rec->e_flags)
320 return CONTIG_NONE;
321
256 if (ocfs2_extents_adjacent(ext, insert_rec) && 322 if (ocfs2_extents_adjacent(ext, insert_rec) &&
257 ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) 323 ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
258 return CONTIG_RIGHT; 324 return CONTIG_RIGHT;
@@ -277,7 +343,14 @@ enum ocfs2_append_type {
277 APPEND_TAIL, 343 APPEND_TAIL,
278}; 344};
279 345
346enum ocfs2_split_type {
347 SPLIT_NONE = 0,
348 SPLIT_LEFT,
349 SPLIT_RIGHT,
350};
351
280struct ocfs2_insert_type { 352struct ocfs2_insert_type {
353 enum ocfs2_split_type ins_split;
281 enum ocfs2_append_type ins_appending; 354 enum ocfs2_append_type ins_appending;
282 enum ocfs2_contig_type ins_contig; 355 enum ocfs2_contig_type ins_contig;
283 int ins_contig_index; 356 int ins_contig_index;
@@ -285,6 +358,13 @@ struct ocfs2_insert_type {
285 int ins_tree_depth; 358 int ins_tree_depth;
286}; 359};
287 360
361struct ocfs2_merge_ctxt {
362 enum ocfs2_contig_type c_contig_type;
363 int c_has_empty_extent;
364 int c_split_covers_rec;
365 int c_used_tail_recs;
366};
367
288/* 368/*
289 * How many free extents have we got before we need more meta data? 369 * How many free extents have we got before we need more meta data?
290 */ 370 */
@@ -384,13 +464,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
384 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); 464 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
385 eb->h_blkno = cpu_to_le64(first_blkno); 465 eb->h_blkno = cpu_to_le64(first_blkno);
386 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 466 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
387
388#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
389 /* we always use slot zero's suballocator */
390 eb->h_suballoc_slot = 0;
391#else
392 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); 467 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
393#endif
394 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 468 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
395 eb->h_list.l_count = 469 eb->h_list.l_count =
396 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 470 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -461,7 +535,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
461 struct inode *inode, 535 struct inode *inode,
462 struct buffer_head *fe_bh, 536 struct buffer_head *fe_bh,
463 struct buffer_head *eb_bh, 537 struct buffer_head *eb_bh,
464 struct buffer_head *last_eb_bh, 538 struct buffer_head **last_eb_bh,
465 struct ocfs2_alloc_context *meta_ac) 539 struct ocfs2_alloc_context *meta_ac)
466{ 540{
467 int status, new_blocks, i; 541 int status, new_blocks, i;
@@ -476,7 +550,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
476 550
477 mlog_entry_void(); 551 mlog_entry_void();
478 552
479 BUG_ON(!last_eb_bh); 553 BUG_ON(!last_eb_bh || !*last_eb_bh);
480 554
481 fe = (struct ocfs2_dinode *) fe_bh->b_data; 555 fe = (struct ocfs2_dinode *) fe_bh->b_data;
482 556
@@ -507,7 +581,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
507 goto bail; 581 goto bail;
508 } 582 }
509 583
510 eb = (struct ocfs2_extent_block *)last_eb_bh->b_data; 584 eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
511 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); 585 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
512 586
513 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be 587 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
@@ -568,7 +642,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
568 * journal_dirty erroring as it won't unless we've aborted the 642 * journal_dirty erroring as it won't unless we've aborted the
569 * handle (in which case we would never be here) so reserving 643 * handle (in which case we would never be here) so reserving
570 * the write with journal_access is all we need to do. */ 644 * the write with journal_access is all we need to do. */
571 status = ocfs2_journal_access(handle, inode, last_eb_bh, 645 status = ocfs2_journal_access(handle, inode, *last_eb_bh,
572 OCFS2_JOURNAL_ACCESS_WRITE); 646 OCFS2_JOURNAL_ACCESS_WRITE);
573 if (status < 0) { 647 if (status < 0) {
574 mlog_errno(status); 648 mlog_errno(status);
@@ -601,10 +675,10 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
601 * next_leaf on the previously last-extent-block. */ 675 * next_leaf on the previously last-extent-block. */
602 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); 676 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
603 677
604 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 678 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
605 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 679 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
606 680
607 status = ocfs2_journal_dirty(handle, last_eb_bh); 681 status = ocfs2_journal_dirty(handle, *last_eb_bh);
608 if (status < 0) 682 if (status < 0)
609 mlog_errno(status); 683 mlog_errno(status);
610 status = ocfs2_journal_dirty(handle, fe_bh); 684 status = ocfs2_journal_dirty(handle, fe_bh);
@@ -616,6 +690,14 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
616 mlog_errno(status); 690 mlog_errno(status);
617 } 691 }
618 692
693 /*
694 * Some callers want to track the rightmost leaf so pass it
695 * back here.
696 */
697 brelse(*last_eb_bh);
698 get_bh(new_eb_bhs[0]);
699 *last_eb_bh = new_eb_bhs[0];
700
619 status = 0; 701 status = 0;
620bail: 702bail:
621 if (new_eb_bhs) { 703 if (new_eb_bhs) {
@@ -829,6 +911,87 @@ bail:
829} 911}
830 912
831/* 913/*
914 * Grow a b-tree so that it has more records.
915 *
916 * We might shift the tree depth in which case existing paths should
917 * be considered invalid.
918 *
919 * Tree depth after the grow is returned via *final_depth.
920 *
921 * *last_eb_bh will be updated by ocfs2_add_branch().
922 */
923static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
924 struct buffer_head *di_bh, int *final_depth,
925 struct buffer_head **last_eb_bh,
926 struct ocfs2_alloc_context *meta_ac)
927{
928 int ret, shift;
929 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
930 int depth = le16_to_cpu(di->id2.i_list.l_tree_depth);
931 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
932 struct buffer_head *bh = NULL;
933
934 BUG_ON(meta_ac == NULL);
935
936 shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh);
937 if (shift < 0) {
938 ret = shift;
939 mlog_errno(ret);
940 goto out;
941 }
942
943 /* We traveled all the way to the bottom of the allocation tree
944 * and didn't find room for any more extents - we need to add
945 * another tree level */
946 if (shift) {
947 BUG_ON(bh);
948 mlog(0, "need to shift tree depth (current = %d)\n", depth);
949
950 /* ocfs2_shift_tree_depth will return us a buffer with
951 * the new extent block (so we can pass that to
952 * ocfs2_add_branch). */
953 ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh,
954 meta_ac, &bh);
955 if (ret < 0) {
956 mlog_errno(ret);
957 goto out;
958 }
959 depth++;
960 if (depth == 1) {
961 /*
962 * Special case: we have room now if we shifted from
963 * tree_depth 0, so no more work needs to be done.
964 *
965 * We won't be calling add_branch, so pass
966 * back *last_eb_bh as the new leaf. At depth
967 * zero, it should always be null so there's
968 * no reason to brelse.
969 */
970 BUG_ON(*last_eb_bh);
971 get_bh(bh);
972 *last_eb_bh = bh;
973 goto out;
974 }
975 }
976
977 /* call ocfs2_add_branch to add the final part of the tree with
978 * the new data. */
979 mlog(0, "add branch. bh = %p\n", bh);
980 ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh,
981 meta_ac);
982 if (ret < 0) {
983 mlog_errno(ret);
984 goto out;
985 }
986
987out:
988 if (final_depth)
989 *final_depth = depth;
990 brelse(bh);
991 return ret;
992}
993
994/*
832 * This is only valid for leaf nodes, which are the only ones that can 995 * This is only valid for leaf nodes, which are the only ones that can
833 * have empty extents anyway. 996 * have empty extents anyway.
834 */ 997 */
@@ -934,6 +1097,22 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
934 1097
935} 1098}
936 1099
1100static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1101{
1102 int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1103
1104 BUG_ON(num_recs == 0);
1105
1106 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1107 num_recs--;
1108 size = num_recs * sizeof(struct ocfs2_extent_rec);
1109 memmove(&el->l_recs[0], &el->l_recs[1], size);
1110 memset(&el->l_recs[num_recs], 0,
1111 sizeof(struct ocfs2_extent_rec));
1112 el->l_next_free_rec = cpu_to_le16(num_recs);
1113 }
1114}
1115
937/* 1116/*
938 * Create an empty extent record . 1117 * Create an empty extent record .
939 * 1118 *
@@ -1211,6 +1390,10 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1211 * immediately to their right. 1390 * immediately to their right.
1212 */ 1391 */
1213 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); 1392 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1393 if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1394 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1395 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1396 }
1214 left_clusters -= le32_to_cpu(left_rec->e_cpos); 1397 left_clusters -= le32_to_cpu(left_rec->e_cpos);
1215 left_rec->e_int_clusters = cpu_to_le32(left_clusters); 1398 left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1216 1399
@@ -1531,10 +1714,16 @@ out:
1531 return ret; 1714 return ret;
1532} 1715}
1533 1716
1717/*
1718 * Extend the transaction by enough credits to complete the rotation,
1719 * and still leave at least the original number of credits allocated
1720 * to this transaction.
1721 */
1534static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, 1722static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1723 int op_credits,
1535 struct ocfs2_path *path) 1724 struct ocfs2_path *path)
1536{ 1725{
1537 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1; 1726 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
1538 1727
1539 if (handle->h_buffer_credits < credits) 1728 if (handle->h_buffer_credits < credits)
1540 return ocfs2_extend_trans(handle, credits); 1729 return ocfs2_extend_trans(handle, credits);
@@ -1568,6 +1757,29 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1568 return 0; 1757 return 0;
1569} 1758}
1570 1759
1760static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
1761{
1762 int next_free = le16_to_cpu(el->l_next_free_rec);
1763 unsigned int range;
1764 struct ocfs2_extent_rec *rec;
1765
1766 if (next_free == 0)
1767 return 0;
1768
1769 rec = &el->l_recs[0];
1770 if (ocfs2_is_empty_extent(rec)) {
1771 /* Empty list. */
1772 if (next_free == 1)
1773 return 0;
1774 rec = &el->l_recs[1];
1775 }
1776
1777 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1778 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1779 return 1;
1780 return 0;
1781}
1782
1571/* 1783/*
1572 * Rotate all the records in a btree right one record, starting at insert_cpos. 1784 * Rotate all the records in a btree right one record, starting at insert_cpos.
1573 * 1785 *
@@ -1586,11 +1798,12 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1586 */ 1798 */
1587static int ocfs2_rotate_tree_right(struct inode *inode, 1799static int ocfs2_rotate_tree_right(struct inode *inode,
1588 handle_t *handle, 1800 handle_t *handle,
1801 enum ocfs2_split_type split,
1589 u32 insert_cpos, 1802 u32 insert_cpos,
1590 struct ocfs2_path *right_path, 1803 struct ocfs2_path *right_path,
1591 struct ocfs2_path **ret_left_path) 1804 struct ocfs2_path **ret_left_path)
1592{ 1805{
1593 int ret, start; 1806 int ret, start, orig_credits = handle->h_buffer_credits;
1594 u32 cpos; 1807 u32 cpos;
1595 struct ocfs2_path *left_path = NULL; 1808 struct ocfs2_path *left_path = NULL;
1596 1809
@@ -1657,9 +1870,9 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1657 (unsigned long long) 1870 (unsigned long long)
1658 path_leaf_bh(left_path)->b_blocknr); 1871 path_leaf_bh(left_path)->b_blocknr);
1659 1872
1660 if (ocfs2_rotate_requires_path_adjustment(left_path, 1873 if (split == SPLIT_NONE &&
1874 ocfs2_rotate_requires_path_adjustment(left_path,
1661 insert_cpos)) { 1875 insert_cpos)) {
1662 mlog(0, "Path adjustment required\n");
1663 1876
1664 /* 1877 /*
1665 * We've rotated the tree as much as we 1878 * We've rotated the tree as much as we
@@ -1687,7 +1900,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1687 right_path->p_tree_depth); 1900 right_path->p_tree_depth);
1688 1901
1689 ret = ocfs2_extend_rotate_transaction(handle, start, 1902 ret = ocfs2_extend_rotate_transaction(handle, start,
1690 right_path); 1903 orig_credits, right_path);
1691 if (ret) { 1904 if (ret) {
1692 mlog_errno(ret); 1905 mlog_errno(ret);
1693 goto out; 1906 goto out;
@@ -1700,6 +1913,24 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1700 goto out; 1913 goto out;
1701 } 1914 }
1702 1915
1916 if (split != SPLIT_NONE &&
1917 ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
1918 insert_cpos)) {
1919 /*
1920 * A rotate moves the rightmost left leaf
1921 * record over to the leftmost right leaf
1922 * slot. If we're doing an extent split
1923 * instead of a real insert, then we have to
1924 * check that the extent to be split wasn't
1925 * just moved over. If it was, then we can
1926 * exit here, passing left_path back -
1927 * ocfs2_split_extent() is smart enough to
1928 * search both leaves.
1929 */
1930 *ret_left_path = left_path;
1931 goto out_ret_path;
1932 }
1933
1703 /* 1934 /*
1704 * There is no need to re-read the next right path 1935 * There is no need to re-read the next right path
1705 * as we know that it'll be our current left 1936 * as we know that it'll be our current left
@@ -1722,6 +1953,1031 @@ out_ret_path:
1722 return ret; 1953 return ret;
1723} 1954}
1724 1955
1956static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
1957 struct ocfs2_path *path)
1958{
1959 int i, idx;
1960 struct ocfs2_extent_rec *rec;
1961 struct ocfs2_extent_list *el;
1962 struct ocfs2_extent_block *eb;
1963 u32 range;
1964
1965 /* Path should always be rightmost. */
1966 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
1967 BUG_ON(eb->h_next_leaf_blk != 0ULL);
1968
1969 el = &eb->h_list;
1970 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
1971 idx = le16_to_cpu(el->l_next_free_rec) - 1;
1972 rec = &el->l_recs[idx];
1973 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1974
1975 for (i = 0; i < path->p_tree_depth; i++) {
1976 el = path->p_node[i].el;
1977 idx = le16_to_cpu(el->l_next_free_rec) - 1;
1978 rec = &el->l_recs[idx];
1979
1980 rec->e_int_clusters = cpu_to_le32(range);
1981 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
1982
1983 ocfs2_journal_dirty(handle, path->p_node[i].bh);
1984 }
1985}
1986
1987static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
1988 struct ocfs2_cached_dealloc_ctxt *dealloc,
1989 struct ocfs2_path *path, int unlink_start)
1990{
1991 int ret, i;
1992 struct ocfs2_extent_block *eb;
1993 struct ocfs2_extent_list *el;
1994 struct buffer_head *bh;
1995
1996 for(i = unlink_start; i < path_num_items(path); i++) {
1997 bh = path->p_node[i].bh;
1998
1999 eb = (struct ocfs2_extent_block *)bh->b_data;
2000 /*
2001 * Not all nodes might have had their final count
2002 * decremented by the caller - handle this here.
2003 */
2004 el = &eb->h_list;
2005 if (le16_to_cpu(el->l_next_free_rec) > 1) {
2006 mlog(ML_ERROR,
2007 "Inode %llu, attempted to remove extent block "
2008 "%llu with %u records\n",
2009 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2010 (unsigned long long)le64_to_cpu(eb->h_blkno),
2011 le16_to_cpu(el->l_next_free_rec));
2012
2013 ocfs2_journal_dirty(handle, bh);
2014 ocfs2_remove_from_cache(inode, bh);
2015 continue;
2016 }
2017
2018 el->l_next_free_rec = 0;
2019 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2020
2021 ocfs2_journal_dirty(handle, bh);
2022
2023 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2024 if (ret)
2025 mlog_errno(ret);
2026
2027 ocfs2_remove_from_cache(inode, bh);
2028 }
2029}
2030
2031static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2032 struct ocfs2_path *left_path,
2033 struct ocfs2_path *right_path,
2034 int subtree_index,
2035 struct ocfs2_cached_dealloc_ctxt *dealloc)
2036{
2037 int i;
2038 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2039 struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2040 struct ocfs2_extent_list *el;
2041 struct ocfs2_extent_block *eb;
2042
2043 el = path_leaf_el(left_path);
2044
2045 eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2046
2047 for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2048 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2049 break;
2050
2051 BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2052
2053 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2054 le16_add_cpu(&root_el->l_next_free_rec, -1);
2055
2056 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2057 eb->h_next_leaf_blk = 0;
2058
2059 ocfs2_journal_dirty(handle, root_bh);
2060 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2061
2062 ocfs2_unlink_path(inode, handle, dealloc, right_path,
2063 subtree_index + 1);
2064}
2065
2066static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2067 struct ocfs2_path *left_path,
2068 struct ocfs2_path *right_path,
2069 int subtree_index,
2070 struct ocfs2_cached_dealloc_ctxt *dealloc,
2071 int *deleted)
2072{
2073 int ret, i, del_right_subtree = 0, right_has_empty = 0;
2074 struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
2075 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2076 struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2077 struct ocfs2_extent_block *eb;
2078
2079 *deleted = 0;
2080
2081 right_leaf_el = path_leaf_el(right_path);
2082 left_leaf_el = path_leaf_el(left_path);
2083 root_bh = left_path->p_node[subtree_index].bh;
2084 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2085
2086 if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2087 return 0;
2088
2089 eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2090 if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2091 /*
2092 * It's legal for us to proceed if the right leaf is
2093 * the rightmost one and it has an empty extent. There
2094 * are two cases to handle - whether the leaf will be
2095 * empty after removal or not. If the leaf isn't empty
2096 * then just remove the empty extent up front. The
2097 * next block will handle empty leaves by flagging
2098 * them for unlink.
2099 *
2100 * Non rightmost leaves will throw -EAGAIN and the
2101 * caller can manually move the subtree and retry.
2102 */
2103
2104 if (eb->h_next_leaf_blk != 0ULL)
2105 return -EAGAIN;
2106
2107 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2108 ret = ocfs2_journal_access(handle, inode,
2109 path_leaf_bh(right_path),
2110 OCFS2_JOURNAL_ACCESS_WRITE);
2111 if (ret) {
2112 mlog_errno(ret);
2113 goto out;
2114 }
2115
2116 ocfs2_remove_empty_extent(right_leaf_el);
2117 } else
2118 right_has_empty = 1;
2119 }
2120
2121 if (eb->h_next_leaf_blk == 0ULL &&
2122 le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2123 /*
2124 * We have to update i_last_eb_blk during the meta
2125 * data delete.
2126 */
2127 ret = ocfs2_journal_access(handle, inode, di_bh,
2128 OCFS2_JOURNAL_ACCESS_WRITE);
2129 if (ret) {
2130 mlog_errno(ret);
2131 goto out;
2132 }
2133
2134 del_right_subtree = 1;
2135 }
2136
2137 /*
2138 * Getting here with an empty extent in the right path implies
2139 * that it's the rightmost path and will be deleted.
2140 */
2141 BUG_ON(right_has_empty && !del_right_subtree);
2142
2143 ret = ocfs2_journal_access(handle, inode, root_bh,
2144 OCFS2_JOURNAL_ACCESS_WRITE);
2145 if (ret) {
2146 mlog_errno(ret);
2147 goto out;
2148 }
2149
2150 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2151 ret = ocfs2_journal_access(handle, inode,
2152 right_path->p_node[i].bh,
2153 OCFS2_JOURNAL_ACCESS_WRITE);
2154 if (ret) {
2155 mlog_errno(ret);
2156 goto out;
2157 }
2158
2159 ret = ocfs2_journal_access(handle, inode,
2160 left_path->p_node[i].bh,
2161 OCFS2_JOURNAL_ACCESS_WRITE);
2162 if (ret) {
2163 mlog_errno(ret);
2164 goto out;
2165 }
2166 }
2167
2168 if (!right_has_empty) {
2169 /*
2170 * Only do this if we're moving a real
2171 * record. Otherwise, the action is delayed until
2172 * after removal of the right path in which case we
2173 * can do a simple shift to remove the empty extent.
2174 */
2175 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2176 memset(&right_leaf_el->l_recs[0], 0,
2177 sizeof(struct ocfs2_extent_rec));
2178 }
2179 if (eb->h_next_leaf_blk == 0ULL) {
2180 /*
2181 * Move recs over to get rid of empty extent, decrease
2182 * next_free. This is allowed to remove the last
2183 * extent in our leaf (setting l_next_free_rec to
2184 * zero) - the delete code below won't care.
2185 */
2186 ocfs2_remove_empty_extent(right_leaf_el);
2187 }
2188
2189 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2190 if (ret)
2191 mlog_errno(ret);
2192 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2193 if (ret)
2194 mlog_errno(ret);
2195
2196 if (del_right_subtree) {
2197 ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2198 subtree_index, dealloc);
2199 ocfs2_update_edge_lengths(inode, handle, left_path);
2200
2201 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2202 di->i_last_eb_blk = eb->h_blkno;
2203
2204 /*
2205 * Removal of the extent in the left leaf was skipped
2206 * above so we could delete the right path
2207 * 1st.
2208 */
2209 if (right_has_empty)
2210 ocfs2_remove_empty_extent(left_leaf_el);
2211
2212 ret = ocfs2_journal_dirty(handle, di_bh);
2213 if (ret)
2214 mlog_errno(ret);
2215
2216 *deleted = 1;
2217 } else
2218 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2219 subtree_index);
2220
2221out:
2222 return ret;
2223}
2224
2225/*
2226 * Given a full path, determine what cpos value would return us a path
2227 * containing the leaf immediately to the right of the current one.
2228 *
2229 * Will return zero if the path passed in is already the rightmost path.
2230 *
2231 * This looks similar, but is subtly different to
2232 * ocfs2_find_cpos_for_left_leaf().
2233 */
2234static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2235 struct ocfs2_path *path, u32 *cpos)
2236{
2237 int i, j, ret = 0;
2238 u64 blkno;
2239 struct ocfs2_extent_list *el;
2240
2241 *cpos = 0;
2242
2243 if (path->p_tree_depth == 0)
2244 return 0;
2245
2246 blkno = path_leaf_bh(path)->b_blocknr;
2247
2248 /* Start at the tree node just above the leaf and work our way up. */
2249 i = path->p_tree_depth - 1;
2250 while (i >= 0) {
2251 int next_free;
2252
2253 el = path->p_node[i].el;
2254
2255 /*
2256 * Find the extent record just after the one in our
2257 * path.
2258 */
2259 next_free = le16_to_cpu(el->l_next_free_rec);
2260 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2261 if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2262 if (j == (next_free - 1)) {
2263 if (i == 0) {
2264 /*
2265 * We've determined that the
2266 * path specified is already
2267 * the rightmost one - return a
2268 * cpos of zero.
2269 */
2270 goto out;
2271 }
2272 /*
2273 * The rightmost record points to our
2274 * leaf - we need to travel up the
2275 * tree one level.
2276 */
2277 goto next_node;
2278 }
2279
2280 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2281 goto out;
2282 }
2283 }
2284
2285 /*
2286 * If we got here, we never found a valid node where
2287 * the tree indicated one should be.
2288 */
2289 ocfs2_error(sb,
2290 "Invalid extent tree at extent block %llu\n",
2291 (unsigned long long)blkno);
2292 ret = -EROFS;
2293 goto out;
2294
2295next_node:
2296 blkno = path->p_node[i].bh->b_blocknr;
2297 i--;
2298 }
2299
2300out:
2301 return ret;
2302}
2303
2304static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2305 handle_t *handle,
2306 struct buffer_head *bh,
2307 struct ocfs2_extent_list *el)
2308{
2309 int ret;
2310
2311 if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2312 return 0;
2313
2314 ret = ocfs2_journal_access(handle, inode, bh,
2315 OCFS2_JOURNAL_ACCESS_WRITE);
2316 if (ret) {
2317 mlog_errno(ret);
2318 goto out;
2319 }
2320
2321 ocfs2_remove_empty_extent(el);
2322
2323 ret = ocfs2_journal_dirty(handle, bh);
2324 if (ret)
2325 mlog_errno(ret);
2326
2327out:
2328 return ret;
2329}
2330
2331static int __ocfs2_rotate_tree_left(struct inode *inode,
2332 handle_t *handle, int orig_credits,
2333 struct ocfs2_path *path,
2334 struct ocfs2_cached_dealloc_ctxt *dealloc,
2335 struct ocfs2_path **empty_extent_path)
2336{
2337 int ret, subtree_root, deleted;
2338 u32 right_cpos;
2339 struct ocfs2_path *left_path = NULL;
2340 struct ocfs2_path *right_path = NULL;
2341
2342 BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2343
2344 *empty_extent_path = NULL;
2345
2346 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2347 &right_cpos);
2348 if (ret) {
2349 mlog_errno(ret);
2350 goto out;
2351 }
2352
2353 left_path = ocfs2_new_path(path_root_bh(path),
2354 path_root_el(path));
2355 if (!left_path) {
2356 ret = -ENOMEM;
2357 mlog_errno(ret);
2358 goto out;
2359 }
2360
2361 ocfs2_cp_path(left_path, path);
2362
2363 right_path = ocfs2_new_path(path_root_bh(path),
2364 path_root_el(path));
2365 if (!right_path) {
2366 ret = -ENOMEM;
2367 mlog_errno(ret);
2368 goto out;
2369 }
2370
2371 while (right_cpos) {
2372 ret = ocfs2_find_path(inode, right_path, right_cpos);
2373 if (ret) {
2374 mlog_errno(ret);
2375 goto out;
2376 }
2377
2378 subtree_root = ocfs2_find_subtree_root(inode, left_path,
2379 right_path);
2380
2381 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2382 subtree_root,
2383 (unsigned long long)
2384 right_path->p_node[subtree_root].bh->b_blocknr,
2385 right_path->p_tree_depth);
2386
2387 ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2388 orig_credits, left_path);
2389 if (ret) {
2390 mlog_errno(ret);
2391 goto out;
2392 }
2393
2394 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2395 right_path, subtree_root,
2396 dealloc, &deleted);
2397 if (ret == -EAGAIN) {
2398 /*
2399 * The rotation has to temporarily stop due to
2400 * the right subtree having an empty
2401 * extent. Pass it back to the caller for a
2402 * fixup.
2403 */
2404 *empty_extent_path = right_path;
2405 right_path = NULL;
2406 goto out;
2407 }
2408 if (ret) {
2409 mlog_errno(ret);
2410 goto out;
2411 }
2412
2413 /*
2414 * The subtree rotate might have removed records on
2415 * the rightmost edge. If so, then rotation is
2416 * complete.
2417 */
2418 if (deleted)
2419 break;
2420
2421 ocfs2_mv_path(left_path, right_path);
2422
2423 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2424 &right_cpos);
2425 if (ret) {
2426 mlog_errno(ret);
2427 goto out;
2428 }
2429 }
2430
2431out:
2432 ocfs2_free_path(right_path);
2433 ocfs2_free_path(left_path);
2434
2435 return ret;
2436}
2437
2438static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2439 struct ocfs2_path *path,
2440 struct ocfs2_cached_dealloc_ctxt *dealloc)
2441{
2442 int ret, subtree_index;
2443 u32 cpos;
2444 struct ocfs2_path *left_path = NULL;
2445 struct ocfs2_dinode *di;
2446 struct ocfs2_extent_block *eb;
2447 struct ocfs2_extent_list *el;
2448
2449 /*
2450 * XXX: This code assumes that the root is an inode, which is
2451 * true for now but may change as tree code gets generic.
2452 */
2453 di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
2454 if (!OCFS2_IS_VALID_DINODE(di)) {
2455 ret = -EIO;
2456 ocfs2_error(inode->i_sb,
2457 "Inode %llu has invalid path root",
2458 (unsigned long long)OCFS2_I(inode)->ip_blkno);
2459 goto out;
2460 }
2461
2462 /*
2463 * There's two ways we handle this depending on
2464 * whether path is the only existing one.
2465 */
2466 ret = ocfs2_extend_rotate_transaction(handle, 0,
2467 handle->h_buffer_credits,
2468 path);
2469 if (ret) {
2470 mlog_errno(ret);
2471 goto out;
2472 }
2473
2474 ret = ocfs2_journal_access_path(inode, handle, path);
2475 if (ret) {
2476 mlog_errno(ret);
2477 goto out;
2478 }
2479
2480 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2481 if (ret) {
2482 mlog_errno(ret);
2483 goto out;
2484 }
2485
2486 if (cpos) {
2487 /*
2488 * We have a path to the left of this one - it needs
2489 * an update too.
2490 */
2491 left_path = ocfs2_new_path(path_root_bh(path),
2492 path_root_el(path));
2493 if (!left_path) {
2494 ret = -ENOMEM;
2495 mlog_errno(ret);
2496 goto out;
2497 }
2498
2499 ret = ocfs2_find_path(inode, left_path, cpos);
2500 if (ret) {
2501 mlog_errno(ret);
2502 goto out;
2503 }
2504
2505 ret = ocfs2_journal_access_path(inode, handle, left_path);
2506 if (ret) {
2507 mlog_errno(ret);
2508 goto out;
2509 }
2510
2511 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2512
2513 ocfs2_unlink_subtree(inode, handle, left_path, path,
2514 subtree_index, dealloc);
2515 ocfs2_update_edge_lengths(inode, handle, left_path);
2516
2517 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2518 di->i_last_eb_blk = eb->h_blkno;
2519 } else {
2520 /*
2521 * 'path' is also the leftmost path which
2522 * means it must be the only one. This gets
2523 * handled differently because we want to
2524 * revert the inode back to having extents
2525 * in-line.
2526 */
2527 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2528
2529 el = &di->id2.i_list;
2530 el->l_tree_depth = 0;
2531 el->l_next_free_rec = 0;
2532 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2533
2534 di->i_last_eb_blk = 0;
2535 }
2536
2537 ocfs2_journal_dirty(handle, path_root_bh(path));
2538
2539out:
2540 ocfs2_free_path(left_path);
2541 return ret;
2542}
2543
2544/*
2545 * Left rotation of btree records.
2546 *
2547 * In many ways, this is (unsurprisingly) the opposite of right
2548 * rotation. We start at some non-rightmost path containing an empty
2549 * extent in the leaf block. The code works its way to the rightmost
2550 * path by rotating records to the left in every subtree.
2551 *
2552 * This is used by any code which reduces the number of extent records
2553 * in a leaf. After removal, an empty record should be placed in the
2554 * leftmost list position.
2555 *
2556 * This won't handle a length update of the rightmost path records if
2557 * the rightmost tree leaf record is removed so the caller is
2558 * responsible for detecting and correcting that.
2559 */
2560static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2561 struct ocfs2_path *path,
2562 struct ocfs2_cached_dealloc_ctxt *dealloc)
2563{
2564 int ret, orig_credits = handle->h_buffer_credits;
2565 struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2566 struct ocfs2_extent_block *eb;
2567 struct ocfs2_extent_list *el;
2568
2569 el = path_leaf_el(path);
2570 if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2571 return 0;
2572
2573 if (path->p_tree_depth == 0) {
2574rightmost_no_delete:
2575 /*
2576 * In-inode extents. This is trivially handled, so do
2577 * it up front.
2578 */
2579 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2580 path_leaf_bh(path),
2581 path_leaf_el(path));
2582 if (ret)
2583 mlog_errno(ret);
2584 goto out;
2585 }
2586
2587 /*
2588 * Handle rightmost branch now. There's several cases:
2589 * 1) simple rotation leaving records in there. That's trivial.
2590 * 2) rotation requiring a branch delete - there's no more
2591 * records left. Two cases of this:
2592 * a) There are branches to the left.
2593 * b) This is also the leftmost (the only) branch.
2594 *
2595 * 1) is handled via ocfs2_rotate_rightmost_leaf_left()
2596 * 2a) we need the left branch so that we can update it with the unlink
2597 * 2b) we need to bring the inode back to inline extents.
2598 */
2599
2600 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2601 el = &eb->h_list;
2602 if (eb->h_next_leaf_blk == 0) {
2603 /*
2604 * This gets a bit tricky if we're going to delete the
2605 * rightmost path. Get the other cases out of the way
2606 * 1st.
2607 */
2608 if (le16_to_cpu(el->l_next_free_rec) > 1)
2609 goto rightmost_no_delete;
2610
2611 if (le16_to_cpu(el->l_next_free_rec) == 0) {
2612 ret = -EIO;
2613 ocfs2_error(inode->i_sb,
2614 "Inode %llu has empty extent block at %llu",
2615 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2616 (unsigned long long)le64_to_cpu(eb->h_blkno));
2617 goto out;
2618 }
2619
2620 /*
2621 * XXX: The caller can not trust "path" any more after
2622 * this as it will have been deleted. What do we do?
2623 *
2624 * In theory the rotate-for-merge code will never get
2625 * here because it'll always ask for a rotate in a
2626 * nonempty list.
2627 */
2628
2629 ret = ocfs2_remove_rightmost_path(inode, handle, path,
2630 dealloc);
2631 if (ret)
2632 mlog_errno(ret);
2633 goto out;
2634 }
2635
2636 /*
2637 * Now we can loop, remembering the path we get from -EAGAIN
2638 * and restarting from there.
2639 */
2640try_rotate:
2641 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2642 dealloc, &restart_path);
2643 if (ret && ret != -EAGAIN) {
2644 mlog_errno(ret);
2645 goto out;
2646 }
2647
2648 while (ret == -EAGAIN) {
2649 tmp_path = restart_path;
2650 restart_path = NULL;
2651
2652 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2653 tmp_path, dealloc,
2654 &restart_path);
2655 if (ret && ret != -EAGAIN) {
2656 mlog_errno(ret);
2657 goto out;
2658 }
2659
2660 ocfs2_free_path(tmp_path);
2661 tmp_path = NULL;
2662
2663 if (ret == 0)
2664 goto try_rotate;
2665 }
2666
2667out:
2668 ocfs2_free_path(tmp_path);
2669 ocfs2_free_path(restart_path);
2670 return ret;
2671}
2672
2673static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2674 int index)
2675{
2676 struct ocfs2_extent_rec *rec = &el->l_recs[index];
2677 unsigned int size;
2678
2679 if (rec->e_leaf_clusters == 0) {
2680 /*
2681 * We consumed all of the merged-from record. An empty
2682 * extent cannot exist anywhere but the 1st array
2683 * position, so move things over if the merged-from
2684 * record doesn't occupy that position.
2685 *
2686 * This creates a new empty extent so the caller
2687 * should be smart enough to have removed any existing
2688 * ones.
2689 */
2690 if (index > 0) {
2691 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
2692 size = index * sizeof(struct ocfs2_extent_rec);
2693 memmove(&el->l_recs[1], &el->l_recs[0], size);
2694 }
2695
2696 /*
2697 * Always memset - the caller doesn't check whether it
2698 * created an empty extent, so there could be junk in
2699 * the other fields.
2700 */
2701 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2702 }
2703}
2704
2705/*
2706 * Remove split_rec clusters from the record at index and merge them
2707 * onto the beginning of the record at index + 1.
2708 */
2709static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
2710 handle_t *handle,
2711 struct ocfs2_extent_rec *split_rec,
2712 struct ocfs2_extent_list *el, int index)
2713{
2714 int ret;
2715 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2716 struct ocfs2_extent_rec *left_rec;
2717 struct ocfs2_extent_rec *right_rec;
2718
2719 BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
2720
2721 left_rec = &el->l_recs[index];
2722 right_rec = &el->l_recs[index + 1];
2723
2724 ret = ocfs2_journal_access(handle, inode, bh,
2725 OCFS2_JOURNAL_ACCESS_WRITE);
2726 if (ret) {
2727 mlog_errno(ret);
2728 goto out;
2729 }
2730
2731 le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
2732
2733 le32_add_cpu(&right_rec->e_cpos, -split_clusters);
2734 le64_add_cpu(&right_rec->e_blkno,
2735 -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
2736 le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
2737
2738 ocfs2_cleanup_merge(el, index);
2739
2740 ret = ocfs2_journal_dirty(handle, bh);
2741 if (ret)
2742 mlog_errno(ret);
2743
2744out:
2745 return ret;
2746}
2747
2748/*
2749 * Remove split_rec clusters from the record at index and merge them
2750 * onto the tail of the record at index - 1.
2751 */
2752static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
2753 handle_t *handle,
2754 struct ocfs2_extent_rec *split_rec,
2755 struct ocfs2_extent_list *el, int index)
2756{
2757 int ret, has_empty_extent = 0;
2758 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2759 struct ocfs2_extent_rec *left_rec;
2760 struct ocfs2_extent_rec *right_rec;
2761
2762 BUG_ON(index <= 0);
2763
2764 left_rec = &el->l_recs[index - 1];
2765 right_rec = &el->l_recs[index];
2766 if (ocfs2_is_empty_extent(&el->l_recs[0]))
2767 has_empty_extent = 1;
2768
2769 ret = ocfs2_journal_access(handle, inode, bh,
2770 OCFS2_JOURNAL_ACCESS_WRITE);
2771 if (ret) {
2772 mlog_errno(ret);
2773 goto out;
2774 }
2775
2776 if (has_empty_extent && index == 1) {
2777 /*
2778 * The easy case - we can just plop the record right in.
2779 */
2780 *left_rec = *split_rec;
2781
2782 has_empty_extent = 0;
2783 } else {
2784 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
2785 }
2786
2787 le32_add_cpu(&right_rec->e_cpos, split_clusters);
2788 le64_add_cpu(&right_rec->e_blkno,
2789 ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
2790 le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
2791
2792 ocfs2_cleanup_merge(el, index);
2793
2794 ret = ocfs2_journal_dirty(handle, bh);
2795 if (ret)
2796 mlog_errno(ret);
2797
2798out:
2799 return ret;
2800}
2801
2802static int ocfs2_try_to_merge_extent(struct inode *inode,
2803 handle_t *handle,
2804 struct ocfs2_path *left_path,
2805 int split_index,
2806 struct ocfs2_extent_rec *split_rec,
2807 struct ocfs2_cached_dealloc_ctxt *dealloc,
2808 struct ocfs2_merge_ctxt *ctxt)
2809
2810{
2811 int ret = 0, delete_tail_recs = 0;
2812 struct ocfs2_extent_list *el = path_leaf_el(left_path);
2813 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
2814
2815 BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
2816
2817 if (ctxt->c_split_covers_rec) {
2818 delete_tail_recs++;
2819
2820 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT ||
2821 ctxt->c_has_empty_extent)
2822 delete_tail_recs++;
2823
2824 if (ctxt->c_has_empty_extent) {
2825 /*
2826 * The merge code will need to create an empty
2827 * extent to take the place of the newly
2828 * emptied slot. Remove any pre-existing empty
2829 * extents - having more than one in a leaf is
2830 * illegal.
2831 */
2832 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2833 dealloc);
2834 if (ret) {
2835 mlog_errno(ret);
2836 goto out;
2837 }
2838 split_index--;
2839 rec = &el->l_recs[split_index];
2840 }
2841 }
2842
2843 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
2844 /*
2845 * Left-right contig implies this.
2846 */
2847 BUG_ON(!ctxt->c_split_covers_rec);
2848 BUG_ON(split_index == 0);
2849
2850 /*
2851 * Since the leftright insert always covers the entire
2852 * extent, this call will delete the insert record
2853 * entirely, resulting in an empty extent record added to
2854 * the extent block.
2855 *
2856 * Since the adding of an empty extent shifts
2857 * everything back to the right, there's no need to
2858 * update split_index here.
2859 */
2860 ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path),
2861 handle, split_rec, el, split_index);
2862 if (ret) {
2863 mlog_errno(ret);
2864 goto out;
2865 }
2866
2867 /*
2868 * We can only get this from logic error above.
2869 */
2870 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
2871
2872 /*
2873 * The left merge left us with an empty extent, remove
2874 * it.
2875 */
2876 ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc);
2877 if (ret) {
2878 mlog_errno(ret);
2879 goto out;
2880 }
2881 split_index--;
2882 rec = &el->l_recs[split_index];
2883
2884 /*
2885 * Note that we don't pass split_rec here on purpose -
2886 * we've merged it into the left side.
2887 */
2888 ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path),
2889 handle, rec, el, split_index);
2890 if (ret) {
2891 mlog_errno(ret);
2892 goto out;
2893 }
2894
2895 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
2896
2897 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2898 dealloc);
2899 /*
2900 * Error from this last rotate is not critical, so
2901 * print but don't bubble it up.
2902 */
2903 if (ret)
2904 mlog_errno(ret);
2905 ret = 0;
2906 } else {
2907 /*
2908 * Merge a record to the left or right.
2909 *
2910 * 'contig_type' is relative to the existing record,
2911 * so for example, if we're "right contig", it's to
2912 * the record on the left (hence the left merge).
2913 */
2914 if (ctxt->c_contig_type == CONTIG_RIGHT) {
2915 ret = ocfs2_merge_rec_left(inode,
2916 path_leaf_bh(left_path),
2917 handle, split_rec, el,
2918 split_index);
2919 if (ret) {
2920 mlog_errno(ret);
2921 goto out;
2922 }
2923 } else {
2924 ret = ocfs2_merge_rec_right(inode,
2925 path_leaf_bh(left_path),
2926 handle, split_rec, el,
2927 split_index);
2928 if (ret) {
2929 mlog_errno(ret);
2930 goto out;
2931 }
2932 }
2933
2934 if (ctxt->c_split_covers_rec) {
2935 /*
2936 * The merge may have left an empty extent in
2937 * our leaf. Try to rotate it away.
2938 */
2939 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2940 dealloc);
2941 if (ret)
2942 mlog_errno(ret);
2943 ret = 0;
2944 }
2945 }
2946
2947out:
2948 return ret;
2949}
2950
2951static void ocfs2_subtract_from_rec(struct super_block *sb,
2952 enum ocfs2_split_type split,
2953 struct ocfs2_extent_rec *rec,
2954 struct ocfs2_extent_rec *split_rec)
2955{
2956 u64 len_blocks;
2957
2958 len_blocks = ocfs2_clusters_to_blocks(sb,
2959 le16_to_cpu(split_rec->e_leaf_clusters));
2960
2961 if (split == SPLIT_LEFT) {
2962 /*
2963 * Region is on the left edge of the existing
2964 * record.
2965 */
2966 le32_add_cpu(&rec->e_cpos,
2967 le16_to_cpu(split_rec->e_leaf_clusters));
2968 le64_add_cpu(&rec->e_blkno, len_blocks);
2969 le16_add_cpu(&rec->e_leaf_clusters,
2970 -le16_to_cpu(split_rec->e_leaf_clusters));
2971 } else {
2972 /*
2973 * Region is on the right edge of the existing
2974 * record.
2975 */
2976 le16_add_cpu(&rec->e_leaf_clusters,
2977 -le16_to_cpu(split_rec->e_leaf_clusters));
2978 }
2979}
2980
1725/* 2981/*
1726 * Do the final bits of extent record insertion at the target leaf 2982 * Do the final bits of extent record insertion at the target leaf
1727 * list. If this leaf is part of an allocation tree, it is assumed 2983 * list. If this leaf is part of an allocation tree, it is assumed
@@ -1738,6 +2994,15 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1738 2994
1739 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 2995 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1740 2996
2997 if (insert->ins_split != SPLIT_NONE) {
2998 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
2999 BUG_ON(i == -1);
3000 rec = &el->l_recs[i];
3001 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3002 insert_rec);
3003 goto rotate;
3004 }
3005
1741 /* 3006 /*
1742 * Contiguous insert - either left or right. 3007 * Contiguous insert - either left or right.
1743 */ 3008 */
@@ -1792,6 +3057,7 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1792 return; 3057 return;
1793 } 3058 }
1794 3059
3060rotate:
1795 /* 3061 /*
1796 * Ok, we have to rotate. 3062 * Ok, we have to rotate.
1797 * 3063 *
@@ -1815,13 +3081,53 @@ static inline void ocfs2_update_dinode_clusters(struct inode *inode,
1815 spin_unlock(&OCFS2_I(inode)->ip_lock); 3081 spin_unlock(&OCFS2_I(inode)->ip_lock);
1816} 3082}
1817 3083
3084static void ocfs2_adjust_rightmost_records(struct inode *inode,
3085 handle_t *handle,
3086 struct ocfs2_path *path,
3087 struct ocfs2_extent_rec *insert_rec)
3088{
3089 int ret, i, next_free;
3090 struct buffer_head *bh;
3091 struct ocfs2_extent_list *el;
3092 struct ocfs2_extent_rec *rec;
3093
3094 /*
3095 * Update everything except the leaf block.
3096 */
3097 for (i = 0; i < path->p_tree_depth; i++) {
3098 bh = path->p_node[i].bh;
3099 el = path->p_node[i].el;
3100
3101 next_free = le16_to_cpu(el->l_next_free_rec);
3102 if (next_free == 0) {
3103 ocfs2_error(inode->i_sb,
3104 "Dinode %llu has a bad extent list",
3105 (unsigned long long)OCFS2_I(inode)->ip_blkno);
3106 ret = -EIO;
3107 return;
3108 }
3109
3110 rec = &el->l_recs[next_free - 1];
3111
3112 rec->e_int_clusters = insert_rec->e_cpos;
3113 le32_add_cpu(&rec->e_int_clusters,
3114 le16_to_cpu(insert_rec->e_leaf_clusters));
3115 le32_add_cpu(&rec->e_int_clusters,
3116 -le32_to_cpu(rec->e_cpos));
3117
3118 ret = ocfs2_journal_dirty(handle, bh);
3119 if (ret)
3120 mlog_errno(ret);
3121
3122 }
3123}
3124
1818static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, 3125static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1819 struct ocfs2_extent_rec *insert_rec, 3126 struct ocfs2_extent_rec *insert_rec,
1820 struct ocfs2_path *right_path, 3127 struct ocfs2_path *right_path,
1821 struct ocfs2_path **ret_left_path) 3128 struct ocfs2_path **ret_left_path)
1822{ 3129{
1823 int ret, i, next_free; 3130 int ret, next_free;
1824 struct buffer_head *bh;
1825 struct ocfs2_extent_list *el; 3131 struct ocfs2_extent_list *el;
1826 struct ocfs2_path *left_path = NULL; 3132 struct ocfs2_path *left_path = NULL;
1827 3133
@@ -1887,40 +3193,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1887 goto out; 3193 goto out;
1888 } 3194 }
1889 3195
1890 el = path_root_el(right_path); 3196 ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
1891 bh = path_root_bh(right_path);
1892 i = 0;
1893 while (1) {
1894 struct ocfs2_extent_rec *rec;
1895
1896 next_free = le16_to_cpu(el->l_next_free_rec);
1897 if (next_free == 0) {
1898 ocfs2_error(inode->i_sb,
1899 "Dinode %llu has a bad extent list",
1900 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1901 ret = -EIO;
1902 goto out;
1903 }
1904
1905 rec = &el->l_recs[next_free - 1];
1906
1907 rec->e_int_clusters = insert_rec->e_cpos;
1908 le32_add_cpu(&rec->e_int_clusters,
1909 le16_to_cpu(insert_rec->e_leaf_clusters));
1910 le32_add_cpu(&rec->e_int_clusters,
1911 -le32_to_cpu(rec->e_cpos));
1912
1913 ret = ocfs2_journal_dirty(handle, bh);
1914 if (ret)
1915 mlog_errno(ret);
1916
1917 /* Don't touch the leaf node */
1918 if (++i >= right_path->p_tree_depth)
1919 break;
1920
1921 bh = right_path->p_node[i].bh;
1922 el = right_path->p_node[i].el;
1923 }
1924 3197
1925 *ret_left_path = left_path; 3198 *ret_left_path = left_path;
1926 ret = 0; 3199 ret = 0;
@@ -1931,6 +3204,83 @@ out:
1931 return ret; 3204 return ret;
1932} 3205}
1933 3206
3207static void ocfs2_split_record(struct inode *inode,
3208 struct ocfs2_path *left_path,
3209 struct ocfs2_path *right_path,
3210 struct ocfs2_extent_rec *split_rec,
3211 enum ocfs2_split_type split)
3212{
3213 int index;
3214 u32 cpos = le32_to_cpu(split_rec->e_cpos);
3215 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3216 struct ocfs2_extent_rec *rec, *tmprec;
3217
3218 right_el = path_leaf_el(right_path);;
3219 if (left_path)
3220 left_el = path_leaf_el(left_path);
3221
3222 el = right_el;
3223 insert_el = right_el;
3224 index = ocfs2_search_extent_list(el, cpos);
3225 if (index != -1) {
3226 if (index == 0 && left_path) {
3227 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3228
3229 /*
3230 * This typically means that the record
3231 * started in the left path but moved to the
3232 * right as a result of rotation. We either
3233 * move the existing record to the left, or we
3234 * do the later insert there.
3235 *
3236 * In this case, the left path should always
3237 * exist as the rotate code will have passed
3238 * it back for a post-insert update.
3239 */
3240
3241 if (split == SPLIT_LEFT) {
3242 /*
3243 * It's a left split. Since we know
3244 * that the rotate code gave us an
3245 * empty extent in the left path, we
3246 * can just do the insert there.
3247 */
3248 insert_el = left_el;
3249 } else {
3250 /*
3251 * Right split - we have to move the
3252 * existing record over to the left
3253 * leaf. The insert will be into the
3254 * newly created empty extent in the
3255 * right leaf.
3256 */
3257 tmprec = &right_el->l_recs[index];
3258 ocfs2_rotate_leaf(left_el, tmprec);
3259 el = left_el;
3260
3261 memset(tmprec, 0, sizeof(*tmprec));
3262 index = ocfs2_search_extent_list(left_el, cpos);
3263 BUG_ON(index == -1);
3264 }
3265 }
3266 } else {
3267 BUG_ON(!left_path);
3268 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3269 /*
3270 * Left path is easy - we can just allow the insert to
3271 * happen.
3272 */
3273 el = left_el;
3274 insert_el = left_el;
3275 index = ocfs2_search_extent_list(el, cpos);
3276 BUG_ON(index == -1);
3277 }
3278
3279 rec = &el->l_recs[index];
3280 ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3281 ocfs2_rotate_leaf(insert_el, split_rec);
3282}
3283
1934/* 3284/*
1935 * This function only does inserts on an allocation b-tree. For dinode 3285 * This function only does inserts on an allocation b-tree. For dinode
1936 * lists, ocfs2_insert_at_leaf() is called directly. 3286 * lists, ocfs2_insert_at_leaf() is called directly.
@@ -1948,7 +3298,6 @@ static int ocfs2_insert_path(struct inode *inode,
1948{ 3298{
1949 int ret, subtree_index; 3299 int ret, subtree_index;
1950 struct buffer_head *leaf_bh = path_leaf_bh(right_path); 3300 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
1951 struct ocfs2_extent_list *el;
1952 3301
1953 /* 3302 /*
1954 * Pass both paths to the journal. The majority of inserts 3303 * Pass both paths to the journal. The majority of inserts
@@ -1984,9 +3333,18 @@ static int ocfs2_insert_path(struct inode *inode,
1984 } 3333 }
1985 } 3334 }
1986 3335
1987 el = path_leaf_el(right_path); 3336 if (insert->ins_split != SPLIT_NONE) {
3337 /*
3338 * We could call ocfs2_insert_at_leaf() for some types
3339 * of splits, but it's easier to just let one seperate
3340 * function sort it all out.
3341 */
3342 ocfs2_split_record(inode, left_path, right_path,
3343 insert_rec, insert->ins_split);
3344 } else
3345 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3346 insert, inode);
1988 3347
1989 ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
1990 ret = ocfs2_journal_dirty(handle, leaf_bh); 3348 ret = ocfs2_journal_dirty(handle, leaf_bh);
1991 if (ret) 3349 if (ret)
1992 mlog_errno(ret); 3350 mlog_errno(ret);
@@ -2075,7 +3433,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
2075 * can wind up skipping both of these two special cases... 3433 * can wind up skipping both of these two special cases...
2076 */ 3434 */
2077 if (rotate) { 3435 if (rotate) {
2078 ret = ocfs2_rotate_tree_right(inode, handle, 3436 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
2079 le32_to_cpu(insert_rec->e_cpos), 3437 le32_to_cpu(insert_rec->e_cpos),
2080 right_path, &left_path); 3438 right_path, &left_path);
2081 if (ret) { 3439 if (ret) {
@@ -2100,8 +3458,9 @@ static int ocfs2_do_insert_extent(struct inode *inode,
2100 } 3458 }
2101 3459
2102out_update_clusters: 3460out_update_clusters:
2103 ocfs2_update_dinode_clusters(inode, di, 3461 if (type->ins_split == SPLIT_NONE)
2104 le16_to_cpu(insert_rec->e_leaf_clusters)); 3462 ocfs2_update_dinode_clusters(inode, di,
3463 le16_to_cpu(insert_rec->e_leaf_clusters));
2105 3464
2106 ret = ocfs2_journal_dirty(handle, di_bh); 3465 ret = ocfs2_journal_dirty(handle, di_bh);
2107 if (ret) 3466 if (ret)
@@ -2114,6 +3473,44 @@ out:
2114 return ret; 3473 return ret;
2115} 3474}
2116 3475
3476static enum ocfs2_contig_type
3477ocfs2_figure_merge_contig_type(struct inode *inode,
3478 struct ocfs2_extent_list *el, int index,
3479 struct ocfs2_extent_rec *split_rec)
3480{
3481 struct ocfs2_extent_rec *rec;
3482 enum ocfs2_contig_type ret = CONTIG_NONE;
3483
3484 /*
3485 * We're careful to check for an empty extent record here -
3486 * the merge code will know what to do if it sees one.
3487 */
3488
3489 if (index > 0) {
3490 rec = &el->l_recs[index - 1];
3491 if (index == 1 && ocfs2_is_empty_extent(rec)) {
3492 if (split_rec->e_cpos == el->l_recs[index].e_cpos)
3493 ret = CONTIG_RIGHT;
3494 } else {
3495 ret = ocfs2_extent_contig(inode, rec, split_rec);
3496 }
3497 }
3498
3499 if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) {
3500 enum ocfs2_contig_type contig_type;
3501
3502 rec = &el->l_recs[index + 1];
3503 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
3504
3505 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
3506 ret = CONTIG_LEFTRIGHT;
3507 else if (ret == CONTIG_NONE)
3508 ret = contig_type;
3509 }
3510
3511 return ret;
3512}
3513
2117static void ocfs2_figure_contig_type(struct inode *inode, 3514static void ocfs2_figure_contig_type(struct inode *inode,
2118 struct ocfs2_insert_type *insert, 3515 struct ocfs2_insert_type *insert,
2119 struct ocfs2_extent_list *el, 3516 struct ocfs2_extent_list *el,
@@ -2205,6 +3602,8 @@ static int ocfs2_figure_insert_type(struct inode *inode,
2205 struct ocfs2_path *path = NULL; 3602 struct ocfs2_path *path = NULL;
2206 struct buffer_head *bh = NULL; 3603 struct buffer_head *bh = NULL;
2207 3604
3605 insert->ins_split = SPLIT_NONE;
3606
2208 el = &di->id2.i_list; 3607 el = &di->id2.i_list;
2209 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); 3608 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
2210 3609
@@ -2327,9 +3726,10 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2327 u32 cpos, 3726 u32 cpos,
2328 u64 start_blk, 3727 u64 start_blk,
2329 u32 new_clusters, 3728 u32 new_clusters,
3729 u8 flags,
2330 struct ocfs2_alloc_context *meta_ac) 3730 struct ocfs2_alloc_context *meta_ac)
2331{ 3731{
2332 int status, shift; 3732 int status;
2333 struct buffer_head *last_eb_bh = NULL; 3733 struct buffer_head *last_eb_bh = NULL;
2334 struct buffer_head *bh = NULL; 3734 struct buffer_head *bh = NULL;
2335 struct ocfs2_insert_type insert = {0, }; 3735 struct ocfs2_insert_type insert = {0, };
@@ -2350,6 +3750,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2350 rec.e_cpos = cpu_to_le32(cpos); 3750 rec.e_cpos = cpu_to_le32(cpos);
2351 rec.e_blkno = cpu_to_le64(start_blk); 3751 rec.e_blkno = cpu_to_le64(start_blk);
2352 rec.e_leaf_clusters = cpu_to_le16(new_clusters); 3752 rec.e_leaf_clusters = cpu_to_le16(new_clusters);
3753 rec.e_flags = flags;
2353 3754
2354 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, 3755 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
2355 &insert); 3756 &insert);
@@ -2364,55 +3765,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2364 insert.ins_appending, insert.ins_contig, insert.ins_contig_index, 3765 insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
2365 insert.ins_free_records, insert.ins_tree_depth); 3766 insert.ins_free_records, insert.ins_tree_depth);
2366 3767
2367 /* 3768 if (insert.ins_contig == CONTIG_NONE && insert.ins_free_records == 0) {
2368 * Avoid growing the tree unless we're out of records and the 3769 status = ocfs2_grow_tree(inode, handle, fe_bh,
2369 * insert type requres one. 3770 &insert.ins_tree_depth, &last_eb_bh,
2370 */ 3771 meta_ac);
2371 if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records) 3772 if (status) {
2372 goto out_add;
2373
2374 shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
2375 if (shift < 0) {
2376 status = shift;
2377 mlog_errno(status);
2378 goto bail;
2379 }
2380
2381 /* We traveled all the way to the bottom of the allocation tree
2382 * and didn't find room for any more extents - we need to add
2383 * another tree level */
2384 if (shift) {
2385 BUG_ON(bh);
2386 mlog(0, "need to shift tree depth "
2387 "(current = %d)\n", insert.ins_tree_depth);
2388
2389 /* ocfs2_shift_tree_depth will return us a buffer with
2390 * the new extent block (so we can pass that to
2391 * ocfs2_add_branch). */
2392 status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
2393 meta_ac, &bh);
2394 if (status < 0) {
2395 mlog_errno(status); 3773 mlog_errno(status);
2396 goto bail; 3774 goto bail;
2397 } 3775 }
2398 insert.ins_tree_depth++;
2399 /* Special case: we have room now if we shifted from
2400 * tree_depth 0 */
2401 if (insert.ins_tree_depth == 1)
2402 goto out_add;
2403 }
2404
2405 /* call ocfs2_add_branch to add the final part of the tree with
2406 * the new data. */
2407 mlog(0, "add branch. bh = %p\n", bh);
2408 status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
2409 meta_ac);
2410 if (status < 0) {
2411 mlog_errno(status);
2412 goto bail;
2413 } 3776 }
2414 3777
2415out_add:
2416 /* Finally, we can add clusters. This might rotate the tree for us. */ 3778 /* Finally, we can add clusters. This might rotate the tree for us. */
2417 status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); 3779 status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
2418 if (status < 0) 3780 if (status < 0)
@@ -2431,7 +3793,720 @@ bail:
2431 return status; 3793 return status;
2432} 3794}
2433 3795
2434static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) 3796static void ocfs2_make_right_split_rec(struct super_block *sb,
3797 struct ocfs2_extent_rec *split_rec,
3798 u32 cpos,
3799 struct ocfs2_extent_rec *rec)
3800{
3801 u32 rec_cpos = le32_to_cpu(rec->e_cpos);
3802 u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
3803
3804 memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
3805
3806 split_rec->e_cpos = cpu_to_le32(cpos);
3807 split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
3808
3809 split_rec->e_blkno = rec->e_blkno;
3810 le64_add_cpu(&split_rec->e_blkno,
3811 ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
3812
3813 split_rec->e_flags = rec->e_flags;
3814}
3815
3816static int ocfs2_split_and_insert(struct inode *inode,
3817 handle_t *handle,
3818 struct ocfs2_path *path,
3819 struct buffer_head *di_bh,
3820 struct buffer_head **last_eb_bh,
3821 int split_index,
3822 struct ocfs2_extent_rec *orig_split_rec,
3823 struct ocfs2_alloc_context *meta_ac)
3824{
3825 int ret = 0, depth;
3826 unsigned int insert_range, rec_range, do_leftright = 0;
3827 struct ocfs2_extent_rec tmprec;
3828 struct ocfs2_extent_list *rightmost_el;
3829 struct ocfs2_extent_rec rec;
3830 struct ocfs2_extent_rec split_rec = *orig_split_rec;
3831 struct ocfs2_insert_type insert;
3832 struct ocfs2_extent_block *eb;
3833 struct ocfs2_dinode *di;
3834
3835leftright:
3836 /*
3837 * Store a copy of the record on the stack - it might move
3838 * around as the tree is manipulated below.
3839 */
3840 rec = path_leaf_el(path)->l_recs[split_index];
3841
3842 di = (struct ocfs2_dinode *)di_bh->b_data;
3843 rightmost_el = &di->id2.i_list;
3844
3845 depth = le16_to_cpu(rightmost_el->l_tree_depth);
3846 if (depth) {
3847 BUG_ON(!(*last_eb_bh));
3848 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
3849 rightmost_el = &eb->h_list;
3850 }
3851
3852 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
3853 le16_to_cpu(rightmost_el->l_count)) {
3854 int old_depth = depth;
3855
3856 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
3857 meta_ac);
3858 if (ret) {
3859 mlog_errno(ret);
3860 goto out;
3861 }
3862
3863 if (old_depth != depth) {
3864 eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
3865 rightmost_el = &eb->h_list;
3866 }
3867 }
3868
3869 memset(&insert, 0, sizeof(struct ocfs2_insert_type));
3870 insert.ins_appending = APPEND_NONE;
3871 insert.ins_contig = CONTIG_NONE;
3872 insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
3873 - le16_to_cpu(rightmost_el->l_next_free_rec);
3874 insert.ins_tree_depth = depth;
3875
3876 insert_range = le32_to_cpu(split_rec.e_cpos) +
3877 le16_to_cpu(split_rec.e_leaf_clusters);
3878 rec_range = le32_to_cpu(rec.e_cpos) +
3879 le16_to_cpu(rec.e_leaf_clusters);
3880
3881 if (split_rec.e_cpos == rec.e_cpos) {
3882 insert.ins_split = SPLIT_LEFT;
3883 } else if (insert_range == rec_range) {
3884 insert.ins_split = SPLIT_RIGHT;
3885 } else {
3886 /*
3887 * Left/right split. We fake this as a right split
3888 * first and then make a second pass as a left split.
3889 */
3890 insert.ins_split = SPLIT_RIGHT;
3891
3892 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
3893 &rec);
3894
3895 split_rec = tmprec;
3896
3897 BUG_ON(do_leftright);
3898 do_leftright = 1;
3899 }
3900
3901 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec,
3902 &insert);
3903 if (ret) {
3904 mlog_errno(ret);
3905 goto out;
3906 }
3907
3908 if (do_leftright == 1) {
3909 u32 cpos;
3910 struct ocfs2_extent_list *el;
3911
3912 do_leftright++;
3913 split_rec = *orig_split_rec;
3914
3915 ocfs2_reinit_path(path, 1);
3916
3917 cpos = le32_to_cpu(split_rec.e_cpos);
3918 ret = ocfs2_find_path(inode, path, cpos);
3919 if (ret) {
3920 mlog_errno(ret);
3921 goto out;
3922 }
3923
3924 el = path_leaf_el(path);
3925 split_index = ocfs2_search_extent_list(el, cpos);
3926 goto leftright;
3927 }
3928out:
3929
3930 return ret;
3931}
3932
3933/*
3934 * Mark part or all of the extent record at split_index in the leaf
3935 * pointed to by path as written. This removes the unwritten
3936 * extent flag.
3937 *
3938 * Care is taken to handle contiguousness so as to not grow the tree.
3939 *
3940 * meta_ac is not strictly necessary - we only truly need it if growth
3941 * of the tree is required. All other cases will degrade into a less
3942 * optimal tree layout.
3943 *
3944 * last_eb_bh should be the rightmost leaf block for any inode with a
3945 * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call.
3946 *
3947 * This code is optimized for readability - several passes might be
3948 * made over certain portions of the tree. All of those blocks will
3949 * have been brought into cache (and pinned via the journal), so the
3950 * extra overhead is not expressed in terms of disk reads.
3951 */
3952static int __ocfs2_mark_extent_written(struct inode *inode,
3953 struct buffer_head *di_bh,
3954 handle_t *handle,
3955 struct ocfs2_path *path,
3956 int split_index,
3957 struct ocfs2_extent_rec *split_rec,
3958 struct ocfs2_alloc_context *meta_ac,
3959 struct ocfs2_cached_dealloc_ctxt *dealloc)
3960{
3961 int ret = 0;
3962 struct ocfs2_extent_list *el = path_leaf_el(path);
3963 struct buffer_head *eb_bh, *last_eb_bh = NULL;
3964 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3965 struct ocfs2_merge_ctxt ctxt;
3966 struct ocfs2_extent_list *rightmost_el;
3967
3968 if (!rec->e_flags & OCFS2_EXT_UNWRITTEN) {
3969 ret = -EIO;
3970 mlog_errno(ret);
3971 goto out;
3972 }
3973
3974 if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
3975 ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
3976 (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
3977 ret = -EIO;
3978 mlog_errno(ret);
3979 goto out;
3980 }
3981
3982 eb_bh = path_leaf_bh(path);
3983 ret = ocfs2_journal_access(handle, inode, eb_bh,
3984 OCFS2_JOURNAL_ACCESS_WRITE);
3985 if (ret) {
3986 mlog_errno(ret);
3987 goto out;
3988 }
3989
3990 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el,
3991 split_index,
3992 split_rec);
3993
3994 /*
3995 * The core merge / split code wants to know how much room is
3996 * left in this inodes allocation tree, so we pass the
3997 * rightmost extent list.
3998 */
3999 if (path->p_tree_depth) {
4000 struct ocfs2_extent_block *eb;
4001 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4002
4003 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4004 le64_to_cpu(di->i_last_eb_blk),
4005 &last_eb_bh, OCFS2_BH_CACHED, inode);
4006 if (ret) {
4007 mlog_exit(ret);
4008 goto out;
4009 }
4010
4011 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4012 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4013 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4014 ret = -EROFS;
4015 goto out;
4016 }
4017
4018 rightmost_el = &eb->h_list;
4019 } else
4020 rightmost_el = path_root_el(path);
4021
4022 ctxt.c_used_tail_recs = le16_to_cpu(rightmost_el->l_next_free_rec);
4023 if (ctxt.c_used_tail_recs > 0 &&
4024 ocfs2_is_empty_extent(&rightmost_el->l_recs[0]))
4025 ctxt.c_used_tail_recs--;
4026
4027 if (rec->e_cpos == split_rec->e_cpos &&
4028 rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4029 ctxt.c_split_covers_rec = 1;
4030 else
4031 ctxt.c_split_covers_rec = 0;
4032
4033 ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4034
4035 mlog(0, "index: %d, contig: %u, used_tail_recs: %u, "
4036 "has_empty: %u, split_covers: %u\n", split_index,
4037 ctxt.c_contig_type, ctxt.c_used_tail_recs,
4038 ctxt.c_has_empty_extent, ctxt.c_split_covers_rec);
4039
4040 if (ctxt.c_contig_type == CONTIG_NONE) {
4041 if (ctxt.c_split_covers_rec)
4042 el->l_recs[split_index] = *split_rec;
4043 else
4044 ret = ocfs2_split_and_insert(inode, handle, path, di_bh,
4045 &last_eb_bh, split_index,
4046 split_rec, meta_ac);
4047 if (ret)
4048 mlog_errno(ret);
4049 } else {
4050 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4051 split_index, split_rec,
4052 dealloc, &ctxt);
4053 if (ret)
4054 mlog_errno(ret);
4055 }
4056
4057 ocfs2_journal_dirty(handle, eb_bh);
4058
4059out:
4060 brelse(last_eb_bh);
4061 return ret;
4062}
4063
4064/*
4065 * Mark the already-existing extent at cpos as written for len clusters.
4066 *
4067 * If the existing extent is larger than the request, initiate a
4068 * split. An attempt will be made at merging with adjacent extents.
4069 *
4070 * The caller is responsible for passing down meta_ac if we'll need it.
4071 */
4072int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
4073 handle_t *handle, u32 cpos, u32 len, u32 phys,
4074 struct ocfs2_alloc_context *meta_ac,
4075 struct ocfs2_cached_dealloc_ctxt *dealloc)
4076{
4077 int ret, index;
4078 u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4079 struct ocfs2_extent_rec split_rec;
4080 struct ocfs2_path *left_path = NULL;
4081 struct ocfs2_extent_list *el;
4082
4083 mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4084 inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4085
4086 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4087 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4088 "that are being written to, but the feature bit "
4089 "is not set in the super block.",
4090 (unsigned long long)OCFS2_I(inode)->ip_blkno);
4091 ret = -EROFS;
4092 goto out;
4093 }
4094
4095 /*
4096 * XXX: This should be fixed up so that we just re-insert the
4097 * next extent records.
4098 */
4099 ocfs2_extent_map_trunc(inode, 0);
4100
4101 left_path = ocfs2_new_inode_path(di_bh);
4102 if (!left_path) {
4103 ret = -ENOMEM;
4104 mlog_errno(ret);
4105 goto out;
4106 }
4107
4108 ret = ocfs2_find_path(inode, left_path, cpos);
4109 if (ret) {
4110 mlog_errno(ret);
4111 goto out;
4112 }
4113 el = path_leaf_el(left_path);
4114
4115 index = ocfs2_search_extent_list(el, cpos);
4116 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4117 ocfs2_error(inode->i_sb,
4118 "Inode %llu has an extent at cpos %u which can no "
4119 "longer be found.\n",
4120 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4121 ret = -EROFS;
4122 goto out;
4123 }
4124
4125 memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4126 split_rec.e_cpos = cpu_to_le32(cpos);
4127 split_rec.e_leaf_clusters = cpu_to_le16(len);
4128 split_rec.e_blkno = cpu_to_le64(start_blkno);
4129 split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4130 split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4131
4132 ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path,
4133 index, &split_rec, meta_ac, dealloc);
4134 if (ret)
4135 mlog_errno(ret);
4136
4137out:
4138 ocfs2_free_path(left_path);
4139 return ret;
4140}
4141
4142static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4143 handle_t *handle, struct ocfs2_path *path,
4144 int index, u32 new_range,
4145 struct ocfs2_alloc_context *meta_ac)
4146{
4147 int ret, depth, credits = handle->h_buffer_credits;
4148 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4149 struct buffer_head *last_eb_bh = NULL;
4150 struct ocfs2_extent_block *eb;
4151 struct ocfs2_extent_list *rightmost_el, *el;
4152 struct ocfs2_extent_rec split_rec;
4153 struct ocfs2_extent_rec *rec;
4154 struct ocfs2_insert_type insert;
4155
4156 /*
4157 * Setup the record to split before we grow the tree.
4158 */
4159 el = path_leaf_el(path);
4160 rec = &el->l_recs[index];
4161 ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
4162
4163 depth = path->p_tree_depth;
4164 if (depth > 0) {
4165 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4166 le64_to_cpu(di->i_last_eb_blk),
4167 &last_eb_bh, OCFS2_BH_CACHED, inode);
4168 if (ret < 0) {
4169 mlog_errno(ret);
4170 goto out;
4171 }
4172
4173 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4174 rightmost_el = &eb->h_list;
4175 } else
4176 rightmost_el = path_leaf_el(path);
4177
4178 credits += path->p_tree_depth + ocfs2_extend_meta_needed(di);
4179 ret = ocfs2_extend_trans(handle, credits);
4180 if (ret) {
4181 mlog_errno(ret);
4182 goto out;
4183 }
4184
4185 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4186 le16_to_cpu(rightmost_el->l_count)) {
4187 int old_depth = depth;
4188
4189 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
4190 meta_ac);
4191 if (ret) {
4192 mlog_errno(ret);
4193 goto out;
4194 }
4195
4196 if (old_depth != depth) {
4197 eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
4198 rightmost_el = &eb->h_list;
4199 }
4200 }
4201
4202 memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4203 insert.ins_appending = APPEND_NONE;
4204 insert.ins_contig = CONTIG_NONE;
4205 insert.ins_split = SPLIT_RIGHT;
4206 insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
4207 - le16_to_cpu(rightmost_el->l_next_free_rec);
4208 insert.ins_tree_depth = depth;
4209
4210 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
4211 if (ret)
4212 mlog_errno(ret);
4213
4214out:
4215 brelse(last_eb_bh);
4216 return ret;
4217}
4218
4219static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4220 struct ocfs2_path *path, int index,
4221 struct ocfs2_cached_dealloc_ctxt *dealloc,
4222 u32 cpos, u32 len)
4223{
4224 int ret;
4225 u32 left_cpos, rec_range, trunc_range;
4226 int wants_rotate = 0, is_rightmost_tree_rec = 0;
4227 struct super_block *sb = inode->i_sb;
4228 struct ocfs2_path *left_path = NULL;
4229 struct ocfs2_extent_list *el = path_leaf_el(path);
4230 struct ocfs2_extent_rec *rec;
4231 struct ocfs2_extent_block *eb;
4232
4233 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
4234 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
4235 if (ret) {
4236 mlog_errno(ret);
4237 goto out;
4238 }
4239
4240 index--;
4241 }
4242
4243 if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
4244 path->p_tree_depth) {
4245 /*
4246 * Check whether this is the rightmost tree record. If
4247 * we remove all of this record or part of its right
4248 * edge then an update of the record lengths above it
4249 * will be required.
4250 */
4251 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
4252 if (eb->h_next_leaf_blk == 0)
4253 is_rightmost_tree_rec = 1;
4254 }
4255
4256 rec = &el->l_recs[index];
4257 if (index == 0 && path->p_tree_depth &&
4258 le32_to_cpu(rec->e_cpos) == cpos) {
4259 /*
4260 * Changing the leftmost offset (via partial or whole
4261 * record truncate) of an interior (or rightmost) path
4262 * means we have to update the subtree that is formed
4263 * by this leaf and the one to it's left.
4264 *
4265 * There are two cases we can skip:
4266 * 1) Path is the leftmost one in our inode tree.
4267 * 2) The leaf is rightmost and will be empty after
4268 * we remove the extent record - the rotate code
4269 * knows how to update the newly formed edge.
4270 */
4271
4272 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
4273 &left_cpos);
4274 if (ret) {
4275 mlog_errno(ret);
4276 goto out;
4277 }
4278
4279 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
4280 left_path = ocfs2_new_path(path_root_bh(path),
4281 path_root_el(path));
4282 if (!left_path) {
4283 ret = -ENOMEM;
4284 mlog_errno(ret);
4285 goto out;
4286 }
4287
4288 ret = ocfs2_find_path(inode, left_path, left_cpos);
4289 if (ret) {
4290 mlog_errno(ret);
4291 goto out;
4292 }
4293 }
4294 }
4295
4296 ret = ocfs2_extend_rotate_transaction(handle, 0,
4297 handle->h_buffer_credits,
4298 path);
4299 if (ret) {
4300 mlog_errno(ret);
4301 goto out;
4302 }
4303
4304 ret = ocfs2_journal_access_path(inode, handle, path);
4305 if (ret) {
4306 mlog_errno(ret);
4307 goto out;
4308 }
4309
4310 ret = ocfs2_journal_access_path(inode, handle, left_path);
4311 if (ret) {
4312 mlog_errno(ret);
4313 goto out;
4314 }
4315
4316 rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
4317 trunc_range = cpos + len;
4318
4319 if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
4320 int next_free;
4321
4322 memset(rec, 0, sizeof(*rec));
4323 ocfs2_cleanup_merge(el, index);
4324 wants_rotate = 1;
4325
4326 next_free = le16_to_cpu(el->l_next_free_rec);
4327 if (is_rightmost_tree_rec && next_free > 1) {
4328 /*
4329 * We skip the edge update if this path will
4330 * be deleted by the rotate code.
4331 */
4332 rec = &el->l_recs[next_free - 1];
4333 ocfs2_adjust_rightmost_records(inode, handle, path,
4334 rec);
4335 }
4336 } else if (le32_to_cpu(rec->e_cpos) == cpos) {
4337 /* Remove leftmost portion of the record. */
4338 le32_add_cpu(&rec->e_cpos, len);
4339 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
4340 le16_add_cpu(&rec->e_leaf_clusters, -len);
4341 } else if (rec_range == trunc_range) {
4342 /* Remove rightmost portion of the record */
4343 le16_add_cpu(&rec->e_leaf_clusters, -len);
4344 if (is_rightmost_tree_rec)
4345 ocfs2_adjust_rightmost_records(inode, handle, path, rec);
4346 } else {
4347 /* Caller should have trapped this. */
4348 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
4349 "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
4350 le32_to_cpu(rec->e_cpos),
4351 le16_to_cpu(rec->e_leaf_clusters), cpos, len);
4352 BUG();
4353 }
4354
4355 if (left_path) {
4356 int subtree_index;
4357
4358 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
4359 ocfs2_complete_edge_insert(inode, handle, left_path, path,
4360 subtree_index);
4361 }
4362
4363 ocfs2_journal_dirty(handle, path_leaf_bh(path));
4364
4365 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
4366 if (ret) {
4367 mlog_errno(ret);
4368 goto out;
4369 }
4370
4371out:
4372 ocfs2_free_path(left_path);
4373 return ret;
4374}
4375
4376int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
4377 u32 cpos, u32 len, handle_t *handle,
4378 struct ocfs2_alloc_context *meta_ac,
4379 struct ocfs2_cached_dealloc_ctxt *dealloc)
4380{
4381 int ret, index;
4382 u32 rec_range, trunc_range;
4383 struct ocfs2_extent_rec *rec;
4384 struct ocfs2_extent_list *el;
4385 struct ocfs2_path *path;
4386
4387 ocfs2_extent_map_trunc(inode, 0);
4388
4389 path = ocfs2_new_inode_path(di_bh);
4390 if (!path) {
4391 ret = -ENOMEM;
4392 mlog_errno(ret);
4393 goto out;
4394 }
4395
4396 ret = ocfs2_find_path(inode, path, cpos);
4397 if (ret) {
4398 mlog_errno(ret);
4399 goto out;
4400 }
4401
4402 el = path_leaf_el(path);
4403 index = ocfs2_search_extent_list(el, cpos);
4404 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4405 ocfs2_error(inode->i_sb,
4406 "Inode %llu has an extent at cpos %u which can no "
4407 "longer be found.\n",
4408 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4409 ret = -EROFS;
4410 goto out;
4411 }
4412
4413 /*
4414 * We have 3 cases of extent removal:
4415 * 1) Range covers the entire extent rec
4416 * 2) Range begins or ends on one edge of the extent rec
4417 * 3) Range is in the middle of the extent rec (no shared edges)
4418 *
4419 * For case 1 we remove the extent rec and left rotate to
4420 * fill the hole.
4421 *
4422 * For case 2 we just shrink the existing extent rec, with a
4423 * tree update if the shrinking edge is also the edge of an
4424 * extent block.
4425 *
4426 * For case 3 we do a right split to turn the extent rec into
4427 * something case 2 can handle.
4428 */
4429 rec = &el->l_recs[index];
4430 rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
4431 trunc_range = cpos + len;
4432
4433 BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
4434
4435 mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
4436 "(cpos %u, len %u)\n",
4437 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
4438 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
4439
4440 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
4441 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4442 cpos, len);
4443 if (ret) {
4444 mlog_errno(ret);
4445 goto out;
4446 }
4447 } else {
4448 ret = ocfs2_split_tree(inode, di_bh, handle, path, index,
4449 trunc_range, meta_ac);
4450 if (ret) {
4451 mlog_errno(ret);
4452 goto out;
4453 }
4454
4455 /*
4456 * The split could have manipulated the tree enough to
4457 * move the record location, so we have to look for it again.
4458 */
4459 ocfs2_reinit_path(path, 1);
4460
4461 ret = ocfs2_find_path(inode, path, cpos);
4462 if (ret) {
4463 mlog_errno(ret);
4464 goto out;
4465 }
4466
4467 el = path_leaf_el(path);
4468 index = ocfs2_search_extent_list(el, cpos);
4469 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4470 ocfs2_error(inode->i_sb,
4471 "Inode %llu: split at cpos %u lost record.",
4472 (unsigned long long)OCFS2_I(inode)->ip_blkno,
4473 cpos);
4474 ret = -EROFS;
4475 goto out;
4476 }
4477
4478 /*
4479 * Double check our values here. If anything is fishy,
4480 * it's easier to catch it at the top level.
4481 */
4482 rec = &el->l_recs[index];
4483 rec_range = le32_to_cpu(rec->e_cpos) +
4484 ocfs2_rec_clusters(el, rec);
4485 if (rec_range != trunc_range) {
4486 ocfs2_error(inode->i_sb,
4487 "Inode %llu: error after split at cpos %u"
4488 "trunc len %u, existing record is (%u,%u)",
4489 (unsigned long long)OCFS2_I(inode)->ip_blkno,
4490 cpos, len, le32_to_cpu(rec->e_cpos),
4491 ocfs2_rec_clusters(el, rec));
4492 ret = -EROFS;
4493 goto out;
4494 }
4495
4496 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4497 cpos, len);
4498 if (ret) {
4499 mlog_errno(ret);
4500 goto out;
4501 }
4502 }
4503
4504out:
4505 ocfs2_free_path(path);
4506 return ret;
4507}
4508
4509int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
2435{ 4510{
2436 struct buffer_head *tl_bh = osb->osb_tl_bh; 4511 struct buffer_head *tl_bh = osb->osb_tl_bh;
2437 struct ocfs2_dinode *di; 4512 struct ocfs2_dinode *di;
@@ -2464,10 +4539,10 @@ static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
2464 return current_tail == new_start; 4539 return current_tail == new_start;
2465} 4540}
2466 4541
2467static int ocfs2_truncate_log_append(struct ocfs2_super *osb, 4542int ocfs2_truncate_log_append(struct ocfs2_super *osb,
2468 handle_t *handle, 4543 handle_t *handle,
2469 u64 start_blk, 4544 u64 start_blk,
2470 unsigned int num_clusters) 4545 unsigned int num_clusters)
2471{ 4546{
2472 int status, index; 4547 int status, index;
2473 unsigned int start_cluster, tl_count; 4548 unsigned int start_cluster, tl_count;
@@ -2623,7 +4698,7 @@ bail:
2623} 4698}
2624 4699
2625/* Expects you to already be holding tl_inode->i_mutex */ 4700/* Expects you to already be holding tl_inode->i_mutex */
2626static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) 4701int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
2627{ 4702{
2628 int status; 4703 int status;
2629 unsigned int num_to_flush; 4704 unsigned int num_to_flush;
@@ -2957,6 +5032,219 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
2957 return status; 5032 return status;
2958} 5033}
2959 5034
5035/*
5036 * Delayed de-allocation of suballocator blocks.
5037 *
5038 * Some sets of block de-allocations might involve multiple suballocator inodes.
5039 *
5040 * The locking for this can get extremely complicated, especially when
5041 * the suballocator inodes to delete from aren't known until deep
5042 * within an unrelated codepath.
5043 *
5044 * ocfs2_extent_block structures are a good example of this - an inode
5045 * btree could have been grown by any number of nodes each allocating
5046 * out of their own suballoc inode.
5047 *
5048 * These structures allow the delay of block de-allocation until a
5049 * later time, when locking of multiple cluster inodes won't cause
5050 * deadlock.
5051 */
5052
5053/*
5054 * Describes a single block free from a suballocator
5055 */
5056struct ocfs2_cached_block_free {
5057 struct ocfs2_cached_block_free *free_next;
5058 u64 free_blk;
5059 unsigned int free_bit;
5060};
5061
5062struct ocfs2_per_slot_free_list {
5063 struct ocfs2_per_slot_free_list *f_next_suballocator;
5064 int f_inode_type;
5065 int f_slot;
5066 struct ocfs2_cached_block_free *f_first;
5067};
5068
5069static int ocfs2_free_cached_items(struct ocfs2_super *osb,
5070 int sysfile_type,
5071 int slot,
5072 struct ocfs2_cached_block_free *head)
5073{
5074 int ret;
5075 u64 bg_blkno;
5076 handle_t *handle;
5077 struct inode *inode;
5078 struct buffer_head *di_bh = NULL;
5079 struct ocfs2_cached_block_free *tmp;
5080
5081 inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
5082 if (!inode) {
5083 ret = -EINVAL;
5084 mlog_errno(ret);
5085 goto out;
5086 }
5087
5088 mutex_lock(&inode->i_mutex);
5089
5090 ret = ocfs2_meta_lock(inode, &di_bh, 1);
5091 if (ret) {
5092 mlog_errno(ret);
5093 goto out_mutex;
5094 }
5095
5096 handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
5097 if (IS_ERR(handle)) {
5098 ret = PTR_ERR(handle);
5099 mlog_errno(ret);
5100 goto out_unlock;
5101 }
5102
5103 while (head) {
5104 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
5105 head->free_bit);
5106 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
5107 head->free_bit, (unsigned long long)head->free_blk);
5108
5109 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
5110 head->free_bit, bg_blkno, 1);
5111 if (ret) {
5112 mlog_errno(ret);
5113 goto out_journal;
5114 }
5115
5116 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
5117 if (ret) {
5118 mlog_errno(ret);
5119 goto out_journal;
5120 }
5121
5122 tmp = head;
5123 head = head->free_next;
5124 kfree(tmp);
5125 }
5126
5127out_journal:
5128 ocfs2_commit_trans(osb, handle);
5129
5130out_unlock:
5131 ocfs2_meta_unlock(inode, 1);
5132 brelse(di_bh);
5133out_mutex:
5134 mutex_unlock(&inode->i_mutex);
5135 iput(inode);
5136out:
5137 while(head) {
5138 /* Premature exit may have left some dangling items. */
5139 tmp = head;
5140 head = head->free_next;
5141 kfree(tmp);
5142 }
5143
5144 return ret;
5145}
5146
5147int ocfs2_run_deallocs(struct ocfs2_super *osb,
5148 struct ocfs2_cached_dealloc_ctxt *ctxt)
5149{
5150 int ret = 0, ret2;
5151 struct ocfs2_per_slot_free_list *fl;
5152
5153 if (!ctxt)
5154 return 0;
5155
5156 while (ctxt->c_first_suballocator) {
5157 fl = ctxt->c_first_suballocator;
5158
5159 if (fl->f_first) {
5160 mlog(0, "Free items: (type %u, slot %d)\n",
5161 fl->f_inode_type, fl->f_slot);
5162 ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
5163 fl->f_slot, fl->f_first);
5164 if (ret2)
5165 mlog_errno(ret2);
5166 if (!ret)
5167 ret = ret2;
5168 }
5169
5170 ctxt->c_first_suballocator = fl->f_next_suballocator;
5171 kfree(fl);
5172 }
5173
5174 return ret;
5175}
5176
5177static struct ocfs2_per_slot_free_list *
5178ocfs2_find_per_slot_free_list(int type,
5179 int slot,
5180 struct ocfs2_cached_dealloc_ctxt *ctxt)
5181{
5182 struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
5183
5184 while (fl) {
5185 if (fl->f_inode_type == type && fl->f_slot == slot)
5186 return fl;
5187
5188 fl = fl->f_next_suballocator;
5189 }
5190
5191 fl = kmalloc(sizeof(*fl), GFP_NOFS);
5192 if (fl) {
5193 fl->f_inode_type = type;
5194 fl->f_slot = slot;
5195 fl->f_first = NULL;
5196 fl->f_next_suballocator = ctxt->c_first_suballocator;
5197
5198 ctxt->c_first_suballocator = fl;
5199 }
5200 return fl;
5201}
5202
5203static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
5204 int type, int slot, u64 blkno,
5205 unsigned int bit)
5206{
5207 int ret;
5208 struct ocfs2_per_slot_free_list *fl;
5209 struct ocfs2_cached_block_free *item;
5210
5211 fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
5212 if (fl == NULL) {
5213 ret = -ENOMEM;
5214 mlog_errno(ret);
5215 goto out;
5216 }
5217
5218 item = kmalloc(sizeof(*item), GFP_NOFS);
5219 if (item == NULL) {
5220 ret = -ENOMEM;
5221 mlog_errno(ret);
5222 goto out;
5223 }
5224
5225 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
5226 type, slot, bit, (unsigned long long)blkno);
5227
5228 item->free_blk = blkno;
5229 item->free_bit = bit;
5230 item->free_next = fl->f_first;
5231
5232 fl->f_first = item;
5233
5234 ret = 0;
5235out:
5236 return ret;
5237}
5238
5239static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
5240 struct ocfs2_extent_block *eb)
5241{
5242 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
5243 le16_to_cpu(eb->h_suballoc_slot),
5244 le64_to_cpu(eb->h_blkno),
5245 le16_to_cpu(eb->h_suballoc_bit));
5246}
5247
2960/* This function will figure out whether the currently last extent 5248/* This function will figure out whether the currently last extent
2961 * block will be deleted, and if it will, what the new last extent 5249 * block will be deleted, and if it will, what the new last extent
2962 * block will be so we can update his h_next_leaf_blk field, as well 5250 * block will be so we can update his h_next_leaf_blk field, as well
@@ -3238,27 +5526,10 @@ delete:
3238 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); 5526 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
3239 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno)); 5527 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
3240 5528
3241 if (le16_to_cpu(eb->h_suballoc_slot) == 0) { 5529 ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
3242 /* 5530 /* An error here is not fatal. */
3243 * This code only understands how to 5531 if (ret < 0)
3244 * lock the suballocator in slot 0, 5532 mlog_errno(ret);
3245 * which is fine because allocation is
3246 * only ever done out of that
3247 * suballocator too. A future version
3248 * might change that however, so avoid
3249 * a free if we don't know how to
3250 * handle it. This way an fs incompat
3251 * bit will not be necessary.
3252 */
3253 ret = ocfs2_free_extent_block(handle,
3254 tc->tc_ext_alloc_inode,
3255 tc->tc_ext_alloc_bh,
3256 eb);
3257
3258 /* An error here is not fatal. */
3259 if (ret < 0)
3260 mlog_errno(ret);
3261 }
3262 } else { 5533 } else {
3263 deleted_eb = 0; 5534 deleted_eb = 0;
3264 } 5535 }
@@ -3397,9 +5668,9 @@ static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
3397 return ocfs2_journal_dirty_data(handle, bh); 5668 return ocfs2_journal_dirty_data(handle, bh);
3398} 5669}
3399 5670
3400static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize, 5671static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
3401 struct page **pages, int numpages, 5672 loff_t end, struct page **pages,
3402 u64 phys, handle_t *handle) 5673 int numpages, u64 phys, handle_t *handle)
3403{ 5674{
3404 int i, ret, partial = 0; 5675 int i, ret, partial = 0;
3405 void *kaddr; 5676 void *kaddr;
@@ -3412,26 +5683,14 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
3412 if (numpages == 0) 5683 if (numpages == 0)
3413 goto out; 5684 goto out;
3414 5685
3415 from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */ 5686 to = PAGE_CACHE_SIZE;
3416 if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
3417 /*
3418 * Since 'from' has been capped to a value below page
3419 * size, this calculation won't be able to overflow
3420 * 'to'
3421 */
3422 to = ocfs2_align_bytes_to_clusters(sb, from);
3423
3424 /*
3425 * The truncate tail in this case should never contain
3426 * more than one page at maximum. The loop below also
3427 * assumes this.
3428 */
3429 BUG_ON(numpages != 1);
3430 }
3431
3432 for(i = 0; i < numpages; i++) { 5687 for(i = 0; i < numpages; i++) {
3433 page = pages[i]; 5688 page = pages[i];
3434 5689
5690 from = start & (PAGE_CACHE_SIZE - 1);
5691 if ((end >> PAGE_CACHE_SHIFT) == page->index)
5692 to = end & (PAGE_CACHE_SIZE - 1);
5693
3435 BUG_ON(from > PAGE_CACHE_SIZE); 5694 BUG_ON(from > PAGE_CACHE_SIZE);
3436 BUG_ON(to > PAGE_CACHE_SIZE); 5695 BUG_ON(to > PAGE_CACHE_SIZE);
3437 5696
@@ -3468,10 +5727,7 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
3468 5727
3469 flush_dcache_page(page); 5728 flush_dcache_page(page);
3470 5729
3471 /* 5730 start = (page->index + 1) << PAGE_CACHE_SHIFT;
3472 * Every page after the 1st one should be completely zero'd.
3473 */
3474 from = 0;
3475 } 5731 }
3476out: 5732out:
3477 if (pages) { 5733 if (pages) {
@@ -3484,24 +5740,26 @@ out:
3484 } 5740 }
3485} 5741}
3486 5742
3487static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages, 5743static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
3488 int *num, u64 *phys) 5744 struct page **pages, int *num, u64 *phys)
3489{ 5745{
3490 int i, numpages = 0, ret = 0; 5746 int i, numpages = 0, ret = 0;
3491 unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
3492 unsigned int ext_flags; 5747 unsigned int ext_flags;
3493 struct super_block *sb = inode->i_sb; 5748 struct super_block *sb = inode->i_sb;
3494 struct address_space *mapping = inode->i_mapping; 5749 struct address_space *mapping = inode->i_mapping;
3495 unsigned long index; 5750 unsigned long index;
3496 u64 next_cluster_bytes; 5751 loff_t last_page_bytes;
3497 5752
3498 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); 5753 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
5754 BUG_ON(start > end);
3499 5755
3500 /* Cluster boundary, so we don't need to grab any pages. */ 5756 if (start == end)
3501 if ((isize & (csize - 1)) == 0)
3502 goto out; 5757 goto out;
3503 5758
3504 ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits, 5759 BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
5760 (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
5761
5762 ret = ocfs2_extent_map_get_blocks(inode, start >> sb->s_blocksize_bits,
3505 phys, NULL, &ext_flags); 5763 phys, NULL, &ext_flags);
3506 if (ret) { 5764 if (ret) {
3507 mlog_errno(ret); 5765 mlog_errno(ret);
@@ -3517,8 +5775,8 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
3517 if (ext_flags & OCFS2_EXT_UNWRITTEN) 5775 if (ext_flags & OCFS2_EXT_UNWRITTEN)
3518 goto out; 5776 goto out;
3519 5777
3520 next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize); 5778 last_page_bytes = PAGE_ALIGN(end);
3521 index = isize >> PAGE_CACHE_SHIFT; 5779 index = start >> PAGE_CACHE_SHIFT;
3522 do { 5780 do {
3523 pages[numpages] = grab_cache_page(mapping, index); 5781 pages[numpages] = grab_cache_page(mapping, index);
3524 if (!pages[numpages]) { 5782 if (!pages[numpages]) {
@@ -3529,7 +5787,7 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
3529 5787
3530 numpages++; 5788 numpages++;
3531 index++; 5789 index++;
3532 } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT)); 5790 } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
3533 5791
3534out: 5792out:
3535 if (ret != 0) { 5793 if (ret != 0) {
@@ -3558,11 +5816,10 @@ out:
3558 * otherwise block_write_full_page() will skip writeout of pages past 5816 * otherwise block_write_full_page() will skip writeout of pages past
3559 * i_size. The new_i_size parameter is passed for this reason. 5817 * i_size. The new_i_size parameter is passed for this reason.
3560 */ 5818 */
3561int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, 5819int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
3562 u64 new_i_size) 5820 u64 range_start, u64 range_end)
3563{ 5821{
3564 int ret, numpages; 5822 int ret, numpages;
3565 loff_t endbyte;
3566 struct page **pages = NULL; 5823 struct page **pages = NULL;
3567 u64 phys; 5824 u64 phys;
3568 5825
@@ -3581,7 +5838,8 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
3581 goto out; 5838 goto out;
3582 } 5839 }
3583 5840
3584 ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys); 5841 ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
5842 &numpages, &phys);
3585 if (ret) { 5843 if (ret) {
3586 mlog_errno(ret); 5844 mlog_errno(ret);
3587 goto out; 5845 goto out;
@@ -3590,17 +5848,16 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
3590 if (numpages == 0) 5848 if (numpages == 0)
3591 goto out; 5849 goto out;
3592 5850
3593 ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys, 5851 ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
3594 handle); 5852 numpages, phys, handle);
3595 5853
3596 /* 5854 /*
3597 * Initiate writeout of the pages we zero'd here. We don't 5855 * Initiate writeout of the pages we zero'd here. We don't
3598 * wait on them - the truncate_inode_pages() call later will 5856 * wait on them - the truncate_inode_pages() call later will
3599 * do that for us. 5857 * do that for us.
3600 */ 5858 */
3601 endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); 5859 ret = do_sync_mapping_range(inode->i_mapping, range_start,
3602 ret = do_sync_mapping_range(inode->i_mapping, new_i_size, 5860 range_end - 1, SYNC_FILE_RANGE_WRITE);
3603 endbyte - 1, SYNC_FILE_RANGE_WRITE);
3604 if (ret) 5861 if (ret)
3605 mlog_errno(ret); 5862 mlog_errno(ret);
3606 5863
@@ -3631,8 +5888,6 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
3631 5888
3632 mlog_entry_void(); 5889 mlog_entry_void();
3633 5890
3634 down_write(&OCFS2_I(inode)->ip_alloc_sem);
3635
3636 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 5891 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
3637 i_size_read(inode)); 5892 i_size_read(inode));
3638 5893
@@ -3754,7 +6009,6 @@ start:
3754 goto start; 6009 goto start;
3755 6010
3756bail: 6011bail:
3757 up_write(&OCFS2_I(inode)->ip_alloc_sem);
3758 6012
3759 ocfs2_schedule_truncate_log_flush(osb, 1); 6013 ocfs2_schedule_truncate_log_flush(osb, 1);
3760 6014
@@ -3764,6 +6018,8 @@ bail:
3764 if (handle) 6018 if (handle)
3765 ocfs2_commit_trans(osb, handle); 6019 ocfs2_commit_trans(osb, handle);
3766 6020
6021 ocfs2_run_deallocs(osb, &tc->tc_dealloc);
6022
3767 ocfs2_free_path(path); 6023 ocfs2_free_path(path);
3768 6024
3769 /* This will drop the ext_alloc cluster lock for us */ 6025 /* This will drop the ext_alloc cluster lock for us */
@@ -3774,23 +6030,18 @@ bail:
3774} 6030}
3775 6031
3776/* 6032/*
3777 * Expects the inode to already be locked. This will figure out which 6033 * Expects the inode to already be locked.
3778 * inodes need to be locked and will put them on the returned truncate
3779 * context.
3780 */ 6034 */
3781int ocfs2_prepare_truncate(struct ocfs2_super *osb, 6035int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3782 struct inode *inode, 6036 struct inode *inode,
3783 struct buffer_head *fe_bh, 6037 struct buffer_head *fe_bh,
3784 struct ocfs2_truncate_context **tc) 6038 struct ocfs2_truncate_context **tc)
3785{ 6039{
3786 int status, metadata_delete, i; 6040 int status;
3787 unsigned int new_i_clusters; 6041 unsigned int new_i_clusters;
3788 struct ocfs2_dinode *fe; 6042 struct ocfs2_dinode *fe;
3789 struct ocfs2_extent_block *eb; 6043 struct ocfs2_extent_block *eb;
3790 struct ocfs2_extent_list *el;
3791 struct buffer_head *last_eb_bh = NULL; 6044 struct buffer_head *last_eb_bh = NULL;
3792 struct inode *ext_alloc_inode = NULL;
3793 struct buffer_head *ext_alloc_bh = NULL;
3794 6045
3795 mlog_entry_void(); 6046 mlog_entry_void();
3796 6047
@@ -3810,12 +6061,9 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3810 mlog_errno(status); 6061 mlog_errno(status);
3811 goto bail; 6062 goto bail;
3812 } 6063 }
6064 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
3813 6065
3814 metadata_delete = 0;
3815 if (fe->id2.i_list.l_tree_depth) { 6066 if (fe->id2.i_list.l_tree_depth) {
3816 /* If we have a tree, then the truncate may result in
3817 * metadata deletes. Figure this out from the
3818 * rightmost leaf block.*/
3819 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 6067 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
3820 &last_eb_bh, OCFS2_BH_CACHED, inode); 6068 &last_eb_bh, OCFS2_BH_CACHED, inode);
3821 if (status < 0) { 6069 if (status < 0) {
@@ -3830,43 +6078,10 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3830 status = -EIO; 6078 status = -EIO;
3831 goto bail; 6079 goto bail;
3832 } 6080 }
3833 el = &(eb->h_list);
3834
3835 i = 0;
3836 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3837 i = 1;
3838 /*
3839 * XXX: Should we check that next_free_rec contains
3840 * the extent?
3841 */
3842 if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
3843 metadata_delete = 1;
3844 } 6081 }
3845 6082
3846 (*tc)->tc_last_eb_bh = last_eb_bh; 6083 (*tc)->tc_last_eb_bh = last_eb_bh;
3847 6084
3848 if (metadata_delete) {
3849 mlog(0, "Will have to delete metadata for this trunc. "
3850 "locking allocator.\n");
3851 ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
3852 if (!ext_alloc_inode) {
3853 status = -ENOMEM;
3854 mlog_errno(status);
3855 goto bail;
3856 }
3857
3858 mutex_lock(&ext_alloc_inode->i_mutex);
3859 (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
3860
3861 status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1);
3862 if (status < 0) {
3863 mlog_errno(status);
3864 goto bail;
3865 }
3866 (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
3867 (*tc)->tc_ext_alloc_locked = 1;
3868 }
3869
3870 status = 0; 6085 status = 0;
3871bail: 6086bail:
3872 if (status < 0) { 6087 if (status < 0) {
@@ -3880,16 +6095,13 @@ bail:
3880 6095
3881static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) 6096static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
3882{ 6097{
3883 if (tc->tc_ext_alloc_inode) { 6098 /*
3884 if (tc->tc_ext_alloc_locked) 6099 * The caller is responsible for completing deallocation
3885 ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1); 6100 * before freeing the context.
3886 6101 */
3887 mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex); 6102 if (tc->tc_dealloc.c_first_suballocator != NULL)
3888 iput(tc->tc_ext_alloc_inode); 6103 mlog(ML_NOTICE,
3889 } 6104 "Truncate completion has non-empty dealloc context\n");
3890
3891 if (tc->tc_ext_alloc_bh)
3892 brelse(tc->tc_ext_alloc_bh);
3893 6105
3894 if (tc->tc_last_eb_bh) 6106 if (tc->tc_last_eb_bh)
3895 brelse(tc->tc_last_eb_bh); 6107 brelse(tc->tc_last_eb_bh);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index fbcb5934a081..990df48ae8d3 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -34,7 +34,17 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
34 u32 cpos, 34 u32 cpos,
35 u64 start_blk, 35 u64 start_blk,
36 u32 new_clusters, 36 u32 new_clusters,
37 u8 flags,
37 struct ocfs2_alloc_context *meta_ac); 38 struct ocfs2_alloc_context *meta_ac);
39struct ocfs2_cached_dealloc_ctxt;
40int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
41 handle_t *handle, u32 cpos, u32 len, u32 phys,
42 struct ocfs2_alloc_context *meta_ac,
43 struct ocfs2_cached_dealloc_ctxt *dealloc);
44int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
45 u32 cpos, u32 len, handle_t *handle,
46 struct ocfs2_alloc_context *meta_ac,
47 struct ocfs2_cached_dealloc_ctxt *dealloc);
38int ocfs2_num_free_extents(struct ocfs2_super *osb, 48int ocfs2_num_free_extents(struct ocfs2_super *osb,
39 struct inode *inode, 49 struct inode *inode,
40 struct ocfs2_dinode *fe); 50 struct ocfs2_dinode *fe);
@@ -62,17 +72,41 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
62 struct ocfs2_dinode **tl_copy); 72 struct ocfs2_dinode **tl_copy);
63int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, 73int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
64 struct ocfs2_dinode *tl_copy); 74 struct ocfs2_dinode *tl_copy);
75int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb);
76int ocfs2_truncate_log_append(struct ocfs2_super *osb,
77 handle_t *handle,
78 u64 start_blk,
79 unsigned int num_clusters);
80int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
81
82/*
83 * Process local structure which describes the block unlinks done
84 * during an operation. This is populated via
85 * ocfs2_cache_block_dealloc().
86 *
87 * ocfs2_run_deallocs() should be called after the potentially
88 * de-allocating routines. No journal handles should be open, and most
89 * locks should have been dropped.
90 */
91struct ocfs2_cached_dealloc_ctxt {
92 struct ocfs2_per_slot_free_list *c_first_suballocator;
93};
94static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
95{
96 c->c_first_suballocator = NULL;
97}
98int ocfs2_run_deallocs(struct ocfs2_super *osb,
99 struct ocfs2_cached_dealloc_ctxt *ctxt);
65 100
66struct ocfs2_truncate_context { 101struct ocfs2_truncate_context {
67 struct inode *tc_ext_alloc_inode; 102 struct ocfs2_cached_dealloc_ctxt tc_dealloc;
68 struct buffer_head *tc_ext_alloc_bh;
69 int tc_ext_alloc_locked; /* is it cluster locked? */ 103 int tc_ext_alloc_locked; /* is it cluster locked? */
70 /* these get destroyed once it's passed to ocfs2_commit_truncate. */ 104 /* these get destroyed once it's passed to ocfs2_commit_truncate. */
71 struct buffer_head *tc_last_eb_bh; 105 struct buffer_head *tc_last_eb_bh;
72}; 106};
73 107
74int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, 108int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
75 u64 new_i_size); 109 u64 range_start, u64 range_end);
76int ocfs2_prepare_truncate(struct ocfs2_super *osb, 110int ocfs2_prepare_truncate(struct ocfs2_super *osb,
77 struct inode *inode, 111 struct inode *inode,
78 struct buffer_head *fe_bh, 112 struct buffer_head *fe_bh,
@@ -84,6 +118,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
84 118
85int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, 119int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
86 u32 cpos, struct buffer_head **leaf_bh); 120 u32 cpos, struct buffer_head **leaf_bh);
121int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
87 122
88/* 123/*
89 * Helper function to look at the # of clusters in an extent record. 124 * Helper function to look at the # of clusters in an extent record.
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a480b09c79b9..460d440310f2 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -232,7 +232,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
232 * might now be discovering a truncate that hit on another node. 232 * might now be discovering a truncate that hit on another node.
233 * block_read_full_page->get_block freaks out if it is asked to read 233 * block_read_full_page->get_block freaks out if it is asked to read
234 * beyond the end of a file, so we check here. Callers 234 * beyond the end of a file, so we check here. Callers
235 * (generic_file_read, fault->nopage) are clever enough to check i_size 235 * (generic_file_read, vm_ops->fault) are clever enough to check i_size
236 * and notice that the page they just read isn't needed. 236 * and notice that the page they just read isn't needed.
237 * 237 *
238 * XXX sys_readahead() seems to get that wrong? 238 * XXX sys_readahead() seems to get that wrong?
@@ -684,6 +684,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
684 bh = bh->b_this_page, block_start += bsize) { 684 bh = bh->b_this_page, block_start += bsize) {
685 block_end = block_start + bsize; 685 block_end = block_start + bsize;
686 686
687 clear_buffer_new(bh);
688
687 /* 689 /*
688 * Ignore blocks outside of our i/o range - 690 * Ignore blocks outside of our i/o range -
689 * they may belong to unallocated clusters. 691 * they may belong to unallocated clusters.
@@ -698,9 +700,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
698 * For an allocating write with cluster size >= page 700 * For an allocating write with cluster size >= page
699 * size, we always write the entire page. 701 * size, we always write the entire page.
700 */ 702 */
701 703 if (new)
702 if (buffer_new(bh)) 704 set_buffer_new(bh);
703 clear_buffer_new(bh);
704 705
705 if (!buffer_mapped(bh)) { 706 if (!buffer_mapped(bh)) {
706 map_bh(bh, inode->i_sb, *p_blkno); 707 map_bh(bh, inode->i_sb, *p_blkno);
@@ -711,7 +712,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
711 if (!buffer_uptodate(bh)) 712 if (!buffer_uptodate(bh))
712 set_buffer_uptodate(bh); 713 set_buffer_uptodate(bh);
713 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && 714 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
714 (block_start < from || block_end > to)) { 715 !buffer_new(bh) &&
716 (block_start < from || block_end > to)) {
715 ll_rw_block(READ, 1, &bh); 717 ll_rw_block(READ, 1, &bh);
716 *wait_bh++=bh; 718 *wait_bh++=bh;
717 } 719 }
@@ -738,18 +740,13 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
738 bh = head; 740 bh = head;
739 block_start = 0; 741 block_start = 0;
740 do { 742 do {
741 void *kaddr;
742
743 block_end = block_start + bsize; 743 block_end = block_start + bsize;
744 if (block_end <= from) 744 if (block_end <= from)
745 goto next_bh; 745 goto next_bh;
746 if (block_start >= to) 746 if (block_start >= to)
747 break; 747 break;
748 748
749 kaddr = kmap_atomic(page, KM_USER0); 749 zero_user_page(page, block_start, bh->b_size, KM_USER0);
750 memset(kaddr+block_start, 0, bh->b_size);
751 flush_dcache_page(page);
752 kunmap_atomic(kaddr, KM_USER0);
753 set_buffer_uptodate(bh); 750 set_buffer_uptodate(bh);
754 mark_buffer_dirty(bh); 751 mark_buffer_dirty(bh);
755 752
@@ -761,217 +758,240 @@ next_bh:
761 return ret; 758 return ret;
762} 759}
763 760
761#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
762#define OCFS2_MAX_CTXT_PAGES 1
763#else
764#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
765#endif
766
767#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
768
764/* 769/*
765 * This will copy user data from the buffer page in the splice 770 * Describe the state of a single cluster to be written to.
766 * context.
767 *
768 * For now, we ignore SPLICE_F_MOVE as that would require some extra
769 * communication out all the way to ocfs2_write().
770 */ 771 */
771int ocfs2_map_and_write_splice_data(struct inode *inode, 772struct ocfs2_write_cluster_desc {
772 struct ocfs2_write_ctxt *wc, u64 *p_blkno, 773 u32 c_cpos;
773 unsigned int *ret_from, unsigned int *ret_to) 774 u32 c_phys;
775 /*
776 * Give this a unique field because c_phys eventually gets
777 * filled.
778 */
779 unsigned c_new;
780 unsigned c_unwritten;
781};
782
783static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d)
774{ 784{
775 int ret; 785 return d->c_new || d->c_unwritten;
776 unsigned int to, from, cluster_start, cluster_end; 786}
777 char *src, *dst;
778 struct ocfs2_splice_write_priv *sp = wc->w_private;
779 struct pipe_buffer *buf = sp->s_buf;
780 unsigned long bytes, src_from;
781 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
782 787
783 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, 788struct ocfs2_write_ctxt {
784 &cluster_end); 789 /* Logical cluster position / len of write */
790 u32 w_cpos;
791 u32 w_clen;
785 792
786 from = sp->s_offset; 793 struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
787 src_from = sp->s_buf_offset;
788 bytes = wc->w_count;
789 794
790 if (wc->w_large_pages) { 795 /*
791 /* 796 * This is true if page_size > cluster_size.
792 * For cluster size < page size, we have to 797 *
793 * calculate pos within the cluster and obey 798 * It triggers a set of special cases during write which might
794 * the rightmost boundary. 799 * have to deal with allocating writes to partial pages.
795 */ 800 */
796 bytes = min(bytes, (unsigned long)(osb->s_clustersize 801 unsigned int w_large_pages;
797 - (wc->w_pos & (osb->s_clustersize - 1)))); 802
798 } 803 /*
799 to = from + bytes; 804 * Pages involved in this write.
805 *
806 * w_target_page is the page being written to by the user.
807 *
808 * w_pages is an array of pages which always contains
809 * w_target_page, and in the case of an allocating write with
810 * page_size < cluster size, it will contain zero'd and mapped
811 * pages adjacent to w_target_page which need to be written
812 * out in so that future reads from that region will get
813 * zero's.
814 */
815 struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
816 unsigned int w_num_pages;
817 struct page *w_target_page;
800 818
801 BUG_ON(from > PAGE_CACHE_SIZE); 819 /*
802 BUG_ON(to > PAGE_CACHE_SIZE); 820 * ocfs2_write_end() uses this to know what the real range to
803 BUG_ON(from < cluster_start); 821 * write in the target should be.
804 BUG_ON(to > cluster_end); 822 */
823 unsigned int w_target_from;
824 unsigned int w_target_to;
805 825
806 if (wc->w_this_page_new) 826 /*
807 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 827 * We could use journal_current_handle() but this is cleaner,
808 cluster_start, cluster_end, 1); 828 * IMHO -Mark
809 else 829 */
810 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 830 handle_t *w_handle;
811 from, to, 0); 831
812 if (ret) { 832 struct buffer_head *w_di_bh;
813 mlog_errno(ret); 833
814 goto out; 834 struct ocfs2_cached_dealloc_ctxt w_dealloc;
835};
836
837static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
838{
839 int i;
840
841 for(i = 0; i < wc->w_num_pages; i++) {
842 if (wc->w_pages[i] == NULL)
843 continue;
844
845 unlock_page(wc->w_pages[i]);
846 mark_page_accessed(wc->w_pages[i]);
847 page_cache_release(wc->w_pages[i]);
815 } 848 }
816 849
817 src = buf->ops->map(sp->s_pipe, buf, 1); 850 brelse(wc->w_di_bh);
818 dst = kmap_atomic(wc->w_this_page, KM_USER1); 851 kfree(wc);
819 memcpy(dst + from, src + src_from, bytes); 852}
820 kunmap_atomic(wc->w_this_page, KM_USER1); 853
821 buf->ops->unmap(sp->s_pipe, buf, src); 854static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
855 struct ocfs2_super *osb, loff_t pos,
856 unsigned len, struct buffer_head *di_bh)
857{
858 struct ocfs2_write_ctxt *wc;
859
860 wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
861 if (!wc)
862 return -ENOMEM;
822 863
823 wc->w_finished_copy = 1; 864 wc->w_cpos = pos >> osb->s_clustersize_bits;
865 wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len);
866 get_bh(di_bh);
867 wc->w_di_bh = di_bh;
824 868
825 *ret_from = from; 869 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
826 *ret_to = to; 870 wc->w_large_pages = 1;
827out: 871 else
872 wc->w_large_pages = 0;
873
874 ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
875
876 *wcp = wc;
828 877
829 return bytes ? (unsigned int)bytes : ret; 878 return 0;
830} 879}
831 880
832/* 881/*
833 * This will copy user data from the iovec in the buffered write 882 * If a page has any new buffers, zero them out here, and mark them uptodate
834 * context. 883 * and dirty so they'll be written out (in order to prevent uninitialised
884 * block data from leaking). And clear the new bit.
835 */ 885 */
836int ocfs2_map_and_write_user_data(struct inode *inode, 886static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
837 struct ocfs2_write_ctxt *wc, u64 *p_blkno,
838 unsigned int *ret_from, unsigned int *ret_to)
839{ 887{
840 int ret; 888 unsigned int block_start, block_end;
841 unsigned int to, from, cluster_start, cluster_end; 889 struct buffer_head *head, *bh;
842 unsigned long bytes, src_from;
843 char *dst;
844 struct ocfs2_buffered_write_priv *bp = wc->w_private;
845 const struct iovec *cur_iov = bp->b_cur_iov;
846 char __user *buf;
847 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
848 890
849 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, 891 BUG_ON(!PageLocked(page));
850 &cluster_end); 892 if (!page_has_buffers(page))
893 return;
851 894
852 buf = cur_iov->iov_base + bp->b_cur_off; 895 bh = head = page_buffers(page);
853 src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; 896 block_start = 0;
897 do {
898 block_end = block_start + bh->b_size;
854 899
855 from = wc->w_pos & (PAGE_CACHE_SIZE - 1); 900 if (buffer_new(bh)) {
901 if (block_end > from && block_start < to) {
902 if (!PageUptodate(page)) {
903 unsigned start, end;
856 904
857 /* 905 start = max(from, block_start);
858 * This is a lot of comparisons, but it reads quite 906 end = min(to, block_end);
859 * easily, which is important here.
860 */
861 /* Stay within the src page */
862 bytes = PAGE_SIZE - src_from;
863 /* Stay within the vector */
864 bytes = min(bytes,
865 (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
866 /* Stay within count */
867 bytes = min(bytes, (unsigned long)wc->w_count);
868 /*
869 * For clustersize > page size, just stay within
870 * target page, otherwise we have to calculate pos
871 * within the cluster and obey the rightmost
872 * boundary.
873 */
874 if (wc->w_large_pages) {
875 /*
876 * For cluster size < page size, we have to
877 * calculate pos within the cluster and obey
878 * the rightmost boundary.
879 */
880 bytes = min(bytes, (unsigned long)(osb->s_clustersize
881 - (wc->w_pos & (osb->s_clustersize - 1))));
882 } else {
883 /*
884 * cluster size > page size is the most common
885 * case - we just stay within the target page
886 * boundary.
887 */
888 bytes = min(bytes, PAGE_CACHE_SIZE - from);
889 }
890 907
891 to = from + bytes; 908 zero_user_page(page, start, end - start, KM_USER0);
909 set_buffer_uptodate(bh);
910 }
892 911
893 BUG_ON(from > PAGE_CACHE_SIZE); 912 clear_buffer_new(bh);
894 BUG_ON(to > PAGE_CACHE_SIZE); 913 mark_buffer_dirty(bh);
895 BUG_ON(from < cluster_start); 914 }
896 BUG_ON(to > cluster_end); 915 }
897 916
898 if (wc->w_this_page_new) 917 block_start = block_end;
899 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 918 bh = bh->b_this_page;
900 cluster_start, cluster_end, 1); 919 } while (bh != head);
901 else 920}
902 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
903 from, to, 0);
904 if (ret) {
905 mlog_errno(ret);
906 goto out;
907 }
908 921
909 dst = kmap(wc->w_this_page); 922/*
910 memcpy(dst + from, bp->b_src_buf + src_from, bytes); 923 * Only called when we have a failure during allocating write to write
911 kunmap(wc->w_this_page); 924 * zero's to the newly allocated region.
925 */
926static void ocfs2_write_failure(struct inode *inode,
927 struct ocfs2_write_ctxt *wc,
928 loff_t user_pos, unsigned user_len)
929{
930 int i;
931 unsigned from, to;
932 struct page *tmppage;
912 933
913 /* 934 ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len);
914 * XXX: This is slow, but simple. The caller of
915 * ocfs2_buffered_write_cluster() is responsible for
916 * passing through the iovecs, so it's difficult to
917 * predict what our next step is in here after our
918 * initial write. A future version should be pushing
919 * that iovec manipulation further down.
920 *
921 * By setting this, we indicate that a copy from user
922 * data was done, and subsequent calls for this
923 * cluster will skip copying more data.
924 */
925 wc->w_finished_copy = 1;
926 935
927 *ret_from = from; 936 if (wc->w_large_pages) {
928 *ret_to = to; 937 from = wc->w_target_from;
929out: 938 to = wc->w_target_to;
939 } else {
940 from = 0;
941 to = PAGE_CACHE_SIZE;
942 }
943
944 for(i = 0; i < wc->w_num_pages; i++) {
945 tmppage = wc->w_pages[i];
930 946
931 return bytes ? (unsigned int)bytes : ret; 947 if (ocfs2_should_order_data(inode))
948 walk_page_buffers(wc->w_handle, page_buffers(tmppage),
949 from, to, NULL,
950 ocfs2_journal_dirty_data);
951
952 block_commit_write(tmppage, from, to);
953 }
932} 954}
933 955
934/* 956static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
935 * Map, fill and write a page to disk. 957 struct ocfs2_write_ctxt *wc,
936 * 958 struct page *page, u32 cpos,
937 * The work of copying data is done via callback. Newly allocated 959 loff_t user_pos, unsigned user_len,
938 * pages which don't take user data will be zero'd (set 'new' to 960 int new)
939 * indicate an allocating write)
940 *
941 * Returns a negative error code or the number of bytes copied into
942 * the page.
943 */
944static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
945 u64 *p_blkno, struct page *page,
946 struct ocfs2_write_ctxt *wc, int new)
947{ 961{
948 int ret, copied = 0; 962 int ret;
949 unsigned int from = 0, to = 0; 963 unsigned int map_from = 0, map_to = 0;
950 unsigned int cluster_start, cluster_end; 964 unsigned int cluster_start, cluster_end;
951 unsigned int zero_from = 0, zero_to = 0; 965 unsigned int user_data_from = 0, user_data_to = 0;
952 966
953 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, 967 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
954 &cluster_start, &cluster_end); 968 &cluster_start, &cluster_end);
955 969
956 if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index 970 if (page == wc->w_target_page) {
957 && !wc->w_finished_copy) { 971 map_from = user_pos & (PAGE_CACHE_SIZE - 1);
958 972 map_to = map_from + user_len;
959 wc->w_this_page = page; 973
960 wc->w_this_page_new = new; 974 if (new)
961 ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); 975 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
962 if (ret < 0) { 976 cluster_start, cluster_end,
977 new);
978 else
979 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
980 map_from, map_to, new);
981 if (ret) {
963 mlog_errno(ret); 982 mlog_errno(ret);
964 goto out; 983 goto out;
965 } 984 }
966 985
967 copied = ret; 986 user_data_from = map_from;
968 987 user_data_to = map_to;
969 zero_from = from;
970 zero_to = to;
971 if (new) { 988 if (new) {
972 from = cluster_start; 989 map_from = cluster_start;
973 to = cluster_end; 990 map_to = cluster_end;
974 } 991 }
992
993 wc->w_target_from = map_from;
994 wc->w_target_to = map_to;
975 } else { 995 } else {
976 /* 996 /*
977 * If we haven't allocated the new page yet, we 997 * If we haven't allocated the new page yet, we
@@ -980,11 +1000,11 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
980 */ 1000 */
981 BUG_ON(!new); 1001 BUG_ON(!new);
982 1002
983 from = cluster_start; 1003 map_from = cluster_start;
984 to = cluster_end; 1004 map_to = cluster_end;
985 1005
986 ret = ocfs2_map_page_blocks(page, p_blkno, inode, 1006 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
987 cluster_start, cluster_end, 1); 1007 cluster_start, cluster_end, new);
988 if (ret) { 1008 if (ret) {
989 mlog_errno(ret); 1009 mlog_errno(ret);
990 goto out; 1010 goto out;
@@ -1003,108 +1023,113 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
1003 */ 1023 */
1004 if (new && !PageUptodate(page)) 1024 if (new && !PageUptodate(page))
1005 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), 1025 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
1006 wc->w_cpos, zero_from, zero_to); 1026 cpos, user_data_from, user_data_to);
1007 1027
1008 flush_dcache_page(page); 1028 flush_dcache_page(page);
1009 1029
1010 if (ocfs2_should_order_data(inode)) {
1011 ret = walk_page_buffers(handle,
1012 page_buffers(page),
1013 from, to, NULL,
1014 ocfs2_journal_dirty_data);
1015 if (ret < 0)
1016 mlog_errno(ret);
1017 }
1018
1019 /*
1020 * We don't use generic_commit_write() because we need to
1021 * handle our own i_size update.
1022 */
1023 ret = block_commit_write(page, from, to);
1024 if (ret)
1025 mlog_errno(ret);
1026out: 1030out:
1027 1031 return ret;
1028 return copied ? copied : ret;
1029} 1032}
1030 1033
1031/* 1034/*
1032 * Do the actual write of some data into an inode. Optionally allocate 1035 * This function will only grab one clusters worth of pages.
1033 * in order to fulfill the write.
1034 *
1035 * cpos is the logical cluster offset within the file to write at
1036 *
1037 * 'phys' is the physical mapping of that offset. a 'phys' value of
1038 * zero indicates that allocation is required. In this case, data_ac
1039 * and meta_ac should be valid (meta_ac can be null if metadata
1040 * allocation isn't required).
1041 */ 1036 */
1042static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, 1037static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1043 struct buffer_head *di_bh, 1038 struct ocfs2_write_ctxt *wc,
1044 struct ocfs2_alloc_context *data_ac, 1039 u32 cpos, loff_t user_pos, int new,
1045 struct ocfs2_alloc_context *meta_ac, 1040 struct page *mmap_page)
1046 struct ocfs2_write_ctxt *wc)
1047{ 1041{
1048 int ret, i, numpages = 1, new; 1042 int ret = 0, i;
1049 unsigned int copied = 0; 1043 unsigned long start, target_index, index;
1050 u32 tmp_pos;
1051 u64 v_blkno, p_blkno;
1052 struct address_space *mapping = file->f_mapping;
1053 struct inode *inode = mapping->host; 1044 struct inode *inode = mapping->host;
1054 unsigned long index, start;
1055 struct page **cpages;
1056 1045
1057 new = phys == 0 ? 1 : 0; 1046 target_index = user_pos >> PAGE_CACHE_SHIFT;
1058 1047
1059 /* 1048 /*
1060 * Figure out how many pages we'll be manipulating here. For 1049 * Figure out how many pages we'll be manipulating here. For
1061 * non allocating write, we just change the one 1050 * non allocating write, we just change the one
1062 * page. Otherwise, we'll need a whole clusters worth. 1051 * page. Otherwise, we'll need a whole clusters worth.
1063 */ 1052 */
1064 if (new)
1065 numpages = ocfs2_pages_per_cluster(inode->i_sb);
1066
1067 cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
1068 if (!cpages) {
1069 ret = -ENOMEM;
1070 mlog_errno(ret);
1071 return ret;
1072 }
1073
1074 /*
1075 * Fill our page array first. That way we've grabbed enough so
1076 * that we can zero and flush if we error after adding the
1077 * extent.
1078 */
1079 if (new) { 1053 if (new) {
1080 start = ocfs2_align_clusters_to_page_index(inode->i_sb, 1054 wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
1081 wc->w_cpos); 1055 start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
1082 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
1083 } else { 1056 } else {
1084 start = wc->w_pos >> PAGE_CACHE_SHIFT; 1057 wc->w_num_pages = 1;
1085 v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; 1058 start = target_index;
1086 } 1059 }
1087 1060
1088 for(i = 0; i < numpages; i++) { 1061 for(i = 0; i < wc->w_num_pages; i++) {
1089 index = start + i; 1062 index = start + i;
1090 1063
1091 cpages[i] = find_or_create_page(mapping, index, GFP_NOFS); 1064 if (index == target_index && mmap_page) {
1092 if (!cpages[i]) { 1065 /*
1093 ret = -ENOMEM; 1066 * ocfs2_pagemkwrite() is a little different
1094 mlog_errno(ret); 1067 * and wants us to directly use the page
1095 goto out; 1068 * passed in.
1069 */
1070 lock_page(mmap_page);
1071
1072 if (mmap_page->mapping != mapping) {
1073 unlock_page(mmap_page);
1074 /*
1075 * Sanity check - the locking in
1076 * ocfs2_pagemkwrite() should ensure
1077 * that this code doesn't trigger.
1078 */
1079 ret = -EINVAL;
1080 mlog_errno(ret);
1081 goto out;
1082 }
1083
1084 page_cache_get(mmap_page);
1085 wc->w_pages[i] = mmap_page;
1086 } else {
1087 wc->w_pages[i] = find_or_create_page(mapping, index,
1088 GFP_NOFS);
1089 if (!wc->w_pages[i]) {
1090 ret = -ENOMEM;
1091 mlog_errno(ret);
1092 goto out;
1093 }
1096 } 1094 }
1095
1096 if (index == target_index)
1097 wc->w_target_page = wc->w_pages[i];
1097 } 1098 }
1099out:
1100 return ret;
1101}
1102
1103/*
1104 * Prepare a single cluster for write one cluster into the file.
1105 */
1106static int ocfs2_write_cluster(struct address_space *mapping,
1107 u32 phys, unsigned int unwritten,
1108 struct ocfs2_alloc_context *data_ac,
1109 struct ocfs2_alloc_context *meta_ac,
1110 struct ocfs2_write_ctxt *wc, u32 cpos,
1111 loff_t user_pos, unsigned user_len)
1112{
1113 int ret, i, new, should_zero = 0;
1114 u64 v_blkno, p_blkno;
1115 struct inode *inode = mapping->host;
1116
1117 new = phys == 0 ? 1 : 0;
1118 if (new || unwritten)
1119 should_zero = 1;
1098 1120
1099 if (new) { 1121 if (new) {
1122 u32 tmp_pos;
1123
1100 /* 1124 /*
1101 * This is safe to call with the page locks - it won't take 1125 * This is safe to call with the page locks - it won't take
1102 * any additional semaphores or cluster locks. 1126 * any additional semaphores or cluster locks.
1103 */ 1127 */
1104 tmp_pos = wc->w_cpos; 1128 tmp_pos = cpos;
1105 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, 1129 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
1106 &tmp_pos, 1, di_bh, handle, 1130 &tmp_pos, 1, 0, wc->w_di_bh,
1107 data_ac, meta_ac, NULL); 1131 wc->w_handle, data_ac,
1132 meta_ac, NULL);
1108 /* 1133 /*
1109 * This shouldn't happen because we must have already 1134 * This shouldn't happen because we must have already
1110 * calculated the correct meta data allocation required. The 1135 * calculated the correct meta data allocation required. The
@@ -1121,159 +1146,433 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
1121 mlog_errno(ret); 1146 mlog_errno(ret);
1122 goto out; 1147 goto out;
1123 } 1148 }
1149 } else if (unwritten) {
1150 ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
1151 wc->w_handle, cpos, 1, phys,
1152 meta_ac, &wc->w_dealloc);
1153 if (ret < 0) {
1154 mlog_errno(ret);
1155 goto out;
1156 }
1124 } 1157 }
1125 1158
1159 if (should_zero)
1160 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
1161 else
1162 v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
1163
1164 /*
1165 * The only reason this should fail is due to an inability to
1166 * find the extent added.
1167 */
1126 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, 1168 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
1127 NULL); 1169 NULL);
1128 if (ret < 0) { 1170 if (ret < 0) {
1129 1171 ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, "
1130 /* 1172 "at logical block %llu",
1131 * XXX: Should we go readonly here? 1173 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1132 */ 1174 (unsigned long long)v_blkno);
1133
1134 mlog_errno(ret);
1135 goto out; 1175 goto out;
1136 } 1176 }
1137 1177
1138 BUG_ON(p_blkno == 0); 1178 BUG_ON(p_blkno == 0);
1139 1179
1140 for(i = 0; i < numpages; i++) { 1180 for(i = 0; i < wc->w_num_pages; i++) {
1141 ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], 1181 int tmpret;
1142 wc, new); 1182
1143 if (ret < 0) { 1183 tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
1144 mlog_errno(ret); 1184 wc->w_pages[i], cpos,
1145 goto out; 1185 user_pos, user_len,
1186 should_zero);
1187 if (tmpret) {
1188 mlog_errno(tmpret);
1189 if (ret == 0)
1190 tmpret = ret;
1146 } 1191 }
1147
1148 copied += ret;
1149 } 1192 }
1150 1193
1194 /*
1195 * We only have cleanup to do in case of allocating write.
1196 */
1197 if (ret && new)
1198 ocfs2_write_failure(inode, wc, user_pos, user_len);
1199
1151out: 1200out:
1152 for(i = 0; i < numpages; i++) { 1201
1153 unlock_page(cpages[i]); 1202 return ret;
1154 mark_page_accessed(cpages[i]); 1203}
1155 page_cache_release(cpages[i]); 1204
1205static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
1206 struct ocfs2_alloc_context *data_ac,
1207 struct ocfs2_alloc_context *meta_ac,
1208 struct ocfs2_write_ctxt *wc,
1209 loff_t pos, unsigned len)
1210{
1211 int ret, i;
1212 struct ocfs2_write_cluster_desc *desc;
1213
1214 for (i = 0; i < wc->w_clen; i++) {
1215 desc = &wc->w_desc[i];
1216
1217 ret = ocfs2_write_cluster(mapping, desc->c_phys,
1218 desc->c_unwritten, data_ac, meta_ac,
1219 wc, desc->c_cpos, pos, len);
1220 if (ret) {
1221 mlog_errno(ret);
1222 goto out;
1223 }
1156 } 1224 }
1157 kfree(cpages);
1158 1225
1159 return copied ? copied : ret; 1226 ret = 0;
1227out:
1228 return ret;
1160} 1229}
1161 1230
1162static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, 1231/*
1163 struct ocfs2_super *osb, loff_t pos, 1232 * ocfs2_write_end() wants to know which parts of the target page it
1164 size_t count, ocfs2_page_writer *cb, 1233 * should complete the write on. It's easiest to compute them ahead of
1165 void *cb_priv) 1234 * time when a more complete view of the write is available.
1235 */
1236static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
1237 struct ocfs2_write_ctxt *wc,
1238 loff_t pos, unsigned len, int alloc)
1166{ 1239{
1167 wc->w_count = count; 1240 struct ocfs2_write_cluster_desc *desc;
1168 wc->w_pos = pos;
1169 wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
1170 wc->w_finished_copy = 0;
1171 1241
1172 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) 1242 wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
1173 wc->w_large_pages = 1; 1243 wc->w_target_to = wc->w_target_from + len;
1174 else
1175 wc->w_large_pages = 0;
1176 1244
1177 wc->w_write_data_page = cb; 1245 if (alloc == 0)
1178 wc->w_private = cb_priv; 1246 return;
1247
1248 /*
1249 * Allocating write - we may have different boundaries based
1250 * on page size and cluster size.
1251 *
1252 * NOTE: We can no longer compute one value from the other as
1253 * the actual write length and user provided length may be
1254 * different.
1255 */
1256
1257 if (wc->w_large_pages) {
1258 /*
1259 * We only care about the 1st and last cluster within
1260 * our range and whether they should be zero'd or not. Either
1261 * value may be extended out to the start/end of a
1262 * newly allocated cluster.
1263 */
1264 desc = &wc->w_desc[0];
1265 if (ocfs2_should_zero_cluster(desc))
1266 ocfs2_figure_cluster_boundaries(osb,
1267 desc->c_cpos,
1268 &wc->w_target_from,
1269 NULL);
1270
1271 desc = &wc->w_desc[wc->w_clen - 1];
1272 if (ocfs2_should_zero_cluster(desc))
1273 ocfs2_figure_cluster_boundaries(osb,
1274 desc->c_cpos,
1275 NULL,
1276 &wc->w_target_to);
1277 } else {
1278 wc->w_target_from = 0;
1279 wc->w_target_to = PAGE_CACHE_SIZE;
1280 }
1179} 1281}
1180 1282
1181/* 1283/*
1182 * Write a cluster to an inode. The cluster may not be allocated yet, 1284 * Populate each single-cluster write descriptor in the write context
1183 * in which case it will be. This only exists for buffered writes - 1285 * with information about the i/o to be done.
1184 * O_DIRECT takes a more "traditional" path through the kernel.
1185 *
1186 * The caller is responsible for incrementing pos, written counts, etc
1187 * 1286 *
1188 * For file systems that don't support sparse files, pre-allocation 1287 * Returns the number of clusters that will have to be allocated, as
1189 * and page zeroing up until cpos should be done prior to this 1288 * well as a worst case estimate of the number of extent records that
1190 * function call. 1289 * would have to be created during a write to an unwritten region.
1191 *
1192 * Callers should be holding i_sem, and the rw cluster lock.
1193 *
1194 * Returns the number of user bytes written, or less than zero for
1195 * error.
1196 */ 1290 */
1197ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, 1291static int ocfs2_populate_write_desc(struct inode *inode,
1198 size_t count, ocfs2_page_writer *actor, 1292 struct ocfs2_write_ctxt *wc,
1199 void *priv) 1293 unsigned int *clusters_to_alloc,
1294 unsigned int *extents_to_split)
1295{
1296 int ret;
1297 struct ocfs2_write_cluster_desc *desc;
1298 unsigned int num_clusters = 0;
1299 unsigned int ext_flags = 0;
1300 u32 phys = 0;
1301 int i;
1302
1303 *clusters_to_alloc = 0;
1304 *extents_to_split = 0;
1305
1306 for (i = 0; i < wc->w_clen; i++) {
1307 desc = &wc->w_desc[i];
1308 desc->c_cpos = wc->w_cpos + i;
1309
1310 if (num_clusters == 0) {
1311 /*
1312 * Need to look up the next extent record.
1313 */
1314 ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
1315 &num_clusters, &ext_flags);
1316 if (ret) {
1317 mlog_errno(ret);
1318 goto out;
1319 }
1320
1321 /*
1322 * Assume worst case - that we're writing in
1323 * the middle of the extent.
1324 *
1325 * We can assume that the write proceeds from
1326 * left to right, in which case the extent
1327 * insert code is smart enough to coalesce the
1328 * next splits into the previous records created.
1329 */
1330 if (ext_flags & OCFS2_EXT_UNWRITTEN)
1331 *extents_to_split = *extents_to_split + 2;
1332 } else if (phys) {
1333 /*
1334 * Only increment phys if it doesn't describe
1335 * a hole.
1336 */
1337 phys++;
1338 }
1339
1340 desc->c_phys = phys;
1341 if (phys == 0) {
1342 desc->c_new = 1;
1343 *clusters_to_alloc = *clusters_to_alloc + 1;
1344 }
1345 if (ext_flags & OCFS2_EXT_UNWRITTEN)
1346 desc->c_unwritten = 1;
1347
1348 num_clusters--;
1349 }
1350
1351 ret = 0;
1352out:
1353 return ret;
1354}
1355
1356int ocfs2_write_begin_nolock(struct address_space *mapping,
1357 loff_t pos, unsigned len, unsigned flags,
1358 struct page **pagep, void **fsdata,
1359 struct buffer_head *di_bh, struct page *mmap_page)
1200{ 1360{
1201 int ret, credits = OCFS2_INODE_UPDATE_CREDITS; 1361 int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
1202 ssize_t written = 0; 1362 unsigned int clusters_to_alloc, extents_to_split;
1203 u32 phys; 1363 struct ocfs2_write_ctxt *wc;
1204 struct inode *inode = file->f_mapping->host; 1364 struct inode *inode = mapping->host;
1205 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1365 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1206 struct buffer_head *di_bh = NULL;
1207 struct ocfs2_dinode *di; 1366 struct ocfs2_dinode *di;
1208 struct ocfs2_alloc_context *data_ac = NULL; 1367 struct ocfs2_alloc_context *data_ac = NULL;
1209 struct ocfs2_alloc_context *meta_ac = NULL; 1368 struct ocfs2_alloc_context *meta_ac = NULL;
1210 handle_t *handle; 1369 handle_t *handle;
1211 struct ocfs2_write_ctxt wc;
1212
1213 ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
1214 1370
1215 ret = ocfs2_meta_lock(inode, &di_bh, 1); 1371 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
1216 if (ret) { 1372 if (ret) {
1217 mlog_errno(ret); 1373 mlog_errno(ret);
1218 goto out; 1374 return ret;
1219 } 1375 }
1220 di = (struct ocfs2_dinode *)di_bh->b_data;
1221
1222 /*
1223 * Take alloc sem here to prevent concurrent lookups. That way
1224 * the mapping, zeroing and tree manipulation within
1225 * ocfs2_write() will be safe against ->readpage(). This
1226 * should also serve to lock out allocation from a shared
1227 * writeable region.
1228 */
1229 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1230 1376
1231 ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); 1377 ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
1378 &extents_to_split);
1232 if (ret) { 1379 if (ret) {
1233 mlog_errno(ret); 1380 mlog_errno(ret);
1234 goto out_meta; 1381 goto out;
1235 } 1382 }
1236 1383
1237 /* phys == 0 means that allocation is required. */ 1384 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1238 if (phys == 0) { 1385
1239 ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); 1386 /*
1387 * We set w_target_from, w_target_to here so that
1388 * ocfs2_write_end() knows which range in the target page to
1389 * write out. An allocation requires that we write the entire
1390 * cluster range.
1391 */
1392 if (clusters_to_alloc || extents_to_split) {
1393 /*
1394 * XXX: We are stretching the limits of
1395 * ocfs2_lock_allocators(). It greatly over-estimates
1396 * the work to be done.
1397 */
1398 ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
1399 extents_to_split, &data_ac, &meta_ac);
1240 if (ret) { 1400 if (ret) {
1241 mlog_errno(ret); 1401 mlog_errno(ret);
1242 goto out_meta; 1402 goto out;
1243 } 1403 }
1244 1404
1245 credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); 1405 credits = ocfs2_calc_extend_credits(inode->i_sb, di,
1246 } 1406 clusters_to_alloc);
1247 1407
1248 ret = ocfs2_data_lock(inode, 1);
1249 if (ret) {
1250 mlog_errno(ret);
1251 goto out_meta;
1252 } 1408 }
1253 1409
1410 ocfs2_set_target_boundaries(osb, wc, pos, len,
1411 clusters_to_alloc + extents_to_split);
1412
1254 handle = ocfs2_start_trans(osb, credits); 1413 handle = ocfs2_start_trans(osb, credits);
1255 if (IS_ERR(handle)) { 1414 if (IS_ERR(handle)) {
1256 ret = PTR_ERR(handle); 1415 ret = PTR_ERR(handle);
1257 mlog_errno(ret); 1416 mlog_errno(ret);
1258 goto out_data; 1417 goto out;
1259 } 1418 }
1260 1419
1261 written = ocfs2_write(file, phys, handle, di_bh, data_ac, 1420 wc->w_handle = handle;
1262 meta_ac, &wc); 1421
1263 if (written < 0) { 1422 /*
1264 ret = written; 1423 * We don't want this to fail in ocfs2_write_end(), so do it
1424 * here.
1425 */
1426 ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
1427 OCFS2_JOURNAL_ACCESS_WRITE);
1428 if (ret) {
1265 mlog_errno(ret); 1429 mlog_errno(ret);
1266 goto out_commit; 1430 goto out_commit;
1267 } 1431 }
1268 1432
1269 ret = ocfs2_journal_access(handle, inode, di_bh, 1433 /*
1270 OCFS2_JOURNAL_ACCESS_WRITE); 1434 * Fill our page array first. That way we've grabbed enough so
1435 * that we can zero and flush if we error after adding the
1436 * extent.
1437 */
1438 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
1439 clusters_to_alloc + extents_to_split,
1440 mmap_page);
1271 if (ret) { 1441 if (ret) {
1272 mlog_errno(ret); 1442 mlog_errno(ret);
1273 goto out_commit; 1443 goto out_commit;
1274 } 1444 }
1275 1445
1276 pos += written; 1446 ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
1447 len);
1448 if (ret) {
1449 mlog_errno(ret);
1450 goto out_commit;
1451 }
1452
1453 if (data_ac)
1454 ocfs2_free_alloc_context(data_ac);
1455 if (meta_ac)
1456 ocfs2_free_alloc_context(meta_ac);
1457
1458 *pagep = wc->w_target_page;
1459 *fsdata = wc;
1460 return 0;
1461out_commit:
1462 ocfs2_commit_trans(osb, handle);
1463
1464out:
1465 ocfs2_free_write_ctxt(wc);
1466
1467 if (data_ac)
1468 ocfs2_free_alloc_context(data_ac);
1469 if (meta_ac)
1470 ocfs2_free_alloc_context(meta_ac);
1471 return ret;
1472}
1473
1474int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1475 loff_t pos, unsigned len, unsigned flags,
1476 struct page **pagep, void **fsdata)
1477{
1478 int ret;
1479 struct buffer_head *di_bh = NULL;
1480 struct inode *inode = mapping->host;
1481
1482 ret = ocfs2_meta_lock(inode, &di_bh, 1);
1483 if (ret) {
1484 mlog_errno(ret);
1485 return ret;
1486 }
1487
1488 /*
1489 * Take alloc sem here to prevent concurrent lookups. That way
1490 * the mapping, zeroing and tree manipulation within
1491 * ocfs2_write() will be safe against ->readpage(). This
1492 * should also serve to lock out allocation from a shared
1493 * writeable region.
1494 */
1495 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1496
1497 ret = ocfs2_data_lock(inode, 1);
1498 if (ret) {
1499 mlog_errno(ret);
1500 goto out_fail;
1501 }
1502
1503 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
1504 fsdata, di_bh, NULL);
1505 if (ret) {
1506 mlog_errno(ret);
1507 goto out_fail_data;
1508 }
1509
1510 brelse(di_bh);
1511
1512 return 0;
1513
1514out_fail_data:
1515 ocfs2_data_unlock(inode, 1);
1516out_fail:
1517 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1518
1519 brelse(di_bh);
1520 ocfs2_meta_unlock(inode, 1);
1521
1522 return ret;
1523}
1524
1525int ocfs2_write_end_nolock(struct address_space *mapping,
1526 loff_t pos, unsigned len, unsigned copied,
1527 struct page *page, void *fsdata)
1528{
1529 int i;
1530 unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
1531 struct inode *inode = mapping->host;
1532 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1533 struct ocfs2_write_ctxt *wc = fsdata;
1534 struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1535 handle_t *handle = wc->w_handle;
1536 struct page *tmppage;
1537
1538 if (unlikely(copied < len)) {
1539 if (!PageUptodate(wc->w_target_page))
1540 copied = 0;
1541
1542 ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
1543 start+len);
1544 }
1545 flush_dcache_page(wc->w_target_page);
1546
1547 for(i = 0; i < wc->w_num_pages; i++) {
1548 tmppage = wc->w_pages[i];
1549
1550 if (tmppage == wc->w_target_page) {
1551 from = wc->w_target_from;
1552 to = wc->w_target_to;
1553
1554 BUG_ON(from > PAGE_CACHE_SIZE ||
1555 to > PAGE_CACHE_SIZE ||
1556 to < from);
1557 } else {
1558 /*
1559 * Pages adjacent to the target (if any) imply
1560 * a hole-filling write in which case we want
1561 * to flush their entire range.
1562 */
1563 from = 0;
1564 to = PAGE_CACHE_SIZE;
1565 }
1566
1567 if (ocfs2_should_order_data(inode))
1568 walk_page_buffers(wc->w_handle, page_buffers(tmppage),
1569 from, to, NULL,
1570 ocfs2_journal_dirty_data);
1571
1572 block_commit_write(tmppage, from, to);
1573 }
1574
1575 pos += copied;
1277 if (pos > inode->i_size) { 1576 if (pos > inode->i_size) {
1278 i_size_write(inode, pos); 1577 i_size_write(inode, pos);
1279 mark_inode_dirty(inode); 1578 mark_inode_dirty(inode);
@@ -1283,29 +1582,31 @@ ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
1283 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1582 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1284 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); 1583 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
1285 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1584 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1585 ocfs2_journal_dirty(handle, wc->w_di_bh);
1286 1586
1287 ret = ocfs2_journal_dirty(handle, di_bh);
1288 if (ret)
1289 mlog_errno(ret);
1290
1291out_commit:
1292 ocfs2_commit_trans(osb, handle); 1587 ocfs2_commit_trans(osb, handle);
1293 1588
1294out_data: 1589 ocfs2_run_deallocs(osb, &wc->w_dealloc);
1295 ocfs2_data_unlock(inode, 1); 1590
1591 ocfs2_free_write_ctxt(wc);
1592
1593 return copied;
1594}
1595
1596int ocfs2_write_end(struct file *file, struct address_space *mapping,
1597 loff_t pos, unsigned len, unsigned copied,
1598 struct page *page, void *fsdata)
1599{
1600 int ret;
1601 struct inode *inode = mapping->host;
1296 1602
1297out_meta: 1603 ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
1604
1605 ocfs2_data_unlock(inode, 1);
1298 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1606 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1299 ocfs2_meta_unlock(inode, 1); 1607 ocfs2_meta_unlock(inode, 1);
1300 1608
1301out: 1609 return ret;
1302 brelse(di_bh);
1303 if (data_ac)
1304 ocfs2_free_alloc_context(data_ac);
1305 if (meta_ac)
1306 ocfs2_free_alloc_context(meta_ac);
1307
1308 return written ? written : ret;
1309} 1610}
1310 1611
1311const struct address_space_operations ocfs2_aops = { 1612const struct address_space_operations ocfs2_aops = {
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 45821d479b5a..389579bd64e3 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -42,57 +42,22 @@ int walk_page_buffers( handle_t *handle,
42 int (*fn)( handle_t *handle, 42 int (*fn)( handle_t *handle,
43 struct buffer_head *bh)); 43 struct buffer_head *bh));
44 44
45struct ocfs2_write_ctxt; 45int ocfs2_write_begin(struct file *file, struct address_space *mapping,
46typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *, 46 loff_t pos, unsigned len, unsigned flags,
47 u64 *, unsigned int *, unsigned int *); 47 struct page **pagep, void **fsdata);
48 48
49ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, 49int ocfs2_write_end(struct file *file, struct address_space *mapping,
50 size_t count, ocfs2_page_writer *actor, 50 loff_t pos, unsigned len, unsigned copied,
51 void *priv); 51 struct page *page, void *fsdata);
52 52
53struct ocfs2_write_ctxt { 53int ocfs2_write_end_nolock(struct address_space *mapping,
54 size_t w_count; 54 loff_t pos, unsigned len, unsigned copied,
55 loff_t w_pos; 55 struct page *page, void *fsdata);
56 u32 w_cpos;
57 unsigned int w_finished_copy;
58 56
59 /* This is true if page_size > cluster_size */ 57int ocfs2_write_begin_nolock(struct address_space *mapping,
60 unsigned int w_large_pages; 58 loff_t pos, unsigned len, unsigned flags,
61 59 struct page **pagep, void **fsdata,
62 /* Filler callback and private data */ 60 struct buffer_head *di_bh, struct page *mmap_page);
63 ocfs2_page_writer *w_write_data_page;
64 void *w_private;
65
66 /* Only valid for the filler callback */
67 struct page *w_this_page;
68 unsigned int w_this_page_new;
69};
70
71struct ocfs2_buffered_write_priv {
72 char *b_src_buf;
73 const struct iovec *b_cur_iov; /* Current iovec */
74 size_t b_cur_off; /* Offset in the
75 * current iovec */
76};
77int ocfs2_map_and_write_user_data(struct inode *inode,
78 struct ocfs2_write_ctxt *wc,
79 u64 *p_blkno,
80 unsigned int *ret_from,
81 unsigned int *ret_to);
82
83struct ocfs2_splice_write_priv {
84 struct splice_desc *s_sd;
85 struct pipe_buffer *s_buf;
86 struct pipe_inode_info *s_pipe;
87 /* Neither offset value is ever larger than one page */
88 unsigned int s_offset;
89 unsigned int s_buf_offset;
90};
91int ocfs2_map_and_write_splice_data(struct inode *inode,
92 struct ocfs2_write_ctxt *wc,
93 u64 *p_blkno,
94 unsigned int *ret_from,
95 unsigned int *ret_to);
96 61
97/* all ocfs2_dio_end_io()'s fault */ 62/* all ocfs2_dio_end_io()'s fault */
98#define ocfs2_iocb_is_rw_locked(iocb) \ 63#define ocfs2_iocb_is_rw_locked(iocb) \
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 979113479c66..2bd7f788cf34 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1335,6 +1335,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1335 ret = wait_event_interruptible(o2hb_steady_queue, 1335 ret = wait_event_interruptible(o2hb_steady_queue,
1336 atomic_read(&reg->hr_steady_iterations) == 0); 1336 atomic_read(&reg->hr_steady_iterations) == 0);
1337 if (ret) { 1337 if (ret) {
1338 /* We got interrupted (hello ptrace!). Clean up */
1338 spin_lock(&o2hb_live_lock); 1339 spin_lock(&o2hb_live_lock);
1339 hb_task = reg->hr_task; 1340 hb_task = reg->hr_task;
1340 reg->hr_task = NULL; 1341 reg->hr_task = NULL;
@@ -1345,7 +1346,16 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1345 goto out; 1346 goto out;
1346 } 1347 }
1347 1348
1348 ret = count; 1349 /* Ok, we were woken. Make sure it wasn't by drop_item() */
1350 spin_lock(&o2hb_live_lock);
1351 hb_task = reg->hr_task;
1352 spin_unlock(&o2hb_live_lock);
1353
1354 if (hb_task)
1355 ret = count;
1356 else
1357 ret = -EIO;
1358
1349out: 1359out:
1350 if (filp) 1360 if (filp)
1351 fput(filp); 1361 fput(filp);
@@ -1523,6 +1533,15 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
1523 if (hb_task) 1533 if (hb_task)
1524 kthread_stop(hb_task); 1534 kthread_stop(hb_task);
1525 1535
1536 /*
1537 * If we're racing a dev_write(), we need to wake them. They will
1538 * check reg->hr_task
1539 */
1540 if (atomic_read(&reg->hr_steady_iterations) != 0) {
1541 atomic_set(&reg->hr_steady_iterations, 0);
1542 wake_up(&o2hb_steady_queue);
1543 }
1544
1526 config_item_put(item); 1545 config_item_put(item);
1527} 1546}
1528 1547
@@ -1665,7 +1684,67 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
1665} 1684}
1666EXPORT_SYMBOL_GPL(o2hb_setup_callback); 1685EXPORT_SYMBOL_GPL(o2hb_setup_callback);
1667 1686
1668int o2hb_register_callback(struct o2hb_callback_func *hc) 1687static struct o2hb_region *o2hb_find_region(const char *region_uuid)
1688{
1689 struct o2hb_region *p, *reg = NULL;
1690
1691 assert_spin_locked(&o2hb_live_lock);
1692
1693 list_for_each_entry(p, &o2hb_all_regions, hr_all_item) {
1694 if (!strcmp(region_uuid, config_item_name(&p->hr_item))) {
1695 reg = p;
1696 break;
1697 }
1698 }
1699
1700 return reg;
1701}
1702
1703static int o2hb_region_get(const char *region_uuid)
1704{
1705 int ret = 0;
1706 struct o2hb_region *reg;
1707
1708 spin_lock(&o2hb_live_lock);
1709
1710 reg = o2hb_find_region(region_uuid);
1711 if (!reg)
1712 ret = -ENOENT;
1713 spin_unlock(&o2hb_live_lock);
1714
1715 if (ret)
1716 goto out;
1717
1718 ret = o2nm_depend_this_node();
1719 if (ret)
1720 goto out;
1721
1722 ret = o2nm_depend_item(&reg->hr_item);
1723 if (ret)
1724 o2nm_undepend_this_node();
1725
1726out:
1727 return ret;
1728}
1729
1730static void o2hb_region_put(const char *region_uuid)
1731{
1732 struct o2hb_region *reg;
1733
1734 spin_lock(&o2hb_live_lock);
1735
1736 reg = o2hb_find_region(region_uuid);
1737
1738 spin_unlock(&o2hb_live_lock);
1739
1740 if (reg) {
1741 o2nm_undepend_item(&reg->hr_item);
1742 o2nm_undepend_this_node();
1743 }
1744}
1745
1746int o2hb_register_callback(const char *region_uuid,
1747 struct o2hb_callback_func *hc)
1669{ 1748{
1670 struct o2hb_callback_func *tmp; 1749 struct o2hb_callback_func *tmp;
1671 struct list_head *iter; 1750 struct list_head *iter;
@@ -1681,6 +1760,12 @@ int o2hb_register_callback(struct o2hb_callback_func *hc)
1681 goto out; 1760 goto out;
1682 } 1761 }
1683 1762
1763 if (region_uuid) {
1764 ret = o2hb_region_get(region_uuid);
1765 if (ret)
1766 goto out;
1767 }
1768
1684 down_write(&o2hb_callback_sem); 1769 down_write(&o2hb_callback_sem);
1685 1770
1686 list_for_each(iter, &hbcall->list) { 1771 list_for_each(iter, &hbcall->list) {
@@ -1702,16 +1787,21 @@ out:
1702} 1787}
1703EXPORT_SYMBOL_GPL(o2hb_register_callback); 1788EXPORT_SYMBOL_GPL(o2hb_register_callback);
1704 1789
1705void o2hb_unregister_callback(struct o2hb_callback_func *hc) 1790void o2hb_unregister_callback(const char *region_uuid,
1791 struct o2hb_callback_func *hc)
1706{ 1792{
1707 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); 1793 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
1708 1794
1709 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", 1795 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
1710 __builtin_return_address(0), hc); 1796 __builtin_return_address(0), hc);
1711 1797
1798 /* XXX Can this happen _with_ a region reference? */
1712 if (list_empty(&hc->hc_item)) 1799 if (list_empty(&hc->hc_item))
1713 return; 1800 return;
1714 1801
1802 if (region_uuid)
1803 o2hb_region_put(region_uuid);
1804
1715 down_write(&o2hb_callback_sem); 1805 down_write(&o2hb_callback_sem);
1716 1806
1717 list_del_init(&hc->hc_item); 1807 list_del_init(&hc->hc_item);
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index cc6d40b39771..35397dd5ecdb 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -69,8 +69,10 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
69 o2hb_cb_func *func, 69 o2hb_cb_func *func,
70 void *data, 70 void *data,
71 int priority); 71 int priority);
72int o2hb_register_callback(struct o2hb_callback_func *hc); 72int o2hb_register_callback(const char *region_uuid,
73void o2hb_unregister_callback(struct o2hb_callback_func *hc); 73 struct o2hb_callback_func *hc);
74void o2hb_unregister_callback(const char *region_uuid,
75 struct o2hb_callback_func *hc);
74void o2hb_fill_node_map(unsigned long *map, 76void o2hb_fill_node_map(unsigned long *map,
75 unsigned bytes); 77 unsigned bytes);
76void o2hb_init(void); 78void o2hb_init(void);
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 2b205f5d5790..e9e042b93dbf 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -74,7 +74,6 @@ struct mlog_attribute {
74#define define_mask(_name) { \ 74#define define_mask(_name) { \
75 .attr = { \ 75 .attr = { \
76 .name = #_name, \ 76 .name = #_name, \
77 .owner = THIS_MODULE, \
78 .mode = S_IRUGO | S_IWUSR, \ 77 .mode = S_IRUGO | S_IWUSR, \
79 }, \ 78 }, \
80 .mask = ML_##_name, \ 79 .mask = ML_##_name, \
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 9f5ad0f01ce0..af2070da308b 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -900,6 +900,46 @@ static struct o2nm_cluster_group o2nm_cluster_group = {
900 }, 900 },
901}; 901};
902 902
903int o2nm_depend_item(struct config_item *item)
904{
905 return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item);
906}
907
908void o2nm_undepend_item(struct config_item *item)
909{
910 configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item);
911}
912
913int o2nm_depend_this_node(void)
914{
915 int ret = 0;
916 struct o2nm_node *local_node;
917
918 local_node = o2nm_get_node_by_num(o2nm_this_node());
919 if (!local_node) {
920 ret = -EINVAL;
921 goto out;
922 }
923
924 ret = o2nm_depend_item(&local_node->nd_item);
925 o2nm_node_put(local_node);
926
927out:
928 return ret;
929}
930
931void o2nm_undepend_this_node(void)
932{
933 struct o2nm_node *local_node;
934
935 local_node = o2nm_get_node_by_num(o2nm_this_node());
936 BUG_ON(!local_node);
937
938 o2nm_undepend_item(&local_node->nd_item);
939 o2nm_node_put(local_node);
940}
941
942
903static void __exit exit_o2nm(void) 943static void __exit exit_o2nm(void)
904{ 944{
905 if (ocfs2_table_header) 945 if (ocfs2_table_header)
@@ -934,7 +974,7 @@ static int __init init_o2nm(void)
934 goto out_sysctl; 974 goto out_sysctl;
935 975
936 config_group_init(&o2nm_cluster_group.cs_subsys.su_group); 976 config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
937 init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem); 977 mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex);
938 ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys); 978 ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys);
939 if (ret) { 979 if (ret) {
940 printk(KERN_ERR "nodemanager: Registration returned %d\n", ret); 980 printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
index 070522138ae2..7c860361b8dd 100644
--- a/fs/ocfs2/cluster/nodemanager.h
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -77,4 +77,9 @@ struct o2nm_node *o2nm_get_node_by_ip(__be32 addr);
77void o2nm_node_get(struct o2nm_node *node); 77void o2nm_node_get(struct o2nm_node *node);
78void o2nm_node_put(struct o2nm_node *node); 78void o2nm_node_put(struct o2nm_node *node);
79 79
80int o2nm_depend_item(struct config_item *item);
81void o2nm_undepend_item(struct config_item *item);
82int o2nm_depend_this_node(void);
83void o2nm_undepend_this_node(void);
84
80#endif /* O2CLUSTER_NODEMANAGER_H */ 85#endif /* O2CLUSTER_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 0b229a9c7952..f0bdfd944c44 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -261,14 +261,12 @@ out:
261 261
262static void o2net_complete_nodes_nsw(struct o2net_node *nn) 262static void o2net_complete_nodes_nsw(struct o2net_node *nn)
263{ 263{
264 struct list_head *iter, *tmp; 264 struct o2net_status_wait *nsw, *tmp;
265 unsigned int num_kills = 0; 265 unsigned int num_kills = 0;
266 struct o2net_status_wait *nsw;
267 266
268 assert_spin_locked(&nn->nn_lock); 267 assert_spin_locked(&nn->nn_lock);
269 268
270 list_for_each_safe(iter, tmp, &nn->nn_status_list) { 269 list_for_each_entry_safe(nsw, tmp, &nn->nn_status_list, ns_node_item) {
271 nsw = list_entry(iter, struct o2net_status_wait, ns_node_item);
272 o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0); 270 o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0);
273 num_kills++; 271 num_kills++;
274 } 272 }
@@ -764,13 +762,10 @@ EXPORT_SYMBOL_GPL(o2net_register_handler);
764 762
765void o2net_unregister_handler_list(struct list_head *list) 763void o2net_unregister_handler_list(struct list_head *list)
766{ 764{
767 struct list_head *pos, *n; 765 struct o2net_msg_handler *nmh, *n;
768 struct o2net_msg_handler *nmh;
769 766
770 write_lock(&o2net_handler_lock); 767 write_lock(&o2net_handler_lock);
771 list_for_each_safe(pos, n, list) { 768 list_for_each_entry_safe(nmh, n, list, nh_unregister_item) {
772 nmh = list_entry(pos, struct o2net_msg_handler,
773 nh_unregister_item);
774 mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n", 769 mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n",
775 nmh->nh_func, nmh->nh_msg_type, nmh->nh_key); 770 nmh->nh_func, nmh->nh_msg_type, nmh->nh_key);
776 rb_erase(&nmh->nh_node, &o2net_handler_tree); 771 rb_erase(&nmh->nh_node, &o2net_handler_tree);
@@ -1638,8 +1633,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
1638 1633
1639void o2net_unregister_hb_callbacks(void) 1634void o2net_unregister_hb_callbacks(void)
1640{ 1635{
1641 o2hb_unregister_callback(&o2net_hb_up); 1636 o2hb_unregister_callback(NULL, &o2net_hb_up);
1642 o2hb_unregister_callback(&o2net_hb_down); 1637 o2hb_unregister_callback(NULL, &o2net_hb_down);
1643} 1638}
1644 1639
1645int o2net_register_hb_callbacks(void) 1640int o2net_register_hb_callbacks(void)
@@ -1651,9 +1646,9 @@ int o2net_register_hb_callbacks(void)
1651 o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB, 1646 o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB,
1652 o2net_hb_node_up_cb, NULL, O2NET_HB_PRI); 1647 o2net_hb_node_up_cb, NULL, O2NET_HB_PRI);
1653 1648
1654 ret = o2hb_register_callback(&o2net_hb_up); 1649 ret = o2hb_register_callback(NULL, &o2net_hb_up);
1655 if (ret == 0) 1650 if (ret == 0)
1656 ret = o2hb_register_callback(&o2net_hb_down); 1651 ret = o2hb_register_callback(NULL, &o2net_hb_down);
1657 1652
1658 if (ret) 1653 if (ret)
1659 o2net_unregister_hb_callbacks(); 1654 o2net_unregister_hb_callbacks();
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c441ef1f2bad..0d5fdde959c8 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -368,7 +368,7 @@ int ocfs2_do_extend_dir(struct super_block *sb,
368 u32 offset = OCFS2_I(dir)->ip_clusters; 368 u32 offset = OCFS2_I(dir)->ip_clusters;
369 369
370 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset, 370 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
371 1, parent_fe_bh, handle, 371 1, 0, parent_fe_bh, handle,
372 data_ac, meta_ac, NULL); 372 data_ac, meta_ac, NULL);
373 BUG_ON(status == -EAGAIN); 373 BUG_ON(status == -EAGAIN);
374 if (status < 0) { 374 if (status < 0) {
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index d836b98dd99a..6954565b8ccb 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1128,8 +1128,8 @@ bail:
1128 1128
1129static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) 1129static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
1130{ 1130{
1131 o2hb_unregister_callback(&dlm->dlm_hb_up); 1131 o2hb_unregister_callback(NULL, &dlm->dlm_hb_up);
1132 o2hb_unregister_callback(&dlm->dlm_hb_down); 1132 o2hb_unregister_callback(NULL, &dlm->dlm_hb_down);
1133 o2net_unregister_handler_list(&dlm->dlm_domain_handlers); 1133 o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
1134} 1134}
1135 1135
@@ -1141,13 +1141,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1141 1141
1142 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, 1142 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
1143 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); 1143 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
1144 status = o2hb_register_callback(&dlm->dlm_hb_down); 1144 status = o2hb_register_callback(NULL, &dlm->dlm_hb_down);
1145 if (status) 1145 if (status)
1146 goto bail; 1146 goto bail;
1147 1147
1148 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, 1148 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
1149 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); 1149 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
1150 status = o2hb_register_callback(&dlm->dlm_hb_up); 1150 status = o2hb_register_callback(NULL, &dlm->dlm_hb_up);
1151 if (status) 1151 if (status)
1152 goto bail; 1152 goto bail;
1153 1153
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index fd8cb1badc9b..7418dc83de1c 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -592,7 +592,7 @@ static int __init init_dlmfs_fs(void)
592 sizeof(struct dlmfs_inode_private), 592 sizeof(struct dlmfs_inode_private),
593 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 593 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
594 SLAB_MEM_SPREAD), 594 SLAB_MEM_SPREAD),
595 dlmfs_init_once, NULL); 595 dlmfs_init_once);
596 if (!dlmfs_inode_cache) 596 if (!dlmfs_inode_cache)
597 return -ENOMEM; 597 return -ENOMEM;
598 cleanup_inode = 1; 598 cleanup_inode = 1;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 6edffca99d98..62e4a7daa286 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -192,25 +192,20 @@ static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
192static void dlm_dump_mles(struct dlm_ctxt *dlm) 192static void dlm_dump_mles(struct dlm_ctxt *dlm)
193{ 193{
194 struct dlm_master_list_entry *mle; 194 struct dlm_master_list_entry *mle;
195 struct list_head *iter;
196 195
197 mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); 196 mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
198 spin_lock(&dlm->master_lock); 197 spin_lock(&dlm->master_lock);
199 list_for_each(iter, &dlm->master_list) { 198 list_for_each_entry(mle, &dlm->master_list, list)
200 mle = list_entry(iter, struct dlm_master_list_entry, list);
201 dlm_print_one_mle(mle); 199 dlm_print_one_mle(mle);
202 }
203 spin_unlock(&dlm->master_lock); 200 spin_unlock(&dlm->master_lock);
204} 201}
205 202
206int dlm_dump_all_mles(const char __user *data, unsigned int len) 203int dlm_dump_all_mles(const char __user *data, unsigned int len)
207{ 204{
208 struct list_head *iter;
209 struct dlm_ctxt *dlm; 205 struct dlm_ctxt *dlm;
210 206
211 spin_lock(&dlm_domain_lock); 207 spin_lock(&dlm_domain_lock);
212 list_for_each(iter, &dlm_domains) { 208 list_for_each_entry(dlm, &dlm_domains, list) {
213 dlm = list_entry (iter, struct dlm_ctxt, list);
214 mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name); 209 mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
215 dlm_dump_mles(dlm); 210 dlm_dump_mles(dlm);
216 } 211 }
@@ -454,12 +449,10 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
454 char *name, unsigned int namelen) 449 char *name, unsigned int namelen)
455{ 450{
456 struct dlm_master_list_entry *tmpmle; 451 struct dlm_master_list_entry *tmpmle;
457 struct list_head *iter;
458 452
459 assert_spin_locked(&dlm->master_lock); 453 assert_spin_locked(&dlm->master_lock);
460 454
461 list_for_each(iter, &dlm->master_list) { 455 list_for_each_entry(tmpmle, &dlm->master_list, list) {
462 tmpmle = list_entry(iter, struct dlm_master_list_entry, list);
463 if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) 456 if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
464 continue; 457 continue;
465 dlm_get_mle(tmpmle); 458 dlm_get_mle(tmpmle);
@@ -472,13 +465,10 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
472void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up) 465void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
473{ 466{
474 struct dlm_master_list_entry *mle; 467 struct dlm_master_list_entry *mle;
475 struct list_head *iter;
476 468
477 assert_spin_locked(&dlm->spinlock); 469 assert_spin_locked(&dlm->spinlock);
478 470
479 list_for_each(iter, &dlm->mle_hb_events) { 471 list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
480 mle = list_entry(iter, struct dlm_master_list_entry,
481 hb_events);
482 if (node_up) 472 if (node_up)
483 dlm_mle_node_up(dlm, mle, NULL, idx); 473 dlm_mle_node_up(dlm, mle, NULL, idx);
484 else 474 else
@@ -520,7 +510,7 @@ int dlm_init_mle_cache(void)
520 dlm_mle_cache = kmem_cache_create("dlm_mle_cache", 510 dlm_mle_cache = kmem_cache_create("dlm_mle_cache",
521 sizeof(struct dlm_master_list_entry), 511 sizeof(struct dlm_master_list_entry),
522 0, SLAB_HWCACHE_ALIGN, 512 0, SLAB_HWCACHE_ALIGN,
523 NULL, NULL); 513 NULL);
524 if (dlm_mle_cache == NULL) 514 if (dlm_mle_cache == NULL)
525 return -ENOMEM; 515 return -ENOMEM;
526 return 0; 516 return 0;
@@ -2434,7 +2424,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2434 int ret; 2424 int ret;
2435 int i; 2425 int i;
2436 int count = 0; 2426 int count = 0;
2437 struct list_head *queue, *iter; 2427 struct list_head *queue;
2438 struct dlm_lock *lock; 2428 struct dlm_lock *lock;
2439 2429
2440 assert_spin_locked(&res->spinlock); 2430 assert_spin_locked(&res->spinlock);
@@ -2453,8 +2443,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2453 ret = 0; 2443 ret = 0;
2454 queue = &res->granted; 2444 queue = &res->granted;
2455 for (i = 0; i < 3; i++) { 2445 for (i = 0; i < 3; i++) {
2456 list_for_each(iter, queue) { 2446 list_for_each_entry(lock, queue, list) {
2457 lock = list_entry(iter, struct dlm_lock, list);
2458 ++count; 2447 ++count;
2459 if (lock->ml.node == dlm->node_num) { 2448 if (lock->ml.node == dlm->node_num) {
2460 mlog(0, "found a lock owned by this node still " 2449 mlog(0, "found a lock owned by this node still "
@@ -2923,18 +2912,16 @@ again:
2923static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, 2912static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2924 struct dlm_lock_resource *res) 2913 struct dlm_lock_resource *res)
2925{ 2914{
2926 struct list_head *iter, *iter2;
2927 struct list_head *queue = &res->granted; 2915 struct list_head *queue = &res->granted;
2928 int i, bit; 2916 int i, bit;
2929 struct dlm_lock *lock; 2917 struct dlm_lock *lock, *next;
2930 2918
2931 assert_spin_locked(&res->spinlock); 2919 assert_spin_locked(&res->spinlock);
2932 2920
2933 BUG_ON(res->owner == dlm->node_num); 2921 BUG_ON(res->owner == dlm->node_num);
2934 2922
2935 for (i=0; i<3; i++) { 2923 for (i=0; i<3; i++) {
2936 list_for_each_safe(iter, iter2, queue) { 2924 list_for_each_entry_safe(lock, next, queue, list) {
2937 lock = list_entry (iter, struct dlm_lock, list);
2938 if (lock->ml.node != dlm->node_num) { 2925 if (lock->ml.node != dlm->node_num) {
2939 mlog(0, "putting lock for node %u\n", 2926 mlog(0, "putting lock for node %u\n",
2940 lock->ml.node); 2927 lock->ml.node);
@@ -2976,7 +2963,6 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2976{ 2963{
2977 int i; 2964 int i;
2978 struct list_head *queue = &res->granted; 2965 struct list_head *queue = &res->granted;
2979 struct list_head *iter;
2980 struct dlm_lock *lock; 2966 struct dlm_lock *lock;
2981 int nodenum; 2967 int nodenum;
2982 2968
@@ -2984,10 +2970,9 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2984 2970
2985 spin_lock(&res->spinlock); 2971 spin_lock(&res->spinlock);
2986 for (i=0; i<3; i++) { 2972 for (i=0; i<3; i++) {
2987 list_for_each(iter, queue) { 2973 list_for_each_entry(lock, queue, list) {
2988 /* up to the caller to make sure this node 2974 /* up to the caller to make sure this node
2989 * is alive */ 2975 * is alive */
2990 lock = list_entry (iter, struct dlm_lock, list);
2991 if (lock->ml.node != dlm->node_num) { 2976 if (lock->ml.node != dlm->node_num) {
2992 spin_unlock(&res->spinlock); 2977 spin_unlock(&res->spinlock);
2993 return lock->ml.node; 2978 return lock->ml.node;
@@ -3234,8 +3219,7 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
3234 3219
3235void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) 3220void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
3236{ 3221{
3237 struct list_head *iter, *iter2; 3222 struct dlm_master_list_entry *mle, *next;
3238 struct dlm_master_list_entry *mle;
3239 struct dlm_lock_resource *res; 3223 struct dlm_lock_resource *res;
3240 unsigned int hash; 3224 unsigned int hash;
3241 3225
@@ -3245,9 +3229,7 @@ top:
3245 3229
3246 /* clean the master list */ 3230 /* clean the master list */
3247 spin_lock(&dlm->master_lock); 3231 spin_lock(&dlm->master_lock);
3248 list_for_each_safe(iter, iter2, &dlm->master_list) { 3232 list_for_each_entry_safe(mle, next, &dlm->master_list, list) {
3249 mle = list_entry(iter, struct dlm_master_list_entry, list);
3250
3251 BUG_ON(mle->type != DLM_MLE_BLOCK && 3233 BUG_ON(mle->type != DLM_MLE_BLOCK &&
3252 mle->type != DLM_MLE_MASTER && 3234 mle->type != DLM_MLE_MASTER &&
3253 mle->type != DLM_MLE_MIGRATION); 3235 mle->type != DLM_MLE_MIGRATION);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 671c4ed58ee2..a2c33160bfd6 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -158,8 +158,7 @@ void dlm_dispatch_work(struct work_struct *work)
158 struct dlm_ctxt *dlm = 158 struct dlm_ctxt *dlm =
159 container_of(work, struct dlm_ctxt, dispatched_work); 159 container_of(work, struct dlm_ctxt, dispatched_work);
160 LIST_HEAD(tmp_list); 160 LIST_HEAD(tmp_list);
161 struct list_head *iter, *iter2; 161 struct dlm_work_item *item, *next;
162 struct dlm_work_item *item;
163 dlm_workfunc_t *workfunc; 162 dlm_workfunc_t *workfunc;
164 int tot=0; 163 int tot=0;
165 164
@@ -167,13 +166,12 @@ void dlm_dispatch_work(struct work_struct *work)
167 list_splice_init(&dlm->work_list, &tmp_list); 166 list_splice_init(&dlm->work_list, &tmp_list);
168 spin_unlock(&dlm->work_lock); 167 spin_unlock(&dlm->work_lock);
169 168
170 list_for_each_safe(iter, iter2, &tmp_list) { 169 list_for_each_entry(item, &tmp_list, list) {
171 tot++; 170 tot++;
172 } 171 }
173 mlog(0, "%s: work thread has %d work items\n", dlm->name, tot); 172 mlog(0, "%s: work thread has %d work items\n", dlm->name, tot);
174 173
175 list_for_each_safe(iter, iter2, &tmp_list) { 174 list_for_each_entry_safe(item, next, &tmp_list, list) {
176 item = list_entry(iter, struct dlm_work_item, list);
177 workfunc = item->func; 175 workfunc = item->func;
178 list_del_init(&item->list); 176 list_del_init(&item->list);
179 177
@@ -549,7 +547,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
549{ 547{
550 int status = 0; 548 int status = 0;
551 struct dlm_reco_node_data *ndata; 549 struct dlm_reco_node_data *ndata;
552 struct list_head *iter;
553 int all_nodes_done; 550 int all_nodes_done;
554 int destroy = 0; 551 int destroy = 0;
555 int pass = 0; 552 int pass = 0;
@@ -567,8 +564,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
567 564
568 /* safe to access the node data list without a lock, since this 565 /* safe to access the node data list without a lock, since this
569 * process is the only one to change the list */ 566 * process is the only one to change the list */
570 list_for_each(iter, &dlm->reco.node_data) { 567 list_for_each_entry(ndata, &dlm->reco.node_data, list) {
571 ndata = list_entry (iter, struct dlm_reco_node_data, list);
572 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); 568 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
573 ndata->state = DLM_RECO_NODE_DATA_REQUESTING; 569 ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
574 570
@@ -655,9 +651,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
655 * done, or if anyone died */ 651 * done, or if anyone died */
656 all_nodes_done = 1; 652 all_nodes_done = 1;
657 spin_lock(&dlm_reco_state_lock); 653 spin_lock(&dlm_reco_state_lock);
658 list_for_each(iter, &dlm->reco.node_data) { 654 list_for_each_entry(ndata, &dlm->reco.node_data, list) {
659 ndata = list_entry (iter, struct dlm_reco_node_data, list);
660
661 mlog(0, "checking recovery state of node %u\n", 655 mlog(0, "checking recovery state of node %u\n",
662 ndata->node_num); 656 ndata->node_num);
663 switch (ndata->state) { 657 switch (ndata->state) {
@@ -774,16 +768,14 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
774 768
775static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) 769static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
776{ 770{
777 struct list_head *iter, *iter2; 771 struct dlm_reco_node_data *ndata, *next;
778 struct dlm_reco_node_data *ndata;
779 LIST_HEAD(tmplist); 772 LIST_HEAD(tmplist);
780 773
781 spin_lock(&dlm_reco_state_lock); 774 spin_lock(&dlm_reco_state_lock);
782 list_splice_init(&dlm->reco.node_data, &tmplist); 775 list_splice_init(&dlm->reco.node_data, &tmplist);
783 spin_unlock(&dlm_reco_state_lock); 776 spin_unlock(&dlm_reco_state_lock);
784 777
785 list_for_each_safe(iter, iter2, &tmplist) { 778 list_for_each_entry_safe(ndata, next, &tmplist, list) {
786 ndata = list_entry (iter, struct dlm_reco_node_data, list);
787 list_del_init(&ndata->list); 779 list_del_init(&ndata->list);
788 kfree(ndata); 780 kfree(ndata);
789 } 781 }
@@ -876,7 +868,6 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
876 struct dlm_lock_resource *res; 868 struct dlm_lock_resource *res;
877 struct dlm_ctxt *dlm; 869 struct dlm_ctxt *dlm;
878 LIST_HEAD(resources); 870 LIST_HEAD(resources);
879 struct list_head *iter;
880 int ret; 871 int ret;
881 u8 dead_node, reco_master; 872 u8 dead_node, reco_master;
882 int skip_all_done = 0; 873 int skip_all_done = 0;
@@ -920,8 +911,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
920 911
921 /* any errors returned will be due to the new_master dying, 912 /* any errors returned will be due to the new_master dying,
922 * the dlm_reco_thread should detect this */ 913 * the dlm_reco_thread should detect this */
923 list_for_each(iter, &resources) { 914 list_for_each_entry(res, &resources, recovering) {
924 res = list_entry (iter, struct dlm_lock_resource, recovering);
925 ret = dlm_send_one_lockres(dlm, res, mres, reco_master, 915 ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
926 DLM_MRES_RECOVERY); 916 DLM_MRES_RECOVERY);
927 if (ret < 0) { 917 if (ret < 0) {
@@ -983,7 +973,6 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
983{ 973{
984 struct dlm_ctxt *dlm = data; 974 struct dlm_ctxt *dlm = data;
985 struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf; 975 struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
986 struct list_head *iter;
987 struct dlm_reco_node_data *ndata = NULL; 976 struct dlm_reco_node_data *ndata = NULL;
988 int ret = -EINVAL; 977 int ret = -EINVAL;
989 978
@@ -1000,8 +989,7 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
1000 dlm->reco.dead_node, done->node_idx, dlm->node_num); 989 dlm->reco.dead_node, done->node_idx, dlm->node_num);
1001 990
1002 spin_lock(&dlm_reco_state_lock); 991 spin_lock(&dlm_reco_state_lock);
1003 list_for_each(iter, &dlm->reco.node_data) { 992 list_for_each_entry(ndata, &dlm->reco.node_data, list) {
1004 ndata = list_entry (iter, struct dlm_reco_node_data, list);
1005 if (ndata->node_num != done->node_idx) 993 if (ndata->node_num != done->node_idx)
1006 continue; 994 continue;
1007 995
@@ -1049,13 +1037,11 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
1049 struct list_head *list, 1037 struct list_head *list,
1050 u8 dead_node) 1038 u8 dead_node)
1051{ 1039{
1052 struct dlm_lock_resource *res; 1040 struct dlm_lock_resource *res, *next;
1053 struct list_head *iter, *iter2;
1054 struct dlm_lock *lock; 1041 struct dlm_lock *lock;
1055 1042
1056 spin_lock(&dlm->spinlock); 1043 spin_lock(&dlm->spinlock);
1057 list_for_each_safe(iter, iter2, &dlm->reco.resources) { 1044 list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
1058 res = list_entry (iter, struct dlm_lock_resource, recovering);
1059 /* always prune any $RECOVERY entries for dead nodes, 1045 /* always prune any $RECOVERY entries for dead nodes,
1060 * otherwise hangs can occur during later recovery */ 1046 * otherwise hangs can occur during later recovery */
1061 if (dlm_is_recovery_lock(res->lockname.name, 1047 if (dlm_is_recovery_lock(res->lockname.name,
@@ -1169,7 +1155,7 @@ static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
1169 u8 flags, u8 master) 1155 u8 flags, u8 master)
1170{ 1156{
1171 /* mres here is one full page */ 1157 /* mres here is one full page */
1172 memset(mres, 0, PAGE_SIZE); 1158 clear_page(mres);
1173 mres->lockname_len = namelen; 1159 mres->lockname_len = namelen;
1174 memcpy(mres->lockname, lockname, namelen); 1160 memcpy(mres->lockname, lockname, namelen);
1175 mres->num_locks = 0; 1161 mres->num_locks = 0;
@@ -1252,7 +1238,7 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1252 struct dlm_migratable_lockres *mres, 1238 struct dlm_migratable_lockres *mres,
1253 u8 send_to, u8 flags) 1239 u8 send_to, u8 flags)
1254{ 1240{
1255 struct list_head *queue, *iter; 1241 struct list_head *queue;
1256 int total_locks, i; 1242 int total_locks, i;
1257 u64 mig_cookie = 0; 1243 u64 mig_cookie = 0;
1258 struct dlm_lock *lock; 1244 struct dlm_lock *lock;
@@ -1278,9 +1264,7 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1278 total_locks = 0; 1264 total_locks = 0;
1279 for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) { 1265 for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) {
1280 queue = dlm_list_idx_to_ptr(res, i); 1266 queue = dlm_list_idx_to_ptr(res, i);
1281 list_for_each(iter, queue) { 1267 list_for_each_entry(lock, queue, list) {
1282 lock = list_entry (iter, struct dlm_lock, list);
1283
1284 /* add another lock. */ 1268 /* add another lock. */
1285 total_locks++; 1269 total_locks++;
1286 if (!dlm_add_lock_to_array(lock, mres, i)) 1270 if (!dlm_add_lock_to_array(lock, mres, i))
@@ -1717,7 +1701,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1717 struct dlm_lockstatus *lksb = NULL; 1701 struct dlm_lockstatus *lksb = NULL;
1718 int ret = 0; 1702 int ret = 0;
1719 int i, j, bad; 1703 int i, j, bad;
1720 struct list_head *iter;
1721 struct dlm_lock *lock = NULL; 1704 struct dlm_lock *lock = NULL;
1722 u8 from = O2NM_MAX_NODES; 1705 u8 from = O2NM_MAX_NODES;
1723 unsigned int added = 0; 1706 unsigned int added = 0;
@@ -1755,8 +1738,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1755 spin_lock(&res->spinlock); 1738 spin_lock(&res->spinlock);
1756 for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { 1739 for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
1757 tmpq = dlm_list_idx_to_ptr(res, j); 1740 tmpq = dlm_list_idx_to_ptr(res, j);
1758 list_for_each(iter, tmpq) { 1741 list_for_each_entry(lock, tmpq, list) {
1759 lock = list_entry (iter, struct dlm_lock, list);
1760 if (lock->ml.cookie != ml->cookie) 1742 if (lock->ml.cookie != ml->cookie)
1761 lock = NULL; 1743 lock = NULL;
1762 else 1744 else
@@ -1930,8 +1912,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
1930 struct dlm_lock_resource *res) 1912 struct dlm_lock_resource *res)
1931{ 1913{
1932 int i; 1914 int i;
1933 struct list_head *queue, *iter, *iter2; 1915 struct list_head *queue;
1934 struct dlm_lock *lock; 1916 struct dlm_lock *lock, *next;
1935 1917
1936 res->state |= DLM_LOCK_RES_RECOVERING; 1918 res->state |= DLM_LOCK_RES_RECOVERING;
1937 if (!list_empty(&res->recovering)) { 1919 if (!list_empty(&res->recovering)) {
@@ -1947,8 +1929,7 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
1947 /* find any pending locks and put them back on proper list */ 1929 /* find any pending locks and put them back on proper list */
1948 for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) { 1930 for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) {
1949 queue = dlm_list_idx_to_ptr(res, i); 1931 queue = dlm_list_idx_to_ptr(res, i);
1950 list_for_each_safe(iter, iter2, queue) { 1932 list_for_each_entry_safe(lock, next, queue, list) {
1951 lock = list_entry (iter, struct dlm_lock, list);
1952 dlm_lock_get(lock); 1933 dlm_lock_get(lock);
1953 if (lock->convert_pending) { 1934 if (lock->convert_pending) {
1954 /* move converting lock back to granted */ 1935 /* move converting lock back to granted */
@@ -2013,18 +1994,15 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
2013 u8 dead_node, u8 new_master) 1994 u8 dead_node, u8 new_master)
2014{ 1995{
2015 int i; 1996 int i;
2016 struct list_head *iter, *iter2;
2017 struct hlist_node *hash_iter; 1997 struct hlist_node *hash_iter;
2018 struct hlist_head *bucket; 1998 struct hlist_head *bucket;
2019 1999 struct dlm_lock_resource *res, *next;
2020 struct dlm_lock_resource *res;
2021 2000
2022 mlog_entry_void(); 2001 mlog_entry_void();
2023 2002
2024 assert_spin_locked(&dlm->spinlock); 2003 assert_spin_locked(&dlm->spinlock);
2025 2004
2026 list_for_each_safe(iter, iter2, &dlm->reco.resources) { 2005 list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
2027 res = list_entry (iter, struct dlm_lock_resource, recovering);
2028 if (res->owner == dead_node) { 2006 if (res->owner == dead_node) {
2029 list_del_init(&res->recovering); 2007 list_del_init(&res->recovering);
2030 spin_lock(&res->spinlock); 2008 spin_lock(&res->spinlock);
@@ -2099,7 +2077,7 @@ static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local)
2099static void dlm_revalidate_lvb(struct dlm_ctxt *dlm, 2077static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
2100 struct dlm_lock_resource *res, u8 dead_node) 2078 struct dlm_lock_resource *res, u8 dead_node)
2101{ 2079{
2102 struct list_head *iter, *queue; 2080 struct list_head *queue;
2103 struct dlm_lock *lock; 2081 struct dlm_lock *lock;
2104 int blank_lvb = 0, local = 0; 2082 int blank_lvb = 0, local = 0;
2105 int i; 2083 int i;
@@ -2121,8 +2099,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
2121 2099
2122 for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) { 2100 for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) {
2123 queue = dlm_list_idx_to_ptr(res, i); 2101 queue = dlm_list_idx_to_ptr(res, i);
2124 list_for_each(iter, queue) { 2102 list_for_each_entry(lock, queue, list) {
2125 lock = list_entry (iter, struct dlm_lock, list);
2126 if (lock->ml.node == search_node) { 2103 if (lock->ml.node == search_node) {
2127 if (dlm_lvb_needs_invalidation(lock, local)) { 2104 if (dlm_lvb_needs_invalidation(lock, local)) {
2128 /* zero the lksb lvb and lockres lvb */ 2105 /* zero the lksb lvb and lockres lvb */
@@ -2143,8 +2120,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
2143static void dlm_free_dead_locks(struct dlm_ctxt *dlm, 2120static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2144 struct dlm_lock_resource *res, u8 dead_node) 2121 struct dlm_lock_resource *res, u8 dead_node)
2145{ 2122{
2146 struct list_head *iter, *tmpiter; 2123 struct dlm_lock *lock, *next;
2147 struct dlm_lock *lock;
2148 unsigned int freed = 0; 2124 unsigned int freed = 0;
2149 2125
2150 /* this node is the lockres master: 2126 /* this node is the lockres master:
@@ -2155,24 +2131,21 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2155 assert_spin_locked(&res->spinlock); 2131 assert_spin_locked(&res->spinlock);
2156 2132
2157 /* TODO: check pending_asts, pending_basts here */ 2133 /* TODO: check pending_asts, pending_basts here */
2158 list_for_each_safe(iter, tmpiter, &res->granted) { 2134 list_for_each_entry_safe(lock, next, &res->granted, list) {
2159 lock = list_entry (iter, struct dlm_lock, list);
2160 if (lock->ml.node == dead_node) { 2135 if (lock->ml.node == dead_node) {
2161 list_del_init(&lock->list); 2136 list_del_init(&lock->list);
2162 dlm_lock_put(lock); 2137 dlm_lock_put(lock);
2163 freed++; 2138 freed++;
2164 } 2139 }
2165 } 2140 }
2166 list_for_each_safe(iter, tmpiter, &res->converting) { 2141 list_for_each_entry_safe(lock, next, &res->converting, list) {
2167 lock = list_entry (iter, struct dlm_lock, list);
2168 if (lock->ml.node == dead_node) { 2142 if (lock->ml.node == dead_node) {
2169 list_del_init(&lock->list); 2143 list_del_init(&lock->list);
2170 dlm_lock_put(lock); 2144 dlm_lock_put(lock);
2171 freed++; 2145 freed++;
2172 } 2146 }
2173 } 2147 }
2174 list_for_each_safe(iter, tmpiter, &res->blocked) { 2148 list_for_each_entry_safe(lock, next, &res->blocked, list) {
2175 lock = list_entry (iter, struct dlm_lock, list);
2176 if (lock->ml.node == dead_node) { 2149 if (lock->ml.node == dead_node) {
2177 list_del_init(&lock->list); 2150 list_del_init(&lock->list);
2178 dlm_lock_put(lock); 2151 dlm_lock_put(lock);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index d1bd305ef0d7..f71250ed166f 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -600,15 +600,13 @@ static inline int ocfs2_highest_compat_lock_level(int level)
600static void lockres_set_flags(struct ocfs2_lock_res *lockres, 600static void lockres_set_flags(struct ocfs2_lock_res *lockres,
601 unsigned long newflags) 601 unsigned long newflags)
602{ 602{
603 struct list_head *pos, *tmp; 603 struct ocfs2_mask_waiter *mw, *tmp;
604 struct ocfs2_mask_waiter *mw;
605 604
606 assert_spin_locked(&lockres->l_lock); 605 assert_spin_locked(&lockres->l_lock);
607 606
608 lockres->l_flags = newflags; 607 lockres->l_flags = newflags;
609 608
610 list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) { 609 list_for_each_entry_safe(mw, tmp, &lockres->l_mask_waiters, mw_item) {
611 mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
612 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) 610 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
613 continue; 611 continue;
614 612
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
index f226b2207628..ff257628af16 100644
--- a/fs/ocfs2/endian.h
+++ b/fs/ocfs2/endian.h
@@ -32,6 +32,11 @@ static inline void le32_add_cpu(__le32 *var, u32 val)
32 *var = cpu_to_le32(le32_to_cpu(*var) + val); 32 *var = cpu_to_le32(le32_to_cpu(*var) + val);
33} 33}
34 34
35static inline void le64_add_cpu(__le64 *var, u64 val)
36{
37 *var = cpu_to_le64(le64_to_cpu(*var) + val);
38}
39
35static inline void le32_and_cpu(__le32 *var, u32 val) 40static inline void le32_and_cpu(__le32 *var, u32 val)
36{ 41{
37 *var = cpu_to_le32(le32_to_cpu(*var) & val); 42 *var = cpu_to_le32(le32_to_cpu(*var) & val);
diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h
index 5b77ee7866ef..e08bed9e45a0 100644
--- a/fs/ocfs2/export.h
+++ b/fs/ocfs2/export.h
@@ -26,6 +26,8 @@
26#ifndef OCFS2_EXPORT_H 26#ifndef OCFS2_EXPORT_H
27#define OCFS2_EXPORT_H 27#define OCFS2_EXPORT_H
28 28
29#include <linux/exportfs.h>
30
29extern struct export_operations ocfs2_export_ops; 31extern struct export_operations ocfs2_export_ops;
30 32
31#endif /* OCFS2_EXPORT_H */ 33#endif /* OCFS2_EXPORT_H */
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index ba2b2ab1c6e4..03c1d365c78b 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -109,17 +109,14 @@ static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
109 */ 109 */
110void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) 110void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
111{ 111{
112 struct list_head *p, *n; 112 struct ocfs2_extent_map_item *emi, *n;
113 struct ocfs2_extent_map_item *emi;
114 struct ocfs2_inode_info *oi = OCFS2_I(inode); 113 struct ocfs2_inode_info *oi = OCFS2_I(inode);
115 struct ocfs2_extent_map *em = &oi->ip_extent_map; 114 struct ocfs2_extent_map *em = &oi->ip_extent_map;
116 LIST_HEAD(tmp_list); 115 LIST_HEAD(tmp_list);
117 unsigned int range; 116 unsigned int range;
118 117
119 spin_lock(&oi->ip_lock); 118 spin_lock(&oi->ip_lock);
120 list_for_each_safe(p, n, &em->em_list) { 119 list_for_each_entry_safe(emi, n, &em->em_list, ei_list) {
121 emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
122
123 if (emi->ei_cpos >= cpos) { 120 if (emi->ei_cpos >= cpos) {
124 /* Full truncate of this record. */ 121 /* Full truncate of this record. */
125 list_move(&emi->ei_list, &tmp_list); 122 list_move(&emi->ei_list, &tmp_list);
@@ -136,8 +133,7 @@ void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
136 } 133 }
137 spin_unlock(&oi->ip_lock); 134 spin_unlock(&oi->ip_lock);
138 135
139 list_for_each_safe(p, n, &tmp_list) { 136 list_for_each_entry_safe(emi, n, &tmp_list, ei_list) {
140 emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
141 list_del(&emi->ei_list); 137 list_del(&emi->ei_list);
142 kfree(emi); 138 kfree(emi);
143 } 139 }
@@ -377,37 +373,6 @@ out:
377 return ret; 373 return ret;
378} 374}
379 375
380/*
381 * Return the index of the extent record which contains cluster #v_cluster.
382 * -1 is returned if it was not found.
383 *
384 * Should work fine on interior and exterior nodes.
385 */
386static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
387 u32 v_cluster)
388{
389 int ret = -1;
390 int i;
391 struct ocfs2_extent_rec *rec;
392 u32 rec_end, rec_start, clusters;
393
394 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
395 rec = &el->l_recs[i];
396
397 rec_start = le32_to_cpu(rec->e_cpos);
398 clusters = ocfs2_rec_clusters(el, rec);
399
400 rec_end = rec_start + clusters;
401
402 if (v_cluster >= rec_start && v_cluster < rec_end) {
403 ret = i;
404 break;
405 }
406 }
407
408 return ret;
409}
410
411int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, 376int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
412 u32 *p_cluster, u32 *num_clusters, 377 u32 *p_cluster, u32 *num_clusters,
413 unsigned int *extent_flags) 378 unsigned int *extent_flags)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4979b6675717..5727cd18302a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -34,6 +34,7 @@
34#include <linux/splice.h> 34#include <linux/splice.h>
35#include <linux/mount.h> 35#include <linux/mount.h>
36#include <linux/writeback.h> 36#include <linux/writeback.h>
37#include <linux/falloc.h>
37 38
38#define MLOG_MASK_PREFIX ML_INODE 39#define MLOG_MASK_PREFIX ML_INODE
39#include <cluster/masklog.h> 40#include <cluster/masklog.h>
@@ -263,6 +264,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
263 int status; 264 int status;
264 handle_t *handle; 265 handle_t *handle;
265 struct ocfs2_dinode *di; 266 struct ocfs2_dinode *di;
267 u64 cluster_bytes;
266 268
267 mlog_entry_void(); 269 mlog_entry_void();
268 270
@@ -286,7 +288,9 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
286 /* 288 /*
287 * Do this before setting i_size. 289 * Do this before setting i_size.
288 */ 290 */
289 status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size); 291 cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
292 status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
293 cluster_bytes);
290 if (status) { 294 if (status) {
291 mlog_errno(status); 295 mlog_errno(status);
292 goto out_commit; 296 goto out_commit;
@@ -326,9 +330,6 @@ static int ocfs2_truncate_file(struct inode *inode,
326 (unsigned long long)OCFS2_I(inode)->ip_blkno, 330 (unsigned long long)OCFS2_I(inode)->ip_blkno,
327 (unsigned long long)new_i_size); 331 (unsigned long long)new_i_size);
328 332
329 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
330 truncate_inode_pages(inode->i_mapping, new_i_size);
331
332 fe = (struct ocfs2_dinode *) di_bh->b_data; 333 fe = (struct ocfs2_dinode *) di_bh->b_data;
333 if (!OCFS2_IS_VALID_DINODE(fe)) { 334 if (!OCFS2_IS_VALID_DINODE(fe)) {
334 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 335 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
@@ -363,16 +364,23 @@ static int ocfs2_truncate_file(struct inode *inode,
363 if (new_i_size == le64_to_cpu(fe->i_size)) 364 if (new_i_size == le64_to_cpu(fe->i_size))
364 goto bail; 365 goto bail;
365 366
367 down_write(&OCFS2_I(inode)->ip_alloc_sem);
368
366 /* This forces other nodes to sync and drop their pages. Do 369 /* This forces other nodes to sync and drop their pages. Do
367 * this even if we have a truncate without allocation change - 370 * this even if we have a truncate without allocation change -
368 * ocfs2 cluster sizes can be much greater than page size, so 371 * ocfs2 cluster sizes can be much greater than page size, so
369 * we have to truncate them anyway. */ 372 * we have to truncate them anyway. */
370 status = ocfs2_data_lock(inode, 1); 373 status = ocfs2_data_lock(inode, 1);
371 if (status < 0) { 374 if (status < 0) {
375 up_write(&OCFS2_I(inode)->ip_alloc_sem);
376
372 mlog_errno(status); 377 mlog_errno(status);
373 goto bail; 378 goto bail;
374 } 379 }
375 380
381 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
382 truncate_inode_pages(inode->i_mapping, new_i_size);
383
376 /* alright, we're going to need to do a full blown alloc size 384 /* alright, we're going to need to do a full blown alloc size
377 * change. Orphan the inode so that recovery can complete the 385 * change. Orphan the inode so that recovery can complete the
378 * truncate if necessary. This does the task of marking 386 * truncate if necessary. This does the task of marking
@@ -399,6 +407,8 @@ static int ocfs2_truncate_file(struct inode *inode,
399bail_unlock_data: 407bail_unlock_data:
400 ocfs2_data_unlock(inode, 1); 408 ocfs2_data_unlock(inode, 1);
401 409
410 up_write(&OCFS2_I(inode)->ip_alloc_sem);
411
402bail: 412bail:
403 413
404 mlog_exit(status); 414 mlog_exit(status);
@@ -419,6 +429,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
419 struct inode *inode, 429 struct inode *inode,
420 u32 *logical_offset, 430 u32 *logical_offset,
421 u32 clusters_to_add, 431 u32 clusters_to_add,
432 int mark_unwritten,
422 struct buffer_head *fe_bh, 433 struct buffer_head *fe_bh,
423 handle_t *handle, 434 handle_t *handle,
424 struct ocfs2_alloc_context *data_ac, 435 struct ocfs2_alloc_context *data_ac,
@@ -431,9 +442,13 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
431 enum ocfs2_alloc_restarted reason = RESTART_NONE; 442 enum ocfs2_alloc_restarted reason = RESTART_NONE;
432 u32 bit_off, num_bits; 443 u32 bit_off, num_bits;
433 u64 block; 444 u64 block;
445 u8 flags = 0;
434 446
435 BUG_ON(!clusters_to_add); 447 BUG_ON(!clusters_to_add);
436 448
449 if (mark_unwritten)
450 flags = OCFS2_EXT_UNWRITTEN;
451
437 free_extents = ocfs2_num_free_extents(osb, inode, fe); 452 free_extents = ocfs2_num_free_extents(osb, inode, fe);
438 if (free_extents < 0) { 453 if (free_extents < 0) {
439 status = free_extents; 454 status = free_extents;
@@ -483,7 +498,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
483 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 498 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
484 status = ocfs2_insert_extent(osb, handle, inode, fe_bh, 499 status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
485 *logical_offset, block, num_bits, 500 *logical_offset, block, num_bits,
486 meta_ac); 501 flags, meta_ac);
487 if (status < 0) { 502 if (status < 0) {
488 mlog_errno(status); 503 mlog_errno(status);
489 goto leave; 504 goto leave;
@@ -516,25 +531,31 @@ leave:
516 * For a given allocation, determine which allocators will need to be 531 * For a given allocation, determine which allocators will need to be
517 * accessed, and lock them, reserving the appropriate number of bits. 532 * accessed, and lock them, reserving the appropriate number of bits.
518 * 533 *
519 * Called from ocfs2_extend_allocation() for file systems which don't 534 * Sparse file systems call this from ocfs2_write_begin_nolock()
520 * support holes, and from ocfs2_write() for file systems which 535 * and ocfs2_allocate_unwritten_extents().
521 * understand sparse inodes. 536 *
537 * File systems which don't support holes call this from
538 * ocfs2_extend_allocation().
522 */ 539 */
523int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, 540int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
524 u32 clusters_to_add, 541 u32 clusters_to_add, u32 extents_to_split,
525 struct ocfs2_alloc_context **data_ac, 542 struct ocfs2_alloc_context **data_ac,
526 struct ocfs2_alloc_context **meta_ac) 543 struct ocfs2_alloc_context **meta_ac)
527{ 544{
528 int ret, num_free_extents; 545 int ret = 0, num_free_extents;
546 unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
529 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 547 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
530 548
531 *meta_ac = NULL; 549 *meta_ac = NULL;
532 *data_ac = NULL; 550 if (data_ac)
551 *data_ac = NULL;
552
553 BUG_ON(clusters_to_add != 0 && data_ac == NULL);
533 554
534 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " 555 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
535 "clusters_to_add = %u\n", 556 "clusters_to_add = %u, extents_to_split = %u\n",
536 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), 557 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
537 le32_to_cpu(di->i_clusters), clusters_to_add); 558 le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
538 559
539 num_free_extents = ocfs2_num_free_extents(osb, inode, di); 560 num_free_extents = ocfs2_num_free_extents(osb, inode, di);
540 if (num_free_extents < 0) { 561 if (num_free_extents < 0) {
@@ -552,9 +573,12 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
552 * 573 *
553 * Most of the time we'll only be seeing this 1 cluster at a time 574 * Most of the time we'll only be seeing this 1 cluster at a time
554 * anyway. 575 * anyway.
576 *
577 * Always lock for any unwritten extents - we might want to
578 * add blocks during a split.
555 */ 579 */
556 if (!num_free_extents || 580 if (!num_free_extents ||
557 (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) { 581 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
558 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); 582 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
559 if (ret < 0) { 583 if (ret < 0) {
560 if (ret != -ENOSPC) 584 if (ret != -ENOSPC)
@@ -563,6 +587,9 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
563 } 587 }
564 } 588 }
565 589
590 if (clusters_to_add == 0)
591 goto out;
592
566 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); 593 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
567 if (ret < 0) { 594 if (ret < 0) {
568 if (ret != -ENOSPC) 595 if (ret != -ENOSPC)
@@ -585,14 +612,13 @@ out:
585 return ret; 612 return ret;
586} 613}
587 614
588static int ocfs2_extend_allocation(struct inode *inode, 615static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
589 u32 clusters_to_add) 616 u32 clusters_to_add, int mark_unwritten)
590{ 617{
591 int status = 0; 618 int status = 0;
592 int restart_func = 0; 619 int restart_func = 0;
593 int drop_alloc_sem = 0;
594 int credits; 620 int credits;
595 u32 prev_clusters, logical_start; 621 u32 prev_clusters;
596 struct buffer_head *bh = NULL; 622 struct buffer_head *bh = NULL;
597 struct ocfs2_dinode *fe = NULL; 623 struct ocfs2_dinode *fe = NULL;
598 handle_t *handle = NULL; 624 handle_t *handle = NULL;
@@ -607,7 +633,7 @@ static int ocfs2_extend_allocation(struct inode *inode,
607 * This function only exists for file systems which don't 633 * This function only exists for file systems which don't
608 * support holes. 634 * support holes.
609 */ 635 */
610 BUG_ON(ocfs2_sparse_alloc(osb)); 636 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
611 637
612 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 638 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
613 OCFS2_BH_CACHED, inode); 639 OCFS2_BH_CACHED, inode);
@@ -623,19 +649,10 @@ static int ocfs2_extend_allocation(struct inode *inode,
623 goto leave; 649 goto leave;
624 } 650 }
625 651
626 logical_start = OCFS2_I(inode)->ip_clusters;
627
628restart_all: 652restart_all:
629 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 653 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
630 654
631 /* blocks peope in read/write from reading our allocation 655 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac,
632 * until we're done changing it. We depend on i_mutex to block
633 * other extend/truncate calls while we're here. Ordering wrt
634 * start_trans is important here -- always do it before! */
635 down_write(&OCFS2_I(inode)->ip_alloc_sem);
636 drop_alloc_sem = 1;
637
638 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
639 &meta_ac); 656 &meta_ac);
640 if (status) { 657 if (status) {
641 mlog_errno(status); 658 mlog_errno(status);
@@ -668,6 +685,7 @@ restarted_transaction:
668 inode, 685 inode,
669 &logical_start, 686 &logical_start,
670 clusters_to_add, 687 clusters_to_add,
688 mark_unwritten,
671 bh, 689 bh,
672 handle, 690 handle,
673 data_ac, 691 data_ac,
@@ -720,10 +738,6 @@ restarted_transaction:
720 OCFS2_I(inode)->ip_clusters, i_size_read(inode)); 738 OCFS2_I(inode)->ip_clusters, i_size_read(inode));
721 739
722leave: 740leave:
723 if (drop_alloc_sem) {
724 up_write(&OCFS2_I(inode)->ip_alloc_sem);
725 drop_alloc_sem = 0;
726 }
727 if (handle) { 741 if (handle) {
728 ocfs2_commit_trans(osb, handle); 742 ocfs2_commit_trans(osb, handle);
729 handle = NULL; 743 handle = NULL;
@@ -749,6 +763,25 @@ leave:
749 return status; 763 return status;
750} 764}
751 765
766static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
767 u32 clusters_to_add, int mark_unwritten)
768{
769 int ret;
770
771 /*
772 * The alloc sem blocks peope in read/write from reading our
773 * allocation until we're done changing it. We depend on
774 * i_mutex to block other extend/truncate calls while we're
775 * here.
776 */
777 down_write(&OCFS2_I(inode)->ip_alloc_sem);
778 ret = __ocfs2_extend_allocation(inode, logical_start, clusters_to_add,
779 mark_unwritten);
780 up_write(&OCFS2_I(inode)->ip_alloc_sem);
781
782 return ret;
783}
784
752/* Some parts of this taken from generic_cont_expand, which turned out 785/* Some parts of this taken from generic_cont_expand, which turned out
753 * to be too fragile to do exactly what we need without us having to 786 * to be too fragile to do exactly what we need without us having to
754 * worry about recursive locking in ->prepare_write() and 787 * worry about recursive locking in ->prepare_write() and
@@ -890,7 +923,9 @@ static int ocfs2_extend_file(struct inode *inode,
890 } 923 }
891 924
892 if (clusters_to_add) { 925 if (clusters_to_add) {
893 ret = ocfs2_extend_allocation(inode, clusters_to_add); 926 ret = ocfs2_extend_allocation(inode,
927 OCFS2_I(inode)->ip_clusters,
928 clusters_to_add, 0);
894 if (ret < 0) { 929 if (ret < 0) {
895 mlog_errno(ret); 930 mlog_errno(ret);
896 goto out_unlock; 931 goto out_unlock;
@@ -995,6 +1030,13 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
995 goto bail_unlock; 1030 goto bail_unlock;
996 } 1031 }
997 1032
1033 /*
1034 * This will intentionally not wind up calling vmtruncate(),
1035 * since all the work for a size change has been done above.
1036 * Otherwise, we could get into problems with truncate as
1037 * ip_alloc_sem is used there to protect against i_size
1038 * changes.
1039 */
998 status = inode_setattr(inode, attr); 1040 status = inode_setattr(inode, attr);
999 if (status < 0) { 1041 if (status < 0) {
1000 mlog_errno(status); 1042 mlog_errno(status);
@@ -1070,17 +1112,16 @@ out:
1070 return ret; 1112 return ret;
1071} 1113}
1072 1114
1073static int ocfs2_write_remove_suid(struct inode *inode) 1115static int __ocfs2_write_remove_suid(struct inode *inode,
1116 struct buffer_head *bh)
1074{ 1117{
1075 int ret; 1118 int ret;
1076 struct buffer_head *bh = NULL;
1077 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1078 handle_t *handle; 1119 handle_t *handle;
1079 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1120 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1080 struct ocfs2_dinode *di; 1121 struct ocfs2_dinode *di;
1081 1122
1082 mlog_entry("(Inode %llu, mode 0%o)\n", 1123 mlog_entry("(Inode %llu, mode 0%o)\n",
1083 (unsigned long long)oi->ip_blkno, inode->i_mode); 1124 (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
1084 1125
1085 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1126 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1086 if (handle == NULL) { 1127 if (handle == NULL) {
@@ -1089,17 +1130,11 @@ static int ocfs2_write_remove_suid(struct inode *inode)
1089 goto out; 1130 goto out;
1090 } 1131 }
1091 1132
1092 ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
1093 if (ret < 0) {
1094 mlog_errno(ret);
1095 goto out_trans;
1096 }
1097
1098 ret = ocfs2_journal_access(handle, inode, bh, 1133 ret = ocfs2_journal_access(handle, inode, bh,
1099 OCFS2_JOURNAL_ACCESS_WRITE); 1134 OCFS2_JOURNAL_ACCESS_WRITE);
1100 if (ret < 0) { 1135 if (ret < 0) {
1101 mlog_errno(ret); 1136 mlog_errno(ret);
1102 goto out_bh; 1137 goto out_trans;
1103 } 1138 }
1104 1139
1105 inode->i_mode &= ~S_ISUID; 1140 inode->i_mode &= ~S_ISUID;
@@ -1112,8 +1147,7 @@ static int ocfs2_write_remove_suid(struct inode *inode)
1112 ret = ocfs2_journal_dirty(handle, bh); 1147 ret = ocfs2_journal_dirty(handle, bh);
1113 if (ret < 0) 1148 if (ret < 0)
1114 mlog_errno(ret); 1149 mlog_errno(ret);
1115out_bh: 1150
1116 brelse(bh);
1117out_trans: 1151out_trans:
1118 ocfs2_commit_trans(osb, handle); 1152 ocfs2_commit_trans(osb, handle);
1119out: 1153out:
@@ -1159,6 +1193,499 @@ out:
1159 return ret; 1193 return ret;
1160} 1194}
1161 1195
1196static int ocfs2_write_remove_suid(struct inode *inode)
1197{
1198 int ret;
1199 struct buffer_head *bh = NULL;
1200 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1201
1202 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
1203 oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
1204 if (ret < 0) {
1205 mlog_errno(ret);
1206 goto out;
1207 }
1208
1209 ret = __ocfs2_write_remove_suid(inode, bh);
1210out:
1211 brelse(bh);
1212 return ret;
1213}
1214
1215/*
1216 * Allocate enough extents to cover the region starting at byte offset
1217 * start for len bytes. Existing extents are skipped, any extents
1218 * added are marked as "unwritten".
1219 */
1220static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1221 u64 start, u64 len)
1222{
1223 int ret;
1224 u32 cpos, phys_cpos, clusters, alloc_size;
1225
1226 /*
1227 * We consider both start and len to be inclusive.
1228 */
1229 cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
1230 clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
1231 clusters -= cpos;
1232
1233 while (clusters) {
1234 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
1235 &alloc_size, NULL);
1236 if (ret) {
1237 mlog_errno(ret);
1238 goto out;
1239 }
1240
1241 /*
1242 * Hole or existing extent len can be arbitrary, so
1243 * cap it to our own allocation request.
1244 */
1245 if (alloc_size > clusters)
1246 alloc_size = clusters;
1247
1248 if (phys_cpos) {
1249 /*
1250 * We already have an allocation at this
1251 * region so we can safely skip it.
1252 */
1253 goto next;
1254 }
1255
1256 ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
1257 if (ret) {
1258 if (ret != -ENOSPC)
1259 mlog_errno(ret);
1260 goto out;
1261 }
1262
1263next:
1264 cpos += alloc_size;
1265 clusters -= alloc_size;
1266 }
1267
1268 ret = 0;
1269out:
1270 return ret;
1271}
1272
1273static int __ocfs2_remove_inode_range(struct inode *inode,
1274 struct buffer_head *di_bh,
1275 u32 cpos, u32 phys_cpos, u32 len,
1276 struct ocfs2_cached_dealloc_ctxt *dealloc)
1277{
1278 int ret;
1279 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
1280 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1281 struct inode *tl_inode = osb->osb_tl_inode;
1282 handle_t *handle;
1283 struct ocfs2_alloc_context *meta_ac = NULL;
1284 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1285
1286 ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac);
1287 if (ret) {
1288 mlog_errno(ret);
1289 return ret;
1290 }
1291
1292 mutex_lock(&tl_inode->i_mutex);
1293
1294 if (ocfs2_truncate_log_needs_flush(osb)) {
1295 ret = __ocfs2_flush_truncate_log(osb);
1296 if (ret < 0) {
1297 mlog_errno(ret);
1298 goto out;
1299 }
1300 }
1301
1302 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
1303 if (handle == NULL) {
1304 ret = -ENOMEM;
1305 mlog_errno(ret);
1306 goto out;
1307 }
1308
1309 ret = ocfs2_journal_access(handle, inode, di_bh,
1310 OCFS2_JOURNAL_ACCESS_WRITE);
1311 if (ret) {
1312 mlog_errno(ret);
1313 goto out;
1314 }
1315
1316 ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac,
1317 dealloc);
1318 if (ret) {
1319 mlog_errno(ret);
1320 goto out_commit;
1321 }
1322
1323 OCFS2_I(inode)->ip_clusters -= len;
1324 di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
1325
1326 ret = ocfs2_journal_dirty(handle, di_bh);
1327 if (ret) {
1328 mlog_errno(ret);
1329 goto out_commit;
1330 }
1331
1332 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
1333 if (ret)
1334 mlog_errno(ret);
1335
1336out_commit:
1337 ocfs2_commit_trans(osb, handle);
1338out:
1339 mutex_unlock(&tl_inode->i_mutex);
1340
1341 if (meta_ac)
1342 ocfs2_free_alloc_context(meta_ac);
1343
1344 return ret;
1345}
1346
1347/*
1348 * Truncate a byte range, avoiding pages within partial clusters. This
1349 * preserves those pages for the zeroing code to write to.
1350 */
1351static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
1352 u64 byte_len)
1353{
1354 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1355 loff_t start, end;
1356 struct address_space *mapping = inode->i_mapping;
1357
1358 start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
1359 end = byte_start + byte_len;
1360 end = end & ~(osb->s_clustersize - 1);
1361
1362 if (start < end) {
1363 unmap_mapping_range(mapping, start, end - start, 0);
1364 truncate_inode_pages_range(mapping, start, end - 1);
1365 }
1366}
1367
1368static int ocfs2_zero_partial_clusters(struct inode *inode,
1369 u64 start, u64 len)
1370{
1371 int ret = 0;
1372 u64 tmpend, end = start + len;
1373 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1374 unsigned int csize = osb->s_clustersize;
1375 handle_t *handle;
1376
1377 /*
1378 * The "start" and "end" values are NOT necessarily part of
1379 * the range whose allocation is being deleted. Rather, this
1380 * is what the user passed in with the request. We must zero
1381 * partial clusters here. There's no need to worry about
1382 * physical allocation - the zeroing code knows to skip holes.
1383 */
1384 mlog(0, "byte start: %llu, end: %llu\n",
1385 (unsigned long long)start, (unsigned long long)end);
1386
1387 /*
1388 * If both edges are on a cluster boundary then there's no
1389 * zeroing required as the region is part of the allocation to
1390 * be truncated.
1391 */
1392 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
1393 goto out;
1394
1395 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1396 if (handle == NULL) {
1397 ret = -ENOMEM;
1398 mlog_errno(ret);
1399 goto out;
1400 }
1401
1402 /*
1403 * We want to get the byte offset of the end of the 1st cluster.
1404 */
1405 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
1406 if (tmpend > end)
1407 tmpend = end;
1408
1409 mlog(0, "1st range: start: %llu, tmpend: %llu\n",
1410 (unsigned long long)start, (unsigned long long)tmpend);
1411
1412 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
1413 if (ret)
1414 mlog_errno(ret);
1415
1416 if (tmpend < end) {
1417 /*
1418 * This may make start and end equal, but the zeroing
1419 * code will skip any work in that case so there's no
1420 * need to catch it up here.
1421 */
1422 start = end & ~(osb->s_clustersize - 1);
1423
1424 mlog(0, "2nd range: start: %llu, end: %llu\n",
1425 (unsigned long long)start, (unsigned long long)end);
1426
1427 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
1428 if (ret)
1429 mlog_errno(ret);
1430 }
1431
1432 ocfs2_commit_trans(osb, handle);
1433out:
1434 return ret;
1435}
1436
1437static int ocfs2_remove_inode_range(struct inode *inode,
1438 struct buffer_head *di_bh, u64 byte_start,
1439 u64 byte_len)
1440{
1441 int ret = 0;
1442 u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
1443 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1444 struct ocfs2_cached_dealloc_ctxt dealloc;
1445
1446 ocfs2_init_dealloc_ctxt(&dealloc);
1447
1448 if (byte_len == 0)
1449 return 0;
1450
1451 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1452 trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
1453 if (trunc_len >= trunc_start)
1454 trunc_len -= trunc_start;
1455 else
1456 trunc_len = 0;
1457
1458 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n",
1459 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1460 (unsigned long long)byte_start,
1461 (unsigned long long)byte_len, trunc_start, trunc_len);
1462
1463 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1464 if (ret) {
1465 mlog_errno(ret);
1466 goto out;
1467 }
1468
1469 cpos = trunc_start;
1470 while (trunc_len) {
1471 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
1472 &alloc_size, NULL);
1473 if (ret) {
1474 mlog_errno(ret);
1475 goto out;
1476 }
1477
1478 if (alloc_size > trunc_len)
1479 alloc_size = trunc_len;
1480
1481 /* Only do work for non-holes */
1482 if (phys_cpos != 0) {
1483 ret = __ocfs2_remove_inode_range(inode, di_bh, cpos,
1484 phys_cpos, alloc_size,
1485 &dealloc);
1486 if (ret) {
1487 mlog_errno(ret);
1488 goto out;
1489 }
1490 }
1491
1492 cpos += alloc_size;
1493 trunc_len -= alloc_size;
1494 }
1495
1496 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
1497
1498out:
1499 ocfs2_schedule_truncate_log_flush(osb, 1);
1500 ocfs2_run_deallocs(osb, &dealloc);
1501
1502 return ret;
1503}
1504
1505/*
1506 * Parts of this function taken from xfs_change_file_space()
1507 */
1508static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1509 loff_t f_pos, unsigned int cmd,
1510 struct ocfs2_space_resv *sr,
1511 int change_size)
1512{
1513 int ret;
1514 s64 llen;
1515 loff_t size;
1516 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1517 struct buffer_head *di_bh = NULL;
1518 handle_t *handle;
1519 unsigned long long max_off = ocfs2_max_file_offset(inode->i_sb->s_blocksize_bits);
1520
1521 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
1522 return -EROFS;
1523
1524 mutex_lock(&inode->i_mutex);
1525
1526 /*
1527 * This prevents concurrent writes on other nodes
1528 */
1529 ret = ocfs2_rw_lock(inode, 1);
1530 if (ret) {
1531 mlog_errno(ret);
1532 goto out;
1533 }
1534
1535 ret = ocfs2_meta_lock(inode, &di_bh, 1);
1536 if (ret) {
1537 mlog_errno(ret);
1538 goto out_rw_unlock;
1539 }
1540
1541 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1542 ret = -EPERM;
1543 goto out_meta_unlock;
1544 }
1545
1546 switch (sr->l_whence) {
1547 case 0: /*SEEK_SET*/
1548 break;
1549 case 1: /*SEEK_CUR*/
1550 sr->l_start += f_pos;
1551 break;
1552 case 2: /*SEEK_END*/
1553 sr->l_start += i_size_read(inode);
1554 break;
1555 default:
1556 ret = -EINVAL;
1557 goto out_meta_unlock;
1558 }
1559 sr->l_whence = 0;
1560
1561 llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
1562
1563 if (sr->l_start < 0
1564 || sr->l_start > max_off
1565 || (sr->l_start + llen) < 0
1566 || (sr->l_start + llen) > max_off) {
1567 ret = -EINVAL;
1568 goto out_meta_unlock;
1569 }
1570 size = sr->l_start + sr->l_len;
1571
1572 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
1573 if (sr->l_len <= 0) {
1574 ret = -EINVAL;
1575 goto out_meta_unlock;
1576 }
1577 }
1578
1579 if (file && should_remove_suid(file->f_path.dentry)) {
1580 ret = __ocfs2_write_remove_suid(inode, di_bh);
1581 if (ret) {
1582 mlog_errno(ret);
1583 goto out_meta_unlock;
1584 }
1585 }
1586
1587 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1588 switch (cmd) {
1589 case OCFS2_IOC_RESVSP:
1590 case OCFS2_IOC_RESVSP64:
1591 /*
1592 * This takes unsigned offsets, but the signed ones we
1593 * pass have been checked against overflow above.
1594 */
1595 ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
1596 sr->l_len);
1597 break;
1598 case OCFS2_IOC_UNRESVSP:
1599 case OCFS2_IOC_UNRESVSP64:
1600 ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
1601 sr->l_len);
1602 break;
1603 default:
1604 ret = -EINVAL;
1605 }
1606 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1607 if (ret) {
1608 mlog_errno(ret);
1609 goto out_meta_unlock;
1610 }
1611
1612 /*
1613 * We update c/mtime for these changes
1614 */
1615 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1616 if (IS_ERR(handle)) {
1617 ret = PTR_ERR(handle);
1618 mlog_errno(ret);
1619 goto out_meta_unlock;
1620 }
1621
1622 if (change_size && i_size_read(inode) < size)
1623 i_size_write(inode, size);
1624
1625 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1626 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
1627 if (ret < 0)
1628 mlog_errno(ret);
1629
1630 ocfs2_commit_trans(osb, handle);
1631
1632out_meta_unlock:
1633 brelse(di_bh);
1634 ocfs2_meta_unlock(inode, 1);
1635out_rw_unlock:
1636 ocfs2_rw_unlock(inode, 1);
1637
1638 mutex_unlock(&inode->i_mutex);
1639out:
1640 return ret;
1641}
1642
1643int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1644 struct ocfs2_space_resv *sr)
1645{
1646 struct inode *inode = file->f_path.dentry->d_inode;
1647 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);;
1648
1649 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
1650 !ocfs2_writes_unwritten_extents(osb))
1651 return -ENOTTY;
1652 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
1653 !ocfs2_sparse_alloc(osb))
1654 return -ENOTTY;
1655
1656 if (!S_ISREG(inode->i_mode))
1657 return -EINVAL;
1658
1659 if (!(file->f_mode & FMODE_WRITE))
1660 return -EBADF;
1661
1662 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
1663}
1664
1665static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
1666 loff_t len)
1667{
1668 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1669 struct ocfs2_space_resv sr;
1670 int change_size = 1;
1671
1672 if (!ocfs2_writes_unwritten_extents(osb))
1673 return -EOPNOTSUPP;
1674
1675 if (S_ISDIR(inode->i_mode))
1676 return -ENODEV;
1677
1678 if (mode & FALLOC_FL_KEEP_SIZE)
1679 change_size = 0;
1680
1681 sr.l_whence = 0;
1682 sr.l_start = (s64)offset;
1683 sr.l_len = (s64)len;
1684
1685 return __ocfs2_change_file_space(NULL, inode, offset,
1686 OCFS2_IOC_RESVSP64, &sr, change_size);
1687}
1688
1162static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 1689static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1163 loff_t *ppos, 1690 loff_t *ppos,
1164 size_t count, 1691 size_t count,
@@ -1329,15 +1856,16 @@ ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
1329 *basep = base; 1856 *basep = base;
1330} 1857}
1331 1858
1332static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, 1859static struct page * ocfs2_get_write_source(char **ret_src_buf,
1333 const struct iovec *cur_iov, 1860 const struct iovec *cur_iov,
1334 size_t iov_offset) 1861 size_t iov_offset)
1335{ 1862{
1336 int ret; 1863 int ret;
1337 char *buf; 1864 char *buf = cur_iov->iov_base + iov_offset;
1338 struct page *src_page = NULL; 1865 struct page *src_page = NULL;
1866 unsigned long off;
1339 1867
1340 buf = cur_iov->iov_base + iov_offset; 1868 off = (unsigned long)(buf) & ~PAGE_CACHE_MASK;
1341 1869
1342 if (!segment_eq(get_fs(), KERNEL_DS)) { 1870 if (!segment_eq(get_fs(), KERNEL_DS)) {
1343 /* 1871 /*
@@ -1349,18 +1877,17 @@ static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp
1349 (unsigned long)buf & PAGE_CACHE_MASK, 1, 1877 (unsigned long)buf & PAGE_CACHE_MASK, 1,
1350 0, 0, &src_page, NULL); 1878 0, 0, &src_page, NULL);
1351 if (ret == 1) 1879 if (ret == 1)
1352 bp->b_src_buf = kmap(src_page); 1880 *ret_src_buf = kmap(src_page) + off;
1353 else 1881 else
1354 src_page = ERR_PTR(-EFAULT); 1882 src_page = ERR_PTR(-EFAULT);
1355 } else { 1883 } else {
1356 bp->b_src_buf = buf; 1884 *ret_src_buf = buf;
1357 } 1885 }
1358 1886
1359 return src_page; 1887 return src_page;
1360} 1888}
1361 1889
1362static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp, 1890static void ocfs2_put_write_source(struct page *page)
1363 struct page *page)
1364{ 1891{
1365 if (page) { 1892 if (page) {
1366 kunmap(page); 1893 kunmap(page);
@@ -1376,10 +1903,13 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1376{ 1903{
1377 int ret = 0; 1904 int ret = 0;
1378 ssize_t copied, total = 0; 1905 ssize_t copied, total = 0;
1379 size_t iov_offset = 0; 1906 size_t iov_offset = 0, bytes;
1907 loff_t pos;
1380 const struct iovec *cur_iov = iov; 1908 const struct iovec *cur_iov = iov;
1381 struct ocfs2_buffered_write_priv bp; 1909 struct page *user_page, *page;
1382 struct page *page; 1910 char * uninitialized_var(buf);
1911 char *dst;
1912 void *fsdata;
1383 1913
1384 /* 1914 /*
1385 * handle partial DIO write. Adjust cur_iov if needed. 1915 * handle partial DIO write. Adjust cur_iov if needed.
@@ -1387,21 +1917,38 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1387 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); 1917 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
1388 1918
1389 do { 1919 do {
1390 bp.b_cur_off = iov_offset; 1920 pos = *ppos;
1391 bp.b_cur_iov = cur_iov;
1392 1921
1393 page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); 1922 user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset);
1394 if (IS_ERR(page)) { 1923 if (IS_ERR(user_page)) {
1395 ret = PTR_ERR(page); 1924 ret = PTR_ERR(user_page);
1396 goto out; 1925 goto out;
1397 } 1926 }
1398 1927
1399 copied = ocfs2_buffered_write_cluster(file, *ppos, count, 1928 /* Stay within our page boundaries */
1400 ocfs2_map_and_write_user_data, 1929 bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)),
1401 &bp); 1930 (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK)));
1931 /* Stay within the vector boundary */
1932 bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset);
1933 /* Stay within count */
1934 bytes = min(bytes, count);
1935
1936 page = NULL;
1937 ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0,
1938 &page, &fsdata);
1939 if (ret) {
1940 mlog_errno(ret);
1941 goto out;
1942 }
1402 1943
1403 ocfs2_put_write_source(&bp, page); 1944 dst = kmap_atomic(page, KM_USER0);
1945 memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes);
1946 kunmap_atomic(dst, KM_USER0);
1947 flush_dcache_page(page);
1948 ocfs2_put_write_source(user_page);
1404 1949
1950 copied = ocfs2_write_end(file, file->f_mapping, pos, bytes,
1951 bytes, page, fsdata);
1405 if (copied < 0) { 1952 if (copied < 0) {
1406 mlog_errno(copied); 1953 mlog_errno(copied);
1407 ret = copied; 1954 ret = copied;
@@ -1409,7 +1956,7 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1409 } 1956 }
1410 1957
1411 total += copied; 1958 total += copied;
1412 *ppos = *ppos + copied; 1959 *ppos = pos + copied;
1413 count -= copied; 1960 count -= copied;
1414 1961
1415 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); 1962 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
@@ -1579,52 +2126,46 @@ static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
1579 struct pipe_buffer *buf, 2126 struct pipe_buffer *buf,
1580 struct splice_desc *sd) 2127 struct splice_desc *sd)
1581{ 2128{
1582 int ret, count, total = 0; 2129 int ret, count;
1583 ssize_t copied = 0; 2130 ssize_t copied = 0;
1584 struct ocfs2_splice_write_priv sp; 2131 struct file *file = sd->u.file;
2132 unsigned int offset;
2133 struct page *page = NULL;
2134 void *fsdata;
2135 char *src, *dst;
1585 2136
1586 ret = buf->ops->confirm(pipe, buf); 2137 ret = buf->ops->confirm(pipe, buf);
1587 if (ret) 2138 if (ret)
1588 goto out; 2139 goto out;
1589 2140
1590 sp.s_sd = sd; 2141 offset = sd->pos & ~PAGE_CACHE_MASK;
1591 sp.s_buf = buf;
1592 sp.s_pipe = pipe;
1593 sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
1594 sp.s_buf_offset = buf->offset;
1595
1596 count = sd->len; 2142 count = sd->len;
1597 if (count + sp.s_offset > PAGE_CACHE_SIZE) 2143 if (count + offset > PAGE_CACHE_SIZE)
1598 count = PAGE_CACHE_SIZE - sp.s_offset; 2144 count = PAGE_CACHE_SIZE - offset;
1599 2145
1600 do { 2146 ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0,
1601 /* 2147 &page, &fsdata);
1602 * splice wants us to copy up to one page at a 2148 if (ret) {
1603 * time. For pagesize > cluster size, this means we 2149 mlog_errno(ret);
1604 * might enter ocfs2_buffered_write_cluster() more 2150 goto out;
1605 * than once, so keep track of our progress here. 2151 }
1606 */
1607 copied = ocfs2_buffered_write_cluster(sd->u.file,
1608 (loff_t)sd->pos + total,
1609 count,
1610 ocfs2_map_and_write_splice_data,
1611 &sp);
1612 if (copied < 0) {
1613 mlog_errno(copied);
1614 ret = copied;
1615 goto out;
1616 }
1617 2152
1618 count -= copied; 2153 src = buf->ops->map(pipe, buf, 1);
1619 sp.s_offset += copied; 2154 dst = kmap_atomic(page, KM_USER1);
1620 sp.s_buf_offset += copied; 2155 memcpy(dst + offset, src + buf->offset, count);
1621 total += copied; 2156 kunmap_atomic(page, KM_USER1);
1622 } while (count); 2157 buf->ops->unmap(pipe, buf, src);
1623 2158
1624 ret = 0; 2159 copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count,
2160 page, fsdata);
2161 if (copied < 0) {
2162 mlog_errno(copied);
2163 ret = copied;
2164 goto out;
2165 }
1625out: 2166out:
1626 2167
1627 return total ? total : ret; 2168 return copied ? copied : ret;
1628} 2169}
1629 2170
1630static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, 2171static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
@@ -1811,6 +2352,7 @@ const struct inode_operations ocfs2_file_iops = {
1811 .setattr = ocfs2_setattr, 2352 .setattr = ocfs2_setattr,
1812 .getattr = ocfs2_getattr, 2353 .getattr = ocfs2_getattr,
1813 .permission = ocfs2_permission, 2354 .permission = ocfs2_permission,
2355 .fallocate = ocfs2_fallocate,
1814}; 2356};
1815 2357
1816const struct inode_operations ocfs2_special_file_iops = { 2358const struct inode_operations ocfs2_special_file_iops = {
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index a4dd1fa1822b..36fe27f268ee 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -39,15 +39,16 @@ enum ocfs2_alloc_restarted {
39}; 39};
40int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 40int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
41 struct inode *inode, 41 struct inode *inode,
42 u32 *cluster_start, 42 u32 *logical_offset,
43 u32 clusters_to_add, 43 u32 clusters_to_add,
44 int mark_unwritten,
44 struct buffer_head *fe_bh, 45 struct buffer_head *fe_bh,
45 handle_t *handle, 46 handle_t *handle,
46 struct ocfs2_alloc_context *data_ac, 47 struct ocfs2_alloc_context *data_ac,
47 struct ocfs2_alloc_context *meta_ac, 48 struct ocfs2_alloc_context *meta_ac,
48 enum ocfs2_alloc_restarted *reason); 49 enum ocfs2_alloc_restarted *reason_ret);
49int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, 50int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
50 u32 clusters_to_add, 51 u32 clusters_to_add, u32 extents_to_split,
51 struct ocfs2_alloc_context **data_ac, 52 struct ocfs2_alloc_context **data_ac,
52 struct ocfs2_alloc_context **meta_ac); 53 struct ocfs2_alloc_context **meta_ac);
53int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 54int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
@@ -61,4 +62,7 @@ int ocfs2_should_update_atime(struct inode *inode,
61int ocfs2_update_inode_atime(struct inode *inode, 62int ocfs2_update_inode_atime(struct inode *inode,
62 struct buffer_head *bh); 63 struct buffer_head *bh);
63 64
65int ocfs2_change_file_space(struct file *file, unsigned int cmd,
66 struct ocfs2_space_resv *sr);
67
64#endif /* OCFS2_FILE_H */ 68#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index b25ef63781ba..c4c36171240d 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -157,16 +157,16 @@ int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
157 if (ocfs2_mount_local(osb)) 157 if (ocfs2_mount_local(osb))
158 return 0; 158 return 0;
159 159
160 status = o2hb_register_callback(&osb->osb_hb_down); 160 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_down);
161 if (status < 0) { 161 if (status < 0) {
162 mlog_errno(status); 162 mlog_errno(status);
163 goto bail; 163 goto bail;
164 } 164 }
165 165
166 status = o2hb_register_callback(&osb->osb_hb_up); 166 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_up);
167 if (status < 0) { 167 if (status < 0) {
168 mlog_errno(status); 168 mlog_errno(status);
169 o2hb_unregister_callback(&osb->osb_hb_down); 169 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
170 } 170 }
171 171
172bail: 172bail:
@@ -178,8 +178,8 @@ void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
178 if (ocfs2_mount_local(osb)) 178 if (ocfs2_mount_local(osb))
179 return; 179 return;
180 180
181 o2hb_unregister_callback(&osb->osb_hb_down); 181 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
182 o2hb_unregister_callback(&osb->osb_hb_up); 182 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_up);
183} 183}
184 184
185void ocfs2_stop_heartbeat(struct ocfs2_super *osb) 185void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
@@ -209,7 +209,7 @@ void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
209 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; 209 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
210 envp[2] = NULL; 210 envp[2] = NULL;
211 211
212 ret = call_usermodehelper(argv[0], argv, envp, 1); 212 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
213 if (ret < 0) 213 if (ret < 0)
214 mlog_errno(ret); 214 mlog_errno(ret);
215} 215}
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index f3ad21ad9aed..87dcece7e1b5 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -14,6 +14,7 @@
14#include "ocfs2.h" 14#include "ocfs2.h"
15#include "alloc.h" 15#include "alloc.h"
16#include "dlmglue.h" 16#include "dlmglue.h"
17#include "file.h"
17#include "inode.h" 18#include "inode.h"
18#include "journal.h" 19#include "journal.h"
19 20
@@ -62,7 +63,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
62 goto bail_unlock; 63 goto bail_unlock;
63 64
64 status = -EACCES; 65 status = -EACCES;
65 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) 66 if (!is_owner_or_cap(inode))
66 goto bail_unlock; 67 goto bail_unlock;
67 68
68 if (!S_ISDIR(inode->i_mode)) 69 if (!S_ISDIR(inode->i_mode))
@@ -115,6 +116,7 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
115{ 116{
116 unsigned int flags; 117 unsigned int flags;
117 int status; 118 int status;
119 struct ocfs2_space_resv sr;
118 120
119 switch (cmd) { 121 switch (cmd) {
120 case OCFS2_IOC_GETFLAGS: 122 case OCFS2_IOC_GETFLAGS:
@@ -130,6 +132,14 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
130 132
131 return ocfs2_set_inode_attr(inode, flags, 133 return ocfs2_set_inode_attr(inode, flags,
132 OCFS2_FL_MODIFIABLE); 134 OCFS2_FL_MODIFIABLE);
135 case OCFS2_IOC_RESVSP:
136 case OCFS2_IOC_RESVSP64:
137 case OCFS2_IOC_UNRESVSP:
138 case OCFS2_IOC_UNRESVSP64:
139 if (copy_from_user(&sr, (int __user *) arg, sizeof(sr)))
140 return -EFAULT;
141
142 return ocfs2_change_file_space(filp, cmd, &sr);
133 default: 143 default:
134 return -ENOTTY; 144 return -ENOTTY;
135 } 145 }
@@ -148,6 +158,11 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
148 case OCFS2_IOC32_SETFLAGS: 158 case OCFS2_IOC32_SETFLAGS:
149 cmd = OCFS2_IOC_SETFLAGS; 159 cmd = OCFS2_IOC_SETFLAGS;
150 break; 160 break;
161 case OCFS2_IOC_RESVSP:
162 case OCFS2_IOC_RESVSP64:
163 case OCFS2_IOC_UNRESVSP:
164 case OCFS2_IOC_UNRESVSP64:
165 break;
151 default: 166 default:
152 return -ENOIOCTLCMD; 167 return -ENOIOCTLCMD;
153 } 168 }
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index dc1188081720..dbfb20bb27ea 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -722,8 +722,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
722 container_of(work, struct ocfs2_journal, j_recovery_work); 722 container_of(work, struct ocfs2_journal, j_recovery_work);
723 struct ocfs2_super *osb = journal->j_osb; 723 struct ocfs2_super *osb = journal->j_osb;
724 struct ocfs2_dinode *la_dinode, *tl_dinode; 724 struct ocfs2_dinode *la_dinode, *tl_dinode;
725 struct ocfs2_la_recovery_item *item; 725 struct ocfs2_la_recovery_item *item, *n;
726 struct list_head *p, *n;
727 LIST_HEAD(tmp_la_list); 726 LIST_HEAD(tmp_la_list);
728 727
729 mlog_entry_void(); 728 mlog_entry_void();
@@ -734,8 +733,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
734 list_splice_init(&journal->j_la_cleanups, &tmp_la_list); 733 list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
735 spin_unlock(&journal->j_lock); 734 spin_unlock(&journal->j_lock);
736 735
737 list_for_each_safe(p, n, &tmp_la_list) { 736 list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) {
738 item = list_entry(p, struct ocfs2_la_recovery_item, lri_list);
739 list_del_init(&item->lri_list); 737 list_del_init(&item->lri_list);
740 738
741 mlog(0, "Complete recovery for slot %d\n", item->lri_slot); 739 mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3db5de4506da..ce60aab013aa 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -289,6 +289,8 @@ int ocfs2_journal_dirty_data(handle_t *handle,
289#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ 289#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
290 + OCFS2_TRUNCATE_LOG_UPDATE) 290 + OCFS2_TRUNCATE_LOG_UPDATE)
291 291
292#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS)
293
292/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + 294/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
293 * bitmap block for the new bit) */ 295 * bitmap block for the new bit) */
294#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) 296#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index af01158b39f5..98756156d298 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -37,66 +37,182 @@
37 37
38#include "ocfs2.h" 38#include "ocfs2.h"
39 39
40#include "aops.h"
40#include "dlmglue.h" 41#include "dlmglue.h"
41#include "file.h" 42#include "file.h"
42#include "inode.h" 43#include "inode.h"
43#include "mmap.h" 44#include "mmap.h"
44 45
45static struct page *ocfs2_nopage(struct vm_area_struct * area, 46static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
46 unsigned long address, 47{
47 int *type) 48 /* The best way to deal with signals in the vm path is
49 * to block them upfront, rather than allowing the
50 * locking paths to return -ERESTARTSYS. */
51 sigfillset(blocked);
52
53 /* We should technically never get a bad return value
54 * from sigprocmask */
55 return sigprocmask(SIG_BLOCK, blocked, oldset);
56}
57
58static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
59{
60 return sigprocmask(SIG_SETMASK, oldset, NULL);
61}
62
63static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
48{ 64{
49 struct page *page = NOPAGE_SIGBUS;
50 sigset_t blocked, oldset; 65 sigset_t blocked, oldset;
66 int error, ret;
67
68 mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff);
69
70 error = ocfs2_vm_op_block_sigs(&blocked, &oldset);
71 if (error < 0) {
72 mlog_errno(error);
73 ret = VM_FAULT_SIGBUS;
74 goto out;
75 }
76
77 ret = filemap_fault(area, vmf);
78
79 error = ocfs2_vm_op_unblock_sigs(&oldset);
80 if (error < 0)
81 mlog_errno(error);
82out:
83 mlog_exit_ptr(vmf->page);
84 return ret;
85}
86
87static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
88 struct page *page)
89{
51 int ret; 90 int ret;
91 struct address_space *mapping = inode->i_mapping;
92 loff_t pos = page_offset(page);
93 unsigned int len = PAGE_CACHE_SIZE;
94 pgoff_t last_index;
95 struct page *locked_page = NULL;
96 void *fsdata;
97 loff_t size = i_size_read(inode);
52 98
53 mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address, 99 /*
54 type); 100 * Another node might have truncated while we were waiting on
101 * cluster locks.
102 */
103 last_index = size >> PAGE_CACHE_SHIFT;
104 if (page->index > last_index) {
105 ret = -EINVAL;
106 goto out;
107 }
55 108
56 /* The best way to deal with signals in this path is 109 /*
57 * to block them upfront, rather than allowing the 110 * The i_size check above doesn't catch the case where nodes
58 * locking paths to return -ERESTARTSYS. */ 111 * truncated and then re-extended the file. We'll re-check the
59 sigfillset(&blocked); 112 * page mapping after taking the page lock inside of
113 * ocfs2_write_begin_nolock().
114 */
115 if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
116 ret = -EINVAL;
117 goto out;
118 }
60 119
61 /* We should technically never get a bad ret return 120 /*
62 * from sigprocmask */ 121 * Call ocfs2_write_begin() and ocfs2_write_end() to take
63 ret = sigprocmask(SIG_BLOCK, &blocked, &oldset); 122 * advantage of the allocation code there. We pass a write
123 * length of the whole page (chopped to i_size) to make sure
124 * the whole thing is allocated.
125 *
126 * Since we know the page is up to date, we don't have to
127 * worry about ocfs2_write_begin() skipping some buffer reads
128 * because the "write" would invalidate their data.
129 */
130 if (page->index == last_index)
131 len = size & ~PAGE_CACHE_MASK;
132
133 ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
134 &fsdata, di_bh, page);
135 if (ret) {
136 if (ret != -ENOSPC)
137 mlog_errno(ret);
138 goto out;
139 }
140
141 ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
142 fsdata);
64 if (ret < 0) { 143 if (ret < 0) {
65 mlog_errno(ret); 144 mlog_errno(ret);
66 goto out; 145 goto out;
67 } 146 }
147 BUG_ON(ret != len);
148 ret = 0;
149out:
150 return ret;
151}
68 152
69 page = filemap_nopage(area, address, type); 153static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
154{
155 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
156 struct buffer_head *di_bh = NULL;
157 sigset_t blocked, oldset;
158 int ret, ret2;
70 159
71 ret = sigprocmask(SIG_SETMASK, &oldset, NULL); 160 ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
72 if (ret < 0) 161 if (ret < 0) {
73 mlog_errno(ret); 162 mlog_errno(ret);
163 return ret;
164 }
165
166 /*
167 * The cluster locks taken will block a truncate from another
168 * node. Taking the data lock will also ensure that we don't
169 * attempt page truncation as part of a downconvert.
170 */
171 ret = ocfs2_meta_lock(inode, &di_bh, 1);
172 if (ret < 0) {
173 mlog_errno(ret);
174 goto out;
175 }
176
177 /*
178 * The alloc sem should be enough to serialize with
179 * ocfs2_truncate_file() changing i_size as well as any thread
180 * modifying the inode btree.
181 */
182 down_write(&OCFS2_I(inode)->ip_alloc_sem);
183
184 ret = ocfs2_data_lock(inode, 1);
185 if (ret < 0) {
186 mlog_errno(ret);
187 goto out_meta_unlock;
188 }
189
190 ret = __ocfs2_page_mkwrite(inode, di_bh, page);
191
192 ocfs2_data_unlock(inode, 1);
193
194out_meta_unlock:
195 up_write(&OCFS2_I(inode)->ip_alloc_sem);
196
197 brelse(di_bh);
198 ocfs2_meta_unlock(inode, 1);
199
74out: 200out:
75 mlog_exit_ptr(page); 201 ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
76 return page; 202 if (ret2 < 0)
203 mlog_errno(ret2);
204
205 return ret;
77} 206}
78 207
79static struct vm_operations_struct ocfs2_file_vm_ops = { 208static struct vm_operations_struct ocfs2_file_vm_ops = {
80 .nopage = ocfs2_nopage, 209 .fault = ocfs2_fault,
210 .page_mkwrite = ocfs2_page_mkwrite,
81}; 211};
82 212
83int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) 213int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
84{ 214{
85 int ret = 0, lock_level = 0; 215 int ret = 0, lock_level = 0;
86 struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb);
87
88 /*
89 * Only support shared writeable mmap for local mounts which
90 * don't know about holes.
91 */
92 if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) &&
93 ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) &&
94 ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
95 mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
96 /* This is -EINVAL because generic_file_readonly_mmap
97 * returns it in a similar situation. */
98 return -EINVAL;
99 }
100 216
101 ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, 217 ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode,
102 file->f_vfsmnt, &lock_level); 218 file->f_vfsmnt, &lock_level);
@@ -107,6 +223,7 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
107 ocfs2_meta_unlock(file->f_dentry->d_inode, lock_level); 223 ocfs2_meta_unlock(file->f_dentry->d_inode, lock_level);
108out: 224out:
109 vma->vm_ops = &ocfs2_file_vm_ops; 225 vma->vm_ops = &ocfs2_file_vm_ops;
226 vma->vm_flags |= VM_CAN_NONLINEAR;
110 return 0; 227 return 0;
111} 228}
112 229
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 36289e6295ce..d430fdab16e9 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1674,7 +1674,7 @@ static int ocfs2_symlink(struct inode *dir,
1674 u32 offset = 0; 1674 u32 offset = 0;
1675 1675
1676 inode->i_op = &ocfs2_symlink_inode_operations; 1676 inode->i_op = &ocfs2_symlink_inode_operations;
1677 status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 1677 status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0,
1678 new_fe_bh, 1678 new_fe_bh,
1679 handle, data_ac, NULL, 1679 handle, data_ac, NULL,
1680 NULL); 1680 NULL);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index a860633e833f..5cc90a40b3c5 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -219,6 +219,7 @@ struct ocfs2_super
219 u16 max_slots; 219 u16 max_slots;
220 s16 node_num; 220 s16 node_num;
221 s16 slot_num; 221 s16 slot_num;
222 s16 preferred_slot;
222 int s_sectsize_bits; 223 int s_sectsize_bits;
223 int s_clustersize; 224 int s_clustersize;
224 int s_clustersize_bits; 225 int s_clustersize_bits;
@@ -305,6 +306,19 @@ static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
305 return 0; 306 return 0;
306} 307}
307 308
309static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
310{
311 /*
312 * Support for sparse files is a pre-requisite
313 */
314 if (!ocfs2_sparse_alloc(osb))
315 return 0;
316
317 if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_UNWRITTEN)
318 return 1;
319 return 0;
320}
321
308/* set / clear functions because cluster events can make these happen 322/* set / clear functions because cluster events can make these happen
309 * in parallel so we want the transitions to be atomic. this also 323 * in parallel so we want the transitions to be atomic. this also
310 * means that any future flags osb_flags must be protected by spinlock 324 * means that any future flags osb_flags must be protected by spinlock
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index f0d9eb08547a..82f8a75b207e 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -88,7 +88,7 @@
88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB 88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ 89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) 90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
91#define OCFS2_FEATURE_RO_COMPAT_SUPP 0 91#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
92 92
93/* 93/*
94 * Heartbeat-only devices are missing journals and other files. The 94 * Heartbeat-only devices are missing journals and other files. The
@@ -116,6 +116,11 @@
116 */ 116 */
117#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001 117#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001
118 118
119/*
120 * Unwritten extents support.
121 */
122#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001
123
119/* The byte offset of the first backup block will be 1G. 124/* The byte offset of the first backup block will be 1G.
120 * The following will be 4G, 16G, 64G, 256G and 1T. 125 * The following will be 4G, 16G, 64G, 256G and 1T.
121 */ 126 */
@@ -170,6 +175,32 @@
170#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int) 175#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
171 176
172/* 177/*
178 * Space reservation / allocation / free ioctls and argument structure
179 * are designed to be compatible with XFS.
180 *
181 * ALLOCSP* and FREESP* are not and will never be supported, but are
182 * included here for completeness.
183 */
184struct ocfs2_space_resv {
185 __s16 l_type;
186 __s16 l_whence;
187 __s64 l_start;
188 __s64 l_len; /* len == 0 means until end of file */
189 __s32 l_sysid;
190 __u32 l_pid;
191 __s32 l_pad[4]; /* reserve area */
192};
193
194#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
195#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
196#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
197#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
198#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
199#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
200#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
201#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
202
203/*
173 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 204 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
174 */ 205 */
175#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ 206#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index d8b79067dc14..af4882b62cfa 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -121,17 +121,25 @@ static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
121 return ret; 121 return ret;
122} 122}
123 123
124static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si) 124static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred)
125{ 125{
126 int i; 126 int i;
127 s16 ret = OCFS2_INVALID_SLOT; 127 s16 ret = OCFS2_INVALID_SLOT;
128 128
129 if (preferred >= 0 && preferred < si->si_num_slots) {
130 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[preferred]) {
131 ret = preferred;
132 goto out;
133 }
134 }
135
129 for(i = 0; i < si->si_num_slots; i++) { 136 for(i = 0; i < si->si_num_slots; i++) {
130 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) { 137 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
131 ret = (s16) i; 138 ret = (s16) i;
132 break; 139 break;
133 } 140 }
134 } 141 }
142out:
135 return ret; 143 return ret;
136} 144}
137 145
@@ -248,7 +256,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
248 if (slot == OCFS2_INVALID_SLOT) { 256 if (slot == OCFS2_INVALID_SLOT) {
249 /* if no slot yet, then just take 1st available 257 /* if no slot yet, then just take 1st available
250 * one. */ 258 * one. */
251 slot = __ocfs2_find_empty_slot(si); 259 slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
252 if (slot == OCFS2_INVALID_SLOT) { 260 if (slot == OCFS2_INVALID_SLOT) {
253 spin_unlock(&si->si_lock); 261 spin_unlock(&si->si_lock);
254 mlog(ML_ERROR, "no free slots available!\n"); 262 mlog(ML_ERROR, "no free slots available!\n");
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index e3437626d183..d9c5c9fcb30f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -98,14 +98,6 @@ static int ocfs2_relink_block_group(handle_t *handle,
98 u16 chain); 98 u16 chain);
99static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, 99static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
100 u32 wanted); 100 u32 wanted);
101static int ocfs2_free_suballoc_bits(handle_t *handle,
102 struct inode *alloc_inode,
103 struct buffer_head *alloc_bh,
104 unsigned int start_bit,
105 u64 bg_blkno,
106 unsigned int count);
107static inline u64 ocfs2_which_suballoc_group(u64 block,
108 unsigned int bit);
109static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, 101static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
110 u64 bg_blkno, 102 u64 bg_blkno,
111 u16 bg_bit_off); 103 u16 bg_bit_off);
@@ -496,13 +488,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
496 488
497 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); 489 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
498 (*ac)->ac_which = OCFS2_AC_USE_META; 490 (*ac)->ac_which = OCFS2_AC_USE_META;
499
500#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
501 slot = 0;
502#else
503 slot = osb->slot_num; 491 slot = osb->slot_num;
504#endif
505
506 (*ac)->ac_group_search = ocfs2_block_group_search; 492 (*ac)->ac_group_search = ocfs2_block_group_search;
507 493
508 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 494 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
@@ -1626,12 +1612,12 @@ bail:
1626/* 1612/*
1627 * expects the suballoc inode to already be locked. 1613 * expects the suballoc inode to already be locked.
1628 */ 1614 */
1629static int ocfs2_free_suballoc_bits(handle_t *handle, 1615int ocfs2_free_suballoc_bits(handle_t *handle,
1630 struct inode *alloc_inode, 1616 struct inode *alloc_inode,
1631 struct buffer_head *alloc_bh, 1617 struct buffer_head *alloc_bh,
1632 unsigned int start_bit, 1618 unsigned int start_bit,
1633 u64 bg_blkno, 1619 u64 bg_blkno,
1634 unsigned int count) 1620 unsigned int count)
1635{ 1621{
1636 int status = 0; 1622 int status = 0;
1637 u32 tmp_used; 1623 u32 tmp_used;
@@ -1703,13 +1689,6 @@ bail:
1703 return status; 1689 return status;
1704} 1690}
1705 1691
1706static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
1707{
1708 u64 group = block - (u64) bit;
1709
1710 return group;
1711}
1712
1713int ocfs2_free_dinode(handle_t *handle, 1692int ocfs2_free_dinode(handle_t *handle,
1714 struct inode *inode_alloc_inode, 1693 struct inode *inode_alloc_inode,
1715 struct buffer_head *inode_alloc_bh, 1694 struct buffer_head *inode_alloc_bh,
@@ -1723,19 +1702,6 @@ int ocfs2_free_dinode(handle_t *handle,
1723 inode_alloc_bh, bit, bg_blkno, 1); 1702 inode_alloc_bh, bit, bg_blkno, 1);
1724} 1703}
1725 1704
1726int ocfs2_free_extent_block(handle_t *handle,
1727 struct inode *eb_alloc_inode,
1728 struct buffer_head *eb_alloc_bh,
1729 struct ocfs2_extent_block *eb)
1730{
1731 u64 blk = le64_to_cpu(eb->h_blkno);
1732 u16 bit = le16_to_cpu(eb->h_suballoc_bit);
1733 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1734
1735 return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
1736 bit, bg_blkno, 1);
1737}
1738
1739int ocfs2_free_clusters(handle_t *handle, 1705int ocfs2_free_clusters(handle_t *handle,
1740 struct inode *bitmap_inode, 1706 struct inode *bitmap_inode,
1741 struct buffer_head *bitmap_bh, 1707 struct buffer_head *bitmap_bh,
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 1a3c94cb9250..f212dc01a84b 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -86,20 +86,29 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
86 u32 *cluster_start, 86 u32 *cluster_start,
87 u32 *num_clusters); 87 u32 *num_clusters);
88 88
89int ocfs2_free_suballoc_bits(handle_t *handle,
90 struct inode *alloc_inode,
91 struct buffer_head *alloc_bh,
92 unsigned int start_bit,
93 u64 bg_blkno,
94 unsigned int count);
89int ocfs2_free_dinode(handle_t *handle, 95int ocfs2_free_dinode(handle_t *handle,
90 struct inode *inode_alloc_inode, 96 struct inode *inode_alloc_inode,
91 struct buffer_head *inode_alloc_bh, 97 struct buffer_head *inode_alloc_bh,
92 struct ocfs2_dinode *di); 98 struct ocfs2_dinode *di);
93int ocfs2_free_extent_block(handle_t *handle,
94 struct inode *eb_alloc_inode,
95 struct buffer_head *eb_alloc_bh,
96 struct ocfs2_extent_block *eb);
97int ocfs2_free_clusters(handle_t *handle, 99int ocfs2_free_clusters(handle_t *handle,
98 struct inode *bitmap_inode, 100 struct inode *bitmap_inode,
99 struct buffer_head *bitmap_bh, 101 struct buffer_head *bitmap_bh,
100 u64 start_blk, 102 u64 start_blk,
101 unsigned int num_clusters); 103 unsigned int num_clusters);
102 104
105static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
106{
107 u64 group = block - (u64) bit;
108
109 return group;
110}
111
103static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb, 112static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb,
104 u64 bg_blkno) 113 u64 bg_blkno)
105{ 114{
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 86b559c7dce9..200c7d4790dc 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -82,7 +82,8 @@ MODULE_AUTHOR("Oracle");
82MODULE_LICENSE("GPL"); 82MODULE_LICENSE("GPL");
83 83
84static int ocfs2_parse_options(struct super_block *sb, char *options, 84static int ocfs2_parse_options(struct super_block *sb, char *options,
85 unsigned long *mount_opt, int is_remount); 85 unsigned long *mount_opt, s16 *slot,
86 int is_remount);
86static void ocfs2_put_super(struct super_block *sb); 87static void ocfs2_put_super(struct super_block *sb);
87static int ocfs2_mount_volume(struct super_block *sb); 88static int ocfs2_mount_volume(struct super_block *sb);
88static int ocfs2_remount(struct super_block *sb, int *flags, char *data); 89static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
@@ -114,8 +115,6 @@ static void ocfs2_write_super(struct super_block *sb);
114static struct inode *ocfs2_alloc_inode(struct super_block *sb); 115static struct inode *ocfs2_alloc_inode(struct super_block *sb);
115static void ocfs2_destroy_inode(struct inode *inode); 116static void ocfs2_destroy_inode(struct inode *inode);
116 117
117static unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
118
119static const struct super_operations ocfs2_sops = { 118static const struct super_operations ocfs2_sops = {
120 .statfs = ocfs2_statfs, 119 .statfs = ocfs2_statfs,
121 .alloc_inode = ocfs2_alloc_inode, 120 .alloc_inode = ocfs2_alloc_inode,
@@ -140,6 +139,7 @@ enum {
140 Opt_data_ordered, 139 Opt_data_ordered,
141 Opt_data_writeback, 140 Opt_data_writeback,
142 Opt_atime_quantum, 141 Opt_atime_quantum,
142 Opt_slot,
143 Opt_err, 143 Opt_err,
144}; 144};
145 145
@@ -154,6 +154,7 @@ static match_table_t tokens = {
154 {Opt_data_ordered, "data=ordered"}, 154 {Opt_data_ordered, "data=ordered"},
155 {Opt_data_writeback, "data=writeback"}, 155 {Opt_data_writeback, "data=writeback"},
156 {Opt_atime_quantum, "atime_quantum=%u"}, 156 {Opt_atime_quantum, "atime_quantum=%u"},
157 {Opt_slot, "preferred_slot=%u"},
157 {Opt_err, NULL} 158 {Opt_err, NULL}
158}; 159};
159 160
@@ -318,7 +319,7 @@ static void ocfs2_destroy_inode(struct inode *inode)
318/* From xfs_super.c:xfs_max_file_offset 319/* From xfs_super.c:xfs_max_file_offset
319 * Copyright (c) 2000-2004 Silicon Graphics, Inc. 320 * Copyright (c) 2000-2004 Silicon Graphics, Inc.
320 */ 321 */
321static unsigned long long ocfs2_max_file_offset(unsigned int blockshift) 322unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
322{ 323{
323 unsigned int pagefactor = 1; 324 unsigned int pagefactor = 1;
324 unsigned int bitshift = BITS_PER_LONG - 1; 325 unsigned int bitshift = BITS_PER_LONG - 1;
@@ -355,9 +356,10 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
355 int incompat_features; 356 int incompat_features;
356 int ret = 0; 357 int ret = 0;
357 unsigned long parsed_options; 358 unsigned long parsed_options;
359 s16 slot;
358 struct ocfs2_super *osb = OCFS2_SB(sb); 360 struct ocfs2_super *osb = OCFS2_SB(sb);
359 361
360 if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { 362 if (!ocfs2_parse_options(sb, data, &parsed_options, &slot, 1)) {
361 ret = -EINVAL; 363 ret = -EINVAL;
362 goto out; 364 goto out;
363 } 365 }
@@ -534,6 +536,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
534 struct dentry *root; 536 struct dentry *root;
535 int status, sector_size; 537 int status, sector_size;
536 unsigned long parsed_opt; 538 unsigned long parsed_opt;
539 s16 slot;
537 struct inode *inode = NULL; 540 struct inode *inode = NULL;
538 struct ocfs2_super *osb = NULL; 541 struct ocfs2_super *osb = NULL;
539 struct buffer_head *bh = NULL; 542 struct buffer_head *bh = NULL;
@@ -541,7 +544,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
541 544
542 mlog_entry("%p, %p, %i", sb, data, silent); 545 mlog_entry("%p, %p, %i", sb, data, silent);
543 546
544 if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) { 547 if (!ocfs2_parse_options(sb, data, &parsed_opt, &slot, 0)) {
545 status = -EINVAL; 548 status = -EINVAL;
546 goto read_super_error; 549 goto read_super_error;
547 } 550 }
@@ -571,6 +574,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
571 brelse(bh); 574 brelse(bh);
572 bh = NULL; 575 bh = NULL;
573 osb->s_mount_opt = parsed_opt; 576 osb->s_mount_opt = parsed_opt;
577 osb->preferred_slot = slot;
574 578
575 sb->s_magic = OCFS2_SUPER_MAGIC; 579 sb->s_magic = OCFS2_SUPER_MAGIC;
576 580
@@ -713,6 +717,7 @@ static struct file_system_type ocfs2_fs_type = {
713static int ocfs2_parse_options(struct super_block *sb, 717static int ocfs2_parse_options(struct super_block *sb,
714 char *options, 718 char *options,
715 unsigned long *mount_opt, 719 unsigned long *mount_opt,
720 s16 *slot,
716 int is_remount) 721 int is_remount)
717{ 722{
718 int status; 723 int status;
@@ -722,6 +727,7 @@ static int ocfs2_parse_options(struct super_block *sb,
722 options ? options : "(none)"); 727 options ? options : "(none)");
723 728
724 *mount_opt = 0; 729 *mount_opt = 0;
730 *slot = OCFS2_INVALID_SLOT;
725 731
726 if (!options) { 732 if (!options) {
727 status = 1; 733 status = 1;
@@ -782,6 +788,15 @@ static int ocfs2_parse_options(struct super_block *sb,
782 else 788 else
783 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 789 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
784 break; 790 break;
791 case Opt_slot:
792 option = 0;
793 if (match_int(&args[0], &option)) {
794 status = 0;
795 goto bail;
796 }
797 if (option)
798 *slot = (s16)option;
799 break;
785 default: 800 default:
786 mlog(ML_ERROR, 801 mlog(ML_ERROR,
787 "Unrecognized mount option \"%s\" " 802 "Unrecognized mount option \"%s\" "
@@ -969,7 +984,7 @@ static int ocfs2_initialize_mem_caches(void)
969 0, 984 0,
970 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 985 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
971 SLAB_MEM_SPREAD), 986 SLAB_MEM_SPREAD),
972 ocfs2_inode_init_once, NULL); 987 ocfs2_inode_init_once);
973 if (!ocfs2_inode_cachep) 988 if (!ocfs2_inode_cachep)
974 return -ENOMEM; 989 return -ENOMEM;
975 990
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 783f5270f2a1..3b9cb3d0b008 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -45,4 +45,6 @@ void __ocfs2_abort(struct super_block *sb,
45 45
46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) 46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
47 47
48unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
49
48#endif /* OCFS2_SUPER_H */ 50#endif /* OCFS2_SUPER_H */
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 39814b900fc0..4da8851f2b23 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -548,7 +548,7 @@ int __init init_ocfs2_uptodate_cache(void)
548{ 548{
549 ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate", 549 ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
550 sizeof(struct ocfs2_meta_cache_item), 550 sizeof(struct ocfs2_meta_cache_item),
551 0, SLAB_HWCACHE_ALIGN, NULL, NULL); 551 0, SLAB_HWCACHE_ALIGN, NULL);
552 if (!ocfs2_uptodate_cachep) 552 if (!ocfs2_uptodate_cachep)
553 return -ENOMEM; 553 return -ENOMEM;
554 554