| author | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-04-27 13:29:56 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-04-27 13:29:56 -0400 |
| commit | ea6db58f3ea55f413c882095d2afaea8137f4f8c (patch) | |
| tree | 9f7509b5dfe0fdd422b3e2b3a98ed8321d796c66 | |
| parent | c58b8e4a25a1ba347a0e5d21984c97bd296f1691 (diff) | |
| parent | 83418978827324918a8cd25ce5227312de1d4468 (diff) | |
Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2
* 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2: (27 commits)
ocfs2: Cache extent records
ocfs2: Remember rw lock level during direct io
ocfs2: Fix up i_blocks calculation to know about holes
ocfs2: Fix extent lookup to return true size of holes
ocfs2: Read from an unwritten extent returns zeros
ocfs2: make room for unwritten extents flag
ocfs2: Use own splice write actor
ocfs2: Use do_sync_mapping_range() in ocfs2_zero_tail_for_truncate()
[PATCH] Turn do_sync_file_range() into do_sync_mapping_range()
ocfs2: zero tail of sparse files on truncate
ocfs2: Teach ocfs2_get_block() about holes
ocfs2: remove ocfs2_prepare_write() and ocfs2_commit_write()
ocfs2: teach ocfs2_file_aio_write() about sparse files
ocfs2: Turn off shared writeable mmap for local file systems with holes.
ocfs2: abstract out allocation locking
ocfs2: teach extend/truncate about sparse files
ocfs2: temporarily remove extent map caching
ocfs2: sparse b-tree support
ocfs2: small cleanup of ocfs2_request_delete()
ocfs2: remove unused code
...
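
Two entries in the shortlog above — "[PATCH] Turn do_sync_file_range() into do_sync_mapping_range()" and the ocfs2_zero_tail_for_truncate() change — rework the generic range-sync helper so it takes an address_space rather than a struct file. A rough illustration of the resulting call pattern follows; this is a hedged sketch based on the 2.6.22-era API, not code copied from this merge, and `zero_tail_example()` is an invented name:

```c
#include <linux/fs.h>

/*
 * Hypothetical caller: flush a just-zeroed tail range straight from
 * the inode's mapping. do_sync_mapping_range() needs no struct file,
 * which is what lets ocfs2_zero_tail_for_truncate() use it.
 */
static int zero_tail_example(struct inode *inode, loff_t start, loff_t end)
{
	return do_sync_mapping_range(inode->i_mapping, start, end - 1,
				     SYNC_FILE_RANGE_WRITE);
}
```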
| -rw-r--r-- | fs/ocfs2/alloc.c                | 3037 |
| -rw-r--r-- | fs/ocfs2/alloc.h                |   27 |
| -rw-r--r-- | fs/ocfs2/aops.c                 | 1011 |
| -rw-r--r-- | fs/ocfs2/aops.h                 |   77 |
| -rw-r--r-- | fs/ocfs2/cluster/quorum.c       |    5 |
| -rw-r--r-- | fs/ocfs2/cluster/tcp_internal.h |    5 |
| -rw-r--r-- | fs/ocfs2/dir.c                  |   15 |
| -rw-r--r-- | fs/ocfs2/dlm/dlmdomain.c        |    5 |
| -rw-r--r-- | fs/ocfs2/dlm/dlmrecovery.c      |    2 |
| -rw-r--r-- | fs/ocfs2/dlmglue.c              |  143 |
| -rw-r--r-- | fs/ocfs2/dlmglue.h              |    3 |
| -rw-r--r-- | fs/ocfs2/extent_map.c           | 1233 |
| -rw-r--r-- | fs/ocfs2/extent_map.h           |   39 |
| -rw-r--r-- | fs/ocfs2/file.c                 |  637 |
| -rw-r--r-- | fs/ocfs2/file.h                 |    5 |
| -rw-r--r-- | fs/ocfs2/inode.c                |  199 |
| -rw-r--r-- | fs/ocfs2/inode.h                |   23 |
| -rw-r--r-- | fs/ocfs2/journal.c              |   24 |
| -rw-r--r-- | fs/ocfs2/journal.h              |    2 |
| -rw-r--r-- | fs/ocfs2/mmap.c                 |    7 |
| -rw-r--r-- | fs/ocfs2/namei.c                |   23 |
| -rw-r--r-- | fs/ocfs2/ocfs2.h                |   55 |
| -rw-r--r-- | fs/ocfs2/ocfs2_fs.h             |   31 |
| -rw-r--r-- | fs/ocfs2/ocfs2_lockid.h         |    5 |
| -rw-r--r-- | fs/ocfs2/slot_map.c             |    2 |
| -rw-r--r-- | fs/ocfs2/suballoc.c             |    3 |
| -rw-r--r-- | fs/ocfs2/super.c                |    7 |
| -rw-r--r-- | fs/ocfs2/vote.c                 |  289 |
| -rw-r--r-- | fs/ocfs2/vote.h                 |    3 |
| -rw-r--r-- | fs/sync.c                       |    8 |
| -rw-r--r-- | include/linux/fs.h              |    9 |

31 files changed, 4697 insertions, 2237 deletions
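
Most of the fs/ocfs2/alloc.c changes below replace one-off tree walks with a reusable btree path abstraction (struct ocfs2_path). As a hedged sketch of how the pieces introduced in the diff fit together — using only names defined in the diff (ocfs2_new_inode_path(), ocfs2_find_path(), path_leaf_el(), ocfs2_free_path()), with the cluster locking and journaling that real callers need omitted, and `example_lookup_leaf()` being an invented illustration rather than a function in the diff:

```c
/*
 * Sketch: resolve the leaf extent list covering logical cluster `cpos`
 * for an inode whose tree root lives in the dinode buffer di_bh.
 */
static int example_lookup_leaf(struct inode *inode,
			       struct buffer_head *di_bh, u32 cpos)
{
	struct ocfs2_path *path;
	struct ocfs2_extent_list *el;
	int ret;

	path = ocfs2_new_inode_path(di_bh);	/* p_node[0] = dinode root */
	if (!path)
		return -ENOMEM;

	ret = ocfs2_find_path(inode, path, cpos); /* fills p_node[1..depth] */
	if (ret == 0) {
		el = path_leaf_el(path);	/* leaf ocfs2_extent_list */
		/* ... scan el->l_recs[] for the record containing cpos ... */
	}

	ocfs2_free_path(path);	/* brelse()s every buffer head in the path */
	return ret;
}
```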
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f27e5378caf2..a0c8667caa72 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -27,6 +27,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
+#include <linux/swap.h>
 
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -34,6 +35,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "aops.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "inode.h"
@@ -47,63 +49,243 @@
 
 #include "buffer_head_io.h"
 
-static int ocfs2_extent_contig(struct inode *inode,
-			       struct ocfs2_extent_rec *ext,
-			       u64 blkno);
+static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
 
-static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
-				     handle_t *handle,
-				     struct inode *inode,
-				     int wanted,
-				     struct ocfs2_alloc_context *meta_ac,
-				     struct buffer_head *bhs[]);
+/*
+ * Structures which describe a path through a btree, and functions to
+ * manipulate them.
+ *
+ * The idea here is to be as generic as possible with the tree
+ * manipulation code.
+ */
+struct ocfs2_path_item {
+	struct buffer_head *bh;
+	struct ocfs2_extent_list *el;
+};
 
-static int ocfs2_add_branch(struct ocfs2_super *osb,
-			    handle_t *handle,
-			    struct inode *inode,
-			    struct buffer_head *fe_bh,
-			    struct buffer_head *eb_bh,
-			    struct buffer_head *last_eb_bh,
-			    struct ocfs2_alloc_context *meta_ac);
+#define OCFS2_MAX_PATH_DEPTH	5
 
-static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
-				  handle_t *handle,
-				  struct inode *inode,
-				  struct buffer_head *fe_bh,
-				  struct ocfs2_alloc_context *meta_ac,
-				  struct buffer_head **ret_new_eb_bh);
+struct ocfs2_path {
+	int			p_tree_depth;
+	struct ocfs2_path_item	p_node[OCFS2_MAX_PATH_DEPTH];
+};
 
-static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
-				  handle_t *handle,
-				  struct inode *inode,
-				  struct buffer_head *fe_bh,
-				  u64 blkno,
-				  u32 new_clusters);
+#define path_root_bh(_path) ((_path)->p_node[0].bh)
+#define path_root_el(_path) ((_path)->p_node[0].el)
+#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
+#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
+#define path_num_items(_path) ((_path)->p_tree_depth + 1)
 
-static int ocfs2_find_branch_target(struct ocfs2_super *osb,
-				    struct inode *inode,
-				    struct buffer_head *fe_bh,
-				    struct buffer_head **target_bh);
+/*
+ * Reset the actual path elements so that we can re-use the structure
+ * to build another path. Generally, this involves freeing the buffer
+ * heads.
+ */
+static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
+{
+	int i, start = 0, depth = 0;
+	struct ocfs2_path_item *node;
 
-static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
-				       struct inode *inode,
-				       struct ocfs2_dinode *fe,
-				       unsigned int new_i_clusters,
-				       struct buffer_head *old_last_eb,
-				       struct buffer_head **new_last_eb);
+	if (keep_root)
+		start = 1;
+
+	for(i = start; i < path_num_items(path); i++) {
+		node = &path->p_node[i];
+
+		brelse(node->bh);
+		node->bh = NULL;
+		node->el = NULL;
+	}
+
+	/*
+	 * Tree depth may change during truncate, or insert. If we're
+	 * keeping the root extent list, then make sure that our path
+	 * structure reflects the proper depth.
+	 */
+	if (keep_root)
+		depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
+
+	path->p_tree_depth = depth;
+}
+
+static void ocfs2_free_path(struct ocfs2_path *path)
+{
+	if (path) {
+		ocfs2_reinit_path(path, 0);
+		kfree(path);
+	}
+}
+
+/*
+ * Make the *dest path the same as src and re-initialize src path to
+ * have a root only.
+ */
+static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
+{
+	int i;
+
+	BUG_ON(path_root_bh(dest) != path_root_bh(src));
+
+	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
+		brelse(dest->p_node[i].bh);
+
+		dest->p_node[i].bh = src->p_node[i].bh;
+		dest->p_node[i].el = src->p_node[i].el;
+
+		src->p_node[i].bh = NULL;
+		src->p_node[i].el = NULL;
+	}
+}
+
+/*
+ * Insert an extent block at given index.
+ *
+ * This will not take an additional reference on eb_bh.
+ */
+static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
+					struct buffer_head *eb_bh)
+{
+	struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
+
+	/*
+	 * Right now, no root bh is an extent block, so this helps
+	 * catch code errors with dinode trees. The assertion can be
+	 * safely removed if we ever need to insert extent block
+	 * structures at the root.
+	 */
+	BUG_ON(index == 0);
+
+	path->p_node[index].bh = eb_bh;
+	path->p_node[index].el = &eb->h_list;
+}
+
+static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
+					 struct ocfs2_extent_list *root_el)
+{
+	struct ocfs2_path *path;
+
+	BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
+
+	path = kzalloc(sizeof(*path), GFP_NOFS);
+	if (path) {
+		path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
+		get_bh(root_bh);
+		path_root_bh(path) = root_bh;
+		path_root_el(path) = root_el;
+	}
+
+	return path;
+}
+
+/*
+ * Allocate and initialize a new path based on a disk inode tree.
+ */
+static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_extent_list *el = &di->id2.i_list;
+
+	return ocfs2_new_path(di_bh, el);
+}
+
+/*
+ * Convenience function to journal all components in a path.
+ */
+static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
+				     struct ocfs2_path *path)
+{
+	int i, ret = 0;
+
+	if (!path)
+		goto out;
+
+	for(i = 0; i < path_num_items(path); i++) {
+		ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+out:
+	return ret;
+}
+
+enum ocfs2_contig_type {
+	CONTIG_NONE = 0,
+	CONTIG_LEFT,
+	CONTIG_RIGHT
+};
 
-static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
 
-static int ocfs2_extent_contig(struct inode *inode,
-			       struct ocfs2_extent_rec *ext,
-			       u64 blkno)
+/*
+ * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
+ * ocfs2_extent_contig only work properly against leaf nodes!
+ */
+static int ocfs2_block_extent_contig(struct super_block *sb,
+				     struct ocfs2_extent_rec *ext,
+				     u64 blkno)
+{
+	u64 blk_end = le64_to_cpu(ext->e_blkno);
+
+	blk_end += ocfs2_clusters_to_blocks(sb,
+					    le16_to_cpu(ext->e_leaf_clusters));
+
+	return blkno == blk_end;
+}
+
+static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
+				  struct ocfs2_extent_rec *right)
+{
+	u32 left_range;
+
+	left_range = le32_to_cpu(left->e_cpos) +
+		le16_to_cpu(left->e_leaf_clusters);
+
+	return (left_range == le32_to_cpu(right->e_cpos));
+}
+
+static enum ocfs2_contig_type
+	ocfs2_extent_contig(struct inode *inode,
+			    struct ocfs2_extent_rec *ext,
+			    struct ocfs2_extent_rec *insert_rec)
 {
-	return blkno == (le64_to_cpu(ext->e_blkno) +
-			 ocfs2_clusters_to_blocks(inode->i_sb,
-						  le32_to_cpu(ext->e_clusters)));
+	u64 blkno = le64_to_cpu(insert_rec->e_blkno);
+
+	if (ocfs2_extents_adjacent(ext, insert_rec) &&
+	    ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
+			return CONTIG_RIGHT;
+
+	blkno = le64_to_cpu(ext->e_blkno);
+	if (ocfs2_extents_adjacent(insert_rec, ext) &&
+	    ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
+		return CONTIG_LEFT;
+
+	return CONTIG_NONE;
 }
 
 /*
+ * NOTE: We can have pretty much any combination of contiguousness and
+ * appending.
+ *
+ * The usefulness of APPEND_TAIL is more in that it lets us know that
+ * we'll have to update the path to that leaf.
+ */
+enum ocfs2_append_type {
+	APPEND_NONE = 0,
+	APPEND_TAIL,
+};
+
+struct ocfs2_insert_type {
+	enum ocfs2_append_type	ins_appending;
+	enum ocfs2_contig_type	ins_contig;
+	int			ins_contig_index;
+	int			ins_free_records;
+	int			ins_tree_depth;
+};
+
+/*
  * How many free extents have we got before we need more meta data?
  */
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
@@ -242,6 +424,28 @@ bail:
 }
 
 /*
+ * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
+ *
+ * Returns the sum of the rightmost extent rec logical offset and
+ * cluster count.
+ *
+ * ocfs2_add_branch() uses this to determine what logical cluster
+ * value should be populated into the leftmost new branch records.
+ *
+ * ocfs2_shift_tree_depth() uses this to determine the # clusters
+ * value for the new topmost tree record.
+ */
+static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
+{
+	int i;
+
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+
+	return le32_to_cpu(el->l_recs[i].e_cpos) +
+		ocfs2_rec_clusters(el, &el->l_recs[i]);
+}
+
+/*
  * Add an entire tree branch to our inode. eb_bh is the extent block
  * to start at, if we don't want to start the branch at the dinode
  * structure.
@@ -250,7 +454,7 @@ bail:
  * for the new last extent block.
  *
  * the new branch will be 'empty' in the sense that every block will
- * contain a single record with e_clusters == 0.
+ * contain a single record with cluster count == 0.
  */
 static int ocfs2_add_branch(struct ocfs2_super *osb,
 			    handle_t *handle,
@@ -268,6 +472,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *eb_el;
 	struct ocfs2_extent_list *el;
+	u32 new_cpos;
 
 	mlog_entry_void();
 
@@ -302,6 +507,9 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 		goto bail;
 	}
 
+	eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
+
 	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
 	 * linked with the rest of the tree.
 	 * conversly, new_eb_bhs[0] is the new bottommost leaf.
@@ -330,9 +538,18 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 		eb->h_next_leaf_blk = 0;
 		eb_el->l_tree_depth = cpu_to_le16(i);
 		eb_el->l_next_free_rec = cpu_to_le16(1);
-		eb_el->l_recs[0].e_cpos = fe->i_clusters;
+		/*
+		 * This actually counts as an empty extent as
+		 * c_clusters == 0
+		 */
+		eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
 		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
-		eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
+		/*
+		 * eb_el isn't always an interior node, but even leaf
+		 * nodes want a zero'd flags and reserved field so
+		 * this gets the whole 32 bits regardless of use.
+		 */
+		eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
 		if (!eb_el->l_tree_depth)
 			new_last_eb_blk = le64_to_cpu(eb->h_blkno);
 
@@ -376,8 +593,8 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	 * either be on the fe, or the extent block passed in. */
 	i = le16_to_cpu(el->l_next_free_rec);
 	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
-	el->l_recs[i].e_cpos = fe->i_clusters;
-	el->l_recs[i].e_clusters = 0;
+	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
+	el->l_recs[i].e_int_clusters = 0;
 	le16_add_cpu(&el->l_next_free_rec, 1);
 
 	/* fe needs a new last extent block pointer, as does the
@@ -425,6 +642,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 				  struct buffer_head **ret_new_eb_bh)
 {
 	int status, i;
+	u32 new_clusters;
 	struct buffer_head *new_eb_bh = NULL;
 	struct ocfs2_dinode *fe;
 	struct ocfs2_extent_block *eb;
@@ -461,11 +679,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 	/* copy the fe data into the new extent block */
 	eb_el->l_tree_depth = fe_el->l_tree_depth;
 	eb_el->l_next_free_rec = fe_el->l_next_free_rec;
-	for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
-		eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
-		eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
-		eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
-	}
+	for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
+		eb_el->l_recs[i] = fe_el->l_recs[i];
 
 	status = ocfs2_journal_dirty(handle, new_eb_bh);
 	if (status < 0) {
@@ -480,16 +695,15 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 		goto bail;
 	}
 
+	new_clusters = ocfs2_sum_rightmost_rec(eb_el);
+
 	/* update fe now */
 	le16_add_cpu(&fe_el->l_tree_depth, 1);
 	fe_el->l_recs[0].e_cpos = 0;
 	fe_el->l_recs[0].e_blkno = eb->h_blkno;
-	fe_el->l_recs[0].e_clusters = fe->i_clusters;
-	for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
-		fe_el->l_recs[i].e_cpos = 0;
-		fe_el->l_recs[i].e_clusters = 0;
-		fe_el->l_recs[i].e_blkno = 0;
-	}
+	fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
+	for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
+		memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
 	fe_el->l_next_free_rec = cpu_to_le16(1);
 
 	/* If this is our 1st tree depth shift, then last_eb_blk
@@ -515,199 +729,6 @@ bail:
 
 
 /*
- * Expects the tree to already have room in the rightmost leaf for the
- * extent. Updates all the extent blocks (and the dinode) on the way
- * down.
- */
-static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
-				  handle_t *handle,
-				  struct inode *inode,
-				  struct buffer_head *fe_bh,
-				  u64 start_blk,
-				  u32 new_clusters)
-{
-	int status, i, num_bhs = 0;
-	u64 next_blkno;
-	u16 next_free;
-	struct buffer_head **eb_bhs = NULL;
-	struct ocfs2_dinode *fe;
-	struct ocfs2_extent_block *eb;
-	struct ocfs2_extent_list *el;
-
-	mlog_entry_void();
-
-	status = ocfs2_journal_access(handle, inode, fe_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	fe = (struct ocfs2_dinode *) fe_bh->b_data;
-	el = &fe->id2.i_list;
-	if (el->l_tree_depth) {
-		/* This is another operation where we want to be
-		 * careful about our tree updates. An error here means
-		 * none of the previous changes we made should roll
-		 * forward. As a result, we have to record the buffers
-		 * for this part of the tree in an array and reserve a
-		 * journal write to them before making any changes. */
-		num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
-		eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
-				 GFP_KERNEL);
-		if (!eb_bhs) {
-			status = -ENOMEM;
-			mlog_errno(status);
-			goto bail;
-		}
-
-		i = 0;
-		while(el->l_tree_depth) {
-			next_free = le16_to_cpu(el->l_next_free_rec);
-			if (next_free == 0) {
-				ocfs2_error(inode->i_sb,
-					    "Dinode %llu has a bad extent list",
-					    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-				status = -EIO;
-				goto bail;
-			}
-			next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
-
-			BUG_ON(i >= num_bhs);
-			status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
-						  OCFS2_BH_CACHED, inode);
-			if (status < 0) {
-				mlog_errno(status);
-				goto bail;
-			}
-			eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
-			if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-				OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
-								 eb);
-				status = -EIO;
-				goto bail;
-			}
-
-			status = ocfs2_journal_access(handle, inode, eb_bhs[i],
-						      OCFS2_JOURNAL_ACCESS_WRITE);
-			if (status < 0) {
-				mlog_errno(status);
-				goto bail;
-			}
-
-			el = &eb->h_list;
-			i++;
-			/* When we leave this loop, eb_bhs[num_bhs - 1] will
-			 * hold the bottom-most leaf extent block. */
-		}
-		BUG_ON(el->l_tree_depth);
-
-		el = &fe->id2.i_list;
-		/* If we have tree depth, then the fe update is
-		 * trivial, and we want to switch el out for the
-		 * bottom-most leaf in order to update it with the
-		 * actual extent data below. */
-		next_free = le16_to_cpu(el->l_next_free_rec);
-		if (next_free == 0) {
-			ocfs2_error(inode->i_sb,
-				    "Dinode %llu has a bad extent list",
-				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-			status = -EIO;
-			goto bail;
-		}
-		le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
-			     new_clusters);
-		/* (num_bhs - 1) to avoid the leaf */
-		for(i = 0; i < (num_bhs - 1); i++) {
-			eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
-			el = &eb->h_list;
-
-			/* finally, make our actual change to the
-			 * intermediate extent blocks. */
-			next_free = le16_to_cpu(el->l_next_free_rec);
-			le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
-				     new_clusters);
-
-			status = ocfs2_journal_dirty(handle, eb_bhs[i]);
-			if (status < 0)
-				mlog_errno(status);
-		}
-		BUG_ON(i != (num_bhs - 1));
-		/* note that the leaf block wasn't touched in
-		 * the loop above */
-		eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
-		el = &eb->h_list;
-		BUG_ON(el->l_tree_depth);
-	}
-
-	/* yay, we can finally add the actual extent now! */
-	i = le16_to_cpu(el->l_next_free_rec) - 1;
-	if (le16_to_cpu(el->l_next_free_rec) &&
-	    ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
-		le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
-	} else if (le16_to_cpu(el->l_next_free_rec) &&
-		   (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
-		/* having an empty extent at eof is legal. */
-		if (el->l_recs[i].e_cpos != fe->i_clusters) {
-			ocfs2_error(inode->i_sb,
-				    "Dinode %llu trailing extent is bad: "
-				    "cpos (%u) != number of clusters (%u)",
-				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-				    le32_to_cpu(el->l_recs[i].e_cpos),
-				    le32_to_cpu(fe->i_clusters));
-			status = -EIO;
-			goto bail;
-		}
-		el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
-		el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
-	} else {
-		/* No contiguous record, or no empty record at eof, so
-		 * we add a new one. */
-
-		BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
-		       le16_to_cpu(el->l_count));
-		i = le16_to_cpu(el->l_next_free_rec);
-
-		el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
-		el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
-		el->l_recs[i].e_cpos = fe->i_clusters;
-		le16_add_cpu(&el->l_next_free_rec, 1);
-	}
-
-	/*
-	 * extent_map errors are not fatal, so they are ignored outside
-	 * of flushing the thing.
-	 */
-	status = ocfs2_extent_map_append(inode, &el->l_recs[i],
-					 new_clusters);
-	if (status) {
-		mlog_errno(status);
-		ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
-	}
-
-	status = ocfs2_journal_dirty(handle, fe_bh);
-	if (status < 0)
-		mlog_errno(status);
-	if (fe->id2.i_list.l_tree_depth) {
-		status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
-		if (status < 0)
-			mlog_errno(status);
-	}
-
-	status = 0;
-bail:
-	if (eb_bhs) {
-		for (i = 0; i < num_bhs; i++)
-			if (eb_bhs[i])
-				brelse(eb_bhs[i]);
-		kfree(eb_bhs);
-	}
-
-	mlog_exit(status);
-	return status;
-}
-
-/*
  * Should only be called when there is no space left in any of the
  * leaf nodes. What we want to do is find the lowest tree depth
  * non-leaf extent block with room for new records. There are three
| @@ -807,53 +828,1548 @@ bail: | |||
| 807 | return status; | 828 | return status; |
| 808 | } | 829 | } |
| 809 | 830 | ||
| 810 | /* the caller needs to update fe->i_clusters */ | 831 | /* |
| 811 | int ocfs2_insert_extent(struct ocfs2_super *osb, | 832 | * This is only valid for leaf nodes, which are the only ones that can |
| 812 | handle_t *handle, | 833 | * have empty extents anyway. |
| 813 | struct inode *inode, | 834 | */ |
| 814 | struct buffer_head *fe_bh, | 835 | static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) |
| 815 | u64 start_blk, | ||
| 816 | u32 new_clusters, | ||
| 817 | struct ocfs2_alloc_context *meta_ac) | ||
| 818 | { | 836 | { |
| 819 | int status, i, shift; | 837 | return !rec->e_leaf_clusters; |
| 820 | struct buffer_head *last_eb_bh = NULL; | 838 | } |
| 839 | |||
| 840 | /* | ||
| 841 | * This function will discard the rightmost extent record. | ||
| 842 | */ | ||
| 843 | static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) | ||
| 844 | { | ||
| 845 | int next_free = le16_to_cpu(el->l_next_free_rec); | ||
| 846 | int count = le16_to_cpu(el->l_count); | ||
| 847 | unsigned int num_bytes; | ||
| 848 | |||
| 849 | BUG_ON(!next_free); | ||
| 850 | /* This will cause us to go off the end of our extent list. */ | ||
| 851 | BUG_ON(next_free >= count); | ||
| 852 | |||
| 853 | num_bytes = sizeof(struct ocfs2_extent_rec) * next_free; | ||
| 854 | |||
| 855 | memmove(&el->l_recs[1], &el->l_recs[0], num_bytes); | ||
| 856 | } | ||
| 857 | |||
| 858 | static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, | ||
| 859 | struct ocfs2_extent_rec *insert_rec) | ||
| 860 | { | ||
| 861 | int i, insert_index, next_free, has_empty, num_bytes; | ||
| 862 | u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos); | ||
| 863 | struct ocfs2_extent_rec *rec; | ||
| 864 | |||
| 865 | next_free = le16_to_cpu(el->l_next_free_rec); | ||
| 866 | has_empty = ocfs2_is_empty_extent(&el->l_recs[0]); | ||
| 867 | |||
| 868 | BUG_ON(!next_free); | ||
| 869 | |||
| 870 | /* The tree code before us didn't allow enough room in the leaf. */ | ||
| 871 | if (el->l_next_free_rec == el->l_count && !has_empty) | ||
| 872 | BUG(); | ||
| 873 | |||
| 874 | /* | ||
| 875 | * The easiest way to approach this is to just remove the | ||
| 876 | * empty extent and temporarily decrement next_free. | ||
| 877 | */ | ||
| 878 | if (has_empty) { | ||
| 879 | /* | ||
| 880 | * If next_free was 1 (only an empty extent), this | ||
| 881 | * loop won't execute, which is fine. We still want | ||
| 882 | * the decrement above to happen. | ||
| 883 | */ | ||
| 884 | for(i = 0; i < (next_free - 1); i++) | ||
| 885 | el->l_recs[i] = el->l_recs[i+1]; | ||
| 886 | |||
| 887 | next_free--; | ||
| 888 | } | ||
| 889 | |||
| 890 | /* | ||
| 891 | * Figure out what the new record index should be. | ||
| 892 | */ | ||
| 893 | for(i = 0; i < next_free; i++) { | ||
| 894 | rec = &el->l_recs[i]; | ||
| 895 | |||
| 896 | if (insert_cpos < le32_to_cpu(rec->e_cpos)) | ||
| 897 | break; | ||
| 898 | } | ||
| 899 | insert_index = i; | ||
| 900 | |||
| 901 | mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n", | ||
| 902 | insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count)); | ||
| 903 | |||
| 904 | BUG_ON(insert_index < 0); | ||
| 905 | BUG_ON(insert_index >= le16_to_cpu(el->l_count)); | ||
| 906 | BUG_ON(insert_index > next_free); | ||
| 907 | |||
| 908 | /* | ||
| 909 | * No need to memmove if we're just adding to the tail. | ||
| 910 | */ | ||
| 911 | if (insert_index != next_free) { | ||
| 912 | BUG_ON(next_free >= le16_to_cpu(el->l_count)); | ||
| 913 | |||
| 914 | num_bytes = next_free - insert_index; | ||
| 915 | num_bytes *= sizeof(struct ocfs2_extent_rec); | ||
| 916 | memmove(&el->l_recs[insert_index + 1], | ||
| 917 | &el->l_recs[insert_index], | ||
| 918 | num_bytes); | ||
| 919 | } | ||
| 920 | |||
| 921 | /* | ||
| 922 | * Either we had an empty extent, and need to re-increment or | ||
| 923 | * there was no empty extent on a non full rightmost leaf node, | ||
| 924 | * in which case we still need to increment. | ||
| 925 | */ | ||
| 926 | next_free++; | ||
| 927 | el->l_next_free_rec = cpu_to_le16(next_free); | ||
| 928 | /* | ||
| 929 | * Make sure none of the math above just messed up our tree. | ||
| 930 | */ | ||
| 931 | BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)); | ||
| 932 | |||
| 933 | el->l_recs[insert_index] = *insert_rec; | ||
| 934 | |||
| 935 | } | ||
| 936 | |||
| 937 | /* | ||
| 938 | * Create an empty extent record . | ||
| 939 | * | ||
| 940 | * l_next_free_rec may be updated. | ||
| 941 | * | ||
| 942 | * If an empty extent already exists do nothing. | ||
| 943 | */ | ||
| 944 | static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el) | ||
| 945 | { | ||
| 946 | int next_free = le16_to_cpu(el->l_next_free_rec); | ||
| 947 | |||
| 948 | BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); | ||
| 949 | |||
| 950 | if (next_free == 0) | ||
| 951 | goto set_and_inc; | ||
| 952 | |||
| 953 | if (ocfs2_is_empty_extent(&el->l_recs[0])) | ||
| 954 | return; | ||
| 955 | |||
| 956 | mlog_bug_on_msg(el->l_count == el->l_next_free_rec, | ||
| 957 | "Asked to create an empty extent in a full list:\n" | ||
| 958 | "count = %u, tree depth = %u", | ||
| 959 | le16_to_cpu(el->l_count), | ||
| 960 | le16_to_cpu(el->l_tree_depth)); | ||
| 961 | |||
| 962 | ocfs2_shift_records_right(el); | ||
| 963 | |||
| 964 | set_and_inc: | ||
| 965 | le16_add_cpu(&el->l_next_free_rec, 1); | ||
| 966 | memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); | ||
| 967 | } | ||
| 968 | |||
| 969 | /* | ||
| 970 | * For a rotation which involves two leaf nodes, the "root node" is | ||
| 971 | * the lowest level tree node which contains a path to both leafs. This | ||
| 972 | * resulting set of information can be used to form a complete "subtree" | ||
| 973 | * | ||
| 974 | * This function is passed two full paths from the dinode down to a | ||
| 975 | * pair of adjacent leaves. It's task is to figure out which path | ||
| 976 | * index contains the subtree root - this can be the root index itself | ||
| 977 | * in a worst-case rotation. | ||
| 978 | * | ||
| 979 | * The array index of the subtree root is passed back. | ||
| 980 | */ | ||
| 981 | static int ocfs2_find_subtree_root(struct inode *inode, | ||
| 982 | struct ocfs2_path *left, | ||
| 983 | struct ocfs2_path *right) | ||
| 984 | { | ||
| 985 | int i = 0; | ||
| 986 | |||
| 987 | /* | ||
| 988 | * Check that the caller passed in two paths from the same tree. | ||
| 989 | */ | ||
| 990 | BUG_ON(path_root_bh(left) != path_root_bh(right)); | ||
| 991 | |||
| 992 | do { | ||
| 993 | i++; | ||
| 994 | |||
| 995 | /* | ||
| 996 | * The caller didn't pass two adjacent paths. | ||
| 997 | */ | ||
| 998 | mlog_bug_on_msg(i > left->p_tree_depth, | ||
| 999 | "Inode %lu, left depth %u, right depth %u\n" | ||
| 1000 | "left leaf blk %llu, right leaf blk %llu\n", | ||
| 1001 | inode->i_ino, left->p_tree_depth, | ||
| 1002 | right->p_tree_depth, | ||
| 1003 | (unsigned long long)path_leaf_bh(left)->b_blocknr, | ||
| 1004 | (unsigned long long)path_leaf_bh(right)->b_blocknr); | ||
| 1005 | } while (left->p_node[i].bh->b_blocknr == | ||
| 1006 | right->p_node[i].bh->b_blocknr); | ||
| 1007 | |||
| 1008 | return i - 1; | ||
| 1009 | } | ||
| 1010 | |||
| 1011 | typedef void (path_insert_t)(void *, struct buffer_head *); | ||
| 1012 | |||
| 1013 | /* | ||
| 1014 | * Traverse a btree path in search of cpos, starting at root_el. | ||
| 1015 | * | ||
| 1016 | * This code can be called with a cpos larger than the tree, in which | ||
| 1017 | * case it will return the rightmost path. | ||
| 1018 | */ | ||
| 1019 | static int __ocfs2_find_path(struct inode *inode, | ||
| 1020 | struct ocfs2_extent_list *root_el, u32 cpos, | ||
| 1021 | path_insert_t *func, void *data) | ||
| 1022 | { | ||
| 1023 | int i, ret = 0; | ||
| 1024 | u32 range; | ||
| 1025 | u64 blkno; | ||
| 821 | struct buffer_head *bh = NULL; | 1026 | struct buffer_head *bh = NULL; |
| 822 | struct ocfs2_dinode *fe; | ||
| 823 | struct ocfs2_extent_block *eb; | 1027 | struct ocfs2_extent_block *eb; |
| 824 | struct ocfs2_extent_list *el; | 1028 | struct ocfs2_extent_list *el; |
| 1029 | struct ocfs2_extent_rec *rec; | ||
| 1030 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
| 825 | 1031 | ||
| 826 | mlog_entry_void(); | 1032 | el = root_el; |
| 1033 | while (el->l_tree_depth) { | ||
| 1034 | if (le16_to_cpu(el->l_next_free_rec) == 0) { | ||
| 1035 | ocfs2_error(inode->i_sb, | ||
| 1036 | "Inode %llu has empty extent list at " | ||
| 1037 | "depth %u\n", | ||
| 1038 | (unsigned long long)oi->ip_blkno, | ||
| 1039 | le16_to_cpu(el->l_tree_depth)); | ||
| 1040 | ret = -EROFS; | ||
| 1041 | goto out; | ||
| 827 | 1042 | ||
| 828 | mlog(0, "add %u clusters starting at block %llu to inode %llu\n", | 1043 | } |
| 829 | new_clusters, (unsigned long long)start_blk, | ||
| 830 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
| 831 | 1044 | ||
| 832 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | 1045 | for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) { |
| 833 | el = &fe->id2.i_list; | 1046 | rec = &el->l_recs[i]; |
| 1047 | |||
| 1048 | /* | ||
| 1049 | * In the case that cpos is off the allocation | ||
| 1050 | * tree, this should just wind up returning the | ||
| 1051 | * rightmost record. | ||
| 1052 | */ | ||
| 1053 | range = le32_to_cpu(rec->e_cpos) + | ||
| 1054 | ocfs2_rec_clusters(el, rec); | ||
| 1055 | if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) | ||
| 1056 | break; | ||
| 1057 | } | ||
| 834 | 1058 | ||
| 835 | if (el->l_tree_depth) { | 1059 | blkno = le64_to_cpu(el->l_recs[i].e_blkno); |
| 836 | /* jump to end of tree */ | 1060 | if (blkno == 0) { |
| 837 | status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), | 1061 | ocfs2_error(inode->i_sb, |
| 838 | &last_eb_bh, OCFS2_BH_CACHED, inode); | 1062 | "Inode %llu has bad blkno in extent list " |
| 839 | if (status < 0) { | 1063 | "at depth %u (index %d)\n", |
| 840 | mlog_exit(status); | 1064 | (unsigned long long)oi->ip_blkno, |
| 841 | goto bail; | 1065 | le16_to_cpu(el->l_tree_depth), i); |
| 1066 | ret = -EROFS; | ||
| 1067 | goto out; | ||
| 842 | } | 1068 | } |
| 843 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | 1069 | |
| 1070 | brelse(bh); | ||
| 1071 | bh = NULL; | ||
| 1072 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, | ||
| 1073 | &bh, OCFS2_BH_CACHED, inode); | ||
| 1074 | if (ret) { | ||
| 1075 | mlog_errno(ret); | ||
| 1076 | goto out; | ||
| 1077 | } | ||
| 1078 | |||
| 1079 | eb = (struct ocfs2_extent_block *) bh->b_data; | ||
| 844 | el = &eb->h_list; | 1080 | el = &eb->h_list; |
| 1081 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
| 1082 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
| 1083 | ret = -EIO; | ||
| 1084 | goto out; | ||
| 1085 | } | ||
| 1086 | |||
| 1087 | if (le16_to_cpu(el->l_next_free_rec) > | ||
| 1088 | le16_to_cpu(el->l_count)) { | ||
| 1089 | ocfs2_error(inode->i_sb, | ||
| 1090 | "Inode %llu has bad count in extent list " | ||
| 1091 | "at block %llu (next free=%u, count=%u)\n", | ||
| 1092 | (unsigned long long)oi->ip_blkno, | ||
| 1093 | (unsigned long long)bh->b_blocknr, | ||
| 1094 | le16_to_cpu(el->l_next_free_rec), | ||
| 1095 | le16_to_cpu(el->l_count)); | ||
| 1096 | ret = -EROFS; | ||
| 1097 | goto out; | ||
| 1098 | } | ||
| 1099 | |||
| 1100 | if (func) | ||
| 1101 | func(data, bh); | ||
| 1102 | } | ||
| 1103 | |||
| 1104 | out: | ||
| 1105 | /* | ||
| 1106 | * Catch any trailing bh that the loop didn't handle. | ||
| 1107 | */ | ||
| 1108 | brelse(bh); | ||
| 1109 | |||
| 1110 | return ret; | ||
| 1111 | } | ||
| 1112 | |||
| 1113 | /* | ||
| 1114 | * Given an initialized path (that is, it has a valid root extent | ||
| 1115 | * list), this function will traverse the btree in search of the path | ||
| 1116 | * which would contain cpos. | ||
| 1117 | * | ||
| 1118 | * The path traveled is recorded in the path structure. | ||
| 1119 | * | ||
| 1120 | * Note that this will not do any comparisons on leaf node extent | ||
| 1121 | * records, so it will work fine in the case that we just added a tree | ||
| 1122 | * branch. | ||
| 1123 | */ | ||
| 1124 | struct find_path_data { | ||
| 1125 | int index; | ||
| 1126 | struct ocfs2_path *path; | ||
| 1127 | }; | ||
| 1128 | static void find_path_ins(void *data, struct buffer_head *bh) | ||
| 1129 | { | ||
| 1130 | struct find_path_data *fp = data; | ||
| 1131 | |||
| 1132 | get_bh(bh); | ||
| 1133 | ocfs2_path_insert_eb(fp->path, fp->index, bh); | ||
| 1134 | fp->index++; | ||
| 1135 | } | ||
| 1136 | static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, | ||
| 1137 | u32 cpos) | ||
| 1138 | { | ||
| 1139 | struct find_path_data data; | ||
| 1140 | |||
| 1141 | data.index = 1; | ||
| 1142 | data.path = path; | ||
| 1143 | return __ocfs2_find_path(inode, path_root_el(path), cpos, | ||
| 1144 | find_path_ins, &data); | ||
| 1145 | } | ||
| 1146 | |||
| 1147 | static void find_leaf_ins(void *data, struct buffer_head *bh) | ||
| 1148 | { | ||
| 1149 | struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data; | ||
| 1150 | struct ocfs2_extent_list *el = &eb->h_list; | ||
| 1151 | struct buffer_head **ret = data; | ||
| 1152 | |||
| 1153 | /* We want to retain only the leaf block. */ | ||
| 1154 | if (le16_to_cpu(el->l_tree_depth) == 0) { | ||
| 1155 | get_bh(bh); | ||
| 1156 | *ret = bh; | ||
| 1157 | } | ||
| 1158 | } | ||
| 1159 | /* | ||
| 1160 | * Find the leaf block in the tree which would contain cpos. No | ||
| 1161 | * checking of the actual leaf is done. | ||
| 1162 | * | ||
| 1163 | * Some paths want to call this instead of allocating a path structure | ||
| 1164 | * and calling ocfs2_find_path(). | ||
| 1165 | * | ||
| 1166 | * This function doesn't handle non btree extent lists. | ||
| 1167 | */ | ||
| 1168 | int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, | ||
| 1169 | u32 cpos, struct buffer_head **leaf_bh) | ||
| 1170 | { | ||
| 1171 | int ret; | ||
| 1172 | struct buffer_head *bh = NULL; | ||
| 1173 | |||
| 1174 | ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh); | ||
| 1175 | if (ret) { | ||
| 1176 | mlog_errno(ret); | ||
| 1177 | goto out; | ||
| 1178 | } | ||
| 1179 | |||
| 1180 | *leaf_bh = bh; | ||
| 1181 | out: | ||
| 1182 | return ret; | ||
| 1183 | } | ||
| 1184 | |||
| 1185 | /* | ||
| 1186 | * Adjust the adjacent records (left_rec, right_rec) involved in a rotation. | ||
| 1187 | * | ||
| 1188 | * Basically, we've moved stuff around at the bottom of the tree and | ||
| 1189 | * we need to fix up the extent records above the changes to reflect | ||
| 1190 | * the new changes. | ||
| 1191 | * | ||
| 1192 | * left_rec: the record on the left. | ||
| 1193 | * left_child_el: is the child list pointed to by left_rec | ||
| 1194 | * right_rec: the record to the right of left_rec | ||
| 1195 | * right_child_el: is the child list pointed to by right_rec | ||
| 1196 | * | ||
| 1197 | * By definition, this only works on interior nodes. | ||
| 1198 | */ | ||
| 1199 | static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, | ||
| 1200 | struct ocfs2_extent_list *left_child_el, | ||
| 1201 | struct ocfs2_extent_rec *right_rec, | ||
| 1202 | struct ocfs2_extent_list *right_child_el) | ||
| 1203 | { | ||
| 1204 | u32 left_clusters, right_end; | ||
| 1205 | |||
| 1206 | /* | ||
| 1207 | * Interior nodes never have holes. Their cpos is the cpos of | ||
| 1208 | * the leftmost record in their child list. Their cluster | ||
| 1209 | * count covers the full theoretical range of their child list | ||
| 1210 | * - the range between their cpos and the cpos of the record | ||
| 1211 | * immediately to their right. | ||
| 1212 | */ | ||
| 1213 | left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); | ||
| 1214 | left_clusters -= le32_to_cpu(left_rec->e_cpos); | ||
| 1215 | left_rec->e_int_clusters = cpu_to_le32(left_clusters); | ||
| 1216 | |||
| 1217 | /* | ||
| 1218 | * Calculate the rightmost cluster count boundary before | ||
| 1219 | * moving cpos - we will need to adjust clusters after | ||
| 1220 | * updating e_cpos to keep the same highest cluster count. | ||
| 1221 | */ | ||
| 1222 | right_end = le32_to_cpu(right_rec->e_cpos); | ||
| 1223 | right_end += le32_to_cpu(right_rec->e_int_clusters); | ||
| 1224 | |||
| 1225 | right_rec->e_cpos = left_rec->e_cpos; | ||
| 1226 | le32_add_cpu(&right_rec->e_cpos, left_clusters); | ||
| 1227 | |||
| 1228 | right_end -= le32_to_cpu(right_rec->e_cpos); | ||
| 1229 | right_rec->e_int_clusters = cpu_to_le32(right_end); | ||
| 1230 | } | ||
| 1231 | |||
| 1232 | /* | ||
| 1233 | * Adjust the adjacent root node records involved in a | ||
| 1234 | * rotation. left_el_blkno is passed in as a key so that we can easily | ||
| 1235 | * find it's index in the root list. | ||
| 1236 | */ | ||
| 1237 | static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, | ||
| 1238 | struct ocfs2_extent_list *left_el, | ||
| 1239 | struct ocfs2_extent_list *right_el, | ||
| 1240 | u64 left_el_blkno) | ||
| 1241 | { | ||
| 1242 | int i; | ||
| 1243 | |||
| 1244 | BUG_ON(le16_to_cpu(root_el->l_tree_depth) <= | ||
| 1245 | le16_to_cpu(left_el->l_tree_depth)); | ||
| 1246 | |||
| 1247 | for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) { | ||
| 1248 | if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno) | ||
| 1249 | break; | ||
| 1250 | } | ||
| 1251 | |||
| 1252 | /* | ||
| 1253 | * The path walking code should have never returned a root and | ||
| 1254 | * two paths which are not adjacent. | ||
| 1255 | */ | ||
| 1256 | BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); | ||
| 1257 | |||
| 1258 | ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, | ||
| 1259 | &root_el->l_recs[i + 1], right_el); | ||
| 1260 | } | ||
| 1261 | |||
| 1262 | /* | ||
| 1263 | * We've changed a leaf block (in right_path) and need to reflect that | ||
| 1264 | * change back up the subtree. | ||
| 1265 | * | ||
| 1266 | * This happens in multiple places: | ||
| 1267 | * - When we've moved an extent record from the left path leaf to the right | ||
| 1268 | * path leaf to make room for an empty extent in the left path leaf. | ||
| 1269 | * - When our insert into the right path leaf is at the leftmost edge | ||
| 1270 | * and requires an update of the path immediately to it's left. This | ||
| 1271 | * can occur at the end of some types of rotation and appending inserts. | ||
| 1272 | */ | ||
| 1273 | static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, | ||
| 1274 | struct ocfs2_path *left_path, | ||
| 1275 | struct ocfs2_path *right_path, | ||
| 1276 | int subtree_index) | ||
| 1277 | { | ||
| 1278 | int ret, i, idx; | ||
| 1279 | struct ocfs2_extent_list *el, *left_el, *right_el; | ||
| 1280 | struct ocfs2_extent_rec *left_rec, *right_rec; | ||
| 1281 | struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; | ||
| 1282 | |||
| 1283 | /* | ||
| 1284 | * Update the counts and position values within all the | ||
| 1285 | * interior nodes to reflect the leaf rotation we just did. | ||
| 1286 | * | ||
| 1287 | * The root node is handled below the loop. | ||
| 1288 | * | ||
| 1289 | * We begin the loop with right_el and left_el pointing to the | ||
| 1290 | * leaf lists and work our way up. | ||
| 1291 | * | ||
| 1292 | * NOTE: within this loop, left_el and right_el always refer | ||
| 1293 | * to the *child* lists. | ||
| 1294 | */ | ||
| 1295 | left_el = path_leaf_el(left_path); | ||
| 1296 | right_el = path_leaf_el(right_path); | ||
| 1297 | for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) { | ||
| 1298 | mlog(0, "Adjust records at index %u\n", i); | ||
| 1299 | |||
| 1300 | /* | ||
| 1301 | * One nice property of knowing that all of these | ||
| 1302 | * nodes are below the root is that we only deal with | ||
| 1303 | * the leftmost right node record and the rightmost | ||
| 1304 | * left node record. | ||
| 1305 | */ | ||
| 1306 | el = left_path->p_node[i].el; | ||
| 1307 | idx = le16_to_cpu(left_el->l_next_free_rec) - 1; | ||
| 1308 | left_rec = &el->l_recs[idx]; | ||
| 1309 | |||
| 1310 | el = right_path->p_node[i].el; | ||
| 1311 | right_rec = &el->l_recs[0]; | ||
| 1312 | |||
| 1313 | ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, | ||
| 1314 | right_el); | ||
| 1315 | |||
| 1316 | ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); | ||
| 1317 | if (ret) | ||
| 1318 | mlog_errno(ret); | ||
| 1319 | |||
| 1320 | ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh); | ||
| 1321 | if (ret) | ||
| 1322 | mlog_errno(ret); | ||
| 1323 | |||
| 1324 | /* | ||
| 1325 | * Setup our list pointers now so that the current | ||
| 1326 | * parents become children in the next iteration. | ||
| 1327 | */ | ||
| 1328 | left_el = left_path->p_node[i].el; | ||
| 1329 | right_el = right_path->p_node[i].el; | ||
| 1330 | } | ||
| 1331 | |||
| 1332 | /* | ||
| 1333 | * At the root node, adjust the two adjacent records which | ||
| 1334 | * begin our path to the leaves. | ||
| 1335 | */ | ||
| 1336 | |||
| 1337 | el = left_path->p_node[subtree_index].el; | ||
| 1338 | left_el = left_path->p_node[subtree_index + 1].el; | ||
| 1339 | right_el = right_path->p_node[subtree_index + 1].el; | ||
| 1340 | |||
| 1341 | ocfs2_adjust_root_records(el, left_el, right_el, | ||
| 1342 | left_path->p_node[subtree_index + 1].bh->b_blocknr); | ||
| 1343 | |||
| 1344 | root_bh = left_path->p_node[subtree_index].bh; | ||
| 1345 | |||
| 1346 | ret = ocfs2_journal_dirty(handle, root_bh); | ||
| 1347 | if (ret) | ||
| 1348 | mlog_errno(ret); | ||
| 1349 | } | ||
| 1350 | |||
| 1351 | static int ocfs2_rotate_subtree_right(struct inode *inode, | ||
| 1352 | handle_t *handle, | ||
| 1353 | struct ocfs2_path *left_path, | ||
| 1354 | struct ocfs2_path *right_path, | ||
| 1355 | int subtree_index) | ||
| 1356 | { | ||
| 1357 | int ret, i; | ||
| 1358 | struct buffer_head *right_leaf_bh; | ||
| 1359 | struct buffer_head *left_leaf_bh = NULL; | ||
| 1360 | struct buffer_head *root_bh; | ||
| 1361 | struct ocfs2_extent_list *right_el, *left_el; | ||
| 1362 | struct ocfs2_extent_rec move_rec; | ||
| 1363 | |||
| 1364 | left_leaf_bh = path_leaf_bh(left_path); | ||
| 1365 | left_el = path_leaf_el(left_path); | ||
| 1366 | |||
| 1367 | if (left_el->l_next_free_rec != left_el->l_count) { | ||
| 1368 | ocfs2_error(inode->i_sb, | ||
| 1369 | "Inode %llu has non-full interior leaf node %llu" | ||
| 1370 | "(next free = %u)", | ||
| 1371 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
| 1372 | (unsigned long long)left_leaf_bh->b_blocknr, | ||
| 1373 | le16_to_cpu(left_el->l_next_free_rec)); | ||
| 1374 | return -EROFS; | ||
| 1375 | } | ||
| 1376 | |||
| 1377 | /* | ||
| 1378 | * This extent block may already have an empty record, so we | ||
| 1379 | * return early if so. | ||
| 1380 | */ | ||
| 1381 | if (ocfs2_is_empty_extent(&left_el->l_recs[0])) | ||
| 1382 | return 0; | ||
| 1383 | |||
| 1384 | root_bh = left_path->p_node[subtree_index].bh; | ||
| 1385 | BUG_ON(root_bh != right_path->p_node[subtree_index].bh); | ||
| 1386 | |||
| 1387 | ret = ocfs2_journal_access(handle, inode, root_bh, | ||
| 1388 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
| 1389 | if (ret) { | ||
| 1390 | mlog_errno(ret); | ||
| 1391 | goto out; | ||
| 1392 | } | ||
| 1393 | |||
| 1394 | for(i = subtree_index + 1; i < path_num_items(right_path); i++) { | ||
| 1395 | ret = ocfs2_journal_access(handle, inode, | ||
| 1396 | right_path->p_node[i].bh, | ||
| 1397 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
| 1398 | if (ret) { | ||
| 1399 | mlog_errno(ret); | ||
| 1400 | goto out; | ||
| 1401 | } | ||
| 1402 | |||
| 1403 | ret = ocfs2_journal_access(handle, inode, | ||
| 1404 | left_path->p_node[i].bh, | ||
| 1405 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
| 1406 | if (ret) { | ||
| 1407 | mlog_errno(ret); | ||
| 1408 | goto out; | ||
| 1409 | } | ||
| 1410 | } | ||
| 1411 | |||
| 1412 | right_leaf_bh = path_leaf_bh(right_path); | ||
| 1413 | right_el = path_leaf_el(right_path); | ||
| 1414 | |||
| 1415 | /* This is a code error, not a disk corruption. */ | ||
| 1416 | mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails " | ||
| 1417 | "because rightmost leaf block %llu is empty\n", | ||
| 1418 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
| 1419 | (unsigned long long)right_leaf_bh->b_blocknr); | ||
| 1420 | |||
| 1421 | ocfs2_create_empty_extent(right_el); | ||
| 1422 | |||
| 1423 | ret = ocfs2_journal_dirty(handle, right_leaf_bh); | ||
| 1424 | if (ret) { | ||
| 1425 | mlog_errno(ret); | ||
| 1426 | goto out; | ||
| 1427 | } | ||
| 1428 | |||
| 1429 | /* Do the copy now. */ | ||
| 1430 | i = le16_to_cpu(left_el->l_next_free_rec) - 1; | ||
| 1431 | move_rec = left_el->l_recs[i]; | ||
| 1432 | right_el->l_recs[0] = move_rec; | ||
| 1433 | |||
| 1434 | /* | ||
| 1435 | * Clear out the record we just copied and shift everything | ||
| 1436 | * over, leaving an empty extent in the left leaf. | ||
| 1437 | * | ||
| 1438 | * We temporarily subtract from next_free_rec so that the | ||
| 1439 | * shift will lose the tail record (which is now defunct). | ||
| 1440 | */ | ||
| 1441 | le16_add_cpu(&left_el->l_next_free_rec, -1); | ||
| 1442 | ocfs2_shift_records_right(left_el); | ||
| 1443 | memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); | ||
| 1444 | le16_add_cpu(&left_el->l_next_free_rec, 1); | ||
| 1445 | |||
| 1446 | ret = ocfs2_journal_dirty(handle, left_leaf_bh); | ||
| 1447 | if (ret) { | ||
| 1448 | mlog_errno(ret); | ||
| 1449 | goto out; | ||
| 1450 | } | ||
| 1451 | |||
| 1452 | ocfs2_complete_edge_insert(inode, handle, left_path, right_path, | ||
| 1453 | subtree_index); | ||
| 1454 | |||
| 1455 | out: | ||
| 1456 | return ret; | ||
| 1457 | } | ||
| 1458 | |||
| 1459 | /* | ||
| 1460 | * Given a full path, determine what cpos value would return us a path | ||
| 1461 | * containing the leaf immediately to the left of the current one. | ||
| 1462 | * | ||
| 1463 | * Will return zero if the path passed in is already the leftmost path. | ||
| 1464 | */ | ||
| 1465 | static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, | ||
| 1466 | struct ocfs2_path *path, u32 *cpos) | ||
| 1467 | { | ||
| 1468 | int i, j, ret = 0; | ||
| 1469 | u64 blkno; | ||
| 1470 | struct ocfs2_extent_list *el; | ||
| 1471 | |||
| 1472 | BUG_ON(path->p_tree_depth == 0); | ||
| 1473 | |||
| 1474 | *cpos = 0; | ||
| 1475 | |||
| 1476 | blkno = path_leaf_bh(path)->b_blocknr; | ||
| 1477 | |||
| 1478 | /* Start at the tree node just above the leaf and work our way up. */ | ||
| 1479 | i = path->p_tree_depth - 1; | ||
| 1480 | while (i >= 0) { | ||
| 1481 | el = path->p_node[i].el; | ||
| 1482 | |||
| 1483 | /* | ||
| 1484 | * Find the extent record just before the one in our | ||
| 1485 | * path. | ||
| 1486 | */ | ||
| 1487 | for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { | ||
| 1488 | if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { | ||
| 1489 | if (j == 0) { | ||
| 1490 | if (i == 0) { | ||
| 1491 | /* | ||
| 1492 | * We've determined that the | ||
| 1493 | * path specified is already | ||
| 1494 | * the leftmost one - return a | ||
| 1495 | * cpos of zero. | ||
| 1496 | */ | ||
| 1497 | goto out; | ||
| 1498 | } | ||
| 1499 | /* | ||
| 1500 | * The leftmost record points to our | ||
| 1501 | * leaf - we need to travel up the | ||
| 1502 | * tree one level. | ||
| 1503 | */ | ||
| 1504 | goto next_node; | ||
| 1505 | } | ||
| 1506 | |||
| 1507 | *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos); | ||
| 1508 | *cpos = *cpos + ocfs2_rec_clusters(el, | ||
| 1509 | &el->l_recs[j - 1]); | ||
| 1510 | *cpos = *cpos - 1; | ||
| 1511 | goto out; | ||
| 1512 | } | ||
| 1513 | } | ||
| 1514 | |||
| 1515 | /* | ||
| 1516 | * If we got here, we never found a valid node where | ||
| 1517 | * the tree indicated one should be. | ||
| 1518 | */ | ||
| 1519 | ocfs2_error(sb, | ||
| 1520 | "Invalid extent tree at extent block %llu\n", | ||
| 1521 | (unsigned long long)blkno); | ||
| 1522 | ret = -EROFS; | ||
| 1523 | goto out; | ||
| 1524 | |||
| 1525 | next_node: | ||
| 1526 | blkno = path->p_node[i].bh->b_blocknr; | ||
| 1527 | i--; | ||
| 1528 | } | ||
| 1529 | |||
| 1530 | out: | ||
| 1531 | return ret; | ||
| 1532 | } | ||
| 1533 | |||
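The cpos arithmetic in the lookup above reduces to one line once the right ancestor record is found. A host-endian sketch follows, with a hypothetical `demo_rec` standing in for the interior extent record: the left neighbor covers [e_cpos, e_cpos + clusters), so the value returned is the last cluster of that range.

```c
/* Hypothetical host-endian stand-in for an interior extent record. */
struct demo_rec {
	unsigned int e_cpos;	/* first logical cluster covered */
	unsigned int clusters;	/* number of clusters covered */
};

/* Any cpos inside the previous record's range would find the left
 * leaf; the code above picks the rightmost one. */
static unsigned int left_leaf_cpos(const struct demo_rec *prev)
{
	return prev->e_cpos + prev->clusters - 1;
}
```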
| 1534 | static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, | ||
| 1535 | struct ocfs2_path *path) | ||
| 1536 | { | ||
| 1537 | int credits = (path->p_tree_depth - subtree_depth) * 2 + 1; | ||
| 1538 | |||
| 1539 | if (handle->h_buffer_credits < credits) | ||
| 1540 | return ocfs2_extend_trans(handle, credits); | ||
| 1541 | |||
| 1542 | return 0; | ||
| 1543 | } | ||
| 1544 | |||
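The credit estimate above counts one dirtied buffer per level on each of the two paths below the subtree root, plus the shared root block itself. A sketch of just that arithmetic, under this reading:

```c
/*
 * Sketch of the journal-credit estimate used by
 * ocfs2_extend_rotate_transaction(): two paths below the subtree
 * root, one buffer per level each, plus the subtree root.
 */
static int rotate_credits(int tree_depth, int subtree_depth)
{
	return (tree_depth - subtree_depth) * 2 + 1;
}
```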
| 1545 | /* | ||
| 1546 | * Trap the case where we're inserting into the theoretical range past | ||
| 1547 | * the _actual_ left leaf range. Otherwise, we'll rotate a record | ||
| 1548 | * whose cpos is less than ours into the right leaf. | ||
| 1549 | * | ||
| 1550 | * It's only necessary to look at the rightmost record of the left | ||
| 1551 | * leaf because the logic that calls us should ensure that the | ||
| 1552 | * theoretical ranges in the path components above the leaves are | ||
| 1553 | * correct. | ||
| 1554 | */ | ||
| 1555 | static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, | ||
| 1556 | u32 insert_cpos) | ||
| 1557 | { | ||
| 1558 | struct ocfs2_extent_list *left_el; | ||
| 1559 | struct ocfs2_extent_rec *rec; | ||
| 1560 | int next_free; | ||
| 1561 | |||
| 1562 | left_el = path_leaf_el(left_path); | ||
| 1563 | next_free = le16_to_cpu(left_el->l_next_free_rec); | ||
| 1564 | rec = &left_el->l_recs[next_free - 1]; | ||
| 1565 | |||
| 1566 | if (insert_cpos > le32_to_cpu(rec->e_cpos)) | ||
| 1567 | return 1; | ||
| 1568 | return 0; | ||
| 1569 | } | ||
| 1570 | |||
| 1571 | /* | ||
| 1572 | * Rotate all the records in a btree right one record, starting at insert_cpos. | ||
| 1573 | * | ||
| 1574 | * The path to the rightmost leaf should be passed in. | ||
| 1575 | * | ||
| 1576 | * The array is assumed to be large enough to hold an entire path (tree depth). | ||
| 1577 | * | ||
| 1578 | * Upon successful return from this function: | ||
| 1579 | * | ||
| 1580 | * - The 'right_path' array will contain a path to the leaf block | ||
| 1581 | * whose range contains e_cpos. | ||
| 1582 | * - That leaf block will have a single empty extent in list index 0. | ||
| 1583 | * - In the case that the rotation requires a post-insert update, | ||
| 1584 | * *ret_left_path will contain a valid path which can be passed to | ||
| 1585 | * ocfs2_insert_path(). | ||
| 1586 | */ | ||
| 1587 | static int ocfs2_rotate_tree_right(struct inode *inode, | ||
| 1588 | handle_t *handle, | ||
| 1589 | u32 insert_cpos, | ||
| 1590 | struct ocfs2_path *right_path, | ||
| 1591 | struct ocfs2_path **ret_left_path) | ||
| 1592 | { | ||
| 1593 | int ret, start; | ||
| 1594 | u32 cpos; | ||
| 1595 | struct ocfs2_path *left_path = NULL; | ||
| 1596 | |||
| 1597 | *ret_left_path = NULL; | ||
| 1598 | |||
| 1599 | left_path = ocfs2_new_path(path_root_bh(right_path), | ||
| 1600 | path_root_el(right_path)); | ||
| 1601 | if (!left_path) { | ||
| 1602 | ret = -ENOMEM; | ||
| 1603 | mlog_errno(ret); | ||
| 1604 | goto out; | ||
| 1605 | } | ||
| 1606 | |||
| 1607 | ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos); | ||
| 1608 | if (ret) { | ||
| 1609 | mlog_errno(ret); | ||
| 1610 | goto out; | ||
| 1611 | } | ||
| 1612 | |||
| 1613 | mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos); | ||
| 1614 | |||
| 1615 | /* | ||
| 1616 | * What we want to do here is: | ||
| 1617 | * | ||
| 1618 | * 1) Start with the rightmost path. | ||
| 1619 | * | ||
| 1620 | * 2) Determine a path to the leaf block directly to the left | ||
| 1621 | * of that leaf. | ||
| 1622 | * | ||
| 1623 | * 3) Determine the 'subtree root' - the lowest level tree node | ||
| 1624 | * which contains a path to both leaves. | ||
| 1625 | * | ||
| 1626 | * 4) Rotate the subtree. | ||
| 1627 | * | ||
| 1628 | * 5) Find the next subtree by considering the left path to be | ||
| 1629 | * the new right path. | ||
| 1630 | * | ||
| 1631 | * The check at the top of this while loop also accepts | ||
| 1632 | * insert_cpos == cpos because cpos is only a _theoretical_ | ||
| 1633 | * value to get us the left path - insert_cpos might very well | ||
| 1634 | * be filling that hole. | ||
| 1635 | * | ||
| 1636 | * Stop at a cpos of '0' because we either started at the | ||
| 1637 | * leftmost branch (i.e., a tree with one branch and a | ||
| 1638 | * rotation inside of it), or we've gone as far as we can in | ||
| 1639 | * rotating subtrees. | ||
| 1640 | */ | ||
| 1641 | while (cpos && insert_cpos <= cpos) { | ||
| 1642 | mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n", | ||
| 1643 | insert_cpos, cpos); | ||
| 1644 | |||
| 1645 | ret = ocfs2_find_path(inode, left_path, cpos); | ||
| 1646 | if (ret) { | ||
| 1647 | mlog_errno(ret); | ||
| 1648 | goto out; | ||
| 1649 | } | ||
| 1650 | |||
| 1651 | mlog_bug_on_msg(path_leaf_bh(left_path) == | ||
| 1652 | path_leaf_bh(right_path), | ||
| 1653 | "Inode %lu: error during insert of %u " | ||
| 1654 | "(left path cpos %u) results in two identical " | ||
| 1655 | "paths ending at %llu\n", | ||
| 1656 | inode->i_ino, insert_cpos, cpos, | ||
| 1657 | (unsigned long long) | ||
| 1658 | path_leaf_bh(left_path)->b_blocknr); | ||
| 1659 | |||
| 1660 | if (ocfs2_rotate_requires_path_adjustment(left_path, | ||
| 1661 | insert_cpos)) { | ||
| 1662 | mlog(0, "Path adjustment required\n"); | ||
| 1663 | |||
| 1664 | /* | ||
| 1665 | * We've rotated the tree as much as we | ||
| 1666 | * should. The rest is up to | ||
| 1667 | * ocfs2_insert_path() to complete, after the | ||
| 1668 | * record insertion. We indicate this | ||
| 1669 | * situation by returning the left path. | ||
| 1670 | * | ||
| 1671 | * The reason we don't adjust the records here | ||
| 1672 | * before the record insert is that an error | ||
| 1673 | * later might break the rule where a parent | ||
| 1674 | * record e_cpos will reflect the actual | ||
| 1675 | * e_cpos of the 1st nonempty record of the | ||
| 1676 | * child list. | ||
| 1677 | */ | ||
| 1678 | *ret_left_path = left_path; | ||
| 1679 | goto out_ret_path; | ||
| 1680 | } | ||
| 1681 | |||
| 1682 | start = ocfs2_find_subtree_root(inode, left_path, right_path); | ||
| 1683 | |||
| 1684 | mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", | ||
| 1685 | start, | ||
| 1686 | (unsigned long long) right_path->p_node[start].bh->b_blocknr, | ||
| 1687 | right_path->p_tree_depth); | ||
| 1688 | |||
| 1689 | ret = ocfs2_extend_rotate_transaction(handle, start, | ||
| 1690 | right_path); | ||
| 1691 | if (ret) { | ||
| 1692 | mlog_errno(ret); | ||
| 1693 | goto out; | ||
| 1694 | } | ||
| 1695 | |||
| 1696 | ret = ocfs2_rotate_subtree_right(inode, handle, left_path, | ||
| 1697 | right_path, start); | ||
| 1698 | if (ret) { | ||
| 1699 | mlog_errno(ret); | ||
| 1700 | goto out; | ||
| 1701 | } | ||
| 1702 | |||
| 1703 | /* | ||
| 1704 | * There is no need to re-read the next right path | ||
| 1705 | * as we know that it'll be our current left | ||
| 1706 | * path. Optimize by copying values instead. | ||
| 1707 | */ | ||
| 1708 | ocfs2_mv_path(right_path, left_path); | ||
| 1709 | |||
| 1710 | ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, | ||
| 1711 | &cpos); | ||
| 1712 | if (ret) { | ||
| 1713 | mlog_errno(ret); | ||
| 1714 | goto out; | ||
| 1715 | } | ||
| 1716 | } | ||
| 1717 | |||
| 1718 | out: | ||
| 1719 | ocfs2_free_path(left_path); | ||
| 1720 | |||
| 1721 | out_ret_path: | ||
| 1722 | return ret; | ||
| 1723 | } | ||
| 1724 | |||
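The loop condition above is the heart of the walk. Restated as a predicate (a sketch, with plain unsigned ints in place of the le32 fields): rotation continues while a left neighbor exists at all, and the insert target still lies at or to the left of that neighbor's theoretical range, the `<=` being what admits an insert that fills the hole the left path's cpos describes.

```c
/*
 * Sketch of the continue test in ocfs2_rotate_tree_right()'s loop:
 * left_cpos == 0 means we are already at the leftmost leaf.
 */
static int keep_rotating(unsigned int insert_cpos, unsigned int left_cpos)
{
	return left_cpos != 0 && insert_cpos <= left_cpos;
}
```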
| 1725 | /* | ||
| 1726 | * Do the final bits of extent record insertion at the target leaf | ||
| 1727 | * list. If this leaf is part of an allocation tree, it is assumed | ||
| 1728 | * that the tree above has been prepared. | ||
| 1729 | */ | ||
| 1730 | static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, | ||
| 1731 | struct ocfs2_extent_list *el, | ||
| 1732 | struct ocfs2_insert_type *insert, | ||
| 1733 | struct inode *inode) | ||
| 1734 | { | ||
| 1735 | int i = insert->ins_contig_index; | ||
| 1736 | unsigned int range; | ||
| 1737 | struct ocfs2_extent_rec *rec; | ||
| 1738 | |||
| 1739 | BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); | ||
| 1740 | |||
| 1741 | /* | ||
| 1742 | * Contiguous insert - either left or right. | ||
| 1743 | */ | ||
| 1744 | if (insert->ins_contig != CONTIG_NONE) { | ||
| 1745 | rec = &el->l_recs[i]; | ||
| 1746 | if (insert->ins_contig == CONTIG_LEFT) { | ||
| 1747 | rec->e_blkno = insert_rec->e_blkno; | ||
| 1748 | rec->e_cpos = insert_rec->e_cpos; | ||
| 1749 | } | ||
| 1750 | le16_add_cpu(&rec->e_leaf_clusters, | ||
| 1751 | le16_to_cpu(insert_rec->e_leaf_clusters)); | ||
| 1752 | return; | ||
| 1753 | } | ||
| 1754 | |||
| 1755 | /* | ||
| 1756 | * Handle insert into an empty leaf. | ||
| 1757 | */ | ||
| 1758 | if (le16_to_cpu(el->l_next_free_rec) == 0 || | ||
| 1759 | ((le16_to_cpu(el->l_next_free_rec) == 1) && | ||
| 1760 | ocfs2_is_empty_extent(&el->l_recs[0]))) { | ||
| 1761 | el->l_recs[0] = *insert_rec; | ||
| 1762 | el->l_next_free_rec = cpu_to_le16(1); | ||
| 1763 | return; | ||
| 1764 | } | ||
| 1765 | |||
| 1766 | /* | ||
| 1767 | * Appending insert. | ||
| 1768 | */ | ||
| 1769 | if (insert->ins_appending == APPEND_TAIL) { | ||
| 1770 | i = le16_to_cpu(el->l_next_free_rec) - 1; | ||
| 1771 | rec = &el->l_recs[i]; | ||
| 1772 | range = le32_to_cpu(rec->e_cpos) | ||
| 1773 | + le16_to_cpu(rec->e_leaf_clusters); | ||
| 1774 | BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range); | ||
| 1775 | |||
| 1776 | mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >= | ||
| 1777 | le16_to_cpu(el->l_count), | ||
| 1778 | "inode %lu, depth %u, count %u, next free %u, " | ||
| 1779 | "rec.cpos %u, rec.clusters %u, " | ||
| 1780 | "insert.cpos %u, insert.clusters %u\n", | ||
| 1781 | inode->i_ino, | ||
| 1782 | le16_to_cpu(el->l_tree_depth), | ||
| 1783 | le16_to_cpu(el->l_count), | ||
| 1784 | le16_to_cpu(el->l_next_free_rec), | ||
| 1785 | le32_to_cpu(el->l_recs[i].e_cpos), | ||
| 1786 | le16_to_cpu(el->l_recs[i].e_leaf_clusters), | ||
| 1787 | le32_to_cpu(insert_rec->e_cpos), | ||
| 1788 | le16_to_cpu(insert_rec->e_leaf_clusters)); | ||
| 1789 | i++; | ||
| 1790 | el->l_recs[i] = *insert_rec; | ||
| 1791 | le16_add_cpu(&el->l_next_free_rec, 1); | ||
| 1792 | return; | ||
| 1793 | } | ||
| 1794 | |||
| 1795 | /* | ||
| 1796 | * Ok, we have to rotate. | ||
| 1797 | * | ||
| 1798 | * At this point, it is safe to assume that inserting into an | ||
| 1799 | * empty leaf and appending to a leaf have both been handled | ||
| 1800 | * above. | ||
| 1801 | * | ||
| 1802 | * This leaf needs to have space, either by the empty 1st | ||
| 1803 | * extent record, or by virtue of l_next_free_rec < l_count. | ||
| 1804 | */ | ||
| 1805 | ocfs2_rotate_leaf(el, insert_rec); | ||
| 1806 | } | ||
| 1807 | |||
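Of the branches above, the contiguous case is the tersest. Here is a host-endian sketch of the CONTIG_LEFT merge, using a hypothetical `demo_rec`: the new extent physically precedes the existing record, so the record inherits the new start, both disk block and cpos, and simply grows by the new cluster count (CONTIG_RIGHT skips the first two assignments).

```c
/* Hypothetical host-endian stand-in for a leaf extent record. */
struct demo_rec {
	unsigned long long e_blkno;	/* first disk block of extent */
	unsigned int e_cpos;		/* first logical cluster */
	unsigned int clusters;		/* extent length in clusters */
};

static void merge_contig_left(struct demo_rec *rec,
			      const struct demo_rec *ins)
{
	rec->e_blkno = ins->e_blkno;	/* new extent supplies the start */
	rec->e_cpos = ins->e_cpos;
	rec->clusters += ins->clusters;	/* record absorbs the new length */
}
```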
| 1808 | static inline void ocfs2_update_dinode_clusters(struct inode *inode, | ||
| 1809 | struct ocfs2_dinode *di, | ||
| 1810 | u32 clusters) | ||
| 1811 | { | ||
| 1812 | le32_add_cpu(&di->i_clusters, clusters); | ||
| 1813 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
| 1814 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); | ||
| 1815 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
| 1816 | } | ||
| 1817 | |||
| 1818 | static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, | ||
| 1819 | struct ocfs2_extent_rec *insert_rec, | ||
| 1820 | struct ocfs2_path *right_path, | ||
| 1821 | struct ocfs2_path **ret_left_path) | ||
| 1822 | { | ||
| 1823 | int ret, i, next_free; | ||
| 1824 | struct buffer_head *bh; | ||
| 1825 | struct ocfs2_extent_list *el; | ||
| 1826 | struct ocfs2_path *left_path = NULL; | ||
| 1827 | |||
| 1828 | *ret_left_path = NULL; | ||
| 1829 | |||
| 1830 | /* | ||
| 1831 | * This shouldn't happen for non-trees. The extent rec cluster | ||
| 1832 | * count manipulation below only works for interior nodes. | ||
| 1833 | */ | ||
| 1834 | BUG_ON(right_path->p_tree_depth == 0); | ||
| 1835 | |||
| 1836 | /* | ||
| 1837 | * If our appending insert is at the leftmost edge of a leaf, | ||
| 1838 | * then we might need to update the rightmost records of the | ||
| 1839 | * neighboring path. | ||
| 1840 | */ | ||
| 1841 | el = path_leaf_el(right_path); | ||
| 1842 | next_free = le16_to_cpu(el->l_next_free_rec); | ||
| 1843 | if (next_free == 0 || | ||
| 1844 | (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { | ||
| 1845 | u32 left_cpos; | ||
| 1846 | |||
| 1847 | ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, | ||
| 1848 | &left_cpos); | ||
| 1849 | if (ret) { | ||
| 1850 | mlog_errno(ret); | ||
| 1851 | goto out; | ||
| 1852 | } | ||
| 1853 | |||
| 1854 | mlog(0, "Append may need a left path update. cpos: %u, " | ||
| 1855 | "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos), | ||
| 1856 | left_cpos); | ||
| 1857 | |||
| 1858 | /* | ||
| 1859 | * No need to worry if the append is already in the | ||
| 1860 | * leftmost leaf. | ||
| 1861 | */ | ||
| 1862 | if (left_cpos) { | ||
| 1863 | left_path = ocfs2_new_path(path_root_bh(right_path), | ||
| 1864 | path_root_el(right_path)); | ||
| 1865 | if (!left_path) { | ||
| 1866 | ret = -ENOMEM; | ||
| 1867 | mlog_errno(ret); | ||
| 1868 | goto out; | ||
| 1869 | } | ||
| 1870 | |||
| 1871 | ret = ocfs2_find_path(inode, left_path, left_cpos); | ||
| 1872 | if (ret) { | ||
| 1873 | mlog_errno(ret); | ||
| 1874 | goto out; | ||
| 1875 | } | ||
| 1876 | |||
| 1877 | /* | ||
| 1878 | * ocfs2_insert_path() will pass the left_path to the | ||
| 1879 | * journal for us. | ||
| 1880 | */ | ||
| 1881 | } | ||
| 1882 | } | ||
| 1883 | |||
| 1884 | ret = ocfs2_journal_access_path(inode, handle, right_path); | ||
| 1885 | if (ret) { | ||
| 1886 | mlog_errno(ret); | ||
| 1887 | goto out; | ||
| 1888 | } | ||
| 1889 | |||
| 1890 | el = path_root_el(right_path); | ||
| 1891 | bh = path_root_bh(right_path); | ||
| 1892 | i = 0; | ||
| 1893 | while (1) { | ||
| 1894 | struct ocfs2_extent_rec *rec; | ||
| 1895 | |||
| 1896 | next_free = le16_to_cpu(el->l_next_free_rec); | ||
| 1897 | if (next_free == 0) { | ||
| 1898 | ocfs2_error(inode->i_sb, | ||
| 1899 | "Dinode %llu has a bad extent list", | ||
| 1900 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
| 1901 | ret = -EIO; | ||
| 1902 | goto out; | ||
| 1903 | } | ||
| 1904 | |||
| 1905 | rec = &el->l_recs[next_free - 1]; | ||
| 1906 | |||
| 1907 | rec->e_int_clusters = insert_rec->e_cpos; | ||
| 1908 | le32_add_cpu(&rec->e_int_clusters, | ||
| 1909 | le16_to_cpu(insert_rec->e_leaf_clusters)); | ||
| 1910 | le32_add_cpu(&rec->e_int_clusters, | ||
| 1911 | -le32_to_cpu(rec->e_cpos)); | ||
| 1912 | |||
| 1913 | ret = ocfs2_journal_dirty(handle, bh); | ||
| 1914 | if (ret) | ||
| 1915 | mlog_errno(ret); | ||
| 1916 | |||
| 1917 | /* Don't touch the leaf node */ | ||
| 1918 | if (++i >= right_path->p_tree_depth) | ||
| 1919 | break; | ||
| 1920 | |||
| 1921 | bh = right_path->p_node[i].bh; | ||
| 1922 | el = right_path->p_node[i].el; | ||
| 1923 | } | ||
| 1924 | |||
| 1925 | *ret_left_path = left_path; | ||
| 1926 | ret = 0; | ||
| 1927 | out: | ||
| 1928 | if (ret != 0) | ||
| 1929 | ocfs2_free_path(left_path); | ||
| 1930 | |||
| 1931 | return ret; | ||
| 1932 | } | ||
| 1933 | |||
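The three le32 operations on e_int_clusters in the loop above compute a single value. In host-endian form (a sketch, not the on-disk arithmetic): an interior record covers [e_cpos, e_cpos + e_int_clusters), so after a tail append its right edge must move out to the end of the newly inserted extent.

```c
/*
 * Sketch of the interior-record stretch in
 * ocfs2_append_rec_to_path(): new right edge minus the record's own
 * starting cpos.
 */
static unsigned int new_int_clusters(unsigned int rec_cpos,
				     unsigned int ins_cpos,
				     unsigned int ins_clusters)
{
	return ins_cpos + ins_clusters - rec_cpos;
}
```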
| 1934 | /* | ||
| 1935 | * This function only does inserts on an allocation b-tree. For dinode | ||
| 1936 | * lists, ocfs2_insert_at_leaf() is called directly. | ||
| 1937 | * | ||
| 1938 | * right_path is the path we want to do the actual insert | ||
| 1939 | * in. left_path should only be passed in if we need to update that | ||
| 1940 | * portion of the tree after an edge insert. | ||
| 1941 | */ | ||
| 1942 | static int ocfs2_insert_path(struct inode *inode, | ||
| 1943 | handle_t *handle, | ||
| 1944 | struct ocfs2_path *left_path, | ||
| 1945 | struct ocfs2_path *right_path, | ||
| 1946 | struct ocfs2_extent_rec *insert_rec, | ||
| 1947 | struct ocfs2_insert_type *insert) | ||
| 1948 | { | ||
| 1949 | int ret, subtree_index; | ||
| 1950 | struct buffer_head *leaf_bh = path_leaf_bh(right_path); | ||
| 1951 | struct ocfs2_extent_list *el; | ||
| 1952 | |||
| 1953 | /* | ||
| 1954 | * Pass both paths to the journal. The majority of inserts | ||
| 1955 | * will be touching all components anyway. | ||
| 1956 | */ | ||
| 1957 | ret = ocfs2_journal_access_path(inode, handle, right_path); | ||
| 1958 | if (ret < 0) { | ||
| 1959 | mlog_errno(ret); | ||
| 1960 | goto out; | ||
| 1961 | } | ||
| 1962 | |||
| 1963 | if (left_path) { | ||
| 1964 | int credits = handle->h_buffer_credits; | ||
| 1965 | |||
| 1966 | /* | ||
| 1967 | * There's a chance that left_path got passed back to | ||
| 1968 | * us without being accounted for in the | ||
| 1969 | * journal. Extend our transaction here to be sure we | ||
| 1970 | * can change those blocks. | ||
| 1971 | */ | ||
| 1972 | credits += left_path->p_tree_depth; | ||
| 1973 | |||
| 1974 | ret = ocfs2_extend_trans(handle, credits); | ||
| 1975 | if (ret < 0) { | ||
| 1976 | mlog_errno(ret); | ||
| 1977 | goto out; | ||
| 1978 | } | ||
| 1979 | |||
| 1980 | ret = ocfs2_journal_access_path(inode, handle, left_path); | ||
| 1981 | if (ret < 0) { | ||
| 1982 | mlog_errno(ret); | ||
| 1983 | goto out; | ||
| 1984 | } | ||
| 1985 | } | ||
| 1986 | |||
| 1987 | el = path_leaf_el(right_path); | ||
| 1988 | |||
| 1989 | ocfs2_insert_at_leaf(insert_rec, el, insert, inode); | ||
| 1990 | ret = ocfs2_journal_dirty(handle, leaf_bh); | ||
| 1991 | if (ret) | ||
| 1992 | mlog_errno(ret); | ||
| 1993 | |||
| 1994 | if (left_path) { | ||
| 1995 | /* | ||
| 1996 | * The rotate code has indicated that we need to fix | ||
| 1997 | * up portions of the tree after the insert. | ||
| 1998 | * | ||
| 1999 | * XXX: Should we extend the transaction here? | ||
| 2000 | */ | ||
| 2001 | subtree_index = ocfs2_find_subtree_root(inode, left_path, | ||
| 2002 | right_path); | ||
| 2003 | ocfs2_complete_edge_insert(inode, handle, left_path, | ||
| 2004 | right_path, subtree_index); | ||
| 2005 | } | ||
| 2006 | |||
| 2007 | ret = 0; | ||
| 2008 | out: | ||
| 2009 | return ret; | ||
| 2010 | } | ||
| 2011 | |||
| 2012 | static int ocfs2_do_insert_extent(struct inode *inode, | ||
| 2013 | handle_t *handle, | ||
| 2014 | struct buffer_head *di_bh, | ||
| 2015 | struct ocfs2_extent_rec *insert_rec, | ||
| 2016 | struct ocfs2_insert_type *type) | ||
| 2017 | { | ||
| 2018 | int ret, rotate = 0; | ||
| 2019 | u32 cpos; | ||
| 2020 | struct ocfs2_path *right_path = NULL; | ||
| 2021 | struct ocfs2_path *left_path = NULL; | ||
| 2022 | struct ocfs2_dinode *di; | ||
| 2023 | struct ocfs2_extent_list *el; | ||
| 2024 | |||
| 2025 | di = (struct ocfs2_dinode *) di_bh->b_data; | ||
| 2026 | el = &di->id2.i_list; | ||
| 2027 | |||
| 2028 | ret = ocfs2_journal_access(handle, inode, di_bh, | ||
| 2029 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
| 2030 | if (ret) { | ||
| 2031 | mlog_errno(ret); | ||
| 2032 | goto out; | ||
| 2033 | } | ||
| 2034 | |||
| 2035 | if (le16_to_cpu(el->l_tree_depth) == 0) { | ||
| 2036 | ocfs2_insert_at_leaf(insert_rec, el, type, inode); | ||
| 2037 | goto out_update_clusters; | ||
| 2038 | } | ||
| 2039 | |||
| 2040 | right_path = ocfs2_new_inode_path(di_bh); | ||
| 2041 | if (!right_path) { | ||
| 2042 | ret = -ENOMEM; | ||
| 2043 | mlog_errno(ret); | ||
| 2044 | goto out; | ||
| 2045 | } | ||
| 2046 | |||
| 2047 | /* | ||
| 2048 | * Determine the path to start with. Rotations need the | ||
| 2049 | * rightmost path, everything else can go directly to the | ||
| 2050 | * target leaf. | ||
| 2051 | */ | ||
| 2052 | cpos = le32_to_cpu(insert_rec->e_cpos); | ||
| 2053 | if (type->ins_appending == APPEND_NONE && | ||
| 2054 | type->ins_contig == CONTIG_NONE) { | ||
| 2055 | rotate = 1; | ||
| 2056 | cpos = UINT_MAX; | ||
| 2057 | } | ||
| 2058 | |||
| 2059 | ret = ocfs2_find_path(inode, right_path, cpos); | ||
| 2060 | if (ret) { | ||
| 2061 | mlog_errno(ret); | ||
| 2062 | goto out; | ||
| 2063 | } | ||
| 2064 | |||
| 2065 | /* | ||
| 2066 | * Rotations and appends need special treatment - they modify | ||
| 2067 | * parts of the tree above them. | ||
| 2068 | * | ||
| 2069 | * Both might pass back a path immediately to the left of | ||
| 2070 | * the one being inserted to. This will cause | ||
| 2071 | * ocfs2_insert_path() to modify the rightmost records of | ||
| 2072 | * left_path to account for an edge insert. | ||
| 2073 | * | ||
| 2074 | * XXX: When modifying this code, keep in mind that an insert | ||
| 2075 | * can wind up skipping both of these two special cases... | ||
| 2076 | */ | ||
| 2077 | if (rotate) { | ||
| 2078 | ret = ocfs2_rotate_tree_right(inode, handle, | ||
| 2079 | le32_to_cpu(insert_rec->e_cpos), | ||
| 2080 | right_path, &left_path); | ||
| 2081 | if (ret) { | ||
| 2082 | mlog_errno(ret); | ||
| 2083 | goto out; | ||
| 2084 | } | ||
| 2085 | } else if (type->ins_appending == APPEND_TAIL | ||
| 2086 | && type->ins_contig != CONTIG_LEFT) { | ||
| 2087 | ret = ocfs2_append_rec_to_path(inode, handle, insert_rec, | ||
| 2088 | right_path, &left_path); | ||
| 2089 | if (ret) { | ||
| 2090 | mlog_errno(ret); | ||
| 2091 | goto out; | ||
| 2092 | } | ||
| 2093 | } | ||
| 2094 | |||
| 2095 | ret = ocfs2_insert_path(inode, handle, left_path, right_path, | ||
| 2096 | insert_rec, type); | ||
| 2097 | if (ret) { | ||
| 2098 | mlog_errno(ret); | ||
| 2099 | goto out; | ||
| 2100 | } | ||
| 2101 | |||
| 2102 | out_update_clusters: | ||
| 2103 | ocfs2_update_dinode_clusters(inode, di, | ||
| 2104 | le16_to_cpu(insert_rec->e_leaf_clusters)); | ||
| 2105 | |||
| 2106 | ret = ocfs2_journal_dirty(handle, di_bh); | ||
| 2107 | if (ret) | ||
| 2108 | mlog_errno(ret); | ||
| 2109 | |||
| 2110 | out: | ||
| 2111 | ocfs2_free_path(left_path); | ||
| 2112 | ocfs2_free_path(right_path); | ||
| 2113 | |||
| 2114 | return ret; | ||
| 2115 | } | ||
| 2116 | |||
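The path-selection rule above can be read as a tiny function. A sketch under that reading: a plain insert, neither appending nor contiguous, must rotate from the rightmost leaf, so the initial lookup cpos is pinned to UINT_MAX; every other insert type walks straight to the leaf covering the record's own cpos.

```c
#include <limits.h>

/* Sketch of the starting-path choice in ocfs2_do_insert_extent(). */
static unsigned int initial_lookup_cpos(int appending, int contig,
					unsigned int insert_cpos)
{
	if (!appending && !contig)
		return UINT_MAX;	/* right rotation starts at rightmost leaf */
	return insert_cpos;		/* otherwise go straight to the target */
}
```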
| 2117 | static void ocfs2_figure_contig_type(struct inode *inode, | ||
| 2118 | struct ocfs2_insert_type *insert, | ||
| 2119 | struct ocfs2_extent_list *el, | ||
| 2120 | struct ocfs2_extent_rec *insert_rec) | ||
| 2121 | { | ||
| 2122 | int i; | ||
| 2123 | enum ocfs2_contig_type contig_type = CONTIG_NONE; | ||
| 2124 | |||
| 2125 | BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); | ||
| 2126 | |||
| 2127 | for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { | ||
| 2128 | contig_type = ocfs2_extent_contig(inode, &el->l_recs[i], | ||
| 2129 | insert_rec); | ||
| 2130 | if (contig_type != CONTIG_NONE) { | ||
| 2131 | insert->ins_contig_index = i; | ||
| 2132 | break; | ||
| 2133 | } | ||
| 2134 | } | ||
| 2135 | insert->ins_contig = contig_type; | ||
| 2136 | } | ||
| 2137 | |||
| 2138 | /* | ||
| 2139 | * This should only be called against the rightmost leaf extent list. | ||
| 2140 | * | ||
| 2141 | * ocfs2_figure_appending_type() will figure out whether we'll have to | ||
| 2142 | * insert at the tail of the rightmost leaf. | ||
| 2143 | * | ||
| 2144 | * This should also work against the dinode list for trees with 0 | ||
| 2145 | * depth. If we consider the dinode list to be the rightmost leaf node | ||
| 2146 | * then the logic here makes sense. | ||
| 2147 | */ | ||
| 2148 | static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert, | ||
| 2149 | struct ocfs2_extent_list *el, | ||
| 2150 | struct ocfs2_extent_rec *insert_rec) | ||
| 2151 | { | ||
| 2152 | int i; | ||
| 2153 | u32 cpos = le32_to_cpu(insert_rec->e_cpos); | ||
| 2154 | struct ocfs2_extent_rec *rec; | ||
| 2155 | |||
| 2156 | insert->ins_appending = APPEND_NONE; | ||
| 2157 | |||
| 2158 | BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); | ||
| 2159 | |||
| 2160 | if (!el->l_next_free_rec) | ||
| 2161 | goto set_tail_append; | ||
| 2162 | |||
| 2163 | if (ocfs2_is_empty_extent(&el->l_recs[0])) { | ||
| 2164 | /* Were all records empty? */ | ||
| 2165 | if (le16_to_cpu(el->l_next_free_rec) == 1) | ||
| 2166 | goto set_tail_append; | ||
| 845 | } | 2167 | } |
| 846 | 2168 | ||
| 847 | /* Can we allocate without adding/shifting tree bits? */ | ||
| 848 | i = le16_to_cpu(el->l_next_free_rec) - 1; | 2169 | i = le16_to_cpu(el->l_next_free_rec) - 1; |
| 849 | if (le16_to_cpu(el->l_next_free_rec) == 0 | 2170 | rec = &el->l_recs[i]; |
| 850 | || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count)) | 2171 | |
| 851 | || le32_to_cpu(el->l_recs[i].e_clusters) == 0 | 2172 | if (cpos >= |
| 852 | || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) | 2173 | (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters))) |
| 853 | goto out_add; | 2174 | goto set_tail_append; |
| 2175 | |||
| 2176 | return; | ||
| 2177 | |||
| 2178 | set_tail_append: | ||
| 2179 | insert->ins_appending = APPEND_TAIL; | ||
| 2180 | } | ||
| 2181 | |||
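The comparison above is the whole appending test once the empty-list cases are out of the way. As a plain-integer sketch: the insert appends if it starts at or beyond the end of the rightmost record's range.

```c
/* Sketch of the tail-append test in ocfs2_figure_appending_type(). */
static int is_tail_append(unsigned int last_cpos,
			  unsigned int last_clusters,
			  unsigned int ins_cpos)
{
	return ins_cpos >= last_cpos + last_clusters;
}
```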
| 2182 | /* | ||
| 2183 | * Helper function called at the beginning of an insert. | ||
| 2184 | * | ||
| 2185 | * This computes a few things that are commonly used in the process of | ||
| 2186 | * inserting into the btree: | ||
| 2187 | * - Whether the new extent is contiguous with an existing one. | ||
| 2188 | * - The current tree depth. | ||
| 2189 | * - Whether the insert is an appending one. | ||
| 2190 | * - The total # of free records in the tree. | ||
| 2191 | * | ||
| 2192 | * All of the information is stored on the ocfs2_insert_type | ||
| 2193 | * structure. | ||
| 2194 | */ | ||
| 2195 | static int ocfs2_figure_insert_type(struct inode *inode, | ||
| 2196 | struct buffer_head *di_bh, | ||
| 2197 | struct buffer_head **last_eb_bh, | ||
| 2198 | struct ocfs2_extent_rec *insert_rec, | ||
| 2199 | struct ocfs2_insert_type *insert) | ||
| 2200 | { | ||
| 2201 | int ret; | ||
| 2202 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; | ||
| 2203 | struct ocfs2_extent_block *eb; | ||
| 2204 | struct ocfs2_extent_list *el; | ||
| 2205 | struct ocfs2_path *path = NULL; | ||
| 2206 | struct buffer_head *bh = NULL; | ||
| 2207 | |||
| 2208 | el = &di->id2.i_list; | ||
| 2209 | insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); | ||
| 2210 | |||
| 2211 | if (el->l_tree_depth) { | ||
| 2212 | /* | ||
| 2213 | * If we have tree depth, we read in the | ||
| 2214 | * rightmost extent block ahead of time as | ||
| 2215 | * ocfs2_figure_insert_type() and ocfs2_add_branch() | ||
| 2216 | * may want it later. | ||
| 2217 | */ | ||
| 2218 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
| 2219 | le64_to_cpu(di->i_last_eb_blk), &bh, | ||
| 2220 | OCFS2_BH_CACHED, inode); | ||
| 2221 | if (ret) { | ||
| 2222 | mlog_exit(ret); | ||
| 2223 | goto out; | ||
| 2224 | } | ||
| 2225 | eb = (struct ocfs2_extent_block *) bh->b_data; | ||
| 2226 | el = &eb->h_list; | ||
| 2227 | } | ||
| 2228 | |||
| 2229 | /* | ||
| 2230 | * Unless we have a contiguous insert, we'll need to know if | ||
| 2231 | * there is room left in our allocation tree for another | ||
| 2232 | * extent record. | ||
| 2233 | * | ||
| 2234 | * XXX: This test is simplistic, we can search for empty | ||
| 2235 | * extent records too. | ||
| 2236 | */ | ||
| 2237 | insert->ins_free_records = le16_to_cpu(el->l_count) - | ||
| 2238 | le16_to_cpu(el->l_next_free_rec); | ||
| 2239 | |||
| 2240 | if (!insert->ins_tree_depth) { | ||
| 2241 | ocfs2_figure_contig_type(inode, insert, el, insert_rec); | ||
| 2242 | ocfs2_figure_appending_type(insert, el, insert_rec); | ||
| 2243 | return 0; | ||
| 2244 | } | ||
| 2245 | |||
| 2246 | path = ocfs2_new_inode_path(di_bh); | ||
| 2247 | if (!path) { | ||
| 2248 | ret = -ENOMEM; | ||
| 2249 | mlog_errno(ret); | ||
| 2250 | goto out; | ||
| 2251 | } | ||
| 2252 | |||
| 2253 | /* | ||
| 2254 | * In the case that we're inserting past what the tree | ||
| 2255 | * currently accounts for, ocfs2_find_path() will return for | ||
| 2256 | * us the rightmost tree path. This is accounted for below in | ||
| 2257 | * the appending code. | ||
| 2258 | */ | ||
| 2259 | ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos)); | ||
| 2260 | if (ret) { | ||
| 2261 | mlog_errno(ret); | ||
| 2262 | goto out; | ||
| 2263 | } | ||
| 2264 | |||
| 2265 | el = path_leaf_el(path); | ||
| 2266 | |||
| 2267 | /* | ||
| 2268 | * Now that we have the path, there are two things we want to determine: | ||
| 2269 | * 1) Contiguousness (also set contig_index if this is so) | ||
| 2270 | * | ||
| 2271 | * 2) Are we doing an append? We can trivially break this up | ||
| 2272 | * into two types of appends: simple record append, or a | ||
| 2273 | * rotate inside the tail leaf. | ||
| 2274 | */ | ||
| 2275 | ocfs2_figure_contig_type(inode, insert, el, insert_rec); | ||
| 2276 | |||
| 2277 | /* | ||
| 2278 | * The insert code isn't quite ready to deal with all cases of | ||
| 2279 | * left contiguousness. Specifically, if it's an insert into | ||
| 2280 | * the 1st record in a leaf, it will require the adjustment of | ||
| 2281 | * cluster count on the last record of the path directly to its | ||
| 2282 | * left. For now, just catch that case and fool the layers | ||
| 2283 | * above us. This works just fine for tree_depth == 0, which | ||
| 2284 | * is why we allow that above. | ||
| 2285 | */ | ||
| 2286 | if (insert->ins_contig == CONTIG_LEFT && | ||
| 2287 | insert->ins_contig_index == 0) | ||
| 2288 | insert->ins_contig = CONTIG_NONE; | ||
| 2289 | |||
| 2290 | /* | ||
| 2291 | * Ok, so we can simply compare against last_eb to figure out | ||
| 2292 | * whether the path doesn't exist. This will only happen in | ||
| 2293 | * the case that we're doing a tail append, so maybe we can | ||
| 2294 | * take advantage of that information somehow. | ||
| 2295 | */ | ||
| 2296 | if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) { | ||
| 2297 | /* | ||
| 2298 | * Ok, ocfs2_find_path() returned us the rightmost | ||
| 2299 | * tree path. This might be an appending insert. There are | ||
| 2300 | * two cases: | ||
| 2301 | * 1) We're doing a true append at the tail: | ||
| 2302 | * -This might even be off the end of the leaf | ||
| 2303 | * 2) We're "appending" by rotating in the tail | ||
| 2304 | */ | ||
| 2305 | ocfs2_figure_appending_type(insert, el, insert_rec); | ||
| 2306 | } | ||
| 2307 | |||
| 2308 | out: | ||
| 2309 | ocfs2_free_path(path); | ||
| 2310 | |||
| 2311 | if (ret == 0) | ||
| 2312 | *last_eb_bh = bh; | ||
| 2313 | else | ||
| 2314 | brelse(bh); | ||
| 2315 | return ret; | ||
| 2316 | } | ||
| 2317 | |||
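The ins_free_records computation above is worth spelling out, since ocfs2_insert_extent() later grows the tree only when it reaches zero and the insert is not contiguous. A sketch: slots allocated in the list minus slots already in use.

```c
/* Sketch of the free-slot count in ocfs2_figure_insert_type(). */
static int free_records(int l_count, int l_next_free_rec)
{
	return l_count - l_next_free_rec;
}
```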
| 2318 | /* | ||
| 2319 | * Insert an extent into an inode btree. | ||
| 2320 | * | ||
| 2321 | * The caller needs to update fe->i_clusters | ||
| 2322 | */ | ||
| 2323 | int ocfs2_insert_extent(struct ocfs2_super *osb, | ||
| 2324 | handle_t *handle, | ||
| 2325 | struct inode *inode, | ||
| 2326 | struct buffer_head *fe_bh, | ||
| 2327 | u32 cpos, | ||
| 2328 | u64 start_blk, | ||
| 2329 | u32 new_clusters, | ||
| 2330 | struct ocfs2_alloc_context *meta_ac) | ||
| 2331 | { | ||
| 2332 | int status, shift; | ||
| 2333 | struct buffer_head *last_eb_bh = NULL; | ||
| 2334 | struct buffer_head *bh = NULL; | ||
| 2335 | struct ocfs2_insert_type insert = {0, }; | ||
| 2336 | struct ocfs2_extent_rec rec; | ||
| 2337 | |||
| 2338 | mlog(0, "add %u clusters at position %u to inode %llu\n", | ||
| 2339 | new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
| 2340 | |||
| 2341 | mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && | ||
| 2342 | (OCFS2_I(inode)->ip_clusters != cpos), | ||
| 2343 | "Device %s, asking for sparse allocation: inode %llu, " | ||
| 2344 | "cpos %u, clusters %u\n", | ||
| 2345 | osb->dev_str, | ||
| 2346 | (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, | ||
| 2347 | OCFS2_I(inode)->ip_clusters); | ||
| 2348 | |||
| 2349 | memset(&rec, 0, sizeof(rec)); | ||
| 2350 | rec.e_cpos = cpu_to_le32(cpos); | ||
| 2351 | rec.e_blkno = cpu_to_le64(start_blk); | ||
| 2352 | rec.e_leaf_clusters = cpu_to_le16(new_clusters); | ||
| 2353 | |||
| 2354 | status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, | ||
| 2355 | &insert); | ||
| 2356 | if (status < 0) { | ||
| 2357 | mlog_errno(status); | ||
| 2358 | goto bail; | ||
| 2359 | } | ||
| 854 | 2360 | ||
| 855 | mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing " | 2361 | mlog(0, "Insert.appending: %u, Insert.Contig: %u, " |
| 856 | "tree now.\n"); | 2362 | "Insert.contig_index: %d, Insert.free_records: %d, " |
| 2363 | "Insert.tree_depth: %d\n", | ||
| 2364 | insert.ins_appending, insert.ins_contig, insert.ins_contig_index, | ||
| 2365 | insert.ins_free_records, insert.ins_tree_depth); | ||
| 2366 | |||
| 2367 | /* | ||
| 2368 | * Avoid growing the tree unless we're out of records and the | ||
| 2369 | * insert type requires one. | ||
| 2370 | */ | ||
| 2371 | if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records) | ||
| 2372 | goto out_add; | ||
| 857 | 2373 | ||
| 858 | shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh); | 2374 | shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh); |
| 859 | if (shift < 0) { | 2375 | if (shift < 0) { |
| @@ -866,13 +2382,9 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, | |||
| 866 | * and didn't find room for any more extents - we need to add | 2382 | * and didn't find room for any more extents - we need to add |
| 867 | * another tree level */ | 2383 | * another tree level */ |
| 868 | if (shift) { | 2384 | if (shift) { |
| 869 | /* if we hit a leaf, we'd better be empty :) */ | ||
| 870 | BUG_ON(le16_to_cpu(el->l_next_free_rec) != | ||
| 871 | le16_to_cpu(el->l_count)); | ||
| 872 | BUG_ON(bh); | 2385 | BUG_ON(bh); |
| 873 | mlog(0, "ocfs2_allocate_extent: need to shift tree depth " | 2386 | mlog(0, "need to shift tree depth " |
| 874 | "(current = %u)\n", | 2387 | "(current = %d)\n", insert.ins_tree_depth); |
| 875 | le16_to_cpu(fe->id2.i_list.l_tree_depth)); | ||
| 876 | 2388 | ||
| 877 | /* ocfs2_shift_tree_depth will return us a buffer with | 2389 | /* ocfs2_shift_tree_depth will return us a buffer with |
| 878 | * the new extent block (so we can pass that to | 2390 | * the new extent block (so we can pass that to |
| @@ -883,15 +2395,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, | |||
| 883 | mlog_errno(status); | 2395 | mlog_errno(status); |
| 884 | goto bail; | 2396 | goto bail; |
| 885 | } | 2397 | } |
| 2398 | insert.ins_tree_depth++; | ||
| 886 | /* Special case: we have room now if we shifted from | 2399 | /* Special case: we have room now if we shifted from |
| 887 | * tree_depth 0 */ | 2400 | * tree_depth 0 */ |
| 888 | if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1)) | 2401 | if (insert.ins_tree_depth == 1) |
| 889 | goto out_add; | 2402 | goto out_add; |
| 890 | } | 2403 | } |
| 891 | 2404 | ||
| 892 | /* call ocfs2_add_branch to add the final part of the tree with | 2405 | /* call ocfs2_add_branch to add the final part of the tree with |
| 893 | * the new data. */ | 2406 | * the new data. */ |
| 894 | mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh); | 2407 | mlog(0, "add branch. bh = %p\n", bh); |
| 895 | status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, | 2408 | status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, |
| 896 | meta_ac); | 2409 | meta_ac); |
| 897 | if (status < 0) { | 2410 | if (status < 0) { |
| @@ -900,11 +2413,12 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, | |||
| 900 | } | 2413 | } |
| 901 | 2414 | ||
| 902 | out_add: | 2415 | out_add: |
| 903 | /* Finally, we can add clusters. */ | 2416 | /* Finally, we can add clusters. This might rotate the tree for us. */ |
| 904 | status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh, | 2417 | status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); |
| 905 | start_blk, new_clusters); | ||
| 906 | if (status < 0) | 2418 | if (status < 0) |
| 907 | mlog_errno(status); | 2419 | mlog_errno(status); |
| 2420 | else | ||
| 2421 | ocfs2_extent_map_insert_rec(inode, &rec); | ||
| 908 | 2422 | ||
| 909 | bail: | 2423 | bail: |
| 910 | if (bh) | 2424 | if (bh) |
| @@ -1447,168 +2961,389 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) | |||
| 1447 | * block will be deleted, and if it will, what the new last extent | 2961 | * block will be deleted, and if it will, what the new last extent |
| 1448 | * block will be so we can update its h_next_leaf_blk field, as well | 2962 | * block will be so we can update its h_next_leaf_blk field, as well |
| 1449 | * as the dinode's i_last_eb_blk */ | 2963 | * as the dinode's i_last_eb_blk */ |
| 1450 | static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, | 2964 | static int ocfs2_find_new_last_ext_blk(struct inode *inode, |
| 1451 | struct inode *inode, | 2965 | unsigned int clusters_to_del, |
| 1452 | struct ocfs2_dinode *fe, | 2966 | struct ocfs2_path *path, |
| 1453 | u32 new_i_clusters, | ||
| 1454 | struct buffer_head *old_last_eb, | ||
| 1455 | struct buffer_head **new_last_eb) | 2967 | struct buffer_head **new_last_eb) |
| 1456 | { | 2968 | { |
| 1457 | int i, status = 0; | 2969 | int next_free, ret = 0; |
| 1458 | u64 block = 0; | 2970 | u32 cpos; |
| 2971 | struct ocfs2_extent_rec *rec; | ||
| 1459 | struct ocfs2_extent_block *eb; | 2972 | struct ocfs2_extent_block *eb; |
| 1460 | struct ocfs2_extent_list *el; | 2973 | struct ocfs2_extent_list *el; |
| 1461 | struct buffer_head *bh = NULL; | 2974 | struct buffer_head *bh = NULL; |
| 1462 | 2975 | ||
| 1463 | *new_last_eb = NULL; | 2976 | *new_last_eb = NULL; |
| 1464 | 2977 | ||
| 1465 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
| 1466 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | ||
| 1467 | status = -EIO; | ||
| 1468 | goto bail; | ||
| 1469 | } | ||
| 1470 | |||
| 1471 | /* we have no tree, so of course, no last_eb. */ | 2978 | /* we have no tree, so of course, no last_eb. */ |
| 1472 | if (!fe->id2.i_list.l_tree_depth) | 2979 | if (!path->p_tree_depth) |
| 1473 | goto bail; | 2980 | goto out; |
| 1474 | 2981 | ||
| 1475 | /* trunc to zero special case - this makes tree_depth = 0 | 2982 | /* trunc to zero special case - this makes tree_depth = 0 |
| 1476 | * regardless of what it is. */ | 2983 | * regardless of what it is. */ |
| 1477 | if (!new_i_clusters) | 2984 | if (OCFS2_I(inode)->ip_clusters == clusters_to_del) |
| 1478 | goto bail; | 2985 | goto out; |
| 1479 | 2986 | ||
| 1480 | eb = (struct ocfs2_extent_block *) old_last_eb->b_data; | 2987 | el = path_leaf_el(path); |
| 1481 | el = &(eb->h_list); | ||
| 1482 | BUG_ON(!el->l_next_free_rec); | 2988 | BUG_ON(!el->l_next_free_rec); |
| 1483 | 2989 | ||
| 1484 | /* Make sure that this guy will actually be empty after we | 2990 | /* |
| 1485 | * clear away the data. */ | 2991 | * Make sure that this extent list will actually be empty |
| 1486 | if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters) | 2992 | * after we clear away the data. We can shortcut out if |
| 1487 | goto bail; | 2993 | * there's more than one non-empty extent in the |
| 2994 | * list. Otherwise, a check of the remaining extent is | ||
| 2995 | * necessary. | ||
| 2996 | */ | ||
| 2997 | next_free = le16_to_cpu(el->l_next_free_rec); | ||
| 2998 | rec = NULL; | ||
| 2999 | if (ocfs2_is_empty_extent(&el->l_recs[0])) { | ||
| 3000 | if (next_free > 2) | ||
| 3001 | goto out; | ||
| 1488 | 3002 | ||
| 1489 | /* Ok, at this point, we know that last_eb will definitely | 3003 | /* We may have a valid extent in index 1, check it. */ |
| 1490 | * change, so lets traverse the tree and find the second to | 3004 | if (next_free == 2) |
| 1491 | * last extent block. */ | 3005 | rec = &el->l_recs[1]; |
| 1492 | el = &(fe->id2.i_list); | 3006 | |
| 1493 | /* go down the tree, */ | 3007 | /* |
| 1494 | do { | 3008 | * Fall through - no more nonempty extents, so we want |
| 1495 | for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) { | 3009 | * to delete this leaf. |
| 1496 | if (le32_to_cpu(el->l_recs[i].e_cpos) < | 3010 | */ |
| 1497 | new_i_clusters) { | 3011 | } else { |
| 1498 | block = le64_to_cpu(el->l_recs[i].e_blkno); | 3012 | if (next_free > 1) |
| 1499 | break; | 3013 | goto out; |
| 1500 | } | 3014 | |
| 3015 | rec = &el->l_recs[0]; | ||
| 3016 | } | ||
| 3017 | |||
| 3018 | if (rec) { | ||
| 3019 | /* | ||
| 3020 | * Check if we'll only be trimming off the end of this | ||
| 3021 | * extent. | ||
| 3022 | */ | ||
| 3023 | if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del) | ||
| 3024 | goto out; | ||
| 3025 | } | ||
| 3026 | |||
| 3027 | ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); | ||
| 3028 | if (ret) { | ||
| 3029 | mlog_errno(ret); | ||
| 3030 | goto out; | ||
| 3031 | } | ||
| 3032 | |||
| 3033 | ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh); | ||
| 3034 | if (ret) { | ||
| 3035 | mlog_errno(ret); | ||
| 3036 | goto out; | ||
| 3037 | } | ||
| 3038 | |||
| 3039 | eb = (struct ocfs2_extent_block *) bh->b_data; | ||
| 3040 | el = &eb->h_list; | ||
| 3041 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
| 3042 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
| 3043 | ret = -EROFS; | ||
| 3044 | goto out; | ||
| 3045 | } | ||
| 3046 | |||
| 3047 | *new_last_eb = bh; | ||
| 3048 | get_bh(*new_last_eb); | ||
| 3049 | mlog(0, "returning block %llu, (cpos: %u)\n", | ||
| 3050 | (unsigned long long)le64_to_cpu(eb->h_blkno), cpos); | ||
| 3051 | out: | ||
| 3052 | brelse(bh); | ||
| 3053 | |||
| 3054 | return ret; | ||
| 3055 | } | ||
| 3056 | |||
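Once the list has been reduced to at most one non-empty record, the survival test above is a single comparison. A sketch: the leaf is emptied, and a new last extent block must be found, exactly when the truncate removes at least that record's whole cluster count.

```c
/* Sketch of the leaf-empties test in ocfs2_find_new_last_ext_blk(). */
static int leaf_will_empty(unsigned int rec_clusters,
			   unsigned int clusters_to_del)
{
	return rec_clusters <= clusters_to_del;
}
```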
| 3057 | /* | ||
| 3058 | * Trim some clusters off the rightmost edge of a tree. Only called | ||
| 3059 | * during truncate. | ||
| 3060 | * | ||
| 3061 | * The caller needs to: | ||
| 3062 | * - start journaling of each path component. | ||
| 3063 | * - compute and fully set up any new last ext block | ||
| 3064 | */ | ||
| 3065 | static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, | ||
| 3066 | handle_t *handle, struct ocfs2_truncate_context *tc, | ||
| 3067 | u32 clusters_to_del, u64 *delete_start) | ||
| 3068 | { | ||
| 3069 | int ret, i, index = path->p_tree_depth; | ||
| 3070 | u32 new_edge = 0; | ||
| 3071 | u64 deleted_eb = 0; | ||
| 3072 | struct buffer_head *bh; | ||
| 3073 | struct ocfs2_extent_list *el; | ||
| 3074 | struct ocfs2_extent_rec *rec; | ||
| 3075 | |||
| 3076 | *delete_start = 0; | ||
| 3077 | |||
| 3078 | while (index >= 0) { | ||
| 3079 | bh = path->p_node[index].bh; | ||
| 3080 | el = path->p_node[index].el; | ||
| 3081 | |||
| 3082 | mlog(0, "traveling tree (index = %d, block = %llu)\n", | ||
| 3083 | index, (unsigned long long)bh->b_blocknr); | ||
| 3084 | |||
| 3085 | BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); | ||
| 3086 | |||
| 3087 | if (index != | ||
| 3088 | (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) { | ||
| 3089 | ocfs2_error(inode->i_sb, | ||
| 3090 | "Inode %lu has invalid ext. block %llu", | ||
| 3091 | inode->i_ino, | ||
| 3092 | (unsigned long long)bh->b_blocknr); | ||
| 3093 | ret = -EROFS; | ||
| 3094 | goto out; | ||
| 1501 | } | 3095 | } |
| 1502 | BUG_ON(i < 0); | ||
| 1503 | 3096 | ||
| 1504 | if (bh) { | 3097 | find_tail_record: |
| 1505 | brelse(bh); | 3098 | i = le16_to_cpu(el->l_next_free_rec) - 1; |
| 1506 | bh = NULL; | 3099 | rec = &el->l_recs[i]; |
| 3100 | |||
| 3101 | mlog(0, "Extent list before: record %d: (%u, %u, %llu), " | ||
| 3102 | "next = %u\n", i, le32_to_cpu(rec->e_cpos), | ||
| 3103 | ocfs2_rec_clusters(el, rec), | ||
| 3104 | (unsigned long long)le64_to_cpu(rec->e_blkno), | ||
| 3105 | le16_to_cpu(el->l_next_free_rec)); | ||
| 3106 | |||
| 3107 | BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del); | ||
| 3108 | |||
| 3109 | if (le16_to_cpu(el->l_tree_depth) == 0) { | ||
| 3110 | /* | ||
| 3111 | * If the leaf block contains a single empty | ||
| 3112 | * extent and no other records, we can just remove | ||
| 3113 | * the block. | ||
| 3114 | */ | ||
| 3115 | if (i == 0 && ocfs2_is_empty_extent(rec)) { | ||
| 3116 | memset(rec, 0, | ||
| 3117 | sizeof(struct ocfs2_extent_rec)); | ||
| 3118 | el->l_next_free_rec = cpu_to_le16(0); | ||
| 3119 | |||
| 3120 | goto delete; | ||
| 3121 | } | ||
| 3122 | |||
| 3123 | /* | ||
| 3124 | * Remove any empty extents by shifting things | ||
| 3125 | * left. That should make life much easier on | ||
| 3126 | * the code below. This condition is rare | ||
| 3127 | * enough that we shouldn't see a performance | ||
| 3128 | * hit. | ||
| 3129 | */ | ||
| 3130 | if (ocfs2_is_empty_extent(&el->l_recs[0])) { | ||
| 3131 | le16_add_cpu(&el->l_next_free_rec, -1); | ||
| 3132 | |||
| 3133 | for(i = 0; | ||
| 3134 | i < le16_to_cpu(el->l_next_free_rec); i++) | ||
| 3135 | el->l_recs[i] = el->l_recs[i + 1]; | ||
| 3136 | |||
| 3137 | memset(&el->l_recs[i], 0, | ||
| 3138 | sizeof(struct ocfs2_extent_rec)); | ||
| 3139 | |||
| 3140 | /* | ||
| 3141 | * We've modified our extent list. The | ||
| 3142 | * simplest way to handle this change | ||
| 3143 | * is to begin the search from the | ||
| 3144 | * start again. | ||
| 3145 | */ | ||
| 3146 | goto find_tail_record; | ||
| 3147 | } | ||
| 3148 | |||
| 3149 | le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del); | ||
| 3150 | |||
| 3151 | /* | ||
| 3152 | * We'll use "new_edge" on our way back up the | ||
| 3153 | * tree to know what our rightmost cpos is. | ||
| 3154 | */ | ||
| 3155 | new_edge = le16_to_cpu(rec->e_leaf_clusters); | ||
| 3156 | new_edge += le32_to_cpu(rec->e_cpos); | ||
| 3157 | |||
| 3158 | /* | ||
| 3159 | * The caller will use this to delete data blocks. | ||
| 3160 | */ | ||
| 3161 | *delete_start = le64_to_cpu(rec->e_blkno) | ||
| 3162 | + ocfs2_clusters_to_blocks(inode->i_sb, | ||
| 3163 | le16_to_cpu(rec->e_leaf_clusters)); | ||
| 3164 | |||
| 3165 | /* | ||
| 3166 | * If it's now empty, remove this record. | ||
| 3167 | */ | ||
| 3168 | if (le16_to_cpu(rec->e_leaf_clusters) == 0) { | ||
| 3169 | memset(rec, 0, | ||
| 3170 | sizeof(struct ocfs2_extent_rec)); | ||
| 3171 | le16_add_cpu(&el->l_next_free_rec, -1); | ||
| 3172 | } | ||
| 3173 | } else { | ||
| 3174 | if (le64_to_cpu(rec->e_blkno) == deleted_eb) { | ||
| 3175 | memset(rec, 0, | ||
| 3176 | sizeof(struct ocfs2_extent_rec)); | ||
| 3177 | le16_add_cpu(&el->l_next_free_rec, -1); | ||
| 3178 | |||
| 3179 | goto delete; | ||
| 3180 | } | ||
| 3181 | |||
| 3182 | /* Can this actually happen? */ | ||
| 3183 | if (le16_to_cpu(el->l_next_free_rec) == 0) | ||
| 3184 | goto delete; | ||
| 3185 | |||
| 3186 | /* | ||
| 3187 | * We never actually deleted any clusters | ||
| 3188 | * because our leaf was empty. There's no | ||
| 3189 | * reason to adjust the rightmost edge then. | ||
| 3190 | */ | ||
| 3191 | if (new_edge == 0) | ||
| 3192 | goto delete; | ||
| 3193 | |||
| 3194 | rec->e_int_clusters = cpu_to_le32(new_edge); | ||
| 3195 | le32_add_cpu(&rec->e_int_clusters, | ||
| 3196 | -le32_to_cpu(rec->e_cpos)); | ||
| 3197 | |||
| 3198 | /* | ||
| 3199 | * A deleted child record should have been | ||
| 3200 | * caught above. | ||
| 3201 | */ | ||
| 3202 | BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0); | ||
| 1507 | } | 3203 | } |
| 1508 | 3204 | ||
| 1509 | status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED, | 3205 | delete: |
| 1510 | inode); | 3206 | ret = ocfs2_journal_dirty(handle, bh); |
| 1511 | if (status < 0) { | 3207 | if (ret) { |
| 1512 | mlog_errno(status); | 3208 | mlog_errno(ret); |
| 1513 | goto bail; | 3209 | goto out; |
| 1514 | } | 3210 | } |
| 1515 | eb = (struct ocfs2_extent_block *) bh->b_data; | 3211 | |
| 1516 | el = &eb->h_list; | 3212 | mlog(0, "extent list container %llu, after: record %d: " |
| 1517 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | 3213 | "(%u, %u, %llu), next = %u.\n", |
| 1518 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | 3214 | (unsigned long long)bh->b_blocknr, i, |
| 1519 | status = -EIO; | 3215 | le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec), |
| 1520 | goto bail; | 3216 | (unsigned long long)le64_to_cpu(rec->e_blkno), |
| 3217 | le16_to_cpu(el->l_next_free_rec)); | ||
| 3218 | |||
| 3219 | /* | ||
| 3220 | * We must be careful to only attempt delete of an | ||
| 3221 | * extent block (and not the root inode block). | ||
| 3222 | */ | ||
| 3223 | if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) { | ||
| 3224 | struct ocfs2_extent_block *eb = | ||
| 3225 | (struct ocfs2_extent_block *)bh->b_data; | ||
| 3226 | |||
| 3227 | /* | ||
| 3228 | * Save this for use when processing the | ||
| 3229 | * parent block. | ||
| 3230 | */ | ||
| 3231 | deleted_eb = le64_to_cpu(eb->h_blkno); | ||
| 3232 | |||
| 3233 | mlog(0, "deleting this extent block.\n"); | ||
| 3234 | |||
| 3235 | ocfs2_remove_from_cache(inode, bh); | ||
| 3236 | |||
| 3237 | BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0])); | ||
| 3238 | BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); | ||
| 3239 | BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno)); | ||
| 3240 | |||
| 3241 | if (le16_to_cpu(eb->h_suballoc_slot) == 0) { | ||
| 3242 | /* | ||
| 3243 | * This code only understands how to | ||
| 3244 | * lock the suballocator in slot 0, | ||
| 3245 | * which is fine because allocation is | ||
| 3246 | * only ever done out of that | ||
| 3247 | * suballocator too. A future version | ||
| 3248 | * might change that however, so avoid | ||
| 3249 | * a free if we don't know how to | ||
| 3250 | * handle it. This way an fs incompat | ||
| 3251 | * bit will not be necessary. | ||
| 3252 | */ | ||
| 3253 | ret = ocfs2_free_extent_block(handle, | ||
| 3254 | tc->tc_ext_alloc_inode, | ||
| 3255 | tc->tc_ext_alloc_bh, | ||
| 3256 | eb); | ||
| 3257 | |||
| 3258 | /* An error here is not fatal. */ | ||
| 3259 | if (ret < 0) | ||
| 3260 | mlog_errno(ret); | ||
| 3261 | } | ||
| 3262 | } else { | ||
| 3263 | deleted_eb = 0; | ||
| 1521 | } | 3264 | } |
| 1522 | } while (el->l_tree_depth); | ||
| 1523 | 3265 | ||
| 1524 | *new_last_eb = bh; | 3266 | index--; |
| 1525 | get_bh(*new_last_eb); | 3267 | } |
| 1526 | mlog(0, "returning block %llu\n", | ||
| 1527 | (unsigned long long)le64_to_cpu(eb->h_blkno)); | ||
| 1528 | bail: | ||
| 1529 | if (bh) | ||
| 1530 | brelse(bh); | ||
| 1531 | 3268 | ||
| 1532 | return status; | 3269 | ret = 0; |
| 3270 | out: | ||
| 3271 | return ret; | ||
| 1533 | } | 3272 | } |
| 1534 | 3273 | ||
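The leaf-level arithmetic in ocfs2_trim_tree() above is easier to see without the endian helpers. A host-endian sketch follows (`trim_result` and `trim_tail` are illustrative names): after shrinking the tail record by clusters_to_del, the first data block to free sits just past the record's remaining clusters, and the tree's new right edge is e_cpos plus whatever is left.

```c
/* Hypothetical host-endian summary of the leaf trim above. */
struct trim_result {
	unsigned int new_edge;			/* new rightmost cpos of the tree */
	unsigned long long delete_start;	/* first data block to free */
};

static struct trim_result trim_tail(unsigned int e_cpos,
				    unsigned int leaf_clusters,
				    unsigned int clusters_to_del,
				    unsigned long long e_blkno,
				    unsigned int blocks_per_cluster)
{
	struct trim_result r;
	unsigned int kept = leaf_clusters - clusters_to_del;

	r.new_edge = e_cpos + kept;
	r.delete_start = e_blkno +
		(unsigned long long)kept * blocks_per_cluster;
	return r;
}
```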
| 1535 | static int ocfs2_do_truncate(struct ocfs2_super *osb, | 3274 | static int ocfs2_do_truncate(struct ocfs2_super *osb, |
| 1536 | unsigned int clusters_to_del, | 3275 | unsigned int clusters_to_del, |
| 1537 | struct inode *inode, | 3276 | struct inode *inode, |
| 1538 | struct buffer_head *fe_bh, | 3277 | struct buffer_head *fe_bh, |
| 1539 | struct buffer_head *old_last_eb_bh, | ||
| 1540 | handle_t *handle, | 3278 | handle_t *handle, |
| 1541 | struct ocfs2_truncate_context *tc) | 3279 | struct ocfs2_truncate_context *tc, |
| 3280 | struct ocfs2_path *path) | ||
| 1542 | { | 3281 | { |
| 1543 | int status, i, depth; | 3282 | int status; |
| 1544 | struct ocfs2_dinode *fe; | 3283 | struct ocfs2_dinode *fe; |
| 1545 | struct ocfs2_extent_block *eb; | ||
| 1546 | struct ocfs2_extent_block *last_eb = NULL; | 3284 | struct ocfs2_extent_block *last_eb = NULL; |
| 1547 | struct ocfs2_extent_list *el; | 3285 | struct ocfs2_extent_list *el; |
| 1548 | struct buffer_head *eb_bh = NULL; | ||
| 1549 | struct buffer_head *last_eb_bh = NULL; | 3286 | struct buffer_head *last_eb_bh = NULL; |
| 1550 | u64 next_eb = 0; | ||
| 1551 | u64 delete_blk = 0; | 3287 | u64 delete_blk = 0; |
| 1552 | 3288 | ||
| 1553 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | 3289 | fe = (struct ocfs2_dinode *) fe_bh->b_data; |
| 1554 | 3290 | ||
| 1555 | status = ocfs2_find_new_last_ext_blk(osb, | 3291 | status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del, |
| 1556 | inode, | 3292 | path, &last_eb_bh); |
| 1557 | fe, | ||
| 1558 | le32_to_cpu(fe->i_clusters) - | ||
| 1559 | clusters_to_del, | ||
| 1560 | old_last_eb_bh, | ||
| 1561 | &last_eb_bh); | ||
| 1562 | if (status < 0) { | 3293 | if (status < 0) { |
| 1563 | mlog_errno(status); | 3294 | mlog_errno(status); |
| 1564 | goto bail; | 3295 | goto bail; |
| 1565 | } | 3296 | } |
| 1566 | if (last_eb_bh) | ||
| 1567 | last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | ||
| 1568 | 3297 | ||
| 1569 | status = ocfs2_journal_access(handle, inode, fe_bh, | 3298 | /* |
| 1570 | OCFS2_JOURNAL_ACCESS_WRITE); | 3299 | * Each component will be touched, so we might as well journal |
| 3300 | * here to avoid having to handle errors later. | ||
| 3301 | */ | ||
| 3302 | status = ocfs2_journal_access_path(inode, handle, path); | ||
| 1571 | if (status < 0) { | 3303 | if (status < 0) { |
| 1572 | mlog_errno(status); | 3304 | mlog_errno(status); |
| 1573 | goto bail; | 3305 | goto bail; |
| 1574 | } | 3306 | } |
| 3307 | |||
| 3308 | if (last_eb_bh) { | ||
| 3309 | status = ocfs2_journal_access(handle, inode, last_eb_bh, | ||
| 3310 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
| 3311 | if (status < 0) { | ||
| 3312 | mlog_errno(status); | ||
| 3313 | goto bail; | ||
| 3314 | } | ||
| 3315 | |||
| 3316 | last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | ||
| 3317 | } | ||
| 3318 | |||
| 1575 | el = &(fe->id2.i_list); | 3319 | el = &(fe->id2.i_list); |
| 1576 | 3320 | ||
| 3321 | /* | ||
| 3322 | * Lower levels depend on this never happening, but it's best | ||
| 3323 | * to check for it up here before changing the tree. | ||
| 3324 | */ | ||
| 3325 | if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) { | ||
| 3326 | ocfs2_error(inode->i_sb, | ||
| 3327 | "Inode %lu has an empty extent record, depth %u\n", | ||
| 3328 | inode->i_ino, le16_to_cpu(el->l_tree_depth)); | ||
| 3329 | status = -EROFS; | ||
| 3330 | goto bail; | ||
| 3331 | } | ||
| 3332 | |||
| 1577 | spin_lock(&OCFS2_I(inode)->ip_lock); | 3333 | spin_lock(&OCFS2_I(inode)->ip_lock); |
| 1578 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - | 3334 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - |
| 1579 | clusters_to_del; | 3335 | clusters_to_del; |
| 1580 | spin_unlock(&OCFS2_I(inode)->ip_lock); | 3336 | spin_unlock(&OCFS2_I(inode)->ip_lock); |
| 1581 | le32_add_cpu(&fe->i_clusters, -clusters_to_del); | 3337 | le32_add_cpu(&fe->i_clusters, -clusters_to_del); |
| 1582 | fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec); | ||
| 1583 | fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec); | ||
| 1584 | |||
| 1585 | i = le16_to_cpu(el->l_next_free_rec) - 1; | ||
| 1586 | |||
| 1587 | BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); | ||
| 1588 | le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); | ||
| 1589 | /* tree depth zero, we can just delete the clusters, otherwise | ||
| 1590 | * we need to record the offset of the next level extent block | ||
| 1591 | * as we may overwrite it. */ | ||
| 1592 | if (!el->l_tree_depth) | ||
| 1593 | delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) | ||
| 1594 | + ocfs2_clusters_to_blocks(osb->sb, | ||
| 1595 | le32_to_cpu(el->l_recs[i].e_clusters)); | ||
| 1596 | else | ||
| 1597 | next_eb = le64_to_cpu(el->l_recs[i].e_blkno); | ||
| 1598 | 3338 | ||
| 1599 | if (!el->l_recs[i].e_clusters) { | 3339 | status = ocfs2_trim_tree(inode, path, handle, tc, |
| 1600 | /* if we deleted the whole extent record, then clear | 3340 | clusters_to_del, &delete_blk); |
| 1601 | * out the other fields and update the extent | 3341 | if (status) { |
| 1602 | * list. For depth > 0 trees, we've already recorded | 3342 | mlog_errno(status); |
| 1603 | * the extent block in 'next_eb' */ | 3343 | goto bail; |
| 1604 | el->l_recs[i].e_cpos = 0; | ||
| 1605 | el->l_recs[i].e_blkno = 0; | ||
| 1606 | BUG_ON(!el->l_next_free_rec); | ||
| 1607 | le16_add_cpu(&el->l_next_free_rec, -1); | ||
| 1608 | } | 3344 | } |
| 1609 | 3345 | ||
| 1610 | depth = le16_to_cpu(el->l_tree_depth); | 3346 | if (le32_to_cpu(fe->i_clusters) == 0) { |
| 1611 | if (!fe->i_clusters) { | ||
| 1612 | /* trunc to zero is a special case. */ | 3347 | /* trunc to zero is a special case. */ |
| 1613 | el->l_tree_depth = 0; | 3348 | el->l_tree_depth = 0; |
| 1614 | fe->i_last_eb_blk = 0; | 3349 | fe->i_last_eb_blk = 0; |
| @@ -1625,12 +3360,6 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, | |||
| 1625 | /* If there will be a new last extent block, then by | 3360 | /* If there will be a new last extent block, then by |
| 1626 | * definition, there cannot be any leaves to the right of | 3361 | * definition, there cannot be any leaves to the right of |
| 1627 | * him. */ | 3362 | * him. */ |
| 1628 | status = ocfs2_journal_access(handle, inode, last_eb_bh, | ||
| 1629 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
| 1630 | if (status < 0) { | ||
| 1631 | mlog_errno(status); | ||
| 1632 | goto bail; | ||
| 1633 | } | ||
| 1634 | last_eb->h_next_leaf_blk = 0; | 3363 | last_eb->h_next_leaf_blk = 0; |
| 1635 | status = ocfs2_journal_dirty(handle, last_eb_bh); | 3364 | status = ocfs2_journal_dirty(handle, last_eb_bh); |
| 1636 | if (status < 0) { | 3365 | if (status < 0) { |
| @@ -1639,123 +3368,247 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, | |||
| 1639 | } | 3368 | } |
| 1640 | } | 3369 | } |
| 1641 | 3370 | ||
| 1642 | /* if our tree depth > 0, update all the tree blocks below us. */ | 3371 | if (delete_blk) { |
| 1643 | while (depth) { | 3372 | status = ocfs2_truncate_log_append(osb, handle, delete_blk, |
| 1644 | mlog(0, "traveling tree (depth = %d, next_eb = %llu)\n", | 3373 | clusters_to_del); |
| 1645 | depth, (unsigned long long)next_eb); | ||
| 1646 | status = ocfs2_read_block(osb, next_eb, &eb_bh, | ||
| 1647 | OCFS2_BH_CACHED, inode); | ||
| 1648 | if (status < 0) { | 3374 | if (status < 0) { |
| 1649 | mlog_errno(status); | 3375 | mlog_errno(status); |
| 1650 | goto bail; | 3376 | goto bail; |
| 1651 | } | 3377 | } |
| 1652 | eb = (struct ocfs2_extent_block *)eb_bh->b_data; | 3378 | } |
| 1653 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | 3379 | status = 0; |
| 1654 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | 3380 | bail: |
| 1655 | status = -EIO; | 3381 | |
| 1656 | goto bail; | 3382 | mlog_exit(status); |
| 3383 | return status; | ||
| 3384 | } | ||
| 3385 | |||
| 3386 | static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh) | ||
| 3387 | { | ||
| 3388 | set_buffer_uptodate(bh); | ||
| 3389 | mark_buffer_dirty(bh); | ||
| 3390 | return 0; | ||
| 3391 | } | ||
| 3392 | |||
| 3393 | static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh) | ||
| 3394 | { | ||
| 3395 | set_buffer_uptodate(bh); | ||
| 3396 | mark_buffer_dirty(bh); | ||
| 3397 | return ocfs2_journal_dirty_data(handle, bh); | ||
| 3398 | } | ||
| 3399 | |||
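A note on the two callbacks above: they differ only in whether the zeroed buffer is also handed to the journal as ordered data. The wrapper below is illustrative only and not part of the patch (the name is invented; `ocfs2_should_order_data()` is the real predicate used further down in `ocfs2_zero_cluster_pages()`):

```c
/*
 * Illustrative wrapper, not in the patch: shows how the two
 * callbacks map onto the inode's journaling mode.
 */
static int zero_func_for_mode(struct inode *inode, handle_t *handle,
			      struct buffer_head *bh)
{
	if (ocfs2_should_order_data(inode))
		/* data=ordered: zeroed buffer must hit disk before commit */
		return ocfs2_ordered_zero_func(handle, bh);
	/* data=writeback: plain dirtying is enough */
	return ocfs2_writeback_zero_func(handle, bh);
}
```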
| 3400 | static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize, | ||
| 3401 | struct page **pages, int numpages, | ||
| 3402 | u64 phys, handle_t *handle) | ||
| 3403 | { | ||
| 3404 | int i, ret, partial = 0; | ||
| 3405 | void *kaddr; | ||
| 3406 | struct page *page; | ||
| 3407 | unsigned int from, to = PAGE_CACHE_SIZE; | ||
| 3408 | struct super_block *sb = inode->i_sb; | ||
| 3409 | |||
| 3410 | BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); | ||
| 3411 | |||
| 3412 | if (numpages == 0) | ||
| 3413 | goto out; | ||
| 3414 | |||
| 3415 | from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */ | ||
| 3416 | if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) { | ||
| 3417 | /* | ||
| 3418 | * Since 'from' has been capped to a value below page | ||
| 3419 | * size, this calculation won't be able to overflow | ||
| 3420 | * 'to' | ||
| 3421 | */ | ||
| 3422 | to = ocfs2_align_bytes_to_clusters(sb, from); | ||
| 3423 | |||
| 3424 | /* | ||
| 3425 | * The truncate tail in this case should never span | ||
| 3426 | * more than a single page. The loop below also | ||
| 3427 | * assumes this. | ||
| 3428 | */ | ||
| 3429 | BUG_ON(numpages != 1); | ||
| 3430 | } | ||
| 3431 | |||
| 3432 | for(i = 0; i < numpages; i++) { | ||
| 3433 | page = pages[i]; | ||
| 3434 | |||
| 3435 | BUG_ON(from > PAGE_CACHE_SIZE); | ||
| 3436 | BUG_ON(to > PAGE_CACHE_SIZE); | ||
| 3437 | |||
| 3438 | ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0); | ||
| 3439 | if (ret) | ||
| 3440 | mlog_errno(ret); | ||
| 3441 | |||
| 3442 | kaddr = kmap_atomic(page, KM_USER0); | ||
| 3443 | memset(kaddr + from, 0, to - from); | ||
| 3444 | kunmap_atomic(kaddr, KM_USER0); | ||
| 3445 | |||
| 3446 | /* | ||
| 3447 | * We need to mark the buffers we zeroed uptodate | ||
| 3448 | * here if they aren't already - ocfs2_map_page_blocks() | ||
| 3449 | * might've skipped some. | ||
| 3450 | */ | ||
| 3451 | if (ocfs2_should_order_data(inode)) { | ||
| 3452 | ret = walk_page_buffers(handle, | ||
| 3453 | page_buffers(page), | ||
| 3454 | from, to, &partial, | ||
| 3455 | ocfs2_ordered_zero_func); | ||
| 3456 | if (ret < 0) | ||
| 3457 | mlog_errno(ret); | ||
| 3458 | } else { | ||
| 3459 | ret = walk_page_buffers(handle, page_buffers(page), | ||
| 3460 | from, to, &partial, | ||
| 3461 | ocfs2_writeback_zero_func); | ||
| 3462 | if (ret < 0) | ||
| 3463 | mlog_errno(ret); | ||
| 1657 | } | 3464 | } |
| 1658 | el = &(eb->h_list); | ||
| 1659 | 3465 | ||
| 1660 | status = ocfs2_journal_access(handle, inode, eb_bh, | 3466 | if (!partial) |
| 1661 | OCFS2_JOURNAL_ACCESS_WRITE); | 3467 | SetPageUptodate(page); |
| 1662 | if (status < 0) { | 3468 | |
| 1663 | mlog_errno(status); | 3469 | flush_dcache_page(page); |
| 1664 | goto bail; | 3470 | |
| 3471 | /* | ||
| 3472 | * Every page after the 1st one should be completely zero'd. | ||
| 3473 | */ | ||
| 3474 | from = 0; | ||
| 3475 | } | ||
| 3476 | out: | ||
| 3477 | if (pages) { | ||
| 3478 | for (i = 0; i < numpages; i++) { | ||
| 3479 | page = pages[i]; | ||
| 3480 | unlock_page(page); | ||
| 3481 | mark_page_accessed(page); | ||
| 3482 | page_cache_release(page); | ||
| 1665 | } | 3483 | } |
| 3484 | } | ||
| 3485 | } | ||
| 1666 | 3486 | ||
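The `from`/`to` arithmetic above is the subtle part when the cluster size is smaller than the page size. A standalone sketch of the same math, with hypothetical sizes (the real code does this through `ocfs2_align_bytes_to_clusters()` on the superblock):

```c
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE_BYTES 4096u

/* Round up to the next cluster boundary; clusters are a power of two. */
static uint64_t align_to_clusters(uint64_t bytes, unsigned int csize_bits)
{
	uint64_t csize = 1ull << csize_bits;

	return (bytes + csize - 1) & ~(csize - 1);
}

int main(void)
{
	uint64_t isize = 10000;		/* hypothetical i_size */
	unsigned int csize_bits = 10;	/* 1KB clusters, smaller than 4KB pages */
	unsigned int from, to;

	from = isize & (PAGE_SIZE_BYTES - 1);	/* offset within the last page */
	to = align_to_clusters(from, csize_bits);

	/* Only the allocated tail of the cluster is zeroed: [1808, 2048) */
	printf("zero bytes [%u, %u) of the final page\n", from, to);
	return 0;
}
```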
| 1667 | BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); | 3487 | static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages, |
| 1668 | BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1)); | 3488 | int *num, u64 *phys) |
| 3489 | { | ||
| 3490 | int i, numpages = 0, ret = 0; | ||
| 3491 | unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize; | ||
| 3492 | unsigned int ext_flags; | ||
| 3493 | struct super_block *sb = inode->i_sb; | ||
| 3494 | struct address_space *mapping = inode->i_mapping; | ||
| 3495 | unsigned long index; | ||
| 3496 | u64 next_cluster_bytes; | ||
| 3497 | |||
| 3498 | BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); | ||
| 3499 | |||
| 3500 | /* Cluster boundary, so we don't need to grab any pages. */ | ||
| 3501 | if ((isize & (csize - 1)) == 0) | ||
| 3502 | goto out; | ||
| 1669 | 3503 | ||
| 1670 | i = le16_to_cpu(el->l_next_free_rec) - 1; | 3504 | ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits, |
| 3505 | phys, NULL, &ext_flags); | ||
| 3506 | if (ret) { | ||
| 3507 | mlog_errno(ret); | ||
| 3508 | goto out; | ||
| 3509 | } | ||
| 1671 | 3510 | ||
| 1672 | mlog(0, "extent block %llu, before: record %d: " | 3511 | /* Tail is a hole. */ |
| 1673 | "(%u, %u, %llu), next = %u\n", | 3512 | if (*phys == 0) |
| 1674 | (unsigned long long)le64_to_cpu(eb->h_blkno), i, | 3513 | goto out; |
| 1675 | le32_to_cpu(el->l_recs[i].e_cpos), | ||
| 1676 | le32_to_cpu(el->l_recs[i].e_clusters), | ||
| 1677 | (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno), | ||
| 1678 | le16_to_cpu(el->l_next_free_rec)); | ||
| 1679 | 3514 | ||
| 1680 | BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); | 3515 | /* Tail is marked as unwritten; we can count on the write |
| 1681 | le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); | 3516 | * to zero it in that case. */ |
| 1682 | 3517 | if (ext_flags & OCFS2_EXT_UNWRITTEN) | |
| 1683 | next_eb = le64_to_cpu(el->l_recs[i].e_blkno); | 3518 | goto out; |
| 1684 | /* bottom-most block requires us to delete data.*/ | ||
| 1685 | if (!el->l_tree_depth) | ||
| 1686 | delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) | ||
| 1687 | + ocfs2_clusters_to_blocks(osb->sb, | ||
| 1688 | le32_to_cpu(el->l_recs[i].e_clusters)); | ||
| 1689 | if (!el->l_recs[i].e_clusters) { | ||
| 1690 | el->l_recs[i].e_cpos = 0; | ||
| 1691 | el->l_recs[i].e_blkno = 0; | ||
| 1692 | BUG_ON(!el->l_next_free_rec); | ||
| 1693 | le16_add_cpu(&el->l_next_free_rec, -1); | ||
| 1694 | } | ||
| 1695 | mlog(0, "extent block %llu, after: record %d: " | ||
| 1696 | "(%u, %u, %llu), next = %u\n", | ||
| 1697 | (unsigned long long)le64_to_cpu(eb->h_blkno), i, | ||
| 1698 | le32_to_cpu(el->l_recs[i].e_cpos), | ||
| 1699 | le32_to_cpu(el->l_recs[i].e_clusters), | ||
| 1700 | (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno), | ||
| 1701 | le16_to_cpu(el->l_next_free_rec)); | ||
| 1702 | 3519 | ||
| 1703 | status = ocfs2_journal_dirty(handle, eb_bh); | 3520 | next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize); |
| 1704 | if (status < 0) { | 3521 | index = isize >> PAGE_CACHE_SHIFT; |
| 1705 | mlog_errno(status); | 3522 | do { |
| 1706 | goto bail; | 3523 | pages[numpages] = grab_cache_page(mapping, index); |
| 3524 | if (!pages[numpages]) { | ||
| 3525 | ret = -ENOMEM; | ||
| 3526 | mlog_errno(ret); | ||
| 3527 | goto out; | ||
| 1707 | } | 3528 | } |
| 1708 | 3529 | ||
| 1709 | if (!el->l_next_free_rec) { | 3530 | numpages++; |
| 1710 | mlog(0, "deleting this extent block.\n"); | 3531 | index++; |
| 1711 | 3532 | } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT)); | |
| 1712 | ocfs2_remove_from_cache(inode, eb_bh); | ||
| 1713 | 3533 | ||
| 1714 | BUG_ON(el->l_recs[0].e_clusters); | 3534 | out: |
| 1715 | BUG_ON(el->l_recs[0].e_cpos); | 3535 | if (ret != 0) { |
| 1716 | BUG_ON(el->l_recs[0].e_blkno); | 3536 | if (pages) { |
| 1717 | if (eb->h_suballoc_slot == 0) { | 3537 | for (i = 0; i < numpages; i++) { |
| 1718 | /* | 3538 | if (pages[i]) { |
| 1719 | * This code only understands how to | 3539 | unlock_page(pages[i]); |
| 1720 | * lock the suballocator in slot 0, | 3540 | page_cache_release(pages[i]); |
| 1721 | * which is fine because allocation is | ||
| 1722 | * only ever done out of that | ||
| 1723 | * suballocator too. A future version | ||
| 1724 | * might change that however, so avoid | ||
| 1725 | * a free if we don't know how to | ||
| 1726 | * handle it. This way an fs incompat | ||
| 1727 | * bit will not be necessary. | ||
| 1728 | */ | ||
| 1729 | status = ocfs2_free_extent_block(handle, | ||
| 1730 | tc->tc_ext_alloc_inode, | ||
| 1731 | tc->tc_ext_alloc_bh, | ||
| 1732 | eb); | ||
| 1733 | if (status < 0) { | ||
| 1734 | mlog_errno(status); | ||
| 1735 | goto bail; | ||
| 1736 | } | 3541 | } |
| 1737 | } | 3542 | } |
| 1738 | } | 3543 | } |
| 1739 | brelse(eb_bh); | 3544 | numpages = 0; |
| 1740 | eb_bh = NULL; | ||
| 1741 | depth--; | ||
| 1742 | } | 3545 | } |
| 1743 | 3546 | ||
| 1744 | BUG_ON(!delete_blk); | 3547 | *num = numpages; |
| 1745 | status = ocfs2_truncate_log_append(osb, handle, delete_blk, | 3548 | |
| 1746 | clusters_to_del); | 3549 | return ret; |
| 1747 | if (status < 0) { | 3550 | } |
| 1748 | mlog_errno(status); | 3551 | |
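`ocfs2_grab_eof_pages()` pins every page cache page from i_size to the end of its allocated cluster. A sketch of the index range its loop walks, assuming 4KB pages and 8KB clusters (both values hypothetical):

```c
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12	/* 4KB pages */

int main(void)
{
	uint64_t isize = 10000;		/* hypothetical i_size */
	uint64_t csize = 8192;		/* 8KB clusters, power of two */
	uint64_t next_cluster = (isize + csize - 1) & ~(csize - 1);
	unsigned long index = isize >> PAGE_SHIFT;

	/* Mirrors the do/while above: grabs page indices 2 and 3 here. */
	do {
		printf("grab page index %lu\n", index);
		index++;
	} while (index < (next_cluster >> PAGE_SHIFT));

	return 0;
}
```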
| 1749 | goto bail; | 3552 | /* |
| 3553 | * Zero the area past i_size but still within an allocated | ||
| 3554 | * cluster. This avoids exposing nonzero data on subsequent file | ||
| 3555 | * extends. | ||
| 3556 | * | ||
| 3557 | * We need to call this before i_size is updated on the inode because | ||
| 3558 | * otherwise block_write_full_page() will skip writeout of pages past | ||
| 3559 | * i_size. The new_i_size parameter is passed for this reason. | ||
| 3560 | */ | ||
| 3561 | int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, | ||
| 3562 | u64 new_i_size) | ||
| 3563 | { | ||
| 3564 | int ret, numpages; | ||
| 3565 | loff_t endbyte; | ||
| 3566 | struct page **pages = NULL; | ||
| 3567 | u64 phys; | ||
| 3568 | |||
| 3569 | /* | ||
| 3570 | * File systems which don't support sparse files zero on every | ||
| 3571 | * extend. | ||
| 3572 | */ | ||
| 3573 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) | ||
| 3574 | return 0; | ||
| 3575 | |||
| 3576 | pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb), | ||
| 3577 | sizeof(struct page *), GFP_NOFS); | ||
| 3578 | if (pages == NULL) { | ||
| 3579 | ret = -ENOMEM; | ||
| 3580 | mlog_errno(ret); | ||
| 3581 | goto out; | ||
| 1750 | } | 3582 | } |
| 1751 | status = 0; | 3583 | |
| 1752 | bail: | 3584 | ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys); |
| 1753 | if (!status) | 3585 | if (ret) { |
| 1754 | ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters)); | 3586 | mlog_errno(ret); |
| 1755 | else | 3587 | goto out; |
| 1756 | ocfs2_extent_map_drop(inode, 0); | 3588 | } |
| 1757 | mlog_exit(status); | 3589 | |
| 1758 | return status; | 3590 | if (numpages == 0) |
| 3591 | goto out; | ||
| 3592 | |||
| 3593 | ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys, | ||
| 3594 | handle); | ||
| 3595 | |||
| 3596 | /* | ||
| 3597 | * Initiate writeout of the pages we zero'd here. We don't | ||
| 3598 | * wait on them - the truncate_inode_pages() call later will | ||
| 3599 | * do that for us. | ||
| 3600 | */ | ||
| 3601 | endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); | ||
| 3602 | ret = do_sync_mapping_range(inode->i_mapping, new_i_size, | ||
| 3603 | endbyte - 1, SYNC_FILE_RANGE_WRITE); | ||
| 3604 | if (ret) | ||
| 3605 | mlog_errno(ret); | ||
| 3606 | |||
| 3607 | out: | ||
| 3608 | if (pages) | ||
| 3609 | kfree(pages); | ||
| 3610 | |||
| 3611 | return ret; | ||
| 1759 | } | 3612 | } |
| 1760 | 3613 | ||
| 1761 | /* | 3614 | /* |
| @@ -1770,82 +3623,90 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, | |||
| 1770 | struct ocfs2_truncate_context *tc) | 3623 | struct ocfs2_truncate_context *tc) |
| 1771 | { | 3624 | { |
| 1772 | int status, i, credits, tl_sem = 0; | 3625 | int status, i, credits, tl_sem = 0; |
| 1773 | u32 clusters_to_del, target_i_clusters; | 3626 | u32 clusters_to_del, new_highest_cpos, range; |
| 1774 | u64 last_eb = 0; | ||
| 1775 | struct ocfs2_dinode *fe; | ||
| 1776 | struct ocfs2_extent_block *eb; | ||
| 1777 | struct ocfs2_extent_list *el; | 3627 | struct ocfs2_extent_list *el; |
| 1778 | struct buffer_head *last_eb_bh; | ||
| 1779 | handle_t *handle = NULL; | 3628 | handle_t *handle = NULL; |
| 1780 | struct inode *tl_inode = osb->osb_tl_inode; | 3629 | struct inode *tl_inode = osb->osb_tl_inode; |
| 3630 | struct ocfs2_path *path = NULL; | ||
| 1781 | 3631 | ||
| 1782 | mlog_entry_void(); | 3632 | mlog_entry_void(); |
| 1783 | 3633 | ||
| 1784 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | 3634 | down_write(&OCFS2_I(inode)->ip_alloc_sem); |
| 1785 | 3635 | ||
| 1786 | target_i_clusters = ocfs2_clusters_for_bytes(osb->sb, | 3636 | new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, |
| 1787 | i_size_read(inode)); | 3637 | i_size_read(inode)); |
| 1788 | 3638 | ||
| 1789 | last_eb_bh = tc->tc_last_eb_bh; | 3639 | path = ocfs2_new_inode_path(fe_bh); |
| 1790 | tc->tc_last_eb_bh = NULL; | 3640 | if (!path) { |
| 3641 | status = -ENOMEM; | ||
| 3642 | mlog_errno(status); | ||
| 3643 | goto bail; | ||
| 3644 | } | ||
| 1791 | 3645 | ||
| 1792 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | 3646 | ocfs2_extent_map_trunc(inode, new_highest_cpos); |
| 1793 | 3647 | ||
| 1794 | if (fe->id2.i_list.l_tree_depth) { | ||
| 1795 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | ||
| 1796 | el = &eb->h_list; | ||
| 1797 | } else | ||
| 1798 | el = &fe->id2.i_list; | ||
| 1799 | last_eb = le64_to_cpu(fe->i_last_eb_blk); | ||
| 1800 | start: | 3648 | start: |
| 1801 | mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, " | 3649 | /* |
| 1802 | "last_eb = %llu, fe->i_last_eb_blk = %llu, " | 3650 | * Check that we still have allocation to delete. |
| 1803 | "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n", | 3651 | */ |
| 1804 | le32_to_cpu(fe->i_clusters), (unsigned long long)last_eb, | 3652 | if (OCFS2_I(inode)->ip_clusters == 0) { |
| 1805 | (unsigned long long)le64_to_cpu(fe->i_last_eb_blk), | 3653 | status = 0; |
| 1806 | le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh); | 3654 | goto bail; |
| 1807 | 3655 | } | |
| 1808 | if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) { | ||
| 1809 | mlog(0, "last_eb changed!\n"); | ||
| 1810 | BUG_ON(!fe->id2.i_list.l_tree_depth); | ||
| 1811 | last_eb = le64_to_cpu(fe->i_last_eb_blk); | ||
| 1812 | /* i_last_eb_blk may have changed, read it if | ||
| 1813 | * necessary. We don't have to worry about the | ||
| 1814 | * truncate to zero case here (where there becomes no | ||
| 1815 | * last_eb) because we never loop back after our work | ||
| 1816 | * is done. */ | ||
| 1817 | if (last_eb_bh) { | ||
| 1818 | brelse(last_eb_bh); | ||
| 1819 | last_eb_bh = NULL; | ||
| 1820 | } | ||
| 1821 | 3656 | ||
| 1822 | status = ocfs2_read_block(osb, last_eb, | 3657 | /* |
| 1823 | &last_eb_bh, OCFS2_BH_CACHED, | 3658 | * Truncate always works against the rightmost tree branch. |
| 1824 | inode); | 3659 | */ |
| 1825 | if (status < 0) { | 3660 | status = ocfs2_find_path(inode, path, UINT_MAX); |
| 1826 | mlog_errno(status); | 3661 | if (status) { |
| 1827 | goto bail; | 3662 | mlog_errno(status); |
| 1828 | } | 3663 | goto bail; |
| 1829 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | 3664 | } |
| 1830 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | 3665 | |
| 1831 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | 3666 | mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n", |
| 1832 | status = -EIO; | 3667 | OCFS2_I(inode)->ip_clusters, path->p_tree_depth); |
| 1833 | goto bail; | 3668 | |
| 1834 | } | 3669 | /* |
| 1835 | el = &(eb->h_list); | 3670 | * By now, el will point to the extent list on the bottommost |
| 3671 | * portion of this tree. Only the tail record is considered in | ||
| 3672 | * each pass. | ||
| 3673 | * | ||
| 3674 | * We handle the following cases, in order: | ||
| 3675 | * - empty extent: delete the remaining branch | ||
| 3676 | * - remove the entire record | ||
| 3677 | * - remove a partial record | ||
| 3678 | * - no record needs to be removed (truncate has completed) | ||
| 3679 | */ | ||
| 3680 | el = path_leaf_el(path); | ||
| 3681 | if (le16_to_cpu(el->l_next_free_rec) == 0) { | ||
| 3682 | ocfs2_error(inode->i_sb, | ||
| 3683 | "Inode %llu has empty extent block at %llu\n", | ||
| 3684 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
| 3685 | (unsigned long long)path_leaf_bh(path)->b_blocknr); | ||
| 3686 | status = -EROFS; | ||
| 3687 | goto bail; | ||
| 1836 | } | 3688 | } |
| 1837 | 3689 | ||
| 1838 | /* by now, el will point to the extent list on the bottom most | ||
| 1839 | * portion of this tree. */ | ||
| 1840 | i = le16_to_cpu(el->l_next_free_rec) - 1; | 3690 | i = le16_to_cpu(el->l_next_free_rec) - 1; |
| 1841 | if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters) | 3691 | range = le32_to_cpu(el->l_recs[i].e_cpos) + |
| 1842 | clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters); | 3692 | ocfs2_rec_clusters(el, &el->l_recs[i]); |
| 1843 | else | 3693 | if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) { |
| 1844 | clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) + | 3694 | clusters_to_del = 0; |
| 3695 | } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { | ||
| 3696 | clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); | ||
| 3697 | } else if (range > new_highest_cpos) { | ||
| 3698 | clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + | ||
| 1845 | le32_to_cpu(el->l_recs[i].e_cpos)) - | 3699 | le32_to_cpu(el->l_recs[i].e_cpos)) - |
| 1846 | target_i_clusters; | 3700 | new_highest_cpos; |
| 3701 | } else { | ||
| 3702 | status = 0; | ||
| 3703 | goto bail; | ||
| 3704 | } | ||
| 1847 | 3705 | ||
| 1848 | mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del); | 3706 | mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", |
| 3707 | clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); | ||
| 3708 | |||
| 3709 | BUG_ON(clusters_to_del == 0); | ||
| 1849 | 3710 | ||
| 1850 | mutex_lock(&tl_inode->i_mutex); | 3711 | mutex_lock(&tl_inode->i_mutex); |
| 1851 | tl_sem = 1; | 3712 | tl_sem = 1; |
| @@ -1861,7 +3722,8 @@ start: | |||
| 1861 | } | 3722 | } |
| 1862 | 3723 | ||
| 1863 | credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, | 3724 | credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, |
| 1864 | fe, el); | 3725 | (struct ocfs2_dinode *)fe_bh->b_data, |
| 3726 | el); | ||
| 1865 | handle = ocfs2_start_trans(osb, credits); | 3727 | handle = ocfs2_start_trans(osb, credits); |
| 1866 | if (IS_ERR(handle)) { | 3728 | if (IS_ERR(handle)) { |
| 1867 | status = PTR_ERR(handle); | 3729 | status = PTR_ERR(handle); |
| @@ -1870,13 +3732,8 @@ start: | |||
| 1870 | goto bail; | 3732 | goto bail; |
| 1871 | } | 3733 | } |
| 1872 | 3734 | ||
| 1873 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 3735 | status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle, |
| 1874 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); | 3736 | tc, path); |
| 1875 | if (status < 0) | ||
| 1876 | mlog_errno(status); | ||
| 1877 | |||
| 1878 | status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, | ||
| 1879 | last_eb_bh, handle, tc); | ||
| 1880 | if (status < 0) { | 3737 | if (status < 0) { |
| 1881 | mlog_errno(status); | 3738 | mlog_errno(status); |
| 1882 | goto bail; | 3739 | goto bail; |
| @@ -1888,9 +3745,14 @@ start: | |||
| 1888 | ocfs2_commit_trans(osb, handle); | 3745 | ocfs2_commit_trans(osb, handle); |
| 1889 | handle = NULL; | 3746 | handle = NULL; |
| 1890 | 3747 | ||
| 1891 | BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters); | 3748 | ocfs2_reinit_path(path, 1); |
| 1892 | if (le32_to_cpu(fe->i_clusters) > target_i_clusters) | 3749 | |
| 1893 | goto start; | 3750 | /* |
| 3751 | * The check above will catch the case where we've truncated | ||
| 3752 | * away all allocation. | ||
| 3753 | */ | ||
| 3754 | goto start; | ||
| 3755 | |||
| 1894 | bail: | 3756 | bail: |
| 1895 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | 3757 | up_write(&OCFS2_I(inode)->ip_alloc_sem); |
| 1896 | 3758 | ||
| @@ -1902,8 +3764,7 @@ bail: | |||
| 1902 | if (handle) | 3764 | if (handle) |
| 1903 | ocfs2_commit_trans(osb, handle); | 3765 | ocfs2_commit_trans(osb, handle); |
| 1904 | 3766 | ||
| 1905 | if (last_eb_bh) | 3767 | ocfs2_free_path(path); |
| 1906 | brelse(last_eb_bh); | ||
| 1907 | 3768 | ||
| 1908 | /* This will drop the ext_alloc cluster lock for us */ | 3769 | /* This will drop the ext_alloc cluster lock for us */ |
| 1909 | ocfs2_free_truncate_context(tc); | 3770 | ocfs2_free_truncate_context(tc); |
| @@ -1912,7 +3773,6 @@ bail: | |||
| 1912 | return status; | 3773 | return status; |
| 1913 | } | 3774 | } |
| 1914 | 3775 | ||
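Each pass of the loop above trims at most one record off the rightmost leaf. A worked example of the clusters_to_del case analysis, with hypothetical numbers:

```c
#include <stdio.h>

int main(void)
{
	unsigned int cpos = 100, clusters = 50;	/* hypothetical tail record */
	unsigned int range = cpos + clusters;	/* record ends at cluster 150 */
	unsigned int new_highest_cpos = 120;	/* truncate target */
	unsigned int clusters_to_del;

	if (cpos >= new_highest_cpos)
		clusters_to_del = clusters;		/* whole record goes */
	else if (range > new_highest_cpos)
		clusters_to_del = range - new_highest_cpos; /* partial: 30 */
	else
		clusters_to_del = 0;			/* truncate complete */

	printf("delete %u clusters this pass\n", clusters_to_del);
	return 0;
}
```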
| 1915 | |||
| 1916 | /* | 3776 | /* |
| 1917 | * Expects the inode to already be locked. This will figure out which | 3777 | * Expects the inode to already be locked. This will figure out which |
| 1918 | * inodes need to be locked and will put them on the returned truncate | 3778 | * inodes need to be locked and will put them on the returned truncate |
| @@ -1923,7 +3783,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, | |||
| 1923 | struct buffer_head *fe_bh, | 3783 | struct buffer_head *fe_bh, |
| 1924 | struct ocfs2_truncate_context **tc) | 3784 | struct ocfs2_truncate_context **tc) |
| 1925 | { | 3785 | { |
| 1926 | int status, metadata_delete; | 3786 | int status, metadata_delete, i; |
| 1927 | unsigned int new_i_clusters; | 3787 | unsigned int new_i_clusters; |
| 1928 | struct ocfs2_dinode *fe; | 3788 | struct ocfs2_dinode *fe; |
| 1929 | struct ocfs2_extent_block *eb; | 3789 | struct ocfs2_extent_block *eb; |
| @@ -1944,21 +3804,6 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, | |||
| 1944 | "%llu\n", fe->i_clusters, new_i_clusters, | 3804 | "%llu\n", fe->i_clusters, new_i_clusters, |
| 1945 | (unsigned long long)fe->i_size); | 3805 | (unsigned long long)fe->i_size); |
| 1946 | 3806 | ||
| 1947 | if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) { | ||
| 1948 | ocfs2_error(inode->i_sb, "Dinode %llu has cluster count " | ||
| 1949 | "%u and size %llu whereas struct inode has " | ||
| 1950 | "cluster count %u and size %llu which caused an " | ||
| 1951 | "invalid truncate to %u clusters.", | ||
| 1952 | (unsigned long long)le64_to_cpu(fe->i_blkno), | ||
| 1953 | le32_to_cpu(fe->i_clusters), | ||
| 1954 | (unsigned long long)le64_to_cpu(fe->i_size), | ||
| 1955 | OCFS2_I(inode)->ip_clusters, i_size_read(inode), | ||
| 1956 | new_i_clusters); | ||
| 1957 | mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres); | ||
| 1958 | status = -EIO; | ||
| 1959 | goto bail; | ||
| 1960 | } | ||
| 1961 | |||
| 1962 | *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); | 3807 | *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); |
| 1963 | if (!(*tc)) { | 3808 | if (!(*tc)) { |
| 1964 | status = -ENOMEM; | 3809 | status = -ENOMEM; |
| @@ -1986,7 +3831,15 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, | |||
| 1986 | goto bail; | 3831 | goto bail; |
| 1987 | } | 3832 | } |
| 1988 | el = &(eb->h_list); | 3833 | el = &(eb->h_list); |
| 1989 | if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters) | 3834 | |
| 3835 | i = 0; | ||
| 3836 | if (ocfs2_is_empty_extent(&el->l_recs[0])) | ||
| 3837 | i = 1; | ||
| 3838 | /* | ||
| 3839 | * XXX: Should we check that next_free_rec contains | ||
| 3840 | * the extent? | ||
| 3841 | */ | ||
| 3842 | if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters) | ||
| 1990 | metadata_delete = 1; | 3843 | metadata_delete = 1; |
| 1991 | } | 3844 | } |
| 1992 | 3845 | ||
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 0b82e8044325..fbcb5934a081 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h | |||
| @@ -31,7 +31,8 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, | |||
| 31 | handle_t *handle, | 31 | handle_t *handle, |
| 32 | struct inode *inode, | 32 | struct inode *inode, |
| 33 | struct buffer_head *fe_bh, | 33 | struct buffer_head *fe_bh, |
| 34 | u64 blkno, | 34 | u32 cpos, |
| 35 | u64 start_blk, | ||
| 35 | u32 new_clusters, | 36 | u32 new_clusters, |
| 36 | struct ocfs2_alloc_context *meta_ac); | 37 | struct ocfs2_alloc_context *meta_ac); |
| 37 | int ocfs2_num_free_extents(struct ocfs2_super *osb, | 38 | int ocfs2_num_free_extents(struct ocfs2_super *osb, |
| @@ -70,6 +71,8 @@ struct ocfs2_truncate_context { | |||
| 70 | struct buffer_head *tc_last_eb_bh; | 71 | struct buffer_head *tc_last_eb_bh; |
| 71 | }; | 72 | }; |
| 72 | 73 | ||
| 74 | int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, | ||
| 75 | u64 new_i_size); | ||
| 73 | int ocfs2_prepare_truncate(struct ocfs2_super *osb, | 76 | int ocfs2_prepare_truncate(struct ocfs2_super *osb, |
| 74 | struct inode *inode, | 77 | struct inode *inode, |
| 75 | struct buffer_head *fe_bh, | 78 | struct buffer_head *fe_bh, |
| @@ -79,4 +82,26 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, | |||
| 79 | struct buffer_head *fe_bh, | 82 | struct buffer_head *fe_bh, |
| 80 | struct ocfs2_truncate_context *tc); | 83 | struct ocfs2_truncate_context *tc); |
| 81 | 84 | ||
| 85 | int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, | ||
| 86 | u32 cpos, struct buffer_head **leaf_bh); | ||
| 87 | |||
| 88 | /* | ||
| 89 | * Helper function to look at the # of clusters in an extent record. | ||
| 90 | */ | ||
| 91 | static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el, | ||
| 92 | struct ocfs2_extent_rec *rec) | ||
| 93 | { | ||
| 94 | /* | ||
| 95 | * Cluster count in extent records is slightly different | ||
| 96 | * between interior nodes and leaf nodes. This is to support | ||
| 97 | * unwritten extents which need a flags field in leaf node | ||
| 98 | * records, thus shrinking the available space for a clusters | ||
| 99 | * field. | ||
| 100 | */ | ||
| 101 | if (el->l_tree_depth) | ||
| 102 | return le32_to_cpu(rec->e_int_clusters); | ||
| 103 | else | ||
| 104 | return le16_to_cpu(rec->e_leaf_clusters); | ||
| 105 | } | ||
| 106 | |||
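The split in the helper above exists because leaf records give up half of the old 32-bit cluster count to make room for the unwritten-extent flags byte. A simplified picture of the record layout from this series, using the kernel's little-endian types (field names follow ocfs2_fs.h, but treat the exact layout here as an approximation, not the canonical definition):

```c
/* Simplified sketch, not the definition in ocfs2_fs.h. */
struct ocfs2_extent_rec_sketch {
	__le32 e_cpos;		/* logical start, both node types */
	union {
		__le32 e_int_clusters;	/* interior nodes: full 32 bits */
		struct {
			__le16 e_leaf_clusters;	/* leaves: 16 bits... */
			__u8   e_reserved1;
			__u8   e_flags;	/* ...freeing a byte for flags
					 * such as OCFS2_EXT_UNWRITTEN */
		};
	};
	__le64 e_blkno;		/* physical start block */
};
```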
| 82 | #endif /* OCFS2_ALLOC_H */ | 107 | #endif /* OCFS2_ALLOC_H */ |
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 875c11443817..56963e6c46c0 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
| @@ -24,6 +24,8 @@ | |||
| 24 | #include <linux/highmem.h> | 24 | #include <linux/highmem.h> |
| 25 | #include <linux/pagemap.h> | 25 | #include <linux/pagemap.h> |
| 26 | #include <asm/byteorder.h> | 26 | #include <asm/byteorder.h> |
| 27 | #include <linux/swap.h> | ||
| 28 | #include <linux/pipe_fs_i.h> | ||
| 27 | 29 | ||
| 28 | #define MLOG_MASK_PREFIX ML_FILE_IO | 30 | #define MLOG_MASK_PREFIX ML_FILE_IO |
| 29 | #include <cluster/masklog.h> | 31 | #include <cluster/masklog.h> |
| @@ -37,6 +39,7 @@ | |||
| 37 | #include "file.h" | 39 | #include "file.h" |
| 38 | #include "inode.h" | 40 | #include "inode.h" |
| 39 | #include "journal.h" | 41 | #include "journal.h" |
| 42 | #include "suballoc.h" | ||
| 40 | #include "super.h" | 43 | #include "super.h" |
| 41 | #include "symlink.h" | 44 | #include "symlink.h" |
| 42 | 45 | ||
| @@ -134,7 +137,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, | |||
| 134 | struct buffer_head *bh_result, int create) | 137 | struct buffer_head *bh_result, int create) |
| 135 | { | 138 | { |
| 136 | int err = 0; | 139 | int err = 0; |
| 140 | unsigned int ext_flags; | ||
| 137 | u64 p_blkno, past_eof; | 141 | u64 p_blkno, past_eof; |
| 142 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 138 | 143 | ||
| 139 | mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, | 144 | mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, |
| 140 | (unsigned long long)iblock, bh_result, create); | 145 | (unsigned long long)iblock, bh_result, create); |
| @@ -149,17 +154,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, | |||
| 149 | goto bail; | 154 | goto bail; |
| 150 | } | 155 | } |
| 151 | 156 | ||
| 152 | /* this can happen if another node truncs after our extend! */ | 157 | err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL, |
| 153 | spin_lock(&OCFS2_I(inode)->ip_lock); | 158 | &ext_flags); |
| 154 | if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb, | ||
| 155 | OCFS2_I(inode)->ip_clusters)) | ||
| 156 | err = -EIO; | ||
| 157 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
| 158 | if (err) | ||
| 159 | goto bail; | ||
| 160 | |||
| 161 | err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, | ||
| 162 | NULL); | ||
| 163 | if (err) { | 159 | if (err) { |
| 164 | mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " | 160 | mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " |
| 165 | "%llu, NULL)\n", err, inode, (unsigned long long)iblock, | 161 | "%llu, NULL)\n", err, inode, (unsigned long long)iblock, |
| @@ -167,22 +163,39 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, | |||
| 167 | goto bail; | 163 | goto bail; |
| 168 | } | 164 | } |
| 169 | 165 | ||
| 170 | map_bh(bh_result, inode->i_sb, p_blkno); | 166 | /* |
| 171 | 167 | * ocfs2 never allocates in this function - the only time we | |
| 172 | if (bh_result->b_blocknr == 0) { | 168 | * need to use BH_New is when we're extending i_size on a file |
| 173 | err = -EIO; | 169 | * system which doesn't support holes, in which case BH_New |
| 174 | mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n", | 170 | * allows block_prepare_write() to zero the new blocks. |
| 175 | (unsigned long long)iblock, | 171 | */ |
| 176 | (unsigned long long)p_blkno, | 172 | mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb), |
| 177 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | 173 | "ino %lu, iblock %llu\n", inode->i_ino, |
| 178 | } | 174 | (unsigned long long)iblock); |
| 175 | |||
| 176 | /* Treat the unwritten extent as a hole for zeroing purposes. */ | ||
| 177 | if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) | ||
| 178 | map_bh(bh_result, inode->i_sb, p_blkno); | ||
| 179 | |||
| 180 | if (!ocfs2_sparse_alloc(osb)) { | ||
| 181 | if (p_blkno == 0) { | ||
| 182 | err = -EIO; | ||
| 183 | mlog(ML_ERROR, | ||
| 184 | "iblock = %llu p_blkno = %llu blkno=(%llu)\n", | ||
| 185 | (unsigned long long)iblock, | ||
| 186 | (unsigned long long)p_blkno, | ||
| 187 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
| 188 | mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); | ||
| 189 | dump_stack(); | ||
| 190 | } | ||
| 179 | 191 | ||
| 180 | past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); | 192 | past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); |
| 181 | mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, | 193 | mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, |
| 182 | (unsigned long long)past_eof); | 194 | (unsigned long long)past_eof); |
| 183 | 195 | ||
| 184 | if (create && (iblock >= past_eof)) | 196 | if (create && (iblock >= past_eof)) |
| 185 | set_buffer_new(bh_result); | 197 | set_buffer_new(bh_result); |
| 198 | } | ||
| 186 | 199 | ||
| 187 | bail: | 200 | bail: |
| 188 | if (err < 0) | 201 | if (err < 0) |
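The mapping rule introduced in ocfs2_get_block() above is worth stating on its own: only a real, written extent gets mapped; holes and unwritten extents are left unmapped so the generic code zeros them on read. A condensed sketch of that decision (not the kernel function itself; the flag value is illustrative):

```c
#include <stdint.h>

#define OCFS2_EXT_UNWRITTEN 0x01	/* value illustrative only */

/* Nonzero when bh_result should be mapped to p_blkno. */
static int should_map_block(uint64_t p_blkno, unsigned int ext_flags)
{
	if (p_blkno == 0)
		return 0;			/* hole */
	if (ext_flags & OCFS2_EXT_UNWRITTEN)
		return 0;			/* reads back as zeros */
	return 1;
}
```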
| @@ -276,8 +289,11 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) | |||
| 276 | return ret; | 289 | return ret; |
| 277 | } | 290 | } |
| 278 | 291 | ||
| 279 | /* This can also be called from ocfs2_write_zero_page() which has done | 292 | /* |
| 280 | * it's own cluster locking. */ | 293 | * This is called from ocfs2_write_zero_page() which has handled its |
| 294 | * own cluster locking and has ensured allocation exists for those | ||
| 295 | * blocks to be written. | ||
| 296 | */ | ||
| 281 | int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, | 297 | int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, |
| 282 | unsigned from, unsigned to) | 298 | unsigned from, unsigned to) |
| 283 | { | 299 | { |
| @@ -292,44 +308,17 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, | |||
| 292 | return ret; | 308 | return ret; |
| 293 | } | 309 | } |
| 294 | 310 | ||
| 295 | /* | ||
| 296 | * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called | ||
| 297 | * from loopback. It must be able to perform its own locking around | ||
| 298 | * ocfs2_get_block(). | ||
| 299 | */ | ||
| 300 | static int ocfs2_prepare_write(struct file *file, struct page *page, | ||
| 301 | unsigned from, unsigned to) | ||
| 302 | { | ||
| 303 | struct inode *inode = page->mapping->host; | ||
| 304 | int ret; | ||
| 305 | |||
| 306 | mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); | ||
| 307 | |||
| 308 | ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page); | ||
| 309 | if (ret != 0) { | ||
| 310 | mlog_errno(ret); | ||
| 311 | goto out; | ||
| 312 | } | ||
| 313 | |||
| 314 | ret = ocfs2_prepare_write_nolock(inode, page, from, to); | ||
| 315 | |||
| 316 | ocfs2_meta_unlock(inode, 0); | ||
| 317 | out: | ||
| 318 | mlog_exit(ret); | ||
| 319 | return ret; | ||
| 320 | } | ||
| 321 | |||
| 322 | /* Taken from ext3. We don't necessarily need the full blown | 311 | /* Taken from ext3. We don't necessarily need the full blown |
| 323 | * functionality yet, but IMHO it's better to cut and paste the whole | 312 | * functionality yet, but IMHO it's better to cut and paste the whole |
| 324 | * thing so we can avoid introducing our own bugs (and easily pick up | 313 | * thing so we can avoid introducing our own bugs (and easily pick up |
| 325 | * their fixes when they happen) --Mark */ | 314 | * their fixes when they happen) --Mark */ |
| 326 | static int walk_page_buffers( handle_t *handle, | 315 | int walk_page_buffers( handle_t *handle, |
| 327 | struct buffer_head *head, | 316 | struct buffer_head *head, |
| 328 | unsigned from, | 317 | unsigned from, |
| 329 | unsigned to, | 318 | unsigned to, |
| 330 | int *partial, | 319 | int *partial, |
| 331 | int (*fn)( handle_t *handle, | 320 | int (*fn)( handle_t *handle, |
| 332 | struct buffer_head *bh)) | 321 | struct buffer_head *bh)) |
| 333 | { | 322 | { |
| 334 | struct buffer_head *bh; | 323 | struct buffer_head *bh; |
| 335 | unsigned block_start, block_end; | 324 | unsigned block_start, block_end; |
| @@ -388,95 +377,6 @@ out: | |||
| 388 | return handle; | 377 | return handle; |
| 389 | } | 378 | } |
| 390 | 379 | ||
| 391 | static int ocfs2_commit_write(struct file *file, struct page *page, | ||
| 392 | unsigned from, unsigned to) | ||
| 393 | { | ||
| 394 | int ret; | ||
| 395 | struct buffer_head *di_bh = NULL; | ||
| 396 | struct inode *inode = page->mapping->host; | ||
| 397 | handle_t *handle = NULL; | ||
| 398 | struct ocfs2_dinode *di; | ||
| 399 | |||
| 400 | mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); | ||
| 401 | |||
| 402 | /* NOTE: ocfs2_file_aio_write has ensured that it's safe for | ||
| 403 | * us to continue here without rechecking the I/O against | ||
| 404 | * changed inode values. | ||
| 405 | * | ||
| 406 | * 1) We're currently holding the inode alloc lock, so no | ||
| 407 | * nodes can change it underneath us. | ||
| 408 | * | ||
| 409 | * 2) We've had to take the metadata lock at least once | ||
| 410 | * already to check for extending writes, suid removal, etc. | ||
| 411 | * The meta data update code then ensures that we don't get a | ||
| 412 | * stale inode allocation image (i_size, i_clusters, etc). | ||
| 413 | */ | ||
| 414 | |||
| 415 | ret = ocfs2_meta_lock_with_page(inode, &di_bh, 1, page); | ||
| 416 | if (ret != 0) { | ||
| 417 | mlog_errno(ret); | ||
| 418 | goto out; | ||
| 419 | } | ||
| 420 | |||
| 421 | ret = ocfs2_data_lock_with_page(inode, 1, page); | ||
| 422 | if (ret != 0) { | ||
| 423 | mlog_errno(ret); | ||
| 424 | goto out_unlock_meta; | ||
| 425 | } | ||
| 426 | |||
| 427 | handle = ocfs2_start_walk_page_trans(inode, page, from, to); | ||
| 428 | if (IS_ERR(handle)) { | ||
| 429 | ret = PTR_ERR(handle); | ||
| 430 | goto out_unlock_data; | ||
| 431 | } | ||
| 432 | |||
| 433 | /* Mark our buffer early. We'd rather catch this error up here | ||
| 434 | * as opposed to after a successful commit_write which would | ||
| 435 | * require us to set back inode->i_size. */ | ||
| 436 | ret = ocfs2_journal_access(handle, inode, di_bh, | ||
| 437 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
| 438 | if (ret < 0) { | ||
| 439 | mlog_errno(ret); | ||
| 440 | goto out_commit; | ||
| 441 | } | ||
| 442 | |||
| 443 | /* might update i_size */ | ||
| 444 | ret = generic_commit_write(file, page, from, to); | ||
| 445 | if (ret < 0) { | ||
| 446 | mlog_errno(ret); | ||
| 447 | goto out_commit; | ||
| 448 | } | ||
| 449 | |||
| 450 | di = (struct ocfs2_dinode *)di_bh->b_data; | ||
| 451 | |||
| 452 | /* ocfs2_mark_inode_dirty() is too heavy to use here. */ | ||
| 453 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
| 454 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); | ||
| 455 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | ||
| 456 | |||
| 457 | inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode))); | ||
| 458 | di->i_size = cpu_to_le64((u64)i_size_read(inode)); | ||
| 459 | |||
| 460 | ret = ocfs2_journal_dirty(handle, di_bh); | ||
| 461 | if (ret < 0) { | ||
| 462 | mlog_errno(ret); | ||
| 463 | goto out_commit; | ||
| 464 | } | ||
| 465 | |||
| 466 | out_commit: | ||
| 467 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); | ||
| 468 | out_unlock_data: | ||
| 469 | ocfs2_data_unlock(inode, 1); | ||
| 470 | out_unlock_meta: | ||
| 471 | ocfs2_meta_unlock(inode, 1); | ||
| 472 | out: | ||
| 473 | if (di_bh) | ||
| 474 | brelse(di_bh); | ||
| 475 | |||
| 476 | mlog_exit(ret); | ||
| 477 | return ret; | ||
| 478 | } | ||
| 479 | |||
| 480 | static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) | 380 | static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) |
| 481 | { | 381 | { |
| 482 | sector_t status; | 382 | sector_t status; |
| @@ -499,8 +399,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) | |||
| 499 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | 399 | down_read(&OCFS2_I(inode)->ip_alloc_sem); |
| 500 | } | 400 | } |
| 501 | 401 | ||
| 502 | err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno, | 402 | err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL); |
| 503 | NULL); | ||
| 504 | 403 | ||
| 505 | if (!INODE_JOURNAL(inode)) { | 404 | if (!INODE_JOURNAL(inode)) { |
| 506 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | 405 | up_read(&OCFS2_I(inode)->ip_alloc_sem); |
| @@ -540,8 +439,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
| 540 | struct buffer_head *bh_result, int create) | 439 | struct buffer_head *bh_result, int create) |
| 541 | { | 440 | { |
| 542 | int ret; | 441 | int ret; |
| 543 | u64 p_blkno, inode_blocks; | 442 | u64 p_blkno, inode_blocks, contig_blocks; |
| 544 | int contig_blocks; | 443 | unsigned int ext_flags; |
| 545 | unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; | 444 | unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; |
| 546 | unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; | 445 | unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; |
| 547 | 446 | ||
| @@ -549,33 +448,20 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
| 549 | * nicely aligned and of the right size, so there's no need | 448 | * nicely aligned and of the right size, so there's no need |
| 550 | * for us to check any of that. */ | 449 | * for us to check any of that. */ |
| 551 | 450 | ||
| 552 | spin_lock(&OCFS2_I(inode)->ip_lock); | 451 | inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); |
| 553 | inode_blocks = ocfs2_clusters_to_blocks(inode->i_sb, | ||
| 554 | OCFS2_I(inode)->ip_clusters); | ||
| 555 | |||
| 556 | /* | ||
| 557 | * For a read which begins past the end of file, we return a hole. | ||
| 558 | */ | ||
| 559 | if (!create && (iblock >= inode_blocks)) { | ||
| 560 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
| 561 | ret = 0; | ||
| 562 | goto bail; | ||
| 563 | } | ||
| 564 | 452 | ||
| 565 | /* | 453 | /* |
| 566 | * Any write past EOF is not allowed because we'd be extending. | 454 | * Any write past EOF is not allowed because we'd be extending. |
| 567 | */ | 455 | */ |
| 568 | if (create && (iblock + max_blocks) > inode_blocks) { | 456 | if (create && (iblock + max_blocks) > inode_blocks) { |
| 569 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
| 570 | ret = -EIO; | 457 | ret = -EIO; |
| 571 | goto bail; | 458 | goto bail; |
| 572 | } | 459 | } |
| 573 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
| 574 | 460 | ||
| 575 | /* This figures out the size of the next contiguous block, and | 461 | /* This figures out the size of the next contiguous block, and |
| 576 | * our logical offset */ | 462 | * our logical offset */ |
| 577 | ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, | 463 | ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, |
| 578 | &contig_blocks); | 464 | &contig_blocks, &ext_flags); |
| 579 | if (ret) { | 465 | if (ret) { |
| 580 | mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", | 466 | mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", |
| 581 | (unsigned long long)iblock); | 467 | (unsigned long long)iblock); |
| @@ -583,7 +469,37 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
| 583 | goto bail; | 469 | goto bail; |
| 584 | } | 470 | } |
| 585 | 471 | ||
| 586 | map_bh(bh_result, inode->i_sb, p_blkno); | 472 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { |
| 473 | ocfs2_error(inode->i_sb, | ||
| 474 | "Inode %llu has a hole at block %llu\n", | ||
| 475 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
| 476 | (unsigned long long)iblock); | ||
| 477 | ret = -EROFS; | ||
| 478 | goto bail; | ||
| 479 | } | ||
| 480 | |||
| 481 | /* | ||
| 482 | * get_more_blocks() expects us to describe a hole by clearing | ||
| 483 | * the mapped bit on bh_result(). | ||
| 484 | * | ||
| 485 | * Consider an unwritten extent as a hole. | ||
| 486 | */ | ||
| 487 | if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) | ||
| 488 | map_bh(bh_result, inode->i_sb, p_blkno); | ||
| 489 | else { | ||
| 490 | /* | ||
| 491 | * ocfs2_prepare_inode_for_write() should have caught | ||
| 492 | * the case where we'd be filling a hole and triggered | ||
| 493 | * a buffered write instead. | ||
| 494 | */ | ||
| 495 | if (create) { | ||
| 496 | ret = -EIO; | ||
| 497 | mlog_errno(ret); | ||
| 498 | goto bail; | ||
| 499 | } | ||
| 500 | |||
| 501 | clear_buffer_mapped(bh_result); | ||
| 502 | } | ||
| 587 | 503 | ||
| 588 | /* make sure we don't map more than max_blocks blocks here as | 504 | /* make sure we don't map more than max_blocks blocks here as |
| 589 | that's all the kernel will handle at this point. */ | 505 | that's all the kernel will handle at this point. */ |
| @@ -606,12 +522,17 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, | |||
| 606 | void *private) | 522 | void *private) |
| 607 | { | 523 | { |
| 608 | struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; | 524 | struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; |
| 525 | int level; | ||
| 609 | 526 | ||
| 610 | /* this io's submitter should not have unlocked this before we could */ | 527 | /* this io's submitter should not have unlocked this before we could */ |
| 611 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); | 528 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); |
| 529 | |||
| 612 | ocfs2_iocb_clear_rw_locked(iocb); | 530 | ocfs2_iocb_clear_rw_locked(iocb); |
| 613 | up_read(&inode->i_alloc_sem); | 531 | |
| 614 | ocfs2_rw_unlock(inode, 0); | 532 | level = ocfs2_iocb_rw_locked_level(iocb); |
| 533 | if (!level) | ||
| 534 | up_read(&inode->i_alloc_sem); | ||
| 535 | ocfs2_rw_unlock(inode, level); | ||
| 615 | } | 536 | } |
| 616 | 537 | ||
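The completion handler above has to undo exactly what the submitter took: i_alloc_sem is only dropped for level-0 (shared) locks, and the rw lock level now travels with the iocb. A purely hypothetical illustration of recording a lock level in a flags word, in the spirit of the ocfs2_iocb_* bit helpers (the real ones operate on the iocb with bitops):

```c
#include <stdio.h>

/* Hypothetical bits; not the real iocb flag encoding. */
#define RW_LOCK_HELD	(1u << 0)
#define RW_LOCK_EX	(1u << 1)	/* 0 = shared, 1 = exclusive */

static void record_rw_lock(unsigned int *flags, int level)
{
	*flags |= RW_LOCK_HELD;
	if (level)
		*flags |= RW_LOCK_EX;
}

static int recorded_rw_level(unsigned int flags)
{
	return !!(flags & RW_LOCK_EX);
}

int main(void)
{
	unsigned int flags = 0;

	record_rw_lock(&flags, 1);	/* submitter took an exclusive lock */
	/* completion drops the same level it finds recorded */
	printf("unlock at level %d\n", recorded_rw_level(flags));
	return 0;
}
```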
| 617 | /* | 538 | /* |
| @@ -647,23 +568,27 @@ static ssize_t ocfs2_direct_IO(int rw, | |||
| 647 | 568 | ||
| 648 | mlog_entry_void(); | 569 | mlog_entry_void(); |
| 649 | 570 | ||
| 650 | /* | 571 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { |
| 651 | * We get PR data locks even for O_DIRECT. This allows | 572 | /* |
| 652 | * concurrent O_DIRECT I/O but doesn't let O_DIRECT with | 573 | * We get PR data locks even for O_DIRECT. This |
| 653 | * extending and buffered zeroing writes race. If they did | 574 | * allows concurrent O_DIRECT I/O but doesn't let |
| 654 | * race then the buffered zeroing could be written back after | 575 | * O_DIRECT with extending and buffered zeroing writes |
| 655 | * the O_DIRECT I/O. It's one thing to tell people not to mix | 576 | * race. If they did race then the buffered zeroing |
| 656 | * buffered and O_DIRECT writes, but expecting them to | 577 | * could be written back after the O_DIRECT I/O. It's |
| 657 | * understand that file extension is also an implicit buffered | 578 | * one thing to tell people not to mix buffered and |
| 658 | * write is too much. By getting the PR we force writeback of | 579 | * O_DIRECT writes, but expecting them to understand |
| 659 | * the buffered zeroing before proceeding. | 580 | * that file extension is also an implicit buffered |
| 660 | */ | 581 | * write is too much. By getting the PR we force |
| 661 | ret = ocfs2_data_lock(inode, 0); | 582 | * writeback of the buffered zeroing before |
| 662 | if (ret < 0) { | 583 | * proceeding. |
| 663 | mlog_errno(ret); | 584 | */ |
| 664 | goto out; | 585 | ret = ocfs2_data_lock(inode, 0); |
| 586 | if (ret < 0) { | ||
| 587 | mlog_errno(ret); | ||
| 588 | goto out; | ||
| 589 | } | ||
| 590 | ocfs2_data_unlock(inode, 0); | ||
| 665 | } | 591 | } |
| 666 | ocfs2_data_unlock(inode, 0); | ||
| 667 | 592 | ||
| 668 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, | 593 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, |
| 669 | inode->i_sb->s_bdev, iov, offset, | 594 | inode->i_sb->s_bdev, iov, offset, |
| @@ -675,11 +600,715 @@ out: | |||
| 675 | return ret; | 600 | return ret; |
| 676 | } | 601 | } |
| 677 | 602 | ||
| 603 | static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, | ||
| 604 | u32 cpos, | ||
| 605 | unsigned int *start, | ||
| 606 | unsigned int *end) | ||
| 607 | { | ||
| 608 | unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE; | ||
| 609 | |||
| 610 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) { | ||
| 611 | unsigned int cpp; | ||
| 612 | |||
| 613 | cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); | ||
| 614 | |||
| 615 | cluster_start = cpos % cpp; | ||
| 616 | cluster_start = cluster_start << osb->s_clustersize_bits; | ||
| 617 | |||
| 618 | cluster_end = cluster_start + osb->s_clustersize; | ||
| 619 | } | ||
| 620 | |||
| 621 | BUG_ON(cluster_start > PAGE_SIZE); | ||
| 622 | BUG_ON(cluster_end > PAGE_SIZE); | ||
| 623 | |||
| 624 | if (start) | ||
| 625 | *start = cluster_start; | ||
| 626 | if (end) | ||
| 627 | *end = cluster_end; | ||
| 628 | } | ||
| 629 | |||
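When pages are larger than clusters, a cluster occupies only a slice of its page. A worked example of the boundary math above, with 4KB pages and 1KB clusters (both hypothetical):

```c
#include <stdio.h>

int main(void)
{
	unsigned int page_shift = 12, csize_bits = 10;
	unsigned int cpp = 1u << (page_shift - csize_bits);	/* 4 clusters/page */
	unsigned int cpos = 7;					/* hypothetical */
	unsigned int start = (cpos % cpp) << csize_bits;
	unsigned int end = start + (1u << csize_bits);

	/* Cluster 7 is the fourth cluster of its page: bytes [3072, 4096). */
	printf("cluster %u maps to page bytes [%u, %u)\n", cpos, start, end);
	return 0;
}
```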
| 630 | /* | ||
| 631 | * 'from' and 'to' are the region in the page to avoid zeroing. | ||
| 632 | * | ||
| 633 | * If pagesize > clustersize, this function will avoid zeroing outside | ||
| 634 | * of the cluster boundary. | ||
| 635 | * | ||
| 636 | * from == to == 0 is code for "zero the entire cluster region" | ||
| 637 | */ | ||
| 638 | static void ocfs2_clear_page_regions(struct page *page, | ||
| 639 | struct ocfs2_super *osb, u32 cpos, | ||
| 640 | unsigned from, unsigned to) | ||
| 641 | { | ||
| 642 | void *kaddr; | ||
| 643 | unsigned int cluster_start, cluster_end; | ||
| 644 | |||
| 645 | ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); | ||
| 646 | |||
| 647 | kaddr = kmap_atomic(page, KM_USER0); | ||
| 648 | |||
| 649 | if (from || to) { | ||
| 650 | if (from > cluster_start) | ||
| 651 | memset(kaddr + cluster_start, 0, from - cluster_start); | ||
| 652 | if (to < cluster_end) | ||
| 653 | memset(kaddr + to, 0, cluster_end - to); | ||
| 654 | } else { | ||
| 655 | memset(kaddr + cluster_start, 0, cluster_end - cluster_start); | ||
| 656 | } | ||
| 657 | |||
| 658 | kunmap_atomic(kaddr, KM_USER0); | ||
| 659 | } | ||
| 660 | |||
| 661 | /* | ||
| 662 | * Some of this is taken from block_prepare_write(). We already have our | ||
| 663 | * mapping by now though, and the entire write will be allocating or | ||
| 664 | * it won't, so not much need to use BH_New. | ||
| 665 | * | ||
| 666 | * This will also skip zeroing, which is handled externally. | ||
| 667 | */ | ||
| 668 | int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, | ||
| 669 | struct inode *inode, unsigned int from, | ||
| 670 | unsigned int to, int new) | ||
| 671 | { | ||
| 672 | int ret = 0; | ||
| 673 | struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; | ||
| 674 | unsigned int block_end, block_start; | ||
| 675 | unsigned int bsize = 1 << inode->i_blkbits; | ||
| 676 | |||
| 677 | if (!page_has_buffers(page)) | ||
| 678 | create_empty_buffers(page, bsize, 0); | ||
| 679 | |||
| 680 | head = page_buffers(page); | ||
| 681 | for (bh = head, block_start = 0; bh != head || !block_start; | ||
| 682 | bh = bh->b_this_page, block_start += bsize) { | ||
| 683 | block_end = block_start + bsize; | ||
| 684 | |||
| 685 | /* | ||
| 686 | * Ignore blocks outside of our i/o range - | ||
| 687 | * they may belong to unallocated clusters. | ||
| 688 | */ | ||
| 689 | if (block_start >= to || block_end <= from) { | ||
| 690 | if (PageUptodate(page)) | ||
| 691 | set_buffer_uptodate(bh); | ||
| 692 | continue; | ||
| 693 | } | ||
| 694 | |||
| 695 | /* | ||
| 696 | * For an allocating write with cluster size >= page | ||
| 697 | * size, we always write the entire page. | ||
| 698 | */ | ||
| 699 | |||
| 700 | if (buffer_new(bh)) | ||
| 701 | clear_buffer_new(bh); | ||
| 702 | |||
| 703 | if (!buffer_mapped(bh)) { | ||
| 704 | map_bh(bh, inode->i_sb, *p_blkno); | ||
| 705 | unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); | ||
| 706 | } | ||
| 707 | |||
| 708 | if (PageUptodate(page)) { | ||
| 709 | if (!buffer_uptodate(bh)) | ||
| 710 | set_buffer_uptodate(bh); | ||
| 711 | } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && | ||
| 712 | (block_start < from || block_end > to)) { | ||
| 713 | ll_rw_block(READ, 1, &bh); | ||
| 714 | *wait_bh++=bh; | ||
| 715 | } | ||
| 716 | |||
| 717 | *p_blkno = *p_blkno + 1; | ||
| 718 | } | ||
| 719 | |||
| 720 | /* | ||
| 721 | * If we issued read requests - let them complete. | ||
| 722 | */ | ||
| 723 | while(wait_bh > wait) { | ||
| 724 | wait_on_buffer(*--wait_bh); | ||
| 725 | if (!buffer_uptodate(*wait_bh)) | ||
| 726 | ret = -EIO; | ||
| 727 | } | ||
| 728 | |||
| 729 | if (ret == 0 || !new) | ||
| 730 | return ret; | ||
| 731 | |||
| 732 | /* | ||
| 733 | * If we get -EIO above, zero out any newly allocated blocks | ||
| 734 | * to avoid exposing stale data. | ||
| 735 | */ | ||
| 736 | bh = head; | ||
| 737 | block_start = 0; | ||
| 738 | do { | ||
| 739 | void *kaddr; | ||
| 740 | |||
| 741 | block_end = block_start + bsize; | ||
| 742 | if (block_end <= from) | ||
| 743 | goto next_bh; | ||
| 744 | if (block_start >= to) | ||
| 745 | break; | ||
| 746 | |||
| 747 | kaddr = kmap_atomic(page, KM_USER0); | ||
| 748 | memset(kaddr+block_start, 0, bh->b_size); | ||
| 749 | flush_dcache_page(page); | ||
| 750 | kunmap_atomic(kaddr, KM_USER0); | ||
| 751 | set_buffer_uptodate(bh); | ||
| 752 | mark_buffer_dirty(bh); | ||
| 753 | |||
| 754 | next_bh: | ||
| 755 | block_start = block_end; | ||
| 756 | bh = bh->b_this_page; | ||
| 757 | } while (bh != head); | ||
| 758 | |||
| 759 | return ret; | ||
| 760 | } | ||
| 761 | |||
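The read-before-write rule buried in the loop above deserves a plain statement: a buffer is read from disk only when neither the page nor the buffer is uptodate and the write range covers it partially. The isolated test, as a sketch rather than kernel code (the real loop also skips delayed-allocation buffers via buffer_delay()):

```c
/* Nonzero when a buffer spanning [bs, be) must be read before a
 * write of [from, to) can proceed over it. */
static int needs_read(unsigned int bs, unsigned int be,
		      unsigned int from, unsigned int to,
		      int page_uptodate, int buf_uptodate)
{
	if (page_uptodate || buf_uptodate)
		return 0;
	/* partially covered: leading or trailing bytes must survive */
	return bs < from || be > to;
}
```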
| 762 | /* | ||
| 763 | * This will copy user data from the buffer page in the splice | ||
| 764 | * context. | ||
| 765 | * | ||
| 766 | * For now, we ignore SPLICE_F_MOVE as that would require some extra | ||
| 767 | * communication all the way out to ocfs2_write(). | ||
| 768 | */ | ||
| 769 | int ocfs2_map_and_write_splice_data(struct inode *inode, | ||
| 770 | struct ocfs2_write_ctxt *wc, u64 *p_blkno, | ||
| 771 | unsigned int *ret_from, unsigned int *ret_to) | ||
| 772 | { | ||
| 773 | int ret; | ||
| 774 | unsigned int to, from, cluster_start, cluster_end; | ||
| 775 | char *src, *dst; | ||
| 776 | struct ocfs2_splice_write_priv *sp = wc->w_private; | ||
| 777 | struct pipe_buffer *buf = sp->s_buf; | ||
| 778 | unsigned long bytes, src_from; | ||
| 779 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 780 | |||
| 781 | ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, | ||
| 782 | &cluster_end); | ||
| 783 | |||
| 784 | from = sp->s_offset; | ||
| 785 | src_from = sp->s_buf_offset; | ||
| 786 | bytes = wc->w_count; | ||
| 787 | |||
| 788 | if (wc->w_large_pages) { | ||
| 789 | /* | ||
| 790 | * For cluster size < page size, we have to | ||
| 791 | * calculate pos within the cluster and obey | ||
| 792 | * the rightmost boundary. | ||
| 793 | */ | ||
| 794 | bytes = min(bytes, (unsigned long)(osb->s_clustersize | ||
| 795 | - (wc->w_pos & (osb->s_clustersize - 1)))); | ||
| 796 | } | ||
| 797 | to = from + bytes; | ||
| 798 | |||
| 799 | if (wc->w_this_page_new) | ||
| 800 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | ||
| 801 | cluster_start, cluster_end, 1); | ||
| 802 | else | ||
| 803 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | ||
| 804 | from, to, 0); | ||
| 805 | if (ret) { | ||
| 806 | mlog_errno(ret); | ||
| 807 | goto out; | ||
| 808 | } | ||
| 809 | |||
| 810 | BUG_ON(from > PAGE_CACHE_SIZE); | ||
| 811 | BUG_ON(to > PAGE_CACHE_SIZE); | ||
| 812 | BUG_ON(from > osb->s_clustersize); | ||
| 813 | BUG_ON(to > osb->s_clustersize); | ||
| 814 | |||
| 815 | src = buf->ops->map(sp->s_pipe, buf, 1); | ||
| 816 | dst = kmap_atomic(wc->w_this_page, KM_USER1); | ||
| 817 | memcpy(dst + from, src + src_from, bytes); | ||
| 818 | kunmap_atomic(wc->w_this_page, KM_USER1); | ||
| 819 | buf->ops->unmap(sp->s_pipe, buf, src); | ||
| 820 | |||
| 821 | wc->w_finished_copy = 1; | ||
| 822 | |||
| 823 | *ret_from = from; | ||
| 824 | *ret_to = to; | ||
| 825 | out: | ||
| 826 | |||
| 827 | return bytes ? (unsigned int)bytes : ret; | ||
| 828 | } | ||
| 829 | |||
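The w_large_pages branch above keeps a single copy from crossing the rightmost cluster boundary when clusters are smaller than pages. With 4 KB clusters and a write at w_pos = 0x1234, the position sits 564 bytes into its cluster, so at most 4096 - 564 = 3532 bytes may be copied in this pass. A sketch of that clamp in isolation (clamp_to_cluster() is a hypothetical name):

```c
/*
 * Hypothetical helper isolating the cluster clamp above. clustersize
 * is a power of two, so "pos & (clustersize - 1)" is the byte offset
 * within the current cluster.
 */
static unsigned long clamp_to_cluster(unsigned long bytes, loff_t pos,
				      unsigned int clustersize)
{
	unsigned long in_cluster = pos & (clustersize - 1);

	/* e.g. clustersize 4096, pos 0x1234: 564 bytes in, 3532 left */
	return min(bytes, (unsigned long)(clustersize - in_cluster));
}
```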
| 830 | /* | ||
| 831 | * This will copy user data from the iovec in the buffered write | ||
| 832 | * context. | ||
| 833 | */ | ||
| 834 | int ocfs2_map_and_write_user_data(struct inode *inode, | ||
| 835 | struct ocfs2_write_ctxt *wc, u64 *p_blkno, | ||
| 836 | unsigned int *ret_from, unsigned int *ret_to) | ||
| 837 | { | ||
| 838 | int ret; | ||
| 839 | unsigned int to, from, cluster_start, cluster_end; | ||
| 840 | unsigned long bytes, src_from; | ||
| 841 | char *dst; | ||
| 842 | struct ocfs2_buffered_write_priv *bp = wc->w_private; | ||
| 843 | const struct iovec *cur_iov = bp->b_cur_iov; | ||
| 844 | char __user *buf; | ||
| 845 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 846 | |||
| 847 | ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, | ||
| 848 | &cluster_end); | ||
| 849 | |||
| 850 | buf = cur_iov->iov_base + bp->b_cur_off; | ||
| 851 | src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; | ||
| 852 | |||
| 853 | from = wc->w_pos & (PAGE_CACHE_SIZE - 1); | ||
| 854 | |||
| 855 | /* | ||
| 856 | * This is a lot of comparisons, but it reads quite | ||
| 857 | * easily, which is important here. | ||
| 858 | */ | ||
| 859 | /* Stay within the src page */ | ||
| 860 | bytes = PAGE_SIZE - src_from; | ||
| 861 | /* Stay within the vector */ | ||
| 862 | bytes = min(bytes, | ||
| 863 | (unsigned long)(cur_iov->iov_len - bp->b_cur_off)); | ||
| 864 | /* Stay within count */ | ||
| 865 | bytes = min(bytes, (unsigned long)wc->w_count); | ||
| 866 | /* | ||
| 867 | * For clustersize > page size, just stay within | ||
| 868 | * target page, otherwise we have to calculate pos | ||
| 869 | * within the cluster and obey the rightmost | ||
| 870 | * boundary. | ||
| 871 | */ | ||
| 872 | if (wc->w_large_pages) { | ||
| 873 | /* | ||
| 874 | * For cluster size < page size, we have to | ||
| 875 | * calculate pos within the cluster and obey | ||
| 876 | * the rightmost boundary. | ||
| 877 | */ | ||
| 878 | bytes = min(bytes, (unsigned long)(osb->s_clustersize | ||
| 879 | - (wc->w_pos & (osb->s_clustersize - 1)))); | ||
| 880 | } else { | ||
| 881 | /* | ||
| 882 | * cluster size > page size is the most common | ||
| 883 | * case - we just stay within the target page | ||
| 884 | * boundary. | ||
| 885 | */ | ||
| 886 | bytes = min(bytes, PAGE_CACHE_SIZE - from); | ||
| 887 | } | ||
| 888 | |||
| 889 | to = from + bytes; | ||
| 890 | |||
| 891 | if (wc->w_this_page_new) | ||
| 892 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | ||
| 893 | cluster_start, cluster_end, 1); | ||
| 894 | else | ||
| 895 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | ||
| 896 | from, to, 0); | ||
| 897 | if (ret) { | ||
| 898 | mlog_errno(ret); | ||
| 899 | goto out; | ||
| 900 | } | ||
| 901 | |||
| 902 | BUG_ON(from > PAGE_CACHE_SIZE); | ||
| 903 | BUG_ON(to > PAGE_CACHE_SIZE); | ||
| 904 | BUG_ON(from > osb->s_clustersize); | ||
| 905 | BUG_ON(to > osb->s_clustersize); | ||
| 906 | |||
| 907 | dst = kmap(wc->w_this_page); | ||
| 908 | memcpy(dst + from, bp->b_src_buf + src_from, bytes); | ||
| 909 | kunmap(wc->w_this_page); | ||
| 910 | |||
| 911 | /* | ||
| 912 | * XXX: This is slow, but simple. The caller of | ||
| 913 | * ocfs2_buffered_write_cluster() is responsible for | ||
| 914 | * passing through the iovecs, so it's difficult to | ||
| 915 | * predict what our next step is in here after our | ||
| 916 | * initial write. A future version should be pushing | ||
| 917 | * that iovec manipulation further down. | ||
| 918 | * | ||
| 919 | * By setting this, we indicate that a copy from user | ||
| 920 | * data was done, and subsequent calls for this | ||
| 921 | * cluster will skip copying more data. | ||
| 922 | */ | ||
| 923 | wc->w_finished_copy = 1; | ||
| 924 | |||
| 925 | *ret_from = from; | ||
| 926 | *ret_to = to; | ||
| 927 | out: | ||
| 928 | |||
| 929 | return bytes ? (unsigned int)bytes : ret; | ||
| 930 | } | ||
| 931 | |||
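The min() cascade above clamps one copy to four independent limits: the source page, the current iovec, the remaining count, and the page or cluster boundary. For example, a user buffer at 0x20000123 has src_from = 0x123, so at most 4096 - 291 = 3805 bytes can come from that source page in one pass. A sketch of the first three clamps (first_copy_len() is a hypothetical name):

```c
/*
 * Hypothetical helper showing the same clamping order as above:
 * stay within the source page, then the current vector, then the
 * caller's remaining count.
 */
static size_t first_copy_len(const struct iovec *iov, size_t cur_off,
			     size_t count)
{
	unsigned long src_from =
		(unsigned long)(iov->iov_base + cur_off) & ~PAGE_MASK;
	size_t bytes = PAGE_SIZE - src_from;	/* stay within the src page */

	bytes = min(bytes, iov->iov_len - cur_off);	/* stay within the vector */
	return min(bytes, count);			/* stay within count */
}
```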
| 932 | /* | ||
| 933 | * Map, fill and write a page to disk. | ||
| 934 | * | ||
| 935 | * The work of copying data is done via callback. Newly allocated | ||
| 936 | * pages which don't take user data will be zero'd (set 'new' to | ||
| 937 | * indicate an allocating write) | ||
| 938 | * | ||
| 939 | * Returns a negative error code or the number of bytes copied into | ||
| 940 | * the page. | ||
| 941 | */ | ||
| 942 | int ocfs2_write_data_page(struct inode *inode, handle_t *handle, | ||
| 943 | u64 *p_blkno, struct page *page, | ||
| 944 | struct ocfs2_write_ctxt *wc, int new) | ||
| 945 | { | ||
| 946 | int ret, copied = 0; | ||
| 947 | unsigned int from = 0, to = 0; | ||
| 948 | unsigned int cluster_start, cluster_end; | ||
| 949 | unsigned int zero_from = 0, zero_to = 0; | ||
| 950 | |||
| 951 | ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, | ||
| 952 | &cluster_start, &cluster_end); | ||
| 953 | |||
| 954 | if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index | ||
| 955 | && !wc->w_finished_copy) { | ||
| 956 | |||
| 957 | wc->w_this_page = page; | ||
| 958 | wc->w_this_page_new = new; | ||
| 959 | ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); | ||
| 960 | if (ret < 0) { | ||
| 961 | mlog_errno(ret); | ||
| 962 | goto out; | ||
| 963 | } | ||
| 964 | |||
| 965 | copied = ret; | ||
| 966 | |||
| 967 | zero_from = from; | ||
| 968 | zero_to = to; | ||
| 969 | if (new) { | ||
| 970 | from = cluster_start; | ||
| 971 | to = cluster_end; | ||
| 972 | } | ||
| 973 | } else { | ||
| 974 | /* | ||
| 975 | * If we haven't allocated the new page yet, we | ||
| 976 | * shouldn't be writing it out without copying user | ||
| 977 | * data. This is likely a math error from the caller. | ||
| 978 | */ | ||
| 979 | BUG_ON(!new); | ||
| 980 | |||
| 981 | from = cluster_start; | ||
| 982 | to = cluster_end; | ||
| 983 | |||
| 984 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, | ||
| 985 | cluster_start, cluster_end, 1); | ||
| 986 | if (ret) { | ||
| 987 | mlog_errno(ret); | ||
| 988 | goto out; | ||
| 989 | } | ||
| 990 | } | ||
| 991 | |||
| 992 | /* | ||
| 993 | * Parts of newly allocated pages need to be zero'd. | ||
| 994 | * | ||
| 995 | * Above, we have also rewritten 'to' and 'from' - as far as | ||
| 996 | * the rest of the function is concerned, the entire cluster | ||
| 997 | * range inside of a page needs to be written. | ||
| 998 | * | ||
| 999 | * We can skip this if the page is up to date - it's already | ||
| 1000 | * been zero'd from being read in as a hole. | ||
| 1001 | */ | ||
| 1002 | if (new && !PageUptodate(page)) | ||
| 1003 | ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), | ||
| 1004 | wc->w_cpos, zero_from, zero_to); | ||
| 1005 | |||
| 1006 | flush_dcache_page(page); | ||
| 1007 | |||
| 1008 | if (ocfs2_should_order_data(inode)) { | ||
| 1009 | ret = walk_page_buffers(handle, | ||
| 1010 | page_buffers(page), | ||
| 1011 | from, to, NULL, | ||
| 1012 | ocfs2_journal_dirty_data); | ||
| 1013 | if (ret < 0) | ||
| 1014 | mlog_errno(ret); | ||
| 1015 | } | ||
| 1016 | |||
| 1017 | /* | ||
| 1018 | * We don't use generic_commit_write() because we need to | ||
| 1019 | * handle our own i_size update. | ||
| 1020 | */ | ||
| 1021 | ret = block_commit_write(page, from, to); | ||
| 1022 | if (ret) | ||
| 1023 | mlog_errno(ret); | ||
| 1024 | out: | ||
| 1025 | |||
| 1026 | return copied ? copied : ret; | ||
| 1027 | } | ||
| 1028 | |||
| 1029 | /* | ||
| 1030 | * Do the actual write of some data into an inode. Optionally allocate | ||
| 1031 | * in order to fulfill the write. | ||
| 1032 | * | ||
| 1033 | * cpos is the logical cluster offset within the file to write at | ||
| 1034 | * | ||
| 1035 | * 'phys' is the physical mapping of that offset. A 'phys' value of | ||
| 1036 | * zero indicates that allocation is required. In this case, data_ac | ||
| 1037 | * and meta_ac should be valid (meta_ac can be null if metadata | ||
| 1038 | * allocation isn't required). | ||
| 1039 | */ | ||
| 1040 | static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, | ||
| 1041 | struct buffer_head *di_bh, | ||
| 1042 | struct ocfs2_alloc_context *data_ac, | ||
| 1043 | struct ocfs2_alloc_context *meta_ac, | ||
| 1044 | struct ocfs2_write_ctxt *wc) | ||
| 1045 | { | ||
| 1046 | int ret, i, numpages = 1, new; | ||
| 1047 | unsigned int copied = 0; | ||
| 1048 | u32 tmp_pos; | ||
| 1049 | u64 v_blkno, p_blkno; | ||
| 1050 | struct address_space *mapping = file->f_mapping; | ||
| 1051 | struct inode *inode = mapping->host; | ||
| 1052 | unsigned long index, start; | ||
| 1053 | struct page **cpages; | ||
| 1054 | |||
| 1055 | new = phys == 0 ? 1 : 0; | ||
| 1056 | |||
| 1057 | /* | ||
| 1058 | * Figure out how many pages we'll be manipulating here. For | ||
| 1059 | * a non-allocating write, we just change the one | ||
| 1060 | * page. Otherwise, we'll need a whole cluster's worth. | ||
| 1061 | */ | ||
| 1062 | if (new) | ||
| 1063 | numpages = ocfs2_pages_per_cluster(inode->i_sb); | ||
| 1064 | |||
| 1065 | cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS); | ||
| 1066 | if (!cpages) { | ||
| 1067 | ret = -ENOMEM; | ||
| 1068 | mlog_errno(ret); | ||
| 1069 | return ret; | ||
| 1070 | } | ||
| 1071 | |||
| 1072 | /* | ||
| 1073 | * Fill our page array first. That way we've grabbed enough so | ||
| 1074 | * that we can zero and flush if we error after adding the | ||
| 1075 | * extent. | ||
| 1076 | */ | ||
| 1077 | if (new) { | ||
| 1078 | start = ocfs2_align_clusters_to_page_index(inode->i_sb, | ||
| 1079 | wc->w_cpos); | ||
| 1080 | v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos); | ||
| 1081 | } else { | ||
| 1082 | start = wc->w_pos >> PAGE_CACHE_SHIFT; | ||
| 1083 | v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; | ||
| 1084 | } | ||
| 1085 | |||
| 1086 | for(i = 0; i < numpages; i++) { | ||
| 1087 | index = start + i; | ||
| 1088 | |||
| 1089 | cpages[i] = grab_cache_page(mapping, index); | ||
| 1090 | if (!cpages[i]) { | ||
| 1091 | ret = -ENOMEM; | ||
| 1092 | mlog_errno(ret); | ||
| 1093 | goto out; | ||
| 1094 | } | ||
| 1095 | } | ||
| 1096 | |||
| 1097 | if (new) { | ||
| 1098 | /* | ||
| 1099 | * This is safe to call with the page locks - it won't take | ||
| 1100 | * any additional semaphores or cluster locks. | ||
| 1101 | */ | ||
| 1102 | tmp_pos = wc->w_cpos; | ||
| 1103 | ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, | ||
| 1104 | &tmp_pos, 1, di_bh, handle, | ||
| 1105 | data_ac, meta_ac, NULL); | ||
| 1106 | /* | ||
| 1107 | * This shouldn't happen because we must have already | ||
| 1108 | * calculated the correct metadata allocation required. The | ||
| 1109 | * internal tree allocation code should know how to increase | ||
| 1110 | * transaction credits itself. | ||
| 1111 | * | ||
| 1112 | * If need be, we could handle -EAGAIN for a | ||
| 1113 | * RESTART_TRANS here. | ||
| 1114 | */ | ||
| 1115 | mlog_bug_on_msg(ret == -EAGAIN, | ||
| 1116 | "Inode %llu: EAGAIN return during allocation.\n", | ||
| 1117 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
| 1118 | if (ret < 0) { | ||
| 1119 | mlog_errno(ret); | ||
| 1120 | goto out; | ||
| 1121 | } | ||
| 1122 | } | ||
| 1123 | |||
| 1124 | ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, | ||
| 1125 | NULL); | ||
| 1126 | if (ret < 0) { | ||
| 1127 | |||
| 1128 | /* | ||
| 1129 | * XXX: Should we go readonly here? | ||
| 1130 | */ | ||
| 1131 | |||
| 1132 | mlog_errno(ret); | ||
| 1133 | goto out; | ||
| 1134 | } | ||
| 1135 | |||
| 1136 | BUG_ON(p_blkno == 0); | ||
| 1137 | |||
| 1138 | for(i = 0; i < numpages; i++) { | ||
| 1139 | ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], | ||
| 1140 | wc, new); | ||
| 1141 | if (ret < 0) { | ||
| 1142 | mlog_errno(ret); | ||
| 1143 | goto out; | ||
| 1144 | } | ||
| 1145 | |||
| 1146 | copied += ret; | ||
| 1147 | } | ||
| 1148 | |||
| 1149 | out: | ||
| 1150 | for(i = 0; i < numpages; i++) { | ||
| 1151 | unlock_page(cpages[i]); | ||
| 1152 | mark_page_accessed(cpages[i]); | ||
| 1153 | page_cache_release(cpages[i]); | ||
| 1154 | } | ||
| 1155 | kfree(cpages); | ||
| 1156 | |||
| 1157 | return copied ? copied : ret; | ||
| 1158 | } | ||
| 1159 | |||
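ocfs2_write() above grabs a single page for a non-allocating write and a whole cluster's worth otherwise. ocfs2_pages_per_cluster() isn't shown in this hunk; presumably it reduces to the shift arithmetic below (a sketch, not the series' actual definition):

```c
/*
 * Assumed shape of the page-count math: a cluster larger than a page
 * spans 1 << (clustersize_bits - PAGE_CACHE_SHIFT) pages; otherwise
 * one page covers the whole cluster (the w_large_pages case).
 */
static unsigned int pages_per_cluster(struct super_block *sb)
{
	unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;

	if (cbits > PAGE_CACHE_SHIFT)
		return 1U << (cbits - PAGE_CACHE_SHIFT);
	return 1;
}
```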
| 1160 | static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, | ||
| 1161 | struct ocfs2_super *osb, loff_t pos, | ||
| 1162 | size_t count, ocfs2_page_writer *cb, | ||
| 1163 | void *cb_priv) | ||
| 1164 | { | ||
| 1165 | wc->w_count = count; | ||
| 1166 | wc->w_pos = pos; | ||
| 1167 | wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits; | ||
| 1168 | wc->w_finished_copy = 0; | ||
| 1169 | |||
| 1170 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) | ||
| 1171 | wc->w_large_pages = 1; | ||
| 1172 | else | ||
| 1173 | wc->w_large_pages = 0; | ||
| 1174 | |||
| 1175 | wc->w_write_data_page = cb; | ||
| 1176 | wc->w_private = cb_priv; | ||
| 1177 | } | ||
| 1178 | |||
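A quick worked example of the init above, assuming 4 KB pages (PAGE_CACHE_SHIFT == 12) and 8 KB clusters (s_clustersize_bits == 13): a write at byte 20000 lands in cluster 20000 >> 13 == 2, and since the page shift is not greater than the cluster shift, w_large_pages stays 0.

```c
/* Worked example (hypothetical values): 4 KB pages, 8 KB clusters. */
static void example_write_ctxt_math(void)
{
	u32 cpos = 20000 >> 13;		/* == 2: third cluster of the file */
	int large_pages = (12 > 13);	/* == 0: a page never spans clusters */

	(void)cpos;
	(void)large_pages;
}
```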
| 1179 | /* | ||
| 1180 | * Write a cluster to an inode. The cluster may not be allocated yet, | ||
| 1181 | * in which case it will be. This only exists for buffered writes - | ||
| 1182 | * O_DIRECT takes a more "traditional" path through the kernel. | ||
| 1183 | * | ||
| 1184 | * The caller is responsible for incrementing pos, written counts, etc | ||
| 1185 | * | ||
| 1186 | * For file systems that don't support sparse files, pre-allocation | ||
| 1187 | * and page zeroing up until cpos should be done prior to this | ||
| 1188 | * function call. | ||
| 1189 | * | ||
| 1190 | * Callers should be holding i_sem, and the rw cluster lock. | ||
| 1191 | * | ||
| 1192 | * Returns the number of user bytes written, or less than zero for | ||
| 1193 | * error. | ||
| 1194 | */ | ||
| 1195 | ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, | ||
| 1196 | size_t count, ocfs2_page_writer *actor, | ||
| 1197 | void *priv) | ||
| 1198 | { | ||
| 1199 | int ret, credits = OCFS2_INODE_UPDATE_CREDITS; | ||
| 1200 | ssize_t written = 0; | ||
| 1201 | u32 phys; | ||
| 1202 | struct inode *inode = file->f_mapping->host; | ||
| 1203 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 1204 | struct buffer_head *di_bh = NULL; | ||
| 1205 | struct ocfs2_dinode *di; | ||
| 1206 | struct ocfs2_alloc_context *data_ac = NULL; | ||
| 1207 | struct ocfs2_alloc_context *meta_ac = NULL; | ||
| 1208 | handle_t *handle; | ||
| 1209 | struct ocfs2_write_ctxt wc; | ||
| 1210 | |||
| 1211 | ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv); | ||
| 1212 | |||
| 1213 | ret = ocfs2_meta_lock(inode, &di_bh, 1); | ||
| 1214 | if (ret) { | ||
| 1215 | mlog_errno(ret); | ||
| 1216 | goto out; | ||
| 1217 | } | ||
| 1218 | di = (struct ocfs2_dinode *)di_bh->b_data; | ||
| 1219 | |||
| 1220 | /* | ||
| 1221 | * Take alloc sem here to prevent concurrent lookups. That way | ||
| 1222 | * the mapping, zeroing and tree manipulation within | ||
| 1223 | * ocfs2_write() will be safe against ->readpage(). This | ||
| 1224 | * should also serve to lock out allocation from a shared | ||
| 1225 | * writeable region. | ||
| 1226 | */ | ||
| 1227 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
| 1228 | |||
| 1229 | ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); | ||
| 1230 | if (ret) { | ||
| 1231 | mlog_errno(ret); | ||
| 1232 | goto out_meta; | ||
| 1233 | } | ||
| 1234 | |||
| 1235 | /* phys == 0 means that allocation is required. */ | ||
| 1236 | if (phys == 0) { | ||
| 1237 | ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); | ||
| 1238 | if (ret) { | ||
| 1239 | mlog_errno(ret); | ||
| 1240 | goto out_meta; | ||
| 1241 | } | ||
| 1242 | |||
| 1243 | credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); | ||
| 1244 | } | ||
| 1245 | |||
| 1246 | ret = ocfs2_data_lock(inode, 1); | ||
| 1247 | if (ret) { | ||
| 1248 | mlog_errno(ret); | ||
| 1249 | goto out_meta; | ||
| 1250 | } | ||
| 1251 | |||
| 1252 | handle = ocfs2_start_trans(osb, credits); | ||
| 1253 | if (IS_ERR(handle)) { | ||
| 1254 | ret = PTR_ERR(handle); | ||
| 1255 | mlog_errno(ret); | ||
| 1256 | goto out_data; | ||
| 1257 | } | ||
| 1258 | |||
| 1259 | written = ocfs2_write(file, phys, handle, di_bh, data_ac, | ||
| 1260 | meta_ac, &wc); | ||
| 1261 | if (written < 0) { | ||
| 1262 | ret = written; | ||
| 1263 | mlog_errno(ret); | ||
| 1264 | goto out_commit; | ||
| 1265 | } | ||
| 1266 | |||
| 1267 | ret = ocfs2_journal_access(handle, inode, di_bh, | ||
| 1268 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
| 1269 | if (ret) { | ||
| 1270 | mlog_errno(ret); | ||
| 1271 | goto out_commit; | ||
| 1272 | } | ||
| 1273 | |||
| 1274 | pos += written; | ||
| 1275 | if (pos > inode->i_size) { | ||
| 1276 | i_size_write(inode, pos); | ||
| 1277 | mark_inode_dirty(inode); | ||
| 1278 | } | ||
| 1279 | inode->i_blocks = ocfs2_inode_sector_count(inode); | ||
| 1280 | di->i_size = cpu_to_le64((u64)i_size_read(inode)); | ||
| 1281 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
| 1282 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); | ||
| 1283 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | ||
| 1284 | |||
| 1285 | ret = ocfs2_journal_dirty(handle, di_bh); | ||
| 1286 | if (ret) | ||
| 1287 | mlog_errno(ret); | ||
| 1288 | |||
| 1289 | out_commit: | ||
| 1290 | ocfs2_commit_trans(osb, handle); | ||
| 1291 | |||
| 1292 | out_data: | ||
| 1293 | ocfs2_data_unlock(inode, 1); | ||
| 1294 | |||
| 1295 | out_meta: | ||
| 1296 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
| 1297 | ocfs2_meta_unlock(inode, 1); | ||
| 1298 | |||
| 1299 | out: | ||
| 1300 | brelse(di_bh); | ||
| 1301 | if (data_ac) | ||
| 1302 | ocfs2_free_alloc_context(data_ac); | ||
| 1303 | if (meta_ac) | ||
| 1304 | ocfs2_free_alloc_context(meta_ac); | ||
| 1305 | |||
| 1306 | return written ? written : ret; | ||
| 1307 | } | ||
| 1308 | |||
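Per the comment above, the caller owns the pos/written bookkeeping and each call writes at most one cluster. The actual loop lives in ocfs2_file_aio_write() per this series; a hypothetical caller might look like:

```c
/*
 * Hypothetical caller loop: ocfs2_buffered_write_cluster() handles at
 * most one cluster per call, so keep calling and advancing until the
 * request is drained. (Illustration only; the real loop is in file.c.)
 */
static ssize_t write_all_clusters(struct file *file, loff_t pos,
				  size_t count, ocfs2_page_writer *actor,
				  void *priv)
{
	ssize_t written, total = 0;

	while (count) {
		written = ocfs2_buffered_write_cluster(file, pos, count,
						       actor, priv);
		if (written <= 0)
			return total ? total : written;

		pos += written;
		count -= written;
		total += written;
	}
	return total;
}
```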
| 678 | const struct address_space_operations ocfs2_aops = { | 1309 | const struct address_space_operations ocfs2_aops = { |
| 679 | .readpage = ocfs2_readpage, | 1310 | .readpage = ocfs2_readpage, |
| 680 | .writepage = ocfs2_writepage, | 1311 | .writepage = ocfs2_writepage, |
| 681 | .prepare_write = ocfs2_prepare_write, | ||
| 682 | .commit_write = ocfs2_commit_write, | ||
| 683 | .bmap = ocfs2_bmap, | 1312 | .bmap = ocfs2_bmap, |
| 684 | .sync_page = block_sync_page, | 1313 | .sync_page = block_sync_page, |
| 685 | .direct_IO = ocfs2_direct_IO, | 1314 | .direct_IO = ocfs2_direct_IO, |
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index f446a15eab88..45821d479b5a 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h | |||
| @@ -30,12 +30,83 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, | |||
| 30 | unsigned from, | 30 | unsigned from, |
| 31 | unsigned to); | 31 | unsigned to); |
| 32 | 32 | ||
| 33 | int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, | ||
| 34 | struct inode *inode, unsigned int from, | ||
| 35 | unsigned int to, int new); | ||
| 36 | |||
| 37 | int walk_page_buffers( handle_t *handle, | ||
| 38 | struct buffer_head *head, | ||
| 39 | unsigned from, | ||
| 40 | unsigned to, | ||
| 41 | int *partial, | ||
| 42 | int (*fn)( handle_t *handle, | ||
| 43 | struct buffer_head *bh)); | ||
| 44 | |||
| 45 | struct ocfs2_write_ctxt; | ||
| 46 | typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *, | ||
| 47 | u64 *, unsigned int *, unsigned int *); | ||
| 48 | |||
| 49 | ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, | ||
| 50 | size_t count, ocfs2_page_writer *actor, | ||
| 51 | void *priv); | ||
| 52 | |||
| 53 | struct ocfs2_write_ctxt { | ||
| 54 | size_t w_count; | ||
| 55 | loff_t w_pos; | ||
| 56 | u32 w_cpos; | ||
| 57 | unsigned int w_finished_copy; | ||
| 58 | |||
| 59 | /* This is true if page_size > cluster_size */ | ||
| 60 | unsigned int w_large_pages; | ||
| 61 | |||
| 62 | /* Filler callback and private data */ | ||
| 63 | ocfs2_page_writer *w_write_data_page; | ||
| 64 | void *w_private; | ||
| 65 | |||
| 66 | /* Only valid for the filler callback */ | ||
| 67 | struct page *w_this_page; | ||
| 68 | unsigned int w_this_page_new; | ||
| 69 | }; | ||
| 70 | |||
| 71 | struct ocfs2_buffered_write_priv { | ||
| 72 | char *b_src_buf; | ||
| 73 | const struct iovec *b_cur_iov; /* Current iovec */ | ||
| 74 | size_t b_cur_off; /* Offset in the | ||
| 75 | * current iovec */ | ||
| 76 | }; | ||
| 77 | int ocfs2_map_and_write_user_data(struct inode *inode, | ||
| 78 | struct ocfs2_write_ctxt *wc, | ||
| 79 | u64 *p_blkno, | ||
| 80 | unsigned int *ret_from, | ||
| 81 | unsigned int *ret_to); | ||
| 82 | |||
| 83 | struct ocfs2_splice_write_priv { | ||
| 84 | struct splice_desc *s_sd; | ||
| 85 | struct pipe_buffer *s_buf; | ||
| 86 | struct pipe_inode_info *s_pipe; | ||
| 87 | /* Neither offset value is ever larger than one page */ | ||
| 88 | unsigned int s_offset; | ||
| 89 | unsigned int s_buf_offset; | ||
| 90 | }; | ||
| 91 | int ocfs2_map_and_write_splice_data(struct inode *inode, | ||
| 92 | struct ocfs2_write_ctxt *wc, | ||
| 93 | u64 *p_blkno, | ||
| 94 | unsigned int *ret_from, | ||
| 95 | unsigned int *ret_to); | ||
| 96 | |||
| 33 | /* all ocfs2_dio_end_io()'s fault */ | 97 | /* all ocfs2_dio_end_io()'s fault */ |
| 34 | #define ocfs2_iocb_is_rw_locked(iocb) \ | 98 | #define ocfs2_iocb_is_rw_locked(iocb) \ |
| 35 | test_bit(0, (unsigned long *)&iocb->private) | 99 | test_bit(0, (unsigned long *)&iocb->private) |
| 36 | #define ocfs2_iocb_set_rw_locked(iocb) \ | 100 | static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) |
| 37 | set_bit(0, (unsigned long *)&iocb->private) | 101 | { |
| 102 | set_bit(0, (unsigned long *)&iocb->private); | ||
| 103 | if (level) | ||
| 104 | set_bit(1, (unsigned long *)&iocb->private); | ||
| 105 | else | ||
| 106 | clear_bit(1, (unsigned long *)&iocb->private); | ||
| 107 | } | ||
| 38 | #define ocfs2_iocb_clear_rw_locked(iocb) \ | 108 | #define ocfs2_iocb_clear_rw_locked(iocb) \ |
| 39 | clear_bit(0, (unsigned long *)&iocb->private) | 109 | clear_bit(0, (unsigned long *)&iocb->private) |
| 40 | 110 | #define ocfs2_iocb_rw_locked_level(iocb) \ | |
| 111 | test_bit(1, (unsigned long *)&iocb->private) | ||
| 41 | #endif /* OCFS2_FILE_H */ | 112 | #endif /* OCFS2_FILE_H */ |
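The helpers above pack two flags into the low bits of iocb->private: bit 0 records that the rw cluster lock is held, and the new bit 1 remembers the level it was taken at (1 for EX, 0 for PR) so the direct I/O completion path can drop it correctly. A sketch of how a consumer might use the bits (the demo function name is hypothetical):

```c
/*
 * Hypothetical consumer of the bit packing above: bit 0 = rw lock
 * held, bit 1 = level it was taken at (1 == EX/write, 0 == PR/read).
 */
static void demo_iocb_rw_bits(struct kiocb *iocb)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;

	ocfs2_iocb_set_rw_locked(iocb, 1);	/* locked at EX */

	if (ocfs2_iocb_is_rw_locked(iocb)) {
		int level = ocfs2_iocb_rw_locked_level(iocb);

		ocfs2_rw_unlock(inode, level);	/* drop at the recorded level */
		ocfs2_iocb_clear_rw_locked(iocb);
	}
}
```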
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 4705d659fe57..bbacf7da48a4 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c | |||
| @@ -46,6 +46,7 @@ | |||
| 46 | #include <linux/kernel.h> | 46 | #include <linux/kernel.h> |
| 47 | #include <linux/slab.h> | 47 | #include <linux/slab.h> |
| 48 | #include <linux/workqueue.h> | 48 | #include <linux/workqueue.h> |
| 49 | #include <linux/reboot.h> | ||
| 49 | 50 | ||
| 50 | #include "heartbeat.h" | 51 | #include "heartbeat.h" |
| 51 | #include "nodemanager.h" | 52 | #include "nodemanager.h" |
| @@ -72,7 +73,9 @@ static void o2quo_fence_self(void) | |||
| 72 | /* panic spins with interrupts enabled. with preempt | 73 | /* panic spins with interrupts enabled. with preempt |
| 73 | * threads can still schedule, etc, etc */ | 74 | * threads can still schedule, etc, etc */ |
| 74 | o2hb_stop_all_regions(); | 75 | o2hb_stop_all_regions(); |
| 75 | panic("ocfs2 is very sorry to be fencing this system by panicing\n"); | 76 | |
| 77 | printk("ocfs2 is very sorry to be fencing this system by restarting\n"); | ||
| 78 | emergency_restart(); | ||
| 76 | } | 79 | } |
| 77 | 80 | ||
| 78 | /* Indicate that a timeout occurred on a heartbeat region write. The | 81 | /* Indicate that a timeout occurred on a heartbeat region write. The |
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 4dae5df5e467..9606111fe89d 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h | |||
| @@ -38,6 +38,9 @@ | |||
| 38 | * locking semantics of the file system using the protocol. It should | 38 | * locking semantics of the file system using the protocol. It should |
| 39 | * be somewhere else, I'm sure, but right now it isn't. | 39 | * be somewhere else, I'm sure, but right now it isn't. |
| 40 | * | 40 | * |
| 41 | * New in version 8: | ||
| 42 | * - Replace delete inode votes with a cluster lock | ||
| 43 | * | ||
| 41 | * New in version 7: | 44 | * New in version 7: |
| 42 | * - DLM join domain includes the live nodemap | 45 | * - DLM join domain includes the live nodemap |
| 43 | * | 46 | * |
| @@ -57,7 +60,7 @@ | |||
| 57 | * - full 64 bit i_size in the metadata lock lvbs | 60 | * - full 64 bit i_size in the metadata lock lvbs |
| 58 | * - introduction of "rw" lock and pushing meta/data locking down | 61 | * - introduction of "rw" lock and pushing meta/data locking down |
| 59 | */ | 62 | */ |
| 60 | #define O2NET_PROTOCOL_VERSION 7ULL | 63 | #define O2NET_PROTOCOL_VERSION 8ULL |
| 61 | struct o2net_handshake { | 64 | struct o2net_handshake { |
| 62 | __be64 protocol_version; | 65 | __be64 protocol_version; |
| 63 | __be64 connector_id; | 66 | __be64 connector_id; |
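Bumping O2NET_PROTOCOL_VERSION to 8 keeps nodes that still use delete-inode votes from joining nodes that use the new open lock. The handshake field travels big-endian, so a mismatch check converts before comparing; a sketch (the real check lives in o2net's connect path):

```c
/*
 * Illustrative version check, assuming a received handshake. The
 * field is __be64 on the wire, hence the be64_to_cpu() conversion.
 */
static int check_protocol_version(struct o2net_handshake *hand)
{
	if (be64_to_cpu(hand->protocol_version) != O2NET_PROTOCOL_VERSION)
		return -EPROTO;	/* e.g. a v7 peer still doing inode votes */
	return 0;
}
```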
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 66821e178167..67e6866a2a4f 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c | |||
| @@ -358,15 +358,17 @@ int ocfs2_do_extend_dir(struct super_block *sb, | |||
| 358 | { | 358 | { |
| 359 | int status; | 359 | int status; |
| 360 | int extend; | 360 | int extend; |
| 361 | u64 p_blkno; | 361 | u64 p_blkno, v_blkno; |
| 362 | 362 | ||
| 363 | spin_lock(&OCFS2_I(dir)->ip_lock); | 363 | spin_lock(&OCFS2_I(dir)->ip_lock); |
| 364 | extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)); | 364 | extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)); |
| 365 | spin_unlock(&OCFS2_I(dir)->ip_lock); | 365 | spin_unlock(&OCFS2_I(dir)->ip_lock); |
| 366 | 366 | ||
| 367 | if (extend) { | 367 | if (extend) { |
| 368 | status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1, | 368 | u32 offset = OCFS2_I(dir)->ip_clusters; |
| 369 | parent_fe_bh, handle, | 369 | |
| 370 | status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset, | ||
| 371 | 1, parent_fe_bh, handle, | ||
| 370 | data_ac, meta_ac, NULL); | 372 | data_ac, meta_ac, NULL); |
| 371 | BUG_ON(status == -EAGAIN); | 373 | BUG_ON(status == -EAGAIN); |
| 372 | if (status < 0) { | 374 | if (status < 0) { |
| @@ -375,9 +377,8 @@ int ocfs2_do_extend_dir(struct super_block *sb, | |||
| 375 | } | 377 | } |
| 376 | } | 378 | } |
| 377 | 379 | ||
| 378 | status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >> | 380 | v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir)); |
| 379 | (sb->s_blocksize_bits - 9)), | 381 | status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL); |
| 380 | 1, &p_blkno, NULL); | ||
| 381 | if (status < 0) { | 382 | if (status < 0) { |
| 382 | mlog_errno(status); | 383 | mlog_errno(status); |
| 383 | goto bail; | 384 | goto bail; |
| @@ -486,7 +487,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, | |||
| 486 | 487 | ||
| 487 | dir_i_size += dir->i_sb->s_blocksize; | 488 | dir_i_size += dir->i_sb->s_blocksize; |
| 488 | i_size_write(dir, dir_i_size); | 489 | i_size_write(dir, dir_i_size); |
| 489 | dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size); | 490 | dir->i_blocks = ocfs2_inode_sector_count(dir); |
| 490 | status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); | 491 | status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); |
| 491 | if (status < 0) { | 492 | if (status < 0) { |
| 492 | mlog_errno(status); | 493 | mlog_errno(status); |
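The i_blocks change above matters once files can contain holes: block usage must follow allocated clusters, not the logical size. ocfs2_inode_sector_count() isn't shown in this hunk; presumably it shifts ip_clusters into 512-byte sectors, roughly:

```c
/*
 * Assumed shape of ocfs2_inode_sector_count(): count 512-byte sectors
 * of allocated clusters, so a sparse file reports fewer blocks than
 * rounding i_size up would.
 */
static inline unsigned int inode_sector_count(struct inode *inode)
{
	int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;

	return OCFS2_I(inode)->ip_clusters << c_to_s_bits;
}
```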
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index c558442a0b44..d836b98dd99a 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c | |||
| @@ -430,11 +430,10 @@ redo_bucket: | |||
| 430 | 430 | ||
| 431 | dlm_lockres_put(res); | 431 | dlm_lockres_put(res); |
| 432 | 432 | ||
| 433 | cond_resched_lock(&dlm->spinlock); | ||
| 434 | |||
| 435 | if (dropped) | 433 | if (dropped) |
| 436 | goto redo_bucket; | 434 | goto redo_bucket; |
| 437 | } | 435 | } |
| 436 | cond_resched_lock(&dlm->spinlock); | ||
| 438 | num += n; | 437 | num += n; |
| 439 | mlog(0, "%s: touched %d lockreses in bucket %d " | 438 | mlog(0, "%s: touched %d lockreses in bucket %d " |
| 440 | "(tot=%d)\n", dlm->name, n, i, num); | 439 | "(tot=%d)\n", dlm->name, n, i, num); |
| @@ -1035,7 +1034,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) | |||
| 1035 | { | 1034 | { |
| 1036 | int status = 0, tmpstat, node; | 1035 | int status = 0, tmpstat, node; |
| 1037 | struct domain_join_ctxt *ctxt; | 1036 | struct domain_join_ctxt *ctxt; |
| 1038 | enum dlm_query_join_response response; | 1037 | enum dlm_query_join_response response = JOIN_DISALLOW; |
| 1039 | 1038 | ||
| 1040 | mlog_entry("%p", dlm); | 1039 | mlog_entry("%p", dlm); |
| 1041 | 1040 | ||
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 6d4a83d50152..c1807a42c49f 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c | |||
| @@ -611,6 +611,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
| 611 | } | 611 | } |
| 612 | } while (status != 0); | 612 | } while (status != 0); |
| 613 | 613 | ||
| 614 | spin_lock(&dlm_reco_state_lock); | ||
| 614 | switch (ndata->state) { | 615 | switch (ndata->state) { |
| 615 | case DLM_RECO_NODE_DATA_INIT: | 616 | case DLM_RECO_NODE_DATA_INIT: |
| 616 | case DLM_RECO_NODE_DATA_FINALIZE_SENT: | 617 | case DLM_RECO_NODE_DATA_FINALIZE_SENT: |
| @@ -641,6 +642,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
| 641 | ndata->node_num, dead_node); | 642 | ndata->node_num, dead_node); |
| 642 | break; | 643 | break; |
| 643 | } | 644 | } |
| 645 | spin_unlock(&dlm_reco_state_lock); | ||
| 644 | } | 646 | } |
| 645 | 647 | ||
| 646 | mlog(0, "done requesting all lock info\n"); | 648 | mlog(0, "done requesting all lock info\n"); |
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index e335541727f9..27e43b0c0eae 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
| @@ -225,11 +225,17 @@ static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { | |||
| 225 | .flags = 0, | 225 | .flags = 0, |
| 226 | }; | 226 | }; |
| 227 | 227 | ||
| 228 | static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = { | ||
| 229 | .get_osb = ocfs2_get_inode_osb, | ||
| 230 | .flags = 0, | ||
| 231 | }; | ||
| 232 | |||
| 228 | static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) | 233 | static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) |
| 229 | { | 234 | { |
| 230 | return lockres->l_type == OCFS2_LOCK_TYPE_META || | 235 | return lockres->l_type == OCFS2_LOCK_TYPE_META || |
| 231 | lockres->l_type == OCFS2_LOCK_TYPE_DATA || | 236 | lockres->l_type == OCFS2_LOCK_TYPE_DATA || |
| 232 | lockres->l_type == OCFS2_LOCK_TYPE_RW; | 237 | lockres->l_type == OCFS2_LOCK_TYPE_RW || |
| 238 | lockres->l_type == OCFS2_LOCK_TYPE_OPEN; | ||
| 233 | } | 239 | } |
| 234 | 240 | ||
| 235 | static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) | 241 | static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) |
| @@ -373,6 +379,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, | |||
| 373 | case OCFS2_LOCK_TYPE_DATA: | 379 | case OCFS2_LOCK_TYPE_DATA: |
| 374 | ops = &ocfs2_inode_data_lops; | 380 | ops = &ocfs2_inode_data_lops; |
| 375 | break; | 381 | break; |
| 382 | case OCFS2_LOCK_TYPE_OPEN: | ||
| 383 | ops = &ocfs2_inode_open_lops; | ||
| 384 | break; | ||
| 376 | default: | 385 | default: |
| 377 | mlog_bug_on_msg(1, "type: %d\n", type); | 386 | mlog_bug_on_msg(1, "type: %d\n", type); |
| 378 | ops = NULL; /* thanks, gcc */ | 387 | ops = NULL; /* thanks, gcc */ |
| @@ -1129,6 +1138,12 @@ int ocfs2_create_new_inode_locks(struct inode *inode) | |||
| 1129 | goto bail; | 1138 | goto bail; |
| 1130 | } | 1139 | } |
| 1131 | 1140 | ||
| 1141 | ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0); | ||
| 1142 | if (ret) { | ||
| 1143 | mlog_errno(ret); | ||
| 1144 | goto bail; | ||
| 1145 | } | ||
| 1146 | |||
| 1132 | bail: | 1147 | bail: |
| 1133 | mlog_exit(ret); | 1148 | mlog_exit(ret); |
| 1134 | return ret; | 1149 | return ret; |
| @@ -1182,6 +1197,99 @@ void ocfs2_rw_unlock(struct inode *inode, int write) | |||
| 1182 | mlog_exit_void(); | 1197 | mlog_exit_void(); |
| 1183 | } | 1198 | } |
| 1184 | 1199 | ||
| 1200 | /* | ||
| 1201 | * ocfs2_open_lock always gets a PR mode lock. | ||
| 1202 | */ | ||
| 1203 | int ocfs2_open_lock(struct inode *inode) | ||
| 1204 | { | ||
| 1205 | int status = 0; | ||
| 1206 | struct ocfs2_lock_res *lockres; | ||
| 1207 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 1208 | |||
| 1209 | BUG_ON(!inode); | ||
| 1210 | |||
| 1211 | mlog_entry_void(); | ||
| 1212 | |||
| 1213 | mlog(0, "inode %llu take PRMODE open lock\n", | ||
| 1214 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
| 1215 | |||
| 1216 | if (ocfs2_mount_local(osb)) | ||
| 1217 | goto out; | ||
| 1218 | |||
| 1219 | lockres = &OCFS2_I(inode)->ip_open_lockres; | ||
| 1220 | |||
| 1221 | status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, | ||
| 1222 | LKM_PRMODE, 0, 0); | ||
| 1223 | if (status < 0) | ||
| 1224 | mlog_errno(status); | ||
| 1225 | |||
| 1226 | out: | ||
| 1227 | mlog_exit(status); | ||
| 1228 | return status; | ||
| 1229 | } | ||
| 1230 | |||
| 1231 | int ocfs2_try_open_lock(struct inode *inode, int write) | ||
| 1232 | { | ||
| 1233 | int status = 0, level; | ||
| 1234 | struct ocfs2_lock_res *lockres; | ||
| 1235 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 1236 | |||
| 1237 | BUG_ON(!inode); | ||
| 1238 | |||
| 1239 | mlog_entry_void(); | ||
| 1240 | |||
| 1241 | mlog(0, "inode %llu try to take %s open lock\n", | ||
| 1242 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
| 1243 | write ? "EXMODE" : "PRMODE"); | ||
| 1244 | |||
| 1245 | if (ocfs2_mount_local(osb)) | ||
| 1246 | goto out; | ||
| 1247 | |||
| 1248 | lockres = &OCFS2_I(inode)->ip_open_lockres; | ||
| 1249 | |||
| 1250 | level = write ? LKM_EXMODE : LKM_PRMODE; | ||
| 1251 | |||
| 1252 | /* | ||
| 1253 | * The file system may already be holding a PRMODE/EXMODE open lock. | ||
| 1254 | * Since we pass LKM_NOQUEUE, the request won't block waiting on | ||
| 1255 | * other nodes and the -EAGAIN will indicate to the caller that | ||
| 1256 | * this inode is still in use. | ||
| 1257 | */ | ||
| 1258 | status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, | ||
| 1259 | level, LKM_NOQUEUE, 0); | ||
| 1260 | |||
| 1261 | out: | ||
| 1262 | mlog_exit(status); | ||
| 1263 | return status; | ||
| 1264 | } | ||
| 1265 | |||
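Because the request above carries LKM_NOQUEUE, a contended lock fails immediately with -EAGAIN instead of queueing behind other nodes, which is exactly the "is this inode still open somewhere?" answer deletion wants. A hypothetical caller:

```c
/*
 * Hypothetical deletion-path check built on the trylock semantics
 * above: -EAGAIN means some node still holds an open lock, so the
 * inode's disk space cannot be reclaimed yet.
 */
static int inode_safe_to_delete(struct inode *inode)
{
	int status = ocfs2_try_open_lock(inode, 1);	/* try EXMODE */

	if (status == -EAGAIN)
		return 0;	/* still open somewhere; not deletable */
	if (status < 0)
		return status;	/* a real error */
	return 1;		/* EX granted: no other opens exist */
}
```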
| 1266 | /* | ||
| 1267 | * ocfs2_open_unlock unlocks PR and EX mode open locks. | ||
| 1268 | */ | ||
| 1269 | void ocfs2_open_unlock(struct inode *inode) | ||
| 1270 | { | ||
| 1271 | struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres; | ||
| 1272 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 1273 | |||
| 1274 | mlog_entry_void(); | ||
| 1275 | |||
| 1276 | mlog(0, "inode %llu drop open lock\n", | ||
| 1277 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
| 1278 | |||
| 1279 | if (ocfs2_mount_local(osb)) | ||
| 1280 | goto out; | ||
| 1281 | |||
| 1282 | if (lockres->l_ro_holders) | ||
| 1283 | ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, | ||
| 1284 | LKM_PRMODE); | ||
| 1285 | if (lockres->l_ex_holders) | ||
| 1286 | ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, | ||
| 1287 | LKM_EXMODE); | ||
| 1288 | |||
| 1289 | out: | ||
| 1290 | mlog_exit_void(); | ||
| 1291 | } | ||
| 1292 | |||
| 1185 | int ocfs2_data_lock_full(struct inode *inode, | 1293 | int ocfs2_data_lock_full(struct inode *inode, |
| 1186 | int write, | 1294 | int write, |
| 1187 | int arg_flags) | 1295 | int arg_flags) |
| @@ -1387,8 +1495,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) | |||
| 1387 | if (S_ISLNK(inode->i_mode) && !oi->ip_clusters) | 1495 | if (S_ISLNK(inode->i_mode) && !oi->ip_clusters) |
| 1388 | inode->i_blocks = 0; | 1496 | inode->i_blocks = 0; |
| 1389 | else | 1497 | else |
| 1390 | inode->i_blocks = | 1498 | inode->i_blocks = ocfs2_inode_sector_count(inode); |
| 1391 | ocfs2_align_bytes_to_sectors(i_size_read(inode)); | ||
| 1392 | 1499 | ||
| 1393 | inode->i_uid = be32_to_cpu(lvb->lvb_iuid); | 1500 | inode->i_uid = be32_to_cpu(lvb->lvb_iuid); |
| 1394 | inode->i_gid = be32_to_cpu(lvb->lvb_igid); | 1501 | inode->i_gid = be32_to_cpu(lvb->lvb_igid); |
| @@ -1479,12 +1586,15 @@ static int ocfs2_meta_lock_update(struct inode *inode, | |||
| 1479 | { | 1586 | { |
| 1480 | int status = 0; | 1587 | int status = 0; |
| 1481 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 1588 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
| 1482 | struct ocfs2_lock_res *lockres = NULL; | 1589 | struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; |
| 1483 | struct ocfs2_dinode *fe; | 1590 | struct ocfs2_dinode *fe; |
| 1484 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1591 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 1485 | 1592 | ||
| 1486 | mlog_entry_void(); | 1593 | mlog_entry_void(); |
| 1487 | 1594 | ||
| 1595 | if (ocfs2_mount_local(osb)) | ||
| 1596 | goto bail; | ||
| 1597 | |||
| 1488 | spin_lock(&oi->ip_lock); | 1598 | spin_lock(&oi->ip_lock); |
| 1489 | if (oi->ip_flags & OCFS2_INODE_DELETED) { | 1599 | if (oi->ip_flags & OCFS2_INODE_DELETED) { |
| 1490 | mlog(0, "Orphaned inode %llu was deleted while we " | 1600 | mlog(0, "Orphaned inode %llu was deleted while we " |
| @@ -1496,22 +1606,16 @@ static int ocfs2_meta_lock_update(struct inode *inode, | |||
| 1496 | } | 1606 | } |
| 1497 | spin_unlock(&oi->ip_lock); | 1607 | spin_unlock(&oi->ip_lock); |
| 1498 | 1608 | ||
| 1499 | if (!ocfs2_mount_local(osb)) { | 1609 | if (!ocfs2_should_refresh_lock_res(lockres)) |
| 1500 | lockres = &oi->ip_meta_lockres; | 1610 | goto bail; |
| 1501 | |||
| 1502 | if (!ocfs2_should_refresh_lock_res(lockres)) | ||
| 1503 | goto bail; | ||
| 1504 | } | ||
| 1505 | 1611 | ||
| 1506 | /* This will discard any caching information we might have had | 1612 | /* This will discard any caching information we might have had |
| 1507 | * for the inode metadata. */ | 1613 | * for the inode metadata. */ |
| 1508 | ocfs2_metadata_cache_purge(inode); | 1614 | ocfs2_metadata_cache_purge(inode); |
| 1509 | 1615 | ||
| 1510 | /* will do nothing for inode types that don't use the extent | ||
| 1511 | * map (directories, bitmap files, etc) */ | ||
| 1512 | ocfs2_extent_map_trunc(inode, 0); | 1616 | ocfs2_extent_map_trunc(inode, 0); |
| 1513 | 1617 | ||
| 1514 | if (lockres && ocfs2_meta_lvb_is_trustable(inode, lockres)) { | 1618 | if (ocfs2_meta_lvb_is_trustable(inode, lockres)) { |
| 1515 | mlog(0, "Trusting LVB on inode %llu\n", | 1619 | mlog(0, "Trusting LVB on inode %llu\n", |
| 1516 | (unsigned long long)oi->ip_blkno); | 1620 | (unsigned long long)oi->ip_blkno); |
| 1517 | ocfs2_refresh_inode_from_lvb(inode); | 1621 | ocfs2_refresh_inode_from_lvb(inode); |
| @@ -1558,8 +1662,7 @@ static int ocfs2_meta_lock_update(struct inode *inode, | |||
| 1558 | 1662 | ||
| 1559 | status = 0; | 1663 | status = 0; |
| 1560 | bail_refresh: | 1664 | bail_refresh: |
| 1561 | if (lockres) | 1665 | ocfs2_complete_lock_res_refresh(lockres, status); |
| 1562 | ocfs2_complete_lock_res_refresh(lockres, status); | ||
| 1563 | bail: | 1666 | bail: |
| 1564 | mlog_exit(status); | 1667 | mlog_exit(status); |
| 1565 | return status; | 1668 | return status; |
| @@ -1630,7 +1733,6 @@ int ocfs2_meta_lock_full(struct inode *inode, | |||
| 1630 | wait_event(osb->recovery_event, | 1733 | wait_event(osb->recovery_event, |
| 1631 | ocfs2_node_map_is_empty(osb, &osb->recovery_map)); | 1734 | ocfs2_node_map_is_empty(osb, &osb->recovery_map)); |
| 1632 | 1735 | ||
| 1633 | acquired = 0; | ||
| 1634 | lockres = &OCFS2_I(inode)->ip_meta_lockres; | 1736 | lockres = &OCFS2_I(inode)->ip_meta_lockres; |
| 1635 | level = ex ? LKM_EXMODE : LKM_PRMODE; | 1737 | level = ex ? LKM_EXMODE : LKM_PRMODE; |
| 1636 | dlm_flags = 0; | 1738 | dlm_flags = 0; |
| @@ -2458,13 +2560,20 @@ int ocfs2_drop_inode_locks(struct inode *inode) | |||
| 2458 | * ocfs2_clear_inode has done it for us. */ | 2560 | * ocfs2_clear_inode has done it for us. */ |
| 2459 | 2561 | ||
| 2460 | err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), | 2562 | err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), |
| 2461 | &OCFS2_I(inode)->ip_data_lockres); | 2563 | &OCFS2_I(inode)->ip_open_lockres); |
| 2462 | if (err < 0) | 2564 | if (err < 0) |
| 2463 | mlog_errno(err); | 2565 | mlog_errno(err); |
| 2464 | 2566 | ||
| 2465 | status = err; | 2567 | status = err; |
| 2466 | 2568 | ||
| 2467 | err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), | 2569 | err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), |
| 2570 | &OCFS2_I(inode)->ip_data_lockres); | ||
| 2571 | if (err < 0) | ||
| 2572 | mlog_errno(err); | ||
| 2573 | if (err < 0 && !status) | ||
| 2574 | status = err; | ||
| 2575 | |||
| 2576 | err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), | ||
| 2468 | &OCFS2_I(inode)->ip_meta_lockres); | 2577 | &OCFS2_I(inode)->ip_meta_lockres); |
| 2469 | if (err < 0) | 2578 | if (err < 0) |
| 2470 | mlog_errno(err); | 2579 | mlog_errno(err); |
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index c343fca68cf1..59cb566e7983 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h | |||
| @@ -80,6 +80,9 @@ void ocfs2_data_unlock(struct inode *inode, | |||
| 80 | int write); | 80 | int write); |
| 81 | int ocfs2_rw_lock(struct inode *inode, int write); | 81 | int ocfs2_rw_lock(struct inode *inode, int write); |
| 82 | void ocfs2_rw_unlock(struct inode *inode, int write); | 82 | void ocfs2_rw_unlock(struct inode *inode, int write); |
| 83 | int ocfs2_open_lock(struct inode *inode); | ||
| 84 | int ocfs2_try_open_lock(struct inode *inode, int write); | ||
| 85 | void ocfs2_open_unlock(struct inode *inode); | ||
| 83 | int ocfs2_meta_lock_atime(struct inode *inode, | 86 | int ocfs2_meta_lock_atime(struct inode *inode, |
| 84 | struct vfsmount *vfsmnt, | 87 | struct vfsmount *vfsmnt, |
| 85 | int *level); | 88 | int *level); |
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 80ac69f11d9f..ba2b2ab1c6e4 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c | |||
| @@ -3,8 +3,7 @@ | |||
| 3 | * | 3 | * |
| 4 | * extent_map.c | 4 | * extent_map.c |
| 5 | * | 5 | * |
| 6 | * In-memory extent map for OCFS2. Man, this code was prettier in | 6 | * Block/Cluster mapping functions |
| 7 | * the library. | ||
| 8 | * | 7 | * |
| 9 | * Copyright (C) 2004 Oracle. All rights reserved. | 8 | * Copyright (C) 2004 Oracle. All rights reserved. |
| 10 | * | 9 | * |
| @@ -26,1016 +25,528 @@ | |||
| 26 | #include <linux/fs.h> | 25 | #include <linux/fs.h> |
| 27 | #include <linux/init.h> | 26 | #include <linux/init.h> |
| 28 | #include <linux/types.h> | 27 | #include <linux/types.h> |
| 29 | #include <linux/slab.h> | ||
| 30 | #include <linux/rbtree.h> | ||
| 31 | 28 | ||
| 32 | #define MLOG_MASK_PREFIX ML_EXTENT_MAP | 29 | #define MLOG_MASK_PREFIX ML_EXTENT_MAP |
| 33 | #include <cluster/masklog.h> | 30 | #include <cluster/masklog.h> |
| 34 | 31 | ||
| 35 | #include "ocfs2.h" | 32 | #include "ocfs2.h" |
| 36 | 33 | ||
| 34 | #include "alloc.h" | ||
| 37 | #include "extent_map.h" | 35 | #include "extent_map.h" |
| 38 | #include "inode.h" | 36 | #include "inode.h" |
| 39 | #include "super.h" | 37 | #include "super.h" |
| 40 | 38 | ||
| 41 | #include "buffer_head_io.h" | 39 | #include "buffer_head_io.h" |
| 42 | 40 | ||
| 43 | |||
| 44 | /* | 41 | /* |
| 45 | * SUCK SUCK SUCK | 42 | * The extent caching implementation is intentionally trivial. |
| 46 | * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h | ||
| 47 | */ | ||
| 48 | |||
| 49 | struct ocfs2_extent_map_entry { | ||
| 50 | struct rb_node e_node; | ||
| 51 | int e_tree_depth; | ||
| 52 | struct ocfs2_extent_rec e_rec; | ||
| 53 | }; | ||
| 54 | |||
| 55 | struct ocfs2_em_insert_context { | ||
| 56 | int need_left; | ||
| 57 | int need_right; | ||
| 58 | struct ocfs2_extent_map_entry *new_ent; | ||
| 59 | struct ocfs2_extent_map_entry *old_ent; | ||
| 60 | struct ocfs2_extent_map_entry *left_ent; | ||
| 61 | struct ocfs2_extent_map_entry *right_ent; | ||
| 62 | }; | ||
| 63 | |||
| 64 | static struct kmem_cache *ocfs2_em_ent_cachep = NULL; | ||
| 65 | |||
| 66 | |||
| 67 | static struct ocfs2_extent_map_entry * | ||
| 68 | ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, | ||
| 69 | u32 cpos, u32 clusters, | ||
| 70 | struct rb_node ***ret_p, | ||
| 71 | struct rb_node **ret_parent); | ||
| 72 | static int ocfs2_extent_map_insert(struct inode *inode, | ||
| 73 | struct ocfs2_extent_rec *rec, | ||
| 74 | int tree_depth); | ||
| 75 | static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, | ||
| 76 | struct ocfs2_extent_map_entry *ent); | ||
| 77 | static int ocfs2_extent_map_find_leaf(struct inode *inode, | ||
| 78 | u32 cpos, u32 clusters, | ||
| 79 | struct ocfs2_extent_list *el); | ||
| 80 | static int ocfs2_extent_map_lookup_read(struct inode *inode, | ||
| 81 | u32 cpos, u32 clusters, | ||
| 82 | struct ocfs2_extent_map_entry **ret_ent); | ||
| 83 | static int ocfs2_extent_map_try_insert(struct inode *inode, | ||
| 84 | struct ocfs2_extent_rec *rec, | ||
| 85 | int tree_depth, | ||
| 86 | struct ocfs2_em_insert_context *ctxt); | ||
| 87 | |||
| 88 | /* returns 1 only if the rec contains all the given clusters -- that is that | ||
| 89 | * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos + | ||
| 90 | * clusters) is >= the argument's endpoint */ | ||
| 91 | static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec, | ||
| 92 | u32 cpos, u32 clusters) | ||
| 93 | { | ||
| 94 | if (le32_to_cpu(rec->e_cpos) > cpos) | ||
| 95 | return 0; | ||
| 96 | if (cpos + clusters > le32_to_cpu(rec->e_cpos) + | ||
| 97 | le32_to_cpu(rec->e_clusters)) | ||
| 98 | return 0; | ||
| 99 | return 1; | ||
| 100 | } | ||
| 101 | |||
| 102 | |||
| 103 | /* | ||
| 104 | * Find an entry in the tree that intersects the region passed in. | ||
| 105 | * Note that this will find straddled intervals, it is up to the | ||
| 106 | * callers to enforce any boundary conditions. | ||
| 107 | * | ||
| 108 | * Callers must hold ip_lock. This lookup is not guaranteed to return | ||
| 109 | * a tree_depth 0 match, and as such can race inserts if the lock | ||
| 110 | * were not held. | ||
| 111 | * | 43 | * |
| 112 | * The rb_node garbage lets insertion share the search. Trivial | 44 | * We only cache a small number of extents stored directly on the |
| 113 | * callers pass NULL. | 45 | * inode, so linear order operations are acceptable. If we ever want |
| 46 | * to increase the size of the extent map, then these algorithms must | ||
| 47 | * get smarter. | ||
| 114 | */ | 48 | */ |
| 115 | static struct ocfs2_extent_map_entry * | 49 | |
| 116 | ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, | 50 | void ocfs2_extent_map_init(struct inode *inode) |
| 117 | u32 cpos, u32 clusters, | ||
| 118 | struct rb_node ***ret_p, | ||
| 119 | struct rb_node **ret_parent) | ||
| 120 | { | 51 | { |
| 121 | struct rb_node **p = &em->em_extents.rb_node; | 52 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
| 122 | struct rb_node *parent = NULL; | ||
| 123 | struct ocfs2_extent_map_entry *ent = NULL; | ||
| 124 | |||
| 125 | while (*p) | ||
| 126 | { | ||
| 127 | parent = *p; | ||
| 128 | ent = rb_entry(parent, struct ocfs2_extent_map_entry, | ||
| 129 | e_node); | ||
| 130 | if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) { | ||
| 131 | p = &(*p)->rb_left; | ||
| 132 | ent = NULL; | ||
| 133 | } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) + | ||
| 134 | le32_to_cpu(ent->e_rec.e_clusters))) { | ||
| 135 | p = &(*p)->rb_right; | ||
| 136 | ent = NULL; | ||
| 137 | } else | ||
| 138 | break; | ||
| 139 | } | ||
| 140 | 53 | ||
| 141 | if (ret_p != NULL) | 54 | oi->ip_extent_map.em_num_items = 0; |
| 142 | *ret_p = p; | 55 | INIT_LIST_HEAD(&oi->ip_extent_map.em_list); |
| 143 | if (ret_parent != NULL) | ||
| 144 | *ret_parent = parent; | ||
| 145 | return ent; | ||
| 146 | } | 56 | } |
| 147 | 57 | ||
| 148 | /* | 58 | static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, |
| 149 | * Find the leaf containing the interval we want. While we're on our | 59 | unsigned int cpos, |
| 150 | * way down the tree, fill in every record we see at any depth, because | 60 | struct ocfs2_extent_map_item **ret_emi) |
| 151 | * we might want it later. | ||
| 152 | * | ||
| 153 | * Note that this code is run without ip_lock. That's because it | ||
| 154 | * sleeps while reading. If someone is also filling the extent list at | ||
| 155 | * the same time we are, we might have to restart. | ||
| 156 | */ | ||
| 157 | static int ocfs2_extent_map_find_leaf(struct inode *inode, | ||
| 158 | u32 cpos, u32 clusters, | ||
| 159 | struct ocfs2_extent_list *el) | ||
| 160 | { | 61 | { |
| 161 | int i, ret; | 62 | unsigned int range; |
| 162 | struct buffer_head *eb_bh = NULL; | 63 | struct ocfs2_extent_map_item *emi; |
| 163 | u64 blkno; | ||
| 164 | u32 rec_end; | ||
| 165 | struct ocfs2_extent_block *eb; | ||
| 166 | struct ocfs2_extent_rec *rec; | ||
| 167 | |||
| 168 | /* | ||
| 169 | * The bh data containing the el cannot change here, because | ||
| 170 | * we hold alloc_sem. So we can do this without other | ||
| 171 | * locks. | ||
| 172 | */ | ||
| 173 | while (el->l_tree_depth) | ||
| 174 | { | ||
| 175 | blkno = 0; | ||
| 176 | for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { | ||
| 177 | rec = &el->l_recs[i]; | ||
| 178 | rec_end = (le32_to_cpu(rec->e_cpos) + | ||
| 179 | le32_to_cpu(rec->e_clusters)); | ||
| 180 | |||
| 181 | ret = -EBADR; | ||
| 182 | if (rec_end > OCFS2_I(inode)->ip_clusters) { | ||
| 183 | mlog_errno(ret); | ||
| 184 | ocfs2_error(inode->i_sb, | ||
| 185 | "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n", | ||
| 186 | i, | ||
| 187 | (unsigned long long)le64_to_cpu(rec->e_blkno), | ||
| 188 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
| 189 | OCFS2_I(inode)->ip_clusters); | ||
| 190 | goto out_free; | ||
| 191 | } | ||
| 192 | |||
| 193 | if (rec_end <= cpos) { | ||
| 194 | ret = ocfs2_extent_map_insert(inode, rec, | ||
| 195 | le16_to_cpu(el->l_tree_depth)); | ||
| 196 | if (ret && (ret != -EEXIST)) { | ||
| 197 | mlog_errno(ret); | ||
| 198 | goto out_free; | ||
| 199 | } | ||
| 200 | continue; | ||
| 201 | } | ||
| 202 | if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) { | ||
| 203 | ret = ocfs2_extent_map_insert(inode, rec, | ||
| 204 | le16_to_cpu(el->l_tree_depth)); | ||
| 205 | if (ret && (ret != -EEXIST)) { | ||
| 206 | mlog_errno(ret); | ||
| 207 | goto out_free; | ||
| 208 | } | ||
| 209 | continue; | ||
| 210 | } | ||
| 211 | 64 | ||
| 212 | /* | 65 | *ret_emi = NULL; |
| 213 | * We've found a record that matches our | ||
| 214 | * interval. We don't insert it because we're | ||
| 215 | * about to traverse it. | ||
| 216 | */ | ||
| 217 | |||
| 218 | /* Check to see if we're straddling */ | ||
| 219 | ret = -ESRCH; | ||
| 220 | if (!ocfs2_extent_rec_contains_clusters(rec, | ||
| 221 | cpos, | ||
| 222 | clusters)) { | ||
| 223 | mlog_errno(ret); | ||
| 224 | goto out_free; | ||
| 225 | } | ||
| 226 | 66 | ||
| 227 | /* | 67 | list_for_each_entry(emi, &em->em_list, ei_list) { |
| 228 | * If we've already found a record, the el has | 68 | range = emi->ei_cpos + emi->ei_clusters; |
| 229 | * two records covering the same interval. | ||
| 230 | * EEEK! | ||
| 231 | */ | ||
| 232 | ret = -EBADR; | ||
| 233 | if (blkno) { | ||
| 234 | mlog_errno(ret); | ||
| 235 | ocfs2_error(inode->i_sb, | ||
| 236 | "Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n", | ||
| 237 | cpos, clusters, | ||
| 238 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
| 239 | (unsigned long long)blkno, i, | ||
| 240 | (unsigned long long)le64_to_cpu(rec->e_blkno)); | ||
| 241 | goto out_free; | ||
| 242 | } | ||
| 243 | 69 | ||
| 244 | blkno = le64_to_cpu(rec->e_blkno); | 70 | if (cpos >= emi->ei_cpos && cpos < range) { |
| 245 | } | 71 | list_move(&emi->ei_list, &em->em_list); |
| 246 | 72 | ||
| 247 | /* | 73 | *ret_emi = emi; |
| 248 | * We don't support holes, and we're still up | 74 | break; |
| 249 | * in the branches, so we'd better have found someone | ||
| 250 | */ | ||
| 251 | ret = -EBADR; | ||
| 252 | if (!blkno) { | ||
| 253 | ocfs2_error(inode->i_sb, | ||
| 254 | "No record found for (cpos = %u, clusters = %u) on inode %llu\n", | ||
| 255 | cpos, clusters, | ||
| 256 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
| 257 | mlog_errno(ret); | ||
| 258 | goto out_free; | ||
| 259 | } | ||
| 260 | |||
| 261 | if (eb_bh) { | ||
| 262 | brelse(eb_bh); | ||
| 263 | eb_bh = NULL; | ||
| 264 | } | ||
| 265 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
| 266 | blkno, &eb_bh, OCFS2_BH_CACHED, | ||
| 267 | inode); | ||
| 268 | if (ret) { | ||
| 269 | mlog_errno(ret); | ||
| 270 | goto out_free; | ||
| 271 | } | ||
| 272 | eb = (struct ocfs2_extent_block *)eb_bh->b_data; | ||
| 273 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
| 274 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
| 275 | ret = -EIO; | ||
| 276 | goto out_free; | ||
| 277 | } | 75 | } |
| 278 | el = &eb->h_list; | ||
| 279 | } | 76 | } |
| 77 | } | ||
| 280 | 78 | ||
| 281 | BUG_ON(el->l_tree_depth); | 79 | static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos, |
| 282 | 80 | unsigned int *phys, unsigned int *len, | |
| 283 | for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { | 81 | unsigned int *flags) |
| 284 | rec = &el->l_recs[i]; | 82 | { |
| 285 | 83 | unsigned int coff; | |
| 286 | if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > | 84 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
| 287 | OCFS2_I(inode)->ip_clusters) { | 85 | struct ocfs2_extent_map_item *emi; |
| 288 | ret = -EBADR; | 86 | |
| 289 | mlog_errno(ret); | 87 | spin_lock(&oi->ip_lock); |
| 290 | ocfs2_error(inode->i_sb, | 88 | |
| 291 | "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n", | 89 | __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi); |
| 292 | i, | 90 | if (emi) { |
| 293 | (unsigned long long)le64_to_cpu(rec->e_blkno), | 91 | coff = cpos - emi->ei_cpos; |
| 294 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 92 | *phys = emi->ei_phys + coff; |
| 295 | OCFS2_I(inode)->ip_clusters); | 93 | if (len) |
| 296 | return ret; | 94 | *len = emi->ei_clusters - coff; |
| 297 | } | 95 | if (flags) |
| 298 | 96 | *flags = emi->ei_flags; | |
| 299 | ret = ocfs2_extent_map_insert(inode, rec, | ||
| 300 | le16_to_cpu(el->l_tree_depth)); | ||
| 301 | if (ret && (ret != -EEXIST)) { | ||
| 302 | mlog_errno(ret); | ||
| 303 | goto out_free; | ||
| 304 | } | ||
| 305 | } | 97 | } |
| 306 | 98 | ||
| 307 | ret = 0; | 99 | spin_unlock(&oi->ip_lock); |
| 308 | 100 | ||
| 309 | out_free: | 101 | if (emi == NULL) |
| 310 | if (eb_bh) | 102 | return -ENOENT; |
| 311 | brelse(eb_bh); | ||
| 312 | 103 | ||
| 313 | return ret; | 104 | return 0; |
| 314 | } | 105 | } |
| 315 | 106 | ||
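A worked example of the lookup above, with hypothetical values: a cached item covering clusters [100, 108) at physical cluster 5000 answers a query for cpos 103 with phys 5003 and 5 clusters of validity, while the hit's list_move() keeps the item at the MRU head.

```c
/* Worked example of the lookup math (hypothetical cached item). */
static void example_extent_lookup(void)
{
	unsigned int ei_cpos = 100, ei_clusters = 8, ei_phys = 5000;
	unsigned int cpos = 103;

	unsigned int coff = cpos - ei_cpos;	/* 3 clusters into the item */
	unsigned int phys = ei_phys + coff;	/* 5003 */
	unsigned int len = ei_clusters - coff;	/* 5 clusters remain valid */

	(void)phys;
	(void)len;
}
```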
| 316 | /* | 107 | /* |
| 317 | * This lookup actually will read from disk. It has one invariant: | 108 | * Forget about all clusters equal to or greater than cpos. |
| 318 | * It will never re-traverse blocks. This means that all inserts should | ||
| 319 | * be new regions or more granular regions (both allowed by insert). | ||
| 320 | */ | 109 | */ |
| 321 | static int ocfs2_extent_map_lookup_read(struct inode *inode, | 110 | void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) |
| 322 | u32 cpos, | ||
| 323 | u32 clusters, | ||
| 324 | struct ocfs2_extent_map_entry **ret_ent) | ||
| 325 | { | 111 | { |
| 326 | int ret; | 112 | struct list_head *p, *n; |
| 327 | u64 blkno; | 113 | struct ocfs2_extent_map_item *emi; |
| 328 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | 114 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
| 329 | struct ocfs2_extent_map_entry *ent; | 115 | struct ocfs2_extent_map *em = &oi->ip_extent_map; |
| 330 | struct buffer_head *bh = NULL; | 116 | LIST_HEAD(tmp_list); |
| 331 | struct ocfs2_extent_block *eb; | 117 | unsigned int range; |
| 332 | struct ocfs2_dinode *di; | 118 | |
| 333 | struct ocfs2_extent_list *el; | 119 | spin_lock(&oi->ip_lock); |
| 334 | 120 | list_for_each_safe(p, n, &em->em_list) { | |
| 335 | spin_lock(&OCFS2_I(inode)->ip_lock); | 121 | emi = list_entry(p, struct ocfs2_extent_map_item, ei_list); |
| 336 | ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); | 122 | |
| 337 | if (ent) { | 123 | if (emi->ei_cpos >= cpos) { |
| 338 | if (!ent->e_tree_depth) { | 124 | /* Full truncate of this record. */ |
| 339 | spin_unlock(&OCFS2_I(inode)->ip_lock); | 125 | list_move(&emi->ei_list, &tmp_list); |
| 340 | *ret_ent = ent; | 126 | BUG_ON(em->em_num_items == 0); |
| 341 | return 0; | 127 | em->em_num_items--; |
| 342 | } | 128 | continue; |
| 343 | blkno = le64_to_cpu(ent->e_rec.e_blkno); | ||
| 344 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
| 345 | |||
| 346 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh, | ||
| 347 | OCFS2_BH_CACHED, inode); | ||
| 348 | if (ret) { | ||
| 349 | mlog_errno(ret); | ||
| 350 | if (bh) | ||
| 351 | brelse(bh); | ||
| 352 | return ret; | ||
| 353 | } | 129 | } |
| 354 | eb = (struct ocfs2_extent_block *)bh->b_data; | ||
| 355 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
| 356 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
| 357 | brelse(bh); | ||
| 358 | return -EIO; | ||
| 359 | } | ||
| 360 | el = &eb->h_list; | ||
| 361 | } else { | ||
| 362 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
| 363 | 130 | ||
| 364 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), | 131 | range = emi->ei_cpos + emi->ei_clusters; |
| 365 | OCFS2_I(inode)->ip_blkno, &bh, | 132 | if (range > cpos) { |
| 366 | OCFS2_BH_CACHED, inode); | 133 | /* Partial truncate */ |
| 367 | if (ret) { | 134 | emi->ei_clusters = cpos - emi->ei_cpos; |
| 368 | mlog_errno(ret); | ||
| 369 | if (bh) | ||
| 370 | brelse(bh); | ||
| 371 | return ret; | ||
| 372 | } | 135 | } |
| 373 | di = (struct ocfs2_dinode *)bh->b_data; | ||
| 374 | if (!OCFS2_IS_VALID_DINODE(di)) { | ||
| 375 | brelse(bh); | ||
| 376 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di); | ||
| 377 | return -EIO; | ||
| 378 | } | ||
| 379 | el = &di->id2.i_list; | ||
| 380 | } | ||
| 381 | |||
| 382 | ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el); | ||
| 383 | brelse(bh); | ||
| 384 | if (ret) { | ||
| 385 | mlog_errno(ret); | ||
| 386 | return ret; | ||
| 387 | } | 136 | } |
| 137 | spin_unlock(&oi->ip_lock); | ||
| 388 | 138 | ||
| 389 | ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); | 139 | list_for_each_safe(p, n, &tmp_list) { |
| 390 | if (!ent) { | 140 | emi = list_entry(p, struct ocfs2_extent_map_item, ei_list); |
| 391 | ret = -ESRCH; | 141 | list_del(&emi->ei_list); |
| 392 | mlog_errno(ret); | 142 | kfree(emi); |
| 393 | return ret; | ||
| 394 | } | 143 | } |
| 395 | |||
| 396 | /* FIXME: Make sure this isn't a corruption */ | ||
| 397 | BUG_ON(ent->e_tree_depth); | ||
| 398 | |||
| 399 | *ret_ent = ent; | ||
| 400 | |||
| 401 | return 0; | ||
| 402 | } | 144 | } |
| 403 | 145 | ||
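ocfs2_extent_map_trunc() above shows a common kernel pattern: victims are unlinked onto a private list while the spinlock is held, and only freed once it is dropped, so no allocator call happens inside the critical section. A compilable user-space sketch of the same two-phase shape, with a pthread mutex standing in for ip_lock and a hand-rolled singly linked list in place of list_head (all names illustrative):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct item { struct item *next; unsigned cpos; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *cache;

/* Drop every cached item at or past 'cpos': unlink under the lock,
 * free after it is dropped. */
static void trunc(unsigned cpos)
{
        struct item *doomed = NULL, **p;

        pthread_mutex_lock(&lock);
        for (p = &cache; *p; ) {
                if ((*p)->cpos >= cpos) {
                        struct item *victim = *p;

                        *p = victim->next;
                        victim->next = doomed;
                        doomed = victim;
                } else
                        p = &(*p)->next;
        }
        pthread_mutex_unlock(&lock);

        while (doomed) {                /* free outside the lock */
                struct item *next = doomed->next;

                free(doomed);
                doomed = next;
        }
}

int main(void)
{
        for (unsigned c = 0; c < 40; c += 10) {
                struct item *it = malloc(sizeof(*it));

                if (!it)
                        return 1;
                it->cpos = c;
                it->next = cache;
                cache = it;
        }
        trunc(20);
        for (struct item *it = cache; it; it = it->next)
                printf("kept cpos %u\n", it->cpos);     /* 10, then 0 */
        return 0;
}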
| 404 | /* | 146 | /* |
| 405 | * Callers must hold ip_lock. This can insert pieces of the tree, | 147 | * Is any part of emi2 contained within emi1? |
| 406 | * thus racing lookup if the lock weren't held. | ||
| 407 | */ | 148 | */ |
| 408 | static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, | 149 | static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1, |
| 409 | struct ocfs2_extent_map_entry *ent) | 150 | struct ocfs2_extent_map_item *emi2) |
| 410 | { | 151 | { |
| 411 | struct rb_node **p, *parent; | 152 | unsigned int range1, range2; |
| 412 | struct ocfs2_extent_map_entry *old_ent; | ||
| 413 | 153 | ||
| 414 | old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos), | 154 | /* |
| 415 | le32_to_cpu(ent->e_rec.e_clusters), | 155 | * Check if logical start of emi2 is inside emi1 |
| 416 | &p, &parent); | 156 | */ |
| 417 | if (old_ent) | 157 | range1 = emi1->ei_cpos + emi1->ei_clusters; |
| 418 | return -EEXIST; | 158 | if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1) |
| 159 | return 1; | ||
| 419 | 160 | ||
| 420 | rb_link_node(&ent->e_node, parent, p); | 161 | /* |
| 421 | rb_insert_color(&ent->e_node, &em->em_extents); | 162 | * Check if logical end of emi2 is inside emi1 |
| 163 | */ | ||
| 164 | range2 = emi2->ei_cpos + emi2->ei_clusters; | ||
| 165 | if (range2 > emi1->ei_cpos && range2 <= range1) | ||
| 166 | return 1; | ||
| 422 | 167 | ||
| 423 | return 0; | 168 | return 0; |
| 424 | } | 169 | } |
| 425 | 170 | ||
| 171 | static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest, | ||
| 172 | struct ocfs2_extent_map_item *src) | ||
| 173 | { | ||
| 174 | dest->ei_cpos = src->ei_cpos; | ||
| 175 | dest->ei_phys = src->ei_phys; | ||
| 176 | dest->ei_clusters = src->ei_clusters; | ||
| 177 | dest->ei_flags = src->ei_flags; | ||
| 178 | } | ||
| 426 | 179 | ||
| 427 | /* | 180 | /* |
| 428 | * Simple rule: on any return code other than -EAGAIN, anything left | 181 | * Try to merge emi with ins. Returns 1 if merge succeeds, zero |
| 429 | * in the insert_context will be freed. | 182 | * otherwise. |
| 430 | * | ||
| 431 | * Simple rule #2: A return code of -EEXIST from this function or | ||
| 432 | * its calls to ocfs2_extent_map_insert_entry() signifies that another | ||
| 433 | * thread beat us to the insert. It is not an actual error, but it | ||
| 434 | * tells the caller we have no more work to do. | ||
| 435 | */ | 183 | */ |
| 436 | static int ocfs2_extent_map_try_insert(struct inode *inode, | 184 | static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi, |
| 437 | struct ocfs2_extent_rec *rec, | 185 | struct ocfs2_extent_map_item *ins) |
| 438 | int tree_depth, | ||
| 439 | struct ocfs2_em_insert_context *ctxt) | ||
| 440 | { | 186 | { |
| 441 | int ret; | ||
| 442 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
| 443 | struct ocfs2_extent_map_entry *old_ent; | ||
| 444 | |||
| 445 | ctxt->need_left = 0; | ||
| 446 | ctxt->need_right = 0; | ||
| 447 | ctxt->old_ent = NULL; | ||
| 448 | |||
| 449 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
| 450 | ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); | ||
| 451 | if (!ret) { | ||
| 452 | ctxt->new_ent = NULL; | ||
| 453 | goto out_unlock; | ||
| 454 | } | ||
| 455 | |||
| 456 | /* Since insert_entry failed, the map MUST have old_ent */ | ||
| 457 | old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), | ||
| 458 | le32_to_cpu(rec->e_clusters), | ||
| 459 | NULL, NULL); | ||
| 460 | |||
| 461 | BUG_ON(!old_ent); | ||
| 462 | |||
| 463 | if (old_ent->e_tree_depth < tree_depth) { | ||
| 464 | /* Another thread beat us to the lower tree_depth */ | ||
| 465 | ret = -EEXIST; | ||
| 466 | goto out_unlock; | ||
| 467 | } | ||
| 468 | |||
| 469 | if (old_ent->e_tree_depth == tree_depth) { | ||
| 470 | /* | ||
| 471 | * Another thread beat us to this tree_depth. | ||
| 472 | * Let's make sure we agree with that thread (the | ||
| 473 | * extent_rec should be identical). | ||
| 474 | */ | ||
| 475 | if (!memcmp(rec, &old_ent->e_rec, | ||
| 476 | sizeof(struct ocfs2_extent_rec))) | ||
| 477 | ret = 0; | ||
| 478 | else | ||
| 479 | /* FIXME: Should this be ESRCH/EBADR??? */ | ||
| 480 | ret = -EEXIST; | ||
| 481 | |||
| 482 | goto out_unlock; | ||
| 483 | } | ||
| 484 | |||
| 485 | /* | 187 | /* |
| 486 | * We do it in this order specifically so that no actual tree | 188 | * Handle contiguousness |
| 487 | * changes occur until we have all the pieces we need. We | ||
| 488 | * don't want malloc failures to leave an inconsistent tree. | ||
| 489 | * Whenever we drop the lock, another process could be | ||
| 490 | * inserting. Also note that, if another process just beat us | ||
| 491 | * to an insert, we might not need the same pieces we needed | ||
| 492 | * the first go round. In the end, the pieces we need will | ||
| 493 | * be used, and the pieces we don't will be freed. | ||
| 494 | */ | 189 | */ |
| 495 | ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) > | 190 | if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) && |
| 496 | le32_to_cpu(old_ent->e_rec.e_cpos)); | 191 | ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) && |
| 497 | ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) + | 192 | ins->ei_flags == emi->ei_flags) { |
| 498 | le32_to_cpu(old_ent->e_rec.e_clusters)) > | 193 | emi->ei_clusters += ins->ei_clusters; |
| 499 | (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters))); | 194 | return 1; |
| 500 | ret = -EAGAIN; | 195 | } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys && |
| 501 | if (ctxt->need_left) { | 196 | (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos && |
| 502 | if (!ctxt->left_ent) | 197 | ins->ei_flags == emi->ei_flags) { |
| 503 | goto out_unlock; | 198 | emi->ei_phys = ins->ei_phys; |
| 504 | *(ctxt->left_ent) = *old_ent; | 199 | emi->ei_cpos = ins->ei_cpos; |
| 505 | ctxt->left_ent->e_rec.e_clusters = | 200 | emi->ei_clusters += ins->ei_clusters; |
| 506 | cpu_to_le32(le32_to_cpu(rec->e_cpos) - | 201 | return 1; |
| 507 | le32_to_cpu(ctxt->left_ent->e_rec.e_cpos)); | ||
| 508 | } | ||
| 509 | if (ctxt->need_right) { | ||
| 510 | if (!ctxt->right_ent) | ||
| 511 | goto out_unlock; | ||
| 512 | *(ctxt->right_ent) = *old_ent; | ||
| 513 | ctxt->right_ent->e_rec.e_cpos = | ||
| 514 | cpu_to_le32(le32_to_cpu(rec->e_cpos) + | ||
| 515 | le32_to_cpu(rec->e_clusters)); | ||
| 516 | ctxt->right_ent->e_rec.e_clusters = | ||
| 517 | cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) + | ||
| 518 | le32_to_cpu(old_ent->e_rec.e_clusters)) - | ||
| 519 | le32_to_cpu(ctxt->right_ent->e_rec.e_cpos)); | ||
| 520 | } | ||
| 521 | |||
| 522 | rb_erase(&old_ent->e_node, &em->em_extents); | ||
| 523 | /* Now that he's erased, set him up for deletion */ | ||
| 524 | ctxt->old_ent = old_ent; | ||
| 525 | |||
| 526 | if (ctxt->need_left) { | ||
| 527 | ret = ocfs2_extent_map_insert_entry(em, | ||
| 528 | ctxt->left_ent); | ||
| 529 | if (ret) | ||
| 530 | goto out_unlock; | ||
| 531 | ctxt->left_ent = NULL; | ||
| 532 | } | 202 | } |
| 533 | 203 | ||
| 534 | if (ctxt->need_right) { | 204 | /* |
| 535 | ret = ocfs2_extent_map_insert_entry(em, | 205 | * Overlapping extents - this shouldn't happen unless we've |
| 536 | ctxt->right_ent); | 206 | * split an extent to change its flags. That is exceedingly |
| 537 | if (ret) | 207 | * rare, so there's no sense in trying to optimize it yet. |
| 538 | goto out_unlock; | 208 | */ |
| 539 | ctxt->right_ent = NULL; | 209 | if (ocfs2_ei_is_contained(emi, ins) || |
| 210 | ocfs2_ei_is_contained(ins, emi)) { | ||
| 211 | ocfs2_copy_emi_fields(emi, ins); | ||
| 212 | return 1; | ||
| 540 | } | 213 | } |
| 541 | 214 | ||
| 542 | ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); | 215 | /* No merge was possible. */ |
| 543 | 216 | return 0; | |
| 544 | if (!ret) | ||
| 545 | ctxt->new_ent = NULL; | ||
| 546 | |||
| 547 | out_unlock: | ||
| 548 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
| 549 | |||
| 550 | return ret; | ||
| 551 | } | 217 | } |
| 552 | 218 | ||
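The two contiguousness cases in ocfs2_try_to_merge_extent_map() are symmetric: a new run can extend the cached one on the right or on the left, provided both the logical and physical positions line up and the flags agree. A self-contained sketch with a worked example (plain structs, not the kernel types):

#include <stdio.h>

struct emi { unsigned cpos, phys, clusters, flags; };

/* 'ins' glues onto the end of 'emi' when its logical and physical
 * starts both continue emi's run; the second branch is the mirror
 * case, gluing onto the front. */
static int try_merge(struct emi *emi, const struct emi *ins)
{
        if (ins->phys == emi->phys + emi->clusters &&
            ins->cpos == emi->cpos + emi->clusters &&
            ins->flags == emi->flags) {
                emi->clusters += ins->clusters;
                return 1;
        }
        if (ins->phys + ins->clusters == emi->phys &&
            ins->cpos + ins->clusters == emi->cpos &&
            ins->flags == emi->flags) {
                emi->phys = ins->phys;
                emi->cpos = ins->cpos;
                emi->clusters += ins->clusters;
                return 1;
        }
        return 0;
}

int main(void)
{
        struct emi cached = { 0, 100, 4, 0 };   /* clusters 0..3  */
        struct emi ins    = { 4, 104, 2, 0 };   /* clusters 4..5  */

        printf("merged=%d len=%u\n", try_merge(&cached, &ins),
               cached.clusters);                /* merged=1 len=6 */
        return 0;
}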
| 553 | 219 | /* | |
| 554 | static int ocfs2_extent_map_insert(struct inode *inode, | 220 | * In order to reduce complexity on the caller, this insert function |
| 555 | struct ocfs2_extent_rec *rec, | 221 | * is intentionally liberal in what it will accept. |
| 556 | int tree_depth) | 222 | * |
| 223 | * The only rule is that the truncate call *must* be used whenever | ||
| 224 | * records have been deleted. This avoids inserting overlapping | ||
| 225 | * records with different physical mappings. | ||
| 226 | */ | ||
| 227 | void ocfs2_extent_map_insert_rec(struct inode *inode, | ||
| 228 | struct ocfs2_extent_rec *rec) | ||
| 557 | { | 229 | { |
| 558 | int ret; | 230 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
| 559 | struct ocfs2_em_insert_context ctxt = {0, }; | 231 | struct ocfs2_extent_map *em = &oi->ip_extent_map; |
| 560 | 232 | struct ocfs2_extent_map_item *emi, *new_emi = NULL; | |
| 561 | if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > | 233 | struct ocfs2_extent_map_item ins; |
| 562 | OCFS2_I(inode)->ip_map.em_clusters) { | 234 | |
| 563 | ret = -EBADR; | 235 | ins.ei_cpos = le32_to_cpu(rec->e_cpos); |
| 564 | mlog_errno(ret); | 236 | ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb, |
| 565 | return ret; | 237 | le64_to_cpu(rec->e_blkno)); |
| 238 | ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters); | ||
| 239 | ins.ei_flags = rec->e_flags; | ||
| 240 | |||
| 241 | search: | ||
| 242 | spin_lock(&oi->ip_lock); | ||
| 243 | |||
| 244 | list_for_each_entry(emi, &em->em_list, ei_list) { | ||
| 245 | if (ocfs2_try_to_merge_extent_map(emi, &ins)) { | ||
| 246 | list_move(&emi->ei_list, &em->em_list); | ||
| 247 | spin_unlock(&oi->ip_lock); | ||
| 248 | goto out; | ||
| 249 | } | ||
| 566 | } | 250 | } |
| 567 | 251 | ||
| 568 | /* Zero e_clusters means a truncated tail record. It better be EOF */ | 252 | /* |
| 569 | if (!rec->e_clusters) { | 253 | * No item could be merged. |
| 570 | if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) != | 254 | * |
| 571 | OCFS2_I(inode)->ip_map.em_clusters) { | 255 | * Either allocate and add a new item, or overwrite the least recently |
| 572 | ret = -EBADR; | 256 | * inserted. |
| 573 | mlog_errno(ret); | 257 | */ |
| 574 | ocfs2_error(inode->i_sb, | ||
| 575 | "Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n", | ||
| 576 | (unsigned long long)le64_to_cpu(rec->e_blkno), | ||
| 577 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
| 578 | return ret; | ||
| 579 | } | ||
| 580 | 258 | ||
| 581 | /* Ignore the truncated tail */ | 259 | if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) { |
| 582 | return 0; | 260 | if (new_emi == NULL) { |
| 583 | } | 261 | spin_unlock(&oi->ip_lock); |
| 584 | 262 | ||
| 585 | ret = -ENOMEM; | 263 | new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS); |
| 586 | ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, | 264 | if (new_emi == NULL) |
| 587 | GFP_NOFS); | 265 | goto out; |
| 588 | if (!ctxt.new_ent) { | ||
| 589 | mlog_errno(ret); | ||
| 590 | return ret; | ||
| 591 | } | ||
| 592 | 266 | ||
| 593 | ctxt.new_ent->e_rec = *rec; | 267 | goto search; |
| 594 | ctxt.new_ent->e_tree_depth = tree_depth; | ||
| 595 | |||
| 596 | do { | ||
| 597 | ret = -ENOMEM; | ||
| 598 | if (ctxt.need_left && !ctxt.left_ent) { | ||
| 599 | ctxt.left_ent = | ||
| 600 | kmem_cache_alloc(ocfs2_em_ent_cachep, | ||
| 601 | GFP_NOFS); | ||
| 602 | if (!ctxt.left_ent) | ||
| 603 | break; | ||
| 604 | } | ||
| 605 | if (ctxt.need_right && !ctxt.right_ent) { | ||
| 606 | ctxt.right_ent = | ||
| 607 | kmem_cache_alloc(ocfs2_em_ent_cachep, | ||
| 608 | GFP_NOFS); | ||
| 609 | if (!ctxt.right_ent) | ||
| 610 | break; | ||
| 611 | } | 268 | } |
| 612 | 269 | ||
| 613 | ret = ocfs2_extent_map_try_insert(inode, rec, | 270 | ocfs2_copy_emi_fields(new_emi, &ins); |
| 614 | tree_depth, &ctxt); | 271 | list_add(&new_emi->ei_list, &em->em_list); |
| 615 | } while (ret == -EAGAIN); | 272 | em->em_num_items++; |
| 616 | 273 | new_emi = NULL; | |
| 617 | if ((ret < 0) && (ret != -EEXIST)) | 274 | } else { |
| 618 | mlog_errno(ret); | 275 | BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0); |
| 276 | emi = list_entry(em->em_list.prev, | ||
| 277 | struct ocfs2_extent_map_item, ei_list); | ||
| 278 | list_move(&emi->ei_list, &em->em_list); | ||
| 279 | ocfs2_copy_emi_fields(emi, &ins); | ||
| 280 | } | ||
| 619 | 281 | ||
| 620 | if (ctxt.left_ent) | 282 | spin_unlock(&oi->ip_lock); |
| 621 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent); | ||
| 622 | if (ctxt.right_ent) | ||
| 623 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent); | ||
| 624 | if (ctxt.old_ent) | ||
| 625 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent); | ||
| 626 | if (ctxt.new_ent) | ||
| 627 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent); | ||
| 628 | 283 | ||
| 629 | return ret; | 284 | out: |
| 285 | if (new_emi) | ||
| 286 | kfree(new_emi); | ||
| 630 | } | 287 | } |
| 631 | 288 | ||
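With OCFS2_MAX_EXTENT_MAP_ITEMS fixed at 3, the list above behaves as a tiny MRU cache: hits and fresh inserts move to the front, and once the cache is full the entry at the tail is simply overwritten rather than allocated anew. The array-based sketch below shows the resulting eviction order (the kernel uses a list_head, not an array):

#include <stdio.h>
#include <string.h>

#define MAX_ITEMS 3     /* mirrors OCFS2_MAX_EXTENT_MAP_ITEMS */

struct emi { unsigned cpos, phys, clusters; };

static struct emi slots[MAX_ITEMS];
static int nr;

/* New entries go to the front; when full, the back slot (least
 * recently touched) is clobbered, as insert_rec does with
 * em_list.prev. */
static void insert(struct emi ins)
{
        if (nr < MAX_ITEMS)
                nr++;
        memmove(&slots[1], &slots[0], (nr - 1) * sizeof(slots[0]));
        slots[0] = ins;
}

int main(void)
{
        for (unsigned i = 0; i < 5; i++)
                insert((struct emi){ i * 10, i * 100, 4 });
        for (int i = 0; i < nr; i++)
                printf("slot %d: cpos %u\n", i, slots[i].cpos);
        return 0;       /* 40, 30, 20: the two oldest were evicted */
}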
| 632 | /* | 289 | /* |
| 633 | * Append this record to the tail of the extent map. It must be | 290 | * Return the 1st index within el which contains an extent start |
| 634 | * tree_depth 0. The record might be an extension of an existing | 291 | * larger than v_cluster. |
| 635 | * record, and as such that needs to be handled. eg: | ||
| 636 | * | ||
| 637 | * Existing record in the extent map: | ||
| 638 | * | ||
| 639 | * cpos = 10, len = 10 | ||
| 640 | * |---------| | ||
| 641 | * | ||
| 642 | * New Record: | ||
| 643 | * | ||
| 644 | * cpos = 10, len = 20 | ||
| 645 | * |------------------| | ||
| 646 | * | ||
| 647 | * The passed record is the new on-disk record. The new_clusters value | ||
| 648 | * is how many clusters were added to the file. If the append is a | ||
| 649 | * contiguous append, the new_clusters has been added to | ||
| 650 | * rec->e_clusters. If the append is an entirely new extent, then | ||
| 651 | * rec->e_clusters is == new_clusters. | ||
| 652 | */ | 292 | */ |
| 653 | int ocfs2_extent_map_append(struct inode *inode, | 293 | static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el, |
| 654 | struct ocfs2_extent_rec *rec, | 294 | u32 v_cluster) |
| 655 | u32 new_clusters) | ||
| 656 | { | 295 | { |
| 657 | int ret; | 296 | int i; |
| 658 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | 297 | struct ocfs2_extent_rec *rec; |
| 659 | struct ocfs2_extent_map_entry *ent; | ||
| 660 | struct ocfs2_extent_rec *old; | ||
| 661 | |||
| 662 | BUG_ON(!new_clusters); | ||
| 663 | BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters); | ||
| 664 | 298 | ||
| 665 | if (em->em_clusters < OCFS2_I(inode)->ip_clusters) { | 299 | for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { |
| 666 | /* | 300 | rec = &el->l_recs[i]; |
| 667 | * Size changed underneath us on disk. Drop any | ||
| 668 | * straddling records and update our idea of | ||
| 669 | * i_clusters | ||
| 670 | */ | ||
| 671 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
| 672 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | ||
| 673 | } | ||
| 674 | 301 | ||
| 675 | mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) + | 302 | if (v_cluster < le32_to_cpu(rec->e_cpos)) |
| 676 | le32_to_cpu(rec->e_clusters)) != | 303 | break; |
| 677 | (em->em_clusters + new_clusters), | ||
| 678 | "Inode %llu:\n" | ||
| 679 | "rec->e_cpos = %u + rec->e_clusters = %u = %u\n" | ||
| 680 | "em->em_clusters = %u + new_clusters = %u = %u\n", | ||
| 681 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
| 682 | le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters), | ||
| 683 | le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters), | ||
| 684 | em->em_clusters, new_clusters, | ||
| 685 | em->em_clusters + new_clusters); | ||
| 686 | |||
| 687 | em->em_clusters += new_clusters; | ||
| 688 | |||
| 689 | ret = -ENOENT; | ||
| 690 | if (le32_to_cpu(rec->e_clusters) > new_clusters) { | ||
| 691 | /* This is a contiguous append */ | ||
| 692 | ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1, | ||
| 693 | NULL, NULL); | ||
| 694 | if (ent) { | ||
| 695 | old = &ent->e_rec; | ||
| 696 | BUG_ON((le32_to_cpu(rec->e_cpos) + | ||
| 697 | le32_to_cpu(rec->e_clusters)) != | ||
| 698 | (le32_to_cpu(old->e_cpos) + | ||
| 699 | le32_to_cpu(old->e_clusters) + | ||
| 700 | new_clusters)); | ||
| 701 | if (ent->e_tree_depth == 0) { | ||
| 702 | BUG_ON(le32_to_cpu(old->e_cpos) != | ||
| 703 | le32_to_cpu(rec->e_cpos)); | ||
| 704 | BUG_ON(le64_to_cpu(old->e_blkno) != | ||
| 705 | le64_to_cpu(rec->e_blkno)); | ||
| 706 | ret = 0; | ||
| 707 | } | ||
| 708 | /* | ||
| 709 | * Let non-leafs fall through as -ENOENT to | ||
| 710 | * force insertion of the new leaf. | ||
| 711 | */ | ||
| 712 | le32_add_cpu(&old->e_clusters, new_clusters); | ||
| 713 | } | ||
| 714 | } | 304 | } |
| 715 | 305 | ||
| 716 | if (ret == -ENOENT) | 306 | return i; |
| 717 | ret = ocfs2_extent_map_insert(inode, rec, 0); | ||
| 718 | if (ret < 0) | ||
| 719 | mlog_errno(ret); | ||
| 720 | return ret; | ||
| 721 | } | 307 | } |
| 722 | 308 | ||
| 723 | #if 0 | ||
| 724 | /* Code here is included but defined out as it completes the extent | ||
| 725 | * map api and may be used in the future. */ | ||
| 726 | |||
| 727 | /* | 309 | /* |
| 728 | * Look up the record containing this cluster offset. This record is | 310 | * Figure out the size of a hole which starts at v_cluster within the given |
| 729 | * part of the extent map. Do not free it. Any changes you make to | 311 | * extent list. |
| 730 | * it will reflect in the extent map. So, if your last extent | ||
| 731 | * is (cpos = 10, clusters = 10) and you truncate the file by 5 | ||
| 732 | * clusters, you can do: | ||
| 733 | * | 312 | * |
| 734 | * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec); | 313 | * If there is no more allocation past v_cluster, we return the maximum |
| 735 | * rec->e_clusters -= 5; | 314 | * cluster size minus v_cluster. |
| 736 | * | 315 | * |
| 737 | * The lookup does not read from disk. If the map isn't filled in for | 316 | * If we have in-inode extents, then el points to the dinode list and |
| 738 | * an entry, you won't find it. | 317 | * eb_bh is NULL. Otherwise, eb_bh should point to the extent block |
| 739 | * | 318 | * containing el. |
| 740 | * Also note that the returned record is valid until alloc_sem is | ||
| 741 | * dropped. After that, truncate and extend can happen. Caveat Emptor. | ||
| 742 | */ | 319 | */ |
| 743 | int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos, | 320 | static int ocfs2_figure_hole_clusters(struct inode *inode, |
| 744 | struct ocfs2_extent_rec **rec, | 321 | struct ocfs2_extent_list *el, |
| 745 | int *tree_depth) | 322 | struct buffer_head *eb_bh, |
| 323 | u32 v_cluster, | ||
| 324 | u32 *num_clusters) | ||
| 746 | { | 325 | { |
| 747 | int ret = -ENOENT; | 326 | int ret, i; |
| 748 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | 327 | struct buffer_head *next_eb_bh = NULL; |
| 749 | struct ocfs2_extent_map_entry *ent; | 328 | struct ocfs2_extent_block *eb, *next_eb; |
| 750 | 329 | ||
| 751 | *rec = NULL; | 330 | i = ocfs2_search_for_hole_index(el, v_cluster); |
| 752 | 331 | ||
| 753 | if (cpos >= OCFS2_I(inode)->ip_clusters) | 332 | if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) { |
| 754 | return -EINVAL; | 333 | eb = (struct ocfs2_extent_block *)eb_bh->b_data; |
| 755 | 334 | ||
| 756 | if (cpos >= em->em_clusters) { | ||
| 757 | /* | 335 | /* |
| 758 | * Size changed underneath us on disk. Drop any | 336 | * Check the next leaf for any extents. |
| 759 | * straddling records and update our idea of | ||
| 760 | * i_clusters | ||
| 761 | */ | 337 | */ |
| 762 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
| 763 | em->em_clusters = OCFS2_I(inode)->ip_clusters ; | ||
| 764 | } | ||
| 765 | |||
| 766 | ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1, | ||
| 767 | NULL, NULL); | ||
| 768 | 338 | ||
| 769 | if (ent) { | 339 | if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) |
| 770 | *rec = &ent->e_rec; | 340 | goto no_more_extents; |
| 771 | if (tree_depth) | ||
| 772 | *tree_depth = ent->e_tree_depth; | ||
| 773 | ret = 0; | ||
| 774 | } | ||
| 775 | 341 | ||
| 776 | return ret; | 342 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), |
| 777 | } | 343 | le64_to_cpu(eb->h_next_leaf_blk), |
| 344 | &next_eb_bh, OCFS2_BH_CACHED, inode); | ||
| 345 | if (ret) { | ||
| 346 | mlog_errno(ret); | ||
| 347 | goto out; | ||
| 348 | } | ||
| 349 | next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data; | ||
| 778 | 350 | ||
| 779 | int ocfs2_extent_map_get_clusters(struct inode *inode, | 351 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) { |
| 780 | u32 v_cpos, int count, | 352 | ret = -EROFS; |
| 781 | u32 *p_cpos, int *ret_count) | 353 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb); |
| 782 | { | 354 | goto out; |
| 783 | int ret; | 355 | } |
| 784 | u32 coff, ccount; | ||
| 785 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
| 786 | struct ocfs2_extent_map_entry *ent = NULL; | ||
| 787 | 356 | ||
| 788 | *p_cpos = ccount = 0; | 357 | el = &next_eb->h_list; |
| 789 | 358 | ||
| 790 | if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters) | 359 | i = ocfs2_search_for_hole_index(el, v_cluster); |
| 791 | return -EINVAL; | 360 | } |
| 792 | 361 | ||
| 793 | if ((v_cpos + count) > em->em_clusters) { | 362 | no_more_extents: |
| 363 | if (i == le16_to_cpu(el->l_next_free_rec)) { | ||
| 794 | /* | 364 | /* |
| 795 | * Size changed underneath us on disk. Drop any | 365 | * We're at the end of our existing allocation. Just |
| 796 | * straddling records and update our idea of | 366 | * return the maximum number of clusters we could |
| 797 | * i_clusters | 367 | * possibly allocate. |
| 798 | */ | 368 | */ |
| 799 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | 369 | *num_clusters = UINT_MAX - v_cluster; |
| 800 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | 370 | } else { |
| 371 | *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster; | ||
| 801 | } | 372 | } |
| 802 | 373 | ||
| 374 | ret = 0; | ||
| 375 | out: | ||
| 376 | brelse(next_eb_bh); | ||
| 377 | return ret; | ||
| 378 | } | ||
| 803 | 379 | ||
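Hole sizing therefore reduces to finding the first record that starts past v_cluster: the gap up to that record's e_cpos is the hole, and past the final record the hole is treated as unbounded (UINT_MAX - v_cluster above). A compact sketch of the same search over a sorted record list (illustrative types; the hop to the next leaf block is omitted):

#include <limits.h>
#include <stdio.h>

struct rec { unsigned cpos, clusters; };

/* First record starting past 'v' bounds the hole; with no such
 * record the hole runs to the maximum cluster count. */
static unsigned hole_clusters(const struct rec *el, int n, unsigned v)
{
        int i;

        for (i = 0; i < n; i++)
                if (v < el[i].cpos)
                        return el[i].cpos - v;
        return UINT_MAX - v;
}

int main(void)
{
        struct rec el[] = { { 0, 4 }, { 10, 6 } };      /* hole at 4..9 */

        printf("hole at 4 spans %u clusters\n",
               hole_clusters(el, 2, 4));                /* prints 6 */
        return 0;
}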
| 804 | ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent); | 380 | /* |
| 805 | if (ret) | 381 | * Return the index of the extent record which contains cluster #v_cluster. |
| 806 | return ret; | 382 | * -1 is returned if it was not found. |
| 383 | * | ||
| 384 | * Should work fine on interior and exterior nodes. | ||
| 385 | */ | ||
| 386 | static int ocfs2_search_extent_list(struct ocfs2_extent_list *el, | ||
| 387 | u32 v_cluster) | ||
| 388 | { | ||
| 389 | int ret = -1; | ||
| 390 | int i; | ||
| 391 | struct ocfs2_extent_rec *rec; | ||
| 392 | u32 rec_end, rec_start, clusters; | ||
| 807 | 393 | ||
| 808 | if (ent) { | 394 | for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { |
| 809 | /* We should never find ourselves straddling an interval */ | 395 | rec = &el->l_recs[i]; |
| 810 | if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec, | ||
| 811 | v_cpos, | ||
| 812 | count)) | ||
| 813 | return -ESRCH; | ||
| 814 | 396 | ||
| 815 | coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos); | 397 | rec_start = le32_to_cpu(rec->e_cpos); |
| 816 | *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb, | 398 | clusters = ocfs2_rec_clusters(el, rec); |
| 817 | le64_to_cpu(ent->e_rec.e_blkno)) + | ||
| 818 | coff; | ||
| 819 | 399 | ||
| 820 | if (ret_count) | 400 | rec_end = rec_start + clusters; |
| 821 | *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff; | ||
| 822 | 401 | ||
| 823 | return 0; | 402 | if (v_cluster >= rec_start && v_cluster < rec_end) { |
| 403 | ret = i; | ||
| 404 | break; | ||
| 405 | } | ||
| 824 | } | 406 | } |
| 825 | 407 | ||
| 826 | 408 | return ret; | |
| 827 | return -ENOENT; | ||
| 828 | } | 409 | } |
| 829 | 410 | ||
| 830 | #endif /* 0 */ | 411 | int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, |
| 831 | 412 | u32 *p_cluster, u32 *num_clusters, | |
| 832 | int ocfs2_extent_map_get_blocks(struct inode *inode, | 413 | unsigned int *extent_flags) |
| 833 | u64 v_blkno, int count, | ||
| 834 | u64 *p_blkno, int *ret_count) | ||
| 835 | { | 414 | { |
| 836 | int ret; | 415 | int ret, i; |
| 837 | u64 boff; | 416 | unsigned int flags = 0; |
| 838 | u32 cpos, clusters; | 417 | struct buffer_head *di_bh = NULL; |
| 839 | int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); | 418 | struct buffer_head *eb_bh = NULL; |
| 840 | struct ocfs2_extent_map_entry *ent = NULL; | 419 | struct ocfs2_dinode *di; |
| 841 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | 420 | struct ocfs2_extent_block *eb; |
| 421 | struct ocfs2_extent_list *el; | ||
| 842 | struct ocfs2_extent_rec *rec; | 422 | struct ocfs2_extent_rec *rec; |
| 423 | u32 coff; | ||
| 843 | 424 | ||
| 844 | *p_blkno = 0; | 425 | ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster, |
| 845 | 426 | num_clusters, extent_flags); | |
| 846 | cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); | 427 | if (ret == 0) |
| 847 | clusters = ocfs2_blocks_to_clusters(inode->i_sb, | 428 | goto out; |
| 848 | (u64)count + bpc - 1); | ||
| 849 | if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) { | ||
| 850 | ret = -EINVAL; | ||
| 851 | mlog_errno(ret); | ||
| 852 | return ret; | ||
| 853 | } | ||
| 854 | |||
| 855 | if ((cpos + clusters) > em->em_clusters) { | ||
| 856 | /* | ||
| 857 | * Size changed underneath us on disk. Drop any | ||
| 858 | * straddling records and update our idea of | ||
| 859 | * i_clusters | ||
| 860 | */ | ||
| 861 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
| 862 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | ||
| 863 | } | ||
| 864 | 429 | ||
| 865 | ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent); | 430 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, |
| 431 | &di_bh, OCFS2_BH_CACHED, inode); | ||
| 866 | if (ret) { | 432 | if (ret) { |
| 867 | mlog_errno(ret); | 433 | mlog_errno(ret); |
| 868 | return ret; | 434 | goto out; |
| 869 | } | 435 | } |
| 870 | 436 | ||
| 871 | if (ent) | 437 | di = (struct ocfs2_dinode *) di_bh->b_data; |
| 872 | { | 438 | el = &di->id2.i_list; |
| 873 | rec = &ent->e_rec; | ||
| 874 | 439 | ||
| 875 | /* We should never find ourselves straddling an interval */ | 440 | if (el->l_tree_depth) { |
| 876 | if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) { | 441 | ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); |
| 877 | ret = -ESRCH; | 442 | if (ret) { |
| 878 | mlog_errno(ret); | 443 | mlog_errno(ret); |
| 879 | return ret; | 444 | goto out; |
| 880 | } | 445 | } |
| 881 | 446 | ||
| 882 | boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos - | 447 | eb = (struct ocfs2_extent_block *) eb_bh->b_data; |
| 883 | le32_to_cpu(rec->e_cpos)); | 448 | el = &eb->h_list; |
| 884 | boff += (v_blkno & (u64)(bpc - 1)); | ||
| 885 | *p_blkno = le64_to_cpu(rec->e_blkno) + boff; | ||
| 886 | 449 | ||
| 887 | if (ret_count) { | 450 | if (el->l_tree_depth) { |
| 888 | *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, | 451 | ocfs2_error(inode->i_sb, |
| 889 | le32_to_cpu(rec->e_clusters)) - boff; | 452 | "Inode %lu has non zero tree depth in " |
| 453 | "leaf block %llu\n", inode->i_ino, | ||
| 454 | (unsigned long long)eb_bh->b_blocknr); | ||
| 455 | ret = -EROFS; | ||
| 456 | goto out; | ||
| 890 | } | 457 | } |
| 891 | |||
| 892 | return 0; | ||
| 893 | } | 458 | } |
| 894 | 459 | ||
| 895 | return -ENOENT; | 460 | i = ocfs2_search_extent_list(el, v_cluster); |
| 896 | } | 461 | if (i == -1) { |
| 897 | 462 | /* | |
| 898 | int ocfs2_extent_map_init(struct inode *inode) | 463 | * A hole was found. Return some canned values that |
| 899 | { | 464 | * callers can key on. If asked for, num_clusters will |
| 900 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | 465 | * be populated with the size of the hole. |
| 901 | 466 | */ | |
| 902 | em->em_extents = RB_ROOT; | 467 | *p_cluster = 0; |
| 903 | em->em_clusters = 0; | 468 | if (num_clusters) { |
| 904 | 469 | ret = ocfs2_figure_hole_clusters(inode, el, eb_bh, | |
| 905 | return 0; | 470 | v_cluster, |
| 906 | } | 471 | num_clusters); |
| 907 | 472 | if (ret) { | |
| 908 | /* Needs the lock */ | 473 | mlog_errno(ret); |
| 909 | static void __ocfs2_extent_map_drop(struct inode *inode, | 474 | goto out; |
| 910 | u32 new_clusters, | 475 | } |
| 911 | struct rb_node **free_head, | 476 | } |
| 912 | struct ocfs2_extent_map_entry **tail_ent) | 477 | } else { |
| 913 | { | 478 | rec = &el->l_recs[i]; |
| 914 | struct rb_node *node, *next; | ||
| 915 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
| 916 | struct ocfs2_extent_map_entry *ent; | ||
| 917 | 479 | ||
| 918 | *free_head = NULL; | 480 | BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); |
| 919 | 481 | ||
| 920 | ent = NULL; | 482 | if (!rec->e_blkno) { |
| 921 | node = rb_last(&em->em_extents); | 483 | ocfs2_error(inode->i_sb, "Inode %lu has bad extent " |
| 922 | while (node) | 484 | "record (%u, %u, 0)", inode->i_ino, |
| 923 | { | 485 | le32_to_cpu(rec->e_cpos), |
| 924 | next = rb_prev(node); | 486 | ocfs2_rec_clusters(el, rec)); |
| 487 | ret = -EROFS; | ||
| 488 | goto out; | ||
| 489 | } | ||
| 925 | 490 | ||
| 926 | ent = rb_entry(node, struct ocfs2_extent_map_entry, | 491 | coff = v_cluster - le32_to_cpu(rec->e_cpos); |
| 927 | e_node); | ||
| 928 | if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters) | ||
| 929 | break; | ||
| 930 | 492 | ||
| 931 | rb_erase(&ent->e_node, &em->em_extents); | 493 | *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, |
| 494 | le64_to_cpu(rec->e_blkno)); | ||
| 495 | *p_cluster = *p_cluster + coff; | ||
| 932 | 496 | ||
| 933 | node->rb_right = *free_head; | 497 | if (num_clusters) |
| 934 | *free_head = node; | 498 | *num_clusters = ocfs2_rec_clusters(el, rec) - coff; |
| 935 | 499 | ||
| 936 | ent = NULL; | 500 | flags = rec->e_flags; |
| 937 | node = next; | ||
| 938 | } | ||
| 939 | 501 | ||
| 940 | /* Do we have an entry straddling new_clusters? */ | 502 | ocfs2_extent_map_insert_rec(inode, rec); |
| 941 | if (tail_ent) { | ||
| 942 | if (ent && | ||
| 943 | ((le32_to_cpu(ent->e_rec.e_cpos) + | ||
| 944 | le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters)) | ||
| 945 | *tail_ent = ent; | ||
| 946 | else | ||
| 947 | *tail_ent = NULL; | ||
| 948 | } | 503 | } |
| 949 | } | ||
| 950 | |||
| 951 | static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head) | ||
| 952 | { | ||
| 953 | struct rb_node *node; | ||
| 954 | struct ocfs2_extent_map_entry *ent; | ||
| 955 | 504 | ||
| 956 | while (free_head) { | 505 | if (extent_flags) |
| 957 | node = free_head; | 506 | *extent_flags = flags; |
| 958 | free_head = node->rb_right; | ||
| 959 | 507 | ||
| 960 | ent = rb_entry(node, struct ocfs2_extent_map_entry, | 508 | out: |
| 961 | e_node); | 509 | brelse(di_bh); |
| 962 | kmem_cache_free(ocfs2_em_ent_cachep, ent); | 510 | brelse(eb_bh); |
| 963 | } | 511 | return ret; |
| 964 | } | 512 | } |
| 965 | 513 | ||
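ocfs2_get_clusters() stitches the pieces together: try the in-memory map first, walk the on-disk tree only on a miss, and seed the map with the leaf record found. Note that holes (p_cluster == 0) are reported to the caller but never cached. A user-space sketch of that ordering, with stand-ins for the cache and tree lookups (none of these helpers are kernel functions):

#include <stdio.h>

/* Stand-ins: a cache that always misses, and a fake layout with a
 * hole over clusters 0..3 and allocation from cluster 4 on. */
static int cache_lookup(unsigned v, unsigned *p, unsigned *len)
{
        (void)v; (void)p; (void)len;
        return -1;                      /* simulate a miss */
}

static int tree_lookup(unsigned v, unsigned *p, unsigned *len)
{
        if (v < 4) {
                *p = 0;                 /* canned value for a hole */
                *len = 4 - v;
        } else {
                *p = 100 + (v - 4);
                *len = 8;
        }
        return 0;
}

static void cache_insert(unsigned v, unsigned p, unsigned len)
{
        printf("caching v=%u -> p=%u len=%u\n", v, p, len);
}

/* Ordering mirrored from ocfs2_get_clusters(): cache, then disk,
 * then populate the cache, but only with real allocation. */
static int get_clusters(unsigned v, unsigned *p, unsigned *len)
{
        if (cache_lookup(v, p, len) == 0)
                return 0;
        if (tree_lookup(v, p, len) != 0)
                return -1;
        if (*p != 0)
                cache_insert(v, *p, *len);
        return 0;
}

int main(void)
{
        unsigned p, len;

        get_clusters(2, &p, &len);
        printf("v=2 -> p=%u (0 means hole), len=%u\n", p, len);
        get_clusters(6, &p, &len);
        printf("v=6 -> p=%u, len=%u\n", p, len);
        return 0;
}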
| 966 | /* | 514 | /* |
| 967 | * Remove all entries past new_clusters, inclusive of an entry that | 515 | * This expects alloc_sem to be held. The allocation cannot change at |
| 968 | * contains new_clusters. This is effectively a cache forget. | 516 | * all while the map is in the process of being updated. |
| 969 | * | ||
| 970 | * If you want to also clip the last extent by some number of clusters, | ||
| 971 | * you need to call ocfs2_extent_map_trunc(). | ||
| 972 | * This code does not check or modify ip_clusters. | ||
| 973 | */ | 517 | */ |
| 974 | int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters) | 518 | int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, |
| 519 | u64 *ret_count, unsigned int *extent_flags) | ||
| 975 | { | 520 | { |
| 976 | struct rb_node *free_head = NULL; | 521 | int ret; |
| 977 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | 522 | int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); |
| 978 | struct ocfs2_extent_map_entry *ent; | 523 | u32 cpos, num_clusters, p_cluster; |
| 979 | 524 | u64 boff = 0; | |
| 980 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
| 981 | 525 | ||
| 982 | __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); | 526 | cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); |
| 983 | 527 | ||
| 984 | if (ent) { | 528 | ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters, |
| 985 | rb_erase(&ent->e_node, &em->em_extents); | 529 | extent_flags); |
| 986 | ent->e_node.rb_right = free_head; | 530 | if (ret) { |
| 987 | free_head = &ent->e_node; | 531 | mlog_errno(ret); |
| 532 | goto out; | ||
| 988 | } | 533 | } |
| 989 | 534 | ||
| 990 | spin_unlock(&OCFS2_I(inode)->ip_lock); | 535 | /* |
| 991 | 536 | * p_cluster == 0 indicates a hole. | |
| 992 | if (free_head) | 537 | */ |
| 993 | __ocfs2_extent_map_drop_cleanup(free_head); | 538 | if (p_cluster) { |
| 994 | 539 | boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); | |
| 995 | return 0; | 540 | boff += (v_blkno & (u64)(bpc - 1)); |
| 996 | } | 541 | } |
| 997 | |||
| 998 | /* | ||
| 999 | * Remove all entries past new_clusters and also clip any extent | ||
| 1000 | * straddling new_clusters, if there is one. This does not check | ||
| 1001 | * or modify ip_clusters | ||
| 1002 | */ | ||
| 1003 | int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters) | ||
| 1004 | { | ||
| 1005 | struct rb_node *free_head = NULL; | ||
| 1006 | struct ocfs2_extent_map_entry *ent = NULL; | ||
| 1007 | |||
| 1008 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
| 1009 | |||
| 1010 | __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); | ||
| 1011 | |||
| 1012 | if (ent) | ||
| 1013 | ent->e_rec.e_clusters = cpu_to_le32(new_clusters - | ||
| 1014 | le32_to_cpu(ent->e_rec.e_cpos)); | ||
| 1015 | |||
| 1016 | OCFS2_I(inode)->ip_map.em_clusters = new_clusters; | ||
| 1017 | |||
| 1018 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
| 1019 | |||
| 1020 | if (free_head) | ||
| 1021 | __ocfs2_extent_map_drop_cleanup(free_head); | ||
| 1022 | |||
| 1023 | return 0; | ||
| 1024 | } | ||
| 1025 | 542 | ||
| 1026 | int __init init_ocfs2_extent_maps(void) | 543 | *p_blkno = boff; |
| 1027 | { | ||
| 1028 | ocfs2_em_ent_cachep = | ||
| 1029 | kmem_cache_create("ocfs2_em_ent", | ||
| 1030 | sizeof(struct ocfs2_extent_map_entry), | ||
| 1031 | 0, SLAB_HWCACHE_ALIGN, NULL, NULL); | ||
| 1032 | if (!ocfs2_em_ent_cachep) | ||
| 1033 | return -ENOMEM; | ||
| 1034 | 544 | ||
| 1035 | return 0; | 545 | if (ret_count) { |
| 1036 | } | 546 | *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); |
| 547 | *ret_count -= v_blkno & (u64)(bpc - 1); | ||
| 548 | } | ||
| 1037 | 549 | ||
| 1038 | void exit_ocfs2_extent_maps(void) | 550 | out: |
| 1039 | { | 551 | return ret; |
| 1040 | kmem_cache_destroy(ocfs2_em_ent_cachep); | ||
| 1041 | } | 552 | } |
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index fa3745efa886..de91e3e41a22 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h | |||
| @@ -25,22 +25,29 @@ | |||
| 25 | #ifndef _EXTENT_MAP_H | 25 | #ifndef _EXTENT_MAP_H |
| 26 | #define _EXTENT_MAP_H | 26 | #define _EXTENT_MAP_H |
| 27 | 27 | ||
| 28 | int init_ocfs2_extent_maps(void); | 28 | struct ocfs2_extent_map_item { |
| 29 | void exit_ocfs2_extent_maps(void); | 29 | unsigned int ei_cpos; |
| 30 | unsigned int ei_phys; | ||
| 31 | unsigned int ei_clusters; | ||
| 32 | unsigned int ei_flags; | ||
| 30 | 33 | ||
| 31 | /* | 34 | struct list_head ei_list; |
| 32 | * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem | 35 | }; |
| 33 | * to be held. The allocation cannot change at all while the map is | 36 | |
| 34 | * in the process of being updated. | 37 | #define OCFS2_MAX_EXTENT_MAP_ITEMS 3 |
| 35 | */ | 38 | struct ocfs2_extent_map { |
| 36 | int ocfs2_extent_map_init(struct inode *inode); | 39 | unsigned int em_num_items; |
| 37 | int ocfs2_extent_map_append(struct inode *inode, | 40 | struct list_head em_list; |
| 38 | struct ocfs2_extent_rec *rec, | 41 | }; |
| 39 | u32 new_clusters); | 42 | |
| 40 | int ocfs2_extent_map_get_blocks(struct inode *inode, | 43 | void ocfs2_extent_map_init(struct inode *inode); |
| 41 | u64 v_blkno, int count, | 44 | void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster); |
| 42 | u64 *p_blkno, int *ret_count); | 45 | void ocfs2_extent_map_insert_rec(struct inode *inode, |
| 43 | int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters); | 46 | struct ocfs2_extent_rec *rec); |
| 44 | int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters); | 47 | |
| 48 | int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, | ||
| 49 | u32 *num_clusters, unsigned int *extent_flags); | ||
| 50 | int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, | ||
| 51 | u64 *ret_count, unsigned int *extent_flags); | ||
| 45 | 52 | ||
| 46 | #endif /* _EXTENT_MAP_H */ | 53 | #endif /* _EXTENT_MAP_H */ |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index f2cd3bf9efb2..520a2a6d7670 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
| @@ -33,6 +33,7 @@ | |||
| 33 | #include <linux/sched.h> | 33 | #include <linux/sched.h> |
| 34 | #include <linux/pipe_fs_i.h> | 34 | #include <linux/pipe_fs_i.h> |
| 35 | #include <linux/mount.h> | 35 | #include <linux/mount.h> |
| 36 | #include <linux/writeback.h> | ||
| 36 | 37 | ||
| 37 | #define MLOG_MASK_PREFIX ML_INODE | 38 | #define MLOG_MASK_PREFIX ML_INODE |
| 38 | #include <cluster/masklog.h> | 39 | #include <cluster/masklog.h> |
| @@ -215,7 +216,7 @@ int ocfs2_set_inode_size(handle_t *handle, | |||
| 215 | 216 | ||
| 216 | mlog_entry_void(); | 217 | mlog_entry_void(); |
| 217 | i_size_write(inode, new_i_size); | 218 | i_size_write(inode, new_i_size); |
| 218 | inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); | 219 | inode->i_blocks = ocfs2_inode_sector_count(inode); |
| 219 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 220 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
| 220 | 221 | ||
| 221 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); | 222 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); |
| @@ -261,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, | |||
| 261 | { | 262 | { |
| 262 | int status; | 263 | int status; |
| 263 | handle_t *handle; | 264 | handle_t *handle; |
| 265 | struct ocfs2_dinode *di; | ||
| 264 | 266 | ||
| 265 | mlog_entry_void(); | 267 | mlog_entry_void(); |
| 266 | 268 | ||
| @@ -274,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, | |||
| 274 | goto out; | 276 | goto out; |
| 275 | } | 277 | } |
| 276 | 278 | ||
| 277 | status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); | 279 | status = ocfs2_journal_access(handle, inode, fe_bh, |
| 280 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
| 281 | if (status < 0) { | ||
| 282 | mlog_errno(status); | ||
| 283 | goto out_commit; | ||
| 284 | } | ||
| 285 | |||
| 286 | /* | ||
| 287 | * Do this before setting i_size. | ||
| 288 | */ | ||
| 289 | status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size); | ||
| 290 | if (status) { | ||
| 291 | mlog_errno(status); | ||
| 292 | goto out_commit; | ||
| 293 | } | ||
| 294 | |||
| 295 | i_size_write(inode, new_i_size); | ||
| 296 | inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); | ||
| 297 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | ||
| 298 | |||
| 299 | di = (struct ocfs2_dinode *) fe_bh->b_data; | ||
| 300 | di->i_size = cpu_to_le64(new_i_size); | ||
| 301 | di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); | ||
| 302 | di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); | ||
| 303 | |||
| 304 | status = ocfs2_journal_dirty(handle, fe_bh); | ||
| 278 | if (status < 0) | 305 | if (status < 0) |
| 279 | mlog_errno(status); | 306 | mlog_errno(status); |
| 280 | 307 | ||
| 308 | out_commit: | ||
| 281 | ocfs2_commit_trans(osb, handle); | 309 | ocfs2_commit_trans(osb, handle); |
| 282 | out: | 310 | out: |
| 311 | |||
| 283 | mlog_exit(status); | 312 | mlog_exit(status); |
| 284 | return status; | 313 | return status; |
| 285 | } | 314 | } |
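The ordering in the rewritten ocfs2_orphan_for_truncate() is the point: the tail of the final cluster is zeroed before the smaller i_size is published, all under one journal handle, so stale bytes past EOF cannot reappear through a later extend. A loose user-space analogue of that zero-then-shrink ordering (plain POSIX calls; the 4k block size is assumed rather than queried, and fsync stands in for the journal's ordering guarantee):

#include <fcntl.h>
#include <unistd.h>

#define BLKSZ 4096

/* Scrub bytes past the new EOF in the final block, force them out,
 * and only then publish the smaller size. */
static int shrink(int fd, off_t new_size)
{
        off_t tail = new_size % BLKSZ;

        if (tail) {
                static const char zeros[BLKSZ];

                if (pwrite(fd, zeros, BLKSZ - tail, new_size) < 0)
                        return -1;
                if (fsync(fd) < 0)
                        return -1;
        }
        return ftruncate(fd, new_size);
}

int main(void)
{
        int fd = open("demo.dat", O_RDWR | O_CREAT, 0644);

        if (fd < 0)
                return 1;
        if (write(fd, "data well past the new eof", 26) != 26)
                return 1;
        if (shrink(fd, 10) < 0)
                return 1;
        return close(fd);
}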
| @@ -342,19 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode, | |||
| 342 | mlog_errno(status); | 371 | mlog_errno(status); |
| 343 | goto bail; | 372 | goto bail; |
| 344 | } | 373 | } |
| 345 | ocfs2_data_unlock(inode, 1); | ||
| 346 | |||
| 347 | if (le32_to_cpu(fe->i_clusters) == | ||
| 348 | ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { | ||
| 349 | mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", | ||
| 350 | fe->i_clusters); | ||
| 351 | /* No allocation change is required, so lets fast path | ||
| 352 | * this truncate. */ | ||
| 353 | status = ocfs2_simple_size_update(inode, di_bh, new_i_size); | ||
| 354 | if (status < 0) | ||
| 355 | mlog_errno(status); | ||
| 356 | goto bail; | ||
| 357 | } | ||
| 358 | 374 | ||
| 359 | /* alright, we're going to need to do a full blown alloc size | 375 | /* alright, we're going to need to do a full blown alloc size |
| 360 | * change. Orphan the inode so that recovery can complete the | 376 | * change. Orphan the inode so that recovery can complete the |
| @@ -363,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode, | |||
| 363 | status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); | 379 | status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); |
| 364 | if (status < 0) { | 380 | if (status < 0) { |
| 365 | mlog_errno(status); | 381 | mlog_errno(status); |
| 366 | goto bail; | 382 | goto bail_unlock_data; |
| 367 | } | 383 | } |
| 368 | 384 | ||
| 369 | status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); | 385 | status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); |
| 370 | if (status < 0) { | 386 | if (status < 0) { |
| 371 | mlog_errno(status); | 387 | mlog_errno(status); |
| 372 | goto bail; | 388 | goto bail_unlock_data; |
| 373 | } | 389 | } |
| 374 | 390 | ||
| 375 | status = ocfs2_commit_truncate(osb, inode, di_bh, tc); | 391 | status = ocfs2_commit_truncate(osb, inode, di_bh, tc); |
| 376 | if (status < 0) { | 392 | if (status < 0) { |
| 377 | mlog_errno(status); | 393 | mlog_errno(status); |
| 378 | goto bail; | 394 | goto bail_unlock_data; |
| 379 | } | 395 | } |
| 380 | 396 | ||
| 381 | /* TODO: orphan dir cleanup here. */ | 397 | /* TODO: orphan dir cleanup here. */ |
| 398 | bail_unlock_data: | ||
| 399 | ocfs2_data_unlock(inode, 1); | ||
| 400 | |||
| 382 | bail: | 401 | bail: |
| 383 | 402 | ||
| 384 | mlog_exit(status); | 403 | mlog_exit(status); |
| @@ -397,6 +416,7 @@ bail: | |||
| 397 | */ | 416 | */ |
| 398 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | 417 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, |
| 399 | struct inode *inode, | 418 | struct inode *inode, |
| 419 | u32 *logical_offset, | ||
| 400 | u32 clusters_to_add, | 420 | u32 clusters_to_add, |
| 401 | struct buffer_head *fe_bh, | 421 | struct buffer_head *fe_bh, |
| 402 | handle_t *handle, | 422 | handle_t *handle, |
| @@ -460,18 +480,14 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | |||
| 460 | block = ocfs2_clusters_to_blocks(osb->sb, bit_off); | 480 | block = ocfs2_clusters_to_blocks(osb->sb, bit_off); |
| 461 | mlog(0, "Allocating %u clusters at block %u for inode %llu\n", | 481 | mlog(0, "Allocating %u clusters at block %u for inode %llu\n", |
| 462 | num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); | 482 | num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); |
| 463 | status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, | 483 | status = ocfs2_insert_extent(osb, handle, inode, fe_bh, |
| 464 | num_bits, meta_ac); | 484 | *logical_offset, block, num_bits, |
| 485 | meta_ac); | ||
| 465 | if (status < 0) { | 486 | if (status < 0) { |
| 466 | mlog_errno(status); | 487 | mlog_errno(status); |
| 467 | goto leave; | 488 | goto leave; |
| 468 | } | 489 | } |
| 469 | 490 | ||
| 470 | le32_add_cpu(&fe->i_clusters, num_bits); | ||
| 471 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
| 472 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | ||
| 473 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
| 474 | |||
| 475 | status = ocfs2_journal_dirty(handle, fe_bh); | 491 | status = ocfs2_journal_dirty(handle, fe_bh); |
| 476 | if (status < 0) { | 492 | if (status < 0) { |
| 477 | mlog_errno(status); | 493 | mlog_errno(status); |
| @@ -479,6 +495,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | |||
| 479 | } | 495 | } |
| 480 | 496 | ||
| 481 | clusters_to_add -= num_bits; | 497 | clusters_to_add -= num_bits; |
| 498 | *logical_offset += num_bits; | ||
| 482 | 499 | ||
| 483 | if (clusters_to_add) { | 500 | if (clusters_to_add) { |
| 484 | mlog(0, "need to alloc once more, clusters = %u, wanted = " | 501 | mlog(0, "need to alloc once more, clusters = %u, wanted = " |
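With sparse files the logical insertion point can no longer be derived from i_clusters, hence the new *logical_offset in/out parameter above: each pass inserts at the current offset and advances it by however many bits the bitmap actually handed back. A toy model of that loop (the grant sizes are made up):

#include <stdio.h>

int main(void)
{
        unsigned logical = 40;                  /* *logical_offset on entry */
        unsigned to_add = 10;                   /* clusters_to_add */
        unsigned grants[] = { 4, 4, 2 };        /* per-pass allocator grants */
        unsigned i;

        for (i = 0; to_add && i < 3; i++) {
                unsigned bits = grants[i];

                printf("insert %u clusters at logical cluster %u\n",
                       bits, logical);
                to_add -= bits;
                logical += bits;        /* advance the insertion point */
        }
        return 0;
}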
| @@ -494,14 +511,87 @@ leave: | |||
| 494 | return status; | 511 | return status; |
| 495 | } | 512 | } |
| 496 | 513 | ||
| 514 | /* | ||
| 515 | * For a given allocation, determine which allocators will need to be | ||
| 516 | * accessed, and lock them, reserving the appropriate number of bits. | ||
| 517 | * | ||
| 518 | * Called from ocfs2_extend_allocation() for file systems which don't | ||
| 519 | * support holes, and from ocfs2_write() for file systems which | ||
| 520 | * understand sparse inodes. | ||
| 521 | */ | ||
| 522 | int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, | ||
| 523 | u32 clusters_to_add, | ||
| 524 | struct ocfs2_alloc_context **data_ac, | ||
| 525 | struct ocfs2_alloc_context **meta_ac) | ||
| 526 | { | ||
| 527 | int ret, num_free_extents; | ||
| 528 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 529 | |||
| 530 | *meta_ac = NULL; | ||
| 531 | *data_ac = NULL; | ||
| 532 | |||
| 533 | mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " | ||
| 534 | "clusters_to_add = %u\n", | ||
| 535 | (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), | ||
| 536 | le32_to_cpu(di->i_clusters), clusters_to_add); | ||
| 537 | |||
| 538 | num_free_extents = ocfs2_num_free_extents(osb, inode, di); | ||
| 539 | if (num_free_extents < 0) { | ||
| 540 | ret = num_free_extents; | ||
| 541 | mlog_errno(ret); | ||
| 542 | goto out; | ||
| 543 | } | ||
| 544 | |||
| 545 | /* | ||
| 546 | * Sparse allocation file systems need to be more conservative | ||
| 547 | * with reserving room for expansion - the actual allocation | ||
| 548 | * happens while we've got a journal handle open so re-taking | ||
| 549 | * a cluster lock (because we ran out of room for another | ||
| 550 | * extent) will violate ordering rules. | ||
| 551 | * | ||
| 552 | * Most of the time we'll only be seeing this 1 cluster at a time | ||
| 553 | * anyway. | ||
| 554 | */ | ||
| 555 | if (!num_free_extents || | ||
| 556 | (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) { | ||
| 557 | ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); | ||
| 558 | if (ret < 0) { | ||
| 559 | if (ret != -ENOSPC) | ||
| 560 | mlog_errno(ret); | ||
| 561 | goto out; | ||
| 562 | } | ||
| 563 | } | ||
| 564 | |||
| 565 | ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); | ||
| 566 | if (ret < 0) { | ||
| 567 | if (ret != -ENOSPC) | ||
| 568 | mlog_errno(ret); | ||
| 569 | goto out; | ||
| 570 | } | ||
| 571 | |||
| 572 | out: | ||
| 573 | if (ret) { | ||
| 574 | if (*meta_ac) { | ||
| 575 | ocfs2_free_alloc_context(*meta_ac); | ||
| 576 | *meta_ac = NULL; | ||
| 577 | } | ||
| 578 | |||
| 579 | /* | ||
| 580 | * We cannot have an error and a non null *data_ac. | ||
| 581 | */ | ||
| 582 | } | ||
| 583 | |||
| 584 | return ret; | ||
| 585 | } | ||
| 586 | |||
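The reservation decision in ocfs2_lock_allocators() boils down to one predicate: metadata (extent tree) space is reserved when the tree has no free records, and sparse file systems reserve pessimistically up front because no further cluster locks may be taken once the journal handle is open. A sketch of just that predicate, with the counts as plain inputs rather than kernel state:

#include <stdio.h>

static int need_meta_reservation(int num_free_extents,
                                 unsigned clusters_to_add, int sparse)
{
        /* Out of records, or sparse and possibly short of them. */
        return num_free_extents == 0 ||
               (sparse && (unsigned)num_free_extents < clusters_to_add);
}

int main(void)
{
        printf("%d\n", need_meta_reservation(2, 8, 0)); /* 0: room left   */
        printf("%d\n", need_meta_reservation(2, 8, 1)); /* 1: be cautious */
        return 0;
}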
| 497 | static int ocfs2_extend_allocation(struct inode *inode, | 587 | static int ocfs2_extend_allocation(struct inode *inode, |
| 498 | u32 clusters_to_add) | 588 | u32 clusters_to_add) |
| 499 | { | 589 | { |
| 500 | int status = 0; | 590 | int status = 0; |
| 501 | int restart_func = 0; | 591 | int restart_func = 0; |
| 502 | int drop_alloc_sem = 0; | 592 | int drop_alloc_sem = 0; |
| 503 | int credits, num_free_extents; | 593 | int credits; |
| 504 | u32 prev_clusters; | 594 | u32 prev_clusters, logical_start; |
| 505 | struct buffer_head *bh = NULL; | 595 | struct buffer_head *bh = NULL; |
| 506 | struct ocfs2_dinode *fe = NULL; | 596 | struct ocfs2_dinode *fe = NULL; |
| 507 | handle_t *handle = NULL; | 597 | handle_t *handle = NULL; |
| @@ -512,6 +602,12 @@ static int ocfs2_extend_allocation(struct inode *inode, | |||
| 512 | 602 | ||
| 513 | mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); | 603 | mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); |
| 514 | 604 | ||
| 605 | /* | ||
| 606 | * This function only exists for file systems which don't | ||
| 607 | * support holes. | ||
| 608 | */ | ||
| 609 | BUG_ON(ocfs2_sparse_alloc(osb)); | ||
| 610 | |||
| 515 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, | 611 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, |
| 516 | OCFS2_BH_CACHED, inode); | 612 | OCFS2_BH_CACHED, inode); |
| 517 | if (status < 0) { | 613 | if (status < 0) { |
| @@ -526,39 +622,11 @@ static int ocfs2_extend_allocation(struct inode *inode, | |||
| 526 | goto leave; | 622 | goto leave; |
| 527 | } | 623 | } |
| 528 | 624 | ||
| 625 | logical_start = OCFS2_I(inode)->ip_clusters; | ||
| 626 | |||
| 529 | restart_all: | 627 | restart_all: |
| 530 | BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); | 628 | BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); |
| 531 | 629 | ||
| 532 | mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, " | ||
| 533 | "clusters_to_add = %u\n", | ||
| 534 | (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), | ||
| 535 | fe->i_clusters, clusters_to_add); | ||
| 536 | |||
| 537 | num_free_extents = ocfs2_num_free_extents(osb, | ||
| 538 | inode, | ||
| 539 | fe); | ||
| 540 | if (num_free_extents < 0) { | ||
| 541 | status = num_free_extents; | ||
| 542 | mlog_errno(status); | ||
| 543 | goto leave; | ||
| 544 | } | ||
| 545 | |||
| 546 | if (!num_free_extents) { | ||
| 547 | status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac); | ||
| 548 | if (status < 0) { | ||
| 549 | if (status != -ENOSPC) | ||
| 550 | mlog_errno(status); | ||
| 551 | goto leave; | ||
| 552 | } | ||
| 553 | } | ||
| 554 | |||
| 555 | status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac); | ||
| 556 | if (status < 0) { | ||
| 557 | if (status != -ENOSPC) | ||
| 558 | mlog_errno(status); | ||
| 559 | goto leave; | ||
| 560 | } | ||
| 561 | |||
| 562 | /* blocks people in read/write from reading our allocation | 630 |
| 563 | * until we're done changing it. We depend on i_mutex to block | 631 | * until we're done changing it. We depend on i_mutex to block |
| 564 | * other extend/truncate calls while we're here. Ordering wrt | 632 | * other extend/truncate calls while we're here. Ordering wrt |
| @@ -566,6 +634,13 @@ restart_all: | |||
| 566 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | 634 | down_write(&OCFS2_I(inode)->ip_alloc_sem); |
| 567 | drop_alloc_sem = 1; | 635 | drop_alloc_sem = 1; |
| 568 | 636 | ||
| 637 | status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac, | ||
| 638 | &meta_ac); | ||
| 639 | if (status) { | ||
| 640 | mlog_errno(status); | ||
| 641 | goto leave; | ||
| 642 | } | ||
| 643 | |||
| 569 | credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); | 644 | credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); |
| 570 | handle = ocfs2_start_trans(osb, credits); | 645 | handle = ocfs2_start_trans(osb, credits); |
| 571 | if (IS_ERR(handle)) { | 646 | if (IS_ERR(handle)) { |
| @@ -590,6 +665,7 @@ restarted_transaction: | |||
| 590 | 665 | ||
| 591 | status = ocfs2_do_extend_allocation(osb, | 666 | status = ocfs2_do_extend_allocation(osb, |
| 592 | inode, | 667 | inode, |
| 668 | &logical_start, | ||
| 593 | clusters_to_add, | 669 | clusters_to_add, |
| 594 | bh, | 670 | bh, |
| 595 | handle, | 671 | handle, |
| @@ -778,7 +854,7 @@ static int ocfs2_extend_file(struct inode *inode, | |||
| 778 | size_t tail_to_skip) | 854 | size_t tail_to_skip) |
| 779 | { | 855 | { |
| 780 | int ret = 0; | 856 | int ret = 0; |
| 781 | u32 clusters_to_add; | 857 | u32 clusters_to_add = 0; |
| 782 | 858 | ||
| 783 | BUG_ON(!tail_to_skip && !di_bh); | 859 | BUG_ON(!tail_to_skip && !di_bh); |
| 784 | 860 | ||
| @@ -790,6 +866,11 @@ static int ocfs2_extend_file(struct inode *inode, | |||
| 790 | goto out; | 866 | goto out; |
| 791 | BUG_ON(new_i_size < i_size_read(inode)); | 867 | BUG_ON(new_i_size < i_size_read(inode)); |
| 792 | 868 | ||
| 869 | if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { | ||
| 870 | BUG_ON(tail_to_skip != 0); | ||
| 871 | goto out_update_size; | ||
| 872 | } | ||
| 873 | |||
| 793 | clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - | 874 | clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - |
| 794 | OCFS2_I(inode)->ip_clusters; | 875 | OCFS2_I(inode)->ip_clusters; |
| 795 | 876 | ||
| @@ -825,6 +906,7 @@ static int ocfs2_extend_file(struct inode *inode, | |||
| 825 | goto out_unlock; | 906 | goto out_unlock; |
| 826 | } | 907 | } |
| 827 | 908 | ||
| 909 | out_update_size: | ||
| 828 | if (!tail_to_skip) { | 910 | if (!tail_to_skip) { |
| 829 | /* We're being called from ocfs2_setattr() which wants | 911 | /* We're being called from ocfs2_setattr() which wants |
| 830 | * us to update i_size */ | 912 | * us to update i_size */ |
| @@ -834,7 +916,8 @@ static int ocfs2_extend_file(struct inode *inode, | |||
| 834 | } | 916 | } |
| 835 | 917 | ||
| 836 | out_unlock: | 918 | out_unlock: |
| 837 | ocfs2_data_unlock(inode, 1); | 919 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) |
| 920 | ocfs2_data_unlock(inode, 1); | ||
| 838 | 921 | ||
| 839 | out: | 922 | out: |
| 840 | return ret; | 923 | return ret; |
| @@ -972,7 +1055,8 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) | |||
| 972 | 1055 | ||
| 973 | ret = ocfs2_meta_lock(inode, NULL, 0); | 1056 | ret = ocfs2_meta_lock(inode, NULL, 0); |
| 974 | if (ret) { | 1057 | if (ret) { |
| 975 | mlog_errno(ret); | 1058 | if (ret != -ENOENT) |
| 1059 | mlog_errno(ret); | ||
| 976 | goto out; | 1060 | goto out; |
| 977 | } | 1061 | } |
| 978 | 1062 | ||
| @@ -1035,10 +1119,49 @@ out: | |||
| 1035 | return ret; | 1119 | return ret; |
| 1036 | } | 1120 | } |
| 1037 | 1121 | ||
| 1122 | /* | ||
| 1123 | * Will look for holes and unwritten extents in the range starting at | ||
| 1124 | * pos for count bytes (inclusive). | ||
| 1125 | */ | ||
| 1126 | static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, | ||
| 1127 | size_t count) | ||
| 1128 | { | ||
| 1129 | int ret = 0; | ||
| 1130 | unsigned int extent_flags; | ||
| 1131 | u32 cpos, clusters, extent_len, phys_cpos; | ||
| 1132 | struct super_block *sb = inode->i_sb; | ||
| 1133 | |||
| 1134 | cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; | ||
| 1135 | clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; | ||
| 1136 | |||
| 1137 | while (clusters) { | ||
| 1138 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, | ||
| 1139 | &extent_flags); | ||
| 1140 | if (ret < 0) { | ||
| 1141 | mlog_errno(ret); | ||
| 1142 | goto out; | ||
| 1143 | } | ||
| 1144 | |||
| 1145 | if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { | ||
| 1146 | ret = 1; | ||
| 1147 | break; | ||
| 1148 | } | ||
| 1149 | |||
| 1150 | if (extent_len > clusters) | ||
| 1151 | extent_len = clusters; | ||
| 1152 | |||
| 1153 | clusters -= extent_len; | ||
| 1154 | cpos += extent_len; | ||
| 1155 | } | ||
| 1156 | out: | ||
| 1157 | return ret; | ||
| 1158 | } | ||
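The subtle step in ocfs2_check_range_for_holes() is converting the byte range into a cluster range before the extent walk; a minimal userspace sketch of that conversion, assuming a hypothetical 4 KiB cluster size:

#include <stdio.h>
#include <stdint.h>

#define CBITS 12	/* hypothetical 4 KiB clusters */

static uint32_t clusters_for_bytes(uint64_t bytes)
{
	/* round up, as ocfs2_clusters_for_bytes() does */
	return (uint32_t)((bytes + (1ULL << CBITS) - 1) >> CBITS);
}

int main(void)
{
	uint64_t pos = 5000, count = 10000;
	uint32_t cpos = (uint32_t)(pos >> CBITS);
	uint32_t clusters = clusters_for_bytes(pos + count) - cpos;

	/* bytes 5000..14999 touch clusters 1, 2 and 3 */
	printf("cpos=%u clusters=%u\n", cpos, clusters);
	return 0;
}

The walk itself then clamps each extent_len to the clusters still outstanding and returns 1 as soon as it sees either a hole (phys_cpos == 0) or an unwritten extent.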
| 1159 | |||
| 1038 | static int ocfs2_prepare_inode_for_write(struct dentry *dentry, | 1160 | static int ocfs2_prepare_inode_for_write(struct dentry *dentry, |
| 1039 | loff_t *ppos, | 1161 | loff_t *ppos, |
| 1040 | size_t count, | 1162 | size_t count, |
| 1041 | int appending) | 1163 | int appending, |
| 1164 | int *direct_io) | ||
| 1042 | { | 1165 | { |
| 1043 | int ret = 0, meta_level = appending; | 1166 | int ret = 0, meta_level = appending; |
| 1044 | struct inode *inode = dentry->d_inode; | 1167 | struct inode *inode = dentry->d_inode; |
| @@ -1089,6 +1212,49 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, | |||
| 1089 | } else { | 1212 | } else { |
| 1090 | saved_pos = *ppos; | 1213 | saved_pos = *ppos; |
| 1091 | } | 1214 | } |
| 1215 | |||
| 1216 | if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { | ||
| 1217 | loff_t end = saved_pos + count; | ||
| 1218 | |||
| 1219 | /* | ||
| 1220 | * Skip the O_DIRECT checks if we don't need | ||
| 1221 | * them. | ||
| 1222 | */ | ||
| 1223 | if (!direct_io || !(*direct_io)) | ||
| 1224 | break; | ||
| 1225 | |||
| 1226 | /* | ||
| 1227 | * Allowing concurrent direct writes means | ||
| 1228 | * i_size changes wouldn't be synchronized, so | ||
| 1229 | * one node could wind up truncating another | ||
| 1230 | * node's writes. | ||
| 1231 | */ | ||
| 1232 | if (end > i_size_read(inode)) { | ||
| 1233 | *direct_io = 0; | ||
| 1234 | break; | ||
| 1235 | } | ||
| 1236 | |||
| 1237 | /* | ||
| 1238 | * We don't fill holes during direct io, so | ||
| 1239 | * check for them here. If any are found, the | ||
| 1240 | * caller will have to retake some cluster | ||
| 1241 | * locks and initiate the io as buffered. | ||
| 1242 | */ | ||
| 1243 | ret = ocfs2_check_range_for_holes(inode, saved_pos, | ||
| 1244 | count); | ||
| 1245 | if (ret == 1) { | ||
| 1246 | *direct_io = 0; | ||
| 1247 | ret = 0; | ||
| 1248 | } else if (ret < 0) | ||
| 1249 | mlog_errno(ret); | ||
| 1250 | break; | ||
| 1251 | } | ||
| 1252 | |||
| 1253 | /* | ||
| 1254 | * The rest of this loop is concerned with legacy file | ||
| 1255 | * systems which don't support sparse files. | ||
| 1256 | */ | ||
| 1257 | |||
| 1092 | newsize = count + saved_pos; | 1258 | newsize = count + saved_pos; |
| 1093 | 1259 | ||
| 1094 | mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", | 1260 | mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", |
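Condensed, the sparse-volume branch above only lets a write stay O_DIRECT when it neither grows the file nor crosses a hole; a hypothetical helper expressing that policy (the names are illustrative, not from the patch):

#include <stdio.h>

/* Returns 1 if the write may stay O_DIRECT, 0 to fall back to buffered. */
static int can_keep_direct_io(long long end, long long i_size,
			      int range_has_holes)
{
	if (end > i_size)	/* size changes must stay buffered so all
				 * cluster nodes agree on i_size */
		return 0;
	if (range_has_holes)	/* direct io never fills holes here */
		return 0;
	return 1;
}

int main(void)
{
	printf("%d\n", can_keep_direct_io(4096, 8192, 0));	/* 1 */
	printf("%d\n", can_keep_direct_io(9000, 8192, 0));	/* 0 */
	return 0;
}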
| @@ -1141,55 +1307,264 @@ out: | |||
| 1141 | return ret; | 1307 | return ret; |
| 1142 | } | 1308 | } |
| 1143 | 1309 | ||
| 1310 | static inline void | ||
| 1311 | ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) | ||
| 1312 | { | ||
| 1313 | const struct iovec *iov = *iovp; | ||
| 1314 | size_t base = *basep; | ||
| 1315 | |||
| 1316 | do { | ||
| 1317 | int copy = min(bytes, iov->iov_len - base); | ||
| 1318 | |||
| 1319 | bytes -= copy; | ||
| 1320 | base += copy; | ||
| 1321 | if (iov->iov_len == base) { | ||
| 1322 | iov++; | ||
| 1323 | base = 0; | ||
| 1324 | } | ||
| 1325 | } while (bytes); | ||
| 1326 | *iovp = iov; | ||
| 1327 | *basep = base; | ||
| 1328 | } | ||
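ocfs2_set_next_iovec() is nothing more than a cursor advance over an iovec array; a self-contained userspace equivalent, written with a while loop for clarity:

#include <stdio.h>
#include <sys/uio.h>

static void next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
{
	const struct iovec *iov = *iovp;
	size_t base = *basep;

	while (bytes) {
		size_t copy = iov->iov_len - base;

		if (copy > bytes)
			copy = bytes;
		bytes -= copy;
		base += copy;
		if (base == iov->iov_len) {	/* segment fully consumed */
			iov++;
			base = 0;
		}
	}
	*iovp = iov;
	*basep = base;
}

int main(void)
{
	char a[8], b[8];
	struct iovec v[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
	const struct iovec *cur = v;
	size_t off = 0;

	next_iovec(&cur, &off, 11);	/* all of a[], 3 bytes into b[] */
	printf("segment=%td offset=%zu\n", cur - v, off);
	return 0;
}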
| 1329 | |||
| 1330 | static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, | ||
| 1331 | const struct iovec *cur_iov, | ||
| 1332 | size_t iov_offset) | ||
| 1333 | { | ||
| 1334 | int ret; | ||
| 1335 | char *buf; | ||
| 1336 | struct page *src_page = NULL; | ||
| 1337 | |||
| 1338 | buf = cur_iov->iov_base + iov_offset; | ||
| 1339 | |||
| 1340 | if (!segment_eq(get_fs(), KERNEL_DS)) { | ||
| 1341 | /* | ||
| 1342 | * Pull in the user page. We want to do this outside | ||
| 1343 | * of the meta data locks in order to preserve locking | ||
| 1344 | * order in case of page fault. | ||
| 1345 | */ | ||
| 1346 | ret = get_user_pages(current, current->mm, | ||
| 1347 | (unsigned long)buf & PAGE_CACHE_MASK, 1, | ||
| 1348 | 0, 0, &src_page, NULL); | ||
| 1349 | if (ret == 1) | ||
| 1350 | bp->b_src_buf = kmap(src_page); | ||
| 1351 | else | ||
| 1352 | src_page = ERR_PTR(-EFAULT); | ||
| 1353 | } else { | ||
| 1354 | bp->b_src_buf = buf; | ||
| 1355 | } | ||
| 1356 | |||
| 1357 | return src_page; | ||
| 1358 | } | ||
| 1359 | |||
| 1360 | static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp, | ||
| 1361 | struct page *page) | ||
| 1362 | { | ||
| 1363 | if (page) { | ||
| 1364 | kunmap(page); | ||
| 1365 | page_cache_release(page); | ||
| 1366 | } | ||
| 1367 | } | ||
| 1368 | |||
| 1369 | static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, | ||
| 1370 | const struct iovec *iov, | ||
| 1371 | unsigned long nr_segs, | ||
| 1372 | size_t count, | ||
| 1373 | ssize_t o_direct_written) | ||
| 1374 | { | ||
| 1375 | int ret = 0; | ||
| 1376 | ssize_t copied, total = 0; | ||
| 1377 | size_t iov_offset = 0; | ||
| 1378 | const struct iovec *cur_iov = iov; | ||
| 1379 | struct ocfs2_buffered_write_priv bp; | ||
| 1380 | struct page *page; | ||
| 1381 | |||
| 1382 | /* | ||
| 1383 | * handle partial DIO write. Adjust cur_iov if needed. | ||
| 1384 | */ | ||
| 1385 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); | ||
| 1386 | |||
| 1387 | do { | ||
| 1388 | bp.b_cur_off = iov_offset; | ||
| 1389 | bp.b_cur_iov = cur_iov; | ||
| 1390 | |||
| 1391 | page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); | ||
| 1392 | if (IS_ERR(page)) { | ||
| 1393 | ret = PTR_ERR(page); | ||
| 1394 | goto out; | ||
| 1395 | } | ||
| 1396 | |||
| 1397 | copied = ocfs2_buffered_write_cluster(file, *ppos, count, | ||
| 1398 | ocfs2_map_and_write_user_data, | ||
| 1399 | &bp); | ||
| 1400 | |||
| 1401 | ocfs2_put_write_source(&bp, page); | ||
| 1402 | |||
| 1403 | if (copied < 0) { | ||
| 1404 | mlog_errno(copied); | ||
| 1405 | ret = copied; | ||
| 1406 | goto out; | ||
| 1407 | } | ||
| 1408 | |||
| 1409 | total += copied; | ||
| 1410 | *ppos = *ppos + copied; | ||
| 1411 | count -= copied; | ||
| 1412 | |||
| 1413 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); | ||
| 1414 | } while(count); | ||
| 1415 | |||
| 1416 | out: | ||
| 1417 | return total ? total : ret; | ||
| 1418 | } | ||
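The return statement above follows the usual partial-write contract: once any bytes have been copied, report those and let the error surface on the next call. A toy illustration with made-up values:

#include <stdio.h>

static long finish(long total, int ret)
{
	return total ? total : ret;
}

int main(void)
{
	printf("%ld\n", finish(4096, -28));	/* 4096: partial success wins */
	printf("%ld\n", finish(0, -28));	/* -28: nothing written, report -ENOSPC */
	return 0;
}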
| 1419 | |||
| 1420 | static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted, | ||
| 1421 | unsigned long *nr_segs) | ||
| 1422 | { | ||
| 1423 | size_t ocount; /* original count */ | ||
| 1424 | unsigned long seg; | ||
| 1425 | |||
| 1426 | ocount = 0; | ||
| 1427 | for (seg = 0; seg < *nr_segs; seg++) { | ||
| 1428 | const struct iovec *iv = &iov[seg]; | ||
| 1429 | |||
| 1430 | /* | ||
| 1431 | * If any segment has a negative length, or the cumulative | ||
| 1432 | * length ever wraps negative then return -EINVAL. | ||
| 1433 | */ | ||
| 1434 | ocount += iv->iov_len; | ||
| 1435 | if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) | ||
| 1436 | return -EINVAL; | ||
| 1437 | if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) | ||
| 1438 | continue; | ||
| 1439 | if (seg == 0) | ||
| 1440 | return -EFAULT; | ||
| 1441 | *nr_segs = seg; | ||
| 1442 | ocount -= iv->iov_len; /* This segment is no good */ | ||
| 1443 | break; | ||
| 1444 | } | ||
| 1445 | |||
| 1446 | *counted = ocount; | ||
| 1447 | return 0; | ||
| 1448 | } | ||
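The one non-obvious line in ocfs2_check_iovec() is the (ssize_t)(ocount|iv->iov_len) test, which rejects an impossibly long segment and a cumulative total past SSIZE_MAX in a single compare. A userspace demonstration with hypothetical lengths:

#include <stdio.h>
#include <limits.h>
#include <sys/types.h>

int main(void)
{
	size_t total = (size_t)SSIZE_MAX - 10;	/* running sum so far */
	size_t seg = 100;			/* next segment length */

	total += seg;	/* now past SSIZE_MAX: the sign bit is set */
	if ((ssize_t)(total | seg) < 0)
		printf("rejected: combined length overflows ssize_t\n");
	return 0;
}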
| 1449 | |||
| 1144 | static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, | 1450 | static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, |
| 1145 | const struct iovec *iov, | 1451 | const struct iovec *iov, |
| 1146 | unsigned long nr_segs, | 1452 | unsigned long nr_segs, |
| 1147 | loff_t pos) | 1453 | loff_t pos) |
| 1148 | { | 1454 | { |
| 1149 | int ret, rw_level, have_alloc_sem = 0; | 1455 | int ret, direct_io, appending, rw_level, have_alloc_sem = 0; |
| 1150 | struct file *filp = iocb->ki_filp; | 1456 | int can_do_direct, sync = 0; |
| 1151 | struct inode *inode = filp->f_path.dentry->d_inode; | 1457 | ssize_t written = 0; |
| 1152 | int appending = filp->f_flags & O_APPEND ? 1 : 0; | 1458 | size_t ocount; /* original count */ |
| 1153 | 1459 | size_t count; /* after file limit checks */ | |
| 1154 | mlog_entry("(0x%p, %u, '%.*s')\n", filp, | 1460 | loff_t *ppos = &iocb->ki_pos; |
| 1461 | struct file *file = iocb->ki_filp; | ||
| 1462 | struct inode *inode = file->f_path.dentry->d_inode; | ||
| 1463 | |||
| 1464 | mlog_entry("(0x%p, %u, '%.*s')\n", file, | ||
| 1155 | (unsigned int)nr_segs, | 1465 | (unsigned int)nr_segs, |
| 1156 | filp->f_path.dentry->d_name.len, | 1466 | file->f_path.dentry->d_name.len, |
| 1157 | filp->f_path.dentry->d_name.name); | 1467 | file->f_path.dentry->d_name.name); |
| 1158 | 1468 | ||
| 1159 | /* happy write of zero bytes */ | ||
| 1160 | if (iocb->ki_left == 0) | 1469 | if (iocb->ki_left == 0) |
| 1161 | return 0; | 1470 | return 0; |
| 1162 | 1471 | ||
| 1472 | ret = ocfs2_check_iovec(iov, &ocount, &nr_segs); | ||
| 1473 | if (ret) | ||
| 1474 | return ret; | ||
| 1475 | |||
| 1476 | count = ocount; | ||
| 1477 | |||
| 1478 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); | ||
| 1479 | |||
| 1480 | appending = file->f_flags & O_APPEND ? 1 : 0; | ||
| 1481 | direct_io = file->f_flags & O_DIRECT ? 1 : 0; | ||
| 1482 | |||
| 1163 | mutex_lock(&inode->i_mutex); | 1483 | mutex_lock(&inode->i_mutex); |
| 1484 | |||
| 1485 | relock: | ||
| 1164 | /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ | 1486 | /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ |
| 1165 | if (filp->f_flags & O_DIRECT) { | 1487 | if (direct_io) { |
| 1166 | have_alloc_sem = 1; | ||
| 1167 | down_read(&inode->i_alloc_sem); | 1488 | down_read(&inode->i_alloc_sem); |
| 1489 | have_alloc_sem = 1; | ||
| 1168 | } | 1490 | } |
| 1169 | 1491 | ||
| 1170 | /* concurrent O_DIRECT writes are allowed */ | 1492 | /* concurrent O_DIRECT writes are allowed */ |
| 1171 | rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; | 1493 | rw_level = !direct_io; |
| 1172 | ret = ocfs2_rw_lock(inode, rw_level); | 1494 | ret = ocfs2_rw_lock(inode, rw_level); |
| 1173 | if (ret < 0) { | 1495 | if (ret < 0) { |
| 1174 | rw_level = -1; | ||
| 1175 | mlog_errno(ret); | 1496 | mlog_errno(ret); |
| 1176 | goto out; | 1497 | goto out_sems; |
| 1177 | } | 1498 | } |
| 1178 | 1499 | ||
| 1179 | ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos, | 1500 | can_do_direct = direct_io; |
| 1180 | iocb->ki_left, appending); | 1501 | ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, |
| 1502 | iocb->ki_left, appending, | ||
| 1503 | &can_do_direct); | ||
| 1181 | if (ret < 0) { | 1504 | if (ret < 0) { |
| 1182 | mlog_errno(ret); | 1505 | mlog_errno(ret); |
| 1183 | goto out; | 1506 | goto out; |
| 1184 | } | 1507 | } |
| 1185 | 1508 | ||
| 1186 | /* communicate with ocfs2_dio_end_io */ | 1509 | /* |
| 1187 | ocfs2_iocb_set_rw_locked(iocb); | 1510 | * We can't complete the direct I/O as requested, fall back to |
| 1511 | * buffered I/O. | ||
| 1512 | */ | ||
| 1513 | if (direct_io && !can_do_direct) { | ||
| 1514 | ocfs2_rw_unlock(inode, rw_level); | ||
| 1515 | up_read(&inode->i_alloc_sem); | ||
| 1516 | |||
| 1517 | have_alloc_sem = 0; | ||
| 1518 | rw_level = -1; | ||
| 1188 | 1519 | ||
| 1189 | ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos); | 1520 | direct_io = 0; |
| 1521 | sync = 1; | ||
| 1522 | goto relock; | ||
| 1523 | } | ||
| 1524 | |||
| 1525 | if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) | ||
| 1526 | sync = 1; | ||
| 1527 | |||
| 1528 | /* | ||
| 1529 | * XXX: Is it ok to execute these checks a second time? | ||
| 1530 | */ | ||
| 1531 | ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); | ||
| 1532 | if (ret) | ||
| 1533 | goto out; | ||
| 1534 | |||
| 1535 | /* | ||
| 1536 | * Set pos so that sync_page_range_nolock() below understands | ||
| 1537 | * where to start from. We might've moved it around via the | ||
| 1538 | * calls above. The range we want to actually sync starts from | ||
| 1539 | * *ppos here. | ||
| 1540 | * | ||
| 1541 | */ | ||
| 1542 | pos = *ppos; | ||
| 1543 | |||
| 1544 | /* communicate with ocfs2_dio_end_io */ | ||
| 1545 | ocfs2_iocb_set_rw_locked(iocb, rw_level); | ||
| 1546 | |||
| 1547 | if (direct_io) { | ||
| 1548 | written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, | ||
| 1549 | ppos, count, ocount); | ||
| 1550 | if (written < 0) { | ||
| 1551 | ret = written; | ||
| 1552 | goto out_dio; | ||
| 1553 | } | ||
| 1554 | } else { | ||
| 1555 | written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs, | ||
| 1556 | count, written); | ||
| 1557 | if (written < 0) { | ||
| 1558 | ret = written; | ||
| 1559 | if (ret != -EFAULT && ret != -ENOSPC) | ||
| 1560 | mlog_errno(ret); | ||
| 1561 | goto out; | ||
| 1562 | } | ||
| 1563 | } | ||
| 1190 | 1564 | ||
| 1565 | out_dio: | ||
| 1191 | /* buffered aio wouldn't have proper lock coverage today */ | 1566 | /* buffered aio wouldn't have proper lock coverage today */ |
| 1192 | BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); | 1567 | BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); |
| 1193 | 1568 | ||
| 1194 | /* | 1569 | /* |
| 1195 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io | 1570 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io |
| @@ -1207,13 +1582,102 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, | |||
| 1207 | } | 1582 | } |
| 1208 | 1583 | ||
| 1209 | out: | 1584 | out: |
| 1585 | if (rw_level != -1) | ||
| 1586 | ocfs2_rw_unlock(inode, rw_level); | ||
| 1587 | |||
| 1588 | out_sems: | ||
| 1210 | if (have_alloc_sem) | 1589 | if (have_alloc_sem) |
| 1211 | up_read(&inode->i_alloc_sem); | 1590 | up_read(&inode->i_alloc_sem); |
| 1212 | if (rw_level != -1) | 1591 | |
| 1213 | ocfs2_rw_unlock(inode, rw_level); | 1592 | if (written > 0 && sync) { |
| 1593 | ssize_t err; | ||
| 1594 | |||
| 1595 | err = sync_page_range_nolock(inode, file->f_mapping, pos, count); | ||
| 1596 | if (err < 0) | ||
| 1597 | written = err; | ||
| 1598 | } | ||
| 1599 | |||
| 1214 | mutex_unlock(&inode->i_mutex); | 1600 | mutex_unlock(&inode->i_mutex); |
| 1215 | 1601 | ||
| 1216 | mlog_exit(ret); | 1602 | mlog_exit(ret); |
| 1603 | return written ? written : ret; | ||
| 1604 | } | ||
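The relock label above is a retry pattern worth spelling out: when O_DIRECT turns out to be impossible, the function drops rw_lock and i_alloc_sem entirely and reacquires them for buffered mode, rather than upgrading in place. A skeletal sketch of the control flow, with the lock calls stubbed out as comments:

#include <stdio.h>

static int holes_in_range = 1;	/* pretend the target range has a hole */

int main(void)
{
	int direct_io = 1, sync = 0;

	for (;;) {
		/* take i_alloc_sem and rw_lock for the current mode ... */
		int can_do_direct = direct_io && !holes_in_range;

		if (direct_io && !can_do_direct) {
			/* ... drop every lock before switching modes ... */
			direct_io = 0;
			sync = 1;	/* buffered fallback syncs by hand */
			continue;	/* i.e. "goto relock" */
		}
		break;
	}
	printf("%s io, sync=%d\n", direct_io ? "direct" : "buffered", sync);
	return 0;
}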
| 1605 | |||
| 1606 | static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, | ||
| 1607 | struct pipe_buffer *buf, | ||
| 1608 | struct splice_desc *sd) | ||
| 1609 | { | ||
| 1610 | int ret, count, total = 0; | ||
| 1611 | ssize_t copied = 0; | ||
| 1612 | struct ocfs2_splice_write_priv sp; | ||
| 1613 | |||
| 1614 | ret = buf->ops->pin(pipe, buf); | ||
| 1615 | if (ret) | ||
| 1616 | goto out; | ||
| 1617 | |||
| 1618 | sp.s_sd = sd; | ||
| 1619 | sp.s_buf = buf; | ||
| 1620 | sp.s_pipe = pipe; | ||
| 1621 | sp.s_offset = sd->pos & ~PAGE_CACHE_MASK; | ||
| 1622 | sp.s_buf_offset = buf->offset; | ||
| 1623 | |||
| 1624 | count = sd->len; | ||
| 1625 | if (count + sp.s_offset > PAGE_CACHE_SIZE) | ||
| 1626 | count = PAGE_CACHE_SIZE - sp.s_offset; | ||
| 1627 | |||
| 1628 | do { | ||
| 1629 | /* | ||
| 1630 | * splice wants us to copy up to one page at a | ||
| 1631 | * time. For pagesize > cluster size, this means we | ||
| 1632 | * might enter ocfs2_buffered_write_cluster() more | ||
| 1633 | * than once, so keep track of our progress here. | ||
| 1634 | */ | ||
| 1635 | copied = ocfs2_buffered_write_cluster(sd->file, | ||
| 1636 | (loff_t)sd->pos + total, | ||
| 1637 | count, | ||
| 1638 | ocfs2_map_and_write_splice_data, | ||
| 1639 | &sp); | ||
| 1640 | if (copied < 0) { | ||
| 1641 | mlog_errno(copied); | ||
| 1642 | ret = copied; | ||
| 1643 | goto out; | ||
| 1644 | } | ||
| 1645 | |||
| 1646 | count -= copied; | ||
| 1647 | sp.s_offset += copied; | ||
| 1648 | sp.s_buf_offset += copied; | ||
| 1649 | total += copied; | ||
| 1650 | } while (count); | ||
| 1651 | |||
| 1652 | ret = 0; | ||
| 1653 | out: | ||
| 1654 | |||
| 1655 | return total ? total : ret; | ||
| 1656 | } | ||
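The clamp at the top of the actor keeps every pass inside one page even when the splice descriptor spans more; a worked example with a hypothetical 4 KiB page size:

#include <stdio.h>

#define PAGE_SIZE 4096

int main(void)
{
	long long pos = 12300;				/* sd->pos */
	unsigned int count = 6000;			/* sd->len */
	unsigned int offset = pos & (PAGE_SIZE - 1);	/* 12300 % 4096 = 12 */

	if (count + offset > PAGE_SIZE)
		count = PAGE_SIZE - offset;	/* 4084: stop at the page end */

	printf("offset=%u count=%u\n", offset, count);
	return 0;
}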
| 1657 | |||
| 1658 | static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, | ||
| 1659 | struct file *out, | ||
| 1660 | loff_t *ppos, | ||
| 1661 | size_t len, | ||
| 1662 | unsigned int flags) | ||
| 1663 | { | ||
| 1664 | int ret, err; | ||
| 1665 | struct address_space *mapping = out->f_mapping; | ||
| 1666 | struct inode *inode = mapping->host; | ||
| 1667 | |||
| 1668 | ret = __splice_from_pipe(pipe, out, ppos, len, flags, | ||
| 1669 | ocfs2_splice_write_actor); | ||
| 1670 | if (ret > 0) { | ||
| 1671 | *ppos += ret; | ||
| 1672 | |||
| 1673 | if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { | ||
| 1674 | err = generic_osync_inode(inode, mapping, | ||
| 1675 | OSYNC_METADATA|OSYNC_DATA); | ||
| 1676 | if (err) | ||
| 1677 | ret = err; | ||
| 1678 | } | ||
| 1679 | } | ||
| 1680 | |||
| 1217 | return ret; | 1681 | return ret; |
| 1218 | } | 1682 | } |
| 1219 | 1683 | ||
| @@ -1239,14 +1703,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, | |||
| 1239 | goto out; | 1703 | goto out; |
| 1240 | } | 1704 | } |
| 1241 | 1705 | ||
| 1242 | ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0); | 1706 | ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, |
| 1707 | NULL); | ||
| 1243 | if (ret < 0) { | 1708 | if (ret < 0) { |
| 1244 | mlog_errno(ret); | 1709 | mlog_errno(ret); |
| 1245 | goto out_unlock; | 1710 | goto out_unlock; |
| 1246 | } | 1711 | } |
| 1247 | 1712 | ||
| 1248 | /* ok, we're done with i_size and alloc work */ | 1713 | /* ok, we're done with i_size and alloc work */ |
| 1249 | ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); | 1714 | ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags); |
| 1250 | 1715 | ||
| 1251 | out_unlock: | 1716 | out_unlock: |
| 1252 | ocfs2_rw_unlock(inode, 1); | 1717 | ocfs2_rw_unlock(inode, 1); |
| @@ -1323,7 +1788,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, | |||
| 1323 | } | 1788 | } |
| 1324 | rw_level = 0; | 1789 | rw_level = 0; |
| 1325 | /* communicate with ocfs2_dio_end_io */ | 1790 | /* communicate with ocfs2_dio_end_io */ |
| 1326 | ocfs2_iocb_set_rw_locked(iocb); | 1791 | ocfs2_iocb_set_rw_locked(iocb, rw_level); |
| 1327 | } | 1792 | } |
| 1328 | 1793 | ||
| 1329 | /* | 1794 | /* |
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index cc973f01f6ce..2c4460fced52 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h | |||
| @@ -39,12 +39,17 @@ enum ocfs2_alloc_restarted { | |||
| 39 | }; | 39 | }; |
| 40 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | 40 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, |
| 41 | struct inode *inode, | 41 | struct inode *inode, |
| 42 | u32 *cluster_start, | ||
| 42 | u32 clusters_to_add, | 43 | u32 clusters_to_add, |
| 43 | struct buffer_head *fe_bh, | 44 | struct buffer_head *fe_bh, |
| 44 | handle_t *handle, | 45 | handle_t *handle, |
| 45 | struct ocfs2_alloc_context *data_ac, | 46 | struct ocfs2_alloc_context *data_ac, |
| 46 | struct ocfs2_alloc_context *meta_ac, | 47 | struct ocfs2_alloc_context *meta_ac, |
| 47 | enum ocfs2_alloc_restarted *reason); | 48 | enum ocfs2_alloc_restarted *reason); |
| 49 | int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, | ||
| 50 | u32 clusters_to_add, | ||
| 51 | struct ocfs2_alloc_context **data_ac, | ||
| 52 | struct ocfs2_alloc_context **meta_ac); | ||
| 48 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); | 53 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); |
| 49 | int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, | 54 | int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, |
| 50 | struct kstat *stat); | 55 | struct kstat *stat); |
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 28ab56f2b98c..21a605079c62 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c | |||
| @@ -89,24 +89,6 @@ void ocfs2_set_inode_flags(struct inode *inode) | |||
| 89 | inode->i_flags |= S_DIRSYNC; | 89 | inode->i_flags |= S_DIRSYNC; |
| 90 | } | 90 | } |
| 91 | 91 | ||
| 92 | struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, | ||
| 93 | u64 blkno, | ||
| 94 | int delete_vote) | ||
| 95 | { | ||
| 96 | struct ocfs2_find_inode_args args; | ||
| 97 | |||
| 98 | /* ocfs2_ilookup_for_vote should *only* be called from the | ||
| 99 | * vote thread */ | ||
| 100 | BUG_ON(current != osb->vote_task); | ||
| 101 | |||
| 102 | args.fi_blkno = blkno; | ||
| 103 | args.fi_flags = OCFS2_FI_FLAG_NOWAIT; | ||
| 104 | if (delete_vote) | ||
| 105 | args.fi_flags |= OCFS2_FI_FLAG_DELETE; | ||
| 106 | args.fi_ino = ino_from_blkno(osb->sb, blkno); | ||
| 107 | return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args); | ||
| 108 | } | ||
| 109 | |||
| 110 | struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) | 92 | struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) |
| 111 | { | 93 | { |
| 112 | struct inode *inode = NULL; | 94 | struct inode *inode = NULL; |
| @@ -182,28 +164,6 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque) | |||
| 182 | if (oi->ip_blkno != args->fi_blkno) | 164 | if (oi->ip_blkno != args->fi_blkno) |
| 183 | goto bail; | 165 | goto bail; |
| 184 | 166 | ||
| 185 | /* OCFS2_FI_FLAG_NOWAIT is *only* set from | ||
| 186 | * ocfs2_ilookup_for_vote which won't create an inode for one | ||
| 187 | * that isn't found. The vote thread which doesn't want to get | ||
| 188 | * an inode which is in the process of going away - otherwise | ||
| 189 | * the call to __wait_on_freeing_inode in find_inode_fast will | ||
| 190 | * cause it to deadlock on an inode which may be waiting on a | ||
| 191 | * vote (or lock release) in delete_inode */ | ||
| 192 | if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) && | ||
| 193 | (inode->i_state & (I_FREEING|I_CLEAR))) { | ||
| 194 | /* As stated above, we're not going to return an | ||
| 195 | * inode. In the case of a delete vote, the voting | ||
| 196 | * code is going to signal the other node to go | ||
| 197 | * ahead. Mark that state here, so this freeing inode | ||
| 198 | * has the state when it gets to delete_inode. */ | ||
| 199 | if (args->fi_flags & OCFS2_FI_FLAG_DELETE) { | ||
| 200 | spin_lock(&oi->ip_lock); | ||
| 201 | ocfs2_mark_inode_remotely_deleted(inode); | ||
| 202 | spin_unlock(&oi->ip_lock); | ||
| 203 | } | ||
| 204 | goto bail; | ||
| 205 | } | ||
| 206 | |||
| 207 | ret = 1; | 167 | ret = 1; |
| 208 | bail: | 168 | bail: |
| 209 | mlog_exit(ret); | 169 | mlog_exit(ret); |
| @@ -261,6 +221,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | |||
| 261 | goto bail; | 221 | goto bail; |
| 262 | } | 222 | } |
| 263 | 223 | ||
| 224 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | ||
| 225 | OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); | ||
| 226 | |||
| 264 | inode->i_version = 1; | 227 | inode->i_version = 1; |
| 265 | inode->i_generation = le32_to_cpu(fe->i_generation); | 228 | inode->i_generation = le32_to_cpu(fe->i_generation); |
| 266 | inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); | 229 | inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); |
| @@ -272,8 +235,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | |||
| 272 | if (S_ISLNK(inode->i_mode) && !fe->i_clusters) | 235 | if (S_ISLNK(inode->i_mode) && !fe->i_clusters) |
| 273 | inode->i_blocks = 0; | 236 | inode->i_blocks = 0; |
| 274 | else | 237 | else |
| 275 | inode->i_blocks = | 238 | inode->i_blocks = ocfs2_inode_sector_count(inode); |
| 276 | ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size)); | ||
| 277 | inode->i_mapping->a_ops = &ocfs2_aops; | 239 | inode->i_mapping->a_ops = &ocfs2_aops; |
| 278 | inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); | 240 | inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); |
| 279 | inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); | 241 | inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); |
| @@ -288,10 +250,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | |||
| 288 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 250 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
| 289 | (unsigned long long)fe->i_blkno); | 251 | (unsigned long long)fe->i_blkno); |
| 290 | 252 | ||
| 291 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | ||
| 292 | OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT; | ||
| 293 | OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); | ||
| 294 | |||
| 295 | inode->i_nlink = le16_to_cpu(fe->i_links_count); | 253 | inode->i_nlink = le16_to_cpu(fe->i_links_count); |
| 296 | 254 | ||
| 297 | if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) | 255 | if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) |
| @@ -347,6 +305,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | |||
| 347 | 305 | ||
| 348 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, | 306 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, |
| 349 | OCFS2_LOCK_TYPE_META, 0, inode); | 307 | OCFS2_LOCK_TYPE_META, 0, inode); |
| 308 | |||
| 309 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, | ||
| 310 | OCFS2_LOCK_TYPE_OPEN, 0, inode); | ||
| 350 | } | 311 | } |
| 351 | 312 | ||
| 352 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, | 313 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, |
| @@ -421,7 +382,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, | |||
| 421 | * cluster lock before trusting anything anyway. | 382 | * cluster lock before trusting anything anyway. |
| 422 | */ | 383 | */ |
| 423 | can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) | 384 | can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) |
| 424 | && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK) | 385 | && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) |
| 425 | && !ocfs2_mount_local(osb); | 386 | && !ocfs2_mount_local(osb); |
| 426 | 387 | ||
| 427 | /* | 388 | /* |
| @@ -438,7 +399,17 @@ static int ocfs2_read_locked_inode(struct inode *inode, | |||
| 438 | OCFS2_LOCK_TYPE_META, | 399 | OCFS2_LOCK_TYPE_META, |
| 439 | generation, inode); | 400 | generation, inode); |
| 440 | 401 | ||
| 402 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, | ||
| 403 | OCFS2_LOCK_TYPE_OPEN, | ||
| 404 | 0, inode); | ||
| 405 | |||
| 441 | if (can_lock) { | 406 | if (can_lock) { |
| 407 | status = ocfs2_open_lock(inode); | ||
| 408 | if (status) { | ||
| 409 | make_bad_inode(inode); | ||
| 410 | mlog_errno(status); | ||
| 411 | return status; | ||
| 412 | } | ||
| 442 | status = ocfs2_meta_lock(inode, NULL, 0); | 413 | status = ocfs2_meta_lock(inode, NULL, 0); |
| 443 | if (status) { | 414 | if (status) { |
| 444 | make_bad_inode(inode); | 415 | make_bad_inode(inode); |
| @@ -447,6 +418,14 @@ static int ocfs2_read_locked_inode(struct inode *inode, | |||
| 447 | } | 418 | } |
| 448 | } | 419 | } |
| 449 | 420 | ||
| 421 | if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) { | ||
| 422 | status = ocfs2_try_open_lock(inode, 0); | ||
| 423 | if (status) { | ||
| 424 | make_bad_inode(inode); | ||
| 425 | return status; | ||
| 426 | } | ||
| 427 | } | ||
| 428 | |||
| 450 | status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, | 429 | status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, |
| 451 | can_lock ? inode : NULL); | 430 | can_lock ? inode : NULL); |
| 452 | if (status < 0) { | 431 | if (status < 0) { |
| @@ -507,50 +486,56 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, | |||
| 507 | struct buffer_head *fe_bh) | 486 | struct buffer_head *fe_bh) |
| 508 | { | 487 | { |
| 509 | int status = 0; | 488 | int status = 0; |
| 510 | handle_t *handle = NULL; | ||
| 511 | struct ocfs2_truncate_context *tc = NULL; | 489 | struct ocfs2_truncate_context *tc = NULL; |
| 512 | struct ocfs2_dinode *fe; | 490 | struct ocfs2_dinode *fe; |
| 491 | handle_t *handle = NULL; | ||
| 513 | 492 | ||
| 514 | mlog_entry_void(); | 493 | mlog_entry_void(); |
| 515 | 494 | ||
| 516 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | 495 | fe = (struct ocfs2_dinode *) fe_bh->b_data; |
| 517 | 496 | ||
| 518 | /* zero allocation, zero truncate :) */ | 497 | if (fe->i_clusters) { |
| 519 | if (!fe->i_clusters) | 498 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
| 520 | goto bail; | 499 | if (IS_ERR(handle)) { |
| 500 | status = PTR_ERR(handle); | ||
| 501 | mlog_errno(status); | ||
| 502 | goto out; | ||
| 503 | } | ||
| 521 | 504 | ||
| 522 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 505 | status = ocfs2_journal_access(handle, inode, fe_bh, |
| 523 | if (IS_ERR(handle)) { | 506 | OCFS2_JOURNAL_ACCESS_WRITE); |
| 524 | status = PTR_ERR(handle); | 507 | if (status < 0) { |
| 525 | handle = NULL; | 508 | mlog_errno(status); |
| 526 | mlog_errno(status); | 509 | goto out; |
| 527 | goto bail; | 510 | } |
| 528 | } | ||
| 529 | 511 | ||
| 530 | status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL); | 512 | i_size_write(inode, 0); |
| 531 | if (status < 0) { | ||
| 532 | mlog_errno(status); | ||
| 533 | goto bail; | ||
| 534 | } | ||
| 535 | 513 | ||
| 536 | ocfs2_commit_trans(osb, handle); | 514 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); |
| 537 | handle = NULL; | 515 | if (status < 0) { |
| 516 | mlog_errno(status); | ||
| 517 | goto out; | ||
| 518 | } | ||
| 538 | 519 | ||
| 539 | status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); | 520 | ocfs2_commit_trans(osb, handle); |
| 540 | if (status < 0) { | 521 | handle = NULL; |
| 541 | mlog_errno(status); | ||
| 542 | goto bail; | ||
| 543 | } | ||
| 544 | 522 | ||
| 545 | status = ocfs2_commit_truncate(osb, inode, fe_bh, tc); | 523 | status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); |
| 546 | if (status < 0) { | 524 | if (status < 0) { |
| 547 | mlog_errno(status); | 525 | mlog_errno(status); |
| 548 | goto bail; | 526 | goto out; |
| 527 | } | ||
| 528 | |||
| 529 | status = ocfs2_commit_truncate(osb, inode, fe_bh, tc); | ||
| 530 | if (status < 0) { | ||
| 531 | mlog_errno(status); | ||
| 532 | goto out; | ||
| 533 | } | ||
| 549 | } | 534 | } |
| 550 | bail: | 535 | |
| 536 | out: | ||
| 551 | if (handle) | 537 | if (handle) |
| 552 | ocfs2_commit_trans(osb, handle); | 538 | ocfs2_commit_trans(osb, handle); |
| 553 | |||
| 554 | mlog_exit(status); | 539 | mlog_exit(status); |
| 555 | return status; | 540 | return status; |
| 556 | } | 541 | } |
| @@ -678,10 +663,10 @@ static int ocfs2_wipe_inode(struct inode *inode, | |||
| 678 | struct inode *orphan_dir_inode = NULL; | 663 | struct inode *orphan_dir_inode = NULL; |
| 679 | struct buffer_head *orphan_dir_bh = NULL; | 664 | struct buffer_head *orphan_dir_bh = NULL; |
| 680 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 665 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 666 | struct ocfs2_dinode *di; | ||
| 681 | 667 | ||
| 682 | /* We've already voted on this so it should be readonly - no | 668 | di = (struct ocfs2_dinode *) di_bh->b_data; |
| 683 | * spinlock needed. */ | 669 | orphaned_slot = le16_to_cpu(di->i_orphaned_slot); |
| 684 | orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; | ||
| 685 | 670 | ||
| 686 | status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); | 671 | status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); |
| 687 | if (status) | 672 | if (status) |
| @@ -839,11 +824,20 @@ static int ocfs2_query_inode_wipe(struct inode *inode, | |||
| 839 | goto bail; | 824 | goto bail; |
| 840 | } | 825 | } |
| 841 | 826 | ||
| 842 | status = ocfs2_request_delete_vote(inode); | 827 | /* |
| 843 | /* -EBUSY means that other nodes are still using the | 828 | * This is how ocfs2 determines whether an inode is still live |
| 844 | * inode. We're done here though, so avoid doing anything on | 829 | * within the cluster. Every node takes a shared read lock on |
| 845 | * disk and let them worry about deleting it. */ | 830 | * the inode open lock in ocfs2_read_locked_inode(). When we |
| 846 | if (status == -EBUSY) { | 831 | * get to ->delete_inode(), each node tries to convert it's |
| 832 | * lock to an exclusive. Trylocks are serialized by the inode | ||
| 833 | * meta data lock. If the upconvert suceeds, we know the inode | ||
| 834 | * is no longer live and can be deleted. | ||
| 835 | * | ||
| 836 | * Though we call this with the meta data lock held, the | ||
| 837 | * trylock keeps us from ABBA deadlock. | ||
| 838 | */ | ||
| 839 | status = ocfs2_try_open_lock(inode, 1); | ||
| 840 | if (status == -EAGAIN) { | ||
| 847 | status = 0; | 841 | status = 0; |
| 848 | mlog(0, "Skipping delete of %llu because it is in use on" | 842 | mlog(0, "Skipping delete of %llu because it is in use on" |
| 849 | "other nodes\n", (unsigned long long)oi->ip_blkno); | 843 | "other nodes\n", (unsigned long long)oi->ip_blkno); |
| @@ -854,21 +848,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode, | |||
| 854 | goto bail; | 848 | goto bail; |
| 855 | } | 849 | } |
| 856 | 850 | ||
| 857 | spin_lock(&oi->ip_lock); | 851 | *wipe = 1; |
| 858 | if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) { | 852 | mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n", |
| 859 | /* Nobody knew which slot this inode was orphaned | 853 | (unsigned long long)oi->ip_blkno, |
| 860 | * into. This may happen during node death and | 854 | le16_to_cpu(di->i_orphaned_slot)); |
| 861 | * recovery knows how to clean it up so we can safely | ||
| 862 | * ignore this inode for now on. */ | ||
| 863 | mlog(0, "Nobody knew where inode %llu was orphaned!\n", | ||
| 864 | (unsigned long long)oi->ip_blkno); | ||
| 865 | } else { | ||
| 866 | *wipe = 1; | ||
| 867 | |||
| 868 | mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n", | ||
| 869 | (unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot); | ||
| 870 | } | ||
| 871 | spin_unlock(&oi->ip_lock); | ||
| 872 | 855 | ||
| 873 | bail: | 856 | bail: |
| 874 | return status; | 857 | return status; |
| @@ -1001,11 +984,16 @@ void ocfs2_clear_inode(struct inode *inode) | |||
| 1001 | mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, | 984 | mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, |
| 1002 | "Inode=%lu\n", inode->i_ino); | 985 | "Inode=%lu\n", inode->i_ino); |
| 1003 | 986 | ||
| 987 | /* With the delete_inode vote gone, we took the open lock | ||
| 988 | * earlier; now drop both the PR and EX open locks. */ | ||
| 989 | ocfs2_open_unlock(inode); | ||
| 990 | |||
| 1004 | /* Do these before all the other work so that we don't bounce | 991 | /* Do these before all the other work so that we don't bounce |
| 1005 | * the vote thread while waiting to destroy the locks. */ | 992 | * the vote thread while waiting to destroy the locks. */ |
| 1006 | ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); | 993 | ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); |
| 1007 | ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); | 994 | ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); |
| 1008 | ocfs2_mark_lockres_freeing(&oi->ip_data_lockres); | 995 | ocfs2_mark_lockres_freeing(&oi->ip_data_lockres); |
| 996 | ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); | ||
| 1009 | 997 | ||
| 1010 | /* We very well may get a clear_inode before all an inodes | 998 | /* We very well may get a clear_inode before all an inodes |
| 1011 | * metadata has hit disk. Of course, we can't drop any cluster | 999 | * metadata has hit disk. Of course, we can't drop any cluster |
| @@ -1020,8 +1008,7 @@ void ocfs2_clear_inode(struct inode *inode) | |||
| 1020 | "Clear inode of %llu, inode has io markers\n", | 1008 | "Clear inode of %llu, inode has io markers\n", |
| 1021 | (unsigned long long)oi->ip_blkno); | 1009 | (unsigned long long)oi->ip_blkno); |
| 1022 | 1010 | ||
| 1023 | ocfs2_extent_map_drop(inode, 0); | 1011 | ocfs2_extent_map_trunc(inode, 0); |
| 1024 | ocfs2_extent_map_init(inode); | ||
| 1025 | 1012 | ||
| 1026 | status = ocfs2_drop_inode_locks(inode); | 1013 | status = ocfs2_drop_inode_locks(inode); |
| 1027 | if (status < 0) | 1014 | if (status < 0) |
| @@ -1030,6 +1017,7 @@ void ocfs2_clear_inode(struct inode *inode) | |||
| 1030 | ocfs2_lock_res_free(&oi->ip_rw_lockres); | 1017 | ocfs2_lock_res_free(&oi->ip_rw_lockres); |
| 1031 | ocfs2_lock_res_free(&oi->ip_meta_lockres); | 1018 | ocfs2_lock_res_free(&oi->ip_meta_lockres); |
| 1032 | ocfs2_lock_res_free(&oi->ip_data_lockres); | 1019 | ocfs2_lock_res_free(&oi->ip_data_lockres); |
| 1020 | ocfs2_lock_res_free(&oi->ip_open_lockres); | ||
| 1033 | 1021 | ||
| 1034 | ocfs2_metadata_cache_purge(inode); | 1022 | ocfs2_metadata_cache_purge(inode); |
| 1035 | 1023 | ||
| @@ -1086,9 +1074,6 @@ void ocfs2_drop_inode(struct inode *inode) | |||
| 1086 | mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n", | 1074 | mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n", |
| 1087 | (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); | 1075 | (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); |
| 1088 | 1076 | ||
| 1089 | /* Testing ip_orphaned_slot here wouldn't work because we may | ||
| 1090 | * not have gotten a delete_inode vote from any other nodes | ||
| 1091 | * yet. */ | ||
| 1092 | if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) | 1077 | if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) |
| 1093 | generic_delete_inode(inode); | 1078 | generic_delete_inode(inode); |
| 1094 | else | 1079 | else |
| @@ -1121,8 +1106,8 @@ struct buffer_head *ocfs2_bread(struct inode *inode, | |||
| 1121 | return NULL; | 1106 | return NULL; |
| 1122 | } | 1107 | } |
| 1123 | 1108 | ||
| 1124 | tmperr = ocfs2_extent_map_get_blocks(inode, block, 1, | 1109 | tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, |
| 1125 | &p_blkno, NULL); | 1110 | NULL); |
| 1126 | if (tmperr < 0) { | 1111 | if (tmperr < 0) { |
| 1127 | mlog_errno(tmperr); | 1112 | mlog_errno(tmperr); |
| 1128 | goto fail; | 1113 | goto fail; |
| @@ -1259,7 +1244,7 @@ void ocfs2_refresh_inode(struct inode *inode, | |||
| 1259 | if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) | 1244 | if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) |
| 1260 | inode->i_blocks = 0; | 1245 | inode->i_blocks = 0; |
| 1261 | else | 1246 | else |
| 1262 | inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode)); | 1247 | inode->i_blocks = ocfs2_inode_sector_count(inode); |
| 1263 | inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); | 1248 | inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); |
| 1264 | inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); | 1249 | inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); |
| 1265 | inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); | 1250 | inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); |
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 1a7dd2945b34..03ae075869ee 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h | |||
| @@ -26,6 +26,8 @@ | |||
| 26 | #ifndef OCFS2_INODE_H | 26 | #ifndef OCFS2_INODE_H |
| 27 | #define OCFS2_INODE_H | 27 | #define OCFS2_INODE_H |
| 28 | 28 | ||
| 29 | #include "extent_map.h" | ||
| 30 | |||
| 29 | /* OCFS2 Inode Private Data */ | 31 | /* OCFS2 Inode Private Data */ |
| 30 | struct ocfs2_inode_info | 32 | struct ocfs2_inode_info |
| 31 | { | 33 | { |
| @@ -34,6 +36,7 @@ struct ocfs2_inode_info | |||
| 34 | struct ocfs2_lock_res ip_rw_lockres; | 36 | struct ocfs2_lock_res ip_rw_lockres; |
| 35 | struct ocfs2_lock_res ip_meta_lockres; | 37 | struct ocfs2_lock_res ip_meta_lockres; |
| 36 | struct ocfs2_lock_res ip_data_lockres; | 38 | struct ocfs2_lock_res ip_data_lockres; |
| 39 | struct ocfs2_lock_res ip_open_lockres; | ||
| 37 | 40 | ||
| 38 | /* protects allocation changes on this inode. */ | 41 | /* protects allocation changes on this inode. */ |
| 39 | struct rw_semaphore ip_alloc_sem; | 42 | struct rw_semaphore ip_alloc_sem; |
| @@ -42,9 +45,7 @@ struct ocfs2_inode_info | |||
| 42 | spinlock_t ip_lock; | 45 | spinlock_t ip_lock; |
| 43 | u32 ip_open_count; | 46 | u32 ip_open_count; |
| 44 | u32 ip_clusters; | 47 | u32 ip_clusters; |
| 45 | struct ocfs2_extent_map ip_map; | ||
| 46 | struct list_head ip_io_markers; | 48 | struct list_head ip_io_markers; |
| 47 | int ip_orphaned_slot; | ||
| 48 | 49 | ||
| 49 | struct mutex ip_io_mutex; | 50 | struct mutex ip_io_mutex; |
| 50 | 51 | ||
| @@ -64,6 +65,8 @@ struct ocfs2_inode_info | |||
| 64 | 65 | ||
| 65 | struct ocfs2_caching_info ip_metadata_cache; | 66 | struct ocfs2_caching_info ip_metadata_cache; |
| 66 | 67 | ||
| 68 | struct ocfs2_extent_map ip_extent_map; | ||
| 69 | |||
| 67 | struct inode vfs_inode; | 70 | struct inode vfs_inode; |
| 68 | }; | 71 | }; |
| 69 | 72 | ||
| @@ -117,14 +120,9 @@ void ocfs2_delete_inode(struct inode *inode); | |||
| 117 | void ocfs2_drop_inode(struct inode *inode); | 120 | void ocfs2_drop_inode(struct inode *inode); |
| 118 | 121 | ||
| 119 | /* Flags for ocfs2_iget() */ | 122 | /* Flags for ocfs2_iget() */ |
| 120 | #define OCFS2_FI_FLAG_NOWAIT 0x1 | 123 | #define OCFS2_FI_FLAG_SYSFILE 0x4 |
| 121 | #define OCFS2_FI_FLAG_DELETE 0x2 | 124 | #define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8 |
| 122 | #define OCFS2_FI_FLAG_SYSFILE 0x4 | ||
| 123 | #define OCFS2_FI_FLAG_NOLOCK 0x8 | ||
| 124 | struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags); | 125 | struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags); |
| 125 | struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, | ||
| 126 | u64 blkno, | ||
| 127 | int delete_vote); | ||
| 128 | int ocfs2_inode_init_private(struct inode *inode); | 126 | int ocfs2_inode_init_private(struct inode *inode); |
| 129 | int ocfs2_inode_revalidate(struct dentry *dentry); | 127 | int ocfs2_inode_revalidate(struct dentry *dentry); |
| 130 | int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | 128 | int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, |
| @@ -144,4 +142,11 @@ int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); | |||
| 144 | 142 | ||
| 145 | void ocfs2_set_inode_flags(struct inode *inode); | 143 | void ocfs2_set_inode_flags(struct inode *inode); |
| 146 | 144 | ||
| 145 | static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode) | ||
| 146 | { | ||
| 147 | int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9; | ||
| 148 | |||
| 149 | return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits); | ||
| 150 | } | ||
| 151 | |||
| 147 | #endif /* OCFS2_INODE_H */ | 152 | #endif /* OCFS2_INODE_H */ |
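ocfs2_inode_sector_count() above converts allocated clusters straight to 512-byte sectors, which keeps i_blocks honest on sparse files where size and allocation no longer track each other. A worked example assuming a hypothetical 64 KiB cluster size:

#include <stdio.h>

int main(void)
{
	unsigned int clustersize_bits = 16;	/* hypothetical 64 KiB clusters */
	unsigned int c_to_s_bits = clustersize_bits - 9;
	unsigned long long ip_clusters = 3;

	/* 3 clusters * (65536 / 512) = 384 sectors */
	printf("%llu sectors\n", ip_clusters << c_to_s_bits);
	return 0;
}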
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 825cb0ae1b4c..5a8a90d1c787 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
| @@ -649,29 +649,20 @@ bail: | |||
| 649 | static int ocfs2_force_read_journal(struct inode *inode) | 649 | static int ocfs2_force_read_journal(struct inode *inode) |
| 650 | { | 650 | { |
| 651 | int status = 0; | 651 | int status = 0; |
| 652 | int i, p_blocks; | 652 | int i; |
| 653 | u64 v_blkno, p_blkno; | 653 | u64 v_blkno, p_blkno, p_blocks, num_blocks; |
| 654 | #define CONCURRENT_JOURNAL_FILL 32 | 654 | #define CONCURRENT_JOURNAL_FILL 32ULL |
| 655 | struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; | 655 | struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; |
| 656 | 656 | ||
| 657 | mlog_entry_void(); | 657 | mlog_entry_void(); |
| 658 | 658 | ||
| 659 | BUG_ON(inode->i_blocks != | ||
| 660 | ocfs2_align_bytes_to_sectors(i_size_read(inode))); | ||
| 661 | |||
| 662 | memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); | 659 | memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); |
| 663 | 660 | ||
| 664 | mlog(0, "Force reading %llu blocks\n", | 661 | num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size); |
| 665 | (unsigned long long)(inode->i_blocks >> | ||
| 666 | (inode->i_sb->s_blocksize_bits - 9))); | ||
| 667 | |||
| 668 | v_blkno = 0; | 662 | v_blkno = 0; |
| 669 | while (v_blkno < | 663 | while (v_blkno < num_blocks) { |
| 670 | (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) { | ||
| 671 | |||
| 672 | status = ocfs2_extent_map_get_blocks(inode, v_blkno, | 664 | status = ocfs2_extent_map_get_blocks(inode, v_blkno, |
| 673 | 1, &p_blkno, | 665 | &p_blkno, &p_blocks, NULL); |
| 674 | &p_blocks); | ||
| 675 | if (status < 0) { | 666 | if (status < 0) { |
| 676 | mlog_errno(status); | 667 | mlog_errno(status); |
| 677 | goto bail; | 668 | goto bail; |
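The rewritten loop batches journal readahead by whatever length the extent map reports at each virtual offset; the shape of the iteration, with the lookup stubbed to a fixed answer (the CONCURRENT_JOURNAL_FILL cap sits outside the hunk shown, so it is assumed context here):

#include <stdio.h>

#define BATCH 32ULL	/* stands in for CONCURRENT_JOURNAL_FILL */

/* Stub extent lookup: claims every extent runs 50 blocks. */
static unsigned long long extent_blocks_at(unsigned long long v_blkno)
{
	(void)v_blkno;
	return 50;
}

int main(void)
{
	unsigned long long v_blkno = 0, num_blocks = 100;

	while (v_blkno < num_blocks) {
		unsigned long long p_blocks = extent_blocks_at(v_blkno);

		if (p_blocks > BATCH)
			p_blocks = BATCH;		 /* cap the batch */
		if (p_blocks > num_blocks - v_blkno)
			p_blocks = num_blocks - v_blkno; /* don't overrun */

		printf("read %llu blocks at %llu\n", p_blocks, v_blkno);
		v_blkno += p_blocks;
	}
	return 0;
}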
| @@ -1306,7 +1297,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, | |||
| 1306 | continue; | 1297 | continue; |
| 1307 | 1298 | ||
| 1308 | iter = ocfs2_iget(osb, le64_to_cpu(de->inode), | 1299 | iter = ocfs2_iget(osb, le64_to_cpu(de->inode), |
| 1309 | OCFS2_FI_FLAG_NOLOCK); | 1300 | OCFS2_FI_FLAG_ORPHAN_RECOVERY); |
| 1310 | if (IS_ERR(iter)) | 1301 | if (IS_ERR(iter)) |
| 1311 | continue; | 1302 | continue; |
| 1312 | 1303 | ||
| @@ -1418,7 +1409,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, | |||
| 1418 | /* Set the proper information to get us going into | 1409 | /* Set the proper information to get us going into |
| 1419 | * ocfs2_delete_inode. */ | 1410 | * ocfs2_delete_inode. */ |
| 1420 | oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; | 1411 | oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; |
| 1421 | oi->ip_orphaned_slot = slot; | ||
| 1422 | spin_unlock(&oi->ip_lock); | 1412 | spin_unlock(&oi->ip_lock); |
| 1423 | 1413 | ||
| 1424 | iput(inode); | 1414 | iput(inode); |
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index d026b4f27757..3db5de4506da 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h | |||
| @@ -390,7 +390,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, | |||
| 390 | /* We may be deleting metadata blocks, so metadata alloc dinode + | 390 | /* We may be deleting metadata blocks, so metadata alloc dinode + |
| 391 | one desc. block for each possible delete. */ | 391 | one desc. block for each possible delete. */ |
| 392 | if (tree_depth && next_free == 1 && | 392 | if (tree_depth && next_free == 1 && |
| 393 | le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del) | 393 | ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del) |
| 394 | credits += 1 + tree_depth; | 394 | credits += 1 + tree_depth; |
| 395 | 395 | ||
| 396 | /* update to the truncate log. */ | 396 | /* update to the truncate log. */ |
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 51b020447683..af01158b39f5 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c | |||
| @@ -85,8 +85,11 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 85 | int ret = 0, lock_level = 0; | 85 | int ret = 0, lock_level = 0; |
| 86 | struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); | 86 | struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); |
| 87 | 87 | ||
| 88 | /* We don't want to support shared writable mappings yet. */ | 88 | /* |
| 89 | if (!ocfs2_mount_local(osb) && | 89 | * Only support shared writeable mmap for local mounts which |
| 90 | * don't know about holes. | ||
| 91 | */ | ||
| 92 | if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) && | ||
| 90 | ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && | 93 | ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && |
| 91 | ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { | 94 | ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { |
| 92 | mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); | 95 | mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); |
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 28dd757ff67d..2bcf353fd7c5 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c | |||
| @@ -175,8 +175,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, | |||
| 175 | 175 | ||
| 176 | inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); | 176 | inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); |
| 177 | if (IS_ERR(inode)) { | 177 | if (IS_ERR(inode)) { |
| 178 | mlog(ML_ERROR, "Unable to create inode %llu\n", | ||
| 179 | (unsigned long long)blkno); | ||
| 180 | ret = ERR_PTR(-EACCES); | 178 | ret = ERR_PTR(-EACCES); |
| 181 | goto bail_unlock; | 179 | goto bail_unlock; |
| 182 | } | 180 | } |
| @@ -189,7 +187,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, | |||
| 189 | * unlink. */ | 187 | * unlink. */ |
| 190 | spin_lock(&oi->ip_lock); | 188 | spin_lock(&oi->ip_lock); |
| 191 | oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED; | 189 | oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED; |
| 192 | oi->ip_orphaned_slot = OCFS2_INVALID_SLOT; | ||
| 193 | spin_unlock(&oi->ip_lock); | 190 | spin_unlock(&oi->ip_lock); |
| 194 | 191 | ||
| 195 | bail_add: | 192 | bail_add: |
| @@ -288,7 +285,7 @@ static int ocfs2_fill_new_dir(struct ocfs2_super *osb, | |||
| 288 | 285 | ||
| 289 | i_size_write(inode, inode->i_sb->s_blocksize); | 286 | i_size_write(inode, inode->i_sb->s_blocksize); |
| 290 | inode->i_nlink = 2; | 287 | inode->i_nlink = 2; |
| 291 | inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize); | 288 | inode->i_blocks = ocfs2_inode_sector_count(inode); |
| 292 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); | 289 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); |
| 293 | if (status < 0) { | 290 | if (status < 0) { |
| 294 | mlog_errno(status); | 291 | mlog_errno(status); |
| @@ -1486,8 +1483,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb, | |||
| 1486 | struct buffer_head **bhs = NULL; | 1483 | struct buffer_head **bhs = NULL; |
| 1487 | const char *c; | 1484 | const char *c; |
| 1488 | struct super_block *sb = osb->sb; | 1485 | struct super_block *sb = osb->sb; |
| 1489 | u64 p_blkno; | 1486 | u64 p_blkno, p_blocks; |
| 1490 | int p_blocks; | ||
| 1491 | int virtual, blocks, status, i, bytes_left; | 1487 | int virtual, blocks, status, i, bytes_left; |
| 1492 | 1488 | ||
| 1493 | bytes_left = i_size_read(inode) + 1; | 1489 | bytes_left = i_size_read(inode) + 1; |
| @@ -1514,8 +1510,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb, | |||
| 1514 | goto bail; | 1510 | goto bail; |
| 1515 | } | 1511 | } |
| 1516 | 1512 | ||
| 1517 | status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno, | 1513 | status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks, |
| 1518 | &p_blocks); | 1514 | NULL); |
| 1519 | if (status < 0) { | 1515 | if (status < 0) { |
| 1520 | mlog_errno(status); | 1516 | mlog_errno(status); |
| 1521 | goto bail; | 1517 | goto bail; |
| @@ -1674,8 +1670,11 @@ static int ocfs2_symlink(struct inode *dir, | |||
| 1674 | inode->i_rdev = 0; | 1670 | inode->i_rdev = 0; |
| 1675 | newsize = l - 1; | 1671 | newsize = l - 1; |
| 1676 | if (l > ocfs2_fast_symlink_chars(sb)) { | 1672 | if (l > ocfs2_fast_symlink_chars(sb)) { |
| 1673 | u32 offset = 0; | ||
| 1674 | |||
| 1677 | inode->i_op = &ocfs2_symlink_inode_operations; | 1675 | inode->i_op = &ocfs2_symlink_inode_operations; |
| 1678 | status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh, | 1676 | status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, |
| 1677 | new_fe_bh, | ||
| 1679 | handle, data_ac, NULL, | 1678 | handle, data_ac, NULL, |
| 1680 | NULL); | 1679 | NULL); |
| 1681 | if (status < 0) { | 1680 | if (status < 0) { |
| @@ -1689,7 +1688,7 @@ static int ocfs2_symlink(struct inode *dir, | |||
| 1689 | goto bail; | 1688 | goto bail; |
| 1690 | } | 1689 | } |
| 1691 | i_size_write(inode, newsize); | 1690 | i_size_write(inode, newsize); |
| 1692 | inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize); | 1691 | inode->i_blocks = ocfs2_inode_sector_count(inode); |
| 1693 | } else { | 1692 | } else { |
| 1694 | inode->i_op = &ocfs2_fast_symlink_inode_operations; | 1693 | inode->i_op = &ocfs2_fast_symlink_inode_operations; |
| 1695 | memcpy((char *) fe->id2.i_symlink, symname, l); | 1694 | memcpy((char *) fe->id2.i_symlink, symname, l); |
| @@ -2222,9 +2221,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, | |||
| 2222 | /* Record which orphan dir our inode now resides | 2221 | /* Record which orphan dir our inode now resides |
| 2223 | * in. delete_inode will use this to determine which orphan | 2222 | * in. delete_inode will use this to determine which orphan |
| 2224 | * dir to lock. */ | 2223 | * dir to lock. */ |
| 2225 | spin_lock(&OCFS2_I(inode)->ip_lock); | 2224 | fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); |
| 2226 | OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num; | ||
| 2227 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
| 2228 | 2225 | ||
| 2229 | mlog(0, "Inode %llu orphaned in slot %d\n", | 2226 | mlog(0, "Inode %llu orphaned in slot %d\n", |
| 2230 | (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); | 2227 | (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); |
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index db8e77cd35d3..82cc92dcf8a6 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
| @@ -46,11 +46,6 @@ | |||
| 46 | #include "endian.h" | 46 | #include "endian.h" |
| 47 | #include "ocfs2_lockid.h" | 47 | #include "ocfs2_lockid.h" |
| 48 | 48 | ||
| 49 | struct ocfs2_extent_map { | ||
| 50 | u32 em_clusters; | ||
| 51 | struct rb_root em_extents; | ||
| 52 | }; | ||
| 53 | |||
| 54 | /* Most user visible OCFS2 inodes will have very few pieces of | 49 | /* Most user visible OCFS2 inodes will have very few pieces of |
| 55 | * metadata, but larger files (including bitmaps, etc) must be taken | 50 | * metadata, but larger files (including bitmaps, etc) must be taken |
| 56 | * into account when designing an access scheme. We allow a small | 51 | * into account when designing an access scheme. We allow a small |
| @@ -303,6 +298,13 @@ static inline int ocfs2_should_order_data(struct inode *inode) | |||
| 303 | return 1; | 298 | return 1; |
| 304 | } | 299 | } |
| 305 | 300 | ||
| 301 | static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb) | ||
| 302 | { | ||
| 303 | if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) | ||
| 304 | return 1; | ||
| 305 | return 0; | ||
| 306 | } | ||
| 307 | |||
| 306 | /* set / clear functions because cluster events can make these happen | 308 | /* set / clear functions because cluster events can make these happen |
| 307 | * in parallel so we want the transitions to be atomic. this also | 309 | * in parallel so we want the transitions to be atomic. this also |
| 308 | * means that any future flags osb_flags must be protected by spinlock | 310 | * means that any future flags osb_flags must be protected by spinlock |
| @@ -461,6 +463,49 @@ static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes) | |||
| 461 | return (unsigned long)((bytes + 511) >> 9); | 463 | return (unsigned long)((bytes + 511) >> 9); |
| 462 | } | 464 | } |
| 463 | 465 | ||
| 466 | static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb, | ||
| 467 | unsigned long pg_index) | ||
| 468 | { | ||
| 469 | u32 clusters = pg_index; | ||
| 470 | unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; | ||
| 471 | |||
| 472 | if (unlikely(PAGE_CACHE_SHIFT > cbits)) | ||
| 473 | clusters = pg_index << (PAGE_CACHE_SHIFT - cbits); | ||
| 474 | else if (PAGE_CACHE_SHIFT < cbits) | ||
| 475 | clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT); | ||
| 476 | |||
| 477 | return clusters; | ||
| 478 | } | ||
| 479 | |||
| 480 | /* | ||
| 481 | * Find the 1st page index which covers the given clusters. | ||
| 482 | */ | ||
| 483 | static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_block *sb, | ||
| 484 | u32 clusters) | ||
| 485 | { | ||
| 486 | unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; | ||
| 487 | unsigned long index = clusters; | ||
| 488 | |||
| 489 | if (PAGE_CACHE_SHIFT > cbits) { | ||
| 490 | index = clusters >> (PAGE_CACHE_SHIFT - cbits); | ||
| 491 | } else if (PAGE_CACHE_SHIFT < cbits) { | ||
| 492 | index = clusters << (cbits - PAGE_CACHE_SHIFT); | ||
| 493 | } | ||
| 494 | |||
| 495 | return index; | ||
| 496 | } | ||
| 497 | |||
| 498 | static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) | ||
| 499 | { | ||
| 500 | unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; | ||
| 501 | unsigned int pages_per_cluster = 1; | ||
| 502 | |||
| 503 | if (PAGE_CACHE_SHIFT < cbits) | ||
| 504 | pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT); | ||
| 505 | |||
| 506 | return pages_per_cluster; | ||
| 507 | } | ||
| 508 | |||
| 464 | #define ocfs2_set_bit ext2_set_bit | 509 | #define ocfs2_set_bit ext2_set_bit |
| 465 | #define ocfs2_clear_bit ext2_clear_bit | 510 | #define ocfs2_clear_bit ext2_clear_bit |
| 466 | #define ocfs2_test_bit ext2_test_bit | 511 | #define ocfs2_test_bit ext2_test_bit |
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index e61e218f5e0b..71306479c68f 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h | |||
| @@ -86,7 +86,8 @@ | |||
| 86 | OCFS2_SB(sb)->s_feature_incompat &= ~(mask) | 86 | OCFS2_SB(sb)->s_feature_incompat &= ~(mask) |
| 87 | 87 | ||
| 88 | #define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB | 88 | #define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB |
| 89 | #define OCFS2_FEATURE_INCOMPAT_SUPP OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT | 89 | #define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ |
| 90 | | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) | ||
| 90 | #define OCFS2_FEATURE_RO_COMPAT_SUPP 0 | 91 | #define OCFS2_FEATURE_RO_COMPAT_SUPP 0 |
| 91 | 92 | ||
| 92 | /* | 93 | /* |
| @@ -155,6 +156,12 @@ | |||
| 155 | #define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */ | 156 | #define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */ |
| 156 | 157 | ||
| 157 | /* | 158 | /* |
| 159 | * Extent record flags (e_node.leaf.flags) | ||
| 160 | */ | ||
| 161 | #define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but | ||
| 162 | * unwritten */ | ||
| 163 | |||
| 164 | /* | ||
| 158 | * ioctl commands | 165 | * ioctl commands |
| 159 | */ | 166 | */ |
| 160 | #define OCFS2_IOC_GETFLAGS _IOR('f', 1, long) | 167 | #define OCFS2_IOC_GETFLAGS _IOR('f', 1, long) |
| @@ -282,10 +289,21 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { | |||
| 282 | /* | 289 | /* |
| 283 | * On disk extent record for OCFS2 | 290 | * On disk extent record for OCFS2 |
| 284 | * It describes a range of clusters on disk. | 291 | * It describes a range of clusters on disk. |
| 292 | * | ||
| 293 | * Length fields are divided into interior and leaf node versions. | ||
| 294 | * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes. | ||
| 285 | */ | 295 | */ |
| 286 | struct ocfs2_extent_rec { | 296 | struct ocfs2_extent_rec { |
| 287 | /*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ | 297 | /*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ |
| 288 | __le32 e_clusters; /* Clusters covered by this extent */ | 298 | union { |
| 299 | __le32 e_int_clusters; /* Clusters covered by all children */ | ||
| 300 | struct { | ||
| 301 | __le16 e_leaf_clusters; /* Clusters covered by this | ||
| 302 | extent */ | ||
| 303 | __u8 e_reserved1; | ||
| 304 | __u8 e_flags; /* Extent flags */ | ||
| 305 | }; | ||
| 306 | }; | ||
| 289 | __le64 e_blkno; /* Physical disk offset, in blocks */ | 307 | __le64 e_blkno; /* Physical disk offset, in blocks */ |
| 290 | /*10*/ | 308 | /*10*/ |
| 291 | }; | 309 | }; |
| @@ -311,7 +329,10 @@ struct ocfs2_extent_list { | |||
| 311 | /*00*/ __le16 l_tree_depth; /* Extent tree depth from this | 329 | /*00*/ __le16 l_tree_depth; /* Extent tree depth from this |
| 312 | point. 0 means data extents | 330 | point. 0 means data extents |
| 313 | hang directly off this | 331 | hang directly off this |
| 314 | header (a leaf) */ | 332 | header (a leaf) |
| 333 | NOTE: The high 8 bits cannot be | ||
| 334 | used - tree_depth is never that big. | ||
| 335 | */ | ||
| 315 | __le16 l_count; /* Number of extent records */ | 336 | __le16 l_count; /* Number of extent records */ |
| 316 | __le16 l_next_free_rec; /* Next unused extent slot */ | 337 | __le16 l_next_free_rec; /* Next unused extent slot */ |
| 317 | __le16 l_reserved1; | 338 | __le16 l_reserved1; |
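Because only leaf records carry the 16-bit length plus flags layout, code reading an extent record has to pick the length field based on the extent list's tree depth. A sketch of such an accessor (an assumption modeled on the structures above, not quoted from the patch):

```c
/* Sketch: interior records count the clusters under all children in a
 * full 32-bit field; leaf records use 16 bits plus e_flags (e.g.
 * OCFS2_EXT_UNWRITTEN). l_tree_depth says which layout applies. */
static u32 ext_rec_clusters(struct ocfs2_extent_list *el,
			    struct ocfs2_extent_rec *rec)
{
	if (le16_to_cpu(el->l_tree_depth))
		return le32_to_cpu(rec->e_int_clusters);
	return le16_to_cpu(rec->e_leaf_clusters);
}
```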
| @@ -446,7 +467,9 @@ struct ocfs2_dinode { | |||
| 446 | __le32 i_ctime_nsec; | 467 | __le32 i_ctime_nsec; |
| 447 | __le32 i_mtime_nsec; | 468 | __le32 i_mtime_nsec; |
| 448 | __le32 i_attr; | 469 | __le32 i_attr; |
| 449 | __le32 i_reserved1; | 470 | __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL |
| 471 | was set in i_flags */ | ||
| 472 | __le16 i_reserved1; | ||
| 450 | /*70*/ __le64 i_reserved2[8]; | 473 | /*70*/ __le64 i_reserved2[8]; |
| 451 | /*B8*/ union { | 474 | /*B8*/ union { |
| 452 | __le64 i_pad1; /* Generic way to refer to this | 475 | __le64 i_pad1; /* Generic way to refer to this |
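Recording the orphan slot in the dinode itself, as the ocfs2_orphan_add() hunk earlier in this diff now does, means any node can later find the right orphan directory by reading the inode from disk instead of asking the cluster. A hedged sketch of the read side (an assumption, not code from the patch):

```c
/* Sketch: per the field comment above, the slot is only meaningful
 * while OCFS2_ORPHANED_FL is set in i_flags. */
static int dinode_orphaned_slot(struct ocfs2_dinode *fe)
{
	if (!(le32_to_cpu(fe->i_flags) & OCFS2_ORPHANED_FL))
		return OCFS2_INVALID_SLOT;
	return le16_to_cpu(fe->i_orphaned_slot);
}
```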
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index 4d5d5655c185..4ca02b1c38ac 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h | |||
| @@ -44,6 +44,7 @@ enum ocfs2_lock_type { | |||
| 44 | OCFS2_LOCK_TYPE_RENAME, | 44 | OCFS2_LOCK_TYPE_RENAME, |
| 45 | OCFS2_LOCK_TYPE_RW, | 45 | OCFS2_LOCK_TYPE_RW, |
| 46 | OCFS2_LOCK_TYPE_DENTRY, | 46 | OCFS2_LOCK_TYPE_DENTRY, |
| 47 | OCFS2_LOCK_TYPE_OPEN, | ||
| 47 | OCFS2_NUM_LOCK_TYPES | 48 | OCFS2_NUM_LOCK_TYPES |
| 48 | }; | 49 | }; |
| 49 | 50 | ||
| @@ -69,6 +70,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) | |||
| 69 | case OCFS2_LOCK_TYPE_DENTRY: | 70 | case OCFS2_LOCK_TYPE_DENTRY: |
| 70 | c = 'N'; | 71 | c = 'N'; |
| 71 | break; | 72 | break; |
| 73 | case OCFS2_LOCK_TYPE_OPEN: | ||
| 74 | c = 'O'; | ||
| 75 | break; | ||
| 72 | default: | 76 | default: |
| 73 | c = '\0'; | 77 | c = '\0'; |
| 74 | } | 78 | } |
| @@ -85,6 +89,7 @@ static char *ocfs2_lock_type_strings[] = { | |||
| 85 | * important job it does, anyway. */ | 89 | * important job it does, anyway. */ |
| 86 | [OCFS2_LOCK_TYPE_RW] = "Write/Read", | 90 | [OCFS2_LOCK_TYPE_RW] = "Write/Read", |
| 87 | [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", | 91 | [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", |
| 92 | [OCFS2_LOCK_TYPE_OPEN] = "Open", | ||
| 88 | }; | 93 | }; |
| 89 | 94 | ||
| 90 | static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) | 95 | static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) |
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 2d3ac32cb74e..d921a28329dc 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c | |||
| @@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) | |||
| 197 | goto bail; | 197 | goto bail; |
| 198 | } | 198 | } |
| 199 | 199 | ||
| 200 | status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL); | 200 | status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL); |
| 201 | if (status < 0) { | 201 | if (status < 0) { |
| 202 | mlog_errno(status); | 202 | mlog_errno(status); |
| 203 | goto bail; | 203 | goto bail; |
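The extent-map lookup no longer takes a count of blocks to map up front; judging by the call sites in this merge, it instead reports the contiguous run length and the extent's flags through optional out-parameters, both passed as NULL here. The declaration below is inferred from those call sites, not quoted from the patch:

```c
/* Inferred shape of the reworked lookup (an assumption): map virtual
 * block v_blkno of the inode to a physical block in *p_blkno, and
 * optionally return the contiguous run length and OCFS2_EXT_* flags. */
int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno,
				u64 *p_blkno, u64 *ret_count,
				unsigned int *extent_flags);
```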
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 6dbb11762759..0da655ae5d6f 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c | |||
| @@ -381,8 +381,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, | |||
| 381 | le32_to_cpu(fe->i_clusters))); | 381 | le32_to_cpu(fe->i_clusters))); |
| 382 | spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); | 382 | spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); |
| 383 | i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); | 383 | i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); |
| 384 | alloc_inode->i_blocks = | 384 | alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); |
| 385 | ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode)); | ||
| 386 | 385 | ||
| 387 | status = 0; | 386 | status = 0; |
| 388 | bail: | 387 | bail: |
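Both this site and the symlink path at the top of the section switch i_blocks over to ocfs2_inode_sector_count(), which is defined elsewhere in this series. The motivation is sparse files: i_size may now span holes with no backing storage, so the block count has to come from allocated clusters rather than file size. A sketch of the likely shape of the helper (an assumption based on the series description, not quoted from this diff):

```c
/* Assumed shape of the helper: convert the inode's allocated clusters
 * straight to 512-byte sectors, ignoring any holes under i_size. */
static inline unsigned long ocfs2_inode_sector_count(struct inode *inode)
{
	int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;

	return (unsigned long)OCFS2_I(inode)->ip_clusters << c_to_s_bits;
}
```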
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 6534f92424dd..5c9e8243691f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
| @@ -806,9 +806,6 @@ static int __init ocfs2_init(void) | |||
| 806 | 806 | ||
| 807 | ocfs2_print_version(); | 807 | ocfs2_print_version(); |
| 808 | 808 | ||
| 809 | if (init_ocfs2_extent_maps()) | ||
| 810 | return -ENOMEM; | ||
| 811 | |||
| 812 | status = init_ocfs2_uptodate_cache(); | 809 | status = init_ocfs2_uptodate_cache(); |
| 813 | if (status < 0) { | 810 | if (status < 0) { |
| 814 | mlog_errno(status); | 811 | mlog_errno(status); |
| @@ -837,7 +834,6 @@ leave: | |||
| 837 | if (status < 0) { | 834 | if (status < 0) { |
| 838 | ocfs2_free_mem_caches(); | 835 | ocfs2_free_mem_caches(); |
| 839 | exit_ocfs2_uptodate_cache(); | 836 | exit_ocfs2_uptodate_cache(); |
| 840 | exit_ocfs2_extent_maps(); | ||
| 841 | } | 837 | } |
| 842 | 838 | ||
| 843 | mlog_exit(status); | 839 | mlog_exit(status); |
| @@ -863,8 +859,6 @@ static void __exit ocfs2_exit(void) | |||
| 863 | 859 | ||
| 864 | unregister_filesystem(&ocfs2_fs_type); | 860 | unregister_filesystem(&ocfs2_fs_type); |
| 865 | 861 | ||
| 866 | exit_ocfs2_extent_maps(); | ||
| 867 | |||
| 868 | exit_ocfs2_uptodate_cache(); | 862 | exit_ocfs2_uptodate_cache(); |
| 869 | 863 | ||
| 870 | mlog_exit_void(); | 864 | mlog_exit_void(); |
| @@ -963,6 +957,7 @@ static void ocfs2_inode_init_once(void *data, | |||
| 963 | ocfs2_lock_res_init_once(&oi->ip_rw_lockres); | 957 | ocfs2_lock_res_init_once(&oi->ip_rw_lockres); |
| 964 | ocfs2_lock_res_init_once(&oi->ip_meta_lockres); | 958 | ocfs2_lock_res_init_once(&oi->ip_meta_lockres); |
| 965 | ocfs2_lock_res_init_once(&oi->ip_data_lockres); | 959 | ocfs2_lock_res_init_once(&oi->ip_data_lockres); |
| 960 | ocfs2_lock_res_init_once(&oi->ip_open_lockres); | ||
| 966 | 961 | ||
| 967 | ocfs2_metadata_cache_init(&oi->vfs_inode); | 962 | ocfs2_metadata_cache_init(&oi->vfs_inode); |
| 968 | 963 | ||
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c index f30e63b9910c..4f82a2f0efef 100644 --- a/fs/ocfs2/vote.c +++ b/fs/ocfs2/vote.c | |||
| @@ -63,17 +63,10 @@ struct ocfs2_msg_hdr | |||
| 63 | __be32 h_node_num; /* node sending this particular message. */ | 63 | __be32 h_node_num; /* node sending this particular message. */ |
| 64 | }; | 64 | }; |
| 65 | 65 | ||
| 66 | /* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this | ||
| 67 | * for the network. */ | ||
| 68 | #define OCFS2_VOTE_FILENAME_LEN 256 | ||
| 69 | struct ocfs2_vote_msg | 66 | struct ocfs2_vote_msg |
| 70 | { | 67 | { |
| 71 | struct ocfs2_msg_hdr v_hdr; | 68 | struct ocfs2_msg_hdr v_hdr; |
| 72 | union { | 69 | __be32 v_reserved1; |
| 73 | __be32 v_generic1; | ||
| 74 | __be32 v_orphaned_slot; /* Used during delete votes */ | ||
| 75 | __be32 v_nlink; /* Used during unlink votes */ | ||
| 76 | } md1; /* Message type dependent 1 */ | ||
| 77 | }; | 70 | }; |
| 78 | 71 | ||
| 79 | /* Responses are given these values to maintain backwards | 72 | /* Responses are given these values to maintain backwards |
| @@ -86,7 +79,6 @@ struct ocfs2_response_msg | |||
| 86 | { | 79 | { |
| 87 | struct ocfs2_msg_hdr r_hdr; | 80 | struct ocfs2_msg_hdr r_hdr; |
| 88 | __be32 r_response; | 81 | __be32 r_response; |
| 89 | __be32 r_orphaned_slot; | ||
| 90 | }; | 82 | }; |
| 91 | 83 | ||
| 92 | struct ocfs2_vote_work { | 84 | struct ocfs2_vote_work { |
| @@ -96,7 +88,6 @@ struct ocfs2_vote_work { | |||
| 96 | 88 | ||
| 97 | enum ocfs2_vote_request { | 89 | enum ocfs2_vote_request { |
| 98 | OCFS2_VOTE_REQ_INVALID = 0, | 90 | OCFS2_VOTE_REQ_INVALID = 0, |
| 99 | OCFS2_VOTE_REQ_DELETE, | ||
| 100 | OCFS2_VOTE_REQ_MOUNT, | 91 | OCFS2_VOTE_REQ_MOUNT, |
| 101 | OCFS2_VOTE_REQ_UMOUNT, | 92 | OCFS2_VOTE_REQ_UMOUNT, |
| 102 | OCFS2_VOTE_REQ_LAST | 93 | OCFS2_VOTE_REQ_LAST |
| @@ -151,135 +142,23 @@ static void ocfs2_process_umount_request(struct ocfs2_super *osb, | |||
| 151 | ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num); | 142 | ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num); |
| 152 | } | 143 | } |
| 153 | 144 | ||
| 154 | void ocfs2_mark_inode_remotely_deleted(struct inode *inode) | ||
| 155 | { | ||
| 156 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
| 157 | |||
| 158 | assert_spin_locked(&oi->ip_lock); | ||
| 159 | /* We set the SKIP_DELETE flag on the inode so we don't try to | ||
| 160 | * delete it in delete_inode ourselves, thus avoiding | ||
| 161 | * unnecessary lock pinging. If the other node failed to wipe | ||
| 162 | * the inode as a result of a crash, then recovery will pick | ||
| 163 | * up the slack. */ | ||
| 164 | oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE; | ||
| 165 | } | ||
| 166 | |||
| 167 | static int ocfs2_process_delete_request(struct inode *inode, | ||
| 168 | int *orphaned_slot) | ||
| 169 | { | ||
| 170 | int response = OCFS2_RESPONSE_BUSY; | ||
| 171 | |||
| 172 | mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n", | ||
| 173 | inode->i_ino, inode->i_nlink, *orphaned_slot); | ||
| 174 | |||
| 175 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
| 176 | |||
| 177 | /* Whatever our vote response is, we want to make sure that | ||
| 178 | * the orphaned slot is recorded properly on this node *and* | ||
| 179 | * on the requesting node. Technically, if the requesting node | ||
| 180 | * did not know which slot the inode is orphaned in but we | ||
| 181 | * respond with BUSY he doesn't actually need the orphaned | ||
| 182 | * slot, but it doesn't hurt to do it here anyway. */ | ||
| 183 | if ((*orphaned_slot) != OCFS2_INVALID_SLOT) { | ||
| 184 | mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != | ||
| 185 | OCFS2_INVALID_SLOT && | ||
| 186 | OCFS2_I(inode)->ip_orphaned_slot != | ||
| 187 | (*orphaned_slot), | ||
| 188 | "Inode %llu: This node thinks it's " | ||
| 189 | "orphaned in slot %d, messaged it's in %d\n", | ||
| 190 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
| 191 | OCFS2_I(inode)->ip_orphaned_slot, | ||
| 192 | *orphaned_slot); | ||
| 193 | |||
| 194 | mlog(0, "Setting orphaned slot for inode %llu to %d\n", | ||
| 195 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
| 196 | *orphaned_slot); | ||
| 197 | |||
| 198 | OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot; | ||
| 199 | } else { | ||
| 200 | mlog(0, "Sending back orphaned slot %d for inode %llu\n", | ||
| 201 | OCFS2_I(inode)->ip_orphaned_slot, | ||
| 202 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
| 203 | |||
| 204 | *orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; | ||
| 205 | } | ||
| 206 | |||
| 207 | /* vote no if the file is still open. */ | ||
| 208 | if (OCFS2_I(inode)->ip_open_count) { | ||
| 209 | mlog(0, "open count = %u\n", | ||
| 210 | OCFS2_I(inode)->ip_open_count); | ||
| 211 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
| 212 | goto done; | ||
| 213 | } | ||
| 214 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
| 215 | |||
| 216 | /* directories are a bit ugly... What if someone is sitting in | ||
| 217 | * it? We want to make sure the inode is removed completely as | ||
| 218 | * a result of the iput in process_vote. */ | ||
| 219 | if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) { | ||
| 220 | mlog(0, "i_count = %u\n", atomic_read(&inode->i_count)); | ||
| 221 | goto done; | ||
| 222 | } | ||
| 223 | |||
| 224 | if (filemap_fdatawrite(inode->i_mapping)) { | ||
| 225 | mlog(ML_ERROR, "Could not sync inode %llu for delete!\n", | ||
| 226 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
| 227 | goto done; | ||
| 228 | } | ||
| 229 | sync_mapping_buffers(inode->i_mapping); | ||
| 230 | truncate_inode_pages(inode->i_mapping, 0); | ||
| 231 | ocfs2_extent_map_trunc(inode, 0); | ||
| 232 | |||
| 233 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
| 234 | /* double check open count - someone might have raced this | ||
| 235 | * thread into ocfs2_file_open while we were writing out | ||
| 236 | * data. If we're to allow a wipe of this inode now, we *must* | ||
| 237 | * hold the spinlock until we've marked it. */ | ||
| 238 | if (OCFS2_I(inode)->ip_open_count) { | ||
| 239 | mlog(0, "Raced to wipe! open count = %u\n", | ||
| 240 | OCFS2_I(inode)->ip_open_count); | ||
| 241 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
| 242 | goto done; | ||
| 243 | } | ||
| 244 | |||
| 245 | /* Mark the inode as being wiped from disk. */ | ||
| 246 | ocfs2_mark_inode_remotely_deleted(inode); | ||
| 247 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
| 248 | |||
| 249 | /* Not sure this is necessary anymore. */ | ||
| 250 | d_prune_aliases(inode); | ||
| 251 | |||
| 252 | /* If we get here, then we're voting 'yes', so commit the | ||
| 253 | * delete on our side. */ | ||
| 254 | response = OCFS2_RESPONSE_OK; | ||
| 255 | done: | ||
| 256 | return response; | ||
| 257 | } | ||
| 258 | |||
| 259 | static void ocfs2_process_vote(struct ocfs2_super *osb, | 145 | static void ocfs2_process_vote(struct ocfs2_super *osb, |
| 260 | struct ocfs2_vote_msg *msg) | 146 | struct ocfs2_vote_msg *msg) |
| 261 | { | 147 | { |
| 262 | int net_status, vote_response; | 148 | int net_status, vote_response; |
| 263 | int orphaned_slot = 0; | 149 | unsigned int node_num; |
| 264 | unsigned int node_num, generation; | ||
| 265 | u64 blkno; | 150 | u64 blkno; |
| 266 | enum ocfs2_vote_request request; | 151 | enum ocfs2_vote_request request; |
| 267 | struct inode *inode = NULL; | ||
| 268 | struct ocfs2_msg_hdr *hdr = &msg->v_hdr; | 152 | struct ocfs2_msg_hdr *hdr = &msg->v_hdr; |
| 269 | struct ocfs2_response_msg response; | 153 | struct ocfs2_response_msg response; |
| 270 | 154 | ||
| 271 | /* decode the network mumbo jumbo into local variables. */ | 155 | /* decode the network mumbo jumbo into local variables. */ |
| 272 | request = be32_to_cpu(hdr->h_request); | 156 | request = be32_to_cpu(hdr->h_request); |
| 273 | blkno = be64_to_cpu(hdr->h_blkno); | 157 | blkno = be64_to_cpu(hdr->h_blkno); |
| 274 | generation = be32_to_cpu(hdr->h_generation); | ||
| 275 | node_num = be32_to_cpu(hdr->h_node_num); | 158 | node_num = be32_to_cpu(hdr->h_node_num); |
| 276 | if (request == OCFS2_VOTE_REQ_DELETE) | ||
| 277 | orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot); | ||
| 278 | 159 | ||
| 279 | mlog(0, "processing vote: request = %u, blkno = %llu, " | 160 | mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n", |
| 280 | "generation = %u, node_num = %u, priv1 = %u\n", request, | 161 | request, (unsigned long long)blkno, node_num); |
| 281 | (unsigned long long)blkno, generation, node_num, | ||
| 282 | be32_to_cpu(msg->md1.v_generic1)); | ||
| 283 | 162 | ||
| 284 | if (!ocfs2_is_valid_vote_request(request)) { | 163 | if (!ocfs2_is_valid_vote_request(request)) { |
| 285 | mlog(ML_ERROR, "Invalid vote request %d from node %u\n", | 164 | mlog(ML_ERROR, "Invalid vote request %d from node %u\n", |
| @@ -302,52 +181,6 @@ static void ocfs2_process_vote(struct ocfs2_super *osb, | |||
| 302 | break; | 181 | break; |
| 303 | } | 182 | } |
| 304 | 183 | ||
| 305 | /* We cannot process the remaining message types before we're | ||
| 306 | * fully mounted. It's perfectly safe however to send a 'yes' | ||
| 307 | * response as we can't possibly have any of the state they're | ||
| 308 | * asking us to modify yet. */ | ||
| 309 | if (atomic_read(&osb->vol_state) == VOLUME_INIT) | ||
| 310 | goto respond; | ||
| 311 | |||
| 312 | /* If we get here, then the request is against an inode. */ | ||
| 313 | inode = ocfs2_ilookup_for_vote(osb, blkno, | ||
| 314 | request == OCFS2_VOTE_REQ_DELETE); | ||
| 315 | |||
| 316 | /* Not finding the inode is perfectly valid - it means we're | ||
| 317 | * not interested in what the other node is about to do to it | ||
| 318 | * so in those cases we automatically respond with an | ||
| 319 | * affirmative. Cluster locking ensures that we won't race | ||
| 320 | * interest in the inode with this vote request. */ | ||
| 321 | if (!inode) | ||
| 322 | goto respond; | ||
| 323 | |||
| 324 | /* Check generation values. It's possible for us to get a | ||
| 325 | * request against a stale inode. If so then we proceed as if | ||
| 326 | * we had not found an inode in the first place. */ | ||
| 327 | if (inode->i_generation != generation) { | ||
| 328 | mlog(0, "generation passed %u != inode generation = %u, " | ||
| 329 | "ip_flags = %x, ip_blkno = %llu, msg %llu, i_count = %u, " | ||
| 330 | "message type = %u\n", generation, inode->i_generation, | ||
| 331 | OCFS2_I(inode)->ip_flags, | ||
| 332 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
| 333 | (unsigned long long)blkno, atomic_read(&inode->i_count), | ||
| 334 | request); | ||
| 335 | iput(inode); | ||
| 336 | inode = NULL; | ||
| 337 | goto respond; | ||
| 338 | } | ||
| 339 | |||
| 340 | switch (request) { | ||
| 341 | case OCFS2_VOTE_REQ_DELETE: | ||
| 342 | vote_response = ocfs2_process_delete_request(inode, | ||
| 343 | &orphaned_slot); | ||
| 344 | break; | ||
| 345 | default: | ||
| 346 | mlog(ML_ERROR, "node %u, invalid request: %u\n", | ||
| 347 | node_num, request); | ||
| 348 | vote_response = OCFS2_RESPONSE_BAD_MSG; | ||
| 349 | } | ||
| 350 | |||
| 351 | respond: | 184 | respond: |
| 352 | /* Response structure is small so we just put it on the stack | 185 | /* Response structure is small so we just put it on the stack |
| 353 | * and stuff it inline. */ | 186 | * and stuff it inline. */ |
| @@ -357,7 +190,6 @@ respond: | |||
| 357 | response.r_hdr.h_generation = hdr->h_generation; | 190 | response.r_hdr.h_generation = hdr->h_generation; |
| 358 | response.r_hdr.h_node_num = cpu_to_be32(osb->node_num); | 191 | response.r_hdr.h_node_num = cpu_to_be32(osb->node_num); |
| 359 | response.r_response = cpu_to_be32(vote_response); | 192 | response.r_response = cpu_to_be32(vote_response); |
| 360 | response.r_orphaned_slot = cpu_to_be32(orphaned_slot); | ||
| 361 | 193 | ||
| 362 | net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE, | 194 | net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE, |
| 363 | osb->net_key, | 195 | osb->net_key, |
| @@ -373,9 +205,6 @@ respond: | |||
| 373 | && net_status != -ENOTCONN) | 205 | && net_status != -ENOTCONN) |
| 374 | mlog(ML_ERROR, "message to node %u fails with error %d!\n", | 206 | mlog(ML_ERROR, "message to node %u fails with error %d!\n", |
| 375 | node_num, net_status); | 207 | node_num, net_status); |
| 376 | |||
| 377 | if (inode) | ||
| 378 | iput(inode); | ||
| 379 | } | 208 | } |
| 380 | 209 | ||
| 381 | static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb) | 210 | static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb) |
| @@ -634,8 +463,7 @@ bail: | |||
| 634 | static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, | 463 | static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, |
| 635 | u64 blkno, | 464 | u64 blkno, |
| 636 | unsigned int generation, | 465 | unsigned int generation, |
| 637 | enum ocfs2_vote_request type, | 466 | enum ocfs2_vote_request type) |
| 638 | u32 priv) | ||
| 639 | { | 467 | { |
| 640 | struct ocfs2_vote_msg *request; | 468 | struct ocfs2_vote_msg *request; |
| 641 | struct ocfs2_msg_hdr *hdr; | 469 | struct ocfs2_msg_hdr *hdr; |
| @@ -651,8 +479,6 @@ static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, | |||
| 651 | hdr->h_request = cpu_to_be32(type); | 479 | hdr->h_request = cpu_to_be32(type); |
| 652 | hdr->h_blkno = cpu_to_be64(blkno); | 480 | hdr->h_blkno = cpu_to_be64(blkno); |
| 653 | hdr->h_generation = cpu_to_be32(generation); | 481 | hdr->h_generation = cpu_to_be32(generation); |
| 654 | |||
| 655 | request->md1.v_generic1 = cpu_to_be32(priv); | ||
| 656 | } | 482 | } |
| 657 | 483 | ||
| 658 | return request; | 484 | return request; |
| @@ -664,7 +490,7 @@ static int ocfs2_do_request_vote(struct ocfs2_super *osb, | |||
| 664 | struct ocfs2_vote_msg *request, | 490 | struct ocfs2_vote_msg *request, |
| 665 | struct ocfs2_net_response_cb *callback) | 491 | struct ocfs2_net_response_cb *callback) |
| 666 | { | 492 | { |
| 667 | int status, response; | 493 | int status, response = -EBUSY; |
| 668 | unsigned int response_id; | 494 | unsigned int response_id; |
| 669 | struct ocfs2_msg_hdr *hdr; | 495 | struct ocfs2_msg_hdr *hdr; |
| 670 | 496 | ||
| @@ -686,109 +512,12 @@ bail: | |||
| 686 | return status; | 512 | return status; |
| 687 | } | 513 | } |
| 688 | 514 | ||
| 689 | static int ocfs2_request_vote(struct inode *inode, | ||
| 690 | struct ocfs2_vote_msg *request, | ||
| 691 | struct ocfs2_net_response_cb *callback) | ||
| 692 | { | ||
| 693 | int status; | ||
| 694 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 695 | |||
| 696 | if (ocfs2_inode_is_new(inode)) | ||
| 697 | return 0; | ||
| 698 | |||
| 699 | status = -EAGAIN; | ||
| 700 | while (status == -EAGAIN) { | ||
| 701 | if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) && | ||
| 702 | signal_pending(current)) | ||
| 703 | return -ERESTARTSYS; | ||
| 704 | |||
| 705 | status = ocfs2_super_lock(osb, 0); | ||
| 706 | if (status < 0) { | ||
| 707 | mlog_errno(status); | ||
| 708 | break; | ||
| 709 | } | ||
| 710 | |||
| 711 | status = 0; | ||
| 712 | if (!ocfs2_node_map_is_only(osb, &osb->mounted_map, | ||
| 713 | osb->node_num)) | ||
| 714 | status = ocfs2_do_request_vote(osb, request, callback); | ||
| 715 | |||
| 716 | ocfs2_super_unlock(osb, 0); | ||
| 717 | } | ||
| 718 | return status; | ||
| 719 | } | ||
| 720 | |||
| 721 | static void ocfs2_delete_response_cb(void *priv, | ||
| 722 | struct ocfs2_response_msg *resp) | ||
| 723 | { | ||
| 724 | int orphaned_slot, node; | ||
| 725 | struct inode *inode = priv; | ||
| 726 | |||
| 727 | orphaned_slot = be32_to_cpu(resp->r_orphaned_slot); | ||
| 728 | node = be32_to_cpu(resp->r_hdr.h_node_num); | ||
| 729 | mlog(0, "node %d tells us that inode %llu is orphaned in slot %d\n", | ||
| 730 | node, (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
| 731 | orphaned_slot); | ||
| 732 | |||
| 733 | /* The other node may not actually know which slot the inode | ||
| 734 | * is orphaned in. */ | ||
| 735 | if (orphaned_slot == OCFS2_INVALID_SLOT) | ||
| 736 | return; | ||
| 737 | |||
| 738 | /* Ok, the responding node knows which slot this inode is | ||
| 739 | * orphaned in. We verify that the information is correct and | ||
| 740 | * then record this in the inode. ocfs2_delete_inode will use | ||
| 741 | * this information to determine which lock to take. */ | ||
| 742 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
| 743 | mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot && | ||
| 744 | OCFS2_I(inode)->ip_orphaned_slot | ||
| 745 | != OCFS2_INVALID_SLOT, "Inode %llu: Node %d says it's " | ||
| 746 | "orphaned in slot %d, we think it's in %d\n", | ||
| 747 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
| 748 | be32_to_cpu(resp->r_hdr.h_node_num), | ||
| 749 | orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot); | ||
| 750 | |||
| 751 | OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot; | ||
| 752 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
| 753 | } | ||
| 754 | |||
| 755 | int ocfs2_request_delete_vote(struct inode *inode) | ||
| 756 | { | ||
| 757 | int orphaned_slot, status; | ||
| 758 | struct ocfs2_net_response_cb delete_cb; | ||
| 759 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 760 | struct ocfs2_vote_msg *request; | ||
| 761 | |||
| 762 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
| 763 | orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; | ||
| 764 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
| 765 | |||
| 766 | delete_cb.rc_cb = ocfs2_delete_response_cb; | ||
| 767 | delete_cb.rc_priv = inode; | ||
| 768 | |||
| 769 | mlog(0, "Inode %llu, we start thinking orphaned slot is %d\n", | ||
| 770 | (unsigned long long)OCFS2_I(inode)->ip_blkno, orphaned_slot); | ||
| 771 | |||
| 772 | status = -ENOMEM; | ||
| 773 | request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno, | ||
| 774 | inode->i_generation, | ||
| 775 | OCFS2_VOTE_REQ_DELETE, orphaned_slot); | ||
| 776 | if (request) { | ||
| 777 | status = ocfs2_request_vote(inode, request, &delete_cb); | ||
| 778 | |||
| 779 | kfree(request); | ||
| 780 | } | ||
| 781 | |||
| 782 | return status; | ||
| 783 | } | ||
| 784 | |||
| 785 | int ocfs2_request_mount_vote(struct ocfs2_super *osb) | 515 | int ocfs2_request_mount_vote(struct ocfs2_super *osb) |
| 786 | { | 516 | { |
| 787 | int status; | 517 | int status; |
| 788 | struct ocfs2_vote_msg *request = NULL; | 518 | struct ocfs2_vote_msg *request = NULL; |
| 789 | 519 | ||
| 790 | request = ocfs2_new_vote_request(osb, 0ULL, 0, | 520 | request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT); |
| 791 | OCFS2_VOTE_REQ_MOUNT, 0); | ||
| 792 | if (!request) { | 521 | if (!request) { |
| 793 | status = -ENOMEM; | 522 | status = -ENOMEM; |
| 794 | goto bail; | 523 | goto bail; |
| @@ -821,8 +550,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb) | |||
| 821 | int status; | 550 | int status; |
| 822 | struct ocfs2_vote_msg *request = NULL; | 551 | struct ocfs2_vote_msg *request = NULL; |
| 823 | 552 | ||
| 824 | request = ocfs2_new_vote_request(osb, 0ULL, 0, | 553 | request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT); |
| 825 | OCFS2_VOTE_REQ_UMOUNT, 0); | ||
| 826 | if (!request) { | 554 | if (!request) { |
| 827 | status = -ENOMEM; | 555 | status = -ENOMEM; |
| 828 | goto bail; | 556 | goto bail; |
| @@ -969,7 +697,6 @@ static int ocfs2_handle_vote_message(struct o2net_msg *msg, | |||
| 969 | be32_to_cpu(work->w_msg.v_hdr.h_generation)); | 697 | be32_to_cpu(work->w_msg.v_hdr.h_generation)); |
| 970 | mlog(0, "h_node_num = %u\n", | 698 | mlog(0, "h_node_num = %u\n", |
| 971 | be32_to_cpu(work->w_msg.v_hdr.h_node_num)); | 699 | be32_to_cpu(work->w_msg.v_hdr.h_node_num)); |
| 972 | mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1)); | ||
| 973 | 700 | ||
| 974 | spin_lock(&osb->vote_task_lock); | 701 | spin_lock(&osb->vote_task_lock); |
| 975 | list_add_tail(&work->w_list, &osb->vote_list); | 702 | list_add_tail(&work->w_list, &osb->vote_list); |
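With the delete-vote machinery removed, only mount and umount votes remain. Detection of remote opens moves to the new per-inode open lock (note OCFS2_LOCK_TYPE_OPEN and the ip_open_lockres initialization elsewhere in this merge). A hedged sketch of the idea, with a hypothetical helper name standing in for the real dlmglue call:

```c
/* Hypothetical sketch, not code from this merge: instead of polling
 * every node with a delete vote, the deleting node try-locks the
 * inode's open lock exclusively. Failure means another node still has
 * the file open, so the wipe is skipped and the inode stays in the
 * orphan dir for recovery or a later deleter. */
if (try_open_lock_exclusive(inode) < 0)
	return 0;	/* in use somewhere in the cluster */
/* no remote opens: safe to wipe the orphaned inode */
```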
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h index 53ebc1c69e56..9ea46f62de31 100644 --- a/fs/ocfs2/vote.h +++ b/fs/ocfs2/vote.h | |||
| @@ -38,14 +38,11 @@ static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb) | |||
| 38 | wake_up(&osb->vote_event); | 38 | wake_up(&osb->vote_event); |
| 39 | } | 39 | } |
| 40 | 40 | ||
| 41 | int ocfs2_request_delete_vote(struct inode *inode); | ||
| 42 | int ocfs2_request_mount_vote(struct ocfs2_super *osb); | 41 | int ocfs2_request_mount_vote(struct ocfs2_super *osb); |
| 43 | int ocfs2_request_umount_vote(struct ocfs2_super *osb); | 42 | int ocfs2_request_umount_vote(struct ocfs2_super *osb); |
| 44 | int ocfs2_register_net_handlers(struct ocfs2_super *osb); | 43 | int ocfs2_register_net_handlers(struct ocfs2_super *osb); |
| 45 | void ocfs2_unregister_net_handlers(struct ocfs2_super *osb); | 44 | void ocfs2_unregister_net_handlers(struct ocfs2_super *osb); |
| 46 | 45 | ||
| 47 | void ocfs2_mark_inode_remotely_deleted(struct inode *inode); | ||
| 48 | |||
| 49 | void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, | 46 | void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, |
| 50 | int node_num); | 47 | int node_num); |
| 51 | #endif | 48 | #endif |
diff --git a/fs/sync.c b/fs/sync.c --- a/fs/sync.c +++ b/fs/sync.c | |||
| @@ -239,13 +239,11 @@ out: | |||
| 239 | /* | 239 | /* |
| 240 | * `endbyte' is inclusive | 240 | * `endbyte' is inclusive |
| 241 | */ | 241 | */ |
| 242 | int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, | 242 | int do_sync_mapping_range(struct address_space *mapping, loff_t offset, |
| 243 | unsigned int flags) | 243 | loff_t endbyte, unsigned int flags) |
| 244 | { | 244 | { |
| 245 | int ret; | 245 | int ret; |
| 246 | struct address_space *mapping; | ||
| 247 | 246 | ||
| 248 | mapping = file->f_mapping; | ||
| 249 | if (!mapping) { | 247 | if (!mapping) { |
| 250 | ret = -EINVAL; | 248 | ret = -EINVAL; |
| 251 | goto out; | 249 | goto out; |
| @@ -275,4 +273,4 @@ int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, | |||
| 275 | out: | 273 | out: |
| 276 | return ret; | 274 | return ret; |
| 277 | } | 275 | } |
| 278 | EXPORT_SYMBOL_GPL(do_sync_file_range); | 276 | EXPORT_SYMBOL_GPL(do_sync_mapping_range); |
diff --git a/include/linux/fs.h b/include/linux/fs.h index 86ec3f4a7da6..095a9c9a64fb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
| @@ -843,8 +843,13 @@ extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); | |||
| 843 | extern int fcntl_getlease(struct file *filp); | 843 | extern int fcntl_getlease(struct file *filp); |
| 844 | 844 | ||
| 845 | /* fs/sync.c */ | 845 | /* fs/sync.c */ |
| 846 | extern int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, | 846 | extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset, |
| 847 | unsigned int flags); | 847 | loff_t endbyte, unsigned int flags); |
| 848 | static inline int do_sync_file_range(struct file *file, loff_t offset, | ||
| 849 | loff_t endbyte, unsigned int flags) | ||
| 850 | { | ||
| 851 | return do_sync_mapping_range(file->f_mapping, offset, endbyte, flags); | ||
| 852 | } | ||
| 848 | 853 | ||
| 849 | /* fs/locks.c */ | 854 | /* fs/locks.c */ |
| 850 | extern void locks_init_lock(struct file_lock *); | 855 | extern void locks_init_lock(struct file_lock *); |
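The inline wrapper keeps the old file-based entry point, so existing do_sync_file_range() callers need no changes, while code that only has an address_space (like ocfs2_zero_tail_for_truncate() in this merge) can call the mapping variant directly. A usage sketch with the existing sys_sync_file_range() flags; the range variables are placeholders:

```c
/* Sketch: write out and wait on an inclusive byte range of a mapping.
 * start and end_inclusive are placeholders for the caller's range;
 * note the `endbyte' is inclusive, per the comment in fs/sync.c. */
ret = do_sync_mapping_range(inode->i_mapping, start, end_inclusive,
			    SYNC_FILE_RANGE_WAIT_BEFORE |
			    SYNC_FILE_RANGE_WRITE |
			    SYNC_FILE_RANGE_WAIT_AFTER);
```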
