diff options
author | Mark Fasheh <mark.fasheh@oracle.com> | 2007-01-17 15:31:35 -0500 |
---|---|---|
committer | Mark Fasheh <mark.fasheh@oracle.com> | 2007-04-26 18:01:31 -0400 |
commit | 363041a5f74b953ab6b705ac9c88e5eda218a24b (patch) | |
tree | c0661c3f88978f2049693682f1cb94b20a8454c0 /fs/ocfs2 | |
parent | dcd0538ff4e854fa9d7f4630b359ca8fdb5cb5a8 (diff) |
ocfs2: temporarily remove extent map caching

The code in extent_map.c is not prepared to deal with a subtree being
rotated between lookups. This can happen when filling holes in sparse files.
Instead of a lengthy patch to update the code (which would likely lose the
benefit of caching subtree roots), we remove most of the algorithms and
implement a simple path-based lookup. A less ambitious extent caching scheme
will be added in a later patch.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Diffstat (limited to 'fs/ocfs2')
-rw-r--r-- | fs/ocfs2/alloc.c | 5 | ||||
-rw-r--r-- | fs/ocfs2/alloc.h | 3 | ||||
-rw-r--r-- | fs/ocfs2/aops.c | 8 | ||||
-rw-r--r-- | fs/ocfs2/dir.c | 2 | ||||
-rw-r--r-- | fs/ocfs2/dlmglue.c | 4 | ||||
-rw-r--r-- | fs/ocfs2/extent_map.c | 1024 | ||||
-rw-r--r-- | fs/ocfs2/extent_map.h | 19 | ||||
-rw-r--r-- | fs/ocfs2/inode.c | 6 | ||||
-rw-r--r-- | fs/ocfs2/inode.h | 1 | ||||
-rw-r--r-- | fs/ocfs2/journal.c | 3 | ||||
-rw-r--r-- | fs/ocfs2/namei.c | 3 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2.h | 5 | ||||
-rw-r--r-- | fs/ocfs2/slot_map.c | 2 | ||||
-rw-r--r-- | fs/ocfs2/super.c | 7 |
14 files changed, 96 insertions, 996 deletions
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index a96696867576..85a05f120249 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c | |||
@@ -1146,9 +1146,8 @@ static void find_leaf_ins(void *data, struct buffer_head *bh) | |||
1146 | * | 1146 | * |
1147 | * This function doesn't handle non btree extent lists. | 1147 | * This function doesn't handle non btree extent lists. |
1148 | */ | 1148 | */ |
1149 | static int ocfs2_find_leaf(struct inode *inode, | 1149 | int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, |
1150 | struct ocfs2_extent_list *root_el, u32 cpos, | 1150 | u32 cpos, struct buffer_head **leaf_bh) |
1151 | struct buffer_head **leaf_bh) | ||
1152 | { | 1151 | { |
1153 | int ret; | 1152 | int ret; |
1154 | struct buffer_head *bh = NULL; | 1153 | struct buffer_head *bh = NULL; |
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index b0880fdb3108..bff2a162b030 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h | |||
@@ -80,4 +80,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, | |||
80 | struct buffer_head *fe_bh, | 80 | struct buffer_head *fe_bh, |
81 | struct ocfs2_truncate_context *tc); | 81 | struct ocfs2_truncate_context *tc); |
82 | 82 | ||
83 | int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, | ||
84 | u32 cpos, struct buffer_head **leaf_bh); | ||
85 | |||
83 | #endif /* OCFS2_ALLOC_H */ | 86 | #endif /* OCFS2_ALLOC_H */ |
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 875c11443817..f3b0cc5cba1a 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -158,8 +158,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, | |||
158 | if (err) | 158 | if (err) |
159 | goto bail; | 159 | goto bail; |
160 | 160 | ||
161 | err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, | 161 | err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL); |
162 | NULL); | ||
163 | if (err) { | 162 | if (err) { |
164 | mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " | 163 | mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " |
165 | "%llu, NULL)\n", err, inode, (unsigned long long)iblock, | 164 | "%llu, NULL)\n", err, inode, (unsigned long long)iblock, |
@@ -499,8 +498,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) | |||
499 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | 498 | down_read(&OCFS2_I(inode)->ip_alloc_sem); |
500 | } | 499 | } |
501 | 500 | ||
502 | err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno, | 501 | err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL); |
503 | NULL); | ||
504 | 502 | ||
505 | if (!INODE_JOURNAL(inode)) { | 503 | if (!INODE_JOURNAL(inode)) { |
506 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | 504 | up_read(&OCFS2_I(inode)->ip_alloc_sem); |
@@ -574,7 +572,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
574 | 572 | ||
575 | /* This figures out the size of the next contiguous block, and | 573 | /* This figures out the size of the next contiguous block, and |
576 | * our logical offset */ | 574 | * our logical offset */ |
577 | ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, | 575 | ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, |
578 | &contig_blocks); | 576 | &contig_blocks); |
579 | if (ret) { | 577 | if (ret) { |
580 | mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", | 578 | mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", |
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 5d211c53a8d8..c91490670ffa 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c | |||
@@ -379,7 +379,7 @@ int ocfs2_do_extend_dir(struct super_block *sb, | |||
379 | 379 | ||
380 | status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >> | 380 | status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >> |
381 | (sb->s_blocksize_bits - 9)), | 381 | (sb->s_blocksize_bits - 9)), |
382 | 1, &p_blkno, NULL); | 382 | &p_blkno, NULL); |
383 | if (status < 0) { | 383 | if (status < 0) { |
384 | mlog_errno(status); | 384 | mlog_errno(status); |
385 | goto bail; | 385 | goto bail; |
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index ca4f0e0e7587..8de6678a340a 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
@@ -1614,10 +1614,6 @@ static int ocfs2_meta_lock_update(struct inode *inode, | |||
1614 | * for the inode metadata. */ | 1614 | * for the inode metadata. */ |
1615 | ocfs2_metadata_cache_purge(inode); | 1615 | ocfs2_metadata_cache_purge(inode); |
1616 | 1616 | ||
1617 | /* will do nothing for inode types that don't use the extent | ||
1618 | * map (bitmap files, etc) */ | ||
1619 | ocfs2_extent_map_trunc(inode, 0); | ||
1620 | |||
1621 | if (ocfs2_meta_lvb_is_trustable(inode, lockres)) { | 1617 | if (ocfs2_meta_lvb_is_trustable(inode, lockres)) { |
1622 | mlog(0, "Trusting LVB on inode %llu\n", | 1618 | mlog(0, "Trusting LVB on inode %llu\n", |
1623 | (unsigned long long)oi->ip_blkno); | 1619 | (unsigned long long)oi->ip_blkno); |
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 80ac69f11d9f..3b4322fd369a 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c | |||
@@ -3,8 +3,7 @@ | |||
3 | * | 3 | * |
4 | * extent_map.c | 4 | * extent_map.c |
5 | * | 5 | * |
6 | * In-memory extent map for OCFS2. Man, this code was prettier in | 6 | * Block/Cluster mapping functions |
7 | * the library. | ||
8 | * | 7 | * |
9 | * Copyright (C) 2004 Oracle. All rights reserved. | 8 | * Copyright (C) 2004 Oracle. All rights reserved. |
10 | * | 9 | * |
@@ -26,1016 +25,155 @@ | |||
26 | #include <linux/fs.h> | 25 | #include <linux/fs.h> |
27 | #include <linux/init.h> | 26 | #include <linux/init.h> |
28 | #include <linux/types.h> | 27 | #include <linux/types.h> |
29 | #include <linux/slab.h> | ||
30 | #include <linux/rbtree.h> | ||
31 | 28 | ||
32 | #define MLOG_MASK_PREFIX ML_EXTENT_MAP | 29 | #define MLOG_MASK_PREFIX ML_EXTENT_MAP |
33 | #include <cluster/masklog.h> | 30 | #include <cluster/masklog.h> |
34 | 31 | ||
35 | #include "ocfs2.h" | 32 | #include "ocfs2.h" |
36 | 33 | ||
34 | #include "alloc.h" | ||
37 | #include "extent_map.h" | 35 | #include "extent_map.h" |
38 | #include "inode.h" | 36 | #include "inode.h" |
39 | #include "super.h" | 37 | #include "super.h" |
40 | 38 | ||
41 | #include "buffer_head_io.h" | 39 | #include "buffer_head_io.h" |
42 | 40 | ||
43 | |||
44 | /* | ||
45 | * SUCK SUCK SUCK | ||
46 | * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h | ||
47 | */ | ||
48 | |||
49 | struct ocfs2_extent_map_entry { | ||
50 | struct rb_node e_node; | ||
51 | int e_tree_depth; | ||
52 | struct ocfs2_extent_rec e_rec; | ||
53 | }; | ||
54 | |||
55 | struct ocfs2_em_insert_context { | ||
56 | int need_left; | ||
57 | int need_right; | ||
58 | struct ocfs2_extent_map_entry *new_ent; | ||
59 | struct ocfs2_extent_map_entry *old_ent; | ||
60 | struct ocfs2_extent_map_entry *left_ent; | ||
61 | struct ocfs2_extent_map_entry *right_ent; | ||
62 | }; | ||
63 | |||
64 | static struct kmem_cache *ocfs2_em_ent_cachep = NULL; | ||
65 | |||
66 | |||
67 | static struct ocfs2_extent_map_entry * | ||
68 | ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, | ||
69 | u32 cpos, u32 clusters, | ||
70 | struct rb_node ***ret_p, | ||
71 | struct rb_node **ret_parent); | ||
72 | static int ocfs2_extent_map_insert(struct inode *inode, | ||
73 | struct ocfs2_extent_rec *rec, | ||
74 | int tree_depth); | ||
75 | static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, | ||
76 | struct ocfs2_extent_map_entry *ent); | ||
77 | static int ocfs2_extent_map_find_leaf(struct inode *inode, | ||
78 | u32 cpos, u32 clusters, | ||
79 | struct ocfs2_extent_list *el); | ||
80 | static int ocfs2_extent_map_lookup_read(struct inode *inode, | ||
81 | u32 cpos, u32 clusters, | ||
82 | struct ocfs2_extent_map_entry **ret_ent); | ||
83 | static int ocfs2_extent_map_try_insert(struct inode *inode, | ||
84 | struct ocfs2_extent_rec *rec, | ||
85 | int tree_depth, | ||
86 | struct ocfs2_em_insert_context *ctxt); | ||
87 | |||
88 | /* returns 1 only if the rec contains all the given clusters -- that is that | ||
89 | * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos + | ||
90 | * clusters) is >= the argument's endpoint */ | ||
91 | static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec, | ||
92 | u32 cpos, u32 clusters) | ||
93 | { | ||
94 | if (le32_to_cpu(rec->e_cpos) > cpos) | ||
95 | return 0; | ||
96 | if (cpos + clusters > le32_to_cpu(rec->e_cpos) + | ||
97 | le32_to_cpu(rec->e_clusters)) | ||
98 | return 0; | ||
99 | return 1; | ||
100 | } | ||
101 | |||
102 | |||
103 | /* | 41 | /* |
104 | * Find an entry in the tree that intersects the region passed in. | 42 | * Return the index of the extent record which contains cluster #v_cluster. |
105 | * Note that this will find straddled intervals, it is up to the | 43 | * -1 is returned if it was not found. |
106 | * callers to enforce any boundary conditions. | ||
107 | * | ||
108 | * Callers must hold ip_lock. This lookup is not guaranteed to return | ||
109 | * a tree_depth 0 match, and as such can race inserts if the lock | ||
110 | * were not held. | ||
111 | * | 44 | * |
112 | * The rb_node garbage lets insertion share the search. Trivial | 45 | * Should work fine on interior and exterior nodes. |
113 | * callers pass NULL. | ||
114 | */ | 46 | */ |
115 | static struct ocfs2_extent_map_entry * | 47 | static int ocfs2_search_extent_list(struct ocfs2_extent_list *el, |
116 | ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, | 48 | u32 v_cluster) |
117 | u32 cpos, u32 clusters, | ||
118 | struct rb_node ***ret_p, | ||
119 | struct rb_node **ret_parent) | ||
120 | { | 49 | { |
121 | struct rb_node **p = &em->em_extents.rb_node; | 50 | int ret = -1; |
122 | struct rb_node *parent = NULL; | 51 | int i; |
123 | struct ocfs2_extent_map_entry *ent = NULL; | ||
124 | |||
125 | while (*p) | ||
126 | { | ||
127 | parent = *p; | ||
128 | ent = rb_entry(parent, struct ocfs2_extent_map_entry, | ||
129 | e_node); | ||
130 | if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) { | ||
131 | p = &(*p)->rb_left; | ||
132 | ent = NULL; | ||
133 | } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) + | ||
134 | le32_to_cpu(ent->e_rec.e_clusters))) { | ||
135 | p = &(*p)->rb_right; | ||
136 | ent = NULL; | ||
137 | } else | ||
138 | break; | ||
139 | } | ||
140 | |||
141 | if (ret_p != NULL) | ||
142 | *ret_p = p; | ||
143 | if (ret_parent != NULL) | ||
144 | *ret_parent = parent; | ||
145 | return ent; | ||
146 | } | ||
147 | |||
148 | /* | ||
149 | * Find the leaf containing the interval we want. While we're on our | ||
150 | * way down the tree, fill in every record we see at any depth, because | ||
151 | * we might want it later. | ||
152 | * | ||
153 | * Note that this code is run without ip_lock. That's because it | ||
154 | * sleeps while reading. If someone is also filling the extent list at | ||
155 | * the same time we are, we might have to restart. | ||
156 | */ | ||
157 | static int ocfs2_extent_map_find_leaf(struct inode *inode, | ||
158 | u32 cpos, u32 clusters, | ||
159 | struct ocfs2_extent_list *el) | ||
160 | { | ||
161 | int i, ret; | ||
162 | struct buffer_head *eb_bh = NULL; | ||
163 | u64 blkno; | ||
164 | u32 rec_end; | ||
165 | struct ocfs2_extent_block *eb; | ||
166 | struct ocfs2_extent_rec *rec; | 52 | struct ocfs2_extent_rec *rec; |
53 | u32 rec_end, rec_start; | ||
167 | 54 | ||
168 | /* | 55 | for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { |
169 | * The bh data containing the el cannot change here, because | ||
170 | * we hold alloc_sem. So we can do this without other | ||
171 | * locks. | ||
172 | */ | ||
173 | while (el->l_tree_depth) | ||
174 | { | ||
175 | blkno = 0; | ||
176 | for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { | ||
177 | rec = &el->l_recs[i]; | ||
178 | rec_end = (le32_to_cpu(rec->e_cpos) + | ||
179 | le32_to_cpu(rec->e_clusters)); | ||
180 | |||
181 | ret = -EBADR; | ||
182 | if (rec_end > OCFS2_I(inode)->ip_clusters) { | ||
183 | mlog_errno(ret); | ||
184 | ocfs2_error(inode->i_sb, | ||
185 | "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n", | ||
186 | i, | ||
187 | (unsigned long long)le64_to_cpu(rec->e_blkno), | ||
188 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
189 | OCFS2_I(inode)->ip_clusters); | ||
190 | goto out_free; | ||
191 | } | ||
192 | |||
193 | if (rec_end <= cpos) { | ||
194 | ret = ocfs2_extent_map_insert(inode, rec, | ||
195 | le16_to_cpu(el->l_tree_depth)); | ||
196 | if (ret && (ret != -EEXIST)) { | ||
197 | mlog_errno(ret); | ||
198 | goto out_free; | ||
199 | } | ||
200 | continue; | ||
201 | } | ||
202 | if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) { | ||
203 | ret = ocfs2_extent_map_insert(inode, rec, | ||
204 | le16_to_cpu(el->l_tree_depth)); | ||
205 | if (ret && (ret != -EEXIST)) { | ||
206 | mlog_errno(ret); | ||
207 | goto out_free; | ||
208 | } | ||
209 | continue; | ||
210 | } | ||
211 | |||
212 | /* | ||
213 | * We've found a record that matches our | ||
214 | * interval. We don't insert it because we're | ||
215 | * about to traverse it. | ||
216 | */ | ||
217 | |||
218 | /* Check to see if we're straddling */ | ||
219 | ret = -ESRCH; | ||
220 | if (!ocfs2_extent_rec_contains_clusters(rec, | ||
221 | cpos, | ||
222 | clusters)) { | ||
223 | mlog_errno(ret); | ||
224 | goto out_free; | ||
225 | } | ||
226 | |||
227 | /* | ||
228 | * If we've already found a record, the el has | ||
229 | * two records covering the same interval. | ||
230 | * EEEK! | ||
231 | */ | ||
232 | ret = -EBADR; | ||
233 | if (blkno) { | ||
234 | mlog_errno(ret); | ||
235 | ocfs2_error(inode->i_sb, | ||
236 | "Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n", | ||
237 | cpos, clusters, | ||
238 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
239 | (unsigned long long)blkno, i, | ||
240 | (unsigned long long)le64_to_cpu(rec->e_blkno)); | ||
241 | goto out_free; | ||
242 | } | ||
243 | |||
244 | blkno = le64_to_cpu(rec->e_blkno); | ||
245 | } | ||
246 | |||
247 | /* | ||
248 | * We don't support holes, and we're still up | ||
249 | * in the branches, so we'd better have found someone | ||
250 | */ | ||
251 | ret = -EBADR; | ||
252 | if (!blkno) { | ||
253 | ocfs2_error(inode->i_sb, | ||
254 | "No record found for (cpos = %u, clusters = %u) on inode %llu\n", | ||
255 | cpos, clusters, | ||
256 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
257 | mlog_errno(ret); | ||
258 | goto out_free; | ||
259 | } | ||
260 | |||
261 | if (eb_bh) { | ||
262 | brelse(eb_bh); | ||
263 | eb_bh = NULL; | ||
264 | } | ||
265 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
266 | blkno, &eb_bh, OCFS2_BH_CACHED, | ||
267 | inode); | ||
268 | if (ret) { | ||
269 | mlog_errno(ret); | ||
270 | goto out_free; | ||
271 | } | ||
272 | eb = (struct ocfs2_extent_block *)eb_bh->b_data; | ||
273 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
274 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
275 | ret = -EIO; | ||
276 | goto out_free; | ||
277 | } | ||
278 | el = &eb->h_list; | ||
279 | } | ||
280 | |||
281 | BUG_ON(el->l_tree_depth); | ||
282 | |||
283 | for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { | ||
284 | rec = &el->l_recs[i]; | 56 | rec = &el->l_recs[i]; |
285 | 57 | ||
286 | if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > | 58 | rec_start = le32_to_cpu(rec->e_cpos); |
287 | OCFS2_I(inode)->ip_clusters) { | 59 | rec_end = rec_start + le32_to_cpu(rec->e_clusters); |
288 | ret = -EBADR; | ||
289 | mlog_errno(ret); | ||
290 | ocfs2_error(inode->i_sb, | ||
291 | "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n", | ||
292 | i, | ||
293 | (unsigned long long)le64_to_cpu(rec->e_blkno), | ||
294 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
295 | OCFS2_I(inode)->ip_clusters); | ||
296 | return ret; | ||
297 | } | ||
298 | 60 | ||
299 | ret = ocfs2_extent_map_insert(inode, rec, | 61 | if (v_cluster >= rec_start && v_cluster < rec_end) { |
300 | le16_to_cpu(el->l_tree_depth)); | 62 | ret = i; |
301 | if (ret && (ret != -EEXIST)) { | 63 | break; |
302 | mlog_errno(ret); | ||
303 | goto out_free; | ||
304 | } | 64 | } |
305 | } | 65 | } |
306 | 66 | ||
307 | ret = 0; | ||
308 | |||
309 | out_free: | ||
310 | if (eb_bh) | ||
311 | brelse(eb_bh); | ||
312 | |||
313 | return ret; | 67 | return ret; |
314 | } | 68 | } |
315 | 69 | ||
316 | /* | 70 | static int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, |
317 | * This lookup actually will read from disk. It has one invariant: | 71 | u32 *p_cluster, u32 *num_clusters) |
318 | * It will never re-traverse blocks. This means that all inserts should | ||
319 | * be new regions or more granular regions (both allowed by insert). | ||
320 | */ | ||
321 | static int ocfs2_extent_map_lookup_read(struct inode *inode, | ||
322 | u32 cpos, | ||
323 | u32 clusters, | ||
324 | struct ocfs2_extent_map_entry **ret_ent) | ||
325 | { | 72 | { |
326 | int ret; | 73 | int ret, i; |
327 | u64 blkno; | 74 | struct buffer_head *di_bh = NULL; |
328 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | 75 | struct buffer_head *eb_bh = NULL; |
329 | struct ocfs2_extent_map_entry *ent; | ||
330 | struct buffer_head *bh = NULL; | ||
331 | struct ocfs2_extent_block *eb; | ||
332 | struct ocfs2_dinode *di; | 76 | struct ocfs2_dinode *di; |
77 | struct ocfs2_extent_block *eb; | ||
333 | struct ocfs2_extent_list *el; | 78 | struct ocfs2_extent_list *el; |
79 | struct ocfs2_extent_rec *rec; | ||
80 | u32 coff; | ||
334 | 81 | ||
335 | spin_lock(&OCFS2_I(inode)->ip_lock); | 82 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, |
336 | ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); | 83 | &di_bh, OCFS2_BH_CACHED, inode); |
337 | if (ent) { | ||
338 | if (!ent->e_tree_depth) { | ||
339 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
340 | *ret_ent = ent; | ||
341 | return 0; | ||
342 | } | ||
343 | blkno = le64_to_cpu(ent->e_rec.e_blkno); | ||
344 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
345 | |||
346 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh, | ||
347 | OCFS2_BH_CACHED, inode); | ||
348 | if (ret) { | ||
349 | mlog_errno(ret); | ||
350 | if (bh) | ||
351 | brelse(bh); | ||
352 | return ret; | ||
353 | } | ||
354 | eb = (struct ocfs2_extent_block *)bh->b_data; | ||
355 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
356 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
357 | brelse(bh); | ||
358 | return -EIO; | ||
359 | } | ||
360 | el = &eb->h_list; | ||
361 | } else { | ||
362 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
363 | |||
364 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
365 | OCFS2_I(inode)->ip_blkno, &bh, | ||
366 | OCFS2_BH_CACHED, inode); | ||
367 | if (ret) { | ||
368 | mlog_errno(ret); | ||
369 | if (bh) | ||
370 | brelse(bh); | ||
371 | return ret; | ||
372 | } | ||
373 | di = (struct ocfs2_dinode *)bh->b_data; | ||
374 | if (!OCFS2_IS_VALID_DINODE(di)) { | ||
375 | brelse(bh); | ||
376 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di); | ||
377 | return -EIO; | ||
378 | } | ||
379 | el = &di->id2.i_list; | ||
380 | } | ||
381 | |||
382 | ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el); | ||
383 | brelse(bh); | ||
384 | if (ret) { | 84 | if (ret) { |
385 | mlog_errno(ret); | 85 | mlog_errno(ret); |
386 | return ret; | 86 | goto out; |
387 | } | 87 | } |
388 | 88 | ||
389 | ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); | 89 | di = (struct ocfs2_dinode *) di_bh->b_data; |
390 | if (!ent) { | 90 | el = &di->id2.i_list; |
391 | ret = -ESRCH; | ||
392 | mlog_errno(ret); | ||
393 | return ret; | ||
394 | } | ||
395 | |||
396 | /* FIXME: Make sure this isn't a corruption */ | ||
397 | BUG_ON(ent->e_tree_depth); | ||
398 | 91 | ||
399 | *ret_ent = ent; | 92 | if (el->l_tree_depth) { |
400 | 93 | ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); | |
401 | return 0; | 94 | if (ret) { |
402 | } | 95 | mlog_errno(ret); |
403 | 96 | goto out; | |
404 | /* | 97 | } |
405 | * Callers must hold ip_lock. This can insert pieces of the tree, | ||
406 | * thus racing lookup if the lock weren't held. | ||
407 | */ | ||
408 | static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, | ||
409 | struct ocfs2_extent_map_entry *ent) | ||
410 | { | ||
411 | struct rb_node **p, *parent; | ||
412 | struct ocfs2_extent_map_entry *old_ent; | ||
413 | |||
414 | old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos), | ||
415 | le32_to_cpu(ent->e_rec.e_clusters), | ||
416 | &p, &parent); | ||
417 | if (old_ent) | ||
418 | return -EEXIST; | ||
419 | |||
420 | rb_link_node(&ent->e_node, parent, p); | ||
421 | rb_insert_color(&ent->e_node, &em->em_extents); | ||
422 | |||
423 | return 0; | ||
424 | } | ||
425 | |||
426 | |||
427 | /* | ||
428 | * Simple rule: on any return code other than -EAGAIN, anything left | ||
429 | * in the insert_context will be freed. | ||
430 | * | ||
431 | * Simple rule #2: A return code of -EEXIST from this function or | ||
432 | * its calls to ocfs2_extent_map_insert_entry() signifies that another | ||
433 | * thread beat us to the insert. It is not an actual error, but it | ||
434 | * tells the caller we have no more work to do. | ||
435 | */ | ||
436 | static int ocfs2_extent_map_try_insert(struct inode *inode, | ||
437 | struct ocfs2_extent_rec *rec, | ||
438 | int tree_depth, | ||
439 | struct ocfs2_em_insert_context *ctxt) | ||
440 | { | ||
441 | int ret; | ||
442 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
443 | struct ocfs2_extent_map_entry *old_ent; | ||
444 | |||
445 | ctxt->need_left = 0; | ||
446 | ctxt->need_right = 0; | ||
447 | ctxt->old_ent = NULL; | ||
448 | |||
449 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
450 | ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); | ||
451 | if (!ret) { | ||
452 | ctxt->new_ent = NULL; | ||
453 | goto out_unlock; | ||
454 | } | ||
455 | |||
456 | /* Since insert_entry failed, the map MUST have old_ent */ | ||
457 | old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), | ||
458 | le32_to_cpu(rec->e_clusters), | ||
459 | NULL, NULL); | ||
460 | |||
461 | BUG_ON(!old_ent); | ||
462 | 98 | ||
463 | if (old_ent->e_tree_depth < tree_depth) { | 99 | eb = (struct ocfs2_extent_block *) eb_bh->b_data; |
464 | /* Another thread beat us to the lower tree_depth */ | 100 | el = &eb->h_list; |
465 | ret = -EEXIST; | ||
466 | goto out_unlock; | ||
467 | } | 101 | } |
468 | 102 | ||
469 | if (old_ent->e_tree_depth == tree_depth) { | 103 | i = ocfs2_search_extent_list(el, v_cluster); |
104 | if (i == -1) { | ||
470 | /* | 105 | /* |
471 | * Another thread beat us to this tree_depth. | 106 | * A hole was found. Return some canned values that |
472 | * Let's make sure we agree with that thread (the | 107 | * callers can key on. |
473 | * extent_rec should be identical). | ||
474 | */ | 108 | */ |
475 | if (!memcmp(rec, &old_ent->e_rec, | 109 | *p_cluster = 0; |
476 | sizeof(struct ocfs2_extent_rec))) | 110 | if (num_clusters) |
477 | ret = 0; | 111 | *num_clusters = 1; |
478 | else | 112 | } else { |
479 | /* FIXME: Should this be ESRCH/EBADR??? */ | 113 | rec = &el->l_recs[i]; |
480 | ret = -EEXIST; | ||
481 | 114 | ||
482 | goto out_unlock; | 115 | BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); |
483 | } | ||
484 | 116 | ||
485 | /* | 117 | if (!rec->e_blkno) { |
486 | * We do it in this order specifically so that no actual tree | 118 | ocfs2_error(inode->i_sb, "Inode %lu has bad extent " |
487 | * changes occur until we have all the pieces we need. We | 119 | "record (%u, %u, 0)", inode->i_ino, |
488 | * don't want malloc failures to leave an inconsistent tree. | 120 | le32_to_cpu(rec->e_cpos), |
489 | * Whenever we drop the lock, another process could be | ||
490 | * inserting. Also note that, if another process just beat us | ||
491 | * to an insert, we might not need the same pieces we needed | ||
492 | * the first go round. In the end, the pieces we need will | ||
493 | * be used, and the pieces we don't will be freed. | ||
494 | */ | ||
495 | ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) > | ||
496 | le32_to_cpu(old_ent->e_rec.e_cpos)); | ||
497 | ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) + | ||
498 | le32_to_cpu(old_ent->e_rec.e_clusters)) > | ||
499 | (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters))); | ||
500 | ret = -EAGAIN; | ||
501 | if (ctxt->need_left) { | ||
502 | if (!ctxt->left_ent) | ||
503 | goto out_unlock; | ||
504 | *(ctxt->left_ent) = *old_ent; | ||
505 | ctxt->left_ent->e_rec.e_clusters = | ||
506 | cpu_to_le32(le32_to_cpu(rec->e_cpos) - | ||
507 | le32_to_cpu(ctxt->left_ent->e_rec.e_cpos)); | ||
508 | } | ||
509 | if (ctxt->need_right) { | ||
510 | if (!ctxt->right_ent) | ||
511 | goto out_unlock; | ||
512 | *(ctxt->right_ent) = *old_ent; | ||
513 | ctxt->right_ent->e_rec.e_cpos = | ||
514 | cpu_to_le32(le32_to_cpu(rec->e_cpos) + | ||
515 | le32_to_cpu(rec->e_clusters)); | 121 | le32_to_cpu(rec->e_clusters)); |
516 | ctxt->right_ent->e_rec.e_clusters = | 122 | ret = -EROFS; |
517 | cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) + | 123 | goto out; |
518 | le32_to_cpu(old_ent->e_rec.e_clusters)) - | ||
519 | le32_to_cpu(ctxt->right_ent->e_rec.e_cpos)); | ||
520 | } | ||
521 | |||
522 | rb_erase(&old_ent->e_node, &em->em_extents); | ||
523 | /* Now that he's erased, set him up for deletion */ | ||
524 | ctxt->old_ent = old_ent; | ||
525 | |||
526 | if (ctxt->need_left) { | ||
527 | ret = ocfs2_extent_map_insert_entry(em, | ||
528 | ctxt->left_ent); | ||
529 | if (ret) | ||
530 | goto out_unlock; | ||
531 | ctxt->left_ent = NULL; | ||
532 | } | ||
533 | |||
534 | if (ctxt->need_right) { | ||
535 | ret = ocfs2_extent_map_insert_entry(em, | ||
536 | ctxt->right_ent); | ||
537 | if (ret) | ||
538 | goto out_unlock; | ||
539 | ctxt->right_ent = NULL; | ||
540 | } | ||
541 | |||
542 | ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); | ||
543 | |||
544 | if (!ret) | ||
545 | ctxt->new_ent = NULL; | ||
546 | |||
547 | out_unlock: | ||
548 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
549 | |||
550 | return ret; | ||
551 | } | ||
552 | |||
553 | |||
554 | static int ocfs2_extent_map_insert(struct inode *inode, | ||
555 | struct ocfs2_extent_rec *rec, | ||
556 | int tree_depth) | ||
557 | { | ||
558 | int ret; | ||
559 | struct ocfs2_em_insert_context ctxt = {0, }; | ||
560 | |||
561 | if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > | ||
562 | OCFS2_I(inode)->ip_map.em_clusters) { | ||
563 | ret = -EBADR; | ||
564 | mlog_errno(ret); | ||
565 | return ret; | ||
566 | } | ||
567 | |||
568 | /* Zero e_clusters means a truncated tail record. It better be EOF */ | ||
569 | if (!rec->e_clusters) { | ||
570 | if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) != | ||
571 | OCFS2_I(inode)->ip_map.em_clusters) { | ||
572 | ret = -EBADR; | ||
573 | mlog_errno(ret); | ||
574 | ocfs2_error(inode->i_sb, | ||
575 | "Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n", | ||
576 | (unsigned long long)le64_to_cpu(rec->e_blkno), | ||
577 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
578 | return ret; | ||
579 | } | ||
580 | |||
581 | /* Ignore the truncated tail */ | ||
582 | return 0; | ||
583 | } | ||
584 | |||
585 | ret = -ENOMEM; | ||
586 | ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, | ||
587 | GFP_NOFS); | ||
588 | if (!ctxt.new_ent) { | ||
589 | mlog_errno(ret); | ||
590 | return ret; | ||
591 | } | ||
592 | |||
593 | ctxt.new_ent->e_rec = *rec; | ||
594 | ctxt.new_ent->e_tree_depth = tree_depth; | ||
595 | |||
596 | do { | ||
597 | ret = -ENOMEM; | ||
598 | if (ctxt.need_left && !ctxt.left_ent) { | ||
599 | ctxt.left_ent = | ||
600 | kmem_cache_alloc(ocfs2_em_ent_cachep, | ||
601 | GFP_NOFS); | ||
602 | if (!ctxt.left_ent) | ||
603 | break; | ||
604 | } | ||
605 | if (ctxt.need_right && !ctxt.right_ent) { | ||
606 | ctxt.right_ent = | ||
607 | kmem_cache_alloc(ocfs2_em_ent_cachep, | ||
608 | GFP_NOFS); | ||
609 | if (!ctxt.right_ent) | ||
610 | break; | ||
611 | } | 124 | } |
612 | 125 | ||
613 | ret = ocfs2_extent_map_try_insert(inode, rec, | 126 | coff = v_cluster - le32_to_cpu(rec->e_cpos); |
614 | tree_depth, &ctxt); | ||
615 | } while (ret == -EAGAIN); | ||
616 | 127 | ||
617 | if ((ret < 0) && (ret != -EEXIST)) | 128 | *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, |
618 | mlog_errno(ret); | 129 | le64_to_cpu(rec->e_blkno)); |
619 | 130 | *p_cluster = *p_cluster + coff; | |
620 | if (ctxt.left_ent) | ||
621 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent); | ||
622 | if (ctxt.right_ent) | ||
623 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent); | ||
624 | if (ctxt.old_ent) | ||
625 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent); | ||
626 | if (ctxt.new_ent) | ||
627 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent); | ||
628 | |||
629 | return ret; | ||
630 | } | ||
631 | 131 | ||
632 | /* | 132 | if (num_clusters) |
633 | * Append this record to the tail of the extent map. It must be | 133 | *num_clusters = le32_to_cpu(rec->e_clusters) - coff; |
634 | * tree_depth 0. The record might be an extension of an existing | ||
635 | * record, and as such that needs to be handled. eg: | ||
636 | * | ||
637 | * Existing record in the extent map: | ||
638 | * | ||
639 | * cpos = 10, len = 10 | ||
640 | * |---------| | ||
641 | * | ||
642 | * New Record: | ||
643 | * | ||
644 | * cpos = 10, len = 20 | ||
645 | * |------------------| | ||
646 | * | ||
647 | * The passed record is the new on-disk record. The new_clusters value | ||
648 | * is how many clusters were added to the file. If the append is a | ||
649 | * contiguous append, the new_clusters has been added to | ||
650 | * rec->e_clusters. If the append is an entirely new extent, then | ||
651 | * rec->e_clusters is == new_clusters. | ||
652 | */ | ||
653 | int ocfs2_extent_map_append(struct inode *inode, | ||
654 | struct ocfs2_extent_rec *rec, | ||
655 | u32 new_clusters) | ||
656 | { | ||
657 | int ret; | ||
658 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
659 | struct ocfs2_extent_map_entry *ent; | ||
660 | struct ocfs2_extent_rec *old; | ||
661 | |||
662 | BUG_ON(!new_clusters); | ||
663 | BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters); | ||
664 | |||
665 | if (em->em_clusters < OCFS2_I(inode)->ip_clusters) { | ||
666 | /* | ||
667 | * Size changed underneath us on disk. Drop any | ||
668 | * straddling records and update our idea of | ||
669 | * i_clusters | ||
670 | */ | ||
671 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
672 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | ||
673 | } | 134 | } |
674 | 135 | ||
675 | mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) + | 136 | out: |
676 | le32_to_cpu(rec->e_clusters)) != | 137 | brelse(di_bh); |
677 | (em->em_clusters + new_clusters), | 138 | brelse(eb_bh); |
678 | "Inode %llu:\n" | ||
679 | "rec->e_cpos = %u + rec->e_clusters = %u = %u\n" | ||
680 | "em->em_clusters = %u + new_clusters = %u = %u\n", | ||
681 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
682 | le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters), | ||
683 | le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters), | ||
684 | em->em_clusters, new_clusters, | ||
685 | em->em_clusters + new_clusters); | ||
686 | |||
687 | em->em_clusters += new_clusters; | ||
688 | |||
689 | ret = -ENOENT; | ||
690 | if (le32_to_cpu(rec->e_clusters) > new_clusters) { | ||
691 | /* This is a contiguous append */ | ||
692 | ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1, | ||
693 | NULL, NULL); | ||
694 | if (ent) { | ||
695 | old = &ent->e_rec; | ||
696 | BUG_ON((le32_to_cpu(rec->e_cpos) + | ||
697 | le32_to_cpu(rec->e_clusters)) != | ||
698 | (le32_to_cpu(old->e_cpos) + | ||
699 | le32_to_cpu(old->e_clusters) + | ||
700 | new_clusters)); | ||
701 | if (ent->e_tree_depth == 0) { | ||
702 | BUG_ON(le32_to_cpu(old->e_cpos) != | ||
703 | le32_to_cpu(rec->e_cpos)); | ||
704 | BUG_ON(le64_to_cpu(old->e_blkno) != | ||
705 | le64_to_cpu(rec->e_blkno)); | ||
706 | ret = 0; | ||
707 | } | ||
708 | /* | ||
709 | * Let non-leafs fall through as -ENOENT to | ||
710 | * force insertion of the new leaf. | ||
711 | */ | ||
712 | le32_add_cpu(&old->e_clusters, new_clusters); | ||
713 | } | ||
714 | } | ||
715 | |||
716 | if (ret == -ENOENT) | ||
717 | ret = ocfs2_extent_map_insert(inode, rec, 0); | ||
718 | if (ret < 0) | ||
719 | mlog_errno(ret); | ||
720 | return ret; | 139 | return ret; |
721 | } | 140 | } |
722 | 141 | ||
723 | #if 0 | ||
724 | /* Code here is included but defined out as it completes the extent | ||
725 | * map api and may be used in the future. */ | ||
726 | |||
727 | /* | 142 | /* |
728 | * Look up the record containing this cluster offset. This record is | 143 | * This expects alloc_sem to be held. The allocation cannot change at |
729 | * part of the extent map. Do not free it. Any changes you make to | 144 | * all while the map is in the process of being updated. |
730 | * it will reflect in the extent map. So, if your last extent | ||
731 | * is (cpos = 10, clusters = 10) and you truncate the file by 5 | ||
732 | * clusters, you can do: | ||
733 | * | ||
734 | * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec); | ||
735 | * rec->e_clusters -= 5; | ||
736 | * | ||
737 | * The lookup does not read from disk. If the map isn't filled in for | ||
738 | * an entry, you won't find it. | ||
739 | * | ||
740 | * Also note that the returned record is valid until alloc_sem is | ||
741 | * dropped. After that, truncate and extend can happen. Caveat Emptor. | ||
742 | */ | 145 | */ |
743 | int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos, | 146 | int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, |
744 | struct ocfs2_extent_rec **rec, | 147 | int *ret_count) |
745 | int *tree_depth) | ||
746 | { | ||
747 | int ret = -ENOENT; | ||
748 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
749 | struct ocfs2_extent_map_entry *ent; | ||
750 | |||
751 | *rec = NULL; | ||
752 | |||
753 | if (cpos >= OCFS2_I(inode)->ip_clusters) | ||
754 | return -EINVAL; | ||
755 | |||
756 | if (cpos >= em->em_clusters) { | ||
757 | /* | ||
758 | * Size changed underneath us on disk. Drop any | ||
759 | * straddling records and update our idea of | ||
760 | * i_clusters | ||
761 | */ | ||
762 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
763 | em->em_clusters = OCFS2_I(inode)->ip_clusters ; | ||
764 | } | ||
765 | |||
766 | ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1, | ||
767 | NULL, NULL); | ||
768 | |||
769 | if (ent) { | ||
770 | *rec = &ent->e_rec; | ||
771 | if (tree_depth) | ||
772 | *tree_depth = ent->e_tree_depth; | ||
773 | ret = 0; | ||
774 | } | ||
775 | |||
776 | return ret; | ||
777 | } | ||
778 | |||
779 | int ocfs2_extent_map_get_clusters(struct inode *inode, | ||
780 | u32 v_cpos, int count, | ||
781 | u32 *p_cpos, int *ret_count) | ||
782 | { | ||
783 | int ret; | ||
784 | u32 coff, ccount; | ||
785 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
786 | struct ocfs2_extent_map_entry *ent = NULL; | ||
787 | |||
788 | *p_cpos = ccount = 0; | ||
789 | |||
790 | if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters) | ||
791 | return -EINVAL; | ||
792 | |||
793 | if ((v_cpos + count) > em->em_clusters) { | ||
794 | /* | ||
795 | * Size changed underneath us on disk. Drop any | ||
796 | * straddling records and update our idea of | ||
797 | * i_clusters | ||
798 | */ | ||
799 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
800 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | ||
801 | } | ||
802 | |||
803 | |||
804 | ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent); | ||
805 | if (ret) | ||
806 | return ret; | ||
807 | |||
808 | if (ent) { | ||
809 | /* We should never find ourselves straddling an interval */ | ||
810 | if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec, | ||
811 | v_cpos, | ||
812 | count)) | ||
813 | return -ESRCH; | ||
814 | |||
815 | coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos); | ||
816 | *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb, | ||
817 | le64_to_cpu(ent->e_rec.e_blkno)) + | ||
818 | coff; | ||
819 | |||
820 | if (ret_count) | ||
821 | *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff; | ||
822 | |||
823 | return 0; | ||
824 | } | ||
825 | |||
826 | |||
827 | return -ENOENT; | ||
828 | } | ||
829 | |||
830 | #endif /* 0 */ | ||
831 | |||
832 | int ocfs2_extent_map_get_blocks(struct inode *inode, | ||
833 | u64 v_blkno, int count, | ||
834 | u64 *p_blkno, int *ret_count) | ||
835 | { | 148 | { |
836 | int ret; | 149 | int ret; |
837 | u64 boff; | ||
838 | u32 cpos, clusters; | ||
839 | int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); | 150 | int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); |
840 | struct ocfs2_extent_map_entry *ent = NULL; | 151 | u32 cpos, num_clusters, p_cluster; |
841 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | 152 | u64 boff = 0; |
842 | struct ocfs2_extent_rec *rec; | ||
843 | |||
844 | *p_blkno = 0; | ||
845 | 153 | ||
846 | cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); | 154 | cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); |
847 | clusters = ocfs2_blocks_to_clusters(inode->i_sb, | ||
848 | (u64)count + bpc - 1); | ||
849 | if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) { | ||
850 | ret = -EINVAL; | ||
851 | mlog_errno(ret); | ||
852 | return ret; | ||
853 | } | ||
854 | 155 | ||
855 | if ((cpos + clusters) > em->em_clusters) { | 156 | ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters); |
856 | /* | ||
857 | * Size changed underneath us on disk. Drop any | ||
858 | * straddling records and update our idea of | ||
859 | * i_clusters | ||
860 | */ | ||
861 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
862 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | ||
863 | } | ||
864 | |||
865 | ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent); | ||
866 | if (ret) { | 157 | if (ret) { |
867 | mlog_errno(ret); | 158 | mlog_errno(ret); |
868 | return ret; | 159 | goto out; |
869 | } | 160 | } |
870 | 161 | ||
871 | if (ent) | 162 | /* |
872 | { | 163 | * p_cluster == 0 indicates a hole. |
873 | rec = &ent->e_rec; | 164 | */ |
874 | 165 | if (p_cluster) { | |
875 | /* We should never find ourselves straddling an interval */ | 166 | boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); |
876 | if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) { | ||
877 | ret = -ESRCH; | ||
878 | mlog_errno(ret); | ||
879 | return ret; | ||
880 | } | ||
881 | |||
882 | boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos - | ||
883 | le32_to_cpu(rec->e_cpos)); | ||
884 | boff += (v_blkno & (u64)(bpc - 1)); | 167 | boff += (v_blkno & (u64)(bpc - 1)); |
885 | *p_blkno = le64_to_cpu(rec->e_blkno) + boff; | ||
886 | |||
887 | if (ret_count) { | ||
888 | *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, | ||
889 | le32_to_cpu(rec->e_clusters)) - boff; | ||
890 | } | ||
891 | |||
892 | return 0; | ||
893 | } | 168 | } |
894 | 169 | ||
895 | return -ENOENT; | 170 | *p_blkno = boff; |
896 | } | ||
897 | |||
898 | int ocfs2_extent_map_init(struct inode *inode) | ||
899 | { | ||
900 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
901 | 171 | ||
902 | em->em_extents = RB_ROOT; | 172 | if (ret_count) { |
903 | em->em_clusters = 0; | 173 | *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); |
904 | 174 | *ret_count -= v_blkno & (u64)(bpc - 1); | |
905 | return 0; | ||
906 | } | ||
907 | |||
908 | /* Needs the lock */ | ||
909 | static void __ocfs2_extent_map_drop(struct inode *inode, | ||
910 | u32 new_clusters, | ||
911 | struct rb_node **free_head, | ||
912 | struct ocfs2_extent_map_entry **tail_ent) | ||
913 | { | ||
914 | struct rb_node *node, *next; | ||
915 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
916 | struct ocfs2_extent_map_entry *ent; | ||
917 | |||
918 | *free_head = NULL; | ||
919 | |||
920 | ent = NULL; | ||
921 | node = rb_last(&em->em_extents); | ||
922 | while (node) | ||
923 | { | ||
924 | next = rb_prev(node); | ||
925 | |||
926 | ent = rb_entry(node, struct ocfs2_extent_map_entry, | ||
927 | e_node); | ||
928 | if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters) | ||
929 | break; | ||
930 | |||
931 | rb_erase(&ent->e_node, &em->em_extents); | ||
932 | |||
933 | node->rb_right = *free_head; | ||
934 | *free_head = node; | ||
935 | |||
936 | ent = NULL; | ||
937 | node = next; | ||
938 | } | ||
939 | |||
940 | /* Do we have an entry straddling new_clusters? */ | ||
941 | if (tail_ent) { | ||
942 | if (ent && | ||
943 | ((le32_to_cpu(ent->e_rec.e_cpos) + | ||
944 | le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters)) | ||
945 | *tail_ent = ent; | ||
946 | else | ||
947 | *tail_ent = NULL; | ||
948 | } | 175 | } |
949 | } | ||
950 | |||
951 | static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head) | ||
952 | { | ||
953 | struct rb_node *node; | ||
954 | struct ocfs2_extent_map_entry *ent; | ||
955 | |||
956 | while (free_head) { | ||
957 | node = free_head; | ||
958 | free_head = node->rb_right; | ||
959 | 176 | ||
960 | ent = rb_entry(node, struct ocfs2_extent_map_entry, | 177 | out: |
961 | e_node); | 178 | return ret; |
962 | kmem_cache_free(ocfs2_em_ent_cachep, ent); | ||
963 | } | ||
964 | } | ||
965 | |||
966 | /* | ||
967 | * Remove all entries past new_clusters, inclusive of an entry that | ||
968 | * contains new_clusters. This is effectively a cache forget. | ||
969 | * | ||
970 | * If you want to also clip the last extent by some number of clusters, | ||
971 | * you need to call ocfs2_extent_map_trunc(). | ||
972 | * This code does not check or modify ip_clusters. | ||
973 | */ | ||
974 | int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters) | ||
975 | { | ||
976 | struct rb_node *free_head = NULL; | ||
977 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
978 | struct ocfs2_extent_map_entry *ent; | ||
979 | |||
980 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
981 | |||
982 | __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); | ||
983 | |||
984 | if (ent) { | ||
985 | rb_erase(&ent->e_node, &em->em_extents); | ||
986 | ent->e_node.rb_right = free_head; | ||
987 | free_head = &ent->e_node; | ||
988 | } | ||
989 | |||
990 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
991 | |||
992 | if (free_head) | ||
993 | __ocfs2_extent_map_drop_cleanup(free_head); | ||
994 | |||
995 | return 0; | ||
996 | } | ||
997 | |||
998 | /* | ||
999 | * Remove all entries past new_clusters and also clip any extent | ||
1000 | * straddling new_clusters, if there is one. This does not check | ||
1001 | * or modify ip_clusters | ||
1002 | */ | ||
1003 | int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters) | ||
1004 | { | ||
1005 | struct rb_node *free_head = NULL; | ||
1006 | struct ocfs2_extent_map_entry *ent = NULL; | ||
1007 | |||
1008 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
1009 | |||
1010 | __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); | ||
1011 | |||
1012 | if (ent) | ||
1013 | ent->e_rec.e_clusters = cpu_to_le32(new_clusters - | ||
1014 | le32_to_cpu(ent->e_rec.e_cpos)); | ||
1015 | |||
1016 | OCFS2_I(inode)->ip_map.em_clusters = new_clusters; | ||
1017 | |||
1018 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
1019 | |||
1020 | if (free_head) | ||
1021 | __ocfs2_extent_map_drop_cleanup(free_head); | ||
1022 | |||
1023 | return 0; | ||
1024 | } | ||
1025 | |||
1026 | int __init init_ocfs2_extent_maps(void) | ||
1027 | { | ||
1028 | ocfs2_em_ent_cachep = | ||
1029 | kmem_cache_create("ocfs2_em_ent", | ||
1030 | sizeof(struct ocfs2_extent_map_entry), | ||
1031 | 0, SLAB_HWCACHE_ALIGN, NULL, NULL); | ||
1032 | if (!ocfs2_em_ent_cachep) | ||
1033 | return -ENOMEM; | ||
1034 | |||
1035 | return 0; | ||
1036 | } | ||
1037 | |||
1038 | void exit_ocfs2_extent_maps(void) | ||
1039 | { | ||
1040 | kmem_cache_destroy(ocfs2_em_ent_cachep); | ||
1041 | } | 179 | } |
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index fa3745efa886..036e23251448 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h | |||
@@ -25,22 +25,7 @@ | |||
25 | #ifndef _EXTENT_MAP_H | 25 | #ifndef _EXTENT_MAP_H |
26 | #define _EXTENT_MAP_H | 26 | #define _EXTENT_MAP_H |
27 | 27 | ||
28 | int init_ocfs2_extent_maps(void); | 28 | int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, |
29 | void exit_ocfs2_extent_maps(void); | 29 | int *ret_count); |
30 | |||
31 | /* | ||
32 | * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem | ||
33 | * to be held. The allocation cannot change at all while the map is | ||
34 | * in the process of being updated. | ||
35 | */ | ||
36 | int ocfs2_extent_map_init(struct inode *inode); | ||
37 | int ocfs2_extent_map_append(struct inode *inode, | ||
38 | struct ocfs2_extent_rec *rec, | ||
39 | u32 new_clusters); | ||
40 | int ocfs2_extent_map_get_blocks(struct inode *inode, | ||
41 | u64 v_blkno, int count, | ||
42 | u64 *p_blkno, int *ret_count); | ||
43 | int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters); | ||
44 | int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters); | ||
45 | 30 | ||
46 | #endif /* _EXTENT_MAP_H */ | 31 | #endif /* _EXTENT_MAP_H */ |
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 08d57a3d4e83..5ff8549eb1a3 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c | |||
@@ -1003,9 +1003,6 @@ void ocfs2_clear_inode(struct inode *inode) | |||
1003 | "Clear inode of %llu, inode has io markers\n", | 1003 | "Clear inode of %llu, inode has io markers\n", |
1004 | (unsigned long long)oi->ip_blkno); | 1004 | (unsigned long long)oi->ip_blkno); |
1005 | 1005 | ||
1006 | ocfs2_extent_map_drop(inode, 0); | ||
1007 | ocfs2_extent_map_init(inode); | ||
1008 | |||
1009 | status = ocfs2_drop_inode_locks(inode); | 1006 | status = ocfs2_drop_inode_locks(inode); |
1010 | if (status < 0) | 1007 | if (status < 0) |
1011 | mlog_errno(status); | 1008 | mlog_errno(status); |
@@ -1102,8 +1099,7 @@ struct buffer_head *ocfs2_bread(struct inode *inode, | |||
1102 | return NULL; | 1099 | return NULL; |
1103 | } | 1100 | } |
1104 | 1101 | ||
1105 | tmperr = ocfs2_extent_map_get_blocks(inode, block, 1, | 1102 | tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL); |
1106 | &p_blkno, NULL); | ||
1107 | if (tmperr < 0) { | 1103 | if (tmperr < 0) { |
1108 | mlog_errno(tmperr); | 1104 | mlog_errno(tmperr); |
1109 | goto fail; | 1105 | goto fail; |
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 042ae20a7132..a9ced009cb9c 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h | |||
@@ -43,7 +43,6 @@ struct ocfs2_inode_info | |||
43 | spinlock_t ip_lock; | 43 | spinlock_t ip_lock; |
44 | u32 ip_open_count; | 44 | u32 ip_open_count; |
45 | u32 ip_clusters; | 45 | u32 ip_clusters; |
46 | struct ocfs2_extent_map ip_map; | ||
47 | struct list_head ip_io_markers; | 46 | struct list_head ip_io_markers; |
48 | 47 | ||
49 | struct mutex ip_io_mutex; | 48 | struct mutex ip_io_mutex; |
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 12445a31f733..2e2e04fe9738 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
@@ -670,8 +670,7 @@ static int ocfs2_force_read_journal(struct inode *inode) | |||
670 | (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) { | 670 | (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) { |
671 | 671 | ||
672 | status = ocfs2_extent_map_get_blocks(inode, v_blkno, | 672 | status = ocfs2_extent_map_get_blocks(inode, v_blkno, |
673 | 1, &p_blkno, | 673 | &p_blkno, &p_blocks); |
674 | &p_blocks); | ||
675 | if (status < 0) { | 674 | if (status < 0) { |
676 | mlog_errno(status); | 675 | mlog_errno(status); |
677 | goto bail; | 676 | goto bail; |
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index d65fef4a8bd8..5755e0748256 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c | |||
@@ -1511,8 +1511,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb, | |||
1511 | goto bail; | 1511 | goto bail; |
1512 | } | 1512 | } |
1513 | 1513 | ||
1514 | status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno, | 1514 | status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks); |
1515 | &p_blocks); | ||
1516 | if (status < 0) { | 1515 | if (status < 0) { |
1517 | mlog_errno(status); | 1516 | mlog_errno(status); |
1518 | goto bail; | 1517 | goto bail; |
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index fe7e1ecafca5..faeb53f2eecf 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
@@ -46,11 +46,6 @@ | |||
46 | #include "endian.h" | 46 | #include "endian.h" |
47 | #include "ocfs2_lockid.h" | 47 | #include "ocfs2_lockid.h" |
48 | 48 | ||
49 | struct ocfs2_extent_map { | ||
50 | u32 em_clusters; | ||
51 | struct rb_root em_extents; | ||
52 | }; | ||
53 | |||
54 | /* Most user visible OCFS2 inodes will have very few pieces of | 49 | /* Most user visible OCFS2 inodes will have very few pieces of |
55 | * metadata, but larger files (including bitmaps, etc) must be taken | 50 | * metadata, but larger files (including bitmaps, etc) must be taken |
56 | * into account when designing an access scheme. We allow a small | 51 | * into account when designing an access scheme. We allow a small |
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 2d3ac32cb74e..f4416e7330e1 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c | |||
@@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) | |||
197 | goto bail; | 197 | goto bail; |
198 | } | 198 | } |
199 | 199 | ||
200 | status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL); | 200 | status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL); |
201 | if (status < 0) { | 201 | if (status < 0) { |
202 | mlog_errno(status); | 202 | mlog_errno(status); |
203 | goto bail; | 203 | goto bail; |
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 16564ea6c141..6ab52351943a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
@@ -806,9 +806,6 @@ static int __init ocfs2_init(void) | |||
806 | 806 | ||
807 | ocfs2_print_version(); | 807 | ocfs2_print_version(); |
808 | 808 | ||
809 | if (init_ocfs2_extent_maps()) | ||
810 | return -ENOMEM; | ||
811 | |||
812 | status = init_ocfs2_uptodate_cache(); | 809 | status = init_ocfs2_uptodate_cache(); |
813 | if (status < 0) { | 810 | if (status < 0) { |
814 | mlog_errno(status); | 811 | mlog_errno(status); |
@@ -837,7 +834,6 @@ leave: | |||
837 | if (status < 0) { | 834 | if (status < 0) { |
838 | ocfs2_free_mem_caches(); | 835 | ocfs2_free_mem_caches(); |
839 | exit_ocfs2_uptodate_cache(); | 836 | exit_ocfs2_uptodate_cache(); |
840 | exit_ocfs2_extent_maps(); | ||
841 | } | 837 | } |
842 | 838 | ||
843 | mlog_exit(status); | 839 | mlog_exit(status); |
@@ -863,8 +859,6 @@ static void __exit ocfs2_exit(void) | |||
863 | 859 | ||
864 | unregister_filesystem(&ocfs2_fs_type); | 860 | unregister_filesystem(&ocfs2_fs_type); |
865 | 861 | ||
866 | exit_ocfs2_extent_maps(); | ||
867 | |||
868 | exit_ocfs2_uptodate_cache(); | 862 | exit_ocfs2_uptodate_cache(); |
869 | 863 | ||
870 | mlog_exit_void(); | 864 | mlog_exit_void(); |
@@ -948,7 +942,6 @@ static void ocfs2_inode_init_once(void *data, | |||
948 | oi->ip_flags = 0; | 942 | oi->ip_flags = 0; |
949 | oi->ip_open_count = 0; | 943 | oi->ip_open_count = 0; |
950 | spin_lock_init(&oi->ip_lock); | 944 | spin_lock_init(&oi->ip_lock); |
951 | ocfs2_extent_map_init(&oi->vfs_inode); | ||
952 | INIT_LIST_HEAD(&oi->ip_io_markers); | 945 | INIT_LIST_HEAD(&oi->ip_io_markers); |
953 | oi->ip_created_trans = 0; | 946 | oi->ip_created_trans = 0; |
954 | oi->ip_last_trans = 0; | 947 | oi->ip_last_trans = 0; |