diff options
Diffstat (limited to 'fs/ocfs2/extent_map.c')
-rw-r--r-- | fs/ocfs2/extent_map.c | 1233 |
1 files changed, 372 insertions, 861 deletions
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 80ac69f11d9f..ba2b2ab1c6e4 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c | |||
@@ -3,8 +3,7 @@ | |||
3 | * | 3 | * |
4 | * extent_map.c | 4 | * extent_map.c |
5 | * | 5 | * |
6 | * In-memory extent map for OCFS2. Man, this code was prettier in | 6 | * Block/Cluster mapping functions |
7 | * the library. | ||
8 | * | 7 | * |
9 | * Copyright (C) 2004 Oracle. All rights reserved. | 8 | * Copyright (C) 2004 Oracle. All rights reserved. |
10 | * | 9 | * |
@@ -26,1016 +25,528 @@ | |||
26 | #include <linux/fs.h> | 25 | #include <linux/fs.h> |
27 | #include <linux/init.h> | 26 | #include <linux/init.h> |
28 | #include <linux/types.h> | 27 | #include <linux/types.h> |
29 | #include <linux/slab.h> | ||
30 | #include <linux/rbtree.h> | ||
31 | 28 | ||
32 | #define MLOG_MASK_PREFIX ML_EXTENT_MAP | 29 | #define MLOG_MASK_PREFIX ML_EXTENT_MAP |
33 | #include <cluster/masklog.h> | 30 | #include <cluster/masklog.h> |
34 | 31 | ||
35 | #include "ocfs2.h" | 32 | #include "ocfs2.h" |
36 | 33 | ||
34 | #include "alloc.h" | ||
37 | #include "extent_map.h" | 35 | #include "extent_map.h" |
38 | #include "inode.h" | 36 | #include "inode.h" |
39 | #include "super.h" | 37 | #include "super.h" |
40 | 38 | ||
41 | #include "buffer_head_io.h" | 39 | #include "buffer_head_io.h" |
42 | 40 | ||
43 | |||
44 | /* | 41 | /* |
45 | * SUCK SUCK SUCK | 42 | * The extent caching implementation is intentionally trivial. |
46 | * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h | ||
47 | */ | ||
48 | |||
49 | struct ocfs2_extent_map_entry { | ||
50 | struct rb_node e_node; | ||
51 | int e_tree_depth; | ||
52 | struct ocfs2_extent_rec e_rec; | ||
53 | }; | ||
54 | |||
55 | struct ocfs2_em_insert_context { | ||
56 | int need_left; | ||
57 | int need_right; | ||
58 | struct ocfs2_extent_map_entry *new_ent; | ||
59 | struct ocfs2_extent_map_entry *old_ent; | ||
60 | struct ocfs2_extent_map_entry *left_ent; | ||
61 | struct ocfs2_extent_map_entry *right_ent; | ||
62 | }; | ||
63 | |||
64 | static struct kmem_cache *ocfs2_em_ent_cachep = NULL; | ||
65 | |||
66 | |||
67 | static struct ocfs2_extent_map_entry * | ||
68 | ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, | ||
69 | u32 cpos, u32 clusters, | ||
70 | struct rb_node ***ret_p, | ||
71 | struct rb_node **ret_parent); | ||
72 | static int ocfs2_extent_map_insert(struct inode *inode, | ||
73 | struct ocfs2_extent_rec *rec, | ||
74 | int tree_depth); | ||
75 | static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, | ||
76 | struct ocfs2_extent_map_entry *ent); | ||
77 | static int ocfs2_extent_map_find_leaf(struct inode *inode, | ||
78 | u32 cpos, u32 clusters, | ||
79 | struct ocfs2_extent_list *el); | ||
80 | static int ocfs2_extent_map_lookup_read(struct inode *inode, | ||
81 | u32 cpos, u32 clusters, | ||
82 | struct ocfs2_extent_map_entry **ret_ent); | ||
83 | static int ocfs2_extent_map_try_insert(struct inode *inode, | ||
84 | struct ocfs2_extent_rec *rec, | ||
85 | int tree_depth, | ||
86 | struct ocfs2_em_insert_context *ctxt); | ||
87 | |||
88 | /* returns 1 only if the rec contains all the given clusters -- that is that | ||
89 | * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos + | ||
90 | * clusters) is >= the argument's endpoint */ | ||
91 | static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec, | ||
92 | u32 cpos, u32 clusters) | ||
93 | { | ||
94 | if (le32_to_cpu(rec->e_cpos) > cpos) | ||
95 | return 0; | ||
96 | if (cpos + clusters > le32_to_cpu(rec->e_cpos) + | ||
97 | le32_to_cpu(rec->e_clusters)) | ||
98 | return 0; | ||
99 | return 1; | ||
100 | } | ||
101 | |||
102 | |||
103 | /* | ||
104 | * Find an entry in the tree that intersects the region passed in. | ||
105 | * Note that this will find straddled intervals, it is up to the | ||
106 | * callers to enforce any boundary conditions. | ||
107 | * | ||
108 | * Callers must hold ip_lock. This lookup is not guaranteed to return | ||
109 | * a tree_depth 0 match, and as such can race inserts if the lock | ||
110 | * were not held. | ||
111 | * | 43 | * |
112 | * The rb_node garbage lets insertion share the search. Trivial | 44 | * We only cache a small number of extents stored directly on the |
113 | * callers pass NULL. | 45 | * inode, so linear order operations are acceptable. If we ever want |
46 | * to increase the size of the extent map, then these algorithms must | ||
47 | * get smarter. | ||
114 | */ | 48 | */ |
115 | static struct ocfs2_extent_map_entry * | 49 | |
116 | ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, | 50 | void ocfs2_extent_map_init(struct inode *inode) |
117 | u32 cpos, u32 clusters, | ||
118 | struct rb_node ***ret_p, | ||
119 | struct rb_node **ret_parent) | ||
120 | { | 51 | { |
121 | struct rb_node **p = &em->em_extents.rb_node; | 52 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
122 | struct rb_node *parent = NULL; | ||
123 | struct ocfs2_extent_map_entry *ent = NULL; | ||
124 | |||
125 | while (*p) | ||
126 | { | ||
127 | parent = *p; | ||
128 | ent = rb_entry(parent, struct ocfs2_extent_map_entry, | ||
129 | e_node); | ||
130 | if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) { | ||
131 | p = &(*p)->rb_left; | ||
132 | ent = NULL; | ||
133 | } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) + | ||
134 | le32_to_cpu(ent->e_rec.e_clusters))) { | ||
135 | p = &(*p)->rb_right; | ||
136 | ent = NULL; | ||
137 | } else | ||
138 | break; | ||
139 | } | ||
140 | 53 | ||
141 | if (ret_p != NULL) | 54 | oi->ip_extent_map.em_num_items = 0; |
142 | *ret_p = p; | 55 | INIT_LIST_HEAD(&oi->ip_extent_map.em_list); |
143 | if (ret_parent != NULL) | ||
144 | *ret_parent = parent; | ||
145 | return ent; | ||
146 | } | 56 | } |
147 | 57 | ||
148 | /* | 58 | static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, |
149 | * Find the leaf containing the interval we want. While we're on our | 59 | unsigned int cpos, |
150 | * way down the tree, fill in every record we see at any depth, because | 60 | struct ocfs2_extent_map_item **ret_emi) |
151 | * we might want it later. | ||
152 | * | ||
153 | * Note that this code is run without ip_lock. That's because it | ||
154 | * sleeps while reading. If someone is also filling the extent list at | ||
155 | * the same time we are, we might have to restart. | ||
156 | */ | ||
157 | static int ocfs2_extent_map_find_leaf(struct inode *inode, | ||
158 | u32 cpos, u32 clusters, | ||
159 | struct ocfs2_extent_list *el) | ||
160 | { | 61 | { |
161 | int i, ret; | 62 | unsigned int range; |
162 | struct buffer_head *eb_bh = NULL; | 63 | struct ocfs2_extent_map_item *emi; |
163 | u64 blkno; | ||
164 | u32 rec_end; | ||
165 | struct ocfs2_extent_block *eb; | ||
166 | struct ocfs2_extent_rec *rec; | ||
167 | |||
168 | /* | ||
169 | * The bh data containing the el cannot change here, because | ||
170 | * we hold alloc_sem. So we can do this without other | ||
171 | * locks. | ||
172 | */ | ||
173 | while (el->l_tree_depth) | ||
174 | { | ||
175 | blkno = 0; | ||
176 | for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { | ||
177 | rec = &el->l_recs[i]; | ||
178 | rec_end = (le32_to_cpu(rec->e_cpos) + | ||
179 | le32_to_cpu(rec->e_clusters)); | ||
180 | |||
181 | ret = -EBADR; | ||
182 | if (rec_end > OCFS2_I(inode)->ip_clusters) { | ||
183 | mlog_errno(ret); | ||
184 | ocfs2_error(inode->i_sb, | ||
185 | "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n", | ||
186 | i, | ||
187 | (unsigned long long)le64_to_cpu(rec->e_blkno), | ||
188 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
189 | OCFS2_I(inode)->ip_clusters); | ||
190 | goto out_free; | ||
191 | } | ||
192 | |||
193 | if (rec_end <= cpos) { | ||
194 | ret = ocfs2_extent_map_insert(inode, rec, | ||
195 | le16_to_cpu(el->l_tree_depth)); | ||
196 | if (ret && (ret != -EEXIST)) { | ||
197 | mlog_errno(ret); | ||
198 | goto out_free; | ||
199 | } | ||
200 | continue; | ||
201 | } | ||
202 | if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) { | ||
203 | ret = ocfs2_extent_map_insert(inode, rec, | ||
204 | le16_to_cpu(el->l_tree_depth)); | ||
205 | if (ret && (ret != -EEXIST)) { | ||
206 | mlog_errno(ret); | ||
207 | goto out_free; | ||
208 | } | ||
209 | continue; | ||
210 | } | ||
211 | 64 | ||
212 | /* | 65 | *ret_emi = NULL; |
213 | * We've found a record that matches our | ||
214 | * interval. We don't insert it because we're | ||
215 | * about to traverse it. | ||
216 | */ | ||
217 | |||
218 | /* Check to see if we're stradling */ | ||
219 | ret = -ESRCH; | ||
220 | if (!ocfs2_extent_rec_contains_clusters(rec, | ||
221 | cpos, | ||
222 | clusters)) { | ||
223 | mlog_errno(ret); | ||
224 | goto out_free; | ||
225 | } | ||
226 | 66 | ||
227 | /* | 67 | list_for_each_entry(emi, &em->em_list, ei_list) { |
228 | * If we've already found a record, the el has | 68 | range = emi->ei_cpos + emi->ei_clusters; |
229 | * two records covering the same interval. | ||
230 | * EEEK! | ||
231 | */ | ||
232 | ret = -EBADR; | ||
233 | if (blkno) { | ||
234 | mlog_errno(ret); | ||
235 | ocfs2_error(inode->i_sb, | ||
236 | "Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n", | ||
237 | cpos, clusters, | ||
238 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
239 | (unsigned long long)blkno, i, | ||
240 | (unsigned long long)le64_to_cpu(rec->e_blkno)); | ||
241 | goto out_free; | ||
242 | } | ||
243 | 69 | ||
244 | blkno = le64_to_cpu(rec->e_blkno); | 70 | if (cpos >= emi->ei_cpos && cpos < range) { |
245 | } | 71 | list_move(&emi->ei_list, &em->em_list); |
246 | 72 | ||
247 | /* | 73 | *ret_emi = emi; |
248 | * We don't support holes, and we're still up | 74 | break; |
249 | * in the branches, so we'd better have found someone | ||
250 | */ | ||
251 | ret = -EBADR; | ||
252 | if (!blkno) { | ||
253 | ocfs2_error(inode->i_sb, | ||
254 | "No record found for (cpos = %u, clusters = %u) on inode %llu\n", | ||
255 | cpos, clusters, | ||
256 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
257 | mlog_errno(ret); | ||
258 | goto out_free; | ||
259 | } | ||
260 | |||
261 | if (eb_bh) { | ||
262 | brelse(eb_bh); | ||
263 | eb_bh = NULL; | ||
264 | } | ||
265 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
266 | blkno, &eb_bh, OCFS2_BH_CACHED, | ||
267 | inode); | ||
268 | if (ret) { | ||
269 | mlog_errno(ret); | ||
270 | goto out_free; | ||
271 | } | ||
272 | eb = (struct ocfs2_extent_block *)eb_bh->b_data; | ||
273 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
274 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
275 | ret = -EIO; | ||
276 | goto out_free; | ||
277 | } | 75 | } |
278 | el = &eb->h_list; | ||
279 | } | 76 | } |
77 | } | ||
280 | 78 | ||
281 | BUG_ON(el->l_tree_depth); | 79 | static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos, |
282 | 80 | unsigned int *phys, unsigned int *len, | |
283 | for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { | 81 | unsigned int *flags) |
284 | rec = &el->l_recs[i]; | 82 | { |
285 | 83 | unsigned int coff; | |
286 | if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > | 84 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
287 | OCFS2_I(inode)->ip_clusters) { | 85 | struct ocfs2_extent_map_item *emi; |
288 | ret = -EBADR; | 86 | |
289 | mlog_errno(ret); | 87 | spin_lock(&oi->ip_lock); |
290 | ocfs2_error(inode->i_sb, | 88 | |
291 | "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n", | 89 | __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi); |
292 | i, | 90 | if (emi) { |
293 | (unsigned long long)le64_to_cpu(rec->e_blkno), | 91 | coff = cpos - emi->ei_cpos; |
294 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 92 | *phys = emi->ei_phys + coff; |
295 | OCFS2_I(inode)->ip_clusters); | 93 | if (len) |
296 | return ret; | 94 | *len = emi->ei_clusters - coff; |
297 | } | 95 | if (flags) |
298 | 96 | *flags = emi->ei_flags; | |
299 | ret = ocfs2_extent_map_insert(inode, rec, | ||
300 | le16_to_cpu(el->l_tree_depth)); | ||
301 | if (ret && (ret != -EEXIST)) { | ||
302 | mlog_errno(ret); | ||
303 | goto out_free; | ||
304 | } | ||
305 | } | 97 | } |
306 | 98 | ||
307 | ret = 0; | 99 | spin_unlock(&oi->ip_lock); |
308 | 100 | ||
309 | out_free: | 101 | if (emi == NULL) |
310 | if (eb_bh) | 102 | return -ENOENT; |
311 | brelse(eb_bh); | ||
312 | 103 | ||
313 | return ret; | 104 | return 0; |
314 | } | 105 | } |
315 | 106 | ||
316 | /* | 107 | /* |
317 | * This lookup actually will read from disk. It has one invariant: | 108 | * Forget about all clusters equal to or greater than cpos. |
318 | * It will never re-traverse blocks. This means that all inserts should | ||
319 | * be new regions or more granular regions (both allowed by insert). | ||
320 | */ | 109 | */ |
321 | static int ocfs2_extent_map_lookup_read(struct inode *inode, | 110 | void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) |
322 | u32 cpos, | ||
323 | u32 clusters, | ||
324 | struct ocfs2_extent_map_entry **ret_ent) | ||
325 | { | 111 | { |
326 | int ret; | 112 | struct list_head *p, *n; |
327 | u64 blkno; | 113 | struct ocfs2_extent_map_item *emi; |
328 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | 114 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
329 | struct ocfs2_extent_map_entry *ent; | 115 | struct ocfs2_extent_map *em = &oi->ip_extent_map; |
330 | struct buffer_head *bh = NULL; | 116 | LIST_HEAD(tmp_list); |
331 | struct ocfs2_extent_block *eb; | 117 | unsigned int range; |
332 | struct ocfs2_dinode *di; | 118 | |
333 | struct ocfs2_extent_list *el; | 119 | spin_lock(&oi->ip_lock); |
334 | 120 | list_for_each_safe(p, n, &em->em_list) { | |
335 | spin_lock(&OCFS2_I(inode)->ip_lock); | 121 | emi = list_entry(p, struct ocfs2_extent_map_item, ei_list); |
336 | ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); | 122 | |
337 | if (ent) { | 123 | if (emi->ei_cpos >= cpos) { |
338 | if (!ent->e_tree_depth) { | 124 | /* Full truncate of this record. */ |
339 | spin_unlock(&OCFS2_I(inode)->ip_lock); | 125 | list_move(&emi->ei_list, &tmp_list); |
340 | *ret_ent = ent; | 126 | BUG_ON(em->em_num_items == 0); |
341 | return 0; | 127 | em->em_num_items--; |
342 | } | 128 | continue; |
343 | blkno = le64_to_cpu(ent->e_rec.e_blkno); | ||
344 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
345 | |||
346 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh, | ||
347 | OCFS2_BH_CACHED, inode); | ||
348 | if (ret) { | ||
349 | mlog_errno(ret); | ||
350 | if (bh) | ||
351 | brelse(bh); | ||
352 | return ret; | ||
353 | } | 129 | } |
354 | eb = (struct ocfs2_extent_block *)bh->b_data; | ||
355 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
356 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
357 | brelse(bh); | ||
358 | return -EIO; | ||
359 | } | ||
360 | el = &eb->h_list; | ||
361 | } else { | ||
362 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
363 | 130 | ||
364 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), | 131 | range = emi->ei_cpos + emi->ei_clusters; |
365 | OCFS2_I(inode)->ip_blkno, &bh, | 132 | if (range > cpos) { |
366 | OCFS2_BH_CACHED, inode); | 133 | /* Partial truncate */ |
367 | if (ret) { | 134 | emi->ei_clusters = cpos - emi->ei_cpos; |
368 | mlog_errno(ret); | ||
369 | if (bh) | ||
370 | brelse(bh); | ||
371 | return ret; | ||
372 | } | 135 | } |
373 | di = (struct ocfs2_dinode *)bh->b_data; | ||
374 | if (!OCFS2_IS_VALID_DINODE(di)) { | ||
375 | brelse(bh); | ||
376 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di); | ||
377 | return -EIO; | ||
378 | } | ||
379 | el = &di->id2.i_list; | ||
380 | } | ||
381 | |||
382 | ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el); | ||
383 | brelse(bh); | ||
384 | if (ret) { | ||
385 | mlog_errno(ret); | ||
386 | return ret; | ||
387 | } | 136 | } |
137 | spin_unlock(&oi->ip_lock); | ||
388 | 138 | ||
389 | ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); | 139 | list_for_each_safe(p, n, &tmp_list) { |
390 | if (!ent) { | 140 | emi = list_entry(p, struct ocfs2_extent_map_item, ei_list); |
391 | ret = -ESRCH; | 141 | list_del(&emi->ei_list); |
392 | mlog_errno(ret); | 142 | kfree(emi); |
393 | return ret; | ||
394 | } | 143 | } |
395 | |||
396 | /* FIXME: Make sure this isn't a corruption */ | ||
397 | BUG_ON(ent->e_tree_depth); | ||
398 | |||
399 | *ret_ent = ent; | ||
400 | |||
401 | return 0; | ||
402 | } | 144 | } |
403 | 145 | ||
404 | /* | 146 | /* |
405 | * Callers must hold ip_lock. This can insert pieces of the tree, | 147 | * Is any part of emi2 contained within emi1 |
406 | * thus racing lookup if the lock weren't held. | ||
407 | */ | 148 | */ |
408 | static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, | 149 | static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1, |
409 | struct ocfs2_extent_map_entry *ent) | 150 | struct ocfs2_extent_map_item *emi2) |
410 | { | 151 | { |
411 | struct rb_node **p, *parent; | 152 | unsigned int range1, range2; |
412 | struct ocfs2_extent_map_entry *old_ent; | ||
413 | 153 | ||
414 | old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos), | 154 | /* |
415 | le32_to_cpu(ent->e_rec.e_clusters), | 155 | * Check if logical start of emi2 is inside emi1 |
416 | &p, &parent); | 156 | */ |
417 | if (old_ent) | 157 | range1 = emi1->ei_cpos + emi1->ei_clusters; |
418 | return -EEXIST; | 158 | if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1) |
159 | return 1; | ||
419 | 160 | ||
420 | rb_link_node(&ent->e_node, parent, p); | 161 | /* |
421 | rb_insert_color(&ent->e_node, &em->em_extents); | 162 | * Check if logical end of emi2 is inside emi1 |
163 | */ | ||
164 | range2 = emi2->ei_cpos + emi2->ei_clusters; | ||
165 | if (range2 > emi1->ei_cpos && range2 <= range1) | ||
166 | return 1; | ||
422 | 167 | ||
423 | return 0; | 168 | return 0; |
424 | } | 169 | } |
425 | 170 | ||
171 | static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest, | ||
172 | struct ocfs2_extent_map_item *src) | ||
173 | { | ||
174 | dest->ei_cpos = src->ei_cpos; | ||
175 | dest->ei_phys = src->ei_phys; | ||
176 | dest->ei_clusters = src->ei_clusters; | ||
177 | dest->ei_flags = src->ei_flags; | ||
178 | } | ||
426 | 179 | ||
427 | /* | 180 | /* |
428 | * Simple rule: on any return code other than -EAGAIN, anything left | 181 | * Try to merge emi with ins. Returns 1 if merge succeeds, zero |
429 | * in the insert_context will be freed. | 182 | * otherwise. |
430 | * | ||
431 | * Simple rule #2: A return code of -EEXIST from this function or | ||
432 | * its calls to ocfs2_extent_map_insert_entry() signifies that another | ||
433 | * thread beat us to the insert. It is not an actual error, but it | ||
434 | * tells the caller we have no more work to do. | ||
435 | */ | 183 | */ |
436 | static int ocfs2_extent_map_try_insert(struct inode *inode, | 184 | static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi, |
437 | struct ocfs2_extent_rec *rec, | 185 | struct ocfs2_extent_map_item *ins) |
438 | int tree_depth, | ||
439 | struct ocfs2_em_insert_context *ctxt) | ||
440 | { | 186 | { |
441 | int ret; | ||
442 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
443 | struct ocfs2_extent_map_entry *old_ent; | ||
444 | |||
445 | ctxt->need_left = 0; | ||
446 | ctxt->need_right = 0; | ||
447 | ctxt->old_ent = NULL; | ||
448 | |||
449 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
450 | ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); | ||
451 | if (!ret) { | ||
452 | ctxt->new_ent = NULL; | ||
453 | goto out_unlock; | ||
454 | } | ||
455 | |||
456 | /* Since insert_entry failed, the map MUST have old_ent */ | ||
457 | old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), | ||
458 | le32_to_cpu(rec->e_clusters), | ||
459 | NULL, NULL); | ||
460 | |||
461 | BUG_ON(!old_ent); | ||
462 | |||
463 | if (old_ent->e_tree_depth < tree_depth) { | ||
464 | /* Another thread beat us to the lower tree_depth */ | ||
465 | ret = -EEXIST; | ||
466 | goto out_unlock; | ||
467 | } | ||
468 | |||
469 | if (old_ent->e_tree_depth == tree_depth) { | ||
470 | /* | ||
471 | * Another thread beat us to this tree_depth. | ||
472 | * Let's make sure we agree with that thread (the | ||
473 | * extent_rec should be identical). | ||
474 | */ | ||
475 | if (!memcmp(rec, &old_ent->e_rec, | ||
476 | sizeof(struct ocfs2_extent_rec))) | ||
477 | ret = 0; | ||
478 | else | ||
479 | /* FIXME: Should this be ESRCH/EBADR??? */ | ||
480 | ret = -EEXIST; | ||
481 | |||
482 | goto out_unlock; | ||
483 | } | ||
484 | |||
485 | /* | 187 | /* |
486 | * We do it in this order specifically so that no actual tree | 188 | * Handle contiguousness |
487 | * changes occur until we have all the pieces we need. We | ||
488 | * don't want malloc failures to leave an inconsistent tree. | ||
489 | * Whenever we drop the lock, another process could be | ||
490 | * inserting. Also note that, if another process just beat us | ||
491 | * to an insert, we might not need the same pieces we needed | ||
492 | * the first go round. In the end, the pieces we need will | ||
493 | * be used, and the pieces we don't will be freed. | ||
494 | */ | 189 | */ |
495 | ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) > | 190 | if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) && |
496 | le32_to_cpu(old_ent->e_rec.e_cpos)); | 191 | ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) && |
497 | ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) + | 192 | ins->ei_flags == emi->ei_flags) { |
498 | le32_to_cpu(old_ent->e_rec.e_clusters)) > | 193 | emi->ei_clusters += ins->ei_clusters; |
499 | (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters))); | 194 | return 1; |
500 | ret = -EAGAIN; | 195 | } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys && |
501 | if (ctxt->need_left) { | 196 | (ins->ei_cpos + ins->ei_clusters) == emi->ei_phys && |
502 | if (!ctxt->left_ent) | 197 | ins->ei_flags == emi->ei_flags) { |
503 | goto out_unlock; | 198 | emi->ei_phys = ins->ei_phys; |
504 | *(ctxt->left_ent) = *old_ent; | 199 | emi->ei_cpos = ins->ei_cpos; |
505 | ctxt->left_ent->e_rec.e_clusters = | 200 | emi->ei_clusters += ins->ei_clusters; |
506 | cpu_to_le32(le32_to_cpu(rec->e_cpos) - | 201 | return 1; |
507 | le32_to_cpu(ctxt->left_ent->e_rec.e_cpos)); | ||
508 | } | ||
509 | if (ctxt->need_right) { | ||
510 | if (!ctxt->right_ent) | ||
511 | goto out_unlock; | ||
512 | *(ctxt->right_ent) = *old_ent; | ||
513 | ctxt->right_ent->e_rec.e_cpos = | ||
514 | cpu_to_le32(le32_to_cpu(rec->e_cpos) + | ||
515 | le32_to_cpu(rec->e_clusters)); | ||
516 | ctxt->right_ent->e_rec.e_clusters = | ||
517 | cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) + | ||
518 | le32_to_cpu(old_ent->e_rec.e_clusters)) - | ||
519 | le32_to_cpu(ctxt->right_ent->e_rec.e_cpos)); | ||
520 | } | ||
521 | |||
522 | rb_erase(&old_ent->e_node, &em->em_extents); | ||
523 | /* Now that he's erased, set him up for deletion */ | ||
524 | ctxt->old_ent = old_ent; | ||
525 | |||
526 | if (ctxt->need_left) { | ||
527 | ret = ocfs2_extent_map_insert_entry(em, | ||
528 | ctxt->left_ent); | ||
529 | if (ret) | ||
530 | goto out_unlock; | ||
531 | ctxt->left_ent = NULL; | ||
532 | } | 202 | } |
533 | 203 | ||
534 | if (ctxt->need_right) { | 204 | /* |
535 | ret = ocfs2_extent_map_insert_entry(em, | 205 | * Overlapping extents - this shouldn't happen unless we've |
536 | ctxt->right_ent); | 206 | * split an extent to change it's flags. That is exceedingly |
537 | if (ret) | 207 | * rare, so there's no sense in trying to optimize it yet. |
538 | goto out_unlock; | 208 | */ |
539 | ctxt->right_ent = NULL; | 209 | if (ocfs2_ei_is_contained(emi, ins) || |
210 | ocfs2_ei_is_contained(ins, emi)) { | ||
211 | ocfs2_copy_emi_fields(emi, ins); | ||
212 | return 1; | ||
540 | } | 213 | } |
541 | 214 | ||
542 | ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); | 215 | /* No merge was possible. */ |
543 | 216 | return 0; | |
544 | if (!ret) | ||
545 | ctxt->new_ent = NULL; | ||
546 | |||
547 | out_unlock: | ||
548 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
549 | |||
550 | return ret; | ||
551 | } | 217 | } |
552 | 218 | ||
553 | 219 | /* | |
554 | static int ocfs2_extent_map_insert(struct inode *inode, | 220 | * In order to reduce complexity on the caller, this insert function |
555 | struct ocfs2_extent_rec *rec, | 221 | * is intentionally liberal in what it will accept. |
556 | int tree_depth) | 222 | * |
223 | * The only rule is that the truncate call *must* be used whenever | ||
224 | * records have been deleted. This avoids inserting overlapping | ||
225 | * records with different physical mappings. | ||
226 | */ | ||
227 | void ocfs2_extent_map_insert_rec(struct inode *inode, | ||
228 | struct ocfs2_extent_rec *rec) | ||
557 | { | 229 | { |
558 | int ret; | 230 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
559 | struct ocfs2_em_insert_context ctxt = {0, }; | 231 | struct ocfs2_extent_map *em = &oi->ip_extent_map; |
560 | 232 | struct ocfs2_extent_map_item *emi, *new_emi = NULL; | |
561 | if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > | 233 | struct ocfs2_extent_map_item ins; |
562 | OCFS2_I(inode)->ip_map.em_clusters) { | 234 | |
563 | ret = -EBADR; | 235 | ins.ei_cpos = le32_to_cpu(rec->e_cpos); |
564 | mlog_errno(ret); | 236 | ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb, |
565 | return ret; | 237 | le64_to_cpu(rec->e_blkno)); |
238 | ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters); | ||
239 | ins.ei_flags = rec->e_flags; | ||
240 | |||
241 | search: | ||
242 | spin_lock(&oi->ip_lock); | ||
243 | |||
244 | list_for_each_entry(emi, &em->em_list, ei_list) { | ||
245 | if (ocfs2_try_to_merge_extent_map(emi, &ins)) { | ||
246 | list_move(&emi->ei_list, &em->em_list); | ||
247 | spin_unlock(&oi->ip_lock); | ||
248 | goto out; | ||
249 | } | ||
566 | } | 250 | } |
567 | 251 | ||
568 | /* Zero e_clusters means a truncated tail record. It better be EOF */ | 252 | /* |
569 | if (!rec->e_clusters) { | 253 | * No item could be merged. |
570 | if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) != | 254 | * |
571 | OCFS2_I(inode)->ip_map.em_clusters) { | 255 | * Either allocate and add a new item, or overwrite the last recently |
572 | ret = -EBADR; | 256 | * inserted. |
573 | mlog_errno(ret); | 257 | */ |
574 | ocfs2_error(inode->i_sb, | ||
575 | "Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n", | ||
576 | (unsigned long long)le64_to_cpu(rec->e_blkno), | ||
577 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
578 | return ret; | ||
579 | } | ||
580 | 258 | ||
581 | /* Ignore the truncated tail */ | 259 | if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) { |
582 | return 0; | 260 | if (new_emi == NULL) { |
583 | } | 261 | spin_unlock(&oi->ip_lock); |
584 | 262 | ||
585 | ret = -ENOMEM; | 263 | new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS); |
586 | ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, | 264 | if (new_emi == NULL) |
587 | GFP_NOFS); | 265 | goto out; |
588 | if (!ctxt.new_ent) { | ||
589 | mlog_errno(ret); | ||
590 | return ret; | ||
591 | } | ||
592 | 266 | ||
593 | ctxt.new_ent->e_rec = *rec; | 267 | goto search; |
594 | ctxt.new_ent->e_tree_depth = tree_depth; | ||
595 | |||
596 | do { | ||
597 | ret = -ENOMEM; | ||
598 | if (ctxt.need_left && !ctxt.left_ent) { | ||
599 | ctxt.left_ent = | ||
600 | kmem_cache_alloc(ocfs2_em_ent_cachep, | ||
601 | GFP_NOFS); | ||
602 | if (!ctxt.left_ent) | ||
603 | break; | ||
604 | } | ||
605 | if (ctxt.need_right && !ctxt.right_ent) { | ||
606 | ctxt.right_ent = | ||
607 | kmem_cache_alloc(ocfs2_em_ent_cachep, | ||
608 | GFP_NOFS); | ||
609 | if (!ctxt.right_ent) | ||
610 | break; | ||
611 | } | 268 | } |
612 | 269 | ||
613 | ret = ocfs2_extent_map_try_insert(inode, rec, | 270 | ocfs2_copy_emi_fields(new_emi, &ins); |
614 | tree_depth, &ctxt); | 271 | list_add(&new_emi->ei_list, &em->em_list); |
615 | } while (ret == -EAGAIN); | 272 | em->em_num_items++; |
616 | 273 | new_emi = NULL; | |
617 | if ((ret < 0) && (ret != -EEXIST)) | 274 | } else { |
618 | mlog_errno(ret); | 275 | BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0); |
276 | emi = list_entry(em->em_list.prev, | ||
277 | struct ocfs2_extent_map_item, ei_list); | ||
278 | list_move(&emi->ei_list, &em->em_list); | ||
279 | ocfs2_copy_emi_fields(emi, &ins); | ||
280 | } | ||
619 | 281 | ||
620 | if (ctxt.left_ent) | 282 | spin_unlock(&oi->ip_lock); |
621 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent); | ||
622 | if (ctxt.right_ent) | ||
623 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent); | ||
624 | if (ctxt.old_ent) | ||
625 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent); | ||
626 | if (ctxt.new_ent) | ||
627 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent); | ||
628 | 283 | ||
629 | return ret; | 284 | out: |
285 | if (new_emi) | ||
286 | kfree(new_emi); | ||
630 | } | 287 | } |
631 | 288 | ||
632 | /* | 289 | /* |
633 | * Append this record to the tail of the extent map. It must be | 290 | * Return the 1st index within el which contains an extent start |
634 | * tree_depth 0. The record might be an extension of an existing | 291 | * larger than v_cluster. |
635 | * record, and as such that needs to be handled. eg: | ||
636 | * | ||
637 | * Existing record in the extent map: | ||
638 | * | ||
639 | * cpos = 10, len = 10 | ||
640 | * |---------| | ||
641 | * | ||
642 | * New Record: | ||
643 | * | ||
644 | * cpos = 10, len = 20 | ||
645 | * |------------------| | ||
646 | * | ||
647 | * The passed record is the new on-disk record. The new_clusters value | ||
648 | * is how many clusters were added to the file. If the append is a | ||
649 | * contiguous append, the new_clusters has been added to | ||
650 | * rec->e_clusters. If the append is an entirely new extent, then | ||
651 | * rec->e_clusters is == new_clusters. | ||
652 | */ | 292 | */ |
653 | int ocfs2_extent_map_append(struct inode *inode, | 293 | static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el, |
654 | struct ocfs2_extent_rec *rec, | 294 | u32 v_cluster) |
655 | u32 new_clusters) | ||
656 | { | 295 | { |
657 | int ret; | 296 | int i; |
658 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | 297 | struct ocfs2_extent_rec *rec; |
659 | struct ocfs2_extent_map_entry *ent; | ||
660 | struct ocfs2_extent_rec *old; | ||
661 | |||
662 | BUG_ON(!new_clusters); | ||
663 | BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters); | ||
664 | 298 | ||
665 | if (em->em_clusters < OCFS2_I(inode)->ip_clusters) { | 299 | for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { |
666 | /* | 300 | rec = &el->l_recs[i]; |
667 | * Size changed underneath us on disk. Drop any | ||
668 | * straddling records and update our idea of | ||
669 | * i_clusters | ||
670 | */ | ||
671 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
672 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | ||
673 | } | ||
674 | 301 | ||
675 | mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) + | 302 | if (v_cluster < le32_to_cpu(rec->e_cpos)) |
676 | le32_to_cpu(rec->e_clusters)) != | 303 | break; |
677 | (em->em_clusters + new_clusters), | ||
678 | "Inode %llu:\n" | ||
679 | "rec->e_cpos = %u + rec->e_clusters = %u = %u\n" | ||
680 | "em->em_clusters = %u + new_clusters = %u = %u\n", | ||
681 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
682 | le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters), | ||
683 | le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters), | ||
684 | em->em_clusters, new_clusters, | ||
685 | em->em_clusters + new_clusters); | ||
686 | |||
687 | em->em_clusters += new_clusters; | ||
688 | |||
689 | ret = -ENOENT; | ||
690 | if (le32_to_cpu(rec->e_clusters) > new_clusters) { | ||
691 | /* This is a contiguous append */ | ||
692 | ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1, | ||
693 | NULL, NULL); | ||
694 | if (ent) { | ||
695 | old = &ent->e_rec; | ||
696 | BUG_ON((le32_to_cpu(rec->e_cpos) + | ||
697 | le32_to_cpu(rec->e_clusters)) != | ||
698 | (le32_to_cpu(old->e_cpos) + | ||
699 | le32_to_cpu(old->e_clusters) + | ||
700 | new_clusters)); | ||
701 | if (ent->e_tree_depth == 0) { | ||
702 | BUG_ON(le32_to_cpu(old->e_cpos) != | ||
703 | le32_to_cpu(rec->e_cpos)); | ||
704 | BUG_ON(le64_to_cpu(old->e_blkno) != | ||
705 | le64_to_cpu(rec->e_blkno)); | ||
706 | ret = 0; | ||
707 | } | ||
708 | /* | ||
709 | * Let non-leafs fall through as -ENOENT to | ||
710 | * force insertion of the new leaf. | ||
711 | */ | ||
712 | le32_add_cpu(&old->e_clusters, new_clusters); | ||
713 | } | ||
714 | } | 304 | } |
715 | 305 | ||
716 | if (ret == -ENOENT) | 306 | return i; |
717 | ret = ocfs2_extent_map_insert(inode, rec, 0); | ||
718 | if (ret < 0) | ||
719 | mlog_errno(ret); | ||
720 | return ret; | ||
721 | } | 307 | } |
722 | 308 | ||
723 | #if 0 | ||
724 | /* Code here is included but defined out as it completes the extent | ||
725 | * map api and may be used in the future. */ | ||
726 | |||
727 | /* | 309 | /* |
728 | * Look up the record containing this cluster offset. This record is | 310 | * Figure out the size of a hole which starts at v_cluster within the given |
729 | * part of the extent map. Do not free it. Any changes you make to | 311 | * extent list. |
730 | * it will reflect in the extent map. So, if your last extent | ||
731 | * is (cpos = 10, clusters = 10) and you truncate the file by 5 | ||
732 | * clusters, you can do: | ||
733 | * | 312 | * |
734 | * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec); | 313 | * If there is no more allocation past v_cluster, we return the maximum |
735 | * rec->e_clusters -= 5; | 314 | * cluster size minus v_cluster. |
736 | * | 315 | * |
737 | * The lookup does not read from disk. If the map isn't filled in for | 316 | * If we have in-inode extents, then el points to the dinode list and |
738 | * an entry, you won't find it. | 317 | * eb_bh is NULL. Otherwise, eb_bh should point to the extent block |
739 | * | 318 | * containing el. |
740 | * Also note that the returned record is valid until alloc_sem is | ||
741 | * dropped. After that, truncate and extend can happen. Caveat Emptor. | ||
742 | */ | 319 | */ |
743 | int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos, | 320 | static int ocfs2_figure_hole_clusters(struct inode *inode, |
744 | struct ocfs2_extent_rec **rec, | 321 | struct ocfs2_extent_list *el, |
745 | int *tree_depth) | 322 | struct buffer_head *eb_bh, |
323 | u32 v_cluster, | ||
324 | u32 *num_clusters) | ||
746 | { | 325 | { |
747 | int ret = -ENOENT; | 326 | int ret, i; |
748 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | 327 | struct buffer_head *next_eb_bh = NULL; |
749 | struct ocfs2_extent_map_entry *ent; | 328 | struct ocfs2_extent_block *eb, *next_eb; |
750 | 329 | ||
751 | *rec = NULL; | 330 | i = ocfs2_search_for_hole_index(el, v_cluster); |
752 | 331 | ||
753 | if (cpos >= OCFS2_I(inode)->ip_clusters) | 332 | if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) { |
754 | return -EINVAL; | 333 | eb = (struct ocfs2_extent_block *)eb_bh->b_data; |
755 | 334 | ||
756 | if (cpos >= em->em_clusters) { | ||
757 | /* | 335 | /* |
758 | * Size changed underneath us on disk. Drop any | 336 | * Check the next leaf for any extents. |
759 | * straddling records and update our idea of | ||
760 | * i_clusters | ||
761 | */ | 337 | */ |
762 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
763 | em->em_clusters = OCFS2_I(inode)->ip_clusters ; | ||
764 | } | ||
765 | |||
766 | ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1, | ||
767 | NULL, NULL); | ||
768 | 338 | ||
769 | if (ent) { | 339 | if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) |
770 | *rec = &ent->e_rec; | 340 | goto no_more_extents; |
771 | if (tree_depth) | ||
772 | *tree_depth = ent->e_tree_depth; | ||
773 | ret = 0; | ||
774 | } | ||
775 | 341 | ||
776 | return ret; | 342 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), |
777 | } | 343 | le64_to_cpu(eb->h_next_leaf_blk), |
344 | &next_eb_bh, OCFS2_BH_CACHED, inode); | ||
345 | if (ret) { | ||
346 | mlog_errno(ret); | ||
347 | goto out; | ||
348 | } | ||
349 | next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data; | ||
778 | 350 | ||
779 | int ocfs2_extent_map_get_clusters(struct inode *inode, | 351 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) { |
780 | u32 v_cpos, int count, | 352 | ret = -EROFS; |
781 | u32 *p_cpos, int *ret_count) | 353 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb); |
782 | { | 354 | goto out; |
783 | int ret; | 355 | } |
784 | u32 coff, ccount; | ||
785 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
786 | struct ocfs2_extent_map_entry *ent = NULL; | ||
787 | 356 | ||
788 | *p_cpos = ccount = 0; | 357 | el = &next_eb->h_list; |
789 | 358 | ||
790 | if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters) | 359 | i = ocfs2_search_for_hole_index(el, v_cluster); |
791 | return -EINVAL; | 360 | } |
792 | 361 | ||
793 | if ((v_cpos + count) > em->em_clusters) { | 362 | no_more_extents: |
363 | if (i == le16_to_cpu(el->l_next_free_rec)) { | ||
794 | /* | 364 | /* |
795 | * Size changed underneath us on disk. Drop any | 365 | * We're at the end of our existing allocation. Just |
796 | * straddling records and update our idea of | 366 | * return the maximum number of clusters we could |
797 | * i_clusters | 367 | * possibly allocate. |
798 | */ | 368 | */ |
799 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | 369 | *num_clusters = UINT_MAX - v_cluster; |
800 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | 370 | } else { |
371 | *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster; | ||
801 | } | 372 | } |
802 | 373 | ||
374 | ret = 0; | ||
375 | out: | ||
376 | brelse(next_eb_bh); | ||
377 | return ret; | ||
378 | } | ||
803 | 379 | ||
804 | ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent); | 380 | /* |
805 | if (ret) | 381 | * Return the index of the extent record which contains cluster #v_cluster. |
806 | return ret; | 382 | * -1 is returned if it was not found. |
383 | * | ||
384 | * Should work fine on interior and exterior nodes. | ||
385 | */ | ||
386 | static int ocfs2_search_extent_list(struct ocfs2_extent_list *el, | ||
387 | u32 v_cluster) | ||
388 | { | ||
389 | int ret = -1; | ||
390 | int i; | ||
391 | struct ocfs2_extent_rec *rec; | ||
392 | u32 rec_end, rec_start, clusters; | ||
807 | 393 | ||
808 | if (ent) { | 394 | for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { |
809 | /* We should never find ourselves straddling an interval */ | 395 | rec = &el->l_recs[i]; |
810 | if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec, | ||
811 | v_cpos, | ||
812 | count)) | ||
813 | return -ESRCH; | ||
814 | 396 | ||
815 | coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos); | 397 | rec_start = le32_to_cpu(rec->e_cpos); |
816 | *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb, | 398 | clusters = ocfs2_rec_clusters(el, rec); |
817 | le64_to_cpu(ent->e_rec.e_blkno)) + | ||
818 | coff; | ||
819 | 399 | ||
820 | if (ret_count) | 400 | rec_end = rec_start + clusters; |
821 | *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff; | ||
822 | 401 | ||
823 | return 0; | 402 | if (v_cluster >= rec_start && v_cluster < rec_end) { |
403 | ret = i; | ||
404 | break; | ||
405 | } | ||
824 | } | 406 | } |
825 | 407 | ||
826 | 408 | return ret; | |
827 | return -ENOENT; | ||
828 | } | 409 | } |
829 | 410 | ||
830 | #endif /* 0 */ | 411 | int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, |
831 | 412 | u32 *p_cluster, u32 *num_clusters, | |
832 | int ocfs2_extent_map_get_blocks(struct inode *inode, | 413 | unsigned int *extent_flags) |
833 | u64 v_blkno, int count, | ||
834 | u64 *p_blkno, int *ret_count) | ||
835 | { | 414 | { |
836 | int ret; | 415 | int ret, i; |
837 | u64 boff; | 416 | unsigned int flags = 0; |
838 | u32 cpos, clusters; | 417 | struct buffer_head *di_bh = NULL; |
839 | int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); | 418 | struct buffer_head *eb_bh = NULL; |
840 | struct ocfs2_extent_map_entry *ent = NULL; | 419 | struct ocfs2_dinode *di; |
841 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | 420 | struct ocfs2_extent_block *eb; |
421 | struct ocfs2_extent_list *el; | ||
842 | struct ocfs2_extent_rec *rec; | 422 | struct ocfs2_extent_rec *rec; |
423 | u32 coff; | ||
843 | 424 | ||
844 | *p_blkno = 0; | 425 | ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster, |
845 | 426 | num_clusters, extent_flags); | |
846 | cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); | 427 | if (ret == 0) |
847 | clusters = ocfs2_blocks_to_clusters(inode->i_sb, | 428 | goto out; |
848 | (u64)count + bpc - 1); | ||
849 | if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) { | ||
850 | ret = -EINVAL; | ||
851 | mlog_errno(ret); | ||
852 | return ret; | ||
853 | } | ||
854 | |||
855 | if ((cpos + clusters) > em->em_clusters) { | ||
856 | /* | ||
857 | * Size changed underneath us on disk. Drop any | ||
858 | * straddling records and update our idea of | ||
859 | * i_clusters | ||
860 | */ | ||
861 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
862 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | ||
863 | } | ||
864 | 429 | ||
865 | ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent); | 430 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, |
431 | &di_bh, OCFS2_BH_CACHED, inode); | ||
866 | if (ret) { | 432 | if (ret) { |
867 | mlog_errno(ret); | 433 | mlog_errno(ret); |
868 | return ret; | 434 | goto out; |
869 | } | 435 | } |
870 | 436 | ||
871 | if (ent) | 437 | di = (struct ocfs2_dinode *) di_bh->b_data; |
872 | { | 438 | el = &di->id2.i_list; |
873 | rec = &ent->e_rec; | ||
874 | 439 | ||
875 | /* We should never find ourselves straddling an interval */ | 440 | if (el->l_tree_depth) { |
876 | if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) { | 441 | ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); |
877 | ret = -ESRCH; | 442 | if (ret) { |
878 | mlog_errno(ret); | 443 | mlog_errno(ret); |
879 | return ret; | 444 | goto out; |
880 | } | 445 | } |
881 | 446 | ||
882 | boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos - | 447 | eb = (struct ocfs2_extent_block *) eb_bh->b_data; |
883 | le32_to_cpu(rec->e_cpos)); | 448 | el = &eb->h_list; |
884 | boff += (v_blkno & (u64)(bpc - 1)); | ||
885 | *p_blkno = le64_to_cpu(rec->e_blkno) + boff; | ||
886 | 449 | ||
887 | if (ret_count) { | 450 | if (el->l_tree_depth) { |
888 | *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, | 451 | ocfs2_error(inode->i_sb, |
889 | le32_to_cpu(rec->e_clusters)) - boff; | 452 | "Inode %lu has non zero tree depth in " |
453 | "leaf block %llu\n", inode->i_ino, | ||
454 | (unsigned long long)eb_bh->b_blocknr); | ||
455 | ret = -EROFS; | ||
456 | goto out; | ||
890 | } | 457 | } |
891 | |||
892 | return 0; | ||
893 | } | 458 | } |
894 | 459 | ||
895 | return -ENOENT; | 460 | i = ocfs2_search_extent_list(el, v_cluster); |
896 | } | 461 | if (i == -1) { |
897 | 462 | /* | |
898 | int ocfs2_extent_map_init(struct inode *inode) | 463 | * A hole was found. Return some canned values that |
899 | { | 464 | * callers can key on. If asked for, num_clusters will |
900 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | 465 | * be populated with the size of the hole. |
901 | 466 | */ | |
902 | em->em_extents = RB_ROOT; | 467 | *p_cluster = 0; |
903 | em->em_clusters = 0; | 468 | if (num_clusters) { |
904 | 469 | ret = ocfs2_figure_hole_clusters(inode, el, eb_bh, | |
905 | return 0; | 470 | v_cluster, |
906 | } | 471 | num_clusters); |
907 | 472 | if (ret) { | |
908 | /* Needs the lock */ | 473 | mlog_errno(ret); |
909 | static void __ocfs2_extent_map_drop(struct inode *inode, | 474 | goto out; |
910 | u32 new_clusters, | 475 | } |
911 | struct rb_node **free_head, | 476 | } |
912 | struct ocfs2_extent_map_entry **tail_ent) | 477 | } else { |
913 | { | 478 | rec = &el->l_recs[i]; |
914 | struct rb_node *node, *next; | ||
915 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
916 | struct ocfs2_extent_map_entry *ent; | ||
917 | 479 | ||
918 | *free_head = NULL; | 480 | BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); |
919 | 481 | ||
920 | ent = NULL; | 482 | if (!rec->e_blkno) { |
921 | node = rb_last(&em->em_extents); | 483 | ocfs2_error(inode->i_sb, "Inode %lu has bad extent " |
922 | while (node) | 484 | "record (%u, %u, 0)", inode->i_ino, |
923 | { | 485 | le32_to_cpu(rec->e_cpos), |
924 | next = rb_prev(node); | 486 | ocfs2_rec_clusters(el, rec)); |
487 | ret = -EROFS; | ||
488 | goto out; | ||
489 | } | ||
925 | 490 | ||
926 | ent = rb_entry(node, struct ocfs2_extent_map_entry, | 491 | coff = v_cluster - le32_to_cpu(rec->e_cpos); |
927 | e_node); | ||
928 | if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters) | ||
929 | break; | ||
930 | 492 | ||
931 | rb_erase(&ent->e_node, &em->em_extents); | 493 | *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, |
494 | le64_to_cpu(rec->e_blkno)); | ||
495 | *p_cluster = *p_cluster + coff; | ||
932 | 496 | ||
933 | node->rb_right = *free_head; | 497 | if (num_clusters) |
934 | *free_head = node; | 498 | *num_clusters = ocfs2_rec_clusters(el, rec) - coff; |
935 | 499 | ||
936 | ent = NULL; | 500 | flags = rec->e_flags; |
937 | node = next; | ||
938 | } | ||
939 | 501 | ||
940 | /* Do we have an entry straddling new_clusters? */ | 502 | ocfs2_extent_map_insert_rec(inode, rec); |
941 | if (tail_ent) { | ||
942 | if (ent && | ||
943 | ((le32_to_cpu(ent->e_rec.e_cpos) + | ||
944 | le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters)) | ||
945 | *tail_ent = ent; | ||
946 | else | ||
947 | *tail_ent = NULL; | ||
948 | } | 503 | } |
949 | } | ||
950 | |||
951 | static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head) | ||
952 | { | ||
953 | struct rb_node *node; | ||
954 | struct ocfs2_extent_map_entry *ent; | ||
955 | 504 | ||
956 | while (free_head) { | 505 | if (extent_flags) |
957 | node = free_head; | 506 | *extent_flags = flags; |
958 | free_head = node->rb_right; | ||
959 | 507 | ||
960 | ent = rb_entry(node, struct ocfs2_extent_map_entry, | 508 | out: |
961 | e_node); | 509 | brelse(di_bh); |
962 | kmem_cache_free(ocfs2_em_ent_cachep, ent); | 510 | brelse(eb_bh); |
963 | } | 511 | return ret; |
964 | } | 512 | } |
965 | 513 | ||
966 | /* | 514 | /* |
967 | * Remove all entries past new_clusters, inclusive of an entry that | 515 | * This expects alloc_sem to be held. The allocation cannot change at |
968 | * contains new_clusters. This is effectively a cache forget. | 516 | * all while the map is in the process of being updated. |
969 | * | ||
970 | * If you want to also clip the last extent by some number of clusters, | ||
971 | * you need to call ocfs2_extent_map_trunc(). | ||
972 | * This code does not check or modify ip_clusters. | ||
973 | */ | 517 | */ |
974 | int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters) | 518 | int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, |
519 | u64 *ret_count, unsigned int *extent_flags) | ||
975 | { | 520 | { |
976 | struct rb_node *free_head = NULL; | 521 | int ret; |
977 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | 522 | int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); |
978 | struct ocfs2_extent_map_entry *ent; | 523 | u32 cpos, num_clusters, p_cluster; |
979 | 524 | u64 boff = 0; | |
980 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
981 | 525 | ||
982 | __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); | 526 | cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); |
983 | 527 | ||
984 | if (ent) { | 528 | ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters, |
985 | rb_erase(&ent->e_node, &em->em_extents); | 529 | extent_flags); |
986 | ent->e_node.rb_right = free_head; | 530 | if (ret) { |
987 | free_head = &ent->e_node; | 531 | mlog_errno(ret); |
532 | goto out; | ||
988 | } | 533 | } |
989 | 534 | ||
990 | spin_unlock(&OCFS2_I(inode)->ip_lock); | 535 | /* |
991 | 536 | * p_cluster == 0 indicates a hole. | |
992 | if (free_head) | 537 | */ |
993 | __ocfs2_extent_map_drop_cleanup(free_head); | 538 | if (p_cluster) { |
994 | 539 | boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); | |
995 | return 0; | 540 | boff += (v_blkno & (u64)(bpc - 1)); |
996 | } | 541 | } |
997 | |||
998 | /* | ||
999 | * Remove all entries past new_clusters and also clip any extent | ||
1000 | * straddling new_clusters, if there is one. This does not check | ||
1001 | * or modify ip_clusters | ||
1002 | */ | ||
1003 | int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters) | ||
1004 | { | ||
1005 | struct rb_node *free_head = NULL; | ||
1006 | struct ocfs2_extent_map_entry *ent = NULL; | ||
1007 | |||
1008 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
1009 | |||
1010 | __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); | ||
1011 | |||
1012 | if (ent) | ||
1013 | ent->e_rec.e_clusters = cpu_to_le32(new_clusters - | ||
1014 | le32_to_cpu(ent->e_rec.e_cpos)); | ||
1015 | |||
1016 | OCFS2_I(inode)->ip_map.em_clusters = new_clusters; | ||
1017 | |||
1018 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
1019 | |||
1020 | if (free_head) | ||
1021 | __ocfs2_extent_map_drop_cleanup(free_head); | ||
1022 | |||
1023 | return 0; | ||
1024 | } | ||
1025 | 542 | ||
1026 | int __init init_ocfs2_extent_maps(void) | 543 | *p_blkno = boff; |
1027 | { | ||
1028 | ocfs2_em_ent_cachep = | ||
1029 | kmem_cache_create("ocfs2_em_ent", | ||
1030 | sizeof(struct ocfs2_extent_map_entry), | ||
1031 | 0, SLAB_HWCACHE_ALIGN, NULL, NULL); | ||
1032 | if (!ocfs2_em_ent_cachep) | ||
1033 | return -ENOMEM; | ||
1034 | 544 | ||
1035 | return 0; | 545 | if (ret_count) { |
1036 | } | 546 | *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); |
547 | *ret_count -= v_blkno & (u64)(bpc - 1); | ||
548 | } | ||
1037 | 549 | ||
1038 | void exit_ocfs2_extent_maps(void) | 550 | out: |
1039 | { | 551 | return ret; |
1040 | kmem_cache_destroy(ocfs2_em_ent_cachep); | ||
1041 | } | 552 | } |