diff options
author | Mark Fasheh <mark.fasheh@oracle.com> | 2005-12-15 17:31:24 -0500 |
---|---|---|
committer | Joel Becker <joel.becker@oracle.com> | 2006-01-03 14:45:47 -0500 |
commit | ccd979bdbce9fba8412beb3f1de68a9d0171b12c (patch) | |
tree | c50ed941849ce06ccadd4ce27599b3ef9fdbe2ae /fs/ocfs2/extent_map.c | |
parent | 8df08c89c668e1bd922a053fdb5ba1fadbecbb38 (diff) |
[PATCH] OCFS2: The Second Oracle Cluster Filesystem
The OCFS2 file system module.
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Diffstat (limited to 'fs/ocfs2/extent_map.c')
-rw-r--r-- | fs/ocfs2/extent_map.c | 994 |
1 files changed, 994 insertions, 0 deletions
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c new file mode 100644 index 000000000000..f2fb40cd296a --- /dev/null +++ b/fs/ocfs2/extent_map.c | |||
@@ -0,0 +1,994 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * extent_map.c | ||
5 | * | ||
6 | * In-memory extent map for OCFS2. Man, this code was prettier in | ||
7 | * the library. | ||
8 | * | ||
9 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU General Public | ||
13 | * License, version 2, as published by the Free Software Foundation. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/init.h> | ||
28 | #include <linux/types.h> | ||
29 | #include <linux/slab.h> | ||
30 | #include <linux/rbtree.h> | ||
31 | |||
32 | #define MLOG_MASK_PREFIX ML_EXTENT_MAP | ||
33 | #include <cluster/masklog.h> | ||
34 | |||
35 | #include "ocfs2.h" | ||
36 | |||
37 | #include "extent_map.h" | ||
38 | #include "inode.h" | ||
39 | #include "super.h" | ||
40 | |||
41 | #include "buffer_head_io.h" | ||
42 | |||
43 | |||
44 | /* | ||
45 | * SUCK SUCK SUCK | ||
46 | * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h | ||
47 | */ | ||
48 | |||
49 | struct ocfs2_extent_map_entry { | ||
50 | struct rb_node e_node; | ||
51 | int e_tree_depth; | ||
52 | struct ocfs2_extent_rec e_rec; | ||
53 | }; | ||
54 | |||
55 | struct ocfs2_em_insert_context { | ||
56 | int need_left; | ||
57 | int need_right; | ||
58 | struct ocfs2_extent_map_entry *new_ent; | ||
59 | struct ocfs2_extent_map_entry *old_ent; | ||
60 | struct ocfs2_extent_map_entry *left_ent; | ||
61 | struct ocfs2_extent_map_entry *right_ent; | ||
62 | }; | ||
63 | |||
64 | static kmem_cache_t *ocfs2_em_ent_cachep = NULL; | ||
65 | |||
66 | |||
67 | static struct ocfs2_extent_map_entry * | ||
68 | ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, | ||
69 | u32 cpos, u32 clusters, | ||
70 | struct rb_node ***ret_p, | ||
71 | struct rb_node **ret_parent); | ||
72 | static int ocfs2_extent_map_insert(struct inode *inode, | ||
73 | struct ocfs2_extent_rec *rec, | ||
74 | int tree_depth); | ||
75 | static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, | ||
76 | struct ocfs2_extent_map_entry *ent); | ||
77 | static int ocfs2_extent_map_find_leaf(struct inode *inode, | ||
78 | u32 cpos, u32 clusters, | ||
79 | struct ocfs2_extent_list *el); | ||
80 | static int ocfs2_extent_map_lookup_read(struct inode *inode, | ||
81 | u32 cpos, u32 clusters, | ||
82 | struct ocfs2_extent_map_entry **ret_ent); | ||
83 | static int ocfs2_extent_map_try_insert(struct inode *inode, | ||
84 | struct ocfs2_extent_rec *rec, | ||
85 | int tree_depth, | ||
86 | struct ocfs2_em_insert_context *ctxt); | ||
87 | |||
88 | /* returns 1 only if the rec contains all the given clusters -- that is that | ||
89 | * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos + | ||
90 | * clusters) is >= the argument's endpoint */ | ||
91 | static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec, | ||
92 | u32 cpos, u32 clusters) | ||
93 | { | ||
94 | if (le32_to_cpu(rec->e_cpos) > cpos) | ||
95 | return 0; | ||
96 | if (cpos + clusters > le32_to_cpu(rec->e_cpos) + | ||
97 | le32_to_cpu(rec->e_clusters)) | ||
98 | return 0; | ||
99 | return 1; | ||
100 | } | ||
101 | |||
102 | |||
103 | /* | ||
104 | * Find an entry in the tree that intersects the region passed in. | ||
105 | * Note that this will find straddled intervals, it is up to the | ||
106 | * callers to enforce any boundary conditions. | ||
107 | * | ||
108 | * Callers must hold ip_lock. This lookup is not guaranteed to return | ||
109 | * a tree_depth 0 match, and as such can race inserts if the lock | ||
110 | * were not held. | ||
111 | * | ||
112 | * The rb_node garbage lets insertion share the search. Trivial | ||
113 | * callers pass NULL. | ||
114 | */ | ||
115 | static struct ocfs2_extent_map_entry * | ||
116 | ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, | ||
117 | u32 cpos, u32 clusters, | ||
118 | struct rb_node ***ret_p, | ||
119 | struct rb_node **ret_parent) | ||
120 | { | ||
121 | struct rb_node **p = &em->em_extents.rb_node; | ||
122 | struct rb_node *parent = NULL; | ||
123 | struct ocfs2_extent_map_entry *ent = NULL; | ||
124 | |||
125 | while (*p) | ||
126 | { | ||
127 | parent = *p; | ||
128 | ent = rb_entry(parent, struct ocfs2_extent_map_entry, | ||
129 | e_node); | ||
130 | if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) { | ||
131 | p = &(*p)->rb_left; | ||
132 | ent = NULL; | ||
133 | } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) + | ||
134 | le32_to_cpu(ent->e_rec.e_clusters))) { | ||
135 | p = &(*p)->rb_right; | ||
136 | ent = NULL; | ||
137 | } else | ||
138 | break; | ||
139 | } | ||
140 | |||
141 | if (ret_p != NULL) | ||
142 | *ret_p = p; | ||
143 | if (ret_parent != NULL) | ||
144 | *ret_parent = parent; | ||
145 | return ent; | ||
146 | } | ||
147 | |||
148 | /* | ||
149 | * Find the leaf containing the interval we want. While we're on our | ||
150 | * way down the tree, fill in every record we see at any depth, because | ||
151 | * we might want it later. | ||
152 | * | ||
153 | * Note that this code is run without ip_lock. That's because it | ||
154 | * sleeps while reading. If someone is also filling the extent list at | ||
155 | * the same time we are, we might have to restart. | ||
156 | */ | ||
157 | static int ocfs2_extent_map_find_leaf(struct inode *inode, | ||
158 | u32 cpos, u32 clusters, | ||
159 | struct ocfs2_extent_list *el) | ||
160 | { | ||
161 | int i, ret; | ||
162 | struct buffer_head *eb_bh = NULL; | ||
163 | u64 blkno; | ||
164 | u32 rec_end; | ||
165 | struct ocfs2_extent_block *eb; | ||
166 | struct ocfs2_extent_rec *rec; | ||
167 | |||
168 | /* | ||
169 | * The bh data containing the el cannot change here, because | ||
170 | * we hold alloc_sem. So we can do this without other | ||
171 | * locks. | ||
172 | */ | ||
173 | while (el->l_tree_depth) | ||
174 | { | ||
175 | blkno = 0; | ||
176 | for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { | ||
177 | rec = &el->l_recs[i]; | ||
178 | rec_end = (le32_to_cpu(rec->e_cpos) + | ||
179 | le32_to_cpu(rec->e_clusters)); | ||
180 | |||
181 | ret = -EBADR; | ||
182 | if (rec_end > OCFS2_I(inode)->ip_clusters) { | ||
183 | mlog_errno(ret); | ||
184 | goto out_free; | ||
185 | } | ||
186 | |||
187 | if (rec_end <= cpos) { | ||
188 | ret = ocfs2_extent_map_insert(inode, rec, | ||
189 | le16_to_cpu(el->l_tree_depth)); | ||
190 | if (ret && (ret != -EEXIST)) { | ||
191 | mlog_errno(ret); | ||
192 | goto out_free; | ||
193 | } | ||
194 | continue; | ||
195 | } | ||
196 | if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) { | ||
197 | ret = ocfs2_extent_map_insert(inode, rec, | ||
198 | le16_to_cpu(el->l_tree_depth)); | ||
199 | if (ret && (ret != -EEXIST)) { | ||
200 | mlog_errno(ret); | ||
201 | goto out_free; | ||
202 | } | ||
203 | continue; | ||
204 | } | ||
205 | |||
206 | /* | ||
207 | * We've found a record that matches our | ||
208 | * interval. We don't insert it because we're | ||
209 | * about to traverse it. | ||
210 | */ | ||
211 | |||
212 | /* Check to see if we're stradling */ | ||
213 | ret = -ESRCH; | ||
214 | if (!ocfs2_extent_rec_contains_clusters(rec, | ||
215 | cpos, | ||
216 | clusters)) { | ||
217 | mlog_errno(ret); | ||
218 | goto out_free; | ||
219 | } | ||
220 | |||
221 | /* | ||
222 | * If we've already found a record, the el has | ||
223 | * two records covering the same interval. | ||
224 | * EEEK! | ||
225 | */ | ||
226 | ret = -EBADR; | ||
227 | if (blkno) { | ||
228 | mlog_errno(ret); | ||
229 | goto out_free; | ||
230 | } | ||
231 | |||
232 | blkno = le64_to_cpu(rec->e_blkno); | ||
233 | } | ||
234 | |||
235 | /* | ||
236 | * We don't support holes, and we're still up | ||
237 | * in the branches, so we'd better have found someone | ||
238 | */ | ||
239 | ret = -EBADR; | ||
240 | if (!blkno) { | ||
241 | mlog_errno(ret); | ||
242 | goto out_free; | ||
243 | } | ||
244 | |||
245 | if (eb_bh) { | ||
246 | brelse(eb_bh); | ||
247 | eb_bh = NULL; | ||
248 | } | ||
249 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
250 | blkno, &eb_bh, OCFS2_BH_CACHED, | ||
251 | inode); | ||
252 | if (ret) { | ||
253 | mlog_errno(ret); | ||
254 | goto out_free; | ||
255 | } | ||
256 | eb = (struct ocfs2_extent_block *)eb_bh->b_data; | ||
257 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
258 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
259 | ret = -EIO; | ||
260 | goto out_free; | ||
261 | } | ||
262 | el = &eb->h_list; | ||
263 | } | ||
264 | |||
265 | if (el->l_tree_depth) | ||
266 | BUG(); | ||
267 | |||
268 | for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { | ||
269 | rec = &el->l_recs[i]; | ||
270 | ret = ocfs2_extent_map_insert(inode, rec, | ||
271 | le16_to_cpu(el->l_tree_depth)); | ||
272 | if (ret) { | ||
273 | mlog_errno(ret); | ||
274 | goto out_free; | ||
275 | } | ||
276 | } | ||
277 | |||
278 | ret = 0; | ||
279 | |||
280 | out_free: | ||
281 | if (eb_bh) | ||
282 | brelse(eb_bh); | ||
283 | |||
284 | return ret; | ||
285 | } | ||
286 | |||
287 | /* | ||
288 | * This lookup actually will read from disk. It has one invariant: | ||
289 | * It will never re-traverse blocks. This means that all inserts should | ||
290 | * be new regions or more granular regions (both allowed by insert). | ||
291 | */ | ||
292 | static int ocfs2_extent_map_lookup_read(struct inode *inode, | ||
293 | u32 cpos, | ||
294 | u32 clusters, | ||
295 | struct ocfs2_extent_map_entry **ret_ent) | ||
296 | { | ||
297 | int ret; | ||
298 | u64 blkno; | ||
299 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
300 | struct ocfs2_extent_map_entry *ent; | ||
301 | struct buffer_head *bh = NULL; | ||
302 | struct ocfs2_extent_block *eb; | ||
303 | struct ocfs2_dinode *di; | ||
304 | struct ocfs2_extent_list *el; | ||
305 | |||
306 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
307 | ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); | ||
308 | if (ent) { | ||
309 | if (!ent->e_tree_depth) { | ||
310 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
311 | *ret_ent = ent; | ||
312 | return 0; | ||
313 | } | ||
314 | blkno = le64_to_cpu(ent->e_rec.e_blkno); | ||
315 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
316 | |||
317 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh, | ||
318 | OCFS2_BH_CACHED, inode); | ||
319 | if (ret) { | ||
320 | mlog_errno(ret); | ||
321 | if (bh) | ||
322 | brelse(bh); | ||
323 | return ret; | ||
324 | } | ||
325 | eb = (struct ocfs2_extent_block *)bh->b_data; | ||
326 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
327 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
328 | brelse(bh); | ||
329 | return -EIO; | ||
330 | } | ||
331 | el = &eb->h_list; | ||
332 | } else { | ||
333 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
334 | |||
335 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
336 | OCFS2_I(inode)->ip_blkno, &bh, | ||
337 | OCFS2_BH_CACHED, inode); | ||
338 | if (ret) { | ||
339 | mlog_errno(ret); | ||
340 | if (bh) | ||
341 | brelse(bh); | ||
342 | return ret; | ||
343 | } | ||
344 | di = (struct ocfs2_dinode *)bh->b_data; | ||
345 | if (!OCFS2_IS_VALID_DINODE(di)) { | ||
346 | brelse(bh); | ||
347 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di); | ||
348 | return -EIO; | ||
349 | } | ||
350 | el = &di->id2.i_list; | ||
351 | } | ||
352 | |||
353 | ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el); | ||
354 | brelse(bh); | ||
355 | if (ret) { | ||
356 | mlog_errno(ret); | ||
357 | return ret; | ||
358 | } | ||
359 | |||
360 | ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); | ||
361 | if (!ent) { | ||
362 | ret = -ESRCH; | ||
363 | mlog_errno(ret); | ||
364 | return ret; | ||
365 | } | ||
366 | |||
367 | if (ent->e_tree_depth) | ||
368 | BUG(); /* FIXME: Make sure this isn't a corruption */ | ||
369 | |||
370 | *ret_ent = ent; | ||
371 | |||
372 | return 0; | ||
373 | } | ||
374 | |||
375 | /* | ||
376 | * Callers must hold ip_lock. This can insert pieces of the tree, | ||
377 | * thus racing lookup if the lock weren't held. | ||
378 | */ | ||
379 | static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, | ||
380 | struct ocfs2_extent_map_entry *ent) | ||
381 | { | ||
382 | struct rb_node **p, *parent; | ||
383 | struct ocfs2_extent_map_entry *old_ent; | ||
384 | |||
385 | old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos), | ||
386 | le32_to_cpu(ent->e_rec.e_clusters), | ||
387 | &p, &parent); | ||
388 | if (old_ent) | ||
389 | return -EEXIST; | ||
390 | |||
391 | rb_link_node(&ent->e_node, parent, p); | ||
392 | rb_insert_color(&ent->e_node, &em->em_extents); | ||
393 | |||
394 | return 0; | ||
395 | } | ||
396 | |||
397 | |||
398 | /* | ||
399 | * Simple rule: on any return code other than -EAGAIN, anything left | ||
400 | * in the insert_context will be freed. | ||
401 | */ | ||
402 | static int ocfs2_extent_map_try_insert(struct inode *inode, | ||
403 | struct ocfs2_extent_rec *rec, | ||
404 | int tree_depth, | ||
405 | struct ocfs2_em_insert_context *ctxt) | ||
406 | { | ||
407 | int ret; | ||
408 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
409 | struct ocfs2_extent_map_entry *old_ent; | ||
410 | |||
411 | ctxt->need_left = 0; | ||
412 | ctxt->need_right = 0; | ||
413 | ctxt->old_ent = NULL; | ||
414 | |||
415 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
416 | ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); | ||
417 | if (!ret) { | ||
418 | ctxt->new_ent = NULL; | ||
419 | goto out_unlock; | ||
420 | } | ||
421 | |||
422 | old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), | ||
423 | le32_to_cpu(rec->e_clusters), NULL, | ||
424 | NULL); | ||
425 | |||
426 | if (!old_ent) | ||
427 | BUG(); | ||
428 | |||
429 | ret = -EEXIST; | ||
430 | if (old_ent->e_tree_depth < tree_depth) | ||
431 | goto out_unlock; | ||
432 | |||
433 | if (old_ent->e_tree_depth == tree_depth) { | ||
434 | if (!memcmp(rec, &old_ent->e_rec, | ||
435 | sizeof(struct ocfs2_extent_rec))) | ||
436 | ret = 0; | ||
437 | |||
438 | /* FIXME: Should this be ESRCH/EBADR??? */ | ||
439 | goto out_unlock; | ||
440 | } | ||
441 | |||
442 | /* | ||
443 | * We do it in this order specifically so that no actual tree | ||
444 | * changes occur until we have all the pieces we need. We | ||
445 | * don't want malloc failures to leave an inconsistent tree. | ||
446 | * Whenever we drop the lock, another process could be | ||
447 | * inserting. Also note that, if another process just beat us | ||
448 | * to an insert, we might not need the same pieces we needed | ||
449 | * the first go round. In the end, the pieces we need will | ||
450 | * be used, and the pieces we don't will be freed. | ||
451 | */ | ||
452 | ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) > | ||
453 | le32_to_cpu(old_ent->e_rec.e_cpos)); | ||
454 | ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) + | ||
455 | le32_to_cpu(old_ent->e_rec.e_clusters)) > | ||
456 | (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters))); | ||
457 | ret = -EAGAIN; | ||
458 | if (ctxt->need_left) { | ||
459 | if (!ctxt->left_ent) | ||
460 | goto out_unlock; | ||
461 | *(ctxt->left_ent) = *old_ent; | ||
462 | ctxt->left_ent->e_rec.e_clusters = | ||
463 | cpu_to_le32(le32_to_cpu(rec->e_cpos) - | ||
464 | le32_to_cpu(ctxt->left_ent->e_rec.e_cpos)); | ||
465 | } | ||
466 | if (ctxt->need_right) { | ||
467 | if (!ctxt->right_ent) | ||
468 | goto out_unlock; | ||
469 | *(ctxt->right_ent) = *old_ent; | ||
470 | ctxt->right_ent->e_rec.e_cpos = | ||
471 | cpu_to_le32(le32_to_cpu(rec->e_cpos) + | ||
472 | le32_to_cpu(rec->e_clusters)); | ||
473 | ctxt->right_ent->e_rec.e_clusters = | ||
474 | cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) + | ||
475 | le32_to_cpu(old_ent->e_rec.e_clusters)) - | ||
476 | le32_to_cpu(ctxt->right_ent->e_rec.e_cpos)); | ||
477 | } | ||
478 | |||
479 | rb_erase(&old_ent->e_node, &em->em_extents); | ||
480 | /* Now that he's erased, set him up for deletion */ | ||
481 | ctxt->old_ent = old_ent; | ||
482 | |||
483 | if (ctxt->need_left) { | ||
484 | ret = ocfs2_extent_map_insert_entry(em, | ||
485 | ctxt->left_ent); | ||
486 | if (ret) | ||
487 | goto out_unlock; | ||
488 | ctxt->left_ent = NULL; | ||
489 | } | ||
490 | |||
491 | if (ctxt->need_right) { | ||
492 | ret = ocfs2_extent_map_insert_entry(em, | ||
493 | ctxt->right_ent); | ||
494 | if (ret) | ||
495 | goto out_unlock; | ||
496 | ctxt->right_ent = NULL; | ||
497 | } | ||
498 | |||
499 | ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); | ||
500 | |||
501 | if (!ret) | ||
502 | ctxt->new_ent = NULL; | ||
503 | |||
504 | out_unlock: | ||
505 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
506 | |||
507 | return ret; | ||
508 | } | ||
509 | |||
510 | |||
511 | static int ocfs2_extent_map_insert(struct inode *inode, | ||
512 | struct ocfs2_extent_rec *rec, | ||
513 | int tree_depth) | ||
514 | { | ||
515 | int ret; | ||
516 | struct ocfs2_em_insert_context ctxt = {0, }; | ||
517 | |||
518 | if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > | ||
519 | OCFS2_I(inode)->ip_map.em_clusters) { | ||
520 | ret = -EBADR; | ||
521 | mlog_errno(ret); | ||
522 | return ret; | ||
523 | } | ||
524 | |||
525 | /* Zero e_clusters means a truncated tail record. It better be EOF */ | ||
526 | if (!rec->e_clusters) { | ||
527 | if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) != | ||
528 | OCFS2_I(inode)->ip_map.em_clusters) { | ||
529 | ret = -EBADR; | ||
530 | mlog_errno(ret); | ||
531 | return ret; | ||
532 | } | ||
533 | |||
534 | /* Ignore the truncated tail */ | ||
535 | return 0; | ||
536 | } | ||
537 | |||
538 | ret = -ENOMEM; | ||
539 | ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, | ||
540 | GFP_KERNEL); | ||
541 | if (!ctxt.new_ent) { | ||
542 | mlog_errno(ret); | ||
543 | return ret; | ||
544 | } | ||
545 | |||
546 | ctxt.new_ent->e_rec = *rec; | ||
547 | ctxt.new_ent->e_tree_depth = tree_depth; | ||
548 | |||
549 | do { | ||
550 | ret = -ENOMEM; | ||
551 | if (ctxt.need_left && !ctxt.left_ent) { | ||
552 | ctxt.left_ent = | ||
553 | kmem_cache_alloc(ocfs2_em_ent_cachep, | ||
554 | GFP_KERNEL); | ||
555 | if (!ctxt.left_ent) | ||
556 | break; | ||
557 | } | ||
558 | if (ctxt.need_right && !ctxt.right_ent) { | ||
559 | ctxt.right_ent = | ||
560 | kmem_cache_alloc(ocfs2_em_ent_cachep, | ||
561 | GFP_KERNEL); | ||
562 | if (!ctxt.right_ent) | ||
563 | break; | ||
564 | } | ||
565 | |||
566 | ret = ocfs2_extent_map_try_insert(inode, rec, | ||
567 | tree_depth, &ctxt); | ||
568 | } while (ret == -EAGAIN); | ||
569 | |||
570 | if (ret < 0) | ||
571 | mlog_errno(ret); | ||
572 | |||
573 | if (ctxt.left_ent) | ||
574 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent); | ||
575 | if (ctxt.right_ent) | ||
576 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent); | ||
577 | if (ctxt.old_ent) | ||
578 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent); | ||
579 | if (ctxt.new_ent) | ||
580 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent); | ||
581 | |||
582 | return ret; | ||
583 | } | ||
584 | |||
585 | /* | ||
586 | * Append this record to the tail of the extent map. It must be | ||
587 | * tree_depth 0. The record might be an extension of an existing | ||
588 | * record, and as such that needs to be handled. eg: | ||
589 | * | ||
590 | * Existing record in the extent map: | ||
591 | * | ||
592 | * cpos = 10, len = 10 | ||
593 | * |---------| | ||
594 | * | ||
595 | * New Record: | ||
596 | * | ||
597 | * cpos = 10, len = 20 | ||
598 | * |------------------| | ||
599 | * | ||
600 | * The passed record is the new on-disk record. The new_clusters value | ||
601 | * is how many clusters were added to the file. If the append is a | ||
602 | * contiguous append, the new_clusters has been added to | ||
603 | * rec->e_clusters. If the append is an entirely new extent, then | ||
604 | * rec->e_clusters is == new_clusters. | ||
605 | */ | ||
606 | int ocfs2_extent_map_append(struct inode *inode, | ||
607 | struct ocfs2_extent_rec *rec, | ||
608 | u32 new_clusters) | ||
609 | { | ||
610 | int ret; | ||
611 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
612 | struct ocfs2_extent_map_entry *ent; | ||
613 | struct ocfs2_extent_rec *old; | ||
614 | |||
615 | BUG_ON(!new_clusters); | ||
616 | BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters); | ||
617 | |||
618 | if (em->em_clusters < OCFS2_I(inode)->ip_clusters) { | ||
619 | /* | ||
620 | * Size changed underneath us on disk. Drop any | ||
621 | * straddling records and update our idea of | ||
622 | * i_clusters | ||
623 | */ | ||
624 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
625 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | ||
626 | } | ||
627 | |||
628 | mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) + | ||
629 | le32_to_cpu(rec->e_clusters)) != | ||
630 | (em->em_clusters + new_clusters), | ||
631 | "Inode %"MLFu64":\n" | ||
632 | "rec->e_cpos = %u + rec->e_clusters = %u = %u\n" | ||
633 | "em->em_clusters = %u + new_clusters = %u = %u\n", | ||
634 | OCFS2_I(inode)->ip_blkno, | ||
635 | le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters), | ||
636 | le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters), | ||
637 | em->em_clusters, new_clusters, | ||
638 | em->em_clusters + new_clusters); | ||
639 | |||
640 | em->em_clusters += new_clusters; | ||
641 | |||
642 | ret = -ENOENT; | ||
643 | if (le32_to_cpu(rec->e_clusters) > new_clusters) { | ||
644 | /* This is a contiguous append */ | ||
645 | ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1, | ||
646 | NULL, NULL); | ||
647 | if (ent) { | ||
648 | old = &ent->e_rec; | ||
649 | BUG_ON((le32_to_cpu(rec->e_cpos) + | ||
650 | le32_to_cpu(rec->e_clusters)) != | ||
651 | (le32_to_cpu(old->e_cpos) + | ||
652 | le32_to_cpu(old->e_clusters) + | ||
653 | new_clusters)); | ||
654 | if (ent->e_tree_depth == 0) { | ||
655 | BUG_ON(le32_to_cpu(old->e_cpos) != | ||
656 | le32_to_cpu(rec->e_cpos)); | ||
657 | BUG_ON(le64_to_cpu(old->e_blkno) != | ||
658 | le64_to_cpu(rec->e_blkno)); | ||
659 | ret = 0; | ||
660 | } | ||
661 | /* | ||
662 | * Let non-leafs fall through as -ENOENT to | ||
663 | * force insertion of the new leaf. | ||
664 | */ | ||
665 | le32_add_cpu(&old->e_clusters, new_clusters); | ||
666 | } | ||
667 | } | ||
668 | |||
669 | if (ret == -ENOENT) | ||
670 | ret = ocfs2_extent_map_insert(inode, rec, 0); | ||
671 | if (ret < 0) | ||
672 | mlog_errno(ret); | ||
673 | return ret; | ||
674 | } | ||
675 | |||
676 | #if 0 | ||
677 | /* Code here is included but defined out as it completes the extent | ||
678 | * map api and may be used in the future. */ | ||
679 | |||
680 | /* | ||
681 | * Look up the record containing this cluster offset. This record is | ||
682 | * part of the extent map. Do not free it. Any changes you make to | ||
683 | * it will reflect in the extent map. So, if your last extent | ||
684 | * is (cpos = 10, clusters = 10) and you truncate the file by 5 | ||
685 | * clusters, you can do: | ||
686 | * | ||
687 | * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec); | ||
688 | * rec->e_clusters -= 5; | ||
689 | * | ||
690 | * The lookup does not read from disk. If the map isn't filled in for | ||
691 | * an entry, you won't find it. | ||
692 | * | ||
693 | * Also note that the returned record is valid until alloc_sem is | ||
694 | * dropped. After that, truncate and extend can happen. Caveat Emptor. | ||
695 | */ | ||
696 | int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos, | ||
697 | struct ocfs2_extent_rec **rec, | ||
698 | int *tree_depth) | ||
699 | { | ||
700 | int ret = -ENOENT; | ||
701 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
702 | struct ocfs2_extent_map_entry *ent; | ||
703 | |||
704 | *rec = NULL; | ||
705 | |||
706 | if (cpos >= OCFS2_I(inode)->ip_clusters) | ||
707 | return -EINVAL; | ||
708 | |||
709 | if (cpos >= em->em_clusters) { | ||
710 | /* | ||
711 | * Size changed underneath us on disk. Drop any | ||
712 | * straddling records and update our idea of | ||
713 | * i_clusters | ||
714 | */ | ||
715 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
716 | em->em_clusters = OCFS2_I(inode)->ip_clusters ; | ||
717 | } | ||
718 | |||
719 | ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1, | ||
720 | NULL, NULL); | ||
721 | |||
722 | if (ent) { | ||
723 | *rec = &ent->e_rec; | ||
724 | if (tree_depth) | ||
725 | *tree_depth = ent->e_tree_depth; | ||
726 | ret = 0; | ||
727 | } | ||
728 | |||
729 | return ret; | ||
730 | } | ||
731 | |||
732 | int ocfs2_extent_map_get_clusters(struct inode *inode, | ||
733 | u32 v_cpos, int count, | ||
734 | u32 *p_cpos, int *ret_count) | ||
735 | { | ||
736 | int ret; | ||
737 | u32 coff, ccount; | ||
738 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
739 | struct ocfs2_extent_map_entry *ent = NULL; | ||
740 | |||
741 | *p_cpos = ccount = 0; | ||
742 | |||
743 | if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters) | ||
744 | return -EINVAL; | ||
745 | |||
746 | if ((v_cpos + count) > em->em_clusters) { | ||
747 | /* | ||
748 | * Size changed underneath us on disk. Drop any | ||
749 | * straddling records and update our idea of | ||
750 | * i_clusters | ||
751 | */ | ||
752 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
753 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | ||
754 | } | ||
755 | |||
756 | |||
757 | ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent); | ||
758 | if (ret) | ||
759 | return ret; | ||
760 | |||
761 | if (ent) { | ||
762 | /* We should never find ourselves straddling an interval */ | ||
763 | if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec, | ||
764 | v_cpos, | ||
765 | count)) | ||
766 | return -ESRCH; | ||
767 | |||
768 | coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos); | ||
769 | *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb, | ||
770 | le64_to_cpu(ent->e_rec.e_blkno)) + | ||
771 | coff; | ||
772 | |||
773 | if (ret_count) | ||
774 | *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff; | ||
775 | |||
776 | return 0; | ||
777 | } | ||
778 | |||
779 | |||
780 | return -ENOENT; | ||
781 | } | ||
782 | |||
783 | #endif /* 0 */ | ||
784 | |||
785 | int ocfs2_extent_map_get_blocks(struct inode *inode, | ||
786 | u64 v_blkno, int count, | ||
787 | u64 *p_blkno, int *ret_count) | ||
788 | { | ||
789 | int ret; | ||
790 | u64 boff; | ||
791 | u32 cpos, clusters; | ||
792 | int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); | ||
793 | struct ocfs2_extent_map_entry *ent = NULL; | ||
794 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
795 | struct ocfs2_extent_rec *rec; | ||
796 | |||
797 | *p_blkno = 0; | ||
798 | |||
799 | cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); | ||
800 | clusters = ocfs2_blocks_to_clusters(inode->i_sb, | ||
801 | (u64)count + bpc - 1); | ||
802 | if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) { | ||
803 | ret = -EINVAL; | ||
804 | mlog_errno(ret); | ||
805 | return ret; | ||
806 | } | ||
807 | |||
808 | if ((cpos + clusters) > em->em_clusters) { | ||
809 | /* | ||
810 | * Size changed underneath us on disk. Drop any | ||
811 | * straddling records and update our idea of | ||
812 | * i_clusters | ||
813 | */ | ||
814 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
815 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | ||
816 | } | ||
817 | |||
818 | ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent); | ||
819 | if (ret) { | ||
820 | mlog_errno(ret); | ||
821 | return ret; | ||
822 | } | ||
823 | |||
824 | if (ent) | ||
825 | { | ||
826 | rec = &ent->e_rec; | ||
827 | |||
828 | /* We should never find ourselves straddling an interval */ | ||
829 | if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) { | ||
830 | ret = -ESRCH; | ||
831 | mlog_errno(ret); | ||
832 | return ret; | ||
833 | } | ||
834 | |||
835 | boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos - | ||
836 | le32_to_cpu(rec->e_cpos)); | ||
837 | boff += (v_blkno & (u64)(bpc - 1)); | ||
838 | *p_blkno = le64_to_cpu(rec->e_blkno) + boff; | ||
839 | |||
840 | if (ret_count) { | ||
841 | *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, | ||
842 | le32_to_cpu(rec->e_clusters)) - boff; | ||
843 | } | ||
844 | |||
845 | return 0; | ||
846 | } | ||
847 | |||
848 | return -ENOENT; | ||
849 | } | ||
850 | |||
851 | int ocfs2_extent_map_init(struct inode *inode) | ||
852 | { | ||
853 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
854 | |||
855 | em->em_extents = RB_ROOT; | ||
856 | em->em_clusters = 0; | ||
857 | |||
858 | return 0; | ||
859 | } | ||
860 | |||
861 | /* Needs the lock */ | ||
862 | static void __ocfs2_extent_map_drop(struct inode *inode, | ||
863 | u32 new_clusters, | ||
864 | struct rb_node **free_head, | ||
865 | struct ocfs2_extent_map_entry **tail_ent) | ||
866 | { | ||
867 | struct rb_node *node, *next; | ||
868 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
869 | struct ocfs2_extent_map_entry *ent; | ||
870 | |||
871 | *free_head = NULL; | ||
872 | |||
873 | ent = NULL; | ||
874 | node = rb_last(&em->em_extents); | ||
875 | while (node) | ||
876 | { | ||
877 | next = rb_prev(node); | ||
878 | |||
879 | ent = rb_entry(node, struct ocfs2_extent_map_entry, | ||
880 | e_node); | ||
881 | if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters) | ||
882 | break; | ||
883 | |||
884 | rb_erase(&ent->e_node, &em->em_extents); | ||
885 | |||
886 | node->rb_right = *free_head; | ||
887 | *free_head = node; | ||
888 | |||
889 | ent = NULL; | ||
890 | node = next; | ||
891 | } | ||
892 | |||
893 | /* Do we have an entry straddling new_clusters? */ | ||
894 | if (tail_ent) { | ||
895 | if (ent && | ||
896 | ((le32_to_cpu(ent->e_rec.e_cpos) + | ||
897 | le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters)) | ||
898 | *tail_ent = ent; | ||
899 | else | ||
900 | *tail_ent = NULL; | ||
901 | } | ||
902 | } | ||
903 | |||
904 | static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head) | ||
905 | { | ||
906 | struct rb_node *node; | ||
907 | struct ocfs2_extent_map_entry *ent; | ||
908 | |||
909 | while (free_head) { | ||
910 | node = free_head; | ||
911 | free_head = node->rb_right; | ||
912 | |||
913 | ent = rb_entry(node, struct ocfs2_extent_map_entry, | ||
914 | e_node); | ||
915 | kmem_cache_free(ocfs2_em_ent_cachep, ent); | ||
916 | } | ||
917 | } | ||
918 | |||
919 | /* | ||
920 | * Remove all entries past new_clusters, inclusive of an entry that | ||
921 | * contains new_clusters. This is effectively a cache forget. | ||
922 | * | ||
923 | * If you want to also clip the last extent by some number of clusters, | ||
924 | * you need to call ocfs2_extent_map_trunc(). | ||
925 | * This code does not check or modify ip_clusters. | ||
926 | */ | ||
927 | int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters) | ||
928 | { | ||
929 | struct rb_node *free_head = NULL; | ||
930 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
931 | struct ocfs2_extent_map_entry *ent; | ||
932 | |||
933 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
934 | |||
935 | __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); | ||
936 | |||
937 | if (ent) { | ||
938 | rb_erase(&ent->e_node, &em->em_extents); | ||
939 | ent->e_node.rb_right = free_head; | ||
940 | free_head = &ent->e_node; | ||
941 | } | ||
942 | |||
943 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
944 | |||
945 | if (free_head) | ||
946 | __ocfs2_extent_map_drop_cleanup(free_head); | ||
947 | |||
948 | return 0; | ||
949 | } | ||
950 | |||
951 | /* | ||
952 | * Remove all entries past new_clusters and also clip any extent | ||
953 | * straddling new_clusters, if there is one. This does not check | ||
954 | * or modify ip_clusters | ||
955 | */ | ||
956 | int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters) | ||
957 | { | ||
958 | struct rb_node *free_head = NULL; | ||
959 | struct ocfs2_extent_map_entry *ent = NULL; | ||
960 | |||
961 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
962 | |||
963 | __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); | ||
964 | |||
965 | if (ent) | ||
966 | ent->e_rec.e_clusters = cpu_to_le32(new_clusters - | ||
967 | le32_to_cpu(ent->e_rec.e_cpos)); | ||
968 | |||
969 | OCFS2_I(inode)->ip_map.em_clusters = new_clusters; | ||
970 | |||
971 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
972 | |||
973 | if (free_head) | ||
974 | __ocfs2_extent_map_drop_cleanup(free_head); | ||
975 | |||
976 | return 0; | ||
977 | } | ||
978 | |||
979 | int __init init_ocfs2_extent_maps(void) | ||
980 | { | ||
981 | ocfs2_em_ent_cachep = | ||
982 | kmem_cache_create("ocfs2_em_ent", | ||
983 | sizeof(struct ocfs2_extent_map_entry), | ||
984 | 0, SLAB_HWCACHE_ALIGN, NULL, NULL); | ||
985 | if (!ocfs2_em_ent_cachep) | ||
986 | return -ENOMEM; | ||
987 | |||
988 | return 0; | ||
989 | } | ||
990 | |||
991 | void __exit exit_ocfs2_extent_maps(void) | ||
992 | { | ||
993 | kmem_cache_destroy(ocfs2_em_ent_cachep); | ||
994 | } | ||