Diffstat (limited to 'fs/nfs/blocklayout/extents.c')
-rw-r--r--	fs/nfs/blocklayout/extents.c	935
1 file changed, 935 insertions(+), 0 deletions(-)
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
new file mode 100644
index 000000000000..19fa7b0b8c00
--- /dev/null
+++ b/fs/nfs/blocklayout/extents.c
@@ -0,0 +1,935 @@
/*
 * linux/fs/nfs/blocklayout/extents.c
 *
 * Module for the NFSv4.1 pNFS block layout driver.
 *
 * Copyright (c) 2006 The Regents of the University of Michigan.
 * All rights reserved.
 *
 * Andy Adamson <andros@citi.umich.edu>
 * Fred Isaman <iisaman@umich.edu>
 *
 * permission is granted to use, copy, create derivative works and
 * redistribute this software and such derivative works for any purpose,
 * so long as the name of the university of michigan is not used in
 * any advertising or publicity pertaining to the use or distribution
 * of this software without specific, written prior authorization. if
 * the above copyright notice or any other identification of the
 * university of michigan is included in any copy of any portion of
 * this software, then the disclaimer below must also be included.
 *
 * this software is provided as is, without representation from the
 * university of michigan as to its fitness for any purpose, and without
 * warranty by the university of michigan of any kind, either express
 * or implied, including without limitation the implied warranties of
 * merchantability and fitness for a particular purpose. the regents
 * of the university of michigan shall not be liable for any damages,
 * including special, indirect, incidental, or consequential damages,
 * with respect to any claim arising out or in connection with the use
 * of the software, even if it has been or is hereafter advised of the
 * possibility of such damages.
 */

#include "blocklayout.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD

/* Bit numbers */
#define EXTENT_INITIALIZED 0
#define EXTENT_WRITTEN 1
#define EXTENT_IN_COMMIT 2
#define INTERNAL_EXISTS MY_MAX_TAGS
#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1)

/* Returns largest t<=s s.t. t%base==0 */
static inline sector_t normalize(sector_t s, int base)
{
	sector_t tmp = s; /* Since do_div modifies its argument */
	return s - do_div(tmp, base);
}

static inline sector_t normalize_up(sector_t s, int base)
{
	return normalize(s + base - 1, base);
}
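
/*
 * Example: with base 8, normalize(13, 8) == 8 and
 * normalize_up(13, 8) == 16; sectors round down/up to the nearest
 * multiple of base.
 */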

/* Complete stub using a list while we determine the API wanted */

/* Returns the tags at sector s, or a negative errno if not found */
static int32_t _find_entry(struct my_tree *tree, u64 s)
{
	struct pnfs_inval_tracking *pos;

	dprintk("%s(%llu) enter\n", __func__, s);
	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
		if (pos->it_sector > s)
			continue;
		else if (pos->it_sector == s)
			return pos->it_tags & INTERNAL_MASK;
		else
			break;
	}
	return -ENOENT;
}
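
/*
 * Note: _add_entry() keeps the stub list sorted by increasing
 * it_sector, so the reverse walk above can stop at the first entry at
 * or below s.
 */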

static inline
int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
{
	int32_t tags;

	dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
	s = normalize(s, tree->mtt_step_size);
	tags = _find_entry(tree, s);
	if ((tags < 0) || !(tags & (1 << tag)))
		return 0;
	else
		return 1;
}

/* Creates entry with tag, or if entry already exists, unions tag to it.
 * If storage is not NULL, newly created entry will use it.
 * Returns number of entries added, or negative on error.
 */
static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
		      struct pnfs_inval_tracking *storage)
{
	int found = 0;
	struct pnfs_inval_tracking *pos;

	dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
		if (pos->it_sector > s)
			continue;
		else if (pos->it_sector == s) {
			found = 1;
			break;
		} else
			break;
	}
	if (found) {
		pos->it_tags |= (1 << tag);
		return 0;
	} else {
		struct pnfs_inval_tracking *new;
		if (storage)
			new = storage;
		else {
			new = kmalloc(sizeof(*new), GFP_NOFS);
			if (!new)
				return -ENOMEM;
		}
		new->it_sector = s;
		new->it_tags = (1 << tag);
		list_add(&new->it_link, &pos->it_link);
		return 1;
	}
}
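
/*
 * Example: if no entry exists at s, _add_entry(tree, s,
 * EXTENT_INITIALIZED, NULL) creates one and returns 1; a later
 * _add_entry(tree, s, EXTENT_WRITTEN, NULL) finds it, ORs in the
 * second tag bit, and returns 0.
 */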

/* XXXX Really want option to not create */
/* Over range, unions tag with existing entries, else creates entry with tag */
static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
{
	u64 i;

	dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
	for (i = normalize(s, tree->mtt_step_size); i < s + length;
	     i += tree->mtt_step_size)
		if (_add_entry(tree, i, tag, NULL))
			return -ENOMEM;
	return 0;
}
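
/*
 * Note: _add_entry() also returns 1 when it had to create a fresh
 * entry, which the check above folds into -ENOMEM.  This appears to
 * assume callers have already populated the range via
 * _preload_range(), so only the entry-found (0) and allocation-failure
 * (<0) cases occur here.
 */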

/* Ensure that future operations on given range of tree will not malloc */
static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
{
	u64 start, end, s;
	int count, i, used = 0, status = -ENOMEM;
	struct pnfs_inval_tracking **storage;

	dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
	start = normalize(offset, tree->mtt_step_size);
	end = normalize_up(offset + length, tree->mtt_step_size);
	count = (int)(end - start) / (int)tree->mtt_step_size;

	/* Pre-malloc what memory we might need */
	storage = kmalloc(sizeof(*storage) * count, GFP_NOFS);
	if (!storage)
		return -ENOMEM;
	for (i = 0; i < count; i++) {
		storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
				     GFP_NOFS);
		if (!storage[i])
			goto out_cleanup;
	}

	/* FIXME: the insertions below need the owner's im_lock, but a
	 * bare struct my_tree gives us no way to reach it.
	 */

	for (s = start; s < end; s += tree->mtt_step_size)
		used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);

	/* FIXME: unlock here once the locking above is resolved */
	status = 0;

out_cleanup:
	for (i = used; i < count; i++) {
		if (!storage[i])
			break;
		kfree(storage[i]);
	}
	kfree(storage);
	return status;
}
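
/*
 * Example: with mtt_step_size 8, _preload_range(tree, 5, 10) covers the
 * normalized range [0, 16), pre-allocates count == 2 tracking entries,
 * and inserts whichever of them are not already present.
 */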

static void set_needs_init(sector_t *array, sector_t offset)
{
	sector_t *p = array;

	dprintk("%s enter\n", __func__);
	if (!p)
		return;
	while (*p < offset)
		p++;
	if (*p == offset)
		return;
	else if (*p == ~0) {
		*p++ = offset;
		*p = ~0;
		return;
	} else {
		sector_t *save = p;
		dprintk("%s Adding %llu\n", __func__, (u64)offset);
		while (*p != ~0)
			p++;
		p++;
		memmove(save + 1, save, (char *)p - (char *)save);
		*save = offset;
		return;
	}
}
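
/*
 * Example: given the sentinel-terminated array {3, 7, ~0},
 * set_needs_init(array, 5) shifts the tail right and leaves
 * {3, 5, 7, ~0}.  The caller must have sized the array to leave room
 * for the extra slot.
 */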

/* We are relying on page lock to serialize this */
int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
{
	int rv;

	spin_lock(&marks->im_lock);
	rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
	spin_unlock(&marks->im_lock);
	return rv;
}

/* Assume start, end already sector aligned */
static int
_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
{
	struct pnfs_inval_tracking *pos;
	u64 expect = 0;

	dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
		if (pos->it_sector >= end)
			continue;
		if (!expect) {
			if ((pos->it_sector == end - tree->mtt_step_size) &&
			    (pos->it_tags & (1 << tag))) {
				expect = pos->it_sector - tree->mtt_step_size;
				if (pos->it_sector < tree->mtt_step_size ||
				    expect < start)
					return 1;
				continue;
			} else {
				return 0;
			}
		}
		if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
			return 0;
		/* Guard the subtraction below against u64 wraparound when
		 * the walk reaches sector 0 with start == 0.
		 */
		if (pos->it_sector < tree->mtt_step_size)
			return 1;
		expect -= tree->mtt_step_size;
		if (expect < start)
			return 1;
	}
	return 0;
}

static int is_range_written(struct pnfs_inval_markings *marks,
			    sector_t start, sector_t end)
{
	int rv;

	spin_lock(&marks->im_lock);
	rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
	spin_unlock(&marks->im_lock);
	return rv;
}

/* Marks sectors in [offset, offset + length) as having been initialized.
 * All lengths are step-aligned, where step is min(pagesize, blocksize).
 * Notes where a partial block is initialized, and helps prepare it for
 * complete initialization later.
 */
/* Currently assumes offset is page-aligned */
int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
			 sector_t offset, sector_t length,
			 sector_t **pages)
{
	sector_t s, start, end;
	sector_t *array = NULL; /* Pages to mark */

	dprintk("%s(offset=%llu,len=%llu) enter\n",
		__func__, (u64)offset, (u64)length);
	s = max((sector_t) 3,
		2 * (marks->im_block_size / (PAGE_CACHE_SECTORS)));
	dprintk("%s set max=%llu\n", __func__, (u64)s);
	if (pages) {
		array = kmalloc(s * sizeof(sector_t), GFP_NOFS);
		if (!array)
			goto outerr;
		array[0] = ~0;
	}

	start = normalize(offset, marks->im_block_size);
	end = normalize_up(offset + length, marks->im_block_size);
	if (_preload_range(&marks->im_tree, start, end - start))
		goto outerr;

	spin_lock(&marks->im_lock);

	for (s = normalize_up(start, PAGE_CACHE_SECTORS);
	     s < offset; s += PAGE_CACHE_SECTORS) {
		dprintk("%s pre-area pages\n", __func__);
		/* Portion of used block is not initialized */
		if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
			set_needs_init(array, s);
	}
	if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
		goto out_unlock;
	for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS);
	     s < end; s += PAGE_CACHE_SECTORS) {
		dprintk("%s post-area pages\n", __func__);
		if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
			set_needs_init(array, s);
	}

	spin_unlock(&marks->im_lock);

	if (pages) {
		if (array[0] == ~0) {
			kfree(array);
			*pages = NULL;
		} else
			*pages = array;
	}
	return 0;

out_unlock:
	spin_unlock(&marks->im_lock);
outerr:
	if (pages) {
		kfree(array);
		*pages = NULL;
	}
	return -ENOMEM;
}
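
/*
 * Example: with 4 KiB pages and 512-byte sectors (PAGE_CACHE_SECTORS
 * == 8) and a 16-sector block size, bl_mark_sectors_init(marks, 8, 8,
 * &pages) marks sectors [8, 16) initialized; the not-yet-initialized
 * page at sector 0 of the containing block is reported back through
 * *pages.
 */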

/* Marks sectors in [offset, offset + length) as having been written to disk.
 * All lengths should be block aligned.
 */
static int mark_written_sectors(struct pnfs_inval_markings *marks,
				sector_t offset, sector_t length)
{
	int status;

	dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
		(u64)offset, (u64)length);
	spin_lock(&marks->im_lock);
	status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
	spin_unlock(&marks->im_lock);
	return status;
}

static void print_short_extent(struct pnfs_block_short_extent *be)
{
	dprintk("PRINT SHORT EXTENT extent %p\n", be);
	if (be) {
		dprintk("        be_f_offset %llu\n", (u64)be->bse_f_offset);
		dprintk("        be_length   %llu\n", (u64)be->bse_length);
	}
}

static void print_clist(struct list_head *list, unsigned int count)
{
	struct pnfs_block_short_extent *be;
	unsigned int i = 0;

	ifdebug(FACILITY) {
		printk(KERN_DEBUG "****************\n");
		printk(KERN_DEBUG "Extent list looks like:\n");
		list_for_each_entry(be, list, bse_node) {
			i++;
			print_short_extent(be);
		}
		if (i != count)
			printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
		printk(KERN_DEBUG "****************\n");
	}
}

/* Note: In theory, we should do more checking that devids match between
 * old and new, but if they don't, the lists are too corrupt to salvage anyway.
 */
/* Note this is very similar to bl_add_merge_extent */
static void add_to_commitlist(struct pnfs_block_layout *bl,
			      struct pnfs_block_short_extent *new)
{
	struct list_head *clist = &bl->bl_commit;
	struct pnfs_block_short_extent *old, *save;
	sector_t end = new->bse_f_offset + new->bse_length;

	dprintk("%s enter\n", __func__);
	print_short_extent(new);
	print_clist(clist, bl->bl_count);
	bl->bl_count++;
	/* Scan for proper place to insert, extending new to the left
	 * as much as possible.
	 */
	list_for_each_entry_safe(old, save, clist, bse_node) {
		if (new->bse_f_offset < old->bse_f_offset)
			break;
		if (end <= old->bse_f_offset + old->bse_length) {
			/* Range is already in list */
			bl->bl_count--;
			kfree(new);
			return;
		} else if (new->bse_f_offset <=
				old->bse_f_offset + old->bse_length) {
			/* new overlaps or abuts existing be */
			if (new->bse_mdev == old->bse_mdev) {
				/* extend new to fully replace old */
				new->bse_length += new->bse_f_offset -
						   old->bse_f_offset;
				new->bse_f_offset = old->bse_f_offset;
				list_del(&old->bse_node);
				bl->bl_count--;
				kfree(old);
			}
		}
	}
	/* Note that if we never hit the above break, old will not point to a
	 * valid extent.  However, in that case &old->bse_node == list.
	 */
	list_add_tail(&new->bse_node, &old->bse_node);
	/* Scan forward for overlaps.  If we find any, extend new and
	 * remove the overlapped extent.
	 */
	old = list_prepare_entry(new, clist, bse_node);
	list_for_each_entry_safe_continue(old, save, clist, bse_node) {
		if (end < old->bse_f_offset)
			break;
		/* new overlaps or abuts old */
		if (new->bse_mdev == old->bse_mdev) {
			if (end < old->bse_f_offset + old->bse_length) {
				/* extend new to fully cover old */
				end = old->bse_f_offset + old->bse_length;
				new->bse_length = end - new->bse_f_offset;
			}
			list_del(&old->bse_node);
			bl->bl_count--;
			kfree(old);
		}
	}
	dprintk("%s: after merging\n", __func__);
	print_clist(clist, bl->bl_count);
}
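
/*
 * Example: with the commitlist holding [0, 10), adding [8, 20) on the
 * same mdev extends the new extent left to [0, 20), frees the old
 * entry, and leaves one merged range on the list.
 */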

/* Note the range described by offset, length is guaranteed to be contained
 * within be.
 */
int bl_mark_for_commit(struct pnfs_block_extent *be,
		       sector_t offset, sector_t length)
{
	sector_t new_end, end = offset + length;
	struct pnfs_block_short_extent *new;
	struct pnfs_block_layout *bl = container_of(be->be_inval,
						    struct pnfs_block_layout,
						    bl_inval);

	new = kmalloc(sizeof(*new), GFP_NOFS);
	if (!new)
		return -ENOMEM;

	mark_written_sectors(be->be_inval, offset, length);
	/* We want to add the range to commit list, but it must be
	 * block-normalized, and verified that the normalized range has
	 * been entirely written to disk.
	 */
	new->bse_f_offset = offset;
	offset = normalize(offset, bl->bl_blocksize);
	if (offset < new->bse_f_offset) {
		if (is_range_written(be->be_inval, offset, new->bse_f_offset))
			new->bse_f_offset = offset;
		else
			new->bse_f_offset = offset + bl->bl_blocksize;
	}
	new_end = normalize_up(end, bl->bl_blocksize);
	if (end < new_end) {
		if (is_range_written(be->be_inval, end, new_end))
			end = new_end;
		else
			end = new_end - bl->bl_blocksize;
	}
	if (end <= new->bse_f_offset) {
		kfree(new);
		return 0;
	}
	new->bse_length = end - new->bse_f_offset;
	new->bse_devid = be->be_devid;
	new->bse_mdev = be->be_mdev;

	spin_lock(&bl->bl_ext_lock);
	/* new will be freed, either by add_to_commitlist if it decides not
	 * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
	 */
	add_to_commitlist(bl, new);
	spin_unlock(&bl->bl_ext_lock);
	return 0;
}
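
/*
 * Example: with an 8-sector block size, marking offset 4, length 8
 * covers [4, 12).  The left edge rounds down to 0 only if [0, 4) is
 * already written (else up to 8), and the right edge rounds up to 16
 * only if [12, 16) is already written (else down to 8).
 */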

static void print_bl_extent(struct pnfs_block_extent *be)
{
	dprintk("PRINT EXTENT extent %p\n", be);
	if (be) {
		dprintk("        be_f_offset %llu\n", (u64)be->be_f_offset);
		dprintk("        be_length   %llu\n", (u64)be->be_length);
		dprintk("        be_v_offset %llu\n", (u64)be->be_v_offset);
		dprintk("        be_state    %d\n", be->be_state);
	}
}

static void
destroy_extent(struct kref *kref)
{
	struct pnfs_block_extent *be;

	be = container_of(kref, struct pnfs_block_extent, be_refcnt);
	dprintk("%s be=%p\n", __func__, be);
	kfree(be);
}

void
bl_put_extent(struct pnfs_block_extent *be)
{
	if (be) {
		dprintk("%s enter %p (%i)\n", __func__, be,
			atomic_read(&be->be_refcnt.refcount));
		kref_put(&be->be_refcnt, destroy_extent);
	}
}

struct pnfs_block_extent *bl_alloc_extent(void)
{
	struct pnfs_block_extent *be;

	be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
	if (!be)
		return NULL;
	INIT_LIST_HEAD(&be->be_node);
	kref_init(&be->be_refcnt);
	be->be_inval = NULL;
	return be;
}

static void print_elist(struct list_head *list)
{
	struct pnfs_block_extent *be;

	dprintk("****************\n");
	dprintk("Extent list looks like:\n");
	list_for_each_entry(be, list, be_node) {
		print_bl_extent(be);
	}
	dprintk("****************\n");
}

static inline int
extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
{
	/* Note this assumes new->be_f_offset >= old->be_f_offset */
	return (new->be_state == old->be_state) &&
	       ((new->be_state == PNFS_BLOCK_NONE_DATA) ||
		((new->be_v_offset - old->be_v_offset ==
		  new->be_f_offset - old->be_f_offset) &&
		 new->be_mdev == old->be_mdev));
}
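
/*
 * In other words: same state, and for data-bearing extents also the
 * same device and the same file-to-disk shift
 * (be_v_offset - be_f_offset), so the two mappings can be merged
 * seamlessly.
 */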

/* Adds new to appropriate list in bl, modifying new and removing existing
 * extents as appropriate to deal with overlaps.
 *
 * See bl_find_get_extent for list constraints.
 *
 * Refcount on new is already set.  If we end up not using it, or on
 * error, that reference must be put.
 *
 * bl->bl_ext_lock is held by caller.
 */
int
bl_add_merge_extent(struct pnfs_block_layout *bl,
		    struct pnfs_block_extent *new)
{
	struct pnfs_block_extent *be, *tmp;
	sector_t end = new->be_f_offset + new->be_length;
	struct list_head *list;

	dprintk("%s enter with be=%p\n", __func__, new);
	print_bl_extent(new);
	list = &bl->bl_extents[bl_choose_list(new->be_state)];
	print_elist(list);

	/* Scan for proper place to insert, extending new to the left
	 * as much as possible.
	 */
	list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
		if (new->be_f_offset >= be->be_f_offset + be->be_length)
			break;
		if (new->be_f_offset >= be->be_f_offset) {
			if (end <= be->be_f_offset + be->be_length) {
				/* new is a subset of existing be */
				if (extents_consistent(be, new)) {
					dprintk("%s: new is subset, ignoring\n",
						__func__);
					bl_put_extent(new);
					return 0;
				} else {
					goto out_err;
				}
			} else {
				/* |<--  be  -->|
				 *        |<--  new  -->| */
				if (extents_consistent(be, new)) {
					/* extend new to fully replace be */
					new->be_length += new->be_f_offset -
							  be->be_f_offset;
					new->be_f_offset = be->be_f_offset;
					new->be_v_offset = be->be_v_offset;
					dprintk("%s: removing %p\n", __func__,
						be);
					list_del(&be->be_node);
					bl_put_extent(be);
				} else {
					goto out_err;
				}
			}
		} else if (end >= be->be_f_offset + be->be_length) {
			/* new fully overlaps existing be */
			if (extents_consistent(be, new)) {
				/* extend new to fully replace be */
				dprintk("%s: removing %p\n", __func__, be);
				list_del(&be->be_node);
				bl_put_extent(be);
			} else {
				goto out_err;
			}
		} else if (end > be->be_f_offset) {
			/*       |<--  be  -->|
			 * |<--  new  -->| */
			if (extents_consistent(new, be)) {
				/* extend new to fully replace be */
				new->be_length += be->be_f_offset +
						  be->be_length -
						  new->be_f_offset -
						  new->be_length;
				dprintk("%s: removing %p\n", __func__, be);
				list_del(&be->be_node);
				bl_put_extent(be);
			} else {
				goto out_err;
			}
		}
	}
	/* Note that if we never hit the above break, be will not point to a
	 * valid extent.  However, in that case &be->be_node == list.
	 */
	list_add(&new->be_node, &be->be_node);
	dprintk("%s: inserting new\n", __func__);
	print_elist(list);
	/* FIXME - The per-list consistency checks have all been done,
	 * should now check cross-list consistency.
	 */
	return 0;

out_err:
	bl_put_extent(new);
	return -EIO;
}

/* Returns extent, or NULL.  If a second READ extent exists, it is returned
 * in cow_read, if given.
 *
 * The extents are kept in two separate ordered lists, one for READ and NONE,
 * one for READWRITE and INVALID.  Within each list, we assume:
 * 1. Extents are ordered by file offset.
 * 2. For any given isect, there is at most one extent that matches.
 */
struct pnfs_block_extent *
bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
		   struct pnfs_block_extent **cow_read)
{
	struct pnfs_block_extent *be, *cow, *ret;
	int i;

	dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
	cow = ret = NULL;
	spin_lock(&bl->bl_ext_lock);
	for (i = 0; i < EXTENT_LISTS; i++) {
		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
			if (isect >= be->be_f_offset + be->be_length)
				break;
			if (isect >= be->be_f_offset) {
				/* We have found an extent */
				dprintk("%s Get %p (%i)\n", __func__, be,
					atomic_read(&be->be_refcnt.refcount));
				kref_get(&be->be_refcnt);
				if (!ret)
					ret = be;
				else if (be->be_state != PNFS_BLOCK_READ_DATA)
					bl_put_extent(be);
				else
					cow = be;
				break;
			}
		}
		if (ret &&
		    (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
			break;
	}
	spin_unlock(&bl->bl_ext_lock);
	if (cow_read)
		*cow_read = cow;
	print_bl_extent(ret);
	return ret;
}
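
/*
 * Note: the scan above moves on to the second list either when nothing
 * matched, or when the match is an INVALID_DATA write extent and the
 * caller supplied cow_read; a READ_DATA extent found then is handed
 * back through cow_read for copy-on-write.
 */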

/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */
static struct pnfs_block_extent *
bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
{
	struct pnfs_block_extent *be, *ret = NULL;
	int i;

	dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
	for (i = 0; i < EXTENT_LISTS; i++) {
		if (ret)
			break;
		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
			if (isect >= be->be_f_offset + be->be_length)
				break;
			if (isect >= be->be_f_offset) {
				/* We have found an extent */
				dprintk("%s Get %p (%i)\n", __func__, be,
					atomic_read(&be->be_refcnt.refcount));
				kref_get(&be->be_refcnt);
				ret = be;
				break;
			}
		}
	}
	print_bl_extent(ret);
	return ret;
}

int
encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
			       struct xdr_stream *xdr,
			       const struct nfs4_layoutcommit_args *arg)
{
	struct pnfs_block_short_extent *lce, *save;
	unsigned int count = 0;
	__be32 *p, *xdr_start;

	dprintk("%s enter\n", __func__);
	/* BUG - creation of bl_commit is buggy - need to wait for
	 * entire block to be marked WRITTEN before it can be added.
	 */
	spin_lock(&bl->bl_ext_lock);
	/* Want to adjust for possible truncate */
	/* We now want to adjust argument range */

	/* XDR encode the ranges found */
	xdr_start = xdr_reserve_space(xdr, 8);
	if (!xdr_start)
		goto out;
	list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
		p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data));
		if (!p)
			break;
		p = xdr_encode_opaque_fixed(p, lce->bse_devid.data,
					    NFS4_DEVICEID4_SIZE);
		p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT);
		p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
		p = xdr_encode_hyper(p, 0LL);
		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
		list_del(&lce->bse_node);
		list_add_tail(&lce->bse_node, &bl->bl_committing);
		bl->bl_count--;
		count++;
	}
	xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
	xdr_start[1] = cpu_to_be32(count);
out:
	spin_unlock(&bl->bl_ext_lock);
	dprintk("%s found %i ranges\n", __func__, count);
	return 0;
}
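
/*
 * Each commitlist entry encodes as 44 bytes: a 16-byte deviceid, three
 * 64-bit hypers (file offset, length, and a zero storage offset), and
 * a 32-bit PNFS_BLOCK_READWRITE_DATA state.  The two words reserved up
 * front are back-patched with the byte length of the encoded body and
 * the number of extents once the list has been walked.
 */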

/* Helper function for set_to_rw that initializes a new extent */
static void
_prep_new_extent(struct pnfs_block_extent *new,
		 struct pnfs_block_extent *orig,
		 sector_t offset, sector_t length, int state)
{
	kref_init(&new->be_refcnt);
	/* don't need to INIT_LIST_HEAD(&new->be_node) */
	memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
	new->be_mdev = orig->be_mdev;
	new->be_f_offset = offset;
	new->be_length = length;
	new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
	new->be_state = state;
	new->be_inval = orig->be_inval;
}

/* Tries to merge be with extent in front of it in list.
 * Frees storage if not used.
 */
static struct pnfs_block_extent *
_front_merge(struct pnfs_block_extent *be, struct list_head *head,
	     struct pnfs_block_extent *storage)
{
	struct pnfs_block_extent *prev;

	if (!storage)
		goto no_merge;
	if (&be->be_node == head || be->be_node.prev == head)
		goto no_merge;
	prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
	if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
	    !extents_consistent(prev, be))
		goto no_merge;
	_prep_new_extent(storage, prev, prev->be_f_offset,
			 prev->be_length + be->be_length, prev->be_state);
	list_replace(&prev->be_node, &storage->be_node);
	bl_put_extent(prev);
	list_del(&be->be_node);
	bl_put_extent(be);
	return storage;

no_merge:
	kfree(storage);
	return be;
}
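
/*
 * The merged node is built in caller-supplied storage rather than by
 * editing prev in place, so no allocation is needed while
 * bl->bl_ext_lock is held; unused storage is simply freed.
 */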

static u64
set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
{
	u64 rv = offset + length;
	struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
	struct pnfs_block_extent *children[3];
	struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
	int i = 0, j;

	dprintk("%s(%llu, %llu)\n", __func__, offset, length);
	/* Create storage for up to three new extents e1, e2, e3 */
	e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
	e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
	e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
	/* BUG - we are ignoring any failure */
	if (!e1 || !e2 || !e3)
		goto out_nosplit;

	spin_lock(&bl->bl_ext_lock);
	be = bl_find_get_extent_locked(bl, offset);
	if (!be) {
		/* No extent covers offset; bail out without splitting */
		spin_unlock(&bl->bl_ext_lock);
		goto out_nosplit;
	}
	rv = be->be_f_offset + be->be_length;
	if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
		bl_put_extent(be);	/* drop the lookup reference */
		spin_unlock(&bl->bl_ext_lock);
		goto out_nosplit;
	}
	/* Add e* to children, bumping e*'s krefs */
	if (be->be_f_offset != offset) {
		_prep_new_extent(e1, be, be->be_f_offset,
				 offset - be->be_f_offset,
				 PNFS_BLOCK_INVALID_DATA);
		children[i++] = e1;
		print_bl_extent(e1);
	} else
		merge1 = e1;
	_prep_new_extent(e2, be, offset,
			 min(length, be->be_f_offset + be->be_length - offset),
			 PNFS_BLOCK_READWRITE_DATA);
	children[i++] = e2;
	print_bl_extent(e2);
	if (offset + length < be->be_f_offset + be->be_length) {
		_prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
				 be->be_f_offset + be->be_length -
				 offset - length,
				 PNFS_BLOCK_INVALID_DATA);
		children[i++] = e3;
		print_bl_extent(e3);
	} else
		merge2 = e3;

	/* Remove be from list, and insert the e* */
	/* We don't get refs on e*, since this list is the base reference
	 * set when init'ed.
	 */
	if (i < 3)
		children[i] = NULL;
	new = children[0];
	list_replace(&be->be_node, &new->be_node);
	bl_put_extent(be);
	new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
	for (j = 1; j < i; j++) {
		old = new;
		new = children[j];
		list_add(&new->be_node, &old->be_node);
	}
	if (merge2) {
		/* This is a HACK, should just create a _back_merge function */
		new = list_entry(new->be_node.next,
				 struct pnfs_block_extent, be_node);
		new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
	}
	spin_unlock(&bl->bl_ext_lock);

	/* Since we removed the base reference above, be is now scheduled for
	 * destruction.
	 */
	bl_put_extent(be);
	dprintk("%s returns %llu after split\n", __func__, rv);
	return rv;

out_nosplit:
	kfree(e1);
	kfree(e2);
	kfree(e3);
	dprintk("%s returns %llu without splitting\n", __func__, rv);
	return rv;
}
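
/*
 * Example: converting [8, 16) inside an INVALID extent covering [0, 32)
 * splits it into INVALID [0, 8), READWRITE [8, 16) and INVALID
 * [16, 32), then returns 32 so the caller can continue past this
 * extent.
 */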

void
clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
			      const struct nfs4_layoutcommit_args *arg,
			      int status)
{
	struct pnfs_block_short_extent *lce, *save;

	dprintk("%s status %d\n", __func__, status);
	list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) {
		if (likely(!status)) {
			u64 offset = lce->bse_f_offset;
			u64 end = offset + lce->bse_length;

			do {
				offset = set_to_rw(bl, offset, end - offset);
			} while (offset < end);
			list_del(&lce->bse_node);
			kfree(lce);
		} else {
			list_del(&lce->bse_node);
			spin_lock(&bl->bl_ext_lock);
			add_to_commitlist(bl, lce);
			spin_unlock(&bl->bl_ext_lock);
		}
	}
}