author		Christoph Hellwig <hch@lst.de>	2014-09-10 11:23:34 -0400
committer	Trond Myklebust <trond.myklebust@primarydata.com>	2014-09-10 15:47:03 -0400
commit		8067253c8cc531b6f367b9f5942bdc6168385701 (patch)
tree		78d7663906883132df08a2a1aecf6e385fd766d1 /fs/nfs
parent		8c792ea940499153732adea2ea4ca37f6999778f (diff)
pnfs/blocklayout: rewrite extent tracking
Currently the block layout driver tracks extents in three separate data
structures:

 - the two lists of pnfs_block_extent structures returned by the server
 - the list of sectors that were in invalid state but have been written to
 - a list of pnfs_block_short_extent structures for LAYOUTCOMMIT

All of these share the property that they are not only highly inefficient
data structures, but also that operations on them are even more inefficient
than necessary.

In addition there are various implementation defects:

 - using an int to track sectors, causing corruption for large offsets
 - incorrect normalization of page or block granularity ranges
 - insufficient error handling
 - incorrect synchronization, as extents can be modified while they are
   in use

This patch replaces all three data structures with a single unified rbtree
structure tracking all extents, as well as their in-memory state, although
we still need two instances, one for read-only and one for read-write
extents, due to the arcane client-side COW feature in the block layouts
spec.

To fix the problem of extents possibly being modified while in use, we make
sure to return a copy of the extent for use in the write path - the extent
can only be invalidated by a layout recall or return, which has to wait
until the I/O operations have finished due to refcounts on the layout
segment.

The new extent tree works similarly to the schemes used by block-based
filesystems like XFS or ext4.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
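The lookup at the core of the new scheme is a conventional interval walk over
a tree sorted by file offset. A minimal userspace sketch of the same walk
(plain C; a simple unbalanced binary tree stands in for the kernel rbtree, and
the names here are illustrative, not taken from the patch):

#include <stdbool.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* Illustrative stand-in for the patch's struct pnfs_block_extent. */
struct extent {
	struct extent *left, *right;	/* the kernel code uses struct rb_node */
	sector_t f_offset;		/* starting sector in the file */
	sector_t length;		/* extent length in 512-byte sectors */
};

/*
 * Same walk as __ext_tree_lookup() in the patch below: descend left when
 * the target sector precedes the extent, right when it is at or past its
 * end, and report a hit when it falls inside [f_offset, f_offset + length).
 */
static bool extent_lookup(const struct extent *node, sector_t isect,
			  const struct extent **ret)
{
	while (node) {
		if (isect < node->f_offset)
			node = node->left;
		else if (isect >= node->f_offset + node->length)
			node = node->right;
		else {
			*ret = node;
			return true;
		}
	}
	return false;
}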
Diffstat (limited to 'fs/nfs')
-rw-r--r--	fs/nfs/blocklayout/Makefile		3
-rw-r--r--	fs/nfs/blocklayout/blocklayout.c	258
-rw-r--r--	fs/nfs/blocklayout/blocklayout.h	112
-rw-r--r--	fs/nfs/blocklayout/blocklayoutdev.c	35
-rw-r--r--	fs/nfs/blocklayout/extent_tree.c	547
-rw-r--r--	fs/nfs/blocklayout/extents.c		908
6 files changed, 651 insertions, 1212 deletions
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
index d5815505c020..3fa5ec780a8e 100644
--- a/fs/nfs/blocklayout/Makefile
+++ b/fs/nfs/blocklayout/Makefile
@@ -2,4 +2,5 @@
 # Makefile for the pNFS block layout driver kernel module
 #
 obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
-blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o
+blocklayoutdriver-objs := blocklayout.o blocklayoutdev.o blocklayoutdm.o \
+			   extent_tree.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 5aa23750a149..8502e620f644 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -49,26 +49,16 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
 MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
 
-/* Given the be associated with isect, determine if page data needs to be
- * initialized.
- */
-static int is_hole(struct pnfs_block_extent *be, sector_t isect)
-{
-	if (be->be_state == PNFS_BLOCK_NONE_DATA)
-		return 1;
-	else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
-		return 0;
-	else
-		return !bl_is_sector_init(be->be_inval, isect);
-}
-
-/* Given the be associated with isect, determine if page data can be
- * written to disk.
- */
-static int is_writable(struct pnfs_block_extent *be, sector_t isect)
+static bool is_hole(struct pnfs_block_extent *be)
 {
-	return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
-		be->be_state == PNFS_BLOCK_INVALID_DATA);
+	switch (be->be_state) {
+	case PNFS_BLOCK_NONE_DATA:
+		return true;
+	case PNFS_BLOCK_INVALID_DATA:
+		return be->be_tag ? false : true;
+	default:
+		return false;
+	}
 }
 
 /* The data we are handed might be spread across several bios. We need
@@ -76,9 +66,8 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
  */
 struct parallel_io {
 	struct kref refcnt;
-	void (*pnfs_callback) (void *data, int num_se);
+	void (*pnfs_callback) (void *data);
 	void *data;
-	int bse_count;
 };
 
 static inline struct parallel_io *alloc_parallel(void *data)
@@ -89,7 +78,6 @@ static inline struct parallel_io *alloc_parallel(void *data)
 	if (rv) {
 		rv->data = data;
 		kref_init(&rv->refcnt);
-		rv->bse_count = 0;
 	}
 	return rv;
 }
@@ -104,7 +92,7 @@ static void destroy_parallel(struct kref *kref)
 	struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
 
 	dprintk("%s enter\n", __func__);
-	p->pnfs_callback(p->data, p->bse_count);
+	p->pnfs_callback(p->data);
 	kfree(p);
 }
 
@@ -200,7 +188,7 @@ static void bl_read_cleanup(struct work_struct *work)
 }
 
 static void
-bl_end_par_io_read(void *data, int unused)
+bl_end_par_io_read(void *data)
 {
 	struct nfs_pgio_header *hdr = data;
 
@@ -210,56 +198,46 @@ bl_end_par_io_read(void *data, int unused)
 }
 
 static enum pnfs_try_status
-bl_read_pagelist(struct nfs_pgio_header *hdr)
+bl_read_pagelist(struct nfs_pgio_header *header)
 {
-	struct nfs_pgio_header *header = hdr;
-	int i, hole;
+	struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
 	struct bio *bio = NULL;
-	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
+	struct pnfs_block_extent be;
 	sector_t isect, extent_length = 0;
 	struct parallel_io *par;
-	loff_t f_offset = hdr->args.offset;
-	size_t bytes_left = hdr->args.count;
+	loff_t f_offset = header->args.offset;
+	size_t bytes_left = header->args.count;
 	unsigned int pg_offset, pg_len;
-	struct page **pages = hdr->args.pages;
-	int pg_index = hdr->args.pgbase >> PAGE_CACHE_SHIFT;
+	struct page **pages = header->args.pages;
+	int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
 	const bool is_dio = (header->dreq != NULL);
 	struct blk_plug plug;
+	int i;
 
 	dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
-		hdr->page_array.npages, f_offset,
-		(unsigned int)hdr->args.count);
+		header->page_array.npages, f_offset,
+		(unsigned int)header->args.count);
 
-	par = alloc_parallel(hdr);
+	par = alloc_parallel(header);
 	if (!par)
-		goto use_mds;
+		return PNFS_NOT_ATTEMPTED;
 	par->pnfs_callback = bl_end_par_io_read;
-	/* At this point, we can no longer jump to use_mds */
 
 	blk_start_plug(&plug);
 
 	isect = (sector_t) (f_offset >> SECTOR_SHIFT);
 	/* Code assumes extents are page-aligned */
-	for (i = pg_index; i < hdr->page_array.npages; i++) {
+	for (i = pg_index; i < header->page_array.npages; i++) {
 		if (extent_length <= 0) {
 			/* We've used up the previous extent */
-			bl_put_extent(be);
-			bl_put_extent(cow_read);
 			bio = bl_submit_bio(READ, bio);
+
 			/* Get the next one */
-			be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
-						isect, &cow_read);
-			if (!be) {
+			if (!ext_tree_lookup(bl, isect, &be, false)) {
 				header->pnfs_error = -EIO;
 				goto out;
 			}
-			extent_length = be->be_length -
-				(isect - be->be_f_offset);
-			if (cow_read) {
-				sector_t cow_length = cow_read->be_length -
-					(isect - cow_read->be_f_offset);
-				extent_length = min(extent_length, cow_length);
-			}
+			extent_length = be.be_length - (isect - be.be_f_offset);
 		}
 
 		pg_offset = f_offset & ~PAGE_CACHE_MASK;
@@ -278,20 +256,16 @@ bl_read_pagelist(struct nfs_pgio_header *hdr)
 			pg_len = PAGE_CACHE_SIZE;
 		}
 
-		hole = is_hole(be, isect);
-		if (hole && !cow_read) {
+		if (is_hole(&be)) {
 			bio = bl_submit_bio(READ, bio);
 			/* Fill hole w/ zeroes w/o accessing device */
 			dprintk("%s Zeroing page for hole\n", __func__);
 			zero_user_segment(pages[i], pg_offset, pg_len);
 		} else {
-			struct pnfs_block_extent *be_read;
-
-			be_read = (hole && cow_read) ? cow_read : be;
 			bio = do_add_page_to_bio(bio,
-						 hdr->page_array.npages - i,
+						 header->page_array.npages - i,
 						 READ,
-						 isect, pages[i], be_read,
+						 isect, pages[i], &be,
 						 bl_end_io_read, par,
 						 pg_offset, pg_len);
 			if (IS_ERR(bio)) {
@@ -304,50 +278,16 @@ bl_read_pagelist(struct nfs_pgio_header *hdr)
 		extent_length -= (pg_len >> SECTOR_SHIFT);
 	}
 	if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
-		hdr->res.eof = 1;
-		hdr->res.count = header->inode->i_size - hdr->args.offset;
+		header->res.eof = 1;
+		header->res.count = header->inode->i_size - header->args.offset;
 	} else {
-		hdr->res.count = (isect << SECTOR_SHIFT) - hdr->args.offset;
+		header->res.count = (isect << SECTOR_SHIFT) - header->args.offset;
 	}
 out:
-	bl_put_extent(be);
-	bl_put_extent(cow_read);
 	bl_submit_bio(READ, bio);
 	blk_finish_plug(&plug);
 	put_parallel(par);
 	return PNFS_ATTEMPTED;
-
- use_mds:
-	dprintk("Giving up and using normal NFS\n");
-	return PNFS_NOT_ATTEMPTED;
-}
-
-static void mark_extents_written(struct pnfs_block_layout *bl,
-				 __u64 offset, __u32 count)
-{
-	sector_t isect, end;
-	struct pnfs_block_extent *be;
-	struct pnfs_block_short_extent *se;
-
-	dprintk("%s(%llu, %u)\n", __func__, offset, count);
-	if (count == 0)
-		return;
-	isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
-	end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
-	end >>= SECTOR_SHIFT;
-	while (isect < end) {
-		sector_t len;
-		be = bl_find_get_extent(bl, isect, NULL);
-		BUG_ON(!be); /* FIXME */
-		len = min(end, be->be_f_offset + be->be_length) - isect;
-		if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
-			se = bl_pop_one_short_extent(be->be_inval);
-			BUG_ON(!se);
-			bl_mark_for_commit(be, isect, len, se);
-		}
-		isect += len;
-		bl_put_extent(be);
-	}
-}
 }
 
 static void bl_end_io_write(struct bio *bio, int err)
@@ -370,29 +310,30 @@ static void bl_end_io_write(struct bio *bio, int err)
  */
 static void bl_write_cleanup(struct work_struct *work)
 {
-	struct rpc_task *task;
-	struct nfs_pgio_header *hdr;
+	struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
+	struct nfs_pgio_header *hdr =
+			container_of(task, struct nfs_pgio_header, task);
+
 	dprintk("%s enter\n", __func__);
-	task = container_of(work, struct rpc_task, u.tk_work);
-	hdr = container_of(task, struct nfs_pgio_header, task);
+
 	if (likely(!hdr->pnfs_error)) {
-		/* Marks for LAYOUTCOMMIT */
-		mark_extents_written(BLK_LSEG2EXT(hdr->lseg),
-				     hdr->args.offset, hdr->args.count);
+		struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg);
+		u64 start = hdr->args.offset & (loff_t)PAGE_CACHE_MASK;
+		u64 end = (hdr->args.offset + hdr->args.count +
+			PAGE_CACHE_SIZE - 1) & (loff_t)PAGE_CACHE_MASK;
+
+		ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
+					(end - start) >> SECTOR_SHIFT);
 	}
+
 	pnfs_ld_write_done(hdr);
 }
 
 /* Called when last of bios associated with a bl_write_pagelist call finishes */
-static void bl_end_par_io_write(void *data, int num_se)
+static void bl_end_par_io_write(void *data)
 {
 	struct nfs_pgio_header *hdr = data;
 
-	if (unlikely(hdr->pnfs_error)) {
-		bl_free_short_extents(&BLK_LSEG2EXT(hdr->lseg)->bl_inval,
-				      num_se);
-	}
-
 	hdr->task.tk_status = hdr->pnfs_error;
 	hdr->verf.committed = NFS_FILE_SYNC;
 	INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
@@ -402,9 +343,9 @@ static void bl_end_par_io_write(void *data, int num_se)
 static enum pnfs_try_status
 bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 {
-	int i, ret;
+	struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
 	struct bio *bio = NULL;
-	struct pnfs_block_extent *be = NULL;
+	struct pnfs_block_extent be;
 	sector_t isect, extent_length = 0;
 	struct parallel_io *par = NULL;
 	loff_t offset = header->args.offset;
@@ -412,6 +353,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 	struct page **pages = header->args.pages;
 	int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
 	struct blk_plug plug;
+	int i;
 
 	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
 
@@ -421,9 +363,8 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 	 */
 	par = alloc_parallel(header);
 	if (!par)
-		goto out_mds;
+		return PNFS_NOT_ATTEMPTED;
 	par->pnfs_callback = bl_end_par_io_write;
-	/* At this point, have to be more careful with error handling */
 
 	blk_start_plug(&plug);
 
@@ -434,44 +375,18 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 	for (i = pg_index; i < header->page_array.npages; i++) {
 		if (extent_length <= 0) {
 			/* We've used up the previous extent */
-			bl_put_extent(be);
 			bio = bl_submit_bio(WRITE, bio);
 			/* Get the next one */
-			be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
-					       isect, NULL);
-			if (!be || !is_writable(be, isect)) {
+			if (!ext_tree_lookup(bl, isect, &be, true)) {
 				header->pnfs_error = -EINVAL;
 				goto out;
 			}
-			if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
-				if (likely(!bl_push_one_short_extent(
-						be->be_inval)))
-					par->bse_count++;
-				else {
-					header->pnfs_error = -ENOMEM;
-					goto out;
-				}
-			}
-			extent_length = be->be_length -
-				(isect - be->be_f_offset);
-		}
 
-		BUG_ON(offset & ~PAGE_CACHE_MASK);
-
-		if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
-		    !bl_is_sector_init(be->be_inval, isect)) {
-			ret = bl_mark_sectors_init(be->be_inval, isect,
-						   PAGE_CACHE_SECTORS);
-			if (unlikely(ret)) {
-				dprintk("%s bl_mark_sectors_init fail %d\n",
-					__func__, ret);
-				header->pnfs_error = ret;
-				goto out;
-			}
+			extent_length = be.be_length - (isect - be.be_f_offset);
 		}
 
 		bio = do_add_page_to_bio(bio, header->page_array.npages - i,
-				       WRITE, isect, pages[i], be,
+				       WRITE, isect, pages[i], &be,
 				       bl_end_io_write, par,
 				       0, PAGE_CACHE_SIZE);
 		if (IS_ERR(bio)) {
@@ -487,60 +402,22 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 
 	header->res.count = header->args.count;
 out:
-	bl_put_extent(be);
 	bl_submit_bio(WRITE, bio);
 	blk_finish_plug(&plug);
 	put_parallel(par);
 	return PNFS_ATTEMPTED;
-out_mds:
-	return PNFS_NOT_ATTEMPTED;
-}
-
-/* FIXME - range ignored */
-static void
-release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
-{
-	int i;
-	struct pnfs_block_extent *be;
-
-	spin_lock(&bl->bl_ext_lock);
-	for (i = 0; i < EXTENT_LISTS; i++) {
-		while (!list_empty(&bl->bl_extents[i])) {
-			be = list_first_entry(&bl->bl_extents[i],
-					      struct pnfs_block_extent,
-					      be_node);
-			list_del(&be->be_node);
-			bl_put_extent(be);
-		}
-	}
-	spin_unlock(&bl->bl_ext_lock);
-}
-
-static void
-release_inval_marks(struct pnfs_inval_markings *marks)
-{
-	struct pnfs_inval_tracking *pos, *temp;
-	struct pnfs_block_short_extent *se, *stemp;
-
-	list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
-		list_del(&pos->it_link);
-		kfree(pos);
-	}
-
-	list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
-		list_del(&se->bse_node);
-		kfree(se);
-	}
-	return;
 }
 
 static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
 {
 	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+	int err;
 
 	dprintk("%s enter\n", __func__);
-	release_extents(bl, NULL);
-	release_inval_marks(&bl->bl_inval);
+
+	err = ext_tree_remove(bl, true, 0, LLONG_MAX);
+	WARN_ON(err);
+
 	kfree(bl);
 }
 
@@ -553,14 +430,11 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
 	bl = kzalloc(sizeof(*bl), gfp_flags);
 	if (!bl)
 		return NULL;
+
+	bl->bl_ext_rw = RB_ROOT;
+	bl->bl_ext_ro = RB_ROOT;
 	spin_lock_init(&bl->bl_ext_lock);
-	INIT_LIST_HEAD(&bl->bl_extents[0]);
-	INIT_LIST_HEAD(&bl->bl_extents[1]);
-	INIT_LIST_HEAD(&bl->bl_commit);
-	INIT_LIST_HEAD(&bl->bl_committing);
-	bl->bl_count = 0;
-	bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
-	BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
+
 	return &bl->bl_layout;
 }
 
@@ -600,7 +474,7 @@ bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
 			       const struct nfs4_layoutcommit_args *arg)
 {
 	dprintk("%s enter\n", __func__);
-	encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
+	ext_tree_encode_commit(BLK_LO2EXT(lo), xdr);
 }
 
 static void
@@ -609,7 +483,7 @@ bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
 	struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout;
 
 	dprintk("%s enter\n", __func__);
-	clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
+	ext_tree_mark_committed(BLK_LO2EXT(lo), lcdata->res.status);
 }
 
 static void free_blk_mountid(struct block_mount_id *mid)
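The page rounding in the new bl_write_cleanup() above is worth a worked
example. Assuming the usual PAGE_CACHE_SIZE of 4096 and SECTOR_SHIFT of 9
(512-byte sectors):

	hdr->args.offset = 5000, hdr->args.count = 3000
	start = 5000 & ~4095                  = 4096
	end   = (5000 + 3000 + 4095) & ~4095  = 8192
	ext_tree_mark_written(bl, 4096 >> 9, (8192 - 4096) >> 9)
		/* marks 8 sectors as written, starting at sector 8 */

Rounding the start down and the end up to page boundaries matches the
page-aligned extents the write path produced.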
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 9838fb020473..b4f66d875f12 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -63,82 +63,28 @@ enum exstate4 {
 	PNFS_BLOCK_NONE_DATA		= 3  /* unmapped, it's a hole */
 };
 
-#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */
-
-struct my_tree {
-	sector_t		mtt_step_size;	/* Internal sector alignment */
-	struct list_head	mtt_stub; /* Should be a radix tree */
-};
-
-struct pnfs_inval_markings {
-	spinlock_t	im_lock;
-	struct my_tree	im_tree;	/* Sectors that need LAYOUTCOMMIT */
-	sector_t	im_block_size;	/* Server blocksize in sectors */
-	struct list_head im_extents;	/* Short extents for INVAL->RW conversion */
-};
-
-struct pnfs_inval_tracking {
-	struct list_head it_link;
-	int		 it_sector;
-	int		 it_tags;
-};
-
 /* sector_t fields are all in 512-byte sectors */
 struct pnfs_block_extent {
-	struct kref	be_refcnt;
-	struct list_head be_node;	/* link into lseg list */
-	struct nfs4_deviceid be_devid;	/* FIXME: could use device cache instead */
+	union {
+		struct rb_node	be_node;
+		struct list_head be_list;
+	};
+	struct nfs4_deviceid be_devid;	/* FIXME: could use device cache instead */
 	struct block_device *be_mdev;
 	sector_t	be_f_offset;	/* the starting offset in the file */
 	sector_t	be_length;	/* the size of the extent */
 	sector_t	be_v_offset;	/* the starting offset in the volume */
 	enum exstate4	be_state;	/* the state of this extent */
-	struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
-};
-
-/* Shortened extent used by LAYOUTCOMMIT */
-struct pnfs_block_short_extent {
-	struct list_head bse_node;
-	struct nfs4_deviceid bse_devid;
-	struct block_device *bse_mdev;
-	sector_t	bse_f_offset; /* the starting offset in the file */
-	sector_t	bse_length;   /* the size of the extent */
+#define EXTENT_WRITTEN		1
+#define EXTENT_COMMITTING	2
+	unsigned int	be_tag;
 };
 
-static inline void
-BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
-{
-	spin_lock_init(&marks->im_lock);
-	INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
-	INIT_LIST_HEAD(&marks->im_extents);
-	marks->im_block_size = blocksize;
-	marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
-					   blocksize);
-}
-
-enum extentclass4 {
-	RW_EXTENT       = 0, /* READWRTE and INVAL */
-	RO_EXTENT       = 1, /* READ and NONE */
-	EXTENT_LISTS    = 2,
-};
-
-static inline int bl_choose_list(enum exstate4 state)
-{
-	if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
-		return RO_EXTENT;
-	else
-		return RW_EXTENT;
-}
-
 struct pnfs_block_layout {
 	struct pnfs_layout_hdr	bl_layout;
-	struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
+	struct rb_root		bl_ext_rw;
+	struct rb_root		bl_ext_ro;
 	spinlock_t		bl_ext_lock;   /* Protects list manipulation */
-	struct list_head	bl_extents[EXTENT_LISTS]; /* R and RW extents */
-	struct list_head	bl_commit;	/* Needs layout commit */
-	struct list_head	bl_committing;	/* Layout committing */
-	unsigned int		bl_count;	/* entries in bl_commit */
-	sector_t		bl_blocksize;  /* Server blocksize in sectors */
 };
 
 #define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data))
@@ -183,29 +129,17 @@ int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
 /* blocklayoutdm.c */
 void bl_free_block_dev(struct pnfs_block_dev *bdev);
 
-/* extents.c */
-struct pnfs_block_extent *
-bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
-		struct pnfs_block_extent **cow_read);
-int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
-			 sector_t offset, sector_t length);
-void bl_put_extent(struct pnfs_block_extent *be);
-struct pnfs_block_extent *bl_alloc_extent(void);
-int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect);
-int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
-				   struct xdr_stream *xdr,
-				   const struct nfs4_layoutcommit_args *arg);
-void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
-				   const struct nfs4_layoutcommit_args *arg,
-				   int status);
-int bl_add_merge_extent(struct pnfs_block_layout *bl,
-			 struct pnfs_block_extent *new);
-int bl_mark_for_commit(struct pnfs_block_extent *be,
-			sector_t offset, sector_t length,
-			struct pnfs_block_short_extent *new);
-int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
-struct pnfs_block_short_extent *
-bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
-void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
+/* extent_tree.c */
+int ext_tree_insert(struct pnfs_block_layout *bl,
+		struct pnfs_block_extent *new);
+int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
+		sector_t end);
+int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
+		sector_t len);
+bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
+		struct pnfs_block_extent *ret, bool rw);
+int ext_tree_encode_commit(struct pnfs_block_layout *bl,
+		struct xdr_stream *xdr);
+void ext_tree_mark_committed(struct pnfs_block_layout *bl, int status);
 
 #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
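Note the shape of the replacement API: ext_tree_lookup() copies the found
extent into caller-owned memory rather than returning a refcounted pointer,
which is what lets bl_put_extent() and the other refcounting helpers
disappear. A sketch of the calling convention, condensed from
bl_read_pagelist() in the blocklayout.c hunk above (bio plumbing omitted):

	struct pnfs_block_extent be;	/* on-stack copy, nothing to put */

	/* rw=false: search the read-only (COW) tree first, then read-write */
	if (!ext_tree_lookup(bl, isect, &be, false)) {
		header->pnfs_error = -EIO;
		goto out;
	}
	extent_length = be.be_length - (isect - be.be_f_offset);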
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index 63f77925aa87..cd71b5e231ec 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -309,7 +309,7 @@ nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
 	 * recovery easier.
 	 */
 	for (i = 0; i < count; i++) {
-		be = bl_alloc_extent();
+		be = kzalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
 		if (!be) {
 			status = -ENOMEM;
 			goto out_err;
@@ -330,13 +330,11 @@ nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
 		if (decode_sector_number(&p, &be->be_v_offset) < 0)
 			goto out_err;
 		be->be_state = be32_to_cpup(p++);
-		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
-			be->be_inval = &bl->bl_inval;
 		if (verify_extent(be, &lv)) {
 			dprintk("%s verify failed\n", __func__);
 			goto out_err;
 		}
-		list_add_tail(&be->be_node, &extents);
+		list_add_tail(&be->be_list, &extents);
 	}
 	if (lgr->range.offset + lgr->range.length !=
 			lv.start << SECTOR_SHIFT) {
@@ -352,21 +350,13 @@ nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
 	/* Extents decoded properly, now try to merge them in to
 	 * existing layout extents.
 	 */
-	spin_lock(&bl->bl_ext_lock);
-	list_for_each_entry_safe(be, save, &extents, be_node) {
-		list_del(&be->be_node);
-		status = bl_add_merge_extent(bl, be);
-		if (status) {
-			spin_unlock(&bl->bl_ext_lock);
-			/* This is a fairly catastrophic error, as the
-			 * entire layout extent lists are now corrupted.
-			 * We should have some way to distinguish this.
-			 */
-			be = NULL;
-			goto out_err;
-		}
+	list_for_each_entry_safe(be, save, &extents, be_list) {
+		list_del(&be->be_list);
+
+		status = ext_tree_insert(bl, be);
+		if (status)
+			goto out_free_list;
 	}
-	spin_unlock(&bl->bl_ext_lock);
 	status = 0;
  out:
 	__free_page(scratch);
@@ -374,12 +364,13 @@ nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
 	return status;
 
  out_err:
-	bl_put_extent(be);
+	kfree(be);
+ out_free_list:
 	while (!list_empty(&extents)) {
 		be = list_first_entry(&extents, struct pnfs_block_extent,
-				      be_node);
-		list_del(&be->be_node);
-		bl_put_extent(be);
+				      be_list);
+		list_del(&be->be_list);
+		kfree(be);
 	}
 	goto out;
 }
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
new file mode 100644
index 000000000000..c8c59a5b1a8f
--- /dev/null
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY	NFSDBG_PNFS_LD
+
+static inline struct pnfs_block_extent *
+ext_node(struct rb_node *node)
+{
+	return rb_entry(node, struct pnfs_block_extent, be_node);
+}
+
+static struct pnfs_block_extent *
+ext_tree_first(struct rb_root *root)
+{
+	struct rb_node *node = rb_first(root);
+	return node ? ext_node(node) : NULL;
+}
+
+static struct pnfs_block_extent *
+ext_tree_prev(struct pnfs_block_extent *be)
+{
+	struct rb_node *node = rb_prev(&be->be_node);
+	return node ? ext_node(node) : NULL;
+}
+
+static struct pnfs_block_extent *
+ext_tree_next(struct pnfs_block_extent *be)
+{
+	struct rb_node *node = rb_next(&be->be_node);
+	return node ? ext_node(node) : NULL;
+}
+
+static inline sector_t
+ext_f_end(struct pnfs_block_extent *be)
+{
+	return be->be_f_offset + be->be_length;
+}
+
+static struct pnfs_block_extent *
+__ext_tree_search(struct rb_root *root, sector_t start)
+{
+	struct rb_node *node = root->rb_node;
+	struct pnfs_block_extent *be = NULL;
+
+	while (node) {
+		be = ext_node(node);
+		if (start < be->be_f_offset)
+			node = node->rb_left;
+		else if (start >= ext_f_end(be))
+			node = node->rb_right;
+		else
+			return be;
+	}
+
+	if (be) {
+		if (start < be->be_f_offset)
+			return be;
+
+		if (start >= ext_f_end(be))
+			return ext_tree_next(be);
+	}
+
+	return NULL;
+}
+
+static bool
+ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
+{
+	if (be1->be_state != be2->be_state)
+		return false;
+	if (be1->be_mdev != be2->be_mdev)
+		return false;
+
+	if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
+		return false;
+
+	if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
+	    (be1->be_v_offset + be1->be_length != be2->be_v_offset))
+		return false;
+
+	if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
+	    be1->be_tag != be2->be_tag)
+		return false;
+
+	return true;
+}
+
+static struct pnfs_block_extent *
+ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
+{
+	struct pnfs_block_extent *left = ext_tree_prev(be);
+
+	if (left && ext_can_merge(left, be)) {
+		left->be_length += be->be_length;
+		rb_erase(&be->be_node, root);
+		kfree(be);
+		return left;
+	}
+
+	return be;
+}
+
+static struct pnfs_block_extent *
+ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
+{
+	struct pnfs_block_extent *right = ext_tree_next(be);
+
+	if (right && ext_can_merge(be, right)) {
+		be->be_length += right->be_length;
+		rb_erase(&right->be_node, root);
+		kfree(right);
+	}
+
+	return be;
+}
+
+static void
+__ext_tree_insert(struct rb_root *root,
+		struct pnfs_block_extent *new, bool merge_ok)
+{
+	struct rb_node **p = &root->rb_node, *parent = NULL;
+	struct pnfs_block_extent *be;
+
+	while (*p) {
+		parent = *p;
+		be = ext_node(parent);
+
+		if (new->be_f_offset < be->be_f_offset) {
+			if (merge_ok && ext_can_merge(new, be)) {
+				be->be_f_offset = new->be_f_offset;
+				if (be->be_state != PNFS_BLOCK_NONE_DATA)
+					be->be_v_offset = new->be_v_offset;
+				be->be_length += new->be_length;
+				be = ext_try_to_merge_left(root, be);
+				kfree(new);
+				return;
+			}
+			p = &(*p)->rb_left;
+		} else if (new->be_f_offset >= ext_f_end(be)) {
+			if (merge_ok && ext_can_merge(be, new)) {
+				be->be_length += new->be_length;
+				be = ext_try_to_merge_right(root, be);
+				kfree(new);
+				return;
+			}
+			p = &(*p)->rb_right;
+		} else {
+			BUG();
+		}
+	}
+
+	rb_link_node(&new->be_node, parent, p);
+	rb_insert_color(&new->be_node, root);
+}
+
+static int
+__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
+{
+	struct pnfs_block_extent *be;
+	sector_t len1 = 0, len2 = 0;
+	sector_t orig_f_offset;
+	sector_t orig_v_offset;
+	sector_t orig_len;
+
+	be = __ext_tree_search(root, start);
+	if (!be)
+		return 0;
+	if (be->be_f_offset >= end)
+		return 0;
+
+	orig_f_offset = be->be_f_offset;
+	orig_v_offset = be->be_v_offset;
+	orig_len = be->be_length;
+
+	if (start > be->be_f_offset)
+		len1 = start - be->be_f_offset;
+	if (ext_f_end(be) > end)
+		len2 = ext_f_end(be) - end;
+
+	if (len2 > 0) {
+		if (len1 > 0) {
+			struct pnfs_block_extent *new;
+
+			new = kzalloc(sizeof(*new), GFP_ATOMIC);
+			if (!new)
+				return -ENOMEM;
+
+			be->be_length = len1;
+
+			new->be_f_offset = end;
+			if (be->be_state != PNFS_BLOCK_NONE_DATA) {
+				new->be_v_offset =
+					orig_v_offset + orig_len - len2;
+			}
+			new->be_length = len2;
+			new->be_state = be->be_state;
+			new->be_tag = be->be_tag;
+			new->be_mdev = be->be_mdev;
+			memcpy(&new->be_devid, &be->be_devid,
+				sizeof(struct nfs4_deviceid));
+
+			__ext_tree_insert(root, new, true);
+		} else {
+			be->be_f_offset = end;
+			if (be->be_state != PNFS_BLOCK_NONE_DATA) {
+				be->be_v_offset =
+					orig_v_offset + orig_len - len2;
+			}
+			be->be_length = len2;
+		}
+	} else {
+		if (len1 > 0) {
+			be->be_length = len1;
+			be = ext_tree_next(be);
+		}
+
+		while (be && ext_f_end(be) <= end) {
+			struct pnfs_block_extent *next = ext_tree_next(be);
+
+			rb_erase(&be->be_node, root);
+			kfree(be);
+			be = next;
+		}
+
+		if (be && be->be_f_offset < end) {
+			len1 = ext_f_end(be) - end;
+			be->be_f_offset = end;
+			if (be->be_state != PNFS_BLOCK_NONE_DATA)
+				be->be_v_offset += be->be_length - len1;
+			be->be_length = len1;
+		}
+	}
+
+	return 0;
+}
+
+int
+ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
+{
+	struct pnfs_block_extent *be;
+	struct rb_root *root;
+	int err = 0;
+
+	switch (new->be_state) {
+	case PNFS_BLOCK_READWRITE_DATA:
+	case PNFS_BLOCK_INVALID_DATA:
+		root = &bl->bl_ext_rw;
+		break;
+	case PNFS_BLOCK_READ_DATA:
+	case PNFS_BLOCK_NONE_DATA:
+		root = &bl->bl_ext_ro;
+		break;
+	default:
+		dprintk("invalid extent type\n");
+		return -EINVAL;
+	}
+
+	spin_lock(&bl->bl_ext_lock);
+retry:
+	be = __ext_tree_search(root, new->be_f_offset);
+	if (!be || be->be_f_offset >= ext_f_end(new)) {
+		__ext_tree_insert(root, new, true);
+	} else if (new->be_f_offset >= be->be_f_offset) {
+		if (ext_f_end(new) <= ext_f_end(be)) {
+			kfree(new);
+		} else {
+			sector_t new_len = ext_f_end(new) - ext_f_end(be);
+			sector_t diff = new->be_length - new_len;
+
+			new->be_f_offset += diff;
+			new->be_v_offset += diff;
+			new->be_length = new_len;
+			goto retry;
+		}
+	} else if (ext_f_end(new) <= ext_f_end(be)) {
+		new->be_length = be->be_f_offset - new->be_f_offset;
+		__ext_tree_insert(root, new, true);
+	} else {
+		struct pnfs_block_extent *split;
+		sector_t new_len = ext_f_end(new) - ext_f_end(be);
+		sector_t diff = new->be_length - new_len;
+
+		split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
+		if (!split) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		split->be_length = be->be_f_offset - split->be_f_offset;
+		__ext_tree_insert(root, split, true);
+
+		new->be_f_offset += diff;
+		new->be_v_offset += diff;
+		new->be_length = new_len;
+		goto retry;
+	}
+out:
+	spin_unlock(&bl->bl_ext_lock);
+	return err;
+}
+
+static bool
+__ext_tree_lookup(struct rb_root *root, sector_t isect,
+		struct pnfs_block_extent *ret)
+{
+	struct rb_node *node;
+	struct pnfs_block_extent *be;
+
+	node = root->rb_node;
+	while (node) {
+		be = ext_node(node);
+		if (isect < be->be_f_offset)
+			node = node->rb_left;
+		else if (isect >= ext_f_end(be))
+			node = node->rb_right;
+		else {
+			*ret = *be;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+bool
+ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
+	    struct pnfs_block_extent *ret, bool rw)
+{
+	bool found = false;
+
+	spin_lock(&bl->bl_ext_lock);
+	if (!rw)
+		found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
+	if (!found)
+		found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
+	spin_unlock(&bl->bl_ext_lock);
+
+	return found;
+}
+
+int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
+		sector_t start, sector_t end)
+{
+	int err, err2;
+
+	spin_lock(&bl->bl_ext_lock);
+	err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+	if (rw) {
+		err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
+		if (!err)
+			err = err2;
+	}
+	spin_unlock(&bl->bl_ext_lock);
+
+	return err;
+}
+
+static int
+ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
+		sector_t split)
+{
+	struct pnfs_block_extent *new;
+	sector_t orig_len = be->be_length;
+
+	dprintk("%s: need split for 0x%lx:0x%lx at 0x%lx\n",
+		__func__, be->be_f_offset, ext_f_end(be), split);
+
+	new = kzalloc(sizeof(*new), GFP_ATOMIC);
+	if (!new)
+		return -ENOMEM;
+
+	be->be_length = split - be->be_f_offset;
+
+	new->be_f_offset = split;
+	if (be->be_state != PNFS_BLOCK_NONE_DATA)
+		new->be_v_offset = be->be_v_offset + be->be_length;
+	new->be_length = orig_len - be->be_length;
+	new->be_state = be->be_state;
+	new->be_tag = be->be_tag;
+
+	new->be_mdev = be->be_mdev;
+	memcpy(&new->be_devid, &be->be_devid, sizeof(struct nfs4_deviceid));
+
+	dprintk("%s: got 0x%lx:0x%lx!\n",
+		__func__, be->be_f_offset, ext_f_end(be));
+	dprintk("%s: got 0x%lx:0x%lx!\n",
+		__func__, new->be_f_offset, ext_f_end(new));
+
+	__ext_tree_insert(root, new, false);
+	return 0;
+}
+
+int
+ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
+		sector_t len)
+{
+	struct rb_root *root = &bl->bl_ext_rw;
+	sector_t end = start + len;
+	struct pnfs_block_extent *be;
+	int err = 0;
+
+	spin_lock(&bl->bl_ext_lock);
+	/*
+	 * First remove all COW extents or holes from written to range.
+	 */
+	err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+	if (err)
+		goto out;
+
+	/*
+	 * Then mark all invalid extents in the range as written to.
+	 */
+	for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
+		if (be->be_f_offset >= end)
+			break;
+
+		if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
+			continue;
+
+		if (be->be_f_offset < start) {
+			struct pnfs_block_extent *left = ext_tree_prev(be);
+
+			if (left && ext_can_merge(left, be)) {
+				sector_t diff = start - be->be_f_offset;
+
+				left->be_length += diff;
+
+				be->be_f_offset += diff;
+				be->be_v_offset += diff;
+				be->be_length -= diff;
+			} else {
+				err = ext_tree_split(root, be, start);
+				if (err)
+					goto out;
+			}
+		}
+
+		if (ext_f_end(be) > end) {
+			struct pnfs_block_extent *right = ext_tree_next(be);
+
+			if (right && ext_can_merge(be, right)) {
+				sector_t diff = end - be->be_f_offset;
+
+				be->be_length -= diff;
+
+				right->be_f_offset -= diff;
+				right->be_v_offset -= diff;
+				right->be_length += diff;
+			} else {
+				err = ext_tree_split(root, be, end);
+				if (err)
+					goto out;
+			}
+		}
+
+		if (be->be_f_offset >= start && ext_f_end(be) <= end) {
+			be->be_tag = EXTENT_WRITTEN;
+			be = ext_try_to_merge_left(root, be);
+			be = ext_try_to_merge_right(root, be);
+		}
+	}
+out:
+	spin_unlock(&bl->bl_ext_lock);
+	return err;
+}
+
+int
+ext_tree_encode_commit(struct pnfs_block_layout *bl, struct xdr_stream *xdr)
+{
+	struct pnfs_block_extent *be;
+	unsigned int count = 0;
+	__be32 *p, *xdr_start;
+	int ret = 0;
+
+	dprintk("%s enter\n", __func__);
+
+	xdr_start = xdr_reserve_space(xdr, 8);
+	if (!xdr_start)
+		return -ENOSPC;
+
+	spin_lock(&bl->bl_ext_lock);
+	for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
+		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
+		    be->be_tag != EXTENT_WRITTEN)
+			continue;
+
+		p = xdr_reserve_space(xdr, 7 * sizeof(__be32) +
+					   NFS4_DEVICEID4_SIZE);
+		if (!p) {
+			printk("%s: out of space for extent list\n", __func__);
+			ret = -ENOSPC;
+			break;
+		}
+
+		p = xdr_encode_opaque_fixed(p, be->be_devid.data,
+				NFS4_DEVICEID4_SIZE);
+		p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
+		p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
+		p = xdr_encode_hyper(p, 0LL);
+		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
+
+		be->be_tag = EXTENT_COMMITTING;
+		count++;
+	}
+	spin_unlock(&bl->bl_ext_lock);
+
+	xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
+	xdr_start[1] = cpu_to_be32(count);
+
+	dprintk("%s found %i ranges\n", __func__, count);
+	return ret;
+}
+
+void
+ext_tree_mark_committed(struct pnfs_block_layout *bl, int status)
+{
+	struct rb_root *root = &bl->bl_ext_rw;
+	struct pnfs_block_extent *be;
+
+	dprintk("%s status %d\n", __func__, status);
+
+	spin_lock(&bl->bl_ext_lock);
+	for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
+		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
+		    be->be_tag != EXTENT_COMMITTING)
+			continue;
+
+		if (status) {
+			/*
+			 * Mark as written and try again.
+			 *
+			 * XXX: some real error handling here wouldn't hurt..
+			 */
+			be->be_tag = EXTENT_WRITTEN;
+		} else {
+			be->be_state = PNFS_BLOCK_READWRITE_DATA;
+			be->be_tag = 0;
+		}
+
+		be = ext_try_to_merge_left(root, be);
+		be = ext_try_to_merge_right(root, be);
+	}
+	spin_unlock(&bl->bl_ext_lock);
+}
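Taken together, ext_tree_mark_written(), ext_tree_encode_commit() and
ext_tree_mark_committed() implement a small state machine on be_tag for
PNFS_BLOCK_INVALID_DATA extents; a summary of the transitions as implemented
above:

	write completes     ext_tree_mark_written()    be_tag = EXTENT_WRITTEN
	LAYOUTCOMMIT sent   ext_tree_encode_commit()   be_tag = EXTENT_COMMITTING
	commit succeeds     ext_tree_mark_committed()  be_state = PNFS_BLOCK_READWRITE_DATA, be_tag = 0
	commit fails        ext_tree_mark_committed()  be_tag = EXTENT_WRITTEN (retried on the next commit)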
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
deleted file mode 100644
index 4d0161442565..000000000000
--- a/fs/nfs/blocklayout/extents.c
+++ /dev/null
@@ -1,908 +0,0 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayout.h
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include "blocklayout.h"
34#define NFSDBG_FACILITY NFSDBG_PNFS_LD
35
36/* Bit numbers */
37#define EXTENT_INITIALIZED 0
38#define EXTENT_WRITTEN 1
39#define EXTENT_IN_COMMIT 2
40#define INTERNAL_EXISTS MY_MAX_TAGS
41#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1)
42
43/* Returns largest t<=s s.t. t%base==0 */
44static inline sector_t normalize(sector_t s, int base)
45{
46 sector_t tmp = s; /* Since do_div modifies its argument */
47 return s - sector_div(tmp, base);
48}
49
50static inline sector_t normalize_up(sector_t s, int base)
51{
52 return normalize(s + base - 1, base);
53}
54
55/* Complete stub using list while determine API wanted */
56
57/* Returns tags, or negative */
58static int32_t _find_entry(struct my_tree *tree, u64 s)
59{
60 struct pnfs_inval_tracking *pos;
61
62 dprintk("%s(%llu) enter\n", __func__, s);
63 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
64 if (pos->it_sector > s)
65 continue;
66 else if (pos->it_sector == s)
67 return pos->it_tags & INTERNAL_MASK;
68 else
69 break;
70 }
71 return -ENOENT;
72}
73
74static inline
75int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
76{
77 int32_t tags;
78
79 dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
80 s = normalize(s, tree->mtt_step_size);
81 tags = _find_entry(tree, s);
82 if ((tags < 0) || !(tags & (1 << tag)))
83 return 0;
84 else
85 return 1;
86}
87
88/* Creates entry with tag, or if entry already exists, unions tag to it.
89 * If storage is not NULL, newly created entry will use it.
90 * Returns number of entries added, or negative on error.
91 */
92static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
93 struct pnfs_inval_tracking *storage)
94{
95 int found = 0;
96 struct pnfs_inval_tracking *pos;
97
98 dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
99 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
100 if (pos->it_sector > s)
101 continue;
102 else if (pos->it_sector == s) {
103 found = 1;
104 break;
105 } else
106 break;
107 }
108 if (found) {
109 pos->it_tags |= (1 << tag);
110 return 0;
111 } else {
112 struct pnfs_inval_tracking *new;
113 new = storage;
114 new->it_sector = s;
115 new->it_tags = (1 << tag);
116 list_add(&new->it_link, &pos->it_link);
117 return 1;
118 }
119}
120
121/* XXXX Really want option to not create */
122/* Over range, unions tag with existing entries, else creates entry with tag */
123static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
124{
125 u64 i;
126
127 dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
128 for (i = normalize(s, tree->mtt_step_size); i < s + length;
129 i += tree->mtt_step_size)
130 if (_add_entry(tree, i, tag, NULL))
131 return -ENOMEM;
132 return 0;
133}
134
135/* Ensure that future operations on given range of tree will not malloc */
136static int _preload_range(struct pnfs_inval_markings *marks,
137 u64 offset, u64 length)
138{
139 u64 start, end, s;
140 int count, i, used = 0, status = -ENOMEM;
141 struct pnfs_inval_tracking **storage;
142 struct my_tree *tree = &marks->im_tree;
143
144 dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
145 start = normalize(offset, tree->mtt_step_size);
146 end = normalize_up(offset + length, tree->mtt_step_size);
147 count = (int)(end - start) / (int)tree->mtt_step_size;
148
149 /* Pre-malloc what memory we might need */
150 storage = kcalloc(count, sizeof(*storage), GFP_NOFS);
151 if (!storage)
152 return -ENOMEM;
153 for (i = 0; i < count; i++) {
154 storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
155 GFP_NOFS);
156 if (!storage[i])
157 goto out_cleanup;
158 }
159
160 spin_lock_bh(&marks->im_lock);
161 for (s = start; s < end; s += tree->mtt_step_size)
162 used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
163 spin_unlock_bh(&marks->im_lock);
164
165 status = 0;
166
167 out_cleanup:
168 for (i = used; i < count; i++) {
169 if (!storage[i])
170 break;
171 kfree(storage[i]);
172 }
173 kfree(storage);
174 return status;
175}
176
177/* We are relying on page lock to serialize this */
178int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
179{
180 int rv;
181
182 spin_lock_bh(&marks->im_lock);
183 rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
184 spin_unlock_bh(&marks->im_lock);
185 return rv;
186}
187
188/* Assume start, end already sector aligned */
189static int
190_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
191{
192 struct pnfs_inval_tracking *pos;
193 u64 expect = 0;
194
195 dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
196 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
197 if (pos->it_sector >= end)
198 continue;
199 if (!expect) {
200 if ((pos->it_sector == end - tree->mtt_step_size) &&
201 (pos->it_tags & (1 << tag))) {
202 expect = pos->it_sector - tree->mtt_step_size;
203 if (pos->it_sector < tree->mtt_step_size || expect < start)
204 return 1;
205 continue;
206 } else {
207 return 0;
208 }
209 }
210 if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
211 return 0;
212 expect -= tree->mtt_step_size;
213 if (expect < start)
214 return 1;
215 }
216 return 0;
217}
218
219static int is_range_written(struct pnfs_inval_markings *marks,
220 sector_t start, sector_t end)
221{
222 int rv;
223
224 spin_lock_bh(&marks->im_lock);
225 rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
226 spin_unlock_bh(&marks->im_lock);
227 return rv;
228}
229
230/* Marks sectors in [offest, offset_length) as having been initialized.
231 * All lengths are step-aligned, where step is min(pagesize, blocksize).
232 * Currently assumes offset is page-aligned
233 */
234int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
235 sector_t offset, sector_t length)
236{
237 sector_t start, end;
238
239 dprintk("%s(offset=%llu,len=%llu) enter\n",
240 __func__, (u64)offset, (u64)length);
241
242 start = normalize(offset, marks->im_block_size);
243 end = normalize_up(offset + length, marks->im_block_size);
244 if (_preload_range(marks, start, end - start))
245 goto outerr;
246
247 spin_lock_bh(&marks->im_lock);
248 if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
249 goto out_unlock;
250 spin_unlock_bh(&marks->im_lock);
251
252 return 0;
253
254out_unlock:
255 spin_unlock_bh(&marks->im_lock);
256outerr:
257 return -ENOMEM;
258}
259
260/* Marks sectors in [offest, offset+length) as having been written to disk.
261 * All lengths should be block aligned.
262 */
263static int mark_written_sectors(struct pnfs_inval_markings *marks,
264 sector_t offset, sector_t length)
265{
266 int status;
267
268 dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
269 (u64)offset, (u64)length);
270 spin_lock_bh(&marks->im_lock);
271 status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
272 spin_unlock_bh(&marks->im_lock);
273 return status;
274}
275
276static void print_short_extent(struct pnfs_block_short_extent *be)
277{
278 dprintk("PRINT SHORT EXTENT extent %p\n", be);
279 if (be) {
280 dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset);
281 dprintk(" be_length %llu\n", (u64)be->bse_length);
282 }
283}
284
285static void print_clist(struct list_head *list, unsigned int count)
286{
287 struct pnfs_block_short_extent *be;
288 unsigned int i = 0;
289
290 ifdebug(FACILITY) {
291 printk(KERN_DEBUG "****************\n");
292 printk(KERN_DEBUG "Extent list looks like:\n");
293 list_for_each_entry(be, list, bse_node) {
294 i++;
295 print_short_extent(be);
296 }
297 if (i != count)
298 printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
299 printk(KERN_DEBUG "****************\n");
300 }
301}
302
303/* Note: In theory, we should do more checking that devid's match between
304 * old and new, but if they don't, the lists are too corrupt to salvage anyway.
305 */
306/* Note this is very similar to bl_add_merge_extent */
307static void add_to_commitlist(struct pnfs_block_layout *bl,
308 struct pnfs_block_short_extent *new)
309{
310 struct list_head *clist = &bl->bl_commit;
311 struct pnfs_block_short_extent *old, *save;
312 sector_t end = new->bse_f_offset + new->bse_length;
313
314 dprintk("%s enter\n", __func__);
315 print_short_extent(new);
316 print_clist(clist, bl->bl_count);
317 bl->bl_count++;
318 /* Scan for proper place to insert, extending new to the left
319 * as much as possible.
320 */
321 list_for_each_entry_safe(old, save, clist, bse_node) {
322 if (new->bse_f_offset < old->bse_f_offset)
323 break;
324 if (end <= old->bse_f_offset + old->bse_length) {
325 /* Range is already in list */
326 bl->bl_count--;
327 kfree(new);
328 return;
329 } else if (new->bse_f_offset <=
330 old->bse_f_offset + old->bse_length) {
331 /* new overlaps or abuts existing be */
332 if (new->bse_mdev == old->bse_mdev) {
333 /* extend new to fully replace old */
334 new->bse_length += new->bse_f_offset -
335 old->bse_f_offset;
336 new->bse_f_offset = old->bse_f_offset;
337 list_del(&old->bse_node);
338 bl->bl_count--;
339 kfree(old);
340 }
341 }
342 }
343 /* Note that if we never hit the above break, old will not point to a
344 * valid extent. However, in that case &old->bse_node==list.
345 */
346 list_add_tail(&new->bse_node, &old->bse_node);
347 /* Scan forward for overlaps. If we find any, extend new and
348 * remove the overlapped extent.
349 */
350 old = list_prepare_entry(new, clist, bse_node);
351 list_for_each_entry_safe_continue(old, save, clist, bse_node) {
352 if (end < old->bse_f_offset)
353 break;
354 /* new overlaps or abuts old */
355 if (new->bse_mdev == old->bse_mdev) {
356 if (end < old->bse_f_offset + old->bse_length) {
357 /* extend new to fully cover old */
358 end = old->bse_f_offset + old->bse_length;
359 new->bse_length = end - new->bse_f_offset;
360 }
361 list_del(&old->bse_node);
362 bl->bl_count--;
363 kfree(old);
364 }
365 }
366 dprintk("%s: after merging\n", __func__);
367 print_clist(clist, bl->bl_count);
368}
369
370/* Note the range described by offset, length is guaranteed to be contained
371 * within be.
372 * new will be freed, either by this function or add_to_commitlist if they
373 * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
374 */
375int bl_mark_for_commit(struct pnfs_block_extent *be,
376 sector_t offset, sector_t length,
377 struct pnfs_block_short_extent *new)
378{
379 sector_t new_end, end = offset + length;
380 struct pnfs_block_layout *bl = container_of(be->be_inval,
381 struct pnfs_block_layout,
382 bl_inval);
383
384 mark_written_sectors(be->be_inval, offset, length);
385 /* We want to add the range to commit list, but it must be
386 * block-normalized, and verified that the normalized range has
387 * been entirely written to disk.
388 */
389 new->bse_f_offset = offset;
390 offset = normalize(offset, bl->bl_blocksize);
391 if (offset < new->bse_f_offset) {
392 if (is_range_written(be->be_inval, offset, new->bse_f_offset))
393 new->bse_f_offset = offset;
394 else
395 new->bse_f_offset = offset + bl->bl_blocksize;
396 }
397 new_end = normalize_up(end, bl->bl_blocksize);
398 if (end < new_end) {
399 if (is_range_written(be->be_inval, end, new_end))
400 end = new_end;
401 else
402 end = new_end - bl->bl_blocksize;
403 }
404 if (end <= new->bse_f_offset) {
405 kfree(new);
406 return 0;
407 }
408 new->bse_length = end - new->bse_f_offset;
409 new->bse_devid = be->be_devid;
410 new->bse_mdev = be->be_mdev;
411
412 spin_lock(&bl->bl_ext_lock);
413 add_to_commitlist(bl, new);
414 spin_unlock(&bl->bl_ext_lock);
415 return 0;
416}
417
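/*
 * Illustrative sketch (not from the patch): the rounding helpers that
 * bl_mark_for_commit relies on, modelled in userspace under the
 * assumption that normalize() rounds a sector down to a blocksize
 * boundary and normalize_up() rounds it up (blocksize need not be a
 * power of two, hence plain modulo).  Names carry a trailing underscore
 * because they are invented stand-ins for the kernel helpers.
 *
 * Worked example with blocksize 8: a write covering sectors [10, 21)
 * normalizes to [8, 24); each partial edge block is kept only if
 * is_range_written() confirms it, otherwise that edge is pulled inward
 * (to 16 on the left, 16 on the right), possibly leaving nothing to
 * commit.
 */
typedef unsigned long long sector_t_;

static sector_t_ normalize_(sector_t_ s, sector_t_ blocksize)
{
	return s - (s % blocksize);			/* round down */
}

static sector_t_ normalize_up_(sector_t_ s, sector_t_ blocksize)
{
	return normalize_(s + blocksize - 1, blocksize);	/* round up */
}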
418static void print_bl_extent(struct pnfs_block_extent *be)
419{
420 dprintk("PRINT EXTENT extent %p\n", be);
421 if (be) {
422 dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset);
423 dprintk(" be_length %llu\n", (u64)be->be_length);
424 dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset);
425 dprintk(" be_state %d\n", be->be_state);
426 }
427}
428
429static void
430destroy_extent(struct kref *kref)
431{
432 struct pnfs_block_extent *be;
433
434 be = container_of(kref, struct pnfs_block_extent, be_refcnt);
435 dprintk("%s be=%p\n", __func__, be);
436 kfree(be);
437}
438
439void
440bl_put_extent(struct pnfs_block_extent *be)
441{
442 if (be) {
443 dprintk("%s enter %p (%i)\n", __func__, be,
444 atomic_read(&be->be_refcnt.refcount));
445 kref_put(&be->be_refcnt, destroy_extent);
446 }
447}
448
449struct pnfs_block_extent *bl_alloc_extent(void)
450{
451 struct pnfs_block_extent *be;
452
453 be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
454 if (!be)
455 return NULL;
456 INIT_LIST_HEAD(&be->be_node);
457 kref_init(&be->be_refcnt);
458 be->be_inval = NULL;
459 return be;
460}
461
462static void print_elist(struct list_head *list)
463{
464 struct pnfs_block_extent *be;
465 dprintk("****************\n");
466 dprintk("Extent list looks like:\n");
467 list_for_each_entry(be, list, be_node) {
468 print_bl_extent(be);
469 }
470 dprintk("****************\n");
471}
472
473static inline int
474extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
475{
476 /* Note this assumes new->be_f_offset >= old->be_f_offset */
477 return (new->be_state == old->be_state) &&
478 ((new->be_state == PNFS_BLOCK_NONE_DATA) ||
479 ((new->be_v_offset - old->be_v_offset ==
480 new->be_f_offset - old->be_f_offset) &&
481 new->be_mdev == old->be_mdev));
482}
483
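/*
 * Illustrative note (not from the patch): extents_consistent() accepts
 * two extents only when their file->disk mappings lie on the same line,
 * i.e. be_v_offset - be_f_offset is one shared constant (NONE_DATA
 * extents have no disk mapping, so only the state must match).  A
 * stand-alone predicate with made-up parameter names:
 */
static int mappings_colinear(unsigned long long old_f, unsigned long long old_v,
			     unsigned long long new_f, unsigned long long new_v)
{
	/* e.g. old (f=0, v=100) and new (f=8, v=108) are colinear;
	 * old (f=0, v=100) and new (f=8, v=200) are not. */
	return new_v - old_v == new_f - old_f;
}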
484/* Adds new to the appropriate list in bl, modifying new and removing
485 * existing extents as needed to deal with overlaps.
486 *
487 * See bl_find_get_extent for list constraints.
488 *
489 * Refcount on new is already set. If we end up not using it, or if we
490 * error out, we must put the reference.
491 *
492 * bl->bl_ext_lock is held by caller.
493 */
494int
495bl_add_merge_extent(struct pnfs_block_layout *bl,
496 struct pnfs_block_extent *new)
497{
498 struct pnfs_block_extent *be, *tmp;
499 sector_t end = new->be_f_offset + new->be_length;
500 struct list_head *list;
501
502 dprintk("%s enter with be=%p\n", __func__, new);
503 print_bl_extent(new);
504 list = &bl->bl_extents[bl_choose_list(new->be_state)];
505 print_elist(list);
506
507 /* Scan for proper place to insert, extending new to the left
508 * as much as possible.
509 */
510 list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
511 if (new->be_f_offset >= be->be_f_offset + be->be_length)
512 break;
513 if (new->be_f_offset >= be->be_f_offset) {
514 if (end <= be->be_f_offset + be->be_length) {
515 /* new is a subset of existing be */
516 if (extents_consistent(be, new)) {
517 dprintk("%s: new is subset, ignoring\n",
518 __func__);
519 bl_put_extent(new);
520 return 0;
521 } else {
522 goto out_err;
523 }
524 } else {
525 /* |<-- be -->|
526 * |<-- new -->| */
527 if (extents_consistent(be, new)) {
528 /* extend new to fully replace be */
529 new->be_length += new->be_f_offset -
530 be->be_f_offset;
531 new->be_f_offset = be->be_f_offset;
532 new->be_v_offset = be->be_v_offset;
533 dprintk("%s: removing %p\n", __func__, be);
534 list_del(&be->be_node);
535 bl_put_extent(be);
536 } else {
537 goto out_err;
538 }
539 }
540 } else if (end >= be->be_f_offset + be->be_length) {
541 /* new extent overlaps existing be */
542 if (extents_consistent(be, new)) {
543 /* extend new to fully replace be */
544 dprintk("%s: removing %p\n", __func__, be);
545 list_del(&be->be_node);
546 bl_put_extent(be);
547 } else {
548 goto out_err;
549 }
550 } else if (end > be->be_f_offset) {
551 /* |<-- be -->|
552 *|<-- new -->| */
553 if (extents_consistent(new, be)) {
554 /* extend new to fully replace be */
555 new->be_length += be->be_f_offset + be->be_length -
556 new->be_f_offset - new->be_length;
557 dprintk("%s: removing %p\n", __func__, be);
558 list_del(&be->be_node);
559 bl_put_extent(be);
560 } else {
561 goto out_err;
562 }
563 }
564 }
565 /* Note that if we never hit the above break, be will not point to a
566 * valid extent. However, in that case &be->be_node==list.
567 */
568 list_add(&new->be_node, &be->be_node);
569 dprintk("%s: inserting new\n", __func__);
570 print_elist(list);
571 /* FIXME - The per-list consistency checks have all been done,
572 * should now check cross-list consistency.
573 */
574 return 0;
575
576 out_err:
577 bl_put_extent(new);
578 return -EIO;
579}
580
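/*
 * Illustrative sketch (not from the patch): the actions taken by the
 * reverse scan in bl_add_merge_extent above, as a stand-alone classifier
 * over the coordinates alone.  Names are invented; the consistency check
 * and the list surgery are left out.
 */
enum merge_action {
	PAST,		/* be ends at or before new: stop scanning */
	DISJOINT,	/* be starts after new ends: keep scanning left */
	DROP_NEW,	/* new is a subset of be */
	GROW_LEFT,	/* be covers new's left edge: pull new's start back */
	DROP_OLD,	/* be is a subset of new */
	GROW_RIGHT,	/* new covers be's left edge: push new's end out */
};

static enum merge_action classify(unsigned long long bf, unsigned long long blen,
				  unsigned long long nf, unsigned long long nlen)
{
	unsigned long long bend = bf + blen, nend = nf + nlen;

	if (nf >= bend)
		return PAST;
	if (nend <= bf)
		return DISJOINT;
	if (nf >= bf)
		return nend <= bend ? DROP_NEW : GROW_LEFT;
	return nend >= bend ? DROP_OLD : GROW_RIGHT;
}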
581/* Returns extent, or NULL. If a second READ extent exists, it is returned
582 * in cow_read, if given.
583 *
584 * The extents are kept in two separate ordered lists, one for READ and NONE,
585 * one for READWRITE and INVALID. Within each list, we assume:
586 * 1. Extents are ordered by file offset.
587 * 2. For any given isect, there is at most one extent that matches.
588 */
589struct pnfs_block_extent *
590bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
591 struct pnfs_block_extent **cow_read)
592{
593 struct pnfs_block_extent *be, *cow, *ret;
594 int i;
595
596 dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
597 cow = ret = NULL;
598 spin_lock(&bl->bl_ext_lock);
599 for (i = 0; i < EXTENT_LISTS; i++) {
600 list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
601 if (isect >= be->be_f_offset + be->be_length)
602 break;
603 if (isect >= be->be_f_offset) {
604 /* We have found an extent */
605 dprintk("%s Get %p (%i)\n", __func__, be,
606 atomic_read(&be->be_refcnt.refcount));
607 kref_get(&be->be_refcnt);
608 if (!ret)
609 ret = be;
610 else if (be->be_state != PNFS_BLOCK_READ_DATA)
611 bl_put_extent(be);
612 else
613 cow = be;
614 break;
615 }
616 }
617 if (ret &&
618 (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
619 break;
620 }
621 spin_unlock(&bl->bl_ext_lock);
622 if (cow_read)
623 *cow_read = cow;
624 print_bl_extent(ret);
625 return ret;
626}
627
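/*
 * Illustrative sketch (not from the patch): the containment test behind
 * both reverse scans above, over a sorted array instead of the kernel
 * lists (types invented).  Because extents within a list are sorted and
 * non-overlapping, the scan can stop as soon as one ends at or before
 * isect.
 */
struct ext { unsigned long long f_offset, length; };

static int find_containing(const struct ext *e, int n, unsigned long long isect)
{
	int i;

	for (i = n - 1; i >= 0; i--) {
		if (isect >= e[i].f_offset + e[i].length)
			break;	/* everything further left ends even earlier */
		if (isect >= e[i].f_offset)
			return i;	/* found the (unique) match */
	}
	return -1;
}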
628/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */
629static struct pnfs_block_extent *
630bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
631{
632 struct pnfs_block_extent *be, *ret = NULL;
633 int i;
634
635 dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
636 for (i = 0; i < EXTENT_LISTS; i++) {
637 if (ret)
638 break;
639 list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
640 if (isect >= be->be_f_offset + be->be_length)
641 break;
642 if (isect >= be->be_f_offset) {
643 /* We have found an extent */
644 dprintk("%s Get %p (%i)\n", __func__, be,
645 atomic_read(&be->be_refcnt.refcount));
646 kref_get(&be->be_refcnt);
647 ret = be;
648 break;
649 }
650 }
651 }
652 print_bl_extent(ret);
653 return ret;
654}
655
656int
657encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
658 struct xdr_stream *xdr,
659 const struct nfs4_layoutcommit_args *arg)
660{
661 struct pnfs_block_short_extent *lce, *save;
662 unsigned int count = 0;
663 __be32 *p, *xdr_start;
664
665 dprintk("%s enter\n", __func__);
666 /* BUG - creation of bl_commit is buggy - need to wait for
667 * entire block to be marked WRITTEN before it can be added.
668 */
669 spin_lock(&bl->bl_ext_lock);
670 /* TODO: adjust the committed ranges for a possible truncate */
671 /* TODO: trim the ranges to the argument range */
672
673 /* XDR encode the ranges found */
674 xdr_start = xdr_reserve_space(xdr, 8);
675 if (!xdr_start)
676 goto out;
677 list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
678 p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data));
679 if (!p)
680 break;
681 p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE);
682 p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT);
683 p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
684 p = xdr_encode_hyper(p, 0LL);
685 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
686 list_move_tail(&lce->bse_node, &bl->bl_committing);
687 bl->bl_count--;
688 count++;
689 }
690 xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
691 xdr_start[1] = cpu_to_be32(count);
692out:
693 spin_unlock(&bl->bl_ext_lock);
694 dprintk("%s found %i ranges\n", __func__, count);
695 return 0;
696}
697
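/*
 * Illustrative note (not from the patch): per-range wire layout produced
 * above (the block layout's layoutupdate commit list, cf. RFC 5663).
 * The 7 * 4 bytes reserved alongside the deviceid break down as:
 *
 *	deviceid	16 bytes  opaque fixed (NFS4_DEVICEID4_SIZE)
 *	file offset	 8 bytes  hyper, bse_f_offset << SECTOR_SHIFT
 *	length		 8 bytes  hyper, bse_length << SECTOR_SHIFT
 *	storage offset	 8 bytes  hyper, always 0 here
 *	state		 4 bytes  uint32, PNFS_BLOCK_READWRITE_DATA
 *
 * with the total opaque byte count and the entry count patched into the
 * two words at xdr_start once the loop has finished.
 */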
698/* Helper for set_to_rw that initializes a new extent */
699static void
700_prep_new_extent(struct pnfs_block_extent *new,
701 struct pnfs_block_extent *orig,
702 sector_t offset, sector_t length, int state)
703{
704 kref_init(&new->be_refcnt);
705 /* don't need to INIT_LIST_HEAD(&new->be_node) */
706 memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
707 new->be_mdev = orig->be_mdev;
708 new->be_f_offset = offset;
709 new->be_length = length;
710 new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
711 new->be_state = state;
712 new->be_inval = orig->be_inval;
713}
714
715/* Tries to merge be with the extent that precedes it in the list.
716 * Frees storage if it goes unused.
717 */
718static struct pnfs_block_extent *
719_front_merge(struct pnfs_block_extent *be, struct list_head *head,
720 struct pnfs_block_extent *storage)
721{
722 struct pnfs_block_extent *prev;
723
724 if (!storage)
725 goto no_merge;
726 if (&be->be_node == head || be->be_node.prev == head)
727 goto no_merge;
728 prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
729 if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
730 !extents_consistent(prev, be))
731 goto no_merge;
732 _prep_new_extent(storage, prev, prev->be_f_offset,
733 prev->be_length + be->be_length, prev->be_state);
734 list_replace(&prev->be_node, &storage->be_node);
735 bl_put_extent(prev);
736 list_del(&be->be_node);
737 bl_put_extent(be);
738 return storage;
739
740 no_merge:
741 kfree(storage);
742 return be;
743}
744
745static u64
746set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
747{
748 u64 rv = offset + length;
749 struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
750 struct pnfs_block_extent *children[3];
751 struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
752 int i = 0, j;
753
754 dprintk("%s(%llu, %llu)\n", __func__, offset, length);
755 /* Create storage for up to three new extents e1, e2, e3 */
756 e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
757 e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
758 e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
759 /* if any allocation failed, fall back to not splitting at all */
760 if (!e1 || !e2 || !e3)
761 goto out_nosplit;
762
763 spin_lock(&bl->bl_ext_lock);
764 be = bl_find_get_extent_locked(bl, offset);
	if (!be) {
		/* paranoia: offset stems from a committed extent, so the
		 * lookup should never miss; bail out without splitting */
		spin_unlock(&bl->bl_ext_lock);
		goto out_nosplit;
	}
765 rv = be->be_f_offset + be->be_length;
766 if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
767 spin_unlock(&bl->bl_ext_lock);
768 goto out_nosplit;
769 }
770 /* Add e* to children, bumping e*'s krefs */
771 if (be->be_f_offset != offset) {
772 _prep_new_extent(e1, be, be->be_f_offset,
773 offset - be->be_f_offset,
774 PNFS_BLOCK_INVALID_DATA);
775 children[i++] = e1;
776 print_bl_extent(e1);
777 } else
778 merge1 = e1;
779 _prep_new_extent(e2, be, offset,
780 min(length, be->be_f_offset + be->be_length - offset),
781 PNFS_BLOCK_READWRITE_DATA);
782 children[i++] = e2;
783 print_bl_extent(e2);
784 if (offset + length < be->be_f_offset + be->be_length) {
785 _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
786 be->be_f_offset + be->be_length -
787 offset - length,
788 PNFS_BLOCK_INVALID_DATA);
789 children[i++] = e3;
790 print_bl_extent(e3);
791 } else
792 merge2 = e3;
793
794 /* Remove be from list, and insert the e* */
795 /* We don't take extra refs on e*; membership of this list is the
796 * base reference, established at init time.
797 */
798 if (i < 3)
799 children[i] = NULL;
800 new = children[0];
801 list_replace(&be->be_node, &new->be_node);
802 bl_put_extent(be);
803 new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
804 for (j = 1; j < i; j++) {
805 old = new;
806 new = children[j];
807 list_add(&new->be_node, &old->be_node);
808 }
809 if (merge2) {
810 /* This is a HACK, should just create a _back_merge function */
811 new = list_entry(new->be_node.next,
812 struct pnfs_block_extent, be_node);
813 new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
814 }
815 spin_unlock(&bl->bl_ext_lock);
816
817 /* Since we removed the base reference above, be is now scheduled for
818 * destruction.
819 */
820 bl_put_extent(be);
821 dprintk("%s returns %llu after split\n", __func__, rv);
822 return rv;
823
824 out_nosplit:
825 kfree(e1);
826 kfree(e2);
827 kfree(e3);
828 dprintk("%s returns %llu without splitting\n", __func__, rv);
829 return rv;
830}
831
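/*
 * Illustrative sketch (not from the patch): the arithmetic behind
 * set_to_rw's three-way split, modelled stand-alone with invented names.
 * An INVALID extent covering [bf, bf + blen) is carved around the
 * committed range [off, off + len), yielding up to three pieces; off is
 * assumed to lie inside the extent, as the caller guarantees.
 */
struct piece { unsigned long long f, len; int rw; };

static int split3(unsigned long long bf, unsigned long long blen,
		  unsigned long long off, unsigned long long len,
		  struct piece out[3])
{
	unsigned long long bend = bf + blen;
	int n = 0;

	if (bf != off)		/* leading piece stays INVALID (e1) */
		out[n++] = (struct piece){ bf, off - bf, 0 };
	/* middle piece becomes READWRITE (e2), clipped to the extent */
	out[n++] = (struct piece){ off,
		    len < bend - off ? len : bend - off, 1 };
	if (off + len < bend)	/* trailing piece stays INVALID (e3) */
		out[n++] = (struct piece){ off + len, bend - off - len, 0 };
	return n;
}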
832void
833clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
834 const struct nfs4_layoutcommit_args *arg,
835 int status)
836{
837 struct pnfs_block_short_extent *lce, *save;
838
839 dprintk("%s status %d\n", __func__, status);
840 list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) {
841 if (likely(!status)) {
842 u64 offset = lce->bse_f_offset;
843 u64 end = offset + lce->bse_length;
844
845 do {
846 offset = set_to_rw(bl, offset, end - offset);
847 } while (offset < end);
848 list_del(&lce->bse_node);
849
850 kfree(lce);
851 } else {
852 list_del(&lce->bse_node);
853 spin_lock(&bl->bl_ext_lock);
854 add_to_commitlist(bl, lce);
855 spin_unlock(&bl->bl_ext_lock);
856 }
857 }
858}
859
860int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
861{
862 struct pnfs_block_short_extent *new;
863
864 new = kmalloc(sizeof(*new), GFP_NOFS);
865 if (unlikely(!new))
866 return -ENOMEM;
867
868 spin_lock_bh(&marks->im_lock);
869 list_add(&new->bse_node, &marks->im_extents);
870 spin_unlock_bh(&marks->im_lock);
871
872 return 0;
873}
874
875struct pnfs_block_short_extent *
876bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
877{
878 struct pnfs_block_short_extent *rv = NULL;
879
880 spin_lock_bh(&marks->im_lock);
881 if (!list_empty(&marks->im_extents)) {
882 rv = list_first_entry(&marks->im_extents,
883 struct pnfs_block_short_extent, bse_node);
884 list_del_init(&rv->bse_node);
885 }
886 spin_unlock_bh(&marks->im_lock);
887
888 return rv;
889}
890
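/*
 * Illustrative note (not from the patch): together with
 * bl_push_one_short_extent above, this implements a simple preallocation
 * pool: entries are pushed with GFP_NOFS while sleeping is still
 * allowed, then popped later in contexts that must not allocate.
 * Hypothetical calling pattern:
 *
 *	if (bl_push_one_short_extent(marks))	// may sleep, may fail
 *		return -ENOMEM;
 *	...
 *	new = bl_pop_one_short_extent(marks);	// never sleeps
 *	...
 *	bl_free_short_extents(marks, 1);	// drop any unused entries
 */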
891void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
892{
893 struct pnfs_block_short_extent *se = NULL, *tmp;
894
895 if (num_to_free <= 0)
896 return;
897
898 spin_lock_bh(&marks->im_lock);
899 list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
900 list_del(&se->bse_node);
901 kfree(se);
902 if (--num_to_free == 0)
903 break;
904 }
905 spin_unlock_bh(&marks->im_lock);
906
907 BUG_ON(num_to_free > 0);
908}