diff options
author | Peng Tao <bergwolf@gmail.com> | 2011-07-30 20:52:56 -0400 |
---|---|---|
committer | Trond Myklebust <Trond.Myklebust@netapp.com> | 2011-07-31 12:18:17 -0400 |
commit | 71cdd40fd498f12679070def668f6a4719ddbd1c (patch) | |
tree | d9f41109c962fd2a54e16217ce8abc37c7e99918 /fs/nfs | |
parent | 31e6306a4046926b598484f1cacf69309382eac6 (diff) |
pnfsblock: write_pagelist handle zero invalid extents
For invalid extents, find other pages in the same fsblock and write them out.
[pnfsblock: write_begin]
Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Benny Halevy <bhalevy@tonian.com>
Signed-off-by: Peng Tao <peng_tao@emc.com>
Signed-off-by: Jim Rees <rees@umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Diffstat (limited to 'fs/nfs')
-rw-r--r-- | fs/nfs/blocklayout/blocklayout.c | 275 |
1 files changed, 233 insertions, 42 deletions
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 21efef7c2fd..e56564d2ef9 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/mount.h> | 35 | #include <linux/mount.h> |
36 | #include <linux/namei.h> | 36 | #include <linux/namei.h> |
37 | #include <linux/bio.h> /* struct bio */ | 37 | #include <linux/bio.h> /* struct bio */ |
38 | #include <linux/buffer_head.h> /* various write calls */ | ||
38 | 39 | ||
39 | #include "blocklayout.h" | 40 | #include "blocklayout.h" |
40 | 41 | ||
@@ -79,12 +80,8 @@ static int is_hole(struct pnfs_block_extent *be, sector_t isect) | |||
79 | */ | 80 | */ |
80 | static int is_writable(struct pnfs_block_extent *be, sector_t isect) | 81 | static int is_writable(struct pnfs_block_extent *be, sector_t isect) |
81 | { | 82 | { |
82 | if (be->be_state == PNFS_BLOCK_READWRITE_DATA) | 83 | return (be->be_state == PNFS_BLOCK_READWRITE_DATA || |
83 | return 1; | 84 | be->be_state == PNFS_BLOCK_INVALID_DATA); |
84 | else if (be->be_state != PNFS_BLOCK_INVALID_DATA) | ||
85 | return 0; | ||
86 | else | ||
87 | return bl_is_sector_init(be->be_inval, isect); | ||
88 | } | 85 | } |
89 | 86 | ||
90 | /* The data we are handed might be spread across several bios. We need | 87 | /* The data we are handed might be spread across several bios. We need |
@@ -353,6 +350,31 @@ static void mark_extents_written(struct pnfs_block_layout *bl, | |||
353 | } | 350 | } |
354 | } | 351 | } |
355 | 352 | ||
353 | static void bl_end_io_write_zero(struct bio *bio, int err) | ||
354 | { | ||
355 | struct parallel_io *par = bio->bi_private; | ||
356 | const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
357 | struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; | ||
358 | struct nfs_write_data *wdata = (struct nfs_write_data *)par->data; | ||
359 | |||
360 | do { | ||
361 | struct page *page = bvec->bv_page; | ||
362 | |||
363 | if (--bvec >= bio->bi_io_vec) | ||
364 | prefetchw(&bvec->bv_page->flags); | ||
365 | /* This is the zeroing page we added */ | ||
366 | end_page_writeback(page); | ||
367 | page_cache_release(page); | ||
368 | } while (bvec >= bio->bi_io_vec); | ||
369 | if (!uptodate) { | ||
370 | if (!wdata->pnfs_error) | ||
371 | wdata->pnfs_error = -EIO; | ||
372 | bl_set_lo_fail(wdata->lseg); | ||
373 | } | ||
374 | bio_put(bio); | ||
375 | put_parallel(par); | ||
376 | } | ||
377 | |||
356 | /* This is basically copied from mpage_end_io_read */ | 378 | /* This is basically copied from mpage_end_io_read */ |
357 | static void bl_end_io_write(struct bio *bio, int err) | 379 | static void bl_end_io_write(struct bio *bio, int err) |
358 | { | 380 | { |
@@ -379,11 +401,8 @@ static void bl_write_cleanup(struct work_struct *work) | |||
379 | dprintk("%s enter\n", __func__); | 401 | dprintk("%s enter\n", __func__); |
380 | task = container_of(work, struct rpc_task, u.tk_work); | 402 | task = container_of(work, struct rpc_task, u.tk_work); |
381 | wdata = container_of(task, struct nfs_write_data, task); | 403 | wdata = container_of(task, struct nfs_write_data, task); |
382 | if (!wdata->task.tk_status) { | 404 | if (!wdata->pnfs_error) { |
383 | /* Marks for LAYOUTCOMMIT */ | 405 | /* Marks for LAYOUTCOMMIT */ |
384 | /* BUG - this should be called after each bio, not after | ||
385 | * all finish, unless have some way of storing success/failure | ||
386 | */ | ||
387 | mark_extents_written(BLK_LSEG2EXT(wdata->lseg), | 406 | mark_extents_written(BLK_LSEG2EXT(wdata->lseg), |
388 | wdata->args.offset, wdata->args.count); | 407 | wdata->args.offset, wdata->args.count); |
389 | } | 408 | } |
@@ -391,38 +410,110 @@ static void bl_write_cleanup(struct work_struct *work) | |||
391 | } | 410 | } |
392 | 411 | ||
393 | /* Called when last of bios associated with a bl_write_pagelist call finishes */ | 412 | /* Called when last of bios associated with a bl_write_pagelist call finishes */ |
394 | static void | 413 | static void bl_end_par_io_write(void *data) |
395 | bl_end_par_io_write(void *data) | ||
396 | { | 414 | { |
397 | struct nfs_write_data *wdata = data; | 415 | struct nfs_write_data *wdata = data; |
398 | 416 | ||
399 | /* STUB - ignoring error handling */ | ||
400 | wdata->task.tk_status = 0; | 417 | wdata->task.tk_status = 0; |
401 | wdata->verf.committed = NFS_FILE_SYNC; | 418 | wdata->verf.committed = NFS_FILE_SYNC; |
402 | INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); | 419 | INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); |
403 | schedule_work(&wdata->task.u.tk_work); | 420 | schedule_work(&wdata->task.u.tk_work); |
404 | } | 421 | } |
405 | 422 | ||
423 | /* FIXME STUB - mark intersection of layout and page as bad, so is not | ||
424 | * used again. | ||
425 | */ | ||
426 | static void mark_bad_read(void) | ||
427 | { | ||
428 | return; | ||
429 | } | ||
430 | |||
431 | /* | ||
432 | * map_block: map a requested I/0 block (isect) into an offset in the LVM | ||
433 | * block_device | ||
434 | */ | ||
435 | static void | ||
436 | map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be) | ||
437 | { | ||
438 | dprintk("%s enter be=%p\n", __func__, be); | ||
439 | |||
440 | set_buffer_mapped(bh); | ||
441 | bh->b_bdev = be->be_mdev; | ||
442 | bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >> | ||
443 | (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT); | ||
444 | |||
445 | dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n", | ||
446 | __func__, (unsigned long long)isect, (long)bh->b_blocknr, | ||
447 | bh->b_size); | ||
448 | return; | ||
449 | } | ||
450 | |||
451 | /* Given an unmapped page, zero it or read in page for COW, page is locked | ||
452 | * by caller. | ||
453 | */ | ||
454 | static int | ||
455 | init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read) | ||
456 | { | ||
457 | struct buffer_head *bh = NULL; | ||
458 | int ret = 0; | ||
459 | sector_t isect; | ||
460 | |||
461 | dprintk("%s enter, %p\n", __func__, page); | ||
462 | BUG_ON(PageUptodate(page)); | ||
463 | if (!cow_read) { | ||
464 | zero_user_segment(page, 0, PAGE_SIZE); | ||
465 | SetPageUptodate(page); | ||
466 | goto cleanup; | ||
467 | } | ||
468 | |||
469 | bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0); | ||
470 | if (!bh) { | ||
471 | ret = -ENOMEM; | ||
472 | goto cleanup; | ||
473 | } | ||
474 | |||
475 | isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT; | ||
476 | map_block(bh, isect, cow_read); | ||
477 | if (!bh_uptodate_or_lock(bh)) | ||
478 | ret = bh_submit_read(bh); | ||
479 | if (ret) | ||
480 | goto cleanup; | ||
481 | SetPageUptodate(page); | ||
482 | |||
483 | cleanup: | ||
484 | bl_put_extent(cow_read); | ||
485 | if (bh) | ||
486 | free_buffer_head(bh); | ||
487 | if (ret) { | ||
488 | /* Need to mark layout with bad read...should now | ||
489 | * just use nfs4 for reads and writes. | ||
490 | */ | ||
491 | mark_bad_read(); | ||
492 | } | ||
493 | return ret; | ||
494 | } | ||
495 | |||
406 | static enum pnfs_try_status | 496 | static enum pnfs_try_status |
407 | bl_write_pagelist(struct nfs_write_data *wdata, int sync) | 497 | bl_write_pagelist(struct nfs_write_data *wdata, int sync) |
408 | { | 498 | { |
409 | int i; | 499 | int i, ret, npg_zero, pg_index, last = 0; |
410 | struct bio *bio = NULL; | 500 | struct bio *bio = NULL; |
411 | struct pnfs_block_extent *be = NULL; | 501 | struct pnfs_block_extent *be = NULL, *cow_read = NULL; |
412 | sector_t isect, extent_length = 0; | 502 | sector_t isect, last_isect = 0, extent_length = 0; |
413 | struct parallel_io *par; | 503 | struct parallel_io *par; |
414 | loff_t offset = wdata->args.offset; | 504 | loff_t offset = wdata->args.offset; |
415 | size_t count = wdata->args.count; | 505 | size_t count = wdata->args.count; |
416 | struct page **pages = wdata->args.pages; | 506 | struct page **pages = wdata->args.pages; |
417 | int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; | 507 | struct page *page; |
508 | pgoff_t index; | ||
509 | u64 temp; | ||
510 | int npg_per_block = | ||
511 | NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT; | ||
418 | 512 | ||
419 | dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); | 513 | dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); |
420 | /* At this point, wdata->pages is a (sequential) list of nfs_pages. | 514 | /* At this point, wdata->pages is a (sequential) list of nfs_pages. |
421 | * We want to write each, and if there is an error remove it from | 515 | * We want to write each, and if there is an error set pnfs_error |
422 | * list and call | 516 | * to have it redone using nfs. |
423 | * nfs_retry_request(req) to have it redone using nfs. | ||
424 | * QUEST? Do as block or per req? Think have to do per block | ||
425 | * as part of end_bio | ||
426 | */ | 517 | */ |
427 | par = alloc_parallel(wdata); | 518 | par = alloc_parallel(wdata); |
428 | if (!par) | 519 | if (!par) |
@@ -433,7 +524,91 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) | |||
433 | /* At this point, have to be more careful with error handling */ | 524 | /* At this point, have to be more careful with error handling */ |
434 | 525 | ||
435 | isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); | 526 | isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); |
436 | for (i = pg_index; i < wdata->npages ; i++) { | 527 | be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read); |
528 | if (!be || !is_writable(be, isect)) { | ||
529 | dprintk("%s no matching extents!\n", __func__); | ||
530 | wdata->pnfs_error = -EINVAL; | ||
531 | goto out; | ||
532 | } | ||
533 | |||
534 | /* First page inside INVALID extent */ | ||
535 | if (be->be_state == PNFS_BLOCK_INVALID_DATA) { | ||
536 | temp = offset >> PAGE_CACHE_SHIFT; | ||
537 | npg_zero = do_div(temp, npg_per_block); | ||
538 | isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) & | ||
539 | (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); | ||
540 | extent_length = be->be_length - (isect - be->be_f_offset); | ||
541 | |||
542 | fill_invalid_ext: | ||
543 | dprintk("%s need to zero %d pages\n", __func__, npg_zero); | ||
544 | for (;npg_zero > 0; npg_zero--) { | ||
545 | /* page ref released in bl_end_io_write_zero */ | ||
546 | index = isect >> PAGE_CACHE_SECTOR_SHIFT; | ||
547 | dprintk("%s zero %dth page: index %lu isect %llu\n", | ||
548 | __func__, npg_zero, index, | ||
549 | (unsigned long long)isect); | ||
550 | page = | ||
551 | find_or_create_page(wdata->inode->i_mapping, index, | ||
552 | GFP_NOFS); | ||
553 | if (!page) { | ||
554 | dprintk("%s oom\n", __func__); | ||
555 | wdata->pnfs_error = -ENOMEM; | ||
556 | goto out; | ||
557 | } | ||
558 | |||
559 | /* PageDirty: Other will write this out | ||
560 | * PageWriteback: Other is writing this out | ||
561 | * PageUptodate: It was read before | ||
562 | * sector_initialized: already written out | ||
563 | */ | ||
564 | if (PageDirty(page) || PageWriteback(page) || | ||
565 | bl_is_sector_init(be->be_inval, isect)) { | ||
566 | print_page(page); | ||
567 | unlock_page(page); | ||
568 | page_cache_release(page); | ||
569 | goto next_page; | ||
570 | } | ||
571 | if (!PageUptodate(page)) { | ||
572 | /* New page, readin or zero it */ | ||
573 | init_page_for_write(page, cow_read); | ||
574 | } | ||
575 | set_page_writeback(page); | ||
576 | unlock_page(page); | ||
577 | |||
578 | ret = bl_mark_sectors_init(be->be_inval, isect, | ||
579 | PAGE_CACHE_SECTORS, | ||
580 | NULL); | ||
581 | if (unlikely(ret)) { | ||
582 | dprintk("%s bl_mark_sectors_init fail %d\n", | ||
583 | __func__, ret); | ||
584 | end_page_writeback(page); | ||
585 | page_cache_release(page); | ||
586 | wdata->pnfs_error = ret; | ||
587 | goto out; | ||
588 | } | ||
589 | bio = bl_add_page_to_bio(bio, npg_zero, WRITE, | ||
590 | isect, page, be, | ||
591 | bl_end_io_write_zero, par); | ||
592 | if (IS_ERR(bio)) { | ||
593 | wdata->pnfs_error = PTR_ERR(bio); | ||
594 | goto out; | ||
595 | } | ||
596 | /* FIXME: This should be done in bi_end_io */ | ||
597 | mark_extents_written(BLK_LSEG2EXT(wdata->lseg), | ||
598 | page->index << PAGE_CACHE_SHIFT, | ||
599 | PAGE_CACHE_SIZE); | ||
600 | next_page: | ||
601 | isect += PAGE_CACHE_SECTORS; | ||
602 | extent_length -= PAGE_CACHE_SECTORS; | ||
603 | } | ||
604 | if (last) | ||
605 | goto write_done; | ||
606 | } | ||
607 | bio = bl_submit_bio(WRITE, bio); | ||
608 | |||
609 | /* Middle pages */ | ||
610 | pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; | ||
611 | for (i = pg_index; i < wdata->npages; i++) { | ||
437 | if (!extent_length) { | 612 | if (!extent_length) { |
438 | /* We've used up the previous extent */ | 613 | /* We've used up the previous extent */ |
439 | bl_put_extent(be); | 614 | bl_put_extent(be); |
@@ -442,35 +617,51 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) | |||
442 | be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), | 617 | be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), |
443 | isect, NULL); | 618 | isect, NULL); |
444 | if (!be || !is_writable(be, isect)) { | 619 | if (!be || !is_writable(be, isect)) { |
445 | wdata->pnfs_error = -ENOMEM; | 620 | wdata->pnfs_error = -EINVAL; |
446 | goto out; | 621 | goto out; |
447 | } | 622 | } |
448 | extent_length = be->be_length - | 623 | extent_length = be->be_length - |
449 | (isect - be->be_f_offset); | 624 | (isect - be->be_f_offset); |
450 | } | 625 | } |
451 | for (;;) { | 626 | if (be->be_state == PNFS_BLOCK_INVALID_DATA) { |
452 | if (!bio) { | 627 | ret = bl_mark_sectors_init(be->be_inval, isect, |
453 | bio = bio_alloc(GFP_NOIO, wdata->npages - i); | 628 | PAGE_CACHE_SECTORS, |
454 | if (!bio) { | 629 | NULL); |
455 | wdata->pnfs_error = -ENOMEM; | 630 | if (unlikely(ret)) { |
456 | goto out; | 631 | dprintk("%s bl_mark_sectors_init fail %d\n", |
457 | } | 632 | __func__, ret); |
458 | bio->bi_sector = isect - be->be_f_offset + | 633 | wdata->pnfs_error = ret; |
459 | be->be_v_offset; | 634 | goto out; |
460 | bio->bi_bdev = be->be_mdev; | ||
461 | bio->bi_end_io = bl_end_io_write; | ||
462 | bio->bi_private = par; | ||
463 | } | 635 | } |
464 | if (bio_add_page(bio, pages[i], PAGE_SIZE, 0)) | 636 | } |
465 | break; | 637 | bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE, |
466 | bio = bl_submit_bio(WRITE, bio); | 638 | isect, pages[i], be, |
639 | bl_end_io_write, par); | ||
640 | if (IS_ERR(bio)) { | ||
641 | wdata->pnfs_error = PTR_ERR(bio); | ||
642 | goto out; | ||
467 | } | 643 | } |
468 | isect += PAGE_CACHE_SECTORS; | 644 | isect += PAGE_CACHE_SECTORS; |
645 | last_isect = isect; | ||
469 | extent_length -= PAGE_CACHE_SECTORS; | 646 | extent_length -= PAGE_CACHE_SECTORS; |
470 | } | 647 | } |
471 | wdata->res.count = (isect << SECTOR_SHIFT) - (offset); | 648 | |
472 | if (count < wdata->res.count) | 649 | /* Last page inside INVALID extent */ |
650 | if (be->be_state == PNFS_BLOCK_INVALID_DATA) { | ||
651 | bio = bl_submit_bio(WRITE, bio); | ||
652 | temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT; | ||
653 | npg_zero = npg_per_block - do_div(temp, npg_per_block); | ||
654 | if (npg_zero < npg_per_block) { | ||
655 | last = 1; | ||
656 | goto fill_invalid_ext; | ||
657 | } | ||
658 | } | ||
659 | |||
660 | write_done: | ||
661 | wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset); | ||
662 | if (count < wdata->res.count) { | ||
473 | wdata->res.count = count; | 663 | wdata->res.count = count; |
664 | } | ||
474 | out: | 665 | out: |
475 | bl_put_extent(be); | 666 | bl_put_extent(be); |
476 | bl_submit_bio(WRITE, bio); | 667 | bl_submit_bio(WRITE, bio); |