about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Peng Tao <bergwolf@gmail.com> 2011-07-30 20:52:56 -0400
committer: Trond Myklebust <Trond.Myklebust@netapp.com> 2011-07-31 12:18:17 -0400
commit: 71cdd40fd498f12679070def668f6a4719ddbd1c (patch)
tree: d9f41109c962fd2a54e16217ce8abc37c7e99918
parent: 31e6306a4046926b598484f1cacf69309382eac6 (diff)
pnfsblock: write_pagelist handle zero invalid extents
For invalid extents, find other pages in the same fsblock and write them out.

[pnfsblock: write_begin]
Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Benny Halevy <bhalevy@tonian.com>
Signed-off-by: Peng Tao <peng_tao@emc.com>
Signed-off-by: Jim Rees <rees@umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
-rw-r--r-- fs/nfs/blocklayout/blocklayout.c 275
1 files changed, 233 insertions, 42 deletions
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 21efef7c2fd2..e56564d2ef95 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -35,6 +35,7 @@
35#include <linux/mount.h> 35#include <linux/mount.h>
36#include <linux/namei.h> 36#include <linux/namei.h>
37#include <linux/bio.h> /* struct bio */ 37#include <linux/bio.h> /* struct bio */
38#include <linux/buffer_head.h> /* various write calls */
38 39
39#include "blocklayout.h" 40#include "blocklayout.h"
40 41
@@ -79,12 +80,8 @@ static int is_hole(struct pnfs_block_extent *be, sector_t isect)
79 */ 80 */
80static int is_writable(struct pnfs_block_extent *be, sector_t isect) 81static int is_writable(struct pnfs_block_extent *be, sector_t isect)
81{ 82{
82 if (be->be_state == PNFS_BLOCK_READWRITE_DATA) 83 return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
83 return 1; 84 be->be_state == PNFS_BLOCK_INVALID_DATA);
84 else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
85 return 0;
86 else
87 return bl_is_sector_init(be->be_inval, isect);
88} 85}
89 86
90/* The data we are handed might be spread across several bios. We need 87/* The data we are handed might be spread across several bios. We need
@@ -353,6 +350,31 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
353 } 350 }
354} 351}
355 352
353static void bl_end_io_write_zero(struct bio *bio, int err)
354{
355 struct parallel_io *par = bio->bi_private;
356 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
357 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
358 struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
359
360 do {
361 struct page *page = bvec->bv_page;
362
363 if (--bvec >= bio->bi_io_vec)
364 prefetchw(&bvec->bv_page->flags);
365 /* This is the zeroing page we added */
366 end_page_writeback(page);
367 page_cache_release(page);
368 } while (bvec >= bio->bi_io_vec);
369 if (!uptodate) {
370 if (!wdata->pnfs_error)
371 wdata->pnfs_error = -EIO;
372 bl_set_lo_fail(wdata->lseg);
373 }
374 bio_put(bio);
375 put_parallel(par);
376}
377
356/* This is basically copied from mpage_end_io_read */ 378/* This is basically copied from mpage_end_io_read */
357static void bl_end_io_write(struct bio *bio, int err) 379static void bl_end_io_write(struct bio *bio, int err)
358{ 380{
@@ -379,11 +401,8 @@ static void bl_write_cleanup(struct work_struct *work)
379 dprintk("%s enter\n", __func__); 401 dprintk("%s enter\n", __func__);
380 task = container_of(work, struct rpc_task, u.tk_work); 402 task = container_of(work, struct rpc_task, u.tk_work);
381 wdata = container_of(task, struct nfs_write_data, task); 403 wdata = container_of(task, struct nfs_write_data, task);
382 if (!wdata->task.tk_status) { 404 if (!wdata->pnfs_error) {
383 /* Marks for LAYOUTCOMMIT */ 405 /* Marks for LAYOUTCOMMIT */
384 /* BUG - this should be called after each bio, not after
385 * all finish, unless have some way of storing success/failure
386 */
387 mark_extents_written(BLK_LSEG2EXT(wdata->lseg), 406 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
388 wdata->args.offset, wdata->args.count); 407 wdata->args.offset, wdata->args.count);
389 } 408 }
@@ -391,38 +410,110 @@ static void bl_write_cleanup(struct work_struct *work)
391} 410}
392 411
393/* Called when last of bios associated with a bl_write_pagelist call finishes */ 412/* Called when last of bios associated with a bl_write_pagelist call finishes */
394static void 413static void bl_end_par_io_write(void *data)
395bl_end_par_io_write(void *data)
396{ 414{
397 struct nfs_write_data *wdata = data; 415 struct nfs_write_data *wdata = data;
398 416
399 /* STUB - ignoring error handling */
400 wdata->task.tk_status = 0; 417 wdata->task.tk_status = 0;
401 wdata->verf.committed = NFS_FILE_SYNC; 418 wdata->verf.committed = NFS_FILE_SYNC;
402 INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); 419 INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
403 schedule_work(&wdata->task.u.tk_work); 420 schedule_work(&wdata->task.u.tk_work);
404} 421}
405 422
423/* FIXME STUB - mark intersection of layout and page as bad, so is not
424 * used again.
425 */
426static void mark_bad_read(void)
427{
428 return;
429}
430
431/*
432 * map_block: map a requested I/0 block (isect) into an offset in the LVM
433 * block_device
434 */
435static void
436map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
437{
438 dprintk("%s enter be=%p\n", __func__, be);
439
440 set_buffer_mapped(bh);
441 bh->b_bdev = be->be_mdev;
442 bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
443 (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
444
445 dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
446 __func__, (unsigned long long)isect, (long)bh->b_blocknr,
447 bh->b_size);
448 return;
449}
450
451/* Given an unmapped page, zero it or read in page for COW, page is locked
452 * by caller.
453 */
454static int
455init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
456{
457 struct buffer_head *bh = NULL;
458 int ret = 0;
459 sector_t isect;
460
461 dprintk("%s enter, %p\n", __func__, page);
462 BUG_ON(PageUptodate(page));
463 if (!cow_read) {
464 zero_user_segment(page, 0, PAGE_SIZE);
465 SetPageUptodate(page);
466 goto cleanup;
467 }
468
469 bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
470 if (!bh) {
471 ret = -ENOMEM;
472 goto cleanup;
473 }
474
475 isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
476 map_block(bh, isect, cow_read);
477 if (!bh_uptodate_or_lock(bh))
478 ret = bh_submit_read(bh);
479 if (ret)
480 goto cleanup;
481 SetPageUptodate(page);
482
483cleanup:
484 bl_put_extent(cow_read);
485 if (bh)
486 free_buffer_head(bh);
487 if (ret) {
488 /* Need to mark layout with bad read...should now
489 * just use nfs4 for reads and writes.
490 */
491 mark_bad_read();
492 }
493 return ret;
494}
495
406static enum pnfs_try_status 496static enum pnfs_try_status
407bl_write_pagelist(struct nfs_write_data *wdata, int sync) 497bl_write_pagelist(struct nfs_write_data *wdata, int sync)
408{ 498{
409 int i; 499 int i, ret, npg_zero, pg_index, last = 0;
410 struct bio *bio = NULL; 500 struct bio *bio = NULL;
411 struct pnfs_block_extent *be = NULL; 501 struct pnfs_block_extent *be = NULL, *cow_read = NULL;
412 sector_t isect, extent_length = 0; 502 sector_t isect, last_isect = 0, extent_length = 0;
413 struct parallel_io *par; 503 struct parallel_io *par;
414 loff_t offset = wdata->args.offset; 504 loff_t offset = wdata->args.offset;
415 size_t count = wdata->args.count; 505 size_t count = wdata->args.count;
416 struct page **pages = wdata->args.pages; 506 struct page **pages = wdata->args.pages;
417 int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; 507 struct page *page;
508 pgoff_t index;
509 u64 temp;
510 int npg_per_block =
511 NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
418 512
419 dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); 513 dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
420 /* At this point, wdata->pages is a (sequential) list of nfs_pages. 514 /* At this point, wdata->pages is a (sequential) list of nfs_pages.
421 * We want to write each, and if there is an error remove it from 515 * We want to write each, and if there is an error set pnfs_error
422 * list and call 516 * to have it redone using nfs.
423 * nfs_retry_request(req) to have it redone using nfs.
424 * QUEST? Do as block or per req? Think have to do per block
425 * as part of end_bio
426 */ 517 */
427 par = alloc_parallel(wdata); 518 par = alloc_parallel(wdata);
428 if (!par) 519 if (!par)
@@ -433,7 +524,91 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
433 /* At this point, have to be more careful with error handling */ 524 /* At this point, have to be more careful with error handling */
434 525
435 isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); 526 isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
436 for (i = pg_index; i < wdata->npages ; i++) { 527 be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
528 if (!be || !is_writable(be, isect)) {
529 dprintk("%s no matching extents!\n", __func__);
530 wdata->pnfs_error = -EINVAL;
531 goto out;
532 }
533
534 /* First page inside INVALID extent */
535 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
536 temp = offset >> PAGE_CACHE_SHIFT;
537 npg_zero = do_div(temp, npg_per_block);
538 isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
539 (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
540 extent_length = be->be_length - (isect - be->be_f_offset);
541
542fill_invalid_ext:
543 dprintk("%s need to zero %d pages\n", __func__, npg_zero);
544 for (;npg_zero > 0; npg_zero--) {
545 /* page ref released in bl_end_io_write_zero */
546 index = isect >> PAGE_CACHE_SECTOR_SHIFT;
547 dprintk("%s zero %dth page: index %lu isect %llu\n",
548 __func__, npg_zero, index,
549 (unsigned long long)isect);
550 page =
551 find_or_create_page(wdata->inode->i_mapping, index,
552 GFP_NOFS);
553 if (!page) {
554 dprintk("%s oom\n", __func__);
555 wdata->pnfs_error = -ENOMEM;
556 goto out;
557 }
558
559 /* PageDirty: Other will write this out
560 * PageWriteback: Other is writing this out
561 * PageUptodate: It was read before
562 * sector_initialized: already written out
563 */
564 if (PageDirty(page) || PageWriteback(page) ||
565 bl_is_sector_init(be->be_inval, isect)) {
566 print_page(page);
567 unlock_page(page);
568 page_cache_release(page);
569 goto next_page;
570 }
571 if (!PageUptodate(page)) {
572 /* New page, readin or zero it */
573 init_page_for_write(page, cow_read);
574 }
575 set_page_writeback(page);
576 unlock_page(page);
577
578 ret = bl_mark_sectors_init(be->be_inval, isect,
579 PAGE_CACHE_SECTORS,
580 NULL);
581 if (unlikely(ret)) {
582 dprintk("%s bl_mark_sectors_init fail %d\n",
583 __func__, ret);
584 end_page_writeback(page);
585 page_cache_release(page);
586 wdata->pnfs_error = ret;
587 goto out;
588 }
589 bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
590 isect, page, be,
591 bl_end_io_write_zero, par);
592 if (IS_ERR(bio)) {
593 wdata->pnfs_error = PTR_ERR(bio);
594 goto out;
595 }
596 /* FIXME: This should be done in bi_end_io */
597 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
598 page->index << PAGE_CACHE_SHIFT,
599 PAGE_CACHE_SIZE);
600next_page:
601 isect += PAGE_CACHE_SECTORS;
602 extent_length -= PAGE_CACHE_SECTORS;
603 }
604 if (last)
605 goto write_done;
606 }
607 bio = bl_submit_bio(WRITE, bio);
608
609 /* Middle pages */
610 pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
611 for (i = pg_index; i < wdata->npages; i++) {
437 if (!extent_length) { 612 if (!extent_length) {
438 /* We've used up the previous extent */ 613 /* We've used up the previous extent */
439 bl_put_extent(be); 614 bl_put_extent(be);
@@ -442,35 +617,51 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
442 be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), 617 be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg),
443 isect, NULL); 618 isect, NULL);
444 if (!be || !is_writable(be, isect)) { 619 if (!be || !is_writable(be, isect)) {
445 wdata->pnfs_error = -ENOMEM; 620 wdata->pnfs_error = -EINVAL;
446 goto out; 621 goto out;
447 } 622 }
448 extent_length = be->be_length - 623 extent_length = be->be_length -
449 (isect - be->be_f_offset); 624 (isect - be->be_f_offset);
450 } 625 }
451 for (;;) { 626 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
452 if (!bio) { 627 ret = bl_mark_sectors_init(be->be_inval, isect,
453 bio = bio_alloc(GFP_NOIO, wdata->npages - i); 628 PAGE_CACHE_SECTORS,
454 if (!bio) { 629 NULL);
455 wdata->pnfs_error = -ENOMEM; 630 if (unlikely(ret)) {
456 goto out; 631 dprintk("%s bl_mark_sectors_init fail %d\n",
457 } 632 __func__, ret);
458 bio->bi_sector = isect - be->be_f_offset + 633 wdata->pnfs_error = ret;
459 be->be_v_offset; 634 goto out;
460 bio->bi_bdev = be->be_mdev;
461 bio->bi_end_io = bl_end_io_write;
462 bio->bi_private = par;
463 } 635 }
464 if (bio_add_page(bio, pages[i], PAGE_SIZE, 0)) 636 }
465 break; 637 bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE,
466 bio = bl_submit_bio(WRITE, bio); 638 isect, pages[i], be,
639 bl_end_io_write, par);
640 if (IS_ERR(bio)) {
641 wdata->pnfs_error = PTR_ERR(bio);
642 goto out;
467 } 643 }
468 isect += PAGE_CACHE_SECTORS; 644 isect += PAGE_CACHE_SECTORS;
645 last_isect = isect;
469 extent_length -= PAGE_CACHE_SECTORS; 646 extent_length -= PAGE_CACHE_SECTORS;
470 } 647 }
471 wdata->res.count = (isect << SECTOR_SHIFT) - (offset); 648
472 if (count < wdata->res.count) 649 /* Last page inside INVALID extent */
650 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
651 bio = bl_submit_bio(WRITE, bio);
652 temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
653 npg_zero = npg_per_block - do_div(temp, npg_per_block);
654 if (npg_zero < npg_per_block) {
655 last = 1;
656 goto fill_invalid_ext;
657 }
658 }
659
660write_done:
661 wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
662 if (count < wdata->res.count) {
473 wdata->res.count = count; 663 wdata->res.count = count;
664 }
474out: 665out:
475 bl_put_extent(be); 666 bl_put_extent(be);
476 bl_submit_bio(WRITE, bio); 667 bl_submit_bio(WRITE, bio);