diff options
author | Boaz Harrosh <bharrosh@panasas.com> | 2011-05-22 12:52:19 -0400 |
---|---|---|
committer | Boaz Harrosh <bharrosh@panasas.com> | 2011-05-29 13:54:15 -0400 |
commit | 04f83450388e87d86b387cf4a27b81eb7e69de7d (patch) | |
tree | 99c10d6a995ed1e7b872abb0127fc38d6bef9982 | |
parent | d20581aa4be11407c9eeeb75992df5ef176bba0f (diff) |
pnfs-obj: osd raid engine read/write implementation
With the use of the in-kernel osd library, implement read/write
of data from/to osd-objects according to the information specified
in the objects-layout.
Support for striping over mirrors with a received stripe_unit.
There are, however, a few constraints on the layouts that are supported:
1. Stripe Unit must be a multiple of PAGE_SIZE
2. stripe length (stripe_unit * number_of_stripes) cannot be
bigger than 32 bits.
Also support raid-groups and partial-layout. Partial-layout is
when not all the groups are received on the line, addressing
only a partial range of the file.
TODO:
Only raid0! raid 4/5/6 support will come at a later stage.
An unsupported layout will send IO through the MDS
[Important fallout from the last rebase]
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
[gfp_flags]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
-rw-r--r-- | fs/nfs/objlayout/objio_osd.c | 605 | ||||
-rw-r--r-- | fs/nfs/objlayout/objlayout.c | 254 | ||||
-rw-r--r-- | fs/nfs/objlayout/objlayout.h | 42 |
3 files changed, 901 insertions, 0 deletions
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 353821f7937b..cc92d3b3dc3a 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c | |||
@@ -46,6 +46,10 @@ | |||
46 | 46 | ||
47 | #define _LLU(x) ((unsigned long long)x) | 47 | #define _LLU(x) ((unsigned long long)x) |
48 | 48 | ||
49 | enum { BIO_MAX_PAGES_KMALLOC = | ||
50 | (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), | ||
51 | }; | ||
52 | |||
49 | struct objio_dev_ent { | 53 | struct objio_dev_ent { |
50 | struct nfs4_deviceid_node id_node; | 54 | struct nfs4_deviceid_node id_node; |
51 | struct osd_dev *od; | 55 | struct osd_dev *od; |
@@ -136,6 +140,31 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg) | |||
136 | return container_of(lseg, struct objio_segment, lseg); | 140 | return container_of(lseg, struct objio_segment, lseg); |
137 | } | 141 | } |
138 | 142 | ||
143 | struct objio_state; | ||
144 | typedef ssize_t (*objio_done_fn)(struct objio_state *ios); | ||
145 | |||
146 | struct objio_state { | ||
147 | /* Generic layer */ | ||
148 | struct objlayout_io_state ol_state; | ||
149 | |||
150 | struct objio_segment *layout; | ||
151 | |||
152 | struct kref kref; | ||
153 | objio_done_fn done; | ||
154 | void *private; | ||
155 | |||
156 | unsigned long length; | ||
157 | unsigned numdevs; /* Actually used devs in this IO */ | ||
158 | /* A per-device variable array of size numdevs */ | ||
159 | struct _objio_per_comp { | ||
160 | struct bio *bio; | ||
161 | struct osd_request *or; | ||
162 | unsigned long length; | ||
163 | u64 offset; | ||
164 | unsigned dev; | ||
165 | } per_dev[]; | ||
166 | }; | ||
167 | |||
139 | /* Send and wait for a get_device_info of devices in the layout, | 168 | /* Send and wait for a get_device_info of devices in the layout, |
140 | then look them up with the osd_initiator library */ | 169 | then look them up with the osd_initiator library */ |
141 | static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, | 170 | static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, |
@@ -359,6 +388,578 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg) | |||
359 | kfree(objio_seg); | 388 | kfree(objio_seg); |
360 | } | 389 | } |
361 | 390 | ||
391 | int objio_alloc_io_state(struct pnfs_layout_segment *lseg, | ||
392 | struct objlayout_io_state **outp, | ||
393 | gfp_t gfp_flags) | ||
394 | { | ||
395 | struct objio_segment *objio_seg = OBJIO_LSEG(lseg); | ||
396 | struct objio_state *ios; | ||
397 | const unsigned first_size = sizeof(*ios) + | ||
398 | objio_seg->num_comps * sizeof(ios->per_dev[0]); | ||
399 | |||
400 | ios = kzalloc(first_size, gfp_flags); | ||
401 | if (unlikely(!ios)) | ||
402 | return -ENOMEM; | ||
403 | |||
404 | ios->layout = objio_seg; | ||
405 | |||
406 | *outp = &ios->ol_state; | ||
407 | return 0; | ||
408 | } | ||
409 | |||
410 | void objio_free_io_state(struct objlayout_io_state *ol_state) | ||
411 | { | ||
412 | struct objio_state *ios = container_of(ol_state, struct objio_state, | ||
413 | ol_state); | ||
414 | |||
415 | kfree(ios); | ||
416 | } | ||
417 | |||
418 | static void _clear_bio(struct bio *bio) | ||
419 | { | ||
420 | struct bio_vec *bv; | ||
421 | unsigned i; | ||
422 | |||
423 | __bio_for_each_segment(bv, bio, i, 0) { | ||
424 | unsigned this_count = bv->bv_len; | ||
425 | |||
426 | if (likely(PAGE_SIZE == this_count)) | ||
427 | clear_highpage(bv->bv_page); | ||
428 | else | ||
429 | zero_user(bv->bv_page, bv->bv_offset, this_count); | ||
430 | } | ||
431 | } | ||
432 | |||
433 | static int _io_check(struct objio_state *ios, bool is_write) | ||
434 | { | ||
435 | enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; | ||
436 | int lin_ret = 0; | ||
437 | int i; | ||
438 | |||
439 | for (i = 0; i < ios->numdevs; i++) { | ||
440 | struct osd_sense_info osi; | ||
441 | struct osd_request *or = ios->per_dev[i].or; | ||
442 | unsigned dev; | ||
443 | int ret; | ||
444 | |||
445 | if (!or) | ||
446 | continue; | ||
447 | |||
448 | ret = osd_req_decode_sense(or, &osi); | ||
449 | if (likely(!ret)) | ||
450 | continue; | ||
451 | |||
452 | if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { | ||
453 | /* start read offset passed endof file */ | ||
454 | BUG_ON(is_write); | ||
455 | _clear_bio(ios->per_dev[i].bio); | ||
456 | dprintk("%s: start read offset passed end of file " | ||
457 | "offset=0x%llx, length=0x%lx\n", __func__, | ||
458 | _LLU(ios->per_dev[i].offset), | ||
459 | ios->per_dev[i].length); | ||
460 | |||
461 | continue; /* we recovered */ | ||
462 | } | ||
463 | dev = ios->per_dev[i].dev; | ||
464 | |||
465 | if (osi.osd_err_pri >= oep) { | ||
466 | oep = osi.osd_err_pri; | ||
467 | lin_ret = ret; | ||
468 | } | ||
469 | } | ||
470 | |||
471 | return lin_ret; | ||
472 | } | ||
473 | |||
474 | /* | ||
475 | * Common IO state helpers. | ||
476 | */ | ||
477 | static void _io_free(struct objio_state *ios) | ||
478 | { | ||
479 | unsigned i; | ||
480 | |||
481 | for (i = 0; i < ios->numdevs; i++) { | ||
482 | struct _objio_per_comp *per_dev = &ios->per_dev[i]; | ||
483 | |||
484 | if (per_dev->or) { | ||
485 | osd_end_request(per_dev->or); | ||
486 | per_dev->or = NULL; | ||
487 | } | ||
488 | |||
489 | if (per_dev->bio) { | ||
490 | bio_put(per_dev->bio); | ||
491 | per_dev->bio = NULL; | ||
492 | } | ||
493 | } | ||
494 | } | ||
495 | |||
496 | struct osd_dev *_io_od(struct objio_state *ios, unsigned dev) | ||
497 | { | ||
498 | unsigned min_dev = ios->layout->comps_index; | ||
499 | unsigned max_dev = min_dev + ios->layout->num_comps; | ||
500 | |||
501 | BUG_ON(dev < min_dev || max_dev <= dev); | ||
502 | return ios->layout->ods[dev - min_dev]->od; | ||
503 | } | ||
504 | |||
505 | struct _striping_info { | ||
506 | u64 obj_offset; | ||
507 | u64 group_length; | ||
508 | unsigned dev; | ||
509 | unsigned unit_off; | ||
510 | }; | ||
511 | |||
512 | static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, | ||
513 | struct _striping_info *si) | ||
514 | { | ||
515 | u32 stripe_unit = ios->layout->stripe_unit; | ||
516 | u32 group_width = ios->layout->group_width; | ||
517 | u64 group_depth = ios->layout->group_depth; | ||
518 | u32 U = stripe_unit * group_width; | ||
519 | |||
520 | u64 T = U * group_depth; | ||
521 | u64 S = T * ios->layout->group_count; | ||
522 | u64 M = div64_u64(file_offset, S); | ||
523 | |||
524 | /* | ||
525 | G = (L - (M * S)) / T | ||
526 | H = (L - (M * S)) % T | ||
527 | */ | ||
528 | u64 LmodU = file_offset - M * S; | ||
529 | u32 G = div64_u64(LmodU, T); | ||
530 | u64 H = LmodU - G * T; | ||
531 | |||
532 | u32 N = div_u64(H, U); | ||
533 | |||
534 | div_u64_rem(file_offset, stripe_unit, &si->unit_off); | ||
535 | si->obj_offset = si->unit_off + (N * stripe_unit) + | ||
536 | (M * group_depth * stripe_unit); | ||
537 | |||
538 | /* "H - (N * U)" is just "H % U" so it's bound to u32 */ | ||
539 | si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; | ||
540 | si->dev *= ios->layout->mirrors_p1; | ||
541 | |||
542 | si->group_length = T - H; | ||
543 | } | ||
544 | |||
545 | static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, | ||
546 | unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len, | ||
547 | gfp_t gfp_flags) | ||
548 | { | ||
549 | unsigned pg = *cur_pg; | ||
550 | struct request_queue *q = | ||
551 | osd_request_queue(_io_od(ios, per_dev->dev)); | ||
552 | |||
553 | per_dev->length += cur_len; | ||
554 | |||
555 | if (per_dev->bio == NULL) { | ||
556 | unsigned stripes = ios->layout->num_comps / | ||
557 | ios->layout->mirrors_p1; | ||
558 | unsigned pages_in_stripe = stripes * | ||
559 | (ios->layout->stripe_unit / PAGE_SIZE); | ||
560 | unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / | ||
561 | stripes; | ||
562 | |||
563 | if (BIO_MAX_PAGES_KMALLOC < bio_size) | ||
564 | bio_size = BIO_MAX_PAGES_KMALLOC; | ||
565 | |||
566 | per_dev->bio = bio_kmalloc(gfp_flags, bio_size); | ||
567 | if (unlikely(!per_dev->bio)) { | ||
568 | dprintk("Faild to allocate BIO size=%u\n", bio_size); | ||
569 | return -ENOMEM; | ||
570 | } | ||
571 | } | ||
572 | |||
573 | while (cur_len > 0) { | ||
574 | unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); | ||
575 | unsigned added_len; | ||
576 | |||
577 | BUG_ON(ios->ol_state.nr_pages <= pg); | ||
578 | cur_len -= pglen; | ||
579 | |||
580 | added_len = bio_add_pc_page(q, per_dev->bio, | ||
581 | ios->ol_state.pages[pg], pglen, pgbase); | ||
582 | if (unlikely(pglen != added_len)) | ||
583 | return -ENOMEM; | ||
584 | pgbase = 0; | ||
585 | ++pg; | ||
586 | } | ||
587 | BUG_ON(cur_len); | ||
588 | |||
589 | *cur_pg = pg; | ||
590 | return 0; | ||
591 | } | ||
592 | |||
593 | static int _prepare_one_group(struct objio_state *ios, u64 length, | ||
594 | struct _striping_info *si, unsigned *last_pg, | ||
595 | gfp_t gfp_flags) | ||
596 | { | ||
597 | unsigned stripe_unit = ios->layout->stripe_unit; | ||
598 | unsigned mirrors_p1 = ios->layout->mirrors_p1; | ||
599 | unsigned devs_in_group = ios->layout->group_width * mirrors_p1; | ||
600 | unsigned dev = si->dev; | ||
601 | unsigned first_dev = dev - (dev % devs_in_group); | ||
602 | unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; | ||
603 | unsigned cur_pg = *last_pg; | ||
604 | int ret = 0; | ||
605 | |||
606 | while (length) { | ||
607 | struct _objio_per_comp *per_dev = &ios->per_dev[dev]; | ||
608 | unsigned cur_len, page_off = 0; | ||
609 | |||
610 | if (!per_dev->length) { | ||
611 | per_dev->dev = dev; | ||
612 | if (dev < si->dev) { | ||
613 | per_dev->offset = si->obj_offset + stripe_unit - | ||
614 | si->unit_off; | ||
615 | cur_len = stripe_unit; | ||
616 | } else if (dev == si->dev) { | ||
617 | per_dev->offset = si->obj_offset; | ||
618 | cur_len = stripe_unit - si->unit_off; | ||
619 | page_off = si->unit_off & ~PAGE_MASK; | ||
620 | BUG_ON(page_off && | ||
621 | (page_off != ios->ol_state.pgbase)); | ||
622 | } else { /* dev > si->dev */ | ||
623 | per_dev->offset = si->obj_offset - si->unit_off; | ||
624 | cur_len = stripe_unit; | ||
625 | } | ||
626 | |||
627 | if (max_comp < dev) | ||
628 | max_comp = dev; | ||
629 | } else { | ||
630 | cur_len = stripe_unit; | ||
631 | } | ||
632 | if (cur_len >= length) | ||
633 | cur_len = length; | ||
634 | |||
635 | ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, | ||
636 | cur_len, gfp_flags); | ||
637 | if (unlikely(ret)) | ||
638 | goto out; | ||
639 | |||
640 | dev += mirrors_p1; | ||
641 | dev = (dev % devs_in_group) + first_dev; | ||
642 | |||
643 | length -= cur_len; | ||
644 | ios->length += cur_len; | ||
645 | } | ||
646 | out: | ||
647 | ios->numdevs = max_comp + mirrors_p1; | ||
648 | *last_pg = cur_pg; | ||
649 | return ret; | ||
650 | } | ||
651 | |||
652 | static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags) | ||
653 | { | ||
654 | u64 length = ios->ol_state.count; | ||
655 | u64 offset = ios->ol_state.offset; | ||
656 | struct _striping_info si; | ||
657 | unsigned last_pg = 0; | ||
658 | int ret = 0; | ||
659 | |||
660 | while (length) { | ||
661 | _calc_stripe_info(ios, offset, &si); | ||
662 | |||
663 | if (length < si.group_length) | ||
664 | si.group_length = length; | ||
665 | |||
666 | ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags); | ||
667 | if (unlikely(ret)) | ||
668 | goto out; | ||
669 | |||
670 | offset += si.group_length; | ||
671 | length -= si.group_length; | ||
672 | } | ||
673 | |||
674 | out: | ||
675 | if (!ios->length) | ||
676 | return ret; | ||
677 | |||
678 | return 0; | ||
679 | } | ||
680 | |||
681 | static ssize_t _sync_done(struct objio_state *ios) | ||
682 | { | ||
683 | struct completion *waiting = ios->private; | ||
684 | |||
685 | complete(waiting); | ||
686 | return 0; | ||
687 | } | ||
688 | |||
689 | static void _last_io(struct kref *kref) | ||
690 | { | ||
691 | struct objio_state *ios = container_of(kref, struct objio_state, kref); | ||
692 | |||
693 | ios->done(ios); | ||
694 | } | ||
695 | |||
696 | static void _done_io(struct osd_request *or, void *p) | ||
697 | { | ||
698 | struct objio_state *ios = p; | ||
699 | |||
700 | kref_put(&ios->kref, _last_io); | ||
701 | } | ||
702 | |||
703 | static ssize_t _io_exec(struct objio_state *ios) | ||
704 | { | ||
705 | DECLARE_COMPLETION_ONSTACK(wait); | ||
706 | ssize_t status = 0; /* sync status */ | ||
707 | unsigned i; | ||
708 | objio_done_fn saved_done_fn = ios->done; | ||
709 | bool sync = ios->ol_state.sync; | ||
710 | |||
711 | if (sync) { | ||
712 | ios->done = _sync_done; | ||
713 | ios->private = &wait; | ||
714 | } | ||
715 | |||
716 | kref_init(&ios->kref); | ||
717 | |||
718 | for (i = 0; i < ios->numdevs; i++) { | ||
719 | struct osd_request *or = ios->per_dev[i].or; | ||
720 | |||
721 | if (!or) | ||
722 | continue; | ||
723 | |||
724 | kref_get(&ios->kref); | ||
725 | osd_execute_request_async(or, _done_io, ios); | ||
726 | } | ||
727 | |||
728 | kref_put(&ios->kref, _last_io); | ||
729 | |||
730 | if (sync) { | ||
731 | wait_for_completion(&wait); | ||
732 | status = saved_done_fn(ios); | ||
733 | } | ||
734 | |||
735 | return status; | ||
736 | } | ||
737 | |||
738 | /* | ||
739 | * read | ||
740 | */ | ||
741 | static ssize_t _read_done(struct objio_state *ios) | ||
742 | { | ||
743 | ssize_t status; | ||
744 | int ret = _io_check(ios, false); | ||
745 | |||
746 | _io_free(ios); | ||
747 | |||
748 | if (likely(!ret)) | ||
749 | status = ios->length; | ||
750 | else | ||
751 | status = ret; | ||
752 | |||
753 | objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); | ||
754 | return status; | ||
755 | } | ||
756 | |||
757 | static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) | ||
758 | { | ||
759 | struct osd_request *or = NULL; | ||
760 | struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; | ||
761 | unsigned dev = per_dev->dev; | ||
762 | struct pnfs_osd_object_cred *cred = | ||
763 | &ios->layout->comps[dev]; | ||
764 | struct osd_obj_id obj = { | ||
765 | .partition = cred->oc_object_id.oid_partition_id, | ||
766 | .id = cred->oc_object_id.oid_object_id, | ||
767 | }; | ||
768 | int ret; | ||
769 | |||
770 | or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); | ||
771 | if (unlikely(!or)) { | ||
772 | ret = -ENOMEM; | ||
773 | goto err; | ||
774 | } | ||
775 | per_dev->or = or; | ||
776 | |||
777 | osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); | ||
778 | |||
779 | ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); | ||
780 | if (ret) { | ||
781 | dprintk("%s: Faild to osd_finalize_request() => %d\n", | ||
782 | __func__, ret); | ||
783 | goto err; | ||
784 | } | ||
785 | |||
786 | dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", | ||
787 | __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), | ||
788 | per_dev->length); | ||
789 | |||
790 | err: | ||
791 | return ret; | ||
792 | } | ||
793 | |||
794 | static ssize_t _read_exec(struct objio_state *ios) | ||
795 | { | ||
796 | unsigned i; | ||
797 | int ret; | ||
798 | |||
799 | for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { | ||
800 | if (!ios->per_dev[i].length) | ||
801 | continue; | ||
802 | ret = _read_mirrors(ios, i); | ||
803 | if (unlikely(ret)) | ||
804 | goto err; | ||
805 | } | ||
806 | |||
807 | ios->done = _read_done; | ||
808 | return _io_exec(ios); /* In sync mode exec returns the io status */ | ||
809 | |||
810 | err: | ||
811 | _io_free(ios); | ||
812 | return ret; | ||
813 | } | ||
814 | |||
815 | ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) | ||
816 | { | ||
817 | struct objio_state *ios = container_of(ol_state, struct objio_state, | ||
818 | ol_state); | ||
819 | int ret; | ||
820 | |||
821 | ret = _io_rw_pagelist(ios, GFP_KERNEL); | ||
822 | if (unlikely(ret)) | ||
823 | return ret; | ||
824 | |||
825 | return _read_exec(ios); | ||
826 | } | ||
827 | |||
828 | /* | ||
829 | * write | ||
830 | */ | ||
831 | static ssize_t _write_done(struct objio_state *ios) | ||
832 | { | ||
833 | ssize_t status; | ||
834 | int ret = _io_check(ios, true); | ||
835 | |||
836 | _io_free(ios); | ||
837 | |||
838 | if (likely(!ret)) { | ||
839 | /* FIXME: should be based on the OSD's persistence model | ||
840 | * See OSD2r05 Section 4.13 Data persistence model */ | ||
841 | ios->ol_state.committed = NFS_FILE_SYNC; | ||
842 | status = ios->length; | ||
843 | } else { | ||
844 | status = ret; | ||
845 | } | ||
846 | |||
847 | objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); | ||
848 | return status; | ||
849 | } | ||
850 | |||
851 | static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) | ||
852 | { | ||
853 | struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; | ||
854 | unsigned dev = ios->per_dev[cur_comp].dev; | ||
855 | unsigned last_comp = cur_comp + ios->layout->mirrors_p1; | ||
856 | int ret; | ||
857 | |||
858 | for (; cur_comp < last_comp; ++cur_comp, ++dev) { | ||
859 | struct osd_request *or = NULL; | ||
860 | struct pnfs_osd_object_cred *cred = | ||
861 | &ios->layout->comps[dev]; | ||
862 | struct osd_obj_id obj = { | ||
863 | .partition = cred->oc_object_id.oid_partition_id, | ||
864 | .id = cred->oc_object_id.oid_object_id, | ||
865 | }; | ||
866 | struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; | ||
867 | struct bio *bio; | ||
868 | |||
869 | or = osd_start_request(_io_od(ios, dev), GFP_NOFS); | ||
870 | if (unlikely(!or)) { | ||
871 | ret = -ENOMEM; | ||
872 | goto err; | ||
873 | } | ||
874 | per_dev->or = or; | ||
875 | |||
876 | if (per_dev != master_dev) { | ||
877 | bio = bio_kmalloc(GFP_NOFS, | ||
878 | master_dev->bio->bi_max_vecs); | ||
879 | if (unlikely(!bio)) { | ||
880 | dprintk("Faild to allocate BIO size=%u\n", | ||
881 | master_dev->bio->bi_max_vecs); | ||
882 | ret = -ENOMEM; | ||
883 | goto err; | ||
884 | } | ||
885 | |||
886 | __bio_clone(bio, master_dev->bio); | ||
887 | bio->bi_bdev = NULL; | ||
888 | bio->bi_next = NULL; | ||
889 | per_dev->bio = bio; | ||
890 | per_dev->dev = dev; | ||
891 | per_dev->length = master_dev->length; | ||
892 | per_dev->offset = master_dev->offset; | ||
893 | } else { | ||
894 | bio = master_dev->bio; | ||
895 | bio->bi_rw |= REQ_WRITE; | ||
896 | } | ||
897 | |||
898 | osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); | ||
899 | |||
900 | ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); | ||
901 | if (ret) { | ||
902 | dprintk("%s: Faild to osd_finalize_request() => %d\n", | ||
903 | __func__, ret); | ||
904 | goto err; | ||
905 | } | ||
906 | |||
907 | dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", | ||
908 | __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), | ||
909 | per_dev->length); | ||
910 | } | ||
911 | |||
912 | err: | ||
913 | return ret; | ||
914 | } | ||
915 | |||
916 | static ssize_t _write_exec(struct objio_state *ios) | ||
917 | { | ||
918 | unsigned i; | ||
919 | int ret; | ||
920 | |||
921 | for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { | ||
922 | if (!ios->per_dev[i].length) | ||
923 | continue; | ||
924 | ret = _write_mirrors(ios, i); | ||
925 | if (unlikely(ret)) | ||
926 | goto err; | ||
927 | } | ||
928 | |||
929 | ios->done = _write_done; | ||
930 | return _io_exec(ios); /* In sync mode exec returns the io->status */ | ||
931 | |||
932 | err: | ||
933 | _io_free(ios); | ||
934 | return ret; | ||
935 | } | ||
936 | |||
937 | ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) | ||
938 | { | ||
939 | struct objio_state *ios = container_of(ol_state, struct objio_state, | ||
940 | ol_state); | ||
941 | int ret; | ||
942 | |||
943 | /* TODO: ios->stable = stable; */ | ||
944 | ret = _io_rw_pagelist(ios, GFP_NOFS); | ||
945 | if (unlikely(ret)) | ||
946 | return ret; | ||
947 | |||
948 | return _write_exec(ios); | ||
949 | } | ||
950 | |||
951 | /* | ||
952 | * objlayout_pg_test(). Called by nfs_can_coalesce_requests() | ||
953 | * | ||
954 | * return 1 : coalesce page | ||
955 | * return 0 : don't coalesce page | ||
956 | */ | ||
957 | int | ||
958 | objlayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, | ||
959 | struct nfs_page *req) | ||
960 | { | ||
961 | return 1; | ||
962 | } | ||
362 | 963 | ||
363 | static struct pnfs_layoutdriver_type objlayout_type = { | 964 | static struct pnfs_layoutdriver_type objlayout_type = { |
364 | .id = LAYOUT_OSD2_OBJECTS, | 965 | .id = LAYOUT_OSD2_OBJECTS, |
@@ -370,6 +971,10 @@ static struct pnfs_layoutdriver_type objlayout_type = { | |||
370 | .alloc_lseg = objlayout_alloc_lseg, | 971 | .alloc_lseg = objlayout_alloc_lseg, |
371 | .free_lseg = objlayout_free_lseg, | 972 | .free_lseg = objlayout_free_lseg, |
372 | 973 | ||
974 | .read_pagelist = objlayout_read_pagelist, | ||
975 | .write_pagelist = objlayout_write_pagelist, | ||
976 | .pg_test = objlayout_pg_test, | ||
977 | |||
373 | .free_deviceid_node = objio_free_deviceid_node, | 978 | .free_deviceid_node = objio_free_deviceid_node, |
374 | }; | 979 | }; |
375 | 980 | ||
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index f14b4da34052..5157ef6d0041 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c | |||
@@ -129,6 +129,260 @@ objlayout_free_lseg(struct pnfs_layout_segment *lseg) | |||
129 | } | 129 | } |
130 | 130 | ||
131 | /* | 131 | /* |
132 | * I/O Operations | ||
133 | */ | ||
134 | static inline u64 | ||
135 | end_offset(u64 start, u64 len) | ||
136 | { | ||
137 | u64 end; | ||
138 | |||
139 | end = start + len; | ||
140 | return end >= start ? end : NFS4_MAX_UINT64; | ||
141 | } | ||
142 | |||
143 | /* last octet in a range */ | ||
144 | static inline u64 | ||
145 | last_byte_offset(u64 start, u64 len) | ||
146 | { | ||
147 | u64 end; | ||
148 | |||
149 | BUG_ON(!len); | ||
150 | end = start + len; | ||
151 | return end > start ? end - 1 : NFS4_MAX_UINT64; | ||
152 | } | ||
153 | |||
154 | static struct objlayout_io_state * | ||
155 | objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, | ||
156 | struct page **pages, | ||
157 | unsigned pgbase, | ||
158 | loff_t offset, | ||
159 | size_t count, | ||
160 | struct pnfs_layout_segment *lseg, | ||
161 | void *rpcdata, | ||
162 | gfp_t gfp_flags) | ||
163 | { | ||
164 | struct objlayout_io_state *state; | ||
165 | u64 lseg_end_offset; | ||
166 | |||
167 | dprintk("%s: allocating io_state\n", __func__); | ||
168 | if (objio_alloc_io_state(lseg, &state, gfp_flags)) | ||
169 | return NULL; | ||
170 | |||
171 | BUG_ON(offset < lseg->pls_range.offset); | ||
172 | lseg_end_offset = end_offset(lseg->pls_range.offset, | ||
173 | lseg->pls_range.length); | ||
174 | BUG_ON(offset >= lseg_end_offset); | ||
175 | if (offset + count > lseg_end_offset) { | ||
176 | count = lseg->pls_range.length - | ||
177 | (offset - lseg->pls_range.offset); | ||
178 | dprintk("%s: truncated count %Zd\n", __func__, count); | ||
179 | } | ||
180 | |||
181 | if (pgbase > PAGE_SIZE) { | ||
182 | pages += pgbase >> PAGE_SHIFT; | ||
183 | pgbase &= ~PAGE_MASK; | ||
184 | } | ||
185 | |||
186 | state->lseg = lseg; | ||
187 | state->rpcdata = rpcdata; | ||
188 | state->pages = pages; | ||
189 | state->pgbase = pgbase; | ||
190 | state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
191 | state->offset = offset; | ||
192 | state->count = count; | ||
193 | state->sync = 0; | ||
194 | |||
195 | return state; | ||
196 | } | ||
197 | |||
198 | static void | ||
199 | objlayout_free_io_state(struct objlayout_io_state *state) | ||
200 | { | ||
201 | dprintk("%s: freeing io_state\n", __func__); | ||
202 | if (unlikely(!state)) | ||
203 | return; | ||
204 | |||
205 | objio_free_io_state(state); | ||
206 | } | ||
207 | |||
208 | /* | ||
209 | * I/O done common code | ||
210 | */ | ||
211 | static void | ||
212 | objlayout_iodone(struct objlayout_io_state *state) | ||
213 | { | ||
214 | dprintk("%s: state %p status\n", __func__, state); | ||
215 | |||
216 | objlayout_free_io_state(state); | ||
217 | } | ||
218 | |||
219 | /* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). | ||
220 | * This is because the osd completion is called with ints-off from | ||
221 | * the block layer | ||
222 | */ | ||
223 | static void _rpc_read_complete(struct work_struct *work) | ||
224 | { | ||
225 | struct rpc_task *task; | ||
226 | struct nfs_read_data *rdata; | ||
227 | |||
228 | dprintk("%s enter\n", __func__); | ||
229 | task = container_of(work, struct rpc_task, u.tk_work); | ||
230 | rdata = container_of(task, struct nfs_read_data, task); | ||
231 | |||
232 | pnfs_ld_read_done(rdata); | ||
233 | } | ||
234 | |||
235 | void | ||
236 | objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) | ||
237 | { | ||
238 | int eof = state->eof; | ||
239 | struct nfs_read_data *rdata; | ||
240 | |||
241 | state->status = status; | ||
242 | dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof); | ||
243 | rdata = state->rpcdata; | ||
244 | rdata->task.tk_status = status; | ||
245 | if (status >= 0) { | ||
246 | rdata->res.count = status; | ||
247 | rdata->res.eof = eof; | ||
248 | } | ||
249 | objlayout_iodone(state); | ||
250 | /* must not use state after this point */ | ||
251 | |||
252 | if (sync) | ||
253 | pnfs_ld_read_done(rdata); | ||
254 | else { | ||
255 | INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete); | ||
256 | schedule_work(&rdata->task.u.tk_work); | ||
257 | } | ||
258 | } | ||
259 | |||
260 | /* | ||
261 | * Perform sync or async reads. | ||
262 | */ | ||
263 | enum pnfs_try_status | ||
264 | objlayout_read_pagelist(struct nfs_read_data *rdata) | ||
265 | { | ||
266 | loff_t offset = rdata->args.offset; | ||
267 | size_t count = rdata->args.count; | ||
268 | struct objlayout_io_state *state; | ||
269 | ssize_t status = 0; | ||
270 | loff_t eof; | ||
271 | |||
272 | dprintk("%s: Begin inode %p offset %llu count %d\n", | ||
273 | __func__, rdata->inode, offset, (int)count); | ||
274 | |||
275 | eof = i_size_read(rdata->inode); | ||
276 | if (unlikely(offset + count > eof)) { | ||
277 | if (offset >= eof) { | ||
278 | status = 0; | ||
279 | rdata->res.count = 0; | ||
280 | rdata->res.eof = 1; | ||
281 | goto out; | ||
282 | } | ||
283 | count = eof - offset; | ||
284 | } | ||
285 | |||
286 | state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, | ||
287 | rdata->args.pages, rdata->args.pgbase, | ||
288 | offset, count, | ||
289 | rdata->lseg, rdata, | ||
290 | GFP_KERNEL); | ||
291 | if (unlikely(!state)) { | ||
292 | status = -ENOMEM; | ||
293 | goto out; | ||
294 | } | ||
295 | |||
296 | state->eof = state->offset + state->count >= eof; | ||
297 | |||
298 | status = objio_read_pagelist(state); | ||
299 | out: | ||
300 | dprintk("%s: Return status %Zd\n", __func__, status); | ||
301 | rdata->pnfs_error = status; | ||
302 | return PNFS_ATTEMPTED; | ||
303 | } | ||
304 | |||
305 | /* Function scheduled on rpc workqueue to call ->nfs_writelist_complete(). | ||
306 | * This is because the osd completion is called with ints-off from | ||
307 | * the block layer | ||
308 | */ | ||
309 | static void _rpc_write_complete(struct work_struct *work) | ||
310 | { | ||
311 | struct rpc_task *task; | ||
312 | struct nfs_write_data *wdata; | ||
313 | |||
314 | dprintk("%s enter\n", __func__); | ||
315 | task = container_of(work, struct rpc_task, u.tk_work); | ||
316 | wdata = container_of(task, struct nfs_write_data, task); | ||
317 | |||
318 | pnfs_ld_write_done(wdata); | ||
319 | } | ||
320 | |||
321 | void | ||
322 | objlayout_write_done(struct objlayout_io_state *state, ssize_t status, | ||
323 | bool sync) | ||
324 | { | ||
325 | struct nfs_write_data *wdata; | ||
326 | |||
327 | dprintk("%s: Begin\n", __func__); | ||
328 | wdata = state->rpcdata; | ||
329 | state->status = status; | ||
330 | wdata->task.tk_status = status; | ||
331 | if (status >= 0) { | ||
332 | wdata->res.count = status; | ||
333 | wdata->verf.committed = state->committed; | ||
334 | dprintk("%s: Return status %d committed %d\n", | ||
335 | __func__, wdata->task.tk_status, | ||
336 | wdata->verf.committed); | ||
337 | } else | ||
338 | dprintk("%s: Return status %d\n", | ||
339 | __func__, wdata->task.tk_status); | ||
340 | objlayout_iodone(state); | ||
341 | /* must not use state after this point */ | ||
342 | |||
343 | if (sync) | ||
344 | pnfs_ld_write_done(wdata); | ||
345 | else { | ||
346 | INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete); | ||
347 | schedule_work(&wdata->task.u.tk_work); | ||
348 | } | ||
349 | } | ||
350 | |||
351 | /* | ||
352 | * Perform sync or async writes. | ||
353 | */ | ||
354 | enum pnfs_try_status | ||
355 | objlayout_write_pagelist(struct nfs_write_data *wdata, | ||
356 | int how) | ||
357 | { | ||
358 | struct objlayout_io_state *state; | ||
359 | ssize_t status; | ||
360 | |||
361 | dprintk("%s: Begin inode %p offset %llu count %u\n", | ||
362 | __func__, wdata->inode, wdata->args.offset, wdata->args.count); | ||
363 | |||
364 | state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, | ||
365 | wdata->args.pages, | ||
366 | wdata->args.pgbase, | ||
367 | wdata->args.offset, | ||
368 | wdata->args.count, | ||
369 | wdata->lseg, wdata, | ||
370 | GFP_NOFS); | ||
371 | if (unlikely(!state)) { | ||
372 | status = -ENOMEM; | ||
373 | goto out; | ||
374 | } | ||
375 | |||
376 | state->sync = how & FLUSH_SYNC; | ||
377 | |||
378 | status = objio_write_pagelist(state, how & FLUSH_STABLE); | ||
379 | out: | ||
380 | dprintk("%s: Return status %Zd\n", __func__, status); | ||
381 | wdata->pnfs_error = status; | ||
382 | return PNFS_ATTEMPTED; | ||
383 | } | ||
384 | |||
385 | /* | ||
132 | * Get Device Info API for io engines | 386 | * Get Device Info API for io engines |
133 | */ | 387 | */ |
134 | struct objlayout_deviceinfo { | 388 | struct objlayout_deviceinfo { |
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index fa0262149f59..9a405e8069f3 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h | |||
@@ -59,6 +59,26 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo) | |||
59 | } | 59 | } |
60 | 60 | ||
61 | /* | 61 | /* |
62 | * per-I/O operation state | ||
63 | * embedded in objects provider io_state data structure | ||
64 | */ | ||
65 | struct objlayout_io_state { | ||
66 | struct pnfs_layout_segment *lseg; | ||
67 | |||
68 | struct page **pages; | ||
69 | unsigned pgbase; | ||
70 | unsigned nr_pages; | ||
71 | unsigned long count; | ||
72 | loff_t offset; | ||
73 | bool sync; | ||
74 | |||
75 | void *rpcdata; | ||
76 | int status; /* res */ | ||
77 | int eof; /* res */ | ||
78 | int committed; /* res */ | ||
79 | }; | ||
80 | |||
81 | /* | ||
62 | * Raid engine I/O API | 82 | * Raid engine I/O API |
63 | */ | 83 | */ |
64 | extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, | 84 | extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, |
@@ -68,9 +88,24 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, | |||
68 | gfp_t gfp_flags); | 88 | gfp_t gfp_flags); |
69 | extern void objio_free_lseg(struct pnfs_layout_segment *lseg); | 89 | extern void objio_free_lseg(struct pnfs_layout_segment *lseg); |
70 | 90 | ||
91 | extern int objio_alloc_io_state( | ||
92 | struct pnfs_layout_segment *lseg, | ||
93 | struct objlayout_io_state **outp, | ||
94 | gfp_t gfp_flags); | ||
95 | extern void objio_free_io_state(struct objlayout_io_state *state); | ||
96 | |||
97 | extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); | ||
98 | extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, | ||
99 | bool stable); | ||
100 | |||
71 | /* | 101 | /* |
72 | * callback API | 102 | * callback API |
73 | */ | 103 | */ |
104 | extern void objlayout_read_done(struct objlayout_io_state *state, | ||
105 | ssize_t status, bool sync); | ||
106 | extern void objlayout_write_done(struct objlayout_io_state *state, | ||
107 | ssize_t status, bool sync); | ||
108 | |||
74 | extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, | 109 | extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, |
75 | struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr, | 110 | struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr, |
76 | gfp_t gfp_flags); | 111 | gfp_t gfp_flags); |
@@ -89,4 +124,11 @@ extern struct pnfs_layout_segment *objlayout_alloc_lseg( | |||
89 | gfp_t gfp_flags); | 124 | gfp_t gfp_flags); |
90 | extern void objlayout_free_lseg(struct pnfs_layout_segment *); | 125 | extern void objlayout_free_lseg(struct pnfs_layout_segment *); |
91 | 126 | ||
127 | extern enum pnfs_try_status objlayout_read_pagelist( | ||
128 | struct nfs_read_data *); | ||
129 | |||
130 | extern enum pnfs_try_status objlayout_write_pagelist( | ||
131 | struct nfs_write_data *, | ||
132 | int how); | ||
133 | |||
92 | #endif /* _OBJLAYOUT_H */ | 134 | #endif /* _OBJLAYOUT_H */ |