diff options
author | Boaz Harrosh <bharrosh@panasas.com> | 2011-09-28 04:55:51 -0400 |
---|---|---|
committer | Boaz Harrosh <bharrosh@panasas.com> | 2011-10-14 12:52:50 -0400 |
commit | b916c5cd4d895a27b47a652648958f73e4f23ac6 (patch) | |
tree | 9fe6e59edd44119c79a18b9df0b02a0c4dacb6d1 /fs/exofs/ore.c | |
parent | d866d875f68fdeae63df334d291fe138dc636d96 (diff) |
ore: Only IO one group at a time (API change)
Usually a single IO is confined to one group of devices
(group_width) and at the boundary of a raid group it can
spill into a second group. Current code would allocate a
full device_table size array at each io_state so it can
comply with requests that span two groups. Needless to say
that is very wasteful, especially when device_table count
can get very large (hundreds, even thousands), while a
group_width is usually 8 or 10.
* Change ore API to trim an IO that spans two raid groups.
The user passes offset+length to ore_get_rw_state; the
ore might trim that length if it spans a group boundary.
The user must check ios->length or ios->nr_pages to see
how much IO will be performed. It is the responsibility
of the user to re-issue the remainder of the IO.
* Modify exofs to copy spilled pages onto the next IO.
This means one last kick is needed after all coalescing
of pages is done.
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Diffstat (limited to 'fs/exofs/ore.c')
-rw-r--r-- | fs/exofs/ore.c | 105 |
1 files changed, 69 insertions, 36 deletions
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index a7d79257fc65..c1c2cc607adf 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c | |||
@@ -47,6 +47,9 @@ MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); | |||
47 | MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); | 47 | MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); |
48 | MODULE_LICENSE("GPL"); | 48 | MODULE_LICENSE("GPL"); |
49 | 49 | ||
50 | static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, | ||
51 | struct ore_striping_info *si); | ||
52 | |||
50 | static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) | 53 | static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) |
51 | { | 54 | { |
52 | return ios->oc->comps[index & ios->oc->single_comp].cred; | 55 | return ios->oc->comps[index & ios->oc->single_comp].cred; |
@@ -62,38 +65,85 @@ static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) | |||
62 | return ore_comp_dev(ios->oc, index); | 65 | return ore_comp_dev(ios->oc, index); |
63 | } | 66 | } |
64 | 67 | ||
65 | int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, | 68 | static int _get_io_state(struct ore_layout *layout, |
66 | bool is_reading, u64 offset, u64 length, | 69 | struct ore_components *oc, unsigned numdevs, |
67 | struct ore_io_state **pios) | 70 | struct ore_io_state **pios) |
68 | { | 71 | { |
69 | struct ore_io_state *ios; | 72 | struct ore_io_state *ios; |
70 | 73 | ||
71 | /*TODO: Maybe use kmem_cach per sbi of size | 74 | /*TODO: Maybe use kmem_cach per sbi of size |
72 | * exofs_io_state_size(layout->s_numdevs) | 75 | * exofs_io_state_size(layout->s_numdevs) |
73 | */ | 76 | */ |
74 | ios = kzalloc(ore_io_state_size(oc->numdevs), GFP_KERNEL); | 77 | ios = kzalloc(ore_io_state_size(numdevs), GFP_KERNEL); |
75 | if (unlikely(!ios)) { | 78 | if (unlikely(!ios)) { |
76 | ORE_DBGMSG("Failed kzalloc bytes=%d\n", | 79 | ORE_DBGMSG("Failed kzalloc bytes=%d\n", |
77 | ore_io_state_size(oc->numdevs)); | 80 | ore_io_state_size(numdevs)); |
78 | *pios = NULL; | 81 | *pios = NULL; |
79 | return -ENOMEM; | 82 | return -ENOMEM; |
80 | } | 83 | } |
81 | 84 | ||
82 | ios->layout = layout; | 85 | ios->layout = layout; |
83 | ios->oc = oc; | 86 | ios->oc = oc; |
84 | ios->offset = offset; | 87 | *pios = ios; |
85 | ios->length = length; | 88 | return 0; |
89 | } | ||
90 | |||
91 | /* Allocate an io_state for only a single group of devices | ||
92 | * | ||
93 | * If a user needs to call ore_read/write() this version must be used becase it | ||
94 | * allocates extra stuff for striping and raid. | ||
95 | * The ore might decide to only IO less then @length bytes do to alignmets | ||
96 | * and constrains as follows: | ||
97 | * - The IO cannot cross group boundary. | ||
98 | * - In raid5/6 The end of the IO must align at end of a stripe eg. | ||
99 | * (@offset + @length) % strip_size == 0. Or the complete range is within a | ||
100 | * single stripe. | ||
101 | * - Memory condition only permitted a shorter IO. (A user can use @length=~0 | ||
102 | * And check the returned ios->length for max_io_size.) | ||
103 | * | ||
104 | * The caller must check returned ios->length (and/or ios->nr_pages) and | ||
105 | * re-issue these pages that fall outside of ios->length | ||
106 | */ | ||
107 | int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, | ||
108 | bool is_reading, u64 offset, u64 length, | ||
109 | struct ore_io_state **pios) | ||
110 | { | ||
111 | struct ore_io_state *ios; | ||
112 | unsigned numdevs = layout->group_width * layout->mirrors_p1; | ||
113 | int ret; | ||
114 | |||
115 | ret = _get_io_state(layout, oc, numdevs, pios); | ||
116 | if (unlikely(ret)) | ||
117 | return ret; | ||
118 | |||
119 | ios = *pios; | ||
86 | ios->reading = is_reading; | 120 | ios->reading = is_reading; |
121 | ios->offset = offset; | ||
122 | |||
123 | if (length) { | ||
124 | struct ore_striping_info si; | ||
125 | |||
126 | ore_calc_stripe_info(layout, offset, &si); | ||
127 | ios->length = (length <= si.group_length) ? length : | ||
128 | si.group_length; | ||
129 | ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; | ||
130 | } | ||
87 | 131 | ||
88 | *pios = ios; | ||
89 | return 0; | 132 | return 0; |
90 | } | 133 | } |
91 | EXPORT_SYMBOL(ore_get_rw_state); | 134 | EXPORT_SYMBOL(ore_get_rw_state); |
92 | 135 | ||
136 | /* Allocate an io_state for all the devices in the comps array | ||
137 | * | ||
138 | * This version of io_state allocation is used mostly by create/remove | ||
139 | * and trunc where we currently need all the devices. The only wastful | ||
140 | * bit is the read/write_attributes with no IO. Those sites should | ||
141 | * be converted to use ore_get_rw_state() with length=0 | ||
142 | */ | ||
93 | int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, | 143 | int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, |
94 | struct ore_io_state **ios) | 144 | struct ore_io_state **pios) |
95 | { | 145 | { |
96 | return ore_get_rw_state(layout, oc, true, 0, 0, ios); | 146 | return _get_io_state(layout, oc, oc->numdevs, pios); |
97 | } | 147 | } |
98 | EXPORT_SYMBOL(ore_get_io_state); | 148 | EXPORT_SYMBOL(ore_get_io_state); |
99 | 149 | ||
@@ -374,12 +424,12 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length, | |||
374 | unsigned devs_in_group = ios->layout->group_width * mirrors_p1; | 424 | unsigned devs_in_group = ios->layout->group_width * mirrors_p1; |
375 | unsigned dev = si->dev; | 425 | unsigned dev = si->dev; |
376 | unsigned first_dev = dev - (dev % devs_in_group); | 426 | unsigned first_dev = dev - (dev % devs_in_group); |
377 | unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; | ||
378 | unsigned cur_pg = ios->pages_consumed; | 427 | unsigned cur_pg = ios->pages_consumed; |
379 | int ret = 0; | 428 | int ret = 0; |
380 | 429 | ||
381 | while (length) { | 430 | while (length) { |
382 | struct ore_per_dev_state *per_dev = &ios->per_dev[dev]; | 431 | unsigned comp = dev - first_dev; |
432 | struct ore_per_dev_state *per_dev = &ios->per_dev[comp]; | ||
383 | unsigned cur_len, page_off = 0; | 433 | unsigned cur_len, page_off = 0; |
384 | 434 | ||
385 | if (!per_dev->length) { | 435 | if (!per_dev->length) { |
@@ -397,9 +447,6 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length, | |||
397 | per_dev->offset = si->obj_offset - si->unit_off; | 447 | per_dev->offset = si->obj_offset - si->unit_off; |
398 | cur_len = stripe_unit; | 448 | cur_len = stripe_unit; |
399 | } | 449 | } |
400 | |||
401 | if (max_comp < dev) | ||
402 | max_comp = dev; | ||
403 | } else { | 450 | } else { |
404 | cur_len = stripe_unit; | 451 | cur_len = stripe_unit; |
405 | } | 452 | } |
@@ -417,17 +464,15 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length, | |||
417 | length -= cur_len; | 464 | length -= cur_len; |
418 | } | 465 | } |
419 | out: | 466 | out: |
420 | ios->numdevs = max_comp + mirrors_p1; | 467 | ios->numdevs = devs_in_group; |
421 | ios->pages_consumed = cur_pg; | 468 | ios->pages_consumed = cur_pg; |
422 | return ret; | 469 | return ret; |
423 | } | 470 | } |
424 | 471 | ||
425 | static int _prepare_for_striping(struct ore_io_state *ios) | 472 | static int _prepare_for_striping(struct ore_io_state *ios) |
426 | { | 473 | { |
427 | u64 length = ios->length; | ||
428 | u64 offset = ios->offset; | ||
429 | struct ore_striping_info si; | 474 | struct ore_striping_info si; |
430 | int ret = 0; | 475 | int ret; |
431 | 476 | ||
432 | if (!ios->pages) { | 477 | if (!ios->pages) { |
433 | if (ios->kern_buff) { | 478 | if (ios->kern_buff) { |
@@ -446,21 +491,11 @@ static int _prepare_for_striping(struct ore_io_state *ios) | |||
446 | return 0; | 491 | return 0; |
447 | } | 492 | } |
448 | 493 | ||
449 | while (length) { | 494 | ore_calc_stripe_info(ios->layout, ios->offset, &si); |
450 | ore_calc_stripe_info(ios->layout, offset, &si); | ||
451 | |||
452 | if (length < si.group_length) | ||
453 | si.group_length = length; | ||
454 | 495 | ||
455 | ret = _prepare_one_group(ios, si.group_length, &si); | 496 | BUG_ON(ios->length > si.group_length); |
456 | if (unlikely(ret)) | 497 | ret = _prepare_one_group(ios, ios->length, &si); |
457 | goto out; | ||
458 | 498 | ||
459 | offset += si.group_length; | ||
460 | length -= si.group_length; | ||
461 | } | ||
462 | |||
463 | out: | ||
464 | return ret; | 499 | return ret; |
465 | } | 500 | } |
466 | 501 | ||
@@ -742,7 +777,6 @@ struct _trunc_info { | |||
742 | 777 | ||
743 | unsigned first_group_dev; | 778 | unsigned first_group_dev; |
744 | unsigned nex_group_dev; | 779 | unsigned nex_group_dev; |
745 | unsigned max_devs; | ||
746 | }; | 780 | }; |
747 | 781 | ||
748 | static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, | 782 | static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, |
@@ -757,7 +791,6 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, | |||
757 | 791 | ||
758 | ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); | 792 | ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); |
759 | ti->nex_group_dev = ti->first_group_dev + layout->group_width; | 793 | ti->nex_group_dev = ti->first_group_dev + layout->group_width; |
760 | ti->max_devs = layout->group_width * layout->group_count; | ||
761 | } | 794 | } |
762 | 795 | ||
763 | int ore_truncate(struct ore_layout *layout, struct ore_components *oc, | 796 | int ore_truncate(struct ore_layout *layout, struct ore_components *oc, |
@@ -777,7 +810,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc, | |||
777 | 810 | ||
778 | _calc_trunk_info(ios->layout, size, &ti); | 811 | _calc_trunk_info(ios->layout, size, &ti); |
779 | 812 | ||
780 | size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs), | 813 | size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs), |
781 | GFP_KERNEL); | 814 | GFP_KERNEL); |
782 | if (unlikely(!size_attrs)) { | 815 | if (unlikely(!size_attrs)) { |
783 | ret = -ENOMEM; | 816 | ret = -ENOMEM; |
@@ -786,7 +819,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc, | |||
786 | 819 | ||
787 | ios->numdevs = ios->oc->numdevs; | 820 | ios->numdevs = ios->oc->numdevs; |
788 | 821 | ||
789 | for (i = 0; i < ti.max_devs; ++i) { | 822 | for (i = 0; i < ios->numdevs; ++i) { |
790 | struct exofs_trunc_attr *size_attr = &size_attrs[i]; | 823 | struct exofs_trunc_attr *size_attr = &size_attrs[i]; |
791 | u64 obj_size; | 824 | u64 obj_size; |
792 | 825 | ||