author    Boaz Harrosh <bharrosh@panasas.com>  2011-10-14 09:33:51 -0400
committer Boaz Harrosh <bharrosh@panasas.com>  2011-10-24 20:15:33 -0400
commit    769ba8d92025fa390f3097e658b8ed6e032d68e9 (patch)
tree      eec1a556d6b3796f702c96e7f97ce94ba7d9d410
parent    a1fec1dbbc8db974d2582e4040590cebe72171e4 (diff)
ore: RAID5 Write
This is finally the RAID5 Write support.

The bigger part of this patch is not the XOR engine itself, but the
read4write logic, which is a complete mini prepare_for_striping reading
engine that can read scattered pages of a stripe into cache so they can
be used for the XOR calculation. That is needed when the write was not
stripe aligned.

The main algorithm behind the XOR engine is the 2-dimensional array
struct __stripe_pages_2d. A drawing might save 1000 words:

---
__stripe_pages_2d
       |
 n = pages_in_stripe_unit;
 w = group_width - parity;
       |                          pages array presented to the XOR lib
       |                                                |
       V                                                |
 __1_page_stripe[0].pages --> [c0][c1]..[cw][c_par] <---|
       |                                                |
 __1_page_stripe[1].pages --> [c0][c1]..[cw][c_par] <---
       |
 ...                              ...
       |
 __1_page_stripe[n].pages --> [c0][c1]..[cw][c_par]
                               ^
                               |
                    data added columns first, then rows
---

The pages are put on this array columns first, i.e.:
	p0-of-c0, p1-of-c0, ... pn-of-c0, p0-of-c1, ...
So we are doing a corner turn of the pages.

Note that pages will zigzag down and left, but are put sequentially in
growing order. So when the time comes to XOR the stripe, only the
beginning and end of the array need be checked. We scan the array and
any NULL spot will be filled by a page-to-be-read.

The FS that wants to support RAID5 needs to supply an operations vector
that looks up a given page in cache and specifies whether the page is
uptodate or needs reading. All these pages-to-be-read are put on a slave
ore_io_state and read synchronously. All the pages of a stripe are read
in one IO, using the scatter-gather mechanism.

In writes we constrain our IO to only be incomplete on a single stripe:
either the complete IO is within a single stripe, so we might have pages
to read at both the beginning and the end of the stripe, or we have some
reading to do at the beginning but end at a stripe boundary. The
leftover pages are pushed to the next IO by the API already established
by previous work, where an IO offset/length combination presented to the
ORE might get its length truncated and the user must re-submit the
leftover pages. (Both exofs and NFS support this.)

But any ORE user should make its best effort to align its IO beforehand
and avoid complications. A cached ore_layout->stripe_size member can be
used for that calculation. (NOTE: the ORE demands that stripe_size fit
in 32 bits.)

What else? Well, read it and tell me.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
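To make the corner turn concrete, here is a small stand-alone user-space sketch (not the kernel code; the grid dimensions are made-up example values). Pages arrive in ascending file order and are dropped into a pages_in_unit x group_width grid column by column, so that each row later becomes one source array for the XOR of that row's parity page:

/* Illustration only: columns-first ("corner turn") placement of pages
 * into a 2D grid, mirroring what _ore_add_stripe_page() does with
 * si->cur_pg / si->cur_comp. Constants are arbitrary example values.
 */
#include <stdio.h>

#define PAGES_IN_UNIT 4   /* stripe_unit / PAGE_SIZE */
#define DATA_DEVS     3   /* group_width - parity    */
#define PARITY        1

int main(void)
{
        /* grid[p][c]: row p is one __1_page_stripe, column c is one device.
         * The last column is reserved for the parity page of that row.
         */
        int grid[PAGES_IN_UNIT][DATA_DEVS + PARITY] = { { 0 } };
        int cur_pg = 0, cur_comp = 0;

        /* Pages are added in file order: p0-of-c0, p1-of-c0, ... pn-of-c0,
         * p0-of-c1, ... i.e. a column is filled before moving to the next
         * device's column.
         */
        for (int page = 0; page < PAGES_IN_UNIT * DATA_DEVS; page++) {
                grid[cur_pg][cur_comp] = page + 1;      /* 1-based for printing */
                cur_pg = (cur_pg + 1) % PAGES_IN_UNIT;
                if (cur_pg == 0)
                        cur_comp++;                     /* next column (device) */
        }

        /* Each row now holds the same page offset from every data device;
         * XOR-ing one row into its parity slot is one async_xor() call.
         */
        for (int p = 0; p < PAGES_IN_UNIT; p++) {
                for (int c = 0; c < DATA_DEVS + PARITY; c++)
                        printf("%4d", grid[p][c]);
                printf("   <-- one XOR source array\n");
        }
        return 0;
}

Row p then holds page p of every data unit plus an empty parity slot, which is exactly the shape _gen_xor_unit() hands to async_xor() in the patch below.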
-rw-r--r--  fs/exofs/Kconfig          9
-rw-r--r--  fs/exofs/ore.c           36
-rw-r--r--  fs/exofs/ore_raid.c     534
-rw-r--r--  fs/exofs/ore_raid.h      15
-rw-r--r--  include/scsi/osd_ore.h    9
5 files changed, 587 insertions, 16 deletions
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
index 70bae4149291..fa9a286c8771 100644
--- a/fs/exofs/Kconfig
+++ b/fs/exofs/Kconfig
@@ -1,10 +1,17 @@
+# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects
+# for every ORE user we do it like this. Any user should add itself here
+# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are
+# selected here, and we default to "ON". So in effect it is like been
+# selected by any of the users.
 config ORE
         tristate
+        depends on EXOFS_FS
+        select ASYNC_XOR
+        default SCSI_OSD_ULD
 
 config EXOFS_FS
         tristate "exofs: OSD based file system support"
         depends on SCSI_OSD_ULD
-        select ORE
         help
           EXOFS is a file system that uses an OSD storage device,
           as its backing storage.
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index fd6090ddd3bf..08ee454b2187 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -95,6 +95,14 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
         layout->max_io_length =
                 (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
                                         layout->group_width;
+        if (layout->parity) {
+                unsigned stripe_length =
+                                (layout->group_width - layout->parity) *
+                                layout->stripe_unit;
+
+                layout->max_io_length /= stripe_length;
+                layout->max_io_length *= stripe_length;
+        }
         return 0;
 }
 EXPORT_SYMBOL(ore_verify_layout);
@@ -118,7 +126,7 @@ static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
         return ore_comp_dev(ios->oc, index);
 }
 
-static int _ore_get_io_state(struct ore_layout *layout,
+int _ore_get_io_state(struct ore_layout *layout,
                         struct ore_components *oc, unsigned numdevs,
                         unsigned sgs_per_dev, unsigned num_par_pages,
                         struct ore_io_state **pios)
@@ -334,7 +342,7 @@ static void _done_io(struct osd_request *or, void *p)
         kref_put(&ios->kref, _last_io);
 }
 
-static int ore_io_execute(struct ore_io_state *ios)
+int ore_io_execute(struct ore_io_state *ios)
 {
         DECLARE_COMPLETION_ONSTACK(wait);
         bool sync = (ios->done == NULL);
@@ -597,6 +605,8 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
                         ret = -ENOMEM;
                         goto out;
                 }
+                _add_stripe_page(ios->sp2d, &ios->si, pages[pg]);
+
                 pgbase = 0;
                 ++pg;
         }
@@ -636,6 +646,7 @@ static int _prepare_for_striping(struct ore_io_state *ios)
 
         dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev);
         si->cur_comp = dev_order;
+        si->cur_pg = si->unit_off / PAGE_SIZE;
 
         while (length) {
                 unsigned comp = dev - first_dev;
@@ -677,14 +688,14 @@ static int _prepare_for_striping(struct ore_io_state *ios)
                 length -= cur_len;
 
                 si->cur_comp = (si->cur_comp + 1) % group_width;
-                if (unlikely((dev == si->par_dev) ||
-                             (!length && ios->parity_pages))) {
-                        if (!length)
+                if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) {
+                        if (!length && ios->sp2d) {
                                 /* If we are writing and this is the very last
                                  * stripe. then operate on parity dev.
                                  */
                                 dev = si->par_dev;
-                        if (ios->reading)
+                        }
+                        if (ios->sp2d)
                                 /* In writes cur_len just means if it's the
                                  * last one. See _ore_add_parity_unit.
                                  */
@@ -709,6 +720,7 @@ static int _prepare_for_striping(struct ore_io_state *ios)
                                         devs_in_group + first_dev;
                         /* Next stripe, start fresh */
                         si->cur_comp = 0;
+                        si->cur_pg = 0;
                 }
         }
 out:
@@ -873,6 +885,14 @@ int ore_write(struct ore_io_state *ios)
         int i;
         int ret;
 
+        if (unlikely(ios->sp2d && !ios->r4w)) {
+                /* A library is attempting a RAID-write without providing
+                 * a pages lock interface.
+                 */
+                WARN_ON_ONCE(1);
+                return -ENOTSUPP;
+        }
+
         ret = _prepare_for_striping(ios);
         if (unlikely(ret))
                 return ret;
@@ -888,7 +908,7 @@ int ore_write(struct ore_io_state *ios)
 }
 EXPORT_SYMBOL(ore_write);
 
-static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp)
+int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp)
 {
         struct osd_request *or;
         struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
@@ -952,7 +972,7 @@ int ore_read(struct ore_io_state *ios)
                 return ret;
 
         for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
-                ret = _read_mirror(ios, i);
+                ret = _ore_read_mirror(ios, i);
                 if (unlikely(ret))
                         return ret;
         }
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index 8d4b93a93c67..29c47e5c4a86 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -14,9 +14,13 @@
  */
 
 #include <linux/gfp.h>
+#include <linux/async_tx.h>
 
 #include "ore_raid.h"
 
+#undef ORE_DBGMSG2
+#define ORE_DBGMSG2 ORE_DBGMSG
+
 struct page *_raid_page_alloc(void)
 {
         return alloc_page(GFP_KERNEL);
@@ -27,6 +31,236 @@ void _raid_page_free(struct page *p)
         __free_page(p);
 }
 
+/* This struct is forward declare in ore_io_state, but is private to here.
+ * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit.
+ *
+ * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn.
+ * Ascending page index access is sp2d(p-minor, c-major). But storage is
+ * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor
+ * API.
+ */
+struct __stripe_pages_2d {
+        /* Cache some hot path repeated calculations */
+        unsigned parity;
+        unsigned data_devs;
+        unsigned pages_in_unit;
+
+        bool needed;
+
+        /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */
+        struct __1_page_stripe {
+                bool alloc;
+                unsigned write_count;
+                struct async_submit_ctl submit;
+                struct dma_async_tx_descriptor *tx;
+
+                /* The size of this array is data_devs + parity */
+                struct page **pages;
+                struct page **scribble;
+                /* bool array, size of this array is data_devs */
+                char *page_is_read;
+        } _1p_stripes[];
+};
+
+/* This can get bigger then a page. So support multiple page allocations
+ * _sp2d_free should be called even if _sp2d_alloc fails (by returning
+ * none-zero).
+ */
+static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width,
+                       unsigned parity, struct __stripe_pages_2d **psp2d)
+{
+        struct __stripe_pages_2d *sp2d;
+        unsigned data_devs = group_width - parity;
+        struct _alloc_all_bytes {
+                struct __alloc_stripe_pages_2d {
+                        struct __stripe_pages_2d sp2d;
+                        struct __1_page_stripe _1p_stripes[pages_in_unit];
+                } __asp2d;
+                struct __alloc_1p_arrays {
+                        struct page *pages[group_width];
+                        struct page *scribble[group_width];
+                        char page_is_read[data_devs];
+                } __a1pa[pages_in_unit];
+        } *_aab;
+        struct __alloc_1p_arrays *__a1pa;
+        struct __alloc_1p_arrays *__a1pa_end;
+        const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]);
+        unsigned num_a1pa, alloc_size, i;
+
+        /* FIXME: check these numbers in ore_verify_layout */
+        BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE);
+        BUG_ON(sizeof__a1pa > PAGE_SIZE);
+
+        if (sizeof(*_aab) > PAGE_SIZE) {
+                num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa;
+                alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa;
+        } else {
+                num_a1pa = pages_in_unit;
+                alloc_size = sizeof(*_aab);
+        }
+
+        _aab = kzalloc(alloc_size, GFP_KERNEL);
+        if (unlikely(!_aab)) {
+                ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size);
+                return -ENOMEM;
+        }
+
+        sp2d = &_aab->__asp2d.sp2d;
+        *psp2d = sp2d; /* From here Just call _sp2d_free */
+
+        __a1pa = _aab->__a1pa;
+        __a1pa_end = __a1pa + num_a1pa;
+
+        for (i = 0; i < pages_in_unit; ++i) {
+                if (unlikely(__a1pa >= __a1pa_end)) {
+                        num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa,
+                                         pages_in_unit - i);
+
+                        __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL);
+                        if (unlikely(!__a1pa)) {
+                                ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n",
+                                           num_a1pa);
+                                return -ENOMEM;
+                        }
+                        __a1pa_end = __a1pa + num_a1pa;
+                        /* First *pages is marked for kfree of the buffer */
+                        sp2d->_1p_stripes[i].alloc = true;
+                }
+
+                sp2d->_1p_stripes[i].pages = __a1pa->pages;
+                sp2d->_1p_stripes[i].scribble = __a1pa->scribble;
+                sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read;
+                ++__a1pa;
+        }
+
+        sp2d->parity = parity;
+        sp2d->data_devs = data_devs;
+        sp2d->pages_in_unit = pages_in_unit;
+        return 0;
+}
+
+static void _sp2d_reset(struct __stripe_pages_2d *sp2d,
+                        const struct _ore_r4w_op *r4w, void *priv)
+{
+        unsigned data_devs = sp2d->data_devs;
+        unsigned group_width = data_devs + sp2d->parity;
+        unsigned p;
+
+        if (!sp2d->needed)
+                return;
+
+        for (p = 0; p < sp2d->pages_in_unit; p++) {
+                struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+                if (_1ps->write_count < group_width) {
+                        unsigned c;
+
+                        for (c = 0; c < data_devs; c++)
+                                if (_1ps->page_is_read[c]) {
+                                        struct page *page = _1ps->pages[c];
+
+                                        r4w->put_page(priv, page);
+                                        _1ps->page_is_read[c] = false;
+                                }
+                }
+
+                memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages));
+                _1ps->write_count = 0;
+                _1ps->tx = NULL;
+        }
+
+        sp2d->needed = false;
+}
+
+static void _sp2d_free(struct __stripe_pages_2d *sp2d)
+{
+        unsigned i;
+
+        if (!sp2d)
+                return;
+
+        for (i = 0; i < sp2d->pages_in_unit; ++i) {
+                if (sp2d->_1p_stripes[i].alloc)
+                        kfree(sp2d->_1p_stripes[i].pages);
+        }
+
+        kfree(sp2d);
+}
+
+static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d)
+{
+        unsigned p;
+
+        for (p = 0; p < sp2d->pages_in_unit; p++) {
+                struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+                if (_1ps->write_count)
+                        return p;
+        }
+
+        return ~0;
+}
+
+static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
+{
+        unsigned p;
+
+        for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
+                struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+                if (_1ps->write_count)
+                        return p;
+        }
+
+        return ~0;
+}
+
+static void _gen_xor_unit(struct __stripe_pages_2d *sp2d)
+{
+        unsigned p;
+        for (p = 0; p < sp2d->pages_in_unit; p++) {
+                struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+                if (!_1ps->write_count)
+                        continue;
+
+                init_async_submit(&_1ps->submit,
+                        ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK,
+                        NULL,
+                        NULL, NULL,
+                        (addr_conv_t *)_1ps->scribble);
+
+                /* TODO: raid6 */
+                _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages,
+                                     0, sp2d->data_devs, PAGE_SIZE,
+                                     &_1ps->submit);
+        }
+
+        for (p = 0; p < sp2d->pages_in_unit; p++) {
+                struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+                /* NOTE: We wait for HW synchronously (I don't have such HW
+                 * to test with.) Is parallelism needed with today's multi
+                 * cores?
+                 */
+                async_tx_issue_pending(_1ps->tx);
+        }
+}
+
+void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
+                          struct ore_striping_info *si, struct page *page)
+{
+        struct __1_page_stripe *_1ps;
+
+        sp2d->needed = true;
+
+        _1ps = &sp2d->_1p_stripes[si->cur_pg];
+        _1ps->pages[si->cur_comp] = page;
+        ++_1ps->write_count;
+
+        si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit;
+        /* si->cur_comp is advanced outside at main loop */
+}
+
 void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
                      bool not_last)
 {
@@ -76,6 +310,240 @@ void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
         }
 }
 
+static int _alloc_read_4_write(struct ore_io_state *ios)
+{
+        struct ore_layout *layout = ios->layout;
+        int ret;
+        /* We want to only read those pages not in cache so worst case
+         * is a stripe populated with every other page
+         */
+        unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2;
+
+        ret = _ore_get_io_state(layout, ios->oc,
+                                layout->group_width * layout->mirrors_p1,
+                                sgs_per_dev, 0, &ios->ios_read_4_write);
+        return ret;
+}
+
+/* @si contains info of the to-be-inserted page. Update of @si should be
+ * maintained by caller. Specificaly si->dev, si->obj_offset, ...
+ */
+static int _add_to_read_4_write(struct ore_io_state *ios,
+                                struct ore_striping_info *si, struct page *page)
+{
+        struct request_queue *q;
+        struct ore_per_dev_state *per_dev;
+        struct ore_io_state *read_ios;
+        unsigned first_dev = si->dev - (si->dev %
+                          (ios->layout->group_width * ios->layout->mirrors_p1));
+        unsigned comp = si->dev - first_dev;
+        unsigned added_len;
+
+        if (!ios->ios_read_4_write) {
+                int ret = _alloc_read_4_write(ios);
+
+                if (unlikely(ret))
+                        return ret;
+        }
+
+        read_ios = ios->ios_read_4_write;
+        read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1;
+
+        per_dev = &read_ios->per_dev[comp];
+        if (!per_dev->length) {
+                per_dev->bio = bio_kmalloc(GFP_KERNEL,
+                                           ios->sp2d->pages_in_unit);
+                if (unlikely(!per_dev->bio)) {
+                        ORE_DBGMSG("Failed to allocate BIO size=%u\n",
+                                   ios->sp2d->pages_in_unit);
+                        return -ENOMEM;
+                }
+                per_dev->offset = si->obj_offset;
+                per_dev->dev = si->dev;
+        } else if (si->obj_offset != (per_dev->offset + per_dev->length)) {
+                u64 gap = si->obj_offset - (per_dev->offset + per_dev->length);
+
+                _ore_add_sg_seg(per_dev, gap, true);
+        }
+        q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev));
+        added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0);
+        if (unlikely(added_len != PAGE_SIZE)) {
+                ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n",
+                           per_dev->bio->bi_vcnt);
+                return -ENOMEM;
+        }
+
+        per_dev->length += PAGE_SIZE;
+        return 0;
+}
+
+static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
+{
+        struct bio_vec *bv;
+        unsigned i, d;
+
+        /* loop on all devices all pages */
+        for (d = 0; d < ios->numdevs; d++) {
+                struct bio *bio = ios->per_dev[d].bio;
+
+                if (!bio)
+                        continue;
+
+                __bio_for_each_segment(bv, bio, i, 0) {
+                        struct page *page = bv->bv_page;
+
+                        SetPageUptodate(page);
+                        if (PageError(page))
+                                ClearPageError(page);
+                }
+        }
+}
+
+/* read_4_write is hacked to read the start of the first stripe and/or
+ * the end of the last stripe. If needed, with an sg-gap at each device/page.
+ * It is assumed to be called after the to_be_written pages of the first stripe
+ * are populating ios->sp2d[][]
+ *
+ * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations
+ * These pages are held at sp2d[p].pages[c] but with
+ * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are
+ * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is
+ * @uptodate=true, so we don't need to read it, only unlock, after IO.
+ *
+ * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then
+ * to-be-written count, we should consider the xor-in-place mode.
+ * need_to_read_pages_count is the actual number of pages not present in cache.
+ * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough
+ * approximation? In this mode the read pages are put in the empty places of
+ * ios->sp2d[p][*], xor is calculated the same way. These pages are
+ * allocated/freed and don't go through cache
+ */
+static int _read_4_write(struct ore_io_state *ios)
+{
+        struct ore_io_state *ios_read;
+        struct ore_striping_info read_si;
+        struct __stripe_pages_2d *sp2d = ios->sp2d;
+        u64 offset = ios->si.first_stripe_start;
+        u64 last_stripe_end;
+        unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
+        unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1;
+        int ret;
+
+        if (offset == ios->offset) /* Go to start collect $200 */
+                goto read_last_stripe;
+
+        min_p = _sp2d_min_pg(sp2d);
+        max_p = _sp2d_max_pg(sp2d);
+
+        for (c = 0; ; c++) {
+                ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+                read_si.obj_offset += min_p * PAGE_SIZE;
+                offset += min_p * PAGE_SIZE;
+                for (p = min_p; p <= max_p; p++) {
+                        struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+                        struct page **pp = &_1ps->pages[c];
+                        bool uptodate;
+
+                        if (*pp)
+                                /* to-be-written pages start here */
+                                goto read_last_stripe;
+
+                        *pp = ios->r4w->get_page(ios->private, offset,
+                                                 &uptodate);
+                        if (unlikely(!*pp))
+                                return -ENOMEM;
+
+                        if (!uptodate)
+                                _add_to_read_4_write(ios, &read_si, *pp);
+
+                        /* Mark read-pages to be cache_released */
+                        _1ps->page_is_read[c] = true;
+                        read_si.obj_offset += PAGE_SIZE;
+                        offset += PAGE_SIZE;
+                }
+                offset += (sp2d->pages_in_unit - p) * PAGE_SIZE;
+        }
+
+read_last_stripe:
+        offset = ios->offset + (ios->length + PAGE_SIZE - 1) /
+                                PAGE_SIZE * PAGE_SIZE;
+        last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe)
+                                 * bytes_in_stripe;
+        if (offset == last_stripe_end) /* Optimize for the aligned case */
+                goto read_it;
+
+        ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+        p = read_si.unit_off / PAGE_SIZE;
+        c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1,
+                       ios->layout->mirrors_p1, read_si.par_dev, read_si.dev);
+
+        BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end);
+        /* unaligned IO must be within a single stripe */
+
+        if (min_p == sp2d->pages_in_unit) {
+                /* Didn't do it yet */
+                min_p = _sp2d_min_pg(sp2d);
+                max_p = _sp2d_max_pg(sp2d);
+        }
+
+        while (offset < last_stripe_end) {
+                struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+                if ((min_p <= p) && (p <= max_p)) {
+                        struct page *page;
+                        bool uptodate;
+
+                        BUG_ON(_1ps->pages[c]);
+                        page = ios->r4w->get_page(ios->private, offset,
+                                                  &uptodate);
+                        if (unlikely(!page))
+                                return -ENOMEM;
+
+                        _1ps->pages[c] = page;
+                        /* Mark read-pages to be cache_released */
+                        _1ps->page_is_read[c] = true;
+                        if (!uptodate)
+                                _add_to_read_4_write(ios, &read_si, page);
+                }
+
+                offset += PAGE_SIZE;
+                if (p == (sp2d->pages_in_unit - 1)) {
+                        ++c;
+                        p = 0;
+                        ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+                } else {
+                        read_si.obj_offset += PAGE_SIZE;
+                        ++p;
+                }
+        }
+
+read_it:
+        ios_read = ios->ios_read_4_write;
+        if (!ios_read)
+                return 0;
+
+        /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change
+         * to check for per_dev->bio
+         */
+        ios_read->pages = ios->pages;
+
+        /* Now read these devices */
+        for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) {
+                ret = _ore_read_mirror(ios_read, i);
+                if (unlikely(ret))
+                        return ret;
+        }
+
+        ret = ore_io_execute(ios_read); /* Synchronus execution */
+        if (unlikely(ret)) {
+                ORE_DBGMSG("!! ore_io_execute => %d\n", ret);
+                return ret;
+        }
+
+        _mark_read4write_pages_uptodate(ios_read, ret);
+        return 0;
+}
+
 /* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */
 int _ore_add_parity_unit(struct ore_io_state *ios,
                          struct ore_striping_info *si,
@@ -86,42 +554,89 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
                 BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev);
                 _ore_add_sg_seg(per_dev, cur_len, true);
         } else {
+                struct __stripe_pages_2d *sp2d = ios->sp2d;
                 struct page **pages = ios->parity_pages + ios->cur_par_page;
-                unsigned num_pages = ios->layout->stripe_unit / PAGE_SIZE;
+                unsigned num_pages;
                 unsigned array_start = 0;
                 unsigned i;
                 int ret;
 
+                si->cur_pg = _sp2d_min_pg(sp2d);
+                num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg;
+
+                if (!cur_len) /* If last stripe operate on parity comp */
+                        si->cur_comp = sp2d->data_devs;
+
+                if (!per_dev->length) {
+                        per_dev->offset += si->cur_pg * PAGE_SIZE;
+                        /* If first stripe, Read in all read4write pages
+                         * (if needed) before we calculate the first parity.
+                         */
+                        _read_4_write(ios);
+                }
+
                 for (i = 0; i < num_pages; i++) {
                         pages[i] = _raid_page_alloc();
                         if (unlikely(!pages[i]))
                                 return -ENOMEM;
 
                         ++(ios->cur_par_page);
-                        /* TODO: only read support for now */
-                        clear_highpage(pages[i]);
                 }
 
-                ORE_DBGMSG("writing dev=%d num_pages=%d cur_par_page=%d",
-                           per_dev->dev, num_pages, ios->cur_par_page);
+                BUG_ON(si->cur_comp != sp2d->data_devs);
+                BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit);
 
                 ret = _ore_add_stripe_unit(ios, &array_start, 0, pages,
                                            per_dev, num_pages * PAGE_SIZE);
                 if (unlikely(ret))
                         return ret;
+
+                /* TODO: raid6 if (last_parity_dev) */
+                _gen_xor_unit(sp2d);
+                _sp2d_reset(sp2d, ios->r4w, ios->private);
         }
         return 0;
 }
 
 int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
 {
-        /*TODO: Only raid writes has stuff to add here */
+        struct ore_layout *layout = ios->layout;
+
+        if (ios->parity_pages) {
+                unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
+                unsigned stripe_size = ios->si.bytes_in_stripe;
+                u64 last_stripe, first_stripe;
+
+                if (_sp2d_alloc(pages_in_unit, layout->group_width,
+                                layout->parity, &ios->sp2d)) {
+                        return -ENOMEM;
+                }
+
+                BUG_ON(ios->offset % PAGE_SIZE);
+
+                /* Round io down to last full strip */
+                first_stripe = div_u64(ios->offset, stripe_size);
+                last_stripe = div_u64(ios->offset + ios->length, stripe_size);
+
+                /* If an IO spans more then a single stripe it must end at
+                 * a stripe boundary. The reminder at the end is pushed into the
+                 * next IO.
+                 */
+                if (last_stripe != first_stripe) {
+                        ios->length = last_stripe * stripe_size - ios->offset;
+
+                        BUG_ON(!ios->length);
+                        ios->nr_pages = (ios->length + PAGE_SIZE - 1) /
+                                        PAGE_SIZE;
+                        ios->si.length = ios->length; /*make it consistent */
+                }
+        }
         return 0;
 }
 
 void _ore_free_raid_stuff(struct ore_io_state *ios)
 {
-        if (ios->parity_pages) { /* writing and raid */
+        if (ios->sp2d) { /* writing and raid */
                 unsigned i;
 
                 for (i = 0; i < ios->cur_par_page; i++) {
@@ -132,9 +647,14 @@ void _ore_free_raid_stuff(struct ore_io_state *ios)
                 }
                 if (ios->extra_part_alloc)
                         kfree(ios->parity_pages);
+                /* If IO returned an error pages might need unlocking */
+                _sp2d_reset(ios->sp2d, ios->r4w, ios->private);
+                _sp2d_free(ios->sp2d);
         } else {
                 /* Will only be set if raid reading && sglist is big */
                 if (ios->extra_part_alloc)
                         kfree(ios->per_dev[0].sglist);
         }
+        if (ios->ios_read_4_write)
+                ore_put_io_state(ios->ios_read_4_write);
 }
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
index c21080b4407f..2ffd2c3c6e46 100644
--- a/fs/exofs/ore_raid.h
+++ b/fs/exofs/ore_raid.h
@@ -57,8 +57,23 @@ void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
                      bool not_last);
 int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
                          struct ore_per_dev_state *per_dev, unsigned cur_len);
+void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
+                          struct ore_striping_info *si, struct page *page);
+static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
+                                    struct ore_striping_info *si, struct page *page)
+{
+        if (!sp2d) /* Inline the fast path */
+                return; /* Hay no raid stuff */
+        _ore_add_stripe_page(sp2d, si, page);
+}
 
 /* ios.c stuff needed by ios_raid.c */
+int _ore_get_io_state(struct ore_layout *layout,
+                      struct ore_components *oc, unsigned numdevs,
+                      unsigned sgs_per_dev, unsigned num_par_pages,
+                      struct ore_io_state **pios);
 int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
                          unsigned pgbase, struct page **pages,
                          struct ore_per_dev_state *per_dev, int cur_len);
+int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp);
+int ore_io_execute(struct ore_io_state *ios);
diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h
index 43821c18cd3f..f05fa826f89e 100644
--- a/include/scsi/osd_ore.h
+++ b/include/scsi/osd_ore.h
@@ -99,11 +99,17 @@ struct ore_striping_info {
         unsigned dev;
         unsigned par_dev;
         unsigned unit_off;
+        unsigned cur_pg;
         unsigned cur_comp;
 };
 
 struct ore_io_state;
 typedef void (*ore_io_done_fn)(struct ore_io_state *ios, void *private);
+struct _ore_r4w_op {
+        /* @Priv given here is passed ios->private */
+        struct page * (*get_page)(void *priv, u64 page_index, bool *uptodate);
+        void (*put_page)(void *priv, struct page *page);
+};
 
 struct ore_io_state {
         struct kref kref;
@@ -139,6 +145,9 @@ struct ore_io_state {
         unsigned max_par_pages;
         unsigned cur_par_page;
         unsigned sgs_per_dev;
+        struct __stripe_pages_2d *sp2d;
+        struct ore_io_state *ios_read_4_write;
+        const struct _ore_r4w_op *r4w;
 
         /* Variable array of size numdevs */
         unsigned numdevs;