Diffstat (limited to 'fs/exofs/ore_raid.c')
-rw-r--r-- | fs/exofs/ore_raid.c | 660 |
1 files changed, 660 insertions, 0 deletions
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
new file mode 100644
index 000000000000..29c47e5c4a86
--- /dev/null
+++ b/fs/exofs/ore_raid.c
@@ -0,0 +1,660 @@
1 | /* | ||
2 | * Copyright (C) 2011 | ||
3 | * Boaz Harrosh <bharrosh@panasas.com> | ||
4 | * | ||
5 | * This file is part of the objects raid engine (ore). | ||
6 | * | ||
7 | * It is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as published | ||
9 | * by the Free Software Foundation. | ||
10 | * | ||
11 | * You should have received a copy of the GNU General Public License | ||
12 | * along with "ore". If not, write to the Free Software Foundation, Inc: | ||
13 | * "Free Software Foundation <info@fsf.org>" | ||
14 | */ | ||
15 | |||
16 | #include <linux/gfp.h> | ||
17 | #include <linux/async_tx.h> | ||
18 | |||
19 | #include "ore_raid.h" | ||
20 | |||
21 | #undef ORE_DBGMSG2 | ||
22 | #define ORE_DBGMSG2 ORE_DBGMSG | ||
23 | |||
24 | struct page *_raid_page_alloc(void) | ||
25 | { | ||
26 | return alloc_page(GFP_KERNEL); | ||
27 | } | ||
28 | |||
29 | void _raid_page_free(struct page *p) | ||
30 | { | ||
31 | __free_page(p); | ||
32 | } | ||
33 | |||
34 | /* This struct is forward declared in ore_io_state, but is private to here. | ||
35 | * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit. | ||
36 | * | ||
37 | * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn. | ||
38 | * Ascending page index access is sp2d(p-minor, c-major). But storage is | ||
39 | * sp2d[p-minor][c-major], so it can be properly presented to the async-xor | ||
40 | * API. | ||
41 | */ | ||
42 | struct __stripe_pages_2d { | ||
43 | /* Cache some hot path repeated calculations */ | ||
44 | unsigned parity; | ||
45 | unsigned data_devs; | ||
46 | unsigned pages_in_unit; | ||
47 | |||
48 | bool needed; | ||
49 | |||
50 | /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */ | ||
51 | struct __1_page_stripe { | ||
52 | bool alloc; | ||
53 | unsigned write_count; | ||
54 | struct async_submit_ctl submit; | ||
55 | struct dma_async_tx_descriptor *tx; | ||
56 | |||
57 | /* The size of this array is data_devs + parity */ | ||
58 | struct page **pages; | ||
59 | struct page **scribble; | ||
60 | /* bool array, size of this array is data_devs */ | ||
61 | char *page_is_read; | ||
62 | } _1p_stripes[]; | ||
63 | }; | ||
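/* Example of the corner turn (illustrative layout: data_devs == 2, parity == 1,
 * pages_in_unit == 2). File pages are added one device-unit at a time, i.e.
 * p advances fastest and c is advanced by the outer loop, but they are stored
 * per page-row:
 *
 *   _1p_stripes[0].pages[] == { dev0/pg0, dev1/pg0, parity-pg0 }
 *   _1p_stripes[1].pages[] == { dev0/pg1, dev1/pg1, parity-pg1 }
 *
 * Each row is then exactly one source/dest set for async_xor() in
 * _gen_xor_unit().
 */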
64 | |||
65 | /* This can get bigger than a page, so support multiple page allocations. | ||
66 | * _sp2d_free should be called even if _sp2d_alloc fails (by returning | ||
67 | * non-zero). | ||
68 | */ | ||
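/* Illustration (sizes depend on the actual layout): when sizeof(*_aab) would
 * exceed PAGE_SIZE, only the first num_a1pa per-stripe arrays are carved out
 * of the initial kzalloc. The rest are allocated in additional, PAGE_SIZE
 * bounded chunks inside the loop, and the stripe that receives the first
 * array of each extra chunk gets .alloc = true so that _sp2d_free() knows to
 * kfree() that chunk through its ->pages pointer.
 */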
69 | static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, | ||
70 | unsigned parity, struct __stripe_pages_2d **psp2d) | ||
71 | { | ||
72 | struct __stripe_pages_2d *sp2d; | ||
73 | unsigned data_devs = group_width - parity; | ||
74 | struct _alloc_all_bytes { | ||
75 | struct __alloc_stripe_pages_2d { | ||
76 | struct __stripe_pages_2d sp2d; | ||
77 | struct __1_page_stripe _1p_stripes[pages_in_unit]; | ||
78 | } __asp2d; | ||
79 | struct __alloc_1p_arrays { | ||
80 | struct page *pages[group_width]; | ||
81 | struct page *scribble[group_width]; | ||
82 | char page_is_read[data_devs]; | ||
83 | } __a1pa[pages_in_unit]; | ||
84 | } *_aab; | ||
85 | struct __alloc_1p_arrays *__a1pa; | ||
86 | struct __alloc_1p_arrays *__a1pa_end; | ||
87 | const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]); | ||
88 | unsigned num_a1pa, alloc_size, i; | ||
89 | |||
90 | /* FIXME: check these numbers in ore_verify_layout */ | ||
91 | BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE); | ||
92 | BUG_ON(sizeof__a1pa > PAGE_SIZE); | ||
93 | |||
94 | if (sizeof(*_aab) > PAGE_SIZE) { | ||
95 | num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa; | ||
96 | alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa; | ||
97 | } else { | ||
98 | num_a1pa = pages_in_unit; | ||
99 | alloc_size = sizeof(*_aab); | ||
100 | } | ||
101 | |||
102 | _aab = kzalloc(alloc_size, GFP_KERNEL); | ||
103 | if (unlikely(!_aab)) { | ||
104 | ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size); | ||
105 | return -ENOMEM; | ||
106 | } | ||
107 | |||
108 | sp2d = &_aab->__asp2d.sp2d; | ||
109 | *psp2d = sp2d; /* From here on, just call _sp2d_free */ | ||
110 | |||
111 | __a1pa = _aab->__a1pa; | ||
112 | __a1pa_end = __a1pa + num_a1pa; | ||
113 | |||
114 | for (i = 0; i < pages_in_unit; ++i) { | ||
115 | if (unlikely(__a1pa >= __a1pa_end)) { | ||
116 | num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, | ||
117 | pages_in_unit - i); | ||
118 | |||
119 | __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL); | ||
120 | if (unlikely(!__a1pa)) { | ||
121 | ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", | ||
122 | num_a1pa); | ||
123 | return -ENOMEM; | ||
124 | } | ||
125 | __a1pa_end = __a1pa + num_a1pa; | ||
126 | /* First ->pages of each extra chunk is marked for kfree of that buffer */ | ||
127 | sp2d->_1p_stripes[i].alloc = true; | ||
128 | } | ||
129 | |||
130 | sp2d->_1p_stripes[i].pages = __a1pa->pages; | ||
131 | sp2d->_1p_stripes[i].scribble = __a1pa->scribble; | ||
132 | sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read; | ||
133 | ++__a1pa; | ||
134 | } | ||
135 | |||
136 | sp2d->parity = parity; | ||
137 | sp2d->data_devs = data_devs; | ||
138 | sp2d->pages_in_unit = pages_in_unit; | ||
139 | return 0; | ||
140 | } | ||
141 | |||
142 | static void _sp2d_reset(struct __stripe_pages_2d *sp2d, | ||
143 | const struct _ore_r4w_op *r4w, void *priv) | ||
144 | { | ||
145 | unsigned data_devs = sp2d->data_devs; | ||
146 | unsigned group_width = data_devs + sp2d->parity; | ||
147 | unsigned p; | ||
148 | |||
149 | if (!sp2d->needed) | ||
150 | return; | ||
151 | |||
152 | for (p = 0; p < sp2d->pages_in_unit; p++) { | ||
153 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
154 | |||
155 | if (_1ps->write_count < group_width) { | ||
156 | unsigned c; | ||
157 | |||
158 | for (c = 0; c < data_devs; c++) | ||
159 | if (_1ps->page_is_read[c]) { | ||
160 | struct page *page = _1ps->pages[c]; | ||
161 | |||
162 | r4w->put_page(priv, page); | ||
163 | _1ps->page_is_read[c] = false; | ||
164 | } | ||
165 | } | ||
166 | |||
167 | memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); | ||
168 | _1ps->write_count = 0; | ||
169 | _1ps->tx = NULL; | ||
170 | } | ||
171 | |||
172 | sp2d->needed = false; | ||
173 | } | ||
174 | |||
175 | static void _sp2d_free(struct __stripe_pages_2d *sp2d) | ||
176 | { | ||
177 | unsigned i; | ||
178 | |||
179 | if (!sp2d) | ||
180 | return; | ||
181 | |||
182 | for (i = 0; i < sp2d->pages_in_unit; ++i) { | ||
183 | if (sp2d->_1p_stripes[i].alloc) | ||
184 | kfree(sp2d->_1p_stripes[i].pages); | ||
185 | } | ||
186 | |||
187 | kfree(sp2d); | ||
188 | } | ||
189 | |||
190 | static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) | ||
191 | { | ||
192 | unsigned p; | ||
193 | |||
194 | for (p = 0; p < sp2d->pages_in_unit; p++) { | ||
195 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
196 | |||
197 | if (_1ps->write_count) | ||
198 | return p; | ||
199 | } | ||
200 | |||
201 | return ~0; | ||
202 | } | ||
203 | |||
204 | static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) | ||
205 | { | ||
206 | int p; | ||
207 | |||
208 | for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { | ||
209 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
210 | |||
211 | if (_1ps->write_count) | ||
212 | return p; | ||
213 | } | ||
214 | |||
215 | return ~0; | ||
216 | } | ||
217 | |||
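/* Compute the RAID5 parity for every page-row that has something to write:
 * pages[data_devs] (the parity page, zeroed first because of
 * ASYNC_TX_XOR_ZERO_DST) receives the XOR of pages[0]..pages[data_devs - 1].
 * E.g. with data_devs == 2 the result is parity = pages[0] ^ pages[1],
 * byte-wise over PAGE_SIZE.
 */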
218 | static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) | ||
219 | { | ||
220 | unsigned p; | ||
221 | for (p = 0; p < sp2d->pages_in_unit; p++) { | ||
222 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
223 | |||
224 | if (!_1ps->write_count) | ||
225 | continue; | ||
226 | |||
227 | init_async_submit(&_1ps->submit, | ||
228 | ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, | ||
229 | NULL, | ||
230 | NULL, NULL, | ||
231 | (addr_conv_t *)_1ps->scribble); | ||
232 | |||
233 | /* TODO: raid6 */ | ||
234 | _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, | ||
235 | 0, sp2d->data_devs, PAGE_SIZE, | ||
236 | &_1ps->submit); | ||
237 | } | ||
238 | |||
239 | for (p = 0; p < sp2d->pages_in_unit; p++) { | ||
240 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
241 | /* NOTE: We wait for HW synchronously (I don't have such HW | ||
242 | * to test with.) Is parallelism needed with today's multi | ||
243 | * cores? | ||
244 | */ | ||
245 | async_tx_issue_pending(_1ps->tx); | ||
246 | } | ||
247 | } | ||
248 | |||
249 | void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, | ||
250 | struct ore_striping_info *si, struct page *page) | ||
251 | { | ||
252 | struct __1_page_stripe *_1ps; | ||
253 | |||
254 | sp2d->needed = true; | ||
255 | |||
256 | _1ps = &sp2d->_1p_stripes[si->cur_pg]; | ||
257 | _1ps->pages[si->cur_comp] = page; | ||
258 | ++_1ps->write_count; | ||
259 | |||
260 | si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit; | ||
261 | /* si->cur_comp is advanced outside, in the main loop */ | ||
262 | } | ||
263 | |||
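/* Build up the per-device osd scatter-gather list for a RAID read that must
 * skip this device's parity units (or a gap). @cur_len is the length being
 * skipped. Rough example, with illustrative offsets and a 0x4000 stripe_unit:
 * a device holding data at 0x0000-0x3fff, parity at 0x4000-0x7fff and data
 * again at 0x8000-0xbfff ends up with
 *   sglist[0] = { .offset = 0x0000, .len = 0x4000 }
 *   sglist[1] = { .offset = 0x8000, .len = 0x4000 }
 */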
264 | void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, | ||
265 | bool not_last) | ||
266 | { | ||
267 | struct osd_sg_entry *sge; | ||
268 | |||
269 | ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d " | ||
270 | "offset=0x%llx length=0x%x last_sgs_total=0x%x\n", | ||
271 | per_dev->dev, cur_len, not_last, per_dev->cur_sg, | ||
272 | _LLU(per_dev->offset), per_dev->length, | ||
273 | per_dev->last_sgs_total); | ||
274 | |||
275 | if (!per_dev->cur_sg) { | ||
276 | sge = per_dev->sglist; | ||
277 | |||
278 | /* First time we prepare two entries */ | ||
279 | if (per_dev->length) { | ||
280 | ++per_dev->cur_sg; | ||
281 | sge->offset = per_dev->offset; | ||
282 | sge->len = per_dev->length; | ||
283 | } else { | ||
284 | /* Here the parity is the first unit of this object. | ||
285 | * This happens every time we reach a parity device on | ||
286 | * the same stripe as the per_dev->offset. We need to | ||
287 | * just skip this unit. | ||
288 | */ | ||
289 | per_dev->offset += cur_len; | ||
290 | return; | ||
291 | } | ||
292 | } else { | ||
293 | /* finalize the last one */ | ||
294 | sge = &per_dev->sglist[per_dev->cur_sg - 1]; | ||
295 | sge->len = per_dev->length - per_dev->last_sgs_total; | ||
296 | } | ||
297 | |||
298 | if (not_last) { | ||
299 | /* Partly prepare the next one */ | ||
300 | struct osd_sg_entry *next_sge = sge + 1; | ||
301 | |||
302 | ++per_dev->cur_sg; | ||
303 | next_sge->offset = sge->offset + sge->len + cur_len; | ||
304 | /* Save cur len so we know how much was added next time */ | ||
305 | per_dev->last_sgs_total = per_dev->length; | ||
306 | next_sge->len = 0; | ||
307 | } else if (!sge->len) { | ||
308 | /* Optimize for when the last unit is a parity */ | ||
309 | --per_dev->cur_sg; | ||
310 | } | ||
311 | } | ||
312 | |||
313 | static int _alloc_read_4_write(struct ore_io_state *ios) | ||
314 | { | ||
315 | struct ore_layout *layout = ios->layout; | ||
316 | int ret; | ||
317 | /* We want to only read those pages not in cache so worst case | ||
318 | * is a stripe populated with every other page | ||
319 | */ | ||
320 | unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2; | ||
321 | |||
322 | ret = _ore_get_io_state(layout, ios->oc, | ||
323 | layout->group_width * layout->mirrors_p1, | ||
324 | sgs_per_dev, 0, &ios->ios_read_4_write); | ||
325 | return ret; | ||
326 | } | ||
327 | |||
328 | /* @si contains info of the to-be-inserted page. Update of @si should be | ||
329 | * maintained by caller. Specifically si->dev, si->obj_offset, ... | ||
330 | */ | ||
331 | static int _add_to_read_4_write(struct ore_io_state *ios, | ||
332 | struct ore_striping_info *si, struct page *page) | ||
333 | { | ||
334 | struct request_queue *q; | ||
335 | struct ore_per_dev_state *per_dev; | ||
336 | struct ore_io_state *read_ios; | ||
337 | unsigned first_dev = si->dev - (si->dev % | ||
338 | (ios->layout->group_width * ios->layout->mirrors_p1)); | ||
339 | unsigned comp = si->dev - first_dev; | ||
340 | unsigned added_len; | ||
341 | |||
342 | if (!ios->ios_read_4_write) { | ||
343 | int ret = _alloc_read_4_write(ios); | ||
344 | |||
345 | if (unlikely(ret)) | ||
346 | return ret; | ||
347 | } | ||
348 | |||
349 | read_ios = ios->ios_read_4_write; | ||
350 | read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1; | ||
351 | |||
352 | per_dev = &read_ios->per_dev[comp]; | ||
353 | if (!per_dev->length) { | ||
354 | per_dev->bio = bio_kmalloc(GFP_KERNEL, | ||
355 | ios->sp2d->pages_in_unit); | ||
356 | if (unlikely(!per_dev->bio)) { | ||
357 | ORE_DBGMSG("Failed to allocate BIO size=%u\n", | ||
358 | ios->sp2d->pages_in_unit); | ||
359 | return -ENOMEM; | ||
360 | } | ||
361 | per_dev->offset = si->obj_offset; | ||
362 | per_dev->dev = si->dev; | ||
363 | } else if (si->obj_offset != (per_dev->offset + per_dev->length)) { | ||
364 | u64 gap = si->obj_offset - (per_dev->offset + per_dev->length); | ||
365 | |||
366 | _ore_add_sg_seg(per_dev, gap, true); | ||
367 | } | ||
368 | q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); | ||
369 | added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0); | ||
370 | if (unlikely(added_len != PAGE_SIZE)) { | ||
371 | ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", | ||
372 | per_dev->bio->bi_vcnt); | ||
373 | return -ENOMEM; | ||
374 | } | ||
375 | |||
376 | per_dev->length += PAGE_SIZE; | ||
377 | return 0; | ||
378 | } | ||
379 | |||
380 | static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) | ||
381 | { | ||
382 | struct bio_vec *bv; | ||
383 | unsigned i, d; | ||
384 | |||
385 | /* loop on all devices all pages */ | ||
386 | for (d = 0; d < ios->numdevs; d++) { | ||
387 | struct bio *bio = ios->per_dev[d].bio; | ||
388 | |||
389 | if (!bio) | ||
390 | continue; | ||
391 | |||
392 | __bio_for_each_segment(bv, bio, i, 0) { | ||
393 | struct page *page = bv->bv_page; | ||
394 | |||
395 | SetPageUptodate(page); | ||
396 | if (PageError(page)) | ||
397 | ClearPageError(page); | ||
398 | } | ||
399 | } | ||
400 | } | ||
401 | |||
402 | /* read_4_write is hacked to read the start of the first stripe and/or | ||
403 | * the end of the last stripe. If needed, with an sg-gap at each device/page. | ||
404 | * It is assumed to be called after the to_be_written pages of the first stripe | ||
405 | * have been added to ios->sp2d[][]. | ||
406 | * | ||
407 | * NOTE: We call ios->r4w->get_page for all pages needed for parity | ||
408 | * calculations. These pages are held at sp2d[p].pages[c] but with | ||
409 | * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are released | ||
410 | * via ios->r4w->put_page(). ios->r4w->get_page might signal that the page is | ||
411 | * already @uptodate=true, so we don't need to read it, only release it after IO. | ||
412 | * | ||
413 | * TODO: read_4_write should calculate a need_to_read_pages_count; if bigger than | ||
414 | * the to-be-written count, we should consider the xor-in-place mode. | ||
415 | * need_to_read_pages_count is the actual number of pages not present in cache; | ||
416 | * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough | ||
417 | * approximation? In this mode the read pages are put in the empty places of | ||
418 | * ios->sp2d[p][*] and the xor is calculated the same way. These pages are | ||
419 | * allocated/freed and don't go through the cache. | ||
420 | */ | ||
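/* Sketch of a partial-stripe write (illustrative numbers): with
 * pages_in_unit == 4 and a write that only dirties page-rows 1..2 of the
 * unit, min_p == 1 and max_p == 2. The slots of rows 1..2 that are not
 * covered by to-be-written pages are filled through r4w->get_page() (and
 * actually read from the OSDs unless already uptodate), so that
 * _gen_xor_unit() sees complete rows to XOR.
 */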
421 | static int _read_4_write(struct ore_io_state *ios) | ||
422 | { | ||
423 | struct ore_io_state *ios_read; | ||
424 | struct ore_striping_info read_si; | ||
425 | struct __stripe_pages_2d *sp2d = ios->sp2d; | ||
426 | u64 offset = ios->si.first_stripe_start; | ||
427 | u64 last_stripe_end; | ||
428 | unsigned bytes_in_stripe = ios->si.bytes_in_stripe; | ||
429 | unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1; | ||
430 | int ret; | ||
431 | |||
432 | if (offset == ios->offset) /* Go to start collect $200 */ | ||
433 | goto read_last_stripe; | ||
434 | |||
435 | min_p = _sp2d_min_pg(sp2d); | ||
436 | max_p = _sp2d_max_pg(sp2d); | ||
437 | |||
438 | for (c = 0; ; c++) { | ||
439 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | ||
440 | read_si.obj_offset += min_p * PAGE_SIZE; | ||
441 | offset += min_p * PAGE_SIZE; | ||
442 | for (p = min_p; p <= max_p; p++) { | ||
443 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
444 | struct page **pp = &_1ps->pages[c]; | ||
445 | bool uptodate; | ||
446 | |||
447 | if (*pp) | ||
448 | /* to-be-written pages start here */ | ||
449 | goto read_last_stripe; | ||
450 | |||
451 | *pp = ios->r4w->get_page(ios->private, offset, | ||
452 | &uptodate); | ||
453 | if (unlikely(!*pp)) | ||
454 | return -ENOMEM; | ||
455 | |||
456 | if (!uptodate) | ||
457 | _add_to_read_4_write(ios, &read_si, *pp); | ||
458 | |||
459 | /* Mark read-pages to be cache_released */ | ||
460 | _1ps->page_is_read[c] = true; | ||
461 | read_si.obj_offset += PAGE_SIZE; | ||
462 | offset += PAGE_SIZE; | ||
463 | } | ||
464 | offset += (sp2d->pages_in_unit - p) * PAGE_SIZE; | ||
465 | } | ||
466 | |||
467 | read_last_stripe: | ||
468 | offset = ios->offset + (ios->length + PAGE_SIZE - 1) / | ||
469 | PAGE_SIZE * PAGE_SIZE; | ||
470 | last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) | ||
471 | * bytes_in_stripe; | ||
472 | if (offset == last_stripe_end) /* Optimize for the aligned case */ | ||
473 | goto read_it; | ||
474 | |||
475 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | ||
476 | p = read_si.unit_off / PAGE_SIZE; | ||
477 | c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, | ||
478 | ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); | ||
479 | |||
480 | BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end); | ||
481 | /* unaligned IO must be within a single stripe */ | ||
482 | |||
483 | if (min_p == sp2d->pages_in_unit) { | ||
484 | /* Didn't do it yet */ | ||
485 | min_p = _sp2d_min_pg(sp2d); | ||
486 | max_p = _sp2d_max_pg(sp2d); | ||
487 | } | ||
488 | |||
489 | while (offset < last_stripe_end) { | ||
490 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
491 | |||
492 | if ((min_p <= p) && (p <= max_p)) { | ||
493 | struct page *page; | ||
494 | bool uptodate; | ||
495 | |||
496 | BUG_ON(_1ps->pages[c]); | ||
497 | page = ios->r4w->get_page(ios->private, offset, | ||
498 | &uptodate); | ||
499 | if (unlikely(!page)) | ||
500 | return -ENOMEM; | ||
501 | |||
502 | _1ps->pages[c] = page; | ||
503 | /* Mark read-pages to be cache_released */ | ||
504 | _1ps->page_is_read[c] = true; | ||
505 | if (!uptodate) | ||
506 | _add_to_read_4_write(ios, &read_si, page); | ||
507 | } | ||
508 | |||
509 | offset += PAGE_SIZE; | ||
510 | if (p == (sp2d->pages_in_unit - 1)) { | ||
511 | ++c; | ||
512 | p = 0; | ||
513 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | ||
514 | } else { | ||
515 | read_si.obj_offset += PAGE_SIZE; | ||
516 | ++p; | ||
517 | } | ||
518 | } | ||
519 | |||
520 | read_it: | ||
521 | ios_read = ios->ios_read_4_write; | ||
522 | if (!ios_read) | ||
523 | return 0; | ||
524 | |||
525 | /* FIXME: Ugly to signal _ore_read_mirror that we have bio(s). Change | ||
526 | * to check for per_dev->bio | ||
527 | */ | ||
528 | ios_read->pages = ios->pages; | ||
529 | |||
530 | /* Now read these devices */ | ||
531 | for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) { | ||
532 | ret = _ore_read_mirror(ios_read, i); | ||
533 | if (unlikely(ret)) | ||
534 | return ret; | ||
535 | } | ||
536 | |||
537 | ret = ore_io_execute(ios_read); /* Synchronous execution */ | ||
538 | if (unlikely(ret)) { | ||
539 | ORE_DBGMSG("!! ore_io_execute => %d\n", ret); | ||
540 | return ret; | ||
541 | } | ||
542 | |||
543 | _mark_read4write_pages_uptodate(ios_read, ret); | ||
544 | return 0; | ||
545 | } | ||
546 | |||
547 | /* In writes @cur_len means length left, i.e. cur_len==0 is the last parity unit */ | ||
548 | int _ore_add_parity_unit(struct ore_io_state *ios, | ||
549 | struct ore_striping_info *si, | ||
550 | struct ore_per_dev_state *per_dev, | ||
551 | unsigned cur_len) | ||
552 | { | ||
553 | if (ios->reading) { | ||
554 | BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev); | ||
555 | _ore_add_sg_seg(per_dev, cur_len, true); | ||
556 | } else { | ||
557 | struct __stripe_pages_2d *sp2d = ios->sp2d; | ||
558 | struct page **pages = ios->parity_pages + ios->cur_par_page; | ||
559 | unsigned num_pages; | ||
560 | unsigned array_start = 0; | ||
561 | unsigned i; | ||
562 | int ret; | ||
563 | |||
564 | si->cur_pg = _sp2d_min_pg(sp2d); | ||
565 | num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; | ||
566 | |||
567 | if (!cur_len) /* If last stripe operate on parity comp */ | ||
568 | si->cur_comp = sp2d->data_devs; | ||
569 | |||
570 | if (!per_dev->length) { | ||
571 | per_dev->offset += si->cur_pg * PAGE_SIZE; | ||
572 | /* If first stripe, read in all read4write pages | ||
573 | * (if needed) before we calculate the first parity. | ||
574 | */ | ||
575 | _read_4_write(ios); | ||
576 | } | ||
577 | |||
578 | for (i = 0; i < num_pages; i++) { | ||
579 | pages[i] = _raid_page_alloc(); | ||
580 | if (unlikely(!pages[i])) | ||
581 | return -ENOMEM; | ||
582 | |||
583 | ++(ios->cur_par_page); | ||
584 | } | ||
585 | |||
586 | BUG_ON(si->cur_comp != sp2d->data_devs); | ||
587 | BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); | ||
588 | |||
589 | ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, | ||
590 | per_dev, num_pages * PAGE_SIZE); | ||
591 | if (unlikely(ret)) | ||
592 | return ret; | ||
593 | |||
594 | /* TODO: raid6 if (last_parity_dev) */ | ||
595 | _gen_xor_unit(sp2d); | ||
596 | _sp2d_reset(sp2d, ios->r4w, ios->private); | ||
597 | } | ||
598 | return 0; | ||
599 | } | ||
600 | |||
601 | int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) | ||
602 | { | ||
603 | struct ore_layout *layout = ios->layout; | ||
604 | |||
605 | if (ios->parity_pages) { | ||
606 | unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; | ||
607 | unsigned stripe_size = ios->si.bytes_in_stripe; | ||
608 | u64 last_stripe, first_stripe; | ||
609 | |||
610 | if (_sp2d_alloc(pages_in_unit, layout->group_width, | ||
611 | layout->parity, &ios->sp2d)) { | ||
612 | return -ENOMEM; | ||
613 | } | ||
614 | |||
615 | BUG_ON(ios->offset % PAGE_SIZE); | ||
616 | |||
617 | /* Round io down to last full stripe */ | ||
618 | first_stripe = div_u64(ios->offset, stripe_size); | ||
619 | last_stripe = div_u64(ios->offset + ios->length, stripe_size); | ||
620 | |||
621 | /* If an IO spans more than a single stripe it must end at | ||
622 | * a stripe boundary. The remainder at the end is pushed into the | ||
623 | * next IO. | ||
624 | */ | ||
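/* E.g. (illustrative): stripe_size == 256K, offset == 0, length == 300K
 * gives first_stripe == 0, last_stripe == 1, so length is trimmed to 256K
 * and the trailing 44K is left for the caller's next IO.
 */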
625 | if (last_stripe != first_stripe) { | ||
626 | ios->length = last_stripe * stripe_size - ios->offset; | ||
627 | |||
628 | BUG_ON(!ios->length); | ||
629 | ios->nr_pages = (ios->length + PAGE_SIZE - 1) / | ||
630 | PAGE_SIZE; | ||
631 | ios->si.length = ios->length; /* make it consistent */ | ||
632 | } | ||
633 | } | ||
634 | return 0; | ||
635 | } | ||
636 | |||
637 | void _ore_free_raid_stuff(struct ore_io_state *ios) | ||
638 | { | ||
639 | if (ios->sp2d) { /* writing and raid */ | ||
640 | unsigned i; | ||
641 | |||
642 | for (i = 0; i < ios->cur_par_page; i++) { | ||
643 | struct page *page = ios->parity_pages[i]; | ||
644 | |||
645 | if (page) | ||
646 | _raid_page_free(page); | ||
647 | } | ||
648 | if (ios->extra_part_alloc) | ||
649 | kfree(ios->parity_pages); | ||
650 | /* If IO returned an error pages might need unlocking */ | ||
651 | _sp2d_reset(ios->sp2d, ios->r4w, ios->private); | ||
652 | _sp2d_free(ios->sp2d); | ||
653 | } else { | ||
654 | /* Will only be set if raid reading && sglist is big */ | ||
655 | if (ios->extra_part_alloc) | ||
656 | kfree(ios->per_dev[0].sglist); | ||
657 | } | ||
658 | if (ios->ios_read_4_write) | ||
659 | ore_put_io_state(ios->ios_read_4_write); | ||
660 | } | ||