diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/exofs/Kbuild | 3 | ||||
-rw-r--r-- | fs/exofs/ore.c | 326 | ||||
-rw-r--r-- | fs/exofs/ore_raid.c | 140 | ||||
-rw-r--r-- | fs/exofs/ore_raid.h | 64 |
4 files changed, 455 insertions, 78 deletions
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index c5a5855a6c44..352ba149d23e 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild | |||
@@ -13,7 +13,8 @@ | |||
13 | # | 13 | # |
14 | 14 | ||
15 | # ore module library | 15 | # ore module library |
16 | obj-$(CONFIG_ORE) += ore.o | 16 | libore-y := ore.o ore_raid.o |
17 | obj-$(CONFIG_ORE) += libore.o | ||
17 | 18 | ||
18 | exofs-y := inode.o file.o symlink.o namei.o dir.o super.o | 19 | exofs-y := inode.o file.o symlink.o namei.o dir.o super.o |
19 | obj-$(CONFIG_EXOFS_FS) += exofs.o | 20 | obj-$(CONFIG_EXOFS_FS) += exofs.o |
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index d92998d5c2d6..fd6090ddd3bf 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c | |||
@@ -24,24 +24,9 @@ | |||
24 | 24 | ||
25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
26 | #include <asm/div64.h> | 26 | #include <asm/div64.h> |
27 | #include <linux/lcm.h> | ||
27 | 28 | ||
28 | #include <scsi/osd_ore.h> | 29 | #include "ore_raid.h" |
29 | |||
30 | #define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) | ||
31 | |||
32 | #ifdef CONFIG_EXOFS_DEBUG | ||
33 | #define ORE_DBGMSG(fmt, a...) \ | ||
34 | printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) | ||
35 | #else | ||
36 | #define ORE_DBGMSG(fmt, a...) \ | ||
37 | do { if (0) printk(fmt, ##a); } while (0) | ||
38 | #endif | ||
39 | |||
40 | /* u64 has problems with printk this will cast it to unsigned long long */ | ||
41 | #define _LLU(x) (unsigned long long)(x) | ||
42 | |||
43 | #define ORE_DBGMSG2(M...) do {} while (0) | ||
44 | /* #define ORE_DBGMSG2 ORE_DBGMSG */ | ||
45 | 30 | ||
46 | MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); | 31 | MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); |
47 | MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); | 32 | MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); |
@@ -133,21 +118,81 @@ static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) | |||
133 | return ore_comp_dev(ios->oc, index); | 118 | return ore_comp_dev(ios->oc, index); |
134 | } | 119 | } |
135 | 120 | ||
136 | static int _get_io_state(struct ore_layout *layout, | 121 | static int _ore_get_io_state(struct ore_layout *layout, |
137 | struct ore_components *oc, unsigned numdevs, | 122 | struct ore_components *oc, unsigned numdevs, |
138 | struct ore_io_state **pios) | 123 | unsigned sgs_per_dev, unsigned num_par_pages, |
124 | struct ore_io_state **pios) | ||
139 | { | 125 | { |
140 | struct ore_io_state *ios; | 126 | struct ore_io_state *ios; |
127 | struct page **pages; | ||
128 | struct osd_sg_entry *sgilist; | ||
129 | struct __alloc_all_io_state { | ||
130 | struct ore_io_state ios; | ||
131 | struct ore_per_dev_state per_dev[numdevs]; | ||
132 | union { | ||
133 | struct osd_sg_entry sglist[sgs_per_dev * numdevs]; | ||
134 | struct page *pages[num_par_pages]; | ||
135 | }; | ||
136 | } *_aios; | ||
137 | |||
138 | if (likely(sizeof(*_aios) <= PAGE_SIZE)) { | ||
139 | _aios = kzalloc(sizeof(*_aios), GFP_KERNEL); | ||
140 | if (unlikely(!_aios)) { | ||
141 | ORE_DBGMSG("Failed kzalloc bytes=%zd\n", | ||
142 | sizeof(*_aios)); | ||
143 | *pios = NULL; | ||
144 | return -ENOMEM; | ||
145 | } | ||
146 | pages = num_par_pages ? _aios->pages : NULL; | ||
147 | sgilist = sgs_per_dev ? _aios->sglist : NULL; | ||
148 | ios = &_aios->ios; | ||
149 | } else { | ||
150 | struct __alloc_small_io_state { | ||
151 | struct ore_io_state ios; | ||
152 | struct ore_per_dev_state per_dev[numdevs]; | ||
153 | } *_aio_small; | ||
154 | union __extra_part { | ||
155 | struct osd_sg_entry sglist[sgs_per_dev * numdevs]; | ||
156 | struct page *pages[num_par_pages]; | ||
157 | } *extra_part; | ||
158 | |||
159 | _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL); | ||
160 | if (unlikely(!_aio_small)) { | ||
161 | ORE_DBGMSG("Failed alloc first part bytes=%zd\n", | ||
162 | sizeof(*_aio_small)); | ||
163 | *pios = NULL; | ||
164 | return -ENOMEM; | ||
165 | } | ||
166 | extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL); | ||
167 | if (unlikely(!extra_part)) { | ||
168 | ORE_DBGMSG("Failed alloc second part bytes=%zd\n", | ||
169 | sizeof(*extra_part)); | ||
170 | kfree(_aio_small); | ||
171 | *pios = NULL; | ||
172 | return -ENOMEM; | ||
173 | } | ||
141 | 174 | ||
142 | /*TODO: Maybe use kmem_cach per sbi of size | 175 | pages = num_par_pages ? extra_part->pages : NULL; |
143 | * exofs_io_state_size(layout->s_numdevs) | 176 | sgilist = sgs_per_dev ? extra_part->sglist : NULL; |
144 | */ | 177 | /* In this case the per_dev[0].sglist holds the pointer to |
145 | ios = kzalloc(ore_io_state_size(numdevs), GFP_KERNEL); | 178 | * be freed |
146 | if (unlikely(!ios)) { | 179 | */ |
147 | ORE_DBGMSG("Failed kzalloc bytes=%d\n", | 180 | ios = &_aio_small->ios; |
148 | ore_io_state_size(numdevs)); | 181 | ios->extra_part_alloc = true; |
149 | *pios = NULL; | 182 | } |
150 | return -ENOMEM; | 183 | |
184 | if (pages) { | ||
185 | ios->parity_pages = pages; | ||
186 | ios->max_par_pages = num_par_pages; | ||
187 | } | ||
188 | if (sgilist) { | ||
189 | unsigned d; | ||
190 | |||
191 | for (d = 0; d < numdevs; ++d) { | ||
192 | ios->per_dev[d].sglist = sgilist; | ||
193 | sgilist += sgs_per_dev; | ||
194 | } | ||
195 | ios->sgs_per_dev = sgs_per_dev; | ||
151 | } | 196 | } |
152 | 197 | ||
153 | ios->layout = layout; | 198 | ios->layout = layout; |
@@ -178,9 +223,42 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, | |||
178 | { | 223 | { |
179 | struct ore_io_state *ios; | 224 | struct ore_io_state *ios; |
180 | unsigned numdevs = layout->group_width * layout->mirrors_p1; | 225 | unsigned numdevs = layout->group_width * layout->mirrors_p1; |
226 | unsigned sgs_per_dev = 0, max_par_pages = 0; | ||
181 | int ret; | 227 | int ret; |
182 | 228 | ||
183 | ret = _get_io_state(layout, oc, numdevs, pios); | 229 | if (layout->parity && length) { |
230 | unsigned data_devs = layout->group_width - layout->parity; | ||
231 | unsigned stripe_size = layout->stripe_unit * data_devs; | ||
232 | unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; | ||
233 | u32 remainder; | ||
234 | u64 num_stripes; | ||
235 | u64 num_raid_units; | ||
236 | |||
237 | num_stripes = div_u64_rem(length, stripe_size, &remainder); | ||
238 | if (remainder) | ||
239 | ++num_stripes; | ||
240 | |||
241 | num_raid_units = num_stripes * layout->parity; | ||
242 | |||
243 | if (is_reading) { | ||
244 | /* For reads add per_dev sglist array */ | ||
245 | /* TODO: Raid 6 we need twice more. Actually: | ||
246 | * num_stripes / LCMdP(W,P); | ||
247 | * if (W%P != 0) num_stripes *= parity; | ||
248 | */ | ||
249 | |||
250 | /* first/last seg is split */ | ||
251 | num_raid_units += layout->group_width; | ||
252 | sgs_per_dev = div_u64(num_raid_units, data_devs); | ||
253 | } else { | ||
254 | /* For Writes add parity pages array. */ | ||
255 | max_par_pages = num_raid_units * pages_in_unit * | ||
256 | sizeof(struct page *); | ||
257 | } | ||
258 | } | ||
259 | |||
260 | ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages, | ||
261 | pios); | ||
184 | if (unlikely(ret)) | 262 | if (unlikely(ret)) |
185 | return ret; | 263 | return ret; |
186 | 264 | ||
@@ -189,10 +267,11 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, | |||
189 | ios->offset = offset; | 267 | ios->offset = offset; |
190 | 268 | ||
191 | if (length) { | 269 | if (length) { |
192 | ore_calc_stripe_info(layout, offset, &ios->si); | 270 | ore_calc_stripe_info(layout, offset, length, &ios->si); |
193 | ios->length = (length <= ios->si.group_length) ? length : | 271 | ios->length = ios->si.length; |
194 | ios->si.group_length; | ||
195 | ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; | 272 | ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; |
273 | if (layout->parity) | ||
274 | _ore_post_alloc_raid_stuff(ios); | ||
196 | } | 275 | } |
197 | 276 | ||
198 | return 0; | 277 | return 0; |
@@ -209,7 +288,7 @@ EXPORT_SYMBOL(ore_get_rw_state); | |||
209 | int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, | 288 | int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, |
210 | struct ore_io_state **pios) | 289 | struct ore_io_state **pios) |
211 | { | 290 | { |
212 | return _get_io_state(layout, oc, oc->numdevs, pios); | 291 | return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios); |
213 | } | 292 | } |
214 | EXPORT_SYMBOL(ore_get_io_state); | 293 | EXPORT_SYMBOL(ore_get_io_state); |
215 | 294 | ||
@@ -227,6 +306,7 @@ void ore_put_io_state(struct ore_io_state *ios) | |||
227 | bio_put(per_dev->bio); | 306 | bio_put(per_dev->bio); |
228 | } | 307 | } |
229 | 308 | ||
309 | _ore_free_raid_stuff(ios); | ||
230 | kfree(ios); | 310 | kfree(ios); |
231 | } | 311 | } |
232 | } | 312 | } |
@@ -367,53 +447,65 @@ EXPORT_SYMBOL(ore_check_io); | |||
367 | /* | 447 | /* |
368 | * L - logical offset into the file | 448 | * L - logical offset into the file |
369 | * | 449 | * |
370 | * U - The number of bytes in a stripe within a group | 450 | * D - number of Data devices |
451 | * D = group_width - parity | ||
371 | * | 452 | * |
372 | * U = stripe_unit * group_width | 453 | * U - The number of bytes in a stripe within a group |
454 | * U = stripe_unit * D | ||
373 | * | 455 | * |
374 | * T - The number of bytes striped within a group of component objects | 456 | * T - The number of bytes striped within a group of component objects |
375 | * (before advancing to the next group) | 457 | * (before advancing to the next group) |
376 | * | 458 | * T = U * group_depth |
377 | * T = stripe_unit * group_width * group_depth | ||
378 | * | 459 | * |
379 | * S - The number of bytes striped across all component objects | 460 | * S - The number of bytes striped across all component objects |
380 | * before the pattern repeats | 461 | * before the pattern repeats |
462 | * S = T * group_count | ||
381 | * | 463 | * |
382 | * S = stripe_unit * group_width * group_depth * group_count | 464 | * M - The "major" (i.e., across all components) cycle number |
383 | * | ||
384 | * M - The "major" (i.e., across all components) stripe number | ||
385 | * | ||
386 | * M = L / S | 465 | * M = L / S |
387 | * | 466 | * |
388 | * G - Counts the groups from the beginning of the major stripe | 467 | * G - Counts the groups from the beginning of the major cycle |
389 | * | ||
390 | * G = (L - (M * S)) / T [or (L % S) / T] | 468 | * G = (L - (M * S)) / T [or (L % S) / T] |
391 | * | 469 | * |
392 | * H - The byte offset within the group | 470 | * H - The byte offset within the group |
393 | * | ||
394 | * H = (L - (M * S)) % T [or (L % S) % T] | 471 | * H = (L - (M * S)) % T [or (L % S) % T] |
395 | * | 472 | * |
396 | * N - The "minor" (i.e., across the group) stripe number | 473 | * N - The "minor" (i.e., across the group) stripe number |
397 | * | ||
398 | * N = H / U | 474 | * N = H / U |
399 | * | 475 | * |
400 | * C - The component index corresponding to L | 476 | * C - The component index corresponding to L |
401 | * | 477 | * |
402 | * C = (H - (N * U)) / stripe_unit + G * group_width | 478 | * C = (H - (N * U)) / stripe_unit + G * D |
403 | * [or (L % U) / stripe_unit + G * group_width] | 479 | * [or (L % U) / stripe_unit + G * D] |
404 | * | 480 | * |
405 | * O - The component offset corresponding to L | 481 | * O - The component offset corresponding to L |
406 | * | ||
407 | * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit | 482 | * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit |
483 | * | ||
484 | * LCMdP - Parity cycle: Lowest Common Multiple of group_width and parity, | ||
485 | * divided by parity | ||
486 | * LCMdP = lcm(group_width, parity) / parity | ||
487 | * | ||
488 | * R - The parity Rotation stripe | ||
489 | * (Note parity cycle always starts at a group's boundary) | ||
490 | * R = N % LCMdP | ||
491 | * | ||
492 | * I = the first parity device index | ||
493 | * I = (group_width + group_width - R*parity - parity) % group_width | ||
494 | * | ||
495 | * Craid - The component index Rotated | ||
496 | * Craid = (group_width + C - R*parity) % group_width | ||
497 | * (We add the group_width to avoid negative numbers modulo math) | ||
408 | */ | 498 | */ |
409 | void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, | 499 | void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, |
410 | struct ore_striping_info *si) | 500 | u64 length, struct ore_striping_info *si) |
411 | { | 501 | { |
412 | u32 stripe_unit = layout->stripe_unit; | 502 | u32 stripe_unit = layout->stripe_unit; |
413 | u32 group_width = layout->group_width; | 503 | u32 group_width = layout->group_width; |
414 | u64 group_depth = layout->group_depth; | 504 | u64 group_depth = layout->group_depth; |
505 | u32 parity = layout->parity; | ||
415 | 506 | ||
416 | u32 U = stripe_unit * group_width; | 507 | u32 D = group_width - parity; |
508 | u32 U = D * stripe_unit; | ||
417 | u64 T = U * group_depth; | 509 | u64 T = U * group_depth; |
418 | u64 S = T * layout->group_count; | 510 | u64 S = T * layout->group_count; |
419 | u64 M = div64_u64(file_offset, S); | 511 | u64 M = div64_u64(file_offset, S); |
@@ -429,22 +521,43 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, | |||
429 | u32 N = div_u64(H, U); | 521 | u32 N = div_u64(H, U); |
430 | 522 | ||
431 | /* "H - (N * U)" is just "H % U" so it's bound to u32 */ | 523 | /* "H - (N * U)" is just "H % U" so it's bound to u32 */ |
432 | si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; | 524 | u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; |
433 | si->dev *= layout->mirrors_p1; | ||
434 | 525 | ||
435 | div_u64_rem(file_offset, stripe_unit, &si->unit_off); | 526 | div_u64_rem(file_offset, stripe_unit, &si->unit_off); |
436 | 527 | ||
437 | si->obj_offset = si->unit_off + (N * stripe_unit) + | 528 | si->obj_offset = si->unit_off + (N * stripe_unit) + |
438 | (M * group_depth * stripe_unit); | 529 | (M * group_depth * stripe_unit); |
439 | 530 | ||
440 | si->group_length = T - H; | 531 | if (parity) { |
532 | u32 LCMdP = lcm(group_width, parity) / parity; | ||
533 | /* R = N % LCMdP; */ | ||
534 | u32 RxP = (N % LCMdP) * parity; | ||
535 | u32 first_dev = C - C % group_width; | ||
536 | |||
537 | si->par_dev = (group_width + group_width - parity - RxP) % | ||
538 | group_width + first_dev; | ||
539 | si->dev = (group_width + C - RxP) % group_width + first_dev; | ||
540 | si->bytes_in_stripe = U; | ||
541 | si->first_stripe_start = M * S + G * T + N * U; | ||
542 | } else { | ||
543 | /* Make the math correct see _prepare_one_group */ | ||
544 | si->par_dev = group_width; | ||
545 | si->dev = C; | ||
546 | } | ||
547 | |||
548 | si->dev *= layout->mirrors_p1; | ||
549 | si->par_dev *= layout->mirrors_p1; | ||
550 | si->offset = file_offset; | ||
551 | si->length = T - H; | ||
552 | if (si->length > length) | ||
553 | si->length = length; | ||
441 | si->M = M; | 554 | si->M = M; |
442 | } | 555 | } |
443 | EXPORT_SYMBOL(ore_calc_stripe_info); | 556 | EXPORT_SYMBOL(ore_calc_stripe_info); |
444 | 557 | ||
445 | static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, | 558 | int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, |
446 | unsigned pgbase, struct ore_per_dev_state *per_dev, | 559 | unsigned pgbase, struct page **pages, |
447 | int cur_len) | 560 | struct ore_per_dev_state *per_dev, int cur_len) |
448 | { | 561 | { |
449 | unsigned pg = *cur_pg; | 562 | unsigned pg = *cur_pg; |
450 | struct request_queue *q = | 563 | struct request_queue *q = |
@@ -455,8 +568,11 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, | |||
455 | if (per_dev->bio == NULL) { | 568 | if (per_dev->bio == NULL) { |
456 | unsigned pages_in_stripe = ios->layout->group_width * | 569 | unsigned pages_in_stripe = ios->layout->group_width * |
457 | (ios->layout->stripe_unit / PAGE_SIZE); | 570 | (ios->layout->stripe_unit / PAGE_SIZE); |
458 | unsigned bio_size = (ios->nr_pages + pages_in_stripe) / | 571 | unsigned nr_pages = ios->nr_pages * ios->layout->group_width / |
459 | ios->layout->group_width; | 572 | (ios->layout->group_width - |
573 | ios->layout->parity); | ||
574 | unsigned bio_size = (nr_pages + pages_in_stripe) / | ||
575 | ios->layout->group_width; | ||
460 | 576 | ||
461 | per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); | 577 | per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); |
462 | if (unlikely(!per_dev->bio)) { | 578 | if (unlikely(!per_dev->bio)) { |
@@ -471,12 +587,13 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, | |||
471 | unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); | 587 | unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); |
472 | unsigned added_len; | 588 | unsigned added_len; |
473 | 589 | ||
474 | BUG_ON(ios->nr_pages <= pg); | ||
475 | cur_len -= pglen; | 590 | cur_len -= pglen; |
476 | 591 | ||
477 | added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], | 592 | added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], |
478 | pglen, pgbase); | 593 | pglen, pgbase); |
479 | if (unlikely(pglen != added_len)) { | 594 | if (unlikely(pglen != added_len)) { |
595 | ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", | ||
596 | per_dev->bio->bi_vcnt); | ||
480 | ret = -ENOMEM; | 597 | ret = -ENOMEM; |
481 | goto out; | 598 | goto out; |
482 | } | 599 | } |
@@ -501,9 +618,11 @@ static int _prepare_for_striping(struct ore_io_state *ios) | |||
501 | struct ore_striping_info *si = &ios->si; | 618 | struct ore_striping_info *si = &ios->si; |
502 | unsigned stripe_unit = ios->layout->stripe_unit; | 619 | unsigned stripe_unit = ios->layout->stripe_unit; |
503 | unsigned mirrors_p1 = ios->layout->mirrors_p1; | 620 | unsigned mirrors_p1 = ios->layout->mirrors_p1; |
504 | unsigned devs_in_group = ios->layout->group_width * mirrors_p1; | 621 | unsigned group_width = ios->layout->group_width; |
622 | unsigned devs_in_group = group_width * mirrors_p1; | ||
505 | unsigned dev = si->dev; | 623 | unsigned dev = si->dev; |
506 | unsigned first_dev = dev - (dev % devs_in_group); | 624 | unsigned first_dev = dev - (dev % devs_in_group); |
625 | unsigned dev_order; | ||
507 | unsigned cur_pg = ios->pages_consumed; | 626 | unsigned cur_pg = ios->pages_consumed; |
508 | u64 length = ios->length; | 627 | u64 length = ios->length; |
509 | int ret = 0; | 628 | int ret = 0; |
@@ -513,7 +632,10 @@ static int _prepare_for_striping(struct ore_io_state *ios) | |||
513 | return 0; | 632 | return 0; |
514 | } | 633 | } |
515 | 634 | ||
516 | BUG_ON(length > si->group_length); | 635 | BUG_ON(length > si->length); |
636 | |||
637 | dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); | ||
638 | si->cur_comp = dev_order; | ||
517 | 639 | ||
518 | while (length) { | 640 | while (length) { |
519 | unsigned comp = dev - first_dev; | 641 | unsigned comp = dev - first_dev; |
@@ -522,17 +644,20 @@ static int _prepare_for_striping(struct ore_io_state *ios) | |||
522 | 644 | ||
523 | if (!per_dev->length) { | 645 | if (!per_dev->length) { |
524 | per_dev->dev = dev; | 646 | per_dev->dev = dev; |
525 | if (dev < si->dev) { | 647 | if (dev == si->dev) { |
526 | per_dev->offset = si->obj_offset + stripe_unit - | 648 | WARN_ON(dev == si->par_dev); |
527 | si->unit_off; | ||
528 | cur_len = stripe_unit; | ||
529 | } else if (dev == si->dev) { | ||
530 | per_dev->offset = si->obj_offset; | 649 | per_dev->offset = si->obj_offset; |
531 | cur_len = stripe_unit - si->unit_off; | 650 | cur_len = stripe_unit - si->unit_off; |
532 | page_off = si->unit_off & ~PAGE_MASK; | 651 | page_off = si->unit_off & ~PAGE_MASK; |
533 | BUG_ON(page_off && (page_off != ios->pgbase)); | 652 | BUG_ON(page_off && (page_off != ios->pgbase)); |
534 | } else { /* dev > si->dev */ | 653 | } else { |
535 | per_dev->offset = si->obj_offset - si->unit_off; | 654 | if (si->cur_comp > dev_order) |
655 | per_dev->offset = | ||
656 | si->obj_offset - si->unit_off; | ||
657 | else /* si->cur_comp < dev_order */ | ||
658 | per_dev->offset = | ||
659 | si->obj_offset + stripe_unit - | ||
660 | si->unit_off; | ||
536 | cur_len = stripe_unit; | 661 | cur_len = stripe_unit; |
537 | } | 662 | } |
538 | } else { | 663 | } else { |
@@ -541,8 +666,8 @@ static int _prepare_for_striping(struct ore_io_state *ios) | |||
541 | if (cur_len >= length) | 666 | if (cur_len >= length) |
542 | cur_len = length; | 667 | cur_len = length; |
543 | 668 | ||
544 | ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, | 669 | ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages, |
545 | cur_len); | 670 | per_dev, cur_len); |
546 | if (unlikely(ret)) | 671 | if (unlikely(ret)) |
547 | goto out; | 672 | goto out; |
548 | 673 | ||
@@ -550,6 +675,41 @@ static int _prepare_for_striping(struct ore_io_state *ios) | |||
550 | dev = (dev % devs_in_group) + first_dev; | 675 | dev = (dev % devs_in_group) + first_dev; |
551 | 676 | ||
552 | length -= cur_len; | 677 | length -= cur_len; |
678 | |||
679 | si->cur_comp = (si->cur_comp + 1) % group_width; | ||
680 | if (unlikely((dev == si->par_dev) || | ||
681 | (!length && ios->parity_pages))) { | ||
682 | if (!length) | ||
683 | /* If we are writing and this is the very last | ||
684 | * stripe, then operate on parity dev. | ||
685 | */ | ||
686 | dev = si->par_dev; | ||
687 | if (ios->reading) | ||
688 | /* In writes cur_len just means if it's the | ||
689 | * last one. See _ore_add_parity_unit. | ||
690 | */ | ||
691 | cur_len = length; | ||
692 | per_dev = &ios->per_dev[dev - first_dev]; | ||
693 | if (!per_dev->length) { | ||
694 | /* Only/always the parity unit of the first | ||
695 | * stripe will be empty. So this is a chance to | ||
696 | * initialize the per_dev info. | ||
697 | */ | ||
698 | per_dev->dev = dev; | ||
699 | per_dev->offset = si->obj_offset - si->unit_off; | ||
700 | } | ||
701 | |||
702 | ret = _ore_add_parity_unit(ios, si, per_dev, cur_len); | ||
703 | if (unlikely(ret)) | ||
704 | goto out; | ||
705 | |||
706 | /* Rotate next par_dev backwards with wrapping */ | ||
707 | si->par_dev = (devs_in_group + si->par_dev - | ||
708 | ios->layout->parity * mirrors_p1) % | ||
709 | devs_in_group + first_dev; | ||
710 | /* Next stripe, start fresh */ | ||
711 | si->cur_comp = 0; | ||
712 | } | ||
553 | } | 713 | } |
554 | out: | 714 | out: |
555 | ios->numdevs = devs_in_group; | 715 | ios->numdevs = devs_in_group; |
@@ -747,12 +907,24 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) | |||
747 | per_dev->or = or; | 907 | per_dev->or = or; |
748 | 908 | ||
749 | if (ios->pages) { | 909 | if (ios->pages) { |
750 | osd_req_read(or, obj, per_dev->offset, | 910 | if (per_dev->cur_sg) { |
751 | per_dev->bio, per_dev->length); | 911 | /* finalize the last sg_entry */ |
912 | _ore_add_sg_seg(per_dev, 0, false); | ||
913 | if (unlikely(!per_dev->cur_sg)) | ||
914 | return 0; /* Skip parity only device */ | ||
915 | |||
916 | osd_req_read_sg(or, obj, per_dev->bio, | ||
917 | per_dev->sglist, per_dev->cur_sg); | ||
918 | } else { | ||
919 | /* The no raid case */ | ||
920 | osd_req_read(or, obj, per_dev->offset, | ||
921 | per_dev->bio, per_dev->length); | ||
922 | } | ||
923 | |||
752 | ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" | 924 | ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" |
753 | " dev=%d\n", _LLU(obj->id), | 925 | " dev=%d sg_len=%d\n", _LLU(obj->id), |
754 | _LLU(per_dev->offset), _LLU(per_dev->length), | 926 | _LLU(per_dev->offset), _LLU(per_dev->length), |
755 | first_dev); | 927 | first_dev, per_dev->cur_sg); |
756 | } else { | 928 | } else { |
757 | BUG_ON(ios->kern_buff); | 929 | BUG_ON(ios->kern_buff); |
758 | 930 | ||
@@ -849,7 +1021,7 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, | |||
849 | { | 1021 | { |
850 | unsigned stripe_unit = layout->stripe_unit; | 1022 | unsigned stripe_unit = layout->stripe_unit; |
851 | 1023 | ||
852 | ore_calc_stripe_info(layout, file_offset, &ti->si); | 1024 | ore_calc_stripe_info(layout, file_offset, 0, &ti->si); |
853 | 1025 | ||
854 | ti->prev_group_obj_off = ti->si.M * stripe_unit; | 1026 | ti->prev_group_obj_off = ti->si.M * stripe_unit; |
855 | ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; | 1027 | ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; |
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c new file mode 100644 index 000000000000..8d4b93a93c67 --- /dev/null +++ b/fs/exofs/ore_raid.c | |||
@@ -0,0 +1,140 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2011 | ||
3 | * Boaz Harrosh <bharrosh@panasas.com> | ||
4 | * | ||
5 | * This file is part of the objects raid engine (ore). | ||
6 | * | ||
7 | * It is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as published | ||
9 | * by the Free Software Foundation. | ||
10 | * | ||
11 | * You should have received a copy of the GNU General Public License | ||
12 | * along with "ore". If not, write to the Free Software Foundation, Inc: | ||
13 | * "Free Software Foundation <info@fsf.org>" | ||
14 | */ | ||
15 | |||
16 | #include <linux/gfp.h> | ||
17 | |||
18 | #include "ore_raid.h" | ||
19 | |||
20 | struct page *_raid_page_alloc(void) | ||
21 | { | ||
22 | return alloc_page(GFP_KERNEL); | ||
23 | } | ||
24 | |||
25 | void _raid_page_free(struct page *p) | ||
26 | { | ||
27 | __free_page(p); | ||
28 | } | ||
29 | |||
30 | void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, | ||
31 | bool not_last) | ||
32 | { | ||
33 | struct osd_sg_entry *sge; | ||
34 | |||
35 | ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d " | ||
36 | "offset=0x%llx length=0x%x last_sgs_total=0x%x\n", | ||
37 | per_dev->dev, cur_len, not_last, per_dev->cur_sg, | ||
38 | _LLU(per_dev->offset), per_dev->length, | ||
39 | per_dev->last_sgs_total); | ||
40 | |||
41 | if (!per_dev->cur_sg) { | ||
42 | sge = per_dev->sglist; | ||
43 | |||
44 | /* First time we prepare two entries */ | ||
45 | if (per_dev->length) { | ||
46 | ++per_dev->cur_sg; | ||
47 | sge->offset = per_dev->offset; | ||
48 | sge->len = per_dev->length; | ||
49 | } else { | ||
50 | /* Here the parity is the first unit of this object. | ||
51 | * This happens every time we reach a parity device on | ||
52 | * the same stripe as the per_dev->offset. We need to | ||
53 | * just skip this unit. | ||
54 | */ | ||
55 | per_dev->offset += cur_len; | ||
56 | return; | ||
57 | } | ||
58 | } else { | ||
59 | /* finalize the last one */ | ||
60 | sge = &per_dev->sglist[per_dev->cur_sg - 1]; | ||
61 | sge->len = per_dev->length - per_dev->last_sgs_total; | ||
62 | } | ||
63 | |||
64 | if (not_last) { | ||
65 | /* Partly prepare the next one */ | ||
66 | struct osd_sg_entry *next_sge = sge + 1; | ||
67 | |||
68 | ++per_dev->cur_sg; | ||
69 | next_sge->offset = sge->offset + sge->len + cur_len; | ||
70 | /* Save cur len so we know how much was added next time */ | ||
71 | per_dev->last_sgs_total = per_dev->length; | ||
72 | next_sge->len = 0; | ||
73 | } else if (!sge->len) { | ||
74 | /* Optimize for when the last unit is a parity */ | ||
75 | --per_dev->cur_sg; | ||
76 | } | ||
77 | } | ||
78 | |||
79 | /* In writes @cur_len means length left, i.e. cur_len==0 is the last parity U */ | ||
80 | int _ore_add_parity_unit(struct ore_io_state *ios, | ||
81 | struct ore_striping_info *si, | ||
82 | struct ore_per_dev_state *per_dev, | ||
83 | unsigned cur_len) | ||
84 | { | ||
85 | if (ios->reading) { | ||
86 | BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev); | ||
87 | _ore_add_sg_seg(per_dev, cur_len, true); | ||
88 | } else { | ||
89 | struct page **pages = ios->parity_pages + ios->cur_par_page; | ||
90 | unsigned num_pages = ios->layout->stripe_unit / PAGE_SIZE; | ||
91 | unsigned array_start = 0; | ||
92 | unsigned i; | ||
93 | int ret; | ||
94 | |||
95 | for (i = 0; i < num_pages; i++) { | ||
96 | pages[i] = _raid_page_alloc(); | ||
97 | if (unlikely(!pages[i])) | ||
98 | return -ENOMEM; | ||
99 | |||
100 | ++(ios->cur_par_page); | ||
101 | /* TODO: only read support for now */ | ||
102 | clear_highpage(pages[i]); | ||
103 | } | ||
104 | |||
105 | ORE_DBGMSG("writing dev=%d num_pages=%d cur_par_page=%d", | ||
106 | per_dev->dev, num_pages, ios->cur_par_page); | ||
107 | |||
108 | ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, | ||
109 | per_dev, num_pages * PAGE_SIZE); | ||
110 | if (unlikely(ret)) | ||
111 | return ret; | ||
112 | } | ||
113 | return 0; | ||
114 | } | ||
115 | |||
116 | int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) | ||
117 | { | ||
118 | /*TODO: Only raid writes have stuff to add here */ | ||
119 | return 0; | ||
120 | } | ||
121 | |||
122 | void _ore_free_raid_stuff(struct ore_io_state *ios) | ||
123 | { | ||
124 | if (ios->parity_pages) { /* writing and raid */ | ||
125 | unsigned i; | ||
126 | |||
127 | for (i = 0; i < ios->cur_par_page; i++) { | ||
128 | struct page *page = ios->parity_pages[i]; | ||
129 | |||
130 | if (page) | ||
131 | _raid_page_free(page); | ||
132 | } | ||
133 | if (ios->extra_part_alloc) | ||
134 | kfree(ios->parity_pages); | ||
135 | } else { | ||
136 | /* Will only be set if raid reading && sglist is big */ | ||
137 | if (ios->extra_part_alloc) | ||
138 | kfree(ios->per_dev[0].sglist); | ||
139 | } | ||
140 | } | ||
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h new file mode 100644 index 000000000000..c21080b4407f --- /dev/null +++ b/fs/exofs/ore_raid.h | |||
@@ -0,0 +1,64 @@ | |||
1 | /* | ||
2 | * Copyright (C) from 2011 | ||
3 | * Boaz Harrosh <bharrosh@panasas.com> | ||
4 | * | ||
5 | * This file is part of the objects raid engine (ore). | ||
6 | * | ||
7 | * It is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as published | ||
9 | * by the Free Software Foundation. | ||
10 | * | ||
11 | * You should have received a copy of the GNU General Public License | ||
12 | * along with "ore". If not, write to the Free Software Foundation, Inc: | ||
13 | * "Free Software Foundation <info@fsf.org>" | ||
14 | */ | ||
15 | |||
16 | #include <scsi/osd_ore.h> | ||
17 | |||
18 | #define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) | ||
19 | |||
20 | #ifdef CONFIG_EXOFS_DEBUG | ||
21 | #define ORE_DBGMSG(fmt, a...) \ | ||
22 | printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) | ||
23 | #else | ||
24 | #define ORE_DBGMSG(fmt, a...) \ | ||
25 | do { if (0) printk(fmt, ##a); } while (0) | ||
26 | #endif | ||
27 | |||
28 | /* u64 has problems with printk this will cast it to unsigned long long */ | ||
29 | #define _LLU(x) (unsigned long long)(x) | ||
30 | |||
31 | #define ORE_DBGMSG2(M...) do {} while (0) | ||
32 | /* #define ORE_DBGMSG2 ORE_DBGMSG */ | ||
33 | |||
34 | /* Calculate the component order in a stripe, i.e. the logical data unit | ||
35 | * address within the stripe of @dev given the @par_dev of this stripe. | ||
36 | */ | ||
37 | static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1, | ||
38 | unsigned par_dev, unsigned dev) | ||
39 | { | ||
40 | unsigned first_dev = dev - dev % devs_in_group; | ||
41 | |||
42 | dev -= first_dev; | ||
43 | par_dev -= first_dev; | ||
44 | |||
45 | if (devs_in_group == par_dev) /* The raid 0 case */ | ||
46 | return dev / mirrors_p1; | ||
47 | /* raid4/5/6 case */ | ||
48 | return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) / | ||
49 | mirrors_p1; | ||
50 | } | ||
51 | |||
52 | /* ore_raid.c stuff needed by ore.c */ | ||
53 | int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); | ||
54 | void _ore_free_raid_stuff(struct ore_io_state *ios); | ||
55 | |||
56 | void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, | ||
57 | bool not_last); | ||
58 | int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, | ||
59 | struct ore_per_dev_state *per_dev, unsigned cur_len); | ||
60 | |||
61 | /* ore.c stuff needed by ore_raid.c */ | ||
62 | int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, | ||
63 | unsigned pgbase, struct page **pages, | ||
64 | struct ore_per_dev_state *per_dev, int cur_len); | ||