aboutsummaryrefslogtreecommitdiffstats
path: root/fs/exofs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/exofs')
-rw-r--r--fs/exofs/Kbuild3
-rw-r--r--fs/exofs/ore.c326
-rw-r--r--fs/exofs/ore_raid.c140
-rw-r--r--fs/exofs/ore_raid.h64
4 files changed, 455 insertions, 78 deletions
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index c5a5855a6c44..352ba149d23e 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -13,7 +13,8 @@
13# 13#
14 14
15# ore module library 15# ore module library
16obj-$(CONFIG_ORE) += ore.o 16libore-y := ore.o ore_raid.o
17obj-$(CONFIG_ORE) += libore.o
17 18
18exofs-y := inode.o file.o symlink.o namei.o dir.o super.o 19exofs-y := inode.o file.o symlink.o namei.o dir.o super.o
19obj-$(CONFIG_EXOFS_FS) += exofs.o 20obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index d92998d5c2d6..fd6090ddd3bf 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -24,24 +24,9 @@
24 24
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <asm/div64.h> 26#include <asm/div64.h>
27#include <linux/lcm.h>
27 28
28#include <scsi/osd_ore.h> 29#include "ore_raid.h"
29
30#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
31
32#ifdef CONFIG_EXOFS_DEBUG
33#define ORE_DBGMSG(fmt, a...) \
34 printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
35#else
36#define ORE_DBGMSG(fmt, a...) \
37 do { if (0) printk(fmt, ##a); } while (0)
38#endif
39
40/* u64 has problems with printk this will cast it to unsigned long long */
41#define _LLU(x) (unsigned long long)(x)
42
43#define ORE_DBGMSG2(M...) do {} while (0)
44/* #define ORE_DBGMSG2 ORE_DBGMSG */
45 30
46MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); 31MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
47MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); 32MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
@@ -133,21 +118,81 @@ static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
133 return ore_comp_dev(ios->oc, index); 118 return ore_comp_dev(ios->oc, index);
134} 119}
135 120
136static int _get_io_state(struct ore_layout *layout, 121static int _ore_get_io_state(struct ore_layout *layout,
137 struct ore_components *oc, unsigned numdevs, 122 struct ore_components *oc, unsigned numdevs,
138 struct ore_io_state **pios) 123 unsigned sgs_per_dev, unsigned num_par_pages,
124 struct ore_io_state **pios)
139{ 125{
140 struct ore_io_state *ios; 126 struct ore_io_state *ios;
127 struct page **pages;
128 struct osd_sg_entry *sgilist;
129 struct __alloc_all_io_state {
130 struct ore_io_state ios;
131 struct ore_per_dev_state per_dev[numdevs];
132 union {
133 struct osd_sg_entry sglist[sgs_per_dev * numdevs];
134 struct page *pages[num_par_pages];
135 };
136 } *_aios;
137
138 if (likely(sizeof(*_aios) <= PAGE_SIZE)) {
139 _aios = kzalloc(sizeof(*_aios), GFP_KERNEL);
140 if (unlikely(!_aios)) {
141 ORE_DBGMSG("Failed kzalloc bytes=%zd\n",
142 sizeof(*_aios));
143 *pios = NULL;
144 return -ENOMEM;
145 }
146 pages = num_par_pages ? _aios->pages : NULL;
147 sgilist = sgs_per_dev ? _aios->sglist : NULL;
148 ios = &_aios->ios;
149 } else {
150 struct __alloc_small_io_state {
151 struct ore_io_state ios;
152 struct ore_per_dev_state per_dev[numdevs];
153 } *_aio_small;
154 union __extra_part {
155 struct osd_sg_entry sglist[sgs_per_dev * numdevs];
156 struct page *pages[num_par_pages];
157 } *extra_part;
158
159 _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL);
160 if (unlikely(!_aio_small)) {
161 ORE_DBGMSG("Failed alloc first part bytes=%zd\n",
162 sizeof(*_aio_small));
163 *pios = NULL;
164 return -ENOMEM;
165 }
166 extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL);
167 if (unlikely(!extra_part)) {
168 ORE_DBGMSG("Failed alloc second part bytes=%zd\n",
169 sizeof(*extra_part));
170 kfree(_aio_small);
171 *pios = NULL;
172 return -ENOMEM;
173 }
141 174
142 /*TODO: Maybe use kmem_cach per sbi of size 175 pages = num_par_pages ? extra_part->pages : NULL;
143 * exofs_io_state_size(layout->s_numdevs) 176 sgilist = sgs_per_dev ? extra_part->sglist : NULL;
144 */ 177 /* In this case the per_dev[0].sgilist holds the pointer to
145 ios = kzalloc(ore_io_state_size(numdevs), GFP_KERNEL); 178 * be freed
146 if (unlikely(!ios)) { 179 */
147 ORE_DBGMSG("Failed kzalloc bytes=%d\n", 180 ios = &_aio_small->ios;
148 ore_io_state_size(numdevs)); 181 ios->extra_part_alloc = true;
149 *pios = NULL; 182 }
150 return -ENOMEM; 183
184 if (pages) {
185 ios->parity_pages = pages;
186 ios->max_par_pages = num_par_pages;
187 }
188 if (sgilist) {
189 unsigned d;
190
191 for (d = 0; d < numdevs; ++d) {
192 ios->per_dev[d].sglist = sgilist;
193 sgilist += sgs_per_dev;
194 }
195 ios->sgs_per_dev = sgs_per_dev;
151 } 196 }
152 197
153 ios->layout = layout; 198 ios->layout = layout;
@@ -178,9 +223,42 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
178{ 223{
179 struct ore_io_state *ios; 224 struct ore_io_state *ios;
180 unsigned numdevs = layout->group_width * layout->mirrors_p1; 225 unsigned numdevs = layout->group_width * layout->mirrors_p1;
226 unsigned sgs_per_dev = 0, max_par_pages = 0;
181 int ret; 227 int ret;
182 228
183 ret = _get_io_state(layout, oc, numdevs, pios); 229 if (layout->parity && length) {
230 unsigned data_devs = layout->group_width - layout->parity;
231 unsigned stripe_size = layout->stripe_unit * data_devs;
232 unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
233 u32 remainder;
234 u64 num_stripes;
235 u64 num_raid_units;
236
237 num_stripes = div_u64_rem(length, stripe_size, &remainder);
238 if (remainder)
239 ++num_stripes;
240
241 num_raid_units = num_stripes * layout->parity;
242
243 if (is_reading) {
244 /* For reads add per_dev sglist array */
245 /* TODO: Raid 6 we need twice more. Actually:
246 * num_stripes / LCMdP(W,P);
247 * if (W%P != 0) num_stripes *= parity;
248 */
249
250 /* first/last seg is split */
251 num_raid_units += layout->group_width;
252 sgs_per_dev = div_u64(num_raid_units, data_devs);
253 } else {
254 /* For Writes add parity pages array. */
255 max_par_pages = num_raid_units * pages_in_unit *
256 sizeof(struct page *);
257 }
258 }
259
260 ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages,
261 pios);
184 if (unlikely(ret)) 262 if (unlikely(ret))
185 return ret; 263 return ret;
186 264
@@ -189,10 +267,11 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
189 ios->offset = offset; 267 ios->offset = offset;
190 268
191 if (length) { 269 if (length) {
192 ore_calc_stripe_info(layout, offset, &ios->si); 270 ore_calc_stripe_info(layout, offset, length, &ios->si);
193 ios->length = (length <= ios->si.group_length) ? length : 271 ios->length = ios->si.length;
194 ios->si.group_length;
195 ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; 272 ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
273 if (layout->parity)
274 _ore_post_alloc_raid_stuff(ios);
196 } 275 }
197 276
198 return 0; 277 return 0;
@@ -209,7 +288,7 @@ EXPORT_SYMBOL(ore_get_rw_state);
209int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, 288int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
210 struct ore_io_state **pios) 289 struct ore_io_state **pios)
211{ 290{
212 return _get_io_state(layout, oc, oc->numdevs, pios); 291 return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios);
213} 292}
214EXPORT_SYMBOL(ore_get_io_state); 293EXPORT_SYMBOL(ore_get_io_state);
215 294
@@ -227,6 +306,7 @@ void ore_put_io_state(struct ore_io_state *ios)
227 bio_put(per_dev->bio); 306 bio_put(per_dev->bio);
228 } 307 }
229 308
309 _ore_free_raid_stuff(ios);
230 kfree(ios); 310 kfree(ios);
231 } 311 }
232} 312}
@@ -367,53 +447,65 @@ EXPORT_SYMBOL(ore_check_io);
367/* 447/*
368 * L - logical offset into the file 448 * L - logical offset into the file
369 * 449 *
370 * U - The number of bytes in a stripe within a group 450 * D - number of Data devices
451 * D = group_width - parity
371 * 452 *
372 * U = stripe_unit * group_width 453 * U - The number of bytes in a stripe within a group
454 * U = stripe_unit * D
373 * 455 *
374 * T - The number of bytes striped within a group of component objects 456 * T - The number of bytes striped within a group of component objects
375 * (before advancing to the next group) 457 * (before advancing to the next group)
376 * 458 * T = U * group_depth
377 * T = stripe_unit * group_width * group_depth
378 * 459 *
379 * S - The number of bytes striped across all component objects 460 * S - The number of bytes striped across all component objects
380 * before the pattern repeats 461 * before the pattern repeats
462 * S = T * group_count
381 * 463 *
382 * S = stripe_unit * group_width * group_depth * group_count 464 * M - The "major" (i.e., across all components) cycle number
383 *
384 * M - The "major" (i.e., across all components) stripe number
385 *
386 * M = L / S 465 * M = L / S
387 * 466 *
388 * G - Counts the groups from the beginning of the major stripe 467 * G - Counts the groups from the beginning of the major cycle
389 *
390 * G = (L - (M * S)) / T [or (L % S) / T] 468 * G = (L - (M * S)) / T [or (L % S) / T]
391 * 469 *
392 * H - The byte offset within the group 470 * H - The byte offset within the group
393 *
394 * H = (L - (M * S)) % T [or (L % S) % T] 471 * H = (L - (M * S)) % T [or (L % S) % T]
395 * 472 *
396 * N - The "minor" (i.e., across the group) stripe number 473 * N - The "minor" (i.e., across the group) stripe number
397 *
398 * N = H / U 474 * N = H / U
399 * 475 *
400 * C - The component index corresponding to L 476 * C - The component index corresponding to L
401 * 477 *
402 * C = (H - (N * U)) / stripe_unit + G * group_width 478 * C = (H - (N * U)) / stripe_unit + G * group_width
403 * [or (L % U) / stripe_unit + G * group_width] 479 * [or (L % U) / stripe_unit + G * group_width]
404 * 480 *
405 * O - The component offset corresponding to L 481 * O - The component offset corresponding to L
406 *
407 * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit 482 * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
483 *
484 * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity
485 * divide by parity
486 * LCMdP = lcm(group_width, parity) / parity
487 *
488 * R - The parity Rotation stripe
489 * (Note parity cycle always starts at a group's boundary)
490 * R = N % LCMdP
491 *
492 * I = the first parity device index
493 * I = (group_width + group_width - R*parity - parity) % group_width
494 *
495 * Craid - The component index Rotated
496 * Craid = (group_width + C - R*parity) % group_width
497 * (We add the group_width to avoid negative numbers modulo math)
408 */ 498 */
409void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, 499void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
410 struct ore_striping_info *si) 500 u64 length, struct ore_striping_info *si)
411{ 501{
412 u32 stripe_unit = layout->stripe_unit; 502 u32 stripe_unit = layout->stripe_unit;
413 u32 group_width = layout->group_width; 503 u32 group_width = layout->group_width;
414 u64 group_depth = layout->group_depth; 504 u64 group_depth = layout->group_depth;
505 u32 parity = layout->parity;
415 506
416 u32 U = stripe_unit * group_width; 507 u32 D = group_width - parity;
508 u32 U = D * stripe_unit;
417 u64 T = U * group_depth; 509 u64 T = U * group_depth;
418 u64 S = T * layout->group_count; 510 u64 S = T * layout->group_count;
419 u64 M = div64_u64(file_offset, S); 511 u64 M = div64_u64(file_offset, S);
@@ -429,22 +521,43 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
429 u32 N = div_u64(H, U); 521 u32 N = div_u64(H, U);
430 522
431 /* "H - (N * U)" is just "H % U" so it's bound to u32 */ 523 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
432 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; 524 u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
433 si->dev *= layout->mirrors_p1;
434 525
435 div_u64_rem(file_offset, stripe_unit, &si->unit_off); 526 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
436 527
437 si->obj_offset = si->unit_off + (N * stripe_unit) + 528 si->obj_offset = si->unit_off + (N * stripe_unit) +
438 (M * group_depth * stripe_unit); 529 (M * group_depth * stripe_unit);
439 530
440 si->group_length = T - H; 531 if (parity) {
532 u32 LCMdP = lcm(group_width, parity) / parity;
533 /* R = N % LCMdP; */
534 u32 RxP = (N % LCMdP) * parity;
535 u32 first_dev = C - C % group_width;
536
537 si->par_dev = (group_width + group_width - parity - RxP) %
538 group_width + first_dev;
539 si->dev = (group_width + C - RxP) % group_width + first_dev;
540 si->bytes_in_stripe = U;
541 si->first_stripe_start = M * S + G * T + N * U;
542 } else {
543 /* Make the math correct see _prepare_one_group */
544 si->par_dev = group_width;
545 si->dev = C;
546 }
547
548 si->dev *= layout->mirrors_p1;
549 si->par_dev *= layout->mirrors_p1;
550 si->offset = file_offset;
551 si->length = T - H;
552 if (si->length > length)
553 si->length = length;
441 si->M = M; 554 si->M = M;
442} 555}
443EXPORT_SYMBOL(ore_calc_stripe_info); 556EXPORT_SYMBOL(ore_calc_stripe_info);
444 557
445static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, 558int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
446 unsigned pgbase, struct ore_per_dev_state *per_dev, 559 unsigned pgbase, struct page **pages,
447 int cur_len) 560 struct ore_per_dev_state *per_dev, int cur_len)
448{ 561{
449 unsigned pg = *cur_pg; 562 unsigned pg = *cur_pg;
450 struct request_queue *q = 563 struct request_queue *q =
@@ -455,8 +568,11 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
455 if (per_dev->bio == NULL) { 568 if (per_dev->bio == NULL) {
456 unsigned pages_in_stripe = ios->layout->group_width * 569 unsigned pages_in_stripe = ios->layout->group_width *
457 (ios->layout->stripe_unit / PAGE_SIZE); 570 (ios->layout->stripe_unit / PAGE_SIZE);
458 unsigned bio_size = (ios->nr_pages + pages_in_stripe) / 571 unsigned nr_pages = ios->nr_pages * ios->layout->group_width /
459 ios->layout->group_width; 572 (ios->layout->group_width -
573 ios->layout->parity);
574 unsigned bio_size = (nr_pages + pages_in_stripe) /
575 ios->layout->group_width;
460 576
461 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); 577 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
462 if (unlikely(!per_dev->bio)) { 578 if (unlikely(!per_dev->bio)) {
@@ -471,12 +587,13 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
471 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); 587 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
472 unsigned added_len; 588 unsigned added_len;
473 589
474 BUG_ON(ios->nr_pages <= pg);
475 cur_len -= pglen; 590 cur_len -= pglen;
476 591
477 added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], 592 added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
478 pglen, pgbase); 593 pglen, pgbase);
479 if (unlikely(pglen != added_len)) { 594 if (unlikely(pglen != added_len)) {
595 ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n",
596 per_dev->bio->bi_vcnt);
480 ret = -ENOMEM; 597 ret = -ENOMEM;
481 goto out; 598 goto out;
482 } 599 }
@@ -501,9 +618,11 @@ static int _prepare_for_striping(struct ore_io_state *ios)
501 struct ore_striping_info *si = &ios->si; 618 struct ore_striping_info *si = &ios->si;
502 unsigned stripe_unit = ios->layout->stripe_unit; 619 unsigned stripe_unit = ios->layout->stripe_unit;
503 unsigned mirrors_p1 = ios->layout->mirrors_p1; 620 unsigned mirrors_p1 = ios->layout->mirrors_p1;
504 unsigned devs_in_group = ios->layout->group_width * mirrors_p1; 621 unsigned group_width = ios->layout->group_width;
622 unsigned devs_in_group = group_width * mirrors_p1;
505 unsigned dev = si->dev; 623 unsigned dev = si->dev;
506 unsigned first_dev = dev - (dev % devs_in_group); 624 unsigned first_dev = dev - (dev % devs_in_group);
625 unsigned dev_order;
507 unsigned cur_pg = ios->pages_consumed; 626 unsigned cur_pg = ios->pages_consumed;
508 u64 length = ios->length; 627 u64 length = ios->length;
509 int ret = 0; 628 int ret = 0;
@@ -513,7 +632,10 @@ static int _prepare_for_striping(struct ore_io_state *ios)
513 return 0; 632 return 0;
514 } 633 }
515 634
516 BUG_ON(length > si->group_length); 635 BUG_ON(length > si->length);
636
637 dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev);
638 si->cur_comp = dev_order;
517 639
518 while (length) { 640 while (length) {
519 unsigned comp = dev - first_dev; 641 unsigned comp = dev - first_dev;
@@ -522,17 +644,20 @@ static int _prepare_for_striping(struct ore_io_state *ios)
522 644
523 if (!per_dev->length) { 645 if (!per_dev->length) {
524 per_dev->dev = dev; 646 per_dev->dev = dev;
525 if (dev < si->dev) { 647 if (dev == si->dev) {
526 per_dev->offset = si->obj_offset + stripe_unit - 648 WARN_ON(dev == si->par_dev);
527 si->unit_off;
528 cur_len = stripe_unit;
529 } else if (dev == si->dev) {
530 per_dev->offset = si->obj_offset; 649 per_dev->offset = si->obj_offset;
531 cur_len = stripe_unit - si->unit_off; 650 cur_len = stripe_unit - si->unit_off;
532 page_off = si->unit_off & ~PAGE_MASK; 651 page_off = si->unit_off & ~PAGE_MASK;
533 BUG_ON(page_off && (page_off != ios->pgbase)); 652 BUG_ON(page_off && (page_off != ios->pgbase));
534 } else { /* dev > si->dev */ 653 } else {
535 per_dev->offset = si->obj_offset - si->unit_off; 654 if (si->cur_comp > dev_order)
655 per_dev->offset =
656 si->obj_offset - si->unit_off;
657 else /* si->cur_comp < dev_order */
658 per_dev->offset =
659 si->obj_offset + stripe_unit -
660 si->unit_off;
536 cur_len = stripe_unit; 661 cur_len = stripe_unit;
537 } 662 }
538 } else { 663 } else {
@@ -541,8 +666,8 @@ static int _prepare_for_striping(struct ore_io_state *ios)
541 if (cur_len >= length) 666 if (cur_len >= length)
542 cur_len = length; 667 cur_len = length;
543 668
544 ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, 669 ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages,
545 cur_len); 670 per_dev, cur_len);
546 if (unlikely(ret)) 671 if (unlikely(ret))
547 goto out; 672 goto out;
548 673
@@ -550,6 +675,41 @@ static int _prepare_for_striping(struct ore_io_state *ios)
550 dev = (dev % devs_in_group) + first_dev; 675 dev = (dev % devs_in_group) + first_dev;
551 676
552 length -= cur_len; 677 length -= cur_len;
678
679 si->cur_comp = (si->cur_comp + 1) % group_width;
680 if (unlikely((dev == si->par_dev) ||
681 (!length && ios->parity_pages))) {
682 if (!length)
683 /* If we are writing and this is the very last
684 * stripe. then operate on parity dev.
685 */
686 dev = si->par_dev;
687 if (ios->reading)
688 /* In writes cur_len just means if it's the
689 * last one. See _ore_add_parity_unit.
690 */
691 cur_len = length;
692 per_dev = &ios->per_dev[dev - first_dev];
693 if (!per_dev->length) {
694 /* Only/always the parity unit of the first
695 * stripe will be empty. So this is a chance to
696 * initialize the per_dev info.
697 */
698 per_dev->dev = dev;
699 per_dev->offset = si->obj_offset - si->unit_off;
700 }
701
702 ret = _ore_add_parity_unit(ios, si, per_dev, cur_len);
703 if (unlikely(ret))
704 goto out;
705
706 /* Rotate next par_dev backwards with wrapping */
707 si->par_dev = (devs_in_group + si->par_dev -
708 ios->layout->parity * mirrors_p1) %
709 devs_in_group + first_dev;
710 /* Next stripe, start fresh */
711 si->cur_comp = 0;
712 }
553 } 713 }
554out: 714out:
555 ios->numdevs = devs_in_group; 715 ios->numdevs = devs_in_group;
@@ -747,12 +907,24 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp)
747 per_dev->or = or; 907 per_dev->or = or;
748 908
749 if (ios->pages) { 909 if (ios->pages) {
750 osd_req_read(or, obj, per_dev->offset, 910 if (per_dev->cur_sg) {
751 per_dev->bio, per_dev->length); 911 /* finalize the last sg_entry */
912 _ore_add_sg_seg(per_dev, 0, false);
913 if (unlikely(!per_dev->cur_sg))
914 return 0; /* Skip parity only device */
915
916 osd_req_read_sg(or, obj, per_dev->bio,
917 per_dev->sglist, per_dev->cur_sg);
918 } else {
919 /* The no raid case */
920 osd_req_read(or, obj, per_dev->offset,
921 per_dev->bio, per_dev->length);
922 }
923
752 ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" 924 ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
753 " dev=%d\n", _LLU(obj->id), 925 " dev=%d sg_len=%d\n", _LLU(obj->id),
754 _LLU(per_dev->offset), _LLU(per_dev->length), 926 _LLU(per_dev->offset), _LLU(per_dev->length),
755 first_dev); 927 first_dev, per_dev->cur_sg);
756 } else { 928 } else {
757 BUG_ON(ios->kern_buff); 929 BUG_ON(ios->kern_buff);
758 930
@@ -849,7 +1021,7 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
849{ 1021{
850 unsigned stripe_unit = layout->stripe_unit; 1022 unsigned stripe_unit = layout->stripe_unit;
851 1023
852 ore_calc_stripe_info(layout, file_offset, &ti->si); 1024 ore_calc_stripe_info(layout, file_offset, 0, &ti->si);
853 1025
854 ti->prev_group_obj_off = ti->si.M * stripe_unit; 1026 ti->prev_group_obj_off = ti->si.M * stripe_unit;
855 ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; 1027 ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
new file mode 100644
index 000000000000..8d4b93a93c67
--- /dev/null
+++ b/fs/exofs/ore_raid.c
@@ -0,0 +1,140 @@
1/*
2 * Copyright (C) 2011
3 * Boaz Harrosh <bharrosh@panasas.com>
4 *
5 * This file is part of the objects raid engine (ore).
6 *
7 * It is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as published
9 * by the Free Software Foundation.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with "ore". If not, write to the Free Software Foundation, Inc:
13 * "Free Software Foundation <info@fsf.org>"
14 */
15
16#include <linux/gfp.h>
17
18#include "ore_raid.h"
19
/* Allocate a single page with alloc_page(GFP_KERNEL) and return it as is.
 * The caller in this file (_ore_add_parity_unit) treats a NULL result as an
 * allocation failure.
 */
20struct page *_raid_page_alloc(void)
21{
22 return alloc_page(GFP_KERNEL);
23}
24
/* Release page @p with __free_page(). Used by _ore_free_raid_stuff() on the
 * parity pages that were obtained through _raid_page_alloc().
 */
25void _raid_page_free(struct page *p)
26{
27 __free_page(p);
28}
29
/* Maintain the per-device scatter-gather list (per_dev->sglist).
 *
 * Each call closes the entry currently being built (its length becomes the
 * bytes accumulated in per_dev->length since the entry was opened) and, when
 * @not_last is true, opens the next entry @cur_len bytes further into the
 * object, i.e. it leaves a hole of @cur_len bytes between the two entries.
 *
 * @per_dev:  device state whose sglist/cur_sg/last_sgs_total are updated.
 * @cur_len:  number of bytes to skip before the next entry starts.
 * @not_last: true when another entry follows; false to only finalize.
 *
 * Callers visible in this patch: _ore_add_parity_unit() on reads
 * (not_last == true) and _read_mirror() to finalize (cur_len == 0,
 * not_last == false).
 */
30void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
31 bool not_last)
32{
33 struct osd_sg_entry *sge;
34
35 ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d "
36 "offset=0x%llx length=0x%x last_sgs_total=0x%x\n",
37 per_dev->dev, cur_len, not_last, per_dev->cur_sg,
38 _LLU(per_dev->offset), per_dev->length,
39 per_dev->last_sgs_total);
40
41 if (!per_dev->cur_sg) {
42 sge = per_dev->sglist;
43
44 /* First time we prepare two entries */
45 if (per_dev->length) {
46 ++per_dev->cur_sg;
47 sge->offset = per_dev->offset;
48 sge->len = per_dev->length;
49 } else {
50 /* Here the parity is the first unit of this object.
51 * This happens every time we reach a parity device on
52 * the same stripe as the per_dev->offset. We need to
53 * just skip this unit.
54 */
/* No entry is created; the device's start offset simply moves forward. */
55 per_dev->offset += cur_len;
56 return;
57 }
58 } else {
59 /* finalize the last one */
60 sge = &per_dev->sglist[per_dev->cur_sg - 1];
/* last_sgs_total is per_dev->length as it was when this entry was opened */
61 sge->len = per_dev->length - per_dev->last_sgs_total;
62 }
63
64 if (not_last) {
65 /* Partly prepare the next one */
66 struct osd_sg_entry *next_sge = sge + 1;
67
68 ++per_dev->cur_sg;
/* The next entry begins after the current one plus the skipped bytes. */
69 next_sge->offset = sge->offset + sge->len + cur_len;
70 /* Save the current length so we know how much was added next time */
71 per_dev->last_sgs_total = per_dev->length;
72 next_sge->len = 0;
73 } else if (!sge->len) {
74 /* Optimize for when the last unit is a parity */
/* Drop the trailing entry that never received any bytes. */
75 --per_dev->cur_sg;
76 }
77}
78
/* Account for one parity unit on device @per_dev.
 *
 * Reads: no parity data is fetched; a hole of @cur_len bytes is recorded in
 * the device's scatter-gather list through _ore_add_sg_seg(). It is a BUG if
 * the list already holds ios->sgs_per_dev entries.
 *
 * Writes: stripe_unit / PAGE_SIZE pages are allocated into
 * ios->parity_pages (starting at ios->cur_par_page), zeroed, and queued on
 * @per_dev with _ore_add_stripe_unit(). ios->cur_par_page is advanced for
 * every page that was successfully allocated, so _ore_free_raid_stuff()
 * releases them even if a later allocation fails.
 *
 * @si is not used by this function body.
 *
 * Returns 0 on success, -ENOMEM if a page allocation fails, or the error
 * returned by _ore_add_stripe_unit().
 */
79/* In writes @cur_len means length left, i.e. cur_len==0 is the last parity U */
80int _ore_add_parity_unit(struct ore_io_state *ios,
81 struct ore_striping_info *si,
82 struct ore_per_dev_state *per_dev,
83 unsigned cur_len)
84{
85 if (ios->reading) {
86 BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev);
87 _ore_add_sg_seg(per_dev, cur_len, true);
88 } else {
89 struct page **pages = ios->parity_pages + ios->cur_par_page;
90 unsigned num_pages = ios->layout->stripe_unit / PAGE_SIZE;
91 unsigned array_start = 0;
92 unsigned i;
93 int ret;
94
95 for (i = 0; i < num_pages; i++) {
96 pages[i] = _raid_page_alloc();
97 if (unlikely(!pages[i]))
98 return -ENOMEM;
99
100 ++(ios->cur_par_page);
101 /* TODO: only read support for now */
102 clear_highpage(pages[i]);
103 }
104
/* Terminate the message with a newline like every other ORE_DBGMSG */
105 ORE_DBGMSG("writing dev=%d num_pages=%d cur_par_page=%d\n",
106 per_dev->dev, num_pages, ios->cur_par_page);
107
108 ret = _ore_add_stripe_unit(ios, &array_start, 0, pages,
109 per_dev, num_pages * PAGE_SIZE);
110 if (unlikely(ret))
111 return ret;
112 }
113 return 0;
114}
115
/* Hook called from ore_get_rw_state() after the io_state is set up, when the
 * layout has parity. Currently a placeholder: it does nothing with @ios and
 * always returns 0.
 */
116int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
117{
118 /*TODO: Only raid writes has stuff to add here */
119 return 0;
120}
121
/* Release the raid-specific resources attached to @ios.
 *
 * - If ios->parity_pages is set, free the first ios->cur_par_page pages that
 *   are non-NULL.
 * - If ios->extra_part_alloc is set, kfree() the separately allocated
 *   "extra part". _ore_get_io_state() allocates that part as a union of the
 *   sglist array and the parity page array, so either ios->parity_pages or
 *   ios->per_dev[0].sglist points at the start of the allocation.
 *
 * The ios itself is not freed here; ore_put_io_state() does that right after
 * calling this function.
 */
122void _ore_free_raid_stuff(struct ore_io_state *ios)
123{
124 if (ios->parity_pages) { /* writing and raid */
125 unsigned i;
126
/* Only slots below cur_par_page were ever handed a page */
127 for (i = 0; i < ios->cur_par_page; i++) {
128 struct page *page = ios->parity_pages[i];
129
130 if (page)
131 _raid_page_free(page);
132 }
133 if (ios->extra_part_alloc)
134 kfree(ios->parity_pages);
135 } else {
136 /* Will only be set if raid reading && sglist is big */
137 if (ios->extra_part_alloc)
138 kfree(ios->per_dev[0].sglist);
139 }
140}
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
new file mode 100644
index 000000000000..c21080b4407f
--- /dev/null
+++ b/fs/exofs/ore_raid.h
@@ -0,0 +1,64 @@
1/*
2 * Copyright (C) from 2011
3 * Boaz Harrosh <bharrosh@panasas.com>
4 *
5 * This file is part of the objects raid engine (ore).
6 *
7 * It is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as published
9 * by the Free Software Foundation.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with "ore". If not, write to the Free Software Foundation, Inc:
13 * "Free Software Foundation <info@fsf.org>"
14 */
15
16#include <scsi/osd_ore.h>
17
18#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
19
20#ifdef CONFIG_EXOFS_DEBUG
21#define ORE_DBGMSG(fmt, a...) \
22 printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
23#else
24#define ORE_DBGMSG(fmt, a...) \
25 do { if (0) printk(fmt, ##a); } while (0)
26#endif
27
28/* u64 has problems with printk this will cast it to unsigned long long */
29#define _LLU(x) (unsigned long long)(x)
30
31#define ORE_DBGMSG2(M...) do {} while (0)
32/* #define ORE_DBGMSG2 ORE_DBGMSG */
33
34/* Calculate the component order in a stripe. eg the logical data unit
35 * address within the stripe of @dev given the @par_dev of this stripe.
36 */
/* @devs_in_group: devices per group, mirrors included.
 * @mirrors_p1:    number of copies of each unit.
 * @par_dev:       parity device of the stripe; a value that ends up equal to
 *                 @devs_in_group after being made group-relative selects the
 *                 raid 0 (no parity) branch.
 * @dev:           device whose order is wanted.
 *
 * Returns the 0-based position of @dev counted in units of @mirrors_p1.
 * With parity, position 0 is the device at par_dev + mirrors_p1 (wrapping
 * inside the group).
 *
 * NOTE(review): ore_calc_stripe_info() sets par_dev to
 * group_width * mirrors_p1 for raid 0 without adding the group's first
 * device, so the raid 0 test below appears to match only when @dev lies in
 * the first group - confirm for multi-group raid 0 layouts.
 */
37static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1,
38 unsigned par_dev, unsigned dev)
39{
/* First device index of the group that @dev belongs to */
40 unsigned first_dev = dev - dev % devs_in_group;
41
/* Make both indexes relative to the start of the group */
42 dev -= first_dev;
43 par_dev -= first_dev;
44
45 if (devs_in_group == par_dev) /* The raid 0 case */
46 return dev / mirrors_p1;
47 /* raid4/5/6 case */
/* devs_in_group is added first so the unsigned subtraction cannot wrap */
48 return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) /
49 mirrors_p1;
50}
51
52/* ore_raid.c stuff needed by ore.c */
53int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);
54void _ore_free_raid_stuff(struct ore_io_state *ios);
55
56void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
57 bool not_last);
58int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
59 struct ore_per_dev_state *per_dev, unsigned cur_len);
60
61/* ore.c stuff needed by ore_raid.c */
62int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
63 unsigned pgbase, struct page **pages,
64 struct ore_per_dev_state *per_dev, int cur_len);