aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorBoaz Harrosh <bharrosh@panasas.com>2011-10-12 12:42:22 -0400
committerBoaz Harrosh <bharrosh@panasas.com>2011-10-24 19:55:36 -0400
commita1fec1dbbc8db974d2582e4040590cebe72171e4 (patch)
tree9dcbe1933b7f40256f40393f3c86dbb16e8fb953 /fs
parent3e335672e018c06e007f85a5d54afd721fb3d6d5 (diff)
ore: RAID5 read
This patch introduces the first stage of RAID5 support, mainly the skipping over of raid units when reading. For writes it inserts BLANK units where the XOR blocks should be calculated and written to. It introduces the new "general raid maths", and the main additional parameters and components needed for raid5. Since at this stage it could corrupt future versions that actually do support raid5, the enablement of raid5 mounting and the setting of parity-count > 0 is disabled, so the raid5 code will never be used. Mounting of raid5 is only enabled later, once the basic XOR write is also in. But if the patch "enable RAID5" is applied, this code has been tested to properly read raid5 volumes and is according to the standard. It has also been tested that the new maths still properly supports RAID0 and the grouping code, just as before. (BTW: I have found more bugs in the pnfs-obj RAID math, fixed here.) The ore.c file is getting too big, so new ore_raid.[hc] files are added that will include the special raid stuff that is not used in striping and mirrors. In future write support these will get bigger. When adding ore_raid.c to the Kbuild file I was forced to rename ore.ko to libore.ko. Is it possible to keep the source file, say ore.c, and the module file ore.ko the same even if there are multiple files inside ore.ko? Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/exofs/Kbuild3
-rw-r--r--fs/exofs/ore.c326
-rw-r--r--fs/exofs/ore_raid.c140
-rw-r--r--fs/exofs/ore_raid.h64
4 files changed, 455 insertions, 78 deletions
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index c5a5855a6c44..352ba149d23e 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -13,7 +13,8 @@
13# 13#
14 14
15# ore module library 15# ore module library
16obj-$(CONFIG_ORE) += ore.o 16libore-y := ore.o ore_raid.o
17obj-$(CONFIG_ORE) += libore.o
17 18
18exofs-y := inode.o file.o symlink.o namei.o dir.o super.o 19exofs-y := inode.o file.o symlink.o namei.o dir.o super.o
19obj-$(CONFIG_EXOFS_FS) += exofs.o 20obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index d92998d5c2d6..fd6090ddd3bf 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -24,24 +24,9 @@
24 24
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <asm/div64.h> 26#include <asm/div64.h>
27#include <linux/lcm.h>
27 28
28#include <scsi/osd_ore.h> 29#include "ore_raid.h"
29
30#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
31
32#ifdef CONFIG_EXOFS_DEBUG
33#define ORE_DBGMSG(fmt, a...) \
34 printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
35#else
36#define ORE_DBGMSG(fmt, a...) \
37 do { if (0) printk(fmt, ##a); } while (0)
38#endif
39
40/* u64 has problems with printk this will cast it to unsigned long long */
41#define _LLU(x) (unsigned long long)(x)
42
43#define ORE_DBGMSG2(M...) do {} while (0)
44/* #define ORE_DBGMSG2 ORE_DBGMSG */
45 30
46MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); 31MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
47MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); 32MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
@@ -133,21 +118,81 @@ static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
133 return ore_comp_dev(ios->oc, index); 118 return ore_comp_dev(ios->oc, index);
134} 119}
135 120
136static int _get_io_state(struct ore_layout *layout, 121static int _ore_get_io_state(struct ore_layout *layout,
137 struct ore_components *oc, unsigned numdevs, 122 struct ore_components *oc, unsigned numdevs,
138 struct ore_io_state **pios) 123 unsigned sgs_per_dev, unsigned num_par_pages,
124 struct ore_io_state **pios)
139{ 125{
140 struct ore_io_state *ios; 126 struct ore_io_state *ios;
127 struct page **pages;
128 struct osd_sg_entry *sgilist;
129 struct __alloc_all_io_state {
130 struct ore_io_state ios;
131 struct ore_per_dev_state per_dev[numdevs];
132 union {
133 struct osd_sg_entry sglist[sgs_per_dev * numdevs];
134 struct page *pages[num_par_pages];
135 };
136 } *_aios;
137
138 if (likely(sizeof(*_aios) <= PAGE_SIZE)) {
139 _aios = kzalloc(sizeof(*_aios), GFP_KERNEL);
140 if (unlikely(!_aios)) {
141 ORE_DBGMSG("Failed kzalloc bytes=%zd\n",
142 sizeof(*_aios));
143 *pios = NULL;
144 return -ENOMEM;
145 }
146 pages = num_par_pages ? _aios->pages : NULL;
147 sgilist = sgs_per_dev ? _aios->sglist : NULL;
148 ios = &_aios->ios;
149 } else {
150 struct __alloc_small_io_state {
151 struct ore_io_state ios;
152 struct ore_per_dev_state per_dev[numdevs];
153 } *_aio_small;
154 union __extra_part {
155 struct osd_sg_entry sglist[sgs_per_dev * numdevs];
156 struct page *pages[num_par_pages];
157 } *extra_part;
158
159 _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL);
160 if (unlikely(!_aio_small)) {
161 ORE_DBGMSG("Failed alloc first part bytes=%zd\n",
162 sizeof(*_aio_small));
163 *pios = NULL;
164 return -ENOMEM;
165 }
166 extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL);
167 if (unlikely(!extra_part)) {
168 ORE_DBGMSG("Failed alloc second part bytes=%zd\n",
169 sizeof(*extra_part));
170 kfree(_aio_small);
171 *pios = NULL;
172 return -ENOMEM;
173 }
141 174
142 /*TODO: Maybe use kmem_cach per sbi of size 175 pages = num_par_pages ? extra_part->pages : NULL;
143 * exofs_io_state_size(layout->s_numdevs) 176 sgilist = sgs_per_dev ? extra_part->sglist : NULL;
144 */ 177 /* In this case the per_dev[0].sgilist holds the pointer to
145 ios = kzalloc(ore_io_state_size(numdevs), GFP_KERNEL); 178 * be freed
146 if (unlikely(!ios)) { 179 */
147 ORE_DBGMSG("Failed kzalloc bytes=%d\n", 180 ios = &_aio_small->ios;
148 ore_io_state_size(numdevs)); 181 ios->extra_part_alloc = true;
149 *pios = NULL; 182 }
150 return -ENOMEM; 183
184 if (pages) {
185 ios->parity_pages = pages;
186 ios->max_par_pages = num_par_pages;
187 }
188 if (sgilist) {
189 unsigned d;
190
191 for (d = 0; d < numdevs; ++d) {
192 ios->per_dev[d].sglist = sgilist;
193 sgilist += sgs_per_dev;
194 }
195 ios->sgs_per_dev = sgs_per_dev;
151 } 196 }
152 197
153 ios->layout = layout; 198 ios->layout = layout;
@@ -178,9 +223,42 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
178{ 223{
179 struct ore_io_state *ios; 224 struct ore_io_state *ios;
180 unsigned numdevs = layout->group_width * layout->mirrors_p1; 225 unsigned numdevs = layout->group_width * layout->mirrors_p1;
226 unsigned sgs_per_dev = 0, max_par_pages = 0;
181 int ret; 227 int ret;
182 228
183 ret = _get_io_state(layout, oc, numdevs, pios); 229 if (layout->parity && length) {
230 unsigned data_devs = layout->group_width - layout->parity;
231 unsigned stripe_size = layout->stripe_unit * data_devs;
232 unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
233 u32 remainder;
234 u64 num_stripes;
235 u64 num_raid_units;
236
237 num_stripes = div_u64_rem(length, stripe_size, &remainder);
238 if (remainder)
239 ++num_stripes;
240
241 num_raid_units = num_stripes * layout->parity;
242
243 if (is_reading) {
244 /* For reads add per_dev sglist array */
245 /* TODO: Raid 6 we need twice more. Actually:
246 * num_stripes / LCMdP(W,P);
247 * if (W%P != 0) num_stripes *= parity;
248 */
249
250 /* first/last seg is split */
251 num_raid_units += layout->group_width;
252 sgs_per_dev = div_u64(num_raid_units, data_devs);
253 } else {
254 /* For Writes add parity pages array. */
255 max_par_pages = num_raid_units * pages_in_unit *
256 sizeof(struct page *);
257 }
258 }
259
260 ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages,
261 pios);
184 if (unlikely(ret)) 262 if (unlikely(ret))
185 return ret; 263 return ret;
186 264
@@ -189,10 +267,11 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
189 ios->offset = offset; 267 ios->offset = offset;
190 268
191 if (length) { 269 if (length) {
192 ore_calc_stripe_info(layout, offset, &ios->si); 270 ore_calc_stripe_info(layout, offset, length, &ios->si);
193 ios->length = (length <= ios->si.group_length) ? length : 271 ios->length = ios->si.length;
194 ios->si.group_length;
195 ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; 272 ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
273 if (layout->parity)
274 _ore_post_alloc_raid_stuff(ios);
196 } 275 }
197 276
198 return 0; 277 return 0;
@@ -209,7 +288,7 @@ EXPORT_SYMBOL(ore_get_rw_state);
209int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, 288int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
210 struct ore_io_state **pios) 289 struct ore_io_state **pios)
211{ 290{
212 return _get_io_state(layout, oc, oc->numdevs, pios); 291 return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios);
213} 292}
214EXPORT_SYMBOL(ore_get_io_state); 293EXPORT_SYMBOL(ore_get_io_state);
215 294
@@ -227,6 +306,7 @@ void ore_put_io_state(struct ore_io_state *ios)
227 bio_put(per_dev->bio); 306 bio_put(per_dev->bio);
228 } 307 }
229 308
309 _ore_free_raid_stuff(ios);
230 kfree(ios); 310 kfree(ios);
231 } 311 }
232} 312}
@@ -367,53 +447,65 @@ EXPORT_SYMBOL(ore_check_io);
367/* 447/*
368 * L - logical offset into the file 448 * L - logical offset into the file
369 * 449 *
370 * U - The number of bytes in a stripe within a group 450 * D - number of Data devices
451 * D = group_width - parity
371 * 452 *
372 * U = stripe_unit * group_width 453 * U - The number of bytes in a stripe within a group
454 * U = stripe_unit * D
373 * 455 *
374 * T - The number of bytes striped within a group of component objects 456 * T - The number of bytes striped within a group of component objects
375 * (before advancing to the next group) 457 * (before advancing to the next group)
376 * 458 * T = U * group_depth
377 * T = stripe_unit * group_width * group_depth
378 * 459 *
379 * S - The number of bytes striped across all component objects 460 * S - The number of bytes striped across all component objects
380 * before the pattern repeats 461 * before the pattern repeats
462 * S = T * group_count
381 * 463 *
382 * S = stripe_unit * group_width * group_depth * group_count 464 * M - The "major" (i.e., across all components) cycle number
383 *
384 * M - The "major" (i.e., across all components) stripe number
385 *
386 * M = L / S 465 * M = L / S
387 * 466 *
388 * G - Counts the groups from the beginning of the major stripe 467 * G - Counts the groups from the beginning of the major cycle
389 *
390 * G = (L - (M * S)) / T [or (L % S) / T] 468 * G = (L - (M * S)) / T [or (L % S) / T]
391 * 469 *
392 * H - The byte offset within the group 470 * H - The byte offset within the group
393 *
394 * H = (L - (M * S)) % T [or (L % S) % T] 471 * H = (L - (M * S)) % T [or (L % S) % T]
395 * 472 *
396 * N - The "minor" (i.e., across the group) stripe number 473 * N - The "minor" (i.e., across the group) stripe number
397 *
398 * N = H / U 474 * N = H / U
399 * 475 *
400 * C - The component index coresponding to L 476 * C - The component index coresponding to L
401 * 477 *
402 * C = (H - (N * U)) / stripe_unit + G * group_width 478 * C = (H - (N * U)) / stripe_unit + G * D
403 * [or (L % U) / stripe_unit + G * group_width] 479 * [or (L % U) / stripe_unit + G * D]
404 * 480 *
405 * O - The component offset coresponding to L 481 * O - The component offset coresponding to L
406 *
407 * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit 482 * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
483 *
484 * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity
485 * divide by parity
486 * LCMdP = lcm(group_width, parity) / parity
487 *
488 * R - The parity Rotation stripe
489 * (Note parity cycle always starts at a group's boundary)
490 * R = N % LCMdP
491 *
492 * I = the first parity device index
493 * I = (group_width + group_width - R*parity - parity) % group_width
494 *
495 * Craid - The component index Rotated
496 * Craid = (group_width + C - R*parity) % group_width
497 * (We add the group_width to avoid negative numbers modulo math)
408 */ 498 */
409void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, 499void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
410 struct ore_striping_info *si) 500 u64 length, struct ore_striping_info *si)
411{ 501{
412 u32 stripe_unit = layout->stripe_unit; 502 u32 stripe_unit = layout->stripe_unit;
413 u32 group_width = layout->group_width; 503 u32 group_width = layout->group_width;
414 u64 group_depth = layout->group_depth; 504 u64 group_depth = layout->group_depth;
505 u32 parity = layout->parity;
415 506
416 u32 U = stripe_unit * group_width; 507 u32 D = group_width - parity;
508 u32 U = D * stripe_unit;
417 u64 T = U * group_depth; 509 u64 T = U * group_depth;
418 u64 S = T * layout->group_count; 510 u64 S = T * layout->group_count;
419 u64 M = div64_u64(file_offset, S); 511 u64 M = div64_u64(file_offset, S);
@@ -429,22 +521,43 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
429 u32 N = div_u64(H, U); 521 u32 N = div_u64(H, U);
430 522
431 /* "H - (N * U)" is just "H % U" so it's bound to u32 */ 523 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
432 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; 524 u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
433 si->dev *= layout->mirrors_p1;
434 525
435 div_u64_rem(file_offset, stripe_unit, &si->unit_off); 526 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
436 527
437 si->obj_offset = si->unit_off + (N * stripe_unit) + 528 si->obj_offset = si->unit_off + (N * stripe_unit) +
438 (M * group_depth * stripe_unit); 529 (M * group_depth * stripe_unit);
439 530
440 si->group_length = T - H; 531 if (parity) {
532 u32 LCMdP = lcm(group_width, parity) / parity;
533 /* R = N % LCMdP; */
534 u32 RxP = (N % LCMdP) * parity;
535 u32 first_dev = C - C % group_width;
536
537 si->par_dev = (group_width + group_width - parity - RxP) %
538 group_width + first_dev;
539 si->dev = (group_width + C - RxP) % group_width + first_dev;
540 si->bytes_in_stripe = U;
541 si->first_stripe_start = M * S + G * T + N * U;
542 } else {
543 /* Make the math correct see _prepare_one_group */
544 si->par_dev = group_width;
545 si->dev = C;
546 }
547
548 si->dev *= layout->mirrors_p1;
549 si->par_dev *= layout->mirrors_p1;
550 si->offset = file_offset;
551 si->length = T - H;
552 if (si->length > length)
553 si->length = length;
441 si->M = M; 554 si->M = M;
442} 555}
443EXPORT_SYMBOL(ore_calc_stripe_info); 556EXPORT_SYMBOL(ore_calc_stripe_info);
444 557
445static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, 558int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
446 unsigned pgbase, struct ore_per_dev_state *per_dev, 559 unsigned pgbase, struct page **pages,
447 int cur_len) 560 struct ore_per_dev_state *per_dev, int cur_len)
448{ 561{
449 unsigned pg = *cur_pg; 562 unsigned pg = *cur_pg;
450 struct request_queue *q = 563 struct request_queue *q =
@@ -455,8 +568,11 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
455 if (per_dev->bio == NULL) { 568 if (per_dev->bio == NULL) {
456 unsigned pages_in_stripe = ios->layout->group_width * 569 unsigned pages_in_stripe = ios->layout->group_width *
457 (ios->layout->stripe_unit / PAGE_SIZE); 570 (ios->layout->stripe_unit / PAGE_SIZE);
458 unsigned bio_size = (ios->nr_pages + pages_in_stripe) / 571 unsigned nr_pages = ios->nr_pages * ios->layout->group_width /
459 ios->layout->group_width; 572 (ios->layout->group_width -
573 ios->layout->parity);
574 unsigned bio_size = (nr_pages + pages_in_stripe) /
575 ios->layout->group_width;
460 576
461 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); 577 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
462 if (unlikely(!per_dev->bio)) { 578 if (unlikely(!per_dev->bio)) {
@@ -471,12 +587,13 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
471 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); 587 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
472 unsigned added_len; 588 unsigned added_len;
473 589
474 BUG_ON(ios->nr_pages <= pg);
475 cur_len -= pglen; 590 cur_len -= pglen;
476 591
477 added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], 592 added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
478 pglen, pgbase); 593 pglen, pgbase);
479 if (unlikely(pglen != added_len)) { 594 if (unlikely(pglen != added_len)) {
595 ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n",
596 per_dev->bio->bi_vcnt);
480 ret = -ENOMEM; 597 ret = -ENOMEM;
481 goto out; 598 goto out;
482 } 599 }
@@ -501,9 +618,11 @@ static int _prepare_for_striping(struct ore_io_state *ios)
501 struct ore_striping_info *si = &ios->si; 618 struct ore_striping_info *si = &ios->si;
502 unsigned stripe_unit = ios->layout->stripe_unit; 619 unsigned stripe_unit = ios->layout->stripe_unit;
503 unsigned mirrors_p1 = ios->layout->mirrors_p1; 620 unsigned mirrors_p1 = ios->layout->mirrors_p1;
504 unsigned devs_in_group = ios->layout->group_width * mirrors_p1; 621 unsigned group_width = ios->layout->group_width;
622 unsigned devs_in_group = group_width * mirrors_p1;
505 unsigned dev = si->dev; 623 unsigned dev = si->dev;
506 unsigned first_dev = dev - (dev % devs_in_group); 624 unsigned first_dev = dev - (dev % devs_in_group);
625 unsigned dev_order;
507 unsigned cur_pg = ios->pages_consumed; 626 unsigned cur_pg = ios->pages_consumed;
508 u64 length = ios->length; 627 u64 length = ios->length;
509 int ret = 0; 628 int ret = 0;
@@ -513,7 +632,10 @@ static int _prepare_for_striping(struct ore_io_state *ios)
513 return 0; 632 return 0;
514 } 633 }
515 634
516 BUG_ON(length > si->group_length); 635 BUG_ON(length > si->length);
636
637 dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev);
638 si->cur_comp = dev_order;
517 639
518 while (length) { 640 while (length) {
519 unsigned comp = dev - first_dev; 641 unsigned comp = dev - first_dev;
@@ -522,17 +644,20 @@ static int _prepare_for_striping(struct ore_io_state *ios)
522 644
523 if (!per_dev->length) { 645 if (!per_dev->length) {
524 per_dev->dev = dev; 646 per_dev->dev = dev;
525 if (dev < si->dev) { 647 if (dev == si->dev) {
526 per_dev->offset = si->obj_offset + stripe_unit - 648 WARN_ON(dev == si->par_dev);
527 si->unit_off;
528 cur_len = stripe_unit;
529 } else if (dev == si->dev) {
530 per_dev->offset = si->obj_offset; 649 per_dev->offset = si->obj_offset;
531 cur_len = stripe_unit - si->unit_off; 650 cur_len = stripe_unit - si->unit_off;
532 page_off = si->unit_off & ~PAGE_MASK; 651 page_off = si->unit_off & ~PAGE_MASK;
533 BUG_ON(page_off && (page_off != ios->pgbase)); 652 BUG_ON(page_off && (page_off != ios->pgbase));
534 } else { /* dev > si->dev */ 653 } else {
535 per_dev->offset = si->obj_offset - si->unit_off; 654 if (si->cur_comp > dev_order)
655 per_dev->offset =
656 si->obj_offset - si->unit_off;
657 else /* si->cur_comp < dev_order */
658 per_dev->offset =
659 si->obj_offset + stripe_unit -
660 si->unit_off;
536 cur_len = stripe_unit; 661 cur_len = stripe_unit;
537 } 662 }
538 } else { 663 } else {
@@ -541,8 +666,8 @@ static int _prepare_for_striping(struct ore_io_state *ios)
541 if (cur_len >= length) 666 if (cur_len >= length)
542 cur_len = length; 667 cur_len = length;
543 668
544 ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, 669 ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages,
545 cur_len); 670 per_dev, cur_len);
546 if (unlikely(ret)) 671 if (unlikely(ret))
547 goto out; 672 goto out;
548 673
@@ -550,6 +675,41 @@ static int _prepare_for_striping(struct ore_io_state *ios)
550 dev = (dev % devs_in_group) + first_dev; 675 dev = (dev % devs_in_group) + first_dev;
551 676
552 length -= cur_len; 677 length -= cur_len;
678
679 si->cur_comp = (si->cur_comp + 1) % group_width;
680 if (unlikely((dev == si->par_dev) ||
681 (!length && ios->parity_pages))) {
682 if (!length)
683 /* If we are writing and this is the very last
684 * stripe. then operate on parity dev.
685 */
686 dev = si->par_dev;
687 if (ios->reading)
688 /* In writes cur_len just means if it's the
689 * last one. See _ore_add_parity_unit.
690 */
691 cur_len = length;
692 per_dev = &ios->per_dev[dev - first_dev];
693 if (!per_dev->length) {
694 /* Only/always the parity unit of the first
695 * stripe will be empty. So this is a chance to
696 * initialize the per_dev info.
697 */
698 per_dev->dev = dev;
699 per_dev->offset = si->obj_offset - si->unit_off;
700 }
701
702 ret = _ore_add_parity_unit(ios, si, per_dev, cur_len);
703 if (unlikely(ret))
704 goto out;
705
706 /* Rotate next par_dev backwards with wraping */
707 si->par_dev = (devs_in_group + si->par_dev -
708 ios->layout->parity * mirrors_p1) %
709 devs_in_group + first_dev;
710 /* Next stripe, start fresh */
711 si->cur_comp = 0;
712 }
553 } 713 }
554out: 714out:
555 ios->numdevs = devs_in_group; 715 ios->numdevs = devs_in_group;
@@ -747,12 +907,24 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp)
747 per_dev->or = or; 907 per_dev->or = or;
748 908
749 if (ios->pages) { 909 if (ios->pages) {
750 osd_req_read(or, obj, per_dev->offset, 910 if (per_dev->cur_sg) {
751 per_dev->bio, per_dev->length); 911 /* finalize the last sg_entry */
912 _ore_add_sg_seg(per_dev, 0, false);
913 if (unlikely(!per_dev->cur_sg))
914 return 0; /* Skip parity only device */
915
916 osd_req_read_sg(or, obj, per_dev->bio,
917 per_dev->sglist, per_dev->cur_sg);
918 } else {
919 /* The no raid case */
920 osd_req_read(or, obj, per_dev->offset,
921 per_dev->bio, per_dev->length);
922 }
923
752 ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" 924 ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
753 " dev=%d\n", _LLU(obj->id), 925 " dev=%d sg_len=%d\n", _LLU(obj->id),
754 _LLU(per_dev->offset), _LLU(per_dev->length), 926 _LLU(per_dev->offset), _LLU(per_dev->length),
755 first_dev); 927 first_dev, per_dev->cur_sg);
756 } else { 928 } else {
757 BUG_ON(ios->kern_buff); 929 BUG_ON(ios->kern_buff);
758 930
@@ -849,7 +1021,7 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
849{ 1021{
850 unsigned stripe_unit = layout->stripe_unit; 1022 unsigned stripe_unit = layout->stripe_unit;
851 1023
852 ore_calc_stripe_info(layout, file_offset, &ti->si); 1024 ore_calc_stripe_info(layout, file_offset, 0, &ti->si);
853 1025
854 ti->prev_group_obj_off = ti->si.M * stripe_unit; 1026 ti->prev_group_obj_off = ti->si.M * stripe_unit;
855 ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; 1027 ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
new file mode 100644
index 000000000000..8d4b93a93c67
--- /dev/null
+++ b/fs/exofs/ore_raid.c
@@ -0,0 +1,140 @@
1/*
2 * Copyright (C) 2011
3 * Boaz Harrosh <bharrosh@panasas.com>
4 *
5 * This file is part of the objects raid engine (ore).
6 *
7 * It is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as published
9 * by the Free Software Foundation.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with "ore". If not, write to the Free Software Foundation, Inc:
13 * "Free Software Foundation <info@fsf.org>"
14 */
15
16#include <linux/gfp.h>
17
18#include "ore_raid.h"
19
20struct page *_raid_page_alloc(void)
21{
22 return alloc_page(GFP_KERNEL);
23}
24
/* Release the single page @p with __free_page().
 *
 * _ore_free_raid_stuff() uses this for every non-NULL entry of the
 * parity_pages array.
 */
void _raid_page_free(struct page *p)
{
	__free_page(p);
}
29
30void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
31 bool not_last)
32{
33 struct osd_sg_entry *sge;
34
35 ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d "
36 "offset=0x%llx length=0x%x last_sgs_total=0x%x\n",
37 per_dev->dev, cur_len, not_last, per_dev->cur_sg,
38 _LLU(per_dev->offset), per_dev->length,
39 per_dev->last_sgs_total);
40
41 if (!per_dev->cur_sg) {
42 sge = per_dev->sglist;
43
44 /* First time we prepare two entries */
45 if (per_dev->length) {
46 ++per_dev->cur_sg;
47 sge->offset = per_dev->offset;
48 sge->len = per_dev->length;
49 } else {
50 /* Here the parity is the first unit of this object.
51 * This happens every time we reach a parity device on
52 * the same stripe as the per_dev->offset. We need to
53 * just skip this unit.
54 */
55 per_dev->offset += cur_len;
56 return;
57 }
58 } else {
59 /* finalize the last one */
60 sge = &per_dev->sglist[per_dev->cur_sg - 1];
61 sge->len = per_dev->length - per_dev->last_sgs_total;
62 }
63
64 if (not_last) {
65 /* Partly prepare the next one */
66 struct osd_sg_entry *next_sge = sge + 1;
67
68 ++per_dev->cur_sg;
69 next_sge->offset = sge->offset + sge->len + cur_len;
70 /* Save cur len so we know how mutch was added next time */
71 per_dev->last_sgs_total = per_dev->length;
72 next_sge->len = 0;
73 } else if (!sge->len) {
74 /* Optimize for when the last unit is a parity */
75 --per_dev->cur_sg;
76 }
77}
78
79/* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */
80int _ore_add_parity_unit(struct ore_io_state *ios,
81 struct ore_striping_info *si,
82 struct ore_per_dev_state *per_dev,
83 unsigned cur_len)
84{
85 if (ios->reading) {
86 BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev);
87 _ore_add_sg_seg(per_dev, cur_len, true);
88 } else {
89 struct page **pages = ios->parity_pages + ios->cur_par_page;
90 unsigned num_pages = ios->layout->stripe_unit / PAGE_SIZE;
91 unsigned array_start = 0;
92 unsigned i;
93 int ret;
94
95 for (i = 0; i < num_pages; i++) {
96 pages[i] = _raid_page_alloc();
97 if (unlikely(!pages[i]))
98 return -ENOMEM;
99
100 ++(ios->cur_par_page);
101 /* TODO: only read support for now */
102 clear_highpage(pages[i]);
103 }
104
105 ORE_DBGMSG("writing dev=%d num_pages=%d cur_par_page=%d",
106 per_dev->dev, num_pages, ios->cur_par_page);
107
108 ret = _ore_add_stripe_unit(ios, &array_start, 0, pages,
109 per_dev, num_pages * PAGE_SIZE);
110 if (unlikely(ret))
111 return ret;
112 }
113 return 0;
114}
115
/* Hook called by ore_get_rw_state() after the io_state was allocated, when
 * the layout has parity. Currently a stub that always returns 0.
 */
int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
{
	/* TODO: Only raid writes have stuff to add here */
	return 0;
}
121
122void _ore_free_raid_stuff(struct ore_io_state *ios)
123{
124 if (ios->parity_pages) { /* writing and raid */
125 unsigned i;
126
127 for (i = 0; i < ios->cur_par_page; i++) {
128 struct page *page = ios->parity_pages[i];
129
130 if (page)
131 _raid_page_free(page);
132 }
133 if (ios->extra_part_alloc)
134 kfree(ios->parity_pages);
135 } else {
136 /* Will only be set if raid reading && sglist is big */
137 if (ios->extra_part_alloc)
138 kfree(ios->per_dev[0].sglist);
139 }
140}
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
new file mode 100644
index 000000000000..c21080b4407f
--- /dev/null
+++ b/fs/exofs/ore_raid.h
@@ -0,0 +1,64 @@
1/*
2 * Copyright (C) from 2011
3 * Boaz Harrosh <bharrosh@panasas.com>
4 *
5 * This file is part of the objects raid engine (ore).
6 *
7 * It is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as published
9 * by the Free Software Foundation.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with "ore". If not, write to the Free Software Foundation, Inc:
13 * "Free Software Foundation <info@fsf.org>"
14 */
15
16#include <scsi/osd_ore.h>
17
/* Error messages are always printed, prefixed with "ore: ". */
#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)

/* Debug messages: printed with function name and line number when
 * CONFIG_EXOFS_DEBUG is set; otherwise never executed, but the arguments
 * still go through printk() format checking via the dead "if (0)" branch.
 */
#ifdef CONFIG_EXOFS_DEBUG
#define ORE_DBGMSG(fmt, a...) \
	printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
#else
#define ORE_DBGMSG(fmt, a...) \
	do { if (0) printk(fmt, ##a); } while (0)
#endif

/* u64 has problems with printk; this casts it to unsigned long long.
 * The whole expansion is parenthesized so the macro behaves as a single
 * expression in any context (e.g. as the operand of sizeof).
 */
#define _LLU(x) ((unsigned long long)(x))

/* Very verbose debug messages, compiled out by default. */
#define ORE_DBGMSG2(M...) do {} while (0)
/* #define ORE_DBGMSG2 ORE_DBGMSG */
33
/* Compute the component order of @dev inside its stripe, i.e. the logical
 * data-unit position of @dev within the stripe, given the stripe's parity
 * device @par_dev.
 *
 * @devs_in_group: number of devices in a group (mirrors included).
 * @mirrors_p1:    number of mirrors plus one.
 * @par_dev:       parity device of this stripe; a value that lands exactly
 *                 on devs_in_group once made group-relative selects the
 *                 no-parity (raid 0) calculation.
 * @dev:           device whose order is wanted.
 *
 * Both device numbers are first made relative to the start of @dev's group.
 */
static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1,
				  unsigned par_dev, unsigned dev)
{
	unsigned group_base = dev - dev % devs_in_group;
	unsigned rel_dev = dev - group_base;
	unsigned rel_par = par_dev - group_base;
	unsigned rotated;

	/* The raid 0 case: order is simply the device position */
	if (rel_par == devs_in_group)
		return rel_dev / mirrors_p1;

	/* raid4/5/6 case: count positions starting just after the parity
	 * device, adding devs_in_group to keep the modulo math non-negative.
	 */
	rotated = (devs_in_group + rel_dev - rel_par - mirrors_p1) %
		  devs_in_group;
	return rotated / mirrors_p1;
}
51
/* ore_raid.c stuff needed by ore.c */
int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);
void _ore_free_raid_stuff(struct ore_io_state *ios);

void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
		     bool not_last);
int _ore_add_parity_unit(struct ore_io_state *ios,
			 struct ore_striping_info *si,
			 struct ore_per_dev_state *per_dev,
			 unsigned cur_len);

/* ore.c stuff needed by ore_raid.c */
int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
			 unsigned pgbase, struct page **pages,
			 struct ore_per_dev_state *per_dev, int cur_len);