 fs/exofs/Kbuild        |   3
 fs/exofs/ore.c         | 326
 fs/exofs/ore_raid.c    | 140
 fs/exofs/ore_raid.h    |  64
 include/scsi/osd_ore.h |  21
 5 files changed, 473 insertions(+), 81 deletions(-)
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index c5a5855a6c44..352ba149d23e 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -13,7 +13,8 @@
 #
 
 # ore module library
-obj-$(CONFIG_ORE) += ore.o
+libore-y := ore.o ore_raid.o
+obj-$(CONFIG_ORE) += libore.o
 
 exofs-y := inode.o file.o symlink.o namei.o dir.o super.o
 obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index d92998d5c2d6..fd6090ddd3bf 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -24,24 +24,9 @@
 
 #include <linux/slab.h>
 #include <asm/div64.h>
+#include <linux/lcm.h>
 
-#include <scsi/osd_ore.h>
-
-#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
-
-#ifdef CONFIG_EXOFS_DEBUG
-#define ORE_DBGMSG(fmt, a...) \
-	printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
-#else
-#define ORE_DBGMSG(fmt, a...) \
-	do { if (0) printk(fmt, ##a); } while (0)
-#endif
-
-/* u64 has problems with printk this will cast it to unsigned long long */
-#define _LLU(x) (unsigned long long)(x)
-
-#define ORE_DBGMSG2(M...) do {} while (0)
-/* #define ORE_DBGMSG2 ORE_DBGMSG */
+#include "ore_raid.h"
 
 MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
 MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
@@ -133,21 +118,81 @@ static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
 	return ore_comp_dev(ios->oc, index);
 }
 
-static int _get_io_state(struct ore_layout *layout,
-			 struct ore_components *oc, unsigned numdevs,
-			 struct ore_io_state **pios)
+static int _ore_get_io_state(struct ore_layout *layout,
+			     struct ore_components *oc, unsigned numdevs,
+			     unsigned sgs_per_dev, unsigned num_par_pages,
+			     struct ore_io_state **pios)
 {
 	struct ore_io_state *ios;
+	struct page **pages;
+	struct osd_sg_entry *sgilist;
+	struct __alloc_all_io_state {
+		struct ore_io_state ios;
+		struct ore_per_dev_state per_dev[numdevs];
+		union {
+			struct osd_sg_entry sglist[sgs_per_dev * numdevs];
+			struct page *pages[num_par_pages];
+		};
+	} *_aios;
+
+	if (likely(sizeof(*_aios) <= PAGE_SIZE)) {
+		_aios = kzalloc(sizeof(*_aios), GFP_KERNEL);
+		if (unlikely(!_aios)) {
+			ORE_DBGMSG("Failed kzalloc bytes=%zd\n",
+				   sizeof(*_aios));
+			*pios = NULL;
+			return -ENOMEM;
+		}
+		pages = num_par_pages ? _aios->pages : NULL;
+		sgilist = sgs_per_dev ? _aios->sglist : NULL;
+		ios = &_aios->ios;
+	} else {
+		struct __alloc_small_io_state {
+			struct ore_io_state ios;
+			struct ore_per_dev_state per_dev[numdevs];
+		} *_aio_small;
+		union __extra_part {
+			struct osd_sg_entry sglist[sgs_per_dev * numdevs];
+			struct page *pages[num_par_pages];
+		} *extra_part;
+
+		_aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL);
+		if (unlikely(!_aio_small)) {
+			ORE_DBGMSG("Failed alloc first part bytes=%zd\n",
+				   sizeof(*_aio_small));
+			*pios = NULL;
+			return -ENOMEM;
+		}
+		extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL);
+		if (unlikely(!extra_part)) {
+			ORE_DBGMSG("Failed alloc second part bytes=%zd\n",
+				   sizeof(*extra_part));
+			kfree(_aio_small);
+			*pios = NULL;
+			return -ENOMEM;
+		}
 
-	/*TODO: Maybe use kmem_cach per sbi of size
-	 * exofs_io_state_size(layout->s_numdevs)
-	 */
-	ios = kzalloc(ore_io_state_size(numdevs), GFP_KERNEL);
-	if (unlikely(!ios)) {
-		ORE_DBGMSG("Failed kzalloc bytes=%d\n",
-			   ore_io_state_size(numdevs));
-		*pios = NULL;
-		return -ENOMEM;
+		pages = num_par_pages ? extra_part->pages : NULL;
+		sgilist = sgs_per_dev ? extra_part->sglist : NULL;
+		/* In this case the per_dev[0].sgilist holds the pointer to
+		 * be freed
+		 */
+		ios = &_aio_small->ios;
+		ios->extra_part_alloc = true;
+	}
+
+	if (pages) {
+		ios->parity_pages = pages;
+		ios->max_par_pages = num_par_pages;
+	}
+	if (sgilist) {
+		unsigned d;
+
+		for (d = 0; d < numdevs; ++d) {
+			ios->per_dev[d].sglist = sgilist;
+			sgilist += sgs_per_dev;
+		}
+		ios->sgs_per_dev = sgs_per_dev;
 	}
 
 	ios->layout = layout;
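The new _ore_get_io_state() above sizes everything in one go: the io_state, the per-device array, and either the per-device scatter-gather lists (reads) or the parity page-pointer array (writes) share a single allocation when the whole thing fits in a page, and fall back to two allocations otherwise, with extra_part_alloc recording who owns the second block. A minimal user-space sketch of that decision, with made-up structure sizes standing in for the kernel types (error handling trimmed):

```c
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096u

/* Stand-ins for the kernel structures; the sizes are purely illustrative. */
struct io_state { char opaque[128]; };
struct per_dev  { char opaque[96];  };
struct sg_entry { char opaque[16];  };

/* Returns one zeroed block when everything fits in a page, else two blocks. */
static void *alloc_io_state(unsigned numdevs, unsigned sgs_per_dev,
			    unsigned num_par_pages, void **extra_part)
{
	size_t core  = sizeof(struct io_state) + numdevs * sizeof(struct per_dev);
	size_t sglen = (size_t)sgs_per_dev * numdevs * sizeof(struct sg_entry);
	size_t pglen = (size_t)num_par_pages * sizeof(void *);
	size_t extra = sglen > pglen ? sglen : pglen;	/* the union in the patch */

	*extra_part = NULL;
	if (core + extra <= PAGE_SIZE)
		return calloc(1, core + extra);		/* the _aios path */

	*extra_part = calloc(1, extra);			/* second allocation */
	return calloc(1, core);				/* the _aio_small path */
}

int main(void)
{
	void *extra, *ios = alloc_io_state(8, 17, 0, &extra);

	printf("split allocation used: %s\n", extra ? "yes" : "no");
	free(extra);
	free(ios);
	return 0;
}
```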
@@ -178,9 +223,42 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
 {
 	struct ore_io_state *ios;
 	unsigned numdevs = layout->group_width * layout->mirrors_p1;
+	unsigned sgs_per_dev = 0, max_par_pages = 0;
 	int ret;
 
-	ret = _get_io_state(layout, oc, numdevs, pios);
+	if (layout->parity && length) {
+		unsigned data_devs = layout->group_width - layout->parity;
+		unsigned stripe_size = layout->stripe_unit * data_devs;
+		unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
+		u32 remainder;
+		u64 num_stripes;
+		u64 num_raid_units;
+
+		num_stripes = div_u64_rem(length, stripe_size, &remainder);
+		if (remainder)
+			++num_stripes;
+
+		num_raid_units = num_stripes * layout->parity;
+
+		if (is_reading) {
+			/* For reads add per_dev sglist array */
+			/* TODO: Raid 6 we need twice more. Actually:
+			 * num_stripes / LCMdP(W,P);
+			 * if (W%P != 0) num_stripes *= parity;
+			 */
+
+			/* first/last seg is split */
+			num_raid_units += layout->group_width;
+			sgs_per_dev = div_u64(num_raid_units, data_devs);
+		} else {
+			/* For Writes add parity pages array. */
+			max_par_pages = num_raid_units * pages_in_unit *
+						sizeof(struct page *);
+		}
+	}
+
+	ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages,
+				pios);
 	if (unlikely(ret))
 		return ret;
 
@@ -189,10 +267,11 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
 	ios->offset = offset;
 
 	if (length) {
-		ore_calc_stripe_info(layout, offset, &ios->si);
-		ios->length = (length <= ios->si.group_length) ? length :
-							ios->si.group_length;
+		ore_calc_stripe_info(layout, offset, length, &ios->si);
+		ios->length = ios->si.length;
 		ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
+		if (layout->parity)
+			_ore_post_alloc_raid_stuff(ios);
 	}
 
 	return 0;
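To make the sizing above concrete: for a hypothetical layout with group_width=4, parity=1, stripe_unit=64KiB and 4KiB pages, a 1MiB request works out as follows (example numbers only, nothing here is mandated by the patch):

```text
data_devs      = 4 - 1                   = 3
stripe_size    = 64KiB * 3               = 192KiB
pages_in_unit  = 64KiB / 4KiB            = 16
num_stripes    = roundup(1MiB / 192KiB)  = 6
num_raid_units = 6 * parity              = 6

read:  num_raid_units += group_width     -> 10
       sgs_per_dev = 10 / 3              = 3 sg entries reserved per device
write: max_par_pages = 6 * 16 * sizeof(struct page *)  (= 768 with 8-byte pointers)
```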
@@ -209,7 +288,7 @@ EXPORT_SYMBOL(ore_get_rw_state);
 int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
 		     struct ore_io_state **pios)
 {
-	return _get_io_state(layout, oc, oc->numdevs, pios);
+	return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios);
 }
 EXPORT_SYMBOL(ore_get_io_state);
 
@@ -227,6 +306,7 @@ void ore_put_io_state(struct ore_io_state *ios)
 			bio_put(per_dev->bio);
 		}
 
+		_ore_free_raid_stuff(ios);
 		kfree(ios);
 	}
 }
@@ -367,53 +447,65 @@ EXPORT_SYMBOL(ore_check_io);
 /*
  * L - logical offset into the file
  *
- * U - The number of bytes in a stripe within a group
+ * D - number of Data devices
+ *	D = group_width - parity
  *
- *	U = stripe_unit * group_width
+ * U - The number of bytes in a stripe within a group
+ *	U = stripe_unit * D
  *
  * T - The number of bytes striped within a group of component objects
  *	(before advancing to the next group)
- *
- *	T = stripe_unit * group_width * group_depth
+ *	T = U * group_depth
  *
  * S - The number of bytes striped across all component objects
  *	before the pattern repeats
+ *	S = T * group_count
  *
- *	S = stripe_unit * group_width * group_depth * group_count
- *
- * M - The "major" (i.e., across all components) stripe number
- *
+ * M - The "major" (i.e., across all components) cycle number
  *	M = L / S
  *
- * G - Counts the groups from the beginning of the major stripe
- *
+ * G - Counts the groups from the beginning of the major cycle
  *	G = (L - (M * S)) / T [or (L % S) / T]
  *
  * H - The byte offset within the group
- *
  *	H = (L - (M * S)) % T [or (L % S) % T]
  *
  * N - The "minor" (i.e., across the group) stripe number
- *
  *	N = H / U
  *
  * C - The component index coresponding to L
  *
- *	C = (H - (N * U)) / stripe_unit + G * group_width
- *	[or (L % U) / stripe_unit + G * group_width]
+ *	C = (H - (N * U)) / stripe_unit + G * D
+ *	[or (L % U) / stripe_unit + G * D]
  *
  * O - The component offset coresponding to L
- *
  *	O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
+ *
+ * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity
+ *          divide by parity
+ *	LCMdP = lcm(group_width, parity) / parity
+ *
+ * R - The parity Rotation stripe
+ *     (Note parity cycle always starts at a group's boundary)
+ *	R = N % LCMdP
+ *
+ * I = the first parity device index
+ *	I = (group_width + group_width - R*parity - parity) % group_width
+ *
+ * Craid - The component index Rotated
+ *	Craid = (group_width + C - R*parity) % group_width
+ *      (We add the group_width to avoid negative numbers modulo math)
  */
 void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
-			  struct ore_striping_info *si)
+			  u64 length, struct ore_striping_info *si)
 {
 	u32 stripe_unit = layout->stripe_unit;
 	u32 group_width = layout->group_width;
 	u64 group_depth = layout->group_depth;
+	u32 parity = layout->parity;
 
-	u32 U = stripe_unit * group_width;
+	u32 D = group_width - parity;
+	u32 U = D * stripe_unit;
 	u64 T = U * group_depth;
 	u64 S = T * layout->group_count;
 	u64 M = div64_u64(file_offset, S);
@@ -429,22 +521,43 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
 	u32 N = div_u64(H, U);
 
 	/* "H - (N * U)" is just "H % U" so it's bound to u32 */
-	si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
-	si->dev *= layout->mirrors_p1;
+	u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
 
 	div_u64_rem(file_offset, stripe_unit, &si->unit_off);
 
 	si->obj_offset = si->unit_off + (N * stripe_unit) +
 				(M * group_depth * stripe_unit);
 
-	si->group_length = T - H;
+	if (parity) {
+		u32 LCMdP = lcm(group_width, parity) / parity;
+		/* R = N % LCMdP; */
+		u32 RxP = (N % LCMdP) * parity;
+		u32 first_dev = C - C % group_width;
+
+		si->par_dev = (group_width + group_width - parity - RxP) %
+			      group_width + first_dev;
+		si->dev = (group_width + C - RxP) % group_width + first_dev;
+		si->bytes_in_stripe = U;
+		si->first_stripe_start = M * S + G * T + N * U;
+	} else {
+		/* Make the math correct see _prepare_one_group */
+		si->par_dev = group_width;
+		si->dev = C;
+	}
+
+	si->dev *= layout->mirrors_p1;
+	si->par_dev *= layout->mirrors_p1;
+	si->offset = file_offset;
+	si->length = T - H;
+	if (si->length > length)
+		si->length = length;
 	si->M = M;
 }
 EXPORT_SYMBOL(ore_calc_stripe_info);
 
-static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
-		unsigned pgbase, struct ore_per_dev_state *per_dev,
-		int cur_len)
+int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
+		unsigned pgbase, struct page **pages,
+		struct ore_per_dev_state *per_dev, int cur_len)
 {
 	unsigned pg = *cur_pg;
 	struct request_queue *q =
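Here is the updated math worked through once, for an arbitrary example layout (stripe_unit=64KiB, group_width=4, parity=1, group_depth=16, group_count=2, mirrors_p1=1) at file_offset L=7MiB; the numbers are purely illustrative:

```text
D = 4 - 1 = 3                  U = 3 * 64KiB = 192KiB
T = U * 16 = 3MiB              S = T * 2     = 6MiB

M = 7MiB / 6MiB = 1            L % S         = 1MiB
G = 1MiB / 3MiB = 0            H             = 1MiB
N = H / U = 5                  unit_off = 7MiB % 64KiB = 0
C = (H - N*U) / stripe_unit + G*D = 64KiB / 64KiB + 0 = 1
obj_offset = 0 + 5*64KiB + 1*16*64KiB = 1344KiB

LCMdP = lcm(4, 1) / 1 = 4      R = N % LCMdP = 1,  RxP = R * parity = 1
first_dev = C - C % 4 = 0
si->par_dev = (4 + 4 - 1 - 1) % 4 + 0 = 2
si->dev     = (4 + 1 - 1) % 4 + 0     = 0
si->length  = T - H = 2MiB (then clamped to the requested length)
```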
@@ -455,8 +568,11 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
 	if (per_dev->bio == NULL) {
 		unsigned pages_in_stripe = ios->layout->group_width *
 					(ios->layout->stripe_unit / PAGE_SIZE);
-		unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
-					ios->layout->group_width;
+		unsigned nr_pages = ios->nr_pages * ios->layout->group_width /
+					(ios->layout->group_width -
+					 ios->layout->parity);
+		unsigned bio_size = (nr_pages + pages_in_stripe) /
+					ios->layout->group_width;
 
 		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
 		if (unlikely(!per_dev->bio)) {
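The bio sizing above scales the page budget by group_width / (group_width - parity): with parity enabled only part of the columns carry file data, so each device's bio has to be sized for a proportionally larger share. With ios->nr_pages = 96, group_width = 4, parity = 1, stripe_unit = 64KiB and 4KiB pages (illustrative values):

```text
nr_pages        = 96 * 4 / (4 - 1)   = 128
pages_in_stripe = 4 * (64KiB / 4KiB) = 64
bio_size        = (128 + 64) / 4     = 48 pages per device bio
```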
@@ -471,12 +587,13 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
 		unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
 		unsigned added_len;
 
-		BUG_ON(ios->nr_pages <= pg);
 		cur_len -= pglen;
 
-		added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg],
+		added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
 					    pglen, pgbase);
 		if (unlikely(pglen != added_len)) {
+			ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n",
+				   per_dev->bio->bi_vcnt);
 			ret = -ENOMEM;
 			goto out;
 		}
@@ -501,9 +618,11 @@ static int _prepare_for_striping(struct ore_io_state *ios)
 	struct ore_striping_info *si = &ios->si;
 	unsigned stripe_unit = ios->layout->stripe_unit;
 	unsigned mirrors_p1 = ios->layout->mirrors_p1;
-	unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
+	unsigned group_width = ios->layout->group_width;
+	unsigned devs_in_group = group_width * mirrors_p1;
 	unsigned dev = si->dev;
 	unsigned first_dev = dev - (dev % devs_in_group);
+	unsigned dev_order;
 	unsigned cur_pg = ios->pages_consumed;
 	u64 length = ios->length;
 	int ret = 0;
@@ -513,7 +632,10 @@ static int _prepare_for_striping(struct ore_io_state *ios)
 		return 0;
 	}
 
-	BUG_ON(length > si->group_length);
+	BUG_ON(length > si->length);
+
+	dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev);
+	si->cur_comp = dev_order;
 
 	while (length) {
 		unsigned comp = dev - first_dev;
@@ -522,17 +644,20 @@ static int _prepare_for_striping(struct ore_io_state *ios)
 
 		if (!per_dev->length) {
 			per_dev->dev = dev;
-			if (dev < si->dev) {
-				per_dev->offset = si->obj_offset + stripe_unit -
-								si->unit_off;
-				cur_len = stripe_unit;
-			} else if (dev == si->dev) {
+			if (dev == si->dev) {
+				WARN_ON(dev == si->par_dev);
 				per_dev->offset = si->obj_offset;
 				cur_len = stripe_unit - si->unit_off;
 				page_off = si->unit_off & ~PAGE_MASK;
 				BUG_ON(page_off && (page_off != ios->pgbase));
-			} else { /* dev > si->dev */
-				per_dev->offset = si->obj_offset - si->unit_off;
+			} else {
+				if (si->cur_comp > dev_order)
+					per_dev->offset =
+						si->obj_offset - si->unit_off;
+				else /* si->cur_comp < dev_order */
+					per_dev->offset =
+						si->obj_offset + stripe_unit -
+								si->unit_off;
 				cur_len = stripe_unit;
 			}
 		} else {
@@ -541,8 +666,8 @@ static int _prepare_for_striping(struct ore_io_state *ios)
 		if (cur_len >= length)
 			cur_len = length;
 
-		ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
-				       cur_len);
+		ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages,
+					   per_dev, cur_len);
 		if (unlikely(ret))
 			goto out;
 
@@ -550,6 +675,41 @@ static int _prepare_for_striping(struct ore_io_state *ios)
 		dev = (dev % devs_in_group) + first_dev;
 
 		length -= cur_len;
+
+		si->cur_comp = (si->cur_comp + 1) % group_width;
+		if (unlikely((dev == si->par_dev) ||
+			     (!length && ios->parity_pages))) {
+			if (!length)
+				/* If we are writing and this is the very last
+				 * stripe. then operate on parity dev.
+				 */
+				dev = si->par_dev;
+			if (ios->reading)
+				/* In writes cur_len just means if it's the
+				 * last one. See _ore_add_parity_unit.
+				 */
+				cur_len = length;
+			per_dev = &ios->per_dev[dev - first_dev];
+			if (!per_dev->length) {
+				/* Only/always the parity unit of the first
+				 * stripe will be empty. So this is a chance to
+				 * initialize the per_dev info.
+				 */
+				per_dev->dev = dev;
+				per_dev->offset = si->obj_offset - si->unit_off;
+			}
+
+			ret = _ore_add_parity_unit(ios, si, per_dev, cur_len);
+			if (unlikely(ret))
+				goto out;
+
+			/* Rotate next par_dev backwards with wraping */
+			si->par_dev = (devs_in_group + si->par_dev -
+				       ios->layout->parity * mirrors_p1) %
+				      devs_in_group + first_dev;
+			/* Next stripe, start fresh */
+			si->cur_comp = 0;
+		}
 	}
 out:
 	ios->numdevs = devs_in_group;
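The tail of the loop above advances si->par_dev backwards by parity * mirrors_p1 (with wrap-around) once the current stripe's parity unit has been handled, which is what produces the familiar RAID5 rotating-parity pattern. A stand-alone sketch of just that rotation, for a hypothetical 4-device, single-parity, unmirrored group:

```c
#include <stdio.h>

int main(void)
{
	unsigned group_width = 4, parity = 1, mirrors_p1 = 1;
	unsigned devs_in_group = group_width * mirrors_p1;
	unsigned first_dev = 0;
	unsigned par_dev = 3;		/* parity device of the first stripe */
	unsigned stripe;

	for (stripe = 0; stripe < 8; stripe++) {
		printf("stripe %u: parity on dev %u\n", stripe, par_dev);
		/* same update as the patch: rotate backwards with wrapping */
		par_dev = (devs_in_group + par_dev - parity * mirrors_p1) %
				devs_in_group + first_dev;
	}
	return 0;	/* prints parity on 3, 2, 1, 0, 3, 2, 1, 0 */
}
```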
@@ -747,12 +907,24 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp)
 	per_dev->or = or;
 
 	if (ios->pages) {
-		osd_req_read(or, obj, per_dev->offset,
-			     per_dev->bio, per_dev->length);
+		if (per_dev->cur_sg) {
+			/* finalize the last sg_entry */
+			_ore_add_sg_seg(per_dev, 0, false);
+			if (unlikely(!per_dev->cur_sg))
+				return 0; /* Skip parity only device */
+
+			osd_req_read_sg(or, obj, per_dev->bio,
+					per_dev->sglist, per_dev->cur_sg);
+		} else {
+			/* The no raid case */
+			osd_req_read(or, obj, per_dev->offset,
+				     per_dev->bio, per_dev->length);
+		}
+
 		ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
-			   " dev=%d\n", _LLU(obj->id),
+			   " dev=%d sg_len=%d\n", _LLU(obj->id),
 			   _LLU(per_dev->offset), _LLU(per_dev->length),
-			   first_dev);
+			   first_dev, per_dev->cur_sg);
 	} else {
 		BUG_ON(ios->kern_buff);
 
@@ -849,7 +1021,7 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
 {
 	unsigned stripe_unit = layout->stripe_unit;
 
-	ore_calc_stripe_info(layout, file_offset, &ti->si);
+	ore_calc_stripe_info(layout, file_offset, 0, &ti->si);
 
 	ti->prev_group_obj_off = ti->si.M * stripe_unit;
 	ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
new file mode 100644
index 000000000000..8d4b93a93c67
--- /dev/null
+++ b/fs/exofs/ore_raid.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (C) 2011
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of the objects raid engine (ore).
+ *
+ * It is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with "ore". If not, write to the Free Software Foundation, Inc:
+ *	"Free Software Foundation <info@fsf.org>"
+ */
+
+#include <linux/gfp.h>
+
+#include "ore_raid.h"
+
+struct page *_raid_page_alloc(void)
+{
+	return alloc_page(GFP_KERNEL);
+}
+
+void _raid_page_free(struct page *p)
+{
+	__free_page(p);
+}
+
+void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
+		     bool not_last)
+{
+	struct osd_sg_entry *sge;
+
+	ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d "
+		   "offset=0x%llx length=0x%x last_sgs_total=0x%x\n",
+		   per_dev->dev, cur_len, not_last, per_dev->cur_sg,
+		   _LLU(per_dev->offset), per_dev->length,
+		   per_dev->last_sgs_total);
+
+	if (!per_dev->cur_sg) {
+		sge = per_dev->sglist;
+
+		/* First time we prepare two entries */
+		if (per_dev->length) {
+			++per_dev->cur_sg;
+			sge->offset = per_dev->offset;
+			sge->len = per_dev->length;
+		} else {
+			/* Here the parity is the first unit of this object.
+			 * This happens every time we reach a parity device on
+			 * the same stripe as the per_dev->offset. We need to
+			 * just skip this unit.
+			 */
+			per_dev->offset += cur_len;
+			return;
+		}
+	} else {
+		/* finalize the last one */
+		sge = &per_dev->sglist[per_dev->cur_sg - 1];
+		sge->len = per_dev->length - per_dev->last_sgs_total;
+	}
+
+	if (not_last) {
+		/* Partly prepare the next one */
+		struct osd_sg_entry *next_sge = sge + 1;
+
+		++per_dev->cur_sg;
+		next_sge->offset = sge->offset + sge->len + cur_len;
+		/* Save cur len so we know how mutch was added next time */
+		per_dev->last_sgs_total = per_dev->length;
+		next_sge->len = 0;
+	} else if (!sge->len) {
+		/* Optimize for when the last unit is a parity */
+		--per_dev->cur_sg;
+	}
+}
+
+/* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */
+int _ore_add_parity_unit(struct ore_io_state *ios,
+			 struct ore_striping_info *si,
+			 struct ore_per_dev_state *per_dev,
+			 unsigned cur_len)
+{
+	if (ios->reading) {
+		BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev);
+		_ore_add_sg_seg(per_dev, cur_len, true);
+	} else {
+		struct page **pages = ios->parity_pages + ios->cur_par_page;
+		unsigned num_pages = ios->layout->stripe_unit / PAGE_SIZE;
+		unsigned array_start = 0;
+		unsigned i;
+		int ret;
+
+		for (i = 0; i < num_pages; i++) {
+			pages[i] = _raid_page_alloc();
+			if (unlikely(!pages[i]))
+				return -ENOMEM;
+
+			++(ios->cur_par_page);
+			/* TODO: only read support for now */
+			clear_highpage(pages[i]);
+		}
+
+		ORE_DBGMSG("writing dev=%d num_pages=%d cur_par_page=%d",
+			   per_dev->dev, num_pages, ios->cur_par_page);
+
+		ret = _ore_add_stripe_unit(ios, &array_start, 0, pages,
+					   per_dev, num_pages * PAGE_SIZE);
+		if (unlikely(ret))
+			return ret;
+	}
+	return 0;
+}
+
+int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
+{
+	/*TODO: Only raid writes has stuff to add here */
+	return 0;
+}
+
+void _ore_free_raid_stuff(struct ore_io_state *ios)
+{
+	if (ios->parity_pages) { /* writing and raid */
+		unsigned i;
+
+		for (i = 0; i < ios->cur_par_page; i++) {
+			struct page *page = ios->parity_pages[i];
+
+			if (page)
+				_raid_page_free(page);
+		}
+		if (ios->extra_part_alloc)
+			kfree(ios->parity_pages);
+	} else {
+		/* Will only be set if raid reading && sglist is big */
+		if (ios->extra_part_alloc)
+			kfree(ios->per_dev[0].sglist);
+	}
+}
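What _ore_add_sg_seg() accumulates is easiest to see on one component object of a RAID5 read that has data both before and after a parity unit it must skip. With hypothetical offsets, and assuming the gap passed in as cur_len is one full 0x10000-byte unit, the end result looks roughly like this:

```text
data accumulated so far     -> sglist[0] = { .offset = per_dev->offset, .len = 0x10000 }
parity unit reached         -> sglist[0] finalized; sglist[1].offset starts 0x10000
                               past the end of sglist[0] (the skipped gap)
data after the parity unit  -> accumulates into sglist[1]; its .len is filled in by the
                               final _ore_add_sg_seg(per_dev, 0, false) in _read_mirror()
parity as the object's
first unit in this I/O      -> no entry yet, per_dev->offset is simply advanced
```

_read_mirror() then issues osd_req_read_sg() with the finished list, or falls back to a plain osd_req_read() when the device needed no sg entries at all.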
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
new file mode 100644
index 000000000000..c21080b4407f
--- /dev/null
+++ b/fs/exofs/ore_raid.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) from 2011
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of the objects raid engine (ore).
+ *
+ * It is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with "ore". If not, write to the Free Software Foundation, Inc:
+ *	"Free Software Foundation <info@fsf.org>"
+ */
+
+#include <scsi/osd_ore.h>
+
+#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
+
+#ifdef CONFIG_EXOFS_DEBUG
+#define ORE_DBGMSG(fmt, a...) \
+	printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
+#else
+#define ORE_DBGMSG(fmt, a...) \
+	do { if (0) printk(fmt, ##a); } while (0)
+#endif
+
+/* u64 has problems with printk this will cast it to unsigned long long */
+#define _LLU(x) (unsigned long long)(x)
+
+#define ORE_DBGMSG2(M...) do {} while (0)
+/* #define ORE_DBGMSG2 ORE_DBGMSG */
+
+/* Calculate the component order in a stripe. eg the logical data unit
+ * address within the stripe of @dev given the @par_dev of this stripe.
+ */
+static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1,
+				  unsigned par_dev, unsigned dev)
+{
+	unsigned first_dev = dev - dev % devs_in_group;
+
+	dev -= first_dev;
+	par_dev -= first_dev;
+
+	if (devs_in_group == par_dev) /* The raid 0 case */
+		return dev / mirrors_p1;
+	/* raid4/5/6 case */
+	return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) /
+	       mirrors_p1;
+}
+
+/* ios_raid.c stuff needed by ios.c */
+int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);
+void _ore_free_raid_stuff(struct ore_io_state *ios);
+
+void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
+		     bool not_last);
+int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
+			 struct ore_per_dev_state *per_dev, unsigned cur_len);
+
+/* ios.c stuff needed by ios_raid.c */
+int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
+			 unsigned pgbase, struct page **pages,
+			 struct ore_per_dev_state *per_dev, int cur_len);
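_dev_order() above maps a physical device index to its logical data position within the stripe, counting from the device that follows the parity device. A quick user-space check of the same arithmetic for a hypothetical unmirrored 4-device group whose parity sits on device 2:

```c
#include <stdio.h>

/* Same arithmetic as _dev_order(), copied here for a user-space check. */
static unsigned dev_order(unsigned devs_in_group, unsigned mirrors_p1,
			  unsigned par_dev, unsigned dev)
{
	unsigned first_dev = dev - dev % devs_in_group;

	dev -= first_dev;
	par_dev -= first_dev;

	if (devs_in_group == par_dev)		/* the raid 0 case */
		return dev / mirrors_p1;
	/* raid4/5/6 case */
	return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) /
		mirrors_p1;
}

int main(void)
{
	unsigned dev;

	for (dev = 0; dev < 4; dev++)
		printf("dev %u -> data order %u\n", dev, dev_order(4, 1, 2, dev));
	/* prints: 0 -> 1, 1 -> 2, 2 -> 3 (the parity slot), 3 -> 0 */
	return 0;
}
```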
diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h
index a8e39d14f82b..43821c18cd3f 100644
--- a/include/scsi/osd_ore.h
+++ b/include/scsi/osd_ore.h
@@ -40,6 +40,7 @@ struct ore_layout {
 	unsigned mirrors_p1;
 
 	unsigned group_width;
+	unsigned parity;
 	u64 group_depth;
 	unsigned group_count;
 
@@ -89,11 +90,16 @@ static inline void ore_comp_set_dev(
 }
 
 struct ore_striping_info {
+	u64 offset;
 	u64 obj_offset;
-	u64 group_length;
+	u64 length;
+	u64 first_stripe_start; /* only used in raid writes */
 	u64 M; /* for truncate */
+	unsigned bytes_in_stripe;
 	unsigned dev;
+	unsigned par_dev;
 	unsigned unit_off;
+	unsigned cur_comp;
 };
 
 struct ore_io_state;
@@ -127,6 +133,13 @@ struct ore_io_state {
 
 	bool reading;
 
+	/* House keeping of Parity pages */
+	bool extra_part_alloc;
+	struct page **parity_pages;
+	unsigned max_par_pages;
+	unsigned cur_par_page;
+	unsigned sgs_per_dev;
+
 	/* Variable array of size numdevs */
 	unsigned numdevs;
 	struct ore_per_dev_state {
@@ -134,7 +147,10 @@ struct ore_io_state {
 		struct bio *bio;
 		loff_t offset;
 		unsigned length;
+		unsigned last_sgs_total;
 		unsigned dev;
+		struct osd_sg_entry *sglist;
+		unsigned cur_sg;
 	} per_dev[];
 };
 
@@ -147,8 +163,7 @@ static inline unsigned ore_io_state_size(unsigned numdevs)
 /* ore.c */
 int ore_verify_layout(unsigned total_comps, struct ore_layout *layout);
 void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
-			  struct ore_striping_info *si);
-
+			  u64 length, struct ore_striping_info *si);
 int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps,
 		     bool is_reading, u64 offset, u64 length,
 		     struct ore_io_state **ios);
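For orientation, this is roughly how a caller would drive a parity-aware read with the API declared above; the helper name and the error handling are illustrative, and ore_read()/ore_put_io_state() are pre-existing ORE entry points rather than anything added by this patch:

```c
/* Hypothetical caller: layout and oc are assumed to be already set up. */
static int read_with_parity(struct ore_layout *layout,
			    struct ore_components *oc,
			    struct page **pages,
			    u64 offset, u64 length)
{
	struct ore_io_state *ios;
	int ret;

	/* Pre-sizes the per-device sg lists for a RAID5 read. */
	ret = ore_get_rw_state(layout, oc, true /* is_reading */,
			       offset, length, &ios);
	if (unlikely(ret))
		return ret;

	ios->pages = pages;	/* nr_pages was derived from length above */

	ret = ore_read(ios);

	ore_put_io_state(ios);	/* also frees the raid bookkeeping */
	return ret;
}
```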