aboutsummaryrefslogtreecommitdiffstats
path: root/fs/exofs/super.c
diff options
context:
space:
mode:
authorBoaz Harrosh <bharrosh@panasas.com>2010-02-01 06:35:51 -0500
committerBoaz Harrosh <bharrosh@panasas.com>2010-02-28 06:43:08 -0500
commit5d952b8391692553c31e620a92d6e09262a9a307 (patch)
treeb3a1a0490fc98b6304685d64bb4774235ec94a2d /fs/exofs/super.c
parentd9c740d2253e75db8cef8f87a3125c450f3ebd82 (diff)
exofs: RAID0 support
We now support striping over mirror devices, including a variable-sized stripe_unit.

Some limits:
* stripe_unit must be a multiple of PAGE_SIZE.
* stripe_unit * stripe_count must fit in 32 bits (i.e. be less than 4GB).

Tested RAID0 over mirrors, RAID0 only, and mirrors only. All check out.

Design notes:
* I'm not using a vectored raid-engine mechanism yet. Following the pnfs-objects-layout data-map structure, "Mirror" is just a special case of "group_width" == 1, and RAID0 is a special case of "Mirrors" == 1. The performance loss of the general case compared to an optimized special case is totally negligible, also considering the extra code size.
* In general, I added a prepare_stripes() stage that divides the to-be-io pages among the participating devices. The previous exofs_ios_write/read now becomes _write/read_mirrors, and a new write/read upper layer loops over all devices calling _write/read_mirrors. Effectively, the prepare_stripes stage is the whole secret. Truncate also needed fixing to accommodate striping.
* In a RAID0 arrangement, in a regular usage scenario, if all inode layouts start at the same device, the small files fill up the first device and the later devices stay empty; the farther the device, the emptier it is. To fix that, each inode starts at a different stripe_unit, according to its obj_id modulo the number of stripe-units, and then spans all stripe-units in the same incrementing order, wrapping back to the beginning of the device table. We call it a stripe-units moving window.

Special consideration was taken to keep all devices in a mirror arrangement identical, so a broken osd-device can simply be cloned from one of the mirrors and no FS scrubbing is needed. (We do that by rotating a stripe-unit at a time and not a single device at a time.)

TODO: We no longer verify object_length == inode->i_size in exofs_iget (since i_size is striped across multiple objects now). I should introduce multiple-device attribute reading and use it in exofs_iget.
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Diffstat (limited to 'fs/exofs/super.c')
-rw-r--r--fs/exofs/super.c52
1 files changed, 45 insertions, 7 deletions
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index fc8875186ae8..8f4e4b37a578 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -308,6 +308,8 @@ static void exofs_put_super(struct super_block *sb)
308static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, 308static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
309 struct exofs_device_table *dt) 309 struct exofs_device_table *dt)
310{ 310{
311 u64 stripe_length;
312
311 sbi->data_map.odm_num_comps = 313 sbi->data_map.odm_num_comps =
312 le32_to_cpu(dt->dt_data_map.cb_num_comps); 314 le32_to_cpu(dt->dt_data_map.cb_num_comps);
313 sbi->data_map.odm_stripe_unit = 315 sbi->data_map.odm_stripe_unit =
@@ -321,14 +323,47 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
321 sbi->data_map.odm_raid_algorithm = 323 sbi->data_map.odm_raid_algorithm =
322 le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); 324 le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
323 325
324/* FIXME: Hard coded mirror only for now. if not so do not mount */ 326/* FIXME: Only raid0 !group_width/depth for now. if not so, do not mount */
325 if ((sbi->data_map.odm_num_comps != numdevs) || 327 if (sbi->data_map.odm_group_width || sbi->data_map.odm_group_depth) {
326 (sbi->data_map.odm_stripe_unit != EXOFS_BLKSIZE) || 328 EXOFS_ERR("Group width/depth not supported\n");
327 (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) ||
328 (sbi->data_map.odm_mirror_cnt != (numdevs - 1)))
329 return -EINVAL; 329 return -EINVAL;
330 else 330 }
331 return 0; 331 if (sbi->data_map.odm_num_comps != numdevs) {
332 EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n",
333 sbi->data_map.odm_num_comps, numdevs);
334 return -EINVAL;
335 }
336 if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) {
337 EXOFS_ERR("Only RAID_0 for now\n");
338 return -EINVAL;
339 }
340 if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) {
341 EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n",
342 numdevs, sbi->data_map.odm_mirror_cnt);
343 return -EINVAL;
344 }
345
346 stripe_length = sbi->data_map.odm_stripe_unit *
347 (numdevs / (sbi->data_map.odm_mirror_cnt + 1));
348 if (stripe_length >= (1ULL << 32)) {
349 EXOFS_ERR("Total Stripe length(0x%llx)"
350 " >= 32bit is not supported\n", _LLU(stripe_length));
351 return -EINVAL;
352 }
353
354 if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) {
355 EXOFS_ERR("Stripe Unit(0x%llx)"
356 " must be Multples of PAGE_SIZE(0x%lx)\n",
357 _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE);
358 return -EINVAL;
359 }
360
361 sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit;
362 sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1;
363 sbi->layout.group_width = sbi->data_map.odm_num_comps /
364 sbi->layout.mirrors_p1;
365
366 return 0;
332} 367}
333 368
334/* @odi is valid only as long as @fscb_dev is valid */ 369/* @odi is valid only as long as @fscb_dev is valid */
@@ -502,6 +537,9 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
502 } 537 }
503 538
504 /* Default layout in case we do not have a device-table */ 539 /* Default layout in case we do not have a device-table */
540 sbi->layout.stripe_unit = PAGE_SIZE;
541 sbi->layout.mirrors_p1 = 1;
542 sbi->layout.group_width = 1;
505 sbi->layout.s_ods[0] = od; 543 sbi->layout.s_ods[0] = od;
506 sbi->layout.s_numdevs = 1; 544 sbi->layout.s_numdevs = 1;
507 sbi->layout.s_pid = opts->pid; 545 sbi->layout.s_pid = opts->pid;