aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBoaz Harrosh <bharrosh@panasas.com>2010-02-01 06:35:51 -0500
committerBoaz Harrosh <bharrosh@panasas.com>2010-02-28 06:43:08 -0500
commit5d952b8391692553c31e620a92d6e09262a9a307 (patch)
treeb3a1a0490fc98b6304685d64bb4774235ec94a2d
parentd9c740d2253e75db8cef8f87a3125c450f3ebd82 (diff)
exofs: RAID0 support
We now support striping over mirror devices, including a variable-sized stripe_unit. Some limits: * stripe_unit must be a multiple of PAGE_SIZE * stripe_unit * stripe_count must fit in 32 bits (less than 4GB) Tested RAID0 over mirrors, RAID0 only, and mirrors only. All check out. Design notes: * I'm not using a vectored raid-engine mechanism yet. Following the pnfs-objects-layout data-map structure, "Mirror" is just a special case of "group_width" == 1, and RAID0 is a special case of "Mirrors" == 1. The performance loss of the general case compared to a dedicated special-case optimization is totally negligible, also considering the extra code size. * In general I added a prepare_stripes() stage that divides the to-be-io pages among the participating devices; the previous exofs_ios_write/read now becomes _write/read_mirrors, and a new write/read upper layer loops over all devices calling _write/read_mirrors. Effectively the prepare_stripes stage is the whole secret. Truncate also needed fixing to accommodate striping. * In a RAID0 arrangement, in a regular usage scenario, if all inode layouts start at the same device, the small files fill up the first device and the later devices stay empty; the farther the device, the emptier it is. To fix that, each inode will start at a different stripe_unit, according to its obj_id modulo the number of stripe-units, and will then span all stripe-units in the same incrementing order, wrapping back to the beginning of the device table. We call it a stripe-units moving window. Special consideration was taken to keep all devices in a mirror arrangement identical, so a broken osd-device can just be cloned from one of the mirrors and no FS scrubbing is needed. (We do that by rotating a stripe-unit at a time and not a single device at a time.) TODO: We no longer verify object_length == inode->i_size in exofs_iget (since i_size is now striped across multiple objects). I should introduce a multiple-device attribute reading, and use it in exofs_iget. 
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
-rw-r--r--fs/exofs/exofs.h11
-rw-r--r--fs/exofs/inode.c26
-rw-r--r--fs/exofs/ios.c327
-rw-r--r--fs/exofs/super.c52
4 files changed, 333 insertions, 83 deletions
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 09e331935514..0d8a34b21ae1 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -58,6 +58,14 @@
58struct exofs_layout { 58struct exofs_layout {
59 osd_id s_pid; /* partition ID of file system*/ 59 osd_id s_pid; /* partition ID of file system*/
60 60
61 /* Our way of looking at the data_map */
62 unsigned stripe_unit;
63 unsigned mirrors_p1;
64
65 unsigned group_width;
66
67 enum exofs_inode_layout_gen_functions lay_func;
68
61 unsigned s_numdevs; /* Num of devices in array */ 69 unsigned s_numdevs; /* Num of devices in array */
62 struct osd_dev *s_ods[0]; /* Variable length */ 70 struct osd_dev *s_ods[0]; /* Variable length */
63}; 71};
@@ -133,6 +141,9 @@ struct exofs_io_state {
133 struct exofs_per_dev_state { 141 struct exofs_per_dev_state {
134 struct osd_request *or; 142 struct osd_request *or;
135 struct bio *bio; 143 struct bio *bio;
144 loff_t offset;
145 unsigned length;
146 unsigned dev;
136 } per_dev[]; 147 } per_dev[];
137}; 148};
138 149
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 0163546ba05a..2b3163ea56eb 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -869,18 +869,17 @@ static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF(
869 0); 869 0);
870 870
871/* 871/*
872 * Read an inode from the OSD, and return it as is. We also return the size 872 * Read the Linux inode info from the OSD, and return it as is. In exofs the
873 * attribute in the 'obj_size' argument. 873 * inode info is in an application specific page/attribute of the osd-object.
874 */ 874 */
875static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, 875static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
876 struct exofs_fcb *inode, uint64_t *obj_size) 876 struct exofs_fcb *inode)
877{ 877{
878 struct exofs_sb_info *sbi = sb->s_fs_info; 878 struct exofs_sb_info *sbi = sb->s_fs_info;
879 struct osd_attr attrs[] = { 879 struct osd_attr attrs[] = {
880 [0] = g_attr_inode_data, 880 [0] = g_attr_inode_data,
881 [1] = g_attr_inode_file_layout, 881 [1] = g_attr_inode_file_layout,
882 [2] = g_attr_inode_dir_layout, 882 [2] = g_attr_inode_dir_layout,
883 [3] = g_attr_logical_length,
884 }; 883 };
885 struct exofs_io_state *ios; 884 struct exofs_io_state *ios;
886 struct exofs_on_disk_inode_layout *layout; 885 struct exofs_on_disk_inode_layout *layout;
@@ -944,15 +943,6 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
944 } 943 }
945 } 944 }
946 945
947 *obj_size = ~0;
948 ret = extract_attr_from_ios(ios, &attrs[3]);
949 if (ret) {
950 EXOFS_ERR("%s: extract_attr of logical_length failed\n",
951 __func__);
952 goto out;
953 }
954 *obj_size = get_unaligned_be64(attrs[3].val_ptr);
955
956out: 946out:
957 exofs_put_io_state(ios); 947 exofs_put_io_state(ios);
958 return ret; 948 return ret;
@@ -971,7 +961,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
971 struct exofs_i_info *oi; 961 struct exofs_i_info *oi;
972 struct exofs_fcb fcb; 962 struct exofs_fcb fcb;
973 struct inode *inode; 963 struct inode *inode;
974 uint64_t obj_size;
975 int ret; 964 int ret;
976 965
977 inode = iget_locked(sb, ino); 966 inode = iget_locked(sb, ino);
@@ -983,7 +972,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
983 __oi_init(oi); 972 __oi_init(oi);
984 973
985 /* read the inode from the osd */ 974 /* read the inode from the osd */
986 ret = exofs_get_inode(sb, oi, &fcb, &obj_size); 975 ret = exofs_get_inode(sb, oi, &fcb);
987 if (ret) 976 if (ret)
988 goto bad_inode; 977 goto bad_inode;
989 978
@@ -1004,13 +993,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
1004 inode->i_blkbits = EXOFS_BLKSHIFT; 993 inode->i_blkbits = EXOFS_BLKSHIFT;
1005 inode->i_generation = le32_to_cpu(fcb.i_generation); 994 inode->i_generation = le32_to_cpu(fcb.i_generation);
1006 995
1007 if ((inode->i_size != obj_size) &&
1008 (!exofs_inode_is_fast_symlink(inode))) {
1009 EXOFS_ERR("WARNING: Size of inode=%llu != object=%llu\n",
1010 inode->i_size, _LLU(obj_size));
1011 /* FIXME: call exofs_inode_recovery() */
1012 }
1013
1014 oi->i_dir_start_lookup = 0; 996 oi->i_dir_start_lookup = 0;
1015 997
1016 if ((inode->i_nlink == 0) && (inode->i_mode == 0)) { 998 if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 2b81f99fd62c..6e446b2670b9 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <scsi/scsi_device.h> 25#include <scsi/scsi_device.h>
26#include <asm/div64.h>
26 27
27#include "exofs.h" 28#include "exofs.h"
28 29
@@ -110,7 +111,17 @@ void exofs_put_io_state(struct exofs_io_state *ios)
110unsigned exofs_layout_od_id(struct exofs_layout *layout, 111unsigned exofs_layout_od_id(struct exofs_layout *layout,
111 osd_id obj_no, unsigned layout_index) 112 osd_id obj_no, unsigned layout_index)
112{ 113{
113 return layout_index; 114/* switch (layout->lay_func) {
115 case LAYOUT_MOVING_WINDOW:
116 {*/
117 unsigned dev_mod = obj_no;
118
119 return (layout_index + dev_mod * layout->mirrors_p1) %
120 layout->s_numdevs;
121/* }
122 case LAYOUT_FUNC_IMPLICT:
123 return layout->devs[layout_index];
124 }*/
114} 125}
115 126
116static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios, 127static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios,
@@ -225,8 +236,8 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
225 _clear_bio(ios->per_dev[i].bio); 236 _clear_bio(ios->per_dev[i].bio);
226 EXOFS_DBGMSG("start read offset passed end of file " 237 EXOFS_DBGMSG("start read offset passed end of file "
227 "offset=0x%llx, length=0x%llx\n", 238 "offset=0x%llx, length=0x%llx\n",
228 _LLU(ios->offset), 239 _LLU(ios->per_dev[i].offset),
229 _LLU(ios->length)); 240 _LLU(ios->per_dev[i].length));
230 241
231 continue; /* we recovered */ 242 continue; /* we recovered */
232 } 243 }
@@ -248,6 +259,127 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
248 return acumulated_lin_err; 259 return acumulated_lin_err;
249} 260}
250 261
262/* REMOVEME: After review
263 Some quoteing from the standard
264
265 L = logical offset into the file
266 W = number of data components in a stripe
267 S = W * stripe_unit (S is Stripe length)
268 N = L / S (N is the stripe Number)
269 C = (L-(N*S)) / stripe_unit (C is the component)
270 O = (N*stripe_unit)+(L%stripe_unit) (O is the object's offset)
271*/
272
273static void _offset_dev_unit_off(struct exofs_io_state *ios, u64 file_offset,
274 u64 *obj_offset, unsigned *dev, unsigned *unit_off)
275{
276 unsigned stripe_unit = ios->layout->stripe_unit;
277 unsigned stripe_length = stripe_unit * ios->layout->group_width;
278 u64 stripe_no = file_offset;
279 unsigned stripe_mod = do_div(stripe_no, stripe_length);
280
281 *unit_off = stripe_mod % stripe_unit;
282 *obj_offset = stripe_no * stripe_unit + *unit_off;
283 *dev = stripe_mod / stripe_unit * ios->layout->mirrors_p1;
284}
285
286static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_bvec,
287 struct exofs_per_dev_state *per_dev, int cur_len)
288{
289 unsigned bv = *cur_bvec;
290 struct request_queue *q =
291 osd_request_queue(exofs_ios_od(ios, per_dev->dev));
292
293 per_dev->length += cur_len;
294
295 if (per_dev->bio == NULL) {
296 unsigned pages_in_stripe = ios->layout->group_width *
297 (ios->layout->stripe_unit / PAGE_SIZE);
298 unsigned bio_size = (ios->bio->bi_vcnt + pages_in_stripe) /
299 ios->layout->group_width;
300
301 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
302 if (unlikely(!per_dev->bio)) {
303 EXOFS_DBGMSG("Faild to allocate BIO size=%u\n",
304 bio_size);
305 return -ENOMEM;
306 }
307 }
308
309 while (cur_len > 0) {
310 int added_len;
311 struct bio_vec *bvec = &ios->bio->bi_io_vec[bv];
312
313 BUG_ON(ios->bio->bi_vcnt <= bv);
314 cur_len -= bvec->bv_len;
315
316 added_len = bio_add_pc_page(q, per_dev->bio, bvec->bv_page,
317 bvec->bv_len, bvec->bv_offset);
318 if (unlikely(bvec->bv_len != added_len))
319 return -ENOMEM;
320 ++bv;
321 }
322 BUG_ON(cur_len);
323
324 *cur_bvec = bv;
325 return 0;
326}
327
328static int _prepare_for_striping(struct exofs_io_state *ios)
329{
330 u64 length = ios->length;
331 u64 offset = ios->offset;
332 unsigned stripe_unit = ios->layout->stripe_unit;
333 unsigned comp = 0;
334 unsigned stripes = 0;
335 unsigned cur_bvec = 0;
336 int ret;
337
338 if (!ios->bio) {
339 if (ios->kern_buff) {
340 struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
341 unsigned unit_off;
342
343 _offset_dev_unit_off(ios, offset, &per_dev->offset,
344 &per_dev->dev, &unit_off);
345 /* no cross device without page array */
346 BUG_ON((ios->layout->group_width > 1) &&
347 (unit_off + length > stripe_unit));
348 }
349 ios->numdevs = ios->layout->mirrors_p1;
350 return 0;
351 }
352
353 while (length) {
354 struct exofs_per_dev_state *per_dev = &ios->per_dev[comp];
355 unsigned cur_len;
356
357 if (!per_dev->length) {
358 unsigned unit_off;
359
360 _offset_dev_unit_off(ios, offset, &per_dev->offset,
361 &per_dev->dev, &unit_off);
362 stripes++;
363 cur_len = min_t(u64, stripe_unit - unit_off, length);
364 offset += cur_len;
365 } else {
366 cur_len = min_t(u64, stripe_unit, length);
367 }
368
369 ret = _add_stripe_unit(ios, &cur_bvec, per_dev, cur_len);
370 if (unlikely(ret))
371 goto out;
372
373 comp += ios->layout->mirrors_p1;
374 comp %= ios->layout->s_numdevs;
375
376 length -= cur_len;
377 }
378out:
379 ios->numdevs = stripes * ios->layout->mirrors_p1;
380 return ret;
381}
382
251int exofs_sbi_create(struct exofs_io_state *ios) 383int exofs_sbi_create(struct exofs_io_state *ios)
252{ 384{
253 int i, ret; 385 int i, ret;
@@ -296,61 +428,71 @@ out:
296 return ret; 428 return ret;
297} 429}
298 430
299int exofs_sbi_write(struct exofs_io_state *ios) 431static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
300{ 432{
301 int i, ret; 433 struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp];
434 unsigned dev = ios->per_dev[cur_comp].dev;
435 unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
436 int ret = 0;
302 437
303 for (i = 0; i < ios->layout->s_numdevs; i++) { 438 for (; cur_comp < last_comp; ++cur_comp, ++dev) {
439 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
304 struct osd_request *or; 440 struct osd_request *or;
305 441
306 or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); 442 or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL);
307 if (unlikely(!or)) { 443 if (unlikely(!or)) {
308 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 444 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
309 ret = -ENOMEM; 445 ret = -ENOMEM;
310 goto out; 446 goto out;
311 } 447 }
312 ios->per_dev[i].or = or; 448 per_dev->or = or;
313 ios->numdevs++; 449 per_dev->offset = master_dev->offset;
314 450
315 if (ios->bio) { 451 if (ios->bio) {
316 struct bio *bio; 452 struct bio *bio;
317 453
318 if (i != 0) { 454 if (per_dev != master_dev) {
319 bio = bio_kmalloc(GFP_KERNEL, 455 bio = bio_kmalloc(GFP_KERNEL,
320 ios->bio->bi_max_vecs); 456 master_dev->bio->bi_max_vecs);
321 if (unlikely(!bio)) { 457 if (unlikely(!bio)) {
322 EXOFS_DBGMSG( 458 EXOFS_DBGMSG(
323 "Faild to allocate BIO size=%u\n", 459 "Faild to allocate BIO size=%u\n",
324 ios->bio->bi_max_vecs); 460 master_dev->bio->bi_max_vecs);
325 ret = -ENOMEM; 461 ret = -ENOMEM;
326 goto out; 462 goto out;
327 } 463 }
328 464
329 __bio_clone(bio, ios->bio); 465 __bio_clone(bio, master_dev->bio);
330 bio->bi_bdev = NULL; 466 bio->bi_bdev = NULL;
331 bio->bi_next = NULL; 467 bio->bi_next = NULL;
332 ios->per_dev[i].bio = bio; 468 per_dev->length = master_dev->length;
469 per_dev->bio = bio;
470 per_dev->dev = dev;
333 } else { 471 } else {
334 bio = ios->bio; 472 bio = master_dev->bio;
473 /* FIXME: bio_set_dir() */
474 bio->bi_rw |= (1 << BIO_RW);
335 } 475 }
336 476
337 osd_req_write(or, &ios->obj, ios->offset, bio, 477 osd_req_write(or, &ios->obj, per_dev->offset, bio,
338 ios->length); 478 per_dev->length);
339 EXOFS_DBGMSG("write(0x%llx) offset=0x%llx " 479 EXOFS_DBGMSG("write(0x%llx) offset=0x%llx "
340 "length=0x%llx dev=%d\n", 480 "length=0x%llx dev=%d\n",
341 _LLU(ios->obj.id), _LLU(ios->offset), 481 _LLU(ios->obj.id), _LLU(per_dev->offset),
342 _LLU(ios->length), i); 482 _LLU(per_dev->length), dev);
343 } else if (ios->kern_buff) { 483 } else if (ios->kern_buff) {
344 osd_req_write_kern(or, &ios->obj, ios->offset, 484 ret = osd_req_write_kern(or, &ios->obj, per_dev->offset,
345 ios->kern_buff, ios->length); 485 ios->kern_buff, ios->length);
486 if (unlikely(ret))
487 goto out;
346 EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx " 488 EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
347 "length=0x%llx dev=%d\n", 489 "length=0x%llx dev=%d\n",
348 _LLU(ios->obj.id), _LLU(ios->offset), 490 _LLU(ios->obj.id), _LLU(per_dev->offset),
349 _LLU(ios->length), i); 491 _LLU(ios->length), dev);
350 } else { 492 } else {
351 osd_req_set_attributes(or, &ios->obj); 493 osd_req_set_attributes(or, &ios->obj);
352 EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", 494 EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
353 _LLU(ios->obj.id), ios->out_attr_len, i); 495 _LLU(ios->obj.id), ios->out_attr_len, dev);
354 } 496 }
355 497
356 if (ios->out_attr) 498 if (ios->out_attr)
@@ -361,40 +503,57 @@ int exofs_sbi_write(struct exofs_io_state *ios)
361 osd_req_add_get_attr_list(or, ios->in_attr, 503 osd_req_add_get_attr_list(or, ios->in_attr,
362 ios->in_attr_len); 504 ios->in_attr_len);
363 } 505 }
364 ret = exofs_io_execute(ios);
365 506
366out: 507out:
367 return ret; 508 return ret;
368} 509}
369 510
370int exofs_sbi_read(struct exofs_io_state *ios) 511int exofs_sbi_write(struct exofs_io_state *ios)
512{
513 int i;
514 int ret;
515
516 ret = _prepare_for_striping(ios);
517 if (unlikely(ret))
518 return ret;
519
520 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
521 ret = _sbi_write_mirror(ios, i);
522 if (unlikely(ret))
523 return ret;
524 }
525
526 ret = exofs_io_execute(ios);
527 return ret;
528}
529
530static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp)
371{ 531{
372 struct osd_request *or; 532 struct osd_request *or;
373 struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; 533 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
374 unsigned first_dev = (unsigned)ios->obj.id; 534 unsigned first_dev = (unsigned)ios->obj.id;
375 535
376 first_dev %= ios->layout->s_numdevs; 536 first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
377 or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); 537 or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL);
378 if (unlikely(!or)) { 538 if (unlikely(!or)) {
379 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 539 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
380 return -ENOMEM; 540 return -ENOMEM;
381 } 541 }
382 per_dev->or = or; 542 per_dev->or = or;
383 ios->numdevs++;
384 543
385 if (ios->bio) { 544 if (ios->bio) {
386 osd_req_read(or, &ios->obj, ios->offset, ios->bio, ios->length); 545 osd_req_read(or, &ios->obj, per_dev->offset,
546 per_dev->bio, per_dev->length);
387 EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" 547 EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
388 " dev=%d\n", _LLU(ios->obj.id), 548 " dev=%d\n", _LLU(ios->obj.id),
389 _LLU(ios->offset), _LLU(ios->length), 549 _LLU(per_dev->offset), _LLU(per_dev->length),
390 first_dev); 550 first_dev);
391 } else if (ios->kern_buff) { 551 } else if (ios->kern_buff) {
392 int ret = osd_req_read_kern(or, &ios->obj, ios->offset, 552 int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset,
393 ios->kern_buff, ios->length); 553 ios->kern_buff, ios->length);
394
395 EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " 554 EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
396 "length=0x%llx dev=%d ret=>%d\n", 555 "length=0x%llx dev=%d ret=>%d\n",
397 _LLU(ios->obj.id), _LLU(ios->offset), 556 _LLU(ios->obj.id), _LLU(per_dev->offset),
398 _LLU(ios->length), first_dev, ret); 557 _LLU(ios->length), first_dev, ret);
399 if (unlikely(ret)) 558 if (unlikely(ret))
400 return ret; 559 return ret;
@@ -403,14 +562,32 @@ int exofs_sbi_read(struct exofs_io_state *ios)
403 EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", 562 EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
404 _LLU(ios->obj.id), ios->in_attr_len, first_dev); 563 _LLU(ios->obj.id), ios->in_attr_len, first_dev);
405 } 564 }
406
407 if (ios->out_attr) 565 if (ios->out_attr)
408 osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); 566 osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);
409 567
410 if (ios->in_attr) 568 if (ios->in_attr)
411 osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); 569 osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);
412 570
413 return exofs_io_execute(ios); 571 return 0;
572}
573
574int exofs_sbi_read(struct exofs_io_state *ios)
575{
576 int i;
577 int ret;
578
579 ret = _prepare_for_striping(ios);
580 if (unlikely(ret))
581 return ret;
582
583 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
584 ret = _sbi_read_mirror(ios, i);
585 if (unlikely(ret))
586 return ret;
587 }
588
589 ret = exofs_io_execute(ios);
590 return ret;
414} 591}
415 592
416int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) 593int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
@@ -434,42 +611,84 @@ int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
434 return -EIO; 611 return -EIO;
435} 612}
436 613
614static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp,
615 struct osd_attr *attr)
616{
617 int last_comp = cur_comp + ios->layout->mirrors_p1;
618
619 for (; cur_comp < last_comp; ++cur_comp) {
620 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
621 struct osd_request *or;
622
623 or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL);
624 if (unlikely(!or)) {
625 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
626 return -ENOMEM;
627 }
628 per_dev->or = or;
629
630 osd_req_set_attributes(or, &ios->obj);
631 osd_req_add_set_attr_list(or, attr, 1);
632 }
633
634 return 0;
635}
636
437int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) 637int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
438{ 638{
439 struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info; 639 struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info;
440 struct exofs_io_state *ios; 640 struct exofs_io_state *ios;
441 struct osd_attr attr; 641 struct exofs_trunc_attr {
442 __be64 newsize; 642 struct osd_attr attr;
643 __be64 newsize;
644 } *size_attrs;
645 u64 this_obj_size;
646 unsigned dev;
647 unsigned unit_off;
443 int i, ret; 648 int i, ret;
444 649
445 if (exofs_get_io_state(&sbi->layout, &ios)) 650 ret = exofs_get_io_state(&sbi->layout, &ios);
446 return -ENOMEM; 651 if (unlikely(ret))
652 return ret;
653
654 size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs),
655 GFP_KERNEL);
656 if (unlikely(!size_attrs)) {
657 ret = -ENOMEM;
658 goto out;
659 }
447 660
448 ios->obj.id = exofs_oi_objno(oi); 661 ios->obj.id = exofs_oi_objno(oi);
449 ios->cred = oi->i_cred; 662 ios->cred = oi->i_cred;
450 663
451 newsize = cpu_to_be64(size); 664 ios->numdevs = ios->layout->s_numdevs;
452 attr = g_attr_logical_length; 665 _offset_dev_unit_off(ios, size, &this_obj_size, &dev, &unit_off);
453 attr.val_ptr = &newsize;
454 666
455 for (i = 0; i < sbi->layout.s_numdevs; i++) { 667 for (i = 0; i < ios->layout->group_width; ++i) {
456 struct osd_request *or; 668 struct exofs_trunc_attr *size_attr = &size_attrs[i];
669 u64 obj_size;
457 670
458 or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); 671 if (i < dev)
459 if (unlikely(!or)) { 672 obj_size = this_obj_size +
460 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 673 ios->layout->stripe_unit - unit_off;
461 ret = -ENOMEM; 674 else if (i == dev)
462 goto out; 675 obj_size = this_obj_size;
463 } 676 else /* i > dev */
464 ios->per_dev[i].or = or; 677 obj_size = this_obj_size - unit_off;
465 ios->numdevs++;
466 678
467 osd_req_set_attributes(or, &ios->obj); 679 size_attr->newsize = cpu_to_be64(obj_size);
468 osd_req_add_set_attr_list(or, &attr, 1); 680 size_attr->attr = g_attr_logical_length;
681 size_attr->attr.val_ptr = &size_attr->newsize;
682
683 ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
684 &size_attr->attr);
685 if (unlikely(ret))
686 goto out;
469 } 687 }
470 ret = exofs_io_execute(ios); 688 ret = exofs_io_execute(ios);
471 689
472out: 690out:
691 kfree(size_attrs);
473 exofs_put_io_state(ios); 692 exofs_put_io_state(ios);
474 return ret; 693 return ret;
475} 694}
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index fc8875186ae8..8f4e4b37a578 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -308,6 +308,8 @@ static void exofs_put_super(struct super_block *sb)
308static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, 308static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
309 struct exofs_device_table *dt) 309 struct exofs_device_table *dt)
310{ 310{
311 u64 stripe_length;
312
311 sbi->data_map.odm_num_comps = 313 sbi->data_map.odm_num_comps =
312 le32_to_cpu(dt->dt_data_map.cb_num_comps); 314 le32_to_cpu(dt->dt_data_map.cb_num_comps);
313 sbi->data_map.odm_stripe_unit = 315 sbi->data_map.odm_stripe_unit =
@@ -321,14 +323,47 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
321 sbi->data_map.odm_raid_algorithm = 323 sbi->data_map.odm_raid_algorithm =
322 le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); 324 le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
323 325
324/* FIXME: Hard coded mirror only for now. if not so do not mount */ 326/* FIXME: Only raid0 !group_width/depth for now. if not so, do not mount */
325 if ((sbi->data_map.odm_num_comps != numdevs) || 327 if (sbi->data_map.odm_group_width || sbi->data_map.odm_group_depth) {
326 (sbi->data_map.odm_stripe_unit != EXOFS_BLKSIZE) || 328 EXOFS_ERR("Group width/depth not supported\n");
327 (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) ||
328 (sbi->data_map.odm_mirror_cnt != (numdevs - 1)))
329 return -EINVAL; 329 return -EINVAL;
330 else 330 }
331 return 0; 331 if (sbi->data_map.odm_num_comps != numdevs) {
332 EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n",
333 sbi->data_map.odm_num_comps, numdevs);
334 return -EINVAL;
335 }
336 if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) {
337 EXOFS_ERR("Only RAID_0 for now\n");
338 return -EINVAL;
339 }
340 if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) {
341 EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n",
342 numdevs, sbi->data_map.odm_mirror_cnt);
343 return -EINVAL;
344 }
345
346 stripe_length = sbi->data_map.odm_stripe_unit *
347 (numdevs / (sbi->data_map.odm_mirror_cnt + 1));
348 if (stripe_length >= (1ULL << 32)) {
349 EXOFS_ERR("Total Stripe length(0x%llx)"
350 " >= 32bit is not supported\n", _LLU(stripe_length));
351 return -EINVAL;
352 }
353
354 if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) {
355 EXOFS_ERR("Stripe Unit(0x%llx)"
356 " must be Multples of PAGE_SIZE(0x%lx)\n",
357 _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE);
358 return -EINVAL;
359 }
360
361 sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit;
362 sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1;
363 sbi->layout.group_width = sbi->data_map.odm_num_comps /
364 sbi->layout.mirrors_p1;
365
366 return 0;
332} 367}
333 368
334/* @odi is valid only as long as @fscb_dev is valid */ 369/* @odi is valid only as long as @fscb_dev is valid */
@@ -502,6 +537,9 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
502 } 537 }
503 538
504 /* Default layout in case we do not have a device-table */ 539 /* Default layout in case we do not have a device-table */
540 sbi->layout.stripe_unit = PAGE_SIZE;
541 sbi->layout.mirrors_p1 = 1;
542 sbi->layout.group_width = 1;
505 sbi->layout.s_ods[0] = od; 543 sbi->layout.s_ods[0] = od;
506 sbi->layout.s_numdevs = 1; 544 sbi->layout.s_numdevs = 1;
507 sbi->layout.s_pid = opts->pid; 545 sbi->layout.s_pid = opts->pid;