diff options
author | Boaz Harrosh <bharrosh@panasas.com> | 2010-02-01 06:35:51 -0500 |
---|---|---|
committer | Boaz Harrosh <bharrosh@panasas.com> | 2010-02-28 06:43:08 -0500 |
commit | 5d952b8391692553c31e620a92d6e09262a9a307 (patch) | |
tree | b3a1a0490fc98b6304685d64bb4774235ec94a2d | |
parent | d9c740d2253e75db8cef8f87a3125c450f3ebd82 (diff) |
exofs: RAID0 support
We now support striping over mirror devices, including a variable-sized
stripe_unit.
Some limits:
* stripe_unit must be a multiple of PAGE_SIZE
* stripe_unit * stripe_count must fit in 32 bits (i.e. be less than 4GB)
Tested RAID0 over mirrors, RAID0 only, and mirrors only. All check out.
Design notes:
* I'm not using a vectored raid-engine mechanism yet. Following the
pnfs-objects-layout data-map structure, "Mirror" is just a special
case of "group_width" == 1, and RAID0 is a special case of
"Mirrors" == 1. The performance loss of the general case compared to the
particular special-case optimization is totally negligible, also
considering the extra code size.
* In general I added a prepare_stripes() stage that divides the
to-be-io pages among the participating devices. The previous
exofs_ios_write/read now becomes _write/read_mirrors, and a new
write/read upper layer loops on all devices calling
_write/read_mirrors. Effectively, the prepare_stripes stage is the whole
secret.
Also, truncate needed fixing to accommodate striping.
* In a RAID0 arrangement, in a regular usage scenario, if all inode
layouts start at the same device, the small files fill up the
first device and the later devices stay empty; the farther the device,
the emptier it is.
To fix that, each inode starts at a different stripe_unit,
according to its obj_id modulo the number of stripe-units, and
then spans all stripe-units in the same incrementing order,
wrapping back to the beginning of the device table. We call it
a stripe-units moving window.
Special consideration was taken to keep all devices in a mirror
arrangement identical, so a broken osd-device can just be cloned
from one of the mirrors and no FS scrubbing is needed. (We do that
by rotating a stripe-unit at a time and not a single device at a time.)
TODO:
We no longer verify object_length == inode->i_size in exofs_iget
(since i_size is now striped across multiple objects).
I should introduce a multiple-device attribute reading, and use
it in exofs_iget.
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
-rw-r--r-- | fs/exofs/exofs.h | 11 | ||||
-rw-r--r-- | fs/exofs/inode.c | 26 | ||||
-rw-r--r-- | fs/exofs/ios.c | 327 | ||||
-rw-r--r-- | fs/exofs/super.c | 52 |
4 files changed, 333 insertions, 83 deletions
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 09e331935514..0d8a34b21ae1 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h | |||
@@ -58,6 +58,14 @@ | |||
58 | struct exofs_layout { | 58 | struct exofs_layout { |
59 | osd_id s_pid; /* partition ID of file system*/ | 59 | osd_id s_pid; /* partition ID of file system*/ |
60 | 60 | ||
61 | /* Our way of looking at the data_map */ | ||
62 | unsigned stripe_unit; | ||
63 | unsigned mirrors_p1; | ||
64 | |||
65 | unsigned group_width; | ||
66 | |||
67 | enum exofs_inode_layout_gen_functions lay_func; | ||
68 | |||
61 | unsigned s_numdevs; /* Num of devices in array */ | 69 | unsigned s_numdevs; /* Num of devices in array */ |
62 | struct osd_dev *s_ods[0]; /* Variable length */ | 70 | struct osd_dev *s_ods[0]; /* Variable length */ |
63 | }; | 71 | }; |
@@ -133,6 +141,9 @@ struct exofs_io_state { | |||
133 | struct exofs_per_dev_state { | 141 | struct exofs_per_dev_state { |
134 | struct osd_request *or; | 142 | struct osd_request *or; |
135 | struct bio *bio; | 143 | struct bio *bio; |
144 | loff_t offset; | ||
145 | unsigned length; | ||
146 | unsigned dev; | ||
136 | } per_dev[]; | 147 | } per_dev[]; |
137 | }; | 148 | }; |
138 | 149 | ||
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 0163546ba05a..2b3163ea56eb 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c | |||
@@ -869,18 +869,17 @@ static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF( | |||
869 | 0); | 869 | 0); |
870 | 870 | ||
871 | /* | 871 | /* |
872 | * Read an inode from the OSD, and return it as is. We also return the size | 872 | * Read the Linux inode info from the OSD, and return it as is. In exofs the |
873 | * attribute in the 'obj_size' argument. | 873 | * inode info is in an application specific page/attribute of the osd-object. |
874 | */ | 874 | */ |
875 | static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, | 875 | static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, |
876 | struct exofs_fcb *inode, uint64_t *obj_size) | 876 | struct exofs_fcb *inode) |
877 | { | 877 | { |
878 | struct exofs_sb_info *sbi = sb->s_fs_info; | 878 | struct exofs_sb_info *sbi = sb->s_fs_info; |
879 | struct osd_attr attrs[] = { | 879 | struct osd_attr attrs[] = { |
880 | [0] = g_attr_inode_data, | 880 | [0] = g_attr_inode_data, |
881 | [1] = g_attr_inode_file_layout, | 881 | [1] = g_attr_inode_file_layout, |
882 | [2] = g_attr_inode_dir_layout, | 882 | [2] = g_attr_inode_dir_layout, |
883 | [3] = g_attr_logical_length, | ||
884 | }; | 883 | }; |
885 | struct exofs_io_state *ios; | 884 | struct exofs_io_state *ios; |
886 | struct exofs_on_disk_inode_layout *layout; | 885 | struct exofs_on_disk_inode_layout *layout; |
@@ -944,15 +943,6 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, | |||
944 | } | 943 | } |
945 | } | 944 | } |
946 | 945 | ||
947 | *obj_size = ~0; | ||
948 | ret = extract_attr_from_ios(ios, &attrs[3]); | ||
949 | if (ret) { | ||
950 | EXOFS_ERR("%s: extract_attr of logical_length failed\n", | ||
951 | __func__); | ||
952 | goto out; | ||
953 | } | ||
954 | *obj_size = get_unaligned_be64(attrs[3].val_ptr); | ||
955 | |||
956 | out: | 946 | out: |
957 | exofs_put_io_state(ios); | 947 | exofs_put_io_state(ios); |
958 | return ret; | 948 | return ret; |
@@ -971,7 +961,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) | |||
971 | struct exofs_i_info *oi; | 961 | struct exofs_i_info *oi; |
972 | struct exofs_fcb fcb; | 962 | struct exofs_fcb fcb; |
973 | struct inode *inode; | 963 | struct inode *inode; |
974 | uint64_t obj_size; | ||
975 | int ret; | 964 | int ret; |
976 | 965 | ||
977 | inode = iget_locked(sb, ino); | 966 | inode = iget_locked(sb, ino); |
@@ -983,7 +972,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) | |||
983 | __oi_init(oi); | 972 | __oi_init(oi); |
984 | 973 | ||
985 | /* read the inode from the osd */ | 974 | /* read the inode from the osd */ |
986 | ret = exofs_get_inode(sb, oi, &fcb, &obj_size); | 975 | ret = exofs_get_inode(sb, oi, &fcb); |
987 | if (ret) | 976 | if (ret) |
988 | goto bad_inode; | 977 | goto bad_inode; |
989 | 978 | ||
@@ -1004,13 +993,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) | |||
1004 | inode->i_blkbits = EXOFS_BLKSHIFT; | 993 | inode->i_blkbits = EXOFS_BLKSHIFT; |
1005 | inode->i_generation = le32_to_cpu(fcb.i_generation); | 994 | inode->i_generation = le32_to_cpu(fcb.i_generation); |
1006 | 995 | ||
1007 | if ((inode->i_size != obj_size) && | ||
1008 | (!exofs_inode_is_fast_symlink(inode))) { | ||
1009 | EXOFS_ERR("WARNING: Size of inode=%llu != object=%llu\n", | ||
1010 | inode->i_size, _LLU(obj_size)); | ||
1011 | /* FIXME: call exofs_inode_recovery() */ | ||
1012 | } | ||
1013 | |||
1014 | oi->i_dir_start_lookup = 0; | 996 | oi->i_dir_start_lookup = 0; |
1015 | 997 | ||
1016 | if ((inode->i_nlink == 0) && (inode->i_mode == 0)) { | 998 | if ((inode->i_nlink == 0) && (inode->i_mode == 0)) { |
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 2b81f99fd62c..6e446b2670b9 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c | |||
@@ -23,6 +23,7 @@ | |||
23 | */ | 23 | */ |
24 | 24 | ||
25 | #include <scsi/scsi_device.h> | 25 | #include <scsi/scsi_device.h> |
26 | #include <asm/div64.h> | ||
26 | 27 | ||
27 | #include "exofs.h" | 28 | #include "exofs.h" |
28 | 29 | ||
@@ -110,7 +111,17 @@ void exofs_put_io_state(struct exofs_io_state *ios) | |||
110 | unsigned exofs_layout_od_id(struct exofs_layout *layout, | 111 | unsigned exofs_layout_od_id(struct exofs_layout *layout, |
111 | osd_id obj_no, unsigned layout_index) | 112 | osd_id obj_no, unsigned layout_index) |
112 | { | 113 | { |
113 | return layout_index; | 114 | /* switch (layout->lay_func) { |
115 | case LAYOUT_MOVING_WINDOW: | ||
116 | {*/ | ||
117 | unsigned dev_mod = obj_no; | ||
118 | |||
119 | return (layout_index + dev_mod * layout->mirrors_p1) % | ||
120 | layout->s_numdevs; | ||
121 | /* } | ||
122 | case LAYOUT_FUNC_IMPLICT: | ||
123 | return layout->devs[layout_index]; | ||
124 | }*/ | ||
114 | } | 125 | } |
115 | 126 | ||
116 | static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios, | 127 | static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios, |
@@ -225,8 +236,8 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) | |||
225 | _clear_bio(ios->per_dev[i].bio); | 236 | _clear_bio(ios->per_dev[i].bio); |
226 | EXOFS_DBGMSG("start read offset passed end of file " | 237 | EXOFS_DBGMSG("start read offset passed end of file " |
227 | "offset=0x%llx, length=0x%llx\n", | 238 | "offset=0x%llx, length=0x%llx\n", |
228 | _LLU(ios->offset), | 239 | _LLU(ios->per_dev[i].offset), |
229 | _LLU(ios->length)); | 240 | _LLU(ios->per_dev[i].length)); |
230 | 241 | ||
231 | continue; /* we recovered */ | 242 | continue; /* we recovered */ |
232 | } | 243 | } |
@@ -248,6 +259,127 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) | |||
248 | return acumulated_lin_err; | 259 | return acumulated_lin_err; |
249 | } | 260 | } |
250 | 261 | ||
262 | /* REMOVEME: After review | ||
263 | Some quoteing from the standard | ||
264 | |||
265 | L = logical offset into the file | ||
266 | W = number of data components in a stripe | ||
267 | S = W * stripe_unit (S is Stripe length) | ||
268 | N = L / S (N is the stripe Number) | ||
269 | C = (L-(N*S)) / stripe_unit (C is the component) | ||
270 | O = (N*stripe_unit)+(L%stripe_unit) (O is the object's offset) | ||
271 | */ | ||
272 | |||
273 | static void _offset_dev_unit_off(struct exofs_io_state *ios, u64 file_offset, | ||
274 | u64 *obj_offset, unsigned *dev, unsigned *unit_off) | ||
275 | { | ||
276 | unsigned stripe_unit = ios->layout->stripe_unit; | ||
277 | unsigned stripe_length = stripe_unit * ios->layout->group_width; | ||
278 | u64 stripe_no = file_offset; | ||
279 | unsigned stripe_mod = do_div(stripe_no, stripe_length); | ||
280 | |||
281 | *unit_off = stripe_mod % stripe_unit; | ||
282 | *obj_offset = stripe_no * stripe_unit + *unit_off; | ||
283 | *dev = stripe_mod / stripe_unit * ios->layout->mirrors_p1; | ||
284 | } | ||
285 | |||
286 | static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_bvec, | ||
287 | struct exofs_per_dev_state *per_dev, int cur_len) | ||
288 | { | ||
289 | unsigned bv = *cur_bvec; | ||
290 | struct request_queue *q = | ||
291 | osd_request_queue(exofs_ios_od(ios, per_dev->dev)); | ||
292 | |||
293 | per_dev->length += cur_len; | ||
294 | |||
295 | if (per_dev->bio == NULL) { | ||
296 | unsigned pages_in_stripe = ios->layout->group_width * | ||
297 | (ios->layout->stripe_unit / PAGE_SIZE); | ||
298 | unsigned bio_size = (ios->bio->bi_vcnt + pages_in_stripe) / | ||
299 | ios->layout->group_width; | ||
300 | |||
301 | per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); | ||
302 | if (unlikely(!per_dev->bio)) { | ||
303 | EXOFS_DBGMSG("Faild to allocate BIO size=%u\n", | ||
304 | bio_size); | ||
305 | return -ENOMEM; | ||
306 | } | ||
307 | } | ||
308 | |||
309 | while (cur_len > 0) { | ||
310 | int added_len; | ||
311 | struct bio_vec *bvec = &ios->bio->bi_io_vec[bv]; | ||
312 | |||
313 | BUG_ON(ios->bio->bi_vcnt <= bv); | ||
314 | cur_len -= bvec->bv_len; | ||
315 | |||
316 | added_len = bio_add_pc_page(q, per_dev->bio, bvec->bv_page, | ||
317 | bvec->bv_len, bvec->bv_offset); | ||
318 | if (unlikely(bvec->bv_len != added_len)) | ||
319 | return -ENOMEM; | ||
320 | ++bv; | ||
321 | } | ||
322 | BUG_ON(cur_len); | ||
323 | |||
324 | *cur_bvec = bv; | ||
325 | return 0; | ||
326 | } | ||
327 | |||
328 | static int _prepare_for_striping(struct exofs_io_state *ios) | ||
329 | { | ||
330 | u64 length = ios->length; | ||
331 | u64 offset = ios->offset; | ||
332 | unsigned stripe_unit = ios->layout->stripe_unit; | ||
333 | unsigned comp = 0; | ||
334 | unsigned stripes = 0; | ||
335 | unsigned cur_bvec = 0; | ||
336 | int ret; | ||
337 | |||
338 | if (!ios->bio) { | ||
339 | if (ios->kern_buff) { | ||
340 | struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; | ||
341 | unsigned unit_off; | ||
342 | |||
343 | _offset_dev_unit_off(ios, offset, &per_dev->offset, | ||
344 | &per_dev->dev, &unit_off); | ||
345 | /* no cross device without page array */ | ||
346 | BUG_ON((ios->layout->group_width > 1) && | ||
347 | (unit_off + length > stripe_unit)); | ||
348 | } | ||
349 | ios->numdevs = ios->layout->mirrors_p1; | ||
350 | return 0; | ||
351 | } | ||
352 | |||
353 | while (length) { | ||
354 | struct exofs_per_dev_state *per_dev = &ios->per_dev[comp]; | ||
355 | unsigned cur_len; | ||
356 | |||
357 | if (!per_dev->length) { | ||
358 | unsigned unit_off; | ||
359 | |||
360 | _offset_dev_unit_off(ios, offset, &per_dev->offset, | ||
361 | &per_dev->dev, &unit_off); | ||
362 | stripes++; | ||
363 | cur_len = min_t(u64, stripe_unit - unit_off, length); | ||
364 | offset += cur_len; | ||
365 | } else { | ||
366 | cur_len = min_t(u64, stripe_unit, length); | ||
367 | } | ||
368 | |||
369 | ret = _add_stripe_unit(ios, &cur_bvec, per_dev, cur_len); | ||
370 | if (unlikely(ret)) | ||
371 | goto out; | ||
372 | |||
373 | comp += ios->layout->mirrors_p1; | ||
374 | comp %= ios->layout->s_numdevs; | ||
375 | |||
376 | length -= cur_len; | ||
377 | } | ||
378 | out: | ||
379 | ios->numdevs = stripes * ios->layout->mirrors_p1; | ||
380 | return ret; | ||
381 | } | ||
382 | |||
251 | int exofs_sbi_create(struct exofs_io_state *ios) | 383 | int exofs_sbi_create(struct exofs_io_state *ios) |
252 | { | 384 | { |
253 | int i, ret; | 385 | int i, ret; |
@@ -296,61 +428,71 @@ out: | |||
296 | return ret; | 428 | return ret; |
297 | } | 429 | } |
298 | 430 | ||
299 | int exofs_sbi_write(struct exofs_io_state *ios) | 431 | static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) |
300 | { | 432 | { |
301 | int i, ret; | 433 | struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp]; |
434 | unsigned dev = ios->per_dev[cur_comp].dev; | ||
435 | unsigned last_comp = cur_comp + ios->layout->mirrors_p1; | ||
436 | int ret = 0; | ||
302 | 437 | ||
303 | for (i = 0; i < ios->layout->s_numdevs; i++) { | 438 | for (; cur_comp < last_comp; ++cur_comp, ++dev) { |
439 | struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; | ||
304 | struct osd_request *or; | 440 | struct osd_request *or; |
305 | 441 | ||
306 | or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); | 442 | or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL); |
307 | if (unlikely(!or)) { | 443 | if (unlikely(!or)) { |
308 | EXOFS_ERR("%s: osd_start_request failed\n", __func__); | 444 | EXOFS_ERR("%s: osd_start_request failed\n", __func__); |
309 | ret = -ENOMEM; | 445 | ret = -ENOMEM; |
310 | goto out; | 446 | goto out; |
311 | } | 447 | } |
312 | ios->per_dev[i].or = or; | 448 | per_dev->or = or; |
313 | ios->numdevs++; | 449 | per_dev->offset = master_dev->offset; |
314 | 450 | ||
315 | if (ios->bio) { | 451 | if (ios->bio) { |
316 | struct bio *bio; | 452 | struct bio *bio; |
317 | 453 | ||
318 | if (i != 0) { | 454 | if (per_dev != master_dev) { |
319 | bio = bio_kmalloc(GFP_KERNEL, | 455 | bio = bio_kmalloc(GFP_KERNEL, |
320 | ios->bio->bi_max_vecs); | 456 | master_dev->bio->bi_max_vecs); |
321 | if (unlikely(!bio)) { | 457 | if (unlikely(!bio)) { |
322 | EXOFS_DBGMSG( | 458 | EXOFS_DBGMSG( |
323 | "Faild to allocate BIO size=%u\n", | 459 | "Faild to allocate BIO size=%u\n", |
324 | ios->bio->bi_max_vecs); | 460 | master_dev->bio->bi_max_vecs); |
325 | ret = -ENOMEM; | 461 | ret = -ENOMEM; |
326 | goto out; | 462 | goto out; |
327 | } | 463 | } |
328 | 464 | ||
329 | __bio_clone(bio, ios->bio); | 465 | __bio_clone(bio, master_dev->bio); |
330 | bio->bi_bdev = NULL; | 466 | bio->bi_bdev = NULL; |
331 | bio->bi_next = NULL; | 467 | bio->bi_next = NULL; |
332 | ios->per_dev[i].bio = bio; | 468 | per_dev->length = master_dev->length; |
469 | per_dev->bio = bio; | ||
470 | per_dev->dev = dev; | ||
333 | } else { | 471 | } else { |
334 | bio = ios->bio; | 472 | bio = master_dev->bio; |
473 | /* FIXME: bio_set_dir() */ | ||
474 | bio->bi_rw |= (1 << BIO_RW); | ||
335 | } | 475 | } |
336 | 476 | ||
337 | osd_req_write(or, &ios->obj, ios->offset, bio, | 477 | osd_req_write(or, &ios->obj, per_dev->offset, bio, |
338 | ios->length); | 478 | per_dev->length); |
339 | EXOFS_DBGMSG("write(0x%llx) offset=0x%llx " | 479 | EXOFS_DBGMSG("write(0x%llx) offset=0x%llx " |
340 | "length=0x%llx dev=%d\n", | 480 | "length=0x%llx dev=%d\n", |
341 | _LLU(ios->obj.id), _LLU(ios->offset), | 481 | _LLU(ios->obj.id), _LLU(per_dev->offset), |
342 | _LLU(ios->length), i); | 482 | _LLU(per_dev->length), dev); |
343 | } else if (ios->kern_buff) { | 483 | } else if (ios->kern_buff) { |
344 | osd_req_write_kern(or, &ios->obj, ios->offset, | 484 | ret = osd_req_write_kern(or, &ios->obj, per_dev->offset, |
345 | ios->kern_buff, ios->length); | 485 | ios->kern_buff, ios->length); |
486 | if (unlikely(ret)) | ||
487 | goto out; | ||
346 | EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx " | 488 | EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx " |
347 | "length=0x%llx dev=%d\n", | 489 | "length=0x%llx dev=%d\n", |
348 | _LLU(ios->obj.id), _LLU(ios->offset), | 490 | _LLU(ios->obj.id), _LLU(per_dev->offset), |
349 | _LLU(ios->length), i); | 491 | _LLU(ios->length), dev); |
350 | } else { | 492 | } else { |
351 | osd_req_set_attributes(or, &ios->obj); | 493 | osd_req_set_attributes(or, &ios->obj); |
352 | EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", | 494 | EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", |
353 | _LLU(ios->obj.id), ios->out_attr_len, i); | 495 | _LLU(ios->obj.id), ios->out_attr_len, dev); |
354 | } | 496 | } |
355 | 497 | ||
356 | if (ios->out_attr) | 498 | if (ios->out_attr) |
@@ -361,40 +503,57 @@ int exofs_sbi_write(struct exofs_io_state *ios) | |||
361 | osd_req_add_get_attr_list(or, ios->in_attr, | 503 | osd_req_add_get_attr_list(or, ios->in_attr, |
362 | ios->in_attr_len); | 504 | ios->in_attr_len); |
363 | } | 505 | } |
364 | ret = exofs_io_execute(ios); | ||
365 | 506 | ||
366 | out: | 507 | out: |
367 | return ret; | 508 | return ret; |
368 | } | 509 | } |
369 | 510 | ||
370 | int exofs_sbi_read(struct exofs_io_state *ios) | 511 | int exofs_sbi_write(struct exofs_io_state *ios) |
512 | { | ||
513 | int i; | ||
514 | int ret; | ||
515 | |||
516 | ret = _prepare_for_striping(ios); | ||
517 | if (unlikely(ret)) | ||
518 | return ret; | ||
519 | |||
520 | for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { | ||
521 | ret = _sbi_write_mirror(ios, i); | ||
522 | if (unlikely(ret)) | ||
523 | return ret; | ||
524 | } | ||
525 | |||
526 | ret = exofs_io_execute(ios); | ||
527 | return ret; | ||
528 | } | ||
529 | |||
530 | static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp) | ||
371 | { | 531 | { |
372 | struct osd_request *or; | 532 | struct osd_request *or; |
373 | struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; | 533 | struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; |
374 | unsigned first_dev = (unsigned)ios->obj.id; | 534 | unsigned first_dev = (unsigned)ios->obj.id; |
375 | 535 | ||
376 | first_dev %= ios->layout->s_numdevs; | 536 | first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; |
377 | or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); | 537 | or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); |
378 | if (unlikely(!or)) { | 538 | if (unlikely(!or)) { |
379 | EXOFS_ERR("%s: osd_start_request failed\n", __func__); | 539 | EXOFS_ERR("%s: osd_start_request failed\n", __func__); |
380 | return -ENOMEM; | 540 | return -ENOMEM; |
381 | } | 541 | } |
382 | per_dev->or = or; | 542 | per_dev->or = or; |
383 | ios->numdevs++; | ||
384 | 543 | ||
385 | if (ios->bio) { | 544 | if (ios->bio) { |
386 | osd_req_read(or, &ios->obj, ios->offset, ios->bio, ios->length); | 545 | osd_req_read(or, &ios->obj, per_dev->offset, |
546 | per_dev->bio, per_dev->length); | ||
387 | EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" | 547 | EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" |
388 | " dev=%d\n", _LLU(ios->obj.id), | 548 | " dev=%d\n", _LLU(ios->obj.id), |
389 | _LLU(ios->offset), _LLU(ios->length), | 549 | _LLU(per_dev->offset), _LLU(per_dev->length), |
390 | first_dev); | 550 | first_dev); |
391 | } else if (ios->kern_buff) { | 551 | } else if (ios->kern_buff) { |
392 | int ret = osd_req_read_kern(or, &ios->obj, ios->offset, | 552 | int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset, |
393 | ios->kern_buff, ios->length); | 553 | ios->kern_buff, ios->length); |
394 | |||
395 | EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " | 554 | EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " |
396 | "length=0x%llx dev=%d ret=>%d\n", | 555 | "length=0x%llx dev=%d ret=>%d\n", |
397 | _LLU(ios->obj.id), _LLU(ios->offset), | 556 | _LLU(ios->obj.id), _LLU(per_dev->offset), |
398 | _LLU(ios->length), first_dev, ret); | 557 | _LLU(ios->length), first_dev, ret); |
399 | if (unlikely(ret)) | 558 | if (unlikely(ret)) |
400 | return ret; | 559 | return ret; |
@@ -403,14 +562,32 @@ int exofs_sbi_read(struct exofs_io_state *ios) | |||
403 | EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", | 562 | EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", |
404 | _LLU(ios->obj.id), ios->in_attr_len, first_dev); | 563 | _LLU(ios->obj.id), ios->in_attr_len, first_dev); |
405 | } | 564 | } |
406 | |||
407 | if (ios->out_attr) | 565 | if (ios->out_attr) |
408 | osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); | 566 | osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); |
409 | 567 | ||
410 | if (ios->in_attr) | 568 | if (ios->in_attr) |
411 | osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); | 569 | osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); |
412 | 570 | ||
413 | return exofs_io_execute(ios); | 571 | return 0; |
572 | } | ||
573 | |||
574 | int exofs_sbi_read(struct exofs_io_state *ios) | ||
575 | { | ||
576 | int i; | ||
577 | int ret; | ||
578 | |||
579 | ret = _prepare_for_striping(ios); | ||
580 | if (unlikely(ret)) | ||
581 | return ret; | ||
582 | |||
583 | for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { | ||
584 | ret = _sbi_read_mirror(ios, i); | ||
585 | if (unlikely(ret)) | ||
586 | return ret; | ||
587 | } | ||
588 | |||
589 | ret = exofs_io_execute(ios); | ||
590 | return ret; | ||
414 | } | 591 | } |
415 | 592 | ||
416 | int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) | 593 | int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) |
@@ -434,42 +611,84 @@ int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) | |||
434 | return -EIO; | 611 | return -EIO; |
435 | } | 612 | } |
436 | 613 | ||
614 | static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp, | ||
615 | struct osd_attr *attr) | ||
616 | { | ||
617 | int last_comp = cur_comp + ios->layout->mirrors_p1; | ||
618 | |||
619 | for (; cur_comp < last_comp; ++cur_comp) { | ||
620 | struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; | ||
621 | struct osd_request *or; | ||
622 | |||
623 | or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL); | ||
624 | if (unlikely(!or)) { | ||
625 | EXOFS_ERR("%s: osd_start_request failed\n", __func__); | ||
626 | return -ENOMEM; | ||
627 | } | ||
628 | per_dev->or = or; | ||
629 | |||
630 | osd_req_set_attributes(or, &ios->obj); | ||
631 | osd_req_add_set_attr_list(or, attr, 1); | ||
632 | } | ||
633 | |||
634 | return 0; | ||
635 | } | ||
636 | |||
437 | int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) | 637 | int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) |
438 | { | 638 | { |
439 | struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info; | 639 | struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info; |
440 | struct exofs_io_state *ios; | 640 | struct exofs_io_state *ios; |
441 | struct osd_attr attr; | 641 | struct exofs_trunc_attr { |
442 | __be64 newsize; | 642 | struct osd_attr attr; |
643 | __be64 newsize; | ||
644 | } *size_attrs; | ||
645 | u64 this_obj_size; | ||
646 | unsigned dev; | ||
647 | unsigned unit_off; | ||
443 | int i, ret; | 648 | int i, ret; |
444 | 649 | ||
445 | if (exofs_get_io_state(&sbi->layout, &ios)) | 650 | ret = exofs_get_io_state(&sbi->layout, &ios); |
446 | return -ENOMEM; | 651 | if (unlikely(ret)) |
652 | return ret; | ||
653 | |||
654 | size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs), | ||
655 | GFP_KERNEL); | ||
656 | if (unlikely(!size_attrs)) { | ||
657 | ret = -ENOMEM; | ||
658 | goto out; | ||
659 | } | ||
447 | 660 | ||
448 | ios->obj.id = exofs_oi_objno(oi); | 661 | ios->obj.id = exofs_oi_objno(oi); |
449 | ios->cred = oi->i_cred; | 662 | ios->cred = oi->i_cred; |
450 | 663 | ||
451 | newsize = cpu_to_be64(size); | 664 | ios->numdevs = ios->layout->s_numdevs; |
452 | attr = g_attr_logical_length; | 665 | _offset_dev_unit_off(ios, size, &this_obj_size, &dev, &unit_off); |
453 | attr.val_ptr = &newsize; | ||
454 | 666 | ||
455 | for (i = 0; i < sbi->layout.s_numdevs; i++) { | 667 | for (i = 0; i < ios->layout->group_width; ++i) { |
456 | struct osd_request *or; | 668 | struct exofs_trunc_attr *size_attr = &size_attrs[i]; |
669 | u64 obj_size; | ||
457 | 670 | ||
458 | or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); | 671 | if (i < dev) |
459 | if (unlikely(!or)) { | 672 | obj_size = this_obj_size + |
460 | EXOFS_ERR("%s: osd_start_request failed\n", __func__); | 673 | ios->layout->stripe_unit - unit_off; |
461 | ret = -ENOMEM; | 674 | else if (i == dev) |
462 | goto out; | 675 | obj_size = this_obj_size; |
463 | } | 676 | else /* i > dev */ |
464 | ios->per_dev[i].or = or; | 677 | obj_size = this_obj_size - unit_off; |
465 | ios->numdevs++; | ||
466 | 678 | ||
467 | osd_req_set_attributes(or, &ios->obj); | 679 | size_attr->newsize = cpu_to_be64(obj_size); |
468 | osd_req_add_set_attr_list(or, &attr, 1); | 680 | size_attr->attr = g_attr_logical_length; |
681 | size_attr->attr.val_ptr = &size_attr->newsize; | ||
682 | |||
683 | ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, | ||
684 | &size_attr->attr); | ||
685 | if (unlikely(ret)) | ||
686 | goto out; | ||
469 | } | 687 | } |
470 | ret = exofs_io_execute(ios); | 688 | ret = exofs_io_execute(ios); |
471 | 689 | ||
472 | out: | 690 | out: |
691 | kfree(size_attrs); | ||
473 | exofs_put_io_state(ios); | 692 | exofs_put_io_state(ios); |
474 | return ret; | 693 | return ret; |
475 | } | 694 | } |
diff --git a/fs/exofs/super.c b/fs/exofs/super.c index fc8875186ae8..8f4e4b37a578 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c | |||
@@ -308,6 +308,8 @@ static void exofs_put_super(struct super_block *sb) | |||
308 | static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, | 308 | static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, |
309 | struct exofs_device_table *dt) | 309 | struct exofs_device_table *dt) |
310 | { | 310 | { |
311 | u64 stripe_length; | ||
312 | |||
311 | sbi->data_map.odm_num_comps = | 313 | sbi->data_map.odm_num_comps = |
312 | le32_to_cpu(dt->dt_data_map.cb_num_comps); | 314 | le32_to_cpu(dt->dt_data_map.cb_num_comps); |
313 | sbi->data_map.odm_stripe_unit = | 315 | sbi->data_map.odm_stripe_unit = |
@@ -321,14 +323,47 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, | |||
321 | sbi->data_map.odm_raid_algorithm = | 323 | sbi->data_map.odm_raid_algorithm = |
322 | le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); | 324 | le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); |
323 | 325 | ||
324 | /* FIXME: Hard coded mirror only for now. if not so do not mount */ | 326 | /* FIXME: Only raid0 !group_width/depth for now. if not so, do not mount */ |
325 | if ((sbi->data_map.odm_num_comps != numdevs) || | 327 | if (sbi->data_map.odm_group_width || sbi->data_map.odm_group_depth) { |
326 | (sbi->data_map.odm_stripe_unit != EXOFS_BLKSIZE) || | 328 | EXOFS_ERR("Group width/depth not supported\n"); |
327 | (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) || | ||
328 | (sbi->data_map.odm_mirror_cnt != (numdevs - 1))) | ||
329 | return -EINVAL; | 329 | return -EINVAL; |
330 | else | 330 | } |
331 | return 0; | 331 | if (sbi->data_map.odm_num_comps != numdevs) { |
332 | EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n", | ||
333 | sbi->data_map.odm_num_comps, numdevs); | ||
334 | return -EINVAL; | ||
335 | } | ||
336 | if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) { | ||
337 | EXOFS_ERR("Only RAID_0 for now\n"); | ||
338 | return -EINVAL; | ||
339 | } | ||
340 | if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) { | ||
341 | EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n", | ||
342 | numdevs, sbi->data_map.odm_mirror_cnt); | ||
343 | return -EINVAL; | ||
344 | } | ||
345 | |||
346 | stripe_length = sbi->data_map.odm_stripe_unit * | ||
347 | (numdevs / (sbi->data_map.odm_mirror_cnt + 1)); | ||
348 | if (stripe_length >= (1ULL << 32)) { | ||
349 | EXOFS_ERR("Total Stripe length(0x%llx)" | ||
350 | " >= 32bit is not supported\n", _LLU(stripe_length)); | ||
351 | return -EINVAL; | ||
352 | } | ||
353 | |||
354 | if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) { | ||
355 | EXOFS_ERR("Stripe Unit(0x%llx)" | ||
356 | " must be Multples of PAGE_SIZE(0x%lx)\n", | ||
357 | _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE); | ||
358 | return -EINVAL; | ||
359 | } | ||
360 | |||
361 | sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit; | ||
362 | sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1; | ||
363 | sbi->layout.group_width = sbi->data_map.odm_num_comps / | ||
364 | sbi->layout.mirrors_p1; | ||
365 | |||
366 | return 0; | ||
332 | } | 367 | } |
333 | 368 | ||
334 | /* @odi is valid only as long as @fscb_dev is valid */ | 369 | /* @odi is valid only as long as @fscb_dev is valid */ |
@@ -502,6 +537,9 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
502 | } | 537 | } |
503 | 538 | ||
504 | /* Default layout in case we do not have a device-table */ | 539 | /* Default layout in case we do not have a device-table */ |
540 | sbi->layout.stripe_unit = PAGE_SIZE; | ||
541 | sbi->layout.mirrors_p1 = 1; | ||
542 | sbi->layout.group_width = 1; | ||
505 | sbi->layout.s_ods[0] = od; | 543 | sbi->layout.s_ods[0] = od; |
506 | sbi->layout.s_numdevs = 1; | 544 | sbi->layout.s_numdevs = 1; |
507 | sbi->layout.s_pid = opts->pid; | 545 | sbi->layout.s_pid = opts->pid; |