aboutsummaryrefslogtreecommitdiffstats
path: root/fs/nfs/objlayout
diff options
context:
space:
mode:
Diffstat (limited to 'fs/nfs/objlayout')
-rw-r--r--fs/nfs/objlayout/Kbuild5
-rw-r--r--fs/nfs/objlayout/objio_osd.c1059
-rw-r--r--fs/nfs/objlayout/objlayout.c712
-rw-r--r--fs/nfs/objlayout/objlayout.h187
-rw-r--r--fs/nfs/objlayout/pnfs_osd_xdr_cli.c412
5 files changed, 2375 insertions, 0 deletions
diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild
new file mode 100644
index 000000000000..ed30ea072bb8
--- /dev/null
+++ b/fs/nfs/objlayout/Kbuild
@@ -0,0 +1,5 @@
1#
2# Makefile for the pNFS Objects Layout Driver kernel module
3#
4objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o
5obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
new file mode 100644
index 000000000000..8ff2ea3f10ef
--- /dev/null
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -0,0 +1,1059 @@
1/*
2 * pNFS Objects layout implementation over open-osd initiator library
3 *
4 * Copyright (C) 2009 Panasas Inc. [year of first publication]
5 * All rights reserved.
6 *
7 * Benny Halevy <bhalevy@panasas.com>
8 * Boaz Harrosh <bharrosh@panasas.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * See the file COPYING included with this distribution for more details.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 *
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the Panasas company nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
28 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
34 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40#include <linux/module.h>
41#include <scsi/osd_initiator.h>
42
43#include "objlayout.h"
44
#define NFSDBG_FACILITY		NFSDBG_PNFS_LD

/*
 * Widen an integer expression to unsigned long long so it can be printed
 * with "%llx"/"%llu".  The argument is parenthesized so the cast applies to
 * the whole expression rather than only to its first operand.
 */
#define _LLU(x) ((unsigned long long)(x))
48
49enum { BIO_MAX_PAGES_KMALLOC =
50 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
51};
52
53struct objio_dev_ent {
54 struct nfs4_deviceid_node id_node;
55 struct osd_dev *od;
56};
57
58static void
59objio_free_deviceid_node(struct nfs4_deviceid_node *d)
60{
61 struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
62
63 dprintk("%s: free od=%p\n", __func__, de->od);
64 osduld_put_device(de->od);
65 kfree(de);
66}
67
68static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
69 const struct nfs4_deviceid *d_id)
70{
71 struct nfs4_deviceid_node *d;
72 struct objio_dev_ent *de;
73
74 d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
75 if (!d)
76 return NULL;
77
78 de = container_of(d, struct objio_dev_ent, id_node);
79 return de;
80}
81
82static struct objio_dev_ent *
83_dev_list_add(const struct nfs_server *nfss,
84 const struct nfs4_deviceid *d_id, struct osd_dev *od,
85 gfp_t gfp_flags)
86{
87 struct nfs4_deviceid_node *d;
88 struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
89 struct objio_dev_ent *n;
90
91 if (!de) {
92 dprintk("%s: -ENOMEM od=%p\n", __func__, od);
93 return NULL;
94 }
95
96 dprintk("%s: Adding od=%p\n", __func__, od);
97 nfs4_init_deviceid_node(&de->id_node,
98 nfss->pnfs_curr_ld,
99 nfss->nfs_client,
100 d_id);
101 de->od = od;
102
103 d = nfs4_insert_deviceid_node(&de->id_node);
104 n = container_of(d, struct objio_dev_ent, id_node);
105 if (n != de) {
106 dprintk("%s: Race with other n->od=%p\n", __func__, n->od);
107 objio_free_deviceid_node(&de->id_node);
108 de = n;
109 }
110
111 return de;
112}
113
114struct caps_buffers {
115 u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
116 u8 creds[OSD_CAP_LEN];
117};
118
119struct objio_segment {
120 struct pnfs_layout_segment lseg;
121
122 struct pnfs_osd_object_cred *comps;
123
124 unsigned mirrors_p1;
125 unsigned stripe_unit;
126 unsigned group_width; /* Data stripe_units without integrity comps */
127 u64 group_depth;
128 unsigned group_count;
129
130 unsigned max_io_size;
131
132 unsigned comps_index;
133 unsigned num_comps;
134 /* variable length */
135 struct objio_dev_ent *ods[];
136};
137
138static inline struct objio_segment *
139OBJIO_LSEG(struct pnfs_layout_segment *lseg)
140{
141 return container_of(lseg, struct objio_segment, lseg);
142}
143
144struct objio_state;
145typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
146
147struct objio_state {
148 /* Generic layer */
149 struct objlayout_io_state ol_state;
150
151 struct objio_segment *layout;
152
153 struct kref kref;
154 objio_done_fn done;
155 void *private;
156
157 unsigned long length;
158 unsigned numdevs; /* Actually used devs in this IO */
159 /* A per-device variable array of size numdevs */
160 struct _objio_per_comp {
161 struct bio *bio;
162 struct osd_request *or;
163 unsigned long length;
164 u64 offset;
165 unsigned dev;
166 } per_dev[];
167};
168
169/* Send and wait for a get_device_info of devices in the layout,
170 then look them up with the osd_initiator library */
171static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
172 struct objio_segment *objio_seg, unsigned comp,
173 gfp_t gfp_flags)
174{
175 struct pnfs_osd_deviceaddr *deviceaddr;
176 struct nfs4_deviceid *d_id;
177 struct objio_dev_ent *ode;
178 struct osd_dev *od;
179 struct osd_dev_info odi;
180 int err;
181
182 d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
183
184 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
185 if (ode)
186 return ode;
187
188 err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
189 if (unlikely(err)) {
190 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
191 __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
192 return ERR_PTR(err);
193 }
194
195 odi.systemid_len = deviceaddr->oda_systemid.len;
196 if (odi.systemid_len > sizeof(odi.systemid)) {
197 err = -EINVAL;
198 goto out;
199 } else if (odi.systemid_len)
200 memcpy(odi.systemid, deviceaddr->oda_systemid.data,
201 odi.systemid_len);
202 odi.osdname_len = deviceaddr->oda_osdname.len;
203 odi.osdname = (u8 *)deviceaddr->oda_osdname.data;
204
205 if (!odi.osdname_len && !odi.systemid_len) {
206 dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
207 __func__);
208 err = -ENODEV;
209 goto out;
210 }
211
212 od = osduld_info_lookup(&odi);
213 if (unlikely(IS_ERR(od))) {
214 err = PTR_ERR(od);
215 dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
216 goto out;
217 }
218
219 ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
220 gfp_flags);
221
222out:
223 dprintk("%s: return=%d\n", __func__, err);
224 objlayout_put_deviceinfo(deviceaddr);
225 return err ? ERR_PTR(err) : ode;
226}
227
228static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
229 struct objio_segment *objio_seg,
230 gfp_t gfp_flags)
231{
232 unsigned i;
233 int err;
234
235 /* lookup all devices */
236 for (i = 0; i < objio_seg->num_comps; i++) {
237 struct objio_dev_ent *ode;
238
239 ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags);
240 if (unlikely(IS_ERR(ode))) {
241 err = PTR_ERR(ode);
242 goto out;
243 }
244 objio_seg->ods[i] = ode;
245 }
246 err = 0;
247
248out:
249 dprintk("%s: return=%d\n", __func__, err);
250 return err;
251}
252
253static int _verify_data_map(struct pnfs_osd_layout *layout)
254{
255 struct pnfs_osd_data_map *data_map = &layout->olo_map;
256 u64 stripe_length;
257 u32 group_width;
258
259/* FIXME: Only raid0 for now. if not go through MDS */
260 if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
261 printk(KERN_ERR "Only RAID_0 for now\n");
262 return -ENOTSUPP;
263 }
264 if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
265 printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
266 data_map->odm_num_comps, data_map->odm_mirror_cnt);
267 return -EINVAL;
268 }
269
270 if (data_map->odm_group_width)
271 group_width = data_map->odm_group_width;
272 else
273 group_width = data_map->odm_num_comps /
274 (data_map->odm_mirror_cnt + 1);
275
276 stripe_length = (u64)data_map->odm_stripe_unit * group_width;
277 if (stripe_length >= (1ULL << 32)) {
278 printk(KERN_ERR "Total Stripe length(0x%llx)"
279 " >= 32bit is not supported\n", _LLU(stripe_length));
280 return -ENOTSUPP;
281 }
282
283 if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
284 printk(KERN_ERR "Stripe Unit(0x%llx)"
285 " must be Multples of PAGE_SIZE(0x%lx)\n",
286 _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
287 return -ENOTSUPP;
288 }
289
290 return 0;
291}
292
293static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
294 struct pnfs_osd_object_cred *src_comp,
295 struct caps_buffers *caps_p)
296{
297 WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
298 WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));
299
300 *cur_comp = *src_comp;
301
302 memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred,
303 sizeof(caps_p->caps_key));
304 cur_comp->oc_cap_key.cred = caps_p->caps_key;
305
306 memcpy(caps_p->creds, src_comp->oc_cap.cred,
307 sizeof(caps_p->creds));
308 cur_comp->oc_cap.cred = caps_p->creds;
309}
310
311int objio_alloc_lseg(struct pnfs_layout_segment **outp,
312 struct pnfs_layout_hdr *pnfslay,
313 struct pnfs_layout_range *range,
314 struct xdr_stream *xdr,
315 gfp_t gfp_flags)
316{
317 struct objio_segment *objio_seg;
318 struct pnfs_osd_xdr_decode_layout_iter iter;
319 struct pnfs_osd_layout layout;
320 struct pnfs_osd_object_cred *cur_comp, src_comp;
321 struct caps_buffers *caps_p;
322 int err;
323
324 err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
325 if (unlikely(err))
326 return err;
327
328 err = _verify_data_map(&layout);
329 if (unlikely(err))
330 return err;
331
332 objio_seg = kzalloc(sizeof(*objio_seg) +
333 sizeof(objio_seg->ods[0]) * layout.olo_num_comps +
334 sizeof(*objio_seg->comps) * layout.olo_num_comps +
335 sizeof(struct caps_buffers) * layout.olo_num_comps,
336 gfp_flags);
337 if (!objio_seg)
338 return -ENOMEM;
339
340 objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps);
341 cur_comp = objio_seg->comps;
342 caps_p = (void *)(cur_comp + layout.olo_num_comps);
343 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
344 copy_single_comp(cur_comp++, &src_comp, caps_p++);
345 if (unlikely(err))
346 goto err;
347
348 objio_seg->num_comps = layout.olo_num_comps;
349 objio_seg->comps_index = layout.olo_comps_index;
350 err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags);
351 if (err)
352 goto err;
353
354 objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
355 objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit;
356 if (layout.olo_map.odm_group_width) {
357 objio_seg->group_width = layout.olo_map.odm_group_width;
358 objio_seg->group_depth = layout.olo_map.odm_group_depth;
359 objio_seg->group_count = layout.olo_map.odm_num_comps /
360 objio_seg->mirrors_p1 /
361 objio_seg->group_width;
362 } else {
363 objio_seg->group_width = layout.olo_map.odm_num_comps /
364 objio_seg->mirrors_p1;
365 objio_seg->group_depth = -1;
366 objio_seg->group_count = 1;
367 }
368
369 /* Cache this calculation it will hit for every page */
370 objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE -
371 objio_seg->stripe_unit) *
372 objio_seg->group_width;
373
374 *outp = &objio_seg->lseg;
375 return 0;
376
377err:
378 kfree(objio_seg);
379 dprintk("%s: Error: return %d\n", __func__, err);
380 *outp = NULL;
381 return err;
382}
383
384void objio_free_lseg(struct pnfs_layout_segment *lseg)
385{
386 int i;
387 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
388
389 for (i = 0; i < objio_seg->num_comps; i++) {
390 if (!objio_seg->ods[i])
391 break;
392 nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node);
393 }
394 kfree(objio_seg);
395}
396
397int objio_alloc_io_state(struct pnfs_layout_segment *lseg,
398 struct objlayout_io_state **outp,
399 gfp_t gfp_flags)
400{
401 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
402 struct objio_state *ios;
403 const unsigned first_size = sizeof(*ios) +
404 objio_seg->num_comps * sizeof(ios->per_dev[0]);
405 const unsigned sec_size = objio_seg->num_comps *
406 sizeof(ios->ol_state.ioerrs[0]);
407
408 ios = kzalloc(first_size + sec_size, gfp_flags);
409 if (unlikely(!ios))
410 return -ENOMEM;
411
412 ios->layout = objio_seg;
413 ios->ol_state.ioerrs = ((void *)ios) + first_size;
414 ios->ol_state.num_comps = objio_seg->num_comps;
415
416 *outp = &ios->ol_state;
417 return 0;
418}
419
420void objio_free_io_state(struct objlayout_io_state *ol_state)
421{
422 struct objio_state *ios = container_of(ol_state, struct objio_state,
423 ol_state);
424
425 kfree(ios);
426}
427
428enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
429{
430 switch (oep) {
431 case OSD_ERR_PRI_NO_ERROR:
432 return (enum pnfs_osd_errno)0;
433
434 case OSD_ERR_PRI_CLEAR_PAGES:
435 BUG_ON(1);
436 return 0;
437
438 case OSD_ERR_PRI_RESOURCE:
439 return PNFS_OSD_ERR_RESOURCE;
440 case OSD_ERR_PRI_BAD_CRED:
441 return PNFS_OSD_ERR_BAD_CRED;
442 case OSD_ERR_PRI_NO_ACCESS:
443 return PNFS_OSD_ERR_NO_ACCESS;
444 case OSD_ERR_PRI_UNREACHABLE:
445 return PNFS_OSD_ERR_UNREACHABLE;
446 case OSD_ERR_PRI_NOT_FOUND:
447 return PNFS_OSD_ERR_NOT_FOUND;
448 case OSD_ERR_PRI_NO_SPACE:
449 return PNFS_OSD_ERR_NO_SPACE;
450 default:
451 WARN_ON(1);
452 /* fallthrough */
453 case OSD_ERR_PRI_EIO:
454 return PNFS_OSD_ERR_EIO;
455 }
456}
457
458static void _clear_bio(struct bio *bio)
459{
460 struct bio_vec *bv;
461 unsigned i;
462
463 __bio_for_each_segment(bv, bio, i, 0) {
464 unsigned this_count = bv->bv_len;
465
466 if (likely(PAGE_SIZE == this_count))
467 clear_highpage(bv->bv_page);
468 else
469 zero_user(bv->bv_page, bv->bv_offset, this_count);
470 }
471}
472
473static int _io_check(struct objio_state *ios, bool is_write)
474{
475 enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
476 int lin_ret = 0;
477 int i;
478
479 for (i = 0; i < ios->numdevs; i++) {
480 struct osd_sense_info osi;
481 struct osd_request *or = ios->per_dev[i].or;
482 unsigned dev;
483 int ret;
484
485 if (!or)
486 continue;
487
488 ret = osd_req_decode_sense(or, &osi);
489 if (likely(!ret))
490 continue;
491
492 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
493 /* start read offset passed endof file */
494 BUG_ON(is_write);
495 _clear_bio(ios->per_dev[i].bio);
496 dprintk("%s: start read offset passed end of file "
497 "offset=0x%llx, length=0x%lx\n", __func__,
498 _LLU(ios->per_dev[i].offset),
499 ios->per_dev[i].length);
500
501 continue; /* we recovered */
502 }
503 dev = ios->per_dev[i].dev;
504 objlayout_io_set_result(&ios->ol_state, dev,
505 &ios->layout->comps[dev].oc_object_id,
506 osd_pri_2_pnfs_err(osi.osd_err_pri),
507 ios->per_dev[i].offset,
508 ios->per_dev[i].length,
509 is_write);
510
511 if (osi.osd_err_pri >= oep) {
512 oep = osi.osd_err_pri;
513 lin_ret = ret;
514 }
515 }
516
517 return lin_ret;
518}
519
520/*
521 * Common IO state helpers.
522 */
523static void _io_free(struct objio_state *ios)
524{
525 unsigned i;
526
527 for (i = 0; i < ios->numdevs; i++) {
528 struct _objio_per_comp *per_dev = &ios->per_dev[i];
529
530 if (per_dev->or) {
531 osd_end_request(per_dev->or);
532 per_dev->or = NULL;
533 }
534
535 if (per_dev->bio) {
536 bio_put(per_dev->bio);
537 per_dev->bio = NULL;
538 }
539 }
540}
541
542struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
543{
544 unsigned min_dev = ios->layout->comps_index;
545 unsigned max_dev = min_dev + ios->layout->num_comps;
546
547 BUG_ON(dev < min_dev || max_dev <= dev);
548 return ios->layout->ods[dev - min_dev]->od;
549}
550
551struct _striping_info {
552 u64 obj_offset;
553 u64 group_length;
554 unsigned dev;
555 unsigned unit_off;
556};
557
558static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
559 struct _striping_info *si)
560{
561 u32 stripe_unit = ios->layout->stripe_unit;
562 u32 group_width = ios->layout->group_width;
563 u64 group_depth = ios->layout->group_depth;
564 u32 U = stripe_unit * group_width;
565
566 u64 T = U * group_depth;
567 u64 S = T * ios->layout->group_count;
568 u64 M = div64_u64(file_offset, S);
569
570 /*
571 G = (L - (M * S)) / T
572 H = (L - (M * S)) % T
573 */
574 u64 LmodU = file_offset - M * S;
575 u32 G = div64_u64(LmodU, T);
576 u64 H = LmodU - G * T;
577
578 u32 N = div_u64(H, U);
579
580 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
581 si->obj_offset = si->unit_off + (N * stripe_unit) +
582 (M * group_depth * stripe_unit);
583
584 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
585 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
586 si->dev *= ios->layout->mirrors_p1;
587
588 si->group_length = T - H;
589}
590
591static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
592 unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len,
593 gfp_t gfp_flags)
594{
595 unsigned pg = *cur_pg;
596 struct request_queue *q =
597 osd_request_queue(_io_od(ios, per_dev->dev));
598
599 per_dev->length += cur_len;
600
601 if (per_dev->bio == NULL) {
602 unsigned stripes = ios->layout->num_comps /
603 ios->layout->mirrors_p1;
604 unsigned pages_in_stripe = stripes *
605 (ios->layout->stripe_unit / PAGE_SIZE);
606 unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
607 stripes;
608
609 if (BIO_MAX_PAGES_KMALLOC < bio_size)
610 bio_size = BIO_MAX_PAGES_KMALLOC;
611
612 per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
613 if (unlikely(!per_dev->bio)) {
614 dprintk("Faild to allocate BIO size=%u\n", bio_size);
615 return -ENOMEM;
616 }
617 }
618
619 while (cur_len > 0) {
620 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
621 unsigned added_len;
622
623 BUG_ON(ios->ol_state.nr_pages <= pg);
624 cur_len -= pglen;
625
626 added_len = bio_add_pc_page(q, per_dev->bio,
627 ios->ol_state.pages[pg], pglen, pgbase);
628 if (unlikely(pglen != added_len))
629 return -ENOMEM;
630 pgbase = 0;
631 ++pg;
632 }
633 BUG_ON(cur_len);
634
635 *cur_pg = pg;
636 return 0;
637}
638
639static int _prepare_one_group(struct objio_state *ios, u64 length,
640 struct _striping_info *si, unsigned *last_pg,
641 gfp_t gfp_flags)
642{
643 unsigned stripe_unit = ios->layout->stripe_unit;
644 unsigned mirrors_p1 = ios->layout->mirrors_p1;
645 unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
646 unsigned dev = si->dev;
647 unsigned first_dev = dev - (dev % devs_in_group);
648 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
649 unsigned cur_pg = *last_pg;
650 int ret = 0;
651
652 while (length) {
653 struct _objio_per_comp *per_dev = &ios->per_dev[dev];
654 unsigned cur_len, page_off = 0;
655
656 if (!per_dev->length) {
657 per_dev->dev = dev;
658 if (dev < si->dev) {
659 per_dev->offset = si->obj_offset + stripe_unit -
660 si->unit_off;
661 cur_len = stripe_unit;
662 } else if (dev == si->dev) {
663 per_dev->offset = si->obj_offset;
664 cur_len = stripe_unit - si->unit_off;
665 page_off = si->unit_off & ~PAGE_MASK;
666 BUG_ON(page_off &&
667 (page_off != ios->ol_state.pgbase));
668 } else { /* dev > si->dev */
669 per_dev->offset = si->obj_offset - si->unit_off;
670 cur_len = stripe_unit;
671 }
672
673 if (max_comp < dev)
674 max_comp = dev;
675 } else {
676 cur_len = stripe_unit;
677 }
678 if (cur_len >= length)
679 cur_len = length;
680
681 ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
682 cur_len, gfp_flags);
683 if (unlikely(ret))
684 goto out;
685
686 dev += mirrors_p1;
687 dev = (dev % devs_in_group) + first_dev;
688
689 length -= cur_len;
690 ios->length += cur_len;
691 }
692out:
693 ios->numdevs = max_comp + mirrors_p1;
694 *last_pg = cur_pg;
695 return ret;
696}
697
698static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
699{
700 u64 length = ios->ol_state.count;
701 u64 offset = ios->ol_state.offset;
702 struct _striping_info si;
703 unsigned last_pg = 0;
704 int ret = 0;
705
706 while (length) {
707 _calc_stripe_info(ios, offset, &si);
708
709 if (length < si.group_length)
710 si.group_length = length;
711
712 ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
713 if (unlikely(ret))
714 goto out;
715
716 offset += si.group_length;
717 length -= si.group_length;
718 }
719
720out:
721 if (!ios->length)
722 return ret;
723
724 return 0;
725}
726
727static ssize_t _sync_done(struct objio_state *ios)
728{
729 struct completion *waiting = ios->private;
730
731 complete(waiting);
732 return 0;
733}
734
735static void _last_io(struct kref *kref)
736{
737 struct objio_state *ios = container_of(kref, struct objio_state, kref);
738
739 ios->done(ios);
740}
741
742static void _done_io(struct osd_request *or, void *p)
743{
744 struct objio_state *ios = p;
745
746 kref_put(&ios->kref, _last_io);
747}
748
749static ssize_t _io_exec(struct objio_state *ios)
750{
751 DECLARE_COMPLETION_ONSTACK(wait);
752 ssize_t status = 0; /* sync status */
753 unsigned i;
754 objio_done_fn saved_done_fn = ios->done;
755 bool sync = ios->ol_state.sync;
756
757 if (sync) {
758 ios->done = _sync_done;
759 ios->private = &wait;
760 }
761
762 kref_init(&ios->kref);
763
764 for (i = 0; i < ios->numdevs; i++) {
765 struct osd_request *or = ios->per_dev[i].or;
766
767 if (!or)
768 continue;
769
770 kref_get(&ios->kref);
771 osd_execute_request_async(or, _done_io, ios);
772 }
773
774 kref_put(&ios->kref, _last_io);
775
776 if (sync) {
777 wait_for_completion(&wait);
778 status = saved_done_fn(ios);
779 }
780
781 return status;
782}
783
784/*
785 * read
786 */
787static ssize_t _read_done(struct objio_state *ios)
788{
789 ssize_t status;
790 int ret = _io_check(ios, false);
791
792 _io_free(ios);
793
794 if (likely(!ret))
795 status = ios->length;
796 else
797 status = ret;
798
799 objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
800 return status;
801}
802
803static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
804{
805 struct osd_request *or = NULL;
806 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
807 unsigned dev = per_dev->dev;
808 struct pnfs_osd_object_cred *cred =
809 &ios->layout->comps[dev];
810 struct osd_obj_id obj = {
811 .partition = cred->oc_object_id.oid_partition_id,
812 .id = cred->oc_object_id.oid_object_id,
813 };
814 int ret;
815
816 or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
817 if (unlikely(!or)) {
818 ret = -ENOMEM;
819 goto err;
820 }
821 per_dev->or = or;
822
823 osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
824
825 ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
826 if (ret) {
827 dprintk("%s: Faild to osd_finalize_request() => %d\n",
828 __func__, ret);
829 goto err;
830 }
831
832 dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
833 __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
834 per_dev->length);
835
836err:
837 return ret;
838}
839
840static ssize_t _read_exec(struct objio_state *ios)
841{
842 unsigned i;
843 int ret;
844
845 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
846 if (!ios->per_dev[i].length)
847 continue;
848 ret = _read_mirrors(ios, i);
849 if (unlikely(ret))
850 goto err;
851 }
852
853 ios->done = _read_done;
854 return _io_exec(ios); /* In sync mode exec returns the io status */
855
856err:
857 _io_free(ios);
858 return ret;
859}
860
861ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
862{
863 struct objio_state *ios = container_of(ol_state, struct objio_state,
864 ol_state);
865 int ret;
866
867 ret = _io_rw_pagelist(ios, GFP_KERNEL);
868 if (unlikely(ret))
869 return ret;
870
871 return _read_exec(ios);
872}
873
874/*
875 * write
876 */
877static ssize_t _write_done(struct objio_state *ios)
878{
879 ssize_t status;
880 int ret = _io_check(ios, true);
881
882 _io_free(ios);
883
884 if (likely(!ret)) {
885 /* FIXME: should be based on the OSD's persistence model
886 * See OSD2r05 Section 4.13 Data persistence model */
887 ios->ol_state.committed = NFS_FILE_SYNC;
888 status = ios->length;
889 } else {
890 status = ret;
891 }
892
893 objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
894 return status;
895}
896
897static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
898{
899 struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
900 unsigned dev = ios->per_dev[cur_comp].dev;
901 unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
902 int ret;
903
904 for (; cur_comp < last_comp; ++cur_comp, ++dev) {
905 struct osd_request *or = NULL;
906 struct pnfs_osd_object_cred *cred =
907 &ios->layout->comps[dev];
908 struct osd_obj_id obj = {
909 .partition = cred->oc_object_id.oid_partition_id,
910 .id = cred->oc_object_id.oid_object_id,
911 };
912 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
913 struct bio *bio;
914
915 or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
916 if (unlikely(!or)) {
917 ret = -ENOMEM;
918 goto err;
919 }
920 per_dev->or = or;
921
922 if (per_dev != master_dev) {
923 bio = bio_kmalloc(GFP_NOFS,
924 master_dev->bio->bi_max_vecs);
925 if (unlikely(!bio)) {
926 dprintk("Faild to allocate BIO size=%u\n",
927 master_dev->bio->bi_max_vecs);
928 ret = -ENOMEM;
929 goto err;
930 }
931
932 __bio_clone(bio, master_dev->bio);
933 bio->bi_bdev = NULL;
934 bio->bi_next = NULL;
935 per_dev->bio = bio;
936 per_dev->dev = dev;
937 per_dev->length = master_dev->length;
938 per_dev->offset = master_dev->offset;
939 } else {
940 bio = master_dev->bio;
941 bio->bi_rw |= REQ_WRITE;
942 }
943
944 osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
945
946 ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
947 if (ret) {
948 dprintk("%s: Faild to osd_finalize_request() => %d\n",
949 __func__, ret);
950 goto err;
951 }
952
953 dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
954 __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
955 per_dev->length);
956 }
957
958err:
959 return ret;
960}
961
962static ssize_t _write_exec(struct objio_state *ios)
963{
964 unsigned i;
965 int ret;
966
967 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
968 if (!ios->per_dev[i].length)
969 continue;
970 ret = _write_mirrors(ios, i);
971 if (unlikely(ret))
972 goto err;
973 }
974
975 ios->done = _write_done;
976 return _io_exec(ios); /* In sync mode exec returns the io->status */
977
978err:
979 _io_free(ios);
980 return ret;
981}
982
983ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
984{
985 struct objio_state *ios = container_of(ol_state, struct objio_state,
986 ol_state);
987 int ret;
988
989 /* TODO: ios->stable = stable; */
990 ret = _io_rw_pagelist(ios, GFP_NOFS);
991 if (unlikely(ret))
992 return ret;
993
994 return _write_exec(ios);
995}
996
997static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
998 struct nfs_page *prev, struct nfs_page *req)
999{
1000 if (!pnfs_generic_pg_test(pgio, prev, req))
1001 return false;
1002
1003 if (pgio->pg_lseg == NULL)
1004 return true;
1005
1006 return pgio->pg_count + req->wb_bytes <=
1007 OBJIO_LSEG(pgio->pg_lseg)->max_io_size;
1008}
1009
1010static struct pnfs_layoutdriver_type objlayout_type = {
1011 .id = LAYOUT_OSD2_OBJECTS,
1012 .name = "LAYOUT_OSD2_OBJECTS",
1013 .flags = PNFS_LAYOUTRET_ON_SETATTR,
1014
1015 .alloc_layout_hdr = objlayout_alloc_layout_hdr,
1016 .free_layout_hdr = objlayout_free_layout_hdr,
1017
1018 .alloc_lseg = objlayout_alloc_lseg,
1019 .free_lseg = objlayout_free_lseg,
1020
1021 .read_pagelist = objlayout_read_pagelist,
1022 .write_pagelist = objlayout_write_pagelist,
1023 .pg_test = objio_pg_test,
1024
1025 .free_deviceid_node = objio_free_deviceid_node,
1026
1027 .encode_layoutcommit = objlayout_encode_layoutcommit,
1028 .encode_layoutreturn = objlayout_encode_layoutreturn,
1029};
1030
1031MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
1032MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
1033MODULE_LICENSE("GPL");
1034
1035static int __init
1036objlayout_init(void)
1037{
1038 int ret = pnfs_register_layoutdriver(&objlayout_type);
1039
1040 if (ret)
1041 printk(KERN_INFO
1042 "%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
1043 __func__, ret);
1044 else
1045 printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
1046 __func__);
1047 return ret;
1048}
1049
1050static void __exit
1051objlayout_exit(void)
1052{
1053 pnfs_unregister_layoutdriver(&objlayout_type);
1054 printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
1055 __func__);
1056}
1057
1058module_init(objlayout_init);
1059module_exit(objlayout_exit);
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
new file mode 100644
index 000000000000..1d06f8e2adea
--- /dev/null
+++ b/fs/nfs/objlayout/objlayout.c
@@ -0,0 +1,712 @@
1/*
2 * pNFS Objects layout driver high level definitions
3 *
4 * Copyright (C) 2007 Panasas Inc. [year of first publication]
5 * All rights reserved.
6 *
7 * Benny Halevy <bhalevy@panasas.com>
8 * Boaz Harrosh <bharrosh@panasas.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * See the file COPYING included with this distribution for more details.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 *
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the Panasas company nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
28 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
34 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40#include <scsi/osd_initiator.h>
41#include "objlayout.h"
42
43#define NFSDBG_FACILITY NFSDBG_PNFS_LD
44/*
45 * Create a objlayout layout structure for the given inode and return it.
46 */
47struct pnfs_layout_hdr *
48objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
49{
50 struct objlayout *objlay;
51
52 objlay = kzalloc(sizeof(struct objlayout), gfp_flags);
53 if (objlay) {
54 spin_lock_init(&objlay->lock);
55 INIT_LIST_HEAD(&objlay->err_list);
56 }
57 dprintk("%s: Return %p\n", __func__, objlay);
58 return &objlay->pnfs_layout;
59}
60
61/*
62 * Free an objlayout layout structure
63 */
64void
65objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
66{
67 struct objlayout *objlay = OBJLAYOUT(lo);
68
69 dprintk("%s: objlay %p\n", __func__, objlay);
70
71 WARN_ON(!list_empty(&objlay->err_list));
72 kfree(objlay);
73}
74
75/*
76 * Unmarshall layout and store it in pnfslay.
77 */
78struct pnfs_layout_segment *
79objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay,
80 struct nfs4_layoutget_res *lgr,
81 gfp_t gfp_flags)
82{
83 int status = -ENOMEM;
84 struct xdr_stream stream;
85 struct xdr_buf buf = {
86 .pages = lgr->layoutp->pages,
87 .page_len = lgr->layoutp->len,
88 .buflen = lgr->layoutp->len,
89 .len = lgr->layoutp->len,
90 };
91 struct page *scratch;
92 struct pnfs_layout_segment *lseg;
93
94 dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay);
95
96 scratch = alloc_page(gfp_flags);
97 if (!scratch)
98 goto err_nofree;
99
100 xdr_init_decode(&stream, &buf, NULL);
101 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
102
103 status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags);
104 if (unlikely(status)) {
105 dprintk("%s: objio_alloc_lseg Return err %d\n", __func__,
106 status);
107 goto err;
108 }
109
110 __free_page(scratch);
111
112 dprintk("%s: Return %p\n", __func__, lseg);
113 return lseg;
114
115err:
116 __free_page(scratch);
117err_nofree:
118 dprintk("%s: Err Return=>%d\n", __func__, status);
119 return ERR_PTR(status);
120}
121
/*
 * Free a layout segment.  A NULL @lseg is logged and otherwise ignored.
 */
void
objlayout_free_lseg(struct pnfs_layout_segment *lseg)
{
	dprintk("%s: freeing layout segment %p\n", __func__, lseg);

	if (likely(lseg))
		objio_free_lseg(lseg);
}
135
136/*
137 * I/O Operations
138 */
139static inline u64
140end_offset(u64 start, u64 len)
141{
142 u64 end;
143
144 end = start + len;
145 return end >= start ? end : NFS4_MAX_UINT64;
146}
147
148/* last octet in a range */
149static inline u64
150last_byte_offset(u64 start, u64 len)
151{
152 u64 end;
153
154 BUG_ON(!len);
155 end = start + len;
156 return end > start ? end - 1 : NFS4_MAX_UINT64;
157}
158
159static struct objlayout_io_state *
160objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
161 struct page **pages,
162 unsigned pgbase,
163 loff_t offset,
164 size_t count,
165 struct pnfs_layout_segment *lseg,
166 void *rpcdata,
167 gfp_t gfp_flags)
168{
169 struct objlayout_io_state *state;
170 u64 lseg_end_offset;
171
172 dprintk("%s: allocating io_state\n", __func__);
173 if (objio_alloc_io_state(lseg, &state, gfp_flags))
174 return NULL;
175
176 BUG_ON(offset < lseg->pls_range.offset);
177 lseg_end_offset = end_offset(lseg->pls_range.offset,
178 lseg->pls_range.length);
179 BUG_ON(offset >= lseg_end_offset);
180 if (offset + count > lseg_end_offset) {
181 count = lseg->pls_range.length -
182 (offset - lseg->pls_range.offset);
183 dprintk("%s: truncated count %Zd\n", __func__, count);
184 }
185
186 if (pgbase > PAGE_SIZE) {
187 pages += pgbase >> PAGE_SHIFT;
188 pgbase &= ~PAGE_MASK;
189 }
190
191 INIT_LIST_HEAD(&state->err_list);
192 state->lseg = lseg;
193 state->rpcdata = rpcdata;
194 state->pages = pages;
195 state->pgbase = pgbase;
196 state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
197 state->offset = offset;
198 state->count = count;
199 state->sync = 0;
200
201 return state;
202}
203
/*
 * Release a per-I/O state through the io engine.  NULL is tolerated.
 */
static void
objlayout_free_io_state(struct objlayout_io_state *state)
{
	dprintk("%s: freeing io_state\n", __func__);

	if (likely(state))
		objio_free_io_state(state);
}
213
214/*
215 * I/O done common code
216 */
217static void
218objlayout_iodone(struct objlayout_io_state *state)
219{
220 dprintk("%s: state %p status\n", __func__, state);
221
222 if (likely(state->status >= 0)) {
223 objlayout_free_io_state(state);
224 } else {
225 struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
226
227 spin_lock(&objlay->lock);
228 objlay->delta_space_valid = OBJ_DSU_INVALID;
229 list_add(&objlay->err_list, &state->err_list);
230 spin_unlock(&objlay->lock);
231 }
232}
233
234/*
235 * objlayout_io_set_result - Set an osd_error code on a specific osd comp.
236 *
237 * The @index component IO failed (error returned from target). Register
238 * the error for later reporting at layout-return.
239 */
240void
241objlayout_io_set_result(struct objlayout_io_state *state, unsigned index,
242 struct pnfs_osd_objid *pooid, int osd_error,
243 u64 offset, u64 length, bool is_write)
244{
245 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index];
246
247 BUG_ON(index >= state->num_comps);
248 if (osd_error) {
249 ioerr->oer_component = *pooid;
250 ioerr->oer_comp_offset = offset;
251 ioerr->oer_comp_length = length;
252 ioerr->oer_iswrite = is_write;
253 ioerr->oer_errno = osd_error;
254
255 dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
256 "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
257 __func__, index, ioerr->oer_errno,
258 ioerr->oer_iswrite,
259 _DEVID_LO(&ioerr->oer_component.oid_device_id),
260 _DEVID_HI(&ioerr->oer_component.oid_device_id),
261 ioerr->oer_component.oid_partition_id,
262 ioerr->oer_component.oid_object_id,
263 ioerr->oer_comp_offset,
264 ioerr->oer_comp_length);
265 } else {
266 /* User need not call if no error is reported */
267 ioerr->oer_errno = 0;
268 }
269}
270
271/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
272 * This is because the osd completion is called with ints-off from
273 * the block layer
274 */
275static void _rpc_read_complete(struct work_struct *work)
276{
277 struct rpc_task *task;
278 struct nfs_read_data *rdata;
279
280 dprintk("%s enter\n", __func__);
281 task = container_of(work, struct rpc_task, u.tk_work);
282 rdata = container_of(task, struct nfs_read_data, task);
283
284 pnfs_ld_read_done(rdata);
285}
286
287void
288objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
289{
290 int eof = state->eof;
291 struct nfs_read_data *rdata;
292
293 state->status = status;
294 dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof);
295 rdata = state->rpcdata;
296 rdata->task.tk_status = status;
297 if (status >= 0) {
298 rdata->res.count = status;
299 rdata->res.eof = eof;
300 }
301 objlayout_iodone(state);
302 /* must not use state after this point */
303
304 if (sync)
305 pnfs_ld_read_done(rdata);
306 else {
307 INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
308 schedule_work(&rdata->task.u.tk_work);
309 }
310}
311
312/*
313 * Perform sync or async reads.
314 */
315enum pnfs_try_status
316objlayout_read_pagelist(struct nfs_read_data *rdata)
317{
318 loff_t offset = rdata->args.offset;
319 size_t count = rdata->args.count;
320 struct objlayout_io_state *state;
321 ssize_t status = 0;
322 loff_t eof;
323
324 dprintk("%s: Begin inode %p offset %llu count %d\n",
325 __func__, rdata->inode, offset, (int)count);
326
327 eof = i_size_read(rdata->inode);
328 if (unlikely(offset + count > eof)) {
329 if (offset >= eof) {
330 status = 0;
331 rdata->res.count = 0;
332 rdata->res.eof = 1;
333 goto out;
334 }
335 count = eof - offset;
336 }
337
338 state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
339 rdata->args.pages, rdata->args.pgbase,
340 offset, count,
341 rdata->lseg, rdata,
342 GFP_KERNEL);
343 if (unlikely(!state)) {
344 status = -ENOMEM;
345 goto out;
346 }
347
348 state->eof = state->offset + state->count >= eof;
349
350 status = objio_read_pagelist(state);
351 out:
352 dprintk("%s: Return status %Zd\n", __func__, status);
353 rdata->pnfs_error = status;
354 return PNFS_ATTEMPTED;
355}
356
357/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
358 * This is because the osd completion is called with ints-off from
359 * the block layer
360 */
361static void _rpc_write_complete(struct work_struct *work)
362{
363 struct rpc_task *task;
364 struct nfs_write_data *wdata;
365
366 dprintk("%s enter\n", __func__);
367 task = container_of(work, struct rpc_task, u.tk_work);
368 wdata = container_of(task, struct nfs_write_data, task);
369
370 pnfs_ld_write_done(wdata);
371}
372
373void
374objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
375 bool sync)
376{
377 struct nfs_write_data *wdata;
378
379 dprintk("%s: Begin\n", __func__);
380 wdata = state->rpcdata;
381 state->status = status;
382 wdata->task.tk_status = status;
383 if (status >= 0) {
384 wdata->res.count = status;
385 wdata->verf.committed = state->committed;
386 dprintk("%s: Return status %d committed %d\n",
387 __func__, wdata->task.tk_status,
388 wdata->verf.committed);
389 } else
390 dprintk("%s: Return status %d\n",
391 __func__, wdata->task.tk_status);
392 objlayout_iodone(state);
393 /* must not use state after this point */
394
395 if (sync)
396 pnfs_ld_write_done(wdata);
397 else {
398 INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
399 schedule_work(&wdata->task.u.tk_work);
400 }
401}
402
403/*
404 * Perform sync or async writes.
405 */
406enum pnfs_try_status
407objlayout_write_pagelist(struct nfs_write_data *wdata,
408 int how)
409{
410 struct objlayout_io_state *state;
411 ssize_t status;
412
413 dprintk("%s: Begin inode %p offset %llu count %u\n",
414 __func__, wdata->inode, wdata->args.offset, wdata->args.count);
415
416 state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
417 wdata->args.pages,
418 wdata->args.pgbase,
419 wdata->args.offset,
420 wdata->args.count,
421 wdata->lseg, wdata,
422 GFP_NOFS);
423 if (unlikely(!state)) {
424 status = -ENOMEM;
425 goto out;
426 }
427
428 state->sync = how & FLUSH_SYNC;
429
430 status = objio_write_pagelist(state, how & FLUSH_STABLE);
431 out:
432 dprintk("%s: Return status %Zd\n", __func__, status);
433 wdata->pnfs_error = status;
434 return PNFS_ATTEMPTED;
435}
436
437void
438objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay,
439 struct xdr_stream *xdr,
440 const struct nfs4_layoutcommit_args *args)
441{
442 struct objlayout *objlay = OBJLAYOUT(pnfslay);
443 struct pnfs_osd_layoutupdate lou;
444 __be32 *start;
445
446 dprintk("%s: Begin\n", __func__);
447
448 spin_lock(&objlay->lock);
449 lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID);
450 lou.dsu_delta = objlay->delta_space_used;
451 objlay->delta_space_used = 0;
452 objlay->delta_space_valid = OBJ_DSU_INIT;
453 lou.olu_ioerr_flag = !list_empty(&objlay->err_list);
454 spin_unlock(&objlay->lock);
455
456 start = xdr_reserve_space(xdr, 4);
457
458 BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou));
459
460 *start = cpu_to_be32((xdr->p - start - 1) * 4);
461
462 dprintk("%s: Return delta_space_used %lld err %d\n", __func__,
463 lou.dsu_delta, lou.olu_ioerr_flag);
464}
465
466static int
467err_prio(u32 oer_errno)
468{
469 switch (oer_errno) {
470 case 0:
471 return 0;
472
473 case PNFS_OSD_ERR_RESOURCE:
474 return OSD_ERR_PRI_RESOURCE;
475 case PNFS_OSD_ERR_BAD_CRED:
476 return OSD_ERR_PRI_BAD_CRED;
477 case PNFS_OSD_ERR_NO_ACCESS:
478 return OSD_ERR_PRI_NO_ACCESS;
479 case PNFS_OSD_ERR_UNREACHABLE:
480 return OSD_ERR_PRI_UNREACHABLE;
481 case PNFS_OSD_ERR_NOT_FOUND:
482 return OSD_ERR_PRI_NOT_FOUND;
483 case PNFS_OSD_ERR_NO_SPACE:
484 return OSD_ERR_PRI_NO_SPACE;
485 default:
486 WARN_ON(1);
487 /* fallthrough */
488 case PNFS_OSD_ERR_EIO:
489 return OSD_ERR_PRI_EIO;
490 }
491}
492
493static void
494merge_ioerr(struct pnfs_osd_ioerr *dest_err,
495 const struct pnfs_osd_ioerr *src_err)
496{
497 u64 dest_end, src_end;
498
499 if (!dest_err->oer_errno) {
500 *dest_err = *src_err;
501 /* accumulated device must be blank */
502 memset(&dest_err->oer_component.oid_device_id, 0,
503 sizeof(dest_err->oer_component.oid_device_id));
504
505 return;
506 }
507
508 if (dest_err->oer_component.oid_partition_id !=
509 src_err->oer_component.oid_partition_id)
510 dest_err->oer_component.oid_partition_id = 0;
511
512 if (dest_err->oer_component.oid_object_id !=
513 src_err->oer_component.oid_object_id)
514 dest_err->oer_component.oid_object_id = 0;
515
516 if (dest_err->oer_comp_offset > src_err->oer_comp_offset)
517 dest_err->oer_comp_offset = src_err->oer_comp_offset;
518
519 dest_end = end_offset(dest_err->oer_comp_offset,
520 dest_err->oer_comp_length);
521 src_end = end_offset(src_err->oer_comp_offset,
522 src_err->oer_comp_length);
523 if (dest_end < src_end)
524 dest_end = src_end;
525
526 dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset;
527
528 if ((src_err->oer_iswrite == dest_err->oer_iswrite) &&
529 (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) {
530 dest_err->oer_errno = src_err->oer_errno;
531 } else if (src_err->oer_iswrite) {
532 dest_err->oer_iswrite = true;
533 dest_err->oer_errno = src_err->oer_errno;
534 }
535}
536
537static void
538encode_accumulated_error(struct objlayout *objlay, __be32 *p)
539{
540 struct objlayout_io_state *state, *tmp;
541 struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
542
543 list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
544 unsigned i;
545
546 for (i = 0; i < state->num_comps; i++) {
547 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
548
549 if (!ioerr->oer_errno)
550 continue;
551
552 printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d "
553 "dev(%llx:%llx) par=0x%llx obj=0x%llx "
554 "offset=0x%llx length=0x%llx\n",
555 __func__, i, ioerr->oer_errno,
556 ioerr->oer_iswrite,
557 _DEVID_LO(&ioerr->oer_component.oid_device_id),
558 _DEVID_HI(&ioerr->oer_component.oid_device_id),
559 ioerr->oer_component.oid_partition_id,
560 ioerr->oer_component.oid_object_id,
561 ioerr->oer_comp_offset,
562 ioerr->oer_comp_length);
563
564 merge_ioerr(&accumulated_err, ioerr);
565 }
566 list_del(&state->err_list);
567 objlayout_free_io_state(state);
568 }
569
570 pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
571}
572
573void
574objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
575 struct xdr_stream *xdr,
576 const struct nfs4_layoutreturn_args *args)
577{
578 struct objlayout *objlay = OBJLAYOUT(pnfslay);
579 struct objlayout_io_state *state, *tmp;
580 __be32 *start;
581
582 dprintk("%s: Begin\n", __func__);
583 start = xdr_reserve_space(xdr, 4);
584 BUG_ON(!start);
585
586 spin_lock(&objlay->lock);
587
588 list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
589 __be32 *last_xdr = NULL, *p;
590 unsigned i;
591 int res = 0;
592
593 for (i = 0; i < state->num_comps; i++) {
594 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
595
596 if (!ioerr->oer_errno)
597 continue;
598
599 dprintk("%s: err[%d]: errno=%d is_write=%d "
600 "dev(%llx:%llx) par=0x%llx obj=0x%llx "
601 "offset=0x%llx length=0x%llx\n",
602 __func__, i, ioerr->oer_errno,
603 ioerr->oer_iswrite,
604 _DEVID_LO(&ioerr->oer_component.oid_device_id),
605 _DEVID_HI(&ioerr->oer_component.oid_device_id),
606 ioerr->oer_component.oid_partition_id,
607 ioerr->oer_component.oid_object_id,
608 ioerr->oer_comp_offset,
609 ioerr->oer_comp_length);
610
611 p = pnfs_osd_xdr_ioerr_reserve_space(xdr);
612 if (unlikely(!p)) {
613 res = -E2BIG;
614 break; /* accumulated_error */
615 }
616
617 last_xdr = p;
618 pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]);
619 }
620
621 /* TODO: use xdr_write_pages */
622 if (unlikely(res)) {
623 /* no space for even one error descriptor */
624 BUG_ON(!last_xdr);
625
626 /* we've encountered a situation with lots and lots of
627 * errors and no space to encode them all. Use the last
628 * available slot to report the union of all the
629 * remaining errors.
630 */
631 encode_accumulated_error(objlay, last_xdr);
632 goto loop_done;
633 }
634 list_del(&state->err_list);
635 objlayout_free_io_state(state);
636 }
637loop_done:
638 spin_unlock(&objlay->lock);
639
640 *start = cpu_to_be32((xdr->p - start - 1) * 4);
641 dprintk("%s: Return\n", __func__);
642}
643
644
645/*
646 * Get Device Info API for io engines
647 */
648struct objlayout_deviceinfo {
649 struct page *page;
650 struct pnfs_osd_deviceaddr da; /* This must be last */
651};
652
653/* Initialize and call nfs_getdeviceinfo, then decode and return a
654 * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
655 * should be called.
656 */
657int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
658 struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
659 gfp_t gfp_flags)
660{
661 struct objlayout_deviceinfo *odi;
662 struct pnfs_device pd;
663 struct super_block *sb;
664 struct page *page, **pages;
665 u32 *p;
666 int err;
667
668 page = alloc_page(gfp_flags);
669 if (!page)
670 return -ENOMEM;
671
672 pages = &page;
673 pd.pages = pages;
674
675 memcpy(&pd.dev_id, d_id, sizeof(*d_id));
676 pd.layout_type = LAYOUT_OSD2_OBJECTS;
677 pd.pages = &page;
678 pd.pgbase = 0;
679 pd.pglen = PAGE_SIZE;
680 pd.mincount = 0;
681
682 sb = pnfslay->plh_inode->i_sb;
683 err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd);
684 dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
685 if (err)
686 goto err_out;
687
688 p = page_address(page);
689 odi = kzalloc(sizeof(*odi), gfp_flags);
690 if (!odi) {
691 err = -ENOMEM;
692 goto err_out;
693 }
694 pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
695 odi->page = page;
696 *deviceaddr = &odi->da;
697 return 0;
698
699err_out:
700 __free_page(page);
701 return err;
702}
703
704void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
705{
706 struct objlayout_deviceinfo *odi = container_of(deviceaddr,
707 struct objlayout_deviceinfo,
708 da);
709
710 __free_page(odi->page);
711 kfree(odi);
712}
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
new file mode 100644
index 000000000000..a8244c8e042d
--- /dev/null
+++ b/fs/nfs/objlayout/objlayout.h
@@ -0,0 +1,187 @@
1/*
2 * Data types and function declarations for interfacing with the
3 * pNFS standard object layout driver.
4 *
5 * Copyright (C) 2007 Panasas Inc. [year of first publication]
6 * All rights reserved.
7 *
8 * Benny Halevy <bhalevy@panasas.com>
9 * Boaz Harrosh <bharrosh@panasas.com>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2
13 * See the file COPYING included with this distribution for more details.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 *
19 * 1. Redistributions of source code must retain the above copyright
20 * notice, this list of conditions and the following disclaimer.
21 * 2. Redistributions in binary form must reproduce the above copyright
22 * notice, this list of conditions and the following disclaimer in the
23 * documentation and/or other materials provided with the distribution.
24 * 3. Neither the name of the Panasas company nor the names of its
25 * contributors may be used to endorse or promote products derived
26 * from this software without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
29 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
30 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
31 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
36 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
37 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
38 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
39 */
40
41#ifndef _OBJLAYOUT_H
42#define _OBJLAYOUT_H
43
44#include <linux/nfs_fs.h>
45#include <linux/pnfs_osd_xdr.h>
46#include "../pnfs.h"
47
48/*
49 * per-inode layout
50 */
51struct objlayout {
52 struct pnfs_layout_hdr pnfs_layout;
53
54 /* for layout_commit */
55 enum osd_delta_space_valid_enum {
56 OBJ_DSU_INIT = 0,
57 OBJ_DSU_VALID,
58 OBJ_DSU_INVALID,
59 } delta_space_valid;
60 s64 delta_space_used; /* consumed by write ops */
61
62 /* for layout_return */
63 spinlock_t lock;
64 struct list_head err_list;
65};
66
67static inline struct objlayout *
68OBJLAYOUT(struct pnfs_layout_hdr *lo)
69{
70 return container_of(lo, struct objlayout, pnfs_layout);
71}
72
73/*
74 * per-I/O operation state
75 * embedded in objects provider io_state data structure
76 */
77struct objlayout_io_state {
78 struct pnfs_layout_segment *lseg;
79
80 struct page **pages;
81 unsigned pgbase;
82 unsigned nr_pages;
83 unsigned long count;
84 loff_t offset;
85 bool sync;
86
87 void *rpcdata;
88 int status; /* res */
89 int eof; /* res */
90 int committed; /* res */
91
92 /* Error reporting (layout_return) */
93 struct list_head err_list;
94 unsigned num_comps;
95 /* Pointer to array of error descriptors of size num_comps.
96 * It should contain as many entries as devices in the osd_layout
97 * that participate in the I/O. It is up to the io_engine to allocate
98 * needed space and set num_comps.
99 */
100 struct pnfs_osd_ioerr *ioerrs;
101};
102
103/*
104 * Raid engine I/O API
105 */
106extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
107 struct pnfs_layout_hdr *pnfslay,
108 struct pnfs_layout_range *range,
109 struct xdr_stream *xdr,
110 gfp_t gfp_flags);
111extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
112
113extern int objio_alloc_io_state(
114 struct pnfs_layout_segment *lseg,
115 struct objlayout_io_state **outp,
116 gfp_t gfp_flags);
117extern void objio_free_io_state(struct objlayout_io_state *state);
118
119extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
120extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
121 bool stable);
122
123/*
124 * callback API
125 */
126extern void objlayout_io_set_result(struct objlayout_io_state *state,
127 unsigned index, struct pnfs_osd_objid *pooid,
128 int osd_error, u64 offset, u64 length, bool is_write);
129
130static inline void
131objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
132{
133 struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
134
135 /* If one of the I/Os errored out and the delta_space_used was
136 * invalid we render the complete report as invalid. Protocol mandate
137 * the DSU be accurate or not reported.
138 */
139 spin_lock(&objlay->lock);
140 if (objlay->delta_space_valid != OBJ_DSU_INVALID) {
141 objlay->delta_space_valid = OBJ_DSU_VALID;
142 objlay->delta_space_used += space_used;
143 }
144 spin_unlock(&objlay->lock);
145}
146
147extern void objlayout_read_done(struct objlayout_io_state *state,
148 ssize_t status, bool sync);
149extern void objlayout_write_done(struct objlayout_io_state *state,
150 ssize_t status, bool sync);
151
152extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
153 struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
154 gfp_t gfp_flags);
155extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
156
157/*
158 * exported generic objects function vectors
159 */
160
161extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags);
162extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *);
163
164extern struct pnfs_layout_segment *objlayout_alloc_lseg(
165 struct pnfs_layout_hdr *,
166 struct nfs4_layoutget_res *,
167 gfp_t gfp_flags);
168extern void objlayout_free_lseg(struct pnfs_layout_segment *);
169
170extern enum pnfs_try_status objlayout_read_pagelist(
171 struct nfs_read_data *);
172
173extern enum pnfs_try_status objlayout_write_pagelist(
174 struct nfs_write_data *,
175 int how);
176
177extern void objlayout_encode_layoutcommit(
178 struct pnfs_layout_hdr *,
179 struct xdr_stream *,
180 const struct nfs4_layoutcommit_args *);
181
182extern void objlayout_encode_layoutreturn(
183 struct pnfs_layout_hdr *,
184 struct xdr_stream *,
185 const struct nfs4_layoutreturn_args *);
186
187#endif /* _OBJLAYOUT_H */
diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
new file mode 100644
index 000000000000..16fc758e9123
--- /dev/null
+++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
@@ -0,0 +1,412 @@
1/*
2 * Object-Based pNFS Layout XDR layer
3 *
4 * Copyright (C) 2007 Panasas Inc. [year of first publication]
5 * All rights reserved.
6 *
7 * Benny Halevy <bhalevy@panasas.com>
8 * Boaz Harrosh <bharrosh@panasas.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * See the file COPYING included with this distribution for more details.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 *
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the Panasas company nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
28 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
34 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40#include <linux/pnfs_osd_xdr.h>
41
42#define NFSDBG_FACILITY NFSDBG_PNFS_LD
43
44/*
45 * The following implementation is based on RFC5664
46 */
47
48/*
49 * struct pnfs_osd_objid {
50 * struct nfs4_deviceid oid_device_id;
51 * u64 oid_partition_id;
52 * u64 oid_object_id;
53 * }; // xdr size 32 bytes
54 */
55static __be32 *
56_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid)
57{
58 p = xdr_decode_opaque_fixed(p, objid->oid_device_id.data,
59 sizeof(objid->oid_device_id.data));
60
61 p = xdr_decode_hyper(p, &objid->oid_partition_id);
62 p = xdr_decode_hyper(p, &objid->oid_object_id);
63 return p;
64}
65/*
66 * struct pnfs_osd_opaque_cred {
67 * u32 cred_len;
68 * void *cred;
69 * }; // xdr size [variable]
70 * The return pointers are from the xdr buffer
71 */
72static int
73_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred,
74 struct xdr_stream *xdr)
75{
76 __be32 *p = xdr_inline_decode(xdr, 1);
77
78 if (!p)
79 return -EINVAL;
80
81 opaque_cred->cred_len = be32_to_cpu(*p++);
82
83 p = xdr_inline_decode(xdr, opaque_cred->cred_len);
84 if (!p)
85 return -EINVAL;
86
87 opaque_cred->cred = p;
88 return 0;
89}
90
91/*
92 * struct pnfs_osd_object_cred {
93 * struct pnfs_osd_objid oc_object_id;
94 * u32 oc_osd_version;
95 * u32 oc_cap_key_sec;
96 * struct pnfs_osd_opaque_cred oc_cap_key
97 * struct pnfs_osd_opaque_cred oc_cap;
98 * }; // xdr size 32 + 4 + 4 + [variable] + [variable]
99 */
100static int
101_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp,
102 struct xdr_stream *xdr)
103{
104 __be32 *p = xdr_inline_decode(xdr, 32 + 4 + 4);
105 int ret;
106
107 if (!p)
108 return -EIO;
109
110 p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
111 comp->oc_osd_version = be32_to_cpup(p++);
112 comp->oc_cap_key_sec = be32_to_cpup(p);
113
114 ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr);
115 if (unlikely(ret))
116 return ret;
117
118 ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr);
119 return ret;
120}
121
/*
 * struct pnfs_osd_data_map {
 *	u32	odm_num_comps;
 *	u64	odm_stripe_unit;
 *	u32	odm_group_width;
 *	u32	odm_group_depth;
 *	u32	odm_mirror_cnt;
 *	u32	odm_raid_algorithm;
 * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4
 */

/* Encoded size, in bytes, of a pnfs_osd_data_map: five u32 and one u64. */
static inline int
_osd_data_map_xdr_sz(void)
{
	enum {
		U32_FIELDS_SZ = 5 * 4,	/* num_comps, group_width, group_depth,
					 * mirror_cnt, raid_algorithm
					 */
		U64_FIELDS_SZ = 1 * 8,	/* stripe_unit */
	};

	return U32_FIELDS_SZ + U64_FIELDS_SZ;
}
137
138static __be32 *
139_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map)
140{
141 data_map->odm_num_comps = be32_to_cpup(p++);
142 p = xdr_decode_hyper(p, &data_map->odm_stripe_unit);
143 data_map->odm_group_width = be32_to_cpup(p++);
144 data_map->odm_group_depth = be32_to_cpup(p++);
145 data_map->odm_mirror_cnt = be32_to_cpup(p++);
146 data_map->odm_raid_algorithm = be32_to_cpup(p++);
147 dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u "
148 "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n",
149 __func__,
150 data_map->odm_num_comps,
151 (unsigned long long)data_map->odm_stripe_unit,
152 data_map->odm_group_width,
153 data_map->odm_group_depth,
154 data_map->odm_mirror_cnt,
155 data_map->odm_raid_algorithm);
156 return p;
157}
158
159int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
160 struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr)
161{
162 __be32 *p;
163
164 memset(iter, 0, sizeof(*iter));
165
166 p = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4);
167 if (unlikely(!p))
168 return -EINVAL;
169
170 p = _osd_xdr_decode_data_map(p, &layout->olo_map);
171 layout->olo_comps_index = be32_to_cpup(p++);
172 layout->olo_num_comps = be32_to_cpup(p++);
173 iter->total_comps = layout->olo_num_comps;
174 return 0;
175}
176
177bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp,
178 struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr,
179 int *err)
180{
181 BUG_ON(iter->decoded_comps > iter->total_comps);
182 if (iter->decoded_comps == iter->total_comps)
183 return false;
184
185 *err = _osd_xdr_decode_object_cred(comp, xdr);
186 if (unlikely(*err)) {
187 dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d "
188 "total_comps=%d\n", __func__, *err,
189 iter->decoded_comps, iter->total_comps);
190 return false; /* stop the loop */
191 }
192 dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx "
193 "key_len=%u cap_len=%u\n",
194 __func__,
195 _DEVID_LO(&comp->oc_object_id.oid_device_id),
196 _DEVID_HI(&comp->oc_object_id.oid_device_id),
197 comp->oc_object_id.oid_partition_id,
198 comp->oc_object_id.oid_object_id,
199 comp->oc_cap_key.cred_len, comp->oc_cap.cred_len);
200
201 iter->decoded_comps++;
202 return true;
203}
204
/*
 * Get Device Information Decoding
 *
 * Note: since Device Information is currently decoded synchronously, all
 * variable-length string fields are left inside the rpc buffer and are only
 * pointed to by the pnfs_osd_deviceaddr members. So the read buffer
 * must not be freed while the returned information is in use.
 */
213/*
214 *struct nfs4_string {
215 * unsigned int len;
216 * char *data;
217 *}; // size [variable]
218 * NOTE: Returned string points to inside the XDR buffer
219 */
220static __be32 *
221__read_u8_opaque(__be32 *p, struct nfs4_string *str)
222{
223 str->len = be32_to_cpup(p++);
224 str->data = (char *)p;
225
226 p += XDR_QUADLEN(str->len);
227 return p;
228}
229
230/*
231 * struct pnfs_osd_targetid {
232 * u32 oti_type;
233 * struct nfs4_string oti_scsi_device_id;
234 * };// size 4 + [variable]
235 */
236static __be32 *
237__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid)
238{
239 u32 oti_type;
240
241 oti_type = be32_to_cpup(p++);
242 targetid->oti_type = oti_type;
243
244 switch (oti_type) {
245 case OBJ_TARGET_SCSI_NAME:
246 case OBJ_TARGET_SCSI_DEVICE_ID:
247 p = __read_u8_opaque(p, &targetid->oti_scsi_device_id);
248 }
249
250 return p;
251}
252
253/*
254 * struct pnfs_osd_net_addr {
255 * struct nfs4_string r_netid;
256 * struct nfs4_string r_addr;
257 * };
258 */
259static __be32 *
260__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr)
261{
262 p = __read_u8_opaque(p, &netaddr->r_netid);
263 p = __read_u8_opaque(p, &netaddr->r_addr);
264
265 return p;
266}
267
268/*
269 * struct pnfs_osd_targetaddr {
270 * u32 ota_available;
271 * struct pnfs_osd_net_addr ota_netaddr;
272 * };
273 */
274static __be32 *
275__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr)
276{
277 u32 ota_available;
278
279 ota_available = be32_to_cpup(p++);
280 targetaddr->ota_available = ota_available;
281
282 if (ota_available)
283 p = __read_net_addr(p, &targetaddr->ota_netaddr);
284
285
286 return p;
287}
288
289/*
290 * struct pnfs_osd_deviceaddr {
291 * struct pnfs_osd_targetid oda_targetid;
292 * struct pnfs_osd_targetaddr oda_targetaddr;
293 * u8 oda_lun[8];
294 * struct nfs4_string oda_systemid;
295 * struct pnfs_osd_object_cred oda_root_obj_cred;
296 * struct nfs4_string oda_osdname;
297 * };
298 */
299
300/* We need this version for the pnfs_osd_xdr_decode_deviceaddr which does
301 * not have an xdr_stream
302 */
303static __be32 *
304__read_opaque_cred(__be32 *p,
305 struct pnfs_osd_opaque_cred *opaque_cred)
306{
307 opaque_cred->cred_len = be32_to_cpu(*p++);
308 opaque_cred->cred = p;
309 return p + XDR_QUADLEN(opaque_cred->cred_len);
310}
311
312static __be32 *
313__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp)
314{
315 p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
316 comp->oc_osd_version = be32_to_cpup(p++);
317 comp->oc_cap_key_sec = be32_to_cpup(p++);
318
319 p = __read_opaque_cred(p, &comp->oc_cap_key);
320 p = __read_opaque_cred(p, &comp->oc_cap);
321 return p;
322}
323
324void pnfs_osd_xdr_decode_deviceaddr(
325 struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p)
326{
327 p = __read_targetid(p, &deviceaddr->oda_targetid);
328
329 p = __read_targetaddr(p, &deviceaddr->oda_targetaddr);
330
331 p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun,
332 sizeof(deviceaddr->oda_lun));
333
334 p = __read_u8_opaque(p, &deviceaddr->oda_systemid);
335
336 p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred);
337
338 p = __read_u8_opaque(p, &deviceaddr->oda_osdname);
339
340 /* libosd likes this terminated in dbg. It's last, so no problems */
341 deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0;
342}
343
344/*
345 * struct pnfs_osd_layoutupdate {
346 * u32 dsu_valid;
347 * s64 dsu_delta;
348 * u32 olu_ioerr_flag;
349 * }; xdr size 4 + 8 + 4
350 */
351int
352pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
353 struct pnfs_osd_layoutupdate *lou)
354{
355 __be32 *p = xdr_reserve_space(xdr, 4 + 8 + 4);
356
357 if (!p)
358 return -E2BIG;
359
360 *p++ = cpu_to_be32(lou->dsu_valid);
361 if (lou->dsu_valid)
362 p = xdr_encode_hyper(p, lou->dsu_delta);
363 *p++ = cpu_to_be32(lou->olu_ioerr_flag);
364 return 0;
365}
366
367/*
368 * struct pnfs_osd_objid {
369 * struct nfs4_deviceid oid_device_id;
370 * u64 oid_partition_id;
371 * u64 oid_object_id;
372 * }; // xdr size 32 bytes
373 */
374static inline __be32 *
375pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id)
376{
377 p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data,
378 sizeof(object_id->oid_device_id.data));
379 p = xdr_encode_hyper(p, object_id->oid_partition_id);
380 p = xdr_encode_hyper(p, object_id->oid_object_id);
381
382 return p;
383}
384
385/*
386 * struct pnfs_osd_ioerr {
387 * struct pnfs_osd_objid oer_component;
388 * u64 oer_comp_offset;
389 * u64 oer_comp_length;
390 * u32 oer_iswrite;
391 * u32 oer_errno;
392 * }; // xdr size 32 + 24 bytes
393 */
394void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr)
395{
396 p = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component);
397 p = xdr_encode_hyper(p, ioerr->oer_comp_offset);
398 p = xdr_encode_hyper(p, ioerr->oer_comp_length);
399 *p++ = cpu_to_be32(ioerr->oer_iswrite);
400 *p = cpu_to_be32(ioerr->oer_errno);
401}
402
403__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr)
404{
405 __be32 *p;
406
407 p = xdr_reserve_space(xdr, 32 + 24);
408 if (unlikely(!p))
409 dprintk("%s: out of xdr space\n", __func__);
410
411 return p;
412}