aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/nfs/Kconfig10
-rw-r--r--fs/nfs/Makefile4
-rw-r--r--fs/nfs/callback.h17
-rw-r--r--fs/nfs/callback_proc.c51
-rw-r--r--fs/nfs/callback_xdr.c96
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/dir.c9
-rw-r--r--fs/nfs/inode.c3
-rw-r--r--fs/nfs/internal.h1
-rw-r--r--fs/nfs/nfs4filelayout.c38
-rw-r--r--fs/nfs/nfs4filelayout.h8
-rw-r--r--fs/nfs/nfs4filelayoutdev.c119
-rw-r--r--fs/nfs/nfs4proc.c98
-rw-r--r--fs/nfs/nfs4xdr.c132
-rw-r--r--fs/nfs/objlayout/Kbuild5
-rw-r--r--fs/nfs/objlayout/objio_osd.c1057
-rw-r--r--fs/nfs/objlayout/objlayout.c712
-rw-r--r--fs/nfs/objlayout/objlayout.h187
-rw-r--r--fs/nfs/objlayout/pnfs_osd_xdr_cli.c412
-rw-r--r--fs/nfs/pagelist.c62
-rw-r--r--fs/nfs/pnfs.c340
-rw-r--r--fs/nfs/pnfs.h117
-rw-r--r--fs/nfs/pnfs_dev.c270
-rw-r--r--fs/nfs/read.c9
-rw-r--r--fs/nfs/super.c25
-rw-r--r--fs/nfs/write.c10
-rw-r--r--include/linux/nfs4.h1
-rw-r--r--include/linux/nfs_page.h2
-rw-r--r--include/linux/nfs_xdr.h23
-rw-r--r--include/linux/pnfs_osd_xdr.h345
-rw-r--r--include/linux/sunrpc/xdr.h2
-rw-r--r--net/sunrpc/xdr.c19
32 files changed, 3907 insertions, 279 deletions
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index ba306658a6db..81515545ba75 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -87,6 +87,16 @@ config NFS_V4_1
87config PNFS_FILE_LAYOUT 87config PNFS_FILE_LAYOUT
88 tristate 88 tristate
89 89
90config PNFS_OBJLAYOUT
91 tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
92 depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
93 help
94 Say M here if you want your pNFS client to support the Objects Layout Driver.
95 Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and
96 upper level driver (SCSI_OSD_ULD).
97
98 If unsure, say N.
99
90config ROOT_NFS 100config ROOT_NFS
91 bool "Root file system on NFS" 101 bool "Root file system on NFS"
92 depends on NFS_FS=y && IP_PNP 102 depends on NFS_FS=y && IP_PNP
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 4776ff9e3814..6a34f7dd0e6f 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,9 +15,11 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
15 delegation.o idmap.o \ 15 delegation.o idmap.o \
16 callback.o callback_xdr.o callback_proc.o \ 16 callback.o callback_xdr.o callback_proc.o \
17 nfs4namespace.o 17 nfs4namespace.o
18nfs-$(CONFIG_NFS_V4_1) += pnfs.o 18nfs-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o
19nfs-$(CONFIG_SYSCTL) += sysctl.o 19nfs-$(CONFIG_SYSCTL) += sysctl.o
20nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o 20nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
21 21
22obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o 22obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
23nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o 23nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
24
25obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 46d93ce7311b..b257383bb565 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -167,6 +167,23 @@ extern unsigned nfs4_callback_layoutrecall(
167 167
168extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); 168extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
169extern void nfs4_cb_take_slot(struct nfs_client *clp); 169extern void nfs4_cb_take_slot(struct nfs_client *clp);
170
171struct cb_devicenotifyitem {
172 uint32_t cbd_notify_type;
173 uint32_t cbd_layout_type;
174 struct nfs4_deviceid cbd_dev_id;
175 uint32_t cbd_immediate;
176};
177
178struct cb_devicenotifyargs {
179 int ndevs;
180 struct cb_devicenotifyitem *devs;
181};
182
183extern __be32 nfs4_callback_devicenotify(
184 struct cb_devicenotifyargs *args,
185 void *dummy, struct cb_process_state *cps);
186
170#endif /* CONFIG_NFS_V4_1 */ 187#endif /* CONFIG_NFS_V4_1 */
171extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *); 188extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
172extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, 189extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 2f41dccea18e..d4d1954e9bb9 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -139,7 +139,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
139 spin_lock(&ino->i_lock); 139 spin_lock(&ino->i_lock);
140 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 140 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
141 mark_matching_lsegs_invalid(lo, &free_me_list, 141 mark_matching_lsegs_invalid(lo, &free_me_list,
142 args->cbl_range.iomode)) 142 &args->cbl_range))
143 rv = NFS4ERR_DELAY; 143 rv = NFS4ERR_DELAY;
144 else 144 else
145 rv = NFS4ERR_NOMATCHING_LAYOUT; 145 rv = NFS4ERR_NOMATCHING_LAYOUT;
@@ -184,7 +184,7 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
184 ino = lo->plh_inode; 184 ino = lo->plh_inode;
185 spin_lock(&ino->i_lock); 185 spin_lock(&ino->i_lock);
186 set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); 186 set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
187 if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode)) 187 if (mark_matching_lsegs_invalid(lo, &free_me_list, &range))
188 rv = NFS4ERR_DELAY; 188 rv = NFS4ERR_DELAY;
189 list_del_init(&lo->plh_bulk_recall); 189 list_del_init(&lo->plh_bulk_recall);
190 spin_unlock(&ino->i_lock); 190 spin_unlock(&ino->i_lock);
@@ -241,6 +241,53 @@ static void pnfs_recall_all_layouts(struct nfs_client *clp)
241 do_callback_layoutrecall(clp, &args); 241 do_callback_layoutrecall(clp, &args);
242} 242}
243 243
244__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
245 void *dummy, struct cb_process_state *cps)
246{
247 int i;
248 __be32 res = 0;
249 struct nfs_client *clp = cps->clp;
250 struct nfs_server *server = NULL;
251
252 dprintk("%s: -->\n", __func__);
253
254 if (!clp) {
255 res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
256 goto out;
257 }
258
259 for (i = 0; i < args->ndevs; i++) {
260 struct cb_devicenotifyitem *dev = &args->devs[i];
261
262 if (!server ||
263 server->pnfs_curr_ld->id != dev->cbd_layout_type) {
264 rcu_read_lock();
265 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
266 if (server->pnfs_curr_ld &&
267 server->pnfs_curr_ld->id == dev->cbd_layout_type) {
268 rcu_read_unlock();
269 goto found;
270 }
271 rcu_read_unlock();
272 dprintk("%s: layout type %u not found\n",
273 __func__, dev->cbd_layout_type);
274 continue;
275 }
276
277 found:
278 if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
279 dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
280 "deleting instead\n", __func__);
281 nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
282 }
283
284out:
285 kfree(args->devs);
286 dprintk("%s: exit with status = %u\n",
287 __func__, be32_to_cpu(res));
288 return res;
289}
290
244int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) 291int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
245{ 292{
246 if (delegation == NULL) 293 if (delegation == NULL)
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 00ecf62ce7c1..c6c86a77e043 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -25,6 +25,7 @@
25 25
26#if defined(CONFIG_NFS_V4_1) 26#if defined(CONFIG_NFS_V4_1)
27#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) 27#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
28#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
28#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ 29#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
29 4 + 1 + 3) 30 4 + 1 + 3)
30#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) 31#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
@@ -284,6 +285,93 @@ out:
284 return status; 285 return status;
285} 286}
286 287
288static
289__be32 decode_devicenotify_args(struct svc_rqst *rqstp,
290 struct xdr_stream *xdr,
291 struct cb_devicenotifyargs *args)
292{
293 __be32 *p;
294 __be32 status = 0;
295 u32 tmp;
296 int n, i;
297 args->ndevs = 0;
298
299 /* Num of device notifications */
300 p = read_buf(xdr, sizeof(uint32_t));
301 if (unlikely(p == NULL)) {
302 status = htonl(NFS4ERR_BADXDR);
303 goto out;
304 }
305 n = ntohl(*p++);
306 if (n <= 0)
307 goto out;
308
309 args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL);
310 if (!args->devs) {
311 status = htonl(NFS4ERR_DELAY);
312 goto out;
313 }
314
315 /* Decode each dev notification */
316 for (i = 0; i < n; i++) {
317 struct cb_devicenotifyitem *dev = &args->devs[i];
318
319 p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE);
320 if (unlikely(p == NULL)) {
321 status = htonl(NFS4ERR_BADXDR);
322 goto err;
323 }
324
325 tmp = ntohl(*p++); /* bitmap size */
326 if (tmp != 1) {
327 status = htonl(NFS4ERR_INVAL);
328 goto err;
329 }
330 dev->cbd_notify_type = ntohl(*p++);
331 if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
332 dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) {
333 status = htonl(NFS4ERR_INVAL);
334 goto err;
335 }
336
337 tmp = ntohl(*p++); /* opaque size */
338 if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) &&
339 (tmp != NFS4_DEVICEID4_SIZE + 8)) ||
340 ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) &&
341 (tmp != NFS4_DEVICEID4_SIZE + 4))) {
342 status = htonl(NFS4ERR_INVAL);
343 goto err;
344 }
345 dev->cbd_layout_type = ntohl(*p++);
346 memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE);
347 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
348
349 if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) {
350 p = read_buf(xdr, sizeof(uint32_t));
351 if (unlikely(p == NULL)) {
352 status = htonl(NFS4ERR_BADXDR);
353 goto err;
354 }
355 dev->cbd_immediate = ntohl(*p++);
356 } else {
357 dev->cbd_immediate = 0;
358 }
359
360 args->ndevs++;
361
362 dprintk("%s: type %d layout 0x%x immediate %d\n",
363 __func__, dev->cbd_notify_type, dev->cbd_layout_type,
364 dev->cbd_immediate);
365 }
366out:
367 dprintk("%s: status %d ndevs %d\n",
368 __func__, ntohl(status), args->ndevs);
369 return status;
370err:
371 kfree(args->devs);
372 goto out;
373}
374
287static __be32 decode_sessionid(struct xdr_stream *xdr, 375static __be32 decode_sessionid(struct xdr_stream *xdr,
288 struct nfs4_sessionid *sid) 376 struct nfs4_sessionid *sid)
289{ 377{
@@ -639,10 +727,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
639 case OP_CB_RECALL_ANY: 727 case OP_CB_RECALL_ANY:
640 case OP_CB_RECALL_SLOT: 728 case OP_CB_RECALL_SLOT:
641 case OP_CB_LAYOUTRECALL: 729 case OP_CB_LAYOUTRECALL:
730 case OP_CB_NOTIFY_DEVICEID:
642 *op = &callback_ops[op_nr]; 731 *op = &callback_ops[op_nr];
643 break; 732 break;
644 733
645 case OP_CB_NOTIFY_DEVICEID:
646 case OP_CB_NOTIFY: 734 case OP_CB_NOTIFY:
647 case OP_CB_PUSH_DELEG: 735 case OP_CB_PUSH_DELEG:
648 case OP_CB_RECALLABLE_OBJ_AVAIL: 736 case OP_CB_RECALLABLE_OBJ_AVAIL:
@@ -849,6 +937,12 @@ static struct callback_op callback_ops[] = {
849 (callback_decode_arg_t)decode_layoutrecall_args, 937 (callback_decode_arg_t)decode_layoutrecall_args,
850 .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, 938 .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
851 }, 939 },
940 [OP_CB_NOTIFY_DEVICEID] = {
941 .process_op = (callback_process_op_t)nfs4_callback_devicenotify,
942 .decode_args =
943 (callback_decode_arg_t)decode_devicenotify_args,
944 .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ,
945 },
852 [OP_CB_SEQUENCE] = { 946 [OP_CB_SEQUENCE] = {
853 .process_op = (callback_process_op_t)nfs4_callback_sequence, 947 .process_op = (callback_process_op_t)nfs4_callback_sequence,
854 .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, 948 .decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 139be9647d80..b3dc2b88b65b 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -290,6 +290,8 @@ static void nfs_free_client(struct nfs_client *clp)
290 if (clp->cl_machine_cred != NULL) 290 if (clp->cl_machine_cred != NULL)
291 put_rpccred(clp->cl_machine_cred); 291 put_rpccred(clp->cl_machine_cred);
292 292
293 nfs4_deviceid_purge_client(clp);
294
293 kfree(clp->cl_hostname); 295 kfree(clp->cl_hostname);
294 kfree(clp); 296 kfree(clp);
295 297
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 424e47773a84..ededdbd0db38 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -512,12 +512,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
512 struct page **xdr_pages, struct page *page, unsigned int buflen) 512 struct page **xdr_pages, struct page *page, unsigned int buflen)
513{ 513{
514 struct xdr_stream stream; 514 struct xdr_stream stream;
515 struct xdr_buf buf = { 515 struct xdr_buf buf;
516 .pages = xdr_pages,
517 .page_len = buflen,
518 .buflen = buflen,
519 .len = buflen,
520 };
521 struct page *scratch; 516 struct page *scratch;
522 struct nfs_cache_array *array; 517 struct nfs_cache_array *array;
523 unsigned int count = 0; 518 unsigned int count = 0;
@@ -527,7 +522,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
527 if (scratch == NULL) 522 if (scratch == NULL)
528 return -ENOMEM; 523 return -ENOMEM;
529 524
530 xdr_init_decode(&stream, &buf, NULL); 525 xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
531 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); 526 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
532 527
533 do { 528 do {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 873c6fa8bc3b..144f2a3c7185 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1428,9 +1428,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1428 */ 1428 */
1429void nfs4_evict_inode(struct inode *inode) 1429void nfs4_evict_inode(struct inode *inode)
1430{ 1430{
1431 pnfs_destroy_layout(NFS_I(inode));
1432 truncate_inode_pages(&inode->i_data, 0); 1431 truncate_inode_pages(&inode->i_data, 0);
1433 end_writeback(inode); 1432 end_writeback(inode);
1433 pnfs_return_layout(inode);
1434 pnfs_destroy_layout(NFS_I(inode));
1434 /* If we are holding a delegation, return it! */ 1435 /* If we are holding a delegation, return it! */
1435 nfs_inode_return_delegation_noreclaim(inode); 1436 nfs_inode_return_delegation_noreclaim(inode);
1436 /* First call standard NFS clear_inode() code */ 1437 /* First call standard NFS clear_inode() code */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2df6ca7b5898..b9056cbe68d6 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -310,6 +310,7 @@ extern int nfs_migrate_page(struct address_space *,
310#endif 310#endif
311 311
312/* nfs4proc.c */ 312/* nfs4proc.c */
313extern void __nfs4_read_done_cb(struct nfs_read_data *);
313extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data); 314extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
314extern int nfs4_init_client(struct nfs_client *clp, 315extern int nfs4_init_client(struct nfs_client *clp,
315 const struct rpc_timeout *timeparms, 316 const struct rpc_timeout *timeparms,
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index be79dc9f386d..426908809c97 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -421,6 +421,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
421 struct nfs4_deviceid *id, 421 struct nfs4_deviceid *id,
422 gfp_t gfp_flags) 422 gfp_t gfp_flags)
423{ 423{
424 struct nfs4_deviceid_node *d;
424 struct nfs4_file_layout_dsaddr *dsaddr; 425 struct nfs4_file_layout_dsaddr *dsaddr;
425 int status = -EINVAL; 426 int status = -EINVAL;
426 struct nfs_server *nfss = NFS_SERVER(lo->plh_inode); 427 struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
@@ -428,7 +429,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
428 dprintk("--> %s\n", __func__); 429 dprintk("--> %s\n", __func__);
429 430
430 if (fl->pattern_offset > lgr->range.offset) { 431 if (fl->pattern_offset > lgr->range.offset) {
431 dprintk("%s pattern_offset %lld to large\n", 432 dprintk("%s pattern_offset %lld too large\n",
432 __func__, fl->pattern_offset); 433 __func__, fl->pattern_offset);
433 goto out; 434 goto out;
434 } 435 }
@@ -440,12 +441,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
440 } 441 }
441 442
442 /* find and reference the deviceid */ 443 /* find and reference the deviceid */
443 dsaddr = nfs4_fl_find_get_deviceid(id); 444 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
444 if (dsaddr == NULL) { 445 NFS_SERVER(lo->plh_inode)->nfs_client, id);
446 if (d == NULL) {
445 dsaddr = get_device_info(lo->plh_inode, id, gfp_flags); 447 dsaddr = get_device_info(lo->plh_inode, id, gfp_flags);
446 if (dsaddr == NULL) 448 if (dsaddr == NULL)
447 goto out; 449 goto out;
448 } 450 } else
451 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
449 fl->dsaddr = dsaddr; 452 fl->dsaddr = dsaddr;
450 453
451 if (fl->first_stripe_index < 0 || 454 if (fl->first_stripe_index < 0 ||
@@ -507,12 +510,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
507 gfp_t gfp_flags) 510 gfp_t gfp_flags)
508{ 511{
509 struct xdr_stream stream; 512 struct xdr_stream stream;
510 struct xdr_buf buf = { 513 struct xdr_buf buf;
511 .pages = lgr->layoutp->pages,
512 .page_len = lgr->layoutp->len,
513 .buflen = lgr->layoutp->len,
514 .len = lgr->layoutp->len,
515 };
516 struct page *scratch; 514 struct page *scratch;
517 __be32 *p; 515 __be32 *p;
518 uint32_t nfl_util; 516 uint32_t nfl_util;
@@ -524,7 +522,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
524 if (!scratch) 522 if (!scratch)
525 return -ENOMEM; 523 return -ENOMEM;
526 524
527 xdr_init_decode(&stream, &buf, NULL); 525 xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
528 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); 526 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
529 527
530 /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8), 528 /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8),
@@ -535,7 +533,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
535 533
536 memcpy(id, p, sizeof(*id)); 534 memcpy(id, p, sizeof(*id));
537 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); 535 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
538 print_deviceid(id); 536 nfs4_print_deviceid(id);
539 537
540 nfl_util = be32_to_cpup(p++); 538 nfl_util = be32_to_cpup(p++);
541 if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) 539 if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
@@ -653,16 +651,19 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
653/* 651/*
654 * filelayout_pg_test(). Called by nfs_can_coalesce_requests() 652 * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
655 * 653 *
656 * return 1 : coalesce page 654 * return true : coalesce page
657 * return 0 : don't coalesce page 655 * return false : don't coalesce page
658 */ 656 */
659int 657bool
660filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 658filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
661 struct nfs_page *req) 659 struct nfs_page *req)
662{ 660{
663 u64 p_stripe, r_stripe; 661 u64 p_stripe, r_stripe;
664 u32 stripe_unit; 662 u32 stripe_unit;
665 663
664 if (!pnfs_generic_pg_test(pgio, prev, req))
665 return 0;
666
666 if (!pgio->pg_lseg) 667 if (!pgio->pg_lseg)
667 return 1; 668 return 1;
668 p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; 669 p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
@@ -860,6 +861,12 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
860 return -ENOMEM; 861 return -ENOMEM;
861} 862}
862 863
864static void
865filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
866{
867 nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node));
868}
869
863static struct pnfs_layoutdriver_type filelayout_type = { 870static struct pnfs_layoutdriver_type filelayout_type = {
864 .id = LAYOUT_NFSV4_1_FILES, 871 .id = LAYOUT_NFSV4_1_FILES,
865 .name = "LAYOUT_NFSV4_1_FILES", 872 .name = "LAYOUT_NFSV4_1_FILES",
@@ -872,6 +879,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
872 .commit_pagelist = filelayout_commit_pagelist, 879 .commit_pagelist = filelayout_commit_pagelist,
873 .read_pagelist = filelayout_read_pagelist, 880 .read_pagelist = filelayout_read_pagelist,
874 .write_pagelist = filelayout_write_pagelist, 881 .write_pagelist = filelayout_write_pagelist,
882 .free_deviceid_node = filelayout_free_deveiceid_node,
875}; 883};
876 884
877static int __init nfs4filelayout_init(void) 885static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 2b461d77b43a..cebe01e3795e 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -59,9 +59,7 @@ struct nfs4_pnfs_ds {
59#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001 59#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001
60 60
61struct nfs4_file_layout_dsaddr { 61struct nfs4_file_layout_dsaddr {
62 struct hlist_node node; 62 struct nfs4_deviceid_node id_node;
63 struct nfs4_deviceid deviceid;
64 atomic_t ref;
65 unsigned long flags; 63 unsigned long flags;
66 u32 stripe_count; 64 u32 stripe_count;
67 u8 *stripe_indices; 65 u8 *stripe_indices;
@@ -95,14 +93,12 @@ extern struct nfs_fh *
95nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); 93nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
96 94
97extern void print_ds(struct nfs4_pnfs_ds *ds); 95extern void print_ds(struct nfs4_pnfs_ds *ds);
98extern void print_deviceid(struct nfs4_deviceid *dev_id);
99u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); 96u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
100u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); 97u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
101struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, 98struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
102 u32 ds_idx); 99 u32 ds_idx);
103extern struct nfs4_file_layout_dsaddr *
104nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
105extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 100extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
101extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
106struct nfs4_file_layout_dsaddr * 102struct nfs4_file_layout_dsaddr *
107get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); 103get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags);
108 104
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index db07c7af1395..3b7bf1377264 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -37,30 +37,6 @@
37#define NFSDBG_FACILITY NFSDBG_PNFS_LD 37#define NFSDBG_FACILITY NFSDBG_PNFS_LD
38 38
39/* 39/*
40 * Device ID RCU cache. A device ID is unique per client ID and layout type.
41 */
42#define NFS4_FL_DEVICE_ID_HASH_BITS 5
43#define NFS4_FL_DEVICE_ID_HASH_SIZE (1 << NFS4_FL_DEVICE_ID_HASH_BITS)
44#define NFS4_FL_DEVICE_ID_HASH_MASK (NFS4_FL_DEVICE_ID_HASH_SIZE - 1)
45
46static inline u32
47nfs4_fl_deviceid_hash(struct nfs4_deviceid *id)
48{
49 unsigned char *cptr = (unsigned char *)id->data;
50 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
51 u32 x = 0;
52
53 while (nbytes--) {
54 x *= 37;
55 x += *cptr++;
56 }
57 return x & NFS4_FL_DEVICE_ID_HASH_MASK;
58}
59
60static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE];
61static DEFINE_SPINLOCK(filelayout_deviceid_lock);
62
63/*
64 * Data server cache 40 * Data server cache
65 * 41 *
66 * Data servers can be mapped to different device ids. 42 * Data servers can be mapped to different device ids.
@@ -89,27 +65,6 @@ print_ds(struct nfs4_pnfs_ds *ds)
89 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); 65 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
90} 66}
91 67
92void
93print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
94{
95 int i;
96
97 ifdebug(FACILITY) {
98 printk("%s dsaddr->ds_num %d\n", __func__,
99 dsaddr->ds_num);
100 for (i = 0; i < dsaddr->ds_num; i++)
101 print_ds(dsaddr->ds_list[i]);
102 }
103}
104
105void print_deviceid(struct nfs4_deviceid *id)
106{
107 u32 *p = (u32 *)id;
108
109 dprintk("%s: device id= [%x%x%x%x]\n", __func__,
110 p[0], p[1], p[2], p[3]);
111}
112
113/* nfs4_ds_cache_lock is held */ 68/* nfs4_ds_cache_lock is held */
114static struct nfs4_pnfs_ds * 69static struct nfs4_pnfs_ds *
115_data_server_lookup_locked(u32 ip_addr, u32 port) 70_data_server_lookup_locked(u32 ip_addr, u32 port)
@@ -201,13 +156,13 @@ destroy_ds(struct nfs4_pnfs_ds *ds)
201 kfree(ds); 156 kfree(ds);
202} 157}
203 158
204static void 159void
205nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) 160nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
206{ 161{
207 struct nfs4_pnfs_ds *ds; 162 struct nfs4_pnfs_ds *ds;
208 int i; 163 int i;
209 164
210 print_deviceid(&dsaddr->deviceid); 165 nfs4_print_deviceid(&dsaddr->id_node.deviceid);
211 166
212 for (i = 0; i < dsaddr->ds_num; i++) { 167 for (i = 0; i < dsaddr->ds_num; i++) {
213 ds = dsaddr->ds_list[i]; 168 ds = dsaddr->ds_list[i];
@@ -353,12 +308,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
353 u8 max_stripe_index; 308 u8 max_stripe_index;
354 struct nfs4_file_layout_dsaddr *dsaddr = NULL; 309 struct nfs4_file_layout_dsaddr *dsaddr = NULL;
355 struct xdr_stream stream; 310 struct xdr_stream stream;
356 struct xdr_buf buf = { 311 struct xdr_buf buf;
357 .pages = pdev->pages,
358 .page_len = pdev->pglen,
359 .buflen = pdev->pglen,
360 .len = pdev->pglen,
361 };
362 struct page *scratch; 312 struct page *scratch;
363 313
364 /* set up xdr stream */ 314 /* set up xdr stream */
@@ -366,7 +316,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
366 if (!scratch) 316 if (!scratch)
367 goto out_err; 317 goto out_err;
368 318
369 xdr_init_decode(&stream, &buf, NULL); 319 xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
370 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); 320 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
371 321
372 /* Get the stripe count (number of stripe index) */ 322 /* Get the stripe count (number of stripe index) */
@@ -431,8 +381,10 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
431 dsaddr->stripe_indices = stripe_indices; 381 dsaddr->stripe_indices = stripe_indices;
432 stripe_indices = NULL; 382 stripe_indices = NULL;
433 dsaddr->ds_num = num; 383 dsaddr->ds_num = num;
434 384 nfs4_init_deviceid_node(&dsaddr->id_node,
435 memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id)); 385 NFS_SERVER(ino)->pnfs_curr_ld,
386 NFS_SERVER(ino)->nfs_client,
387 &pdev->dev_id);
436 388
437 for (i = 0; i < dsaddr->ds_num; i++) { 389 for (i = 0; i < dsaddr->ds_num; i++) {
438 int j; 390 int j;
@@ -505,8 +457,8 @@ out_err:
505static struct nfs4_file_layout_dsaddr * 457static struct nfs4_file_layout_dsaddr *
506decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags) 458decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
507{ 459{
508 struct nfs4_file_layout_dsaddr *d, *new; 460 struct nfs4_deviceid_node *d;
509 long hash; 461 struct nfs4_file_layout_dsaddr *n, *new;
510 462
511 new = decode_device(inode, dev, gfp_flags); 463 new = decode_device(inode, dev, gfp_flags);
512 if (!new) { 464 if (!new) {
@@ -515,20 +467,13 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
515 return NULL; 467 return NULL;
516 } 468 }
517 469
518 spin_lock(&filelayout_deviceid_lock); 470 d = nfs4_insert_deviceid_node(&new->id_node);
519 d = nfs4_fl_find_get_deviceid(&new->deviceid); 471 n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
520 if (d) { 472 if (n != new) {
521 spin_unlock(&filelayout_deviceid_lock);
522 nfs4_fl_free_deviceid(new); 473 nfs4_fl_free_deviceid(new);
523 return d; 474 return n;
524 } 475 }
525 476
526 INIT_HLIST_NODE(&new->node);
527 atomic_set(&new->ref, 1);
528 hash = nfs4_fl_deviceid_hash(&new->deviceid);
529 hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]);
530 spin_unlock(&filelayout_deviceid_lock);
531
532 return new; 477 return new;
533} 478}
534 479
@@ -600,35 +545,7 @@ out_free:
600void 545void
601nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) 546nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
602{ 547{
603 if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) { 548 nfs4_put_deviceid_node(&dsaddr->id_node);
604 hlist_del_rcu(&dsaddr->node);
605 spin_unlock(&filelayout_deviceid_lock);
606
607 synchronize_rcu();
608 nfs4_fl_free_deviceid(dsaddr);
609 }
610}
611
612struct nfs4_file_layout_dsaddr *
613nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id)
614{
615 struct nfs4_file_layout_dsaddr *d;
616 struct hlist_node *n;
617 long hash = nfs4_fl_deviceid_hash(id);
618
619
620 rcu_read_lock();
621 hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
622 if (!memcmp(&d->deviceid, id, sizeof(*id))) {
623 if (!atomic_inc_not_zero(&d->ref))
624 goto fail;
625 rcu_read_unlock();
626 return d;
627 }
628 }
629fail:
630 rcu_read_unlock();
631 return NULL;
632} 549}
633 550
634/* 551/*
@@ -676,15 +593,15 @@ static void
676filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr, 593filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
677 int err, u32 ds_addr) 594 int err, u32 ds_addr)
678{ 595{
679 u32 *p = (u32 *)&dsaddr->deviceid; 596 u32 *p = (u32 *)&dsaddr->id_node.deviceid;
680 597
681 printk(KERN_ERR "NFS: data server %x connection error %d." 598 printk(KERN_ERR "NFS: data server %x connection error %d."
682 " Deviceid [%x%x%x%x] marked out of use.\n", 599 " Deviceid [%x%x%x%x] marked out of use.\n",
683 ds_addr, err, p[0], p[1], p[2], p[3]); 600 ds_addr, err, p[0], p[1], p[2], p[3]);
684 601
685 spin_lock(&filelayout_deviceid_lock); 602 spin_lock(&nfs4_ds_cache_lock);
686 dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY; 603 dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
687 spin_unlock(&filelayout_deviceid_lock); 604 spin_unlock(&nfs4_ds_cache_lock);
688} 605}
689 606
690struct nfs4_pnfs_ds * 607struct nfs4_pnfs_ds *
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d0e15dba7a5a..d2c4b59c896d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2363,6 +2363,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
2363 struct nfs4_state *state = NULL; 2363 struct nfs4_state *state = NULL;
2364 int status; 2364 int status;
2365 2365
2366 if (pnfs_ld_layoutret_on_setattr(inode))
2367 pnfs_return_layout(inode);
2368
2366 nfs_fattr_init(fattr); 2369 nfs_fattr_init(fattr);
2367 2370
2368 /* Search for an existing open(O_WRITE) file */ 2371 /* Search for an existing open(O_WRITE) file */
@@ -3177,6 +3180,11 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
3177 return err; 3180 return err;
3178} 3181}
3179 3182
3183void __nfs4_read_done_cb(struct nfs_read_data *data)
3184{
3185 nfs_invalidate_atime(data->inode);
3186}
3187
3180static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) 3188static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
3181{ 3189{
3182 struct nfs_server *server = NFS_SERVER(data->inode); 3190 struct nfs_server *server = NFS_SERVER(data->inode);
@@ -3186,7 +3194,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
3186 return -EAGAIN; 3194 return -EAGAIN;
3187 } 3195 }
3188 3196
3189 nfs_invalidate_atime(data->inode); 3197 __nfs4_read_done_cb(data);
3190 if (task->tk_status > 0) 3198 if (task->tk_status > 0)
3191 renew_lease(server, data->timestamp); 3199 renew_lease(server, data->timestamp);
3192 return 0; 3200 return 0;
@@ -3200,7 +3208,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
3200 if (!nfs4_sequence_done(task, &data->res.seq_res)) 3208 if (!nfs4_sequence_done(task, &data->res.seq_res))
3201 return -EAGAIN; 3209 return -EAGAIN;
3202 3210
3203 return data->read_done_cb(task, data); 3211 return data->read_done_cb ? data->read_done_cb(task, data) :
3212 nfs4_read_done_cb(task, data);
3204} 3213}
3205 3214
3206static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) 3215static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
@@ -3245,7 +3254,8 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
3245{ 3254{
3246 if (!nfs4_sequence_done(task, &data->res.seq_res)) 3255 if (!nfs4_sequence_done(task, &data->res.seq_res))
3247 return -EAGAIN; 3256 return -EAGAIN;
3248 return data->write_done_cb(task, data); 3257 return data->write_done_cb ? data->write_done_cb(task, data) :
3258 nfs4_write_done_cb(task, data);
3249} 3259}
3250 3260
3251/* Reset the the nfs_write_data to send the write to the MDS. */ 3261/* Reset the the nfs_write_data to send the write to the MDS. */
@@ -5671,6 +5681,88 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
5671 return status; 5681 return status;
5672} 5682}
5673 5683
5684static void
5685nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
5686{
5687 struct nfs4_layoutreturn *lrp = calldata;
5688
5689 dprintk("--> %s\n", __func__);
5690 if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args,
5691 &lrp->res.seq_res, 0, task))
5692 return;
5693 rpc_call_start(task);
5694}
5695
5696static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
5697{
5698 struct nfs4_layoutreturn *lrp = calldata;
5699 struct nfs_server *server;
5700
5701 dprintk("--> %s\n", __func__);
5702
5703 if (!nfs4_sequence_done(task, &lrp->res.seq_res))
5704 return;
5705
5706 server = NFS_SERVER(lrp->args.inode);
5707 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
5708 nfs_restart_rpc(task, lrp->clp);
5709 return;
5710 }
5711 if (task->tk_status == 0) {
5712 struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
5713
5714 if (lrp->res.lrs_present) {
5715 spin_lock(&lo->plh_inode->i_lock);
5716 pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
5717 spin_unlock(&lo->plh_inode->i_lock);
5718 } else
5719 BUG_ON(!list_empty(&lo->plh_segs));
5720 }
5721 dprintk("<-- %s\n", __func__);
5722}
5723
5724static void nfs4_layoutreturn_release(void *calldata)
5725{
5726 struct nfs4_layoutreturn *lrp = calldata;
5727
5728 dprintk("--> %s\n", __func__);
5729 put_layout_hdr(NFS_I(lrp->args.inode)->layout);
5730 kfree(calldata);
5731 dprintk("<-- %s\n", __func__);
5732}
5733
5734static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
5735 .rpc_call_prepare = nfs4_layoutreturn_prepare,
5736 .rpc_call_done = nfs4_layoutreturn_done,
5737 .rpc_release = nfs4_layoutreturn_release,
5738};
5739
5740int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
5741{
5742 struct rpc_task *task;
5743 struct rpc_message msg = {
5744 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
5745 .rpc_argp = &lrp->args,
5746 .rpc_resp = &lrp->res,
5747 };
5748 struct rpc_task_setup task_setup_data = {
5749 .rpc_client = lrp->clp->cl_rpcclient,
5750 .rpc_message = &msg,
5751 .callback_ops = &nfs4_layoutreturn_call_ops,
5752 .callback_data = lrp,
5753 };
5754 int status;
5755
5756 dprintk("--> %s\n", __func__);
5757 task = rpc_run_task(&task_setup_data);
5758 if (IS_ERR(task))
5759 return PTR_ERR(task);
5760 status = task->tk_status;
5761 dprintk("<-- %s status=%d\n", __func__, status);
5762 rpc_put_task(task);
5763 return status;
5764}
5765
5674static int 5766static int
5675_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) 5767_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
5676{ 5768{
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c3ccd2c46834..d869a5e5464b 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -338,7 +338,11 @@ static int nfs4_stat_to_errno(int);
338 1 /* layoutupdate4 layout type */ + \ 338 1 /* layoutupdate4 layout type */ + \
339 1 /* NULL filelayout layoutupdate4 payload */) 339 1 /* NULL filelayout layoutupdate4 payload */)
340#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) 340#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
341 341#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
342 encode_stateid_maxsz + \
343 1 /* FIXME: opaque lrf_body always empty at the moment */)
344#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
345 1 + decode_stateid_maxsz)
342#else /* CONFIG_NFS_V4_1 */ 346#else /* CONFIG_NFS_V4_1 */
343#define encode_sequence_maxsz 0 347#define encode_sequence_maxsz 0
344#define decode_sequence_maxsz 0 348#define decode_sequence_maxsz 0
@@ -760,7 +764,14 @@ static int nfs4_stat_to_errno(int);
760 decode_putfh_maxsz + \ 764 decode_putfh_maxsz + \
761 decode_layoutcommit_maxsz + \ 765 decode_layoutcommit_maxsz + \
762 decode_getattr_maxsz) 766 decode_getattr_maxsz)
763 767#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \
768 encode_sequence_maxsz + \
769 encode_putfh_maxsz + \
770 encode_layoutreturn_maxsz)
771#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \
772 decode_sequence_maxsz + \
773 decode_putfh_maxsz + \
774 decode_layoutreturn_maxsz)
764 775
765const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + 776const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
766 compound_encode_hdr_maxsz + 777 compound_encode_hdr_maxsz +
@@ -1864,6 +1875,7 @@ encode_layoutget(struct xdr_stream *xdr,
1864 1875
1865static int 1876static int
1866encode_layoutcommit(struct xdr_stream *xdr, 1877encode_layoutcommit(struct xdr_stream *xdr,
1878 struct inode *inode,
1867 const struct nfs4_layoutcommit_args *args, 1879 const struct nfs4_layoutcommit_args *args,
1868 struct compound_hdr *hdr) 1880 struct compound_hdr *hdr)
1869{ 1881{
@@ -1872,7 +1884,7 @@ encode_layoutcommit(struct xdr_stream *xdr,
1872 dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten, 1884 dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
1873 NFS_SERVER(args->inode)->pnfs_curr_ld->id); 1885 NFS_SERVER(args->inode)->pnfs_curr_ld->id);
1874 1886
1875 p = reserve_space(xdr, 48 + NFS4_STATEID_SIZE); 1887 p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
1876 *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); 1888 *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
1877 /* Only whole file layouts */ 1889 /* Only whole file layouts */
1878 p = xdr_encode_hyper(p, 0); /* offset */ 1890 p = xdr_encode_hyper(p, 0); /* offset */
@@ -1883,12 +1895,49 @@ encode_layoutcommit(struct xdr_stream *xdr,
1883 p = xdr_encode_hyper(p, args->lastbytewritten); 1895 p = xdr_encode_hyper(p, args->lastbytewritten);
1884 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ 1896 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
1885 *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ 1897 *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
1886 *p++ = cpu_to_be32(0); /* no file layout payload */ 1898
1899 if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)
1900 NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
1901 NFS_I(inode)->layout, xdr, args);
1902 else {
1903 p = reserve_space(xdr, 4);
1904 *p = cpu_to_be32(0); /* no layout-type payload */
1905 }
1887 1906
1888 hdr->nops++; 1907 hdr->nops++;
1889 hdr->replen += decode_layoutcommit_maxsz; 1908 hdr->replen += decode_layoutcommit_maxsz;
1890 return 0; 1909 return 0;
1891} 1910}
1911
1912static void
1913encode_layoutreturn(struct xdr_stream *xdr,
1914 const struct nfs4_layoutreturn_args *args,
1915 struct compound_hdr *hdr)
1916{
1917 __be32 *p;
1918
1919 p = reserve_space(xdr, 20);
1920 *p++ = cpu_to_be32(OP_LAYOUTRETURN);
1921 *p++ = cpu_to_be32(0); /* reclaim. always 0 for now */
1922 *p++ = cpu_to_be32(args->layout_type);
1923 *p++ = cpu_to_be32(IOMODE_ANY);
1924 *p = cpu_to_be32(RETURN_FILE);
1925 p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
1926 p = xdr_encode_hyper(p, 0);
1927 p = xdr_encode_hyper(p, NFS4_MAX_UINT64);
1928 spin_lock(&args->inode->i_lock);
1929 xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
1930 spin_unlock(&args->inode->i_lock);
1931 if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
1932 NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
1933 NFS_I(args->inode)->layout, xdr, args);
1934 } else {
1935 p = reserve_space(xdr, 4);
1936 *p = cpu_to_be32(0);
1937 }
1938 hdr->nops++;
1939 hdr->replen += decode_layoutreturn_maxsz;
1940}
1892#endif /* CONFIG_NFS_V4_1 */ 1941#endif /* CONFIG_NFS_V4_1 */
1893 1942
1894/* 1943/*
@@ -2706,10 +2755,12 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
2706/* 2755/*
2707 * Encode LAYOUTCOMMIT request 2756 * Encode LAYOUTCOMMIT request
2708 */ 2757 */
2709static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, 2758static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
2710 struct xdr_stream *xdr, 2759 struct xdr_stream *xdr,
2711 struct nfs4_layoutcommit_args *args) 2760 struct nfs4_layoutcommit_args *args)
2712{ 2761{
2762 struct nfs4_layoutcommit_data *data =
2763 container_of(args, struct nfs4_layoutcommit_data, args);
2713 struct compound_hdr hdr = { 2764 struct compound_hdr hdr = {
2714 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2765 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2715 }; 2766 };
@@ -2717,10 +2768,27 @@ static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
2717 encode_compound_hdr(xdr, req, &hdr); 2768 encode_compound_hdr(xdr, req, &hdr);
2718 encode_sequence(xdr, &args->seq_args, &hdr); 2769 encode_sequence(xdr, &args->seq_args, &hdr);
2719 encode_putfh(xdr, NFS_FH(args->inode), &hdr); 2770 encode_putfh(xdr, NFS_FH(args->inode), &hdr);
2720 encode_layoutcommit(xdr, args, &hdr); 2771 encode_layoutcommit(xdr, data->args.inode, args, &hdr);
2721 encode_getfattr(xdr, args->bitmask, &hdr); 2772 encode_getfattr(xdr, args->bitmask, &hdr);
2722 encode_nops(&hdr); 2773 encode_nops(&hdr);
2723 return 0; 2774}
2775
2776/*
2777 * Encode LAYOUTRETURN request
2778 */
2779static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req,
2780 struct xdr_stream *xdr,
2781 struct nfs4_layoutreturn_args *args)
2782{
2783 struct compound_hdr hdr = {
2784 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2785 };
2786
2787 encode_compound_hdr(xdr, req, &hdr);
2788 encode_sequence(xdr, &args->seq_args, &hdr);
2789 encode_putfh(xdr, NFS_FH(args->inode), &hdr);
2790 encode_layoutreturn(xdr, args, &hdr);
2791 encode_nops(&hdr);
2724} 2792}
2725#endif /* CONFIG_NFS_V4_1 */ 2793#endif /* CONFIG_NFS_V4_1 */
2726 2794
@@ -5203,6 +5271,27 @@ out_overflow:
5203 return -EIO; 5271 return -EIO;
5204} 5272}
5205 5273
5274static int decode_layoutreturn(struct xdr_stream *xdr,
5275 struct nfs4_layoutreturn_res *res)
5276{
5277 __be32 *p;
5278 int status;
5279
5280 status = decode_op_hdr(xdr, OP_LAYOUTRETURN);
5281 if (status)
5282 return status;
5283 p = xdr_inline_decode(xdr, 4);
5284 if (unlikely(!p))
5285 goto out_overflow;
5286 res->lrs_present = be32_to_cpup(p);
5287 if (res->lrs_present)
5288 status = decode_stateid(xdr, &res->stateid);
5289 return status;
5290out_overflow:
5291 print_overflow_msg(__func__, xdr);
5292 return -EIO;
5293}
5294
5206static int decode_layoutcommit(struct xdr_stream *xdr, 5295static int decode_layoutcommit(struct xdr_stream *xdr,
5207 struct rpc_rqst *req, 5296 struct rpc_rqst *req,
5208 struct nfs4_layoutcommit_res *res) 5297 struct nfs4_layoutcommit_res *res)
@@ -6320,6 +6409,30 @@ out:
6320} 6409}
6321 6410
6322/* 6411/*
6412 * Decode LAYOUTRETURN response
6413 */
6414static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp,
6415 struct xdr_stream *xdr,
6416 struct nfs4_layoutreturn_res *res)
6417{
6418 struct compound_hdr hdr;
6419 int status;
6420
6421 status = decode_compound_hdr(xdr, &hdr);
6422 if (status)
6423 goto out;
6424 status = decode_sequence(xdr, &res->seq_res, rqstp);
6425 if (status)
6426 goto out;
6427 status = decode_putfh(xdr);
6428 if (status)
6429 goto out;
6430 status = decode_layoutreturn(xdr, res);
6431out:
6432 return status;
6433}
6434
6435/*
6323 * Decode LAYOUTCOMMIT response 6436 * Decode LAYOUTCOMMIT response
6324 */ 6437 */
6325static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, 6438static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
@@ -6547,6 +6660,7 @@ struct rpc_procinfo nfs4_procedures[] = {
6547 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), 6660 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
6548 PROC(LAYOUTGET, enc_layoutget, dec_layoutget), 6661 PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
6549 PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), 6662 PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
6663 PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn),
6550#endif /* CONFIG_NFS_V4_1 */ 6664#endif /* CONFIG_NFS_V4_1 */
6551}; 6665};
6552 6666
diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild
new file mode 100644
index 000000000000..ed30ea072bb8
--- /dev/null
+++ b/fs/nfs/objlayout/Kbuild
@@ -0,0 +1,5 @@
1#
2# Makefile for the pNFS Objects Layout Driver kernel module
3#
4objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o
5obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
new file mode 100644
index 000000000000..9cf208df1f25
--- /dev/null
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -0,0 +1,1057 @@
1/*
2 * pNFS Objects layout implementation over open-osd initiator library
3 *
4 * Copyright (C) 2009 Panasas Inc. [year of first publication]
5 * All rights reserved.
6 *
7 * Benny Halevy <bhalevy@panasas.com>
8 * Boaz Harrosh <bharrosh@panasas.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * See the file COPYING included with this distribution for more details.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 *
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the Panasas company nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
28 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
34 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40#include <linux/module.h>
41#include <scsi/osd_initiator.h>
42
43#include "objlayout.h"
44
45#define NFSDBG_FACILITY NFSDBG_PNFS_LD
46
47#define _LLU(x) ((unsigned long long)x)
48
49enum { BIO_MAX_PAGES_KMALLOC =
50 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
51};
52
53struct objio_dev_ent {
54 struct nfs4_deviceid_node id_node;
55 struct osd_dev *od;
56};
57
58static void
59objio_free_deviceid_node(struct nfs4_deviceid_node *d)
60{
61 struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
62
63 dprintk("%s: free od=%p\n", __func__, de->od);
64 osduld_put_device(de->od);
65 kfree(de);
66}
67
68static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
69 const struct nfs4_deviceid *d_id)
70{
71 struct nfs4_deviceid_node *d;
72 struct objio_dev_ent *de;
73
74 d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
75 if (!d)
76 return NULL;
77
78 de = container_of(d, struct objio_dev_ent, id_node);
79 return de;
80}
81
82static struct objio_dev_ent *
83_dev_list_add(const struct nfs_server *nfss,
84 const struct nfs4_deviceid *d_id, struct osd_dev *od,
85 gfp_t gfp_flags)
86{
87 struct nfs4_deviceid_node *d;
88 struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
89 struct objio_dev_ent *n;
90
91 if (!de) {
92 dprintk("%s: -ENOMEM od=%p\n", __func__, od);
93 return NULL;
94 }
95
96 dprintk("%s: Adding od=%p\n", __func__, od);
97 nfs4_init_deviceid_node(&de->id_node,
98 nfss->pnfs_curr_ld,
99 nfss->nfs_client,
100 d_id);
101 de->od = od;
102
103 d = nfs4_insert_deviceid_node(&de->id_node);
104 n = container_of(d, struct objio_dev_ent, id_node);
105 if (n != de) {
106 dprintk("%s: Race with other n->od=%p\n", __func__, n->od);
107 objio_free_deviceid_node(&de->id_node);
108 de = n;
109 }
110
111 atomic_inc(&de->id_node.ref);
112 return de;
113}
114
115struct caps_buffers {
116 u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
117 u8 creds[OSD_CAP_LEN];
118};
119
120struct objio_segment {
121 struct pnfs_layout_segment lseg;
122
123 struct pnfs_osd_object_cred *comps;
124
125 unsigned mirrors_p1;
126 unsigned stripe_unit;
127 unsigned group_width; /* Data stripe_units without integrity comps */
128 u64 group_depth;
129 unsigned group_count;
130
131 unsigned max_io_size;
132
133 unsigned comps_index;
134 unsigned num_comps;
135 /* variable length */
136 struct objio_dev_ent *ods[];
137};
138
139static inline struct objio_segment *
140OBJIO_LSEG(struct pnfs_layout_segment *lseg)
141{
142 return container_of(lseg, struct objio_segment, lseg);
143}
144
145struct objio_state;
146typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
147
148struct objio_state {
149 /* Generic layer */
150 struct objlayout_io_state ol_state;
151
152 struct objio_segment *layout;
153
154 struct kref kref;
155 objio_done_fn done;
156 void *private;
157
158 unsigned long length;
159 unsigned numdevs; /* Actually used devs in this IO */
160 /* A per-device variable array of size numdevs */
161 struct _objio_per_comp {
162 struct bio *bio;
163 struct osd_request *or;
164 unsigned long length;
165 u64 offset;
166 unsigned dev;
167 } per_dev[];
168};
169
170/* Send and wait for a get_device_info of devices in the layout,
171 then look them up with the osd_initiator library */
172static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
173 struct objio_segment *objio_seg, unsigned comp,
174 gfp_t gfp_flags)
175{
176 struct pnfs_osd_deviceaddr *deviceaddr;
177 struct nfs4_deviceid *d_id;
178 struct objio_dev_ent *ode;
179 struct osd_dev *od;
180 struct osd_dev_info odi;
181 int err;
182
183 d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
184
185 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
186 if (ode)
187 return ode;
188
189 err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
190 if (unlikely(err)) {
191 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
192 __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
193 return ERR_PTR(err);
194 }
195
196 odi.systemid_len = deviceaddr->oda_systemid.len;
197 if (odi.systemid_len > sizeof(odi.systemid)) {
198 err = -EINVAL;
199 goto out;
200 } else if (odi.systemid_len)
201 memcpy(odi.systemid, deviceaddr->oda_systemid.data,
202 odi.systemid_len);
203 odi.osdname_len = deviceaddr->oda_osdname.len;
204 odi.osdname = (u8 *)deviceaddr->oda_osdname.data;
205
206 if (!odi.osdname_len && !odi.systemid_len) {
207 dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
208 __func__);
209 err = -ENODEV;
210 goto out;
211 }
212
213 od = osduld_info_lookup(&odi);
214 if (unlikely(IS_ERR(od))) {
215 err = PTR_ERR(od);
216 dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
217 goto out;
218 }
219
220 ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
221 gfp_flags);
222
223out:
224 dprintk("%s: return=%d\n", __func__, err);
225 objlayout_put_deviceinfo(deviceaddr);
226 return err ? ERR_PTR(err) : ode;
227}
228
229static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
230 struct objio_segment *objio_seg,
231 gfp_t gfp_flags)
232{
233 unsigned i;
234 int err;
235
236 /* lookup all devices */
237 for (i = 0; i < objio_seg->num_comps; i++) {
238 struct objio_dev_ent *ode;
239
240 ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags);
241 if (unlikely(IS_ERR(ode))) {
242 err = PTR_ERR(ode);
243 goto out;
244 }
245 objio_seg->ods[i] = ode;
246 }
247 err = 0;
248
249out:
250 dprintk("%s: return=%d\n", __func__, err);
251 return err;
252}
253
254static int _verify_data_map(struct pnfs_osd_layout *layout)
255{
256 struct pnfs_osd_data_map *data_map = &layout->olo_map;
257 u64 stripe_length;
258 u32 group_width;
259
260/* FIXME: Only raid0 for now. if not go through MDS */
261 if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
262 printk(KERN_ERR "Only RAID_0 for now\n");
263 return -ENOTSUPP;
264 }
265 if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
266 printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
267 data_map->odm_num_comps, data_map->odm_mirror_cnt);
268 return -EINVAL;
269 }
270
271 if (data_map->odm_group_width)
272 group_width = data_map->odm_group_width;
273 else
274 group_width = data_map->odm_num_comps /
275 (data_map->odm_mirror_cnt + 1);
276
277 stripe_length = (u64)data_map->odm_stripe_unit * group_width;
278 if (stripe_length >= (1ULL << 32)) {
279 printk(KERN_ERR "Total Stripe length(0x%llx)"
280 " >= 32bit is not supported\n", _LLU(stripe_length));
281 return -ENOTSUPP;
282 }
283
284 if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
285 printk(KERN_ERR "Stripe Unit(0x%llx)"
286 " must be Multples of PAGE_SIZE(0x%lx)\n",
287 _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
288 return -ENOTSUPP;
289 }
290
291 return 0;
292}
293
294static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
295 struct pnfs_osd_object_cred *src_comp,
296 struct caps_buffers *caps_p)
297{
298 WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
299 WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));
300
301 *cur_comp = *src_comp;
302
303 memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred,
304 sizeof(caps_p->caps_key));
305 cur_comp->oc_cap_key.cred = caps_p->caps_key;
306
307 memcpy(caps_p->creds, src_comp->oc_cap.cred,
308 sizeof(caps_p->creds));
309 cur_comp->oc_cap.cred = caps_p->creds;
310}
311
312int objio_alloc_lseg(struct pnfs_layout_segment **outp,
313 struct pnfs_layout_hdr *pnfslay,
314 struct pnfs_layout_range *range,
315 struct xdr_stream *xdr,
316 gfp_t gfp_flags)
317{
318 struct objio_segment *objio_seg;
319 struct pnfs_osd_xdr_decode_layout_iter iter;
320 struct pnfs_osd_layout layout;
321 struct pnfs_osd_object_cred *cur_comp, src_comp;
322 struct caps_buffers *caps_p;
323 int err;
324
325 err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
326 if (unlikely(err))
327 return err;
328
329 err = _verify_data_map(&layout);
330 if (unlikely(err))
331 return err;
332
333 objio_seg = kzalloc(sizeof(*objio_seg) +
334 sizeof(objio_seg->ods[0]) * layout.olo_num_comps +
335 sizeof(*objio_seg->comps) * layout.olo_num_comps +
336 sizeof(struct caps_buffers) * layout.olo_num_comps,
337 gfp_flags);
338 if (!objio_seg)
339 return -ENOMEM;
340
341 objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps);
342 cur_comp = objio_seg->comps;
343 caps_p = (void *)(cur_comp + layout.olo_num_comps);
344 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
345 copy_single_comp(cur_comp++, &src_comp, caps_p++);
346 if (unlikely(err))
347 goto err;
348
349 objio_seg->num_comps = layout.olo_num_comps;
350 objio_seg->comps_index = layout.olo_comps_index;
351 err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags);
352 if (err)
353 goto err;
354
355 objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
356 objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit;
357 if (layout.olo_map.odm_group_width) {
358 objio_seg->group_width = layout.olo_map.odm_group_width;
359 objio_seg->group_depth = layout.olo_map.odm_group_depth;
360 objio_seg->group_count = layout.olo_map.odm_num_comps /
361 objio_seg->mirrors_p1 /
362 objio_seg->group_width;
363 } else {
364 objio_seg->group_width = layout.olo_map.odm_num_comps /
365 objio_seg->mirrors_p1;
366 objio_seg->group_depth = -1;
367 objio_seg->group_count = 1;
368 }
369
370 /* Cache this calculation it will hit for every page */
371 objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE -
372 objio_seg->stripe_unit) *
373 objio_seg->group_width;
374
375 *outp = &objio_seg->lseg;
376 return 0;
377
378err:
379 kfree(objio_seg);
380 dprintk("%s: Error: return %d\n", __func__, err);
381 *outp = NULL;
382 return err;
383}
384
385void objio_free_lseg(struct pnfs_layout_segment *lseg)
386{
387 int i;
388 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
389
390 for (i = 0; i < objio_seg->num_comps; i++) {
391 if (!objio_seg->ods[i])
392 break;
393 nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node);
394 }
395 kfree(objio_seg);
396}
397
398int objio_alloc_io_state(struct pnfs_layout_segment *lseg,
399 struct objlayout_io_state **outp,
400 gfp_t gfp_flags)
401{
402 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
403 struct objio_state *ios;
404 const unsigned first_size = sizeof(*ios) +
405 objio_seg->num_comps * sizeof(ios->per_dev[0]);
406 const unsigned sec_size = objio_seg->num_comps *
407 sizeof(ios->ol_state.ioerrs[0]);
408
409 ios = kzalloc(first_size + sec_size, gfp_flags);
410 if (unlikely(!ios))
411 return -ENOMEM;
412
413 ios->layout = objio_seg;
414 ios->ol_state.ioerrs = ((void *)ios) + first_size;
415 ios->ol_state.num_comps = objio_seg->num_comps;
416
417 *outp = &ios->ol_state;
418 return 0;
419}
420
421void objio_free_io_state(struct objlayout_io_state *ol_state)
422{
423 struct objio_state *ios = container_of(ol_state, struct objio_state,
424 ol_state);
425
426 kfree(ios);
427}
428
429enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
430{
431 switch (oep) {
432 case OSD_ERR_PRI_NO_ERROR:
433 return (enum pnfs_osd_errno)0;
434
435 case OSD_ERR_PRI_CLEAR_PAGES:
436 BUG_ON(1);
437 return 0;
438
439 case OSD_ERR_PRI_RESOURCE:
440 return PNFS_OSD_ERR_RESOURCE;
441 case OSD_ERR_PRI_BAD_CRED:
442 return PNFS_OSD_ERR_BAD_CRED;
443 case OSD_ERR_PRI_NO_ACCESS:
444 return PNFS_OSD_ERR_NO_ACCESS;
445 case OSD_ERR_PRI_UNREACHABLE:
446 return PNFS_OSD_ERR_UNREACHABLE;
447 case OSD_ERR_PRI_NOT_FOUND:
448 return PNFS_OSD_ERR_NOT_FOUND;
449 case OSD_ERR_PRI_NO_SPACE:
450 return PNFS_OSD_ERR_NO_SPACE;
451 default:
452 WARN_ON(1);
453 /* fallthrough */
454 case OSD_ERR_PRI_EIO:
455 return PNFS_OSD_ERR_EIO;
456 }
457}
458
459static void _clear_bio(struct bio *bio)
460{
461 struct bio_vec *bv;
462 unsigned i;
463
464 __bio_for_each_segment(bv, bio, i, 0) {
465 unsigned this_count = bv->bv_len;
466
467 if (likely(PAGE_SIZE == this_count))
468 clear_highpage(bv->bv_page);
469 else
470 zero_user(bv->bv_page, bv->bv_offset, this_count);
471 }
472}
473
474static int _io_check(struct objio_state *ios, bool is_write)
475{
476 enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
477 int lin_ret = 0;
478 int i;
479
480 for (i = 0; i < ios->numdevs; i++) {
481 struct osd_sense_info osi;
482 struct osd_request *or = ios->per_dev[i].or;
483 unsigned dev;
484 int ret;
485
486 if (!or)
487 continue;
488
489 ret = osd_req_decode_sense(or, &osi);
490 if (likely(!ret))
491 continue;
492
493 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
494 /* start read offset passed endof file */
495 BUG_ON(is_write);
496 _clear_bio(ios->per_dev[i].bio);
497 dprintk("%s: start read offset passed end of file "
498 "offset=0x%llx, length=0x%lx\n", __func__,
499 _LLU(ios->per_dev[i].offset),
500 ios->per_dev[i].length);
501
502 continue; /* we recovered */
503 }
504 dev = ios->per_dev[i].dev;
505 objlayout_io_set_result(&ios->ol_state, dev,
506 &ios->layout->comps[dev].oc_object_id,
507 osd_pri_2_pnfs_err(osi.osd_err_pri),
508 ios->per_dev[i].offset,
509 ios->per_dev[i].length,
510 is_write);
511
512 if (osi.osd_err_pri >= oep) {
513 oep = osi.osd_err_pri;
514 lin_ret = ret;
515 }
516 }
517
518 return lin_ret;
519}
520
521/*
522 * Common IO state helpers.
523 */
524static void _io_free(struct objio_state *ios)
525{
526 unsigned i;
527
528 for (i = 0; i < ios->numdevs; i++) {
529 struct _objio_per_comp *per_dev = &ios->per_dev[i];
530
531 if (per_dev->or) {
532 osd_end_request(per_dev->or);
533 per_dev->or = NULL;
534 }
535
536 if (per_dev->bio) {
537 bio_put(per_dev->bio);
538 per_dev->bio = NULL;
539 }
540 }
541}
542
543struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
544{
545 unsigned min_dev = ios->layout->comps_index;
546 unsigned max_dev = min_dev + ios->layout->num_comps;
547
548 BUG_ON(dev < min_dev || max_dev <= dev);
549 return ios->layout->ods[dev - min_dev]->od;
550}
551
552struct _striping_info {
553 u64 obj_offset;
554 u64 group_length;
555 unsigned dev;
556 unsigned unit_off;
557};
558
559static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
560 struct _striping_info *si)
561{
562 u32 stripe_unit = ios->layout->stripe_unit;
563 u32 group_width = ios->layout->group_width;
564 u64 group_depth = ios->layout->group_depth;
565 u32 U = stripe_unit * group_width;
566
567 u64 T = U * group_depth;
568 u64 S = T * ios->layout->group_count;
569 u64 M = div64_u64(file_offset, S);
570
571 /*
572 G = (L - (M * S)) / T
573 H = (L - (M * S)) % T
574 */
575 u64 LmodU = file_offset - M * S;
576 u32 G = div64_u64(LmodU, T);
577 u64 H = LmodU - G * T;
578
579 u32 N = div_u64(H, U);
580
581 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
582 si->obj_offset = si->unit_off + (N * stripe_unit) +
583 (M * group_depth * stripe_unit);
584
585 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
586 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
587 si->dev *= ios->layout->mirrors_p1;
588
589 si->group_length = T - H;
590}
591
592static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
593 unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len,
594 gfp_t gfp_flags)
595{
596 unsigned pg = *cur_pg;
597 struct request_queue *q =
598 osd_request_queue(_io_od(ios, per_dev->dev));
599
600 per_dev->length += cur_len;
601
602 if (per_dev->bio == NULL) {
603 unsigned stripes = ios->layout->num_comps /
604 ios->layout->mirrors_p1;
605 unsigned pages_in_stripe = stripes *
606 (ios->layout->stripe_unit / PAGE_SIZE);
607 unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
608 stripes;
609
610 if (BIO_MAX_PAGES_KMALLOC < bio_size)
611 bio_size = BIO_MAX_PAGES_KMALLOC;
612
613 per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
614 if (unlikely(!per_dev->bio)) {
615 dprintk("Faild to allocate BIO size=%u\n", bio_size);
616 return -ENOMEM;
617 }
618 }
619
620 while (cur_len > 0) {
621 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
622 unsigned added_len;
623
624 BUG_ON(ios->ol_state.nr_pages <= pg);
625 cur_len -= pglen;
626
627 added_len = bio_add_pc_page(q, per_dev->bio,
628 ios->ol_state.pages[pg], pglen, pgbase);
629 if (unlikely(pglen != added_len))
630 return -ENOMEM;
631 pgbase = 0;
632 ++pg;
633 }
634 BUG_ON(cur_len);
635
636 *cur_pg = pg;
637 return 0;
638}
639
640static int _prepare_one_group(struct objio_state *ios, u64 length,
641 struct _striping_info *si, unsigned *last_pg,
642 gfp_t gfp_flags)
643{
644 unsigned stripe_unit = ios->layout->stripe_unit;
645 unsigned mirrors_p1 = ios->layout->mirrors_p1;
646 unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
647 unsigned dev = si->dev;
648 unsigned first_dev = dev - (dev % devs_in_group);
649 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
650 unsigned cur_pg = *last_pg;
651 int ret = 0;
652
653 while (length) {
654 struct _objio_per_comp *per_dev = &ios->per_dev[dev];
655 unsigned cur_len, page_off = 0;
656
657 if (!per_dev->length) {
658 per_dev->dev = dev;
659 if (dev < si->dev) {
660 per_dev->offset = si->obj_offset + stripe_unit -
661 si->unit_off;
662 cur_len = stripe_unit;
663 } else if (dev == si->dev) {
664 per_dev->offset = si->obj_offset;
665 cur_len = stripe_unit - si->unit_off;
666 page_off = si->unit_off & ~PAGE_MASK;
667 BUG_ON(page_off &&
668 (page_off != ios->ol_state.pgbase));
669 } else { /* dev > si->dev */
670 per_dev->offset = si->obj_offset - si->unit_off;
671 cur_len = stripe_unit;
672 }
673
674 if (max_comp < dev)
675 max_comp = dev;
676 } else {
677 cur_len = stripe_unit;
678 }
679 if (cur_len >= length)
680 cur_len = length;
681
682 ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
683 cur_len, gfp_flags);
684 if (unlikely(ret))
685 goto out;
686
687 dev += mirrors_p1;
688 dev = (dev % devs_in_group) + first_dev;
689
690 length -= cur_len;
691 ios->length += cur_len;
692 }
693out:
694 ios->numdevs = max_comp + mirrors_p1;
695 *last_pg = cur_pg;
696 return ret;
697}
698
699static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
700{
701 u64 length = ios->ol_state.count;
702 u64 offset = ios->ol_state.offset;
703 struct _striping_info si;
704 unsigned last_pg = 0;
705 int ret = 0;
706
707 while (length) {
708 _calc_stripe_info(ios, offset, &si);
709
710 if (length < si.group_length)
711 si.group_length = length;
712
713 ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
714 if (unlikely(ret))
715 goto out;
716
717 offset += si.group_length;
718 length -= si.group_length;
719 }
720
721out:
722 if (!ios->length)
723 return ret;
724
725 return 0;
726}
727
728static ssize_t _sync_done(struct objio_state *ios)
729{
730 struct completion *waiting = ios->private;
731
732 complete(waiting);
733 return 0;
734}
735
736static void _last_io(struct kref *kref)
737{
738 struct objio_state *ios = container_of(kref, struct objio_state, kref);
739
740 ios->done(ios);
741}
742
743static void _done_io(struct osd_request *or, void *p)
744{
745 struct objio_state *ios = p;
746
747 kref_put(&ios->kref, _last_io);
748}
749
750static ssize_t _io_exec(struct objio_state *ios)
751{
752 DECLARE_COMPLETION_ONSTACK(wait);
753 ssize_t status = 0; /* sync status */
754 unsigned i;
755 objio_done_fn saved_done_fn = ios->done;
756 bool sync = ios->ol_state.sync;
757
758 if (sync) {
759 ios->done = _sync_done;
760 ios->private = &wait;
761 }
762
763 kref_init(&ios->kref);
764
765 for (i = 0; i < ios->numdevs; i++) {
766 struct osd_request *or = ios->per_dev[i].or;
767
768 if (!or)
769 continue;
770
771 kref_get(&ios->kref);
772 osd_execute_request_async(or, _done_io, ios);
773 }
774
775 kref_put(&ios->kref, _last_io);
776
777 if (sync) {
778 wait_for_completion(&wait);
779 status = saved_done_fn(ios);
780 }
781
782 return status;
783}
784
785/*
786 * read
787 */
788static ssize_t _read_done(struct objio_state *ios)
789{
790 ssize_t status;
791 int ret = _io_check(ios, false);
792
793 _io_free(ios);
794
795 if (likely(!ret))
796 status = ios->length;
797 else
798 status = ret;
799
800 objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
801 return status;
802}
803
804static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
805{
806 struct osd_request *or = NULL;
807 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
808 unsigned dev = per_dev->dev;
809 struct pnfs_osd_object_cred *cred =
810 &ios->layout->comps[dev];
811 struct osd_obj_id obj = {
812 .partition = cred->oc_object_id.oid_partition_id,
813 .id = cred->oc_object_id.oid_object_id,
814 };
815 int ret;
816
817 or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
818 if (unlikely(!or)) {
819 ret = -ENOMEM;
820 goto err;
821 }
822 per_dev->or = or;
823
824 osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
825
826 ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
827 if (ret) {
828 dprintk("%s: Faild to osd_finalize_request() => %d\n",
829 __func__, ret);
830 goto err;
831 }
832
833 dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
834 __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
835 per_dev->length);
836
837err:
838 return ret;
839}
840
841static ssize_t _read_exec(struct objio_state *ios)
842{
843 unsigned i;
844 int ret;
845
846 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
847 if (!ios->per_dev[i].length)
848 continue;
849 ret = _read_mirrors(ios, i);
850 if (unlikely(ret))
851 goto err;
852 }
853
854 ios->done = _read_done;
855 return _io_exec(ios); /* In sync mode exec returns the io status */
856
857err:
858 _io_free(ios);
859 return ret;
860}
861
862ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
863{
864 struct objio_state *ios = container_of(ol_state, struct objio_state,
865 ol_state);
866 int ret;
867
868 ret = _io_rw_pagelist(ios, GFP_KERNEL);
869 if (unlikely(ret))
870 return ret;
871
872 return _read_exec(ios);
873}
874
875/*
876 * write
877 */
878static ssize_t _write_done(struct objio_state *ios)
879{
880 ssize_t status;
881 int ret = _io_check(ios, true);
882
883 _io_free(ios);
884
885 if (likely(!ret)) {
886 /* FIXME: should be based on the OSD's persistence model
887 * See OSD2r05 Section 4.13 Data persistence model */
888 ios->ol_state.committed = NFS_FILE_SYNC;
889 status = ios->length;
890 } else {
891 status = ret;
892 }
893
894 objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
895 return status;
896}
897
898static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
899{
900 struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
901 unsigned dev = ios->per_dev[cur_comp].dev;
902 unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
903 int ret;
904
905 for (; cur_comp < last_comp; ++cur_comp, ++dev) {
906 struct osd_request *or = NULL;
907 struct pnfs_osd_object_cred *cred =
908 &ios->layout->comps[dev];
909 struct osd_obj_id obj = {
910 .partition = cred->oc_object_id.oid_partition_id,
911 .id = cred->oc_object_id.oid_object_id,
912 };
913 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
914 struct bio *bio;
915
916 or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
917 if (unlikely(!or)) {
918 ret = -ENOMEM;
919 goto err;
920 }
921 per_dev->or = or;
922
923 if (per_dev != master_dev) {
924 bio = bio_kmalloc(GFP_NOFS,
925 master_dev->bio->bi_max_vecs);
926 if (unlikely(!bio)) {
927 dprintk("Faild to allocate BIO size=%u\n",
928 master_dev->bio->bi_max_vecs);
929 ret = -ENOMEM;
930 goto err;
931 }
932
933 __bio_clone(bio, master_dev->bio);
934 bio->bi_bdev = NULL;
935 bio->bi_next = NULL;
936 per_dev->bio = bio;
937 per_dev->dev = dev;
938 per_dev->length = master_dev->length;
939 per_dev->offset = master_dev->offset;
940 } else {
941 bio = master_dev->bio;
942 bio->bi_rw |= REQ_WRITE;
943 }
944
945 osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
946
947 ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
948 if (ret) {
949 dprintk("%s: Faild to osd_finalize_request() => %d\n",
950 __func__, ret);
951 goto err;
952 }
953
954 dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
955 __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
956 per_dev->length);
957 }
958
959err:
960 return ret;
961}
962
963static ssize_t _write_exec(struct objio_state *ios)
964{
965 unsigned i;
966 int ret;
967
968 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
969 if (!ios->per_dev[i].length)
970 continue;
971 ret = _write_mirrors(ios, i);
972 if (unlikely(ret))
973 goto err;
974 }
975
976 ios->done = _write_done;
977 return _io_exec(ios); /* In sync mode exec returns the io->status */
978
979err:
980 _io_free(ios);
981 return ret;
982}
983
984ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
985{
986 struct objio_state *ios = container_of(ol_state, struct objio_state,
987 ol_state);
988 int ret;
989
990 /* TODO: ios->stable = stable; */
991 ret = _io_rw_pagelist(ios, GFP_NOFS);
992 if (unlikely(ret))
993 return ret;
994
995 return _write_exec(ios);
996}
997
998static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
999 struct nfs_page *prev, struct nfs_page *req)
1000{
1001 if (!pnfs_generic_pg_test(pgio, prev, req))
1002 return false;
1003
1004 return pgio->pg_count + req->wb_bytes <=
1005 OBJIO_LSEG(pgio->pg_lseg)->max_io_size;
1006}
1007
1008static struct pnfs_layoutdriver_type objlayout_type = {
1009 .id = LAYOUT_OSD2_OBJECTS,
1010 .name = "LAYOUT_OSD2_OBJECTS",
1011 .flags = PNFS_LAYOUTRET_ON_SETATTR,
1012
1013 .alloc_layout_hdr = objlayout_alloc_layout_hdr,
1014 .free_layout_hdr = objlayout_free_layout_hdr,
1015
1016 .alloc_lseg = objlayout_alloc_lseg,
1017 .free_lseg = objlayout_free_lseg,
1018
1019 .read_pagelist = objlayout_read_pagelist,
1020 .write_pagelist = objlayout_write_pagelist,
1021 .pg_test = objio_pg_test,
1022
1023 .free_deviceid_node = objio_free_deviceid_node,
1024
1025 .encode_layoutcommit = objlayout_encode_layoutcommit,
1026 .encode_layoutreturn = objlayout_encode_layoutreturn,
1027};
1028
1029MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
1030MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
1031MODULE_LICENSE("GPL");
1032
1033static int __init
1034objlayout_init(void)
1035{
1036 int ret = pnfs_register_layoutdriver(&objlayout_type);
1037
1038 if (ret)
1039 printk(KERN_INFO
1040 "%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
1041 __func__, ret);
1042 else
1043 printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
1044 __func__);
1045 return ret;
1046}
1047
1048static void __exit
1049objlayout_exit(void)
1050{
1051 pnfs_unregister_layoutdriver(&objlayout_type);
1052 printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
1053 __func__);
1054}
1055
1056module_init(objlayout_init);
1057module_exit(objlayout_exit);
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
new file mode 100644
index 000000000000..dc3956c0de80
--- /dev/null
+++ b/fs/nfs/objlayout/objlayout.c
@@ -0,0 +1,712 @@
1/*
2 * pNFS Objects layout driver high level definitions
3 *
4 * Copyright (C) 2007 Panasas Inc. [year of first publication]
5 * All rights reserved.
6 *
7 * Benny Halevy <bhalevy@panasas.com>
8 * Boaz Harrosh <bharrosh@panasas.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * See the file COPYING included with this distribution for more details.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 *
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the Panasas company nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
28 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
34 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40#include <scsi/osd_initiator.h>
41#include "objlayout.h"
42
43#define NFSDBG_FACILITY NFSDBG_PNFS_LD
44/*
45 * Create a objlayout layout structure for the given inode and return it.
46 */
47struct pnfs_layout_hdr *
48objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
49{
50 struct objlayout *objlay;
51
52 objlay = kzalloc(sizeof(struct objlayout), gfp_flags);
53 if (objlay) {
54 spin_lock_init(&objlay->lock);
55 INIT_LIST_HEAD(&objlay->err_list);
56 }
57 dprintk("%s: Return %p\n", __func__, objlay);
58 return &objlay->pnfs_layout;
59}
60
61/*
62 * Free an objlayout layout structure
63 */
64void
65objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
66{
67 struct objlayout *objlay = OBJLAYOUT(lo);
68
69 dprintk("%s: objlay %p\n", __func__, objlay);
70
71 WARN_ON(!list_empty(&objlay->err_list));
72 kfree(objlay);
73}
74
75/*
76 * Unmarshall layout and store it in pnfslay.
77 */
78struct pnfs_layout_segment *
79objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay,
80 struct nfs4_layoutget_res *lgr,
81 gfp_t gfp_flags)
82{
83 int status = -ENOMEM;
84 struct xdr_stream stream;
85 struct xdr_buf buf = {
86 .pages = lgr->layoutp->pages,
87 .page_len = lgr->layoutp->len,
88 .buflen = lgr->layoutp->len,
89 .len = lgr->layoutp->len,
90 };
91 struct page *scratch;
92 struct pnfs_layout_segment *lseg;
93
94 dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay);
95
96 scratch = alloc_page(gfp_flags);
97 if (!scratch)
98 goto err_nofree;
99
100 xdr_init_decode(&stream, &buf, NULL);
101 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
102
103 status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags);
104 if (unlikely(status)) {
105 dprintk("%s: objio_alloc_lseg Return err %d\n", __func__,
106 status);
107 goto err;
108 }
109
110 __free_page(scratch);
111
112 dprintk("%s: Return %p\n", __func__, lseg);
113 return lseg;
114
115err:
116 __free_page(scratch);
117err_nofree:
118 dprintk("%s: Err Return=>%d\n", __func__, status);
119 return ERR_PTR(status);
120}
121
122/*
123 * Free a layout segement
124 */
125void
126objlayout_free_lseg(struct pnfs_layout_segment *lseg)
127{
128 dprintk("%s: freeing layout segment %p\n", __func__, lseg);
129
130 if (unlikely(!lseg))
131 return;
132
133 objio_free_lseg(lseg);
134}
135
136/*
137 * I/O Operations
138 */
139static inline u64
140end_offset(u64 start, u64 len)
141{
142 u64 end;
143
144 end = start + len;
145 return end >= start ? end : NFS4_MAX_UINT64;
146}
147
148/* last octet in a range */
149static inline u64
150last_byte_offset(u64 start, u64 len)
151{
152 u64 end;
153
154 BUG_ON(!len);
155 end = start + len;
156 return end > start ? end - 1 : NFS4_MAX_UINT64;
157}
158
159static struct objlayout_io_state *
160objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
161 struct page **pages,
162 unsigned pgbase,
163 loff_t offset,
164 size_t count,
165 struct pnfs_layout_segment *lseg,
166 void *rpcdata,
167 gfp_t gfp_flags)
168{
169 struct objlayout_io_state *state;
170 u64 lseg_end_offset;
171
172 dprintk("%s: allocating io_state\n", __func__);
173 if (objio_alloc_io_state(lseg, &state, gfp_flags))
174 return NULL;
175
176 BUG_ON(offset < lseg->pls_range.offset);
177 lseg_end_offset = end_offset(lseg->pls_range.offset,
178 lseg->pls_range.length);
179 BUG_ON(offset >= lseg_end_offset);
180 if (offset + count > lseg_end_offset) {
181 count = lseg->pls_range.length -
182 (offset - lseg->pls_range.offset);
183 dprintk("%s: truncated count %Zd\n", __func__, count);
184 }
185
186 if (pgbase > PAGE_SIZE) {
187 pages += pgbase >> PAGE_SHIFT;
188 pgbase &= ~PAGE_MASK;
189 }
190
191 INIT_LIST_HEAD(&state->err_list);
192 state->lseg = lseg;
193 state->rpcdata = rpcdata;
194 state->pages = pages;
195 state->pgbase = pgbase;
196 state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
197 state->offset = offset;
198 state->count = count;
199 state->sync = 0;
200
201 return state;
202}
203
204static void
205objlayout_free_io_state(struct objlayout_io_state *state)
206{
207 dprintk("%s: freeing io_state\n", __func__);
208 if (unlikely(!state))
209 return;
210
211 objio_free_io_state(state);
212}
213
214/*
215 * I/O done common code
216 */
217static void
218objlayout_iodone(struct objlayout_io_state *state)
219{
220 dprintk("%s: state %p status\n", __func__, state);
221
222 if (likely(state->status >= 0)) {
223 objlayout_free_io_state(state);
224 } else {
225 struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
226
227 spin_lock(&objlay->lock);
228 objlay->delta_space_valid = OBJ_DSU_INVALID;
229 list_add(&objlay->err_list, &state->err_list);
230 spin_unlock(&objlay->lock);
231 }
232}
233
234/*
235 * objlayout_io_set_result - Set an osd_error code on a specific osd comp.
236 *
237 * The @index component IO failed (error returned from target). Register
238 * the error for later reporting at layout-return.
239 */
240void
241objlayout_io_set_result(struct objlayout_io_state *state, unsigned index,
242 struct pnfs_osd_objid *pooid, int osd_error,
243 u64 offset, u64 length, bool is_write)
244{
245 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index];
246
247 BUG_ON(index >= state->num_comps);
248 if (osd_error) {
249 ioerr->oer_component = *pooid;
250 ioerr->oer_comp_offset = offset;
251 ioerr->oer_comp_length = length;
252 ioerr->oer_iswrite = is_write;
253 ioerr->oer_errno = osd_error;
254
255 dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
256 "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
257 __func__, index, ioerr->oer_errno,
258 ioerr->oer_iswrite,
259 _DEVID_LO(&ioerr->oer_component.oid_device_id),
260 _DEVID_HI(&ioerr->oer_component.oid_device_id),
261 ioerr->oer_component.oid_partition_id,
262 ioerr->oer_component.oid_object_id,
263 ioerr->oer_comp_offset,
264 ioerr->oer_comp_length);
265 } else {
266 /* User need not call if no error is reported */
267 ioerr->oer_errno = 0;
268 }
269}
270
271/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
272 * This is because the osd completion is called with ints-off from
273 * the block layer
274 */
275static void _rpc_read_complete(struct work_struct *work)
276{
277 struct rpc_task *task;
278 struct nfs_read_data *rdata;
279
280 dprintk("%s enter\n", __func__);
281 task = container_of(work, struct rpc_task, u.tk_work);
282 rdata = container_of(task, struct nfs_read_data, task);
283
284 pnfs_ld_read_done(rdata);
285}
286
287void
288objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
289{
290 int eof = state->eof;
291 struct nfs_read_data *rdata;
292
293 state->status = status;
294 dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof);
295 rdata = state->rpcdata;
296 rdata->task.tk_status = status;
297 if (status >= 0) {
298 rdata->res.count = status;
299 rdata->res.eof = eof;
300 }
301 objlayout_iodone(state);
302 /* must not use state after this point */
303
304 if (sync)
305 pnfs_ld_read_done(rdata);
306 else {
307 INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
308 schedule_work(&rdata->task.u.tk_work);
309 }
310}
311
312/*
313 * Perform sync or async reads.
314 */
315enum pnfs_try_status
316objlayout_read_pagelist(struct nfs_read_data *rdata)
317{
318 loff_t offset = rdata->args.offset;
319 size_t count = rdata->args.count;
320 struct objlayout_io_state *state;
321 ssize_t status = 0;
322 loff_t eof;
323
324 dprintk("%s: Begin inode %p offset %llu count %d\n",
325 __func__, rdata->inode, offset, (int)count);
326
327 eof = i_size_read(rdata->inode);
328 if (unlikely(offset + count > eof)) {
329 if (offset >= eof) {
330 status = 0;
331 rdata->res.count = 0;
332 rdata->res.eof = 1;
333 goto out;
334 }
335 count = eof - offset;
336 }
337
338 state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
339 rdata->args.pages, rdata->args.pgbase,
340 offset, count,
341 rdata->lseg, rdata,
342 GFP_KERNEL);
343 if (unlikely(!state)) {
344 status = -ENOMEM;
345 goto out;
346 }
347
348 state->eof = state->offset + state->count >= eof;
349
350 status = objio_read_pagelist(state);
351 out:
352 dprintk("%s: Return status %Zd\n", __func__, status);
353 rdata->pnfs_error = status;
354 return PNFS_ATTEMPTED;
355}
356
357/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
358 * This is because the osd completion is called with ints-off from
359 * the block layer
360 */
361static void _rpc_write_complete(struct work_struct *work)
362{
363 struct rpc_task *task;
364 struct nfs_write_data *wdata;
365
366 dprintk("%s enter\n", __func__);
367 task = container_of(work, struct rpc_task, u.tk_work);
368 wdata = container_of(task, struct nfs_write_data, task);
369
370 pnfs_ld_write_done(wdata);
371}
372
373void
374objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
375 bool sync)
376{
377 struct nfs_write_data *wdata;
378
379 dprintk("%s: Begin\n", __func__);
380 wdata = state->rpcdata;
381 state->status = status;
382 wdata->task.tk_status = status;
383 if (status >= 0) {
384 wdata->res.count = status;
385 wdata->verf.committed = state->committed;
386 dprintk("%s: Return status %d committed %d\n",
387 __func__, wdata->task.tk_status,
388 wdata->verf.committed);
389 } else
390 dprintk("%s: Return status %d\n",
391 __func__, wdata->task.tk_status);
392 objlayout_iodone(state);
393 /* must not use state after this point */
394
395 if (sync)
396 pnfs_ld_write_done(wdata);
397 else {
398 INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
399 schedule_work(&wdata->task.u.tk_work);
400 }
401}
402
403/*
404 * Perform sync or async writes.
405 */
406enum pnfs_try_status
407objlayout_write_pagelist(struct nfs_write_data *wdata,
408 int how)
409{
410 struct objlayout_io_state *state;
411 ssize_t status;
412
413 dprintk("%s: Begin inode %p offset %llu count %u\n",
414 __func__, wdata->inode, wdata->args.offset, wdata->args.count);
415
416 state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
417 wdata->args.pages,
418 wdata->args.pgbase,
419 wdata->args.offset,
420 wdata->args.count,
421 wdata->lseg, wdata,
422 GFP_NOFS);
423 if (unlikely(!state)) {
424 status = -ENOMEM;
425 goto out;
426 }
427
428 state->sync = how & FLUSH_SYNC;
429
430 status = objio_write_pagelist(state, how & FLUSH_STABLE);
431 out:
432 dprintk("%s: Return status %Zd\n", __func__, status);
433 wdata->pnfs_error = status;
434 return PNFS_ATTEMPTED;
435}
436
437void
438objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay,
439 struct xdr_stream *xdr,
440 const struct nfs4_layoutcommit_args *args)
441{
442 struct objlayout *objlay = OBJLAYOUT(pnfslay);
443 struct pnfs_osd_layoutupdate lou;
444 __be32 *start;
445
446 dprintk("%s: Begin\n", __func__);
447
448 spin_lock(&objlay->lock);
449 lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID);
450 lou.dsu_delta = objlay->delta_space_used;
451 objlay->delta_space_used = 0;
452 objlay->delta_space_valid = OBJ_DSU_INIT;
453 lou.olu_ioerr_flag = !list_empty(&objlay->err_list);
454 spin_unlock(&objlay->lock);
455
456 start = xdr_reserve_space(xdr, 4);
457
458 BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou));
459
460 *start = cpu_to_be32((xdr->p - start - 1) * 4);
461
462 dprintk("%s: Return delta_space_used %lld err %d\n", __func__,
463 lou.dsu_delta, lou.olu_ioerr_flag);
464}
465
466static int
467err_prio(u32 oer_errno)
468{
469 switch (oer_errno) {
470 case 0:
471 return 0;
472
473 case PNFS_OSD_ERR_RESOURCE:
474 return OSD_ERR_PRI_RESOURCE;
475 case PNFS_OSD_ERR_BAD_CRED:
476 return OSD_ERR_PRI_BAD_CRED;
477 case PNFS_OSD_ERR_NO_ACCESS:
478 return OSD_ERR_PRI_NO_ACCESS;
479 case PNFS_OSD_ERR_UNREACHABLE:
480 return OSD_ERR_PRI_UNREACHABLE;
481 case PNFS_OSD_ERR_NOT_FOUND:
482 return OSD_ERR_PRI_NOT_FOUND;
483 case PNFS_OSD_ERR_NO_SPACE:
484 return OSD_ERR_PRI_NO_SPACE;
485 default:
486 WARN_ON(1);
487 /* fallthrough */
488 case PNFS_OSD_ERR_EIO:
489 return OSD_ERR_PRI_EIO;
490 }
491}
492
493static void
494merge_ioerr(struct pnfs_osd_ioerr *dest_err,
495 const struct pnfs_osd_ioerr *src_err)
496{
497 u64 dest_end, src_end;
498
499 if (!dest_err->oer_errno) {
500 *dest_err = *src_err;
501 /* accumulated device must be blank */
502 memset(&dest_err->oer_component.oid_device_id, 0,
503 sizeof(dest_err->oer_component.oid_device_id));
504
505 return;
506 }
507
508 if (dest_err->oer_component.oid_partition_id !=
509 src_err->oer_component.oid_partition_id)
510 dest_err->oer_component.oid_partition_id = 0;
511
512 if (dest_err->oer_component.oid_object_id !=
513 src_err->oer_component.oid_object_id)
514 dest_err->oer_component.oid_object_id = 0;
515
516 if (dest_err->oer_comp_offset > src_err->oer_comp_offset)
517 dest_err->oer_comp_offset = src_err->oer_comp_offset;
518
519 dest_end = end_offset(dest_err->oer_comp_offset,
520 dest_err->oer_comp_length);
521 src_end = end_offset(src_err->oer_comp_offset,
522 src_err->oer_comp_length);
523 if (dest_end < src_end)
524 dest_end = src_end;
525
526 dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset;
527
528 if ((src_err->oer_iswrite == dest_err->oer_iswrite) &&
529 (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) {
530 dest_err->oer_errno = src_err->oer_errno;
531 } else if (src_err->oer_iswrite) {
532 dest_err->oer_iswrite = true;
533 dest_err->oer_errno = src_err->oer_errno;
534 }
535}
536
537static void
538encode_accumulated_error(struct objlayout *objlay, __be32 *p)
539{
540 struct objlayout_io_state *state, *tmp;
541 struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
542
543 list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
544 unsigned i;
545
546 for (i = 0; i < state->num_comps; i++) {
547 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
548
549 if (!ioerr->oer_errno)
550 continue;
551
552 printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d "
553 "dev(%llx:%llx) par=0x%llx obj=0x%llx "
554 "offset=0x%llx length=0x%llx\n",
555 __func__, i, ioerr->oer_errno,
556 ioerr->oer_iswrite,
557 _DEVID_LO(&ioerr->oer_component.oid_device_id),
558 _DEVID_HI(&ioerr->oer_component.oid_device_id),
559 ioerr->oer_component.oid_partition_id,
560 ioerr->oer_component.oid_object_id,
561 ioerr->oer_comp_offset,
562 ioerr->oer_comp_length);
563
564 merge_ioerr(&accumulated_err, ioerr);
565 }
566 list_del(&state->err_list);
567 objlayout_free_io_state(state);
568 }
569
570 pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
571}
572
573void
574objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
575 struct xdr_stream *xdr,
576 const struct nfs4_layoutreturn_args *args)
577{
578 struct objlayout *objlay = OBJLAYOUT(pnfslay);
579 struct objlayout_io_state *state, *tmp;
580 __be32 *start;
581
582 dprintk("%s: Begin\n", __func__);
583 start = xdr_reserve_space(xdr, 4);
584 BUG_ON(!start);
585
586 spin_lock(&objlay->lock);
587
588 list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
589 __be32 *last_xdr = NULL, *p;
590 unsigned i;
591 int res = 0;
592
593 for (i = 0; i < state->num_comps; i++) {
594 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
595
596 if (!ioerr->oer_errno)
597 continue;
598
599 dprintk("%s: err[%d]: errno=%d is_write=%d "
600 "dev(%llx:%llx) par=0x%llx obj=0x%llx "
601 "offset=0x%llx length=0x%llx\n",
602 __func__, i, ioerr->oer_errno,
603 ioerr->oer_iswrite,
604 _DEVID_LO(&ioerr->oer_component.oid_device_id),
605 _DEVID_HI(&ioerr->oer_component.oid_device_id),
606 ioerr->oer_component.oid_partition_id,
607 ioerr->oer_component.oid_object_id,
608 ioerr->oer_comp_offset,
609 ioerr->oer_comp_length);
610
611 p = pnfs_osd_xdr_ioerr_reserve_space(xdr);
612 if (unlikely(!p)) {
613 res = -E2BIG;
614 break; /* accumulated_error */
615 }
616
617 last_xdr = p;
618 pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]);
619 }
620
621 /* TODO: use xdr_write_pages */
622 if (unlikely(res)) {
623 /* no space for even one error descriptor */
624 BUG_ON(!last_xdr);
625
626 /* we've encountered a situation with lots and lots of
627 * errors and no space to encode them all. Use the last
628 * available slot to report the union of all the
629 * remaining errors.
630 */
631 encode_accumulated_error(objlay, last_xdr);
632 goto loop_done;
633 }
634 list_del(&state->err_list);
635 objlayout_free_io_state(state);
636 }
637loop_done:
638 spin_unlock(&objlay->lock);
639
640 *start = cpu_to_be32((xdr->p - start - 1) * 4);
641 dprintk("%s: Return\n", __func__);
642}
643
644
645/*
646 * Get Device Info API for io engines
647 */
648struct objlayout_deviceinfo {
649 struct page *page;
650 struct pnfs_osd_deviceaddr da; /* This must be last */
651};
652
653/* Initialize and call nfs_getdeviceinfo, then decode and return a
654 * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
655 * should be called.
656 */
657int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
658 struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
659 gfp_t gfp_flags)
660{
661 struct objlayout_deviceinfo *odi;
662 struct pnfs_device pd;
663 struct super_block *sb;
664 struct page *page, **pages;
665 u32 *p;
666 int err;
667
668 page = alloc_page(gfp_flags);
669 if (!page)
670 return -ENOMEM;
671
672 pages = &page;
673 pd.pages = pages;
674
675 memcpy(&pd.dev_id, d_id, sizeof(*d_id));
676 pd.layout_type = LAYOUT_OSD2_OBJECTS;
677 pd.pages = &page;
678 pd.pgbase = 0;
679 pd.pglen = PAGE_SIZE;
680 pd.mincount = 0;
681
682 sb = pnfslay->plh_inode->i_sb;
683 err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd);
684 dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
685 if (err)
686 goto err_out;
687
688 p = page_address(page);
689 odi = kzalloc(sizeof(*odi), gfp_flags);
690 if (!odi) {
691 err = -ENOMEM;
692 goto err_out;
693 }
694 pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
695 odi->page = page;
696 *deviceaddr = &odi->da;
697 return 0;
698
699err_out:
700 __free_page(page);
701 return err;
702}
703
704void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
705{
706 struct objlayout_deviceinfo *odi = container_of(deviceaddr,
707 struct objlayout_deviceinfo,
708 da);
709
710 __free_page(odi->page);
711 kfree(odi);
712}
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
new file mode 100644
index 000000000000..a8244c8e042d
--- /dev/null
+++ b/fs/nfs/objlayout/objlayout.h
@@ -0,0 +1,187 @@
1/*
2 * Data types and function declerations for interfacing with the
3 * pNFS standard object layout driver.
4 *
5 * Copyright (C) 2007 Panasas Inc. [year of first publication]
6 * All rights reserved.
7 *
8 * Benny Halevy <bhalevy@panasas.com>
9 * Boaz Harrosh <bharrosh@panasas.com>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2
13 * See the file COPYING included with this distribution for more details.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 *
19 * 1. Redistributions of source code must retain the above copyright
20 * notice, this list of conditions and the following disclaimer.
21 * 2. Redistributions in binary form must reproduce the above copyright
22 * notice, this list of conditions and the following disclaimer in the
23 * documentation and/or other materials provided with the distribution.
24 * 3. Neither the name of the Panasas company nor the names of its
25 * contributors may be used to endorse or promote products derived
26 * from this software without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
29 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
30 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
31 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
36 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
37 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
38 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
39 */
40
41#ifndef _OBJLAYOUT_H
42#define _OBJLAYOUT_H
43
44#include <linux/nfs_fs.h>
45#include <linux/pnfs_osd_xdr.h>
46#include "../pnfs.h"
47
48/*
49 * per-inode layout
50 */
51struct objlayout {
52 struct pnfs_layout_hdr pnfs_layout;
53
54 /* for layout_commit */
55 enum osd_delta_space_valid_enum {
56 OBJ_DSU_INIT = 0,
57 OBJ_DSU_VALID,
58 OBJ_DSU_INVALID,
59 } delta_space_valid;
60 s64 delta_space_used; /* consumed by write ops */
61
62 /* for layout_return */
63 spinlock_t lock;
64 struct list_head err_list;
65};
66
67static inline struct objlayout *
68OBJLAYOUT(struct pnfs_layout_hdr *lo)
69{
70 return container_of(lo, struct objlayout, pnfs_layout);
71}
72
73/*
74 * per-I/O operation state
75 * embedded in objects provider io_state data structure
76 */
77struct objlayout_io_state {
78 struct pnfs_layout_segment *lseg;
79
80 struct page **pages;
81 unsigned pgbase;
82 unsigned nr_pages;
83 unsigned long count;
84 loff_t offset;
85 bool sync;
86
87 void *rpcdata;
88 int status; /* res */
89 int eof; /* res */
90 int committed; /* res */
91
92 /* Error reporting (layout_return) */
93 struct list_head err_list;
94 unsigned num_comps;
95 /* Pointer to array of error descriptors of size num_comps.
96 * It should contain as many entries as devices in the osd_layout
97 * that participate in the I/O. It is up to the io_engine to allocate
98 * needed space and set num_comps.
99 */
100 struct pnfs_osd_ioerr *ioerrs;
101};
102
103/*
104 * Raid engine I/O API
105 */
106extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
107 struct pnfs_layout_hdr *pnfslay,
108 struct pnfs_layout_range *range,
109 struct xdr_stream *xdr,
110 gfp_t gfp_flags);
111extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
112
113extern int objio_alloc_io_state(
114 struct pnfs_layout_segment *lseg,
115 struct objlayout_io_state **outp,
116 gfp_t gfp_flags);
117extern void objio_free_io_state(struct objlayout_io_state *state);
118
119extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
120extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
121 bool stable);
122
123/*
124 * callback API
125 */
126extern void objlayout_io_set_result(struct objlayout_io_state *state,
127 unsigned index, struct pnfs_osd_objid *pooid,
128 int osd_error, u64 offset, u64 length, bool is_write);
129
130static inline void
131objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
132{
133 struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
134
135 /* If one of the I/Os errored out and the delta_space_used was
136 * invalid we render the complete report as invalid. Protocol mandate
137 * the DSU be accurate or not reported.
138 */
139 spin_lock(&objlay->lock);
140 if (objlay->delta_space_valid != OBJ_DSU_INVALID) {
141 objlay->delta_space_valid = OBJ_DSU_VALID;
142 objlay->delta_space_used += space_used;
143 }
144 spin_unlock(&objlay->lock);
145}
146
147extern void objlayout_read_done(struct objlayout_io_state *state,
148 ssize_t status, bool sync);
149extern void objlayout_write_done(struct objlayout_io_state *state,
150 ssize_t status, bool sync);
151
152extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
153 struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
154 gfp_t gfp_flags);
155extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
156
157/*
158 * exported generic objects function vectors
159 */
160
161extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags);
162extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *);
163
164extern struct pnfs_layout_segment *objlayout_alloc_lseg(
165 struct pnfs_layout_hdr *,
166 struct nfs4_layoutget_res *,
167 gfp_t gfp_flags);
168extern void objlayout_free_lseg(struct pnfs_layout_segment *);
169
170extern enum pnfs_try_status objlayout_read_pagelist(
171 struct nfs_read_data *);
172
173extern enum pnfs_try_status objlayout_write_pagelist(
174 struct nfs_write_data *,
175 int how);
176
177extern void objlayout_encode_layoutcommit(
178 struct pnfs_layout_hdr *,
179 struct xdr_stream *,
180 const struct nfs4_layoutcommit_args *);
181
182extern void objlayout_encode_layoutreturn(
183 struct pnfs_layout_hdr *,
184 struct xdr_stream *,
185 const struct nfs4_layoutreturn_args *);
186
187#endif /* _OBJLAYOUT_H */
diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
new file mode 100644
index 000000000000..16fc758e9123
--- /dev/null
+++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
@@ -0,0 +1,412 @@
1/*
2 * Object-Based pNFS Layout XDR layer
3 *
4 * Copyright (C) 2007 Panasas Inc. [year of first publication]
5 * All rights reserved.
6 *
7 * Benny Halevy <bhalevy@panasas.com>
8 * Boaz Harrosh <bharrosh@panasas.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * See the file COPYING included with this distribution for more details.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 *
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the Panasas company nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
28 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
34 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40#include <linux/pnfs_osd_xdr.h>
41
42#define NFSDBG_FACILITY NFSDBG_PNFS_LD
43
44/*
45 * The following implementation is based on RFC5664
46 */
47
48/*
49 * struct pnfs_osd_objid {
50 * struct nfs4_deviceid oid_device_id;
51 * u64 oid_partition_id;
52 * u64 oid_object_id;
53 * }; // xdr size 32 bytes
54 */
55static __be32 *
56_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid)
57{
58 p = xdr_decode_opaque_fixed(p, objid->oid_device_id.data,
59 sizeof(objid->oid_device_id.data));
60
61 p = xdr_decode_hyper(p, &objid->oid_partition_id);
62 p = xdr_decode_hyper(p, &objid->oid_object_id);
63 return p;
64}
65/*
66 * struct pnfs_osd_opaque_cred {
67 * u32 cred_len;
68 * void *cred;
69 * }; // xdr size [variable]
70 * The return pointers are from the xdr buffer
71 */
72static int
73_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred,
74 struct xdr_stream *xdr)
75{
76 __be32 *p = xdr_inline_decode(xdr, 1);
77
78 if (!p)
79 return -EINVAL;
80
81 opaque_cred->cred_len = be32_to_cpu(*p++);
82
83 p = xdr_inline_decode(xdr, opaque_cred->cred_len);
84 if (!p)
85 return -EINVAL;
86
87 opaque_cred->cred = p;
88 return 0;
89}
90
91/*
92 * struct pnfs_osd_object_cred {
93 * struct pnfs_osd_objid oc_object_id;
94 * u32 oc_osd_version;
95 * u32 oc_cap_key_sec;
96 * struct pnfs_osd_opaque_cred oc_cap_key
97 * struct pnfs_osd_opaque_cred oc_cap;
98 * }; // xdr size 32 + 4 + 4 + [variable] + [variable]
99 */
100static int
101_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp,
102 struct xdr_stream *xdr)
103{
104 __be32 *p = xdr_inline_decode(xdr, 32 + 4 + 4);
105 int ret;
106
107 if (!p)
108 return -EIO;
109
110 p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
111 comp->oc_osd_version = be32_to_cpup(p++);
112 comp->oc_cap_key_sec = be32_to_cpup(p);
113
114 ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr);
115 if (unlikely(ret))
116 return ret;
117
118 ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr);
119 return ret;
120}
121
122/*
123 * struct pnfs_osd_data_map {
124 * u32 odm_num_comps;
125 * u64 odm_stripe_unit;
126 * u32 odm_group_width;
127 * u32 odm_group_depth;
128 * u32 odm_mirror_cnt;
129 * u32 odm_raid_algorithm;
130 * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4
131 */
132static inline int
133_osd_data_map_xdr_sz(void)
134{
135 return 4 + 8 + 4 + 4 + 4 + 4;
136}
137
138static __be32 *
139_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map)
140{
141 data_map->odm_num_comps = be32_to_cpup(p++);
142 p = xdr_decode_hyper(p, &data_map->odm_stripe_unit);
143 data_map->odm_group_width = be32_to_cpup(p++);
144 data_map->odm_group_depth = be32_to_cpup(p++);
145 data_map->odm_mirror_cnt = be32_to_cpup(p++);
146 data_map->odm_raid_algorithm = be32_to_cpup(p++);
147 dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u "
148 "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n",
149 __func__,
150 data_map->odm_num_comps,
151 (unsigned long long)data_map->odm_stripe_unit,
152 data_map->odm_group_width,
153 data_map->odm_group_depth,
154 data_map->odm_mirror_cnt,
155 data_map->odm_raid_algorithm);
156 return p;
157}
158
159int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
160 struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr)
161{
162 __be32 *p;
163
164 memset(iter, 0, sizeof(*iter));
165
166 p = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4);
167 if (unlikely(!p))
168 return -EINVAL;
169
170 p = _osd_xdr_decode_data_map(p, &layout->olo_map);
171 layout->olo_comps_index = be32_to_cpup(p++);
172 layout->olo_num_comps = be32_to_cpup(p++);
173 iter->total_comps = layout->olo_num_comps;
174 return 0;
175}
176
177bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp,
178 struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr,
179 int *err)
180{
181 BUG_ON(iter->decoded_comps > iter->total_comps);
182 if (iter->decoded_comps == iter->total_comps)
183 return false;
184
185 *err = _osd_xdr_decode_object_cred(comp, xdr);
186 if (unlikely(*err)) {
187 dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d "
188 "total_comps=%d\n", __func__, *err,
189 iter->decoded_comps, iter->total_comps);
190 return false; /* stop the loop */
191 }
192 dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx "
193 "key_len=%u cap_len=%u\n",
194 __func__,
195 _DEVID_LO(&comp->oc_object_id.oid_device_id),
196 _DEVID_HI(&comp->oc_object_id.oid_device_id),
197 comp->oc_object_id.oid_partition_id,
198 comp->oc_object_id.oid_object_id,
199 comp->oc_cap_key.cred_len, comp->oc_cap.cred_len);
200
201 iter->decoded_comps++;
202 return true;
203}
204
205/*
206 * Get Device Information Decoding
207 *
208 * Note: since Device Information is currently done synchronously, all
209 * variable strings fields are left inside the rpc buffer and are only
210 * pointed to by the pnfs_osd_deviceaddr members. So the read buffer
211 * should not be freed while the returned information is in use.
212 */
213/*
214 *struct nfs4_string {
215 * unsigned int len;
216 * char *data;
217 *}; // size [variable]
218 * NOTE: Returned string points to inside the XDR buffer
219 */
220static __be32 *
221__read_u8_opaque(__be32 *p, struct nfs4_string *str)
222{
223 str->len = be32_to_cpup(p++);
224 str->data = (char *)p;
225
226 p += XDR_QUADLEN(str->len);
227 return p;
228}
229
230/*
231 * struct pnfs_osd_targetid {
232 * u32 oti_type;
233 * struct nfs4_string oti_scsi_device_id;
234 * };// size 4 + [variable]
235 */
236static __be32 *
237__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid)
238{
239 u32 oti_type;
240
241 oti_type = be32_to_cpup(p++);
242 targetid->oti_type = oti_type;
243
244 switch (oti_type) {
245 case OBJ_TARGET_SCSI_NAME:
246 case OBJ_TARGET_SCSI_DEVICE_ID:
247 p = __read_u8_opaque(p, &targetid->oti_scsi_device_id);
248 }
249
250 return p;
251}
252
253/*
254 * struct pnfs_osd_net_addr {
255 * struct nfs4_string r_netid;
256 * struct nfs4_string r_addr;
257 * };
258 */
259static __be32 *
260__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr)
261{
262 p = __read_u8_opaque(p, &netaddr->r_netid);
263 p = __read_u8_opaque(p, &netaddr->r_addr);
264
265 return p;
266}
267
268/*
269 * struct pnfs_osd_targetaddr {
270 * u32 ota_available;
271 * struct pnfs_osd_net_addr ota_netaddr;
272 * };
273 */
274static __be32 *
275__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr)
276{
277 u32 ota_available;
278
279 ota_available = be32_to_cpup(p++);
280 targetaddr->ota_available = ota_available;
281
282 if (ota_available)
283 p = __read_net_addr(p, &targetaddr->ota_netaddr);
284
285
286 return p;
287}
288
289/*
290 * struct pnfs_osd_deviceaddr {
291 * struct pnfs_osd_targetid oda_targetid;
292 * struct pnfs_osd_targetaddr oda_targetaddr;
293 * u8 oda_lun[8];
294 * struct nfs4_string oda_systemid;
295 * struct pnfs_osd_object_cred oda_root_obj_cred;
296 * struct nfs4_string oda_osdname;
297 * };
298 */
299
300/* We need this version for the pnfs_osd_xdr_decode_deviceaddr which does
301 * not have an xdr_stream
302 */
303static __be32 *
304__read_opaque_cred(__be32 *p,
305 struct pnfs_osd_opaque_cred *opaque_cred)
306{
307 opaque_cred->cred_len = be32_to_cpu(*p++);
308 opaque_cred->cred = p;
309 return p + XDR_QUADLEN(opaque_cred->cred_len);
310}
311
312static __be32 *
313__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp)
314{
315 p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
316 comp->oc_osd_version = be32_to_cpup(p++);
317 comp->oc_cap_key_sec = be32_to_cpup(p++);
318
319 p = __read_opaque_cred(p, &comp->oc_cap_key);
320 p = __read_opaque_cred(p, &comp->oc_cap);
321 return p;
322}
323
324void pnfs_osd_xdr_decode_deviceaddr(
325 struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p)
326{
327 p = __read_targetid(p, &deviceaddr->oda_targetid);
328
329 p = __read_targetaddr(p, &deviceaddr->oda_targetaddr);
330
331 p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun,
332 sizeof(deviceaddr->oda_lun));
333
334 p = __read_u8_opaque(p, &deviceaddr->oda_systemid);
335
336 p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred);
337
338 p = __read_u8_opaque(p, &deviceaddr->oda_osdname);
339
340 /* libosd likes this terminated in dbg. It's last, so no problems */
341 deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0;
342}
343
344/*
345 * struct pnfs_osd_layoutupdate {
346 * u32 dsu_valid;
347 * s64 dsu_delta;
348 * u32 olu_ioerr_flag;
349 * }; xdr size 4 + 8 + 4
350 */
351int
352pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
353 struct pnfs_osd_layoutupdate *lou)
354{
355 __be32 *p = xdr_reserve_space(xdr, 4 + 8 + 4);
356
357 if (!p)
358 return -E2BIG;
359
360 *p++ = cpu_to_be32(lou->dsu_valid);
361 if (lou->dsu_valid)
362 p = xdr_encode_hyper(p, lou->dsu_delta);
363 *p++ = cpu_to_be32(lou->olu_ioerr_flag);
364 return 0;
365}
366
367/*
368 * struct pnfs_osd_objid {
369 * struct nfs4_deviceid oid_device_id;
370 * u64 oid_partition_id;
371 * u64 oid_object_id;
372 * }; // xdr size 32 bytes
373 */
374static inline __be32 *
375pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id)
376{
377 p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data,
378 sizeof(object_id->oid_device_id.data));
379 p = xdr_encode_hyper(p, object_id->oid_partition_id);
380 p = xdr_encode_hyper(p, object_id->oid_object_id);
381
382 return p;
383}
384
385/*
386 * struct pnfs_osd_ioerr {
387 * struct pnfs_osd_objid oer_component;
388 * u64 oer_comp_offset;
389 * u64 oer_comp_length;
390 * u32 oer_iswrite;
391 * u32 oer_errno;
392 * }; // xdr size 32 + 24 bytes
393 */
394void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr)
395{
396 p = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component);
397 p = xdr_encode_hyper(p, ioerr->oer_comp_offset);
398 p = xdr_encode_hyper(p, ioerr->oer_comp_length);
399 *p++ = cpu_to_be32(ioerr->oer_iswrite);
400 *p = cpu_to_be32(ioerr->oer_errno);
401}
402
403__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr)
404{
405 __be32 *p;
406
407 p = xdr_reserve_space(xdr, 32 + 24);
408 if (unlikely(!p))
409 dprintk("%s: out of xdr space\n", __func__);
410
411 return p;
412}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index c80add6e2213..7913961aff22 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -204,6 +204,21 @@ nfs_wait_on_request(struct nfs_page *req)
204 TASK_UNINTERRUPTIBLE); 204 TASK_UNINTERRUPTIBLE);
205} 205}
206 206
207static bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req)
208{
209 /*
210 * FIXME: ideally we should be able to coalesce all requests
211 * that are not block boundary aligned, but currently this
212 * is problematic for the case of bsize < PAGE_CACHE_SIZE,
213 * since nfs_flush_multi and nfs_pagein_multi assume you
214 * can have only one struct nfs_page.
215 */
216 if (desc->pg_bsize < PAGE_SIZE)
217 return 0;
218
219 return desc->pg_count + req->wb_bytes <= desc->pg_bsize;
220}
221
207/** 222/**
208 * nfs_pageio_init - initialise a page io descriptor 223 * nfs_pageio_init - initialise a page io descriptor
209 * @desc: pointer to descriptor 224 * @desc: pointer to descriptor
@@ -229,6 +244,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
229 desc->pg_ioflags = io_flags; 244 desc->pg_ioflags = io_flags;
230 desc->pg_error = 0; 245 desc->pg_error = 0;
231 desc->pg_lseg = NULL; 246 desc->pg_lseg = NULL;
247 desc->pg_test = nfs_generic_pg_test;
248 pnfs_pageio_init(desc, inode);
232} 249}
233 250
234/** 251/**
@@ -242,29 +259,23 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
242 * 259 *
243 * Return 'true' if this is the case, else return 'false'. 260 * Return 'true' if this is the case, else return 'false'.
244 */ 261 */
245static int nfs_can_coalesce_requests(struct nfs_page *prev, 262static bool nfs_can_coalesce_requests(struct nfs_page *prev,
246 struct nfs_page *req, 263 struct nfs_page *req,
247 struct nfs_pageio_descriptor *pgio) 264 struct nfs_pageio_descriptor *pgio)
248{ 265{
249 if (req->wb_context->cred != prev->wb_context->cred) 266 if (req->wb_context->cred != prev->wb_context->cred)
250 return 0; 267 return false;
251 if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) 268 if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner)
252 return 0; 269 return false;
253 if (req->wb_context->state != prev->wb_context->state) 270 if (req->wb_context->state != prev->wb_context->state)
254 return 0; 271 return false;
255 if (req->wb_index != (prev->wb_index + 1)) 272 if (req->wb_index != (prev->wb_index + 1))
256 return 0; 273 return false;
257 if (req->wb_pgbase != 0) 274 if (req->wb_pgbase != 0)
258 return 0; 275 return false;
259 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) 276 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
260 return 0; 277 return false;
261 /* 278 return pgio->pg_test(pgio, prev, req);
262 * Non-whole file layouts need to check that req is inside of
263 * pgio->pg_lseg.
264 */
265 if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
266 return 0;
267 return 1;
268} 279}
269 280
270/** 281/**
@@ -278,31 +289,18 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
278static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, 289static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
279 struct nfs_page *req) 290 struct nfs_page *req)
280{ 291{
281 size_t newlen = req->wb_bytes;
282
283 if (desc->pg_count != 0) { 292 if (desc->pg_count != 0) {
284 struct nfs_page *prev; 293 struct nfs_page *prev;
285 294
286 /*
287 * FIXME: ideally we should be able to coalesce all requests
288 * that are not block boundary aligned, but currently this
289 * is problematic for the case of bsize < PAGE_CACHE_SIZE,
290 * since nfs_flush_multi and nfs_pagein_multi assume you
291 * can have only one struct nfs_page.
292 */
293 if (desc->pg_bsize < PAGE_SIZE)
294 return 0;
295 newlen += desc->pg_count;
296 if (newlen > desc->pg_bsize)
297 return 0;
298 prev = nfs_list_entry(desc->pg_list.prev); 295 prev = nfs_list_entry(desc->pg_list.prev);
299 if (!nfs_can_coalesce_requests(prev, req, desc)) 296 if (!nfs_can_coalesce_requests(prev, req, desc))
300 return 0; 297 return 0;
301 } else 298 } else {
302 desc->pg_base = req->wb_pgbase; 299 desc->pg_base = req->wb_pgbase;
300 }
303 nfs_list_remove_request(req); 301 nfs_list_remove_request(req);
304 nfs_list_add_request(req, &desc->pg_list); 302 nfs_list_add_request(req, &desc->pg_list);
305 desc->pg_count = newlen; 303 desc->pg_count += req->wb_bytes;
306 return 1; 304 return 1;
307} 305}
308 306
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 101c85a3644e..8c1309d852a6 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -177,13 +177,28 @@ get_layout_hdr(struct pnfs_layout_hdr *lo)
177 atomic_inc(&lo->plh_refcount); 177 atomic_inc(&lo->plh_refcount);
178} 178}
179 179
180static struct pnfs_layout_hdr *
181pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
182{
183 struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
184 return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino, gfp_flags) :
185 kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags);
186}
187
188static void
189pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
190{
191 struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
192 return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
193}
194
180static void 195static void
181destroy_layout_hdr(struct pnfs_layout_hdr *lo) 196destroy_layout_hdr(struct pnfs_layout_hdr *lo)
182{ 197{
183 dprintk("%s: freeing layout cache %p\n", __func__, lo); 198 dprintk("%s: freeing layout cache %p\n", __func__, lo);
184 BUG_ON(!list_empty(&lo->plh_layouts)); 199 BUG_ON(!list_empty(&lo->plh_layouts));
185 NFS_I(lo->plh_inode)->layout = NULL; 200 NFS_I(lo->plh_inode)->layout = NULL;
186 kfree(lo); 201 pnfs_free_layout_hdr(lo);
187} 202}
188 203
189static void 204static void
@@ -228,7 +243,7 @@ put_lseg_common(struct pnfs_layout_segment *lseg)
228{ 243{
229 struct inode *inode = lseg->pls_layout->plh_inode; 244 struct inode *inode = lseg->pls_layout->plh_inode;
230 245
231 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 246 WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
232 list_del_init(&lseg->pls_list); 247 list_del_init(&lseg->pls_list);
233 if (list_empty(&lseg->pls_layout->plh_segs)) { 248 if (list_empty(&lseg->pls_layout->plh_segs)) {
234 set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags); 249 set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
@@ -261,11 +276,72 @@ put_lseg(struct pnfs_layout_segment *lseg)
261} 276}
262EXPORT_SYMBOL_GPL(put_lseg); 277EXPORT_SYMBOL_GPL(put_lseg);
263 278
279static inline u64
280end_offset(u64 start, u64 len)
281{
282 u64 end;
283
284 end = start + len;
285 return end >= start ? end : NFS4_MAX_UINT64;
286}
287
288/* last octet in a range */
289static inline u64
290last_byte_offset(u64 start, u64 len)
291{
292 u64 end;
293
294 BUG_ON(!len);
295 end = start + len;
296 return end > start ? end - 1 : NFS4_MAX_UINT64;
297}
298
299/*
300 * is l2 fully contained in l1?
301 * start1 end1
302 * [----------------------------------)
303 * start2 end2
304 * [----------------)
305 */
306static inline int
307lo_seg_contained(struct pnfs_layout_range *l1,
308 struct pnfs_layout_range *l2)
309{
310 u64 start1 = l1->offset;
311 u64 end1 = end_offset(start1, l1->length);
312 u64 start2 = l2->offset;
313 u64 end2 = end_offset(start2, l2->length);
314
315 return (start1 <= start2) && (end1 >= end2);
316}
317
318/*
319 * is l1 and l2 intersecting?
320 * start1 end1
321 * [----------------------------------)
322 * start2 end2
323 * [----------------)
324 */
325static inline int
326lo_seg_intersecting(struct pnfs_layout_range *l1,
327 struct pnfs_layout_range *l2)
328{
329 u64 start1 = l1->offset;
330 u64 end1 = end_offset(start1, l1->length);
331 u64 start2 = l2->offset;
332 u64 end2 = end_offset(start2, l2->length);
333
334 return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
335 (end2 == NFS4_MAX_UINT64 || end2 > start1);
336}
337
264static bool 338static bool
265should_free_lseg(u32 lseg_iomode, u32 recall_iomode) 339should_free_lseg(struct pnfs_layout_range *lseg_range,
340 struct pnfs_layout_range *recall_range)
266{ 341{
267 return (recall_iomode == IOMODE_ANY || 342 return (recall_range->iomode == IOMODE_ANY ||
268 lseg_iomode == recall_iomode); 343 lseg_range->iomode == recall_range->iomode) &&
344 lo_seg_intersecting(lseg_range, recall_range);
269} 345}
270 346
271/* Returns 1 if lseg is removed from list, 0 otherwise */ 347/* Returns 1 if lseg is removed from list, 0 otherwise */
@@ -296,7 +372,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
296int 372int
297mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, 373mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
298 struct list_head *tmp_list, 374 struct list_head *tmp_list,
299 u32 iomode) 375 struct pnfs_layout_range *recall_range)
300{ 376{
301 struct pnfs_layout_segment *lseg, *next; 377 struct pnfs_layout_segment *lseg, *next;
302 int invalid = 0, removed = 0; 378 int invalid = 0, removed = 0;
@@ -309,7 +385,8 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
309 return 0; 385 return 0;
310 } 386 }
311 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) 387 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
312 if (should_free_lseg(lseg->pls_range.iomode, iomode)) { 388 if (!recall_range ||
389 should_free_lseg(&lseg->pls_range, recall_range)) {
313 dprintk("%s: freeing lseg %p iomode %d " 390 dprintk("%s: freeing lseg %p iomode %d "
314 "offset %llu length %llu\n", __func__, 391 "offset %llu length %llu\n", __func__,
315 lseg, lseg->pls_range.iomode, lseg->pls_range.offset, 392 lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
@@ -358,7 +435,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
358 lo = nfsi->layout; 435 lo = nfsi->layout;
359 if (lo) { 436 if (lo) {
360 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ 437 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
361 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY); 438 mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
362 } 439 }
363 spin_unlock(&nfsi->vfs_inode.i_lock); 440 spin_unlock(&nfsi->vfs_inode.i_lock);
364 pnfs_free_lseg_list(&tmp_list); 441 pnfs_free_lseg_list(&tmp_list);
@@ -467,7 +544,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
467static struct pnfs_layout_segment * 544static struct pnfs_layout_segment *
468send_layoutget(struct pnfs_layout_hdr *lo, 545send_layoutget(struct pnfs_layout_hdr *lo,
469 struct nfs_open_context *ctx, 546 struct nfs_open_context *ctx,
470 u32 iomode, 547 struct pnfs_layout_range *range,
471 gfp_t gfp_flags) 548 gfp_t gfp_flags)
472{ 549{
473 struct inode *ino = lo->plh_inode; 550 struct inode *ino = lo->plh_inode;
@@ -499,11 +576,11 @@ send_layoutget(struct pnfs_layout_hdr *lo,
499 goto out_err_free; 576 goto out_err_free;
500 } 577 }
501 578
502 lgp->args.minlength = NFS4_MAX_UINT64; 579 lgp->args.minlength = PAGE_CACHE_SIZE;
580 if (lgp->args.minlength > range->length)
581 lgp->args.minlength = range->length;
503 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; 582 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
504 lgp->args.range.iomode = iomode; 583 lgp->args.range = *range;
505 lgp->args.range.offset = 0;
506 lgp->args.range.length = NFS4_MAX_UINT64;
507 lgp->args.type = server->pnfs_curr_ld->id; 584 lgp->args.type = server->pnfs_curr_ld->id;
508 lgp->args.inode = ino; 585 lgp->args.inode = ino;
509 lgp->args.ctx = get_nfs_open_context(ctx); 586 lgp->args.ctx = get_nfs_open_context(ctx);
@@ -518,7 +595,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
518 nfs4_proc_layoutget(lgp); 595 nfs4_proc_layoutget(lgp);
519 if (!lseg) { 596 if (!lseg) {
520 /* remember that LAYOUTGET failed and suspend trying */ 597 /* remember that LAYOUTGET failed and suspend trying */
521 set_bit(lo_fail_bit(iomode), &lo->plh_flags); 598 set_bit(lo_fail_bit(range->iomode), &lo->plh_flags);
522 } 599 }
523 600
524 /* free xdr pages */ 601 /* free xdr pages */
@@ -542,6 +619,51 @@ out_err_free:
542 return NULL; 619 return NULL;
543} 620}
544 621
622/* Initiates a LAYOUTRETURN(FILE) */
623int
624_pnfs_return_layout(struct inode *ino)
625{
626 struct pnfs_layout_hdr *lo = NULL;
627 struct nfs_inode *nfsi = NFS_I(ino);
628 LIST_HEAD(tmp_list);
629 struct nfs4_layoutreturn *lrp;
630 nfs4_stateid stateid;
631 int status = 0;
632
633 dprintk("--> %s\n", __func__);
634
635 spin_lock(&ino->i_lock);
636 lo = nfsi->layout;
637 if (!lo || !mark_matching_lsegs_invalid(lo, &tmp_list, NULL)) {
638 spin_unlock(&ino->i_lock);
639 dprintk("%s: no layout segments to return\n", __func__);
640 goto out;
641 }
642 stateid = nfsi->layout->plh_stateid;
643 /* Reference matched in nfs4_layoutreturn_release */
644 get_layout_hdr(lo);
645 spin_unlock(&ino->i_lock);
646 pnfs_free_lseg_list(&tmp_list);
647
648 WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags));
649
650 lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
651 if (unlikely(lrp == NULL)) {
652 status = -ENOMEM;
653 goto out;
654 }
655
656 lrp->args.stateid = stateid;
657 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
658 lrp->args.inode = ino;
659 lrp->clp = NFS_SERVER(ino)->nfs_client;
660
661 status = nfs4_proc_layoutreturn(lrp);
662out:
663 dprintk("<-- %s status: %d\n", __func__, status);
664 return status;
665}
666
545bool pnfs_roc(struct inode *ino) 667bool pnfs_roc(struct inode *ino)
546{ 668{
547 struct pnfs_layout_hdr *lo; 669 struct pnfs_layout_hdr *lo;
@@ -625,10 +747,23 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
625 * are seen first. 747 * are seen first.
626 */ 748 */
627static s64 749static s64
628cmp_layout(u32 iomode1, u32 iomode2) 750cmp_layout(struct pnfs_layout_range *l1,
751 struct pnfs_layout_range *l2)
629{ 752{
753 s64 d;
754
755 /* high offset > low offset */
756 d = l1->offset - l2->offset;
757 if (d)
758 return d;
759
760 /* short length > long length */
761 d = l2->length - l1->length;
762 if (d)
763 return d;
764
630 /* read > read/write */ 765 /* read > read/write */
631 return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ); 766 return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
632} 767}
633 768
634static void 769static void
@@ -636,13 +771,12 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
636 struct pnfs_layout_segment *lseg) 771 struct pnfs_layout_segment *lseg)
637{ 772{
638 struct pnfs_layout_segment *lp; 773 struct pnfs_layout_segment *lp;
639 int found = 0;
640 774
641 dprintk("%s:Begin\n", __func__); 775 dprintk("%s:Begin\n", __func__);
642 776
643 assert_spin_locked(&lo->plh_inode->i_lock); 777 assert_spin_locked(&lo->plh_inode->i_lock);
644 list_for_each_entry(lp, &lo->plh_segs, pls_list) { 778 list_for_each_entry(lp, &lo->plh_segs, pls_list) {
645 if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0) 779 if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
646 continue; 780 continue;
647 list_add_tail(&lseg->pls_list, &lp->pls_list); 781 list_add_tail(&lseg->pls_list, &lp->pls_list);
648 dprintk("%s: inserted lseg %p " 782 dprintk("%s: inserted lseg %p "
@@ -652,16 +786,14 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
652 lseg->pls_range.offset, lseg->pls_range.length, 786 lseg->pls_range.offset, lseg->pls_range.length,
653 lp, lp->pls_range.iomode, lp->pls_range.offset, 787 lp, lp->pls_range.iomode, lp->pls_range.offset,
654 lp->pls_range.length); 788 lp->pls_range.length);
655 found = 1; 789 goto out;
656 break;
657 }
658 if (!found) {
659 list_add_tail(&lseg->pls_list, &lo->plh_segs);
660 dprintk("%s: inserted lseg %p "
661 "iomode %d offset %llu length %llu at tail\n",
662 __func__, lseg, lseg->pls_range.iomode,
663 lseg->pls_range.offset, lseg->pls_range.length);
664 } 790 }
791 list_add_tail(&lseg->pls_list, &lo->plh_segs);
792 dprintk("%s: inserted lseg %p "
793 "iomode %d offset %llu length %llu at tail\n",
794 __func__, lseg, lseg->pls_range.iomode,
795 lseg->pls_range.offset, lseg->pls_range.length);
796out:
665 get_layout_hdr(lo); 797 get_layout_hdr(lo);
666 798
667 dprintk("%s:Return\n", __func__); 799 dprintk("%s:Return\n", __func__);
@@ -672,7 +804,7 @@ alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags)
672{ 804{
673 struct pnfs_layout_hdr *lo; 805 struct pnfs_layout_hdr *lo;
674 806
675 lo = kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags); 807 lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
676 if (!lo) 808 if (!lo)
677 return NULL; 809 return NULL;
678 atomic_set(&lo->plh_refcount, 1); 810 atomic_set(&lo->plh_refcount, 1);
@@ -705,7 +837,7 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
705 if (likely(nfsi->layout == NULL)) /* Won the race? */ 837 if (likely(nfsi->layout == NULL)) /* Won the race? */
706 nfsi->layout = new; 838 nfsi->layout = new;
707 else 839 else
708 kfree(new); 840 pnfs_free_layout_hdr(new);
709 return nfsi->layout; 841 return nfsi->layout;
710} 842}
711 843
@@ -721,16 +853,28 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
721 * READ RW true 853 * READ RW true
722 */ 854 */
723static int 855static int
724is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode) 856is_matching_lseg(struct pnfs_layout_range *ls_range,
857 struct pnfs_layout_range *range)
725{ 858{
726 return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW); 859 struct pnfs_layout_range range1;
860
861 if ((range->iomode == IOMODE_RW &&
862 ls_range->iomode != IOMODE_RW) ||
863 !lo_seg_intersecting(ls_range, range))
864 return 0;
865
866 /* range1 covers only the first byte in the range */
867 range1 = *range;
868 range1.length = 1;
869 return lo_seg_contained(ls_range, &range1);
727} 870}
728 871
729/* 872/*
730 * lookup range in layout 873 * lookup range in layout
731 */ 874 */
732static struct pnfs_layout_segment * 875static struct pnfs_layout_segment *
733pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode) 876pnfs_find_lseg(struct pnfs_layout_hdr *lo,
877 struct pnfs_layout_range *range)
734{ 878{
735 struct pnfs_layout_segment *lseg, *ret = NULL; 879 struct pnfs_layout_segment *lseg, *ret = NULL;
736 880
@@ -739,11 +883,11 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
739 assert_spin_locked(&lo->plh_inode->i_lock); 883 assert_spin_locked(&lo->plh_inode->i_lock);
740 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 884 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
741 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 885 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
742 is_matching_lseg(lseg, iomode)) { 886 is_matching_lseg(&lseg->pls_range, range)) {
743 ret = get_lseg(lseg); 887 ret = get_lseg(lseg);
744 break; 888 break;
745 } 889 }
746 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0) 890 if (cmp_layout(range, &lseg->pls_range) > 0)
747 break; 891 break;
748 } 892 }
749 893
@@ -759,9 +903,17 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
759struct pnfs_layout_segment * 903struct pnfs_layout_segment *
760pnfs_update_layout(struct inode *ino, 904pnfs_update_layout(struct inode *ino,
761 struct nfs_open_context *ctx, 905 struct nfs_open_context *ctx,
906 loff_t pos,
907 u64 count,
762 enum pnfs_iomode iomode, 908 enum pnfs_iomode iomode,
763 gfp_t gfp_flags) 909 gfp_t gfp_flags)
764{ 910{
911 struct pnfs_layout_range arg = {
912 .iomode = iomode,
913 .offset = pos,
914 .length = count,
915 };
916 unsigned pg_offset;
765 struct nfs_inode *nfsi = NFS_I(ino); 917 struct nfs_inode *nfsi = NFS_I(ino);
766 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; 918 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
767 struct pnfs_layout_hdr *lo; 919 struct pnfs_layout_hdr *lo;
@@ -789,7 +941,7 @@ pnfs_update_layout(struct inode *ino,
789 goto out_unlock; 941 goto out_unlock;
790 942
791 /* Check to see if the layout for the given range already exists */ 943 /* Check to see if the layout for the given range already exists */
792 lseg = pnfs_find_lseg(lo, iomode); 944 lseg = pnfs_find_lseg(lo, &arg);
793 if (lseg) 945 if (lseg)
794 goto out_unlock; 946 goto out_unlock;
795 947
@@ -811,7 +963,14 @@ pnfs_update_layout(struct inode *ino,
811 spin_unlock(&clp->cl_lock); 963 spin_unlock(&clp->cl_lock);
812 } 964 }
813 965
814 lseg = send_layoutget(lo, ctx, iomode, gfp_flags); 966 pg_offset = arg.offset & ~PAGE_CACHE_MASK;
967 if (pg_offset) {
968 arg.offset -= pg_offset;
969 arg.length += pg_offset;
970 }
971 arg.length = PAGE_CACHE_ALIGN(arg.length);
972
973 lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
815 if (!lseg && first) { 974 if (!lseg && first) {
816 spin_lock(&clp->cl_lock); 975 spin_lock(&clp->cl_lock);
817 list_del_init(&lo->plh_layouts); 976 list_del_init(&lo->plh_layouts);
@@ -838,17 +997,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
838 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; 997 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
839 int status = 0; 998 int status = 0;
840 999
841 /* Verify we got what we asked for.
842 * Note that because the xdr parsing only accepts a single
843 * element array, this can fail even if the server is behaving
844 * correctly.
845 */
846 if (lgp->args.range.iomode > res->range.iomode ||
847 res->range.offset != 0 ||
848 res->range.length != NFS4_MAX_UINT64) {
849 status = -EINVAL;
850 goto out;
851 }
852 /* Inject layout blob into I/O device driver */ 1000 /* Inject layout blob into I/O device driver */
853 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); 1001 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
854 if (!lseg || IS_ERR(lseg)) { 1002 if (!lseg || IS_ERR(lseg)) {
@@ -895,51 +1043,64 @@ out_forget_reply:
895 goto out; 1043 goto out;
896} 1044}
897 1045
898static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio, 1046bool
899 struct nfs_page *prev, 1047pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
900 struct nfs_page *req) 1048 struct nfs_page *req)
901{ 1049{
1050 enum pnfs_iomode access_type;
1051 gfp_t gfp_flags;
1052
1053 /* We assume that pg_ioflags == 0 iff we're reading a page */
1054 if (pgio->pg_ioflags == 0) {
1055 access_type = IOMODE_READ;
1056 gfp_flags = GFP_KERNEL;
1057 } else {
1058 access_type = IOMODE_RW;
1059 gfp_flags = GFP_NOFS;
1060 }
1061
902 if (pgio->pg_count == prev->wb_bytes) { 1062 if (pgio->pg_count == prev->wb_bytes) {
903 /* This is first coelesce call for a series of nfs_pages */ 1063 /* This is first coelesce call for a series of nfs_pages */
904 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1064 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
905 prev->wb_context, 1065 prev->wb_context,
906 IOMODE_READ, 1066 req_offset(req),
907 GFP_KERNEL); 1067 pgio->pg_count,
1068 access_type,
1069 gfp_flags);
1070 return true;
908 } 1071 }
909 return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
910}
911 1072
912void 1073 if (pgio->pg_lseg &&
913pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) 1074 req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset,
914{ 1075 pgio->pg_lseg->pls_range.length))
915 struct pnfs_layoutdriver_type *ld; 1076 return false;
916 1077
917 ld = NFS_SERVER(inode)->pnfs_curr_ld; 1078 return true;
918 pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
919} 1079}
1080EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
920 1081
921static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio, 1082/*
922 struct nfs_page *prev, 1083 * Called by non rpc-based layout drivers
923 struct nfs_page *req) 1084 */
1085int
1086pnfs_ld_write_done(struct nfs_write_data *data)
924{ 1087{
925 if (pgio->pg_count == prev->wb_bytes) { 1088 int status;
926 /* This is first coelesce call for a series of nfs_pages */
927 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
928 prev->wb_context,
929 IOMODE_RW,
930 GFP_NOFS);
931 }
932 return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
933}
934 1089
935void 1090 if (!data->pnfs_error) {
936pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode) 1091 pnfs_set_layoutcommit(data);
937{ 1092 data->mds_ops->rpc_call_done(&data->task, data);
938 struct pnfs_layoutdriver_type *ld; 1093 data->mds_ops->rpc_release(data);
1094 return 0;
1095 }
939 1096
940 ld = NFS_SERVER(inode)->pnfs_curr_ld; 1097 dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
941 pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL; 1098 data->pnfs_error);
1099 status = nfs_initiate_write(data, NFS_CLIENT(data->inode),
1100 data->mds_ops, NFS_FILE_SYNC);
1101 return status ? : -EAGAIN;
942} 1102}
1103EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
943 1104
944enum pnfs_try_status 1105enum pnfs_try_status
945pnfs_try_to_write_data(struct nfs_write_data *wdata, 1106pnfs_try_to_write_data(struct nfs_write_data *wdata,
@@ -966,6 +1127,29 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
966} 1127}
967 1128
968/* 1129/*
1130 * Called by non rpc-based layout drivers
1131 */
1132int
1133pnfs_ld_read_done(struct nfs_read_data *data)
1134{
1135 int status;
1136
1137 if (!data->pnfs_error) {
1138 __nfs4_read_done_cb(data);
1139 data->mds_ops->rpc_call_done(&data->task, data);
1140 data->mds_ops->rpc_release(data);
1141 return 0;
1142 }
1143
1144 dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
1145 data->pnfs_error);
1146 status = nfs_initiate_read(data, NFS_CLIENT(data->inode),
1147 data->mds_ops);
1148 return status ? : -EAGAIN;
1149}
1150EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
1151
1152/*
969 * Call the appropriate parallel I/O subsystem read function. 1153 * Call the appropriate parallel I/O subsystem read function.
970 */ 1154 */
971enum pnfs_try_status 1155enum pnfs_try_status
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 0c015bad9e7a..48d0a8e4d062 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,6 +30,7 @@
30#ifndef FS_NFS_PNFS_H 30#ifndef FS_NFS_PNFS_H
31#define FS_NFS_PNFS_H 31#define FS_NFS_PNFS_H
32 32
33#include <linux/nfs_fs.h>
33#include <linux/nfs_page.h> 34#include <linux/nfs_page.h>
34 35
35enum { 36enum {
@@ -64,17 +65,29 @@ enum {
64 NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */ 65 NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
65}; 66};
66 67
68enum layoutdriver_policy_flags {
69 /* Should the pNFS client commit and return the layout upon a setattr */
70 PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
71};
72
73struct nfs4_deviceid_node;
74
67/* Per-layout driver specific registration structure */ 75/* Per-layout driver specific registration structure */
68struct pnfs_layoutdriver_type { 76struct pnfs_layoutdriver_type {
69 struct list_head pnfs_tblid; 77 struct list_head pnfs_tblid;
70 const u32 id; 78 const u32 id;
71 const char *name; 79 const char *name;
72 struct module *owner; 80 struct module *owner;
81 unsigned flags;
82
83 struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);
84 void (*free_layout_hdr) (struct pnfs_layout_hdr *);
85
73 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); 86 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
74 void (*free_lseg) (struct pnfs_layout_segment *lseg); 87 void (*free_lseg) (struct pnfs_layout_segment *lseg);
75 88
76 /* test for nfs page cache coalescing */ 89 /* test for nfs page cache coalescing */
77 int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); 90 bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
78 91
79 /* Returns true if layoutdriver wants to divert this request to 92 /* Returns true if layoutdriver wants to divert this request to
80 * driver's commit routine. 93 * driver's commit routine.
@@ -89,6 +102,16 @@ struct pnfs_layoutdriver_type {
89 */ 102 */
90 enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data); 103 enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
91 enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how); 104 enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
105
106 void (*free_deviceid_node) (struct nfs4_deviceid_node *);
107
108 void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
109 struct xdr_stream *xdr,
110 const struct nfs4_layoutreturn_args *args);
111
112 void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
113 struct xdr_stream *xdr,
114 const struct nfs4_layoutcommit_args *args);
92}; 115};
93 116
94struct pnfs_layout_hdr { 117struct pnfs_layout_hdr {
@@ -120,21 +143,22 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
120extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, 143extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
121 struct pnfs_device *dev); 144 struct pnfs_device *dev);
122extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); 145extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
146extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
123 147
124/* pnfs.c */ 148/* pnfs.c */
125void get_layout_hdr(struct pnfs_layout_hdr *lo); 149void get_layout_hdr(struct pnfs_layout_hdr *lo);
126void put_lseg(struct pnfs_layout_segment *lseg); 150void put_lseg(struct pnfs_layout_segment *lseg);
127struct pnfs_layout_segment * 151struct pnfs_layout_segment *
128pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 152pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
129 enum pnfs_iomode access_type, gfp_t gfp_flags); 153 loff_t pos, u64 count, enum pnfs_iomode access_type,
154 gfp_t gfp_flags);
130void set_pnfs_layoutdriver(struct nfs_server *, u32 id); 155void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
131void unset_pnfs_layoutdriver(struct nfs_server *); 156void unset_pnfs_layoutdriver(struct nfs_server *);
132enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, 157enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
133 const struct rpc_call_ops *, int); 158 const struct rpc_call_ops *, int);
134enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, 159enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
135 const struct rpc_call_ops *); 160 const struct rpc_call_ops *);
136void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *); 161bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
137void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
138int pnfs_layout_process(struct nfs4_layoutget *lgp); 162int pnfs_layout_process(struct nfs4_layoutget *lgp);
139void pnfs_free_lseg_list(struct list_head *tmp_list); 163void pnfs_free_lseg_list(struct list_head *tmp_list);
140void pnfs_destroy_layout(struct nfs_inode *); 164void pnfs_destroy_layout(struct nfs_inode *);
@@ -148,13 +172,37 @@ int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
148 struct nfs4_state *open_state); 172 struct nfs4_state *open_state);
149int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, 173int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
150 struct list_head *tmp_list, 174 struct list_head *tmp_list,
151 u32 iomode); 175 struct pnfs_layout_range *recall_range);
152bool pnfs_roc(struct inode *ino); 176bool pnfs_roc(struct inode *ino);
153void pnfs_roc_release(struct inode *ino); 177void pnfs_roc_release(struct inode *ino);
154void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); 178void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
155bool pnfs_roc_drain(struct inode *ino, u32 *barrier); 179bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
156void pnfs_set_layoutcommit(struct nfs_write_data *wdata); 180void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
157int pnfs_layoutcommit_inode(struct inode *inode, bool sync); 181int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
182int _pnfs_return_layout(struct inode *);
183int pnfs_ld_write_done(struct nfs_write_data *);
184int pnfs_ld_read_done(struct nfs_read_data *);
185
186/* pnfs_dev.c */
187struct nfs4_deviceid_node {
188 struct hlist_node node;
189 const struct pnfs_layoutdriver_type *ld;
190 const struct nfs_client *nfs_client;
191 struct nfs4_deviceid deviceid;
192 atomic_t ref;
193};
194
195void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
196struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
197struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
198void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
199void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
200 const struct pnfs_layoutdriver_type *,
201 const struct nfs_client *,
202 const struct nfs4_deviceid *);
203struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
204bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
205void nfs4_deviceid_purge_client(const struct nfs_client *);
158 206
159static inline int lo_fail_bit(u32 iomode) 207static inline int lo_fail_bit(u32 iomode)
160{ 208{
@@ -223,6 +271,36 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req)
223 put_lseg(req->wb_commit_lseg); 271 put_lseg(req->wb_commit_lseg);
224} 272}
225 273
274/* Should the pNFS client commit and return the layout upon a setattr */
275static inline bool
276pnfs_ld_layoutret_on_setattr(struct inode *inode)
277{
278 if (!pnfs_enabled_sb(NFS_SERVER(inode)))
279 return false;
280 return NFS_SERVER(inode)->pnfs_curr_ld->flags &
281 PNFS_LAYOUTRET_ON_SETATTR;
282}
283
284static inline int pnfs_return_layout(struct inode *ino)
285{
286 struct nfs_inode *nfsi = NFS_I(ino);
287 struct nfs_server *nfss = NFS_SERVER(ino);
288
289 if (pnfs_enabled_sb(nfss) && nfsi->layout)
290 return _pnfs_return_layout(ino);
291
292 return 0;
293}
294
295static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
296 struct inode *inode)
297{
298 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
299
300 if (ld)
301 pgio->pg_test = ld->pg_test;
302}
303
226#else /* CONFIG_NFS_V4_1 */ 304#else /* CONFIG_NFS_V4_1 */
227 305
228static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) 306static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -245,7 +323,8 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg)
245 323
246static inline struct pnfs_layout_segment * 324static inline struct pnfs_layout_segment *
247pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 325pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
248 enum pnfs_iomode access_type, gfp_t gfp_flags) 326 loff_t pos, u64 count, enum pnfs_iomode access_type,
327 gfp_t gfp_flags)
249{ 328{
250 return NULL; 329 return NULL;
251} 330}
@@ -264,6 +343,17 @@ pnfs_try_to_write_data(struct nfs_write_data *data,
264 return PNFS_NOT_ATTEMPTED; 343 return PNFS_NOT_ATTEMPTED;
265} 344}
266 345
346static inline int pnfs_return_layout(struct inode *ino)
347{
348 return 0;
349}
350
351static inline bool
352pnfs_ld_layoutret_on_setattr(struct inode *inode)
353{
354 return false;
355}
356
267static inline bool 357static inline bool
268pnfs_roc(struct inode *ino) 358pnfs_roc(struct inode *ino)
269{ 359{
@@ -294,16 +384,9 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
294{ 384{
295} 385}
296 386
297static inline void 387static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
298pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino) 388 struct inode *inode)
299{
300 pgio->pg_test = NULL;
301}
302
303static inline void
304pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
305{ 389{
306 pgio->pg_test = NULL;
307} 390}
308 391
309static inline void 392static inline void
@@ -331,6 +414,10 @@ static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
331{ 414{
332 return 0; 415 return 0;
333} 416}
417
418static inline void nfs4_deviceid_purge_client(struct nfs_client *ncl)
419{
420}
334#endif /* CONFIG_NFS_V4_1 */ 421#endif /* CONFIG_NFS_V4_1 */
335 422
336#endif /* FS_NFS_PNFS_H */ 423#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
new file mode 100644
index 000000000000..c65e133ce9c0
--- /dev/null
+++ b/fs/nfs/pnfs_dev.c
@@ -0,0 +1,270 @@
1/*
2 * Device operations for the pnfs client.
3 *
4 * Copyright (c) 2002
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 * Garth Goodson <Garth.Goodson@netapp.com>
10 *
11 * Permission is granted to use, copy, create derivative works, and
12 * redistribute this software and such derivative works for any purpose,
13 * so long as the name of the University of Michigan is not used in
14 * any advertising or publicity pertaining to the use or distribution
15 * of this software without specific, written prior authorization. If
16 * the above copyright notice or any other identification of the
17 * University of Michigan is included in any copy of any portion of
18 * this software, then the disclaimer below must also be included.
19 *
20 * This software is provided as is, without representation or warranty
21 * of any kind either express or implied, including without limitation
22 * the implied warranties of merchantability, fitness for a particular
23 * purpose, or noninfringement. The Regents of the University of
24 * Michigan shall not be liable for any damages, including special,
25 * indirect, incidental, or consequential damages, with respect to any
26 * claim arising out of or in connection with the use of the software,
27 * even if it has been or is hereafter advised of the possibility of
28 * such damages.
29 */
30
31#include "pnfs.h"
32
33#define NFSDBG_FACILITY NFSDBG_PNFS
34
35/*
36 * Device ID RCU cache. A device ID is unique per server and layout type.
37 */
38#define NFS4_DEVICE_ID_HASH_BITS 5
39#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
40#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
41
42static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE];
43static DEFINE_SPINLOCK(nfs4_deviceid_lock);
44
45void
46nfs4_print_deviceid(const struct nfs4_deviceid *id)
47{
48 u32 *p = (u32 *)id;
49
50 dprintk("%s: device id= [%x%x%x%x]\n", __func__,
51 p[0], p[1], p[2], p[3]);
52}
53EXPORT_SYMBOL_GPL(nfs4_print_deviceid);
54
55static inline u32
56nfs4_deviceid_hash(const struct nfs4_deviceid *id)
57{
58 unsigned char *cptr = (unsigned char *)id->data;
59 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
60 u32 x = 0;
61
62 while (nbytes--) {
63 x *= 37;
64 x += *cptr++;
65 }
66 return x & NFS4_DEVICE_ID_HASH_MASK;
67}
68
69static struct nfs4_deviceid_node *
70_lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
71 const struct nfs_client *clp, const struct nfs4_deviceid *id,
72 long hash)
73{
74 struct nfs4_deviceid_node *d;
75 struct hlist_node *n;
76
77 hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
78 if (d->ld == ld && d->nfs_client == clp &&
79 !memcmp(&d->deviceid, id, sizeof(*id))) {
80 if (atomic_read(&d->ref))
81 return d;
82 else
83 continue;
84 }
85 return NULL;
86}
87
88/*
89 * Lookup a deviceid in cache and get a reference count on it if found
90 *
91 * @clp nfs_client associated with deviceid
92 * @id deviceid to look up
93 */
94struct nfs4_deviceid_node *
95_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
96 const struct nfs_client *clp, const struct nfs4_deviceid *id,
97 long hash)
98{
99 struct nfs4_deviceid_node *d;
100
101 rcu_read_lock();
102 d = _lookup_deviceid(ld, clp, id, hash);
103 if (d && !atomic_inc_not_zero(&d->ref))
104 d = NULL;
105 rcu_read_unlock();
106 return d;
107}
108
109struct nfs4_deviceid_node *
110nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
111 const struct nfs_client *clp, const struct nfs4_deviceid *id)
112{
113 return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
114}
115EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
116
117/*
118 * Unhash and put deviceid
119 *
120 * @clp nfs_client associated with deviceid
121 * @id the deviceid to unhash
122 *
123 * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise.
124 */
125struct nfs4_deviceid_node *
126nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld,
127 const struct nfs_client *clp, const struct nfs4_deviceid *id)
128{
129 struct nfs4_deviceid_node *d;
130
131 spin_lock(&nfs4_deviceid_lock);
132 rcu_read_lock();
133 d = _lookup_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
134 rcu_read_unlock();
135 if (!d) {
136 spin_unlock(&nfs4_deviceid_lock);
137 return NULL;
138 }
139 hlist_del_init_rcu(&d->node);
140 spin_unlock(&nfs4_deviceid_lock);
141 synchronize_rcu();
142
143 /* balance the initial ref set in pnfs_insert_deviceid */
144 if (atomic_dec_and_test(&d->ref))
145 return d;
146
147 return NULL;
148}
149EXPORT_SYMBOL_GPL(nfs4_unhash_put_deviceid);
150
151/*
152 * Delete a deviceid from cache
153 *
154 * @clp struct nfs_client qualifying the deviceid
155 * @id deviceid to delete
156 */
157void
158nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
159 const struct nfs_client *clp, const struct nfs4_deviceid *id)
160{
161 struct nfs4_deviceid_node *d;
162
163 d = nfs4_unhash_put_deviceid(ld, clp, id);
164 if (!d)
165 return;
166 d->ld->free_deviceid_node(d);
167}
168EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
169
170void
171nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
172 const struct pnfs_layoutdriver_type *ld,
173 const struct nfs_client *nfs_client,
174 const struct nfs4_deviceid *id)
175{
176 INIT_HLIST_NODE(&d->node);
177 d->ld = ld;
178 d->nfs_client = nfs_client;
179 d->deviceid = *id;
180 atomic_set(&d->ref, 1);
181}
182EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
183
184/*
185 * Uniquely initialize and insert a deviceid node into cache
186 *
187 * @new new deviceid node
188 * Note that the caller must set up the following members:
189 * new->ld
190 * new->nfs_client
191 * new->deviceid
192 *
193 * @ret the inserted node, if none found, otherwise, the found entry.
194 */
195struct nfs4_deviceid_node *
196nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
197{
198 struct nfs4_deviceid_node *d;
199 long hash;
200
201 spin_lock(&nfs4_deviceid_lock);
202 hash = nfs4_deviceid_hash(&new->deviceid);
203 d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash);
204 if (d) {
205 spin_unlock(&nfs4_deviceid_lock);
206 return d;
207 }
208
209 hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
210 spin_unlock(&nfs4_deviceid_lock);
211
212 return new;
213}
214EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
215
216/*
217 * Dereference a deviceid node and delete it when its reference count drops
218 * to zero.
219 *
220 * @d deviceid node to put
221 *
222 * @ret true iff the node was deleted
223 */
224bool
225nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
226{
227 if (!atomic_dec_and_lock(&d->ref, &nfs4_deviceid_lock))
228 return false;
229 hlist_del_init_rcu(&d->node);
230 spin_unlock(&nfs4_deviceid_lock);
231 synchronize_rcu();
232 d->ld->free_deviceid_node(d);
233 return true;
234}
235EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node);
236
237static void
238_deviceid_purge_client(const struct nfs_client *clp, long hash)
239{
240 struct nfs4_deviceid_node *d;
241 struct hlist_node *n, *next;
242 HLIST_HEAD(tmp);
243
244 rcu_read_lock();
245 hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
246 if (d->nfs_client == clp && atomic_read(&d->ref)) {
247 hlist_del_init_rcu(&d->node);
248 hlist_add_head(&d->node, &tmp);
249 }
250 rcu_read_unlock();
251
252 if (hlist_empty(&tmp))
253 return;
254
255 synchronize_rcu();
256 hlist_for_each_entry_safe(d, n, next, &tmp, node)
257 if (atomic_dec_and_test(&d->ref))
258 d->ld->free_deviceid_node(d);
259}
260
261void
262nfs4_deviceid_purge_client(const struct nfs_client *clp)
263{
264 long h;
265
266 spin_lock(&nfs4_deviceid_lock);
267 for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++)
268 _deviceid_purge_client(clp, h);
269 spin_unlock(&nfs4_deviceid_lock);
270}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 2bcf0dc306a1..20a7f952e244 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -288,7 +288,9 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
288 atomic_set(&req->wb_complete, requests); 288 atomic_set(&req->wb_complete, requests);
289 289
290 BUG_ON(desc->pg_lseg != NULL); 290 BUG_ON(desc->pg_lseg != NULL);
291 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL); 291 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
292 req_offset(req), desc->pg_count,
293 IOMODE_READ, GFP_KERNEL);
292 ClearPageError(page); 294 ClearPageError(page);
293 offset = 0; 295 offset = 0;
294 nbytes = desc->pg_count; 296 nbytes = desc->pg_count;
@@ -351,7 +353,9 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
351 } 353 }
352 req = nfs_list_entry(data->pages.next); 354 req = nfs_list_entry(data->pages.next);
353 if ((!lseg) && list_is_singular(&data->pages)) 355 if ((!lseg) && list_is_singular(&data->pages))
354 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL); 356 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
357 req_offset(req), desc->pg_count,
358 IOMODE_READ, GFP_KERNEL);
355 359
356 ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count, 360 ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count,
357 0, lseg); 361 0, lseg);
@@ -660,7 +664,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
660 if (ret == 0) 664 if (ret == 0)
661 goto read_complete; /* all pages were read */ 665 goto read_complete; /* all pages were read */
662 666
663 pnfs_pageio_init_read(&pgio, inode);
664 if (rsize < PAGE_CACHE_SIZE) 667 if (rsize < PAGE_CACHE_SIZE)
665 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); 668 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
666 else 669 else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e288f06d3fa7..ce40e5c568ba 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -63,6 +63,7 @@
63#include "iostat.h" 63#include "iostat.h"
64#include "internal.h" 64#include "internal.h"
65#include "fscache.h" 65#include "fscache.h"
66#include "pnfs.h"
66 67
67#define NFSDBG_FACILITY NFSDBG_VFS 68#define NFSDBG_FACILITY NFSDBG_VFS
68 69
@@ -732,6 +733,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
732 733
733 return 0; 734 return 0;
734} 735}
736#ifdef CONFIG_NFS_V4_1
737void show_sessions(struct seq_file *m, struct nfs_server *server)
738{
739 if (nfs4_has_session(server->nfs_client))
740 seq_printf(m, ",sessions");
741}
742#else
743void show_sessions(struct seq_file *m, struct nfs_server *server) {}
744#endif
745
746#ifdef CONFIG_NFS_V4_1
747void show_pnfs(struct seq_file *m, struct nfs_server *server)
748{
749 seq_printf(m, ",pnfs=");
750 if (server->pnfs_curr_ld)
751 seq_printf(m, "%s", server->pnfs_curr_ld->name);
752 else
753 seq_printf(m, "not configured");
754}
755#else /* CONFIG_NFS_V4_1 */
756void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
757#endif /* CONFIG_NFS_V4_1 */
735 758
736static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt) 759static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
737{ 760{
@@ -792,6 +815,8 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
792 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); 815 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
793 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); 816 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
794 seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); 817 seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
818 show_sessions(m, nfss);
819 show_pnfs(m, nfss);
795 } 820 }
796#endif 821#endif
797 822
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 49c715b4ac92..e268e3b23497 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -939,7 +939,9 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
939 atomic_set(&req->wb_complete, requests); 939 atomic_set(&req->wb_complete, requests);
940 940
941 BUG_ON(desc->pg_lseg); 941 BUG_ON(desc->pg_lseg);
942 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS); 942 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
943 req_offset(req), desc->pg_count,
944 IOMODE_RW, GFP_NOFS);
943 ClearPageError(page); 945 ClearPageError(page);
944 offset = 0; 946 offset = 0;
945 nbytes = desc->pg_count; 947 nbytes = desc->pg_count;
@@ -1013,7 +1015,9 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
1013 } 1015 }
1014 req = nfs_list_entry(data->pages.next); 1016 req = nfs_list_entry(data->pages.next);
1015 if ((!lseg) && list_is_singular(&data->pages)) 1017 if ((!lseg) && list_is_singular(&data->pages))
1016 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS); 1018 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
1019 req_offset(req), desc->pg_count,
1020 IOMODE_RW, GFP_NOFS);
1017 1021
1018 if ((desc->pg_ioflags & FLUSH_COND_STABLE) && 1022 if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
1019 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) 1023 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
@@ -1032,8 +1036,6 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
1032{ 1036{
1033 size_t wsize = NFS_SERVER(inode)->wsize; 1037 size_t wsize = NFS_SERVER(inode)->wsize;
1034 1038
1035 pnfs_pageio_init_write(pgio, inode);
1036
1037 if (wsize < PAGE_CACHE_SIZE) 1039 if (wsize < PAGE_CACHE_SIZE)
1038 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); 1040 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
1039 else 1041 else
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 8e66c5ccc1c4..504b289ba680 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -562,6 +562,7 @@ enum {
562 NFSPROC4_CLNT_LAYOUTGET, 562 NFSPROC4_CLNT_LAYOUTGET,
563 NFSPROC4_CLNT_GETDEVICEINFO, 563 NFSPROC4_CLNT_GETDEVICEINFO,
564 NFSPROC4_CLNT_LAYOUTCOMMIT, 564 NFSPROC4_CLNT_LAYOUTCOMMIT,
565 NFSPROC4_CLNT_LAYOUTRETURN,
565}; 566};
566 567
567/* nfs41 types */ 568/* nfs41 types */
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 91af2e49fa3a..3a34e80ae92f 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -68,7 +68,7 @@ struct nfs_pageio_descriptor {
68 int pg_ioflags; 68 int pg_ioflags;
69 int pg_error; 69 int pg_error;
70 struct pnfs_layout_segment *pg_lseg; 70 struct pnfs_layout_segment *pg_lseg;
71 int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); 71 bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
72}; 72};
73 73
74#define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) 74#define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags))
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 7e371f7df9c4..5e8444a11adf 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -269,6 +269,27 @@ struct nfs4_layoutcommit_data {
269 struct nfs4_layoutcommit_res res; 269 struct nfs4_layoutcommit_res res;
270}; 270};
271 271
272struct nfs4_layoutreturn_args {
273 __u32 layout_type;
274 struct inode *inode;
275 nfs4_stateid stateid;
276 struct nfs4_sequence_args seq_args;
277};
278
279struct nfs4_layoutreturn_res {
280 struct nfs4_sequence_res seq_res;
281 u32 lrs_present;
282 nfs4_stateid stateid;
283};
284
285struct nfs4_layoutreturn {
286 struct nfs4_layoutreturn_args args;
287 struct nfs4_layoutreturn_res res;
288 struct rpc_cred *cred;
289 struct nfs_client *clp;
290 int rpc_status;
291};
292
272/* 293/*
273 * Arguments to the open call. 294 * Arguments to the open call.
274 */ 295 */
@@ -1087,6 +1108,7 @@ struct nfs_read_data {
1087 const struct rpc_call_ops *mds_ops; 1108 const struct rpc_call_ops *mds_ops;
1088 int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data); 1109 int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data);
1089 __u64 mds_offset; 1110 __u64 mds_offset;
1111 int pnfs_error;
1090 struct page *page_array[NFS_PAGEVEC_SIZE]; 1112 struct page *page_array[NFS_PAGEVEC_SIZE];
1091}; 1113};
1092 1114
@@ -1112,6 +1134,7 @@ struct nfs_write_data {
1112 unsigned long timestamp; /* For lease renewal */ 1134 unsigned long timestamp; /* For lease renewal */
1113#endif 1135#endif
1114 __u64 mds_offset; /* Filelayout dense stripe */ 1136 __u64 mds_offset; /* Filelayout dense stripe */
1137 int pnfs_error;
1115 struct page *page_array[NFS_PAGEVEC_SIZE]; 1138 struct page *page_array[NFS_PAGEVEC_SIZE];
1116}; 1139};
1117 1140
diff --git a/include/linux/pnfs_osd_xdr.h b/include/linux/pnfs_osd_xdr.h
new file mode 100644
index 000000000000..76efbdd01622
--- /dev/null
+++ b/include/linux/pnfs_osd_xdr.h
@@ -0,0 +1,345 @@
1/*
2 * pNFS-osd on-the-wire data structures
3 *
4 * Copyright (C) 2007 Panasas Inc. [year of first publication]
5 * All rights reserved.
6 *
7 * Benny Halevy <bhalevy@panasas.com>
8 * Boaz Harrosh <bharrosh@panasas.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * See the file COPYING included with this distribution for more details.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 *
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the Panasas company nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
28 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
34 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39#ifndef __PNFS_OSD_XDR_H__
40#define __PNFS_OSD_XDR_H__
41
42#include <linux/nfs_fs.h>
43#include <linux/nfs_page.h>
44#include <scsi/osd_protocol.h>
45
46#define PNFS_OSD_OSDNAME_MAXSIZE 256
47
48/*
49 * draft-ietf-nfsv4-minorversion-22
50 * draft-ietf-nfsv4-pnfs-obj-12
51 */
52
53/* Layout Structure */
54
55enum pnfs_osd_raid_algorithm4 {
56 PNFS_OSD_RAID_0 = 1,
57 PNFS_OSD_RAID_4 = 2,
58 PNFS_OSD_RAID_5 = 3,
59 PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */
60};
61
62/* struct pnfs_osd_data_map4 {
63 * uint32_t odm_num_comps;
64 * length4 odm_stripe_unit;
65 * uint32_t odm_group_width;
66 * uint32_t odm_group_depth;
67 * uint32_t odm_mirror_cnt;
68 * pnfs_osd_raid_algorithm4 odm_raid_algorithm;
69 * };
70 */
71struct pnfs_osd_data_map {
72 u32 odm_num_comps;
73 u64 odm_stripe_unit;
74 u32 odm_group_width;
75 u32 odm_group_depth;
76 u32 odm_mirror_cnt;
77 u32 odm_raid_algorithm;
78};
79
80/* struct pnfs_osd_objid4 {
81 * deviceid4 oid_device_id;
82 * uint64_t oid_partition_id;
83 * uint64_t oid_object_id;
84 * };
85 */
86struct pnfs_osd_objid {
87 struct nfs4_deviceid oid_device_id;
88 u64 oid_partition_id;
89 u64 oid_object_id;
90};
91
92/* For printout. I use:
93 * kprint("dev(%llx:%llx)", _DEVID_LO(pointer), _DEVID_HI(pointer));
94 * BE style
95 */
96#define _DEVID_LO(oid_device_id) \
97 (unsigned long long)be64_to_cpup((__be64 *)(oid_device_id)->data)
98
99#define _DEVID_HI(oid_device_id) \
100 (unsigned long long)be64_to_cpup(((__be64 *)(oid_device_id)->data) + 1)
101
102static inline int
103pnfs_osd_objid_xdr_sz(void)
104{
105 return (NFS4_DEVICEID4_SIZE / 4) + 2 + 2;
106}
107
108enum pnfs_osd_version {
109 PNFS_OSD_MISSING = 0,
110 PNFS_OSD_VERSION_1 = 1,
111 PNFS_OSD_VERSION_2 = 2
112};
113
114struct pnfs_osd_opaque_cred {
115 u32 cred_len;
116 void *cred;
117};
118
119enum pnfs_osd_cap_key_sec {
120 PNFS_OSD_CAP_KEY_SEC_NONE = 0,
121 PNFS_OSD_CAP_KEY_SEC_SSV = 1,
122};
123
124/* struct pnfs_osd_object_cred4 {
125 * pnfs_osd_objid4 oc_object_id;
126 * pnfs_osd_version4 oc_osd_version;
127 * pnfs_osd_cap_key_sec4 oc_cap_key_sec;
128 * opaque oc_capability_key<>;
129 * opaque oc_capability<>;
130 * };
131 */
132struct pnfs_osd_object_cred {
133 struct pnfs_osd_objid oc_object_id;
134 u32 oc_osd_version;
135 u32 oc_cap_key_sec;
136 struct pnfs_osd_opaque_cred oc_cap_key;
137 struct pnfs_osd_opaque_cred oc_cap;
138};
139
140/* struct pnfs_osd_layout4 {
141 * pnfs_osd_data_map4 olo_map;
142 * uint32_t olo_comps_index;
143 * pnfs_osd_object_cred4 olo_components<>;
144 * };
145 */
146struct pnfs_osd_layout {
147 struct pnfs_osd_data_map olo_map;
148 u32 olo_comps_index;
149 u32 olo_num_comps;
150 struct pnfs_osd_object_cred *olo_comps;
151};
152
153/* Device Address */
154enum pnfs_osd_targetid_type {
155 OBJ_TARGET_ANON = 1,
156 OBJ_TARGET_SCSI_NAME = 2,
157 OBJ_TARGET_SCSI_DEVICE_ID = 3,
158};
159
160/* union pnfs_osd_targetid4 switch (pnfs_osd_targetid_type4 oti_type) {
161 * case OBJ_TARGET_SCSI_NAME:
162 * string oti_scsi_name<>;
163 *
164 * case OBJ_TARGET_SCSI_DEVICE_ID:
165 * opaque oti_scsi_device_id<>;
166 *
167 * default:
168 * void;
169 * };
170 *
171 * union pnfs_osd_targetaddr4 switch (bool ota_available) {
172 * case TRUE:
173 * netaddr4 ota_netaddr;
174 * case FALSE:
175 * void;
176 * };
177 *
178 * struct pnfs_osd_deviceaddr4 {
179 * pnfs_osd_targetid4 oda_targetid;
180 * pnfs_osd_targetaddr4 oda_targetaddr;
181 * uint64_t oda_lun;
182 * opaque oda_systemid<>;
183 * pnfs_osd_object_cred4 oda_root_obj_cred;
184 * opaque oda_osdname<>;
185 * };
186 */
187struct pnfs_osd_targetid {
188 u32 oti_type;
189 struct nfs4_string oti_scsi_device_id;
190};
191
192enum { PNFS_OSD_TARGETID_MAX = 1 + PNFS_OSD_OSDNAME_MAXSIZE / 4 };
193
194/* struct netaddr4 {
195 * // see struct rpcb in RFC1833
196 * string r_netid<>; // network id
197 * string r_addr<>; // universal address
198 * };
199 */
200struct pnfs_osd_net_addr {
201 struct nfs4_string r_netid;
202 struct nfs4_string r_addr;
203};
204
205struct pnfs_osd_targetaddr {
206 u32 ota_available;
207 struct pnfs_osd_net_addr ota_netaddr;
208};
209
210enum {
211 NETWORK_ID_MAX = 16 / 4,
212 UNIVERSAL_ADDRESS_MAX = 64 / 4,
213 PNFS_OSD_TARGETADDR_MAX = 3 + NETWORK_ID_MAX + UNIVERSAL_ADDRESS_MAX,
214};
215
216struct pnfs_osd_deviceaddr {
217 struct pnfs_osd_targetid oda_targetid;
218 struct pnfs_osd_targetaddr oda_targetaddr;
219 u8 oda_lun[8];
220 struct nfs4_string oda_systemid;
221 struct pnfs_osd_object_cred oda_root_obj_cred;
222 struct nfs4_string oda_osdname;
223};
224
225enum {
226 ODA_OSDNAME_MAX = PNFS_OSD_OSDNAME_MAXSIZE / 4,
227 PNFS_OSD_DEVICEADDR_MAX =
228 PNFS_OSD_TARGETID_MAX + PNFS_OSD_TARGETADDR_MAX +
229 2 /*oda_lun*/ +
230 1 + OSD_SYSTEMID_LEN +
231 1 + ODA_OSDNAME_MAX,
232};
233
234/* LAYOUTCOMMIT: layoutupdate */
235
236/* union pnfs_osd_deltaspaceused4 switch (bool dsu_valid) {
237 * case TRUE:
238 * int64_t dsu_delta;
239 * case FALSE:
240 * void;
241 * };
242 *
243 * struct pnfs_osd_layoutupdate4 {
244 * pnfs_osd_deltaspaceused4 olu_delta_space_used;
245 * bool olu_ioerr_flag;
246 * };
247 */
248struct pnfs_osd_layoutupdate {
249 u32 dsu_valid;
250 s64 dsu_delta;
251 u32 olu_ioerr_flag;
252};
253
254/* LAYOUTRETURN: I/O Rrror Report */
255
256enum pnfs_osd_errno {
257 PNFS_OSD_ERR_EIO = 1,
258 PNFS_OSD_ERR_NOT_FOUND = 2,
259 PNFS_OSD_ERR_NO_SPACE = 3,
260 PNFS_OSD_ERR_BAD_CRED = 4,
261 PNFS_OSD_ERR_NO_ACCESS = 5,
262 PNFS_OSD_ERR_UNREACHABLE = 6,
263 PNFS_OSD_ERR_RESOURCE = 7
264};
265
266/* struct pnfs_osd_ioerr4 {
267 * pnfs_osd_objid4 oer_component;
268 * length4 oer_comp_offset;
269 * length4 oer_comp_length;
270 * bool oer_iswrite;
271 * pnfs_osd_errno4 oer_errno;
272 * };
273 */
274struct pnfs_osd_ioerr {
275 struct pnfs_osd_objid oer_component;
276 u64 oer_comp_offset;
277 u64 oer_comp_length;
278 u32 oer_iswrite;
279 u32 oer_errno;
280};
281
282/* OSD XDR API */
283/* Layout helpers */
284/* Layout decoding is done in two parts:
285 * 1. First Call pnfs_osd_xdr_decode_layout_map to read in only the header part
286 * of the layout. @iter members need not be initialized.
287 * Returned:
288 * @layout members are set. (@layout->olo_comps set to NULL).
289 *
290 * Zero on success, or negative error if passed xdr is broken.
291 *
292 * 2. 2nd Call pnfs_osd_xdr_decode_layout_comp() in a loop until it returns
293 * false, to decode the next component.
294 * Returned:
295 * true if there is more to decode or false if we are done or error.
296 *
297 * Example:
298 * struct pnfs_osd_xdr_decode_layout_iter iter;
299 * struct pnfs_osd_layout layout;
300 * struct pnfs_osd_object_cred comp;
301 * int status;
302 *
303 * status = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
304 * if (unlikely(status))
305 * goto err;
306 * while(pnfs_osd_xdr_decode_layout_comp(&comp, &iter, xdr, &status)) {
307 * // All of @comp strings point to inside the xdr_buffer
308 * // or scrach buffer. Copy them out to user memory eg.
309 * copy_single_comp(dest_comp++, &comp);
310 * }
311 * if (unlikely(status))
312 * goto err;
313 */
314
315struct pnfs_osd_xdr_decode_layout_iter {
316 unsigned total_comps;
317 unsigned decoded_comps;
318};
319
320extern int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
321 struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr);
322
323extern bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp,
324 struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr,
325 int *err);
326
327/* Device Info helpers */
328
329/* Note: All strings inside @deviceaddr point to space inside @p.
330 * @p should stay valid while @deviceaddr is in use.
331 */
332extern void pnfs_osd_xdr_decode_deviceaddr(
333 struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p);
334
335/* layoutupdate (layout_commit) xdr helpers */
336extern int
337pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
338 struct pnfs_osd_layoutupdate *lou);
339
340/* osd_ioerror encoding/decoding (layout_return) */
341/* Client */
342extern __be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr);
343extern void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr);
344
345#endif /* __PNFS_OSD_XDR_H__ */
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index fc84b7a19ca3..a20970ef9e4e 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -216,6 +216,8 @@ extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes);
216extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, 216extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages,
217 unsigned int base, unsigned int len); 217 unsigned int base, unsigned int len);
218extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); 218extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
219extern void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
220 struct page **pages, unsigned int len);
219extern void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen); 221extern void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen);
220extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); 222extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes);
221extern void xdr_read_pages(struct xdr_stream *xdr, unsigned int len); 223extern void xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 679cd674b81d..f008c14ad34c 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -638,6 +638,25 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
638} 638}
639EXPORT_SYMBOL_GPL(xdr_init_decode); 639EXPORT_SYMBOL_GPL(xdr_init_decode);
640 640
641/**
642 * xdr_init_decode - Initialize an xdr_stream for decoding data.
643 * @xdr: pointer to xdr_stream struct
644 * @buf: pointer to XDR buffer from which to decode data
645 * @pages: list of pages to decode into
646 * @len: length in bytes of buffer in pages
647 */
648void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
649 struct page **pages, unsigned int len)
650{
651 memset(buf, 0, sizeof(*buf));
652 buf->pages = pages;
653 buf->page_len = len;
654 buf->buflen = len;
655 buf->len = len;
656 xdr_init_decode(xdr, buf, NULL);
657}
658EXPORT_SYMBOL_GPL(xdr_init_decode_pages);
659
641static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes) 660static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)
642{ 661{
643 __be32 *p = xdr->p; 662 __be32 *p = xdr->p;