author	Linus Torvalds <torvalds@linux-foundation.org>	2015-07-02 14:32:23 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-07-02 14:32:23 -0400
commit	8688d9540cc6e17df4cba71615e27f04e0378fe6 (patch)
tree	45ab333822188966217f6a3ec7e8289ca7eced72
parent	320cd413faefe2d30f4ee9651efddec5141bc95b (diff)
parent	b4839ebe21fc5d543b933d83644981ea73e9ba36 (diff)
Merge tag 'nfs-for-4.2-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
Pull NFS client updates from Trond Myklebust:
 "Highlights include:

  Stable patches:
   - Fix a crash in the NFSv4 file locking code.
   - Fix an fsync() regression, where we were failing to retry I/O in
     some circumstances.
   - Fix an infinite loop in NFSv4.0 OPEN stateid recovery
   - Fix a memory leak when an attempted pnfs fails.
   - Fix a memory leak in the backchannel code
   - Large hostnames were not supported correctly in NFSv4.1
   - Fix a pNFS/flexfiles bug that was impeding error reporting on I/O.
   - Fix a couple of credential issues in pNFS/flexfiles

  Bugfixes + cleanups:
   - Open flag sanity checks in the NFSv4 atomic open codepath
   - More NFSv4 delegation related bugfixes
   - Various NFSv4.1 backchannel bugfixes and cleanups
   - Fix the NFS swap socket code
   - Various cleanups of the NFSv4 SETCLIENTID and EXCHANGE_ID code
   - Fix a UDP transport deadlock issue

  Features:
   - More RDMA client transport improvements
   - NFSv4.2 LAYOUTSTATS functionality for pnfs flexfiles"

* tag 'nfs-for-4.2-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (87 commits)
  nfs: Remove invalid tk_pid from debug message
  nfs: Remove invalid NFS_ATTR_FATTR_V4_REFERRAL checking in nfs4_get_rootfh
  nfs: Drop bad comment in nfs41_walk_client_list()
  nfs: Remove unneeded micro checking of CONFIG_PROC_FS
  nfs: Don't setting FILE_CREATED flags always
  nfs: Use remove_proc_subtree() instead remove_proc_entry()
  nfs: Remove unused argument in nfs_server_set_fsinfo()
  nfs: Fix a memory leak when meeting an unsupported state protect
  nfs: take extra reference to fl->fl_file when running a LOCKU operation
  NFSv4: When returning a delegation, don't reclaim an incompatible open mode.
  NFSv4.2: LAYOUTSTATS is optional to implement
  NFSv4.2: Fix up a decoding error in layoutstats
  pNFS/flexfiles: Fix the reset of struct pgio_header when resending
  pNFS/flexfiles: Turn off layoutcommit for servers that don't need it
  pnfs/flexfiles: protect ktime manipulation with mirror lock
  nfs: provide pnfs_report_layoutstat when NFS42 is disabled
  nfs: verify open flags before allowing open
  nfs: always update creds in mirror, even when we have an already connected ds
  nfs: fix potential credential leak in ff_layout_update_mirror_cred
  pnfs/flexfiles: report layoutstat regularly
  ...
-rw-r--r--	fs/nfs/callback.c	6
-rw-r--r--	fs/nfs/callback_proc.c	38
-rw-r--r--	fs/nfs/callback_xdr.c	2
-rw-r--r--	fs/nfs/client.c	40
-rw-r--r--	fs/nfs/dir.c	3
-rw-r--r--	fs/nfs/file.c	15
-rw-r--r--	fs/nfs/flexfilelayout/flexfilelayout.c	480
-rw-r--r--	fs/nfs/flexfilelayout/flexfilelayout.h	33
-rw-r--r--	fs/nfs/flexfilelayout/flexfilelayoutdev.c	7
-rw-r--r--	fs/nfs/inode.c	12
-rw-r--r--	fs/nfs/nfs3xdr.c	2
-rw-r--r--	fs/nfs/nfs42.h	9
-rw-r--r--	fs/nfs/nfs42proc.c	87
-rw-r--r--	fs/nfs/nfs42xdr.c	106
-rw-r--r--	fs/nfs/nfs4_fs.h	1
-rw-r--r--	fs/nfs/nfs4client.c	1
-rw-r--r--	fs/nfs/nfs4file.c	4
-rw-r--r--	fs/nfs/nfs4getroot.c	7
-rw-r--r--	fs/nfs/nfs4idmap.c	7
-rw-r--r--	fs/nfs/nfs4proc.c	221
-rw-r--r--	fs/nfs/nfs4state.c	4
-rw-r--r--	fs/nfs/nfs4xdr.c	15
-rw-r--r--	fs/nfs/pagelist.c	10
-rw-r--r--	fs/nfs/pnfs.c	64
-rw-r--r--	fs/nfs/pnfs.h	13
-rw-r--r--	fs/nfs/write.c	9
-rw-r--r--	include/linux/nfs4.h	1
-rw-r--r--	include/linux/nfs_fs.h	1
-rw-r--r--	include/linux/nfs_fs_sb.h	1
-rw-r--r--	include/linux/nfs_page.h	1
-rw-r--r--	include/linux/nfs_xdr.h	51
-rw-r--r--	include/linux/sunrpc/bc_xprt.h	1
-rw-r--r--	include/linux/sunrpc/clnt.h	1
-rw-r--r--	include/linux/sunrpc/sched.h	19
-rw-r--r--	include/linux/sunrpc/xprt.h	39
-rw-r--r--	include/linux/sunrpc/xprtrdma.h	3
-rw-r--r--	net/sunrpc/Makefile	2
-rw-r--r--	net/sunrpc/backchannel_rqst.c	134
-rw-r--r--	net/sunrpc/bc_svc.c	63
-rw-r--r--	net/sunrpc/clnt.c	109
-rw-r--r--	net/sunrpc/debugfs.c	78
-rw-r--r--	net/sunrpc/svc.c	36
-rw-r--r--	net/sunrpc/xprt.c	7
-rw-r--r--	net/sunrpc/xprtrdma/fmr_ops.c	120
-rw-r--r--	net/sunrpc/xprtrdma/frwr_ops.c	227
-rw-r--r--	net/sunrpc/xprtrdma/physical_ops.c	14
-rw-r--r--	net/sunrpc/xprtrdma/rpc_rdma.c	8
-rw-r--r--	net/sunrpc/xprtrdma/transport.c	43
-rw-r--r--	net/sunrpc/xprtrdma/verbs.c	253
-rw-r--r--	net/sunrpc/xprtrdma/xprt_rdma.h	38
-rw-r--r--	net/sunrpc/xprtsock.c	151
51 files changed, 1858 insertions(+), 739 deletions(-)
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 8d129bb7355a..682529c00996 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -458,7 +458,7 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
  * pg_authenticate method for nfsv4 callback threads.
  *
  * The authflavor has been negotiated, so an incorrect flavor is a server
- * bug. Drop packets with incorrect authflavor.
+ * bug. Deny packets with incorrect authflavor.
  *
  * All other checking done after NFS decoding where the nfs_client can be
  * found in nfs4_callback_compound
@@ -468,12 +468,12 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
 	switch (rqstp->rq_authop->flavour) {
 	case RPC_AUTH_NULL:
 		if (rqstp->rq_proc != CB_NULL)
-			return SVC_DROP;
+			return SVC_DENIED;
 		break;
 	case RPC_AUTH_GSS:
 		/* No RPC_AUTH_GSS support yet in NFSv4.1 */
 		if (svc_is_backchannel(rqstp))
-			return SVC_DROP;
+			return SVC_DENIED;
 	}
 	return SVC_OK;
 }
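
Editor's note: the fix above changes the verdict, not the check. A minimal sketch of why SVC_DENIED is the right verdict here — the handler below is hypothetical, but the constants are the real sunrpc ones, and the comment describes their documented behavior:

```c
/* Sketch (hypothetical handler): SVC_OK continues to XDR decode and
 * dispatch, SVC_DENIED sends an RPC-level authentication error back,
 * SVC_DROP discards silently and leaves the client retransmitting
 * until it times out. Since the backchannel authflavor was already
 * negotiated, a wrong flavor is a server bug that should be reported
 * immediately rather than swallowed. */
static int cb_authenticate_sketch(u32 flavour, u32 proc)
{
	switch (flavour) {
	case RPC_AUTH_NULL:
		if (proc != CB_NULL)
			return SVC_DENIED;	/* was SVC_DROP */
		break;
	case RPC_AUTH_GSS:
		/* no GSS on the NFSv4.1 backchannel yet */
		return SVC_DENIED;
	}
	return SVC_OK;
}
```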
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 197806fb87ff..29e3c1b011b7 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -327,10 +327,8 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
 	dprintk("%s slot table seqid: %u\n", __func__, slot->seq_nr);
 
 	/* Normal */
-	if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
-		slot->seq_nr++;
+	if (likely(args->csa_sequenceid == slot->seq_nr + 1))
 		goto out_ok;
-	}
 
 	/* Replay */
 	if (args->csa_sequenceid == slot->seq_nr) {
@@ -418,6 +416,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 				struct cb_process_state *cps)
 {
 	struct nfs4_slot_table *tbl;
+	struct nfs4_slot *slot;
 	struct nfs_client *clp;
 	int i;
 	__be32 status = htonl(NFS4ERR_BADSESSION);
@@ -429,25 +428,32 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 
 	if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
 		goto out;
+
 	tbl = &clp->cl_session->bc_slot_table;
+	slot = tbl->slots + args->csa_slotid;
 
 	spin_lock(&tbl->slot_tbl_lock);
 	/* state manager is resetting the session */
 	if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
-		spin_unlock(&tbl->slot_tbl_lock);
 		status = htonl(NFS4ERR_DELAY);
 		/* Return NFS4ERR_BADSESSION if we're draining the session
 		 * in order to reset it.
 		 */
		if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
 			status = htonl(NFS4ERR_BADSESSION);
-		goto out;
+		goto out_unlock;
 	}
 
-	status = validate_seqid(&clp->cl_session->bc_slot_table, args);
-	spin_unlock(&tbl->slot_tbl_lock);
+	memcpy(&res->csr_sessionid, &args->csa_sessionid,
+	       sizeof(res->csr_sessionid));
+	res->csr_sequenceid = args->csa_sequenceid;
+	res->csr_slotid = args->csa_slotid;
+	res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+	res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+
+	status = validate_seqid(tbl, args);
 	if (status)
-		goto out;
+		goto out_unlock;
 
 	cps->slotid = args->csa_slotid;
 
@@ -458,15 +464,17 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 	 */
 	if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
 		status = htonl(NFS4ERR_DELAY);
-		goto out;
+		goto out_unlock;
 	}
 
-	memcpy(&res->csr_sessionid, &args->csa_sessionid,
-	       sizeof(res->csr_sessionid));
-	res->csr_sequenceid = args->csa_sequenceid;
-	res->csr_slotid = args->csa_slotid;
-	res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
-	res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+	/*
+	 * RFC5661 20.9.3
	 * If CB_SEQUENCE returns an error, then the state of the slot
	 * (sequence ID, cached reply) MUST NOT change.
	 */
+	slot->seq_nr++;
+out_unlock:
+	spin_unlock(&tbl->slot_tbl_lock);
 
 out:
 	cps->clp = clp;	/* put in nfs4_callback_compound */
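
Editor's note: the reordering above implements RFC 5661 §20.9.3 — the slot's sequence number is advanced only after every failure case has been ruled out, all under the slot table lock. A compressed control-flow sketch of the result (the condition names are shorthand placeholders, not the real identifiers):

```c
/* Control-flow sketch of nfs4_callback_sequence() after the patch:
 * every early exit between lock and out_unlock leaves slot->seq_nr
 * untouched, so an errored CB_SEQUENCE never mutates slot state. */
spin_lock(&tbl->slot_tbl_lock);
if (draining)				/* session being reset */
	goto out_unlock;		/* seq_nr unchanged */
if (validate_seqid(tbl, args))		/* replay or misordered */
	goto out_unlock;		/* seq_nr unchanged */
if (referring_call_outstanding)
	goto out_unlock;		/* seq_nr unchanged */
slot->seq_nr++;				/* success: commit the slot */
out_unlock:
spin_unlock(&tbl->slot_tbl_lock);
```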
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 19ca95cdfd9b..6b1697a01dde 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -909,7 +909,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
 	xdr_init_encode(&xdr_out, &rqstp->rq_res, p);
 
 	status = decode_compound_hdr_arg(&xdr_in, &hdr_arg);
-	if (status == __constant_htonl(NFS4ERR_RESOURCE))
+	if (status == htonl(NFS4ERR_RESOURCE))
 		return rpc_garbage_args;
 
 	if (hdr_arg.minorversion == 0) {
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 892aefff3630..ecebb406cc1a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -825,7 +825,6 @@ error:
  * Load up the server record from information gained in an fsinfo record
  */
 static void nfs_server_set_fsinfo(struct nfs_server *server,
-				  struct nfs_fh *mntfh,
 				  struct nfs_fsinfo *fsinfo)
 {
 	unsigned long max_rpc_payload;
@@ -901,7 +900,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs
 	if (error < 0)
 		goto out_error;
 
-	nfs_server_set_fsinfo(server, mntfh, &fsinfo);
+	nfs_server_set_fsinfo(server, &fsinfo);
 
 	/* Get some general file system info */
 	if (server->namelen == 0) {
@@ -1193,8 +1192,6 @@ void nfs_clients_init(struct net *net)
 }
 
 #ifdef CONFIG_PROC_FS
-static struct proc_dir_entry *proc_fs_nfs;
-
 static int nfs_server_list_open(struct inode *inode, struct file *file);
 static void *nfs_server_list_start(struct seq_file *p, loff_t *pos);
 static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos);
@@ -1364,27 +1361,29 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
 {
 	struct nfs_server *server;
 	struct nfs_client *clp;
-	char dev[8], fsid[17];
+	char dev[13];	// 8 for 2^24, 1 for ':', 3 for 2^8, 1 for '\0'
+	char fsid[34];	// 2 * 16 for %llx, 1 for ':', 1 for '\0'
 	struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
 
 	/* display header on line 1 */
 	if (v == &nn->nfs_volume_list) {
-		seq_puts(m, "NV SERVER PORT DEV FSID FSC\n");
+		seq_puts(m, "NV SERVER PORT DEV FSID"
+			    " FSC\n");
 		return 0;
 	}
 	/* display one transport per line on subsequent lines */
 	server = list_entry(v, struct nfs_server, master_link);
 	clp = server->nfs_client;
 
-	snprintf(dev, 8, "%u:%u",
+	snprintf(dev, sizeof(dev), "%u:%u",
 		 MAJOR(server->s_dev), MINOR(server->s_dev));
 
-	snprintf(fsid, 17, "%llx:%llx",
+	snprintf(fsid, sizeof(fsid), "%llx:%llx",
 		 (unsigned long long) server->fsid.major,
 		 (unsigned long long) server->fsid.minor);
 
 	rcu_read_lock();
-	seq_printf(m, "v%u %s %s %-7s %-17s %s\n",
+	seq_printf(m, "v%u %s %s %-12s %-33s %s\n",
 		   clp->rpc_ops->version,
 		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
 		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
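
Editor's note: the old dev[8]/fsid[17] buffers silently truncated large device numbers and fsids. A self-contained, runnable illustration of the failure mode and of the sizeof() idiom the fix adopts (sample values chosen to match the patch's own sizing comment):

```c
#include <stdio.h>

/* Standalone demo (not kernel code): a 24-bit major plus an 8-bit
 * minor can render as "16777215:255" - 12 characters plus NUL, which
 * overflows the old 8-byte buffer. Passing sizeof(buf) instead of a
 * hand-counted literal keeps the bound and the buffer in sync. */
int main(void)
{
	char old_buf[8], new_buf[13];

	snprintf(old_buf, sizeof(old_buf), "%u:%u", 16777215u, 255u);
	snprintf(new_buf, sizeof(new_buf), "%u:%u", 16777215u, 255u);
	printf("old='%s' new='%s'\n", old_buf, new_buf);
	/* prints: old='1677721' new='16777215:255' */
	return 0;
}
```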
@@ -1434,27 +1433,20 @@ void nfs_fs_proc_net_exit(struct net *net)
  */
 int __init nfs_fs_proc_init(void)
 {
-	struct proc_dir_entry *p;
-
-	proc_fs_nfs = proc_mkdir("fs/nfsfs", NULL);
-	if (!proc_fs_nfs)
+	if (!proc_mkdir("fs/nfsfs", NULL))
 		goto error_0;
 
 	/* a file of servers with which we're dealing */
-	p = proc_symlink("servers", proc_fs_nfs, "../../net/nfsfs/servers");
-	if (!p)
+	if (!proc_symlink("fs/nfsfs/servers", NULL, "../../net/nfsfs/servers"))
 		goto error_1;
 
 	/* a file of volumes that we have mounted */
-	p = proc_symlink("volumes", proc_fs_nfs, "../../net/nfsfs/volumes");
-	if (!p)
-		goto error_2;
-	return 0;
+	if (!proc_symlink("fs/nfsfs/volumes", NULL, "../../net/nfsfs/volumes"))
+		goto error_1;
 
-error_2:
-	remove_proc_entry("servers", proc_fs_nfs);
+	return 0;
 error_1:
-	remove_proc_entry("fs/nfsfs", NULL);
+	remove_proc_subtree("fs/nfsfs", NULL);
 error_0:
 	return -ENOMEM;
 }
@@ -1464,9 +1456,7 @@ error_0:
  */
 void nfs_fs_proc_exit(void)
 {
-	remove_proc_entry("volumes", proc_fs_nfs);
-	remove_proc_entry("servers", proc_fs_nfs);
-	remove_proc_entry("fs/nfsfs", NULL);
+	remove_proc_subtree("fs/nfsfs", NULL);
 }
 
 #endif /* CONFIG_PROC_FS */
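
Editor's note: the rewrite leans on two procfs conveniences — the creation helpers accept a full path with a NULL parent, and remove_proc_subtree() tears a directory down recursively — so the cached proc_dir_entry pointer and the entry-by-entry unwind both disappear. A minimal sketch of the pattern, with hypothetical names:

```c
#include <linux/init.h>
#include <linux/proc_fs.h>

/* Sketch with hypothetical paths, mirroring nfs_fs_proc_init(): create
 * by full path, and unwind everything with one recursive removal. */
static int __init example_proc_init(void)
{
	if (!proc_mkdir("fs/example", NULL))
		return -ENOMEM;
	if (!proc_symlink("fs/example/servers", NULL,
			  "../../net/example/servers")) {
		/* one call removes everything created so far */
		remove_proc_subtree("fs/example", NULL);
		return -ENOMEM;
	}
	return 0;
}

static void example_proc_exit(void)
{
	remove_proc_subtree("fs/example", NULL);
}
```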
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b2c8b31b2be7..21457bb0edd6 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1470,9 +1470,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
 {
 	int err;
 
-	if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
-		*opened |= FILE_CREATED;
-
 	err = finish_open(file, dentry, do_open, opened);
 	if (err)
 		goto out;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 8b8d83a526ce..cc4fa1ed61fc 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -555,31 +555,22 @@ static int nfs_launder_page(struct page *page)
 	return nfs_wb_page(inode, page);
 }
 
-#ifdef CONFIG_NFS_SWAP
 static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 						sector_t *span)
 {
-	int ret;
 	struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
 
 	*span = sis->pages;
 
-	rcu_read_lock();
-	ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1);
-	rcu_read_unlock();
-
-	return ret;
+	return rpc_clnt_swap_activate(clnt);
 }
 
 static void nfs_swap_deactivate(struct file *file)
 {
 	struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
 
-	rcu_read_lock();
-	xs_swapper(rcu_dereference(clnt->cl_xprt), 0);
-	rcu_read_unlock();
+	rpc_clnt_swap_deactivate(clnt);
 }
-#endif
 
 const struct address_space_operations nfs_file_aops = {
 	.readpage = nfs_readpage,
@@ -596,10 +587,8 @@ const struct address_space_operations nfs_file_aops = {
 	.launder_page = nfs_launder_page,
 	.is_dirty_writeback = nfs_check_dirty_writeback,
 	.error_remove_page = generic_error_remove_page,
-#ifdef CONFIG_NFS_SWAP
 	.swap_activate = nfs_swap_activate,
 	.swap_deactivate = nfs_swap_deactivate,
-#endif
 };
 
 /*
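
Editor's note: the open-coded xs_swapper() calls move behind rpc_clnt_swap_activate()/rpc_clnt_swap_deactivate() (added to sunrpc elsewhere in this series, per the "Fix the NFS swap socket code" item), and the CONFIG_NFS_SWAP guards vanish from the call sites. The usual way a header makes that possible is to supply no-op stubs when the option is off; the following is a sketch of that pattern under the assumption that the sunrpc header does the same (config symbol shown for illustration):

```c
/* Header-side sketch: real declarations when swap-over-NFS is built
 * in, inline stubs otherwise, so callers compile without #ifdefs. */
#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
int rpc_clnt_swap_activate(struct rpc_clnt *clnt);
void rpc_clnt_swap_deactivate(struct rpc_clnt *clnt);
#else
static inline int rpc_clnt_swap_activate(struct rpc_clnt *clnt)
{
	return -EINVAL;
}
static inline void rpc_clnt_swap_deactivate(struct rpc_clnt *clnt)
{
}
#endif
```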
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 7d05089e52d6..c12951b9551e 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -20,6 +20,7 @@
 #include "../nfs4trace.h"
 #include "../iostat.h"
 #include "../nfs.h"
+#include "../nfs42.h"
 
 #define NFSDBG_FACILITY NFSDBG_PNFS_LD
 
@@ -182,17 +183,14 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
 
 static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
 {
-	struct nfs4_ff_layout_mirror *tmp;
 	int i, j;
 
 	for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
 		for (j = i + 1; j < fls->mirror_array_cnt; j++)
 			if (fls->mirror_array[i]->efficiency <
-			    fls->mirror_array[j]->efficiency) {
-				tmp = fls->mirror_array[i];
-				fls->mirror_array[i] = fls->mirror_array[j];
-				fls->mirror_array[j] = tmp;
-			}
+			    fls->mirror_array[j]->efficiency)
+				swap(fls->mirror_array[i],
+				     fls->mirror_array[j]);
 	}
 }
 
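
For reference, this is the type-generic swap() helper from <linux/kernel.h> that replaces the open-coded three-assignment exchange; typeof() makes it work for any lvalue type, including the mirror pointers here:

```c
/* swap() as defined in <linux/kernel.h> at the time of this series. */
#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
```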
@@ -274,6 +272,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 
 		spin_lock_init(&fls->mirror_array[i]->lock);
 		fls->mirror_array[i]->ds_count = ds_count;
+		fls->mirror_array[i]->lseg = &fls->generic_hdr;
 
 		/* deviceid */
 		rc = decode_deviceid(&stream, &devid);
@@ -344,6 +343,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 			fls->mirror_array[i]->gid);
 	}
 
+	p = xdr_inline_decode(&stream, 4);
+	if (p)
+		fls->flags = be32_to_cpup(p);
+
 	ff_layout_sort_mirrors(fls);
 	rc = ff_layout_check_layout(lgr);
 	if (rc)
@@ -415,6 +418,146 @@ ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
 	return 1;
 }
 
+static void
+nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer)
+{
+	/* first IO request? */
+	if (atomic_inc_return(&timer->n_ops) == 1) {
+		timer->start_time = ktime_get();
+	}
+}
+
+static ktime_t
+nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer)
+{
+	ktime_t start, now;
+
+	if (atomic_dec_return(&timer->n_ops) < 0)
+		WARN_ON_ONCE(1);
+
+	now = ktime_get();
+	start = timer->start_time;
+	timer->start_time = now;
+	return ktime_sub(now, start);
+}
+
+static ktime_t
+nfs4_ff_layout_calc_completion_time(struct rpc_task *task)
+{
+	return ktime_sub(ktime_get(), task->tk_start);
+}
+
+static bool
+nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
+			    struct nfs4_ff_layoutstat *layoutstat)
+{
+	static const ktime_t notime = {0};
+	ktime_t now = ktime_get();
+
+	nfs4_ff_start_busy_timer(&layoutstat->busy_timer);
+	if (ktime_equal(mirror->start_time, notime))
+		mirror->start_time = now;
+	if (ktime_equal(mirror->last_report_time, notime))
+		mirror->last_report_time = now;
+	if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
+			FF_LAYOUTSTATS_REPORT_INTERVAL) {
+		mirror->last_report_time = now;
+		return true;
+	}
+
+	return false;
+}
+
+static void
+nfs4_ff_layout_stat_io_update_requested(struct nfs4_ff_layoutstat *layoutstat,
+		__u64 requested)
+{
+	struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+
+	iostat->ops_requested++;
+	iostat->bytes_requested += requested;
+}
+
+static void
+nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
+		__u64 requested,
+		__u64 completed,
+		ktime_t time_completed)
+{
+	struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+	ktime_t timer;
+
+	iostat->ops_completed++;
+	iostat->bytes_completed += completed;
+	iostat->bytes_not_delivered += requested - completed;
+
+	timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer);
+	iostat->total_busy_time =
+			ktime_add(iostat->total_busy_time, timer);
+	iostat->aggregate_completion_time =
+			ktime_add(iostat->aggregate_completion_time, time_completed);
+}
+
+static void
+nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror,
+		__u64 requested)
+{
+	bool report;
+
+	spin_lock(&mirror->lock);
+	report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat);
+	nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
+	spin_unlock(&mirror->lock);
+
+	if (report)
+		pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+}
+
+static void
+nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
+		struct nfs4_ff_layout_mirror *mirror,
+		__u64 requested,
+		__u64 completed)
+{
+	spin_lock(&mirror->lock);
+	nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
+			requested, completed,
+			nfs4_ff_layout_calc_completion_time(task));
+	spin_unlock(&mirror->lock);
+}
+
+static void
+nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror,
+		__u64 requested)
+{
+	bool report;
+
+	spin_lock(&mirror->lock);
+	report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat);
+	nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
+	spin_unlock(&mirror->lock);
+
+	if (report)
+		pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+}
+
+static void
+nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
+		struct nfs4_ff_layout_mirror *mirror,
+		__u64 requested,
+		__u64 completed,
+		enum nfs3_stable_how committed)
+{
+	if (committed == NFS_UNSTABLE)
+		requested = completed = 0;
+
+	spin_lock(&mirror->lock);
+	nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
+			requested, completed,
+			nfs4_ff_layout_calc_completion_time(task));
+	spin_unlock(&mirror->lock);
+}
+
 static int
 ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
 			    struct nfs_commit_info *cinfo,
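
Editor's note: nfs4_ff_layoutstat_start_io() above doubles as a rate limiter — it returns true, telling the caller to fire a LAYOUTSTATS report, at most once per FF_LAYOUTSTATS_REPORT_INTERVAL (60 s) per mirror. A standalone model of that gate, with the kernel's ktime and locking replaced by plain milliseconds:

```c
#include <stdbool.h>
#include <stdint.h>

#define REPORT_INTERVAL_MS 60000	/* FF_LAYOUTSTATS_REPORT_INTERVAL */

/* Standalone model of the interval gate: the first call stamps the
 * baseline, later calls return true ("send a report now") at most
 * once per interval. The kernel does this under mirror->lock. */
static bool start_io_should_report(uint64_t now_ms, uint64_t *last_report_ms)
{
	if (*last_report_ms == 0)
		*last_report_ms = now_ms;
	if (now_ms - *last_report_ms >= REPORT_INTERVAL_MS) {
		*last_report_ms = now_ms;
		return true;
	}
	return false;
}
```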
@@ -631,7 +774,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
 			nfs_direct_set_resched_writes(hdr->dreq);
 			/* fake unstable write to let common nfs resend pages */
 			hdr->verf.committed = NFS_UNSTABLE;
-			hdr->good_bytes = 0;
+			hdr->good_bytes = hdr->args.count;
 		}
 		return;
 	}
@@ -879,6 +1022,12 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
 	return 0;
 }
 
+static bool
+ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg)
+{
+	return !(FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_LAYOUTCOMMIT);
+}
+
 /*
  * We reference the rpc_cred of the first WRITE that triggers the need for
  * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
@@ -891,6 +1040,9 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
 static void
 ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
 {
+	if (!ff_layout_need_layoutcommit(hdr->lseg))
+		return;
+
 	pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
 			hdr->mds_offset + hdr->res.count);
 	dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
@@ -909,6 +1061,10 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
 static int ff_layout_read_prepare_common(struct rpc_task *task,
 					 struct nfs_pgio_header *hdr)
 {
+	nfs4_ff_layout_stat_io_start_read(
+			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+			hdr->args.count);
+
 	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
 		rpc_exit(task, -EIO);
 		return -EIO;
@@ -962,15 +1118,15 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
 {
 	struct nfs_pgio_header *hdr = data;
 
-	if (ff_layout_read_prepare_common(task, hdr))
-		return;
-
 	if (ff_layout_setup_sequence(hdr->ds_clp,
 			&hdr->args.seq_args,
 			&hdr->res.seq_res,
 			task))
 		return;
 
+	if (ff_layout_read_prepare_common(task, hdr))
+		return;
+
 	if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
 			hdr->args.lock_context, FMODE_READ) == -EIO)
 		rpc_exit(task, -EIO); /* lost lock, terminate I/O */
@@ -982,6 +1138,10 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data)
 
 	dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
 
+	nfs4_ff_layout_stat_io_end_read(task,
+			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+			hdr->args.count, hdr->res.count);
+
 	if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
 	    task->tk_status == 0) {
 		nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1074,7 +1234,8 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
 		return -EAGAIN;
 	}
 
-	if (data->verf.committed == NFS_UNSTABLE)
+	if (data->verf.committed == NFS_UNSTABLE
+	    && ff_layout_need_layoutcommit(data->lseg))
 		pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
 
 	return 0;
@@ -1083,6 +1244,10 @@
 static int ff_layout_write_prepare_common(struct rpc_task *task,
 					  struct nfs_pgio_header *hdr)
 {
+	nfs4_ff_layout_stat_io_start_write(
+			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+			hdr->args.count);
+
 	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
 		rpc_exit(task, -EIO);
 		return -EIO;
@@ -1116,15 +1281,15 @@ static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
 {
 	struct nfs_pgio_header *hdr = data;
 
-	if (ff_layout_write_prepare_common(task, hdr))
-		return;
-
 	if (ff_layout_setup_sequence(hdr->ds_clp,
 			&hdr->args.seq_args,
 			&hdr->res.seq_res,
 			task))
 		return;
 
+	if (ff_layout_write_prepare_common(task, hdr))
+		return;
+
 	if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
 			hdr->args.lock_context, FMODE_WRITE) == -EIO)
 		rpc_exit(task, -EIO); /* lost lock, terminate I/O */
@@ -1134,6 +1299,11 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data)
 {
 	struct nfs_pgio_header *hdr = data;
 
+	nfs4_ff_layout_stat_io_end_write(task,
+			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+			hdr->args.count, hdr->res.count,
+			hdr->res.verf->committed);
+
 	if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
 	    task->tk_status == 0) {
 		nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1152,8 +1322,17 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
 		&NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
 }
 
+static void ff_layout_commit_prepare_common(struct rpc_task *task,
+		struct nfs_commit_data *cdata)
+{
+	nfs4_ff_layout_stat_io_start_write(
+			FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+			0);
+}
+
 static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
 {
+	ff_layout_commit_prepare_common(task, data);
 	rpc_call_start(task);
 }
 
@@ -1161,10 +1340,30 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
 {
 	struct nfs_commit_data *wdata = data;
 
-	ff_layout_setup_sequence(wdata->ds_clp,
+	if (ff_layout_setup_sequence(wdata->ds_clp,
 			&wdata->args.seq_args,
 			&wdata->res.seq_res,
-			task);
+			task))
+		return;
+	ff_layout_commit_prepare_common(task, data);
+}
+
+static void ff_layout_commit_done(struct rpc_task *task, void *data)
+{
+	struct nfs_commit_data *cdata = data;
+	struct nfs_page *req;
+	__u64 count = 0;
+
+	if (task->tk_status == 0) {
+		list_for_each_entry(req, &cdata->pages, wb_list)
+			count += req->wb_bytes;
+	}
+
+	nfs4_ff_layout_stat_io_end_write(task,
+			FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+			count, count, NFS_FILE_SYNC);
+
+	pnfs_generic_write_commit_done(task, data);
 }
 
 static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
@@ -1205,14 +1404,14 @@ static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
 
 static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
 	.rpc_call_prepare = ff_layout_commit_prepare_v3,
-	.rpc_call_done = pnfs_generic_write_commit_done,
+	.rpc_call_done = ff_layout_commit_done,
 	.rpc_count_stats = ff_layout_commit_count_stats,
 	.rpc_release = pnfs_generic_commit_release,
 };
 
 static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
 	.rpc_call_prepare = ff_layout_commit_prepare_v4,
-	.rpc_call_done = pnfs_generic_write_commit_done,
+	.rpc_call_done = ff_layout_commit_done,
 	.rpc_count_stats = ff_layout_commit_count_stats,
 	.rpc_release = pnfs_generic_commit_release,
 };
@@ -1256,7 +1455,6 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
 	fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
 	if (fh)
 		hdr->args.fh = fh;
-
 	/*
 	 * Note that if we ever decide to split across DSes,
 	 * then we may need to handle dense-like offsets.
@@ -1385,6 +1583,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
 	fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
 	if (fh)
 		data->args.fh = fh;
+
 	return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
 				   vers == 3 ? &ff_layout_commit_call_ops_v3 :
 					       &ff_layout_commit_call_ops_v4,
@@ -1488,6 +1687,247 @@ out:
 	dprintk("%s: Return\n", __func__);
 }
 
+static int
+ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen)
+{
+	const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+
+	return snprintf(buf, buflen, "%pI4", &sin->sin_addr);
+}
+
+static size_t
+ff_layout_ntop6_noscopeid(const struct sockaddr *sap, char *buf,
+			  const int buflen)
+{
+	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+	const struct in6_addr *addr = &sin6->sin6_addr;
+
+	/*
+	 * RFC 4291, Section 2.2.2
+	 *
+	 * Shorthanded ANY address
+	 */
+	if (ipv6_addr_any(addr))
+		return snprintf(buf, buflen, "::");
+
+	/*
+	 * RFC 4291, Section 2.2.2
+	 *
+	 * Shorthanded loopback address
+	 */
+	if (ipv6_addr_loopback(addr))
+		return snprintf(buf, buflen, "::1");
+
+	/*
+	 * RFC 4291, Section 2.2.3
+	 *
+	 * Special presentation address format for mapped v4
+	 * addresses.
+	 */
+	if (ipv6_addr_v4mapped(addr))
+		return snprintf(buf, buflen, "::ffff:%pI4",
+				&addr->s6_addr32[3]);
+
+	/*
+	 * RFC 4291, Section 2.2.1
+	 */
+	return snprintf(buf, buflen, "%pI6c", addr);
+}
+
+/* Derived from rpc_sockaddr2uaddr */
+static void
+ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da)
+{
+	struct sockaddr *sap = (struct sockaddr *)&da->da_addr;
+	char portbuf[RPCBIND_MAXUADDRPLEN];
+	char addrbuf[RPCBIND_MAXUADDRLEN];
+	char *netid;
+	unsigned short port;
+	int len, netid_len;
+	__be32 *p;
+
+	switch (sap->sa_family) {
+	case AF_INET:
+		if (ff_layout_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0)
+			return;
+		port = ntohs(((struct sockaddr_in *)sap)->sin_port);
+		netid = "tcp";
+		netid_len = 3;
+		break;
+	case AF_INET6:
+		if (ff_layout_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0)
+			return;
+		port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port);
+		netid = "tcp6";
+		netid_len = 4;
+		break;
+	default:
+		/* we only support tcp and tcp6 */
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff);
+	len = strlcat(addrbuf, portbuf, sizeof(addrbuf));
+
+	p = xdr_reserve_space(xdr, 4 + netid_len);
+	xdr_encode_opaque(p, netid, netid_len);
+
+	p = xdr_reserve_space(xdr, 4 + len);
+	xdr_encode_opaque(p, addrbuf, len);
+}
+
+static void
+ff_layout_encode_nfstime(struct xdr_stream *xdr,
+			 ktime_t t)
+{
+	struct timespec64 ts;
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 12);
+	ts = ktime_to_timespec64(t);
+	p = xdr_encode_hyper(p, ts.tv_sec);
+	*p++ = cpu_to_be32(ts.tv_nsec);
+}
+
+static void
+ff_layout_encode_io_latency(struct xdr_stream *xdr,
+			    struct nfs4_ff_io_stat *stat)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 5 * 8);
+	p = xdr_encode_hyper(p, stat->ops_requested);
+	p = xdr_encode_hyper(p, stat->bytes_requested);
+	p = xdr_encode_hyper(p, stat->ops_completed);
+	p = xdr_encode_hyper(p, stat->bytes_completed);
+	p = xdr_encode_hyper(p, stat->bytes_not_delivered);
+	ff_layout_encode_nfstime(xdr, stat->total_busy_time);
+	ff_layout_encode_nfstime(xdr, stat->aggregate_completion_time);
+}
+
+static void
+ff_layout_encode_layoutstats(struct xdr_stream *xdr,
+			     struct nfs42_layoutstat_args *args,
+			     struct nfs42_layoutstat_devinfo *devinfo)
+{
+	struct nfs4_ff_layout_mirror *mirror = devinfo->layout_private;
+	struct nfs4_pnfs_ds_addr *da;
+	struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds;
+	struct nfs_fh *fh = &mirror->fh_versions[0];
+	__be32 *p, *start;
+
+	da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node);
+	dprintk("%s: DS %s: encoding address %s\n",
+		__func__, ds->ds_remotestr, da->da_remotestr);
+	/* layoutupdate length */
+	start = xdr_reserve_space(xdr, 4);
+	/* netaddr4 */
+	ff_layout_encode_netaddr(xdr, da);
+	/* nfs_fh4 */
+	p = xdr_reserve_space(xdr, 4 + fh->size);
+	xdr_encode_opaque(p, fh->data, fh->size);
+	/* ff_io_latency4 read */
+	spin_lock(&mirror->lock);
+	ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat);
+	/* ff_io_latency4 write */
+	ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat);
+	spin_unlock(&mirror->lock);
+	/* nfstime4 */
+	ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time));
+	/* bool */
+	p = xdr_reserve_space(xdr, 4);
+	*p = cpu_to_be32(false);
+
+	*start = cpu_to_be32((xdr->p - start - 1) * 4);
+}
+
+static bool
+ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args,
+			       struct pnfs_layout_segment *pls,
+			       int *dev_count, int dev_limit)
+{
+	struct nfs4_ff_layout_mirror *mirror;
+	struct nfs4_deviceid_node *dev;
+	struct nfs42_layoutstat_devinfo *devinfo;
+	int i;
+
+	for (i = 0; i <= FF_LAYOUT_MIRROR_COUNT(pls); i++) {
+		if (*dev_count >= dev_limit)
+			break;
+		mirror = FF_LAYOUT_COMP(pls, i);
+		if (!mirror || !mirror->mirror_ds)
+			continue;
+		dev = FF_LAYOUT_DEVID_NODE(pls, i);
+		devinfo = &args->devinfo[*dev_count];
+		memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
+		devinfo->offset = pls->pls_range.offset;
+		devinfo->length = pls->pls_range.length;
+		/* well, we don't really know if IO is continuous or not! */
+		devinfo->read_count = mirror->read_stat.io_stat.bytes_completed;
+		devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
+		devinfo->write_count = mirror->write_stat.io_stat.bytes_completed;
+		devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
+		devinfo->layout_type = LAYOUT_FLEX_FILES;
+		devinfo->layoutstats_encode = ff_layout_encode_layoutstats;
+		devinfo->layout_private = mirror;
+		/* lseg refcount put in cleanup_layoutstats */
+		pnfs_get_lseg(pls);
+
+		++(*dev_count);
+	}
+
+	return *dev_count < dev_limit;
+}
+
+static int
+ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
+{
+	struct pnfs_layout_segment *pls;
+	int dev_count = 0;
+
+	spin_lock(&args->inode->i_lock);
+	list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
+		dev_count += FF_LAYOUT_MIRROR_COUNT(pls);
+	}
+	spin_unlock(&args->inode->i_lock);
+	/* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
+	if (dev_count > PNFS_LAYOUTSTATS_MAXDEV) {
+		dprintk("%s: truncating devinfo to limit (%d:%d)\n",
+			__func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV);
+		dev_count = PNFS_LAYOUTSTATS_MAXDEV;
+	}
+	args->devinfo = kmalloc(dev_count * sizeof(*args->devinfo), GFP_KERNEL);
+	if (!args->devinfo)
+		return -ENOMEM;
+
+	dev_count = 0;
+	spin_lock(&args->inode->i_lock);
+	list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
+		if (!ff_layout_mirror_prepare_stats(args, pls, &dev_count,
+						    PNFS_LAYOUTSTATS_MAXDEV)) {
+			break;
+		}
+	}
+	spin_unlock(&args->inode->i_lock);
+	args->num_dev = dev_count;
+
+	return 0;
+}
+
+static void
+ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data)
+{
+	struct nfs4_ff_layout_mirror *mirror;
+	int i;
+
+	for (i = 0; i < data->args.num_dev; i++) {
+		mirror = data->args.devinfo[i].layout_private;
+		data->args.devinfo[i].layout_private = NULL;
+		pnfs_put_lseg(mirror->lseg);
+	}
+}
+
 static struct pnfs_layoutdriver_type flexfilelayout_type = {
 	.id			= LAYOUT_FLEX_FILES,
 	.name			= "LAYOUT_FLEX_FILES",
@@ -1510,6 +1950,8 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
 	.alloc_deviceid_node    = ff_layout_alloc_deviceid_node,
 	.encode_layoutreturn    = ff_layout_encode_layoutreturn,
 	.sync			= pnfs_nfs_generic_sync,
+	.prepare_layoutstats	= ff_layout_prepare_layoutstats,
+	.cleanup_layoutstats	= ff_layout_cleanup_layoutstats,
 };
 
 static int __init nfs4flexfilelayout_init(void)
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 070f20445b2d..f92f9a0a856b 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -9,12 +9,17 @@
 #ifndef FS_NFS_NFS4FLEXFILELAYOUT_H
 #define FS_NFS_NFS4FLEXFILELAYOUT_H
 
+#define FF_FLAGS_NO_LAYOUTCOMMIT 1
+
 #include "../pnfs.h"
 
 /* XXX: Let's filter out insanely large mirror count for now to avoid oom
  * due to network error etc. */
 #define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096
 
+/* LAYOUTSTATS report interval in ms */
+#define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L)
+
 struct nfs4_ff_ds_version {
 	u32				version;
 	u32				minor_version;
@@ -41,24 +46,48 @@ struct nfs4_ff_layout_ds_err {
 	struct nfs4_deviceid		deviceid;
 };
 
+struct nfs4_ff_io_stat {
+	__u64				ops_requested;
+	__u64				bytes_requested;
+	__u64				ops_completed;
+	__u64				bytes_completed;
+	__u64				bytes_not_delivered;
+	ktime_t				total_busy_time;
+	ktime_t				aggregate_completion_time;
+};
+
+struct nfs4_ff_busy_timer {
+	ktime_t				start_time;
+	atomic_t			n_ops;
+};
+
+struct nfs4_ff_layoutstat {
+	struct nfs4_ff_io_stat		io_stat;
+	struct nfs4_ff_busy_timer	busy_timer;
+};
+
 struct nfs4_ff_layout_mirror {
+	struct pnfs_layout_segment	*lseg; /* back pointer */
 	u32				ds_count;
 	u32				efficiency;
 	struct nfs4_ff_layout_ds	*mirror_ds;
 	u32				fh_versions_cnt;
 	struct nfs_fh			*fh_versions;
 	nfs4_stateid			stateid;
-	struct nfs4_string		user_name;
-	struct nfs4_string		group_name;
 	u32				uid;
 	u32				gid;
 	struct rpc_cred			*cred;
 	spinlock_t			lock;
+	struct nfs4_ff_layoutstat	read_stat;
+	struct nfs4_ff_layoutstat	write_stat;
+	ktime_t				start_time;
+	ktime_t				last_report_time;
 };
 
 struct nfs4_ff_layout_segment {
 	struct pnfs_layout_segment	generic_hdr;
 	u64				stripe_unit;
+	u32				flags;
 	u32				mirror_array_cnt;
 	struct nfs4_ff_layout_mirror	**mirror_array;
 };
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 77a2d026aa12..f13e1969eedd 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -324,7 +324,8 @@ static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror,
 				__func__, PTR_ERR(cred));
 			return PTR_ERR(cred);
 		} else {
-			mirror->cred = cred;
+			if (cmpxchg(&mirror->cred, NULL, cred))
+				put_rpccred(cred);
 		}
 	}
 	return 0;
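
Editor's note: this closes the credential leak named in the merge summary — two tasks could both look up a cred and the second plain assignment overwrote (and leaked) the first. cmpxchg() publishes the cred only if the field is still NULL; the loser drops its reference. A userspace model of the same pattern with C11 atomics:

```c
#include <stdatomic.h>

/* Userspace model of the cmpxchg publish above: install obj only if
 * *slot is still NULL; if another thread won the race, release our
 * copy instead of overwriting (and leaking) theirs. */
static void publish_once(void *_Atomic *slot, void *obj,
			 void (*release)(void *))
{
	void *expected = NULL;

	if (!atomic_compare_exchange_strong(slot, &expected, obj))
		release(obj);	/* lost the race: drop our reference */
}
```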
@@ -386,7 +387,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
 	/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
 	smp_rmb();
 	if (ds->ds_clp)
-		goto out;
+		goto out_update_creds;
 
 	flavor = nfs4_ff_layout_choose_authflavor(mirror);
 
@@ -430,7 +431,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
 			}
 		}
 	}
-
+out_update_creds:
 	if (ff_layout_update_mirror_cred(mirror, ds))
 		ds = NULL;
 out:
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index f734562c6d24..b77b328a06d7 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -678,6 +678,8 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	if (!err) {
 		generic_fillattr(inode, stat);
 		stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
+		if (S_ISDIR(inode->i_mode))
+			stat->blksize = NFS_SERVER(inode)->dtsize;
 	}
 out:
 	trace_nfs_getattr_exit(inode, err);
@@ -2008,17 +2010,15 @@ static int __init init_nfs_fs(void)
 	if (err)
 		goto out1;
 
-#ifdef CONFIG_PROC_FS
 	rpc_proc_register(&init_net, &nfs_rpcstat);
-#endif
-	if ((err = register_nfs_fs()) != 0)
+
+	err = register_nfs_fs();
+	if (err)
 		goto out0;
 
 	return 0;
 out0:
-#ifdef CONFIG_PROC_FS
 	rpc_proc_unregister(&init_net, "nfs");
-#endif
 	nfs_destroy_directcache();
 out1:
 	nfs_destroy_writepagecache();
@@ -2049,9 +2049,7 @@ static void __exit exit_nfs_fs(void)
 	nfs_destroy_nfspagecache();
 	nfs_fscache_unregister();
 	unregister_pernet_subsys(&nfs_net_ops);
-#ifdef CONFIG_PROC_FS
 	rpc_proc_unregister(&init_net, "nfs");
-#endif
 	unregister_nfs_fs();
 	nfs_fs_proc_exit();
 	nfsiod_stop();
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 53852a4bd88b..9b04c2e6fffc 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1342,7 +1342,7 @@ static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
 	if (args->npages != 0)
 		xdr_write_pages(xdr, args->pages, 0, args->len);
 	else
-		xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE);
+		xdr_reserve_space(xdr, args->len);
 
 	error = nfsacl_encode(xdr->buf, base, args->inode,
 			    (args->mask & NFS_ACL) ?
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index 7afb8947dfdf..ff66ae700b89 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -5,11 +5,18 @@
 #ifndef __LINUX_FS_NFS_NFS4_2_H
 #define __LINUX_FS_NFS_NFS4_2_H
 
+/*
+ * FIXME: four LAYOUTSTATS calls per compound at most! Do we need to support
+ * more? Need to consider not to pre-alloc too much for a compound.
+ */
+#define PNFS_LAYOUTSTATS_MAXDEV (4)
+
 /* nfs4.2proc.c */
 int nfs42_proc_allocate(struct file *, loff_t, loff_t);
 int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
 loff_t nfs42_proc_llseek(struct file *, loff_t, int);
-
+int nfs42_proc_layoutstats_generic(struct nfs_server *,
+				   struct nfs42_layoutstat_data *);
 /* nfs4.2xdr.h */
 extern struct rpc_procinfo nfs4_2_procedures[];
 
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 3a9e75235f30..f486b80f927a 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -10,6 +10,11 @@
 #include <linux/nfs_fs.h>
 #include "nfs4_fs.h"
 #include "nfs42.h"
+#include "iostat.h"
+#include "pnfs.h"
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS
 
 static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file,
 				fmode_t fmode)
@@ -165,3 +170,85 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
 
 	return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
 }
+
+static void
+nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs42_layoutstat_data *data = calldata;
+	struct nfs_server *server = NFS_SERVER(data->args.inode);
+
+	nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args,
+			     &data->res.seq_res, task);
+}
+
+static void
+nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs42_layoutstat_data *data = calldata;
+
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return;
+
+	switch (task->tk_status) {
+	case 0:
+		break;
+	case -ENOTSUPP:
+	case -EOPNOTSUPP:
+		NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
+	default:
+		dprintk("%s server returns %d\n", __func__, task->tk_status);
+	}
+}
+
+static void
+nfs42_layoutstat_release(void *calldata)
+{
+	struct nfs42_layoutstat_data *data = calldata;
+	struct nfs_server *nfss = NFS_SERVER(data->args.inode);
+
+	if (nfss->pnfs_curr_ld->cleanup_layoutstats)
+		nfss->pnfs_curr_ld->cleanup_layoutstats(data);
+
+	pnfs_put_layout_hdr(NFS_I(data->args.inode)->layout);
+	smp_mb__before_atomic();
+	clear_bit(NFS_INO_LAYOUTSTATS, &NFS_I(data->args.inode)->flags);
+	smp_mb__after_atomic();
+	nfs_iput_and_deactive(data->inode);
+	kfree(data->args.devinfo);
+	kfree(data);
+}
+
+static const struct rpc_call_ops nfs42_layoutstat_ops = {
+	.rpc_call_prepare = nfs42_layoutstat_prepare,
+	.rpc_call_done = nfs42_layoutstat_done,
+	.rpc_release = nfs42_layoutstat_release,
+};
+
+int nfs42_proc_layoutstats_generic(struct nfs_server *server,
+				   struct nfs42_layoutstat_data *data)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTSTATS],
+		.rpc_argp = &data->args,
+		.rpc_resp = &data->res,
+	};
+	struct rpc_task_setup task_setup = {
+		.rpc_client = server->client,
+		.rpc_message = &msg,
+		.callback_ops = &nfs42_layoutstat_ops,
+		.callback_data = data,
+		.flags = RPC_TASK_ASYNC,
+	};
+	struct rpc_task *task;
+
+	data->inode = nfs_igrab_and_active(data->args.inode);
+	if (!data->inode) {
+		nfs42_layoutstat_release(data);
+		return -EAGAIN;
+	}
+	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+	task = rpc_run_task(&task_setup);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	return 0;
+}
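
Editor's note: nfs42_proc_layoutstats_generic() is fire-and-forget — RPC_TASK_ASYNC plus an rpc_call_ops table whose .rpc_release always runs last once the task exists. That is why the failure path before rpc_run_task() has to invoke nfs42_layoutstat_release() by hand. A shape sketch of the lifecycle, with hypothetical callback names:

```c
/* Lifecycle sketch of an async sunrpc call, as used above:
 *   .rpc_call_prepare - set up session/sequence state, may requeue
 *   .rpc_call_done    - inspect task->tk_status, e.g. clear a server
 *                       capability on -EOPNOTSUPP as done here
 *   .rpc_release      - always runs last and owns freeing calldata,
 *                       so pre-rpc_run_task() errors must call it
 */
static const struct rpc_call_ops example_async_ops = {
	.rpc_call_prepare	= example_prepare,	/* hypothetical */
	.rpc_call_done		= example_done,		/* hypothetical */
	.rpc_release		= example_release,	/* hypothetical */
};
```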
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 1a25b27248f2..a6bd27da6286 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -4,6 +4,8 @@
 #ifndef __LINUX_FS_NFS_NFS4_2XDR_H
 #define __LINUX_FS_NFS_NFS4_2XDR_H
 
+#include "nfs42.h"
+
 #define encode_fallocate_maxsz	(encode_stateid_maxsz + \
 				 2 /* offset */ + \
 				 2 /* length */)
@@ -22,6 +24,16 @@
 					 1 /* whence */ + \
 					 2 /* offset */ + \
 					 2 /* length */)
+#define encode_io_info_maxsz	4
+#define encode_layoutstats_maxsz	(op_decode_hdr_maxsz + \
+					2 /* offset */ + \
+					2 /* length */ + \
+					encode_stateid_maxsz + \
+					encode_io_info_maxsz + \
+					encode_io_info_maxsz + \
+					1 /* opaque devaddr4 length */ + \
+					XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE))
+#define decode_layoutstats_maxsz	(op_decode_hdr_maxsz)
 
 #define NFS4_enc_allocate_sz	(compound_encode_hdr_maxsz + \
 				 encode_putfh_maxsz + \
@@ -45,6 +57,14 @@
45#define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \ 57#define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \
46 decode_putfh_maxsz + \ 58 decode_putfh_maxsz + \
47 decode_seek_maxsz) 59 decode_seek_maxsz)
60#define NFS4_enc_layoutstats_sz (compound_encode_hdr_maxsz + \
61 encode_sequence_maxsz + \
62 encode_putfh_maxsz + \
63 PNFS_LAYOUTSTATS_MAXDEV * encode_layoutstats_maxsz)
64#define NFS4_dec_layoutstats_sz (compound_decode_hdr_maxsz + \
65 decode_sequence_maxsz + \
66 decode_putfh_maxsz + \
67 PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz)
48 68
49 69
50static void encode_fallocate(struct xdr_stream *xdr, 70static void encode_fallocate(struct xdr_stream *xdr,
@@ -81,6 +101,33 @@ static void encode_seek(struct xdr_stream *xdr,
81 encode_uint32(xdr, args->sa_what); 101 encode_uint32(xdr, args->sa_what);
82} 102}
83 103
104static void encode_layoutstats(struct xdr_stream *xdr,
105 struct nfs42_layoutstat_args *args,
106 struct nfs42_layoutstat_devinfo *devinfo,
107 struct compound_hdr *hdr)
108{
109 __be32 *p;
110
111 encode_op_hdr(xdr, OP_LAYOUTSTATS, decode_layoutstats_maxsz, hdr);
112 p = reserve_space(xdr, 8 + 8);
113 p = xdr_encode_hyper(p, devinfo->offset);
114 p = xdr_encode_hyper(p, devinfo->length);
115 encode_nfs4_stateid(xdr, &args->stateid);
116 p = reserve_space(xdr, 4*8 + NFS4_DEVICEID4_SIZE + 4);
117 p = xdr_encode_hyper(p, devinfo->read_count);
118 p = xdr_encode_hyper(p, devinfo->read_bytes);
119 p = xdr_encode_hyper(p, devinfo->write_count);
120 p = xdr_encode_hyper(p, devinfo->write_bytes);
121 p = xdr_encode_opaque_fixed(p, devinfo->dev_id.data,
122 NFS4_DEVICEID4_SIZE);
123 /* Encode layoutupdate4 */
124 *p++ = cpu_to_be32(devinfo->layout_type);
125 if (devinfo->layoutstats_encode != NULL)
126 devinfo->layoutstats_encode(xdr, args, devinfo);
127 else
128 encode_uint32(xdr, 0);
129}
130
84/* 131/*
85 * Encode ALLOCATE request 132 * Encode ALLOCATE request
86 */ 133 */
@@ -137,6 +184,28 @@ static void nfs4_xdr_enc_seek(struct rpc_rqst *req,
137 encode_nops(&hdr); 184 encode_nops(&hdr);
138} 185}
139 186
187/*
188 * Encode LAYOUTSTATS request
189 */
190static void nfs4_xdr_enc_layoutstats(struct rpc_rqst *req,
191 struct xdr_stream *xdr,
192 struct nfs42_layoutstat_args *args)
193{
194 int i;
195
196 struct compound_hdr hdr = {
197 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
198 };
199
200 encode_compound_hdr(xdr, req, &hdr);
201 encode_sequence(xdr, &args->seq_args, &hdr);
202 encode_putfh(xdr, args->fh, &hdr);
203 WARN_ON(args->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
204 for (i = 0; i < args->num_dev; i++)
205 encode_layoutstats(xdr, args, &args->devinfo[i], &hdr);
206 encode_nops(&hdr);
207}
208
140static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) 209static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
141{ 210{
142 return decode_op_hdr(xdr, OP_ALLOCATE); 211 return decode_op_hdr(xdr, OP_ALLOCATE);
@@ -169,6 +238,12 @@ out_overflow:
169 return -EIO; 238 return -EIO;
170} 239}
171 240
241static int decode_layoutstats(struct xdr_stream *xdr,
242 struct nfs42_layoutstat_res *res)
243{
244 return decode_op_hdr(xdr, OP_LAYOUTSTATS);
245}
246
172/* 247/*
173 * Decode ALLOCATE request 248 * Decode ALLOCATE request
174 */ 249 */
@@ -246,4 +321,35 @@ static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp,
246out: 321out:
247 return status; 322 return status;
248} 323}
324
325/*
326 * Decode LAYOUTSTATS request
327 */
328static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp,
329 struct xdr_stream *xdr,
330 struct nfs42_layoutstat_res *res)
331{
332 struct compound_hdr hdr;
333 int status, i;
334
335 status = decode_compound_hdr(xdr, &hdr);
336 if (status)
337 goto out;
338 status = decode_sequence(xdr, &res->seq_res, rqstp);
339 if (status)
340 goto out;
341 status = decode_putfh(xdr);
342 if (status)
343 goto out;
344 WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
345 for (i = 0; i < res->num_dev; i++) {
346 status = decode_layoutstats(xdr, res);
347 if (status)
348 goto out;
349 }
350out:
351 res->rpc_status = status;
352 return status;
353}
354
249#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */ 355#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index fdef424b0cd3..ea3bee919a76 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -233,6 +233,7 @@ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception
 extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *,
 			  struct rpc_message *, struct nfs4_sequence_args *,
 			  struct nfs4_sequence_res *, int);
+extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int);
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
 extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
 extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index e42be52a8c18..3aa6a9ba5113 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -676,7 +676,6 @@ found:
 		break;
 	}
 
-	/* No matching nfs_client found. */
 	spin_unlock(&nn->nfs_client_lock);
 	dprintk("NFS: <-- %s status = %d\n", __func__, status);
 	nfs_put_client(prev);
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index f58c17b3b480..dcd39d4e2efe 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -41,6 +41,10 @@ nfs4_file_open(struct inode *inode, struct file *filp)
 
 	dprintk("NFS: open file(%pd2)\n", dentry);
 
+	err = nfs_check_flags(openflags);
+	if (err)
+		return err;
+
 	if ((openflags & O_ACCMODE) == 3)
 		openflags--;
 
diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c
index c0b3a16b4a00..039b3eb6d834 100644
--- a/fs/nfs/nfs4getroot.c
+++ b/fs/nfs/nfs4getroot.c
@@ -35,13 +35,6 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p
 		goto out;
 	}
 
-	if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
-		printk(KERN_ERR "nfs4_get_rootfh:"
-		       " getroot obtained referral\n");
-		ret = -EREMOTE;
-		goto out;
-	}
-
 	memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid));
 out:
 	nfs_free_fattr(fsinfo.fattr);
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 2e1737c40a29..535dfc69c628 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -494,12 +494,7 @@ nfs_idmap_delete(struct nfs_client *clp)
 
 int nfs_idmap_init(void)
 {
-	int ret;
-	ret = nfs_idmap_init_keyring();
-	if (ret != 0)
-		goto out;
-out:
-	return ret;
+	return nfs_idmap_init_keyring();
 }
 
 void nfs_idmap_quit(void)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 55e1e3af23a3..6f228b5af819 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -356,6 +356,9 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
 		case 0:
 			return 0;
 		case -NFS4ERR_OPENMODE:
+		case -NFS4ERR_DELEG_REVOKED:
+		case -NFS4ERR_ADMIN_REVOKED:
+		case -NFS4ERR_BAD_STATEID:
 			if (inode && nfs4_have_delegation(inode, FMODE_READ)) {
 				nfs4_inode_return_delegation(inode);
 				exception->retry = 1;
@@ -367,15 +370,6 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
 			if (ret < 0)
 				break;
 			goto wait_on_recovery;
-		case -NFS4ERR_DELEG_REVOKED:
-		case -NFS4ERR_ADMIN_REVOKED:
-		case -NFS4ERR_BAD_STATEID:
-			if (state == NULL)
-				break;
-			ret = nfs4_schedule_stateid_recovery(server, state);
-			if (ret < 0)
-				break;
-			goto wait_on_recovery;
 		case -NFS4ERR_EXPIRED:
 			if (state != NULL) {
 				ret = nfs4_schedule_stateid_recovery(server, state);
@@ -482,8 +476,8 @@ struct nfs4_call_sync_data {
 	struct nfs4_sequence_res *seq_res;
 };
 
-static void nfs4_init_sequence(struct nfs4_sequence_args *args,
+void nfs4_init_sequence(struct nfs4_sequence_args *args,
 			       struct nfs4_sequence_res *res, int cache_reply)
 {
 	args->sa_slot = NULL;
 	args->sa_cache_this = cache_reply;
@@ -1553,6 +1547,13 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod
 	struct nfs4_state *newstate;
 	int ret;
 
+	if ((opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR ||
+	     opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEG_CUR_FH) &&
+	    (opendata->o_arg.u.delegation_type & fmode) != fmode)
+		/* This mode can't have been delegated, so we must have
+		 * a valid open_stateid to cover it - not need to reclaim.
+		 */
+		return 0;
 	opendata->o_arg.open_flags = 0;
 	opendata->o_arg.fmode = fmode;
 	opendata->o_arg.share_access = nfs4_map_atomic_open_share(
@@ -1684,6 +1685,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
 			"%d.\n", __func__, err);
 	case 0:
 	case -ENOENT:
+	case -EAGAIN:
 	case -ESTALE:
 		break;
 	case -NFS4ERR_BADSESSION:
@@ -3355,6 +3357,8 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
 			goto out;
 		case -NFS4ERR_MOVED:
 			err = nfs4_get_referral(client, dir, name, fattr, fhandle);
+			if (err == -NFS4ERR_MOVED)
+				err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception);
 			goto out;
 		case -NFS4ERR_WRONGSEC:
 			err = -EPERM;
@@ -4955,49 +4959,128 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
 	memcpy(bootverf->data, verf, sizeof(bootverf->data));
 }
 
-static unsigned int
-nfs4_init_nonuniform_client_string(struct nfs_client *clp,
-				   char *buf, size_t len)
+static int
+nfs4_init_nonuniform_client_string(struct nfs_client *clp)
 {
-	unsigned int result;
+	int result;
+	size_t len;
+	char *str;
+	bool retried = false;
 
 	if (clp->cl_owner_id != NULL)
-		return strlcpy(buf, clp->cl_owner_id, len);
+		return 0;
+retry:
+	rcu_read_lock();
+	len = 10 + strlen(clp->cl_ipaddr) + 1 +
+		strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) +
+		1 +
+		strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) +
+		1;
+	rcu_read_unlock();
+
+	if (len > NFS4_OPAQUE_LIMIT + 1)
+		return -EINVAL;
+
+	/*
+	 * Since this string is allocated at mount time, and held until the
+	 * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+	 * about a memory-reclaim deadlock.
+	 */
+	str = kmalloc(len, GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
 
 	rcu_read_lock();
-	result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s",
+	result = scnprintf(str, len, "Linux NFSv4.0 %s/%s %s",
 			clp->cl_ipaddr,
-			rpc_peeraddr2str(clp->cl_rpcclient,
-					RPC_DISPLAY_ADDR),
-			rpc_peeraddr2str(clp->cl_rpcclient,
-					RPC_DISPLAY_PROTO));
+			rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
+			rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO));
 	rcu_read_unlock();
-	clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
-	return result;
+
+	/* Did something change? */
+	if (result >= len) {
+		kfree(str);
+		if (retried)
+			return -EINVAL;
+		retried = true;
+		goto retry;
+	}
+	clp->cl_owner_id = str;
+	return 0;
 }
 
-static unsigned int
-nfs4_init_uniform_client_string(struct nfs_client *clp,
-				char *buf, size_t len)
+static int
+nfs4_init_uniquifier_client_string(struct nfs_client *clp)
+{
+	int result;
+	size_t len;
+	char *str;
+
+	len = 10 + 10 + 1 + 10 + 1 +
+		strlen(nfs4_client_id_uniquifier) + 1 +
+		strlen(clp->cl_rpcclient->cl_nodename) + 1;
+
+	if (len > NFS4_OPAQUE_LIMIT + 1)
+		return -EINVAL;
+
+	/*
+	 * Since this string is allocated at mount time, and held until the
+	 * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+	 * about a memory-reclaim deadlock.
+	 */
+	str = kmalloc(len, GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
+
+	result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
+			clp->rpc_ops->version, clp->cl_minorversion,
+			nfs4_client_id_uniquifier,
+			clp->cl_rpcclient->cl_nodename);
+	if (result >= len) {
+		kfree(str);
+		return -EINVAL;
+	}
+	clp->cl_owner_id = str;
+	return 0;
+}
+
+static int
+nfs4_init_uniform_client_string(struct nfs_client *clp)
 {
-	const char *nodename = clp->cl_rpcclient->cl_nodename;
-	unsigned int result;
+	int result;
+	size_t len;
+	char *str;
 
 	if (clp->cl_owner_id != NULL)
-		return strlcpy(buf, clp->cl_owner_id, len);
+		return 0;
 
 	if (nfs4_client_id_uniquifier[0] != '\0')
-		result = scnprintf(buf, len, "Linux NFSv%u.%u %s/%s",
-				clp->rpc_ops->version,
-				clp->cl_minorversion,
-				nfs4_client_id_uniquifier,
-				nodename);
-	else
-		result = scnprintf(buf, len, "Linux NFSv%u.%u %s",
-				clp->rpc_ops->version, clp->cl_minorversion,
-				nodename);
-	clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
-	return result;
+		return nfs4_init_uniquifier_client_string(clp);
+
+	len = 10 + 10 + 1 + 10 + 1 +
+		strlen(clp->cl_rpcclient->cl_nodename) + 1;
+
+	if (len > NFS4_OPAQUE_LIMIT + 1)
+		return -EINVAL;
+
+	/*
+	 * Since this string is allocated at mount time, and held until the
+	 * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+	 * about a memory-reclaim deadlock.
+	 */
+	str = kmalloc(len, GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
+
+	result = scnprintf(str, len, "Linux NFSv%u.%u %s",
+			clp->rpc_ops->version, clp->cl_minorversion,
+			clp->cl_rpcclient->cl_nodename);
+	if (result >= len) {
+		kfree(str);
+		return -EINVAL;
+	}
+	clp->cl_owner_id = str;
+	return 0;
 }
 
 /*
@@ -5044,7 +5127,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 	struct nfs4_setclientid setclientid = {
 		.sc_verifier = &sc_verifier,
 		.sc_prog = program,
-		.sc_cb_ident = clp->cl_cb_ident,
+		.sc_clnt = clp,
 	};
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
@@ -5064,16 +5147,15 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 
 	/* nfs_client_id4 */
 	nfs4_init_boot_verifier(clp, &sc_verifier);
+
 	if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags))
-		setclientid.sc_name_len =
-			nfs4_init_uniform_client_string(clp,
-					setclientid.sc_name,
-					sizeof(setclientid.sc_name));
+		status = nfs4_init_uniform_client_string(clp);
 	else
-		setclientid.sc_name_len =
-			nfs4_init_nonuniform_client_string(clp,
-					setclientid.sc_name,
-					sizeof(setclientid.sc_name));
+		status = nfs4_init_nonuniform_client_string(clp);
+
+	if (status)
+		goto out;
+
 	/* cb_client4 */
 	setclientid.sc_netid_len =
 		nfs4_init_callback_netid(clp,
@@ -5083,9 +5165,9 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 			sizeof(setclientid.sc_uaddr), "%s.%u.%u",
 			clp->cl_ipaddr, port >> 8, port & 255);
 
-	dprintk("NFS call setclientid auth=%s, '%.*s'\n",
+	dprintk("NFS call setclientid auth=%s, '%s'\n",
 		clp->cl_rpcclient->cl_auth->au_ops->au_name,
-		setclientid.sc_name_len, setclientid.sc_name);
+		clp->cl_owner_id);
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task)) {
 		status = PTR_ERR(task);
@@ -5402,6 +5484,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
 	atomic_inc(&lsp->ls_count);
 	/* Ensure we don't close file until we're done freeing locks! */
 	p->ctx = get_nfs_open_context(ctx);
+	get_file(fl->fl_file);
 	memcpy(&p->fl, fl, sizeof(p->fl));
 	p->server = NFS_SERVER(inode);
 	return p;
@@ -5413,6 +5496,7 @@ static void nfs4_locku_release_calldata(void *data)
 	nfs_free_seqid(calldata->arg.seqid);
 	nfs4_put_lock_state(calldata->lsp);
 	put_nfs_open_context(calldata->ctx);
+	fput(calldata->fl.fl_file);
 	kfree(calldata);
 }
 
@@ -6846,11 +6930,14 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 	};
 
 	nfs4_init_boot_verifier(clp, &verifier);
-	args.id_len = nfs4_init_uniform_client_string(clp, args.id,
-						      sizeof(args.id));
-	dprintk("NFS call exchange_id auth=%s, '%.*s'\n",
+
+	status = nfs4_init_uniform_client_string(clp);
+	if (status)
+		goto out;
+
+	dprintk("NFS call exchange_id auth=%s, '%s'\n",
 		clp->cl_rpcclient->cl_auth->au_ops->au_name,
-		args.id_len, args.id);
+		clp->cl_owner_id);
 
 	res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
 					GFP_NOFS);
@@ -6885,7 +6972,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 		/* unsupported! */
 		WARN_ON_ONCE(1);
 		status = -EINVAL;
-		goto out_server_scope;
+		goto out_impl_id;
 	}
 
 	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
@@ -6913,6 +7000,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 	/* use the most recent implementation id */
 	kfree(clp->cl_implid);
 	clp->cl_implid = res.impl_id;
+	res.impl_id = NULL;
 
 	if (clp->cl_serverscope != NULL &&
 	    !nfs41_same_server_scope(clp->cl_serverscope,
@@ -6926,15 +7014,16 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 
 		if (clp->cl_serverscope == NULL) {
 			clp->cl_serverscope = res.server_scope;
-			goto out;
+			res.server_scope = NULL;
 		}
-	} else
-		kfree(res.impl_id);
+	}
 
-out_server_owner:
-	kfree(res.server_owner);
+out_impl_id:
+	kfree(res.impl_id);
 out_server_scope:
 	kfree(res.server_scope);
+out_server_owner:
+	kfree(res.server_owner);
 out:
 	if (clp->cl_implid != NULL)
 		dprintk("NFS reply exchange_id: Server Implementation ID: "
@@ -8061,9 +8150,8 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
 	struct rpc_task *task;
 	int status = 0;
 
-	dprintk("NFS: %4d initiating layoutcommit call. sync %d "
-		"lbw: %llu inode %lu\n",
-		data->task.tk_pid, sync,
+	dprintk("NFS: initiating layoutcommit call. sync %d "
+		"lbw: %llu inode %lu\n", sync,
 		data->args.lastbytewritten,
 		data->args.inode->i_ino);
 
@@ -8557,7 +8645,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
 		| NFS_CAP_ATOMIC_OPEN_V1
 		| NFS_CAP_ALLOCATE
 		| NFS_CAP_DEALLOCATE
-		| NFS_CAP_SEEK,
+		| NFS_CAP_SEEK
+		| NFS_CAP_LAYOUTSTATS,
 	.init_client = nfs41_init_client,
 	.shutdown_client = nfs41_shutdown_client,
 	.match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2782cfca2265..605840dc89cf 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -309,7 +309,6 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 
 	if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state))
 		goto do_confirm;
-	nfs4_begin_drain_session(clp);
 	status = nfs4_proc_exchange_id(clp, cred);
 	if (status != 0)
 		goto out;
@@ -1482,6 +1481,8 @@ restart:
 				spin_unlock(&state->state_lock);
 			}
 			nfs4_put_open_state(state);
+			clear_bit(NFS4CLNT_RECLAIM_NOGRACE,
+				&state->flags);
 			spin_lock(&sp->so_lock);
 			goto restart;
 		}
@@ -1830,6 +1831,7 @@ static int nfs4_establish_lease(struct nfs_client *clp)
 		clp->cl_mvops->reboot_recovery_ops;
 	int status;
 
+	nfs4_begin_drain_session(clp);
 	cred = nfs4_get_clid_cred(clp);
 	if (cred == NULL)
 		return -ENOENT;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 0aea97841d30..558cd65dbdb7 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -139,7 +139,8 @@ static int nfs4_stat_to_errno(int);
 #define encode_setclientid_maxsz \
 				(op_encode_hdr_maxsz + \
 				XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \
-				XDR_QUADLEN(NFS4_SETCLIENTID_NAMELEN) + \
+				/* client name */ \
+				1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
 				1 /* sc_prog */ + \
 				1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \
 				1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \
@@ -288,7 +289,8 @@ static int nfs4_stat_to_errno(int);
 #define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \
 				encode_verifier_maxsz + \
 				1 /* co_ownerid.len */ + \
-				XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \
+				/* eia_clientowner */ \
+				1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
 				1 /* flags */ + \
 				1 /* spa_how */ + \
 				/* max is SP4_MACH_CRED (for now) */ + \
@@ -1667,13 +1669,14 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
 	encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr);
 	encode_nfs4_verifier(xdr, setclientid->sc_verifier);
 
-	encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
+	encode_string(xdr, strlen(setclientid->sc_clnt->cl_owner_id),
+			setclientid->sc_clnt->cl_owner_id);
 	p = reserve_space(xdr, 4);
 	*p = cpu_to_be32(setclientid->sc_prog);
 	encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
 	encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
 	p = reserve_space(xdr, 4);
-	*p = cpu_to_be32(setclientid->sc_cb_ident);
+	*p = cpu_to_be32(setclientid->sc_clnt->cl_cb_ident);
 }
 
 static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
@@ -1747,7 +1750,8 @@ static void encode_exchange_id(struct xdr_stream *xdr,
 	encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr);
 	encode_nfs4_verifier(xdr, args->verifier);
 
-	encode_string(xdr, args->id_len, args->id);
+	encode_string(xdr, strlen(args->client->cl_owner_id),
+			args->client->cl_owner_id);
 
 	encode_uint32(xdr, args->flags);
 	encode_uint32(xdr, args->state_protect.how);
@@ -7427,6 +7431,7 @@ struct rpc_procinfo nfs4_procedures[] = {
 	PROC(SEEK, enc_seek, dec_seek),
 	PROC(ALLOCATE, enc_allocate, dec_allocate),
 	PROC(DEALLOCATE, enc_deallocate, dec_deallocate),
+	PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats),
 #endif /* CONFIG_NFS_V4_2 */
 };
 
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 282b39369510..1da68d3b1eda 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -636,9 +636,8 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
 
 	hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
 
-	dprintk("NFS: %5u initiated pgio call "
+	dprintk("NFS: initiated pgio call "
 		"(req %s/%llu, %u bytes @ offset %llu)\n",
-		hdr->task.tk_pid,
 		hdr->inode->i_sb->s_id,
 		(unsigned long long)NFS_FILEID(hdr->inode),
 		hdr->args.count,
@@ -690,8 +689,6 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
 static void nfs_pgio_release(void *calldata)
 {
 	struct nfs_pgio_header *hdr = calldata;
-	if (hdr->rw_ops->rw_release)
-		hdr->rw_ops->rw_release(hdr);
 	nfs_pgio_data_destroy(hdr);
 	hdr->completion_ops->completion(hdr);
 }
@@ -711,7 +708,9 @@ static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror,
  * nfs_pageio_init - initialise a page io descriptor
  * @desc: pointer to descriptor
  * @inode: pointer to inode
- * @doio: pointer to io function
+ * @pg_ops: pointer to pageio operations
+ * @compl_ops: pointer to pageio completion operations
+ * @rw_ops: pointer to nfs read/write operations
  * @bsize: io block size
  * @io_flags: extra parameters for the io function
  */
@@ -1186,6 +1185,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
  * nfs_pageio_complete_mirror - Complete I/O on the current mirror of an
  *				nfs_pageio_descriptor
  * @desc: pointer to io descriptor
+ * @mirror_idx: pointer to mirror index
  */
 static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
 				       u32 mirror_idx)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 230606243be6..0ba9a02c9566 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -35,6 +35,7 @@
35#include "iostat.h" 35#include "iostat.h"
36#include "nfs4trace.h" 36#include "nfs4trace.h"
37#include "delegation.h" 37#include "delegation.h"
38#include "nfs42.h"
38 39
39#define NFSDBG_FACILITY NFSDBG_PNFS 40#define NFSDBG_FACILITY NFSDBG_PNFS
40#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) 41#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
@@ -1821,6 +1822,7 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
1821 /* Resend all requests through the MDS */ 1822 /* Resend all requests through the MDS */
1822 nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true, 1823 nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
1823 hdr->completion_ops); 1824 hdr->completion_ops);
1825 set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags);
1824 return nfs_pageio_resend(&pgio, hdr); 1826 return nfs_pageio_resend(&pgio, hdr);
1825} 1827}
1826EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); 1828EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
@@ -1865,6 +1867,7 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
1865 mirror->pg_recoalesce = 1; 1867 mirror->pg_recoalesce = 1;
1866 } 1868 }
1867 nfs_pgio_data_destroy(hdr); 1869 nfs_pgio_data_destroy(hdr);
1870 hdr->release(hdr);
1868} 1871}
1869 1872
1870static enum pnfs_try_status 1873static enum pnfs_try_status
@@ -1979,6 +1982,7 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
1979 mirror->pg_recoalesce = 1; 1982 mirror->pg_recoalesce = 1;
1980 } 1983 }
1981 nfs_pgio_data_destroy(hdr); 1984 nfs_pgio_data_destroy(hdr);
1985 hdr->release(hdr);
1982} 1986}
1983 1987
1984/* 1988/*
@@ -2247,3 +2251,63 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
2247 } 2251 }
2248 return thp; 2252 return thp;
2249} 2253}
2254
2255#if IS_ENABLED(CONFIG_NFS_V4_2)
2256int
2257pnfs_report_layoutstat(struct inode *inode)
2258{
2259 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
2260 struct nfs_server *server = NFS_SERVER(inode);
2261 struct nfs_inode *nfsi = NFS_I(inode);
2262 struct nfs42_layoutstat_data *data;
2263 struct pnfs_layout_hdr *hdr;
2264 int status = 0;
2265
2266 if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats)
2267 goto out;
2268
2269 if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS))
2270 goto out;
2271
2272 if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags))
2273 goto out;
2274
2275 spin_lock(&inode->i_lock);
2276 if (!NFS_I(inode)->layout) {
2277 spin_unlock(&inode->i_lock);
2278 goto out;
2279 }
2280 hdr = NFS_I(inode)->layout;
2281 pnfs_get_layout_hdr(hdr);
2282 spin_unlock(&inode->i_lock);
2283
2284 data = kzalloc(sizeof(*data), GFP_KERNEL);
2285 if (!data) {
2286 status = -ENOMEM;
2287 goto out_put;
2288 }
2289
2290 data->args.fh = NFS_FH(inode);
2291 data->args.inode = inode;
2292 nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid);
2293 status = ld->prepare_layoutstats(&data->args);
2294 if (status)
2295 goto out_free;
2296
2297 status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data);
2298
2299out:
2300 dprintk("%s returns %d\n", __func__, status);
2301 return status;
2302
2303out_free:
2304 kfree(data);
2305out_put:
2306 pnfs_put_layout_hdr(hdr);
2307 smp_mb__before_atomic();
2308 clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
2309 smp_mb__after_atomic();
2310 goto out;
2311}
2312EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
2313#endif
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 1e6308f82fc3..3e6ab7bfbabd 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -178,6 +178,8 @@ struct pnfs_layoutdriver_type {
 	void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo,
 				     struct xdr_stream *xdr,
 				     const struct nfs4_layoutcommit_args *args);
+	int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args);
+	void (*cleanup_layoutstats) (struct nfs42_layoutstat_data *data);
 };
 
 struct pnfs_layout_hdr {
@@ -290,7 +292,6 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
 struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
 void pnfs_error_mark_layout_for_return(struct inode *inode,
 				       struct pnfs_layout_segment *lseg);
-
 /* nfs4_deviceid_flags */
 enum {
 	NFS_DEVICEID_INVALID = 0,	/* set when MDS clientid recalled */
@@ -689,4 +690,14 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void)
 
 #endif /* CONFIG_NFS_V4_1 */
 
+#if IS_ENABLED(CONFIG_NFS_V4_2)
+int pnfs_report_layoutstat(struct inode *inode);
+#else
+static inline int
+pnfs_report_layoutstat(struct inode *inode)
+{
+	return 0;
+}
+#endif
+
 #endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e6c262555e08..65869ca9c851 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1290,6 +1290,7 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr,
 static void nfs_redirty_request(struct nfs_page *req)
 {
 	nfs_mark_request_dirty(req);
+	set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags);
 	nfs_unlock_request(req);
 	nfs_end_page_writeback(req);
 	nfs_release_request(req);
@@ -1348,11 +1349,6 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
 	NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
 }
 
-static void nfs_writeback_release_common(struct nfs_pgio_header *hdr)
-{
-	/* do nothing! */
-}
-
 /*
  * Special version of should_remove_suid() that ignores capabilities.
  */
@@ -1556,7 +1552,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
 	/* Set up the initial task struct. */
 	nfs_ops->commit_setup(data, &msg);
 
-	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
+	dprintk("NFS: initiated commit call\n");
 
 	nfs4_state_protect(NFS_SERVER(data->inode)->nfs_client,
 		NFS_SP4_MACH_CRED_COMMIT, &task_setup_data.rpc_client, &msg);
@@ -2013,7 +2009,6 @@ static const struct nfs_rw_ops nfs_rw_write_ops = {
 	.rw_mode = FMODE_WRITE,
 	.rw_alloc_header = nfs_writehdr_alloc,
 	.rw_free_header = nfs_writehdr_free,
-	.rw_release = nfs_writeback_release_common,
 	.rw_done = nfs_writeback_done,
 	.rw_result = nfs_writeback_result,
 	.rw_initiate = nfs_initiate_write,
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 32201c269890..b8e72aad919c 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -500,6 +500,7 @@ enum {
 	NFSPROC4_CLNT_SEEK,
 	NFSPROC4_CLNT_ALLOCATE,
 	NFSPROC4_CLNT_DEALLOCATE,
+	NFSPROC4_CLNT_LAYOUTSTATS,
 };
 
 /* nfs41 types */
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index b95f914ce083..f91b5ade30c9 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -219,6 +219,7 @@ struct nfs_inode {
 #define NFS_INO_COMMIT		(7)	/* inode is committing unstable writes */
 #define NFS_INO_LAYOUTCOMMIT	(9)	/* layoutcommit required */
 #define NFS_INO_LAYOUTCOMMITTING (10)	/* layoutcommit inflight */
+#define NFS_INO_LAYOUTSTATS	(11)	/* layoutstats inflight */
 
 static inline struct nfs_inode *NFS_I(const struct inode *inode)
 {
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 5e1273d4de14..a2ea1491d3df 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -237,5 +237,6 @@ struct nfs_server {
 #define NFS_CAP_SEEK		(1U << 19)
 #define NFS_CAP_ALLOCATE	(1U << 20)
 #define NFS_CAP_DEALLOCATE	(1U << 21)
+#define NFS_CAP_LAYOUTSTATS	(1U << 22)
 
 #endif
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 3eb072dbce83..f2f650f136ee 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -67,7 +67,6 @@ struct nfs_rw_ops {
 	const fmode_t rw_mode;
 	struct nfs_pgio_header *(*rw_alloc_header)(void);
 	void (*rw_free_header)(struct nfs_pgio_header *);
-	void (*rw_release)(struct nfs_pgio_header *);
 	int (*rw_done)(struct rpc_task *, struct nfs_pgio_header *,
 		       struct inode *);
 	void (*rw_result)(struct rpc_task *, struct nfs_pgio_header *);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 93ab6071bbe9..7bbe50504211 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -316,6 +316,49 @@ struct nfs4_layoutreturn {
 	int rpc_status;
 };
 
+#define PNFS_LAYOUTSTATS_MAXSIZE 256
+
+struct nfs42_layoutstat_args;
+struct nfs42_layoutstat_devinfo;
+typedef void (*layoutstats_encode_t)(struct xdr_stream *,
+		struct nfs42_layoutstat_args *,
+		struct nfs42_layoutstat_devinfo *);
+
+/* Per file per deviceid layoutstats */
+struct nfs42_layoutstat_devinfo {
+	struct nfs4_deviceid dev_id;
+	__u64 offset;
+	__u64 length;
+	__u64 read_count;
+	__u64 read_bytes;
+	__u64 write_count;
+	__u64 write_bytes;
+	__u32 layout_type;
+	layoutstats_encode_t layoutstats_encode;
+	void *layout_private;
+};
+
+struct nfs42_layoutstat_args {
+	struct nfs4_sequence_args seq_args;
+	struct nfs_fh *fh;
+	struct inode *inode;
+	nfs4_stateid stateid;
+	int num_dev;
+	struct nfs42_layoutstat_devinfo *devinfo;
+};
+
+struct nfs42_layoutstat_res {
+	struct nfs4_sequence_res seq_res;
+	int num_dev;
+	int rpc_status;
+};
+
+struct nfs42_layoutstat_data {
+	struct inode *inode;
+	struct nfs42_layoutstat_args args;
+	struct nfs42_layoutstat_res res;
+};
+
 struct stateowner_id {
 	__u64	create_time;
 	__u32	uniquifier;
@@ -984,17 +1027,14 @@ struct nfs4_readlink_res {
 	struct nfs4_sequence_res	seq_res;
 };
 
-#define NFS4_SETCLIENTID_NAMELEN	(127)
 struct nfs4_setclientid {
 	const nfs4_verifier *		sc_verifier;
-	unsigned int			sc_name_len;
-	char				sc_name[NFS4_SETCLIENTID_NAMELEN + 1];
 	u32				sc_prog;
 	unsigned int			sc_netid_len;
 	char				sc_netid[RPCBIND_MAXNETIDLEN + 1];
 	unsigned int			sc_uaddr_len;
 	char				sc_uaddr[RPCBIND_MAXUADDRLEN + 1];
-	u32				sc_cb_ident;
+	struct nfs_client		*sc_clnt;
 	struct rpc_cred			*sc_cred;
 };
 
@@ -1142,12 +1182,9 @@ struct nfs41_state_protection {
 	struct nfs4_op_map allow;
 };
 
-#define NFS4_EXCHANGE_ID_LEN	(48)
 struct nfs41_exchange_id_args {
 	struct nfs_client		*client;
 	nfs4_verifier			*verifier;
-	unsigned int			id_len;
-	char				id[NFS4_EXCHANGE_ID_LEN];
 	u32				flags;
 	struct nfs41_state_protection	state_protect;
 };
diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h
index 2ca67b55e0fe..8df43c9f11dc 100644
--- a/include/linux/sunrpc/bc_xprt.h
+++ b/include/linux/sunrpc/bc_xprt.h
@@ -37,7 +37,6 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied);
 void xprt_free_bc_request(struct rpc_rqst *req);
 int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs);
 void xprt_destroy_backchannel(struct rpc_xprt *, unsigned int max_reqs);
-int bc_send(struct rpc_rqst *req);
 
 /*
  * Determine if a shared backchannel is in use
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 598ba80ec30c..131032f15cc1 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -56,6 +56,7 @@ struct rpc_clnt {
 	struct rpc_rtt *	cl_rtt;		/* RTO estimator data */
 	const struct rpc_timeout *cl_timeout;	/* Timeout strategy */
 
+	atomic_t		cl_swapper;	/* swapfile count */
 	int			cl_nodelen;	/* nodename length */
 	char			cl_nodename[UNX_MAXNODENAME+1];
 	struct rpc_pipe_dir_head cl_pipedir_objects;
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 5f1e6bd4c316..d703f0ef37d8 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -205,8 +205,7 @@ struct rpc_wait_queue {
  */
 struct rpc_task *rpc_new_task(const struct rpc_task_setup *);
 struct rpc_task *rpc_run_task(const struct rpc_task_setup *);
-struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
-				const struct rpc_call_ops *ops);
+struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req);
 void		rpc_put_task(struct rpc_task *);
 void		rpc_put_task_async(struct rpc_task *);
 void		rpc_exit_task(struct rpc_task *);
@@ -269,4 +268,20 @@ static inline void rpc_assign_waitqueue_name(struct rpc_wait_queue *q,
 }
 #endif
 
+#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
+int rpc_clnt_swap_activate(struct rpc_clnt *clnt);
+void rpc_clnt_swap_deactivate(struct rpc_clnt *clnt);
+#else
+static inline int
+rpc_clnt_swap_activate(struct rpc_clnt *clnt)
+{
+	return -EINVAL;
+}
+
+static inline void
+rpc_clnt_swap_deactivate(struct rpc_clnt *clnt)
+{
+}
+#endif /* CONFIG_SUNRPC_SWAP */
+
 #endif /* _LINUX_SUNRPC_SCHED_H_ */
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 8b93ef53df3c..0fb9acbb4780 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -133,6 +133,9 @@ struct rpc_xprt_ops {
 	void		(*close)(struct rpc_xprt *xprt);
 	void		(*destroy)(struct rpc_xprt *xprt);
 	void		(*print_stats)(struct rpc_xprt *xprt, struct seq_file *seq);
+	int		(*enable_swap)(struct rpc_xprt *xprt);
+	void		(*disable_swap)(struct rpc_xprt *xprt);
+	void		(*inject_disconnect)(struct rpc_xprt *xprt);
 };
 
 /*
@@ -180,7 +183,7 @@ struct rpc_xprt {
 	atomic_t		num_reqs;	/* total slots */
 	unsigned long		state;		/* transport state */
 	unsigned char		resvport : 1;	/* use a reserved port */
-	unsigned int		swapper;	/* we're swapping over this
+	atomic_t		swapper;	/* we're swapping over this
 						   transport */
 	unsigned int		bind_index;	/* bind function index */
 
@@ -212,7 +215,8 @@ struct rpc_xprt {
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 	struct svc_serv		*bc_serv;	/* The RPC service which will */
 						/* process the callback */
-	unsigned int		bc_alloc_count;	/* Total number of preallocs */
+	int			bc_alloc_count;	/* Total number of preallocs */
+	atomic_t		bc_free_slots;
 	spinlock_t		bc_pa_lock;	/* Protects the preallocated
 						 * items */
 	struct list_head	bc_pa_list;	/* List of preallocated
@@ -241,6 +245,7 @@ struct rpc_xprt {
 	const char		*address_strings[RPC_DISPLAY_MAX];
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 	struct dentry		*debugfs;	/* debugfs directory */
+	atomic_t		inject_disconnect;
 #endif
 };
 
@@ -327,6 +332,18 @@ static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 *
 	return p + xprt->tsh_size;
 }
 
+static inline int
+xprt_enable_swap(struct rpc_xprt *xprt)
+{
+	return xprt->ops->enable_swap(xprt);
+}
+
+static inline void
+xprt_disable_swap(struct rpc_xprt *xprt)
+{
+	xprt->ops->disable_swap(xprt);
+}
+
 /*
  * Transport switch helper functions
  */
@@ -345,7 +362,6 @@ void xprt_release_rqst_cong(struct rpc_task *task);
 void xprt_disconnect_done(struct rpc_xprt *xprt);
 void xprt_force_disconnect(struct rpc_xprt *xprt);
 void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
-int xs_swapper(struct rpc_xprt *xprt, int enable);
 
 bool xprt_lock_connect(struct rpc_xprt *, struct rpc_task *, void *);
 void xprt_unlock_connect(struct rpc_xprt *, void *);
@@ -431,6 +447,23 @@ static inline int xprt_test_and_set_binding(struct rpc_xprt *xprt)
 	return test_and_set_bit(XPRT_BINDING, &xprt->state);
 }
 
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+extern unsigned int rpc_inject_disconnect;
+static inline void xprt_inject_disconnect(struct rpc_xprt *xprt)
+{
+	if (!rpc_inject_disconnect)
+		return;
+	if (atomic_dec_return(&xprt->inject_disconnect))
+		return;
+	atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect);
+	xprt->ops->inject_disconnect(xprt);
+}
+#else
+static inline void xprt_inject_disconnect(struct rpc_xprt *xprt)
+{
+}
+#endif
+
 #endif /* __KERNEL__*/
 
 #endif /* _LINUX_SUNRPC_XPRT_H */
diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index c984c85981ea..b17613052cc3 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -56,7 +56,8 @@
 
 #define RPCRDMA_INLINE_PAD_THRESH  (512)/* payload threshold to pad (bytes) */
 
-/* memory registration strategies */
+/* Memory registration strategies, by number.
+ * This is part of a kernel / user space API. Do not remove. */
 enum rpcrdma_memreg {
 	RPCRDMA_BOUNCEBUFFERS = 0,
 	RPCRDMA_REGISTER,
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 936ad0a15371..b512fbd9d79a 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -14,6 +14,6 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
 	    sunrpc_syms.o cache.o rpc_pipe.o \
 	    svc_xprt.o
 sunrpc-$(CONFIG_SUNRPC_DEBUG) += debugfs.o
-sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o bc_svc.o
+sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o
 sunrpc-$(CONFIG_PROC_FS) += stats.o
 sunrpc-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 9dd0ea8db463..9825ff0f91d6 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -37,16 +37,18 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 static inline int xprt_need_to_requeue(struct rpc_xprt *xprt)
 {
-	return xprt->bc_alloc_count > 0;
+	return xprt->bc_alloc_count < atomic_read(&xprt->bc_free_slots);
 }
 
 static inline void xprt_inc_alloc_count(struct rpc_xprt *xprt, unsigned int n)
 {
+	atomic_add(n, &xprt->bc_free_slots);
 	xprt->bc_alloc_count += n;
 }
 
 static inline int xprt_dec_alloc_count(struct rpc_xprt *xprt, unsigned int n)
 {
+	atomic_sub(n, &xprt->bc_free_slots);
 	return xprt->bc_alloc_count -= n;
 }
 
@@ -60,13 +62,62 @@ static void xprt_free_allocation(struct rpc_rqst *req)
 
 	dprintk("RPC: free allocations for req= %p\n", req);
 	WARN_ON_ONCE(test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
-	xbufp = &req->rq_private_buf;
+	xbufp = &req->rq_rcv_buf;
 	free_page((unsigned long)xbufp->head[0].iov_base);
 	xbufp = &req->rq_snd_buf;
 	free_page((unsigned long)xbufp->head[0].iov_base);
 	kfree(req);
 }
 
+static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags)
+{
+	struct page *page;
+	/* Preallocate one XDR receive buffer */
+	page = alloc_page(gfp_flags);
+	if (page == NULL)
+		return -ENOMEM;
+	buf->head[0].iov_base = page_address(page);
+	buf->head[0].iov_len = PAGE_SIZE;
+	buf->tail[0].iov_base = NULL;
+	buf->tail[0].iov_len = 0;
+	buf->page_len = 0;
+	buf->len = 0;
+	buf->buflen = PAGE_SIZE;
+	return 0;
+}
+
+static
+struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt, gfp_t gfp_flags)
+{
+	struct rpc_rqst *req;
+
+	/* Pre-allocate one backchannel rpc_rqst */
+	req = kzalloc(sizeof(*req), gfp_flags);
+	if (req == NULL)
+		return NULL;
+
+	req->rq_xprt = xprt;
+	INIT_LIST_HEAD(&req->rq_list);
+	INIT_LIST_HEAD(&req->rq_bc_list);
+
+	/* Preallocate one XDR receive buffer */
+	if (xprt_alloc_xdr_buf(&req->rq_rcv_buf, gfp_flags) < 0) {
+		printk(KERN_ERR "Failed to create bc receive xbuf\n");
+		goto out_free;
+	}
+	req->rq_rcv_buf.len = PAGE_SIZE;
+
+	/* Preallocate one XDR send buffer */
+	if (xprt_alloc_xdr_buf(&req->rq_snd_buf, gfp_flags) < 0) {
+		printk(KERN_ERR "Failed to create bc snd xbuf\n");
+		goto out_free;
+	}
+	return req;
+out_free:
+	xprt_free_allocation(req);
+	return NULL;
+}
+
 /*
  * Preallocate up to min_reqs structures and related buffers for use
  * by the backchannel. This function can be called multiple times
@@ -87,9 +138,7 @@ static void xprt_free_allocation(struct rpc_rqst *req)
  */
 int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
 {
-	struct page *page_rcv = NULL, *page_snd = NULL;
-	struct xdr_buf *xbufp = NULL;
-	struct rpc_rqst *req, *tmp;
+	struct rpc_rqst *req;
 	struct list_head tmp_list;
 	int i;
 
@@ -106,7 +155,7 @@ int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
 	INIT_LIST_HEAD(&tmp_list);
 	for (i = 0; i < min_reqs; i++) {
 		/* Pre-allocate one backchannel rpc_rqst */
-		req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL);
+		req = xprt_alloc_bc_req(xprt, GFP_KERNEL);
 		if (req == NULL) {
 			printk(KERN_ERR "Failed to create bc rpc_rqst\n");
 			goto out_free;
@@ -115,41 +164,6 @@ int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
 		/* Add the allocated buffer to the tmp list */
 		dprintk("RPC: adding req= %p\n", req);
 		list_add(&req->rq_bc_pa_list, &tmp_list);
-
-		req->rq_xprt = xprt;
-		INIT_LIST_HEAD(&req->rq_list);
-		INIT_LIST_HEAD(&req->rq_bc_list);
-
-		/* Preallocate one XDR receive buffer */
-		page_rcv = alloc_page(GFP_KERNEL);
-		if (page_rcv == NULL) {
-			printk(KERN_ERR "Failed to create bc receive xbuf\n");
-			goto out_free;
-		}
-		xbufp = &req->rq_rcv_buf;
-		xbufp->head[0].iov_base = page_address(page_rcv);
-		xbufp->head[0].iov_len = PAGE_SIZE;
-		xbufp->tail[0].iov_base = NULL;
-		xbufp->tail[0].iov_len = 0;
-		xbufp->page_len = 0;
-		xbufp->len = PAGE_SIZE;
-		xbufp->buflen = PAGE_SIZE;
-
-		/* Preallocate one XDR send buffer */
-		page_snd = alloc_page(GFP_KERNEL);
-		if (page_snd == NULL) {
-			printk(KERN_ERR "Failed to create bc snd xbuf\n");
-			goto out_free;
-		}
-
-		xbufp = &req->rq_snd_buf;
-		xbufp->head[0].iov_base = page_address(page_snd);
-		xbufp->head[0].iov_len = 0;
-		xbufp->tail[0].iov_base = NULL;
-		xbufp->tail[0].iov_len = 0;
-		xbufp->page_len = 0;
-		xbufp->len = 0;
-		xbufp->buflen = PAGE_SIZE;
 	}
 
 	/*
@@ -167,7 +181,10 @@ out_free:
 	/*
 	 * Memory allocation failed, free the temporary list
 	 */
-	list_for_each_entry_safe(req, tmp, &tmp_list, rq_bc_pa_list) {
+	while (!list_empty(&tmp_list)) {
+		req = list_first_entry(&tmp_list,
+				struct rpc_rqst,
+				rq_bc_pa_list);
 		list_del(&req->rq_bc_pa_list);
 		xprt_free_allocation(req);
 	}
@@ -217,9 +234,15 @@ static struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt, __be32 xid)
 	struct rpc_rqst *req = NULL;
 
 	dprintk("RPC: allocate a backchannel request\n");
220 if (list_empty(&xprt->bc_pa_list)) 237 if (atomic_read(&xprt->bc_free_slots) <= 0)
221 goto not_found; 238 goto not_found;
222 239 if (list_empty(&xprt->bc_pa_list)) {
240 req = xprt_alloc_bc_req(xprt, GFP_ATOMIC);
241 if (!req)
242 goto not_found;
243 /* Note: this 'free' request adds it to xprt->bc_pa_list */
244 xprt_free_bc_request(req);
245 }
223 req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst, 246 req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
224 rq_bc_pa_list); 247 rq_bc_pa_list);
225 req->rq_reply_bytes_recvd = 0; 248 req->rq_reply_bytes_recvd = 0;
@@ -245,11 +268,21 @@ void xprt_free_bc_request(struct rpc_rqst *req)
245 268
246 req->rq_connect_cookie = xprt->connect_cookie - 1; 269 req->rq_connect_cookie = xprt->connect_cookie - 1;
247 smp_mb__before_atomic(); 270 smp_mb__before_atomic();
248 WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
249 clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); 271 clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
250 smp_mb__after_atomic(); 272 smp_mb__after_atomic();
251 273
252 if (!xprt_need_to_requeue(xprt)) { 274 /*
275 * Return it to the list of preallocations so that it
276 * may be reused by a new callback request.
277 */
278 spin_lock_bh(&xprt->bc_pa_lock);
279 if (xprt_need_to_requeue(xprt)) {
280 list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
281 xprt->bc_alloc_count++;
282 req = NULL;
283 }
284 spin_unlock_bh(&xprt->bc_pa_lock);
285 if (req != NULL) {
253 /* 286 /*
254 * The last remaining session was destroyed while this 287 * The last remaining session was destroyed while this
255 * entry was in use. Free the entry and don't attempt 288 * entry was in use. Free the entry and don't attempt
@@ -260,14 +293,6 @@ void xprt_free_bc_request(struct rpc_rqst *req)
260 xprt_free_allocation(req); 293 xprt_free_allocation(req);
261 return; 294 return;
262 } 295 }
263
264 /*
265 * Return it to the list of preallocations so that it
266 * may be reused by a new callback request.
267 */
268 spin_lock_bh(&xprt->bc_pa_lock);
269 list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
270 spin_unlock_bh(&xprt->bc_pa_lock);
271} 296}
272 297
273/* 298/*
@@ -311,6 +336,7 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
311 336
312 spin_lock(&xprt->bc_pa_lock); 337 spin_lock(&xprt->bc_pa_lock);
313 list_del(&req->rq_bc_pa_list); 338 list_del(&req->rq_bc_pa_list);
339 xprt->bc_alloc_count--;
314 spin_unlock(&xprt->bc_pa_lock); 340 spin_unlock(&xprt->bc_pa_lock);
315 341
316 req->rq_private_buf.len = copied; 342 req->rq_private_buf.len = copied;
diff --git a/net/sunrpc/bc_svc.c b/net/sunrpc/bc_svc.c
deleted file mode 100644
index 15c7a8a1c24f..000000000000
--- a/net/sunrpc/bc_svc.c
+++ /dev/null
@@ -1,63 +0,0 @@
1/******************************************************************************
2
3(c) 2007 Network Appliance, Inc. All Rights Reserved.
4(c) 2009 NetApp. All Rights Reserved.
5
6NetApp provides this source code under the GPL v2 License.
7The GPL v2 license is available at
8http://opensource.org/licenses/gpl-license.php.
9
10THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
11"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
12LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
13A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
14CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
15EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
16PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
17PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
18LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
19NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
20SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
21
22******************************************************************************/
23
24/*
25 * The NFSv4.1 callback service helper routines.
26 * They implement the transport level processing required to send the
27 * reply over an existing open connection previously established by the client.
28 */
29
30#include <linux/module.h>
31
32#include <linux/sunrpc/xprt.h>
33#include <linux/sunrpc/sched.h>
34#include <linux/sunrpc/bc_xprt.h>
35
36#define RPCDBG_FACILITY RPCDBG_SVCDSP
37
38/* Empty callback ops */
39static const struct rpc_call_ops nfs41_callback_ops = {
40};
41
42
43/*
44 * Send the callback reply
45 */
46int bc_send(struct rpc_rqst *req)
47{
48 struct rpc_task *task;
49 int ret;
50
51 dprintk("RPC: bc_send req= %p\n", req);
52 task = rpc_run_bc_task(req, &nfs41_callback_ops);
53 if (IS_ERR(task))
54 ret = PTR_ERR(task);
55 else {
56 WARN_ON_ONCE(atomic_read(&task->tk_count) != 1);
57 ret = task->tk_status;
58 rpc_put_task(task);
59 }
60 dprintk("RPC: bc_send ret= %d\n", ret);
61 return ret;
62}
63
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index e6ce1517367f..cbc6af923dd1 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -891,15 +891,8 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
891 task->tk_flags |= RPC_TASK_SOFT; 891 task->tk_flags |= RPC_TASK_SOFT;
892 if (clnt->cl_noretranstimeo) 892 if (clnt->cl_noretranstimeo)
893 task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT; 893 task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT;
894 if (sk_memalloc_socks()) { 894 if (atomic_read(&clnt->cl_swapper))
895 struct rpc_xprt *xprt; 895 task->tk_flags |= RPC_TASK_SWAPPER;
896
897 rcu_read_lock();
898 xprt = rcu_dereference(clnt->cl_xprt);
899 if (xprt->swapper)
900 task->tk_flags |= RPC_TASK_SWAPPER;
901 rcu_read_unlock();
902 }
903 /* Add to the client's list of all tasks */ 896 /* Add to the client's list of all tasks */
904 spin_lock(&clnt->cl_lock); 897 spin_lock(&clnt->cl_lock);
905 list_add_tail(&task->tk_task, &clnt->cl_tasks); 898 list_add_tail(&task->tk_task, &clnt->cl_tasks);
@@ -1031,15 +1024,14 @@ EXPORT_SYMBOL_GPL(rpc_call_async);
1031 * rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run 1024 * rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run
1032 * rpc_execute against it 1025 * rpc_execute against it
1033 * @req: RPC request 1026 * @req: RPC request
1034 * @tk_ops: RPC call ops
1035 */ 1027 */
1036struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req, 1028struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
1037 const struct rpc_call_ops *tk_ops)
1038{ 1029{
1039 struct rpc_task *task; 1030 struct rpc_task *task;
1040 struct xdr_buf *xbufp = &req->rq_snd_buf; 1031 struct xdr_buf *xbufp = &req->rq_snd_buf;
1041 struct rpc_task_setup task_setup_data = { 1032 struct rpc_task_setup task_setup_data = {
1042 .callback_ops = tk_ops, 1033 .callback_ops = &rpc_default_ops,
1034 .flags = RPC_TASK_SOFTCONN,
1043 }; 1035 };
1044 1036
1045 dprintk("RPC: rpc_run_bc_task req= %p\n", req); 1037 dprintk("RPC: rpc_run_bc_task req= %p\n", req);
@@ -1614,6 +1606,7 @@ call_allocate(struct rpc_task *task)
1614 req->rq_callsize + req->rq_rcvsize); 1606 req->rq_callsize + req->rq_rcvsize);
1615 if (req->rq_buffer != NULL) 1607 if (req->rq_buffer != NULL)
1616 return; 1608 return;
1609 xprt_inject_disconnect(xprt);
1617 1610
1618 dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid); 1611 dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
1619 1612
@@ -1951,33 +1944,36 @@ call_bc_transmit(struct rpc_task *task)
1951{ 1944{
1952 struct rpc_rqst *req = task->tk_rqstp; 1945 struct rpc_rqst *req = task->tk_rqstp;
1953 1946
1954 if (!xprt_prepare_transmit(task)) { 1947 if (!xprt_prepare_transmit(task))
1955 /* 1948 goto out_retry;
1956 * Could not reserve the transport. Try again after the
1957 * transport is released.
1958 */
1959 task->tk_status = 0;
1960 task->tk_action = call_bc_transmit;
1961 return;
1962 }
1963 1949
1964 task->tk_action = rpc_exit_task;
1965 if (task->tk_status < 0) { 1950 if (task->tk_status < 0) {
1966 printk(KERN_NOTICE "RPC: Could not send backchannel reply " 1951 printk(KERN_NOTICE "RPC: Could not send backchannel reply "
1967 "error: %d\n", task->tk_status); 1952 "error: %d\n", task->tk_status);
1968 return; 1953 goto out_done;
1969 } 1954 }
1955 if (req->rq_connect_cookie != req->rq_xprt->connect_cookie)
1956 req->rq_bytes_sent = 0;
1970 1957
1971 xprt_transmit(task); 1958 xprt_transmit(task);
1959
1960 if (task->tk_status == -EAGAIN)
1961 goto out_nospace;
1962
1972 xprt_end_transmit(task); 1963 xprt_end_transmit(task);
1973 dprint_status(task); 1964 dprint_status(task);
1974 switch (task->tk_status) { 1965 switch (task->tk_status) {
1975 case 0: 1966 case 0:
1976 /* Success */ 1967 /* Success */
1977 break;
1978 case -EHOSTDOWN: 1968 case -EHOSTDOWN:
1979 case -EHOSTUNREACH: 1969 case -EHOSTUNREACH:
1980 case -ENETUNREACH: 1970 case -ENETUNREACH:
1971 case -ECONNRESET:
1972 case -ECONNREFUSED:
1973 case -EADDRINUSE:
1974 case -ENOTCONN:
1975 case -EPIPE:
1976 break;
1981 case -ETIMEDOUT: 1977 case -ETIMEDOUT:
1982 /* 1978 /*
1983 * Problem reaching the server. Disconnect and let the 1979 * Problem reaching the server. Disconnect and let the
@@ -2002,6 +1998,13 @@ call_bc_transmit(struct rpc_task *task)
2002 break; 1998 break;
2003 } 1999 }
2004 rpc_wake_up_queued_task(&req->rq_xprt->pending, task); 2000 rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
2001out_done:
2002 task->tk_action = rpc_exit_task;
2003 return;
2004out_nospace:
2005 req->rq_connect_cookie = req->rq_xprt->connect_cookie;
2006out_retry:
2007 task->tk_status = 0;
2005} 2008}
2006#endif /* CONFIG_SUNRPC_BACKCHANNEL */ 2009#endif /* CONFIG_SUNRPC_BACKCHANNEL */
2007 2010
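The new out_retry/out_nospace labels lean on the rpc_task stepping convention: an action that clears tk_status and leaves tk_action untouched is re-run once the task wakes, while a finished action points tk_action at rpc_exit_task. A toy user-space model of that assumed convention (every name below is illustrative, not kernel code):

	#include <stdio.h>

	struct rpc_task_model {
		void (*tk_action)(struct rpc_task_model *);
		int tk_status;
	};

	static void rpc_exit_task_model(struct rpc_task_model *t)
	{
		t->tk_action = NULL;		/* task finished */
	}

	static int attempts;

	static void call_transmit_model(struct rpc_task_model *t)
	{
		if (++attempts < 3) {		/* e.g. -EAGAIN from the socket */
			t->tk_status = 0;	/* retry: leave tk_action alone */
			return;
		}
		t->tk_action = rpc_exit_task_model;	/* done */
	}

	int main(void)
	{
		struct rpc_task_model t = { .tk_action = call_transmit_model };
		while (t.tk_action)
			t.tk_action(&t);
		printf("transmitted after %d attempts\n", attempts);
		return 0;
	}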
@@ -2476,3 +2479,59 @@ void rpc_show_tasks(struct net *net)
2476 spin_unlock(&sn->rpc_client_lock); 2479 spin_unlock(&sn->rpc_client_lock);
2477} 2480}
2478#endif 2481#endif
2482
2483#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
2484int
2485rpc_clnt_swap_activate(struct rpc_clnt *clnt)
2486{
2487 int ret = 0;
2488 struct rpc_xprt *xprt;
2489
2490 if (atomic_inc_return(&clnt->cl_swapper) == 1) {
2491retry:
2492 rcu_read_lock();
2493 xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
2494 rcu_read_unlock();
2495 if (!xprt) {
2496 /*
2497 * If we didn't get a reference, then we likely are
2498 * racing with a migration event. Wait for a grace
2499 * period and try again.
2500 */
2501 synchronize_rcu();
2502 goto retry;
2503 }
2504
2505 ret = xprt_enable_swap(xprt);
2506 xprt_put(xprt);
2507 }
2508 return ret;
2509}
2510EXPORT_SYMBOL_GPL(rpc_clnt_swap_activate);
2511
2512void
2513rpc_clnt_swap_deactivate(struct rpc_clnt *clnt)
2514{
2515 struct rpc_xprt *xprt;
2516
2517 if (atomic_dec_if_positive(&clnt->cl_swapper) == 0) {
2518retry:
2519 rcu_read_lock();
2520 xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
2521 rcu_read_unlock();
2522 if (!xprt) {
2523 /*
2524 * If we didn't get a reference, then we likely are
2525 * racing with a migration event. Wait for a grace
2526 * period and try again.
2527 */
2528 synchronize_rcu();
2529 goto retry;
2530 }
2531
2532 xprt_disable_swap(xprt);
2533 xprt_put(xprt);
2534 }
2535}
2536EXPORT_SYMBOL_GPL(rpc_clnt_swap_deactivate);
2537#endif /* CONFIG_SUNRPC_SWAP */
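The swapper refcount pattern is symmetrical: the first activator takes cl_swapper from 0 to 1 and enables swap on the transport, the last deactivator drops it back and disables it. A hypothetical caller might pin the transport for the lifetime of a swapfile; only rpc_clnt_swap_activate/deactivate come from this patch, the function names below are illustrative:

	/* Hypothetical caller sketch; not part of this patch. */
	#include <linux/sunrpc/clnt.h>

	static int example_swapfile_on(struct rpc_clnt *clnt)
	{
		/* first activator takes cl_swapper 0->1 and enables swap
		 * on the transport via xprt_enable_swap() */
		return rpc_clnt_swap_activate(clnt);
	}

	static void example_swapfile_off(struct rpc_clnt *clnt)
	{
		/* last deactivator drops cl_swapper to 0 and disables it */
		rpc_clnt_swap_deactivate(clnt);
	}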
diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
index 82962f7e6e88..e7b4d93566df 100644
--- a/net/sunrpc/debugfs.c
+++ b/net/sunrpc/debugfs.c
@@ -10,9 +10,12 @@
10#include "netns.h" 10#include "netns.h"
11 11
12static struct dentry *topdir; 12static struct dentry *topdir;
13static struct dentry *rpc_fault_dir;
13static struct dentry *rpc_clnt_dir; 14static struct dentry *rpc_clnt_dir;
14static struct dentry *rpc_xprt_dir; 15static struct dentry *rpc_xprt_dir;
15 16
17unsigned int rpc_inject_disconnect;
18
16struct rpc_clnt_iter { 19struct rpc_clnt_iter {
17 struct rpc_clnt *clnt; 20 struct rpc_clnt *clnt;
18 loff_t pos; 21 loff_t pos;
@@ -257,6 +260,8 @@ rpc_xprt_debugfs_register(struct rpc_xprt *xprt)
257 debugfs_remove_recursive(xprt->debugfs); 260 debugfs_remove_recursive(xprt->debugfs);
258 xprt->debugfs = NULL; 261 xprt->debugfs = NULL;
259 } 262 }
263
264 atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect);
260} 265}
261 266
262void 267void
@@ -266,11 +271,79 @@ rpc_xprt_debugfs_unregister(struct rpc_xprt *xprt)
266 xprt->debugfs = NULL; 271 xprt->debugfs = NULL;
267} 272}
268 273
274static int
275fault_open(struct inode *inode, struct file *filp)
276{
277 filp->private_data = kmalloc(128, GFP_KERNEL);
278 if (!filp->private_data)
279 return -ENOMEM;
280 return 0;
281}
282
283static int
284fault_release(struct inode *inode, struct file *filp)
285{
286 kfree(filp->private_data);
287 return 0;
288}
289
290static ssize_t
291fault_disconnect_read(struct file *filp, char __user *user_buf,
292 size_t len, loff_t *offset)
293{
294 char *buffer = (char *)filp->private_data;
295 size_t size;
296
297 size = sprintf(buffer, "%u\n", rpc_inject_disconnect);
298 return simple_read_from_buffer(user_buf, len, offset, buffer, size);
299}
300
301static ssize_t
302fault_disconnect_write(struct file *filp, const char __user *user_buf,
303 size_t len, loff_t *offset)
304{
305 char buffer[16];
306
307 if (len >= sizeof(buffer))
308 len = sizeof(buffer) - 1;
309 if (copy_from_user(buffer, user_buf, len))
310 return -EFAULT;
311 buffer[len] = '\0';
312 if (kstrtouint(buffer, 10, &rpc_inject_disconnect))
313 return -EINVAL;
314 return len;
315}
316
317static const struct file_operations fault_disconnect_fops = {
318 .owner = THIS_MODULE,
319 .open = fault_open,
320 .read = fault_disconnect_read,
321 .write = fault_disconnect_write,
322 .release = fault_release,
323};
324
325static struct dentry *
326inject_fault_dir(struct dentry *topdir)
327{
328 struct dentry *faultdir;
329
330 faultdir = debugfs_create_dir("inject_fault", topdir);
331 if (!faultdir)
332 return NULL;
333
334 if (!debugfs_create_file("disconnect", S_IFREG | S_IRUSR, faultdir,
335 NULL, &fault_disconnect_fops))
336 return NULL;
337
338 return faultdir;
339}
340
269void __exit 341void __exit
270sunrpc_debugfs_exit(void) 342sunrpc_debugfs_exit(void)
271{ 343{
272 debugfs_remove_recursive(topdir); 344 debugfs_remove_recursive(topdir);
273 topdir = NULL; 345 topdir = NULL;
346 rpc_fault_dir = NULL;
274 rpc_clnt_dir = NULL; 347 rpc_clnt_dir = NULL;
275 rpc_xprt_dir = NULL; 348 rpc_xprt_dir = NULL;
276} 349}
@@ -282,6 +355,10 @@ sunrpc_debugfs_init(void)
282 if (!topdir) 355 if (!topdir)
283 return; 356 return;
284 357
358 rpc_fault_dir = inject_fault_dir(topdir);
359 if (!rpc_fault_dir)
360 goto out_remove;
361
285 rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir); 362 rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir);
286 if (!rpc_clnt_dir) 363 if (!rpc_clnt_dir)
287 goto out_remove; 364 goto out_remove;
@@ -294,5 +371,6 @@ sunrpc_debugfs_init(void)
294out_remove: 371out_remove:
295 debugfs_remove_recursive(topdir); 372 debugfs_remove_recursive(topdir);
296 topdir = NULL; 373 topdir = NULL;
374 rpc_fault_dir = NULL;
297 rpc_clnt_dir = NULL; 375 rpc_clnt_dir = NULL;
298} 376}
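A user-space sketch for arming the new knob. The debugfs mount point and the name of the sunrpc top directory are assumptions here; the patch itself only shows the inject_fault/disconnect file, and the atomic_set added to rpc_xprt_debugfs_register suggests the written value seeds each transport's disconnect countdown:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* Path assumes debugfs mounted at /sys/kernel/debug with a
		 * "sunrpc" top-level directory. */
		int fd = open("/sys/kernel/debug/sunrpc/inject_fault/disconnect",
			      O_WRONLY);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, "100\n", 4) != 4)	/* arm: roughly every 100th op */
			perror("write");
		close(fd);
		return 0;
	}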
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 852ae606b02a..5a16d8d8c831 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1350,6 +1350,11 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
1350{ 1350{
1351 struct kvec *argv = &rqstp->rq_arg.head[0]; 1351 struct kvec *argv = &rqstp->rq_arg.head[0];
1352 struct kvec *resv = &rqstp->rq_res.head[0]; 1352 struct kvec *resv = &rqstp->rq_res.head[0];
1353 struct rpc_task *task;
1354 int proc_error;
1355 int error;
1356
1357 dprintk("svc: %s(%p)\n", __func__, req);
1353 1358
1354 /* Build the svc_rqst used by the common processing routine */ 1359 /* Build the svc_rqst used by the common processing routine */
1355 rqstp->rq_xprt = serv->sv_bc_xprt; 1360 rqstp->rq_xprt = serv->sv_bc_xprt;
@@ -1372,21 +1377,36 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
1372 1377
1373 /* 1378 /*
1374 * Skip the next two words because they've already been 1379 * Skip the next two words because they've already been
1375 * processed in the trasport 1380 * processed in the transport
1376 */ 1381 */
1377 svc_getu32(argv); /* XID */ 1382 svc_getu32(argv); /* XID */
1378 svc_getnl(argv); /* CALLDIR */ 1383 svc_getnl(argv); /* CALLDIR */
1379 1384
1380 /* Returns 1 for send, 0 for drop */ 1385 /* Parse and execute the bc call */
1381 if (svc_process_common(rqstp, argv, resv)) { 1386 proc_error = svc_process_common(rqstp, argv, resv);
1382 memcpy(&req->rq_snd_buf, &rqstp->rq_res, 1387
1383 sizeof(req->rq_snd_buf)); 1388 atomic_inc(&req->rq_xprt->bc_free_slots);
1384 return bc_send(req); 1389 if (!proc_error) {
1385 } else { 1390 /* Processing error: drop the request */
1386 /* drop request */
1387 xprt_free_bc_request(req); 1391 xprt_free_bc_request(req);
1388 return 0; 1392 return 0;
1389 } 1393 }
1394
1395 /* Finally, send the reply synchronously */
1396 memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
1397 task = rpc_run_bc_task(req);
1398 if (IS_ERR(task)) {
1399 error = PTR_ERR(task);
1400 goto out;
1401 }
1402
1403 WARN_ON_ONCE(atomic_read(&task->tk_count) != 1);
1404 error = task->tk_status;
1405 rpc_put_task(task);
1406
1407out:
1408 dprintk("svc: %s(), error=%d\n", __func__, error);
1409 return error;
1390} 1410}
1391EXPORT_SYMBOL_GPL(bc_svc_process); 1411EXPORT_SYMBOL_GPL(bc_svc_process);
1392#endif /* CONFIG_SUNRPC_BACKCHANNEL */ 1412#endif /* CONFIG_SUNRPC_BACKCHANNEL */
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 1d4fe24af06a..ab5dd621ae0c 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -68,6 +68,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net);
68static void xprt_request_init(struct rpc_task *, struct rpc_xprt *); 68static void xprt_request_init(struct rpc_task *, struct rpc_xprt *);
69static void xprt_connect_status(struct rpc_task *task); 69static void xprt_connect_status(struct rpc_task *task);
70static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); 70static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
71static void __xprt_put_cong(struct rpc_xprt *, struct rpc_rqst *);
71static void xprt_destroy(struct rpc_xprt *xprt); 72static void xprt_destroy(struct rpc_xprt *xprt);
72 73
73static DEFINE_SPINLOCK(xprt_list_lock); 74static DEFINE_SPINLOCK(xprt_list_lock);
@@ -250,6 +251,8 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
250 } 251 }
251 xprt_clear_locked(xprt); 252 xprt_clear_locked(xprt);
252out_sleep: 253out_sleep:
254 if (req)
255 __xprt_put_cong(xprt, req);
253 dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt); 256 dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt);
254 task->tk_timeout = 0; 257 task->tk_timeout = 0;
255 task->tk_status = -EAGAIN; 258 task->tk_status = -EAGAIN;
@@ -608,8 +611,8 @@ static void xprt_autoclose(struct work_struct *work)
608 struct rpc_xprt *xprt = 611 struct rpc_xprt *xprt =
609 container_of(work, struct rpc_xprt, task_cleanup); 612 container_of(work, struct rpc_xprt, task_cleanup);
610 613
611 xprt->ops->close(xprt);
612 clear_bit(XPRT_CLOSE_WAIT, &xprt->state); 614 clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
615 xprt->ops->close(xprt);
613 xprt_release_write(xprt, NULL); 616 xprt_release_write(xprt, NULL);
614} 617}
615 618
@@ -967,6 +970,7 @@ void xprt_transmit(struct rpc_task *task)
967 task->tk_status = status; 970 task->tk_status = status;
968 return; 971 return;
969 } 972 }
973 xprt_inject_disconnect(xprt);
970 974
971 dprintk("RPC: %5u xmit complete\n", task->tk_pid); 975 dprintk("RPC: %5u xmit complete\n", task->tk_pid);
972 task->tk_flags |= RPC_TASK_SENT; 976 task->tk_flags |= RPC_TASK_SENT;
@@ -1285,6 +1289,7 @@ void xprt_release(struct rpc_task *task)
1285 spin_unlock_bh(&xprt->transport_lock); 1289 spin_unlock_bh(&xprt->transport_lock);
1286 if (req->rq_buffer) 1290 if (req->rq_buffer)
1287 xprt->ops->buf_free(req->rq_buffer); 1291 xprt->ops->buf_free(req->rq_buffer);
1292 xprt_inject_disconnect(xprt);
1288 if (req->rq_cred != NULL) 1293 if (req->rq_cred != NULL)
1289 put_rpccred(req->rq_cred); 1294 put_rpccred(req->rq_cred);
1290 task->tk_rqstp = NULL; 1295 task->tk_rqstp = NULL;
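The three new xprt_inject_disconnect() call sites reference a helper defined outside this diff. A plausible shape, consistent with the rpc_inject_disconnect knob and the per-transport counter seeded in debugfs.c above (assumed, simplified):

	/* Sketch only: count down once per transport operation, fire the
	 * transport's ->inject_disconnect op when the counter runs out,
	 * then re-arm from the global rpc_inject_disconnect setting. */
	static inline void xprt_inject_disconnect_sketch(struct rpc_xprt *xprt)
	{
		if (!rpc_inject_disconnect)
			return;		/* injection disabled */
		if (atomic_dec_return(&xprt->inject_disconnect) > 0)
			return;
		atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect);
		xprt->ops->inject_disconnect(xprt);
	}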
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 302d4ebf6fbf..f1e8dafbd507 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -11,6 +11,21 @@
11 * can take tens of usecs to complete. 11 * can take tens of usecs to complete.
12 */ 12 */
13 13
14/* Normal operation
15 *
16 * A Memory Region is prepared for RDMA READ or WRITE using the
17 * ib_map_phys_fmr verb (fmr_op_map). When the RDMA operation is
18 * finished, the Memory Region is unmapped using the ib_unmap_fmr
19 * verb (fmr_op_unmap).
20 */
21
22/* Transport recovery
23 *
24 * After a transport reconnect, fmr_op_map re-uses the MR already
25 * allocated for the RPC, but generates a fresh rkey then maps the
26 * MR again. This process is synchronous.
27 */
28
14#include "xprt_rdma.h" 29#include "xprt_rdma.h"
15 30
16#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 31#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
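The "Normal operation" comment above boils down to a register/use/invalidate cycle around the two FMR verbs. A condensed kernel-style sketch (illustrative only: DMA mapping, rkey handling, and most error paths are omitted):

	#include <rdma/ib_verbs.h>

	static int fmr_cycle_sketch(struct ib_fmr *fmr, u64 *pages, int npages,
				    u64 iova)
	{
		LIST_HEAD(l);
		int rc;

		rc = ib_map_phys_fmr(fmr, pages, npages, iova); /* register */
		if (rc)
			return rc;
		/* ... RDMA READ/WRITE against fmr->rkey ... */
		list_add(&fmr->list, &l);
		return ib_unmap_fmr(&l);			/* invalidate */
	}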
@@ -50,19 +65,28 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
50 struct rpcrdma_mw *r; 65 struct rpcrdma_mw *r;
51 int i, rc; 66 int i, rc;
52 67
68 spin_lock_init(&buf->rb_mwlock);
53 INIT_LIST_HEAD(&buf->rb_mws); 69 INIT_LIST_HEAD(&buf->rb_mws);
54 INIT_LIST_HEAD(&buf->rb_all); 70 INIT_LIST_HEAD(&buf->rb_all);
55 71
56 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; 72 i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1);
57 dprintk("RPC: %s: initializing %d FMRs\n", __func__, i); 73 i += 2; /* head + tail */
74 i *= buf->rb_max_requests; /* one set for each RPC slot */
 75 dprintk("RPC:       %s: initializing %d FMRs\n", __func__, i);
58 76
77 rc = -ENOMEM;
59 while (i--) { 78 while (i--) {
60 r = kzalloc(sizeof(*r), GFP_KERNEL); 79 r = kzalloc(sizeof(*r), GFP_KERNEL);
61 if (!r) 80 if (!r)
62 return -ENOMEM; 81 goto out;
63 82
64 r->r.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr); 83 r->r.fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES *
65 if (IS_ERR(r->r.fmr)) 84 sizeof(u64), GFP_KERNEL);
85 if (!r->r.fmr.physaddrs)
86 goto out_free;
87
88 r->r.fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
89 if (IS_ERR(r->r.fmr.fmr))
66 goto out_fmr_err; 90 goto out_fmr_err;
67 91
68 list_add(&r->mw_list, &buf->rb_mws); 92 list_add(&r->mw_list, &buf->rb_mws);
@@ -71,12 +95,24 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
71 return 0; 95 return 0;
72 96
73out_fmr_err: 97out_fmr_err:
74 rc = PTR_ERR(r->r.fmr); 98 rc = PTR_ERR(r->r.fmr.fmr);
75 dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc); 99 dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc);
100 kfree(r->r.fmr.physaddrs);
101out_free:
76 kfree(r); 102 kfree(r);
103out:
77 return rc; 104 return rc;
78} 105}
79 106
107static int
108__fmr_unmap(struct rpcrdma_mw *r)
109{
110 LIST_HEAD(l);
111
112 list_add(&r->r.fmr.fmr->list, &l);
113 return ib_unmap_fmr(&l);
114}
115
80/* Use the ib_map_phys_fmr() verb to register a memory region 116/* Use the ib_map_phys_fmr() verb to register a memory region
81 * for remote access via RDMA READ or RDMA WRITE. 117 * for remote access via RDMA READ or RDMA WRITE.
82 */ 118 */
@@ -85,12 +121,24 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
85 int nsegs, bool writing) 121 int nsegs, bool writing)
86{ 122{
87 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 123 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
88 struct ib_device *device = ia->ri_id->device; 124 struct ib_device *device = ia->ri_device;
89 enum dma_data_direction direction = rpcrdma_data_dir(writing); 125 enum dma_data_direction direction = rpcrdma_data_dir(writing);
90 struct rpcrdma_mr_seg *seg1 = seg; 126 struct rpcrdma_mr_seg *seg1 = seg;
91 struct rpcrdma_mw *mw = seg1->rl_mw;
92 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
93 int len, pageoff, i, rc; 127 int len, pageoff, i, rc;
128 struct rpcrdma_mw *mw;
129
130 mw = seg1->rl_mw;
131 seg1->rl_mw = NULL;
132 if (!mw) {
133 mw = rpcrdma_get_mw(r_xprt);
134 if (!mw)
135 return -ENOMEM;
136 } else {
137 /* this is a retransmit; generate a fresh rkey */
138 rc = __fmr_unmap(mw);
139 if (rc)
140 return rc;
141 }
94 142
95 pageoff = offset_in_page(seg1->mr_offset); 143 pageoff = offset_in_page(seg1->mr_offset);
96 seg1->mr_offset -= pageoff; /* start of page */ 144 seg1->mr_offset -= pageoff; /* start of page */
@@ -100,7 +148,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
100 nsegs = RPCRDMA_MAX_FMR_SGES; 148 nsegs = RPCRDMA_MAX_FMR_SGES;
101 for (i = 0; i < nsegs;) { 149 for (i = 0; i < nsegs;) {
102 rpcrdma_map_one(device, seg, direction); 150 rpcrdma_map_one(device, seg, direction);
103 physaddrs[i] = seg->mr_dma; 151 mw->r.fmr.physaddrs[i] = seg->mr_dma;
104 len += seg->mr_len; 152 len += seg->mr_len;
105 ++seg; 153 ++seg;
106 ++i; 154 ++i;
@@ -110,11 +158,13 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
110 break; 158 break;
111 } 159 }
112 160
113 rc = ib_map_phys_fmr(mw->r.fmr, physaddrs, i, seg1->mr_dma); 161 rc = ib_map_phys_fmr(mw->r.fmr.fmr, mw->r.fmr.physaddrs,
162 i, seg1->mr_dma);
114 if (rc) 163 if (rc)
115 goto out_maperr; 164 goto out_maperr;
116 165
117 seg1->mr_rkey = mw->r.fmr->rkey; 166 seg1->rl_mw = mw;
167 seg1->mr_rkey = mw->r.fmr.fmr->rkey;
118 seg1->mr_base = seg1->mr_dma + pageoff; 168 seg1->mr_base = seg1->mr_dma + pageoff;
119 seg1->mr_nsegs = i; 169 seg1->mr_nsegs = i;
120 seg1->mr_len = len; 170 seg1->mr_len = len;
@@ -137,48 +187,28 @@ fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
137{ 187{
138 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 188 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
139 struct rpcrdma_mr_seg *seg1 = seg; 189 struct rpcrdma_mr_seg *seg1 = seg;
140 struct ib_device *device; 190 struct rpcrdma_mw *mw = seg1->rl_mw;
141 int rc, nsegs = seg->mr_nsegs; 191 int rc, nsegs = seg->mr_nsegs;
142 LIST_HEAD(l);
143 192
144 list_add(&seg1->rl_mw->r.fmr->list, &l); 193 dprintk("RPC: %s: FMR %p\n", __func__, mw);
145 rc = ib_unmap_fmr(&l); 194
146 read_lock(&ia->ri_qplock); 195 seg1->rl_mw = NULL;
147 device = ia->ri_id->device;
148 while (seg1->mr_nsegs--) 196 while (seg1->mr_nsegs--)
149 rpcrdma_unmap_one(device, seg++); 197 rpcrdma_unmap_one(ia->ri_device, seg++);
150 read_unlock(&ia->ri_qplock); 198 rc = __fmr_unmap(mw);
151 if (rc) 199 if (rc)
152 goto out_err; 200 goto out_err;
201 rpcrdma_put_mw(r_xprt, mw);
153 return nsegs; 202 return nsegs;
154 203
155out_err: 204out_err:
205 /* The FMR is abandoned, but remains in rb_all. fmr_op_destroy
206 * will attempt to release it when the transport is destroyed.
207 */
156 dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc); 208 dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc);
157 return nsegs; 209 return nsegs;
158} 210}
159 211
160/* After a disconnect, unmap all FMRs.
161 *
162 * This is invoked only in the transport connect worker in order
163 * to serialize with rpcrdma_register_fmr_external().
164 */
165static void
166fmr_op_reset(struct rpcrdma_xprt *r_xprt)
167{
168 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
169 struct rpcrdma_mw *r;
170 LIST_HEAD(list);
171 int rc;
172
173 list_for_each_entry(r, &buf->rb_all, mw_all)
174 list_add(&r->r.fmr->list, &list);
175
176 rc = ib_unmap_fmr(&list);
177 if (rc)
178 dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
179 __func__, rc);
180}
181
182static void 212static void
183fmr_op_destroy(struct rpcrdma_buffer *buf) 213fmr_op_destroy(struct rpcrdma_buffer *buf)
184{ 214{
@@ -188,10 +218,13 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
188 while (!list_empty(&buf->rb_all)) { 218 while (!list_empty(&buf->rb_all)) {
189 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); 219 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
190 list_del(&r->mw_all); 220 list_del(&r->mw_all);
191 rc = ib_dealloc_fmr(r->r.fmr); 221 kfree(r->r.fmr.physaddrs);
222
223 rc = ib_dealloc_fmr(r->r.fmr.fmr);
192 if (rc) 224 if (rc)
193 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", 225 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
194 __func__, rc); 226 __func__, rc);
227
195 kfree(r); 228 kfree(r);
196 } 229 }
197} 230}
@@ -202,7 +235,6 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
202 .ro_open = fmr_op_open, 235 .ro_open = fmr_op_open,
203 .ro_maxpages = fmr_op_maxpages, 236 .ro_maxpages = fmr_op_maxpages,
204 .ro_init = fmr_op_init, 237 .ro_init = fmr_op_init,
205 .ro_reset = fmr_op_reset,
206 .ro_destroy = fmr_op_destroy, 238 .ro_destroy = fmr_op_destroy,
207 .ro_displayname = "fmr", 239 .ro_displayname = "fmr",
208}; 240};
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index d234521320a4..04ea914201b2 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -11,12 +11,136 @@
11 * but most complex memory registration mode. 11 * but most complex memory registration mode.
12 */ 12 */
13 13
14/* Normal operation
15 *
16 * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG
17 * Work Request (frmr_op_map). When the RDMA operation is finished, this
18 * Memory Region is invalidated using a LOCAL_INV Work Request
19 * (frmr_op_unmap).
20 *
21 * Typically these Work Requests are not signaled, and neither are RDMA
22 * SEND Work Requests (with the exception of signaling occasionally to
23 * prevent provider work queue overflows). This greatly reduces HCA
24 * interrupt workload.
25 *
26 * As an optimization, frwr_op_unmap marks MRs INVALID before the
27 * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on
28 * rb_mws immediately so that no work (like managing a linked list
29 * under a spinlock) is needed in the completion upcall.
30 *
31 * But this means that frwr_op_map() can occasionally encounter an MR
32 * that is INVALID but the LOCAL_INV WR has not completed. Work Queue
33 * ordering prevents a subsequent FAST_REG WR from executing against
34 * that MR while it is still being invalidated.
35 */
36
37/* Transport recovery
38 *
39 * ->op_map and the transport connect worker cannot run at the same
40 * time, but ->op_unmap can fire while the transport connect worker
41 * is running. Thus MR recovery is handled in ->op_map, to guarantee
42 * that recovered MRs are owned by a sending RPC, and not one where
43 * ->op_unmap could fire at the same time transport reconnect is
44 * being done.
45 *
46 * When the underlying transport disconnects, MRs are left in one of
47 * three states:
48 *
49 * INVALID: The MR was not in use before the QP entered ERROR state.
50 * (Or, the LOCAL_INV WR has not completed or flushed yet).
51 *
52 * STALE: The MR was being registered or unregistered when the QP
53 * entered ERROR state, and the pending WR was flushed.
54 *
55 * VALID: The MR was registered before the QP entered ERROR state.
56 *
57 * When frwr_op_map encounters STALE and VALID MRs, they are recovered
 58 * with ib_dereg_mr and then are re-initialized. Because MR recovery
59 * allocates fresh resources, it is deferred to a workqueue, and the
60 * recovered MRs are placed back on the rb_mws list when recovery is
61 * complete. frwr_op_map allocates another MR for the current RPC while
62 * the broken MR is reset.
63 *
64 * To ensure that frwr_op_map doesn't encounter an MR that is marked
65 * INVALID but that is about to be flushed due to a previous transport
66 * disconnect, the transport connect worker attempts to drain all
67 * pending send queue WRs before the transport is reconnected.
68 */
69
14#include "xprt_rdma.h" 70#include "xprt_rdma.h"
15 71
16#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 72#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
17# define RPCDBG_FACILITY RPCDBG_TRANS 73# define RPCDBG_FACILITY RPCDBG_TRANS
18#endif 74#endif
19 75
76static struct workqueue_struct *frwr_recovery_wq;
77
78#define FRWR_RECOVERY_WQ_FLAGS (WQ_UNBOUND | WQ_MEM_RECLAIM)
79
80int
81frwr_alloc_recovery_wq(void)
82{
83 frwr_recovery_wq = alloc_workqueue("frwr_recovery",
84 FRWR_RECOVERY_WQ_FLAGS, 0);
85 return !frwr_recovery_wq ? -ENOMEM : 0;
86}
87
88void
89frwr_destroy_recovery_wq(void)
90{
91 struct workqueue_struct *wq;
92
93 if (!frwr_recovery_wq)
94 return;
95
96 wq = frwr_recovery_wq;
97 frwr_recovery_wq = NULL;
98 destroy_workqueue(wq);
99}
100
101/* Deferred reset of a single FRMR. Generate a fresh rkey by
102 * replacing the MR.
103 *
104 * There's no recovery if this fails. The FRMR is abandoned, but
105 * remains in rb_all. It will be cleaned up when the transport is
106 * destroyed.
107 */
108static void
109__frwr_recovery_worker(struct work_struct *work)
110{
111 struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
112 r.frmr.fr_work);
113 struct rpcrdma_xprt *r_xprt = r->r.frmr.fr_xprt;
114 unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
115 struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
116
117 if (ib_dereg_mr(r->r.frmr.fr_mr))
118 goto out_fail;
119
120 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(pd, depth);
121 if (IS_ERR(r->r.frmr.fr_mr))
122 goto out_fail;
123
124 dprintk("RPC: %s: recovered FRMR %p\n", __func__, r);
125 r->r.frmr.fr_state = FRMR_IS_INVALID;
126 rpcrdma_put_mw(r_xprt, r);
127 return;
128
129out_fail:
130 pr_warn("RPC: %s: FRMR %p unrecovered\n",
131 __func__, r);
132}
133
134/* A broken MR was discovered in a context that can't sleep.
135 * Defer recovery to the recovery worker.
136 */
137static void
138__frwr_queue_recovery(struct rpcrdma_mw *r)
139{
140 INIT_WORK(&r->r.frmr.fr_work, __frwr_recovery_worker);
141 queue_work(frwr_recovery_wq, &r->r.frmr.fr_work);
142}
143
20static int 144static int
21__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, 145__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
22 unsigned int depth) 146 unsigned int depth)
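The three post-disconnect MR states in the comment above map onto a simple recovery decision. The enum values are the real ones used throughout this file; the helper itself is a hypothetical summary, not part of the patch:

	static bool frwr_needs_recovery(const struct rpcrdma_mw *r)
	{
		switch (r->r.frmr.fr_state) {
		case FRMR_IS_INVALID:	/* idle, or LOCAL_INV still in flight */
			return false;
		case FRMR_IS_STALE:	/* WR flushed by QP error */
		case FRMR_IS_VALID:	/* registered when the QP failed */
		default:
			return true;	/* dereg + re-alloc via the workqueue */
		}
	}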
@@ -128,7 +252,7 @@ frwr_sendcompletion(struct ib_wc *wc)
128 252
129 /* WARNING: Only wr_id and status are reliable at this point */ 253 /* WARNING: Only wr_id and status are reliable at this point */
130 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; 254 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
131 dprintk("RPC: %s: frmr %p (stale), status %s (%d)\n", 255 pr_warn("RPC: %s: frmr %p flushed, status %s (%d)\n",
132 __func__, r, ib_wc_status_msg(wc->status), wc->status); 256 __func__, r, ib_wc_status_msg(wc->status), wc->status);
133 r->r.frmr.fr_state = FRMR_IS_STALE; 257 r->r.frmr.fr_state = FRMR_IS_STALE;
134} 258}
@@ -137,16 +261,19 @@ static int
137frwr_op_init(struct rpcrdma_xprt *r_xprt) 261frwr_op_init(struct rpcrdma_xprt *r_xprt)
138{ 262{
139 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 263 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
140 struct ib_device *device = r_xprt->rx_ia.ri_id->device; 264 struct ib_device *device = r_xprt->rx_ia.ri_device;
141 unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; 265 unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
142 struct ib_pd *pd = r_xprt->rx_ia.ri_pd; 266 struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
143 int i; 267 int i;
144 268
269 spin_lock_init(&buf->rb_mwlock);
145 INIT_LIST_HEAD(&buf->rb_mws); 270 INIT_LIST_HEAD(&buf->rb_mws);
146 INIT_LIST_HEAD(&buf->rb_all); 271 INIT_LIST_HEAD(&buf->rb_all);
147 272
148 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; 273 i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1);
149 dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i); 274 i += 2; /* head + tail */
275 i *= buf->rb_max_requests; /* one set for each RPC slot */
 276 dprintk("RPC:       %s: initializing %d FRMRs\n", __func__, i);
150 277
151 while (i--) { 278 while (i--) {
152 struct rpcrdma_mw *r; 279 struct rpcrdma_mw *r;
@@ -165,6 +292,7 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt)
165 list_add(&r->mw_list, &buf->rb_mws); 292 list_add(&r->mw_list, &buf->rb_mws);
166 list_add(&r->mw_all, &buf->rb_all); 293 list_add(&r->mw_all, &buf->rb_all);
167 r->mw_sendcompletion = frwr_sendcompletion; 294 r->mw_sendcompletion = frwr_sendcompletion;
295 r->r.frmr.fr_xprt = r_xprt;
168 } 296 }
169 297
170 return 0; 298 return 0;
@@ -178,12 +306,12 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
178 int nsegs, bool writing) 306 int nsegs, bool writing)
179{ 307{
180 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 308 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
181 struct ib_device *device = ia->ri_id->device; 309 struct ib_device *device = ia->ri_device;
182 enum dma_data_direction direction = rpcrdma_data_dir(writing); 310 enum dma_data_direction direction = rpcrdma_data_dir(writing);
183 struct rpcrdma_mr_seg *seg1 = seg; 311 struct rpcrdma_mr_seg *seg1 = seg;
184 struct rpcrdma_mw *mw = seg1->rl_mw; 312 struct rpcrdma_mw *mw;
185 struct rpcrdma_frmr *frmr = &mw->r.frmr; 313 struct rpcrdma_frmr *frmr;
186 struct ib_mr *mr = frmr->fr_mr; 314 struct ib_mr *mr;
187 struct ib_send_wr fastreg_wr, *bad_wr; 315 struct ib_send_wr fastreg_wr, *bad_wr;
188 u8 key; 316 u8 key;
189 int len, pageoff; 317 int len, pageoff;
@@ -192,12 +320,25 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
192 u64 pa; 320 u64 pa;
193 int page_no; 321 int page_no;
194 322
323 mw = seg1->rl_mw;
324 seg1->rl_mw = NULL;
325 do {
326 if (mw)
327 __frwr_queue_recovery(mw);
328 mw = rpcrdma_get_mw(r_xprt);
329 if (!mw)
330 return -ENOMEM;
331 } while (mw->r.frmr.fr_state != FRMR_IS_INVALID);
332 frmr = &mw->r.frmr;
333 frmr->fr_state = FRMR_IS_VALID;
334
195 pageoff = offset_in_page(seg1->mr_offset); 335 pageoff = offset_in_page(seg1->mr_offset);
196 seg1->mr_offset -= pageoff; /* start of page */ 336 seg1->mr_offset -= pageoff; /* start of page */
197 seg1->mr_len += pageoff; 337 seg1->mr_len += pageoff;
198 len = -pageoff; 338 len = -pageoff;
199 if (nsegs > ia->ri_max_frmr_depth) 339 if (nsegs > ia->ri_max_frmr_depth)
200 nsegs = ia->ri_max_frmr_depth; 340 nsegs = ia->ri_max_frmr_depth;
341
201 for (page_no = i = 0; i < nsegs;) { 342 for (page_no = i = 0; i < nsegs;) {
202 rpcrdma_map_one(device, seg, direction); 343 rpcrdma_map_one(device, seg, direction);
203 pa = seg->mr_dma; 344 pa = seg->mr_dma;
@@ -216,8 +357,6 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
216 dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n", 357 dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n",
217 __func__, mw, i, len); 358 __func__, mw, i, len);
218 359
219 frmr->fr_state = FRMR_IS_VALID;
220
221 memset(&fastreg_wr, 0, sizeof(fastreg_wr)); 360 memset(&fastreg_wr, 0, sizeof(fastreg_wr));
222 fastreg_wr.wr_id = (unsigned long)(void *)mw; 361 fastreg_wr.wr_id = (unsigned long)(void *)mw;
223 fastreg_wr.opcode = IB_WR_FAST_REG_MR; 362 fastreg_wr.opcode = IB_WR_FAST_REG_MR;
@@ -229,6 +368,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
229 fastreg_wr.wr.fast_reg.access_flags = writing ? 368 fastreg_wr.wr.fast_reg.access_flags = writing ?
230 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : 369 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
231 IB_ACCESS_REMOTE_READ; 370 IB_ACCESS_REMOTE_READ;
371 mr = frmr->fr_mr;
232 key = (u8)(mr->rkey & 0x000000FF); 372 key = (u8)(mr->rkey & 0x000000FF);
233 ib_update_fast_reg_key(mr, ++key); 373 ib_update_fast_reg_key(mr, ++key);
234 fastreg_wr.wr.fast_reg.rkey = mr->rkey; 374 fastreg_wr.wr.fast_reg.rkey = mr->rkey;
@@ -238,6 +378,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
238 if (rc) 378 if (rc)
239 goto out_senderr; 379 goto out_senderr;
240 380
381 seg1->rl_mw = mw;
241 seg1->mr_rkey = mr->rkey; 382 seg1->mr_rkey = mr->rkey;
242 seg1->mr_base = seg1->mr_dma + pageoff; 383 seg1->mr_base = seg1->mr_dma + pageoff;
243 seg1->mr_nsegs = i; 384 seg1->mr_nsegs = i;
@@ -246,10 +387,9 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
246 387
247out_senderr: 388out_senderr:
248 dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); 389 dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
249 ib_update_fast_reg_key(mr, --key);
250 frmr->fr_state = FRMR_IS_INVALID;
251 while (i--) 390 while (i--)
252 rpcrdma_unmap_one(device, --seg); 391 rpcrdma_unmap_one(device, --seg);
392 __frwr_queue_recovery(mw);
253 return rc; 393 return rc;
254} 394}
255 395
@@ -261,78 +401,46 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
261{ 401{
262 struct rpcrdma_mr_seg *seg1 = seg; 402 struct rpcrdma_mr_seg *seg1 = seg;
263 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 403 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
404 struct rpcrdma_mw *mw = seg1->rl_mw;
264 struct ib_send_wr invalidate_wr, *bad_wr; 405 struct ib_send_wr invalidate_wr, *bad_wr;
265 int rc, nsegs = seg->mr_nsegs; 406 int rc, nsegs = seg->mr_nsegs;
266 struct ib_device *device;
267 407
268 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; 408 dprintk("RPC: %s: FRMR %p\n", __func__, mw);
409
410 seg1->rl_mw = NULL;
411 mw->r.frmr.fr_state = FRMR_IS_INVALID;
269 412
270 memset(&invalidate_wr, 0, sizeof(invalidate_wr)); 413 memset(&invalidate_wr, 0, sizeof(invalidate_wr));
271 invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw; 414 invalidate_wr.wr_id = (unsigned long)(void *)mw;
272 invalidate_wr.opcode = IB_WR_LOCAL_INV; 415 invalidate_wr.opcode = IB_WR_LOCAL_INV;
273 invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey; 416 invalidate_wr.ex.invalidate_rkey = mw->r.frmr.fr_mr->rkey;
274 DECR_CQCOUNT(&r_xprt->rx_ep); 417 DECR_CQCOUNT(&r_xprt->rx_ep);
275 418
276 read_lock(&ia->ri_qplock);
277 device = ia->ri_id->device;
278 while (seg1->mr_nsegs--) 419 while (seg1->mr_nsegs--)
279 rpcrdma_unmap_one(device, seg++); 420 rpcrdma_unmap_one(ia->ri_device, seg++);
421 read_lock(&ia->ri_qplock);
280 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); 422 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
281 read_unlock(&ia->ri_qplock); 423 read_unlock(&ia->ri_qplock);
282 if (rc) 424 if (rc)
283 goto out_err; 425 goto out_err;
426
427 rpcrdma_put_mw(r_xprt, mw);
284 return nsegs; 428 return nsegs;
285 429
286out_err: 430out_err:
287 /* Force rpcrdma_buffer_get() to retry */
288 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
289 dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); 431 dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
432 __frwr_queue_recovery(mw);
290 return nsegs; 433 return nsegs;
291} 434}
292 435
293/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
294 * an unusable state. Find FRMRs in this state and dereg / reg
295 * each. FRMRs that are VALID and attached to an rpcrdma_req are
296 * also torn down.
297 *
298 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
299 *
300 * This is invoked only in the transport connect worker in order
301 * to serialize with rpcrdma_register_frmr_external().
302 */
303static void
304frwr_op_reset(struct rpcrdma_xprt *r_xprt)
305{
306 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
307 struct ib_device *device = r_xprt->rx_ia.ri_id->device;
308 unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
309 struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
310 struct rpcrdma_mw *r;
311 int rc;
312
313 list_for_each_entry(r, &buf->rb_all, mw_all) {
314 if (r->r.frmr.fr_state == FRMR_IS_INVALID)
315 continue;
316
317 __frwr_release(r);
318 rc = __frwr_init(r, pd, device, depth);
319 if (rc) {
320 dprintk("RPC: %s: mw %p left %s\n",
321 __func__, r,
322 (r->r.frmr.fr_state == FRMR_IS_STALE ?
323 "stale" : "valid"));
324 continue;
325 }
326
327 r->r.frmr.fr_state = FRMR_IS_INVALID;
328 }
329}
330
331static void 436static void
332frwr_op_destroy(struct rpcrdma_buffer *buf) 437frwr_op_destroy(struct rpcrdma_buffer *buf)
333{ 438{
334 struct rpcrdma_mw *r; 439 struct rpcrdma_mw *r;
335 440
441 /* Ensure stale MWs for "buf" are no longer in flight */
442 flush_workqueue(frwr_recovery_wq);
443
336 while (!list_empty(&buf->rb_all)) { 444 while (!list_empty(&buf->rb_all)) {
337 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); 445 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
338 list_del(&r->mw_all); 446 list_del(&r->mw_all);
@@ -347,7 +455,6 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
347 .ro_open = frwr_op_open, 455 .ro_open = frwr_op_open,
348 .ro_maxpages = frwr_op_maxpages, 456 .ro_maxpages = frwr_op_maxpages,
349 .ro_init = frwr_op_init, 457 .ro_init = frwr_op_init,
350 .ro_reset = frwr_op_reset,
351 .ro_destroy = frwr_op_destroy, 458 .ro_destroy = frwr_op_destroy,
352 .ro_displayname = "frwr", 459 .ro_displayname = "frwr",
353}; 460};
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index ba518af16787..41985d07fdb7 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -50,8 +50,7 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
50{ 50{
51 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 51 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
52 52
53 rpcrdma_map_one(ia->ri_id->device, seg, 53 rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing));
54 rpcrdma_data_dir(writing));
55 seg->mr_rkey = ia->ri_bind_mem->rkey; 54 seg->mr_rkey = ia->ri_bind_mem->rkey;
56 seg->mr_base = seg->mr_dma; 55 seg->mr_base = seg->mr_dma;
57 seg->mr_nsegs = 1; 56 seg->mr_nsegs = 1;
@@ -65,19 +64,11 @@ physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
65{ 64{
66 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 65 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
67 66
68 read_lock(&ia->ri_qplock); 67 rpcrdma_unmap_one(ia->ri_device, seg);
69 rpcrdma_unmap_one(ia->ri_id->device, seg);
70 read_unlock(&ia->ri_qplock);
71
72 return 1; 68 return 1;
73} 69}
74 70
75static void 71static void
76physical_op_reset(struct rpcrdma_xprt *r_xprt)
77{
78}
79
80static void
81physical_op_destroy(struct rpcrdma_buffer *buf) 72physical_op_destroy(struct rpcrdma_buffer *buf)
82{ 73{
83} 74}
@@ -88,7 +79,6 @@ const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
88 .ro_open = physical_op_open, 79 .ro_open = physical_op_open,
89 .ro_maxpages = physical_op_maxpages, 80 .ro_maxpages = physical_op_maxpages,
90 .ro_init = physical_op_init, 81 .ro_init = physical_op_init,
91 .ro_reset = physical_op_reset,
92 .ro_destroy = physical_op_destroy, 82 .ro_destroy = physical_op_destroy,
93 .ro_displayname = "physical", 83 .ro_displayname = "physical",
94}; 84};
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 2c53ea9e1b83..84ea37daef36 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -284,9 +284,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
284 return (unsigned char *)iptr - (unsigned char *)headerp; 284 return (unsigned char *)iptr - (unsigned char *)headerp;
285 285
286out: 286out:
287 if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
288 return n;
289
290 for (pos = 0; nchunks--;) 287 for (pos = 0; nchunks--;)
291 pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, 288 pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
292 &req->rl_segments[pos]); 289 &req->rl_segments[pos]);
@@ -732,8 +729,8 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
732 struct rpcrdma_msg *headerp; 729 struct rpcrdma_msg *headerp;
733 struct rpcrdma_req *req; 730 struct rpcrdma_req *req;
734 struct rpc_rqst *rqst; 731 struct rpc_rqst *rqst;
735 struct rpc_xprt *xprt = rep->rr_xprt; 732 struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
736 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 733 struct rpc_xprt *xprt = &r_xprt->rx_xprt;
737 __be32 *iptr; 734 __be32 *iptr;
738 int rdmalen, status; 735 int rdmalen, status;
739 unsigned long cwnd; 736 unsigned long cwnd;
@@ -770,7 +767,6 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
770 rep->rr_len); 767 rep->rr_len);
771repost: 768repost:
772 r_xprt->rx_stats.bad_reply_count++; 769 r_xprt->rx_stats.bad_reply_count++;
773 rep->rr_func = rpcrdma_reply_handler;
774 if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep)) 770 if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
775 rpcrdma_recv_buffer_put(rep); 771 rpcrdma_recv_buffer_put(rep);
776 772
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 436da2caec95..680f888a9ddd 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -240,6 +240,16 @@ xprt_rdma_connect_worker(struct work_struct *work)
240 xprt_clear_connecting(xprt); 240 xprt_clear_connecting(xprt);
241} 241}
242 242
243static void
244xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
245{
246 struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt,
247 rx_xprt);
248
249 pr_info("rpcrdma: injecting transport disconnect on xprt=%p\n", xprt);
250 rdma_disconnect(r_xprt->rx_ia.ri_id);
251}
252
243/* 253/*
244 * xprt_rdma_destroy 254 * xprt_rdma_destroy
245 * 255 *
@@ -612,12 +622,6 @@ xprt_rdma_send_request(struct rpc_task *task)
612 if (req->rl_reply == NULL) /* e.g. reconnection */ 622 if (req->rl_reply == NULL) /* e.g. reconnection */
613 rpcrdma_recv_buffer_get(req); 623 rpcrdma_recv_buffer_get(req);
614 624
615 if (req->rl_reply) {
616 req->rl_reply->rr_func = rpcrdma_reply_handler;
617 /* this need only be done once, but... */
618 req->rl_reply->rr_xprt = xprt;
619 }
620
621 /* Must suppress retransmit to maintain credits */ 625 /* Must suppress retransmit to maintain credits */
622 if (req->rl_connect_cookie == xprt->connect_cookie) 626 if (req->rl_connect_cookie == xprt->connect_cookie)
623 goto drop_connection; 627 goto drop_connection;
@@ -676,6 +680,17 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
676 r_xprt->rx_stats.bad_reply_count); 680 r_xprt->rx_stats.bad_reply_count);
677} 681}
678 682
683static int
684xprt_rdma_enable_swap(struct rpc_xprt *xprt)
685{
686 return -EINVAL;
687}
688
689static void
690xprt_rdma_disable_swap(struct rpc_xprt *xprt)
691{
692}
693
679/* 694/*
680 * Plumbing for rpc transport switch and kernel module 695 * Plumbing for rpc transport switch and kernel module
681 */ 696 */
@@ -694,7 +709,10 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
694 .send_request = xprt_rdma_send_request, 709 .send_request = xprt_rdma_send_request,
695 .close = xprt_rdma_close, 710 .close = xprt_rdma_close,
696 .destroy = xprt_rdma_destroy, 711 .destroy = xprt_rdma_destroy,
697 .print_stats = xprt_rdma_print_stats 712 .print_stats = xprt_rdma_print_stats,
713 .enable_swap = xprt_rdma_enable_swap,
714 .disable_swap = xprt_rdma_disable_swap,
715 .inject_disconnect = xprt_rdma_inject_disconnect
698}; 716};
699 717
700static struct xprt_class xprt_rdma = { 718static struct xprt_class xprt_rdma = {
@@ -720,17 +738,24 @@ void xprt_rdma_cleanup(void)
720 if (rc) 738 if (rc)
721 dprintk("RPC: %s: xprt_unregister returned %i\n", 739 dprintk("RPC: %s: xprt_unregister returned %i\n",
722 __func__, rc); 740 __func__, rc);
741
742 frwr_destroy_recovery_wq();
723} 743}
724 744
725int xprt_rdma_init(void) 745int xprt_rdma_init(void)
726{ 746{
727 int rc; 747 int rc;
728 748
729 rc = xprt_register_transport(&xprt_rdma); 749 rc = frwr_alloc_recovery_wq();
730
731 if (rc) 750 if (rc)
732 return rc; 751 return rc;
733 752
753 rc = xprt_register_transport(&xprt_rdma);
754 if (rc) {
755 frwr_destroy_recovery_wq();
756 return rc;
757 }
758
734 dprintk("RPCRDMA Module Init, register RPC RDMA transport\n"); 759 dprintk("RPCRDMA Module Init, register RPC RDMA transport\n");
735 760
736 dprintk("Defaults:\n"); 761 dprintk("Defaults:\n");
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 52df265b472a..891c4ede2c20 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -80,7 +80,6 @@ static void
80rpcrdma_run_tasklet(unsigned long data) 80rpcrdma_run_tasklet(unsigned long data)
81{ 81{
82 struct rpcrdma_rep *rep; 82 struct rpcrdma_rep *rep;
83 void (*func)(struct rpcrdma_rep *);
84 unsigned long flags; 83 unsigned long flags;
85 84
86 data = data; 85 data = data;
@@ -89,14 +88,9 @@ rpcrdma_run_tasklet(unsigned long data)
89 rep = list_entry(rpcrdma_tasklets_g.next, 88 rep = list_entry(rpcrdma_tasklets_g.next,
90 struct rpcrdma_rep, rr_list); 89 struct rpcrdma_rep, rr_list);
91 list_del(&rep->rr_list); 90 list_del(&rep->rr_list);
92 func = rep->rr_func;
93 rep->rr_func = NULL;
94 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); 91 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
95 92
96 if (func) 93 rpcrdma_reply_handler(rep);
97 func(rep);
98 else
99 rpcrdma_recv_buffer_put(rep);
100 94
101 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); 95 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
102 } 96 }
@@ -236,7 +230,7 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
236 __func__, rep, wc->byte_len); 230 __func__, rep, wc->byte_len);
237 231
238 rep->rr_len = wc->byte_len; 232 rep->rr_len = wc->byte_len;
239 ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device, 233 ib_dma_sync_single_for_cpu(rep->rr_device,
240 rdmab_addr(rep->rr_rdmabuf), 234 rdmab_addr(rep->rr_rdmabuf),
241 rep->rr_len, DMA_FROM_DEVICE); 235 rep->rr_len, DMA_FROM_DEVICE);
242 prefetch(rdmab_to_msg(rep->rr_rdmabuf)); 236 prefetch(rdmab_to_msg(rep->rr_rdmabuf));
@@ -407,7 +401,7 @@ connected:
407 401
408 pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n", 402 pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
409 sap, rpc_get_port(sap), 403 sap, rpc_get_port(sap),
410 ia->ri_id->device->name, 404 ia->ri_device->name,
411 ia->ri_ops->ro_displayname, 405 ia->ri_ops->ro_displayname,
412 xprt->rx_buf.rb_max_requests, 406 xprt->rx_buf.rb_max_requests,
413 ird, ird < 4 && ird < tird / 2 ? " (low!)" : ""); 407 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
@@ -508,8 +502,9 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
508 rc = PTR_ERR(ia->ri_id); 502 rc = PTR_ERR(ia->ri_id);
509 goto out1; 503 goto out1;
510 } 504 }
505 ia->ri_device = ia->ri_id->device;
511 506
512 ia->ri_pd = ib_alloc_pd(ia->ri_id->device); 507 ia->ri_pd = ib_alloc_pd(ia->ri_device);
513 if (IS_ERR(ia->ri_pd)) { 508 if (IS_ERR(ia->ri_pd)) {
514 rc = PTR_ERR(ia->ri_pd); 509 rc = PTR_ERR(ia->ri_pd);
515 dprintk("RPC: %s: ib_alloc_pd() failed %i\n", 510 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
@@ -517,7 +512,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
517 goto out2; 512 goto out2;
518 } 513 }
519 514
520 rc = ib_query_device(ia->ri_id->device, devattr); 515 rc = ib_query_device(ia->ri_device, devattr);
521 if (rc) { 516 if (rc) {
522 dprintk("RPC: %s: ib_query_device failed %d\n", 517 dprintk("RPC: %s: ib_query_device failed %d\n",
523 __func__, rc); 518 __func__, rc);
@@ -526,7 +521,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
526 521
527 if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { 522 if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
528 ia->ri_have_dma_lkey = 1; 523 ia->ri_have_dma_lkey = 1;
529 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey; 524 ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
530 } 525 }
531 526
532 if (memreg == RPCRDMA_FRMR) { 527 if (memreg == RPCRDMA_FRMR) {
@@ -541,7 +536,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
541 } 536 }
542 } 537 }
543 if (memreg == RPCRDMA_MTHCAFMR) { 538 if (memreg == RPCRDMA_MTHCAFMR) {
544 if (!ia->ri_id->device->alloc_fmr) { 539 if (!ia->ri_device->alloc_fmr) {
545 dprintk("RPC: %s: MTHCAFMR registration " 540 dprintk("RPC: %s: MTHCAFMR registration "
546 "not supported by HCA\n", __func__); 541 "not supported by HCA\n", __func__);
547 memreg = RPCRDMA_ALLPHYSICAL; 542 memreg = RPCRDMA_ALLPHYSICAL;
@@ -590,9 +585,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
590 dprintk("RPC: %s: memory registration strategy is '%s'\n", 585 dprintk("RPC: %s: memory registration strategy is '%s'\n",
591 __func__, ia->ri_ops->ro_displayname); 586 __func__, ia->ri_ops->ro_displayname);
592 587
593 /* Else will do memory reg/dereg for each chunk */
594 ia->ri_memreg_strategy = memreg;
595
596 rwlock_init(&ia->ri_qplock); 588 rwlock_init(&ia->ri_qplock);
597 return 0; 589 return 0;
598 590
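
From here down, every ia->ri_id->device dereference in verbs.c becomes ia->ri_device, cached once in rpcrdma_ia_open() right after the CM ID is created. Besides shortening the pointer chains, the cached copy gives the transport a device pointer that remains meaningful to compare against when ri_id is replaced during reconnect (see the "can't reconnect on different device" check further down). Condensed from the hunk above, with the create call paraphrased from context outside this excerpt:

	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
	if (IS_ERR(ia->ri_id)) {
		rc = PTR_ERR(ia->ri_id);
		goto out1;
	}
	/* Cache the device pointer once; use ia->ri_device from now on. */
	ia->ri_device = ia->ri_id->device;
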
@@ -622,17 +614,17 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
622 dprintk("RPC: %s: ib_dereg_mr returned %i\n", 614 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
623 __func__, rc); 615 __func__, rc);
624 } 616 }
617
625 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { 618 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
626 if (ia->ri_id->qp) 619 if (ia->ri_id->qp)
627 rdma_destroy_qp(ia->ri_id); 620 rdma_destroy_qp(ia->ri_id);
628 rdma_destroy_id(ia->ri_id); 621 rdma_destroy_id(ia->ri_id);
629 ia->ri_id = NULL; 622 ia->ri_id = NULL;
630 } 623 }
631 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) { 624
632 rc = ib_dealloc_pd(ia->ri_pd); 625 /* If the pd is still busy, xprtrdma missed freeing a resource */
633 dprintk("RPC: %s: ib_dealloc_pd returned %i\n", 626 if (ia->ri_pd && !IS_ERR(ia->ri_pd))
634 __func__, rc); 627 WARN_ON(ib_dealloc_pd(ia->ri_pd));
635 }
636} 628}
637 629
638/* 630/*
@@ -693,8 +685,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
693 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); 685 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
694 686
695 cq_attr.cqe = ep->rep_attr.cap.max_send_wr + 1; 687 cq_attr.cqe = ep->rep_attr.cap.max_send_wr + 1;
696 sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall, 688 sendcq = ib_create_cq(ia->ri_device, rpcrdma_sendcq_upcall,
697 rpcrdma_cq_async_error_upcall, ep, &cq_attr); 689 rpcrdma_cq_async_error_upcall, ep, &cq_attr);
698 if (IS_ERR(sendcq)) { 690 if (IS_ERR(sendcq)) {
699 rc = PTR_ERR(sendcq); 691 rc = PTR_ERR(sendcq);
700 dprintk("RPC: %s: failed to create send CQ: %i\n", 692 dprintk("RPC: %s: failed to create send CQ: %i\n",
@@ -710,8 +702,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
710 } 702 }
711 703
712 cq_attr.cqe = ep->rep_attr.cap.max_recv_wr + 1; 704 cq_attr.cqe = ep->rep_attr.cap.max_recv_wr + 1;
713 recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall, 705 recvcq = ib_create_cq(ia->ri_device, rpcrdma_recvcq_upcall,
714 rpcrdma_cq_async_error_upcall, ep, &cq_attr); 706 rpcrdma_cq_async_error_upcall, ep, &cq_attr);
715 if (IS_ERR(recvcq)) { 707 if (IS_ERR(recvcq)) {
716 rc = PTR_ERR(recvcq); 708 rc = PTR_ERR(recvcq);
717 dprintk("RPC: %s: failed to create recv CQ: %i\n", 709 dprintk("RPC: %s: failed to create recv CQ: %i\n",
@@ -817,8 +809,6 @@ retry:
817 rpcrdma_flush_cqs(ep); 809 rpcrdma_flush_cqs(ep);
818 810
819 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); 811 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
820 ia->ri_ops->ro_reset(xprt);
821
822 id = rpcrdma_create_id(xprt, ia, 812 id = rpcrdma_create_id(xprt, ia,
823 (struct sockaddr *)&xprt->rx_data.addr); 813 (struct sockaddr *)&xprt->rx_data.addr);
824 if (IS_ERR(id)) { 814 if (IS_ERR(id)) {
@@ -832,7 +822,7 @@ retry:
832 * More stuff I haven't thought of! 822 * More stuff I haven't thought of!
833 * Rrrgh! 823 * Rrrgh!
834 */ 824 */
835 if (ia->ri_id->device != id->device) { 825 if (ia->ri_device != id->device) {
836 printk("RPC: %s: can't reconnect on " 826 printk("RPC: %s: can't reconnect on "
837 "different device!\n", __func__); 827 "different device!\n", __func__);
838 rdma_destroy_id(id); 828 rdma_destroy_id(id);
@@ -974,7 +964,8 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
974 goto out_free; 964 goto out_free;
975 } 965 }
976 966
977 rep->rr_buffer = &r_xprt->rx_buf; 967 rep->rr_device = ia->ri_device;
968 rep->rr_rxprt = r_xprt;
978 return rep; 969 return rep;
979 970
980out_free: 971out_free:
@@ -1098,31 +1089,33 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1098 kfree(buf->rb_pool); 1089 kfree(buf->rb_pool);
1099} 1090}
1100 1091
1101/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving 1092struct rpcrdma_mw *
1102 * some req segments uninitialized. 1093rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
1103 */
1104static void
1105rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
1106{ 1094{
1107 if (*mw) { 1095 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1108 list_add_tail(&(*mw)->mw_list, &buf->rb_mws); 1096 struct rpcrdma_mw *mw = NULL;
1109 *mw = NULL; 1097
1098 spin_lock(&buf->rb_mwlock);
1099 if (!list_empty(&buf->rb_mws)) {
1100 mw = list_first_entry(&buf->rb_mws,
1101 struct rpcrdma_mw, mw_list);
1102 list_del_init(&mw->mw_list);
1110 } 1103 }
1104 spin_unlock(&buf->rb_mwlock);
1105
1106 if (!mw)
1107 pr_err("RPC: %s: no MWs available\n", __func__);
1108 return mw;
1111} 1109}
1112 1110
1113/* Cycle mw's back in reverse order, and "spin" them. 1111void
1114 * This delays and scrambles reuse as much as possible. 1112rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
1115 */
1116static void
1117rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1118{ 1113{
1119 struct rpcrdma_mr_seg *seg = req->rl_segments; 1114 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1120 struct rpcrdma_mr_seg *seg1 = seg;
1121 int i;
1122 1115
1123 for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++) 1116 spin_lock(&buf->rb_mwlock);
1124 rpcrdma_buffer_put_mr(&seg->rl_mw, buf); 1117 list_add_tail(&mw->mw_list, &buf->rb_mws);
1125 rpcrdma_buffer_put_mr(&seg1->rl_mw, buf); 1118 spin_unlock(&buf->rb_mwlock);
1126} 1119}
1127 1120
1128static void 1121static void
@@ -1132,115 +1125,10 @@ rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1132 req->rl_niovs = 0; 1125 req->rl_niovs = 0;
1133 if (req->rl_reply) { 1126 if (req->rl_reply) {
1134 buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply; 1127 buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
1135 req->rl_reply->rr_func = NULL;
1136 req->rl_reply = NULL; 1128 req->rl_reply = NULL;
1137 } 1129 }
1138} 1130}
1139 1131
1140/* rpcrdma_unmap_one() was already done during deregistration.
1141 * Redo only the ib_post_send().
1142 */
1143static void
1144rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
1145{
1146 struct rpcrdma_xprt *r_xprt =
1147 container_of(ia, struct rpcrdma_xprt, rx_ia);
1148 struct ib_send_wr invalidate_wr, *bad_wr;
1149 int rc;
1150
1151 dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);
1152
1153 /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
1154 r->r.frmr.fr_state = FRMR_IS_INVALID;
1155
1156 memset(&invalidate_wr, 0, sizeof(invalidate_wr));
1157 invalidate_wr.wr_id = (unsigned long)(void *)r;
1158 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1159 invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
1160 DECR_CQCOUNT(&r_xprt->rx_ep);
1161
1162 dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
1163 __func__, r, r->r.frmr.fr_mr->rkey);
1164
1165 read_lock(&ia->ri_qplock);
1166 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1167 read_unlock(&ia->ri_qplock);
1168 if (rc) {
1169 /* Force rpcrdma_buffer_get() to retry */
1170 r->r.frmr.fr_state = FRMR_IS_STALE;
1171 dprintk("RPC: %s: ib_post_send failed, %i\n",
1172 __func__, rc);
1173 }
1174}
1175
1176static void
1177rpcrdma_retry_flushed_linv(struct list_head *stale,
1178 struct rpcrdma_buffer *buf)
1179{
1180 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1181 struct list_head *pos;
1182 struct rpcrdma_mw *r;
1183 unsigned long flags;
1184
1185 list_for_each(pos, stale) {
1186 r = list_entry(pos, struct rpcrdma_mw, mw_list);
1187 rpcrdma_retry_local_inv(r, ia);
1188 }
1189
1190 spin_lock_irqsave(&buf->rb_lock, flags);
1191 list_splice_tail(stale, &buf->rb_mws);
1192 spin_unlock_irqrestore(&buf->rb_lock, flags);
1193}
1194
1195static struct rpcrdma_req *
1196rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
1197 struct list_head *stale)
1198{
1199 struct rpcrdma_mw *r;
1200 int i;
1201
1202 i = RPCRDMA_MAX_SEGS - 1;
1203 while (!list_empty(&buf->rb_mws)) {
1204 r = list_entry(buf->rb_mws.next,
1205 struct rpcrdma_mw, mw_list);
1206 list_del(&r->mw_list);
1207 if (r->r.frmr.fr_state == FRMR_IS_STALE) {
1208 list_add(&r->mw_list, stale);
1209 continue;
1210 }
1211 req->rl_segments[i].rl_mw = r;
1212 if (unlikely(i-- == 0))
1213 return req; /* Success */
1214 }
1215
1216 /* Not enough entries on rb_mws for this req */
1217 rpcrdma_buffer_put_sendbuf(req, buf);
1218 rpcrdma_buffer_put_mrs(req, buf);
1219 return NULL;
1220}
1221
1222static struct rpcrdma_req *
1223rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1224{
1225 struct rpcrdma_mw *r;
1226 int i;
1227
1228 i = RPCRDMA_MAX_SEGS - 1;
1229 while (!list_empty(&buf->rb_mws)) {
1230 r = list_entry(buf->rb_mws.next,
1231 struct rpcrdma_mw, mw_list);
1232 list_del(&r->mw_list);
1233 req->rl_segments[i].rl_mw = r;
1234 if (unlikely(i-- == 0))
1235 return req; /* Success */
1236 }
1237
1238 /* Not enough entries on rb_mws for this req */
1239 rpcrdma_buffer_put_sendbuf(req, buf);
1240 rpcrdma_buffer_put_mrs(req, buf);
1241 return NULL;
1242}
1243
1244/* 1132/*
1245 * Get a set of request/reply buffers. 1133 * Get a set of request/reply buffers.
1246 * 1134 *
@@ -1253,12 +1141,11 @@ rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1253struct rpcrdma_req * 1141struct rpcrdma_req *
1254rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) 1142rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1255{ 1143{
1256 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1257 struct list_head stale;
1258 struct rpcrdma_req *req; 1144 struct rpcrdma_req *req;
1259 unsigned long flags; 1145 unsigned long flags;
1260 1146
1261 spin_lock_irqsave(&buffers->rb_lock, flags); 1147 spin_lock_irqsave(&buffers->rb_lock, flags);
1148
1262 if (buffers->rb_send_index == buffers->rb_max_requests) { 1149 if (buffers->rb_send_index == buffers->rb_max_requests) {
1263 spin_unlock_irqrestore(&buffers->rb_lock, flags); 1150 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1264 dprintk("RPC:       %s: out of request buffers\n", __func__); 1151 dprintk("RPC:       %s: out of request buffers\n", __func__);
@@ -1277,20 +1164,7 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1277 } 1164 }
1278 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL; 1165 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1279 1166
1280 INIT_LIST_HEAD(&stale);
1281 switch (ia->ri_memreg_strategy) {
1282 case RPCRDMA_FRMR:
1283 req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
1284 break;
1285 case RPCRDMA_MTHCAFMR:
1286 req = rpcrdma_buffer_get_fmrs(req, buffers);
1287 break;
1288 default:
1289 break;
1290 }
1291 spin_unlock_irqrestore(&buffers->rb_lock, flags); 1167 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1292 if (!list_empty(&stale))
1293 rpcrdma_retry_flushed_linv(&stale, buffers);
1294 return req; 1168 return req;
1295} 1169}
1296 1170
@@ -1302,19 +1176,10 @@ void
1302rpcrdma_buffer_put(struct rpcrdma_req *req) 1176rpcrdma_buffer_put(struct rpcrdma_req *req)
1303{ 1177{
1304 struct rpcrdma_buffer *buffers = req->rl_buffer; 1178 struct rpcrdma_buffer *buffers = req->rl_buffer;
1305 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1306 unsigned long flags; 1179 unsigned long flags;
1307 1180
1308 spin_lock_irqsave(&buffers->rb_lock, flags); 1181 spin_lock_irqsave(&buffers->rb_lock, flags);
1309 rpcrdma_buffer_put_sendbuf(req, buffers); 1182 rpcrdma_buffer_put_sendbuf(req, buffers);
1310 switch (ia->ri_memreg_strategy) {
1311 case RPCRDMA_FRMR:
1312 case RPCRDMA_MTHCAFMR:
1313 rpcrdma_buffer_put_mrs(req, buffers);
1314 break;
1315 default:
1316 break;
1317 }
1318 spin_unlock_irqrestore(&buffers->rb_lock, flags); 1183 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1319} 1184}
1320 1185
@@ -1344,10 +1209,9 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1344void 1209void
1345rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) 1210rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1346{ 1211{
1347 struct rpcrdma_buffer *buffers = rep->rr_buffer; 1212 struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;
1348 unsigned long flags; 1213 unsigned long flags;
1349 1214
1350 rep->rr_func = NULL;
1351 spin_lock_irqsave(&buffers->rb_lock, flags); 1215 spin_lock_irqsave(&buffers->rb_lock, flags);
1352 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep; 1216 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1353 spin_unlock_irqrestore(&buffers->rb_lock, flags); 1217 spin_unlock_irqrestore(&buffers->rb_lock, flags);
@@ -1376,9 +1240,9 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1376 /* 1240 /*
1377 * All memory passed here was kmalloc'ed, therefore phys-contiguous. 1241 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1378 */ 1242 */
1379 iov->addr = ib_dma_map_single(ia->ri_id->device, 1243 iov->addr = ib_dma_map_single(ia->ri_device,
1380 va, len, DMA_BIDIRECTIONAL); 1244 va, len, DMA_BIDIRECTIONAL);
1381 if (ib_dma_mapping_error(ia->ri_id->device, iov->addr)) 1245 if (ib_dma_mapping_error(ia->ri_device, iov->addr))
1382 return -ENOMEM; 1246 return -ENOMEM;
1383 1247
1384 iov->length = len; 1248 iov->length = len;
@@ -1422,8 +1286,8 @@ rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1422{ 1286{
1423 int rc; 1287 int rc;
1424 1288
1425 ib_dma_unmap_single(ia->ri_id->device, 1289 ib_dma_unmap_single(ia->ri_device,
1426 iov->addr, iov->length, DMA_BIDIRECTIONAL); 1290 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1427 1291
1428 if (NULL == mr) 1292 if (NULL == mr)
1429 return 0; 1293 return 0;
@@ -1516,15 +1380,18 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
1516 send_wr.num_sge = req->rl_niovs; 1380 send_wr.num_sge = req->rl_niovs;
1517 send_wr.opcode = IB_WR_SEND; 1381 send_wr.opcode = IB_WR_SEND;
1518 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */ 1382 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1519 ib_dma_sync_single_for_device(ia->ri_id->device, 1383 ib_dma_sync_single_for_device(ia->ri_device,
1520 req->rl_send_iov[3].addr, req->rl_send_iov[3].length, 1384 req->rl_send_iov[3].addr,
1521 DMA_TO_DEVICE); 1385 req->rl_send_iov[3].length,
1522 ib_dma_sync_single_for_device(ia->ri_id->device, 1386 DMA_TO_DEVICE);
1523 req->rl_send_iov[1].addr, req->rl_send_iov[1].length, 1387 ib_dma_sync_single_for_device(ia->ri_device,
1524 DMA_TO_DEVICE); 1388 req->rl_send_iov[1].addr,
1525 ib_dma_sync_single_for_device(ia->ri_id->device, 1389 req->rl_send_iov[1].length,
1526 req->rl_send_iov[0].addr, req->rl_send_iov[0].length, 1390 DMA_TO_DEVICE);
1527 DMA_TO_DEVICE); 1391 ib_dma_sync_single_for_device(ia->ri_device,
1392 req->rl_send_iov[0].addr,
1393 req->rl_send_iov[0].length,
1394 DMA_TO_DEVICE);
1528 1395
1529 if (DECR_CQCOUNT(ep) > 0) 1396 if (DECR_CQCOUNT(ep) > 0)
1530 send_wr.send_flags = 0; 1397 send_wr.send_flags = 0;
@@ -1557,7 +1424,7 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1557 recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; 1424 recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
1558 recv_wr.num_sge = 1; 1425 recv_wr.num_sge = 1;
1559 1426
1560 ib_dma_sync_single_for_cpu(ia->ri_id->device, 1427 ib_dma_sync_single_for_cpu(ia->ri_device,
1561 rdmab_addr(rep->rr_rdmabuf), 1428 rdmab_addr(rep->rr_rdmabuf),
1562 rdmab_length(rep->rr_rdmabuf), 1429 rdmab_length(rep->rr_rdmabuf),
1563 DMA_BIDIRECTIONAL); 1430 DMA_BIDIRECTIONAL);
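
The bulk of the verbs.c change retires the FRMR/FMR-specific buffer_get/put machinery (stale-FRMR retry, reverse-order MR cycling) in favor of a single helper pair, rpcrdma_get_mw()/rpcrdma_put_mw(), that treats rb_mws as a plain FIFO free list under the new rb_mwlock; recovery of flushed FRMRs moves to the fr_work/frwr_alloc_recovery_wq() plumbing added elsewhere in this series. A runnable user-space sketch of the same free-list discipline, with a pthread mutex standing in for the spinlock and illustrative names throughout (an analogy, not kernel API):

	#include <pthread.h>
	#include <stddef.h>
	#include <stdio.h>

	struct mw {
		struct mw *next;
		int id;
	};

	static pthread_mutex_t mwlock = PTHREAD_MUTEX_INITIALIZER;
	static struct mw *mw_head, *mw_tail;	/* FIFO free list */

	/* Like rpcrdma_get_mw(): take from the front, NULL when empty. */
	static struct mw *get_mw(void)
	{
		struct mw *mw;

		pthread_mutex_lock(&mwlock);
		mw = mw_head;
		if (mw) {
			mw_head = mw->next;
			if (!mw_head)
				mw_tail = NULL;
		}
		pthread_mutex_unlock(&mwlock);
		return mw;
	}

	/* Like rpcrdma_put_mw(): append at the tail. */
	static void put_mw(struct mw *mw)
	{
		pthread_mutex_lock(&mwlock);
		mw->next = NULL;
		if (mw_tail)
			mw_tail->next = mw;
		else
			mw_head = mw;
		mw_tail = mw;
		pthread_mutex_unlock(&mwlock);
	}

	int main(void)
	{
		struct mw a = { .id = 1 }, b = { .id = 2 };

		put_mw(&a);
		put_mw(&b);
		printf("got %d\n", get_mw()->id);	/* prints 1: FIFO */
		return 0;
	}

As in the kernel version, get_mw() never blocks; callers must cope with a NULL return when the pool is exhausted.
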
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 58163b88738c..f49dd8b38122 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -62,6 +62,7 @@
62struct rpcrdma_ia { 62struct rpcrdma_ia {
63 const struct rpcrdma_memreg_ops *ri_ops; 63 const struct rpcrdma_memreg_ops *ri_ops;
64 rwlock_t ri_qplock; 64 rwlock_t ri_qplock;
65 struct ib_device *ri_device;
65 struct rdma_cm_id *ri_id; 66 struct rdma_cm_id *ri_id;
66 struct ib_pd *ri_pd; 67 struct ib_pd *ri_pd;
67 struct ib_mr *ri_bind_mem; 68 struct ib_mr *ri_bind_mem;
@@ -69,7 +70,6 @@ struct rpcrdma_ia {
69 int ri_have_dma_lkey; 70 int ri_have_dma_lkey;
70 struct completion ri_done; 71 struct completion ri_done;
71 int ri_async_rc; 72 int ri_async_rc;
72 enum rpcrdma_memreg ri_memreg_strategy;
73 unsigned int ri_max_frmr_depth; 73 unsigned int ri_max_frmr_depth;
74 struct ib_device_attr ri_devattr; 74 struct ib_device_attr ri_devattr;
75 struct ib_qp_attr ri_qp_attr; 75 struct ib_qp_attr ri_qp_attr;
@@ -173,9 +173,8 @@ struct rpcrdma_buffer;
173 173
174struct rpcrdma_rep { 174struct rpcrdma_rep {
175 unsigned int rr_len; 175 unsigned int rr_len;
176 struct rpcrdma_buffer *rr_buffer; 176 struct ib_device *rr_device;
177 struct rpc_xprt *rr_xprt; 177 struct rpcrdma_xprt *rr_rxprt;
178 void (*rr_func)(struct rpcrdma_rep *);
179 struct list_head rr_list; 178 struct list_head rr_list;
180 struct rpcrdma_regbuf *rr_rdmabuf; 179 struct rpcrdma_regbuf *rr_rdmabuf;
181}; 180};
@@ -203,11 +202,18 @@ struct rpcrdma_frmr {
203 struct ib_fast_reg_page_list *fr_pgl; 202 struct ib_fast_reg_page_list *fr_pgl;
204 struct ib_mr *fr_mr; 203 struct ib_mr *fr_mr;
205 enum rpcrdma_frmr_state fr_state; 204 enum rpcrdma_frmr_state fr_state;
205 struct work_struct fr_work;
206 struct rpcrdma_xprt *fr_xprt;
207};
208
209struct rpcrdma_fmr {
210 struct ib_fmr *fmr;
211 u64 *physaddrs;
206}; 212};
207 213
208struct rpcrdma_mw { 214struct rpcrdma_mw {
209 union { 215 union {
210 struct ib_fmr *fmr; 216 struct rpcrdma_fmr fmr;
211 struct rpcrdma_frmr frmr; 217 struct rpcrdma_frmr frmr;
212 } r; 218 } r;
213 void (*mw_sendcompletion)(struct ib_wc *); 219 void (*mw_sendcompletion)(struct ib_wc *);
@@ -281,15 +287,17 @@ rpcr_to_rdmar(struct rpc_rqst *rqst)
281 * One of these is associated with a transport instance 287 * One of these is associated with a transport instance
282 */ 288 */
283struct rpcrdma_buffer { 289struct rpcrdma_buffer {
284 spinlock_t rb_lock; /* protects indexes */ 290 spinlock_t rb_mwlock; /* protect rb_mws list */
285 u32 rb_max_requests;/* client max requests */ 291 struct list_head rb_mws;
286 struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ 292 struct list_head rb_all;
287 struct list_head rb_all; 293 char *rb_pool;
288 int rb_send_index; 294
295 spinlock_t rb_lock; /* protect buf arrays */
296 u32 rb_max_requests;
297 int rb_send_index;
298 int rb_recv_index;
289 struct rpcrdma_req **rb_send_bufs; 299 struct rpcrdma_req **rb_send_bufs;
290 int rb_recv_index;
291 struct rpcrdma_rep **rb_recv_bufs; 300 struct rpcrdma_rep **rb_recv_bufs;
292 char *rb_pool;
293}; 301};
294#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) 302#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
295 303
@@ -350,7 +358,6 @@ struct rpcrdma_memreg_ops {
350 struct rpcrdma_create_data_internal *); 358 struct rpcrdma_create_data_internal *);
351 size_t (*ro_maxpages)(struct rpcrdma_xprt *); 359 size_t (*ro_maxpages)(struct rpcrdma_xprt *);
352 int (*ro_init)(struct rpcrdma_xprt *); 360 int (*ro_init)(struct rpcrdma_xprt *);
353 void (*ro_reset)(struct rpcrdma_xprt *);
354 void (*ro_destroy)(struct rpcrdma_buffer *); 361 void (*ro_destroy)(struct rpcrdma_buffer *);
355 const char *ro_displayname; 362 const char *ro_displayname;
356}; 363};
@@ -413,6 +420,8 @@ int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
413int rpcrdma_buffer_create(struct rpcrdma_xprt *); 420int rpcrdma_buffer_create(struct rpcrdma_xprt *);
414void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); 421void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
415 422
423struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *);
424void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *);
416struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); 425struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
417void rpcrdma_buffer_put(struct rpcrdma_req *); 426void rpcrdma_buffer_put(struct rpcrdma_req *);
418void rpcrdma_recv_buffer_get(struct rpcrdma_req *); 427void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
@@ -425,6 +434,9 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *,
425 434
426unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *); 435unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
427 436
437int frwr_alloc_recovery_wq(void);
438void frwr_destroy_recovery_wq(void);
439
428/* 440/*
429 * Wrappers for chunk registration, shared by read/write chunk code. 441 * Wrappers for chunk registration, shared by read/write chunk code.
430 */ 442 */
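
The header hunks make the new ownership model explicit: rpcrdma_rep records its device and owning transport directly (rr_buffer and the rr_func callback are gone), struct rpcrdma_fmr pairs the ib_fmr with its physaddrs array, ri_memreg_strategy disappears because everything strategy-specific now lives behind ri_ops, and rpcrdma_buffer splits into two independently locked regions. Restating that split from the declaration above:

	struct rpcrdma_buffer {
		spinlock_t		rb_mwlock;	/* protects rb_mws */
		struct list_head	rb_mws;
		struct list_head	rb_all;
		char			*rb_pool;

		spinlock_t		rb_lock;	/* protects buf arrays */
		u32			rb_max_requests;
		int			rb_send_index;
		int			rb_recv_index;
		struct rpcrdma_req	**rb_send_bufs;
		struct rpcrdma_rep	**rb_recv_bufs;
	};

MW get/put traffic and send/receive buffer indexing no longer contend on the same lock.
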
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index b0517287075b..e193c2b5476b 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -623,24 +623,6 @@ process_status:
623} 623}
624 624
625/** 625/**
626 * xs_tcp_shutdown - gracefully shut down a TCP socket
627 * @xprt: transport
628 *
629 * Initiates a graceful shutdown of the TCP socket by calling the
630 * equivalent of shutdown(SHUT_RDWR);
631 */
632static void xs_tcp_shutdown(struct rpc_xprt *xprt)
633{
634 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
635 struct socket *sock = transport->sock;
636
637 if (sock != NULL) {
638 kernel_sock_shutdown(sock, SHUT_RDWR);
639 trace_rpc_socket_shutdown(xprt, sock);
640 }
641}
642
643/**
644 * xs_tcp_send_request - write an RPC request to a TCP socket 626 * xs_tcp_send_request - write an RPC request to a TCP socket
645 * @task: address of RPC task that manages the state of an RPC request 627 * @task: address of RPC task that manages the state of an RPC request
646 * 628 *
@@ -786,6 +768,7 @@ static void xs_sock_mark_closed(struct rpc_xprt *xprt)
786 xs_sock_reset_connection_flags(xprt); 768 xs_sock_reset_connection_flags(xprt);
787 /* Mark transport as closed and wake up all pending tasks */ 769 /* Mark transport as closed and wake up all pending tasks */
788 xprt_disconnect_done(xprt); 770 xprt_disconnect_done(xprt);
771 xprt_force_disconnect(xprt);
789} 772}
790 773
791/** 774/**
@@ -827,6 +810,9 @@ static void xs_reset_transport(struct sock_xprt *transport)
827 if (sk == NULL) 810 if (sk == NULL)
828 return; 811 return;
829 812
813 if (atomic_read(&transport->xprt.swapper))
814 sk_clear_memalloc(sk);
815
830 write_lock_bh(&sk->sk_callback_lock); 816 write_lock_bh(&sk->sk_callback_lock);
831 transport->inet = NULL; 817 transport->inet = NULL;
832 transport->sock = NULL; 818 transport->sock = NULL;
@@ -863,6 +849,13 @@ static void xs_close(struct rpc_xprt *xprt)
863 xprt_disconnect_done(xprt); 849 xprt_disconnect_done(xprt);
864} 850}
865 851
852static void xs_inject_disconnect(struct rpc_xprt *xprt)
853{
854 dprintk("RPC: injecting transport disconnect on xprt=%p\n",
855 xprt);
856 xprt_disconnect_done(xprt);
857}
858
866static void xs_xprt_free(struct rpc_xprt *xprt) 859static void xs_xprt_free(struct rpc_xprt *xprt)
867{ 860{
868 xs_free_peer_addresses(xprt); 861 xs_free_peer_addresses(xprt);
@@ -901,7 +894,6 @@ static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
901/** 894/**
902 * xs_local_data_ready - "data ready" callback for AF_LOCAL sockets 895 * xs_local_data_ready - "data ready" callback for AF_LOCAL sockets
903 * @sk: socket with data to read 896 * @sk: socket with data to read
904 * @len: how much data to read
905 * 897 *
906 * Currently this assumes we can read the whole reply in a single gulp. 898 * Currently this assumes we can read the whole reply in a single gulp.
907 */ 899 */
@@ -965,7 +957,6 @@ static void xs_local_data_ready(struct sock *sk)
965/** 957/**
966 * xs_udp_data_ready - "data ready" callback for UDP sockets 958 * xs_udp_data_ready - "data ready" callback for UDP sockets
967 * @sk: socket with data to read 959 * @sk: socket with data to read
968 * @len: how much data to read
969 * 960 *
970 */ 961 */
971static void xs_udp_data_ready(struct sock *sk) 962static void xs_udp_data_ready(struct sock *sk)
@@ -1389,7 +1380,6 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns
1389/** 1380/**
1390 * xs_tcp_data_ready - "data ready" callback for TCP sockets 1381 * xs_tcp_data_ready - "data ready" callback for TCP sockets
1391 * @sk: socket with data to read 1382 * @sk: socket with data to read
1392 * @bytes: how much data to read
1393 * 1383 *
1394 */ 1384 */
1395static void xs_tcp_data_ready(struct sock *sk) 1385static void xs_tcp_data_ready(struct sock *sk)
@@ -1886,9 +1876,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
1886 1876
1887/** 1877/**
1888 * xs_local_setup_socket - create AF_LOCAL socket, connect to a local endpoint 1878 * xs_local_setup_socket - create AF_LOCAL socket, connect to a local endpoint
1889 * @xprt: RPC transport to connect
1890 * @transport: socket transport to connect 1879 * @transport: socket transport to connect
1891 * @create_sock: function to create a socket of the correct type
1892 */ 1880 */
1893static int xs_local_setup_socket(struct sock_xprt *transport) 1881static int xs_local_setup_socket(struct sock_xprt *transport)
1894{ 1882{
@@ -1960,43 +1948,84 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task)
1960 msleep_interruptible(15000); 1948 msleep_interruptible(15000);
1961} 1949}
1962 1950
1963#ifdef CONFIG_SUNRPC_SWAP 1951#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
1952/*
1953 * Note that this should be called with XPRT_LOCKED held (or when we otherwise
1954 * know that we have exclusive access to the socket), to guard against
1955 * races with xs_reset_transport.
1956 */
1964static void xs_set_memalloc(struct rpc_xprt *xprt) 1957static void xs_set_memalloc(struct rpc_xprt *xprt)
1965{ 1958{
1966 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, 1959 struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
1967 xprt); 1960 xprt);
1968 1961
1969 if (xprt->swapper) 1962 /*
1963 * If there's no sock, then we have nothing to set. The
1964 * reconnecting process will get it for us.
1965 */
1966 if (!transport->inet)
1967 return;
1968 if (atomic_read(&xprt->swapper))
1970 sk_set_memalloc(transport->inet); 1969 sk_set_memalloc(transport->inet);
1971} 1970}
1972 1971
1973/** 1972/**
1974 * xs_swapper - Tag this transport as being used for swap. 1973 * xs_enable_swap - Tag this transport as being used for swap.
1975 * @xprt: transport to tag 1974 * @xprt: transport to tag
1976 * @enable: enable/disable
1977 * 1975 *
1976 * Take a reference to this transport on behalf of the rpc_clnt, and
1977 * optionally mark it for swapping if it wasn't already.
1978 */ 1978 */
1979int xs_swapper(struct rpc_xprt *xprt, int enable) 1979static int
1980xs_enable_swap(struct rpc_xprt *xprt)
1980{ 1981{
1981 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, 1982 struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt);
1982 xprt);
1983 int err = 0;
1984 1983
1985 if (enable) { 1984 if (atomic_inc_return(&xprt->swapper) != 1)
1986 xprt->swapper++; 1985 return 0;
1987 xs_set_memalloc(xprt); 1986 if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE))
1988 } else if (xprt->swapper) { 1987 return -ERESTARTSYS;
1989 xprt->swapper--; 1988 if (xs->inet)
1990 sk_clear_memalloc(transport->inet); 1989 sk_set_memalloc(xs->inet);
1991 } 1990 xprt_release_xprt(xprt, NULL);
1991 return 0;
1992}
1992 1993
1993 return err; 1994/**
1995 * xs_disable_swap - Untag this transport as being used for swap.
1996 * @xprt: transport to tag
1997 *
1998 * Drop a "swapper" reference to this xprt on behalf of the rpc_clnt. If the
1999 * swapper refcount goes to 0, untag the socket as a memalloc socket.
2000 */
2001static void
2002xs_disable_swap(struct rpc_xprt *xprt)
2003{
2004 struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt);
2005
2006 if (!atomic_dec_and_test(&xprt->swapper))
2007 return;
2008 if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE))
2009 return;
2010 if (xs->inet)
2011 sk_clear_memalloc(xs->inet);
2012 xprt_release_xprt(xprt, NULL);
1994} 2013}
1995EXPORT_SYMBOL_GPL(xs_swapper);
1996#else 2014#else
1997static void xs_set_memalloc(struct rpc_xprt *xprt) 2015static void xs_set_memalloc(struct rpc_xprt *xprt)
1998{ 2016{
1999} 2017}
2018
2019static int
2020xs_enable_swap(struct rpc_xprt *xprt)
2021{
2022 return -EINVAL;
2023}
2024
2025static void
2026xs_disable_swap(struct rpc_xprt *xprt)
2027{
2028}
2000#endif 2029#endif
2001 2030
2002static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) 2031static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
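
The swap rework replaces the old xs_swapper(xprt, enable) entry point and its bare integer counter with an atomic xprt->swapper refcount behind the enable_swap/disable_swap rpc_xprt_ops wired up at the end of this file (the !CONFIG_SUNRPC_SWAP stubs simply refuse with -EINVAL). Only the first enable and the last disable touch the socket, and both take XPRT_LOCKED (wait_on_bit_lock()/xprt_release_xprt()) so they cannot race with xs_reset_transport(), which now also clears memalloc on a swapper-tagged socket before releasing it. A runnable user-space analogue of the refcount discipline, with a mutex standing in for XPRT_LOCKED and illustrative names (not kernel API):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	static atomic_int swapper;
	static pthread_mutex_t xprt_lock = PTHREAD_MUTEX_INITIALIZER;
	static bool sk_memalloc;

	static int enable_swap(void)
	{
		/* Only the 0 -> 1 transition tags the socket
		 * (atomic_inc_return(&xprt->swapper) != 1 in the patch). */
		if (atomic_fetch_add(&swapper, 1) != 0)
			return 0;
		pthread_mutex_lock(&xprt_lock);		/* XPRT_LOCKED */
		sk_memalloc = true;			/* sk_set_memalloc() */
		pthread_mutex_unlock(&xprt_lock);
		return 0;
	}

	static void disable_swap(void)
	{
		/* Only the 1 -> 0 transition untags it
		 * (atomic_dec_and_test() in the patch). */
		if (atomic_fetch_sub(&swapper, 1) != 1)
			return;
		pthread_mutex_lock(&xprt_lock);
		sk_memalloc = false;			/* sk_clear_memalloc() */
		pthread_mutex_unlock(&xprt_lock);
	}

	int main(void)
	{
		enable_swap();
		enable_swap();
		disable_swap();
		printf("memalloc=%d\n", sk_memalloc);	/* still 1 */
		disable_swap();
		printf("memalloc=%d\n", sk_memalloc);	/* now 0 */
		return 0;
	}
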
@@ -2057,6 +2086,27 @@ out:
2057 xprt_wake_pending_tasks(xprt, status); 2086 xprt_wake_pending_tasks(xprt, status);
2058} 2087}
2059 2088
2089/**
2090 * xs_tcp_shutdown - gracefully shut down a TCP socket
2091 * @xprt: transport
2092 *
2093 * Initiates a graceful shutdown of the TCP socket by calling the
2094 * equivalent of shutdown(SHUT_RDWR);
2095 */
2096static void xs_tcp_shutdown(struct rpc_xprt *xprt)
2097{
2098 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
2099 struct socket *sock = transport->sock;
2100
2101 if (sock == NULL)
2102 return;
2103 if (xprt_connected(xprt)) {
2104 kernel_sock_shutdown(sock, SHUT_RDWR);
2105 trace_rpc_socket_shutdown(xprt, sock);
2106 } else
2107 xs_reset_transport(transport);
2108}
2109
2060static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) 2110static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
2061{ 2111{
2062 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); 2112 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
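
Note that xs_tcp_shutdown() has not just moved below xs_reset_transport() (which it now calls); it also stops assuming the socket is connected. The old version issued SHUT_RDWR unconditionally. The new decision, restated with explanatory comments added here:

	if (xprt_connected(xprt)) {
		/* Connected: ask TCP for a graceful FIN handshake. */
		kernel_sock_shutdown(sock, SHUT_RDWR);
		trace_rpc_socket_shutdown(xprt, sock);
	} else
		/* Never connected (or already torn down): just reset. */
		xs_reset_transport(transport);
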
@@ -2067,6 +2117,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
2067 unsigned int keepidle = xprt->timeout->to_initval / HZ; 2117 unsigned int keepidle = xprt->timeout->to_initval / HZ;
2068 unsigned int keepcnt = xprt->timeout->to_retries + 1; 2118 unsigned int keepcnt = xprt->timeout->to_retries + 1;
2069 unsigned int opt_on = 1; 2119 unsigned int opt_on = 1;
2120 unsigned int timeo;
2070 2121
2071 /* TCP Keepalive options */ 2122 /* TCP Keepalive options */
2072 kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, 2123 kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
@@ -2078,6 +2129,12 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
2078 kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, 2129 kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
2079 (char *)&keepcnt, sizeof(keepcnt)); 2130 (char *)&keepcnt, sizeof(keepcnt));
2080 2131
2132 /* TCP user timeout (see RFC5482) */
2133 timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
2134 (xprt->timeout->to_retries + 1);
2135 kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
2136 (char *)&timeo, sizeof(timeo));
2137
2081 write_lock_bh(&sk->sk_callback_lock); 2138 write_lock_bh(&sk->sk_callback_lock);
2082 2139
2083 xs_save_old_callbacks(transport, sk); 2140 xs_save_old_callbacks(transport, sk);
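
The keepalive options gain a companion here: TCP_USER_TIMEOUT (RFC 5482) bounds how long transmitted data may sit unacknowledged before the kernel drops the connection, and the patch derives it from the RPC timeout parameters as jiffies_to_msecs(to_initval) * (to_retries + 1). If the transport uses the common 60-second initial timeout with 2 retries, that works out to 180,000 ms, so a dead peer is detected on roughly the schedule the RPC layer already uses for retransmission. A small user-space sketch of the same knob on Linux (value in milliseconds):

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <stdio.h>
	#include <sys/socket.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = socket(AF_INET, SOCK_STREAM, 0);
		/* e.g. to_initval = 60000 ms, to_retries = 2 */
		unsigned int timeo = 60000 * (2 + 1);	/* 180000 ms */

		if (setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT,
			       &timeo, sizeof(timeo)) != 0)
			perror("setsockopt(TCP_USER_TIMEOUT)");
		close(fd);
		return 0;
	}
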
@@ -2125,9 +2182,6 @@ out:
2125 2182
2126/** 2183/**
2127 * xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint 2184 * xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint
2128 * @xprt: RPC transport to connect
2129 * @transport: socket transport to connect
2130 * @create_sock: function to create a socket of the correct type
2131 * 2185 *
2132 * Invoked by a work queue tasklet. 2186 * Invoked by a work queue tasklet.
2133 */ 2187 */
@@ -2463,6 +2517,8 @@ static struct rpc_xprt_ops xs_local_ops = {
2463 .close = xs_close, 2517 .close = xs_close,
2464 .destroy = xs_destroy, 2518 .destroy = xs_destroy,
2465 .print_stats = xs_local_print_stats, 2519 .print_stats = xs_local_print_stats,
2520 .enable_swap = xs_enable_swap,
2521 .disable_swap = xs_disable_swap,
2466}; 2522};
2467 2523
2468static struct rpc_xprt_ops xs_udp_ops = { 2524static struct rpc_xprt_ops xs_udp_ops = {
@@ -2482,6 +2538,9 @@ static struct rpc_xprt_ops xs_udp_ops = {
2482 .close = xs_close, 2538 .close = xs_close,
2483 .destroy = xs_destroy, 2539 .destroy = xs_destroy,
2484 .print_stats = xs_udp_print_stats, 2540 .print_stats = xs_udp_print_stats,
2541 .enable_swap = xs_enable_swap,
2542 .disable_swap = xs_disable_swap,
2543 .inject_disconnect = xs_inject_disconnect,
2485}; 2544};
2486 2545
2487static struct rpc_xprt_ops xs_tcp_ops = { 2546static struct rpc_xprt_ops xs_tcp_ops = {
@@ -2498,6 +2557,9 @@ static struct rpc_xprt_ops xs_tcp_ops = {
2498 .close = xs_tcp_shutdown, 2557 .close = xs_tcp_shutdown,
2499 .destroy = xs_destroy, 2558 .destroy = xs_destroy,
2500 .print_stats = xs_tcp_print_stats, 2559 .print_stats = xs_tcp_print_stats,
2560 .enable_swap = xs_enable_swap,
2561 .disable_swap = xs_disable_swap,
2562 .inject_disconnect = xs_inject_disconnect,
2501}; 2563};
2502 2564
2503/* 2565/*
@@ -2515,6 +2577,9 @@ static struct rpc_xprt_ops bc_tcp_ops = {
2515 .close = bc_close, 2577 .close = bc_close,
2516 .destroy = bc_destroy, 2578 .destroy = bc_destroy,
2517 .print_stats = xs_tcp_print_stats, 2579 .print_stats = xs_tcp_print_stats,
2580 .enable_swap = xs_enable_swap,
2581 .disable_swap = xs_disable_swap,
2582 .inject_disconnect = xs_inject_disconnect,
2518}; 2583};
2519 2584
2520static int xs_init_anyaddr(const int family, struct sockaddr *sap) 2585static int xs_init_anyaddr(const int family, struct sockaddr *sap)