author    Linus Torvalds <torvalds@linux-foundation.org>  2011-03-17 20:40:00 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2011-03-17 20:40:00 -0400
commit    179198373cf374f0ef793f1023c1cdd83b53674d (patch)
tree      9c7f9e82b936864b9d8cf91b3d4121a3c8d2671c
parent    374e55251cacfb68d331bb8a574b2de8160aacc2 (diff)
parent    8e26de238fd794c8ea56a5c98bf67c40cfeb051d (diff)
Merge branch 'nfs-for-2.6.39' of git://git.linux-nfs.org/projects/trondmy/nfs-2.6
* 'nfs-for-2.6.39' of git://git.linux-nfs.org/projects/trondmy/nfs-2.6: (54 commits)
  RPC: killing RPC tasks races fixed
  xprt: remove redundant check
  SUNRPC: Convert struct rpc_xprt to use atomic_t counters
  SUNRPC: Ensure we always run the tk_callback before tk_action
  sunrpc: fix printk format warning
  xprt: remove redundant null check
  nfs: BKL is no longer needed, so remove the include
  NFS: Fix a warning in fs/nfs/idmap.c
  Cleanup: Factor out some cut-and-paste code.
  cleanup: save 60 lines/100 bytes by combining two mostly duplicate functions.
  NFS: account direct-io into task io accounting
  gss:krb5 only include enctype numbers in gm_upcall_enctypes
  RPCRDMA: Fix FRMR registration/invalidate handling.
  RPCRDMA: Fix to XDR page base interpretation in marshalling logic.
  NFSv4: Send unmapped uid/gids to the server when using auth_sys
  NFSv4: Propagate the error NFS4ERR_BADOWNER to nfs4_do_setattr
  NFSv4: cleanup idmapper functions to take an nfs_server argument
  NFSv4: Send unmapped uid/gids to the server if the idmapper fails
  NFSv4: If the server sends us a numeric uid/gid then accept it
  NFSv4.1: reject zero layout with zeroed stripe unit
  ...
-rw-r--r--  Documentation/filesystems/nfs/pnfs.txt   |   7
-rw-r--r--  Documentation/kernel-parameters.txt      |   8
-rw-r--r--  fs/nfs/callback_proc.c                   |   2
-rw-r--r--  fs/nfs/client.c                          | 131
-rw-r--r--  fs/nfs/direct.c                          |   8
-rw-r--r--  fs/nfs/file.c                            |   4
-rw-r--r--  fs/nfs/idmap.c                           |  90
-rw-r--r--  fs/nfs/internal.h                        |  22
-rw-r--r--  fs/nfs/nfs3proc.c                        |   1
-rw-r--r--  fs/nfs/nfs4_fs.h                         |  28
-rw-r--r--  fs/nfs/nfs4filelayout.c                  | 361
-rw-r--r--  fs/nfs/nfs4filelayout.h                  |  19
-rw-r--r--  fs/nfs/nfs4filelayoutdev.c               | 252
-rw-r--r--  fs/nfs/nfs4proc.c                        | 123
-rw-r--r--  fs/nfs/nfs4renewd.c                      |   6
-rw-r--r--  fs/nfs/nfs4state.c                       |   6
-rw-r--r--  fs/nfs/nfs4xdr.c                         |  38
-rw-r--r--  fs/nfs/pagelist.c                        |  22
-rw-r--r--  fs/nfs/pnfs.c                            | 330
-rw-r--r--  fs/nfs/pnfs.h                            | 118
-rw-r--r--  fs/nfs/proc.c                            |   1
-rw-r--r--  fs/nfs/read.c                            | 127
-rw-r--r--  fs/nfs/super.c                           | 284
-rw-r--r--  fs/nfs/write.c                           | 153
-rw-r--r--  include/linux/nfs_fs.h                   |   2
-rw-r--r--  include/linux/nfs_fs_sb.h                |   4
-rw-r--r--  include/linux/nfs_idmap.h                |   9
-rw-r--r--  include/linux/nfs_iostat.h               |   2
-rw-r--r--  include/linux/nfs_page.h                 |   6
-rw-r--r--  include/linux/nfs_xdr.h                  |  16
-rw-r--r--  include/linux/sunrpc/clnt.h              |   1
-rw-r--r--  include/linux/sunrpc/xprt.h              |   3
-rw-r--r--  net/sunrpc/auth_gss/auth_gss.c           |   2
-rw-r--r--  net/sunrpc/auth_gss/gss_krb5_mech.c      |   2
-rw-r--r--  net/sunrpc/clnt.c                        |  18
-rw-r--r--  net/sunrpc/sched.c                       |  29
-rw-r--r--  net/sunrpc/xprt.c                        |  25
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c           |  86
-rw-r--r--  net/sunrpc/xprtrdma/verbs.c              |  53
-rw-r--r--  net/sunrpc/xprtrdma/xprt_rdma.h          |   1
40 files changed, 1600 insertions(+), 800 deletions(-)
diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt
index bc0b9cfe095b..983e14abe7e9 100644
--- a/Documentation/filesystems/nfs/pnfs.txt
+++ b/Documentation/filesystems/nfs/pnfs.txt
@@ -46,3 +46,10 @@ data server cache
46file driver devices refer to data servers, which are kept in a module 46file driver devices refer to data servers, which are kept in a module
47level cache. Its reference is held over the lifetime of the deviceid 47level cache. Its reference is held over the lifetime of the deviceid
48pointing to it. 48pointing to it.
49
50lseg
51----
52lseg maintains an extra reference corresponding to the NFS_LSEG_VALID
53bit which holds it in the pnfs_layout_hdr's list. When the final lseg
54is removed from the pnfs_layout_hdr's list, the NFS_LAYOUT_DESTROYED
55bit is set, preventing any new lsegs from being added.
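
The lseg lifetime rule documented above is easier to see with a concrete model. The following userspace C sketch is only an illustration of the described pattern (names, flag values and counters are invented, not the kernel's): the NFS_LSEG_VALID bit owns one reference that keeps the lseg on the layout's list, and emptying the list flips a DESTROYED bit that refuses further insertions.

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical stand-ins for the flag bits described in pnfs.txt. */
#define LSEG_VALID        (1 << 0)   /* lseg is on the layout's list */
#define LAYOUT_DESTROYED  (1 << 0)   /* no new lsegs may be added    */

struct layout_hdr {
        unsigned long flags;
        int nsegs;                   /* segments currently on the list */
};

struct lseg {
        unsigned long flags;
        int refcount;                /* one reference is owned by VALID */
};

/* Adding an lseg takes the "list" reference tied to the VALID bit. */
static bool lseg_insert(struct layout_hdr *lo, struct lseg *ls)
{
        if (lo->flags & LAYOUT_DESTROYED)
                return false;        /* layout is being torn down */
        ls->flags |= LSEG_VALID;
        ls->refcount++;
        lo->nsegs++;
        return true;
}

/* Invalidating drops the VALID reference; removing the final lseg
 * marks the layout destroyed, mirroring the description above. */
static void lseg_invalidate(struct layout_hdr *lo, struct lseg *ls)
{
        if (!(ls->flags & LSEG_VALID))
                return;
        ls->flags &= ~LSEG_VALID;
        ls->refcount--;
        if (--lo->nsegs == 0)
                lo->flags |= LAYOUT_DESTROYED;
}

int main(void)
{
        struct layout_hdr lo = { 0, 0 };
        struct lseg a = { 0, 0 };

        lseg_insert(&lo, &a);
        lseg_invalidate(&lo, &a);
        /* Once the list is empty, further inserts are refused. */
        printf("re-insert allowed: %d\n", lseg_insert(&lo, &a));
        return 0;
}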
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 738c6fda3fb0..534dbaf9d618 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1580,6 +1580,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1580 of returning the full 64-bit number. 1580 of returning the full 64-bit number.
1581 The default is to return 64-bit inode numbers. 1581 The default is to return 64-bit inode numbers.
1582 1582
1583 nfs.nfs4_disable_idmapping=
1584 [NFSv4] When set, this option disables the NFSv4
1585 idmapper on the client, but only if the mount
1586 is using the 'sec=sys' security flavour. This may
1587 make migration from legacy NFSv2/v3 systems easier
1588 provided that the server has the appropriate support.
1589 The default is to always enable NFSv4 idmapping.
1590
1583 nmi_debug= [KNL,AVR32,SH] Specify one or more actions to take 1591 nmi_debug= [KNL,AVR32,SH] Specify one or more actions to take
1584 when a NMI is triggered. 1592 when a NMI is triggered.
1585 Format: [state][,regs][,debounce][,die] 1593 Format: [state][,regs][,debounce][,die]
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 89587573fe50..2f41dccea18e 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -188,10 +188,10 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
188 rv = NFS4ERR_DELAY; 188 rv = NFS4ERR_DELAY;
189 list_del_init(&lo->plh_bulk_recall); 189 list_del_init(&lo->plh_bulk_recall);
190 spin_unlock(&ino->i_lock); 190 spin_unlock(&ino->i_lock);
191 pnfs_free_lseg_list(&free_me_list);
191 put_layout_hdr(lo); 192 put_layout_hdr(lo);
192 iput(ino); 193 iput(ino);
193 } 194 }
194 pnfs_free_lseg_list(&free_me_list);
195 return rv; 195 return rv;
196} 196}
197 197
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index bd3ca32879e7..139be9647d80 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -82,6 +82,11 @@ retry:
82#endif /* CONFIG_NFS_V4 */ 82#endif /* CONFIG_NFS_V4 */
83 83
84/* 84/*
85 * Turn off NFSv4 uid/gid mapping when using AUTH_SYS
86 */
87static int nfs4_disable_idmapping = 0;
88
89/*
85 * RPC cruft for NFS 90 * RPC cruft for NFS
86 */ 91 */
87static struct rpc_version *nfs_version[5] = { 92static struct rpc_version *nfs_version[5] = {
@@ -481,7 +486,12 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
481 * Look up a client by IP address and protocol version 486 * Look up a client by IP address and protocol version
482 * - creates a new record if one doesn't yet exist 487 * - creates a new record if one doesn't yet exist
483 */ 488 */
484static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) 489static struct nfs_client *
490nfs_get_client(const struct nfs_client_initdata *cl_init,
491 const struct rpc_timeout *timeparms,
492 const char *ip_addr,
493 rpc_authflavor_t authflavour,
494 int noresvport)
485{ 495{
486 struct nfs_client *clp, *new = NULL; 496 struct nfs_client *clp, *new = NULL;
487 int error; 497 int error;
@@ -512,6 +522,13 @@ install_client:
512 clp = new; 522 clp = new;
513 list_add(&clp->cl_share_link, &nfs_client_list); 523 list_add(&clp->cl_share_link, &nfs_client_list);
514 spin_unlock(&nfs_client_lock); 524 spin_unlock(&nfs_client_lock);
525
526 error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr,
527 authflavour, noresvport);
528 if (error < 0) {
529 nfs_put_client(clp);
530 return ERR_PTR(error);
531 }
515 dprintk("--> nfs_get_client() = %p [new]\n", clp); 532 dprintk("--> nfs_get_client() = %p [new]\n", clp);
516 return clp; 533 return clp;
517 534
@@ -767,9 +784,9 @@ static int nfs_init_server_rpcclient(struct nfs_server *server,
767/* 784/*
768 * Initialise an NFS2 or NFS3 client 785 * Initialise an NFS2 or NFS3 client
769 */ 786 */
770static int nfs_init_client(struct nfs_client *clp, 787int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms,
771 const struct rpc_timeout *timeparms, 788 const char *ip_addr, rpc_authflavor_t authflavour,
772 const struct nfs_parsed_mount_data *data) 789 int noresvport)
773{ 790{
774 int error; 791 int error;
775 792
@@ -784,7 +801,7 @@ static int nfs_init_client(struct nfs_client *clp,
784 * - RFC 2623, sec 2.3.2 801 * - RFC 2623, sec 2.3.2
785 */ 802 */
786 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, 803 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX,
787 0, data->flags & NFS_MOUNT_NORESVPORT); 804 0, noresvport);
788 if (error < 0) 805 if (error < 0)
789 goto error; 806 goto error;
790 nfs_mark_client_ready(clp, NFS_CS_READY); 807 nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -820,19 +837,17 @@ static int nfs_init_server(struct nfs_server *server,
820 cl_init.rpc_ops = &nfs_v3_clientops; 837 cl_init.rpc_ops = &nfs_v3_clientops;
821#endif 838#endif
822 839
840 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
841 data->timeo, data->retrans);
842
823 /* Allocate or find a client reference we can use */ 843 /* Allocate or find a client reference we can use */
824 clp = nfs_get_client(&cl_init); 844 clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX,
845 data->flags & NFS_MOUNT_NORESVPORT);
825 if (IS_ERR(clp)) { 846 if (IS_ERR(clp)) {
826 dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); 847 dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
827 return PTR_ERR(clp); 848 return PTR_ERR(clp);
828 } 849 }
829 850
830 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
831 data->timeo, data->retrans);
832 error = nfs_init_client(clp, &timeparms, data);
833 if (error < 0)
834 goto error;
835
836 server->nfs_client = clp; 851 server->nfs_client = clp;
837 852
838 /* Initialise the client representation from the mount data */ 853 /* Initialise the client representation from the mount data */
@@ -1009,14 +1024,19 @@ static void nfs_server_insert_lists(struct nfs_server *server)
1009 spin_lock(&nfs_client_lock); 1024 spin_lock(&nfs_client_lock);
1010 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); 1025 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
1011 list_add_tail(&server->master_link, &nfs_volume_list); 1026 list_add_tail(&server->master_link, &nfs_volume_list);
1027 clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
1012 spin_unlock(&nfs_client_lock); 1028 spin_unlock(&nfs_client_lock);
1013 1029
1014} 1030}
1015 1031
1016static void nfs_server_remove_lists(struct nfs_server *server) 1032static void nfs_server_remove_lists(struct nfs_server *server)
1017{ 1033{
1034 struct nfs_client *clp = server->nfs_client;
1035
1018 spin_lock(&nfs_client_lock); 1036 spin_lock(&nfs_client_lock);
1019 list_del_rcu(&server->client_link); 1037 list_del_rcu(&server->client_link);
1038 if (clp && list_empty(&clp->cl_superblocks))
1039 set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
1020 list_del(&server->master_link); 1040 list_del(&server->master_link);
1021 spin_unlock(&nfs_client_lock); 1041 spin_unlock(&nfs_client_lock);
1022 1042
@@ -1307,11 +1327,11 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
1307/* 1327/*
1308 * Initialise an NFS4 client record 1328 * Initialise an NFS4 client record
1309 */ 1329 */
1310static int nfs4_init_client(struct nfs_client *clp, 1330int nfs4_init_client(struct nfs_client *clp,
1311 const struct rpc_timeout *timeparms, 1331 const struct rpc_timeout *timeparms,
1312 const char *ip_addr, 1332 const char *ip_addr,
1313 rpc_authflavor_t authflavour, 1333 rpc_authflavor_t authflavour,
1314 int flags) 1334 int noresvport)
1315{ 1335{
1316 int error; 1336 int error;
1317 1337
@@ -1325,7 +1345,7 @@ static int nfs4_init_client(struct nfs_client *clp,
1325 clp->rpc_ops = &nfs_v4_clientops; 1345 clp->rpc_ops = &nfs_v4_clientops;
1326 1346
1327 error = nfs_create_rpc_client(clp, timeparms, authflavour, 1347 error = nfs_create_rpc_client(clp, timeparms, authflavour,
1328 1, flags & NFS_MOUNT_NORESVPORT); 1348 1, noresvport);
1329 if (error < 0) 1349 if (error < 0)
1330 goto error; 1350 goto error;
1331 strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); 1351 strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
@@ -1378,27 +1398,71 @@ static int nfs4_set_client(struct nfs_server *server,
1378 dprintk("--> nfs4_set_client()\n"); 1398 dprintk("--> nfs4_set_client()\n");
1379 1399
1380 /* Allocate or find a client reference we can use */ 1400 /* Allocate or find a client reference we can use */
1381 clp = nfs_get_client(&cl_init); 1401 clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour,
1402 server->flags & NFS_MOUNT_NORESVPORT);
1382 if (IS_ERR(clp)) { 1403 if (IS_ERR(clp)) {
1383 error = PTR_ERR(clp); 1404 error = PTR_ERR(clp);
1384 goto error; 1405 goto error;
1385 } 1406 }
1386 error = nfs4_init_client(clp, timeparms, ip_addr, authflavour, 1407
1387 server->flags); 1408 /*
1388 if (error < 0) 1409 * Query for the lease time on clientid setup or renewal
1389 goto error_put; 1410 *
1411 * Note that this will be set on nfs_clients that were created
1412 * only for the DS role and did not set this bit, but now will
1413 * serve a dual role.
1414 */
1415 set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state);
1390 1416
1391 server->nfs_client = clp; 1417 server->nfs_client = clp;
1392 dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); 1418 dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp);
1393 return 0; 1419 return 0;
1394
1395error_put:
1396 nfs_put_client(clp);
1397error: 1420error:
1398 dprintk("<-- nfs4_set_client() = xerror %d\n", error); 1421 dprintk("<-- nfs4_set_client() = xerror %d\n", error);
1399 return error; 1422 return error;
1400} 1423}
1401 1424
1425/*
1426 * Set up a pNFS Data Server client.
1427 *
1428 * Return any existing nfs_client that matches server address,port,version
1429 * and minorversion.
1430 *
1431 * For a new nfs_client, use a soft mount (default), a low retrans and a
1432 * low timeout interval so that if a connection is lost, we retry through
1433 * the MDS.
1434 */
1435struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
1436 const struct sockaddr *ds_addr,
1437 int ds_addrlen, int ds_proto)
1438{
1439 struct nfs_client_initdata cl_init = {
1440 .addr = ds_addr,
1441 .addrlen = ds_addrlen,
1442 .rpc_ops = &nfs_v4_clientops,
1443 .proto = ds_proto,
1444 .minorversion = mds_clp->cl_minorversion,
1445 };
1446 struct rpc_timeout ds_timeout = {
1447 .to_initval = 15 * HZ,
1448 .to_maxval = 15 * HZ,
1449 .to_retries = 1,
1450 .to_exponential = 1,
1451 };
1452 struct nfs_client *clp;
1453
1454 /*
 1455 * Set an authflavor equal to the MDS value. Use the MDS nfs_client
1456 * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
1457 * (section 13.1 RFC 5661).
1458 */
1459 clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
1460 mds_clp->cl_rpcclient->cl_auth->au_flavor, 0);
1461
1462 dprintk("<-- %s %p\n", __func__, clp);
1463 return clp;
1464}
1465EXPORT_SYMBOL(nfs4_set_ds_client);
1402 1466
1403/* 1467/*
1404 * Session has been established, and the client marked ready. 1468 * Session has been established, and the client marked ready.
@@ -1435,6 +1499,10 @@ static int nfs4_server_common_setup(struct nfs_server *server,
1435 BUG_ON(!server->nfs_client->rpc_ops); 1499 BUG_ON(!server->nfs_client->rpc_ops);
1436 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); 1500 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1437 1501
1502 /* data servers support only a subset of NFSv4.1 */
1503 if (is_ds_only_client(server->nfs_client))
1504 return -EPROTONOSUPPORT;
1505
1438 fattr = nfs_alloc_fattr(); 1506 fattr = nfs_alloc_fattr();
1439 if (fattr == NULL) 1507 if (fattr == NULL)
1440 return -ENOMEM; 1508 return -ENOMEM;
@@ -1504,6 +1572,13 @@ static int nfs4_init_server(struct nfs_server *server,
1504 if (error < 0) 1572 if (error < 0)
1505 goto error; 1573 goto error;
1506 1574
1575 /*
1576 * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
1577 * authentication.
1578 */
1579 if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX)
1580 server->caps |= NFS_CAP_UIDGID_NOMAP;
1581
1507 if (data->rsize) 1582 if (data->rsize)
1508 server->rsize = nfs_block_size(data->rsize, NULL); 1583 server->rsize = nfs_block_size(data->rsize, NULL);
1509 if (data->wsize) 1584 if (data->wsize)
@@ -1921,3 +1996,7 @@ void nfs_fs_proc_exit(void)
1921} 1996}
1922 1997
1923#endif /* CONFIG_PROC_FS */ 1998#endif /* CONFIG_PROC_FS */
1999
2000module_param(nfs4_disable_idmapping, bool, 0644);
2001MODULE_PARM_DESC(nfs4_disable_idmapping,
2002 "Turn off NFSv4 idmapping when using 'sec=sys'");
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 9943a75bb6d1..8eea25366717 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -45,6 +45,7 @@
45#include <linux/pagemap.h> 45#include <linux/pagemap.h>
46#include <linux/kref.h> 46#include <linux/kref.h>
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/task_io_accounting_ops.h>
48 49
49#include <linux/nfs_fs.h> 50#include <linux/nfs_fs.h>
50#include <linux/nfs_page.h> 51#include <linux/nfs_page.h>
@@ -649,8 +650,7 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
649{ 650{
650 struct nfs_write_data *data = calldata; 651 struct nfs_write_data *data = calldata;
651 652
652 if (nfs_writeback_done(task, data) != 0) 653 nfs_writeback_done(task, data);
653 return;
654} 654}
655 655
656/* 656/*
@@ -938,6 +938,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
938 if (retval) 938 if (retval)
939 goto out; 939 goto out;
940 940
941 task_io_account_read(count);
942
941 retval = nfs_direct_read(iocb, iov, nr_segs, pos); 943 retval = nfs_direct_read(iocb, iov, nr_segs, pos);
942 if (retval > 0) 944 if (retval > 0)
943 iocb->ki_pos = pos + retval; 945 iocb->ki_pos = pos + retval;
@@ -999,6 +1001,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
999 if (retval) 1001 if (retval)
1000 goto out; 1002 goto out;
1001 1003
1004 task_io_account_write(count);
1005
1002 retval = nfs_direct_write(iocb, iov, nr_segs, pos, count); 1006 retval = nfs_direct_write(iocb, iov, nr_segs, pos, count);
1003 1007
1004 if (retval > 0) 1008 if (retval > 0)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 7bf029ef4084..d85a534b15cd 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -387,10 +387,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
387 file->f_path.dentry->d_name.name, 387 file->f_path.dentry->d_name.name,
388 mapping->host->i_ino, len, (long long) pos); 388 mapping->host->i_ino, len, (long long) pos);
389 389
390 pnfs_update_layout(mapping->host,
391 nfs_file_open_context(file),
392 IOMODE_RW);
393
394start: 390start:
395 /* 391 /*
396 * Prevent starvation issues if someone is doing a consistency 392 * Prevent starvation issues if someone is doing a consistency
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 18696882f1c6..79664a1025af 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -33,16 +33,41 @@
33 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 */ 35 */
36#include <linux/types.h>
37#include <linux/string.h>
38#include <linux/kernel.h>
39
40static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
41{
42 unsigned long val;
43 char buf[16];
44
45 if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf))
46 return 0;
47 memcpy(buf, name, namelen);
48 buf[namelen] = '\0';
49 if (strict_strtoul(buf, 0, &val) != 0)
50 return 0;
51 *res = val;
52 return 1;
53}
54
55static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
56{
57 return snprintf(buf, buflen, "%u", id);
58}
36 59
37#ifdef CONFIG_NFS_USE_NEW_IDMAPPER 60#ifdef CONFIG_NFS_USE_NEW_IDMAPPER
38 61
39#include <linux/slab.h> 62#include <linux/slab.h>
40#include <linux/cred.h> 63#include <linux/cred.h>
64#include <linux/sunrpc/sched.h>
65#include <linux/nfs4.h>
66#include <linux/nfs_fs_sb.h>
41#include <linux/nfs_idmap.h> 67#include <linux/nfs_idmap.h>
42#include <linux/keyctl.h> 68#include <linux/keyctl.h>
43#include <linux/key-type.h> 69#include <linux/key-type.h>
44#include <linux/rcupdate.h> 70#include <linux/rcupdate.h>
45#include <linux/kernel.h>
46#include <linux/err.h> 71#include <linux/err.h>
47 72
48#include <keys/user-type.h> 73#include <keys/user-type.h>
@@ -219,23 +244,39 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen,
219 return ret; 244 return ret;
220} 245}
221 246
222int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) 247int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
223{ 248{
249 if (nfs_map_string_to_numeric(name, namelen, uid))
250 return 0;
224 return nfs_idmap_lookup_id(name, namelen, "uid", uid); 251 return nfs_idmap_lookup_id(name, namelen, "uid", uid);
225} 252}
226 253
227int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid) 254int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid)
228{ 255{
256 if (nfs_map_string_to_numeric(name, namelen, gid))
257 return 0;
229 return nfs_idmap_lookup_id(name, namelen, "gid", gid); 258 return nfs_idmap_lookup_id(name, namelen, "gid", gid);
230} 259}
231 260
232int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) 261int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
233{ 262{
234 return nfs_idmap_lookup_name(uid, "user", buf, buflen); 263 int ret = -EINVAL;
264
265 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
266 ret = nfs_idmap_lookup_name(uid, "user", buf, buflen);
267 if (ret < 0)
268 ret = nfs_map_numeric_to_string(uid, buf, buflen);
269 return ret;
235} 270}
236int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen) 271int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen)
237{ 272{
238 return nfs_idmap_lookup_name(gid, "group", buf, buflen); 273 int ret = -EINVAL;
274
275 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
276 ret = nfs_idmap_lookup_name(gid, "group", buf, buflen);
277 if (ret < 0)
278 ret = nfs_map_numeric_to_string(gid, buf, buflen);
279 return ret;
239} 280}
240 281
241#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */ 282#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
@@ -243,7 +284,6 @@ int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t bu
243#include <linux/module.h> 284#include <linux/module.h>
244#include <linux/mutex.h> 285#include <linux/mutex.h>
245#include <linux/init.h> 286#include <linux/init.h>
246#include <linux/types.h>
247#include <linux/slab.h> 287#include <linux/slab.h>
248#include <linux/socket.h> 288#include <linux/socket.h>
249#include <linux/in.h> 289#include <linux/in.h>
@@ -695,31 +735,45 @@ static unsigned int fnvhash32(const void *buf, size_t buflen)
695 return hash; 735 return hash;
696} 736}
697 737
698int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) 738int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
699{ 739{
700 struct idmap *idmap = clp->cl_idmap; 740 struct idmap *idmap = server->nfs_client->cl_idmap;
701 741
742 if (nfs_map_string_to_numeric(name, namelen, uid))
743 return 0;
702 return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid); 744 return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid);
703} 745}
704 746
705int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) 747int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
706{ 748{
707 struct idmap *idmap = clp->cl_idmap; 749 struct idmap *idmap = server->nfs_client->cl_idmap;
708 750
751 if (nfs_map_string_to_numeric(name, namelen, uid))
752 return 0;
709 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); 753 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
710} 754}
711 755
712int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) 756int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
713{ 757{
714 struct idmap *idmap = clp->cl_idmap; 758 struct idmap *idmap = server->nfs_client->cl_idmap;
759 int ret = -EINVAL;
715 760
716 return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); 761 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
762 ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
763 if (ret < 0)
764 ret = nfs_map_numeric_to_string(uid, buf, buflen);
765 return ret;
717} 766}
718int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) 767int nfs_map_gid_to_group(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
719{ 768{
720 struct idmap *idmap = clp->cl_idmap; 769 struct idmap *idmap = server->nfs_client->cl_idmap;
770 int ret = -EINVAL;
721 771
722 return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); 772 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
773 ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
774 if (ret < 0)
775 ret = nfs_map_numeric_to_string(uid, buf, buflen);
776 return ret;
723} 777}
724 778
725#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ 779#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
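
The numeric fallback added to idmap.c above is small enough to exercise in userspace. The sketch below mirrors nfs_map_string_to_numeric() and nfs_map_numeric_to_string() under the assumption that strtoul() is an acceptable stand-in for the kernel's strict_strtoul(): a plain "1000" is accepted as a raw uid, while anything containing '@' (a Kerberos-style principal) or too long to be a number falls through to the real idmapper.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <errno.h>

/* Userspace approximation of the helpers added above. */
static int map_string_to_numeric(const char *name, size_t namelen, uint32_t *res)
{
        char buf[16], *end;
        unsigned long val;

        /* A principal such as "user@domain" is never treated as numeric. */
        if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf))
                return 0;
        memcpy(buf, name, namelen);
        buf[namelen] = '\0';
        errno = 0;
        val = strtoul(buf, &end, 0);
        if (errno || end == buf || *end != '\0')
                return 0;
        *res = (uint32_t)val;
        return 1;
}

static int map_numeric_to_string(uint32_t id, char *buf, size_t buflen)
{
        return snprintf(buf, buflen, "%u", id);
}

int main(void)
{
        uint32_t id = 0;
        char buf[16];
        int ok;

        ok = map_string_to_numeric("1000", 4, &id);
        printf("\"1000\" -> %d (id %u)\n", ok, id);
        ok = map_string_to_numeric("bob@example.org", 15, &id);
        printf("\"bob@example.org\" -> %d (use idmapper)\n", ok);
        map_numeric_to_string(65534, buf, sizeof(buf));
        printf("65534 -> \"%s\"\n", buf);
        return 0;
}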
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e94ad22da5d2..72e0bddf7a2f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -148,6 +148,9 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
148 struct nfs_fattr *); 148 struct nfs_fattr *);
149extern void nfs_mark_client_ready(struct nfs_client *clp, int state); 149extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
150extern int nfs4_check_client_ready(struct nfs_client *clp); 150extern int nfs4_check_client_ready(struct nfs_client *clp);
151extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
152 const struct sockaddr *ds_addr,
153 int ds_addrlen, int ds_proto);
151#ifdef CONFIG_PROC_FS 154#ifdef CONFIG_PROC_FS
152extern int __init nfs_fs_proc_init(void); 155extern int __init nfs_fs_proc_init(void);
153extern void nfs_fs_proc_exit(void); 156extern void nfs_fs_proc_exit(void);
@@ -213,8 +216,14 @@ extern const u32 nfs41_maxwrite_overhead;
213extern struct rpc_procinfo nfs4_procedures[]; 216extern struct rpc_procinfo nfs4_procedures[];
214#endif 217#endif
215 218
219extern int nfs4_init_ds_session(struct nfs_client *clp);
220
216/* proc.c */ 221/* proc.c */
217void nfs_close_context(struct nfs_open_context *ctx, int is_sync); 222void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
223extern int nfs_init_client(struct nfs_client *clp,
224 const struct rpc_timeout *timeparms,
225 const char *ip_addr, rpc_authflavor_t authflavour,
226 int noresvport);
218 227
219/* dir.c */ 228/* dir.c */
220extern int nfs_access_cache_shrinker(struct shrinker *shrink, 229extern int nfs_access_cache_shrinker(struct shrinker *shrink,
@@ -262,9 +271,15 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
262#endif 271#endif
263 272
264/* read.c */ 273/* read.c */
274extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
275 const struct rpc_call_ops *call_ops);
265extern void nfs_read_prepare(struct rpc_task *task, void *calldata); 276extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
266 277
267/* write.c */ 278/* write.c */
279extern int nfs_initiate_write(struct nfs_write_data *data,
280 struct rpc_clnt *clnt,
281 const struct rpc_call_ops *call_ops,
282 int how);
268extern void nfs_write_prepare(struct rpc_task *task, void *calldata); 283extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
269#ifdef CONFIG_MIGRATION 284#ifdef CONFIG_MIGRATION
270extern int nfs_migrate_page(struct address_space *, 285extern int nfs_migrate_page(struct address_space *,
@@ -274,6 +289,13 @@ extern int nfs_migrate_page(struct address_space *,
274#endif 289#endif
275 290
276/* nfs4proc.c */ 291/* nfs4proc.c */
292extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
293extern int nfs4_init_client(struct nfs_client *clp,
294 const struct rpc_timeout *timeparms,
295 const char *ip_addr,
296 rpc_authflavor_t authflavour,
297 int noresvport);
298extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data);
277extern int _nfs4_call_sync(struct nfs_server *server, 299extern int _nfs4_call_sync(struct nfs_server *server,
278 struct rpc_message *msg, 300 struct rpc_message *msg,
279 struct nfs4_sequence_args *args, 301 struct nfs4_sequence_args *args,
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index ce939c062a52..d0c80d8b3f96 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -885,4 +885,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
885 .lock = nfs3_proc_lock, 885 .lock = nfs3_proc_lock,
886 .clear_acl_cache = nfs3_forget_cached_acls, 886 .clear_acl_cache = nfs3_forget_cached_acls,
887 .close_context = nfs_close_context, 887 .close_context = nfs_close_context,
888 .init_client = nfs_init_client,
888}; 889};
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 1be36cf65bfc..c64be1cff080 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -252,6 +252,9 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
252extern int nfs4_setup_sequence(const struct nfs_server *server, 252extern int nfs4_setup_sequence(const struct nfs_server *server,
253 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 253 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
254 int cache_reply, struct rpc_task *task); 254 int cache_reply, struct rpc_task *task);
255extern int nfs41_setup_sequence(struct nfs4_session *session,
256 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
257 int cache_reply, struct rpc_task *task);
255extern void nfs4_destroy_session(struct nfs4_session *session); 258extern void nfs4_destroy_session(struct nfs4_session *session);
256extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); 259extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
257extern int nfs4_proc_create_session(struct nfs_client *); 260extern int nfs4_proc_create_session(struct nfs_client *);
@@ -259,6 +262,19 @@ extern int nfs4_proc_destroy_session(struct nfs4_session *);
259extern int nfs4_init_session(struct nfs_server *server); 262extern int nfs4_init_session(struct nfs_server *server);
260extern int nfs4_proc_get_lease_time(struct nfs_client *clp, 263extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
261 struct nfs_fsinfo *fsinfo); 264 struct nfs_fsinfo *fsinfo);
265
266static inline bool
267is_ds_only_client(struct nfs_client *clp)
268{
269 return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) ==
270 EXCHGID4_FLAG_USE_PNFS_DS;
271}
272
273static inline bool
274is_ds_client(struct nfs_client *clp)
275{
276 return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS;
277}
262#else /* CONFIG_NFS_v4_1 */ 278#else /* CONFIG_NFS_v4_1 */
263static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) 279static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
264{ 280{
@@ -276,6 +292,18 @@ static inline int nfs4_init_session(struct nfs_server *server)
276{ 292{
277 return 0; 293 return 0;
278} 294}
295
296static inline bool
297is_ds_only_client(struct nfs_client *clp)
298{
299 return false;
300}
301
302static inline bool
303is_ds_client(struct nfs_client *clp)
304{
305 return false;
306}
279#endif /* CONFIG_NFS_V4_1 */ 307#endif /* CONFIG_NFS_V4_1 */
280 308
281extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; 309extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
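
The is_ds_only_client()/is_ds_client() helpers added above are plain mask tests on the EXCHANGE_ID flags returned by the server. The sketch below spells that out; the flag values are assumed from RFC 5661 (the kernel keeps its copies in include/linux/nfs4.h) and the function names are local to the example.

#include <stdio.h>
#include <stdbool.h>

/* EXCHANGE_ID pNFS role flags; values assumed per RFC 5661, sec. 18.35. */
#define EXCHGID4_FLAG_USE_NON_PNFS  0x00010000
#define EXCHGID4_FLAG_USE_PNFS_MDS  0x00020000
#define EXCHGID4_FLAG_USE_PNFS_DS   0x00040000
#define EXCHGID4_FLAG_MASK_PNFS     0x00070000

/* "DS only" means the DS bit is the sole pNFS role bit set. */
static bool is_ds_only(unsigned int exchange_flags)
{
        return (exchange_flags & EXCHGID4_FLAG_MASK_PNFS) ==
                EXCHGID4_FLAG_USE_PNFS_DS;
}

static bool is_ds(unsigned int exchange_flags)
{
        return exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS;
}

int main(void)
{
        unsigned int ds_only = EXCHGID4_FLAG_USE_PNFS_DS;
        unsigned int dual = EXCHGID4_FLAG_USE_PNFS_DS | EXCHGID4_FLAG_USE_PNFS_MDS;

        printf("ds_only client: only=%d ds=%d\n", is_ds_only(ds_only), is_ds(ds_only));
        printf("dual-role client: only=%d ds=%d\n", is_ds_only(dual), is_ds(dual));
        return 0;
}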
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 23f930caf1e2..428558464817 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -40,32 +40,309 @@ MODULE_LICENSE("GPL");
40MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>"); 40MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
41MODULE_DESCRIPTION("The NFSv4 file layout driver"); 41MODULE_DESCRIPTION("The NFSv4 file layout driver");
42 42
43static int 43#define FILELAYOUT_POLL_RETRY_MAX (15*HZ)
44filelayout_set_layoutdriver(struct nfs_server *nfss) 44
45static loff_t
46filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
47 loff_t offset)
45{ 48{
46 int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client, 49 u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count;
47 nfs4_fl_free_deviceid_callback); 50 u64 tmp;
48 if (status) { 51
49 printk(KERN_WARNING "%s: deviceid cache could not be " 52 offset -= flseg->pattern_offset;
50 "initialized\n", __func__); 53 tmp = offset;
51 return status; 54 do_div(tmp, stripe_width);
55
56 return tmp * flseg->stripe_unit + do_div(offset, flseg->stripe_unit);
57}
58
59/* This function is used by the layout driver to calculate the
60 * offset of the file on the dserver based on whether the
61 * layout type is STRIPE_DENSE or STRIPE_SPARSE
62 */
63static loff_t
64filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
65{
66 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
67
68 switch (flseg->stripe_type) {
69 case STRIPE_SPARSE:
70 return offset;
71
72 case STRIPE_DENSE:
73 return filelayout_get_dense_offset(flseg, offset);
52 } 74 }
53 dprintk("%s: deviceid cache has been initialized successfully\n", 75
54 __func__); 76 BUG();
77}
78
79/* For data server errors we don't recover from */
80static void
81filelayout_set_lo_fail(struct pnfs_layout_segment *lseg)
82{
83 if (lseg->pls_range.iomode == IOMODE_RW) {
84 dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
85 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
86 } else {
87 dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
88 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
89 }
90}
91
92static int filelayout_async_handle_error(struct rpc_task *task,
93 struct nfs4_state *state,
94 struct nfs_client *clp,
95 int *reset)
96{
97 if (task->tk_status >= 0)
98 return 0;
99
100 *reset = 0;
101
102 switch (task->tk_status) {
103 case -NFS4ERR_BADSESSION:
104 case -NFS4ERR_BADSLOT:
105 case -NFS4ERR_BAD_HIGH_SLOT:
106 case -NFS4ERR_DEADSESSION:
107 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
108 case -NFS4ERR_SEQ_FALSE_RETRY:
109 case -NFS4ERR_SEQ_MISORDERED:
110 dprintk("%s ERROR %d, Reset session. Exchangeid "
111 "flags 0x%x\n", __func__, task->tk_status,
112 clp->cl_exchange_flags);
113 nfs4_schedule_session_recovery(clp->cl_session);
114 break;
115 case -NFS4ERR_DELAY:
116 case -NFS4ERR_GRACE:
117 case -EKEYEXPIRED:
118 rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
119 break;
120 default:
121 dprintk("%s DS error. Retry through MDS %d\n", __func__,
122 task->tk_status);
123 *reset = 1;
124 break;
125 }
126 task->tk_status = 0;
127 return -EAGAIN;
128}
129
130/* NFS_PROTO call done callback routines */
131
132static int filelayout_read_done_cb(struct rpc_task *task,
133 struct nfs_read_data *data)
134{
135 struct nfs_client *clp = data->ds_clp;
136 int reset = 0;
137
138 dprintk("%s DS read\n", __func__);
139
140 if (filelayout_async_handle_error(task, data->args.context->state,
141 data->ds_clp, &reset) == -EAGAIN) {
142 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
143 __func__, data->ds_clp, data->ds_clp->cl_session);
144 if (reset) {
145 filelayout_set_lo_fail(data->lseg);
146 nfs4_reset_read(task, data);
147 clp = NFS_SERVER(data->inode)->nfs_client;
148 }
149 nfs_restart_rpc(task, clp);
150 return -EAGAIN;
151 }
152
55 return 0; 153 return 0;
56} 154}
57 155
58/* Clear out the layout by destroying its device list */ 156/*
59static int 157 * Call ops for the async read/write cases
60filelayout_clear_layoutdriver(struct nfs_server *nfss) 158 * In the case of dense layouts, the offset needs to be reset to its
159 * original value.
160 */
161static void filelayout_read_prepare(struct rpc_task *task, void *data)
61{ 162{
62 dprintk("--> %s\n", __func__); 163 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
164
165 rdata->read_done_cb = filelayout_read_done_cb;
166
167 if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
168 &rdata->args.seq_args, &rdata->res.seq_res,
169 0, task))
170 return;
171
172 rpc_call_start(task);
173}
174
175static void filelayout_read_call_done(struct rpc_task *task, void *data)
176{
177 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
178
179 dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
180
181 /* Note this may cause RPC to be resent */
182 rdata->mds_ops->rpc_call_done(task, data);
183}
184
185static void filelayout_read_release(void *data)
186{
187 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
188
189 rdata->mds_ops->rpc_release(data);
190}
191
192static int filelayout_write_done_cb(struct rpc_task *task,
193 struct nfs_write_data *data)
194{
195 int reset = 0;
196
197 if (filelayout_async_handle_error(task, data->args.context->state,
198 data->ds_clp, &reset) == -EAGAIN) {
199 struct nfs_client *clp;
200
201 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
202 __func__, data->ds_clp, data->ds_clp->cl_session);
203 if (reset) {
204 filelayout_set_lo_fail(data->lseg);
205 nfs4_reset_write(task, data);
206 clp = NFS_SERVER(data->inode)->nfs_client;
207 } else
208 clp = data->ds_clp;
209 nfs_restart_rpc(task, clp);
210 return -EAGAIN;
211 }
63 212
64 if (nfss->nfs_client->cl_devid_cache)
65 pnfs_put_deviceid_cache(nfss->nfs_client);
66 return 0; 213 return 0;
67} 214}
68 215
216static void filelayout_write_prepare(struct rpc_task *task, void *data)
217{
218 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
219
220 if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
221 &wdata->args.seq_args, &wdata->res.seq_res,
222 0, task))
223 return;
224
225 rpc_call_start(task);
226}
227
228static void filelayout_write_call_done(struct rpc_task *task, void *data)
229{
230 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
231
232 /* Note this may cause RPC to be resent */
233 wdata->mds_ops->rpc_call_done(task, data);
234}
235
236static void filelayout_write_release(void *data)
237{
238 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
239
240 wdata->mds_ops->rpc_release(data);
241}
242
243struct rpc_call_ops filelayout_read_call_ops = {
244 .rpc_call_prepare = filelayout_read_prepare,
245 .rpc_call_done = filelayout_read_call_done,
246 .rpc_release = filelayout_read_release,
247};
248
249struct rpc_call_ops filelayout_write_call_ops = {
250 .rpc_call_prepare = filelayout_write_prepare,
251 .rpc_call_done = filelayout_write_call_done,
252 .rpc_release = filelayout_write_release,
253};
254
255static enum pnfs_try_status
256filelayout_read_pagelist(struct nfs_read_data *data)
257{
258 struct pnfs_layout_segment *lseg = data->lseg;
259 struct nfs4_pnfs_ds *ds;
260 loff_t offset = data->args.offset;
261 u32 j, idx;
262 struct nfs_fh *fh;
263 int status;
264
265 dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
266 __func__, data->inode->i_ino,
267 data->args.pgbase, (size_t)data->args.count, offset);
268
269 /* Retrieve the correct rpc_client for the byte range */
270 j = nfs4_fl_calc_j_index(lseg, offset);
271 idx = nfs4_fl_calc_ds_index(lseg, j);
272 ds = nfs4_fl_prepare_ds(lseg, idx);
273 if (!ds) {
274 /* Either layout fh index faulty, or ds connect failed */
275 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
276 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
277 return PNFS_NOT_ATTEMPTED;
278 }
279 dprintk("%s USE DS:ip %x %hu\n", __func__,
280 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
281
282 /* No multipath support. Use first DS */
283 data->ds_clp = ds->ds_clp;
284 fh = nfs4_fl_select_ds_fh(lseg, j);
285 if (fh)
286 data->args.fh = fh;
287
288 data->args.offset = filelayout_get_dserver_offset(lseg, offset);
289 data->mds_offset = offset;
290
291 /* Perform an asynchronous read to ds */
292 status = nfs_initiate_read(data, ds->ds_clp->cl_rpcclient,
293 &filelayout_read_call_ops);
294 BUG_ON(status != 0);
295 return PNFS_ATTEMPTED;
296}
297
298/* Perform async writes. */
299static enum pnfs_try_status
300filelayout_write_pagelist(struct nfs_write_data *data, int sync)
301{
302 struct pnfs_layout_segment *lseg = data->lseg;
303 struct nfs4_pnfs_ds *ds;
304 loff_t offset = data->args.offset;
305 u32 j, idx;
306 struct nfs_fh *fh;
307 int status;
308
309 /* Retrieve the correct rpc_client for the byte range */
310 j = nfs4_fl_calc_j_index(lseg, offset);
311 idx = nfs4_fl_calc_ds_index(lseg, j);
312 ds = nfs4_fl_prepare_ds(lseg, idx);
313 if (!ds) {
314 printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
315 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
316 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
317 return PNFS_NOT_ATTEMPTED;
318 }
319 dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__,
320 data->inode->i_ino, sync, (size_t) data->args.count, offset,
321 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
322
323 /* We can't handle commit to ds yet */
324 if (!FILELAYOUT_LSEG(lseg)->commit_through_mds)
325 data->args.stable = NFS_FILE_SYNC;
326
327 data->write_done_cb = filelayout_write_done_cb;
328 data->ds_clp = ds->ds_clp;
329 fh = nfs4_fl_select_ds_fh(lseg, j);
330 if (fh)
331 data->args.fh = fh;
332 /*
333 * Get the file offset on the dserver. Set the write offset to
334 * this offset and save the original offset.
335 */
336 data->args.offset = filelayout_get_dserver_offset(lseg, offset);
337 data->mds_offset = offset;
338
339 /* Perform an asynchronous write */
340 status = nfs_initiate_write(data, ds->ds_clp->cl_rpcclient,
341 &filelayout_write_call_ops, sync);
342 BUG_ON(status != 0);
343 return PNFS_ATTEMPTED;
344}
345
69/* 346/*
70 * filelayout_check_layout() 347 * filelayout_check_layout()
71 * 348 *
@@ -92,14 +369,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
92 goto out; 369 goto out;
93 } 370 }
94 371
95 if (fl->stripe_unit % PAGE_SIZE) { 372 if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) {
96 dprintk("%s Stripe unit (%u) not page aligned\n", 373 dprintk("%s Invalid stripe unit (%u)\n",
97 __func__, fl->stripe_unit); 374 __func__, fl->stripe_unit);
98 goto out; 375 goto out;
99 } 376 }
100 377
101 /* find and reference the deviceid */ 378 /* find and reference the deviceid */
102 dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id); 379 dsaddr = nfs4_fl_find_get_deviceid(id);
103 if (dsaddr == NULL) { 380 if (dsaddr == NULL) {
104 dsaddr = get_device_info(lo->plh_inode, id); 381 dsaddr = get_device_info(lo->plh_inode, id);
105 if (dsaddr == NULL) 382 if (dsaddr == NULL)
@@ -134,7 +411,7 @@ out:
134 dprintk("--> %s returns %d\n", __func__, status); 411 dprintk("--> %s returns %d\n", __func__, status);
135 return status; 412 return status;
136out_put: 413out_put:
137 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid); 414 nfs4_fl_put_deviceid(dsaddr);
138 goto out; 415 goto out;
139} 416}
140 417
@@ -243,23 +520,47 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
243static void 520static void
244filelayout_free_lseg(struct pnfs_layout_segment *lseg) 521filelayout_free_lseg(struct pnfs_layout_segment *lseg)
245{ 522{
246 struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode);
247 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); 523 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
248 524
249 dprintk("--> %s\n", __func__); 525 dprintk("--> %s\n", __func__);
250 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, 526 nfs4_fl_put_deviceid(fl->dsaddr);
251 &fl->dsaddr->deviceid);
252 _filelayout_free_lseg(fl); 527 _filelayout_free_lseg(fl);
253} 528}
254 529
530/*
531 * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
532 *
533 * return 1 : coalesce page
534 * return 0 : don't coalesce page
535 */
536int
537filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
538 struct nfs_page *req)
539{
540 u64 p_stripe, r_stripe;
541 u32 stripe_unit;
542
543 if (!pgio->pg_lseg)
544 return 1;
545 p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
546 r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT;
547 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
548
549 do_div(p_stripe, stripe_unit);
550 do_div(r_stripe, stripe_unit);
551
552 return (p_stripe == r_stripe);
553}
554
255static struct pnfs_layoutdriver_type filelayout_type = { 555static struct pnfs_layoutdriver_type filelayout_type = {
256 .id = LAYOUT_NFSV4_1_FILES, 556 .id = LAYOUT_NFSV4_1_FILES,
257 .name = "LAYOUT_NFSV4_1_FILES", 557 .name = "LAYOUT_NFSV4_1_FILES",
258 .owner = THIS_MODULE, 558 .owner = THIS_MODULE,
259 .set_layoutdriver = filelayout_set_layoutdriver, 559 .alloc_lseg = filelayout_alloc_lseg,
260 .clear_layoutdriver = filelayout_clear_layoutdriver, 560 .free_lseg = filelayout_free_lseg,
261 .alloc_lseg = filelayout_alloc_lseg, 561 .pg_test = filelayout_pg_test,
262 .free_lseg = filelayout_free_lseg, 562 .read_pagelist = filelayout_read_pagelist,
563 .write_pagelist = filelayout_write_pagelist,
263}; 564};
264 565
265static int __init nfs4filelayout_init(void) 566static int __init nfs4filelayout_init(void)
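
The DENSE-versus-SPARSE offset translation that filelayout_get_dserver_offset() performs above is easiest to check with concrete numbers. The userspace sketch below models only the DENSE case (the SPARSE case returns the file offset unchanged); the structure and field names are illustrative, not the kernel's.

#include <stdio.h>
#include <stdint.h>

struct flseg {
        uint32_t stripe_unit;     /* bytes per stripe unit              */
        uint32_t stripe_count;    /* number of data servers in pattern  */
        uint64_t pattern_offset;  /* where the striping pattern starts  */
};

/* In a DENSE layout each DS stores its stripe units back to back, so a
 * file offset is compressed: whole stripe widths collapse to a single
 * stripe unit, plus the remainder within the unit. */
static uint64_t dense_offset(const struct flseg *fl, uint64_t offset)
{
        uint64_t stripe_width = (uint64_t)fl->stripe_unit * fl->stripe_count;

        offset -= fl->pattern_offset;
        return (offset / stripe_width) * fl->stripe_unit +
               (offset % fl->stripe_unit);
}

int main(void)
{
        struct flseg fl = { .stripe_unit = 4096, .stripe_count = 4,
                            .pattern_offset = 0 };

        /* File offset 20480 is stripe unit 5; on its DS that is the
         * second unit stored, i.e. DS offset 4096. */
        printf("file offset 20480 -> DS offset %llu\n",
               (unsigned long long)dense_offset(&fl, 20480));
        return 0;
}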
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index bbf60dd2ab9d..ee0c907742b5 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -55,8 +55,14 @@ struct nfs4_pnfs_ds {
55 atomic_t ds_count; 55 atomic_t ds_count;
56}; 56};
57 57
58/* nfs4_file_layout_dsaddr flags */
59#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001
60
58struct nfs4_file_layout_dsaddr { 61struct nfs4_file_layout_dsaddr {
59 struct pnfs_deviceid_node deviceid; 62 struct hlist_node node;
63 struct nfs4_deviceid deviceid;
64 atomic_t ref;
65 unsigned long flags;
60 u32 stripe_count; 66 u32 stripe_count;
61 u8 *stripe_indices; 67 u8 *stripe_indices;
62 u32 ds_num; 68 u32 ds_num;
@@ -83,11 +89,18 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
83 generic_hdr); 89 generic_hdr);
84} 90}
85 91
86extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *); 92extern struct nfs_fh *
93nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
94
87extern void print_ds(struct nfs4_pnfs_ds *ds); 95extern void print_ds(struct nfs4_pnfs_ds *ds);
88extern void print_deviceid(struct nfs4_deviceid *dev_id); 96extern void print_deviceid(struct nfs4_deviceid *dev_id);
97u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
98u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
99struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
100 u32 ds_idx);
89extern struct nfs4_file_layout_dsaddr * 101extern struct nfs4_file_layout_dsaddr *
90nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id); 102nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
103extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
91struct nfs4_file_layout_dsaddr * 104struct nfs4_file_layout_dsaddr *
92get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id); 105get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
93 106
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index b73c34375f60..68143c162e3b 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -37,6 +37,30 @@
37#define NFSDBG_FACILITY NFSDBG_PNFS_LD 37#define NFSDBG_FACILITY NFSDBG_PNFS_LD
38 38
39/* 39/*
40 * Device ID RCU cache. A device ID is unique per client ID and layout type.
41 */
42#define NFS4_FL_DEVICE_ID_HASH_BITS 5
43#define NFS4_FL_DEVICE_ID_HASH_SIZE (1 << NFS4_FL_DEVICE_ID_HASH_BITS)
44#define NFS4_FL_DEVICE_ID_HASH_MASK (NFS4_FL_DEVICE_ID_HASH_SIZE - 1)
45
46static inline u32
47nfs4_fl_deviceid_hash(struct nfs4_deviceid *id)
48{
49 unsigned char *cptr = (unsigned char *)id->data;
50 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
51 u32 x = 0;
52
53 while (nbytes--) {
54 x *= 37;
55 x += *cptr++;
56 }
57 return x & NFS4_FL_DEVICE_ID_HASH_MASK;
58}
59
60static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE];
61static DEFINE_SPINLOCK(filelayout_deviceid_lock);
62
63/*
40 * Data server cache 64 * Data server cache
41 * 65 *
42 * Data servers can be mapped to different device ids. 66 * Data servers can be mapped to different device ids.
@@ -104,6 +128,67 @@ _data_server_lookup_locked(u32 ip_addr, u32 port)
104 return NULL; 128 return NULL;
105} 129}
106 130
131/*
132 * Create an rpc connection to the nfs4_pnfs_ds data server
133 * Currently only support IPv4
134 */
135static int
136nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
137{
138 struct nfs_client *clp;
139 struct sockaddr_in sin;
140 int status = 0;
141
142 dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__,
143 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
144 mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
145
146 sin.sin_family = AF_INET;
147 sin.sin_addr.s_addr = ds->ds_ip_addr;
148 sin.sin_port = ds->ds_port;
149
150 clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin,
151 sizeof(sin), IPPROTO_TCP);
152 if (IS_ERR(clp)) {
153 status = PTR_ERR(clp);
154 goto out;
155 }
156
157 if ((clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) != 0) {
158 if (!is_ds_client(clp)) {
159 status = -ENODEV;
160 goto out_put;
161 }
162 ds->ds_clp = clp;
163 dprintk("%s [existing] ip=%x, port=%hu\n", __func__,
164 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
165 goto out;
166 }
167
168 /*
169 * Do not set NFS_CS_CHECK_LEASE_TIME instead set the DS lease to
170 * be equal to the MDS lease. Renewal is scheduled in create_session.
171 */
172 spin_lock(&mds_srv->nfs_client->cl_lock);
173 clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time;
174 spin_unlock(&mds_srv->nfs_client->cl_lock);
175 clp->cl_last_renewal = jiffies;
176
177 /* New nfs_client */
178 status = nfs4_init_ds_session(clp);
179 if (status)
180 goto out_put;
181
182 ds->ds_clp = clp;
183 dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr),
184 ntohs(ds->ds_port));
185out:
186 return status;
187out_put:
188 nfs_put_client(clp);
189 goto out;
190}
191
107static void 192static void
108destroy_ds(struct nfs4_pnfs_ds *ds) 193destroy_ds(struct nfs4_pnfs_ds *ds)
109{ 194{
@@ -122,7 +207,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
122 struct nfs4_pnfs_ds *ds; 207 struct nfs4_pnfs_ds *ds;
123 int i; 208 int i;
124 209
125 print_deviceid(&dsaddr->deviceid.de_id); 210 print_deviceid(&dsaddr->deviceid);
126 211
127 for (i = 0; i < dsaddr->ds_num; i++) { 212 for (i = 0; i < dsaddr->ds_num; i++) {
128 ds = dsaddr->ds_list[i]; 213 ds = dsaddr->ds_list[i];
@@ -139,15 +224,6 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
139 kfree(dsaddr); 224 kfree(dsaddr);
140} 225}
141 226
142void
143nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
144{
145 struct nfs4_file_layout_dsaddr *dsaddr =
146 container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
147
148 nfs4_fl_free_deviceid(dsaddr);
149}
150
151static struct nfs4_pnfs_ds * 227static struct nfs4_pnfs_ds *
152nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port) 228nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
153{ 229{
@@ -300,7 +376,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev)
300 dsaddr->stripe_count = cnt; 376 dsaddr->stripe_count = cnt;
301 dsaddr->ds_num = num; 377 dsaddr->ds_num = num;
302 378
303 memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id)); 379 memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id));
304 380
 305 /* Go back and read stripe indices */ 381 /* Go back and read stripe indices */
306 p = indicesp; 382 p = indicesp;
@@ -350,28 +426,37 @@ out_err:
350} 426}
351 427
352/* 428/*
353 * Decode the opaque device specified in 'dev' 429 * Decode the opaque device specified in 'dev' and add it to the cache of
354 * and add it to the list of available devices. 430 * available devices.
355 * If the deviceid is already cached, nfs4_add_deviceid will return
356 * a pointer to the cached struct and throw away the new.
357 */ 431 */
358static struct nfs4_file_layout_dsaddr* 432static struct nfs4_file_layout_dsaddr *
359decode_and_add_device(struct inode *inode, struct pnfs_device *dev) 433decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
360{ 434{
361 struct nfs4_file_layout_dsaddr *dsaddr; 435 struct nfs4_file_layout_dsaddr *d, *new;
362 struct pnfs_deviceid_node *d; 436 long hash;
363 437
364 dsaddr = decode_device(inode, dev); 438 new = decode_device(inode, dev);
365 if (!dsaddr) { 439 if (!new) {
366 printk(KERN_WARNING "%s: Could not decode or add device\n", 440 printk(KERN_WARNING "%s: Could not decode or add device\n",
367 __func__); 441 __func__);
368 return NULL; 442 return NULL;
369 } 443 }
370 444
371 d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, 445 spin_lock(&filelayout_deviceid_lock);
372 &dsaddr->deviceid); 446 d = nfs4_fl_find_get_deviceid(&new->deviceid);
447 if (d) {
448 spin_unlock(&filelayout_deviceid_lock);
449 nfs4_fl_free_deviceid(new);
450 return d;
451 }
452
453 INIT_HLIST_NODE(&new->node);
454 atomic_set(&new->ref, 1);
455 hash = nfs4_fl_deviceid_hash(&new->deviceid);
456 hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]);
457 spin_unlock(&filelayout_deviceid_lock);
373 458
374 return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); 459 return new;
375} 460}
376 461
377/* 462/*
@@ -446,12 +531,123 @@ out_free:
446 return dsaddr; 531 return dsaddr;
447} 532}
448 533
534void
535nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
536{
537 if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) {
538 hlist_del_rcu(&dsaddr->node);
539 spin_unlock(&filelayout_deviceid_lock);
540
541 synchronize_rcu();
542 nfs4_fl_free_deviceid(dsaddr);
543 }
544}
545
449struct nfs4_file_layout_dsaddr * 546struct nfs4_file_layout_dsaddr *
450nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id) 547nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id)
548{
549 struct nfs4_file_layout_dsaddr *d;
550 struct hlist_node *n;
551 long hash = nfs4_fl_deviceid_hash(id);
552
553
554 rcu_read_lock();
555 hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
556 if (!memcmp(&d->deviceid, id, sizeof(*id))) {
557 if (!atomic_inc_not_zero(&d->ref))
558 goto fail;
559 rcu_read_unlock();
560 return d;
561 }
562 }
563fail:
564 rcu_read_unlock();
565 return NULL;
566}
567
568/*
569 * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
570 * Then: ((res + fsi) % dsaddr->stripe_count)
571 */
572u32
573nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
574{
575 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
576 u64 tmp;
577
578 tmp = offset - flseg->pattern_offset;
579 do_div(tmp, flseg->stripe_unit);
580 tmp += flseg->first_stripe_index;
581 return do_div(tmp, flseg->dsaddr->stripe_count);
582}
583
584u32
585nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
586{
587 return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j];
588}
589
590struct nfs_fh *
591nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
592{
593 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
594 u32 i;
595
596 if (flseg->stripe_type == STRIPE_SPARSE) {
597 if (flseg->num_fh == 1)
598 i = 0;
599 else if (flseg->num_fh == 0)
600 /* Use the MDS OPEN fh set in nfs_read_rpcsetup */
601 return NULL;
602 else
603 i = nfs4_fl_calc_ds_index(lseg, j);
604 } else
605 i = j;
606 return flseg->fh_array[i];
607}
608
609static void
610filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
611 int err, u32 ds_addr)
612{
613 u32 *p = (u32 *)&dsaddr->deviceid;
614
615 printk(KERN_ERR "NFS: data server %x connection error %d."
616 " Deviceid [%x%x%x%x] marked out of use.\n",
617 ds_addr, err, p[0], p[1], p[2], p[3]);
618
619 spin_lock(&filelayout_deviceid_lock);
620 dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
621 spin_unlock(&filelayout_deviceid_lock);
622}
623
624struct nfs4_pnfs_ds *
625nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
451{ 626{
452 struct pnfs_deviceid_node *d; 627 struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
628 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
453 629
454 d = pnfs_find_get_deviceid(clp->cl_devid_cache, id); 630 if (ds == NULL) {
455 return (d == NULL) ? NULL : 631 printk(KERN_ERR "%s: No data server for offset index %d\n",
456 container_of(d, struct nfs4_file_layout_dsaddr, deviceid); 632 __func__, ds_idx);
633 return NULL;
634 }
635
636 if (!ds->ds_clp) {
637 struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
638 int err;
639
640 if (dsaddr->flags & NFS4_DEVICE_ID_NEG_ENTRY) {
641 /* Already tried to connect, don't try again */
642 dprintk("%s Deviceid marked out of use\n", __func__);
643 return NULL;
644 }
645 err = nfs4_ds_connect(s, ds);
646 if (err) {
647 filelayout_mark_devid_negative(dsaddr, err,
648 ntohl(ds->ds_ip_addr));
649 return NULL;
650 }
651 }
652 return ds;
457} 653}
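
The stripe arithmetic above maps a file offset to a data server: res = (offset - pattern_offset) / stripe_unit, then j = (res + first_stripe_index) % stripe_count, and dsaddr->stripe_indices[j] selects the entry in ds_list. A minimal user-space sketch of the same calculation, with made-up layout parameters (the numeric values and the stripe_indices table are illustrative only, not taken from this patch):

/*
 * Illustrative only: mirrors nfs4_fl_calc_j_index()/nfs4_fl_calc_ds_index()
 * with hypothetical layout parameters, compiled outside the kernel.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t offset = 3 * 1024 * 1024;           /* file offset being read */
	uint64_t pattern_offset = 0;                 /* flseg->pattern_offset */
	uint32_t stripe_unit = 1024 * 1024;          /* flseg->stripe_unit */
	uint32_t first_stripe_index = 1;             /* flseg->first_stripe_index */
	uint32_t stripe_count = 4;                   /* dsaddr->stripe_count */
	uint32_t stripe_indices[4] = { 0, 1, 2, 3 }; /* dsaddr->stripe_indices */

	/* j = ((offset - pattern_offset) / stripe_unit + fsi) % stripe_count */
	uint32_t j = (uint32_t)(((offset - pattern_offset) / stripe_unit +
				 first_stripe_index) % stripe_count);
	uint32_t ds_idx = stripe_indices[j];

	printf("stripe j=%u, ds_list index=%u\n", j, ds_idx); /* j=0, ds_idx=0 */
	return 0;
}
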
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0a07e353a961..1d84e7088af9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -85,6 +85,9 @@ static int nfs4_map_errors(int err)
85 switch (err) { 85 switch (err) {
86 case -NFS4ERR_RESOURCE: 86 case -NFS4ERR_RESOURCE:
87 return -EREMOTEIO; 87 return -EREMOTEIO;
88 case -NFS4ERR_BADOWNER:
89 case -NFS4ERR_BADNAME:
90 return -EINVAL;
88 default: 91 default:
89 dprintk("%s could not handle NFSv4 error %d\n", 92 dprintk("%s could not handle NFSv4 error %d\n",
90 __func__, -err); 93 __func__, -err);
@@ -241,7 +244,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
241/* This is the error handling routine for processes that are allowed 244/* This is the error handling routine for processes that are allowed
242 * to sleep. 245 * to sleep.
243 */ 246 */
244static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception) 247static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
245{ 248{
246 struct nfs_client *clp = server->nfs_client; 249 struct nfs_client *clp = server->nfs_client;
247 struct nfs4_state *state = exception->state; 250 struct nfs4_state *state = exception->state;
@@ -293,6 +296,19 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
293 break; 296 break;
294 case -NFS4ERR_OLD_STATEID: 297 case -NFS4ERR_OLD_STATEID:
295 exception->retry = 1; 298 exception->retry = 1;
299 break;
300 case -NFS4ERR_BADOWNER:
301 /* The following works around a Linux server bug! */
302 case -NFS4ERR_BADNAME:
303 if (server->caps & NFS_CAP_UIDGID_NOMAP) {
304 server->caps &= ~NFS_CAP_UIDGID_NOMAP;
305 exception->retry = 1;
306 printk(KERN_WARNING "NFS: v4 server %s "
307 "does not accept raw "
308 "uid/gids. "
309 "Reenabling the idmapper.\n",
310 server->nfs_client->cl_hostname);
311 }
296 } 312 }
297 /* We failed to handle the error */ 313 /* We failed to handle the error */
298 return nfs4_map_errors(ret); 314 return nfs4_map_errors(ret);
@@ -505,7 +521,7 @@ out:
505 return ret_id; 521 return ret_id;
506} 522}
507 523
508static int nfs41_setup_sequence(struct nfs4_session *session, 524int nfs41_setup_sequence(struct nfs4_session *session,
509 struct nfs4_sequence_args *args, 525 struct nfs4_sequence_args *args,
510 struct nfs4_sequence_res *res, 526 struct nfs4_sequence_res *res,
511 int cache_reply, 527 int cache_reply,
@@ -571,6 +587,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
571 res->sr_status = 1; 587 res->sr_status = 1;
572 return 0; 588 return 0;
573} 589}
590EXPORT_SYMBOL_GPL(nfs41_setup_sequence);
574 591
575int nfs4_setup_sequence(const struct nfs_server *server, 592int nfs4_setup_sequence(const struct nfs_server *server,
576 struct nfs4_sequence_args *args, 593 struct nfs4_sequence_args *args,
@@ -1573,9 +1590,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1573 return 0; 1590 return 0;
1574} 1591}
1575 1592
1576static int nfs4_recover_expired_lease(struct nfs_server *server) 1593static int nfs4_client_recover_expired_lease(struct nfs_client *clp)
1577{ 1594{
1578 struct nfs_client *clp = server->nfs_client;
1579 unsigned int loop; 1595 unsigned int loop;
1580 int ret; 1596 int ret;
1581 1597
@@ -1592,6 +1608,11 @@ static int nfs4_recover_expired_lease(struct nfs_server *server)
1592 return ret; 1608 return ret;
1593} 1609}
1594 1610
1611static int nfs4_recover_expired_lease(struct nfs_server *server)
1612{
1613 return nfs4_client_recover_expired_lease(server->nfs_client);
1614}
1615
1595/* 1616/*
1596 * OPEN_EXPIRED: 1617 * OPEN_EXPIRED:
1597 * reclaim state on the server after a network partition. 1618 * reclaim state on the server after a network partition.
@@ -3069,15 +3090,10 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
3069 return err; 3090 return err;
3070} 3091}
3071 3092
3072static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) 3093static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
3073{ 3094{
3074 struct nfs_server *server = NFS_SERVER(data->inode); 3095 struct nfs_server *server = NFS_SERVER(data->inode);
3075 3096
3076 dprintk("--> %s\n", __func__);
3077
3078 if (!nfs4_sequence_done(task, &data->res.seq_res))
3079 return -EAGAIN;
3080
3081 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { 3097 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
3082 nfs_restart_rpc(task, server->nfs_client); 3098 nfs_restart_rpc(task, server->nfs_client);
3083 return -EAGAIN; 3099 return -EAGAIN;
@@ -3089,19 +3105,44 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
3089 return 0; 3105 return 0;
3090} 3106}
3091 3107
3108static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
3109{
3110
3111 dprintk("--> %s\n", __func__);
3112
3113 if (!nfs4_sequence_done(task, &data->res.seq_res))
3114 return -EAGAIN;
3115
3116 return data->read_done_cb(task, data);
3117}
3118
3092static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) 3119static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
3093{ 3120{
3094 data->timestamp = jiffies; 3121 data->timestamp = jiffies;
3122 data->read_done_cb = nfs4_read_done_cb;
3095 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; 3123 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
3096} 3124}
3097 3125
3098static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) 3126/* Reset the nfs_read_data to send the read to the MDS. */
3127void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data)
3128{
3129 dprintk("%s Reset task for i/o through\n", __func__);
3130 put_lseg(data->lseg);
3131 data->lseg = NULL;
3132 /* offsets will differ in the dense stripe case */
3133 data->args.offset = data->mds_offset;
3134 data->ds_clp = NULL;
3135 data->args.fh = NFS_FH(data->inode);
3136 data->read_done_cb = nfs4_read_done_cb;
3137 task->tk_ops = data->mds_ops;
3138 rpc_task_reset_client(task, NFS_CLIENT(data->inode));
3139}
3140EXPORT_SYMBOL_GPL(nfs4_reset_read);
3141
3142static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
3099{ 3143{
3100 struct inode *inode = data->inode; 3144 struct inode *inode = data->inode;
3101 3145
3102 if (!nfs4_sequence_done(task, &data->res.seq_res))
3103 return -EAGAIN;
3104
3105 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { 3146 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
3106 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); 3147 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
3107 return -EAGAIN; 3148 return -EAGAIN;
@@ -3113,11 +3154,41 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
3113 return 0; 3154 return 0;
3114} 3155}
3115 3156
3157static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
3158{
3159 if (!nfs4_sequence_done(task, &data->res.seq_res))
3160 return -EAGAIN;
3161 return data->write_done_cb(task, data);
3162}
3163
3164/* Reset the nfs_write_data to send the write to the MDS. */
3165void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data)
3166{
3167 dprintk("%s Reset task for i/o through\n", __func__);
3168 put_lseg(data->lseg);
3169 data->lseg = NULL;
3170 data->ds_clp = NULL;
3171 data->write_done_cb = nfs4_write_done_cb;
3172 data->args.fh = NFS_FH(data->inode);
3173 data->args.bitmask = data->res.server->cache_consistency_bitmask;
3174 data->args.offset = data->mds_offset;
3175 data->res.fattr = &data->fattr;
3176 task->tk_ops = data->mds_ops;
3177 rpc_task_reset_client(task, NFS_CLIENT(data->inode));
3178}
3179EXPORT_SYMBOL_GPL(nfs4_reset_write);
3180
3116static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) 3181static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
3117{ 3182{
3118 struct nfs_server *server = NFS_SERVER(data->inode); 3183 struct nfs_server *server = NFS_SERVER(data->inode);
3119 3184
3120 data->args.bitmask = server->cache_consistency_bitmask; 3185 if (data->lseg) {
3186 data->args.bitmask = NULL;
3187 data->res.fattr = NULL;
3188 } else
3189 data->args.bitmask = server->cache_consistency_bitmask;
3190 if (!data->write_done_cb)
3191 data->write_done_cb = nfs4_write_done_cb;
3121 data->res.server = server; 3192 data->res.server = server;
3122 data->timestamp = jiffies; 3193 data->timestamp = jiffies;
3123 3194
@@ -5118,6 +5189,27 @@ int nfs4_init_session(struct nfs_server *server)
5118 return ret; 5189 return ret;
5119} 5190}
5120 5191
5192int nfs4_init_ds_session(struct nfs_client *clp)
5193{
5194 struct nfs4_session *session = clp->cl_session;
5195 int ret;
5196
5197 if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
5198 return 0;
5199
5200 ret = nfs4_client_recover_expired_lease(clp);
5201 if (!ret)
5202 /* Test for the DS role */
5203 if (!is_ds_client(clp))
5204 ret = -ENODEV;
5205 if (!ret)
5206 ret = nfs4_check_client_ready(clp);
5207 return ret;
5208
5209}
5210EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
5211
5212
5121/* 5213/*
5122 * Renew the cl_session lease. 5214 * Renew the cl_session lease.
5123 */ 5215 */
@@ -5648,6 +5740,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
5648 .clear_acl_cache = nfs4_zap_acl_attr, 5740 .clear_acl_cache = nfs4_zap_acl_attr,
5649 .close_context = nfs4_close_context, 5741 .close_context = nfs4_close_context,
5650 .open_context = nfs4_atomic_open, 5742 .open_context = nfs4_atomic_open,
5743 .init_client = nfs4_init_client,
5651}; 5744};
5652 5745
5653static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { 5746static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
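
The read_done_cb/write_done_cb indirection added above lets a layout driver substitute its own RPC completion handling and, when a data server fails, hand the request back to the metadata server via nfs4_reset_read()/nfs4_reset_write(). A hedged sketch of how a driver-side completion callback might use that hook; example_fl_read_done_cb and the bare tk_status check are invented for illustration, while nfs4_reset_read() and rpc_restart_call_prepare() are existing calls:

/*
 * Hypothetical layout-driver read completion: on a data-server error,
 * rewrite the nfs_read_data so the retry goes through the MDS.
 */
static int example_fl_read_done_cb(struct rpc_task *task,
				   struct nfs_read_data *data)
{
	if (task->tk_status < 0) {
		nfs4_reset_read(task, data);    /* point args/fh back at the MDS */
		rpc_restart_call_prepare(task); /* resend through the normal path */
		return -EAGAIN;
	}
	return 0;
}
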
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 402143d75fc5..df8e7f3ca56d 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -64,12 +64,8 @@ nfs4_renew_state(struct work_struct *work)
64 ops = clp->cl_mvops->state_renewal_ops; 64 ops = clp->cl_mvops->state_renewal_ops;
65 dprintk("%s: start\n", __func__); 65 dprintk("%s: start\n", __func__);
66 66
67 rcu_read_lock(); 67 if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state))
68 if (list_empty(&clp->cl_superblocks)) {
69 rcu_read_unlock();
70 goto out; 68 goto out;
71 }
72 rcu_read_unlock();
73 69
74 spin_lock(&clp->cl_lock); 70 spin_lock(&clp->cl_lock);
75 lease = clp->cl_lease_time; 71 lease = clp->cl_lease_time;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 0592288f9f06..ab1bf5bb021f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -153,6 +153,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
153 int status; 153 int status;
154 struct nfs_fsinfo fsinfo; 154 struct nfs_fsinfo fsinfo;
155 155
156 if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
157 nfs4_schedule_state_renewal(clp);
158 return 0;
159 }
160
156 status = nfs4_proc_get_lease_time(clp, &fsinfo); 161 status = nfs4_proc_get_lease_time(clp, &fsinfo);
157 if (status == 0) { 162 if (status == 0) {
158 /* Update lease time and schedule renewal */ 163 /* Update lease time and schedule renewal */
@@ -1448,6 +1453,7 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session)
1448{ 1453{
1449 nfs4_schedule_lease_recovery(session->clp); 1454 nfs4_schedule_lease_recovery(session->clp);
1450} 1455}
1456EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
1451 1457
1452void nfs41_handle_recall_slot(struct nfs_client *clp) 1458void nfs41_handle_recall_slot(struct nfs_client *clp)
1453{ 1459{
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 94d50e86a124..0cf560f77884 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -844,7 +844,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
844 if (iap->ia_valid & ATTR_MODE) 844 if (iap->ia_valid & ATTR_MODE)
845 len += 4; 845 len += 4;
846 if (iap->ia_valid & ATTR_UID) { 846 if (iap->ia_valid & ATTR_UID) {
847 owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ); 847 owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);
848 if (owner_namelen < 0) { 848 if (owner_namelen < 0) {
849 dprintk("nfs: couldn't resolve uid %d to string\n", 849 dprintk("nfs: couldn't resolve uid %d to string\n",
850 iap->ia_uid); 850 iap->ia_uid);
@@ -856,7 +856,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
856 len += 4 + (XDR_QUADLEN(owner_namelen) << 2); 856 len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
857 } 857 }
858 if (iap->ia_valid & ATTR_GID) { 858 if (iap->ia_valid & ATTR_GID) {
859 owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ); 859 owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ);
860 if (owner_grouplen < 0) { 860 if (owner_grouplen < 0) {
861 dprintk("nfs: couldn't resolve gid %d to string\n", 861 dprintk("nfs: couldn't resolve gid %d to string\n",
862 iap->ia_gid); 862 iap->ia_gid);
@@ -1384,7 +1384,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1384 hdr->replen += decode_putrootfh_maxsz; 1384 hdr->replen += decode_putrootfh_maxsz;
1385} 1385}
1386 1386
1387static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx) 1387static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, int zero_seqid)
1388{ 1388{
1389 nfs4_stateid stateid; 1389 nfs4_stateid stateid;
1390 __be32 *p; 1390 __be32 *p;
@@ -1392,6 +1392,8 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
1392 p = reserve_space(xdr, NFS4_STATEID_SIZE); 1392 p = reserve_space(xdr, NFS4_STATEID_SIZE);
1393 if (ctx->state != NULL) { 1393 if (ctx->state != NULL) {
1394 nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); 1394 nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid);
1395 if (zero_seqid)
1396 stateid.stateid.seqid = 0;
1395 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); 1397 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
1396 } else 1398 } else
1397 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); 1399 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
@@ -1404,7 +1406,8 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
1404 p = reserve_space(xdr, 4); 1406 p = reserve_space(xdr, 4);
1405 *p = cpu_to_be32(OP_READ); 1407 *p = cpu_to_be32(OP_READ);
1406 1408
1407 encode_stateid(xdr, args->context, args->lock_context); 1409 encode_stateid(xdr, args->context, args->lock_context,
1410 hdr->minorversion);
1408 1411
1409 p = reserve_space(xdr, 12); 1412 p = reserve_space(xdr, 12);
1410 p = xdr_encode_hyper(p, args->offset); 1413 p = xdr_encode_hyper(p, args->offset);
@@ -1592,7 +1595,8 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
1592 p = reserve_space(xdr, 4); 1595 p = reserve_space(xdr, 4);
1593 *p = cpu_to_be32(OP_WRITE); 1596 *p = cpu_to_be32(OP_WRITE);
1594 1597
1595 encode_stateid(xdr, args->context, args->lock_context); 1598 encode_stateid(xdr, args->context, args->lock_context,
1599 hdr->minorversion);
1596 1600
1597 p = reserve_space(xdr, 16); 1601 p = reserve_space(xdr, 16);
1598 p = xdr_encode_hyper(p, args->offset); 1602 p = xdr_encode_hyper(p, args->offset);
@@ -2271,7 +2275,8 @@ static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
2271 encode_putfh(xdr, args->fh, &hdr); 2275 encode_putfh(xdr, args->fh, &hdr);
2272 encode_write(xdr, args, &hdr); 2276 encode_write(xdr, args, &hdr);
2273 req->rq_snd_buf.flags |= XDRBUF_WRITE; 2277 req->rq_snd_buf.flags |= XDRBUF_WRITE;
2274 encode_getfattr(xdr, args->bitmask, &hdr); 2278 if (args->bitmask)
2279 encode_getfattr(xdr, args->bitmask, &hdr);
2275 encode_nops(&hdr); 2280 encode_nops(&hdr);
2276} 2281}
2277 2282
@@ -3382,7 +3387,7 @@ out_overflow:
3382} 3387}
3383 3388
3384static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, 3389static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
3385 struct nfs_client *clp, uint32_t *uid, int may_sleep) 3390 const struct nfs_server *server, uint32_t *uid, int may_sleep)
3386{ 3391{
3387 uint32_t len; 3392 uint32_t len;
3388 __be32 *p; 3393 __be32 *p;
@@ -3402,7 +3407,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
3402 if (!may_sleep) { 3407 if (!may_sleep) {
3403 /* do nothing */ 3408 /* do nothing */
3404 } else if (len < XDR_MAX_NETOBJ) { 3409 } else if (len < XDR_MAX_NETOBJ) {
3405 if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) 3410 if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0)
3406 ret = NFS_ATTR_FATTR_OWNER; 3411 ret = NFS_ATTR_FATTR_OWNER;
3407 else 3412 else
3408 dprintk("%s: nfs_map_name_to_uid failed!\n", 3413 dprintk("%s: nfs_map_name_to_uid failed!\n",
@@ -3420,7 +3425,7 @@ out_overflow:
3420} 3425}
3421 3426
3422static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, 3427static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
3423 struct nfs_client *clp, uint32_t *gid, int may_sleep) 3428 const struct nfs_server *server, uint32_t *gid, int may_sleep)
3424{ 3429{
3425 uint32_t len; 3430 uint32_t len;
3426 __be32 *p; 3431 __be32 *p;
@@ -3440,7 +3445,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
3440 if (!may_sleep) { 3445 if (!may_sleep) {
3441 /* do nothing */ 3446 /* do nothing */
3442 } else if (len < XDR_MAX_NETOBJ) { 3447 } else if (len < XDR_MAX_NETOBJ) {
3443 if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) 3448 if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0)
3444 ret = NFS_ATTR_FATTR_GROUP; 3449 ret = NFS_ATTR_FATTR_GROUP;
3445 else 3450 else
3446 dprintk("%s: nfs_map_group_to_gid failed!\n", 3451 dprintk("%s: nfs_map_group_to_gid failed!\n",
@@ -3939,14 +3944,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
3939 goto xdr_error; 3944 goto xdr_error;
3940 fattr->valid |= status; 3945 fattr->valid |= status;
3941 3946
3942 status = decode_attr_owner(xdr, bitmap, server->nfs_client, 3947 status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep);
3943 &fattr->uid, may_sleep);
3944 if (status < 0) 3948 if (status < 0)
3945 goto xdr_error; 3949 goto xdr_error;
3946 fattr->valid |= status; 3950 fattr->valid |= status;
3947 3951
3948 status = decode_attr_group(xdr, bitmap, server->nfs_client, 3952 status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep);
3949 &fattr->gid, may_sleep);
3950 if (status < 0) 3953 if (status < 0)
3951 goto xdr_error; 3954 goto xdr_error;
3952 fattr->valid |= status; 3955 fattr->valid |= status;
@@ -5690,8 +5693,9 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5690 status = decode_write(xdr, res); 5693 status = decode_write(xdr, res);
5691 if (status) 5694 if (status)
5692 goto out; 5695 goto out;
5693 decode_getfattr(xdr, res->fattr, res->server, 5696 if (res->fattr)
5694 !RPC_IS_ASYNC(rqstp->rq_task)); 5697 decode_getfattr(xdr, res->fattr, res->server,
5698 !RPC_IS_ASYNC(rqstp->rq_task));
5695 if (!status) 5699 if (!status)
5696 status = res->count; 5700 status = res->count;
5697out: 5701out:
@@ -6167,8 +6171,6 @@ static struct {
6167 { NFS4ERR_DQUOT, -EDQUOT }, 6171 { NFS4ERR_DQUOT, -EDQUOT },
6168 { NFS4ERR_STALE, -ESTALE }, 6172 { NFS4ERR_STALE, -ESTALE },
6169 { NFS4ERR_BADHANDLE, -EBADHANDLE }, 6173 { NFS4ERR_BADHANDLE, -EBADHANDLE },
6170 { NFS4ERR_BADOWNER, -EINVAL },
6171 { NFS4ERR_BADNAME, -EINVAL },
6172 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, 6174 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
6173 { NFS4ERR_NOTSUPP, -ENOTSUPP }, 6175 { NFS4ERR_NOTSUPP, -ENOTSUPP },
6174 { NFS4ERR_TOOSMALL, -ETOOSMALL }, 6176 { NFS4ERR_TOOSMALL, -ETOOSMALL },
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index e1164e3f9e69..23e794410669 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -20,6 +20,7 @@
20#include <linux/nfs_mount.h> 20#include <linux/nfs_mount.h>
21 21
22#include "internal.h" 22#include "internal.h"
23#include "pnfs.h"
23 24
24static struct kmem_cache *nfs_page_cachep; 25static struct kmem_cache *nfs_page_cachep;
25 26
@@ -213,7 +214,7 @@ nfs_wait_on_request(struct nfs_page *req)
213 */ 214 */
214void nfs_pageio_init(struct nfs_pageio_descriptor *desc, 215void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
215 struct inode *inode, 216 struct inode *inode,
216 int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), 217 int (*doio)(struct nfs_pageio_descriptor *),
217 size_t bsize, 218 size_t bsize,
218 int io_flags) 219 int io_flags)
219{ 220{
@@ -226,6 +227,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
226 desc->pg_doio = doio; 227 desc->pg_doio = doio;
227 desc->pg_ioflags = io_flags; 228 desc->pg_ioflags = io_flags;
228 desc->pg_error = 0; 229 desc->pg_error = 0;
230 desc->pg_lseg = NULL;
229} 231}
230 232
231/** 233/**
@@ -240,7 +242,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
240 * Return 'true' if this is the case, else return 'false'. 242 * Return 'true' if this is the case, else return 'false'.
241 */ 243 */
242static int nfs_can_coalesce_requests(struct nfs_page *prev, 244static int nfs_can_coalesce_requests(struct nfs_page *prev,
243 struct nfs_page *req) 245 struct nfs_page *req,
246 struct nfs_pageio_descriptor *pgio)
244{ 247{
245 if (req->wb_context->cred != prev->wb_context->cred) 248 if (req->wb_context->cred != prev->wb_context->cred)
246 return 0; 249 return 0;
@@ -254,6 +257,12 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
254 return 0; 257 return 0;
255 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) 258 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
256 return 0; 259 return 0;
260 /*
261 * Non-whole file layouts need to check that req is inside of
262 * pgio->pg_lseg.
263 */
264 if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
265 return 0;
257 return 1; 266 return 1;
258} 267}
259 268
@@ -286,7 +295,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
286 if (newlen > desc->pg_bsize) 295 if (newlen > desc->pg_bsize)
287 return 0; 296 return 0;
288 prev = nfs_list_entry(desc->pg_list.prev); 297 prev = nfs_list_entry(desc->pg_list.prev);
289 if (!nfs_can_coalesce_requests(prev, req)) 298 if (!nfs_can_coalesce_requests(prev, req, desc))
290 return 0; 299 return 0;
291 } else 300 } else
292 desc->pg_base = req->wb_pgbase; 301 desc->pg_base = req->wb_pgbase;
@@ -302,12 +311,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
302static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) 311static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
303{ 312{
304 if (!list_empty(&desc->pg_list)) { 313 if (!list_empty(&desc->pg_list)) {
305 int error = desc->pg_doio(desc->pg_inode, 314 int error = desc->pg_doio(desc);
306 &desc->pg_list,
307 nfs_page_array_len(desc->pg_base,
308 desc->pg_count),
309 desc->pg_count,
310 desc->pg_ioflags);
311 if (error < 0) 315 if (error < 0)
312 desc->pg_error = error; 316 desc->pg_error = error;
313 else 317 else
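
The new pg_test hook above gives the layout driver a veto over page coalescing, so a single RPC never has to span two data servers. A hedged sketch of what such a test could look like for a striped layout (the fixed stripe_unit value is illustrative; a real driver would read it from its layout segment):

/*
 * Hypothetical pg_test: only allow coalescing of requests that fall in
 * the same stripe unit. Returns non-zero when the pages may be merged.
 */
static int example_pg_test(struct nfs_pageio_descriptor *pgio,
			   struct nfs_page *prev, struct nfs_page *req)
{
	u32 stripe_unit = 1024 * 1024;     /* illustrative value */
	u64 p_stripe = req_offset(prev);
	u64 r_stripe = req_offset(req);

	do_div(p_stripe, stripe_unit);     /* quotient left in p_stripe */
	do_div(r_stripe, stripe_unit);

	return p_stripe == r_stripe;
}
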
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 1b1bc1a0fb0a..f38813a0a295 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -30,6 +30,7 @@
30#include <linux/nfs_fs.h> 30#include <linux/nfs_fs.h>
31#include "internal.h" 31#include "internal.h"
32#include "pnfs.h" 32#include "pnfs.h"
33#include "iostat.h"
33 34
34#define NFSDBG_FACILITY NFSDBG_PNFS 35#define NFSDBG_FACILITY NFSDBG_PNFS
35 36
@@ -74,10 +75,8 @@ find_pnfs_driver(u32 id)
74void 75void
75unset_pnfs_layoutdriver(struct nfs_server *nfss) 76unset_pnfs_layoutdriver(struct nfs_server *nfss)
76{ 77{
77 if (nfss->pnfs_curr_ld) { 78 if (nfss->pnfs_curr_ld)
78 nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
79 module_put(nfss->pnfs_curr_ld->owner); 79 module_put(nfss->pnfs_curr_ld->owner);
80 }
81 nfss->pnfs_curr_ld = NULL; 80 nfss->pnfs_curr_ld = NULL;
82} 81}
83 82
@@ -115,13 +114,7 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
115 goto out_no_driver; 114 goto out_no_driver;
116 } 115 }
117 server->pnfs_curr_ld = ld_type; 116 server->pnfs_curr_ld = ld_type;
118 if (ld_type->set_layoutdriver(server)) { 117
119 printk(KERN_ERR
120 "%s: Error initializing mount point for layout driver %u.\n",
121 __func__, id);
122 module_put(ld_type->owner);
123 goto out_no_driver;
124 }
125 dprintk("%s: pNFS module for %u set\n", __func__, id); 118 dprintk("%s: pNFS module for %u set\n", __func__, id);
126 return; 119 return;
127 120
@@ -230,37 +223,41 @@ static void free_lseg(struct pnfs_layout_segment *lseg)
230 put_layout_hdr(NFS_I(ino)->layout); 223 put_layout_hdr(NFS_I(ino)->layout);
231} 224}
232 225
233/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg 226static void
234 * could sleep, so must be called outside of the lock. 227put_lseg_common(struct pnfs_layout_segment *lseg)
235 * Returns 1 if object was removed, otherwise return 0. 228{
236 */ 229 struct inode *inode = lseg->pls_layout->plh_inode;
237static int 230
238put_lseg_locked(struct pnfs_layout_segment *lseg, 231 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
239 struct list_head *tmp_list) 232 list_del_init(&lseg->pls_list);
233 if (list_empty(&lseg->pls_layout->plh_segs)) {
234 set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
235 /* Matched by initial refcount set in alloc_init_layout_hdr */
236 put_layout_hdr_locked(lseg->pls_layout);
237 }
238 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
239}
240
241void
242put_lseg(struct pnfs_layout_segment *lseg)
240{ 243{
244 struct inode *inode;
245
246 if (!lseg)
247 return;
248
241 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, 249 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
242 atomic_read(&lseg->pls_refcount), 250 atomic_read(&lseg->pls_refcount),
243 test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 251 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
244 if (atomic_dec_and_test(&lseg->pls_refcount)) { 252 inode = lseg->pls_layout->plh_inode;
245 struct inode *ino = lseg->pls_layout->plh_inode; 253 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
254 LIST_HEAD(free_me);
246 255
247 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 256 put_lseg_common(lseg);
248 list_del(&lseg->pls_list); 257 list_add(&lseg->pls_list, &free_me);
249 if (list_empty(&lseg->pls_layout->plh_segs)) { 258 spin_unlock(&inode->i_lock);
250 struct nfs_client *clp; 259 pnfs_free_lseg_list(&free_me);
251
252 clp = NFS_SERVER(ino)->nfs_client;
253 spin_lock(&clp->cl_lock);
254 /* List does not take a reference, so no need for put here */
255 list_del_init(&lseg->pls_layout->plh_layouts);
256 spin_unlock(&clp->cl_lock);
257 clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
258 }
259 rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
260 list_add(&lseg->pls_list, tmp_list);
261 return 1;
262 } 260 }
263 return 0;
264} 261}
265 262
266static bool 263static bool
@@ -281,7 +278,13 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
281 * list. It will now be removed when all 278 * list. It will now be removed when all
282 * outstanding io is finished. 279 * outstanding io is finished.
283 */ 280 */
284 rv = put_lseg_locked(lseg, tmp_list); 281 dprintk("%s: lseg %p ref %d\n", __func__, lseg,
282 atomic_read(&lseg->pls_refcount));
283 if (atomic_dec_and_test(&lseg->pls_refcount)) {
284 put_lseg_common(lseg);
285 list_add(&lseg->pls_list, tmp_list);
286 rv = 1;
287 }
285 } 288 }
286 return rv; 289 return rv;
287} 290}
@@ -299,6 +302,11 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
299 302
300 dprintk("%s:Begin lo %p\n", __func__, lo); 303 dprintk("%s:Begin lo %p\n", __func__, lo);
301 304
305 if (list_empty(&lo->plh_segs)) {
306 if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
307 put_layout_hdr_locked(lo);
308 return 0;
309 }
302 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) 310 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
303 if (should_free_lseg(lseg->pls_range.iomode, iomode)) { 311 if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
304 dprintk("%s: freeing lseg %p iomode %d " 312 dprintk("%s: freeing lseg %p iomode %d "
@@ -312,11 +320,27 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
312 return invalid - removed; 320 return invalid - removed;
313} 321}
314 322
323/* note free_me must contain lsegs from a single layout_hdr */
315void 324void
316pnfs_free_lseg_list(struct list_head *free_me) 325pnfs_free_lseg_list(struct list_head *free_me)
317{ 326{
318 struct pnfs_layout_segment *lseg, *tmp; 327 struct pnfs_layout_segment *lseg, *tmp;
328 struct pnfs_layout_hdr *lo;
329
330 if (list_empty(free_me))
331 return;
319 332
333 lo = list_first_entry(free_me, struct pnfs_layout_segment,
334 pls_list)->pls_layout;
335
336 if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) {
337 struct nfs_client *clp;
338
339 clp = NFS_SERVER(lo->plh_inode)->nfs_client;
340 spin_lock(&clp->cl_lock);
341 list_del_init(&lo->plh_layouts);
342 spin_unlock(&clp->cl_lock);
343 }
320 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { 344 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
321 list_del(&lseg->pls_list); 345 list_del(&lseg->pls_list);
322 free_lseg(lseg); 346 free_lseg(lseg);
@@ -332,10 +356,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
332 spin_lock(&nfsi->vfs_inode.i_lock); 356 spin_lock(&nfsi->vfs_inode.i_lock);
333 lo = nfsi->layout; 357 lo = nfsi->layout;
334 if (lo) { 358 if (lo) {
335 set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags); 359 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
336 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY); 360 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
337 /* Matched by refcount set to 1 in alloc_init_layout_hdr */
338 put_layout_hdr_locked(lo);
339 } 361 }
340 spin_unlock(&nfsi->vfs_inode.i_lock); 362 spin_unlock(&nfsi->vfs_inode.i_lock);
341 pnfs_free_lseg_list(&tmp_list); 363 pnfs_free_lseg_list(&tmp_list);
@@ -403,6 +425,7 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
403 (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0) 425 (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
404 return true; 426 return true;
405 return lo->plh_block_lgets || 427 return lo->plh_block_lgets ||
428 test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
406 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 429 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
407 (list_empty(&lo->plh_segs) && 430 (list_empty(&lo->plh_segs) &&
408 (atomic_read(&lo->plh_outstanding) > lget)); 431 (atomic_read(&lo->plh_outstanding) > lget));
@@ -674,7 +697,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
674 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 697 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
675 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 698 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
676 is_matching_lseg(lseg, iomode)) { 699 is_matching_lseg(lseg, iomode)) {
677 ret = lseg; 700 ret = get_lseg(lseg);
678 break; 701 break;
679 } 702 }
680 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0) 703 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
@@ -699,6 +722,7 @@ pnfs_update_layout(struct inode *ino,
699 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; 722 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
700 struct pnfs_layout_hdr *lo; 723 struct pnfs_layout_hdr *lo;
701 struct pnfs_layout_segment *lseg = NULL; 724 struct pnfs_layout_segment *lseg = NULL;
725 bool first = false;
702 726
703 if (!pnfs_enabled_sb(NFS_SERVER(ino))) 727 if (!pnfs_enabled_sb(NFS_SERVER(ino)))
704 return NULL; 728 return NULL;
@@ -715,21 +739,25 @@ pnfs_update_layout(struct inode *ino,
715 dprintk("%s matches recall, use MDS\n", __func__); 739 dprintk("%s matches recall, use MDS\n", __func__);
716 goto out_unlock; 740 goto out_unlock;
717 } 741 }
718 /* Check to see if the layout for the given range already exists */
719 lseg = pnfs_find_lseg(lo, iomode);
720 if (lseg)
721 goto out_unlock;
722 742
723 /* if LAYOUTGET already failed once we don't try again */ 743 /* if LAYOUTGET already failed once we don't try again */
724 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags)) 744 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
725 goto out_unlock; 745 goto out_unlock;
726 746
747 /* Check to see if the layout for the given range already exists */
748 lseg = pnfs_find_lseg(lo, iomode);
749 if (lseg)
750 goto out_unlock;
751
727 if (pnfs_layoutgets_blocked(lo, NULL, 0)) 752 if (pnfs_layoutgets_blocked(lo, NULL, 0))
728 goto out_unlock; 753 goto out_unlock;
729 atomic_inc(&lo->plh_outstanding); 754 atomic_inc(&lo->plh_outstanding);
730 755
731 get_layout_hdr(lo); 756 get_layout_hdr(lo);
732 if (list_empty(&lo->plh_segs)) { 757 if (list_empty(&lo->plh_segs))
758 first = true;
759 spin_unlock(&ino->i_lock);
760 if (first) {
733 /* The lo must be on the clp list if there is any 761 /* The lo must be on the clp list if there is any
734 * chance of a CB_LAYOUTRECALL(FILE) coming in. 762 * chance of a CB_LAYOUTRECALL(FILE) coming in.
735 */ 763 */
@@ -738,24 +766,18 @@ pnfs_update_layout(struct inode *ino,
738 list_add_tail(&lo->plh_layouts, &clp->cl_layouts); 766 list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
739 spin_unlock(&clp->cl_lock); 767 spin_unlock(&clp->cl_lock);
740 } 768 }
741 spin_unlock(&ino->i_lock);
742 769
743 lseg = send_layoutget(lo, ctx, iomode); 770 lseg = send_layoutget(lo, ctx, iomode);
744 if (!lseg) { 771 if (!lseg && first) {
745 spin_lock(&ino->i_lock); 772 spin_lock(&clp->cl_lock);
746 if (list_empty(&lo->plh_segs)) { 773 list_del_init(&lo->plh_layouts);
747 spin_lock(&clp->cl_lock); 774 spin_unlock(&clp->cl_lock);
748 list_del_init(&lo->plh_layouts);
749 spin_unlock(&clp->cl_lock);
750 clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
751 }
752 spin_unlock(&ino->i_lock);
753 } 775 }
754 atomic_dec(&lo->plh_outstanding); 776 atomic_dec(&lo->plh_outstanding);
755 put_layout_hdr(lo); 777 put_layout_hdr(lo);
756out: 778out:
757 dprintk("%s end, state 0x%lx lseg %p\n", __func__, 779 dprintk("%s end, state 0x%lx lseg %p\n", __func__,
758 nfsi->layout->plh_flags, lseg); 780 nfsi->layout ? nfsi->layout->plh_flags : -1, lseg);
759 return lseg; 781 return lseg;
760out_unlock: 782out_unlock:
761 spin_unlock(&ino->i_lock); 783 spin_unlock(&ino->i_lock);
@@ -808,7 +830,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
808 } 830 }
809 init_lseg(lo, lseg); 831 init_lseg(lo, lseg);
810 lseg->pls_range = res->range; 832 lseg->pls_range = res->range;
811 *lgp->lsegpp = lseg; 833 *lgp->lsegpp = get_lseg(lseg);
812 pnfs_insert_layout(lo, lseg); 834 pnfs_insert_layout(lo, lseg);
813 835
814 if (res->return_on_close) { 836 if (res->return_on_close) {
@@ -829,137 +851,97 @@ out_forget_reply:
829 goto out; 851 goto out;
830} 852}
831 853
832/* 854static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio,
833 * Device ID cache. Currently supports one layout type per struct nfs_client. 855 struct nfs_page *prev,
834 * Add layout type to the lookup key to expand to support multiple types. 856 struct nfs_page *req)
835 */
836int
837pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
838 void (*free_callback)(struct pnfs_deviceid_node *))
839{ 857{
840 struct pnfs_deviceid_cache *c; 858 if (pgio->pg_count == prev->wb_bytes) {
841 859 /* This is first coelesce call for a series of nfs_pages */
842 c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL); 860 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
843 if (!c) 861 prev->wb_context,
844 return -ENOMEM; 862 IOMODE_READ);
845 spin_lock(&clp->cl_lock);
846 if (clp->cl_devid_cache != NULL) {
847 atomic_inc(&clp->cl_devid_cache->dc_ref);
848 dprintk("%s [kref [%d]]\n", __func__,
849 atomic_read(&clp->cl_devid_cache->dc_ref));
850 kfree(c);
851 } else {
852 /* kzalloc initializes hlists */
853 spin_lock_init(&c->dc_lock);
854 atomic_set(&c->dc_ref, 1);
855 c->dc_free_callback = free_callback;
856 clp->cl_devid_cache = c;
857 dprintk("%s [new]\n", __func__);
858 } 863 }
859 spin_unlock(&clp->cl_lock); 864 return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
860 return 0;
861} 865}
862EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
863 866
864/*
865 * Called from pnfs_layoutdriver_type->free_lseg
866 * last layout segment reference frees deviceid
867 */
868void 867void
869pnfs_put_deviceid(struct pnfs_deviceid_cache *c, 868pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
870 struct pnfs_deviceid_node *devid)
871{ 869{
872 struct nfs4_deviceid *id = &devid->de_id; 870 struct pnfs_layoutdriver_type *ld;
873 struct pnfs_deviceid_node *d;
874 struct hlist_node *n;
875 long h = nfs4_deviceid_hash(id);
876 871
877 dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref)); 872 ld = NFS_SERVER(inode)->pnfs_curr_ld;
878 if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock)) 873 pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
879 return; 874}
880 875
881 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node) 876static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
882 if (!memcmp(&d->de_id, id, sizeof(*id))) { 877 struct nfs_page *prev,
883 hlist_del_rcu(&d->de_node); 878 struct nfs_page *req)
884 spin_unlock(&c->dc_lock); 879{
885 synchronize_rcu(); 880 if (pgio->pg_count == prev->wb_bytes) {
886 c->dc_free_callback(devid); 881 /* This is first coelesce call for a series of nfs_pages */
887 return; 882 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
888 } 883 prev->wb_context,
889 spin_unlock(&c->dc_lock); 884 IOMODE_RW);
890 /* Why wasn't it found in the list? */
891 BUG();
892}
893EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
894
895/* Find and reference a deviceid */
896struct pnfs_deviceid_node *
897pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
898{
899 struct pnfs_deviceid_node *d;
900 struct hlist_node *n;
901 long hash = nfs4_deviceid_hash(id);
902
903 dprintk("--> %s hash %ld\n", __func__, hash);
904 rcu_read_lock();
905 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
906 if (!memcmp(&d->de_id, id, sizeof(*id))) {
907 if (!atomic_inc_not_zero(&d->de_ref)) {
908 goto fail;
909 } else {
910 rcu_read_unlock();
911 return d;
912 }
913 }
914 } 885 }
915fail: 886 return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
916 rcu_read_unlock(); 887}
917 return NULL; 888
889void
890pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
891{
892 struct pnfs_layoutdriver_type *ld;
893
894 ld = NFS_SERVER(inode)->pnfs_curr_ld;
895 pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
896}
897
898enum pnfs_try_status
899pnfs_try_to_write_data(struct nfs_write_data *wdata,
900 const struct rpc_call_ops *call_ops, int how)
901{
902 struct inode *inode = wdata->inode;
903 enum pnfs_try_status trypnfs;
904 struct nfs_server *nfss = NFS_SERVER(inode);
905
906 wdata->mds_ops = call_ops;
907
908 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
909 inode->i_ino, wdata->args.count, wdata->args.offset, how);
910
911 trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
912 if (trypnfs == PNFS_NOT_ATTEMPTED) {
913 put_lseg(wdata->lseg);
914 wdata->lseg = NULL;
915 } else
916 nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
917
918 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
919 return trypnfs;
918} 920}
919EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
920 921
921/* 922/*
922 * Add a deviceid to the cache. 923 * Call the appropriate parallel I/O subsystem read function.
923 * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
924 */ 924 */
925struct pnfs_deviceid_node * 925enum pnfs_try_status
926pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new) 926pnfs_try_to_read_data(struct nfs_read_data *rdata,
927{ 927 const struct rpc_call_ops *call_ops)
928 struct pnfs_deviceid_node *d;
929 long hash = nfs4_deviceid_hash(&new->de_id);
930
931 dprintk("--> %s hash %ld\n", __func__, hash);
932 spin_lock(&c->dc_lock);
933 d = pnfs_find_get_deviceid(c, &new->de_id);
934 if (d) {
935 spin_unlock(&c->dc_lock);
936 dprintk("%s [discard]\n", __func__);
937 c->dc_free_callback(new);
938 return d;
939 }
940 INIT_HLIST_NODE(&new->de_node);
941 atomic_set(&new->de_ref, 1);
942 hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
943 spin_unlock(&c->dc_lock);
944 dprintk("%s [new]\n", __func__);
945 return new;
946}
947EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
948
949void
950pnfs_put_deviceid_cache(struct nfs_client *clp)
951{ 928{
952 struct pnfs_deviceid_cache *local = clp->cl_devid_cache; 929 struct inode *inode = rdata->inode;
930 struct nfs_server *nfss = NFS_SERVER(inode);
931 enum pnfs_try_status trypnfs;
953 932
954 dprintk("--> %s ({%d})\n", __func__, atomic_read(&local->dc_ref)); 933 rdata->mds_ops = call_ops;
955 if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) { 934
956 int i; 935 dprintk("%s: Reading ino:%lu %u@%llu\n",
957 /* Verify cache is empty */ 936 __func__, inode->i_ino, rdata->args.count, rdata->args.offset);
958 for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) 937
959 BUG_ON(!hlist_empty(&local->dc_deviceids[i])); 938 trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
960 clp->cl_devid_cache = NULL; 939 if (trypnfs == PNFS_NOT_ATTEMPTED) {
961 spin_unlock(&clp->cl_lock); 940 put_lseg(rdata->lseg);
962 kfree(local); 941 rdata->lseg = NULL;
942 } else {
943 nfs_inc_stats(inode, NFSIOS_PNFS_READ);
963 } 944 }
945 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
946 return trypnfs;
964} 947}
965EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
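
put_lseg() and get_lseg() above make layout segments reference counted, and pnfs_update_layout() now returns a referenced segment. A rough sketch of the caller discipline this imposes; example_setup_read is hypothetical and error handling is omitted, only the pnfs helpers themselves come from the patch:

/*
 * Hypothetical caller: hold one reference for the duration of the lookup
 * and a second one owned by the rpc data, released on I/O completion.
 */
static void example_setup_read(struct inode *inode, struct nfs_open_context *ctx,
			       struct nfs_read_data *data)
{
	struct pnfs_layout_segment *lseg;

	lseg = pnfs_update_layout(inode, ctx, IOMODE_READ); /* returns a reference */
	data->lseg = get_lseg(lseg);   /* extra reference kept in the rpc data */
	/* ... issue the read ... */
	put_lseg(lseg);                /* drop the lookup reference */
	/* data->lseg is dropped later, e.g. in nfs_readdata_release() */
}
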
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e2612ea0cbed..6380b9405bcd 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,6 +30,8 @@
30#ifndef FS_NFS_PNFS_H 30#ifndef FS_NFS_PNFS_H
31#define FS_NFS_PNFS_H 31#define FS_NFS_PNFS_H
32 32
33#include <linux/nfs_page.h>
34
33enum { 35enum {
34 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ 36 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
35 NFS_LSEG_ROC, /* roc bit received from server */ 37 NFS_LSEG_ROC, /* roc bit received from server */
@@ -43,6 +45,11 @@ struct pnfs_layout_segment {
43 struct pnfs_layout_hdr *pls_layout; 45 struct pnfs_layout_hdr *pls_layout;
44}; 46};
45 47
48enum pnfs_try_status {
49 PNFS_ATTEMPTED = 0,
50 PNFS_NOT_ATTEMPTED = 1,
51};
52
46#ifdef CONFIG_NFS_V4_1 53#ifdef CONFIG_NFS_V4_1
47 54
48#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" 55#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
@@ -61,10 +68,18 @@ struct pnfs_layoutdriver_type {
61 const u32 id; 68 const u32 id;
62 const char *name; 69 const char *name;
63 struct module *owner; 70 struct module *owner;
64 int (*set_layoutdriver) (struct nfs_server *);
65 int (*clear_layoutdriver) (struct nfs_server *);
66 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); 71 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
67 void (*free_lseg) (struct pnfs_layout_segment *lseg); 72 void (*free_lseg) (struct pnfs_layout_segment *lseg);
73
74 /* test for nfs page cache coalescing */
75 int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
76
77 /*
78 * Return PNFS_ATTEMPTED to indicate the layout code has attempted
79 * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
80 */
81 enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
82 enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
68}; 83};
69 84
70struct pnfs_layout_hdr { 85struct pnfs_layout_hdr {
@@ -90,52 +105,6 @@ struct pnfs_device {
90 unsigned int pglen; 105 unsigned int pglen;
91}; 106};
92 107
93/*
94 * Device ID RCU cache. A device ID is unique per client ID and layout type.
95 */
96#define NFS4_DEVICE_ID_HASH_BITS 5
97#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
98#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
99
100static inline u32
101nfs4_deviceid_hash(struct nfs4_deviceid *id)
102{
103 unsigned char *cptr = (unsigned char *)id->data;
104 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
105 u32 x = 0;
106
107 while (nbytes--) {
108 x *= 37;
109 x += *cptr++;
110 }
111 return x & NFS4_DEVICE_ID_HASH_MASK;
112}
113
114struct pnfs_deviceid_node {
115 struct hlist_node de_node;
116 struct nfs4_deviceid de_id;
117 atomic_t de_ref;
118};
119
120struct pnfs_deviceid_cache {
121 spinlock_t dc_lock;
122 atomic_t dc_ref;
123 void (*dc_free_callback)(struct pnfs_deviceid_node *);
124 struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
125};
126
127extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
128 void (*free_callback)(struct pnfs_deviceid_node *));
129extern void pnfs_put_deviceid_cache(struct nfs_client *);
130extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
131 struct pnfs_deviceid_cache *,
132 struct nfs4_deviceid *);
133extern struct pnfs_deviceid_node *pnfs_add_deviceid(
134 struct pnfs_deviceid_cache *,
135 struct pnfs_deviceid_node *);
136extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
137 struct pnfs_deviceid_node *devid);
138
139extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); 108extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
140extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); 109extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
141 110
@@ -146,11 +115,18 @@ extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
146 115
147/* pnfs.c */ 116/* pnfs.c */
148void get_layout_hdr(struct pnfs_layout_hdr *lo); 117void get_layout_hdr(struct pnfs_layout_hdr *lo);
118void put_lseg(struct pnfs_layout_segment *lseg);
149struct pnfs_layout_segment * 119struct pnfs_layout_segment *
150pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 120pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
151 enum pnfs_iomode access_type); 121 enum pnfs_iomode access_type);
152void set_pnfs_layoutdriver(struct nfs_server *, u32 id); 122void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
153void unset_pnfs_layoutdriver(struct nfs_server *); 123void unset_pnfs_layoutdriver(struct nfs_server *);
124enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
125 const struct rpc_call_ops *, int);
126enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
127 const struct rpc_call_ops *);
128void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
129void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
154int pnfs_layout_process(struct nfs4_layoutget *lgp); 130int pnfs_layout_process(struct nfs4_layoutget *lgp);
155void pnfs_free_lseg_list(struct list_head *tmp_list); 131void pnfs_free_lseg_list(struct list_head *tmp_list);
156void pnfs_destroy_layout(struct nfs_inode *); 132void pnfs_destroy_layout(struct nfs_inode *);
@@ -177,6 +153,16 @@ static inline int lo_fail_bit(u32 iomode)
177 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; 153 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
178} 154}
179 155
156static inline struct pnfs_layout_segment *
157get_lseg(struct pnfs_layout_segment *lseg)
158{
159 if (lseg) {
160 atomic_inc(&lseg->pls_refcount);
161 smp_mb__after_atomic_inc();
162 }
163 return lseg;
164}
165
180/* Return true if a layout driver is being used for this mountpoint */ 166/* Return true if a layout driver is being used for this mountpoint */
181static inline int pnfs_enabled_sb(struct nfs_server *nfss) 167static inline int pnfs_enabled_sb(struct nfs_server *nfss)
182{ 168{
@@ -194,12 +180,36 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
194} 180}
195 181
196static inline struct pnfs_layout_segment * 182static inline struct pnfs_layout_segment *
183get_lseg(struct pnfs_layout_segment *lseg)
184{
185 return NULL;
186}
187
188static inline void put_lseg(struct pnfs_layout_segment *lseg)
189{
190}
191
192static inline struct pnfs_layout_segment *
197pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 193pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
198 enum pnfs_iomode access_type) 194 enum pnfs_iomode access_type)
199{ 195{
200 return NULL; 196 return NULL;
201} 197}
202 198
199static inline enum pnfs_try_status
200pnfs_try_to_read_data(struct nfs_read_data *data,
201 const struct rpc_call_ops *call_ops)
202{
203 return PNFS_NOT_ATTEMPTED;
204}
205
206static inline enum pnfs_try_status
207pnfs_try_to_write_data(struct nfs_write_data *data,
208 const struct rpc_call_ops *call_ops, int how)
209{
210 return PNFS_NOT_ATTEMPTED;
211}
212
203static inline bool 213static inline bool
204pnfs_roc(struct inode *ino) 214pnfs_roc(struct inode *ino)
205{ 215{
@@ -230,6 +240,18 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
230{ 240{
231} 241}
232 242
243static inline void
244pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino)
245{
246 pgio->pg_test = NULL;
247}
248
249static inline void
250pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
251{
252 pgio->pg_test = NULL;
253}
254
233#endif /* CONFIG_NFS_V4_1 */ 255#endif /* CONFIG_NFS_V4_1 */
234 256
235#endif /* FS_NFS_PNFS_H */ 257#endif /* FS_NFS_PNFS_H */
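
With set_layoutdriver/clear_layoutdriver gone and the pg_test/read_pagelist/write_pagelist hooks added, a layout driver now registers roughly the ops table below. A skeletal sketch of such a module; every example_* handler is invented and not shown, only the struct fields and the register/unregister calls come from this header:

static struct pnfs_layoutdriver_type example_layout_type = {
	.id		= LAYOUT_NFSV4_1_FILES,   /* or a vendor layout type */
	.name		= "example-layoutdriver",
	.owner		= THIS_MODULE,
	.alloc_lseg	= example_alloc_lseg,
	.free_lseg	= example_free_lseg,
	.pg_test	= example_pg_test,
	.read_pagelist	= example_read_pagelist,  /* return PNFS_ATTEMPTED, or   */
	.write_pagelist	= example_write_pagelist, /* PNFS_NOT_ATTEMPTED to fall  */
						  /* back to ordinary NFS I/O    */
};

static int __init example_layout_init(void)
{
	return pnfs_register_layoutdriver(&example_layout_type);
}

static void __exit example_layout_exit(void)
{
	pnfs_unregister_layoutdriver(&example_layout_type);
}

module_init(example_layout_init);
module_exit(example_layout_exit);
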
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 77d5e21c4ad6..b8ec170f2a0f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -741,4 +741,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
741 .lock = nfs_proc_lock, 741 .lock = nfs_proc_lock,
742 .lock_check_bounds = nfs_lock_check_bounds, 742 .lock_check_bounds = nfs_lock_check_bounds,
743 .close_context = nfs_close_context, 743 .close_context = nfs_close_context,
744 .init_client = nfs_init_client,
744}; 745};
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index aedcaa7f291f..7cded2b12a05 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -18,19 +18,20 @@
18#include <linux/sunrpc/clnt.h> 18#include <linux/sunrpc/clnt.h>
19#include <linux/nfs_fs.h> 19#include <linux/nfs_fs.h>
20#include <linux/nfs_page.h> 20#include <linux/nfs_page.h>
21#include <linux/module.h>
21 22
22#include <asm/system.h> 23#include <asm/system.h>
24#include "pnfs.h"
23 25
24#include "nfs4_fs.h" 26#include "nfs4_fs.h"
25#include "internal.h" 27#include "internal.h"
26#include "iostat.h" 28#include "iostat.h"
27#include "fscache.h" 29#include "fscache.h"
28#include "pnfs.h"
29 30
30#define NFSDBG_FACILITY NFSDBG_PAGECACHE 31#define NFSDBG_FACILITY NFSDBG_PAGECACHE
31 32
32static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int); 33static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc);
33static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int); 34static int nfs_pagein_one(struct nfs_pageio_descriptor *desc);
34static const struct rpc_call_ops nfs_read_partial_ops; 35static const struct rpc_call_ops nfs_read_partial_ops;
35static const struct rpc_call_ops nfs_read_full_ops; 36static const struct rpc_call_ops nfs_read_full_ops;
36 37
@@ -69,6 +70,7 @@ void nfs_readdata_free(struct nfs_read_data *p)
69 70
70static void nfs_readdata_release(struct nfs_read_data *rdata) 71static void nfs_readdata_release(struct nfs_read_data *rdata)
71{ 72{
73 put_lseg(rdata->lseg);
72 put_nfs_open_context(rdata->args.context); 74 put_nfs_open_context(rdata->args.context);
73 nfs_readdata_free(rdata); 75 nfs_readdata_free(rdata);
74} 76}
@@ -114,14 +116,13 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
114int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, 116int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
115 struct page *page) 117 struct page *page)
116{ 118{
117 LIST_HEAD(one_request);
118 struct nfs_page *new; 119 struct nfs_page *new;
119 unsigned int len; 120 unsigned int len;
121 struct nfs_pageio_descriptor pgio;
120 122
121 len = nfs_page_length(page); 123 len = nfs_page_length(page);
122 if (len == 0) 124 if (len == 0)
123 return nfs_return_empty_page(page); 125 return nfs_return_empty_page(page);
124 pnfs_update_layout(inode, ctx, IOMODE_READ);
125 new = nfs_create_request(ctx, inode, page, 0, len); 126 new = nfs_create_request(ctx, inode, page, 0, len);
126 if (IS_ERR(new)) { 127 if (IS_ERR(new)) {
127 unlock_page(page); 128 unlock_page(page);
@@ -130,11 +131,14 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
130 if (len < PAGE_CACHE_SIZE) 131 if (len < PAGE_CACHE_SIZE)
131 zero_user_segment(page, len, PAGE_CACHE_SIZE); 132 zero_user_segment(page, len, PAGE_CACHE_SIZE);
132 133
133 nfs_list_add_request(new, &one_request); 134 nfs_pageio_init(&pgio, inode, NULL, 0, 0);
135 nfs_list_add_request(new, &pgio.pg_list);
136 pgio.pg_count = len;
137
134 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) 138 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
135 nfs_pagein_multi(inode, &one_request, 1, len, 0); 139 nfs_pagein_multi(&pgio);
136 else 140 else
137 nfs_pagein_one(inode, &one_request, 1, len, 0); 141 nfs_pagein_one(&pgio);
138 return 0; 142 return 0;
139} 143}
140 144
@@ -155,24 +159,20 @@ static void nfs_readpage_release(struct nfs_page *req)
155 nfs_release_request(req); 159 nfs_release_request(req);
156} 160}
157 161
158/* 162int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
159 * Set up the NFS read request struct 163 const struct rpc_call_ops *call_ops)
160 */
161static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
162 const struct rpc_call_ops *call_ops,
163 unsigned int count, unsigned int offset)
164{ 164{
165 struct inode *inode = req->wb_context->path.dentry->d_inode; 165 struct inode *inode = data->inode;
166 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; 166 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
167 struct rpc_task *task; 167 struct rpc_task *task;
168 struct rpc_message msg = { 168 struct rpc_message msg = {
169 .rpc_argp = &data->args, 169 .rpc_argp = &data->args,
170 .rpc_resp = &data->res, 170 .rpc_resp = &data->res,
171 .rpc_cred = req->wb_context->cred, 171 .rpc_cred = data->cred,
172 }; 172 };
173 struct rpc_task_setup task_setup_data = { 173 struct rpc_task_setup task_setup_data = {
174 .task = &data->task, 174 .task = &data->task,
175 .rpc_client = NFS_CLIENT(inode), 175 .rpc_client = clnt,
176 .rpc_message = &msg, 176 .rpc_message = &msg,
177 .callback_ops = call_ops, 177 .callback_ops = call_ops,
178 .callback_data = data, 178 .callback_data = data,
@@ -180,9 +180,39 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
180 .flags = RPC_TASK_ASYNC | swap_flags, 180 .flags = RPC_TASK_ASYNC | swap_flags,
181 }; 181 };
182 182
183 /* Set up the initial task struct. */
184 NFS_PROTO(inode)->read_setup(data, &msg);
185
186 dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ "
187 "offset %llu)\n",
188 data->task.tk_pid,
189 inode->i_sb->s_id,
190 (long long)NFS_FILEID(inode),
191 data->args.count,
192 (unsigned long long)data->args.offset);
193
194 task = rpc_run_task(&task_setup_data);
195 if (IS_ERR(task))
196 return PTR_ERR(task);
197 rpc_put_task(task);
198 return 0;
199}
200EXPORT_SYMBOL_GPL(nfs_initiate_read);
201
202/*
203 * Set up the NFS read request struct
204 */
205static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
206 const struct rpc_call_ops *call_ops,
207 unsigned int count, unsigned int offset,
208 struct pnfs_layout_segment *lseg)
209{
210 struct inode *inode = req->wb_context->path.dentry->d_inode;
211
183 data->req = req; 212 data->req = req;
184 data->inode = inode; 213 data->inode = inode;
185 data->cred = msg.rpc_cred; 214 data->cred = req->wb_context->cred;
215 data->lseg = get_lseg(lseg);
186 216
187 data->args.fh = NFS_FH(inode); 217 data->args.fh = NFS_FH(inode);
188 data->args.offset = req_offset(req) + offset; 218 data->args.offset = req_offset(req) + offset;
@@ -197,21 +227,11 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
197 data->res.eof = 0; 227 data->res.eof = 0;
198 nfs_fattr_init(&data->fattr); 228 nfs_fattr_init(&data->fattr);
199 229
200 /* Set up the initial task struct. */ 230 if (data->lseg &&
201 NFS_PROTO(inode)->read_setup(data, &msg); 231 (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED))
202 232 return 0;
203 dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
204 data->task.tk_pid,
205 inode->i_sb->s_id,
206 (long long)NFS_FILEID(inode),
207 count,
208 (unsigned long long)data->args.offset);
209 233
210 task = rpc_run_task(&task_setup_data); 234 return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops);
211 if (IS_ERR(task))
212 return PTR_ERR(task);
213 rpc_put_task(task);
214 return 0;
215} 235}
216 236
217static void 237static void
@@ -240,20 +260,21 @@ nfs_async_read_error(struct list_head *head)
240 * won't see the new data until our attribute cache is updated. This is more 260 * won't see the new data until our attribute cache is updated. This is more
241 * or less conventional NFS client behavior. 261 * or less conventional NFS client behavior.
242 */ 262 */
243static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) 263static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
244{ 264{
245 struct nfs_page *req = nfs_list_entry(head->next); 265 struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
246 struct page *page = req->wb_page; 266 struct page *page = req->wb_page;
247 struct nfs_read_data *data; 267 struct nfs_read_data *data;
248 size_t rsize = NFS_SERVER(inode)->rsize, nbytes; 268 size_t rsize = NFS_SERVER(desc->pg_inode)->rsize, nbytes;
249 unsigned int offset; 269 unsigned int offset;
250 int requests = 0; 270 int requests = 0;
251 int ret = 0; 271 int ret = 0;
272 struct pnfs_layout_segment *lseg;
252 LIST_HEAD(list); 273 LIST_HEAD(list);
253 274
254 nfs_list_remove_request(req); 275 nfs_list_remove_request(req);
255 276
256 nbytes = count; 277 nbytes = desc->pg_count;
257 do { 278 do {
258 size_t len = min(nbytes,rsize); 279 size_t len = min(nbytes,rsize);
259 280
@@ -266,9 +287,11 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
266 } while(nbytes != 0); 287 } while(nbytes != 0);
267 atomic_set(&req->wb_complete, requests); 288 atomic_set(&req->wb_complete, requests);
268 289
290 BUG_ON(desc->pg_lseg != NULL);
291 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
269 ClearPageError(page); 292 ClearPageError(page);
270 offset = 0; 293 offset = 0;
271 nbytes = count; 294 nbytes = desc->pg_count;
272 do { 295 do {
273 int ret2; 296 int ret2;
274 297
@@ -280,12 +303,14 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
280 if (nbytes < rsize) 303 if (nbytes < rsize)
281 rsize = nbytes; 304 rsize = nbytes;
282 ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, 305 ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
283 rsize, offset); 306 rsize, offset, lseg);
284 if (ret == 0) 307 if (ret == 0)
285 ret = ret2; 308 ret = ret2;
286 offset += rsize; 309 offset += rsize;
287 nbytes -= rsize; 310 nbytes -= rsize;
288 } while (nbytes != 0); 311 } while (nbytes != 0);
312 put_lseg(lseg);
313 desc->pg_lseg = NULL;
289 314
290 return ret; 315 return ret;
291 316
@@ -300,16 +325,21 @@ out_bad:
300 return -ENOMEM; 325 return -ENOMEM;
301} 326}
302 327
303static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) 328static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
304{ 329{
305 struct nfs_page *req; 330 struct nfs_page *req;
306 struct page **pages; 331 struct page **pages;
307 struct nfs_read_data *data; 332 struct nfs_read_data *data;
333 struct list_head *head = &desc->pg_list;
334 struct pnfs_layout_segment *lseg = desc->pg_lseg;
308 int ret = -ENOMEM; 335 int ret = -ENOMEM;
309 336
310 data = nfs_readdata_alloc(npages); 337 data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base,
311 if (!data) 338 desc->pg_count));
312 goto out_bad; 339 if (!data) {
340 nfs_async_read_error(head);
341 goto out;
342 }
313 343
314 pages = data->pagevec; 344 pages = data->pagevec;
315 while (!list_empty(head)) { 345 while (!list_empty(head)) {
@@ -320,10 +350,14 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
320 *pages++ = req->wb_page; 350 *pages++ = req->wb_page;
321 } 351 }
322 req = nfs_list_entry(data->pages.next); 352 req = nfs_list_entry(data->pages.next);
353 if ((!lseg) && list_is_singular(&data->pages))
354 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
323 355
324 return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0); 356 ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count,
325out_bad: 357 0, lseg);
326 nfs_async_read_error(head); 358out:
359 put_lseg(lseg);
360 desc->pg_lseg = NULL;
327 return ret; 361 return ret;
328} 362}
329 363
@@ -366,6 +400,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
366 return; 400 return;
367 401
368 /* Yes, so retry the read at the end of the data */ 402 /* Yes, so retry the read at the end of the data */
403 data->mds_offset += resp->count;
369 argp->offset += resp->count; 404 argp->offset += resp->count;
370 argp->pgbase += resp->count; 405 argp->pgbase += resp->count;
371 argp->count -= resp->count; 406 argp->count -= resp->count;
@@ -625,7 +660,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
625 if (ret == 0) 660 if (ret == 0)
626 goto read_complete; /* all pages were read */ 661 goto read_complete; /* all pages were read */
627 662
628 pnfs_update_layout(inode, desc.ctx, IOMODE_READ); 663 pnfs_pageio_init_read(&pgio, inode);
629 if (rsize < PAGE_CACHE_SIZE) 664 if (rsize < PAGE_CACHE_SIZE)
630 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); 665 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
631 else 666 else
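The read.c hunks above reduce to one dispatch rule: nfs_read_rpcsetup() now only fills in the request, and if a layout segment is attached it lets pnfs_try_to_read_data() hand the I/O to a pNFS data server; only when that does not return PNFS_ATTEMPTED does it fall back to nfs_initiate_read() against the MDS. The following is an editor's standalone sketch of that shape only — try_pnfs_read, mds_read and read_data are hypothetical stand-ins, not the kernel symbols:

/* Standalone sketch of the try-pNFS-then-fall-back-to-MDS dispatch above.
 * All names here are illustrative stand-ins for the kernel functions. */
#include <stdbool.h>
#include <stdio.h>

enum pnfs_try_status { PNFS_ATTEMPTED, PNFS_NOT_ATTEMPTED };

struct read_data {
	bool has_lseg;			/* stands in for data->lseg != NULL */
};

static enum pnfs_try_status try_pnfs_read(struct read_data *d)
{
	/* A real layout driver would issue the READ to a data server here. */
	return d->has_lseg ? PNFS_ATTEMPTED : PNFS_NOT_ATTEMPTED;
}

static int mds_read(struct read_data *d)
{
	(void)d;
	printf("READ sent to the metadata server\n");
	return 0;
}

static int read_rpcsetup(struct read_data *d)
{
	if (d->has_lseg && try_pnfs_read(d) == PNFS_ATTEMPTED)
		return 0;		/* the layout driver owns this I/O now */
	return mds_read(d);		/* fallback: ordinary NFS read */
}

int main(void)
{
	struct read_data with_lseg = { .has_lseg = true };
	struct read_data without_lseg = { .has_lseg = false };

	read_rpcsetup(&with_lseg);	/* no MDS read issued */
	read_rpcsetup(&without_lseg);	/* prints the MDS fallback */
	return 0;
}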
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d3286583009a..2b8e9a5e366a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1008,6 +1008,27 @@ static int nfs_parse_security_flavors(char *value,
1008 return 1; 1008 return 1;
1009} 1009}
1010 1010
1011static int nfs_get_option_str(substring_t args[], char **option)
1012{
1013 kfree(*option);
1014 *option = match_strdup(args);
1015 return !option;
1016}
1017
1018static int nfs_get_option_ul(substring_t args[], unsigned long *option)
1019{
1020 int rc;
1021 char *string;
1022
1023 string = match_strdup(args);
1024 if (string == NULL)
1025 return -ENOMEM;
1026 rc = strict_strtoul(string, 10, option);
1027 kfree(string);
1028
1029 return rc;
1030}
1031
1011/* 1032/*
1012 * Error-check and convert a string of mount options from user space into 1033 * Error-check and convert a string of mount options from user space into
1013 * a data structure. The whole mount string is processed; bad options are 1034 * a data structure. The whole mount string is processed; bad options are
@@ -1156,155 +1177,82 @@ static int nfs_parse_mount_options(char *raw,
1156 * options that take numeric values 1177 * options that take numeric values
1157 */ 1178 */
1158 case Opt_port: 1179 case Opt_port:
1159 string = match_strdup(args); 1180 if (nfs_get_option_ul(args, &option) ||
1160 if (string == NULL) 1181 option > USHRT_MAX)
1161 goto out_nomem;
1162 rc = strict_strtoul(string, 10, &option);
1163 kfree(string);
1164 if (rc != 0 || option > USHRT_MAX)
1165 goto out_invalid_value; 1182 goto out_invalid_value;
1166 mnt->nfs_server.port = option; 1183 mnt->nfs_server.port = option;
1167 break; 1184 break;
1168 case Opt_rsize: 1185 case Opt_rsize:
1169 string = match_strdup(args); 1186 if (nfs_get_option_ul(args, &option))
1170 if (string == NULL)
1171 goto out_nomem;
1172 rc = strict_strtoul(string, 10, &option);
1173 kfree(string);
1174 if (rc != 0)
1175 goto out_invalid_value; 1187 goto out_invalid_value;
1176 mnt->rsize = option; 1188 mnt->rsize = option;
1177 break; 1189 break;
1178 case Opt_wsize: 1190 case Opt_wsize:
1179 string = match_strdup(args); 1191 if (nfs_get_option_ul(args, &option))
1180 if (string == NULL)
1181 goto out_nomem;
1182 rc = strict_strtoul(string, 10, &option);
1183 kfree(string);
1184 if (rc != 0)
1185 goto out_invalid_value; 1192 goto out_invalid_value;
1186 mnt->wsize = option; 1193 mnt->wsize = option;
1187 break; 1194 break;
1188 case Opt_bsize: 1195 case Opt_bsize:
1189 string = match_strdup(args); 1196 if (nfs_get_option_ul(args, &option))
1190 if (string == NULL)
1191 goto out_nomem;
1192 rc = strict_strtoul(string, 10, &option);
1193 kfree(string);
1194 if (rc != 0)
1195 goto out_invalid_value; 1197 goto out_invalid_value;
1196 mnt->bsize = option; 1198 mnt->bsize = option;
1197 break; 1199 break;
1198 case Opt_timeo: 1200 case Opt_timeo:
1199 string = match_strdup(args); 1201 if (nfs_get_option_ul(args, &option) || option == 0)
1200 if (string == NULL)
1201 goto out_nomem;
1202 rc = strict_strtoul(string, 10, &option);
1203 kfree(string);
1204 if (rc != 0 || option == 0)
1205 goto out_invalid_value; 1202 goto out_invalid_value;
1206 mnt->timeo = option; 1203 mnt->timeo = option;
1207 break; 1204 break;
1208 case Opt_retrans: 1205 case Opt_retrans:
1209 string = match_strdup(args); 1206 if (nfs_get_option_ul(args, &option) || option == 0)
1210 if (string == NULL)
1211 goto out_nomem;
1212 rc = strict_strtoul(string, 10, &option);
1213 kfree(string);
1214 if (rc != 0 || option == 0)
1215 goto out_invalid_value; 1207 goto out_invalid_value;
1216 mnt->retrans = option; 1208 mnt->retrans = option;
1217 break; 1209 break;
1218 case Opt_acregmin: 1210 case Opt_acregmin:
1219 string = match_strdup(args); 1211 if (nfs_get_option_ul(args, &option))
1220 if (string == NULL)
1221 goto out_nomem;
1222 rc = strict_strtoul(string, 10, &option);
1223 kfree(string);
1224 if (rc != 0)
1225 goto out_invalid_value; 1212 goto out_invalid_value;
1226 mnt->acregmin = option; 1213 mnt->acregmin = option;
1227 break; 1214 break;
1228 case Opt_acregmax: 1215 case Opt_acregmax:
1229 string = match_strdup(args); 1216 if (nfs_get_option_ul(args, &option))
1230 if (string == NULL)
1231 goto out_nomem;
1232 rc = strict_strtoul(string, 10, &option);
1233 kfree(string);
1234 if (rc != 0)
1235 goto out_invalid_value; 1217 goto out_invalid_value;
1236 mnt->acregmax = option; 1218 mnt->acregmax = option;
1237 break; 1219 break;
1238 case Opt_acdirmin: 1220 case Opt_acdirmin:
1239 string = match_strdup(args); 1221 if (nfs_get_option_ul(args, &option))
1240 if (string == NULL)
1241 goto out_nomem;
1242 rc = strict_strtoul(string, 10, &option);
1243 kfree(string);
1244 if (rc != 0)
1245 goto out_invalid_value; 1222 goto out_invalid_value;
1246 mnt->acdirmin = option; 1223 mnt->acdirmin = option;
1247 break; 1224 break;
1248 case Opt_acdirmax: 1225 case Opt_acdirmax:
1249 string = match_strdup(args); 1226 if (nfs_get_option_ul(args, &option))
1250 if (string == NULL)
1251 goto out_nomem;
1252 rc = strict_strtoul(string, 10, &option);
1253 kfree(string);
1254 if (rc != 0)
1255 goto out_invalid_value; 1227 goto out_invalid_value;
1256 mnt->acdirmax = option; 1228 mnt->acdirmax = option;
1257 break; 1229 break;
1258 case Opt_actimeo: 1230 case Opt_actimeo:
1259 string = match_strdup(args); 1231 if (nfs_get_option_ul(args, &option))
1260 if (string == NULL)
1261 goto out_nomem;
1262 rc = strict_strtoul(string, 10, &option);
1263 kfree(string);
1264 if (rc != 0)
1265 goto out_invalid_value; 1232 goto out_invalid_value;
1266 mnt->acregmin = mnt->acregmax = 1233 mnt->acregmin = mnt->acregmax =
1267 mnt->acdirmin = mnt->acdirmax = option; 1234 mnt->acdirmin = mnt->acdirmax = option;
1268 break; 1235 break;
1269 case Opt_namelen: 1236 case Opt_namelen:
1270 string = match_strdup(args); 1237 if (nfs_get_option_ul(args, &option))
1271 if (string == NULL)
1272 goto out_nomem;
1273 rc = strict_strtoul(string, 10, &option);
1274 kfree(string);
1275 if (rc != 0)
1276 goto out_invalid_value; 1238 goto out_invalid_value;
1277 mnt->namlen = option; 1239 mnt->namlen = option;
1278 break; 1240 break;
1279 case Opt_mountport: 1241 case Opt_mountport:
1280 string = match_strdup(args); 1242 if (nfs_get_option_ul(args, &option) ||
1281 if (string == NULL) 1243 option > USHRT_MAX)
1282 goto out_nomem;
1283 rc = strict_strtoul(string, 10, &option);
1284 kfree(string);
1285 if (rc != 0 || option > USHRT_MAX)
1286 goto out_invalid_value; 1244 goto out_invalid_value;
1287 mnt->mount_server.port = option; 1245 mnt->mount_server.port = option;
1288 break; 1246 break;
1289 case Opt_mountvers: 1247 case Opt_mountvers:
1290 string = match_strdup(args); 1248 if (nfs_get_option_ul(args, &option) ||
1291 if (string == NULL)
1292 goto out_nomem;
1293 rc = strict_strtoul(string, 10, &option);
1294 kfree(string);
1295 if (rc != 0 ||
1296 option < NFS_MNT_VERSION || 1249 option < NFS_MNT_VERSION ||
1297 option > NFS_MNT3_VERSION) 1250 option > NFS_MNT3_VERSION)
1298 goto out_invalid_value; 1251 goto out_invalid_value;
1299 mnt->mount_server.version = option; 1252 mnt->mount_server.version = option;
1300 break; 1253 break;
1301 case Opt_nfsvers: 1254 case Opt_nfsvers:
1302 string = match_strdup(args); 1255 if (nfs_get_option_ul(args, &option))
1303 if (string == NULL)
1304 goto out_nomem;
1305 rc = strict_strtoul(string, 10, &option);
1306 kfree(string);
1307 if (rc != 0)
1308 goto out_invalid_value; 1256 goto out_invalid_value;
1309 switch (option) { 1257 switch (option) {
1310 case NFS2_VERSION: 1258 case NFS2_VERSION:
@@ -1324,12 +1272,7 @@ static int nfs_parse_mount_options(char *raw,
1324 } 1272 }
1325 break; 1273 break;
1326 case Opt_minorversion: 1274 case Opt_minorversion:
1327 string = match_strdup(args); 1275 if (nfs_get_option_ul(args, &option))
1328 if (string == NULL)
1329 goto out_nomem;
1330 rc = strict_strtoul(string, 10, &option);
1331 kfree(string);
1332 if (rc != 0)
1333 goto out_invalid_value; 1276 goto out_invalid_value;
1334 if (option > NFS4_MAX_MINOR_VERSION) 1277 if (option > NFS4_MAX_MINOR_VERSION)
1335 goto out_invalid_value; 1278 goto out_invalid_value;
@@ -1365,21 +1308,18 @@ static int nfs_parse_mount_options(char *raw,
1365 case Opt_xprt_udp: 1308 case Opt_xprt_udp:
1366 mnt->flags &= ~NFS_MOUNT_TCP; 1309 mnt->flags &= ~NFS_MOUNT_TCP;
1367 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1310 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
1368 kfree(string);
1369 break; 1311 break;
1370 case Opt_xprt_tcp6: 1312 case Opt_xprt_tcp6:
1371 protofamily = AF_INET6; 1313 protofamily = AF_INET6;
1372 case Opt_xprt_tcp: 1314 case Opt_xprt_tcp:
1373 mnt->flags |= NFS_MOUNT_TCP; 1315 mnt->flags |= NFS_MOUNT_TCP;
1374 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1316 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1375 kfree(string);
1376 break; 1317 break;
1377 case Opt_xprt_rdma: 1318 case Opt_xprt_rdma:
1378 /* vector side protocols to TCP */ 1319 /* vector side protocols to TCP */
1379 mnt->flags |= NFS_MOUNT_TCP; 1320 mnt->flags |= NFS_MOUNT_TCP;
1380 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1321 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
1381 xprt_load_transport(string); 1322 xprt_load_transport(string);
1382 kfree(string);
1383 break; 1323 break;
1384 default: 1324 default:
1385 dfprintk(MOUNT, "NFS: unrecognized " 1325 dfprintk(MOUNT, "NFS: unrecognized "
@@ -1387,6 +1327,7 @@ static int nfs_parse_mount_options(char *raw,
1387 kfree(string); 1327 kfree(string);
1388 return 0; 1328 return 0;
1389 } 1329 }
1330 kfree(string);
1390 break; 1331 break;
1391 case Opt_mountproto: 1332 case Opt_mountproto:
1392 string = match_strdup(args); 1333 string = match_strdup(args);
@@ -1429,18 +1370,13 @@ static int nfs_parse_mount_options(char *raw,
1429 goto out_invalid_address; 1370 goto out_invalid_address;
1430 break; 1371 break;
1431 case Opt_clientaddr: 1372 case Opt_clientaddr:
1432 string = match_strdup(args); 1373 if (nfs_get_option_str(args, &mnt->client_address))
1433 if (string == NULL)
1434 goto out_nomem; 1374 goto out_nomem;
1435 kfree(mnt->client_address);
1436 mnt->client_address = string;
1437 break; 1375 break;
1438 case Opt_mounthost: 1376 case Opt_mounthost:
1439 string = match_strdup(args); 1377 if (nfs_get_option_str(args,
1440 if (string == NULL) 1378 &mnt->mount_server.hostname))
1441 goto out_nomem; 1379 goto out_nomem;
1442 kfree(mnt->mount_server.hostname);
1443 mnt->mount_server.hostname = string;
1444 break; 1380 break;
1445 case Opt_mountaddr: 1381 case Opt_mountaddr:
1446 string = match_strdup(args); 1382 string = match_strdup(args);
@@ -1480,11 +1416,8 @@ static int nfs_parse_mount_options(char *raw,
1480 }; 1416 };
1481 break; 1417 break;
1482 case Opt_fscache_uniq: 1418 case Opt_fscache_uniq:
1483 string = match_strdup(args); 1419 if (nfs_get_option_str(args, &mnt->fscache_uniq))
1484 if (string == NULL)
1485 goto out_nomem; 1420 goto out_nomem;
1486 kfree(mnt->fscache_uniq);
1487 mnt->fscache_uniq = string;
1488 mnt->options |= NFS_OPTION_FSCACHE; 1421 mnt->options |= NFS_OPTION_FSCACHE;
1489 break; 1422 break;
1490 case Opt_local_lock: 1423 case Opt_local_lock:
@@ -1694,99 +1627,59 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1694 return nfs_walk_authlist(args, &request); 1627 return nfs_walk_authlist(args, &request);
1695} 1628}
1696 1629
1697static int nfs_parse_simple_hostname(const char *dev_name, 1630/*
1698 char **hostname, size_t maxnamlen, 1631 * Split "dev_name" into "hostname:export_path".
1699 char **export_path, size_t maxpathlen) 1632 *
1633 * The leftmost colon demarks the split between the server's hostname
1634 * and the export path. If the hostname starts with a left square
1635 * bracket, then it may contain colons.
1636 *
1637 * Note: caller frees hostname and export path, even on error.
1638 */
1639static int nfs_parse_devname(const char *dev_name,
1640 char **hostname, size_t maxnamlen,
1641 char **export_path, size_t maxpathlen)
1700{ 1642{
1701 size_t len; 1643 size_t len;
1702 char *colon, *comma; 1644 char *end;
1703 1645
 1704 colon = strchr(dev_name, ':'); 1646 /* Is the host name protected with square brackets? */
1705 if (colon == NULL) 1647 if (*dev_name == '[') {
1706 goto out_bad_devname; 1648 end = strchr(++dev_name, ']');
1707 1649 if (end == NULL || end[1] != ':')
1708 len = colon - dev_name;
1709 if (len > maxnamlen)
1710 goto out_hostname;
1711
1712 /* N.B. caller will free nfs_server.hostname in all cases */
1713 *hostname = kstrndup(dev_name, len, GFP_KERNEL);
1714 if (!*hostname)
1715 goto out_nomem;
1716
1717 /* kill possible hostname list: not supported */
1718 comma = strchr(*hostname, ',');
1719 if (comma != NULL) {
1720 if (comma == *hostname)
1721 goto out_bad_devname; 1650 goto out_bad_devname;
1722 *comma = '\0';
1723 }
1724
1725 colon++;
1726 len = strlen(colon);
1727 if (len > maxpathlen)
1728 goto out_path;
1729 *export_path = kstrndup(colon, len, GFP_KERNEL);
1730 if (!*export_path)
1731 goto out_nomem;
1732
1733 dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
1734 return 0;
1735
1736out_bad_devname:
1737 dfprintk(MOUNT, "NFS: device name not in host:path format\n");
1738 return -EINVAL;
1739 1651
1740out_nomem: 1652 len = end - dev_name;
1741 dfprintk(MOUNT, "NFS: not enough memory to parse device name\n"); 1653 end++;
1742 return -ENOMEM; 1654 } else {
1743 1655 char *comma;
1744out_hostname:
1745 dfprintk(MOUNT, "NFS: server hostname too long\n");
1746 return -ENAMETOOLONG;
1747
1748out_path:
1749 dfprintk(MOUNT, "NFS: export pathname too long\n");
1750 return -ENAMETOOLONG;
1751}
1752
1753/*
1754 * Hostname has square brackets around it because it contains one or
1755 * more colons. We look for the first closing square bracket, and a
1756 * colon must follow it.
1757 */
1758static int nfs_parse_protected_hostname(const char *dev_name,
1759 char **hostname, size_t maxnamlen,
1760 char **export_path, size_t maxpathlen)
1761{
1762 size_t len;
1763 char *start, *end;
1764 1656
1765 start = (char *)(dev_name + 1); 1657 end = strchr(dev_name, ':');
1658 if (end == NULL)
1659 goto out_bad_devname;
1660 len = end - dev_name;
1766 1661
1767 end = strchr(start, ']'); 1662 /* kill possible hostname list: not supported */
1768 if (end == NULL) 1663 comma = strchr(dev_name, ',');
1769 goto out_bad_devname; 1664 if (comma != NULL && comma < end)
1770 if (*(end + 1) != ':') 1665 *comma = 0;
1771 goto out_bad_devname; 1666 }
1772 1667
1773 len = end - start;
1774 if (len > maxnamlen) 1668 if (len > maxnamlen)
1775 goto out_hostname; 1669 goto out_hostname;
1776 1670
1777 /* N.B. caller will free nfs_server.hostname in all cases */ 1671 /* N.B. caller will free nfs_server.hostname in all cases */
1778 *hostname = kstrndup(start, len, GFP_KERNEL); 1672 *hostname = kstrndup(dev_name, len, GFP_KERNEL);
1779 if (*hostname == NULL) 1673 if (*hostname == NULL)
1780 goto out_nomem; 1674 goto out_nomem;
1781 1675 len = strlen(++end);
1782 end += 2;
1783 len = strlen(end);
1784 if (len > maxpathlen) 1676 if (len > maxpathlen)
1785 goto out_path; 1677 goto out_path;
1786 *export_path = kstrndup(end, len, GFP_KERNEL); 1678 *export_path = kstrndup(end, len, GFP_KERNEL);
1787 if (!*export_path) 1679 if (!*export_path)
1788 goto out_nomem; 1680 goto out_nomem;
1789 1681
1682 dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
1790 return 0; 1683 return 0;
1791 1684
1792out_bad_devname: 1685out_bad_devname:
@@ -1807,29 +1700,6 @@ out_path:
1807} 1700}
1808 1701
1809/* 1702/*
1810 * Split "dev_name" into "hostname:export_path".
1811 *
1812 * The leftmost colon demarks the split between the server's hostname
1813 * and the export path. If the hostname starts with a left square
1814 * bracket, then it may contain colons.
1815 *
1816 * Note: caller frees hostname and export path, even on error.
1817 */
1818static int nfs_parse_devname(const char *dev_name,
1819 char **hostname, size_t maxnamlen,
1820 char **export_path, size_t maxpathlen)
1821{
1822 if (*dev_name == '[')
1823 return nfs_parse_protected_hostname(dev_name,
1824 hostname, maxnamlen,
1825 export_path, maxpathlen);
1826
1827 return nfs_parse_simple_hostname(dev_name,
1828 hostname, maxnamlen,
1829 export_path, maxpathlen);
1830}
1831
1832/*
1833 * Validate the NFS2/NFS3 mount data 1703 * Validate the NFS2/NFS3 mount data
1834 * - fills in the mount root filehandle 1704 * - fills in the mount root filehandle
1835 * 1705 *
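The rewritten nfs_parse_devname() above folds the old "simple" and "protected" parsers into one routine and documents its rule: the leftmost colon splits hostname from export path, unless the hostname is wrapped in square brackets, in which case it may itself contain colons. Below is a minimal userspace sketch of that rule only, assuming simplified handling (no maxnamlen/maxpathlen checks, no comma/host-list truncation, no diagnostics):

/* Userspace sketch of the host:path split rule documented above.
 * Simplified on purpose: no length limits and no host-list handling. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_devname(const char *dev_name, char **host, char **path)
{
	const char *end;
	size_t len;

	if (*dev_name == '[') {			/* bracketed host may contain colons */
		end = strchr(++dev_name, ']');
		if (end == NULL || end[1] != ':')
			return -1;
		len = end - dev_name;
		end++;				/* end now points at the ':' */
	} else {				/* plain host: leftmost colon splits */
		end = strchr(dev_name, ':');
		if (end == NULL)
			return -1;
		len = end - dev_name;
	}

	*host = malloc(len + 1);
	*path = strdup(end + 1);
	if (*host == NULL || *path == NULL)
		return -1;
	memcpy(*host, dev_name, len);
	(*host)[len] = '\0';
	return 0;
}

int main(void)
{
	char *host, *path;

	if (parse_devname("server.example.com:/export/home", &host, &path) == 0)
		printf("host=%s path=%s\n", host, path);
	if (parse_devname("[fe80::1]:/export", &host, &path) == 0)
		printf("host=%s path=%s\n", host, path);
	return 0;
}

Run as-is this prints host=server.example.com path=/export/home and host=fe80::1 path=/export, matching the two device-name forms the kernel parser accepts.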
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 42b92d7a9cc4..47a3ad63e0d5 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -28,6 +28,7 @@
28#include "iostat.h" 28#include "iostat.h"
29#include "nfs4_fs.h" 29#include "nfs4_fs.h"
30#include "fscache.h" 30#include "fscache.h"
31#include "pnfs.h"
31 32
32#define NFSDBG_FACILITY NFSDBG_PAGECACHE 33#define NFSDBG_FACILITY NFSDBG_PAGECACHE
33 34
@@ -96,6 +97,7 @@ void nfs_writedata_free(struct nfs_write_data *p)
96 97
97static void nfs_writedata_release(struct nfs_write_data *wdata) 98static void nfs_writedata_release(struct nfs_write_data *wdata)
98{ 99{
100 put_lseg(wdata->lseg);
99 put_nfs_open_context(wdata->args.context); 101 put_nfs_open_context(wdata->args.context);
100 nfs_writedata_free(wdata); 102 nfs_writedata_free(wdata);
101} 103}
@@ -781,25 +783,21 @@ static int flush_task_priority(int how)
781 return RPC_PRIORITY_NORMAL; 783 return RPC_PRIORITY_NORMAL;
782} 784}
783 785
784/* 786int nfs_initiate_write(struct nfs_write_data *data,
785 * Set up the argument/result storage required for the RPC call. 787 struct rpc_clnt *clnt,
786 */ 788 const struct rpc_call_ops *call_ops,
787static int nfs_write_rpcsetup(struct nfs_page *req, 789 int how)
788 struct nfs_write_data *data,
789 const struct rpc_call_ops *call_ops,
790 unsigned int count, unsigned int offset,
791 int how)
792{ 790{
793 struct inode *inode = req->wb_context->path.dentry->d_inode; 791 struct inode *inode = data->inode;
794 int priority = flush_task_priority(how); 792 int priority = flush_task_priority(how);
795 struct rpc_task *task; 793 struct rpc_task *task;
796 struct rpc_message msg = { 794 struct rpc_message msg = {
797 .rpc_argp = &data->args, 795 .rpc_argp = &data->args,
798 .rpc_resp = &data->res, 796 .rpc_resp = &data->res,
799 .rpc_cred = req->wb_context->cred, 797 .rpc_cred = data->cred,
800 }; 798 };
801 struct rpc_task_setup task_setup_data = { 799 struct rpc_task_setup task_setup_data = {
802 .rpc_client = NFS_CLIENT(inode), 800 .rpc_client = clnt,
803 .task = &data->task, 801 .task = &data->task,
804 .rpc_message = &msg, 802 .rpc_message = &msg,
805 .callback_ops = call_ops, 803 .callback_ops = call_ops,
@@ -810,12 +808,52 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
810 }; 808 };
811 int ret = 0; 809 int ret = 0;
812 810
811 /* Set up the initial task struct. */
812 NFS_PROTO(inode)->write_setup(data, &msg);
813
814 dprintk("NFS: %5u initiated write call "
815 "(req %s/%lld, %u bytes @ offset %llu)\n",
816 data->task.tk_pid,
817 inode->i_sb->s_id,
818 (long long)NFS_FILEID(inode),
819 data->args.count,
820 (unsigned long long)data->args.offset);
821
822 task = rpc_run_task(&task_setup_data);
823 if (IS_ERR(task)) {
824 ret = PTR_ERR(task);
825 goto out;
826 }
827 if (how & FLUSH_SYNC) {
828 ret = rpc_wait_for_completion_task(task);
829 if (ret == 0)
830 ret = task->tk_status;
831 }
832 rpc_put_task(task);
833out:
834 return ret;
835}
836EXPORT_SYMBOL_GPL(nfs_initiate_write);
837
838/*
839 * Set up the argument/result storage required for the RPC call.
840 */
841static int nfs_write_rpcsetup(struct nfs_page *req,
842 struct nfs_write_data *data,
843 const struct rpc_call_ops *call_ops,
844 unsigned int count, unsigned int offset,
845 struct pnfs_layout_segment *lseg,
846 int how)
847{
848 struct inode *inode = req->wb_context->path.dentry->d_inode;
849
813 /* Set up the RPC argument and reply structs 850 /* Set up the RPC argument and reply structs
814 * NB: take care not to mess about with data->commit et al. */ 851 * NB: take care not to mess about with data->commit et al. */
815 852
816 data->req = req; 853 data->req = req;
817 data->inode = inode = req->wb_context->path.dentry->d_inode; 854 data->inode = inode = req->wb_context->path.dentry->d_inode;
818 data->cred = msg.rpc_cred; 855 data->cred = req->wb_context->cred;
856 data->lseg = get_lseg(lseg);
819 857
820 data->args.fh = NFS_FH(inode); 858 data->args.fh = NFS_FH(inode);
821 data->args.offset = req_offset(req) + offset; 859 data->args.offset = req_offset(req) + offset;
@@ -836,30 +874,11 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
836 data->res.verf = &data->verf; 874 data->res.verf = &data->verf;
837 nfs_fattr_init(&data->fattr); 875 nfs_fattr_init(&data->fattr);
838 876
839 /* Set up the initial task struct. */ 877 if (data->lseg &&
840 NFS_PROTO(inode)->write_setup(data, &msg); 878 (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED))
841 879 return 0;
842 dprintk("NFS: %5u initiated write call "
843 "(req %s/%lld, %u bytes @ offset %llu)\n",
844 data->task.tk_pid,
845 inode->i_sb->s_id,
846 (long long)NFS_FILEID(inode),
847 count,
848 (unsigned long long)data->args.offset);
849 880
850 task = rpc_run_task(&task_setup_data); 881 return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
851 if (IS_ERR(task)) {
852 ret = PTR_ERR(task);
853 goto out;
854 }
855 if (how & FLUSH_SYNC) {
856 ret = rpc_wait_for_completion_task(task);
857 if (ret == 0)
858 ret = task->tk_status;
859 }
860 rpc_put_task(task);
861out:
862 return ret;
863} 882}
864 883
865/* If a nfs_flush_* function fails, it should remove reqs from @head and 884/* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -879,20 +898,21 @@ static void nfs_redirty_request(struct nfs_page *req)
879 * Generate multiple small requests to write out a single 898 * Generate multiple small requests to write out a single
880 * contiguous dirty area on one page. 899 * contiguous dirty area on one page.
881 */ 900 */
882static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) 901static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
883{ 902{
884 struct nfs_page *req = nfs_list_entry(head->next); 903 struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
885 struct page *page = req->wb_page; 904 struct page *page = req->wb_page;
886 struct nfs_write_data *data; 905 struct nfs_write_data *data;
887 size_t wsize = NFS_SERVER(inode)->wsize, nbytes; 906 size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes;
888 unsigned int offset; 907 unsigned int offset;
889 int requests = 0; 908 int requests = 0;
890 int ret = 0; 909 int ret = 0;
910 struct pnfs_layout_segment *lseg;
891 LIST_HEAD(list); 911 LIST_HEAD(list);
892 912
893 nfs_list_remove_request(req); 913 nfs_list_remove_request(req);
894 914
895 nbytes = count; 915 nbytes = desc->pg_count;
896 do { 916 do {
897 size_t len = min(nbytes, wsize); 917 size_t len = min(nbytes, wsize);
898 918
@@ -905,9 +925,11 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
905 } while (nbytes != 0); 925 } while (nbytes != 0);
906 atomic_set(&req->wb_complete, requests); 926 atomic_set(&req->wb_complete, requests);
907 927
928 BUG_ON(desc->pg_lseg);
929 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
908 ClearPageError(page); 930 ClearPageError(page);
909 offset = 0; 931 offset = 0;
910 nbytes = count; 932 nbytes = desc->pg_count;
911 do { 933 do {
912 int ret2; 934 int ret2;
913 935
@@ -919,13 +941,15 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
919 if (nbytes < wsize) 941 if (nbytes < wsize)
920 wsize = nbytes; 942 wsize = nbytes;
921 ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, 943 ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
922 wsize, offset, how); 944 wsize, offset, lseg, desc->pg_ioflags);
923 if (ret == 0) 945 if (ret == 0)
924 ret = ret2; 946 ret = ret2;
925 offset += wsize; 947 offset += wsize;
926 nbytes -= wsize; 948 nbytes -= wsize;
927 } while (nbytes != 0); 949 } while (nbytes != 0);
928 950
951 put_lseg(lseg);
952 desc->pg_lseg = NULL;
929 return ret; 953 return ret;
930 954
931out_bad: 955out_bad:
@@ -946,16 +970,26 @@ out_bad:
946 * This is the case if nfs_updatepage detects a conflicting request 970 * This is the case if nfs_updatepage detects a conflicting request
947 * that has been written but not committed. 971 * that has been written but not committed.
948 */ 972 */
949static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) 973static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
950{ 974{
951 struct nfs_page *req; 975 struct nfs_page *req;
952 struct page **pages; 976 struct page **pages;
953 struct nfs_write_data *data; 977 struct nfs_write_data *data;
978 struct list_head *head = &desc->pg_list;
979 struct pnfs_layout_segment *lseg = desc->pg_lseg;
980 int ret;
954 981
955 data = nfs_writedata_alloc(npages); 982 data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base,
956 if (!data) 983 desc->pg_count));
957 goto out_bad; 984 if (!data) {
958 985 while (!list_empty(head)) {
986 req = nfs_list_entry(head->next);
987 nfs_list_remove_request(req);
988 nfs_redirty_request(req);
989 }
990 ret = -ENOMEM;
991 goto out;
992 }
959 pages = data->pagevec; 993 pages = data->pagevec;
960 while (!list_empty(head)) { 994 while (!list_empty(head)) {
961 req = nfs_list_entry(head->next); 995 req = nfs_list_entry(head->next);
@@ -965,16 +999,15 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
965 *pages++ = req->wb_page; 999 *pages++ = req->wb_page;
966 } 1000 }
967 req = nfs_list_entry(data->pages.next); 1001 req = nfs_list_entry(data->pages.next);
1002 if ((!lseg) && list_is_singular(&data->pages))
1003 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
968 1004
969 /* Set up the argument struct */ 1005 /* Set up the argument struct */
970 return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how); 1006 ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags);
971 out_bad: 1007out:
972 while (!list_empty(head)) { 1008 put_lseg(lseg); /* Cleans any gotten in ->pg_test */
973 req = nfs_list_entry(head->next); 1009 desc->pg_lseg = NULL;
974 nfs_list_remove_request(req); 1010 return ret;
975 nfs_redirty_request(req);
976 }
977 return -ENOMEM;
978} 1011}
979 1012
980static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, 1013static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
@@ -982,6 +1015,8 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
982{ 1015{
983 size_t wsize = NFS_SERVER(inode)->wsize; 1016 size_t wsize = NFS_SERVER(inode)->wsize;
984 1017
1018 pnfs_pageio_init_write(pgio, inode);
1019
985 if (wsize < PAGE_CACHE_SIZE) 1020 if (wsize < PAGE_CACHE_SIZE)
986 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); 1021 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
987 else 1022 else
@@ -1132,7 +1167,7 @@ static const struct rpc_call_ops nfs_write_full_ops = {
1132/* 1167/*
1133 * This function is called when the WRITE call is complete. 1168 * This function is called when the WRITE call is complete.
1134 */ 1169 */
1135int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) 1170void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1136{ 1171{
1137 struct nfs_writeargs *argp = &data->args; 1172 struct nfs_writeargs *argp = &data->args;
1138 struct nfs_writeres *resp = &data->res; 1173 struct nfs_writeres *resp = &data->res;
@@ -1151,7 +1186,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1151 */ 1186 */
1152 status = NFS_PROTO(data->inode)->write_done(task, data); 1187 status = NFS_PROTO(data->inode)->write_done(task, data);
1153 if (status != 0) 1188 if (status != 0)
1154 return status; 1189 return;
1155 nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); 1190 nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
1156 1191
1157#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 1192#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -1166,6 +1201,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1166 */ 1201 */
1167 static unsigned long complain; 1202 static unsigned long complain;
1168 1203
1204 /* Note this will print the MDS for a DS write */
1169 if (time_before(complain, jiffies)) { 1205 if (time_before(complain, jiffies)) {
1170 dprintk("NFS: faulty NFS server %s:" 1206 dprintk("NFS: faulty NFS server %s:"
1171 " (committed = %d) != (stable = %d)\n", 1207 " (committed = %d) != (stable = %d)\n",
@@ -1186,6 +1222,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1186 /* Was this an NFSv2 write or an NFSv3 stable write? */ 1222 /* Was this an NFSv2 write or an NFSv3 stable write? */
1187 if (resp->verf->committed != NFS_UNSTABLE) { 1223 if (resp->verf->committed != NFS_UNSTABLE) {
1188 /* Resend from where the server left off */ 1224 /* Resend from where the server left off */
1225 data->mds_offset += resp->count;
1189 argp->offset += resp->count; 1226 argp->offset += resp->count;
1190 argp->pgbase += resp->count; 1227 argp->pgbase += resp->count;
1191 argp->count -= resp->count; 1228 argp->count -= resp->count;
@@ -1196,7 +1233,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1196 argp->stable = NFS_FILE_SYNC; 1233 argp->stable = NFS_FILE_SYNC;
1197 } 1234 }
1198 nfs_restart_rpc(task, server->nfs_client); 1235 nfs_restart_rpc(task, server->nfs_client);
1199 return -EAGAIN; 1236 return;
1200 } 1237 }
1201 if (time_before(complain, jiffies)) { 1238 if (time_before(complain, jiffies)) {
1202 printk(KERN_WARNING 1239 printk(KERN_WARNING
@@ -1207,7 +1244,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1207 /* Can't do anything about it except throw an error. */ 1244 /* Can't do anything about it except throw an error. */
1208 task->tk_status = -EIO; 1245 task->tk_status = -EIO;
1209 } 1246 }
1210 return 0; 1247 return;
1211} 1248}
1212 1249
1213 1250
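The nfs_writeback_done() hunk keeps the long-standing short-write recovery and merely teaches it about pNFS: when a stable write commits fewer bytes than requested, the request is resent from where the server stopped, and the new data->mds_offset is advanced in lockstep with argp->offset. A small standalone illustration of that bookkeeping, using made-up numbers:

/* Illustration of the short-write resend arithmetic above (not kernel code). */
#include <stdint.h>
#include <stdio.h>

struct write_args {
	uint64_t offset;	/* byte offset into the file */
	uint32_t pgbase;	/* offset into the first page of the pagevec */
	uint32_t count;		/* bytes still to be written */
};

static void resume_short_write(struct write_args *argp, uint32_t written)
{
	/* Mirrors: mds_offset += resp->count; offset += ...; pgbase += ...; count -= ... */
	argp->offset += written;
	argp->pgbase += written;
	argp->count  -= written;
}

int main(void)
{
	struct write_args args = { .offset = 4096, .pgbase = 0, .count = 8192 };

	resume_short_write(&args, 4096);	/* server only committed 4 KiB */
	printf("retry at offset=%llu pgbase=%u count=%u\n",
	       (unsigned long long)args.offset, args.pgbase, args.count);
	return 0;
}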
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 6023efa9f5d9..f88522b10a38 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -501,7 +501,7 @@ extern int nfs_writepage(struct page *page, struct writeback_control *wbc);
501extern int nfs_writepages(struct address_space *, struct writeback_control *); 501extern int nfs_writepages(struct address_space *, struct writeback_control *);
502extern int nfs_flush_incompatible(struct file *file, struct page *page); 502extern int nfs_flush_incompatible(struct file *file, struct page *page);
503extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); 503extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
504extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); 504extern void nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
505 505
506/* 506/*
507 * Try to write back everything synchronously (but check the 507 * Try to write back everything synchronously (but check the
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 3e112de12d8d..216cea5db0aa 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -30,6 +30,8 @@ struct nfs_client {
30#define NFS_CS_CALLBACK 1 /* - callback started */ 30#define NFS_CS_CALLBACK 1 /* - callback started */
31#define NFS_CS_IDMAP 2 /* - idmap started */ 31#define NFS_CS_IDMAP 2 /* - idmap started */
32#define NFS_CS_RENEWD 3 /* - renewd started */ 32#define NFS_CS_RENEWD 3 /* - renewd started */
33#define NFS_CS_STOP_RENEW 4 /* no more state to renew */
34#define NFS_CS_CHECK_LEASE_TIME 5 /* need to check lease time */
33 struct sockaddr_storage cl_addr; /* server identifier */ 35 struct sockaddr_storage cl_addr; /* server identifier */
34 size_t cl_addrlen; 36 size_t cl_addrlen;
35 char * cl_hostname; /* hostname of server */ 37 char * cl_hostname; /* hostname of server */
@@ -75,7 +77,6 @@ struct nfs_client {
75 u32 cl_exchange_flags; 77 u32 cl_exchange_flags;
 76 u32 cl_exchange_flags; 77 u32 cl_exchange_flags;

77 struct list_head cl_layouts; 79 struct list_head cl_layouts;
78 struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
79#endif /* CONFIG_NFS_V4 */ 80#endif /* CONFIG_NFS_V4 */
80 81
81#ifdef CONFIG_NFS_FSCACHE 82#ifdef CONFIG_NFS_FSCACHE
@@ -176,6 +177,7 @@ struct nfs_server {
176#define NFS_CAP_CTIME (1U << 12) 177#define NFS_CAP_CTIME (1U << 12)
177#define NFS_CAP_MTIME (1U << 13) 178#define NFS_CAP_MTIME (1U << 13)
178#define NFS_CAP_POSIX_LOCK (1U << 14) 179#define NFS_CAP_POSIX_LOCK (1U << 14)
180#define NFS_CAP_UIDGID_NOMAP (1U << 15)
179 181
180 182
181/* maximum number of slots to use */ 183/* maximum number of slots to use */
diff --git a/include/linux/nfs_idmap.h b/include/linux/nfs_idmap.h
index e8352dc5afb5..ae7d6a380dae 100644
--- a/include/linux/nfs_idmap.h
+++ b/include/linux/nfs_idmap.h
@@ -65,6 +65,7 @@ struct idmap_msg {
65 65
66/* Forward declaration to make this header independent of others */ 66/* Forward declaration to make this header independent of others */
67struct nfs_client; 67struct nfs_client;
68struct nfs_server;
68 69
69#ifdef CONFIG_NFS_USE_NEW_IDMAPPER 70#ifdef CONFIG_NFS_USE_NEW_IDMAPPER
70 71
@@ -96,10 +97,10 @@ void nfs_idmap_delete(struct nfs_client *);
96 97
97#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ 98#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
98 99
99int nfs_map_name_to_uid(struct nfs_client *, const char *, size_t, __u32 *); 100int nfs_map_name_to_uid(const struct nfs_server *, const char *, size_t, __u32 *);
100int nfs_map_group_to_gid(struct nfs_client *, const char *, size_t, __u32 *); 101int nfs_map_group_to_gid(const struct nfs_server *, const char *, size_t, __u32 *);
101int nfs_map_uid_to_name(struct nfs_client *, __u32, char *, size_t); 102int nfs_map_uid_to_name(const struct nfs_server *, __u32, char *, size_t);
102int nfs_map_gid_to_group(struct nfs_client *, __u32, char *, size_t); 103int nfs_map_gid_to_group(const struct nfs_server *, __u32, char *, size_t);
103 104
104extern unsigned int nfs_idmap_cache_timeout; 105extern unsigned int nfs_idmap_cache_timeout;
105#endif /* __KERNEL__ */ 106#endif /* __KERNEL__ */
diff --git a/include/linux/nfs_iostat.h b/include/linux/nfs_iostat.h
index 68b10f5f8907..8866bb3502ee 100644
--- a/include/linux/nfs_iostat.h
+++ b/include/linux/nfs_iostat.h
@@ -113,6 +113,8 @@ enum nfs_stat_eventcounters {
113 NFSIOS_SHORTREAD, 113 NFSIOS_SHORTREAD,
114 NFSIOS_SHORTWRITE, 114 NFSIOS_SHORTWRITE,
115 NFSIOS_DELAY, 115 NFSIOS_DELAY,
116 NFSIOS_PNFS_READ,
117 NFSIOS_PNFS_WRITE,
116 __NFSIOS_COUNTSMAX, 118 __NFSIOS_COUNTSMAX,
117}; 119};
118 120
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index d55cee73f634..90907ada6d52 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -59,9 +59,11 @@ struct nfs_pageio_descriptor {
59 unsigned int pg_base; 59 unsigned int pg_base;
60 60
61 struct inode *pg_inode; 61 struct inode *pg_inode;
62 int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int); 62 int (*pg_doio)(struct nfs_pageio_descriptor *);
63 int pg_ioflags; 63 int pg_ioflags;
64 int pg_error; 64 int pg_error;
65 struct pnfs_layout_segment *pg_lseg;
66 int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
65}; 67};
66 68
67#define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) 69#define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags))
@@ -79,7 +81,7 @@ extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst,
79 pgoff_t idx_start, unsigned int npages, int tag); 81 pgoff_t idx_start, unsigned int npages, int tag);
80extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, 82extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
81 struct inode *inode, 83 struct inode *inode,
82 int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), 84 int (*doio)(struct nfs_pageio_descriptor *desc),
83 size_t bsize, 85 size_t bsize,
84 int how); 86 int how);
85extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *, 87extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *,
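The nfs_page.h hunk is a calling-convention change: pg_doio and the flush/pagein routines behind it now receive the whole nfs_pageio_descriptor instead of an unpacked inode/list/npages/count/flags argument list, which is what lets the new pg_lseg and pg_test fields travel with the queued requests. A toy sketch of that convention, with hypothetical names:

/* Toy sketch of the descriptor-based pg_doio convention above. */
#include <stdio.h>

struct pageio_desc {
	unsigned int pg_count;			/* bytes queued */
	int pg_ioflags;
	int (*pg_doio)(struct pageio_desc *);	/* was: f(inode, list, npages, count, flags) */
};

static int flush_one(struct pageio_desc *desc)
{
	printf("flushing %u bytes, ioflags=%d\n", desc->pg_count, desc->pg_ioflags);
	return 0;
}

int main(void)
{
	struct pageio_desc desc = {
		.pg_count	= 16384,
		.pg_ioflags	= 0,
		.pg_doio	= flush_one,
	};

	return desc.pg_doio(&desc);	/* callers simply hand the descriptor over */
}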
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index b0068579bec2..2c2c67d2eb42 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1016,9 +1016,12 @@ struct nfs_read_data {
1016 unsigned int npages; /* Max length of pagevec */ 1016 unsigned int npages; /* Max length of pagevec */
1017 struct nfs_readargs args; 1017 struct nfs_readargs args;
1018 struct nfs_readres res; 1018 struct nfs_readres res;
1019#ifdef CONFIG_NFS_V4
1020 unsigned long timestamp; /* For lease renewal */ 1019 unsigned long timestamp; /* For lease renewal */
1021#endif 1020 struct pnfs_layout_segment *lseg;
1021 struct nfs_client *ds_clp; /* pNFS data server */
1022 const struct rpc_call_ops *mds_ops;
1023 int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data);
1024 __u64 mds_offset;
1022 struct page *page_array[NFS_PAGEVEC_SIZE]; 1025 struct page *page_array[NFS_PAGEVEC_SIZE];
1023}; 1026};
1024 1027
@@ -1035,13 +1038,20 @@ struct nfs_write_data {
1035 unsigned int npages; /* Max length of pagevec */ 1038 unsigned int npages; /* Max length of pagevec */
1036 struct nfs_writeargs args; /* argument struct */ 1039 struct nfs_writeargs args; /* argument struct */
1037 struct nfs_writeres res; /* result struct */ 1040 struct nfs_writeres res; /* result struct */
1041 struct pnfs_layout_segment *lseg;
1042 struct nfs_client *ds_clp; /* pNFS data server */
1043 const struct rpc_call_ops *mds_ops;
1044 int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data);
1038#ifdef CONFIG_NFS_V4 1045#ifdef CONFIG_NFS_V4
1039 unsigned long timestamp; /* For lease renewal */ 1046 unsigned long timestamp; /* For lease renewal */
1040#endif 1047#endif
1048 __u64 mds_offset; /* Filelayout dense stripe */
1041 struct page *page_array[NFS_PAGEVEC_SIZE]; 1049 struct page *page_array[NFS_PAGEVEC_SIZE];
1042}; 1050};
1043 1051
1044struct nfs_access_entry; 1052struct nfs_access_entry;
1053struct nfs_client;
1054struct rpc_timeout;
1045 1055
1046/* 1056/*
1047 * RPC procedure vector for NFSv2/NFSv3 demuxing 1057 * RPC procedure vector for NFSv2/NFSv3 demuxing
@@ -1106,6 +1116,8 @@ struct nfs_rpc_ops {
1106 struct nfs_open_context *ctx, 1116 struct nfs_open_context *ctx,
1107 int open_flags, 1117 int open_flags,
1108 struct iattr *iattr); 1118 struct iattr *iattr);
1119 int (*init_client) (struct nfs_client *, const struct rpc_timeout *,
1120 const char *, rpc_authflavor_t, int);
1109}; 1121};
1110 1122
1111/* 1123/*
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index ef9476a36ff7..db7bcaf7c5bd 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -129,6 +129,7 @@ struct rpc_create_args {
129struct rpc_clnt *rpc_create(struct rpc_create_args *args); 129struct rpc_clnt *rpc_create(struct rpc_create_args *args);
130struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, 130struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *,
131 struct rpc_program *, u32); 131 struct rpc_program *, u32);
132void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt);
132struct rpc_clnt *rpc_clone_client(struct rpc_clnt *); 133struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
133void rpc_shutdown_client(struct rpc_clnt *); 134void rpc_shutdown_client(struct rpc_clnt *);
134void rpc_release_client(struct rpc_clnt *); 135void rpc_release_client(struct rpc_clnt *);
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index bef0f535f746..a0f998c07c65 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -12,7 +12,6 @@
12#include <linux/uio.h> 12#include <linux/uio.h>
13#include <linux/socket.h> 13#include <linux/socket.h>
14#include <linux/in.h> 14#include <linux/in.h>
15#include <linux/kref.h>
16#include <linux/ktime.h> 15#include <linux/ktime.h>
17#include <linux/sunrpc/sched.h> 16#include <linux/sunrpc/sched.h>
18#include <linux/sunrpc/xdr.h> 17#include <linux/sunrpc/xdr.h>
@@ -146,7 +145,7 @@ enum xprt_transports {
146}; 145};
147 146
148struct rpc_xprt { 147struct rpc_xprt {
149 struct kref kref; /* Reference count */ 148 atomic_t count; /* Reference count */
150 struct rpc_xprt_ops * ops; /* transport methods */ 149 struct rpc_xprt_ops * ops; /* transport methods */
151 150
152 const struct rpc_timeout *timeout; /* timeout parms */ 151 const struct rpc_timeout *timeout; /* timeout parms */
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 45dbf1521b9a..f3914d0c5079 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -417,7 +417,7 @@ static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg,
417 gss_msg->msg.len += len; 417 gss_msg->msg.len += len;
418 } 418 }
419 if (mech->gm_upcall_enctypes) { 419 if (mech->gm_upcall_enctypes) {
420 len = sprintf(p, mech->gm_upcall_enctypes); 420 len = sprintf(p, "enctypes=%s ", mech->gm_upcall_enctypes);
421 p += len; 421 p += len;
422 gss_msg->msg.len += len; 422 gss_msg->msg.len += len;
423 } 423 }
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index f375decc024b..9022f0a6503e 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -750,7 +750,7 @@ static struct gss_api_mech gss_kerberos_mech = {
750 .gm_ops = &gss_kerberos_ops, 750 .gm_ops = &gss_kerberos_ops,
751 .gm_pf_num = ARRAY_SIZE(gss_kerberos_pfs), 751 .gm_pf_num = ARRAY_SIZE(gss_kerberos_pfs),
752 .gm_pfs = gss_kerberos_pfs, 752 .gm_pfs = gss_kerberos_pfs,
753 .gm_upcall_enctypes = "enctypes=18,17,16,23,3,1,2 ", 753 .gm_upcall_enctypes = "18,17,16,23,3,1,2",
754}; 754};
755 755
756static int __init init_kerberos_module(void) 756static int __init init_kerberos_module(void)
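Taken together, the two GSS hunks move the "enctypes=" key out of gm_upcall_enctypes and into the sprintf() format in gss_encode_v1_msg(), so the mech string now carries only the comma-separated enctype numbers. A trivial sketch of the resulting upcall fragment (buffer size chosen arbitrarily):

/* Sketch of the gssd upcall fragment produced after this change. */
#include <stdio.h>

int main(void)
{
	const char *gm_upcall_enctypes = "18,17,16,23,3,1,2";
	char buf[64];
	int len;

	len = sprintf(buf, "enctypes=%s ", gm_upcall_enctypes);
	printf("appended %d bytes: \"%s\"\n", len, buf);
	return 0;
}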
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 57d344cf2256..e7a96e478f63 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -436,7 +436,9 @@ void rpc_killall_tasks(struct rpc_clnt *clnt)
436 if (!(rovr->tk_flags & RPC_TASK_KILLED)) { 436 if (!(rovr->tk_flags & RPC_TASK_KILLED)) {
437 rovr->tk_flags |= RPC_TASK_KILLED; 437 rovr->tk_flags |= RPC_TASK_KILLED;
438 rpc_exit(rovr, -EIO); 438 rpc_exit(rovr, -EIO);
439 rpc_wake_up_queued_task(rovr->tk_waitqueue, rovr); 439 if (RPC_IS_QUEUED(rovr))
440 rpc_wake_up_queued_task(rovr->tk_waitqueue,
441 rovr);
440 } 442 }
441 } 443 }
442 spin_unlock(&clnt->cl_lock); 444 spin_unlock(&clnt->cl_lock);
@@ -597,6 +599,14 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
597 } 599 }
598} 600}
599 601
602void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt)
603{
604 rpc_task_release_client(task);
605 rpc_task_set_client(task, clnt);
606}
607EXPORT_SYMBOL_GPL(rpc_task_reset_client);
608
609
600static void 610static void
601rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg) 611rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg)
602{ 612{
@@ -636,12 +646,6 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
636 rpc_task_set_client(task, task_setup_data->rpc_client); 646 rpc_task_set_client(task, task_setup_data->rpc_client);
637 rpc_task_set_rpc_message(task, task_setup_data->rpc_message); 647 rpc_task_set_rpc_message(task, task_setup_data->rpc_message);
638 648
639 if (task->tk_status != 0) {
640 int ret = task->tk_status;
641 rpc_put_task(task);
642 return ERR_PTR(ret);
643 }
644
645 if (task->tk_action == NULL) 649 if (task->tk_action == NULL)
646 rpc_call_start(task); 650 rpc_call_start(task);
647 651
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 3fc8624fcd17..ffb687671da0 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -299,15 +299,8 @@ static void rpc_make_runnable(struct rpc_task *task)
299 if (rpc_test_and_set_running(task)) 299 if (rpc_test_and_set_running(task))
300 return; 300 return;
301 if (RPC_IS_ASYNC(task)) { 301 if (RPC_IS_ASYNC(task)) {
302 int status;
303
304 INIT_WORK(&task->u.tk_work, rpc_async_schedule); 302 INIT_WORK(&task->u.tk_work, rpc_async_schedule);
305 status = queue_work(rpciod_workqueue, &task->u.tk_work); 303 queue_work(rpciod_workqueue, &task->u.tk_work);
306 if (status < 0) {
307 printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status);
308 task->tk_status = status;
309 return;
310 }
311 } else 304 } else
312 wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED); 305 wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
313} 306}
@@ -637,14 +630,12 @@ static void __rpc_execute(struct rpc_task *task)
637 save_callback = task->tk_callback; 630 save_callback = task->tk_callback;
638 task->tk_callback = NULL; 631 task->tk_callback = NULL;
639 save_callback(task); 632 save_callback(task);
640 } 633 } else {
641 634 /*
642 /* 635 * Perform the next FSM step.
643 * Perform the next FSM step. 636 * tk_action may be NULL when the task has been killed
644 * tk_action may be NULL when the task has been killed 637 * by someone else.
645 * by someone else. 638 */
646 */
647 if (!RPC_IS_QUEUED(task)) {
648 if (task->tk_action == NULL) 639 if (task->tk_action == NULL)
649 break; 640 break;
650 task->tk_action(task); 641 task->tk_action(task);
@@ -843,12 +834,6 @@ struct rpc_task *rpc_new_task(const struct rpc_task_setup *setup_data)
843 } 834 }
844 835
845 rpc_init_task(task, setup_data); 836 rpc_init_task(task, setup_data);
846 if (task->tk_status < 0) {
847 int err = task->tk_status;
848 rpc_put_task(task);
849 return ERR_PTR(err);
850 }
851
852 task->tk_flags |= flags; 837 task->tk_flags |= flags;
853 dprintk("RPC: allocated task %p\n", task); 838 dprintk("RPC: allocated task %p\n", task);
854 return task; 839 return task;
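The __rpc_execute() hunk restructures the scheduler loop so a saved tk_callback always runs on its own pass, and only a pass with no pending callback advances tk_action — the "run the tk_callback before tk_action" fix listed in the merge. An editor's standalone sketch of that loop shape, with stub functions standing in for the real RPC state machine:

/* Standalone sketch of the callback-first execute loop above. */
#include <stdio.h>

struct task {
	void (*tk_callback)(struct task *);
	void (*tk_action)(struct task *);
	int queued;
};

static void finish(struct task *t)
{
	printf("action: finish\n");
	t->tk_action = NULL;
}

static void start(struct task *t)
{
	printf("action: start\n");
	t->tk_action = finish;
}

static void reply_received(struct task *t)
{
	(void)t;
	printf("callback: reply processed\n");
}

static void execute_sketch(struct task *t)
{
	for (;;) {
		if (t->tk_callback) {
			void (*save)(struct task *) = t->tk_callback;

			t->tk_callback = NULL;
			save(t);		/* callback gets its own pass */
		} else {
			if (t->tk_action == NULL)
				break;		/* killed or finished */
			t->tk_action(t);	/* next FSM step */
		}
		if (t->queued)
			break;			/* task went back to sleep */
	}
}

int main(void)
{
	struct task t = { .tk_callback = reply_received, .tk_action = start };

	execute_sketch(&t);
	return 0;
}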
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 856274d7e85c..9494c3767356 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -202,10 +202,9 @@ int xprt_reserve_xprt(struct rpc_task *task)
202 goto out_sleep; 202 goto out_sleep;
203 } 203 }
204 xprt->snd_task = task; 204 xprt->snd_task = task;
205 if (req) { 205 req->rq_bytes_sent = 0;
206 req->rq_bytes_sent = 0; 206 req->rq_ntrans++;
207 req->rq_ntrans++; 207
208 }
209 return 1; 208 return 1;
210 209
211out_sleep: 210out_sleep:
@@ -213,7 +212,7 @@ out_sleep:
213 task->tk_pid, xprt); 212 task->tk_pid, xprt);
214 task->tk_timeout = 0; 213 task->tk_timeout = 0;
215 task->tk_status = -EAGAIN; 214 task->tk_status = -EAGAIN;
216 if (req && req->rq_ntrans) 215 if (req->rq_ntrans)
217 rpc_sleep_on(&xprt->resend, task, NULL); 216 rpc_sleep_on(&xprt->resend, task, NULL);
218 else 217 else
219 rpc_sleep_on(&xprt->sending, task, NULL); 218 rpc_sleep_on(&xprt->sending, task, NULL);
@@ -965,7 +964,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, int size, int max_req)
965 xprt = kzalloc(size, GFP_KERNEL); 964 xprt = kzalloc(size, GFP_KERNEL);
966 if (xprt == NULL) 965 if (xprt == NULL)
967 goto out; 966 goto out;
968 kref_init(&xprt->kref); 967 atomic_set(&xprt->count, 1);
969 968
970 xprt->max_reqs = max_req; 969 xprt->max_reqs = max_req;
971 xprt->slot = kcalloc(max_req, sizeof(struct rpc_rqst), GFP_KERNEL); 970 xprt->slot = kcalloc(max_req, sizeof(struct rpc_rqst), GFP_KERNEL);
@@ -1145,13 +1144,11 @@ found:
1145 1144
1146/** 1145/**
1147 * xprt_destroy - destroy an RPC transport, killing off all requests. 1146 * xprt_destroy - destroy an RPC transport, killing off all requests.
1148 * @kref: kref for the transport to destroy 1147 * @xprt: transport to destroy
1149 * 1148 *
1150 */ 1149 */
1151static void xprt_destroy(struct kref *kref) 1150static void xprt_destroy(struct rpc_xprt *xprt)
1152{ 1151{
1153 struct rpc_xprt *xprt = container_of(kref, struct rpc_xprt, kref);
1154
1155 dprintk("RPC: destroying transport %p\n", xprt); 1152 dprintk("RPC: destroying transport %p\n", xprt);
1156 xprt->shutdown = 1; 1153 xprt->shutdown = 1;
1157 del_timer_sync(&xprt->timer); 1154 del_timer_sync(&xprt->timer);
@@ -1175,7 +1172,8 @@ static void xprt_destroy(struct kref *kref)
1175 */ 1172 */
1176void xprt_put(struct rpc_xprt *xprt) 1173void xprt_put(struct rpc_xprt *xprt)
1177{ 1174{
1178 kref_put(&xprt->kref, xprt_destroy); 1175 if (atomic_dec_and_test(&xprt->count))
1176 xprt_destroy(xprt);
1179} 1177}
1180 1178
1181/** 1179/**
@@ -1185,6 +1183,7 @@ void xprt_put(struct rpc_xprt *xprt)
1185 */ 1183 */
1186struct rpc_xprt *xprt_get(struct rpc_xprt *xprt) 1184struct rpc_xprt *xprt_get(struct rpc_xprt *xprt)
1187{ 1185{
1188 kref_get(&xprt->kref); 1186 if (atomic_inc_not_zero(&xprt->count))
1189 return xprt; 1187 return xprt;
1188 return NULL;
1190} 1189}
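The xprt.c hunks replace the transport's struct kref with a bare atomic_t: xprt_get() now succeeds only while the count is still non-zero (atomic_inc_not_zero()), and the final xprt_put() calls xprt_destroy() directly. A userspace approximation using C11 atomics — the CAS loop below stands in for atomic_inc_not_zero() and is not the kernel primitive itself:

/* Userspace approximation of the new xprt reference counting above. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct xprt {
	atomic_int count;
};

static struct xprt *xprt_get(struct xprt *x)
{
	int old = atomic_load(&x->count);

	/* "inc if not zero": bump the count only while the object is live */
	while (old != 0) {
		if (atomic_compare_exchange_weak(&x->count, &old, old + 1))
			return x;
	}
	return NULL;
}

static void xprt_put(struct xprt *x)
{
	if (atomic_fetch_sub(&x->count, 1) == 1) {	/* last reference dropped */
		printf("destroying transport %p\n", (void *)x);
		free(x);
	}
}

int main(void)
{
	struct xprt *x = malloc(sizeof(*x));

	if (x == NULL)
		return 1;
	atomic_init(&x->count, 1);	/* like xprt_alloc(): count starts at one */
	if (xprt_get(x))
		printf("took an extra reference\n");
	xprt_put(x);			/* drop the extra reference */
	xprt_put(x);			/* last put destroys the transport */
	return 0;
}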
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 2ac3f6e8adff..554d0814c875 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -87,6 +87,8 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
87 enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs) 87 enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs)
88{ 88{
89 int len, n = 0, p; 89 int len, n = 0, p;
90 int page_base;
91 struct page **ppages;
90 92
91 if (pos == 0 && xdrbuf->head[0].iov_len) { 93 if (pos == 0 && xdrbuf->head[0].iov_len) {
92 seg[n].mr_page = NULL; 94 seg[n].mr_page = NULL;
@@ -95,34 +97,32 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
95 ++n; 97 ++n;
96 } 98 }
97 99
98 if (xdrbuf->page_len && (xdrbuf->pages[0] != NULL)) { 100 len = xdrbuf->page_len;
99 if (n == nsegs) 101 ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
100 return 0; 102 page_base = xdrbuf->page_base & ~PAGE_MASK;
101 seg[n].mr_page = xdrbuf->pages[0]; 103 p = 0;
102 seg[n].mr_offset = (void *)(unsigned long) xdrbuf->page_base; 104 while (len && n < nsegs) {
103 seg[n].mr_len = min_t(u32, 105 seg[n].mr_page = ppages[p];
104 PAGE_SIZE - xdrbuf->page_base, xdrbuf->page_len); 106 seg[n].mr_offset = (void *)(unsigned long) page_base;
105 len = xdrbuf->page_len - seg[n].mr_len; 107 seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
108 BUG_ON(seg[n].mr_len > PAGE_SIZE);
109 len -= seg[n].mr_len;
106 ++n; 110 ++n;
107 p = 1; 111 ++p;
108 while (len > 0) { 112 page_base = 0; /* page offset only applies to first page */
109 if (n == nsegs)
110 return 0;
111 seg[n].mr_page = xdrbuf->pages[p];
112 seg[n].mr_offset = NULL;
113 seg[n].mr_len = min_t(u32, PAGE_SIZE, len);
114 len -= seg[n].mr_len;
115 ++n;
116 ++p;
117 }
118 } 113 }
119 114
115 /* Message overflows the seg array */
116 if (len && n == nsegs)
117 return 0;
118
120 if (xdrbuf->tail[0].iov_len) { 119 if (xdrbuf->tail[0].iov_len) {
121 /* the rpcrdma protocol allows us to omit any trailing 120 /* the rpcrdma protocol allows us to omit any trailing
122 * xdr pad bytes, saving the server an RDMA operation. */ 121 * xdr pad bytes, saving the server an RDMA operation. */
123 if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) 122 if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
124 return n; 123 return n;
125 if (n == nsegs) 124 if (n == nsegs)
125 /* Tail remains, but we're out of segments */
126 return 0; 126 return 0;
127 seg[n].mr_page = NULL; 127 seg[n].mr_page = NULL;
128 seg[n].mr_offset = xdrbuf->tail[0].iov_base; 128 seg[n].mr_offset = xdrbuf->tail[0].iov_base;
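The rpcrdma_convert_iovs() change above collapses the old "first page is special" logic into one loop that starts at the page containing page_base and zeroes the offset after the first iteration, so a request whose page data begins at a non-zero page_base is marshalled correctly. A simplified sketch of that page walk; xdr_pages_to_segs() and struct pageseg are made-up names, only the arithmetic mirrors the hunk:

#include <linux/kernel.h>
#include <linux/mm.h>

struct pageseg {
	struct page *pg;
	unsigned int offset;
	unsigned int len;
};

/* Walk a page list of page_len bytes starting page_base bytes in,
 * producing at most nsegs segments; returns 0 if the data overflows. */
static int xdr_pages_to_segs(struct page **pages, unsigned int page_base,
			     unsigned int page_len,
			     struct pageseg *seg, int nsegs)
{
	struct page **ppages = pages + (page_base >> PAGE_SHIFT);
	unsigned int base = page_base & ~PAGE_MASK;
	unsigned int len = page_len;
	int n = 0, p = 0;

	while (len && n < nsegs) {
		seg[n].pg = ppages[p];
		seg[n].offset = base;
		seg[n].len = min_t(unsigned int, PAGE_SIZE - base, len);
		len -= seg[n].len;
		++n;
		++p;
		base = 0;	/* offset only applies to the first page */
	}
	return len ? 0 : n;	/* len != 0 means the segment array overflowed */
}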
@@ -296,6 +296,8 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
296 int copy_len; 296 int copy_len;
297 unsigned char *srcp, *destp; 297 unsigned char *srcp, *destp;
298 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); 298 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
299 int page_base;
300 struct page **ppages;
299 301
300 destp = rqst->rq_svec[0].iov_base; 302 destp = rqst->rq_svec[0].iov_base;
301 curlen = rqst->rq_svec[0].iov_len; 303 curlen = rqst->rq_svec[0].iov_len;
@@ -324,28 +326,25 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
324 __func__, destp + copy_len, curlen); 326 __func__, destp + copy_len, curlen);
325 rqst->rq_svec[0].iov_len += curlen; 327 rqst->rq_svec[0].iov_len += curlen;
326 } 328 }
327
328 r_xprt->rx_stats.pullup_copy_count += copy_len; 329 r_xprt->rx_stats.pullup_copy_count += copy_len;
329 npages = PAGE_ALIGN(rqst->rq_snd_buf.page_base+copy_len) >> PAGE_SHIFT; 330
331 page_base = rqst->rq_snd_buf.page_base;
332 ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT);
333 page_base &= ~PAGE_MASK;
334 npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT;
330 for (i = 0; copy_len && i < npages; i++) { 335 for (i = 0; copy_len && i < npages; i++) {
331 if (i == 0) 336 curlen = PAGE_SIZE - page_base;
332 curlen = PAGE_SIZE - rqst->rq_snd_buf.page_base;
333 else
334 curlen = PAGE_SIZE;
335 if (curlen > copy_len) 337 if (curlen > copy_len)
336 curlen = copy_len; 338 curlen = copy_len;
337 dprintk("RPC: %s: page %d destp 0x%p len %d curlen %d\n", 339 dprintk("RPC: %s: page %d destp 0x%p len %d curlen %d\n",
338 __func__, i, destp, copy_len, curlen); 340 __func__, i, destp, copy_len, curlen);
339 srcp = kmap_atomic(rqst->rq_snd_buf.pages[i], 341 srcp = kmap_atomic(ppages[i], KM_SKB_SUNRPC_DATA);
340 KM_SKB_SUNRPC_DATA); 342 memcpy(destp, srcp+page_base, curlen);
341 if (i == 0)
342 memcpy(destp, srcp+rqst->rq_snd_buf.page_base, curlen);
343 else
344 memcpy(destp, srcp, curlen);
345 kunmap_atomic(srcp, KM_SKB_SUNRPC_DATA); 343 kunmap_atomic(srcp, KM_SKB_SUNRPC_DATA);
346 rqst->rq_svec[0].iov_len += curlen; 344 rqst->rq_svec[0].iov_len += curlen;
347 destp += curlen; 345 destp += curlen;
348 copy_len -= curlen; 346 copy_len -= curlen;
347 page_base = 0;
349 } 348 }
350 /* header now contains entire send message */ 349 /* header now contains entire send message */
351 return pad; 350 return pad;
@@ -606,6 +605,8 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
606{ 605{
607 int i, npages, curlen, olen; 606 int i, npages, curlen, olen;
608 char *destp; 607 char *destp;
608 struct page **ppages;
609 int page_base;
609 610
610 curlen = rqst->rq_rcv_buf.head[0].iov_len; 611 curlen = rqst->rq_rcv_buf.head[0].iov_len;
611 if (curlen > copy_len) { /* write chunk header fixup */ 612 if (curlen > copy_len) { /* write chunk header fixup */
@@ -624,32 +625,29 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
624 olen = copy_len; 625 olen = copy_len;
625 i = 0; 626 i = 0;
626 rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen; 627 rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
628 page_base = rqst->rq_rcv_buf.page_base;
629 ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT);
630 page_base &= ~PAGE_MASK;
631
627 if (copy_len && rqst->rq_rcv_buf.page_len) { 632 if (copy_len && rqst->rq_rcv_buf.page_len) {
628 npages = PAGE_ALIGN(rqst->rq_rcv_buf.page_base + 633 npages = PAGE_ALIGN(page_base +
629 rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT; 634 rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT;
630 for (; i < npages; i++) { 635 for (; i < npages; i++) {
631 if (i == 0) 636 curlen = PAGE_SIZE - page_base;
632 curlen = PAGE_SIZE - rqst->rq_rcv_buf.page_base;
633 else
634 curlen = PAGE_SIZE;
635 if (curlen > copy_len) 637 if (curlen > copy_len)
636 curlen = copy_len; 638 curlen = copy_len;
637 dprintk("RPC: %s: page %d" 639 dprintk("RPC: %s: page %d"
638 " srcp 0x%p len %d curlen %d\n", 640 " srcp 0x%p len %d curlen %d\n",
639 __func__, i, srcp, copy_len, curlen); 641 __func__, i, srcp, copy_len, curlen);
640 destp = kmap_atomic(rqst->rq_rcv_buf.pages[i], 642 destp = kmap_atomic(ppages[i], KM_SKB_SUNRPC_DATA);
641 KM_SKB_SUNRPC_DATA); 643 memcpy(destp + page_base, srcp, curlen);
642 if (i == 0) 644 flush_dcache_page(ppages[i]);
643 memcpy(destp + rqst->rq_rcv_buf.page_base,
644 srcp, curlen);
645 else
646 memcpy(destp, srcp, curlen);
647 flush_dcache_page(rqst->rq_rcv_buf.pages[i]);
648 kunmap_atomic(destp, KM_SKB_SUNRPC_DATA); 645 kunmap_atomic(destp, KM_SKB_SUNRPC_DATA);
649 srcp += curlen; 646 srcp += curlen;
650 copy_len -= curlen; 647 copy_len -= curlen;
651 if (copy_len == 0) 648 if (copy_len == 0)
652 break; 649 break;
650 page_base = 0;
653 } 651 }
654 rqst->rq_rcv_buf.page_len = olen - copy_len; 652 rqst->rq_rcv_buf.page_len = olen - copy_len;
655 } else 653 } else
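rpcrdma_inline_pullup() (send side) and rpcrdma_inline_fixup() (receive side) now share the same idiom: compute ppages and page_base once, copy PAGE_SIZE - page_base bytes for the first page, then reset page_base to zero so every later page is handled from offset 0. A hedged sketch of the receive-side direction, using the era's kmap_atomic()/kunmap_atomic() with the KM_SKB_SUNRPC_DATA slot as in the hunk above; copy_into_pages() is a hypothetical helper, not the kernel function:

#include <linux/highmem.h>
#include <linux/string.h>

/* Copy copy_len bytes from srcp into a page list, starting page_base
 * bytes into the first page. */
static void copy_into_pages(struct page **pages, unsigned int page_base,
			    const char *srcp, unsigned int copy_len)
{
	struct page **ppages = pages + (page_base >> PAGE_SHIFT);
	unsigned int base = page_base & ~PAGE_MASK;
	unsigned int curlen;
	char *destp;
	int i;

	for (i = 0; copy_len != 0; i++) {
		curlen = PAGE_SIZE - base;
		if (curlen > copy_len)
			curlen = copy_len;
		destp = kmap_atomic(ppages[i], KM_SKB_SUNRPC_DATA);
		memcpy(destp + base, srcp, curlen);
		flush_dcache_page(ppages[i]);
		kunmap_atomic(destp, KM_SKB_SUNRPC_DATA);
		srcp += curlen;
		copy_len -= curlen;
		base = 0;	/* later pages start at offset 0 */
	}
}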
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 5f4c7b3bc711..d4297dc43dc4 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -144,6 +144,7 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
144static inline 144static inline
145void rpcrdma_event_process(struct ib_wc *wc) 145void rpcrdma_event_process(struct ib_wc *wc)
146{ 146{
147 struct rpcrdma_mw *frmr;
147 struct rpcrdma_rep *rep = 148 struct rpcrdma_rep *rep =
148 (struct rpcrdma_rep *)(unsigned long) wc->wr_id; 149 (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
149 150
@@ -154,15 +155,23 @@ void rpcrdma_event_process(struct ib_wc *wc)
154 return; 155 return;
155 156
156 if (IB_WC_SUCCESS != wc->status) { 157 if (IB_WC_SUCCESS != wc->status) {
157 dprintk("RPC: %s: %s WC status %X, connection lost\n", 158 dprintk("RPC: %s: WC opcode %d status %X, connection lost\n",
158 __func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send", 159 __func__, wc->opcode, wc->status);
159 wc->status);
160 rep->rr_len = ~0U; 160 rep->rr_len = ~0U;
161 rpcrdma_schedule_tasklet(rep); 161 if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
162 rpcrdma_schedule_tasklet(rep);
162 return; 163 return;
163 } 164 }
164 165
165 switch (wc->opcode) { 166 switch (wc->opcode) {
167 case IB_WC_FAST_REG_MR:
168 frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
169 frmr->r.frmr.state = FRMR_IS_VALID;
170 break;
171 case IB_WC_LOCAL_INV:
172 frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
173 frmr->r.frmr.state = FRMR_IS_INVALID;
174 break;
166 case IB_WC_RECV: 175 case IB_WC_RECV:
167 rep->rr_len = wc->byte_len; 176 rep->rr_len = wc->byte_len;
168 ib_dma_sync_single_for_cpu( 177 ib_dma_sync_single_for_cpu(
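The completion-handler change gives FAST_REG_MR and LOCAL_INV work requests a wr_id pointing at the memory window, so their completions can record whether the FRMR ended up valid or invalid; that recorded state is what the registration path checks later. A reduced sketch of the bookkeeping, with struct mw standing in for struct rpcrdma_mw:

#include <rdma/ib_verbs.h>

enum frmr_state { FRMR_IS_INVALID, FRMR_IS_VALID };

struct mw {				/* stand-in for struct rpcrdma_mw */
	enum frmr_state state;
};

/* Record FRMR validity from fast-register / local-invalidate completions;
 * the wr_id is assumed to carry a pointer to the owning memory window. */
static void handle_frmr_wc(struct ib_wc *wc)
{
	struct mw *mw = (struct mw *)(unsigned long)wc->wr_id;

	switch (wc->opcode) {
	case IB_WC_FAST_REG_MR:
		mw->state = FRMR_IS_VALID;
		break;
	case IB_WC_LOCAL_INV:
		mw->state = FRMR_IS_INVALID;
		break;
	default:
		break;
	}
}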
@@ -1450,6 +1459,12 @@ rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1450 seg->mr_dma = ib_dma_map_single(ia->ri_id->device, 1459 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1451 seg->mr_offset, 1460 seg->mr_offset,
1452 seg->mr_dmalen, seg->mr_dir); 1461 seg->mr_dmalen, seg->mr_dir);
1462 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1463 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1464 __func__,
1465 (unsigned long long)seg->mr_dma,
1466 seg->mr_offset, seg->mr_dmalen);
1467 }
1453} 1468}
1454 1469
1455static void 1470static void
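rpcrdma_map_one() now runs ib_dma_mapping_error() on the address returned by ib_dma_map_single(); in the hunk above the failure is only logged. A small sketch of the check itself, in a hypothetical helper that instead reports the failure to its caller:

#include <linux/errno.h>
#include <rdma/ib_verbs.h>

/* Map a buffer for DMA and detect mapping failure before it is used in
 * a work request; dev, addr and dir come from the caller. */
static int map_seg(struct ib_device *dev, void *addr, size_t len,
		   enum dma_data_direction dir, u64 *dma)
{
	*dma = ib_dma_map_single(dev, addr, len, dir);
	if (ib_dma_mapping_error(dev, *dma))
		return -EIO;	/* do not post a WR with this address */
	return 0;
}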
@@ -1469,7 +1484,8 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1469 struct rpcrdma_xprt *r_xprt) 1484 struct rpcrdma_xprt *r_xprt)
1470{ 1485{
1471 struct rpcrdma_mr_seg *seg1 = seg; 1486 struct rpcrdma_mr_seg *seg1 = seg;
1472 struct ib_send_wr frmr_wr, *bad_wr; 1487 struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
1488
1473 u8 key; 1489 u8 key;
1474 int len, pageoff; 1490 int len, pageoff;
1475 int i, rc; 1491 int i, rc;
@@ -1484,6 +1500,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1484 rpcrdma_map_one(ia, seg, writing); 1500 rpcrdma_map_one(ia, seg, writing);
1485 seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma; 1501 seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
1486 len += seg->mr_len; 1502 len += seg->mr_len;
1503 BUG_ON(seg->mr_len > PAGE_SIZE);
1487 ++seg; 1504 ++seg;
1488 ++i; 1505 ++i;
1489 /* Check for holes */ 1506 /* Check for holes */
@@ -1494,26 +1511,45 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1494 dprintk("RPC: %s: Using frmr %p to map %d segments\n", 1511 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1495 __func__, seg1->mr_chunk.rl_mw, i); 1512 __func__, seg1->mr_chunk.rl_mw, i);
1496 1513
1514 if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
1515 dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n",
1516 __func__,
1517 seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
1518 /* Invalidate before using. */
1519 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1520 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1521 invalidate_wr.next = &frmr_wr;
1522 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1523 invalidate_wr.send_flags = IB_SEND_SIGNALED;
1524 invalidate_wr.ex.invalidate_rkey =
1525 seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1526 DECR_CQCOUNT(&r_xprt->rx_ep);
1527 post_wr = &invalidate_wr;
1528 } else
1529 post_wr = &frmr_wr;
1530
1497 /* Bump the key */ 1531 /* Bump the key */
1498 key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); 1532 key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1499 ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); 1533 ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1500 1534
1501 /* Prepare FRMR WR */ 1535 /* Prepare FRMR WR */
1502 memset(&frmr_wr, 0, sizeof frmr_wr); 1536 memset(&frmr_wr, 0, sizeof frmr_wr);
1537 frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1503 frmr_wr.opcode = IB_WR_FAST_REG_MR; 1538 frmr_wr.opcode = IB_WR_FAST_REG_MR;
1504 frmr_wr.send_flags = 0; /* unsignaled */ 1539 frmr_wr.send_flags = IB_SEND_SIGNALED;
1505 frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma; 1540 frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1506 frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl; 1541 frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
1507 frmr_wr.wr.fast_reg.page_list_len = i; 1542 frmr_wr.wr.fast_reg.page_list_len = i;
1508 frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; 1543 frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1509 frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT; 1544 frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
1545 BUG_ON(frmr_wr.wr.fast_reg.length < len);
1510 frmr_wr.wr.fast_reg.access_flags = (writing ? 1546 frmr_wr.wr.fast_reg.access_flags = (writing ?
1511 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : 1547 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1512 IB_ACCESS_REMOTE_READ); 1548 IB_ACCESS_REMOTE_READ);
1513 frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; 1549 frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1514 DECR_CQCOUNT(&r_xprt->rx_ep); 1550 DECR_CQCOUNT(&r_xprt->rx_ep);
1515 1551
1516 rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr); 1552 rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
1517 1553
1518 if (rc) { 1554 if (rc) {
1519 dprintk("RPC: %s: failed ib_post_send for register," 1555 dprintk("RPC: %s: failed ib_post_send for register,"
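When the FRMR was left valid by an earlier flushed or failed invalidate, the registration path above builds a LOCAL_INV work request, chains it ahead of the FAST_REG_MR request through invalidate_wr.next, and posts the pair with a single ib_post_send(); both are signaled so the completion handler keeps the state accurate. A reduced sketch of that chaining; post_frmr() and its parameters are illustrative, and only the fields exercised by the hunk are set:

#include <linux/string.h>
#include <rdma/ib_verbs.h>

/* Post FAST_REG_MR, optionally preceded by a chained LOCAL_INV when the
 * FRMR is still registered; mw_cookie becomes the wr_id of both WRs. */
static int post_frmr(struct ib_qp *qp, void *mw_cookie, u32 rkey,
		     struct ib_fast_reg_page_list *pgl, int npages,
		     u64 iova, int access, bool still_valid)
{
	struct ib_send_wr inv_wr, reg_wr, *bad_wr, *first;

	memset(&reg_wr, 0, sizeof(reg_wr));
	reg_wr.wr_id = (unsigned long)mw_cookie;
	reg_wr.opcode = IB_WR_FAST_REG_MR;
	reg_wr.send_flags = IB_SEND_SIGNALED;
	reg_wr.wr.fast_reg.iova_start = iova;
	reg_wr.wr.fast_reg.page_list = pgl;
	reg_wr.wr.fast_reg.page_list_len = npages;
	reg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
	reg_wr.wr.fast_reg.length = npages << PAGE_SHIFT;
	reg_wr.wr.fast_reg.access_flags = access;
	reg_wr.wr.fast_reg.rkey = rkey;

	first = &reg_wr;
	if (still_valid) {
		/* Invalidate the stale registration before reusing the FRMR. */
		memset(&inv_wr, 0, sizeof(inv_wr));
		inv_wr.wr_id = (unsigned long)mw_cookie;
		inv_wr.opcode = IB_WR_LOCAL_INV;
		inv_wr.send_flags = IB_SEND_SIGNALED;
		inv_wr.ex.invalidate_rkey = rkey;
		inv_wr.next = &reg_wr;
		first = &inv_wr;
	}
	return ib_post_send(qp, first, &bad_wr);
}

Signaling both work requests is what lets the completion-side state tracking shown earlier stay in sync with the hardware's view of the FRMR.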
@@ -1542,8 +1578,9 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1542 rpcrdma_unmap_one(ia, seg++); 1578 rpcrdma_unmap_one(ia, seg++);
1543 1579
1544 memset(&invalidate_wr, 0, sizeof invalidate_wr); 1580 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1581 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1545 invalidate_wr.opcode = IB_WR_LOCAL_INV; 1582 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1546 invalidate_wr.send_flags = 0; /* unsignaled */ 1583 invalidate_wr.send_flags = IB_SEND_SIGNALED;
1547 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; 1584 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1548 DECR_CQCOUNT(&r_xprt->rx_ep); 1585 DECR_CQCOUNT(&r_xprt->rx_ep);
1549 1586
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index c7a7eba991bc..cae761a8536c 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -164,6 +164,7 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
164 struct { 164 struct {
165 struct ib_fast_reg_page_list *fr_pgl; 165 struct ib_fast_reg_page_list *fr_pgl;
166 struct ib_mr *fr_mr; 166 struct ib_mr *fr_mr;
167 enum { FRMR_IS_INVALID, FRMR_IS_VALID } state;
167 } frmr; 168 } frmr;
168 } r; 169 } r;
169 struct list_head mw_list; 170 struct list_head mw_list;