aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/kernel-parameters.txt7
-rw-r--r--fs/Kconfig8
-rw-r--r--fs/lockd/mon.c3
-rw-r--r--fs/lockd/xdr.c8
-rw-r--r--fs/lockd/xdr4.c8
-rw-r--r--fs/nfs/Makefile1
-rw-r--r--fs/nfs/client.c49
-rw-r--r--fs/nfs/delegation.c6
-rw-r--r--fs/nfs/dir.c263
-rw-r--r--fs/nfs/direct.c8
-rw-r--r--fs/nfs/file.c105
-rw-r--r--fs/nfs/inode.c273
-rw-r--r--fs/nfs/internal.h50
-rw-r--r--fs/nfs/nfs2xdr.c20
-rw-r--r--fs/nfs/nfs3acl.c2
-rw-r--r--fs/nfs/nfs3proc.c17
-rw-r--r--fs/nfs/nfs3xdr.c25
-rw-r--r--fs/nfs/nfs4proc.c85
-rw-r--r--fs/nfs/nfs4state.c2
-rw-r--r--fs/nfs/nfs4xdr.c72
-rw-r--r--fs/nfs/nfsroot.c3
-rw-r--r--fs/nfs/proc.c5
-rw-r--r--fs/nfs/read.c9
-rw-r--r--fs/nfs/super.c393
-rw-r--r--fs/nfs/unlink.c3
-rw-r--r--fs/nfs/write.c199
-rw-r--r--fs/nfsd/nfs4xdr.c16
-rw-r--r--include/linux/jiffies.h4
-rw-r--r--include/linux/nfs_fs.h78
-rw-r--r--include/linux/nfs_page.h1
-rw-r--r--include/linux/nfs_xdr.h6
-rw-r--r--include/linux/sunrpc/clnt.h2
-rw-r--r--include/linux/sunrpc/debug.h5
-rw-r--r--include/linux/sunrpc/msg_prot.h13
-rw-r--r--include/linux/sunrpc/rpc_rdma.h116
-rw-r--r--include/linux/sunrpc/xdr.h5
-rw-r--r--include/linux/sunrpc/xprt.h42
-rw-r--r--include/linux/sunrpc/xprtrdma.h85
-rw-r--r--include/linux/sunrpc/xprtsock.h51
-rw-r--r--include/linux/writeback.h2
-rw-r--r--kernel/auditsc.c1
-rw-r--r--net/sunrpc/Makefile1
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_wrap.c6
-rw-r--r--net/sunrpc/clnt.c52
-rw-r--r--net/sunrpc/rpc_pipe.c8
-rw-r--r--net/sunrpc/rpcb_clnt.c151
-rw-r--r--net/sunrpc/sched.c2
-rw-r--r--net/sunrpc/socklib.c3
-rw-r--r--net/sunrpc/sunrpc_syms.c2
-rw-r--r--net/sunrpc/timer.c4
-rw-r--r--net/sunrpc/xprt.c116
-rw-r--r--net/sunrpc/xprtrdma/Makefile3
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c868
-rw-r--r--net/sunrpc/xprtrdma/transport.c800
-rw-r--r--net/sunrpc/xprtrdma/verbs.c1626
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h330
-rw-r--r--net/sunrpc/xprtsock.c567
57 files changed, 5467 insertions, 1123 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c323778270ff..fdd6dbcf864e 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1083,6 +1083,13 @@ and is between 256 and 4096 characters. It is defined in the file
1083 [NFS] set the maximum lifetime for idmapper cache 1083 [NFS] set the maximum lifetime for idmapper cache
1084 entries. 1084 entries.
1085 1085
1086 nfs.enable_ino64=
1087 [NFS] enable 64-bit inode numbers.
1088 If zero, the NFS client will fake up a 32-bit inode
1089 number for the readdir() and stat() syscalls instead
1090 of returning the full 64-bit number.
1091 The default is to return 64-bit inode numbers.
1092
1086 nmi_watchdog= [KNL,BUGS=X86-32] Debugging features for SMP kernels 1093 nmi_watchdog= [KNL,BUGS=X86-32] Debugging features for SMP kernels
1087 1094
1088 no387 [BUGS=X86-32] Tells the kernel to use the 387 maths 1095 no387 [BUGS=X86-32] Tells the kernel to use the 387 maths
diff --git a/fs/Kconfig b/fs/Kconfig
index bb02b39380a3..815d201d8600 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1755,6 +1755,14 @@ config SUNRPC
1755config SUNRPC_GSS 1755config SUNRPC_GSS
1756 tristate 1756 tristate
1757 1757
1758config SUNRPC_XPRT_RDMA
1759 tristate "RDMA transport for sunrpc (EXPERIMENTAL)"
1760 depends on SUNRPC && INFINIBAND && EXPERIMENTAL
1761 default m
1762 help
1763 Adds a client RPC transport for supporting kernel NFS over RDMA
1764 mounts, including Infiniband and iWARP. Experimental.
1765
1758config SUNRPC_BIND34 1766config SUNRPC_BIND34
1759 bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)" 1767 bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)"
1760 depends on SUNRPC && EXPERIMENTAL 1768 depends on SUNRPC && EXPERIMENTAL
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 3353ed8421a7..908b23fadd05 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -10,6 +10,7 @@
10#include <linux/utsname.h> 10#include <linux/utsname.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/sunrpc/clnt.h> 12#include <linux/sunrpc/clnt.h>
13#include <linux/sunrpc/xprtsock.h>
13#include <linux/sunrpc/svc.h> 14#include <linux/sunrpc/svc.h>
14#include <linux/lockd/lockd.h> 15#include <linux/lockd/lockd.h>
15#include <linux/lockd/sm_inter.h> 16#include <linux/lockd/sm_inter.h>
@@ -132,7 +133,7 @@ nsm_create(void)
132 .sin_port = 0, 133 .sin_port = 0,
133 }; 134 };
134 struct rpc_create_args args = { 135 struct rpc_create_args args = {
135 .protocol = IPPROTO_UDP, 136 .protocol = XPRT_TRANSPORT_UDP,
136 .address = (struct sockaddr *)&sin, 137 .address = (struct sockaddr *)&sin,
137 .addrsize = sizeof(sin), 138 .addrsize = sizeof(sin),
138 .servername = "localhost", 139 .servername = "localhost",
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 5316e307a49d..633653bff944 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -62,8 +62,9 @@ static __be32 *nlm_decode_cookie(__be32 *p, struct nlm_cookie *c)
62 } 62 }
63 else 63 else
64 { 64 {
65 printk(KERN_NOTICE 65 dprintk("lockd: bad cookie size %d (only cookies under "
66 "lockd: bad cookie size %d (only cookies under %d bytes are supported.)\n", len, NLM_MAXCOOKIELEN); 66 "%d bytes are supported.)\n",
67 len, NLM_MAXCOOKIELEN);
67 return NULL; 68 return NULL;
68 } 69 }
69 return p; 70 return p;
@@ -84,8 +85,7 @@ nlm_decode_fh(__be32 *p, struct nfs_fh *f)
84 unsigned int len; 85 unsigned int len;
85 86
86 if ((len = ntohl(*p++)) != NFS2_FHSIZE) { 87 if ((len = ntohl(*p++)) != NFS2_FHSIZE) {
87 printk(KERN_NOTICE 88 dprintk("lockd: bad fhandle size %d (should be %d)\n",
88 "lockd: bad fhandle size %d (should be %d)\n",
89 len, NFS2_FHSIZE); 89 len, NFS2_FHSIZE);
90 return NULL; 90 return NULL;
91 } 91 }
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 846fc1d639dd..43ff9397e6c6 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -64,8 +64,9 @@ nlm4_decode_cookie(__be32 *p, struct nlm_cookie *c)
64 } 64 }
65 else 65 else
66 { 66 {
67 printk(KERN_NOTICE 67 dprintk("lockd: bad cookie size %d (only cookies under "
68 "lockd: bad cookie size %d (only cookies under %d bytes are supported.)\n", len, NLM_MAXCOOKIELEN); 68 "%d bytes are supported.)\n",
69 len, NLM_MAXCOOKIELEN);
69 return NULL; 70 return NULL;
70 } 71 }
71 return p; 72 return p;
@@ -86,8 +87,7 @@ nlm4_decode_fh(__be32 *p, struct nfs_fh *f)
86 memset(f->data, 0, sizeof(f->data)); 87 memset(f->data, 0, sizeof(f->data));
87 f->size = ntohl(*p++); 88 f->size = ntohl(*p++);
88 if (f->size > NFS_MAXFHSIZE) { 89 if (f->size > NFS_MAXFHSIZE) {
89 printk(KERN_NOTICE 90 dprintk("lockd: bad fhandle size %d (should be <=%d)\n",
90 "lockd: bad fhandle size %d (should be <=%d)\n",
91 f->size, NFS_MAXFHSIZE); 91 f->size, NFS_MAXFHSIZE);
92 return NULL; 92 return NULL;
93 } 93 }
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index b55cb236cf74..df0f41e09885 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -16,4 +16,3 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
16 nfs4namespace.o 16 nfs4namespace.o
17nfs-$(CONFIG_NFS_DIRECTIO) += direct.o 17nfs-$(CONFIG_NFS_DIRECTIO) += direct.o
18nfs-$(CONFIG_SYSCTL) += sysctl.o 18nfs-$(CONFIG_SYSCTL) += sysctl.o
19nfs-objs := $(nfs-y)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index a204484072f3..a532ee12740a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -23,6 +23,8 @@
23#include <linux/sunrpc/clnt.h> 23#include <linux/sunrpc/clnt.h>
24#include <linux/sunrpc/stats.h> 24#include <linux/sunrpc/stats.h>
25#include <linux/sunrpc/metrics.h> 25#include <linux/sunrpc/metrics.h>
26#include <linux/sunrpc/xprtsock.h>
27#include <linux/sunrpc/xprtrdma.h>
26#include <linux/nfs_fs.h> 28#include <linux/nfs_fs.h>
27#include <linux/nfs_mount.h> 29#include <linux/nfs_mount.h>
28#include <linux/nfs4_mount.h> 30#include <linux/nfs4_mount.h>
@@ -340,7 +342,8 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
340 to->to_retries = 2; 342 to->to_retries = 2;
341 343
342 switch (proto) { 344 switch (proto) {
343 case IPPROTO_TCP: 345 case XPRT_TRANSPORT_TCP:
346 case XPRT_TRANSPORT_RDMA:
344 if (!to->to_initval) 347 if (!to->to_initval)
345 to->to_initval = 60 * HZ; 348 to->to_initval = 60 * HZ;
346 if (to->to_initval > NFS_MAX_TCP_TIMEOUT) 349 if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
@@ -349,7 +352,7 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
349 to->to_maxval = to->to_initval + (to->to_increment * to->to_retries); 352 to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
350 to->to_exponential = 0; 353 to->to_exponential = 0;
351 break; 354 break;
352 case IPPROTO_UDP: 355 case XPRT_TRANSPORT_UDP:
353 default: 356 default:
354 if (!to->to_initval) 357 if (!to->to_initval)
355 to->to_initval = 11 * HZ / 10; 358 to->to_initval = 11 * HZ / 10;
@@ -501,9 +504,9 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, rpc_authflavor_t
501/* 504/*
502 * Initialise an NFS2 or NFS3 client 505 * Initialise an NFS2 or NFS3 client
503 */ 506 */
504static int nfs_init_client(struct nfs_client *clp, const struct nfs_mount_data *data) 507static int nfs_init_client(struct nfs_client *clp,
508 const struct nfs_parsed_mount_data *data)
505{ 509{
506 int proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
507 int error; 510 int error;
508 511
509 if (clp->cl_cons_state == NFS_CS_READY) { 512 if (clp->cl_cons_state == NFS_CS_READY) {
@@ -522,8 +525,8 @@ static int nfs_init_client(struct nfs_client *clp, const struct nfs_mount_data *
522 * Create a client RPC handle for doing FSSTAT with UNIX auth only 525 * Create a client RPC handle for doing FSSTAT with UNIX auth only
523 * - RFC 2623, sec 2.3.2 526 * - RFC 2623, sec 2.3.2
524 */ 527 */
525 error = nfs_create_rpc_client(clp, proto, data->timeo, data->retrans, 528 error = nfs_create_rpc_client(clp, data->nfs_server.protocol,
526 RPC_AUTH_UNIX, 0); 529 data->timeo, data->retrans, RPC_AUTH_UNIX, 0);
527 if (error < 0) 530 if (error < 0)
528 goto error; 531 goto error;
529 nfs_mark_client_ready(clp, NFS_CS_READY); 532 nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -538,7 +541,8 @@ error:
538/* 541/*
539 * Create a version 2 or 3 client 542 * Create a version 2 or 3 client
540 */ 543 */
541static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_data *data) 544static int nfs_init_server(struct nfs_server *server,
545 const struct nfs_parsed_mount_data *data)
542{ 546{
543 struct nfs_client *clp; 547 struct nfs_client *clp;
544 int error, nfsvers = 2; 548 int error, nfsvers = 2;
@@ -551,7 +555,8 @@ static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_dat
551#endif 555#endif
552 556
553 /* Allocate or find a client reference we can use */ 557 /* Allocate or find a client reference we can use */
554 clp = nfs_get_client(data->hostname, &data->addr, nfsvers); 558 clp = nfs_get_client(data->nfs_server.hostname,
559 &data->nfs_server.address, nfsvers);
555 if (IS_ERR(clp)) { 560 if (IS_ERR(clp)) {
556 dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); 561 dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
557 return PTR_ERR(clp); 562 return PTR_ERR(clp);
@@ -581,7 +586,7 @@ static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_dat
581 if (error < 0) 586 if (error < 0)
582 goto error; 587 goto error;
583 588
584 error = nfs_init_server_rpcclient(server, data->pseudoflavor); 589 error = nfs_init_server_rpcclient(server, data->auth_flavors[0]);
585 if (error < 0) 590 if (error < 0)
586 goto error; 591 goto error;
587 592
@@ -760,7 +765,7 @@ void nfs_free_server(struct nfs_server *server)
760 * Create a version 2 or 3 volume record 765 * Create a version 2 or 3 volume record
761 * - keyed on server and FSID 766 * - keyed on server and FSID
762 */ 767 */
763struct nfs_server *nfs_create_server(const struct nfs_mount_data *data, 768struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
764 struct nfs_fh *mntfh) 769 struct nfs_fh *mntfh)
765{ 770{
766 struct nfs_server *server; 771 struct nfs_server *server;
@@ -906,7 +911,7 @@ error:
906 * Create a version 4 volume record 911 * Create a version 4 volume record
907 */ 912 */
908static int nfs4_init_server(struct nfs_server *server, 913static int nfs4_init_server(struct nfs_server *server,
909 const struct nfs4_mount_data *data, rpc_authflavor_t authflavour) 914 const struct nfs_parsed_mount_data *data)
910{ 915{
911 int error; 916 int error;
912 917
@@ -926,7 +931,7 @@ static int nfs4_init_server(struct nfs_server *server,
926 server->acdirmin = data->acdirmin * HZ; 931 server->acdirmin = data->acdirmin * HZ;
927 server->acdirmax = data->acdirmax * HZ; 932 server->acdirmax = data->acdirmax * HZ;
928 933
929 error = nfs_init_server_rpcclient(server, authflavour); 934 error = nfs_init_server_rpcclient(server, data->auth_flavors[0]);
930 935
931 /* Done */ 936 /* Done */
932 dprintk("<-- nfs4_init_server() = %d\n", error); 937 dprintk("<-- nfs4_init_server() = %d\n", error);
@@ -937,12 +942,7 @@ static int nfs4_init_server(struct nfs_server *server,
937 * Create a version 4 volume record 942 * Create a version 4 volume record
938 * - keyed on server and FSID 943 * - keyed on server and FSID
939 */ 944 */
940struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data, 945struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
941 const char *hostname,
942 const struct sockaddr_in *addr,
943 const char *mntpath,
944 const char *ip_addr,
945 rpc_authflavor_t authflavour,
946 struct nfs_fh *mntfh) 946 struct nfs_fh *mntfh)
947{ 947{
948 struct nfs_fattr fattr; 948 struct nfs_fattr fattr;
@@ -956,13 +956,18 @@ struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data,
956 return ERR_PTR(-ENOMEM); 956 return ERR_PTR(-ENOMEM);
957 957
958 /* Get a client record */ 958 /* Get a client record */
959 error = nfs4_set_client(server, hostname, addr, ip_addr, authflavour, 959 error = nfs4_set_client(server,
960 data->proto, data->timeo, data->retrans); 960 data->nfs_server.hostname,
961 &data->nfs_server.address,
962 data->client_address,
963 data->auth_flavors[0],
964 data->nfs_server.protocol,
965 data->timeo, data->retrans);
961 if (error < 0) 966 if (error < 0)
962 goto error; 967 goto error;
963 968
964 /* set up the general RPC client */ 969 /* set up the general RPC client */
965 error = nfs4_init_server(server, data, authflavour); 970 error = nfs4_init_server(server, data);
966 if (error < 0) 971 if (error < 0)
967 goto error; 972 goto error;
968 973
@@ -971,7 +976,7 @@ struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data,
971 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); 976 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
972 977
973 /* Probe the root fh to retrieve its FSID */ 978 /* Probe the root fh to retrieve its FSID */
974 error = nfs4_path_walk(server, mntfh, mntpath); 979 error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path);
975 if (error < 0) 980 if (error < 0)
976 goto error; 981 goto error;
977 982
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index c55a761c22bb..af8b235d405d 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -52,7 +52,7 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
52 for (fl = inode->i_flock; fl != 0; fl = fl->fl_next) { 52 for (fl = inode->i_flock; fl != 0; fl = fl->fl_next) {
53 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 53 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
54 continue; 54 continue;
55 if ((struct nfs_open_context *)fl->fl_file->private_data != ctx) 55 if (nfs_file_open_context(fl->fl_file) != ctx)
56 continue; 56 continue;
57 status = nfs4_lock_delegation_recall(state, fl); 57 status = nfs4_lock_delegation_recall(state, fl);
58 if (status >= 0) 58 if (status >= 0)
@@ -109,6 +109,7 @@ again:
109void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) 109void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
110{ 110{
111 struct nfs_delegation *delegation = NFS_I(inode)->delegation; 111 struct nfs_delegation *delegation = NFS_I(inode)->delegation;
112 struct rpc_cred *oldcred;
112 113
113 if (delegation == NULL) 114 if (delegation == NULL)
114 return; 115 return;
@@ -116,11 +117,12 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st
116 sizeof(delegation->stateid.data)); 117 sizeof(delegation->stateid.data));
117 delegation->type = res->delegation_type; 118 delegation->type = res->delegation_type;
118 delegation->maxsize = res->maxsize; 119 delegation->maxsize = res->maxsize;
119 put_rpccred(cred); 120 oldcred = delegation->cred;
120 delegation->cred = get_rpccred(cred); 121 delegation->cred = get_rpccred(cred);
121 delegation->flags &= ~NFS_DELEGATION_NEED_RECLAIM; 122 delegation->flags &= ~NFS_DELEGATION_NEED_RECLAIM;
122 NFS_I(inode)->delegation_state = delegation->type; 123 NFS_I(inode)->delegation_state = delegation->type;
123 smp_wmb(); 124 smp_wmb();
125 put_rpccred(oldcred);
124} 126}
125 127
126/* 128/*
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e4a04d16b8b0..8ec7fbd8240c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -200,9 +200,6 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
200 desc->timestamp = timestamp; 200 desc->timestamp = timestamp;
201 desc->timestamp_valid = 1; 201 desc->timestamp_valid = 1;
202 SetPageUptodate(page); 202 SetPageUptodate(page);
203 spin_lock(&inode->i_lock);
204 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
205 spin_unlock(&inode->i_lock);
206 /* Ensure consistent page alignment of the data. 203 /* Ensure consistent page alignment of the data.
207 * Note: assumes we have exclusive access to this mapping either 204 * Note: assumes we have exclusive access to this mapping either
208 * through inode->i_mutex or some other mechanism. 205 * through inode->i_mutex or some other mechanism.
@@ -214,9 +211,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
214 unlock_page(page); 211 unlock_page(page);
215 return 0; 212 return 0;
216 error: 213 error:
217 SetPageError(page);
218 unlock_page(page); 214 unlock_page(page);
219 nfs_zap_caches(inode);
220 desc->error = error; 215 desc->error = error;
221 return -EIO; 216 return -EIO;
222} 217}
@@ -407,7 +402,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
407 struct file *file = desc->file; 402 struct file *file = desc->file;
408 struct nfs_entry *entry = desc->entry; 403 struct nfs_entry *entry = desc->entry;
409 struct dentry *dentry = NULL; 404 struct dentry *dentry = NULL;
410 unsigned long fileid; 405 u64 fileid;
411 int loop_count = 0, 406 int loop_count = 0,
412 res; 407 res;
413 408
@@ -418,7 +413,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
418 unsigned d_type = DT_UNKNOWN; 413 unsigned d_type = DT_UNKNOWN;
419 /* Note: entry->prev_cookie contains the cookie for 414 /* Note: entry->prev_cookie contains the cookie for
420 * retrieving the current dirent on the server */ 415 * retrieving the current dirent on the server */
421 fileid = nfs_fileid_to_ino_t(entry->ino); 416 fileid = entry->ino;
422 417
423 /* Get a dentry if we have one */ 418 /* Get a dentry if we have one */
424 if (dentry != NULL) 419 if (dentry != NULL)
@@ -428,11 +423,12 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
428 /* Use readdirplus info */ 423 /* Use readdirplus info */
429 if (dentry != NULL && dentry->d_inode != NULL) { 424 if (dentry != NULL && dentry->d_inode != NULL) {
430 d_type = dt_type(dentry->d_inode); 425 d_type = dt_type(dentry->d_inode);
431 fileid = dentry->d_inode->i_ino; 426 fileid = NFS_FILEID(dentry->d_inode);
432 } 427 }
433 428
434 res = filldir(dirent, entry->name, entry->len, 429 res = filldir(dirent, entry->name, entry->len,
435 file->f_pos, fileid, d_type); 430 file->f_pos, nfs_compat_user_ino64(fileid),
431 d_type);
436 if (res < 0) 432 if (res < 0)
437 break; 433 break;
438 file->f_pos++; 434 file->f_pos++;
@@ -490,9 +486,6 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
490 page, 486 page,
491 NFS_SERVER(inode)->dtsize, 487 NFS_SERVER(inode)->dtsize,
492 desc->plus); 488 desc->plus);
493 spin_lock(&inode->i_lock);
494 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
495 spin_unlock(&inode->i_lock);
496 desc->page = page; 489 desc->page = page;
497 desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ 490 desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */
498 if (desc->error >= 0) { 491 if (desc->error >= 0) {
@@ -558,7 +551,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
558 memset(desc, 0, sizeof(*desc)); 551 memset(desc, 0, sizeof(*desc));
559 552
560 desc->file = filp; 553 desc->file = filp;
561 desc->dir_cookie = &((struct nfs_open_context *)filp->private_data)->dir_cookie; 554 desc->dir_cookie = &nfs_file_open_context(filp)->dir_cookie;
562 desc->decode = NFS_PROTO(inode)->decode_dirent; 555 desc->decode = NFS_PROTO(inode)->decode_dirent;
563 desc->plus = NFS_USE_READDIRPLUS(inode); 556 desc->plus = NFS_USE_READDIRPLUS(inode);
564 557
@@ -623,7 +616,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
623 } 616 }
624 if (offset != filp->f_pos) { 617 if (offset != filp->f_pos) {
625 filp->f_pos = offset; 618 filp->f_pos = offset;
626 ((struct nfs_open_context *)filp->private_data)->dir_cookie = 0; 619 nfs_file_open_context(filp)->dir_cookie = 0;
627 } 620 }
628out: 621out:
629 mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); 622 mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
@@ -650,36 +643,18 @@ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
650 */ 643 */
651static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) 644static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
652{ 645{
653 unsigned long verf;
654
655 if (IS_ROOT(dentry)) 646 if (IS_ROOT(dentry))
656 return 1; 647 return 1;
657 verf = dentry->d_time; 648 if (!nfs_verify_change_attribute(dir, dentry->d_time))
658 if (nfs_caches_unstable(dir) 649 return 0;
659 || verf != NFS_I(dir)->cache_change_attribute) 650 /* Revalidate nfsi->cache_change_attribute before we declare a match */
651 if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
652 return 0;
653 if (!nfs_verify_change_attribute(dir, dentry->d_time))
660 return 0; 654 return 0;
661 return 1; 655 return 1;
662} 656}
663 657
664static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf)
665{
666 dentry->d_time = verf;
667}
668
669static void nfs_refresh_verifier(struct dentry * dentry, unsigned long verf)
670{
671 nfs_set_verifier(dentry, verf);
672}
673
674/*
675 * Whenever an NFS operation succeeds, we know that the dentry
676 * is valid, so we update the revalidation timestamp.
677 */
678static inline void nfs_renew_times(struct dentry * dentry)
679{
680 dentry->d_time = jiffies;
681}
682
683/* 658/*
684 * Return the intent data that applies to this particular path component 659 * Return the intent data that applies to this particular path component
685 * 660 *
@@ -695,6 +670,19 @@ static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, unsigne
695} 670}
696 671
697/* 672/*
673 * Use intent information to check whether or not we're going to do
674 * an O_EXCL create using this path component.
675 */
676static int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
677{
678 if (NFS_PROTO(dir)->version == 2)
679 return 0;
680 if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0)
681 return 0;
682 return (nd->intent.open.flags & O_EXCL) != 0;
683}
684
685/*
698 * Inode and filehandle revalidation for lookups. 686 * Inode and filehandle revalidation for lookups.
699 * 687 *
700 * We force revalidation in the cases where the VFS sets LOOKUP_REVAL, 688 * We force revalidation in the cases where the VFS sets LOOKUP_REVAL,
@@ -717,6 +705,7 @@ int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd)
717 (S_ISREG(inode->i_mode) || 705 (S_ISREG(inode->i_mode) ||
718 S_ISDIR(inode->i_mode))) 706 S_ISDIR(inode->i_mode)))
719 goto out_force; 707 goto out_force;
708 return 0;
720 } 709 }
721 return nfs_revalidate_inode(server, inode); 710 return nfs_revalidate_inode(server, inode);
722out_force: 711out_force:
@@ -759,7 +748,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
759 int error; 748 int error;
760 struct nfs_fh fhandle; 749 struct nfs_fh fhandle;
761 struct nfs_fattr fattr; 750 struct nfs_fattr fattr;
762 unsigned long verifier;
763 751
764 parent = dget_parent(dentry); 752 parent = dget_parent(dentry);
765 lock_kernel(); 753 lock_kernel();
@@ -767,10 +755,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
767 nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); 755 nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
768 inode = dentry->d_inode; 756 inode = dentry->d_inode;
769 757
770 /* Revalidate parent directory attribute cache */
771 if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
772 goto out_zap_parent;
773
774 if (!inode) { 758 if (!inode) {
775 if (nfs_neg_need_reval(dir, dentry, nd)) 759 if (nfs_neg_need_reval(dir, dentry, nd))
776 goto out_bad; 760 goto out_bad;
@@ -785,7 +769,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
785 } 769 }
786 770
787 /* Force a full look up iff the parent directory has changed */ 771 /* Force a full look up iff the parent directory has changed */
788 if (nfs_check_verifier(dir, dentry)) { 772 if (!nfs_is_exclusive_create(dir, nd) && nfs_check_verifier(dir, dentry)) {
789 if (nfs_lookup_verify_inode(inode, nd)) 773 if (nfs_lookup_verify_inode(inode, nd))
790 goto out_zap_parent; 774 goto out_zap_parent;
791 goto out_valid; 775 goto out_valid;
@@ -794,7 +778,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
794 if (NFS_STALE(inode)) 778 if (NFS_STALE(inode))
795 goto out_bad; 779 goto out_bad;
796 780
797 verifier = nfs_save_change_attribute(dir);
798 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); 781 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr);
799 if (error) 782 if (error)
800 goto out_bad; 783 goto out_bad;
@@ -803,8 +786,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
803 if ((error = nfs_refresh_inode(inode, &fattr)) != 0) 786 if ((error = nfs_refresh_inode(inode, &fattr)) != 0)
804 goto out_bad; 787 goto out_bad;
805 788
806 nfs_renew_times(dentry); 789 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
807 nfs_refresh_verifier(dentry, verifier);
808 out_valid: 790 out_valid:
809 unlock_kernel(); 791 unlock_kernel();
810 dput(parent); 792 dput(parent);
@@ -815,7 +797,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
815out_zap_parent: 797out_zap_parent:
816 nfs_zap_caches(dir); 798 nfs_zap_caches(dir);
817 out_bad: 799 out_bad:
818 NFS_CACHEINV(dir); 800 nfs_mark_for_revalidate(dir);
819 if (inode && S_ISDIR(inode->i_mode)) { 801 if (inode && S_ISDIR(inode->i_mode)) {
820 /* Purge readdir caches. */ 802 /* Purge readdir caches. */
821 nfs_zap_caches(inode); 803 nfs_zap_caches(inode);
@@ -872,8 +854,6 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
872 nfs_complete_unlink(dentry, inode); 854 nfs_complete_unlink(dentry, inode);
873 unlock_kernel(); 855 unlock_kernel();
874 } 856 }
875 /* When creating a negative dentry, we want to renew d_time */
876 nfs_renew_times(dentry);
877 iput(inode); 857 iput(inode);
878} 858}
879 859
@@ -883,30 +863,6 @@ struct dentry_operations nfs_dentry_operations = {
883 .d_iput = nfs_dentry_iput, 863 .d_iput = nfs_dentry_iput,
884}; 864};
885 865
886/*
887 * Use intent information to check whether or not we're going to do
888 * an O_EXCL create using this path component.
889 */
890static inline
891int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
892{
893 if (NFS_PROTO(dir)->version == 2)
894 return 0;
895 if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0)
896 return 0;
897 return (nd->intent.open.flags & O_EXCL) != 0;
898}
899
900static inline int nfs_reval_fsid(struct inode *dir, const struct nfs_fattr *fattr)
901{
902 struct nfs_server *server = NFS_SERVER(dir);
903
904 if (!nfs_fsid_equal(&server->fsid, &fattr->fsid))
905 /* Revalidate fsid using the parent directory */
906 return __nfs_revalidate_inode(server, dir);
907 return 0;
908}
909
910static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 866static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
911{ 867{
912 struct dentry *res; 868 struct dentry *res;
@@ -945,11 +901,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
945 res = ERR_PTR(error); 901 res = ERR_PTR(error);
946 goto out_unlock; 902 goto out_unlock;
947 } 903 }
948 error = nfs_reval_fsid(dir, &fattr);
949 if (error < 0) {
950 res = ERR_PTR(error);
951 goto out_unlock;
952 }
953 inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr); 904 inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr);
954 res = (struct dentry *)inode; 905 res = (struct dentry *)inode;
955 if (IS_ERR(res)) 906 if (IS_ERR(res))
@@ -958,17 +909,10 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
958no_entry: 909no_entry:
959 res = d_materialise_unique(dentry, inode); 910 res = d_materialise_unique(dentry, inode);
960 if (res != NULL) { 911 if (res != NULL) {
961 struct dentry *parent;
962 if (IS_ERR(res)) 912 if (IS_ERR(res))
963 goto out_unlock; 913 goto out_unlock;
964 /* Was a directory renamed! */
965 parent = dget_parent(res);
966 if (!IS_ROOT(parent))
967 nfs_mark_for_revalidate(parent->d_inode);
968 dput(parent);
969 dentry = res; 914 dentry = res;
970 } 915 }
971 nfs_renew_times(dentry);
972 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 916 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
973out_unlock: 917out_unlock:
974 unlock_kernel(); 918 unlock_kernel();
@@ -1020,28 +964,16 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1020 } 964 }
1021 dentry->d_op = NFS_PROTO(dir)->dentry_ops; 965 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
1022 966
1023 /* Let vfs_create() deal with O_EXCL */ 967 /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash
968 * the dentry. */
1024 if (nd->intent.open.flags & O_EXCL) { 969 if (nd->intent.open.flags & O_EXCL) {
1025 d_add(dentry, NULL); 970 d_instantiate(dentry, NULL);
1026 goto out; 971 goto out;
1027 } 972 }
1028 973
1029 /* Open the file on the server */ 974 /* Open the file on the server */
1030 lock_kernel(); 975 lock_kernel();
1031 /* Revalidate parent directory attribute cache */ 976 res = nfs4_atomic_open(dir, dentry, nd);
1032 error = nfs_revalidate_inode(NFS_SERVER(dir), dir);
1033 if (error < 0) {
1034 res = ERR_PTR(error);
1035 unlock_kernel();
1036 goto out;
1037 }
1038
1039 if (nd->intent.open.flags & O_CREAT) {
1040 nfs_begin_data_update(dir);
1041 res = nfs4_atomic_open(dir, dentry, nd);
1042 nfs_end_data_update(dir);
1043 } else
1044 res = nfs4_atomic_open(dir, dentry, nd);
1045 unlock_kernel(); 977 unlock_kernel();
1046 if (IS_ERR(res)) { 978 if (IS_ERR(res)) {
1047 error = PTR_ERR(res); 979 error = PTR_ERR(res);
@@ -1063,8 +995,6 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1063 } 995 }
1064 } else if (res != NULL) 996 } else if (res != NULL)
1065 dentry = res; 997 dentry = res;
1066 nfs_renew_times(dentry);
1067 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1068out: 998out:
1069 return res; 999 return res;
1070no_open: 1000no_open:
@@ -1076,7 +1006,6 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1076 struct dentry *parent = NULL; 1006 struct dentry *parent = NULL;
1077 struct inode *inode = dentry->d_inode; 1007 struct inode *inode = dentry->d_inode;
1078 struct inode *dir; 1008 struct inode *dir;
1079 unsigned long verifier;
1080 int openflags, ret = 0; 1009 int openflags, ret = 0;
1081 1010
1082 parent = dget_parent(dentry); 1011 parent = dget_parent(dentry);
@@ -1086,8 +1015,12 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1086 /* We can't create new files in nfs_open_revalidate(), so we 1015 /* We can't create new files in nfs_open_revalidate(), so we
1087 * optimize away revalidation of negative dentries. 1016 * optimize away revalidation of negative dentries.
1088 */ 1017 */
1089 if (inode == NULL) 1018 if (inode == NULL) {
1019 if (!nfs_neg_need_reval(dir, dentry, nd))
1020 ret = 1;
1090 goto out; 1021 goto out;
1022 }
1023
1091 /* NFS only supports OPEN on regular files */ 1024 /* NFS only supports OPEN on regular files */
1092 if (!S_ISREG(inode->i_mode)) 1025 if (!S_ISREG(inode->i_mode))
1093 goto no_open; 1026 goto no_open;
@@ -1104,10 +1037,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1104 * change attribute *before* we do the RPC call. 1037 * change attribute *before* we do the RPC call.
1105 */ 1038 */
1106 lock_kernel(); 1039 lock_kernel();
1107 verifier = nfs_save_change_attribute(dir);
1108 ret = nfs4_open_revalidate(dir, dentry, openflags, nd); 1040 ret = nfs4_open_revalidate(dir, dentry, openflags, nd);
1109 if (!ret)
1110 nfs_refresh_verifier(dentry, verifier);
1111 unlock_kernel(); 1041 unlock_kernel();
1112out: 1042out:
1113 dput(parent); 1043 dput(parent);
@@ -1133,6 +1063,7 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
1133 .len = entry->len, 1063 .len = entry->len,
1134 }; 1064 };
1135 struct inode *inode; 1065 struct inode *inode;
1066 unsigned long verf = nfs_save_change_attribute(dir);
1136 1067
1137 switch (name.len) { 1068 switch (name.len) {
1138 case 2: 1069 case 2:
@@ -1143,6 +1074,14 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
1143 if (name.name[0] == '.') 1074 if (name.name[0] == '.')
1144 return dget(parent); 1075 return dget(parent);
1145 } 1076 }
1077
1078 spin_lock(&dir->i_lock);
1079 if (NFS_I(dir)->cache_validity & NFS_INO_INVALID_DATA) {
1080 spin_unlock(&dir->i_lock);
1081 return NULL;
1082 }
1083 spin_unlock(&dir->i_lock);
1084
1146 name.hash = full_name_hash(name.name, name.len); 1085 name.hash = full_name_hash(name.name, name.len);
1147 dentry = d_lookup(parent, &name); 1086 dentry = d_lookup(parent, &name);
1148 if (dentry != NULL) { 1087 if (dentry != NULL) {
@@ -1183,12 +1122,8 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
1183 dentry = alias; 1122 dentry = alias;
1184 } 1123 }
1185 1124
1186 nfs_renew_times(dentry);
1187 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1188 return dentry;
1189out_renew: 1125out_renew:
1190 nfs_renew_times(dentry); 1126 nfs_set_verifier(dentry, verf);
1191 nfs_refresh_verifier(dentry, nfs_save_change_attribute(dir));
1192 return dentry; 1127 return dentry;
1193} 1128}
1194 1129
@@ -1198,32 +1133,40 @@ out_renew:
1198int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, 1133int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
1199 struct nfs_fattr *fattr) 1134 struct nfs_fattr *fattr)
1200{ 1135{
1136 struct dentry *parent = dget_parent(dentry);
1137 struct inode *dir = parent->d_inode;
1201 struct inode *inode; 1138 struct inode *inode;
1202 int error = -EACCES; 1139 int error = -EACCES;
1203 1140
1141 d_drop(dentry);
1142
1204 /* We may have been initialized further down */ 1143 /* We may have been initialized further down */
1205 if (dentry->d_inode) 1144 if (dentry->d_inode)
1206 return 0; 1145 goto out;
1207 if (fhandle->size == 0) { 1146 if (fhandle->size == 0) {
1208 struct inode *dir = dentry->d_parent->d_inode;
1209 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1147 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
1210 if (error) 1148 if (error)
1211 return error; 1149 goto out_error;
1212 } 1150 }
1151 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1213 if (!(fattr->valid & NFS_ATTR_FATTR)) { 1152 if (!(fattr->valid & NFS_ATTR_FATTR)) {
1214 struct nfs_server *server = NFS_SB(dentry->d_sb); 1153 struct nfs_server *server = NFS_SB(dentry->d_sb);
1215 error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr); 1154 error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr);
1216 if (error < 0) 1155 if (error < 0)
1217 return error; 1156 goto out_error;
1218 } 1157 }
1219 inode = nfs_fhget(dentry->d_sb, fhandle, fattr); 1158 inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
1220 error = PTR_ERR(inode); 1159 error = PTR_ERR(inode);
1221 if (IS_ERR(inode)) 1160 if (IS_ERR(inode))
1222 return error; 1161 goto out_error;
1223 d_instantiate(dentry, inode); 1162 d_add(dentry, inode);
1224 if (d_unhashed(dentry)) 1163out:
1225 d_rehash(dentry); 1164 dput(parent);
1226 return 0; 1165 return 0;
1166out_error:
1167 nfs_mark_for_revalidate(dir);
1168 dput(parent);
1169 return error;
1227} 1170}
1228 1171
1229/* 1172/*
@@ -1249,13 +1192,9 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1249 open_flags = nd->intent.open.flags; 1192 open_flags = nd->intent.open.flags;
1250 1193
1251 lock_kernel(); 1194 lock_kernel();
1252 nfs_begin_data_update(dir);
1253 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd); 1195 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd);
1254 nfs_end_data_update(dir);
1255 if (error != 0) 1196 if (error != 0)
1256 goto out_err; 1197 goto out_err;
1257 nfs_renew_times(dentry);
1258 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1259 unlock_kernel(); 1198 unlock_kernel();
1260 return 0; 1199 return 0;
1261out_err: 1200out_err:
@@ -1283,13 +1222,9 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1283 attr.ia_valid = ATTR_MODE; 1222 attr.ia_valid = ATTR_MODE;
1284 1223
1285 lock_kernel(); 1224 lock_kernel();
1286 nfs_begin_data_update(dir);
1287 status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev); 1225 status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev);
1288 nfs_end_data_update(dir);
1289 if (status != 0) 1226 if (status != 0)
1290 goto out_err; 1227 goto out_err;
1291 nfs_renew_times(dentry);
1292 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1293 unlock_kernel(); 1228 unlock_kernel();
1294 return 0; 1229 return 0;
1295out_err: 1230out_err:
@@ -1313,13 +1248,9 @@ static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1313 attr.ia_mode = mode | S_IFDIR; 1248 attr.ia_mode = mode | S_IFDIR;
1314 1249
1315 lock_kernel(); 1250 lock_kernel();
1316 nfs_begin_data_update(dir);
1317 error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); 1251 error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
1318 nfs_end_data_update(dir);
1319 if (error != 0) 1252 if (error != 0)
1320 goto out_err; 1253 goto out_err;
1321 nfs_renew_times(dentry);
1322 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1323 unlock_kernel(); 1254 unlock_kernel();
1324 return 0; 1255 return 0;
1325out_err: 1256out_err:
@@ -1336,12 +1267,10 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
1336 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1267 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
1337 1268
1338 lock_kernel(); 1269 lock_kernel();
1339 nfs_begin_data_update(dir);
1340 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); 1270 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
1341 /* Ensure the VFS deletes this inode */ 1271 /* Ensure the VFS deletes this inode */
1342 if (error == 0 && dentry->d_inode != NULL) 1272 if (error == 0 && dentry->d_inode != NULL)
1343 clear_nlink(dentry->d_inode); 1273 clear_nlink(dentry->d_inode);
1344 nfs_end_data_update(dir);
1345 unlock_kernel(); 1274 unlock_kernel();
1346 1275
1347 return error; 1276 return error;
@@ -1350,9 +1279,9 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
1350static int nfs_sillyrename(struct inode *dir, struct dentry *dentry) 1279static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
1351{ 1280{
1352 static unsigned int sillycounter; 1281 static unsigned int sillycounter;
1353 const int i_inosize = sizeof(dir->i_ino)*2; 1282 const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2;
1354 const int countersize = sizeof(sillycounter)*2; 1283 const int countersize = sizeof(sillycounter)*2;
1355 const int slen = sizeof(".nfs") + i_inosize + countersize - 1; 1284 const int slen = sizeof(".nfs")+fileidsize+countersize-1;
1356 char silly[slen+1]; 1285 char silly[slen+1];
1357 struct qstr qsilly; 1286 struct qstr qsilly;
1358 struct dentry *sdentry; 1287 struct dentry *sdentry;
@@ -1370,8 +1299,9 @@ static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
1370 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) 1299 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
1371 goto out; 1300 goto out;
1372 1301
1373 sprintf(silly, ".nfs%*.*lx", 1302 sprintf(silly, ".nfs%*.*Lx",
1374 i_inosize, i_inosize, dentry->d_inode->i_ino); 1303 fileidsize, fileidsize,
1304 (unsigned long long)NFS_FILEID(dentry->d_inode));
1375 1305
1376 /* Return delegation in anticipation of the rename */ 1306 /* Return delegation in anticipation of the rename */
1377 nfs_inode_return_delegation(dentry->d_inode); 1307 nfs_inode_return_delegation(dentry->d_inode);
@@ -1398,19 +1328,14 @@ static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
1398 1328
1399 qsilly.name = silly; 1329 qsilly.name = silly;
1400 qsilly.len = strlen(silly); 1330 qsilly.len = strlen(silly);
1401 nfs_begin_data_update(dir);
1402 if (dentry->d_inode) { 1331 if (dentry->d_inode) {
1403 nfs_begin_data_update(dentry->d_inode);
1404 error = NFS_PROTO(dir)->rename(dir, &dentry->d_name, 1332 error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
1405 dir, &qsilly); 1333 dir, &qsilly);
1406 nfs_mark_for_revalidate(dentry->d_inode); 1334 nfs_mark_for_revalidate(dentry->d_inode);
1407 nfs_end_data_update(dentry->d_inode);
1408 } else 1335 } else
1409 error = NFS_PROTO(dir)->rename(dir, &dentry->d_name, 1336 error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
1410 dir, &qsilly); 1337 dir, &qsilly);
1411 nfs_end_data_update(dir);
1412 if (!error) { 1338 if (!error) {
1413 nfs_renew_times(dentry);
1414 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1339 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1415 d_move(dentry, sdentry); 1340 d_move(dentry, sdentry);
1416 error = nfs_async_unlink(dir, dentry); 1341 error = nfs_async_unlink(dir, dentry);
@@ -1443,19 +1368,15 @@ static int nfs_safe_remove(struct dentry *dentry)
1443 goto out; 1368 goto out;
1444 } 1369 }
1445 1370
1446 nfs_begin_data_update(dir);
1447 if (inode != NULL) { 1371 if (inode != NULL) {
1448 nfs_inode_return_delegation(inode); 1372 nfs_inode_return_delegation(inode);
1449 nfs_begin_data_update(inode);
1450 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1373 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
1451 /* The VFS may want to delete this inode */ 1374 /* The VFS may want to delete this inode */
1452 if (error == 0) 1375 if (error == 0)
1453 drop_nlink(inode); 1376 drop_nlink(inode);
1454 nfs_mark_for_revalidate(inode); 1377 nfs_mark_for_revalidate(inode);
1455 nfs_end_data_update(inode);
1456 } else 1378 } else
1457 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1379 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
1458 nfs_end_data_update(dir);
1459out: 1380out:
1460 return error; 1381 return error;
1461} 1382}
@@ -1493,7 +1414,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
1493 spin_unlock(&dcache_lock); 1414 spin_unlock(&dcache_lock);
1494 error = nfs_safe_remove(dentry); 1415 error = nfs_safe_remove(dentry);
1495 if (!error) { 1416 if (!error) {
1496 nfs_renew_times(dentry);
1497 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1417 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1498 } else if (need_rehash) 1418 } else if (need_rehash)
1499 d_rehash(dentry); 1419 d_rehash(dentry);
@@ -1548,9 +1468,7 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
1548 memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); 1468 memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
1549 kunmap_atomic(kaddr, KM_USER0); 1469 kunmap_atomic(kaddr, KM_USER0);
1550 1470
1551 nfs_begin_data_update(dir);
1552 error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); 1471 error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
1553 nfs_end_data_update(dir);
1554 if (error != 0) { 1472 if (error != 0) {
1555 dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n", 1473 dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n",
1556 dir->i_sb->s_id, dir->i_ino, 1474 dir->i_sb->s_id, dir->i_ino,
@@ -1590,15 +1508,12 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1590 dentry->d_parent->d_name.name, dentry->d_name.name); 1508 dentry->d_parent->d_name.name, dentry->d_name.name);
1591 1509
1592 lock_kernel(); 1510 lock_kernel();
1593 nfs_begin_data_update(dir); 1511 d_drop(dentry);
1594 nfs_begin_data_update(inode);
1595 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); 1512 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
1596 if (error == 0) { 1513 if (error == 0) {
1597 atomic_inc(&inode->i_count); 1514 atomic_inc(&inode->i_count);
1598 d_instantiate(dentry, inode); 1515 d_add(dentry, inode);
1599 } 1516 }
1600 nfs_end_data_update(inode);
1601 nfs_end_data_update(dir);
1602 unlock_kernel(); 1517 unlock_kernel();
1603 return error; 1518 return error;
1604} 1519}
@@ -1701,22 +1616,16 @@ go_ahead:
1701 d_delete(new_dentry); 1616 d_delete(new_dentry);
1702 } 1617 }
1703 1618
1704 nfs_begin_data_update(old_dir);
1705 nfs_begin_data_update(new_dir);
1706 nfs_begin_data_update(old_inode);
1707 error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, 1619 error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
1708 new_dir, &new_dentry->d_name); 1620 new_dir, &new_dentry->d_name);
1709 nfs_mark_for_revalidate(old_inode); 1621 nfs_mark_for_revalidate(old_inode);
1710 nfs_end_data_update(old_inode);
1711 nfs_end_data_update(new_dir);
1712 nfs_end_data_update(old_dir);
1713out: 1622out:
1714 if (rehash) 1623 if (rehash)
1715 d_rehash(rehash); 1624 d_rehash(rehash);
1716 if (!error) { 1625 if (!error) {
1717 d_move(old_dentry, new_dentry); 1626 d_move(old_dentry, new_dentry);
1718 nfs_renew_times(new_dentry); 1627 nfs_set_verifier(new_dentry,
1719 nfs_refresh_verifier(new_dentry, nfs_save_change_attribute(new_dir)); 1628 nfs_save_change_attribute(new_dir));
1720 } 1629 }
1721 1630
1722 /* new dentry created? */ 1631 /* new dentry created? */
@@ -1842,7 +1751,7 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, st
1842 return NULL; 1751 return NULL;
1843} 1752}
1844 1753
1845int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) 1754static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
1846{ 1755{
1847 struct nfs_inode *nfsi = NFS_I(inode); 1756 struct nfs_inode *nfsi = NFS_I(inode);
1848 struct nfs_access_entry *cache; 1757 struct nfs_access_entry *cache;
@@ -1854,7 +1763,7 @@ int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs
1854 cache = nfs_access_search_rbtree(inode, cred); 1763 cache = nfs_access_search_rbtree(inode, cred);
1855 if (cache == NULL) 1764 if (cache == NULL)
1856 goto out; 1765 goto out;
1857 if (time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode))) 1766 if (!time_in_range(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
1858 goto out_stale; 1767 goto out_stale;
1859 res->jiffies = cache->jiffies; 1768 res->jiffies = cache->jiffies;
1860 res->cred = cache->cred; 1769 res->cred = cache->cred;
@@ -1909,7 +1818,7 @@ found:
1909 nfs_access_free_entry(entry); 1818 nfs_access_free_entry(entry);
1910} 1819}
1911 1820
1912void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) 1821static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
1913{ 1822{
1914 struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL); 1823 struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL);
1915 if (cache == NULL) 1824 if (cache == NULL)
@@ -1957,6 +1866,24 @@ out:
1957 return -EACCES; 1866 return -EACCES;
1958} 1867}
1959 1868
1869static int nfs_open_permission_mask(int openflags)
1870{
1871 int mask = 0;
1872
1873 if (openflags & FMODE_READ)
1874 mask |= MAY_READ;
1875 if (openflags & FMODE_WRITE)
1876 mask |= MAY_WRITE;
1877 if (openflags & FMODE_EXEC)
1878 mask |= MAY_EXEC;
1879 return mask;
1880}
1881
1882int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
1883{
1884 return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
1885}
1886
1960int nfs_permission(struct inode *inode, int mask, struct nameidata *nd) 1887int nfs_permission(struct inode *inode, int mask, struct nameidata *nd)
1961{ 1888{
1962 struct rpc_cred *cred; 1889 struct rpc_cred *cred;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index fcf4d384610e..32fe97211eea 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -368,7 +368,7 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
368 return -ENOMEM; 368 return -ENOMEM;
369 369
370 dreq->inode = inode; 370 dreq->inode = inode;
371 dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data); 371 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
372 if (!is_sync_kiocb(iocb)) 372 if (!is_sync_kiocb(iocb))
373 dreq->iocb = iocb; 373 dreq->iocb = iocb;
374 374
@@ -510,7 +510,6 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
510 nfs_direct_write_reschedule(dreq); 510 nfs_direct_write_reschedule(dreq);
511 break; 511 break;
512 default: 512 default:
513 nfs_end_data_update(inode);
514 if (dreq->commit_data != NULL) 513 if (dreq->commit_data != NULL)
515 nfs_commit_free(dreq->commit_data); 514 nfs_commit_free(dreq->commit_data);
516 nfs_direct_free_writedata(dreq); 515 nfs_direct_free_writedata(dreq);
@@ -533,7 +532,6 @@ static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
533 532
534static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) 533static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
535{ 534{
536 nfs_end_data_update(inode);
537 nfs_direct_free_writedata(dreq); 535 nfs_direct_free_writedata(dreq);
538 nfs_zap_mapping(inode, inode->i_mapping); 536 nfs_zap_mapping(inode, inode->i_mapping);
539 nfs_direct_complete(dreq); 537 nfs_direct_complete(dreq);
@@ -718,14 +716,12 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
718 sync = FLUSH_STABLE; 716 sync = FLUSH_STABLE;
719 717
720 dreq->inode = inode; 718 dreq->inode = inode;
721 dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data); 719 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
722 if (!is_sync_kiocb(iocb)) 720 if (!is_sync_kiocb(iocb))
723 dreq->iocb = iocb; 721 dreq->iocb = iocb;
724 722
725 nfs_add_stats(inode, NFSIOS_DIRECTWRITTENBYTES, count); 723 nfs_add_stats(inode, NFSIOS_DIRECTWRITTENBYTES, count);
726 724
727 nfs_begin_data_update(inode);
728
729 rpc_clnt_sigmask(clnt, &oldset); 725 rpc_clnt_sigmask(clnt, &oldset);
730 result = nfs_direct_write_schedule(dreq, user_addr, count, pos, sync); 726 result = nfs_direct_write_schedule(dreq, user_addr, count, pos, sync);
731 if (!result) 727 if (!result)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 579cf8a7d4a7..c664bb921425 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -33,6 +33,7 @@
33#include <asm/system.h> 33#include <asm/system.h>
34 34
35#include "delegation.h" 35#include "delegation.h"
36#include "internal.h"
36#include "iostat.h" 37#include "iostat.h"
37 38
38#define NFSDBG_FACILITY NFSDBG_FILE 39#define NFSDBG_FACILITY NFSDBG_FILE
@@ -55,6 +56,8 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
55static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); 56static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
56static int nfs_setlease(struct file *file, long arg, struct file_lock **fl); 57static int nfs_setlease(struct file *file, long arg, struct file_lock **fl);
57 58
59static struct vm_operations_struct nfs_file_vm_ops;
60
58const struct file_operations nfs_file_operations = { 61const struct file_operations nfs_file_operations = {
59 .llseek = nfs_file_llseek, 62 .llseek = nfs_file_llseek,
60 .read = do_sync_read, 63 .read = do_sync_read,
@@ -174,13 +177,38 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
174} 177}
175 178
176/* 179/*
180 * Helper for nfs_file_flush() and nfs_fsync()
181 *
182 * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
183 * disk, but it retrieves and clears ctx->error after synching, despite
184 * the two being set at the same time in nfs_context_set_write_error().
185 * This is because the former is used to notify the _next_ call to
186 * nfs_file_write() that a write error occured, and hence cause it to
187 * fall back to doing a synchronous write.
188 */
189static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode)
190{
191 int have_error, status;
192 int ret = 0;
193
194 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
195 status = nfs_wb_all(inode);
196 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
197 if (have_error)
198 ret = xchg(&ctx->error, 0);
199 if (!ret)
200 ret = status;
201 return ret;
202}
203
204/*
177 * Flush all dirty pages, and check for write errors. 205 * Flush all dirty pages, and check for write errors.
178 * 206 *
179 */ 207 */
180static int 208static int
181nfs_file_flush(struct file *file, fl_owner_t id) 209nfs_file_flush(struct file *file, fl_owner_t id)
182{ 210{
183 struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data; 211 struct nfs_open_context *ctx = nfs_file_open_context(file);
184 struct inode *inode = file->f_path.dentry->d_inode; 212 struct inode *inode = file->f_path.dentry->d_inode;
185 int status; 213 int status;
186 214
@@ -189,16 +217,11 @@ nfs_file_flush(struct file *file, fl_owner_t id)
189 if ((file->f_mode & FMODE_WRITE) == 0) 217 if ((file->f_mode & FMODE_WRITE) == 0)
190 return 0; 218 return 0;
191 nfs_inc_stats(inode, NFSIOS_VFSFLUSH); 219 nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
192 lock_kernel(); 220
193 /* Ensure that data+attribute caches are up to date after close() */ 221 /* Ensure that data+attribute caches are up to date after close() */
194 status = nfs_wb_all(inode); 222 status = nfs_do_fsync(ctx, inode);
195 if (!status) { 223 if (!status)
196 status = ctx->error; 224 nfs_revalidate_inode(NFS_SERVER(inode), inode);
197 ctx->error = 0;
198 if (!status)
199 nfs_revalidate_inode(NFS_SERVER(inode), inode);
200 }
201 unlock_kernel();
202 return status; 225 return status;
203} 226}
204 227
@@ -257,8 +280,11 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
257 dentry->d_parent->d_name.name, dentry->d_name.name); 280 dentry->d_parent->d_name.name, dentry->d_name.name);
258 281
259 status = nfs_revalidate_mapping(inode, file->f_mapping); 282 status = nfs_revalidate_mapping(inode, file->f_mapping);
260 if (!status) 283 if (!status) {
261 status = generic_file_mmap(file, vma); 284 vma->vm_ops = &nfs_file_vm_ops;
285 vma->vm_flags |= VM_CAN_NONLINEAR;
286 file_accessed(file);
287 }
262 return status; 288 return status;
263} 289}
264 290
@@ -270,21 +296,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
270static int 296static int
271nfs_fsync(struct file *file, struct dentry *dentry, int datasync) 297nfs_fsync(struct file *file, struct dentry *dentry, int datasync)
272{ 298{
273 struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data; 299 struct nfs_open_context *ctx = nfs_file_open_context(file);
274 struct inode *inode = dentry->d_inode; 300 struct inode *inode = dentry->d_inode;
275 int status;
276 301
277 dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); 302 dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
278 303
279 nfs_inc_stats(inode, NFSIOS_VFSFSYNC); 304 nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
280 lock_kernel(); 305 return nfs_do_fsync(ctx, inode);
281 status = nfs_wb_all(inode);
282 if (!status) {
283 status = ctx->error;
284 ctx->error = 0;
285 }
286 unlock_kernel();
287 return status;
288} 306}
289 307
290/* 308/*
@@ -333,7 +351,7 @@ static int nfs_launder_page(struct page *page)
333const struct address_space_operations nfs_file_aops = { 351const struct address_space_operations nfs_file_aops = {
334 .readpage = nfs_readpage, 352 .readpage = nfs_readpage,
335 .readpages = nfs_readpages, 353 .readpages = nfs_readpages,
336 .set_page_dirty = nfs_set_page_dirty, 354 .set_page_dirty = __set_page_dirty_nobuffers,
337 .writepage = nfs_writepage, 355 .writepage = nfs_writepage,
338 .writepages = nfs_writepages, 356 .writepages = nfs_writepages,
339 .prepare_write = nfs_prepare_write, 357 .prepare_write = nfs_prepare_write,
@@ -346,6 +364,43 @@ const struct address_space_operations nfs_file_aops = {
346 .launder_page = nfs_launder_page, 364 .launder_page = nfs_launder_page,
347}; 365};
348 366
367static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
368{
369 struct file *filp = vma->vm_file;
370 unsigned pagelen;
371 int ret = -EINVAL;
372
373 lock_page(page);
374 if (page->mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping)
375 goto out_unlock;
376 pagelen = nfs_page_length(page);
377 if (pagelen == 0)
378 goto out_unlock;
379 ret = nfs_prepare_write(filp, page, 0, pagelen);
380 if (!ret)
381 ret = nfs_commit_write(filp, page, 0, pagelen);
382out_unlock:
383 unlock_page(page);
384 return ret;
385}
386
387static struct vm_operations_struct nfs_file_vm_ops = {
388 .fault = filemap_fault,
389 .page_mkwrite = nfs_vm_page_mkwrite,
390};
391
392static int nfs_need_sync_write(struct file *filp, struct inode *inode)
393{
394 struct nfs_open_context *ctx;
395
396 if (IS_SYNC(inode) || (filp->f_flags & O_SYNC))
397 return 1;
398 ctx = nfs_file_open_context(filp);
399 if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags))
400 return 1;
401 return 0;
402}
403
349static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, 404static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
350 unsigned long nr_segs, loff_t pos) 405 unsigned long nr_segs, loff_t pos)
351{ 406{
@@ -382,8 +437,8 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
382 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count); 437 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
383 result = generic_file_aio_write(iocb, iov, nr_segs, pos); 438 result = generic_file_aio_write(iocb, iov, nr_segs, pos);
384 /* Return error values for O_SYNC and IS_SYNC() */ 439 /* Return error values for O_SYNC and IS_SYNC() */
385 if (result >= 0 && (IS_SYNC(inode) || (iocb->ki_filp->f_flags & O_SYNC))) { 440 if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
386 int err = nfs_fsync(iocb->ki_filp, dentry, 1); 441 int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
387 if (err < 0) 442 if (err < 0)
388 result = err; 443 result = err;
389 } 444 }
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 71a49c3acabd..035c769b715e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -49,6 +49,11 @@
49 49
50#define NFSDBG_FACILITY NFSDBG_VFS 50#define NFSDBG_FACILITY NFSDBG_VFS
51 51
52#define NFS_64_BIT_INODE_NUMBERS_ENABLED 1
53
54/* Default is to see 64-bit inode numbers */
55static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
56
52static void nfs_invalidate_inode(struct inode *); 57static void nfs_invalidate_inode(struct inode *);
53static int nfs_update_inode(struct inode *, struct nfs_fattr *); 58static int nfs_update_inode(struct inode *, struct nfs_fattr *);
54 59
@@ -62,6 +67,25 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
62 return nfs_fileid_to_ino_t(fattr->fileid); 67 return nfs_fileid_to_ino_t(fattr->fileid);
63} 68}
64 69
70/**
71 * nfs_compat_user_ino64 - returns the user-visible inode number
72 * @fileid: 64-bit fileid
73 *
74 * This function returns a 32-bit inode number if the boot parameter
75 * nfs.enable_ino64 is zero.
76 */
77u64 nfs_compat_user_ino64(u64 fileid)
78{
79 int ino;
80
81 if (enable_ino64)
82 return fileid;
83 ino = fileid;
84 if (sizeof(ino) < sizeof(fileid))
85 ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8;
86 return ino;
87}
88
65int nfs_write_inode(struct inode *inode, int sync) 89int nfs_write_inode(struct inode *inode, int sync)
66{ 90{
67 int ret; 91 int ret;
@@ -85,7 +109,6 @@ void nfs_clear_inode(struct inode *inode)
85 */ 109 */
86 BUG_ON(nfs_have_writebacks(inode)); 110 BUG_ON(nfs_have_writebacks(inode));
87 BUG_ON(!list_empty(&NFS_I(inode)->open_files)); 111 BUG_ON(!list_empty(&NFS_I(inode)->open_files));
88 BUG_ON(atomic_read(&NFS_I(inode)->data_updates) != 0);
89 nfs_zap_acl_cache(inode); 112 nfs_zap_acl_cache(inode);
90 nfs_access_zap_cache(inode); 113 nfs_access_zap_cache(inode);
91} 114}
@@ -118,8 +141,8 @@ static void nfs_zap_caches_locked(struct inode *inode)
118 141
119 nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); 142 nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
120 143
121 NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); 144 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
122 NFS_ATTRTIMEO_UPDATE(inode) = jiffies; 145 nfsi->attrtimeo_timestamp = jiffies;
123 146
124 memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); 147 memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode)));
125 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) 148 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
@@ -156,6 +179,13 @@ static void nfs_zap_acl_cache(struct inode *inode)
156 spin_unlock(&inode->i_lock); 179 spin_unlock(&inode->i_lock);
157} 180}
158 181
182void nfs_invalidate_atime(struct inode *inode)
183{
184 spin_lock(&inode->i_lock);
185 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
186 spin_unlock(&inode->i_lock);
187}
188
159/* 189/*
160 * Invalidate, but do not unhash, the inode. 190 * Invalidate, but do not unhash, the inode.
161 * NB: must be called with inode->i_lock held! 191 * NB: must be called with inode->i_lock held!
@@ -338,7 +368,6 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
338 return 0; 368 return 0;
339 369
340 lock_kernel(); 370 lock_kernel();
341 nfs_begin_data_update(inode);
342 /* Write all dirty data */ 371 /* Write all dirty data */
343 if (S_ISREG(inode->i_mode)) { 372 if (S_ISREG(inode->i_mode)) {
344 filemap_write_and_wait(inode->i_mapping); 373 filemap_write_and_wait(inode->i_mapping);
@@ -352,7 +381,6 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
352 error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); 381 error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr);
353 if (error == 0) 382 if (error == 0)
354 nfs_refresh_inode(inode, &fattr); 383 nfs_refresh_inode(inode, &fattr);
355 nfs_end_data_update(inode);
356 unlock_kernel(); 384 unlock_kernel();
357 return error; 385 return error;
358} 386}
@@ -431,7 +459,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
431 459
432 /* Flush out writes to the server in order to update c/mtime */ 460 /* Flush out writes to the server in order to update c/mtime */
433 if (S_ISREG(inode->i_mode)) 461 if (S_ISREG(inode->i_mode))
434 nfs_sync_mapping_range(inode->i_mapping, 0, 0, FLUSH_NOCOMMIT); 462 nfs_wb_nocommit(inode);
435 463
436 /* 464 /*
437 * We may force a getattr if the user cares about atime. 465 * We may force a getattr if the user cares about atime.
@@ -450,8 +478,10 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
450 err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); 478 err = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
451 else 479 else
452 err = nfs_revalidate_inode(NFS_SERVER(inode), inode); 480 err = nfs_revalidate_inode(NFS_SERVER(inode), inode);
453 if (!err) 481 if (!err) {
454 generic_fillattr(inode, stat); 482 generic_fillattr(inode, stat);
483 stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
484 }
455 return err; 485 return err;
456} 486}
457 487
@@ -536,7 +566,7 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
536static void nfs_file_clear_open_context(struct file *filp) 566static void nfs_file_clear_open_context(struct file *filp)
537{ 567{
538 struct inode *inode = filp->f_path.dentry->d_inode; 568 struct inode *inode = filp->f_path.dentry->d_inode;
539 struct nfs_open_context *ctx = (struct nfs_open_context *)filp->private_data; 569 struct nfs_open_context *ctx = nfs_file_open_context(filp);
540 570
541 if (ctx) { 571 if (ctx) {
542 filp->private_data = NULL; 572 filp->private_data = NULL;
@@ -598,16 +628,10 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
598 status = nfs_wait_on_inode(inode); 628 status = nfs_wait_on_inode(inode);
599 if (status < 0) 629 if (status < 0)
600 goto out; 630 goto out;
601 if (NFS_STALE(inode)) { 631
602 status = -ESTALE; 632 status = -ESTALE;
603 /* Do we trust the cached ESTALE? */ 633 if (NFS_STALE(inode))
604 if (NFS_ATTRTIMEO(inode) != 0) { 634 goto out;
605 if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME)) {
606 /* no */
607 } else
608 goto out;
609 }
610 }
611 635
612 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); 636 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
613 if (status != 0) { 637 if (status != 0) {
@@ -654,7 +678,7 @@ int nfs_attribute_timeout(struct inode *inode)
654 678
655 if (nfs_have_delegation(inode, FMODE_READ)) 679 if (nfs_have_delegation(inode, FMODE_READ))
656 return 0; 680 return 0;
657 return time_after(jiffies, nfsi->read_cache_jiffies+nfsi->attrtimeo); 681 return !time_in_range(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
658} 682}
659 683
660/** 684/**
@@ -683,11 +707,8 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa
683 } 707 }
684 spin_lock(&inode->i_lock); 708 spin_lock(&inode->i_lock);
685 nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; 709 nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
686 if (S_ISDIR(inode->i_mode)) { 710 if (S_ISDIR(inode->i_mode))
687 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); 711 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
688 /* This ensures we revalidate child dentries */
689 nfsi->cache_change_attribute = jiffies;
690 }
691 spin_unlock(&inode->i_lock); 712 spin_unlock(&inode->i_lock);
692 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); 713 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
693 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", 714 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
@@ -756,56 +777,27 @@ out:
756 return ret; 777 return ret;
757} 778}
758 779
759/**
760 * nfs_begin_data_update
761 * @inode - pointer to inode
762 * Declare that a set of operations will update file data on the server
763 */
764void nfs_begin_data_update(struct inode *inode)
765{
766 atomic_inc(&NFS_I(inode)->data_updates);
767}
768
769/**
770 * nfs_end_data_update
771 * @inode - pointer to inode
772 * Declare end of the operations that will update file data
773 * This will mark the inode as immediately needing revalidation
774 * of its attribute cache.
775 */
776void nfs_end_data_update(struct inode *inode)
777{
778 struct nfs_inode *nfsi = NFS_I(inode);
779
780 /* Directories: invalidate page cache */
781 if (S_ISDIR(inode->i_mode)) {
782 spin_lock(&inode->i_lock);
783 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
784 spin_unlock(&inode->i_lock);
785 }
786 nfsi->cache_change_attribute = jiffies;
787 atomic_dec(&nfsi->data_updates);
788}
789
790static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) 780static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
791{ 781{
792 struct nfs_inode *nfsi = NFS_I(inode); 782 struct nfs_inode *nfsi = NFS_I(inode);
793 unsigned long now = jiffies;
794 783
784 if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 &&
785 nfsi->change_attr == fattr->pre_change_attr) {
786 nfsi->change_attr = fattr->change_attr;
787 if (S_ISDIR(inode->i_mode))
788 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
789 }
795 /* If we have atomic WCC data, we may update some attributes */ 790 /* If we have atomic WCC data, we may update some attributes */
796 if ((fattr->valid & NFS_ATTR_WCC) != 0) { 791 if ((fattr->valid & NFS_ATTR_WCC) != 0) {
797 if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { 792 if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
798 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); 793 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
799 nfsi->cache_change_attribute = now;
800 }
801 if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { 794 if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
802 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 795 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
803 nfsi->cache_change_attribute = now; 796 if (S_ISDIR(inode->i_mode))
797 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
804 } 798 }
805 if (inode->i_size == fattr->pre_size && nfsi->npages == 0) { 799 if (inode->i_size == fattr->pre_size && nfsi->npages == 0)
806 inode->i_size = fattr->size; 800 inode->i_size = fattr->size;
807 nfsi->cache_change_attribute = now;
808 }
809 } 801 }
810} 802}
811 803
@@ -822,7 +814,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
822{ 814{
823 struct nfs_inode *nfsi = NFS_I(inode); 815 struct nfs_inode *nfsi = NFS_I(inode);
824 loff_t cur_size, new_isize; 816 loff_t cur_size, new_isize;
825 int data_unstable; 817 unsigned long invalid = 0;
826 818
827 819
828 /* Has the inode gone and changed behind our back? */ 820 /* Has the inode gone and changed behind our back? */
@@ -831,37 +823,41 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
831 return -EIO; 823 return -EIO;
832 } 824 }
833 825
834 /* Are we in the process of updating data on the server? */
835 data_unstable = nfs_caches_unstable(inode);
836
837 /* Do atomic weak cache consistency updates */ 826 /* Do atomic weak cache consistency updates */
838 nfs_wcc_update_inode(inode, fattr); 827 nfs_wcc_update_inode(inode, fattr);
839 828
840 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && 829 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
841 nfsi->change_attr != fattr->change_attr) 830 nfsi->change_attr != fattr->change_attr)
842 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 831 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
843 832
844 /* Verify a few of the more important attributes */ 833 /* Verify a few of the more important attributes */
845 if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) 834 if (!timespec_equal(&inode->i_mtime, &fattr->mtime))
846 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 835 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
847 836
848 cur_size = i_size_read(inode); 837 cur_size = i_size_read(inode);
849 new_isize = nfs_size_to_loff_t(fattr->size); 838 new_isize = nfs_size_to_loff_t(fattr->size);
850 if (cur_size != new_isize && nfsi->npages == 0) 839 if (cur_size != new_isize && nfsi->npages == 0)
851 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 840 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
852 841
853 /* Have any file permissions changed? */ 842 /* Have any file permissions changed? */
854 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) 843 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)
855 || inode->i_uid != fattr->uid 844 || inode->i_uid != fattr->uid
856 || inode->i_gid != fattr->gid) 845 || inode->i_gid != fattr->gid)
857 nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; 846 invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
858 847
859 /* Has the link count changed? */ 848 /* Has the link count changed? */
860 if (inode->i_nlink != fattr->nlink) 849 if (inode->i_nlink != fattr->nlink)
861 nfsi->cache_validity |= NFS_INO_INVALID_ATTR; 850 invalid |= NFS_INO_INVALID_ATTR;
862 851
863 if (!timespec_equal(&inode->i_atime, &fattr->atime)) 852 if (!timespec_equal(&inode->i_atime, &fattr->atime))
864 nfsi->cache_validity |= NFS_INO_INVALID_ATIME; 853 invalid |= NFS_INO_INVALID_ATIME;
854
855 if (invalid != 0)
856 nfsi->cache_validity |= invalid;
857 else
858 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
859 | NFS_INO_INVALID_ATIME
860 | NFS_INO_REVAL_PAGECACHE);
865 861
866 nfsi->read_cache_jiffies = fattr->time_start; 862 nfsi->read_cache_jiffies = fattr->time_start;
867 return 0; 863 return 0;
@@ -911,17 +907,41 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
911int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) 907int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
912{ 908{
913 struct nfs_inode *nfsi = NFS_I(inode); 909 struct nfs_inode *nfsi = NFS_I(inode);
914 int status = 0;
915 910
916 spin_lock(&inode->i_lock); 911 spin_lock(&inode->i_lock);
917 if (unlikely((fattr->valid & NFS_ATTR_FATTR) == 0)) { 912 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
918 nfsi->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 913 if (S_ISDIR(inode->i_mode))
919 goto out; 914 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
920 }
921 status = nfs_update_inode(inode, fattr);
922out:
923 spin_unlock(&inode->i_lock); 915 spin_unlock(&inode->i_lock);
924 return status; 916 return nfs_refresh_inode(inode, fattr);
917}
918
919/**
920 * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache
921 * @inode - pointer to inode
922 * @fattr - updated attributes
923 *
924 * After an operation that has changed the inode metadata, mark the
925 * attribute cache as being invalid, then try to update it. Fake up
926 * weak cache consistency data, if none exist.
927 *
928 * This function is mainly designed to be used by the ->write_done() functions.
929 */
930int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr)
931{
932 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
933 (fattr->valid & NFS_ATTR_WCC_V4) == 0) {
934 fattr->pre_change_attr = NFS_I(inode)->change_attr;
935 fattr->valid |= NFS_ATTR_WCC_V4;
936 }
937 if ((fattr->valid & NFS_ATTR_FATTR) != 0 &&
938 (fattr->valid & NFS_ATTR_WCC) == 0) {
939 memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
940 memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
941 fattr->pre_size = inode->i_size;
942 fattr->valid |= NFS_ATTR_WCC;
943 }
944 return nfs_post_op_update_inode(inode, fattr);
925} 945}
926 946
927/* 947/*
@@ -941,9 +961,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
941 struct nfs_server *server; 961 struct nfs_server *server;
942 struct nfs_inode *nfsi = NFS_I(inode); 962 struct nfs_inode *nfsi = NFS_I(inode);
943 loff_t cur_isize, new_isize; 963 loff_t cur_isize, new_isize;
944 unsigned int invalid = 0; 964 unsigned long invalid = 0;
945 unsigned long now = jiffies; 965 unsigned long now = jiffies;
946 int data_stable;
947 966
948 dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", 967 dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n",
949 __FUNCTION__, inode->i_sb->s_id, inode->i_ino, 968 __FUNCTION__, inode->i_sb->s_id, inode->i_ino,
@@ -968,57 +987,51 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
968 * Update the read time so we don't revalidate too often. 987 * Update the read time so we don't revalidate too often.
969 */ 988 */
970 nfsi->read_cache_jiffies = fattr->time_start; 989 nfsi->read_cache_jiffies = fattr->time_start;
971 nfsi->last_updated = now;
972 990
973 /* Fix a wraparound issue with nfsi->cache_change_attribute */ 991 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME
974 if (time_before(now, nfsi->cache_change_attribute)) 992 | NFS_INO_REVAL_PAGECACHE);
975 nfsi->cache_change_attribute = now - 600*HZ;
976
977 /* Are we racing with known updates of the metadata on the server? */
978 data_stable = nfs_verify_change_attribute(inode, fattr->time_start);
979 if (data_stable)
980 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATIME);
981 993
982 /* Do atomic weak cache consistency updates */ 994 /* Do atomic weak cache consistency updates */
983 nfs_wcc_update_inode(inode, fattr); 995 nfs_wcc_update_inode(inode, fattr);
984 996
997 /* More cache consistency checks */
998 if (!(fattr->valid & NFS_ATTR_FATTR_V4)) {
999 /* NFSv2/v3: Check if the mtime agrees */
1000 if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
1001 dprintk("NFS: mtime change on server for file %s/%ld\n",
1002 inode->i_sb->s_id, inode->i_ino);
1003 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
1004 nfsi->cache_change_attribute = now;
1005 }
1006 /* If ctime has changed we should definitely clear access+acl caches */
1007 if (!timespec_equal(&inode->i_ctime, &fattr->ctime))
1008 invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1009 } else if (nfsi->change_attr != fattr->change_attr) {
1010 dprintk("NFS: change_attr change on server for file %s/%ld\n",
1011 inode->i_sb->s_id, inode->i_ino);
1012 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1013 nfsi->cache_change_attribute = now;
1014 }
1015
985 /* Check if our cached file size is stale */ 1016 /* Check if our cached file size is stale */
986 new_isize = nfs_size_to_loff_t(fattr->size); 1017 new_isize = nfs_size_to_loff_t(fattr->size);
987 cur_isize = i_size_read(inode); 1018 cur_isize = i_size_read(inode);
988 if (new_isize != cur_isize) { 1019 if (new_isize != cur_isize) {
989 /* Do we perhaps have any outstanding writes? */ 1020 /* Do we perhaps have any outstanding writes, or has
990 if (nfsi->npages == 0) { 1021 * the file grown beyond our last write? */
991 /* No, but did we race with nfs_end_data_update()? */ 1022 if (nfsi->npages == 0 || new_isize > cur_isize) {
992 if (data_stable) {
993 inode->i_size = new_isize;
994 invalid |= NFS_INO_INVALID_DATA;
995 }
996 invalid |= NFS_INO_INVALID_ATTR;
997 } else if (new_isize > cur_isize) {
998 inode->i_size = new_isize; 1023 inode->i_size = new_isize;
999 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; 1024 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
1000 } 1025 }
1001 nfsi->cache_change_attribute = now;
1002 dprintk("NFS: isize change on server for file %s/%ld\n", 1026 dprintk("NFS: isize change on server for file %s/%ld\n",
1003 inode->i_sb->s_id, inode->i_ino); 1027 inode->i_sb->s_id, inode->i_ino);
1004 } 1028 }
1005 1029
1006 /* Check if the mtime agrees */
1007 if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
1008 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
1009 dprintk("NFS: mtime change on server for file %s/%ld\n",
1010 inode->i_sb->s_id, inode->i_ino);
1011 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
1012 nfsi->cache_change_attribute = now;
1013 }
1014 1030
1015 /* If ctime has changed we should definitely clear access+acl caches */ 1031 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
1016 if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { 1032 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
1017 invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1018 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
1019 nfsi->cache_change_attribute = now;
1020 }
1021 memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); 1033 memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
1034 nfsi->change_attr = fattr->change_attr;
1022 1035
1023 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) || 1036 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) ||
1024 inode->i_uid != fattr->uid || 1037 inode->i_uid != fattr->uid ||
@@ -1039,31 +1052,29 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1039 inode->i_blocks = fattr->du.nfs2.blocks; 1052 inode->i_blocks = fattr->du.nfs2.blocks;
1040 } 1053 }
1041 1054
1042 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
1043 nfsi->change_attr != fattr->change_attr) {
1044 dprintk("NFS: change_attr change on server for file %s/%ld\n",
1045 inode->i_sb->s_id, inode->i_ino);
1046 nfsi->change_attr = fattr->change_attr;
1047 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1048 nfsi->cache_change_attribute = now;
1049 }
1050
1051 /* Update attrtimeo value if we're out of the unstable period */ 1055 /* Update attrtimeo value if we're out of the unstable period */
1052 if (invalid & NFS_INO_INVALID_ATTR) { 1056 if (invalid & NFS_INO_INVALID_ATTR) {
1053 nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); 1057 nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
1054 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); 1058 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
1055 nfsi->attrtimeo_timestamp = now; 1059 nfsi->attrtimeo_timestamp = now;
1056 } else if (time_after(now, nfsi->attrtimeo_timestamp+nfsi->attrtimeo)) { 1060 nfsi->last_updated = now;
1057 if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) 1061 } else {
1058 nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); 1062 if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
1059 nfsi->attrtimeo_timestamp = now; 1063 if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
1064 nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
1065 nfsi->attrtimeo_timestamp = now;
1066 }
1067 /*
1068 * Avoid jiffy wraparound issues with nfsi->last_updated
1069 */
1070 if (!time_in_range(nfsi->last_updated, nfsi->read_cache_jiffies, now))
1071 nfsi->last_updated = nfsi->read_cache_jiffies;
1060 } 1072 }
1073 invalid &= ~NFS_INO_INVALID_ATTR;
1061 /* Don't invalidate the data if we were to blame */ 1074 /* Don't invalidate the data if we were to blame */
1062 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) 1075 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
1063 || S_ISLNK(inode->i_mode))) 1076 || S_ISLNK(inode->i_mode)))
1064 invalid &= ~NFS_INO_INVALID_DATA; 1077 invalid &= ~NFS_INO_INVALID_DATA;
1065 if (data_stable)
1066 invalid &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME|NFS_INO_REVAL_PAGECACHE);
1067 if (!nfs_have_delegation(inode, FMODE_READ) || 1078 if (!nfs_have_delegation(inode, FMODE_READ) ||
1068 (nfsi->cache_validity & NFS_INO_REVAL_FORCED)) 1079 (nfsi->cache_validity & NFS_INO_REVAL_FORCED))
1069 nfsi->cache_validity |= invalid; 1080 nfsi->cache_validity |= invalid;
@@ -1152,7 +1163,6 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
1152 INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); 1163 INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
1153 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); 1164 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
1154 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); 1165 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
1155 atomic_set(&nfsi->data_updates, 0);
1156 nfsi->ncommit = 0; 1166 nfsi->ncommit = 0;
1157 nfsi->npages = 0; 1167 nfsi->npages = 0;
1158 nfs4_init_once(nfsi); 1168 nfs4_init_once(nfsi);
@@ -1249,6 +1259,7 @@ static void __exit exit_nfs_fs(void)
1249/* Not quite true; I just maintain it */ 1259/* Not quite true; I just maintain it */
1250MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); 1260MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
1251MODULE_LICENSE("GPL"); 1261MODULE_LICENSE("GPL");
1262module_param(enable_ino64, bool, 0644);
1252 1263
1253module_init(init_nfs_fs) 1264module_init(init_nfs_fs)
1254module_exit(exit_nfs_fs) 1265module_exit(exit_nfs_fs)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 76cf55d57101..f3acf48412be 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -5,8 +5,6 @@
5#include <linux/mount.h> 5#include <linux/mount.h>
6 6
7struct nfs_string; 7struct nfs_string;
8struct nfs_mount_data;
9struct nfs4_mount_data;
10 8
11/* Maximum number of readahead requests 9/* Maximum number of readahead requests
12 * FIXME: this should really be a sysctl so that users may tune it to suit 10 * FIXME: this should really be a sysctl so that users may tune it to suit
@@ -27,20 +25,50 @@ struct nfs_clone_mount {
27 rpc_authflavor_t authflavor; 25 rpc_authflavor_t authflavor;
28}; 26};
29 27
28/*
29 * In-kernel mount arguments
30 */
31struct nfs_parsed_mount_data {
32 int flags;
33 int rsize, wsize;
34 int timeo, retrans;
35 int acregmin, acregmax,
36 acdirmin, acdirmax;
37 int namlen;
38 unsigned int bsize;
39 unsigned int auth_flavor_len;
40 rpc_authflavor_t auth_flavors[1];
41 char *client_address;
42
43 struct {
44 struct sockaddr_in address;
45 char *hostname;
46 unsigned int program;
47 unsigned int version;
48 unsigned short port;
49 int protocol;
50 } mount_server;
51
52 struct {
53 struct sockaddr_in address;
54 char *hostname;
55 char *export_path;
56 unsigned int program;
57 int protocol;
58 } nfs_server;
59};
60
30/* client.c */ 61/* client.c */
31extern struct rpc_program nfs_program; 62extern struct rpc_program nfs_program;
32 63
33extern void nfs_put_client(struct nfs_client *); 64extern void nfs_put_client(struct nfs_client *);
34extern struct nfs_client *nfs_find_client(const struct sockaddr_in *, int); 65extern struct nfs_client *nfs_find_client(const struct sockaddr_in *, int);
35extern struct nfs_server *nfs_create_server(const struct nfs_mount_data *, 66extern struct nfs_server *nfs_create_server(
36 struct nfs_fh *); 67 const struct nfs_parsed_mount_data *,
37extern struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *, 68 struct nfs_fh *);
38 const char *, 69extern struct nfs_server *nfs4_create_server(
39 const struct sockaddr_in *, 70 const struct nfs_parsed_mount_data *,
40 const char *, 71 struct nfs_fh *);
41 const char *,
42 rpc_authflavor_t,
43 struct nfs_fh *);
44extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *, 72extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *,
45 struct nfs_fh *); 73 struct nfs_fh *);
46extern void nfs_free_server(struct nfs_server *server); 74extern void nfs_free_server(struct nfs_server *server);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index c5fce7567200..668ab96c7b59 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -251,6 +251,7 @@ nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
251 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readres_sz) << 2; 251 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readres_sz) << 2;
252 xdr_inline_pages(&req->rq_rcv_buf, replen, 252 xdr_inline_pages(&req->rq_rcv_buf, replen,
253 args->pages, args->pgbase, count); 253 args->pages, args->pgbase, count);
254 req->rq_rcv_buf.flags |= XDRBUF_READ;
254 return 0; 255 return 0;
255} 256}
256 257
@@ -271,7 +272,7 @@ nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
271 res->eof = 0; 272 res->eof = 0;
272 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 273 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
273 if (iov->iov_len < hdrlen) { 274 if (iov->iov_len < hdrlen) {
274 printk(KERN_WARNING "NFS: READ reply header overflowed:" 275 dprintk("NFS: READ reply header overflowed:"
275 "length %d > %Zu\n", hdrlen, iov->iov_len); 276 "length %d > %Zu\n", hdrlen, iov->iov_len);
276 return -errno_NFSERR_IO; 277 return -errno_NFSERR_IO;
277 } else if (iov->iov_len != hdrlen) { 278 } else if (iov->iov_len != hdrlen) {
@@ -281,7 +282,7 @@ nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
281 282
282 recvd = req->rq_rcv_buf.len - hdrlen; 283 recvd = req->rq_rcv_buf.len - hdrlen;
283 if (count > recvd) { 284 if (count > recvd) {
284 printk(KERN_WARNING "NFS: server cheating in read reply: " 285 dprintk("NFS: server cheating in read reply: "
285 "count %d > recvd %d\n", count, recvd); 286 "count %d > recvd %d\n", count, recvd);
286 count = recvd; 287 count = recvd;
287 } 288 }
@@ -313,6 +314,7 @@ nfs_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
313 314
314 /* Copy the page array */ 315 /* Copy the page array */
315 xdr_encode_pages(sndbuf, args->pages, args->pgbase, count); 316 xdr_encode_pages(sndbuf, args->pages, args->pgbase, count);
317 sndbuf->flags |= XDRBUF_WRITE;
316 return 0; 318 return 0;
317} 319}
318 320
@@ -431,7 +433,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
431 433
432 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 434 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
433 if (iov->iov_len < hdrlen) { 435 if (iov->iov_len < hdrlen) {
434 printk(KERN_WARNING "NFS: READDIR reply header overflowed:" 436 dprintk("NFS: READDIR reply header overflowed:"
435 "length %d > %Zu\n", hdrlen, iov->iov_len); 437 "length %d > %Zu\n", hdrlen, iov->iov_len);
436 return -errno_NFSERR_IO; 438 return -errno_NFSERR_IO;
437 } else if (iov->iov_len != hdrlen) { 439 } else if (iov->iov_len != hdrlen) {
@@ -454,7 +456,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
454 len = ntohl(*p++); 456 len = ntohl(*p++);
455 p += XDR_QUADLEN(len) + 1; /* name plus cookie */ 457 p += XDR_QUADLEN(len) + 1; /* name plus cookie */
456 if (len > NFS2_MAXNAMLEN) { 458 if (len > NFS2_MAXNAMLEN) {
457 printk(KERN_WARNING "NFS: giant filename in readdir (len 0x%x)!\n", 459 dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
458 len); 460 len);
459 goto err_unmap; 461 goto err_unmap;
460 } 462 }
@@ -471,7 +473,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
471 entry[0] = entry[1] = 0; 473 entry[0] = entry[1] = 0;
472 /* truncate listing ? */ 474 /* truncate listing ? */
473 if (!nr) { 475 if (!nr) {
474 printk(KERN_NOTICE "NFS: readdir reply truncated!\n"); 476 dprintk("NFS: readdir reply truncated!\n");
475 entry[1] = 1; 477 entry[1] = 1;
476 } 478 }
477 goto out; 479 goto out;
@@ -583,12 +585,12 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
583 /* Convert length of symlink */ 585 /* Convert length of symlink */
584 len = ntohl(*p++); 586 len = ntohl(*p++);
585 if (len >= rcvbuf->page_len || len <= 0) { 587 if (len >= rcvbuf->page_len || len <= 0) {
586 dprintk(KERN_WARNING "nfs: server returned giant symlink!\n"); 588 dprintk("nfs: server returned giant symlink!\n");
587 return -ENAMETOOLONG; 589 return -ENAMETOOLONG;
588 } 590 }
589 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 591 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
590 if (iov->iov_len < hdrlen) { 592 if (iov->iov_len < hdrlen) {
591 printk(KERN_WARNING "NFS: READLINK reply header overflowed:" 593 dprintk("NFS: READLINK reply header overflowed:"
592 "length %d > %Zu\n", hdrlen, iov->iov_len); 594 "length %d > %Zu\n", hdrlen, iov->iov_len);
593 return -errno_NFSERR_IO; 595 return -errno_NFSERR_IO;
594 } else if (iov->iov_len != hdrlen) { 596 } else if (iov->iov_len != hdrlen) {
@@ -597,7 +599,7 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
597 } 599 }
598 recvd = req->rq_rcv_buf.len - hdrlen; 600 recvd = req->rq_rcv_buf.len - hdrlen;
599 if (recvd < len) { 601 if (recvd < len) {
600 printk(KERN_WARNING "NFS: server cheating in readlink reply: " 602 dprintk("NFS: server cheating in readlink reply: "
601 "count %u > recvd %u\n", len, recvd); 603 "count %u > recvd %u\n", len, recvd);
602 return -EIO; 604 return -EIO;
603 } 605 }
@@ -695,7 +697,7 @@ nfs_stat_to_errno(int stat)
695 if (nfs_errtbl[i].stat == stat) 697 if (nfs_errtbl[i].stat == stat)
696 return nfs_errtbl[i].errno; 698 return nfs_errtbl[i].errno;
697 } 699 }
698 printk(KERN_ERR "nfs_stat_to_errno: bad nfs status return value: %d\n", stat); 700 dprintk("nfs_stat_to_errno: bad nfs status return value: %d\n", stat);
699 return nfs_errtbl[i].errno; 701 return nfs_errtbl[i].errno;
700} 702}
701 703
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 7322da4d2055..9b7362565c0c 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -317,13 +317,11 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
317 } 317 }
318 318
319 dprintk("NFS call setacl\n"); 319 dprintk("NFS call setacl\n");
320 nfs_begin_data_update(inode);
321 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; 320 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
322 status = rpc_call_sync(server->client_acl, &msg, 0); 321 status = rpc_call_sync(server->client_acl, &msg, 0);
323 spin_lock(&inode->i_lock); 322 spin_lock(&inode->i_lock);
324 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS; 323 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS;
325 spin_unlock(&inode->i_lock); 324 spin_unlock(&inode->i_lock);
326 nfs_end_data_update(inode);
327 dprintk("NFS reply setacl: %d\n", status); 325 dprintk("NFS reply setacl: %d\n", status);
328 326
329 /* pages may have been allocated at the xdr layer. */ 327 /* pages may have been allocated at the xdr layer. */
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index c7ca5d70870b..4cdc2361a669 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -166,6 +166,7 @@ nfs3_proc_lookup(struct inode *dir, struct qstr *name,
166 nfs_fattr_init(&dir_attr); 166 nfs_fattr_init(&dir_attr);
167 nfs_fattr_init(fattr); 167 nfs_fattr_init(fattr);
168 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 168 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
169 nfs_refresh_inode(dir, &dir_attr);
169 if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) { 170 if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) {
170 msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; 171 msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
171 msg.rpc_argp = fhandle; 172 msg.rpc_argp = fhandle;
@@ -173,8 +174,6 @@ nfs3_proc_lookup(struct inode *dir, struct qstr *name,
173 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 174 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
174 } 175 }
175 dprintk("NFS reply lookup: %d\n", status); 176 dprintk("NFS reply lookup: %d\n", status);
176 if (status >= 0)
177 status = nfs_refresh_inode(dir, &dir_attr);
178 return status; 177 return status;
179} 178}
180 179
@@ -607,6 +606,9 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
607 606
608 nfs_fattr_init(&dir_attr); 607 nfs_fattr_init(&dir_attr);
609 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 608 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
609
610 nfs_invalidate_atime(dir);
611
610 nfs_refresh_inode(dir, &dir_attr); 612 nfs_refresh_inode(dir, &dir_attr);
611 dprintk("NFS reply readdir: %d\n", status); 613 dprintk("NFS reply readdir: %d\n", status);
612 return status; 614 return status;
@@ -724,9 +726,9 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data)
724{ 726{
725 if (nfs3_async_handle_jukebox(task, data->inode)) 727 if (nfs3_async_handle_jukebox(task, data->inode))
726 return -EAGAIN; 728 return -EAGAIN;
727 /* Call back common NFS readpage processing */ 729
728 if (task->tk_status >= 0) 730 nfs_invalidate_atime(data->inode);
729 nfs_refresh_inode(data->inode, &data->fattr); 731 nfs_refresh_inode(data->inode, &data->fattr);
730 return 0; 732 return 0;
731} 733}
732 734
@@ -747,7 +749,7 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
747 if (nfs3_async_handle_jukebox(task, data->inode)) 749 if (nfs3_async_handle_jukebox(task, data->inode))
748 return -EAGAIN; 750 return -EAGAIN;
749 if (task->tk_status >= 0) 751 if (task->tk_status >= 0)
750 nfs_post_op_update_inode(data->inode, data->res.fattr); 752 nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr);
751 return 0; 753 return 0;
752} 754}
753 755
@@ -775,8 +777,7 @@ static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data)
775{ 777{
776 if (nfs3_async_handle_jukebox(task, data->inode)) 778 if (nfs3_async_handle_jukebox(task, data->inode))
777 return -EAGAIN; 779 return -EAGAIN;
778 if (task->tk_status >= 0) 780 nfs_refresh_inode(data->inode, data->res.fattr);
779 nfs_post_op_update_inode(data->inode, data->res.fattr);
780 return 0; 781 return 0;
781} 782}
782 783
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index d9e08f0cf2a0..616d3267b7e7 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -346,6 +346,7 @@ nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
346 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readres_sz) << 2; 346 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readres_sz) << 2;
347 xdr_inline_pages(&req->rq_rcv_buf, replen, 347 xdr_inline_pages(&req->rq_rcv_buf, replen,
348 args->pages, args->pgbase, count); 348 args->pages, args->pgbase, count);
349 req->rq_rcv_buf.flags |= XDRBUF_READ;
349 return 0; 350 return 0;
350} 351}
351 352
@@ -367,6 +368,7 @@ nfs3_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
367 368
368 /* Copy the page array */ 369 /* Copy the page array */
369 xdr_encode_pages(sndbuf, args->pages, args->pgbase, count); 370 xdr_encode_pages(sndbuf, args->pages, args->pgbase, count);
371 sndbuf->flags |= XDRBUF_WRITE;
370 return 0; 372 return 0;
371} 373}
372 374
@@ -524,7 +526,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
524 526
525 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 527 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
526 if (iov->iov_len < hdrlen) { 528 if (iov->iov_len < hdrlen) {
527 printk(KERN_WARNING "NFS: READDIR reply header overflowed:" 529 dprintk("NFS: READDIR reply header overflowed:"
528 "length %d > %Zu\n", hdrlen, iov->iov_len); 530 "length %d > %Zu\n", hdrlen, iov->iov_len);
529 return -errno_NFSERR_IO; 531 return -errno_NFSERR_IO;
530 } else if (iov->iov_len != hdrlen) { 532 } else if (iov->iov_len != hdrlen) {
@@ -547,7 +549,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
547 len = ntohl(*p++); /* string length */ 549 len = ntohl(*p++); /* string length */
548 p += XDR_QUADLEN(len) + 2; /* name + cookie */ 550 p += XDR_QUADLEN(len) + 2; /* name + cookie */
549 if (len > NFS3_MAXNAMLEN) { 551 if (len > NFS3_MAXNAMLEN) {
550 printk(KERN_WARNING "NFS: giant filename in readdir (len %x)!\n", 552 dprintk("NFS: giant filename in readdir (len %x)!\n",
551 len); 553 len);
552 goto err_unmap; 554 goto err_unmap;
553 } 555 }
@@ -567,7 +569,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
567 goto short_pkt; 569 goto short_pkt;
568 len = ntohl(*p++); 570 len = ntohl(*p++);
569 if (len > NFS3_FHSIZE) { 571 if (len > NFS3_FHSIZE) {
570 printk(KERN_WARNING "NFS: giant filehandle in " 572 dprintk("NFS: giant filehandle in "
571 "readdir (len %x)!\n", len); 573 "readdir (len %x)!\n", len);
572 goto err_unmap; 574 goto err_unmap;
573 } 575 }
@@ -588,7 +590,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
588 entry[0] = entry[1] = 0; 590 entry[0] = entry[1] = 0;
589 /* truncate listing ? */ 591 /* truncate listing ? */
590 if (!nr) { 592 if (!nr) {
591 printk(KERN_NOTICE "NFS: readdir reply truncated!\n"); 593 dprintk("NFS: readdir reply truncated!\n");
592 entry[1] = 1; 594 entry[1] = 1;
593 } 595 }
594 goto out; 596 goto out;
@@ -826,22 +828,23 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
826 /* Convert length of symlink */ 828 /* Convert length of symlink */
827 len = ntohl(*p++); 829 len = ntohl(*p++);
828 if (len >= rcvbuf->page_len || len <= 0) { 830 if (len >= rcvbuf->page_len || len <= 0) {
829 dprintk(KERN_WARNING "nfs: server returned giant symlink!\n"); 831 dprintk("nfs: server returned giant symlink!\n");
830 return -ENAMETOOLONG; 832 return -ENAMETOOLONG;
831 } 833 }
832 834
833 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 835 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
834 if (iov->iov_len < hdrlen) { 836 if (iov->iov_len < hdrlen) {
835 printk(KERN_WARNING "NFS: READLINK reply header overflowed:" 837 dprintk("NFS: READLINK reply header overflowed:"
836 "length %d > %Zu\n", hdrlen, iov->iov_len); 838 "length %d > %Zu\n", hdrlen, iov->iov_len);
837 return -errno_NFSERR_IO; 839 return -errno_NFSERR_IO;
838 } else if (iov->iov_len != hdrlen) { 840 } else if (iov->iov_len != hdrlen) {
839 dprintk("NFS: READLINK header is short. iovec will be shifted.\n"); 841 dprintk("NFS: READLINK header is short. "
842 "iovec will be shifted.\n");
840 xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); 843 xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
841 } 844 }
842 recvd = req->rq_rcv_buf.len - hdrlen; 845 recvd = req->rq_rcv_buf.len - hdrlen;
843 if (recvd < len) { 846 if (recvd < len) {
844 printk(KERN_WARNING "NFS: server cheating in readlink reply: " 847 dprintk("NFS: server cheating in readlink reply: "
845 "count %u > recvd %u\n", len, recvd); 848 "count %u > recvd %u\n", len, recvd);
846 return -EIO; 849 return -EIO;
847 } 850 }
@@ -876,13 +879,13 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
876 ocount = ntohl(*p++); 879 ocount = ntohl(*p++);
877 880
878 if (ocount != count) { 881 if (ocount != count) {
879 printk(KERN_WARNING "NFS: READ count doesn't match RPC opaque count.\n"); 882 dprintk("NFS: READ count doesn't match RPC opaque count.\n");
880 return -errno_NFSERR_IO; 883 return -errno_NFSERR_IO;
881 } 884 }
882 885
883 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 886 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
884 if (iov->iov_len < hdrlen) { 887 if (iov->iov_len < hdrlen) {
885 printk(KERN_WARNING "NFS: READ reply header overflowed:" 888 dprintk("NFS: READ reply header overflowed:"
886 "length %d > %Zu\n", hdrlen, iov->iov_len); 889 "length %d > %Zu\n", hdrlen, iov->iov_len);
887 return -errno_NFSERR_IO; 890 return -errno_NFSERR_IO;
888 } else if (iov->iov_len != hdrlen) { 891 } else if (iov->iov_len != hdrlen) {
@@ -892,7 +895,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
892 895
893 recvd = req->rq_rcv_buf.len - hdrlen; 896 recvd = req->rq_rcv_buf.len - hdrlen;
894 if (count > recvd) { 897 if (count > recvd) {
895 printk(KERN_WARNING "NFS: server cheating in read reply: " 898 dprintk("NFS: server cheating in read reply: "
896 "count %d > recvd %d\n", count, recvd); 899 "count %d > recvd %d\n", count, recvd);
897 count = recvd; 900 count = recvd;
898 res->eof = 0; 901 res->eof = 0;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4b90e17555a9..cb99fd90a9ac 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -62,10 +62,8 @@ struct nfs4_opendata;
62static int _nfs4_proc_open(struct nfs4_opendata *data); 62static int _nfs4_proc_open(struct nfs4_opendata *data);
63static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 63static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
64static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *); 64static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *);
65static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry);
66static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception); 65static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
67static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp); 66static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp);
68static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int openflags);
69static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 67static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
70static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 68static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
71 69
@@ -177,7 +175,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
177 *p++ = xdr_one; /* bitmap length */ 175 *p++ = xdr_one; /* bitmap length */
178 *p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */ 176 *p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */
179 *p++ = htonl(8); /* attribute buffer length */ 177 *p++ = htonl(8); /* attribute buffer length */
180 p = xdr_encode_hyper(p, dentry->d_inode->i_ino); 178 p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_inode));
181 } 179 }
182 180
183 *p++ = xdr_one; /* next */ 181 *p++ = xdr_one; /* next */
@@ -189,7 +187,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
189 *p++ = xdr_one; /* bitmap length */ 187 *p++ = xdr_one; /* bitmap length */
190 *p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */ 188 *p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */
191 *p++ = htonl(8); /* attribute buffer length */ 189 *p++ = htonl(8); /* attribute buffer length */
192 p = xdr_encode_hyper(p, dentry->d_parent->d_inode->i_ino); 190 p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_parent->d_inode));
193 191
194 readdir->pgbase = (char *)p - (char *)start; 192 readdir->pgbase = (char *)p - (char *)start;
195 readdir->count -= readdir->pgbase; 193 readdir->count -= readdir->pgbase;
@@ -211,8 +209,9 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
211 209
212 spin_lock(&dir->i_lock); 210 spin_lock(&dir->i_lock);
213 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA; 211 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
214 if (cinfo->before == nfsi->change_attr && cinfo->atomic) 212 if (!cinfo->atomic || cinfo->before != nfsi->change_attr)
215 nfsi->change_attr = cinfo->after; 213 nfsi->cache_change_attribute = jiffies;
214 nfsi->change_attr = cinfo->after;
216 spin_unlock(&dir->i_lock); 215 spin_unlock(&dir->i_lock);
217} 216}
218 217
@@ -454,7 +453,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
454 memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); 453 memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data));
455 rcu_read_unlock(); 454 rcu_read_unlock();
456 lock_kernel(); 455 lock_kernel();
457 ret = _nfs4_do_access(state->inode, state->owner->so_cred, open_mode); 456 ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode);
458 unlock_kernel(); 457 unlock_kernel();
459 if (ret != 0) 458 if (ret != 0)
460 goto out; 459 goto out;
@@ -948,36 +947,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
948 return 0; 947 return 0;
949} 948}
950 949
951static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int openflags)
952{
953 struct nfs_access_entry cache;
954 int mask = 0;
955 int status;
956
957 if (openflags & FMODE_READ)
958 mask |= MAY_READ;
959 if (openflags & FMODE_WRITE)
960 mask |= MAY_WRITE;
961 if (openflags & FMODE_EXEC)
962 mask |= MAY_EXEC;
963 status = nfs_access_get_cached(inode, cred, &cache);
964 if (status == 0)
965 goto out;
966
967 /* Be clever: ask server to check for all possible rights */
968 cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
969 cache.cred = cred;
970 cache.jiffies = jiffies;
971 status = _nfs4_proc_access(inode, &cache);
972 if (status != 0)
973 return status;
974 nfs_access_add_cache(inode, &cache);
975out:
976 if ((cache.mask & mask) == mask)
977 return 0;
978 return -EACCES;
979}
980
981static int nfs4_recover_expired_lease(struct nfs_server *server) 950static int nfs4_recover_expired_lease(struct nfs_server *server)
982{ 951{
983 struct nfs_client *clp = server->nfs_client; 952 struct nfs_client *clp = server->nfs_client;
@@ -1381,7 +1350,7 @@ static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct
1381 1350
1382 /* If the open_intent is for execute, we have an extra check to make */ 1351 /* If the open_intent is for execute, we have an extra check to make */
1383 if (nd->intent.open.flags & FMODE_EXEC) { 1352 if (nd->intent.open.flags & FMODE_EXEC) {
1384 ret = _nfs4_do_access(state->inode, 1353 ret = nfs_may_open(state->inode,
1385 state->owner->so_cred, 1354 state->owner->so_cred,
1386 nd->intent.open.flags); 1355 nd->intent.open.flags);
1387 if (ret < 0) 1356 if (ret < 0)
@@ -1390,7 +1359,7 @@ static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct
1390 filp = lookup_instantiate_filp(nd, path->dentry, NULL); 1359 filp = lookup_instantiate_filp(nd, path->dentry, NULL);
1391 if (!IS_ERR(filp)) { 1360 if (!IS_ERR(filp)) {
1392 struct nfs_open_context *ctx; 1361 struct nfs_open_context *ctx;
1393 ctx = (struct nfs_open_context *)filp->private_data; 1362 ctx = nfs_file_open_context(filp);
1394 ctx->state = state; 1363 ctx->state = state;
1395 return 0; 1364 return 0;
1396 } 1365 }
@@ -1428,13 +1397,16 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1428 state = nfs4_do_open(dir, &path, nd->intent.open.flags, &attr, cred); 1397 state = nfs4_do_open(dir, &path, nd->intent.open.flags, &attr, cred);
1429 put_rpccred(cred); 1398 put_rpccred(cred);
1430 if (IS_ERR(state)) { 1399 if (IS_ERR(state)) {
1431 if (PTR_ERR(state) == -ENOENT) 1400 if (PTR_ERR(state) == -ENOENT) {
1432 d_add(dentry, NULL); 1401 d_add(dentry, NULL);
1402 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1403 }
1433 return (struct dentry *)state; 1404 return (struct dentry *)state;
1434 } 1405 }
1435 res = d_add_unique(dentry, igrab(state->inode)); 1406 res = d_add_unique(dentry, igrab(state->inode));
1436 if (res != NULL) 1407 if (res != NULL)
1437 path.dentry = res; 1408 path.dentry = res;
1409 nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir));
1438 nfs4_intent_set_file(nd, &path, state); 1410 nfs4_intent_set_file(nd, &path, state);
1439 return res; 1411 return res;
1440} 1412}
@@ -1468,6 +1440,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
1468 } 1440 }
1469 } 1441 }
1470 if (state->inode == dentry->d_inode) { 1442 if (state->inode == dentry->d_inode) {
1443 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1471 nfs4_intent_set_file(nd, &path, state); 1444 nfs4_intent_set_file(nd, &path, state);
1472 return 1; 1445 return 1;
1473 } 1446 }
@@ -1757,10 +1730,16 @@ static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh
1757 1730
1758static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) 1731static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
1759{ 1732{
1733 struct nfs_server *server = NFS_SERVER(inode);
1734 struct nfs_fattr fattr;
1760 struct nfs4_accessargs args = { 1735 struct nfs4_accessargs args = {
1761 .fh = NFS_FH(inode), 1736 .fh = NFS_FH(inode),
1737 .bitmask = server->attr_bitmask,
1738 };
1739 struct nfs4_accessres res = {
1740 .server = server,
1741 .fattr = &fattr,
1762 }; 1742 };
1763 struct nfs4_accessres res = { 0 };
1764 struct rpc_message msg = { 1743 struct rpc_message msg = {
1765 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS], 1744 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS],
1766 .rpc_argp = &args, 1745 .rpc_argp = &args,
@@ -1786,6 +1765,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
1786 if (mode & MAY_EXEC) 1765 if (mode & MAY_EXEC)
1787 args.access |= NFS4_ACCESS_EXECUTE; 1766 args.access |= NFS4_ACCESS_EXECUTE;
1788 } 1767 }
1768 nfs_fattr_init(&fattr);
1789 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 1769 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
1790 if (!status) { 1770 if (!status) {
1791 entry->mask = 0; 1771 entry->mask = 0;
@@ -1795,6 +1775,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
1795 entry->mask |= MAY_WRITE; 1775 entry->mask |= MAY_WRITE;
1796 if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) 1776 if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
1797 entry->mask |= MAY_EXEC; 1777 entry->mask |= MAY_EXEC;
1778 nfs_refresh_inode(inode, &fattr);
1798 } 1779 }
1799 return status; 1780 return status;
1800} 1781}
@@ -1900,11 +1881,13 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
1900 } 1881 }
1901 state = nfs4_do_open(dir, &path, flags, sattr, cred); 1882 state = nfs4_do_open(dir, &path, flags, sattr, cred);
1902 put_rpccred(cred); 1883 put_rpccred(cred);
1884 d_drop(dentry);
1903 if (IS_ERR(state)) { 1885 if (IS_ERR(state)) {
1904 status = PTR_ERR(state); 1886 status = PTR_ERR(state);
1905 goto out; 1887 goto out;
1906 } 1888 }
1907 d_instantiate(dentry, igrab(state->inode)); 1889 d_add(dentry, igrab(state->inode));
1890 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1908 if (flags & O_EXCL) { 1891 if (flags & O_EXCL) {
1909 struct nfs_fattr fattr; 1892 struct nfs_fattr fattr;
1910 status = nfs4_do_setattr(state->inode, &fattr, sattr, state); 1893 status = nfs4_do_setattr(state->inode, &fattr, sattr, state);
@@ -2218,6 +2201,9 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2218 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 2201 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
2219 if (status == 0) 2202 if (status == 0)
2220 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); 2203 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
2204
2205 nfs_invalidate_atime(dir);
2206
2221 dprintk("%s: returns %d\n", __FUNCTION__, status); 2207 dprintk("%s: returns %d\n", __FUNCTION__, status);
2222 return status; 2208 return status;
2223} 2209}
@@ -2414,6 +2400,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
2414 rpc_restart_call(task); 2400 rpc_restart_call(task);
2415 return -EAGAIN; 2401 return -EAGAIN;
2416 } 2402 }
2403
2404 nfs_invalidate_atime(data->inode);
2417 if (task->tk_status > 0) 2405 if (task->tk_status > 0)
2418 renew_lease(server, data->timestamp); 2406 renew_lease(server, data->timestamp);
2419 return 0; 2407 return 0;
@@ -2443,7 +2431,7 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
2443 } 2431 }
2444 if (task->tk_status >= 0) { 2432 if (task->tk_status >= 0) {
2445 renew_lease(NFS_SERVER(inode), data->timestamp); 2433 renew_lease(NFS_SERVER(inode), data->timestamp);
2446 nfs_post_op_update_inode(inode, data->res.fattr); 2434 nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
2447 } 2435 }
2448 return 0; 2436 return 0;
2449} 2437}
@@ -2485,8 +2473,7 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
2485 rpc_restart_call(task); 2473 rpc_restart_call(task);
2486 return -EAGAIN; 2474 return -EAGAIN;
2487 } 2475 }
2488 if (task->tk_status >= 0) 2476 nfs_refresh_inode(inode, data->res.fattr);
2489 nfs_post_op_update_inode(inode, data->res.fattr);
2490 return 0; 2477 return 0;
2491} 2478}
2492 2479
@@ -3056,7 +3043,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
3056 if (status == 0) { 3043 if (status == 0) {
3057 status = data->rpc_status; 3044 status = data->rpc_status;
3058 if (status == 0) 3045 if (status == 0)
3059 nfs_post_op_update_inode(inode, &data->fattr); 3046 nfs_refresh_inode(inode, &data->fattr);
3060 } 3047 }
3061 rpc_put_task(task); 3048 rpc_put_task(task);
3062 return status; 3049 return status;
@@ -3303,7 +3290,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
3303 status = -ENOMEM; 3290 status = -ENOMEM;
3304 if (seqid == NULL) 3291 if (seqid == NULL)
3305 goto out; 3292 goto out;
3306 task = nfs4_do_unlck(request, request->fl_file->private_data, lsp, seqid); 3293 task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid);
3307 status = PTR_ERR(task); 3294 status = PTR_ERR(task);
3308 if (IS_ERR(task)) 3295 if (IS_ERR(task))
3309 goto out; 3296 goto out;
@@ -3447,7 +3434,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
3447 int ret; 3434 int ret;
3448 3435
3449 dprintk("%s: begin!\n", __FUNCTION__); 3436 dprintk("%s: begin!\n", __FUNCTION__);
3450 data = nfs4_alloc_lockdata(fl, fl->fl_file->private_data, 3437 data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
3451 fl->fl_u.nfs4_fl.owner); 3438 fl->fl_u.nfs4_fl.owner);
3452 if (data == NULL) 3439 if (data == NULL)
3453 return -ENOMEM; 3440 return -ENOMEM;
@@ -3573,7 +3560,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
3573 int status; 3560 int status;
3574 3561
3575 /* verify open state */ 3562 /* verify open state */
3576 ctx = (struct nfs_open_context *)filp->private_data; 3563 ctx = nfs_file_open_context(filp);
3577 state = ctx->state; 3564 state = ctx->state;
3578 3565
3579 if (request->fl_start < 0 || request->fl_end < 0) 3566 if (request->fl_start < 0 || request->fl_end < 0)
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 3e4adf8c8312..bfb36261cecb 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -774,7 +774,7 @@ static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_s
774 for (fl = inode->i_flock; fl != 0; fl = fl->fl_next) { 774 for (fl = inode->i_flock; fl != 0; fl = fl->fl_next) {
775 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 775 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
776 continue; 776 continue;
777 if (((struct nfs_open_context *)fl->fl_file->private_data)->state != state) 777 if (nfs_file_open_context(fl->fl_file)->state != state)
778 continue; 778 continue;
779 status = ops->recover_lock(state, fl); 779 status = ops->recover_lock(state, fl);
780 if (status >= 0) 780 if (status >= 0)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index badd73b7ca12..51dd3804866f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -376,10 +376,12 @@ static int nfs4_stat_to_errno(int);
376 decode_locku_maxsz) 376 decode_locku_maxsz)
377#define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ 377#define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \
378 encode_putfh_maxsz + \ 378 encode_putfh_maxsz + \
379 encode_access_maxsz) 379 encode_access_maxsz + \
380 encode_getattr_maxsz)
380#define NFS4_dec_access_sz (compound_decode_hdr_maxsz + \ 381#define NFS4_dec_access_sz (compound_decode_hdr_maxsz + \
381 decode_putfh_maxsz + \ 382 decode_putfh_maxsz + \
382 decode_access_maxsz) 383 decode_access_maxsz + \
384 decode_getattr_maxsz)
383#define NFS4_enc_getattr_sz (compound_encode_hdr_maxsz + \ 385#define NFS4_enc_getattr_sz (compound_encode_hdr_maxsz + \
384 encode_putfh_maxsz + \ 386 encode_putfh_maxsz + \
385 encode_getattr_maxsz) 387 encode_getattr_maxsz)
@@ -562,7 +564,6 @@ struct compound_hdr {
562 564
563#define RESERVE_SPACE(nbytes) do { \ 565#define RESERVE_SPACE(nbytes) do { \
564 p = xdr_reserve_space(xdr, nbytes); \ 566 p = xdr_reserve_space(xdr, nbytes); \
565 if (!p) printk("RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __FUNCTION__); \
566 BUG_ON(!p); \ 567 BUG_ON(!p); \
567} while (0) 568} while (0)
568 569
@@ -628,8 +629,8 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
628 if (iap->ia_valid & ATTR_UID) { 629 if (iap->ia_valid & ATTR_UID) {
629 owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name); 630 owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name);
630 if (owner_namelen < 0) { 631 if (owner_namelen < 0) {
631 printk(KERN_WARNING "nfs: couldn't resolve uid %d to string\n", 632 dprintk("nfs: couldn't resolve uid %d to string\n",
632 iap->ia_uid); 633 iap->ia_uid);
633 /* XXX */ 634 /* XXX */
634 strcpy(owner_name, "nobody"); 635 strcpy(owner_name, "nobody");
635 owner_namelen = sizeof("nobody") - 1; 636 owner_namelen = sizeof("nobody") - 1;
@@ -640,8 +641,8 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
640 if (iap->ia_valid & ATTR_GID) { 641 if (iap->ia_valid & ATTR_GID) {
641 owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group); 642 owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group);
642 if (owner_grouplen < 0) { 643 if (owner_grouplen < 0) {
643 printk(KERN_WARNING "nfs4: couldn't resolve gid %d to string\n", 644 dprintk("nfs: couldn't resolve gid %d to string\n",
644 iap->ia_gid); 645 iap->ia_gid);
645 strcpy(owner_group, "nobody"); 646 strcpy(owner_group, "nobody");
646 owner_grouplen = sizeof("nobody") - 1; 647 owner_grouplen = sizeof("nobody") - 1;
647 /* goto out; */ 648 /* goto out; */
@@ -711,7 +712,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
711 * Now we backfill the bitmap and the attribute buffer length. 712 * Now we backfill the bitmap and the attribute buffer length.
712 */ 713 */
713 if (len != ((char *)p - (char *)q) + 4) { 714 if (len != ((char *)p - (char *)q) + 4) {
714 printk ("encode_attr: Attr length calculation error! %u != %Zu\n", 715 printk(KERN_ERR "nfs: Attr length error, %u != %Zu\n",
715 len, ((char *)p - (char *)q) + 4); 716 len, ((char *)p - (char *)q) + 4);
716 BUG(); 717 BUG();
717 } 718 }
@@ -1376,14 +1377,20 @@ static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs
1376{ 1377{
1377 struct xdr_stream xdr; 1378 struct xdr_stream xdr;
1378 struct compound_hdr hdr = { 1379 struct compound_hdr hdr = {
1379 .nops = 2, 1380 .nops = 3,
1380 }; 1381 };
1381 int status; 1382 int status;
1382 1383
1383 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1384 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1384 encode_compound_hdr(&xdr, &hdr); 1385 encode_compound_hdr(&xdr, &hdr);
1385 if ((status = encode_putfh(&xdr, args->fh)) == 0) 1386 status = encode_putfh(&xdr, args->fh);
1386 status = encode_access(&xdr, args->access); 1387 if (status != 0)
1388 goto out;
1389 status = encode_access(&xdr, args->access);
1390 if (status != 0)
1391 goto out;
1392 status = encode_getfattr(&xdr, args->bitmask);
1393out:
1387 return status; 1394 return status;
1388} 1395}
1389 1396
@@ -1857,6 +1864,7 @@ static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readarg
1857 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_read_sz) << 2; 1864 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_read_sz) << 2;
1858 xdr_inline_pages(&req->rq_rcv_buf, replen, 1865 xdr_inline_pages(&req->rq_rcv_buf, replen,
1859 args->pages, args->pgbase, args->count); 1866 args->pages, args->pgbase, args->count);
1867 req->rq_rcv_buf.flags |= XDRBUF_READ;
1860out: 1868out:
1861 return status; 1869 return status;
1862} 1870}
@@ -1933,6 +1941,7 @@ static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writea
1933 status = encode_write(&xdr, args); 1941 status = encode_write(&xdr, args);
1934 if (status) 1942 if (status)
1935 goto out; 1943 goto out;
1944 req->rq_snd_buf.flags |= XDRBUF_WRITE;
1936 status = encode_getfattr(&xdr, args->bitmask); 1945 status = encode_getfattr(&xdr, args->bitmask);
1937out: 1946out:
1938 return status; 1947 return status;
@@ -2180,9 +2189,9 @@ out:
2180#define READ_BUF(nbytes) do { \ 2189#define READ_BUF(nbytes) do { \
2181 p = xdr_inline_decode(xdr, nbytes); \ 2190 p = xdr_inline_decode(xdr, nbytes); \
2182 if (unlikely(!p)) { \ 2191 if (unlikely(!p)) { \
2183 printk(KERN_INFO "%s: prematurely hit end of receive" \ 2192 dprintk("nfs: %s: prematurely hit end of receive" \
2184 " buffer\n", __FUNCTION__); \ 2193 " buffer\n", __FUNCTION__); \
2185 printk(KERN_INFO "%s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \ 2194 dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \
2186 __FUNCTION__, xdr->p, nbytes, xdr->end); \ 2195 __FUNCTION__, xdr->p, nbytes, xdr->end); \
2187 return -EIO; \ 2196 return -EIO; \
2188 } \ 2197 } \
@@ -2223,9 +2232,8 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
2223 READ_BUF(8); 2232 READ_BUF(8);
2224 READ32(opnum); 2233 READ32(opnum);
2225 if (opnum != expected) { 2234 if (opnum != expected) {
2226 printk(KERN_NOTICE 2235 dprintk("nfs: Server returned operation"
2227 "nfs4_decode_op_hdr: Server returned operation" 2236 " %d but we issued a request for %d\n",
2228 " %d but we issued a request for %d\n",
2229 opnum, expected); 2237 opnum, expected);
2230 return -EIO; 2238 return -EIO;
2231 } 2239 }
@@ -2758,7 +2766,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2758 dprintk("%s: nfs_map_name_to_uid failed!\n", 2766 dprintk("%s: nfs_map_name_to_uid failed!\n",
2759 __FUNCTION__); 2767 __FUNCTION__);
2760 } else 2768 } else
2761 printk(KERN_WARNING "%s: name too long (%u)!\n", 2769 dprintk("%s: name too long (%u)!\n",
2762 __FUNCTION__, len); 2770 __FUNCTION__, len);
2763 bitmap[1] &= ~FATTR4_WORD1_OWNER; 2771 bitmap[1] &= ~FATTR4_WORD1_OWNER;
2764 } 2772 }
@@ -2783,7 +2791,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2783 dprintk("%s: nfs_map_group_to_gid failed!\n", 2791 dprintk("%s: nfs_map_group_to_gid failed!\n",
2784 __FUNCTION__); 2792 __FUNCTION__);
2785 } else 2793 } else
2786 printk(KERN_WARNING "%s: name too long (%u)!\n", 2794 dprintk("%s: name too long (%u)!\n",
2787 __FUNCTION__, len); 2795 __FUNCTION__, len);
2788 bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; 2796 bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;
2789 } 2797 }
@@ -2950,7 +2958,8 @@ static int verify_attr_len(struct xdr_stream *xdr, __be32 *savep, uint32_t attrl
2950 unsigned int nwords = xdr->p - savep; 2958 unsigned int nwords = xdr->p - savep;
2951 2959
2952 if (unlikely(attrwords != nwords)) { 2960 if (unlikely(attrwords != nwords)) {
2953 printk(KERN_WARNING "%s: server returned incorrect attribute length: %u %c %u\n", 2961 dprintk("%s: server returned incorrect attribute length: "
2962 "%u %c %u\n",
2954 __FUNCTION__, 2963 __FUNCTION__,
2955 attrwords << 2, 2964 attrwords << 2,
2956 (attrwords < nwords) ? '<' : '>', 2965 (attrwords < nwords) ? '<' : '>',
@@ -3451,7 +3460,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
3451 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 3460 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
3452 recvd = req->rq_rcv_buf.len - hdrlen; 3461 recvd = req->rq_rcv_buf.len - hdrlen;
3453 if (count > recvd) { 3462 if (count > recvd) {
3454 printk(KERN_WARNING "NFS: server cheating in read reply: " 3463 dprintk("NFS: server cheating in read reply: "
3455 "count %u > recvd %u\n", count, recvd); 3464 "count %u > recvd %u\n", count, recvd);
3456 count = recvd; 3465 count = recvd;
3457 eof = 0; 3466 eof = 0;
@@ -3500,7 +3509,8 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
3500 p += 2; /* cookie */ 3509 p += 2; /* cookie */
3501 len = ntohl(*p++); /* filename length */ 3510 len = ntohl(*p++); /* filename length */
3502 if (len > NFS4_MAXNAMLEN) { 3511 if (len > NFS4_MAXNAMLEN) {
3503 printk(KERN_WARNING "NFS: giant filename in readdir (len 0x%x)\n", len); 3512 dprintk("NFS: giant filename in readdir (len 0x%x)\n",
3513 len);
3504 goto err_unmap; 3514 goto err_unmap;
3505 } 3515 }
3506 xlen = XDR_QUADLEN(len); 3516 xlen = XDR_QUADLEN(len);
@@ -3528,7 +3538,7 @@ short_pkt:
3528 entry[0] = entry[1] = 0; 3538 entry[0] = entry[1] = 0;
3529 /* truncate listing ? */ 3539 /* truncate listing ? */
3530 if (!nr) { 3540 if (!nr) {
3531 printk(KERN_NOTICE "NFS: readdir reply truncated!\n"); 3541 dprintk("NFS: readdir reply truncated!\n");
3532 entry[1] = 1; 3542 entry[1] = 1;
3533 } 3543 }
3534 goto out; 3544 goto out;
@@ -3554,13 +3564,13 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
3554 READ_BUF(4); 3564 READ_BUF(4);
3555 READ32(len); 3565 READ32(len);
3556 if (len >= rcvbuf->page_len || len <= 0) { 3566 if (len >= rcvbuf->page_len || len <= 0) {
3557 dprintk(KERN_WARNING "nfs: server returned giant symlink!\n"); 3567 dprintk("nfs: server returned giant symlink!\n");
3558 return -ENAMETOOLONG; 3568 return -ENAMETOOLONG;
3559 } 3569 }
3560 hdrlen = (char *) xdr->p - (char *) iov->iov_base; 3570 hdrlen = (char *) xdr->p - (char *) iov->iov_base;
3561 recvd = req->rq_rcv_buf.len - hdrlen; 3571 recvd = req->rq_rcv_buf.len - hdrlen;
3562 if (recvd < len) { 3572 if (recvd < len) {
3563 printk(KERN_WARNING "NFS: server cheating in readlink reply: " 3573 dprintk("NFS: server cheating in readlink reply: "
3564 "count %u > recvd %u\n", len, recvd); 3574 "count %u > recvd %u\n", len, recvd);
3565 return -EIO; 3575 return -EIO;
3566 } 3576 }
@@ -3643,7 +3653,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
3643 hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base; 3653 hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base;
3644 recvd = req->rq_rcv_buf.len - hdrlen; 3654 recvd = req->rq_rcv_buf.len - hdrlen;
3645 if (attrlen > recvd) { 3655 if (attrlen > recvd) {
3646 printk(KERN_WARNING "NFS: server cheating in getattr" 3656 dprintk("NFS: server cheating in getattr"
3647 " acl reply: attrlen %u > recvd %u\n", 3657 " acl reply: attrlen %u > recvd %u\n",
3648 attrlen, recvd); 3658 attrlen, recvd);
3649 return -EINVAL; 3659 return -EINVAL;
@@ -3688,8 +3698,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
3688 READ_BUF(8); 3698 READ_BUF(8);
3689 READ32(opnum); 3699 READ32(opnum);
3690 if (opnum != OP_SETCLIENTID) { 3700 if (opnum != OP_SETCLIENTID) {
3691 printk(KERN_NOTICE 3701 dprintk("nfs: decode_setclientid: Server returned operation"
3692 "nfs4_decode_setclientid: Server returned operation"
3693 " %d\n", opnum); 3702 " %d\n", opnum);
3694 return -EIO; 3703 return -EIO;
3695 } 3704 }
@@ -3783,8 +3792,13 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac
3783 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3792 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3784 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3793 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3785 goto out; 3794 goto out;
3786 if ((status = decode_putfh(&xdr)) == 0) 3795 status = decode_putfh(&xdr);
3787 status = decode_access(&xdr, res); 3796 if (status != 0)
3797 goto out;
3798 status = decode_access(&xdr, res);
3799 if (status != 0)
3800 goto out;
3801 decode_getfattr(&xdr, res->fattr, res->server);
3788out: 3802out:
3789 return status; 3803 return status;
3790} 3804}
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 3490322d1145..e87b44ee9ac9 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -76,6 +76,7 @@
76#include <linux/fs.h> 76#include <linux/fs.h>
77#include <linux/init.h> 77#include <linux/init.h>
78#include <linux/sunrpc/clnt.h> 78#include <linux/sunrpc/clnt.h>
79#include <linux/sunrpc/xprtsock.h>
79#include <linux/nfs.h> 80#include <linux/nfs.h>
80#include <linux/nfs_fs.h> 81#include <linux/nfs_fs.h>
81#include <linux/nfs_mount.h> 82#include <linux/nfs_mount.h>
@@ -491,7 +492,7 @@ static int __init root_nfs_get_handle(void)
491 struct sockaddr_in sin; 492 struct sockaddr_in sin;
492 int status; 493 int status;
493 int protocol = (nfs_data.flags & NFS_MOUNT_TCP) ? 494 int protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
494 IPPROTO_TCP : IPPROTO_UDP; 495 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP;
495 int version = (nfs_data.flags & NFS_MOUNT_VER3) ? 496 int version = (nfs_data.flags & NFS_MOUNT_VER3) ?
496 NFS_MNT3_VERSION : NFS_MNT_VERSION; 497 NFS_MNT3_VERSION : NFS_MNT_VERSION;
497 498
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 845cdde1d8b7..97669ed05500 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -476,6 +476,8 @@ nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
476 dprintk("NFS call readdir %d\n", (unsigned int)cookie); 476 dprintk("NFS call readdir %d\n", (unsigned int)cookie);
477 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 477 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
478 478
479 nfs_invalidate_atime(dir);
480
479 dprintk("NFS reply readdir: %d\n", status); 481 dprintk("NFS reply readdir: %d\n", status);
480 return status; 482 return status;
481} 483}
@@ -550,6 +552,7 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
550 552
551static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) 553static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
552{ 554{
555 nfs_invalidate_atime(data->inode);
553 if (task->tk_status >= 0) { 556 if (task->tk_status >= 0) {
554 nfs_refresh_inode(data->inode, data->res.fattr); 557 nfs_refresh_inode(data->inode, data->res.fattr);
555 /* Emulate the eof flag, which isn't normally needed in NFSv2 558 /* Emulate the eof flag, which isn't normally needed in NFSv2
@@ -576,7 +579,7 @@ static void nfs_proc_read_setup(struct nfs_read_data *data)
576static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) 579static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
577{ 580{
578 if (task->tk_status >= 0) 581 if (task->tk_status >= 0)
579 nfs_post_op_update_inode(data->inode, data->res.fattr); 582 nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr);
580 return 0; 583 return 0;
581} 584}
582 585
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 19e05633f4e3..4587a86adaac 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -341,9 +341,6 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)
341 set_bit(NFS_INO_STALE, &NFS_FLAGS(data->inode)); 341 set_bit(NFS_INO_STALE, &NFS_FLAGS(data->inode));
342 nfs_mark_for_revalidate(data->inode); 342 nfs_mark_for_revalidate(data->inode);
343 } 343 }
344 spin_lock(&data->inode->i_lock);
345 NFS_I(data->inode)->cache_validity |= NFS_INO_INVALID_ATIME;
346 spin_unlock(&data->inode->i_lock);
347 return 0; 344 return 0;
348} 345}
349 346
@@ -497,8 +494,7 @@ int nfs_readpage(struct file *file, struct page *page)
497 if (ctx == NULL) 494 if (ctx == NULL)
498 goto out_unlock; 495 goto out_unlock;
499 } else 496 } else
500 ctx = get_nfs_open_context((struct nfs_open_context *) 497 ctx = get_nfs_open_context(nfs_file_open_context(file));
501 file->private_data);
502 498
503 error = nfs_readpage_async(ctx, inode, page); 499 error = nfs_readpage_async(ctx, inode, page);
504 500
@@ -576,8 +572,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
576 if (desc.ctx == NULL) 572 if (desc.ctx == NULL)
577 return -EBADF; 573 return -EBADF;
578 } else 574 } else
579 desc.ctx = get_nfs_open_context((struct nfs_open_context *) 575 desc.ctx = get_nfs_open_context(nfs_file_open_context(filp));
580 filp->private_data);
581 if (rsize < PAGE_CACHE_SIZE) 576 if (rsize < PAGE_CACHE_SIZE)
582 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); 577 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
583 else 578 else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b878528b64c1..fa517ae9207f 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -33,6 +33,8 @@
33#include <linux/sunrpc/clnt.h> 33#include <linux/sunrpc/clnt.h>
34#include <linux/sunrpc/stats.h> 34#include <linux/sunrpc/stats.h>
35#include <linux/sunrpc/metrics.h> 35#include <linux/sunrpc/metrics.h>
36#include <linux/sunrpc/xprtsock.h>
37#include <linux/sunrpc/xprtrdma.h>
36#include <linux/nfs_fs.h> 38#include <linux/nfs_fs.h>
37#include <linux/nfs_mount.h> 39#include <linux/nfs_mount.h>
38#include <linux/nfs4_mount.h> 40#include <linux/nfs4_mount.h>
@@ -58,36 +60,6 @@
58 60
59#define NFSDBG_FACILITY NFSDBG_VFS 61#define NFSDBG_FACILITY NFSDBG_VFS
60 62
61
62struct nfs_parsed_mount_data {
63 int flags;
64 int rsize, wsize;
65 int timeo, retrans;
66 int acregmin, acregmax,
67 acdirmin, acdirmax;
68 int namlen;
69 unsigned int bsize;
70 unsigned int auth_flavor_len;
71 rpc_authflavor_t auth_flavors[1];
72 char *client_address;
73
74 struct {
75 struct sockaddr_in address;
76 unsigned int program;
77 unsigned int version;
78 unsigned short port;
79 int protocol;
80 } mount_server;
81
82 struct {
83 struct sockaddr_in address;
84 char *hostname;
85 char *export_path;
86 unsigned int program;
87 int protocol;
88 } nfs_server;
89};
90
91enum { 63enum {
92 /* Mount options that take no arguments */ 64 /* Mount options that take no arguments */
93 Opt_soft, Opt_hard, 65 Opt_soft, Opt_hard,
@@ -97,7 +69,7 @@ enum {
97 Opt_ac, Opt_noac, 69 Opt_ac, Opt_noac,
98 Opt_lock, Opt_nolock, 70 Opt_lock, Opt_nolock,
99 Opt_v2, Opt_v3, 71 Opt_v2, Opt_v3,
100 Opt_udp, Opt_tcp, 72 Opt_udp, Opt_tcp, Opt_rdma,
101 Opt_acl, Opt_noacl, 73 Opt_acl, Opt_noacl,
102 Opt_rdirplus, Opt_nordirplus, 74 Opt_rdirplus, Opt_nordirplus,
103 Opt_sharecache, Opt_nosharecache, 75 Opt_sharecache, Opt_nosharecache,
@@ -116,7 +88,7 @@ enum {
116 88
117 /* Mount options that take string arguments */ 89 /* Mount options that take string arguments */
118 Opt_sec, Opt_proto, Opt_mountproto, 90 Opt_sec, Opt_proto, Opt_mountproto,
119 Opt_addr, Opt_mounthost, Opt_clientaddr, 91 Opt_addr, Opt_mountaddr, Opt_clientaddr,
120 92
121 /* Mount options that are ignored */ 93 /* Mount options that are ignored */
122 Opt_userspace, Opt_deprecated, 94 Opt_userspace, Opt_deprecated,
@@ -143,6 +115,7 @@ static match_table_t nfs_mount_option_tokens = {
143 { Opt_v3, "v3" }, 115 { Opt_v3, "v3" },
144 { Opt_udp, "udp" }, 116 { Opt_udp, "udp" },
145 { Opt_tcp, "tcp" }, 117 { Opt_tcp, "tcp" },
118 { Opt_rdma, "rdma" },
146 { Opt_acl, "acl" }, 119 { Opt_acl, "acl" },
147 { Opt_noacl, "noacl" }, 120 { Opt_noacl, "noacl" },
148 { Opt_rdirplus, "rdirplus" }, 121 { Opt_rdirplus, "rdirplus" },
@@ -175,13 +148,14 @@ static match_table_t nfs_mount_option_tokens = {
175 { Opt_mountproto, "mountproto=%s" }, 148 { Opt_mountproto, "mountproto=%s" },
176 { Opt_addr, "addr=%s" }, 149 { Opt_addr, "addr=%s" },
177 { Opt_clientaddr, "clientaddr=%s" }, 150 { Opt_clientaddr, "clientaddr=%s" },
178 { Opt_mounthost, "mounthost=%s" }, 151 { Opt_userspace, "mounthost=%s" },
152 { Opt_mountaddr, "mountaddr=%s" },
179 153
180 { Opt_err, NULL } 154 { Opt_err, NULL }
181}; 155};
182 156
183enum { 157enum {
184 Opt_xprt_udp, Opt_xprt_tcp, 158 Opt_xprt_udp, Opt_xprt_tcp, Opt_xprt_rdma,
185 159
186 Opt_xprt_err 160 Opt_xprt_err
187}; 161};
@@ -189,6 +163,7 @@ enum {
189static match_table_t nfs_xprt_protocol_tokens = { 163static match_table_t nfs_xprt_protocol_tokens = {
190 { Opt_xprt_udp, "udp" }, 164 { Opt_xprt_udp, "udp" },
191 { Opt_xprt_tcp, "tcp" }, 165 { Opt_xprt_tcp, "tcp" },
166 { Opt_xprt_rdma, "rdma" },
192 167
193 { Opt_xprt_err, NULL } 168 { Opt_xprt_err, NULL }
194}; 169};
@@ -449,7 +424,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
449 const char *nostr; 424 const char *nostr;
450 } nfs_info[] = { 425 } nfs_info[] = {
451 { NFS_MOUNT_SOFT, ",soft", ",hard" }, 426 { NFS_MOUNT_SOFT, ",soft", ",hard" },
452 { NFS_MOUNT_INTR, ",intr", "" }, 427 { NFS_MOUNT_INTR, ",intr", ",nointr" },
453 { NFS_MOUNT_NOCTO, ",nocto", "" }, 428 { NFS_MOUNT_NOCTO, ",nocto", "" },
454 { NFS_MOUNT_NOAC, ",noac", "" }, 429 { NFS_MOUNT_NOAC, ",noac", "" },
455 { NFS_MOUNT_NONLM, ",nolock", "" }, 430 { NFS_MOUNT_NONLM, ",nolock", "" },
@@ -460,8 +435,6 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
460 }; 435 };
461 const struct proc_nfs_info *nfs_infop; 436 const struct proc_nfs_info *nfs_infop;
462 struct nfs_client *clp = nfss->nfs_client; 437 struct nfs_client *clp = nfss->nfs_client;
463 char buf[12];
464 const char *proto;
465 438
466 seq_printf(m, ",vers=%d", clp->rpc_ops->version); 439 seq_printf(m, ",vers=%d", clp->rpc_ops->version);
467 seq_printf(m, ",rsize=%d", nfss->rsize); 440 seq_printf(m, ",rsize=%d", nfss->rsize);
@@ -480,18 +453,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
480 else 453 else
481 seq_puts(m, nfs_infop->nostr); 454 seq_puts(m, nfs_infop->nostr);
482 } 455 }
483 switch (nfss->client->cl_xprt->prot) { 456 seq_printf(m, ",proto=%s",
484 case IPPROTO_TCP: 457 rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO));
485 proto = "tcp";
486 break;
487 case IPPROTO_UDP:
488 proto = "udp";
489 break;
490 default:
491 snprintf(buf, sizeof(buf), "%u", nfss->client->cl_xprt->prot);
492 proto = buf;
493 }
494 seq_printf(m, ",proto=%s", proto);
495 seq_printf(m, ",timeo=%lu", 10U * clp->retrans_timeo / HZ); 458 seq_printf(m, ",timeo=%lu", 10U * clp->retrans_timeo / HZ);
496 seq_printf(m, ",retrans=%u", clp->retrans_count); 459 seq_printf(m, ",retrans=%u", clp->retrans_count);
497 seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor)); 460 seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor));
@@ -506,8 +469,8 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
506 469
507 nfs_show_mount_options(m, nfss, 0); 470 nfs_show_mount_options(m, nfss, 0);
508 471
509 seq_puts(m, ",addr="); 472 seq_printf(m, ",addr="NIPQUAD_FMT,
510 seq_escape(m, nfss->nfs_client->cl_hostname, " \t\n\\"); 473 NIPQUAD(nfss->nfs_client->cl_addr.sin_addr));
511 474
512 return 0; 475 return 0;
513} 476}
@@ -698,13 +661,19 @@ static int nfs_parse_mount_options(char *raw,
698 break; 661 break;
699 case Opt_udp: 662 case Opt_udp:
700 mnt->flags &= ~NFS_MOUNT_TCP; 663 mnt->flags &= ~NFS_MOUNT_TCP;
701 mnt->nfs_server.protocol = IPPROTO_UDP; 664 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
702 mnt->timeo = 7; 665 mnt->timeo = 7;
703 mnt->retrans = 5; 666 mnt->retrans = 5;
704 break; 667 break;
705 case Opt_tcp: 668 case Opt_tcp:
706 mnt->flags |= NFS_MOUNT_TCP; 669 mnt->flags |= NFS_MOUNT_TCP;
707 mnt->nfs_server.protocol = IPPROTO_TCP; 670 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
671 mnt->timeo = 600;
672 mnt->retrans = 2;
673 break;
674 case Opt_rdma:
675 mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */
676 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
708 mnt->timeo = 600; 677 mnt->timeo = 600;
709 mnt->retrans = 2; 678 mnt->retrans = 2;
710 break; 679 break;
@@ -913,13 +882,20 @@ static int nfs_parse_mount_options(char *raw,
913 switch (token) { 882 switch (token) {
914 case Opt_xprt_udp: 883 case Opt_xprt_udp:
915 mnt->flags &= ~NFS_MOUNT_TCP; 884 mnt->flags &= ~NFS_MOUNT_TCP;
916 mnt->nfs_server.protocol = IPPROTO_UDP; 885 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
917 mnt->timeo = 7; 886 mnt->timeo = 7;
918 mnt->retrans = 5; 887 mnt->retrans = 5;
919 break; 888 break;
920 case Opt_xprt_tcp: 889 case Opt_xprt_tcp:
921 mnt->flags |= NFS_MOUNT_TCP; 890 mnt->flags |= NFS_MOUNT_TCP;
922 mnt->nfs_server.protocol = IPPROTO_TCP; 891 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
892 mnt->timeo = 600;
893 mnt->retrans = 2;
894 break;
895 case Opt_xprt_rdma:
896 /* vector side protocols to TCP */
897 mnt->flags |= NFS_MOUNT_TCP;
898 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
923 mnt->timeo = 600; 899 mnt->timeo = 600;
924 mnt->retrans = 2; 900 mnt->retrans = 2;
925 break; 901 break;
@@ -937,11 +913,12 @@ static int nfs_parse_mount_options(char *raw,
937 913
938 switch (token) { 914 switch (token) {
939 case Opt_xprt_udp: 915 case Opt_xprt_udp:
940 mnt->mount_server.protocol = IPPROTO_UDP; 916 mnt->mount_server.protocol = XPRT_TRANSPORT_UDP;
941 break; 917 break;
942 case Opt_xprt_tcp: 918 case Opt_xprt_tcp:
943 mnt->mount_server.protocol = IPPROTO_TCP; 919 mnt->mount_server.protocol = XPRT_TRANSPORT_TCP;
944 break; 920 break;
921 case Opt_xprt_rdma: /* not used for side protocols */
945 default: 922 default:
946 goto out_unrec_xprt; 923 goto out_unrec_xprt;
947 } 924 }
@@ -961,7 +938,7 @@ static int nfs_parse_mount_options(char *raw,
961 goto out_nomem; 938 goto out_nomem;
962 mnt->client_address = string; 939 mnt->client_address = string;
963 break; 940 break;
964 case Opt_mounthost: 941 case Opt_mountaddr:
965 string = match_strdup(args); 942 string = match_strdup(args);
966 if (string == NULL) 943 if (string == NULL)
967 goto out_nomem; 944 goto out_nomem;
@@ -1027,16 +1004,10 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1027 sin = args->mount_server.address; 1004 sin = args->mount_server.address;
1028 else 1005 else
1029 sin = args->nfs_server.address; 1006 sin = args->nfs_server.address;
1030 if (args->mount_server.port == 0) { 1007 /*
1031 status = rpcb_getport_sync(&sin, 1008 * autobind will be used if mount_server.port == 0
1032 args->mount_server.program, 1009 */
1033 args->mount_server.version, 1010 sin.sin_port = htons(args->mount_server.port);
1034 args->mount_server.protocol);
1035 if (status < 0)
1036 goto out_err;
1037 sin.sin_port = htons(status);
1038 } else
1039 sin.sin_port = htons(args->mount_server.port);
1040 1011
1041 /* 1012 /*
1042 * Now ask the mount server to map our export path 1013 * Now ask the mount server to map our export path
@@ -1049,14 +1020,11 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1049 args->mount_server.version, 1020 args->mount_server.version,
1050 args->mount_server.protocol, 1021 args->mount_server.protocol,
1051 root_fh); 1022 root_fh);
1052 if (status < 0) 1023 if (status == 0)
1053 goto out_err; 1024 return 0;
1054
1055 return status;
1056 1025
1057out_err: 1026 dfprintk(MOUNT, "NFS: unable to mount server " NIPQUAD_FMT
1058 dfprintk(MOUNT, "NFS: unable to contact server on host " 1027 ", error %d\n", NIPQUAD(sin.sin_addr.s_addr), status);
1059 NIPQUAD_FMT "\n", NIPQUAD(sin.sin_addr.s_addr));
1060 return status; 1028 return status;
1061} 1029}
1062 1030
@@ -1079,15 +1047,31 @@ out_err:
1079 * XXX: as far as I can tell, changing the NFS program number is not 1047 * XXX: as far as I can tell, changing the NFS program number is not
1080 * supported in the NFS client. 1048 * supported in the NFS client.
1081 */ 1049 */
1082static int nfs_validate_mount_data(struct nfs_mount_data **options, 1050static int nfs_validate_mount_data(void *options,
1051 struct nfs_parsed_mount_data *args,
1083 struct nfs_fh *mntfh, 1052 struct nfs_fh *mntfh,
1084 const char *dev_name) 1053 const char *dev_name)
1085{ 1054{
1086 struct nfs_mount_data *data = *options; 1055 struct nfs_mount_data *data = (struct nfs_mount_data *)options;
1087 1056
1088 if (data == NULL) 1057 if (data == NULL)
1089 goto out_no_data; 1058 goto out_no_data;
1090 1059
1060 memset(args, 0, sizeof(*args));
1061 args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
1062 args->rsize = NFS_MAX_FILE_IO_SIZE;
1063 args->wsize = NFS_MAX_FILE_IO_SIZE;
1064 args->timeo = 600;
1065 args->retrans = 2;
1066 args->acregmin = 3;
1067 args->acregmax = 60;
1068 args->acdirmin = 30;
1069 args->acdirmax = 60;
1070 args->mount_server.protocol = XPRT_TRANSPORT_UDP;
1071 args->mount_server.program = NFS_MNT_PROGRAM;
1072 args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1073 args->nfs_server.program = NFS_PROGRAM;
1074
1091 switch (data->version) { 1075 switch (data->version) {
1092 case 1: 1076 case 1:
1093 data->namlen = 0; 1077 data->namlen = 0;
@@ -1116,92 +1100,73 @@ static int nfs_validate_mount_data(struct nfs_mount_data **options,
1116 if (mntfh->size < sizeof(mntfh->data)) 1100 if (mntfh->size < sizeof(mntfh->data))
1117 memset(mntfh->data + mntfh->size, 0, 1101 memset(mntfh->data + mntfh->size, 0,
1118 sizeof(mntfh->data) - mntfh->size); 1102 sizeof(mntfh->data) - mntfh->size);
1103
1104 if (!nfs_verify_server_address((struct sockaddr *) &data->addr))
1105 goto out_no_address;
1106
1107 /*
1108 * Translate to nfs_parsed_mount_data, which nfs_fill_super
1109 * can deal with.
1110 */
1111 args->flags = data->flags;
1112 args->rsize = data->rsize;
1113 args->wsize = data->wsize;
1114 args->flags = data->flags;
1115 args->timeo = data->timeo;
1116 args->retrans = data->retrans;
1117 args->acregmin = data->acregmin;
1118 args->acregmax = data->acregmax;
1119 args->acdirmin = data->acdirmin;
1120 args->acdirmax = data->acdirmax;
1121 args->nfs_server.address = data->addr;
1122 if (!(data->flags & NFS_MOUNT_TCP))
1123 args->nfs_server.protocol = XPRT_TRANSPORT_UDP;
1124 /* N.B. caller will free nfs_server.hostname in all cases */
1125 args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL);
1126 args->namlen = data->namlen;
1127 args->bsize = data->bsize;
1128 args->auth_flavors[0] = data->pseudoflavor;
1119 break; 1129 break;
1120 default: { 1130 default: {
1121 unsigned int len; 1131 unsigned int len;
1122 char *c; 1132 char *c;
1123 int status; 1133 int status;
1124 struct nfs_parsed_mount_data args = {
1125 .flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP),
1126 .rsize = NFS_MAX_FILE_IO_SIZE,
1127 .wsize = NFS_MAX_FILE_IO_SIZE,
1128 .timeo = 600,
1129 .retrans = 2,
1130 .acregmin = 3,
1131 .acregmax = 60,
1132 .acdirmin = 30,
1133 .acdirmax = 60,
1134 .mount_server.protocol = IPPROTO_UDP,
1135 .mount_server.program = NFS_MNT_PROGRAM,
1136 .nfs_server.protocol = IPPROTO_TCP,
1137 .nfs_server.program = NFS_PROGRAM,
1138 };
1139
1140 if (nfs_parse_mount_options((char *) *options, &args) == 0)
1141 return -EINVAL;
1142 1134
1143 data = kzalloc(sizeof(*data), GFP_KERNEL); 1135 if (nfs_parse_mount_options((char *)options, args) == 0)
1144 if (data == NULL) 1136 return -EINVAL;
1145 return -ENOMEM;
1146 1137
1147 /* 1138 if (!nfs_verify_server_address((struct sockaddr *)
1148 * NB: after this point, caller will free "data" 1139 &args->nfs_server.address))
1149 * if we return an error 1140 goto out_no_address;
1150 */
1151 *options = data;
1152 1141
1153 c = strchr(dev_name, ':'); 1142 c = strchr(dev_name, ':');
1154 if (c == NULL) 1143 if (c == NULL)
1155 return -EINVAL; 1144 return -EINVAL;
1156 len = c - dev_name; 1145 len = c - dev_name;
1157 if (len > sizeof(data->hostname)) 1146 /* N.B. caller will free nfs_server.hostname in all cases */
1158 return -ENAMETOOLONG; 1147 args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
1159 strncpy(data->hostname, dev_name, len);
1160 args.nfs_server.hostname = data->hostname;
1161 1148
1162 c++; 1149 c++;
1163 if (strlen(c) > NFS_MAXPATHLEN) 1150 if (strlen(c) > NFS_MAXPATHLEN)
1164 return -ENAMETOOLONG; 1151 return -ENAMETOOLONG;
1165 args.nfs_server.export_path = c; 1152 args->nfs_server.export_path = c;
1166 1153
1167 status = nfs_try_mount(&args, mntfh); 1154 status = nfs_try_mount(args, mntfh);
1168 if (status) 1155 if (status)
1169 return status; 1156 return status;
1170 1157
1171 /*
1172 * Translate to nfs_mount_data, which nfs_fill_super
1173 * can deal with.
1174 */
1175 data->version = 6;
1176 data->flags = args.flags;
1177 data->rsize = args.rsize;
1178 data->wsize = args.wsize;
1179 data->timeo = args.timeo;
1180 data->retrans = args.retrans;
1181 data->acregmin = args.acregmin;
1182 data->acregmax = args.acregmax;
1183 data->acdirmin = args.acdirmin;
1184 data->acdirmax = args.acdirmax;
1185 data->addr = args.nfs_server.address;
1186 data->namlen = args.namlen;
1187 data->bsize = args.bsize;
1188 data->pseudoflavor = args.auth_flavors[0];
1189
1190 break; 1158 break;
1191 } 1159 }
1192 } 1160 }
1193 1161
1194 if (!(data->flags & NFS_MOUNT_SECFLAVOUR)) 1162 if (!(args->flags & NFS_MOUNT_SECFLAVOUR))
1195 data->pseudoflavor = RPC_AUTH_UNIX; 1163 args->auth_flavors[0] = RPC_AUTH_UNIX;
1196 1164
1197#ifndef CONFIG_NFS_V3 1165#ifndef CONFIG_NFS_V3
1198 if (data->flags & NFS_MOUNT_VER3) 1166 if (args->flags & NFS_MOUNT_VER3)
1199 goto out_v3_not_compiled; 1167 goto out_v3_not_compiled;
1200#endif /* !CONFIG_NFS_V3 */ 1168#endif /* !CONFIG_NFS_V3 */
1201 1169
1202 if (!nfs_verify_server_address((struct sockaddr *) &data->addr))
1203 goto out_no_address;
1204
1205 return 0; 1170 return 0;
1206 1171
1207out_no_data: 1172out_no_data:
@@ -1258,7 +1223,8 @@ static inline void nfs_initialise_sb(struct super_block *sb)
1258/* 1223/*
1259 * Finish setting up an NFS2/3 superblock 1224 * Finish setting up an NFS2/3 superblock
1260 */ 1225 */
1261static void nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data) 1226static void nfs_fill_super(struct super_block *sb,
1227 struct nfs_parsed_mount_data *data)
1262{ 1228{
1263 struct nfs_server *server = NFS_SB(sb); 1229 struct nfs_server *server = NFS_SB(sb);
1264 1230
@@ -1379,7 +1345,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
1379 struct nfs_server *server = NULL; 1345 struct nfs_server *server = NULL;
1380 struct super_block *s; 1346 struct super_block *s;
1381 struct nfs_fh mntfh; 1347 struct nfs_fh mntfh;
1382 struct nfs_mount_data *data = raw_data; 1348 struct nfs_parsed_mount_data data;
1383 struct dentry *mntroot; 1349 struct dentry *mntroot;
1384 int (*compare_super)(struct super_block *, void *) = nfs_compare_super; 1350 int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
1385 struct nfs_sb_mountdata sb_mntdata = { 1351 struct nfs_sb_mountdata sb_mntdata = {
@@ -1388,12 +1354,12 @@ static int nfs_get_sb(struct file_system_type *fs_type,
1388 int error; 1354 int error;
1389 1355
1390 /* Validate the mount data */ 1356 /* Validate the mount data */
1391 error = nfs_validate_mount_data(&data, &mntfh, dev_name); 1357 error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name);
1392 if (error < 0) 1358 if (error < 0)
1393 goto out; 1359 goto out;
1394 1360
1395 /* Get a volume representation */ 1361 /* Get a volume representation */
1396 server = nfs_create_server(data, &mntfh); 1362 server = nfs_create_server(&data, &mntfh);
1397 if (IS_ERR(server)) { 1363 if (IS_ERR(server)) {
1398 error = PTR_ERR(server); 1364 error = PTR_ERR(server);
1399 goto out; 1365 goto out;
@@ -1417,7 +1383,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
1417 1383
1418 if (!s->s_root) { 1384 if (!s->s_root) {
1419 /* initial superblock/root creation */ 1385 /* initial superblock/root creation */
1420 nfs_fill_super(s, data); 1386 nfs_fill_super(s, &data);
1421 } 1387 }
1422 1388
1423 mntroot = nfs_get_root(s, &mntfh); 1389 mntroot = nfs_get_root(s, &mntfh);
@@ -1432,8 +1398,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
1432 error = 0; 1398 error = 0;
1433 1399
1434out: 1400out:
1435 if (data != raw_data) 1401 kfree(data.nfs_server.hostname);
1436 kfree(data);
1437 return error; 1402 return error;
1438 1403
1439out_err_nosb: 1404out_err_nosb:
@@ -1559,38 +1524,49 @@ static void nfs4_fill_super(struct super_block *sb)
1559/* 1524/*
1560 * Validate NFSv4 mount options 1525 * Validate NFSv4 mount options
1561 */ 1526 */
1562static int nfs4_validate_mount_data(struct nfs4_mount_data **options, 1527static int nfs4_validate_mount_data(void *options,
1563 const char *dev_name, 1528 struct nfs_parsed_mount_data *args,
1564 struct sockaddr_in *addr, 1529 const char *dev_name)
1565 rpc_authflavor_t *authflavour,
1566 char **hostname,
1567 char **mntpath,
1568 char **ip_addr)
1569{ 1530{
1570 struct nfs4_mount_data *data = *options; 1531 struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
1571 char *c; 1532 char *c;
1572 1533
1573 if (data == NULL) 1534 if (data == NULL)
1574 goto out_no_data; 1535 goto out_no_data;
1575 1536
1537 memset(args, 0, sizeof(*args));
1538 args->rsize = NFS_MAX_FILE_IO_SIZE;
1539 args->wsize = NFS_MAX_FILE_IO_SIZE;
1540 args->timeo = 600;
1541 args->retrans = 2;
1542 args->acregmin = 3;
1543 args->acregmax = 60;
1544 args->acdirmin = 30;
1545 args->acdirmax = 60;
1546 args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1547
1576 switch (data->version) { 1548 switch (data->version) {
1577 case 1: 1549 case 1:
1578 if (data->host_addrlen != sizeof(*addr)) 1550 if (data->host_addrlen != sizeof(args->nfs_server.address))
1579 goto out_no_address; 1551 goto out_no_address;
1580 if (copy_from_user(addr, data->host_addr, sizeof(*addr))) 1552 if (copy_from_user(&args->nfs_server.address,
1553 data->host_addr,
1554 sizeof(args->nfs_server.address)))
1581 return -EFAULT; 1555 return -EFAULT;
1582 if (addr->sin_port == 0) 1556 if (args->nfs_server.address.sin_port == 0)
1583 addr->sin_port = htons(NFS_PORT); 1557 args->nfs_server.address.sin_port = htons(NFS_PORT);
1584 if (!nfs_verify_server_address((struct sockaddr *) addr)) 1558 if (!nfs_verify_server_address((struct sockaddr *)
1559 &args->nfs_server.address))
1585 goto out_no_address; 1560 goto out_no_address;
1586 1561
1587 switch (data->auth_flavourlen) { 1562 switch (data->auth_flavourlen) {
1588 case 0: 1563 case 0:
1589 *authflavour = RPC_AUTH_UNIX; 1564 args->auth_flavors[0] = RPC_AUTH_UNIX;
1590 break; 1565 break;
1591 case 1: 1566 case 1:
1592 if (copy_from_user(authflavour, data->auth_flavours, 1567 if (copy_from_user(&args->auth_flavors[0],
1593 sizeof(*authflavour))) 1568 data->auth_flavours,
1569 sizeof(args->auth_flavors[0])))
1594 return -EFAULT; 1570 return -EFAULT;
1595 break; 1571 break;
1596 default: 1572 default:
@@ -1600,75 +1576,57 @@ static int nfs4_validate_mount_data(struct nfs4_mount_data **options,
1600 c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); 1576 c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
1601 if (IS_ERR(c)) 1577 if (IS_ERR(c))
1602 return PTR_ERR(c); 1578 return PTR_ERR(c);
1603 *hostname = c; 1579 args->nfs_server.hostname = c;
1604 1580
1605 c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN); 1581 c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN);
1606 if (IS_ERR(c)) 1582 if (IS_ERR(c))
1607 return PTR_ERR(c); 1583 return PTR_ERR(c);
1608 *mntpath = c; 1584 args->nfs_server.export_path = c;
1609 dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *mntpath); 1585 dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c);
1610 1586
1611 c = strndup_user(data->client_addr.data, 16); 1587 c = strndup_user(data->client_addr.data, 16);
1612 if (IS_ERR(c)) 1588 if (IS_ERR(c))
1613 return PTR_ERR(c); 1589 return PTR_ERR(c);
1614 *ip_addr = c; 1590 args->client_address = c;
1591
1592 /*
1593 * Translate to nfs_parsed_mount_data, which nfs4_fill_super
1594 * can deal with.
1595 */
1596
1597 args->flags = data->flags & NFS4_MOUNT_FLAGMASK;
1598 args->rsize = data->rsize;
1599 args->wsize = data->wsize;
1600 args->timeo = data->timeo;
1601 args->retrans = data->retrans;
1602 args->acregmin = data->acregmin;
1603 args->acregmax = data->acregmax;
1604 args->acdirmin = data->acdirmin;
1605 args->acdirmax = data->acdirmax;
1606 args->nfs_server.protocol = data->proto;
1615 1607
1616 break; 1608 break;
1617 default: { 1609 default: {
1618 unsigned int len; 1610 unsigned int len;
1619 struct nfs_parsed_mount_data args = { 1611
1620 .rsize = NFS_MAX_FILE_IO_SIZE, 1612 if (nfs_parse_mount_options((char *)options, args) == 0)
1621 .wsize = NFS_MAX_FILE_IO_SIZE,
1622 .timeo = 600,
1623 .retrans = 2,
1624 .acregmin = 3,
1625 .acregmax = 60,
1626 .acdirmin = 30,
1627 .acdirmax = 60,
1628 .nfs_server.protocol = IPPROTO_TCP,
1629 };
1630
1631 if (nfs_parse_mount_options((char *) *options, &args) == 0)
1632 return -EINVAL; 1613 return -EINVAL;
1633 1614
1634 if (!nfs_verify_server_address((struct sockaddr *) 1615 if (!nfs_verify_server_address((struct sockaddr *)
1635 &args.nfs_server.address)) 1616 &args->nfs_server.address))
1636 return -EINVAL; 1617 return -EINVAL;
1637 *addr = args.nfs_server.address;
1638 1618
1639 switch (args.auth_flavor_len) { 1619 switch (args->auth_flavor_len) {
1640 case 0: 1620 case 0:
1641 *authflavour = RPC_AUTH_UNIX; 1621 args->auth_flavors[0] = RPC_AUTH_UNIX;
1642 break; 1622 break;
1643 case 1: 1623 case 1:
1644 *authflavour = (rpc_authflavor_t) args.auth_flavors[0];
1645 break; 1624 break;
1646 default: 1625 default:
1647 goto out_inval_auth; 1626 goto out_inval_auth;
1648 } 1627 }
1649 1628
1650 /* 1629 /*
1651 * Translate to nfs4_mount_data, which nfs4_fill_super
1652 * can deal with.
1653 */
1654 data = kzalloc(sizeof(*data), GFP_KERNEL);
1655 if (data == NULL)
1656 return -ENOMEM;
1657 *options = data;
1658
1659 data->version = 1;
1660 data->flags = args.flags & NFS4_MOUNT_FLAGMASK;
1661 data->rsize = args.rsize;
1662 data->wsize = args.wsize;
1663 data->timeo = args.timeo;
1664 data->retrans = args.retrans;
1665 data->acregmin = args.acregmin;
1666 data->acregmax = args.acregmax;
1667 data->acdirmin = args.acdirmin;
1668 data->acdirmax = args.acdirmax;
1669 data->proto = args.nfs_server.protocol;
1670
1671 /*
1672 * Split "dev_name" into "hostname:mntpath". 1630 * Split "dev_name" into "hostname:mntpath".
1673 */ 1631 */
1674 c = strchr(dev_name, ':'); 1632 c = strchr(dev_name, ':');
@@ -1678,27 +1636,25 @@ static int nfs4_validate_mount_data(struct nfs4_mount_data **options,
1678 len = c - dev_name; 1636 len = c - dev_name;
1679 if (len > NFS4_MAXNAMLEN) 1637 if (len > NFS4_MAXNAMLEN)
1680 return -ENAMETOOLONG; 1638 return -ENAMETOOLONG;
1681 *hostname = kzalloc(len, GFP_KERNEL); 1639 args->nfs_server.hostname = kzalloc(len, GFP_KERNEL);
1682 if (*hostname == NULL) 1640 if (args->nfs_server.hostname == NULL)
1683 return -ENOMEM; 1641 return -ENOMEM;
1684 strncpy(*hostname, dev_name, len - 1); 1642 strncpy(args->nfs_server.hostname, dev_name, len - 1);
1685 1643
1686 c++; /* step over the ':' */ 1644 c++; /* step over the ':' */
1687 len = strlen(c); 1645 len = strlen(c);
1688 if (len > NFS4_MAXPATHLEN) 1646 if (len > NFS4_MAXPATHLEN)
1689 return -ENAMETOOLONG; 1647 return -ENAMETOOLONG;
1690 *mntpath = kzalloc(len + 1, GFP_KERNEL); 1648 args->nfs_server.export_path = kzalloc(len + 1, GFP_KERNEL);
1691 if (*mntpath == NULL) 1649 if (args->nfs_server.export_path == NULL)
1692 return -ENOMEM; 1650 return -ENOMEM;
1693 strncpy(*mntpath, c, len); 1651 strncpy(args->nfs_server.export_path, c, len);
1694 1652
1695 dprintk("MNTPATH: %s\n", *mntpath); 1653 dprintk("MNTPATH: %s\n", args->nfs_server.export_path);
1696 1654
1697 if (args.client_address == NULL) 1655 if (args->client_address == NULL)
1698 goto out_no_client_address; 1656 goto out_no_client_address;
1699 1657
1700 *ip_addr = args.client_address;
1701
1702 break; 1658 break;
1703 } 1659 }
1704 } 1660 }
@@ -1729,14 +1685,11 @@ out_no_client_address:
1729static int nfs4_get_sb(struct file_system_type *fs_type, 1685static int nfs4_get_sb(struct file_system_type *fs_type,
1730 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 1686 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
1731{ 1687{
1732 struct nfs4_mount_data *data = raw_data; 1688 struct nfs_parsed_mount_data data;
1733 struct super_block *s; 1689 struct super_block *s;
1734 struct nfs_server *server; 1690 struct nfs_server *server;
1735 struct sockaddr_in addr;
1736 rpc_authflavor_t authflavour;
1737 struct nfs_fh mntfh; 1691 struct nfs_fh mntfh;
1738 struct dentry *mntroot; 1692 struct dentry *mntroot;
1739 char *mntpath = NULL, *hostname = NULL, *ip_addr = NULL;
1740 int (*compare_super)(struct super_block *, void *) = nfs_compare_super; 1693 int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
1741 struct nfs_sb_mountdata sb_mntdata = { 1694 struct nfs_sb_mountdata sb_mntdata = {
1742 .mntflags = flags, 1695 .mntflags = flags,
@@ -1744,14 +1697,12 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
1744 int error; 1697 int error;
1745 1698
1746 /* Validate the mount data */ 1699 /* Validate the mount data */
1747 error = nfs4_validate_mount_data(&data, dev_name, &addr, &authflavour, 1700 error = nfs4_validate_mount_data(raw_data, &data, dev_name);
1748 &hostname, &mntpath, &ip_addr);
1749 if (error < 0) 1701 if (error < 0)
1750 goto out; 1702 goto out;
1751 1703
1752 /* Get a volume representation */ 1704 /* Get a volume representation */
1753 server = nfs4_create_server(data, hostname, &addr, mntpath, ip_addr, 1705 server = nfs4_create_server(&data, &mntfh);
1754 authflavour, &mntfh);
1755 if (IS_ERR(server)) { 1706 if (IS_ERR(server)) {
1756 error = PTR_ERR(server); 1707 error = PTR_ERR(server);
1757 goto out; 1708 goto out;
@@ -1790,9 +1741,9 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
1790 error = 0; 1741 error = 0;
1791 1742
1792out: 1743out:
1793 kfree(ip_addr); 1744 kfree(data.client_address);
1794 kfree(mntpath); 1745 kfree(data.nfs_server.export_path);
1795 kfree(hostname); 1746 kfree(data.nfs_server.hostname);
1796 return error; 1747 return error;
1797 1748
1798out_free: 1749out_free:
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 045ab805c17f..1aed850d18f2 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -66,7 +66,6 @@ static void nfs_async_unlink_init(struct rpc_task *task, void *calldata)
66 .rpc_cred = data->cred, 66 .rpc_cred = data->cred,
67 }; 67 };
68 68
69 nfs_begin_data_update(dir);
70 NFS_PROTO(dir)->unlink_setup(&msg, dir); 69 NFS_PROTO(dir)->unlink_setup(&msg, dir);
71 rpc_call_setup(task, &msg, 0); 70 rpc_call_setup(task, &msg, 0);
72} 71}
@@ -84,8 +83,6 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
84 83
85 if (!NFS_PROTO(dir)->unlink_done(task, dir)) 84 if (!NFS_PROTO(dir)->unlink_done(task, dir))
86 rpc_restart_call(task); 85 rpc_restart_call(task);
87 else
88 nfs_end_data_update(dir);
89} 86}
90 87
91/** 88/**
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0d7a77cc394b..e2bb66c34406 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -110,6 +110,13 @@ void nfs_writedata_release(void *wdata)
110 nfs_writedata_free(wdata); 110 nfs_writedata_free(wdata);
111} 111}
112 112
113static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
114{
115 ctx->error = error;
116 smp_wmb();
117 set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
118}
119
113static struct nfs_page *nfs_page_find_request_locked(struct page *page) 120static struct nfs_page *nfs_page_find_request_locked(struct page *page)
114{ 121{
115 struct nfs_page *req = NULL; 122 struct nfs_page *req = NULL;
@@ -243,10 +250,7 @@ static void nfs_end_page_writeback(struct page *page)
243 250
244/* 251/*
245 * Find an associated nfs write request, and prepare to flush it out 252 * Find an associated nfs write request, and prepare to flush it out
246 * Returns 1 if there was no write request, or if the request was 253 * May return an error if the user signalled nfs_wait_on_request().
247 * already tagged by nfs_set_page_dirty.Returns 0 if the request
248 * was not tagged.
249 * May also return an error if the user signalled nfs_wait_on_request().
250 */ 254 */
251static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, 255static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
252 struct page *page) 256 struct page *page)
@@ -261,7 +265,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
261 req = nfs_page_find_request_locked(page); 265 req = nfs_page_find_request_locked(page);
262 if (req == NULL) { 266 if (req == NULL) {
263 spin_unlock(&inode->i_lock); 267 spin_unlock(&inode->i_lock);
264 return 1; 268 return 0;
265 } 269 }
266 if (nfs_lock_request_dontget(req)) 270 if (nfs_lock_request_dontget(req))
267 break; 271 break;
@@ -282,7 +286,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
282 spin_unlock(&inode->i_lock); 286 spin_unlock(&inode->i_lock);
283 nfs_unlock_request(req); 287 nfs_unlock_request(req);
284 nfs_pageio_complete(pgio); 288 nfs_pageio_complete(pgio);
285 return 1; 289 return 0;
286 } 290 }
287 if (nfs_set_page_writeback(page) != 0) { 291 if (nfs_set_page_writeback(page) != 0) {
288 spin_unlock(&inode->i_lock); 292 spin_unlock(&inode->i_lock);
@@ -290,70 +294,56 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
290 } 294 }
291 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, 295 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
292 NFS_PAGE_TAG_LOCKED); 296 NFS_PAGE_TAG_LOCKED);
293 ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
294 spin_unlock(&inode->i_lock); 297 spin_unlock(&inode->i_lock);
295 nfs_pageio_add_request(pgio, req); 298 nfs_pageio_add_request(pgio, req);
296 return ret; 299 return 0;
297} 300}
298 301
299/* 302static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
300 * Write an mmapped page to the server.
301 */
302static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
303{ 303{
304 struct nfs_pageio_descriptor mypgio, *pgio;
305 struct nfs_open_context *ctx;
306 struct inode *inode = page->mapping->host; 304 struct inode *inode = page->mapping->host;
307 unsigned offset;
308 int err;
309 305
310 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); 306 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
311 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); 307 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
312 308
313 if (wbc->for_writepages)
314 pgio = wbc->fs_private;
315 else {
316 nfs_pageio_init_write(&mypgio, inode, wb_priority(wbc));
317 pgio = &mypgio;
318 }
319
320 nfs_pageio_cond_complete(pgio, page->index); 309 nfs_pageio_cond_complete(pgio, page->index);
310 return nfs_page_async_flush(pgio, page);
311}
321 312
322 err = nfs_page_async_flush(pgio, page); 313/*
323 if (err <= 0) 314 * Write an mmapped page to the server.
324 goto out; 315 */
325 err = 0; 316static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
326 offset = nfs_page_length(page); 317{
327 if (!offset) 318 struct nfs_pageio_descriptor pgio;
328 goto out; 319 int err;
329
330 nfs_pageio_cond_complete(pgio, page->index);
331 320
332 ctx = nfs_find_open_context(inode, NULL, FMODE_WRITE); 321 nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc));
333 if (ctx == NULL) { 322 err = nfs_do_writepage(page, wbc, &pgio);
334 err = -EBADF; 323 nfs_pageio_complete(&pgio);
335 goto out; 324 if (err < 0)
336 } 325 return err;
337 err = nfs_writepage_setup(ctx, page, 0, offset); 326 if (pgio.pg_error < 0)
338 put_nfs_open_context(ctx); 327 return pgio.pg_error;
339 if (err != 0) 328 return 0;
340 goto out;
341 err = nfs_page_async_flush(pgio, page);
342 if (err > 0)
343 err = 0;
344out:
345 if (!wbc->for_writepages)
346 nfs_pageio_complete(pgio);
347 return err;
348} 329}
349 330
350int nfs_writepage(struct page *page, struct writeback_control *wbc) 331int nfs_writepage(struct page *page, struct writeback_control *wbc)
351{ 332{
352 int err; 333 int ret;
334
335 ret = nfs_writepage_locked(page, wbc);
336 unlock_page(page);
337 return ret;
338}
339
340static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data)
341{
342 int ret;
353 343
354 err = nfs_writepage_locked(page, wbc); 344 ret = nfs_do_writepage(page, wbc, data);
355 unlock_page(page); 345 unlock_page(page);
356 return err; 346 return ret;
357} 347}
358 348
359int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) 349int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
@@ -365,12 +355,11 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
365 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); 355 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
366 356
367 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); 357 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
368 wbc->fs_private = &pgio; 358 err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
369 err = generic_writepages(mapping, wbc);
370 nfs_pageio_complete(&pgio); 359 nfs_pageio_complete(&pgio);
371 if (err) 360 if (err < 0)
372 return err; 361 return err;
373 if (pgio.pg_error) 362 if (pgio.pg_error < 0)
374 return pgio.pg_error; 363 return pgio.pg_error;
375 return 0; 364 return 0;
376} 365}
@@ -389,14 +378,11 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
389 return error; 378 return error;
390 if (!nfsi->npages) { 379 if (!nfsi->npages) {
391 igrab(inode); 380 igrab(inode);
392 nfs_begin_data_update(inode);
393 if (nfs_have_delegation(inode, FMODE_WRITE)) 381 if (nfs_have_delegation(inode, FMODE_WRITE))
394 nfsi->change_attr++; 382 nfsi->change_attr++;
395 } 383 }
396 SetPagePrivate(req->wb_page); 384 SetPagePrivate(req->wb_page);
397 set_page_private(req->wb_page, (unsigned long)req); 385 set_page_private(req->wb_page, (unsigned long)req);
398 if (PageDirty(req->wb_page))
399 set_bit(PG_NEED_FLUSH, &req->wb_flags);
400 nfsi->npages++; 386 nfsi->npages++;
401 kref_get(&req->wb_kref); 387 kref_get(&req->wb_kref);
402 return 0; 388 return 0;
@@ -416,12 +402,9 @@ static void nfs_inode_remove_request(struct nfs_page *req)
416 set_page_private(req->wb_page, 0); 402 set_page_private(req->wb_page, 0);
417 ClearPagePrivate(req->wb_page); 403 ClearPagePrivate(req->wb_page);
418 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); 404 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
419 if (test_and_clear_bit(PG_NEED_FLUSH, &req->wb_flags))
420 __set_page_dirty_nobuffers(req->wb_page);
421 nfsi->npages--; 405 nfsi->npages--;
422 if (!nfsi->npages) { 406 if (!nfsi->npages) {
423 spin_unlock(&inode->i_lock); 407 spin_unlock(&inode->i_lock);
424 nfs_end_data_update(inode);
425 iput(inode); 408 iput(inode);
426 } else 409 } else
427 spin_unlock(&inode->i_lock); 410 spin_unlock(&inode->i_lock);
@@ -682,7 +665,7 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
682 665
683int nfs_flush_incompatible(struct file *file, struct page *page) 666int nfs_flush_incompatible(struct file *file, struct page *page)
684{ 667{
685 struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data; 668 struct nfs_open_context *ctx = nfs_file_open_context(file);
686 struct nfs_page *req; 669 struct nfs_page *req;
687 int do_flush, status; 670 int do_flush, status;
688 /* 671 /*
@@ -716,7 +699,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
716int nfs_updatepage(struct file *file, struct page *page, 699int nfs_updatepage(struct file *file, struct page *page,
717 unsigned int offset, unsigned int count) 700 unsigned int offset, unsigned int count)
718{ 701{
719 struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data; 702 struct nfs_open_context *ctx = nfs_file_open_context(file);
720 struct inode *inode = page->mapping->host; 703 struct inode *inode = page->mapping->host;
721 int status = 0; 704 int status = 0;
722 705
@@ -967,7 +950,7 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
967 950
968 if (task->tk_status < 0) { 951 if (task->tk_status < 0) {
969 nfs_set_pageerror(page); 952 nfs_set_pageerror(page);
970 req->wb_context->error = task->tk_status; 953 nfs_context_set_write_error(req->wb_context, task->tk_status);
971 dprintk(", error = %d\n", task->tk_status); 954 dprintk(", error = %d\n", task->tk_status);
972 goto out; 955 goto out;
973 } 956 }
@@ -1030,7 +1013,7 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
1030 1013
1031 if (task->tk_status < 0) { 1014 if (task->tk_status < 0) {
1032 nfs_set_pageerror(page); 1015 nfs_set_pageerror(page);
1033 req->wb_context->error = task->tk_status; 1016 nfs_context_set_write_error(req->wb_context, task->tk_status);
1034 dprintk(", error = %d\n", task->tk_status); 1017 dprintk(", error = %d\n", task->tk_status);
1035 goto remove_request; 1018 goto remove_request;
1036 } 1019 }
@@ -1244,7 +1227,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
1244 req->wb_bytes, 1227 req->wb_bytes,
1245 (long long)req_offset(req)); 1228 (long long)req_offset(req));
1246 if (task->tk_status < 0) { 1229 if (task->tk_status < 0) {
1247 req->wb_context->error = task->tk_status; 1230 nfs_context_set_write_error(req->wb_context, task->tk_status);
1248 nfs_inode_remove_request(req); 1231 nfs_inode_remove_request(req);
1249 dprintk(", error = %d\n", task->tk_status); 1232 dprintk(", error = %d\n", task->tk_status);
1250 goto next; 1233 goto next;
@@ -1347,53 +1330,52 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr
1347 return ret; 1330 return ret;
1348} 1331}
1349 1332
1350/* 1333static int __nfs_write_mapping(struct address_space *mapping, struct writeback_control *wbc, int how)
1351 * flush the inode to disk.
1352 */
1353int nfs_wb_all(struct inode *inode)
1354{ 1334{
1355 struct address_space *mapping = inode->i_mapping;
1356 struct writeback_control wbc = {
1357 .bdi = mapping->backing_dev_info,
1358 .sync_mode = WB_SYNC_ALL,
1359 .nr_to_write = LONG_MAX,
1360 .for_writepages = 1,
1361 .range_cyclic = 1,
1362 };
1363 int ret; 1335 int ret;
1364 1336
1365 ret = nfs_writepages(mapping, &wbc); 1337 ret = nfs_writepages(mapping, wbc);
1366 if (ret < 0) 1338 if (ret < 0)
1367 goto out; 1339 goto out;
1368 ret = nfs_sync_mapping_wait(mapping, &wbc, 0); 1340 ret = nfs_sync_mapping_wait(mapping, wbc, how);
1369 if (ret >= 0) 1341 if (ret < 0)
1370 return 0; 1342 goto out;
1343 return 0;
1371out: 1344out:
1372 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1345 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1373 return ret; 1346 return ret;
1374} 1347}
1375 1348
1376int nfs_sync_mapping_range(struct address_space *mapping, loff_t range_start, loff_t range_end, int how) 1349/* Two pass sync: first using WB_SYNC_NONE, then WB_SYNC_ALL */
1350static int nfs_write_mapping(struct address_space *mapping, int how)
1377{ 1351{
1378 struct writeback_control wbc = { 1352 struct writeback_control wbc = {
1379 .bdi = mapping->backing_dev_info, 1353 .bdi = mapping->backing_dev_info,
1380 .sync_mode = WB_SYNC_ALL, 1354 .sync_mode = WB_SYNC_NONE,
1381 .nr_to_write = LONG_MAX, 1355 .nr_to_write = LONG_MAX,
1382 .range_start = range_start,
1383 .range_end = range_end,
1384 .for_writepages = 1, 1356 .for_writepages = 1,
1357 .range_cyclic = 1,
1385 }; 1358 };
1386 int ret; 1359 int ret;
1387 1360
1388 ret = nfs_writepages(mapping, &wbc); 1361 ret = __nfs_write_mapping(mapping, &wbc, how);
1389 if (ret < 0) 1362 if (ret < 0)
1390 goto out; 1363 return ret;
1391 ret = nfs_sync_mapping_wait(mapping, &wbc, how); 1364 wbc.sync_mode = WB_SYNC_ALL;
1392 if (ret >= 0) 1365 return __nfs_write_mapping(mapping, &wbc, how);
1393 return 0; 1366}
1394out: 1367
1395 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1368/*
1396 return ret; 1369 * flush the inode to disk.
1370 */
1371int nfs_wb_all(struct inode *inode)
1372{
1373 return nfs_write_mapping(inode->i_mapping, 0);
1374}
1375
1376int nfs_wb_nocommit(struct inode *inode)
1377{
1378 return nfs_write_mapping(inode->i_mapping, FLUSH_NOCOMMIT);
1397} 1379}
1398 1380
1399int nfs_wb_page_cancel(struct inode *inode, struct page *page) 1381int nfs_wb_page_cancel(struct inode *inode, struct page *page)
@@ -1477,35 +1459,6 @@ int nfs_wb_page(struct inode *inode, struct page* page)
1477 return nfs_wb_page_priority(inode, page, FLUSH_STABLE); 1459 return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
1478} 1460}
1479 1461
1480int nfs_set_page_dirty(struct page *page)
1481{
1482 struct address_space *mapping = page->mapping;
1483 struct inode *inode;
1484 struct nfs_page *req;
1485 int ret;
1486
1487 if (!mapping)
1488 goto out_raced;
1489 inode = mapping->host;
1490 if (!inode)
1491 goto out_raced;
1492 spin_lock(&inode->i_lock);
1493 req = nfs_page_find_request_locked(page);
1494 if (req != NULL) {
1495 /* Mark any existing write requests for flushing */
1496 ret = !test_and_set_bit(PG_NEED_FLUSH, &req->wb_flags);
1497 spin_unlock(&inode->i_lock);
1498 nfs_release_request(req);
1499 return ret;
1500 }
1501 ret = __set_page_dirty_nobuffers(page);
1502 spin_unlock(&inode->i_lock);
1503 return ret;
1504out_raced:
1505 return !TestSetPageDirty(page);
1506}
1507
1508
1509int __init nfs_init_writepagecache(void) 1462int __init nfs_init_writepagecache(void)
1510{ 1463{
1511 nfs_wdata_cachep = kmem_cache_create("nfs_write_data", 1464 nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index e15f2cf8ac15..57333944af7f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -102,7 +102,8 @@ check_filename(char *str, int len, __be32 err)
102out: \ 102out: \
103 return status; \ 103 return status; \
104xdr_error: \ 104xdr_error: \
105 printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); \ 105 dprintk("NFSD: xdr error (%s:%d)\n", \
106 __FILE__, __LINE__); \
106 status = nfserr_bad_xdr; \ 107 status = nfserr_bad_xdr; \
107 goto out 108 goto out
108 109
@@ -124,7 +125,8 @@ xdr_error: \
124 if (!(x = (p==argp->tmp || p == argp->tmpp) ? \ 125 if (!(x = (p==argp->tmp || p == argp->tmpp) ? \
125 savemem(argp, p, nbytes) : \ 126 savemem(argp, p, nbytes) : \
126 (char *)p)) { \ 127 (char *)p)) { \
127 printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); \ 128 dprintk("NFSD: xdr error (%s:%d)\n", \
129 __FILE__, __LINE__); \
128 goto xdr_error; \ 130 goto xdr_error; \
129 } \ 131 } \
130 p += XDR_QUADLEN(nbytes); \ 132 p += XDR_QUADLEN(nbytes); \
@@ -140,7 +142,8 @@ xdr_error: \
140 p = argp->p; \ 142 p = argp->p; \
141 argp->p += XDR_QUADLEN(nbytes); \ 143 argp->p += XDR_QUADLEN(nbytes); \
142 } else if (!(p = read_buf(argp, nbytes))) { \ 144 } else if (!(p = read_buf(argp, nbytes))) { \
143 printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); \ 145 dprintk("NFSD: xdr error (%s:%d)\n", \
146 __FILE__, __LINE__); \
144 goto xdr_error; \ 147 goto xdr_error; \
145 } \ 148 } \
146} while (0) 149} while (0)
@@ -948,7 +951,8 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
948 */ 951 */
949 avail = (char*)argp->end - (char*)argp->p; 952 avail = (char*)argp->end - (char*)argp->p;
950 if (avail + argp->pagelen < write->wr_buflen) { 953 if (avail + argp->pagelen < write->wr_buflen) {
951 printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); 954 dprintk("NFSD: xdr error (%s:%d)\n",
955 __FILE__, __LINE__);
952 goto xdr_error; 956 goto xdr_error;
953 } 957 }
954 argp->rqstp->rq_vec[0].iov_base = p; 958 argp->rqstp->rq_vec[0].iov_base = p;
@@ -1019,7 +1023,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1019 argp->ops = kmalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL); 1023 argp->ops = kmalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL);
1020 if (!argp->ops) { 1024 if (!argp->ops) {
1021 argp->ops = argp->iops; 1025 argp->ops = argp->iops;
1022 printk(KERN_INFO "nfsd: couldn't allocate room for COMPOUND\n"); 1026 dprintk("nfsd: couldn't allocate room for COMPOUND\n");
1023 goto xdr_error; 1027 goto xdr_error;
1024 } 1028 }
1025 } 1029 }
@@ -1326,7 +1330,7 @@ static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *
1326 path = exp->ex_path; 1330 path = exp->ex_path;
1327 1331
1328 if (strncmp(path, rootpath, strlen(rootpath))) { 1332 if (strncmp(path, rootpath, strlen(rootpath))) {
1329 printk("nfsd: fs_locations failed;" 1333 dprintk("nfsd: fs_locations failed;"
1330 "%s is not contained in %s\n", path, rootpath); 1334 "%s is not contained in %s\n", path, rootpath);
1331 *stat = nfserr_notsupp; 1335 *stat = nfserr_notsupp;
1332 return NULL; 1336 return NULL;
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index d7a5e034c3a2..e757a74b9d17 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -109,6 +109,10 @@ static inline u64 get_jiffies_64(void)
109 ((long)(a) - (long)(b) >= 0)) 109 ((long)(a) - (long)(b) >= 0))
110#define time_before_eq(a,b) time_after_eq(b,a) 110#define time_before_eq(a,b) time_after_eq(b,a)
111 111
112#define time_in_range(a,b,c) \
113 (time_after_eq(a,b) && \
114 time_before_eq(a,c))
115
112/* Same as above, but does so with platform independent 64bit types. 116/* Same as above, but does so with platform independent 64bit types.
113 * These must be used when utilizing jiffies_64 (i.e. return value of 117 * These must be used when utilizing jiffies_64 (i.e. return value of
114 * get_jiffies_64() */ 118 * get_jiffies_64() */
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 7250eeadd7b5..c5164c257f71 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -47,10 +47,8 @@
47#include <linux/nfs3.h> 47#include <linux/nfs3.h>
48#include <linux/nfs4.h> 48#include <linux/nfs4.h>
49#include <linux/nfs_xdr.h> 49#include <linux/nfs_xdr.h>
50
51#include <linux/nfs_fs_sb.h> 50#include <linux/nfs_fs_sb.h>
52 51
53#include <linux/rwsem.h>
54#include <linux/mempool.h> 52#include <linux/mempool.h>
55 53
56/* 54/*
@@ -77,6 +75,9 @@ struct nfs_open_context {
77 struct nfs4_state *state; 75 struct nfs4_state *state;
78 fl_owner_t lockowner; 76 fl_owner_t lockowner;
79 int mode; 77 int mode;
78
79 unsigned long flags;
80#define NFS_CONTEXT_ERROR_WRITE (0)
80 int error; 81 int error;
81 82
82 struct list_head list; 83 struct list_head list;
@@ -133,11 +134,6 @@ struct nfs_inode {
133 * server. 134 * server.
134 */ 135 */
135 unsigned long cache_change_attribute; 136 unsigned long cache_change_attribute;
136 /*
137 * Counter indicating the number of outstanding requests that
138 * will cause a file data update.
139 */
140 atomic_t data_updates;
141 137
142 struct rb_root access_cache; 138 struct rb_root access_cache;
143 struct list_head access_cache_entry_lru; 139 struct list_head access_cache_entry_lru;
@@ -205,27 +201,18 @@ static inline struct nfs_inode *NFS_I(struct inode *inode)
205#define NFS_CLIENT(inode) (NFS_SERVER(inode)->client) 201#define NFS_CLIENT(inode) (NFS_SERVER(inode)->client)
206#define NFS_PROTO(inode) (NFS_SERVER(inode)->nfs_client->rpc_ops) 202#define NFS_PROTO(inode) (NFS_SERVER(inode)->nfs_client->rpc_ops)
207#define NFS_COOKIEVERF(inode) (NFS_I(inode)->cookieverf) 203#define NFS_COOKIEVERF(inode) (NFS_I(inode)->cookieverf)
208#define NFS_READTIME(inode) (NFS_I(inode)->read_cache_jiffies)
209#define NFS_CHANGE_ATTR(inode) (NFS_I(inode)->change_attr)
210#define NFS_ATTRTIMEO(inode) (NFS_I(inode)->attrtimeo)
211#define NFS_MINATTRTIMEO(inode) \ 204#define NFS_MINATTRTIMEO(inode) \
212 (S_ISDIR(inode->i_mode)? NFS_SERVER(inode)->acdirmin \ 205 (S_ISDIR(inode->i_mode)? NFS_SERVER(inode)->acdirmin \
213 : NFS_SERVER(inode)->acregmin) 206 : NFS_SERVER(inode)->acregmin)
214#define NFS_MAXATTRTIMEO(inode) \ 207#define NFS_MAXATTRTIMEO(inode) \
215 (S_ISDIR(inode->i_mode)? NFS_SERVER(inode)->acdirmax \ 208 (S_ISDIR(inode->i_mode)? NFS_SERVER(inode)->acdirmax \
216 : NFS_SERVER(inode)->acregmax) 209 : NFS_SERVER(inode)->acregmax)
217#define NFS_ATTRTIMEO_UPDATE(inode) (NFS_I(inode)->attrtimeo_timestamp)
218 210
219#define NFS_FLAGS(inode) (NFS_I(inode)->flags) 211#define NFS_FLAGS(inode) (NFS_I(inode)->flags)
220#define NFS_STALE(inode) (test_bit(NFS_INO_STALE, &NFS_FLAGS(inode))) 212#define NFS_STALE(inode) (test_bit(NFS_INO_STALE, &NFS_FLAGS(inode)))
221 213
222#define NFS_FILEID(inode) (NFS_I(inode)->fileid) 214#define NFS_FILEID(inode) (NFS_I(inode)->fileid)
223 215
224static inline int nfs_caches_unstable(struct inode *inode)
225{
226 return atomic_read(&NFS_I(inode)->data_updates) != 0;
227}
228
229static inline void nfs_mark_for_revalidate(struct inode *inode) 216static inline void nfs_mark_for_revalidate(struct inode *inode)
230{ 217{
231 struct nfs_inode *nfsi = NFS_I(inode); 218 struct nfs_inode *nfsi = NFS_I(inode);
@@ -237,12 +224,6 @@ static inline void nfs_mark_for_revalidate(struct inode *inode)
237 spin_unlock(&inode->i_lock); 224 spin_unlock(&inode->i_lock);
238} 225}
239 226
240static inline void NFS_CACHEINV(struct inode *inode)
241{
242 if (!nfs_caches_unstable(inode))
243 nfs_mark_for_revalidate(inode);
244}
245
246static inline int nfs_server_capable(struct inode *inode, int cap) 227static inline int nfs_server_capable(struct inode *inode, int cap)
247{ 228{
248 return NFS_SERVER(inode)->caps & cap; 229 return NFS_SERVER(inode)->caps & cap;
@@ -253,28 +234,33 @@ static inline int NFS_USE_READDIRPLUS(struct inode *inode)
253 return test_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode)); 234 return test_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
254} 235}
255 236
237static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf)
238{
239 dentry->d_time = verf;
240}
241
256/** 242/**
257 * nfs_save_change_attribute - Returns the inode attribute change cookie 243 * nfs_save_change_attribute - Returns the inode attribute change cookie
258 * @inode - pointer to inode 244 * @dir - pointer to parent directory inode
259 * The "change attribute" is updated every time we finish an operation 245 * The "change attribute" is updated every time we finish an operation
260 * that will result in a metadata change on the server. 246 * that will result in a metadata change on the server.
261 */ 247 */
262static inline long nfs_save_change_attribute(struct inode *inode) 248static inline unsigned long nfs_save_change_attribute(struct inode *dir)
263{ 249{
264 return NFS_I(inode)->cache_change_attribute; 250 return NFS_I(dir)->cache_change_attribute;
265} 251}
266 252
267/** 253/**
268 * nfs_verify_change_attribute - Detects NFS inode cache updates 254 * nfs_verify_change_attribute - Detects NFS remote directory changes
269 * @inode - pointer to inode 255 * @dir - pointer to parent directory inode
270 * @chattr - previously saved change attribute 256 * @chattr - previously saved change attribute
271 * Return "false" if metadata has been updated (or is in the process of 257 * Return "false" if the verifiers doesn't match the change attribute.
272 * being updated) since the change attribute was saved. 258 * This would usually indicate that the directory contents have changed on
259 * the server, and that any dentries need revalidating.
273 */ 260 */
274static inline int nfs_verify_change_attribute(struct inode *inode, unsigned long chattr) 261static inline int nfs_verify_change_attribute(struct inode *dir, unsigned long chattr)
275{ 262{
276 return !nfs_caches_unstable(inode) 263 return chattr == NFS_I(dir)->cache_change_attribute;
277 && time_after_eq(chattr, NFS_I(inode)->cache_change_attribute);
278} 264}
279 265
280/* 266/*
@@ -283,15 +269,14 @@ static inline int nfs_verify_change_attribute(struct inode *inode, unsigned long
283extern int nfs_sync_mapping(struct address_space *mapping); 269extern int nfs_sync_mapping(struct address_space *mapping);
284extern void nfs_zap_mapping(struct inode *inode, struct address_space *mapping); 270extern void nfs_zap_mapping(struct inode *inode, struct address_space *mapping);
285extern void nfs_zap_caches(struct inode *); 271extern void nfs_zap_caches(struct inode *);
272extern void nfs_invalidate_atime(struct inode *);
286extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *, 273extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *,
287 struct nfs_fattr *); 274 struct nfs_fattr *);
288extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); 275extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *);
289extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr); 276extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr);
277extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr);
290extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 278extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
291extern int nfs_permission(struct inode *, int, struct nameidata *); 279extern int nfs_permission(struct inode *, int, struct nameidata *);
292extern int nfs_access_get_cached(struct inode *, struct rpc_cred *, struct nfs_access_entry *);
293extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *);
294extern void nfs_access_zap_cache(struct inode *inode);
295extern int nfs_open(struct inode *, struct file *); 280extern int nfs_open(struct inode *, struct file *);
296extern int nfs_release(struct inode *, struct file *); 281extern int nfs_release(struct inode *, struct file *);
297extern int nfs_attribute_timeout(struct inode *inode); 282extern int nfs_attribute_timeout(struct inode *inode);
@@ -301,13 +286,10 @@ extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *map
301extern int nfs_revalidate_mapping_nolock(struct inode *inode, struct address_space *mapping); 286extern int nfs_revalidate_mapping_nolock(struct inode *inode, struct address_space *mapping);
302extern int nfs_setattr(struct dentry *, struct iattr *); 287extern int nfs_setattr(struct dentry *, struct iattr *);
303extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr); 288extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr);
304extern void nfs_begin_attr_update(struct inode *);
305extern void nfs_end_attr_update(struct inode *);
306extern void nfs_begin_data_update(struct inode *);
307extern void nfs_end_data_update(struct inode *);
308extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); 289extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx);
309extern void put_nfs_open_context(struct nfs_open_context *ctx); 290extern void put_nfs_open_context(struct nfs_open_context *ctx);
310extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode); 291extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode);
292extern u64 nfs_compat_user_ino64(u64 fileid);
311 293
312/* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */ 294/* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */
313extern __be32 root_nfs_parse_addr(char *name); /*__init*/ 295extern __be32 root_nfs_parse_addr(char *name); /*__init*/
@@ -328,14 +310,15 @@ extern const struct inode_operations nfs3_file_inode_operations;
328extern const struct file_operations nfs_file_operations; 310extern const struct file_operations nfs_file_operations;
329extern const struct address_space_operations nfs_file_aops; 311extern const struct address_space_operations nfs_file_aops;
330 312
331static inline struct rpc_cred *nfs_file_cred(struct file *file) 313static inline struct nfs_open_context *nfs_file_open_context(struct file *filp)
332{ 314{
333 if (file != NULL) { 315 return filp->private_data;
334 struct nfs_open_context *ctx; 316}
335 317
336 ctx = (struct nfs_open_context*)file->private_data; 318static inline struct rpc_cred *nfs_file_cred(struct file *file)
337 return ctx->cred; 319{
338 } 320 if (file != NULL)
321 return nfs_file_open_context(file)->cred;
339 return NULL; 322 return NULL;
340} 323}
341 324
@@ -378,6 +361,8 @@ extern const struct file_operations nfs_dir_operations;
378extern struct dentry_operations nfs_dentry_operations; 361extern struct dentry_operations nfs_dentry_operations;
379 362
380extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr); 363extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr);
364extern int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags);
365extern void nfs_access_zap_cache(struct inode *inode);
381 366
382/* 367/*
383 * linux/fs/nfs/symlink.c 368 * linux/fs/nfs/symlink.c
@@ -420,15 +405,14 @@ extern int nfs_flush_incompatible(struct file *file, struct page *page);
420extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); 405extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
421extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); 406extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
422extern void nfs_writedata_release(void *); 407extern void nfs_writedata_release(void *);
423extern int nfs_set_page_dirty(struct page *);
424 408
425/* 409/*
426 * Try to write back everything synchronously (but check the 410 * Try to write back everything synchronously (but check the
427 * return value!) 411 * return value!)
428 */ 412 */
429extern long nfs_sync_mapping_wait(struct address_space *, struct writeback_control *, int); 413extern long nfs_sync_mapping_wait(struct address_space *, struct writeback_control *, int);
430extern int nfs_sync_mapping_range(struct address_space *, loff_t, loff_t, int);
431extern int nfs_wb_all(struct inode *inode); 414extern int nfs_wb_all(struct inode *inode);
415extern int nfs_wb_nocommit(struct inode *inode);
432extern int nfs_wb_page(struct inode *inode, struct page* page); 416extern int nfs_wb_page(struct inode *inode, struct page* page);
433extern int nfs_wb_page_priority(struct inode *inode, struct page* page, int how); 417extern int nfs_wb_page_priority(struct inode *inode, struct page* page, int how);
434extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); 418extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 78e60798d10e..30dbcc185e69 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -30,7 +30,6 @@
30#define PG_BUSY 0 30#define PG_BUSY 0
31#define PG_NEED_COMMIT 1 31#define PG_NEED_COMMIT 1
32#define PG_NEED_RESCHED 2 32#define PG_NEED_RESCHED 2
33#define PG_NEED_FLUSH 3
34 33
35struct nfs_inode; 34struct nfs_inode;
36struct nfs_page { 35struct nfs_page {
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index cf74a4db84a5..daab252f2e5c 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -62,7 +62,8 @@ struct nfs_fattr {
62#define NFS_ATTR_FATTR 0x0002 /* post-op attributes */ 62#define NFS_ATTR_FATTR 0x0002 /* post-op attributes */
63#define NFS_ATTR_FATTR_V3 0x0004 /* NFSv3 attributes */ 63#define NFS_ATTR_FATTR_V3 0x0004 /* NFSv3 attributes */
64#define NFS_ATTR_FATTR_V4 0x0008 /* NFSv4 change attribute */ 64#define NFS_ATTR_FATTR_V4 0x0008 /* NFSv4 change attribute */
65#define NFS_ATTR_FATTR_V4_REFERRAL 0x0010 /* NFSv4 referral */ 65#define NFS_ATTR_WCC_V4 0x0010 /* pre-op change attribute */
66#define NFS_ATTR_FATTR_V4_REFERRAL 0x0020 /* NFSv4 referral */
66 67
67/* 68/*
68 * Info on the file system 69 * Info on the file system
@@ -538,10 +539,13 @@ typedef u64 clientid4;
538 539
539struct nfs4_accessargs { 540struct nfs4_accessargs {
540 const struct nfs_fh * fh; 541 const struct nfs_fh * fh;
542 const u32 * bitmask;
541 u32 access; 543 u32 access;
542}; 544};
543 545
544struct nfs4_accessres { 546struct nfs4_accessres {
547 const struct nfs_server * server;
548 struct nfs_fattr * fattr;
545 u32 supported; 549 u32 supported;
546 u32 access; 550 u32 access;
547}; 551};
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index c0d9d14983b3..d9d5c5ad826c 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -117,7 +117,7 @@ struct rpc_create_args {
117 117
118struct rpc_clnt *rpc_create(struct rpc_create_args *args); 118struct rpc_clnt *rpc_create(struct rpc_create_args *args);
119struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, 119struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *,
120 struct rpc_program *, int); 120 struct rpc_program *, u32);
121struct rpc_clnt *rpc_clone_client(struct rpc_clnt *); 121struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
122void rpc_shutdown_client(struct rpc_clnt *); 122void rpc_shutdown_client(struct rpc_clnt *);
123void rpc_release_client(struct rpc_clnt *); 123void rpc_release_client(struct rpc_clnt *);
diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h
index 3912cf16361e..3347c72b848a 100644
--- a/include/linux/sunrpc/debug.h
+++ b/include/linux/sunrpc/debug.h
@@ -88,6 +88,11 @@ enum {
88 CTL_SLOTTABLE_TCP, 88 CTL_SLOTTABLE_TCP,
89 CTL_MIN_RESVPORT, 89 CTL_MIN_RESVPORT,
90 CTL_MAX_RESVPORT, 90 CTL_MAX_RESVPORT,
91 CTL_SLOTTABLE_RDMA,
92 CTL_RDMA_MAXINLINEREAD,
93 CTL_RDMA_MAXINLINEWRITE,
94 CTL_RDMA_WRITEPADDING,
95 CTL_RDMA_MEMREG,
91}; 96};
92 97
93#endif /* _LINUX_SUNRPC_DEBUG_H_ */ 98#endif /* _LINUX_SUNRPC_DEBUG_H_ */
diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h
index 784d4c3ef651..c4beb5775111 100644
--- a/include/linux/sunrpc/msg_prot.h
+++ b/include/linux/sunrpc/msg_prot.h
@@ -138,6 +138,19 @@ typedef __be32 rpc_fraghdr;
138#define RPC_MAX_HEADER_WITH_AUTH \ 138#define RPC_MAX_HEADER_WITH_AUTH \
139 (RPC_CALLHDRSIZE + 2*(2+RPC_MAX_AUTH_SIZE/4)) 139 (RPC_CALLHDRSIZE + 2*(2+RPC_MAX_AUTH_SIZE/4))
140 140
141/*
142 * RFC1833/RFC3530 rpcbind (v3+) well-known netid's.
143 */
144#define RPCBIND_NETID_UDP "udp"
145#define RPCBIND_NETID_TCP "tcp"
146#define RPCBIND_NETID_UDP6 "udp6"
147#define RPCBIND_NETID_TCP6 "tcp6"
148
149/*
150 * Note that RFC 1833 does not put any size restrictions on the
151 * netid string, but all currently defined netid's fit in 4 bytes.
152 */
153#define RPCBIND_MAXNETIDLEN (4u)
141 154
142#endif /* __KERNEL__ */ 155#endif /* __KERNEL__ */
143#endif /* _LINUX_SUNRPC_MSGPROT_H_ */ 156#endif /* _LINUX_SUNRPC_MSGPROT_H_ */
diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h
new file mode 100644
index 000000000000..0013a0d8dc6b
--- /dev/null
+++ b/include/linux/sunrpc/rpc_rdma.h
@@ -0,0 +1,116 @@
1/*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40#ifndef _LINUX_SUNRPC_RPC_RDMA_H
41#define _LINUX_SUNRPC_RPC_RDMA_H
42
43struct rpcrdma_segment {
44 uint32_t rs_handle; /* Registered memory handle */
45 uint32_t rs_length; /* Length of the chunk in bytes */
46 uint64_t rs_offset; /* Chunk virtual address or offset */
47};
48
49/*
50 * read chunk(s), encoded as a linked list.
51 */
52struct rpcrdma_read_chunk {
53 uint32_t rc_discrim; /* 1 indicates presence */
54 uint32_t rc_position; /* Position in XDR stream */
55 struct rpcrdma_segment rc_target;
56};
57
58/*
59 * write chunk, and reply chunk.
60 */
61struct rpcrdma_write_chunk {
62 struct rpcrdma_segment wc_target;
63};
64
65/*
66 * write chunk(s), encoded as a counted array.
67 */
68struct rpcrdma_write_array {
69 uint32_t wc_discrim; /* 1 indicates presence */
70 uint32_t wc_nchunks; /* Array count */
71 struct rpcrdma_write_chunk wc_array[0];
72};
73
74struct rpcrdma_msg {
75 uint32_t rm_xid; /* Mirrors the RPC header xid */
76 uint32_t rm_vers; /* Version of this protocol */
77 uint32_t rm_credit; /* Buffers requested/granted */
78 uint32_t rm_type; /* Type of message (enum rpcrdma_proc) */
79 union {
80
81 struct { /* no chunks */
82 uint32_t rm_empty[3]; /* 3 empty chunk lists */
83 } rm_nochunks;
84
85 struct { /* no chunks and padded */
86 uint32_t rm_align; /* Padding alignment */
87 uint32_t rm_thresh; /* Padding threshold */
88 uint32_t rm_pempty[3]; /* 3 empty chunk lists */
89 } rm_padded;
90
91 uint32_t rm_chunks[0]; /* read, write and reply chunks */
92
93 } rm_body;
94};
95
96#define RPCRDMA_HDRLEN_MIN 28
97
98enum rpcrdma_errcode {
99 ERR_VERS = 1,
100 ERR_CHUNK = 2
101};
102
103struct rpcrdma_err_vers {
104 uint32_t rdma_vers_low; /* Version range supported by peer */
105 uint32_t rdma_vers_high;
106};
107
108enum rpcrdma_proc {
109 RDMA_MSG = 0, /* An RPC call or reply msg */
110 RDMA_NOMSG = 1, /* An RPC call or reply msg - separate body */
111 RDMA_MSGP = 2, /* An RPC call or reply msg with padding */
112 RDMA_DONE = 3, /* Client signals reply completion */
113 RDMA_ERROR = 4 /* An RPC RDMA encoding error */
114};
115
116#endif /* _LINUX_SUNRPC_RPC_RDMA_H */
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index c6b53d181bfa..0751c9464d0f 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -70,7 +70,10 @@ struct xdr_buf {
70 70
71 struct page ** pages; /* Array of contiguous pages */ 71 struct page ** pages; /* Array of contiguous pages */
72 unsigned int page_base, /* Start of page data */ 72 unsigned int page_base, /* Start of page data */
73 page_len; /* Length of page data */ 73 page_len, /* Length of page data */
74 flags; /* Flags for data disposition */
75#define XDRBUF_READ 0x01 /* target of file read */
76#define XDRBUF_WRITE 0x02 /* source of file write */
74 77
75 unsigned int buflen, /* Total length of storage buffer */ 78 unsigned int buflen, /* Total length of storage buffer */
76 len; /* Length of XDR encoded message */ 79 len; /* Length of XDR encoded message */
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index d11cedd14f0f..30b17b3bc1a9 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -19,25 +19,11 @@
19 19
20#ifdef __KERNEL__ 20#ifdef __KERNEL__
21 21
22extern unsigned int xprt_udp_slot_table_entries;
23extern unsigned int xprt_tcp_slot_table_entries;
24
25#define RPC_MIN_SLOT_TABLE (2U) 22#define RPC_MIN_SLOT_TABLE (2U)
26#define RPC_DEF_SLOT_TABLE (16U) 23#define RPC_DEF_SLOT_TABLE (16U)
27#define RPC_MAX_SLOT_TABLE (128U) 24#define RPC_MAX_SLOT_TABLE (128U)
28 25
29/* 26/*
30 * Parameters for choosing a free port
31 */
32extern unsigned int xprt_min_resvport;
33extern unsigned int xprt_max_resvport;
34
35#define RPC_MIN_RESVPORT (1U)
36#define RPC_MAX_RESVPORT (65535U)
37#define RPC_DEF_MIN_RESVPORT (665U)
38#define RPC_DEF_MAX_RESVPORT (1023U)
39
40/*
41 * This describes a timeout strategy 27 * This describes a timeout strategy
42 */ 28 */
43struct rpc_timeout { 29struct rpc_timeout {
@@ -53,6 +39,10 @@ enum rpc_display_format_t {
53 RPC_DISPLAY_PORT, 39 RPC_DISPLAY_PORT,
54 RPC_DISPLAY_PROTO, 40 RPC_DISPLAY_PROTO,
55 RPC_DISPLAY_ALL, 41 RPC_DISPLAY_ALL,
42 RPC_DISPLAY_HEX_ADDR,
43 RPC_DISPLAY_HEX_PORT,
44 RPC_DISPLAY_UNIVERSAL_ADDR,
45 RPC_DISPLAY_NETID,
56 RPC_DISPLAY_MAX, 46 RPC_DISPLAY_MAX,
57}; 47};
58 48
@@ -196,14 +186,22 @@ struct rpc_xprt {
196 char * address_strings[RPC_DISPLAY_MAX]; 186 char * address_strings[RPC_DISPLAY_MAX];
197}; 187};
198 188
199struct rpc_xprtsock_create { 189struct xprt_create {
200 int proto; /* IPPROTO_UDP or IPPROTO_TCP */ 190 int ident; /* XPRT_TRANSPORT identifier */
201 struct sockaddr * srcaddr; /* optional local address */ 191 struct sockaddr * srcaddr; /* optional local address */
202 struct sockaddr * dstaddr; /* remote peer address */ 192 struct sockaddr * dstaddr; /* remote peer address */
203 size_t addrlen; 193 size_t addrlen;
204 struct rpc_timeout * timeout; /* optional timeout parameters */ 194 struct rpc_timeout * timeout; /* optional timeout parameters */
205}; 195};
206 196
197struct xprt_class {
198 struct list_head list;
199 int ident; /* XPRT_TRANSPORT identifier */
200 struct rpc_xprt * (*setup)(struct xprt_create *);
201 struct module *owner;
202 char name[32];
203};
204
207/* 205/*
208 * Transport operations used by ULPs 206 * Transport operations used by ULPs
209 */ 207 */
@@ -212,7 +210,7 @@ void xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long
212/* 210/*
213 * Generic internal transport functions 211 * Generic internal transport functions
214 */ 212 */
215struct rpc_xprt * xprt_create_transport(struct rpc_xprtsock_create *args); 213struct rpc_xprt *xprt_create_transport(struct xprt_create *args);
216void xprt_connect(struct rpc_task *task); 214void xprt_connect(struct rpc_task *task);
217void xprt_reserve(struct rpc_task *task); 215void xprt_reserve(struct rpc_task *task);
218int xprt_reserve_xprt(struct rpc_task *task); 216int xprt_reserve_xprt(struct rpc_task *task);
@@ -235,6 +233,8 @@ static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 *
235/* 233/*
236 * Transport switch helper functions 234 * Transport switch helper functions
237 */ 235 */
236int xprt_register_transport(struct xprt_class *type);
237int xprt_unregister_transport(struct xprt_class *type);
238void xprt_set_retrans_timeout_def(struct rpc_task *task); 238void xprt_set_retrans_timeout_def(struct rpc_task *task);
239void xprt_set_retrans_timeout_rtt(struct rpc_task *task); 239void xprt_set_retrans_timeout_rtt(struct rpc_task *task);
240void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); 240void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status);
@@ -248,14 +248,6 @@ void xprt_release_rqst_cong(struct rpc_task *task);
248void xprt_disconnect(struct rpc_xprt *xprt); 248void xprt_disconnect(struct rpc_xprt *xprt);
249 249
250/* 250/*
251 * Socket transport setup operations
252 */
253struct rpc_xprt * xs_setup_udp(struct rpc_xprtsock_create *args);
254struct rpc_xprt * xs_setup_tcp(struct rpc_xprtsock_create *args);
255int init_socket_xprt(void);
256void cleanup_socket_xprt(void);
257
258/*
259 * Reserved bit positions in xprt->state 251 * Reserved bit positions in xprt->state
260 */ 252 */
261#define XPRT_LOCKED (0) 253#define XPRT_LOCKED (0)
diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
new file mode 100644
index 000000000000..4de56b1d372b
--- /dev/null
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -0,0 +1,85 @@
1/*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40#ifndef _LINUX_SUNRPC_XPRTRDMA_H
41#define _LINUX_SUNRPC_XPRTRDMA_H
42
43/*
44 * RPC transport identifier for RDMA
45 */
46#define XPRT_TRANSPORT_RDMA 256
47
48/*
49 * rpcbind (v3+) RDMA netid.
50 */
51#define RPCBIND_NETID_RDMA "rdma"
52
53/*
54 * Constants. Max RPC/NFS header is big enough to account for
55 * additional marshaling buffers passed down by Linux client.
56 *
57 * RDMA header is currently fixed max size, and is big enough for a
58 * fully-chunked NFS message (read chunks are the largest). Note only
59 * a single chunk type per message is supported currently.
60 */
61#define RPCRDMA_MIN_SLOT_TABLE (2U)
62#define RPCRDMA_DEF_SLOT_TABLE (32U)
63#define RPCRDMA_MAX_SLOT_TABLE (256U)
64
65#define RPCRDMA_DEF_INLINE (1024) /* default inline max */
66
67#define RPCRDMA_INLINE_PAD_THRESH (512)/* payload threshold to pad (bytes) */
68
69#define RDMA_RESOLVE_TIMEOUT (5*HZ) /* TBD 5 seconds */
70#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */
71
72/* memory registration strategies */
73#define RPCRDMA_PERSISTENT_REGISTRATION (1)
74
75enum rpcrdma_memreg {
76 RPCRDMA_BOUNCEBUFFERS = 0,
77 RPCRDMA_REGISTER,
78 RPCRDMA_MEMWINDOWS,
79 RPCRDMA_MEMWINDOWS_ASYNC,
80 RPCRDMA_MTHCAFMR,
81 RPCRDMA_ALLPHYSICAL,
82 RPCRDMA_LAST
83};
84
85#endif /* _LINUX_SUNRPC_XPRTRDMA_H */
diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h
new file mode 100644
index 000000000000..2c6c2c2783d8
--- /dev/null
+++ b/include/linux/sunrpc/xprtsock.h
@@ -0,0 +1,51 @@
1/*
2 * linux/include/linux/sunrpc/xprtsock.h
3 *
4 * Declarations for the RPC transport socket provider.
5 */
6
7#ifndef _LINUX_SUNRPC_XPRTSOCK_H
8#define _LINUX_SUNRPC_XPRTSOCK_H
9
10#ifdef __KERNEL__
11
12/*
13 * Socket transport setup operations
14 */
15struct rpc_xprt *xs_setup_udp(struct xprt_create *args);
16struct rpc_xprt *xs_setup_tcp(struct xprt_create *args);
17
18int init_socket_xprt(void);
19void cleanup_socket_xprt(void);
20
21/*
22 * RPC transport identifiers for UDP, TCP
23 *
24 * To preserve compatibility with the historical use of raw IP protocol
25 * id's for transport selection, these are specified with the previous
26 * values. No such restriction exists for new transports, except that
27 * they may not collide with these values (17 and 6, respectively).
28 */
29#define XPRT_TRANSPORT_UDP IPPROTO_UDP
30#define XPRT_TRANSPORT_TCP IPPROTO_TCP
31
32/*
33 * RPC slot table sizes for UDP, TCP transports
34 */
35extern unsigned int xprt_udp_slot_table_entries;
36extern unsigned int xprt_tcp_slot_table_entries;
37
38/*
39 * Parameters for choosing a free port
40 */
41extern unsigned int xprt_min_resvport;
42extern unsigned int xprt_max_resvport;
43
44#define RPC_MIN_RESVPORT (1U)
45#define RPC_MAX_RESVPORT (65535U)
46#define RPC_DEF_MIN_RESVPORT (665U)
47#define RPC_DEF_MAX_RESVPORT (1023U)
48
49#endif /* __KERNEL__ */
50
51#endif /* _LINUX_SUNRPC_XPRTSOCK_H */
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index c7c3337c3a88..d1321a81c9c4 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -62,8 +62,6 @@ struct writeback_control {
62 unsigned for_reclaim:1; /* Invoked from the page allocator */ 62 unsigned for_reclaim:1; /* Invoked from the page allocator */
63 unsigned for_writepages:1; /* This is a writepages() call */ 63 unsigned for_writepages:1; /* This is a writepages() call */
64 unsigned range_cyclic:1; /* range_start is cyclic */ 64 unsigned range_cyclic:1; /* range_start is cyclic */
65
66 void *fs_private; /* For use by ->writepages() */
67}; 65};
68 66
69/* 67/*
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 04f3ffb8d9d4..0ae703c157ba 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1525,6 +1525,7 @@ add_names:
1525 context->names[idx].ino = (unsigned long)-1; 1525 context->names[idx].ino = (unsigned long)-1;
1526 } 1526 }
1527} 1527}
1528EXPORT_SYMBOL_GPL(__audit_inode_child);
1528 1529
1529/** 1530/**
1530 * auditsc_get_stamp - get local copies of audit_context values 1531 * auditsc_get_stamp - get local copies of audit_context values
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 8ebfc4db7f51..5c69a725e530 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -5,6 +5,7 @@
5 5
6obj-$(CONFIG_SUNRPC) += sunrpc.o 6obj-$(CONFIG_SUNRPC) += sunrpc.o
7obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ 7obj-$(CONFIG_SUNRPC_GSS) += auth_gss/
8obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/
8 9
9sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ 10sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
10 auth.o auth_null.o auth_unix.o \ 11 auth.o auth_null.o auth_unix.o \
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index 42b3220bed39..8bd074df27d3 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -42,7 +42,7 @@ gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize)
42{ 42{
43 u8 *ptr; 43 u8 *ptr;
44 u8 pad; 44 u8 pad;
45 int len = buf->len; 45 size_t len = buf->len;
46 46
47 if (len <= buf->head[0].iov_len) { 47 if (len <= buf->head[0].iov_len) {
48 pad = *(u8 *)(buf->head[0].iov_base + len - 1); 48 pad = *(u8 *)(buf->head[0].iov_base + len - 1);
@@ -53,9 +53,9 @@ gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize)
53 } else 53 } else
54 len -= buf->head[0].iov_len; 54 len -= buf->head[0].iov_len;
55 if (len <= buf->page_len) { 55 if (len <= buf->page_len) {
56 int last = (buf->page_base + len - 1) 56 unsigned int last = (buf->page_base + len - 1)
57 >>PAGE_CACHE_SHIFT; 57 >>PAGE_CACHE_SHIFT;
58 int offset = (buf->page_base + len - 1) 58 unsigned int offset = (buf->page_base + len - 1)
59 & (PAGE_CACHE_SIZE - 1); 59 & (PAGE_CACHE_SIZE - 1);
60 ptr = kmap_atomic(buf->pages[last], KM_USER0); 60 ptr = kmap_atomic(buf->pages[last], KM_USER0);
61 pad = *(ptr + offset); 61 pad = *(ptr + offset);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 52429b1ffcc1..76be83ee4b04 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -127,7 +127,14 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s
127 struct rpc_clnt *clnt = NULL; 127 struct rpc_clnt *clnt = NULL;
128 struct rpc_auth *auth; 128 struct rpc_auth *auth;
129 int err; 129 int err;
130 int len; 130 size_t len;
131
132 /* sanity check the name before trying to print it */
133 err = -EINVAL;
134 len = strlen(servname);
135 if (len > RPC_MAXNETNAMELEN)
136 goto out_no_rpciod;
137 len++;
131 138
132 dprintk("RPC: creating %s client for %s (xprt %p)\n", 139 dprintk("RPC: creating %s client for %s (xprt %p)\n",
133 program->name, servname, xprt); 140 program->name, servname, xprt);
@@ -148,7 +155,6 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s
148 clnt->cl_parent = clnt; 155 clnt->cl_parent = clnt;
149 156
150 clnt->cl_server = clnt->cl_inline_name; 157 clnt->cl_server = clnt->cl_inline_name;
151 len = strlen(servname) + 1;
152 if (len > sizeof(clnt->cl_inline_name)) { 158 if (len > sizeof(clnt->cl_inline_name)) {
153 char *buf = kmalloc(len, GFP_KERNEL); 159 char *buf = kmalloc(len, GFP_KERNEL);
154 if (buf != 0) 160 if (buf != 0)
@@ -234,8 +240,8 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
234{ 240{
235 struct rpc_xprt *xprt; 241 struct rpc_xprt *xprt;
236 struct rpc_clnt *clnt; 242 struct rpc_clnt *clnt;
237 struct rpc_xprtsock_create xprtargs = { 243 struct xprt_create xprtargs = {
238 .proto = args->protocol, 244 .ident = args->protocol,
239 .srcaddr = args->saddress, 245 .srcaddr = args->saddress,
240 .dstaddr = args->address, 246 .dstaddr = args->address,
241 .addrlen = args->addrsize, 247 .addrlen = args->addrsize,
@@ -253,7 +259,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
253 */ 259 */
254 if (args->servername == NULL) { 260 if (args->servername == NULL) {
255 struct sockaddr_in *addr = 261 struct sockaddr_in *addr =
256 (struct sockaddr_in *) &args->address; 262 (struct sockaddr_in *) args->address;
257 snprintf(servername, sizeof(servername), NIPQUAD_FMT, 263 snprintf(servername, sizeof(servername), NIPQUAD_FMT,
258 NIPQUAD(addr->sin_addr.s_addr)); 264 NIPQUAD(addr->sin_addr.s_addr));
259 args->servername = servername; 265 args->servername = servername;
@@ -269,9 +275,6 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
269 if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT) 275 if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT)
270 xprt->resvport = 0; 276 xprt->resvport = 0;
271 277
272 dprintk("RPC: creating %s client for %s (xprt %p)\n",
273 args->program->name, args->servername, xprt);
274
275 clnt = rpc_new_client(xprt, args->servername, args->program, 278 clnt = rpc_new_client(xprt, args->servername, args->program,
276 args->version, args->authflavor); 279 args->version, args->authflavor);
277 if (IS_ERR(clnt)) 280 if (IS_ERR(clnt))
@@ -439,7 +442,7 @@ rpc_release_client(struct rpc_clnt *clnt)
439 */ 442 */
440struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old, 443struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old,
441 struct rpc_program *program, 444 struct rpc_program *program,
442 int vers) 445 u32 vers)
443{ 446{
444 struct rpc_clnt *clnt; 447 struct rpc_clnt *clnt;
445 struct rpc_version *version; 448 struct rpc_version *version;
@@ -843,8 +846,7 @@ call_allocate(struct rpc_task *task)
843 dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid); 846 dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
844 847
845 if (RPC_IS_ASYNC(task) || !signalled()) { 848 if (RPC_IS_ASYNC(task) || !signalled()) {
846 xprt_release(task); 849 task->tk_action = call_allocate;
847 task->tk_action = call_reserve;
848 rpc_delay(task, HZ>>4); 850 rpc_delay(task, HZ>>4);
849 return; 851 return;
850 } 852 }
@@ -871,6 +873,7 @@ rpc_xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
871 buf->head[0].iov_len = len; 873 buf->head[0].iov_len = len;
872 buf->tail[0].iov_len = 0; 874 buf->tail[0].iov_len = 0;
873 buf->page_len = 0; 875 buf->page_len = 0;
876 buf->flags = 0;
874 buf->len = 0; 877 buf->len = 0;
875 buf->buflen = len; 878 buf->buflen = len;
876} 879}
@@ -937,7 +940,7 @@ call_bind(struct rpc_task *task)
937static void 940static void
938call_bind_status(struct rpc_task *task) 941call_bind_status(struct rpc_task *task)
939{ 942{
940 int status = -EACCES; 943 int status = -EIO;
941 944
942 if (task->tk_status >= 0) { 945 if (task->tk_status >= 0) {
943 dprint_status(task); 946 dprint_status(task);
@@ -947,9 +950,20 @@ call_bind_status(struct rpc_task *task)
947 } 950 }
948 951
949 switch (task->tk_status) { 952 switch (task->tk_status) {
953 case -EAGAIN:
954 dprintk("RPC: %5u rpcbind waiting for another request "
955 "to finish\n", task->tk_pid);
956 /* avoid busy-waiting here -- could be a network outage. */
957 rpc_delay(task, 5*HZ);
958 goto retry_timeout;
950 case -EACCES: 959 case -EACCES:
951 dprintk("RPC: %5u remote rpcbind: RPC program/version " 960 dprintk("RPC: %5u remote rpcbind: RPC program/version "
952 "unavailable\n", task->tk_pid); 961 "unavailable\n", task->tk_pid);
962 /* fail immediately if this is an RPC ping */
963 if (task->tk_msg.rpc_proc->p_proc == 0) {
964 status = -EOPNOTSUPP;
965 break;
966 }
953 rpc_delay(task, 3*HZ); 967 rpc_delay(task, 3*HZ);
954 goto retry_timeout; 968 goto retry_timeout;
955 case -ETIMEDOUT: 969 case -ETIMEDOUT:
@@ -957,6 +971,7 @@ call_bind_status(struct rpc_task *task)
957 task->tk_pid); 971 task->tk_pid);
958 goto retry_timeout; 972 goto retry_timeout;
959 case -EPFNOSUPPORT: 973 case -EPFNOSUPPORT:
974 /* server doesn't support any rpcbind version we know of */
960 dprintk("RPC: %5u remote rpcbind service unavailable\n", 975 dprintk("RPC: %5u remote rpcbind service unavailable\n",
961 task->tk_pid); 976 task->tk_pid);
962 break; 977 break;
@@ -969,7 +984,6 @@ call_bind_status(struct rpc_task *task)
969 default: 984 default:
970 dprintk("RPC: %5u unrecognized rpcbind error (%d)\n", 985 dprintk("RPC: %5u unrecognized rpcbind error (%d)\n",
971 task->tk_pid, -task->tk_status); 986 task->tk_pid, -task->tk_status);
972 status = -EIO;
973 } 987 }
974 988
975 rpc_exit(task, status); 989 rpc_exit(task, status);
@@ -1257,7 +1271,6 @@ call_refresh(struct rpc_task *task)
1257{ 1271{
1258 dprint_status(task); 1272 dprint_status(task);
1259 1273
1260 xprt_release(task); /* Must do to obtain new XID */
1261 task->tk_action = call_refreshresult; 1274 task->tk_action = call_refreshresult;
1262 task->tk_status = 0; 1275 task->tk_status = 0;
1263 task->tk_client->cl_stats->rpcauthrefresh++; 1276 task->tk_client->cl_stats->rpcauthrefresh++;
@@ -1375,6 +1388,8 @@ call_verify(struct rpc_task *task)
1375 dprintk("RPC: %5u %s: retry stale creds\n", 1388 dprintk("RPC: %5u %s: retry stale creds\n",
1376 task->tk_pid, __FUNCTION__); 1389 task->tk_pid, __FUNCTION__);
1377 rpcauth_invalcred(task); 1390 rpcauth_invalcred(task);
1391 /* Ensure we obtain a new XID! */
1392 xprt_release(task);
1378 task->tk_action = call_refresh; 1393 task->tk_action = call_refresh;
1379 goto out_retry; 1394 goto out_retry;
1380 case RPC_AUTH_BADCRED: 1395 case RPC_AUTH_BADCRED:
@@ -1523,13 +1538,18 @@ void rpc_show_tasks(void)
1523 spin_lock(&clnt->cl_lock); 1538 spin_lock(&clnt->cl_lock);
1524 list_for_each_entry(t, &clnt->cl_tasks, tk_task) { 1539 list_for_each_entry(t, &clnt->cl_tasks, tk_task) {
1525 const char *rpc_waitq = "none"; 1540 const char *rpc_waitq = "none";
1541 int proc;
1542
1543 if (t->tk_msg.rpc_proc)
1544 proc = t->tk_msg.rpc_proc->p_proc;
1545 else
1546 proc = -1;
1526 1547
1527 if (RPC_IS_QUEUED(t)) 1548 if (RPC_IS_QUEUED(t))
1528 rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq); 1549 rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq);
1529 1550
1530 printk("%5u %04d %04x %6d %8p %6d %8p %8ld %8s %8p %8p\n", 1551 printk("%5u %04d %04x %6d %8p %6d %8p %8ld %8s %8p %8p\n",
1531 t->tk_pid, 1552 t->tk_pid, proc,
1532 (t->tk_msg.rpc_proc ? t->tk_msg.rpc_proc->p_proc : -1),
1533 t->tk_flags, t->tk_status, 1553 t->tk_flags, t->tk_status,
1534 t->tk_client, 1554 t->tk_client,
1535 (t->tk_client ? t->tk_client->cl_prog : 0), 1555 (t->tk_client ? t->tk_client->cl_prog : 0),
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 669e12a4ed18..c8433e8865aa 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -14,7 +14,7 @@
14#include <linux/pagemap.h> 14#include <linux/pagemap.h>
15#include <linux/mount.h> 15#include <linux/mount.h>
16#include <linux/namei.h> 16#include <linux/namei.h>
17#include <linux/dnotify.h> 17#include <linux/fsnotify.h>
18#include <linux/kernel.h> 18#include <linux/kernel.h>
19 19
20#include <asm/ioctls.h> 20#include <asm/ioctls.h>
@@ -329,6 +329,7 @@ rpc_show_info(struct seq_file *m, void *v)
329 clnt->cl_prog, clnt->cl_vers); 329 clnt->cl_prog, clnt->cl_vers);
330 seq_printf(m, "address: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR)); 330 seq_printf(m, "address: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR));
331 seq_printf(m, "protocol: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_PROTO)); 331 seq_printf(m, "protocol: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_PROTO));
332 seq_printf(m, "port: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_PORT));
332 return 0; 333 return 0;
333} 334}
334 335
@@ -585,6 +586,7 @@ rpc_populate(struct dentry *parent,
585 if (S_ISDIR(mode)) 586 if (S_ISDIR(mode))
586 inc_nlink(dir); 587 inc_nlink(dir);
587 d_add(dentry, inode); 588 d_add(dentry, inode);
589 fsnotify_create(dir, dentry);
588 } 590 }
589 mutex_unlock(&dir->i_mutex); 591 mutex_unlock(&dir->i_mutex);
590 return 0; 592 return 0;
@@ -606,7 +608,7 @@ __rpc_mkdir(struct inode *dir, struct dentry *dentry)
606 inode->i_ino = iunique(dir->i_sb, 100); 608 inode->i_ino = iunique(dir->i_sb, 100);
607 d_instantiate(dentry, inode); 609 d_instantiate(dentry, inode);
608 inc_nlink(dir); 610 inc_nlink(dir);
609 inode_dir_notify(dir, DN_CREATE); 611 fsnotify_mkdir(dir, dentry);
610 return 0; 612 return 0;
611out_err: 613out_err:
612 printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %s\n", 614 printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %s\n",
@@ -748,7 +750,7 @@ rpc_mkpipe(struct dentry *parent, const char *name, void *private, struct rpc_pi
748 rpci->flags = flags; 750 rpci->flags = flags;
749 rpci->ops = ops; 751 rpci->ops = ops;
750 rpci->nkern_readwriters = 1; 752 rpci->nkern_readwriters = 1;
751 inode_dir_notify(dir, DN_CREATE); 753 fsnotify_create(dir, dentry);
752 dget(dentry); 754 dget(dentry);
753out: 755out:
754 mutex_unlock(&dir->i_mutex); 756 mutex_unlock(&dir->i_mutex);
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index d1740dbab991..a05493aedb68 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -16,11 +16,14 @@
16 16
17#include <linux/types.h> 17#include <linux/types.h>
18#include <linux/socket.h> 18#include <linux/socket.h>
19#include <linux/in.h>
20#include <linux/in6.h>
19#include <linux/kernel.h> 21#include <linux/kernel.h>
20#include <linux/errno.h> 22#include <linux/errno.h>
21 23
22#include <linux/sunrpc/clnt.h> 24#include <linux/sunrpc/clnt.h>
23#include <linux/sunrpc/sched.h> 25#include <linux/sunrpc/sched.h>
26#include <linux/sunrpc/xprtsock.h>
24 27
25#ifdef RPC_DEBUG 28#ifdef RPC_DEBUG
26# define RPCDBG_FACILITY RPCDBG_BIND 29# define RPCDBG_FACILITY RPCDBG_BIND
@@ -91,26 +94,6 @@ enum {
91#define RPCB_MAXADDRLEN (128u) 94#define RPCB_MAXADDRLEN (128u)
92 95
93/* 96/*
94 * r_netid
95 *
96 * Quoting RFC 3530, section 2.2:
97 *
98 * For TCP over IPv4 the value of r_netid is the string "tcp". For UDP
99 * over IPv4 the value of r_netid is the string "udp".
100 *
101 * ...
102 *
103 * For TCP over IPv6 the value of r_netid is the string "tcp6". For UDP
104 * over IPv6 the value of r_netid is the string "udp6".
105 */
106#define RPCB_NETID_UDP "\165\144\160" /* "udp" */
107#define RPCB_NETID_TCP "\164\143\160" /* "tcp" */
108#define RPCB_NETID_UDP6 "\165\144\160\066" /* "udp6" */
109#define RPCB_NETID_TCP6 "\164\143\160\066" /* "tcp6" */
110
111#define RPCB_MAXNETIDLEN (4u)
112
113/*
114 * r_owner 97 * r_owner
115 * 98 *
116 * The "owner" is allowed to unset a service in the rpcbind database. 99 * The "owner" is allowed to unset a service in the rpcbind database.
@@ -120,7 +103,7 @@ enum {
120#define RPCB_MAXOWNERLEN sizeof(RPCB_OWNER_STRING) 103#define RPCB_MAXOWNERLEN sizeof(RPCB_OWNER_STRING)
121 104
122static void rpcb_getport_done(struct rpc_task *, void *); 105static void rpcb_getport_done(struct rpc_task *, void *);
123extern struct rpc_program rpcb_program; 106static struct rpc_program rpcb_program;
124 107
125struct rpcbind_args { 108struct rpcbind_args {
126 struct rpc_xprt * r_xprt; 109 struct rpc_xprt * r_xprt;
@@ -137,10 +120,13 @@ struct rpcbind_args {
137static struct rpc_procinfo rpcb_procedures2[]; 120static struct rpc_procinfo rpcb_procedures2[];
138static struct rpc_procinfo rpcb_procedures3[]; 121static struct rpc_procinfo rpcb_procedures3[];
139 122
140static struct rpcb_info { 123struct rpcb_info {
141 int rpc_vers; 124 int rpc_vers;
142 struct rpc_procinfo * rpc_proc; 125 struct rpc_procinfo * rpc_proc;
143} rpcb_next_version[]; 126};
127
128static struct rpcb_info rpcb_next_version[];
129static struct rpcb_info rpcb_next_version6[];
144 130
145static void rpcb_getport_prepare(struct rpc_task *task, void *calldata) 131static void rpcb_getport_prepare(struct rpc_task *task, void *calldata)
146{ 132{
@@ -190,7 +176,17 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
190 RPC_CLNT_CREATE_INTR), 176 RPC_CLNT_CREATE_INTR),
191 }; 177 };
192 178
193 ((struct sockaddr_in *)srvaddr)->sin_port = htons(RPCBIND_PORT); 179 switch (srvaddr->sa_family) {
180 case AF_INET:
181 ((struct sockaddr_in *)srvaddr)->sin_port = htons(RPCBIND_PORT);
182 break;
183 case AF_INET6:
184 ((struct sockaddr_in6 *)srvaddr)->sin6_port = htons(RPCBIND_PORT);
185 break;
186 default:
187 return NULL;
188 }
189
194 if (!privileged) 190 if (!privileged)
195 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; 191 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
196 return rpc_create(&args); 192 return rpc_create(&args);
@@ -234,7 +230,7 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
234 prog, vers, prot, port); 230 prog, vers, prot, port);
235 231
236 rpcb_clnt = rpcb_create("localhost", (struct sockaddr *) &sin, 232 rpcb_clnt = rpcb_create("localhost", (struct sockaddr *) &sin,
237 IPPROTO_UDP, 2, 1); 233 XPRT_TRANSPORT_UDP, 2, 1);
238 if (IS_ERR(rpcb_clnt)) 234 if (IS_ERR(rpcb_clnt))
239 return PTR_ERR(rpcb_clnt); 235 return PTR_ERR(rpcb_clnt);
240 236
@@ -316,6 +312,7 @@ void rpcb_getport_async(struct rpc_task *task)
316 struct rpc_task *child; 312 struct rpc_task *child;
317 struct sockaddr addr; 313 struct sockaddr addr;
318 int status; 314 int status;
315 struct rpcb_info *info;
319 316
320 dprintk("RPC: %5u %s(%s, %u, %u, %d)\n", 317 dprintk("RPC: %5u %s(%s, %u, %u, %d)\n",
321 task->tk_pid, __FUNCTION__, 318 task->tk_pid, __FUNCTION__,
@@ -325,7 +322,7 @@ void rpcb_getport_async(struct rpc_task *task)
325 BUG_ON(clnt->cl_parent != clnt); 322 BUG_ON(clnt->cl_parent != clnt);
326 323
327 if (xprt_test_and_set_binding(xprt)) { 324 if (xprt_test_and_set_binding(xprt)) {
328 status = -EACCES; /* tell caller to check again */ 325 status = -EAGAIN; /* tell caller to check again */
329 dprintk("RPC: %5u %s: waiting for another binder\n", 326 dprintk("RPC: %5u %s: waiting for another binder\n",
330 task->tk_pid, __FUNCTION__); 327 task->tk_pid, __FUNCTION__);
331 goto bailout_nowake; 328 goto bailout_nowake;
@@ -343,18 +340,43 @@ void rpcb_getport_async(struct rpc_task *task)
343 goto bailout_nofree; 340 goto bailout_nofree;
344 } 341 }
345 342
346 if (rpcb_next_version[xprt->bind_index].rpc_proc == NULL) { 343 rpc_peeraddr(clnt, (void *)&addr, sizeof(addr));
344
345 /* Don't ever use rpcbind v2 for AF_INET6 requests */
346 switch (addr.sa_family) {
347 case AF_INET:
348 info = rpcb_next_version;
349 break;
350 case AF_INET6:
351 info = rpcb_next_version6;
352 break;
353 default:
354 status = -EAFNOSUPPORT;
355 dprintk("RPC: %5u %s: bad address family\n",
356 task->tk_pid, __FUNCTION__);
357 goto bailout_nofree;
358 }
359 if (info[xprt->bind_index].rpc_proc == NULL) {
347 xprt->bind_index = 0; 360 xprt->bind_index = 0;
348 status = -EACCES; /* tell caller to try again later */ 361 status = -EPFNOSUPPORT;
349 dprintk("RPC: %5u %s: no more getport versions available\n", 362 dprintk("RPC: %5u %s: no more getport versions available\n",
350 task->tk_pid, __FUNCTION__); 363 task->tk_pid, __FUNCTION__);
351 goto bailout_nofree; 364 goto bailout_nofree;
352 } 365 }
353 bind_version = rpcb_next_version[xprt->bind_index].rpc_vers; 366 bind_version = info[xprt->bind_index].rpc_vers;
354 367
355 dprintk("RPC: %5u %s: trying rpcbind version %u\n", 368 dprintk("RPC: %5u %s: trying rpcbind version %u\n",
356 task->tk_pid, __FUNCTION__, bind_version); 369 task->tk_pid, __FUNCTION__, bind_version);
357 370
371 rpcb_clnt = rpcb_create(clnt->cl_server, &addr, xprt->prot,
372 bind_version, 0);
373 if (IS_ERR(rpcb_clnt)) {
374 status = PTR_ERR(rpcb_clnt);
375 dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n",
376 task->tk_pid, __FUNCTION__, PTR_ERR(rpcb_clnt));
377 goto bailout_nofree;
378 }
379
358 map = kzalloc(sizeof(struct rpcbind_args), GFP_ATOMIC); 380 map = kzalloc(sizeof(struct rpcbind_args), GFP_ATOMIC);
359 if (!map) { 381 if (!map) {
360 status = -ENOMEM; 382 status = -ENOMEM;
@@ -367,28 +389,19 @@ void rpcb_getport_async(struct rpc_task *task)
367 map->r_prot = xprt->prot; 389 map->r_prot = xprt->prot;
368 map->r_port = 0; 390 map->r_port = 0;
369 map->r_xprt = xprt_get(xprt); 391 map->r_xprt = xprt_get(xprt);
370 map->r_netid = (xprt->prot == IPPROTO_TCP) ? RPCB_NETID_TCP : 392 map->r_netid = rpc_peeraddr2str(clnt, RPC_DISPLAY_NETID);
371 RPCB_NETID_UDP; 393 memcpy(map->r_addr,
372 memcpy(&map->r_addr, rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR), 394 rpc_peeraddr2str(rpcb_clnt, RPC_DISPLAY_UNIVERSAL_ADDR),
373 sizeof(map->r_addr)); 395 sizeof(map->r_addr));
374 map->r_owner = RPCB_OWNER_STRING; /* ignored for GETADDR */ 396 map->r_owner = RPCB_OWNER_STRING; /* ignored for GETADDR */
375 397
376 rpc_peeraddr(clnt, (void *)&addr, sizeof(addr));
377 rpcb_clnt = rpcb_create(clnt->cl_server, &addr, xprt->prot, bind_version, 0);
378 if (IS_ERR(rpcb_clnt)) {
379 status = PTR_ERR(rpcb_clnt);
380 dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n",
381 task->tk_pid, __FUNCTION__, PTR_ERR(rpcb_clnt));
382 goto bailout;
383 }
384
385 child = rpc_run_task(rpcb_clnt, RPC_TASK_ASYNC, &rpcb_getport_ops, map); 398 child = rpc_run_task(rpcb_clnt, RPC_TASK_ASYNC, &rpcb_getport_ops, map);
386 rpc_release_client(rpcb_clnt); 399 rpc_release_client(rpcb_clnt);
387 if (IS_ERR(child)) { 400 if (IS_ERR(child)) {
388 status = -EIO; 401 status = -EIO;
389 dprintk("RPC: %5u %s: rpc_run_task failed\n", 402 dprintk("RPC: %5u %s: rpc_run_task failed\n",
390 task->tk_pid, __FUNCTION__); 403 task->tk_pid, __FUNCTION__);
391 goto bailout_nofree; 404 goto bailout;
392 } 405 }
393 rpc_put_task(child); 406 rpc_put_task(child);
394 407
@@ -403,6 +416,7 @@ bailout_nofree:
403bailout_nowake: 416bailout_nowake:
404 task->tk_status = status; 417 task->tk_status = status;
405} 418}
419EXPORT_SYMBOL_GPL(rpcb_getport_async);
406 420
407/* 421/*
408 * Rpcbind child task calls this callback via tk_exit. 422 * Rpcbind child task calls this callback via tk_exit.
@@ -413,6 +427,10 @@ static void rpcb_getport_done(struct rpc_task *child, void *data)
413 struct rpc_xprt *xprt = map->r_xprt; 427 struct rpc_xprt *xprt = map->r_xprt;
414 int status = child->tk_status; 428 int status = child->tk_status;
415 429
430 /* Garbage reply: retry with a lesser rpcbind version */
431 if (status == -EIO)
432 status = -EPROTONOSUPPORT;
433
416 /* rpcbind server doesn't support this rpcbind protocol version */ 434 /* rpcbind server doesn't support this rpcbind protocol version */
417 if (status == -EPROTONOSUPPORT) 435 if (status == -EPROTONOSUPPORT)
418 xprt->bind_index++; 436 xprt->bind_index++;
@@ -490,16 +508,24 @@ static int rpcb_decode_getaddr(struct rpc_rqst *req, __be32 *p,
490 unsigned short *portp) 508 unsigned short *portp)
491{ 509{
492 char *addr; 510 char *addr;
493 int addr_len, c, i, f, first, val; 511 u32 addr_len;
512 int c, i, f, first, val;
494 513
495 *portp = 0; 514 *portp = 0;
496 addr_len = (unsigned int) ntohl(*p++); 515 addr_len = ntohl(*p++);
497 if (addr_len > RPCB_MAXADDRLEN) /* sanity */ 516
498 return -EINVAL; 517 /*
499 518 * Simple sanity check. The smallest possible universal
500 dprintk("RPC: rpcb_decode_getaddr returned string: '%s'\n", 519 * address is an IPv4 address string containing 11 bytes.
501 (char *) p); 520 */
502 521 if (addr_len < 11 || addr_len > RPCB_MAXADDRLEN)
522 goto out_err;
523
524 /*
525 * Start at the end and walk backwards until the first dot
526 * is encountered. When the second dot is found, we have
527 * both parts of the port number.
528 */
503 addr = (char *)p; 529 addr = (char *)p;
504 val = 0; 530 val = 0;
505 first = 1; 531 first = 1;
@@ -521,8 +547,19 @@ static int rpcb_decode_getaddr(struct rpc_rqst *req, __be32 *p,
521 } 547 }
522 } 548 }
523 549
550 /*
551 * Simple sanity check. If we never saw a dot in the reply,
552 * then this was probably just garbage.
553 */
554 if (first)
555 goto out_err;
556
524 dprintk("RPC: rpcb_decode_getaddr port=%u\n", *portp); 557 dprintk("RPC: rpcb_decode_getaddr port=%u\n", *portp);
525 return 0; 558 return 0;
559
560out_err:
561 dprintk("RPC: rpcbind server returned malformed reply\n");
562 return -EIO;
526} 563}
527 564
528#define RPCB_program_sz (1u) 565#define RPCB_program_sz (1u)
@@ -531,7 +568,7 @@ static int rpcb_decode_getaddr(struct rpc_rqst *req, __be32 *p,
531#define RPCB_port_sz (1u) 568#define RPCB_port_sz (1u)
532#define RPCB_boolean_sz (1u) 569#define RPCB_boolean_sz (1u)
533 570
534#define RPCB_netid_sz (1+XDR_QUADLEN(RPCB_MAXNETIDLEN)) 571#define RPCB_netid_sz (1+XDR_QUADLEN(RPCBIND_MAXNETIDLEN))
535#define RPCB_addr_sz (1+XDR_QUADLEN(RPCB_MAXADDRLEN)) 572#define RPCB_addr_sz (1+XDR_QUADLEN(RPCB_MAXADDRLEN))
536#define RPCB_ownerstring_sz (1+XDR_QUADLEN(RPCB_MAXOWNERLEN)) 573#define RPCB_ownerstring_sz (1+XDR_QUADLEN(RPCB_MAXOWNERLEN))
537 574
@@ -593,6 +630,14 @@ static struct rpcb_info rpcb_next_version[] = {
593 { 0, NULL }, 630 { 0, NULL },
594}; 631};
595 632
633static struct rpcb_info rpcb_next_version6[] = {
634#ifdef CONFIG_SUNRPC_BIND34
635 { 4, &rpcb_procedures4[RPCBPROC_GETVERSADDR] },
636 { 3, &rpcb_procedures3[RPCBPROC_GETADDR] },
637#endif
638 { 0, NULL },
639};
640
596static struct rpc_version rpcb_version2 = { 641static struct rpc_version rpcb_version2 = {
597 .number = 2, 642 .number = 2,
598 .nrprocs = RPCB_HIGHPROC_2, 643 .nrprocs = RPCB_HIGHPROC_2,
@@ -621,7 +666,7 @@ static struct rpc_version *rpcb_version[] = {
621 666
622static struct rpc_stat rpcb_stats; 667static struct rpc_stat rpcb_stats;
623 668
624struct rpc_program rpcb_program = { 669static struct rpc_program rpcb_program = {
625 .name = "rpcbind", 670 .name = "rpcbind",
626 .number = RPCBIND_PROGRAM, 671 .number = RPCBIND_PROGRAM,
627 .nrvers = ARRAY_SIZE(rpcb_version), 672 .nrvers = ARRAY_SIZE(rpcb_version),
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 954d7ec86c7e..3c773c53e12e 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -777,6 +777,7 @@ void *rpc_malloc(struct rpc_task *task, size_t size)
777 task->tk_pid, size, buf); 777 task->tk_pid, size, buf);
778 return &buf->data; 778 return &buf->data;
779} 779}
780EXPORT_SYMBOL_GPL(rpc_malloc);
780 781
781/** 782/**
782 * rpc_free - free buffer allocated via rpc_malloc 783 * rpc_free - free buffer allocated via rpc_malloc
@@ -802,6 +803,7 @@ void rpc_free(void *buffer)
802 else 803 else
803 kfree(buf); 804 kfree(buf);
804} 805}
806EXPORT_SYMBOL_GPL(rpc_free);
805 807
806/* 808/*
807 * Creation and deletion of RPC task structures 809 * Creation and deletion of RPC task structures
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index 1d377d1ab7f4..97ac45f034d6 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -34,6 +34,7 @@ size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len)
34 desc->offset += len; 34 desc->offset += len;
35 return len; 35 return len;
36} 36}
37EXPORT_SYMBOL_GPL(xdr_skb_read_bits);
37 38
38/** 39/**
39 * xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer 40 * xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer
@@ -137,6 +138,7 @@ copy_tail:
137out: 138out:
138 return copied; 139 return copied;
139} 140}
141EXPORT_SYMBOL_GPL(xdr_partial_copy_from_skb);
140 142
141/** 143/**
142 * csum_partial_copy_to_xdr - checksum and copy data 144 * csum_partial_copy_to_xdr - checksum and copy data
@@ -179,3 +181,4 @@ no_checksum:
179 return -1; 181 return -1;
180 return 0; 182 return 0;
181} 183}
184EXPORT_SYMBOL_GPL(csum_partial_copy_to_xdr);
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 384c4ad5ab86..33d89e842c85 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -20,7 +20,7 @@
20#include <linux/sunrpc/auth.h> 20#include <linux/sunrpc/auth.h>
21#include <linux/workqueue.h> 21#include <linux/workqueue.h>
22#include <linux/sunrpc/rpc_pipe_fs.h> 22#include <linux/sunrpc/rpc_pipe_fs.h>
23 23#include <linux/sunrpc/xprtsock.h>
24 24
25/* RPC scheduler */ 25/* RPC scheduler */
26EXPORT_SYMBOL(rpc_execute); 26EXPORT_SYMBOL(rpc_execute);
diff --git a/net/sunrpc/timer.c b/net/sunrpc/timer.c
index 8142fdb8a930..31becbf09263 100644
--- a/net/sunrpc/timer.c
+++ b/net/sunrpc/timer.c
@@ -17,6 +17,7 @@
17 17
18#include <linux/types.h> 18#include <linux/types.h>
19#include <linux/unistd.h> 19#include <linux/unistd.h>
20#include <linux/module.h>
20 21
21#include <linux/sunrpc/clnt.h> 22#include <linux/sunrpc/clnt.h>
22 23
@@ -40,6 +41,7 @@ rpc_init_rtt(struct rpc_rtt *rt, unsigned long timeo)
40 rt->ntimeouts[i] = 0; 41 rt->ntimeouts[i] = 0;
41 } 42 }
42} 43}
44EXPORT_SYMBOL_GPL(rpc_init_rtt);
43 45
44/* 46/*
45 * NB: When computing the smoothed RTT and standard deviation, 47 * NB: When computing the smoothed RTT and standard deviation,
@@ -75,6 +77,7 @@ rpc_update_rtt(struct rpc_rtt *rt, unsigned timer, long m)
75 if (*sdrtt < RPC_RTO_MIN) 77 if (*sdrtt < RPC_RTO_MIN)
76 *sdrtt = RPC_RTO_MIN; 78 *sdrtt = RPC_RTO_MIN;
77} 79}
80EXPORT_SYMBOL_GPL(rpc_update_rtt);
78 81
79/* 82/*
80 * Estimate rto for an nfs rpc sent via an unreliable datagram. 83 * Estimate rto for an nfs rpc sent via an unreliable datagram.
@@ -103,3 +106,4 @@ rpc_calc_rto(struct rpc_rtt *rt, unsigned timer)
103 106
104 return res; 107 return res;
105} 108}
109EXPORT_SYMBOL_GPL(rpc_calc_rto);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index c8c2edccad7e..282a9a2ec90c 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -62,6 +62,9 @@ static inline void do_xprt_reserve(struct rpc_task *);
62static void xprt_connect_status(struct rpc_task *task); 62static void xprt_connect_status(struct rpc_task *task);
63static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); 63static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
64 64
65static spinlock_t xprt_list_lock = SPIN_LOCK_UNLOCKED;
66static LIST_HEAD(xprt_list);
67
65/* 68/*
66 * The transport code maintains an estimate on the maximum number of out- 69 * The transport code maintains an estimate on the maximum number of out-
67 * standing RPC requests, using a smoothed version of the congestion 70 * standing RPC requests, using a smoothed version of the congestion
@@ -81,6 +84,78 @@ static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
81#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) 84#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd)
82 85
83/** 86/**
87 * xprt_register_transport - register a transport implementation
88 * @transport: transport to register
89 *
90 * If a transport implementation is loaded as a kernel module, it can
91 * call this interface to make itself known to the RPC client.
92 *
93 * Returns:
94 * 0: transport successfully registered
95 * -EEXIST: transport already registered
96 * -EINVAL: transport module being unloaded
97 */
98int xprt_register_transport(struct xprt_class *transport)
99{
100 struct xprt_class *t;
101 int result;
102
103 result = -EEXIST;
104 spin_lock(&xprt_list_lock);
105 list_for_each_entry(t, &xprt_list, list) {
106 /* don't register the same transport class twice */
107 if (t->ident == transport->ident)
108 goto out;
109 }
110
111 result = -EINVAL;
112 if (try_module_get(THIS_MODULE)) {
113 list_add_tail(&transport->list, &xprt_list);
114 printk(KERN_INFO "RPC: Registered %s transport module.\n",
115 transport->name);
116 result = 0;
117 }
118
119out:
120 spin_unlock(&xprt_list_lock);
121 return result;
122}
123EXPORT_SYMBOL_GPL(xprt_register_transport);
124
125/**
126 * xprt_unregister_transport - unregister a transport implementation
127 * @transport: transport to unregister
128 *
129 * Returns:
130 * 0: transport successfully unregistered
131 * -ENOENT: transport never registered
132 */
133int xprt_unregister_transport(struct xprt_class *transport)
134{
135 struct xprt_class *t;
136 int result;
137
138 result = 0;
139 spin_lock(&xprt_list_lock);
140 list_for_each_entry(t, &xprt_list, list) {
141 if (t == transport) {
142 printk(KERN_INFO
143 "RPC: Unregistered %s transport module.\n",
144 transport->name);
145 list_del_init(&transport->list);
146 module_put(THIS_MODULE);
147 goto out;
148 }
149 }
150 result = -ENOENT;
151
152out:
153 spin_unlock(&xprt_list_lock);
154 return result;
155}
156EXPORT_SYMBOL_GPL(xprt_unregister_transport);
157
158/**
84 * xprt_reserve_xprt - serialize write access to transports 159 * xprt_reserve_xprt - serialize write access to transports
85 * @task: task that is requesting access to the transport 160 * @task: task that is requesting access to the transport
86 * 161 *
@@ -118,6 +193,7 @@ out_sleep:
118 rpc_sleep_on(&xprt->sending, task, NULL, NULL); 193 rpc_sleep_on(&xprt->sending, task, NULL, NULL);
119 return 0; 194 return 0;
120} 195}
196EXPORT_SYMBOL_GPL(xprt_reserve_xprt);
121 197
122static void xprt_clear_locked(struct rpc_xprt *xprt) 198static void xprt_clear_locked(struct rpc_xprt *xprt)
123{ 199{
@@ -167,6 +243,7 @@ out_sleep:
167 rpc_sleep_on(&xprt->sending, task, NULL, NULL); 243 rpc_sleep_on(&xprt->sending, task, NULL, NULL);
168 return 0; 244 return 0;
169} 245}
246EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong);
170 247
171static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) 248static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
172{ 249{
@@ -246,6 +323,7 @@ void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
246 __xprt_lock_write_next(xprt); 323 __xprt_lock_write_next(xprt);
247 } 324 }
248} 325}
326EXPORT_SYMBOL_GPL(xprt_release_xprt);
249 327
250/** 328/**
251 * xprt_release_xprt_cong - allow other requests to use a transport 329 * xprt_release_xprt_cong - allow other requests to use a transport
@@ -262,6 +340,7 @@ void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
262 __xprt_lock_write_next_cong(xprt); 340 __xprt_lock_write_next_cong(xprt);
263 } 341 }
264} 342}
343EXPORT_SYMBOL_GPL(xprt_release_xprt_cong);
265 344
266static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) 345static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task)
267{ 346{
@@ -314,6 +393,7 @@ void xprt_release_rqst_cong(struct rpc_task *task)
314{ 393{
315 __xprt_put_cong(task->tk_xprt, task->tk_rqstp); 394 __xprt_put_cong(task->tk_xprt, task->tk_rqstp);
316} 395}
396EXPORT_SYMBOL_GPL(xprt_release_rqst_cong);
317 397
318/** 398/**
319 * xprt_adjust_cwnd - adjust transport congestion window 399 * xprt_adjust_cwnd - adjust transport congestion window
@@ -345,6 +425,7 @@ void xprt_adjust_cwnd(struct rpc_task *task, int result)
345 xprt->cwnd = cwnd; 425 xprt->cwnd = cwnd;
346 __xprt_put_cong(xprt, req); 426 __xprt_put_cong(xprt, req);
347} 427}
428EXPORT_SYMBOL_GPL(xprt_adjust_cwnd);
348 429
349/** 430/**
350 * xprt_wake_pending_tasks - wake all tasks on a transport's pending queue 431 * xprt_wake_pending_tasks - wake all tasks on a transport's pending queue
@@ -359,6 +440,7 @@ void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status)
359 else 440 else
360 rpc_wake_up(&xprt->pending); 441 rpc_wake_up(&xprt->pending);
361} 442}
443EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks);
362 444
363/** 445/**
364 * xprt_wait_for_buffer_space - wait for transport output buffer to clear 446 * xprt_wait_for_buffer_space - wait for transport output buffer to clear
@@ -373,6 +455,7 @@ void xprt_wait_for_buffer_space(struct rpc_task *task)
373 task->tk_timeout = req->rq_timeout; 455 task->tk_timeout = req->rq_timeout;
374 rpc_sleep_on(&xprt->pending, task, NULL, NULL); 456 rpc_sleep_on(&xprt->pending, task, NULL, NULL);
375} 457}
458EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space);
376 459
377/** 460/**
378 * xprt_write_space - wake the task waiting for transport output buffer space 461 * xprt_write_space - wake the task waiting for transport output buffer space
@@ -393,6 +476,7 @@ void xprt_write_space(struct rpc_xprt *xprt)
393 } 476 }
394 spin_unlock_bh(&xprt->transport_lock); 477 spin_unlock_bh(&xprt->transport_lock);
395} 478}
479EXPORT_SYMBOL_GPL(xprt_write_space);
396 480
397/** 481/**
398 * xprt_set_retrans_timeout_def - set a request's retransmit timeout 482 * xprt_set_retrans_timeout_def - set a request's retransmit timeout
@@ -406,6 +490,7 @@ void xprt_set_retrans_timeout_def(struct rpc_task *task)
406{ 490{
407 task->tk_timeout = task->tk_rqstp->rq_timeout; 491 task->tk_timeout = task->tk_rqstp->rq_timeout;
408} 492}
493EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_def);
409 494
410/* 495/*
411 * xprt_set_retrans_timeout_rtt - set a request's retransmit timeout 496 * xprt_set_retrans_timeout_rtt - set a request's retransmit timeout
@@ -425,6 +510,7 @@ void xprt_set_retrans_timeout_rtt(struct rpc_task *task)
425 if (task->tk_timeout > max_timeout || task->tk_timeout == 0) 510 if (task->tk_timeout > max_timeout || task->tk_timeout == 0)
426 task->tk_timeout = max_timeout; 511 task->tk_timeout = max_timeout;
427} 512}
513EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_rtt);
428 514
429static void xprt_reset_majortimeo(struct rpc_rqst *req) 515static void xprt_reset_majortimeo(struct rpc_rqst *req)
430{ 516{
@@ -500,6 +586,7 @@ void xprt_disconnect(struct rpc_xprt *xprt)
500 xprt_wake_pending_tasks(xprt, -ENOTCONN); 586 xprt_wake_pending_tasks(xprt, -ENOTCONN);
501 spin_unlock_bh(&xprt->transport_lock); 587 spin_unlock_bh(&xprt->transport_lock);
502} 588}
589EXPORT_SYMBOL_GPL(xprt_disconnect);
503 590
504static void 591static void
505xprt_init_autodisconnect(unsigned long data) 592xprt_init_autodisconnect(unsigned long data)
@@ -610,6 +697,7 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
610 xprt->stat.bad_xids++; 697 xprt->stat.bad_xids++;
611 return NULL; 698 return NULL;
612} 699}
700EXPORT_SYMBOL_GPL(xprt_lookup_rqst);
613 701
614/** 702/**
615 * xprt_update_rtt - update an RPC client's RTT state after receiving a reply 703 * xprt_update_rtt - update an RPC client's RTT state after receiving a reply
@@ -629,6 +717,7 @@ void xprt_update_rtt(struct rpc_task *task)
629 rpc_set_timeo(rtt, timer, req->rq_ntrans - 1); 717 rpc_set_timeo(rtt, timer, req->rq_ntrans - 1);
630 } 718 }
631} 719}
720EXPORT_SYMBOL_GPL(xprt_update_rtt);
632 721
633/** 722/**
634 * xprt_complete_rqst - called when reply processing is complete 723 * xprt_complete_rqst - called when reply processing is complete
@@ -653,6 +742,7 @@ void xprt_complete_rqst(struct rpc_task *task, int copied)
653 req->rq_received = req->rq_private_buf.len = copied; 742 req->rq_received = req->rq_private_buf.len = copied;
654 rpc_wake_up_task(task); 743 rpc_wake_up_task(task);
655} 744}
745EXPORT_SYMBOL_GPL(xprt_complete_rqst);
656 746
657static void xprt_timer(struct rpc_task *task) 747static void xprt_timer(struct rpc_task *task)
658{ 748{
@@ -889,23 +979,25 @@ void xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long i
889 * @args: rpc transport creation arguments 979 * @args: rpc transport creation arguments
890 * 980 *
891 */ 981 */
892struct rpc_xprt *xprt_create_transport(struct rpc_xprtsock_create *args) 982struct rpc_xprt *xprt_create_transport(struct xprt_create *args)
893{ 983{
894 struct rpc_xprt *xprt; 984 struct rpc_xprt *xprt;
895 struct rpc_rqst *req; 985 struct rpc_rqst *req;
986 struct xprt_class *t;
896 987
897 switch (args->proto) { 988 spin_lock(&xprt_list_lock);
898 case IPPROTO_UDP: 989 list_for_each_entry(t, &xprt_list, list) {
899 xprt = xs_setup_udp(args); 990 if (t->ident == args->ident) {
900 break; 991 spin_unlock(&xprt_list_lock);
901 case IPPROTO_TCP: 992 goto found;
902 xprt = xs_setup_tcp(args); 993 }
903 break;
904 default:
905 printk(KERN_ERR "RPC: unrecognized transport protocol: %d\n",
906 args->proto);
907 return ERR_PTR(-EIO);
908 } 994 }
995 spin_unlock(&xprt_list_lock);
996 printk(KERN_ERR "RPC: transport (%d) not supported\n", args->ident);
997 return ERR_PTR(-EIO);
998
999found:
1000 xprt = t->setup(args);
909 if (IS_ERR(xprt)) { 1001 if (IS_ERR(xprt)) {
910 dprintk("RPC: xprt_create_transport: failed, %ld\n", 1002 dprintk("RPC: xprt_create_transport: failed, %ld\n",
911 -PTR_ERR(xprt)); 1003 -PTR_ERR(xprt));
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
new file mode 100644
index 000000000000..264f0feeb513
--- /dev/null
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o
2
3xprtrdma-y := transport.o rpc_rdma.o verbs.o
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
new file mode 100644
index 000000000000..12db63580427
--- /dev/null
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -0,0 +1,868 @@
1/*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40/*
41 * rpc_rdma.c
42 *
43 * This file contains the guts of the RPC RDMA protocol, and
44 * does marshaling/unmarshaling, etc. It is also where interfacing
45 * to the Linux RPC framework lives.
46 */
47
48#include "xprt_rdma.h"
49
50#include <linux/highmem.h>
51
52#ifdef RPC_DEBUG
53# define RPCDBG_FACILITY RPCDBG_TRANS
54#endif
55
/*
 * The way bulk data is conveyed for one RPC.  The numeric values
 * double as indices into the transfertypes[] debug string table.
 */
enum rpcrdma_chunktype {
	rpcrdma_noch	= 0,	/* no chunks */
	rpcrdma_readch	= 1,	/* some argument via rdma read */
	rpcrdma_areadch	= 2,	/* entire request via rdma read */
	rpcrdma_writech	= 3,	/* some result via rdma write */
	rpcrdma_replych	= 4	/* entire reply via rdma write */
};
63
#ifdef RPC_DEBUG
/* Printable name for each rpcrdma_chunktype; only used by dprintk(). */
static const char transfertypes[][12] = {
	[rpcrdma_noch]		= "pure inline",	/* no chunks */
	[rpcrdma_readch]	= " read chunk",	/* some argument via rdma read */
	[rpcrdma_areadch]	= "*read chunk",	/* entire request via rdma read */
	[rpcrdma_writech]	= "write chunk",	/* some result via rdma write */
	[rpcrdma_replych]	= "reply chunk"		/* entire reply via rdma write */
};
#endif
73
74/*
75 * Chunk assembly from upper layer xdr_buf.
76 *
77 * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk
78 * elements. Segments are then coalesced when registered, if possible
79 * within the selected memreg mode.
80 *
81 * Note, this routine is never called if the connection's memory
82 * registration strategy is 0 (bounce buffers).
83 */
84
85static int
86rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, int pos,
87 enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs)
88{
89 int len, n = 0, p;
90
91 if (pos == 0 && xdrbuf->head[0].iov_len) {
92 seg[n].mr_page = NULL;
93 seg[n].mr_offset = xdrbuf->head[0].iov_base;
94 seg[n].mr_len = xdrbuf->head[0].iov_len;
95 pos += xdrbuf->head[0].iov_len;
96 ++n;
97 }
98
99 if (xdrbuf->page_len && (xdrbuf->pages[0] != NULL)) {
100 if (n == nsegs)
101 return 0;
102 seg[n].mr_page = xdrbuf->pages[0];
103 seg[n].mr_offset = (void *)(unsigned long) xdrbuf->page_base;
104 seg[n].mr_len = min_t(u32,
105 PAGE_SIZE - xdrbuf->page_base, xdrbuf->page_len);
106 len = xdrbuf->page_len - seg[n].mr_len;
107 pos += len;
108 ++n;
109 p = 1;
110 while (len > 0) {
111 if (n == nsegs)
112 return 0;
113 seg[n].mr_page = xdrbuf->pages[p];
114 seg[n].mr_offset = NULL;
115 seg[n].mr_len = min_t(u32, PAGE_SIZE, len);
116 len -= seg[n].mr_len;
117 ++n;
118 ++p;
119 }
120 }
121
122 if (pos < xdrbuf->len && xdrbuf->tail[0].iov_len) {
123 if (n == nsegs)
124 return 0;
125 seg[n].mr_page = NULL;
126 seg[n].mr_offset = xdrbuf->tail[0].iov_base;
127 seg[n].mr_len = xdrbuf->tail[0].iov_len;
128 pos += xdrbuf->tail[0].iov_len;
129 ++n;
130 }
131
132 if (pos < xdrbuf->len)
133 dprintk("RPC: %s: marshaled only %d of %d\n",
134 __func__, pos, xdrbuf->len);
135
136 return n;
137}
138
139/*
140 * Create read/write chunk lists, and reply chunks, for RDMA
141 *
142 * Assume check against THRESHOLD has been done, and chunks are required.
143 * Assume only encoding one list entry for read|write chunks. The NFSv3
144 * protocol is simple enough to allow this as it only has a single "bulk
145 * result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The
146 * RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.)
147 *
148 * When used for a single reply chunk (which is a special write
149 * chunk used for the entire reply, rather than just the data), it
150 * is used primarily for READDIR and READLINK which would otherwise
151 * be severely size-limited by a small rdma inline read max. The server
152 * response will come back as an RDMA Write, followed by a message
153 * of type RDMA_NOMSG carrying the xid and length. As a result, reply
154 * chunks do not provide data alignment, however they do not require
155 * "fixup" (moving the response to the upper layer buffer) either.
156 *
157 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
158 *
159 * Read chunklist (a linked list):
160 * N elements, position P (same P for all chunks of same arg!):
161 * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
162 *
163 * Write chunklist (a list of (one) counted array):
164 * N elements:
165 * 1 - N - HLOO - HLOO - ... - HLOO - 0
166 *
167 * Reply chunk (a counted array):
168 * N elements:
169 * 1 - N - HLOO - HLOO - ... - HLOO
170 */
171
172static unsigned int
173rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
174 struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
175{
176 struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
177 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_task->tk_xprt);
178 int nsegs, nchunks = 0;
179 int pos;
180 struct rpcrdma_mr_seg *seg = req->rl_segments;
181 struct rpcrdma_read_chunk *cur_rchunk = NULL;
182 struct rpcrdma_write_array *warray = NULL;
183 struct rpcrdma_write_chunk *cur_wchunk = NULL;
184 u32 *iptr = headerp->rm_body.rm_chunks;
185
186 if (type == rpcrdma_readch || type == rpcrdma_areadch) {
187 /* a read chunk - server will RDMA Read our memory */
188 cur_rchunk = (struct rpcrdma_read_chunk *) iptr;
189 } else {
190 /* a write or reply chunk - server will RDMA Write our memory */
191 *iptr++ = xdr_zero; /* encode a NULL read chunk list */
192 if (type == rpcrdma_replych)
193 *iptr++ = xdr_zero; /* a NULL write chunk list */
194 warray = (struct rpcrdma_write_array *) iptr;
195 cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1);
196 }
197
198 if (type == rpcrdma_replych || type == rpcrdma_areadch)
199 pos = 0;
200 else
201 pos = target->head[0].iov_len;
202
203 nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
204 if (nsegs == 0)
205 return 0;
206
207 do {
208 /* bind/register the memory, then build chunk from result. */
209 int n = rpcrdma_register_external(seg, nsegs,
210 cur_wchunk != NULL, r_xprt);
211 if (n <= 0)
212 goto out;
213 if (cur_rchunk) { /* read */
214 cur_rchunk->rc_discrim = xdr_one;
215 /* all read chunks have the same "position" */
216 cur_rchunk->rc_position = htonl(pos);
217 cur_rchunk->rc_target.rs_handle = htonl(seg->mr_rkey);
218 cur_rchunk->rc_target.rs_length = htonl(seg->mr_len);
219 xdr_encode_hyper(
220 (u32 *)&cur_rchunk->rc_target.rs_offset,
221 seg->mr_base);
222 dprintk("RPC: %s: read chunk "
223 "elem %d@0x%llx:0x%x pos %d (%s)\n", __func__,
224 seg->mr_len, seg->mr_base, seg->mr_rkey, pos,
225 n < nsegs ? "more" : "last");
226 cur_rchunk++;
227 r_xprt->rx_stats.read_chunk_count++;
228 } else { /* write/reply */
229 cur_wchunk->wc_target.rs_handle = htonl(seg->mr_rkey);
230 cur_wchunk->wc_target.rs_length = htonl(seg->mr_len);
231 xdr_encode_hyper(
232 (u32 *)&cur_wchunk->wc_target.rs_offset,
233 seg->mr_base);
234 dprintk("RPC: %s: %s chunk "
235 "elem %d@0x%llx:0x%x (%s)\n", __func__,
236 (type == rpcrdma_replych) ? "reply" : "write",
237 seg->mr_len, seg->mr_base, seg->mr_rkey,
238 n < nsegs ? "more" : "last");
239 cur_wchunk++;
240 if (type == rpcrdma_replych)
241 r_xprt->rx_stats.reply_chunk_count++;
242 else
243 r_xprt->rx_stats.write_chunk_count++;
244 r_xprt->rx_stats.total_rdma_request += seg->mr_len;
245 }
246 nchunks++;
247 seg += n;
248 nsegs -= n;
249 } while (nsegs);
250
251 /* success. all failures return above */
252 req->rl_nchunks = nchunks;
253
254 BUG_ON(nchunks == 0);
255
256 /*
257 * finish off header. If write, marshal discrim and nchunks.
258 */
259 if (cur_rchunk) {
260 iptr = (u32 *) cur_rchunk;
261 *iptr++ = xdr_zero; /* finish the read chunk list */
262 *iptr++ = xdr_zero; /* encode a NULL write chunk list */
263 *iptr++ = xdr_zero; /* encode a NULL reply chunk */
264 } else {
265 warray->wc_discrim = xdr_one;
266 warray->wc_nchunks = htonl(nchunks);
267 iptr = (u32 *) cur_wchunk;
268 if (type == rpcrdma_writech) {
269 *iptr++ = xdr_zero; /* finish the write chunk list */
270 *iptr++ = xdr_zero; /* encode a NULL reply chunk */
271 }
272 }
273
274 /*
275 * Return header size.
276 */
277 return (unsigned char *)iptr - (unsigned char *)headerp;
278
279out:
280 for (pos = 0; nchunks--;)
281 pos += rpcrdma_deregister_external(
282 &req->rl_segments[pos], r_xprt, NULL);
283 return 0;
284}
285
286/*
287 * Copy write data inline.
288 * This function is used for "small" requests. Data which is passed
289 * to RPC via iovecs (or page list) is copied directly into the
290 * pre-registered memory buffer for this request. For small amounts
291 * of data, this is efficient. The cutoff value is tunable.
292 */
293static int
294rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
295{
296 int i, npages, curlen;
297 int copy_len;
298 unsigned char *srcp, *destp;
299 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
300
301 destp = rqst->rq_svec[0].iov_base;
302 curlen = rqst->rq_svec[0].iov_len;
303 destp += curlen;
304 /*
305 * Do optional padding where it makes sense. Alignment of write
306 * payload can help the server, if our setting is accurate.
307 */
308 pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/);
309 if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH)
310 pad = 0; /* don't pad this request */
311
312 dprintk("RPC: %s: pad %d destp 0x%p len %d hdrlen %d\n",
313 __func__, pad, destp, rqst->rq_slen, curlen);
314
315 copy_len = rqst->rq_snd_buf.page_len;
316 r_xprt->rx_stats.pullup_copy_count += copy_len;
317 npages = PAGE_ALIGN(rqst->rq_snd_buf.page_base+copy_len) >> PAGE_SHIFT;
318 for (i = 0; copy_len && i < npages; i++) {
319 if (i == 0)
320 curlen = PAGE_SIZE - rqst->rq_snd_buf.page_base;
321 else
322 curlen = PAGE_SIZE;
323 if (curlen > copy_len)
324 curlen = copy_len;
325 dprintk("RPC: %s: page %d destp 0x%p len %d curlen %d\n",
326 __func__, i, destp, copy_len, curlen);
327 srcp = kmap_atomic(rqst->rq_snd_buf.pages[i],
328 KM_SKB_SUNRPC_DATA);
329 if (i == 0)
330 memcpy(destp, srcp+rqst->rq_snd_buf.page_base, curlen);
331 else
332 memcpy(destp, srcp, curlen);
333 kunmap_atomic(srcp, KM_SKB_SUNRPC_DATA);
334 rqst->rq_svec[0].iov_len += curlen;
335 destp += curlen;
336 copy_len -= curlen;
337 }
338 if (rqst->rq_snd_buf.tail[0].iov_len) {
339 curlen = rqst->rq_snd_buf.tail[0].iov_len;
340 if (destp != rqst->rq_snd_buf.tail[0].iov_base) {
341 memcpy(destp,
342 rqst->rq_snd_buf.tail[0].iov_base, curlen);
343 r_xprt->rx_stats.pullup_copy_count += curlen;
344 }
345 dprintk("RPC: %s: tail destp 0x%p len %d curlen %d\n",
346 __func__, destp, copy_len, curlen);
347 rqst->rq_svec[0].iov_len += curlen;
348 }
349 /* header now contains entire send message */
350 return pad;
351}
352
353/*
354 * Marshal a request: the primary job of this routine is to choose
355 * the transfer modes. See comments below.
356 *
357 * Uses multiple RDMA IOVs for a request:
358 * [0] -- RPC RDMA header, which uses memory from the *start* of the
359 * preregistered buffer that already holds the RPC data in
360 * its middle.
361 * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
362 * [2] -- optional padding.
363 * [3] -- if padded, header only in [1] and data here.
364 */
365
366int
367rpcrdma_marshal_req(struct rpc_rqst *rqst)
368{
369 struct rpc_xprt *xprt = rqst->rq_task->tk_xprt;
370 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
371 struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
372 char *base;
373 size_t hdrlen, rpclen, padlen;
374 enum rpcrdma_chunktype rtype, wtype;
375 struct rpcrdma_msg *headerp;
376
377 /*
378 * rpclen gets amount of data in first buffer, which is the
379 * pre-registered buffer.
380 */
381 base = rqst->rq_svec[0].iov_base;
382 rpclen = rqst->rq_svec[0].iov_len;
383
384 /* build RDMA header in private area at front */
385 headerp = (struct rpcrdma_msg *) req->rl_base;
386 /* don't htonl XID, it's already done in request */
387 headerp->rm_xid = rqst->rq_xid;
388 headerp->rm_vers = xdr_one;
389 headerp->rm_credit = htonl(r_xprt->rx_buf.rb_max_requests);
390 headerp->rm_type = __constant_htonl(RDMA_MSG);
391
392 /*
393 * Chunks needed for results?
394 *
395 * o If the expected result is under the inline threshold, all ops
396 * return as inline (but see later).
397 * o Large non-read ops return as a single reply chunk.
398 * o Large read ops return data as write chunk(s), header as inline.
399 *
400 * Note: the NFS code sending down multiple result segments implies
401 * the op is one of read, readdir[plus], readlink or NFSv4 getacl.
402 */
403
404 /*
405 * This code can handle read chunks, write chunks OR reply
406 * chunks -- only one type. If the request is too big to fit
407 * inline, then we will choose read chunks. If the request is
408 * a READ, then use write chunks to separate the file data
409 * into pages; otherwise use reply chunks.
410 */
411 if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
412 wtype = rpcrdma_noch;
413 else if (rqst->rq_rcv_buf.page_len == 0)
414 wtype = rpcrdma_replych;
415 else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
416 wtype = rpcrdma_writech;
417 else
418 wtype = rpcrdma_replych;
419
420 /*
421 * Chunks needed for arguments?
422 *
423 * o If the total request is under the inline threshold, all ops
424 * are sent as inline.
425 * o Large non-write ops are sent with the entire message as a
426 * single read chunk (protocol 0-position special case).
427 * o Large write ops transmit data as read chunk(s), header as
428 * inline.
429 *
430 * Note: the NFS code sending down multiple argument segments
431 * implies the op is a write.
432 * TBD check NFSv4 setacl
433 */
434 if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
435 rtype = rpcrdma_noch;
436 else if (rqst->rq_snd_buf.page_len == 0)
437 rtype = rpcrdma_areadch;
438 else
439 rtype = rpcrdma_readch;
440
441 /* The following simplification is not true forever */
442 if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
443 wtype = rpcrdma_noch;
444 BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch);
445
446 if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS &&
447 (rtype != rpcrdma_noch || wtype != rpcrdma_noch)) {
448 /* forced to "pure inline"? */
449 dprintk("RPC: %s: too much data (%d/%d) for inline\n",
450 __func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len);
451 return -1;
452 }
453
454 hdrlen = 28; /*sizeof *headerp;*/
455 padlen = 0;
456
457 /*
458 * Pull up any extra send data into the preregistered buffer.
459 * When padding is in use and applies to the transfer, insert
460 * it and change the message type.
461 */
462 if (rtype == rpcrdma_noch) {
463
464 padlen = rpcrdma_inline_pullup(rqst,
465 RPCRDMA_INLINE_PAD_VALUE(rqst));
466
467 if (padlen) {
468 headerp->rm_type = __constant_htonl(RDMA_MSGP);
469 headerp->rm_body.rm_padded.rm_align =
470 htonl(RPCRDMA_INLINE_PAD_VALUE(rqst));
471 headerp->rm_body.rm_padded.rm_thresh =
472 __constant_htonl(RPCRDMA_INLINE_PAD_THRESH);
473 headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero;
474 headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
475 headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
476 hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
477 BUG_ON(wtype != rpcrdma_noch);
478
479 } else {
480 headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
481 headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
482 headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
483 /* new length after pullup */
484 rpclen = rqst->rq_svec[0].iov_len;
485 /*
486 * Currently we try to not actually use read inline.
487 * Reply chunks have the desirable property that
488 * they land, packed, directly in the target buffers
489 * without headers, so they require no fixup. The
490 * additional RDMA Write op sends the same amount
491 * of data, streams on-the-wire and adds no overhead
492 * on receive. Therefore, we request a reply chunk
493 * for non-writes wherever feasible and efficient.
494 */
495 if (wtype == rpcrdma_noch &&
496 r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER)
497 wtype = rpcrdma_replych;
498 }
499 }
500
501 /*
502 * Marshal chunks. This routine will return the header length
503 * consumed by marshaling.
504 */
505 if (rtype != rpcrdma_noch) {
506 hdrlen = rpcrdma_create_chunks(rqst,
507 &rqst->rq_snd_buf, headerp, rtype);
508 wtype = rtype; /* simplify dprintk */
509
510 } else if (wtype != rpcrdma_noch) {
511 hdrlen = rpcrdma_create_chunks(rqst,
512 &rqst->rq_rcv_buf, headerp, wtype);
513 }
514
515 if (hdrlen == 0)
516 return -1;
517
518 dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd\n"
519 " headerp 0x%p base 0x%p lkey 0x%x\n",
520 __func__, transfertypes[wtype], hdrlen, rpclen, padlen,
521 headerp, base, req->rl_iov.lkey);
522
523 /*
524 * initialize send_iov's - normally only two: rdma chunk header and
525 * single preregistered RPC header buffer, but if padding is present,
526 * then use a preregistered (and zeroed) pad buffer between the RPC
527 * header and any write data. In all non-rdma cases, any following
528 * data has been copied into the RPC header buffer.
529 */
530 req->rl_send_iov[0].addr = req->rl_iov.addr;
531 req->rl_send_iov[0].length = hdrlen;
532 req->rl_send_iov[0].lkey = req->rl_iov.lkey;
533
534 req->rl_send_iov[1].addr = req->rl_iov.addr + (base - req->rl_base);
535 req->rl_send_iov[1].length = rpclen;
536 req->rl_send_iov[1].lkey = req->rl_iov.lkey;
537
538 req->rl_niovs = 2;
539
540 if (padlen) {
541 struct rpcrdma_ep *ep = &r_xprt->rx_ep;
542
543 req->rl_send_iov[2].addr = ep->rep_pad.addr;
544 req->rl_send_iov[2].length = padlen;
545 req->rl_send_iov[2].lkey = ep->rep_pad.lkey;
546
547 req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen;
548 req->rl_send_iov[3].length = rqst->rq_slen - rpclen;
549 req->rl_send_iov[3].lkey = req->rl_iov.lkey;
550
551 req->rl_niovs = 4;
552 }
553
554 return 0;
555}
556
557/*
558 * Chase down a received write or reply chunklist to get length
559 * RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
560 */
561static int
562rpcrdma_count_chunks(struct rpcrdma_rep *rep, int max, int wrchunk, u32 **iptrp)
563{
564 unsigned int i, total_len;
565 struct rpcrdma_write_chunk *cur_wchunk;
566
567 i = ntohl(**iptrp); /* get array count */
568 if (i > max)
569 return -1;
570 cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
571 total_len = 0;
572 while (i--) {
573 struct rpcrdma_segment *seg = &cur_wchunk->wc_target;
574 ifdebug(FACILITY) {
575 u64 off;
576 xdr_decode_hyper((u32 *)&seg->rs_offset, &off);
577 dprintk("RPC: %s: chunk %d@0x%llx:0x%x\n",
578 __func__,
579 ntohl(seg->rs_length),
580 off,
581 ntohl(seg->rs_handle));
582 }
583 total_len += ntohl(seg->rs_length);
584 ++cur_wchunk;
585 }
586 /* check and adjust for properly terminated write chunk */
587 if (wrchunk) {
588 u32 *w = (u32 *) cur_wchunk;
589 if (*w++ != xdr_zero)
590 return -1;
591 cur_wchunk = (struct rpcrdma_write_chunk *) w;
592 }
593 if ((char *) cur_wchunk > rep->rr_base + rep->rr_len)
594 return -1;
595
596 *iptrp = (u32 *) cur_wchunk;
597 return total_len;
598}
599
600/*
601 * Scatter inline received data back into provided iov's.
602 */
603static void
604rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len)
605{
606 int i, npages, curlen, olen;
607 char *destp;
608
609 curlen = rqst->rq_rcv_buf.head[0].iov_len;
610 if (curlen > copy_len) { /* write chunk header fixup */
611 curlen = copy_len;
612 rqst->rq_rcv_buf.head[0].iov_len = curlen;
613 }
614
615 dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n",
616 __func__, srcp, copy_len, curlen);
617
618 /* Shift pointer for first receive segment only */
619 rqst->rq_rcv_buf.head[0].iov_base = srcp;
620 srcp += curlen;
621 copy_len -= curlen;
622
623 olen = copy_len;
624 i = 0;
625 rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
626 if (copy_len && rqst->rq_rcv_buf.page_len) {
627 npages = PAGE_ALIGN(rqst->rq_rcv_buf.page_base +
628 rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT;
629 for (; i < npages; i++) {
630 if (i == 0)
631 curlen = PAGE_SIZE - rqst->rq_rcv_buf.page_base;
632 else
633 curlen = PAGE_SIZE;
634 if (curlen > copy_len)
635 curlen = copy_len;
636 dprintk("RPC: %s: page %d"
637 " srcp 0x%p len %d curlen %d\n",
638 __func__, i, srcp, copy_len, curlen);
639 destp = kmap_atomic(rqst->rq_rcv_buf.pages[i],
640 KM_SKB_SUNRPC_DATA);
641 if (i == 0)
642 memcpy(destp + rqst->rq_rcv_buf.page_base,
643 srcp, curlen);
644 else
645 memcpy(destp, srcp, curlen);
646 flush_dcache_page(rqst->rq_rcv_buf.pages[i]);
647 kunmap_atomic(destp, KM_SKB_SUNRPC_DATA);
648 srcp += curlen;
649 copy_len -= curlen;
650 if (copy_len == 0)
651 break;
652 }
653 rqst->rq_rcv_buf.page_len = olen - copy_len;
654 } else
655 rqst->rq_rcv_buf.page_len = 0;
656
657 if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
658 curlen = copy_len;
659 if (curlen > rqst->rq_rcv_buf.tail[0].iov_len)
660 curlen = rqst->rq_rcv_buf.tail[0].iov_len;
661 if (rqst->rq_rcv_buf.tail[0].iov_base != srcp)
662 memcpy(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
663 dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n",
664 __func__, srcp, copy_len, curlen);
665 rqst->rq_rcv_buf.tail[0].iov_len = curlen;
666 copy_len -= curlen; ++i;
667 } else
668 rqst->rq_rcv_buf.tail[0].iov_len = 0;
669
670 if (copy_len)
671 dprintk("RPC: %s: %d bytes in"
672 " %d extra segments (%d lost)\n",
673 __func__, olen, i, copy_len);
674
675 /* TBD avoid a warning from call_decode() */
676 rqst->rq_private_buf = rqst->rq_rcv_buf;
677}
678
679/*
680 * This function is called when an async event is posted to
681 * the connection which changes the connection state. All it
682 * does at this point is mark the connection up/down, the rpc
683 * timers do the rest.
684 */
685void
686rpcrdma_conn_func(struct rpcrdma_ep *ep)
687{
688 struct rpc_xprt *xprt = ep->rep_xprt;
689
690 spin_lock_bh(&xprt->transport_lock);
691 if (ep->rep_connected > 0) {
692 if (!xprt_test_and_set_connected(xprt))
693 xprt_wake_pending_tasks(xprt, 0);
694 } else {
695 if (xprt_test_and_clear_connected(xprt))
696 xprt_wake_pending_tasks(xprt, ep->rep_connected);
697 }
698 spin_unlock_bh(&xprt->transport_lock);
699}
700
701/*
702 * This function is called when memory window unbind which we are waiting
703 * for completes. Just use rr_func (zeroed by upcall) to signal completion.
704 */
705static void
706rpcrdma_unbind_func(struct rpcrdma_rep *rep)
707{
708 wake_up(&rep->rr_unbind);
709}
710
711/*
712 * Called as a tasklet to do req/reply match and complete a request
713 * Errors must result in the RPC task either being awakened, or
714 * allowed to timeout, to discover the errors at that time.
715 */
716void
717rpcrdma_reply_handler(struct rpcrdma_rep *rep)
718{
719 struct rpcrdma_msg *headerp;
720 struct rpcrdma_req *req;
721 struct rpc_rqst *rqst;
722 struct rpc_xprt *xprt = rep->rr_xprt;
723 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
724 u32 *iptr;
725 int i, rdmalen, status;
726
727 /* Check status. If bad, signal disconnect and return rep to pool */
728 if (rep->rr_len == ~0U) {
729 rpcrdma_recv_buffer_put(rep);
730 if (r_xprt->rx_ep.rep_connected == 1) {
731 r_xprt->rx_ep.rep_connected = -EIO;
732 rpcrdma_conn_func(&r_xprt->rx_ep);
733 }
734 return;
735 }
736 if (rep->rr_len < 28) {
737 dprintk("RPC: %s: short/invalid reply\n", __func__);
738 goto repost;
739 }
740 headerp = (struct rpcrdma_msg *) rep->rr_base;
741 if (headerp->rm_vers != xdr_one) {
742 dprintk("RPC: %s: invalid version %d\n",
743 __func__, ntohl(headerp->rm_vers));
744 goto repost;
745 }
746
747 /* Get XID and try for a match. */
748 spin_lock(&xprt->transport_lock);
749 rqst = xprt_lookup_rqst(xprt, headerp->rm_xid);
750 if (rqst == NULL) {
751 spin_unlock(&xprt->transport_lock);
752 dprintk("RPC: %s: reply 0x%p failed "
753 "to match any request xid 0x%08x len %d\n",
754 __func__, rep, headerp->rm_xid, rep->rr_len);
755repost:
756 r_xprt->rx_stats.bad_reply_count++;
757 rep->rr_func = rpcrdma_reply_handler;
758 if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
759 rpcrdma_recv_buffer_put(rep);
760
761 return;
762 }
763
764 /* get request object */
765 req = rpcr_to_rdmar(rqst);
766
767 dprintk("RPC: %s: reply 0x%p completes request 0x%p\n"
768 " RPC request 0x%p xid 0x%08x\n",
769 __func__, rep, req, rqst, headerp->rm_xid);
770
771 BUG_ON(!req || req->rl_reply);
772
773 /* from here on, the reply is no longer an orphan */
774 req->rl_reply = rep;
775
776 /* check for expected message types */
777 /* The order of some of these tests is important. */
778 switch (headerp->rm_type) {
779 case __constant_htonl(RDMA_MSG):
780 /* never expect read chunks */
781 /* never expect reply chunks (two ways to check) */
782 /* never expect write chunks without having offered RDMA */
783 if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
784 (headerp->rm_body.rm_chunks[1] == xdr_zero &&
785 headerp->rm_body.rm_chunks[2] != xdr_zero) ||
786 (headerp->rm_body.rm_chunks[1] != xdr_zero &&
787 req->rl_nchunks == 0))
788 goto badheader;
789 if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
790 /* count any expected write chunks in read reply */
791 /* start at write chunk array count */
792 iptr = &headerp->rm_body.rm_chunks[2];
793 rdmalen = rpcrdma_count_chunks(rep,
794 req->rl_nchunks, 1, &iptr);
795 /* check for validity, and no reply chunk after */
796 if (rdmalen < 0 || *iptr++ != xdr_zero)
797 goto badheader;
798 rep->rr_len -=
799 ((unsigned char *)iptr - (unsigned char *)headerp);
800 status = rep->rr_len + rdmalen;
801 r_xprt->rx_stats.total_rdma_reply += rdmalen;
802 } else {
803 /* else ordinary inline */
804 iptr = (u32 *)((unsigned char *)headerp + 28);
805 rep->rr_len -= 28; /*sizeof *headerp;*/
806 status = rep->rr_len;
807 }
808 /* Fix up the rpc results for upper layer */
809 rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len);
810 break;
811
812 case __constant_htonl(RDMA_NOMSG):
813 /* never expect read or write chunks, always reply chunks */
814 if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
815 headerp->rm_body.rm_chunks[1] != xdr_zero ||
816 headerp->rm_body.rm_chunks[2] != xdr_one ||
817 req->rl_nchunks == 0)
818 goto badheader;
819 iptr = (u32 *)((unsigned char *)headerp + 28);
820 rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr);
821 if (rdmalen < 0)
822 goto badheader;
823 r_xprt->rx_stats.total_rdma_reply += rdmalen;
824 /* Reply chunk buffer already is the reply vector - no fixup. */
825 status = rdmalen;
826 break;
827
828badheader:
829 default:
830 dprintk("%s: invalid rpcrdma reply header (type %d):"
831 " chunks[012] == %d %d %d"
832 " expected chunks <= %d\n",
833 __func__, ntohl(headerp->rm_type),
834 headerp->rm_body.rm_chunks[0],
835 headerp->rm_body.rm_chunks[1],
836 headerp->rm_body.rm_chunks[2],
837 req->rl_nchunks);
838 status = -EIO;
839 r_xprt->rx_stats.bad_reply_count++;
840 break;
841 }
842
843 /* If using mw bind, start the deregister process now. */
844 /* (Note: if mr_free(), cannot perform it here, in tasklet context) */
845 if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) {
846 case RPCRDMA_MEMWINDOWS:
847 for (i = 0; req->rl_nchunks-- > 1;)
848 i += rpcrdma_deregister_external(
849 &req->rl_segments[i], r_xprt, NULL);
850 /* Optionally wait (not here) for unbinds to complete */
851 rep->rr_func = rpcrdma_unbind_func;
852 (void) rpcrdma_deregister_external(&req->rl_segments[i],
853 r_xprt, rep);
854 break;
855 case RPCRDMA_MEMWINDOWS_ASYNC:
856 for (i = 0; req->rl_nchunks--;)
857 i += rpcrdma_deregister_external(&req->rl_segments[i],
858 r_xprt, NULL);
859 break;
860 default:
861 break;
862 }
863
864 dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
865 __func__, xprt, rqst, status);
866 xprt_complete_rqst(rqst->rq_task, status);
867 spin_unlock(&xprt->transport_lock);
868}
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
new file mode 100644
index 000000000000..dc55cc974c90
--- /dev/null
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -0,0 +1,800 @@
1/*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40/*
41 * transport.c
42 *
43 * This file contains the top-level implementation of an RPC RDMA
44 * transport.
45 *
46 * Naming convention: functions beginning with xprt_ are part of the
47 * transport switch. All others are RPC RDMA internal.
48 */
49
50#include <linux/module.h>
51#include <linux/init.h>
52#include <linux/seq_file.h>
53
54#include "xprt_rdma.h"
55
56#ifdef RPC_DEBUG
57# define RPCDBG_FACILITY RPCDBG_TRANS
58#endif
59
60MODULE_LICENSE("Dual BSD/GPL");
61
62MODULE_DESCRIPTION("RPC/RDMA Transport for Linux kernel NFS");
63MODULE_AUTHOR("Network Appliance, Inc.");
64
65/*
66 * tunables
67 */
68
69static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
70static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
71static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
72static unsigned int xprt_rdma_inline_write_padding;
73#if !RPCRDMA_PERSISTENT_REGISTRATION
74static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_REGISTER; /* FMR? */
75#else
76static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_ALLPHYSICAL;
77#endif
78
#ifdef RPC_DEBUG

/* Range limits referenced through .extra1/.extra2 in the table below. */
static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
static unsigned int zero;
static unsigned int max_padding = PAGE_SIZE;
static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
static unsigned int max_memreg = RPCRDMA_LAST - 1;

/* Handle returned by register_sysctl_table(); NULL while unregistered. */
static struct ctl_table_header *sunrpc_table_header;

/* One entry per RDMA transport tunable; terminated by a zero ctl_name. */
static ctl_table xr_tunables_table[] = {
	{
		.procname	= "rdma_slot_table_entries",
		.ctl_name	= CTL_SLOTTABLE_RDMA,
		.mode		= 0644,
		.data		= &xprt_rdma_slot_table_entries,
		.maxlen		= sizeof(unsigned int),
		.proc_handler	= proc_dointvec_minmax,
		.strategy	= sysctl_intvec,
		.extra1		= &min_slot_table_size,
		.extra2		= &max_slot_table_size,
	},
	{
		.procname	= "rdma_max_inline_read",
		.ctl_name	= CTL_RDMA_MAXINLINEREAD,
		.mode		= 0644,
		.data		= &xprt_rdma_max_inline_read,
		.maxlen		= sizeof(unsigned int),
		.proc_handler	= proc_dointvec,
		.strategy	= sysctl_intvec,
	},
	{
		.procname	= "rdma_max_inline_write",
		.ctl_name	= CTL_RDMA_MAXINLINEWRITE,
		.mode		= 0644,
		.data		= &xprt_rdma_max_inline_write,
		.maxlen		= sizeof(unsigned int),
		.proc_handler	= proc_dointvec,
		.strategy	= sysctl_intvec,
	},
	{
		.procname	= "rdma_inline_write_padding",
		.ctl_name	= CTL_RDMA_WRITEPADDING,
		.mode		= 0644,
		.data		= &xprt_rdma_inline_write_padding,
		.maxlen		= sizeof(unsigned int),
		.proc_handler	= proc_dointvec_minmax,
		.strategy	= sysctl_intvec,
		.extra1		= &zero,
		.extra2		= &max_padding,
	},
	{
		.procname	= "rdma_memreg_strategy",
		.ctl_name	= CTL_RDMA_MEMREG,
		.mode		= 0644,
		.data		= &xprt_rdma_memreg_strategy,
		.maxlen		= sizeof(unsigned int),
		.proc_handler	= proc_dointvec_minmax,
		.strategy	= sysctl_intvec,
		.extra1		= &min_memreg,
		.extra2		= &max_memreg,
	},
	{
		.ctl_name	= 0,
	},
};

/* Parent "sunrpc" directory node that hangs the tunables off its child. */
static ctl_table sunrpc_table[] = {
	{
		.procname	= "sunrpc",
		.ctl_name	= CTL_SUNRPC,
		.mode		= 0555,
		.child		= xr_tunables_table,
	},
	{
		.ctl_name	= 0,
	},
};

#endif
160
161static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */
162
163static void
164xprt_rdma_format_addresses(struct rpc_xprt *xprt)
165{
166 struct sockaddr_in *addr = (struct sockaddr_in *)
167 &rpcx_to_rdmad(xprt).addr;
168 char *buf;
169
170 buf = kzalloc(20, GFP_KERNEL);
171 if (buf)
172 snprintf(buf, 20, NIPQUAD_FMT, NIPQUAD(addr->sin_addr.s_addr));
173 xprt->address_strings[RPC_DISPLAY_ADDR] = buf;
174
175 buf = kzalloc(8, GFP_KERNEL);
176 if (buf)
177 snprintf(buf, 8, "%u", ntohs(addr->sin_port));
178 xprt->address_strings[RPC_DISPLAY_PORT] = buf;
179
180 xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
181
182 buf = kzalloc(48, GFP_KERNEL);
183 if (buf)
184 snprintf(buf, 48, "addr="NIPQUAD_FMT" port=%u proto=%s",
185 NIPQUAD(addr->sin_addr.s_addr),
186 ntohs(addr->sin_port), "rdma");
187 xprt->address_strings[RPC_DISPLAY_ALL] = buf;
188
189 buf = kzalloc(10, GFP_KERNEL);
190 if (buf)
191 snprintf(buf, 10, "%02x%02x%02x%02x",
192 NIPQUAD(addr->sin_addr.s_addr));
193 xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = buf;
194
195 buf = kzalloc(8, GFP_KERNEL);
196 if (buf)
197 snprintf(buf, 8, "%4hx", ntohs(addr->sin_port));
198 xprt->address_strings[RPC_DISPLAY_HEX_PORT] = buf;
199
200 buf = kzalloc(30, GFP_KERNEL);
201 if (buf)
202 snprintf(buf, 30, NIPQUAD_FMT".%u.%u",
203 NIPQUAD(addr->sin_addr.s_addr),
204 ntohs(addr->sin_port) >> 8,
205 ntohs(addr->sin_port) & 0xff);
206 xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf;
207
208 /* netid */
209 xprt->address_strings[RPC_DISPLAY_NETID] = "rdma";
210}
211
212static void
213xprt_rdma_free_addresses(struct rpc_xprt *xprt)
214{
215 kfree(xprt->address_strings[RPC_DISPLAY_ADDR]);
216 kfree(xprt->address_strings[RPC_DISPLAY_PORT]);
217 kfree(xprt->address_strings[RPC_DISPLAY_ALL]);
218 kfree(xprt->address_strings[RPC_DISPLAY_HEX_ADDR]);
219 kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]);
220 kfree(xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR]);
221}
222
223static void
224xprt_rdma_connect_worker(struct work_struct *work)
225{
226 struct rpcrdma_xprt *r_xprt =
227 container_of(work, struct rpcrdma_xprt, rdma_connect.work);
228 struct rpc_xprt *xprt = &r_xprt->xprt;
229 int rc = 0;
230
231 if (!xprt->shutdown) {
232 xprt_clear_connected(xprt);
233
234 dprintk("RPC: %s: %sconnect\n", __func__,
235 r_xprt->rx_ep.rep_connected != 0 ? "re" : "");
236 rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
237 if (rc)
238 goto out;
239 }
240 goto out_clear;
241
242out:
243 xprt_wake_pending_tasks(xprt, rc);
244
245out_clear:
246 dprintk("RPC: %s: exit\n", __func__);
247 xprt_clear_connecting(xprt);
248}
249
250/*
251 * xprt_rdma_destroy
252 *
253 * Destroy the xprt.
254 * Free all memory associated with the object, including its own.
255 * NOTE: none of the *destroy methods free memory for their top-level
256 * objects, even though they may have allocated it (they do free
257 * private memory). It's up to the caller to handle it. In this
258 * case (RDMA transport), all structure memory is inlined with the
259 * struct rpcrdma_xprt.
260 */
261static void
262xprt_rdma_destroy(struct rpc_xprt *xprt)
263{
264 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
265 int rc;
266
267 dprintk("RPC: %s: called\n", __func__);
268
269 cancel_delayed_work(&r_xprt->rdma_connect);
270 flush_scheduled_work();
271
272 xprt_clear_connected(xprt);
273
274 rpcrdma_buffer_destroy(&r_xprt->rx_buf);
275 rc = rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
276 if (rc)
277 dprintk("RPC: %s: rpcrdma_ep_destroy returned %i\n",
278 __func__, rc);
279 rpcrdma_ia_close(&r_xprt->rx_ia);
280
281 xprt_rdma_free_addresses(xprt);
282
283 kfree(xprt->slot);
284 xprt->slot = NULL;
285 kfree(xprt);
286
287 dprintk("RPC: %s: returning\n", __func__);
288
289 module_put(THIS_MODULE);
290}
291
292/**
293 * xprt_setup_rdma - Set up transport to use RDMA
294 *
295 * @args: rpc transport arguments
296 */
297static struct rpc_xprt *
298xprt_setup_rdma(struct xprt_create *args)
299{
300 struct rpcrdma_create_data_internal cdata;
301 struct rpc_xprt *xprt;
302 struct rpcrdma_xprt *new_xprt;
303 struct rpcrdma_ep *new_ep;
304 struct sockaddr_in *sin;
305 int rc;
306
307 if (args->addrlen > sizeof(xprt->addr)) {
308 dprintk("RPC: %s: address too large\n", __func__);
309 return ERR_PTR(-EBADF);
310 }
311
312 xprt = kzalloc(sizeof(struct rpcrdma_xprt), GFP_KERNEL);
313 if (xprt == NULL) {
314 dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n",
315 __func__);
316 return ERR_PTR(-ENOMEM);
317 }
318
319 xprt->max_reqs = xprt_rdma_slot_table_entries;
320 xprt->slot = kcalloc(xprt->max_reqs,
321 sizeof(struct rpc_rqst), GFP_KERNEL);
322 if (xprt->slot == NULL) {
323 kfree(xprt);
324 dprintk("RPC: %s: couldn't allocate %d slots\n",
325 __func__, xprt->max_reqs);
326 return ERR_PTR(-ENOMEM);
327 }
328
329 /* 60 second timeout, no retries */
330 xprt_set_timeout(&xprt->timeout, 0, 60UL * HZ);
331 xprt->bind_timeout = (60U * HZ);
332 xprt->connect_timeout = (60U * HZ);
333 xprt->reestablish_timeout = (5U * HZ);
334 xprt->idle_timeout = (5U * 60 * HZ);
335
336 xprt->resvport = 0; /* privileged port not needed */
337 xprt->tsh_size = 0; /* RPC-RDMA handles framing */
338 xprt->max_payload = RPCRDMA_MAX_DATA_SEGS * PAGE_SIZE;
339 xprt->ops = &xprt_rdma_procs;
340
341 /*
342 * Set up RDMA-specific connect data.
343 */
344
345 /* Put server RDMA address in local cdata */
346 memcpy(&cdata.addr, args->dstaddr, args->addrlen);
347
348 /* Ensure xprt->addr holds valid server TCP (not RDMA)
349 * address, for any side protocols which peek at it */
350 xprt->prot = IPPROTO_TCP;
351 xprt->addrlen = args->addrlen;
352 memcpy(&xprt->addr, &cdata.addr, xprt->addrlen);
353
354 sin = (struct sockaddr_in *)&cdata.addr;
355 if (ntohs(sin->sin_port) != 0)
356 xprt_set_bound(xprt);
357
358 dprintk("RPC: %s: %u.%u.%u.%u:%u\n", __func__,
359 NIPQUAD(sin->sin_addr.s_addr), ntohs(sin->sin_port));
360
361 /* Set max requests */
362 cdata.max_requests = xprt->max_reqs;
363
364 /* Set some length limits */
365 cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */
366 cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */
367
368 cdata.inline_wsize = xprt_rdma_max_inline_write;
369 if (cdata.inline_wsize > cdata.wsize)
370 cdata.inline_wsize = cdata.wsize;
371
372 cdata.inline_rsize = xprt_rdma_max_inline_read;
373 if (cdata.inline_rsize > cdata.rsize)
374 cdata.inline_rsize = cdata.rsize;
375
376 cdata.padding = xprt_rdma_inline_write_padding;
377
378 /*
379 * Create new transport instance, which includes initialized
380 * o ia
381 * o endpoint
382 * o buffers
383 */
384
385 new_xprt = rpcx_to_rdmax(xprt);
386
387 rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr,
388 xprt_rdma_memreg_strategy);
389 if (rc)
390 goto out1;
391
392 /*
393 * initialize and create ep
394 */
395 new_xprt->rx_data = cdata;
396 new_ep = &new_xprt->rx_ep;
397 new_ep->rep_remote_addr = cdata.addr;
398
399 rc = rpcrdma_ep_create(&new_xprt->rx_ep,
400 &new_xprt->rx_ia, &new_xprt->rx_data);
401 if (rc)
402 goto out2;
403
404 /*
405 * Allocate pre-registered send and receive buffers for headers and
406 * any inline data. Also specify any padding which will be provided
407 * from a preregistered zero buffer.
408 */
409 rc = rpcrdma_buffer_create(&new_xprt->rx_buf, new_ep, &new_xprt->rx_ia,
410 &new_xprt->rx_data);
411 if (rc)
412 goto out3;
413
414 /*
415 * Register a callback for connection events. This is necessary because
416 * connection loss notification is async. We also catch connection loss
417 * when reaping receives.
418 */
419 INIT_DELAYED_WORK(&new_xprt->rdma_connect, xprt_rdma_connect_worker);
420 new_ep->rep_func = rpcrdma_conn_func;
421 new_ep->rep_xprt = xprt;
422
423 xprt_rdma_format_addresses(xprt);
424
425 if (!try_module_get(THIS_MODULE))
426 goto out4;
427
428 return xprt;
429
430out4:
431 xprt_rdma_free_addresses(xprt);
432 rc = -EINVAL;
433out3:
434 (void) rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
435out2:
436 rpcrdma_ia_close(&new_xprt->rx_ia);
437out1:
438 kfree(xprt->slot);
439 kfree(xprt);
440 return ERR_PTR(rc);
441}
442
443/*
444 * Close a connection, during shutdown or timeout/reconnect
445 */
446static void
447xprt_rdma_close(struct rpc_xprt *xprt)
448{
449 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
450
451 dprintk("RPC: %s: closing\n", __func__);
452 xprt_disconnect(xprt);
453 (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
454}
455
456static void
457xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
458{
459 struct sockaddr_in *sap;
460
461 sap = (struct sockaddr_in *)&xprt->addr;
462 sap->sin_port = htons(port);
463 sap = (struct sockaddr_in *)&rpcx_to_rdmad(xprt).addr;
464 sap->sin_port = htons(port);
465 dprintk("RPC: %s: %u\n", __func__, port);
466}
467
468static void
469xprt_rdma_connect(struct rpc_task *task)
470{
471 struct rpc_xprt *xprt = (struct rpc_xprt *)task->tk_xprt;
472 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
473
474 if (!xprt_test_and_set_connecting(xprt)) {
475 if (r_xprt->rx_ep.rep_connected != 0) {
476 /* Reconnect */
477 schedule_delayed_work(&r_xprt->rdma_connect,
478 xprt->reestablish_timeout);
479 } else {
480 schedule_delayed_work(&r_xprt->rdma_connect, 0);
481 if (!RPC_IS_ASYNC(task))
482 flush_scheduled_work();
483 }
484 }
485}
486
487static int
488xprt_rdma_reserve_xprt(struct rpc_task *task)
489{
490 struct rpc_xprt *xprt = task->tk_xprt;
491 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
492 int credits = atomic_read(&r_xprt->rx_buf.rb_credits);
493
494 /* == RPC_CWNDSCALE @ init, but *after* setup */
495 if (r_xprt->rx_buf.rb_cwndscale == 0UL) {
496 r_xprt->rx_buf.rb_cwndscale = xprt->cwnd;
497 dprintk("RPC: %s: cwndscale %lu\n", __func__,
498 r_xprt->rx_buf.rb_cwndscale);
499 BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0);
500 }
501 xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale;
502 return xprt_reserve_xprt_cong(task);
503}
504
505/*
506 * The RDMA allocate/free functions need the task structure as a place
507 * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
508 * sequence. For this reason, the recv buffers are attached to send
509 * buffers for portions of the RPC. Note that the RPC layer allocates
510 * both send and receive buffers in the same call. We may register
511 * the receive buffer portion when using reply chunks.
512 */
513static void *
514xprt_rdma_allocate(struct rpc_task *task, size_t size)
515{
516 struct rpc_xprt *xprt = task->tk_xprt;
517 struct rpcrdma_req *req, *nreq;
518
519 req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf);
520 BUG_ON(NULL == req);
521
522 if (size > req->rl_size) {
523 dprintk("RPC: %s: size %zd too large for buffer[%zd]: "
524 "prog %d vers %d proc %d\n",
525 __func__, size, req->rl_size,
526 task->tk_client->cl_prog, task->tk_client->cl_vers,
527 task->tk_msg.rpc_proc->p_proc);
528 /*
529 * Outgoing length shortage. Our inline write max must have
530 * been configured to perform direct i/o.
531 *
532 * This is therefore a large metadata operation, and the
533 * allocate call was made on the maximum possible message,
534 * e.g. containing long filename(s) or symlink data. In
535 * fact, while these metadata operations *might* carry
536 * large outgoing payloads, they rarely *do*. However, we
537 * have to commit to the request here, so reallocate and
538 * register it now. The data path will never require this
539 * reallocation.
540 *
541 * If the allocation or registration fails, the RPC framework
542 * will (doggedly) retry.
543 */
544 if (rpcx_to_rdmax(xprt)->rx_ia.ri_memreg_strategy ==
545 RPCRDMA_BOUNCEBUFFERS) {
546 /* forced to "pure inline" */
547 dprintk("RPC: %s: too much data (%zd) for inline "
548 "(r/w max %d/%d)\n", __func__, size,
549 rpcx_to_rdmad(xprt).inline_rsize,
550 rpcx_to_rdmad(xprt).inline_wsize);
551 size = req->rl_size;
552 rpc_exit(task, -EIO); /* fail the operation */
553 rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
554 goto out;
555 }
556 if (task->tk_flags & RPC_TASK_SWAPPER)
557 nreq = kmalloc(sizeof *req + size, GFP_ATOMIC);
558 else
559 nreq = kmalloc(sizeof *req + size, GFP_NOFS);
560 if (nreq == NULL)
561 goto outfail;
562
563 if (rpcrdma_register_internal(&rpcx_to_rdmax(xprt)->rx_ia,
564 nreq->rl_base, size + sizeof(struct rpcrdma_req)
565 - offsetof(struct rpcrdma_req, rl_base),
566 &nreq->rl_handle, &nreq->rl_iov)) {
567 kfree(nreq);
568 goto outfail;
569 }
570 rpcx_to_rdmax(xprt)->rx_stats.hardway_register_count += size;
571 nreq->rl_size = size;
572 nreq->rl_niovs = 0;
573 nreq->rl_nchunks = 0;
574 nreq->rl_buffer = (struct rpcrdma_buffer *)req;
575 nreq->rl_reply = req->rl_reply;
576 memcpy(nreq->rl_segments,
577 req->rl_segments, sizeof nreq->rl_segments);
578 /* flag the swap with an unused field */
579 nreq->rl_iov.length = 0;
580 req->rl_reply = NULL;
581 req = nreq;
582 }
583 dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
584out:
585 return req->rl_xdr_buf;
586
587outfail:
588 rpcrdma_buffer_put(req);
589 rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
590 return NULL;
591}
592
593/*
594 * This function returns all RDMA resources to the pool.
595 */
596static void
597xprt_rdma_free(void *buffer)
598{
599 struct rpcrdma_req *req;
600 struct rpcrdma_xprt *r_xprt;
601 struct rpcrdma_rep *rep;
602 int i;
603
604 if (buffer == NULL)
605 return;
606
607 req = container_of(buffer, struct rpcrdma_req, rl_xdr_buf[0]);
608 r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
609 rep = req->rl_reply;
610
611 dprintk("RPC: %s: called on 0x%p%s\n",
612 __func__, rep, (rep && rep->rr_func) ? " (with waiter)" : "");
613
614 /*
615 * Finish the deregistration. When using mw bind, this was
616 * begun in rpcrdma_reply_handler(). In all other modes, we
617 * do it here, in thread context. The process is considered
618 * complete when the rr_func vector becomes NULL - this
619 * was put in place during rpcrdma_reply_handler() - the wait
620 * call below will not block if the dereg is "done". If
621 * interrupted, our framework will clean up.
622 */
623 for (i = 0; req->rl_nchunks;) {
624 --req->rl_nchunks;
625 i += rpcrdma_deregister_external(
626 &req->rl_segments[i], r_xprt, NULL);
627 }
628
629 if (rep && wait_event_interruptible(rep->rr_unbind, !rep->rr_func)) {
630 rep->rr_func = NULL; /* abandon the callback */
631 req->rl_reply = NULL;
632 }
633
634 if (req->rl_iov.length == 0) { /* see allocate above */
635 struct rpcrdma_req *oreq = (struct rpcrdma_req *)req->rl_buffer;
636 oreq->rl_reply = req->rl_reply;
637 (void) rpcrdma_deregister_internal(&r_xprt->rx_ia,
638 req->rl_handle,
639 &req->rl_iov);
640 kfree(req);
641 req = oreq;
642 }
643
644 /* Put back request+reply buffers */
645 rpcrdma_buffer_put(req);
646}
647
648/*
649 * send_request invokes the meat of RPC RDMA. It must do the following:
650 * 1. Marshal the RPC request into an RPC RDMA request, which means
651 * putting a header in front of data, and creating IOVs for RDMA
652 * from those in the request.
653 * 2. In marshaling, detect opportunities for RDMA, and use them.
654 * 3. Post a recv message to set up asynch completion, then send
655 * the request (rpcrdma_ep_post).
656 * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP).
657 */
658
659static int
660xprt_rdma_send_request(struct rpc_task *task)
661{
662 struct rpc_rqst *rqst = task->tk_rqstp;
663 struct rpc_xprt *xprt = task->tk_xprt;
664 struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
665 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
666
667 /* marshal the send itself */
668 if (req->rl_niovs == 0 && rpcrdma_marshal_req(rqst) != 0) {
669 r_xprt->rx_stats.failed_marshal_count++;
670 dprintk("RPC: %s: rpcrdma_marshal_req failed\n",
671 __func__);
672 return -EIO;
673 }
674
675 if (req->rl_reply == NULL) /* e.g. reconnection */
676 rpcrdma_recv_buffer_get(req);
677
678 if (req->rl_reply) {
679 req->rl_reply->rr_func = rpcrdma_reply_handler;
680 /* this need only be done once, but... */
681 req->rl_reply->rr_xprt = xprt;
682 }
683
684 if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) {
685 xprt_disconnect(xprt);
686 return -ENOTCONN; /* implies disconnect */
687 }
688
689 rqst->rq_bytes_sent = 0;
690 return 0;
691}
692
693static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
694{
695 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
696 long idle_time = 0;
697
698 if (xprt_connected(xprt))
699 idle_time = (long)(jiffies - xprt->last_used) / HZ;
700
701 seq_printf(seq,
702 "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu "
703 "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n",
704
705 0, /* need a local port? */
706 xprt->stat.bind_count,
707 xprt->stat.connect_count,
708 xprt->stat.connect_time,
709 idle_time,
710 xprt->stat.sends,
711 xprt->stat.recvs,
712 xprt->stat.bad_xids,
713 xprt->stat.req_u,
714 xprt->stat.bklog_u,
715
716 r_xprt->rx_stats.read_chunk_count,
717 r_xprt->rx_stats.write_chunk_count,
718 r_xprt->rx_stats.reply_chunk_count,
719 r_xprt->rx_stats.total_rdma_request,
720 r_xprt->rx_stats.total_rdma_reply,
721 r_xprt->rx_stats.pullup_copy_count,
722 r_xprt->rx_stats.fixup_copy_count,
723 r_xprt->rx_stats.hardway_register_count,
724 r_xprt->rx_stats.failed_marshal_count,
725 r_xprt->rx_stats.bad_reply_count);
726}
727
728/*
729 * Plumbing for rpc transport switch and kernel module
730 */
731
732static struct rpc_xprt_ops xprt_rdma_procs = {
733 .reserve_xprt = xprt_rdma_reserve_xprt,
734 .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */
735 .release_request = xprt_release_rqst_cong, /* ditto */
736 .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */
737 .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */
738 .set_port = xprt_rdma_set_port,
739 .connect = xprt_rdma_connect,
740 .buf_alloc = xprt_rdma_allocate,
741 .buf_free = xprt_rdma_free,
742 .send_request = xprt_rdma_send_request,
743 .close = xprt_rdma_close,
744 .destroy = xprt_rdma_destroy,
745 .print_stats = xprt_rdma_print_stats
746};
747
748static struct xprt_class xprt_rdma = {
749 .list = LIST_HEAD_INIT(xprt_rdma.list),
750 .name = "rdma",
751 .owner = THIS_MODULE,
752 .ident = XPRT_TRANSPORT_RDMA,
753 .setup = xprt_setup_rdma,
754};
755
756static void __exit xprt_rdma_cleanup(void)
757{
758 int rc;
759
760 dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n");
761#ifdef RPC_DEBUG
762 if (sunrpc_table_header) {
763 unregister_sysctl_table(sunrpc_table_header);
764 sunrpc_table_header = NULL;
765 }
766#endif
767 rc = xprt_unregister_transport(&xprt_rdma);
768 if (rc)
769 dprintk("RPC: %s: xprt_unregister returned %i\n",
770 __func__, rc);
771}
772
773static int __init xprt_rdma_init(void)
774{
775 int rc;
776
777 rc = xprt_register_transport(&xprt_rdma);
778
779 if (rc)
780 return rc;
781
782 dprintk(KERN_INFO "RPCRDMA Module Init, register RPC RDMA transport\n");
783
784 dprintk(KERN_INFO "Defaults:\n");
785 dprintk(KERN_INFO "\tSlots %d\n"
786 "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n",
787 xprt_rdma_slot_table_entries,
788 xprt_rdma_max_inline_read, xprt_rdma_max_inline_write);
789 dprintk(KERN_INFO "\tPadding %d\n\tMemreg %d\n",
790 xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy);
791
792#ifdef RPC_DEBUG
793 if (!sunrpc_table_header)
794 sunrpc_table_header = register_sysctl_table(sunrpc_table);
795#endif
796 return 0;
797}
798
799module_init(xprt_rdma_init);
800module_exit(xprt_rdma_cleanup);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
new file mode 100644
index 000000000000..9ec8ca4f6028
--- /dev/null
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -0,0 +1,1626 @@
1/*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40/*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
50#include <linux/pci.h> /* for Tavor hack below */
51
52#include "xprt_rdma.h"
53
54/*
55 * Globals/Macros
56 */
57
58#ifdef RPC_DEBUG
59# define RPCDBG_FACILITY RPCDBG_TRANS
60#endif
61
62/*
63 * internal functions
64 */
65
66/*
67 * handle replies in tasklet context, using a single, global list
68 * rdma tasklet function -- just turn around and call the func
69 * for all replies on the list
70 */
71
72static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
73static LIST_HEAD(rpcrdma_tasklets_g);
74
75static void
76rpcrdma_run_tasklet(unsigned long data)
77{
78 struct rpcrdma_rep *rep;
79 void (*func)(struct rpcrdma_rep *);
80 unsigned long flags;
81
82 data = data;
83 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
84 while (!list_empty(&rpcrdma_tasklets_g)) {
85 rep = list_entry(rpcrdma_tasklets_g.next,
86 struct rpcrdma_rep, rr_list);
87 list_del(&rep->rr_list);
88 func = rep->rr_func;
89 rep->rr_func = NULL;
90 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
91
92 if (func)
93 func(rep);
94 else
95 rpcrdma_recv_buffer_put(rep);
96
97 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
98 }
99 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
100}
101
102static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
103
104static inline void
105rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
106{
107 unsigned long flags;
108
109 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
110 list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
111 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
112 tasklet_schedule(&rpcrdma_tasklet_g);
113}
114
115static void
116rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
117{
118 struct rpcrdma_ep *ep = context;
119
120 dprintk("RPC: %s: QP error %X on device %s ep %p\n",
121 __func__, event->event, event->device->name, context);
122 if (ep->rep_connected == 1) {
123 ep->rep_connected = -EIO;
124 ep->rep_func(ep);
125 wake_up_all(&ep->rep_connect_wait);
126 }
127}
128
129static void
130rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
131{
132 struct rpcrdma_ep *ep = context;
133
134 dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
135 __func__, event->event, event->device->name, context);
136 if (ep->rep_connected == 1) {
137 ep->rep_connected = -EIO;
138 ep->rep_func(ep);
139 wake_up_all(&ep->rep_connect_wait);
140 }
141}
142
143static inline
144void rpcrdma_event_process(struct ib_wc *wc)
145{
146 struct rpcrdma_rep *rep =
147 (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
148
149 dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
150 __func__, rep, wc->status, wc->opcode, wc->byte_len);
151
152 if (!rep) /* send or bind completion that we don't care about */
153 return;
154
155 if (IB_WC_SUCCESS != wc->status) {
156 dprintk("RPC: %s: %s WC status %X, connection lost\n",
157 __func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send",
158 wc->status);
159 rep->rr_len = ~0U;
160 rpcrdma_schedule_tasklet(rep);
161 return;
162 }
163
164 switch (wc->opcode) {
165 case IB_WC_RECV:
166 rep->rr_len = wc->byte_len;
167 ib_dma_sync_single_for_cpu(
168 rdmab_to_ia(rep->rr_buffer)->ri_id->device,
169 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
170 /* Keep (only) the most recent credits, after check validity */
171 if (rep->rr_len >= 16) {
172 struct rpcrdma_msg *p =
173 (struct rpcrdma_msg *) rep->rr_base;
174 unsigned int credits = ntohl(p->rm_credit);
175 if (credits == 0) {
176 dprintk("RPC: %s: server"
177 " dropped credits to 0!\n", __func__);
178 /* don't deadlock */
179 credits = 1;
180 } else if (credits > rep->rr_buffer->rb_max_requests) {
181 dprintk("RPC: %s: server"
182 " over-crediting: %d (%d)\n",
183 __func__, credits,
184 rep->rr_buffer->rb_max_requests);
185 credits = rep->rr_buffer->rb_max_requests;
186 }
187 atomic_set(&rep->rr_buffer->rb_credits, credits);
188 }
189 /* fall through */
190 case IB_WC_BIND_MW:
191 rpcrdma_schedule_tasklet(rep);
192 break;
193 default:
194 dprintk("RPC: %s: unexpected WC event %X\n",
195 __func__, wc->opcode);
196 break;
197 }
198}
199
200static inline int
201rpcrdma_cq_poll(struct ib_cq *cq)
202{
203 struct ib_wc wc;
204 int rc;
205
206 for (;;) {
207 rc = ib_poll_cq(cq, 1, &wc);
208 if (rc < 0) {
209 dprintk("RPC: %s: ib_poll_cq failed %i\n",
210 __func__, rc);
211 return rc;
212 }
213 if (rc == 0)
214 break;
215
216 rpcrdma_event_process(&wc);
217 }
218
219 return 0;
220}
221
222/*
223 * rpcrdma_cq_event_upcall
224 *
225 * This upcall handles recv, send, bind and unbind events.
226 * It is reentrant but processes single events in order to maintain
227 * ordering of receives to keep server credits.
228 *
229 * It is the responsibility of the scheduled tasklet to return
230 * recv buffers to the pool. NOTE: this affects synchronization of
231 * connection shutdown. That is, the structures required for
232 * the completion of the reply handler must remain intact until
233 * all memory has been reclaimed.
234 *
235 * Note that send events are suppressed and do not result in an upcall.
236 */
237static void
238rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
239{
240 int rc;
241
242 rc = rpcrdma_cq_poll(cq);
243 if (rc)
244 return;
245
246 rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
247 if (rc) {
248 dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
249 __func__, rc);
250 return;
251 }
252
253 rpcrdma_cq_poll(cq);
254}
255
#ifdef RPC_DEBUG
/*
 * Debug-only names for RDMA CM events, indexed by event number.
 * rpcrdma_conn_upcall() only indexes this table for events <= 11.
 */
static const char * const conn[] = {
	[RDMA_CM_EVENT_ADDR_RESOLVED]		= "address resolved",
	[RDMA_CM_EVENT_ADDR_ERROR]		= "address error",
	[RDMA_CM_EVENT_ROUTE_RESOLVED]		= "route resolved",
	[RDMA_CM_EVENT_ROUTE_ERROR]		= "route error",
	[RDMA_CM_EVENT_CONNECT_REQUEST]		= "connect request",
	[RDMA_CM_EVENT_CONNECT_RESPONSE]	= "connect response",
	[RDMA_CM_EVENT_CONNECT_ERROR]		= "connect error",
	[RDMA_CM_EVENT_UNREACHABLE]		= "unreachable",
	[RDMA_CM_EVENT_REJECTED]		= "rejected",
	[RDMA_CM_EVENT_ESTABLISHED]		= "established",
	[RDMA_CM_EVENT_DISCONNECTED]		= "disconnected",
	[RDMA_CM_EVENT_DEVICE_REMOVAL]		= "device removal"
};
#endif
272
/*
 * Connection manager event handler installed by rpcrdma_create_id().
 *
 * Address and route resolution events, as well as unrecognized events,
 * complete ia->ri_done; failures leave an error code in
 * ia->ri_async_rc first.
 *
 * Connection state events store the new state in ep->rep_connected
 * (1 when established, a negative errno otherwise), reset the credit
 * count to 1, call ep->rep_func and wake everyone sleeping on
 * ep->rep_connect_wait.
 *
 * Always returns 0.
 */
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
	struct ib_qp_attr attr;
	struct ib_qp_init_attr iattr;
	int connstate = 0;

	switch (event->event) {
	/* Resolution phase: just hand the result to the waiter. */
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		complete(&ia->ri_done);
		return 0;

	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		return 0;

	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		return 0;

	/* Connection phase: pick the state, then fall out of the switch. */
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		ib_query_qp(ia->ri_id->qp, &attr,
			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			&iattr);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
		break;

	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		break;

	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		break;

	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		break;

	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		break;

	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
		break;

	default:
		ia->ri_async_rc = -EINVAL;
		dprintk("RPC: %s: unexpected CM event %X\n",
			__func__, event->event);
		complete(&ia->ri_done);
		return 0;
	}

	/* Common tail for every connection state change. */
	dprintk("RPC: %s: %s: %u.%u.%u.%u:%u"
		" (ep 0x%p event 0x%x)\n",
		__func__,
		(event->event <= 11) ? conn[event->event] :
					"unknown connection error",
		NIPQUAD(addr->sin_addr.s_addr),
		ntohs(addr->sin_port),
		ep, event->event);
	atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
	dprintk("RPC: %s: %sconnected\n",
		__func__, connstate > 0 ? "" : "dis");
	ep->rep_connected = connstate;
	ep->rep_func(ep);
	wake_up_all(&ep->rep_connect_wait);

	return 0;
}
350
351static struct rdma_cm_id *
352rpcrdma_create_id(struct rpcrdma_xprt *xprt,
353 struct rpcrdma_ia *ia, struct sockaddr *addr)
354{
355 struct rdma_cm_id *id;
356 int rc;
357
358 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
359 if (IS_ERR(id)) {
360 rc = PTR_ERR(id);
361 dprintk("RPC: %s: rdma_create_id() failed %i\n",
362 __func__, rc);
363 return id;
364 }
365
366 ia->ri_async_rc = 0;
367 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
368 if (rc) {
369 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
370 __func__, rc);
371 goto out;
372 }
373 wait_for_completion(&ia->ri_done);
374 rc = ia->ri_async_rc;
375 if (rc)
376 goto out;
377
378 ia->ri_async_rc = 0;
379 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
380 if (rc) {
381 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
382 __func__, rc);
383 goto out;
384 }
385 wait_for_completion(&ia->ri_done);
386 rc = ia->ri_async_rc;
387 if (rc)
388 goto out;
389
390 return id;
391
392out:
393 rdma_destroy_id(id);
394 return ERR_PTR(rc);
395}
396
397/*
398 * Drain any cq, prior to teardown.
399 */
400static void
401rpcrdma_clean_cq(struct ib_cq *cq)
402{
403 struct ib_wc wc;
404 int count = 0;
405
406 while (1 == ib_poll_cq(cq, 1, &wc))
407 ++count;
408
409 if (count)
410 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
411 __func__, count, wc.opcode);
412}
413
414/*
415 * Exported functions.
416 */
417
418/*
419 * Open and initialize an Interface Adapter.
420 * o initializes fields of struct rpcrdma_ia, including
421 * interface and provider attributes and protection zone.
422 */
423int
424rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
425{
426 int rc;
427 struct rpcrdma_ia *ia = &xprt->rx_ia;
428
429 init_completion(&ia->ri_done);
430
431 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
432 if (IS_ERR(ia->ri_id)) {
433 rc = PTR_ERR(ia->ri_id);
434 goto out1;
435 }
436
437 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
438 if (IS_ERR(ia->ri_pd)) {
439 rc = PTR_ERR(ia->ri_pd);
440 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
441 __func__, rc);
442 goto out2;
443 }
444
445 /*
446 * Optionally obtain an underlying physical identity mapping in
447 * order to do a memory window-based bind. This base registration
448 * is protected from remote access - that is enabled only by binding
449 * for the specific bytes targeted during each RPC operation, and
450 * revoked after the corresponding completion similar to a storage
451 * adapter.
452 */
453 if (memreg > RPCRDMA_REGISTER) {
454 int mem_priv = IB_ACCESS_LOCAL_WRITE;
455 switch (memreg) {
456#if RPCRDMA_PERSISTENT_REGISTRATION
457 case RPCRDMA_ALLPHYSICAL:
458 mem_priv |= IB_ACCESS_REMOTE_WRITE;
459 mem_priv |= IB_ACCESS_REMOTE_READ;
460 break;
461#endif
462 case RPCRDMA_MEMWINDOWS_ASYNC:
463 case RPCRDMA_MEMWINDOWS:
464 mem_priv |= IB_ACCESS_MW_BIND;
465 break;
466 default:
467 break;
468 }
469 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
470 if (IS_ERR(ia->ri_bind_mem)) {
471 printk(KERN_ALERT "%s: ib_get_dma_mr for "
472 "phys register failed with %lX\n\t"
473 "Will continue with degraded performance\n",
474 __func__, PTR_ERR(ia->ri_bind_mem));
475 memreg = RPCRDMA_REGISTER;
476 ia->ri_bind_mem = NULL;
477 }
478 }
479
480 /* Else will do memory reg/dereg for each chunk */
481 ia->ri_memreg_strategy = memreg;
482
483 return 0;
484out2:
485 rdma_destroy_id(ia->ri_id);
486out1:
487 return rc;
488}
489
490/*
491 * Clean up/close an IA.
492 * o if event handles and PD have been initialized, free them.
493 * o close the IA
494 */
495void
496rpcrdma_ia_close(struct rpcrdma_ia *ia)
497{
498 int rc;
499
500 dprintk("RPC: %s: entering\n", __func__);
501 if (ia->ri_bind_mem != NULL) {
502 rc = ib_dereg_mr(ia->ri_bind_mem);
503 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
504 __func__, rc);
505 }
506 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp)
507 rdma_destroy_qp(ia->ri_id);
508 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
509 rc = ib_dealloc_pd(ia->ri_pd);
510 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
511 __func__, rc);
512 }
513 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id))
514 rdma_destroy_id(ia->ri_id);
515}
516
517/*
518 * Create unconnected endpoint.
519 */
520int
521rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
522 struct rpcrdma_create_data_internal *cdata)
523{
524 struct ib_device_attr devattr;
525 int rc;
526
527 rc = ib_query_device(ia->ri_id->device, &devattr);
528 if (rc) {
529 dprintk("RPC: %s: ib_query_device failed %d\n",
530 __func__, rc);
531 return rc;
532 }
533
534 /* check provider's send/recv wr limits */
535 if (cdata->max_requests > devattr.max_qp_wr)
536 cdata->max_requests = devattr.max_qp_wr;
537
538 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
539 ep->rep_attr.qp_context = ep;
540 /* send_cq and recv_cq initialized below */
541 ep->rep_attr.srq = NULL;
542 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
543 switch (ia->ri_memreg_strategy) {
544 case RPCRDMA_MEMWINDOWS_ASYNC:
545 case RPCRDMA_MEMWINDOWS:
546 /* Add room for mw_binds+unbinds - overkill! */
547 ep->rep_attr.cap.max_send_wr++;
548 ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
549 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
550 return -EINVAL;
551 break;
552 default:
553 break;
554 }
555 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
556 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
557 ep->rep_attr.cap.max_recv_sge = 1;
558 ep->rep_attr.cap.max_inline_data = 0;
559 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
560 ep->rep_attr.qp_type = IB_QPT_RC;
561 ep->rep_attr.port_num = ~0;
562
563 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
564 "iovs: send %d recv %d\n",
565 __func__,
566 ep->rep_attr.cap.max_send_wr,
567 ep->rep_attr.cap.max_recv_wr,
568 ep->rep_attr.cap.max_send_sge,
569 ep->rep_attr.cap.max_recv_sge);
570
571 /* set trigger for requesting send completion */
572 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
573 switch (ia->ri_memreg_strategy) {
574 case RPCRDMA_MEMWINDOWS_ASYNC:
575 case RPCRDMA_MEMWINDOWS:
576 ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
577 break;
578 default:
579 break;
580 }
581 if (ep->rep_cqinit <= 2)
582 ep->rep_cqinit = 0;
583 INIT_CQCOUNT(ep);
584 ep->rep_ia = ia;
585 init_waitqueue_head(&ep->rep_connect_wait);
586
587 /*
588 * Create a single cq for receive dto and mw_bind (only ever
589 * care about unbind, really). Send completions are suppressed.
590 * Use single threaded tasklet upcalls to maintain ordering.
591 */
592 ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
593 rpcrdma_cq_async_error_upcall, NULL,
594 ep->rep_attr.cap.max_recv_wr +
595 ep->rep_attr.cap.max_send_wr + 1, 0);
596 if (IS_ERR(ep->rep_cq)) {
597 rc = PTR_ERR(ep->rep_cq);
598 dprintk("RPC: %s: ib_create_cq failed: %i\n",
599 __func__, rc);
600 goto out1;
601 }
602
603 rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
604 if (rc) {
605 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
606 __func__, rc);
607 goto out2;
608 }
609
610 ep->rep_attr.send_cq = ep->rep_cq;
611 ep->rep_attr.recv_cq = ep->rep_cq;
612
613 /* Initialize cma parameters */
614
615 /* RPC/RDMA does not use private data */
616 ep->rep_remote_cma.private_data = NULL;
617 ep->rep_remote_cma.private_data_len = 0;
618
619 /* Client offers RDMA Read but does not initiate */
620 switch (ia->ri_memreg_strategy) {
621 case RPCRDMA_BOUNCEBUFFERS:
622 ep->rep_remote_cma.responder_resources = 0;
623 break;
624 case RPCRDMA_MTHCAFMR:
625 case RPCRDMA_REGISTER:
626 ep->rep_remote_cma.responder_resources = cdata->max_requests *
627 (RPCRDMA_MAX_DATA_SEGS / 8);
628 break;
629 case RPCRDMA_MEMWINDOWS:
630 case RPCRDMA_MEMWINDOWS_ASYNC:
631#if RPCRDMA_PERSISTENT_REGISTRATION
632 case RPCRDMA_ALLPHYSICAL:
633#endif
634 ep->rep_remote_cma.responder_resources = cdata->max_requests *
635 (RPCRDMA_MAX_DATA_SEGS / 2);
636 break;
637 default:
638 break;
639 }
640 if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom)
641 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
642 ep->rep_remote_cma.initiator_depth = 0;
643
644 ep->rep_remote_cma.retry_count = 7;
645 ep->rep_remote_cma.flow_control = 0;
646 ep->rep_remote_cma.rnr_retry_count = 0;
647
648 return 0;
649
650out2:
651 if (ib_destroy_cq(ep->rep_cq))
652 ;
653out1:
654 return rc;
655}
656
657/*
658 * rpcrdma_ep_destroy
659 *
660 * Disconnect and destroy endpoint. After this, the only
661 * valid operations on the ep are to free it (if dynamically
662 * allocated) or re-create it.
663 *
664 * The caller's error handling must be sure to not leak the endpoint
665 * if this function fails.
666 */
667int
668rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
669{
670 int rc;
671
672 dprintk("RPC: %s: entering, connected is %d\n",
673 __func__, ep->rep_connected);
674
675 if (ia->ri_id->qp) {
676 rc = rpcrdma_ep_disconnect(ep, ia);
677 if (rc)
678 dprintk("RPC: %s: rpcrdma_ep_disconnect"
679 " returned %i\n", __func__, rc);
680 }
681
682 ep->rep_func = NULL;
683
684 /* padding - could be done in rpcrdma_buffer_destroy... */
685 if (ep->rep_pad_mr) {
686 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
687 ep->rep_pad_mr = NULL;
688 }
689
690 if (ia->ri_id->qp) {
691 rdma_destroy_qp(ia->ri_id);
692 ia->ri_id->qp = NULL;
693 }
694
695 rpcrdma_clean_cq(ep->rep_cq);
696 rc = ib_destroy_cq(ep->rep_cq);
697 if (rc)
698 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
699 __func__, rc);
700
701 return rc;
702}
703
704/*
705 * Connect unconnected endpoint.
706 */
707int
708rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
709{
710 struct rdma_cm_id *id;
711 int rc = 0;
712 int retry_count = 0;
713 int reconnect = (ep->rep_connected != 0);
714
715 if (reconnect) {
716 struct rpcrdma_xprt *xprt;
717retry:
718 rc = rpcrdma_ep_disconnect(ep, ia);
719 if (rc && rc != -ENOTCONN)
720 dprintk("RPC: %s: rpcrdma_ep_disconnect"
721 " status %i\n", __func__, rc);
722 rpcrdma_clean_cq(ep->rep_cq);
723
724 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
725 id = rpcrdma_create_id(xprt, ia,
726 (struct sockaddr *)&xprt->rx_data.addr);
727 if (IS_ERR(id)) {
728 rc = PTR_ERR(id);
729 goto out;
730 }
731 /* TEMP TEMP TEMP - fail if new device:
732 * Deregister/remarshal *all* requests!
733 * Close and recreate adapter, pd, etc!
734 * Re-determine all attributes still sane!
735 * More stuff I haven't thought of!
736 * Rrrgh!
737 */
738 if (ia->ri_id->device != id->device) {
739 printk("RPC: %s: can't reconnect on "
740 "different device!\n", __func__);
741 rdma_destroy_id(id);
742 rc = -ENETDOWN;
743 goto out;
744 }
745 /* END TEMP */
746 rdma_destroy_id(ia->ri_id);
747 ia->ri_id = id;
748 }
749
750 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
751 if (rc) {
752 dprintk("RPC: %s: rdma_create_qp failed %i\n",
753 __func__, rc);
754 goto out;
755 }
756
757/* XXX Tavor device performs badly with 2K MTU! */
758if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
759 struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
760 if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
761 (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
762 pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
763 struct ib_qp_attr attr = {
764 .path_mtu = IB_MTU_1024
765 };
766 rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
767 }
768}
769
770 /* Theoretically a client initiator_depth > 0 is not needed,
771 * but many peers fail to complete the connection unless they
772 * == responder_resources! */
773 if (ep->rep_remote_cma.initiator_depth !=
774 ep->rep_remote_cma.responder_resources)
775 ep->rep_remote_cma.initiator_depth =
776 ep->rep_remote_cma.responder_resources;
777
778 ep->rep_connected = 0;
779
780 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
781 if (rc) {
782 dprintk("RPC: %s: rdma_connect() failed with %i\n",
783 __func__, rc);
784 goto out;
785 }
786
787 if (reconnect)
788 return 0;
789
790 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
791
792 /*
793 * Check state. A non-peer reject indicates no listener
794 * (ECONNREFUSED), which may be a transient state. All
795 * others indicate a transport condition which has already
796 * undergone a best-effort.
797 */
798 if (ep->rep_connected == -ECONNREFUSED
799 && ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
800 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
801 goto retry;
802 }
803 if (ep->rep_connected <= 0) {
804 /* Sometimes, the only way to reliably connect to remote
805 * CMs is to use same nonzero values for ORD and IRD. */
806 ep->rep_remote_cma.initiator_depth =
807 ep->rep_remote_cma.responder_resources;
808 if (ep->rep_remote_cma.initiator_depth == 0)
809 ++ep->rep_remote_cma.initiator_depth;
810 if (ep->rep_remote_cma.responder_resources == 0)
811 ++ep->rep_remote_cma.responder_resources;
812 if (retry_count++ == 0)
813 goto retry;
814 rc = ep->rep_connected;
815 } else {
816 dprintk("RPC: %s: connected\n", __func__);
817 }
818
819out:
820 if (rc)
821 ep->rep_connected = rc;
822 return rc;
823}
824
825/*
826 * rpcrdma_ep_disconnect
827 *
828 * This is separate from destroy to facilitate the ability
829 * to reconnect without recreating the endpoint.
830 *
831 * This call is not reentrant, and must not be made in parallel
832 * on the same endpoint.
833 */
834int
835rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
836{
837 int rc;
838
839 rpcrdma_clean_cq(ep->rep_cq);
840 rc = rdma_disconnect(ia->ri_id);
841 if (!rc) {
842 /* returns without wait if not connected */
843 wait_event_interruptible(ep->rep_connect_wait,
844 ep->rep_connected != 1);
845 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
846 (ep->rep_connected == 1) ? "still " : "dis");
847 } else {
848 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
849 ep->rep_connected = rc;
850 }
851 return rc;
852}
853
854/*
855 * Initialize buffer memory
856 */
857int
858rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
859 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
860{
861 char *p;
862 size_t len;
863 int i, rc;
864
865 buf->rb_max_requests = cdata->max_requests;
866 spin_lock_init(&buf->rb_lock);
867 atomic_set(&buf->rb_credits, 1);
868
869 /* Need to allocate:
870 * 1. arrays for send and recv pointers
871 * 2. arrays of struct rpcrdma_req to fill in pointers
872 * 3. array of struct rpcrdma_rep for replies
873 * 4. padding, if any
874 * 5. mw's, if any
875 * Send/recv buffers in req/rep need to be registered
876 */
877
878 len = buf->rb_max_requests *
879 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
880 len += cdata->padding;
881 switch (ia->ri_memreg_strategy) {
882 case RPCRDMA_MTHCAFMR:
883 /* TBD we are perhaps overallocating here */
884 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
885 sizeof(struct rpcrdma_mw);
886 break;
887 case RPCRDMA_MEMWINDOWS_ASYNC:
888 case RPCRDMA_MEMWINDOWS:
889 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
890 sizeof(struct rpcrdma_mw);
891 break;
892 default:
893 break;
894 }
895
896 /* allocate 1, 4 and 5 in one shot */
897 p = kzalloc(len, GFP_KERNEL);
898 if (p == NULL) {
899 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
900 __func__, len);
901 rc = -ENOMEM;
902 goto out;
903 }
904 buf->rb_pool = p; /* for freeing it later */
905
906 buf->rb_send_bufs = (struct rpcrdma_req **) p;
907 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
908 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
909 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
910
911 /*
912 * Register the zeroed pad buffer, if any.
913 */
914 if (cdata->padding) {
915 rc = rpcrdma_register_internal(ia, p, cdata->padding,
916 &ep->rep_pad_mr, &ep->rep_pad);
917 if (rc)
918 goto out;
919 }
920 p += cdata->padding;
921
922 /*
923 * Allocate the fmr's, or mw's for mw_bind chunk registration.
924 * We "cycle" the mw's in order to minimize rkey reuse,
925 * and also reduce unbind-to-bind collision.
926 */
927 INIT_LIST_HEAD(&buf->rb_mws);
928 switch (ia->ri_memreg_strategy) {
929 case RPCRDMA_MTHCAFMR:
930 {
931 struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
932 struct ib_fmr_attr fa = {
933 RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT
934 };
935 /* TBD we are perhaps overallocating here */
936 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
937 r->r.fmr = ib_alloc_fmr(ia->ri_pd,
938 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
939 &fa);
940 if (IS_ERR(r->r.fmr)) {
941 rc = PTR_ERR(r->r.fmr);
942 dprintk("RPC: %s: ib_alloc_fmr"
943 " failed %i\n", __func__, rc);
944 goto out;
945 }
946 list_add(&r->mw_list, &buf->rb_mws);
947 ++r;
948 }
949 }
950 break;
951 case RPCRDMA_MEMWINDOWS_ASYNC:
952 case RPCRDMA_MEMWINDOWS:
953 {
954 struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
955 /* Allocate one extra request's worth, for full cycling */
956 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
957 r->r.mw = ib_alloc_mw(ia->ri_pd);
958 if (IS_ERR(r->r.mw)) {
959 rc = PTR_ERR(r->r.mw);
960 dprintk("RPC: %s: ib_alloc_mw"
961 " failed %i\n", __func__, rc);
962 goto out;
963 }
964 list_add(&r->mw_list, &buf->rb_mws);
965 ++r;
966 }
967 }
968 break;
969 default:
970 break;
971 }
972
973 /*
974 * Allocate/init the request/reply buffers. Doing this
975 * using kmalloc for now -- one for each buf.
976 */
977 for (i = 0; i < buf->rb_max_requests; i++) {
978 struct rpcrdma_req *req;
979 struct rpcrdma_rep *rep;
980
981 len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
982 /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
983 /* Typical ~2400b, so rounding up saves work later */
984 if (len < 4096)
985 len = 4096;
986 req = kmalloc(len, GFP_KERNEL);
987 if (req == NULL) {
988 dprintk("RPC: %s: request buffer %d alloc"
989 " failed\n", __func__, i);
990 rc = -ENOMEM;
991 goto out;
992 }
993 memset(req, 0, sizeof(struct rpcrdma_req));
994 buf->rb_send_bufs[i] = req;
995 buf->rb_send_bufs[i]->rl_buffer = buf;
996
997 rc = rpcrdma_register_internal(ia, req->rl_base,
998 len - offsetof(struct rpcrdma_req, rl_base),
999 &buf->rb_send_bufs[i]->rl_handle,
1000 &buf->rb_send_bufs[i]->rl_iov);
1001 if (rc)
1002 goto out;
1003
1004 buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
1005
1006 len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
1007 rep = kmalloc(len, GFP_KERNEL);
1008 if (rep == NULL) {
1009 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1010 __func__, i);
1011 rc = -ENOMEM;
1012 goto out;
1013 }
1014 memset(rep, 0, sizeof(struct rpcrdma_rep));
1015 buf->rb_recv_bufs[i] = rep;
1016 buf->rb_recv_bufs[i]->rr_buffer = buf;
1017 init_waitqueue_head(&rep->rr_unbind);
1018
1019 rc = rpcrdma_register_internal(ia, rep->rr_base,
1020 len - offsetof(struct rpcrdma_rep, rr_base),
1021 &buf->rb_recv_bufs[i]->rr_handle,
1022 &buf->rb_recv_bufs[i]->rr_iov);
1023 if (rc)
1024 goto out;
1025
1026 }
1027 dprintk("RPC: %s: max_requests %d\n",
1028 __func__, buf->rb_max_requests);
1029 /* done */
1030 return 0;
1031out:
1032 rpcrdma_buffer_destroy(buf);
1033 return rc;
1034}
1035
1036/*
1037 * Unregister and destroy buffer memory. Need to deal with
1038 * partial initialization, so it's callable from failed create.
1039 * Must be called before destroying endpoint, as registrations
1040 * reference it.
1041 */
1042void
1043rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1044{
1045 int rc, i;
1046 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1047
1048 /* clean up in reverse order from create
1049 * 1. recv mr memory (mr free, then kfree)
1050 * 1a. bind mw memory
1051 * 2. send mr memory (mr free, then kfree)
1052 * 3. padding (if any) [moved to rpcrdma_ep_destroy]
1053 * 4. arrays
1054 */
1055 dprintk("RPC: %s: entering\n", __func__);
1056
1057 for (i = 0; i < buf->rb_max_requests; i++) {
1058 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1059 rpcrdma_deregister_internal(ia,
1060 buf->rb_recv_bufs[i]->rr_handle,
1061 &buf->rb_recv_bufs[i]->rr_iov);
1062 kfree(buf->rb_recv_bufs[i]);
1063 }
1064 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1065 while (!list_empty(&buf->rb_mws)) {
1066 struct rpcrdma_mw *r;
1067 r = list_entry(buf->rb_mws.next,
1068 struct rpcrdma_mw, mw_list);
1069 list_del(&r->mw_list);
1070 switch (ia->ri_memreg_strategy) {
1071 case RPCRDMA_MTHCAFMR:
1072 rc = ib_dealloc_fmr(r->r.fmr);
1073 if (rc)
1074 dprintk("RPC: %s:"
1075 " ib_dealloc_fmr"
1076 " failed %i\n",
1077 __func__, rc);
1078 break;
1079 case RPCRDMA_MEMWINDOWS_ASYNC:
1080 case RPCRDMA_MEMWINDOWS:
1081 rc = ib_dealloc_mw(r->r.mw);
1082 if (rc)
1083 dprintk("RPC: %s:"
1084 " ib_dealloc_mw"
1085 " failed %i\n",
1086 __func__, rc);
1087 break;
1088 default:
1089 break;
1090 }
1091 }
1092 rpcrdma_deregister_internal(ia,
1093 buf->rb_send_bufs[i]->rl_handle,
1094 &buf->rb_send_bufs[i]->rl_iov);
1095 kfree(buf->rb_send_bufs[i]);
1096 }
1097 }
1098
1099 kfree(buf->rb_pool);
1100}
1101
1102/*
1103 * Get a set of request/reply buffers.
1104 *
1105 * Reply buffer (if needed) is attached to send buffer upon return.
1106 * Rule:
1107 * rb_send_index and rb_recv_index MUST always be pointing to the
1108 * *next* available buffer (non-NULL). They are incremented after
1109 * removing buffers, and decremented *before* returning them.
1110 */
1111struct rpcrdma_req *
1112rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1113{
1114 struct rpcrdma_req *req;
1115 unsigned long flags;
1116
1117 spin_lock_irqsave(&buffers->rb_lock, flags);
1118 if (buffers->rb_send_index == buffers->rb_max_requests) {
1119 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1120 dprintk("RPC: %s: out of request buffers\n", __func__);
1121 return ((struct rpcrdma_req *)NULL);
1122 }
1123
1124 req = buffers->rb_send_bufs[buffers->rb_send_index];
1125 if (buffers->rb_send_index < buffers->rb_recv_index) {
1126 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1127 __func__,
1128 buffers->rb_recv_index - buffers->rb_send_index);
1129 req->rl_reply = NULL;
1130 } else {
1131 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1132 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1133 }
1134 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1135 if (!list_empty(&buffers->rb_mws)) {
1136 int i = RPCRDMA_MAX_SEGS - 1;
1137 do {
1138 struct rpcrdma_mw *r;
1139 r = list_entry(buffers->rb_mws.next,
1140 struct rpcrdma_mw, mw_list);
1141 list_del(&r->mw_list);
1142 req->rl_segments[i].mr_chunk.rl_mw = r;
1143 } while (--i >= 0);
1144 }
1145 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1146 return req;
1147}
1148
1149/*
1150 * Put request/reply buffers back into pool.
1151 * Pre-decrement counter/array index.
1152 */
1153void
1154rpcrdma_buffer_put(struct rpcrdma_req *req)
1155{
1156 struct rpcrdma_buffer *buffers = req->rl_buffer;
1157 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1158 int i;
1159 unsigned long flags;
1160
1161 BUG_ON(req->rl_nchunks != 0);
1162 spin_lock_irqsave(&buffers->rb_lock, flags);
1163 buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1164 req->rl_niovs = 0;
1165 if (req->rl_reply) {
1166 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1167 init_waitqueue_head(&req->rl_reply->rr_unbind);
1168 req->rl_reply->rr_func = NULL;
1169 req->rl_reply = NULL;
1170 }
1171 switch (ia->ri_memreg_strategy) {
1172 case RPCRDMA_MTHCAFMR:
1173 case RPCRDMA_MEMWINDOWS_ASYNC:
1174 case RPCRDMA_MEMWINDOWS:
1175 /*
1176 * Cycle mw's back in reverse order, and "spin" them.
1177 * This delays and scrambles reuse as much as possible.
1178 */
1179 i = 1;
1180 do {
1181 struct rpcrdma_mw **mw;
1182 mw = &req->rl_segments[i].mr_chunk.rl_mw;
1183 list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1184 *mw = NULL;
1185 } while (++i < RPCRDMA_MAX_SEGS);
1186 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1187 &buffers->rb_mws);
1188 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1189 break;
1190 default:
1191 break;
1192 }
1193 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1194}
1195
1196/*
1197 * Recover reply buffers from pool.
1198 * This happens when recovering from error conditions.
1199 * Post-increment counter/array index.
1200 */
1201void
1202rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1203{
1204 struct rpcrdma_buffer *buffers = req->rl_buffer;
1205 unsigned long flags;
1206
1207 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1208 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1209 spin_lock_irqsave(&buffers->rb_lock, flags);
1210 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1211 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1212 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1213 }
1214 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1215}
1216
1217/*
1218 * Put reply buffers back into pool when not attached to
1219 * request. This happens in error conditions, and when
1220 * aborting unbinds. Pre-decrement counter/array index.
1221 */
1222void
1223rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1224{
1225 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1226 unsigned long flags;
1227
1228 rep->rr_func = NULL;
1229 spin_lock_irqsave(&buffers->rb_lock, flags);
1230 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1231 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1232}
1233
1234/*
1235 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1236 */
1237
1238int
1239rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1240 struct ib_mr **mrp, struct ib_sge *iov)
1241{
1242 struct ib_phys_buf ipb;
1243 struct ib_mr *mr;
1244 int rc;
1245
1246 /*
1247 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1248 */
1249 iov->addr = ib_dma_map_single(ia->ri_id->device,
1250 va, len, DMA_BIDIRECTIONAL);
1251 iov->length = len;
1252
1253 if (ia->ri_bind_mem != NULL) {
1254 *mrp = NULL;
1255 iov->lkey = ia->ri_bind_mem->lkey;
1256 return 0;
1257 }
1258
1259 ipb.addr = iov->addr;
1260 ipb.size = iov->length;
1261 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1262 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1263
1264 dprintk("RPC: %s: phys convert: 0x%llx "
1265 "registered 0x%llx length %d\n",
1266 __func__, ipb.addr, iov->addr, len);
1267
1268 if (IS_ERR(mr)) {
1269 *mrp = NULL;
1270 rc = PTR_ERR(mr);
1271 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1272 } else {
1273 *mrp = mr;
1274 iov->lkey = mr->lkey;
1275 rc = 0;
1276 }
1277
1278 return rc;
1279}
1280
1281int
1282rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1283 struct ib_mr *mr, struct ib_sge *iov)
1284{
1285 int rc;
1286
1287 ib_dma_unmap_single(ia->ri_id->device,
1288 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1289
1290 if (NULL == mr)
1291 return 0;
1292
1293 rc = ib_dereg_mr(mr);
1294 if (rc)
1295 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1296 return rc;
1297}
1298
1299/*
1300 * Wrappers for chunk registration, shared by read/write chunk code.
1301 */
1302
1303static void
1304rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1305{
1306 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1307 seg->mr_dmalen = seg->mr_len;
1308 if (seg->mr_page)
1309 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1310 seg->mr_page, offset_in_page(seg->mr_offset),
1311 seg->mr_dmalen, seg->mr_dir);
1312 else
1313 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1314 seg->mr_offset,
1315 seg->mr_dmalen, seg->mr_dir);
1316}
1317
1318static void
1319rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1320{
1321 if (seg->mr_page)
1322 ib_dma_unmap_page(ia->ri_id->device,
1323 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1324 else
1325 ib_dma_unmap_single(ia->ri_id->device,
1326 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1327}
1328
1329int
1330rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1331 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1332{
1333 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1334 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1335 IB_ACCESS_REMOTE_READ);
1336 struct rpcrdma_mr_seg *seg1 = seg;
1337 int i;
1338 int rc = 0;
1339
1340 switch (ia->ri_memreg_strategy) {
1341
1342#if RPCRDMA_PERSISTENT_REGISTRATION
1343 case RPCRDMA_ALLPHYSICAL:
1344 rpcrdma_map_one(ia, seg, writing);
1345 seg->mr_rkey = ia->ri_bind_mem->rkey;
1346 seg->mr_base = seg->mr_dma;
1347 seg->mr_nsegs = 1;
1348 nsegs = 1;
1349 break;
1350#endif
1351
1352 /* Registration using fast memory registration */
1353 case RPCRDMA_MTHCAFMR:
1354 {
1355 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1356 int len, pageoff = offset_in_page(seg->mr_offset);
1357 seg1->mr_offset -= pageoff; /* start of page */
1358 seg1->mr_len += pageoff;
1359 len = -pageoff;
1360 if (nsegs > RPCRDMA_MAX_DATA_SEGS)
1361 nsegs = RPCRDMA_MAX_DATA_SEGS;
1362 for (i = 0; i < nsegs;) {
1363 rpcrdma_map_one(ia, seg, writing);
1364 physaddrs[i] = seg->mr_dma;
1365 len += seg->mr_len;
1366 ++seg;
1367 ++i;
1368 /* Check for holes */
1369 if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
1370 offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1371 break;
1372 }
1373 nsegs = i;
1374 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1375 physaddrs, nsegs, seg1->mr_dma);
1376 if (rc) {
1377 dprintk("RPC: %s: failed ib_map_phys_fmr "
1378 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1379 len, (unsigned long long)seg1->mr_dma,
1380 pageoff, nsegs, rc);
1381 while (nsegs--)
1382 rpcrdma_unmap_one(ia, --seg);
1383 } else {
1384 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1385 seg1->mr_base = seg1->mr_dma + pageoff;
1386 seg1->mr_nsegs = nsegs;
1387 seg1->mr_len = len;
1388 }
1389 }
1390 break;
1391
1392 /* Registration using memory windows */
1393 case RPCRDMA_MEMWINDOWS_ASYNC:
1394 case RPCRDMA_MEMWINDOWS:
1395 {
1396 struct ib_mw_bind param;
1397 rpcrdma_map_one(ia, seg, writing);
1398 param.mr = ia->ri_bind_mem;
1399 param.wr_id = 0ULL; /* no send cookie */
1400 param.addr = seg->mr_dma;
1401 param.length = seg->mr_len;
1402 param.send_flags = 0;
1403 param.mw_access_flags = mem_priv;
1404
1405 DECR_CQCOUNT(&r_xprt->rx_ep);
1406 rc = ib_bind_mw(ia->ri_id->qp,
1407 seg->mr_chunk.rl_mw->r.mw, &param);
1408 if (rc) {
1409 dprintk("RPC: %s: failed ib_bind_mw "
1410 "%u@0x%llx status %i\n",
1411 __func__, seg->mr_len,
1412 (unsigned long long)seg->mr_dma, rc);
1413 rpcrdma_unmap_one(ia, seg);
1414 } else {
1415 seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1416 seg->mr_base = param.addr;
1417 seg->mr_nsegs = 1;
1418 nsegs = 1;
1419 }
1420 }
1421 break;
1422
1423 /* Default registration each time */
1424 default:
1425 {
1426 struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1427 int len = 0;
1428 if (nsegs > RPCRDMA_MAX_DATA_SEGS)
1429 nsegs = RPCRDMA_MAX_DATA_SEGS;
1430 for (i = 0; i < nsegs;) {
1431 rpcrdma_map_one(ia, seg, writing);
1432 ipb[i].addr = seg->mr_dma;
1433 ipb[i].size = seg->mr_len;
1434 len += seg->mr_len;
1435 ++seg;
1436 ++i;
1437 /* Check for holes */
1438 if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
1439 offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1440 break;
1441 }
1442 nsegs = i;
1443 seg1->mr_base = seg1->mr_dma;
1444 seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1445 ipb, nsegs, mem_priv, &seg1->mr_base);
1446 if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1447 rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1448 dprintk("RPC: %s: failed ib_reg_phys_mr "
1449 "%u@0x%llx (%d)... status %i\n",
1450 __func__, len,
1451 (unsigned long long)seg1->mr_dma, nsegs, rc);
1452 while (nsegs--)
1453 rpcrdma_unmap_one(ia, --seg);
1454 } else {
1455 seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1456 seg1->mr_nsegs = nsegs;
1457 seg1->mr_len = len;
1458 }
1459 }
1460 break;
1461 }
1462 if (rc)
1463 return -1;
1464
1465 return nsegs;
1466}
1467
1468int
1469rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1470 struct rpcrdma_xprt *r_xprt, void *r)
1471{
1472 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1473 struct rpcrdma_mr_seg *seg1 = seg;
1474 int nsegs = seg->mr_nsegs, rc;
1475
1476 switch (ia->ri_memreg_strategy) {
1477
1478#if RPCRDMA_PERSISTENT_REGISTRATION
1479 case RPCRDMA_ALLPHYSICAL:
1480 BUG_ON(nsegs != 1);
1481 rpcrdma_unmap_one(ia, seg);
1482 rc = 0;
1483 break;
1484#endif
1485
1486 case RPCRDMA_MTHCAFMR:
1487 {
1488 LIST_HEAD(l);
1489 list_add(&seg->mr_chunk.rl_mw->r.fmr->list, &l);
1490 rc = ib_unmap_fmr(&l);
1491 while (seg1->mr_nsegs--)
1492 rpcrdma_unmap_one(ia, seg++);
1493 }
1494 if (rc)
1495 dprintk("RPC: %s: failed ib_unmap_fmr,"
1496 " status %i\n", __func__, rc);
1497 break;
1498
1499 case RPCRDMA_MEMWINDOWS_ASYNC:
1500 case RPCRDMA_MEMWINDOWS:
1501 {
1502 struct ib_mw_bind param;
1503 BUG_ON(nsegs != 1);
1504 param.mr = ia->ri_bind_mem;
1505 param.addr = 0ULL; /* unbind */
1506 param.length = 0;
1507 param.mw_access_flags = 0;
1508 if (r) {
1509 param.wr_id = (u64) (unsigned long) r;
1510 param.send_flags = IB_SEND_SIGNALED;
1511 INIT_CQCOUNT(&r_xprt->rx_ep);
1512 } else {
1513 param.wr_id = 0ULL;
1514 param.send_flags = 0;
1515 DECR_CQCOUNT(&r_xprt->rx_ep);
1516 }
1517 rc = ib_bind_mw(ia->ri_id->qp,
1518 seg->mr_chunk.rl_mw->r.mw, &param);
1519 rpcrdma_unmap_one(ia, seg);
1520 }
1521 if (rc)
1522 dprintk("RPC: %s: failed ib_(un)bind_mw,"
1523 " status %i\n", __func__, rc);
1524 else
1525 r = NULL; /* will upcall on completion */
1526 break;
1527
1528 default:
1529 rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1530 seg1->mr_chunk.rl_mr = NULL;
1531 while (seg1->mr_nsegs--)
1532 rpcrdma_unmap_one(ia, seg++);
1533 if (rc)
1534 dprintk("RPC: %s: failed ib_dereg_mr,"
1535 " status %i\n", __func__, rc);
1536 break;
1537 }
1538 if (r) {
1539 struct rpcrdma_rep *rep = r;
1540 void (*func)(struct rpcrdma_rep *) = rep->rr_func;
1541 rep->rr_func = NULL;
1542 func(rep); /* dereg done, callback now */
1543 }
1544 return nsegs;
1545}
1546
1547/*
1548 * Prepost any receive buffer, then post send.
1549 *
1550 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1551 */
1552int
1553rpcrdma_ep_post(struct rpcrdma_ia *ia,
1554 struct rpcrdma_ep *ep,
1555 struct rpcrdma_req *req)
1556{
1557 struct ib_send_wr send_wr, *send_wr_fail;
1558 struct rpcrdma_rep *rep = req->rl_reply;
1559 int rc;
1560
1561 if (rep) {
1562 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1563 if (rc)
1564 goto out;
1565 req->rl_reply = NULL;
1566 }
1567
1568 send_wr.next = NULL;
1569 send_wr.wr_id = 0ULL; /* no send cookie */
1570 send_wr.sg_list = req->rl_send_iov;
1571 send_wr.num_sge = req->rl_niovs;
1572 send_wr.opcode = IB_WR_SEND;
1573 send_wr.imm_data = 0;
1574 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1575 ib_dma_sync_single_for_device(ia->ri_id->device,
1576 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1577 DMA_TO_DEVICE);
1578 ib_dma_sync_single_for_device(ia->ri_id->device,
1579 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1580 DMA_TO_DEVICE);
1581 ib_dma_sync_single_for_device(ia->ri_id->device,
1582 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1583 DMA_TO_DEVICE);
1584
1585 if (DECR_CQCOUNT(ep) > 0)
1586 send_wr.send_flags = 0;
1587 else { /* Provider must take a send completion every now and then */
1588 INIT_CQCOUNT(ep);
1589 send_wr.send_flags = IB_SEND_SIGNALED;
1590 }
1591
1592 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1593 if (rc)
1594 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
1595 rc);
1596out:
1597 return rc;
1598}
1599
1600/*
1601 * (Re)post a receive buffer.
1602 */
1603int
1604rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1605 struct rpcrdma_ep *ep,
1606 struct rpcrdma_rep *rep)
1607{
1608 struct ib_recv_wr recv_wr, *recv_wr_fail;
1609 int rc;
1610
1611 recv_wr.next = NULL;
1612 recv_wr.wr_id = (u64) (unsigned long) rep;
1613 recv_wr.sg_list = &rep->rr_iov;
1614 recv_wr.num_sge = 1;
1615
1616 ib_dma_sync_single_for_cpu(ia->ri_id->device,
1617 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1618
1619 DECR_CQCOUNT(ep);
1620 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1621
1622 if (rc)
1623 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1624 rc);
1625 return rc;
1626}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
new file mode 100644
index 000000000000..2427822f8bd4
--- /dev/null
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -0,0 +1,330 @@
1/*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40#ifndef _LINUX_SUNRPC_XPRT_RDMA_H
41#define _LINUX_SUNRPC_XPRT_RDMA_H
42
43#include <linux/wait.h> /* wait_queue_head_t, etc */
44#include <linux/spinlock.h> /* spinlock_t, etc */
45#include <asm/atomic.h> /* atomic_t, etc */
46
47#include <rdma/rdma_cm.h> /* RDMA connection api */
48#include <rdma/ib_verbs.h> /* RDMA verbs api */
49
50#include <linux/sunrpc/clnt.h> /* rpc_xprt */
51#include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */
52#include <linux/sunrpc/xprtrdma.h> /* xprt parameters */
53
54/*
55 * Interface Adapter -- one per transport instance
56 */
57struct rpcrdma_ia {
58 struct rdma_cm_id *ri_id;
59 struct ib_pd *ri_pd;
60 struct ib_mr *ri_bind_mem;
61 struct completion ri_done;
62 int ri_async_rc;
63 enum rpcrdma_memreg ri_memreg_strategy;
64};
65
66/*
67 * RDMA Endpoint -- one per transport instance
68 */
69
70struct rpcrdma_ep {
71 atomic_t rep_cqcount;
72 int rep_cqinit;
73 int rep_connected;
74 struct rpcrdma_ia *rep_ia;
75 struct ib_cq *rep_cq;
76 struct ib_qp_init_attr rep_attr;
77 wait_queue_head_t rep_connect_wait;
78 struct ib_sge rep_pad; /* holds zeroed pad */
79 struct ib_mr *rep_pad_mr; /* holds zeroed pad */
80 void (*rep_func)(struct rpcrdma_ep *);
81 struct rpc_xprt *rep_xprt; /* for rep_func */
82 struct rdma_conn_param rep_remote_cma;
83 struct sockaddr_storage rep_remote_addr;
84};
85
86#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
87#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
88
89/*
90 * struct rpcrdma_rep -- this structure encapsulates state required to recv
91 * and complete a reply, asynchronously. It needs several pieces of
92 * state:
93 * o recv buffer (posted to provider)
94 * o ib_sge (also donated to provider)
95 * o status of reply (length, success or not)
96 * o bookkeeping state to get run by tasklet (list, etc)
97 *
98 * These are allocated during initialization, per-transport instance;
99 * however, the tasklet execution list itself is global, as it should
100 * always be pretty short.
101 *
102 * N of these are associated with a transport instance, and stored in
103 * struct rpcrdma_buffer. N is the max number of outstanding requests.
104 */
105
106/* temporary static scatter/gather max */
107#define RPCRDMA_MAX_DATA_SEGS (8) /* max scatter/gather */
108#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
109#define MAX_RPCRDMAHDR (\
110 /* max supported RPC/RDMA header */ \
111 sizeof(struct rpcrdma_msg) + (2 * sizeof(u32)) + \
112 (sizeof(struct rpcrdma_read_chunk) * RPCRDMA_MAX_SEGS) + sizeof(u32))
113
114struct rpcrdma_buffer;
115
116struct rpcrdma_rep {
117 unsigned int rr_len; /* actual received reply length */
118 struct rpcrdma_buffer *rr_buffer; /* home base for this structure */
119 struct rpc_xprt *rr_xprt; /* needed for request/reply matching */
120 void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */
121 struct list_head rr_list; /* tasklet list */
122 wait_queue_head_t rr_unbind; /* optional unbind wait */
123 struct ib_sge rr_iov; /* for posting */
124 struct ib_mr *rr_handle; /* handle for mem in rr_iov */
125 char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */
126};
127
128/*
129 * struct rpcrdma_req -- structure central to the request/reply sequence.
130 *
131 * N of these are associated with a transport instance, and stored in
132 * struct rpcrdma_buffer. N is the max number of outstanding requests.
133 *
134 * It includes pre-registered buffer memory for send AND recv.
135 * The recv buffer, however, is not owned by this structure, and
136 * is "donated" to the hardware when a recv is posted. When a
137 * reply is handled, the recv buffer used is given back to the
138 * struct rpcrdma_req associated with the request.
139 *
140 * In addition to the basic memory, this structure includes an array
141 * of iovs for send operations. The reason is that the iovs passed to
142 * ib_post_{send,recv} must not be modified until the work request
143 * completes.
144 *
145 * NOTES:
146 * o RPCRDMA_MAX_SEGS is the max number of addressable chunk elements we
147 * marshal. The number needed varies depending on the iov lists that
148 * are passed to us, the memory registration mode we are in, and if
149 * physical addressing is used, the layout.
150 */
151
152struct rpcrdma_mr_seg { /* chunk descriptors */
153 union { /* chunk memory handles */
154 struct ib_mr *rl_mr; /* if registered directly */
155 struct rpcrdma_mw { /* if registered from region */
156 union {
157 struct ib_mw *mw;
158 struct ib_fmr *fmr;
159 } r;
160 struct list_head mw_list;
161 } *rl_mw;
162 } mr_chunk;
163 u64 mr_base; /* registration result */
164 u32 mr_rkey; /* registration result */
165 u32 mr_len; /* length of chunk or segment */
166 int mr_nsegs; /* number of segments in chunk or 0 */
167 enum dma_data_direction mr_dir; /* segment mapping direction */
168 dma_addr_t mr_dma; /* segment mapping address */
169 size_t mr_dmalen; /* segment mapping length */
170 struct page *mr_page; /* owning page, if any */
171 char *mr_offset; /* kva if no page, else offset */
172};
173
174struct rpcrdma_req {
175 size_t rl_size; /* actual length of buffer */
176 unsigned int rl_niovs; /* 0, 2 or 4 */
177 unsigned int rl_nchunks; /* non-zero if chunks */
178 struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
179 struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
180 struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */
181 struct ib_sge rl_send_iov[4]; /* for active requests */
182 struct ib_sge rl_iov; /* for posting */
183 struct ib_mr *rl_handle; /* handle for mem in rl_iov */
184 char rl_base[MAX_RPCRDMAHDR]; /* start of actual buffer */
185 __u32 rl_xdr_buf[0]; /* start of returned rpc rq_buffer */
186};
187#define rpcr_to_rdmar(r) \
188 container_of((r)->rq_buffer, struct rpcrdma_req, rl_xdr_buf[0])
189
190/*
191 * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for
192 * inline requests/replies, and client/server credits.
193 *
194 * One of these is associated with a transport instance
195 */
196struct rpcrdma_buffer {
197 spinlock_t rb_lock; /* protects indexes */
198 atomic_t rb_credits; /* most recent server credits */
199 unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */
200 int rb_max_requests;/* client max requests */
201 struct list_head rb_mws; /* optional memory windows/fmrs */
202 int rb_send_index;
203 struct rpcrdma_req **rb_send_bufs;
204 int rb_recv_index;
205 struct rpcrdma_rep **rb_recv_bufs;
206 char *rb_pool;
207};
208#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
209
210/*
211 * Internal structure for transport instance creation. This
212 * exists primarily for modularity.
213 *
214 * This data should be set with mount options
215 */
216struct rpcrdma_create_data_internal {
217 struct sockaddr_storage addr; /* RDMA server address */
218 unsigned int max_requests; /* max requests (slots) in flight */
219 unsigned int rsize; /* mount rsize - max read hdr+data */
220 unsigned int wsize; /* mount wsize - max write hdr+data */
221 unsigned int inline_rsize; /* max non-rdma read data payload */
222 unsigned int inline_wsize; /* max non-rdma write data payload */
223 unsigned int padding; /* non-rdma write header padding */
224};
225
226#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
227 (rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_rsize)
228
229#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
230 (rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_wsize)
231
232#define RPCRDMA_INLINE_PAD_VALUE(rq)\
233 rpcx_to_rdmad(rq->rq_task->tk_xprt).padding
234
235/*
236 * Statistics for RPCRDMA
237 */
238struct rpcrdma_stats {
239 unsigned long read_chunk_count;
240 unsigned long write_chunk_count;
241 unsigned long reply_chunk_count;
242
243 unsigned long long total_rdma_request;
244 unsigned long long total_rdma_reply;
245
246 unsigned long long pullup_copy_count;
247 unsigned long long fixup_copy_count;
248 unsigned long hardway_register_count;
249 unsigned long failed_marshal_count;
250 unsigned long bad_reply_count;
251};
252
253/*
254 * RPCRDMA transport -- encapsulates the structures above for
255 * integration with RPC.
256 *
257 * The contained structures are embedded, not pointers,
258 * for convenience. This structure need not be visible externally.
259 *
260 * It is allocated and initialized during mount, and released
261 * during unmount.
262 */
263struct rpcrdma_xprt {
264 struct rpc_xprt xprt;
265 struct rpcrdma_ia rx_ia;
266 struct rpcrdma_ep rx_ep;
267 struct rpcrdma_buffer rx_buf;
268 struct rpcrdma_create_data_internal rx_data;
269 struct delayed_work rdma_connect;
270 struct rpcrdma_stats rx_stats;
271};
272
273#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt)
274#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)
275
276/*
277 * Interface Adapter calls - xprtrdma/verbs.c
278 */
279int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
280void rpcrdma_ia_close(struct rpcrdma_ia *);
281
282/*
283 * Endpoint calls - xprtrdma/verbs.c
284 */
285int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
286 struct rpcrdma_create_data_internal *);
287int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
288int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
289int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
290
291int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
292 struct rpcrdma_req *);
293int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
294 struct rpcrdma_rep *);
295
296/*
297 * Buffer calls - xprtrdma/verbs.c
298 */
299int rpcrdma_buffer_create(struct rpcrdma_buffer *, struct rpcrdma_ep *,
300 struct rpcrdma_ia *,
301 struct rpcrdma_create_data_internal *);
302void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
303
304struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
305void rpcrdma_buffer_put(struct rpcrdma_req *);
306void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
307void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
308
309int rpcrdma_register_internal(struct rpcrdma_ia *, void *, int,
310 struct ib_mr **, struct ib_sge *);
311int rpcrdma_deregister_internal(struct rpcrdma_ia *,
312 struct ib_mr *, struct ib_sge *);
313
314int rpcrdma_register_external(struct rpcrdma_mr_seg *,
315 int, int, struct rpcrdma_xprt *);
316int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
317 struct rpcrdma_xprt *, void *);
318
319/*
320 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
321 */
322void rpcrdma_conn_func(struct rpcrdma_ep *);
323void rpcrdma_reply_handler(struct rpcrdma_rep *);
324
325/*
326 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
327 */
328int rpcrdma_marshal_req(struct rpc_rqst *);
329
330#endif /* _LINUX_SUNRPC_XPRT_RDMA_H */
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 282efd447a61..02298f529dad 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -13,10 +13,14 @@
13 * (C) 1999 Trond Myklebust <trond.myklebust@fys.uio.no> 13 * (C) 1999 Trond Myklebust <trond.myklebust@fys.uio.no>
14 * 14 *
15 * IP socket transport implementation, (C) 2005 Chuck Lever <cel@netapp.com> 15 * IP socket transport implementation, (C) 2005 Chuck Lever <cel@netapp.com>
16 *
17 * IPv6 support contributed by Gilles Quillard, Bull Open Source, 2005.
18 * <gilles.quillard@bull.net>
16 */ 19 */
17 20
18#include <linux/types.h> 21#include <linux/types.h>
19#include <linux/slab.h> 22#include <linux/slab.h>
23#include <linux/module.h>
20#include <linux/capability.h> 24#include <linux/capability.h>
21#include <linux/pagemap.h> 25#include <linux/pagemap.h>
22#include <linux/errno.h> 26#include <linux/errno.h>
@@ -28,6 +32,7 @@
28#include <linux/tcp.h> 32#include <linux/tcp.h>
29#include <linux/sunrpc/clnt.h> 33#include <linux/sunrpc/clnt.h>
30#include <linux/sunrpc/sched.h> 34#include <linux/sunrpc/sched.h>
35#include <linux/sunrpc/xprtsock.h>
31#include <linux/file.h> 36#include <linux/file.h>
32 37
33#include <net/sock.h> 38#include <net/sock.h>
@@ -260,14 +265,29 @@ struct sock_xprt {
260#define TCP_RCV_COPY_XID (1UL << 2) 265#define TCP_RCV_COPY_XID (1UL << 2)
261#define TCP_RCV_COPY_DATA (1UL << 3) 266#define TCP_RCV_COPY_DATA (1UL << 3)
262 267
263static void xs_format_peer_addresses(struct rpc_xprt *xprt) 268static inline struct sockaddr *xs_addr(struct rpc_xprt *xprt)
269{
270 return (struct sockaddr *) &xprt->addr;
271}
272
273static inline struct sockaddr_in *xs_addr_in(struct rpc_xprt *xprt)
264{ 274{
265 struct sockaddr_in *addr = (struct sockaddr_in *) &xprt->addr; 275 return (struct sockaddr_in *) &xprt->addr;
276}
277
278static inline struct sockaddr_in6 *xs_addr_in6(struct rpc_xprt *xprt)
279{
280 return (struct sockaddr_in6 *) &xprt->addr;
281}
282
283static void xs_format_ipv4_peer_addresses(struct rpc_xprt *xprt)
284{
285 struct sockaddr_in *addr = xs_addr_in(xprt);
266 char *buf; 286 char *buf;
267 287
268 buf = kzalloc(20, GFP_KERNEL); 288 buf = kzalloc(20, GFP_KERNEL);
269 if (buf) { 289 if (buf) {
270 snprintf(buf, 20, "%u.%u.%u.%u", 290 snprintf(buf, 20, NIPQUAD_FMT,
271 NIPQUAD(addr->sin_addr.s_addr)); 291 NIPQUAD(addr->sin_addr.s_addr));
272 } 292 }
273 xprt->address_strings[RPC_DISPLAY_ADDR] = buf; 293 xprt->address_strings[RPC_DISPLAY_ADDR] = buf;
@@ -279,26 +299,123 @@ static void xs_format_peer_addresses(struct rpc_xprt *xprt)
279 } 299 }
280 xprt->address_strings[RPC_DISPLAY_PORT] = buf; 300 xprt->address_strings[RPC_DISPLAY_PORT] = buf;
281 301
282 if (xprt->prot == IPPROTO_UDP) 302 buf = kzalloc(8, GFP_KERNEL);
283 xprt->address_strings[RPC_DISPLAY_PROTO] = "udp"; 303 if (buf) {
284 else 304 if (xprt->prot == IPPROTO_UDP)
285 xprt->address_strings[RPC_DISPLAY_PROTO] = "tcp"; 305 snprintf(buf, 8, "udp");
306 else
307 snprintf(buf, 8, "tcp");
308 }
309 xprt->address_strings[RPC_DISPLAY_PROTO] = buf;
286 310
287 buf = kzalloc(48, GFP_KERNEL); 311 buf = kzalloc(48, GFP_KERNEL);
288 if (buf) { 312 if (buf) {
289 snprintf(buf, 48, "addr=%u.%u.%u.%u port=%u proto=%s", 313 snprintf(buf, 48, "addr="NIPQUAD_FMT" port=%u proto=%s",
290 NIPQUAD(addr->sin_addr.s_addr), 314 NIPQUAD(addr->sin_addr.s_addr),
291 ntohs(addr->sin_port), 315 ntohs(addr->sin_port),
292 xprt->prot == IPPROTO_UDP ? "udp" : "tcp"); 316 xprt->prot == IPPROTO_UDP ? "udp" : "tcp");
293 } 317 }
294 xprt->address_strings[RPC_DISPLAY_ALL] = buf; 318 xprt->address_strings[RPC_DISPLAY_ALL] = buf;
319
320 buf = kzalloc(10, GFP_KERNEL);
321 if (buf) {
322 snprintf(buf, 10, "%02x%02x%02x%02x",
323 NIPQUAD(addr->sin_addr.s_addr));
324 }
325 xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = buf;
326
327 buf = kzalloc(8, GFP_KERNEL);
328 if (buf) {
329 snprintf(buf, 8, "%4hx",
330 ntohs(addr->sin_port));
331 }
332 xprt->address_strings[RPC_DISPLAY_HEX_PORT] = buf;
333
334 buf = kzalloc(30, GFP_KERNEL);
335 if (buf) {
336 snprintf(buf, 30, NIPQUAD_FMT".%u.%u",
337 NIPQUAD(addr->sin_addr.s_addr),
338 ntohs(addr->sin_port) >> 8,
339 ntohs(addr->sin_port) & 0xff);
340 }
341 xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf;
342
343 xprt->address_strings[RPC_DISPLAY_NETID] =
344 kstrdup(xprt->prot == IPPROTO_UDP ?
345 RPCBIND_NETID_UDP : RPCBIND_NETID_TCP, GFP_KERNEL);
346}
347
348static void xs_format_ipv6_peer_addresses(struct rpc_xprt *xprt)
349{
350 struct sockaddr_in6 *addr = xs_addr_in6(xprt);
351 char *buf;
352
353 buf = kzalloc(40, GFP_KERNEL);
354 if (buf) {
355 snprintf(buf, 40, NIP6_FMT,
356 NIP6(addr->sin6_addr));
357 }
358 xprt->address_strings[RPC_DISPLAY_ADDR] = buf;
359
360 buf = kzalloc(8, GFP_KERNEL);
361 if (buf) {
362 snprintf(buf, 8, "%u",
363 ntohs(addr->sin6_port));
364 }
365 xprt->address_strings[RPC_DISPLAY_PORT] = buf;
366
367 buf = kzalloc(8, GFP_KERNEL);
368 if (buf) {
369 if (xprt->prot == IPPROTO_UDP)
370 snprintf(buf, 8, "udp");
371 else
372 snprintf(buf, 8, "tcp");
373 }
374 xprt->address_strings[RPC_DISPLAY_PROTO] = buf;
375
376 buf = kzalloc(64, GFP_KERNEL);
377 if (buf) {
378 snprintf(buf, 64, "addr="NIP6_FMT" port=%u proto=%s",
379 NIP6(addr->sin6_addr),
380 ntohs(addr->sin6_port),
381 xprt->prot == IPPROTO_UDP ? "udp" : "tcp");
382 }
383 xprt->address_strings[RPC_DISPLAY_ALL] = buf;
384
385 buf = kzalloc(36, GFP_KERNEL);
386 if (buf) {
387 snprintf(buf, 36, NIP6_SEQFMT,
388 NIP6(addr->sin6_addr));
389 }
390 xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = buf;
391
392 buf = kzalloc(8, GFP_KERNEL);
393 if (buf) {
394 snprintf(buf, 8, "%4hx",
395 ntohs(addr->sin6_port));
396 }
397 xprt->address_strings[RPC_DISPLAY_HEX_PORT] = buf;
398
399 buf = kzalloc(50, GFP_KERNEL);
400 if (buf) {
401 snprintf(buf, 50, NIP6_FMT".%u.%u",
402 NIP6(addr->sin6_addr),
403 ntohs(addr->sin6_port) >> 8,
404 ntohs(addr->sin6_port) & 0xff);
405 }
406 xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf;
407
408 xprt->address_strings[RPC_DISPLAY_NETID] =
409 kstrdup(xprt->prot == IPPROTO_UDP ?
410 RPCBIND_NETID_UDP6 : RPCBIND_NETID_TCP6, GFP_KERNEL);
295} 411}
296 412
297static void xs_free_peer_addresses(struct rpc_xprt *xprt) 413static void xs_free_peer_addresses(struct rpc_xprt *xprt)
298{ 414{
299 kfree(xprt->address_strings[RPC_DISPLAY_ADDR]); 415 int i;
300 kfree(xprt->address_strings[RPC_DISPLAY_PORT]); 416
301 kfree(xprt->address_strings[RPC_DISPLAY_ALL]); 417 for (i = 0; i < RPC_DISPLAY_MAX; i++)
418 kfree(xprt->address_strings[i]);
302} 419}
303 420
304#define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) 421#define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL)
@@ -463,19 +580,20 @@ static int xs_udp_send_request(struct rpc_task *task)
463 580
464 req->rq_xtime = jiffies; 581 req->rq_xtime = jiffies;
465 status = xs_sendpages(transport->sock, 582 status = xs_sendpages(transport->sock,
466 (struct sockaddr *) &xprt->addr, 583 xs_addr(xprt),
467 xprt->addrlen, xdr, 584 xprt->addrlen, xdr,
468 req->rq_bytes_sent); 585 req->rq_bytes_sent);
469 586
470 dprintk("RPC: xs_udp_send_request(%u) = %d\n", 587 dprintk("RPC: xs_udp_send_request(%u) = %d\n",
471 xdr->len - req->rq_bytes_sent, status); 588 xdr->len - req->rq_bytes_sent, status);
472 589
473 if (likely(status >= (int) req->rq_slen)) 590 if (status >= 0) {
474 return 0; 591 task->tk_bytes_sent += status;
475 592 if (status >= req->rq_slen)
476 /* Still some bytes left; set up for a retry later. */ 593 return 0;
477 if (status > 0) 594 /* Still some bytes left; set up for a retry later. */
478 status = -EAGAIN; 595 status = -EAGAIN;
596 }
479 597
480 switch (status) { 598 switch (status) {
481 case -ENETUNREACH: 599 case -ENETUNREACH:
@@ -523,7 +641,8 @@ static int xs_tcp_send_request(struct rpc_task *task)
523 struct rpc_xprt *xprt = req->rq_xprt; 641 struct rpc_xprt *xprt = req->rq_xprt;
524 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); 642 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
525 struct xdr_buf *xdr = &req->rq_snd_buf; 643 struct xdr_buf *xdr = &req->rq_snd_buf;
526 int status, retry = 0; 644 int status;
645 unsigned int retry = 0;
527 646
528 xs_encode_tcp_record_marker(&req->rq_snd_buf); 647 xs_encode_tcp_record_marker(&req->rq_snd_buf);
529 648
@@ -661,6 +780,7 @@ static void xs_destroy(struct rpc_xprt *xprt)
661 xs_free_peer_addresses(xprt); 780 xs_free_peer_addresses(xprt);
662 kfree(xprt->slot); 781 kfree(xprt->slot);
663 kfree(xprt); 782 kfree(xprt);
783 module_put(THIS_MODULE);
664} 784}
665 785
666static inline struct rpc_xprt *xprt_from_sock(struct sock *sk) 786static inline struct rpc_xprt *xprt_from_sock(struct sock *sk)
@@ -1139,14 +1259,23 @@ static unsigned short xs_get_random_port(void)
1139 */ 1259 */
1140static void xs_set_port(struct rpc_xprt *xprt, unsigned short port) 1260static void xs_set_port(struct rpc_xprt *xprt, unsigned short port)
1141{ 1261{
1142 struct sockaddr_in *sap = (struct sockaddr_in *) &xprt->addr; 1262 struct sockaddr *addr = xs_addr(xprt);
1143 1263
1144 dprintk("RPC: setting port for xprt %p to %u\n", xprt, port); 1264 dprintk("RPC: setting port for xprt %p to %u\n", xprt, port);
1145 1265
1146 sap->sin_port = htons(port); 1266 switch (addr->sa_family) {
1267 case AF_INET:
1268 ((struct sockaddr_in *)addr)->sin_port = htons(port);
1269 break;
1270 case AF_INET6:
1271 ((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
1272 break;
1273 default:
1274 BUG();
1275 }
1147} 1276}
1148 1277
1149static int xs_bind(struct sock_xprt *transport, struct socket *sock) 1278static int xs_bind4(struct sock_xprt *transport, struct socket *sock)
1150{ 1279{
1151 struct sockaddr_in myaddr = { 1280 struct sockaddr_in myaddr = {
1152 .sin_family = AF_INET, 1281 .sin_family = AF_INET,
@@ -1174,8 +1303,42 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock)
1174 else 1303 else
1175 port--; 1304 port--;
1176 } while (err == -EADDRINUSE && port != transport->port); 1305 } while (err == -EADDRINUSE && port != transport->port);
1177 dprintk("RPC: xs_bind "NIPQUAD_FMT":%u: %s (%d)\n", 1306 dprintk("RPC: %s "NIPQUAD_FMT":%u: %s (%d)\n",
1178 NIPQUAD(myaddr.sin_addr), port, err ? "failed" : "ok", err); 1307 __FUNCTION__, NIPQUAD(myaddr.sin_addr),
1308 port, err ? "failed" : "ok", err);
1309 return err;
1310}
1311
1312static int xs_bind6(struct sock_xprt *transport, struct socket *sock)
1313{
1314 struct sockaddr_in6 myaddr = {
1315 .sin6_family = AF_INET6,
1316 };
1317 struct sockaddr_in6 *sa;
1318 int err;
1319 unsigned short port = transport->port;
1320
1321 if (!transport->xprt.resvport)
1322 port = 0;
1323 sa = (struct sockaddr_in6 *)&transport->addr;
1324 myaddr.sin6_addr = sa->sin6_addr;
1325 do {
1326 myaddr.sin6_port = htons(port);
1327 err = kernel_bind(sock, (struct sockaddr *) &myaddr,
1328 sizeof(myaddr));
1329 if (!transport->xprt.resvport)
1330 break;
1331 if (err == 0) {
1332 transport->port = port;
1333 break;
1334 }
1335 if (port <= xprt_min_resvport)
1336 port = xprt_max_resvport;
1337 else
1338 port--;
1339 } while (err == -EADDRINUSE && port != transport->port);
1340 dprintk("RPC: xs_bind6 "NIP6_FMT":%u: %s (%d)\n",
1341 NIP6(myaddr.sin6_addr), port, err ? "failed" : "ok", err);
1179 return err; 1342 return err;
1180} 1343}
1181 1344
@@ -1183,38 +1346,69 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock)
1183static struct lock_class_key xs_key[2]; 1346static struct lock_class_key xs_key[2];
1184static struct lock_class_key xs_slock_key[2]; 1347static struct lock_class_key xs_slock_key[2];
1185 1348
1186static inline void xs_reclassify_socket(struct socket *sock) 1349static inline void xs_reclassify_socket4(struct socket *sock)
1187{ 1350{
1188 struct sock *sk = sock->sk; 1351 struct sock *sk = sock->sk;
1352
1189 BUG_ON(sock_owned_by_user(sk)); 1353 BUG_ON(sock_owned_by_user(sk));
1190 switch (sk->sk_family) { 1354 sock_lock_init_class_and_name(sk, "slock-AF_INET-RPC",
1191 case AF_INET: 1355 &xs_slock_key[0], "sk_lock-AF_INET-RPC", &xs_key[0]);
1192 sock_lock_init_class_and_name(sk, "slock-AF_INET-NFS", 1356}
1193 &xs_slock_key[0], "sk_lock-AF_INET-NFS", &xs_key[0]);
1194 break;
1195 1357
1196 case AF_INET6: 1358static inline void xs_reclassify_socket6(struct socket *sock)
1197 sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFS", 1359{
1198 &xs_slock_key[1], "sk_lock-AF_INET6-NFS", &xs_key[1]); 1360 struct sock *sk = sock->sk;
1199 break;
1200 1361
1201 default: 1362 BUG_ON(sock_owned_by_user(sk));
1202 BUG(); 1363 sock_lock_init_class_and_name(sk, "slock-AF_INET6-RPC",
1203 } 1364 &xs_slock_key[1], "sk_lock-AF_INET6-RPC", &xs_key[1]);
1204} 1365}
1205#else 1366#else
1206static inline void xs_reclassify_socket(struct socket *sock) 1367static inline void xs_reclassify_socket4(struct socket *sock)
1368{
1369}
1370
1371static inline void xs_reclassify_socket6(struct socket *sock)
1207{ 1372{
1208} 1373}
1209#endif 1374#endif
1210 1375
1376static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
1377{
1378 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
1379
1380 if (!transport->inet) {
1381 struct sock *sk = sock->sk;
1382
1383 write_lock_bh(&sk->sk_callback_lock);
1384
1385 sk->sk_user_data = xprt;
1386 transport->old_data_ready = sk->sk_data_ready;
1387 transport->old_state_change = sk->sk_state_change;
1388 transport->old_write_space = sk->sk_write_space;
1389 sk->sk_data_ready = xs_udp_data_ready;
1390 sk->sk_write_space = xs_udp_write_space;
1391 sk->sk_no_check = UDP_CSUM_NORCV;
1392 sk->sk_allocation = GFP_ATOMIC;
1393
1394 xprt_set_connected(xprt);
1395
1396 /* Reset to new socket */
1397 transport->sock = sock;
1398 transport->inet = sk;
1399
1400 write_unlock_bh(&sk->sk_callback_lock);
1401 }
1402 xs_udp_do_set_buffer_size(xprt);
1403}
1404
1211/** 1405/**
1212 * xs_udp_connect_worker - set up a UDP socket 1406 * xs_udp_connect_worker4 - set up a UDP socket
1213 * @work: RPC transport to connect 1407 * @work: RPC transport to connect
1214 * 1408 *
1215 * Invoked by a work queue tasklet. 1409 * Invoked by a work queue tasklet.
1216 */ 1410 */
1217static void xs_udp_connect_worker(struct work_struct *work) 1411static void xs_udp_connect_worker4(struct work_struct *work)
1218{ 1412{
1219 struct sock_xprt *transport = 1413 struct sock_xprt *transport =
1220 container_of(work, struct sock_xprt, connect_worker.work); 1414 container_of(work, struct sock_xprt, connect_worker.work);
@@ -1232,9 +1426,9 @@ static void xs_udp_connect_worker(struct work_struct *work)
1232 dprintk("RPC: can't create UDP transport socket (%d).\n", -err); 1426 dprintk("RPC: can't create UDP transport socket (%d).\n", -err);
1233 goto out; 1427 goto out;
1234 } 1428 }
1235 xs_reclassify_socket(sock); 1429 xs_reclassify_socket4(sock);
1236 1430
1237 if (xs_bind(transport, sock)) { 1431 if (xs_bind4(transport, sock)) {
1238 sock_release(sock); 1432 sock_release(sock);
1239 goto out; 1433 goto out;
1240 } 1434 }
@@ -1242,29 +1436,48 @@ static void xs_udp_connect_worker(struct work_struct *work)
1242 dprintk("RPC: worker connecting xprt %p to address: %s\n", 1436 dprintk("RPC: worker connecting xprt %p to address: %s\n",
1243 xprt, xprt->address_strings[RPC_DISPLAY_ALL]); 1437 xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
1244 1438
1245 if (!transport->inet) { 1439 xs_udp_finish_connecting(xprt, sock);
1246 struct sock *sk = sock->sk; 1440 status = 0;
1441out:
1442 xprt_wake_pending_tasks(xprt, status);
1443 xprt_clear_connecting(xprt);
1444}
1247 1445
1248 write_lock_bh(&sk->sk_callback_lock); 1446/**
1447 * xs_udp_connect_worker6 - set up a UDP socket
1448 * @work: RPC transport to connect
1449 *
1450 * Invoked by a work queue tasklet.
1451 */
1452static void xs_udp_connect_worker6(struct work_struct *work)
1453{
1454 struct sock_xprt *transport =
1455 container_of(work, struct sock_xprt, connect_worker.work);
1456 struct rpc_xprt *xprt = &transport->xprt;
1457 struct socket *sock = transport->sock;
1458 int err, status = -EIO;
1249 1459
1250 sk->sk_user_data = xprt; 1460 if (xprt->shutdown || !xprt_bound(xprt))
1251 transport->old_data_ready = sk->sk_data_ready; 1461 goto out;
1252 transport->old_state_change = sk->sk_state_change;
1253 transport->old_write_space = sk->sk_write_space;
1254 sk->sk_data_ready = xs_udp_data_ready;
1255 sk->sk_write_space = xs_udp_write_space;
1256 sk->sk_no_check = UDP_CSUM_NORCV;
1257 sk->sk_allocation = GFP_ATOMIC;
1258 1462
1259 xprt_set_connected(xprt); 1463 /* Start by resetting any existing state */
1464 xs_close(xprt);
1260 1465
1261 /* Reset to new socket */ 1466 if ((err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) {
1262 transport->sock = sock; 1467 dprintk("RPC: can't create UDP transport socket (%d).\n", -err);
1263 transport->inet = sk; 1468 goto out;
1469 }
1470 xs_reclassify_socket6(sock);
1264 1471
1265 write_unlock_bh(&sk->sk_callback_lock); 1472 if (xs_bind6(transport, sock) < 0) {
1473 sock_release(sock);
1474 goto out;
1266 } 1475 }
1267 xs_udp_do_set_buffer_size(xprt); 1476
1477 dprintk("RPC: worker connecting xprt %p to address: %s\n",
1478 xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
1479
1480 xs_udp_finish_connecting(xprt, sock);
1268 status = 0; 1481 status = 0;
1269out: 1482out:
1270 xprt_wake_pending_tasks(xprt, status); 1483 xprt_wake_pending_tasks(xprt, status);
@@ -1295,13 +1508,52 @@ static void xs_tcp_reuse_connection(struct rpc_xprt *xprt)
1295 result); 1508 result);
1296} 1509}
1297 1510
1511static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
1512{
1513 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
1514
1515 if (!transport->inet) {
1516 struct sock *sk = sock->sk;
1517
1518 write_lock_bh(&sk->sk_callback_lock);
1519
1520 sk->sk_user_data = xprt;
1521 transport->old_data_ready = sk->sk_data_ready;
1522 transport->old_state_change = sk->sk_state_change;
1523 transport->old_write_space = sk->sk_write_space;
1524 sk->sk_data_ready = xs_tcp_data_ready;
1525 sk->sk_state_change = xs_tcp_state_change;
1526 sk->sk_write_space = xs_tcp_write_space;
1527 sk->sk_allocation = GFP_ATOMIC;
1528
1529 /* socket options */
1530 sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
1531 sock_reset_flag(sk, SOCK_LINGER);
1532 tcp_sk(sk)->linger2 = 0;
1533 tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
1534
1535 xprt_clear_connected(xprt);
1536
1537 /* Reset to new socket */
1538 transport->sock = sock;
1539 transport->inet = sk;
1540
1541 write_unlock_bh(&sk->sk_callback_lock);
1542 }
1543
1544 /* Tell the socket layer to start connecting... */
1545 xprt->stat.connect_count++;
1546 xprt->stat.connect_start = jiffies;
1547 return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK);
1548}
1549
1298/** 1550/**
1299 * xs_tcp_connect_worker - connect a TCP socket to a remote endpoint 1551 * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint
1300 * @work: RPC transport to connect 1552 * @work: RPC transport to connect
1301 * 1553 *
1302 * Invoked by a work queue tasklet. 1554 * Invoked by a work queue tasklet.
1303 */ 1555 */
1304static void xs_tcp_connect_worker(struct work_struct *work) 1556static void xs_tcp_connect_worker4(struct work_struct *work)
1305{ 1557{
1306 struct sock_xprt *transport = 1558 struct sock_xprt *transport =
1307 container_of(work, struct sock_xprt, connect_worker.work); 1559 container_of(work, struct sock_xprt, connect_worker.work);
@@ -1315,13 +1567,12 @@ static void xs_tcp_connect_worker(struct work_struct *work)
1315 if (!sock) { 1567 if (!sock) {
1316 /* start from scratch */ 1568 /* start from scratch */
1317 if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { 1569 if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
1318 dprintk("RPC: can't create TCP transport " 1570 dprintk("RPC: can't create TCP transport socket (%d).\n", -err);
1319 "socket (%d).\n", -err);
1320 goto out; 1571 goto out;
1321 } 1572 }
1322 xs_reclassify_socket(sock); 1573 xs_reclassify_socket4(sock);
1323 1574
1324 if (xs_bind(transport, sock)) { 1575 if (xs_bind4(transport, sock) < 0) {
1325 sock_release(sock); 1576 sock_release(sock);
1326 goto out; 1577 goto out;
1327 } 1578 }
@@ -1332,43 +1583,70 @@ static void xs_tcp_connect_worker(struct work_struct *work)
1332 dprintk("RPC: worker connecting xprt %p to address: %s\n", 1583 dprintk("RPC: worker connecting xprt %p to address: %s\n",
1333 xprt, xprt->address_strings[RPC_DISPLAY_ALL]); 1584 xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
1334 1585
1335 if (!transport->inet) { 1586 status = xs_tcp_finish_connecting(xprt, sock);
1336 struct sock *sk = sock->sk; 1587 dprintk("RPC: %p connect status %d connected %d sock state %d\n",
1337 1588 xprt, -status, xprt_connected(xprt),
1338 write_lock_bh(&sk->sk_callback_lock); 1589 sock->sk->sk_state);
1590 if (status < 0) {
1591 switch (status) {
1592 case -EINPROGRESS:
1593 case -EALREADY:
1594 goto out_clear;
1595 case -ECONNREFUSED:
1596 case -ECONNRESET:
1597 /* retry with existing socket, after a delay */
1598 break;
1599 default:
1600 /* get rid of existing socket, and retry */
1601 xs_close(xprt);
1602 break;
1603 }
1604 }
1605out:
1606 xprt_wake_pending_tasks(xprt, status);
1607out_clear:
1608 xprt_clear_connecting(xprt);
1609}
1339 1610
1340 sk->sk_user_data = xprt; 1611/**
1341 transport->old_data_ready = sk->sk_data_ready; 1612 * xs_tcp_connect_worker6 - connect a TCP socket to a remote endpoint
1342 transport->old_state_change = sk->sk_state_change; 1613 * @work: RPC transport to connect
1343 transport->old_write_space = sk->sk_write_space; 1614 *
1344 sk->sk_data_ready = xs_tcp_data_ready; 1615 * Invoked by a work queue tasklet.
1345 sk->sk_state_change = xs_tcp_state_change; 1616 */
1346 sk->sk_write_space = xs_tcp_write_space; 1617static void xs_tcp_connect_worker6(struct work_struct *work)
1347 sk->sk_allocation = GFP_ATOMIC; 1618{
1619 struct sock_xprt *transport =
1620 container_of(work, struct sock_xprt, connect_worker.work);
1621 struct rpc_xprt *xprt = &transport->xprt;
1622 struct socket *sock = transport->sock;
1623 int err, status = -EIO;
1348 1624
1349 /* socket options */ 1625 if (xprt->shutdown || !xprt_bound(xprt))
1350 sk->sk_userlocks |= SOCK_BINDPORT_LOCK; 1626 goto out;
1351 sock_reset_flag(sk, SOCK_LINGER);
1352 tcp_sk(sk)->linger2 = 0;
1353 tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
1354 1627
1355 xprt_clear_connected(xprt); 1628 if (!sock) {
1629 /* start from scratch */
1630 if ((err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
1631 dprintk("RPC: can't create TCP transport socket (%d).\n", -err);
1632 goto out;
1633 }
1634 xs_reclassify_socket6(sock);
1356 1635
1357 /* Reset to new socket */ 1636 if (xs_bind6(transport, sock) < 0) {
1358 transport->sock = sock; 1637 sock_release(sock);
1359 transport->inet = sk; 1638 goto out;
1639 }
1640 } else
1641 /* "close" the socket, preserving the local port */
1642 xs_tcp_reuse_connection(xprt);
1360 1643
1361 write_unlock_bh(&sk->sk_callback_lock); 1644 dprintk("RPC: worker connecting xprt %p to address: %s\n",
1362 } 1645 xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
1363 1646
1364 /* Tell the socket layer to start connecting... */ 1647 status = xs_tcp_finish_connecting(xprt, sock);
1365 xprt->stat.connect_count++;
1366 xprt->stat.connect_start = jiffies;
1367 status = kernel_connect(sock, (struct sockaddr *) &xprt->addr,
1368 xprt->addrlen, O_NONBLOCK);
1369 dprintk("RPC: %p connect status %d connected %d sock state %d\n", 1648 dprintk("RPC: %p connect status %d connected %d sock state %d\n",
1370 xprt, -status, xprt_connected(xprt), 1649 xprt, -status, xprt_connected(xprt), sock->sk->sk_state);
1371 sock->sk->sk_state);
1372 if (status < 0) { 1650 if (status < 0) {
1373 switch (status) { 1651 switch (status) {
1374 case -EINPROGRESS: 1652 case -EINPROGRESS:
@@ -1508,7 +1786,8 @@ static struct rpc_xprt_ops xs_tcp_ops = {
1508 .print_stats = xs_tcp_print_stats, 1786 .print_stats = xs_tcp_print_stats,
1509}; 1787};
1510 1788
1511static struct rpc_xprt *xs_setup_xprt(struct rpc_xprtsock_create *args, unsigned int slot_table_size) 1789static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
1790 unsigned int slot_table_size)
1512{ 1791{
1513 struct rpc_xprt *xprt; 1792 struct rpc_xprt *xprt;
1514 struct sock_xprt *new; 1793 struct sock_xprt *new;
@@ -1549,8 +1828,9 @@ static struct rpc_xprt *xs_setup_xprt(struct rpc_xprtsock_create *args, unsigned
1549 * @args: rpc transport creation arguments 1828 * @args: rpc transport creation arguments
1550 * 1829 *
1551 */ 1830 */
1552struct rpc_xprt *xs_setup_udp(struct rpc_xprtsock_create *args) 1831struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
1553{ 1832{
1833 struct sockaddr *addr = args->dstaddr;
1554 struct rpc_xprt *xprt; 1834 struct rpc_xprt *xprt;
1555 struct sock_xprt *transport; 1835 struct sock_xprt *transport;
1556 1836
@@ -1559,15 +1839,11 @@ struct rpc_xprt *xs_setup_udp(struct rpc_xprtsock_create *args)
1559 return xprt; 1839 return xprt;
1560 transport = container_of(xprt, struct sock_xprt, xprt); 1840 transport = container_of(xprt, struct sock_xprt, xprt);
1561 1841
1562 if (ntohs(((struct sockaddr_in *)args->dstaddr)->sin_port) != 0)
1563 xprt_set_bound(xprt);
1564
1565 xprt->prot = IPPROTO_UDP; 1842 xprt->prot = IPPROTO_UDP;
1566 xprt->tsh_size = 0; 1843 xprt->tsh_size = 0;
1567 /* XXX: header size can vary due to auth type, IPv6, etc. */ 1844 /* XXX: header size can vary due to auth type, IPv6, etc. */
1568 xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); 1845 xprt->max_payload = (1U << 16) - (MAX_HEADER << 3);
1569 1846
1570 INIT_DELAYED_WORK(&transport->connect_worker, xs_udp_connect_worker);
1571 xprt->bind_timeout = XS_BIND_TO; 1847 xprt->bind_timeout = XS_BIND_TO;
1572 xprt->connect_timeout = XS_UDP_CONN_TO; 1848 xprt->connect_timeout = XS_UDP_CONN_TO;
1573 xprt->reestablish_timeout = XS_UDP_REEST_TO; 1849 xprt->reestablish_timeout = XS_UDP_REEST_TO;
@@ -1580,11 +1856,37 @@ struct rpc_xprt *xs_setup_udp(struct rpc_xprtsock_create *args)
1580 else 1856 else
1581 xprt_set_timeout(&xprt->timeout, 5, 5 * HZ); 1857 xprt_set_timeout(&xprt->timeout, 5, 5 * HZ);
1582 1858
1583 xs_format_peer_addresses(xprt); 1859 switch (addr->sa_family) {
1860 case AF_INET:
1861 if (((struct sockaddr_in *)addr)->sin_port != htons(0))
1862 xprt_set_bound(xprt);
1863
1864 INIT_DELAYED_WORK(&transport->connect_worker,
1865 xs_udp_connect_worker4);
1866 xs_format_ipv4_peer_addresses(xprt);
1867 break;
1868 case AF_INET6:
1869 if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
1870 xprt_set_bound(xprt);
1871
1872 INIT_DELAYED_WORK(&transport->connect_worker,
1873 xs_udp_connect_worker6);
1874 xs_format_ipv6_peer_addresses(xprt);
1875 break;
1876 default:
1877 kfree(xprt);
1878 return ERR_PTR(-EAFNOSUPPORT);
1879 }
1880
1584 dprintk("RPC: set up transport to address %s\n", 1881 dprintk("RPC: set up transport to address %s\n",
1585 xprt->address_strings[RPC_DISPLAY_ALL]); 1882 xprt->address_strings[RPC_DISPLAY_ALL]);
1586 1883
1587 return xprt; 1884 if (try_module_get(THIS_MODULE))
1885 return xprt;
1886
1887 kfree(xprt->slot);
1888 kfree(xprt);
1889 return ERR_PTR(-EINVAL);
1588} 1890}
1589 1891
1590/** 1892/**
@@ -1592,8 +1894,9 @@ struct rpc_xprt *xs_setup_udp(struct rpc_xprtsock_create *args)
1592 * @args: rpc transport creation arguments 1894 * @args: rpc transport creation arguments
1593 * 1895 *
1594 */ 1896 */
1595struct rpc_xprt *xs_setup_tcp(struct rpc_xprtsock_create *args) 1897struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
1596{ 1898{
1899 struct sockaddr *addr = args->dstaddr;
1597 struct rpc_xprt *xprt; 1900 struct rpc_xprt *xprt;
1598 struct sock_xprt *transport; 1901 struct sock_xprt *transport;
1599 1902
@@ -1602,14 +1905,10 @@ struct rpc_xprt *xs_setup_tcp(struct rpc_xprtsock_create *args)
1602 return xprt; 1905 return xprt;
1603 transport = container_of(xprt, struct sock_xprt, xprt); 1906 transport = container_of(xprt, struct sock_xprt, xprt);
1604 1907
1605 if (ntohs(((struct sockaddr_in *)args->dstaddr)->sin_port) != 0)
1606 xprt_set_bound(xprt);
1607
1608 xprt->prot = IPPROTO_TCP; 1908 xprt->prot = IPPROTO_TCP;
1609 xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); 1909 xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
1610 xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; 1910 xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
1611 1911
1612 INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker);
1613 xprt->bind_timeout = XS_BIND_TO; 1912 xprt->bind_timeout = XS_BIND_TO;
1614 xprt->connect_timeout = XS_TCP_CONN_TO; 1913 xprt->connect_timeout = XS_TCP_CONN_TO;
1615 xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; 1914 xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
@@ -1622,15 +1921,55 @@ struct rpc_xprt *xs_setup_tcp(struct rpc_xprtsock_create *args)
1622 else 1921 else
1623 xprt_set_timeout(&xprt->timeout, 2, 60 * HZ); 1922 xprt_set_timeout(&xprt->timeout, 2, 60 * HZ);
1624 1923
1625 xs_format_peer_addresses(xprt); 1924 switch (addr->sa_family) {
1925 case AF_INET:
1926 if (((struct sockaddr_in *)addr)->sin_port != htons(0))
1927 xprt_set_bound(xprt);
1928
1929 INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker4);
1930 xs_format_ipv4_peer_addresses(xprt);
1931 break;
1932 case AF_INET6:
1933 if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
1934 xprt_set_bound(xprt);
1935
1936 INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker6);
1937 xs_format_ipv6_peer_addresses(xprt);
1938 break;
1939 default:
1940 kfree(xprt);
1941 return ERR_PTR(-EAFNOSUPPORT);
1942 }
1943
1626 dprintk("RPC: set up transport to address %s\n", 1944 dprintk("RPC: set up transport to address %s\n",
1627 xprt->address_strings[RPC_DISPLAY_ALL]); 1945 xprt->address_strings[RPC_DISPLAY_ALL]);
1628 1946
1629 return xprt; 1947 if (try_module_get(THIS_MODULE))
1948 return xprt;
1949
1950 kfree(xprt->slot);
1951 kfree(xprt);
1952 return ERR_PTR(-EINVAL);
1630} 1953}
1631 1954
1955static struct xprt_class xs_udp_transport = {
1956 .list = LIST_HEAD_INIT(xs_udp_transport.list),
1957 .name = "udp",
1958 .owner = THIS_MODULE,
1959 .ident = IPPROTO_UDP,
1960 .setup = xs_setup_udp,
1961};
1962
1963static struct xprt_class xs_tcp_transport = {
1964 .list = LIST_HEAD_INIT(xs_tcp_transport.list),
1965 .name = "tcp",
1966 .owner = THIS_MODULE,
1967 .ident = IPPROTO_TCP,
1968 .setup = xs_setup_tcp,
1969};
1970
1632/** 1971/**
1633 * init_socket_xprt - set up xprtsock's sysctls 1972 * init_socket_xprt - set up xprtsock's sysctls, register with RPC client
1634 * 1973 *
1635 */ 1974 */
1636int init_socket_xprt(void) 1975int init_socket_xprt(void)
@@ -1640,11 +1979,14 @@ int init_socket_xprt(void)
1640 sunrpc_table_header = register_sysctl_table(sunrpc_table); 1979 sunrpc_table_header = register_sysctl_table(sunrpc_table);
1641#endif 1980#endif
1642 1981
1982 xprt_register_transport(&xs_udp_transport);
1983 xprt_register_transport(&xs_tcp_transport);
1984
1643 return 0; 1985 return 0;
1644} 1986}
1645 1987
1646/** 1988/**
1647 * cleanup_socket_xprt - remove xprtsock's sysctls 1989 * cleanup_socket_xprt - remove xprtsock's sysctls, unregister
1648 * 1990 *
1649 */ 1991 */
1650void cleanup_socket_xprt(void) 1992void cleanup_socket_xprt(void)
@@ -1655,4 +1997,7 @@ void cleanup_socket_xprt(void)
1655 sunrpc_table_header = NULL; 1997 sunrpc_table_header = NULL;
1656 } 1998 }
1657#endif 1999#endif
2000
2001 xprt_unregister_transport(&xs_udp_transport);
2002 xprt_unregister_transport(&xs_tcp_transport);
1658} 2003}