author		Linus Torvalds <torvalds@linux-foundation.org>	2008-02-01 22:31:28 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-02-01 22:31:28 -0500
commit		63e9b66e29357dd12e8b1d3ebf7036e7591f81e3 (patch)
tree		5aa6a70a8f4bbf306e2825a1e2fa2660c2c1c187
parent		687fcdf741e4a268c2c7bac8b3734de761bb9719 (diff)
parent		ea339d46b93c7b16e067a29aad1812f7a389815a (diff)
Merge branch 'for-linus' of git://linux-nfs.org/~bfields/linux
* 'for-linus' of git://linux-nfs.org/~bfields/linux: (100 commits)
  SUNRPC: RPC program information is stored in unsigned integers
  SUNRPC: Move exported symbol definitions after function declaration part 2
  NLM: tear down RPC clients in nlm_shutdown_hosts
  SUNRPC: spin svc_rqst initialization to its own function
  nfsd: more careful input validation in nfsctl write methods
  lockd: minor log message fix
  knfsd: don't bother mapping putrootfh enoent to eperm
  rdma: makefile
  rdma: ONCRPC RDMA protocol marshalling
  rdma: SVCRDMA sendto
  rdma: SVCRDMA recvfrom
  rdma: SVCRDMA Core Transport Services
  rdma: SVCRDMA Transport Module
  rdma: SVCRMDA Header File
  svc: Add svc_xprt_names service to replace svc_sock_names
  knfsd: Support adding transports by writing portlist file
  svc: Add svc API that queries for a transport instance
  svc: Add /proc/sys/sunrpc/transport files
  svc: Add transport hdr size for defer/revisit
  svc: Move the xprt independent code to the svc_xprt.c file
  ...
-rw-r--r--  MAINTAINERS | 2
-rw-r--r--  fs/Kconfig | 2
-rw-r--r--  fs/lockd/host.c | 26
-rw-r--r--  fs/lockd/svc.c | 39
-rw-r--r--  fs/lockd/svc4proc.c | 20
-rw-r--r--  fs/lockd/svclock.c | 20
-rw-r--r--  fs/lockd/svcproc.c | 22
-rw-r--r--  fs/lockd/svcsubs.c | 2
-rw-r--r--  fs/nfs/callback.c | 4
-rw-r--r--  fs/nfsd/auth.h (renamed from include/linux/nfsd/auth.h) | 5
-rw-r--r--  fs/nfsd/export.c | 20
-rw-r--r--  fs/nfsd/nfs2acl.c | 7
-rw-r--r--  fs/nfsd/nfs3xdr.c | 21
-rw-r--r--  fs/nfsd/nfs4callback.c | 92
-rw-r--r--  fs/nfsd/nfs4idmap.c | 28
-rw-r--r--  fs/nfsd/nfs4proc.c | 2
-rw-r--r--  fs/nfsd/nfs4state.c | 257
-rw-r--r--  fs/nfsd/nfs4xdr.c | 36
-rw-r--r--  fs/nfsd/nfscache.c | 28
-rw-r--r--  fs/nfsd/nfsctl.c | 124
-rw-r--r--  fs/nfsd/nfsfh.c | 1
-rw-r--r--  fs/nfsd/nfssvc.c | 8
-rw-r--r--  fs/nfsd/nfsxdr.c | 9
-rw-r--r--  fs/nfsd/vfs.c | 51
-rw-r--r--  include/linux/lockd/lockd.h | 9
-rw-r--r--  include/linux/lockd/xdr.h | 4
-rw-r--r--  include/linux/nfsd/Kbuild | 1
-rw-r--r--  include/linux/nfsd/cache.h | 4
-rw-r--r--  include/linux/nfsd/export.h | 2
-rw-r--r--  include/linux/nfsd/nfsd.h | 5
-rw-r--r--  include/linux/nfsd/syscall.h | 1
-rw-r--r--  include/linux/nfsd/xdr.h | 14
-rw-r--r--  include/linux/nfsd/xdr3.h | 16
-rw-r--r--  include/linux/nfsd/xdr4.h | 2
-rw-r--r--  include/linux/nfsd_idmap.h | 11
-rw-r--r--  include/linux/sunrpc/cache.h | 4
-rw-r--r--  include/linux/sunrpc/debug.h | 2
-rw-r--r--  include/linux/sunrpc/svc.h | 10
-rw-r--r--  include/linux/sunrpc/svc_rdma.h | 262
-rw-r--r--  include/linux/sunrpc/svc_xprt.h | 159
-rw-r--r--  include/linux/sunrpc/svcsock.h | 43
-rw-r--r--  include/linux/sunrpc/xdr.h | 3
-rw-r--r--  net/sunrpc/Makefile | 3
-rw-r--r--  net/sunrpc/auth_gss/svcauth_gss.c | 93
-rw-r--r--  net/sunrpc/cache.c | 152
-rw-r--r--  net/sunrpc/stats.c | 7
-rw-r--r--  net/sunrpc/sunrpc_syms.c | 52
-rw-r--r--  net/sunrpc/svc.c | 90
-rw-r--r--  net/sunrpc/svc_xprt.c | 1055
-rw-r--r--  net/sunrpc/svcauth.c | 6
-rw-r--r--  net/sunrpc/svcauth_unix.c | 59
-rw-r--r--  net/sunrpc/svcsock.c | 1311
-rw-r--r--  net/sunrpc/sysctl.c | 31
-rw-r--r--  net/sunrpc/xdr.c | 8
-rw-r--r--  net/sunrpc/xprtrdma/Makefile | 5
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma.c | 266
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_marshal.c | 412
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 586
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_sendto.c | 520
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_transport.c | 1080
60 files changed, 5450 insertions, 1664 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 91082e60d289..6cae13718925 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2247,7 +2247,7 @@ P: J. Bruce Fields
 M: bfields@fieldses.org
 P: Neil Brown
 M: neilb@suse.de
-L: nfs@lists.sourceforge.net
+L: linux-nfs@vger.kernel.org
 W: http://nfs.sourceforge.net/
 S: Supported
 
diff --git a/fs/Kconfig b/fs/Kconfig
index 219ec06a8c7e..987b5d7cb21a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1674,6 +1674,8 @@ config NFSD
 	select CRYPTO_MD5 if NFSD_V4
 	select CRYPTO if NFSD_V4
 	select FS_POSIX_ACL if NFSD_V4
+	select PROC_FS if NFSD_V4
+	select PROC_FS if SUNRPC_GSS
 	help
 	  If you want your Linux box to act as an NFS *server*, so that other
 	  computers on your local network which support NFS can access certain
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 572601e98dcd..ca6b16fc3101 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -34,10 +34,10 @@ static DEFINE_MUTEX(nlm_host_mutex);
 
 static void nlm_gc_hosts(void);
 static struct nsm_handle * __nsm_find(const struct sockaddr_in *,
-				const char *, int, int);
+				const char *, unsigned int, int);
 static struct nsm_handle * nsm_find(const struct sockaddr_in *sin,
 				const char *hostname,
-				int hostname_len);
+				unsigned int hostname_len);
 
 /*
  * Common host lookup routine for server & client
@@ -45,7 +45,8 @@ static struct nsm_handle * nsm_find(const struct sockaddr_in *sin,
 static struct nlm_host *
 nlm_lookup_host(int server, const struct sockaddr_in *sin,
 		int proto, int version, const char *hostname,
-		int hostname_len, const struct sockaddr_in *ssin)
+		unsigned int hostname_len,
+		const struct sockaddr_in *ssin)
 {
 	struct hlist_head *chain;
 	struct hlist_node *pos;
@@ -176,7 +177,7 @@ nlm_destroy_host(struct nlm_host *host)
  */
 struct nlm_host *
 nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version,
-			const char *hostname, int hostname_len)
+			const char *hostname, unsigned int hostname_len)
 {
 	struct sockaddr_in ssin = {0};
 
@@ -189,7 +190,7 @@ nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version,
  */
 struct nlm_host *
 nlmsvc_lookup_host(struct svc_rqst *rqstp,
-			const char *hostname, int hostname_len)
+			const char *hostname, unsigned int hostname_len)
 {
 	struct sockaddr_in ssin = {0};
 
@@ -307,7 +308,8 @@ void nlm_release_host(struct nlm_host *host)
  * Release all resources held by that peer.
  */
 void nlm_host_rebooted(const struct sockaddr_in *sin,
-				const char *hostname, int hostname_len,
+				const char *hostname,
+				unsigned int hostname_len,
 				u32 new_state)
 {
 	struct hlist_head *chain;
@@ -377,8 +379,13 @@ nlm_shutdown_hosts(void)
 	/* First, make all hosts eligible for gc */
 	dprintk("lockd: nuking all hosts...\n");
 	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
-		hlist_for_each_entry(host, pos, chain, h_hash)
+		hlist_for_each_entry(host, pos, chain, h_hash) {
 			host->h_expires = jiffies - 1;
+			if (host->h_rpcclnt) {
+				rpc_shutdown_client(host->h_rpcclnt);
+				host->h_rpcclnt = NULL;
+			}
+		}
 	}
 
 	/* Then, perform a garbage collection pass */
@@ -449,7 +456,7 @@ static DEFINE_MUTEX(nsm_mutex);
 
 static struct nsm_handle *
 __nsm_find(const struct sockaddr_in *sin,
-		const char *hostname, int hostname_len,
+		const char *hostname, unsigned int hostname_len,
 		int create)
 {
 	struct nsm_handle *nsm = NULL;
@@ -503,7 +510,8 @@ out:
 }
 
 static struct nsm_handle *
-nsm_find(const struct sockaddr_in *sin, const char *hostname, int hostname_len)
+nsm_find(const struct sockaddr_in *sin, const char *hostname,
+	 unsigned int hostname_len)
 {
 	return __nsm_find(sin, hostname, hostname_len, 1);
 }
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 82e2192a0d5c..08226464e563 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -219,19 +219,6 @@ lockd(struct svc_rqst *rqstp)
 	module_put_and_exit(0);
 }
 
-
-static int find_socket(struct svc_serv *serv, int proto)
-{
-	struct svc_sock *svsk;
-	int found = 0;
-	list_for_each_entry(svsk, &serv->sv_permsocks, sk_list)
-		if (svsk->sk_sk->sk_protocol == proto) {
-			found = 1;
-			break;
-		}
-	return found;
-}
-
 /*
  * Make any sockets that are needed but not present.
  * If nlm_udpport or nlm_tcpport were set as module
@@ -240,17 +227,25 @@ static int find_socket(struct svc_serv *serv, int proto)
 static int make_socks(struct svc_serv *serv, int proto)
 {
 	static int warned;
+	struct svc_xprt *xprt;
 	int err = 0;
 
-	if (proto == IPPROTO_UDP || nlm_udpport)
-		if (!find_socket(serv, IPPROTO_UDP))
-			err = svc_makesock(serv, IPPROTO_UDP, nlm_udpport,
-						SVC_SOCK_DEFAULTS);
-	if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport))
-		if (!find_socket(serv, IPPROTO_TCP))
-			err = svc_makesock(serv, IPPROTO_TCP, nlm_tcpport,
-						SVC_SOCK_DEFAULTS);
-
+	if (proto == IPPROTO_UDP || nlm_udpport) {
+		xprt = svc_find_xprt(serv, "udp", 0, 0);
+		if (!xprt)
+			err = svc_create_xprt(serv, "udp", nlm_udpport,
+					      SVC_SOCK_DEFAULTS);
+		else
+			svc_xprt_put(xprt);
+	}
+	if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) {
+		xprt = svc_find_xprt(serv, "tcp", 0, 0);
+		if (!xprt)
+			err = svc_create_xprt(serv, "tcp", nlm_tcpport,
+					      SVC_SOCK_DEFAULTS);
+		else
+			svc_xprt_put(xprt);
+	}
 	if (err >= 0) {
 		warned = 0;
 		err = 0;
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index bf27b6c6cb6b..385437e3387d 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -84,6 +84,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 {
 	struct nlm_host *host;
 	struct nlm_file *file;
+	int rc = rpc_success;
 
 	dprintk("lockd: TEST4 called\n");
 	resp->cookie = argp->cookie;
@@ -91,7 +92,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 	/* Don't accept test requests during grace period */
 	if (nlmsvc_grace_period) {
 		resp->status = nlm_lck_denied_grace_period;
-		return rpc_success;
+		return rc;
 	}
 
 	/* Obtain client and file */
@@ -101,12 +102,13 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 	/* Now check for conflicting locks */
 	resp->status = nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie);
 	if (resp->status == nlm_drop_reply)
-		return rpc_drop_reply;
+		rc = rpc_drop_reply;
+	else
+		dprintk("lockd: TEST4 status %d\n", ntohl(resp->status));
 
-	dprintk("lockd: TEST4 status %d\n", ntohl(resp->status));
 	nlm_release_host(host);
 	nlm_release_file(file);
-	return rpc_success;
+	return rc;
 }
 
 static __be32
@@ -115,6 +117,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 {
 	struct nlm_host *host;
 	struct nlm_file *file;
+	int rc = rpc_success;
 
 	dprintk("lockd: LOCK called\n");
 
@@ -123,7 +126,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 	/* Don't accept new lock requests during grace period */
 	if (nlmsvc_grace_period && !argp->reclaim) {
 		resp->status = nlm_lck_denied_grace_period;
-		return rpc_success;
+		return rc;
 	}
 
 	/* Obtain client and file */
@@ -146,12 +149,13 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->status = nlmsvc_lock(rqstp, file, &argp->lock,
 					argp->block, &argp->cookie);
 	if (resp->status == nlm_drop_reply)
-		return rpc_drop_reply;
+		rc = rpc_drop_reply;
+	else
+		dprintk("lockd: LOCK status %d\n", ntohl(resp->status));
 
-	dprintk("lockd: LOCK status %d\n", ntohl(resp->status));
 	nlm_release_host(host);
 	nlm_release_file(file);
-	return rpc_success;
+	return rc;
 }
 
 static __be32
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index d120ec39bcb0..2f4d8fa66689 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -501,25 +501,29 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 				block, block->b_flags, block->b_fl);
 		if (block->b_flags & B_TIMED_OUT) {
 			nlmsvc_unlink_block(block);
-			return nlm_lck_denied;
+			ret = nlm_lck_denied;
+			goto out;
 		}
 		if (block->b_flags & B_GOT_CALLBACK) {
+			nlmsvc_unlink_block(block);
 			if (block->b_fl != NULL
 					&& block->b_fl->fl_type != F_UNLCK) {
 				lock->fl = *block->b_fl;
 				goto conf_lock;
-			}
-			else {
-				nlmsvc_unlink_block(block);
-				return nlm_granted;
+			} else {
+				ret = nlm_granted;
+				goto out;
 			}
 		}
-		return nlm_drop_reply;
+		ret = nlm_drop_reply;
+		goto out;
 	}
 
 	error = vfs_test_lock(file->f_file, &lock->fl);
-	if (error == -EINPROGRESS)
-		return nlmsvc_defer_lock_rqst(rqstp, block);
+	if (error == -EINPROGRESS) {
+		ret = nlmsvc_defer_lock_rqst(rqstp, block);
+		goto out;
+	}
 	if (error) {
 		ret = nlm_lck_denied_nolocks;
 		goto out;
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 9cd5c8b37593..88379cc6e0b1 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -113,6 +113,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 {
 	struct nlm_host *host;
 	struct nlm_file *file;
+	int rc = rpc_success;
 
 	dprintk("lockd: TEST called\n");
 	resp->cookie = argp->cookie;
@@ -120,7 +121,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 	/* Don't accept test requests during grace period */
 	if (nlmsvc_grace_period) {
 		resp->status = nlm_lck_denied_grace_period;
-		return rpc_success;
+		return rc;
 	}
 
 	/* Obtain client and file */
@@ -130,13 +131,14 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 	/* Now check for conflicting locks */
 	resp->status = cast_status(nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie));
 	if (resp->status == nlm_drop_reply)
-		return rpc_drop_reply;
+		rc = rpc_drop_reply;
+	else
+		dprintk("lockd: TEST status %d vers %d\n",
+			ntohl(resp->status), rqstp->rq_vers);
 
-	dprintk("lockd: TEST status %d vers %d\n",
-		ntohl(resp->status), rqstp->rq_vers);
 	nlm_release_host(host);
 	nlm_release_file(file);
-	return rpc_success;
+	return rc;
 }
 
 static __be32
@@ -145,6 +147,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 {
 	struct nlm_host *host;
 	struct nlm_file *file;
+	int rc = rpc_success;
 
 	dprintk("lockd: LOCK called\n");
 
@@ -153,7 +156,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 	/* Don't accept new lock requests during grace period */
 	if (nlmsvc_grace_period && !argp->reclaim) {
 		resp->status = nlm_lck_denied_grace_period;
-		return rpc_success;
+		return rc;
 	}
 
 	/* Obtain client and file */
@@ -176,12 +179,13 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->status = cast_status(nlmsvc_lock(rqstp, file, &argp->lock,
 					       argp->block, &argp->cookie));
 	if (resp->status == nlm_drop_reply)
-		return rpc_drop_reply;
+		rc = rpc_drop_reply;
+	else
+		dprintk("lockd: LOCK status %d\n", ntohl(resp->status));
 
-	dprintk("lockd: LOCK status %d\n", ntohl(resp->status));
 	nlm_release_host(host);
 	nlm_release_file(file);
-	return rpc_success;
+	return rc;
 }
 
 static __be32
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 84ebba33b98d..dbbefbcd6712 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -87,7 +87,7 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
 	unsigned int hash;
 	__be32 nfserr;
 
-	nlm_debug_print_fh("nlm_file_lookup", f);
+	nlm_debug_print_fh("nlm_lookup_file", f);
 
 	hash = file_hash(f);
 
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 9b6bbf1b9787..bd185a572a23 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -119,8 +119,8 @@ int nfs_callback_up(void)
 	if (!serv)
 		goto out_err;
 
-	ret = svc_makesock(serv, IPPROTO_TCP, nfs_callback_set_tcpport,
-				SVC_SOCK_ANONYMOUS);
+	ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport,
+				SVC_SOCK_ANONYMOUS);
 	if (ret <= 0)
 		goto out_destroy;
 	nfs_callback_tcpport = ret;
diff --git a/include/linux/nfsd/auth.h b/fs/nfsd/auth.h
index 0fb9f7212195..78b3c0e93822 100644
--- a/include/linux/nfsd/auth.h
+++ b/fs/nfsd/auth.h
@@ -1,6 +1,4 @@
 /*
- * include/linux/nfsd/auth.h
- *
  * nfsd-specific authentication stuff.
  * uid/gid mapping not yet implemented.
  *
@@ -10,8 +8,6 @@
 #ifndef LINUX_NFSD_AUTH_H
 #define LINUX_NFSD_AUTH_H
 
-#ifdef __KERNEL__
-
 #define nfsd_luid(rq, uid) ((u32)(uid))
 #define nfsd_lgid(rq, gid) ((u32)(gid))
 #define nfsd_ruid(rq, uid) ((u32)(uid))
@@ -23,5 +19,4 @@
  */
 int nfsd_setuser(struct svc_rqst *, struct svc_export *);
 
-#endif /* __KERNEL__ */
 #endif /* LINUX_NFSD_AUTH_H */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 66d0aeb32a47..79b4bf812960 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1357,8 +1357,6 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
 	mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
 
 	exp = rqst_exp_find(rqstp, FSID_NUM, fsidv);
-	if (PTR_ERR(exp) == -ENOENT)
-		return nfserr_perm;
 	if (IS_ERR(exp))
 		return nfserrno(PTR_ERR(exp));
 	rv = fh_compose(fhp, exp, exp->ex_dentry, NULL);
@@ -1637,13 +1635,19 @@ exp_verify_string(char *cp, int max)
 /*
  * Initialize the exports module.
  */
-void
+int
 nfsd_export_init(void)
 {
+	int rv;
 	dprintk("nfsd: initializing export module.\n");
 
-	cache_register(&svc_export_cache);
-	cache_register(&svc_expkey_cache);
+	rv = cache_register(&svc_export_cache);
+	if (rv)
+		return rv;
+	rv = cache_register(&svc_expkey_cache);
+	if (rv)
+		cache_unregister(&svc_export_cache);
+	return rv;
 
 }
 
@@ -1670,10 +1674,8 @@ nfsd_export_shutdown(void)
 
 	exp_writelock();
 
-	if (cache_unregister(&svc_expkey_cache))
-		printk(KERN_ERR "nfsd: failed to unregister expkey cache\n");
-	if (cache_unregister(&svc_export_cache))
-		printk(KERN_ERR "nfsd: failed to unregister export cache\n");
+	cache_unregister(&svc_expkey_cache);
+	cache_unregister(&svc_export_cache);
 	svcauth_unix_purge();
 
 	exp_writeunlock();
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 0e5fa11e6b44..1c3b7654e966 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -221,12 +221,17 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
 				struct nfsd3_getaclres *resp)
 {
 	struct dentry *dentry = resp->fh.fh_dentry;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode;
 	struct kvec *head = rqstp->rq_res.head;
 	unsigned int base;
 	int n;
 	int w;
 
+	/*
+	 * Since this is version 2, the check for nfserr in
+	 * nfsd_dispatch actually ensures the following cannot happen.
+	 * However, it seems fragile to depend on that.
+	 */
 	if (dentry == NULL || dentry->d_inode == NULL)
 		return 0;
 	inode = dentry->d_inode;
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f917fd25858a..d7647f70e02b 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -21,6 +21,7 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/nfsd/nfsd.h>
 #include <linux/nfsd/xdr3.h>
+#include "auth.h"
 
 #define NFSDDBG_FACILITY NFSDDBG_XDR
 
@@ -88,10 +89,10 @@ encode_fh(__be32 *p, struct svc_fh *fhp)
  * no slashes or null bytes.
  */
 static __be32 *
-decode_filename(__be32 *p, char **namp, int *lenp)
+decode_filename(__be32 *p, char **namp, unsigned int *lenp)
 {
 	char *name;
-	int i;
+	unsigned int i;
 
 	if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS3_MAXNAMLEN)) != NULL) {
 		for (i = 0, name = *namp; i < *lenp; i++, name++) {
@@ -452,8 +453,7 @@ int
 nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_symlinkargs *args)
 {
-	unsigned int len;
-	int avail;
+	unsigned int len, avail;
 	char *old, *new;
 	struct kvec *vec;
 
@@ -486,7 +486,8 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
 	/* now copy next page if there is one */
 	if (len && !avail && rqstp->rq_arg.page_len) {
 		avail = rqstp->rq_arg.page_len;
-		if (avail > PAGE_SIZE) avail = PAGE_SIZE;
+		if (avail > PAGE_SIZE)
+			avail = PAGE_SIZE;
 		old = page_address(rqstp->rq_arg.pages[0]);
 	}
 	while (len && avail && *old) {
@@ -816,11 +817,11 @@ static __be32 *
 encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p,
 		struct svc_fh *fhp)
 {
-		p = encode_post_op_attr(cd->rqstp, p, fhp);
-		*p++ = xdr_one;		/* yes, a file handle follows */
-		p = encode_fh(p, fhp);
-		fh_put(fhp);
-		return p;
+	p = encode_post_op_attr(cd->rqstp, p, fhp);
+	*p++ = xdr_one;			/* yes, a file handle follows */
+	p = encode_fh(p, fhp);
+	fh_put(fhp);
+	return p;
 }
 
 static int
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 9d536a8cb379..aae2b29ae2c9 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -350,30 +350,6 @@ static struct rpc_version * nfs_cb_version[] = {
 static int do_probe_callback(void *data)
 {
 	struct nfs4_client *clp = data;
-	struct nfs4_callback *cb = &clp->cl_callback;
-	struct rpc_message msg = {
-		.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
-		.rpc_argp = clp,
-	};
-	int status;
-
-	status = rpc_call_sync(cb->cb_client, &msg, RPC_TASK_SOFT);
-
-	if (status) {
-		rpc_shutdown_client(cb->cb_client);
-		cb->cb_client = NULL;
-	} else
-		atomic_set(&cb->cb_set, 1);
-	put_nfs4_client(clp);
-	return 0;
-}
-
-/*
- * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
- */
-void
-nfsd4_probe_callback(struct nfs4_client *clp)
-{
 	struct sockaddr_in addr;
 	struct nfs4_callback *cb = &clp->cl_callback;
 	struct rpc_timeout timeparms = {
@@ -390,13 +366,15 @@ nfsd4_probe_callback(struct nfs4_client *clp)
 		.timeout = &timeparms,
 		.program = program,
 		.version = nfs_cb_version[1]->number,
 		.authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
 		.flags = (RPC_CLNT_CREATE_NOPING),
 	};
-	struct task_struct *t;
-
-	if (atomic_read(&cb->cb_set))
-		return;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
+		.rpc_argp = clp,
+	};
+	struct rpc_clnt *client;
+	int status;
 
 	/* Initialize address */
 	memset(&addr, 0, sizeof(addr));
@@ -416,29 +394,50 @@ nfsd4_probe_callback(struct nfs4_client *clp)
 	program->stats->program = program;
 
 	/* Create RPC client */
-	cb->cb_client = rpc_create(&args);
-	if (IS_ERR(cb->cb_client)) {
+	client = rpc_create(&args);
+	if (IS_ERR(client)) {
 		dprintk("NFSD: couldn't create callback client\n");
+		status = PTR_ERR(client);
 		goto out_err;
 	}
 
+	status = rpc_call_sync(client, &msg, RPC_TASK_SOFT);
+
+	if (status)
+		goto out_release_client;
+
+	cb->cb_client = client;
+	atomic_set(&cb->cb_set, 1);
+	put_nfs4_client(clp);
+	return 0;
+out_release_client:
+	rpc_shutdown_client(client);
+out_err:
+	put_nfs4_client(clp);
+	dprintk("NFSD: warning: no callback path to client %.*s\n",
+		(int)clp->cl_name.len, clp->cl_name.data);
+	return status;
+}
+
+/*
+ * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
+ */
+void
+nfsd4_probe_callback(struct nfs4_client *clp)
+{
+	struct task_struct *t;
+
+	BUG_ON(atomic_read(&clp->cl_callback.cb_set));
+
 	/* the task holds a reference to the nfs4_client struct */
 	atomic_inc(&clp->cl_count);
 
 	t = kthread_run(do_probe_callback, clp, "nfs4_cb_probe");
 
 	if (IS_ERR(t))
-		goto out_release_clp;
+		atomic_dec(&clp->cl_count);
 
 	return;
-
-out_release_clp:
-	atomic_dec(&clp->cl_count);
-	rpc_shutdown_client(cb->cb_client);
-out_err:
-	cb->cb_client = NULL;
-	dprintk("NFSD: warning: no callback path to client %.*s\n",
-		(int)clp->cl_name.len, clp->cl_name.data);
 }
 
 /*
@@ -458,9 +457,6 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
 	int retries = 1;
 	int status = 0;
 
-	if ((!atomic_read(&clp->cl_callback.cb_set)) || !clnt)
-		return;
-
 	cbr->cbr_trunc = 0; /* XXX need to implement truncate optimization */
 	cbr->cbr_dp = dp;
 
@@ -469,6 +465,7 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
 	switch (status) {
 		case -EIO:
 			/* Network partition? */
+			atomic_set(&clp->cl_callback.cb_set, 0);
 		case -EBADHANDLE:
 		case -NFS4ERR_BAD_STATEID:
 			/* Race: client probably got cb_recall
@@ -481,11 +478,10 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
 		status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT);
 	}
 out_put_cred:
-	if (status == -EIO)
-		atomic_set(&clp->cl_callback.cb_set, 0);
-	/* Success or failure, now we're either waiting for lease expiration
-	 * or deleg_return. */
-	dprintk("NFSD: nfs4_cb_recall: dp %p dl_flock %p dl_count %d\n",dp, dp->dl_flock, atomic_read(&dp->dl_count));
+	/*
+	 * Success or failure, now we're either waiting for lease expiration
+	 * or deleg_return.
+	 */
 	put_nfs4_client(clp);
 	nfs4_put_delegation(dp);
 	return;
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 4c0c683ce07a..996bd88b75ba 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -255,13 +255,10 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
 		goto out;
 	if (len == 0)
 		set_bit(CACHE_NEGATIVE, &ent.h.flags);
-	else {
-		if (error >= IDMAP_NAMESZ) {
-			error = -EINVAL;
-			goto out;
-		}
+	else if (len >= IDMAP_NAMESZ)
+		goto out;
+	else
 		memcpy(ent.name, buf1, sizeof(ent.name));
-	}
 	error = -ENOMEM;
 	res = idtoname_update(&ent, res);
 	if (res == NULL)
@@ -467,20 +464,25 @@ nametoid_update(struct ent *new, struct ent *old)
  * Exported API
  */
 
-void
+int
 nfsd_idmap_init(void)
 {
-	cache_register(&idtoname_cache);
-	cache_register(&nametoid_cache);
+	int rv;
+
+	rv = cache_register(&idtoname_cache);
+	if (rv)
+		return rv;
+	rv = cache_register(&nametoid_cache);
+	if (rv)
+		cache_unregister(&idtoname_cache);
+	return rv;
 }
 
 void
 nfsd_idmap_shutdown(void)
 {
-	if (cache_unregister(&idtoname_cache))
-		printk(KERN_ERR "nfsd: failed to unregister idtoname cache\n");
-	if (cache_unregister(&nametoid_cache))
-		printk(KERN_ERR "nfsd: failed to unregister nametoid cache\n");
+	cache_unregister(&idtoname_cache);
+	cache_unregister(&nametoid_cache);
 }
 
 /*
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 18ead1790bb3..c593db047d8b 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -750,7 +750,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 				    cstate->current_fh.fh_export,
 				    cstate->current_fh.fh_dentry, buf,
 				    &count, verify->ve_bmval,
-				    rqstp);
+				    rqstp, 0);
 
 	/* this means that nfsd4_encode_fattr() ran out of space */
 	if (status == nfserr_resource && count == 0)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 31673cd251c3..f6744bc03dae 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -61,7 +61,6 @@ static time_t lease_time = 90; /* default lease time */
 static time_t user_lease_time = 90;
 static time_t boot_time;
 static int in_grace = 1;
-static u32 current_clientid = 1;
 static u32 current_ownerid = 1;
 static u32 current_fileid = 1;
 static u32 current_delegid = 1;
@@ -340,21 +339,20 @@ STALE_CLIENTID(clientid_t *clid)
  * This type of memory management is somewhat inefficient, but we use it
  * anyway since SETCLIENTID is not a common operation.
  */
-static inline struct nfs4_client *
-alloc_client(struct xdr_netobj name)
+static struct nfs4_client *alloc_client(struct xdr_netobj name)
 {
 	struct nfs4_client *clp;
 
-	if ((clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL))!= NULL) {
-		if ((clp->cl_name.data = kmalloc(name.len, GFP_KERNEL)) != NULL) {
-			memcpy(clp->cl_name.data, name.data, name.len);
-			clp->cl_name.len = name.len;
-		}
-		else {
-			kfree(clp);
-			clp = NULL;
-		}
+	clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL);
+	if (clp == NULL)
+		return NULL;
+	clp->cl_name.data = kmalloc(name.len, GFP_KERNEL);
+	if (clp->cl_name.data == NULL) {
+		kfree(clp);
+		return NULL;
 	}
+	memcpy(clp->cl_name.data, name.data, name.len);
+	clp->cl_name.len = name.len;
 	return clp;
 }
 
@@ -363,8 +361,11 @@ shutdown_callback_client(struct nfs4_client *clp)
 {
 	struct rpc_clnt *clnt = clp->cl_callback.cb_client;
 
-	/* shutdown rpc client, ending any outstanding recall rpcs */
 	if (clnt) {
+		/*
+		 * Callback threads take a reference on the client, so there
+		 * should be no outstanding callbacks at this point.
+		 */
 		clp->cl_callback.cb_client = NULL;
 		rpc_shutdown_client(clnt);
 	}
@@ -422,12 +423,13 @@ expire_client(struct nfs4_client *clp)
 	put_nfs4_client(clp);
 }
 
-static struct nfs4_client *
-create_client(struct xdr_netobj name, char *recdir) {
+static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
+{
 	struct nfs4_client *clp;
 
-	if (!(clp = alloc_client(name)))
-		goto out;
+	clp = alloc_client(name);
+	if (clp == NULL)
+		return NULL;
 	memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
 	atomic_set(&clp->cl_count, 1);
 	atomic_set(&clp->cl_callback.cb_set, 0);
@@ -436,32 +438,30 @@ create_client(struct xdr_netobj name, char *recdir) {
 	INIT_LIST_HEAD(&clp->cl_openowners);
 	INIT_LIST_HEAD(&clp->cl_delegations);
 	INIT_LIST_HEAD(&clp->cl_lru);
-out:
 	return clp;
 }
 
-static void
-copy_verf(struct nfs4_client *target, nfs4_verifier *source) {
-	memcpy(target->cl_verifier.data, source->data, sizeof(target->cl_verifier.data));
+static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
+{
+	memcpy(target->cl_verifier.data, source->data,
+			sizeof(target->cl_verifier.data));
 }
 
-static void
-copy_clid(struct nfs4_client *target, struct nfs4_client *source) {
+static void copy_clid(struct nfs4_client *target, struct nfs4_client *source)
+{
 	target->cl_clientid.cl_boot = source->cl_clientid.cl_boot;
 	target->cl_clientid.cl_id = source->cl_clientid.cl_id;
 }
 
-static void
-copy_cred(struct svc_cred *target, struct svc_cred *source) {
-
+static void copy_cred(struct svc_cred *target, struct svc_cred *source)
+{
 	target->cr_uid = source->cr_uid;
 	target->cr_gid = source->cr_gid;
 	target->cr_group_info = source->cr_group_info;
 	get_group_info(target->cr_group_info);
 }
 
-static inline int
-same_name(const char *n1, const char *n2)
+static int same_name(const char *n1, const char *n2)
 {
 	return 0 == memcmp(n1, n2, HEXDIR_LEN);
 }
@@ -485,26 +485,26 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
 	return cr1->cr_uid == cr2->cr_uid;
 }
 
-static void
-gen_clid(struct nfs4_client *clp) {
+static void gen_clid(struct nfs4_client *clp)
+{
+	static u32 current_clientid = 1;
+
 	clp->cl_clientid.cl_boot = boot_time;
 	clp->cl_clientid.cl_id = current_clientid++;
 }
 
-static void
-gen_confirm(struct nfs4_client *clp) {
-	struct timespec tv;
-	u32 * p;
+static void gen_confirm(struct nfs4_client *clp)
+{
+	static u32 i;
+	u32 *p;
 
-	tv = CURRENT_TIME;
 	p = (u32 *)clp->cl_confirm.data;
-	*p++ = tv.tv_sec;
-	*p++ = tv.tv_nsec;
+	*p++ = get_seconds();
+	*p++ = i++;
 }
 
-static int
-check_name(struct xdr_netobj name) {
-
+static int check_name(struct xdr_netobj name)
+{
 	if (name.len == 0)
 		return 0;
 	if (name.len > NFS4_OPAQUE_LIMIT) {
@@ -683,39 +683,6 @@ out_err:
 	return;
 }
 
-/*
- * RFC 3010 has a complex implmentation description of processing a
- * SETCLIENTID request consisting of 5 bullets, labeled as
- * CASE0 - CASE4 below.
- *
- * NOTES:
- *	callback information will be processed in a future patch
- *
- *	an unconfirmed record is added when:
- *	NORMAL (part of CASE 4): there is no confirmed nor unconfirmed record.
- *	CASE 1: confirmed record found with matching name, principal,
- *		verifier, and clientid.
- *	CASE 2: confirmed record found with matching name, principal,
- *		and there is no unconfirmed record with matching
- *		name and principal
- *
- *	an unconfirmed record is replaced when:
- *	CASE 3: confirmed record found with matching name, principal,
- *		and an unconfirmed record is found with matching
- *		name, principal, and with clientid and
- *		confirm that does not match the confirmed record.
- *	CASE 4: there is no confirmed record with matching name and
- *		principal. there is an unconfirmed record with
- *		matching name, principal.
- *
- *	an unconfirmed record is deleted when:
- *	CASE 1: an unconfirmed record that matches input name, verifier,
- *		and confirmed clientid.
- *	CASE 4: any unconfirmed records with matching name and principal
- *		that exist after an unconfirmed record has been replaced
- *		as described above.
- *
- */
 __be32
 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_setclientid *setclid)
@@ -748,11 +715,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	nfs4_lock_state();
 	conf = find_confirmed_client_by_str(dname, strhashval);
 	if (conf) {
-		/*
-		 * CASE 0:
-		 * clname match, confirmed, different principal
-		 * or different ip_address
-		 */
+		/* RFC 3530 14.2.33 CASE 0: */
 		status = nfserr_clid_inuse;
 		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)
 				|| conf->cl_addr != sin->sin_addr.s_addr) {
@@ -761,12 +724,17 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			goto out;
 		}
 	}
+	/*
+	 * section 14.2.33 of RFC 3530 (under the heading "IMPLEMENTATION")
+	 * has a description of SETCLIENTID request processing consisting
+	 * of 5 bullet points, labeled as CASE0 - CASE4 below.
+	 */
 	unconf = find_unconfirmed_client_by_str(dname, strhashval);
 	status = nfserr_resource;
 	if (!conf) {
 		/*
-		 * CASE 4:
-		 * placed first, because it is the normal case.
+		 * RFC 3530 14.2.33 CASE 4:
+		 * placed first, because it is the normal case
 		 */
 		if (unconf)
 			expire_client(unconf);
@@ -776,17 +744,8 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		gen_clid(new);
 	} else if (same_verf(&conf->cl_verifier, &clverifier)) {
 		/*
-		 * CASE 1:
-		 * cl_name match, confirmed, principal match
-		 * verifier match: probable callback update
-		 *
-		 * remove any unconfirmed nfs4_client with
-		 * matching cl_name, cl_verifier, and cl_clientid
-		 *
-		 * create and insert an unconfirmed nfs4_client with same
-		 * cl_name, cl_verifier, and cl_clientid as existing
-		 * nfs4_client, but with the new callback info and a
-		 * new cl_confirm
+		 * RFC 3530 14.2.33 CASE 1:
+		 * probable callback update
 		 */
 		if (unconf) {
 			/* Note this is removing unconfirmed {*x***},
@@ -802,43 +761,25 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		copy_clid(new, conf);
 	} else if (!unconf) {
 		/*
-		 * CASE 2:
-		 * clname match, confirmed, principal match
-		 * verfier does not match
-		 * no unconfirmed. create a new unconfirmed nfs4_client
-		 * using input clverifier, clname, and callback info
-		 * and generate a new cl_clientid and cl_confirm.
+		 * RFC 3530 14.2.33 CASE 2:
+		 * probable client reboot; state will be removed if
+		 * confirmed.
 		 */
 		new = create_client(clname, dname);
 		if (new == NULL)
 			goto out;
 		gen_clid(new);
-	} else if (!same_verf(&conf->cl_confirm, &unconf->cl_confirm)) {
+	} else {
 		/*
-		 * CASE3:
-		 * confirmed found (name, principal match)
-		 * confirmed verifier does not match input clverifier
-		 *
-		 * unconfirmed found (name match)
-		 * confirmed->cl_confirm != unconfirmed->cl_confirm
-		 *
-		 * remove unconfirmed.
-		 *
-		 * create an unconfirmed nfs4_client
-		 * with same cl_name as existing confirmed nfs4_client,
-		 * but with new callback info, new cl_clientid,
-		 * new cl_verifier and a new cl_confirm
+		 * RFC 3530 14.2.33 CASE 3:
+		 * probable client reboot; state will be removed if
+		 * confirmed.
 		 */
 		expire_client(unconf);
 		new = create_client(clname, dname);
 		if (new == NULL)
 			goto out;
 		gen_clid(new);
-	} else {
-		/* No cases hit !!! */
-		status = nfserr_inval;
-		goto out;
-
 	}
 	copy_verf(new, &clverifier);
 	new->cl_addr = sin->sin_addr.s_addr;
@@ -857,11 +798,9 @@ out:
 
 
 /*
- * RFC 3010 has a complex implmentation description of processing a
- * SETCLIENTID_CONFIRM request consisting of 4 bullets describing
- * processing on a DRC miss, labeled as CASE1 - CASE4 below.
- *
- * NOTE: callback information will be processed here in a future patch
+ * Section 14.2.34 of RFC 3530 (under the heading "IMPLEMENTATION") has
+ * a description of SETCLIENTID_CONFIRM request processing consisting of 4
+ * bullets, labeled as CASE1 - CASE4 below.
  */
 __be32
 nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
@@ -892,16 +831,16 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 	if (unconf && unconf->cl_addr != sin->sin_addr.s_addr)
 		goto out;
 
-	if ((conf && unconf) &&
-	    (same_verf(&unconf->cl_confirm, &confirm)) &&
-	    (same_verf(&conf->cl_verifier, &unconf->cl_verifier)) &&
-	    (same_name(conf->cl_recdir,unconf->cl_recdir)) &&
-	    (!same_verf(&conf->cl_confirm, &unconf->cl_confirm))) {
-		/* CASE 1:
-		 * unconf record that matches input clientid and input confirm.
-		 * conf record that matches input clientid.
-		 * conf and unconf records match names, verifiers
+	/*
+	 * section 14.2.34 of RFC 3530 has a description of
+	 * SETCLIENTID_CONFIRM request processing consisting
+	 * of 4 bullet points, labeled as CASE1 - CASE4 below.
+	 */
+	if (conf && unconf && same_verf(&confirm, &unconf->cl_confirm)) {
+		/*
+		 * RFC 3530 14.2.34 CASE 1:
+		 * callback update
 		 */
 		if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
 			status = nfserr_clid_inuse;
 		else {
@@ -914,15 +853,11 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			status = nfs_ok;
 
 		}
-	} else if ((conf && !unconf) ||
-	    ((conf && unconf) &&
-	    (!same_verf(&conf->cl_verifier, &unconf->cl_verifier) ||
-	     !same_name(conf->cl_recdir, unconf->cl_recdir)))) {
-		/* CASE 2:
-		 * conf record that matches input clientid.
-		 * if unconf record matches input clientid, then
-		 * unconf->cl_name or unconf->cl_verifier don't match the
-		 * conf record.
+	} else if (conf && !unconf) {
+		/*
+		 * RFC 3530 14.2.34 CASE 2:
+		 * probable retransmitted request; play it safe and
+		 * do nothing.
 		 */
 		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred))
 			status = nfserr_clid_inuse;
@@ -930,10 +865,9 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			status = nfs_ok;
 	} else if (!conf && unconf
 			&& same_verf(&unconf->cl_confirm, &confirm)) {
-		/* CASE 3:
-		 * conf record not found.
-		 * unconf record found.
-		 * unconf->cl_confirm matches input confirm
+		/*
+		 * RFC 3530 14.2.34 CASE 3:
+		 * Normal case; new or rebooted client:
 		 */
 		if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred)) {
 			status = nfserr_clid_inuse;
@@ -948,16 +882,15 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			}
 			move_to_confirmed(unconf);
 			conf = unconf;
+			nfsd4_probe_callback(conf);
 			status = nfs_ok;
 		}
 	} else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
 	    && (!unconf || (unconf && !same_verf(&unconf->cl_confirm,
 				&confirm)))) {
-		/* CASE 4:
-		 * conf record not found, or if conf, conf->cl_confirm does not
-		 * match input confirm.
-		 * unconf record not found, or if unconf, unconf->cl_confirm
-		 * does not match input confirm.
+		/*
+		 * RFC 3530 14.2.34 CASE 4:
+		 * Client probably hasn't noticed that we rebooted yet.
 		 */
 		status = nfserr_stale_clientid;
 	} else {
@@ -965,8 +898,6 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 		status = nfserr_clid_inuse;
 	}
 out:
-	if (!status)
-		nfsd4_probe_callback(conf);
 	nfs4_unlock_state();
 	return status;
 }
@@ -1226,14 +1157,19 @@ find_file(struct inode *ino)
 	return NULL;
 }
 
-static int access_valid(u32 x)
+static inline int access_valid(u32 x)
 {
-	return (x > 0 && x < 4);
+	if (x < NFS4_SHARE_ACCESS_READ)
+		return 0;
+	if (x > NFS4_SHARE_ACCESS_BOTH)
+		return 0;
+	return 1;
 }
 
-static int deny_valid(u32 x)
+static inline int deny_valid(u32 x)
 {
-	return (x >= 0 && x < 5);
+	/* Note: unlike access bits, deny bits may be zero. */
+	return x <= NFS4_SHARE_DENY_BOTH;
 }
 
 static void
@@ -2162,8 +2098,10 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 		goto check_replay;
 	}
 
+	*stpp = stp;
+	*sopp = sop = stp->st_stateowner;
+
 	if (lock) {
-		struct nfs4_stateowner *sop = stp->st_stateowner;
 		clientid_t *lockclid = &lock->v.new.clientid;
 		struct nfs4_client *clp = sop->so_client;
 		int lkflg = 0;
@@ -2193,9 +2131,6 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 			return nfserr_bad_stateid;
 	}
 
-	*stpp = stp;
-	*sopp = sop = stp->st_stateowner;
-
 	/*
 	 * We now validate the seqid and stateid generation numbers.
 	 * For the moment, we ignore the possibility of
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 57333944af7f..b0592e7c378d 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -148,12 +148,12 @@ xdr_error: \
 	} \
 } while (0)
 
-static __be32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes)
+static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
 {
 	/* We want more bytes than seem to be available.
 	 * Maybe we need a new page, maybe we have just run out
 	 */
-	int avail = (char*)argp->end - (char*)argp->p;
+	unsigned int avail = (char *)argp->end - (char *)argp->p;
 	__be32 *p;
 	if (avail + argp->pagelen < nbytes)
 		return NULL;
@@ -169,6 +169,11 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes)
 		return NULL;
 
 	}
+	/*
+	 * The following memcpy is safe because read_buf is always
+	 * called with nbytes > avail, and the two cases above both
+	 * guarantee p points to at least nbytes bytes.
+	 */
 	memcpy(p, argp->p, avail);
 	/* step to next page */
 	argp->p = page_address(argp->pagelist[0]);
@@ -1448,7 +1453,7 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
 __be32
 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 		struct dentry *dentry, __be32 *buffer, int *countp, u32 *bmval,
-		struct svc_rqst *rqstp)
+		struct svc_rqst *rqstp, int ignore_crossmnt)
 {
 	u32 bmval0 = bmval[0];
 	u32 bmval1 = bmval[1];
@@ -1828,7 +1833,12 @@ out_acl:
 	if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
 		if ((buflen -= 8) < 0)
 			goto out_resource;
-		if (exp->ex_mnt->mnt_root->d_inode == dentry->d_inode) {
+		/*
+		 * Get parent's attributes if not ignoring crossmount
+		 * and this is the root of a cross-mounted filesystem.
+		 */
+		if (ignore_crossmnt == 0 &&
+		    exp->ex_mnt->mnt_root->d_inode == dentry->d_inode) {
 			err = vfs_getattr(exp->ex_mnt->mnt_parent,
 				exp->ex_mnt->mnt_mountpoint, &stat);
 			if (err)
@@ -1864,13 +1874,25 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
1864 struct svc_export *exp = cd->rd_fhp->fh_export; 1874 struct svc_export *exp = cd->rd_fhp->fh_export;
1865 struct dentry *dentry; 1875 struct dentry *dentry;
1866 __be32 nfserr; 1876 __be32 nfserr;
1877 int ignore_crossmnt = 0;
1867 1878
1868 dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); 1879 dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
1869 if (IS_ERR(dentry)) 1880 if (IS_ERR(dentry))
1870 return nfserrno(PTR_ERR(dentry)); 1881 return nfserrno(PTR_ERR(dentry));
1871 1882
1872 exp_get(exp); 1883 exp_get(exp);
1873 if (d_mountpoint(dentry)) { 1884 /*
1885 * In the case of a mountpoint, the client may be asking for
1886 * attributes that are only properties of the underlying filesystem
1887 * as opposed to the cross-mounted file system. In such a case,
 1888 * we will not follow the cross mount and will fill the attributes
1889 * directly from the mountpoint dentry.
1890 */
1891 if (d_mountpoint(dentry) &&
1892 (cd->rd_bmval[0] & ~FATTR4_WORD0_RDATTR_ERROR) == 0 &&
1893 (cd->rd_bmval[1] & ~FATTR4_WORD1_MOUNTED_ON_FILEID) == 0)
1894 ignore_crossmnt = 1;
1895 else if (d_mountpoint(dentry)) {
1874 int err; 1896 int err;
1875 1897
1876 /* 1898 /*
@@ -1889,7 +1911,7 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
1889 1911
1890 } 1912 }
1891 nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval, 1913 nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval,
1892 cd->rd_rqstp); 1914 cd->rd_rqstp, ignore_crossmnt);
1893out_put: 1915out_put:
1894 dput(dentry); 1916 dput(dentry);
1895 exp_put(exp); 1917 exp_put(exp);
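For readdir entries, crossing into the mounted-on filesystem is skipped only when every requested bit is covered by RDATTR_ERROR (word 0) and MOUNTED_ON_FILEID (word 1), i.e. attributes that can be answered from the mountpoint dentry itself. A small sketch of that subset test; the FATTR4_* bit positions are assumed from nfs4.h, not restated by this patch:

	/* Illustrative subset test; mask values assumed, check nfs4.h. */
	#define FATTR4_WORD0_RDATTR_ERROR      (1u << 11)
	#define FATTR4_WORD1_MOUNTED_ON_FILEID (1u << 23)

	static int only_mountpoint_attrs(unsigned int bm0, unsigned int bm1)
	{
		return (bm0 & ~FATTR4_WORD0_RDATTR_ERROR) == 0 &&
		       (bm1 & ~FATTR4_WORD1_MOUNTED_ON_FILEID) == 0;
	}
	/* only_mountpoint_attrs(1u << 11, 1u << 23) == 1, but any extra bit
	 * in either word forces the mount to be crossed as before. */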
@@ -2043,7 +2065,7 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
2043 buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2); 2065 buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2);
2044 nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry, 2066 nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry,
2045 resp->p, &buflen, getattr->ga_bmval, 2067 resp->p, &buflen, getattr->ga_bmval,
2046 resp->rqstp); 2068 resp->rqstp, 0);
2047 if (!nfserr) 2069 if (!nfserr)
2048 resp->p += buflen; 2070 resp->p += buflen;
2049 return nfserr; 2071 return nfserr;
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 578f2c9d56be..5bfc2ac60d54 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -44,17 +44,17 @@ static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
44 */ 44 */
45static DEFINE_SPINLOCK(cache_lock); 45static DEFINE_SPINLOCK(cache_lock);
46 46
47void 47int nfsd_reply_cache_init(void)
48nfsd_cache_init(void)
49{ 48{
50 struct svc_cacherep *rp; 49 struct svc_cacherep *rp;
51 int i; 50 int i;
52 51
53 INIT_LIST_HEAD(&lru_head); 52 INIT_LIST_HEAD(&lru_head);
54 i = CACHESIZE; 53 i = CACHESIZE;
55 while(i) { 54 while (i) {
56 rp = kmalloc(sizeof(*rp), GFP_KERNEL); 55 rp = kmalloc(sizeof(*rp), GFP_KERNEL);
57 if (!rp) break; 56 if (!rp)
57 goto out_nomem;
58 list_add(&rp->c_lru, &lru_head); 58 list_add(&rp->c_lru, &lru_head);
59 rp->c_state = RC_UNUSED; 59 rp->c_state = RC_UNUSED;
60 rp->c_type = RC_NOCACHE; 60 rp->c_type = RC_NOCACHE;
@@ -62,23 +62,19 @@ nfsd_cache_init(void)
62 i--; 62 i--;
63 } 63 }
64 64
65 if (i)
66 printk (KERN_ERR "nfsd: cannot allocate all %d cache entries, only got %d\n",
67 CACHESIZE, CACHESIZE-i);
68
69 hash_list = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL); 65 hash_list = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
70 if (!hash_list) { 66 if (!hash_list)
71 nfsd_cache_shutdown(); 67 goto out_nomem;
72 printk (KERN_ERR "nfsd: cannot allocate %Zd bytes for hash list\n",
73 HASHSIZE * sizeof(struct hlist_head));
74 return;
75 }
76 68
77 cache_disabled = 0; 69 cache_disabled = 0;
70 return 0;
71out_nomem:
72 printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
73 nfsd_reply_cache_shutdown();
74 return -ENOMEM;
78} 75}
79 76
80void 77void nfsd_reply_cache_shutdown(void)
81nfsd_cache_shutdown(void)
82{ 78{
83 struct svc_cacherep *rp; 79 struct svc_cacherep *rp;
84 80
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 77dc9893b7ba..8516137cdbb0 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -304,6 +304,9 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
304 struct auth_domain *dom; 304 struct auth_domain *dom;
305 struct knfsd_fh fh; 305 struct knfsd_fh fh;
306 306
307 if (size == 0)
308 return -EINVAL;
309
307 if (buf[size-1] != '\n') 310 if (buf[size-1] != '\n')
308 return -EINVAL; 311 return -EINVAL;
309 buf[size-1] = 0; 312 buf[size-1] = 0;
@@ -503,7 +506,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
503 int len = 0; 506 int len = 0;
504 lock_kernel(); 507 lock_kernel();
505 if (nfsd_serv) 508 if (nfsd_serv)
506 len = svc_sock_names(buf, nfsd_serv, NULL); 509 len = svc_xprt_names(nfsd_serv, buf, 0);
507 unlock_kernel(); 510 unlock_kernel();
508 return len; 511 return len;
509 } 512 }
@@ -540,7 +543,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
540 } 543 }
541 return err < 0 ? err : 0; 544 return err < 0 ? err : 0;
542 } 545 }
543 if (buf[0] == '-') { 546 if (buf[0] == '-' && isdigit(buf[1])) {
544 char *toclose = kstrdup(buf+1, GFP_KERNEL); 547 char *toclose = kstrdup(buf+1, GFP_KERNEL);
545 int len = 0; 548 int len = 0;
546 if (!toclose) 549 if (!toclose)
@@ -554,6 +557,53 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
554 kfree(toclose); 557 kfree(toclose);
555 return len; 558 return len;
556 } 559 }
560 /*
 561 * Add a transport listener by writing its transport name
562 */
563 if (isalpha(buf[0])) {
564 int err;
565 char transport[16];
566 int port;
567 if (sscanf(buf, "%15s %4d", transport, &port) == 2) {
568 err = nfsd_create_serv();
569 if (!err) {
570 err = svc_create_xprt(nfsd_serv,
571 transport, port,
572 SVC_SOCK_ANONYMOUS);
573 if (err == -ENOENT)
574 /* Give a reasonable perror msg for
575 * bad transport string */
576 err = -EPROTONOSUPPORT;
577 }
578 return err < 0 ? err : 0;
579 }
580 }
581 /*
 582 * Remove a transport by writing its transport name and port number
583 */
584 if (buf[0] == '-' && isalpha(buf[1])) {
585 struct svc_xprt *xprt;
586 int err = -EINVAL;
587 char transport[16];
588 int port;
589 if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) {
590 if (port == 0)
591 return -EINVAL;
592 lock_kernel();
593 if (nfsd_serv) {
594 xprt = svc_find_xprt(nfsd_serv, transport,
595 AF_UNSPEC, port);
596 if (xprt) {
597 svc_close_xprt(xprt);
598 svc_xprt_put(xprt);
599 err = 0;
600 } else
601 err = -ENOTCONN;
602 }
603 unlock_kernel();
604 return err < 0 ? err : 0;
605 }
606 }
557 return -EINVAL; 607 return -EINVAL;
558} 608}
559 609
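Taken together, write_ports() now accepts four request shapes: a socket fd to add, "-<port>" to close a socket, "<name> <port>" to create a listener through the transport switch, and "-<name> <port>" to find and close that listener again. Note that the "%4d" scan limits the port to four decimal digits in this version. A hedged user-space sketch, assuming the nfsd filesystem is mounted at /proc/fs/nfsd:

	/* Ask knfsd for a TCP listener on port 2049 via the portlist file. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char *req = "tcp 2049";
		int fd = open("/proc/fs/nfsd/portlist", O_WRONLY);

		if (fd < 0 || write(fd, req, strlen(req)) < 0)
			perror("portlist");
		if (fd >= 0)
			close(fd);
		return 0;
	}

Writing "-tcp 2049" through the same file would tear the listener back down.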
@@ -616,7 +666,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
616 char *recdir; 666 char *recdir;
617 int len, status; 667 int len, status;
618 668
619 if (size > PATH_MAX || buf[size-1] != '\n') 669 if (size == 0 || size > PATH_MAX || buf[size-1] != '\n')
620 return -EINVAL; 670 return -EINVAL;
621 buf[size-1] = 0; 671 buf[size-1] = 0;
622 672
@@ -674,6 +724,27 @@ static struct file_system_type nfsd_fs_type = {
674 .kill_sb = kill_litter_super, 724 .kill_sb = kill_litter_super,
675}; 725};
676 726
727#ifdef CONFIG_PROC_FS
728static int create_proc_exports_entry(void)
729{
730 struct proc_dir_entry *entry;
731
732 entry = proc_mkdir("fs/nfs", NULL);
733 if (!entry)
734 return -ENOMEM;
735 entry = create_proc_entry("fs/nfs/exports", 0, NULL);
736 if (!entry)
737 return -ENOMEM;
738 entry->proc_fops = &exports_operations;
739 return 0;
740}
741#else /* CONFIG_PROC_FS */
742static int create_proc_exports_entry(void)
743{
744 return 0;
745}
746#endif
747
677static int __init init_nfsd(void) 748static int __init init_nfsd(void)
678{ 749{
679 int retval; 750 int retval;
@@ -683,32 +754,43 @@ static int __init init_nfsd(void)
683 if (retval) 754 if (retval)
684 return retval; 755 return retval;
685 nfsd_stat_init(); /* Statistics */ 756 nfsd_stat_init(); /* Statistics */
686 nfsd_cache_init(); /* RPC reply cache */ 757 retval = nfsd_reply_cache_init();
687 nfsd_export_init(); /* Exports table */ 758 if (retval)
759 goto out_free_stat;
760 retval = nfsd_export_init();
761 if (retval)
762 goto out_free_cache;
688 nfsd_lockd_init(); /* lockd->nfsd callbacks */ 763 nfsd_lockd_init(); /* lockd->nfsd callbacks */
689 nfsd_idmap_init(); /* Name to ID mapping */ 764 retval = nfsd_idmap_init();
690 if (proc_mkdir("fs/nfs", NULL)) { 765 if (retval)
691 struct proc_dir_entry *entry; 766 goto out_free_lockd;
692 entry = create_proc_entry("fs/nfs/exports", 0, NULL); 767 retval = create_proc_exports_entry();
693 if (entry) 768 if (retval)
694 entry->proc_fops = &exports_operations; 769 goto out_free_idmap;
695 }
696 retval = register_filesystem(&nfsd_fs_type); 770 retval = register_filesystem(&nfsd_fs_type);
697 if (retval) { 771 if (retval)
698 nfsd_export_shutdown(); 772 goto out_free_all;
699 nfsd_cache_shutdown(); 773 return 0;
700 remove_proc_entry("fs/nfs/exports", NULL); 774out_free_all:
701 remove_proc_entry("fs/nfs", NULL); 775 remove_proc_entry("fs/nfs/exports", NULL);
702 nfsd_stat_shutdown(); 776 remove_proc_entry("fs/nfs", NULL);
703 nfsd_lockd_shutdown(); 777out_free_idmap:
704 } 778 nfsd_idmap_shutdown();
779out_free_lockd:
780 nfsd_lockd_shutdown();
781 nfsd_export_shutdown();
782out_free_cache:
783 nfsd_reply_cache_shutdown();
784out_free_stat:
785 nfsd_stat_shutdown();
786 nfsd4_free_slabs();
705 return retval; 787 return retval;
706} 788}
707 789
708static void __exit exit_nfsd(void) 790static void __exit exit_nfsd(void)
709{ 791{
710 nfsd_export_shutdown(); 792 nfsd_export_shutdown();
711 nfsd_cache_shutdown(); 793 nfsd_reply_cache_shutdown();
712 remove_proc_entry("fs/nfs/exports", NULL); 794 remove_proc_entry("fs/nfs/exports", NULL);
713 remove_proc_entry("fs/nfs", NULL); 795 remove_proc_entry("fs/nfs", NULL);
714 nfsd_stat_shutdown(); 796 nfsd_stat_shutdown();
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 468f17a78441..8fbd2dc08a92 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -22,6 +22,7 @@
22#include <linux/sunrpc/svc.h> 22#include <linux/sunrpc/svc.h>
23#include <linux/sunrpc/svcauth_gss.h> 23#include <linux/sunrpc/svcauth_gss.h>
24#include <linux/nfsd/nfsd.h> 24#include <linux/nfsd/nfsd.h>
25#include "auth.h"
25 26
26#define NFSDDBG_FACILITY NFSDDBG_FH 27#define NFSDDBG_FACILITY NFSDDBG_FH
27 28
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 1190aeaa92be..9647b0f7bc0c 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -155,8 +155,8 @@ static int killsig; /* signal that was used to kill last nfsd */
155static void nfsd_last_thread(struct svc_serv *serv) 155static void nfsd_last_thread(struct svc_serv *serv)
156{ 156{
157 /* When last nfsd thread exits we need to do some clean-up */ 157 /* When last nfsd thread exits we need to do some clean-up */
158 struct svc_sock *svsk; 158 struct svc_xprt *xprt;
159 list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) 159 list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list)
160 lockd_down(); 160 lockd_down();
161 nfsd_serv = NULL; 161 nfsd_serv = NULL;
162 nfsd_racache_shutdown(); 162 nfsd_racache_shutdown();
@@ -236,7 +236,7 @@ static int nfsd_init_socks(int port)
236 236
237 error = lockd_up(IPPROTO_UDP); 237 error = lockd_up(IPPROTO_UDP);
238 if (error >= 0) { 238 if (error >= 0) {
239 error = svc_makesock(nfsd_serv, IPPROTO_UDP, port, 239 error = svc_create_xprt(nfsd_serv, "udp", port,
240 SVC_SOCK_DEFAULTS); 240 SVC_SOCK_DEFAULTS);
241 if (error < 0) 241 if (error < 0)
242 lockd_down(); 242 lockd_down();
@@ -247,7 +247,7 @@ static int nfsd_init_socks(int port)
247#ifdef CONFIG_NFSD_TCP 247#ifdef CONFIG_NFSD_TCP
248 error = lockd_up(IPPROTO_TCP); 248 error = lockd_up(IPPROTO_TCP);
249 if (error >= 0) { 249 if (error >= 0) {
250 error = svc_makesock(nfsd_serv, IPPROTO_TCP, port, 250 error = svc_create_xprt(nfsd_serv, "tcp", port,
251 SVC_SOCK_DEFAULTS); 251 SVC_SOCK_DEFAULTS);
252 if (error < 0) 252 if (error < 0)
253 lockd_down(); 253 lockd_down();
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index b86e3658a0af..61ad61743d94 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -15,6 +15,7 @@
15#include <linux/nfsd/nfsd.h> 15#include <linux/nfsd/nfsd.h>
16#include <linux/nfsd/xdr.h> 16#include <linux/nfsd/xdr.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include "auth.h"
18 19
19#define NFSDDBG_FACILITY NFSDDBG_XDR 20#define NFSDDBG_FACILITY NFSDDBG_XDR
20 21
@@ -62,10 +63,10 @@ encode_fh(__be32 *p, struct svc_fh *fhp)
62 * no slashes or null bytes. 63 * no slashes or null bytes.
63 */ 64 */
64static __be32 * 65static __be32 *
65decode_filename(__be32 *p, char **namp, int *lenp) 66decode_filename(__be32 *p, char **namp, unsigned int *lenp)
66{ 67{
67 char *name; 68 char *name;
68 int i; 69 unsigned int i;
69 70
70 if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) { 71 if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) {
71 for (i = 0, name = *namp; i < *lenp; i++, name++) { 72 for (i = 0, name = *namp; i < *lenp; i++, name++) {
@@ -78,10 +79,10 @@ decode_filename(__be32 *p, char **namp, int *lenp)
78} 79}
79 80
80static __be32 * 81static __be32 *
81decode_pathname(__be32 *p, char **namp, int *lenp) 82decode_pathname(__be32 *p, char **namp, unsigned int *lenp)
82{ 83{
83 char *name; 84 char *name;
84 int i; 85 unsigned int i;
85 86
86 if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXPATHLEN)) != NULL) { 87 if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXPATHLEN)) != NULL) {
87 for (i = 0, name = *namp; i < *lenp; i++, name++) { 88 for (i = 0, name = *namp; i < *lenp; i++, name++) {
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d0199189924c..cc75e4fcd02b 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -132,7 +132,7 @@ out:
132 132
133__be32 133__be32
134nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, 134nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
135 const char *name, int len, 135 const char *name, unsigned int len,
136 struct svc_export **exp_ret, struct dentry **dentry_ret) 136 struct svc_export **exp_ret, struct dentry **dentry_ret)
137{ 137{
138 struct svc_export *exp; 138 struct svc_export *exp;
@@ -226,7 +226,7 @@ out_nfserr:
226 */ 226 */
227__be32 227__be32
228nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, 228nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
229 int len, struct svc_fh *resfh) 229 unsigned int len, struct svc_fh *resfh)
230{ 230{
231 struct svc_export *exp; 231 struct svc_export *exp;
232 struct dentry *dentry; 232 struct dentry *dentry;
@@ -1151,6 +1151,26 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
1151} 1151}
1152#endif /* CONFIG_NFSD_V3 */ 1152#endif /* CONFIG_NFSD_V3 */
1153 1153
1154__be32
1155nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
1156 struct iattr *iap)
1157{
1158 /*
1159 * Mode has already been set earlier in create:
1160 */
1161 iap->ia_valid &= ~ATTR_MODE;
1162 /*
1163 * Setting uid/gid works only for root. Irix appears to
1164 * send along the gid on create when it tries to implement
1165 * setgid directories via NFS:
1166 */
1167 if (current->fsuid != 0)
1168 iap->ia_valid &= ~(ATTR_UID|ATTR_GID);
1169 if (iap->ia_valid)
1170 return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
1171 return 0;
1172}
1173
1154/* 1174/*
1155 * Create a file (regular, directory, device, fifo); UNIX sockets 1175 * Create a file (regular, directory, device, fifo); UNIX sockets
1156 * not yet implemented. 1176 * not yet implemented.
@@ -1167,6 +1187,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1167 struct dentry *dentry, *dchild = NULL; 1187 struct dentry *dentry, *dchild = NULL;
1168 struct inode *dirp; 1188 struct inode *dirp;
1169 __be32 err; 1189 __be32 err;
1190 __be32 err2;
1170 int host_err; 1191 int host_err;
1171 1192
1172 err = nfserr_perm; 1193 err = nfserr_perm;
@@ -1257,16 +1278,9 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1257 } 1278 }
1258 1279
1259 1280
1260 /* Set file attributes. Mode has already been set and 1281 err2 = nfsd_create_setattr(rqstp, resfhp, iap);
1261 * setting uid/gid works only for root. Irix appears to 1282 if (err2)
1262 * send along the gid when it tries to implement setgid 1283 err = err2;
1263 * directories via NFS.
1264 */
1265 if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
1266 __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
1267 if (err2)
1268 err = err2;
1269 }
1270 /* 1284 /*
1271 * Update the file handle to get the new inode info. 1285 * Update the file handle to get the new inode info.
1272 */ 1286 */
@@ -1295,6 +1309,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1295 struct dentry *dentry, *dchild = NULL; 1309 struct dentry *dentry, *dchild = NULL;
1296 struct inode *dirp; 1310 struct inode *dirp;
1297 __be32 err; 1311 __be32 err;
1312 __be32 err2;
1298 int host_err; 1313 int host_err;
1299 __u32 v_mtime=0, v_atime=0; 1314 __u32 v_mtime=0, v_atime=0;
1300 1315
@@ -1399,16 +1414,10 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1399 iap->ia_atime.tv_nsec = 0; 1414 iap->ia_atime.tv_nsec = 0;
1400 } 1415 }
1401 1416
1402 /* Set file attributes.
1403 * Irix appears to send along the gid when it tries to
1404 * implement setgid directories via NFS. Clear out all that cruft.
1405 */
1406 set_attr: 1417 set_attr:
1407 if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) { 1418 err2 = nfsd_create_setattr(rqstp, resfhp, iap);
1408 __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); 1419 if (err2)
1409 if (err2) 1420 err = err2;
1410 err = err2;
1411 }
1412 1421
1413 /* 1422 /*
1414 * Update the filehandle to get the new inode info. 1423 * Update the filehandle to get the new inode info.
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index e2d1ce36b367..4babb2a129ac 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -173,14 +173,17 @@ void nlmclnt_next_cookie(struct nlm_cookie *);
173/* 173/*
174 * Host cache 174 * Host cache
175 */ 175 */
176struct nlm_host * nlmclnt_lookup_host(const struct sockaddr_in *, int, int, const char *, int); 176struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *, int, int,
177struct nlm_host * nlmsvc_lookup_host(struct svc_rqst *, const char *, int); 177 const char *, unsigned int);
178struct nlm_host *nlmsvc_lookup_host(struct svc_rqst *, const char *,
179 unsigned int);
178struct rpc_clnt * nlm_bind_host(struct nlm_host *); 180struct rpc_clnt * nlm_bind_host(struct nlm_host *);
179void nlm_rebind_host(struct nlm_host *); 181void nlm_rebind_host(struct nlm_host *);
180struct nlm_host * nlm_get_host(struct nlm_host *); 182struct nlm_host * nlm_get_host(struct nlm_host *);
181void nlm_release_host(struct nlm_host *); 183void nlm_release_host(struct nlm_host *);
182void nlm_shutdown_hosts(void); 184void nlm_shutdown_hosts(void);
183extern void nlm_host_rebooted(const struct sockaddr_in *, const char *, int, u32); 185extern void nlm_host_rebooted(const struct sockaddr_in *, const char *,
186 unsigned int, u32);
184void nsm_release(struct nsm_handle *); 187void nsm_release(struct nsm_handle *);
185 188
186 189
diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h
index 83a1f9f6237b..df18fa053bcd 100644
--- a/include/linux/lockd/xdr.h
+++ b/include/linux/lockd/xdr.h
@@ -29,7 +29,7 @@ struct svc_rqst;
29/* Lock info passed via NLM */ 29/* Lock info passed via NLM */
30struct nlm_lock { 30struct nlm_lock {
31 char * caller; 31 char * caller;
32 int len; /* length of "caller" */ 32 unsigned int len; /* length of "caller" */
33 struct nfs_fh fh; 33 struct nfs_fh fh;
34 struct xdr_netobj oh; 34 struct xdr_netobj oh;
35 u32 svid; 35 u32 svid;
@@ -78,7 +78,7 @@ struct nlm_res {
78 */ 78 */
79struct nlm_reboot { 79struct nlm_reboot {
80 char * mon; 80 char * mon;
81 int len; 81 unsigned int len;
82 u32 state; 82 u32 state;
83 __be32 addr; 83 __be32 addr;
84 __be32 vers; 84 __be32 vers;
diff --git a/include/linux/nfsd/Kbuild b/include/linux/nfsd/Kbuild
index d9c5455808e5..e726fc3a4375 100644
--- a/include/linux/nfsd/Kbuild
+++ b/include/linux/nfsd/Kbuild
@@ -4,4 +4,3 @@ unifdef-y += stats.h
4unifdef-y += syscall.h 4unifdef-y += syscall.h
5unifdef-y += nfsfh.h 5unifdef-y += nfsfh.h
6unifdef-y += debug.h 6unifdef-y += debug.h
7unifdef-y += auth.h
diff --git a/include/linux/nfsd/cache.h b/include/linux/nfsd/cache.h
index 007480cd6a60..7b5d784cc858 100644
--- a/include/linux/nfsd/cache.h
+++ b/include/linux/nfsd/cache.h
@@ -72,8 +72,8 @@ enum {
72 */ 72 */
73#define RC_DELAY (HZ/5) 73#define RC_DELAY (HZ/5)
74 74
75void nfsd_cache_init(void); 75int nfsd_reply_cache_init(void);
76void nfsd_cache_shutdown(void); 76void nfsd_reply_cache_shutdown(void);
77int nfsd_cache_lookup(struct svc_rqst *, int); 77int nfsd_cache_lookup(struct svc_rqst *, int);
78void nfsd_cache_update(struct svc_rqst *, int, __be32 *); 78void nfsd_cache_update(struct svc_rqst *, int, __be32 *);
79 79
diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h
index bcb7abafbca9..3a1687251367 100644
--- a/include/linux/nfsd/export.h
+++ b/include/linux/nfsd/export.h
@@ -122,7 +122,7 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp);
122/* 122/*
123 * Function declarations 123 * Function declarations
124 */ 124 */
125void nfsd_export_init(void); 125int nfsd_export_init(void);
126void nfsd_export_shutdown(void); 126void nfsd_export_shutdown(void);
127void nfsd_export_flush(void); 127void nfsd_export_flush(void);
128void exp_readlock(void); 128void exp_readlock(void);
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index 604a0d786bc6..8caf4c4f64e6 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -20,7 +20,6 @@
20#include <linux/nfsd/debug.h> 20#include <linux/nfsd/debug.h>
21#include <linux/nfsd/nfsfh.h> 21#include <linux/nfsd/nfsfh.h>
22#include <linux/nfsd/export.h> 22#include <linux/nfsd/export.h>
23#include <linux/nfsd/auth.h>
24#include <linux/nfsd/stats.h> 23#include <linux/nfsd/stats.h>
25/* 24/*
26 * nfsd version 25 * nfsd version
@@ -70,9 +69,9 @@ void nfsd_racache_shutdown(void);
70int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, 69int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
71 struct svc_export **expp); 70 struct svc_export **expp);
72__be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, 71__be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *,
73 const char *, int, struct svc_fh *); 72 const char *, unsigned int, struct svc_fh *);
74__be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *, 73__be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *,
75 const char *, int, 74 const char *, unsigned int,
76 struct svc_export **, struct dentry **); 75 struct svc_export **, struct dentry **);
77__be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, 76__be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *,
78 struct iattr *, int, time_t); 77 struct iattr *, int, time_t);
diff --git a/include/linux/nfsd/syscall.h b/include/linux/nfsd/syscall.h
index 8bcddccb6c42..4e439765b705 100644
--- a/include/linux/nfsd/syscall.h
+++ b/include/linux/nfsd/syscall.h
@@ -18,7 +18,6 @@
18#include <linux/nfsd/const.h> 18#include <linux/nfsd/const.h>
19#include <linux/nfsd/export.h> 19#include <linux/nfsd/export.h>
20#include <linux/nfsd/nfsfh.h> 20#include <linux/nfsd/nfsfh.h>
21#include <linux/nfsd/auth.h>
22 21
23/* 22/*
24 * Version of the syscall interface 23 * Version of the syscall interface
diff --git a/include/linux/nfsd/xdr.h b/include/linux/nfsd/xdr.h
index 67885d5e6e50..a0132ef58f21 100644
--- a/include/linux/nfsd/xdr.h
+++ b/include/linux/nfsd/xdr.h
@@ -23,7 +23,7 @@ struct nfsd_sattrargs {
23struct nfsd_diropargs { 23struct nfsd_diropargs {
24 struct svc_fh fh; 24 struct svc_fh fh;
25 char * name; 25 char * name;
26 int len; 26 unsigned int len;
27}; 27};
28 28
29struct nfsd_readargs { 29struct nfsd_readargs {
@@ -43,17 +43,17 @@ struct nfsd_writeargs {
43struct nfsd_createargs { 43struct nfsd_createargs {
44 struct svc_fh fh; 44 struct svc_fh fh;
45 char * name; 45 char * name;
46 int len; 46 unsigned int len;
47 struct iattr attrs; 47 struct iattr attrs;
48}; 48};
49 49
50struct nfsd_renameargs { 50struct nfsd_renameargs {
51 struct svc_fh ffh; 51 struct svc_fh ffh;
52 char * fname; 52 char * fname;
53 int flen; 53 unsigned int flen;
54 struct svc_fh tfh; 54 struct svc_fh tfh;
55 char * tname; 55 char * tname;
56 int tlen; 56 unsigned int tlen;
57}; 57};
58 58
59struct nfsd_readlinkargs { 59struct nfsd_readlinkargs {
@@ -65,15 +65,15 @@ struct nfsd_linkargs {
65 struct svc_fh ffh; 65 struct svc_fh ffh;
66 struct svc_fh tfh; 66 struct svc_fh tfh;
67 char * tname; 67 char * tname;
68 int tlen; 68 unsigned int tlen;
69}; 69};
70 70
71struct nfsd_symlinkargs { 71struct nfsd_symlinkargs {
72 struct svc_fh ffh; 72 struct svc_fh ffh;
73 char * fname; 73 char * fname;
74 int flen; 74 unsigned int flen;
75 char * tname; 75 char * tname;
76 int tlen; 76 unsigned int tlen;
77 struct iattr attrs; 77 struct iattr attrs;
78}; 78};
79 79
diff --git a/include/linux/nfsd/xdr3.h b/include/linux/nfsd/xdr3.h
index 89d9d6061a62..421eddd65a25 100644
--- a/include/linux/nfsd/xdr3.h
+++ b/include/linux/nfsd/xdr3.h
@@ -21,7 +21,7 @@ struct nfsd3_sattrargs {
21struct nfsd3_diropargs { 21struct nfsd3_diropargs {
22 struct svc_fh fh; 22 struct svc_fh fh;
23 char * name; 23 char * name;
24 int len; 24 unsigned int len;
25}; 25};
26 26
27struct nfsd3_accessargs { 27struct nfsd3_accessargs {
@@ -48,7 +48,7 @@ struct nfsd3_writeargs {
48struct nfsd3_createargs { 48struct nfsd3_createargs {
49 struct svc_fh fh; 49 struct svc_fh fh;
50 char * name; 50 char * name;
51 int len; 51 unsigned int len;
52 int createmode; 52 int createmode;
53 struct iattr attrs; 53 struct iattr attrs;
54 __be32 * verf; 54 __be32 * verf;
@@ -57,7 +57,7 @@ struct nfsd3_createargs {
57struct nfsd3_mknodargs { 57struct nfsd3_mknodargs {
58 struct svc_fh fh; 58 struct svc_fh fh;
59 char * name; 59 char * name;
60 int len; 60 unsigned int len;
61 __u32 ftype; 61 __u32 ftype;
62 __u32 major, minor; 62 __u32 major, minor;
63 struct iattr attrs; 63 struct iattr attrs;
@@ -66,10 +66,10 @@ struct nfsd3_mknodargs {
66struct nfsd3_renameargs { 66struct nfsd3_renameargs {
67 struct svc_fh ffh; 67 struct svc_fh ffh;
68 char * fname; 68 char * fname;
69 int flen; 69 unsigned int flen;
70 struct svc_fh tfh; 70 struct svc_fh tfh;
71 char * tname; 71 char * tname;
72 int tlen; 72 unsigned int tlen;
73}; 73};
74 74
75struct nfsd3_readlinkargs { 75struct nfsd3_readlinkargs {
@@ -81,15 +81,15 @@ struct nfsd3_linkargs {
81 struct svc_fh ffh; 81 struct svc_fh ffh;
82 struct svc_fh tfh; 82 struct svc_fh tfh;
83 char * tname; 83 char * tname;
84 int tlen; 84 unsigned int tlen;
85}; 85};
86 86
87struct nfsd3_symlinkargs { 87struct nfsd3_symlinkargs {
88 struct svc_fh ffh; 88 struct svc_fh ffh;
89 char * fname; 89 char * fname;
90 int flen; 90 unsigned int flen;
91 char * tname; 91 char * tname;
92 int tlen; 92 unsigned int tlen;
93 struct iattr attrs; 93 struct iattr attrs;
94}; 94};
95 95
diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h
index b0ddfb41c790..27bd3e38ec5a 100644
--- a/include/linux/nfsd/xdr4.h
+++ b/include/linux/nfsd/xdr4.h
@@ -441,7 +441,7 @@ void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
441void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); 441void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op);
442__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, 442__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
443 struct dentry *dentry, __be32 *buffer, int *countp, 443 struct dentry *dentry, __be32 *buffer, int *countp,
444 u32 *bmval, struct svc_rqst *); 444 u32 *bmval, struct svc_rqst *, int ignore_crossmnt);
445extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, 445extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp,
446 struct nfsd4_compound_state *, 446 struct nfsd4_compound_state *,
447 struct nfsd4_setclientid *setclid); 447 struct nfsd4_setclientid *setclid);
diff --git a/include/linux/nfsd_idmap.h b/include/linux/nfsd_idmap.h
index e82746fcad14..d4a2ac18bd4c 100644
--- a/include/linux/nfsd_idmap.h
+++ b/include/linux/nfsd_idmap.h
@@ -44,11 +44,16 @@
44#define IDMAP_NAMESZ 128 44#define IDMAP_NAMESZ 128
45 45
46#ifdef CONFIG_NFSD_V4 46#ifdef CONFIG_NFSD_V4
47void nfsd_idmap_init(void); 47int nfsd_idmap_init(void);
48void nfsd_idmap_shutdown(void); 48void nfsd_idmap_shutdown(void);
49#else 49#else
50static inline void nfsd_idmap_init(void) {}; 50static inline int nfsd_idmap_init(void)
51static inline void nfsd_idmap_shutdown(void) {}; 51{
52 return 0;
53}
54static inline void nfsd_idmap_shutdown(void)
55{
56}
52#endif 57#endif
53 58
54int nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *); 59int nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *);
diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index bd7a6b0a87af..03547d6abee5 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -169,8 +169,8 @@ extern int cache_check(struct cache_detail *detail,
169extern void cache_flush(void); 169extern void cache_flush(void);
170extern void cache_purge(struct cache_detail *detail); 170extern void cache_purge(struct cache_detail *detail);
171#define NEVER (0x7FFFFFFF) 171#define NEVER (0x7FFFFFFF)
172extern void cache_register(struct cache_detail *cd); 172extern int cache_register(struct cache_detail *cd);
173extern int cache_unregister(struct cache_detail *cd); 173extern void cache_unregister(struct cache_detail *cd);
174 174
175extern void qword_add(char **bpp, int *lp, char *str); 175extern void qword_add(char **bpp, int *lp, char *str);
176extern void qword_addhex(char **bpp, int *lp, char *buf, int blen); 176extern void qword_addhex(char **bpp, int *lp, char *buf, int blen);
diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h
index 3912cf16361e..10709cbe96fd 100644
--- a/include/linux/sunrpc/debug.h
+++ b/include/linux/sunrpc/debug.h
@@ -20,7 +20,7 @@
20#define RPCDBG_BIND 0x0020 20#define RPCDBG_BIND 0x0020
21#define RPCDBG_SCHED 0x0040 21#define RPCDBG_SCHED 0x0040
22#define RPCDBG_TRANS 0x0080 22#define RPCDBG_TRANS 0x0080
23#define RPCDBG_SVCSOCK 0x0100 23#define RPCDBG_SVCXPRT 0x0100
24#define RPCDBG_SVCDSP 0x0200 24#define RPCDBG_SVCDSP 0x0200
25#define RPCDBG_MISC 0x0400 25#define RPCDBG_MISC 0x0400
26#define RPCDBG_CACHE 0x0800 26#define RPCDBG_CACHE 0x0800
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 8531a70da73d..64c771056187 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -204,7 +204,7 @@ union svc_addr_u {
204struct svc_rqst { 204struct svc_rqst {
205 struct list_head rq_list; /* idle list */ 205 struct list_head rq_list; /* idle list */
206 struct list_head rq_all; /* all threads list */ 206 struct list_head rq_all; /* all threads list */
207 struct svc_sock * rq_sock; /* socket */ 207 struct svc_xprt * rq_xprt; /* transport ptr */
208 struct sockaddr_storage rq_addr; /* peer address */ 208 struct sockaddr_storage rq_addr; /* peer address */
209 size_t rq_addrlen; 209 size_t rq_addrlen;
210 210
@@ -214,9 +214,10 @@ struct svc_rqst {
214 struct auth_ops * rq_authop; /* authentication flavour */ 214 struct auth_ops * rq_authop; /* authentication flavour */
215 u32 rq_flavor; /* pseudoflavor */ 215 u32 rq_flavor; /* pseudoflavor */
216 struct svc_cred rq_cred; /* auth info */ 216 struct svc_cred rq_cred; /* auth info */
217 struct sk_buff * rq_skbuff; /* fast recv inet buffer */ 217 void * rq_xprt_ctxt; /* transport specific context ptr */
218 struct svc_deferred_req*rq_deferred; /* deferred request we are replaying */ 218 struct svc_deferred_req*rq_deferred; /* deferred request we are replaying */
219 219
220 size_t rq_xprt_hlen; /* xprt header len */
220 struct xdr_buf rq_arg; 221 struct xdr_buf rq_arg;
221 struct xdr_buf rq_res; 222 struct xdr_buf rq_res;
222 struct page * rq_pages[RPCSVC_MAXPAGES]; 223 struct page * rq_pages[RPCSVC_MAXPAGES];
@@ -317,11 +318,12 @@ static inline void svc_free_res_pages(struct svc_rqst *rqstp)
317 318
318struct svc_deferred_req { 319struct svc_deferred_req {
319 u32 prot; /* protocol (UDP or TCP) */ 320 u32 prot; /* protocol (UDP or TCP) */
320 struct svc_sock *svsk; 321 struct svc_xprt *xprt;
321 struct sockaddr_storage addr; /* where reply must go */ 322 struct sockaddr_storage addr; /* where reply must go */
322 size_t addrlen; 323 size_t addrlen;
323 union svc_addr_u daddr; /* where reply must come from */ 324 union svc_addr_u daddr; /* where reply must come from */
324 struct cache_deferred_req handle; 325 struct cache_deferred_req handle;
326 size_t xprt_hlen;
325 int argslen; 327 int argslen;
326 __be32 args[0]; 328 __be32 args[0];
327}; 329};
@@ -382,6 +384,8 @@ struct svc_procedure {
382 */ 384 */
383struct svc_serv * svc_create(struct svc_program *, unsigned int, 385struct svc_serv * svc_create(struct svc_program *, unsigned int,
384 void (*shutdown)(struct svc_serv*)); 386 void (*shutdown)(struct svc_serv*));
387struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
388 struct svc_pool *pool);
385int svc_create_thread(svc_thread_fn, struct svc_serv *); 389int svc_create_thread(svc_thread_fn, struct svc_serv *);
386void svc_exit_thread(struct svc_rqst *); 390void svc_exit_thread(struct svc_rqst *);
387struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, 391struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int,
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
new file mode 100644
index 000000000000..c11bbcc081f9
--- /dev/null
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -0,0 +1,262 @@
1/*
2 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * Author: Tom Tucker <tom@opengridcomputing.com>
40 */
41
42#ifndef SVC_RDMA_H
43#define SVC_RDMA_H
44#include <linux/sunrpc/xdr.h>
45#include <linux/sunrpc/svcsock.h>
46#include <linux/sunrpc/rpc_rdma.h>
47#include <rdma/ib_verbs.h>
48#include <rdma/rdma_cm.h>
49#define SVCRDMA_DEBUG
50
51/* RPC/RDMA parameters and stats */
52extern unsigned int svcrdma_ord;
53extern unsigned int svcrdma_max_requests;
54extern unsigned int svcrdma_max_req_size;
55
56extern atomic_t rdma_stat_recv;
57extern atomic_t rdma_stat_read;
58extern atomic_t rdma_stat_write;
59extern atomic_t rdma_stat_sq_starve;
60extern atomic_t rdma_stat_rq_starve;
61extern atomic_t rdma_stat_rq_poll;
62extern atomic_t rdma_stat_rq_prod;
63extern atomic_t rdma_stat_sq_poll;
64extern atomic_t rdma_stat_sq_prod;
65
66#define RPCRDMA_VERSION 1
67
68/*
69 * Contexts are built when an RDMA request is created and are a
70 * record of the resources that can be recovered when the request
71 * completes.
72 */
73struct svc_rdma_op_ctxt {
74 struct svc_rdma_op_ctxt *next;
75 struct xdr_buf arg;
76 struct list_head dto_q;
77 enum ib_wr_opcode wr_op;
78 enum ib_wc_status wc_status;
79 u32 byte_len;
80 struct svcxprt_rdma *xprt;
81 unsigned long flags;
82 enum dma_data_direction direction;
83 int count;
84 struct ib_sge sge[RPCSVC_MAXPAGES];
85 struct page *pages[RPCSVC_MAXPAGES];
86};
87
88#define RDMACTXT_F_READ_DONE 1
89#define RDMACTXT_F_LAST_CTXT 2
90
91struct svcxprt_rdma {
92 struct svc_xprt sc_xprt; /* SVC transport structure */
93 struct rdma_cm_id *sc_cm_id; /* RDMA connection id */
94 struct list_head sc_accept_q; /* Conn. waiting accept */
95 int sc_ord; /* RDMA read limit */
96 wait_queue_head_t sc_read_wait;
97 int sc_max_sge;
98
99 int sc_sq_depth; /* Depth of SQ */
100 atomic_t sc_sq_count; /* Number of SQ WR on queue */
101
102 int sc_max_requests; /* Depth of RQ */
103 int sc_max_req_size; /* Size of each RQ WR buf */
104
105 struct ib_pd *sc_pd;
106
107 struct svc_rdma_op_ctxt *sc_ctxt_head;
108 int sc_ctxt_cnt;
109 int sc_ctxt_bump;
110 int sc_ctxt_max;
111 spinlock_t sc_ctxt_lock;
112 struct list_head sc_rq_dto_q;
113 spinlock_t sc_rq_dto_lock;
114 struct ib_qp *sc_qp;
115 struct ib_cq *sc_rq_cq;
116 struct ib_cq *sc_sq_cq;
117 struct ib_mr *sc_phys_mr; /* MR for server memory */
118
119 spinlock_t sc_lock; /* transport lock */
120
121 wait_queue_head_t sc_send_wait; /* SQ exhaustion waitlist */
122 unsigned long sc_flags;
123 struct list_head sc_dto_q; /* DTO tasklet I/O pending Q */
124 struct list_head sc_read_complete_q;
125 spinlock_t sc_read_complete_lock;
126};
127/* sc_flags */
128#define RDMAXPRT_RQ_PENDING 1
129#define RDMAXPRT_SQ_PENDING 2
130#define RDMAXPRT_CONN_PENDING 3
131
132#define RPCRDMA_LISTEN_BACKLOG 10
133/* The default ORD value is based on two outstanding full-size writes with a
134 * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ. */
135#define RPCRDMA_ORD (64/4)
136#define RPCRDMA_SQ_DEPTH_MULT 8
137#define RPCRDMA_MAX_THREADS 16
138#define RPCRDMA_MAX_REQUESTS 16
139#define RPCRDMA_MAX_REQ_SIZE 4096
140
141/* svc_rdma_marshal.c */
142extern void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *,
143 int *, int *);
144extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *);
145extern int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *);
146extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *,
147 struct rpcrdma_msg *,
148 enum rpcrdma_errcode, u32 *);
149extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int);
150extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int);
151extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int,
152 u32, u64, u32);
153extern void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *,
154 struct rpcrdma_msg *,
155 struct rpcrdma_msg *,
156 enum rpcrdma_proc);
157extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *);
158
159/* svc_rdma_recvfrom.c */
160extern int svc_rdma_recvfrom(struct svc_rqst *);
161
162/* svc_rdma_sendto.c */
163extern int svc_rdma_sendto(struct svc_rqst *);
164
165/* svc_rdma_transport.c */
166extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *);
167extern int svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *,
168 enum rpcrdma_errcode);
169struct page *svc_rdma_get_page(void);
170extern int svc_rdma_post_recv(struct svcxprt_rdma *);
171extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *);
172extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *);
173extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int);
174extern void svc_sq_reap(struct svcxprt_rdma *);
175extern void svc_rq_reap(struct svcxprt_rdma *);
176extern struct svc_xprt_class svc_rdma_class;
177extern void svc_rdma_prep_reply_hdr(struct svc_rqst *);
178
179/* svc_rdma.c */
180extern int svc_rdma_init(void);
181extern void svc_rdma_cleanup(void);
182
183/*
184 * Returns the address of the first read chunk or <nul> if no read chunk is
185 * present
186 */
187static inline struct rpcrdma_read_chunk *
188svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp)
189{
190 struct rpcrdma_read_chunk *ch =
191 (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
192
193 if (ch->rc_discrim == 0)
194 return NULL;
195
196 return ch;
197}
198
199/*
 200 * Returns the address of the first write array element or <nul> if no
201 * write array list is present
202 */
203static inline struct rpcrdma_write_array *
204svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp)
205{
206 if (rmsgp->rm_body.rm_chunks[0] != 0
207 || rmsgp->rm_body.rm_chunks[1] == 0)
208 return NULL;
209
210 return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1];
211}
212
213/*
214 * Returns the address of the first reply array element or <nul> if no
215 * reply array is present
216 */
217static inline struct rpcrdma_write_array *
218svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp)
219{
220 struct rpcrdma_read_chunk *rch;
221 struct rpcrdma_write_array *wr_ary;
222 struct rpcrdma_write_array *rp_ary;
223
224 /* XXX: Need to fix when reply list may occur with read-list and/or
225 * write list */
226 if (rmsgp->rm_body.rm_chunks[0] != 0 ||
227 rmsgp->rm_body.rm_chunks[1] != 0)
228 return NULL;
229
230 rch = svc_rdma_get_read_chunk(rmsgp);
231 if (rch) {
232 while (rch->rc_discrim)
233 rch++;
234
235 /* The reply list follows an empty write array located
236 * at 'rc_position' here. The reply array is at rc_target.
237 */
238 rp_ary = (struct rpcrdma_write_array *)&rch->rc_target;
239
240 goto found_it;
241 }
242
243 wr_ary = svc_rdma_get_write_array(rmsgp);
244 if (wr_ary) {
245 rp_ary = (struct rpcrdma_write_array *)
246 &wr_ary->
247 wc_array[wr_ary->wc_nchunks].wc_target.rs_length;
248
249 goto found_it;
250 }
251
252 /* No read list, no write list */
253 rp_ary = (struct rpcrdma_write_array *)
254 &rmsgp->rm_body.rm_chunks[2];
255
256 found_it:
257 if (rp_ary->wc_discrim == 0)
258 return NULL;
259
260 return rp_ary;
261}
262#endif
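The inline helpers above navigate the chunk lists that follow the fixed part of the RPC/RDMA header: each list is a run of entries terminated by a zero discriminator, so callers advance through entries until rc_discrim (or wc_discrim) reads zero. A minimal sketch of walking the read list with these helpers:

	/* Sketch: count the entries in an incoming header's read-chunk list. */
	static unsigned int count_read_chunks(struct rpcrdma_msg *rmsgp)
	{
		struct rpcrdma_read_chunk *ch = svc_rdma_get_read_chunk(rmsgp);
		unsigned int n = 0;

		if (!ch)
			return 0;	/* no read list in this header */
		for (; ch->rc_discrim != 0; ch++)
			n++;		/* each entry names one RDMA segment to pull */
		return n;
	}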
diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
new file mode 100644
index 000000000000..6fd7b016517f
--- /dev/null
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -0,0 +1,159 @@
1/*
2 * linux/include/linux/sunrpc/svc_xprt.h
3 *
4 * RPC server transport I/O
5 */
6
7#ifndef SUNRPC_SVC_XPRT_H
8#define SUNRPC_SVC_XPRT_H
9
10#include <linux/sunrpc/svc.h>
11#include <linux/module.h>
12
13struct svc_xprt_ops {
14 struct svc_xprt *(*xpo_create)(struct svc_serv *,
15 struct sockaddr *, int,
16 int);
17 struct svc_xprt *(*xpo_accept)(struct svc_xprt *);
18 int (*xpo_has_wspace)(struct svc_xprt *);
19 int (*xpo_recvfrom)(struct svc_rqst *);
20 void (*xpo_prep_reply_hdr)(struct svc_rqst *);
21 int (*xpo_sendto)(struct svc_rqst *);
22 void (*xpo_release_rqst)(struct svc_rqst *);
23 void (*xpo_detach)(struct svc_xprt *);
24 void (*xpo_free)(struct svc_xprt *);
25};
26
27struct svc_xprt_class {
28 const char *xcl_name;
29 struct module *xcl_owner;
30 struct svc_xprt_ops *xcl_ops;
31 struct list_head xcl_list;
32 u32 xcl_max_payload;
33};
34
35struct svc_xprt {
36 struct svc_xprt_class *xpt_class;
37 struct svc_xprt_ops *xpt_ops;
38 struct kref xpt_ref;
39 struct list_head xpt_list;
40 struct list_head xpt_ready;
41 unsigned long xpt_flags;
42#define XPT_BUSY 0 /* enqueued/receiving */
43#define XPT_CONN 1 /* conn pending */
44#define XPT_CLOSE 2 /* dead or dying */
45#define XPT_DATA 3 /* data pending */
46#define XPT_TEMP 4 /* connected transport */
47#define XPT_DEAD 6 /* transport closed */
48#define XPT_CHNGBUF 7 /* need to change snd/rcv buf sizes */
49#define XPT_DEFERRED 8 /* deferred request pending */
50#define XPT_OLD 9 /* used for xprt aging mark+sweep */
51#define XPT_DETACHED 10 /* detached from tempsocks list */
52#define XPT_LISTENER 11 /* listening endpoint */
53#define XPT_CACHE_AUTH 12 /* cache auth info */
54
55 struct svc_pool *xpt_pool; /* current pool iff queued */
56 struct svc_serv *xpt_server; /* service for transport */
57 atomic_t xpt_reserved; /* space on outq that is rsvd */
58 struct mutex xpt_mutex; /* to serialize sending data */
59 spinlock_t xpt_lock; /* protects sk_deferred
60 * and xpt_auth_cache */
61 void *xpt_auth_cache;/* auth cache */
62 struct list_head xpt_deferred; /* deferred requests that need
 63 * to be revisited */
64 struct sockaddr_storage xpt_local; /* local address */
65 size_t xpt_locallen; /* length of address */
66 struct sockaddr_storage xpt_remote; /* remote peer's address */
67 size_t xpt_remotelen; /* length of address */
68};
69
70int svc_reg_xprt_class(struct svc_xprt_class *);
71void svc_unreg_xprt_class(struct svc_xprt_class *);
72void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *,
73 struct svc_serv *);
74int svc_create_xprt(struct svc_serv *, char *, unsigned short, int);
75void svc_xprt_enqueue(struct svc_xprt *xprt);
76void svc_xprt_received(struct svc_xprt *);
77void svc_xprt_put(struct svc_xprt *xprt);
78void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt);
79void svc_close_xprt(struct svc_xprt *xprt);
80void svc_delete_xprt(struct svc_xprt *xprt);
81int svc_port_is_privileged(struct sockaddr *sin);
82int svc_print_xprts(char *buf, int maxlen);
83struct svc_xprt *svc_find_xprt(struct svc_serv *, char *, int, int);
84int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen);
85
86static inline void svc_xprt_get(struct svc_xprt *xprt)
87{
88 kref_get(&xprt->xpt_ref);
89}
90static inline void svc_xprt_set_local(struct svc_xprt *xprt,
91 struct sockaddr *sa, int salen)
92{
93 memcpy(&xprt->xpt_local, sa, salen);
94 xprt->xpt_locallen = salen;
95}
96static inline void svc_xprt_set_remote(struct svc_xprt *xprt,
97 struct sockaddr *sa, int salen)
98{
99 memcpy(&xprt->xpt_remote, sa, salen);
100 xprt->xpt_remotelen = salen;
101}
102static inline unsigned short svc_addr_port(struct sockaddr *sa)
103{
104 unsigned short ret = 0;
105 switch (sa->sa_family) {
106 case AF_INET:
107 ret = ntohs(((struct sockaddr_in *)sa)->sin_port);
108 break;
109 case AF_INET6:
110 ret = ntohs(((struct sockaddr_in6 *)sa)->sin6_port);
111 break;
112 }
113 return ret;
114}
115
116static inline size_t svc_addr_len(struct sockaddr *sa)
117{
118 switch (sa->sa_family) {
119 case AF_INET:
120 return sizeof(struct sockaddr_in);
121 case AF_INET6:
122 return sizeof(struct sockaddr_in6);
123 }
124 return -EAFNOSUPPORT;
125}
126
127static inline unsigned short svc_xprt_local_port(struct svc_xprt *xprt)
128{
129 return svc_addr_port((struct sockaddr *)&xprt->xpt_local);
130}
131
132static inline unsigned short svc_xprt_remote_port(struct svc_xprt *xprt)
133{
134 return svc_addr_port((struct sockaddr *)&xprt->xpt_remote);
135}
136
137static inline char *__svc_print_addr(struct sockaddr *addr,
138 char *buf, size_t len)
139{
140 switch (addr->sa_family) {
141 case AF_INET:
142 snprintf(buf, len, "%u.%u.%u.%u, port=%u",
143 NIPQUAD(((struct sockaddr_in *) addr)->sin_addr),
144 ntohs(((struct sockaddr_in *) addr)->sin_port));
145 break;
146
147 case AF_INET6:
148 snprintf(buf, len, "%x:%x:%x:%x:%x:%x:%x:%x, port=%u",
149 NIP6(((struct sockaddr_in6 *) addr)->sin6_addr),
150 ntohs(((struct sockaddr_in6 *) addr)->sin6_port));
151 break;
152
153 default:
154 snprintf(buf, len, "unknown address type: %d", addr->sa_family);
155 break;
156 }
157 return buf;
158}
159#endif /* SUNRPC_SVC_XPRT_H */
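A transport becomes usable by name (to svc_create_xprt() and the nfsd portlist file) once it registers a svc_xprt_class whose ops table supplies the nine callbacks above. A compile-oriented sketch with do-nothing stubs; "demo" and every demo_* symbol are made up for illustration, not an in-tree transport:

	#include <linux/module.h>
	#include <linux/sunrpc/svc_xprt.h>

	static struct svc_xprt *demo_create(struct svc_serv *serv,
					    struct sockaddr *sa, int len, int flags)
	{
		return NULL;	/* a real transport allocates one and calls svc_xprt_init() */
	}
	static struct svc_xprt *demo_accept(struct svc_xprt *xprt) { return NULL; }
	static int demo_has_wspace(struct svc_xprt *xprt) { return 1; }
	static int demo_recvfrom(struct svc_rqst *rqstp) { return 0; }
	static void demo_prep_reply_hdr(struct svc_rqst *rqstp) { }
	static int demo_sendto(struct svc_rqst *rqstp) { return 0; }
	static void demo_release_rqst(struct svc_rqst *rqstp) { }
	static void demo_detach(struct svc_xprt *xprt) { }
	static void demo_free(struct svc_xprt *xprt) { }

	static struct svc_xprt_ops demo_ops = {
		.xpo_create		= demo_create,
		.xpo_accept		= demo_accept,
		.xpo_has_wspace		= demo_has_wspace,
		.xpo_recvfrom		= demo_recvfrom,
		.xpo_prep_reply_hdr	= demo_prep_reply_hdr,
		.xpo_sendto		= demo_sendto,
		.xpo_release_rqst	= demo_release_rqst,
		.xpo_detach		= demo_detach,
		.xpo_free		= demo_free,
	};

	static struct svc_xprt_class demo_class = {
		.xcl_name	 = "demo",
		.xcl_owner	 = THIS_MODULE,
		.xcl_ops	 = &demo_ops,
		.xcl_max_payload = 1 << 15,	/* arbitrary for the sketch */
	};

	static int __init demo_init(void)
	{
		return svc_reg_xprt_class(&demo_class);
	}
	static void __exit demo_exit(void)
	{
		svc_unreg_xprt_class(&demo_class);
	}
	module_init(demo_init);
	module_exit(demo_exit);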
diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index a53e0fa855d2..206f092ad4c7 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -10,42 +10,16 @@
10#define SUNRPC_SVCSOCK_H 10#define SUNRPC_SVCSOCK_H
11 11
12#include <linux/sunrpc/svc.h> 12#include <linux/sunrpc/svc.h>
13#include <linux/sunrpc/svc_xprt.h>
13 14
14/* 15/*
15 * RPC server socket. 16 * RPC server socket.
16 */ 17 */
17struct svc_sock { 18struct svc_sock {
18 struct list_head sk_ready; /* list of ready sockets */ 19 struct svc_xprt sk_xprt;
19 struct list_head sk_list; /* list of all sockets */
20 struct socket * sk_sock; /* berkeley socket layer */ 20 struct socket * sk_sock; /* berkeley socket layer */
21 struct sock * sk_sk; /* INET layer */ 21 struct sock * sk_sk; /* INET layer */
22 22
23 struct svc_pool * sk_pool; /* current pool iff queued */
24 struct svc_serv * sk_server; /* service for this socket */
25 atomic_t sk_inuse; /* use count */
26 unsigned long sk_flags;
27#define SK_BUSY 0 /* enqueued/receiving */
28#define SK_CONN 1 /* conn pending */
29#define SK_CLOSE 2 /* dead or dying */
30#define SK_DATA 3 /* data pending */
31#define SK_TEMP 4 /* temp (TCP) socket */
32#define SK_DEAD 6 /* socket closed */
33#define SK_CHNGBUF 7 /* need to change snd/rcv buffer sizes */
34#define SK_DEFERRED 8 /* request on sk_deferred */
35#define SK_OLD 9 /* used for temp socket aging mark+sweep */
36#define SK_DETACHED 10 /* detached from tempsocks list */
37
38 atomic_t sk_reserved; /* space on outq that is reserved */
39
40 spinlock_t sk_lock; /* protects sk_deferred and
41 * sk_info_authunix */
42 struct list_head sk_deferred; /* deferred requests that need to
43 * be revisted */
44 struct mutex sk_mutex; /* to serialize sending data */
45
46 int (*sk_recvfrom)(struct svc_rqst *rqstp);
47 int (*sk_sendto)(struct svc_rqst *rqstp);
48
49 /* We keep the old state_change and data_ready CB's here */ 23 /* We keep the old state_change and data_ready CB's here */
50 void (*sk_ostate)(struct sock *); 24 void (*sk_ostate)(struct sock *);
51 void (*sk_odata)(struct sock *, int bytes); 25 void (*sk_odata)(struct sock *, int bytes);
@@ -54,21 +28,12 @@ struct svc_sock {
54 /* private TCP part */ 28 /* private TCP part */
55 int sk_reclen; /* length of record */ 29 int sk_reclen; /* length of record */
56 int sk_tcplen; /* current read length */ 30 int sk_tcplen; /* current read length */
57 time_t sk_lastrecv; /* time of last received request */
58
59 /* cache of various info for TCP sockets */
60 void *sk_info_authunix;
61
62 struct sockaddr_storage sk_local; /* local address */
63 struct sockaddr_storage sk_remote; /* remote peer's address */
64 int sk_remotelen; /* length of address */
65}; 31};
66 32
67/* 33/*
68 * Function prototypes. 34 * Function prototypes.
69 */ 35 */
70int svc_makesock(struct svc_serv *, int, unsigned short, int flags); 36void svc_close_all(struct list_head *);
71void svc_force_close_socket(struct svc_sock *);
72int svc_recv(struct svc_rqst *, long); 37int svc_recv(struct svc_rqst *, long);
73int svc_send(struct svc_rqst *); 38int svc_send(struct svc_rqst *);
74void svc_drop(struct svc_rqst *); 39void svc_drop(struct svc_rqst *);
@@ -78,6 +43,8 @@ int svc_addsock(struct svc_serv *serv,
78 int fd, 43 int fd,
79 char *name_return, 44 char *name_return,
80 int *proto); 45 int *proto);
46void svc_init_xprt_sock(void);
47void svc_cleanup_xprt_sock(void);
81 48
82/* 49/*
83 * svc_makesock socket characteristics 50 * svc_makesock socket characteristics
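With the generic state moved into the embedded sk_xprt, struct svc_sock is left holding only Berkeley-socket specifics; wherever a routine is handed the generic struct svc_xprt it recovers the wrapper with container_of(), the usual pattern for this kind of embedding. A one-line sketch (the helper name is made up):

	static inline struct svc_sock *svc_sock_of(struct svc_xprt *xprt)
	{
		return container_of(xprt, struct svc_sock, sk_xprt);
	}
	/* e.g. inside an xpo_recvfrom handler:
	 *	struct svc_sock *svsk = svc_sock_of(rqstp->rq_xprt);
	 */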
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 0751c9464d0f..e4057d729f03 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -112,7 +112,8 @@ struct xdr_buf {
112__be32 *xdr_encode_opaque_fixed(__be32 *p, const void *ptr, unsigned int len); 112__be32 *xdr_encode_opaque_fixed(__be32 *p, const void *ptr, unsigned int len);
113__be32 *xdr_encode_opaque(__be32 *p, const void *ptr, unsigned int len); 113__be32 *xdr_encode_opaque(__be32 *p, const void *ptr, unsigned int len);
114__be32 *xdr_encode_string(__be32 *p, const char *s); 114__be32 *xdr_encode_string(__be32 *p, const char *s);
115__be32 *xdr_decode_string_inplace(__be32 *p, char **sp, int *lenp, int maxlen); 115__be32 *xdr_decode_string_inplace(__be32 *p, char **sp, unsigned int *lenp,
116 unsigned int maxlen);
116__be32 *xdr_encode_netobj(__be32 *p, const struct xdr_netobj *); 117__be32 *xdr_encode_netobj(__be32 *p, const struct xdr_netobj *);
117__be32 *xdr_decode_netobj(__be32 *p, struct xdr_netobj *); 118__be32 *xdr_decode_netobj(__be32 *p, struct xdr_netobj *);
118 119
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 5c69a725e530..92e1dbe50947 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -11,6 +11,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
11 auth.o auth_null.o auth_unix.o \ 11 auth.o auth_null.o auth_unix.o \
12 svc.o svcsock.o svcauth.o svcauth_unix.o \ 12 svc.o svcsock.o svcauth.o svcauth_unix.o \
13 rpcb_clnt.o timer.o xdr.o \ 13 rpcb_clnt.o timer.o xdr.o \
14 sunrpc_syms.o cache.o rpc_pipe.o 14 sunrpc_syms.o cache.o rpc_pipe.o \
15 svc_xprt.o
15sunrpc-$(CONFIG_PROC_FS) += stats.o 16sunrpc-$(CONFIG_PROC_FS) += stats.o
16sunrpc-$(CONFIG_SYSCTL) += sysctl.o 17sunrpc-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 73940df6c460..481f984e9a22 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -224,38 +224,34 @@ static int rsi_parse(struct cache_detail *cd,
224 224
225 /* major/minor */ 225 /* major/minor */
226 len = qword_get(&mesg, buf, mlen); 226 len = qword_get(&mesg, buf, mlen);
227 if (len < 0) 227 if (len <= 0)
228 goto out; 228 goto out;
229 if (len == 0) { 229 rsii.major_status = simple_strtoul(buf, &ep, 10);
230 if (*ep)
231 goto out;
232 len = qword_get(&mesg, buf, mlen);
233 if (len <= 0)
234 goto out;
235 rsii.minor_status = simple_strtoul(buf, &ep, 10);
236 if (*ep)
230 goto out; 237 goto out;
231 } else {
232 rsii.major_status = simple_strtoul(buf, &ep, 10);
233 if (*ep)
234 goto out;
235 len = qword_get(&mesg, buf, mlen);
236 if (len <= 0)
237 goto out;
238 rsii.minor_status = simple_strtoul(buf, &ep, 10);
239 if (*ep)
240 goto out;
241 238
242 /* out_handle */ 239 /* out_handle */
243 len = qword_get(&mesg, buf, mlen); 240 len = qword_get(&mesg, buf, mlen);
244 if (len < 0) 241 if (len < 0)
245 goto out; 242 goto out;
246 status = -ENOMEM; 243 status = -ENOMEM;
247 if (dup_to_netobj(&rsii.out_handle, buf, len)) 244 if (dup_to_netobj(&rsii.out_handle, buf, len))
248 goto out; 245 goto out;
249 246
250 /* out_token */ 247 /* out_token */
251 len = qword_get(&mesg, buf, mlen); 248 len = qword_get(&mesg, buf, mlen);
252 status = -EINVAL; 249 status = -EINVAL;
253 if (len < 0) 250 if (len < 0)
254 goto out; 251 goto out;
255 status = -ENOMEM; 252 status = -ENOMEM;
256 if (dup_to_netobj(&rsii.out_token, buf, len)) 253 if (dup_to_netobj(&rsii.out_token, buf, len))
257 goto out; 254 goto out;
258 }
259 rsii.h.expiry_time = expiry; 255 rsii.h.expiry_time = expiry;
260 rsip = rsi_update(&rsii, rsip); 256 rsip = rsi_update(&rsii, rsip);
261 status = 0; 257 status = 0;
@@ -975,6 +971,7 @@ static int svcauth_gss_handle_init(struct svc_rqst *rqstp,
975 struct kvec *resv = &rqstp->rq_res.head[0]; 971 struct kvec *resv = &rqstp->rq_res.head[0];
976 struct xdr_netobj tmpobj; 972 struct xdr_netobj tmpobj;
977 struct rsi *rsip, rsikey; 973 struct rsi *rsip, rsikey;
974 int ret;
978 975
979 /* Read the verifier; should be NULL: */ 976 /* Read the verifier; should be NULL: */
980 *authp = rpc_autherr_badverf; 977 *authp = rpc_autherr_badverf;
@@ -1014,23 +1011,27 @@ static int svcauth_gss_handle_init(struct svc_rqst *rqstp,
1014 /* No upcall result: */ 1011 /* No upcall result: */
1015 return SVC_DROP; 1012 return SVC_DROP;
1016 case 0: 1013 case 0:
1014 ret = SVC_DROP;
1017 /* Got an answer to the upcall; use it: */ 1015 /* Got an answer to the upcall; use it: */
1018 if (gss_write_init_verf(rqstp, rsip)) 1016 if (gss_write_init_verf(rqstp, rsip))
1019 return SVC_DROP; 1017 goto out;
1020 if (resv->iov_len + 4 > PAGE_SIZE) 1018 if (resv->iov_len + 4 > PAGE_SIZE)
1021 return SVC_DROP; 1019 goto out;
1022 svc_putnl(resv, RPC_SUCCESS); 1020 svc_putnl(resv, RPC_SUCCESS);
1023 if (svc_safe_putnetobj(resv, &rsip->out_handle)) 1021 if (svc_safe_putnetobj(resv, &rsip->out_handle))
1024 return SVC_DROP; 1022 goto out;
1025 if (resv->iov_len + 3 * 4 > PAGE_SIZE) 1023 if (resv->iov_len + 3 * 4 > PAGE_SIZE)
1026 return SVC_DROP; 1024 goto out;
1027 svc_putnl(resv, rsip->major_status); 1025 svc_putnl(resv, rsip->major_status);
1028 svc_putnl(resv, rsip->minor_status); 1026 svc_putnl(resv, rsip->minor_status);
1029 svc_putnl(resv, GSS_SEQ_WIN); 1027 svc_putnl(resv, GSS_SEQ_WIN);
1030 if (svc_safe_putnetobj(resv, &rsip->out_token)) 1028 if (svc_safe_putnetobj(resv, &rsip->out_token))
1031 return SVC_DROP; 1029 goto out;
1032 } 1030 }
1033 return SVC_COMPLETE; 1031 ret = SVC_COMPLETE;
1032out:
1033 cache_put(&rsip->h, &rsi_cache);
1034 return ret;
1034} 1035}
1035 1036
1036/* 1037/*
@@ -1125,6 +1126,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
1125 case RPC_GSS_PROC_DESTROY: 1126 case RPC_GSS_PROC_DESTROY:
1126 if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) 1127 if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
1127 goto auth_err; 1128 goto auth_err;
1129 rsci->h.expiry_time = get_seconds();
1128 set_bit(CACHE_NEGATIVE, &rsci->h.flags); 1130 set_bit(CACHE_NEGATIVE, &rsci->h.flags);
1129 if (resv->iov_len + 4 > PAGE_SIZE) 1131 if (resv->iov_len + 4 > PAGE_SIZE)
1130 goto drop; 1132 goto drop;
@@ -1386,19 +1388,26 @@ int
1386gss_svc_init(void) 1388gss_svc_init(void)
1387{ 1389{
1388 int rv = svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss); 1390 int rv = svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss);
1389 if (rv == 0) { 1391 if (rv)
1390 cache_register(&rsc_cache); 1392 return rv;
1391 cache_register(&rsi_cache); 1393 rv = cache_register(&rsc_cache);
1392 } 1394 if (rv)
1395 goto out1;
1396 rv = cache_register(&rsi_cache);
1397 if (rv)
1398 goto out2;
1399 return 0;
1400out2:
1401 cache_unregister(&rsc_cache);
1402out1:
1403 svc_auth_unregister(RPC_AUTH_GSS);
1393 return rv; 1404 return rv;
1394} 1405}
1395 1406
1396void 1407void
1397gss_svc_shutdown(void) 1408gss_svc_shutdown(void)
1398{ 1409{
1399 if (cache_unregister(&rsc_cache)) 1410 cache_unregister(&rsc_cache);
1400 printk(KERN_ERR "auth_rpcgss: failed to unregister rsc cache\n"); 1411 cache_unregister(&rsi_cache);
1401 if (cache_unregister(&rsi_cache))
1402 printk(KERN_ERR "auth_rpcgss: failed to unregister rsi cache\n");
1403 svc_auth_unregister(RPC_AUTH_GSS); 1412 svc_auth_unregister(RPC_AUTH_GSS);
1404} 1413}
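
The reworked gss_svc_init() is easier to follow when read as a whole. Consolidated from the right-hand column of the hunk above, the new function registers the auth flavour first and unwinds each earlier registration if a later one fails:

	int
	gss_svc_init(void)
	{
		int rv = svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss);
		if (rv)
			return rv;
		rv = cache_register(&rsc_cache);
		if (rv)
			goto out1;
		rv = cache_register(&rsi_cache);
		if (rv)
			goto out2;
		return 0;
	out2:
		cache_unregister(&rsc_cache);
	out1:
		svc_auth_unregister(RPC_AUTH_GSS);
		return rv;
	}

gss_svc_shutdown() simply mirrors this in reverse, since cache_unregister() no longer reports failure.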
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 73f053d0cc7a..636c8e04e0be 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -245,6 +245,7 @@ int cache_check(struct cache_detail *detail,
245 cache_put(h, detail); 245 cache_put(h, detail);
246 return rv; 246 return rv;
247} 247}
248EXPORT_SYMBOL(cache_check);
248 249
249/* 250/*
250 * caches need to be periodically cleaned. 251 * caches need to be periodically cleaned.
@@ -290,44 +291,78 @@ static const struct file_operations cache_flush_operations;
290static void do_cache_clean(struct work_struct *work); 291static void do_cache_clean(struct work_struct *work);
291static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean); 292static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean);
292 293
293void cache_register(struct cache_detail *cd) 294static void remove_cache_proc_entries(struct cache_detail *cd)
294{ 295{
295 cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc); 296 if (cd->proc_ent == NULL)
296 if (cd->proc_ent) { 297 return;
297 struct proc_dir_entry *p; 298 if (cd->flush_ent)
298 cd->proc_ent->owner = cd->owner; 299 remove_proc_entry("flush", cd->proc_ent);
299 cd->channel_ent = cd->content_ent = NULL; 300 if (cd->channel_ent)
301 remove_proc_entry("channel", cd->proc_ent);
302 if (cd->content_ent)
303 remove_proc_entry("content", cd->proc_ent);
304 cd->proc_ent = NULL;
305 remove_proc_entry(cd->name, proc_net_rpc);
306}
300 307
301 p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR, 308#ifdef CONFIG_PROC_FS
302 cd->proc_ent); 309static int create_cache_proc_entries(struct cache_detail *cd)
303 cd->flush_ent = p; 310{
304 if (p) { 311 struct proc_dir_entry *p;
305 p->proc_fops = &cache_flush_operations;
306 p->owner = cd->owner;
307 p->data = cd;
308 }
309 312
310 if (cd->cache_request || cd->cache_parse) { 313 cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc);
311 p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR, 314 if (cd->proc_ent == NULL)
312 cd->proc_ent); 315 goto out_nomem;
313 cd->channel_ent = p; 316 cd->proc_ent->owner = cd->owner;
314 if (p) { 317 cd->channel_ent = cd->content_ent = NULL;
315 p->proc_fops = &cache_file_operations; 318
316 p->owner = cd->owner; 319 p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR, cd->proc_ent);
317 p->data = cd; 320 cd->flush_ent = p;
318 } 321 if (p == NULL)
319 } 322 goto out_nomem;
320 if (cd->cache_show) { 323 p->proc_fops = &cache_flush_operations;
321 p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR, 324 p->owner = cd->owner;
322 cd->proc_ent); 325 p->data = cd;
323 cd->content_ent = p; 326
324 if (p) { 327 if (cd->cache_request || cd->cache_parse) {
325 p->proc_fops = &content_file_operations; 328 p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR,
326 p->owner = cd->owner; 329 cd->proc_ent);
327 p->data = cd; 330 cd->channel_ent = p;
328 } 331 if (p == NULL)
329 } 332 goto out_nomem;
333 p->proc_fops = &cache_file_operations;
334 p->owner = cd->owner;
335 p->data = cd;
330 } 336 }
337 if (cd->cache_show) {
338 p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR,
339 cd->proc_ent);
340 cd->content_ent = p;
341 if (p == NULL)
342 goto out_nomem;
343 p->proc_fops = &content_file_operations;
344 p->owner = cd->owner;
345 p->data = cd;
346 }
347 return 0;
348out_nomem:
349 remove_cache_proc_entries(cd);
350 return -ENOMEM;
351}
352#else /* CONFIG_PROC_FS */
353static int create_cache_proc_entries(struct cache_detail *cd)
354{
355 return 0;
356}
357#endif
358
359int cache_register(struct cache_detail *cd)
360{
361 int ret;
362
363 ret = create_cache_proc_entries(cd);
364 if (ret)
365 return ret;
331 rwlock_init(&cd->hash_lock); 366 rwlock_init(&cd->hash_lock);
332 INIT_LIST_HEAD(&cd->queue); 367 INIT_LIST_HEAD(&cd->queue);
333 spin_lock(&cache_list_lock); 368 spin_lock(&cache_list_lock);
@@ -341,9 +376,11 @@ void cache_register(struct cache_detail *cd)
341 376
342 /* start the cleaning process */ 377 /* start the cleaning process */
343 schedule_delayed_work(&cache_cleaner, 0); 378 schedule_delayed_work(&cache_cleaner, 0);
379 return 0;
344} 380}
381EXPORT_SYMBOL(cache_register);
345 382
346int cache_unregister(struct cache_detail *cd) 383void cache_unregister(struct cache_detail *cd)
347{ 384{
348 cache_purge(cd); 385 cache_purge(cd);
349 spin_lock(&cache_list_lock); 386 spin_lock(&cache_list_lock);
@@ -351,30 +388,23 @@ int cache_unregister(struct cache_detail *cd)
351 if (cd->entries || atomic_read(&cd->inuse)) { 388 if (cd->entries || atomic_read(&cd->inuse)) {
352 write_unlock(&cd->hash_lock); 389 write_unlock(&cd->hash_lock);
353 spin_unlock(&cache_list_lock); 390 spin_unlock(&cache_list_lock);
354 return -EBUSY; 391 goto out;
355 } 392 }
356 if (current_detail == cd) 393 if (current_detail == cd)
357 current_detail = NULL; 394 current_detail = NULL;
358 list_del_init(&cd->others); 395 list_del_init(&cd->others);
359 write_unlock(&cd->hash_lock); 396 write_unlock(&cd->hash_lock);
360 spin_unlock(&cache_list_lock); 397 spin_unlock(&cache_list_lock);
361 if (cd->proc_ent) { 398 remove_cache_proc_entries(cd);
362 if (cd->flush_ent)
363 remove_proc_entry("flush", cd->proc_ent);
364 if (cd->channel_ent)
365 remove_proc_entry("channel", cd->proc_ent);
366 if (cd->content_ent)
367 remove_proc_entry("content", cd->proc_ent);
368
369 cd->proc_ent = NULL;
370 remove_proc_entry(cd->name, proc_net_rpc);
371 }
372 if (list_empty(&cache_list)) { 399 if (list_empty(&cache_list)) {
373 /* module must be being unloaded so it's safe to kill the worker */ 400 /* module must be being unloaded so it's safe to kill the worker */
374 cancel_delayed_work_sync(&cache_cleaner); 401 cancel_delayed_work_sync(&cache_cleaner);
375 } 402 }
376 return 0; 403 return;
404out:
405 printk(KERN_ERR "nfsd: failed to unregister %s cache\n", cd->name);
377} 406}
407EXPORT_SYMBOL(cache_unregister);
378 408
379/* clean cache tries to find something to clean 409/* clean cache tries to find something to clean
380 * and cleans it. 410 * and cleans it.
@@ -489,6 +519,7 @@ void cache_flush(void)
489 while (cache_clean() != -1) 519 while (cache_clean() != -1)
490 cond_resched(); 520 cond_resched();
491} 521}
522EXPORT_SYMBOL(cache_flush);
492 523
493void cache_purge(struct cache_detail *detail) 524void cache_purge(struct cache_detail *detail)
494{ 525{
@@ -497,7 +528,7 @@ void cache_purge(struct cache_detail *detail)
497 cache_flush(); 528 cache_flush();
498 detail->flush_time = 1; 529 detail->flush_time = 1;
499} 530}
500 531EXPORT_SYMBOL(cache_purge);
501 532
502 533
503/* 534/*
@@ -634,13 +665,13 @@ void cache_clean_deferred(void *owner)
634/* 665/*
635 * communicate with user-space 666 * communicate with user-space
636 * 667 *
637 * We have a magic /proc file - /proc/sunrpc/cache 668 * We have a magic /proc file - /proc/sunrpc/<cachename>/channel.
638 * On read, you get a full request, or block 669 * On read, you get a full request, or block.
639 * On write, an update request is processed 670 * On write, an update request is processed.
640 * Poll works if anything to read, and always allows write 671 * Poll works if anything to read, and always allows write.
641 * 672 *
642 * Implemented by linked list of requests. Each open file has 673 * Implemented by linked list of requests. Each open file has
643 * a ->private that also exists in this list. New request are added 674 * a ->private that also exists in this list. New requests are added
644 * to the end and may wake up any preceding readers. 675 * to the end and may wake up any preceding readers.
645 * New readers are added to the head. If, on read, an item is found with 676 * New readers are added to the head. If, on read, an item is found with
646 * CACHE_UPCALLING clear, we free it from the list. 677 * CACHE_UPCALLING clear, we free it from the list.
@@ -963,6 +994,7 @@ void qword_add(char **bpp, int *lp, char *str)
963 *bpp = bp; 994 *bpp = bp;
964 *lp = len; 995 *lp = len;
965} 996}
997EXPORT_SYMBOL(qword_add);
966 998
967void qword_addhex(char **bpp, int *lp, char *buf, int blen) 999void qword_addhex(char **bpp, int *lp, char *buf, int blen)
968{ 1000{
@@ -991,6 +1023,7 @@ void qword_addhex(char **bpp, int *lp, char *buf, int blen)
991 *bpp = bp; 1023 *bpp = bp;
992 *lp = len; 1024 *lp = len;
993} 1025}
1026EXPORT_SYMBOL(qword_addhex);
994 1027
995static void warn_no_listener(struct cache_detail *detail) 1028static void warn_no_listener(struct cache_detail *detail)
996{ 1029{
@@ -1113,6 +1146,7 @@ int qword_get(char **bpp, char *dest, int bufsize)
1113 *dest = '\0'; 1146 *dest = '\0';
1114 return len; 1147 return len;
1115} 1148}
1149EXPORT_SYMBOL(qword_get);
1116 1150
1117 1151
1118/* 1152/*
@@ -1244,18 +1278,18 @@ static ssize_t read_flush(struct file *file, char __user *buf,
1244 struct cache_detail *cd = PDE(file->f_path.dentry->d_inode)->data; 1278 struct cache_detail *cd = PDE(file->f_path.dentry->d_inode)->data;
1245 char tbuf[20]; 1279 char tbuf[20];
1246 unsigned long p = *ppos; 1280 unsigned long p = *ppos;
1247 int len; 1281 size_t len;
1248 1282
1249 sprintf(tbuf, "%lu\n", cd->flush_time); 1283 sprintf(tbuf, "%lu\n", cd->flush_time);
1250 len = strlen(tbuf); 1284 len = strlen(tbuf);
1251 if (p >= len) 1285 if (p >= len)
1252 return 0; 1286 return 0;
1253 len -= p; 1287 len -= p;
1254 if (len > count) len = count; 1288 if (len > count)
1289 len = count;
1255 if (copy_to_user(buf, (void*)(tbuf+p), len)) 1290 if (copy_to_user(buf, (void*)(tbuf+p), len))
1256 len = -EFAULT; 1291 return -EFAULT;
1257 else 1292 *ppos += len;
1258 *ppos += len;
1259 return len; 1293 return len;
1260} 1294}
1261 1295
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 74df2d358e61..5a16875f5ac8 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -33,7 +33,7 @@ struct proc_dir_entry *proc_net_rpc = NULL;
33static int rpc_proc_show(struct seq_file *seq, void *v) { 33static int rpc_proc_show(struct seq_file *seq, void *v) {
34 const struct rpc_stat *statp = seq->private; 34 const struct rpc_stat *statp = seq->private;
35 const struct rpc_program *prog = statp->program; 35 const struct rpc_program *prog = statp->program;
36 int i, j; 36 unsigned int i, j;
37 37
38 seq_printf(seq, 38 seq_printf(seq,
39 "net %u %u %u %u\n", 39 "net %u %u %u %u\n",
@@ -81,7 +81,7 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) {
81 const struct svc_program *prog = statp->program; 81 const struct svc_program *prog = statp->program;
82 const struct svc_procedure *proc; 82 const struct svc_procedure *proc;
83 const struct svc_version *vers; 83 const struct svc_version *vers;
84 int i, j; 84 unsigned int i, j;
85 85
86 seq_printf(seq, 86 seq_printf(seq,
87 "net %u %u %u %u\n", 87 "net %u %u %u %u\n",
@@ -106,6 +106,7 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) {
106 seq_putc(seq, '\n'); 106 seq_putc(seq, '\n');
107 } 107 }
108} 108}
109EXPORT_SYMBOL(svc_seq_show);
109 110
110/** 111/**
111 * rpc_alloc_iostats - allocate an rpc_iostats structure 112 * rpc_alloc_iostats - allocate an rpc_iostats structure
@@ -255,12 +256,14 @@ svc_proc_register(struct svc_stat *statp, const struct file_operations *fops)
255{ 256{
256 return do_register(statp->program->pg_name, statp, fops); 257 return do_register(statp->program->pg_name, statp, fops);
257} 258}
259EXPORT_SYMBOL(svc_proc_register);
258 260
259void 261void
260svc_proc_unregister(const char *name) 262svc_proc_unregister(const char *name)
261{ 263{
262 remove_proc_entry(name, proc_net_rpc); 264 remove_proc_entry(name, proc_net_rpc);
263} 265}
266EXPORT_SYMBOL(svc_proc_unregister);
264 267
265void 268void
266rpc_proc_init(void) 269rpc_proc_init(void)
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 1a7e309d008b..843629f55763 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -22,48 +22,6 @@
22#include <linux/sunrpc/rpc_pipe_fs.h> 22#include <linux/sunrpc/rpc_pipe_fs.h>
23#include <linux/sunrpc/xprtsock.h> 23#include <linux/sunrpc/xprtsock.h>
24 24
25/* RPC server stuff */
26EXPORT_SYMBOL(svc_create);
27EXPORT_SYMBOL(svc_create_thread);
28EXPORT_SYMBOL(svc_create_pooled);
29EXPORT_SYMBOL(svc_set_num_threads);
30EXPORT_SYMBOL(svc_exit_thread);
31EXPORT_SYMBOL(svc_destroy);
32EXPORT_SYMBOL(svc_drop);
33EXPORT_SYMBOL(svc_process);
34EXPORT_SYMBOL(svc_recv);
35EXPORT_SYMBOL(svc_wake_up);
36EXPORT_SYMBOL(svc_makesock);
37EXPORT_SYMBOL(svc_reserve);
38EXPORT_SYMBOL(svc_auth_register);
39EXPORT_SYMBOL(auth_domain_lookup);
40EXPORT_SYMBOL(svc_authenticate);
41EXPORT_SYMBOL(svc_set_client);
42
43/* RPC statistics */
44#ifdef CONFIG_PROC_FS
45EXPORT_SYMBOL(svc_proc_register);
46EXPORT_SYMBOL(svc_proc_unregister);
47EXPORT_SYMBOL(svc_seq_show);
48#endif
49
50/* caching... */
51EXPORT_SYMBOL(auth_domain_find);
52EXPORT_SYMBOL(auth_domain_put);
53EXPORT_SYMBOL(auth_unix_add_addr);
54EXPORT_SYMBOL(auth_unix_forget_old);
55EXPORT_SYMBOL(auth_unix_lookup);
56EXPORT_SYMBOL(cache_check);
57EXPORT_SYMBOL(cache_flush);
58EXPORT_SYMBOL(cache_purge);
59EXPORT_SYMBOL(cache_register);
60EXPORT_SYMBOL(cache_unregister);
61EXPORT_SYMBOL(qword_add);
62EXPORT_SYMBOL(qword_addhex);
63EXPORT_SYMBOL(qword_get);
64EXPORT_SYMBOL(svcauth_unix_purge);
65EXPORT_SYMBOL(unix_domain_find);
66
67extern struct cache_detail ip_map_cache, unix_gid_cache; 25extern struct cache_detail ip_map_cache, unix_gid_cache;
68 26
69static int __init 27static int __init
@@ -85,7 +43,8 @@ init_sunrpc(void)
85#endif 43#endif
86 cache_register(&ip_map_cache); 44 cache_register(&ip_map_cache);
87 cache_register(&unix_gid_cache); 45 cache_register(&unix_gid_cache);
88 init_socket_xprt(); 46 svc_init_xprt_sock(); /* svc sock transport */
47 init_socket_xprt(); /* clnt sock transport */
89 rpcauth_init_module(); 48 rpcauth_init_module();
90out: 49out:
91 return err; 50 return err;
@@ -96,12 +55,11 @@ cleanup_sunrpc(void)
96{ 55{
97 rpcauth_remove_module(); 56 rpcauth_remove_module();
98 cleanup_socket_xprt(); 57 cleanup_socket_xprt();
58 svc_cleanup_xprt_sock();
99 unregister_rpc_pipefs(); 59 unregister_rpc_pipefs();
100 rpc_destroy_mempool(); 60 rpc_destroy_mempool();
101 if (cache_unregister(&ip_map_cache)) 61 cache_unregister(&ip_map_cache);
102 printk(KERN_ERR "sunrpc: failed to unregister ip_map cache\n"); 62 cache_unregister(&unix_gid_cache);
103 if (cache_unregister(&unix_gid_cache))
104 printk(KERN_ERR "sunrpc: failed to unregister unix_gid cache\n");
105#ifdef RPC_DEBUG 63#ifdef RPC_DEBUG
106 rpc_unregister_sysctl(); 64 rpc_unregister_sysctl();
107#endif 65#endif
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 4ad5fbbb18b4..a290e1523297 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -364,7 +364,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
364 void (*shutdown)(struct svc_serv *serv)) 364 void (*shutdown)(struct svc_serv *serv))
365{ 365{
366 struct svc_serv *serv; 366 struct svc_serv *serv;
367 int vers; 367 unsigned int vers;
368 unsigned int xdrsize; 368 unsigned int xdrsize;
369 unsigned int i; 369 unsigned int i;
370 370
@@ -433,6 +433,7 @@ svc_create(struct svc_program *prog, unsigned int bufsize,
433{ 433{
434 return __svc_create(prog, bufsize, /*npools*/1, shutdown); 434 return __svc_create(prog, bufsize, /*npools*/1, shutdown);
435} 435}
436EXPORT_SYMBOL(svc_create);
436 437
437struct svc_serv * 438struct svc_serv *
438svc_create_pooled(struct svc_program *prog, unsigned int bufsize, 439svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
@@ -452,6 +453,7 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
452 453
453 return serv; 454 return serv;
454} 455}
456EXPORT_SYMBOL(svc_create_pooled);
455 457
456/* 458/*
457 * Destroy an RPC service. Should be called with the BKL held 459 * Destroy an RPC service. Should be called with the BKL held
@@ -459,9 +461,6 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
459void 461void
460svc_destroy(struct svc_serv *serv) 462svc_destroy(struct svc_serv *serv)
461{ 463{
462 struct svc_sock *svsk;
463 struct svc_sock *tmp;
464
465 dprintk("svc: svc_destroy(%s, %d)\n", 464 dprintk("svc: svc_destroy(%s, %d)\n",
466 serv->sv_program->pg_name, 465 serv->sv_program->pg_name,
467 serv->sv_nrthreads); 466 serv->sv_nrthreads);
@@ -476,14 +475,12 @@ svc_destroy(struct svc_serv *serv)
476 475
477 del_timer_sync(&serv->sv_temptimer); 476 del_timer_sync(&serv->sv_temptimer);
478 477
479 list_for_each_entry_safe(svsk, tmp, &serv->sv_tempsocks, sk_list) 478 svc_close_all(&serv->sv_tempsocks);
480 svc_force_close_socket(svsk);
481 479
482 if (serv->sv_shutdown) 480 if (serv->sv_shutdown)
483 serv->sv_shutdown(serv); 481 serv->sv_shutdown(serv);
484 482
485 list_for_each_entry_safe(svsk, tmp, &serv->sv_permsocks, sk_list) 483 svc_close_all(&serv->sv_permsocks);
486 svc_force_close_socket(svsk);
487 484
488 BUG_ON(!list_empty(&serv->sv_permsocks)); 485 BUG_ON(!list_empty(&serv->sv_permsocks));
489 BUG_ON(!list_empty(&serv->sv_tempsocks)); 486 BUG_ON(!list_empty(&serv->sv_tempsocks));
@@ -498,6 +495,7 @@ svc_destroy(struct svc_serv *serv)
498 kfree(serv->sv_pools); 495 kfree(serv->sv_pools);
499 kfree(serv); 496 kfree(serv);
500} 497}
498EXPORT_SYMBOL(svc_destroy);
501 499
502/* 500/*
503 * Allocate an RPC server's buffer space. 501 * Allocate an RPC server's buffer space.
@@ -536,31 +534,17 @@ svc_release_buffer(struct svc_rqst *rqstp)
536 put_page(rqstp->rq_pages[i]); 534 put_page(rqstp->rq_pages[i]);
537} 535}
538 536
539/* 537struct svc_rqst *
540 * Create a thread in the given pool. Caller must hold BKL. 538svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool)
541 * On a NUMA or SMP machine, with a multi-pool serv, the thread
542 * will be restricted to run on the cpus belonging to the pool.
543 */
544static int
545__svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
546 struct svc_pool *pool)
547{ 539{
548 struct svc_rqst *rqstp; 540 struct svc_rqst *rqstp;
549 int error = -ENOMEM;
550 int have_oldmask = 0;
551 cpumask_t oldmask;
552 541
553 rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL); 542 rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
554 if (!rqstp) 543 if (!rqstp)
555 goto out; 544 goto out_enomem;
556 545
557 init_waitqueue_head(&rqstp->rq_wait); 546 init_waitqueue_head(&rqstp->rq_wait);
558 547
559 if (!(rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL))
560 || !(rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL))
561 || !svc_init_buffer(rqstp, serv->sv_max_mesg))
562 goto out_thread;
563
564 serv->sv_nrthreads++; 548 serv->sv_nrthreads++;
565 spin_lock_bh(&pool->sp_lock); 549 spin_lock_bh(&pool->sp_lock);
566 pool->sp_nrthreads++; 550 pool->sp_nrthreads++;
@@ -569,6 +553,45 @@ __svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
569 rqstp->rq_server = serv; 553 rqstp->rq_server = serv;
570 rqstp->rq_pool = pool; 554 rqstp->rq_pool = pool;
571 555
556 rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL);
557 if (!rqstp->rq_argp)
558 goto out_thread;
559
560 rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL);
561 if (!rqstp->rq_resp)
562 goto out_thread;
563
564 if (!svc_init_buffer(rqstp, serv->sv_max_mesg))
565 goto out_thread;
566
567 return rqstp;
568out_thread:
569 svc_exit_thread(rqstp);
570out_enomem:
571 return ERR_PTR(-ENOMEM);
572}
573EXPORT_SYMBOL(svc_prepare_thread);
574
575/*
576 * Create a thread in the given pool. Caller must hold BKL.
577 * On a NUMA or SMP machine, with a multi-pool serv, the thread
578 * will be restricted to run on the cpus belonging to the pool.
579 */
580static int
581__svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
582 struct svc_pool *pool)
583{
584 struct svc_rqst *rqstp;
585 int error = -ENOMEM;
586 int have_oldmask = 0;
587 cpumask_t oldmask;
588
589 rqstp = svc_prepare_thread(serv, pool);
590 if (IS_ERR(rqstp)) {
591 error = PTR_ERR(rqstp);
592 goto out;
593 }
594
572 if (serv->sv_nrpools > 1) 595 if (serv->sv_nrpools > 1)
573 have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask); 596 have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask);
574 597
@@ -597,6 +620,7 @@ svc_create_thread(svc_thread_fn func, struct svc_serv *serv)
597{ 620{
598 return __svc_create_thread(func, serv, &serv->sv_pools[0]); 621 return __svc_create_thread(func, serv, &serv->sv_pools[0]);
599} 622}
623EXPORT_SYMBOL(svc_create_thread);
600 624
601/* 625/*
602 * Choose a pool in which to create a new thread, for svc_set_num_threads 626 * Choose a pool in which to create a new thread, for svc_set_num_threads
@@ -700,6 +724,7 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
700 724
701 return error; 725 return error;
702} 726}
727EXPORT_SYMBOL(svc_set_num_threads);
703 728
704/* 729/*
705 * Called from a server thread as it's exiting. Caller must hold BKL. 730 * Called from a server thread as it's exiting. Caller must hold BKL.
@@ -726,6 +751,7 @@ svc_exit_thread(struct svc_rqst *rqstp)
726 if (serv) 751 if (serv)
727 svc_destroy(serv); 752 svc_destroy(serv);
728} 753}
754EXPORT_SYMBOL(svc_exit_thread);
729 755
730/* 756/*
731 * Register an RPC service with the local portmapper. 757 * Register an RPC service with the local portmapper.
@@ -737,7 +763,8 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port)
737{ 763{
738 struct svc_program *progp; 764 struct svc_program *progp;
739 unsigned long flags; 765 unsigned long flags;
740 int i, error = 0, dummy; 766 unsigned int i;
767 int error = 0, dummy;
741 768
742 if (!port) 769 if (!port)
743 clear_thread_flag(TIF_SIGPENDING); 770 clear_thread_flag(TIF_SIGPENDING);
@@ -840,9 +867,9 @@ svc_process(struct svc_rqst *rqstp)
840 rqstp->rq_res.tail[0].iov_len = 0; 867 rqstp->rq_res.tail[0].iov_len = 0;
841 /* Will be turned off only in gss privacy case: */ 868 /* Will be turned off only in gss privacy case: */
842 rqstp->rq_splice_ok = 1; 869 rqstp->rq_splice_ok = 1;
843 /* tcp needs a space for the record length... */ 870
844 if (rqstp->rq_prot == IPPROTO_TCP) 871 /* Setup reply header */
845 svc_putnl(resv, 0); 872 rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp);
846 873
847 rqstp->rq_xid = svc_getu32(argv); 874 rqstp->rq_xid = svc_getu32(argv);
848 svc_putu32(resv, rqstp->rq_xid); 875 svc_putu32(resv, rqstp->rq_xid);
@@ -1049,16 +1076,15 @@ err_bad:
1049 svc_putnl(resv, ntohl(rpc_stat)); 1076 svc_putnl(resv, ntohl(rpc_stat));
1050 goto sendit; 1077 goto sendit;
1051} 1078}
1079EXPORT_SYMBOL(svc_process);
1052 1080
1053/* 1081/*
1054 * Return (transport-specific) limit on the rpc payload. 1082 * Return (transport-specific) limit on the rpc payload.
1055 */ 1083 */
1056u32 svc_max_payload(const struct svc_rqst *rqstp) 1084u32 svc_max_payload(const struct svc_rqst *rqstp)
1057{ 1085{
1058 int max = RPCSVC_MAXPAYLOAD_TCP; 1086 u32 max = rqstp->rq_xprt->xpt_class->xcl_max_payload;
1059 1087
1060 if (rqstp->rq_sock->sk_sock->type == SOCK_DGRAM)
1061 max = RPCSVC_MAXPAYLOAD_UDP;
1062 if (rqstp->rq_server->sv_max_payload < max) 1088 if (rqstp->rq_server->sv_max_payload < max)
1063 max = rqstp->rq_server->sv_max_payload; 1089 max = rqstp->rq_server->sv_max_payload;
1064 return max; 1090 return max;
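
svc_process() no longer special-cases TCP when building the reply header; each transport now supplies an xpo_prep_reply_hdr callback, and svc_max_payload() reads the limit from the transport class rather than testing the socket type. A rough sketch of what such callbacks look like; the function names and the exact callback signature are illustrative, and the real implementations belong to the transport providers:

	/* Illustrative only: the per-transport hook that replaces the old
	 * "if (rqstp->rq_prot == IPPROTO_TCP) svc_putnl(resv, 0)" test in
	 * svc_process().  Signatures are assumed, not quoted from this patch.
	 */
	static void example_stream_prep_reply_hdr(struct svc_rqst *rqstp)
	{
		struct kvec *resv = &rqstp->rq_res.head[0];

		/* Stream transports reserve 4 bytes for the record marker,
		 * filled in when the reply is finally sent. */
		svc_putnl(resv, 0);
	}

	static void example_dgram_prep_reply_hdr(struct svc_rqst *rqstp)
	{
		/* Datagram transports need no reply header at all. */
	}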
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
new file mode 100644
index 000000000000..ea377e06afae
--- /dev/null
+++ b/net/sunrpc/svc_xprt.c
@@ -0,0 +1,1055 @@
1/*
2 * linux/net/sunrpc/svc_xprt.c
3 *
4 * Author: Tom Tucker <tom@opengridcomputing.com>
5 */
6
7#include <linux/sched.h>
8#include <linux/errno.h>
9#include <linux/fcntl.h>
10#include <linux/net.h>
11#include <linux/in.h>
12#include <linux/inet.h>
13#include <linux/udp.h>
14#include <linux/tcp.h>
15#include <linux/unistd.h>
16#include <linux/slab.h>
17#include <linux/netdevice.h>
18#include <linux/skbuff.h>
19#include <linux/file.h>
20#include <linux/freezer.h>
21#include <net/sock.h>
22#include <net/checksum.h>
23#include <net/ip.h>
24#include <net/ipv6.h>
25#include <net/tcp_states.h>
26#include <linux/uaccess.h>
27#include <asm/ioctls.h>
28
29#include <linux/sunrpc/types.h>
30#include <linux/sunrpc/clnt.h>
31#include <linux/sunrpc/xdr.h>
32#include <linux/sunrpc/stats.h>
33#include <linux/sunrpc/svc_xprt.h>
34
35#define RPCDBG_FACILITY RPCDBG_SVCXPRT
36
37static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt);
38static int svc_deferred_recv(struct svc_rqst *rqstp);
39static struct cache_deferred_req *svc_defer(struct cache_req *req);
40static void svc_age_temp_xprts(unsigned long closure);
41
42/* apparently the "standard" is that clients close
43 * idle connections after 5 minutes, servers after
44 * 6 minutes
45 * http://www.connectathon.org/talks96/nfstcp.pdf
46 */
47static int svc_conn_age_period = 6*60;
48
49/* List of registered transport classes */
50static DEFINE_SPINLOCK(svc_xprt_class_lock);
51static LIST_HEAD(svc_xprt_class_list);
52
53/* SMP locking strategy:
54 *
55 * svc_pool->sp_lock protects most of the fields of that pool.
56 * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
57 * when both need to be taken (rare), svc_serv->sv_lock is first.
58 * BKL protects svc_serv->sv_nrthread.
59 * svc_sock->sk_lock protects the svc_sock->sk_deferred list
60 * and the ->sk_info_authunix cache.
61 *
62 * The XPT_BUSY bit in xprt->xpt_flags prevents a transport being
63 * enqueued multiply. During normal transport processing this bit
64 * is set by svc_xprt_enqueue and cleared by svc_xprt_received.
65 * Providers should not manipulate this bit directly.
66 *
67 * Some flags can be set to certain values at any time
68 * providing that certain rules are followed:
69 *
70 * XPT_CONN, XPT_DATA:
71 * - Can be set or cleared at any time.
72 * - After a set, svc_xprt_enqueue must be called to enqueue
73 * the transport for processing.
74 * - After a clear, the transport must be read/accepted.
75 * If this succeeds, it must be set again.
76 * XPT_CLOSE:
77 * - Can set at any time. It is never cleared.
78 * XPT_DEAD:
79 * - Can only be set while XPT_BUSY is held which ensures
80 * that no other thread will be using the transport or will
81 * try to set XPT_DEAD.
82 */
83
84int svc_reg_xprt_class(struct svc_xprt_class *xcl)
85{
86 struct svc_xprt_class *cl;
87 int res = -EEXIST;
88
89 dprintk("svc: Adding svc transport class '%s'\n", xcl->xcl_name);
90
91 INIT_LIST_HEAD(&xcl->xcl_list);
92 spin_lock(&svc_xprt_class_lock);
93 /* Make sure there isn't already a class with the same name */
94 list_for_each_entry(cl, &svc_xprt_class_list, xcl_list) {
95 if (strcmp(xcl->xcl_name, cl->xcl_name) == 0)
96 goto out;
97 }
98 list_add_tail(&xcl->xcl_list, &svc_xprt_class_list);
99 res = 0;
100out:
101 spin_unlock(&svc_xprt_class_lock);
102 return res;
103}
104EXPORT_SYMBOL_GPL(svc_reg_xprt_class);
105
106void svc_unreg_xprt_class(struct svc_xprt_class *xcl)
107{
108 dprintk("svc: Removing svc transport class '%s'\n", xcl->xcl_name);
109 spin_lock(&svc_xprt_class_lock);
110 list_del_init(&xcl->xcl_list);
111 spin_unlock(&svc_xprt_class_lock);
112}
113EXPORT_SYMBOL_GPL(svc_unreg_xprt_class);
114
115/*
116 * Format the transport list for printing
117 */
118int svc_print_xprts(char *buf, int maxlen)
119{
120 struct list_head *le;
121 char tmpstr[80];
122 int len = 0;
123 buf[0] = '\0';
124
125 spin_lock(&svc_xprt_class_lock);
126 list_for_each(le, &svc_xprt_class_list) {
127 int slen;
128 struct svc_xprt_class *xcl =
129 list_entry(le, struct svc_xprt_class, xcl_list);
130
131 sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload);
132 slen = strlen(tmpstr);
133 if (len + slen > maxlen)
134 break;
135 len += slen;
136 strcat(buf, tmpstr);
137 }
138 spin_unlock(&svc_xprt_class_lock);
139
140 return len;
141}
142
143static void svc_xprt_free(struct kref *kref)
144{
145 struct svc_xprt *xprt =
146 container_of(kref, struct svc_xprt, xpt_ref);
147 struct module *owner = xprt->xpt_class->xcl_owner;
148 if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)
149 && xprt->xpt_auth_cache != NULL)
150 svcauth_unix_info_release(xprt->xpt_auth_cache);
151 xprt->xpt_ops->xpo_free(xprt);
152 module_put(owner);
153}
154
155void svc_xprt_put(struct svc_xprt *xprt)
156{
157 kref_put(&xprt->xpt_ref, svc_xprt_free);
158}
159EXPORT_SYMBOL_GPL(svc_xprt_put);
160
161/*
162 * Called by transport drivers to initialize the transport independent
163 * portion of the transport instance.
164 */
165void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt,
166 struct svc_serv *serv)
167{
168 memset(xprt, 0, sizeof(*xprt));
169 xprt->xpt_class = xcl;
170 xprt->xpt_ops = xcl->xcl_ops;
171 kref_init(&xprt->xpt_ref);
172 xprt->xpt_server = serv;
173 INIT_LIST_HEAD(&xprt->xpt_list);
174 INIT_LIST_HEAD(&xprt->xpt_ready);
175 INIT_LIST_HEAD(&xprt->xpt_deferred);
176 mutex_init(&xprt->xpt_mutex);
177 spin_lock_init(&xprt->xpt_lock);
178 set_bit(XPT_BUSY, &xprt->xpt_flags);
179}
180EXPORT_SYMBOL_GPL(svc_xprt_init);
181
182int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
183 int flags)
184{
185 struct svc_xprt_class *xcl;
186 struct sockaddr_in sin = {
187 .sin_family = AF_INET,
188 .sin_addr.s_addr = INADDR_ANY,
189 .sin_port = htons(port),
190 };
191 dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
192 spin_lock(&svc_xprt_class_lock);
193 list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
194 struct svc_xprt *newxprt;
195
196 if (strcmp(xprt_name, xcl->xcl_name))
197 continue;
198
199 if (!try_module_get(xcl->xcl_owner))
200 goto err;
201
202 spin_unlock(&svc_xprt_class_lock);
203 newxprt = xcl->xcl_ops->
204 xpo_create(serv, (struct sockaddr *)&sin, sizeof(sin),
205 flags);
206 if (IS_ERR(newxprt)) {
207 module_put(xcl->xcl_owner);
208 return PTR_ERR(newxprt);
209 }
210
211 clear_bit(XPT_TEMP, &newxprt->xpt_flags);
212 spin_lock_bh(&serv->sv_lock);
213 list_add(&newxprt->xpt_list, &serv->sv_permsocks);
214 spin_unlock_bh(&serv->sv_lock);
215 clear_bit(XPT_BUSY, &newxprt->xpt_flags);
216 return svc_xprt_local_port(newxprt);
217 }
218 err:
219 spin_unlock(&svc_xprt_class_lock);
220 dprintk("svc: transport %s not found\n", xprt_name);
221 return -ENOENT;
222}
223EXPORT_SYMBOL_GPL(svc_create_xprt);
224
225/*
226 * Copy the local and remote xprt addresses to the rqstp structure
227 */
228void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt)
229{
230 struct sockaddr *sin;
231
232 memcpy(&rqstp->rq_addr, &xprt->xpt_remote, xprt->xpt_remotelen);
233 rqstp->rq_addrlen = xprt->xpt_remotelen;
234
235 /*
236 * Destination address in request is needed for binding the
237 * source address in RPC replies/callbacks later.
238 */
239 sin = (struct sockaddr *)&xprt->xpt_local;
240 switch (sin->sa_family) {
241 case AF_INET:
242 rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr;
243 break;
244 case AF_INET6:
245 rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr;
246 break;
247 }
248}
249EXPORT_SYMBOL_GPL(svc_xprt_copy_addrs);
250
251/**
252 * svc_print_addr - Format rq_addr field for printing
253 * @rqstp: svc_rqst struct containing address to print
254 * @buf: target buffer for formatted address
255 * @len: length of target buffer
256 *
257 */
258char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)
259{
260 return __svc_print_addr(svc_addr(rqstp), buf, len);
261}
262EXPORT_SYMBOL_GPL(svc_print_addr);
263
264/*
265 * Queue up an idle server thread. Must have pool->sp_lock held.
266 * Note: this is really a stack rather than a queue, so that we only
267 * use as many different threads as we need, and the rest don't pollute
268 * the cache.
269 */
270static void svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp)
271{
272 list_add(&rqstp->rq_list, &pool->sp_threads);
273}
274
275/*
276 * Dequeue an nfsd thread. Must have pool->sp_lock held.
277 */
278static void svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp)
279{
280 list_del(&rqstp->rq_list);
281}
282
283/*
284 * Queue up a transport with data pending. If there are idle nfsd
285 * processes, wake 'em up.
286 *
287 */
288void svc_xprt_enqueue(struct svc_xprt *xprt)
289{
290 struct svc_serv *serv = xprt->xpt_server;
291 struct svc_pool *pool;
292 struct svc_rqst *rqstp;
293 int cpu;
294
295 if (!(xprt->xpt_flags &
296 ((1<<XPT_CONN)|(1<<XPT_DATA)|(1<<XPT_CLOSE)|(1<<XPT_DEFERRED))))
297 return;
298 if (test_bit(XPT_DEAD, &xprt->xpt_flags))
299 return;
300
301 cpu = get_cpu();
302 pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
303 put_cpu();
304
305 spin_lock_bh(&pool->sp_lock);
306
307 if (!list_empty(&pool->sp_threads) &&
308 !list_empty(&pool->sp_sockets))
309 printk(KERN_ERR
310 "svc_xprt_enqueue: "
311 "threads and transports both waiting??\n");
312
313 if (test_bit(XPT_DEAD, &xprt->xpt_flags)) {
314 /* Don't enqueue dead transports */
315 dprintk("svc: transport %p is dead, not enqueued\n", xprt);
316 goto out_unlock;
317 }
318
319 /* Mark transport as busy. It will remain in this state until
320 * the provider calls svc_xprt_received. We update XPT_BUSY
321 * atomically because it also guards against trying to enqueue
322 * the transport twice.
323 */
324 if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) {
325 /* Don't enqueue transport while already enqueued */
326 dprintk("svc: transport %p busy, not enqueued\n", xprt);
327 goto out_unlock;
328 }
329 BUG_ON(xprt->xpt_pool != NULL);
330 xprt->xpt_pool = pool;
331
332 /* Handle pending connection */
333 if (test_bit(XPT_CONN, &xprt->xpt_flags))
334 goto process;
335
336 /* Handle close in-progress */
337 if (test_bit(XPT_CLOSE, &xprt->xpt_flags))
338 goto process;
339
340 /* Check if we have space to reply to a request */
341 if (!xprt->xpt_ops->xpo_has_wspace(xprt)) {
342 /* Don't enqueue while not enough space for reply */
343 dprintk("svc: no write space, transport %p not enqueued\n",
344 xprt);
345 xprt->xpt_pool = NULL;
346 clear_bit(XPT_BUSY, &xprt->xpt_flags);
347 goto out_unlock;
348 }
349
350 process:
351 if (!list_empty(&pool->sp_threads)) {
352 rqstp = list_entry(pool->sp_threads.next,
353 struct svc_rqst,
354 rq_list);
355 dprintk("svc: transport %p served by daemon %p\n",
356 xprt, rqstp);
357 svc_thread_dequeue(pool, rqstp);
358 if (rqstp->rq_xprt)
359 printk(KERN_ERR
360 "svc_xprt_enqueue: server %p, rq_xprt=%p!\n",
361 rqstp, rqstp->rq_xprt);
362 rqstp->rq_xprt = xprt;
363 svc_xprt_get(xprt);
364 rqstp->rq_reserved = serv->sv_max_mesg;
365 atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
366 BUG_ON(xprt->xpt_pool != pool);
367 wake_up(&rqstp->rq_wait);
368 } else {
369 dprintk("svc: transport %p put into queue\n", xprt);
370 list_add_tail(&xprt->xpt_ready, &pool->sp_sockets);
371 BUG_ON(xprt->xpt_pool != pool);
372 }
373
374out_unlock:
375 spin_unlock_bh(&pool->sp_lock);
376}
377EXPORT_SYMBOL_GPL(svc_xprt_enqueue);
378
379/*
380 * Dequeue the first transport. Must be called with the pool->sp_lock held.
381 */
382static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
383{
384 struct svc_xprt *xprt;
385
386 if (list_empty(&pool->sp_sockets))
387 return NULL;
388
389 xprt = list_entry(pool->sp_sockets.next,
390 struct svc_xprt, xpt_ready);
391 list_del_init(&xprt->xpt_ready);
392
393 dprintk("svc: transport %p dequeued, inuse=%d\n",
394 xprt, atomic_read(&xprt->xpt_ref.refcount));
395
396 return xprt;
397}
398
399/*
400 * svc_xprt_received conditionally queues the transport for processing
401 * by another thread. The caller must hold the XPT_BUSY bit and must
402 * not thereafter touch transport data.
403 *
404 * Note: XPT_DATA only gets cleared when a read-attempt finds no (or
405 * insufficient) data.
406 */
407void svc_xprt_received(struct svc_xprt *xprt)
408{
409 BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags));
410 xprt->xpt_pool = NULL;
411 clear_bit(XPT_BUSY, &xprt->xpt_flags);
412 svc_xprt_enqueue(xprt);
413}
414EXPORT_SYMBOL_GPL(svc_xprt_received);
415
416/**
417 * svc_reserve - change the space reserved for the reply to a request.
418 * @rqstp: The request in question
419 * @space: new max space to reserve
420 *
421 * Each request reserves some space on the output queue of the transport
422 * to make sure the reply fits. This function reduces that reserved
423 * space to be the amount of space used already, plus @space.
424 *
425 */
426void svc_reserve(struct svc_rqst *rqstp, int space)
427{
428 space += rqstp->rq_res.head[0].iov_len;
429
430 if (space < rqstp->rq_reserved) {
431 struct svc_xprt *xprt = rqstp->rq_xprt;
432 atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved);
433 rqstp->rq_reserved = space;
434
435 svc_xprt_enqueue(xprt);
436 }
437}
438EXPORT_SYMBOL(svc_reserve);
439
440static void svc_xprt_release(struct svc_rqst *rqstp)
441{
442 struct svc_xprt *xprt = rqstp->rq_xprt;
443
444 rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp);
445
446 svc_free_res_pages(rqstp);
447 rqstp->rq_res.page_len = 0;
448 rqstp->rq_res.page_base = 0;
449
450 /* Reset response buffer and release
451 * the reservation.
452 * But first, check that enough space was reserved
453 * for the reply, otherwise we have a bug!
454 */
455 if ((rqstp->rq_res.len) > rqstp->rq_reserved)
456 printk(KERN_ERR "RPC request reserved %d but used %d\n",
457 rqstp->rq_reserved,
458 rqstp->rq_res.len);
459
460 rqstp->rq_res.head[0].iov_len = 0;
461 svc_reserve(rqstp, 0);
462 rqstp->rq_xprt = NULL;
463
464 svc_xprt_put(xprt);
465}
466
467/*
468 * External function to wake up a server waiting for data
469 * This really only makes sense for services like lockd
470 * which have exactly one thread anyway.
471 */
472void svc_wake_up(struct svc_serv *serv)
473{
474 struct svc_rqst *rqstp;
475 unsigned int i;
476 struct svc_pool *pool;
477
478 for (i = 0; i < serv->sv_nrpools; i++) {
479 pool = &serv->sv_pools[i];
480
481 spin_lock_bh(&pool->sp_lock);
482 if (!list_empty(&pool->sp_threads)) {
483 rqstp = list_entry(pool->sp_threads.next,
484 struct svc_rqst,
485 rq_list);
486 dprintk("svc: daemon %p woken up.\n", rqstp);
487 /*
488 svc_thread_dequeue(pool, rqstp);
489 rqstp->rq_xprt = NULL;
490 */
491 wake_up(&rqstp->rq_wait);
492 }
493 spin_unlock_bh(&pool->sp_lock);
494 }
495}
496EXPORT_SYMBOL(svc_wake_up);
497
498int svc_port_is_privileged(struct sockaddr *sin)
499{
500 switch (sin->sa_family) {
501 case AF_INET:
502 return ntohs(((struct sockaddr_in *)sin)->sin_port)
503 < PROT_SOCK;
504 case AF_INET6:
505 return ntohs(((struct sockaddr_in6 *)sin)->sin6_port)
506 < PROT_SOCK;
507 default:
508 return 0;
509 }
510}
511
512/*
513 * Make sure that we don't have too many active connections. If we
514 * have, something must be dropped.
515 *
516 * There's no point in trying to do random drop here for DoS
517 * prevention. The NFS client does 1 reconnect in 15 seconds. An
518 * attacker can easily beat that.
519 *
520 * The only somewhat efficient mechanism would be if drop old
521 * connections from the same IP first. But right now we don't even
522 * record the client IP in svc_sock.
523 */
524static void svc_check_conn_limits(struct svc_serv *serv)
525{
526 if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) {
527 struct svc_xprt *xprt = NULL;
528 spin_lock_bh(&serv->sv_lock);
529 if (!list_empty(&serv->sv_tempsocks)) {
530 if (net_ratelimit()) {
531 /* Try to help the admin */
532 printk(KERN_NOTICE "%s: too many open "
533 "connections, consider increasing the "
534 "number of nfsd threads\n",
535 serv->sv_name);
536 }
537 /*
538 * Always select the oldest connection. It's not fair,
539 * but so is life
540 */
541 xprt = list_entry(serv->sv_tempsocks.prev,
542 struct svc_xprt,
543 xpt_list);
544 set_bit(XPT_CLOSE, &xprt->xpt_flags);
545 svc_xprt_get(xprt);
546 }
547 spin_unlock_bh(&serv->sv_lock);
548
549 if (xprt) {
550 svc_xprt_enqueue(xprt);
551 svc_xprt_put(xprt);
552 }
553 }
554}
555
556/*
557 * Receive the next request on any transport. This code is carefully
558 * organised not to touch any cachelines in the shared svc_serv
559 * structure, only cachelines in the local svc_pool.
560 */
561int svc_recv(struct svc_rqst *rqstp, long timeout)
562{
563 struct svc_xprt *xprt = NULL;
564 struct svc_serv *serv = rqstp->rq_server;
565 struct svc_pool *pool = rqstp->rq_pool;
566 int len, i;
567 int pages;
568 struct xdr_buf *arg;
569 DECLARE_WAITQUEUE(wait, current);
570
571 dprintk("svc: server %p waiting for data (to = %ld)\n",
572 rqstp, timeout);
573
574 if (rqstp->rq_xprt)
575 printk(KERN_ERR
576 "svc_recv: service %p, transport not NULL!\n",
577 rqstp);
578 if (waitqueue_active(&rqstp->rq_wait))
579 printk(KERN_ERR
580 "svc_recv: service %p, wait queue active!\n",
581 rqstp);
582
583 /* now allocate needed pages. If we get a failure, sleep briefly */
584 pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE;
585 for (i = 0; i < pages ; i++)
586 while (rqstp->rq_pages[i] == NULL) {
587 struct page *p = alloc_page(GFP_KERNEL);
588 if (!p) {
589 int j = msecs_to_jiffies(500);
590 schedule_timeout_uninterruptible(j);
591 }
592 rqstp->rq_pages[i] = p;
593 }
594 rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
595 BUG_ON(pages >= RPCSVC_MAXPAGES);
596
597 /* Make arg->head point to first page and arg->pages point to rest */
598 arg = &rqstp->rq_arg;
599 arg->head[0].iov_base = page_address(rqstp->rq_pages[0]);
600 arg->head[0].iov_len = PAGE_SIZE;
601 arg->pages = rqstp->rq_pages + 1;
602 arg->page_base = 0;
603 /* save at least one page for response */
604 arg->page_len = (pages-2)*PAGE_SIZE;
605 arg->len = (pages-1)*PAGE_SIZE;
606 arg->tail[0].iov_len = 0;
607
608 try_to_freeze();
609 cond_resched();
610 if (signalled())
611 return -EINTR;
612
613 spin_lock_bh(&pool->sp_lock);
614 xprt = svc_xprt_dequeue(pool);
615 if (xprt) {
616 rqstp->rq_xprt = xprt;
617 svc_xprt_get(xprt);
618 rqstp->rq_reserved = serv->sv_max_mesg;
619 atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
620 } else {
621 /* No data pending. Go to sleep */
622 svc_thread_enqueue(pool, rqstp);
623
624 /*
625 * We have to be able to interrupt this wait
626 * to bring down the daemons ...
627 */
628 set_current_state(TASK_INTERRUPTIBLE);
629 add_wait_queue(&rqstp->rq_wait, &wait);
630 spin_unlock_bh(&pool->sp_lock);
631
632 schedule_timeout(timeout);
633
634 try_to_freeze();
635
636 spin_lock_bh(&pool->sp_lock);
637 remove_wait_queue(&rqstp->rq_wait, &wait);
638
639 xprt = rqstp->rq_xprt;
640 if (!xprt) {
641 svc_thread_dequeue(pool, rqstp);
642 spin_unlock_bh(&pool->sp_lock);
643 dprintk("svc: server %p, no data yet\n", rqstp);
644 return signalled()? -EINTR : -EAGAIN;
645 }
646 }
647 spin_unlock_bh(&pool->sp_lock);
648
649 len = 0;
650 if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) {
651 dprintk("svc_recv: found XPT_CLOSE\n");
652 svc_delete_xprt(xprt);
653 } else if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) {
654 struct svc_xprt *newxpt;
655 newxpt = xprt->xpt_ops->xpo_accept(xprt);
656 if (newxpt) {
657 /*
658 * We know this module_get will succeed because the
659 * listener holds a reference too
660 */
661 __module_get(newxpt->xpt_class->xcl_owner);
662 svc_check_conn_limits(xprt->xpt_server);
663 spin_lock_bh(&serv->sv_lock);
664 set_bit(XPT_TEMP, &newxpt->xpt_flags);
665 list_add(&newxpt->xpt_list, &serv->sv_tempsocks);
666 serv->sv_tmpcnt++;
667 if (serv->sv_temptimer.function == NULL) {
668 /* setup timer to age temp transports */
669 setup_timer(&serv->sv_temptimer,
670 svc_age_temp_xprts,
671 (unsigned long)serv);
672 mod_timer(&serv->sv_temptimer,
673 jiffies + svc_conn_age_period * HZ);
674 }
675 spin_unlock_bh(&serv->sv_lock);
676 svc_xprt_received(newxpt);
677 }
678 svc_xprt_received(xprt);
679 } else {
680 dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
681 rqstp, pool->sp_id, xprt,
682 atomic_read(&xprt->xpt_ref.refcount));
683 rqstp->rq_deferred = svc_deferred_dequeue(xprt);
684 if (rqstp->rq_deferred) {
685 svc_xprt_received(xprt);
686 len = svc_deferred_recv(rqstp);
687 } else
688 len = xprt->xpt_ops->xpo_recvfrom(rqstp);
689 dprintk("svc: got len=%d\n", len);
690 }
691
692 /* No data, incomplete (TCP) read, or accept() */
693 if (len == 0 || len == -EAGAIN) {
694 rqstp->rq_res.len = 0;
695 svc_xprt_release(rqstp);
696 return -EAGAIN;
697 }
698 clear_bit(XPT_OLD, &xprt->xpt_flags);
699
700 rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp));
701 rqstp->rq_chandle.defer = svc_defer;
702
703 if (serv->sv_stats)
704 serv->sv_stats->netcnt++;
705 return len;
706}
707EXPORT_SYMBOL(svc_recv);
708
709/*
710 * Drop request
711 */
712void svc_drop(struct svc_rqst *rqstp)
713{
714 dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt);
715 svc_xprt_release(rqstp);
716}
717EXPORT_SYMBOL(svc_drop);
718
719/*
720 * Return reply to client.
721 */
722int svc_send(struct svc_rqst *rqstp)
723{
724 struct svc_xprt *xprt;
725 int len;
726 struct xdr_buf *xb;
727
728 xprt = rqstp->rq_xprt;
729 if (!xprt)
730 return -EFAULT;
731
732 /* release the receive skb before sending the reply */
733 rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp);
734
735 /* calculate over-all length */
736 xb = &rqstp->rq_res;
737 xb->len = xb->head[0].iov_len +
738 xb->page_len +
739 xb->tail[0].iov_len;
740
741 /* Grab mutex to serialize outgoing data. */
742 mutex_lock(&xprt->xpt_mutex);
743 if (test_bit(XPT_DEAD, &xprt->xpt_flags))
744 len = -ENOTCONN;
745 else
746 len = xprt->xpt_ops->xpo_sendto(rqstp);
747 mutex_unlock(&xprt->xpt_mutex);
748 svc_xprt_release(rqstp);
749
750 if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
751 return 0;
752 return len;
753}
754
755/*
756 * Timer function to close old temporary transports, using
757 * a mark-and-sweep algorithm.
758 */
759static void svc_age_temp_xprts(unsigned long closure)
760{
761 struct svc_serv *serv = (struct svc_serv *)closure;
762 struct svc_xprt *xprt;
763 struct list_head *le, *next;
764 LIST_HEAD(to_be_aged);
765
766 dprintk("svc_age_temp_xprts\n");
767
768 if (!spin_trylock_bh(&serv->sv_lock)) {
769 /* busy, try again 1 sec later */
770 dprintk("svc_age_temp_xprts: busy\n");
771 mod_timer(&serv->sv_temptimer, jiffies + HZ);
772 return;
773 }
774
775 list_for_each_safe(le, next, &serv->sv_tempsocks) {
776 xprt = list_entry(le, struct svc_xprt, xpt_list);
777
778 /* First time through, just mark it OLD. Second time
779 * through, close it. */
780 if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags))
781 continue;
782 if (atomic_read(&xprt->xpt_ref.refcount) > 1
783 || test_bit(XPT_BUSY, &xprt->xpt_flags))
784 continue;
785 svc_xprt_get(xprt);
786 list_move(le, &to_be_aged);
787 set_bit(XPT_CLOSE, &xprt->xpt_flags);
788 set_bit(XPT_DETACHED, &xprt->xpt_flags);
789 }
790 spin_unlock_bh(&serv->sv_lock);
791
792 while (!list_empty(&to_be_aged)) {
793 le = to_be_aged.next;
794 /* fiddling the xpt_list node is safe 'cos we're XPT_DETACHED */
795 list_del_init(le);
796 xprt = list_entry(le, struct svc_xprt, xpt_list);
797
798 dprintk("queuing xprt %p for closing\n", xprt);
799
800 /* a thread will dequeue and close it soon */
801 svc_xprt_enqueue(xprt);
802 svc_xprt_put(xprt);
803 }
804
805 mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ);
806}
807
808/*
809 * Remove a dead transport
810 */
811void svc_delete_xprt(struct svc_xprt *xprt)
812{
813 struct svc_serv *serv = xprt->xpt_server;
814
815 dprintk("svc: svc_delete_xprt(%p)\n", xprt);
816 xprt->xpt_ops->xpo_detach(xprt);
817
818 spin_lock_bh(&serv->sv_lock);
819 if (!test_and_set_bit(XPT_DETACHED, &xprt->xpt_flags))
820 list_del_init(&xprt->xpt_list);
821 /*
822 * We used to delete the transport from whichever list
823 * its sk_xprt.xpt_ready node was on, but we don't actually
824 * need to. This is because the only time we're called
825 * while still attached to a queue, the queue itself
826 * is about to be destroyed (in svc_destroy).
827 */
828 if (!test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) {
829 BUG_ON(atomic_read(&xprt->xpt_ref.refcount) < 2);
830 if (test_bit(XPT_TEMP, &xprt->xpt_flags))
831 serv->sv_tmpcnt--;
832 svc_xprt_put(xprt);
833 }
834 spin_unlock_bh(&serv->sv_lock);
835}
836
837void svc_close_xprt(struct svc_xprt *xprt)
838{
839 set_bit(XPT_CLOSE, &xprt->xpt_flags);
840 if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags))
841 /* someone else will have to effect the close */
842 return;
843
844 svc_xprt_get(xprt);
845 svc_delete_xprt(xprt);
846 clear_bit(XPT_BUSY, &xprt->xpt_flags);
847 svc_xprt_put(xprt);
848}
849EXPORT_SYMBOL_GPL(svc_close_xprt);
850
851void svc_close_all(struct list_head *xprt_list)
852{
853 struct svc_xprt *xprt;
854 struct svc_xprt *tmp;
855
856 list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) {
857 set_bit(XPT_CLOSE, &xprt->xpt_flags);
858 if (test_bit(XPT_BUSY, &xprt->xpt_flags)) {
859 /* Waiting to be processed, but no threads left,
860 * So just remove it from the waiting list
861 */
862 list_del_init(&xprt->xpt_ready);
863 clear_bit(XPT_BUSY, &xprt->xpt_flags);
864 }
865 svc_close_xprt(xprt);
866 }
867}
868
869/*
870 * Handle defer and revisit of requests
871 */
872
873static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
874{
875 struct svc_deferred_req *dr =
876 container_of(dreq, struct svc_deferred_req, handle);
877 struct svc_xprt *xprt = dr->xprt;
878
879 if (too_many) {
880 svc_xprt_put(xprt);
881 kfree(dr);
882 return;
883 }
884 dprintk("revisit queued\n");
885 dr->xprt = NULL;
886 spin_lock(&xprt->xpt_lock);
887 list_add(&dr->handle.recent, &xprt->xpt_deferred);
888 spin_unlock(&xprt->xpt_lock);
889 set_bit(XPT_DEFERRED, &xprt->xpt_flags);
890 svc_xprt_enqueue(xprt);
891 svc_xprt_put(xprt);
892}
893
894/*
895 * Save the request off for later processing. The request buffer looks
896 * like this:
897 *
898 * <xprt-header><rpc-header><rpc-pagelist><rpc-tail>
899 *
900 * This code can only handle requests that consist of an xprt-header
901 * and rpc-header.
902 */
903static struct cache_deferred_req *svc_defer(struct cache_req *req)
904{
905 struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle);
906 struct svc_deferred_req *dr;
907
908 if (rqstp->rq_arg.page_len)
909 return NULL; /* if more than a page, give up FIXME */
910 if (rqstp->rq_deferred) {
911 dr = rqstp->rq_deferred;
912 rqstp->rq_deferred = NULL;
913 } else {
914 size_t skip;
915 size_t size;
916 /* FIXME maybe discard if size too large */
917 size = sizeof(struct svc_deferred_req) + rqstp->rq_arg.len;
918 dr = kmalloc(size, GFP_KERNEL);
919 if (dr == NULL)
920 return NULL;
921
922 dr->handle.owner = rqstp->rq_server;
923 dr->prot = rqstp->rq_prot;
924 memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen);
925 dr->addrlen = rqstp->rq_addrlen;
926 dr->daddr = rqstp->rq_daddr;
927 dr->argslen = rqstp->rq_arg.len >> 2;
928 dr->xprt_hlen = rqstp->rq_xprt_hlen;
929
930 /* back up head to the start of the buffer and copy */
931 skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
932 memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip,
933 dr->argslen << 2);
934 }
935 svc_xprt_get(rqstp->rq_xprt);
936 dr->xprt = rqstp->rq_xprt;
937
938 dr->handle.revisit = svc_revisit;
939 return &dr->handle;
940}
941
942/*
943 * recv data from a deferred request into an active one
944 */
945static int svc_deferred_recv(struct svc_rqst *rqstp)
946{
947 struct svc_deferred_req *dr = rqstp->rq_deferred;
948
949 /* setup iov_base past transport header */
950 rqstp->rq_arg.head[0].iov_base = dr->args + (dr->xprt_hlen>>2);
951 /* The iov_len does not include the transport header bytes */
952 rqstp->rq_arg.head[0].iov_len = (dr->argslen<<2) - dr->xprt_hlen;
953 rqstp->rq_arg.page_len = 0;
954 /* The rq_arg.len includes the transport header bytes */
955 rqstp->rq_arg.len = dr->argslen<<2;
956 rqstp->rq_prot = dr->prot;
957 memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen);
958 rqstp->rq_addrlen = dr->addrlen;
959 /* Save off transport header len in case we get deferred again */
960 rqstp->rq_xprt_hlen = dr->xprt_hlen;
961 rqstp->rq_daddr = dr->daddr;
962 rqstp->rq_respages = rqstp->rq_pages;
963 return (dr->argslen<<2) - dr->xprt_hlen;
964}
965
966
967static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt)
968{
969 struct svc_deferred_req *dr = NULL;
970
971 if (!test_bit(XPT_DEFERRED, &xprt->xpt_flags))
972 return NULL;
973 spin_lock(&xprt->xpt_lock);
974 clear_bit(XPT_DEFERRED, &xprt->xpt_flags);
975 if (!list_empty(&xprt->xpt_deferred)) {
976 dr = list_entry(xprt->xpt_deferred.next,
977 struct svc_deferred_req,
978 handle.recent);
979 list_del_init(&dr->handle.recent);
980 set_bit(XPT_DEFERRED, &xprt->xpt_flags);
981 }
982 spin_unlock(&xprt->xpt_lock);
983 return dr;
984}
985
986/*
987 * Return the transport instance pointer for the endpoint accepting
988 * connections/peer traffic from the specified transport class,
989 * address family and port.
990 *
991 * Specifying 0 for the address family or port is effectively a
992 * wild-card, and will result in matching the first transport in the
993 * service's list that has a matching class name.
994 */
995struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name,
996 int af, int port)
997{
998 struct svc_xprt *xprt;
999 struct svc_xprt *found = NULL;
1000
1001 /* Sanity check the args */
1002 if (!serv || !xcl_name)
1003 return found;
1004
1005 spin_lock_bh(&serv->sv_lock);
1006 list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) {
1007 if (strcmp(xprt->xpt_class->xcl_name, xcl_name))
1008 continue;
1009 if (af != AF_UNSPEC && af != xprt->xpt_local.ss_family)
1010 continue;
1011 if (port && port != svc_xprt_local_port(xprt))
1012 continue;
1013 found = xprt;
1014 svc_xprt_get(xprt);
1015 break;
1016 }
1017 spin_unlock_bh(&serv->sv_lock);
1018 return found;
1019}
1020EXPORT_SYMBOL_GPL(svc_find_xprt);
1021
1022/*
1023 * Format a buffer with a list of the active transports. A zero for
1024 * the buflen parameter disables target buffer overflow checking.
1025 */
1026int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen)
1027{
1028 struct svc_xprt *xprt;
1029 char xprt_str[64];
1030 int totlen = 0;
1031 int len;
1032
1033 /* Sanity check args */
1034 if (!serv)
1035 return 0;
1036
1037 spin_lock_bh(&serv->sv_lock);
1038 list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) {
1039 len = snprintf(xprt_str, sizeof(xprt_str),
1040 "%s %d\n", xprt->xpt_class->xcl_name,
1041 svc_xprt_local_port(xprt));
1042 /* If the string was truncated, replace with error string */
1043 if (len >= sizeof(xprt_str))
1044 strcpy(xprt_str, "name-too-long\n");
1045 /* Don't overflow buffer */
1046 len = strlen(xprt_str);
1047 if (buflen && (len + totlen >= buflen))
1048 break;
1049 strcpy(buf+totlen, xprt_str);
1050 totlen += len;
1051 }
1052 spin_unlock_bh(&serv->sv_lock);
1053 return totlen;
1054}
1055EXPORT_SYMBOL_GPL(svc_xprt_names);
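
svc_xprt.c gives transport providers a small registration API: describe the transport in a struct svc_xprt_class, register it with svc_reg_xprt_class(), and initialize each endpoint with svc_xprt_init() from the class's xpo_create callback. A sketch of a hypothetical provider module using only the entry points exported above; the ops-structure name and contents are assumptions based on the callbacks referenced in this file, not copied from it:

	/* Sketch of a hypothetical transport provider.  svc_reg_xprt_class(),
	 * svc_unreg_xprt_class() and the xcl_* fields appear in svc_xprt.c
	 * above; the ops implementations themselves are omitted.
	 */
	static struct svc_xprt_ops example_xprt_ops = {
		/*
		 * .xpo_create, .xpo_accept, .xpo_recvfrom, .xpo_sendto,
		 * .xpo_prep_reply_hdr, .xpo_has_wspace, .xpo_release_rqst,
		 * .xpo_detach and .xpo_free would point at the provider's
		 * implementations here.
		 */
	};

	static struct svc_xprt_class example_xprt_class = {
		.xcl_name	 = "example",
		.xcl_owner	 = THIS_MODULE,
		.xcl_ops	 = &example_xprt_ops,
		.xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,  /* illustrative limit */
	};

	static int __init example_transport_init(void)
	{
		/* returns -EEXIST if a class of this name is already registered */
		return svc_reg_xprt_class(&example_xprt_class);
	}

	static void __exit example_transport_exit(void)
	{
		svc_unreg_xprt_class(&example_xprt_class);
	}
	module_init(example_transport_init);
	module_exit(example_transport_exit);

A service then opens a listening endpoint on such a class with svc_create_xprt(serv, "example", port, flags), which looks the class up by name and invokes its xpo_create callback, as shown earlier in this file.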
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index af7c5f05c6e1..8a73cbb16052 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -57,11 +57,13 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
57 rqstp->rq_authop = aops; 57 rqstp->rq_authop = aops;
58 return aops->accept(rqstp, authp); 58 return aops->accept(rqstp, authp);
59} 59}
60EXPORT_SYMBOL(svc_authenticate);
60 61
61int svc_set_client(struct svc_rqst *rqstp) 62int svc_set_client(struct svc_rqst *rqstp)
62{ 63{
63 return rqstp->rq_authop->set_client(rqstp); 64 return rqstp->rq_authop->set_client(rqstp);
64} 65}
66EXPORT_SYMBOL(svc_set_client);
65 67
66/* A request, which was authenticated, has now executed. 68/* A request, which was authenticated, has now executed.
67 * Time to finalise the credentials and verifier 69 * Time to finalise the credentials and verifier
@@ -93,6 +95,7 @@ svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops)
93 spin_unlock(&authtab_lock); 95 spin_unlock(&authtab_lock);
94 return rv; 96 return rv;
95} 97}
98EXPORT_SYMBOL(svc_auth_register);
96 99
97void 100void
98svc_auth_unregister(rpc_authflavor_t flavor) 101svc_auth_unregister(rpc_authflavor_t flavor)
@@ -129,6 +132,7 @@ void auth_domain_put(struct auth_domain *dom)
129 spin_unlock(&auth_domain_lock); 132 spin_unlock(&auth_domain_lock);
130 } 133 }
131} 134}
135EXPORT_SYMBOL(auth_domain_put);
132 136
133struct auth_domain * 137struct auth_domain *
134auth_domain_lookup(char *name, struct auth_domain *new) 138auth_domain_lookup(char *name, struct auth_domain *new)
@@ -153,8 +157,10 @@ auth_domain_lookup(char *name, struct auth_domain *new)
153 spin_unlock(&auth_domain_lock); 157 spin_unlock(&auth_domain_lock);
154 return new; 158 return new;
155} 159}
160EXPORT_SYMBOL(auth_domain_lookup);
156 161
157struct auth_domain *auth_domain_find(char *name) 162struct auth_domain *auth_domain_find(char *name)
158{ 163{
159 return auth_domain_lookup(name, NULL); 164 return auth_domain_lookup(name, NULL);
160} 165}
166EXPORT_SYMBOL(auth_domain_find);
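
The exports above exist so that code built outside sunrpc.ko (lockd, nfsd, the new svcrdma module) can call the generic auth-domain API directly. A minimal modular caller might look like the sketch below; the domain name is purely illustrative, not taken from this series.

	/* Hypothetical module code: look up a named domain, drop it when done. */
	struct auth_domain *dom = auth_domain_find("example-client");
	if (dom) {
		/* ... associate the domain with an export or request ... */
		auth_domain_put(dom);
	}
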
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 411479411b21..3c64051e4555 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -63,6 +63,7 @@ struct auth_domain *unix_domain_find(char *name)
63 rv = auth_domain_lookup(name, &new->h); 63 rv = auth_domain_lookup(name, &new->h);
64 } 64 }
65} 65}
66EXPORT_SYMBOL(unix_domain_find);
66 67
67static void svcauth_unix_domain_release(struct auth_domain *dom) 68static void svcauth_unix_domain_release(struct auth_domain *dom)
68{ 69{
@@ -340,6 +341,7 @@ int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom)
340 else 341 else
341 return -ENOMEM; 342 return -ENOMEM;
342} 343}
344EXPORT_SYMBOL(auth_unix_add_addr);
343 345
344int auth_unix_forget_old(struct auth_domain *dom) 346int auth_unix_forget_old(struct auth_domain *dom)
345{ 347{
@@ -351,6 +353,7 @@ int auth_unix_forget_old(struct auth_domain *dom)
351 udom->addr_changes++; 353 udom->addr_changes++;
352 return 0; 354 return 0;
353} 355}
356EXPORT_SYMBOL(auth_unix_forget_old);
354 357
355struct auth_domain *auth_unix_lookup(struct in_addr addr) 358struct auth_domain *auth_unix_lookup(struct in_addr addr)
356{ 359{
@@ -375,50 +378,56 @@ struct auth_domain *auth_unix_lookup(struct in_addr addr)
375 cache_put(&ipm->h, &ip_map_cache); 378 cache_put(&ipm->h, &ip_map_cache);
376 return rv; 379 return rv;
377} 380}
381EXPORT_SYMBOL(auth_unix_lookup);
378 382
379void svcauth_unix_purge(void) 383void svcauth_unix_purge(void)
380{ 384{
381 cache_purge(&ip_map_cache); 385 cache_purge(&ip_map_cache);
382} 386}
387EXPORT_SYMBOL(svcauth_unix_purge);
383 388
384static inline struct ip_map * 389static inline struct ip_map *
385ip_map_cached_get(struct svc_rqst *rqstp) 390ip_map_cached_get(struct svc_rqst *rqstp)
386{ 391{
387 struct ip_map *ipm; 392 struct ip_map *ipm = NULL;
388 struct svc_sock *svsk = rqstp->rq_sock; 393 struct svc_xprt *xprt = rqstp->rq_xprt;
389 spin_lock(&svsk->sk_lock); 394
390 ipm = svsk->sk_info_authunix; 395 if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) {
391 if (ipm != NULL) { 396 spin_lock(&xprt->xpt_lock);
392 if (!cache_valid(&ipm->h)) { 397 ipm = xprt->xpt_auth_cache;
393 /* 398 if (ipm != NULL) {
394 * The entry has been invalidated since it was 399 if (!cache_valid(&ipm->h)) {
395 * remembered, e.g. by a second mount from the 400 /*
396 * same IP address. 401 * The entry has been invalidated since it was
397 */ 402 * remembered, e.g. by a second mount from the
398 svsk->sk_info_authunix = NULL; 403 * same IP address.
399 spin_unlock(&svsk->sk_lock); 404 */
400 cache_put(&ipm->h, &ip_map_cache); 405 xprt->xpt_auth_cache = NULL;
401 return NULL; 406 spin_unlock(&xprt->xpt_lock);
407 cache_put(&ipm->h, &ip_map_cache);
408 return NULL;
409 }
410 cache_get(&ipm->h);
402 } 411 }
403 cache_get(&ipm->h); 412 spin_unlock(&xprt->xpt_lock);
404 } 413 }
405 spin_unlock(&svsk->sk_lock);
406 return ipm; 414 return ipm;
407} 415}
408 416
409static inline void 417static inline void
410ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm) 418ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm)
411{ 419{
412 struct svc_sock *svsk = rqstp->rq_sock; 420 struct svc_xprt *xprt = rqstp->rq_xprt;
413 421
414 spin_lock(&svsk->sk_lock); 422 if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) {
415 if (svsk->sk_sock->type == SOCK_STREAM && 423 spin_lock(&xprt->xpt_lock);
416 svsk->sk_info_authunix == NULL) { 424 if (xprt->xpt_auth_cache == NULL) {
417 /* newly cached, keep the reference */ 425 /* newly cached, keep the reference */
418 svsk->sk_info_authunix = ipm; 426 xprt->xpt_auth_cache = ipm;
419 ipm = NULL; 427 ipm = NULL;
428 }
429 spin_unlock(&xprt->xpt_lock);
420 } 430 }
421 spin_unlock(&svsk->sk_lock);
422 if (ipm) 431 if (ipm)
423 cache_put(&ipm->h, &ip_map_cache); 432 cache_put(&ipm->h, &ip_map_cache);
424} 433}
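
For context (the consumer is not shown in this hunk): ip_map_cached_get() and ip_map_cached_put() are meant to bracket the per-request address-to-domain lookup in svcauth_unix_set_client(). The fragment below is only a sketch of that shape; ip_map_lookup(), svc_addr_in(), the "nfsd" class string and the SVC_* return codes are assumptions about the surrounding sunrpc code, not lines from this diff.

	/* Sketch of the intended caller pattern around the per-xprt cache. */
	struct sockaddr_in *sin = svc_addr_in(rqstp);
	struct ip_map *ipm = ip_map_cached_get(rqstp);	/* fast path: xprt->xpt_auth_cache */

	if (ipm == NULL)
		ipm = ip_map_lookup("nfsd", sin->sin_addr);	/* slow path: cache lookup */
	if (ipm == NULL)
		return SVC_DENIED;
	/* ... map ipm to rqstp->rq_client ... */
	ip_map_cached_put(rqstp, ipm);	/* park on the xprt (XPT_CACHE_AUTH) or drop */
	return SVC_OK;
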
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index c75bffeb89eb..1d3e5fcc2cc4 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * The server scheduling algorithm does not always distribute the load 6 * The server scheduling algorithm does not always distribute the load
7 * evenly when servicing a single client. May need to modify the 7 * evenly when servicing a single client. May need to modify the
8 * svc_sock_enqueue procedure... 8 * svc_xprt_enqueue procedure...
9 * 9 *
10 * TCP support is largely untested and may be a little slow. The problem 10 * TCP support is largely untested and may be a little slow. The problem
11 * is that we currently do two separate recvfrom's, one for the 4-byte 11 * is that we currently do two separate recvfrom's, one for the 4-byte
@@ -48,72 +48,40 @@
48#include <linux/sunrpc/svcsock.h> 48#include <linux/sunrpc/svcsock.h>
49#include <linux/sunrpc/stats.h> 49#include <linux/sunrpc/stats.h>
50 50
51/* SMP locking strategy: 51#define RPCDBG_FACILITY RPCDBG_SVCXPRT
52 *
53 * svc_pool->sp_lock protects most of the fields of that pool.
54 * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
55 * when both need to be taken (rare), svc_serv->sv_lock is first.
56 * BKL protects svc_serv->sv_nrthread.
57 * svc_sock->sk_lock protects the svc_sock->sk_deferred list
58 * and the ->sk_info_authunix cache.
 59 * svc_sock->sk_flags.SK_BUSY prevents a svc_sock from being enqueued more than once.
60 *
61 * Some flags can be set to certain values at any time
62 * providing that certain rules are followed:
63 *
64 * SK_CONN, SK_DATA, can be set or cleared at any time.
65 * after a set, svc_sock_enqueue must be called.
66 * after a clear, the socket must be read/accepted
67 * if this succeeds, it must be set again.
 68 * SK_CLOSE can be set at any time. It is never cleared.
69 * sk_inuse contains a bias of '1' until SK_DEAD is set.
70 * so when sk_inuse hits zero, we know the socket is dead
71 * and no-one is using it.
72 * SK_DEAD can only be set while SK_BUSY is held which ensures
73 * no other thread will be using the socket or will try to
74 * set SK_DEAD.
75 *
76 */
77
78#define RPCDBG_FACILITY RPCDBG_SVCSOCK
79 52
80 53
81static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, 54static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
82 int *errp, int flags); 55 int *errp, int flags);
83static void svc_delete_socket(struct svc_sock *svsk);
84static void svc_udp_data_ready(struct sock *, int); 56static void svc_udp_data_ready(struct sock *, int);
85static int svc_udp_recvfrom(struct svc_rqst *); 57static int svc_udp_recvfrom(struct svc_rqst *);
86static int svc_udp_sendto(struct svc_rqst *); 58static int svc_udp_sendto(struct svc_rqst *);
87static void svc_close_socket(struct svc_sock *svsk); 59static void svc_sock_detach(struct svc_xprt *);
88 60static void svc_sock_free(struct svc_xprt *);
89static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk);
90static int svc_deferred_recv(struct svc_rqst *rqstp);
91static struct cache_deferred_req *svc_defer(struct cache_req *req);
92
93/* apparently the "standard" is that clients close
94 * idle connections after 5 minutes, servers after
95 * 6 minutes
96 * http://www.connectathon.org/talks96/nfstcp.pdf
97 */
98static int svc_conn_age_period = 6*60;
99 61
62static struct svc_xprt *svc_create_socket(struct svc_serv *, int,
63 struct sockaddr *, int, int);
100#ifdef CONFIG_DEBUG_LOCK_ALLOC 64#ifdef CONFIG_DEBUG_LOCK_ALLOC
101static struct lock_class_key svc_key[2]; 65static struct lock_class_key svc_key[2];
102static struct lock_class_key svc_slock_key[2]; 66static struct lock_class_key svc_slock_key[2];
103 67
104static inline void svc_reclassify_socket(struct socket *sock) 68static void svc_reclassify_socket(struct socket *sock)
105{ 69{
106 struct sock *sk = sock->sk; 70 struct sock *sk = sock->sk;
107 BUG_ON(sock_owned_by_user(sk)); 71 BUG_ON(sock_owned_by_user(sk));
108 switch (sk->sk_family) { 72 switch (sk->sk_family) {
109 case AF_INET: 73 case AF_INET:
110 sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD", 74 sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD",
111 &svc_slock_key[0], "sk_lock-AF_INET-NFSD", &svc_key[0]); 75 &svc_slock_key[0],
76 "sk_xprt.xpt_lock-AF_INET-NFSD",
77 &svc_key[0]);
112 break; 78 break;
113 79
114 case AF_INET6: 80 case AF_INET6:
115 sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD", 81 sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD",
116 &svc_slock_key[1], "sk_lock-AF_INET6-NFSD", &svc_key[1]); 82 &svc_slock_key[1],
83 "sk_xprt.xpt_lock-AF_INET6-NFSD",
84 &svc_key[1]);
117 break; 85 break;
118 86
119 default: 87 default:
@@ -121,81 +89,26 @@ static inline void svc_reclassify_socket(struct socket *sock)
121 } 89 }
122} 90}
123#else 91#else
124static inline void svc_reclassify_socket(struct socket *sock) 92static void svc_reclassify_socket(struct socket *sock)
125{ 93{
126} 94}
127#endif 95#endif
128 96
129static char *__svc_print_addr(struct sockaddr *addr, char *buf, size_t len)
130{
131 switch (addr->sa_family) {
132 case AF_INET:
133 snprintf(buf, len, "%u.%u.%u.%u, port=%u",
134 NIPQUAD(((struct sockaddr_in *) addr)->sin_addr),
135 ntohs(((struct sockaddr_in *) addr)->sin_port));
136 break;
137
138 case AF_INET6:
139 snprintf(buf, len, "%x:%x:%x:%x:%x:%x:%x:%x, port=%u",
140 NIP6(((struct sockaddr_in6 *) addr)->sin6_addr),
141 ntohs(((struct sockaddr_in6 *) addr)->sin6_port));
142 break;
143
144 default:
145 snprintf(buf, len, "unknown address type: %d", addr->sa_family);
146 break;
147 }
148 return buf;
149}
150
151/**
152 * svc_print_addr - Format rq_addr field for printing
153 * @rqstp: svc_rqst struct containing address to print
154 * @buf: target buffer for formatted address
155 * @len: length of target buffer
156 *
157 */
158char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)
159{
160 return __svc_print_addr(svc_addr(rqstp), buf, len);
161}
162EXPORT_SYMBOL_GPL(svc_print_addr);
163
164/*
165 * Queue up an idle server thread. Must have pool->sp_lock held.
166 * Note: this is really a stack rather than a queue, so that we only
167 * use as many different threads as we need, and the rest don't pollute
168 * the cache.
169 */
170static inline void
171svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp)
172{
173 list_add(&rqstp->rq_list, &pool->sp_threads);
174}
175
176/*
177 * Dequeue an nfsd thread. Must have pool->sp_lock held.
178 */
179static inline void
180svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp)
181{
182 list_del(&rqstp->rq_list);
183}
184
185/* 97/*
186 * Release an skbuff after use 98 * Release an skbuff after use
187 */ 99 */
188static inline void 100static void svc_release_skb(struct svc_rqst *rqstp)
189svc_release_skb(struct svc_rqst *rqstp)
190{ 101{
191 struct sk_buff *skb = rqstp->rq_skbuff; 102 struct sk_buff *skb = rqstp->rq_xprt_ctxt;
192 struct svc_deferred_req *dr = rqstp->rq_deferred; 103 struct svc_deferred_req *dr = rqstp->rq_deferred;
193 104
194 if (skb) { 105 if (skb) {
195 rqstp->rq_skbuff = NULL; 106 struct svc_sock *svsk =
107 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
108 rqstp->rq_xprt_ctxt = NULL;
196 109
197 dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); 110 dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
198 skb_free_datagram(rqstp->rq_sock->sk_sk, skb); 111 skb_free_datagram(svsk->sk_sk, skb);
199 } 112 }
200 if (dr) { 113 if (dr) {
201 rqstp->rq_deferred = NULL; 114 rqstp->rq_deferred = NULL;
@@ -203,253 +116,6 @@ svc_release_skb(struct svc_rqst *rqstp)
203 } 116 }
204} 117}
205 118
206/*
207 * Any space to write?
208 */
209static inline unsigned long
210svc_sock_wspace(struct svc_sock *svsk)
211{
212 int wspace;
213
214 if (svsk->sk_sock->type == SOCK_STREAM)
215 wspace = sk_stream_wspace(svsk->sk_sk);
216 else
217 wspace = sock_wspace(svsk->sk_sk);
218
219 return wspace;
220}
221
222/*
223 * Queue up a socket with data pending. If there are idle nfsd
224 * processes, wake 'em up.
225 *
226 */
227static void
228svc_sock_enqueue(struct svc_sock *svsk)
229{
230 struct svc_serv *serv = svsk->sk_server;
231 struct svc_pool *pool;
232 struct svc_rqst *rqstp;
233 int cpu;
234
235 if (!(svsk->sk_flags &
236 ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) ))
237 return;
238 if (test_bit(SK_DEAD, &svsk->sk_flags))
239 return;
240
241 cpu = get_cpu();
242 pool = svc_pool_for_cpu(svsk->sk_server, cpu);
243 put_cpu();
244
245 spin_lock_bh(&pool->sp_lock);
246
247 if (!list_empty(&pool->sp_threads) &&
248 !list_empty(&pool->sp_sockets))
249 printk(KERN_ERR
250 "svc_sock_enqueue: threads and sockets both waiting??\n");
251
252 if (test_bit(SK_DEAD, &svsk->sk_flags)) {
253 /* Don't enqueue dead sockets */
254 dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk);
255 goto out_unlock;
256 }
257
258 /* Mark socket as busy. It will remain in this state until the
259 * server has processed all pending data and put the socket back
260 * on the idle list. We update SK_BUSY atomically because
261 * it also guards against trying to enqueue the svc_sock twice.
262 */
263 if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) {
264 /* Don't enqueue socket while already enqueued */
265 dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk);
266 goto out_unlock;
267 }
268 BUG_ON(svsk->sk_pool != NULL);
269 svsk->sk_pool = pool;
270
271 set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
272 if (((atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg)*2
273 > svc_sock_wspace(svsk))
274 && !test_bit(SK_CLOSE, &svsk->sk_flags)
275 && !test_bit(SK_CONN, &svsk->sk_flags)) {
276 /* Don't enqueue while not enough space for reply */
277 dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n",
278 svsk->sk_sk, atomic_read(&svsk->sk_reserved)+serv->sv_max_mesg,
279 svc_sock_wspace(svsk));
280 svsk->sk_pool = NULL;
281 clear_bit(SK_BUSY, &svsk->sk_flags);
282 goto out_unlock;
283 }
284 clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
285
286
287 if (!list_empty(&pool->sp_threads)) {
288 rqstp = list_entry(pool->sp_threads.next,
289 struct svc_rqst,
290 rq_list);
291 dprintk("svc: socket %p served by daemon %p\n",
292 svsk->sk_sk, rqstp);
293 svc_thread_dequeue(pool, rqstp);
294 if (rqstp->rq_sock)
295 printk(KERN_ERR
296 "svc_sock_enqueue: server %p, rq_sock=%p!\n",
297 rqstp, rqstp->rq_sock);
298 rqstp->rq_sock = svsk;
299 atomic_inc(&svsk->sk_inuse);
300 rqstp->rq_reserved = serv->sv_max_mesg;
301 atomic_add(rqstp->rq_reserved, &svsk->sk_reserved);
302 BUG_ON(svsk->sk_pool != pool);
303 wake_up(&rqstp->rq_wait);
304 } else {
305 dprintk("svc: socket %p put into queue\n", svsk->sk_sk);
306 list_add_tail(&svsk->sk_ready, &pool->sp_sockets);
307 BUG_ON(svsk->sk_pool != pool);
308 }
309
310out_unlock:
311 spin_unlock_bh(&pool->sp_lock);
312}
313
314/*
315 * Dequeue the first socket. Must be called with the pool->sp_lock held.
316 */
317static inline struct svc_sock *
318svc_sock_dequeue(struct svc_pool *pool)
319{
320 struct svc_sock *svsk;
321
322 if (list_empty(&pool->sp_sockets))
323 return NULL;
324
325 svsk = list_entry(pool->sp_sockets.next,
326 struct svc_sock, sk_ready);
327 list_del_init(&svsk->sk_ready);
328
329 dprintk("svc: socket %p dequeued, inuse=%d\n",
330 svsk->sk_sk, atomic_read(&svsk->sk_inuse));
331
332 return svsk;
333}
334
335/*
336 * Having read something from a socket, check whether it
337 * needs to be re-enqueued.
338 * Note: SK_DATA only gets cleared when a read-attempt finds
339 * no (or insufficient) data.
340 */
341static inline void
342svc_sock_received(struct svc_sock *svsk)
343{
344 svsk->sk_pool = NULL;
345 clear_bit(SK_BUSY, &svsk->sk_flags);
346 svc_sock_enqueue(svsk);
347}
348
349
350/**
351 * svc_reserve - change the space reserved for the reply to a request.
352 * @rqstp: The request in question
353 * @space: new max space to reserve
354 *
355 * Each request reserves some space on the output queue of the socket
356 * to make sure the reply fits. This function reduces that reserved
357 * space to be the amount of space used already, plus @space.
358 *
359 */
360void svc_reserve(struct svc_rqst *rqstp, int space)
361{
362 space += rqstp->rq_res.head[0].iov_len;
363
364 if (space < rqstp->rq_reserved) {
365 struct svc_sock *svsk = rqstp->rq_sock;
366 atomic_sub((rqstp->rq_reserved - space), &svsk->sk_reserved);
367 rqstp->rq_reserved = space;
368
369 svc_sock_enqueue(svsk);
370 }
371}
372
373/*
374 * Release a socket after use.
375 */
376static inline void
377svc_sock_put(struct svc_sock *svsk)
378{
379 if (atomic_dec_and_test(&svsk->sk_inuse)) {
380 BUG_ON(! test_bit(SK_DEAD, &svsk->sk_flags));
381
382 dprintk("svc: releasing dead socket\n");
383 if (svsk->sk_sock->file)
384 sockfd_put(svsk->sk_sock);
385 else
386 sock_release(svsk->sk_sock);
387 if (svsk->sk_info_authunix != NULL)
388 svcauth_unix_info_release(svsk->sk_info_authunix);
389 kfree(svsk);
390 }
391}
392
393static void
394svc_sock_release(struct svc_rqst *rqstp)
395{
396 struct svc_sock *svsk = rqstp->rq_sock;
397
398 svc_release_skb(rqstp);
399
400 svc_free_res_pages(rqstp);
401 rqstp->rq_res.page_len = 0;
402 rqstp->rq_res.page_base = 0;
403
404
405 /* Reset response buffer and release
406 * the reservation.
407 * But first, check that enough space was reserved
408 * for the reply, otherwise we have a bug!
409 */
410 if ((rqstp->rq_res.len) > rqstp->rq_reserved)
411 printk(KERN_ERR "RPC request reserved %d but used %d\n",
412 rqstp->rq_reserved,
413 rqstp->rq_res.len);
414
415 rqstp->rq_res.head[0].iov_len = 0;
416 svc_reserve(rqstp, 0);
417 rqstp->rq_sock = NULL;
418
419 svc_sock_put(svsk);
420}
421
422/*
423 * External function to wake up a server waiting for data
424 * This really only makes sense for services like lockd
425 * which have exactly one thread anyway.
426 */
427void
428svc_wake_up(struct svc_serv *serv)
429{
430 struct svc_rqst *rqstp;
431 unsigned int i;
432 struct svc_pool *pool;
433
434 for (i = 0; i < serv->sv_nrpools; i++) {
435 pool = &serv->sv_pools[i];
436
437 spin_lock_bh(&pool->sp_lock);
438 if (!list_empty(&pool->sp_threads)) {
439 rqstp = list_entry(pool->sp_threads.next,
440 struct svc_rqst,
441 rq_list);
442 dprintk("svc: daemon %p woken up.\n", rqstp);
443 /*
444 svc_thread_dequeue(pool, rqstp);
445 rqstp->rq_sock = NULL;
446 */
447 wake_up(&rqstp->rq_wait);
448 }
449 spin_unlock_bh(&pool->sp_lock);
450 }
451}
452
453union svc_pktinfo_u { 119union svc_pktinfo_u {
454 struct in_pktinfo pkti; 120 struct in_pktinfo pkti;
455 struct in6_pktinfo pkti6; 121 struct in6_pktinfo pkti6;
@@ -459,7 +125,9 @@ union svc_pktinfo_u {
459 125
460static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) 126static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
461{ 127{
462 switch (rqstp->rq_sock->sk_sk->sk_family) { 128 struct svc_sock *svsk =
129 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
130 switch (svsk->sk_sk->sk_family) {
463 case AF_INET: { 131 case AF_INET: {
464 struct in_pktinfo *pki = CMSG_DATA(cmh); 132 struct in_pktinfo *pki = CMSG_DATA(cmh);
465 133
@@ -489,10 +157,10 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
489/* 157/*
490 * Generic sendto routine 158 * Generic sendto routine
491 */ 159 */
492static int 160static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
493svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
494{ 161{
495 struct svc_sock *svsk = rqstp->rq_sock; 162 struct svc_sock *svsk =
163 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
496 struct socket *sock = svsk->sk_sock; 164 struct socket *sock = svsk->sk_sock;
497 int slen; 165 int slen;
498 union { 166 union {
@@ -565,7 +233,7 @@ svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
565 } 233 }
566out: 234out:
567 dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n", 235 dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n",
568 rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, 236 svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
569 xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); 237 xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
570 238
571 return len; 239 return len;
@@ -602,7 +270,7 @@ svc_sock_names(char *buf, struct svc_serv *serv, char *toclose)
602 if (!serv) 270 if (!serv)
603 return 0; 271 return 0;
604 spin_lock_bh(&serv->sv_lock); 272 spin_lock_bh(&serv->sv_lock);
605 list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) { 273 list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list) {
606 int onelen = one_sock_name(buf+len, svsk); 274 int onelen = one_sock_name(buf+len, svsk);
607 if (toclose && strcmp(toclose, buf+len) == 0) 275 if (toclose && strcmp(toclose, buf+len) == 0)
608 closesk = svsk; 276 closesk = svsk;
@@ -614,7 +282,7 @@ svc_sock_names(char *buf, struct svc_serv *serv, char *toclose)
614 /* Should unregister with portmap, but you cannot 282 /* Should unregister with portmap, but you cannot
615 * unregister just one protocol... 283 * unregister just one protocol...
616 */ 284 */
617 svc_close_socket(closesk); 285 svc_close_xprt(&closesk->sk_xprt);
618 else if (toclose) 286 else if (toclose)
619 return -ENOENT; 287 return -ENOENT;
620 return len; 288 return len;
@@ -624,8 +292,7 @@ EXPORT_SYMBOL(svc_sock_names);
624/* 292/*
625 * Check input queue length 293 * Check input queue length
626 */ 294 */
627static int 295static int svc_recv_available(struct svc_sock *svsk)
628svc_recv_available(struct svc_sock *svsk)
629{ 296{
630 struct socket *sock = svsk->sk_sock; 297 struct socket *sock = svsk->sk_sock;
631 int avail, err; 298 int avail, err;
@@ -638,48 +305,31 @@ svc_recv_available(struct svc_sock *svsk)
638/* 305/*
639 * Generic recvfrom routine. 306 * Generic recvfrom routine.
640 */ 307 */
641static int 308static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
642svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen) 309 int buflen)
643{ 310{
644 struct svc_sock *svsk = rqstp->rq_sock; 311 struct svc_sock *svsk =
312 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
645 struct msghdr msg = { 313 struct msghdr msg = {
646 .msg_flags = MSG_DONTWAIT, 314 .msg_flags = MSG_DONTWAIT,
647 }; 315 };
648 struct sockaddr *sin;
649 int len; 316 int len;
650 317
318 rqstp->rq_xprt_hlen = 0;
319
651 len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen, 320 len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen,
652 msg.msg_flags); 321 msg.msg_flags);
653 322
654 /* sock_recvmsg doesn't fill in the name/namelen, so we must..
655 */
656 memcpy(&rqstp->rq_addr, &svsk->sk_remote, svsk->sk_remotelen);
657 rqstp->rq_addrlen = svsk->sk_remotelen;
658
659 /* Destination address in request is needed for binding the
660 * source address in RPC callbacks later.
661 */
662 sin = (struct sockaddr *)&svsk->sk_local;
663 switch (sin->sa_family) {
664 case AF_INET:
665 rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr;
666 break;
667 case AF_INET6:
668 rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr;
669 break;
670 }
671
672 dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", 323 dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
673 svsk, iov[0].iov_base, iov[0].iov_len, len); 324 svsk, iov[0].iov_base, iov[0].iov_len, len);
674
675 return len; 325 return len;
676} 326}
677 327
678/* 328/*
679 * Set socket snd and rcv buffer lengths 329 * Set socket snd and rcv buffer lengths
680 */ 330 */
681static inline void 331static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
682svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv) 332 unsigned int rcv)
683{ 333{
684#if 0 334#if 0
685 mm_segment_t oldfs; 335 mm_segment_t oldfs;
@@ -704,16 +354,16 @@ svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv)
704/* 354/*
705 * INET callback when data has been received on the socket. 355 * INET callback when data has been received on the socket.
706 */ 356 */
707static void 357static void svc_udp_data_ready(struct sock *sk, int count)
708svc_udp_data_ready(struct sock *sk, int count)
709{ 358{
710 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 359 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
711 360
712 if (svsk) { 361 if (svsk) {
713 dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", 362 dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n",
714 svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags)); 363 svsk, sk, count,
715 set_bit(SK_DATA, &svsk->sk_flags); 364 test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
716 svc_sock_enqueue(svsk); 365 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
366 svc_xprt_enqueue(&svsk->sk_xprt);
717 } 367 }
718 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 368 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
719 wake_up_interruptible(sk->sk_sleep); 369 wake_up_interruptible(sk->sk_sleep);
@@ -722,15 +372,14 @@ svc_udp_data_ready(struct sock *sk, int count)
722/* 372/*
723 * INET callback when space is newly available on the socket. 373 * INET callback when space is newly available on the socket.
724 */ 374 */
725static void 375static void svc_write_space(struct sock *sk)
726svc_write_space(struct sock *sk)
727{ 376{
728 struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); 377 struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
729 378
730 if (svsk) { 379 if (svsk) {
731 dprintk("svc: socket %p(inet %p), write_space busy=%d\n", 380 dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
732 svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags)); 381 svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
733 svc_sock_enqueue(svsk); 382 svc_xprt_enqueue(&svsk->sk_xprt);
734 } 383 }
735 384
736 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) { 385 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) {
@@ -740,10 +389,19 @@ svc_write_space(struct sock *sk)
740 } 389 }
741} 390}
742 391
743static inline void svc_udp_get_dest_address(struct svc_rqst *rqstp, 392/*
744 struct cmsghdr *cmh) 393 * Copy the UDP datagram's destination address to the rqstp structure.
394 * The 'destination' address in this case is the address to which the
395 * peer sent the datagram, i.e. our local address. For multihomed
396 * hosts, this can change from msg to msg. Note that only the IP
 397 * address changes; the port number should remain the same.
398 */
399static void svc_udp_get_dest_address(struct svc_rqst *rqstp,
400 struct cmsghdr *cmh)
745{ 401{
746 switch (rqstp->rq_sock->sk_sk->sk_family) { 402 struct svc_sock *svsk =
403 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
404 switch (svsk->sk_sk->sk_family) {
747 case AF_INET: { 405 case AF_INET: {
748 struct in_pktinfo *pki = CMSG_DATA(cmh); 406 struct in_pktinfo *pki = CMSG_DATA(cmh);
749 rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr; 407 rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr;
@@ -760,11 +418,11 @@ static inline void svc_udp_get_dest_address(struct svc_rqst *rqstp,
760/* 418/*
761 * Receive a datagram from a UDP socket. 419 * Receive a datagram from a UDP socket.
762 */ 420 */
763static int 421static int svc_udp_recvfrom(struct svc_rqst *rqstp)
764svc_udp_recvfrom(struct svc_rqst *rqstp)
765{ 422{
766 struct svc_sock *svsk = rqstp->rq_sock; 423 struct svc_sock *svsk =
767 struct svc_serv *serv = svsk->sk_server; 424 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
425 struct svc_serv *serv = svsk->sk_xprt.xpt_server;
768 struct sk_buff *skb; 426 struct sk_buff *skb;
769 union { 427 union {
770 struct cmsghdr hdr; 428 struct cmsghdr hdr;
@@ -779,7 +437,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
779 .msg_flags = MSG_DONTWAIT, 437 .msg_flags = MSG_DONTWAIT,
780 }; 438 };
781 439
782 if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) 440 if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
783 /* udp sockets need large rcvbuf as all pending 441 /* udp sockets need large rcvbuf as all pending
784 * requests are still in that buffer. sndbuf must 442 * requests are still in that buffer. sndbuf must
785 * also be large enough that there is enough space 443 * also be large enough that there is enough space
@@ -792,17 +450,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
792 (serv->sv_nrthreads+3) * serv->sv_max_mesg, 450 (serv->sv_nrthreads+3) * serv->sv_max_mesg,
793 (serv->sv_nrthreads+3) * serv->sv_max_mesg); 451 (serv->sv_nrthreads+3) * serv->sv_max_mesg);
794 452
795 if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { 453 clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
796 svc_sock_received(svsk);
797 return svc_deferred_recv(rqstp);
798 }
799
800 if (test_bit(SK_CLOSE, &svsk->sk_flags)) {
801 svc_delete_socket(svsk);
802 return 0;
803 }
804
805 clear_bit(SK_DATA, &svsk->sk_flags);
806 skb = NULL; 454 skb = NULL;
807 err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, 455 err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
808 0, 0, MSG_PEEK | MSG_DONTWAIT); 456 0, 0, MSG_PEEK | MSG_DONTWAIT);
@@ -813,24 +461,27 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
813 if (err != -EAGAIN) { 461 if (err != -EAGAIN) {
814 /* possibly an icmp error */ 462 /* possibly an icmp error */
815 dprintk("svc: recvfrom returned error %d\n", -err); 463 dprintk("svc: recvfrom returned error %d\n", -err);
816 set_bit(SK_DATA, &svsk->sk_flags); 464 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
817 } 465 }
818 svc_sock_received(svsk); 466 svc_xprt_received(&svsk->sk_xprt);
819 return -EAGAIN; 467 return -EAGAIN;
820 } 468 }
821 rqstp->rq_addrlen = sizeof(rqstp->rq_addr); 469 len = svc_addr_len(svc_addr(rqstp));
470 if (len < 0)
471 return len;
472 rqstp->rq_addrlen = len;
822 if (skb->tstamp.tv64 == 0) { 473 if (skb->tstamp.tv64 == 0) {
823 skb->tstamp = ktime_get_real(); 474 skb->tstamp = ktime_get_real();
824 /* Don't enable netstamp, sunrpc doesn't 475 /* Don't enable netstamp, sunrpc doesn't
825 need that much accuracy */ 476 need that much accuracy */
826 } 477 }
827 svsk->sk_sk->sk_stamp = skb->tstamp; 478 svsk->sk_sk->sk_stamp = skb->tstamp;
828 set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ 479 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */
829 480
830 /* 481 /*
831 * Maybe more packets - kick another thread ASAP. 482 * Maybe more packets - kick another thread ASAP.
832 */ 483 */
833 svc_sock_received(svsk); 484 svc_xprt_received(&svsk->sk_xprt);
834 485
835 len = skb->len - sizeof(struct udphdr); 486 len = skb->len - sizeof(struct udphdr);
836 rqstp->rq_arg.len = len; 487 rqstp->rq_arg.len = len;
@@ -861,13 +512,14 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
861 skb_free_datagram(svsk->sk_sk, skb); 512 skb_free_datagram(svsk->sk_sk, skb);
862 } else { 513 } else {
863 /* we can use it in-place */ 514 /* we can use it in-place */
864 rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); 515 rqstp->rq_arg.head[0].iov_base = skb->data +
516 sizeof(struct udphdr);
865 rqstp->rq_arg.head[0].iov_len = len; 517 rqstp->rq_arg.head[0].iov_len = len;
866 if (skb_checksum_complete(skb)) { 518 if (skb_checksum_complete(skb)) {
867 skb_free_datagram(svsk->sk_sk, skb); 519 skb_free_datagram(svsk->sk_sk, skb);
868 return 0; 520 return 0;
869 } 521 }
870 rqstp->rq_skbuff = skb; 522 rqstp->rq_xprt_ctxt = skb;
871 } 523 }
872 524
873 rqstp->rq_arg.page_base = 0; 525 rqstp->rq_arg.page_base = 0;
@@ -900,27 +552,81 @@ svc_udp_sendto(struct svc_rqst *rqstp)
900 return error; 552 return error;
901} 553}
902 554
903static void 555static void svc_udp_prep_reply_hdr(struct svc_rqst *rqstp)
904svc_udp_init(struct svc_sock *svsk) 556{
557}
558
559static int svc_udp_has_wspace(struct svc_xprt *xprt)
560{
561 struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
562 struct svc_serv *serv = xprt->xpt_server;
563 unsigned long required;
564
565 /*
566 * Set the SOCK_NOSPACE flag before checking the available
567 * sock space.
568 */
569 set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
570 required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg;
571 if (required*2 > sock_wspace(svsk->sk_sk))
572 return 0;
573 clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
574 return 1;
575}
576
577static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt)
578{
579 BUG();
580 return NULL;
581}
582
583static struct svc_xprt *svc_udp_create(struct svc_serv *serv,
584 struct sockaddr *sa, int salen,
585 int flags)
586{
587 return svc_create_socket(serv, IPPROTO_UDP, sa, salen, flags);
588}
589
590static struct svc_xprt_ops svc_udp_ops = {
591 .xpo_create = svc_udp_create,
592 .xpo_recvfrom = svc_udp_recvfrom,
593 .xpo_sendto = svc_udp_sendto,
594 .xpo_release_rqst = svc_release_skb,
595 .xpo_detach = svc_sock_detach,
596 .xpo_free = svc_sock_free,
597 .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
598 .xpo_has_wspace = svc_udp_has_wspace,
599 .xpo_accept = svc_udp_accept,
600};
601
602static struct svc_xprt_class svc_udp_class = {
603 .xcl_name = "udp",
604 .xcl_owner = THIS_MODULE,
605 .xcl_ops = &svc_udp_ops,
606 .xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP,
607};
608
609static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
905{ 610{
906 int one = 1; 611 int one = 1;
907 mm_segment_t oldfs; 612 mm_segment_t oldfs;
908 613
614 svc_xprt_init(&svc_udp_class, &svsk->sk_xprt, serv);
615 clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
909 svsk->sk_sk->sk_data_ready = svc_udp_data_ready; 616 svsk->sk_sk->sk_data_ready = svc_udp_data_ready;
910 svsk->sk_sk->sk_write_space = svc_write_space; 617 svsk->sk_sk->sk_write_space = svc_write_space;
911 svsk->sk_recvfrom = svc_udp_recvfrom;
912 svsk->sk_sendto = svc_udp_sendto;
913 618
 914 /* initial setting: must have enough space to 619 /* initial setting: must have enough space to
915 * receive and respond to one request. 620 * receive and respond to one request.
916 * svc_udp_recvfrom will re-adjust if necessary 621 * svc_udp_recvfrom will re-adjust if necessary
917 */ 622 */
918 svc_sock_setbufsize(svsk->sk_sock, 623 svc_sock_setbufsize(svsk->sk_sock,
919 3 * svsk->sk_server->sv_max_mesg, 624 3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
920 3 * svsk->sk_server->sv_max_mesg); 625 3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
921 626
922 set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */ 627 /* data might have come in before data_ready set up */
923 set_bit(SK_CHNGBUF, &svsk->sk_flags); 628 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
629 set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
924 630
925 oldfs = get_fs(); 631 oldfs = get_fs();
926 set_fs(KERNEL_DS); 632 set_fs(KERNEL_DS);
@@ -934,8 +640,7 @@ svc_udp_init(struct svc_sock *svsk)
934 * A data_ready event on a listening socket means there's a connection 640 * A data_ready event on a listening socket means there's a connection
935 * pending. Do not use state_change as a substitute for it. 641 * pending. Do not use state_change as a substitute for it.
936 */ 642 */
937static void 643static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
938svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
939{ 644{
940 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 645 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
941 646
@@ -954,8 +659,8 @@ svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
954 */ 659 */
955 if (sk->sk_state == TCP_LISTEN) { 660 if (sk->sk_state == TCP_LISTEN) {
956 if (svsk) { 661 if (svsk) {
957 set_bit(SK_CONN, &svsk->sk_flags); 662 set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
958 svc_sock_enqueue(svsk); 663 svc_xprt_enqueue(&svsk->sk_xprt);
959 } else 664 } else
960 printk("svc: socket %p: no user data\n", sk); 665 printk("svc: socket %p: no user data\n", sk);
961 } 666 }
@@ -967,8 +672,7 @@ svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
967/* 672/*
968 * A state change on a connected socket means it's dying or dead. 673 * A state change on a connected socket means it's dying or dead.
969 */ 674 */
970static void 675static void svc_tcp_state_change(struct sock *sk)
971svc_tcp_state_change(struct sock *sk)
972{ 676{
973 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 677 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
974 678
@@ -978,51 +682,36 @@ svc_tcp_state_change(struct sock *sk)
978 if (!svsk) 682 if (!svsk)
979 printk("svc: socket %p: no user data\n", sk); 683 printk("svc: socket %p: no user data\n", sk);
980 else { 684 else {
981 set_bit(SK_CLOSE, &svsk->sk_flags); 685 set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
982 svc_sock_enqueue(svsk); 686 svc_xprt_enqueue(&svsk->sk_xprt);
983 } 687 }
984 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 688 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
985 wake_up_interruptible_all(sk->sk_sleep); 689 wake_up_interruptible_all(sk->sk_sleep);
986} 690}
987 691
988static void 692static void svc_tcp_data_ready(struct sock *sk, int count)
989svc_tcp_data_ready(struct sock *sk, int count)
990{ 693{
991 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 694 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
992 695
993 dprintk("svc: socket %p TCP data ready (svsk %p)\n", 696 dprintk("svc: socket %p TCP data ready (svsk %p)\n",
994 sk, sk->sk_user_data); 697 sk, sk->sk_user_data);
995 if (svsk) { 698 if (svsk) {
996 set_bit(SK_DATA, &svsk->sk_flags); 699 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
997 svc_sock_enqueue(svsk); 700 svc_xprt_enqueue(&svsk->sk_xprt);
998 } 701 }
999 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 702 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1000 wake_up_interruptible(sk->sk_sleep); 703 wake_up_interruptible(sk->sk_sleep);
1001} 704}
1002 705
1003static inline int svc_port_is_privileged(struct sockaddr *sin)
1004{
1005 switch (sin->sa_family) {
1006 case AF_INET:
1007 return ntohs(((struct sockaddr_in *)sin)->sin_port)
1008 < PROT_SOCK;
1009 case AF_INET6:
1010 return ntohs(((struct sockaddr_in6 *)sin)->sin6_port)
1011 < PROT_SOCK;
1012 default:
1013 return 0;
1014 }
1015}
1016
1017/* 706/*
1018 * Accept a TCP connection 707 * Accept a TCP connection
1019 */ 708 */
1020static void 709static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
1021svc_tcp_accept(struct svc_sock *svsk)
1022{ 710{
711 struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
1023 struct sockaddr_storage addr; 712 struct sockaddr_storage addr;
1024 struct sockaddr *sin = (struct sockaddr *) &addr; 713 struct sockaddr *sin = (struct sockaddr *) &addr;
1025 struct svc_serv *serv = svsk->sk_server; 714 struct svc_serv *serv = svsk->sk_xprt.xpt_server;
1026 struct socket *sock = svsk->sk_sock; 715 struct socket *sock = svsk->sk_sock;
1027 struct socket *newsock; 716 struct socket *newsock;
1028 struct svc_sock *newsvsk; 717 struct svc_sock *newsvsk;
@@ -1031,9 +720,9 @@ svc_tcp_accept(struct svc_sock *svsk)
1031 720
1032 dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); 721 dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
1033 if (!sock) 722 if (!sock)
1034 return; 723 return NULL;
1035 724
1036 clear_bit(SK_CONN, &svsk->sk_flags); 725 clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
1037 err = kernel_accept(sock, &newsock, O_NONBLOCK); 726 err = kernel_accept(sock, &newsock, O_NONBLOCK);
1038 if (err < 0) { 727 if (err < 0) {
1039 if (err == -ENOMEM) 728 if (err == -ENOMEM)
@@ -1042,11 +731,9 @@ svc_tcp_accept(struct svc_sock *svsk)
1042 else if (err != -EAGAIN && net_ratelimit()) 731 else if (err != -EAGAIN && net_ratelimit())
1043 printk(KERN_WARNING "%s: accept failed (err %d)!\n", 732 printk(KERN_WARNING "%s: accept failed (err %d)!\n",
1044 serv->sv_name, -err); 733 serv->sv_name, -err);
1045 return; 734 return NULL;
1046 } 735 }
1047 736 set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
1048 set_bit(SK_CONN, &svsk->sk_flags);
1049 svc_sock_enqueue(svsk);
1050 737
1051 err = kernel_getpeername(newsock, sin, &slen); 738 err = kernel_getpeername(newsock, sin, &slen);
1052 if (err < 0) { 739 if (err < 0) {
@@ -1077,106 +764,42 @@ svc_tcp_accept(struct svc_sock *svsk)
1077 if (!(newsvsk = svc_setup_socket(serv, newsock, &err, 764 if (!(newsvsk = svc_setup_socket(serv, newsock, &err,
1078 (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY)))) 765 (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY))))
1079 goto failed; 766 goto failed;
1080 memcpy(&newsvsk->sk_remote, sin, slen); 767 svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen);
1081 newsvsk->sk_remotelen = slen;
1082 err = kernel_getsockname(newsock, sin, &slen); 768 err = kernel_getsockname(newsock, sin, &slen);
1083 if (unlikely(err < 0)) { 769 if (unlikely(err < 0)) {
1084 dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err); 770 dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err);
1085 slen = offsetof(struct sockaddr, sa_data); 771 slen = offsetof(struct sockaddr, sa_data);
1086 } 772 }
1087 memcpy(&newsvsk->sk_local, sin, slen); 773 svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen);
1088
1089 svc_sock_received(newsvsk);
1090
1091 /* make sure that we don't have too many active connections.
1092 * If we have, something must be dropped.
1093 *
1094 * There's no point in trying to do random drop here for
1095 * DoS prevention. The NFS client does 1 reconnect in 15
1096 * seconds. An attacker can easily beat that.
1097 *
1098 * The only somewhat efficient mechanism would be to drop
1099 * old connections from the same IP first. But right now
1100 * we don't even record the client IP in svc_sock.
1101 */
1102 if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) {
1103 struct svc_sock *svsk = NULL;
1104 spin_lock_bh(&serv->sv_lock);
1105 if (!list_empty(&serv->sv_tempsocks)) {
1106 if (net_ratelimit()) {
1107 /* Try to help the admin */
1108 printk(KERN_NOTICE "%s: too many open TCP "
1109 "sockets, consider increasing the "
1110 "number of nfsd threads\n",
1111 serv->sv_name);
1112 printk(KERN_NOTICE
1113 "%s: last TCP connect from %s\n",
1114 serv->sv_name, __svc_print_addr(sin,
1115 buf, sizeof(buf)));
1116 }
1117 /*
1118 * Always select the oldest socket. It's not fair,
1119 * but so is life
1120 */
1121 svsk = list_entry(serv->sv_tempsocks.prev,
1122 struct svc_sock,
1123 sk_list);
1124 set_bit(SK_CLOSE, &svsk->sk_flags);
1125 atomic_inc(&svsk->sk_inuse);
1126 }
1127 spin_unlock_bh(&serv->sv_lock);
1128
1129 if (svsk) {
1130 svc_sock_enqueue(svsk);
1131 svc_sock_put(svsk);
1132 }
1133
1134 }
1135 774
1136 if (serv->sv_stats) 775 if (serv->sv_stats)
1137 serv->sv_stats->nettcpconn++; 776 serv->sv_stats->nettcpconn++;
1138 777
1139 return; 778 return &newsvsk->sk_xprt;
1140 779
1141failed: 780failed:
1142 sock_release(newsock); 781 sock_release(newsock);
1143 return; 782 return NULL;
1144} 783}
1145 784
1146/* 785/*
1147 * Receive data from a TCP socket. 786 * Receive data from a TCP socket.
1148 */ 787 */
1149static int 788static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
1150svc_tcp_recvfrom(struct svc_rqst *rqstp)
1151{ 789{
1152 struct svc_sock *svsk = rqstp->rq_sock; 790 struct svc_sock *svsk =
1153 struct svc_serv *serv = svsk->sk_server; 791 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
792 struct svc_serv *serv = svsk->sk_xprt.xpt_server;
1154 int len; 793 int len;
1155 struct kvec *vec; 794 struct kvec *vec;
1156 int pnum, vlen; 795 int pnum, vlen;
1157 796
1158 dprintk("svc: tcp_recv %p data %d conn %d close %d\n", 797 dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
1159 svsk, test_bit(SK_DATA, &svsk->sk_flags), 798 svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
1160 test_bit(SK_CONN, &svsk->sk_flags), 799 test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags),
1161 test_bit(SK_CLOSE, &svsk->sk_flags)); 800 test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags));
1162 801
1163 if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { 802 if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
1164 svc_sock_received(svsk);
1165 return svc_deferred_recv(rqstp);
1166 }
1167
1168 if (test_bit(SK_CLOSE, &svsk->sk_flags)) {
1169 svc_delete_socket(svsk);
1170 return 0;
1171 }
1172
1173 if (svsk->sk_sk->sk_state == TCP_LISTEN) {
1174 svc_tcp_accept(svsk);
1175 svc_sock_received(svsk);
1176 return 0;
1177 }
1178
1179 if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags))
1180 /* sndbuf needs to have room for one request 803 /* sndbuf needs to have room for one request
1181 * per thread, otherwise we can stall even when the 804 * per thread, otherwise we can stall even when the
1182 * network isn't a bottleneck. 805 * network isn't a bottleneck.
@@ -1193,7 +816,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
1193 (serv->sv_nrthreads+3) * serv->sv_max_mesg, 816 (serv->sv_nrthreads+3) * serv->sv_max_mesg,
1194 3 * serv->sv_max_mesg); 817 3 * serv->sv_max_mesg);
1195 818
1196 clear_bit(SK_DATA, &svsk->sk_flags); 819 clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
1197 820
1198 /* Receive data. If we haven't got the record length yet, get 821 /* Receive data. If we haven't got the record length yet, get
1199 * the next four bytes. Otherwise try to gobble up as much as 822 * the next four bytes. Otherwise try to gobble up as much as
@@ -1212,7 +835,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
1212 if (len < want) { 835 if (len < want) {
1213 dprintk("svc: short recvfrom while reading record length (%d of %lu)\n", 836 dprintk("svc: short recvfrom while reading record length (%d of %lu)\n",
1214 len, want); 837 len, want);
1215 svc_sock_received(svsk); 838 svc_xprt_received(&svsk->sk_xprt);
1216 return -EAGAIN; /* record header not complete */ 839 return -EAGAIN; /* record header not complete */
1217 } 840 }
1218 841
@@ -1248,11 +871,11 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
1248 if (len < svsk->sk_reclen) { 871 if (len < svsk->sk_reclen) {
1249 dprintk("svc: incomplete TCP record (%d of %d)\n", 872 dprintk("svc: incomplete TCP record (%d of %d)\n",
1250 len, svsk->sk_reclen); 873 len, svsk->sk_reclen);
1251 svc_sock_received(svsk); 874 svc_xprt_received(&svsk->sk_xprt);
1252 return -EAGAIN; /* record not complete */ 875 return -EAGAIN; /* record not complete */
1253 } 876 }
1254 len = svsk->sk_reclen; 877 len = svsk->sk_reclen;
1255 set_bit(SK_DATA, &svsk->sk_flags); 878 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
1256 879
1257 vec = rqstp->rq_vec; 880 vec = rqstp->rq_vec;
1258 vec[0] = rqstp->rq_arg.head[0]; 881 vec[0] = rqstp->rq_arg.head[0];
@@ -1281,30 +904,31 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
1281 rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; 904 rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
1282 } 905 }
1283 906
1284 rqstp->rq_skbuff = NULL; 907 rqstp->rq_xprt_ctxt = NULL;
1285 rqstp->rq_prot = IPPROTO_TCP; 908 rqstp->rq_prot = IPPROTO_TCP;
1286 909
1287 /* Reset TCP read info */ 910 /* Reset TCP read info */
1288 svsk->sk_reclen = 0; 911 svsk->sk_reclen = 0;
1289 svsk->sk_tcplen = 0; 912 svsk->sk_tcplen = 0;
1290 913
1291 svc_sock_received(svsk); 914 svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt);
915 svc_xprt_received(&svsk->sk_xprt);
1292 if (serv->sv_stats) 916 if (serv->sv_stats)
1293 serv->sv_stats->nettcpcnt++; 917 serv->sv_stats->nettcpcnt++;
1294 918
1295 return len; 919 return len;
1296 920
1297 err_delete: 921 err_delete:
1298 svc_delete_socket(svsk); 922 set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
1299 return -EAGAIN; 923 return -EAGAIN;
1300 924
1301 error: 925 error:
1302 if (len == -EAGAIN) { 926 if (len == -EAGAIN) {
1303 dprintk("RPC: TCP recvfrom got EAGAIN\n"); 927 dprintk("RPC: TCP recvfrom got EAGAIN\n");
1304 svc_sock_received(svsk); 928 svc_xprt_received(&svsk->sk_xprt);
1305 } else { 929 } else {
1306 printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", 930 printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
1307 svsk->sk_server->sv_name, -len); 931 svsk->sk_xprt.xpt_server->sv_name, -len);
1308 goto err_delete; 932 goto err_delete;
1309 } 933 }
1310 934
@@ -1314,8 +938,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
1314/* 938/*
1315 * Send out data on TCP socket. 939 * Send out data on TCP socket.
1316 */ 940 */
1317static int 941static int svc_tcp_sendto(struct svc_rqst *rqstp)
1318svc_tcp_sendto(struct svc_rqst *rqstp)
1319{ 942{
1320 struct xdr_buf *xbufp = &rqstp->rq_res; 943 struct xdr_buf *xbufp = &rqstp->rq_res;
1321 int sent; 944 int sent;
@@ -1328,35 +951,109 @@ svc_tcp_sendto(struct svc_rqst *rqstp)
1328 reclen = htonl(0x80000000|((xbufp->len ) - 4)); 951 reclen = htonl(0x80000000|((xbufp->len ) - 4));
1329 memcpy(xbufp->head[0].iov_base, &reclen, 4); 952 memcpy(xbufp->head[0].iov_base, &reclen, 4);
1330 953
1331 if (test_bit(SK_DEAD, &rqstp->rq_sock->sk_flags)) 954 if (test_bit(XPT_DEAD, &rqstp->rq_xprt->xpt_flags))
1332 return -ENOTCONN; 955 return -ENOTCONN;
1333 956
1334 sent = svc_sendto(rqstp, &rqstp->rq_res); 957 sent = svc_sendto(rqstp, &rqstp->rq_res);
1335 if (sent != xbufp->len) { 958 if (sent != xbufp->len) {
1336 printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", 959 printk(KERN_NOTICE
1337 rqstp->rq_sock->sk_server->sv_name, 960 "rpc-srv/tcp: %s: %s %d when sending %d bytes "
961 "- shutting down socket\n",
962 rqstp->rq_xprt->xpt_server->sv_name,
1338 (sent<0)?"got error":"sent only", 963 (sent<0)?"got error":"sent only",
1339 sent, xbufp->len); 964 sent, xbufp->len);
1340 set_bit(SK_CLOSE, &rqstp->rq_sock->sk_flags); 965 set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags);
1341 svc_sock_enqueue(rqstp->rq_sock); 966 svc_xprt_enqueue(rqstp->rq_xprt);
1342 sent = -EAGAIN; 967 sent = -EAGAIN;
1343 } 968 }
1344 return sent; 969 return sent;
1345} 970}
1346 971
1347static void 972/*
1348svc_tcp_init(struct svc_sock *svsk) 973 * Setup response header. TCP has a 4B record length field.
974 */
975static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp)
976{
977 struct kvec *resv = &rqstp->rq_res.head[0];
978
979 /* tcp needs a space for the record length... */
980 svc_putnl(resv, 0);
981}
982
983static int svc_tcp_has_wspace(struct svc_xprt *xprt)
984{
985 struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
986 struct svc_serv *serv = svsk->sk_xprt.xpt_server;
987 int required;
988 int wspace;
989
990 /*
991 * Set the SOCK_NOSPACE flag before checking the available
992 * sock space.
993 */
994 set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
995 required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg;
996 wspace = sk_stream_wspace(svsk->sk_sk);
997
998 if (wspace < sk_stream_min_wspace(svsk->sk_sk))
999 return 0;
1000 if (required * 2 > wspace)
1001 return 0;
1002
1003 clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
1004 return 1;
1005}
1006
1007static struct svc_xprt *svc_tcp_create(struct svc_serv *serv,
1008 struct sockaddr *sa, int salen,
1009 int flags)
1010{
1011 return svc_create_socket(serv, IPPROTO_TCP, sa, salen, flags);
1012}
1013
1014static struct svc_xprt_ops svc_tcp_ops = {
1015 .xpo_create = svc_tcp_create,
1016 .xpo_recvfrom = svc_tcp_recvfrom,
1017 .xpo_sendto = svc_tcp_sendto,
1018 .xpo_release_rqst = svc_release_skb,
1019 .xpo_detach = svc_sock_detach,
1020 .xpo_free = svc_sock_free,
1021 .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,
1022 .xpo_has_wspace = svc_tcp_has_wspace,
1023 .xpo_accept = svc_tcp_accept,
1024};
1025
1026static struct svc_xprt_class svc_tcp_class = {
1027 .xcl_name = "tcp",
1028 .xcl_owner = THIS_MODULE,
1029 .xcl_ops = &svc_tcp_ops,
1030 .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
1031};
1032
1033void svc_init_xprt_sock(void)
1034{
1035 svc_reg_xprt_class(&svc_tcp_class);
1036 svc_reg_xprt_class(&svc_udp_class);
1037}
1038
1039void svc_cleanup_xprt_sock(void)
1040{
1041 svc_unreg_xprt_class(&svc_tcp_class);
1042 svc_unreg_xprt_class(&svc_udp_class);
1043}
1044
1045static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
1349{ 1046{
1350 struct sock *sk = svsk->sk_sk; 1047 struct sock *sk = svsk->sk_sk;
1351 struct tcp_sock *tp = tcp_sk(sk); 1048 struct tcp_sock *tp = tcp_sk(sk);
1352 1049
1353 svsk->sk_recvfrom = svc_tcp_recvfrom; 1050 svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt, serv);
1354 svsk->sk_sendto = svc_tcp_sendto; 1051 set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
1355
1356 if (sk->sk_state == TCP_LISTEN) { 1052 if (sk->sk_state == TCP_LISTEN) {
1357 dprintk("setting up TCP socket for listening\n"); 1053 dprintk("setting up TCP socket for listening\n");
1054 set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
1358 sk->sk_data_ready = svc_tcp_listen_data_ready; 1055 sk->sk_data_ready = svc_tcp_listen_data_ready;
1359 set_bit(SK_CONN, &svsk->sk_flags); 1056 set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
1360 } else { 1057 } else {
1361 dprintk("setting up TCP socket for reading\n"); 1058 dprintk("setting up TCP socket for reading\n");
1362 sk->sk_state_change = svc_tcp_state_change; 1059 sk->sk_state_change = svc_tcp_state_change;
@@ -1373,18 +1070,17 @@ svc_tcp_init(struct svc_sock *svsk)
1373 * svc_tcp_recvfrom will re-adjust if necessary 1070 * svc_tcp_recvfrom will re-adjust if necessary
1374 */ 1071 */
1375 svc_sock_setbufsize(svsk->sk_sock, 1072 svc_sock_setbufsize(svsk->sk_sock,
1376 3 * svsk->sk_server->sv_max_mesg, 1073 3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
1377 3 * svsk->sk_server->sv_max_mesg); 1074 3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
1378 1075
1379 set_bit(SK_CHNGBUF, &svsk->sk_flags); 1076 set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
1380 set_bit(SK_DATA, &svsk->sk_flags); 1077 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
1381 if (sk->sk_state != TCP_ESTABLISHED) 1078 if (sk->sk_state != TCP_ESTABLISHED)
1382 set_bit(SK_CLOSE, &svsk->sk_flags); 1079 set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
1383 } 1080 }
1384} 1081}
1385 1082
1386void 1083void svc_sock_update_bufs(struct svc_serv *serv)
1387svc_sock_update_bufs(struct svc_serv *serv)
1388{ 1084{
1389 /* 1085 /*
1390 * The number of server threads has changed. Update 1086 * The number of server threads has changed. Update
@@ -1395,232 +1091,18 @@ svc_sock_update_bufs(struct svc_serv *serv)
1395 spin_lock_bh(&serv->sv_lock); 1091 spin_lock_bh(&serv->sv_lock);
1396 list_for_each(le, &serv->sv_permsocks) { 1092 list_for_each(le, &serv->sv_permsocks) {
1397 struct svc_sock *svsk = 1093 struct svc_sock *svsk =
1398 list_entry(le, struct svc_sock, sk_list); 1094 list_entry(le, struct svc_sock, sk_xprt.xpt_list);
1399 set_bit(SK_CHNGBUF, &svsk->sk_flags); 1095 set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
1400 } 1096 }
1401 list_for_each(le, &serv->sv_tempsocks) { 1097 list_for_each(le, &serv->sv_tempsocks) {
1402 struct svc_sock *svsk = 1098 struct svc_sock *svsk =
1403 list_entry(le, struct svc_sock, sk_list); 1099 list_entry(le, struct svc_sock, sk_xprt.xpt_list);
1404 set_bit(SK_CHNGBUF, &svsk->sk_flags); 1100 set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
1405 } 1101 }
1406 spin_unlock_bh(&serv->sv_lock); 1102 spin_unlock_bh(&serv->sv_lock);
1407} 1103}
1408 1104
1409/* 1105/*
1410 * Receive the next request on any socket. This code is carefully
1411 * organised not to touch any cachelines in the shared svc_serv
1412 * structure, only cachelines in the local svc_pool.
1413 */
1414int
1415svc_recv(struct svc_rqst *rqstp, long timeout)
1416{
1417 struct svc_sock *svsk = NULL;
1418 struct svc_serv *serv = rqstp->rq_server;
1419 struct svc_pool *pool = rqstp->rq_pool;
1420 int len, i;
1421 int pages;
1422 struct xdr_buf *arg;
1423 DECLARE_WAITQUEUE(wait, current);
1424
1425 dprintk("svc: server %p waiting for data (to = %ld)\n",
1426 rqstp, timeout);
1427
1428 if (rqstp->rq_sock)
1429 printk(KERN_ERR
1430 "svc_recv: service %p, socket not NULL!\n",
1431 rqstp);
1432 if (waitqueue_active(&rqstp->rq_wait))
1433 printk(KERN_ERR
1434 "svc_recv: service %p, wait queue active!\n",
1435 rqstp);
1436
1437
1438 /* now allocate needed pages. If we get a failure, sleep briefly */
1439 pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE;
1440 for (i=0; i < pages ; i++)
1441 while (rqstp->rq_pages[i] == NULL) {
1442 struct page *p = alloc_page(GFP_KERNEL);
1443 if (!p)
1444 schedule_timeout_uninterruptible(msecs_to_jiffies(500));
1445 rqstp->rq_pages[i] = p;
1446 }
1447 rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
1448 BUG_ON(pages >= RPCSVC_MAXPAGES);
1449
1450 /* Make arg->head point to first page and arg->pages point to rest */
1451 arg = &rqstp->rq_arg;
1452 arg->head[0].iov_base = page_address(rqstp->rq_pages[0]);
1453 arg->head[0].iov_len = PAGE_SIZE;
1454 arg->pages = rqstp->rq_pages + 1;
1455 arg->page_base = 0;
1456 /* save at least one page for response */
1457 arg->page_len = (pages-2)*PAGE_SIZE;
1458 arg->len = (pages-1)*PAGE_SIZE;
1459 arg->tail[0].iov_len = 0;
1460
1461 try_to_freeze();
1462 cond_resched();
1463 if (signalled())
1464 return -EINTR;
1465
1466 spin_lock_bh(&pool->sp_lock);
1467 if ((svsk = svc_sock_dequeue(pool)) != NULL) {
1468 rqstp->rq_sock = svsk;
1469 atomic_inc(&svsk->sk_inuse);
1470 rqstp->rq_reserved = serv->sv_max_mesg;
1471 atomic_add(rqstp->rq_reserved, &svsk->sk_reserved);
1472 } else {
1473 /* No data pending. Go to sleep */
1474 svc_thread_enqueue(pool, rqstp);
1475
1476 /*
1477 * We have to be able to interrupt this wait
1478 * to bring down the daemons ...
1479 */
1480 set_current_state(TASK_INTERRUPTIBLE);
1481 add_wait_queue(&rqstp->rq_wait, &wait);
1482 spin_unlock_bh(&pool->sp_lock);
1483
1484 schedule_timeout(timeout);
1485
1486 try_to_freeze();
1487
1488 spin_lock_bh(&pool->sp_lock);
1489 remove_wait_queue(&rqstp->rq_wait, &wait);
1490
1491 if (!(svsk = rqstp->rq_sock)) {
1492 svc_thread_dequeue(pool, rqstp);
1493 spin_unlock_bh(&pool->sp_lock);
1494 dprintk("svc: server %p, no data yet\n", rqstp);
1495 return signalled()? -EINTR : -EAGAIN;
1496 }
1497 }
1498 spin_unlock_bh(&pool->sp_lock);
1499
1500 dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n",
1501 rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse));
1502 len = svsk->sk_recvfrom(rqstp);
1503 dprintk("svc: got len=%d\n", len);
1504
1505 /* No data, incomplete (TCP) read, or accept() */
1506 if (len == 0 || len == -EAGAIN) {
1507 rqstp->rq_res.len = 0;
1508 svc_sock_release(rqstp);
1509 return -EAGAIN;
1510 }
1511 svsk->sk_lastrecv = get_seconds();
1512 clear_bit(SK_OLD, &svsk->sk_flags);
1513
1514 rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp));
1515 rqstp->rq_chandle.defer = svc_defer;
1516
1517 if (serv->sv_stats)
1518 serv->sv_stats->netcnt++;
1519 return len;
1520}
1521
1522/*
1523 * Drop request
1524 */
1525void
1526svc_drop(struct svc_rqst *rqstp)
1527{
1528 dprintk("svc: socket %p dropped request\n", rqstp->rq_sock);
1529 svc_sock_release(rqstp);
1530}
1531
1532/*
1533 * Return reply to client.
1534 */
1535int
1536svc_send(struct svc_rqst *rqstp)
1537{
1538 struct svc_sock *svsk;
1539 int len;
1540 struct xdr_buf *xb;
1541
1542 if ((svsk = rqstp->rq_sock) == NULL) {
1543 printk(KERN_WARNING "NULL socket pointer in %s:%d\n",
1544 __FILE__, __LINE__);
1545 return -EFAULT;
1546 }
1547
1548 /* release the receive skb before sending the reply */
1549 svc_release_skb(rqstp);
1550
1551 /* calculate over-all length */
1552 xb = & rqstp->rq_res;
1553 xb->len = xb->head[0].iov_len +
1554 xb->page_len +
1555 xb->tail[0].iov_len;
1556
1557 /* Grab svsk->sk_mutex to serialize outgoing data. */
1558 mutex_lock(&svsk->sk_mutex);
1559 if (test_bit(SK_DEAD, &svsk->sk_flags))
1560 len = -ENOTCONN;
1561 else
1562 len = svsk->sk_sendto(rqstp);
1563 mutex_unlock(&svsk->sk_mutex);
1564 svc_sock_release(rqstp);
1565
1566 if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
1567 return 0;
1568 return len;
1569}
1570
1571/*
1572 * Timer function to close old temporary sockets, using
1573 * a mark-and-sweep algorithm.
1574 */
1575static void
1576svc_age_temp_sockets(unsigned long closure)
1577{
1578 struct svc_serv *serv = (struct svc_serv *)closure;
1579 struct svc_sock *svsk;
1580 struct list_head *le, *next;
1581 LIST_HEAD(to_be_aged);
1582
1583 dprintk("svc_age_temp_sockets\n");
1584
1585 if (!spin_trylock_bh(&serv->sv_lock)) {
1586 /* busy, try again 1 sec later */
1587 dprintk("svc_age_temp_sockets: busy\n");
1588 mod_timer(&serv->sv_temptimer, jiffies + HZ);
1589 return;
1590 }
1591
1592 list_for_each_safe(le, next, &serv->sv_tempsocks) {
1593 svsk = list_entry(le, struct svc_sock, sk_list);
1594
1595 if (!test_and_set_bit(SK_OLD, &svsk->sk_flags))
1596 continue;
1597 if (atomic_read(&svsk->sk_inuse) > 1 || test_bit(SK_BUSY, &svsk->sk_flags))
1598 continue;
1599 atomic_inc(&svsk->sk_inuse);
1600 list_move(le, &to_be_aged);
1601 set_bit(SK_CLOSE, &svsk->sk_flags);
1602 set_bit(SK_DETACHED, &svsk->sk_flags);
1603 }
1604 spin_unlock_bh(&serv->sv_lock);
1605
1606 while (!list_empty(&to_be_aged)) {
1607 le = to_be_aged.next;
1608 /* fiddling the sk_list node is safe 'cos we're SK_DETACHED */
1609 list_del_init(le);
1610 svsk = list_entry(le, struct svc_sock, sk_list);
1611
1612 dprintk("queuing svsk %p for closing, %lu seconds old\n",
1613 svsk, get_seconds() - svsk->sk_lastrecv);
1614
1615 /* a thread will dequeue and close it soon */
1616 svc_sock_enqueue(svsk);
1617 svc_sock_put(svsk);
1618 }
1619
1620 mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ);
1621}
1622
1623/*
1624 * Initialize socket for RPC use and create svc_sock struct 1106 * Initialize socket for RPC use and create svc_sock struct
1625 * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. 1107 * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF.
1626 */ 1108 */
@@ -1631,7 +1113,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
1631 struct svc_sock *svsk; 1113 struct svc_sock *svsk;
1632 struct sock *inet; 1114 struct sock *inet;
1633 int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); 1115 int pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
1634 int is_temporary = flags & SVC_SOCK_TEMPORARY;
1635 1116
1636 dprintk("svc: svc_setup_socket %p\n", sock); 1117 dprintk("svc: svc_setup_socket %p\n", sock);
1637 if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { 1118 if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) {
@@ -1651,44 +1132,18 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
1651 return NULL; 1132 return NULL;
1652 } 1133 }
1653 1134
1654 set_bit(SK_BUSY, &svsk->sk_flags);
1655 inet->sk_user_data = svsk; 1135 inet->sk_user_data = svsk;
1656 svsk->sk_sock = sock; 1136 svsk->sk_sock = sock;
1657 svsk->sk_sk = inet; 1137 svsk->sk_sk = inet;
1658 svsk->sk_ostate = inet->sk_state_change; 1138 svsk->sk_ostate = inet->sk_state_change;
1659 svsk->sk_odata = inet->sk_data_ready; 1139 svsk->sk_odata = inet->sk_data_ready;
1660 svsk->sk_owspace = inet->sk_write_space; 1140 svsk->sk_owspace = inet->sk_write_space;
1661 svsk->sk_server = serv;
1662 atomic_set(&svsk->sk_inuse, 1);
1663 svsk->sk_lastrecv = get_seconds();
1664 spin_lock_init(&svsk->sk_lock);
1665 INIT_LIST_HEAD(&svsk->sk_deferred);
1666 INIT_LIST_HEAD(&svsk->sk_ready);
1667 mutex_init(&svsk->sk_mutex);
1668 1141
1669 /* Initialize the socket */ 1142 /* Initialize the socket */
1670 if (sock->type == SOCK_DGRAM) 1143 if (sock->type == SOCK_DGRAM)
1671 svc_udp_init(svsk); 1144 svc_udp_init(svsk, serv);
1672 else 1145 else
1673 svc_tcp_init(svsk); 1146 svc_tcp_init(svsk, serv);
1674
1675 spin_lock_bh(&serv->sv_lock);
1676 if (is_temporary) {
1677 set_bit(SK_TEMP, &svsk->sk_flags);
1678 list_add(&svsk->sk_list, &serv->sv_tempsocks);
1679 serv->sv_tmpcnt++;
1680 if (serv->sv_temptimer.function == NULL) {
1681 /* setup timer to age temp sockets */
1682 setup_timer(&serv->sv_temptimer, svc_age_temp_sockets,
1683 (unsigned long)serv);
1684 mod_timer(&serv->sv_temptimer,
1685 jiffies + svc_conn_age_period * HZ);
1686 }
1687 } else {
1688 clear_bit(SK_TEMP, &svsk->sk_flags);
1689 list_add(&svsk->sk_list, &serv->sv_permsocks);
1690 }
1691 spin_unlock_bh(&serv->sv_lock);
1692 1147
1693 dprintk("svc: svc_setup_socket created %p (inet %p)\n", 1148 dprintk("svc: svc_setup_socket created %p (inet %p)\n",
1694 svsk, svsk->sk_sk); 1149 svsk, svsk->sk_sk);
@@ -1717,7 +1172,16 @@ int svc_addsock(struct svc_serv *serv,
1717 else { 1172 else {
1718 svsk = svc_setup_socket(serv, so, &err, SVC_SOCK_DEFAULTS); 1173 svsk = svc_setup_socket(serv, so, &err, SVC_SOCK_DEFAULTS);
1719 if (svsk) { 1174 if (svsk) {
1720 svc_sock_received(svsk); 1175 struct sockaddr_storage addr;
1176 struct sockaddr *sin = (struct sockaddr *)&addr;
1177 int salen;
1178 if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0)
1179 svc_xprt_set_local(&svsk->sk_xprt, sin, salen);
1180 clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags);
1181 spin_lock_bh(&serv->sv_lock);
1182 list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks);
1183 spin_unlock_bh(&serv->sv_lock);
1184 svc_xprt_received(&svsk->sk_xprt);
1721 err = 0; 1185 err = 0;
1722 } 1186 }
1723 } 1187 }
@@ -1733,14 +1197,19 @@ EXPORT_SYMBOL_GPL(svc_addsock);
1733/* 1197/*
1734 * Create socket for RPC service. 1198 * Create socket for RPC service.
1735 */ 1199 */
1736static int svc_create_socket(struct svc_serv *serv, int protocol, 1200static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
1737 struct sockaddr *sin, int len, int flags) 1201 int protocol,
1202 struct sockaddr *sin, int len,
1203 int flags)
1738{ 1204{
1739 struct svc_sock *svsk; 1205 struct svc_sock *svsk;
1740 struct socket *sock; 1206 struct socket *sock;
1741 int error; 1207 int error;
1742 int type; 1208 int type;
1743 char buf[RPC_MAX_ADDRBUFLEN]; 1209 char buf[RPC_MAX_ADDRBUFLEN];
1210 struct sockaddr_storage addr;
1211 struct sockaddr *newsin = (struct sockaddr *)&addr;
1212 int newlen;
1744 1213
1745 dprintk("svc: svc_create_socket(%s, %d, %s)\n", 1214 dprintk("svc: svc_create_socket(%s, %d, %s)\n",
1746 serv->sv_program->pg_name, protocol, 1215 serv->sv_program->pg_name, protocol,
@@ -1749,13 +1218,13 @@ static int svc_create_socket(struct svc_serv *serv, int protocol,
1749 if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { 1218 if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) {
1750 printk(KERN_WARNING "svc: only UDP and TCP " 1219 printk(KERN_WARNING "svc: only UDP and TCP "
1751 "sockets supported\n"); 1220 "sockets supported\n");
1752 return -EINVAL; 1221 return ERR_PTR(-EINVAL);
1753 } 1222 }
1754 type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; 1223 type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
1755 1224
1756 error = sock_create_kern(sin->sa_family, type, protocol, &sock); 1225 error = sock_create_kern(sin->sa_family, type, protocol, &sock);
1757 if (error < 0) 1226 if (error < 0)
1758 return error; 1227 return ERR_PTR(error);
1759 1228
1760 svc_reclassify_socket(sock); 1229 svc_reclassify_socket(sock);
1761 1230
@@ -1765,203 +1234,55 @@ static int svc_create_socket(struct svc_serv *serv, int protocol,
1765 if (error < 0) 1234 if (error < 0)
1766 goto bummer; 1235 goto bummer;
1767 1236
1237 newlen = len;
1238 error = kernel_getsockname(sock, newsin, &newlen);
1239 if (error < 0)
1240 goto bummer;
1241
1768 if (protocol == IPPROTO_TCP) { 1242 if (protocol == IPPROTO_TCP) {
1769 if ((error = kernel_listen(sock, 64)) < 0) 1243 if ((error = kernel_listen(sock, 64)) < 0)
1770 goto bummer; 1244 goto bummer;
1771 } 1245 }
1772 1246
1773 if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) { 1247 if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) {
1774 svc_sock_received(svsk); 1248 svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen);
1775 return ntohs(inet_sk(svsk->sk_sk)->sport); 1249 return (struct svc_xprt *)svsk;
1776 } 1250 }
1777 1251
1778bummer: 1252bummer:
1779 dprintk("svc: svc_create_socket error = %d\n", -error); 1253 dprintk("svc: svc_create_socket error = %d\n", -error);
1780 sock_release(sock); 1254 sock_release(sock);
1781 return error; 1255 return ERR_PTR(error);
1782} 1256}
1783 1257
1784/* 1258/*
1785 * Remove a dead socket 1259 * Detach the svc_sock from the socket so that no
1260 * more callbacks occur.
1786 */ 1261 */
1787static void 1262static void svc_sock_detach(struct svc_xprt *xprt)
1788svc_delete_socket(struct svc_sock *svsk)
1789{ 1263{
1790 struct svc_serv *serv; 1264 struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
1791 struct sock *sk; 1265 struct sock *sk = svsk->sk_sk;
1792
1793 dprintk("svc: svc_delete_socket(%p)\n", svsk);
1794 1266
1795 serv = svsk->sk_server; 1267 dprintk("svc: svc_sock_detach(%p)\n", svsk);
1796 sk = svsk->sk_sk;
1797 1268
1269 /* put back the old socket callbacks */
1798 sk->sk_state_change = svsk->sk_ostate; 1270 sk->sk_state_change = svsk->sk_ostate;
1799 sk->sk_data_ready = svsk->sk_odata; 1271 sk->sk_data_ready = svsk->sk_odata;
1800 sk->sk_write_space = svsk->sk_owspace; 1272 sk->sk_write_space = svsk->sk_owspace;
1801
1802 spin_lock_bh(&serv->sv_lock);
1803
1804 if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags))
1805 list_del_init(&svsk->sk_list);
1806 /*
1807 * We used to delete the svc_sock from whichever list
1808 * it's sk_ready node was on, but we don't actually
1809 * need to. This is because the only time we're called
1810 * while still attached to a queue, the queue itself
1811 * is about to be destroyed (in svc_destroy).
1812 */
1813 if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) {
1814 BUG_ON(atomic_read(&svsk->sk_inuse)<2);
1815 atomic_dec(&svsk->sk_inuse);
1816 if (test_bit(SK_TEMP, &svsk->sk_flags))
1817 serv->sv_tmpcnt--;
1818 }
1819
1820 spin_unlock_bh(&serv->sv_lock);
1821}
1822
1823static void svc_close_socket(struct svc_sock *svsk)
1824{
1825 set_bit(SK_CLOSE, &svsk->sk_flags);
1826 if (test_and_set_bit(SK_BUSY, &svsk->sk_flags))
1827 /* someone else will have to effect the close */
1828 return;
1829
1830 atomic_inc(&svsk->sk_inuse);
1831 svc_delete_socket(svsk);
1832 clear_bit(SK_BUSY, &svsk->sk_flags);
1833 svc_sock_put(svsk);
1834}
1835
1836void svc_force_close_socket(struct svc_sock *svsk)
1837{
1838 set_bit(SK_CLOSE, &svsk->sk_flags);
1839 if (test_bit(SK_BUSY, &svsk->sk_flags)) {
1840 /* Waiting to be processed, but no threads left,
1841 * So just remove it from the waiting list
1842 */
1843 list_del_init(&svsk->sk_ready);
1844 clear_bit(SK_BUSY, &svsk->sk_flags);
1845 }
1846 svc_close_socket(svsk);
1847}
1848
1849/**
1850 * svc_makesock - Make a socket for nfsd and lockd
1851 * @serv: RPC server structure
1852 * @protocol: transport protocol to use
1853 * @port: port to use
1854 * @flags: requested socket characteristics
1855 *
1856 */
1857int svc_makesock(struct svc_serv *serv, int protocol, unsigned short port,
1858 int flags)
1859{
1860 struct sockaddr_in sin = {
1861 .sin_family = AF_INET,
1862 .sin_addr.s_addr = INADDR_ANY,
1863 .sin_port = htons(port),
1864 };
1865
1866 dprintk("svc: creating socket proto = %d\n", protocol);
1867 return svc_create_socket(serv, protocol, (struct sockaddr *) &sin,
1868 sizeof(sin), flags);
1869} 1273}
1870 1274
1871/* 1275/*
1872 * Handle defer and revisit of requests 1276 * Free the svc_sock's socket resources and the svc_sock itself.
1873 */ 1277 */
1874 1278static void svc_sock_free(struct svc_xprt *xprt)
1875static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
1876{ 1279{
1877 struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); 1280 struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
1878 struct svc_sock *svsk; 1281 dprintk("svc: svc_sock_free(%p)\n", svsk);
1879 1282
1880 if (too_many) { 1283 if (svsk->sk_sock->file)
1881 svc_sock_put(dr->svsk); 1284 sockfd_put(svsk->sk_sock);
1882 kfree(dr); 1285 else
1883 return; 1286 sock_release(svsk->sk_sock);
1884 } 1287 kfree(svsk);
1885 dprintk("revisit queued\n");
1886 svsk = dr->svsk;
1887 dr->svsk = NULL;
1888 spin_lock(&svsk->sk_lock);
1889 list_add(&dr->handle.recent, &svsk->sk_deferred);
1890 spin_unlock(&svsk->sk_lock);
1891 set_bit(SK_DEFERRED, &svsk->sk_flags);
1892 svc_sock_enqueue(svsk);
1893 svc_sock_put(svsk);
1894}
1895
1896static struct cache_deferred_req *
1897svc_defer(struct cache_req *req)
1898{
1899 struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle);
1900 int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len);
1901 struct svc_deferred_req *dr;
1902
1903 if (rqstp->rq_arg.page_len)
1904 return NULL; /* if more than a page, give up FIXME */
1905 if (rqstp->rq_deferred) {
1906 dr = rqstp->rq_deferred;
1907 rqstp->rq_deferred = NULL;
1908 } else {
1909 int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
1910 /* FIXME maybe discard if size too large */
1911 dr = kmalloc(size, GFP_KERNEL);
1912 if (dr == NULL)
1913 return NULL;
1914
1915 dr->handle.owner = rqstp->rq_server;
1916 dr->prot = rqstp->rq_prot;
1917 memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen);
1918 dr->addrlen = rqstp->rq_addrlen;
1919 dr->daddr = rqstp->rq_daddr;
1920 dr->argslen = rqstp->rq_arg.len >> 2;
1921 memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2);
1922 }
1923 atomic_inc(&rqstp->rq_sock->sk_inuse);
1924 dr->svsk = rqstp->rq_sock;
1925
1926 dr->handle.revisit = svc_revisit;
1927 return &dr->handle;
1928}
1929
1930/*
1931 * recv data from a deferred request into an active one
1932 */
1933static int svc_deferred_recv(struct svc_rqst *rqstp)
1934{
1935 struct svc_deferred_req *dr = rqstp->rq_deferred;
1936
1937 rqstp->rq_arg.head[0].iov_base = dr->args;
1938 rqstp->rq_arg.head[0].iov_len = dr->argslen<<2;
1939 rqstp->rq_arg.page_len = 0;
1940 rqstp->rq_arg.len = dr->argslen<<2;
1941 rqstp->rq_prot = dr->prot;
1942 memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen);
1943 rqstp->rq_addrlen = dr->addrlen;
1944 rqstp->rq_daddr = dr->daddr;
1945 rqstp->rq_respages = rqstp->rq_pages;
1946 return dr->argslen<<2;
1947}
1948
1949
1950static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk)
1951{
1952 struct svc_deferred_req *dr = NULL;
1953
1954 if (!test_bit(SK_DEFERRED, &svsk->sk_flags))
1955 return NULL;
1956 spin_lock(&svsk->sk_lock);
1957 clear_bit(SK_DEFERRED, &svsk->sk_flags);
1958 if (!list_empty(&svsk->sk_deferred)) {
1959 dr = list_entry(svsk->sk_deferred.next,
1960 struct svc_deferred_req,
1961 handle.recent);
1962 list_del_init(&dr->handle.recent);
1963 set_bit(SK_DEFERRED, &svsk->sk_flags);
1964 }
1965 spin_unlock(&svsk->sk_lock);
1966 return dr;
1967} 1288}
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index bada7de0c2fc..0f8c439b848a 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -18,6 +18,7 @@
18#include <linux/sunrpc/types.h> 18#include <linux/sunrpc/types.h>
19#include <linux/sunrpc/sched.h> 19#include <linux/sunrpc/sched.h>
20#include <linux/sunrpc/stats.h> 20#include <linux/sunrpc/stats.h>
21#include <linux/sunrpc/svc_xprt.h>
21 22
22/* 23/*
23 * Declare the debug flags here 24 * Declare the debug flags here
@@ -55,6 +56,30 @@ rpc_unregister_sysctl(void)
55 } 56 }
56} 57}
57 58
59static int proc_do_xprt(ctl_table *table, int write, struct file *file,
60 void __user *buffer, size_t *lenp, loff_t *ppos)
61{
62 char tmpbuf[256];
63 int len;
64 if ((*ppos && !write) || !*lenp) {
65 *lenp = 0;
66 return 0;
67 }
68 if (write)
69 return -EINVAL;
70 else {
71 len = svc_print_xprts(tmpbuf, sizeof(tmpbuf));
72 if (!access_ok(VERIFY_WRITE, buffer, len))
73 return -EFAULT;
74
75 if (__copy_to_user(buffer, tmpbuf, len))
76 return -EFAULT;
77 }
78 *lenp -= len;
79 *ppos += len;
80 return 0;
81}
82
58static int 83static int
59proc_dodebug(ctl_table *table, int write, struct file *file, 84proc_dodebug(ctl_table *table, int write, struct file *file,
60 void __user *buffer, size_t *lenp, loff_t *ppos) 85 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -147,6 +172,12 @@ static ctl_table debug_table[] = {
147 .mode = 0644, 172 .mode = 0644,
148 .proc_handler = &proc_dodebug 173 .proc_handler = &proc_dodebug
149 }, 174 },
175 {
176 .procname = "transports",
177 .maxlen = 256,
178 .mode = 0444,
179 .proc_handler = &proc_do_xprt,
180 },
150 { .ctl_name = 0 } 181 { .ctl_name = 0 }
151}; 182};
152 183
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 54264062ea69..995c3fdc16c2 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -96,11 +96,13 @@ xdr_encode_string(__be32 *p, const char *string)
96EXPORT_SYMBOL(xdr_encode_string); 96EXPORT_SYMBOL(xdr_encode_string);
97 97
98__be32 * 98__be32 *
99xdr_decode_string_inplace(__be32 *p, char **sp, int *lenp, int maxlen) 99xdr_decode_string_inplace(__be32 *p, char **sp,
100 unsigned int *lenp, unsigned int maxlen)
100{ 101{
101 unsigned int len; 102 u32 len;
102 103
103 if ((len = ntohl(*p++)) > maxlen) 104 len = ntohl(*p++);
105 if (len > maxlen)
104 return NULL; 106 return NULL;
105 *lenp = len; 107 *lenp = len;
106 *sp = (char *) p; 108 *sp = (char *) p;
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 264f0feeb513..5a8f268bdd30 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,3 +1,8 @@
1obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o 1obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o
2 2
3xprtrdma-y := transport.o rpc_rdma.o verbs.o 3xprtrdma-y := transport.o rpc_rdma.o verbs.o
4
5obj-$(CONFIG_SUNRPC_XPRT_RDMA) += svcrdma.o
6
7svcrdma-y := svc_rdma.o svc_rdma_transport.o \
8 svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
new file mode 100644
index 000000000000..88c0ca20bb1e
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -0,0 +1,266 @@
1/*
2 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * Author: Tom Tucker <tom@opengridcomputing.com>
40 */
41#include <linux/module.h>
42#include <linux/init.h>
43#include <linux/fs.h>
44#include <linux/sysctl.h>
45#include <linux/sunrpc/clnt.h>
46#include <linux/sunrpc/sched.h>
47#include <linux/sunrpc/svc_rdma.h>
48
49#define RPCDBG_FACILITY RPCDBG_SVCXPRT
50
51/* RPC/RDMA parameters */
52unsigned int svcrdma_ord = RPCRDMA_ORD;
53static unsigned int min_ord = 1;
54static unsigned int max_ord = 4096;
55unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
56static unsigned int min_max_requests = 4;
57static unsigned int max_max_requests = 16384;
58unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
59static unsigned int min_max_inline = 4096;
60static unsigned int max_max_inline = 65536;
61
62atomic_t rdma_stat_recv;
63atomic_t rdma_stat_read;
64atomic_t rdma_stat_write;
65atomic_t rdma_stat_sq_starve;
66atomic_t rdma_stat_rq_starve;
67atomic_t rdma_stat_rq_poll;
68atomic_t rdma_stat_rq_prod;
69atomic_t rdma_stat_sq_poll;
70atomic_t rdma_stat_sq_prod;
71
72/*
73 * This function implements reading and resetting an atomic_t stat
74 * variable through read/write to a proc file. Any write to the file
 75 * resets the associated statistic to zero. Any read returns its
76 * current value.
77 */
78static int read_reset_stat(ctl_table *table, int write,
79 struct file *filp, void __user *buffer, size_t *lenp,
80 loff_t *ppos)
81{
82 atomic_t *stat = (atomic_t *)table->data;
83
84 if (!stat)
85 return -EINVAL;
86
87 if (write)
88 atomic_set(stat, 0);
89 else {
90 char str_buf[32];
91 char *data;
92 int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat));
93 if (len >= 32)
94 return -EFAULT;
95 len = strlen(str_buf);
96 if (*ppos > len) {
97 *lenp = 0;
98 return 0;
99 }
100 data = &str_buf[*ppos];
101 len -= *ppos;
102 if (len > *lenp)
103 len = *lenp;
104 if (len && copy_to_user(buffer, str_buf, len))
105 return -EFAULT;
106 *lenp = len;
107 *ppos += len;
108 }
109 return 0;
110}
111
112static struct ctl_table_header *svcrdma_table_header;
113static ctl_table svcrdma_parm_table[] = {
114 {
115 .procname = "max_requests",
116 .data = &svcrdma_max_requests,
117 .maxlen = sizeof(unsigned int),
118 .mode = 0644,
119 .proc_handler = &proc_dointvec_minmax,
120 .strategy = &sysctl_intvec,
121 .extra1 = &min_max_requests,
122 .extra2 = &max_max_requests
123 },
124 {
125 .procname = "max_req_size",
126 .data = &svcrdma_max_req_size,
127 .maxlen = sizeof(unsigned int),
128 .mode = 0644,
129 .proc_handler = &proc_dointvec_minmax,
130 .strategy = &sysctl_intvec,
131 .extra1 = &min_max_inline,
132 .extra2 = &max_max_inline
133 },
134 {
135 .procname = "max_outbound_read_requests",
136 .data = &svcrdma_ord,
137 .maxlen = sizeof(unsigned int),
138 .mode = 0644,
139 .proc_handler = &proc_dointvec_minmax,
140 .strategy = &sysctl_intvec,
141 .extra1 = &min_ord,
142 .extra2 = &max_ord,
143 },
144
145 {
146 .procname = "rdma_stat_read",
147 .data = &rdma_stat_read,
148 .maxlen = sizeof(atomic_t),
149 .mode = 0644,
150 .proc_handler = &read_reset_stat,
151 },
152 {
153 .procname = "rdma_stat_recv",
154 .data = &rdma_stat_recv,
155 .maxlen = sizeof(atomic_t),
156 .mode = 0644,
157 .proc_handler = &read_reset_stat,
158 },
159 {
160 .procname = "rdma_stat_write",
161 .data = &rdma_stat_write,
162 .maxlen = sizeof(atomic_t),
163 .mode = 0644,
164 .proc_handler = &read_reset_stat,
165 },
166 {
167 .procname = "rdma_stat_sq_starve",
168 .data = &rdma_stat_sq_starve,
169 .maxlen = sizeof(atomic_t),
170 .mode = 0644,
171 .proc_handler = &read_reset_stat,
172 },
173 {
174 .procname = "rdma_stat_rq_starve",
175 .data = &rdma_stat_rq_starve,
176 .maxlen = sizeof(atomic_t),
177 .mode = 0644,
178 .proc_handler = &read_reset_stat,
179 },
180 {
181 .procname = "rdma_stat_rq_poll",
182 .data = &rdma_stat_rq_poll,
183 .maxlen = sizeof(atomic_t),
184 .mode = 0644,
185 .proc_handler = &read_reset_stat,
186 },
187 {
188 .procname = "rdma_stat_rq_prod",
189 .data = &rdma_stat_rq_prod,
190 .maxlen = sizeof(atomic_t),
191 .mode = 0644,
192 .proc_handler = &read_reset_stat,
193 },
194 {
195 .procname = "rdma_stat_sq_poll",
196 .data = &rdma_stat_sq_poll,
197 .maxlen = sizeof(atomic_t),
198 .mode = 0644,
199 .proc_handler = &read_reset_stat,
200 },
201 {
202 .procname = "rdma_stat_sq_prod",
203 .data = &rdma_stat_sq_prod,
204 .maxlen = sizeof(atomic_t),
205 .mode = 0644,
206 .proc_handler = &read_reset_stat,
207 },
208 {
209 .ctl_name = 0,
210 },
211};
212
213static ctl_table svcrdma_table[] = {
214 {
215 .procname = "svc_rdma",
216 .mode = 0555,
217 .child = svcrdma_parm_table
218 },
219 {
220 .ctl_name = 0,
221 },
222};
223
224static ctl_table svcrdma_root_table[] = {
225 {
226 .ctl_name = CTL_SUNRPC,
227 .procname = "sunrpc",
228 .mode = 0555,
229 .child = svcrdma_table
230 },
231 {
232 .ctl_name = 0,
233 },
234};
235
236void svc_rdma_cleanup(void)
237{
238 dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
239 if (svcrdma_table_header) {
240 unregister_sysctl_table(svcrdma_table_header);
241 svcrdma_table_header = NULL;
242 }
243 svc_unreg_xprt_class(&svc_rdma_class);
244}
245
246int svc_rdma_init(void)
247{
248 dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
249 dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord);
250 dprintk("\tmax_requests : %d\n", svcrdma_max_requests);
251 dprintk("\tsq_depth : %d\n",
252 svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
253 dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);
254 if (!svcrdma_table_header)
255 svcrdma_table_header =
256 register_sysctl_table(svcrdma_root_table);
257
258 /* Register RDMA with the SVC transport switch */
259 svc_reg_xprt_class(&svc_rdma_class);
260 return 0;
261}
262MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
263MODULE_DESCRIPTION("SVC RDMA Transport");
264MODULE_LICENSE("Dual BSD/GPL");
265module_init(svc_rdma_init);
266module_exit(svc_rdma_cleanup);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
new file mode 100644
index 000000000000..9530ef2d40dc
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -0,0 +1,412 @@
1/*
2 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * Author: Tom Tucker <tom@opengridcomputing.com>
40 */
41
42#include <linux/sunrpc/xdr.h>
43#include <linux/sunrpc/debug.h>
44#include <asm/unaligned.h>
45#include <linux/sunrpc/rpc_rdma.h>
46#include <linux/sunrpc/svc_rdma.h>
47
48#define RPCDBG_FACILITY RPCDBG_SVCXPRT
49
50/*
51 * Decodes a read chunk list. The expected format is as follows:
52 * descrim : xdr_one
53 * position : u32 offset into XDR stream
54 * handle : u32 RKEY
55 * . . .
56 * end-of-list: xdr_zero
57 */
58static u32 *decode_read_list(u32 *va, u32 *vaend)
59{
60 struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va;
61
62 while (ch->rc_discrim != xdr_zero) {
63 u64 ch_offset;
64
65 if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) >
66 (unsigned long)vaend) {
67 dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch);
68 return NULL;
69 }
70
71 ch->rc_discrim = ntohl(ch->rc_discrim);
72 ch->rc_position = ntohl(ch->rc_position);
73 ch->rc_target.rs_handle = ntohl(ch->rc_target.rs_handle);
74 ch->rc_target.rs_length = ntohl(ch->rc_target.rs_length);
75 va = (u32 *)&ch->rc_target.rs_offset;
76 xdr_decode_hyper(va, &ch_offset);
77 put_unaligned(ch_offset, (u64 *)va);
78 ch++;
79 }
80 return (u32 *)&ch->rc_position;
81}
82
83/*
84 * Determine number of chunks and total bytes in chunk list. The chunk
85 * list has already been verified to fit within the RPCRDMA header.
86 */
87void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch,
88 int *ch_count, int *byte_count)
89{
90 /* compute the number of bytes represented by read chunks */
91 *byte_count = 0;
92 *ch_count = 0;
93 for (; ch->rc_discrim != 0; ch++) {
94 *byte_count = *byte_count + ch->rc_target.rs_length;
95 *ch_count = *ch_count + 1;
96 }
97}
98
99/*
100 * Decodes a write chunk list. The expected format is as follows:
101 * descrim : xdr_one
102 * nchunks : <count>
103 * handle : u32 RKEY ---+
104 * length : u32 <len of segment> |
105 * offset : remove va + <count>
106 * . . . |
107 * ---+
108 */
109static u32 *decode_write_list(u32 *va, u32 *vaend)
110{
111 int ch_no;
112 struct rpcrdma_write_array *ary =
113 (struct rpcrdma_write_array *)va;
114
115 /* Check for not write-array */
116 if (ary->wc_discrim == xdr_zero)
117 return (u32 *)&ary->wc_nchunks;
118
119 if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
120 (unsigned long)vaend) {
121 dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
122 return NULL;
123 }
124 ary->wc_discrim = ntohl(ary->wc_discrim);
125 ary->wc_nchunks = ntohl(ary->wc_nchunks);
126 if (((unsigned long)&ary->wc_array[0] +
127 (sizeof(struct rpcrdma_write_chunk) * ary->wc_nchunks)) >
128 (unsigned long)vaend) {
129 dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
130 ary, ary->wc_nchunks, vaend);
131 return NULL;
132 }
133 for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) {
134 u64 ch_offset;
135
136 ary->wc_array[ch_no].wc_target.rs_handle =
137 ntohl(ary->wc_array[ch_no].wc_target.rs_handle);
138 ary->wc_array[ch_no].wc_target.rs_length =
139 ntohl(ary->wc_array[ch_no].wc_target.rs_length);
140 va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset;
141 xdr_decode_hyper(va, &ch_offset);
142 put_unaligned(ch_offset, (u64 *)va);
143 }
144
145 /*
146 * rs_length is the 2nd 4B field in wc_target and taking its
147 * address skips the list terminator
148 */
149 return (u32 *)&ary->wc_array[ch_no].wc_target.rs_length;
150}
151
152static u32 *decode_reply_array(u32 *va, u32 *vaend)
153{
154 int ch_no;
155 struct rpcrdma_write_array *ary =
156 (struct rpcrdma_write_array *)va;
157
158 /* Check for no reply-array */
159 if (ary->wc_discrim == xdr_zero)
160 return (u32 *)&ary->wc_nchunks;
161
162 if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
163 (unsigned long)vaend) {
164 dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
165 return NULL;
166 }
167 ary->wc_discrim = ntohl(ary->wc_discrim);
168 ary->wc_nchunks = ntohl(ary->wc_nchunks);
169 if (((unsigned long)&ary->wc_array[0] +
170 (sizeof(struct rpcrdma_write_chunk) * ary->wc_nchunks)) >
171 (unsigned long)vaend) {
172 dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
173 ary, ary->wc_nchunks, vaend);
174 return NULL;
175 }
176 for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) {
177 u64 ch_offset;
178
179 ary->wc_array[ch_no].wc_target.rs_handle =
180 ntohl(ary->wc_array[ch_no].wc_target.rs_handle);
181 ary->wc_array[ch_no].wc_target.rs_length =
182 ntohl(ary->wc_array[ch_no].wc_target.rs_length);
183 va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset;
184 xdr_decode_hyper(va, &ch_offset);
185 put_unaligned(ch_offset, (u64 *)va);
186 }
187
188 return (u32 *)&ary->wc_array[ch_no];
189}
190
191int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
192 struct svc_rqst *rqstp)
193{
194 struct rpcrdma_msg *rmsgp = NULL;
195 u32 *va;
196 u32 *vaend;
197 u32 hdr_len;
198
199 rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
200
 201 /* Verify that there are enough bytes for header + something */
202 if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_MIN) {
203 dprintk("svcrdma: header too short = %d\n",
204 rqstp->rq_arg.len);
205 return -EINVAL;
206 }
207
208 /* Decode the header */
209 rmsgp->rm_xid = ntohl(rmsgp->rm_xid);
210 rmsgp->rm_vers = ntohl(rmsgp->rm_vers);
211 rmsgp->rm_credit = ntohl(rmsgp->rm_credit);
212 rmsgp->rm_type = ntohl(rmsgp->rm_type);
213
214 if (rmsgp->rm_vers != RPCRDMA_VERSION)
215 return -ENOSYS;
216
217 /* Pull in the extra for the padded case and bump our pointer */
218 if (rmsgp->rm_type == RDMA_MSGP) {
219 int hdrlen;
220 rmsgp->rm_body.rm_padded.rm_align =
221 ntohl(rmsgp->rm_body.rm_padded.rm_align);
222 rmsgp->rm_body.rm_padded.rm_thresh =
223 ntohl(rmsgp->rm_body.rm_padded.rm_thresh);
224
225 va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
226 rqstp->rq_arg.head[0].iov_base = va;
227 hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
228 rqstp->rq_arg.head[0].iov_len -= hdrlen;
229 if (hdrlen > rqstp->rq_arg.len)
230 return -EINVAL;
231 return hdrlen;
232 }
233
234 /* The chunk list may contain either a read chunk list or a write
235 * chunk list and a reply chunk list.
236 */
237 va = &rmsgp->rm_body.rm_chunks[0];
238 vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
239 va = decode_read_list(va, vaend);
240 if (!va)
241 return -EINVAL;
242 va = decode_write_list(va, vaend);
243 if (!va)
244 return -EINVAL;
245 va = decode_reply_array(va, vaend);
246 if (!va)
247 return -EINVAL;
248
249 rqstp->rq_arg.head[0].iov_base = va;
250 hdr_len = (unsigned long)va - (unsigned long)rmsgp;
251 rqstp->rq_arg.head[0].iov_len -= hdr_len;
252
253 *rdma_req = rmsgp;
254 return hdr_len;
255}
256
257int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *rqstp)
258{
259 struct rpcrdma_msg *rmsgp = NULL;
260 struct rpcrdma_read_chunk *ch;
261 struct rpcrdma_write_array *ary;
262 u32 *va;
263 u32 hdrlen;
264
265 dprintk("svcrdma: processing deferred RDMA header on rqstp=%p\n",
266 rqstp);
267 rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
268
269 /* Pull in the extra for the padded case and bump our pointer */
270 if (rmsgp->rm_type == RDMA_MSGP) {
271 va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
272 rqstp->rq_arg.head[0].iov_base = va;
273 hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
274 rqstp->rq_arg.head[0].iov_len -= hdrlen;
275 return hdrlen;
276 }
277
278 /*
279 * Skip all chunks to find RPC msg. These were previously processed
280 */
281 va = &rmsgp->rm_body.rm_chunks[0];
282
283 /* Skip read-list */
284 for (ch = (struct rpcrdma_read_chunk *)va;
285 ch->rc_discrim != xdr_zero; ch++);
286 va = (u32 *)&ch->rc_position;
287
288 /* Skip write-list */
289 ary = (struct rpcrdma_write_array *)va;
290 if (ary->wc_discrim == xdr_zero)
291 va = (u32 *)&ary->wc_nchunks;
292 else
293 /*
294 * rs_length is the 2nd 4B field in wc_target and taking its
295 * address skips the list terminator
296 */
297 va = (u32 *)&ary->wc_array[ary->wc_nchunks].wc_target.rs_length;
298
299 /* Skip reply-array */
300 ary = (struct rpcrdma_write_array *)va;
301 if (ary->wc_discrim == xdr_zero)
302 va = (u32 *)&ary->wc_nchunks;
303 else
304 va = (u32 *)&ary->wc_array[ary->wc_nchunks];
305
306 rqstp->rq_arg.head[0].iov_base = va;
307 hdrlen = (unsigned long)va - (unsigned long)rmsgp;
308 rqstp->rq_arg.head[0].iov_len -= hdrlen;
309
310 return hdrlen;
311}
312
313int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
314 struct rpcrdma_msg *rmsgp,
315 enum rpcrdma_errcode err, u32 *va)
316{
317 u32 *startp = va;
318
319 *va++ = htonl(rmsgp->rm_xid);
320 *va++ = htonl(rmsgp->rm_vers);
321 *va++ = htonl(xprt->sc_max_requests);
322 *va++ = htonl(RDMA_ERROR);
323 *va++ = htonl(err);
324 if (err == ERR_VERS) {
325 *va++ = htonl(RPCRDMA_VERSION);
326 *va++ = htonl(RPCRDMA_VERSION);
327 }
328
329 return (int)((unsigned long)va - (unsigned long)startp);
330}
331
332int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
333{
334 struct rpcrdma_write_array *wr_ary;
335
336 /* There is no read-list in a reply */
337
338 /* skip write list */
339 wr_ary = (struct rpcrdma_write_array *)
340 &rmsgp->rm_body.rm_chunks[1];
341 if (wr_ary->wc_discrim)
342 wr_ary = (struct rpcrdma_write_array *)
343 &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)].
344 wc_target.rs_length;
345 else
346 wr_ary = (struct rpcrdma_write_array *)
347 &wr_ary->wc_nchunks;
348
349 /* skip reply array */
350 if (wr_ary->wc_discrim)
351 wr_ary = (struct rpcrdma_write_array *)
352 &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)];
353 else
354 wr_ary = (struct rpcrdma_write_array *)
355 &wr_ary->wc_nchunks;
356
357 return (unsigned long) wr_ary - (unsigned long) rmsgp;
358}
359
360void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
361{
362 struct rpcrdma_write_array *ary;
363
364 /* no read-list */
365 rmsgp->rm_body.rm_chunks[0] = xdr_zero;
366
367 /* write-array discrim */
368 ary = (struct rpcrdma_write_array *)
369 &rmsgp->rm_body.rm_chunks[1];
370 ary->wc_discrim = xdr_one;
371 ary->wc_nchunks = htonl(chunks);
372
373 /* write-list terminator */
374 ary->wc_array[chunks].wc_target.rs_handle = xdr_zero;
375
376 /* reply-array discriminator */
377 ary->wc_array[chunks].wc_target.rs_length = xdr_zero;
378}
379
380void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary,
381 int chunks)
382{
383 ary->wc_discrim = xdr_one;
384 ary->wc_nchunks = htonl(chunks);
385}
386
387void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
388 int chunk_no,
389 u32 rs_handle, u64 rs_offset,
390 u32 write_len)
391{
392 struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target;
393 seg->rs_handle = htonl(rs_handle);
394 seg->rs_length = htonl(write_len);
395 xdr_encode_hyper((u32 *) &seg->rs_offset, rs_offset);
396}
397
398void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
399 struct rpcrdma_msg *rdma_argp,
400 struct rpcrdma_msg *rdma_resp,
401 enum rpcrdma_proc rdma_type)
402{
403 rdma_resp->rm_xid = htonl(rdma_argp->rm_xid);
404 rdma_resp->rm_vers = htonl(rdma_argp->rm_vers);
405 rdma_resp->rm_credit = htonl(xprt->sc_max_requests);
406 rdma_resp->rm_type = htonl(rdma_type);
407
408 /* Encode <nul> chunks lists */
409 rdma_resp->rm_body.rm_chunks[0] = xdr_zero;
410 rdma_resp->rm_body.rm_chunks[1] = xdr_zero;
411 rdma_resp->rm_body.rm_chunks[2] = xdr_zero;
412}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
new file mode 100644
index 000000000000..ab54a736486e
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -0,0 +1,586 @@
1/*
2 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * Author: Tom Tucker <tom@opengridcomputing.com>
40 */
41
42#include <linux/sunrpc/debug.h>
43#include <linux/sunrpc/rpc_rdma.h>
44#include <linux/spinlock.h>
45#include <asm/unaligned.h>
46#include <rdma/ib_verbs.h>
47#include <rdma/rdma_cm.h>
48#include <linux/sunrpc/svc_rdma.h>
49
50#define RPCDBG_FACILITY RPCDBG_SVCXPRT
51
52/*
53 * Replace the pages in the rq_argpages array with the pages from the SGE in
54 * the RDMA_RECV completion. The SGL should contain full pages up until the
55 * last one.
56 */
57static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
58 struct svc_rdma_op_ctxt *ctxt,
59 u32 byte_count)
60{
61 struct page *page;
62 u32 bc;
63 int sge_no;
64
65 /* Swap the page in the SGE with the page in argpages */
66 page = ctxt->pages[0];
67 put_page(rqstp->rq_pages[0]);
68 rqstp->rq_pages[0] = page;
69
70 /* Set up the XDR head */
71 rqstp->rq_arg.head[0].iov_base = page_address(page);
72 rqstp->rq_arg.head[0].iov_len = min(byte_count, ctxt->sge[0].length);
73 rqstp->rq_arg.len = byte_count;
74 rqstp->rq_arg.buflen = byte_count;
75
76 /* Compute bytes past head in the SGL */
77 bc = byte_count - rqstp->rq_arg.head[0].iov_len;
78
79 /* If data remains, store it in the pagelist */
80 rqstp->rq_arg.page_len = bc;
81 rqstp->rq_arg.page_base = 0;
82 rqstp->rq_arg.pages = &rqstp->rq_pages[1];
83 sge_no = 1;
84 while (bc && sge_no < ctxt->count) {
85 page = ctxt->pages[sge_no];
86 put_page(rqstp->rq_pages[sge_no]);
87 rqstp->rq_pages[sge_no] = page;
88 bc -= min(bc, ctxt->sge[sge_no].length);
89 rqstp->rq_arg.buflen += ctxt->sge[sge_no].length;
90 sge_no++;
91 }
92 rqstp->rq_respages = &rqstp->rq_pages[sge_no];
93
94 /* We should never run out of SGE because the limit is defined to
95 * support the max allowed RPC data length
96 */
97 BUG_ON(bc && (sge_no == ctxt->count));
98 BUG_ON((rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len)
99 != byte_count);
100 BUG_ON(rqstp->rq_arg.len != byte_count);
101
102 /* If not all pages were used from the SGL, free the remaining ones */
103 bc = sge_no;
104 while (sge_no < ctxt->count) {
105 page = ctxt->pages[sge_no++];
106 put_page(page);
107 }
108 ctxt->count = bc;
109
110 /* Set up tail */
111 rqstp->rq_arg.tail[0].iov_base = NULL;
112 rqstp->rq_arg.tail[0].iov_len = 0;
113}
114
115struct chunk_sge {
116 int start; /* sge no for this chunk */
117 int count; /* sge count for this chunk */
118};
119
120/* Encode a read-chunk-list as an array of IB SGE
121 *
122 * Assumptions:
123 * - chunk[0]->position points to pages[0] at an offset of 0
 124 * - pages[] is not physically or virtually contiguous and consists of
125 * PAGE_SIZE elements.
126 *
127 * Output:
128 * - sge array pointing into pages[] array.
129 * - chunk_sge array specifying sge index and count for each
130 * chunk in the read list
131 *
132 */
133static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
134 struct svc_rqst *rqstp,
135 struct svc_rdma_op_ctxt *head,
136 struct rpcrdma_msg *rmsgp,
137 struct ib_sge *sge,
138 struct chunk_sge *ch_sge_ary,
139 int ch_count,
140 int byte_count)
141{
142 int sge_no;
143 int sge_bytes;
144 int page_off;
145 int page_no;
146 int ch_bytes;
147 int ch_no;
148 struct rpcrdma_read_chunk *ch;
149
150 sge_no = 0;
151 page_no = 0;
152 page_off = 0;
153 ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
154 ch_no = 0;
155 ch_bytes = ch->rc_target.rs_length;
156 head->arg.head[0] = rqstp->rq_arg.head[0];
157 head->arg.tail[0] = rqstp->rq_arg.tail[0];
158 head->arg.pages = &head->pages[head->count];
159 head->sge[0].length = head->count; /* save count of hdr pages */
160 head->arg.page_base = 0;
161 head->arg.page_len = ch_bytes;
162 head->arg.len = rqstp->rq_arg.len + ch_bytes;
163 head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes;
164 head->count++;
165 ch_sge_ary[0].start = 0;
166 while (byte_count) {
167 sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes);
168 sge[sge_no].addr =
169 ib_dma_map_page(xprt->sc_cm_id->device,
170 rqstp->rq_arg.pages[page_no],
171 page_off, sge_bytes,
172 DMA_FROM_DEVICE);
173 sge[sge_no].length = sge_bytes;
174 sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
175 /*
176 * Don't bump head->count here because the same page
177 * may be used by multiple SGE.
178 */
179 head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
180 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1];
181
182 byte_count -= sge_bytes;
183 ch_bytes -= sge_bytes;
184 sge_no++;
185 /*
186 * If all bytes for this chunk have been mapped to an
187 * SGE, move to the next SGE
188 */
189 if (ch_bytes == 0) {
190 ch_sge_ary[ch_no].count =
191 sge_no - ch_sge_ary[ch_no].start;
192 ch_no++;
193 ch++;
194 ch_sge_ary[ch_no].start = sge_no;
195 ch_bytes = ch->rc_target.rs_length;
196 /* If bytes remaining account for next chunk */
197 if (byte_count) {
198 head->arg.page_len += ch_bytes;
199 head->arg.len += ch_bytes;
200 head->arg.buflen += ch_bytes;
201 }
202 }
203 /*
204 * If this SGE consumed all of the page, move to the
205 * next page
206 */
207 if ((sge_bytes + page_off) == PAGE_SIZE) {
208 page_no++;
209 page_off = 0;
210 /*
211 * If there are still bytes left to map, bump
212 * the page count
213 */
214 if (byte_count)
215 head->count++;
216 } else
217 page_off += sge_bytes;
218 }
219 BUG_ON(byte_count != 0);
220 return sge_no;
221}
222
223static void rdma_set_ctxt_sge(struct svc_rdma_op_ctxt *ctxt,
224 struct ib_sge *sge,
225 u64 *sgl_offset,
226 int count)
227{
228 int i;
229
230 ctxt->count = count;
231 for (i = 0; i < count; i++) {
232 ctxt->sge[i].addr = sge[i].addr;
233 ctxt->sge[i].length = sge[i].length;
234 *sgl_offset = *sgl_offset + sge[i].length;
235 }
236}
237
238static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
239{
240#ifdef RDMA_TRANSPORT_IWARP
241 if ((RDMA_TRANSPORT_IWARP ==
242 rdma_node_get_transport(xprt->sc_cm_id->
243 device->node_type))
244 && sge_count > 1)
245 return 1;
246 else
247#endif
248 return min_t(int, sge_count, xprt->sc_max_sge);
249}
250
251/*
252 * Use RDMA_READ to read data from the advertised client buffer into the
253 * XDR stream starting at rq_arg.head[0].iov_base.
254 * Each chunk in the array
255 * contains the following fields:
256 * discrim - '1', This isn't used for data placement
257 * position - The xdr stream offset (the same for every chunk)
258 * handle - RMR for client memory region
259 * length - data transfer length
260 * offset - 64 bit tagged offset in remote memory region
261 *
262 * On our side, we need to read into a pagelist. The first page immediately
263 * follows the RPC header.
264 *
265 * This function returns 1 to indicate success. The data is not yet in
266 * the pagelist and therefore the RPC request must be deferred. The
267 * I/O completion will enqueue the transport again and
268 * svc_rdma_recvfrom will complete the request.
269 *
270 * NOTE: The ctxt must not be touched after the last WR has been posted
271 * because the I/O completion processing may occur on another
272 * processor and free / modify the context. Ne touche pas!
273 */
274static int rdma_read_xdr(struct svcxprt_rdma *xprt,
275 struct rpcrdma_msg *rmsgp,
276 struct svc_rqst *rqstp,
277 struct svc_rdma_op_ctxt *hdr_ctxt)
278{
279 struct ib_send_wr read_wr;
280 int err = 0;
281 int ch_no;
282 struct ib_sge *sge;
283 int ch_count;
284 int byte_count;
285 int sge_count;
286 u64 sgl_offset;
287 struct rpcrdma_read_chunk *ch;
288 struct svc_rdma_op_ctxt *ctxt = NULL;
289 struct svc_rdma_op_ctxt *head;
290 struct svc_rdma_op_ctxt *tmp_sge_ctxt;
291 struct svc_rdma_op_ctxt *tmp_ch_ctxt;
292 struct chunk_sge *ch_sge_ary;
293
294 /* If no read list is present, return 0 */
295 ch = svc_rdma_get_read_chunk(rmsgp);
296 if (!ch)
297 return 0;
298
299 /* Allocate temporary contexts to keep SGE */
300 BUG_ON(sizeof(struct ib_sge) < sizeof(struct chunk_sge));
301 tmp_sge_ctxt = svc_rdma_get_context(xprt);
302 sge = tmp_sge_ctxt->sge;
303 tmp_ch_ctxt = svc_rdma_get_context(xprt);
304 ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge;
305
306 svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
307 sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp,
308 sge, ch_sge_ary,
309 ch_count, byte_count);
310 head = svc_rdma_get_context(xprt);
311 sgl_offset = 0;
312 ch_no = 0;
313
314 for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
315 ch->rc_discrim != 0; ch++, ch_no++) {
316next_sge:
317 if (!ctxt)
318 ctxt = head;
319 else {
320 ctxt->next = svc_rdma_get_context(xprt);
321 ctxt = ctxt->next;
322 }
323 ctxt->next = NULL;
324 ctxt->direction = DMA_FROM_DEVICE;
325 clear_bit(RDMACTXT_F_READ_DONE, &ctxt->flags);
326 clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
327 if ((ch+1)->rc_discrim == 0) {
328 /*
329 * Checked in sq_cq_reap to see if we need to
330 * be enqueued
331 */
332 set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
333 ctxt->next = hdr_ctxt;
334 hdr_ctxt->next = head;
335 }
336
337 /* Prepare READ WR */
338 memset(&read_wr, 0, sizeof read_wr);
339 ctxt->wr_op = IB_WR_RDMA_READ;
340 read_wr.wr_id = (unsigned long)ctxt;
341 read_wr.opcode = IB_WR_RDMA_READ;
342 read_wr.send_flags = IB_SEND_SIGNALED;
343 read_wr.wr.rdma.rkey = ch->rc_target.rs_handle;
344 read_wr.wr.rdma.remote_addr =
345 get_unaligned(&(ch->rc_target.rs_offset)) +
346 sgl_offset;
347 read_wr.sg_list = &sge[ch_sge_ary[ch_no].start];
348 read_wr.num_sge =
349 rdma_read_max_sge(xprt, ch_sge_ary[ch_no].count);
350 rdma_set_ctxt_sge(ctxt, &sge[ch_sge_ary[ch_no].start],
351 &sgl_offset,
352 read_wr.num_sge);
353
354 /* Post the read */
355 err = svc_rdma_send(xprt, &read_wr);
356 if (err) {
357 printk(KERN_ERR "svcrdma: Error posting send = %d\n",
358 err);
359 /*
360 * Break the circular list so free knows when
361 * to stop if the error happened to occur on
362 * the last read
363 */
364 ctxt->next = NULL;
365 goto out;
366 }
367 atomic_inc(&rdma_stat_read);
368
369 if (read_wr.num_sge < ch_sge_ary[ch_no].count) {
370 ch_sge_ary[ch_no].count -= read_wr.num_sge;
371 ch_sge_ary[ch_no].start += read_wr.num_sge;
372 goto next_sge;
373 }
374 sgl_offset = 0;
375 err = 0;
376 }
377
378 out:
379 svc_rdma_put_context(tmp_sge_ctxt, 0);
380 svc_rdma_put_context(tmp_ch_ctxt, 0);
381
382 /* Detach arg pages. svc_recv will replenish them */
383 for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++)
384 rqstp->rq_pages[ch_no] = NULL;
385
386 /*
387 * Detach res pages. svc_release must see a resused count of
388 * zero or it will attempt to put them.
389 */
390 while (rqstp->rq_resused)
391 rqstp->rq_respages[--rqstp->rq_resused] = NULL;
392
393 if (err) {
394 printk(KERN_ERR "svcrdma : RDMA_READ error = %d\n", err);
395 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
396 /* Free the linked list of read contexts */
397 while (head != NULL) {
398 ctxt = head->next;
399 svc_rdma_put_context(head, 1);
400 head = ctxt;
401 }
402 return 0;
403 }
404
405 return 1;
406}
407
408static int rdma_read_complete(struct svc_rqst *rqstp,
409 struct svc_rdma_op_ctxt *data)
410{
411 struct svc_rdma_op_ctxt *head = data->next;
412 int page_no;
413 int ret;
414
415 BUG_ON(!head);
416
417 /* Copy RPC pages */
418 for (page_no = 0; page_no < head->count; page_no++) {
419 put_page(rqstp->rq_pages[page_no]);
420 rqstp->rq_pages[page_no] = head->pages[page_no];
421 }
422 /* Point rq_arg.pages past header */
423 rqstp->rq_arg.pages = &rqstp->rq_pages[head->sge[0].length];
424 rqstp->rq_arg.page_len = head->arg.page_len;
425 rqstp->rq_arg.page_base = head->arg.page_base;
426
427 /* rq_respages starts after the last arg page */
428 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
429 rqstp->rq_resused = 0;
430
431 /* Rebuild rq_arg head and tail. */
432 rqstp->rq_arg.head[0] = head->arg.head[0];
433 rqstp->rq_arg.tail[0] = head->arg.tail[0];
434 rqstp->rq_arg.len = head->arg.len;
435 rqstp->rq_arg.buflen = head->arg.buflen;
436
437 /* XXX: What should this be? */
438 rqstp->rq_prot = IPPROTO_MAX;
439
440 /*
441 * Free the contexts we used to build the RDMA_READ. We have
442 * to be careful here because the context list uses the same
443 * next pointer used to chain the contexts associated with the
444 * RDMA_READ
445 */
446 data->next = NULL; /* terminate circular list */
447 do {
448 data = head->next;
449 svc_rdma_put_context(head, 0);
450 head = data;
451 } while (head != NULL);
452
453 ret = rqstp->rq_arg.head[0].iov_len
454 + rqstp->rq_arg.page_len
455 + rqstp->rq_arg.tail[0].iov_len;
456 dprintk("svcrdma: deferred read ret=%d, rq_arg.len =%d, "
457 "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n",
458 ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base,
459 rqstp->rq_arg.head[0].iov_len);
460
461 /* Indicate that we've consumed an RQ credit */
462 rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
463 svc_xprt_received(rqstp->rq_xprt);
464 return ret;
465}
466
467/*
468 * Set up the rqstp thread context to point to the RQ buffer. If
469 * necessary, pull additional data from the client with an RDMA_READ
470 * request.
471 */
472int svc_rdma_recvfrom(struct svc_rqst *rqstp)
473{
474 struct svc_xprt *xprt = rqstp->rq_xprt;
475 struct svcxprt_rdma *rdma_xprt =
476 container_of(xprt, struct svcxprt_rdma, sc_xprt);
477 struct svc_rdma_op_ctxt *ctxt = NULL;
478 struct rpcrdma_msg *rmsgp;
479 int ret = 0;
480 int len;
481
482 dprintk("svcrdma: rqstp=%p\n", rqstp);
483
484 /*
485 * The rq_xprt_ctxt indicates if we've consumed an RQ credit
486 * or not. It is used in the rdma xpo_release_rqst function to
487 * determine whether or not to return an RQ WQE to the RQ.
488 */
489 rqstp->rq_xprt_ctxt = NULL;
490
491 spin_lock_bh(&rdma_xprt->sc_read_complete_lock);
492 if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
493 ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
494 struct svc_rdma_op_ctxt,
495 dto_q);
496 list_del_init(&ctxt->dto_q);
497 }
498 spin_unlock_bh(&rdma_xprt->sc_read_complete_lock);
499 if (ctxt)
500 return rdma_read_complete(rqstp, ctxt);
501
502 spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
503 if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
504 ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
505 struct svc_rdma_op_ctxt,
506 dto_q);
507 list_del_init(&ctxt->dto_q);
508 } else {
509 atomic_inc(&rdma_stat_rq_starve);
510 clear_bit(XPT_DATA, &xprt->xpt_flags);
511 ctxt = NULL;
512 }
513 spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
514 if (!ctxt) {
515 /* This is the EAGAIN path. The svc_recv routine will
516		 * return -EAGAIN, the nfsd thread will call into
517		 * svc_recv again, and we shouldn't be on the active
518 * transport list
519 */
520 if (test_bit(XPT_CLOSE, &xprt->xpt_flags))
521 goto close_out;
522
523 BUG_ON(ret);
524 goto out;
525 }
526 dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
527 ctxt, rdma_xprt, rqstp, ctxt->wc_status);
528 BUG_ON(ctxt->wc_status != IB_WC_SUCCESS);
529 atomic_inc(&rdma_stat_recv);
530
531 /* Build up the XDR from the receive buffers. */
532 rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len);
533
534 /* Decode the RDMA header. */
535 len = svc_rdma_xdr_decode_req(&rmsgp, rqstp);
536 rqstp->rq_xprt_hlen = len;
537
538 /* If the request is invalid, reply with an error */
539 if (len < 0) {
540 if (len == -ENOSYS)
541 (void)svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS);
542 goto close_out;
543 }
544
545 /* Read read-list data. If we would need to wait, defer
546	 * it. Note that in this case, we don't return the RQ credit
547 * until after the read completes.
548 */
549 if (rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt)) {
550 svc_xprt_received(xprt);
551 return 0;
552 }
553
554 /* Indicate we've consumed an RQ credit */
555 rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
556
557 ret = rqstp->rq_arg.head[0].iov_len
558 + rqstp->rq_arg.page_len
559 + rqstp->rq_arg.tail[0].iov_len;
560 svc_rdma_put_context(ctxt, 0);
561 out:
562 dprintk("svcrdma: ret = %d, rq_arg.len =%d, "
563 "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n",
564 ret, rqstp->rq_arg.len,
565 rqstp->rq_arg.head[0].iov_base,
566 rqstp->rq_arg.head[0].iov_len);
567 rqstp->rq_prot = IPPROTO_MAX;
568 svc_xprt_copy_addrs(rqstp, xprt);
569 svc_xprt_received(xprt);
570 return ret;
571
572 close_out:
573 if (ctxt) {
574 svc_rdma_put_context(ctxt, 1);
575 /* Indicate we've consumed an RQ credit */
576 rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
577 }
578 dprintk("svcrdma: transport %p is closing\n", xprt);
579 /*
580 * Set the close bit and enqueue it. svc_recv will see the
581 * close bit and call svc_xprt_delete
582 */
583 set_bit(XPT_CLOSE, &xprt->xpt_flags);
584 svc_xprt_received(xprt);
585 return 0;
586}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
new file mode 100644
index 000000000000..3e321949e1dc
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -0,0 +1,520 @@
1/*
2 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * Author: Tom Tucker <tom@opengridcomputing.com>
40 */
41
42#include <linux/sunrpc/debug.h>
43#include <linux/sunrpc/rpc_rdma.h>
44#include <linux/spinlock.h>
45#include <asm/unaligned.h>
46#include <rdma/ib_verbs.h>
47#include <rdma/rdma_cm.h>
48#include <linux/sunrpc/svc_rdma.h>
49
50#define RPCDBG_FACILITY RPCDBG_SVCXPRT
51
52/* Encode an XDR as an array of IB SGE
53 *
54 * Assumptions:
55 * - head[0] is physically contiguous.
56 * - tail[0] is physically contiguous.
57 * - pages[] is not physically or virtually contiguous and consists of
58 * PAGE_SIZE-sized elements.
59 *
60 * Output:
61 * SGE[0] reserved for RPCRDMA header
62 * SGE[1] data from xdr->head[]
63 * SGE[2..sge_count-2] data from xdr->pages[]
64 * SGE[sge_count-1] data from xdr->tail.
65 *
66 */
67static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt,
68 struct xdr_buf *xdr,
69 struct ib_sge *sge,
70 int *sge_count)
71{
72 /* Max we need is the length of the XDR / pagesize + one for
73 * head + one for tail + one for RPCRDMA header
74 */
75 int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3;
76 int sge_no;
77 u32 byte_count = xdr->len;
78 u32 sge_bytes;
79 u32 page_bytes;
80 int page_off;
81 int page_no;
82
83 /* Skip the first sge, this is for the RPCRDMA header */
84 sge_no = 1;
85
86 /* Head SGE */
87 sge[sge_no].addr = ib_dma_map_single(xprt->sc_cm_id->device,
88 xdr->head[0].iov_base,
89 xdr->head[0].iov_len,
90 DMA_TO_DEVICE);
91 sge_bytes = min_t(u32, byte_count, xdr->head[0].iov_len);
92 byte_count -= sge_bytes;
93 sge[sge_no].length = sge_bytes;
94 sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
95 sge_no++;
96
97 /* pages SGE */
98 page_no = 0;
99 page_bytes = xdr->page_len;
100 page_off = xdr->page_base;
101 while (byte_count && page_bytes) {
102 sge_bytes = min_t(u32, byte_count, (PAGE_SIZE-page_off));
103 sge[sge_no].addr =
104 ib_dma_map_page(xprt->sc_cm_id->device,
105 xdr->pages[page_no], page_off,
106 sge_bytes, DMA_TO_DEVICE);
107 sge_bytes = min(sge_bytes, page_bytes);
108 byte_count -= sge_bytes;
109 page_bytes -= sge_bytes;
110 sge[sge_no].length = sge_bytes;
111 sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
112
113 sge_no++;
114 page_no++;
115 page_off = 0; /* reset for next time through loop */
116 }
117
118 /* Tail SGE */
119 if (byte_count && xdr->tail[0].iov_len) {
120 sge[sge_no].addr =
121 ib_dma_map_single(xprt->sc_cm_id->device,
122 xdr->tail[0].iov_base,
123 xdr->tail[0].iov_len,
124 DMA_TO_DEVICE);
125 sge_bytes = min_t(u32, byte_count, xdr->tail[0].iov_len);
126 byte_count -= sge_bytes;
127 sge[sge_no].length = sge_bytes;
128 sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
129 sge_no++;
130 }
131
132 BUG_ON(sge_no > sge_max);
133 BUG_ON(byte_count != 0);
134
135 *sge_count = sge_no;
136 return sge;
137}
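/*
 * Editorial note (not part of the original patch): a worked example of
 * the sge_max arithmetic above, using assumed numbers.  Suppose
 * PAGE_SIZE = 4096 and xdr->len = 10000, split as head = 200 bytes,
 * page_len = 9700 and tail = 100:
 *
 *	sge_max = (10000 + 4095) / 4096 + 3 = 3 + 3 = 6
 *
 * The resulting list is sge[0] reserved for the RPCRDMA header, sge[1]
 * for the head, sge[2..4] for the three pages covering the 9700-byte
 * page list (4096 + 4096 + 1508), and sge[5] for the tail, so
 * *sge_count = 6 and the BUG_ON(sge_no > sge_max) check holds.
 */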
138
139
140/* Assumptions:
141 * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
142 */
143static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
144 u32 rmr, u64 to,
145 u32 xdr_off, int write_len,
146 struct ib_sge *xdr_sge, int sge_count)
147{
148 struct svc_rdma_op_ctxt *tmp_sge_ctxt;
149 struct ib_send_wr write_wr;
150 struct ib_sge *sge;
151 int xdr_sge_no;
152 int sge_no;
153 int sge_bytes;
154 int sge_off;
155 int bc;
156 struct svc_rdma_op_ctxt *ctxt;
157 int ret = 0;
158
159 BUG_ON(sge_count >= 32);
160 dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, "
161 "write_len=%d, xdr_sge=%p, sge_count=%d\n",
162 rmr, to, xdr_off, write_len, xdr_sge, sge_count);
163
164 ctxt = svc_rdma_get_context(xprt);
165 ctxt->count = 0;
166 tmp_sge_ctxt = svc_rdma_get_context(xprt);
167 sge = tmp_sge_ctxt->sge;
168
169 /* Find the SGE associated with xdr_off */
170 for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < sge_count;
171 xdr_sge_no++) {
172 if (xdr_sge[xdr_sge_no].length > bc)
173 break;
174 bc -= xdr_sge[xdr_sge_no].length;
175 }
176
177 sge_off = bc;
178 bc = write_len;
179 sge_no = 0;
180
181 /* Copy the remaining SGE */
182 while (bc != 0 && xdr_sge_no < sge_count) {
183 sge[sge_no].addr = xdr_sge[xdr_sge_no].addr + sge_off;
184 sge[sge_no].lkey = xdr_sge[xdr_sge_no].lkey;
185 sge_bytes = min((size_t)bc,
186 (size_t)(xdr_sge[xdr_sge_no].length-sge_off));
187 sge[sge_no].length = sge_bytes;
188
189 sge_off = 0;
190 sge_no++;
191 xdr_sge_no++;
192 bc -= sge_bytes;
193 }
194
195 BUG_ON(bc != 0);
196 BUG_ON(xdr_sge_no > sge_count);
197
198 /* Prepare WRITE WR */
199 memset(&write_wr, 0, sizeof write_wr);
200 ctxt->wr_op = IB_WR_RDMA_WRITE;
201 write_wr.wr_id = (unsigned long)ctxt;
202 write_wr.sg_list = &sge[0];
203 write_wr.num_sge = sge_no;
204 write_wr.opcode = IB_WR_RDMA_WRITE;
205 write_wr.send_flags = IB_SEND_SIGNALED;
206 write_wr.wr.rdma.rkey = rmr;
207 write_wr.wr.rdma.remote_addr = to;
208
209 /* Post It */
210 atomic_inc(&rdma_stat_write);
211 if (svc_rdma_send(xprt, &write_wr)) {
212 svc_rdma_put_context(ctxt, 1);
213 /* Fatal error, close transport */
214 ret = -EIO;
215 }
216 svc_rdma_put_context(tmp_sge_ctxt, 0);
217 return ret;
218}
219
220static int send_write_chunks(struct svcxprt_rdma *xprt,
221 struct rpcrdma_msg *rdma_argp,
222 struct rpcrdma_msg *rdma_resp,
223 struct svc_rqst *rqstp,
224 struct ib_sge *sge,
225 int sge_count)
226{
227 u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
228 int write_len;
229 int max_write;
230 u32 xdr_off;
231 int chunk_off;
232 int chunk_no;
233 struct rpcrdma_write_array *arg_ary;
234 struct rpcrdma_write_array *res_ary;
235 int ret;
236
237 arg_ary = svc_rdma_get_write_array(rdma_argp);
238 if (!arg_ary)
239 return 0;
240 res_ary = (struct rpcrdma_write_array *)
241 &rdma_resp->rm_body.rm_chunks[1];
242
243 max_write = xprt->sc_max_sge * PAGE_SIZE;
244
245 /* Write chunks start at the pagelist */
246 for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
247 xfer_len && chunk_no < arg_ary->wc_nchunks;
248 chunk_no++) {
249 struct rpcrdma_segment *arg_ch;
250 u64 rs_offset;
251
252 arg_ch = &arg_ary->wc_array[chunk_no].wc_target;
253 write_len = min(xfer_len, arg_ch->rs_length);
254
255 /* Prepare the response chunk given the length actually
256 * written */
257 rs_offset = get_unaligned(&(arg_ch->rs_offset));
258 svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
259 arg_ch->rs_handle,
260 rs_offset,
261 write_len);
262 chunk_off = 0;
263 while (write_len) {
264 int this_write;
265 this_write = min(write_len, max_write);
266 ret = send_write(xprt, rqstp,
267 arg_ch->rs_handle,
268 rs_offset + chunk_off,
269 xdr_off,
270 this_write,
271 sge,
272 sge_count);
273 if (ret) {
274 dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
275 ret);
276 return -EIO;
277 }
278 chunk_off += this_write;
279 xdr_off += this_write;
280 xfer_len -= this_write;
281 write_len -= this_write;
282 }
283 }
284 /* Update the req with the number of chunks actually used */
285 svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no);
286
287 return rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
288}
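/*
 * Editorial note (not part of the original patch): the chunking above
 * is bounded by max_write = sc_max_sge * PAGE_SIZE, the largest payload
 * send_write() can describe in a single RDMA_WRITE.  With assumed
 * values sc_max_sge = 4 and PAGE_SIZE = 4096, max_write is 16384, so a
 * 40000-byte write chunk is posted as three RDMA_WRITEs of 16384, 16384
 * and 7232 bytes, each pass advancing xdr_off and chunk_off by
 * this_write.
 */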
289
290static int send_reply_chunks(struct svcxprt_rdma *xprt,
291 struct rpcrdma_msg *rdma_argp,
292 struct rpcrdma_msg *rdma_resp,
293 struct svc_rqst *rqstp,
294 struct ib_sge *sge,
295 int sge_count)
296{
297 u32 xfer_len = rqstp->rq_res.len;
298 int write_len;
299 int max_write;
300 u32 xdr_off;
301 int chunk_no;
302 int chunk_off;
303 struct rpcrdma_segment *ch;
304 struct rpcrdma_write_array *arg_ary;
305 struct rpcrdma_write_array *res_ary;
306 int ret;
307
308 arg_ary = svc_rdma_get_reply_array(rdma_argp);
309 if (!arg_ary)
310 return 0;
311	/* XXX: need to fix when reply lists occur with read-list and/or
312 * write-list */
313 res_ary = (struct rpcrdma_write_array *)
314 &rdma_resp->rm_body.rm_chunks[2];
315
316 max_write = xprt->sc_max_sge * PAGE_SIZE;
317
318 /* xdr offset starts at RPC message */
319 for (xdr_off = 0, chunk_no = 0;
320 xfer_len && chunk_no < arg_ary->wc_nchunks;
321 chunk_no++) {
322 u64 rs_offset;
323 ch = &arg_ary->wc_array[chunk_no].wc_target;
324 write_len = min(xfer_len, ch->rs_length);
325
326
327 /* Prepare the reply chunk given the length actually
328 * written */
329 rs_offset = get_unaligned(&(ch->rs_offset));
330 svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
331 ch->rs_handle, rs_offset,
332 write_len);
333 chunk_off = 0;
334 while (write_len) {
335 int this_write;
336
337 this_write = min(write_len, max_write);
338 ret = send_write(xprt, rqstp,
339 ch->rs_handle,
340 rs_offset + chunk_off,
341 xdr_off,
342 this_write,
343 sge,
344 sge_count);
345 if (ret) {
346 dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
347 ret);
348 return -EIO;
349 }
350 chunk_off += this_write;
351 xdr_off += this_write;
352 xfer_len -= this_write;
353 write_len -= this_write;
354 }
355 }
356 /* Update the req with the number of chunks actually used */
357 svc_rdma_xdr_encode_reply_array(res_ary, chunk_no);
358
359 return rqstp->rq_res.len;
360}
361
362/* This function prepares the portion of the RPCRDMA message to be
363 * sent in the RDMA_SEND. This function is called after data sent via
364 * RDMA has already been transmitted. There are three cases:
365 * - The RPCRDMA header, RPC header, and payload are all sent in a
366 * single RDMA_SEND. This is the "inline" case.
367 * - The RPCRDMA header and some portion of the RPC header and data
368 * are sent via this RDMA_SEND and another portion of the data is
369 * sent via RDMA.
370 * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC
371 * header and data are all transmitted via RDMA.
372 * In all three cases, this function prepares the RPCRDMA header in
373 * sge[0], the 'type' parameter indicates the type to place in the
374 * RPCRDMA header, and the 'byte_count' field indicates how much of
375 * the XDR to include in this RDMA_SEND.
376 */
377static int send_reply(struct svcxprt_rdma *rdma,
378 struct svc_rqst *rqstp,
379 struct page *page,
380 struct rpcrdma_msg *rdma_resp,
381 struct svc_rdma_op_ctxt *ctxt,
382 int sge_count,
383 int byte_count)
384{
385 struct ib_send_wr send_wr;
386 int sge_no;
387 int sge_bytes;
388 int page_no;
389 int ret;
390
391 /* Prepare the context */
392 ctxt->pages[0] = page;
393 ctxt->count = 1;
394
395 /* Prepare the SGE for the RPCRDMA Header */
396 ctxt->sge[0].addr =
397 ib_dma_map_page(rdma->sc_cm_id->device,
398 page, 0, PAGE_SIZE, DMA_TO_DEVICE);
399 ctxt->direction = DMA_TO_DEVICE;
400 ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
401 ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey;
402
403 /* Determine how many of our SGE are to be transmitted */
404 for (sge_no = 1; byte_count && sge_no < sge_count; sge_no++) {
405 sge_bytes = min((size_t)ctxt->sge[sge_no].length,
406 (size_t)byte_count);
407 byte_count -= sge_bytes;
408 }
409 BUG_ON(byte_count != 0);
410
411 /* Save all respages in the ctxt and remove them from the
412 * respages array. They are our pages until the I/O
413 * completes.
414 */
415 for (page_no = 0; page_no < rqstp->rq_resused; page_no++) {
416 ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
417 ctxt->count++;
418 rqstp->rq_respages[page_no] = NULL;
419 }
420
421 BUG_ON(sge_no > rdma->sc_max_sge);
422 memset(&send_wr, 0, sizeof send_wr);
423 ctxt->wr_op = IB_WR_SEND;
424 send_wr.wr_id = (unsigned long)ctxt;
425 send_wr.sg_list = ctxt->sge;
426 send_wr.num_sge = sge_no;
427 send_wr.opcode = IB_WR_SEND;
428 send_wr.send_flags = IB_SEND_SIGNALED;
429
430 ret = svc_rdma_send(rdma, &send_wr);
431 if (ret)
432 svc_rdma_put_context(ctxt, 1);
433
434 return ret;
435}
436
437void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
438{
439}
440
441/*
442 * Return the start of an xdr buffer.
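 *
 * Editorial note (not part of the original patch): head[0].iov_base
 * points just past the RPC-over-RDMA transport header, while xdr->len
 * presumably still accounts for it, so the difference
 * (len - head - pages - tail) is the size of that header and the
 * subtraction backs iov_base up to where it begins; svc_rdma_sendto()
 * below uses the result as rdma_argp.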
443 */
444static void *xdr_start(struct xdr_buf *xdr)
445{
446 return xdr->head[0].iov_base -
447 (xdr->len -
448 xdr->page_len -
449 xdr->tail[0].iov_len -
450 xdr->head[0].iov_len);
451}
452
453int svc_rdma_sendto(struct svc_rqst *rqstp)
454{
455 struct svc_xprt *xprt = rqstp->rq_xprt;
456 struct svcxprt_rdma *rdma =
457 container_of(xprt, struct svcxprt_rdma, sc_xprt);
458 struct rpcrdma_msg *rdma_argp;
459 struct rpcrdma_msg *rdma_resp;
460 struct rpcrdma_write_array *reply_ary;
461 enum rpcrdma_proc reply_type;
462 int ret;
463 int inline_bytes;
464 struct ib_sge *sge;
465 int sge_count = 0;
466 struct page *res_page;
467 struct svc_rdma_op_ctxt *ctxt;
468
469 dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
470
471 /* Get the RDMA request header. */
472 rdma_argp = xdr_start(&rqstp->rq_arg);
473
474 /* Build an SGE for the XDR */
475 ctxt = svc_rdma_get_context(rdma);
476 ctxt->direction = DMA_TO_DEVICE;
477 sge = xdr_to_sge(rdma, &rqstp->rq_res, ctxt->sge, &sge_count);
478
479 inline_bytes = rqstp->rq_res.len;
480
481 /* Create the RDMA response header */
482 res_page = svc_rdma_get_page();
483 rdma_resp = page_address(res_page);
484 reply_ary = svc_rdma_get_reply_array(rdma_argp);
485 if (reply_ary)
486 reply_type = RDMA_NOMSG;
487 else
488 reply_type = RDMA_MSG;
489 svc_rdma_xdr_encode_reply_header(rdma, rdma_argp,
490 rdma_resp, reply_type);
491
492 /* Send any write-chunk data and build resp write-list */
493 ret = send_write_chunks(rdma, rdma_argp, rdma_resp,
494 rqstp, sge, sge_count);
495 if (ret < 0) {
496 printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n",
497 ret);
498 goto error;
499 }
500 inline_bytes -= ret;
501
502 /* Send any reply-list data and update resp reply-list */
503 ret = send_reply_chunks(rdma, rdma_argp, rdma_resp,
504 rqstp, sge, sge_count);
505 if (ret < 0) {
506 printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n",
507 ret);
508 goto error;
509 }
510 inline_bytes -= ret;
511
512 ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, sge_count,
513 inline_bytes);
514 dprintk("svcrdma: send_reply returns %d\n", ret);
515 return ret;
516 error:
517 svc_rdma_put_context(ctxt, 0);
518 put_page(res_page);
519 return ret;
520}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
new file mode 100644
index 000000000000..f09444c451bc
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -0,0 +1,1080 @@
1/*
2 * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * Author: Tom Tucker <tom@opengridcomputing.com>
40 */
41
42#include <linux/sunrpc/svc_xprt.h>
43#include <linux/sunrpc/debug.h>
44#include <linux/sunrpc/rpc_rdma.h>
45#include <linux/spinlock.h>
46#include <rdma/ib_verbs.h>
47#include <rdma/rdma_cm.h>
48#include <linux/sunrpc/svc_rdma.h>
49
50#define RPCDBG_FACILITY RPCDBG_SVCXPRT
51
52static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
53 struct sockaddr *sa, int salen,
54 int flags);
55static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt);
56static void svc_rdma_release_rqst(struct svc_rqst *);
57static void rdma_destroy_xprt(struct svcxprt_rdma *xprt);
58static void dto_tasklet_func(unsigned long data);
59static void svc_rdma_detach(struct svc_xprt *xprt);
60static void svc_rdma_free(struct svc_xprt *xprt);
61static int svc_rdma_has_wspace(struct svc_xprt *xprt);
62static void rq_cq_reap(struct svcxprt_rdma *xprt);
63static void sq_cq_reap(struct svcxprt_rdma *xprt);
64
65DECLARE_TASKLET(dto_tasklet, dto_tasklet_func, 0UL);
66static DEFINE_SPINLOCK(dto_lock);
67static LIST_HEAD(dto_xprt_q);
68
69static struct svc_xprt_ops svc_rdma_ops = {
70 .xpo_create = svc_rdma_create,
71 .xpo_recvfrom = svc_rdma_recvfrom,
72 .xpo_sendto = svc_rdma_sendto,
73 .xpo_release_rqst = svc_rdma_release_rqst,
74 .xpo_detach = svc_rdma_detach,
75 .xpo_free = svc_rdma_free,
76 .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr,
77 .xpo_has_wspace = svc_rdma_has_wspace,
78 .xpo_accept = svc_rdma_accept,
79};
80
81struct svc_xprt_class svc_rdma_class = {
82 .xcl_name = "rdma",
83 .xcl_owner = THIS_MODULE,
84 .xcl_ops = &svc_rdma_ops,
85 .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
86};
87
88static int rdma_bump_context_cache(struct svcxprt_rdma *xprt)
89{
90 int target;
91 int at_least_one = 0;
92 struct svc_rdma_op_ctxt *ctxt;
93
94 target = min(xprt->sc_ctxt_cnt + xprt->sc_ctxt_bump,
95 xprt->sc_ctxt_max);
96
97 spin_lock_bh(&xprt->sc_ctxt_lock);
98 while (xprt->sc_ctxt_cnt < target) {
99 xprt->sc_ctxt_cnt++;
100 spin_unlock_bh(&xprt->sc_ctxt_lock);
101
102 ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
103
104 spin_lock_bh(&xprt->sc_ctxt_lock);
105 if (ctxt) {
106 at_least_one = 1;
107 ctxt->next = xprt->sc_ctxt_head;
108 xprt->sc_ctxt_head = ctxt;
109 } else {
110 /* kmalloc failed...give up for now */
111 xprt->sc_ctxt_cnt--;
112 break;
113 }
114 }
115 spin_unlock_bh(&xprt->sc_ctxt_lock);
116 dprintk("svcrdma: sc_ctxt_max=%d, sc_ctxt_cnt=%d\n",
117 xprt->sc_ctxt_max, xprt->sc_ctxt_cnt);
118 return at_least_one;
119}
120
121struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
122{
123 struct svc_rdma_op_ctxt *ctxt;
124
125 while (1) {
126 spin_lock_bh(&xprt->sc_ctxt_lock);
127 if (unlikely(xprt->sc_ctxt_head == NULL)) {
128 /* Try to bump my cache. */
129 spin_unlock_bh(&xprt->sc_ctxt_lock);
130
131 if (rdma_bump_context_cache(xprt))
132 continue;
133
134 printk(KERN_INFO "svcrdma: sleeping waiting for "
135 "context memory on xprt=%p\n",
136 xprt);
137 schedule_timeout_uninterruptible(msecs_to_jiffies(500));
138 continue;
139 }
140 ctxt = xprt->sc_ctxt_head;
141 xprt->sc_ctxt_head = ctxt->next;
142 spin_unlock_bh(&xprt->sc_ctxt_lock);
143 ctxt->xprt = xprt;
144 INIT_LIST_HEAD(&ctxt->dto_q);
145 ctxt->count = 0;
146 break;
147 }
148 return ctxt;
149}
150
151void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
152{
153 struct svcxprt_rdma *xprt;
154 int i;
155
156 BUG_ON(!ctxt);
157 xprt = ctxt->xprt;
158 if (free_pages)
159 for (i = 0; i < ctxt->count; i++)
160 put_page(ctxt->pages[i]);
161
162 for (i = 0; i < ctxt->count; i++)
163 dma_unmap_single(xprt->sc_cm_id->device->dma_device,
164 ctxt->sge[i].addr,
165 ctxt->sge[i].length,
166 ctxt->direction);
167 spin_lock_bh(&xprt->sc_ctxt_lock);
168 ctxt->next = xprt->sc_ctxt_head;
169 xprt->sc_ctxt_head = ctxt;
170 spin_unlock_bh(&xprt->sc_ctxt_lock);
171}
172
173/* ib_cq event handler */
174static void cq_event_handler(struct ib_event *event, void *context)
175{
176 struct svc_xprt *xprt = context;
177 dprintk("svcrdma: received CQ event id=%d, context=%p\n",
178 event->event, context);
179 set_bit(XPT_CLOSE, &xprt->xpt_flags);
180}
181
182/* QP event handler */
183static void qp_event_handler(struct ib_event *event, void *context)
184{
185 struct svc_xprt *xprt = context;
186
187 switch (event->event) {
188 /* These are considered benign events */
189 case IB_EVENT_PATH_MIG:
190 case IB_EVENT_COMM_EST:
191 case IB_EVENT_SQ_DRAINED:
192 case IB_EVENT_QP_LAST_WQE_REACHED:
193 dprintk("svcrdma: QP event %d received for QP=%p\n",
194 event->event, event->element.qp);
195 break;
196 /* These are considered fatal events */
197 case IB_EVENT_PATH_MIG_ERR:
198 case IB_EVENT_QP_FATAL:
199 case IB_EVENT_QP_REQ_ERR:
200 case IB_EVENT_QP_ACCESS_ERR:
201 case IB_EVENT_DEVICE_FATAL:
202 default:
203 dprintk("svcrdma: QP ERROR event %d received for QP=%p, "
204 "closing transport\n",
205 event->event, event->element.qp);
206 set_bit(XPT_CLOSE, &xprt->xpt_flags);
207 break;
208 }
209}
210
211/*
212 * Data Transfer Operation Tasklet
213 *
214 * Walks a list of transports with I/O pending, removing entries as
215 * they are added to the server's I/O pending list. Two bits indicate
216 * if SQ, RQ, or both have I/O pending. The dto_lock is an irqsave
217 * spinlock that serializes access to the transport list with the RQ
218 * and SQ interrupt handlers.
219 */
220static void dto_tasklet_func(unsigned long data)
221{
222 struct svcxprt_rdma *xprt;
223 unsigned long flags;
224
225 spin_lock_irqsave(&dto_lock, flags);
226 while (!list_empty(&dto_xprt_q)) {
227 xprt = list_entry(dto_xprt_q.next,
228 struct svcxprt_rdma, sc_dto_q);
229 list_del_init(&xprt->sc_dto_q);
230 spin_unlock_irqrestore(&dto_lock, flags);
231
232 if (test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags)) {
233 ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP);
234 rq_cq_reap(xprt);
235 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
236 /*
237 * If data arrived before established event,
238 * don't enqueue. This defers RPC I/O until the
239 * RDMA connection is complete.
240 */
241 if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags))
242 svc_xprt_enqueue(&xprt->sc_xprt);
243 }
244
245 if (test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) {
246 ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
247 sq_cq_reap(xprt);
248 }
249
250 spin_lock_irqsave(&dto_lock, flags);
251 }
252 spin_unlock_irqrestore(&dto_lock, flags);
253}
254
255/*
256 * Receive Queue Completion Handler
257 *
258 * Since an RQ completion handler is called in interrupt context, we
259 * need to defer the handling of the I/O to a tasklet
260 */
261static void rq_comp_handler(struct ib_cq *cq, void *cq_context)
262{
263 struct svcxprt_rdma *xprt = cq_context;
264 unsigned long flags;
265
266 /*
267 * Set the bit regardless of whether or not it's on the list
268 * because it may be on the list already due to an SQ
269 * completion.
270 */
271 set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags);
272
273 /*
274 * If this transport is not already on the DTO transport queue,
275 * add it
276 */
277 spin_lock_irqsave(&dto_lock, flags);
278 if (list_empty(&xprt->sc_dto_q))
279 list_add_tail(&xprt->sc_dto_q, &dto_xprt_q);
280 spin_unlock_irqrestore(&dto_lock, flags);
281
282 /* Tasklet does all the work to avoid irqsave locks. */
283 tasklet_schedule(&dto_tasklet);
284}
285
286/*
287 * rq_cq_reap - Process the RQ CQ.
288 *
289 * Take all completing WCs off the CQ and enqueue the associated DTO
290 * context on the dto_q for the transport.
291 */
292static void rq_cq_reap(struct svcxprt_rdma *xprt)
293{
294 int ret;
295 struct ib_wc wc;
296 struct svc_rdma_op_ctxt *ctxt = NULL;
297
298 atomic_inc(&rdma_stat_rq_poll);
299
300 spin_lock_bh(&xprt->sc_rq_dto_lock);
301 while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) {
302 ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
303 ctxt->wc_status = wc.status;
304 ctxt->byte_len = wc.byte_len;
305 if (wc.status != IB_WC_SUCCESS) {
306 /* Close the transport */
307 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
308 svc_rdma_put_context(ctxt, 1);
309 continue;
310 }
311 list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
312 }
313 spin_unlock_bh(&xprt->sc_rq_dto_lock);
314
315 if (ctxt)
316 atomic_inc(&rdma_stat_rq_prod);
317}
318
319/*
320 * Send Queue Completion Handler - potentially called in interrupt context.
321 */
322static void sq_cq_reap(struct svcxprt_rdma *xprt)
323{
324 struct svc_rdma_op_ctxt *ctxt = NULL;
325 struct ib_wc wc;
326 struct ib_cq *cq = xprt->sc_sq_cq;
327 int ret;
328
329 atomic_inc(&rdma_stat_sq_poll);
330 while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
331 ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
332 xprt = ctxt->xprt;
333
334 if (wc.status != IB_WC_SUCCESS)
335 /* Close the transport */
336 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
337
338 /* Decrement used SQ WR count */
339 atomic_dec(&xprt->sc_sq_count);
340 wake_up(&xprt->sc_send_wait);
341
342 switch (ctxt->wr_op) {
343 case IB_WR_SEND:
344 case IB_WR_RDMA_WRITE:
345 svc_rdma_put_context(ctxt, 1);
346 break;
347
348 case IB_WR_RDMA_READ:
349 if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
350 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
351 set_bit(RDMACTXT_F_READ_DONE, &ctxt->flags);
352 spin_lock_bh(&xprt->sc_read_complete_lock);
353 list_add_tail(&ctxt->dto_q,
354 &xprt->sc_read_complete_q);
355 spin_unlock_bh(&xprt->sc_read_complete_lock);
356 svc_xprt_enqueue(&xprt->sc_xprt);
357 }
358 break;
359
360 default:
361 printk(KERN_ERR "svcrdma: unexpected completion type, "
362 "opcode=%d, status=%d\n",
363 wc.opcode, wc.status);
364 break;
365 }
366 }
367
368 if (ctxt)
369 atomic_inc(&rdma_stat_sq_prod);
370}
371
372static void sq_comp_handler(struct ib_cq *cq, void *cq_context)
373{
374 struct svcxprt_rdma *xprt = cq_context;
375 unsigned long flags;
376
377 /*
378 * Set the bit regardless of whether or not it's on the list
379 * because it may be on the list already due to an RQ
380 * completion.
381 */
382 set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags);
383
384 /*
385 * If this transport is not already on the DTO transport queue,
386 * add it
387 */
388 spin_lock_irqsave(&dto_lock, flags);
389 if (list_empty(&xprt->sc_dto_q))
390 list_add_tail(&xprt->sc_dto_q, &dto_xprt_q);
391 spin_unlock_irqrestore(&dto_lock, flags);
392
393 /* Tasklet does all the work to avoid irqsave locks. */
394 tasklet_schedule(&dto_tasklet);
395}
396
397static void create_context_cache(struct svcxprt_rdma *xprt,
398 int ctxt_count, int ctxt_bump, int ctxt_max)
399{
400 struct svc_rdma_op_ctxt *ctxt;
401 int i;
402
403 xprt->sc_ctxt_max = ctxt_max;
404 xprt->sc_ctxt_bump = ctxt_bump;
405 xprt->sc_ctxt_cnt = 0;
406 xprt->sc_ctxt_head = NULL;
407 for (i = 0; i < ctxt_count; i++) {
408 ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
409 if (ctxt) {
410 ctxt->next = xprt->sc_ctxt_head;
411 xprt->sc_ctxt_head = ctxt;
412 xprt->sc_ctxt_cnt++;
413 }
414 }
415}
416
417static void destroy_context_cache(struct svc_rdma_op_ctxt *ctxt)
418{
419 struct svc_rdma_op_ctxt *next;
420 if (!ctxt)
421 return;
422
423 do {
424 next = ctxt->next;
425 kfree(ctxt);
426 ctxt = next;
427 } while (next);
428}
429
430static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
431 int listener)
432{
433 struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL);
434
435 if (!cma_xprt)
436 return NULL;
437 svc_xprt_init(&svc_rdma_class, &cma_xprt->sc_xprt, serv);
438 INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
439 INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
440 INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
441 INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
442 init_waitqueue_head(&cma_xprt->sc_send_wait);
443
444 spin_lock_init(&cma_xprt->sc_lock);
445 spin_lock_init(&cma_xprt->sc_read_complete_lock);
446 spin_lock_init(&cma_xprt->sc_ctxt_lock);
447 spin_lock_init(&cma_xprt->sc_rq_dto_lock);
448
449 cma_xprt->sc_ord = svcrdma_ord;
450
451 cma_xprt->sc_max_req_size = svcrdma_max_req_size;
452 cma_xprt->sc_max_requests = svcrdma_max_requests;
453 cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT;
454 atomic_set(&cma_xprt->sc_sq_count, 0);
455
456 if (!listener) {
457 int reqs = cma_xprt->sc_max_requests;
458 create_context_cache(cma_xprt,
459 reqs << 1, /* starting size */
460 reqs, /* bump amount */
461 reqs +
462 cma_xprt->sc_sq_depth +
463 RPCRDMA_MAX_THREADS + 1); /* max */
464 if (!cma_xprt->sc_ctxt_head) {
465 kfree(cma_xprt);
466 return NULL;
467 }
468 clear_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
469 } else
470 set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
471
472 return cma_xprt;
473}
474
475struct page *svc_rdma_get_page(void)
476{
477 struct page *page;
478
479 while ((page = alloc_page(GFP_KERNEL)) == NULL) {
480 /* If we can't get memory, wait a bit and try again */
481 printk(KERN_INFO "svcrdma: out of memory...retrying in 1000 "
482 "jiffies.\n");
483 schedule_timeout_uninterruptible(msecs_to_jiffies(1000));
484 }
485 return page;
486}
487
488int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
489{
490 struct ib_recv_wr recv_wr, *bad_recv_wr;
491 struct svc_rdma_op_ctxt *ctxt;
492 struct page *page;
493 unsigned long pa;
494 int sge_no;
495 int buflen;
496 int ret;
497
498 ctxt = svc_rdma_get_context(xprt);
499 buflen = 0;
500 ctxt->direction = DMA_FROM_DEVICE;
501 for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) {
502 BUG_ON(sge_no >= xprt->sc_max_sge);
503 page = svc_rdma_get_page();
504 ctxt->pages[sge_no] = page;
505 pa = ib_dma_map_page(xprt->sc_cm_id->device,
506 page, 0, PAGE_SIZE,
507 DMA_FROM_DEVICE);
508 ctxt->sge[sge_no].addr = pa;
509 ctxt->sge[sge_no].length = PAGE_SIZE;
510 ctxt->sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
511 buflen += PAGE_SIZE;
512 }
513 ctxt->count = sge_no;
514 recv_wr.next = NULL;
515 recv_wr.sg_list = &ctxt->sge[0];
516 recv_wr.num_sge = ctxt->count;
517 recv_wr.wr_id = (u64)(unsigned long)ctxt;
518
519 ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr);
520 return ret;
521}
522
523/*
524 * This function handles the CONNECT_REQUEST event on a listening
525 * endpoint. It is passed the cma_id for the _new_ connection. The context in
526 * this cma_id is inherited from the listening cma_id and is the svc_xprt
527 * structure for the listening endpoint.
528 *
529 * This function creates a new xprt for the new connection and enqueues it on
530 * the accept queue for the listening xprt. When the listen thread is kicked, it
531 * will call the recvfrom method on the listen xprt which will accept the new
532 * connection.
533 */
534static void handle_connect_req(struct rdma_cm_id *new_cma_id)
535{
536 struct svcxprt_rdma *listen_xprt = new_cma_id->context;
537 struct svcxprt_rdma *newxprt;
538
539 /* Create a new transport */
540 newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0);
541 if (!newxprt) {
542 dprintk("svcrdma: failed to create new transport\n");
543 return;
544 }
545 newxprt->sc_cm_id = new_cma_id;
546 new_cma_id->context = newxprt;
547 dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
548 newxprt, newxprt->sc_cm_id, listen_xprt);
549
550 /*
551 * Enqueue the new transport on the accept queue of the listening
552 * transport
553 */
554 spin_lock_bh(&listen_xprt->sc_lock);
555 list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q);
556 spin_unlock_bh(&listen_xprt->sc_lock);
557
558 /*
559 * Can't use svc_xprt_received here because we are not on a
560 * rqstp thread
561 */
562 set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags);
563 svc_xprt_enqueue(&listen_xprt->sc_xprt);
564}
565
566/*
567 * Handles events generated on the listening endpoint. These events will be
568 * either incoming connect requests or adapter removal events.
569 */
570static int rdma_listen_handler(struct rdma_cm_id *cma_id,
571 struct rdma_cm_event *event)
572{
573 struct svcxprt_rdma *xprt = cma_id->context;
574 int ret = 0;
575
576 switch (event->event) {
577 case RDMA_CM_EVENT_CONNECT_REQUEST:
578 dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
579 "event=%d\n", cma_id, cma_id->context, event->event);
580 handle_connect_req(cma_id);
581 break;
582
583 case RDMA_CM_EVENT_ESTABLISHED:
584 /* Accept complete */
585 dprintk("svcrdma: Connection completed on LISTEN xprt=%p, "
586 "cm_id=%p\n", xprt, cma_id);
587 break;
588
589 case RDMA_CM_EVENT_DEVICE_REMOVAL:
590 dprintk("svcrdma: Device removal xprt=%p, cm_id=%p\n",
591 xprt, cma_id);
592 if (xprt)
593 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
594 break;
595
596 default:
597 dprintk("svcrdma: Unexpected event on listening endpoint %p, "
598 "event=%d\n", cma_id, event->event);
599 break;
600 }
601
602 return ret;
603}
604
605static int rdma_cma_handler(struct rdma_cm_id *cma_id,
606 struct rdma_cm_event *event)
607{
608 struct svc_xprt *xprt = cma_id->context;
609 struct svcxprt_rdma *rdma =
610 container_of(xprt, struct svcxprt_rdma, sc_xprt);
611 switch (event->event) {
612 case RDMA_CM_EVENT_ESTABLISHED:
613 /* Accept complete */
614 dprintk("svcrdma: Connection completed on DTO xprt=%p, "
615 "cm_id=%p\n", xprt, cma_id);
616 clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags);
617 svc_xprt_enqueue(xprt);
618 break;
619 case RDMA_CM_EVENT_DISCONNECTED:
620 dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n",
621 xprt, cma_id);
622 if (xprt) {
623 set_bit(XPT_CLOSE, &xprt->xpt_flags);
624 svc_xprt_enqueue(xprt);
625 }
626 break;
627 case RDMA_CM_EVENT_DEVICE_REMOVAL:
628 dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, "
629 "event=%d\n", cma_id, xprt, event->event);
630 if (xprt) {
631 set_bit(XPT_CLOSE, &xprt->xpt_flags);
632 svc_xprt_enqueue(xprt);
633 }
634 break;
635 default:
636 dprintk("svcrdma: Unexpected event on DTO endpoint %p, "
637 "event=%d\n", cma_id, event->event);
638 break;
639 }
640 return 0;
641}
642
643/*
644 * Create a listening RDMA service endpoint.
645 */
646static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
647 struct sockaddr *sa, int salen,
648 int flags)
649{
650 struct rdma_cm_id *listen_id;
651 struct svcxprt_rdma *cma_xprt;
652 struct svc_xprt *xprt;
653 int ret;
654
655 dprintk("svcrdma: Creating RDMA socket\n");
656
657 cma_xprt = rdma_create_xprt(serv, 1);
658 if (!cma_xprt)
659		return ERR_PTR(-ENOMEM);
660 xprt = &cma_xprt->sc_xprt;
661
662 listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP);
663 if (IS_ERR(listen_id)) {
664 rdma_destroy_xprt(cma_xprt);
665 dprintk("svcrdma: rdma_create_id failed = %ld\n",
666 PTR_ERR(listen_id));
667 return (void *)listen_id;
668 }
669 ret = rdma_bind_addr(listen_id, sa);
670 if (ret) {
671 rdma_destroy_xprt(cma_xprt);
672 rdma_destroy_id(listen_id);
673 dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
674 return ERR_PTR(ret);
675 }
676 cma_xprt->sc_cm_id = listen_id;
677
678 ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG);
679 if (ret) {
680 rdma_destroy_id(listen_id);
681 rdma_destroy_xprt(cma_xprt);
682 dprintk("svcrdma: rdma_listen failed = %d\n", ret);
683 }
684
685 /*
686 * We need to use the address from the cm_id in case the
687 * caller specified 0 for the port number.
688 */
689 sa = (struct sockaddr *)&cma_xprt->sc_cm_id->route.addr.src_addr;
690 svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen);
691
692 return &cma_xprt->sc_xprt;
693}
694
695/*
696 * This is the xpo_recvfrom function for listening endpoints. Its
697 * purpose is to accept incoming connections. The CMA callback handler
698 * has already created a new transport and attached it to the new CMA
699 * ID.
700 *
701 * There is a queue of pending connections hung on the listening
702 * transport. This queue contains the new svc_xprt structure. This
703 * function takes svc_xprt structures off the accept_q and completes
704 * the connection.
705 */
706static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
707{
708 struct svcxprt_rdma *listen_rdma;
709 struct svcxprt_rdma *newxprt = NULL;
710 struct rdma_conn_param conn_param;
711 struct ib_qp_init_attr qp_attr;
712 struct ib_device_attr devattr;
713 struct sockaddr *sa;
714 int ret;
715 int i;
716
717 listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
718 clear_bit(XPT_CONN, &xprt->xpt_flags);
719 /* Get the next entry off the accept list */
720 spin_lock_bh(&listen_rdma->sc_lock);
721 if (!list_empty(&listen_rdma->sc_accept_q)) {
722 newxprt = list_entry(listen_rdma->sc_accept_q.next,
723 struct svcxprt_rdma, sc_accept_q);
724 list_del_init(&newxprt->sc_accept_q);
725 }
726 if (!list_empty(&listen_rdma->sc_accept_q))
727 set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags);
728 spin_unlock_bh(&listen_rdma->sc_lock);
729 if (!newxprt)
730 return NULL;
731
732 dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n",
733 newxprt, newxprt->sc_cm_id);
734
735 ret = ib_query_device(newxprt->sc_cm_id->device, &devattr);
736 if (ret) {
737 dprintk("svcrdma: could not query device attributes on "
738 "device %p, rc=%d\n", newxprt->sc_cm_id->device, ret);
739 goto errout;
740 }
741
742 /* Qualify the transport resource defaults with the
743 * capabilities of this particular device */
744 newxprt->sc_max_sge = min((size_t)devattr.max_sge,
745 (size_t)RPCSVC_MAXPAGES);
746 newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr,
747 (size_t)svcrdma_max_requests);
748 newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
749
750 newxprt->sc_ord = min((size_t)devattr.max_qp_rd_atom,
751 (size_t)svcrdma_ord);
752
753 newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device);
754 if (IS_ERR(newxprt->sc_pd)) {
755 dprintk("svcrdma: error creating PD for connect request\n");
756 goto errout;
757 }
758 newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device,
759 sq_comp_handler,
760 cq_event_handler,
761 newxprt,
762 newxprt->sc_sq_depth,
763 0);
764 if (IS_ERR(newxprt->sc_sq_cq)) {
765 dprintk("svcrdma: error creating SQ CQ for connect request\n");
766 goto errout;
767 }
768 newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device,
769 rq_comp_handler,
770 cq_event_handler,
771 newxprt,
772 newxprt->sc_max_requests,
773 0);
774 if (IS_ERR(newxprt->sc_rq_cq)) {
775 dprintk("svcrdma: error creating RQ CQ for connect request\n");
776 goto errout;
777 }
778
779 memset(&qp_attr, 0, sizeof qp_attr);
780 qp_attr.event_handler = qp_event_handler;
781 qp_attr.qp_context = &newxprt->sc_xprt;
782 qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
783 qp_attr.cap.max_recv_wr = newxprt->sc_max_requests;
784 qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
785 qp_attr.cap.max_recv_sge = newxprt->sc_max_sge;
786 qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
787 qp_attr.qp_type = IB_QPT_RC;
788 qp_attr.send_cq = newxprt->sc_sq_cq;
789 qp_attr.recv_cq = newxprt->sc_rq_cq;
790 dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n"
791 " cm_id->device=%p, sc_pd->device=%p\n"
792 " cap.max_send_wr = %d\n"
793 " cap.max_recv_wr = %d\n"
794 " cap.max_send_sge = %d\n"
795 " cap.max_recv_sge = %d\n",
796 newxprt->sc_cm_id, newxprt->sc_pd,
797 newxprt->sc_cm_id->device, newxprt->sc_pd->device,
798 qp_attr.cap.max_send_wr,
799 qp_attr.cap.max_recv_wr,
800 qp_attr.cap.max_send_sge,
801 qp_attr.cap.max_recv_sge);
802
803 ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
804 if (ret) {
805 /*
806 * XXX: This is a hack. We need a xx_request_qp interface
807 * that will adjust the qp_attr's with a best-effort
808 * number
809 */
810 qp_attr.cap.max_send_sge -= 2;
811 qp_attr.cap.max_recv_sge -= 2;
812 ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd,
813 &qp_attr);
814 if (ret) {
815 dprintk("svcrdma: failed to create QP, ret=%d\n", ret);
816 goto errout;
817 }
818 newxprt->sc_max_sge = qp_attr.cap.max_send_sge;
819 newxprt->sc_max_sge = qp_attr.cap.max_recv_sge;
820 newxprt->sc_sq_depth = qp_attr.cap.max_send_wr;
821 newxprt->sc_max_requests = qp_attr.cap.max_recv_wr;
822 }
823 newxprt->sc_qp = newxprt->sc_cm_id->qp;
824
825 /* Register all of physical memory */
826 newxprt->sc_phys_mr = ib_get_dma_mr(newxprt->sc_pd,
827 IB_ACCESS_LOCAL_WRITE |
828 IB_ACCESS_REMOTE_WRITE);
829 if (IS_ERR(newxprt->sc_phys_mr)) {
830 dprintk("svcrdma: Failed to create DMA MR ret=%d\n", ret);
831 goto errout;
832 }
833
834 /* Post receive buffers */
835 for (i = 0; i < newxprt->sc_max_requests; i++) {
836 ret = svc_rdma_post_recv(newxprt);
837 if (ret) {
838 dprintk("svcrdma: failure posting receive buffers\n");
839 goto errout;
840 }
841 }
842
843 /* Swap out the handler */
844 newxprt->sc_cm_id->event_handler = rdma_cma_handler;
845
846 /* Accept Connection */
847 set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
848 memset(&conn_param, 0, sizeof conn_param);
849 conn_param.responder_resources = 0;
850 conn_param.initiator_depth = newxprt->sc_ord;
851 ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
852 if (ret) {
853 dprintk("svcrdma: failed to accept new connection, ret=%d\n",
854 ret);
855 goto errout;
856 }
857
858 dprintk("svcrdma: new connection %p accepted with the following "
859 "attributes:\n"
860 " local_ip : %d.%d.%d.%d\n"
861 " local_port : %d\n"
862 " remote_ip : %d.%d.%d.%d\n"
863 " remote_port : %d\n"
864 " max_sge : %d\n"
865 " sq_depth : %d\n"
866 " max_requests : %d\n"
867 " ord : %d\n",
868 newxprt,
869 NIPQUAD(((struct sockaddr_in *)&newxprt->sc_cm_id->
870 route.addr.src_addr)->sin_addr.s_addr),
871 ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
872 route.addr.src_addr)->sin_port),
873 NIPQUAD(((struct sockaddr_in *)&newxprt->sc_cm_id->
874 route.addr.dst_addr)->sin_addr.s_addr),
875 ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
876 route.addr.dst_addr)->sin_port),
877 newxprt->sc_max_sge,
878 newxprt->sc_sq_depth,
879 newxprt->sc_max_requests,
880 newxprt->sc_ord);
881
882 /* Set the local and remote addresses in the transport */
883 sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
884 svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
885 sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
886 svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa));
887
888 ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP);
889 ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP);
890 return &newxprt->sc_xprt;
891
892 errout:
893 dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret);
894 rdma_destroy_id(newxprt->sc_cm_id);
895 rdma_destroy_xprt(newxprt);
896 return NULL;
897}
898
899/*
900 * Post an RQ WQE to the RQ when the rqst is being released. This
901 * effectively returns an RQ credit to the client. The rq_xprt_ctxt
902 * will be null if the request is deferred due to an RDMA_READ, or if the
903 * transport had no data ready (EAGAIN). Note that an RPC deferred in
904 * svc_process will still return the credit because the data has been
905 * copied and no longer consumes a WQE/WC.
906 */
907static void svc_rdma_release_rqst(struct svc_rqst *rqstp)
908{
909 int err;
910 struct svcxprt_rdma *rdma =
911 container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt);
912 if (rqstp->rq_xprt_ctxt) {
913 BUG_ON(rqstp->rq_xprt_ctxt != rdma);
914 err = svc_rdma_post_recv(rdma);
915 if (err)
916 dprintk("svcrdma: failed to post an RQ WQE error=%d\n",
917 err);
918 }
919 rqstp->rq_xprt_ctxt = NULL;
920}
921
922/* Disable data ready events for this connection */
923static void svc_rdma_detach(struct svc_xprt *xprt)
924{
925 struct svcxprt_rdma *rdma =
926 container_of(xprt, struct svcxprt_rdma, sc_xprt);
927 unsigned long flags;
928
929 dprintk("svc: svc_rdma_detach(%p)\n", xprt);
930 /*
931 * Shutdown the connection. This will ensure we don't get any
932 * more events from the provider.
933 */
934 rdma_disconnect(rdma->sc_cm_id);
935 rdma_destroy_id(rdma->sc_cm_id);
936
937 /* We may already be on the DTO list */
938 spin_lock_irqsave(&dto_lock, flags);
939 if (!list_empty(&rdma->sc_dto_q))
940 list_del_init(&rdma->sc_dto_q);
941 spin_unlock_irqrestore(&dto_lock, flags);
942}
943
944static void svc_rdma_free(struct svc_xprt *xprt)
945{
946 struct svcxprt_rdma *rdma = (struct svcxprt_rdma *)xprt;
947 dprintk("svcrdma: svc_rdma_free(%p)\n", rdma);
948 rdma_destroy_xprt(rdma);
949 kfree(rdma);
950}
951
952static void rdma_destroy_xprt(struct svcxprt_rdma *xprt)
953{
954 if (xprt->sc_qp && !IS_ERR(xprt->sc_qp))
955 ib_destroy_qp(xprt->sc_qp);
956
957 if (xprt->sc_sq_cq && !IS_ERR(xprt->sc_sq_cq))
958 ib_destroy_cq(xprt->sc_sq_cq);
959
960 if (xprt->sc_rq_cq && !IS_ERR(xprt->sc_rq_cq))
961 ib_destroy_cq(xprt->sc_rq_cq);
962
963 if (xprt->sc_phys_mr && !IS_ERR(xprt->sc_phys_mr))
964 ib_dereg_mr(xprt->sc_phys_mr);
965
966 if (xprt->sc_pd && !IS_ERR(xprt->sc_pd))
967 ib_dealloc_pd(xprt->sc_pd);
968
969 destroy_context_cache(xprt->sc_ctxt_head);
970}
971
972static int svc_rdma_has_wspace(struct svc_xprt *xprt)
973{
974 struct svcxprt_rdma *rdma =
975 container_of(xprt, struct svcxprt_rdma, sc_xprt);
976
977 /*
978 * If there are fewer SQ WR available than required to send a
979 * simple response, return false.
980 */
981 if ((rdma->sc_sq_depth - atomic_read(&rdma->sc_sq_count) < 3))
982 return 0;
983
984 /*
985 * ...or there are already waiters on the SQ,
986 * return false.
987 */
988 if (waitqueue_active(&rdma->sc_send_wait))
989 return 0;
990
991 /* Otherwise return true. */
992 return 1;
993}
994
995int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
996{
997 struct ib_send_wr *bad_wr;
998 int ret;
999
1000 if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
1001 return 0;
1002
1003 BUG_ON(wr->send_flags != IB_SEND_SIGNALED);
1004 BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op !=
1005 wr->opcode);
1006 /* If the SQ is full, wait until an SQ entry is available */
1007 while (1) {
1008 spin_lock_bh(&xprt->sc_lock);
1009 if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) {
1010 spin_unlock_bh(&xprt->sc_lock);
1011 atomic_inc(&rdma_stat_sq_starve);
1012 /* See if we can reap some SQ WR */
1013 sq_cq_reap(xprt);
1014
1015 /* Wait until SQ WR available if SQ still full */
1016 wait_event(xprt->sc_send_wait,
1017 atomic_read(&xprt->sc_sq_count) <
1018 xprt->sc_sq_depth);
1019 continue;
1020 }
1021		/* Bump the used SQ WR count and post */
1022 ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
1023 if (!ret)
1024 atomic_inc(&xprt->sc_sq_count);
1025 else
1026 dprintk("svcrdma: failed to post SQ WR rc=%d, "
1027 "sc_sq_count=%d, sc_sq_depth=%d\n",
1028 ret, atomic_read(&xprt->sc_sq_count),
1029 xprt->sc_sq_depth);
1030 spin_unlock_bh(&xprt->sc_lock);
1031 break;
1032 }
1033 return ret;
1034}
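/*
 * Editorial note (not part of the original patch): an illustrative
 * caller sketch.  The BUG_ON()s above encode the posting contract: the
 * WR must be signaled, wr_id must carry the svc_rdma_op_ctxt pointer,
 * and ctxt->wr_op must match wr.opcode.  send_reply() in
 * svc_rdma_sendto.c and svc_rdma_send_error() below follow this
 * pattern:
 *
 *	struct ib_send_wr wr;
 *	struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt);
 *
 *	memset(&wr, 0, sizeof wr);
 *	ctxt->wr_op = IB_WR_SEND;
 *	wr.wr_id = (unsigned long)ctxt;
 *	wr.opcode = IB_WR_SEND;
 *	wr.send_flags = IB_SEND_SIGNALED;
 *	wr.sg_list = ctxt->sge;
 *	wr.num_sge = 1;
 *	if (svc_rdma_send(xprt, &wr))
 *		svc_rdma_put_context(ctxt, 1);
 */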
1035
1036int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
1037 enum rpcrdma_errcode err)
1038{
1039 struct ib_send_wr err_wr;
1040 struct ib_sge sge;
1041 struct page *p;
1042 struct svc_rdma_op_ctxt *ctxt;
1043 u32 *va;
1044 int length;
1045 int ret;
1046
1047 p = svc_rdma_get_page();
1048 va = page_address(p);
1049
1050 /* XDR encode error */
1051 length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
1052
1053 /* Prepare SGE for local address */
1054 sge.addr = ib_dma_map_page(xprt->sc_cm_id->device,
1055 p, 0, PAGE_SIZE, DMA_FROM_DEVICE);
1056 sge.lkey = xprt->sc_phys_mr->lkey;
1057 sge.length = length;
1058
1059 ctxt = svc_rdma_get_context(xprt);
1060 ctxt->count = 1;
1061 ctxt->pages[0] = p;
1062
1063 /* Prepare SEND WR */
1064 memset(&err_wr, 0, sizeof err_wr);
1065 ctxt->wr_op = IB_WR_SEND;
1066 err_wr.wr_id = (unsigned long)ctxt;
1067 err_wr.sg_list = &sge;
1068 err_wr.num_sge = 1;
1069 err_wr.opcode = IB_WR_SEND;
1070 err_wr.send_flags = IB_SEND_SIGNALED;
1071
1072 /* Post It */
1073 ret = svc_rdma_send(xprt, &err_wr);
1074 if (ret) {
1075 dprintk("svcrdma: Error posting send = %d\n", ret);
1076 svc_rdma_put_context(ctxt, 1);
1077 }
1078
1079 return ret;
1080}