Diffstat (limited to 'net/sunrpc')
-rw-r--r--  net/sunrpc/clnt.c                          |   2
-rw-r--r--  net/sunrpc/rpcb_clnt.c                     |  81
-rw-r--r--  net/sunrpc/svc.c                           | 251
-rw-r--r--  net/sunrpc/svc_xprt.c                      |  39
-rw-r--r--  net/sunrpc/svcsock.c                       |  17
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c             |   4
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_recvfrom.c    | 187
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_sendto.c      | 255
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_transport.c   | 364
-rw-r--r--  net/sunrpc/xprtsock.c                      |   4
10 files changed, 993 insertions, 211 deletions
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index a59cdf423c1c..4895c341e46d 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -174,7 +174,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
174 clnt->cl_procinfo = version->procs; 174 clnt->cl_procinfo = version->procs;
175 clnt->cl_maxproc = version->nrprocs; 175 clnt->cl_maxproc = version->nrprocs;
176 clnt->cl_protname = program->name; 176 clnt->cl_protname = program->name;
177 clnt->cl_prog = program->number; 177 clnt->cl_prog = args->prognumber ? : program->number;
178 clnt->cl_vers = version->number; 178 clnt->cl_vers = version->number;
179 clnt->cl_stats = program->stats; 179 clnt->cl_stats = program->stats;
180 clnt->cl_metrics = rpc_alloc_iostats(clnt); 180 clnt->cl_metrics = rpc_alloc_iostats(clnt);
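Note: the clnt.c hunk uses the GNU "?:" operator with an omitted middle operand; as a clarifying sketch (not part of the patch), it expands to:

    /* args->prognumber overrides the program's default number when non-zero */
    clnt->cl_prog = args->prognumber ? args->prognumber : program->number;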
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 0a22f00734a4..41013dd66ac3 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -20,6 +20,7 @@
20#include <linux/in6.h> 20#include <linux/in6.h>
21#include <linux/kernel.h> 21#include <linux/kernel.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <net/ipv6.h>
23 24
24#include <linux/sunrpc/clnt.h> 25#include <linux/sunrpc/clnt.h>
25#include <linux/sunrpc/sched.h> 26#include <linux/sunrpc/sched.h>
@@ -176,13 +177,12 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
176} 177}
177 178
178static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, 179static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
179 u32 version, struct rpc_message *msg, 180 u32 version, struct rpc_message *msg)
180 int *result)
181{ 181{
182 struct rpc_clnt *rpcb_clnt; 182 struct rpc_clnt *rpcb_clnt;
183 int error = 0; 183 int result, error = 0;
184 184
185 *result = 0; 185 msg->rpc_resp = &result;
186 186
187 rpcb_clnt = rpcb_create_local(addr, addrlen, version); 187 rpcb_clnt = rpcb_create_local(addr, addrlen, version);
188 if (!IS_ERR(rpcb_clnt)) { 188 if (!IS_ERR(rpcb_clnt)) {
@@ -191,12 +191,15 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
191 } else 191 } else
192 error = PTR_ERR(rpcb_clnt); 192 error = PTR_ERR(rpcb_clnt);
193 193
194 if (error < 0) 194 if (error < 0) {
195 printk(KERN_WARNING "RPC: failed to contact local rpcbind " 195 printk(KERN_WARNING "RPC: failed to contact local rpcbind "
196 "server (errno %d).\n", -error); 196 "server (errno %d).\n", -error);
197 dprintk("RPC: registration status %d/%d\n", error, *result); 197 return error;
198 }
198 199
199 return error; 200 if (!result)
201 return -EACCES;
202 return 0;
200} 203}
201 204
202/** 205/**
@@ -205,7 +208,11 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
205 * @vers: RPC version number to bind 208 * @vers: RPC version number to bind
206 * @prot: transport protocol to register 209 * @prot: transport protocol to register
207 * @port: port value to register 210 * @port: port value to register
208 * @okay: OUT: result code 211 *
212 * Returns zero if the registration request was dispatched successfully
213 * and the rpcbind daemon returned success. Otherwise, returns an errno
214 * value that reflects the nature of the error (request could not be
215 * dispatched, timed out, or rpcbind returned an error).
209 * 216 *
210 * RPC services invoke this function to advertise their contact 217 * RPC services invoke this function to advertise their contact
211 * information via the system's rpcbind daemon. RPC services 218 * information via the system's rpcbind daemon. RPC services
@@ -217,15 +224,6 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
217 * all registered transports for [program, version] from the local 224 * all registered transports for [program, version] from the local
218 * rpcbind database. 225 * rpcbind database.
219 * 226 *
220 * Returns zero if the registration request was dispatched
221 * successfully and a reply was received. The rpcbind daemon's
222 * boolean result code is stored in *okay.
223 *
224 * Returns an errno value and sets *result to zero if there was
225 * some problem that prevented the rpcbind request from being
226 * dispatched, or if the rpcbind daemon did not respond within
227 * the timeout.
228 *
229 * This function uses rpcbind protocol version 2 to contact the 227 * This function uses rpcbind protocol version 2 to contact the
230 * local rpcbind daemon. 228 * local rpcbind daemon.
231 * 229 *
@@ -236,7 +234,7 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
236 * IN6ADDR_ANY (ie available for all AF_INET and AF_INET6 234 * IN6ADDR_ANY (ie available for all AF_INET and AF_INET6
237 * addresses). 235 * addresses).
238 */ 236 */
239int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) 237int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port)
240{ 238{
241 struct rpcbind_args map = { 239 struct rpcbind_args map = {
242 .r_prog = prog, 240 .r_prog = prog,
@@ -246,7 +244,6 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
246 }; 244 };
247 struct rpc_message msg = { 245 struct rpc_message msg = {
248 .rpc_argp = &map, 246 .rpc_argp = &map,
249 .rpc_resp = okay,
250 }; 247 };
251 248
252 dprintk("RPC: %sregistering (%u, %u, %d, %u) with local " 249 dprintk("RPC: %sregistering (%u, %u, %d, %u) with local "
@@ -259,7 +256,7 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
259 256
260 return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback, 257 return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback,
261 sizeof(rpcb_inaddr_loopback), 258 sizeof(rpcb_inaddr_loopback),
262 RPCBVERS_2, &msg, okay); 259 RPCBVERS_2, &msg);
263} 260}
264 261
265/* 262/*
@@ -290,7 +287,7 @@ static int rpcb_register_netid4(struct sockaddr_in *address_to_register,
290 287
291 return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback, 288 return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback,
292 sizeof(rpcb_inaddr_loopback), 289 sizeof(rpcb_inaddr_loopback),
293 RPCBVERS_4, msg, msg->rpc_resp); 290 RPCBVERS_4, msg);
294} 291}
295 292
296/* 293/*
@@ -304,10 +301,13 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
304 char buf[64]; 301 char buf[64];
305 302
306 /* Construct AF_INET6 universal address */ 303 /* Construct AF_INET6 universal address */
307 snprintf(buf, sizeof(buf), 304 if (ipv6_addr_any(&address_to_register->sin6_addr))
308 NIP6_FMT".%u.%u", 305 snprintf(buf, sizeof(buf), "::.%u.%u",
309 NIP6(address_to_register->sin6_addr), 306 port >> 8, port & 0xff);
310 port >> 8, port & 0xff); 307 else
308 snprintf(buf, sizeof(buf), NIP6_FMT".%u.%u",
309 NIP6(address_to_register->sin6_addr),
310 port >> 8, port & 0xff);
311 map->r_addr = buf; 311 map->r_addr = buf;
312 312
313 dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " 313 dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with "
@@ -321,7 +321,7 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
321 321
322 return rpcb_register_call((struct sockaddr *)&rpcb_in6addr_loopback, 322 return rpcb_register_call((struct sockaddr *)&rpcb_in6addr_loopback,
323 sizeof(rpcb_in6addr_loopback), 323 sizeof(rpcb_in6addr_loopback),
324 RPCBVERS_4, msg, msg->rpc_resp); 324 RPCBVERS_4, msg);
325} 325}
326 326
327/** 327/**
@@ -330,7 +330,11 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
330 * @version: RPC version number of service to (un)register 330 * @version: RPC version number of service to (un)register
331 * @address: address family, IP address, and port to (un)register 331 * @address: address family, IP address, and port to (un)register
332 * @netid: netid of transport protocol to (un)register 332 * @netid: netid of transport protocol to (un)register
333 * @result: result code from rpcbind RPC call 333 *
334 * Returns zero if the registration request was dispatched successfully
335 * and the rpcbind daemon returned success. Otherwise, returns an errno
336 * value that reflects the nature of the error (request could not be
337 * dispatched, timed out, or rpcbind returned an error).
334 * 338 *
335 * RPC services invoke this function to advertise their contact 339 * RPC services invoke this function to advertise their contact
336 * information via the system's rpcbind daemon. RPC services 340 * information via the system's rpcbind daemon. RPC services
@@ -342,15 +346,6 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
342 * to zero. Callers pass a netid of "" to unregister all 346 * to zero. Callers pass a netid of "" to unregister all
343 * transport netids associated with [program, version, address]. 347 * transport netids associated with [program, version, address].
344 * 348 *
345 * Returns zero if the registration request was dispatched
346 * successfully and a reply was received. The rpcbind daemon's
347 * result code is stored in *result.
348 *
349 * Returns an errno value and sets *result to zero if there was
350 * some problem that prevented the rpcbind request from being
351 * dispatched, or if the rpcbind daemon did not respond within
352 * the timeout.
353 *
354 * This function uses rpcbind protocol version 4 to contact the 349 * This function uses rpcbind protocol version 4 to contact the
355 * local rpcbind daemon. The local rpcbind daemon must support 350 * local rpcbind daemon. The local rpcbind daemon must support
356 * version 4 of the rpcbind protocol in order for these functions 351 * version 4 of the rpcbind protocol in order for these functions
@@ -372,8 +367,7 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
372 * advertises the service on all IPv4 and IPv6 addresses. 367 * advertises the service on all IPv4 and IPv6 addresses.
373 */ 368 */
374int rpcb_v4_register(const u32 program, const u32 version, 369int rpcb_v4_register(const u32 program, const u32 version,
375 const struct sockaddr *address, const char *netid, 370 const struct sockaddr *address, const char *netid)
376 int *result)
377{ 371{
378 struct rpcbind_args map = { 372 struct rpcbind_args map = {
379 .r_prog = program, 373 .r_prog = program,
@@ -383,11 +377,8 @@ int rpcb_v4_register(const u32 program, const u32 version,
383 }; 377 };
384 struct rpc_message msg = { 378 struct rpc_message msg = {
385 .rpc_argp = &map, 379 .rpc_argp = &map,
386 .rpc_resp = result,
387 }; 380 };
388 381
389 *result = 0;
390
391 switch (address->sa_family) { 382 switch (address->sa_family) {
392 case AF_INET: 383 case AF_INET:
393 return rpcb_register_netid4((struct sockaddr_in *)address, 384 return rpcb_register_netid4((struct sockaddr_in *)address,
@@ -657,7 +648,7 @@ static void rpcb_getport_done(struct rpc_task *child, void *data)
657static int rpcb_encode_mapping(struct rpc_rqst *req, __be32 *p, 648static int rpcb_encode_mapping(struct rpc_rqst *req, __be32 *p,
658 struct rpcbind_args *rpcb) 649 struct rpcbind_args *rpcb)
659{ 650{
660 dprintk("RPC: rpcb_encode_mapping(%u, %u, %d, %u)\n", 651 dprintk("RPC: encoding rpcb request (%u, %u, %d, %u)\n",
661 rpcb->r_prog, rpcb->r_vers, rpcb->r_prot, rpcb->r_port); 652 rpcb->r_prog, rpcb->r_vers, rpcb->r_prot, rpcb->r_port);
662 *p++ = htonl(rpcb->r_prog); 653 *p++ = htonl(rpcb->r_prog);
663 *p++ = htonl(rpcb->r_vers); 654 *p++ = htonl(rpcb->r_vers);
@@ -672,7 +663,7 @@ static int rpcb_decode_getport(struct rpc_rqst *req, __be32 *p,
672 unsigned short *portp) 663 unsigned short *portp)
673{ 664{
674 *portp = (unsigned short) ntohl(*p++); 665 *portp = (unsigned short) ntohl(*p++);
675 dprintk("RPC: rpcb_decode_getport result %u\n", 666 dprintk("RPC: rpcb getport result: %u\n",
676 *portp); 667 *portp);
677 return 0; 668 return 0;
678} 669}
@@ -681,7 +672,7 @@ static int rpcb_decode_set(struct rpc_rqst *req, __be32 *p,
681 unsigned int *boolp) 672 unsigned int *boolp)
682{ 673{
683 *boolp = (unsigned int) ntohl(*p++); 674 *boolp = (unsigned int) ntohl(*p++);
684 dprintk("RPC: rpcb_decode_set: call %s\n", 675 dprintk("RPC: rpcb set/unset call %s\n",
685 (*boolp ? "succeeded" : "failed")); 676 (*boolp ? "succeeded" : "failed"));
686 return 0; 677 return 0;
687} 678}
@@ -689,7 +680,7 @@ static int rpcb_decode_set(struct rpc_rqst *req, __be32 *p,
689static int rpcb_encode_getaddr(struct rpc_rqst *req, __be32 *p, 680static int rpcb_encode_getaddr(struct rpc_rqst *req, __be32 *p,
690 struct rpcbind_args *rpcb) 681 struct rpcbind_args *rpcb)
691{ 682{
692 dprintk("RPC: rpcb_encode_getaddr(%u, %u, %s)\n", 683 dprintk("RPC: encoding rpcb request (%u, %u, %s)\n",
693 rpcb->r_prog, rpcb->r_vers, rpcb->r_addr); 684 rpcb->r_prog, rpcb->r_vers, rpcb->r_addr);
694 *p++ = htonl(rpcb->r_prog); 685 *p++ = htonl(rpcb->r_prog);
695 *p++ = htonl(rpcb->r_vers); 686 *p++ = htonl(rpcb->r_vers);
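Note: with the reworked rpcb_register_call(), callers check a single return value instead of passing a separate *okay out-parameter. A hedged usage sketch (program, version, and port values are illustrative, not from this patch):

    /* 0: rpcbind accepted the SET; -EACCES: rpcbind replied but refused it;
     * any other negative errno: the request could not be dispatched or timed out.
     */
    error = rpcb_register(100003, 3, IPPROTO_TCP, 2049);
    if (error < 0)
            printk(KERN_WARNING "RPC: registration failed (%d)\n", error);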
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 5a32cb7c4bb4..54c98d876847 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -28,6 +28,8 @@
28 28
29#define RPCDBG_FACILITY RPCDBG_SVCDSP 29#define RPCDBG_FACILITY RPCDBG_SVCDSP
30 30
31static void svc_unregister(const struct svc_serv *serv);
32
31#define svc_serv_is_pooled(serv) ((serv)->sv_function) 33#define svc_serv_is_pooled(serv) ((serv)->sv_function)
32 34
33/* 35/*
@@ -357,7 +359,7 @@ svc_pool_for_cpu(struct svc_serv *serv, int cpu)
357 */ 359 */
358static struct svc_serv * 360static struct svc_serv *
359__svc_create(struct svc_program *prog, unsigned int bufsize, int npools, 361__svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
360 void (*shutdown)(struct svc_serv *serv)) 362 sa_family_t family, void (*shutdown)(struct svc_serv *serv))
361{ 363{
362 struct svc_serv *serv; 364 struct svc_serv *serv;
363 unsigned int vers; 365 unsigned int vers;
@@ -366,6 +368,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
366 368
367 if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL))) 369 if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL)))
368 return NULL; 370 return NULL;
371 serv->sv_family = family;
369 serv->sv_name = prog->pg_name; 372 serv->sv_name = prog->pg_name;
370 serv->sv_program = prog; 373 serv->sv_program = prog;
371 serv->sv_nrthreads = 1; 374 serv->sv_nrthreads = 1;
@@ -416,30 +419,29 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
416 spin_lock_init(&pool->sp_lock); 419 spin_lock_init(&pool->sp_lock);
417 } 420 }
418 421
419
420 /* Remove any stale portmap registrations */ 422 /* Remove any stale portmap registrations */
421 svc_register(serv, 0, 0); 423 svc_unregister(serv);
422 424
423 return serv; 425 return serv;
424} 426}
425 427
426struct svc_serv * 428struct svc_serv *
427svc_create(struct svc_program *prog, unsigned int bufsize, 429svc_create(struct svc_program *prog, unsigned int bufsize,
428 void (*shutdown)(struct svc_serv *serv)) 430 sa_family_t family, void (*shutdown)(struct svc_serv *serv))
429{ 431{
430 return __svc_create(prog, bufsize, /*npools*/1, shutdown); 432 return __svc_create(prog, bufsize, /*npools*/1, family, shutdown);
431} 433}
432EXPORT_SYMBOL(svc_create); 434EXPORT_SYMBOL(svc_create);
433 435
434struct svc_serv * 436struct svc_serv *
435svc_create_pooled(struct svc_program *prog, unsigned int bufsize, 437svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
436 void (*shutdown)(struct svc_serv *serv), 438 sa_family_t family, void (*shutdown)(struct svc_serv *serv),
437 svc_thread_fn func, struct module *mod) 439 svc_thread_fn func, struct module *mod)
438{ 440{
439 struct svc_serv *serv; 441 struct svc_serv *serv;
440 unsigned int npools = svc_pool_map_get(); 442 unsigned int npools = svc_pool_map_get();
441 443
442 serv = __svc_create(prog, bufsize, npools, shutdown); 444 serv = __svc_create(prog, bufsize, npools, family, shutdown);
443 445
444 if (serv != NULL) { 446 if (serv != NULL) {
445 serv->sv_function = func; 447 serv->sv_function = func;
@@ -486,8 +488,7 @@ svc_destroy(struct svc_serv *serv)
486 if (svc_serv_is_pooled(serv)) 488 if (svc_serv_is_pooled(serv))
487 svc_pool_map_put(); 489 svc_pool_map_put();
488 490
489 /* Unregister service with the portmapper */ 491 svc_unregister(serv);
490 svc_register(serv, 0, 0);
491 kfree(serv->sv_pools); 492 kfree(serv->sv_pools);
492 kfree(serv); 493 kfree(serv);
493} 494}
@@ -718,55 +719,245 @@ svc_exit_thread(struct svc_rqst *rqstp)
718} 719}
719EXPORT_SYMBOL(svc_exit_thread); 720EXPORT_SYMBOL(svc_exit_thread);
720 721
722#ifdef CONFIG_SUNRPC_REGISTER_V4
723
721/* 724/*
722 * Register an RPC service with the local portmapper. 725 * Register an "inet" protocol family netid with the local
723 * To unregister a service, call this routine with 726 * rpcbind daemon via an rpcbind v4 SET request.
724 * proto and port == 0. 727 *
728 * No netconfig infrastructure is available in the kernel, so
729 * we map IP_ protocol numbers to netids by hand.
730 *
731 * Returns zero on success; a negative errno value is returned
732 * if any error occurs.
725 */ 733 */
726int 734static int __svc_rpcb_register4(const u32 program, const u32 version,
727svc_register(struct svc_serv *serv, int proto, unsigned short port) 735 const unsigned short protocol,
736 const unsigned short port)
737{
738 struct sockaddr_in sin = {
739 .sin_family = AF_INET,
740 .sin_addr.s_addr = htonl(INADDR_ANY),
741 .sin_port = htons(port),
742 };
743 char *netid;
744
745 switch (protocol) {
746 case IPPROTO_UDP:
747 netid = RPCBIND_NETID_UDP;
748 break;
749 case IPPROTO_TCP:
750 netid = RPCBIND_NETID_TCP;
751 break;
752 default:
753 return -EPROTONOSUPPORT;
754 }
755
756 return rpcb_v4_register(program, version,
757 (struct sockaddr *)&sin, netid);
758}
759
760/*
761 * Register an "inet6" protocol family netid with the local
762 * rpcbind daemon via an rpcbind v4 SET request.
763 *
764 * No netconfig infrastructure is available in the kernel, so
765 * we map IP_ protocol numbers to netids by hand.
766 *
767 * Returns zero on success; a negative errno value is returned
768 * if any error occurs.
769 */
770static int __svc_rpcb_register6(const u32 program, const u32 version,
771 const unsigned short protocol,
772 const unsigned short port)
773{
774 struct sockaddr_in6 sin6 = {
775 .sin6_family = AF_INET6,
776 .sin6_addr = IN6ADDR_ANY_INIT,
777 .sin6_port = htons(port),
778 };
779 char *netid;
780
781 switch (protocol) {
782 case IPPROTO_UDP:
783 netid = RPCBIND_NETID_UDP6;
784 break;
785 case IPPROTO_TCP:
786 netid = RPCBIND_NETID_TCP6;
787 break;
788 default:
789 return -EPROTONOSUPPORT;
790 }
791
792 return rpcb_v4_register(program, version,
793 (struct sockaddr *)&sin6, netid);
794}
795
796/*
797 * Register a kernel RPC service via rpcbind version 4.
798 *
799 * Returns zero on success; a negative errno value is returned
800 * if any error occurs.
801 */
802static int __svc_register(const u32 program, const u32 version,
803 const sa_family_t family,
804 const unsigned short protocol,
805 const unsigned short port)
806{
807 int error;
808
809 switch (family) {
810 case AF_INET:
811 return __svc_rpcb_register4(program, version,
812 protocol, port);
813 case AF_INET6:
814 error = __svc_rpcb_register6(program, version,
815 protocol, port);
816 if (error < 0)
817 return error;
818
819 /*
820 * Work around bug in some versions of Linux rpcbind
821 * which don't allow registration of both inet and
822 * inet6 netids.
823 *
824 * Error return ignored for now.
825 */
826 __svc_rpcb_register4(program, version,
827 protocol, port);
828 return 0;
829 }
830
831 return -EAFNOSUPPORT;
832}
833
834#else /* CONFIG_SUNRPC_REGISTER_V4 */
835
836/*
837 * Register a kernel RPC service via rpcbind version 2.
838 *
839 * Returns zero on success; a negative errno value is returned
840 * if any error occurs.
841 */
842static int __svc_register(const u32 program, const u32 version,
843 sa_family_t family,
844 const unsigned short protocol,
845 const unsigned short port)
846{
847 if (family != AF_INET)
848 return -EAFNOSUPPORT;
849
850 return rpcb_register(program, version, protocol, port);
851}
852
853#endif /* CONFIG_SUNRPC_REGISTER_V4 */
854
855/**
856 * svc_register - register an RPC service with the local portmapper
857 * @serv: svc_serv struct for the service to register
858 * @proto: transport protocol number to advertise
859 * @port: port to advertise
860 *
861 * Service is registered for any address in serv's address family
862 */
863int svc_register(const struct svc_serv *serv, const unsigned short proto,
864 const unsigned short port)
728{ 865{
729 struct svc_program *progp; 866 struct svc_program *progp;
730 unsigned long flags;
731 unsigned int i; 867 unsigned int i;
732 int error = 0, dummy; 868 int error = 0;
733 869
734 if (!port) 870 BUG_ON(proto == 0 && port == 0);
735 clear_thread_flag(TIF_SIGPENDING);
736 871
737 for (progp = serv->sv_program; progp; progp = progp->pg_next) { 872 for (progp = serv->sv_program; progp; progp = progp->pg_next) {
738 for (i = 0; i < progp->pg_nvers; i++) { 873 for (i = 0; i < progp->pg_nvers; i++) {
739 if (progp->pg_vers[i] == NULL) 874 if (progp->pg_vers[i] == NULL)
740 continue; 875 continue;
741 876
742 dprintk("svc: svc_register(%s, %s, %d, %d)%s\n", 877 dprintk("svc: svc_register(%sv%d, %s, %u, %u)%s\n",
743 progp->pg_name, 878 progp->pg_name,
879 i,
744 proto == IPPROTO_UDP? "udp" : "tcp", 880 proto == IPPROTO_UDP? "udp" : "tcp",
745 port, 881 port,
746 i, 882 serv->sv_family,
747 progp->pg_vers[i]->vs_hidden? 883 progp->pg_vers[i]->vs_hidden?
748 " (but not telling portmap)" : ""); 884 " (but not telling portmap)" : "");
749 885
750 if (progp->pg_vers[i]->vs_hidden) 886 if (progp->pg_vers[i]->vs_hidden)
751 continue; 887 continue;
752 888
753 error = rpcb_register(progp->pg_prog, i, proto, port, &dummy); 889 error = __svc_register(progp->pg_prog, i,
890 serv->sv_family, proto, port);
754 if (error < 0) 891 if (error < 0)
755 break; 892 break;
756 if (port && !dummy) {
757 error = -EACCES;
758 break;
759 }
760 } 893 }
761 } 894 }
762 895
763 if (!port) { 896 return error;
764 spin_lock_irqsave(&current->sighand->siglock, flags); 897}
765 recalc_sigpending(); 898
766 spin_unlock_irqrestore(&current->sighand->siglock, flags); 899#ifdef CONFIG_SUNRPC_REGISTER_V4
900
901static void __svc_unregister(const u32 program, const u32 version,
902 const char *progname)
903{
904 struct sockaddr_in6 sin6 = {
905 .sin6_family = AF_INET6,
906 .sin6_addr = IN6ADDR_ANY_INIT,
907 .sin6_port = 0,
908 };
909 int error;
910
911 error = rpcb_v4_register(program, version,
912 (struct sockaddr *)&sin6, "");
913 dprintk("svc: %s(%sv%u), error %d\n",
914 __func__, progname, version, error);
915}
916
917#else /* CONFIG_SUNRPC_REGISTER_V4 */
918
919static void __svc_unregister(const u32 program, const u32 version,
920 const char *progname)
921{
922 int error;
923
924 error = rpcb_register(program, version, 0, 0);
925 dprintk("svc: %s(%sv%u), error %d\n",
926 __func__, progname, version, error);
927}
928
929#endif /* CONFIG_SUNRPC_REGISTER_V4 */
930
931/*
932 * All netids, bind addresses and ports registered for [program, version]
933 * are removed from the local rpcbind database (if the service is not
934 * hidden) to make way for a new instance of the service.
935 *
936 * The result of unregistration is reported via dprintk for those who want
937 * verification of the result, but is otherwise not important.
938 */
939static void svc_unregister(const struct svc_serv *serv)
940{
941 struct svc_program *progp;
942 unsigned long flags;
943 unsigned int i;
944
945 clear_thread_flag(TIF_SIGPENDING);
946
947 for (progp = serv->sv_program; progp; progp = progp->pg_next) {
948 for (i = 0; i < progp->pg_nvers; i++) {
949 if (progp->pg_vers[i] == NULL)
950 continue;
951 if (progp->pg_vers[i]->vs_hidden)
952 continue;
953
954 __svc_unregister(progp->pg_prog, i, progp->pg_name);
955 }
767 } 956 }
768 957
769 return error; 958 spin_lock_irqsave(&current->sighand->siglock, flags);
959 recalc_sigpending();
960 spin_unlock_irqrestore(&current->sighand->siglock, flags);
770} 961}
771 962
772/* 963/*
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index e46c825f4954..bf5b5cdafebf 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -159,15 +159,44 @@ void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt,
159} 159}
160EXPORT_SYMBOL_GPL(svc_xprt_init); 160EXPORT_SYMBOL_GPL(svc_xprt_init);
161 161
162int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, 162static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
163 int flags) 163 struct svc_serv *serv,
164 unsigned short port, int flags)
164{ 165{
165 struct svc_xprt_class *xcl;
166 struct sockaddr_in sin = { 166 struct sockaddr_in sin = {
167 .sin_family = AF_INET, 167 .sin_family = AF_INET,
168 .sin_addr.s_addr = htonl(INADDR_ANY), 168 .sin_addr.s_addr = htonl(INADDR_ANY),
169 .sin_port = htons(port), 169 .sin_port = htons(port),
170 }; 170 };
171 struct sockaddr_in6 sin6 = {
172 .sin6_family = AF_INET6,
173 .sin6_addr = IN6ADDR_ANY_INIT,
174 .sin6_port = htons(port),
175 };
176 struct sockaddr *sap;
177 size_t len;
178
179 switch (serv->sv_family) {
180 case AF_INET:
181 sap = (struct sockaddr *)&sin;
182 len = sizeof(sin);
183 break;
184 case AF_INET6:
185 sap = (struct sockaddr *)&sin6;
186 len = sizeof(sin6);
187 break;
188 default:
189 return ERR_PTR(-EAFNOSUPPORT);
190 }
191
192 return xcl->xcl_ops->xpo_create(serv, sap, len, flags);
193}
194
195int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
196 int flags)
197{
198 struct svc_xprt_class *xcl;
199
171 dprintk("svc: creating transport %s[%d]\n", xprt_name, port); 200 dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
172 spin_lock(&svc_xprt_class_lock); 201 spin_lock(&svc_xprt_class_lock);
173 list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) { 202 list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
@@ -180,9 +209,7 @@ int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
180 goto err; 209 goto err;
181 210
182 spin_unlock(&svc_xprt_class_lock); 211 spin_unlock(&svc_xprt_class_lock);
183 newxprt = xcl->xcl_ops-> 212 newxprt = __svc_xpo_create(xcl, serv, port, flags);
184 xpo_create(serv, (struct sockaddr *)&sin, sizeof(sin),
185 flags);
186 if (IS_ERR(newxprt)) { 213 if (IS_ERR(newxprt)) {
187 module_put(xcl->xcl_owner); 214 module_put(xcl->xcl_owner);
188 return PTR_ERR(newxprt); 215 return PTR_ERR(newxprt);
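Note: with __svc_xpo_create() choosing the wildcard address from serv->sv_family, the listener's family is no longer hard-wired to IPv4. A hedged usage sketch (port and flags are illustrative):

    /* An AF_INET6 service now gets an in6addr_any listener from the
     * same call that previously always bound to INADDR_ANY.
     */
    error = svc_create_xprt(serv, "tcp", 2049, SVC_SOCK_DEFAULTS);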
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 3e65719f1ef6..95293f549e9c 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1114,6 +1114,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
1114 struct svc_sock *svsk; 1114 struct svc_sock *svsk;
1115 struct sock *inet; 1115 struct sock *inet;
1116 int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); 1116 int pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
1117 int val;
1117 1118
1118 dprintk("svc: svc_setup_socket %p\n", sock); 1119 dprintk("svc: svc_setup_socket %p\n", sock);
1119 if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { 1120 if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) {
@@ -1146,6 +1147,18 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
1146 else 1147 else
1147 svc_tcp_init(svsk, serv); 1148 svc_tcp_init(svsk, serv);
1148 1149
1150 /*
1151 * We start one listener per sv_serv. We want AF_INET
1152 * requests to be automatically shunted to our AF_INET6
1153 * listener using a mapped IPv4 address. Make sure
1154 * no-one starts an equivalent IPv4 listener, which
1155 * would steal our incoming connections.
1156 */
1157 val = 0;
1158 if (serv->sv_family == AF_INET6)
1159 kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
1160 (char *)&val, sizeof(val));
1161
1149 dprintk("svc: svc_setup_socket created %p (inet %p)\n", 1162 dprintk("svc: svc_setup_socket created %p (inet %p)\n",
1150 svsk, svsk->sk_sk); 1163 svsk, svsk->sk_sk);
1151 1164
@@ -1154,8 +1167,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
1154 1167
1155int svc_addsock(struct svc_serv *serv, 1168int svc_addsock(struct svc_serv *serv,
1156 int fd, 1169 int fd,
1157 char *name_return, 1170 char *name_return)
1158 int *proto)
1159{ 1171{
1160 int err = 0; 1172 int err = 0;
1161 struct socket *so = sockfd_lookup(fd, &err); 1173 struct socket *so = sockfd_lookup(fd, &err);
@@ -1190,7 +1202,6 @@ int svc_addsock(struct svc_serv *serv,
1190 sockfd_put(so); 1202 sockfd_put(so);
1191 return err; 1203 return err;
1192 } 1204 }
1193 if (proto) *proto = so->sk->sk_protocol;
1194 return one_sock_name(name_return, svsk); 1205 return one_sock_name(name_return, svsk);
1195} 1206}
1196EXPORT_SYMBOL_GPL(svc_addsock); 1207EXPORT_SYMBOL_GPL(svc_addsock);
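Note: the IPV6_V6ONLY hunk clears the option so a single AF_INET6 listener also accepts IPv4 clients, which then appear as IPv4-mapped addresses. A user-space sketch of the same idea, for illustration only (not part of the patch):

    int off = 0;
    setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &off, sizeof(off));
    /* IPv4 peers now show up as mapped addresses, e.g. ::ffff:192.0.2.1 */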
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 15101f294e00..14106d26bb95 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -782,7 +782,7 @@ repost:
782 /* check for expected message types */ 782 /* check for expected message types */
783 /* The order of some of these tests is important. */ 783 /* The order of some of these tests is important. */
784 switch (headerp->rm_type) { 784 switch (headerp->rm_type) {
785 case __constant_htonl(RDMA_MSG): 785 case htonl(RDMA_MSG):
786 /* never expect read chunks */ 786 /* never expect read chunks */
787 /* never expect reply chunks (two ways to check) */ 787 /* never expect reply chunks (two ways to check) */
788 /* never expect write chunks without having offered RDMA */ 788 /* never expect write chunks without having offered RDMA */
@@ -821,7 +821,7 @@ repost:
821 rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); 821 rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen);
822 break; 822 break;
823 823
824 case __constant_htonl(RDMA_NOMSG): 824 case htonl(RDMA_NOMSG):
825 /* never expect read or write chunks, always reply chunks */ 825 /* never expect read or write chunks, always reply chunks */
826 if (headerp->rm_body.rm_chunks[0] != xdr_zero || 826 if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
827 headerp->rm_body.rm_chunks[1] != xdr_zero || 827 headerp->rm_body.rm_chunks[1] != xdr_zero ||
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 74de31a06616..a4756576d687 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -116,7 +116,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
116 * 116 *
117 * Assumptions: 117 * Assumptions:
118 * - chunk[0]->position points to pages[0] at an offset of 0 118 * - chunk[0]->position points to pages[0] at an offset of 0
119 * - pages[] is not physically or virtually contigous and consists of 119 * - pages[] is not physically or virtually contiguous and consists of
120 * PAGE_SIZE elements. 120 * PAGE_SIZE elements.
121 * 121 *
122 * Output: 122 * Output:
@@ -125,7 +125,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
125 * chunk in the read list 125 * chunk in the read list
126 * 126 *
127 */ 127 */
128static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, 128static int map_read_chunks(struct svcxprt_rdma *xprt,
129 struct svc_rqst *rqstp, 129 struct svc_rqst *rqstp,
130 struct svc_rdma_op_ctxt *head, 130 struct svc_rdma_op_ctxt *head,
131 struct rpcrdma_msg *rmsgp, 131 struct rpcrdma_msg *rmsgp,
@@ -211,26 +211,128 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
211 return sge_no; 211 return sge_no;
212} 212}
213 213
214static void rdma_set_ctxt_sge(struct svcxprt_rdma *xprt, 214/* Map a read-chunk-list to an XDR and fast register the page-list.
215 struct svc_rdma_op_ctxt *ctxt, 215 *
216 struct kvec *vec, 216 * Assumptions:
217 u64 *sgl_offset, 217 * - chunk[0] position points to pages[0] at an offset of 0
218 int count) 218 * - pages[] will be made physically contiguous by creating a one-off memory
219 * region using the fastreg verb.
220 * - byte_count is # of bytes in read-chunk-list
221 * - ch_count is # of chunks in read-chunk-list
222 *
223 * Output:
224 * - sge array pointing into pages[] array.
225 * - chunk_sge array specifying sge index and count for each
226 * chunk in the read list
227 */
228static int fast_reg_read_chunks(struct svcxprt_rdma *xprt,
229 struct svc_rqst *rqstp,
230 struct svc_rdma_op_ctxt *head,
231 struct rpcrdma_msg *rmsgp,
232 struct svc_rdma_req_map *rpl_map,
233 struct svc_rdma_req_map *chl_map,
234 int ch_count,
235 int byte_count)
236{
237 int page_no;
238 int ch_no;
239 u32 offset;
240 struct rpcrdma_read_chunk *ch;
241 struct svc_rdma_fastreg_mr *frmr;
242 int ret = 0;
243
244 frmr = svc_rdma_get_frmr(xprt);
245 if (IS_ERR(frmr))
246 return -ENOMEM;
247
248 head->frmr = frmr;
249 head->arg.head[0] = rqstp->rq_arg.head[0];
250 head->arg.tail[0] = rqstp->rq_arg.tail[0];
251 head->arg.pages = &head->pages[head->count];
252 head->hdr_count = head->count; /* save count of hdr pages */
253 head->arg.page_base = 0;
254 head->arg.page_len = byte_count;
255 head->arg.len = rqstp->rq_arg.len + byte_count;
256 head->arg.buflen = rqstp->rq_arg.buflen + byte_count;
257
258 /* Fast register the page list */
259 frmr->kva = page_address(rqstp->rq_arg.pages[0]);
260 frmr->direction = DMA_FROM_DEVICE;
261 frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE);
262 frmr->map_len = byte_count;
263 frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT;
264 for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
265 frmr->page_list->page_list[page_no] =
266 ib_dma_map_single(xprt->sc_cm_id->device,
267 page_address(rqstp->rq_arg.pages[page_no]),
268 PAGE_SIZE, DMA_TO_DEVICE);
269 if (ib_dma_mapping_error(xprt->sc_cm_id->device,
270 frmr->page_list->page_list[page_no]))
271 goto fatal_err;
272 atomic_inc(&xprt->sc_dma_used);
273 head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
274 }
275 head->count += page_no;
276
277 /* rq_respages points one past arg pages */
278 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
279
280 /* Create the reply and chunk maps */
281 offset = 0;
282 ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
283 for (ch_no = 0; ch_no < ch_count; ch_no++) {
284 rpl_map->sge[ch_no].iov_base = frmr->kva + offset;
285 rpl_map->sge[ch_no].iov_len = ch->rc_target.rs_length;
286 chl_map->ch[ch_no].count = 1;
287 chl_map->ch[ch_no].start = ch_no;
288 offset += ch->rc_target.rs_length;
289 ch++;
290 }
291
292 ret = svc_rdma_fastreg(xprt, frmr);
293 if (ret)
294 goto fatal_err;
295
296 return ch_no;
297
298 fatal_err:
299 printk("svcrdma: error fast registering xdr for xprt %p", xprt);
300 svc_rdma_put_frmr(xprt, frmr);
301 return -EIO;
302}
303
304static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt,
305 struct svc_rdma_op_ctxt *ctxt,
306 struct svc_rdma_fastreg_mr *frmr,
307 struct kvec *vec,
308 u64 *sgl_offset,
309 int count)
219{ 310{
220 int i; 311 int i;
221 312
222 ctxt->count = count; 313 ctxt->count = count;
223 ctxt->direction = DMA_FROM_DEVICE; 314 ctxt->direction = DMA_FROM_DEVICE;
224 for (i = 0; i < count; i++) { 315 for (i = 0; i < count; i++) {
225 atomic_inc(&xprt->sc_dma_used); 316 ctxt->sge[i].length = 0; /* in case map fails */
226 ctxt->sge[i].addr = 317 if (!frmr) {
227 ib_dma_map_single(xprt->sc_cm_id->device, 318 ctxt->sge[i].addr =
228 vec[i].iov_base, vec[i].iov_len, 319 ib_dma_map_single(xprt->sc_cm_id->device,
229 DMA_FROM_DEVICE); 320 vec[i].iov_base,
321 vec[i].iov_len,
322 DMA_FROM_DEVICE);
323 if (ib_dma_mapping_error(xprt->sc_cm_id->device,
324 ctxt->sge[i].addr))
325 return -EINVAL;
326 ctxt->sge[i].lkey = xprt->sc_dma_lkey;
327 atomic_inc(&xprt->sc_dma_used);
328 } else {
329 ctxt->sge[i].addr = (unsigned long)vec[i].iov_base;
330 ctxt->sge[i].lkey = frmr->mr->lkey;
331 }
230 ctxt->sge[i].length = vec[i].iov_len; 332 ctxt->sge[i].length = vec[i].iov_len;
231 ctxt->sge[i].lkey = xprt->sc_phys_mr->lkey;
232 *sgl_offset = *sgl_offset + vec[i].iov_len; 333 *sgl_offset = *sgl_offset + vec[i].iov_len;
233 } 334 }
335 return 0;
234} 336}
235 337
236static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) 338static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
@@ -278,6 +380,7 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
278 struct svc_rdma_op_ctxt *hdr_ctxt) 380 struct svc_rdma_op_ctxt *hdr_ctxt)
279{ 381{
280 struct ib_send_wr read_wr; 382 struct ib_send_wr read_wr;
383 struct ib_send_wr inv_wr;
281 int err = 0; 384 int err = 0;
282 int ch_no; 385 int ch_no;
283 int ch_count; 386 int ch_count;
@@ -301,9 +404,20 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
301 svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count); 404 svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
302 if (ch_count > RPCSVC_MAXPAGES) 405 if (ch_count > RPCSVC_MAXPAGES)
303 return -EINVAL; 406 return -EINVAL;
304 sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp, 407
305 rpl_map, chl_map, 408 if (!xprt->sc_frmr_pg_list_len)
306 ch_count, byte_count); 409 sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
410 rpl_map, chl_map, ch_count,
411 byte_count);
412 else
413 sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
414 rpl_map, chl_map, ch_count,
415 byte_count);
416 if (sge_count < 0) {
417 err = -EIO;
418 goto out;
419 }
420
307 sgl_offset = 0; 421 sgl_offset = 0;
308 ch_no = 0; 422 ch_no = 0;
309 423
@@ -312,13 +426,16 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
312next_sge: 426next_sge:
313 ctxt = svc_rdma_get_context(xprt); 427 ctxt = svc_rdma_get_context(xprt);
314 ctxt->direction = DMA_FROM_DEVICE; 428 ctxt->direction = DMA_FROM_DEVICE;
429 ctxt->frmr = hdr_ctxt->frmr;
430 ctxt->read_hdr = NULL;
315 clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); 431 clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
432 clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
316 433
317 /* Prepare READ WR */ 434 /* Prepare READ WR */
318 memset(&read_wr, 0, sizeof read_wr); 435 memset(&read_wr, 0, sizeof read_wr);
319 ctxt->wr_op = IB_WR_RDMA_READ;
320 read_wr.wr_id = (unsigned long)ctxt; 436 read_wr.wr_id = (unsigned long)ctxt;
321 read_wr.opcode = IB_WR_RDMA_READ; 437 read_wr.opcode = IB_WR_RDMA_READ;
438 ctxt->wr_op = read_wr.opcode;
322 read_wr.send_flags = IB_SEND_SIGNALED; 439 read_wr.send_flags = IB_SEND_SIGNALED;
323 read_wr.wr.rdma.rkey = ch->rc_target.rs_handle; 440 read_wr.wr.rdma.rkey = ch->rc_target.rs_handle;
324 read_wr.wr.rdma.remote_addr = 441 read_wr.wr.rdma.remote_addr =
@@ -327,10 +444,15 @@ next_sge:
327 read_wr.sg_list = ctxt->sge; 444 read_wr.sg_list = ctxt->sge;
328 read_wr.num_sge = 445 read_wr.num_sge =
329 rdma_read_max_sge(xprt, chl_map->ch[ch_no].count); 446 rdma_read_max_sge(xprt, chl_map->ch[ch_no].count);
330 rdma_set_ctxt_sge(xprt, ctxt, 447 err = rdma_set_ctxt_sge(xprt, ctxt, hdr_ctxt->frmr,
331 &rpl_map->sge[chl_map->ch[ch_no].start], 448 &rpl_map->sge[chl_map->ch[ch_no].start],
332 &sgl_offset, 449 &sgl_offset,
333 read_wr.num_sge); 450 read_wr.num_sge);
451 if (err) {
452 svc_rdma_unmap_dma(ctxt);
453 svc_rdma_put_context(ctxt, 0);
454 goto out;
455 }
334 if (((ch+1)->rc_discrim == 0) && 456 if (((ch+1)->rc_discrim == 0) &&
335 (read_wr.num_sge == chl_map->ch[ch_no].count)) { 457 (read_wr.num_sge == chl_map->ch[ch_no].count)) {
336 /* 458 /*
@@ -339,6 +461,29 @@ next_sge:
339 * the client and the RPC needs to be enqueued. 461 * the client and the RPC needs to be enqueued.
340 */ 462 */
341 set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); 463 set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
464 if (hdr_ctxt->frmr) {
465 set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
466 /*
467 * Invalidate the local MR used to map the data
468 * sink.
469 */
470 if (xprt->sc_dev_caps &
471 SVCRDMA_DEVCAP_READ_W_INV) {
472 read_wr.opcode =
473 IB_WR_RDMA_READ_WITH_INV;
474 ctxt->wr_op = read_wr.opcode;
475 read_wr.ex.invalidate_rkey =
476 ctxt->frmr->mr->lkey;
477 } else {
478 /* Prepare INVALIDATE WR */
479 memset(&inv_wr, 0, sizeof inv_wr);
480 inv_wr.opcode = IB_WR_LOCAL_INV;
481 inv_wr.send_flags = IB_SEND_SIGNALED;
482 inv_wr.ex.invalidate_rkey =
483 hdr_ctxt->frmr->mr->lkey;
484 read_wr.next = &inv_wr;
485 }
486 }
342 ctxt->read_hdr = hdr_ctxt; 487 ctxt->read_hdr = hdr_ctxt;
343 } 488 }
344 /* Post the read */ 489 /* Post the read */
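Note: condensing the fast-register read path above into its lifecycle, as a sketch only (error handling and the per-page DMA mapping loop are elided; the helper names are the ones used in this patch):

    frmr = svc_rdma_get_frmr(xprt);      /* allocate a fast-reg MR */
    /* ... map the data-sink pages into frmr->page_list ... */
    err = svc_rdma_fastreg(xprt, frmr);  /* register the mapped pages */
    /* ... post RDMA READ (or READ_WITH_INV) using frmr->mr->lkey ... */
    svc_rdma_put_frmr(xprt, frmr);       /* released once the last read completes */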
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 84d328329d98..9a7a8e7ae038 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -69,9 +69,127 @@
69 * array is only concerned with the reply we are assured that we have 69 * array is only concerned with the reply we are assured that we have
70 * on extra page for the RPCRMDA header. 70 * on extra page for the RPCRMDA header.
71 */ 71 */
72static void xdr_to_sge(struct svcxprt_rdma *xprt, 72int fast_reg_xdr(struct svcxprt_rdma *xprt,
73 struct xdr_buf *xdr, 73 struct xdr_buf *xdr,
74 struct svc_rdma_req_map *vec) 74 struct svc_rdma_req_map *vec)
75{
76 int sge_no;
77 u32 sge_bytes;
78 u32 page_bytes;
79 u32 page_off;
80 int page_no = 0;
81 u8 *frva;
82 struct svc_rdma_fastreg_mr *frmr;
83
84 frmr = svc_rdma_get_frmr(xprt);
85 if (IS_ERR(frmr))
86 return -ENOMEM;
87 vec->frmr = frmr;
88
89 /* Skip the RPCRDMA header */
90 sge_no = 1;
91
92 /* Map the head. */
93 frva = (void *)((unsigned long)(xdr->head[0].iov_base) & PAGE_MASK);
94 vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
95 vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
96 vec->count = 2;
97 sge_no++;
98
99 /* Build the FRMR */
100 frmr->kva = frva;
101 frmr->direction = DMA_TO_DEVICE;
102 frmr->access_flags = 0;
103 frmr->map_len = PAGE_SIZE;
104 frmr->page_list_len = 1;
105 frmr->page_list->page_list[page_no] =
106 ib_dma_map_single(xprt->sc_cm_id->device,
107 (void *)xdr->head[0].iov_base,
108 PAGE_SIZE, DMA_TO_DEVICE);
109 if (ib_dma_mapping_error(xprt->sc_cm_id->device,
110 frmr->page_list->page_list[page_no]))
111 goto fatal_err;
112 atomic_inc(&xprt->sc_dma_used);
113
114 page_off = xdr->page_base;
115 page_bytes = xdr->page_len + page_off;
116 if (!page_bytes)
117 goto encode_tail;
118
119 /* Map the pages */
120 vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
121 vec->sge[sge_no].iov_len = page_bytes;
122 sge_no++;
123 while (page_bytes) {
124 struct page *page;
125
126 page = xdr->pages[page_no++];
127 sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
128 page_bytes -= sge_bytes;
129
130 frmr->page_list->page_list[page_no] =
131 ib_dma_map_page(xprt->sc_cm_id->device, page, 0,
132 PAGE_SIZE, DMA_TO_DEVICE);
133 if (ib_dma_mapping_error(xprt->sc_cm_id->device,
134 frmr->page_list->page_list[page_no]))
135 goto fatal_err;
136
137 atomic_inc(&xprt->sc_dma_used);
138 page_off = 0; /* reset for next time through loop */
139 frmr->map_len += PAGE_SIZE;
140 frmr->page_list_len++;
141 }
142 vec->count++;
143
144 encode_tail:
145 /* Map tail */
146 if (0 == xdr->tail[0].iov_len)
147 goto done;
148
149 vec->count++;
150 vec->sge[sge_no].iov_len = xdr->tail[0].iov_len;
151
152 if (((unsigned long)xdr->tail[0].iov_base & PAGE_MASK) ==
153 ((unsigned long)xdr->head[0].iov_base & PAGE_MASK)) {
154 /*
155 * If head and tail use the same page, we don't need
156 * to map it again.
157 */
158 vec->sge[sge_no].iov_base = xdr->tail[0].iov_base;
159 } else {
160 void *va;
161
162 /* Map another page for the tail */
163 page_off = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
164 va = (void *)((unsigned long)xdr->tail[0].iov_base & PAGE_MASK);
165 vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
166
167 frmr->page_list->page_list[page_no] =
168 ib_dma_map_single(xprt->sc_cm_id->device, va, PAGE_SIZE,
169 DMA_TO_DEVICE);
170 if (ib_dma_mapping_error(xprt->sc_cm_id->device,
171 frmr->page_list->page_list[page_no]))
172 goto fatal_err;
173 atomic_inc(&xprt->sc_dma_used);
174 frmr->map_len += PAGE_SIZE;
175 frmr->page_list_len++;
176 }
177
178 done:
179 if (svc_rdma_fastreg(xprt, frmr))
180 goto fatal_err;
181
182 return 0;
183
184 fatal_err:
185 printk("svcrdma: Error fast registering memory for xprt %p\n", xprt);
186 svc_rdma_put_frmr(xprt, frmr);
187 return -EIO;
188}
189
190static int map_xdr(struct svcxprt_rdma *xprt,
191 struct xdr_buf *xdr,
192 struct svc_rdma_req_map *vec)
75{ 193{
76 int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3; 194 int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3;
77 int sge_no; 195 int sge_no;
@@ -83,6 +201,9 @@ static void xdr_to_sge(struct svcxprt_rdma *xprt,
83 BUG_ON(xdr->len != 201 BUG_ON(xdr->len !=
84 (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)); 202 (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len));
85 203
204 if (xprt->sc_frmr_pg_list_len)
205 return fast_reg_xdr(xprt, xdr, vec);
206
86 /* Skip the first sge, this is for the RPCRDMA header */ 207 /* Skip the first sge, this is for the RPCRDMA header */
87 sge_no = 1; 208 sge_no = 1;
88 209
@@ -116,9 +237,12 @@ static void xdr_to_sge(struct svcxprt_rdma *xprt,
116 237
117 BUG_ON(sge_no > sge_max); 238 BUG_ON(sge_no > sge_max);
118 vec->count = sge_no; 239 vec->count = sge_no;
240 return 0;
119} 241}
120 242
121/* Assumptions: 243/* Assumptions:
244 * - We are using FRMR
245 * - or -
122 * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE 246 * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
123 */ 247 */
124static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, 248static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
@@ -158,30 +282,35 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
158 sge_no = 0; 282 sge_no = 0;
159 283
160 /* Copy the remaining SGE */ 284 /* Copy the remaining SGE */
161 while (bc != 0 && xdr_sge_no < vec->count) { 285 while (bc != 0) {
162 sge[sge_no].lkey = xprt->sc_phys_mr->lkey; 286 sge_bytes = min_t(size_t,
163 sge_bytes = min((size_t)bc, 287 bc, vec->sge[xdr_sge_no].iov_len-sge_off);
164 (size_t)(vec->sge[xdr_sge_no].iov_len-sge_off));
165 sge[sge_no].length = sge_bytes; 288 sge[sge_no].length = sge_bytes;
166 atomic_inc(&xprt->sc_dma_used); 289 if (!vec->frmr) {
167 sge[sge_no].addr = 290 sge[sge_no].addr =
168 ib_dma_map_single(xprt->sc_cm_id->device, 291 ib_dma_map_single(xprt->sc_cm_id->device,
169 (void *) 292 (void *)
170 vec->sge[xdr_sge_no].iov_base + sge_off, 293 vec->sge[xdr_sge_no].iov_base + sge_off,
171 sge_bytes, DMA_TO_DEVICE); 294 sge_bytes, DMA_TO_DEVICE);
172 if (dma_mapping_error(xprt->sc_cm_id->device->dma_device, 295 if (ib_dma_mapping_error(xprt->sc_cm_id->device,
173 sge[sge_no].addr)) 296 sge[sge_no].addr))
174 goto err; 297 goto err;
298 atomic_inc(&xprt->sc_dma_used);
299 sge[sge_no].lkey = xprt->sc_dma_lkey;
300 } else {
301 sge[sge_no].addr = (unsigned long)
302 vec->sge[xdr_sge_no].iov_base + sge_off;
303 sge[sge_no].lkey = vec->frmr->mr->lkey;
304 }
305 ctxt->count++;
306 ctxt->frmr = vec->frmr;
175 sge_off = 0; 307 sge_off = 0;
176 sge_no++; 308 sge_no++;
177 ctxt->count++;
178 xdr_sge_no++; 309 xdr_sge_no++;
310 BUG_ON(xdr_sge_no > vec->count);
179 bc -= sge_bytes; 311 bc -= sge_bytes;
180 } 312 }
181 313
182 BUG_ON(bc != 0);
183 BUG_ON(xdr_sge_no > vec->count);
184
185 /* Prepare WRITE WR */ 314 /* Prepare WRITE WR */
186 memset(&write_wr, 0, sizeof write_wr); 315 memset(&write_wr, 0, sizeof write_wr);
187 ctxt->wr_op = IB_WR_RDMA_WRITE; 316 ctxt->wr_op = IB_WR_RDMA_WRITE;
@@ -226,7 +355,10 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
226 res_ary = (struct rpcrdma_write_array *) 355 res_ary = (struct rpcrdma_write_array *)
227 &rdma_resp->rm_body.rm_chunks[1]; 356 &rdma_resp->rm_body.rm_chunks[1];
228 357
229 max_write = xprt->sc_max_sge * PAGE_SIZE; 358 if (vec->frmr)
359 max_write = vec->frmr->map_len;
360 else
361 max_write = xprt->sc_max_sge * PAGE_SIZE;
230 362
231 /* Write chunks start at the pagelist */ 363 /* Write chunks start at the pagelist */
232 for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; 364 for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
@@ -297,7 +429,10 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
297 res_ary = (struct rpcrdma_write_array *) 429 res_ary = (struct rpcrdma_write_array *)
298 &rdma_resp->rm_body.rm_chunks[2]; 430 &rdma_resp->rm_body.rm_chunks[2];
299 431
300 max_write = xprt->sc_max_sge * PAGE_SIZE; 432 if (vec->frmr)
433 max_write = vec->frmr->map_len;
434 else
435 max_write = xprt->sc_max_sge * PAGE_SIZE;
301 436
302 /* xdr offset starts at RPC message */ 437 /* xdr offset starts at RPC message */
303 for (xdr_off = 0, chunk_no = 0; 438 for (xdr_off = 0, chunk_no = 0;
@@ -307,7 +442,6 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
307 ch = &arg_ary->wc_array[chunk_no].wc_target; 442 ch = &arg_ary->wc_array[chunk_no].wc_target;
308 write_len = min(xfer_len, ch->rs_length); 443 write_len = min(xfer_len, ch->rs_length);
309 444
310
311 /* Prepare the reply chunk given the length actually 445 /* Prepare the reply chunk given the length actually
312 * written */ 446 * written */
313 rs_offset = get_unaligned(&(ch->rs_offset)); 447 rs_offset = get_unaligned(&(ch->rs_offset));
@@ -366,6 +500,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
366 int byte_count) 500 int byte_count)
367{ 501{
368 struct ib_send_wr send_wr; 502 struct ib_send_wr send_wr;
503 struct ib_send_wr inv_wr;
369 int sge_no; 504 int sge_no;
370 int sge_bytes; 505 int sge_bytes;
371 int page_no; 506 int page_no;
@@ -385,27 +520,45 @@ static int send_reply(struct svcxprt_rdma *rdma,
385 /* Prepare the context */ 520 /* Prepare the context */
386 ctxt->pages[0] = page; 521 ctxt->pages[0] = page;
387 ctxt->count = 1; 522 ctxt->count = 1;
523 ctxt->frmr = vec->frmr;
524 if (vec->frmr)
525 set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
526 else
527 clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
388 528
389 /* Prepare the SGE for the RPCRDMA Header */ 529 /* Prepare the SGE for the RPCRDMA Header */
390 atomic_inc(&rdma->sc_dma_used);
391 ctxt->sge[0].addr = 530 ctxt->sge[0].addr =
392 ib_dma_map_page(rdma->sc_cm_id->device, 531 ib_dma_map_page(rdma->sc_cm_id->device,
393 page, 0, PAGE_SIZE, DMA_TO_DEVICE); 532 page, 0, PAGE_SIZE, DMA_TO_DEVICE);
533 if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
534 goto err;
535 atomic_inc(&rdma->sc_dma_used);
536
394 ctxt->direction = DMA_TO_DEVICE; 537 ctxt->direction = DMA_TO_DEVICE;
538
395 ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); 539 ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
396 ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey; 540 ctxt->sge[0].lkey = rdma->sc_dma_lkey;
397 541
398 /* Determine how many of our SGE are to be transmitted */ 542 /* Determine how many of our SGE are to be transmitted */
399 for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { 543 for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
400 sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); 544 sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
401 byte_count -= sge_bytes; 545 byte_count -= sge_bytes;
402 atomic_inc(&rdma->sc_dma_used); 546 if (!vec->frmr) {
403 ctxt->sge[sge_no].addr = 547 ctxt->sge[sge_no].addr =
404 ib_dma_map_single(rdma->sc_cm_id->device, 548 ib_dma_map_single(rdma->sc_cm_id->device,
405 vec->sge[sge_no].iov_base, 549 vec->sge[sge_no].iov_base,
406 sge_bytes, DMA_TO_DEVICE); 550 sge_bytes, DMA_TO_DEVICE);
551 if (ib_dma_mapping_error(rdma->sc_cm_id->device,
552 ctxt->sge[sge_no].addr))
553 goto err;
554 atomic_inc(&rdma->sc_dma_used);
555 ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
556 } else {
557 ctxt->sge[sge_no].addr = (unsigned long)
558 vec->sge[sge_no].iov_base;
559 ctxt->sge[sge_no].lkey = vec->frmr->mr->lkey;
560 }
407 ctxt->sge[sge_no].length = sge_bytes; 561 ctxt->sge[sge_no].length = sge_bytes;
408 ctxt->sge[sge_no].lkey = rdma->sc_phys_mr->lkey;
409 } 562 }
410 BUG_ON(byte_count != 0); 563 BUG_ON(byte_count != 0);
411 564
@@ -417,11 +570,16 @@ static int send_reply(struct svcxprt_rdma *rdma,
417 ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; 570 ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
418 ctxt->count++; 571 ctxt->count++;
419 rqstp->rq_respages[page_no] = NULL; 572 rqstp->rq_respages[page_no] = NULL;
420 /* If there are more pages than SGE, terminate SGE list */ 573 /*
574 * If there are more pages than SGE, terminate SGE
575 * list so that svc_rdma_unmap_dma doesn't attempt to
576 * unmap garbage.
577 */
421 if (page_no+1 >= sge_no) 578 if (page_no+1 >= sge_no)
422 ctxt->sge[page_no+1].length = 0; 579 ctxt->sge[page_no+1].length = 0;
423 } 580 }
424 BUG_ON(sge_no > rdma->sc_max_sge); 581 BUG_ON(sge_no > rdma->sc_max_sge);
582 BUG_ON(sge_no > ctxt->count);
425 memset(&send_wr, 0, sizeof send_wr); 583 memset(&send_wr, 0, sizeof send_wr);
426 ctxt->wr_op = IB_WR_SEND; 584 ctxt->wr_op = IB_WR_SEND;
427 send_wr.wr_id = (unsigned long)ctxt; 585 send_wr.wr_id = (unsigned long)ctxt;
@@ -429,12 +587,26 @@ static int send_reply(struct svcxprt_rdma *rdma,
429 send_wr.num_sge = sge_no; 587 send_wr.num_sge = sge_no;
430 send_wr.opcode = IB_WR_SEND; 588 send_wr.opcode = IB_WR_SEND;
431 send_wr.send_flags = IB_SEND_SIGNALED; 589 send_wr.send_flags = IB_SEND_SIGNALED;
590 if (vec->frmr) {
591 /* Prepare INVALIDATE WR */
592 memset(&inv_wr, 0, sizeof inv_wr);
593 inv_wr.opcode = IB_WR_LOCAL_INV;
594 inv_wr.send_flags = IB_SEND_SIGNALED;
595 inv_wr.ex.invalidate_rkey =
596 vec->frmr->mr->lkey;
597 send_wr.next = &inv_wr;
598 }
432 599
433 ret = svc_rdma_send(rdma, &send_wr); 600 ret = svc_rdma_send(rdma, &send_wr);
434 if (ret) 601 if (ret)
435 svc_rdma_put_context(ctxt, 1); 602 goto err;
436 603
437 return ret; 604 return 0;
605
606 err:
607 svc_rdma_put_frmr(rdma, vec->frmr);
608 svc_rdma_put_context(ctxt, 1);
609 return -EIO;
438} 610}
439 611
440void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) 612void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
@@ -477,8 +649,9 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
477 ctxt = svc_rdma_get_context(rdma); 649 ctxt = svc_rdma_get_context(rdma);
478 ctxt->direction = DMA_TO_DEVICE; 650 ctxt->direction = DMA_TO_DEVICE;
479 vec = svc_rdma_get_req_map(); 651 vec = svc_rdma_get_req_map();
480 xdr_to_sge(rdma, &rqstp->rq_res, vec); 652 ret = map_xdr(rdma, &rqstp->rq_res, vec);
481 653 if (ret)
654 goto err0;
482 inline_bytes = rqstp->rq_res.len; 655 inline_bytes = rqstp->rq_res.len;
483 656
484 /* Create the RDMA response header */ 657 /* Create the RDMA response header */
@@ -498,7 +671,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
498 if (ret < 0) { 671 if (ret < 0) {
499 printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n", 672 printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n",
500 ret); 673 ret);
501 goto error; 674 goto err1;
502 } 675 }
503 inline_bytes -= ret; 676 inline_bytes -= ret;
504 677
@@ -508,7 +681,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
508 if (ret < 0) { 681 if (ret < 0) {
509 printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n", 682 printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n",
510 ret); 683 ret);
511 goto error; 684 goto err1;
512 } 685 }
513 inline_bytes -= ret; 686 inline_bytes -= ret;
514 687
@@ -517,9 +690,11 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
517 svc_rdma_put_req_map(vec); 690 svc_rdma_put_req_map(vec);
518 dprintk("svcrdma: send_reply returns %d\n", ret); 691 dprintk("svcrdma: send_reply returns %d\n", ret);
519 return ret; 692 return ret;
520 error: 693
694 err1:
695 put_page(res_page);
696 err0:
521 svc_rdma_put_req_map(vec); 697 svc_rdma_put_req_map(vec);
522 svc_rdma_put_context(ctxt, 0); 698 svc_rdma_put_context(ctxt, 0);
523 put_page(res_page);
524 return ret; 699 return ret;
525} 700}
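Note: a recurring pattern in both the receive and send paths above is the per-SGE lkey choice; condensed as a sketch:

    /* DMA-mapped SGEs carry sc_dma_lkey and are later unmapped by
     * svc_rdma_unmap_dma(); fast-registered SGEs carry the FRMR's lkey
     * and are torn down when the FRMR is invalidated and released.
     */
    sge->lkey = vec->frmr ? vec->frmr->mr->lkey : xprt->sc_dma_lkey;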
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 900cb69728c6..6fb493cbd29f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -100,20 +100,29 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
100 ctxt->xprt = xprt; 100 ctxt->xprt = xprt;
101 INIT_LIST_HEAD(&ctxt->dto_q); 101 INIT_LIST_HEAD(&ctxt->dto_q);
102 ctxt->count = 0; 102 ctxt->count = 0;
103 ctxt->frmr = NULL;
103 atomic_inc(&xprt->sc_ctxt_used); 104 atomic_inc(&xprt->sc_ctxt_used);
104 return ctxt; 105 return ctxt;
105} 106}
106 107
107static void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) 108void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
108{ 109{
109 struct svcxprt_rdma *xprt = ctxt->xprt; 110 struct svcxprt_rdma *xprt = ctxt->xprt;
110 int i; 111 int i;
111 for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { 112 for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
112 atomic_dec(&xprt->sc_dma_used); 113 /*
113 ib_dma_unmap_single(xprt->sc_cm_id->device, 114 * Unmap the DMA addr in the SGE if the lkey matches
114 ctxt->sge[i].addr, 115 * the sc_dma_lkey, otherwise, ignore it since it is
115 ctxt->sge[i].length, 116 * an FRMR lkey and will be unmapped later when the
116 ctxt->direction); 117 * last WR that uses it completes.
118 */
119 if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) {
120 atomic_dec(&xprt->sc_dma_used);
121 ib_dma_unmap_single(xprt->sc_cm_id->device,
122 ctxt->sge[i].addr,
123 ctxt->sge[i].length,
124 ctxt->direction);
125 }
117 } 126 }
118} 127}
119 128
@@ -150,6 +159,7 @@ struct svc_rdma_req_map *svc_rdma_get_req_map(void)
150 schedule_timeout_uninterruptible(msecs_to_jiffies(500)); 159 schedule_timeout_uninterruptible(msecs_to_jiffies(500));
151 } 160 }
152 map->count = 0; 161 map->count = 0;
162 map->frmr = NULL;
153 return map; 163 return map;
154} 164}
155 165
@@ -316,6 +326,50 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt)
316} 326}
317 327
318/* 328/*
 329 * Process a completion context
330 */
331static void process_context(struct svcxprt_rdma *xprt,
332 struct svc_rdma_op_ctxt *ctxt)
333{
334 svc_rdma_unmap_dma(ctxt);
335
336 switch (ctxt->wr_op) {
337 case IB_WR_SEND:
338 if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags))
339 svc_rdma_put_frmr(xprt, ctxt->frmr);
340 svc_rdma_put_context(ctxt, 1);
341 break;
342
343 case IB_WR_RDMA_WRITE:
344 svc_rdma_put_context(ctxt, 0);
345 break;
346
347 case IB_WR_RDMA_READ:
348 case IB_WR_RDMA_READ_WITH_INV:
349 if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
350 struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
351 BUG_ON(!read_hdr);
352 if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags))
353 svc_rdma_put_frmr(xprt, ctxt->frmr);
354 spin_lock_bh(&xprt->sc_rq_dto_lock);
355 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
356 list_add_tail(&read_hdr->dto_q,
357 &xprt->sc_read_complete_q);
358 spin_unlock_bh(&xprt->sc_rq_dto_lock);
359 svc_xprt_enqueue(&xprt->sc_xprt);
360 }
361 svc_rdma_put_context(ctxt, 0);
362 break;
363
364 default:
365 printk(KERN_ERR "svcrdma: unexpected completion type, "
366 "opcode=%d\n",
367 ctxt->wr_op);
368 break;
369 }
370}
371
372/*
319 * Send Queue Completion Handler - potentially called on interrupt context. 373 * Send Queue Completion Handler - potentially called on interrupt context.
320 * 374 *
321 * Note that caller must hold a transport reference. 375 * Note that caller must hold a transport reference.
@@ -327,17 +381,12 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
327 struct ib_cq *cq = xprt->sc_sq_cq; 381 struct ib_cq *cq = xprt->sc_sq_cq;
328 int ret; 382 int ret;
329 383
330
331 if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) 384 if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags))
332 return; 385 return;
333 386
334 ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); 387 ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
335 atomic_inc(&rdma_stat_sq_poll); 388 atomic_inc(&rdma_stat_sq_poll);
336 while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { 389 while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
337 ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
338 xprt = ctxt->xprt;
339
340 svc_rdma_unmap_dma(ctxt);
341 if (wc.status != IB_WC_SUCCESS) 390 if (wc.status != IB_WC_SUCCESS)
342 /* Close the transport */ 391 /* Close the transport */
343 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); 392 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
@@ -346,35 +395,10 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
346 atomic_dec(&xprt->sc_sq_count); 395 atomic_dec(&xprt->sc_sq_count);
347 wake_up(&xprt->sc_send_wait); 396 wake_up(&xprt->sc_send_wait);
348 397
349 switch (ctxt->wr_op) { 398 ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
350 case IB_WR_SEND: 399 if (ctxt)
351 svc_rdma_put_context(ctxt, 1); 400 process_context(xprt, ctxt);
352 break;
353
354 case IB_WR_RDMA_WRITE:
355 svc_rdma_put_context(ctxt, 0);
356 break;
357
358 case IB_WR_RDMA_READ:
359 if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
360 struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
361 BUG_ON(!read_hdr);
362 spin_lock_bh(&xprt->sc_rq_dto_lock);
363 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
364 list_add_tail(&read_hdr->dto_q,
365 &xprt->sc_read_complete_q);
366 spin_unlock_bh(&xprt->sc_rq_dto_lock);
367 svc_xprt_enqueue(&xprt->sc_xprt);
368 }
369 svc_rdma_put_context(ctxt, 0);
370 break;
371 401
372 default:
373 printk(KERN_ERR "svcrdma: unexpected completion type, "
374 "opcode=%d, status=%d\n",
375 wc.opcode, wc.status);
376 break;
377 }
378 svc_xprt_put(&xprt->sc_xprt); 402 svc_xprt_put(&xprt->sc_xprt);
379 } 403 }
380 404
@@ -425,10 +449,12 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
425 INIT_LIST_HEAD(&cma_xprt->sc_dto_q); 449 INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
426 INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); 450 INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
427 INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); 451 INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
452 INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
428 init_waitqueue_head(&cma_xprt->sc_send_wait); 453 init_waitqueue_head(&cma_xprt->sc_send_wait);
429 454
430 spin_lock_init(&cma_xprt->sc_lock); 455 spin_lock_init(&cma_xprt->sc_lock);
431 spin_lock_init(&cma_xprt->sc_rq_dto_lock); 456 spin_lock_init(&cma_xprt->sc_rq_dto_lock);
457 spin_lock_init(&cma_xprt->sc_frmr_q_lock);
432 458
433 cma_xprt->sc_ord = svcrdma_ord; 459 cma_xprt->sc_ord = svcrdma_ord;
434 460
@@ -462,7 +488,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
462 struct ib_recv_wr recv_wr, *bad_recv_wr; 488 struct ib_recv_wr recv_wr, *bad_recv_wr;
463 struct svc_rdma_op_ctxt *ctxt; 489 struct svc_rdma_op_ctxt *ctxt;
464 struct page *page; 490 struct page *page;
465 unsigned long pa; 491 dma_addr_t pa;
466 int sge_no; 492 int sge_no;
467 int buflen; 493 int buflen;
468 int ret; 494 int ret;
@@ -474,13 +500,15 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
474 BUG_ON(sge_no >= xprt->sc_max_sge); 500 BUG_ON(sge_no >= xprt->sc_max_sge);
475 page = svc_rdma_get_page(); 501 page = svc_rdma_get_page();
476 ctxt->pages[sge_no] = page; 502 ctxt->pages[sge_no] = page;
477 atomic_inc(&xprt->sc_dma_used);
478 pa = ib_dma_map_page(xprt->sc_cm_id->device, 503 pa = ib_dma_map_page(xprt->sc_cm_id->device,
479 page, 0, PAGE_SIZE, 504 page, 0, PAGE_SIZE,
480 DMA_FROM_DEVICE); 505 DMA_FROM_DEVICE);
506 if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa))
507 goto err_put_ctxt;
508 atomic_inc(&xprt->sc_dma_used);
481 ctxt->sge[sge_no].addr = pa; 509 ctxt->sge[sge_no].addr = pa;
482 ctxt->sge[sge_no].length = PAGE_SIZE; 510 ctxt->sge[sge_no].length = PAGE_SIZE;
483 ctxt->sge[sge_no].lkey = xprt->sc_phys_mr->lkey; 511 ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey;
484 buflen += PAGE_SIZE; 512 buflen += PAGE_SIZE;
485 } 513 }
486 ctxt->count = sge_no; 514 ctxt->count = sge_no;
@@ -496,6 +524,10 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
496 svc_rdma_put_context(ctxt, 1); 524 svc_rdma_put_context(ctxt, 1);
497 } 525 }
498 return ret; 526 return ret;
527
528 err_put_ctxt:
529 svc_rdma_put_context(ctxt, 1);
530 return -ENOMEM;
499} 531}
500 532
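The same pattern appears at each new mapping site in this patch (here and in svc_rdma_send_error() further down): verify the mapping with ib_dma_mapping_error() before bumping sc_dma_used, so the unmap path only ever un-accounts mappings that really exist. A condensed sketch, reusing the identifiers from the hunk above:

	pa = ib_dma_map_page(xprt->sc_cm_id->device, page, 0, PAGE_SIZE,
			     DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa))
		goto err_put_ctxt;		/* nothing mapped, nothing to account */
	atomic_inc(&xprt->sc_dma_used);		/* counted only after success */
	ctxt->sge[sge_no].addr   = pa;
	ctxt->sge[sge_no].length = PAGE_SIZE;
	ctxt->sge[sge_no].lkey   = xprt->sc_dma_lkey;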
501/* 533/*
@@ -566,7 +598,7 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id,
566 dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " 598 dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
567 "event=%d\n", cma_id, cma_id->context, event->event); 599 "event=%d\n", cma_id, cma_id->context, event->event);
568 handle_connect_req(cma_id, 600 handle_connect_req(cma_id,
569 event->param.conn.responder_resources); 601 event->param.conn.initiator_depth);
570 break; 602 break;
571 603
572 case RDMA_CM_EVENT_ESTABLISHED: 604 case RDMA_CM_EVENT_ESTABLISHED:
@@ -686,6 +718,97 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
686 return ERR_PTR(ret); 718 return ERR_PTR(ret);
687} 719}
688 720
721static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt)
722{
723 struct ib_mr *mr;
724 struct ib_fast_reg_page_list *pl;
725 struct svc_rdma_fastreg_mr *frmr;
726
727 frmr = kmalloc(sizeof(*frmr), GFP_KERNEL);
728 if (!frmr)
729 goto err;
730
731 mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES);
732 if (!mr)
733 goto err_free_frmr;
734
735 pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device,
736 RPCSVC_MAXPAGES);
737 if (!pl)
738 goto err_free_mr;
739
740 frmr->mr = mr;
741 frmr->page_list = pl;
742 INIT_LIST_HEAD(&frmr->frmr_list);
743 return frmr;
744
745 err_free_mr:
746 ib_dereg_mr(mr);
747 err_free_frmr:
748 kfree(frmr);
749 err:
750 return ERR_PTR(-ENOMEM);
751}
752
753static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt)
754{
755 struct svc_rdma_fastreg_mr *frmr;
756
757 while (!list_empty(&xprt->sc_frmr_q)) {
758 frmr = list_entry(xprt->sc_frmr_q.next,
759 struct svc_rdma_fastreg_mr, frmr_list);
760 list_del_init(&frmr->frmr_list);
761 ib_dereg_mr(frmr->mr);
762 ib_free_fast_reg_page_list(frmr->page_list);
763 kfree(frmr);
764 }
765}
766
767struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
768{
769 struct svc_rdma_fastreg_mr *frmr = NULL;
770
771 spin_lock_bh(&rdma->sc_frmr_q_lock);
772 if (!list_empty(&rdma->sc_frmr_q)) {
773 frmr = list_entry(rdma->sc_frmr_q.next,
774 struct svc_rdma_fastreg_mr, frmr_list);
775 list_del_init(&frmr->frmr_list);
776 frmr->map_len = 0;
777 frmr->page_list_len = 0;
778 }
779 spin_unlock_bh(&rdma->sc_frmr_q_lock);
780 if (frmr)
781 return frmr;
782
783 return rdma_alloc_frmr(rdma);
784}
785
786static void frmr_unmap_dma(struct svcxprt_rdma *xprt,
787 struct svc_rdma_fastreg_mr *frmr)
788{
789 int page_no;
790 for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
791 dma_addr_t addr = frmr->page_list->page_list[page_no];
792 if (ib_dma_mapping_error(frmr->mr->device, addr))
793 continue;
794 atomic_dec(&xprt->sc_dma_used);
795 ib_dma_unmap_single(frmr->mr->device, addr, PAGE_SIZE,
796 frmr->direction);
797 }
798}
799
800void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
801 struct svc_rdma_fastreg_mr *frmr)
802{
803 if (frmr) {
804 frmr_unmap_dma(rdma, frmr);
805 spin_lock_bh(&rdma->sc_frmr_q_lock);
806 BUG_ON(!list_empty(&frmr->frmr_list));
807 list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
808 spin_unlock_bh(&rdma->sc_frmr_q_lock);
809 }
810}
811
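svc_rdma_get_frmr() and svc_rdma_put_frmr() together form a per-transport FRMR pool: get reuses a queued MR when one is available and falls back to rdma_alloc_frmr() otherwise, while put DMA-unmaps whatever the MR still covers and returns it to the queue. A sketch of the expected calling pattern (the caller shown is hypothetical):

	struct svc_rdma_fastreg_mr *frmr;

	frmr = svc_rdma_get_frmr(rdma);
	if (IS_ERR(frmr))
		return PTR_ERR(frmr);	/* -ENOMEM: pool empty and allocation failed */

	/* ... DMA-map pages into frmr->page_list, set map_len and
	 *     page_list_len, register and use the MR ... */

	/* On every exit path the MR goes back to the pool; put also unmaps
	 * any pages still referenced by frmr->page_list. */
	svc_rdma_put_frmr(rdma, frmr);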
689/* 812/*
690 * This is the xpo_recvfrom function for listening endpoints. Its 813 * This is the xpo_recvfrom function for listening endpoints. Its
691 * purpose is to accept incoming connections. The CMA callback handler 814 * purpose is to accept incoming connections. The CMA callback handler
@@ -704,6 +827,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
704 struct rdma_conn_param conn_param; 827 struct rdma_conn_param conn_param;
705 struct ib_qp_init_attr qp_attr; 828 struct ib_qp_init_attr qp_attr;
706 struct ib_device_attr devattr; 829 struct ib_device_attr devattr;
830 int dma_mr_acc;
831 int need_dma_mr;
707 int ret; 832 int ret;
708 int i; 833 int i;
709 834
@@ -819,15 +944,77 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
819 } 944 }
820 newxprt->sc_qp = newxprt->sc_cm_id->qp; 945 newxprt->sc_qp = newxprt->sc_cm_id->qp;
821 946
822 /* Register all of physical memory */ 947 /*
823 newxprt->sc_phys_mr = ib_get_dma_mr(newxprt->sc_pd, 948 * Use the most secure set of MR resources based on the
824 IB_ACCESS_LOCAL_WRITE | 949 * transport type and available memory management features in
825 IB_ACCESS_REMOTE_WRITE); 950 * the device. Here's the table implemented below:
826 if (IS_ERR(newxprt->sc_phys_mr)) { 951 *
827 dprintk("svcrdma: Failed to create DMA MR ret=%d\n", ret); 952 * Fast Global DMA Remote WR
953 * Reg LKEY MR Access
954 * Sup'd Sup'd Needed Needed
955 *
956 * IWARP N N Y Y
957 * N Y Y Y
958 * Y N Y N
959 * Y Y N -
960 *
961 * IB N N Y N
962 * N Y N -
963 * Y N Y N
964 * Y Y N -
965 *
966 * NB: iWARP requires remote write access for the data sink
967 * of an RDMA_READ. IB does not.
968 */
969 if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
970 newxprt->sc_frmr_pg_list_len =
971 devattr.max_fast_reg_page_list_len;
972 newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
973 }
974
975 /*
976 * Determine if a DMA MR is required and if so, what privs are required
977 */
978 switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) {
979 case RDMA_TRANSPORT_IWARP:
980 newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
981 if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
982 need_dma_mr = 1;
983 dma_mr_acc =
984 (IB_ACCESS_LOCAL_WRITE |
985 IB_ACCESS_REMOTE_WRITE);
986 } else if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
987 need_dma_mr = 1;
988 dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
989 } else
990 need_dma_mr = 0;
991 break;
992 case RDMA_TRANSPORT_IB:
993 if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
994 need_dma_mr = 1;
995 dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
996 } else
997 need_dma_mr = 0;
998 break;
999 default:
828 goto errout; 1000 goto errout;
829 } 1001 }
830 1002
1003 /* Create the DMA MR if needed, otherwise, use the DMA LKEY */
1004 if (need_dma_mr) {
1005 /* Register all of physical memory */
1006 newxprt->sc_phys_mr =
1007 ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc);
1008 if (IS_ERR(newxprt->sc_phys_mr)) {
1009 dprintk("svcrdma: Failed to create DMA MR ret=%d\n",
1010 ret);
1011 goto errout;
1012 }
1013 newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey;
1014 } else
1015 newxprt->sc_dma_lkey =
1016 newxprt->sc_cm_id->device->local_dma_lkey;
1017
831 /* Post receive buffers */ 1018 /* Post receive buffers */
832 for (i = 0; i < newxprt->sc_max_requests; i++) { 1019 for (i = 0; i < newxprt->sc_max_requests; i++) {
833 ret = svc_rdma_post_recv(newxprt); 1020 ret = svc_rdma_post_recv(newxprt);
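As a worked instance of the table above: an iWARP device that advertises IB_DEVICE_MEM_MGT_EXTENSIONS but not IB_DEVICE_LOCAL_DMA_LKEY lands in the "Y N Y N" iWARP row, so a DMA MR is still created, but with local-write access only, because the fast-register MRs supply the remote-write rights that an iWARP RDMA_READ data sink needs. A sketch of the values that row produces:

	/* iWARP, fastreg supported, no global DMA lkey (one row of the table): */
	newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG | SVCRDMA_DEVCAP_READ_W_INV;
	need_dma_mr = 1;
	dma_mr_acc  = IB_ACCESS_LOCAL_WRITE;	/* no IB_ACCESS_REMOTE_WRITE needed */

	newxprt->sc_phys_mr  = ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc);
	newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey;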
@@ -961,6 +1148,9 @@ static void __svc_rdma_free(struct work_struct *work)
961 WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0); 1148 WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0);
962 WARN_ON(atomic_read(&rdma->sc_dma_used) != 0); 1149 WARN_ON(atomic_read(&rdma->sc_dma_used) != 0);
963 1150
1151 /* De-allocate fastreg mr */
1152 rdma_dealloc_frmr_q(rdma);
1153
964 /* Destroy the QP if present (not a listener) */ 1154 /* Destroy the QP if present (not a listener) */
965 if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) 1155 if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
966 ib_destroy_qp(rdma->sc_qp); 1156 ib_destroy_qp(rdma->sc_qp);
@@ -1014,21 +1204,59 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt)
1014 return 1; 1204 return 1;
1015} 1205}
1016 1206
1207/*
1208 * Attempt to register the kvec representing the RPC memory with the
1209 * device.
1210 *
1211 * Returns:
1212 * 0 : The FAST_REG_MR work request was successfully posted; it
1213 * completes (or fails) asynchronously on the send queue.
1214 * <0 : An error was encountered attempting to post the request
1215 * (for example, -ENOTCONN if the transport is closing).
1216 */
1217int svc_rdma_fastreg(struct svcxprt_rdma *xprt,
1218 struct svc_rdma_fastreg_mr *frmr)
1219{
1220 struct ib_send_wr fastreg_wr;
1221 u8 key;
1222
1223 /* Bump the key */
1224 key = (u8)(frmr->mr->lkey & 0x000000FF);
1225 ib_update_fast_reg_key(frmr->mr, ++key);
1226
1227 /* Prepare FASTREG WR */
1228 memset(&fastreg_wr, 0, sizeof fastreg_wr);
1229 fastreg_wr.opcode = IB_WR_FAST_REG_MR;
1230 fastreg_wr.send_flags = IB_SEND_SIGNALED;
1231 fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva;
1232 fastreg_wr.wr.fast_reg.page_list = frmr->page_list;
1233 fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len;
1234 fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1235 fastreg_wr.wr.fast_reg.length = frmr->map_len;
1236 fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags;
1237 fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey;
1238 return svc_rdma_send(xprt, &fastreg_wr);
1239}
1240
1017int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) 1241int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
1018{ 1242{
1019 struct ib_send_wr *bad_wr; 1243 struct ib_send_wr *bad_wr, *n_wr;
1244 int wr_count;
1245 int i;
1020 int ret; 1246 int ret;
1021 1247
1022 if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) 1248 if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
1023 return -ENOTCONN; 1249 return -ENOTCONN;
1024 1250
1025 BUG_ON(wr->send_flags != IB_SEND_SIGNALED); 1251 BUG_ON(wr->send_flags != IB_SEND_SIGNALED);
1026 BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op != 1252 wr_count = 1;
1027 wr->opcode); 1253 for (n_wr = wr->next; n_wr; n_wr = n_wr->next)
1254 wr_count++;
1255
1028 /* If the SQ is full, wait until an SQ entry is available */ 1256 /* If the SQ is full, wait until an SQ entry is available */
1029 while (1) { 1257 while (1) {
1030 spin_lock_bh(&xprt->sc_lock); 1258 spin_lock_bh(&xprt->sc_lock);
1031 if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) { 1259 if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) {
1032 spin_unlock_bh(&xprt->sc_lock); 1260 spin_unlock_bh(&xprt->sc_lock);
1033 atomic_inc(&rdma_stat_sq_starve); 1261 atomic_inc(&rdma_stat_sq_starve);
1034 1262
@@ -1043,19 +1271,26 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
1043 return 0; 1271 return 0;
1044 continue; 1272 continue;
1045 } 1273 }
1046 /* Bumped used SQ WR count and post */ 1274 /* Take a transport ref for each WR posted */
1047 svc_xprt_get(&xprt->sc_xprt); 1275 for (i = 0; i < wr_count; i++)
1276 svc_xprt_get(&xprt->sc_xprt);
1277
1278 /* Bump used SQ WR count and post */
1279 atomic_add(wr_count, &xprt->sc_sq_count);
1048 ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); 1280 ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
1049 if (!ret) 1281 if (ret) {
1050 atomic_inc(&xprt->sc_sq_count); 1282 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
1051 else { 1283 atomic_sub(wr_count, &xprt->sc_sq_count);
1052 svc_xprt_put(&xprt->sc_xprt); 1284 for (i = 0; i < wr_count; i++)
1285 svc_xprt_put(&xprt->sc_xprt);
1053 dprintk("svcrdma: failed to post SQ WR rc=%d, " 1286 dprintk("svcrdma: failed to post SQ WR rc=%d, "
1054 "sc_sq_count=%d, sc_sq_depth=%d\n", 1287 "sc_sq_count=%d, sc_sq_depth=%d\n",
1055 ret, atomic_read(&xprt->sc_sq_count), 1288 ret, atomic_read(&xprt->sc_sq_count),
1056 xprt->sc_sq_depth); 1289 xprt->sc_sq_depth);
1057 } 1290 }
1058 spin_unlock_bh(&xprt->sc_lock); 1291 spin_unlock_bh(&xprt->sc_lock);
1292 if (ret)
1293 wake_up(&xprt->sc_send_wait);
1059 break; 1294 break;
1060 } 1295 }
1061 return ret; 1296 return ret;
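With work requests now arriving as chains (for example the SEND + LOCAL_INV pair built in svc_rdma_sendto.c above), svc_rdma_send() accounts for the whole chain: it waits until the send queue has room for every WR, takes one transport reference and one sc_sq_count slot per WR, and sq_cq_reap() gives both back one completion at a time; a failed ib_post_send() rolls the accounting back and marks the transport for close. A sketch of a caller posting a two-WR chain (the WR names are hypothetical):

	first_wr.next  = &second_wr;	/* e.g. SEND followed by LOCAL_INV */
	second_wr.next = NULL;

	ret = svc_rdma_send(rdma, &first_wr);	/* needs two free SQ slots */
	if (ret)
		return ret;	/* XPT_CLOSE already set; refs and counts rolled back */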
@@ -1079,10 +1314,14 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
1079 length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); 1314 length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
1080 1315
1081 /* Prepare SGE for local address */ 1316 /* Prepare SGE for local address */
1082 atomic_inc(&xprt->sc_dma_used);
1083 sge.addr = ib_dma_map_page(xprt->sc_cm_id->device, 1317 sge.addr = ib_dma_map_page(xprt->sc_cm_id->device,
1084 p, 0, PAGE_SIZE, DMA_FROM_DEVICE); 1318 p, 0, PAGE_SIZE, DMA_FROM_DEVICE);
1085 sge.lkey = xprt->sc_phys_mr->lkey; 1319 if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge.addr)) {
1320 put_page(p);
1321 return;
1322 }
1323 atomic_inc(&xprt->sc_dma_used);
1324 sge.lkey = xprt->sc_dma_lkey;
1086 sge.length = length; 1325 sge.length = length;
1087 1326
1088 ctxt = svc_rdma_get_context(xprt); 1327 ctxt = svc_rdma_get_context(xprt);
@@ -1103,6 +1342,9 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
1103 if (ret) { 1342 if (ret) {
1104 dprintk("svcrdma: Error %d posting send for protocol error\n", 1343 dprintk("svcrdma: Error %d posting send for protocol error\n",
1105 ret); 1344 ret);
1345 ib_dma_unmap_page(xprt->sc_cm_id->device,
1346 sge.addr, PAGE_SIZE,
1347 DMA_FROM_DEVICE);
1106 svc_rdma_put_context(ctxt, 1); 1348 svc_rdma_put_context(ctxt, 1);
1107 } 1349 }
1108} 1350}
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 4486c59c3aca..9a288d5eea64 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -3,8 +3,8 @@
3 * 3 *
4 * Client-side transport implementation for sockets. 4 * Client-side transport implementation for sockets.
5 * 5 *
6 * TCP callback races fixes (C) 1998 Red Hat Software <alan@redhat.com> 6 * TCP callback races fixes (C) 1998 Red Hat
7 * TCP send fixes (C) 1998 Red Hat Software <alan@redhat.com> 7 * TCP send fixes (C) 1998 Red Hat
8 * TCP NFS related read + write fixes 8 * TCP NFS related read + write fixes
9 * (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie> 9 * (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie>
10 * 10 *