aboutsummaryrefslogtreecommitdiffstats
path: root/net/sunrpc
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2008-02-01 22:31:28 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2008-02-01 22:31:28 -0500
commit63e9b66e29357dd12e8b1d3ebf7036e7591f81e3 (patch)
tree5aa6a70a8f4bbf306e2825a1e2fa2660c2c1c187 /net/sunrpc
parent687fcdf741e4a268c2c7bac8b3734de761bb9719 (diff)
parentea339d46b93c7b16e067a29aad1812f7a389815a (diff)
Merge branch 'for-linus' of git://linux-nfs.org/~bfields/linux
* 'for-linus' of git://linux-nfs.org/~bfields/linux: (100 commits) SUNRPC: RPC program information is stored in unsigned integers SUNRPC: Move exported symbol definitions after function declaration part 2 NLM: tear down RPC clients in nlm_shutdown_hosts SUNRPC: spin svc_rqst initialization to its own function nfsd: more careful input validation in nfsctl write methods lockd: minor log message fix knfsd: don't bother mapping putrootfh enoent to eperm rdma: makefile rdma: ONCRPC RDMA protocol marshalling rdma: SVCRDMA sendto rdma: SVCRDMA recvfrom rdma: SVCRDMA Core Transport Services rdma: SVCRDMA Transport Module rdma: SVCRMDA Header File svc: Add svc_xprt_names service to replace svc_sock_names knfsd: Support adding transports by writing portlist file svc: Add svc API that queries for a transport instance svc: Add /proc/sys/sunrpc/transport files svc: Add transport hdr size for defer/revisit svc: Move the xprt independent code to the svc_xprt.c file ...
Diffstat (limited to 'net/sunrpc')
-rw-r--r--net/sunrpc/Makefile3
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c93
-rw-r--r--net/sunrpc/cache.c152
-rw-r--r--net/sunrpc/stats.c7
-rw-r--r--net/sunrpc/sunrpc_syms.c52
-rw-r--r--net/sunrpc/svc.c90
-rw-r--r--net/sunrpc/svc_xprt.c1055
-rw-r--r--net/sunrpc/svcauth.c6
-rw-r--r--net/sunrpc/svcauth_unix.c59
-rw-r--r--net/sunrpc/svcsock.c1311
-rw-r--r--net/sunrpc/sysctl.c31
-rw-r--r--net/sunrpc/xdr.c8
-rw-r--r--net/sunrpc/xprtrdma/Makefile5
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c266
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_marshal.c412
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c586
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c520
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c1080
18 files changed, 4530 insertions, 1206 deletions
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 5c69a725e530..92e1dbe50947 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -11,6 +11,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
11 auth.o auth_null.o auth_unix.o \ 11 auth.o auth_null.o auth_unix.o \
12 svc.o svcsock.o svcauth.o svcauth_unix.o \ 12 svc.o svcsock.o svcauth.o svcauth_unix.o \
13 rpcb_clnt.o timer.o xdr.o \ 13 rpcb_clnt.o timer.o xdr.o \
14 sunrpc_syms.o cache.o rpc_pipe.o 14 sunrpc_syms.o cache.o rpc_pipe.o \
15 svc_xprt.o
15sunrpc-$(CONFIG_PROC_FS) += stats.o 16sunrpc-$(CONFIG_PROC_FS) += stats.o
16sunrpc-$(CONFIG_SYSCTL) += sysctl.o 17sunrpc-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 73940df6c460..481f984e9a22 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -224,38 +224,34 @@ static int rsi_parse(struct cache_detail *cd,
224 224
225 /* major/minor */ 225 /* major/minor */
226 len = qword_get(&mesg, buf, mlen); 226 len = qword_get(&mesg, buf, mlen);
227 if (len < 0) 227 if (len <= 0)
228 goto out; 228 goto out;
229 if (len == 0) { 229 rsii.major_status = simple_strtoul(buf, &ep, 10);
230 if (*ep)
231 goto out;
232 len = qword_get(&mesg, buf, mlen);
233 if (len <= 0)
234 goto out;
235 rsii.minor_status = simple_strtoul(buf, &ep, 10);
236 if (*ep)
230 goto out; 237 goto out;
231 } else {
232 rsii.major_status = simple_strtoul(buf, &ep, 10);
233 if (*ep)
234 goto out;
235 len = qword_get(&mesg, buf, mlen);
236 if (len <= 0)
237 goto out;
238 rsii.minor_status = simple_strtoul(buf, &ep, 10);
239 if (*ep)
240 goto out;
241 238
242 /* out_handle */ 239 /* out_handle */
243 len = qword_get(&mesg, buf, mlen); 240 len = qword_get(&mesg, buf, mlen);
244 if (len < 0) 241 if (len < 0)
245 goto out; 242 goto out;
246 status = -ENOMEM; 243 status = -ENOMEM;
247 if (dup_to_netobj(&rsii.out_handle, buf, len)) 244 if (dup_to_netobj(&rsii.out_handle, buf, len))
248 goto out; 245 goto out;
249 246
250 /* out_token */ 247 /* out_token */
251 len = qword_get(&mesg, buf, mlen); 248 len = qword_get(&mesg, buf, mlen);
252 status = -EINVAL; 249 status = -EINVAL;
253 if (len < 0) 250 if (len < 0)
254 goto out; 251 goto out;
255 status = -ENOMEM; 252 status = -ENOMEM;
256 if (dup_to_netobj(&rsii.out_token, buf, len)) 253 if (dup_to_netobj(&rsii.out_token, buf, len))
257 goto out; 254 goto out;
258 }
259 rsii.h.expiry_time = expiry; 255 rsii.h.expiry_time = expiry;
260 rsip = rsi_update(&rsii, rsip); 256 rsip = rsi_update(&rsii, rsip);
261 status = 0; 257 status = 0;
@@ -975,6 +971,7 @@ static int svcauth_gss_handle_init(struct svc_rqst *rqstp,
975 struct kvec *resv = &rqstp->rq_res.head[0]; 971 struct kvec *resv = &rqstp->rq_res.head[0];
976 struct xdr_netobj tmpobj; 972 struct xdr_netobj tmpobj;
977 struct rsi *rsip, rsikey; 973 struct rsi *rsip, rsikey;
974 int ret;
978 975
979 /* Read the verifier; should be NULL: */ 976 /* Read the verifier; should be NULL: */
980 *authp = rpc_autherr_badverf; 977 *authp = rpc_autherr_badverf;
@@ -1014,23 +1011,27 @@ static int svcauth_gss_handle_init(struct svc_rqst *rqstp,
1014 /* No upcall result: */ 1011 /* No upcall result: */
1015 return SVC_DROP; 1012 return SVC_DROP;
1016 case 0: 1013 case 0:
1014 ret = SVC_DROP;
1017 /* Got an answer to the upcall; use it: */ 1015 /* Got an answer to the upcall; use it: */
1018 if (gss_write_init_verf(rqstp, rsip)) 1016 if (gss_write_init_verf(rqstp, rsip))
1019 return SVC_DROP; 1017 goto out;
1020 if (resv->iov_len + 4 > PAGE_SIZE) 1018 if (resv->iov_len + 4 > PAGE_SIZE)
1021 return SVC_DROP; 1019 goto out;
1022 svc_putnl(resv, RPC_SUCCESS); 1020 svc_putnl(resv, RPC_SUCCESS);
1023 if (svc_safe_putnetobj(resv, &rsip->out_handle)) 1021 if (svc_safe_putnetobj(resv, &rsip->out_handle))
1024 return SVC_DROP; 1022 goto out;
1025 if (resv->iov_len + 3 * 4 > PAGE_SIZE) 1023 if (resv->iov_len + 3 * 4 > PAGE_SIZE)
1026 return SVC_DROP; 1024 goto out;
1027 svc_putnl(resv, rsip->major_status); 1025 svc_putnl(resv, rsip->major_status);
1028 svc_putnl(resv, rsip->minor_status); 1026 svc_putnl(resv, rsip->minor_status);
1029 svc_putnl(resv, GSS_SEQ_WIN); 1027 svc_putnl(resv, GSS_SEQ_WIN);
1030 if (svc_safe_putnetobj(resv, &rsip->out_token)) 1028 if (svc_safe_putnetobj(resv, &rsip->out_token))
1031 return SVC_DROP; 1029 goto out;
1032 } 1030 }
1033 return SVC_COMPLETE; 1031 ret = SVC_COMPLETE;
1032out:
1033 cache_put(&rsip->h, &rsi_cache);
1034 return ret;
1034} 1035}
1035 1036
1036/* 1037/*
@@ -1125,6 +1126,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
1125 case RPC_GSS_PROC_DESTROY: 1126 case RPC_GSS_PROC_DESTROY:
1126 if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) 1127 if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
1127 goto auth_err; 1128 goto auth_err;
1129 rsci->h.expiry_time = get_seconds();
1128 set_bit(CACHE_NEGATIVE, &rsci->h.flags); 1130 set_bit(CACHE_NEGATIVE, &rsci->h.flags);
1129 if (resv->iov_len + 4 > PAGE_SIZE) 1131 if (resv->iov_len + 4 > PAGE_SIZE)
1130 goto drop; 1132 goto drop;
@@ -1386,19 +1388,26 @@ int
1386gss_svc_init(void) 1388gss_svc_init(void)
1387{ 1389{
1388 int rv = svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss); 1390 int rv = svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss);
1389 if (rv == 0) { 1391 if (rv)
1390 cache_register(&rsc_cache); 1392 return rv;
1391 cache_register(&rsi_cache); 1393 rv = cache_register(&rsc_cache);
1392 } 1394 if (rv)
1395 goto out1;
1396 rv = cache_register(&rsi_cache);
1397 if (rv)
1398 goto out2;
1399 return 0;
1400out2:
1401 cache_unregister(&rsc_cache);
1402out1:
1403 svc_auth_unregister(RPC_AUTH_GSS);
1393 return rv; 1404 return rv;
1394} 1405}
1395 1406
1396void 1407void
1397gss_svc_shutdown(void) 1408gss_svc_shutdown(void)
1398{ 1409{
1399 if (cache_unregister(&rsc_cache)) 1410 cache_unregister(&rsc_cache);
1400 printk(KERN_ERR "auth_rpcgss: failed to unregister rsc cache\n"); 1411 cache_unregister(&rsi_cache);
1401 if (cache_unregister(&rsi_cache))
1402 printk(KERN_ERR "auth_rpcgss: failed to unregister rsi cache\n");
1403 svc_auth_unregister(RPC_AUTH_GSS); 1412 svc_auth_unregister(RPC_AUTH_GSS);
1404} 1413}
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 73f053d0cc7a..636c8e04e0be 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -245,6 +245,7 @@ int cache_check(struct cache_detail *detail,
245 cache_put(h, detail); 245 cache_put(h, detail);
246 return rv; 246 return rv;
247} 247}
248EXPORT_SYMBOL(cache_check);
248 249
249/* 250/*
250 * caches need to be periodically cleaned. 251 * caches need to be periodically cleaned.
@@ -290,44 +291,78 @@ static const struct file_operations cache_flush_operations;
290static void do_cache_clean(struct work_struct *work); 291static void do_cache_clean(struct work_struct *work);
291static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean); 292static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean);
292 293
293void cache_register(struct cache_detail *cd) 294static void remove_cache_proc_entries(struct cache_detail *cd)
294{ 295{
295 cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc); 296 if (cd->proc_ent == NULL)
296 if (cd->proc_ent) { 297 return;
297 struct proc_dir_entry *p; 298 if (cd->flush_ent)
298 cd->proc_ent->owner = cd->owner; 299 remove_proc_entry("flush", cd->proc_ent);
299 cd->channel_ent = cd->content_ent = NULL; 300 if (cd->channel_ent)
301 remove_proc_entry("channel", cd->proc_ent);
302 if (cd->content_ent)
303 remove_proc_entry("content", cd->proc_ent);
304 cd->proc_ent = NULL;
305 remove_proc_entry(cd->name, proc_net_rpc);
306}
300 307
301 p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR, 308#ifdef CONFIG_PROC_FS
302 cd->proc_ent); 309static int create_cache_proc_entries(struct cache_detail *cd)
303 cd->flush_ent = p; 310{
304 if (p) { 311 struct proc_dir_entry *p;
305 p->proc_fops = &cache_flush_operations;
306 p->owner = cd->owner;
307 p->data = cd;
308 }
309 312
310 if (cd->cache_request || cd->cache_parse) { 313 cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc);
311 p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR, 314 if (cd->proc_ent == NULL)
312 cd->proc_ent); 315 goto out_nomem;
313 cd->channel_ent = p; 316 cd->proc_ent->owner = cd->owner;
314 if (p) { 317 cd->channel_ent = cd->content_ent = NULL;
315 p->proc_fops = &cache_file_operations; 318
316 p->owner = cd->owner; 319 p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR, cd->proc_ent);
317 p->data = cd; 320 cd->flush_ent = p;
318 } 321 if (p == NULL)
319 } 322 goto out_nomem;
320 if (cd->cache_show) { 323 p->proc_fops = &cache_flush_operations;
321 p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR, 324 p->owner = cd->owner;
322 cd->proc_ent); 325 p->data = cd;
323 cd->content_ent = p; 326
324 if (p) { 327 if (cd->cache_request || cd->cache_parse) {
325 p->proc_fops = &content_file_operations; 328 p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR,
326 p->owner = cd->owner; 329 cd->proc_ent);
327 p->data = cd; 330 cd->channel_ent = p;
328 } 331 if (p == NULL)
329 } 332 goto out_nomem;
333 p->proc_fops = &cache_file_operations;
334 p->owner = cd->owner;
335 p->data = cd;
330 } 336 }
337 if (cd->cache_show) {
338 p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR,
339 cd->proc_ent);
340 cd->content_ent = p;
341 if (p == NULL)
342 goto out_nomem;
343 p->proc_fops = &content_file_operations;
344 p->owner = cd->owner;
345 p->data = cd;
346 }
347 return 0;
348out_nomem:
349 remove_cache_proc_entries(cd);
350 return -ENOMEM;
351}
352#else /* CONFIG_PROC_FS */
353static int create_cache_proc_entries(struct cache_detail *cd)
354{
355 return 0;
356}
357#endif
358
359int cache_register(struct cache_detail *cd)
360{
361 int ret;
362
363 ret = create_cache_proc_entries(cd);
364 if (ret)
365 return ret;
331 rwlock_init(&cd->hash_lock); 366 rwlock_init(&cd->hash_lock);
332 INIT_LIST_HEAD(&cd->queue); 367 INIT_LIST_HEAD(&cd->queue);
333 spin_lock(&cache_list_lock); 368 spin_lock(&cache_list_lock);
@@ -341,9 +376,11 @@ void cache_register(struct cache_detail *cd)
341 376
342 /* start the cleaning process */ 377 /* start the cleaning process */
343 schedule_delayed_work(&cache_cleaner, 0); 378 schedule_delayed_work(&cache_cleaner, 0);
379 return 0;
344} 380}
381EXPORT_SYMBOL(cache_register);
345 382
346int cache_unregister(struct cache_detail *cd) 383void cache_unregister(struct cache_detail *cd)
347{ 384{
348 cache_purge(cd); 385 cache_purge(cd);
349 spin_lock(&cache_list_lock); 386 spin_lock(&cache_list_lock);
@@ -351,30 +388,23 @@ int cache_unregister(struct cache_detail *cd)
351 if (cd->entries || atomic_read(&cd->inuse)) { 388 if (cd->entries || atomic_read(&cd->inuse)) {
352 write_unlock(&cd->hash_lock); 389 write_unlock(&cd->hash_lock);
353 spin_unlock(&cache_list_lock); 390 spin_unlock(&cache_list_lock);
354 return -EBUSY; 391 goto out;
355 } 392 }
356 if (current_detail == cd) 393 if (current_detail == cd)
357 current_detail = NULL; 394 current_detail = NULL;
358 list_del_init(&cd->others); 395 list_del_init(&cd->others);
359 write_unlock(&cd->hash_lock); 396 write_unlock(&cd->hash_lock);
360 spin_unlock(&cache_list_lock); 397 spin_unlock(&cache_list_lock);
361 if (cd->proc_ent) { 398 remove_cache_proc_entries(cd);
362 if (cd->flush_ent)
363 remove_proc_entry("flush", cd->proc_ent);
364 if (cd->channel_ent)
365 remove_proc_entry("channel", cd->proc_ent);
366 if (cd->content_ent)
367 remove_proc_entry("content", cd->proc_ent);
368
369 cd->proc_ent = NULL;
370 remove_proc_entry(cd->name, proc_net_rpc);
371 }
372 if (list_empty(&cache_list)) { 399 if (list_empty(&cache_list)) {
373 /* module must be being unloaded so its safe to kill the worker */ 400 /* module must be being unloaded so its safe to kill the worker */
374 cancel_delayed_work_sync(&cache_cleaner); 401 cancel_delayed_work_sync(&cache_cleaner);
375 } 402 }
376 return 0; 403 return;
404out:
405 printk(KERN_ERR "nfsd: failed to unregister %s cache\n", cd->name);
377} 406}
407EXPORT_SYMBOL(cache_unregister);
378 408
379/* clean cache tries to find something to clean 409/* clean cache tries to find something to clean
380 * and cleans it. 410 * and cleans it.
@@ -489,6 +519,7 @@ void cache_flush(void)
489 while (cache_clean() != -1) 519 while (cache_clean() != -1)
490 cond_resched(); 520 cond_resched();
491} 521}
522EXPORT_SYMBOL(cache_flush);
492 523
493void cache_purge(struct cache_detail *detail) 524void cache_purge(struct cache_detail *detail)
494{ 525{
@@ -497,7 +528,7 @@ void cache_purge(struct cache_detail *detail)
497 cache_flush(); 528 cache_flush();
498 detail->flush_time = 1; 529 detail->flush_time = 1;
499} 530}
500 531EXPORT_SYMBOL(cache_purge);
501 532
502 533
503/* 534/*
@@ -634,13 +665,13 @@ void cache_clean_deferred(void *owner)
634/* 665/*
635 * communicate with user-space 666 * communicate with user-space
636 * 667 *
637 * We have a magic /proc file - /proc/sunrpc/cache 668 * We have a magic /proc file - /proc/sunrpc/<cachename>/channel.
638 * On read, you get a full request, or block 669 * On read, you get a full request, or block.
639 * On write, an update request is processed 670 * On write, an update request is processed.
640 * Poll works if anything to read, and always allows write 671 * Poll works if anything to read, and always allows write.
641 * 672 *
642 * Implemented by linked list of requests. Each open file has 673 * Implemented by linked list of requests. Each open file has
643 * a ->private that also exists in this list. New request are added 674 * a ->private that also exists in this list. New requests are added
644 * to the end and may wakeup and preceding readers. 675 * to the end and may wakeup and preceding readers.
645 * New readers are added to the head. If, on read, an item is found with 676 * New readers are added to the head. If, on read, an item is found with
646 * CACHE_UPCALLING clear, we free it from the list. 677 * CACHE_UPCALLING clear, we free it from the list.
@@ -963,6 +994,7 @@ void qword_add(char **bpp, int *lp, char *str)
963 *bpp = bp; 994 *bpp = bp;
964 *lp = len; 995 *lp = len;
965} 996}
997EXPORT_SYMBOL(qword_add);
966 998
967void qword_addhex(char **bpp, int *lp, char *buf, int blen) 999void qword_addhex(char **bpp, int *lp, char *buf, int blen)
968{ 1000{
@@ -991,6 +1023,7 @@ void qword_addhex(char **bpp, int *lp, char *buf, int blen)
991 *bpp = bp; 1023 *bpp = bp;
992 *lp = len; 1024 *lp = len;
993} 1025}
1026EXPORT_SYMBOL(qword_addhex);
994 1027
995static void warn_no_listener(struct cache_detail *detail) 1028static void warn_no_listener(struct cache_detail *detail)
996{ 1029{
@@ -1113,6 +1146,7 @@ int qword_get(char **bpp, char *dest, int bufsize)
1113 *dest = '\0'; 1146 *dest = '\0';
1114 return len; 1147 return len;
1115} 1148}
1149EXPORT_SYMBOL(qword_get);
1116 1150
1117 1151
1118/* 1152/*
@@ -1244,18 +1278,18 @@ static ssize_t read_flush(struct file *file, char __user *buf,
1244 struct cache_detail *cd = PDE(file->f_path.dentry->d_inode)->data; 1278 struct cache_detail *cd = PDE(file->f_path.dentry->d_inode)->data;
1245 char tbuf[20]; 1279 char tbuf[20];
1246 unsigned long p = *ppos; 1280 unsigned long p = *ppos;
1247 int len; 1281 size_t len;
1248 1282
1249 sprintf(tbuf, "%lu\n", cd->flush_time); 1283 sprintf(tbuf, "%lu\n", cd->flush_time);
1250 len = strlen(tbuf); 1284 len = strlen(tbuf);
1251 if (p >= len) 1285 if (p >= len)
1252 return 0; 1286 return 0;
1253 len -= p; 1287 len -= p;
1254 if (len > count) len = count; 1288 if (len > count)
1289 len = count;
1255 if (copy_to_user(buf, (void*)(tbuf+p), len)) 1290 if (copy_to_user(buf, (void*)(tbuf+p), len))
1256 len = -EFAULT; 1291 return -EFAULT;
1257 else 1292 *ppos += len;
1258 *ppos += len;
1259 return len; 1293 return len;
1260} 1294}
1261 1295
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 74df2d358e61..5a16875f5ac8 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -33,7 +33,7 @@ struct proc_dir_entry *proc_net_rpc = NULL;
33static int rpc_proc_show(struct seq_file *seq, void *v) { 33static int rpc_proc_show(struct seq_file *seq, void *v) {
34 const struct rpc_stat *statp = seq->private; 34 const struct rpc_stat *statp = seq->private;
35 const struct rpc_program *prog = statp->program; 35 const struct rpc_program *prog = statp->program;
36 int i, j; 36 unsigned int i, j;
37 37
38 seq_printf(seq, 38 seq_printf(seq,
39 "net %u %u %u %u\n", 39 "net %u %u %u %u\n",
@@ -81,7 +81,7 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) {
81 const struct svc_program *prog = statp->program; 81 const struct svc_program *prog = statp->program;
82 const struct svc_procedure *proc; 82 const struct svc_procedure *proc;
83 const struct svc_version *vers; 83 const struct svc_version *vers;
84 int i, j; 84 unsigned int i, j;
85 85
86 seq_printf(seq, 86 seq_printf(seq,
87 "net %u %u %u %u\n", 87 "net %u %u %u %u\n",
@@ -106,6 +106,7 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) {
106 seq_putc(seq, '\n'); 106 seq_putc(seq, '\n');
107 } 107 }
108} 108}
109EXPORT_SYMBOL(svc_seq_show);
109 110
110/** 111/**
111 * rpc_alloc_iostats - allocate an rpc_iostats structure 112 * rpc_alloc_iostats - allocate an rpc_iostats structure
@@ -255,12 +256,14 @@ svc_proc_register(struct svc_stat *statp, const struct file_operations *fops)
255{ 256{
256 return do_register(statp->program->pg_name, statp, fops); 257 return do_register(statp->program->pg_name, statp, fops);
257} 258}
259EXPORT_SYMBOL(svc_proc_register);
258 260
259void 261void
260svc_proc_unregister(const char *name) 262svc_proc_unregister(const char *name)
261{ 263{
262 remove_proc_entry(name, proc_net_rpc); 264 remove_proc_entry(name, proc_net_rpc);
263} 265}
266EXPORT_SYMBOL(svc_proc_unregister);
264 267
265void 268void
266rpc_proc_init(void) 269rpc_proc_init(void)
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 1a7e309d008b..843629f55763 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -22,48 +22,6 @@
22#include <linux/sunrpc/rpc_pipe_fs.h> 22#include <linux/sunrpc/rpc_pipe_fs.h>
23#include <linux/sunrpc/xprtsock.h> 23#include <linux/sunrpc/xprtsock.h>
24 24
25/* RPC server stuff */
26EXPORT_SYMBOL(svc_create);
27EXPORT_SYMBOL(svc_create_thread);
28EXPORT_SYMBOL(svc_create_pooled);
29EXPORT_SYMBOL(svc_set_num_threads);
30EXPORT_SYMBOL(svc_exit_thread);
31EXPORT_SYMBOL(svc_destroy);
32EXPORT_SYMBOL(svc_drop);
33EXPORT_SYMBOL(svc_process);
34EXPORT_SYMBOL(svc_recv);
35EXPORT_SYMBOL(svc_wake_up);
36EXPORT_SYMBOL(svc_makesock);
37EXPORT_SYMBOL(svc_reserve);
38EXPORT_SYMBOL(svc_auth_register);
39EXPORT_SYMBOL(auth_domain_lookup);
40EXPORT_SYMBOL(svc_authenticate);
41EXPORT_SYMBOL(svc_set_client);
42
43/* RPC statistics */
44#ifdef CONFIG_PROC_FS
45EXPORT_SYMBOL(svc_proc_register);
46EXPORT_SYMBOL(svc_proc_unregister);
47EXPORT_SYMBOL(svc_seq_show);
48#endif
49
50/* caching... */
51EXPORT_SYMBOL(auth_domain_find);
52EXPORT_SYMBOL(auth_domain_put);
53EXPORT_SYMBOL(auth_unix_add_addr);
54EXPORT_SYMBOL(auth_unix_forget_old);
55EXPORT_SYMBOL(auth_unix_lookup);
56EXPORT_SYMBOL(cache_check);
57EXPORT_SYMBOL(cache_flush);
58EXPORT_SYMBOL(cache_purge);
59EXPORT_SYMBOL(cache_register);
60EXPORT_SYMBOL(cache_unregister);
61EXPORT_SYMBOL(qword_add);
62EXPORT_SYMBOL(qword_addhex);
63EXPORT_SYMBOL(qword_get);
64EXPORT_SYMBOL(svcauth_unix_purge);
65EXPORT_SYMBOL(unix_domain_find);
66
67extern struct cache_detail ip_map_cache, unix_gid_cache; 25extern struct cache_detail ip_map_cache, unix_gid_cache;
68 26
69static int __init 27static int __init
@@ -85,7 +43,8 @@ init_sunrpc(void)
85#endif 43#endif
86 cache_register(&ip_map_cache); 44 cache_register(&ip_map_cache);
87 cache_register(&unix_gid_cache); 45 cache_register(&unix_gid_cache);
88 init_socket_xprt(); 46 svc_init_xprt_sock(); /* svc sock transport */
47 init_socket_xprt(); /* clnt sock transport */
89 rpcauth_init_module(); 48 rpcauth_init_module();
90out: 49out:
91 return err; 50 return err;
@@ -96,12 +55,11 @@ cleanup_sunrpc(void)
96{ 55{
97 rpcauth_remove_module(); 56 rpcauth_remove_module();
98 cleanup_socket_xprt(); 57 cleanup_socket_xprt();
58 svc_cleanup_xprt_sock();
99 unregister_rpc_pipefs(); 59 unregister_rpc_pipefs();
100 rpc_destroy_mempool(); 60 rpc_destroy_mempool();
101 if (cache_unregister(&ip_map_cache)) 61 cache_unregister(&ip_map_cache);
102 printk(KERN_ERR "sunrpc: failed to unregister ip_map cache\n"); 62 cache_unregister(&unix_gid_cache);
103 if (cache_unregister(&unix_gid_cache))
104 printk(KERN_ERR "sunrpc: failed to unregister unix_gid cache\n");
105#ifdef RPC_DEBUG 63#ifdef RPC_DEBUG
106 rpc_unregister_sysctl(); 64 rpc_unregister_sysctl();
107#endif 65#endif
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 4ad5fbbb18b4..a290e1523297 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -364,7 +364,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
364 void (*shutdown)(struct svc_serv *serv)) 364 void (*shutdown)(struct svc_serv *serv))
365{ 365{
366 struct svc_serv *serv; 366 struct svc_serv *serv;
367 int vers; 367 unsigned int vers;
368 unsigned int xdrsize; 368 unsigned int xdrsize;
369 unsigned int i; 369 unsigned int i;
370 370
@@ -433,6 +433,7 @@ svc_create(struct svc_program *prog, unsigned int bufsize,
433{ 433{
434 return __svc_create(prog, bufsize, /*npools*/1, shutdown); 434 return __svc_create(prog, bufsize, /*npools*/1, shutdown);
435} 435}
436EXPORT_SYMBOL(svc_create);
436 437
437struct svc_serv * 438struct svc_serv *
438svc_create_pooled(struct svc_program *prog, unsigned int bufsize, 439svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
@@ -452,6 +453,7 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
452 453
453 return serv; 454 return serv;
454} 455}
456EXPORT_SYMBOL(svc_create_pooled);
455 457
456/* 458/*
457 * Destroy an RPC service. Should be called with the BKL held 459 * Destroy an RPC service. Should be called with the BKL held
@@ -459,9 +461,6 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
459void 461void
460svc_destroy(struct svc_serv *serv) 462svc_destroy(struct svc_serv *serv)
461{ 463{
462 struct svc_sock *svsk;
463 struct svc_sock *tmp;
464
465 dprintk("svc: svc_destroy(%s, %d)\n", 464 dprintk("svc: svc_destroy(%s, %d)\n",
466 serv->sv_program->pg_name, 465 serv->sv_program->pg_name,
467 serv->sv_nrthreads); 466 serv->sv_nrthreads);
@@ -476,14 +475,12 @@ svc_destroy(struct svc_serv *serv)
476 475
477 del_timer_sync(&serv->sv_temptimer); 476 del_timer_sync(&serv->sv_temptimer);
478 477
479 list_for_each_entry_safe(svsk, tmp, &serv->sv_tempsocks, sk_list) 478 svc_close_all(&serv->sv_tempsocks);
480 svc_force_close_socket(svsk);
481 479
482 if (serv->sv_shutdown) 480 if (serv->sv_shutdown)
483 serv->sv_shutdown(serv); 481 serv->sv_shutdown(serv);
484 482
485 list_for_each_entry_safe(svsk, tmp, &serv->sv_permsocks, sk_list) 483 svc_close_all(&serv->sv_permsocks);
486 svc_force_close_socket(svsk);
487 484
488 BUG_ON(!list_empty(&serv->sv_permsocks)); 485 BUG_ON(!list_empty(&serv->sv_permsocks));
489 BUG_ON(!list_empty(&serv->sv_tempsocks)); 486 BUG_ON(!list_empty(&serv->sv_tempsocks));
@@ -498,6 +495,7 @@ svc_destroy(struct svc_serv *serv)
498 kfree(serv->sv_pools); 495 kfree(serv->sv_pools);
499 kfree(serv); 496 kfree(serv);
500} 497}
498EXPORT_SYMBOL(svc_destroy);
501 499
502/* 500/*
503 * Allocate an RPC server's buffer space. 501 * Allocate an RPC server's buffer space.
@@ -536,31 +534,17 @@ svc_release_buffer(struct svc_rqst *rqstp)
536 put_page(rqstp->rq_pages[i]); 534 put_page(rqstp->rq_pages[i]);
537} 535}
538 536
539/* 537struct svc_rqst *
540 * Create a thread in the given pool. Caller must hold BKL. 538svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool)
541 * On a NUMA or SMP machine, with a multi-pool serv, the thread
542 * will be restricted to run on the cpus belonging to the pool.
543 */
544static int
545__svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
546 struct svc_pool *pool)
547{ 539{
548 struct svc_rqst *rqstp; 540 struct svc_rqst *rqstp;
549 int error = -ENOMEM;
550 int have_oldmask = 0;
551 cpumask_t oldmask;
552 541
553 rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL); 542 rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
554 if (!rqstp) 543 if (!rqstp)
555 goto out; 544 goto out_enomem;
556 545
557 init_waitqueue_head(&rqstp->rq_wait); 546 init_waitqueue_head(&rqstp->rq_wait);
558 547
559 if (!(rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL))
560 || !(rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL))
561 || !svc_init_buffer(rqstp, serv->sv_max_mesg))
562 goto out_thread;
563
564 serv->sv_nrthreads++; 548 serv->sv_nrthreads++;
565 spin_lock_bh(&pool->sp_lock); 549 spin_lock_bh(&pool->sp_lock);
566 pool->sp_nrthreads++; 550 pool->sp_nrthreads++;
@@ -569,6 +553,45 @@ __svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
569 rqstp->rq_server = serv; 553 rqstp->rq_server = serv;
570 rqstp->rq_pool = pool; 554 rqstp->rq_pool = pool;
571 555
556 rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL);
557 if (!rqstp->rq_argp)
558 goto out_thread;
559
560 rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL);
561 if (!rqstp->rq_resp)
562 goto out_thread;
563
564 if (!svc_init_buffer(rqstp, serv->sv_max_mesg))
565 goto out_thread;
566
567 return rqstp;
568out_thread:
569 svc_exit_thread(rqstp);
570out_enomem:
571 return ERR_PTR(-ENOMEM);
572}
573EXPORT_SYMBOL(svc_prepare_thread);
574
575/*
576 * Create a thread in the given pool. Caller must hold BKL.
577 * On a NUMA or SMP machine, with a multi-pool serv, the thread
578 * will be restricted to run on the cpus belonging to the pool.
579 */
580static int
581__svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
582 struct svc_pool *pool)
583{
584 struct svc_rqst *rqstp;
585 int error = -ENOMEM;
586 int have_oldmask = 0;
587 cpumask_t oldmask;
588
589 rqstp = svc_prepare_thread(serv, pool);
590 if (IS_ERR(rqstp)) {
591 error = PTR_ERR(rqstp);
592 goto out;
593 }
594
572 if (serv->sv_nrpools > 1) 595 if (serv->sv_nrpools > 1)
573 have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask); 596 have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask);
574 597
@@ -597,6 +620,7 @@ svc_create_thread(svc_thread_fn func, struct svc_serv *serv)
597{ 620{
598 return __svc_create_thread(func, serv, &serv->sv_pools[0]); 621 return __svc_create_thread(func, serv, &serv->sv_pools[0]);
599} 622}
623EXPORT_SYMBOL(svc_create_thread);
600 624
601/* 625/*
602 * Choose a pool in which to create a new thread, for svc_set_num_threads 626 * Choose a pool in which to create a new thread, for svc_set_num_threads
@@ -700,6 +724,7 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
700 724
701 return error; 725 return error;
702} 726}
727EXPORT_SYMBOL(svc_set_num_threads);
703 728
704/* 729/*
705 * Called from a server thread as it's exiting. Caller must hold BKL. 730 * Called from a server thread as it's exiting. Caller must hold BKL.
@@ -726,6 +751,7 @@ svc_exit_thread(struct svc_rqst *rqstp)
726 if (serv) 751 if (serv)
727 svc_destroy(serv); 752 svc_destroy(serv);
728} 753}
754EXPORT_SYMBOL(svc_exit_thread);
729 755
730/* 756/*
731 * Register an RPC service with the local portmapper. 757 * Register an RPC service with the local portmapper.
@@ -737,7 +763,8 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port)
737{ 763{
738 struct svc_program *progp; 764 struct svc_program *progp;
739 unsigned long flags; 765 unsigned long flags;
740 int i, error = 0, dummy; 766 unsigned int i;
767 int error = 0, dummy;
741 768
742 if (!port) 769 if (!port)
743 clear_thread_flag(TIF_SIGPENDING); 770 clear_thread_flag(TIF_SIGPENDING);
@@ -840,9 +867,9 @@ svc_process(struct svc_rqst *rqstp)
840 rqstp->rq_res.tail[0].iov_len = 0; 867 rqstp->rq_res.tail[0].iov_len = 0;
841 /* Will be turned off only in gss privacy case: */ 868 /* Will be turned off only in gss privacy case: */
842 rqstp->rq_splice_ok = 1; 869 rqstp->rq_splice_ok = 1;
843 /* tcp needs a space for the record length... */ 870
844 if (rqstp->rq_prot == IPPROTO_TCP) 871 /* Setup reply header */
845 svc_putnl(resv, 0); 872 rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp);
846 873
847 rqstp->rq_xid = svc_getu32(argv); 874 rqstp->rq_xid = svc_getu32(argv);
848 svc_putu32(resv, rqstp->rq_xid); 875 svc_putu32(resv, rqstp->rq_xid);
@@ -1049,16 +1076,15 @@ err_bad:
1049 svc_putnl(resv, ntohl(rpc_stat)); 1076 svc_putnl(resv, ntohl(rpc_stat));
1050 goto sendit; 1077 goto sendit;
1051} 1078}
1079EXPORT_SYMBOL(svc_process);
1052 1080
1053/* 1081/*
1054 * Return (transport-specific) limit on the rpc payload. 1082 * Return (transport-specific) limit on the rpc payload.
1055 */ 1083 */
1056u32 svc_max_payload(const struct svc_rqst *rqstp) 1084u32 svc_max_payload(const struct svc_rqst *rqstp)
1057{ 1085{
1058 int max = RPCSVC_MAXPAYLOAD_TCP; 1086 u32 max = rqstp->rq_xprt->xpt_class->xcl_max_payload;
1059 1087
1060 if (rqstp->rq_sock->sk_sock->type == SOCK_DGRAM)
1061 max = RPCSVC_MAXPAYLOAD_UDP;
1062 if (rqstp->rq_server->sv_max_payload < max) 1088 if (rqstp->rq_server->sv_max_payload < max)
1063 max = rqstp->rq_server->sv_max_payload; 1089 max = rqstp->rq_server->sv_max_payload;
1064 return max; 1090 return max;
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
new file mode 100644
index 000000000000..ea377e06afae
--- /dev/null
+++ b/net/sunrpc/svc_xprt.c
@@ -0,0 +1,1055 @@
1/*
2 * linux/net/sunrpc/svc_xprt.c
3 *
4 * Author: Tom Tucker <tom@opengridcomputing.com>
5 */
6
7#include <linux/sched.h>
8#include <linux/errno.h>
9#include <linux/fcntl.h>
10#include <linux/net.h>
11#include <linux/in.h>
12#include <linux/inet.h>
13#include <linux/udp.h>
14#include <linux/tcp.h>
15#include <linux/unistd.h>
16#include <linux/slab.h>
17#include <linux/netdevice.h>
18#include <linux/skbuff.h>
19#include <linux/file.h>
20#include <linux/freezer.h>
21#include <net/sock.h>
22#include <net/checksum.h>
23#include <net/ip.h>
24#include <net/ipv6.h>
25#include <net/tcp_states.h>
26#include <linux/uaccess.h>
27#include <asm/ioctls.h>
28
29#include <linux/sunrpc/types.h>
30#include <linux/sunrpc/clnt.h>
31#include <linux/sunrpc/xdr.h>
32#include <linux/sunrpc/stats.h>
33#include <linux/sunrpc/svc_xprt.h>
34
35#define RPCDBG_FACILITY RPCDBG_SVCXPRT
36
37static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt);
38static int svc_deferred_recv(struct svc_rqst *rqstp);
39static struct cache_deferred_req *svc_defer(struct cache_req *req);
40static void svc_age_temp_xprts(unsigned long closure);
41
42/* apparently the "standard" is that clients close
43 * idle connections after 5 minutes, servers after
44 * 6 minutes
45 * http://www.connectathon.org/talks96/nfstcp.pdf
46 */
47static int svc_conn_age_period = 6*60;
48
49/* List of registered transport classes */
50static DEFINE_SPINLOCK(svc_xprt_class_lock);
51static LIST_HEAD(svc_xprt_class_list);
52
53/* SMP locking strategy:
54 *
55 * svc_pool->sp_lock protects most of the fields of that pool.
56 * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
57 * when both need to be taken (rare), svc_serv->sv_lock is first.
58 * BKL protects svc_serv->sv_nrthread.
59 * svc_sock->sk_lock protects the svc_sock->sk_deferred list
60 * and the ->sk_info_authunix cache.
61 *
62 * The XPT_BUSY bit in xprt->xpt_flags prevents a transport being
63 * enqueued multiply. During normal transport processing this bit
64 * is set by svc_xprt_enqueue and cleared by svc_xprt_received.
65 * Providers should not manipulate this bit directly.
66 *
67 * Some flags can be set to certain values at any time
68 * providing that certain rules are followed:
69 *
70 * XPT_CONN, XPT_DATA:
71 * - Can be set or cleared at any time.
72 * - After a set, svc_xprt_enqueue must be called to enqueue
73 * the transport for processing.
74 * - After a clear, the transport must be read/accepted.
75 * If this succeeds, it must be set again.
76 * XPT_CLOSE:
77 * - Can set at any time. It is never cleared.
78 * XPT_DEAD:
79 * - Can only be set while XPT_BUSY is held which ensures
80 * that no other thread will be using the transport or will
81 * try to set XPT_DEAD.
82 */
83
84int svc_reg_xprt_class(struct svc_xprt_class *xcl)
85{
86 struct svc_xprt_class *cl;
87 int res = -EEXIST;
88
89 dprintk("svc: Adding svc transport class '%s'\n", xcl->xcl_name);
90
91 INIT_LIST_HEAD(&xcl->xcl_list);
92 spin_lock(&svc_xprt_class_lock);
93 /* Make sure there isn't already a class with the same name */
94 list_for_each_entry(cl, &svc_xprt_class_list, xcl_list) {
95 if (strcmp(xcl->xcl_name, cl->xcl_name) == 0)
96 goto out;
97 }
98 list_add_tail(&xcl->xcl_list, &svc_xprt_class_list);
99 res = 0;
100out:
101 spin_unlock(&svc_xprt_class_lock);
102 return res;
103}
104EXPORT_SYMBOL_GPL(svc_reg_xprt_class);
105
106void svc_unreg_xprt_class(struct svc_xprt_class *xcl)
107{
108 dprintk("svc: Removing svc transport class '%s'\n", xcl->xcl_name);
109 spin_lock(&svc_xprt_class_lock);
110 list_del_init(&xcl->xcl_list);
111 spin_unlock(&svc_xprt_class_lock);
112}
113EXPORT_SYMBOL_GPL(svc_unreg_xprt_class);
114
115/*
116 * Format the transport list for printing
117 */
118int svc_print_xprts(char *buf, int maxlen)
119{
120 struct list_head *le;
121 char tmpstr[80];
122 int len = 0;
123 buf[0] = '\0';
124
125 spin_lock(&svc_xprt_class_lock);
126 list_for_each(le, &svc_xprt_class_list) {
127 int slen;
128 struct svc_xprt_class *xcl =
129 list_entry(le, struct svc_xprt_class, xcl_list);
130
131 sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload);
132 slen = strlen(tmpstr);
133 if (len + slen > maxlen)
134 break;
135 len += slen;
136 strcat(buf, tmpstr);
137 }
138 spin_unlock(&svc_xprt_class_lock);
139
140 return len;
141}
142
143static void svc_xprt_free(struct kref *kref)
144{
145 struct svc_xprt *xprt =
146 container_of(kref, struct svc_xprt, xpt_ref);
147 struct module *owner = xprt->xpt_class->xcl_owner;
148 if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)
149 && xprt->xpt_auth_cache != NULL)
150 svcauth_unix_info_release(xprt->xpt_auth_cache);
151 xprt->xpt_ops->xpo_free(xprt);
152 module_put(owner);
153}
154
155void svc_xprt_put(struct svc_xprt *xprt)
156{
157 kref_put(&xprt->xpt_ref, svc_xprt_free);
158}
159EXPORT_SYMBOL_GPL(svc_xprt_put);
160
161/*
162 * Called by transport drivers to initialize the transport independent
163 * portion of the transport instance.
164 */
165void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt,
166 struct svc_serv *serv)
167{
168 memset(xprt, 0, sizeof(*xprt));
169 xprt->xpt_class = xcl;
170 xprt->xpt_ops = xcl->xcl_ops;
171 kref_init(&xprt->xpt_ref);
172 xprt->xpt_server = serv;
173 INIT_LIST_HEAD(&xprt->xpt_list);
174 INIT_LIST_HEAD(&xprt->xpt_ready);
175 INIT_LIST_HEAD(&xprt->xpt_deferred);
176 mutex_init(&xprt->xpt_mutex);
177 spin_lock_init(&xprt->xpt_lock);
178 set_bit(XPT_BUSY, &xprt->xpt_flags);
179}
180EXPORT_SYMBOL_GPL(svc_xprt_init);
181
182int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
183 int flags)
184{
185 struct svc_xprt_class *xcl;
186 struct sockaddr_in sin = {
187 .sin_family = AF_INET,
188 .sin_addr.s_addr = INADDR_ANY,
189 .sin_port = htons(port),
190 };
191 dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
192 spin_lock(&svc_xprt_class_lock);
193 list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
194 struct svc_xprt *newxprt;
195
196 if (strcmp(xprt_name, xcl->xcl_name))
197 continue;
198
199 if (!try_module_get(xcl->xcl_owner))
200 goto err;
201
202 spin_unlock(&svc_xprt_class_lock);
203 newxprt = xcl->xcl_ops->
204 xpo_create(serv, (struct sockaddr *)&sin, sizeof(sin),
205 flags);
206 if (IS_ERR(newxprt)) {
207 module_put(xcl->xcl_owner);
208 return PTR_ERR(newxprt);
209 }
210
211 clear_bit(XPT_TEMP, &newxprt->xpt_flags);
212 spin_lock_bh(&serv->sv_lock);
213 list_add(&newxprt->xpt_list, &serv->sv_permsocks);
214 spin_unlock_bh(&serv->sv_lock);
215 clear_bit(XPT_BUSY, &newxprt->xpt_flags);
216 return svc_xprt_local_port(newxprt);
217 }
218 err:
219 spin_unlock(&svc_xprt_class_lock);
220 dprintk("svc: transport %s not found\n", xprt_name);
221 return -ENOENT;
222}
223EXPORT_SYMBOL_GPL(svc_create_xprt);
224
225/*
226 * Copy the local and remote xprt addresses to the rqstp structure
227 */
228void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt)
229{
230 struct sockaddr *sin;
231
232 memcpy(&rqstp->rq_addr, &xprt->xpt_remote, xprt->xpt_remotelen);
233 rqstp->rq_addrlen = xprt->xpt_remotelen;
234
235 /*
236 * Destination address in request is needed for binding the
237 * source address in RPC replies/callbacks later.
238 */
239 sin = (struct sockaddr *)&xprt->xpt_local;
240 switch (sin->sa_family) {
241 case AF_INET:
242 rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr;
243 break;
244 case AF_INET6:
245 rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr;
246 break;
247 }
248}
249EXPORT_SYMBOL_GPL(svc_xprt_copy_addrs);
250
251/**
252 * svc_print_addr - Format rq_addr field for printing
253 * @rqstp: svc_rqst struct containing address to print
254 * @buf: target buffer for formatted address
255 * @len: length of target buffer
256 *
257 */
258char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)
259{
260 return __svc_print_addr(svc_addr(rqstp), buf, len);
261}
262EXPORT_SYMBOL_GPL(svc_print_addr);
263
264/*
265 * Queue up an idle server thread. Must have pool->sp_lock held.
266 * Note: this is really a stack rather than a queue, so that we only
267 * use as many different threads as we need, and the rest don't pollute
268 * the cache.
269 */
270static void svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp)
271{
272 list_add(&rqstp->rq_list, &pool->sp_threads);
273}
274
275/*
276 * Dequeue an nfsd thread. Must have pool->sp_lock held.
277 */
278static void svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp)
279{
280 list_del(&rqstp->rq_list);
281}
282
283/*
284 * Queue up a transport with data pending. If there are idle nfsd
285 * processes, wake 'em up.
286 *
287 */
288void svc_xprt_enqueue(struct svc_xprt *xprt)
289{
290 struct svc_serv *serv = xprt->xpt_server;
291 struct svc_pool *pool;
292 struct svc_rqst *rqstp;
293 int cpu;
294
295 if (!(xprt->xpt_flags &
296 ((1<<XPT_CONN)|(1<<XPT_DATA)|(1<<XPT_CLOSE)|(1<<XPT_DEFERRED))))
297 return;
298 if (test_bit(XPT_DEAD, &xprt->xpt_flags))
299 return;
300
301 cpu = get_cpu();
302 pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
303 put_cpu();
304
305 spin_lock_bh(&pool->sp_lock);
306
307 if (!list_empty(&pool->sp_threads) &&
308 !list_empty(&pool->sp_sockets))
309 printk(KERN_ERR
310 "svc_xprt_enqueue: "
311 "threads and transports both waiting??\n");
312
313 if (test_bit(XPT_DEAD, &xprt->xpt_flags)) {
314 /* Don't enqueue dead transports */
315 dprintk("svc: transport %p is dead, not enqueued\n", xprt);
316 goto out_unlock;
317 }
318
319 /* Mark transport as busy. It will remain in this state until
320 * the provider calls svc_xprt_received. We update XPT_BUSY
321 * atomically because it also guards against trying to enqueue
322 * the transport twice.
323 */
324 if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) {
325 /* Don't enqueue transport while already enqueued */
326 dprintk("svc: transport %p busy, not enqueued\n", xprt);
327 goto out_unlock;
328 }
329 BUG_ON(xprt->xpt_pool != NULL);
330 xprt->xpt_pool = pool;
331
332 /* Handle pending connection */
333 if (test_bit(XPT_CONN, &xprt->xpt_flags))
334 goto process;
335
336 /* Handle close in-progress */
337 if (test_bit(XPT_CLOSE, &xprt->xpt_flags))
338 goto process;
339
340 /* Check if we have space to reply to a request */
341 if (!xprt->xpt_ops->xpo_has_wspace(xprt)) {
342 /* Don't enqueue while not enough space for reply */
343 dprintk("svc: no write space, transport %p not enqueued\n",
344 xprt);
345 xprt->xpt_pool = NULL;
346 clear_bit(XPT_BUSY, &xprt->xpt_flags);
347 goto out_unlock;
348 }
349
350 process:
351 if (!list_empty(&pool->sp_threads)) {
352 rqstp = list_entry(pool->sp_threads.next,
353 struct svc_rqst,
354 rq_list);
355 dprintk("svc: transport %p served by daemon %p\n",
356 xprt, rqstp);
357 svc_thread_dequeue(pool, rqstp);
358 if (rqstp->rq_xprt)
359 printk(KERN_ERR
360 "svc_xprt_enqueue: server %p, rq_xprt=%p!\n",
361 rqstp, rqstp->rq_xprt);
362 rqstp->rq_xprt = xprt;
363 svc_xprt_get(xprt);
364 rqstp->rq_reserved = serv->sv_max_mesg;
365 atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
366 BUG_ON(xprt->xpt_pool != pool);
367 wake_up(&rqstp->rq_wait);
368 } else {
369 dprintk("svc: transport %p put into queue\n", xprt);
370 list_add_tail(&xprt->xpt_ready, &pool->sp_sockets);
371 BUG_ON(xprt->xpt_pool != pool);
372 }
373
374out_unlock:
375 spin_unlock_bh(&pool->sp_lock);
376}
377EXPORT_SYMBOL_GPL(svc_xprt_enqueue);
378
379/*
380 * Dequeue the first transport. Must be called with the pool->sp_lock held.
381 */
382static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
383{
384 struct svc_xprt *xprt;
385
386 if (list_empty(&pool->sp_sockets))
387 return NULL;
388
389 xprt = list_entry(pool->sp_sockets.next,
390 struct svc_xprt, xpt_ready);
391 list_del_init(&xprt->xpt_ready);
392
393 dprintk("svc: transport %p dequeued, inuse=%d\n",
394 xprt, atomic_read(&xprt->xpt_ref.refcount));
395
396 return xprt;
397}
398
399/*
400 * svc_xprt_received conditionally queues the transport for processing
401 * by another thread. The caller must hold the XPT_BUSY bit and must
402 * not thereafter touch transport data.
403 *
404 * Note: XPT_DATA only gets cleared when a read-attempt finds no (or
405 * insufficient) data.
406 */
407void svc_xprt_received(struct svc_xprt *xprt)
408{
409 BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags));
410 xprt->xpt_pool = NULL;
411 clear_bit(XPT_BUSY, &xprt->xpt_flags);
412 svc_xprt_enqueue(xprt);
413}
414EXPORT_SYMBOL_GPL(svc_xprt_received);
415
416/**
417 * svc_reserve - change the space reserved for the reply to a request.
418 * @rqstp: The request in question
419 * @space: new max space to reserve
420 *
421 * Each request reserves some space on the output queue of the transport
422 * to make sure the reply fits. This function reduces that reserved
423 * space to be the amount of space used already, plus @space.
424 *
425 */
426void svc_reserve(struct svc_rqst *rqstp, int space)
427{
428 space += rqstp->rq_res.head[0].iov_len;
429
430 if (space < rqstp->rq_reserved) {
431 struct svc_xprt *xprt = rqstp->rq_xprt;
432 atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved);
433 rqstp->rq_reserved = space;
434
435 svc_xprt_enqueue(xprt);
436 }
437}
438EXPORT_SYMBOL(svc_reserve);
439
440static void svc_xprt_release(struct svc_rqst *rqstp)
441{
442 struct svc_xprt *xprt = rqstp->rq_xprt;
443
444 rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp);
445
446 svc_free_res_pages(rqstp);
447 rqstp->rq_res.page_len = 0;
448 rqstp->rq_res.page_base = 0;
449
450 /* Reset response buffer and release
451 * the reservation.
452 * But first, check that enough space was reserved
453 * for the reply, otherwise we have a bug!
454 */
455 if ((rqstp->rq_res.len) > rqstp->rq_reserved)
456 printk(KERN_ERR "RPC request reserved %d but used %d\n",
457 rqstp->rq_reserved,
458 rqstp->rq_res.len);
459
460 rqstp->rq_res.head[0].iov_len = 0;
461 svc_reserve(rqstp, 0);
462 rqstp->rq_xprt = NULL;
463
464 svc_xprt_put(xprt);
465}
466
467/*
468 * External function to wake up a server waiting for data
469 * This really only makes sense for services like lockd
470 * which have exactly one thread anyway.
471 */
472void svc_wake_up(struct svc_serv *serv)
473{
474 struct svc_rqst *rqstp;
475 unsigned int i;
476 struct svc_pool *pool;
477
478 for (i = 0; i < serv->sv_nrpools; i++) {
479 pool = &serv->sv_pools[i];
480
481 spin_lock_bh(&pool->sp_lock);
482 if (!list_empty(&pool->sp_threads)) {
483 rqstp = list_entry(pool->sp_threads.next,
484 struct svc_rqst,
485 rq_list);
486 dprintk("svc: daemon %p woken up.\n", rqstp);
487 /*
488 svc_thread_dequeue(pool, rqstp);
489 rqstp->rq_xprt = NULL;
490 */
491 wake_up(&rqstp->rq_wait);
492 }
493 spin_unlock_bh(&pool->sp_lock);
494 }
495}
496EXPORT_SYMBOL(svc_wake_up);
497
498int svc_port_is_privileged(struct sockaddr *sin)
499{
500 switch (sin->sa_family) {
501 case AF_INET:
502 return ntohs(((struct sockaddr_in *)sin)->sin_port)
503 < PROT_SOCK;
504 case AF_INET6:
505 return ntohs(((struct sockaddr_in6 *)sin)->sin6_port)
506 < PROT_SOCK;
507 default:
508 return 0;
509 }
510}
511
512/*
513 * Make sure that we don't have too many active connections. If we
514 * have, something must be dropped.
515 *
516 * There's no point in trying to do random drop here for DoS
517 * prevention. The NFS clients does 1 reconnect in 15 seconds. An
518 * attacker can easily beat that.
519 *
520 * The only somewhat efficient mechanism would be if drop old
521 * connections from the same IP first. But right now we don't even
522 * record the client IP in svc_sock.
523 */
524static void svc_check_conn_limits(struct svc_serv *serv)
525{
526 if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) {
527 struct svc_xprt *xprt = NULL;
528 spin_lock_bh(&serv->sv_lock);
529 if (!list_empty(&serv->sv_tempsocks)) {
530 if (net_ratelimit()) {
531 /* Try to help the admin */
532 printk(KERN_NOTICE "%s: too many open "
533 "connections, consider increasing the "
534 "number of nfsd threads\n",
535 serv->sv_name);
536 }
537 /*
538 * Always select the oldest connection. It's not fair,
539 * but so is life
540 */
541 xprt = list_entry(serv->sv_tempsocks.prev,
542 struct svc_xprt,
543 xpt_list);
544 set_bit(XPT_CLOSE, &xprt->xpt_flags);
545 svc_xprt_get(xprt);
546 }
547 spin_unlock_bh(&serv->sv_lock);
548
549 if (xprt) {
550 svc_xprt_enqueue(xprt);
551 svc_xprt_put(xprt);
552 }
553 }
554}
555
556/*
557 * Receive the next request on any transport. This code is carefully
558 * organised not to touch any cachelines in the shared svc_serv
559 * structure, only cachelines in the local svc_pool.
560 */
561int svc_recv(struct svc_rqst *rqstp, long timeout)
562{
563 struct svc_xprt *xprt = NULL;
564 struct svc_serv *serv = rqstp->rq_server;
565 struct svc_pool *pool = rqstp->rq_pool;
566 int len, i;
567 int pages;
568 struct xdr_buf *arg;
569 DECLARE_WAITQUEUE(wait, current);
570
571 dprintk("svc: server %p waiting for data (to = %ld)\n",
572 rqstp, timeout);
573
574 if (rqstp->rq_xprt)
575 printk(KERN_ERR
576 "svc_recv: service %p, transport not NULL!\n",
577 rqstp);
578 if (waitqueue_active(&rqstp->rq_wait))
579 printk(KERN_ERR
580 "svc_recv: service %p, wait queue active!\n",
581 rqstp);
582
583 /* now allocate needed pages. If we get a failure, sleep briefly */
584 pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE;
585 for (i = 0; i < pages ; i++)
586 while (rqstp->rq_pages[i] == NULL) {
587 struct page *p = alloc_page(GFP_KERNEL);
588 if (!p) {
589 int j = msecs_to_jiffies(500);
590 schedule_timeout_uninterruptible(j);
591 }
592 rqstp->rq_pages[i] = p;
593 }
594 rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
595 BUG_ON(pages >= RPCSVC_MAXPAGES);
596
597 /* Make arg->head point to first page and arg->pages point to rest */
598 arg = &rqstp->rq_arg;
599 arg->head[0].iov_base = page_address(rqstp->rq_pages[0]);
600 arg->head[0].iov_len = PAGE_SIZE;
601 arg->pages = rqstp->rq_pages + 1;
602 arg->page_base = 0;
603 /* save at least one page for response */
604 arg->page_len = (pages-2)*PAGE_SIZE;
605 arg->len = (pages-1)*PAGE_SIZE;
606 arg->tail[0].iov_len = 0;
607
608 try_to_freeze();
609 cond_resched();
610 if (signalled())
611 return -EINTR;
612
613 spin_lock_bh(&pool->sp_lock);
614 xprt = svc_xprt_dequeue(pool);
615 if (xprt) {
616 rqstp->rq_xprt = xprt;
617 svc_xprt_get(xprt);
618 rqstp->rq_reserved = serv->sv_max_mesg;
619 atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
620 } else {
621 /* No data pending. Go to sleep */
622 svc_thread_enqueue(pool, rqstp);
623
624 /*
625 * We have to be able to interrupt this wait
626 * to bring down the daemons ...
627 */
628 set_current_state(TASK_INTERRUPTIBLE);
629 add_wait_queue(&rqstp->rq_wait, &wait);
630 spin_unlock_bh(&pool->sp_lock);
631
632 schedule_timeout(timeout);
633
634 try_to_freeze();
635
636 spin_lock_bh(&pool->sp_lock);
637 remove_wait_queue(&rqstp->rq_wait, &wait);
638
639 xprt = rqstp->rq_xprt;
640 if (!xprt) {
641 svc_thread_dequeue(pool, rqstp);
642 spin_unlock_bh(&pool->sp_lock);
643 dprintk("svc: server %p, no data yet\n", rqstp);
644 return signalled()? -EINTR : -EAGAIN;
645 }
646 }
647 spin_unlock_bh(&pool->sp_lock);
648
649 len = 0;
650 if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) {
651 dprintk("svc_recv: found XPT_CLOSE\n");
652 svc_delete_xprt(xprt);
653 } else if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) {
654 struct svc_xprt *newxpt;
655 newxpt = xprt->xpt_ops->xpo_accept(xprt);
656 if (newxpt) {
657 /*
658 * We know this module_get will succeed because the
659 * listener holds a reference too
660 */
661 __module_get(newxpt->xpt_class->xcl_owner);
662 svc_check_conn_limits(xprt->xpt_server);
663 spin_lock_bh(&serv->sv_lock);
664 set_bit(XPT_TEMP, &newxpt->xpt_flags);
665 list_add(&newxpt->xpt_list, &serv->sv_tempsocks);
666 serv->sv_tmpcnt++;
667 if (serv->sv_temptimer.function == NULL) {
668 /* setup timer to age temp transports */
669 setup_timer(&serv->sv_temptimer,
670 svc_age_temp_xprts,
671 (unsigned long)serv);
672 mod_timer(&serv->sv_temptimer,
673 jiffies + svc_conn_age_period * HZ);
674 }
675 spin_unlock_bh(&serv->sv_lock);
676 svc_xprt_received(newxpt);
677 }
678 svc_xprt_received(xprt);
679 } else {
680 dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
681 rqstp, pool->sp_id, xprt,
682 atomic_read(&xprt->xpt_ref.refcount));
683 rqstp->rq_deferred = svc_deferred_dequeue(xprt);
684 if (rqstp->rq_deferred) {
685 svc_xprt_received(xprt);
686 len = svc_deferred_recv(rqstp);
687 } else
688 len = xprt->xpt_ops->xpo_recvfrom(rqstp);
689 dprintk("svc: got len=%d\n", len);
690 }
691
692 /* No data, incomplete (TCP) read, or accept() */
693 if (len == 0 || len == -EAGAIN) {
694 rqstp->rq_res.len = 0;
695 svc_xprt_release(rqstp);
696 return -EAGAIN;
697 }
698 clear_bit(XPT_OLD, &xprt->xpt_flags);
699
700 rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp));
701 rqstp->rq_chandle.defer = svc_defer;
702
703 if (serv->sv_stats)
704 serv->sv_stats->netcnt++;
705 return len;
706}
707EXPORT_SYMBOL(svc_recv);
708
709/*
710 * Drop request
711 */
712void svc_drop(struct svc_rqst *rqstp)
713{
714 dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt);
715 svc_xprt_release(rqstp);
716}
717EXPORT_SYMBOL(svc_drop);
718
719/*
720 * Return reply to client.
721 */
722int svc_send(struct svc_rqst *rqstp)
723{
724 struct svc_xprt *xprt;
725 int len;
726 struct xdr_buf *xb;
727
728 xprt = rqstp->rq_xprt;
729 if (!xprt)
730 return -EFAULT;
731
732 /* release the receive skb before sending the reply */
733 rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp);
734
735 /* calculate over-all length */
736 xb = &rqstp->rq_res;
737 xb->len = xb->head[0].iov_len +
738 xb->page_len +
739 xb->tail[0].iov_len;
740
741 /* Grab mutex to serialize outgoing data. */
742 mutex_lock(&xprt->xpt_mutex);
743 if (test_bit(XPT_DEAD, &xprt->xpt_flags))
744 len = -ENOTCONN;
745 else
746 len = xprt->xpt_ops->xpo_sendto(rqstp);
747 mutex_unlock(&xprt->xpt_mutex);
748 svc_xprt_release(rqstp);
749
750 if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
751 return 0;
752 return len;
753}
754
755/*
756 * Timer function to close old temporary transports, using
757 * a mark-and-sweep algorithm.
758 */
759static void svc_age_temp_xprts(unsigned long closure)
760{
761 struct svc_serv *serv = (struct svc_serv *)closure;
762 struct svc_xprt *xprt;
763 struct list_head *le, *next;
764 LIST_HEAD(to_be_aged);
765
766 dprintk("svc_age_temp_xprts\n");
767
768 if (!spin_trylock_bh(&serv->sv_lock)) {
769 /* busy, try again 1 sec later */
770 dprintk("svc_age_temp_xprts: busy\n");
771 mod_timer(&serv->sv_temptimer, jiffies + HZ);
772 return;
773 }
774
775 list_for_each_safe(le, next, &serv->sv_tempsocks) {
776 xprt = list_entry(le, struct svc_xprt, xpt_list);
777
778 /* First time through, just mark it OLD. Second time
779 * through, close it. */
780 if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags))
781 continue;
782 if (atomic_read(&xprt->xpt_ref.refcount) > 1
783 || test_bit(XPT_BUSY, &xprt->xpt_flags))
784 continue;
785 svc_xprt_get(xprt);
786 list_move(le, &to_be_aged);
787 set_bit(XPT_CLOSE, &xprt->xpt_flags);
788 set_bit(XPT_DETACHED, &xprt->xpt_flags);
789 }
790 spin_unlock_bh(&serv->sv_lock);
791
792 while (!list_empty(&to_be_aged)) {
793 le = to_be_aged.next;
794 /* fiddling the xpt_list node is safe 'cos we're XPT_DETACHED */
795 list_del_init(le);
796 xprt = list_entry(le, struct svc_xprt, xpt_list);
797
798 dprintk("queuing xprt %p for closing\n", xprt);
799
800 /* a thread will dequeue and close it soon */
801 svc_xprt_enqueue(xprt);
802 svc_xprt_put(xprt);
803 }
804
805 mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ);
806}
807
808/*
809 * Remove a dead transport
810 */
811void svc_delete_xprt(struct svc_xprt *xprt)
812{
813 struct svc_serv *serv = xprt->xpt_server;
814
815 dprintk("svc: svc_delete_xprt(%p)\n", xprt);
816 xprt->xpt_ops->xpo_detach(xprt);
817
818 spin_lock_bh(&serv->sv_lock);
819 if (!test_and_set_bit(XPT_DETACHED, &xprt->xpt_flags))
820 list_del_init(&xprt->xpt_list);
821 /*
822 * We used to delete the transport from whichever list
823 * it's sk_xprt.xpt_ready node was on, but we don't actually
824 * need to. This is because the only time we're called
825 * while still attached to a queue, the queue itself
826 * is about to be destroyed (in svc_destroy).
827 */
828 if (!test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) {
829 BUG_ON(atomic_read(&xprt->xpt_ref.refcount) < 2);
830 if (test_bit(XPT_TEMP, &xprt->xpt_flags))
831 serv->sv_tmpcnt--;
832 svc_xprt_put(xprt);
833 }
834 spin_unlock_bh(&serv->sv_lock);
835}
836
837void svc_close_xprt(struct svc_xprt *xprt)
838{
839 set_bit(XPT_CLOSE, &xprt->xpt_flags);
840 if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags))
841 /* someone else will have to effect the close */
842 return;
843
844 svc_xprt_get(xprt);
845 svc_delete_xprt(xprt);
846 clear_bit(XPT_BUSY, &xprt->xpt_flags);
847 svc_xprt_put(xprt);
848}
849EXPORT_SYMBOL_GPL(svc_close_xprt);
850
851void svc_close_all(struct list_head *xprt_list)
852{
853 struct svc_xprt *xprt;
854 struct svc_xprt *tmp;
855
856 list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) {
857 set_bit(XPT_CLOSE, &xprt->xpt_flags);
858 if (test_bit(XPT_BUSY, &xprt->xpt_flags)) {
859 /* Waiting to be processed, but no threads left,
860 * So just remove it from the waiting list
861 */
862 list_del_init(&xprt->xpt_ready);
863 clear_bit(XPT_BUSY, &xprt->xpt_flags);
864 }
865 svc_close_xprt(xprt);
866 }
867}
868
869/*
870 * Handle defer and revisit of requests
871 */
872
873static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
874{
875 struct svc_deferred_req *dr =
876 container_of(dreq, struct svc_deferred_req, handle);
877 struct svc_xprt *xprt = dr->xprt;
878
879 if (too_many) {
880 svc_xprt_put(xprt);
881 kfree(dr);
882 return;
883 }
884 dprintk("revisit queued\n");
885 dr->xprt = NULL;
886 spin_lock(&xprt->xpt_lock);
887 list_add(&dr->handle.recent, &xprt->xpt_deferred);
888 spin_unlock(&xprt->xpt_lock);
889 set_bit(XPT_DEFERRED, &xprt->xpt_flags);
890 svc_xprt_enqueue(xprt);
891 svc_xprt_put(xprt);
892}
893
894/*
895 * Save the request off for later processing. The request buffer looks
896 * like this:
897 *
898 * <xprt-header><rpc-header><rpc-pagelist><rpc-tail>
899 *
900 * This code can only handle requests that consist of an xprt-header
901 * and rpc-header.
902 */
903static struct cache_deferred_req *svc_defer(struct cache_req *req)
904{
905 struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle);
906 struct svc_deferred_req *dr;
907
908 if (rqstp->rq_arg.page_len)
909 return NULL; /* if more than a page, give up FIXME */
910 if (rqstp->rq_deferred) {
911 dr = rqstp->rq_deferred;
912 rqstp->rq_deferred = NULL;
913 } else {
914 size_t skip;
915 size_t size;
916 /* FIXME maybe discard if size too large */
917 size = sizeof(struct svc_deferred_req) + rqstp->rq_arg.len;
918 dr = kmalloc(size, GFP_KERNEL);
919 if (dr == NULL)
920 return NULL;
921
922 dr->handle.owner = rqstp->rq_server;
923 dr->prot = rqstp->rq_prot;
924 memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen);
925 dr->addrlen = rqstp->rq_addrlen;
926 dr->daddr = rqstp->rq_daddr;
927 dr->argslen = rqstp->rq_arg.len >> 2;
928 dr->xprt_hlen = rqstp->rq_xprt_hlen;
929
930 /* back up head to the start of the buffer and copy */
931 skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
932 memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip,
933 dr->argslen << 2);
934 }
935 svc_xprt_get(rqstp->rq_xprt);
936 dr->xprt = rqstp->rq_xprt;
937
938 dr->handle.revisit = svc_revisit;
939 return &dr->handle;
940}
941
942/*
943 * recv data from a deferred request into an active one
944 */
945static int svc_deferred_recv(struct svc_rqst *rqstp)
946{
947 struct svc_deferred_req *dr = rqstp->rq_deferred;
948
949 /* setup iov_base past transport header */
950 rqstp->rq_arg.head[0].iov_base = dr->args + (dr->xprt_hlen>>2);
951 /* The iov_len does not include the transport header bytes */
952 rqstp->rq_arg.head[0].iov_len = (dr->argslen<<2) - dr->xprt_hlen;
953 rqstp->rq_arg.page_len = 0;
954 /* The rq_arg.len includes the transport header bytes */
955 rqstp->rq_arg.len = dr->argslen<<2;
956 rqstp->rq_prot = dr->prot;
957 memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen);
958 rqstp->rq_addrlen = dr->addrlen;
959 /* Save off transport header len in case we get deferred again */
960 rqstp->rq_xprt_hlen = dr->xprt_hlen;
961 rqstp->rq_daddr = dr->daddr;
962 rqstp->rq_respages = rqstp->rq_pages;
963 return (dr->argslen<<2) - dr->xprt_hlen;
964}
965
966
967static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt)
968{
969 struct svc_deferred_req *dr = NULL;
970
971 if (!test_bit(XPT_DEFERRED, &xprt->xpt_flags))
972 return NULL;
973 spin_lock(&xprt->xpt_lock);
974 clear_bit(XPT_DEFERRED, &xprt->xpt_flags);
975 if (!list_empty(&xprt->xpt_deferred)) {
976 dr = list_entry(xprt->xpt_deferred.next,
977 struct svc_deferred_req,
978 handle.recent);
979 list_del_init(&dr->handle.recent);
980 set_bit(XPT_DEFERRED, &xprt->xpt_flags);
981 }
982 spin_unlock(&xprt->xpt_lock);
983 return dr;
984}
985
986/*
987 * Return the transport instance pointer for the endpoint accepting
988 * connections/peer traffic from the specified transport class,
989 * address family and port.
990 *
991 * Specifying 0 for the address family or port is effectively a
992 * wild-card, and will result in matching the first transport in the
993 * service's list that has a matching class name.
994 */
995struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name,
996 int af, int port)
997{
998 struct svc_xprt *xprt;
999 struct svc_xprt *found = NULL;
1000
1001 /* Sanity check the args */
1002 if (!serv || !xcl_name)
1003 return found;
1004
1005 spin_lock_bh(&serv->sv_lock);
1006 list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) {
1007 if (strcmp(xprt->xpt_class->xcl_name, xcl_name))
1008 continue;
1009 if (af != AF_UNSPEC && af != xprt->xpt_local.ss_family)
1010 continue;
1011 if (port && port != svc_xprt_local_port(xprt))
1012 continue;
1013 found = xprt;
1014 svc_xprt_get(xprt);
1015 break;
1016 }
1017 spin_unlock_bh(&serv->sv_lock);
1018 return found;
1019}
1020EXPORT_SYMBOL_GPL(svc_find_xprt);
1021
1022/*
1023 * Format a buffer with a list of the active transports. A zero for
1024 * the buflen parameter disables target buffer overflow checking.
1025 */
1026int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen)
1027{
1028 struct svc_xprt *xprt;
1029 char xprt_str[64];
1030 int totlen = 0;
1031 int len;
1032
1033 /* Sanity check args */
1034 if (!serv)
1035 return 0;
1036
1037 spin_lock_bh(&serv->sv_lock);
1038 list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) {
1039 len = snprintf(xprt_str, sizeof(xprt_str),
1040 "%s %d\n", xprt->xpt_class->xcl_name,
1041 svc_xprt_local_port(xprt));
1042 /* If the string was truncated, replace with error string */
1043 if (len >= sizeof(xprt_str))
1044 strcpy(xprt_str, "name-too-long\n");
1045 /* Don't overflow buffer */
1046 len = strlen(xprt_str);
1047 if (buflen && (len + totlen >= buflen))
1048 break;
1049 strcpy(buf+totlen, xprt_str);
1050 totlen += len;
1051 }
1052 spin_unlock_bh(&serv->sv_lock);
1053 return totlen;
1054}
1055EXPORT_SYMBOL_GPL(svc_xprt_names);
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index af7c5f05c6e1..8a73cbb16052 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -57,11 +57,13 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
57 rqstp->rq_authop = aops; 57 rqstp->rq_authop = aops;
58 return aops->accept(rqstp, authp); 58 return aops->accept(rqstp, authp);
59} 59}
60EXPORT_SYMBOL(svc_authenticate);
60 61
61int svc_set_client(struct svc_rqst *rqstp) 62int svc_set_client(struct svc_rqst *rqstp)
62{ 63{
63 return rqstp->rq_authop->set_client(rqstp); 64 return rqstp->rq_authop->set_client(rqstp);
64} 65}
66EXPORT_SYMBOL(svc_set_client);
65 67
66/* A request, which was authenticated, has now executed. 68/* A request, which was authenticated, has now executed.
67 * Time to finalise the credentials and verifier 69 * Time to finalise the credentials and verifier
@@ -93,6 +95,7 @@ svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops)
93 spin_unlock(&authtab_lock); 95 spin_unlock(&authtab_lock);
94 return rv; 96 return rv;
95} 97}
98EXPORT_SYMBOL(svc_auth_register);
96 99
97void 100void
98svc_auth_unregister(rpc_authflavor_t flavor) 101svc_auth_unregister(rpc_authflavor_t flavor)
@@ -129,6 +132,7 @@ void auth_domain_put(struct auth_domain *dom)
129 spin_unlock(&auth_domain_lock); 132 spin_unlock(&auth_domain_lock);
130 } 133 }
131} 134}
135EXPORT_SYMBOL(auth_domain_put);
132 136
133struct auth_domain * 137struct auth_domain *
134auth_domain_lookup(char *name, struct auth_domain *new) 138auth_domain_lookup(char *name, struct auth_domain *new)
@@ -153,8 +157,10 @@ auth_domain_lookup(char *name, struct auth_domain *new)
153 spin_unlock(&auth_domain_lock); 157 spin_unlock(&auth_domain_lock);
154 return new; 158 return new;
155} 159}
160EXPORT_SYMBOL(auth_domain_lookup);
156 161
157struct auth_domain *auth_domain_find(char *name) 162struct auth_domain *auth_domain_find(char *name)
158{ 163{
159 return auth_domain_lookup(name, NULL); 164 return auth_domain_lookup(name, NULL);
160} 165}
166EXPORT_SYMBOL(auth_domain_find);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 411479411b21..3c64051e4555 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -63,6 +63,7 @@ struct auth_domain *unix_domain_find(char *name)
63 rv = auth_domain_lookup(name, &new->h); 63 rv = auth_domain_lookup(name, &new->h);
64 } 64 }
65} 65}
66EXPORT_SYMBOL(unix_domain_find);
66 67
67static void svcauth_unix_domain_release(struct auth_domain *dom) 68static void svcauth_unix_domain_release(struct auth_domain *dom)
68{ 69{
@@ -340,6 +341,7 @@ int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom)
340 else 341 else
341 return -ENOMEM; 342 return -ENOMEM;
342} 343}
344EXPORT_SYMBOL(auth_unix_add_addr);
343 345
344int auth_unix_forget_old(struct auth_domain *dom) 346int auth_unix_forget_old(struct auth_domain *dom)
345{ 347{
@@ -351,6 +353,7 @@ int auth_unix_forget_old(struct auth_domain *dom)
351 udom->addr_changes++; 353 udom->addr_changes++;
352 return 0; 354 return 0;
353} 355}
356EXPORT_SYMBOL(auth_unix_forget_old);
354 357
355struct auth_domain *auth_unix_lookup(struct in_addr addr) 358struct auth_domain *auth_unix_lookup(struct in_addr addr)
356{ 359{
@@ -375,50 +378,56 @@ struct auth_domain *auth_unix_lookup(struct in_addr addr)
375 cache_put(&ipm->h, &ip_map_cache); 378 cache_put(&ipm->h, &ip_map_cache);
376 return rv; 379 return rv;
377} 380}
381EXPORT_SYMBOL(auth_unix_lookup);
378 382
379void svcauth_unix_purge(void) 383void svcauth_unix_purge(void)
380{ 384{
381 cache_purge(&ip_map_cache); 385 cache_purge(&ip_map_cache);
382} 386}
387EXPORT_SYMBOL(svcauth_unix_purge);
383 388
384static inline struct ip_map * 389static inline struct ip_map *
385ip_map_cached_get(struct svc_rqst *rqstp) 390ip_map_cached_get(struct svc_rqst *rqstp)
386{ 391{
387 struct ip_map *ipm; 392 struct ip_map *ipm = NULL;
388 struct svc_sock *svsk = rqstp->rq_sock; 393 struct svc_xprt *xprt = rqstp->rq_xprt;
389 spin_lock(&svsk->sk_lock); 394
390 ipm = svsk->sk_info_authunix; 395 if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) {
391 if (ipm != NULL) { 396 spin_lock(&xprt->xpt_lock);
392 if (!cache_valid(&ipm->h)) { 397 ipm = xprt->xpt_auth_cache;
393 /* 398 if (ipm != NULL) {
394 * The entry has been invalidated since it was 399 if (!cache_valid(&ipm->h)) {
395 * remembered, e.g. by a second mount from the 400 /*
396 * same IP address. 401 * The entry has been invalidated since it was
397 */ 402 * remembered, e.g. by a second mount from the
398 svsk->sk_info_authunix = NULL; 403 * same IP address.
399 spin_unlock(&svsk->sk_lock); 404 */
400 cache_put(&ipm->h, &ip_map_cache); 405 xprt->xpt_auth_cache = NULL;
401 return NULL; 406 spin_unlock(&xprt->xpt_lock);
407 cache_put(&ipm->h, &ip_map_cache);
408 return NULL;
409 }
410 cache_get(&ipm->h);
402 } 411 }
403 cache_get(&ipm->h); 412 spin_unlock(&xprt->xpt_lock);
404 } 413 }
405 spin_unlock(&svsk->sk_lock);
406 return ipm; 414 return ipm;
407} 415}
408 416
409static inline void 417static inline void
410ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm) 418ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm)
411{ 419{
412 struct svc_sock *svsk = rqstp->rq_sock; 420 struct svc_xprt *xprt = rqstp->rq_xprt;
413 421
414 spin_lock(&svsk->sk_lock); 422 if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) {
415 if (svsk->sk_sock->type == SOCK_STREAM && 423 spin_lock(&xprt->xpt_lock);
416 svsk->sk_info_authunix == NULL) { 424 if (xprt->xpt_auth_cache == NULL) {
417 /* newly cached, keep the reference */ 425 /* newly cached, keep the reference */
418 svsk->sk_info_authunix = ipm; 426 xprt->xpt_auth_cache = ipm;
419 ipm = NULL; 427 ipm = NULL;
428 }
429 spin_unlock(&xprt->xpt_lock);
420 } 430 }
421 spin_unlock(&svsk->sk_lock);
422 if (ipm) 431 if (ipm)
423 cache_put(&ipm->h, &ip_map_cache); 432 cache_put(&ipm->h, &ip_map_cache);
424} 433}
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index c75bffeb89eb..1d3e5fcc2cc4 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * The server scheduling algorithm does not always distribute the load 6 * The server scheduling algorithm does not always distribute the load
7 * evenly when servicing a single client. May need to modify the 7 * evenly when servicing a single client. May need to modify the
8 * svc_sock_enqueue procedure... 8 * svc_xprt_enqueue procedure...
9 * 9 *
10 * TCP support is largely untested and may be a little slow. The problem 10 * TCP support is largely untested and may be a little slow. The problem
11 * is that we currently do two separate recvfrom's, one for the 4-byte 11 * is that we currently do two separate recvfrom's, one for the 4-byte
@@ -48,72 +48,40 @@
48#include <linux/sunrpc/svcsock.h> 48#include <linux/sunrpc/svcsock.h>
49#include <linux/sunrpc/stats.h> 49#include <linux/sunrpc/stats.h>
50 50
51/* SMP locking strategy: 51#define RPCDBG_FACILITY RPCDBG_SVCXPRT
52 *
53 * svc_pool->sp_lock protects most of the fields of that pool.
54 * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
55 * when both need to be taken (rare), svc_serv->sv_lock is first.
56 * BKL protects svc_serv->sv_nrthread.
57 * svc_sock->sk_lock protects the svc_sock->sk_deferred list
58 * and the ->sk_info_authunix cache.
59 * svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued multiply.
60 *
61 * Some flags can be set to certain values at any time
62 * providing that certain rules are followed:
63 *
64 * SK_CONN, SK_DATA, can be set or cleared at any time.
65 * after a set, svc_sock_enqueue must be called.
66 * after a clear, the socket must be read/accepted
67 * if this succeeds, it must be set again.
68 * SK_CLOSE can set at any time. It is never cleared.
69 * sk_inuse contains a bias of '1' until SK_DEAD is set.
70 * so when sk_inuse hits zero, we know the socket is dead
71 * and no-one is using it.
72 * SK_DEAD can only be set while SK_BUSY is held which ensures
73 * no other thread will be using the socket or will try to
74 * set SK_DEAD.
75 *
76 */
77
78#define RPCDBG_FACILITY RPCDBG_SVCSOCK
79 52
80 53
81static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, 54static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
82 int *errp, int flags); 55 int *errp, int flags);
83static void svc_delete_socket(struct svc_sock *svsk);
84static void svc_udp_data_ready(struct sock *, int); 56static void svc_udp_data_ready(struct sock *, int);
85static int svc_udp_recvfrom(struct svc_rqst *); 57static int svc_udp_recvfrom(struct svc_rqst *);
86static int svc_udp_sendto(struct svc_rqst *); 58static int svc_udp_sendto(struct svc_rqst *);
87static void svc_close_socket(struct svc_sock *svsk); 59static void svc_sock_detach(struct svc_xprt *);
88 60static void svc_sock_free(struct svc_xprt *);
89static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk);
90static int svc_deferred_recv(struct svc_rqst *rqstp);
91static struct cache_deferred_req *svc_defer(struct cache_req *req);
92
93/* apparently the "standard" is that clients close
94 * idle connections after 5 minutes, servers after
95 * 6 minutes
96 * http://www.connectathon.org/talks96/nfstcp.pdf
97 */
98static int svc_conn_age_period = 6*60;
99 61
62static struct svc_xprt *svc_create_socket(struct svc_serv *, int,
63 struct sockaddr *, int, int);
100#ifdef CONFIG_DEBUG_LOCK_ALLOC 64#ifdef CONFIG_DEBUG_LOCK_ALLOC
101static struct lock_class_key svc_key[2]; 65static struct lock_class_key svc_key[2];
102static struct lock_class_key svc_slock_key[2]; 66static struct lock_class_key svc_slock_key[2];
103 67
104static inline void svc_reclassify_socket(struct socket *sock) 68static void svc_reclassify_socket(struct socket *sock)
105{ 69{
106 struct sock *sk = sock->sk; 70 struct sock *sk = sock->sk;
107 BUG_ON(sock_owned_by_user(sk)); 71 BUG_ON(sock_owned_by_user(sk));
108 switch (sk->sk_family) { 72 switch (sk->sk_family) {
109 case AF_INET: 73 case AF_INET:
110 sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD", 74 sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD",
111 &svc_slock_key[0], "sk_lock-AF_INET-NFSD", &svc_key[0]); 75 &svc_slock_key[0],
76 "sk_xprt.xpt_lock-AF_INET-NFSD",
77 &svc_key[0]);
112 break; 78 break;
113 79
114 case AF_INET6: 80 case AF_INET6:
115 sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD", 81 sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD",
116 &svc_slock_key[1], "sk_lock-AF_INET6-NFSD", &svc_key[1]); 82 &svc_slock_key[1],
83 "sk_xprt.xpt_lock-AF_INET6-NFSD",
84 &svc_key[1]);
117 break; 85 break;
118 86
119 default: 87 default:
@@ -121,81 +89,26 @@ static inline void svc_reclassify_socket(struct socket *sock)
121 } 89 }
122} 90}
123#else 91#else
124static inline void svc_reclassify_socket(struct socket *sock) 92static void svc_reclassify_socket(struct socket *sock)
125{ 93{
126} 94}
127#endif 95#endif
128 96
129static char *__svc_print_addr(struct sockaddr *addr, char *buf, size_t len)
130{
131 switch (addr->sa_family) {
132 case AF_INET:
133 snprintf(buf, len, "%u.%u.%u.%u, port=%u",
134 NIPQUAD(((struct sockaddr_in *) addr)->sin_addr),
135 ntohs(((struct sockaddr_in *) addr)->sin_port));
136 break;
137
138 case AF_INET6:
139 snprintf(buf, len, "%x:%x:%x:%x:%x:%x:%x:%x, port=%u",
140 NIP6(((struct sockaddr_in6 *) addr)->sin6_addr),
141 ntohs(((struct sockaddr_in6 *) addr)->sin6_port));
142 break;
143
144 default:
145 snprintf(buf, len, "unknown address type: %d", addr->sa_family);
146 break;
147 }
148 return buf;
149}
150
151/**
152 * svc_print_addr - Format rq_addr field for printing
153 * @rqstp: svc_rqst struct containing address to print
154 * @buf: target buffer for formatted address
155 * @len: length of target buffer
156 *
157 */
158char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)
159{
160 return __svc_print_addr(svc_addr(rqstp), buf, len);
161}
162EXPORT_SYMBOL_GPL(svc_print_addr);
163
164/*
165 * Queue up an idle server thread. Must have pool->sp_lock held.
166 * Note: this is really a stack rather than a queue, so that we only
167 * use as many different threads as we need, and the rest don't pollute
168 * the cache.
169 */
170static inline void
171svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp)
172{
173 list_add(&rqstp->rq_list, &pool->sp_threads);
174}
175
176/*
177 * Dequeue an nfsd thread. Must have pool->sp_lock held.
178 */
179static inline void
180svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp)
181{
182 list_del(&rqstp->rq_list);
183}
184
185/* 97/*
186 * Release an skbuff after use 98 * Release an skbuff after use
187 */ 99 */
188static inline void 100static void svc_release_skb(struct svc_rqst *rqstp)
189svc_release_skb(struct svc_rqst *rqstp)
190{ 101{
191 struct sk_buff *skb = rqstp->rq_skbuff; 102 struct sk_buff *skb = rqstp->rq_xprt_ctxt;
192 struct svc_deferred_req *dr = rqstp->rq_deferred; 103 struct svc_deferred_req *dr = rqstp->rq_deferred;
193 104
194 if (skb) { 105 if (skb) {
195 rqstp->rq_skbuff = NULL; 106 struct svc_sock *svsk =
107 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
108 rqstp->rq_xprt_ctxt = NULL;
196 109
197 dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); 110 dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
198 skb_free_datagram(rqstp->rq_sock->sk_sk, skb); 111 skb_free_datagram(svsk->sk_sk, skb);
199 } 112 }
200 if (dr) { 113 if (dr) {
201 rqstp->rq_deferred = NULL; 114 rqstp->rq_deferred = NULL;
@@ -203,253 +116,6 @@ svc_release_skb(struct svc_rqst *rqstp)
203 } 116 }
204} 117}
205 118
206/*
207 * Any space to write?
208 */
209static inline unsigned long
210svc_sock_wspace(struct svc_sock *svsk)
211{
212 int wspace;
213
214 if (svsk->sk_sock->type == SOCK_STREAM)
215 wspace = sk_stream_wspace(svsk->sk_sk);
216 else
217 wspace = sock_wspace(svsk->sk_sk);
218
219 return wspace;
220}
221
222/*
223 * Queue up a socket with data pending. If there are idle nfsd
224 * processes, wake 'em up.
225 *
226 */
227static void
228svc_sock_enqueue(struct svc_sock *svsk)
229{
230 struct svc_serv *serv = svsk->sk_server;
231 struct svc_pool *pool;
232 struct svc_rqst *rqstp;
233 int cpu;
234
235 if (!(svsk->sk_flags &
236 ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) ))
237 return;
238 if (test_bit(SK_DEAD, &svsk->sk_flags))
239 return;
240
241 cpu = get_cpu();
242 pool = svc_pool_for_cpu(svsk->sk_server, cpu);
243 put_cpu();
244
245 spin_lock_bh(&pool->sp_lock);
246
247 if (!list_empty(&pool->sp_threads) &&
248 !list_empty(&pool->sp_sockets))
249 printk(KERN_ERR
250 "svc_sock_enqueue: threads and sockets both waiting??\n");
251
252 if (test_bit(SK_DEAD, &svsk->sk_flags)) {
253 /* Don't enqueue dead sockets */
254 dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk);
255 goto out_unlock;
256 }
257
258 /* Mark socket as busy. It will remain in this state until the
259 * server has processed all pending data and put the socket back
260 * on the idle list. We update SK_BUSY atomically because
261 * it also guards against trying to enqueue the svc_sock twice.
262 */
263 if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) {
264 /* Don't enqueue socket while already enqueued */
265 dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk);
266 goto out_unlock;
267 }
268 BUG_ON(svsk->sk_pool != NULL);
269 svsk->sk_pool = pool;
270
271 set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
272 if (((atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg)*2
273 > svc_sock_wspace(svsk))
274 && !test_bit(SK_CLOSE, &svsk->sk_flags)
275 && !test_bit(SK_CONN, &svsk->sk_flags)) {
276 /* Don't enqueue while not enough space for reply */
277 dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n",
278 svsk->sk_sk, atomic_read(&svsk->sk_reserved)+serv->sv_max_mesg,
279 svc_sock_wspace(svsk));
280 svsk->sk_pool = NULL;
281 clear_bit(SK_BUSY, &svsk->sk_flags);
282 goto out_unlock;
283 }
284 clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
285
286
287 if (!list_empty(&pool->sp_threads)) {
288 rqstp = list_entry(pool->sp_threads.next,
289 struct svc_rqst,
290 rq_list);
291 dprintk("svc: socket %p served by daemon %p\n",
292 svsk->sk_sk, rqstp);
293 svc_thread_dequeue(pool, rqstp);
294 if (rqstp->rq_sock)
295 printk(KERN_ERR
296 "svc_sock_enqueue: server %p, rq_sock=%p!\n",
297 rqstp, rqstp->rq_sock);
298 rqstp->rq_sock = svsk;
299 atomic_inc(&svsk->sk_inuse);
300 rqstp->rq_reserved = serv->sv_max_mesg;
301 atomic_add(rqstp->rq_reserved, &svsk->sk_reserved);
302 BUG_ON(svsk->sk_pool != pool);
303 wake_up(&rqstp->rq_wait);
304 } else {
305 dprintk("svc: socket %p put into queue\n", svsk->sk_sk);
306 list_add_tail(&svsk->sk_ready, &pool->sp_sockets);
307 BUG_ON(svsk->sk_pool != pool);
308 }
309
310out_unlock:
311 spin_unlock_bh(&pool->sp_lock);
312}
313
314/*
315 * Dequeue the first socket. Must be called with the pool->sp_lock held.
316 */
317static inline struct svc_sock *
318svc_sock_dequeue(struct svc_pool *pool)
319{
320 struct svc_sock *svsk;
321
322 if (list_empty(&pool->sp_sockets))
323 return NULL;
324
325 svsk = list_entry(pool->sp_sockets.next,
326 struct svc_sock, sk_ready);
327 list_del_init(&svsk->sk_ready);
328
329 dprintk("svc: socket %p dequeued, inuse=%d\n",
330 svsk->sk_sk, atomic_read(&svsk->sk_inuse));
331
332 return svsk;
333}
334
335/*
336 * Having read something from a socket, check whether it
337 * needs to be re-enqueued.
338 * Note: SK_DATA only gets cleared when a read-attempt finds
339 * no (or insufficient) data.
340 */
341static inline void
342svc_sock_received(struct svc_sock *svsk)
343{
344 svsk->sk_pool = NULL;
345 clear_bit(SK_BUSY, &svsk->sk_flags);
346 svc_sock_enqueue(svsk);
347}
348
349
350/**
351 * svc_reserve - change the space reserved for the reply to a request.
352 * @rqstp: The request in question
353 * @space: new max space to reserve
354 *
355 * Each request reserves some space on the output queue of the socket
356 * to make sure the reply fits. This function reduces that reserved
357 * space to be the amount of space used already, plus @space.
358 *
359 */
360void svc_reserve(struct svc_rqst *rqstp, int space)
361{
362 space += rqstp->rq_res.head[0].iov_len;
363
364 if (space < rqstp->rq_reserved) {
365 struct svc_sock *svsk = rqstp->rq_sock;
366 atomic_sub((rqstp->rq_reserved - space), &svsk->sk_reserved);
367 rqstp->rq_reserved = space;
368
369 svc_sock_enqueue(svsk);
370 }
371}
372
373/*
374 * Release a socket after use.
375 */
376static inline void
377svc_sock_put(struct svc_sock *svsk)
378{
379 if (atomic_dec_and_test(&svsk->sk_inuse)) {
380 BUG_ON(! test_bit(SK_DEAD, &svsk->sk_flags));
381
382 dprintk("svc: releasing dead socket\n");
383 if (svsk->sk_sock->file)
384 sockfd_put(svsk->sk_sock);
385 else
386 sock_release(svsk->sk_sock);
387 if (svsk->sk_info_authunix != NULL)
388 svcauth_unix_info_release(svsk->sk_info_authunix);
389 kfree(svsk);
390 }
391}
392
393static void
394svc_sock_release(struct svc_rqst *rqstp)
395{
396 struct svc_sock *svsk = rqstp->rq_sock;
397
398 svc_release_skb(rqstp);
399
400 svc_free_res_pages(rqstp);
401 rqstp->rq_res.page_len = 0;
402 rqstp->rq_res.page_base = 0;
403
404
405 /* Reset response buffer and release
406 * the reservation.
407 * But first, check that enough space was reserved
408 * for the reply, otherwise we have a bug!
409 */
410 if ((rqstp->rq_res.len) > rqstp->rq_reserved)
411 printk(KERN_ERR "RPC request reserved %d but used %d\n",
412 rqstp->rq_reserved,
413 rqstp->rq_res.len);
414
415 rqstp->rq_res.head[0].iov_len = 0;
416 svc_reserve(rqstp, 0);
417 rqstp->rq_sock = NULL;
418
419 svc_sock_put(svsk);
420}
421
422/*
423 * External function to wake up a server waiting for data
424 * This really only makes sense for services like lockd
425 * which have exactly one thread anyway.
426 */
427void
428svc_wake_up(struct svc_serv *serv)
429{
430 struct svc_rqst *rqstp;
431 unsigned int i;
432 struct svc_pool *pool;
433
434 for (i = 0; i < serv->sv_nrpools; i++) {
435 pool = &serv->sv_pools[i];
436
437 spin_lock_bh(&pool->sp_lock);
438 if (!list_empty(&pool->sp_threads)) {
439 rqstp = list_entry(pool->sp_threads.next,
440 struct svc_rqst,
441 rq_list);
442 dprintk("svc: daemon %p woken up.\n", rqstp);
443 /*
444 svc_thread_dequeue(pool, rqstp);
445 rqstp->rq_sock = NULL;
446 */
447 wake_up(&rqstp->rq_wait);
448 }
449 spin_unlock_bh(&pool->sp_lock);
450 }
451}
452
453union svc_pktinfo_u { 119union svc_pktinfo_u {
454 struct in_pktinfo pkti; 120 struct in_pktinfo pkti;
455 struct in6_pktinfo pkti6; 121 struct in6_pktinfo pkti6;
@@ -459,7 +125,9 @@ union svc_pktinfo_u {
459 125
460static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) 126static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
461{ 127{
462 switch (rqstp->rq_sock->sk_sk->sk_family) { 128 struct svc_sock *svsk =
129 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
130 switch (svsk->sk_sk->sk_family) {
463 case AF_INET: { 131 case AF_INET: {
464 struct in_pktinfo *pki = CMSG_DATA(cmh); 132 struct in_pktinfo *pki = CMSG_DATA(cmh);
465 133
@@ -489,10 +157,10 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
489/* 157/*
490 * Generic sendto routine 158 * Generic sendto routine
491 */ 159 */
492static int 160static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
493svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
494{ 161{
495 struct svc_sock *svsk = rqstp->rq_sock; 162 struct svc_sock *svsk =
163 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
496 struct socket *sock = svsk->sk_sock; 164 struct socket *sock = svsk->sk_sock;
497 int slen; 165 int slen;
498 union { 166 union {
@@ -565,7 +233,7 @@ svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
565 } 233 }
566out: 234out:
567 dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n", 235 dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n",
568 rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, 236 svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
569 xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); 237 xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
570 238
571 return len; 239 return len;
@@ -602,7 +270,7 @@ svc_sock_names(char *buf, struct svc_serv *serv, char *toclose)
602 if (!serv) 270 if (!serv)
603 return 0; 271 return 0;
604 spin_lock_bh(&serv->sv_lock); 272 spin_lock_bh(&serv->sv_lock);
605 list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) { 273 list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list) {
606 int onelen = one_sock_name(buf+len, svsk); 274 int onelen = one_sock_name(buf+len, svsk);
607 if (toclose && strcmp(toclose, buf+len) == 0) 275 if (toclose && strcmp(toclose, buf+len) == 0)
608 closesk = svsk; 276 closesk = svsk;
@@ -614,7 +282,7 @@ svc_sock_names(char *buf, struct svc_serv *serv, char *toclose)
614 /* Should unregister with portmap, but you cannot 282 /* Should unregister with portmap, but you cannot
615 * unregister just one protocol... 283 * unregister just one protocol...
616 */ 284 */
617 svc_close_socket(closesk); 285 svc_close_xprt(&closesk->sk_xprt);
618 else if (toclose) 286 else if (toclose)
619 return -ENOENT; 287 return -ENOENT;
620 return len; 288 return len;
@@ -624,8 +292,7 @@ EXPORT_SYMBOL(svc_sock_names);
624/* 292/*
625 * Check input queue length 293 * Check input queue length
626 */ 294 */
627static int 295static int svc_recv_available(struct svc_sock *svsk)
628svc_recv_available(struct svc_sock *svsk)
629{ 296{
630 struct socket *sock = svsk->sk_sock; 297 struct socket *sock = svsk->sk_sock;
631 int avail, err; 298 int avail, err;
@@ -638,48 +305,31 @@ svc_recv_available(struct svc_sock *svsk)
638/* 305/*
639 * Generic recvfrom routine. 306 * Generic recvfrom routine.
640 */ 307 */
641static int 308static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
642svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen) 309 int buflen)
643{ 310{
644 struct svc_sock *svsk = rqstp->rq_sock; 311 struct svc_sock *svsk =
312 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
645 struct msghdr msg = { 313 struct msghdr msg = {
646 .msg_flags = MSG_DONTWAIT, 314 .msg_flags = MSG_DONTWAIT,
647 }; 315 };
648 struct sockaddr *sin;
649 int len; 316 int len;
650 317
318 rqstp->rq_xprt_hlen = 0;
319
651 len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen, 320 len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen,
652 msg.msg_flags); 321 msg.msg_flags);
653 322
654 /* sock_recvmsg doesn't fill in the name/namelen, so we must..
655 */
656 memcpy(&rqstp->rq_addr, &svsk->sk_remote, svsk->sk_remotelen);
657 rqstp->rq_addrlen = svsk->sk_remotelen;
658
659 /* Destination address in request is needed for binding the
660 * source address in RPC callbacks later.
661 */
662 sin = (struct sockaddr *)&svsk->sk_local;
663 switch (sin->sa_family) {
664 case AF_INET:
665 rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr;
666 break;
667 case AF_INET6:
668 rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr;
669 break;
670 }
671
672 dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", 323 dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
673 svsk, iov[0].iov_base, iov[0].iov_len, len); 324 svsk, iov[0].iov_base, iov[0].iov_len, len);
674
675 return len; 325 return len;
676} 326}
677 327
678/* 328/*
679 * Set socket snd and rcv buffer lengths 329 * Set socket snd and rcv buffer lengths
680 */ 330 */
681static inline void 331static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
682svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv) 332 unsigned int rcv)
683{ 333{
684#if 0 334#if 0
685 mm_segment_t oldfs; 335 mm_segment_t oldfs;
@@ -704,16 +354,16 @@ svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv)
704/* 354/*
705 * INET callback when data has been received on the socket. 355 * INET callback when data has been received on the socket.
706 */ 356 */
707static void 357static void svc_udp_data_ready(struct sock *sk, int count)
708svc_udp_data_ready(struct sock *sk, int count)
709{ 358{
710 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 359 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
711 360
712 if (svsk) { 361 if (svsk) {
713 dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", 362 dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n",
714 svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags)); 363 svsk, sk, count,
715 set_bit(SK_DATA, &svsk->sk_flags); 364 test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
716 svc_sock_enqueue(svsk); 365 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
366 svc_xprt_enqueue(&svsk->sk_xprt);
717 } 367 }
718 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 368 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
719 wake_up_interruptible(sk->sk_sleep); 369 wake_up_interruptible(sk->sk_sleep);
@@ -722,15 +372,14 @@ svc_udp_data_ready(struct sock *sk, int count)
722/* 372/*
723 * INET callback when space is newly available on the socket. 373 * INET callback when space is newly available on the socket.
724 */ 374 */
725static void 375static void svc_write_space(struct sock *sk)
726svc_write_space(struct sock *sk)
727{ 376{
728 struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); 377 struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
729 378
730 if (svsk) { 379 if (svsk) {
731 dprintk("svc: socket %p(inet %p), write_space busy=%d\n", 380 dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
732 svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags)); 381 svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
733 svc_sock_enqueue(svsk); 382 svc_xprt_enqueue(&svsk->sk_xprt);
734 } 383 }
735 384
736 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) { 385 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) {
@@ -740,10 +389,19 @@ svc_write_space(struct sock *sk)
740 } 389 }
741} 390}
742 391
743static inline void svc_udp_get_dest_address(struct svc_rqst *rqstp, 392/*
744 struct cmsghdr *cmh) 393 * Copy the UDP datagram's destination address to the rqstp structure.
394 * The 'destination' address in this case is the address to which the
395 * peer sent the datagram, i.e. our local address. For multihomed
396 * hosts, this can change from msg to msg. Note that only the IP
397 * address changes, the port number should remain the same.
398 */
399static void svc_udp_get_dest_address(struct svc_rqst *rqstp,
400 struct cmsghdr *cmh)
745{ 401{
746 switch (rqstp->rq_sock->sk_sk->sk_family) { 402 struct svc_sock *svsk =
403 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
404 switch (svsk->sk_sk->sk_family) {
747 case AF_INET: { 405 case AF_INET: {
748 struct in_pktinfo *pki = CMSG_DATA(cmh); 406 struct in_pktinfo *pki = CMSG_DATA(cmh);
749 rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr; 407 rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr;
@@ -760,11 +418,11 @@ static inline void svc_udp_get_dest_address(struct svc_rqst *rqstp,
760/* 418/*
761 * Receive a datagram from a UDP socket. 419 * Receive a datagram from a UDP socket.
762 */ 420 */
763static int 421static int svc_udp_recvfrom(struct svc_rqst *rqstp)
764svc_udp_recvfrom(struct svc_rqst *rqstp)
765{ 422{
766 struct svc_sock *svsk = rqstp->rq_sock; 423 struct svc_sock *svsk =
767 struct svc_serv *serv = svsk->sk_server; 424 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
425 struct svc_serv *serv = svsk->sk_xprt.xpt_server;
768 struct sk_buff *skb; 426 struct sk_buff *skb;
769 union { 427 union {
770 struct cmsghdr hdr; 428 struct cmsghdr hdr;
@@ -779,7 +437,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
779 .msg_flags = MSG_DONTWAIT, 437 .msg_flags = MSG_DONTWAIT,
780 }; 438 };
781 439
782 if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) 440 if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
783 /* udp sockets need large rcvbuf as all pending 441 /* udp sockets need large rcvbuf as all pending
784 * requests are still in that buffer. sndbuf must 442 * requests are still in that buffer. sndbuf must
785 * also be large enough that there is enough space 443 * also be large enough that there is enough space
@@ -792,17 +450,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
792 (serv->sv_nrthreads+3) * serv->sv_max_mesg, 450 (serv->sv_nrthreads+3) * serv->sv_max_mesg,
793 (serv->sv_nrthreads+3) * serv->sv_max_mesg); 451 (serv->sv_nrthreads+3) * serv->sv_max_mesg);
794 452
795 if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { 453 clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
796 svc_sock_received(svsk);
797 return svc_deferred_recv(rqstp);
798 }
799
800 if (test_bit(SK_CLOSE, &svsk->sk_flags)) {
801 svc_delete_socket(svsk);
802 return 0;
803 }
804
805 clear_bit(SK_DATA, &svsk->sk_flags);
806 skb = NULL; 454 skb = NULL;
807 err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, 455 err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
808 0, 0, MSG_PEEK | MSG_DONTWAIT); 456 0, 0, MSG_PEEK | MSG_DONTWAIT);
@@ -813,24 +461,27 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
813 if (err != -EAGAIN) { 461 if (err != -EAGAIN) {
814 /* possibly an icmp error */ 462 /* possibly an icmp error */
815 dprintk("svc: recvfrom returned error %d\n", -err); 463 dprintk("svc: recvfrom returned error %d\n", -err);
816 set_bit(SK_DATA, &svsk->sk_flags); 464 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
817 } 465 }
818 svc_sock_received(svsk); 466 svc_xprt_received(&svsk->sk_xprt);
819 return -EAGAIN; 467 return -EAGAIN;
820 } 468 }
821 rqstp->rq_addrlen = sizeof(rqstp->rq_addr); 469 len = svc_addr_len(svc_addr(rqstp));
470 if (len < 0)
471 return len;
472 rqstp->rq_addrlen = len;
822 if (skb->tstamp.tv64 == 0) { 473 if (skb->tstamp.tv64 == 0) {
823 skb->tstamp = ktime_get_real(); 474 skb->tstamp = ktime_get_real();
824 /* Don't enable netstamp, sunrpc doesn't 475 /* Don't enable netstamp, sunrpc doesn't
825 need that much accuracy */ 476 need that much accuracy */
826 } 477 }
827 svsk->sk_sk->sk_stamp = skb->tstamp; 478 svsk->sk_sk->sk_stamp = skb->tstamp;
828 set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ 479 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */
829 480
830 /* 481 /*
831 * Maybe more packets - kick another thread ASAP. 482 * Maybe more packets - kick another thread ASAP.
832 */ 483 */
833 svc_sock_received(svsk); 484 svc_xprt_received(&svsk->sk_xprt);
834 485
835 len = skb->len - sizeof(struct udphdr); 486 len = skb->len - sizeof(struct udphdr);
836 rqstp->rq_arg.len = len; 487 rqstp->rq_arg.len = len;
@@ -861,13 +512,14 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
861 skb_free_datagram(svsk->sk_sk, skb); 512 skb_free_datagram(svsk->sk_sk, skb);
862 } else { 513 } else {
863 /* we can use it in-place */ 514 /* we can use it in-place */
864 rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); 515 rqstp->rq_arg.head[0].iov_base = skb->data +
516 sizeof(struct udphdr);
865 rqstp->rq_arg.head[0].iov_len = len; 517 rqstp->rq_arg.head[0].iov_len = len;
866 if (skb_checksum_complete(skb)) { 518 if (skb_checksum_complete(skb)) {
867 skb_free_datagram(svsk->sk_sk, skb); 519 skb_free_datagram(svsk->sk_sk, skb);
868 return 0; 520 return 0;
869 } 521 }
870 rqstp->rq_skbuff = skb; 522 rqstp->rq_xprt_ctxt = skb;
871 } 523 }
872 524
873 rqstp->rq_arg.page_base = 0; 525 rqstp->rq_arg.page_base = 0;
@@ -900,27 +552,81 @@ svc_udp_sendto(struct svc_rqst *rqstp)
900 return error; 552 return error;
901} 553}
902 554
903static void 555static void svc_udp_prep_reply_hdr(struct svc_rqst *rqstp)
904svc_udp_init(struct svc_sock *svsk) 556{
557}
558
559static int svc_udp_has_wspace(struct svc_xprt *xprt)
560{
561 struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
562 struct svc_serv *serv = xprt->xpt_server;
563 unsigned long required;
564
565 /*
566 * Set the SOCK_NOSPACE flag before checking the available
567 * sock space.
568 */
569 set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
570 required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg;
571 if (required*2 > sock_wspace(svsk->sk_sk))
572 return 0;
573 clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
574 return 1;
575}
576
577static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt)
578{
579 BUG();
580 return NULL;
581}
582
583static struct svc_xprt *svc_udp_create(struct svc_serv *serv,
584 struct sockaddr *sa, int salen,
585 int flags)
586{
587 return svc_create_socket(serv, IPPROTO_UDP, sa, salen, flags);
588}
589
590static struct svc_xprt_ops svc_udp_ops = {
591 .xpo_create = svc_udp_create,
592 .xpo_recvfrom = svc_udp_recvfrom,
593 .xpo_sendto = svc_udp_sendto,
594 .xpo_release_rqst = svc_release_skb,
595 .xpo_detach = svc_sock_detach,
596 .xpo_free = svc_sock_free,
597 .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
598 .xpo_has_wspace = svc_udp_has_wspace,
599 .xpo_accept = svc_udp_accept,
600};
601
602static struct svc_xprt_class svc_udp_class = {
603 .xcl_name = "udp",
604 .xcl_owner = THIS_MODULE,
605 .xcl_ops = &svc_udp_ops,
606 .xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP,
607};
608
609static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
905{ 610{
906 int one = 1; 611 int one = 1;
907 mm_segment_t oldfs; 612 mm_segment_t oldfs;
908 613
614 svc_xprt_init(&svc_udp_class, &svsk->sk_xprt, serv);
615 clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
909 svsk->sk_sk->sk_data_ready = svc_udp_data_ready; 616 svsk->sk_sk->sk_data_ready = svc_udp_data_ready;
910 svsk->sk_sk->sk_write_space = svc_write_space; 617 svsk->sk_sk->sk_write_space = svc_write_space;
911 svsk->sk_recvfrom = svc_udp_recvfrom;
912 svsk->sk_sendto = svc_udp_sendto;
913 618
914 /* initialise setting must have enough space to 619 /* initialise setting must have enough space to
915 * receive and respond to one request. 620 * receive and respond to one request.
916 * svc_udp_recvfrom will re-adjust if necessary 621 * svc_udp_recvfrom will re-adjust if necessary
917 */ 622 */
918 svc_sock_setbufsize(svsk->sk_sock, 623 svc_sock_setbufsize(svsk->sk_sock,
919 3 * svsk->sk_server->sv_max_mesg, 624 3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
920 3 * svsk->sk_server->sv_max_mesg); 625 3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
921 626
922 set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */ 627 /* data might have come in before data_ready set up */
923 set_bit(SK_CHNGBUF, &svsk->sk_flags); 628 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
629 set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
924 630
925 oldfs = get_fs(); 631 oldfs = get_fs();
926 set_fs(KERNEL_DS); 632 set_fs(KERNEL_DS);
@@ -934,8 +640,7 @@ svc_udp_init(struct svc_sock *svsk)
934 * A data_ready event on a listening socket means there's a connection 640 * A data_ready event on a listening socket means there's a connection
935 * pending. Do not use state_change as a substitute for it. 641 * pending. Do not use state_change as a substitute for it.
936 */ 642 */
937static void 643static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
938svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
939{ 644{
940 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 645 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
941 646
@@ -954,8 +659,8 @@ svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
954 */ 659 */
955 if (sk->sk_state == TCP_LISTEN) { 660 if (sk->sk_state == TCP_LISTEN) {
956 if (svsk) { 661 if (svsk) {
957 set_bit(SK_CONN, &svsk->sk_flags); 662 set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
958 svc_sock_enqueue(svsk); 663 svc_xprt_enqueue(&svsk->sk_xprt);
959 } else 664 } else
960 printk("svc: socket %p: no user data\n", sk); 665 printk("svc: socket %p: no user data\n", sk);
961 } 666 }
@@ -967,8 +672,7 @@ svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
967/* 672/*
968 * A state change on a connected socket means it's dying or dead. 673 * A state change on a connected socket means it's dying or dead.
969 */ 674 */
970static void 675static void svc_tcp_state_change(struct sock *sk)
971svc_tcp_state_change(struct sock *sk)
972{ 676{
973 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 677 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
974 678
@@ -978,51 +682,36 @@ svc_tcp_state_change(struct sock *sk)
978 if (!svsk) 682 if (!svsk)
979 printk("svc: socket %p: no user data\n", sk); 683 printk("svc: socket %p: no user data\n", sk);
980 else { 684 else {
981 set_bit(SK_CLOSE, &svsk->sk_flags); 685 set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
982 svc_sock_enqueue(svsk); 686 svc_xprt_enqueue(&svsk->sk_xprt);
983 } 687 }
984 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 688 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
985 wake_up_interruptible_all(sk->sk_sleep); 689 wake_up_interruptible_all(sk->sk_sleep);
986} 690}
987 691
988static void 692static void svc_tcp_data_ready(struct sock *sk, int count)
989svc_tcp_data_ready(struct sock *sk, int count)
990{ 693{
991 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 694 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
992 695
993 dprintk("svc: socket %p TCP data ready (svsk %p)\n", 696 dprintk("svc: socket %p TCP data ready (svsk %p)\n",
994 sk, sk->sk_user_data); 697 sk, sk->sk_user_data);
995 if (svsk) { 698 if (svsk) {
996 set_bit(SK_DATA, &svsk->sk_flags); 699 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
997 svc_sock_enqueue(svsk); 700 svc_xprt_enqueue(&svsk->sk_xprt);
998 } 701 }
999 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 702 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1000 wake_up_interruptible(sk->sk_sleep); 703 wake_up_interruptible(sk->sk_sleep);
1001} 704}
1002 705
1003static inline int svc_port_is_privileged(struct sockaddr *sin)
1004{
1005 switch (sin->sa_family) {
1006 case AF_INET:
1007 return ntohs(((struct sockaddr_in *)sin)->sin_port)
1008 < PROT_SOCK;
1009 case AF_INET6:
1010 return ntohs(((struct sockaddr_in6 *)sin)->sin6_port)
1011 < PROT_SOCK;
1012 default:
1013 return 0;
1014 }
1015}
1016
1017/* 706/*
1018 * Accept a TCP connection 707 * Accept a TCP connection
1019 */ 708 */
1020static void 709static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
1021svc_tcp_accept(struct svc_sock *svsk)
1022{ 710{
711 struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
1023 struct sockaddr_storage addr; 712 struct sockaddr_storage addr;
1024 struct sockaddr *sin = (struct sockaddr *) &addr; 713 struct sockaddr *sin = (struct sockaddr *) &addr;
1025 struct svc_serv *serv = svsk->sk_server; 714 struct svc_serv *serv = svsk->sk_xprt.xpt_server;
1026 struct socket *sock = svsk->sk_sock; 715 struct socket *sock = svsk->sk_sock;
1027 struct socket *newsock; 716 struct socket *newsock;
1028 struct svc_sock *newsvsk; 717 struct svc_sock *newsvsk;
@@ -1031,9 +720,9 @@ svc_tcp_accept(struct svc_sock *svsk)
1031 720
1032 dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); 721 dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
1033 if (!sock) 722 if (!sock)
1034 return; 723 return NULL;
1035 724
1036 clear_bit(SK_CONN, &svsk->sk_flags); 725 clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
1037 err = kernel_accept(sock, &newsock, O_NONBLOCK); 726 err = kernel_accept(sock, &newsock, O_NONBLOCK);
1038 if (err < 0) { 727 if (err < 0) {
1039 if (err == -ENOMEM) 728 if (err == -ENOMEM)
@@ -1042,11 +731,9 @@ svc_tcp_accept(struct svc_sock *svsk)
1042 else if (err != -EAGAIN && net_ratelimit()) 731 else if (err != -EAGAIN && net_ratelimit())
1043 printk(KERN_WARNING "%s: accept failed (err %d)!\n", 732 printk(KERN_WARNING "%s: accept failed (err %d)!\n",
1044 serv->sv_name, -err); 733 serv->sv_name, -err);
1045 return; 734 return NULL;
1046 } 735 }
1047 736 set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
1048 set_bit(SK_CONN, &svsk->sk_flags);
1049 svc_sock_enqueue(svsk);
1050 737
1051 err = kernel_getpeername(newsock, sin, &slen); 738 err = kernel_getpeername(newsock, sin, &slen);
1052 if (err < 0) { 739 if (err < 0) {
@@ -1077,106 +764,42 @@ svc_tcp_accept(struct svc_sock *svsk)
1077 if (!(newsvsk = svc_setup_socket(serv, newsock, &err, 764 if (!(newsvsk = svc_setup_socket(serv, newsock, &err,
1078 (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY)))) 765 (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY))))
1079 goto failed; 766 goto failed;
1080 memcpy(&newsvsk->sk_remote, sin, slen); 767 svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen);
1081 newsvsk->sk_remotelen = slen;
1082 err = kernel_getsockname(newsock, sin, &slen); 768 err = kernel_getsockname(newsock, sin, &slen);
1083 if (unlikely(err < 0)) { 769 if (unlikely(err < 0)) {
1084 dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err); 770 dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err);
1085 slen = offsetof(struct sockaddr, sa_data); 771 slen = offsetof(struct sockaddr, sa_data);
1086 } 772 }
1087 memcpy(&newsvsk->sk_local, sin, slen); 773 svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen);
1088
1089 svc_sock_received(newsvsk);
1090
1091 /* make sure that we don't have too many active connections.
1092 * If we have, something must be dropped.
1093 *
1094 * There's no point in trying to do random drop here for
1095 * DoS prevention. The NFS clients does 1 reconnect in 15
1096 * seconds. An attacker can easily beat that.
1097 *
1098 * The only somewhat efficient mechanism would be if drop
1099 * old connections from the same IP first. But right now
1100 * we don't even record the client IP in svc_sock.
1101 */
1102 if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) {
1103 struct svc_sock *svsk = NULL;
1104 spin_lock_bh(&serv->sv_lock);
1105 if (!list_empty(&serv->sv_tempsocks)) {
1106 if (net_ratelimit()) {
1107 /* Try to help the admin */
1108 printk(KERN_NOTICE "%s: too many open TCP "
1109 "sockets, consider increasing the "
1110 "number of nfsd threads\n",
1111 serv->sv_name);
1112 printk(KERN_NOTICE
1113 "%s: last TCP connect from %s\n",
1114 serv->sv_name, __svc_print_addr(sin,
1115 buf, sizeof(buf)));
1116 }
1117 /*
1118 * Always select the oldest socket. It's not fair,
1119 * but so is life
1120 */
1121 svsk = list_entry(serv->sv_tempsocks.prev,
1122 struct svc_sock,
1123 sk_list);
1124 set_bit(SK_CLOSE, &svsk->sk_flags);
1125 atomic_inc(&svsk->sk_inuse);
1126 }
1127 spin_unlock_bh(&serv->sv_lock);
1128
1129 if (svsk) {
1130 svc_sock_enqueue(svsk);
1131 svc_sock_put(svsk);
1132 }
1133
1134 }
1135 774
1136 if (serv->sv_stats) 775 if (serv->sv_stats)
1137 serv->sv_stats->nettcpconn++; 776 serv->sv_stats->nettcpconn++;
1138 777
1139 return; 778 return &newsvsk->sk_xprt;
1140 779
1141failed: 780failed:
1142 sock_release(newsock); 781 sock_release(newsock);
1143 return; 782 return NULL;
1144} 783}
1145 784
1146/* 785/*
1147 * Receive data from a TCP socket. 786 * Receive data from a TCP socket.
1148 */ 787 */
1149static int 788static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
1150svc_tcp_recvfrom(struct svc_rqst *rqstp)
1151{ 789{
1152 struct svc_sock *svsk = rqstp->rq_sock; 790 struct svc_sock *svsk =
1153 struct svc_serv *serv = svsk->sk_server; 791 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
792 struct svc_serv *serv = svsk->sk_xprt.xpt_server;
1154 int len; 793 int len;
1155 struct kvec *vec; 794 struct kvec *vec;
1156 int pnum, vlen; 795 int pnum, vlen;
1157 796
1158 dprintk("svc: tcp_recv %p data %d conn %d close %d\n", 797 dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
1159 svsk, test_bit(SK_DATA, &svsk->sk_flags), 798 svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
1160 test_bit(SK_CONN, &svsk->sk_flags), 799 test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags),
1161 test_bit(SK_CLOSE, &svsk->sk_flags)); 800 test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags));
1162 801
1163 if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { 802 if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
1164 svc_sock_received(svsk);
1165 return svc_deferred_recv(rqstp);
1166 }
1167
1168 if (test_bit(SK_CLOSE, &svsk->sk_flags)) {
1169 svc_delete_socket(svsk);
1170 return 0;
1171 }
1172
1173 if (svsk->sk_sk->sk_state == TCP_LISTEN) {
1174 svc_tcp_accept(svsk);
1175 svc_sock_received(svsk);
1176 return 0;
1177 }
1178
1179 if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags))
1180 /* sndbuf needs to have room for one request 803 /* sndbuf needs to have room for one request
1181 * per thread, otherwise we can stall even when the 804 * per thread, otherwise we can stall even when the
1182 * network isn't a bottleneck. 805 * network isn't a bottleneck.
@@ -1193,7 +816,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
1193 (serv->sv_nrthreads+3) * serv->sv_max_mesg, 816 (serv->sv_nrthreads+3) * serv->sv_max_mesg,
1194 3 * serv->sv_max_mesg); 817 3 * serv->sv_max_mesg);
1195 818
1196 clear_bit(SK_DATA, &svsk->sk_flags); 819 clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
1197 820
1198 /* Receive data. If we haven't got the record length yet, get 821 /* Receive data. If we haven't got the record length yet, get
1199 * the next four bytes. Otherwise try to gobble up as much as 822 * the next four bytes. Otherwise try to gobble up as much as
@@ -1212,7 +835,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
1212 if (len < want) { 835 if (len < want) {
1213 dprintk("svc: short recvfrom while reading record length (%d of %lu)\n", 836 dprintk("svc: short recvfrom while reading record length (%d of %lu)\n",
1214 len, want); 837 len, want);
1215 svc_sock_received(svsk); 838 svc_xprt_received(&svsk->sk_xprt);
1216 return -EAGAIN; /* record header not complete */ 839 return -EAGAIN; /* record header not complete */
1217 } 840 }
1218 841
@@ -1248,11 +871,11 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
1248 if (len < svsk->sk_reclen) { 871 if (len < svsk->sk_reclen) {
1249 dprintk("svc: incomplete TCP record (%d of %d)\n", 872 dprintk("svc: incomplete TCP record (%d of %d)\n",
1250 len, svsk->sk_reclen); 873 len, svsk->sk_reclen);
1251 svc_sock_received(svsk); 874 svc_xprt_received(&svsk->sk_xprt);
1252 return -EAGAIN; /* record not complete */ 875 return -EAGAIN; /* record not complete */
1253 } 876 }
1254 len = svsk->sk_reclen; 877 len = svsk->sk_reclen;
1255 set_bit(SK_DATA, &svsk->sk_flags); 878 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
1256 879
1257 vec = rqstp->rq_vec; 880 vec = rqstp->rq_vec;
1258 vec[0] = rqstp->rq_arg.head[0]; 881 vec[0] = rqstp->rq_arg.head[0];
@@ -1281,30 +904,31 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
1281 rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; 904 rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
1282 } 905 }
1283 906
1284 rqstp->rq_skbuff = NULL; 907 rqstp->rq_xprt_ctxt = NULL;
1285 rqstp->rq_prot = IPPROTO_TCP; 908 rqstp->rq_prot = IPPROTO_TCP;
1286 909
1287 /* Reset TCP read info */ 910 /* Reset TCP read info */
1288 svsk->sk_reclen = 0; 911 svsk->sk_reclen = 0;
1289 svsk->sk_tcplen = 0; 912 svsk->sk_tcplen = 0;
1290 913
1291 svc_sock_received(svsk); 914 svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt);
915 svc_xprt_received(&svsk->sk_xprt);
1292 if (serv->sv_stats) 916 if (serv->sv_stats)
1293 serv->sv_stats->nettcpcnt++; 917 serv->sv_stats->nettcpcnt++;
1294 918
1295 return len; 919 return len;
1296 920
1297 err_delete: 921 err_delete:
1298 svc_delete_socket(svsk); 922 set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
1299 return -EAGAIN; 923 return -EAGAIN;
1300 924
1301 error: 925 error:
1302 if (len == -EAGAIN) { 926 if (len == -EAGAIN) {
1303 dprintk("RPC: TCP recvfrom got EAGAIN\n"); 927 dprintk("RPC: TCP recvfrom got EAGAIN\n");
1304 svc_sock_received(svsk); 928 svc_xprt_received(&svsk->sk_xprt);
1305 } else { 929 } else {
1306 printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", 930 printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
1307 svsk->sk_server->sv_name, -len); 931 svsk->sk_xprt.xpt_server->sv_name, -len);
1308 goto err_delete; 932 goto err_delete;
1309 } 933 }
1310 934
@@ -1314,8 +938,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
1314/* 938/*
1315 * Send out data on TCP socket. 939 * Send out data on TCP socket.
1316 */ 940 */
1317static int 941static int svc_tcp_sendto(struct svc_rqst *rqstp)
1318svc_tcp_sendto(struct svc_rqst *rqstp)
1319{ 942{
1320 struct xdr_buf *xbufp = &rqstp->rq_res; 943 struct xdr_buf *xbufp = &rqstp->rq_res;
1321 int sent; 944 int sent;
@@ -1328,35 +951,109 @@ svc_tcp_sendto(struct svc_rqst *rqstp)
1328 reclen = htonl(0x80000000|((xbufp->len ) - 4)); 951 reclen = htonl(0x80000000|((xbufp->len ) - 4));
1329 memcpy(xbufp->head[0].iov_base, &reclen, 4); 952 memcpy(xbufp->head[0].iov_base, &reclen, 4);
1330 953
1331 if (test_bit(SK_DEAD, &rqstp->rq_sock->sk_flags)) 954 if (test_bit(XPT_DEAD, &rqstp->rq_xprt->xpt_flags))
1332 return -ENOTCONN; 955 return -ENOTCONN;
1333 956
1334 sent = svc_sendto(rqstp, &rqstp->rq_res); 957 sent = svc_sendto(rqstp, &rqstp->rq_res);
1335 if (sent != xbufp->len) { 958 if (sent != xbufp->len) {
1336 printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", 959 printk(KERN_NOTICE
1337 rqstp->rq_sock->sk_server->sv_name, 960 "rpc-srv/tcp: %s: %s %d when sending %d bytes "
961 "- shutting down socket\n",
962 rqstp->rq_xprt->xpt_server->sv_name,
1338 (sent<0)?"got error":"sent only", 963 (sent<0)?"got error":"sent only",
1339 sent, xbufp->len); 964 sent, xbufp->len);
1340 set_bit(SK_CLOSE, &rqstp->rq_sock->sk_flags); 965 set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags);
1341 svc_sock_enqueue(rqstp->rq_sock); 966 svc_xprt_enqueue(rqstp->rq_xprt);
1342 sent = -EAGAIN; 967 sent = -EAGAIN;
1343 } 968 }
1344 return sent; 969 return sent;
1345} 970}
1346 971
1347static void 972/*
1348svc_tcp_init(struct svc_sock *svsk) 973 * Setup response header. TCP has a 4B record length field.
974 */
975static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp)
976{
977 struct kvec *resv = &rqstp->rq_res.head[0];
978
979 /* tcp needs a space for the record length... */
980 svc_putnl(resv, 0);
981}
982
983static int svc_tcp_has_wspace(struct svc_xprt *xprt)
984{
985 struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
986 struct svc_serv *serv = svsk->sk_xprt.xpt_server;
987 int required;
988 int wspace;
989
990 /*
991 * Set the SOCK_NOSPACE flag before checking the available
992 * sock space.
993 */
994 set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
995 required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg;
996 wspace = sk_stream_wspace(svsk->sk_sk);
997
998 if (wspace < sk_stream_min_wspace(svsk->sk_sk))
999 return 0;
1000 if (required * 2 > wspace)
1001 return 0;
1002
1003 clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
1004 return 1;
1005}
1006
1007static struct svc_xprt *svc_tcp_create(struct svc_serv *serv,
1008 struct sockaddr *sa, int salen,
1009 int flags)
1010{
1011 return svc_create_socket(serv, IPPROTO_TCP, sa, salen, flags);
1012}
1013
1014static struct svc_xprt_ops svc_tcp_ops = {
1015 .xpo_create = svc_tcp_create,
1016 .xpo_recvfrom = svc_tcp_recvfrom,
1017 .xpo_sendto = svc_tcp_sendto,
1018 .xpo_release_rqst = svc_release_skb,
1019 .xpo_detach = svc_sock_detach,
1020 .xpo_free = svc_sock_free,
1021 .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,
1022 .xpo_has_wspace = svc_tcp_has_wspace,
1023 .xpo_accept = svc_tcp_accept,
1024};
1025
1026static struct svc_xprt_class svc_tcp_class = {
1027 .xcl_name = "tcp",
1028 .xcl_owner = THIS_MODULE,
1029 .xcl_ops = &svc_tcp_ops,
1030 .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
1031};
1032
1033void svc_init_xprt_sock(void)
1034{
1035 svc_reg_xprt_class(&svc_tcp_class);
1036 svc_reg_xprt_class(&svc_udp_class);
1037}
1038
1039void svc_cleanup_xprt_sock(void)
1040{
1041 svc_unreg_xprt_class(&svc_tcp_class);
1042 svc_unreg_xprt_class(&svc_udp_class);
1043}
1044
1045static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
1349{ 1046{
1350 struct sock *sk = svsk->sk_sk; 1047 struct sock *sk = svsk->sk_sk;
1351 struct tcp_sock *tp = tcp_sk(sk); 1048 struct tcp_sock *tp = tcp_sk(sk);
1352 1049
1353 svsk->sk_recvfrom = svc_tcp_recvfrom; 1050 svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt, serv);
1354 svsk->sk_sendto = svc_tcp_sendto; 1051 set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
1355
1356 if (sk->sk_state == TCP_LISTEN) { 1052 if (sk->sk_state == TCP_LISTEN) {
1357 dprintk("setting up TCP socket for listening\n"); 1053 dprintk("setting up TCP socket for listening\n");
1054 set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
1358 sk->sk_data_ready = svc_tcp_listen_data_ready; 1055 sk->sk_data_ready = svc_tcp_listen_data_ready;
1359 set_bit(SK_CONN, &svsk->sk_flags); 1056 set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
1360 } else { 1057 } else {
1361 dprintk("setting up TCP socket for reading\n"); 1058 dprintk("setting up TCP socket for reading\n");
1362 sk->sk_state_change = svc_tcp_state_change; 1059 sk->sk_state_change = svc_tcp_state_change;
@@ -1373,18 +1070,17 @@ svc_tcp_init(struct svc_sock *svsk)
1373 * svc_tcp_recvfrom will re-adjust if necessary 1070 * svc_tcp_recvfrom will re-adjust if necessary
1374 */ 1071 */
1375 svc_sock_setbufsize(svsk->sk_sock, 1072 svc_sock_setbufsize(svsk->sk_sock,
1376 3 * svsk->sk_server->sv_max_mesg, 1073 3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
1377 3 * svsk->sk_server->sv_max_mesg); 1074 3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
1378 1075
1379 set_bit(SK_CHNGBUF, &svsk->sk_flags); 1076 set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
1380 set_bit(SK_DATA, &svsk->sk_flags); 1077 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
1381 if (sk->sk_state != TCP_ESTABLISHED) 1078 if (sk->sk_state != TCP_ESTABLISHED)
1382 set_bit(SK_CLOSE, &svsk->sk_flags); 1079 set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
1383 } 1080 }
1384} 1081}
1385 1082
1386void 1083void svc_sock_update_bufs(struct svc_serv *serv)
1387svc_sock_update_bufs(struct svc_serv *serv)
1388{ 1084{
1389 /* 1085 /*
1390 * The number of server threads has changed. Update 1086 * The number of server threads has changed. Update
@@ -1395,232 +1091,18 @@ svc_sock_update_bufs(struct svc_serv *serv)
1395 spin_lock_bh(&serv->sv_lock); 1091 spin_lock_bh(&serv->sv_lock);
1396 list_for_each(le, &serv->sv_permsocks) { 1092 list_for_each(le, &serv->sv_permsocks) {
1397 struct svc_sock *svsk = 1093 struct svc_sock *svsk =
1398 list_entry(le, struct svc_sock, sk_list); 1094 list_entry(le, struct svc_sock, sk_xprt.xpt_list);
1399 set_bit(SK_CHNGBUF, &svsk->sk_flags); 1095 set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
1400 } 1096 }
1401 list_for_each(le, &serv->sv_tempsocks) { 1097 list_for_each(le, &serv->sv_tempsocks) {
1402 struct svc_sock *svsk = 1098 struct svc_sock *svsk =
1403 list_entry(le, struct svc_sock, sk_list); 1099 list_entry(le, struct svc_sock, sk_xprt.xpt_list);
1404 set_bit(SK_CHNGBUF, &svsk->sk_flags); 1100 set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
1405 } 1101 }
1406 spin_unlock_bh(&serv->sv_lock); 1102 spin_unlock_bh(&serv->sv_lock);
1407} 1103}
1408 1104
1409/* 1105/*
1410 * Receive the next request on any socket. This code is carefully
1411 * organised not to touch any cachelines in the shared svc_serv
1412 * structure, only cachelines in the local svc_pool.
1413 */
1414int
1415svc_recv(struct svc_rqst *rqstp, long timeout)
1416{
1417 struct svc_sock *svsk = NULL;
1418 struct svc_serv *serv = rqstp->rq_server;
1419 struct svc_pool *pool = rqstp->rq_pool;
1420 int len, i;
1421 int pages;
1422 struct xdr_buf *arg;
1423 DECLARE_WAITQUEUE(wait, current);
1424
1425 dprintk("svc: server %p waiting for data (to = %ld)\n",
1426 rqstp, timeout);
1427
1428 if (rqstp->rq_sock)
1429 printk(KERN_ERR
1430 "svc_recv: service %p, socket not NULL!\n",
1431 rqstp);
1432 if (waitqueue_active(&rqstp->rq_wait))
1433 printk(KERN_ERR
1434 "svc_recv: service %p, wait queue active!\n",
1435 rqstp);
1436
1437
1438 /* now allocate needed pages. If we get a failure, sleep briefly */
1439 pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE;
1440 for (i=0; i < pages ; i++)
1441 while (rqstp->rq_pages[i] == NULL) {
1442 struct page *p = alloc_page(GFP_KERNEL);
1443 if (!p)
1444 schedule_timeout_uninterruptible(msecs_to_jiffies(500));
1445 rqstp->rq_pages[i] = p;
1446 }
1447 rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
1448 BUG_ON(pages >= RPCSVC_MAXPAGES);
1449
1450 /* Make arg->head point to first page and arg->pages point to rest */
1451 arg = &rqstp->rq_arg;
1452 arg->head[0].iov_base = page_address(rqstp->rq_pages[0]);
1453 arg->head[0].iov_len = PAGE_SIZE;
1454 arg->pages = rqstp->rq_pages + 1;
1455 arg->page_base = 0;
1456 /* save at least one page for response */
1457 arg->page_len = (pages-2)*PAGE_SIZE;
1458 arg->len = (pages-1)*PAGE_SIZE;
1459 arg->tail[0].iov_len = 0;
1460
1461 try_to_freeze();
1462 cond_resched();
1463 if (signalled())
1464 return -EINTR;
1465
1466 spin_lock_bh(&pool->sp_lock);
1467 if ((svsk = svc_sock_dequeue(pool)) != NULL) {
1468 rqstp->rq_sock = svsk;
1469 atomic_inc(&svsk->sk_inuse);
1470 rqstp->rq_reserved = serv->sv_max_mesg;
1471 atomic_add(rqstp->rq_reserved, &svsk->sk_reserved);
1472 } else {
1473 /* No data pending. Go to sleep */
1474 svc_thread_enqueue(pool, rqstp);
1475
1476 /*
1477 * We have to be able to interrupt this wait
1478 * to bring down the daemons ...
1479 */
1480 set_current_state(TASK_INTERRUPTIBLE);
1481 add_wait_queue(&rqstp->rq_wait, &wait);
1482 spin_unlock_bh(&pool->sp_lock);
1483
1484 schedule_timeout(timeout);
1485
1486 try_to_freeze();
1487
1488 spin_lock_bh(&pool->sp_lock);
1489 remove_wait_queue(&rqstp->rq_wait, &wait);
1490
1491 if (!(svsk = rqstp->rq_sock)) {
1492 svc_thread_dequeue(pool, rqstp);
1493 spin_unlock_bh(&pool->sp_lock);
1494 dprintk("svc: server %p, no data yet\n", rqstp);
1495 return signalled()? -EINTR : -EAGAIN;
1496 }
1497 }
1498 spin_unlock_bh(&pool->sp_lock);
1499
1500 dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n",
1501 rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse));
1502 len = svsk->sk_recvfrom(rqstp);
1503 dprintk("svc: got len=%d\n", len);
1504
1505 /* No data, incomplete (TCP) read, or accept() */
1506 if (len == 0 || len == -EAGAIN) {
1507 rqstp->rq_res.len = 0;
1508 svc_sock_release(rqstp);
1509 return -EAGAIN;
1510 }
1511 svsk->sk_lastrecv = get_seconds();
1512 clear_bit(SK_OLD, &svsk->sk_flags);
1513
1514 rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp));
1515 rqstp->rq_chandle.defer = svc_defer;
1516
1517 if (serv->sv_stats)
1518 serv->sv_stats->netcnt++;
1519 return len;
1520}
1521
1522/*
1523 * Drop request
1524 */
1525void
1526svc_drop(struct svc_rqst *rqstp)
1527{
1528 dprintk("svc: socket %p dropped request\n", rqstp->rq_sock);
1529 svc_sock_release(rqstp);
1530}
1531
1532/*
1533 * Return reply to client.
1534 */
1535int
1536svc_send(struct svc_rqst *rqstp)
1537{
1538 struct svc_sock *svsk;
1539 int len;
1540 struct xdr_buf *xb;
1541
1542 if ((svsk = rqstp->rq_sock) == NULL) {
1543 printk(KERN_WARNING "NULL socket pointer in %s:%d\n",
1544 __FILE__, __LINE__);
1545 return -EFAULT;
1546 }
1547
1548 /* release the receive skb before sending the reply */
1549 svc_release_skb(rqstp);
1550
1551 /* calculate over-all length */
1552 xb = & rqstp->rq_res;
1553 xb->len = xb->head[0].iov_len +
1554 xb->page_len +
1555 xb->tail[0].iov_len;
1556
1557 /* Grab svsk->sk_mutex to serialize outgoing data. */
1558 mutex_lock(&svsk->sk_mutex);
1559 if (test_bit(SK_DEAD, &svsk->sk_flags))
1560 len = -ENOTCONN;
1561 else
1562 len = svsk->sk_sendto(rqstp);
1563 mutex_unlock(&svsk->sk_mutex);
1564 svc_sock_release(rqstp);
1565
1566 if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
1567 return 0;
1568 return len;
1569}
1570
1571/*
1572 * Timer function to close old temporary sockets, using
1573 * a mark-and-sweep algorithm.
1574 */
1575static void
1576svc_age_temp_sockets(unsigned long closure)
1577{
1578 struct svc_serv *serv = (struct svc_serv *)closure;
1579 struct svc_sock *svsk;
1580 struct list_head *le, *next;
1581 LIST_HEAD(to_be_aged);
1582
1583 dprintk("svc_age_temp_sockets\n");
1584
1585 if (!spin_trylock_bh(&serv->sv_lock)) {
1586 /* busy, try again 1 sec later */
1587 dprintk("svc_age_temp_sockets: busy\n");
1588 mod_timer(&serv->sv_temptimer, jiffies + HZ);
1589 return;
1590 }
1591
1592 list_for_each_safe(le, next, &serv->sv_tempsocks) {
1593 svsk = list_entry(le, struct svc_sock, sk_list);
1594
1595 if (!test_and_set_bit(SK_OLD, &svsk->sk_flags))
1596 continue;
1597 if (atomic_read(&svsk->sk_inuse) > 1 || test_bit(SK_BUSY, &svsk->sk_flags))
1598 continue;
1599 atomic_inc(&svsk->sk_inuse);
1600 list_move(le, &to_be_aged);
1601 set_bit(SK_CLOSE, &svsk->sk_flags);
1602 set_bit(SK_DETACHED, &svsk->sk_flags);
1603 }
1604 spin_unlock_bh(&serv->sv_lock);
1605
1606 while (!list_empty(&to_be_aged)) {
1607 le = to_be_aged.next;
1608 /* fiddling the sk_list node is safe 'cos we're SK_DETACHED */
1609 list_del_init(le);
1610 svsk = list_entry(le, struct svc_sock, sk_list);
1611
1612 dprintk("queuing svsk %p for closing, %lu seconds old\n",
1613 svsk, get_seconds() - svsk->sk_lastrecv);
1614
1615 /* a thread will dequeue and close it soon */
1616 svc_sock_enqueue(svsk);
1617 svc_sock_put(svsk);
1618 }
1619
1620 mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ);
1621}
1622
1623/*
1624 * Initialize socket for RPC use and create svc_sock struct 1106 * Initialize socket for RPC use and create svc_sock struct
1625 * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. 1107 * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF.
1626 */ 1108 */
@@ -1631,7 +1113,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
1631 struct svc_sock *svsk; 1113 struct svc_sock *svsk;
1632 struct sock *inet; 1114 struct sock *inet;
1633 int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); 1115 int pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
1634 int is_temporary = flags & SVC_SOCK_TEMPORARY;
1635 1116
1636 dprintk("svc: svc_setup_socket %p\n", sock); 1117 dprintk("svc: svc_setup_socket %p\n", sock);
1637 if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { 1118 if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) {
@@ -1651,44 +1132,18 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
1651 return NULL; 1132 return NULL;
1652 } 1133 }
1653 1134
1654 set_bit(SK_BUSY, &svsk->sk_flags);
1655 inet->sk_user_data = svsk; 1135 inet->sk_user_data = svsk;
1656 svsk->sk_sock = sock; 1136 svsk->sk_sock = sock;
1657 svsk->sk_sk = inet; 1137 svsk->sk_sk = inet;
1658 svsk->sk_ostate = inet->sk_state_change; 1138 svsk->sk_ostate = inet->sk_state_change;
1659 svsk->sk_odata = inet->sk_data_ready; 1139 svsk->sk_odata = inet->sk_data_ready;
1660 svsk->sk_owspace = inet->sk_write_space; 1140 svsk->sk_owspace = inet->sk_write_space;
1661 svsk->sk_server = serv;
1662 atomic_set(&svsk->sk_inuse, 1);
1663 svsk->sk_lastrecv = get_seconds();
1664 spin_lock_init(&svsk->sk_lock);
1665 INIT_LIST_HEAD(&svsk->sk_deferred);
1666 INIT_LIST_HEAD(&svsk->sk_ready);
1667 mutex_init(&svsk->sk_mutex);
1668 1141
1669 /* Initialize the socket */ 1142 /* Initialize the socket */
1670 if (sock->type == SOCK_DGRAM) 1143 if (sock->type == SOCK_DGRAM)
1671 svc_udp_init(svsk); 1144 svc_udp_init(svsk, serv);
1672 else 1145 else
1673 svc_tcp_init(svsk); 1146 svc_tcp_init(svsk, serv);
1674
1675 spin_lock_bh(&serv->sv_lock);
1676 if (is_temporary) {
1677 set_bit(SK_TEMP, &svsk->sk_flags);
1678 list_add(&svsk->sk_list, &serv->sv_tempsocks);
1679 serv->sv_tmpcnt++;
1680 if (serv->sv_temptimer.function == NULL) {
1681 /* setup timer to age temp sockets */
1682 setup_timer(&serv->sv_temptimer, svc_age_temp_sockets,
1683 (unsigned long)serv);
1684 mod_timer(&serv->sv_temptimer,
1685 jiffies + svc_conn_age_period * HZ);
1686 }
1687 } else {
1688 clear_bit(SK_TEMP, &svsk->sk_flags);
1689 list_add(&svsk->sk_list, &serv->sv_permsocks);
1690 }
1691 spin_unlock_bh(&serv->sv_lock);
1692 1147
1693 dprintk("svc: svc_setup_socket created %p (inet %p)\n", 1148 dprintk("svc: svc_setup_socket created %p (inet %p)\n",
1694 svsk, svsk->sk_sk); 1149 svsk, svsk->sk_sk);
@@ -1717,7 +1172,16 @@ int svc_addsock(struct svc_serv *serv,
1717 else { 1172 else {
1718 svsk = svc_setup_socket(serv, so, &err, SVC_SOCK_DEFAULTS); 1173 svsk = svc_setup_socket(serv, so, &err, SVC_SOCK_DEFAULTS);
1719 if (svsk) { 1174 if (svsk) {
1720 svc_sock_received(svsk); 1175 struct sockaddr_storage addr;
1176 struct sockaddr *sin = (struct sockaddr *)&addr;
1177 int salen;
1178 if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0)
1179 svc_xprt_set_local(&svsk->sk_xprt, sin, salen);
1180 clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags);
1181 spin_lock_bh(&serv->sv_lock);
1182 list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks);
1183 spin_unlock_bh(&serv->sv_lock);
1184 svc_xprt_received(&svsk->sk_xprt);
1721 err = 0; 1185 err = 0;
1722 } 1186 }
1723 } 1187 }
@@ -1733,14 +1197,19 @@ EXPORT_SYMBOL_GPL(svc_addsock);
1733/* 1197/*
1734 * Create socket for RPC service. 1198 * Create socket for RPC service.
1735 */ 1199 */
1736static int svc_create_socket(struct svc_serv *serv, int protocol, 1200static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
1737 struct sockaddr *sin, int len, int flags) 1201 int protocol,
1202 struct sockaddr *sin, int len,
1203 int flags)
1738{ 1204{
1739 struct svc_sock *svsk; 1205 struct svc_sock *svsk;
1740 struct socket *sock; 1206 struct socket *sock;
1741 int error; 1207 int error;
1742 int type; 1208 int type;
1743 char buf[RPC_MAX_ADDRBUFLEN]; 1209 char buf[RPC_MAX_ADDRBUFLEN];
1210 struct sockaddr_storage addr;
1211 struct sockaddr *newsin = (struct sockaddr *)&addr;
1212 int newlen;
1744 1213
1745 dprintk("svc: svc_create_socket(%s, %d, %s)\n", 1214 dprintk("svc: svc_create_socket(%s, %d, %s)\n",
1746 serv->sv_program->pg_name, protocol, 1215 serv->sv_program->pg_name, protocol,
@@ -1749,13 +1218,13 @@ static int svc_create_socket(struct svc_serv *serv, int protocol,
1749 if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { 1218 if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) {
1750 printk(KERN_WARNING "svc: only UDP and TCP " 1219 printk(KERN_WARNING "svc: only UDP and TCP "
1751 "sockets supported\n"); 1220 "sockets supported\n");
1752 return -EINVAL; 1221 return ERR_PTR(-EINVAL);
1753 } 1222 }
1754 type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; 1223 type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
1755 1224
1756 error = sock_create_kern(sin->sa_family, type, protocol, &sock); 1225 error = sock_create_kern(sin->sa_family, type, protocol, &sock);
1757 if (error < 0) 1226 if (error < 0)
1758 return error; 1227 return ERR_PTR(error);
1759 1228
1760 svc_reclassify_socket(sock); 1229 svc_reclassify_socket(sock);
1761 1230
@@ -1765,203 +1234,55 @@ static int svc_create_socket(struct svc_serv *serv, int protocol,
1765 if (error < 0) 1234 if (error < 0)
1766 goto bummer; 1235 goto bummer;
1767 1236
1237 newlen = len;
1238 error = kernel_getsockname(sock, newsin, &newlen);
1239 if (error < 0)
1240 goto bummer;
1241
1768 if (protocol == IPPROTO_TCP) { 1242 if (protocol == IPPROTO_TCP) {
1769 if ((error = kernel_listen(sock, 64)) < 0) 1243 if ((error = kernel_listen(sock, 64)) < 0)
1770 goto bummer; 1244 goto bummer;
1771 } 1245 }
1772 1246
1773 if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) { 1247 if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) {
1774 svc_sock_received(svsk); 1248 svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen);
1775 return ntohs(inet_sk(svsk->sk_sk)->sport); 1249 return (struct svc_xprt *)svsk;
1776 } 1250 }
1777 1251
1778bummer: 1252bummer:
1779 dprintk("svc: svc_create_socket error = %d\n", -error); 1253 dprintk("svc: svc_create_socket error = %d\n", -error);
1780 sock_release(sock); 1254 sock_release(sock);
1781 return error; 1255 return ERR_PTR(error);
1782} 1256}
1783 1257
1784/* 1258/*
1785 * Remove a dead socket 1259 * Detach the svc_sock from the socket so that no
1260 * more callbacks occur.
1786 */ 1261 */
1787static void 1262static void svc_sock_detach(struct svc_xprt *xprt)
1788svc_delete_socket(struct svc_sock *svsk)
1789{ 1263{
1790 struct svc_serv *serv; 1264 struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
1791 struct sock *sk; 1265 struct sock *sk = svsk->sk_sk;
1792
1793 dprintk("svc: svc_delete_socket(%p)\n", svsk);
1794 1266
1795 serv = svsk->sk_server; 1267 dprintk("svc: svc_sock_detach(%p)\n", svsk);
1796 sk = svsk->sk_sk;
1797 1268
1269 /* put back the old socket callbacks */
1798 sk->sk_state_change = svsk->sk_ostate; 1270 sk->sk_state_change = svsk->sk_ostate;
1799 sk->sk_data_ready = svsk->sk_odata; 1271 sk->sk_data_ready = svsk->sk_odata;
1800 sk->sk_write_space = svsk->sk_owspace; 1272 sk->sk_write_space = svsk->sk_owspace;
1801
1802 spin_lock_bh(&serv->sv_lock);
1803
1804 if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags))
1805 list_del_init(&svsk->sk_list);
1806 /*
1807 * We used to delete the svc_sock from whichever list
1808 * it's sk_ready node was on, but we don't actually
1809 * need to. This is because the only time we're called
1810 * while still attached to a queue, the queue itself
1811 * is about to be destroyed (in svc_destroy).
1812 */
1813 if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) {
1814 BUG_ON(atomic_read(&svsk->sk_inuse)<2);
1815 atomic_dec(&svsk->sk_inuse);
1816 if (test_bit(SK_TEMP, &svsk->sk_flags))
1817 serv->sv_tmpcnt--;
1818 }
1819
1820 spin_unlock_bh(&serv->sv_lock);
1821}
1822
1823static void svc_close_socket(struct svc_sock *svsk)
1824{
1825 set_bit(SK_CLOSE, &svsk->sk_flags);
1826 if (test_and_set_bit(SK_BUSY, &svsk->sk_flags))
1827 /* someone else will have to effect the close */
1828 return;
1829
1830 atomic_inc(&svsk->sk_inuse);
1831 svc_delete_socket(svsk);
1832 clear_bit(SK_BUSY, &svsk->sk_flags);
1833 svc_sock_put(svsk);
1834}
1835
1836void svc_force_close_socket(struct svc_sock *svsk)
1837{
1838 set_bit(SK_CLOSE, &svsk->sk_flags);
1839 if (test_bit(SK_BUSY, &svsk->sk_flags)) {
1840 /* Waiting to be processed, but no threads left,
1841 * So just remove it from the waiting list
1842 */
1843 list_del_init(&svsk->sk_ready);
1844 clear_bit(SK_BUSY, &svsk->sk_flags);
1845 }
1846 svc_close_socket(svsk);
1847}
1848
1849/**
1850 * svc_makesock - Make a socket for nfsd and lockd
1851 * @serv: RPC server structure
1852 * @protocol: transport protocol to use
1853 * @port: port to use
1854 * @flags: requested socket characteristics
1855 *
1856 */
1857int svc_makesock(struct svc_serv *serv, int protocol, unsigned short port,
1858 int flags)
1859{
1860 struct sockaddr_in sin = {
1861 .sin_family = AF_INET,
1862 .sin_addr.s_addr = INADDR_ANY,
1863 .sin_port = htons(port),
1864 };
1865
1866 dprintk("svc: creating socket proto = %d\n", protocol);
1867 return svc_create_socket(serv, protocol, (struct sockaddr *) &sin,
1868 sizeof(sin), flags);
1869} 1273}
1870 1274
1871/* 1275/*
1872 * Handle defer and revisit of requests 1276 * Free the svc_sock's socket resources and the svc_sock itself.
1873 */ 1277 */
1874 1278static void svc_sock_free(struct svc_xprt *xprt)
1875static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
1876{ 1279{
1877 struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); 1280 struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
1878 struct svc_sock *svsk; 1281 dprintk("svc: svc_sock_free(%p)\n", svsk);
1879 1282
1880 if (too_many) { 1283 if (svsk->sk_sock->file)
1881 svc_sock_put(dr->svsk); 1284 sockfd_put(svsk->sk_sock);
1882 kfree(dr); 1285 else
1883 return; 1286 sock_release(svsk->sk_sock);
1884 } 1287 kfree(svsk);
1885 dprintk("revisit queued\n");
1886 svsk = dr->svsk;
1887 dr->svsk = NULL;
1888 spin_lock(&svsk->sk_lock);
1889 list_add(&dr->handle.recent, &svsk->sk_deferred);
1890 spin_unlock(&svsk->sk_lock);
1891 set_bit(SK_DEFERRED, &svsk->sk_flags);
1892 svc_sock_enqueue(svsk);
1893 svc_sock_put(svsk);
1894}
1895
1896static struct cache_deferred_req *
1897svc_defer(struct cache_req *req)
1898{
1899 struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle);
1900 int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len);
1901 struct svc_deferred_req *dr;
1902
1903 if (rqstp->rq_arg.page_len)
1904 return NULL; /* if more than a page, give up FIXME */
1905 if (rqstp->rq_deferred) {
1906 dr = rqstp->rq_deferred;
1907 rqstp->rq_deferred = NULL;
1908 } else {
1909 int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
1910 /* FIXME maybe discard if size too large */
1911 dr = kmalloc(size, GFP_KERNEL);
1912 if (dr == NULL)
1913 return NULL;
1914
1915 dr->handle.owner = rqstp->rq_server;
1916 dr->prot = rqstp->rq_prot;
1917 memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen);
1918 dr->addrlen = rqstp->rq_addrlen;
1919 dr->daddr = rqstp->rq_daddr;
1920 dr->argslen = rqstp->rq_arg.len >> 2;
1921 memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2);
1922 }
1923 atomic_inc(&rqstp->rq_sock->sk_inuse);
1924 dr->svsk = rqstp->rq_sock;
1925
1926 dr->handle.revisit = svc_revisit;
1927 return &dr->handle;
1928}
1929
1930/*
1931 * recv data from a deferred request into an active one
1932 */
1933static int svc_deferred_recv(struct svc_rqst *rqstp)
1934{
1935 struct svc_deferred_req *dr = rqstp->rq_deferred;
1936
1937 rqstp->rq_arg.head[0].iov_base = dr->args;
1938 rqstp->rq_arg.head[0].iov_len = dr->argslen<<2;
1939 rqstp->rq_arg.page_len = 0;
1940 rqstp->rq_arg.len = dr->argslen<<2;
1941 rqstp->rq_prot = dr->prot;
1942 memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen);
1943 rqstp->rq_addrlen = dr->addrlen;
1944 rqstp->rq_daddr = dr->daddr;
1945 rqstp->rq_respages = rqstp->rq_pages;
1946 return dr->argslen<<2;
1947}
1948
1949
1950static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk)
1951{
1952 struct svc_deferred_req *dr = NULL;
1953
1954 if (!test_bit(SK_DEFERRED, &svsk->sk_flags))
1955 return NULL;
1956 spin_lock(&svsk->sk_lock);
1957 clear_bit(SK_DEFERRED, &svsk->sk_flags);
1958 if (!list_empty(&svsk->sk_deferred)) {
1959 dr = list_entry(svsk->sk_deferred.next,
1960 struct svc_deferred_req,
1961 handle.recent);
1962 list_del_init(&dr->handle.recent);
1963 set_bit(SK_DEFERRED, &svsk->sk_flags);
1964 }
1965 spin_unlock(&svsk->sk_lock);
1966 return dr;
1967} 1288}
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index bada7de0c2fc..0f8c439b848a 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -18,6 +18,7 @@
18#include <linux/sunrpc/types.h> 18#include <linux/sunrpc/types.h>
19#include <linux/sunrpc/sched.h> 19#include <linux/sunrpc/sched.h>
20#include <linux/sunrpc/stats.h> 20#include <linux/sunrpc/stats.h>
21#include <linux/sunrpc/svc_xprt.h>
21 22
22/* 23/*
23 * Declare the debug flags here 24 * Declare the debug flags here
@@ -55,6 +56,30 @@ rpc_unregister_sysctl(void)
55 } 56 }
56} 57}
57 58
59static int proc_do_xprt(ctl_table *table, int write, struct file *file,
60 void __user *buffer, size_t *lenp, loff_t *ppos)
61{
62 char tmpbuf[256];
63 int len;
64 if ((*ppos && !write) || !*lenp) {
65 *lenp = 0;
66 return 0;
67 }
68 if (write)
69 return -EINVAL;
70 else {
71 len = svc_print_xprts(tmpbuf, sizeof(tmpbuf));
72 if (!access_ok(VERIFY_WRITE, buffer, len))
73 return -EFAULT;
74
75 if (__copy_to_user(buffer, tmpbuf, len))
76 return -EFAULT;
77 }
78 *lenp -= len;
79 *ppos += len;
80 return 0;
81}
82
58static int 83static int
59proc_dodebug(ctl_table *table, int write, struct file *file, 84proc_dodebug(ctl_table *table, int write, struct file *file,
60 void __user *buffer, size_t *lenp, loff_t *ppos) 85 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -147,6 +172,12 @@ static ctl_table debug_table[] = {
147 .mode = 0644, 172 .mode = 0644,
148 .proc_handler = &proc_dodebug 173 .proc_handler = &proc_dodebug
149 }, 174 },
175 {
176 .procname = "transports",
177 .maxlen = 256,
178 .mode = 0444,
179 .proc_handler = &proc_do_xprt,
180 },
150 { .ctl_name = 0 } 181 { .ctl_name = 0 }
151}; 182};
152 183
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 54264062ea69..995c3fdc16c2 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -96,11 +96,13 @@ xdr_encode_string(__be32 *p, const char *string)
96EXPORT_SYMBOL(xdr_encode_string); 96EXPORT_SYMBOL(xdr_encode_string);
97 97
98__be32 * 98__be32 *
99xdr_decode_string_inplace(__be32 *p, char **sp, int *lenp, int maxlen) 99xdr_decode_string_inplace(__be32 *p, char **sp,
100 unsigned int *lenp, unsigned int maxlen)
100{ 101{
101 unsigned int len; 102 u32 len;
102 103
103 if ((len = ntohl(*p++)) > maxlen) 104 len = ntohl(*p++);
105 if (len > maxlen)
104 return NULL; 106 return NULL;
105 *lenp = len; 107 *lenp = len;
106 *sp = (char *) p; 108 *sp = (char *) p;
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 264f0feeb513..5a8f268bdd30 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,3 +1,8 @@
1obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o 1obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o
2 2
3xprtrdma-y := transport.o rpc_rdma.o verbs.o 3xprtrdma-y := transport.o rpc_rdma.o verbs.o
4
5obj-$(CONFIG_SUNRPC_XPRT_RDMA) += svcrdma.o
6
7svcrdma-y := svc_rdma.o svc_rdma_transport.o \
8 svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
new file mode 100644
index 000000000000..88c0ca20bb1e
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -0,0 +1,266 @@
1/*
2 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * Author: Tom Tucker <tom@opengridcomputing.com>
40 */
41#include <linux/module.h>
42#include <linux/init.h>
43#include <linux/fs.h>
44#include <linux/sysctl.h>
45#include <linux/sunrpc/clnt.h>
46#include <linux/sunrpc/sched.h>
47#include <linux/sunrpc/svc_rdma.h>
48
49#define RPCDBG_FACILITY RPCDBG_SVCXPRT
50
51/* RPC/RDMA parameters */
52unsigned int svcrdma_ord = RPCRDMA_ORD;
53static unsigned int min_ord = 1;
54static unsigned int max_ord = 4096;
55unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
56static unsigned int min_max_requests = 4;
57static unsigned int max_max_requests = 16384;
58unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
59static unsigned int min_max_inline = 4096;
60static unsigned int max_max_inline = 65536;
61
62atomic_t rdma_stat_recv;
63atomic_t rdma_stat_read;
64atomic_t rdma_stat_write;
65atomic_t rdma_stat_sq_starve;
66atomic_t rdma_stat_rq_starve;
67atomic_t rdma_stat_rq_poll;
68atomic_t rdma_stat_rq_prod;
69atomic_t rdma_stat_sq_poll;
70atomic_t rdma_stat_sq_prod;
71
72/*
73 * This function implements reading and resetting an atomic_t stat
74 * variable through read/write to a proc file. Any write to the file
75 * resets the associated statistic to zero. Any read returns it's
76 * current value.
77 */
78static int read_reset_stat(ctl_table *table, int write,
79 struct file *filp, void __user *buffer, size_t *lenp,
80 loff_t *ppos)
81{
82 atomic_t *stat = (atomic_t *)table->data;
83
84 if (!stat)
85 return -EINVAL;
86
87 if (write)
88 atomic_set(stat, 0);
89 else {
90 char str_buf[32];
91 char *data;
92 int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat));
93 if (len >= 32)
94 return -EFAULT;
95 len = strlen(str_buf);
96 if (*ppos > len) {
97 *lenp = 0;
98 return 0;
99 }
100 data = &str_buf[*ppos];
101 len -= *ppos;
102 if (len > *lenp)
103 len = *lenp;
104 if (len && copy_to_user(buffer, str_buf, len))
105 return -EFAULT;
106 *lenp = len;
107 *ppos += len;
108 }
109 return 0;
110}
111
112static struct ctl_table_header *svcrdma_table_header;
113static ctl_table svcrdma_parm_table[] = {
114 {
115 .procname = "max_requests",
116 .data = &svcrdma_max_requests,
117 .maxlen = sizeof(unsigned int),
118 .mode = 0644,
119 .proc_handler = &proc_dointvec_minmax,
120 .strategy = &sysctl_intvec,
121 .extra1 = &min_max_requests,
122 .extra2 = &max_max_requests
123 },
124 {
125 .procname = "max_req_size",
126 .data = &svcrdma_max_req_size,
127 .maxlen = sizeof(unsigned int),
128 .mode = 0644,
129 .proc_handler = &proc_dointvec_minmax,
130 .strategy = &sysctl_intvec,
131 .extra1 = &min_max_inline,
132 .extra2 = &max_max_inline
133 },
134 {
135 .procname = "max_outbound_read_requests",
136 .data = &svcrdma_ord,
137 .maxlen = sizeof(unsigned int),
138 .mode = 0644,
139 .proc_handler = &proc_dointvec_minmax,
140 .strategy = &sysctl_intvec,
141 .extra1 = &min_ord,
142 .extra2 = &max_ord,
143 },
144
145 {
146 .procname = "rdma_stat_read",
147 .data = &rdma_stat_read,
148 .maxlen = sizeof(atomic_t),
149 .mode = 0644,
150 .proc_handler = &read_reset_stat,
151 },
152 {
153 .procname = "rdma_stat_recv",
154 .data = &rdma_stat_recv,
155 .maxlen = sizeof(atomic_t),
156 .mode = 0644,
157 .proc_handler = &read_reset_stat,
158 },
159 {
160 .procname = "rdma_stat_write",
161 .data = &rdma_stat_write,
162 .maxlen = sizeof(atomic_t),
163 .mode = 0644,
164 .proc_handler = &read_reset_stat,
165 },
166 {
167 .procname = "rdma_stat_sq_starve",
168 .data = &rdma_stat_sq_starve,
169 .maxlen = sizeof(atomic_t),
170 .mode = 0644,
171 .proc_handler = &read_reset_stat,
172 },
173 {
174 .procname = "rdma_stat_rq_starve",
175 .data = &rdma_stat_rq_starve,
176 .maxlen = sizeof(atomic_t),
177 .mode = 0644,
178 .proc_handler = &read_reset_stat,
179 },
180 {
181 .procname = "rdma_stat_rq_poll",
182 .data = &rdma_stat_rq_poll,
183 .maxlen = sizeof(atomic_t),
184 .mode = 0644,
185 .proc_handler = &read_reset_stat,
186 },
187 {
188 .procname = "rdma_stat_rq_prod",
189 .data = &rdma_stat_rq_prod,
190 .maxlen = sizeof(atomic_t),
191 .mode = 0644,
192 .proc_handler = &read_reset_stat,
193 },
194 {
195 .procname = "rdma_stat_sq_poll",
196 .data = &rdma_stat_sq_poll,
197 .maxlen = sizeof(atomic_t),
198 .mode = 0644,
199 .proc_handler = &read_reset_stat,
200 },
201 {
202 .procname = "rdma_stat_sq_prod",
203 .data = &rdma_stat_sq_prod,
204 .maxlen = sizeof(atomic_t),
205 .mode = 0644,
206 .proc_handler = &read_reset_stat,
207 },
208 {
209 .ctl_name = 0,
210 },
211};
212
213static ctl_table svcrdma_table[] = {
214 {
215 .procname = "svc_rdma",
216 .mode = 0555,
217 .child = svcrdma_parm_table
218 },
219 {
220 .ctl_name = 0,
221 },
222};
223
224static ctl_table svcrdma_root_table[] = {
225 {
226 .ctl_name = CTL_SUNRPC,
227 .procname = "sunrpc",
228 .mode = 0555,
229 .child = svcrdma_table
230 },
231 {
232 .ctl_name = 0,
233 },
234};
235
236void svc_rdma_cleanup(void)
237{
238 dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
239 if (svcrdma_table_header) {
240 unregister_sysctl_table(svcrdma_table_header);
241 svcrdma_table_header = NULL;
242 }
243 svc_unreg_xprt_class(&svc_rdma_class);
244}
245
246int svc_rdma_init(void)
247{
248 dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
249 dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord);
250 dprintk("\tmax_requests : %d\n", svcrdma_max_requests);
251 dprintk("\tsq_depth : %d\n",
252 svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
253 dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);
254 if (!svcrdma_table_header)
255 svcrdma_table_header =
256 register_sysctl_table(svcrdma_root_table);
257
258 /* Register RDMA with the SVC transport switch */
259 svc_reg_xprt_class(&svc_rdma_class);
260 return 0;
261}
262MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
263MODULE_DESCRIPTION("SVC RDMA Transport");
264MODULE_LICENSE("Dual BSD/GPL");
265module_init(svc_rdma_init);
266module_exit(svc_rdma_cleanup);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
new file mode 100644
index 000000000000..9530ef2d40dc
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -0,0 +1,412 @@
1/*
2 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * Author: Tom Tucker <tom@opengridcomputing.com>
40 */
41
42#include <linux/sunrpc/xdr.h>
43#include <linux/sunrpc/debug.h>
44#include <asm/unaligned.h>
45#include <linux/sunrpc/rpc_rdma.h>
46#include <linux/sunrpc/svc_rdma.h>
47
48#define RPCDBG_FACILITY RPCDBG_SVCXPRT
49
50/*
51 * Decodes a read chunk list. The expected format is as follows:
52 * descrim : xdr_one
53 * position : u32 offset into XDR stream
54 * handle : u32 RKEY
55 * . . .
56 * end-of-list: xdr_zero
57 */
58static u32 *decode_read_list(u32 *va, u32 *vaend)
59{
60 struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va;
61
62 while (ch->rc_discrim != xdr_zero) {
63 u64 ch_offset;
64
65 if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) >
66 (unsigned long)vaend) {
67 dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch);
68 return NULL;
69 }
70
71 ch->rc_discrim = ntohl(ch->rc_discrim);
72 ch->rc_position = ntohl(ch->rc_position);
73 ch->rc_target.rs_handle = ntohl(ch->rc_target.rs_handle);
74 ch->rc_target.rs_length = ntohl(ch->rc_target.rs_length);
75 va = (u32 *)&ch->rc_target.rs_offset;
76 xdr_decode_hyper(va, &ch_offset);
77 put_unaligned(ch_offset, (u64 *)va);
78 ch++;
79 }
80 return (u32 *)&ch->rc_position;
81}
82
83/*
84 * Determine number of chunks and total bytes in chunk list. The chunk
85 * list has already been verified to fit within the RPCRDMA header.
86 */
87void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch,
88 int *ch_count, int *byte_count)
89{
90 /* compute the number of bytes represented by read chunks */
91 *byte_count = 0;
92 *ch_count = 0;
93 for (; ch->rc_discrim != 0; ch++) {
94 *byte_count = *byte_count + ch->rc_target.rs_length;
95 *ch_count = *ch_count + 1;
96 }
97}
98
99/*
100 * Decodes a write chunk list. The expected format is as follows:
101 * descrim : xdr_one
102 * nchunks : <count>
103 * handle : u32 RKEY ---+
104 * length : u32 <len of segment> |
105 * offset : remove va + <count>
106 * . . . |
107 * ---+
108 */
109static u32 *decode_write_list(u32 *va, u32 *vaend)
110{
111 int ch_no;
112 struct rpcrdma_write_array *ary =
113 (struct rpcrdma_write_array *)va;
114
115 /* Check for not write-array */
116 if (ary->wc_discrim == xdr_zero)
117 return (u32 *)&ary->wc_nchunks;
118
119 if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
120 (unsigned long)vaend) {
121 dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
122 return NULL;
123 }
124 ary->wc_discrim = ntohl(ary->wc_discrim);
125 ary->wc_nchunks = ntohl(ary->wc_nchunks);
126 if (((unsigned long)&ary->wc_array[0] +
127 (sizeof(struct rpcrdma_write_chunk) * ary->wc_nchunks)) >
128 (unsigned long)vaend) {
129 dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
130 ary, ary->wc_nchunks, vaend);
131 return NULL;
132 }
133 for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) {
134 u64 ch_offset;
135
136 ary->wc_array[ch_no].wc_target.rs_handle =
137 ntohl(ary->wc_array[ch_no].wc_target.rs_handle);
138 ary->wc_array[ch_no].wc_target.rs_length =
139 ntohl(ary->wc_array[ch_no].wc_target.rs_length);
140 va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset;
141 xdr_decode_hyper(va, &ch_offset);
142 put_unaligned(ch_offset, (u64 *)va);
143 }
144
145 /*
146 * rs_length is the 2nd 4B field in wc_target and taking its
147 * address skips the list terminator
148 */
149 return (u32 *)&ary->wc_array[ch_no].wc_target.rs_length;
150}
151
152static u32 *decode_reply_array(u32 *va, u32 *vaend)
153{
154 int ch_no;
155 struct rpcrdma_write_array *ary =
156 (struct rpcrdma_write_array *)va;
157
158 /* Check for no reply-array */
159 if (ary->wc_discrim == xdr_zero)
160 return (u32 *)&ary->wc_nchunks;
161
162 if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
163 (unsigned long)vaend) {
164 dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
165 return NULL;
166 }
167 ary->wc_discrim = ntohl(ary->wc_discrim);
168 ary->wc_nchunks = ntohl(ary->wc_nchunks);
169 if (((unsigned long)&ary->wc_array[0] +
170 (sizeof(struct rpcrdma_write_chunk) * ary->wc_nchunks)) >
171 (unsigned long)vaend) {
172 dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
173 ary, ary->wc_nchunks, vaend);
174 return NULL;
175 }
176 for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) {
177 u64 ch_offset;
178
179 ary->wc_array[ch_no].wc_target.rs_handle =
180 ntohl(ary->wc_array[ch_no].wc_target.rs_handle);
181 ary->wc_array[ch_no].wc_target.rs_length =
182 ntohl(ary->wc_array[ch_no].wc_target.rs_length);
183 va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset;
184 xdr_decode_hyper(va, &ch_offset);
185 put_unaligned(ch_offset, (u64 *)va);
186 }
187
188 return (u32 *)&ary->wc_array[ch_no];
189}
190
191int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
192 struct svc_rqst *rqstp)
193{
194 struct rpcrdma_msg *rmsgp = NULL;
195 u32 *va;
196 u32 *vaend;
197 u32 hdr_len;
198
199 rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
200
201 /* Verify that there's enough bytes for header + something */
202 if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_MIN) {
203 dprintk("svcrdma: header too short = %d\n",
204 rqstp->rq_arg.len);
205 return -EINVAL;
206 }
207
208 /* Decode the header */
209 rmsgp->rm_xid = ntohl(rmsgp->rm_xid);
210 rmsgp->rm_vers = ntohl(rmsgp->rm_vers);
211 rmsgp->rm_credit = ntohl(rmsgp->rm_credit);
212 rmsgp->rm_type = ntohl(rmsgp->rm_type);
213
214 if (rmsgp->rm_vers != RPCRDMA_VERSION)
215 return -ENOSYS;
216
217 /* Pull in the extra for the padded case and bump our pointer */
218 if (rmsgp->rm_type == RDMA_MSGP) {
219 int hdrlen;
220 rmsgp->rm_body.rm_padded.rm_align =
221 ntohl(rmsgp->rm_body.rm_padded.rm_align);
222 rmsgp->rm_body.rm_padded.rm_thresh =
223 ntohl(rmsgp->rm_body.rm_padded.rm_thresh);
224
225 va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
226 rqstp->rq_arg.head[0].iov_base = va;
227 hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
228 rqstp->rq_arg.head[0].iov_len -= hdrlen;
229 if (hdrlen > rqstp->rq_arg.len)
230 return -EINVAL;
231 return hdrlen;
232 }
233
234 /* The chunk list may contain either a read chunk list or a write
235 * chunk list and a reply chunk list.
236 */
237 va = &rmsgp->rm_body.rm_chunks[0];
238 vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
239 va = decode_read_list(va, vaend);
240 if (!va)
241 return -EINVAL;
242 va = decode_write_list(va, vaend);
243 if (!va)
244 return -EINVAL;
245 va = decode_reply_array(va, vaend);
246 if (!va)
247 return -EINVAL;
248
249 rqstp->rq_arg.head[0].iov_base = va;
250 hdr_len = (unsigned long)va - (unsigned long)rmsgp;
251 rqstp->rq_arg.head[0].iov_len -= hdr_len;
252
253 *rdma_req = rmsgp;
254 return hdr_len;
255}
256
257int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *rqstp)
258{
259 struct rpcrdma_msg *rmsgp = NULL;
260 struct rpcrdma_read_chunk *ch;
261 struct rpcrdma_write_array *ary;
262 u32 *va;
263 u32 hdrlen;
264
265 dprintk("svcrdma: processing deferred RDMA header on rqstp=%p\n",
266 rqstp);
267 rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
268
269 /* Pull in the extra for the padded case and bump our pointer */
270 if (rmsgp->rm_type == RDMA_MSGP) {
271 va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
272 rqstp->rq_arg.head[0].iov_base = va;
273 hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
274 rqstp->rq_arg.head[0].iov_len -= hdrlen;
275 return hdrlen;
276 }
277
278 /*
279 * Skip all chunks to find RPC msg. These were previously processed
280 */
281 va = &rmsgp->rm_body.rm_chunks[0];
282
283 /* Skip read-list */
284 for (ch = (struct rpcrdma_read_chunk *)va;
285 ch->rc_discrim != xdr_zero; ch++);
286 va = (u32 *)&ch->rc_position;
287
288 /* Skip write-list */
289 ary = (struct rpcrdma_write_array *)va;
290 if (ary->wc_discrim == xdr_zero)
291 va = (u32 *)&ary->wc_nchunks;
292 else
293 /*
294 * rs_length is the 2nd 4B field in wc_target and taking its
295 * address skips the list terminator
296 */
297 va = (u32 *)&ary->wc_array[ary->wc_nchunks].wc_target.rs_length;
298
299 /* Skip reply-array */
300 ary = (struct rpcrdma_write_array *)va;
301 if (ary->wc_discrim == xdr_zero)
302 va = (u32 *)&ary->wc_nchunks;
303 else
304 va = (u32 *)&ary->wc_array[ary->wc_nchunks];
305
306 rqstp->rq_arg.head[0].iov_base = va;
307 hdrlen = (unsigned long)va - (unsigned long)rmsgp;
308 rqstp->rq_arg.head[0].iov_len -= hdrlen;
309
310 return hdrlen;
311}
312
313int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
314 struct rpcrdma_msg *rmsgp,
315 enum rpcrdma_errcode err, u32 *va)
316{
317 u32 *startp = va;
318
319 *va++ = htonl(rmsgp->rm_xid);
320 *va++ = htonl(rmsgp->rm_vers);
321 *va++ = htonl(xprt->sc_max_requests);
322 *va++ = htonl(RDMA_ERROR);
323 *va++ = htonl(err);
324 if (err == ERR_VERS) {
325 *va++ = htonl(RPCRDMA_VERSION);
326 *va++ = htonl(RPCRDMA_VERSION);
327 }
328
329 return (int)((unsigned long)va - (unsigned long)startp);
330}
331
332int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
333{
334 struct rpcrdma_write_array *wr_ary;
335
336 /* There is no read-list in a reply */
337
338 /* skip write list */
339 wr_ary = (struct rpcrdma_write_array *)
340 &rmsgp->rm_body.rm_chunks[1];
341 if (wr_ary->wc_discrim)
342 wr_ary = (struct rpcrdma_write_array *)
343 &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)].
344 wc_target.rs_length;
345 else
346 wr_ary = (struct rpcrdma_write_array *)
347 &wr_ary->wc_nchunks;
348
349 /* skip reply array */
350 if (wr_ary->wc_discrim)
351 wr_ary = (struct rpcrdma_write_array *)
352 &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)];
353 else
354 wr_ary = (struct rpcrdma_write_array *)
355 &wr_ary->wc_nchunks;
356
357 return (unsigned long) wr_ary - (unsigned long) rmsgp;
358}
359
360void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
361{
362 struct rpcrdma_write_array *ary;
363
364 /* no read-list */
365 rmsgp->rm_body.rm_chunks[0] = xdr_zero;
366
367 /* write-array discrim */
368 ary = (struct rpcrdma_write_array *)
369 &rmsgp->rm_body.rm_chunks[1];
370 ary->wc_discrim = xdr_one;
371 ary->wc_nchunks = htonl(chunks);
372
373 /* write-list terminator */
374 ary->wc_array[chunks].wc_target.rs_handle = xdr_zero;
375
376 /* reply-array discriminator */
377 ary->wc_array[chunks].wc_target.rs_length = xdr_zero;
378}
379
380void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary,
381 int chunks)
382{
383 ary->wc_discrim = xdr_one;
384 ary->wc_nchunks = htonl(chunks);
385}
386
387void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
388 int chunk_no,
389 u32 rs_handle, u64 rs_offset,
390 u32 write_len)
391{
392 struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target;
393 seg->rs_handle = htonl(rs_handle);
394 seg->rs_length = htonl(write_len);
395 xdr_encode_hyper((u32 *) &seg->rs_offset, rs_offset);
396}
397
398void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
399 struct rpcrdma_msg *rdma_argp,
400 struct rpcrdma_msg *rdma_resp,
401 enum rpcrdma_proc rdma_type)
402{
403 rdma_resp->rm_xid = htonl(rdma_argp->rm_xid);
404 rdma_resp->rm_vers = htonl(rdma_argp->rm_vers);
405 rdma_resp->rm_credit = htonl(xprt->sc_max_requests);
406 rdma_resp->rm_type = htonl(rdma_type);
407
408 /* Encode <nul> chunks lists */
409 rdma_resp->rm_body.rm_chunks[0] = xdr_zero;
410 rdma_resp->rm_body.rm_chunks[1] = xdr_zero;
411 rdma_resp->rm_body.rm_chunks[2] = xdr_zero;
412}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
new file mode 100644
index 000000000000..ab54a736486e
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -0,0 +1,586 @@
1/*
2 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * Author: Tom Tucker <tom@opengridcomputing.com>
40 */
41
42#include <linux/sunrpc/debug.h>
43#include <linux/sunrpc/rpc_rdma.h>
44#include <linux/spinlock.h>
45#include <asm/unaligned.h>
46#include <rdma/ib_verbs.h>
47#include <rdma/rdma_cm.h>
48#include <linux/sunrpc/svc_rdma.h>
49
50#define RPCDBG_FACILITY RPCDBG_SVCXPRT
51
52/*
53 * Replace the pages in the rq_argpages array with the pages from the SGE in
54 * the RDMA_RECV completion. The SGL should contain full pages up until the
55 * last one.
56 */
57static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
58 struct svc_rdma_op_ctxt *ctxt,
59 u32 byte_count)
60{
61 struct page *page;
62 u32 bc;
63 int sge_no;
64
65 /* Swap the page in the SGE with the page in argpages */
66 page = ctxt->pages[0];
67 put_page(rqstp->rq_pages[0]);
68 rqstp->rq_pages[0] = page;
69
70 /* Set up the XDR head */
71 rqstp->rq_arg.head[0].iov_base = page_address(page);
72 rqstp->rq_arg.head[0].iov_len = min(byte_count, ctxt->sge[0].length);
73 rqstp->rq_arg.len = byte_count;
74 rqstp->rq_arg.buflen = byte_count;
75
76 /* Compute bytes past head in the SGL */
77 bc = byte_count - rqstp->rq_arg.head[0].iov_len;
78
79 /* If data remains, store it in the pagelist */
80 rqstp->rq_arg.page_len = bc;
81 rqstp->rq_arg.page_base = 0;
82 rqstp->rq_arg.pages = &rqstp->rq_pages[1];
83 sge_no = 1;
84 while (bc && sge_no < ctxt->count) {
85 page = ctxt->pages[sge_no];
86 put_page(rqstp->rq_pages[sge_no]);
87 rqstp->rq_pages[sge_no] = page;
88 bc -= min(bc, ctxt->sge[sge_no].length);
89 rqstp->rq_arg.buflen += ctxt->sge[sge_no].length;
90 sge_no++;
91 }
92 rqstp->rq_respages = &rqstp->rq_pages[sge_no];
93
94 /* We should never run out of SGE because the limit is defined to
95 * support the max allowed RPC data length
96 */
97 BUG_ON(bc && (sge_no == ctxt->count));
98 BUG_ON((rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len)
99 != byte_count);
100 BUG_ON(rqstp->rq_arg.len != byte_count);
101
102 /* If not all pages were used from the SGL, free the remaining ones */
103 bc = sge_no;
104 while (sge_no < ctxt->count) {
105 page = ctxt->pages[sge_no++];
106 put_page(page);
107 }
108 ctxt->count = bc;
109
110 /* Set up tail */
111 rqstp->rq_arg.tail[0].iov_base = NULL;
112 rqstp->rq_arg.tail[0].iov_len = 0;
113}
114
115struct chunk_sge {
116 int start; /* sge no for this chunk */
117 int count; /* sge count for this chunk */
118};
119
120/* Encode a read-chunk-list as an array of IB SGE
121 *
122 * Assumptions:
123 * - chunk[0]->position points to pages[0] at an offset of 0
124 * - pages[] is not physically or virtually contigous and consists of
125 * PAGE_SIZE elements.
126 *
127 * Output:
128 * - sge array pointing into pages[] array.
129 * - chunk_sge array specifying sge index and count for each
130 * chunk in the read list
131 *
132 */
133static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
134 struct svc_rqst *rqstp,
135 struct svc_rdma_op_ctxt *head,
136 struct rpcrdma_msg *rmsgp,
137 struct ib_sge *sge,
138 struct chunk_sge *ch_sge_ary,
139 int ch_count,
140 int byte_count)
141{
142 int sge_no;
143 int sge_bytes;
144 int page_off;
145 int page_no;
146 int ch_bytes;
147 int ch_no;
148 struct rpcrdma_read_chunk *ch;
149
150 sge_no = 0;
151 page_no = 0;
152 page_off = 0;
153 ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
154 ch_no = 0;
155 ch_bytes = ch->rc_target.rs_length;
156 head->arg.head[0] = rqstp->rq_arg.head[0];
157 head->arg.tail[0] = rqstp->rq_arg.tail[0];
158 head->arg.pages = &head->pages[head->count];
159 head->sge[0].length = head->count; /* save count of hdr pages */
160 head->arg.page_base = 0;
161 head->arg.page_len = ch_bytes;
162 head->arg.len = rqstp->rq_arg.len + ch_bytes;
163 head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes;
164 head->count++;
165 ch_sge_ary[0].start = 0;
166 while (byte_count) {
167 sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes);
168 sge[sge_no].addr =
169 ib_dma_map_page(xprt->sc_cm_id->device,
170 rqstp->rq_arg.pages[page_no],
171 page_off, sge_bytes,
172 DMA_FROM_DEVICE);
173 sge[sge_no].length = sge_bytes;
174 sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
175 /*
176 * Don't bump head->count here because the same page
177 * may be used by multiple SGE.
178 */
179 head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
180 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1];
181
182 byte_count -= sge_bytes;
183 ch_bytes -= sge_bytes;
184 sge_no++;
185 /*
186 * If all bytes for this chunk have been mapped to an
187 * SGE, move to the next SGE
188 */
189 if (ch_bytes == 0) {
190 ch_sge_ary[ch_no].count =
191 sge_no - ch_sge_ary[ch_no].start;
192 ch_no++;
193 ch++;
194 ch_sge_ary[ch_no].start = sge_no;
195 ch_bytes = ch->rc_target.rs_length;
196 /* If bytes remaining account for next chunk */
197 if (byte_count) {
198 head->arg.page_len += ch_bytes;
199 head->arg.len += ch_bytes;
200 head->arg.buflen += ch_bytes;
201 }
202 }
203 /*
204 * If this SGE consumed all of the page, move to the
205 * next page
206 */
207 if ((sge_bytes + page_off) == PAGE_SIZE) {
208 page_no++;
209 page_off = 0;
210 /*
211 * If there are still bytes left to map, bump
212 * the page count
213 */
214 if (byte_count)
215 head->count++;
216 } else
217 page_off += sge_bytes;
218 }
219 BUG_ON(byte_count != 0);
220 return sge_no;
221}
222
223static void rdma_set_ctxt_sge(struct svc_rdma_op_ctxt *ctxt,
224 struct ib_sge *sge,
225 u64 *sgl_offset,
226 int count)
227{
228 int i;
229
230 ctxt->count = count;
231 for (i = 0; i < count; i++) {
232 ctxt->sge[i].addr = sge[i].addr;
233 ctxt->sge[i].length = sge[i].length;
234 *sgl_offset = *sgl_offset + sge[i].length;
235 }
236}
237
238static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
239{
240#ifdef RDMA_TRANSPORT_IWARP
241 if ((RDMA_TRANSPORT_IWARP ==
242 rdma_node_get_transport(xprt->sc_cm_id->
243 device->node_type))
244 && sge_count > 1)
245 return 1;
246 else
247#endif
248 return min_t(int, sge_count, xprt->sc_max_sge);
249}
250
251/*
252 * Use RDMA_READ to read data from the advertised client buffer into the
253 * XDR stream starting at rq_arg.head[0].iov_base.
254 * Each chunk in the array
255 * contains the following fields:
256 * discrim - '1', This isn't used for data placement
257 * position - The xdr stream offset (the same for every chunk)
258 * handle - RMR for client memory region
259 * length - data transfer length
260 * offset - 64 bit tagged offset in remote memory region
261 *
262 * On our side, we need to read into a pagelist. The first page immediately
263 * follows the RPC header.
264 *
265 * This function returns 1 to indicate success. The data is not yet in
266 * the pagelist and therefore the RPC request must be deferred. The
267 * I/O completion will enqueue the transport again and
268 * svc_rdma_recvfrom will complete the request.
269 *
270 * NOTE: The ctxt must not be touched after the last WR has been posted
271 * because the I/O completion processing may occur on another
272 * processor and free / modify the context. Ne touche pas!
273 */
274static int rdma_read_xdr(struct svcxprt_rdma *xprt,
275 struct rpcrdma_msg *rmsgp,
276 struct svc_rqst *rqstp,
277 struct svc_rdma_op_ctxt *hdr_ctxt)
278{
279 struct ib_send_wr read_wr;
280 int err = 0;
281 int ch_no;
282 struct ib_sge *sge;
283 int ch_count;
284 int byte_count;
285 int sge_count;
286 u64 sgl_offset;
287 struct rpcrdma_read_chunk *ch;
288 struct svc_rdma_op_ctxt *ctxt = NULL;
289 struct svc_rdma_op_ctxt *head;
290 struct svc_rdma_op_ctxt *tmp_sge_ctxt;
291 struct svc_rdma_op_ctxt *tmp_ch_ctxt;
292 struct chunk_sge *ch_sge_ary;
293
294 /* If no read list is present, return 0 */
295 ch = svc_rdma_get_read_chunk(rmsgp);
296 if (!ch)
297 return 0;
298
299 /* Allocate temporary contexts to keep SGE */
300 BUG_ON(sizeof(struct ib_sge) < sizeof(struct chunk_sge));
301 tmp_sge_ctxt = svc_rdma_get_context(xprt);
302 sge = tmp_sge_ctxt->sge;
303 tmp_ch_ctxt = svc_rdma_get_context(xprt);
304 ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge;
305
306 svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
307 sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp,
308 sge, ch_sge_ary,
309 ch_count, byte_count);
310 head = svc_rdma_get_context(xprt);
311 sgl_offset = 0;
312 ch_no = 0;
313
314 for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
315 ch->rc_discrim != 0; ch++, ch_no++) {
316next_sge:
317 if (!ctxt)
318 ctxt = head;
319 else {
320 ctxt->next = svc_rdma_get_context(xprt);
321 ctxt = ctxt->next;
322 }
323 ctxt->next = NULL;
324 ctxt->direction = DMA_FROM_DEVICE;
325 clear_bit(RDMACTXT_F_READ_DONE, &ctxt->flags);
326 clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
327 if ((ch+1)->rc_discrim == 0) {
328 /*
329 * Checked in sq_cq_reap to see if we need to
330 * be enqueued
331 */
332 set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
333 ctxt->next = hdr_ctxt;
334 hdr_ctxt->next = head;
335 }
336
337 /* Prepare READ WR */
338 memset(&read_wr, 0, sizeof read_wr);
339 ctxt->wr_op = IB_WR_RDMA_READ;
340 read_wr.wr_id = (unsigned long)ctxt;
341 read_wr.opcode = IB_WR_RDMA_READ;
342 read_wr.send_flags = IB_SEND_SIGNALED;
343 read_wr.wr.rdma.rkey = ch->rc_target.rs_handle;
344 read_wr.wr.rdma.remote_addr =
345 get_unaligned(&(ch->rc_target.rs_offset)) +
346 sgl_offset;
347 read_wr.sg_list = &sge[ch_sge_ary[ch_no].start];
348 read_wr.num_sge =
349 rdma_read_max_sge(xprt, ch_sge_ary[ch_no].count);
350 rdma_set_ctxt_sge(ctxt, &sge[ch_sge_ary[ch_no].start],
351 &sgl_offset,
352 read_wr.num_sge);
353
354 /* Post the read */
355 err = svc_rdma_send(xprt, &read_wr);
356 if (err) {
357 printk(KERN_ERR "svcrdma: Error posting send = %d\n",
358 err);
359 /*
360 * Break the circular list so free knows when
361 * to stop if the error happened to occur on
362 * the last read
363 */
364 ctxt->next = NULL;
365 goto out;
366 }
367 atomic_inc(&rdma_stat_read);
368
369 if (read_wr.num_sge < ch_sge_ary[ch_no].count) {
370 ch_sge_ary[ch_no].count -= read_wr.num_sge;
371 ch_sge_ary[ch_no].start += read_wr.num_sge;
372 goto next_sge;
373 }
374 sgl_offset = 0;
375 err = 0;
376 }
377
378 out:
379 svc_rdma_put_context(tmp_sge_ctxt, 0);
380 svc_rdma_put_context(tmp_ch_ctxt, 0);
381
382 /* Detach arg pages. svc_recv will replenish them */
383 for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++)
384 rqstp->rq_pages[ch_no] = NULL;
385
386 /*
387 * Detach res pages. svc_release must see a resused count of
388 * zero or it will attempt to put them.
389 */
390 while (rqstp->rq_resused)
391 rqstp->rq_respages[--rqstp->rq_resused] = NULL;
392
393 if (err) {
394 printk(KERN_ERR "svcrdma : RDMA_READ error = %d\n", err);
395 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
396 /* Free the linked list of read contexts */
397 while (head != NULL) {
398 ctxt = head->next;
399 svc_rdma_put_context(head, 1);
400 head = ctxt;
401 }
402 return 0;
403 }
404
405 return 1;
406}
407
408static int rdma_read_complete(struct svc_rqst *rqstp,
409 struct svc_rdma_op_ctxt *data)
410{
411 struct svc_rdma_op_ctxt *head = data->next;
412 int page_no;
413 int ret;
414
415 BUG_ON(!head);
416
417 /* Copy RPC pages */
418 for (page_no = 0; page_no < head->count; page_no++) {
419 put_page(rqstp->rq_pages[page_no]);
420 rqstp->rq_pages[page_no] = head->pages[page_no];
421 }
422 /* Point rq_arg.pages past header */
423 rqstp->rq_arg.pages = &rqstp->rq_pages[head->sge[0].length];
424 rqstp->rq_arg.page_len = head->arg.page_len;
425 rqstp->rq_arg.page_base = head->arg.page_base;
426
427 /* rq_respages starts after the last arg page */
428 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
429 rqstp->rq_resused = 0;
430
431 /* Rebuild rq_arg head and tail. */
432 rqstp->rq_arg.head[0] = head->arg.head[0];
433 rqstp->rq_arg.tail[0] = head->arg.tail[0];
434 rqstp->rq_arg.len = head->arg.len;
435 rqstp->rq_arg.buflen = head->arg.buflen;
436
437 /* XXX: What should this be? */
438 rqstp->rq_prot = IPPROTO_MAX;
439
440 /*
441 * Free the contexts we used to build the RDMA_READ. We have
442 * to be careful here because the context list uses the same
443 * next pointer used to chain the contexts associated with the
444 * RDMA_READ
445 */
446 data->next = NULL; /* terminate circular list */
447 do {
448 data = head->next;
449 svc_rdma_put_context(head, 0);
450 head = data;
451 } while (head != NULL);
452
453 ret = rqstp->rq_arg.head[0].iov_len
454 + rqstp->rq_arg.page_len
455 + rqstp->rq_arg.tail[0].iov_len;
456 dprintk("svcrdma: deferred read ret=%d, rq_arg.len =%d, "
457 "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n",
458 ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base,
459 rqstp->rq_arg.head[0].iov_len);
460
461 /* Indicate that we've consumed an RQ credit */
462 rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
463 svc_xprt_received(rqstp->rq_xprt);
464 return ret;
465}
466
467/*
468 * Set up the rqstp thread context to point to the RQ buffer. If
469 * necessary, pull additional data from the client with an RDMA_READ
470 * request.
471 */
472int svc_rdma_recvfrom(struct svc_rqst *rqstp)
473{
474 struct svc_xprt *xprt = rqstp->rq_xprt;
475 struct svcxprt_rdma *rdma_xprt =
476 container_of(xprt, struct svcxprt_rdma, sc_xprt);
477 struct svc_rdma_op_ctxt *ctxt = NULL;
478 struct rpcrdma_msg *rmsgp;
479 int ret = 0;
480 int len;
481
482 dprintk("svcrdma: rqstp=%p\n", rqstp);
483
484 /*
485 * The rq_xprt_ctxt indicates if we've consumed an RQ credit
486 * or not. It is used in the rdma xpo_release_rqst function to
487 * determine whether or not to return an RQ WQE to the RQ.
488 */
489 rqstp->rq_xprt_ctxt = NULL;
490
491 spin_lock_bh(&rdma_xprt->sc_read_complete_lock);
492 if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
493 ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
494 struct svc_rdma_op_ctxt,
495 dto_q);
496 list_del_init(&ctxt->dto_q);
497 }
498 spin_unlock_bh(&rdma_xprt->sc_read_complete_lock);
499 if (ctxt)
500 return rdma_read_complete(rqstp, ctxt);
501
502 spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
503 if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
504 ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
505 struct svc_rdma_op_ctxt,
506 dto_q);
507 list_del_init(&ctxt->dto_q);
508 } else {
509 atomic_inc(&rdma_stat_rq_starve);
510 clear_bit(XPT_DATA, &xprt->xpt_flags);
511 ctxt = NULL;
512 }
513 spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
514 if (!ctxt) {
515 /* This is the EAGAIN path. The svc_recv routine will
516 * return -EAGAIN, the nfsd thread will go to call into
517 * svc_recv again and we shouldn't be on the active
518 * transport list
519 */
520 if (test_bit(XPT_CLOSE, &xprt->xpt_flags))
521 goto close_out;
522
523 BUG_ON(ret);
524 goto out;
525 }
526 dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
527 ctxt, rdma_xprt, rqstp, ctxt->wc_status);
528 BUG_ON(ctxt->wc_status != IB_WC_SUCCESS);
529 atomic_inc(&rdma_stat_recv);
530
531 /* Build up the XDR from the receive buffers. */
532 rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len);
533
534 /* Decode the RDMA header. */
535 len = svc_rdma_xdr_decode_req(&rmsgp, rqstp);
536 rqstp->rq_xprt_hlen = len;
537
538 /* If the request is invalid, reply with an error */
539 if (len < 0) {
540 if (len == -ENOSYS)
541 (void)svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS);
542 goto close_out;
543 }
544
545 /* Read read-list data. If we would need to wait, defer
546 * it. Not that in this case, we don't return the RQ credit
547 * until after the read completes.
548 */
549 if (rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt)) {
550 svc_xprt_received(xprt);
551 return 0;
552 }
553
554 /* Indicate we've consumed an RQ credit */
555 rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
556
557 ret = rqstp->rq_arg.head[0].iov_len
558 + rqstp->rq_arg.page_len
559 + rqstp->rq_arg.tail[0].iov_len;
560 svc_rdma_put_context(ctxt, 0);
561 out:
562 dprintk("svcrdma: ret = %d, rq_arg.len =%d, "
563 "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n",
564 ret, rqstp->rq_arg.len,
565 rqstp->rq_arg.head[0].iov_base,
566 rqstp->rq_arg.head[0].iov_len);
567 rqstp->rq_prot = IPPROTO_MAX;
568 svc_xprt_copy_addrs(rqstp, xprt);
569 svc_xprt_received(xprt);
570 return ret;
571
572 close_out:
573 if (ctxt) {
574 svc_rdma_put_context(ctxt, 1);
575 /* Indicate we've consumed an RQ credit */
576 rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
577 }
578 dprintk("svcrdma: transport %p is closing\n", xprt);
579 /*
580 * Set the close bit and enqueue it. svc_recv will see the
581 * close bit and call svc_xprt_delete
582 */
583 set_bit(XPT_CLOSE, &xprt->xpt_flags);
584 svc_xprt_received(xprt);
585 return 0;
586}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
new file mode 100644
index 000000000000..3e321949e1dc
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -0,0 +1,520 @@
1/*
2 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * Author: Tom Tucker <tom@opengridcomputing.com>
40 */
41
42#include <linux/sunrpc/debug.h>
43#include <linux/sunrpc/rpc_rdma.h>
44#include <linux/spinlock.h>
45#include <asm/unaligned.h>
46#include <rdma/ib_verbs.h>
47#include <rdma/rdma_cm.h>
48#include <linux/sunrpc/svc_rdma.h>
49
50#define RPCDBG_FACILITY RPCDBG_SVCXPRT
51
52/* Encode an XDR as an array of IB SGE
53 *
54 * Assumptions:
55 * - head[0] is physically contiguous.
56 * - tail[0] is physically contiguous.
57 * - pages[] is not physically or virtually contigous and consists of
58 * PAGE_SIZE elements.
59 *
60 * Output:
61 * SGE[0] reserved for RCPRDMA header
62 * SGE[1] data from xdr->head[]
63 * SGE[2..sge_count-2] data from xdr->pages[]
64 * SGE[sge_count-1] data from xdr->tail.
65 *
66 */
67static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt,
68 struct xdr_buf *xdr,
69 struct ib_sge *sge,
70 int *sge_count)
71{
72 /* Max we need is the length of the XDR / pagesize + one for
73 * head + one for tail + one for RPCRDMA header
74 */
75 int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3;
76 int sge_no;
77 u32 byte_count = xdr->len;
78 u32 sge_bytes;
79 u32 page_bytes;
80 int page_off;
81 int page_no;
82
83 /* Skip the first sge, this is for the RPCRDMA header */
84 sge_no = 1;
85
86 /* Head SGE */
87 sge[sge_no].addr = ib_dma_map_single(xprt->sc_cm_id->device,
88 xdr->head[0].iov_base,
89 xdr->head[0].iov_len,
90 DMA_TO_DEVICE);
91 sge_bytes = min_t(u32, byte_count, xdr->head[0].iov_len);
92 byte_count -= sge_bytes;
93 sge[sge_no].length = sge_bytes;
94 sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
95 sge_no++;
96
97 /* pages SGE */
98 page_no = 0;
99 page_bytes = xdr->page_len;
100 page_off = xdr->page_base;
101 while (byte_count && page_bytes) {
102 sge_bytes = min_t(u32, byte_count, (PAGE_SIZE-page_off));
103 sge[sge_no].addr =
104 ib_dma_map_page(xprt->sc_cm_id->device,
105 xdr->pages[page_no], page_off,
106 sge_bytes, DMA_TO_DEVICE);
107 sge_bytes = min(sge_bytes, page_bytes);
108 byte_count -= sge_bytes;
109 page_bytes -= sge_bytes;
110 sge[sge_no].length = sge_bytes;
111 sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
112
113 sge_no++;
114 page_no++;
115 page_off = 0; /* reset for next time through loop */
116 }
117
118 /* Tail SGE */
119 if (byte_count && xdr->tail[0].iov_len) {
120 sge[sge_no].addr =
121 ib_dma_map_single(xprt->sc_cm_id->device,
122 xdr->tail[0].iov_base,
123 xdr->tail[0].iov_len,
124 DMA_TO_DEVICE);
125 sge_bytes = min_t(u32, byte_count, xdr->tail[0].iov_len);
126 byte_count -= sge_bytes;
127 sge[sge_no].length = sge_bytes;
128 sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
129 sge_no++;
130 }
131
132 BUG_ON(sge_no > sge_max);
133 BUG_ON(byte_count != 0);
134
135 *sge_count = sge_no;
136 return sge;
137}
138
139
140/* Assumptions:
141 * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
142 */
143static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
144 u32 rmr, u64 to,
145 u32 xdr_off, int write_len,
146 struct ib_sge *xdr_sge, int sge_count)
147{
148 struct svc_rdma_op_ctxt *tmp_sge_ctxt;
149 struct ib_send_wr write_wr;
150 struct ib_sge *sge;
151 int xdr_sge_no;
152 int sge_no;
153 int sge_bytes;
154 int sge_off;
155 int bc;
156 struct svc_rdma_op_ctxt *ctxt;
157 int ret = 0;
158
159 BUG_ON(sge_count >= 32);
160 dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, "
161 "write_len=%d, xdr_sge=%p, sge_count=%d\n",
162 rmr, to, xdr_off, write_len, xdr_sge, sge_count);
163
164 ctxt = svc_rdma_get_context(xprt);
165 ctxt->count = 0;
166 tmp_sge_ctxt = svc_rdma_get_context(xprt);
167 sge = tmp_sge_ctxt->sge;
168
169 /* Find the SGE associated with xdr_off */
170 for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < sge_count;
171 xdr_sge_no++) {
172 if (xdr_sge[xdr_sge_no].length > bc)
173 break;
174 bc -= xdr_sge[xdr_sge_no].length;
175 }
176
177 sge_off = bc;
178 bc = write_len;
179 sge_no = 0;
180
181 /* Copy the remaining SGE */
182 while (bc != 0 && xdr_sge_no < sge_count) {
183 sge[sge_no].addr = xdr_sge[xdr_sge_no].addr + sge_off;
184 sge[sge_no].lkey = xdr_sge[xdr_sge_no].lkey;
185 sge_bytes = min((size_t)bc,
186 (size_t)(xdr_sge[xdr_sge_no].length-sge_off));
187 sge[sge_no].length = sge_bytes;
188
189 sge_off = 0;
190 sge_no++;
191 xdr_sge_no++;
192 bc -= sge_bytes;
193 }
194
195 BUG_ON(bc != 0);
196 BUG_ON(xdr_sge_no > sge_count);
197
198 /* Prepare WRITE WR */
199 memset(&write_wr, 0, sizeof write_wr);
200 ctxt->wr_op = IB_WR_RDMA_WRITE;
201 write_wr.wr_id = (unsigned long)ctxt;
202 write_wr.sg_list = &sge[0];
203 write_wr.num_sge = sge_no;
204 write_wr.opcode = IB_WR_RDMA_WRITE;
205 write_wr.send_flags = IB_SEND_SIGNALED;
206 write_wr.wr.rdma.rkey = rmr;
207 write_wr.wr.rdma.remote_addr = to;
208
209 /* Post It */
210 atomic_inc(&rdma_stat_write);
211 if (svc_rdma_send(xprt, &write_wr)) {
212 svc_rdma_put_context(ctxt, 1);
213 /* Fatal error, close transport */
214 ret = -EIO;
215 }
216 svc_rdma_put_context(tmp_sge_ctxt, 0);
217 return ret;
218}
219
220static int send_write_chunks(struct svcxprt_rdma *xprt,
221 struct rpcrdma_msg *rdma_argp,
222 struct rpcrdma_msg *rdma_resp,
223 struct svc_rqst *rqstp,
224 struct ib_sge *sge,
225 int sge_count)
226{
227 u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
228 int write_len;
229 int max_write;
230 u32 xdr_off;
231 int chunk_off;
232 int chunk_no;
233 struct rpcrdma_write_array *arg_ary;
234 struct rpcrdma_write_array *res_ary;
235 int ret;
236
237 arg_ary = svc_rdma_get_write_array(rdma_argp);
238 if (!arg_ary)
239 return 0;
240 res_ary = (struct rpcrdma_write_array *)
241 &rdma_resp->rm_body.rm_chunks[1];
242
243 max_write = xprt->sc_max_sge * PAGE_SIZE;
244
245 /* Write chunks start at the pagelist */
246 for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
247 xfer_len && chunk_no < arg_ary->wc_nchunks;
248 chunk_no++) {
249 struct rpcrdma_segment *arg_ch;
250 u64 rs_offset;
251
252 arg_ch = &arg_ary->wc_array[chunk_no].wc_target;
253 write_len = min(xfer_len, arg_ch->rs_length);
254
255 /* Prepare the response chunk given the length actually
256 * written */
257 rs_offset = get_unaligned(&(arg_ch->rs_offset));
258 svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
259 arg_ch->rs_handle,
260 rs_offset,
261 write_len);
262 chunk_off = 0;
263 while (write_len) {
264 int this_write;
265 this_write = min(write_len, max_write);
266 ret = send_write(xprt, rqstp,
267 arg_ch->rs_handle,
268 rs_offset + chunk_off,
269 xdr_off,
270 this_write,
271 sge,
272 sge_count);
273 if (ret) {
274 dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
275 ret);
276 return -EIO;
277 }
278 chunk_off += this_write;
279 xdr_off += this_write;
280 xfer_len -= this_write;
281 write_len -= this_write;
282 }
283 }
284 /* Update the req with the number of chunks actually used */
285 svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no);
286
287 return rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
288}
289
290static int send_reply_chunks(struct svcxprt_rdma *xprt,
291 struct rpcrdma_msg *rdma_argp,
292 struct rpcrdma_msg *rdma_resp,
293 struct svc_rqst *rqstp,
294 struct ib_sge *sge,
295 int sge_count)
296{
297 u32 xfer_len = rqstp->rq_res.len;
298 int write_len;
299 int max_write;
300 u32 xdr_off;
301 int chunk_no;
302 int chunk_off;
303 struct rpcrdma_segment *ch;
304 struct rpcrdma_write_array *arg_ary;
305 struct rpcrdma_write_array *res_ary;
306 int ret;
307
308 arg_ary = svc_rdma_get_reply_array(rdma_argp);
309 if (!arg_ary)
310 return 0;
311 /* XXX: need to fix when reply lists occur with read-list and or
312 * write-list */
313 res_ary = (struct rpcrdma_write_array *)
314 &rdma_resp->rm_body.rm_chunks[2];
315
316 max_write = xprt->sc_max_sge * PAGE_SIZE;
317
318 /* xdr offset starts at RPC message */
319 for (xdr_off = 0, chunk_no = 0;
320 xfer_len && chunk_no < arg_ary->wc_nchunks;
321 chunk_no++) {
322 u64 rs_offset;
323 ch = &arg_ary->wc_array[chunk_no].wc_target;
324 write_len = min(xfer_len, ch->rs_length);
325
326
327 /* Prepare the reply chunk given the length actually
328 * written */
329 rs_offset = get_unaligned(&(ch->rs_offset));
330 svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
331 ch->rs_handle, rs_offset,
332 write_len);
333 chunk_off = 0;
334 while (write_len) {
335 int this_write;
336
337 this_write = min(write_len, max_write);
338 ret = send_write(xprt, rqstp,
339 ch->rs_handle,
340 rs_offset + chunk_off,
341 xdr_off,
342 this_write,
343 sge,
344 sge_count);
345 if (ret) {
346 dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
347 ret);
348 return -EIO;
349 }
350 chunk_off += this_write;
351 xdr_off += this_write;
352 xfer_len -= this_write;
353 write_len -= this_write;
354 }
355 }
356 /* Update the req with the number of chunks actually used */
357 svc_rdma_xdr_encode_reply_array(res_ary, chunk_no);
358
359 return rqstp->rq_res.len;
360}
361
362/* This function prepares the portion of the RPCRDMA message to be
363 * sent in the RDMA_SEND. This function is called after data sent via
364 * RDMA has already been transmitted. There are three cases:
365 * - The RPCRDMA header, RPC header, and payload are all sent in a
366 * single RDMA_SEND. This is the "inline" case.
367 * - The RPCRDMA header and some portion of the RPC header and data
368 * are sent via this RDMA_SEND and another portion of the data is
369 * sent via RDMA.
370 * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC
371 * header and data are all transmitted via RDMA.
372 * In all three cases, this function prepares the RPCRDMA header in
373 * sge[0], the 'type' parameter indicates the type to place in the
374 * RPCRDMA header, and the 'byte_count' field indicates how much of
375 * the XDR to include in this RDMA_SEND.
376 */
377static int send_reply(struct svcxprt_rdma *rdma,
378 struct svc_rqst *rqstp,
379 struct page *page,
380 struct rpcrdma_msg *rdma_resp,
381 struct svc_rdma_op_ctxt *ctxt,
382 int sge_count,
383 int byte_count)
384{
385 struct ib_send_wr send_wr;
386 int sge_no;
387 int sge_bytes;
388 int page_no;
389 int ret;
390
391 /* Prepare the context */
392 ctxt->pages[0] = page;
393 ctxt->count = 1;
394
395 /* Prepare the SGE for the RPCRDMA Header */
396 ctxt->sge[0].addr =
397 ib_dma_map_page(rdma->sc_cm_id->device,
398 page, 0, PAGE_SIZE, DMA_TO_DEVICE);
399 ctxt->direction = DMA_TO_DEVICE;
400 ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
401 ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey;
402
403 /* Determine how many of our SGE are to be transmitted */
404 for (sge_no = 1; byte_count && sge_no < sge_count; sge_no++) {
405 sge_bytes = min((size_t)ctxt->sge[sge_no].length,
406 (size_t)byte_count);
407 byte_count -= sge_bytes;
408 }
409 BUG_ON(byte_count != 0);
410
411 /* Save all respages in the ctxt and remove them from the
412 * respages array. They are our pages until the I/O
413 * completes.
414 */
415 for (page_no = 0; page_no < rqstp->rq_resused; page_no++) {
416 ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
417 ctxt->count++;
418 rqstp->rq_respages[page_no] = NULL;
419 }
420
421 BUG_ON(sge_no > rdma->sc_max_sge);
422 memset(&send_wr, 0, sizeof send_wr);
423 ctxt->wr_op = IB_WR_SEND;
424 send_wr.wr_id = (unsigned long)ctxt;
425 send_wr.sg_list = ctxt->sge;
426 send_wr.num_sge = sge_no;
427 send_wr.opcode = IB_WR_SEND;
428 send_wr.send_flags = IB_SEND_SIGNALED;
429
430 ret = svc_rdma_send(rdma, &send_wr);
431 if (ret)
432 svc_rdma_put_context(ctxt, 1);
433
434 return ret;
435}
436
437void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
438{
439}
440
441/*
442 * Return the start of an xdr buffer.
443 */
444static void *xdr_start(struct xdr_buf *xdr)
445{
446 return xdr->head[0].iov_base -
447 (xdr->len -
448 xdr->page_len -
449 xdr->tail[0].iov_len -
450 xdr->head[0].iov_len);
451}
452
453int svc_rdma_sendto(struct svc_rqst *rqstp)
454{
455 struct svc_xprt *xprt = rqstp->rq_xprt;
456 struct svcxprt_rdma *rdma =
457 container_of(xprt, struct svcxprt_rdma, sc_xprt);
458 struct rpcrdma_msg *rdma_argp;
459 struct rpcrdma_msg *rdma_resp;
460 struct rpcrdma_write_array *reply_ary;
461 enum rpcrdma_proc reply_type;
462 int ret;
463 int inline_bytes;
464 struct ib_sge *sge;
465 int sge_count = 0;
466 struct page *res_page;
467 struct svc_rdma_op_ctxt *ctxt;
468
469 dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
470
471 /* Get the RDMA request header. */
472 rdma_argp = xdr_start(&rqstp->rq_arg);
473
474 /* Build an SGE for the XDR */
475 ctxt = svc_rdma_get_context(rdma);
476 ctxt->direction = DMA_TO_DEVICE;
477 sge = xdr_to_sge(rdma, &rqstp->rq_res, ctxt->sge, &sge_count);
478
479 inline_bytes = rqstp->rq_res.len;
480
481 /* Create the RDMA response header */
482 res_page = svc_rdma_get_page();
483 rdma_resp = page_address(res_page);
484 reply_ary = svc_rdma_get_reply_array(rdma_argp);
485 if (reply_ary)
486 reply_type = RDMA_NOMSG;
487 else
488 reply_type = RDMA_MSG;
489 svc_rdma_xdr_encode_reply_header(rdma, rdma_argp,
490 rdma_resp, reply_type);
491
492 /* Send any write-chunk data and build resp write-list */
493 ret = send_write_chunks(rdma, rdma_argp, rdma_resp,
494 rqstp, sge, sge_count);
495 if (ret < 0) {
496 printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n",
497 ret);
498 goto error;
499 }
500 inline_bytes -= ret;
501
502 /* Send any reply-list data and update resp reply-list */
503 ret = send_reply_chunks(rdma, rdma_argp, rdma_resp,
504 rqstp, sge, sge_count);
505 if (ret < 0) {
506 printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n",
507 ret);
508 goto error;
509 }
510 inline_bytes -= ret;
511
512 ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, sge_count,
513 inline_bytes);
514 dprintk("svcrdma: send_reply returns %d\n", ret);
515 return ret;
516 error:
517 svc_rdma_put_context(ctxt, 0);
518 put_page(res_page);
519 return ret;
520}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
new file mode 100644
index 000000000000..f09444c451bc
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -0,0 +1,1080 @@
1/*
2 * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * Author: Tom Tucker <tom@opengridcomputing.com>
40 */
41
42#include <linux/sunrpc/svc_xprt.h>
43#include <linux/sunrpc/debug.h>
44#include <linux/sunrpc/rpc_rdma.h>
45#include <linux/spinlock.h>
46#include <rdma/ib_verbs.h>
47#include <rdma/rdma_cm.h>
48#include <linux/sunrpc/svc_rdma.h>
49
50#define RPCDBG_FACILITY RPCDBG_SVCXPRT
51
52static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
53 struct sockaddr *sa, int salen,
54 int flags);
55static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt);
56static void svc_rdma_release_rqst(struct svc_rqst *);
57static void rdma_destroy_xprt(struct svcxprt_rdma *xprt);
58static void dto_tasklet_func(unsigned long data);
59static void svc_rdma_detach(struct svc_xprt *xprt);
60static void svc_rdma_free(struct svc_xprt *xprt);
61static int svc_rdma_has_wspace(struct svc_xprt *xprt);
62static void rq_cq_reap(struct svcxprt_rdma *xprt);
63static void sq_cq_reap(struct svcxprt_rdma *xprt);
64
65DECLARE_TASKLET(dto_tasklet, dto_tasklet_func, 0UL);
66static DEFINE_SPINLOCK(dto_lock);
67static LIST_HEAD(dto_xprt_q);
68
69static struct svc_xprt_ops svc_rdma_ops = {
70 .xpo_create = svc_rdma_create,
71 .xpo_recvfrom = svc_rdma_recvfrom,
72 .xpo_sendto = svc_rdma_sendto,
73 .xpo_release_rqst = svc_rdma_release_rqst,
74 .xpo_detach = svc_rdma_detach,
75 .xpo_free = svc_rdma_free,
76 .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr,
77 .xpo_has_wspace = svc_rdma_has_wspace,
78 .xpo_accept = svc_rdma_accept,
79};
80
81struct svc_xprt_class svc_rdma_class = {
82 .xcl_name = "rdma",
83 .xcl_owner = THIS_MODULE,
84 .xcl_ops = &svc_rdma_ops,
85 .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
86};
87
88static int rdma_bump_context_cache(struct svcxprt_rdma *xprt)
89{
90 int target;
91 int at_least_one = 0;
92 struct svc_rdma_op_ctxt *ctxt;
93
94 target = min(xprt->sc_ctxt_cnt + xprt->sc_ctxt_bump,
95 xprt->sc_ctxt_max);
96
97 spin_lock_bh(&xprt->sc_ctxt_lock);
98 while (xprt->sc_ctxt_cnt < target) {
99 xprt->sc_ctxt_cnt++;
100 spin_unlock_bh(&xprt->sc_ctxt_lock);
101
102 ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
103
104 spin_lock_bh(&xprt->sc_ctxt_lock);
105 if (ctxt) {
106 at_least_one = 1;
107 ctxt->next = xprt->sc_ctxt_head;
108 xprt->sc_ctxt_head = ctxt;
109 } else {
110 /* kmalloc failed...give up for now */
111 xprt->sc_ctxt_cnt--;
112 break;
113 }
114 }
115 spin_unlock_bh(&xprt->sc_ctxt_lock);
116 dprintk("svcrdma: sc_ctxt_max=%d, sc_ctxt_cnt=%d\n",
117 xprt->sc_ctxt_max, xprt->sc_ctxt_cnt);
118 return at_least_one;
119}
120
121struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
122{
123 struct svc_rdma_op_ctxt *ctxt;
124
125 while (1) {
126 spin_lock_bh(&xprt->sc_ctxt_lock);
127 if (unlikely(xprt->sc_ctxt_head == NULL)) {
128 /* Try to bump my cache. */
129 spin_unlock_bh(&xprt->sc_ctxt_lock);
130
131 if (rdma_bump_context_cache(xprt))
132 continue;
133
134 printk(KERN_INFO "svcrdma: sleeping waiting for "
135 "context memory on xprt=%p\n",
136 xprt);
137 schedule_timeout_uninterruptible(msecs_to_jiffies(500));
138 continue;
139 }
140 ctxt = xprt->sc_ctxt_head;
141 xprt->sc_ctxt_head = ctxt->next;
142 spin_unlock_bh(&xprt->sc_ctxt_lock);
143 ctxt->xprt = xprt;
144 INIT_LIST_HEAD(&ctxt->dto_q);
145 ctxt->count = 0;
146 break;
147 }
148 return ctxt;
149}
150
151void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
152{
153 struct svcxprt_rdma *xprt;
154 int i;
155
156 BUG_ON(!ctxt);
157 xprt = ctxt->xprt;
158 if (free_pages)
159 for (i = 0; i < ctxt->count; i++)
160 put_page(ctxt->pages[i]);
161
162 for (i = 0; i < ctxt->count; i++)
163 dma_unmap_single(xprt->sc_cm_id->device->dma_device,
164 ctxt->sge[i].addr,
165 ctxt->sge[i].length,
166 ctxt->direction);
167 spin_lock_bh(&xprt->sc_ctxt_lock);
168 ctxt->next = xprt->sc_ctxt_head;
169 xprt->sc_ctxt_head = ctxt;
170 spin_unlock_bh(&xprt->sc_ctxt_lock);
171}
172
173/* ib_cq event handler */
174static void cq_event_handler(struct ib_event *event, void *context)
175{
176 struct svc_xprt *xprt = context;
177 dprintk("svcrdma: received CQ event id=%d, context=%p\n",
178 event->event, context);
179 set_bit(XPT_CLOSE, &xprt->xpt_flags);
180}
181
182/* QP event handler */
183static void qp_event_handler(struct ib_event *event, void *context)
184{
185 struct svc_xprt *xprt = context;
186
187 switch (event->event) {
188 /* These are considered benign events */
189 case IB_EVENT_PATH_MIG:
190 case IB_EVENT_COMM_EST:
191 case IB_EVENT_SQ_DRAINED:
192 case IB_EVENT_QP_LAST_WQE_REACHED:
193 dprintk("svcrdma: QP event %d received for QP=%p\n",
194 event->event, event->element.qp);
195 break;
196 /* These are considered fatal events */
197 case IB_EVENT_PATH_MIG_ERR:
198 case IB_EVENT_QP_FATAL:
199 case IB_EVENT_QP_REQ_ERR:
200 case IB_EVENT_QP_ACCESS_ERR:
201 case IB_EVENT_DEVICE_FATAL:
202 default:
203 dprintk("svcrdma: QP ERROR event %d received for QP=%p, "
204 "closing transport\n",
205 event->event, event->element.qp);
206 set_bit(XPT_CLOSE, &xprt->xpt_flags);
207 break;
208 }
209}
210
211/*
212 * Data Transfer Operation Tasklet
213 *
214 * Walks a list of transports with I/O pending, removing entries as
215 * they are added to the server's I/O pending list. Two bits indicate
216 * if SQ, RQ, or both have I/O pending. The dto_lock is an irqsave
217 * spinlock that serializes access to the transport list with the RQ
218 * and SQ interrupt handlers.
219 */
220static void dto_tasklet_func(unsigned long data)
221{
222 struct svcxprt_rdma *xprt;
223 unsigned long flags;
224
225 spin_lock_irqsave(&dto_lock, flags);
226 while (!list_empty(&dto_xprt_q)) {
227 xprt = list_entry(dto_xprt_q.next,
228 struct svcxprt_rdma, sc_dto_q);
229 list_del_init(&xprt->sc_dto_q);
230 spin_unlock_irqrestore(&dto_lock, flags);
231
232 if (test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags)) {
233 ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP);
234 rq_cq_reap(xprt);
235 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
236 /*
237 * If data arrived before established event,
238 * don't enqueue. This defers RPC I/O until the
239 * RDMA connection is complete.
240 */
241 if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags))
242 svc_xprt_enqueue(&xprt->sc_xprt);
243 }
244
245 if (test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) {
246 ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
247 sq_cq_reap(xprt);
248 }
249
250 spin_lock_irqsave(&dto_lock, flags);
251 }
252 spin_unlock_irqrestore(&dto_lock, flags);
253}
254
255/*
256 * Receive Queue Completion Handler
257 *
258 * Since an RQ completion handler is called on interrupt context, we
259 * need to defer the handling of the I/O to a tasklet
260 */
261static void rq_comp_handler(struct ib_cq *cq, void *cq_context)
262{
263 struct svcxprt_rdma *xprt = cq_context;
264 unsigned long flags;
265
266 /*
267 * Set the bit regardless of whether or not it's on the list
268 * because it may be on the list already due to an SQ
269 * completion.
270 */
271 set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags);
272
273 /*
274 * If this transport is not already on the DTO transport queue,
275 * add it
276 */
277 spin_lock_irqsave(&dto_lock, flags);
278 if (list_empty(&xprt->sc_dto_q))
279 list_add_tail(&xprt->sc_dto_q, &dto_xprt_q);
280 spin_unlock_irqrestore(&dto_lock, flags);
281
282 /* Tasklet does all the work to avoid irqsave locks. */
283 tasklet_schedule(&dto_tasklet);
284}
285
286/*
287 * rq_cq_reap - Process the RQ CQ.
288 *
289 * Take all completing WC off the CQE and enqueue the associated DTO
290 * context on the dto_q for the transport.
291 */
292static void rq_cq_reap(struct svcxprt_rdma *xprt)
293{
294 int ret;
295 struct ib_wc wc;
296 struct svc_rdma_op_ctxt *ctxt = NULL;
297
298 atomic_inc(&rdma_stat_rq_poll);
299
300 spin_lock_bh(&xprt->sc_rq_dto_lock);
301 while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) {
302 ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
303 ctxt->wc_status = wc.status;
304 ctxt->byte_len = wc.byte_len;
305 if (wc.status != IB_WC_SUCCESS) {
306 /* Close the transport */
307 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
308 svc_rdma_put_context(ctxt, 1);
309 continue;
310 }
311 list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
312 }
313 spin_unlock_bh(&xprt->sc_rq_dto_lock);
314
315 if (ctxt)
316 atomic_inc(&rdma_stat_rq_prod);
317}
318
319/*
320 * Send Queue Completion Handler - potentially called on interrupt context.
321 */
322static void sq_cq_reap(struct svcxprt_rdma *xprt)
323{
324 struct svc_rdma_op_ctxt *ctxt = NULL;
325 struct ib_wc wc;
326 struct ib_cq *cq = xprt->sc_sq_cq;
327 int ret;
328
329 atomic_inc(&rdma_stat_sq_poll);
330 while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
331 ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
332 xprt = ctxt->xprt;
333
334 if (wc.status != IB_WC_SUCCESS)
335 /* Close the transport */
336 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
337
338 /* Decrement used SQ WR count */
339 atomic_dec(&xprt->sc_sq_count);
340 wake_up(&xprt->sc_send_wait);
341
342 switch (ctxt->wr_op) {
343 case IB_WR_SEND:
344 case IB_WR_RDMA_WRITE:
345 svc_rdma_put_context(ctxt, 1);
346 break;
347
348 case IB_WR_RDMA_READ:
349 if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
350 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
351 set_bit(RDMACTXT_F_READ_DONE, &ctxt->flags);
352 spin_lock_bh(&xprt->sc_read_complete_lock);
353 list_add_tail(&ctxt->dto_q,
354 &xprt->sc_read_complete_q);
355 spin_unlock_bh(&xprt->sc_read_complete_lock);
356 svc_xprt_enqueue(&xprt->sc_xprt);
357 }
358 break;
359
360 default:
361 printk(KERN_ERR "svcrdma: unexpected completion type, "
362 "opcode=%d, status=%d\n",
363 wc.opcode, wc.status);
364 break;
365 }
366 }
367
368 if (ctxt)
369 atomic_inc(&rdma_stat_sq_prod);
370}
371
372static void sq_comp_handler(struct ib_cq *cq, void *cq_context)
373{
374 struct svcxprt_rdma *xprt = cq_context;
375 unsigned long flags;
376
377 /*
378 * Set the bit regardless of whether or not it's on the list
379 * because it may be on the list already due to an RQ
380 * completion.
381 */
382 set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags);
383
384 /*
385 * If this transport is not already on the DTO transport queue,
386 * add it
387 */
388 spin_lock_irqsave(&dto_lock, flags);
389 if (list_empty(&xprt->sc_dto_q))
390 list_add_tail(&xprt->sc_dto_q, &dto_xprt_q);
391 spin_unlock_irqrestore(&dto_lock, flags);
392
393 /* Tasklet does all the work to avoid irqsave locks. */
394 tasklet_schedule(&dto_tasklet);
395}
396
397static void create_context_cache(struct svcxprt_rdma *xprt,
398 int ctxt_count, int ctxt_bump, int ctxt_max)
399{
400 struct svc_rdma_op_ctxt *ctxt;
401 int i;
402
403 xprt->sc_ctxt_max = ctxt_max;
404 xprt->sc_ctxt_bump = ctxt_bump;
405 xprt->sc_ctxt_cnt = 0;
406 xprt->sc_ctxt_head = NULL;
407 for (i = 0; i < ctxt_count; i++) {
408 ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
409 if (ctxt) {
410 ctxt->next = xprt->sc_ctxt_head;
411 xprt->sc_ctxt_head = ctxt;
412 xprt->sc_ctxt_cnt++;
413 }
414 }
415}
416
417static void destroy_context_cache(struct svc_rdma_op_ctxt *ctxt)
418{
419 struct svc_rdma_op_ctxt *next;
420 if (!ctxt)
421 return;
422
423 do {
424 next = ctxt->next;
425 kfree(ctxt);
426 ctxt = next;
427 } while (next);
428}
429
430static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
431 int listener)
432{
433 struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL);
434
435 if (!cma_xprt)
436 return NULL;
437 svc_xprt_init(&svc_rdma_class, &cma_xprt->sc_xprt, serv);
438 INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
439 INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
440 INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
441 INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
442 init_waitqueue_head(&cma_xprt->sc_send_wait);
443
444 spin_lock_init(&cma_xprt->sc_lock);
445 spin_lock_init(&cma_xprt->sc_read_complete_lock);
446 spin_lock_init(&cma_xprt->sc_ctxt_lock);
447 spin_lock_init(&cma_xprt->sc_rq_dto_lock);
448
449 cma_xprt->sc_ord = svcrdma_ord;
450
451 cma_xprt->sc_max_req_size = svcrdma_max_req_size;
452 cma_xprt->sc_max_requests = svcrdma_max_requests;
453 cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT;
454 atomic_set(&cma_xprt->sc_sq_count, 0);
455
456 if (!listener) {
457 int reqs = cma_xprt->sc_max_requests;
458 create_context_cache(cma_xprt,
459 reqs << 1, /* starting size */
460 reqs, /* bump amount */
461 reqs +
462 cma_xprt->sc_sq_depth +
463 RPCRDMA_MAX_THREADS + 1); /* max */
464 if (!cma_xprt->sc_ctxt_head) {
465 kfree(cma_xprt);
466 return NULL;
467 }
468 clear_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
469 } else
470 set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
471
472 return cma_xprt;
473}
474
475struct page *svc_rdma_get_page(void)
476{
477 struct page *page;
478
479 while ((page = alloc_page(GFP_KERNEL)) == NULL) {
480 /* If we can't get memory, wait a bit and try again */
481 printk(KERN_INFO "svcrdma: out of memory...retrying in 1000 "
482 "jiffies.\n");
483 schedule_timeout_uninterruptible(msecs_to_jiffies(1000));
484 }
485 return page;
486}
487
488int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
489{
490 struct ib_recv_wr recv_wr, *bad_recv_wr;
491 struct svc_rdma_op_ctxt *ctxt;
492 struct page *page;
493 unsigned long pa;
494 int sge_no;
495 int buflen;
496 int ret;
497
498 ctxt = svc_rdma_get_context(xprt);
499 buflen = 0;
500 ctxt->direction = DMA_FROM_DEVICE;
501 for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) {
502 BUG_ON(sge_no >= xprt->sc_max_sge);
503 page = svc_rdma_get_page();
504 ctxt->pages[sge_no] = page;
505 pa = ib_dma_map_page(xprt->sc_cm_id->device,
506 page, 0, PAGE_SIZE,
507 DMA_FROM_DEVICE);
508 ctxt->sge[sge_no].addr = pa;
509 ctxt->sge[sge_no].length = PAGE_SIZE;
510 ctxt->sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
511 buflen += PAGE_SIZE;
512 }
513 ctxt->count = sge_no;
514 recv_wr.next = NULL;
515 recv_wr.sg_list = &ctxt->sge[0];
516 recv_wr.num_sge = ctxt->count;
517 recv_wr.wr_id = (u64)(unsigned long)ctxt;
518
519 ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr);
520 return ret;
521}
522
523/*
524 * This function handles the CONNECT_REQUEST event on a listening
525 * endpoint. It is passed the cma_id for the _new_ connection. The context in
526 * this cma_id is inherited from the listening cma_id and is the svc_xprt
527 * structure for the listening endpoint.
528 *
529 * This function creates a new xprt for the new connection and enqueues it on
530 * the accept queue for the listent xprt. When the listen thread is kicked, it
531 * will call the recvfrom method on the listen xprt which will accept the new
532 * connection.
533 */
534static void handle_connect_req(struct rdma_cm_id *new_cma_id)
535{
536 struct svcxprt_rdma *listen_xprt = new_cma_id->context;
537 struct svcxprt_rdma *newxprt;
538
539 /* Create a new transport */
540 newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0);
541 if (!newxprt) {
542 dprintk("svcrdma: failed to create new transport\n");
543 return;
544 }
545 newxprt->sc_cm_id = new_cma_id;
546 new_cma_id->context = newxprt;
547 dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
548 newxprt, newxprt->sc_cm_id, listen_xprt);
549
550 /*
551 * Enqueue the new transport on the accept queue of the listening
552 * transport
553 */
554 spin_lock_bh(&listen_xprt->sc_lock);
555 list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q);
556 spin_unlock_bh(&listen_xprt->sc_lock);
557
558 /*
559 * Can't use svc_xprt_received here because we are not on a
560 * rqstp thread
561 */
562 set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags);
563 svc_xprt_enqueue(&listen_xprt->sc_xprt);
564}
565
566/*
567 * Handles events generated on the listening endpoint. These events will be
568 * either be incoming connect requests or adapter removal events.
569 */
570static int rdma_listen_handler(struct rdma_cm_id *cma_id,
571 struct rdma_cm_event *event)
572{
573 struct svcxprt_rdma *xprt = cma_id->context;
574 int ret = 0;
575
576 switch (event->event) {
577 case RDMA_CM_EVENT_CONNECT_REQUEST:
578 dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
579 "event=%d\n", cma_id, cma_id->context, event->event);
580 handle_connect_req(cma_id);
581 break;
582
583 case RDMA_CM_EVENT_ESTABLISHED:
584 /* Accept complete */
585 dprintk("svcrdma: Connection completed on LISTEN xprt=%p, "
586 "cm_id=%p\n", xprt, cma_id);
587 break;
588
589 case RDMA_CM_EVENT_DEVICE_REMOVAL:
590 dprintk("svcrdma: Device removal xprt=%p, cm_id=%p\n",
591 xprt, cma_id);
592 if (xprt)
593 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
594 break;
595
596 default:
597 dprintk("svcrdma: Unexpected event on listening endpoint %p, "
598 "event=%d\n", cma_id, event->event);
599 break;
600 }
601
602 return ret;
603}
604
605static int rdma_cma_handler(struct rdma_cm_id *cma_id,
606 struct rdma_cm_event *event)
607{
608 struct svc_xprt *xprt = cma_id->context;
609 struct svcxprt_rdma *rdma =
610 container_of(xprt, struct svcxprt_rdma, sc_xprt);
611 switch (event->event) {
612 case RDMA_CM_EVENT_ESTABLISHED:
613 /* Accept complete */
614 dprintk("svcrdma: Connection completed on DTO xprt=%p, "
615 "cm_id=%p\n", xprt, cma_id);
616 clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags);
617 svc_xprt_enqueue(xprt);
618 break;
619 case RDMA_CM_EVENT_DISCONNECTED:
620 dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n",
621 xprt, cma_id);
622 if (xprt) {
623 set_bit(XPT_CLOSE, &xprt->xpt_flags);
624 svc_xprt_enqueue(xprt);
625 }
626 break;
627 case RDMA_CM_EVENT_DEVICE_REMOVAL:
628 dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, "
629 "event=%d\n", cma_id, xprt, event->event);
630 if (xprt) {
631 set_bit(XPT_CLOSE, &xprt->xpt_flags);
632 svc_xprt_enqueue(xprt);
633 }
634 break;
635 default:
636 dprintk("svcrdma: Unexpected event on DTO endpoint %p, "
637 "event=%d\n", cma_id, event->event);
638 break;
639 }
640 return 0;
641}
642
643/*
644 * Create a listening RDMA service endpoint.
645 */
646static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
647 struct sockaddr *sa, int salen,
648 int flags)
649{
650 struct rdma_cm_id *listen_id;
651 struct svcxprt_rdma *cma_xprt;
652 struct svc_xprt *xprt;
653 int ret;
654
655 dprintk("svcrdma: Creating RDMA socket\n");
656
657 cma_xprt = rdma_create_xprt(serv, 1);
658 if (!cma_xprt)
659 return ERR_PTR(ENOMEM);
660 xprt = &cma_xprt->sc_xprt;
661
662 listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP);
663 if (IS_ERR(listen_id)) {
664 rdma_destroy_xprt(cma_xprt);
665 dprintk("svcrdma: rdma_create_id failed = %ld\n",
666 PTR_ERR(listen_id));
667 return (void *)listen_id;
668 }
669 ret = rdma_bind_addr(listen_id, sa);
670 if (ret) {
671 rdma_destroy_xprt(cma_xprt);
672 rdma_destroy_id(listen_id);
673 dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
674 return ERR_PTR(ret);
675 }
676 cma_xprt->sc_cm_id = listen_id;
677
678 ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG);
679 if (ret) {
680 rdma_destroy_id(listen_id);
681 rdma_destroy_xprt(cma_xprt);
682 dprintk("svcrdma: rdma_listen failed = %d\n", ret);
683 }
684
685 /*
686 * We need to use the address from the cm_id in case the
687 * caller specified 0 for the port number.
688 */
689 sa = (struct sockaddr *)&cma_xprt->sc_cm_id->route.addr.src_addr;
690 svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen);
691
692 return &cma_xprt->sc_xprt;
693}
694
695/*
696 * This is the xpo_recvfrom function for listening endpoints. Its
697 * purpose is to accept incoming connections. The CMA callback handler
698 * has already created a new transport and attached it to the new CMA
699 * ID.
700 *
701 * There is a queue of pending connections hung on the listening
702 * transport. This queue contains the new svc_xprt structure. This
703 * function takes svc_xprt structures off the accept_q and completes
704 * the connection.
705 */
706static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
707{
708 struct svcxprt_rdma *listen_rdma;
709 struct svcxprt_rdma *newxprt = NULL;
710 struct rdma_conn_param conn_param;
711 struct ib_qp_init_attr qp_attr;
712 struct ib_device_attr devattr;
713 struct sockaddr *sa;
714 int ret;
715 int i;
716
717 listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
718 clear_bit(XPT_CONN, &xprt->xpt_flags);
719 /* Get the next entry off the accept list */
720 spin_lock_bh(&listen_rdma->sc_lock);
721 if (!list_empty(&listen_rdma->sc_accept_q)) {
722 newxprt = list_entry(listen_rdma->sc_accept_q.next,
723 struct svcxprt_rdma, sc_accept_q);
724 list_del_init(&newxprt->sc_accept_q);
725 }
726 if (!list_empty(&listen_rdma->sc_accept_q))
727 set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags);
728 spin_unlock_bh(&listen_rdma->sc_lock);
729 if (!newxprt)
730 return NULL;
731
732 dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n",
733 newxprt, newxprt->sc_cm_id);
734
735 ret = ib_query_device(newxprt->sc_cm_id->device, &devattr);
736 if (ret) {
737 dprintk("svcrdma: could not query device attributes on "
738 "device %p, rc=%d\n", newxprt->sc_cm_id->device, ret);
739 goto errout;
740 }
741
742 /* Qualify the transport resource defaults with the
743 * capabilities of this particular device */
744 newxprt->sc_max_sge = min((size_t)devattr.max_sge,
745 (size_t)RPCSVC_MAXPAGES);
746 newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr,
747 (size_t)svcrdma_max_requests);
748 newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
749
750 newxprt->sc_ord = min((size_t)devattr.max_qp_rd_atom,
751 (size_t)svcrdma_ord);
752
753 newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device);
754 if (IS_ERR(newxprt->sc_pd)) {
755 dprintk("svcrdma: error creating PD for connect request\n");
756 goto errout;
757 }
758 newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device,
759 sq_comp_handler,
760 cq_event_handler,
761 newxprt,
762 newxprt->sc_sq_depth,
763 0);
764 if (IS_ERR(newxprt->sc_sq_cq)) {
765 dprintk("svcrdma: error creating SQ CQ for connect request\n");
766 goto errout;
767 }
768 newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device,
769 rq_comp_handler,
770 cq_event_handler,
771 newxprt,
772 newxprt->sc_max_requests,
773 0);
774 if (IS_ERR(newxprt->sc_rq_cq)) {
775 dprintk("svcrdma: error creating RQ CQ for connect request\n");
776 goto errout;
777 }
778
779 memset(&qp_attr, 0, sizeof qp_attr);
780 qp_attr.event_handler = qp_event_handler;
781 qp_attr.qp_context = &newxprt->sc_xprt;
782 qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
783 qp_attr.cap.max_recv_wr = newxprt->sc_max_requests;
784 qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
785 qp_attr.cap.max_recv_sge = newxprt->sc_max_sge;
786 qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
787 qp_attr.qp_type = IB_QPT_RC;
788 qp_attr.send_cq = newxprt->sc_sq_cq;
789 qp_attr.recv_cq = newxprt->sc_rq_cq;
790 dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n"
791 " cm_id->device=%p, sc_pd->device=%p\n"
792 " cap.max_send_wr = %d\n"
793 " cap.max_recv_wr = %d\n"
794 " cap.max_send_sge = %d\n"
795 " cap.max_recv_sge = %d\n",
796 newxprt->sc_cm_id, newxprt->sc_pd,
797 newxprt->sc_cm_id->device, newxprt->sc_pd->device,
798 qp_attr.cap.max_send_wr,
799 qp_attr.cap.max_recv_wr,
800 qp_attr.cap.max_send_sge,
801 qp_attr.cap.max_recv_sge);
802
803 ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
804 if (ret) {
805 /*
806 * XXX: This is a hack. We need a xx_request_qp interface
807 * that will adjust the qp_attr's with a best-effort
808 * number
809 */
810 qp_attr.cap.max_send_sge -= 2;
811 qp_attr.cap.max_recv_sge -= 2;
812 ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd,
813 &qp_attr);
814 if (ret) {
815 dprintk("svcrdma: failed to create QP, ret=%d\n", ret);
816 goto errout;
817 }
818 newxprt->sc_max_sge = qp_attr.cap.max_send_sge;
819 newxprt->sc_max_sge = qp_attr.cap.max_recv_sge;
820 newxprt->sc_sq_depth = qp_attr.cap.max_send_wr;
821 newxprt->sc_max_requests = qp_attr.cap.max_recv_wr;
822 }
823 newxprt->sc_qp = newxprt->sc_cm_id->qp;
824
825 /* Register all of physical memory */
826 newxprt->sc_phys_mr = ib_get_dma_mr(newxprt->sc_pd,
827 IB_ACCESS_LOCAL_WRITE |
828 IB_ACCESS_REMOTE_WRITE);
829 if (IS_ERR(newxprt->sc_phys_mr)) {
830 dprintk("svcrdma: Failed to create DMA MR ret=%d\n", ret);
831 goto errout;
832 }
833
834 /* Post receive buffers */
835 for (i = 0; i < newxprt->sc_max_requests; i++) {
836 ret = svc_rdma_post_recv(newxprt);
837 if (ret) {
838 dprintk("svcrdma: failure posting receive buffers\n");
839 goto errout;
840 }
841 }
842
843 /* Swap out the handler */
844 newxprt->sc_cm_id->event_handler = rdma_cma_handler;
845
846 /* Accept Connection */
847 set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
848 memset(&conn_param, 0, sizeof conn_param);
849 conn_param.responder_resources = 0;
850 conn_param.initiator_depth = newxprt->sc_ord;
851 ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
852 if (ret) {
853 dprintk("svcrdma: failed to accept new connection, ret=%d\n",
854 ret);
855 goto errout;
856 }
857
858 dprintk("svcrdma: new connection %p accepted with the following "
859 "attributes:\n"
860 " local_ip : %d.%d.%d.%d\n"
861 " local_port : %d\n"
862 " remote_ip : %d.%d.%d.%d\n"
863 " remote_port : %d\n"
864 " max_sge : %d\n"
865 " sq_depth : %d\n"
866 " max_requests : %d\n"
867 " ord : %d\n",
868 newxprt,
869 NIPQUAD(((struct sockaddr_in *)&newxprt->sc_cm_id->
870 route.addr.src_addr)->sin_addr.s_addr),
871 ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
872 route.addr.src_addr)->sin_port),
873 NIPQUAD(((struct sockaddr_in *)&newxprt->sc_cm_id->
874 route.addr.dst_addr)->sin_addr.s_addr),
875 ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
876 route.addr.dst_addr)->sin_port),
877 newxprt->sc_max_sge,
878 newxprt->sc_sq_depth,
879 newxprt->sc_max_requests,
880 newxprt->sc_ord);
881
882 /* Set the local and remote addresses in the transport */
883 sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
884 svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
885 sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
886 svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa));
887
888 ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP);
889 ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP);
890 return &newxprt->sc_xprt;
891
892 errout:
893 dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret);
894 rdma_destroy_id(newxprt->sc_cm_id);
895 rdma_destroy_xprt(newxprt);
896 return NULL;
897}
898
899/*
900 * Post an RQ WQE to the RQ when the rqst is being released. This
901 * effectively returns an RQ credit to the client. The rq_xprt_ctxt
902 * will be null if the request is deferred due to an RDMA_READ or the
903 * transport had no data ready (EAGAIN). Note that an RPC deferred in
904 * svc_process will still return the credit, this is because the data
905 * is copied and no longer consume a WQE/WC.
906 */
907static void svc_rdma_release_rqst(struct svc_rqst *rqstp)
908{
909 int err;
910 struct svcxprt_rdma *rdma =
911 container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt);
912 if (rqstp->rq_xprt_ctxt) {
913 BUG_ON(rqstp->rq_xprt_ctxt != rdma);
914 err = svc_rdma_post_recv(rdma);
915 if (err)
916 dprintk("svcrdma: failed to post an RQ WQE error=%d\n",
917 err);
918 }
919 rqstp->rq_xprt_ctxt = NULL;
920}
921
922/* Disable data ready events for this connection */
923static void svc_rdma_detach(struct svc_xprt *xprt)
924{
925 struct svcxprt_rdma *rdma =
926 container_of(xprt, struct svcxprt_rdma, sc_xprt);
927 unsigned long flags;
928
929 dprintk("svc: svc_rdma_detach(%p)\n", xprt);
930 /*
931 * Shutdown the connection. This will ensure we don't get any
932 * more events from the provider.
933 */
934 rdma_disconnect(rdma->sc_cm_id);
935 rdma_destroy_id(rdma->sc_cm_id);
936
937 /* We may already be on the DTO list */
938 spin_lock_irqsave(&dto_lock, flags);
939 if (!list_empty(&rdma->sc_dto_q))
940 list_del_init(&rdma->sc_dto_q);
941 spin_unlock_irqrestore(&dto_lock, flags);
942}
943
944static void svc_rdma_free(struct svc_xprt *xprt)
945{
946 struct svcxprt_rdma *rdma = (struct svcxprt_rdma *)xprt;
947 dprintk("svcrdma: svc_rdma_free(%p)\n", rdma);
948 rdma_destroy_xprt(rdma);
949 kfree(rdma);
950}
951
952static void rdma_destroy_xprt(struct svcxprt_rdma *xprt)
953{
954 if (xprt->sc_qp && !IS_ERR(xprt->sc_qp))
955 ib_destroy_qp(xprt->sc_qp);
956
957 if (xprt->sc_sq_cq && !IS_ERR(xprt->sc_sq_cq))
958 ib_destroy_cq(xprt->sc_sq_cq);
959
960 if (xprt->sc_rq_cq && !IS_ERR(xprt->sc_rq_cq))
961 ib_destroy_cq(xprt->sc_rq_cq);
962
963 if (xprt->sc_phys_mr && !IS_ERR(xprt->sc_phys_mr))
964 ib_dereg_mr(xprt->sc_phys_mr);
965
966 if (xprt->sc_pd && !IS_ERR(xprt->sc_pd))
967 ib_dealloc_pd(xprt->sc_pd);
968
969 destroy_context_cache(xprt->sc_ctxt_head);
970}
971
972static int svc_rdma_has_wspace(struct svc_xprt *xprt)
973{
974 struct svcxprt_rdma *rdma =
975 container_of(xprt, struct svcxprt_rdma, sc_xprt);
976
977 /*
978 * If there are fewer SQ WR available than required to send a
979 * simple response, return false.
980 */
981 if ((rdma->sc_sq_depth - atomic_read(&rdma->sc_sq_count) < 3))
982 return 0;
983
984 /*
985 * ...or there are already waiters on the SQ,
986 * return false.
987 */
988 if (waitqueue_active(&rdma->sc_send_wait))
989 return 0;
990
991 /* Otherwise return true. */
992 return 1;
993}
994
995int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
996{
997 struct ib_send_wr *bad_wr;
998 int ret;
999
1000 if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
1001 return 0;
1002
1003 BUG_ON(wr->send_flags != IB_SEND_SIGNALED);
1004 BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op !=
1005 wr->opcode);
1006 /* If the SQ is full, wait until an SQ entry is available */
1007 while (1) {
1008 spin_lock_bh(&xprt->sc_lock);
1009 if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) {
1010 spin_unlock_bh(&xprt->sc_lock);
1011 atomic_inc(&rdma_stat_sq_starve);
1012 /* See if we can reap some SQ WR */
1013 sq_cq_reap(xprt);
1014
1015 /* Wait until SQ WR available if SQ still full */
1016 wait_event(xprt->sc_send_wait,
1017 atomic_read(&xprt->sc_sq_count) <
1018 xprt->sc_sq_depth);
1019 continue;
1020 }
1021 /* Bumped used SQ WR count and post */
1022 ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
1023 if (!ret)
1024 atomic_inc(&xprt->sc_sq_count);
1025 else
1026 dprintk("svcrdma: failed to post SQ WR rc=%d, "
1027 "sc_sq_count=%d, sc_sq_depth=%d\n",
1028 ret, atomic_read(&xprt->sc_sq_count),
1029 xprt->sc_sq_depth);
1030 spin_unlock_bh(&xprt->sc_lock);
1031 break;
1032 }
1033 return ret;
1034}
1035
1036int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
1037 enum rpcrdma_errcode err)
1038{
1039 struct ib_send_wr err_wr;
1040 struct ib_sge sge;
1041 struct page *p;
1042 struct svc_rdma_op_ctxt *ctxt;
1043 u32 *va;
1044 int length;
1045 int ret;
1046
1047 p = svc_rdma_get_page();
1048 va = page_address(p);
1049
1050 /* XDR encode error */
1051 length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
1052
1053 /* Prepare SGE for local address */
1054 sge.addr = ib_dma_map_page(xprt->sc_cm_id->device,
1055 p, 0, PAGE_SIZE, DMA_FROM_DEVICE);
1056 sge.lkey = xprt->sc_phys_mr->lkey;
1057 sge.length = length;
1058
1059 ctxt = svc_rdma_get_context(xprt);
1060 ctxt->count = 1;
1061 ctxt->pages[0] = p;
1062
1063 /* Prepare SEND WR */
1064 memset(&err_wr, 0, sizeof err_wr);
1065 ctxt->wr_op = IB_WR_SEND;
1066 err_wr.wr_id = (unsigned long)ctxt;
1067 err_wr.sg_list = &sge;
1068 err_wr.num_sge = 1;
1069 err_wr.opcode = IB_WR_SEND;
1070 err_wr.send_flags = IB_SEND_SIGNALED;
1071
1072 /* Post It */
1073 ret = svc_rdma_send(xprt, &err_wr);
1074 if (ret) {
1075 dprintk("svcrdma: Error posting send = %d\n", ret);
1076 svc_rdma_put_context(ctxt, 1);
1077 }
1078
1079 return ret;
1080}