diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2008-02-01 22:31:28 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-02-01 22:31:28 -0500 |
commit | 63e9b66e29357dd12e8b1d3ebf7036e7591f81e3 (patch) | |
tree | 5aa6a70a8f4bbf306e2825a1e2fa2660c2c1c187 /net | |
parent | 687fcdf741e4a268c2c7bac8b3734de761bb9719 (diff) | |
parent | ea339d46b93c7b16e067a29aad1812f7a389815a (diff) |
Merge branch 'for-linus' of git://linux-nfs.org/~bfields/linux
* 'for-linus' of git://linux-nfs.org/~bfields/linux: (100 commits)
SUNRPC: RPC program information is stored in unsigned integers
SUNRPC: Move exported symbol definitions after function declaration part 2
NLM: tear down RPC clients in nlm_shutdown_hosts
SUNRPC: spin svc_rqst initialization to its own function
nfsd: more careful input validation in nfsctl write methods
lockd: minor log message fix
knfsd: don't bother mapping putrootfh enoent to eperm
rdma: makefile
rdma: ONCRPC RDMA protocol marshalling
rdma: SVCRDMA sendto
rdma: SVCRDMA recvfrom
rdma: SVCRDMA Core Transport Services
rdma: SVCRDMA Transport Module
rdma: SVCRDMA Header File
svc: Add svc_xprt_names service to replace svc_sock_names
knfsd: Support adding transports by writing portlist file
svc: Add svc API that queries for a transport instance
svc: Add /proc/sys/sunrpc/transport files
svc: Add transport hdr size for defer/revisit
svc: Move the xprt independent code to the svc_xprt.c file
...
Diffstat (limited to 'net')
-rw-r--r-- | net/sunrpc/Makefile | 3 | ||||
-rw-r--r-- | net/sunrpc/auth_gss/svcauth_gss.c | 93 | ||||
-rw-r--r-- | net/sunrpc/cache.c | 152 | ||||
-rw-r--r-- | net/sunrpc/stats.c | 7 | ||||
-rw-r--r-- | net/sunrpc/sunrpc_syms.c | 52 | ||||
-rw-r--r-- | net/sunrpc/svc.c | 90 | ||||
-rw-r--r-- | net/sunrpc/svc_xprt.c | 1055 | ||||
-rw-r--r-- | net/sunrpc/svcauth.c | 6 | ||||
-rw-r--r-- | net/sunrpc/svcauth_unix.c | 59 | ||||
-rw-r--r-- | net/sunrpc/svcsock.c | 1311 | ||||
-rw-r--r-- | net/sunrpc/sysctl.c | 31 | ||||
-rw-r--r-- | net/sunrpc/xdr.c | 8 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/Makefile | 5 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma.c | 266 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_marshal.c | 412 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 586 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_sendto.c | 520 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_transport.c | 1080 |
18 files changed, 4530 insertions, 1206 deletions
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index 5c69a725e530..92e1dbe50947 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile | |||
@@ -11,6 +11,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ | |||
11 | auth.o auth_null.o auth_unix.o \ | 11 | auth.o auth_null.o auth_unix.o \ |
12 | svc.o svcsock.o svcauth.o svcauth_unix.o \ | 12 | svc.o svcsock.o svcauth.o svcauth_unix.o \ |
13 | rpcb_clnt.o timer.o xdr.o \ | 13 | rpcb_clnt.o timer.o xdr.o \ |
14 | sunrpc_syms.o cache.o rpc_pipe.o | 14 | sunrpc_syms.o cache.o rpc_pipe.o \ |
15 | svc_xprt.o | ||
15 | sunrpc-$(CONFIG_PROC_FS) += stats.o | 16 | sunrpc-$(CONFIG_PROC_FS) += stats.o |
16 | sunrpc-$(CONFIG_SYSCTL) += sysctl.o | 17 | sunrpc-$(CONFIG_SYSCTL) += sysctl.o |
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 73940df6c460..481f984e9a22 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c | |||
@@ -224,38 +224,34 @@ static int rsi_parse(struct cache_detail *cd, | |||
224 | 224 | ||
225 | /* major/minor */ | 225 | /* major/minor */ |
226 | len = qword_get(&mesg, buf, mlen); | 226 | len = qword_get(&mesg, buf, mlen); |
227 | if (len < 0) | 227 | if (len <= 0) |
228 | goto out; | 228 | goto out; |
229 | if (len == 0) { | 229 | rsii.major_status = simple_strtoul(buf, &ep, 10); |
230 | if (*ep) | ||
231 | goto out; | ||
232 | len = qword_get(&mesg, buf, mlen); | ||
233 | if (len <= 0) | ||
234 | goto out; | ||
235 | rsii.minor_status = simple_strtoul(buf, &ep, 10); | ||
236 | if (*ep) | ||
230 | goto out; | 237 | goto out; |
231 | } else { | ||
232 | rsii.major_status = simple_strtoul(buf, &ep, 10); | ||
233 | if (*ep) | ||
234 | goto out; | ||
235 | len = qword_get(&mesg, buf, mlen); | ||
236 | if (len <= 0) | ||
237 | goto out; | ||
238 | rsii.minor_status = simple_strtoul(buf, &ep, 10); | ||
239 | if (*ep) | ||
240 | goto out; | ||
241 | 238 | ||
242 | /* out_handle */ | 239 | /* out_handle */ |
243 | len = qword_get(&mesg, buf, mlen); | 240 | len = qword_get(&mesg, buf, mlen); |
244 | if (len < 0) | 241 | if (len < 0) |
245 | goto out; | 242 | goto out; |
246 | status = -ENOMEM; | 243 | status = -ENOMEM; |
247 | if (dup_to_netobj(&rsii.out_handle, buf, len)) | 244 | if (dup_to_netobj(&rsii.out_handle, buf, len)) |
248 | goto out; | 245 | goto out; |
249 | 246 | ||
250 | /* out_token */ | 247 | /* out_token */ |
251 | len = qword_get(&mesg, buf, mlen); | 248 | len = qword_get(&mesg, buf, mlen); |
252 | status = -EINVAL; | 249 | status = -EINVAL; |
253 | if (len < 0) | 250 | if (len < 0) |
254 | goto out; | 251 | goto out; |
255 | status = -ENOMEM; | 252 | status = -ENOMEM; |
256 | if (dup_to_netobj(&rsii.out_token, buf, len)) | 253 | if (dup_to_netobj(&rsii.out_token, buf, len)) |
257 | goto out; | 254 | goto out; |
258 | } | ||
259 | rsii.h.expiry_time = expiry; | 255 | rsii.h.expiry_time = expiry; |
260 | rsip = rsi_update(&rsii, rsip); | 256 | rsip = rsi_update(&rsii, rsip); |
261 | status = 0; | 257 | status = 0; |
@@ -975,6 +971,7 @@ static int svcauth_gss_handle_init(struct svc_rqst *rqstp, | |||
975 | struct kvec *resv = &rqstp->rq_res.head[0]; | 971 | struct kvec *resv = &rqstp->rq_res.head[0]; |
976 | struct xdr_netobj tmpobj; | 972 | struct xdr_netobj tmpobj; |
977 | struct rsi *rsip, rsikey; | 973 | struct rsi *rsip, rsikey; |
974 | int ret; | ||
978 | 975 | ||
979 | /* Read the verifier; should be NULL: */ | 976 | /* Read the verifier; should be NULL: */ |
980 | *authp = rpc_autherr_badverf; | 977 | *authp = rpc_autherr_badverf; |
@@ -1014,23 +1011,27 @@ static int svcauth_gss_handle_init(struct svc_rqst *rqstp, | |||
1014 | /* No upcall result: */ | 1011 | /* No upcall result: */ |
1015 | return SVC_DROP; | 1012 | return SVC_DROP; |
1016 | case 0: | 1013 | case 0: |
1014 | ret = SVC_DROP; | ||
1017 | /* Got an answer to the upcall; use it: */ | 1015 | /* Got an answer to the upcall; use it: */ |
1018 | if (gss_write_init_verf(rqstp, rsip)) | 1016 | if (gss_write_init_verf(rqstp, rsip)) |
1019 | return SVC_DROP; | 1017 | goto out; |
1020 | if (resv->iov_len + 4 > PAGE_SIZE) | 1018 | if (resv->iov_len + 4 > PAGE_SIZE) |
1021 | return SVC_DROP; | 1019 | goto out; |
1022 | svc_putnl(resv, RPC_SUCCESS); | 1020 | svc_putnl(resv, RPC_SUCCESS); |
1023 | if (svc_safe_putnetobj(resv, &rsip->out_handle)) | 1021 | if (svc_safe_putnetobj(resv, &rsip->out_handle)) |
1024 | return SVC_DROP; | 1022 | goto out; |
1025 | if (resv->iov_len + 3 * 4 > PAGE_SIZE) | 1023 | if (resv->iov_len + 3 * 4 > PAGE_SIZE) |
1026 | return SVC_DROP; | 1024 | goto out; |
1027 | svc_putnl(resv, rsip->major_status); | 1025 | svc_putnl(resv, rsip->major_status); |
1028 | svc_putnl(resv, rsip->minor_status); | 1026 | svc_putnl(resv, rsip->minor_status); |
1029 | svc_putnl(resv, GSS_SEQ_WIN); | 1027 | svc_putnl(resv, GSS_SEQ_WIN); |
1030 | if (svc_safe_putnetobj(resv, &rsip->out_token)) | 1028 | if (svc_safe_putnetobj(resv, &rsip->out_token)) |
1031 | return SVC_DROP; | 1029 | goto out; |
1032 | } | 1030 | } |
1033 | return SVC_COMPLETE; | 1031 | ret = SVC_COMPLETE; |
1032 | out: | ||
1033 | cache_put(&rsip->h, &rsi_cache); | ||
1034 | return ret; | ||
1034 | } | 1035 | } |
1035 | 1036 | ||
1036 | /* | 1037 | /* |
@@ -1125,6 +1126,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) | |||
1125 | case RPC_GSS_PROC_DESTROY: | 1126 | case RPC_GSS_PROC_DESTROY: |
1126 | if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) | 1127 | if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) |
1127 | goto auth_err; | 1128 | goto auth_err; |
1129 | rsci->h.expiry_time = get_seconds(); | ||
1128 | set_bit(CACHE_NEGATIVE, &rsci->h.flags); | 1130 | set_bit(CACHE_NEGATIVE, &rsci->h.flags); |
1129 | if (resv->iov_len + 4 > PAGE_SIZE) | 1131 | if (resv->iov_len + 4 > PAGE_SIZE) |
1130 | goto drop; | 1132 | goto drop; |
@@ -1386,19 +1388,26 @@ int | |||
1386 | gss_svc_init(void) | 1388 | gss_svc_init(void) |
1387 | { | 1389 | { |
1388 | int rv = svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss); | 1390 | int rv = svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss); |
1389 | if (rv == 0) { | 1391 | if (rv) |
1390 | cache_register(&rsc_cache); | 1392 | return rv; |
1391 | cache_register(&rsi_cache); | 1393 | rv = cache_register(&rsc_cache); |
1392 | } | 1394 | if (rv) |
1395 | goto out1; | ||
1396 | rv = cache_register(&rsi_cache); | ||
1397 | if (rv) | ||
1398 | goto out2; | ||
1399 | return 0; | ||
1400 | out2: | ||
1401 | cache_unregister(&rsc_cache); | ||
1402 | out1: | ||
1403 | svc_auth_unregister(RPC_AUTH_GSS); | ||
1393 | return rv; | 1404 | return rv; |
1394 | } | 1405 | } |
1395 | 1406 | ||
1396 | void | 1407 | void |
1397 | gss_svc_shutdown(void) | 1408 | gss_svc_shutdown(void) |
1398 | { | 1409 | { |
1399 | if (cache_unregister(&rsc_cache)) | 1410 | cache_unregister(&rsc_cache); |
1400 | printk(KERN_ERR "auth_rpcgss: failed to unregister rsc cache\n"); | 1411 | cache_unregister(&rsi_cache); |
1401 | if (cache_unregister(&rsi_cache)) | ||
1402 | printk(KERN_ERR "auth_rpcgss: failed to unregister rsi cache\n"); | ||
1403 | svc_auth_unregister(RPC_AUTH_GSS); | 1412 | svc_auth_unregister(RPC_AUTH_GSS); |
1404 | } | 1413 | } |
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 73f053d0cc7a..636c8e04e0be 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c | |||
@@ -245,6 +245,7 @@ int cache_check(struct cache_detail *detail, | |||
245 | cache_put(h, detail); | 245 | cache_put(h, detail); |
246 | return rv; | 246 | return rv; |
247 | } | 247 | } |
248 | EXPORT_SYMBOL(cache_check); | ||
248 | 249 | ||
249 | /* | 250 | /* |
250 | * caches need to be periodically cleaned. | 251 | * caches need to be periodically cleaned. |
@@ -290,44 +291,78 @@ static const struct file_operations cache_flush_operations; | |||
290 | static void do_cache_clean(struct work_struct *work); | 291 | static void do_cache_clean(struct work_struct *work); |
291 | static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean); | 292 | static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean); |
292 | 293 | ||
293 | void cache_register(struct cache_detail *cd) | 294 | static void remove_cache_proc_entries(struct cache_detail *cd) |
294 | { | 295 | { |
295 | cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc); | 296 | if (cd->proc_ent == NULL) |
296 | if (cd->proc_ent) { | 297 | return; |
297 | struct proc_dir_entry *p; | 298 | if (cd->flush_ent) |
298 | cd->proc_ent->owner = cd->owner; | 299 | remove_proc_entry("flush", cd->proc_ent); |
299 | cd->channel_ent = cd->content_ent = NULL; | 300 | if (cd->channel_ent) |
301 | remove_proc_entry("channel", cd->proc_ent); | ||
302 | if (cd->content_ent) | ||
303 | remove_proc_entry("content", cd->proc_ent); | ||
304 | cd->proc_ent = NULL; | ||
305 | remove_proc_entry(cd->name, proc_net_rpc); | ||
306 | } | ||
300 | 307 | ||
301 | p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR, | 308 | #ifdef CONFIG_PROC_FS |
302 | cd->proc_ent); | 309 | static int create_cache_proc_entries(struct cache_detail *cd) |
303 | cd->flush_ent = p; | 310 | { |
304 | if (p) { | 311 | struct proc_dir_entry *p; |
305 | p->proc_fops = &cache_flush_operations; | ||
306 | p->owner = cd->owner; | ||
307 | p->data = cd; | ||
308 | } | ||
309 | 312 | ||
310 | if (cd->cache_request || cd->cache_parse) { | 313 | cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc); |
311 | p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR, | 314 | if (cd->proc_ent == NULL) |
312 | cd->proc_ent); | 315 | goto out_nomem; |
313 | cd->channel_ent = p; | 316 | cd->proc_ent->owner = cd->owner; |
314 | if (p) { | 317 | cd->channel_ent = cd->content_ent = NULL; |
315 | p->proc_fops = &cache_file_operations; | 318 | |
316 | p->owner = cd->owner; | 319 | p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR, cd->proc_ent); |
317 | p->data = cd; | 320 | cd->flush_ent = p; |
318 | } | 321 | if (p == NULL) |
319 | } | 322 | goto out_nomem; |
320 | if (cd->cache_show) { | 323 | p->proc_fops = &cache_flush_operations; |
321 | p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR, | 324 | p->owner = cd->owner; |
322 | cd->proc_ent); | 325 | p->data = cd; |
323 | cd->content_ent = p; | 326 | |
324 | if (p) { | 327 | if (cd->cache_request || cd->cache_parse) { |
325 | p->proc_fops = &content_file_operations; | 328 | p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR, |
326 | p->owner = cd->owner; | 329 | cd->proc_ent); |
327 | p->data = cd; | 330 | cd->channel_ent = p; |
328 | } | 331 | if (p == NULL) |
329 | } | 332 | goto out_nomem; |
333 | p->proc_fops = &cache_file_operations; | ||
334 | p->owner = cd->owner; | ||
335 | p->data = cd; | ||
330 | } | 336 | } |
337 | if (cd->cache_show) { | ||
338 | p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR, | ||
339 | cd->proc_ent); | ||
340 | cd->content_ent = p; | ||
341 | if (p == NULL) | ||
342 | goto out_nomem; | ||
343 | p->proc_fops = &content_file_operations; | ||
344 | p->owner = cd->owner; | ||
345 | p->data = cd; | ||
346 | } | ||
347 | return 0; | ||
348 | out_nomem: | ||
349 | remove_cache_proc_entries(cd); | ||
350 | return -ENOMEM; | ||
351 | } | ||
352 | #else /* CONFIG_PROC_FS */ | ||
353 | static int create_cache_proc_entries(struct cache_detail *cd) | ||
354 | { | ||
355 | return 0; | ||
356 | } | ||
357 | #endif | ||
358 | |||
359 | int cache_register(struct cache_detail *cd) | ||
360 | { | ||
361 | int ret; | ||
362 | |||
363 | ret = create_cache_proc_entries(cd); | ||
364 | if (ret) | ||
365 | return ret; | ||
331 | rwlock_init(&cd->hash_lock); | 366 | rwlock_init(&cd->hash_lock); |
332 | INIT_LIST_HEAD(&cd->queue); | 367 | INIT_LIST_HEAD(&cd->queue); |
333 | spin_lock(&cache_list_lock); | 368 | spin_lock(&cache_list_lock); |
@@ -341,9 +376,11 @@ void cache_register(struct cache_detail *cd) | |||
341 | 376 | ||
342 | /* start the cleaning process */ | 377 | /* start the cleaning process */ |
343 | schedule_delayed_work(&cache_cleaner, 0); | 378 | schedule_delayed_work(&cache_cleaner, 0); |
379 | return 0; | ||
344 | } | 380 | } |
381 | EXPORT_SYMBOL(cache_register); | ||
345 | 382 | ||
346 | int cache_unregister(struct cache_detail *cd) | 383 | void cache_unregister(struct cache_detail *cd) |
347 | { | 384 | { |
348 | cache_purge(cd); | 385 | cache_purge(cd); |
349 | spin_lock(&cache_list_lock); | 386 | spin_lock(&cache_list_lock); |
@@ -351,30 +388,23 @@ int cache_unregister(struct cache_detail *cd) | |||
351 | if (cd->entries || atomic_read(&cd->inuse)) { | 388 | if (cd->entries || atomic_read(&cd->inuse)) { |
352 | write_unlock(&cd->hash_lock); | 389 | write_unlock(&cd->hash_lock); |
353 | spin_unlock(&cache_list_lock); | 390 | spin_unlock(&cache_list_lock); |
354 | return -EBUSY; | 391 | goto out; |
355 | } | 392 | } |
356 | if (current_detail == cd) | 393 | if (current_detail == cd) |
357 | current_detail = NULL; | 394 | current_detail = NULL; |
358 | list_del_init(&cd->others); | 395 | list_del_init(&cd->others); |
359 | write_unlock(&cd->hash_lock); | 396 | write_unlock(&cd->hash_lock); |
360 | spin_unlock(&cache_list_lock); | 397 | spin_unlock(&cache_list_lock); |
361 | if (cd->proc_ent) { | 398 | remove_cache_proc_entries(cd); |
362 | if (cd->flush_ent) | ||
363 | remove_proc_entry("flush", cd->proc_ent); | ||
364 | if (cd->channel_ent) | ||
365 | remove_proc_entry("channel", cd->proc_ent); | ||
366 | if (cd->content_ent) | ||
367 | remove_proc_entry("content", cd->proc_ent); | ||
368 | |||
369 | cd->proc_ent = NULL; | ||
370 | remove_proc_entry(cd->name, proc_net_rpc); | ||
371 | } | ||
372 | if (list_empty(&cache_list)) { | 399 | if (list_empty(&cache_list)) { |
373 | /* module must be being unloaded so it's safe to kill the worker */ | 400 | /* module must be being unloaded so it's safe to kill the worker */ |
374 | cancel_delayed_work_sync(&cache_cleaner); | 401 | cancel_delayed_work_sync(&cache_cleaner); |
375 | } | 402 | } |
376 | return 0; | 403 | return; |
404 | out: | ||
405 | printk(KERN_ERR "nfsd: failed to unregister %s cache\n", cd->name); | ||
377 | } | 406 | } |
407 | EXPORT_SYMBOL(cache_unregister); | ||
378 | 408 | ||
379 | /* clean cache tries to find something to clean | 409 | /* clean cache tries to find something to clean |
380 | * and cleans it. | 410 | * and cleans it. |
@@ -489,6 +519,7 @@ void cache_flush(void) | |||
489 | while (cache_clean() != -1) | 519 | while (cache_clean() != -1) |
490 | cond_resched(); | 520 | cond_resched(); |
491 | } | 521 | } |
522 | EXPORT_SYMBOL(cache_flush); | ||
492 | 523 | ||
493 | void cache_purge(struct cache_detail *detail) | 524 | void cache_purge(struct cache_detail *detail) |
494 | { | 525 | { |
@@ -497,7 +528,7 @@ void cache_purge(struct cache_detail *detail) | |||
497 | cache_flush(); | 528 | cache_flush(); |
498 | detail->flush_time = 1; | 529 | detail->flush_time = 1; |
499 | } | 530 | } |
500 | 531 | EXPORT_SYMBOL(cache_purge); | |
501 | 532 | ||
502 | 533 | ||
503 | /* | 534 | /* |
@@ -634,13 +665,13 @@ void cache_clean_deferred(void *owner) | |||
634 | /* | 665 | /* |
635 | * communicate with user-space | 666 | * communicate with user-space |
636 | * | 667 | * |
637 | * We have a magic /proc file - /proc/sunrpc/cache | 668 | * We have a magic /proc file - /proc/sunrpc/<cachename>/channel. |
638 | * On read, you get a full request, or block | 669 | * On read, you get a full request, or block. |
639 | * On write, an update request is processed | 670 | * On write, an update request is processed. |
640 | * Poll works if anything to read, and always allows write | 671 | * Poll works if anything to read, and always allows write. |
641 | * | 672 | * |
642 | * Implemented by linked list of requests. Each open file has | 673 | * Implemented by linked list of requests. Each open file has |
643 | * a ->private that also exists in this list. New request are added | 674 | * a ->private that also exists in this list. New requests are added |
644 | * to the end and may wake up any preceding readers. | 675 | * to the end and may wake up any preceding readers. |
645 | * New readers are added to the head. If, on read, an item is found with | 676 | * New readers are added to the head. If, on read, an item is found with |
646 | * CACHE_UPCALLING clear, we free it from the list. | 677 | * CACHE_UPCALLING clear, we free it from the list. |
@@ -963,6 +994,7 @@ void qword_add(char **bpp, int *lp, char *str) | |||
963 | *bpp = bp; | 994 | *bpp = bp; |
964 | *lp = len; | 995 | *lp = len; |
965 | } | 996 | } |
997 | EXPORT_SYMBOL(qword_add); | ||
966 | 998 | ||
967 | void qword_addhex(char **bpp, int *lp, char *buf, int blen) | 999 | void qword_addhex(char **bpp, int *lp, char *buf, int blen) |
968 | { | 1000 | { |
@@ -991,6 +1023,7 @@ void qword_addhex(char **bpp, int *lp, char *buf, int blen) | |||
991 | *bpp = bp; | 1023 | *bpp = bp; |
992 | *lp = len; | 1024 | *lp = len; |
993 | } | 1025 | } |
1026 | EXPORT_SYMBOL(qword_addhex); | ||
994 | 1027 | ||
995 | static void warn_no_listener(struct cache_detail *detail) | 1028 | static void warn_no_listener(struct cache_detail *detail) |
996 | { | 1029 | { |
@@ -1113,6 +1146,7 @@ int qword_get(char **bpp, char *dest, int bufsize) | |||
1113 | *dest = '\0'; | 1146 | *dest = '\0'; |
1114 | return len; | 1147 | return len; |
1115 | } | 1148 | } |
1149 | EXPORT_SYMBOL(qword_get); | ||
1116 | 1150 | ||
1117 | 1151 | ||
1118 | /* | 1152 | /* |
@@ -1244,18 +1278,18 @@ static ssize_t read_flush(struct file *file, char __user *buf, | |||
1244 | struct cache_detail *cd = PDE(file->f_path.dentry->d_inode)->data; | 1278 | struct cache_detail *cd = PDE(file->f_path.dentry->d_inode)->data; |
1245 | char tbuf[20]; | 1279 | char tbuf[20]; |
1246 | unsigned long p = *ppos; | 1280 | unsigned long p = *ppos; |
1247 | int len; | 1281 | size_t len; |
1248 | 1282 | ||
1249 | sprintf(tbuf, "%lu\n", cd->flush_time); | 1283 | sprintf(tbuf, "%lu\n", cd->flush_time); |
1250 | len = strlen(tbuf); | 1284 | len = strlen(tbuf); |
1251 | if (p >= len) | 1285 | if (p >= len) |
1252 | return 0; | 1286 | return 0; |
1253 | len -= p; | 1287 | len -= p; |
1254 | if (len > count) len = count; | 1288 | if (len > count) |
1289 | len = count; | ||
1255 | if (copy_to_user(buf, (void*)(tbuf+p), len)) | 1290 | if (copy_to_user(buf, (void*)(tbuf+p), len)) |
1256 | len = -EFAULT; | 1291 | return -EFAULT; |
1257 | else | 1292 | *ppos += len; |
1258 | *ppos += len; | ||
1259 | return len; | 1293 | return len; |
1260 | } | 1294 | } |
1261 | 1295 | ||
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 74df2d358e61..5a16875f5ac8 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c | |||
@@ -33,7 +33,7 @@ struct proc_dir_entry *proc_net_rpc = NULL; | |||
33 | static int rpc_proc_show(struct seq_file *seq, void *v) { | 33 | static int rpc_proc_show(struct seq_file *seq, void *v) { |
34 | const struct rpc_stat *statp = seq->private; | 34 | const struct rpc_stat *statp = seq->private; |
35 | const struct rpc_program *prog = statp->program; | 35 | const struct rpc_program *prog = statp->program; |
36 | int i, j; | 36 | unsigned int i, j; |
37 | 37 | ||
38 | seq_printf(seq, | 38 | seq_printf(seq, |
39 | "net %u %u %u %u\n", | 39 | "net %u %u %u %u\n", |
@@ -81,7 +81,7 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) { | |||
81 | const struct svc_program *prog = statp->program; | 81 | const struct svc_program *prog = statp->program; |
82 | const struct svc_procedure *proc; | 82 | const struct svc_procedure *proc; |
83 | const struct svc_version *vers; | 83 | const struct svc_version *vers; |
84 | int i, j; | 84 | unsigned int i, j; |
85 | 85 | ||
86 | seq_printf(seq, | 86 | seq_printf(seq, |
87 | "net %u %u %u %u\n", | 87 | "net %u %u %u %u\n", |
@@ -106,6 +106,7 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) { | |||
106 | seq_putc(seq, '\n'); | 106 | seq_putc(seq, '\n'); |
107 | } | 107 | } |
108 | } | 108 | } |
109 | EXPORT_SYMBOL(svc_seq_show); | ||
109 | 110 | ||
110 | /** | 111 | /** |
111 | * rpc_alloc_iostats - allocate an rpc_iostats structure | 112 | * rpc_alloc_iostats - allocate an rpc_iostats structure |
@@ -255,12 +256,14 @@ svc_proc_register(struct svc_stat *statp, const struct file_operations *fops) | |||
255 | { | 256 | { |
256 | return do_register(statp->program->pg_name, statp, fops); | 257 | return do_register(statp->program->pg_name, statp, fops); |
257 | } | 258 | } |
259 | EXPORT_SYMBOL(svc_proc_register); | ||
258 | 260 | ||
259 | void | 261 | void |
260 | svc_proc_unregister(const char *name) | 262 | svc_proc_unregister(const char *name) |
261 | { | 263 | { |
262 | remove_proc_entry(name, proc_net_rpc); | 264 | remove_proc_entry(name, proc_net_rpc); |
263 | } | 265 | } |
266 | EXPORT_SYMBOL(svc_proc_unregister); | ||
264 | 267 | ||
265 | void | 268 | void |
266 | rpc_proc_init(void) | 269 | rpc_proc_init(void) |
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 1a7e309d008b..843629f55763 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c | |||
@@ -22,48 +22,6 @@ | |||
22 | #include <linux/sunrpc/rpc_pipe_fs.h> | 22 | #include <linux/sunrpc/rpc_pipe_fs.h> |
23 | #include <linux/sunrpc/xprtsock.h> | 23 | #include <linux/sunrpc/xprtsock.h> |
24 | 24 | ||
25 | /* RPC server stuff */ | ||
26 | EXPORT_SYMBOL(svc_create); | ||
27 | EXPORT_SYMBOL(svc_create_thread); | ||
28 | EXPORT_SYMBOL(svc_create_pooled); | ||
29 | EXPORT_SYMBOL(svc_set_num_threads); | ||
30 | EXPORT_SYMBOL(svc_exit_thread); | ||
31 | EXPORT_SYMBOL(svc_destroy); | ||
32 | EXPORT_SYMBOL(svc_drop); | ||
33 | EXPORT_SYMBOL(svc_process); | ||
34 | EXPORT_SYMBOL(svc_recv); | ||
35 | EXPORT_SYMBOL(svc_wake_up); | ||
36 | EXPORT_SYMBOL(svc_makesock); | ||
37 | EXPORT_SYMBOL(svc_reserve); | ||
38 | EXPORT_SYMBOL(svc_auth_register); | ||
39 | EXPORT_SYMBOL(auth_domain_lookup); | ||
40 | EXPORT_SYMBOL(svc_authenticate); | ||
41 | EXPORT_SYMBOL(svc_set_client); | ||
42 | |||
43 | /* RPC statistics */ | ||
44 | #ifdef CONFIG_PROC_FS | ||
45 | EXPORT_SYMBOL(svc_proc_register); | ||
46 | EXPORT_SYMBOL(svc_proc_unregister); | ||
47 | EXPORT_SYMBOL(svc_seq_show); | ||
48 | #endif | ||
49 | |||
50 | /* caching... */ | ||
51 | EXPORT_SYMBOL(auth_domain_find); | ||
52 | EXPORT_SYMBOL(auth_domain_put); | ||
53 | EXPORT_SYMBOL(auth_unix_add_addr); | ||
54 | EXPORT_SYMBOL(auth_unix_forget_old); | ||
55 | EXPORT_SYMBOL(auth_unix_lookup); | ||
56 | EXPORT_SYMBOL(cache_check); | ||
57 | EXPORT_SYMBOL(cache_flush); | ||
58 | EXPORT_SYMBOL(cache_purge); | ||
59 | EXPORT_SYMBOL(cache_register); | ||
60 | EXPORT_SYMBOL(cache_unregister); | ||
61 | EXPORT_SYMBOL(qword_add); | ||
62 | EXPORT_SYMBOL(qword_addhex); | ||
63 | EXPORT_SYMBOL(qword_get); | ||
64 | EXPORT_SYMBOL(svcauth_unix_purge); | ||
65 | EXPORT_SYMBOL(unix_domain_find); | ||
66 | |||
67 | extern struct cache_detail ip_map_cache, unix_gid_cache; | 25 | extern struct cache_detail ip_map_cache, unix_gid_cache; |
68 | 26 | ||
69 | static int __init | 27 | static int __init |
@@ -85,7 +43,8 @@ init_sunrpc(void) | |||
85 | #endif | 43 | #endif |
86 | cache_register(&ip_map_cache); | 44 | cache_register(&ip_map_cache); |
87 | cache_register(&unix_gid_cache); | 45 | cache_register(&unix_gid_cache); |
88 | init_socket_xprt(); | 46 | svc_init_xprt_sock(); /* svc sock transport */ |
47 | init_socket_xprt(); /* clnt sock transport */ | ||
89 | rpcauth_init_module(); | 48 | rpcauth_init_module(); |
90 | out: | 49 | out: |
91 | return err; | 50 | return err; |
@@ -96,12 +55,11 @@ cleanup_sunrpc(void) | |||
96 | { | 55 | { |
97 | rpcauth_remove_module(); | 56 | rpcauth_remove_module(); |
98 | cleanup_socket_xprt(); | 57 | cleanup_socket_xprt(); |
58 | svc_cleanup_xprt_sock(); | ||
99 | unregister_rpc_pipefs(); | 59 | unregister_rpc_pipefs(); |
100 | rpc_destroy_mempool(); | 60 | rpc_destroy_mempool(); |
101 | if (cache_unregister(&ip_map_cache)) | 61 | cache_unregister(&ip_map_cache); |
102 | printk(KERN_ERR "sunrpc: failed to unregister ip_map cache\n"); | 62 | cache_unregister(&unix_gid_cache); |
103 | if (cache_unregister(&unix_gid_cache)) | ||
104 | printk(KERN_ERR "sunrpc: failed to unregister unix_gid cache\n"); | ||
105 | #ifdef RPC_DEBUG | 63 | #ifdef RPC_DEBUG |
106 | rpc_unregister_sysctl(); | 64 | rpc_unregister_sysctl(); |
107 | #endif | 65 | #endif |
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 4ad5fbbb18b4..a290e1523297 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c | |||
@@ -364,7 +364,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, | |||
364 | void (*shutdown)(struct svc_serv *serv)) | 364 | void (*shutdown)(struct svc_serv *serv)) |
365 | { | 365 | { |
366 | struct svc_serv *serv; | 366 | struct svc_serv *serv; |
367 | int vers; | 367 | unsigned int vers; |
368 | unsigned int xdrsize; | 368 | unsigned int xdrsize; |
369 | unsigned int i; | 369 | unsigned int i; |
370 | 370 | ||
@@ -433,6 +433,7 @@ svc_create(struct svc_program *prog, unsigned int bufsize, | |||
433 | { | 433 | { |
434 | return __svc_create(prog, bufsize, /*npools*/1, shutdown); | 434 | return __svc_create(prog, bufsize, /*npools*/1, shutdown); |
435 | } | 435 | } |
436 | EXPORT_SYMBOL(svc_create); | ||
436 | 437 | ||
437 | struct svc_serv * | 438 | struct svc_serv * |
438 | svc_create_pooled(struct svc_program *prog, unsigned int bufsize, | 439 | svc_create_pooled(struct svc_program *prog, unsigned int bufsize, |
@@ -452,6 +453,7 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize, | |||
452 | 453 | ||
453 | return serv; | 454 | return serv; |
454 | } | 455 | } |
456 | EXPORT_SYMBOL(svc_create_pooled); | ||
455 | 457 | ||
456 | /* | 458 | /* |
457 | * Destroy an RPC service. Should be called with the BKL held | 459 | * Destroy an RPC service. Should be called with the BKL held |
@@ -459,9 +461,6 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize, | |||
459 | void | 461 | void |
460 | svc_destroy(struct svc_serv *serv) | 462 | svc_destroy(struct svc_serv *serv) |
461 | { | 463 | { |
462 | struct svc_sock *svsk; | ||
463 | struct svc_sock *tmp; | ||
464 | |||
465 | dprintk("svc: svc_destroy(%s, %d)\n", | 464 | dprintk("svc: svc_destroy(%s, %d)\n", |
466 | serv->sv_program->pg_name, | 465 | serv->sv_program->pg_name, |
467 | serv->sv_nrthreads); | 466 | serv->sv_nrthreads); |
@@ -476,14 +475,12 @@ svc_destroy(struct svc_serv *serv) | |||
476 | 475 | ||
477 | del_timer_sync(&serv->sv_temptimer); | 476 | del_timer_sync(&serv->sv_temptimer); |
478 | 477 | ||
479 | list_for_each_entry_safe(svsk, tmp, &serv->sv_tempsocks, sk_list) | 478 | svc_close_all(&serv->sv_tempsocks); |
480 | svc_force_close_socket(svsk); | ||
481 | 479 | ||
482 | if (serv->sv_shutdown) | 480 | if (serv->sv_shutdown) |
483 | serv->sv_shutdown(serv); | 481 | serv->sv_shutdown(serv); |
484 | 482 | ||
485 | list_for_each_entry_safe(svsk, tmp, &serv->sv_permsocks, sk_list) | 483 | svc_close_all(&serv->sv_permsocks); |
486 | svc_force_close_socket(svsk); | ||
487 | 484 | ||
488 | BUG_ON(!list_empty(&serv->sv_permsocks)); | 485 | BUG_ON(!list_empty(&serv->sv_permsocks)); |
489 | BUG_ON(!list_empty(&serv->sv_tempsocks)); | 486 | BUG_ON(!list_empty(&serv->sv_tempsocks)); |
@@ -498,6 +495,7 @@ svc_destroy(struct svc_serv *serv) | |||
498 | kfree(serv->sv_pools); | 495 | kfree(serv->sv_pools); |
499 | kfree(serv); | 496 | kfree(serv); |
500 | } | 497 | } |
498 | EXPORT_SYMBOL(svc_destroy); | ||
501 | 499 | ||
502 | /* | 500 | /* |
503 | * Allocate an RPC server's buffer space. | 501 | * Allocate an RPC server's buffer space. |
@@ -536,31 +534,17 @@ svc_release_buffer(struct svc_rqst *rqstp) | |||
536 | put_page(rqstp->rq_pages[i]); | 534 | put_page(rqstp->rq_pages[i]); |
537 | } | 535 | } |
538 | 536 | ||
539 | /* | 537 | struct svc_rqst * |
540 | * Create a thread in the given pool. Caller must hold BKL. | 538 | svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool) |
541 | * On a NUMA or SMP machine, with a multi-pool serv, the thread | ||
542 | * will be restricted to run on the cpus belonging to the pool. | ||
543 | */ | ||
544 | static int | ||
545 | __svc_create_thread(svc_thread_fn func, struct svc_serv *serv, | ||
546 | struct svc_pool *pool) | ||
547 | { | 539 | { |
548 | struct svc_rqst *rqstp; | 540 | struct svc_rqst *rqstp; |
549 | int error = -ENOMEM; | ||
550 | int have_oldmask = 0; | ||
551 | cpumask_t oldmask; | ||
552 | 541 | ||
553 | rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL); | 542 | rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL); |
554 | if (!rqstp) | 543 | if (!rqstp) |
555 | goto out; | 544 | goto out_enomem; |
556 | 545 | ||
557 | init_waitqueue_head(&rqstp->rq_wait); | 546 | init_waitqueue_head(&rqstp->rq_wait); |
558 | 547 | ||
559 | if (!(rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL)) | ||
560 | || !(rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL)) | ||
561 | || !svc_init_buffer(rqstp, serv->sv_max_mesg)) | ||
562 | goto out_thread; | ||
563 | |||
564 | serv->sv_nrthreads++; | 548 | serv->sv_nrthreads++; |
565 | spin_lock_bh(&pool->sp_lock); | 549 | spin_lock_bh(&pool->sp_lock); |
566 | pool->sp_nrthreads++; | 550 | pool->sp_nrthreads++; |
@@ -569,6 +553,45 @@ __svc_create_thread(svc_thread_fn func, struct svc_serv *serv, | |||
569 | rqstp->rq_server = serv; | 553 | rqstp->rq_server = serv; |
570 | rqstp->rq_pool = pool; | 554 | rqstp->rq_pool = pool; |
571 | 555 | ||
556 | rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL); | ||
557 | if (!rqstp->rq_argp) | ||
558 | goto out_thread; | ||
559 | |||
560 | rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL); | ||
561 | if (!rqstp->rq_resp) | ||
562 | goto out_thread; | ||
563 | |||
564 | if (!svc_init_buffer(rqstp, serv->sv_max_mesg)) | ||
565 | goto out_thread; | ||
566 | |||
567 | return rqstp; | ||
568 | out_thread: | ||
569 | svc_exit_thread(rqstp); | ||
570 | out_enomem: | ||
571 | return ERR_PTR(-ENOMEM); | ||
572 | } | ||
573 | EXPORT_SYMBOL(svc_prepare_thread); | ||
574 | |||
575 | /* | ||
576 | * Create a thread in the given pool. Caller must hold BKL. | ||
577 | * On a NUMA or SMP machine, with a multi-pool serv, the thread | ||
578 | * will be restricted to run on the cpus belonging to the pool. | ||
579 | */ | ||
580 | static int | ||
581 | __svc_create_thread(svc_thread_fn func, struct svc_serv *serv, | ||
582 | struct svc_pool *pool) | ||
583 | { | ||
584 | struct svc_rqst *rqstp; | ||
585 | int error = -ENOMEM; | ||
586 | int have_oldmask = 0; | ||
587 | cpumask_t oldmask; | ||
588 | |||
589 | rqstp = svc_prepare_thread(serv, pool); | ||
590 | if (IS_ERR(rqstp)) { | ||
591 | error = PTR_ERR(rqstp); | ||
592 | goto out; | ||
593 | } | ||
594 | |||
572 | if (serv->sv_nrpools > 1) | 595 | if (serv->sv_nrpools > 1) |
573 | have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask); | 596 | have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask); |
574 | 597 | ||
@@ -597,6 +620,7 @@ svc_create_thread(svc_thread_fn func, struct svc_serv *serv) | |||
597 | { | 620 | { |
598 | return __svc_create_thread(func, serv, &serv->sv_pools[0]); | 621 | return __svc_create_thread(func, serv, &serv->sv_pools[0]); |
599 | } | 622 | } |
623 | EXPORT_SYMBOL(svc_create_thread); | ||
600 | 624 | ||
601 | /* | 625 | /* |
602 | * Choose a pool in which to create a new thread, for svc_set_num_threads | 626 | * Choose a pool in which to create a new thread, for svc_set_num_threads |
@@ -700,6 +724,7 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) | |||
700 | 724 | ||
701 | return error; | 725 | return error; |
702 | } | 726 | } |
727 | EXPORT_SYMBOL(svc_set_num_threads); | ||
703 | 728 | ||
704 | /* | 729 | /* |
705 | * Called from a server thread as it's exiting. Caller must hold BKL. | 730 | * Called from a server thread as it's exiting. Caller must hold BKL. |
@@ -726,6 +751,7 @@ svc_exit_thread(struct svc_rqst *rqstp) | |||
726 | if (serv) | 751 | if (serv) |
727 | svc_destroy(serv); | 752 | svc_destroy(serv); |
728 | } | 753 | } |
754 | EXPORT_SYMBOL(svc_exit_thread); | ||
729 | 755 | ||
730 | /* | 756 | /* |
731 | * Register an RPC service with the local portmapper. | 757 | * Register an RPC service with the local portmapper. |
@@ -737,7 +763,8 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port) | |||
737 | { | 763 | { |
738 | struct svc_program *progp; | 764 | struct svc_program *progp; |
739 | unsigned long flags; | 765 | unsigned long flags; |
740 | int i, error = 0, dummy; | 766 | unsigned int i; |
767 | int error = 0, dummy; | ||
741 | 768 | ||
742 | if (!port) | 769 | if (!port) |
743 | clear_thread_flag(TIF_SIGPENDING); | 770 | clear_thread_flag(TIF_SIGPENDING); |
@@ -840,9 +867,9 @@ svc_process(struct svc_rqst *rqstp) | |||
840 | rqstp->rq_res.tail[0].iov_len = 0; | 867 | rqstp->rq_res.tail[0].iov_len = 0; |
841 | /* Will be turned off only in gss privacy case: */ | 868 | /* Will be turned off only in gss privacy case: */ |
842 | rqstp->rq_splice_ok = 1; | 869 | rqstp->rq_splice_ok = 1; |
843 | /* tcp needs a space for the record length... */ | 870 | |
844 | if (rqstp->rq_prot == IPPROTO_TCP) | 871 | /* Setup reply header */ |
845 | svc_putnl(resv, 0); | 872 | rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp); |
846 | 873 | ||
847 | rqstp->rq_xid = svc_getu32(argv); | 874 | rqstp->rq_xid = svc_getu32(argv); |
848 | svc_putu32(resv, rqstp->rq_xid); | 875 | svc_putu32(resv, rqstp->rq_xid); |
@@ -1049,16 +1076,15 @@ err_bad: | |||
1049 | svc_putnl(resv, ntohl(rpc_stat)); | 1076 | svc_putnl(resv, ntohl(rpc_stat)); |
1050 | goto sendit; | 1077 | goto sendit; |
1051 | } | 1078 | } |
1079 | EXPORT_SYMBOL(svc_process); | ||
1052 | 1080 | ||
1053 | /* | 1081 | /* |
1054 | * Return (transport-specific) limit on the rpc payload. | 1082 | * Return (transport-specific) limit on the rpc payload. |
1055 | */ | 1083 | */ |
1056 | u32 svc_max_payload(const struct svc_rqst *rqstp) | 1084 | u32 svc_max_payload(const struct svc_rqst *rqstp) |
1057 | { | 1085 | { |
1058 | int max = RPCSVC_MAXPAYLOAD_TCP; | 1086 | u32 max = rqstp->rq_xprt->xpt_class->xcl_max_payload; |
1059 | 1087 | ||
1060 | if (rqstp->rq_sock->sk_sock->type == SOCK_DGRAM) | ||
1061 | max = RPCSVC_MAXPAYLOAD_UDP; | ||
1062 | if (rqstp->rq_server->sv_max_payload < max) | 1088 | if (rqstp->rq_server->sv_max_payload < max) |
1063 | max = rqstp->rq_server->sv_max_payload; | 1089 | max = rqstp->rq_server->sv_max_payload; |
1064 | return max; | 1090 | return max; |
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c new file mode 100644 index 000000000000..ea377e06afae --- /dev/null +++ b/net/sunrpc/svc_xprt.c | |||
@@ -0,0 +1,1055 @@ | |||
1 | /* | ||
2 | * linux/net/sunrpc/svc_xprt.c | ||
3 | * | ||
4 | * Author: Tom Tucker <tom@opengridcomputing.com> | ||
5 | */ | ||
6 | |||
7 | #include <linux/sched.h> | ||
8 | #include <linux/errno.h> | ||
9 | #include <linux/fcntl.h> | ||
10 | #include <linux/net.h> | ||
11 | #include <linux/in.h> | ||
12 | #include <linux/inet.h> | ||
13 | #include <linux/udp.h> | ||
14 | #include <linux/tcp.h> | ||
15 | #include <linux/unistd.h> | ||
16 | #include <linux/slab.h> | ||
17 | #include <linux/netdevice.h> | ||
18 | #include <linux/skbuff.h> | ||
19 | #include <linux/file.h> | ||
20 | #include <linux/freezer.h> | ||
21 | #include <net/sock.h> | ||
22 | #include <net/checksum.h> | ||
23 | #include <net/ip.h> | ||
24 | #include <net/ipv6.h> | ||
25 | #include <net/tcp_states.h> | ||
26 | #include <linux/uaccess.h> | ||
27 | #include <asm/ioctls.h> | ||
28 | |||
29 | #include <linux/sunrpc/types.h> | ||
30 | #include <linux/sunrpc/clnt.h> | ||
31 | #include <linux/sunrpc/xdr.h> | ||
32 | #include <linux/sunrpc/stats.h> | ||
33 | #include <linux/sunrpc/svc_xprt.h> | ||
34 | |||
35 | #define RPCDBG_FACILITY RPCDBG_SVCXPRT | ||
36 | |||
37 | static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt); | ||
38 | static int svc_deferred_recv(struct svc_rqst *rqstp); | ||
39 | static struct cache_deferred_req *svc_defer(struct cache_req *req); | ||
40 | static void svc_age_temp_xprts(unsigned long closure); | ||
41 | |||
42 | /* apparently the "standard" is that clients close | ||
43 | * idle connections after 5 minutes, servers after | ||
44 | * 6 minutes | ||
45 | * http://www.connectathon.org/talks96/nfstcp.pdf | ||
46 | */ | ||
47 | static int svc_conn_age_period = 6*60; | ||
48 | |||
49 | /* List of registered transport classes */ | ||
50 | static DEFINE_SPINLOCK(svc_xprt_class_lock); | ||
51 | static LIST_HEAD(svc_xprt_class_list); | ||
52 | |||
53 | /* SMP locking strategy: | ||
54 | * | ||
55 | * svc_pool->sp_lock protects most of the fields of that pool. | ||
56 | * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. | ||
57 | * when both need to be taken (rare), svc_serv->sv_lock is first. | ||
58 | * BKL protects svc_serv->sv_nrthreads. | ||
59 | * svc_sock->sk_lock protects the svc_sock->sk_deferred list | ||
60 | * and the ->sk_info_authunix cache. | ||
61 | * | ||
62 | * The XPT_BUSY bit in xprt->xpt_flags prevents a transport being | ||
63 | * enqueued multiply. During normal transport processing this bit | ||
64 | * is set by svc_xprt_enqueue and cleared by svc_xprt_received. | ||
65 | * Providers should not manipulate this bit directly. | ||
66 | * | ||
67 | * Some flags can be set to certain values at any time | ||
68 | * providing that certain rules are followed: | ||
69 | * | ||
70 | * XPT_CONN, XPT_DATA: | ||
71 | * - Can be set or cleared at any time. | ||
72 | * - After a set, svc_xprt_enqueue must be called to enqueue | ||
73 | * the transport for processing. | ||
74 | * - After a clear, the transport must be read/accepted. | ||
75 | * If this succeeds, it must be set again. | ||
76 | * XPT_CLOSE: | ||
77 | * - Can be set at any time. It is never cleared. | ||
78 | * XPT_DEAD: | ||
79 | * - Can only be set while XPT_BUSY is held which ensures | ||
80 | * that no other thread will be using the transport or will | ||
81 | * try to set XPT_DEAD. | ||
82 | */ | ||
83 | |||
84 | int svc_reg_xprt_class(struct svc_xprt_class *xcl) | ||
85 | { | ||
86 | struct svc_xprt_class *cl; | ||
87 | int res = -EEXIST; | ||
88 | |||
89 | dprintk("svc: Adding svc transport class '%s'\n", xcl->xcl_name); | ||
90 | |||
91 | INIT_LIST_HEAD(&xcl->xcl_list); | ||
92 | spin_lock(&svc_xprt_class_lock); | ||
93 | /* Make sure there isn't already a class with the same name */ | ||
94 | list_for_each_entry(cl, &svc_xprt_class_list, xcl_list) { | ||
95 | if (strcmp(xcl->xcl_name, cl->xcl_name) == 0) | ||
96 | goto out; | ||
97 | } | ||
98 | list_add_tail(&xcl->xcl_list, &svc_xprt_class_list); | ||
99 | res = 0; | ||
100 | out: | ||
101 | spin_unlock(&svc_xprt_class_lock); | ||
102 | return res; | ||
103 | } | ||
104 | EXPORT_SYMBOL_GPL(svc_reg_xprt_class); | ||
105 | |||
106 | void svc_unreg_xprt_class(struct svc_xprt_class *xcl) | ||
107 | { | ||
108 | dprintk("svc: Removing svc transport class '%s'\n", xcl->xcl_name); | ||
109 | spin_lock(&svc_xprt_class_lock); | ||
110 | list_del_init(&xcl->xcl_list); | ||
111 | spin_unlock(&svc_xprt_class_lock); | ||
112 | } | ||
113 | EXPORT_SYMBOL_GPL(svc_unreg_xprt_class); | ||
114 | |||
115 | /* | ||
116 | * Format the transport list for printing | ||
117 | */ | ||
118 | int svc_print_xprts(char *buf, int maxlen) | ||
119 | { | ||
120 | struct list_head *le; | ||
121 | char tmpstr[80]; | ||
122 | int len = 0; | ||
123 | buf[0] = '\0'; | ||
124 | |||
125 | spin_lock(&svc_xprt_class_lock); | ||
126 | list_for_each(le, &svc_xprt_class_list) { | ||
127 | int slen; | ||
128 | struct svc_xprt_class *xcl = | ||
129 | list_entry(le, struct svc_xprt_class, xcl_list); | ||
130 | |||
131 | sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload); | ||
132 | slen = strlen(tmpstr); | ||
133 | if (len + slen > maxlen) | ||
134 | break; | ||
135 | len += slen; | ||
136 | strcat(buf, tmpstr); | ||
137 | } | ||
138 | spin_unlock(&svc_xprt_class_lock); | ||
139 | |||
140 | return len; | ||
141 | } | ||
142 | |||
143 | static void svc_xprt_free(struct kref *kref) | ||
144 | { | ||
145 | struct svc_xprt *xprt = | ||
146 | container_of(kref, struct svc_xprt, xpt_ref); | ||
147 | struct module *owner = xprt->xpt_class->xcl_owner; | ||
148 | if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags) | ||
149 | && xprt->xpt_auth_cache != NULL) | ||
150 | svcauth_unix_info_release(xprt->xpt_auth_cache); | ||
151 | xprt->xpt_ops->xpo_free(xprt); | ||
152 | module_put(owner); | ||
153 | } | ||
154 | |||
155 | void svc_xprt_put(struct svc_xprt *xprt) | ||
156 | { | ||
157 | kref_put(&xprt->xpt_ref, svc_xprt_free); | ||
158 | } | ||
159 | EXPORT_SYMBOL_GPL(svc_xprt_put); | ||
160 | |||
161 | /* | ||
162 | * Called by transport drivers to initialize the transport independent | ||
163 | * portion of the transport instance. | ||
164 | */ | ||
165 | void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt, | ||
166 | struct svc_serv *serv) | ||
167 | { | ||
168 | memset(xprt, 0, sizeof(*xprt)); | ||
169 | xprt->xpt_class = xcl; | ||
170 | xprt->xpt_ops = xcl->xcl_ops; | ||
171 | kref_init(&xprt->xpt_ref); | ||
172 | xprt->xpt_server = serv; | ||
173 | INIT_LIST_HEAD(&xprt->xpt_list); | ||
174 | INIT_LIST_HEAD(&xprt->xpt_ready); | ||
175 | INIT_LIST_HEAD(&xprt->xpt_deferred); | ||
176 | mutex_init(&xprt->xpt_mutex); | ||
177 | spin_lock_init(&xprt->xpt_lock); | ||
178 | set_bit(XPT_BUSY, &xprt->xpt_flags); | ||
179 | } | ||
180 | EXPORT_SYMBOL_GPL(svc_xprt_init); | ||
181 | |||
182 | int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, | ||
183 | int flags) | ||
184 | { | ||
185 | struct svc_xprt_class *xcl; | ||
186 | struct sockaddr_in sin = { | ||
187 | .sin_family = AF_INET, | ||
188 | .sin_addr.s_addr = INADDR_ANY, | ||
189 | .sin_port = htons(port), | ||
190 | }; | ||
191 | dprintk("svc: creating transport %s[%d]\n", xprt_name, port); | ||
192 | spin_lock(&svc_xprt_class_lock); | ||
193 | list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) { | ||
194 | struct svc_xprt *newxprt; | ||
195 | |||
196 | if (strcmp(xprt_name, xcl->xcl_name)) | ||
197 | continue; | ||
198 | |||
199 | if (!try_module_get(xcl->xcl_owner)) | ||
200 | goto err; | ||
201 | |||
202 | spin_unlock(&svc_xprt_class_lock); | ||
203 | newxprt = xcl->xcl_ops-> | ||
204 | xpo_create(serv, (struct sockaddr *)&sin, sizeof(sin), | ||
205 | flags); | ||
206 | if (IS_ERR(newxprt)) { | ||
207 | module_put(xcl->xcl_owner); | ||
208 | return PTR_ERR(newxprt); | ||
209 | } | ||
210 | |||
211 | clear_bit(XPT_TEMP, &newxprt->xpt_flags); | ||
212 | spin_lock_bh(&serv->sv_lock); | ||
213 | list_add(&newxprt->xpt_list, &serv->sv_permsocks); | ||
214 | spin_unlock_bh(&serv->sv_lock); | ||
215 | clear_bit(XPT_BUSY, &newxprt->xpt_flags); | ||
216 | return svc_xprt_local_port(newxprt); | ||
217 | } | ||
218 | err: | ||
219 | spin_unlock(&svc_xprt_class_lock); | ||
220 | dprintk("svc: transport %s not found\n", xprt_name); | ||
221 | return -ENOENT; | ||
222 | } | ||
223 | EXPORT_SYMBOL_GPL(svc_create_xprt); | ||
224 | |||
225 | /* | ||
226 | * Copy the local and remote xprt addresses to the rqstp structure | ||
227 | */ | ||
228 | void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt) | ||
229 | { | ||
230 | struct sockaddr *sin; | ||
231 | |||
232 | memcpy(&rqstp->rq_addr, &xprt->xpt_remote, xprt->xpt_remotelen); | ||
233 | rqstp->rq_addrlen = xprt->xpt_remotelen; | ||
234 | |||
235 | /* | ||
236 | * Destination address in request is needed for binding the | ||
237 | * source address in RPC replies/callbacks later. | ||
238 | */ | ||
239 | sin = (struct sockaddr *)&xprt->xpt_local; | ||
240 | switch (sin->sa_family) { | ||
241 | case AF_INET: | ||
242 | rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr; | ||
243 | break; | ||
244 | case AF_INET6: | ||
245 | rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr; | ||
246 | break; | ||
247 | } | ||
248 | } | ||
249 | EXPORT_SYMBOL_GPL(svc_xprt_copy_addrs); | ||
250 | |||
251 | /** | ||
252 | * svc_print_addr - Format rq_addr field for printing | ||
253 | * @rqstp: svc_rqst struct containing address to print | ||
254 | * @buf: target buffer for formatted address | ||
255 | * @len: length of target buffer | ||
256 | * | ||
257 | */ | ||
258 | char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len) | ||
259 | { | ||
260 | return __svc_print_addr(svc_addr(rqstp), buf, len); | ||
261 | } | ||
262 | EXPORT_SYMBOL_GPL(svc_print_addr); | ||
263 | |||
264 | /* | ||
265 | * Queue up an idle server thread. Must have pool->sp_lock held. | ||
266 | * Note: this is really a stack rather than a queue, so that we only | ||
267 | * use as many different threads as we need, and the rest don't pollute | ||
268 | * the cache. | ||
269 | */ | ||
270 | static void svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp) | ||
271 | { | ||
272 | list_add(&rqstp->rq_list, &pool->sp_threads); | ||
273 | } | ||
274 | |||
275 | /* | ||
276 | * Dequeue an nfsd thread. Must have pool->sp_lock held. | ||
277 | */ | ||
278 | static void svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp) | ||
279 | { | ||
280 | list_del(&rqstp->rq_list); | ||
281 | } | ||
282 | |||
283 | /* | ||
284 | * Queue up a transport with data pending. If there are idle nfsd | ||
285 | * processes, wake 'em up. | ||
286 | * | ||
287 | */ | ||
288 | void svc_xprt_enqueue(struct svc_xprt *xprt) | ||
289 | { | ||
290 | struct svc_serv *serv = xprt->xpt_server; | ||
291 | struct svc_pool *pool; | ||
292 | struct svc_rqst *rqstp; | ||
293 | int cpu; | ||
294 | |||
295 | if (!(xprt->xpt_flags & | ||
296 | ((1<<XPT_CONN)|(1<<XPT_DATA)|(1<<XPT_CLOSE)|(1<<XPT_DEFERRED)))) | ||
297 | return; | ||
298 | if (test_bit(XPT_DEAD, &xprt->xpt_flags)) | ||
299 | return; | ||
300 | |||
301 | cpu = get_cpu(); | ||
302 | pool = svc_pool_for_cpu(xprt->xpt_server, cpu); | ||
303 | put_cpu(); | ||
304 | |||
305 | spin_lock_bh(&pool->sp_lock); | ||
306 | |||
307 | if (!list_empty(&pool->sp_threads) && | ||
308 | !list_empty(&pool->sp_sockets)) | ||
309 | printk(KERN_ERR | ||
310 | "svc_xprt_enqueue: " | ||
311 | "threads and transports both waiting??\n"); | ||
312 | |||
313 | if (test_bit(XPT_DEAD, &xprt->xpt_flags)) { | ||
314 | /* Don't enqueue dead transports */ | ||
315 | dprintk("svc: transport %p is dead, not enqueued\n", xprt); | ||
316 | goto out_unlock; | ||
317 | } | ||
318 | |||
319 | /* Mark transport as busy. It will remain in this state until | ||
320 | * the provider calls svc_xprt_received. We update XPT_BUSY | ||
321 | * atomically because it also guards against trying to enqueue | ||
322 | * the transport twice. | ||
323 | */ | ||
324 | if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) { | ||
325 | /* Don't enqueue transport while already enqueued */ | ||
326 | dprintk("svc: transport %p busy, not enqueued\n", xprt); | ||
327 | goto out_unlock; | ||
328 | } | ||
329 | BUG_ON(xprt->xpt_pool != NULL); | ||
330 | xprt->xpt_pool = pool; | ||
331 | |||
332 | /* Handle pending connection */ | ||
333 | if (test_bit(XPT_CONN, &xprt->xpt_flags)) | ||
334 | goto process; | ||
335 | |||
336 | /* Handle close in-progress */ | ||
337 | if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) | ||
338 | goto process; | ||
339 | |||
340 | /* Check if we have space to reply to a request */ | ||
341 | if (!xprt->xpt_ops->xpo_has_wspace(xprt)) { | ||
342 | /* Don't enqueue while not enough space for reply */ | ||
343 | dprintk("svc: no write space, transport %p not enqueued\n", | ||
344 | xprt); | ||
345 | xprt->xpt_pool = NULL; | ||
346 | clear_bit(XPT_BUSY, &xprt->xpt_flags); | ||
347 | goto out_unlock; | ||
348 | } | ||
349 | |||
350 | process: | ||
351 | if (!list_empty(&pool->sp_threads)) { | ||
352 | rqstp = list_entry(pool->sp_threads.next, | ||
353 | struct svc_rqst, | ||
354 | rq_list); | ||
355 | dprintk("svc: transport %p served by daemon %p\n", | ||
356 | xprt, rqstp); | ||
357 | svc_thread_dequeue(pool, rqstp); | ||
358 | if (rqstp->rq_xprt) | ||
359 | printk(KERN_ERR | ||
360 | "svc_xprt_enqueue: server %p, rq_xprt=%p!\n", | ||
361 | rqstp, rqstp->rq_xprt); | ||
362 | rqstp->rq_xprt = xprt; | ||
363 | svc_xprt_get(xprt); | ||
364 | rqstp->rq_reserved = serv->sv_max_mesg; | ||
365 | atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); | ||
366 | BUG_ON(xprt->xpt_pool != pool); | ||
367 | wake_up(&rqstp->rq_wait); | ||
368 | } else { | ||
369 | dprintk("svc: transport %p put into queue\n", xprt); | ||
370 | list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); | ||
371 | BUG_ON(xprt->xpt_pool != pool); | ||
372 | } | ||
373 | |||
374 | out_unlock: | ||
375 | spin_unlock_bh(&pool->sp_lock); | ||
376 | } | ||
377 | EXPORT_SYMBOL_GPL(svc_xprt_enqueue); | ||
378 | |||
379 | /* | ||
380 | * Dequeue the first transport. Must be called with the pool->sp_lock held. | ||
381 | */ | ||
382 | static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool) | ||
383 | { | ||
384 | struct svc_xprt *xprt; | ||
385 | |||
386 | if (list_empty(&pool->sp_sockets)) | ||
387 | return NULL; | ||
388 | |||
389 | xprt = list_entry(pool->sp_sockets.next, | ||
390 | struct svc_xprt, xpt_ready); | ||
391 | list_del_init(&xprt->xpt_ready); | ||
392 | |||
393 | dprintk("svc: transport %p dequeued, inuse=%d\n", | ||
394 | xprt, atomic_read(&xprt->xpt_ref.refcount)); | ||
395 | |||
396 | return xprt; | ||
397 | } | ||
398 | |||
399 | /* | ||
400 | * svc_xprt_received conditionally queues the transport for processing | ||
401 | * by another thread. The caller must hold the XPT_BUSY bit and must | ||
402 | * not thereafter touch transport data. | ||
403 | * | ||
404 | * Note: XPT_DATA only gets cleared when a read-attempt finds no (or | ||
405 | * insufficient) data. | ||
406 | */ | ||
407 | void svc_xprt_received(struct svc_xprt *xprt) | ||
408 | { | ||
409 | BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags)); | ||
410 | xprt->xpt_pool = NULL; | ||
411 | clear_bit(XPT_BUSY, &xprt->xpt_flags); | ||
412 | svc_xprt_enqueue(xprt); | ||
413 | } | ||
414 | EXPORT_SYMBOL_GPL(svc_xprt_received); | ||
415 | |||
416 | /** | ||
417 | * svc_reserve - change the space reserved for the reply to a request. | ||
418 | * @rqstp: The request in question | ||
419 | * @space: new max space to reserve | ||
420 | * | ||
421 | * Each request reserves some space on the output queue of the transport | ||
422 | * to make sure the reply fits. This function reduces that reserved | ||
423 | * space to be the amount of space used already, plus @space. | ||
424 | * | ||
425 | */ | ||
426 | void svc_reserve(struct svc_rqst *rqstp, int space) | ||
427 | { | ||
428 | space += rqstp->rq_res.head[0].iov_len; | ||
429 | |||
430 | if (space < rqstp->rq_reserved) { | ||
431 | struct svc_xprt *xprt = rqstp->rq_xprt; | ||
432 | atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved); | ||
433 | rqstp->rq_reserved = space; | ||
434 | |||
435 | svc_xprt_enqueue(xprt); | ||
436 | } | ||
437 | } | ||
438 | EXPORT_SYMBOL(svc_reserve); | ||
439 | |||
440 | static void svc_xprt_release(struct svc_rqst *rqstp) | ||
441 | { | ||
442 | struct svc_xprt *xprt = rqstp->rq_xprt; | ||
443 | |||
444 | rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp); | ||
445 | |||
446 | svc_free_res_pages(rqstp); | ||
447 | rqstp->rq_res.page_len = 0; | ||
448 | rqstp->rq_res.page_base = 0; | ||
449 | |||
450 | /* Reset response buffer and release | ||
451 | * the reservation. | ||
452 | * But first, check that enough space was reserved | ||
453 | * for the reply, otherwise we have a bug! | ||
454 | */ | ||
455 | if ((rqstp->rq_res.len) > rqstp->rq_reserved) | ||
456 | printk(KERN_ERR "RPC request reserved %d but used %d\n", | ||
457 | rqstp->rq_reserved, | ||
458 | rqstp->rq_res.len); | ||
459 | |||
460 | rqstp->rq_res.head[0].iov_len = 0; | ||
461 | svc_reserve(rqstp, 0); | ||
462 | rqstp->rq_xprt = NULL; | ||
463 | |||
464 | svc_xprt_put(xprt); | ||
465 | } | ||
466 | |||
467 | /* | ||
468 | * External function to wake up a server waiting for data | ||
469 | * This really only makes sense for services like lockd | ||
470 | * which have exactly one thread anyway. | ||
471 | */ | ||
472 | void svc_wake_up(struct svc_serv *serv) | ||
473 | { | ||
474 | struct svc_rqst *rqstp; | ||
475 | unsigned int i; | ||
476 | struct svc_pool *pool; | ||
477 | |||
478 | for (i = 0; i < serv->sv_nrpools; i++) { | ||
479 | pool = &serv->sv_pools[i]; | ||
480 | |||
481 | spin_lock_bh(&pool->sp_lock); | ||
482 | if (!list_empty(&pool->sp_threads)) { | ||
483 | rqstp = list_entry(pool->sp_threads.next, | ||
484 | struct svc_rqst, | ||
485 | rq_list); | ||
486 | dprintk("svc: daemon %p woken up.\n", rqstp); | ||
487 | /* | ||
488 | svc_thread_dequeue(pool, rqstp); | ||
489 | rqstp->rq_xprt = NULL; | ||
490 | */ | ||
491 | wake_up(&rqstp->rq_wait); | ||
492 | } | ||
493 | spin_unlock_bh(&pool->sp_lock); | ||
494 | } | ||
495 | } | ||
496 | EXPORT_SYMBOL(svc_wake_up); | ||
497 | |||
498 | int svc_port_is_privileged(struct sockaddr *sin) | ||
499 | { | ||
500 | switch (sin->sa_family) { | ||
501 | case AF_INET: | ||
502 | return ntohs(((struct sockaddr_in *)sin)->sin_port) | ||
503 | < PROT_SOCK; | ||
504 | case AF_INET6: | ||
505 | return ntohs(((struct sockaddr_in6 *)sin)->sin6_port) | ||
506 | < PROT_SOCK; | ||
507 | default: | ||
508 | return 0; | ||
509 | } | ||
510 | } | ||
511 | |||
512 | /* | ||
513 | * Make sure that we don't have too many active connections. If we | ||
514 | * have, something must be dropped. | ||
515 | * | ||
516 | * There's no point in trying to do random drop here for DoS | ||
517 | * prevention. An NFS client does 1 reconnect in 15 seconds. An | ||
518 | * attacker can easily beat that. | ||
519 | * | ||
520 | * The only somewhat efficient mechanism would be to drop old | ||
521 | * connections from the same IP first. But right now we don't even | ||
522 | * record the client IP in svc_sock. | ||
523 | */ | ||
524 | static void svc_check_conn_limits(struct svc_serv *serv) | ||
525 | { | ||
526 | if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { | ||
527 | struct svc_xprt *xprt = NULL; | ||
528 | spin_lock_bh(&serv->sv_lock); | ||
529 | if (!list_empty(&serv->sv_tempsocks)) { | ||
530 | if (net_ratelimit()) { | ||
531 | /* Try to help the admin */ | ||
532 | printk(KERN_NOTICE "%s: too many open " | ||
533 | "connections, consider increasing the " | ||
534 | "number of nfsd threads\n", | ||
535 | serv->sv_name); | ||
536 | } | ||
537 | /* | ||
538 | * Always select the oldest connection. It's not fair, | ||
539 | * but so is life | ||
540 | */ | ||
541 | xprt = list_entry(serv->sv_tempsocks.prev, | ||
542 | struct svc_xprt, | ||
543 | xpt_list); | ||
544 | set_bit(XPT_CLOSE, &xprt->xpt_flags); | ||
545 | svc_xprt_get(xprt); | ||
546 | } | ||
547 | spin_unlock_bh(&serv->sv_lock); | ||
548 | |||
549 | if (xprt) { | ||
550 | svc_xprt_enqueue(xprt); | ||
551 | svc_xprt_put(xprt); | ||
552 | } | ||
553 | } | ||
554 | } | ||
555 | |||
556 | /* | ||
557 | * Receive the next request on any transport. This code is carefully | ||
558 | * organised not to touch any cachelines in the shared svc_serv | ||
559 | * structure, only cachelines in the local svc_pool. | ||
560 | */ | ||
561 | int svc_recv(struct svc_rqst *rqstp, long timeout) | ||
562 | { | ||
563 | struct svc_xprt *xprt = NULL; | ||
564 | struct svc_serv *serv = rqstp->rq_server; | ||
565 | struct svc_pool *pool = rqstp->rq_pool; | ||
566 | int len, i; | ||
567 | int pages; | ||
568 | struct xdr_buf *arg; | ||
569 | DECLARE_WAITQUEUE(wait, current); | ||
570 | |||
571 | dprintk("svc: server %p waiting for data (to = %ld)\n", | ||
572 | rqstp, timeout); | ||
573 | |||
574 | if (rqstp->rq_xprt) | ||
575 | printk(KERN_ERR | ||
576 | "svc_recv: service %p, transport not NULL!\n", | ||
577 | rqstp); | ||
578 | if (waitqueue_active(&rqstp->rq_wait)) | ||
579 | printk(KERN_ERR | ||
580 | "svc_recv: service %p, wait queue active!\n", | ||
581 | rqstp); | ||
582 | |||
583 | /* now allocate needed pages. If we get a failure, sleep briefly */ | ||
584 | pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE; | ||
585 | for (i = 0; i < pages ; i++) | ||
586 | while (rqstp->rq_pages[i] == NULL) { | ||
587 | struct page *p = alloc_page(GFP_KERNEL); | ||
588 | if (!p) { | ||
589 | int j = msecs_to_jiffies(500); | ||
590 | schedule_timeout_uninterruptible(j); | ||
591 | } | ||
592 | rqstp->rq_pages[i] = p; | ||
593 | } | ||
594 | rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ | ||
595 | BUG_ON(pages >= RPCSVC_MAXPAGES); | ||
596 | |||
597 | /* Make arg->head point to first page and arg->pages point to rest */ | ||
598 | arg = &rqstp->rq_arg; | ||
599 | arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); | ||
600 | arg->head[0].iov_len = PAGE_SIZE; | ||
601 | arg->pages = rqstp->rq_pages + 1; | ||
602 | arg->page_base = 0; | ||
603 | /* save at least one page for response */ | ||
604 | arg->page_len = (pages-2)*PAGE_SIZE; | ||
605 | arg->len = (pages-1)*PAGE_SIZE; | ||
606 | arg->tail[0].iov_len = 0; | ||
607 | |||
608 | try_to_freeze(); | ||
609 | cond_resched(); | ||
610 | if (signalled()) | ||
611 | return -EINTR; | ||
612 | |||
613 | spin_lock_bh(&pool->sp_lock); | ||
614 | xprt = svc_xprt_dequeue(pool); | ||
615 | if (xprt) { | ||
616 | rqstp->rq_xprt = xprt; | ||
617 | svc_xprt_get(xprt); | ||
618 | rqstp->rq_reserved = serv->sv_max_mesg; | ||
619 | atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); | ||
620 | } else { | ||
621 | /* No data pending. Go to sleep */ | ||
622 | svc_thread_enqueue(pool, rqstp); | ||
623 | |||
624 | /* | ||
625 | * We have to be able to interrupt this wait | ||
626 | * to bring down the daemons ... | ||
627 | */ | ||
628 | set_current_state(TASK_INTERRUPTIBLE); | ||
629 | add_wait_queue(&rqstp->rq_wait, &wait); | ||
630 | spin_unlock_bh(&pool->sp_lock); | ||
631 | |||
632 | schedule_timeout(timeout); | ||
633 | |||
634 | try_to_freeze(); | ||
635 | |||
636 | spin_lock_bh(&pool->sp_lock); | ||
637 | remove_wait_queue(&rqstp->rq_wait, &wait); | ||
638 | |||
639 | xprt = rqstp->rq_xprt; | ||
640 | if (!xprt) { | ||
641 | svc_thread_dequeue(pool, rqstp); | ||
642 | spin_unlock_bh(&pool->sp_lock); | ||
643 | dprintk("svc: server %p, no data yet\n", rqstp); | ||
644 | return signalled()? -EINTR : -EAGAIN; | ||
645 | } | ||
646 | } | ||
647 | spin_unlock_bh(&pool->sp_lock); | ||
648 | |||
649 | len = 0; | ||
650 | if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) { | ||
651 | dprintk("svc_recv: found XPT_CLOSE\n"); | ||
652 | svc_delete_xprt(xprt); | ||
653 | } else if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) { | ||
654 | struct svc_xprt *newxpt; | ||
655 | newxpt = xprt->xpt_ops->xpo_accept(xprt); | ||
656 | if (newxpt) { | ||
657 | /* | ||
658 | * We know this module_get will succeed because the | ||
659 | * listener holds a reference too | ||
660 | */ | ||
661 | __module_get(newxpt->xpt_class->xcl_owner); | ||
662 | svc_check_conn_limits(xprt->xpt_server); | ||
663 | spin_lock_bh(&serv->sv_lock); | ||
664 | set_bit(XPT_TEMP, &newxpt->xpt_flags); | ||
665 | list_add(&newxpt->xpt_list, &serv->sv_tempsocks); | ||
666 | serv->sv_tmpcnt++; | ||
667 | if (serv->sv_temptimer.function == NULL) { | ||
668 | /* setup timer to age temp transports */ | ||
669 | setup_timer(&serv->sv_temptimer, | ||
670 | svc_age_temp_xprts, | ||
671 | (unsigned long)serv); | ||
672 | mod_timer(&serv->sv_temptimer, | ||
673 | jiffies + svc_conn_age_period * HZ); | ||
674 | } | ||
675 | spin_unlock_bh(&serv->sv_lock); | ||
676 | svc_xprt_received(newxpt); | ||
677 | } | ||
678 | svc_xprt_received(xprt); | ||
679 | } else { | ||
680 | dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", | ||
681 | rqstp, pool->sp_id, xprt, | ||
682 | atomic_read(&xprt->xpt_ref.refcount)); | ||
683 | rqstp->rq_deferred = svc_deferred_dequeue(xprt); | ||
684 | if (rqstp->rq_deferred) { | ||
685 | svc_xprt_received(xprt); | ||
686 | len = svc_deferred_recv(rqstp); | ||
687 | } else | ||
688 | len = xprt->xpt_ops->xpo_recvfrom(rqstp); | ||
689 | dprintk("svc: got len=%d\n", len); | ||
690 | } | ||
691 | |||
692 | /* No data, incomplete (TCP) read, or accept() */ | ||
693 | if (len == 0 || len == -EAGAIN) { | ||
694 | rqstp->rq_res.len = 0; | ||
695 | svc_xprt_release(rqstp); | ||
696 | return -EAGAIN; | ||
697 | } | ||
698 | clear_bit(XPT_OLD, &xprt->xpt_flags); | ||
699 | |||
700 | rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp)); | ||
701 | rqstp->rq_chandle.defer = svc_defer; | ||
702 | |||
703 | if (serv->sv_stats) | ||
704 | serv->sv_stats->netcnt++; | ||
705 | return len; | ||
706 | } | ||
707 | EXPORT_SYMBOL(svc_recv); | ||
708 | |||
709 | /* | ||
710 | * Drop request | ||
711 | */ | ||
712 | void svc_drop(struct svc_rqst *rqstp) | ||
713 | { | ||
714 | dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt); | ||
715 | svc_xprt_release(rqstp); | ||
716 | } | ||
717 | EXPORT_SYMBOL(svc_drop); | ||
718 | |||
/*
 * Return reply to client.
 *
 * Returns -EFAULT when the request has no transport attached.  Otherwise
 * returns whatever the transport's xpo_sendto method returned, except that
 * -ECONNREFUSED, -ENOTCONN and -EAGAIN are reported to the caller as 0.
 * A transport already flagged XPT_DEAD is not written to at all; that
 * case is treated like -ENOTCONN.
 */
int svc_send(struct svc_rqst *rqstp)
{
	struct svc_xprt *xprt = rqstp->rq_xprt;
	struct xdr_buf *res;
	int sent;

	if (xprt == NULL)
		return -EFAULT;

	/* release the receive skb before sending the reply */
	xprt->xpt_ops->xpo_release_rqst(rqstp);

	/* over-all reply length is head + page data + tail */
	res = &rqstp->rq_res;
	res->len = res->head[0].iov_len + res->page_len + res->tail[0].iov_len;

	/* xpt_mutex serializes outgoing data on this transport */
	mutex_lock(&xprt->xpt_mutex);
	if (!test_bit(XPT_DEAD, &xprt->xpt_flags))
		sent = xprt->xpt_ops->xpo_sendto(rqstp);
	else
		sent = -ENOTCONN;
	mutex_unlock(&xprt->xpt_mutex);
	svc_xprt_release(rqstp);

	switch (sent) {
	case -ECONNREFUSED:
	case -ENOTCONN:
	case -EAGAIN:
		/* these failures are not reported to the caller */
		return 0;
	default:
		return sent;
	}
}
754 | |||
755 | /* | ||
756 | * Timer function to close old temporary transports, using | ||
757 | * a mark-and-sweep algorithm. | ||
758 | */ | ||
759 | static void svc_age_temp_xprts(unsigned long closure) | ||
760 | { | ||
761 | struct svc_serv *serv = (struct svc_serv *)closure; | ||
762 | struct svc_xprt *xprt; | ||
763 | struct list_head *le, *next; | ||
764 | LIST_HEAD(to_be_aged); | ||
765 | |||
766 | dprintk("svc_age_temp_xprts\n"); | ||
767 | |||
768 | if (!spin_trylock_bh(&serv->sv_lock)) { | ||
769 | /* busy, try again 1 sec later */ | ||
770 | dprintk("svc_age_temp_xprts: busy\n"); | ||
771 | mod_timer(&serv->sv_temptimer, jiffies + HZ); | ||
772 | return; | ||
773 | } | ||
774 | |||
775 | list_for_each_safe(le, next, &serv->sv_tempsocks) { | ||
776 | xprt = list_entry(le, struct svc_xprt, xpt_list); | ||
777 | |||
778 | /* First time through, just mark it OLD. Second time | ||
779 | * through, close it. */ | ||
780 | if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags)) | ||
781 | continue; | ||
782 | if (atomic_read(&xprt->xpt_ref.refcount) > 1 | ||
783 | || test_bit(XPT_BUSY, &xprt->xpt_flags)) | ||
784 | continue; | ||
785 | svc_xprt_get(xprt); | ||
786 | list_move(le, &to_be_aged); | ||
787 | set_bit(XPT_CLOSE, &xprt->xpt_flags); | ||
788 | set_bit(XPT_DETACHED, &xprt->xpt_flags); | ||
789 | } | ||
790 | spin_unlock_bh(&serv->sv_lock); | ||
791 | |||
792 | while (!list_empty(&to_be_aged)) { | ||
793 | le = to_be_aged.next; | ||
794 | /* fiddling the xpt_list node is safe 'cos we're XPT_DETACHED */ | ||
795 | list_del_init(le); | ||
796 | xprt = list_entry(le, struct svc_xprt, xpt_list); | ||
797 | |||
798 | dprintk("queuing xprt %p for closing\n", xprt); | ||
799 | |||
800 | /* a thread will dequeue and close it soon */ | ||
801 | svc_xprt_enqueue(xprt); | ||
802 | svc_xprt_put(xprt); | ||
803 | } | ||
804 | |||
805 | mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); | ||
806 | } | ||
807 | |||
808 | /* | ||
809 | * Remove a dead transport | ||
810 | */ | ||
811 | void svc_delete_xprt(struct svc_xprt *xprt) | ||
812 | { | ||
813 | struct svc_serv *serv = xprt->xpt_server; | ||
814 | |||
815 | dprintk("svc: svc_delete_xprt(%p)\n", xprt); | ||
816 | xprt->xpt_ops->xpo_detach(xprt); | ||
817 | |||
818 | spin_lock_bh(&serv->sv_lock); | ||
819 | if (!test_and_set_bit(XPT_DETACHED, &xprt->xpt_flags)) | ||
820 | list_del_init(&xprt->xpt_list); | ||
821 | /* | ||
822 | * We used to delete the transport from whichever list | ||
823 | * it's sk_xprt.xpt_ready node was on, but we don't actually | ||
824 | * need to. This is because the only time we're called | ||
825 | * while still attached to a queue, the queue itself | ||
826 | * is about to be destroyed (in svc_destroy). | ||
827 | */ | ||
828 | if (!test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) { | ||
829 | BUG_ON(atomic_read(&xprt->xpt_ref.refcount) < 2); | ||
830 | if (test_bit(XPT_TEMP, &xprt->xpt_flags)) | ||
831 | serv->sv_tmpcnt--; | ||
832 | svc_xprt_put(xprt); | ||
833 | } | ||
834 | spin_unlock_bh(&serv->sv_lock); | ||
835 | } | ||
836 | |||
837 | void svc_close_xprt(struct svc_xprt *xprt) | ||
838 | { | ||
839 | set_bit(XPT_CLOSE, &xprt->xpt_flags); | ||
840 | if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) | ||
841 | /* someone else will have to effect the close */ | ||
842 | return; | ||
843 | |||
844 | svc_xprt_get(xprt); | ||
845 | svc_delete_xprt(xprt); | ||
846 | clear_bit(XPT_BUSY, &xprt->xpt_flags); | ||
847 | svc_xprt_put(xprt); | ||
848 | } | ||
849 | EXPORT_SYMBOL_GPL(svc_close_xprt); | ||
850 | |||
851 | void svc_close_all(struct list_head *xprt_list) | ||
852 | { | ||
853 | struct svc_xprt *xprt; | ||
854 | struct svc_xprt *tmp; | ||
855 | |||
856 | list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) { | ||
857 | set_bit(XPT_CLOSE, &xprt->xpt_flags); | ||
858 | if (test_bit(XPT_BUSY, &xprt->xpt_flags)) { | ||
859 | /* Waiting to be processed, but no threads left, | ||
860 | * So just remove it from the waiting list | ||
861 | */ | ||
862 | list_del_init(&xprt->xpt_ready); | ||
863 | clear_bit(XPT_BUSY, &xprt->xpt_flags); | ||
864 | } | ||
865 | svc_close_xprt(xprt); | ||
866 | } | ||
867 | } | ||
868 | |||
869 | /* | ||
870 | * Handle defer and revisit of requests | ||
871 | */ | ||
872 | |||
873 | static void svc_revisit(struct cache_deferred_req *dreq, int too_many) | ||
874 | { | ||
875 | struct svc_deferred_req *dr = | ||
876 | container_of(dreq, struct svc_deferred_req, handle); | ||
877 | struct svc_xprt *xprt = dr->xprt; | ||
878 | |||
879 | if (too_many) { | ||
880 | svc_xprt_put(xprt); | ||
881 | kfree(dr); | ||
882 | return; | ||
883 | } | ||
884 | dprintk("revisit queued\n"); | ||
885 | dr->xprt = NULL; | ||
886 | spin_lock(&xprt->xpt_lock); | ||
887 | list_add(&dr->handle.recent, &xprt->xpt_deferred); | ||
888 | spin_unlock(&xprt->xpt_lock); | ||
889 | set_bit(XPT_DEFERRED, &xprt->xpt_flags); | ||
890 | svc_xprt_enqueue(xprt); | ||
891 | svc_xprt_put(xprt); | ||
892 | } | ||
893 | |||
894 | /* | ||
895 | * Save the request off for later processing. The request buffer looks | ||
896 | * like this: | ||
897 | * | ||
898 | * <xprt-header><rpc-header><rpc-pagelist><rpc-tail> | ||
899 | * | ||
900 | * This code can only handle requests that consist of an xprt-header | ||
901 | * and rpc-header. | ||
902 | */ | ||
903 | static struct cache_deferred_req *svc_defer(struct cache_req *req) | ||
904 | { | ||
905 | struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); | ||
906 | struct svc_deferred_req *dr; | ||
907 | |||
908 | if (rqstp->rq_arg.page_len) | ||
909 | return NULL; /* if more than a page, give up FIXME */ | ||
910 | if (rqstp->rq_deferred) { | ||
911 | dr = rqstp->rq_deferred; | ||
912 | rqstp->rq_deferred = NULL; | ||
913 | } else { | ||
914 | size_t skip; | ||
915 | size_t size; | ||
916 | /* FIXME maybe discard if size too large */ | ||
917 | size = sizeof(struct svc_deferred_req) + rqstp->rq_arg.len; | ||
918 | dr = kmalloc(size, GFP_KERNEL); | ||
919 | if (dr == NULL) | ||
920 | return NULL; | ||
921 | |||
922 | dr->handle.owner = rqstp->rq_server; | ||
923 | dr->prot = rqstp->rq_prot; | ||
924 | memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen); | ||
925 | dr->addrlen = rqstp->rq_addrlen; | ||
926 | dr->daddr = rqstp->rq_daddr; | ||
927 | dr->argslen = rqstp->rq_arg.len >> 2; | ||
928 | dr->xprt_hlen = rqstp->rq_xprt_hlen; | ||
929 | |||
930 | /* back up head to the start of the buffer and copy */ | ||
931 | skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; | ||
932 | memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip, | ||
933 | dr->argslen << 2); | ||
934 | } | ||
935 | svc_xprt_get(rqstp->rq_xprt); | ||
936 | dr->xprt = rqstp->rq_xprt; | ||
937 | |||
938 | dr->handle.revisit = svc_revisit; | ||
939 | return &dr->handle; | ||
940 | } | ||
941 | |||
942 | /* | ||
943 | * recv data from a deferred request into an active one | ||
944 | */ | ||
945 | static int svc_deferred_recv(struct svc_rqst *rqstp) | ||
946 | { | ||
947 | struct svc_deferred_req *dr = rqstp->rq_deferred; | ||
948 | |||
949 | /* setup iov_base past transport header */ | ||
950 | rqstp->rq_arg.head[0].iov_base = dr->args + (dr->xprt_hlen>>2); | ||
951 | /* The iov_len does not include the transport header bytes */ | ||
952 | rqstp->rq_arg.head[0].iov_len = (dr->argslen<<2) - dr->xprt_hlen; | ||
953 | rqstp->rq_arg.page_len = 0; | ||
954 | /* The rq_arg.len includes the transport header bytes */ | ||
955 | rqstp->rq_arg.len = dr->argslen<<2; | ||
956 | rqstp->rq_prot = dr->prot; | ||
957 | memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen); | ||
958 | rqstp->rq_addrlen = dr->addrlen; | ||
959 | /* Save off transport header len in case we get deferred again */ | ||
960 | rqstp->rq_xprt_hlen = dr->xprt_hlen; | ||
961 | rqstp->rq_daddr = dr->daddr; | ||
962 | rqstp->rq_respages = rqstp->rq_pages; | ||
963 | return (dr->argslen<<2) - dr->xprt_hlen; | ||
964 | } | ||
965 | |||
966 | |||
967 | static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) | ||
968 | { | ||
969 | struct svc_deferred_req *dr = NULL; | ||
970 | |||
971 | if (!test_bit(XPT_DEFERRED, &xprt->xpt_flags)) | ||
972 | return NULL; | ||
973 | spin_lock(&xprt->xpt_lock); | ||
974 | clear_bit(XPT_DEFERRED, &xprt->xpt_flags); | ||
975 | if (!list_empty(&xprt->xpt_deferred)) { | ||
976 | dr = list_entry(xprt->xpt_deferred.next, | ||
977 | struct svc_deferred_req, | ||
978 | handle.recent); | ||
979 | list_del_init(&dr->handle.recent); | ||
980 | set_bit(XPT_DEFERRED, &xprt->xpt_flags); | ||
981 | } | ||
982 | spin_unlock(&xprt->xpt_lock); | ||
983 | return dr; | ||
984 | } | ||
985 | |||
986 | /* | ||
987 | * Return the transport instance pointer for the endpoint accepting | ||
988 | * connections/peer traffic from the specified transport class, | ||
989 | * address family and port. | ||
990 | * | ||
991 | * Specifying 0 for the address family or port is effectively a | ||
992 | * wild-card, and will result in matching the first transport in the | ||
993 | * service's list that has a matching class name. | ||
994 | */ | ||
995 | struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name, | ||
996 | int af, int port) | ||
997 | { | ||
998 | struct svc_xprt *xprt; | ||
999 | struct svc_xprt *found = NULL; | ||
1000 | |||
1001 | /* Sanity check the args */ | ||
1002 | if (!serv || !xcl_name) | ||
1003 | return found; | ||
1004 | |||
1005 | spin_lock_bh(&serv->sv_lock); | ||
1006 | list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) { | ||
1007 | if (strcmp(xprt->xpt_class->xcl_name, xcl_name)) | ||
1008 | continue; | ||
1009 | if (af != AF_UNSPEC && af != xprt->xpt_local.ss_family) | ||
1010 | continue; | ||
1011 | if (port && port != svc_xprt_local_port(xprt)) | ||
1012 | continue; | ||
1013 | found = xprt; | ||
1014 | svc_xprt_get(xprt); | ||
1015 | break; | ||
1016 | } | ||
1017 | spin_unlock_bh(&serv->sv_lock); | ||
1018 | return found; | ||
1019 | } | ||
1020 | EXPORT_SYMBOL_GPL(svc_find_xprt); | ||
1021 | |||
1022 | /* | ||
1023 | * Format a buffer with a list of the active transports. A zero for | ||
1024 | * the buflen parameter disables target buffer overflow checking. | ||
1025 | */ | ||
1026 | int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen) | ||
1027 | { | ||
1028 | struct svc_xprt *xprt; | ||
1029 | char xprt_str[64]; | ||
1030 | int totlen = 0; | ||
1031 | int len; | ||
1032 | |||
1033 | /* Sanity check args */ | ||
1034 | if (!serv) | ||
1035 | return 0; | ||
1036 | |||
1037 | spin_lock_bh(&serv->sv_lock); | ||
1038 | list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) { | ||
1039 | len = snprintf(xprt_str, sizeof(xprt_str), | ||
1040 | "%s %d\n", xprt->xpt_class->xcl_name, | ||
1041 | svc_xprt_local_port(xprt)); | ||
1042 | /* If the string was truncated, replace with error string */ | ||
1043 | if (len >= sizeof(xprt_str)) | ||
1044 | strcpy(xprt_str, "name-too-long\n"); | ||
1045 | /* Don't overflow buffer */ | ||
1046 | len = strlen(xprt_str); | ||
1047 | if (buflen && (len + totlen >= buflen)) | ||
1048 | break; | ||
1049 | strcpy(buf+totlen, xprt_str); | ||
1050 | totlen += len; | ||
1051 | } | ||
1052 | spin_unlock_bh(&serv->sv_lock); | ||
1053 | return totlen; | ||
1054 | } | ||
1055 | EXPORT_SYMBOL_GPL(svc_xprt_names); | ||
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index af7c5f05c6e1..8a73cbb16052 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c | |||
@@ -57,11 +57,13 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) | |||
57 | rqstp->rq_authop = aops; | 57 | rqstp->rq_authop = aops; |
58 | return aops->accept(rqstp, authp); | 58 | return aops->accept(rqstp, authp); |
59 | } | 59 | } |
60 | EXPORT_SYMBOL(svc_authenticate); | ||
60 | 61 | ||
61 | int svc_set_client(struct svc_rqst *rqstp) | 62 | int svc_set_client(struct svc_rqst *rqstp) |
62 | { | 63 | { |
63 | return rqstp->rq_authop->set_client(rqstp); | 64 | return rqstp->rq_authop->set_client(rqstp); |
64 | } | 65 | } |
66 | EXPORT_SYMBOL(svc_set_client); | ||
65 | 67 | ||
66 | /* A request, which was authenticated, has now executed. | 68 | /* A request, which was authenticated, has now executed. |
67 | * Time to finalise the credentials and verifier | 69 | * Time to finalise the credentials and verifier |
@@ -93,6 +95,7 @@ svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops) | |||
93 | spin_unlock(&authtab_lock); | 95 | spin_unlock(&authtab_lock); |
94 | return rv; | 96 | return rv; |
95 | } | 97 | } |
98 | EXPORT_SYMBOL(svc_auth_register); | ||
96 | 99 | ||
97 | void | 100 | void |
98 | svc_auth_unregister(rpc_authflavor_t flavor) | 101 | svc_auth_unregister(rpc_authflavor_t flavor) |
@@ -129,6 +132,7 @@ void auth_domain_put(struct auth_domain *dom) | |||
129 | spin_unlock(&auth_domain_lock); | 132 | spin_unlock(&auth_domain_lock); |
130 | } | 133 | } |
131 | } | 134 | } |
135 | EXPORT_SYMBOL(auth_domain_put); | ||
132 | 136 | ||
133 | struct auth_domain * | 137 | struct auth_domain * |
134 | auth_domain_lookup(char *name, struct auth_domain *new) | 138 | auth_domain_lookup(char *name, struct auth_domain *new) |
@@ -153,8 +157,10 @@ auth_domain_lookup(char *name, struct auth_domain *new) | |||
153 | spin_unlock(&auth_domain_lock); | 157 | spin_unlock(&auth_domain_lock); |
154 | return new; | 158 | return new; |
155 | } | 159 | } |
160 | EXPORT_SYMBOL(auth_domain_lookup); | ||
156 | 161 | ||
157 | struct auth_domain *auth_domain_find(char *name) | 162 | struct auth_domain *auth_domain_find(char *name) |
158 | { | 163 | { |
159 | return auth_domain_lookup(name, NULL); | 164 | return auth_domain_lookup(name, NULL); |
160 | } | 165 | } |
166 | EXPORT_SYMBOL(auth_domain_find); | ||
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 411479411b21..3c64051e4555 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c | |||
@@ -63,6 +63,7 @@ struct auth_domain *unix_domain_find(char *name) | |||
63 | rv = auth_domain_lookup(name, &new->h); | 63 | rv = auth_domain_lookup(name, &new->h); |
64 | } | 64 | } |
65 | } | 65 | } |
66 | EXPORT_SYMBOL(unix_domain_find); | ||
66 | 67 | ||
67 | static void svcauth_unix_domain_release(struct auth_domain *dom) | 68 | static void svcauth_unix_domain_release(struct auth_domain *dom) |
68 | { | 69 | { |
@@ -340,6 +341,7 @@ int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom) | |||
340 | else | 341 | else |
341 | return -ENOMEM; | 342 | return -ENOMEM; |
342 | } | 343 | } |
344 | EXPORT_SYMBOL(auth_unix_add_addr); | ||
343 | 345 | ||
344 | int auth_unix_forget_old(struct auth_domain *dom) | 346 | int auth_unix_forget_old(struct auth_domain *dom) |
345 | { | 347 | { |
@@ -351,6 +353,7 @@ int auth_unix_forget_old(struct auth_domain *dom) | |||
351 | udom->addr_changes++; | 353 | udom->addr_changes++; |
352 | return 0; | 354 | return 0; |
353 | } | 355 | } |
356 | EXPORT_SYMBOL(auth_unix_forget_old); | ||
354 | 357 | ||
355 | struct auth_domain *auth_unix_lookup(struct in_addr addr) | 358 | struct auth_domain *auth_unix_lookup(struct in_addr addr) |
356 | { | 359 | { |
@@ -375,50 +378,56 @@ struct auth_domain *auth_unix_lookup(struct in_addr addr) | |||
375 | cache_put(&ipm->h, &ip_map_cache); | 378 | cache_put(&ipm->h, &ip_map_cache); |
376 | return rv; | 379 | return rv; |
377 | } | 380 | } |
381 | EXPORT_SYMBOL(auth_unix_lookup); | ||
378 | 382 | ||
379 | void svcauth_unix_purge(void) | 383 | void svcauth_unix_purge(void) |
380 | { | 384 | { |
381 | cache_purge(&ip_map_cache); | 385 | cache_purge(&ip_map_cache); |
382 | } | 386 | } |
387 | EXPORT_SYMBOL(svcauth_unix_purge); | ||
383 | 388 | ||
384 | static inline struct ip_map * | 389 | static inline struct ip_map * |
385 | ip_map_cached_get(struct svc_rqst *rqstp) | 390 | ip_map_cached_get(struct svc_rqst *rqstp) |
386 | { | 391 | { |
387 | struct ip_map *ipm; | 392 | struct ip_map *ipm = NULL; |
388 | struct svc_sock *svsk = rqstp->rq_sock; | 393 | struct svc_xprt *xprt = rqstp->rq_xprt; |
389 | spin_lock(&svsk->sk_lock); | 394 | |
390 | ipm = svsk->sk_info_authunix; | 395 | if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) { |
391 | if (ipm != NULL) { | 396 | spin_lock(&xprt->xpt_lock); |
392 | if (!cache_valid(&ipm->h)) { | 397 | ipm = xprt->xpt_auth_cache; |
393 | /* | 398 | if (ipm != NULL) { |
394 | * The entry has been invalidated since it was | 399 | if (!cache_valid(&ipm->h)) { |
395 | * remembered, e.g. by a second mount from the | 400 | /* |
396 | * same IP address. | 401 | * The entry has been invalidated since it was |
397 | */ | 402 | * remembered, e.g. by a second mount from the |
398 | svsk->sk_info_authunix = NULL; | 403 | * same IP address. |
399 | spin_unlock(&svsk->sk_lock); | 404 | */ |
400 | cache_put(&ipm->h, &ip_map_cache); | 405 | xprt->xpt_auth_cache = NULL; |
401 | return NULL; | 406 | spin_unlock(&xprt->xpt_lock); |
407 | cache_put(&ipm->h, &ip_map_cache); | ||
408 | return NULL; | ||
409 | } | ||
410 | cache_get(&ipm->h); | ||
402 | } | 411 | } |
403 | cache_get(&ipm->h); | 412 | spin_unlock(&xprt->xpt_lock); |
404 | } | 413 | } |
405 | spin_unlock(&svsk->sk_lock); | ||
406 | return ipm; | 414 | return ipm; |
407 | } | 415 | } |
408 | 416 | ||
409 | static inline void | 417 | static inline void |
410 | ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm) | 418 | ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm) |
411 | { | 419 | { |
412 | struct svc_sock *svsk = rqstp->rq_sock; | 420 | struct svc_xprt *xprt = rqstp->rq_xprt; |
413 | 421 | ||
414 | spin_lock(&svsk->sk_lock); | 422 | if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) { |
415 | if (svsk->sk_sock->type == SOCK_STREAM && | 423 | spin_lock(&xprt->xpt_lock); |
416 | svsk->sk_info_authunix == NULL) { | 424 | if (xprt->xpt_auth_cache == NULL) { |
417 | /* newly cached, keep the reference */ | 425 | /* newly cached, keep the reference */ |
418 | svsk->sk_info_authunix = ipm; | 426 | xprt->xpt_auth_cache = ipm; |
419 | ipm = NULL; | 427 | ipm = NULL; |
428 | } | ||
429 | spin_unlock(&xprt->xpt_lock); | ||
420 | } | 430 | } |
421 | spin_unlock(&svsk->sk_lock); | ||
422 | if (ipm) | 431 | if (ipm) |
423 | cache_put(&ipm->h, &ip_map_cache); | 432 | cache_put(&ipm->h, &ip_map_cache); |
424 | } | 433 | } |
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c75bffeb89eb..1d3e5fcc2cc4 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c | |||
@@ -5,7 +5,7 @@ | |||
5 | * | 5 | * |
6 | * The server scheduling algorithm does not always distribute the load | 6 | * The server scheduling algorithm does not always distribute the load |
7 | * evenly when servicing a single client. May need to modify the | 7 | * evenly when servicing a single client. May need to modify the |
8 | * svc_sock_enqueue procedure... | 8 | * svc_xprt_enqueue procedure... |
9 | * | 9 | * |
10 | * TCP support is largely untested and may be a little slow. The problem | 10 | * TCP support is largely untested and may be a little slow. The problem |
11 | * is that we currently do two separate recvfrom's, one for the 4-byte | 11 | * is that we currently do two separate recvfrom's, one for the 4-byte |
@@ -48,72 +48,40 @@ | |||
48 | #include <linux/sunrpc/svcsock.h> | 48 | #include <linux/sunrpc/svcsock.h> |
49 | #include <linux/sunrpc/stats.h> | 49 | #include <linux/sunrpc/stats.h> |
50 | 50 | ||
51 | /* SMP locking strategy: | 51 | #define RPCDBG_FACILITY RPCDBG_SVCXPRT |
52 | * | ||
53 | * svc_pool->sp_lock protects most of the fields of that pool. | ||
54 | * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. | ||
55 | * when both need to be taken (rare), svc_serv->sv_lock is first. | ||
56 | * BKL protects svc_serv->sv_nrthread. | ||
57 | * svc_sock->sk_lock protects the svc_sock->sk_deferred list | ||
58 | * and the ->sk_info_authunix cache. | ||
59 | * svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued multiply. | ||
60 | * | ||
61 | * Some flags can be set to certain values at any time | ||
62 | * providing that certain rules are followed: | ||
63 | * | ||
64 | * SK_CONN, SK_DATA, can be set or cleared at any time. | ||
65 | * after a set, svc_sock_enqueue must be called. | ||
66 | * after a clear, the socket must be read/accepted | ||
67 | * if this succeeds, it must be set again. | ||
68 | * SK_CLOSE can set at any time. It is never cleared. | ||
69 | * sk_inuse contains a bias of '1' until SK_DEAD is set. | ||
70 | * so when sk_inuse hits zero, we know the socket is dead | ||
71 | * and no-one is using it. | ||
72 | * SK_DEAD can only be set while SK_BUSY is held which ensures | ||
73 | * no other thread will be using the socket or will try to | ||
74 | * set SK_DEAD. | ||
75 | * | ||
76 | */ | ||
77 | |||
78 | #define RPCDBG_FACILITY RPCDBG_SVCSOCK | ||
79 | 52 | ||
80 | 53 | ||
81 | static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, | 54 | static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, |
82 | int *errp, int flags); | 55 | int *errp, int flags); |
83 | static void svc_delete_socket(struct svc_sock *svsk); | ||
84 | static void svc_udp_data_ready(struct sock *, int); | 56 | static void svc_udp_data_ready(struct sock *, int); |
85 | static int svc_udp_recvfrom(struct svc_rqst *); | 57 | static int svc_udp_recvfrom(struct svc_rqst *); |
86 | static int svc_udp_sendto(struct svc_rqst *); | 58 | static int svc_udp_sendto(struct svc_rqst *); |
87 | static void svc_close_socket(struct svc_sock *svsk); | 59 | static void svc_sock_detach(struct svc_xprt *); |
88 | 60 | static void svc_sock_free(struct svc_xprt *); | |
89 | static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk); | ||
90 | static int svc_deferred_recv(struct svc_rqst *rqstp); | ||
91 | static struct cache_deferred_req *svc_defer(struct cache_req *req); | ||
92 | |||
93 | /* apparently the "standard" is that clients close | ||
94 | * idle connections after 5 minutes, servers after | ||
95 | * 6 minutes | ||
96 | * http://www.connectathon.org/talks96/nfstcp.pdf | ||
97 | */ | ||
98 | static int svc_conn_age_period = 6*60; | ||
99 | 61 | ||
62 | static struct svc_xprt *svc_create_socket(struct svc_serv *, int, | ||
63 | struct sockaddr *, int, int); | ||
100 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 64 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
101 | static struct lock_class_key svc_key[2]; | 65 | static struct lock_class_key svc_key[2]; |
102 | static struct lock_class_key svc_slock_key[2]; | 66 | static struct lock_class_key svc_slock_key[2]; |
103 | 67 | ||
104 | static inline void svc_reclassify_socket(struct socket *sock) | 68 | static void svc_reclassify_socket(struct socket *sock) |
105 | { | 69 | { |
106 | struct sock *sk = sock->sk; | 70 | struct sock *sk = sock->sk; |
107 | BUG_ON(sock_owned_by_user(sk)); | 71 | BUG_ON(sock_owned_by_user(sk)); |
108 | switch (sk->sk_family) { | 72 | switch (sk->sk_family) { |
109 | case AF_INET: | 73 | case AF_INET: |
110 | sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD", | 74 | sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD", |
111 | &svc_slock_key[0], "sk_lock-AF_INET-NFSD", &svc_key[0]); | 75 | &svc_slock_key[0], |
76 | "sk_xprt.xpt_lock-AF_INET-NFSD", | ||
77 | &svc_key[0]); | ||
112 | break; | 78 | break; |
113 | 79 | ||
114 | case AF_INET6: | 80 | case AF_INET6: |
115 | sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD", | 81 | sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD", |
116 | &svc_slock_key[1], "sk_lock-AF_INET6-NFSD", &svc_key[1]); | 82 | &svc_slock_key[1], |
83 | "sk_xprt.xpt_lock-AF_INET6-NFSD", | ||
84 | &svc_key[1]); | ||
117 | break; | 85 | break; |
118 | 86 | ||
119 | default: | 87 | default: |
@@ -121,81 +89,26 @@ static inline void svc_reclassify_socket(struct socket *sock) | |||
121 | } | 89 | } |
122 | } | 90 | } |
123 | #else | 91 | #else |
124 | static inline void svc_reclassify_socket(struct socket *sock) | 92 | static void svc_reclassify_socket(struct socket *sock) |
125 | { | 93 | { |
126 | } | 94 | } |
127 | #endif | 95 | #endif |
128 | 96 | ||
129 | static char *__svc_print_addr(struct sockaddr *addr, char *buf, size_t len) | ||
130 | { | ||
131 | switch (addr->sa_family) { | ||
132 | case AF_INET: | ||
133 | snprintf(buf, len, "%u.%u.%u.%u, port=%u", | ||
134 | NIPQUAD(((struct sockaddr_in *) addr)->sin_addr), | ||
135 | ntohs(((struct sockaddr_in *) addr)->sin_port)); | ||
136 | break; | ||
137 | |||
138 | case AF_INET6: | ||
139 | snprintf(buf, len, "%x:%x:%x:%x:%x:%x:%x:%x, port=%u", | ||
140 | NIP6(((struct sockaddr_in6 *) addr)->sin6_addr), | ||
141 | ntohs(((struct sockaddr_in6 *) addr)->sin6_port)); | ||
142 | break; | ||
143 | |||
144 | default: | ||
145 | snprintf(buf, len, "unknown address type: %d", addr->sa_family); | ||
146 | break; | ||
147 | } | ||
148 | return buf; | ||
149 | } | ||
150 | |||
151 | /** | ||
152 | * svc_print_addr - Format rq_addr field for printing | ||
153 | * @rqstp: svc_rqst struct containing address to print | ||
154 | * @buf: target buffer for formatted address | ||
155 | * @len: length of target buffer | ||
156 | * | ||
157 | */ | ||
158 | char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len) | ||
159 | { | ||
160 | return __svc_print_addr(svc_addr(rqstp), buf, len); | ||
161 | } | ||
162 | EXPORT_SYMBOL_GPL(svc_print_addr); | ||
163 | |||
164 | /* | ||
165 | * Queue up an idle server thread. Must have pool->sp_lock held. | ||
166 | * Note: this is really a stack rather than a queue, so that we only | ||
167 | * use as many different threads as we need, and the rest don't pollute | ||
168 | * the cache. | ||
169 | */ | ||
170 | static inline void | ||
171 | svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp) | ||
172 | { | ||
173 | list_add(&rqstp->rq_list, &pool->sp_threads); | ||
174 | } | ||
175 | |||
176 | /* | ||
177 | * Dequeue an nfsd thread. Must have pool->sp_lock held. | ||
178 | */ | ||
179 | static inline void | ||
180 | svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp) | ||
181 | { | ||
182 | list_del(&rqstp->rq_list); | ||
183 | } | ||
184 | |||
185 | /* | 97 | /* |
186 | * Release an skbuff after use | 98 | * Release an skbuff after use |
187 | */ | 99 | */ |
188 | static inline void | 100 | static void svc_release_skb(struct svc_rqst *rqstp) |
189 | svc_release_skb(struct svc_rqst *rqstp) | ||
190 | { | 101 | { |
191 | struct sk_buff *skb = rqstp->rq_skbuff; | 102 | struct sk_buff *skb = rqstp->rq_xprt_ctxt; |
192 | struct svc_deferred_req *dr = rqstp->rq_deferred; | 103 | struct svc_deferred_req *dr = rqstp->rq_deferred; |
193 | 104 | ||
194 | if (skb) { | 105 | if (skb) { |
195 | rqstp->rq_skbuff = NULL; | 106 | struct svc_sock *svsk = |
107 | container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); | ||
108 | rqstp->rq_xprt_ctxt = NULL; | ||
196 | 109 | ||
197 | dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); | 110 | dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); |
198 | skb_free_datagram(rqstp->rq_sock->sk_sk, skb); | 111 | skb_free_datagram(svsk->sk_sk, skb); |
199 | } | 112 | } |
200 | if (dr) { | 113 | if (dr) { |
201 | rqstp->rq_deferred = NULL; | 114 | rqstp->rq_deferred = NULL; |
@@ -203,253 +116,6 @@ svc_release_skb(struct svc_rqst *rqstp) | |||
203 | } | 116 | } |
204 | } | 117 | } |
205 | 118 | ||
206 | /* | ||
207 | * Any space to write? | ||
208 | */ | ||
209 | static inline unsigned long | ||
210 | svc_sock_wspace(struct svc_sock *svsk) | ||
211 | { | ||
212 | int wspace; | ||
213 | |||
214 | if (svsk->sk_sock->type == SOCK_STREAM) | ||
215 | wspace = sk_stream_wspace(svsk->sk_sk); | ||
216 | else | ||
217 | wspace = sock_wspace(svsk->sk_sk); | ||
218 | |||
219 | return wspace; | ||
220 | } | ||
221 | |||
222 | /* | ||
223 | * Queue up a socket with data pending. If there are idle nfsd | ||
224 | * processes, wake 'em up. | ||
225 | * | ||
226 | */ | ||
227 | static void | ||
228 | svc_sock_enqueue(struct svc_sock *svsk) | ||
229 | { | ||
230 | struct svc_serv *serv = svsk->sk_server; | ||
231 | struct svc_pool *pool; | ||
232 | struct svc_rqst *rqstp; | ||
233 | int cpu; | ||
234 | |||
235 | if (!(svsk->sk_flags & | ||
236 | ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) )) | ||
237 | return; | ||
238 | if (test_bit(SK_DEAD, &svsk->sk_flags)) | ||
239 | return; | ||
240 | |||
241 | cpu = get_cpu(); | ||
242 | pool = svc_pool_for_cpu(svsk->sk_server, cpu); | ||
243 | put_cpu(); | ||
244 | |||
245 | spin_lock_bh(&pool->sp_lock); | ||
246 | |||
247 | if (!list_empty(&pool->sp_threads) && | ||
248 | !list_empty(&pool->sp_sockets)) | ||
249 | printk(KERN_ERR | ||
250 | "svc_sock_enqueue: threads and sockets both waiting??\n"); | ||
251 | |||
252 | if (test_bit(SK_DEAD, &svsk->sk_flags)) { | ||
253 | /* Don't enqueue dead sockets */ | ||
254 | dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk); | ||
255 | goto out_unlock; | ||
256 | } | ||
257 | |||
258 | /* Mark socket as busy. It will remain in this state until the | ||
259 | * server has processed all pending data and put the socket back | ||
260 | * on the idle list. We update SK_BUSY atomically because | ||
261 | * it also guards against trying to enqueue the svc_sock twice. | ||
262 | */ | ||
263 | if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) { | ||
264 | /* Don't enqueue socket while already enqueued */ | ||
265 | dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk); | ||
266 | goto out_unlock; | ||
267 | } | ||
268 | BUG_ON(svsk->sk_pool != NULL); | ||
269 | svsk->sk_pool = pool; | ||
270 | |||
271 | set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); | ||
272 | if (((atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg)*2 | ||
273 | > svc_sock_wspace(svsk)) | ||
274 | && !test_bit(SK_CLOSE, &svsk->sk_flags) | ||
275 | && !test_bit(SK_CONN, &svsk->sk_flags)) { | ||
276 | /* Don't enqueue while not enough space for reply */ | ||
277 | dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n", | ||
278 | svsk->sk_sk, atomic_read(&svsk->sk_reserved)+serv->sv_max_mesg, | ||
279 | svc_sock_wspace(svsk)); | ||
280 | svsk->sk_pool = NULL; | ||
281 | clear_bit(SK_BUSY, &svsk->sk_flags); | ||
282 | goto out_unlock; | ||
283 | } | ||
284 | clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); | ||
285 | |||
286 | |||
287 | if (!list_empty(&pool->sp_threads)) { | ||
288 | rqstp = list_entry(pool->sp_threads.next, | ||
289 | struct svc_rqst, | ||
290 | rq_list); | ||
291 | dprintk("svc: socket %p served by daemon %p\n", | ||
292 | svsk->sk_sk, rqstp); | ||
293 | svc_thread_dequeue(pool, rqstp); | ||
294 | if (rqstp->rq_sock) | ||
295 | printk(KERN_ERR | ||
296 | "svc_sock_enqueue: server %p, rq_sock=%p!\n", | ||
297 | rqstp, rqstp->rq_sock); | ||
298 | rqstp->rq_sock = svsk; | ||
299 | atomic_inc(&svsk->sk_inuse); | ||
300 | rqstp->rq_reserved = serv->sv_max_mesg; | ||
301 | atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); | ||
302 | BUG_ON(svsk->sk_pool != pool); | ||
303 | wake_up(&rqstp->rq_wait); | ||
304 | } else { | ||
305 | dprintk("svc: socket %p put into queue\n", svsk->sk_sk); | ||
306 | list_add_tail(&svsk->sk_ready, &pool->sp_sockets); | ||
307 | BUG_ON(svsk->sk_pool != pool); | ||
308 | } | ||
309 | |||
310 | out_unlock: | ||
311 | spin_unlock_bh(&pool->sp_lock); | ||
312 | } | ||
313 | |||
314 | /* | ||
315 | * Dequeue the first socket. Must be called with the pool->sp_lock held. | ||
316 | */ | ||
317 | static inline struct svc_sock * | ||
318 | svc_sock_dequeue(struct svc_pool *pool) | ||
319 | { | ||
320 | struct svc_sock *svsk; | ||
321 | |||
322 | if (list_empty(&pool->sp_sockets)) | ||
323 | return NULL; | ||
324 | |||
325 | svsk = list_entry(pool->sp_sockets.next, | ||
326 | struct svc_sock, sk_ready); | ||
327 | list_del_init(&svsk->sk_ready); | ||
328 | |||
329 | dprintk("svc: socket %p dequeued, inuse=%d\n", | ||
330 | svsk->sk_sk, atomic_read(&svsk->sk_inuse)); | ||
331 | |||
332 | return svsk; | ||
333 | } | ||
334 | |||
335 | /* | ||
336 | * Having read something from a socket, check whether it | ||
337 | * needs to be re-enqueued. | ||
338 | * Note: SK_DATA only gets cleared when a read-attempt finds | ||
339 | * no (or insufficient) data. | ||
340 | */ | ||
341 | static inline void | ||
342 | svc_sock_received(struct svc_sock *svsk) | ||
343 | { | ||
344 | svsk->sk_pool = NULL; | ||
345 | clear_bit(SK_BUSY, &svsk->sk_flags); | ||
346 | svc_sock_enqueue(svsk); | ||
347 | } | ||
348 | |||
349 | |||
350 | /** | ||
351 | * svc_reserve - change the space reserved for the reply to a request. | ||
352 | * @rqstp: The request in question | ||
353 | * @space: new max space to reserve | ||
354 | * | ||
355 | * Each request reserves some space on the output queue of the socket | ||
356 | * to make sure the reply fits. This function reduces that reserved | ||
357 | * space to be the amount of space used already, plus @space. | ||
358 | * | ||
359 | */ | ||
360 | void svc_reserve(struct svc_rqst *rqstp, int space) | ||
361 | { | ||
362 | space += rqstp->rq_res.head[0].iov_len; | ||
363 | |||
364 | if (space < rqstp->rq_reserved) { | ||
365 | struct svc_sock *svsk = rqstp->rq_sock; | ||
366 | atomic_sub((rqstp->rq_reserved - space), &svsk->sk_reserved); | ||
367 | rqstp->rq_reserved = space; | ||
368 | |||
369 | svc_sock_enqueue(svsk); | ||
370 | } | ||
371 | } | ||
372 | |||
373 | /* | ||
374 | * Release a socket after use. | ||
375 | */ | ||
376 | static inline void | ||
377 | svc_sock_put(struct svc_sock *svsk) | ||
378 | { | ||
379 | if (atomic_dec_and_test(&svsk->sk_inuse)) { | ||
380 | BUG_ON(! test_bit(SK_DEAD, &svsk->sk_flags)); | ||
381 | |||
382 | dprintk("svc: releasing dead socket\n"); | ||
383 | if (svsk->sk_sock->file) | ||
384 | sockfd_put(svsk->sk_sock); | ||
385 | else | ||
386 | sock_release(svsk->sk_sock); | ||
387 | if (svsk->sk_info_authunix != NULL) | ||
388 | svcauth_unix_info_release(svsk->sk_info_authunix); | ||
389 | kfree(svsk); | ||
390 | } | ||
391 | } | ||
392 | |||
393 | static void | ||
394 | svc_sock_release(struct svc_rqst *rqstp) | ||
395 | { | ||
396 | struct svc_sock *svsk = rqstp->rq_sock; | ||
397 | |||
398 | svc_release_skb(rqstp); | ||
399 | |||
400 | svc_free_res_pages(rqstp); | ||
401 | rqstp->rq_res.page_len = 0; | ||
402 | rqstp->rq_res.page_base = 0; | ||
403 | |||
404 | |||
405 | /* Reset response buffer and release | ||
406 | * the reservation. | ||
407 | * But first, check that enough space was reserved | ||
408 | * for the reply, otherwise we have a bug! | ||
409 | */ | ||
410 | if ((rqstp->rq_res.len) > rqstp->rq_reserved) | ||
411 | printk(KERN_ERR "RPC request reserved %d but used %d\n", | ||
412 | rqstp->rq_reserved, | ||
413 | rqstp->rq_res.len); | ||
414 | |||
415 | rqstp->rq_res.head[0].iov_len = 0; | ||
416 | svc_reserve(rqstp, 0); | ||
417 | rqstp->rq_sock = NULL; | ||
418 | |||
419 | svc_sock_put(svsk); | ||
420 | } | ||
421 | |||
422 | /* | ||
423 | * External function to wake up a server waiting for data | ||
424 | * This really only makes sense for services like lockd | ||
425 | * which have exactly one thread anyway. | ||
426 | */ | ||
427 | void | ||
428 | svc_wake_up(struct svc_serv *serv) | ||
429 | { | ||
430 | struct svc_rqst *rqstp; | ||
431 | unsigned int i; | ||
432 | struct svc_pool *pool; | ||
433 | |||
434 | for (i = 0; i < serv->sv_nrpools; i++) { | ||
435 | pool = &serv->sv_pools[i]; | ||
436 | |||
437 | spin_lock_bh(&pool->sp_lock); | ||
438 | if (!list_empty(&pool->sp_threads)) { | ||
439 | rqstp = list_entry(pool->sp_threads.next, | ||
440 | struct svc_rqst, | ||
441 | rq_list); | ||
442 | dprintk("svc: daemon %p woken up.\n", rqstp); | ||
443 | /* | ||
444 | svc_thread_dequeue(pool, rqstp); | ||
445 | rqstp->rq_sock = NULL; | ||
446 | */ | ||
447 | wake_up(&rqstp->rq_wait); | ||
448 | } | ||
449 | spin_unlock_bh(&pool->sp_lock); | ||
450 | } | ||
451 | } | ||
452 | |||
453 | union svc_pktinfo_u { | 119 | union svc_pktinfo_u { |
454 | struct in_pktinfo pkti; | 120 | struct in_pktinfo pkti; |
455 | struct in6_pktinfo pkti6; | 121 | struct in6_pktinfo pkti6; |
@@ -459,7 +125,9 @@ union svc_pktinfo_u { | |||
459 | 125 | ||
460 | static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) | 126 | static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) |
461 | { | 127 | { |
462 | switch (rqstp->rq_sock->sk_sk->sk_family) { | 128 | struct svc_sock *svsk = |
129 | container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); | ||
130 | switch (svsk->sk_sk->sk_family) { | ||
463 | case AF_INET: { | 131 | case AF_INET: { |
464 | struct in_pktinfo *pki = CMSG_DATA(cmh); | 132 | struct in_pktinfo *pki = CMSG_DATA(cmh); |
465 | 133 | ||
@@ -489,10 +157,10 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) | |||
489 | /* | 157 | /* |
490 | * Generic sendto routine | 158 | * Generic sendto routine |
491 | */ | 159 | */ |
492 | static int | 160 | static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) |
493 | svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) | ||
494 | { | 161 | { |
495 | struct svc_sock *svsk = rqstp->rq_sock; | 162 | struct svc_sock *svsk = |
163 | container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); | ||
496 | struct socket *sock = svsk->sk_sock; | 164 | struct socket *sock = svsk->sk_sock; |
497 | int slen; | 165 | int slen; |
498 | union { | 166 | union { |
@@ -565,7 +233,7 @@ svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) | |||
565 | } | 233 | } |
566 | out: | 234 | out: |
567 | dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n", | 235 | dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n", |
568 | rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, | 236 | svsk, xdr->head[0].iov_base, xdr->head[0].iov_len, |
569 | xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); | 237 | xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); |
570 | 238 | ||
571 | return len; | 239 | return len; |
@@ -602,7 +270,7 @@ svc_sock_names(char *buf, struct svc_serv *serv, char *toclose) | |||
602 | if (!serv) | 270 | if (!serv) |
603 | return 0; | 271 | return 0; |
604 | spin_lock_bh(&serv->sv_lock); | 272 | spin_lock_bh(&serv->sv_lock); |
605 | list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) { | 273 | list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list) { |
606 | int onelen = one_sock_name(buf+len, svsk); | 274 | int onelen = one_sock_name(buf+len, svsk); |
607 | if (toclose && strcmp(toclose, buf+len) == 0) | 275 | if (toclose && strcmp(toclose, buf+len) == 0) |
608 | closesk = svsk; | 276 | closesk = svsk; |
@@ -614,7 +282,7 @@ svc_sock_names(char *buf, struct svc_serv *serv, char *toclose) | |||
614 | /* Should unregister with portmap, but you cannot | 282 | /* Should unregister with portmap, but you cannot |
615 | * unregister just one protocol... | 283 | * unregister just one protocol... |
616 | */ | 284 | */ |
617 | svc_close_socket(closesk); | 285 | svc_close_xprt(&closesk->sk_xprt); |
618 | else if (toclose) | 286 | else if (toclose) |
619 | return -ENOENT; | 287 | return -ENOENT; |
620 | return len; | 288 | return len; |
@@ -624,8 +292,7 @@ EXPORT_SYMBOL(svc_sock_names); | |||
624 | /* | 292 | /* |
625 | * Check input queue length | 293 | * Check input queue length |
626 | */ | 294 | */ |
627 | static int | 295 | static int svc_recv_available(struct svc_sock *svsk) |
628 | svc_recv_available(struct svc_sock *svsk) | ||
629 | { | 296 | { |
630 | struct socket *sock = svsk->sk_sock; | 297 | struct socket *sock = svsk->sk_sock; |
631 | int avail, err; | 298 | int avail, err; |
@@ -638,48 +305,31 @@ svc_recv_available(struct svc_sock *svsk) | |||
638 | /* | 305 | /* |
639 | * Generic recvfrom routine. | 306 | * Generic recvfrom routine. |
640 | */ | 307 | */ |
641 | static int | 308 | static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, |
642 | svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen) | 309 | int buflen) |
643 | { | 310 | { |
644 | struct svc_sock *svsk = rqstp->rq_sock; | 311 | struct svc_sock *svsk = |
312 | container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); | ||
645 | struct msghdr msg = { | 313 | struct msghdr msg = { |
646 | .msg_flags = MSG_DONTWAIT, | 314 | .msg_flags = MSG_DONTWAIT, |
647 | }; | 315 | }; |
648 | struct sockaddr *sin; | ||
649 | int len; | 316 | int len; |
650 | 317 | ||
318 | rqstp->rq_xprt_hlen = 0; | ||
319 | |||
651 | len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen, | 320 | len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen, |
652 | msg.msg_flags); | 321 | msg.msg_flags); |
653 | 322 | ||
654 | /* sock_recvmsg doesn't fill in the name/namelen, so we must.. | ||
655 | */ | ||
656 | memcpy(&rqstp->rq_addr, &svsk->sk_remote, svsk->sk_remotelen); | ||
657 | rqstp->rq_addrlen = svsk->sk_remotelen; | ||
658 | |||
659 | /* Destination address in request is needed for binding the | ||
660 | * source address in RPC callbacks later. | ||
661 | */ | ||
662 | sin = (struct sockaddr *)&svsk->sk_local; | ||
663 | switch (sin->sa_family) { | ||
664 | case AF_INET: | ||
665 | rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr; | ||
666 | break; | ||
667 | case AF_INET6: | ||
668 | rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr; | ||
669 | break; | ||
670 | } | ||
671 | |||
672 | dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", | 323 | dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", |
673 | svsk, iov[0].iov_base, iov[0].iov_len, len); | 324 | svsk, iov[0].iov_base, iov[0].iov_len, len); |
674 | |||
675 | return len; | 325 | return len; |
676 | } | 326 | } |
677 | 327 | ||
678 | /* | 328 | /* |
679 | * Set socket snd and rcv buffer lengths | 329 | * Set socket snd and rcv buffer lengths |
680 | */ | 330 | */ |
681 | static inline void | 331 | static void svc_sock_setbufsize(struct socket *sock, unsigned int snd, |
682 | svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv) | 332 | unsigned int rcv) |
683 | { | 333 | { |
684 | #if 0 | 334 | #if 0 |
685 | mm_segment_t oldfs; | 335 | mm_segment_t oldfs; |
@@ -704,16 +354,16 @@ svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv) | |||
704 | /* | 354 | /* |
705 | * INET callback when data has been received on the socket. | 355 | * INET callback when data has been received on the socket. |
706 | */ | 356 | */ |
707 | static void | 357 | static void svc_udp_data_ready(struct sock *sk, int count) |
708 | svc_udp_data_ready(struct sock *sk, int count) | ||
709 | { | 358 | { |
710 | struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; | 359 | struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; |
711 | 360 | ||
712 | if (svsk) { | 361 | if (svsk) { |
713 | dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", | 362 | dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", |
714 | svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags)); | 363 | svsk, sk, count, |
715 | set_bit(SK_DATA, &svsk->sk_flags); | 364 | test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); |
716 | svc_sock_enqueue(svsk); | 365 | set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); |
366 | svc_xprt_enqueue(&svsk->sk_xprt); | ||
717 | } | 367 | } |
718 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) | 368 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) |
719 | wake_up_interruptible(sk->sk_sleep); | 369 | wake_up_interruptible(sk->sk_sleep); |
@@ -722,15 +372,14 @@ svc_udp_data_ready(struct sock *sk, int count) | |||
722 | /* | 372 | /* |
723 | * INET callback when space is newly available on the socket. | 373 | * INET callback when space is newly available on the socket. |
724 | */ | 374 | */ |
725 | static void | 375 | static void svc_write_space(struct sock *sk) |
726 | svc_write_space(struct sock *sk) | ||
727 | { | 376 | { |
728 | struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); | 377 | struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); |
729 | 378 | ||
730 | if (svsk) { | 379 | if (svsk) { |
731 | dprintk("svc: socket %p(inet %p), write_space busy=%d\n", | 380 | dprintk("svc: socket %p(inet %p), write_space busy=%d\n", |
732 | svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags)); | 381 | svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); |
733 | svc_sock_enqueue(svsk); | 382 | svc_xprt_enqueue(&svsk->sk_xprt); |
734 | } | 383 | } |
735 | 384 | ||
736 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) { | 385 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) { |
@@ -740,10 +389,19 @@ svc_write_space(struct sock *sk) | |||
740 | } | 389 | } |
741 | } | 390 | } |
742 | 391 | ||
743 | static inline void svc_udp_get_dest_address(struct svc_rqst *rqstp, | 392 | /* |
744 | struct cmsghdr *cmh) | 393 | * Copy the UDP datagram's destination address to the rqstp structure. |
394 | * The 'destination' address in this case is the address to which the | ||
395 | * peer sent the datagram, i.e. our local address. For multihomed | ||
396 | * hosts, this can change from msg to msg. Note that only the IP | ||
397 | * address changes, the port number should remain the same. | ||
398 | */ | ||
399 | static void svc_udp_get_dest_address(struct svc_rqst *rqstp, | ||
400 | struct cmsghdr *cmh) | ||
745 | { | 401 | { |
746 | switch (rqstp->rq_sock->sk_sk->sk_family) { | 402 | struct svc_sock *svsk = |
403 | container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); | ||
404 | switch (svsk->sk_sk->sk_family) { | ||
747 | case AF_INET: { | 405 | case AF_INET: { |
748 | struct in_pktinfo *pki = CMSG_DATA(cmh); | 406 | struct in_pktinfo *pki = CMSG_DATA(cmh); |
749 | rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr; | 407 | rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr; |
@@ -760,11 +418,11 @@ static inline void svc_udp_get_dest_address(struct svc_rqst *rqstp, | |||
760 | /* | 418 | /* |
761 | * Receive a datagram from a UDP socket. | 419 | * Receive a datagram from a UDP socket. |
762 | */ | 420 | */ |
763 | static int | 421 | static int svc_udp_recvfrom(struct svc_rqst *rqstp) |
764 | svc_udp_recvfrom(struct svc_rqst *rqstp) | ||
765 | { | 422 | { |
766 | struct svc_sock *svsk = rqstp->rq_sock; | 423 | struct svc_sock *svsk = |
767 | struct svc_serv *serv = svsk->sk_server; | 424 | container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); |
425 | struct svc_serv *serv = svsk->sk_xprt.xpt_server; | ||
768 | struct sk_buff *skb; | 426 | struct sk_buff *skb; |
769 | union { | 427 | union { |
770 | struct cmsghdr hdr; | 428 | struct cmsghdr hdr; |
@@ -779,7 +437,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) | |||
779 | .msg_flags = MSG_DONTWAIT, | 437 | .msg_flags = MSG_DONTWAIT, |
780 | }; | 438 | }; |
781 | 439 | ||
782 | if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) | 440 | if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) |
783 | /* udp sockets need large rcvbuf as all pending | 441 | /* udp sockets need large rcvbuf as all pending |
784 | * requests are still in that buffer. sndbuf must | 442 | * requests are still in that buffer. sndbuf must |
785 | * also be large enough that there is enough space | 443 | * also be large enough that there is enough space |
@@ -792,17 +450,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) | |||
792 | (serv->sv_nrthreads+3) * serv->sv_max_mesg, | 450 | (serv->sv_nrthreads+3) * serv->sv_max_mesg, |
793 | (serv->sv_nrthreads+3) * serv->sv_max_mesg); | 451 | (serv->sv_nrthreads+3) * serv->sv_max_mesg); |
794 | 452 | ||
795 | if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { | 453 | clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); |
796 | svc_sock_received(svsk); | ||
797 | return svc_deferred_recv(rqstp); | ||
798 | } | ||
799 | |||
800 | if (test_bit(SK_CLOSE, &svsk->sk_flags)) { | ||
801 | svc_delete_socket(svsk); | ||
802 | return 0; | ||
803 | } | ||
804 | |||
805 | clear_bit(SK_DATA, &svsk->sk_flags); | ||
806 | skb = NULL; | 454 | skb = NULL; |
807 | err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, | 455 | err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, |
808 | 0, 0, MSG_PEEK | MSG_DONTWAIT); | 456 | 0, 0, MSG_PEEK | MSG_DONTWAIT); |
@@ -813,24 +461,27 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) | |||
813 | if (err != -EAGAIN) { | 461 | if (err != -EAGAIN) { |
814 | /* possibly an icmp error */ | 462 | /* possibly an icmp error */ |
815 | dprintk("svc: recvfrom returned error %d\n", -err); | 463 | dprintk("svc: recvfrom returned error %d\n", -err); |
816 | set_bit(SK_DATA, &svsk->sk_flags); | 464 | set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); |
817 | } | 465 | } |
818 | svc_sock_received(svsk); | 466 | svc_xprt_received(&svsk->sk_xprt); |
819 | return -EAGAIN; | 467 | return -EAGAIN; |
820 | } | 468 | } |
821 | rqstp->rq_addrlen = sizeof(rqstp->rq_addr); | 469 | len = svc_addr_len(svc_addr(rqstp)); |
470 | if (len < 0) | ||
471 | return len; | ||
472 | rqstp->rq_addrlen = len; | ||
822 | if (skb->tstamp.tv64 == 0) { | 473 | if (skb->tstamp.tv64 == 0) { |
823 | skb->tstamp = ktime_get_real(); | 474 | skb->tstamp = ktime_get_real(); |
824 | /* Don't enable netstamp, sunrpc doesn't | 475 | /* Don't enable netstamp, sunrpc doesn't |
825 | need that much accuracy */ | 476 | need that much accuracy */ |
826 | } | 477 | } |
827 | svsk->sk_sk->sk_stamp = skb->tstamp; | 478 | svsk->sk_sk->sk_stamp = skb->tstamp; |
828 | set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ | 479 | set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */ |
829 | 480 | ||
830 | /* | 481 | /* |
831 | * Maybe more packets - kick another thread ASAP. | 482 | * Maybe more packets - kick another thread ASAP. |
832 | */ | 483 | */ |
833 | svc_sock_received(svsk); | 484 | svc_xprt_received(&svsk->sk_xprt); |
834 | 485 | ||
835 | len = skb->len - sizeof(struct udphdr); | 486 | len = skb->len - sizeof(struct udphdr); |
836 | rqstp->rq_arg.len = len; | 487 | rqstp->rq_arg.len = len; |
@@ -861,13 +512,14 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) | |||
861 | skb_free_datagram(svsk->sk_sk, skb); | 512 | skb_free_datagram(svsk->sk_sk, skb); |
862 | } else { | 513 | } else { |
863 | /* we can use it in-place */ | 514 | /* we can use it in-place */ |
864 | rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); | 515 | rqstp->rq_arg.head[0].iov_base = skb->data + |
516 | sizeof(struct udphdr); | ||
865 | rqstp->rq_arg.head[0].iov_len = len; | 517 | rqstp->rq_arg.head[0].iov_len = len; |
866 | if (skb_checksum_complete(skb)) { | 518 | if (skb_checksum_complete(skb)) { |
867 | skb_free_datagram(svsk->sk_sk, skb); | 519 | skb_free_datagram(svsk->sk_sk, skb); |
868 | return 0; | 520 | return 0; |
869 | } | 521 | } |
870 | rqstp->rq_skbuff = skb; | 522 | rqstp->rq_xprt_ctxt = skb; |
871 | } | 523 | } |
872 | 524 | ||
873 | rqstp->rq_arg.page_base = 0; | 525 | rqstp->rq_arg.page_base = 0; |
@@ -900,27 +552,81 @@ svc_udp_sendto(struct svc_rqst *rqstp) | |||
900 | return error; | 552 | return error; |
901 | } | 553 | } |
902 | 554 | ||
903 | static void | 555 | static void svc_udp_prep_reply_hdr(struct svc_rqst *rqstp) |
904 | svc_udp_init(struct svc_sock *svsk) | 556 | { |
557 | } | ||
558 | |||
559 | static int svc_udp_has_wspace(struct svc_xprt *xprt) | ||
560 | { | ||
561 | struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); | ||
562 | struct svc_serv *serv = xprt->xpt_server; | ||
563 | unsigned long required; | ||
564 | |||
565 | /* | ||
566 | * Set the SOCK_NOSPACE flag before checking the available | ||
567 | * sock space. | ||
568 | */ | ||
569 | set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); | ||
570 | required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg; | ||
571 | if (required*2 > sock_wspace(svsk->sk_sk)) | ||
572 | return 0; | ||
573 | clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); | ||
574 | return 1; | ||
575 | } | ||
576 | |||
577 | static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt) | ||
578 | { | ||
579 | BUG(); | ||
580 | return NULL; | ||
581 | } | ||
582 | |||
583 | static struct svc_xprt *svc_udp_create(struct svc_serv *serv, | ||
584 | struct sockaddr *sa, int salen, | ||
585 | int flags) | ||
586 | { | ||
587 | return svc_create_socket(serv, IPPROTO_UDP, sa, salen, flags); | ||
588 | } | ||
589 | |||
590 | static struct svc_xprt_ops svc_udp_ops = { | ||
591 | .xpo_create = svc_udp_create, | ||
592 | .xpo_recvfrom = svc_udp_recvfrom, | ||
593 | .xpo_sendto = svc_udp_sendto, | ||
594 | .xpo_release_rqst = svc_release_skb, | ||
595 | .xpo_detach = svc_sock_detach, | ||
596 | .xpo_free = svc_sock_free, | ||
597 | .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr, | ||
598 | .xpo_has_wspace = svc_udp_has_wspace, | ||
599 | .xpo_accept = svc_udp_accept, | ||
600 | }; | ||
601 | |||
602 | static struct svc_xprt_class svc_udp_class = { | ||
603 | .xcl_name = "udp", | ||
604 | .xcl_owner = THIS_MODULE, | ||
605 | .xcl_ops = &svc_udp_ops, | ||
606 | .xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP, | ||
607 | }; | ||
608 | |||
609 | static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) | ||
905 | { | 610 | { |
906 | int one = 1; | 611 | int one = 1; |
907 | mm_segment_t oldfs; | 612 | mm_segment_t oldfs; |
908 | 613 | ||
614 | svc_xprt_init(&svc_udp_class, &svsk->sk_xprt, serv); | ||
615 | clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); | ||
909 | svsk->sk_sk->sk_data_ready = svc_udp_data_ready; | 616 | svsk->sk_sk->sk_data_ready = svc_udp_data_ready; |
910 | svsk->sk_sk->sk_write_space = svc_write_space; | 617 | svsk->sk_sk->sk_write_space = svc_write_space; |
911 | svsk->sk_recvfrom = svc_udp_recvfrom; | ||
912 | svsk->sk_sendto = svc_udp_sendto; | ||
913 | 618 | ||
914 | /* initialise setting must have enough space to | 619 | /* initialise setting must have enough space to |
915 | * receive and respond to one request. | 620 | * receive and respond to one request. |
916 | * svc_udp_recvfrom will re-adjust if necessary | 621 | * svc_udp_recvfrom will re-adjust if necessary |
917 | */ | 622 | */ |
918 | svc_sock_setbufsize(svsk->sk_sock, | 623 | svc_sock_setbufsize(svsk->sk_sock, |
919 | 3 * svsk->sk_server->sv_max_mesg, | 624 | 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, |
920 | 3 * svsk->sk_server->sv_max_mesg); | 625 | 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); |
921 | 626 | ||
922 | set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */ | 627 | /* data might have come in before data_ready set up */ |
923 | set_bit(SK_CHNGBUF, &svsk->sk_flags); | 628 | set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); |
629 | set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); | ||
924 | 630 | ||
925 | oldfs = get_fs(); | 631 | oldfs = get_fs(); |
926 | set_fs(KERNEL_DS); | 632 | set_fs(KERNEL_DS); |
@@ -934,8 +640,7 @@ svc_udp_init(struct svc_sock *svsk) | |||
934 | * A data_ready event on a listening socket means there's a connection | 640 | * A data_ready event on a listening socket means there's a connection |
935 | * pending. Do not use state_change as a substitute for it. | 641 | * pending. Do not use state_change as a substitute for it. |
936 | */ | 642 | */ |
937 | static void | 643 | static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused) |
938 | svc_tcp_listen_data_ready(struct sock *sk, int count_unused) | ||
939 | { | 644 | { |
940 | struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; | 645 | struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; |
941 | 646 | ||
@@ -954,8 +659,8 @@ svc_tcp_listen_data_ready(struct sock *sk, int count_unused) | |||
954 | */ | 659 | */ |
955 | if (sk->sk_state == TCP_LISTEN) { | 660 | if (sk->sk_state == TCP_LISTEN) { |
956 | if (svsk) { | 661 | if (svsk) { |
957 | set_bit(SK_CONN, &svsk->sk_flags); | 662 | set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); |
958 | svc_sock_enqueue(svsk); | 663 | svc_xprt_enqueue(&svsk->sk_xprt); |
959 | } else | 664 | } else |
960 | printk("svc: socket %p: no user data\n", sk); | 665 | printk("svc: socket %p: no user data\n", sk); |
961 | } | 666 | } |
@@ -967,8 +672,7 @@ svc_tcp_listen_data_ready(struct sock *sk, int count_unused) | |||
967 | /* | 672 | /* |
968 | * A state change on a connected socket means it's dying or dead. | 673 | * A state change on a connected socket means it's dying or dead. |
969 | */ | 674 | */ |
970 | static void | 675 | static void svc_tcp_state_change(struct sock *sk) |
971 | svc_tcp_state_change(struct sock *sk) | ||
972 | { | 676 | { |
973 | struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; | 677 | struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; |
974 | 678 | ||
@@ -978,51 +682,36 @@ svc_tcp_state_change(struct sock *sk) | |||
978 | if (!svsk) | 682 | if (!svsk) |
979 | printk("svc: socket %p: no user data\n", sk); | 683 | printk("svc: socket %p: no user data\n", sk); |
980 | else { | 684 | else { |
981 | set_bit(SK_CLOSE, &svsk->sk_flags); | 685 | set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); |
982 | svc_sock_enqueue(svsk); | 686 | svc_xprt_enqueue(&svsk->sk_xprt); |
983 | } | 687 | } |
984 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) | 688 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) |
985 | wake_up_interruptible_all(sk->sk_sleep); | 689 | wake_up_interruptible_all(sk->sk_sleep); |
986 | } | 690 | } |
987 | 691 | ||
988 | static void | 692 | static void svc_tcp_data_ready(struct sock *sk, int count) |
989 | svc_tcp_data_ready(struct sock *sk, int count) | ||
990 | { | 693 | { |
991 | struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; | 694 | struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; |
992 | 695 | ||
993 | dprintk("svc: socket %p TCP data ready (svsk %p)\n", | 696 | dprintk("svc: socket %p TCP data ready (svsk %p)\n", |
994 | sk, sk->sk_user_data); | 697 | sk, sk->sk_user_data); |
995 | if (svsk) { | 698 | if (svsk) { |
996 | set_bit(SK_DATA, &svsk->sk_flags); | 699 | set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); |
997 | svc_sock_enqueue(svsk); | 700 | svc_xprt_enqueue(&svsk->sk_xprt); |
998 | } | 701 | } |
999 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) | 702 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) |
1000 | wake_up_interruptible(sk->sk_sleep); | 703 | wake_up_interruptible(sk->sk_sleep); |
1001 | } | 704 | } |
1002 | 705 | ||
1003 | static inline int svc_port_is_privileged(struct sockaddr *sin) | ||
1004 | { | ||
1005 | switch (sin->sa_family) { | ||
1006 | case AF_INET: | ||
1007 | return ntohs(((struct sockaddr_in *)sin)->sin_port) | ||
1008 | < PROT_SOCK; | ||
1009 | case AF_INET6: | ||
1010 | return ntohs(((struct sockaddr_in6 *)sin)->sin6_port) | ||
1011 | < PROT_SOCK; | ||
1012 | default: | ||
1013 | return 0; | ||
1014 | } | ||
1015 | } | ||
1016 | |||
1017 | /* | 706 | /* |
1018 | * Accept a TCP connection | 707 | * Accept a TCP connection |
1019 | */ | 708 | */ |
1020 | static void | 709 | static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) |
1021 | svc_tcp_accept(struct svc_sock *svsk) | ||
1022 | { | 710 | { |
711 | struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); | ||
1023 | struct sockaddr_storage addr; | 712 | struct sockaddr_storage addr; |
1024 | struct sockaddr *sin = (struct sockaddr *) &addr; | 713 | struct sockaddr *sin = (struct sockaddr *) &addr; |
1025 | struct svc_serv *serv = svsk->sk_server; | 714 | struct svc_serv *serv = svsk->sk_xprt.xpt_server; |
1026 | struct socket *sock = svsk->sk_sock; | 715 | struct socket *sock = svsk->sk_sock; |
1027 | struct socket *newsock; | 716 | struct socket *newsock; |
1028 | struct svc_sock *newsvsk; | 717 | struct svc_sock *newsvsk; |
@@ -1031,9 +720,9 @@ svc_tcp_accept(struct svc_sock *svsk) | |||
1031 | 720 | ||
1032 | dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); | 721 | dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); |
1033 | if (!sock) | 722 | if (!sock) |
1034 | return; | 723 | return NULL; |
1035 | 724 | ||
1036 | clear_bit(SK_CONN, &svsk->sk_flags); | 725 | clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); |
1037 | err = kernel_accept(sock, &newsock, O_NONBLOCK); | 726 | err = kernel_accept(sock, &newsock, O_NONBLOCK); |
1038 | if (err < 0) { | 727 | if (err < 0) { |
1039 | if (err == -ENOMEM) | 728 | if (err == -ENOMEM) |
@@ -1042,11 +731,9 @@ svc_tcp_accept(struct svc_sock *svsk) | |||
1042 | else if (err != -EAGAIN && net_ratelimit()) | 731 | else if (err != -EAGAIN && net_ratelimit()) |
1043 | printk(KERN_WARNING "%s: accept failed (err %d)!\n", | 732 | printk(KERN_WARNING "%s: accept failed (err %d)!\n", |
1044 | serv->sv_name, -err); | 733 | serv->sv_name, -err); |
1045 | return; | 734 | return NULL; |
1046 | } | 735 | } |
1047 | 736 | set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); | |
1048 | set_bit(SK_CONN, &svsk->sk_flags); | ||
1049 | svc_sock_enqueue(svsk); | ||
1050 | 737 | ||
1051 | err = kernel_getpeername(newsock, sin, &slen); | 738 | err = kernel_getpeername(newsock, sin, &slen); |
1052 | if (err < 0) { | 739 | if (err < 0) { |
@@ -1077,106 +764,42 @@ svc_tcp_accept(struct svc_sock *svsk) | |||
1077 | if (!(newsvsk = svc_setup_socket(serv, newsock, &err, | 764 | if (!(newsvsk = svc_setup_socket(serv, newsock, &err, |
1078 | (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY)))) | 765 | (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY)))) |
1079 | goto failed; | 766 | goto failed; |
1080 | memcpy(&newsvsk->sk_remote, sin, slen); | 767 | svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen); |
1081 | newsvsk->sk_remotelen = slen; | ||
1082 | err = kernel_getsockname(newsock, sin, &slen); | 768 | err = kernel_getsockname(newsock, sin, &slen); |
1083 | if (unlikely(err < 0)) { | 769 | if (unlikely(err < 0)) { |
1084 | dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err); | 770 | dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err); |
1085 | slen = offsetof(struct sockaddr, sa_data); | 771 | slen = offsetof(struct sockaddr, sa_data); |
1086 | } | 772 | } |
1087 | memcpy(&newsvsk->sk_local, sin, slen); | 773 | svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen); |
1088 | |||
1089 | svc_sock_received(newsvsk); | ||
1090 | |||
1091 | /* make sure that we don't have too many active connections. | ||
1092 | * If we have, something must be dropped. | ||
1093 | * | ||
1094 | * There's no point in trying to do random drop here for | ||
1095 | * DoS prevention. The NFS clients does 1 reconnect in 15 | ||
1096 | * seconds. An attacker can easily beat that. | ||
1097 | * | ||
1098 | * The only somewhat efficient mechanism would be if drop | ||
1099 | * old connections from the same IP first. But right now | ||
1100 | * we don't even record the client IP in svc_sock. | ||
1101 | */ | ||
1102 | if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { | ||
1103 | struct svc_sock *svsk = NULL; | ||
1104 | spin_lock_bh(&serv->sv_lock); | ||
1105 | if (!list_empty(&serv->sv_tempsocks)) { | ||
1106 | if (net_ratelimit()) { | ||
1107 | /* Try to help the admin */ | ||
1108 | printk(KERN_NOTICE "%s: too many open TCP " | ||
1109 | "sockets, consider increasing the " | ||
1110 | "number of nfsd threads\n", | ||
1111 | serv->sv_name); | ||
1112 | printk(KERN_NOTICE | ||
1113 | "%s: last TCP connect from %s\n", | ||
1114 | serv->sv_name, __svc_print_addr(sin, | ||
1115 | buf, sizeof(buf))); | ||
1116 | } | ||
1117 | /* | ||
1118 | * Always select the oldest socket. It's not fair, | ||
1119 | * but so is life | ||
1120 | */ | ||
1121 | svsk = list_entry(serv->sv_tempsocks.prev, | ||
1122 | struct svc_sock, | ||
1123 | sk_list); | ||
1124 | set_bit(SK_CLOSE, &svsk->sk_flags); | ||
1125 | atomic_inc(&svsk->sk_inuse); | ||
1126 | } | ||
1127 | spin_unlock_bh(&serv->sv_lock); | ||
1128 | |||
1129 | if (svsk) { | ||
1130 | svc_sock_enqueue(svsk); | ||
1131 | svc_sock_put(svsk); | ||
1132 | } | ||
1133 | |||
1134 | } | ||
1135 | 774 | ||
1136 | if (serv->sv_stats) | 775 | if (serv->sv_stats) |
1137 | serv->sv_stats->nettcpconn++; | 776 | serv->sv_stats->nettcpconn++; |
1138 | 777 | ||
1139 | return; | 778 | return &newsvsk->sk_xprt; |
1140 | 779 | ||
1141 | failed: | 780 | failed: |
1142 | sock_release(newsock); | 781 | sock_release(newsock); |
1143 | return; | 782 | return NULL; |
1144 | } | 783 | } |
1145 | 784 | ||
1146 | /* | 785 | /* |
1147 | * Receive data from a TCP socket. | 786 | * Receive data from a TCP socket. |
1148 | */ | 787 | */ |
1149 | static int | 788 | static int svc_tcp_recvfrom(struct svc_rqst *rqstp) |
1150 | svc_tcp_recvfrom(struct svc_rqst *rqstp) | ||
1151 | { | 789 | { |
1152 | struct svc_sock *svsk = rqstp->rq_sock; | 790 | struct svc_sock *svsk = |
1153 | struct svc_serv *serv = svsk->sk_server; | 791 | container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); |
792 | struct svc_serv *serv = svsk->sk_xprt.xpt_server; | ||
1154 | int len; | 793 | int len; |
1155 | struct kvec *vec; | 794 | struct kvec *vec; |
1156 | int pnum, vlen; | 795 | int pnum, vlen; |
1157 | 796 | ||
1158 | dprintk("svc: tcp_recv %p data %d conn %d close %d\n", | 797 | dprintk("svc: tcp_recv %p data %d conn %d close %d\n", |
1159 | svsk, test_bit(SK_DATA, &svsk->sk_flags), | 798 | svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags), |
1160 | test_bit(SK_CONN, &svsk->sk_flags), | 799 | test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags), |
1161 | test_bit(SK_CLOSE, &svsk->sk_flags)); | 800 | test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); |
1162 | 801 | ||
1163 | if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { | 802 | if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) |
1164 | svc_sock_received(svsk); | ||
1165 | return svc_deferred_recv(rqstp); | ||
1166 | } | ||
1167 | |||
1168 | if (test_bit(SK_CLOSE, &svsk->sk_flags)) { | ||
1169 | svc_delete_socket(svsk); | ||
1170 | return 0; | ||
1171 | } | ||
1172 | |||
1173 | if (svsk->sk_sk->sk_state == TCP_LISTEN) { | ||
1174 | svc_tcp_accept(svsk); | ||
1175 | svc_sock_received(svsk); | ||
1176 | return 0; | ||
1177 | } | ||
1178 | |||
1179 | if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) | ||
1180 | /* sndbuf needs to have room for one request | 803 | /* sndbuf needs to have room for one request |
1181 | * per thread, otherwise we can stall even when the | 804 | * per thread, otherwise we can stall even when the |
1182 | * network isn't a bottleneck. | 805 | * network isn't a bottleneck. |
@@ -1193,7 +816,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) | |||
1193 | (serv->sv_nrthreads+3) * serv->sv_max_mesg, | 816 | (serv->sv_nrthreads+3) * serv->sv_max_mesg, |
1194 | 3 * serv->sv_max_mesg); | 817 | 3 * serv->sv_max_mesg); |
1195 | 818 | ||
1196 | clear_bit(SK_DATA, &svsk->sk_flags); | 819 | clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); |
1197 | 820 | ||
1198 | /* Receive data. If we haven't got the record length yet, get | 821 | /* Receive data. If we haven't got the record length yet, get |
1199 | * the next four bytes. Otherwise try to gobble up as much as | 822 | * the next four bytes. Otherwise try to gobble up as much as |
@@ -1212,7 +835,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) | |||
1212 | if (len < want) { | 835 | if (len < want) { |
1213 | dprintk("svc: short recvfrom while reading record length (%d of %lu)\n", | 836 | dprintk("svc: short recvfrom while reading record length (%d of %lu)\n", |
1214 | len, want); | 837 | len, want); |
1215 | svc_sock_received(svsk); | 838 | svc_xprt_received(&svsk->sk_xprt); |
1216 | return -EAGAIN; /* record header not complete */ | 839 | return -EAGAIN; /* record header not complete */ |
1217 | } | 840 | } |
1218 | 841 | ||
@@ -1248,11 +871,11 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) | |||
1248 | if (len < svsk->sk_reclen) { | 871 | if (len < svsk->sk_reclen) { |
1249 | dprintk("svc: incomplete TCP record (%d of %d)\n", | 872 | dprintk("svc: incomplete TCP record (%d of %d)\n", |
1250 | len, svsk->sk_reclen); | 873 | len, svsk->sk_reclen); |
1251 | svc_sock_received(svsk); | 874 | svc_xprt_received(&svsk->sk_xprt); |
1252 | return -EAGAIN; /* record not complete */ | 875 | return -EAGAIN; /* record not complete */ |
1253 | } | 876 | } |
1254 | len = svsk->sk_reclen; | 877 | len = svsk->sk_reclen; |
1255 | set_bit(SK_DATA, &svsk->sk_flags); | 878 | set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); |
1256 | 879 | ||
1257 | vec = rqstp->rq_vec; | 880 | vec = rqstp->rq_vec; |
1258 | vec[0] = rqstp->rq_arg.head[0]; | 881 | vec[0] = rqstp->rq_arg.head[0]; |
@@ -1281,30 +904,31 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) | |||
1281 | rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; | 904 | rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; |
1282 | } | 905 | } |
1283 | 906 | ||
1284 | rqstp->rq_skbuff = NULL; | 907 | rqstp->rq_xprt_ctxt = NULL; |
1285 | rqstp->rq_prot = IPPROTO_TCP; | 908 | rqstp->rq_prot = IPPROTO_TCP; |
1286 | 909 | ||
1287 | /* Reset TCP read info */ | 910 | /* Reset TCP read info */ |
1288 | svsk->sk_reclen = 0; | 911 | svsk->sk_reclen = 0; |
1289 | svsk->sk_tcplen = 0; | 912 | svsk->sk_tcplen = 0; |
1290 | 913 | ||
1291 | svc_sock_received(svsk); | 914 | svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt); |
915 | svc_xprt_received(&svsk->sk_xprt); | ||
1292 | if (serv->sv_stats) | 916 | if (serv->sv_stats) |
1293 | serv->sv_stats->nettcpcnt++; | 917 | serv->sv_stats->nettcpcnt++; |
1294 | 918 | ||
1295 | return len; | 919 | return len; |
1296 | 920 | ||
1297 | err_delete: | 921 | err_delete: |
1298 | svc_delete_socket(svsk); | 922 | set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); |
1299 | return -EAGAIN; | 923 | return -EAGAIN; |
1300 | 924 | ||
1301 | error: | 925 | error: |
1302 | if (len == -EAGAIN) { | 926 | if (len == -EAGAIN) { |
1303 | dprintk("RPC: TCP recvfrom got EAGAIN\n"); | 927 | dprintk("RPC: TCP recvfrom got EAGAIN\n"); |
1304 | svc_sock_received(svsk); | 928 | svc_xprt_received(&svsk->sk_xprt); |
1305 | } else { | 929 | } else { |
1306 | printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", | 930 | printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", |
1307 | svsk->sk_server->sv_name, -len); | 931 | svsk->sk_xprt.xpt_server->sv_name, -len); |
1308 | goto err_delete; | 932 | goto err_delete; |
1309 | } | 933 | } |
1310 | 934 | ||
@@ -1314,8 +938,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) | |||
1314 | /* | 938 | /* |
1315 | * Send out data on TCP socket. | 939 | * Send out data on TCP socket. |
1316 | */ | 940 | */ |
1317 | static int | 941 | static int svc_tcp_sendto(struct svc_rqst *rqstp) |
1318 | svc_tcp_sendto(struct svc_rqst *rqstp) | ||
1319 | { | 942 | { |
1320 | struct xdr_buf *xbufp = &rqstp->rq_res; | 943 | struct xdr_buf *xbufp = &rqstp->rq_res; |
1321 | int sent; | 944 | int sent; |
@@ -1328,35 +951,109 @@ svc_tcp_sendto(struct svc_rqst *rqstp) | |||
1328 | reclen = htonl(0x80000000|((xbufp->len ) - 4)); | 951 | reclen = htonl(0x80000000|((xbufp->len ) - 4)); |
1329 | memcpy(xbufp->head[0].iov_base, &reclen, 4); | 952 | memcpy(xbufp->head[0].iov_base, &reclen, 4); |
1330 | 953 | ||
1331 | if (test_bit(SK_DEAD, &rqstp->rq_sock->sk_flags)) | 954 | if (test_bit(XPT_DEAD, &rqstp->rq_xprt->xpt_flags)) |
1332 | return -ENOTCONN; | 955 | return -ENOTCONN; |
1333 | 956 | ||
1334 | sent = svc_sendto(rqstp, &rqstp->rq_res); | 957 | sent = svc_sendto(rqstp, &rqstp->rq_res); |
1335 | if (sent != xbufp->len) { | 958 | if (sent != xbufp->len) { |
1336 | printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", | 959 | printk(KERN_NOTICE |
1337 | rqstp->rq_sock->sk_server->sv_name, | 960 | "rpc-srv/tcp: %s: %s %d when sending %d bytes " |
961 | "- shutting down socket\n", | ||
962 | rqstp->rq_xprt->xpt_server->sv_name, | ||
1338 | (sent<0)?"got error":"sent only", | 963 | (sent<0)?"got error":"sent only", |
1339 | sent, xbufp->len); | 964 | sent, xbufp->len); |
1340 | set_bit(SK_CLOSE, &rqstp->rq_sock->sk_flags); | 965 | set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags); |
1341 | svc_sock_enqueue(rqstp->rq_sock); | 966 | svc_xprt_enqueue(rqstp->rq_xprt); |
1342 | sent = -EAGAIN; | 967 | sent = -EAGAIN; |
1343 | } | 968 | } |
1344 | return sent; | 969 | return sent; |
1345 | } | 970 | } |
1346 | 971 | ||
1347 | static void | 972 | /* |
1348 | svc_tcp_init(struct svc_sock *svsk) | 973 | * Setup response header. TCP has a 4B record length field. |
974 | */ | ||
975 | static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp) | ||
976 | { | ||
977 | struct kvec *resv = &rqstp->rq_res.head[0]; | ||
978 | |||
979 | /* tcp needs a space for the record length... */ | ||
980 | svc_putnl(resv, 0); | ||
981 | } | ||
982 | |||
983 | static int svc_tcp_has_wspace(struct svc_xprt *xprt) | ||
984 | { | ||
985 | struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); | ||
986 | struct svc_serv *serv = svsk->sk_xprt.xpt_server; | ||
987 | int required; | ||
988 | int wspace; | ||
989 | |||
990 | /* | ||
991 | * Set the SOCK_NOSPACE flag before checking the available | ||
992 | * sock space. | ||
993 | */ | ||
994 | set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); | ||
995 | required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg; | ||
996 | wspace = sk_stream_wspace(svsk->sk_sk); | ||
997 | |||
998 | if (wspace < sk_stream_min_wspace(svsk->sk_sk)) | ||
999 | return 0; | ||
1000 | if (required * 2 > wspace) | ||
1001 | return 0; | ||
1002 | |||
1003 | clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); | ||
1004 | return 1; | ||
1005 | } | ||
1006 | |||
1007 | static struct svc_xprt *svc_tcp_create(struct svc_serv *serv, | ||
1008 | struct sockaddr *sa, int salen, | ||
1009 | int flags) | ||
1010 | { | ||
1011 | return svc_create_socket(serv, IPPROTO_TCP, sa, salen, flags); | ||
1012 | } | ||
1013 | |||
1014 | static struct svc_xprt_ops svc_tcp_ops = { | ||
1015 | .xpo_create = svc_tcp_create, | ||
1016 | .xpo_recvfrom = svc_tcp_recvfrom, | ||
1017 | .xpo_sendto = svc_tcp_sendto, | ||
1018 | .xpo_release_rqst = svc_release_skb, | ||
1019 | .xpo_detach = svc_sock_detach, | ||
1020 | .xpo_free = svc_sock_free, | ||
1021 | .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, | ||
1022 | .xpo_has_wspace = svc_tcp_has_wspace, | ||
1023 | .xpo_accept = svc_tcp_accept, | ||
1024 | }; | ||
1025 | |||
1026 | static struct svc_xprt_class svc_tcp_class = { | ||
1027 | .xcl_name = "tcp", | ||
1028 | .xcl_owner = THIS_MODULE, | ||
1029 | .xcl_ops = &svc_tcp_ops, | ||
1030 | .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, | ||
1031 | }; | ||
1032 | |||
1033 | void svc_init_xprt_sock(void) | ||
1034 | { | ||
1035 | svc_reg_xprt_class(&svc_tcp_class); | ||
1036 | svc_reg_xprt_class(&svc_udp_class); | ||
1037 | } | ||
1038 | |||
1039 | void svc_cleanup_xprt_sock(void) | ||
1040 | { | ||
1041 | svc_unreg_xprt_class(&svc_tcp_class); | ||
1042 | svc_unreg_xprt_class(&svc_udp_class); | ||
1043 | } | ||
1044 | |||
1045 | static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) | ||
1349 | { | 1046 | { |
1350 | struct sock *sk = svsk->sk_sk; | 1047 | struct sock *sk = svsk->sk_sk; |
1351 | struct tcp_sock *tp = tcp_sk(sk); | 1048 | struct tcp_sock *tp = tcp_sk(sk); |
1352 | 1049 | ||
1353 | svsk->sk_recvfrom = svc_tcp_recvfrom; | 1050 | svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt, serv); |
1354 | svsk->sk_sendto = svc_tcp_sendto; | 1051 | set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); |
1355 | |||
1356 | if (sk->sk_state == TCP_LISTEN) { | 1052 | if (sk->sk_state == TCP_LISTEN) { |
1357 | dprintk("setting up TCP socket for listening\n"); | 1053 | dprintk("setting up TCP socket for listening\n"); |
1054 | set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); | ||
1358 | sk->sk_data_ready = svc_tcp_listen_data_ready; | 1055 | sk->sk_data_ready = svc_tcp_listen_data_ready; |
1359 | set_bit(SK_CONN, &svsk->sk_flags); | 1056 | set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); |
1360 | } else { | 1057 | } else { |
1361 | dprintk("setting up TCP socket for reading\n"); | 1058 | dprintk("setting up TCP socket for reading\n"); |
1362 | sk->sk_state_change = svc_tcp_state_change; | 1059 | sk->sk_state_change = svc_tcp_state_change; |
@@ -1373,18 +1070,17 @@ svc_tcp_init(struct svc_sock *svsk) | |||
1373 | * svc_tcp_recvfrom will re-adjust if necessary | 1070 | * svc_tcp_recvfrom will re-adjust if necessary |
1374 | */ | 1071 | */ |
1375 | svc_sock_setbufsize(svsk->sk_sock, | 1072 | svc_sock_setbufsize(svsk->sk_sock, |
1376 | 3 * svsk->sk_server->sv_max_mesg, | 1073 | 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, |
1377 | 3 * svsk->sk_server->sv_max_mesg); | 1074 | 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); |
1378 | 1075 | ||
1379 | set_bit(SK_CHNGBUF, &svsk->sk_flags); | 1076 | set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); |
1380 | set_bit(SK_DATA, &svsk->sk_flags); | 1077 | set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); |
1381 | if (sk->sk_state != TCP_ESTABLISHED) | 1078 | if (sk->sk_state != TCP_ESTABLISHED) |
1382 | set_bit(SK_CLOSE, &svsk->sk_flags); | 1079 | set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); |
1383 | } | 1080 | } |
1384 | } | 1081 | } |
1385 | 1082 | ||
1386 | void | 1083 | void svc_sock_update_bufs(struct svc_serv *serv) |
1387 | svc_sock_update_bufs(struct svc_serv *serv) | ||
1388 | { | 1084 | { |
1389 | /* | 1085 | /* |
1390 | * The number of server threads has changed. Update | 1086 | * The number of server threads has changed. Update |
@@ -1395,232 +1091,18 @@ svc_sock_update_bufs(struct svc_serv *serv) | |||
1395 | spin_lock_bh(&serv->sv_lock); | 1091 | spin_lock_bh(&serv->sv_lock); |
1396 | list_for_each(le, &serv->sv_permsocks) { | 1092 | list_for_each(le, &serv->sv_permsocks) { |
1397 | struct svc_sock *svsk = | 1093 | struct svc_sock *svsk = |
1398 | list_entry(le, struct svc_sock, sk_list); | 1094 | list_entry(le, struct svc_sock, sk_xprt.xpt_list); |
1399 | set_bit(SK_CHNGBUF, &svsk->sk_flags); | 1095 | set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); |
1400 | } | 1096 | } |
1401 | list_for_each(le, &serv->sv_tempsocks) { | 1097 | list_for_each(le, &serv->sv_tempsocks) { |
1402 | struct svc_sock *svsk = | 1098 | struct svc_sock *svsk = |
1403 | list_entry(le, struct svc_sock, sk_list); | 1099 | list_entry(le, struct svc_sock, sk_xprt.xpt_list); |
1404 | set_bit(SK_CHNGBUF, &svsk->sk_flags); | 1100 | set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); |
1405 | } | 1101 | } |
1406 | spin_unlock_bh(&serv->sv_lock); | 1102 | spin_unlock_bh(&serv->sv_lock); |
1407 | } | 1103 | } |
1408 | 1104 | ||
1409 | /* | 1105 | /* |
1410 | * Receive the next request on any socket. This code is carefully | ||
1411 | * organised not to touch any cachelines in the shared svc_serv | ||
1412 | * structure, only cachelines in the local svc_pool. | ||
1413 | */ | ||
1414 | int | ||
1415 | svc_recv(struct svc_rqst *rqstp, long timeout) | ||
1416 | { | ||
1417 | struct svc_sock *svsk = NULL; | ||
1418 | struct svc_serv *serv = rqstp->rq_server; | ||
1419 | struct svc_pool *pool = rqstp->rq_pool; | ||
1420 | int len, i; | ||
1421 | int pages; | ||
1422 | struct xdr_buf *arg; | ||
1423 | DECLARE_WAITQUEUE(wait, current); | ||
1424 | |||
1425 | dprintk("svc: server %p waiting for data (to = %ld)\n", | ||
1426 | rqstp, timeout); | ||
1427 | |||
1428 | if (rqstp->rq_sock) | ||
1429 | printk(KERN_ERR | ||
1430 | "svc_recv: service %p, socket not NULL!\n", | ||
1431 | rqstp); | ||
1432 | if (waitqueue_active(&rqstp->rq_wait)) | ||
1433 | printk(KERN_ERR | ||
1434 | "svc_recv: service %p, wait queue active!\n", | ||
1435 | rqstp); | ||
1436 | |||
1437 | |||
1438 | /* now allocate needed pages. If we get a failure, sleep briefly */ | ||
1439 | pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE; | ||
1440 | for (i=0; i < pages ; i++) | ||
1441 | while (rqstp->rq_pages[i] == NULL) { | ||
1442 | struct page *p = alloc_page(GFP_KERNEL); | ||
1443 | if (!p) | ||
1444 | schedule_timeout_uninterruptible(msecs_to_jiffies(500)); | ||
1445 | rqstp->rq_pages[i] = p; | ||
1446 | } | ||
1447 | rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ | ||
1448 | BUG_ON(pages >= RPCSVC_MAXPAGES); | ||
1449 | |||
1450 | /* Make arg->head point to first page and arg->pages point to rest */ | ||
1451 | arg = &rqstp->rq_arg; | ||
1452 | arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); | ||
1453 | arg->head[0].iov_len = PAGE_SIZE; | ||
1454 | arg->pages = rqstp->rq_pages + 1; | ||
1455 | arg->page_base = 0; | ||
1456 | /* save at least one page for response */ | ||
1457 | arg->page_len = (pages-2)*PAGE_SIZE; | ||
1458 | arg->len = (pages-1)*PAGE_SIZE; | ||
1459 | arg->tail[0].iov_len = 0; | ||
1460 | |||
1461 | try_to_freeze(); | ||
1462 | cond_resched(); | ||
1463 | if (signalled()) | ||
1464 | return -EINTR; | ||
1465 | |||
1466 | spin_lock_bh(&pool->sp_lock); | ||
1467 | if ((svsk = svc_sock_dequeue(pool)) != NULL) { | ||
1468 | rqstp->rq_sock = svsk; | ||
1469 | atomic_inc(&svsk->sk_inuse); | ||
1470 | rqstp->rq_reserved = serv->sv_max_mesg; | ||
1471 | atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); | ||
1472 | } else { | ||
1473 | /* No data pending. Go to sleep */ | ||
1474 | svc_thread_enqueue(pool, rqstp); | ||
1475 | |||
1476 | /* | ||
1477 | * We have to be able to interrupt this wait | ||
1478 | * to bring down the daemons ... | ||
1479 | */ | ||
1480 | set_current_state(TASK_INTERRUPTIBLE); | ||
1481 | add_wait_queue(&rqstp->rq_wait, &wait); | ||
1482 | spin_unlock_bh(&pool->sp_lock); | ||
1483 | |||
1484 | schedule_timeout(timeout); | ||
1485 | |||
1486 | try_to_freeze(); | ||
1487 | |||
1488 | spin_lock_bh(&pool->sp_lock); | ||
1489 | remove_wait_queue(&rqstp->rq_wait, &wait); | ||
1490 | |||
1491 | if (!(svsk = rqstp->rq_sock)) { | ||
1492 | svc_thread_dequeue(pool, rqstp); | ||
1493 | spin_unlock_bh(&pool->sp_lock); | ||
1494 | dprintk("svc: server %p, no data yet\n", rqstp); | ||
1495 | return signalled()? -EINTR : -EAGAIN; | ||
1496 | } | ||
1497 | } | ||
1498 | spin_unlock_bh(&pool->sp_lock); | ||
1499 | |||
1500 | dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n", | ||
1501 | rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse)); | ||
1502 | len = svsk->sk_recvfrom(rqstp); | ||
1503 | dprintk("svc: got len=%d\n", len); | ||
1504 | |||
1505 | /* No data, incomplete (TCP) read, or accept() */ | ||
1506 | if (len == 0 || len == -EAGAIN) { | ||
1507 | rqstp->rq_res.len = 0; | ||
1508 | svc_sock_release(rqstp); | ||
1509 | return -EAGAIN; | ||
1510 | } | ||
1511 | svsk->sk_lastrecv = get_seconds(); | ||
1512 | clear_bit(SK_OLD, &svsk->sk_flags); | ||
1513 | |||
1514 | rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp)); | ||
1515 | rqstp->rq_chandle.defer = svc_defer; | ||
1516 | |||
1517 | if (serv->sv_stats) | ||
1518 | serv->sv_stats->netcnt++; | ||
1519 | return len; | ||
1520 | } | ||
1521 | |||
1522 | /* | ||
1523 | * Drop request | ||
1524 | */ | ||
1525 | void | ||
1526 | svc_drop(struct svc_rqst *rqstp) | ||
1527 | { | ||
1528 | dprintk("svc: socket %p dropped request\n", rqstp->rq_sock); | ||
1529 | svc_sock_release(rqstp); | ||
1530 | } | ||
1531 | |||
1532 | /* | ||
1533 | * Return reply to client. | ||
1534 | */ | ||
1535 | int | ||
1536 | svc_send(struct svc_rqst *rqstp) | ||
1537 | { | ||
1538 | struct svc_sock *svsk; | ||
1539 | int len; | ||
1540 | struct xdr_buf *xb; | ||
1541 | |||
1542 | if ((svsk = rqstp->rq_sock) == NULL) { | ||
1543 | printk(KERN_WARNING "NULL socket pointer in %s:%d\n", | ||
1544 | __FILE__, __LINE__); | ||
1545 | return -EFAULT; | ||
1546 | } | ||
1547 | |||
1548 | /* release the receive skb before sending the reply */ | ||
1549 | svc_release_skb(rqstp); | ||
1550 | |||
1551 | /* calculate over-all length */ | ||
1552 | xb = & rqstp->rq_res; | ||
1553 | xb->len = xb->head[0].iov_len + | ||
1554 | xb->page_len + | ||
1555 | xb->tail[0].iov_len; | ||
1556 | |||
1557 | /* Grab svsk->sk_mutex to serialize outgoing data. */ | ||
1558 | mutex_lock(&svsk->sk_mutex); | ||
1559 | if (test_bit(SK_DEAD, &svsk->sk_flags)) | ||
1560 | len = -ENOTCONN; | ||
1561 | else | ||
1562 | len = svsk->sk_sendto(rqstp); | ||
1563 | mutex_unlock(&svsk->sk_mutex); | ||
1564 | svc_sock_release(rqstp); | ||
1565 | |||
1566 | if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) | ||
1567 | return 0; | ||
1568 | return len; | ||
1569 | } | ||
1570 | |||
1571 | /* | ||
1572 | * Timer function to close old temporary sockets, using | ||
1573 | * a mark-and-sweep algorithm. | ||
1574 | */ | ||
1575 | static void | ||
1576 | svc_age_temp_sockets(unsigned long closure) | ||
1577 | { | ||
1578 | struct svc_serv *serv = (struct svc_serv *)closure; | ||
1579 | struct svc_sock *svsk; | ||
1580 | struct list_head *le, *next; | ||
1581 | LIST_HEAD(to_be_aged); | ||
1582 | |||
1583 | dprintk("svc_age_temp_sockets\n"); | ||
1584 | |||
1585 | if (!spin_trylock_bh(&serv->sv_lock)) { | ||
1586 | /* busy, try again 1 sec later */ | ||
1587 | dprintk("svc_age_temp_sockets: busy\n"); | ||
1588 | mod_timer(&serv->sv_temptimer, jiffies + HZ); | ||
1589 | return; | ||
1590 | } | ||
1591 | |||
1592 | list_for_each_safe(le, next, &serv->sv_tempsocks) { | ||
1593 | svsk = list_entry(le, struct svc_sock, sk_list); | ||
1594 | |||
1595 | if (!test_and_set_bit(SK_OLD, &svsk->sk_flags)) | ||
1596 | continue; | ||
1597 | if (atomic_read(&svsk->sk_inuse) > 1 || test_bit(SK_BUSY, &svsk->sk_flags)) | ||
1598 | continue; | ||
1599 | atomic_inc(&svsk->sk_inuse); | ||
1600 | list_move(le, &to_be_aged); | ||
1601 | set_bit(SK_CLOSE, &svsk->sk_flags); | ||
1602 | set_bit(SK_DETACHED, &svsk->sk_flags); | ||
1603 | } | ||
1604 | spin_unlock_bh(&serv->sv_lock); | ||
1605 | |||
1606 | while (!list_empty(&to_be_aged)) { | ||
1607 | le = to_be_aged.next; | ||
1608 | /* fiddling the sk_list node is safe 'cos we're SK_DETACHED */ | ||
1609 | list_del_init(le); | ||
1610 | svsk = list_entry(le, struct svc_sock, sk_list); | ||
1611 | |||
1612 | dprintk("queuing svsk %p for closing, %lu seconds old\n", | ||
1613 | svsk, get_seconds() - svsk->sk_lastrecv); | ||
1614 | |||
1615 | /* a thread will dequeue and close it soon */ | ||
1616 | svc_sock_enqueue(svsk); | ||
1617 | svc_sock_put(svsk); | ||
1618 | } | ||
1619 | |||
1620 | mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); | ||
1621 | } | ||
1622 | |||
1623 | /* | ||
1624 | * Initialize socket for RPC use and create svc_sock struct | 1106 | * Initialize socket for RPC use and create svc_sock struct |
1625 | * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. | 1107 | * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. |
1626 | */ | 1108 | */ |
@@ -1631,7 +1113,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, | |||
1631 | struct svc_sock *svsk; | 1113 | struct svc_sock *svsk; |
1632 | struct sock *inet; | 1114 | struct sock *inet; |
1633 | int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); | 1115 | int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); |
1634 | int is_temporary = flags & SVC_SOCK_TEMPORARY; | ||
1635 | 1116 | ||
1636 | dprintk("svc: svc_setup_socket %p\n", sock); | 1117 | dprintk("svc: svc_setup_socket %p\n", sock); |
1637 | if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { | 1118 | if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { |
@@ -1651,44 +1132,18 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, | |||
1651 | return NULL; | 1132 | return NULL; |
1652 | } | 1133 | } |
1653 | 1134 | ||
1654 | set_bit(SK_BUSY, &svsk->sk_flags); | ||
1655 | inet->sk_user_data = svsk; | 1135 | inet->sk_user_data = svsk; |
1656 | svsk->sk_sock = sock; | 1136 | svsk->sk_sock = sock; |
1657 | svsk->sk_sk = inet; | 1137 | svsk->sk_sk = inet; |
1658 | svsk->sk_ostate = inet->sk_state_change; | 1138 | svsk->sk_ostate = inet->sk_state_change; |
1659 | svsk->sk_odata = inet->sk_data_ready; | 1139 | svsk->sk_odata = inet->sk_data_ready; |
1660 | svsk->sk_owspace = inet->sk_write_space; | 1140 | svsk->sk_owspace = inet->sk_write_space; |
1661 | svsk->sk_server = serv; | ||
1662 | atomic_set(&svsk->sk_inuse, 1); | ||
1663 | svsk->sk_lastrecv = get_seconds(); | ||
1664 | spin_lock_init(&svsk->sk_lock); | ||
1665 | INIT_LIST_HEAD(&svsk->sk_deferred); | ||
1666 | INIT_LIST_HEAD(&svsk->sk_ready); | ||
1667 | mutex_init(&svsk->sk_mutex); | ||
1668 | 1141 | ||
1669 | /* Initialize the socket */ | 1142 | /* Initialize the socket */ |
1670 | if (sock->type == SOCK_DGRAM) | 1143 | if (sock->type == SOCK_DGRAM) |
1671 | svc_udp_init(svsk); | 1144 | svc_udp_init(svsk, serv); |
1672 | else | 1145 | else |
1673 | svc_tcp_init(svsk); | 1146 | svc_tcp_init(svsk, serv); |
1674 | |||
1675 | spin_lock_bh(&serv->sv_lock); | ||
1676 | if (is_temporary) { | ||
1677 | set_bit(SK_TEMP, &svsk->sk_flags); | ||
1678 | list_add(&svsk->sk_list, &serv->sv_tempsocks); | ||
1679 | serv->sv_tmpcnt++; | ||
1680 | if (serv->sv_temptimer.function == NULL) { | ||
1681 | /* setup timer to age temp sockets */ | ||
1682 | setup_timer(&serv->sv_temptimer, svc_age_temp_sockets, | ||
1683 | (unsigned long)serv); | ||
1684 | mod_timer(&serv->sv_temptimer, | ||
1685 | jiffies + svc_conn_age_period * HZ); | ||
1686 | } | ||
1687 | } else { | ||
1688 | clear_bit(SK_TEMP, &svsk->sk_flags); | ||
1689 | list_add(&svsk->sk_list, &serv->sv_permsocks); | ||
1690 | } | ||
1691 | spin_unlock_bh(&serv->sv_lock); | ||
1692 | 1147 | ||
1693 | dprintk("svc: svc_setup_socket created %p (inet %p)\n", | 1148 | dprintk("svc: svc_setup_socket created %p (inet %p)\n", |
1694 | svsk, svsk->sk_sk); | 1149 | svsk, svsk->sk_sk); |
@@ -1717,7 +1172,16 @@ int svc_addsock(struct svc_serv *serv, | |||
1717 | else { | 1172 | else { |
1718 | svsk = svc_setup_socket(serv, so, &err, SVC_SOCK_DEFAULTS); | 1173 | svsk = svc_setup_socket(serv, so, &err, SVC_SOCK_DEFAULTS); |
1719 | if (svsk) { | 1174 | if (svsk) { |
1720 | svc_sock_received(svsk); | 1175 | struct sockaddr_storage addr; |
1176 | struct sockaddr *sin = (struct sockaddr *)&addr; | ||
1177 | int salen; | ||
1178 | if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0) | ||
1179 | svc_xprt_set_local(&svsk->sk_xprt, sin, salen); | ||
1180 | clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags); | ||
1181 | spin_lock_bh(&serv->sv_lock); | ||
1182 | list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks); | ||
1183 | spin_unlock_bh(&serv->sv_lock); | ||
1184 | svc_xprt_received(&svsk->sk_xprt); | ||
1721 | err = 0; | 1185 | err = 0; |
1722 | } | 1186 | } |
1723 | } | 1187 | } |
@@ -1733,14 +1197,19 @@ EXPORT_SYMBOL_GPL(svc_addsock); | |||
1733 | /* | 1197 | /* |
1734 | * Create socket for RPC service. | 1198 | * Create socket for RPC service. |
1735 | */ | 1199 | */ |
1736 | static int svc_create_socket(struct svc_serv *serv, int protocol, | 1200 | static struct svc_xprt *svc_create_socket(struct svc_serv *serv, |
1737 | struct sockaddr *sin, int len, int flags) | 1201 | int protocol, |
1202 | struct sockaddr *sin, int len, | ||
1203 | int flags) | ||
1738 | { | 1204 | { |
1739 | struct svc_sock *svsk; | 1205 | struct svc_sock *svsk; |
1740 | struct socket *sock; | 1206 | struct socket *sock; |
1741 | int error; | 1207 | int error; |
1742 | int type; | 1208 | int type; |
1743 | char buf[RPC_MAX_ADDRBUFLEN]; | 1209 | char buf[RPC_MAX_ADDRBUFLEN]; |
1210 | struct sockaddr_storage addr; | ||
1211 | struct sockaddr *newsin = (struct sockaddr *)&addr; | ||
1212 | int newlen; | ||
1744 | 1213 | ||
1745 | dprintk("svc: svc_create_socket(%s, %d, %s)\n", | 1214 | dprintk("svc: svc_create_socket(%s, %d, %s)\n", |
1746 | serv->sv_program->pg_name, protocol, | 1215 | serv->sv_program->pg_name, protocol, |
@@ -1749,13 +1218,13 @@ static int svc_create_socket(struct svc_serv *serv, int protocol, | |||
1749 | if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { | 1218 | if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { |
1750 | printk(KERN_WARNING "svc: only UDP and TCP " | 1219 | printk(KERN_WARNING "svc: only UDP and TCP " |
1751 | "sockets supported\n"); | 1220 | "sockets supported\n"); |
1752 | return -EINVAL; | 1221 | return ERR_PTR(-EINVAL); |
1753 | } | 1222 | } |
1754 | type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; | 1223 | type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; |
1755 | 1224 | ||
1756 | error = sock_create_kern(sin->sa_family, type, protocol, &sock); | 1225 | error = sock_create_kern(sin->sa_family, type, protocol, &sock); |
1757 | if (error < 0) | 1226 | if (error < 0) |
1758 | return error; | 1227 | return ERR_PTR(error); |
1759 | 1228 | ||
1760 | svc_reclassify_socket(sock); | 1229 | svc_reclassify_socket(sock); |
1761 | 1230 | ||
@@ -1765,203 +1234,55 @@ static int svc_create_socket(struct svc_serv *serv, int protocol, | |||
1765 | if (error < 0) | 1234 | if (error < 0) |
1766 | goto bummer; | 1235 | goto bummer; |
1767 | 1236 | ||
1237 | newlen = len; | ||
1238 | error = kernel_getsockname(sock, newsin, &newlen); | ||
1239 | if (error < 0) | ||
1240 | goto bummer; | ||
1241 | |||
1768 | if (protocol == IPPROTO_TCP) { | 1242 | if (protocol == IPPROTO_TCP) { |
1769 | if ((error = kernel_listen(sock, 64)) < 0) | 1243 | if ((error = kernel_listen(sock, 64)) < 0) |
1770 | goto bummer; | 1244 | goto bummer; |
1771 | } | 1245 | } |
1772 | 1246 | ||
1773 | if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) { | 1247 | if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) { |
1774 | svc_sock_received(svsk); | 1248 | svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen); |
1775 | return ntohs(inet_sk(svsk->sk_sk)->sport); | 1249 | return (struct svc_xprt *)svsk; |
1776 | } | 1250 | } |
1777 | 1251 | ||
1778 | bummer: | 1252 | bummer: |
1779 | dprintk("svc: svc_create_socket error = %d\n", -error); | 1253 | dprintk("svc: svc_create_socket error = %d\n", -error); |
1780 | sock_release(sock); | 1254 | sock_release(sock); |
1781 | return error; | 1255 | return ERR_PTR(error); |
1782 | } | 1256 | } |
1783 | 1257 | ||
1784 | /* | 1258 | /* |
1785 | * Remove a dead socket | 1259 | * Detach the svc_sock from the socket so that no |
1260 | * more callbacks occur. | ||
1786 | */ | 1261 | */ |
1787 | static void | 1262 | static void svc_sock_detach(struct svc_xprt *xprt) |
1788 | svc_delete_socket(struct svc_sock *svsk) | ||
1789 | { | 1263 | { |
1790 | struct svc_serv *serv; | 1264 | struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); |
1791 | struct sock *sk; | 1265 | struct sock *sk = svsk->sk_sk; |
1792 | |||
1793 | dprintk("svc: svc_delete_socket(%p)\n", svsk); | ||
1794 | 1266 | ||
1795 | serv = svsk->sk_server; | 1267 | dprintk("svc: svc_sock_detach(%p)\n", svsk); |
1796 | sk = svsk->sk_sk; | ||
1797 | 1268 | ||
1269 | /* put back the old socket callbacks */ | ||
1798 | sk->sk_state_change = svsk->sk_ostate; | 1270 | sk->sk_state_change = svsk->sk_ostate; |
1799 | sk->sk_data_ready = svsk->sk_odata; | 1271 | sk->sk_data_ready = svsk->sk_odata; |
1800 | sk->sk_write_space = svsk->sk_owspace; | 1272 | sk->sk_write_space = svsk->sk_owspace; |
1801 | |||
1802 | spin_lock_bh(&serv->sv_lock); | ||
1803 | |||
1804 | if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags)) | ||
1805 | list_del_init(&svsk->sk_list); | ||
1806 | /* | ||
1807 | * We used to delete the svc_sock from whichever list | ||
1808 | * it's sk_ready node was on, but we don't actually | ||
1809 | * need to. This is because the only time we're called | ||
1810 | * while still attached to a queue, the queue itself | ||
1811 | * is about to be destroyed (in svc_destroy). | ||
1812 | */ | ||
1813 | if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) { | ||
1814 | BUG_ON(atomic_read(&svsk->sk_inuse)<2); | ||
1815 | atomic_dec(&svsk->sk_inuse); | ||
1816 | if (test_bit(SK_TEMP, &svsk->sk_flags)) | ||
1817 | serv->sv_tmpcnt--; | ||
1818 | } | ||
1819 | |||
1820 | spin_unlock_bh(&serv->sv_lock); | ||
1821 | } | ||
1822 | |||
1823 | static void svc_close_socket(struct svc_sock *svsk) | ||
1824 | { | ||
1825 | set_bit(SK_CLOSE, &svsk->sk_flags); | ||
1826 | if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) | ||
1827 | /* someone else will have to effect the close */ | ||
1828 | return; | ||
1829 | |||
1830 | atomic_inc(&svsk->sk_inuse); | ||
1831 | svc_delete_socket(svsk); | ||
1832 | clear_bit(SK_BUSY, &svsk->sk_flags); | ||
1833 | svc_sock_put(svsk); | ||
1834 | } | ||
1835 | |||
1836 | void svc_force_close_socket(struct svc_sock *svsk) | ||
1837 | { | ||
1838 | set_bit(SK_CLOSE, &svsk->sk_flags); | ||
1839 | if (test_bit(SK_BUSY, &svsk->sk_flags)) { | ||
1840 | /* Waiting to be processed, but no threads left, | ||
1841 | * So just remove it from the waiting list | ||
1842 | */ | ||
1843 | list_del_init(&svsk->sk_ready); | ||
1844 | clear_bit(SK_BUSY, &svsk->sk_flags); | ||
1845 | } | ||
1846 | svc_close_socket(svsk); | ||
1847 | } | ||
1848 | |||
1849 | /** | ||
1850 | * svc_makesock - Make a socket for nfsd and lockd | ||
1851 | * @serv: RPC server structure | ||
1852 | * @protocol: transport protocol to use | ||
1853 | * @port: port to use | ||
1854 | * @flags: requested socket characteristics | ||
1855 | * | ||
1856 | */ | ||
1857 | int svc_makesock(struct svc_serv *serv, int protocol, unsigned short port, | ||
1858 | int flags) | ||
1859 | { | ||
1860 | struct sockaddr_in sin = { | ||
1861 | .sin_family = AF_INET, | ||
1862 | .sin_addr.s_addr = INADDR_ANY, | ||
1863 | .sin_port = htons(port), | ||
1864 | }; | ||
1865 | |||
1866 | dprintk("svc: creating socket proto = %d\n", protocol); | ||
1867 | return svc_create_socket(serv, protocol, (struct sockaddr *) &sin, | ||
1868 | sizeof(sin), flags); | ||
1869 | } | 1273 | } |
1870 | 1274 | ||
1871 | /* | 1275 | /* |
1872 | * Handle defer and revisit of requests | 1276 | * Free the svc_sock's socket resources and the svc_sock itself. |
1873 | */ | 1277 | */ |
1874 | 1278 | static void svc_sock_free(struct svc_xprt *xprt) | |
1875 | static void svc_revisit(struct cache_deferred_req *dreq, int too_many) | ||
1876 | { | 1279 | { |
1877 | struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); | 1280 | struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); |
1878 | struct svc_sock *svsk; | 1281 | dprintk("svc: svc_sock_free(%p)\n", svsk); |
1879 | 1282 | ||
1880 | if (too_many) { | 1283 | if (svsk->sk_sock->file) |
1881 | svc_sock_put(dr->svsk); | 1284 | sockfd_put(svsk->sk_sock); |
1882 | kfree(dr); | 1285 | else |
1883 | return; | 1286 | sock_release(svsk->sk_sock); |
1884 | } | 1287 | kfree(svsk); |
1885 | dprintk("revisit queued\n"); | ||
1886 | svsk = dr->svsk; | ||
1887 | dr->svsk = NULL; | ||
1888 | spin_lock(&svsk->sk_lock); | ||
1889 | list_add(&dr->handle.recent, &svsk->sk_deferred); | ||
1890 | spin_unlock(&svsk->sk_lock); | ||
1891 | set_bit(SK_DEFERRED, &svsk->sk_flags); | ||
1892 | svc_sock_enqueue(svsk); | ||
1893 | svc_sock_put(svsk); | ||
1894 | } | ||
1895 | |||
1896 | static struct cache_deferred_req * | ||
1897 | svc_defer(struct cache_req *req) | ||
1898 | { | ||
1899 | struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); | ||
1900 | int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len); | ||
1901 | struct svc_deferred_req *dr; | ||
1902 | |||
1903 | if (rqstp->rq_arg.page_len) | ||
1904 | return NULL; /* if more than a page, give up FIXME */ | ||
1905 | if (rqstp->rq_deferred) { | ||
1906 | dr = rqstp->rq_deferred; | ||
1907 | rqstp->rq_deferred = NULL; | ||
1908 | } else { | ||
1909 | int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; | ||
1910 | /* FIXME maybe discard if size too large */ | ||
1911 | dr = kmalloc(size, GFP_KERNEL); | ||
1912 | if (dr == NULL) | ||
1913 | return NULL; | ||
1914 | |||
1915 | dr->handle.owner = rqstp->rq_server; | ||
1916 | dr->prot = rqstp->rq_prot; | ||
1917 | memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen); | ||
1918 | dr->addrlen = rqstp->rq_addrlen; | ||
1919 | dr->daddr = rqstp->rq_daddr; | ||
1920 | dr->argslen = rqstp->rq_arg.len >> 2; | ||
1921 | memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2); | ||
1922 | } | ||
1923 | atomic_inc(&rqstp->rq_sock->sk_inuse); | ||
1924 | dr->svsk = rqstp->rq_sock; | ||
1925 | |||
1926 | dr->handle.revisit = svc_revisit; | ||
1927 | return &dr->handle; | ||
1928 | } | ||
1929 | |||
1930 | /* | ||
1931 | * recv data from a deferred request into an active one | ||
1932 | */ | ||
1933 | static int svc_deferred_recv(struct svc_rqst *rqstp) | ||
1934 | { | ||
1935 | struct svc_deferred_req *dr = rqstp->rq_deferred; | ||
1936 | |||
1937 | rqstp->rq_arg.head[0].iov_base = dr->args; | ||
1938 | rqstp->rq_arg.head[0].iov_len = dr->argslen<<2; | ||
1939 | rqstp->rq_arg.page_len = 0; | ||
1940 | rqstp->rq_arg.len = dr->argslen<<2; | ||
1941 | rqstp->rq_prot = dr->prot; | ||
1942 | memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen); | ||
1943 | rqstp->rq_addrlen = dr->addrlen; | ||
1944 | rqstp->rq_daddr = dr->daddr; | ||
1945 | rqstp->rq_respages = rqstp->rq_pages; | ||
1946 | return dr->argslen<<2; | ||
1947 | } | ||
1948 | |||
1949 | |||
1950 | static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) | ||
1951 | { | ||
1952 | struct svc_deferred_req *dr = NULL; | ||
1953 | |||
1954 | if (!test_bit(SK_DEFERRED, &svsk->sk_flags)) | ||
1955 | return NULL; | ||
1956 | spin_lock(&svsk->sk_lock); | ||
1957 | clear_bit(SK_DEFERRED, &svsk->sk_flags); | ||
1958 | if (!list_empty(&svsk->sk_deferred)) { | ||
1959 | dr = list_entry(svsk->sk_deferred.next, | ||
1960 | struct svc_deferred_req, | ||
1961 | handle.recent); | ||
1962 | list_del_init(&dr->handle.recent); | ||
1963 | set_bit(SK_DEFERRED, &svsk->sk_flags); | ||
1964 | } | ||
1965 | spin_unlock(&svsk->sk_lock); | ||
1966 | return dr; | ||
1967 | } | 1288 | } |
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index bada7de0c2fc..0f8c439b848a 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/sunrpc/types.h> | 18 | #include <linux/sunrpc/types.h> |
19 | #include <linux/sunrpc/sched.h> | 19 | #include <linux/sunrpc/sched.h> |
20 | #include <linux/sunrpc/stats.h> | 20 | #include <linux/sunrpc/stats.h> |
21 | #include <linux/sunrpc/svc_xprt.h> | ||
21 | 22 | ||
22 | /* | 23 | /* |
23 | * Declare the debug flags here | 24 | * Declare the debug flags here |
@@ -55,6 +56,30 @@ rpc_unregister_sysctl(void) | |||
55 | } | 56 | } |
56 | } | 57 | } |
57 | 58 | ||
59 | static int proc_do_xprt(ctl_table *table, int write, struct file *file, | ||
60 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
61 | { | ||
62 | char tmpbuf[256]; | ||
63 | int len; | ||
64 | if ((*ppos && !write) || !*lenp) { | ||
65 | *lenp = 0; | ||
66 | return 0; | ||
67 | } | ||
68 | if (write) | ||
69 | return -EINVAL; | ||
70 | else { | ||
71 | len = svc_print_xprts(tmpbuf, sizeof(tmpbuf)); | ||
72 | if (!access_ok(VERIFY_WRITE, buffer, len)) | ||
73 | return -EFAULT; | ||
74 | |||
75 | if (__copy_to_user(buffer, tmpbuf, len)) | ||
76 | return -EFAULT; | ||
77 | } | ||
78 | *lenp -= len; | ||
79 | *ppos += len; | ||
80 | return 0; | ||
81 | } | ||
82 | |||
58 | static int | 83 | static int |
59 | proc_dodebug(ctl_table *table, int write, struct file *file, | 84 | proc_dodebug(ctl_table *table, int write, struct file *file, |
60 | void __user *buffer, size_t *lenp, loff_t *ppos) | 85 | void __user *buffer, size_t *lenp, loff_t *ppos) |
@@ -147,6 +172,12 @@ static ctl_table debug_table[] = { | |||
147 | .mode = 0644, | 172 | .mode = 0644, |
148 | .proc_handler = &proc_dodebug | 173 | .proc_handler = &proc_dodebug |
149 | }, | 174 | }, |
175 | { | ||
176 | .procname = "transports", | ||
177 | .maxlen = 256, | ||
178 | .mode = 0444, | ||
179 | .proc_handler = &proc_do_xprt, | ||
180 | }, | ||
150 | { .ctl_name = 0 } | 181 | { .ctl_name = 0 } |
151 | }; | 182 | }; |
152 | 183 | ||
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 54264062ea69..995c3fdc16c2 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c | |||
@@ -96,11 +96,13 @@ xdr_encode_string(__be32 *p, const char *string) | |||
96 | EXPORT_SYMBOL(xdr_encode_string); | 96 | EXPORT_SYMBOL(xdr_encode_string); |
97 | 97 | ||
98 | __be32 * | 98 | __be32 * |
99 | xdr_decode_string_inplace(__be32 *p, char **sp, int *lenp, int maxlen) | 99 | xdr_decode_string_inplace(__be32 *p, char **sp, |
100 | unsigned int *lenp, unsigned int maxlen) | ||
100 | { | 101 | { |
101 | unsigned int len; | 102 | u32 len; |
102 | 103 | ||
103 | if ((len = ntohl(*p++)) > maxlen) | 104 | len = ntohl(*p++); |
105 | if (len > maxlen) | ||
104 | return NULL; | 106 | return NULL; |
105 | *lenp = len; | 107 | *lenp = len; |
106 | *sp = (char *) p; | 108 | *sp = (char *) p; |
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index 264f0feeb513..5a8f268bdd30 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile | |||
@@ -1,3 +1,8 @@ | |||
1 | obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o | 1 | obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o |
2 | 2 | ||
3 | xprtrdma-y := transport.o rpc_rdma.o verbs.o | 3 | xprtrdma-y := transport.o rpc_rdma.o verbs.o |
4 | |||
5 | obj-$(CONFIG_SUNRPC_XPRT_RDMA) += svcrdma.o | ||
6 | |||
7 | svcrdma-y := svc_rdma.o svc_rdma_transport.o \ | ||
8 | svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o | ||
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c new file mode 100644 index 000000000000..88c0ca20bb1e --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma.c | |||
@@ -0,0 +1,266 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the BSD-type | ||
8 | * license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or without | ||
11 | * modification, are permitted provided that the following conditions | ||
12 | * are met: | ||
13 | * | ||
14 | * Redistributions of source code must retain the above copyright | ||
15 | * notice, this list of conditions and the following disclaimer. | ||
16 | * | ||
17 | * Redistributions in binary form must reproduce the above | ||
18 | * copyright notice, this list of conditions and the following | ||
19 | * disclaimer in the documentation and/or other materials provided | ||
20 | * with the distribution. | ||
21 | * | ||
22 | * Neither the name of the Network Appliance, Inc. nor the names of | ||
23 | * its contributors may be used to endorse or promote products | ||
24 | * derived from this software without specific prior written | ||
25 | * permission. | ||
26 | * | ||
27 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
28 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
29 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
30 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | ||
31 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
32 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | ||
33 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | ||
34 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | ||
35 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
36 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
37 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
38 | * | ||
39 | * Author: Tom Tucker <tom@opengridcomputing.com> | ||
40 | */ | ||
41 | #include <linux/module.h> | ||
42 | #include <linux/init.h> | ||
43 | #include <linux/fs.h> | ||
44 | #include <linux/sysctl.h> | ||
45 | #include <linux/sunrpc/clnt.h> | ||
46 | #include <linux/sunrpc/sched.h> | ||
47 | #include <linux/sunrpc/svc_rdma.h> | ||
48 | |||
49 | #define RPCDBG_FACILITY RPCDBG_SVCXPRT | ||
50 | |||
/*
 * RPC/RDMA parameters.  Each tunable is followed by the lower and upper
 * bound that svcrdma_parm_table hands to proc_dointvec_minmax through
 * .extra1 and .extra2.
 */
/* Exposed as the "max_outbound_read_requests" sysctl entry. */
unsigned int svcrdma_ord = RPCRDMA_ORD;
static unsigned int min_ord = 1;
static unsigned int max_ord = 4096;
/* Exposed as the "max_requests" sysctl entry. */
unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
static unsigned int min_max_requests = 4;
static unsigned int max_max_requests = 16384;
/* Exposed as the "max_req_size" sysctl entry. */
unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
static unsigned int min_max_inline = 4096;
static unsigned int max_max_inline = 65536;

/*
 * Statistics counters.  Each one is published under a sysctl entry of
 * the same name in svcrdma_parm_table and served by read_reset_stat.
 */
atomic_t rdma_stat_recv;
atomic_t rdma_stat_read;
atomic_t rdma_stat_write;
atomic_t rdma_stat_sq_starve;
atomic_t rdma_stat_rq_starve;
atomic_t rdma_stat_rq_poll;
atomic_t rdma_stat_rq_prod;
atomic_t rdma_stat_sq_poll;
atomic_t rdma_stat_sq_prod;
71 | |||
72 | /* | ||
73 | * This function implements reading and resetting an atomic_t stat | ||
74 | * variable through read/write to a proc file. Any write to the file | ||
75 | * resets the associated statistic to zero. Any read returns it's | ||
76 | * current value. | ||
77 | */ | ||
78 | static int read_reset_stat(ctl_table *table, int write, | ||
79 | struct file *filp, void __user *buffer, size_t *lenp, | ||
80 | loff_t *ppos) | ||
81 | { | ||
82 | atomic_t *stat = (atomic_t *)table->data; | ||
83 | |||
84 | if (!stat) | ||
85 | return -EINVAL; | ||
86 | |||
87 | if (write) | ||
88 | atomic_set(stat, 0); | ||
89 | else { | ||
90 | char str_buf[32]; | ||
91 | char *data; | ||
92 | int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat)); | ||
93 | if (len >= 32) | ||
94 | return -EFAULT; | ||
95 | len = strlen(str_buf); | ||
96 | if (*ppos > len) { | ||
97 | *lenp = 0; | ||
98 | return 0; | ||
99 | } | ||
100 | data = &str_buf[*ppos]; | ||
101 | len -= *ppos; | ||
102 | if (len > *lenp) | ||
103 | len = *lenp; | ||
104 | if (len && copy_to_user(buffer, str_buf, len)) | ||
105 | return -EFAULT; | ||
106 | *lenp = len; | ||
107 | *ppos += len; | ||
108 | } | ||
109 | return 0; | ||
110 | } | ||
111 | |||
/*
 * Handle returned by register_sysctl_table() in svc_rdma_init(); reset
 * to NULL by svc_rdma_cleanup() once the table has been unregistered.
 */
static struct ctl_table_header *svcrdma_table_header;

/*
 * Entries of the "svc_rdma" sysctl directory.  The first three are
 * integer tunables handled by proc_dointvec_minmax with their bounds in
 * .extra1/.extra2; the remaining ones publish atomic_t counters through
 * read_reset_stat.  The last entry sets only .ctl_name = 0 (presumably
 * the table terminator).
 */
static ctl_table svcrdma_parm_table[] = {
	{
		.procname = "max_requests",
		.data = &svcrdma_max_requests,
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = &proc_dointvec_minmax,
		.strategy = &sysctl_intvec,
		.extra1 = &min_max_requests,
		.extra2 = &max_max_requests
	},
	{
		.procname = "max_req_size",
		.data = &svcrdma_max_req_size,
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = &proc_dointvec_minmax,
		.strategy = &sysctl_intvec,
		.extra1 = &min_max_inline,
		.extra2 = &max_max_inline
	},
	{
		.procname = "max_outbound_read_requests",
		.data = &svcrdma_ord,
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = &proc_dointvec_minmax,
		.strategy = &sysctl_intvec,
		.extra1 = &min_ord,
		.extra2 = &max_ord,
	},

	/* Statistics: a read returns the counter, any write resets it. */
	{
		.procname = "rdma_stat_read",
		.data = &rdma_stat_read,
		.maxlen = sizeof(atomic_t),
		.mode = 0644,
		.proc_handler = &read_reset_stat,
	},
	{
		.procname = "rdma_stat_recv",
		.data = &rdma_stat_recv,
		.maxlen = sizeof(atomic_t),
		.mode = 0644,
		.proc_handler = &read_reset_stat,
	},
	{
		.procname = "rdma_stat_write",
		.data = &rdma_stat_write,
		.maxlen = sizeof(atomic_t),
		.mode = 0644,
		.proc_handler = &read_reset_stat,
	},
	{
		.procname = "rdma_stat_sq_starve",
		.data = &rdma_stat_sq_starve,
		.maxlen = sizeof(atomic_t),
		.mode = 0644,
		.proc_handler = &read_reset_stat,
	},
	{
		.procname = "rdma_stat_rq_starve",
		.data = &rdma_stat_rq_starve,
		.maxlen = sizeof(atomic_t),
		.mode = 0644,
		.proc_handler = &read_reset_stat,
	},
	{
		.procname = "rdma_stat_rq_poll",
		.data = &rdma_stat_rq_poll,
		.maxlen = sizeof(atomic_t),
		.mode = 0644,
		.proc_handler = &read_reset_stat,
	},
	{
		.procname = "rdma_stat_rq_prod",
		.data = &rdma_stat_rq_prod,
		.maxlen = sizeof(atomic_t),
		.mode = 0644,
		.proc_handler = &read_reset_stat,
	},
	{
		.procname = "rdma_stat_sq_poll",
		.data = &rdma_stat_sq_poll,
		.maxlen = sizeof(atomic_t),
		.mode = 0644,
		.proc_handler = &read_reset_stat,
	},
	{
		.procname = "rdma_stat_sq_prod",
		.data = &rdma_stat_sq_prod,
		.maxlen = sizeof(atomic_t),
		.mode = 0644,
		.proc_handler = &read_reset_stat,
	},
	{
		.ctl_name = 0,
	},
};
212 | |||
/*
 * The "svc_rdma" directory (mode 0555) whose children are the entries
 * of svcrdma_parm_table.
 */
static ctl_table svcrdma_table[] = {
	{
		.procname = "svc_rdma",
		.mode = 0555,
		.child = svcrdma_parm_table
	},
	{
		.ctl_name = 0,
	},
};
223 | |||
/*
 * Root of the table passed to register_sysctl_table(): the "sunrpc"
 * directory (CTL_SUNRPC, mode 0555) containing svcrdma_table.
 */
static ctl_table svcrdma_root_table[] = {
	{
		.ctl_name = CTL_SUNRPC,
		.procname = "sunrpc",
		.mode = 0555,
		.child = svcrdma_table
	},
	{
		.ctl_name = 0,
	},
};
235 | |||
236 | void svc_rdma_cleanup(void) | ||
237 | { | ||
238 | dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); | ||
239 | if (svcrdma_table_header) { | ||
240 | unregister_sysctl_table(svcrdma_table_header); | ||
241 | svcrdma_table_header = NULL; | ||
242 | } | ||
243 | svc_unreg_xprt_class(&svc_rdma_class); | ||
244 | } | ||
245 | |||
246 | int svc_rdma_init(void) | ||
247 | { | ||
248 | dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); | ||
249 | dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); | ||
250 | dprintk("\tmax_requests : %d\n", svcrdma_max_requests); | ||
251 | dprintk("\tsq_depth : %d\n", | ||
252 | svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT); | ||
253 | dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); | ||
254 | if (!svcrdma_table_header) | ||
255 | svcrdma_table_header = | ||
256 | register_sysctl_table(svcrdma_root_table); | ||
257 | |||
258 | /* Register RDMA with the SVC transport switch */ | ||
259 | svc_reg_xprt_class(&svc_rdma_class); | ||
260 | return 0; | ||
261 | } | ||
262 | MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>"); | ||
263 | MODULE_DESCRIPTION("SVC RDMA Transport"); | ||
264 | MODULE_LICENSE("Dual BSD/GPL"); | ||
265 | module_init(svc_rdma_init); | ||
266 | module_exit(svc_rdma_cleanup); | ||
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c new file mode 100644 index 000000000000..9530ef2d40dc --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c | |||
@@ -0,0 +1,412 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the BSD-type | ||
8 | * license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or without | ||
11 | * modification, are permitted provided that the following conditions | ||
12 | * are met: | ||
13 | * | ||
14 | * Redistributions of source code must retain the above copyright | ||
15 | * notice, this list of conditions and the following disclaimer. | ||
16 | * | ||
17 | * Redistributions in binary form must reproduce the above | ||
18 | * copyright notice, this list of conditions and the following | ||
19 | * disclaimer in the documentation and/or other materials provided | ||
20 | * with the distribution. | ||
21 | * | ||
22 | * Neither the name of the Network Appliance, Inc. nor the names of | ||
23 | * its contributors may be used to endorse or promote products | ||
24 | * derived from this software without specific prior written | ||
25 | * permission. | ||
26 | * | ||
27 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
28 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
29 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
30 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | ||
31 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
32 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | ||
33 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | ||
34 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | ||
35 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
36 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
37 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
38 | * | ||
39 | * Author: Tom Tucker <tom@opengridcomputing.com> | ||
40 | */ | ||
41 | |||
42 | #include <linux/sunrpc/xdr.h> | ||
43 | #include <linux/sunrpc/debug.h> | ||
44 | #include <asm/unaligned.h> | ||
45 | #include <linux/sunrpc/rpc_rdma.h> | ||
46 | #include <linux/sunrpc/svc_rdma.h> | ||
47 | |||
48 | #define RPCDBG_FACILITY RPCDBG_SVCXPRT | ||
49 | |||
50 | /* | ||
51 | * Decodes a read chunk list. The expected format is as follows: | ||
52 | * descrim : xdr_one | ||
53 | * position : u32 offset into XDR stream | ||
54 | * handle : u32 RKEY | ||
55 | * . . . | ||
56 | * end-of-list: xdr_zero | ||
57 | */ | ||
58 | static u32 *decode_read_list(u32 *va, u32 *vaend) | ||
59 | { | ||
60 | struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va; | ||
61 | |||
62 | while (ch->rc_discrim != xdr_zero) { | ||
63 | u64 ch_offset; | ||
64 | |||
65 | if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) > | ||
66 | (unsigned long)vaend) { | ||
67 | dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch); | ||
68 | return NULL; | ||
69 | } | ||
70 | |||
71 | ch->rc_discrim = ntohl(ch->rc_discrim); | ||
72 | ch->rc_position = ntohl(ch->rc_position); | ||
73 | ch->rc_target.rs_handle = ntohl(ch->rc_target.rs_handle); | ||
74 | ch->rc_target.rs_length = ntohl(ch->rc_target.rs_length); | ||
75 | va = (u32 *)&ch->rc_target.rs_offset; | ||
76 | xdr_decode_hyper(va, &ch_offset); | ||
77 | put_unaligned(ch_offset, (u64 *)va); | ||
78 | ch++; | ||
79 | } | ||
80 | return (u32 *)&ch->rc_position; | ||
81 | } | ||
82 | |||
83 | /* | ||
84 | * Determine number of chunks and total bytes in chunk list. The chunk | ||
85 | * list has already been verified to fit within the RPCRDMA header. | ||
86 | */ | ||
87 | void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch, | ||
88 | int *ch_count, int *byte_count) | ||
89 | { | ||
90 | /* compute the number of bytes represented by read chunks */ | ||
91 | *byte_count = 0; | ||
92 | *ch_count = 0; | ||
93 | for (; ch->rc_discrim != 0; ch++) { | ||
94 | *byte_count = *byte_count + ch->rc_target.rs_length; | ||
95 | *ch_count = *ch_count + 1; | ||
96 | } | ||
97 | } | ||
98 | |||
99 | /* | ||
100 | * Decodes a write chunk list. The expected format is as follows: | ||
101 | * descrim : xdr_one | ||
102 | * nchunks : <count> | ||
103 | * handle : u32 RKEY ---+ | ||
104 | * length : u32 <len of segment> | | ||
105 | * offset : remove va + <count> | ||
106 | * . . . | | ||
107 | * ---+ | ||
108 | */ | ||
109 | static u32 *decode_write_list(u32 *va, u32 *vaend) | ||
110 | { | ||
111 | int ch_no; | ||
112 | struct rpcrdma_write_array *ary = | ||
113 | (struct rpcrdma_write_array *)va; | ||
114 | |||
115 | /* Check for not write-array */ | ||
116 | if (ary->wc_discrim == xdr_zero) | ||
117 | return (u32 *)&ary->wc_nchunks; | ||
118 | |||
119 | if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > | ||
120 | (unsigned long)vaend) { | ||
121 | dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); | ||
122 | return NULL; | ||
123 | } | ||
124 | ary->wc_discrim = ntohl(ary->wc_discrim); | ||
125 | ary->wc_nchunks = ntohl(ary->wc_nchunks); | ||
126 | if (((unsigned long)&ary->wc_array[0] + | ||
127 | (sizeof(struct rpcrdma_write_chunk) * ary->wc_nchunks)) > | ||
128 | (unsigned long)vaend) { | ||
129 | dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", | ||
130 | ary, ary->wc_nchunks, vaend); | ||
131 | return NULL; | ||
132 | } | ||
133 | for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) { | ||
134 | u64 ch_offset; | ||
135 | |||
136 | ary->wc_array[ch_no].wc_target.rs_handle = | ||
137 | ntohl(ary->wc_array[ch_no].wc_target.rs_handle); | ||
138 | ary->wc_array[ch_no].wc_target.rs_length = | ||
139 | ntohl(ary->wc_array[ch_no].wc_target.rs_length); | ||
140 | va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset; | ||
141 | xdr_decode_hyper(va, &ch_offset); | ||
142 | put_unaligned(ch_offset, (u64 *)va); | ||
143 | } | ||
144 | |||
145 | /* | ||
146 | * rs_length is the 2nd 4B field in wc_target and taking its | ||
147 | * address skips the list terminator | ||
148 | */ | ||
149 | return (u32 *)&ary->wc_array[ch_no].wc_target.rs_length; | ||
150 | } | ||
151 | |||
152 | static u32 *decode_reply_array(u32 *va, u32 *vaend) | ||
153 | { | ||
154 | int ch_no; | ||
155 | struct rpcrdma_write_array *ary = | ||
156 | (struct rpcrdma_write_array *)va; | ||
157 | |||
158 | /* Check for no reply-array */ | ||
159 | if (ary->wc_discrim == xdr_zero) | ||
160 | return (u32 *)&ary->wc_nchunks; | ||
161 | |||
162 | if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > | ||
163 | (unsigned long)vaend) { | ||
164 | dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); | ||
165 | return NULL; | ||
166 | } | ||
167 | ary->wc_discrim = ntohl(ary->wc_discrim); | ||
168 | ary->wc_nchunks = ntohl(ary->wc_nchunks); | ||
169 | if (((unsigned long)&ary->wc_array[0] + | ||
170 | (sizeof(struct rpcrdma_write_chunk) * ary->wc_nchunks)) > | ||
171 | (unsigned long)vaend) { | ||
172 | dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", | ||
173 | ary, ary->wc_nchunks, vaend); | ||
174 | return NULL; | ||
175 | } | ||
176 | for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) { | ||
177 | u64 ch_offset; | ||
178 | |||
179 | ary->wc_array[ch_no].wc_target.rs_handle = | ||
180 | ntohl(ary->wc_array[ch_no].wc_target.rs_handle); | ||
181 | ary->wc_array[ch_no].wc_target.rs_length = | ||
182 | ntohl(ary->wc_array[ch_no].wc_target.rs_length); | ||
183 | va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset; | ||
184 | xdr_decode_hyper(va, &ch_offset); | ||
185 | put_unaligned(ch_offset, (u64 *)va); | ||
186 | } | ||
187 | |||
188 | return (u32 *)&ary->wc_array[ch_no]; | ||
189 | } | ||
190 | |||
191 | int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req, | ||
192 | struct svc_rqst *rqstp) | ||
193 | { | ||
194 | struct rpcrdma_msg *rmsgp = NULL; | ||
195 | u32 *va; | ||
196 | u32 *vaend; | ||
197 | u32 hdr_len; | ||
198 | |||
199 | rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; | ||
200 | |||
201 | /* Verify that there's enough bytes for header + something */ | ||
202 | if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_MIN) { | ||
203 | dprintk("svcrdma: header too short = %d\n", | ||
204 | rqstp->rq_arg.len); | ||
205 | return -EINVAL; | ||
206 | } | ||
207 | |||
208 | /* Decode the header */ | ||
209 | rmsgp->rm_xid = ntohl(rmsgp->rm_xid); | ||
210 | rmsgp->rm_vers = ntohl(rmsgp->rm_vers); | ||
211 | rmsgp->rm_credit = ntohl(rmsgp->rm_credit); | ||
212 | rmsgp->rm_type = ntohl(rmsgp->rm_type); | ||
213 | |||
214 | if (rmsgp->rm_vers != RPCRDMA_VERSION) | ||
215 | return -ENOSYS; | ||
216 | |||
217 | /* Pull in the extra for the padded case and bump our pointer */ | ||
218 | if (rmsgp->rm_type == RDMA_MSGP) { | ||
219 | int hdrlen; | ||
220 | rmsgp->rm_body.rm_padded.rm_align = | ||
221 | ntohl(rmsgp->rm_body.rm_padded.rm_align); | ||
222 | rmsgp->rm_body.rm_padded.rm_thresh = | ||
223 | ntohl(rmsgp->rm_body.rm_padded.rm_thresh); | ||
224 | |||
225 | va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; | ||
226 | rqstp->rq_arg.head[0].iov_base = va; | ||
227 | hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp); | ||
228 | rqstp->rq_arg.head[0].iov_len -= hdrlen; | ||
229 | if (hdrlen > rqstp->rq_arg.len) | ||
230 | return -EINVAL; | ||
231 | return hdrlen; | ||
232 | } | ||
233 | |||
234 | /* The chunk list may contain either a read chunk list or a write | ||
235 | * chunk list and a reply chunk list. | ||
236 | */ | ||
237 | va = &rmsgp->rm_body.rm_chunks[0]; | ||
238 | vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len); | ||
239 | va = decode_read_list(va, vaend); | ||
240 | if (!va) | ||
241 | return -EINVAL; | ||
242 | va = decode_write_list(va, vaend); | ||
243 | if (!va) | ||
244 | return -EINVAL; | ||
245 | va = decode_reply_array(va, vaend); | ||
246 | if (!va) | ||
247 | return -EINVAL; | ||
248 | |||
249 | rqstp->rq_arg.head[0].iov_base = va; | ||
250 | hdr_len = (unsigned long)va - (unsigned long)rmsgp; | ||
251 | rqstp->rq_arg.head[0].iov_len -= hdr_len; | ||
252 | |||
253 | *rdma_req = rmsgp; | ||
254 | return hdr_len; | ||
255 | } | ||
256 | |||
257 | int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *rqstp) | ||
258 | { | ||
259 | struct rpcrdma_msg *rmsgp = NULL; | ||
260 | struct rpcrdma_read_chunk *ch; | ||
261 | struct rpcrdma_write_array *ary; | ||
262 | u32 *va; | ||
263 | u32 hdrlen; | ||
264 | |||
265 | dprintk("svcrdma: processing deferred RDMA header on rqstp=%p\n", | ||
266 | rqstp); | ||
267 | rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; | ||
268 | |||
269 | /* Pull in the extra for the padded case and bump our pointer */ | ||
270 | if (rmsgp->rm_type == RDMA_MSGP) { | ||
271 | va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; | ||
272 | rqstp->rq_arg.head[0].iov_base = va; | ||
273 | hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp); | ||
274 | rqstp->rq_arg.head[0].iov_len -= hdrlen; | ||
275 | return hdrlen; | ||
276 | } | ||
277 | |||
278 | /* | ||
279 | * Skip all chunks to find RPC msg. These were previously processed | ||
280 | */ | ||
281 | va = &rmsgp->rm_body.rm_chunks[0]; | ||
282 | |||
283 | /* Skip read-list */ | ||
284 | for (ch = (struct rpcrdma_read_chunk *)va; | ||
285 | ch->rc_discrim != xdr_zero; ch++); | ||
286 | va = (u32 *)&ch->rc_position; | ||
287 | |||
288 | /* Skip write-list */ | ||
289 | ary = (struct rpcrdma_write_array *)va; | ||
290 | if (ary->wc_discrim == xdr_zero) | ||
291 | va = (u32 *)&ary->wc_nchunks; | ||
292 | else | ||
293 | /* | ||
294 | * rs_length is the 2nd 4B field in wc_target and taking its | ||
295 | * address skips the list terminator | ||
296 | */ | ||
297 | va = (u32 *)&ary->wc_array[ary->wc_nchunks].wc_target.rs_length; | ||
298 | |||
299 | /* Skip reply-array */ | ||
300 | ary = (struct rpcrdma_write_array *)va; | ||
301 | if (ary->wc_discrim == xdr_zero) | ||
302 | va = (u32 *)&ary->wc_nchunks; | ||
303 | else | ||
304 | va = (u32 *)&ary->wc_array[ary->wc_nchunks]; | ||
305 | |||
306 | rqstp->rq_arg.head[0].iov_base = va; | ||
307 | hdrlen = (unsigned long)va - (unsigned long)rmsgp; | ||
308 | rqstp->rq_arg.head[0].iov_len -= hdrlen; | ||
309 | |||
310 | return hdrlen; | ||
311 | } | ||
312 | |||
313 | int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, | ||
314 | struct rpcrdma_msg *rmsgp, | ||
315 | enum rpcrdma_errcode err, u32 *va) | ||
316 | { | ||
317 | u32 *startp = va; | ||
318 | |||
319 | *va++ = htonl(rmsgp->rm_xid); | ||
320 | *va++ = htonl(rmsgp->rm_vers); | ||
321 | *va++ = htonl(xprt->sc_max_requests); | ||
322 | *va++ = htonl(RDMA_ERROR); | ||
323 | *va++ = htonl(err); | ||
324 | if (err == ERR_VERS) { | ||
325 | *va++ = htonl(RPCRDMA_VERSION); | ||
326 | *va++ = htonl(RPCRDMA_VERSION); | ||
327 | } | ||
328 | |||
329 | return (int)((unsigned long)va - (unsigned long)startp); | ||
330 | } | ||
331 | |||
332 | int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp) | ||
333 | { | ||
334 | struct rpcrdma_write_array *wr_ary; | ||
335 | |||
336 | /* There is no read-list in a reply */ | ||
337 | |||
338 | /* skip write list */ | ||
339 | wr_ary = (struct rpcrdma_write_array *) | ||
340 | &rmsgp->rm_body.rm_chunks[1]; | ||
341 | if (wr_ary->wc_discrim) | ||
342 | wr_ary = (struct rpcrdma_write_array *) | ||
343 | &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)]. | ||
344 | wc_target.rs_length; | ||
345 | else | ||
346 | wr_ary = (struct rpcrdma_write_array *) | ||
347 | &wr_ary->wc_nchunks; | ||
348 | |||
349 | /* skip reply array */ | ||
350 | if (wr_ary->wc_discrim) | ||
351 | wr_ary = (struct rpcrdma_write_array *) | ||
352 | &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)]; | ||
353 | else | ||
354 | wr_ary = (struct rpcrdma_write_array *) | ||
355 | &wr_ary->wc_nchunks; | ||
356 | |||
357 | return (unsigned long) wr_ary - (unsigned long) rmsgp; | ||
358 | } | ||
359 | |||
360 | void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks) | ||
361 | { | ||
362 | struct rpcrdma_write_array *ary; | ||
363 | |||
364 | /* no read-list */ | ||
365 | rmsgp->rm_body.rm_chunks[0] = xdr_zero; | ||
366 | |||
367 | /* write-array discrim */ | ||
368 | ary = (struct rpcrdma_write_array *) | ||
369 | &rmsgp->rm_body.rm_chunks[1]; | ||
370 | ary->wc_discrim = xdr_one; | ||
371 | ary->wc_nchunks = htonl(chunks); | ||
372 | |||
373 | /* write-list terminator */ | ||
374 | ary->wc_array[chunks].wc_target.rs_handle = xdr_zero; | ||
375 | |||
376 | /* reply-array discriminator */ | ||
377 | ary->wc_array[chunks].wc_target.rs_length = xdr_zero; | ||
378 | } | ||
379 | |||
380 | void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary, | ||
381 | int chunks) | ||
382 | { | ||
383 | ary->wc_discrim = xdr_one; | ||
384 | ary->wc_nchunks = htonl(chunks); | ||
385 | } | ||
386 | |||
387 | void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary, | ||
388 | int chunk_no, | ||
389 | u32 rs_handle, u64 rs_offset, | ||
390 | u32 write_len) | ||
391 | { | ||
392 | struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target; | ||
393 | seg->rs_handle = htonl(rs_handle); | ||
394 | seg->rs_length = htonl(write_len); | ||
395 | xdr_encode_hyper((u32 *) &seg->rs_offset, rs_offset); | ||
396 | } | ||
397 | |||
398 | void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt, | ||
399 | struct rpcrdma_msg *rdma_argp, | ||
400 | struct rpcrdma_msg *rdma_resp, | ||
401 | enum rpcrdma_proc rdma_type) | ||
402 | { | ||
403 | rdma_resp->rm_xid = htonl(rdma_argp->rm_xid); | ||
404 | rdma_resp->rm_vers = htonl(rdma_argp->rm_vers); | ||
405 | rdma_resp->rm_credit = htonl(xprt->sc_max_requests); | ||
406 | rdma_resp->rm_type = htonl(rdma_type); | ||
407 | |||
408 | /* Encode <nul> chunks lists */ | ||
409 | rdma_resp->rm_body.rm_chunks[0] = xdr_zero; | ||
410 | rdma_resp->rm_body.rm_chunks[1] = xdr_zero; | ||
411 | rdma_resp->rm_body.rm_chunks[2] = xdr_zero; | ||
412 | } | ||
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c new file mode 100644 index 000000000000..ab54a736486e --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | |||
@@ -0,0 +1,586 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the BSD-type | ||
8 | * license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or without | ||
11 | * modification, are permitted provided that the following conditions | ||
12 | * are met: | ||
13 | * | ||
14 | * Redistributions of source code must retain the above copyright | ||
15 | * notice, this list of conditions and the following disclaimer. | ||
16 | * | ||
17 | * Redistributions in binary form must reproduce the above | ||
18 | * copyright notice, this list of conditions and the following | ||
19 | * disclaimer in the documentation and/or other materials provided | ||
20 | * with the distribution. | ||
21 | * | ||
22 | * Neither the name of the Network Appliance, Inc. nor the names of | ||
23 | * its contributors may be used to endorse or promote products | ||
24 | * derived from this software without specific prior written | ||
25 | * permission. | ||
26 | * | ||
27 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
28 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
29 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
30 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | ||
31 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
32 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | ||
33 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | ||
34 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | ||
35 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
36 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
37 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
38 | * | ||
39 | * Author: Tom Tucker <tom@opengridcomputing.com> | ||
40 | */ | ||
41 | |||
42 | #include <linux/sunrpc/debug.h> | ||
43 | #include <linux/sunrpc/rpc_rdma.h> | ||
44 | #include <linux/spinlock.h> | ||
45 | #include <asm/unaligned.h> | ||
46 | #include <rdma/ib_verbs.h> | ||
47 | #include <rdma/rdma_cm.h> | ||
48 | #include <linux/sunrpc/svc_rdma.h> | ||
49 | |||
50 | #define RPCDBG_FACILITY RPCDBG_SVCXPRT | ||
51 | |||
52 | /* | ||
53 | * Replace the pages in the rq_argpages array with the pages from the SGE in | ||
54 | * the RDMA_RECV completion. The SGL should contain full pages up until the | ||
55 | * last one. | ||
56 | */ | ||
57 | static void rdma_build_arg_xdr(struct svc_rqst *rqstp, | ||
58 | struct svc_rdma_op_ctxt *ctxt, | ||
59 | u32 byte_count) | ||
60 | { | ||
61 | struct page *page; | ||
62 | u32 bc; | ||
63 | int sge_no; | ||
64 | |||
65 | /* Swap the page in the SGE with the page in argpages */ | ||
66 | page = ctxt->pages[0]; | ||
67 | put_page(rqstp->rq_pages[0]); | ||
68 | rqstp->rq_pages[0] = page; | ||
69 | |||
70 | /* Set up the XDR head */ | ||
71 | rqstp->rq_arg.head[0].iov_base = page_address(page); | ||
72 | rqstp->rq_arg.head[0].iov_len = min(byte_count, ctxt->sge[0].length); | ||
73 | rqstp->rq_arg.len = byte_count; | ||
74 | rqstp->rq_arg.buflen = byte_count; | ||
75 | |||
76 | /* Compute bytes past head in the SGL */ | ||
77 | bc = byte_count - rqstp->rq_arg.head[0].iov_len; | ||
78 | |||
79 | /* If data remains, store it in the pagelist */ | ||
80 | rqstp->rq_arg.page_len = bc; | ||
81 | rqstp->rq_arg.page_base = 0; | ||
82 | rqstp->rq_arg.pages = &rqstp->rq_pages[1]; | ||
83 | sge_no = 1; | ||
84 | while (bc && sge_no < ctxt->count) { | ||
85 | page = ctxt->pages[sge_no]; | ||
86 | put_page(rqstp->rq_pages[sge_no]); | ||
87 | rqstp->rq_pages[sge_no] = page; | ||
88 | bc -= min(bc, ctxt->sge[sge_no].length); | ||
89 | rqstp->rq_arg.buflen += ctxt->sge[sge_no].length; | ||
90 | sge_no++; | ||
91 | } | ||
92 | rqstp->rq_respages = &rqstp->rq_pages[sge_no]; | ||
93 | |||
94 | /* We should never run out of SGE because the limit is defined to | ||
95 | * support the max allowed RPC data length | ||
96 | */ | ||
97 | BUG_ON(bc && (sge_no == ctxt->count)); | ||
98 | BUG_ON((rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len) | ||
99 | != byte_count); | ||
100 | BUG_ON(rqstp->rq_arg.len != byte_count); | ||
101 | |||
102 | /* If not all pages were used from the SGL, free the remaining ones */ | ||
103 | bc = sge_no; | ||
104 | while (sge_no < ctxt->count) { | ||
105 | page = ctxt->pages[sge_no++]; | ||
106 | put_page(page); | ||
107 | } | ||
108 | ctxt->count = bc; | ||
109 | |||
110 | /* Set up tail */ | ||
111 | rqstp->rq_arg.tail[0].iov_base = NULL; | ||
112 | rqstp->rq_arg.tail[0].iov_len = 0; | ||
113 | } | ||
114 | |||
/* Locates the run of SGE array entries that belongs to one read chunk */
struct chunk_sge {
	/* sge no for this chunk */
	int start;
	/* sge count for this chunk */
	int count;
};
119 | |||
120 | /* Encode a read-chunk-list as an array of IB SGE | ||
121 | * | ||
122 | * Assumptions: | ||
123 | * - chunk[0]->position points to pages[0] at an offset of 0 | ||
124 | * - pages[] is not physically or virtually contigous and consists of | ||
125 | * PAGE_SIZE elements. | ||
126 | * | ||
127 | * Output: | ||
128 | * - sge array pointing into pages[] array. | ||
129 | * - chunk_sge array specifying sge index and count for each | ||
130 | * chunk in the read list | ||
131 | * | ||
132 | */ | ||
133 | static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, | ||
134 | struct svc_rqst *rqstp, | ||
135 | struct svc_rdma_op_ctxt *head, | ||
136 | struct rpcrdma_msg *rmsgp, | ||
137 | struct ib_sge *sge, | ||
138 | struct chunk_sge *ch_sge_ary, | ||
139 | int ch_count, | ||
140 | int byte_count) | ||
141 | { | ||
142 | int sge_no; | ||
143 | int sge_bytes; | ||
144 | int page_off; | ||
145 | int page_no; | ||
146 | int ch_bytes; | ||
147 | int ch_no; | ||
148 | struct rpcrdma_read_chunk *ch; | ||
149 | |||
150 | sge_no = 0; | ||
151 | page_no = 0; | ||
152 | page_off = 0; | ||
153 | ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; | ||
154 | ch_no = 0; | ||
155 | ch_bytes = ch->rc_target.rs_length; | ||
156 | head->arg.head[0] = rqstp->rq_arg.head[0]; | ||
157 | head->arg.tail[0] = rqstp->rq_arg.tail[0]; | ||
158 | head->arg.pages = &head->pages[head->count]; | ||
159 | head->sge[0].length = head->count; /* save count of hdr pages */ | ||
160 | head->arg.page_base = 0; | ||
161 | head->arg.page_len = ch_bytes; | ||
162 | head->arg.len = rqstp->rq_arg.len + ch_bytes; | ||
163 | head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes; | ||
164 | head->count++; | ||
165 | ch_sge_ary[0].start = 0; | ||
166 | while (byte_count) { | ||
167 | sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes); | ||
168 | sge[sge_no].addr = | ||
169 | ib_dma_map_page(xprt->sc_cm_id->device, | ||
170 | rqstp->rq_arg.pages[page_no], | ||
171 | page_off, sge_bytes, | ||
172 | DMA_FROM_DEVICE); | ||
173 | sge[sge_no].length = sge_bytes; | ||
174 | sge[sge_no].lkey = xprt->sc_phys_mr->lkey; | ||
175 | /* | ||
176 | * Don't bump head->count here because the same page | ||
177 | * may be used by multiple SGE. | ||
178 | */ | ||
179 | head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; | ||
180 | rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1]; | ||
181 | |||
182 | byte_count -= sge_bytes; | ||
183 | ch_bytes -= sge_bytes; | ||
184 | sge_no++; | ||
185 | /* | ||
186 | * If all bytes for this chunk have been mapped to an | ||
187 | * SGE, move to the next SGE | ||
188 | */ | ||
189 | if (ch_bytes == 0) { | ||
190 | ch_sge_ary[ch_no].count = | ||
191 | sge_no - ch_sge_ary[ch_no].start; | ||
192 | ch_no++; | ||
193 | ch++; | ||
194 | ch_sge_ary[ch_no].start = sge_no; | ||
195 | ch_bytes = ch->rc_target.rs_length; | ||
196 | /* If bytes remaining account for next chunk */ | ||
197 | if (byte_count) { | ||
198 | head->arg.page_len += ch_bytes; | ||
199 | head->arg.len += ch_bytes; | ||
200 | head->arg.buflen += ch_bytes; | ||
201 | } | ||
202 | } | ||
203 | /* | ||
204 | * If this SGE consumed all of the page, move to the | ||
205 | * next page | ||
206 | */ | ||
207 | if ((sge_bytes + page_off) == PAGE_SIZE) { | ||
208 | page_no++; | ||
209 | page_off = 0; | ||
210 | /* | ||
211 | * If there are still bytes left to map, bump | ||
212 | * the page count | ||
213 | */ | ||
214 | if (byte_count) | ||
215 | head->count++; | ||
216 | } else | ||
217 | page_off += sge_bytes; | ||
218 | } | ||
219 | BUG_ON(byte_count != 0); | ||
220 | return sge_no; | ||
221 | } | ||
222 | |||
223 | static void rdma_set_ctxt_sge(struct svc_rdma_op_ctxt *ctxt, | ||
224 | struct ib_sge *sge, | ||
225 | u64 *sgl_offset, | ||
226 | int count) | ||
227 | { | ||
228 | int i; | ||
229 | |||
230 | ctxt->count = count; | ||
231 | for (i = 0; i < count; i++) { | ||
232 | ctxt->sge[i].addr = sge[i].addr; | ||
233 | ctxt->sge[i].length = sge[i].length; | ||
234 | *sgl_offset = *sgl_offset + sge[i].length; | ||
235 | } | ||
236 | } | ||
237 | |||
238 | static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) | ||
239 | { | ||
240 | #ifdef RDMA_TRANSPORT_IWARP | ||
241 | if ((RDMA_TRANSPORT_IWARP == | ||
242 | rdma_node_get_transport(xprt->sc_cm_id-> | ||
243 | device->node_type)) | ||
244 | && sge_count > 1) | ||
245 | return 1; | ||
246 | else | ||
247 | #endif | ||
248 | return min_t(int, sge_count, xprt->sc_max_sge); | ||
249 | } | ||
250 | |||
251 | /* | ||
252 | * Use RDMA_READ to read data from the advertised client buffer into the | ||
253 | * XDR stream starting at rq_arg.head[0].iov_base. | ||
254 | * Each chunk in the array | ||
255 | * contains the following fields: | ||
256 | * discrim - '1', This isn't used for data placement | ||
257 | * position - The xdr stream offset (the same for every chunk) | ||
258 | * handle - RMR for client memory region | ||
259 | * length - data transfer length | ||
260 | * offset - 64 bit tagged offset in remote memory region | ||
261 | * | ||
262 | * On our side, we need to read into a pagelist. The first page immediately | ||
263 | * follows the RPC header. | ||
264 | * | ||
265 | * This function returns 1 to indicate success. The data is not yet in | ||
266 | * the pagelist and therefore the RPC request must be deferred. The | ||
267 | * I/O completion will enqueue the transport again and | ||
268 | * svc_rdma_recvfrom will complete the request. | ||
269 | * | ||
270 | * NOTE: The ctxt must not be touched after the last WR has been posted | ||
271 | * because the I/O completion processing may occur on another | ||
272 | * processor and free / modify the context. Ne touche pas! | ||
273 | */ | ||
274 | static int rdma_read_xdr(struct svcxprt_rdma *xprt, | ||
275 | struct rpcrdma_msg *rmsgp, | ||
276 | struct svc_rqst *rqstp, | ||
277 | struct svc_rdma_op_ctxt *hdr_ctxt) | ||
278 | { | ||
279 | struct ib_send_wr read_wr; | ||
280 | int err = 0; | ||
281 | int ch_no; | ||
282 | struct ib_sge *sge; | ||
283 | int ch_count; | ||
284 | int byte_count; | ||
285 | int sge_count; | ||
286 | u64 sgl_offset; | ||
287 | struct rpcrdma_read_chunk *ch; | ||
288 | struct svc_rdma_op_ctxt *ctxt = NULL; | ||
289 | struct svc_rdma_op_ctxt *head; | ||
290 | struct svc_rdma_op_ctxt *tmp_sge_ctxt; | ||
291 | struct svc_rdma_op_ctxt *tmp_ch_ctxt; | ||
292 | struct chunk_sge *ch_sge_ary; | ||
293 | |||
294 | /* If no read list is present, return 0 */ | ||
295 | ch = svc_rdma_get_read_chunk(rmsgp); | ||
296 | if (!ch) | ||
297 | return 0; | ||
298 | |||
299 | /* Allocate temporary contexts to keep SGE */ | ||
300 | BUG_ON(sizeof(struct ib_sge) < sizeof(struct chunk_sge)); | ||
301 | tmp_sge_ctxt = svc_rdma_get_context(xprt); | ||
302 | sge = tmp_sge_ctxt->sge; | ||
303 | tmp_ch_ctxt = svc_rdma_get_context(xprt); | ||
304 | ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge; | ||
305 | |||
306 | svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count); | ||
307 | sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp, | ||
308 | sge, ch_sge_ary, | ||
309 | ch_count, byte_count); | ||
310 | head = svc_rdma_get_context(xprt); | ||
311 | sgl_offset = 0; | ||
312 | ch_no = 0; | ||
313 | |||
314 | for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; | ||
315 | ch->rc_discrim != 0; ch++, ch_no++) { | ||
316 | next_sge: | ||
317 | if (!ctxt) | ||
318 | ctxt = head; | ||
319 | else { | ||
320 | ctxt->next = svc_rdma_get_context(xprt); | ||
321 | ctxt = ctxt->next; | ||
322 | } | ||
323 | ctxt->next = NULL; | ||
324 | ctxt->direction = DMA_FROM_DEVICE; | ||
325 | clear_bit(RDMACTXT_F_READ_DONE, &ctxt->flags); | ||
326 | clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); | ||
327 | if ((ch+1)->rc_discrim == 0) { | ||
328 | /* | ||
329 | * Checked in sq_cq_reap to see if we need to | ||
330 | * be enqueued | ||
331 | */ | ||
332 | set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); | ||
333 | ctxt->next = hdr_ctxt; | ||
334 | hdr_ctxt->next = head; | ||
335 | } | ||
336 | |||
337 | /* Prepare READ WR */ | ||
338 | memset(&read_wr, 0, sizeof read_wr); | ||
339 | ctxt->wr_op = IB_WR_RDMA_READ; | ||
340 | read_wr.wr_id = (unsigned long)ctxt; | ||
341 | read_wr.opcode = IB_WR_RDMA_READ; | ||
342 | read_wr.send_flags = IB_SEND_SIGNALED; | ||
343 | read_wr.wr.rdma.rkey = ch->rc_target.rs_handle; | ||
344 | read_wr.wr.rdma.remote_addr = | ||
345 | get_unaligned(&(ch->rc_target.rs_offset)) + | ||
346 | sgl_offset; | ||
347 | read_wr.sg_list = &sge[ch_sge_ary[ch_no].start]; | ||
348 | read_wr.num_sge = | ||
349 | rdma_read_max_sge(xprt, ch_sge_ary[ch_no].count); | ||
350 | rdma_set_ctxt_sge(ctxt, &sge[ch_sge_ary[ch_no].start], | ||
351 | &sgl_offset, | ||
352 | read_wr.num_sge); | ||
353 | |||
354 | /* Post the read */ | ||
355 | err = svc_rdma_send(xprt, &read_wr); | ||
356 | if (err) { | ||
357 | printk(KERN_ERR "svcrdma: Error posting send = %d\n", | ||
358 | err); | ||
359 | /* | ||
360 | * Break the circular list so free knows when | ||
361 | * to stop if the error happened to occur on | ||
362 | * the last read | ||
363 | */ | ||
364 | ctxt->next = NULL; | ||
365 | goto out; | ||
366 | } | ||
367 | atomic_inc(&rdma_stat_read); | ||
368 | |||
369 | if (read_wr.num_sge < ch_sge_ary[ch_no].count) { | ||
370 | ch_sge_ary[ch_no].count -= read_wr.num_sge; | ||
371 | ch_sge_ary[ch_no].start += read_wr.num_sge; | ||
372 | goto next_sge; | ||
373 | } | ||
374 | sgl_offset = 0; | ||
375 | err = 0; | ||
376 | } | ||
377 | |||
378 | out: | ||
379 | svc_rdma_put_context(tmp_sge_ctxt, 0); | ||
380 | svc_rdma_put_context(tmp_ch_ctxt, 0); | ||
381 | |||
382 | /* Detach arg pages. svc_recv will replenish them */ | ||
383 | for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++) | ||
384 | rqstp->rq_pages[ch_no] = NULL; | ||
385 | |||
386 | /* | ||
387 | * Detach res pages. svc_release must see a resused count of | ||
388 | * zero or it will attempt to put them. | ||
389 | */ | ||
390 | while (rqstp->rq_resused) | ||
391 | rqstp->rq_respages[--rqstp->rq_resused] = NULL; | ||
392 | |||
393 | if (err) { | ||
394 | printk(KERN_ERR "svcrdma : RDMA_READ error = %d\n", err); | ||
395 | set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); | ||
396 | /* Free the linked list of read contexts */ | ||
397 | while (head != NULL) { | ||
398 | ctxt = head->next; | ||
399 | svc_rdma_put_context(head, 1); | ||
400 | head = ctxt; | ||
401 | } | ||
402 | return 0; | ||
403 | } | ||
404 | |||
405 | return 1; | ||
406 | } | ||
407 | |||
408 | static int rdma_read_complete(struct svc_rqst *rqstp, | ||
409 | struct svc_rdma_op_ctxt *data) | ||
410 | { | ||
411 | struct svc_rdma_op_ctxt *head = data->next; | ||
412 | int page_no; | ||
413 | int ret; | ||
414 | |||
415 | BUG_ON(!head); | ||
416 | |||
417 | /* Copy RPC pages */ | ||
418 | for (page_no = 0; page_no < head->count; page_no++) { | ||
419 | put_page(rqstp->rq_pages[page_no]); | ||
420 | rqstp->rq_pages[page_no] = head->pages[page_no]; | ||
421 | } | ||
422 | /* Point rq_arg.pages past header */ | ||
423 | rqstp->rq_arg.pages = &rqstp->rq_pages[head->sge[0].length]; | ||
424 | rqstp->rq_arg.page_len = head->arg.page_len; | ||
425 | rqstp->rq_arg.page_base = head->arg.page_base; | ||
426 | |||
427 | /* rq_respages starts after the last arg page */ | ||
428 | rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; | ||
429 | rqstp->rq_resused = 0; | ||
430 | |||
431 | /* Rebuild rq_arg head and tail. */ | ||
432 | rqstp->rq_arg.head[0] = head->arg.head[0]; | ||
433 | rqstp->rq_arg.tail[0] = head->arg.tail[0]; | ||
434 | rqstp->rq_arg.len = head->arg.len; | ||
435 | rqstp->rq_arg.buflen = head->arg.buflen; | ||
436 | |||
437 | /* XXX: What should this be? */ | ||
438 | rqstp->rq_prot = IPPROTO_MAX; | ||
439 | |||
440 | /* | ||
441 | * Free the contexts we used to build the RDMA_READ. We have | ||
442 | * to be careful here because the context list uses the same | ||
443 | * next pointer used to chain the contexts associated with the | ||
444 | * RDMA_READ | ||
445 | */ | ||
446 | data->next = NULL; /* terminate circular list */ | ||
447 | do { | ||
448 | data = head->next; | ||
449 | svc_rdma_put_context(head, 0); | ||
450 | head = data; | ||
451 | } while (head != NULL); | ||
452 | |||
453 | ret = rqstp->rq_arg.head[0].iov_len | ||
454 | + rqstp->rq_arg.page_len | ||
455 | + rqstp->rq_arg.tail[0].iov_len; | ||
456 | dprintk("svcrdma: deferred read ret=%d, rq_arg.len =%d, " | ||
457 | "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n", | ||
458 | ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, | ||
459 | rqstp->rq_arg.head[0].iov_len); | ||
460 | |||
461 | /* Indicate that we've consumed an RQ credit */ | ||
462 | rqstp->rq_xprt_ctxt = rqstp->rq_xprt; | ||
463 | svc_xprt_received(rqstp->rq_xprt); | ||
464 | return ret; | ||
465 | } | ||
466 | |||
467 | /* | ||
468 | * Set up the rqstp thread context to point to the RQ buffer. If | ||
469 | * necessary, pull additional data from the client with an RDMA_READ | ||
470 | * request. | ||
471 | */ | ||
472 | int svc_rdma_recvfrom(struct svc_rqst *rqstp) | ||
473 | { | ||
474 | struct svc_xprt *xprt = rqstp->rq_xprt; | ||
475 | struct svcxprt_rdma *rdma_xprt = | ||
476 | container_of(xprt, struct svcxprt_rdma, sc_xprt); | ||
477 | struct svc_rdma_op_ctxt *ctxt = NULL; | ||
478 | struct rpcrdma_msg *rmsgp; | ||
479 | int ret = 0; | ||
480 | int len; | ||
481 | |||
482 | dprintk("svcrdma: rqstp=%p\n", rqstp); | ||
483 | |||
484 | /* | ||
485 | * The rq_xprt_ctxt indicates if we've consumed an RQ credit | ||
486 | * or not. It is used in the rdma xpo_release_rqst function to | ||
487 | * determine whether or not to return an RQ WQE to the RQ. | ||
488 | */ | ||
489 | rqstp->rq_xprt_ctxt = NULL; | ||
490 | |||
491 | spin_lock_bh(&rdma_xprt->sc_read_complete_lock); | ||
492 | if (!list_empty(&rdma_xprt->sc_read_complete_q)) { | ||
493 | ctxt = list_entry(rdma_xprt->sc_read_complete_q.next, | ||
494 | struct svc_rdma_op_ctxt, | ||
495 | dto_q); | ||
496 | list_del_init(&ctxt->dto_q); | ||
497 | } | ||
498 | spin_unlock_bh(&rdma_xprt->sc_read_complete_lock); | ||
499 | if (ctxt) | ||
500 | return rdma_read_complete(rqstp, ctxt); | ||
501 | |||
502 | spin_lock_bh(&rdma_xprt->sc_rq_dto_lock); | ||
503 | if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { | ||
504 | ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next, | ||
505 | struct svc_rdma_op_ctxt, | ||
506 | dto_q); | ||
507 | list_del_init(&ctxt->dto_q); | ||
508 | } else { | ||
509 | atomic_inc(&rdma_stat_rq_starve); | ||
510 | clear_bit(XPT_DATA, &xprt->xpt_flags); | ||
511 | ctxt = NULL; | ||
512 | } | ||
513 | spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); | ||
514 | if (!ctxt) { | ||
515 | /* This is the EAGAIN path. The svc_recv routine will | ||
516 | * return -EAGAIN, the nfsd thread will go to call into | ||
517 | * svc_recv again and we shouldn't be on the active | ||
518 | * transport list | ||
519 | */ | ||
520 | if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) | ||
521 | goto close_out; | ||
522 | |||
523 | BUG_ON(ret); | ||
524 | goto out; | ||
525 | } | ||
526 | dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n", | ||
527 | ctxt, rdma_xprt, rqstp, ctxt->wc_status); | ||
528 | BUG_ON(ctxt->wc_status != IB_WC_SUCCESS); | ||
529 | atomic_inc(&rdma_stat_recv); | ||
530 | |||
531 | /* Build up the XDR from the receive buffers. */ | ||
532 | rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len); | ||
533 | |||
534 | /* Decode the RDMA header. */ | ||
535 | len = svc_rdma_xdr_decode_req(&rmsgp, rqstp); | ||
536 | rqstp->rq_xprt_hlen = len; | ||
537 | |||
538 | /* If the request is invalid, reply with an error */ | ||
539 | if (len < 0) { | ||
540 | if (len == -ENOSYS) | ||
541 | (void)svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS); | ||
542 | goto close_out; | ||
543 | } | ||
544 | |||
545 | /* Read read-list data. If we would need to wait, defer | ||
546 | * it. Not that in this case, we don't return the RQ credit | ||
547 | * until after the read completes. | ||
548 | */ | ||
549 | if (rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt)) { | ||
550 | svc_xprt_received(xprt); | ||
551 | return 0; | ||
552 | } | ||
553 | |||
554 | /* Indicate we've consumed an RQ credit */ | ||
555 | rqstp->rq_xprt_ctxt = rqstp->rq_xprt; | ||
556 | |||
557 | ret = rqstp->rq_arg.head[0].iov_len | ||
558 | + rqstp->rq_arg.page_len | ||
559 | + rqstp->rq_arg.tail[0].iov_len; | ||
560 | svc_rdma_put_context(ctxt, 0); | ||
561 | out: | ||
562 | dprintk("svcrdma: ret = %d, rq_arg.len =%d, " | ||
563 | "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n", | ||
564 | ret, rqstp->rq_arg.len, | ||
565 | rqstp->rq_arg.head[0].iov_base, | ||
566 | rqstp->rq_arg.head[0].iov_len); | ||
567 | rqstp->rq_prot = IPPROTO_MAX; | ||
568 | svc_xprt_copy_addrs(rqstp, xprt); | ||
569 | svc_xprt_received(xprt); | ||
570 | return ret; | ||
571 | |||
572 | close_out: | ||
573 | if (ctxt) { | ||
574 | svc_rdma_put_context(ctxt, 1); | ||
575 | /* Indicate we've consumed an RQ credit */ | ||
576 | rqstp->rq_xprt_ctxt = rqstp->rq_xprt; | ||
577 | } | ||
578 | dprintk("svcrdma: transport %p is closing\n", xprt); | ||
579 | /* | ||
580 | * Set the close bit and enqueue it. svc_recv will see the | ||
581 | * close bit and call svc_xprt_delete | ||
582 | */ | ||
583 | set_bit(XPT_CLOSE, &xprt->xpt_flags); | ||
584 | svc_xprt_received(xprt); | ||
585 | return 0; | ||
586 | } | ||
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c new file mode 100644 index 000000000000..3e321949e1dc --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c | |||
@@ -0,0 +1,520 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the BSD-type | ||
8 | * license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or without | ||
11 | * modification, are permitted provided that the following conditions | ||
12 | * are met: | ||
13 | * | ||
14 | * Redistributions of source code must retain the above copyright | ||
15 | * notice, this list of conditions and the following disclaimer. | ||
16 | * | ||
17 | * Redistributions in binary form must reproduce the above | ||
18 | * copyright notice, this list of conditions and the following | ||
19 | * disclaimer in the documentation and/or other materials provided | ||
20 | * with the distribution. | ||
21 | * | ||
22 | * Neither the name of the Network Appliance, Inc. nor the names of | ||
23 | * its contributors may be used to endorse or promote products | ||
24 | * derived from this software without specific prior written | ||
25 | * permission. | ||
26 | * | ||
27 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
28 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
29 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
30 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | ||
31 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
32 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | ||
33 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | ||
34 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | ||
35 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
36 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
37 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
38 | * | ||
39 | * Author: Tom Tucker <tom@opengridcomputing.com> | ||
40 | */ | ||
41 | |||
42 | #include <linux/sunrpc/debug.h> | ||
43 | #include <linux/sunrpc/rpc_rdma.h> | ||
44 | #include <linux/spinlock.h> | ||
45 | #include <asm/unaligned.h> | ||
46 | #include <rdma/ib_verbs.h> | ||
47 | #include <rdma/rdma_cm.h> | ||
48 | #include <linux/sunrpc/svc_rdma.h> | ||
49 | |||
50 | #define RPCDBG_FACILITY RPCDBG_SVCXPRT | ||
51 | |||
52 | /* Encode an XDR as an array of IB SGE | ||
53 | * | ||
54 | * Assumptions: | ||
55 | * - head[0] is physically contiguous. | ||
56 | * - tail[0] is physically contiguous. | ||
57 | * - pages[] is not physically or virtually contigous and consists of | ||
58 | * PAGE_SIZE elements. | ||
59 | * | ||
60 | * Output: | ||
61 | * SGE[0] reserved for RCPRDMA header | ||
62 | * SGE[1] data from xdr->head[] | ||
63 | * SGE[2..sge_count-2] data from xdr->pages[] | ||
64 | * SGE[sge_count-1] data from xdr->tail. | ||
65 | * | ||
66 | */ | ||
67 | static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt, | ||
68 | struct xdr_buf *xdr, | ||
69 | struct ib_sge *sge, | ||
70 | int *sge_count) | ||
71 | { | ||
72 | /* Max we need is the length of the XDR / pagesize + one for | ||
73 | * head + one for tail + one for RPCRDMA header | ||
74 | */ | ||
75 | int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3; | ||
76 | int sge_no; | ||
77 | u32 byte_count = xdr->len; | ||
78 | u32 sge_bytes; | ||
79 | u32 page_bytes; | ||
80 | int page_off; | ||
81 | int page_no; | ||
82 | |||
83 | /* Skip the first sge, this is for the RPCRDMA header */ | ||
84 | sge_no = 1; | ||
85 | |||
86 | /* Head SGE */ | ||
87 | sge[sge_no].addr = ib_dma_map_single(xprt->sc_cm_id->device, | ||
88 | xdr->head[0].iov_base, | ||
89 | xdr->head[0].iov_len, | ||
90 | DMA_TO_DEVICE); | ||
91 | sge_bytes = min_t(u32, byte_count, xdr->head[0].iov_len); | ||
92 | byte_count -= sge_bytes; | ||
93 | sge[sge_no].length = sge_bytes; | ||
94 | sge[sge_no].lkey = xprt->sc_phys_mr->lkey; | ||
95 | sge_no++; | ||
96 | |||
97 | /* pages SGE */ | ||
98 | page_no = 0; | ||
99 | page_bytes = xdr->page_len; | ||
100 | page_off = xdr->page_base; | ||
101 | while (byte_count && page_bytes) { | ||
102 | sge_bytes = min_t(u32, byte_count, (PAGE_SIZE-page_off)); | ||
103 | sge[sge_no].addr = | ||
104 | ib_dma_map_page(xprt->sc_cm_id->device, | ||
105 | xdr->pages[page_no], page_off, | ||
106 | sge_bytes, DMA_TO_DEVICE); | ||
107 | sge_bytes = min(sge_bytes, page_bytes); | ||
108 | byte_count -= sge_bytes; | ||
109 | page_bytes -= sge_bytes; | ||
110 | sge[sge_no].length = sge_bytes; | ||
111 | sge[sge_no].lkey = xprt->sc_phys_mr->lkey; | ||
112 | |||
113 | sge_no++; | ||
114 | page_no++; | ||
115 | page_off = 0; /* reset for next time through loop */ | ||
116 | } | ||
117 | |||
118 | /* Tail SGE */ | ||
119 | if (byte_count && xdr->tail[0].iov_len) { | ||
120 | sge[sge_no].addr = | ||
121 | ib_dma_map_single(xprt->sc_cm_id->device, | ||
122 | xdr->tail[0].iov_base, | ||
123 | xdr->tail[0].iov_len, | ||
124 | DMA_TO_DEVICE); | ||
125 | sge_bytes = min_t(u32, byte_count, xdr->tail[0].iov_len); | ||
126 | byte_count -= sge_bytes; | ||
127 | sge[sge_no].length = sge_bytes; | ||
128 | sge[sge_no].lkey = xprt->sc_phys_mr->lkey; | ||
129 | sge_no++; | ||
130 | } | ||
131 | |||
132 | BUG_ON(sge_no > sge_max); | ||
133 | BUG_ON(byte_count != 0); | ||
134 | |||
135 | *sge_count = sge_no; | ||
136 | return sge; | ||
137 | } | ||
138 | |||
139 | |||
140 | /* Assumptions: | ||
141 | * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE | ||
142 | */ | ||
143 | static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, | ||
144 | u32 rmr, u64 to, | ||
145 | u32 xdr_off, int write_len, | ||
146 | struct ib_sge *xdr_sge, int sge_count) | ||
147 | { | ||
148 | struct svc_rdma_op_ctxt *tmp_sge_ctxt; | ||
149 | struct ib_send_wr write_wr; | ||
150 | struct ib_sge *sge; | ||
151 | int xdr_sge_no; | ||
152 | int sge_no; | ||
153 | int sge_bytes; | ||
154 | int sge_off; | ||
155 | int bc; | ||
156 | struct svc_rdma_op_ctxt *ctxt; | ||
157 | int ret = 0; | ||
158 | |||
159 | BUG_ON(sge_count >= 32); | ||
160 | dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, " | ||
161 | "write_len=%d, xdr_sge=%p, sge_count=%d\n", | ||
162 | rmr, to, xdr_off, write_len, xdr_sge, sge_count); | ||
163 | |||
164 | ctxt = svc_rdma_get_context(xprt); | ||
165 | ctxt->count = 0; | ||
166 | tmp_sge_ctxt = svc_rdma_get_context(xprt); | ||
167 | sge = tmp_sge_ctxt->sge; | ||
168 | |||
169 | /* Find the SGE associated with xdr_off */ | ||
170 | for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < sge_count; | ||
171 | xdr_sge_no++) { | ||
172 | if (xdr_sge[xdr_sge_no].length > bc) | ||
173 | break; | ||
174 | bc -= xdr_sge[xdr_sge_no].length; | ||
175 | } | ||
176 | |||
177 | sge_off = bc; | ||
178 | bc = write_len; | ||
179 | sge_no = 0; | ||
180 | |||
181 | /* Copy the remaining SGE */ | ||
182 | while (bc != 0 && xdr_sge_no < sge_count) { | ||
183 | sge[sge_no].addr = xdr_sge[xdr_sge_no].addr + sge_off; | ||
184 | sge[sge_no].lkey = xdr_sge[xdr_sge_no].lkey; | ||
185 | sge_bytes = min((size_t)bc, | ||
186 | (size_t)(xdr_sge[xdr_sge_no].length-sge_off)); | ||
187 | sge[sge_no].length = sge_bytes; | ||
188 | |||
189 | sge_off = 0; | ||
190 | sge_no++; | ||
191 | xdr_sge_no++; | ||
192 | bc -= sge_bytes; | ||
193 | } | ||
194 | |||
195 | BUG_ON(bc != 0); | ||
196 | BUG_ON(xdr_sge_no > sge_count); | ||
197 | |||
198 | /* Prepare WRITE WR */ | ||
199 | memset(&write_wr, 0, sizeof write_wr); | ||
200 | ctxt->wr_op = IB_WR_RDMA_WRITE; | ||
201 | write_wr.wr_id = (unsigned long)ctxt; | ||
202 | write_wr.sg_list = &sge[0]; | ||
203 | write_wr.num_sge = sge_no; | ||
204 | write_wr.opcode = IB_WR_RDMA_WRITE; | ||
205 | write_wr.send_flags = IB_SEND_SIGNALED; | ||
206 | write_wr.wr.rdma.rkey = rmr; | ||
207 | write_wr.wr.rdma.remote_addr = to; | ||
208 | |||
209 | /* Post It */ | ||
210 | atomic_inc(&rdma_stat_write); | ||
211 | if (svc_rdma_send(xprt, &write_wr)) { | ||
212 | svc_rdma_put_context(ctxt, 1); | ||
213 | /* Fatal error, close transport */ | ||
214 | ret = -EIO; | ||
215 | } | ||
216 | svc_rdma_put_context(tmp_sge_ctxt, 0); | ||
217 | return ret; | ||
218 | } | ||
219 | |||
220 | static int send_write_chunks(struct svcxprt_rdma *xprt, | ||
221 | struct rpcrdma_msg *rdma_argp, | ||
222 | struct rpcrdma_msg *rdma_resp, | ||
223 | struct svc_rqst *rqstp, | ||
224 | struct ib_sge *sge, | ||
225 | int sge_count) | ||
226 | { | ||
227 | u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len; | ||
228 | int write_len; | ||
229 | int max_write; | ||
230 | u32 xdr_off; | ||
231 | int chunk_off; | ||
232 | int chunk_no; | ||
233 | struct rpcrdma_write_array *arg_ary; | ||
234 | struct rpcrdma_write_array *res_ary; | ||
235 | int ret; | ||
236 | |||
237 | arg_ary = svc_rdma_get_write_array(rdma_argp); | ||
238 | if (!arg_ary) | ||
239 | return 0; | ||
240 | res_ary = (struct rpcrdma_write_array *) | ||
241 | &rdma_resp->rm_body.rm_chunks[1]; | ||
242 | |||
243 | max_write = xprt->sc_max_sge * PAGE_SIZE; | ||
244 | |||
245 | /* Write chunks start at the pagelist */ | ||
246 | for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; | ||
247 | xfer_len && chunk_no < arg_ary->wc_nchunks; | ||
248 | chunk_no++) { | ||
249 | struct rpcrdma_segment *arg_ch; | ||
250 | u64 rs_offset; | ||
251 | |||
252 | arg_ch = &arg_ary->wc_array[chunk_no].wc_target; | ||
253 | write_len = min(xfer_len, arg_ch->rs_length); | ||
254 | |||
255 | /* Prepare the response chunk given the length actually | ||
256 | * written */ | ||
257 | rs_offset = get_unaligned(&(arg_ch->rs_offset)); | ||
258 | svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no, | ||
259 | arg_ch->rs_handle, | ||
260 | rs_offset, | ||
261 | write_len); | ||
262 | chunk_off = 0; | ||
263 | while (write_len) { | ||
264 | int this_write; | ||
265 | this_write = min(write_len, max_write); | ||
266 | ret = send_write(xprt, rqstp, | ||
267 | arg_ch->rs_handle, | ||
268 | rs_offset + chunk_off, | ||
269 | xdr_off, | ||
270 | this_write, | ||
271 | sge, | ||
272 | sge_count); | ||
273 | if (ret) { | ||
274 | dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", | ||
275 | ret); | ||
276 | return -EIO; | ||
277 | } | ||
278 | chunk_off += this_write; | ||
279 | xdr_off += this_write; | ||
280 | xfer_len -= this_write; | ||
281 | write_len -= this_write; | ||
282 | } | ||
283 | } | ||
284 | /* Update the req with the number of chunks actually used */ | ||
285 | svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no); | ||
286 | |||
287 | return rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len; | ||
288 | } | ||
289 | |||
290 | static int send_reply_chunks(struct svcxprt_rdma *xprt, | ||
291 | struct rpcrdma_msg *rdma_argp, | ||
292 | struct rpcrdma_msg *rdma_resp, | ||
293 | struct svc_rqst *rqstp, | ||
294 | struct ib_sge *sge, | ||
295 | int sge_count) | ||
296 | { | ||
297 | u32 xfer_len = rqstp->rq_res.len; | ||
298 | int write_len; | ||
299 | int max_write; | ||
300 | u32 xdr_off; | ||
301 | int chunk_no; | ||
302 | int chunk_off; | ||
303 | struct rpcrdma_segment *ch; | ||
304 | struct rpcrdma_write_array *arg_ary; | ||
305 | struct rpcrdma_write_array *res_ary; | ||
306 | int ret; | ||
307 | |||
308 | arg_ary = svc_rdma_get_reply_array(rdma_argp); | ||
309 | if (!arg_ary) | ||
310 | return 0; | ||
311 | /* XXX: need to fix when reply lists occur with read-list and or | ||
312 | * write-list */ | ||
313 | res_ary = (struct rpcrdma_write_array *) | ||
314 | &rdma_resp->rm_body.rm_chunks[2]; | ||
315 | |||
316 | max_write = xprt->sc_max_sge * PAGE_SIZE; | ||
317 | |||
318 | /* xdr offset starts at RPC message */ | ||
319 | for (xdr_off = 0, chunk_no = 0; | ||
320 | xfer_len && chunk_no < arg_ary->wc_nchunks; | ||
321 | chunk_no++) { | ||
322 | u64 rs_offset; | ||
323 | ch = &arg_ary->wc_array[chunk_no].wc_target; | ||
324 | write_len = min(xfer_len, ch->rs_length); | ||
325 | |||
326 | |||
327 | /* Prepare the reply chunk given the length actually | ||
328 | * written */ | ||
329 | rs_offset = get_unaligned(&(ch->rs_offset)); | ||
330 | svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no, | ||
331 | ch->rs_handle, rs_offset, | ||
332 | write_len); | ||
333 | chunk_off = 0; | ||
334 | while (write_len) { | ||
335 | int this_write; | ||
336 | |||
337 | this_write = min(write_len, max_write); | ||
338 | ret = send_write(xprt, rqstp, | ||
339 | ch->rs_handle, | ||
340 | rs_offset + chunk_off, | ||
341 | xdr_off, | ||
342 | this_write, | ||
343 | sge, | ||
344 | sge_count); | ||
345 | if (ret) { | ||
346 | dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", | ||
347 | ret); | ||
348 | return -EIO; | ||
349 | } | ||
350 | chunk_off += this_write; | ||
351 | xdr_off += this_write; | ||
352 | xfer_len -= this_write; | ||
353 | write_len -= this_write; | ||
354 | } | ||
355 | } | ||
356 | /* Update the req with the number of chunks actually used */ | ||
357 | svc_rdma_xdr_encode_reply_array(res_ary, chunk_no); | ||
358 | |||
359 | return rqstp->rq_res.len; | ||
360 | } | ||
361 | |||
362 | /* This function prepares the portion of the RPCRDMA message to be | ||
363 | * sent in the RDMA_SEND. This function is called after data sent via | ||
364 | * RDMA has already been transmitted. There are three cases: | ||
365 | * - The RPCRDMA header, RPC header, and payload are all sent in a | ||
366 | * single RDMA_SEND. This is the "inline" case. | ||
367 | * - The RPCRDMA header and some portion of the RPC header and data | ||
368 | * are sent via this RDMA_SEND and another portion of the data is | ||
369 | * sent via RDMA. | ||
370 | * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC | ||
371 | * header and data are all transmitted via RDMA. | ||
372 | * In all three cases, this function prepares the RPCRDMA header in | ||
373 | * sge[0], the 'type' parameter indicates the type to place in the | ||
374 | * RPCRDMA header, and the 'byte_count' field indicates how much of | ||
375 | * the XDR to include in this RDMA_SEND. | ||
376 | */ | ||
377 | static int send_reply(struct svcxprt_rdma *rdma, | ||
378 | struct svc_rqst *rqstp, | ||
379 | struct page *page, | ||
380 | struct rpcrdma_msg *rdma_resp, | ||
381 | struct svc_rdma_op_ctxt *ctxt, | ||
382 | int sge_count, | ||
383 | int byte_count) | ||
384 | { | ||
385 | struct ib_send_wr send_wr; | ||
386 | int sge_no; | ||
387 | int sge_bytes; | ||
388 | int page_no; | ||
389 | int ret; | ||
390 | |||
391 | /* Prepare the context */ | ||
392 | ctxt->pages[0] = page; | ||
393 | ctxt->count = 1; | ||
394 | |||
395 | /* Prepare the SGE for the RPCRDMA Header */ | ||
396 | ctxt->sge[0].addr = | ||
397 | ib_dma_map_page(rdma->sc_cm_id->device, | ||
398 | page, 0, PAGE_SIZE, DMA_TO_DEVICE); | ||
399 | ctxt->direction = DMA_TO_DEVICE; | ||
400 | ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); | ||
401 | ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey; | ||
402 | |||
403 | /* Determine how many of our SGE are to be transmitted */ | ||
404 | for (sge_no = 1; byte_count && sge_no < sge_count; sge_no++) { | ||
405 | sge_bytes = min((size_t)ctxt->sge[sge_no].length, | ||
406 | (size_t)byte_count); | ||
407 | byte_count -= sge_bytes; | ||
408 | } | ||
409 | BUG_ON(byte_count != 0); | ||
410 | |||
411 | /* Save all respages in the ctxt and remove them from the | ||
412 | * respages array. They are our pages until the I/O | ||
413 | * completes. | ||
414 | */ | ||
415 | for (page_no = 0; page_no < rqstp->rq_resused; page_no++) { | ||
416 | ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; | ||
417 | ctxt->count++; | ||
418 | rqstp->rq_respages[page_no] = NULL; | ||
419 | } | ||
420 | |||
421 | BUG_ON(sge_no > rdma->sc_max_sge); | ||
422 | memset(&send_wr, 0, sizeof send_wr); | ||
423 | ctxt->wr_op = IB_WR_SEND; | ||
424 | send_wr.wr_id = (unsigned long)ctxt; | ||
425 | send_wr.sg_list = ctxt->sge; | ||
426 | send_wr.num_sge = sge_no; | ||
427 | send_wr.opcode = IB_WR_SEND; | ||
428 | send_wr.send_flags = IB_SEND_SIGNALED; | ||
429 | |||
430 | ret = svc_rdma_send(rdma, &send_wr); | ||
431 | if (ret) | ||
432 | svc_rdma_put_context(ctxt, 1); | ||
433 | |||
434 | return ret; | ||
435 | } | ||
436 | |||
/*
 * Reply-header preparation hook for the RDMA transport.
 * The body is empty: nothing is done to rqstp here.
 */
void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
{
	/* nothing to do */
}
440 | |||
441 | /* | ||
442 | * Return the start of an xdr buffer. | ||
443 | */ | ||
444 | static void *xdr_start(struct xdr_buf *xdr) | ||
445 | { | ||
446 | return xdr->head[0].iov_base - | ||
447 | (xdr->len - | ||
448 | xdr->page_len - | ||
449 | xdr->tail[0].iov_len - | ||
450 | xdr->head[0].iov_len); | ||
451 | } | ||
452 | |||
453 | int svc_rdma_sendto(struct svc_rqst *rqstp) | ||
454 | { | ||
455 | struct svc_xprt *xprt = rqstp->rq_xprt; | ||
456 | struct svcxprt_rdma *rdma = | ||
457 | container_of(xprt, struct svcxprt_rdma, sc_xprt); | ||
458 | struct rpcrdma_msg *rdma_argp; | ||
459 | struct rpcrdma_msg *rdma_resp; | ||
460 | struct rpcrdma_write_array *reply_ary; | ||
461 | enum rpcrdma_proc reply_type; | ||
462 | int ret; | ||
463 | int inline_bytes; | ||
464 | struct ib_sge *sge; | ||
465 | int sge_count = 0; | ||
466 | struct page *res_page; | ||
467 | struct svc_rdma_op_ctxt *ctxt; | ||
468 | |||
469 | dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); | ||
470 | |||
471 | /* Get the RDMA request header. */ | ||
472 | rdma_argp = xdr_start(&rqstp->rq_arg); | ||
473 | |||
474 | /* Build an SGE for the XDR */ | ||
475 | ctxt = svc_rdma_get_context(rdma); | ||
476 | ctxt->direction = DMA_TO_DEVICE; | ||
477 | sge = xdr_to_sge(rdma, &rqstp->rq_res, ctxt->sge, &sge_count); | ||
478 | |||
479 | inline_bytes = rqstp->rq_res.len; | ||
480 | |||
481 | /* Create the RDMA response header */ | ||
482 | res_page = svc_rdma_get_page(); | ||
483 | rdma_resp = page_address(res_page); | ||
484 | reply_ary = svc_rdma_get_reply_array(rdma_argp); | ||
485 | if (reply_ary) | ||
486 | reply_type = RDMA_NOMSG; | ||
487 | else | ||
488 | reply_type = RDMA_MSG; | ||
489 | svc_rdma_xdr_encode_reply_header(rdma, rdma_argp, | ||
490 | rdma_resp, reply_type); | ||
491 | |||
492 | /* Send any write-chunk data and build resp write-list */ | ||
493 | ret = send_write_chunks(rdma, rdma_argp, rdma_resp, | ||
494 | rqstp, sge, sge_count); | ||
495 | if (ret < 0) { | ||
496 | printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n", | ||
497 | ret); | ||
498 | goto error; | ||
499 | } | ||
500 | inline_bytes -= ret; | ||
501 | |||
502 | /* Send any reply-list data and update resp reply-list */ | ||
503 | ret = send_reply_chunks(rdma, rdma_argp, rdma_resp, | ||
504 | rqstp, sge, sge_count); | ||
505 | if (ret < 0) { | ||
506 | printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n", | ||
507 | ret); | ||
508 | goto error; | ||
509 | } | ||
510 | inline_bytes -= ret; | ||
511 | |||
512 | ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, sge_count, | ||
513 | inline_bytes); | ||
514 | dprintk("svcrdma: send_reply returns %d\n", ret); | ||
515 | return ret; | ||
516 | error: | ||
517 | svc_rdma_put_context(ctxt, 0); | ||
518 | put_page(res_page); | ||
519 | return ret; | ||
520 | } | ||
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c new file mode 100644 index 000000000000..f09444c451bc --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c | |||
@@ -0,0 +1,1080 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the BSD-type | ||
8 | * license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or without | ||
11 | * modification, are permitted provided that the following conditions | ||
12 | * are met: | ||
13 | * | ||
14 | * Redistributions of source code must retain the above copyright | ||
15 | * notice, this list of conditions and the following disclaimer. | ||
16 | * | ||
17 | * Redistributions in binary form must reproduce the above | ||
18 | * copyright notice, this list of conditions and the following | ||
19 | * disclaimer in the documentation and/or other materials provided | ||
20 | * with the distribution. | ||
21 | * | ||
22 | * Neither the name of the Network Appliance, Inc. nor the names of | ||
23 | * its contributors may be used to endorse or promote products | ||
24 | * derived from this software without specific prior written | ||
25 | * permission. | ||
26 | * | ||
27 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
28 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
29 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
30 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | ||
31 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
32 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | ||
33 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | ||
34 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | ||
35 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
36 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
37 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
38 | * | ||
39 | * Author: Tom Tucker <tom@opengridcomputing.com> | ||
40 | */ | ||
41 | |||
42 | #include <linux/sunrpc/svc_xprt.h> | ||
43 | #include <linux/sunrpc/debug.h> | ||
44 | #include <linux/sunrpc/rpc_rdma.h> | ||
45 | #include <linux/spinlock.h> | ||
46 | #include <rdma/ib_verbs.h> | ||
47 | #include <rdma/rdma_cm.h> | ||
48 | #include <linux/sunrpc/svc_rdma.h> | ||
49 | |||
50 | #define RPCDBG_FACILITY RPCDBG_SVCXPRT | ||
51 | |||
52 | static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, | ||
53 | struct sockaddr *sa, int salen, | ||
54 | int flags); | ||
55 | static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt); | ||
56 | static void svc_rdma_release_rqst(struct svc_rqst *); | ||
57 | static void rdma_destroy_xprt(struct svcxprt_rdma *xprt); | ||
58 | static void dto_tasklet_func(unsigned long data); | ||
59 | static void svc_rdma_detach(struct svc_xprt *xprt); | ||
60 | static void svc_rdma_free(struct svc_xprt *xprt); | ||
61 | static int svc_rdma_has_wspace(struct svc_xprt *xprt); | ||
62 | static void rq_cq_reap(struct svcxprt_rdma *xprt); | ||
63 | static void sq_cq_reap(struct svcxprt_rdma *xprt); | ||
64 | |||
65 | DECLARE_TASKLET(dto_tasklet, dto_tasklet_func, 0UL); | ||
66 | static DEFINE_SPINLOCK(dto_lock); | ||
67 | static LIST_HEAD(dto_xprt_q); | ||
68 | |||
69 | static struct svc_xprt_ops svc_rdma_ops = { | ||
70 | .xpo_create = svc_rdma_create, | ||
71 | .xpo_recvfrom = svc_rdma_recvfrom, | ||
72 | .xpo_sendto = svc_rdma_sendto, | ||
73 | .xpo_release_rqst = svc_rdma_release_rqst, | ||
74 | .xpo_detach = svc_rdma_detach, | ||
75 | .xpo_free = svc_rdma_free, | ||
76 | .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, | ||
77 | .xpo_has_wspace = svc_rdma_has_wspace, | ||
78 | .xpo_accept = svc_rdma_accept, | ||
79 | }; | ||
80 | |||
81 | struct svc_xprt_class svc_rdma_class = { | ||
82 | .xcl_name = "rdma", | ||
83 | .xcl_owner = THIS_MODULE, | ||
84 | .xcl_ops = &svc_rdma_ops, | ||
85 | .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, | ||
86 | }; | ||
87 | |||
88 | static int rdma_bump_context_cache(struct svcxprt_rdma *xprt) | ||
89 | { | ||
90 | int target; | ||
91 | int at_least_one = 0; | ||
92 | struct svc_rdma_op_ctxt *ctxt; | ||
93 | |||
94 | target = min(xprt->sc_ctxt_cnt + xprt->sc_ctxt_bump, | ||
95 | xprt->sc_ctxt_max); | ||
96 | |||
97 | spin_lock_bh(&xprt->sc_ctxt_lock); | ||
98 | while (xprt->sc_ctxt_cnt < target) { | ||
99 | xprt->sc_ctxt_cnt++; | ||
100 | spin_unlock_bh(&xprt->sc_ctxt_lock); | ||
101 | |||
102 | ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); | ||
103 | |||
104 | spin_lock_bh(&xprt->sc_ctxt_lock); | ||
105 | if (ctxt) { | ||
106 | at_least_one = 1; | ||
107 | ctxt->next = xprt->sc_ctxt_head; | ||
108 | xprt->sc_ctxt_head = ctxt; | ||
109 | } else { | ||
110 | /* kmalloc failed...give up for now */ | ||
111 | xprt->sc_ctxt_cnt--; | ||
112 | break; | ||
113 | } | ||
114 | } | ||
115 | spin_unlock_bh(&xprt->sc_ctxt_lock); | ||
116 | dprintk("svcrdma: sc_ctxt_max=%d, sc_ctxt_cnt=%d\n", | ||
117 | xprt->sc_ctxt_max, xprt->sc_ctxt_cnt); | ||
118 | return at_least_one; | ||
119 | } | ||
120 | |||
121 | struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) | ||
122 | { | ||
123 | struct svc_rdma_op_ctxt *ctxt; | ||
124 | |||
125 | while (1) { | ||
126 | spin_lock_bh(&xprt->sc_ctxt_lock); | ||
127 | if (unlikely(xprt->sc_ctxt_head == NULL)) { | ||
128 | /* Try to bump my cache. */ | ||
129 | spin_unlock_bh(&xprt->sc_ctxt_lock); | ||
130 | |||
131 | if (rdma_bump_context_cache(xprt)) | ||
132 | continue; | ||
133 | |||
134 | printk(KERN_INFO "svcrdma: sleeping waiting for " | ||
135 | "context memory on xprt=%p\n", | ||
136 | xprt); | ||
137 | schedule_timeout_uninterruptible(msecs_to_jiffies(500)); | ||
138 | continue; | ||
139 | } | ||
140 | ctxt = xprt->sc_ctxt_head; | ||
141 | xprt->sc_ctxt_head = ctxt->next; | ||
142 | spin_unlock_bh(&xprt->sc_ctxt_lock); | ||
143 | ctxt->xprt = xprt; | ||
144 | INIT_LIST_HEAD(&ctxt->dto_q); | ||
145 | ctxt->count = 0; | ||
146 | break; | ||
147 | } | ||
148 | return ctxt; | ||
149 | } | ||
150 | |||
151 | void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) | ||
152 | { | ||
153 | struct svcxprt_rdma *xprt; | ||
154 | int i; | ||
155 | |||
156 | BUG_ON(!ctxt); | ||
157 | xprt = ctxt->xprt; | ||
158 | if (free_pages) | ||
159 | for (i = 0; i < ctxt->count; i++) | ||
160 | put_page(ctxt->pages[i]); | ||
161 | |||
162 | for (i = 0; i < ctxt->count; i++) | ||
163 | dma_unmap_single(xprt->sc_cm_id->device->dma_device, | ||
164 | ctxt->sge[i].addr, | ||
165 | ctxt->sge[i].length, | ||
166 | ctxt->direction); | ||
167 | spin_lock_bh(&xprt->sc_ctxt_lock); | ||
168 | ctxt->next = xprt->sc_ctxt_head; | ||
169 | xprt->sc_ctxt_head = ctxt; | ||
170 | spin_unlock_bh(&xprt->sc_ctxt_lock); | ||
171 | } | ||
172 | |||
173 | /* ib_cq event handler */ | ||
174 | static void cq_event_handler(struct ib_event *event, void *context) | ||
175 | { | ||
176 | struct svc_xprt *xprt = context; | ||
177 | dprintk("svcrdma: received CQ event id=%d, context=%p\n", | ||
178 | event->event, context); | ||
179 | set_bit(XPT_CLOSE, &xprt->xpt_flags); | ||
180 | } | ||
181 | |||
182 | /* QP event handler */ | ||
183 | static void qp_event_handler(struct ib_event *event, void *context) | ||
184 | { | ||
185 | struct svc_xprt *xprt = context; | ||
186 | |||
187 | switch (event->event) { | ||
188 | /* These are considered benign events */ | ||
189 | case IB_EVENT_PATH_MIG: | ||
190 | case IB_EVENT_COMM_EST: | ||
191 | case IB_EVENT_SQ_DRAINED: | ||
192 | case IB_EVENT_QP_LAST_WQE_REACHED: | ||
193 | dprintk("svcrdma: QP event %d received for QP=%p\n", | ||
194 | event->event, event->element.qp); | ||
195 | break; | ||
196 | /* These are considered fatal events */ | ||
197 | case IB_EVENT_PATH_MIG_ERR: | ||
198 | case IB_EVENT_QP_FATAL: | ||
199 | case IB_EVENT_QP_REQ_ERR: | ||
200 | case IB_EVENT_QP_ACCESS_ERR: | ||
201 | case IB_EVENT_DEVICE_FATAL: | ||
202 | default: | ||
203 | dprintk("svcrdma: QP ERROR event %d received for QP=%p, " | ||
204 | "closing transport\n", | ||
205 | event->event, event->element.qp); | ||
206 | set_bit(XPT_CLOSE, &xprt->xpt_flags); | ||
207 | break; | ||
208 | } | ||
209 | } | ||
210 | |||
211 | /* | ||
212 | * Data Transfer Operation Tasklet | ||
213 | * | ||
214 | * Walks a list of transports with I/O pending, removing entries as | ||
215 | * they are added to the server's I/O pending list. Two bits indicate | ||
216 | * if SQ, RQ, or both have I/O pending. The dto_lock is an irqsave | ||
217 | * spinlock that serializes access to the transport list with the RQ | ||
218 | * and SQ interrupt handlers. | ||
219 | */ | ||
220 | static void dto_tasklet_func(unsigned long data) | ||
221 | { | ||
222 | struct svcxprt_rdma *xprt; | ||
223 | unsigned long flags; | ||
224 | |||
225 | spin_lock_irqsave(&dto_lock, flags); | ||
226 | while (!list_empty(&dto_xprt_q)) { | ||
227 | xprt = list_entry(dto_xprt_q.next, | ||
228 | struct svcxprt_rdma, sc_dto_q); | ||
229 | list_del_init(&xprt->sc_dto_q); | ||
230 | spin_unlock_irqrestore(&dto_lock, flags); | ||
231 | |||
232 | if (test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags)) { | ||
233 | ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP); | ||
234 | rq_cq_reap(xprt); | ||
235 | set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); | ||
236 | /* | ||
237 | * If data arrived before established event, | ||
238 | * don't enqueue. This defers RPC I/O until the | ||
239 | * RDMA connection is complete. | ||
240 | */ | ||
241 | if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags)) | ||
242 | svc_xprt_enqueue(&xprt->sc_xprt); | ||
243 | } | ||
244 | |||
245 | if (test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) { | ||
246 | ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); | ||
247 | sq_cq_reap(xprt); | ||
248 | } | ||
249 | |||
250 | spin_lock_irqsave(&dto_lock, flags); | ||
251 | } | ||
252 | spin_unlock_irqrestore(&dto_lock, flags); | ||
253 | } | ||
254 | |||
255 | /* | ||
256 | * Receive Queue Completion Handler | ||
257 | * | ||
258 | * Since an RQ completion handler is called on interrupt context, we | ||
259 | * need to defer the handling of the I/O to a tasklet | ||
260 | */ | ||
261 | static void rq_comp_handler(struct ib_cq *cq, void *cq_context) | ||
262 | { | ||
263 | struct svcxprt_rdma *xprt = cq_context; | ||
264 | unsigned long flags; | ||
265 | |||
266 | /* | ||
267 | * Set the bit regardless of whether or not it's on the list | ||
268 | * because it may be on the list already due to an SQ | ||
269 | * completion. | ||
270 | */ | ||
271 | set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags); | ||
272 | |||
273 | /* | ||
274 | * If this transport is not already on the DTO transport queue, | ||
275 | * add it | ||
276 | */ | ||
277 | spin_lock_irqsave(&dto_lock, flags); | ||
278 | if (list_empty(&xprt->sc_dto_q)) | ||
279 | list_add_tail(&xprt->sc_dto_q, &dto_xprt_q); | ||
280 | spin_unlock_irqrestore(&dto_lock, flags); | ||
281 | |||
282 | /* Tasklet does all the work to avoid irqsave locks. */ | ||
283 | tasklet_schedule(&dto_tasklet); | ||
284 | } | ||
285 | |||
286 | /* | ||
287 | * rq_cq_reap - Process the RQ CQ. | ||
288 | * | ||
289 | * Take all completing WC off the CQE and enqueue the associated DTO | ||
290 | * context on the dto_q for the transport. | ||
291 | */ | ||
292 | static void rq_cq_reap(struct svcxprt_rdma *xprt) | ||
293 | { | ||
294 | int ret; | ||
295 | struct ib_wc wc; | ||
296 | struct svc_rdma_op_ctxt *ctxt = NULL; | ||
297 | |||
298 | atomic_inc(&rdma_stat_rq_poll); | ||
299 | |||
300 | spin_lock_bh(&xprt->sc_rq_dto_lock); | ||
301 | while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) { | ||
302 | ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; | ||
303 | ctxt->wc_status = wc.status; | ||
304 | ctxt->byte_len = wc.byte_len; | ||
305 | if (wc.status != IB_WC_SUCCESS) { | ||
306 | /* Close the transport */ | ||
307 | set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); | ||
308 | svc_rdma_put_context(ctxt, 1); | ||
309 | continue; | ||
310 | } | ||
311 | list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q); | ||
312 | } | ||
313 | spin_unlock_bh(&xprt->sc_rq_dto_lock); | ||
314 | |||
315 | if (ctxt) | ||
316 | atomic_inc(&rdma_stat_rq_prod); | ||
317 | } | ||
318 | |||
319 | /* | ||
320 | * Send Queue Completion Handler - potentially called on interrupt context. | ||
321 | */ | ||
322 | static void sq_cq_reap(struct svcxprt_rdma *xprt) | ||
323 | { | ||
324 | struct svc_rdma_op_ctxt *ctxt = NULL; | ||
325 | struct ib_wc wc; | ||
326 | struct ib_cq *cq = xprt->sc_sq_cq; | ||
327 | int ret; | ||
328 | |||
329 | atomic_inc(&rdma_stat_sq_poll); | ||
330 | while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { | ||
331 | ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; | ||
332 | xprt = ctxt->xprt; | ||
333 | |||
334 | if (wc.status != IB_WC_SUCCESS) | ||
335 | /* Close the transport */ | ||
336 | set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); | ||
337 | |||
338 | /* Decrement used SQ WR count */ | ||
339 | atomic_dec(&xprt->sc_sq_count); | ||
340 | wake_up(&xprt->sc_send_wait); | ||
341 | |||
342 | switch (ctxt->wr_op) { | ||
343 | case IB_WR_SEND: | ||
344 | case IB_WR_RDMA_WRITE: | ||
345 | svc_rdma_put_context(ctxt, 1); | ||
346 | break; | ||
347 | |||
348 | case IB_WR_RDMA_READ: | ||
349 | if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { | ||
350 | set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); | ||
351 | set_bit(RDMACTXT_F_READ_DONE, &ctxt->flags); | ||
352 | spin_lock_bh(&xprt->sc_read_complete_lock); | ||
353 | list_add_tail(&ctxt->dto_q, | ||
354 | &xprt->sc_read_complete_q); | ||
355 | spin_unlock_bh(&xprt->sc_read_complete_lock); | ||
356 | svc_xprt_enqueue(&xprt->sc_xprt); | ||
357 | } | ||
358 | break; | ||
359 | |||
360 | default: | ||
361 | printk(KERN_ERR "svcrdma: unexpected completion type, " | ||
362 | "opcode=%d, status=%d\n", | ||
363 | wc.opcode, wc.status); | ||
364 | break; | ||
365 | } | ||
366 | } | ||
367 | |||
368 | if (ctxt) | ||
369 | atomic_inc(&rdma_stat_sq_prod); | ||
370 | } | ||
371 | |||
372 | static void sq_comp_handler(struct ib_cq *cq, void *cq_context) | ||
373 | { | ||
374 | struct svcxprt_rdma *xprt = cq_context; | ||
375 | unsigned long flags; | ||
376 | |||
377 | /* | ||
378 | * Set the bit regardless of whether or not it's on the list | ||
379 | * because it may be on the list already due to an RQ | ||
380 | * completion. | ||
381 | */ | ||
382 | set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags); | ||
383 | |||
384 | /* | ||
385 | * If this transport is not already on the DTO transport queue, | ||
386 | * add it | ||
387 | */ | ||
388 | spin_lock_irqsave(&dto_lock, flags); | ||
389 | if (list_empty(&xprt->sc_dto_q)) | ||
390 | list_add_tail(&xprt->sc_dto_q, &dto_xprt_q); | ||
391 | spin_unlock_irqrestore(&dto_lock, flags); | ||
392 | |||
393 | /* Tasklet does all the work to avoid irqsave locks. */ | ||
394 | tasklet_schedule(&dto_tasklet); | ||
395 | } | ||
396 | |||
397 | static void create_context_cache(struct svcxprt_rdma *xprt, | ||
398 | int ctxt_count, int ctxt_bump, int ctxt_max) | ||
399 | { | ||
400 | struct svc_rdma_op_ctxt *ctxt; | ||
401 | int i; | ||
402 | |||
403 | xprt->sc_ctxt_max = ctxt_max; | ||
404 | xprt->sc_ctxt_bump = ctxt_bump; | ||
405 | xprt->sc_ctxt_cnt = 0; | ||
406 | xprt->sc_ctxt_head = NULL; | ||
407 | for (i = 0; i < ctxt_count; i++) { | ||
408 | ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); | ||
409 | if (ctxt) { | ||
410 | ctxt->next = xprt->sc_ctxt_head; | ||
411 | xprt->sc_ctxt_head = ctxt; | ||
412 | xprt->sc_ctxt_cnt++; | ||
413 | } | ||
414 | } | ||
415 | } | ||
416 | |||
417 | static void destroy_context_cache(struct svc_rdma_op_ctxt *ctxt) | ||
418 | { | ||
419 | struct svc_rdma_op_ctxt *next; | ||
420 | if (!ctxt) | ||
421 | return; | ||
422 | |||
423 | do { | ||
424 | next = ctxt->next; | ||
425 | kfree(ctxt); | ||
426 | ctxt = next; | ||
427 | } while (next); | ||
428 | } | ||
429 | |||
430 | static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, | ||
431 | int listener) | ||
432 | { | ||
433 | struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL); | ||
434 | |||
435 | if (!cma_xprt) | ||
436 | return NULL; | ||
437 | svc_xprt_init(&svc_rdma_class, &cma_xprt->sc_xprt, serv); | ||
438 | INIT_LIST_HEAD(&cma_xprt->sc_accept_q); | ||
439 | INIT_LIST_HEAD(&cma_xprt->sc_dto_q); | ||
440 | INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); | ||
441 | INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); | ||
442 | init_waitqueue_head(&cma_xprt->sc_send_wait); | ||
443 | |||
444 | spin_lock_init(&cma_xprt->sc_lock); | ||
445 | spin_lock_init(&cma_xprt->sc_read_complete_lock); | ||
446 | spin_lock_init(&cma_xprt->sc_ctxt_lock); | ||
447 | spin_lock_init(&cma_xprt->sc_rq_dto_lock); | ||
448 | |||
449 | cma_xprt->sc_ord = svcrdma_ord; | ||
450 | |||
451 | cma_xprt->sc_max_req_size = svcrdma_max_req_size; | ||
452 | cma_xprt->sc_max_requests = svcrdma_max_requests; | ||
453 | cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT; | ||
454 | atomic_set(&cma_xprt->sc_sq_count, 0); | ||
455 | |||
456 | if (!listener) { | ||
457 | int reqs = cma_xprt->sc_max_requests; | ||
458 | create_context_cache(cma_xprt, | ||
459 | reqs << 1, /* starting size */ | ||
460 | reqs, /* bump amount */ | ||
461 | reqs + | ||
462 | cma_xprt->sc_sq_depth + | ||
463 | RPCRDMA_MAX_THREADS + 1); /* max */ | ||
464 | if (!cma_xprt->sc_ctxt_head) { | ||
465 | kfree(cma_xprt); | ||
466 | return NULL; | ||
467 | } | ||
468 | clear_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); | ||
469 | } else | ||
470 | set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); | ||
471 | |||
472 | return cma_xprt; | ||
473 | } | ||
474 | |||
475 | struct page *svc_rdma_get_page(void) | ||
476 | { | ||
477 | struct page *page; | ||
478 | |||
479 | while ((page = alloc_page(GFP_KERNEL)) == NULL) { | ||
480 | /* If we can't get memory, wait a bit and try again */ | ||
481 | printk(KERN_INFO "svcrdma: out of memory...retrying in 1000 " | ||
482 | "jiffies.\n"); | ||
483 | schedule_timeout_uninterruptible(msecs_to_jiffies(1000)); | ||
484 | } | ||
485 | return page; | ||
486 | } | ||
487 | |||
488 | int svc_rdma_post_recv(struct svcxprt_rdma *xprt) | ||
489 | { | ||
490 | struct ib_recv_wr recv_wr, *bad_recv_wr; | ||
491 | struct svc_rdma_op_ctxt *ctxt; | ||
492 | struct page *page; | ||
493 | unsigned long pa; | ||
494 | int sge_no; | ||
495 | int buflen; | ||
496 | int ret; | ||
497 | |||
498 | ctxt = svc_rdma_get_context(xprt); | ||
499 | buflen = 0; | ||
500 | ctxt->direction = DMA_FROM_DEVICE; | ||
501 | for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) { | ||
502 | BUG_ON(sge_no >= xprt->sc_max_sge); | ||
503 | page = svc_rdma_get_page(); | ||
504 | ctxt->pages[sge_no] = page; | ||
505 | pa = ib_dma_map_page(xprt->sc_cm_id->device, | ||
506 | page, 0, PAGE_SIZE, | ||
507 | DMA_FROM_DEVICE); | ||
508 | ctxt->sge[sge_no].addr = pa; | ||
509 | ctxt->sge[sge_no].length = PAGE_SIZE; | ||
510 | ctxt->sge[sge_no].lkey = xprt->sc_phys_mr->lkey; | ||
511 | buflen += PAGE_SIZE; | ||
512 | } | ||
513 | ctxt->count = sge_no; | ||
514 | recv_wr.next = NULL; | ||
515 | recv_wr.sg_list = &ctxt->sge[0]; | ||
516 | recv_wr.num_sge = ctxt->count; | ||
517 | recv_wr.wr_id = (u64)(unsigned long)ctxt; | ||
518 | |||
519 | ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr); | ||
520 | return ret; | ||
521 | } | ||
522 | |||
523 | /* | ||
524 | * This function handles the CONNECT_REQUEST event on a listening | ||
525 | * endpoint. It is passed the cma_id for the _new_ connection. The context in | ||
526 | * this cma_id is inherited from the listening cma_id and is the svc_xprt | ||
527 | * structure for the listening endpoint. | ||
528 | * | ||
529 | * This function creates a new xprt for the new connection and enqueues it on | ||
530 | * the accept queue for the listent xprt. When the listen thread is kicked, it | ||
531 | * will call the recvfrom method on the listen xprt which will accept the new | ||
532 | * connection. | ||
533 | */ | ||
534 | static void handle_connect_req(struct rdma_cm_id *new_cma_id) | ||
535 | { | ||
536 | struct svcxprt_rdma *listen_xprt = new_cma_id->context; | ||
537 | struct svcxprt_rdma *newxprt; | ||
538 | |||
539 | /* Create a new transport */ | ||
540 | newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0); | ||
541 | if (!newxprt) { | ||
542 | dprintk("svcrdma: failed to create new transport\n"); | ||
543 | return; | ||
544 | } | ||
545 | newxprt->sc_cm_id = new_cma_id; | ||
546 | new_cma_id->context = newxprt; | ||
547 | dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n", | ||
548 | newxprt, newxprt->sc_cm_id, listen_xprt); | ||
549 | |||
550 | /* | ||
551 | * Enqueue the new transport on the accept queue of the listening | ||
552 | * transport | ||
553 | */ | ||
554 | spin_lock_bh(&listen_xprt->sc_lock); | ||
555 | list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q); | ||
556 | spin_unlock_bh(&listen_xprt->sc_lock); | ||
557 | |||
558 | /* | ||
559 | * Can't use svc_xprt_received here because we are not on a | ||
560 | * rqstp thread | ||
561 | */ | ||
562 | set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags); | ||
563 | svc_xprt_enqueue(&listen_xprt->sc_xprt); | ||
564 | } | ||
565 | |||
566 | /* | ||
567 | * Handles events generated on the listening endpoint. These events will be | ||
568 | * either be incoming connect requests or adapter removal events. | ||
569 | */ | ||
570 | static int rdma_listen_handler(struct rdma_cm_id *cma_id, | ||
571 | struct rdma_cm_event *event) | ||
572 | { | ||
573 | struct svcxprt_rdma *xprt = cma_id->context; | ||
574 | int ret = 0; | ||
575 | |||
576 | switch (event->event) { | ||
577 | case RDMA_CM_EVENT_CONNECT_REQUEST: | ||
578 | dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " | ||
579 | "event=%d\n", cma_id, cma_id->context, event->event); | ||
580 | handle_connect_req(cma_id); | ||
581 | break; | ||
582 | |||
583 | case RDMA_CM_EVENT_ESTABLISHED: | ||
584 | /* Accept complete */ | ||
585 | dprintk("svcrdma: Connection completed on LISTEN xprt=%p, " | ||
586 | "cm_id=%p\n", xprt, cma_id); | ||
587 | break; | ||
588 | |||
589 | case RDMA_CM_EVENT_DEVICE_REMOVAL: | ||
590 | dprintk("svcrdma: Device removal xprt=%p, cm_id=%p\n", | ||
591 | xprt, cma_id); | ||
592 | if (xprt) | ||
593 | set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); | ||
594 | break; | ||
595 | |||
596 | default: | ||
597 | dprintk("svcrdma: Unexpected event on listening endpoint %p, " | ||
598 | "event=%d\n", cma_id, event->event); | ||
599 | break; | ||
600 | } | ||
601 | |||
602 | return ret; | ||
603 | } | ||
604 | |||
605 | static int rdma_cma_handler(struct rdma_cm_id *cma_id, | ||
606 | struct rdma_cm_event *event) | ||
607 | { | ||
608 | struct svc_xprt *xprt = cma_id->context; | ||
609 | struct svcxprt_rdma *rdma = | ||
610 | container_of(xprt, struct svcxprt_rdma, sc_xprt); | ||
611 | switch (event->event) { | ||
612 | case RDMA_CM_EVENT_ESTABLISHED: | ||
613 | /* Accept complete */ | ||
614 | dprintk("svcrdma: Connection completed on DTO xprt=%p, " | ||
615 | "cm_id=%p\n", xprt, cma_id); | ||
616 | clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags); | ||
617 | svc_xprt_enqueue(xprt); | ||
618 | break; | ||
619 | case RDMA_CM_EVENT_DISCONNECTED: | ||
620 | dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n", | ||
621 | xprt, cma_id); | ||
622 | if (xprt) { | ||
623 | set_bit(XPT_CLOSE, &xprt->xpt_flags); | ||
624 | svc_xprt_enqueue(xprt); | ||
625 | } | ||
626 | break; | ||
627 | case RDMA_CM_EVENT_DEVICE_REMOVAL: | ||
628 | dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, " | ||
629 | "event=%d\n", cma_id, xprt, event->event); | ||
630 | if (xprt) { | ||
631 | set_bit(XPT_CLOSE, &xprt->xpt_flags); | ||
632 | svc_xprt_enqueue(xprt); | ||
633 | } | ||
634 | break; | ||
635 | default: | ||
636 | dprintk("svcrdma: Unexpected event on DTO endpoint %p, " | ||
637 | "event=%d\n", cma_id, event->event); | ||
638 | break; | ||
639 | } | ||
640 | return 0; | ||
641 | } | ||
642 | |||
643 | /* | ||
644 | * Create a listening RDMA service endpoint. | ||
645 | */ | ||
646 | static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, | ||
647 | struct sockaddr *sa, int salen, | ||
648 | int flags) | ||
649 | { | ||
650 | struct rdma_cm_id *listen_id; | ||
651 | struct svcxprt_rdma *cma_xprt; | ||
652 | struct svc_xprt *xprt; | ||
653 | int ret; | ||
654 | |||
655 | dprintk("svcrdma: Creating RDMA socket\n"); | ||
656 | |||
657 | cma_xprt = rdma_create_xprt(serv, 1); | ||
658 | if (!cma_xprt) | ||
659 | return ERR_PTR(ENOMEM); | ||
660 | xprt = &cma_xprt->sc_xprt; | ||
661 | |||
662 | listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP); | ||
663 | if (IS_ERR(listen_id)) { | ||
664 | rdma_destroy_xprt(cma_xprt); | ||
665 | dprintk("svcrdma: rdma_create_id failed = %ld\n", | ||
666 | PTR_ERR(listen_id)); | ||
667 | return (void *)listen_id; | ||
668 | } | ||
669 | ret = rdma_bind_addr(listen_id, sa); | ||
670 | if (ret) { | ||
671 | rdma_destroy_xprt(cma_xprt); | ||
672 | rdma_destroy_id(listen_id); | ||
673 | dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret); | ||
674 | return ERR_PTR(ret); | ||
675 | } | ||
676 | cma_xprt->sc_cm_id = listen_id; | ||
677 | |||
678 | ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG); | ||
679 | if (ret) { | ||
680 | rdma_destroy_id(listen_id); | ||
681 | rdma_destroy_xprt(cma_xprt); | ||
682 | dprintk("svcrdma: rdma_listen failed = %d\n", ret); | ||
683 | } | ||
684 | |||
685 | /* | ||
686 | * We need to use the address from the cm_id in case the | ||
687 | * caller specified 0 for the port number. | ||
688 | */ | ||
689 | sa = (struct sockaddr *)&cma_xprt->sc_cm_id->route.addr.src_addr; | ||
690 | svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen); | ||
691 | |||
692 | return &cma_xprt->sc_xprt; | ||
693 | } | ||
694 | |||
695 | /* | ||
696 | * This is the xpo_recvfrom function for listening endpoints. Its | ||
697 | * purpose is to accept incoming connections. The CMA callback handler | ||
698 | * has already created a new transport and attached it to the new CMA | ||
699 | * ID. | ||
700 | * | ||
701 | * There is a queue of pending connections hung on the listening | ||
702 | * transport. This queue contains the new svc_xprt structure. This | ||
703 | * function takes svc_xprt structures off the accept_q and completes | ||
704 | * the connection. | ||
705 | */ | ||
706 | static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) | ||
707 | { | ||
708 | struct svcxprt_rdma *listen_rdma; | ||
709 | struct svcxprt_rdma *newxprt = NULL; | ||
710 | struct rdma_conn_param conn_param; | ||
711 | struct ib_qp_init_attr qp_attr; | ||
712 | struct ib_device_attr devattr; | ||
713 | struct sockaddr *sa; | ||
714 | int ret; | ||
715 | int i; | ||
716 | |||
717 | listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); | ||
718 | clear_bit(XPT_CONN, &xprt->xpt_flags); | ||
719 | /* Get the next entry off the accept list */ | ||
720 | spin_lock_bh(&listen_rdma->sc_lock); | ||
721 | if (!list_empty(&listen_rdma->sc_accept_q)) { | ||
722 | newxprt = list_entry(listen_rdma->sc_accept_q.next, | ||
723 | struct svcxprt_rdma, sc_accept_q); | ||
724 | list_del_init(&newxprt->sc_accept_q); | ||
725 | } | ||
726 | if (!list_empty(&listen_rdma->sc_accept_q)) | ||
727 | set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags); | ||
728 | spin_unlock_bh(&listen_rdma->sc_lock); | ||
729 | if (!newxprt) | ||
730 | return NULL; | ||
731 | |||
732 | dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n", | ||
733 | newxprt, newxprt->sc_cm_id); | ||
734 | |||
735 | ret = ib_query_device(newxprt->sc_cm_id->device, &devattr); | ||
736 | if (ret) { | ||
737 | dprintk("svcrdma: could not query device attributes on " | ||
738 | "device %p, rc=%d\n", newxprt->sc_cm_id->device, ret); | ||
739 | goto errout; | ||
740 | } | ||
741 | |||
742 | /* Qualify the transport resource defaults with the | ||
743 | * capabilities of this particular device */ | ||
744 | newxprt->sc_max_sge = min((size_t)devattr.max_sge, | ||
745 | (size_t)RPCSVC_MAXPAGES); | ||
746 | newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr, | ||
747 | (size_t)svcrdma_max_requests); | ||
748 | newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests; | ||
749 | |||
750 | newxprt->sc_ord = min((size_t)devattr.max_qp_rd_atom, | ||
751 | (size_t)svcrdma_ord); | ||
752 | |||
753 | newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device); | ||
754 | if (IS_ERR(newxprt->sc_pd)) { | ||
755 | dprintk("svcrdma: error creating PD for connect request\n"); | ||
756 | goto errout; | ||
757 | } | ||
758 | newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device, | ||
759 | sq_comp_handler, | ||
760 | cq_event_handler, | ||
761 | newxprt, | ||
762 | newxprt->sc_sq_depth, | ||
763 | 0); | ||
764 | if (IS_ERR(newxprt->sc_sq_cq)) { | ||
765 | dprintk("svcrdma: error creating SQ CQ for connect request\n"); | ||
766 | goto errout; | ||
767 | } | ||
768 | newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device, | ||
769 | rq_comp_handler, | ||
770 | cq_event_handler, | ||
771 | newxprt, | ||
772 | newxprt->sc_max_requests, | ||
773 | 0); | ||
774 | if (IS_ERR(newxprt->sc_rq_cq)) { | ||
775 | dprintk("svcrdma: error creating RQ CQ for connect request\n"); | ||
776 | goto errout; | ||
777 | } | ||
778 | |||
779 | memset(&qp_attr, 0, sizeof qp_attr); | ||
780 | qp_attr.event_handler = qp_event_handler; | ||
781 | qp_attr.qp_context = &newxprt->sc_xprt; | ||
782 | qp_attr.cap.max_send_wr = newxprt->sc_sq_depth; | ||
783 | qp_attr.cap.max_recv_wr = newxprt->sc_max_requests; | ||
784 | qp_attr.cap.max_send_sge = newxprt->sc_max_sge; | ||
785 | qp_attr.cap.max_recv_sge = newxprt->sc_max_sge; | ||
786 | qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; | ||
787 | qp_attr.qp_type = IB_QPT_RC; | ||
788 | qp_attr.send_cq = newxprt->sc_sq_cq; | ||
789 | qp_attr.recv_cq = newxprt->sc_rq_cq; | ||
790 | dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n" | ||
791 | " cm_id->device=%p, sc_pd->device=%p\n" | ||
792 | " cap.max_send_wr = %d\n" | ||
793 | " cap.max_recv_wr = %d\n" | ||
794 | " cap.max_send_sge = %d\n" | ||
795 | " cap.max_recv_sge = %d\n", | ||
796 | newxprt->sc_cm_id, newxprt->sc_pd, | ||
797 | newxprt->sc_cm_id->device, newxprt->sc_pd->device, | ||
798 | qp_attr.cap.max_send_wr, | ||
799 | qp_attr.cap.max_recv_wr, | ||
800 | qp_attr.cap.max_send_sge, | ||
801 | qp_attr.cap.max_recv_sge); | ||
802 | |||
803 | ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr); | ||
804 | if (ret) { | ||
805 | /* | ||
806 | * XXX: This is a hack. We need a xx_request_qp interface | ||
807 | * that will adjust the qp_attr's with a best-effort | ||
808 | * number | ||
809 | */ | ||
810 | qp_attr.cap.max_send_sge -= 2; | ||
811 | qp_attr.cap.max_recv_sge -= 2; | ||
812 | ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, | ||
813 | &qp_attr); | ||
814 | if (ret) { | ||
815 | dprintk("svcrdma: failed to create QP, ret=%d\n", ret); | ||
816 | goto errout; | ||
817 | } | ||
818 | newxprt->sc_max_sge = qp_attr.cap.max_send_sge; | ||
819 | newxprt->sc_max_sge = qp_attr.cap.max_recv_sge; | ||
820 | newxprt->sc_sq_depth = qp_attr.cap.max_send_wr; | ||
821 | newxprt->sc_max_requests = qp_attr.cap.max_recv_wr; | ||
822 | } | ||
823 | newxprt->sc_qp = newxprt->sc_cm_id->qp; | ||
824 | |||
825 | /* Register all of physical memory */ | ||
826 | newxprt->sc_phys_mr = ib_get_dma_mr(newxprt->sc_pd, | ||
827 | IB_ACCESS_LOCAL_WRITE | | ||
828 | IB_ACCESS_REMOTE_WRITE); | ||
829 | if (IS_ERR(newxprt->sc_phys_mr)) { | ||
830 | dprintk("svcrdma: Failed to create DMA MR ret=%d\n", ret); | ||
831 | goto errout; | ||
832 | } | ||
833 | |||
834 | /* Post receive buffers */ | ||
835 | for (i = 0; i < newxprt->sc_max_requests; i++) { | ||
836 | ret = svc_rdma_post_recv(newxprt); | ||
837 | if (ret) { | ||
838 | dprintk("svcrdma: failure posting receive buffers\n"); | ||
839 | goto errout; | ||
840 | } | ||
841 | } | ||
842 | |||
843 | /* Swap out the handler */ | ||
844 | newxprt->sc_cm_id->event_handler = rdma_cma_handler; | ||
845 | |||
846 | /* Accept Connection */ | ||
847 | set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags); | ||
848 | memset(&conn_param, 0, sizeof conn_param); | ||
849 | conn_param.responder_resources = 0; | ||
850 | conn_param.initiator_depth = newxprt->sc_ord; | ||
851 | ret = rdma_accept(newxprt->sc_cm_id, &conn_param); | ||
852 | if (ret) { | ||
853 | dprintk("svcrdma: failed to accept new connection, ret=%d\n", | ||
854 | ret); | ||
855 | goto errout; | ||
856 | } | ||
857 | |||
858 | dprintk("svcrdma: new connection %p accepted with the following " | ||
859 | "attributes:\n" | ||
860 | " local_ip : %d.%d.%d.%d\n" | ||
861 | " local_port : %d\n" | ||
862 | " remote_ip : %d.%d.%d.%d\n" | ||
863 | " remote_port : %d\n" | ||
864 | " max_sge : %d\n" | ||
865 | " sq_depth : %d\n" | ||
866 | " max_requests : %d\n" | ||
867 | " ord : %d\n", | ||
868 | newxprt, | ||
869 | NIPQUAD(((struct sockaddr_in *)&newxprt->sc_cm_id-> | ||
870 | route.addr.src_addr)->sin_addr.s_addr), | ||
871 | ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id-> | ||
872 | route.addr.src_addr)->sin_port), | ||
873 | NIPQUAD(((struct sockaddr_in *)&newxprt->sc_cm_id-> | ||
874 | route.addr.dst_addr)->sin_addr.s_addr), | ||
875 | ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id-> | ||
876 | route.addr.dst_addr)->sin_port), | ||
877 | newxprt->sc_max_sge, | ||
878 | newxprt->sc_sq_depth, | ||
879 | newxprt->sc_max_requests, | ||
880 | newxprt->sc_ord); | ||
881 | |||
882 | /* Set the local and remote addresses in the transport */ | ||
883 | sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; | ||
884 | svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa)); | ||
885 | sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; | ||
886 | svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa)); | ||
887 | |||
888 | ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP); | ||
889 | ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP); | ||
890 | return &newxprt->sc_xprt; | ||
891 | |||
892 | errout: | ||
893 | dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret); | ||
894 | rdma_destroy_id(newxprt->sc_cm_id); | ||
895 | rdma_destroy_xprt(newxprt); | ||
896 | return NULL; | ||
897 | } | ||
898 | |||
899 | /* | ||
900 | * Post an RQ WQE to the RQ when the rqst is being released. This | ||
901 | * effectively returns an RQ credit to the client. The rq_xprt_ctxt | ||
902 | * will be null if the request is deferred due to an RDMA_READ or the | ||
903 | * transport had no data ready (EAGAIN). Note that an RPC deferred in | ||
904 | * svc_process will still return the credit, this is because the data | ||
905 | * is copied and no longer consume a WQE/WC. | ||
906 | */ | ||
907 | static void svc_rdma_release_rqst(struct svc_rqst *rqstp) | ||
908 | { | ||
909 | int err; | ||
910 | struct svcxprt_rdma *rdma = | ||
911 | container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt); | ||
912 | if (rqstp->rq_xprt_ctxt) { | ||
913 | BUG_ON(rqstp->rq_xprt_ctxt != rdma); | ||
914 | err = svc_rdma_post_recv(rdma); | ||
915 | if (err) | ||
916 | dprintk("svcrdma: failed to post an RQ WQE error=%d\n", | ||
917 | err); | ||
918 | } | ||
919 | rqstp->rq_xprt_ctxt = NULL; | ||
920 | } | ||
921 | |||
922 | /* Disable data ready events for this connection */ | ||
923 | static void svc_rdma_detach(struct svc_xprt *xprt) | ||
924 | { | ||
925 | struct svcxprt_rdma *rdma = | ||
926 | container_of(xprt, struct svcxprt_rdma, sc_xprt); | ||
927 | unsigned long flags; | ||
928 | |||
929 | dprintk("svc: svc_rdma_detach(%p)\n", xprt); | ||
930 | /* | ||
931 | * Shutdown the connection. This will ensure we don't get any | ||
932 | * more events from the provider. | ||
933 | */ | ||
934 | rdma_disconnect(rdma->sc_cm_id); | ||
935 | rdma_destroy_id(rdma->sc_cm_id); | ||
936 | |||
937 | /* We may already be on the DTO list */ | ||
938 | spin_lock_irqsave(&dto_lock, flags); | ||
939 | if (!list_empty(&rdma->sc_dto_q)) | ||
940 | list_del_init(&rdma->sc_dto_q); | ||
941 | spin_unlock_irqrestore(&dto_lock, flags); | ||
942 | } | ||
943 | |||
944 | static void svc_rdma_free(struct svc_xprt *xprt) | ||
945 | { | ||
946 | struct svcxprt_rdma *rdma = (struct svcxprt_rdma *)xprt; | ||
947 | dprintk("svcrdma: svc_rdma_free(%p)\n", rdma); | ||
948 | rdma_destroy_xprt(rdma); | ||
949 | kfree(rdma); | ||
950 | } | ||
951 | |||
952 | static void rdma_destroy_xprt(struct svcxprt_rdma *xprt) | ||
953 | { | ||
954 | if (xprt->sc_qp && !IS_ERR(xprt->sc_qp)) | ||
955 | ib_destroy_qp(xprt->sc_qp); | ||
956 | |||
957 | if (xprt->sc_sq_cq && !IS_ERR(xprt->sc_sq_cq)) | ||
958 | ib_destroy_cq(xprt->sc_sq_cq); | ||
959 | |||
960 | if (xprt->sc_rq_cq && !IS_ERR(xprt->sc_rq_cq)) | ||
961 | ib_destroy_cq(xprt->sc_rq_cq); | ||
962 | |||
963 | if (xprt->sc_phys_mr && !IS_ERR(xprt->sc_phys_mr)) | ||
964 | ib_dereg_mr(xprt->sc_phys_mr); | ||
965 | |||
966 | if (xprt->sc_pd && !IS_ERR(xprt->sc_pd)) | ||
967 | ib_dealloc_pd(xprt->sc_pd); | ||
968 | |||
969 | destroy_context_cache(xprt->sc_ctxt_head); | ||
970 | } | ||
971 | |||
972 | static int svc_rdma_has_wspace(struct svc_xprt *xprt) | ||
973 | { | ||
974 | struct svcxprt_rdma *rdma = | ||
975 | container_of(xprt, struct svcxprt_rdma, sc_xprt); | ||
976 | |||
977 | /* | ||
978 | * If there are fewer SQ WR available than required to send a | ||
979 | * simple response, return false. | ||
980 | */ | ||
981 | if ((rdma->sc_sq_depth - atomic_read(&rdma->sc_sq_count) < 3)) | ||
982 | return 0; | ||
983 | |||
984 | /* | ||
985 | * ...or there are already waiters on the SQ, | ||
986 | * return false. | ||
987 | */ | ||
988 | if (waitqueue_active(&rdma->sc_send_wait)) | ||
989 | return 0; | ||
990 | |||
991 | /* Otherwise return true. */ | ||
992 | return 1; | ||
993 | } | ||
994 | |||
995 | int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) | ||
996 | { | ||
997 | struct ib_send_wr *bad_wr; | ||
998 | int ret; | ||
999 | |||
1000 | if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) | ||
1001 | return 0; | ||
1002 | |||
1003 | BUG_ON(wr->send_flags != IB_SEND_SIGNALED); | ||
1004 | BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op != | ||
1005 | wr->opcode); | ||
1006 | /* If the SQ is full, wait until an SQ entry is available */ | ||
1007 | while (1) { | ||
1008 | spin_lock_bh(&xprt->sc_lock); | ||
1009 | if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) { | ||
1010 | spin_unlock_bh(&xprt->sc_lock); | ||
1011 | atomic_inc(&rdma_stat_sq_starve); | ||
1012 | /* See if we can reap some SQ WR */ | ||
1013 | sq_cq_reap(xprt); | ||
1014 | |||
1015 | /* Wait until SQ WR available if SQ still full */ | ||
1016 | wait_event(xprt->sc_send_wait, | ||
1017 | atomic_read(&xprt->sc_sq_count) < | ||
1018 | xprt->sc_sq_depth); | ||
1019 | continue; | ||
1020 | } | ||
1021 | /* Bumped used SQ WR count and post */ | ||
1022 | ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); | ||
1023 | if (!ret) | ||
1024 | atomic_inc(&xprt->sc_sq_count); | ||
1025 | else | ||
1026 | dprintk("svcrdma: failed to post SQ WR rc=%d, " | ||
1027 | "sc_sq_count=%d, sc_sq_depth=%d\n", | ||
1028 | ret, atomic_read(&xprt->sc_sq_count), | ||
1029 | xprt->sc_sq_depth); | ||
1030 | spin_unlock_bh(&xprt->sc_lock); | ||
1031 | break; | ||
1032 | } | ||
1033 | return ret; | ||
1034 | } | ||
1035 | |||
1036 | int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, | ||
1037 | enum rpcrdma_errcode err) | ||
1038 | { | ||
1039 | struct ib_send_wr err_wr; | ||
1040 | struct ib_sge sge; | ||
1041 | struct page *p; | ||
1042 | struct svc_rdma_op_ctxt *ctxt; | ||
1043 | u32 *va; | ||
1044 | int length; | ||
1045 | int ret; | ||
1046 | |||
1047 | p = svc_rdma_get_page(); | ||
1048 | va = page_address(p); | ||
1049 | |||
1050 | /* XDR encode error */ | ||
1051 | length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); | ||
1052 | |||
1053 | /* Prepare SGE for local address */ | ||
1054 | sge.addr = ib_dma_map_page(xprt->sc_cm_id->device, | ||
1055 | p, 0, PAGE_SIZE, DMA_FROM_DEVICE); | ||
1056 | sge.lkey = xprt->sc_phys_mr->lkey; | ||
1057 | sge.length = length; | ||
1058 | |||
1059 | ctxt = svc_rdma_get_context(xprt); | ||
1060 | ctxt->count = 1; | ||
1061 | ctxt->pages[0] = p; | ||
1062 | |||
1063 | /* Prepare SEND WR */ | ||
1064 | memset(&err_wr, 0, sizeof err_wr); | ||
1065 | ctxt->wr_op = IB_WR_SEND; | ||
1066 | err_wr.wr_id = (unsigned long)ctxt; | ||
1067 | err_wr.sg_list = &sge; | ||
1068 | err_wr.num_sge = 1; | ||
1069 | err_wr.opcode = IB_WR_SEND; | ||
1070 | err_wr.send_flags = IB_SEND_SIGNALED; | ||
1071 | |||
1072 | /* Post It */ | ||
1073 | ret = svc_rdma_send(xprt, &err_wr); | ||
1074 | if (ret) { | ||
1075 | dprintk("svcrdma: Error posting send = %d\n", ret); | ||
1076 | svc_rdma_put_context(ctxt, 1); | ||
1077 | } | ||
1078 | |||
1079 | return ret; | ||
1080 | } | ||