author     Linus Torvalds <torvalds@linux-foundation.org>  2016-07-30 19:33:25 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2016-07-30 19:33:25 -0400
commit     7f155c702677d057d03b192ce652311de5434697 (patch)
tree       dcee0fbb463ec3e55cb50181180c7d175d5895c3 /net
parent     d761f3ed6e71bcca724a6e9e39efcac65b7b4ac1 (diff)
parent     944171cbf499d3445c749f7c13c46de0a564a905 (diff)
Merge tag 'nfs-for-4.8-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
Pull NFS client updates from Trond Myklebust:
"Highlights include:
Stable bugfixes:
- nfs: don't create zero-length requests
- several LAYOUTGET bugfixes
Features:
- several performance related features
- more aggressive caching when we can rely on close-to-open
cache consistency
- remove serialisation of O_DIRECT reads and writes
- optimise several code paths to not flush to disk unnecessarily.
However, allow for the idiosyncrasies of pNFS for those layout
types that need to issue a LAYOUTCOMMIT before the metadata can
be updated on the server.
- SUNRPC updates to the client data receive path
- pNFS/SCSI support RH/Fedora dm-mpath device nodes
- pNFS files/flexfiles can now use unprivileged ports when
the generic NFS mount options allow it.
Bugfixes:
- Don't use RDMA direct data placement together with data
integrity or privacy security flavours
- Remove the RDMA ALLPHYSICAL memory registration mode as
it has potential security holes.
- Several layout recall fixes to improve NFSv4.1 protocol
compliance.
- Fix an Oops in the pNFS files and flexfiles connection
setup to the DS
- Allow retry of operations that used a returned delegation
stateid
- Don't mark the inode as revalidated if a LAYOUTCOMMIT is
outstanding
- Fix writeback races in nfs4_copy_range() and
nfs42_proc_deallocate()"
* tag 'nfs-for-4.8-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (104 commits)
pNFS: Actively set attributes as invalid if LAYOUTCOMMIT is outstanding
NFSv4: Clean up lookup of SECINFO_NO_NAME
NFSv4.2: Fix warning "variable ‘stateids’ set but not used"
NFSv4: Fix warning "no previous prototype for ‘nfs4_listxattr’"
SUNRPC: Fix a compiler warning in fs/nfs/clnt.c
pNFS: Remove redundant smp_mb() from pnfs_init_lseg()
pNFS: Cleanup - do layout segment initialisation in one place
pNFS: Remove redundant stateid invalidation
pNFS: Remove redundant pnfs_mark_layout_returned_if_empty()
pNFS: Clear the layout metadata if the server changed the layout stateid
pNFS: Cleanup - don't open code pnfs_mark_layout_stateid_invalid()
NFS: pnfs_mark_matching_lsegs_return() should match the layout sequence id
pNFS: Do not set plh_return_seq for non-callback related layoutreturns
pNFS: Ensure layoutreturn acts as a completion for layout callbacks
pNFS: Fix CB_LAYOUTRECALL stateid verification
pNFS: Always update the layout barrier seqid on LAYOUTGET
pNFS: Always update the layout stateid if NFS_LAYOUT_INVALID_STID is set
pNFS: Clear the layout return tracking on layout reinitialisation
pNFS: LAYOUTRETURN should only update the stateid if the layout is valid
nfs: don't create zero-length requests
...
Diffstat (limited to 'net')
-rw-r--r--  net/sunrpc/auth.c                      |   8
-rw-r--r--  net/sunrpc/auth_generic.c              |   9
-rw-r--r--  net/sunrpc/auth_gss/auth_gss.c         |   3
-rw-r--r--  net/sunrpc/auth_gss/gss_krb5_mech.c    |   2
-rw-r--r--  net/sunrpc/auth_gss/gss_mech_switch.c  |  12
-rw-r--r--  net/sunrpc/auth_null.c                 |   1
-rw-r--r--  net/sunrpc/auth_unix.c                 |   1
-rw-r--r--  net/sunrpc/clnt.c                      |   2
-rw-r--r--  net/sunrpc/sched.c                     |  67
-rw-r--r--  net/sunrpc/svc.c                       |   8
-rw-r--r--  net/sunrpc/xprt.c                      |  14
-rw-r--r--  net/sunrpc/xprtmultipath.c             |   8
-rw-r--r--  net/sunrpc/xprtrdma/Makefile           |   2
-rw-r--r--  net/sunrpc/xprtrdma/fmr_ops.c          | 378
-rw-r--r--  net/sunrpc/xprtrdma/frwr_ops.c         | 369
-rw-r--r--  net/sunrpc/xprtrdma/physical_ops.c     | 122
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c         | 274
-rw-r--r--  net/sunrpc/xprtrdma/transport.c        |  40
-rw-r--r--  net/sunrpc/xprtrdma/verbs.c            | 242
-rw-r--r--  net/sunrpc/xprtrdma/xprt_rdma.h        | 118
-rw-r--r--  net/sunrpc/xprtsock.c                  | 125
21 files changed, 864 insertions, 941 deletions
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 040ff627c18a..a7e42f9a405c 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -51,9 +51,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp) | |||
51 | ret = kstrtoul(val, 0, &num); | 51 | ret = kstrtoul(val, 0, &num); |
52 | if (ret == -EINVAL) | 52 | if (ret == -EINVAL) |
53 | goto out_inval; | 53 | goto out_inval; |
54 | nbits = fls(num); | 54 | nbits = fls(num - 1); |
55 | if (num > (1U << nbits)) | ||
56 | nbits++; | ||
57 | if (nbits > MAX_HASHTABLE_BITS || nbits < 2) | 55 | if (nbits > MAX_HASHTABLE_BITS || nbits < 2) |
58 | goto out_inval; | 56 | goto out_inval; |
59 | *(unsigned int *)kp->arg = nbits; | 57 | *(unsigned int *)kp->arg = nbits; |
@@ -359,8 +357,10 @@ rpcauth_key_timeout_notify(struct rpc_auth *auth, struct rpc_cred *cred) | |||
359 | EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify); | 357 | EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify); |
360 | 358 | ||
361 | bool | 359 | bool |
362 | rpcauth_cred_key_to_expire(struct rpc_cred *cred) | 360 | rpcauth_cred_key_to_expire(struct rpc_auth *auth, struct rpc_cred *cred) |
363 | { | 361 | { |
362 | if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT) | ||
363 | return false; | ||
364 | if (!cred->cr_ops->crkey_to_expire) | 364 | if (!cred->cr_ops->crkey_to_expire) |
365 | return false; | 365 | return false; |
366 | return cred->cr_ops->crkey_to_expire(cred); | 366 | return cred->cr_ops->crkey_to_expire(cred); |
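The first auth.c hunk above (in param_set_hashtbl_sz) replaces the two-step rounding of the requested auth hash table size with a single nbits = fls(num - 1), which picks the smallest power of two that is at least num. A minimal userspace illustration of that arithmetic; the fls() below is only a stand-in for the kernel helper of the same name (1-based index of the highest set bit, 0 for 0):

/* Illustration only: shows how nbits = fls(num - 1), as in the auth.c hunk
 * above, rounds a requested hash table size up to the next power of two.
 */
#include <stdio.h>

static int fls(unsigned long x)		/* stand-in for the kernel's fls() */
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	unsigned long sizes[] = { 2, 8, 9, 100, 1024 };

	for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		unsigned long num = sizes[i];
		int nbits = fls(num - 1);

		/* e.g. num = 100 -> nbits = 7 -> 128 hash buckets */
		printf("num=%lu nbits=%d buckets=%lu\n",
		       num, nbits, 1UL << nbits);
	}
	return 0;
}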
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 54dd3fdead54..168219535a34 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -224,7 +224,7 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) | |||
224 | 224 | ||
225 | 225 | ||
226 | /* Fast track for non crkey_timeout (no key) underlying credentials */ | 226 | /* Fast track for non crkey_timeout (no key) underlying credentials */ |
227 | if (test_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags)) | 227 | if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT) |
228 | return 0; | 228 | return 0; |
229 | 229 | ||
230 | /* Fast track for the normal case */ | 230 | /* Fast track for the normal case */ |
@@ -236,12 +236,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) | |||
236 | if (IS_ERR(tcred)) | 236 | if (IS_ERR(tcred)) |
237 | return -EACCES; | 237 | return -EACCES; |
238 | 238 | ||
239 | if (!tcred->cr_ops->crkey_timeout) { | ||
240 | set_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags); | ||
241 | ret = 0; | ||
242 | goto out_put; | ||
243 | } | ||
244 | |||
245 | /* Test for the almost error case */ | 239 | /* Test for the almost error case */ |
246 | ret = tcred->cr_ops->crkey_timeout(tcred); | 240 | ret = tcred->cr_ops->crkey_timeout(tcred); |
247 | if (ret != 0) { | 241 | if (ret != 0) { |
@@ -257,7 +251,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) | |||
257 | set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags); | 251 | set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags); |
258 | } | 252 | } |
259 | 253 | ||
260 | out_put: | ||
261 | put_rpccred(tcred); | 254 | put_rpccred(tcred); |
262 | return ret; | 255 | return ret; |
263 | } | 256 | } |
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index e64ae93d5b4f..23c8e7c39656 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1015,8 +1015,11 @@ gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt) | |||
1015 | auth = &gss_auth->rpc_auth; | 1015 | auth = &gss_auth->rpc_auth; |
1016 | auth->au_cslack = GSS_CRED_SLACK >> 2; | 1016 | auth->au_cslack = GSS_CRED_SLACK >> 2; |
1017 | auth->au_rslack = GSS_VERF_SLACK >> 2; | 1017 | auth->au_rslack = GSS_VERF_SLACK >> 2; |
1018 | auth->au_flags = 0; | ||
1018 | auth->au_ops = &authgss_ops; | 1019 | auth->au_ops = &authgss_ops; |
1019 | auth->au_flavor = flavor; | 1020 | auth->au_flavor = flavor; |
1021 | if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor)) | ||
1022 | auth->au_flags |= RPCAUTH_AUTH_DATATOUCH; | ||
1020 | atomic_set(&auth->au_count, 1); | 1023 | atomic_set(&auth->au_count, 1); |
1021 | kref_init(&gss_auth->kref); | 1024 | kref_init(&gss_auth->kref); |
1022 | 1025 | ||
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 65427492b1c9..60595835317a 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -745,12 +745,14 @@ static struct pf_desc gss_kerberos_pfs[] = { | |||
745 | .qop = GSS_C_QOP_DEFAULT, | 745 | .qop = GSS_C_QOP_DEFAULT, |
746 | .service = RPC_GSS_SVC_INTEGRITY, | 746 | .service = RPC_GSS_SVC_INTEGRITY, |
747 | .name = "krb5i", | 747 | .name = "krb5i", |
748 | .datatouch = true, | ||
748 | }, | 749 | }, |
749 | [2] = { | 750 | [2] = { |
750 | .pseudoflavor = RPC_AUTH_GSS_KRB5P, | 751 | .pseudoflavor = RPC_AUTH_GSS_KRB5P, |
751 | .qop = GSS_C_QOP_DEFAULT, | 752 | .qop = GSS_C_QOP_DEFAULT, |
752 | .service = RPC_GSS_SVC_PRIVACY, | 753 | .service = RPC_GSS_SVC_PRIVACY, |
753 | .name = "krb5p", | 754 | .name = "krb5p", |
755 | .datatouch = true, | ||
754 | }, | 756 | }, |
755 | }; | 757 | }; |
756 | 758 | ||
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 7063d856a598..5fec3abbe19b 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -361,6 +361,18 @@ gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor) | |||
361 | } | 361 | } |
362 | EXPORT_SYMBOL(gss_pseudoflavor_to_service); | 362 | EXPORT_SYMBOL(gss_pseudoflavor_to_service); |
363 | 363 | ||
364 | bool | ||
365 | gss_pseudoflavor_to_datatouch(struct gss_api_mech *gm, u32 pseudoflavor) | ||
366 | { | ||
367 | int i; | ||
368 | |||
369 | for (i = 0; i < gm->gm_pf_num; i++) { | ||
370 | if (gm->gm_pfs[i].pseudoflavor == pseudoflavor) | ||
371 | return gm->gm_pfs[i].datatouch; | ||
372 | } | ||
373 | return false; | ||
374 | } | ||
375 | |||
364 | char * | 376 | char * |
365 | gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service) | 377 | gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service) |
366 | { | 378 | { |
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 8d9eb4d5ddd8..4d17376b2acb 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -115,6 +115,7 @@ static | |||
115 | struct rpc_auth null_auth = { | 115 | struct rpc_auth null_auth = { |
116 | .au_cslack = NUL_CALLSLACK, | 116 | .au_cslack = NUL_CALLSLACK, |
117 | .au_rslack = NUL_REPLYSLACK, | 117 | .au_rslack = NUL_REPLYSLACK, |
118 | .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, | ||
118 | .au_ops = &authnull_ops, | 119 | .au_ops = &authnull_ops, |
119 | .au_flavor = RPC_AUTH_NULL, | 120 | .au_flavor = RPC_AUTH_NULL, |
120 | .au_count = ATOMIC_INIT(0), | 121 | .au_count = ATOMIC_INIT(0), |
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 9f65452b7cbc..a99278c984e8 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -228,6 +228,7 @@ static | |||
228 | struct rpc_auth unix_auth = { | 228 | struct rpc_auth unix_auth = { |
229 | .au_cslack = UNX_CALLSLACK, | 229 | .au_cslack = UNX_CALLSLACK, |
230 | .au_rslack = NUL_REPLYSLACK, | 230 | .au_rslack = NUL_REPLYSLACK, |
231 | .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, | ||
231 | .au_ops = &authunix_ops, | 232 | .au_ops = &authunix_ops, |
232 | .au_flavor = RPC_AUTH_UNIX, | 233 | .au_flavor = RPC_AUTH_UNIX, |
233 | .au_count = ATOMIC_INIT(0), | 234 | .au_count = ATOMIC_INIT(0), |
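Taken together, the auth.c, auth_generic.c, auth_null.c and auth_unix.c hunks above replace the per-credential RPC_CRED_NO_CRKEY_TIMEOUT bit with a flavour-wide RPCAUTH_AUTH_NO_CRKEY_TIMEOUT bit in rpc_auth.au_flags, so flavours whose credentials never carry an expiring key (AUTH_NULL, AUTH_UNIX) are skipped without looking up the underlying credential. A compilable sketch of the resulting check, with simplified structures and an assumed flag value (not the kernel source):

#include <stdbool.h>
#include <stddef.h>

#define RPCAUTH_AUTH_NO_CRKEY_TIMEOUT 0x0001	/* assumed value, illustration only */

struct rpc_cred;

struct rpc_credops {
	bool (*crkey_to_expire)(struct rpc_cred *cred);
};

struct rpc_cred {
	const struct rpc_credops *cr_ops;
};

struct rpc_auth {
	unsigned long au_flags;
};

/* Modeled on the new rpcauth_cred_key_to_expire(auth, cred) signature above. */
static bool cred_key_to_expire(struct rpc_auth *auth, struct rpc_cred *cred)
{
	/* Flavour-wide fast path: AUTH_NULL and AUTH_UNIX set this flag once. */
	if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT)
		return false;
	/* Otherwise fall back to the per-credential hook, if the flavour has one. */
	if (cred->cr_ops == NULL || cred->cr_ops->crkey_to_expire == NULL)
		return false;
	return cred->cr_ops->crkey_to_expire(cred);
}

int main(void)
{
	struct rpc_auth unix_auth = { .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT };
	struct rpc_cred cred = { .cr_ops = NULL };

	/* Exit status 0: AUTH_UNIX credentials never have a key that can expire. */
	return cred_key_to_expire(&unix_auth, &cred);
}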
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 2808d550d273..cb49898a5a58 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2577,7 +2577,7 @@ static void rpc_cb_add_xprt_release(void *calldata) | |||
2577 | kfree(data); | 2577 | kfree(data); |
2578 | } | 2578 | } |
2579 | 2579 | ||
2580 | const static struct rpc_call_ops rpc_cb_add_xprt_call_ops = { | 2580 | static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = { |
2581 | .rpc_call_done = rpc_cb_add_xprt_done, | 2581 | .rpc_call_done = rpc_cb_add_xprt_done, |
2582 | .rpc_release = rpc_cb_add_xprt_release, | 2582 | .rpc_release = rpc_cb_add_xprt_release, |
2583 | }; | 2583 | }; |
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index fcfd48d263f6..9ae588511aaf 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -54,7 +54,8 @@ static struct rpc_wait_queue delay_queue; | |||
54 | /* | 54 | /* |
55 | * rpciod-related stuff | 55 | * rpciod-related stuff |
56 | */ | 56 | */ |
57 | struct workqueue_struct *rpciod_workqueue; | 57 | struct workqueue_struct *rpciod_workqueue __read_mostly; |
58 | struct workqueue_struct *xprtiod_workqueue __read_mostly; | ||
58 | 59 | ||
59 | /* | 60 | /* |
60 | * Disable the timer for a given RPC task. Should be called with | 61 | * Disable the timer for a given RPC task. Should be called with |
@@ -329,7 +330,8 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task); | |||
329 | * lockless RPC_IS_QUEUED() test) before we've had a chance to test | 330 | * lockless RPC_IS_QUEUED() test) before we've had a chance to test |
330 | * the RPC_TASK_RUNNING flag. | 331 | * the RPC_TASK_RUNNING flag. |
331 | */ | 332 | */ |
332 | static void rpc_make_runnable(struct rpc_task *task) | 333 | static void rpc_make_runnable(struct workqueue_struct *wq, |
334 | struct rpc_task *task) | ||
333 | { | 335 | { |
334 | bool need_wakeup = !rpc_test_and_set_running(task); | 336 | bool need_wakeup = !rpc_test_and_set_running(task); |
335 | 337 | ||
@@ -338,7 +340,7 @@ static void rpc_make_runnable(struct rpc_task *task) | |||
338 | return; | 340 | return; |
339 | if (RPC_IS_ASYNC(task)) { | 341 | if (RPC_IS_ASYNC(task)) { |
340 | INIT_WORK(&task->u.tk_work, rpc_async_schedule); | 342 | INIT_WORK(&task->u.tk_work, rpc_async_schedule); |
341 | queue_work(rpciod_workqueue, &task->u.tk_work); | 343 | queue_work(wq, &task->u.tk_work); |
342 | } else | 344 | } else |
343 | wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED); | 345 | wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED); |
344 | } | 346 | } |
@@ -407,13 +409,16 @@ void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, | |||
407 | EXPORT_SYMBOL_GPL(rpc_sleep_on_priority); | 409 | EXPORT_SYMBOL_GPL(rpc_sleep_on_priority); |
408 | 410 | ||
409 | /** | 411 | /** |
410 | * __rpc_do_wake_up_task - wake up a single rpc_task | 412 | * __rpc_do_wake_up_task_on_wq - wake up a single rpc_task |
413 | * @wq: workqueue on which to run task | ||
411 | * @queue: wait queue | 414 | * @queue: wait queue |
412 | * @task: task to be woken up | 415 | * @task: task to be woken up |
413 | * | 416 | * |
414 | * Caller must hold queue->lock, and have cleared the task queued flag. | 417 | * Caller must hold queue->lock, and have cleared the task queued flag. |
415 | */ | 418 | */ |
416 | static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task *task) | 419 | static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq, |
420 | struct rpc_wait_queue *queue, | ||
421 | struct rpc_task *task) | ||
417 | { | 422 | { |
418 | dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n", | 423 | dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n", |
419 | task->tk_pid, jiffies); | 424 | task->tk_pid, jiffies); |
@@ -428,7 +433,7 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task | |||
428 | 433 | ||
429 | __rpc_remove_wait_queue(queue, task); | 434 | __rpc_remove_wait_queue(queue, task); |
430 | 435 | ||
431 | rpc_make_runnable(task); | 436 | rpc_make_runnable(wq, task); |
432 | 437 | ||
433 | dprintk("RPC: __rpc_wake_up_task done\n"); | 438 | dprintk("RPC: __rpc_wake_up_task done\n"); |
434 | } | 439 | } |
@@ -436,16 +441,25 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task | |||
436 | /* | 441 | /* |
437 | * Wake up a queued task while the queue lock is being held | 442 | * Wake up a queued task while the queue lock is being held |
438 | */ | 443 | */ |
439 | static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) | 444 | static void rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq, |
445 | struct rpc_wait_queue *queue, struct rpc_task *task) | ||
440 | { | 446 | { |
441 | if (RPC_IS_QUEUED(task)) { | 447 | if (RPC_IS_QUEUED(task)) { |
442 | smp_rmb(); | 448 | smp_rmb(); |
443 | if (task->tk_waitqueue == queue) | 449 | if (task->tk_waitqueue == queue) |
444 | __rpc_do_wake_up_task(queue, task); | 450 | __rpc_do_wake_up_task_on_wq(wq, queue, task); |
445 | } | 451 | } |
446 | } | 452 | } |
447 | 453 | ||
448 | /* | 454 | /* |
455 | * Wake up a queued task while the queue lock is being held | ||
456 | */ | ||
457 | static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) | ||
458 | { | ||
459 | rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task); | ||
460 | } | ||
461 | |||
462 | /* | ||
449 | * Wake up a task on a specific queue | 463 | * Wake up a task on a specific queue |
450 | */ | 464 | */ |
451 | void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task) | 465 | void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task) |
@@ -518,7 +532,8 @@ static struct rpc_task *__rpc_find_next_queued(struct rpc_wait_queue *queue) | |||
518 | /* | 532 | /* |
519 | * Wake up the first task on the wait queue. | 533 | * Wake up the first task on the wait queue. |
520 | */ | 534 | */ |
521 | struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, | 535 | struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq, |
536 | struct rpc_wait_queue *queue, | ||
522 | bool (*func)(struct rpc_task *, void *), void *data) | 537 | bool (*func)(struct rpc_task *, void *), void *data) |
523 | { | 538 | { |
524 | struct rpc_task *task = NULL; | 539 | struct rpc_task *task = NULL; |
@@ -529,7 +544,7 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, | |||
529 | task = __rpc_find_next_queued(queue); | 544 | task = __rpc_find_next_queued(queue); |
530 | if (task != NULL) { | 545 | if (task != NULL) { |
531 | if (func(task, data)) | 546 | if (func(task, data)) |
532 | rpc_wake_up_task_queue_locked(queue, task); | 547 | rpc_wake_up_task_on_wq_queue_locked(wq, queue, task); |
533 | else | 548 | else |
534 | task = NULL; | 549 | task = NULL; |
535 | } | 550 | } |
@@ -537,6 +552,15 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, | |||
537 | 552 | ||
538 | return task; | 553 | return task; |
539 | } | 554 | } |
555 | |||
556 | /* | ||
557 | * Wake up the first task on the wait queue. | ||
558 | */ | ||
559 | struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, | ||
560 | bool (*func)(struct rpc_task *, void *), void *data) | ||
561 | { | ||
562 | return rpc_wake_up_first_on_wq(rpciod_workqueue, queue, func, data); | ||
563 | } | ||
540 | EXPORT_SYMBOL_GPL(rpc_wake_up_first); | 564 | EXPORT_SYMBOL_GPL(rpc_wake_up_first); |
541 | 565 | ||
542 | static bool rpc_wake_up_next_func(struct rpc_task *task, void *data) | 566 | static bool rpc_wake_up_next_func(struct rpc_task *task, void *data) |
@@ -814,7 +838,7 @@ void rpc_execute(struct rpc_task *task) | |||
814 | bool is_async = RPC_IS_ASYNC(task); | 838 | bool is_async = RPC_IS_ASYNC(task); |
815 | 839 | ||
816 | rpc_set_active(task); | 840 | rpc_set_active(task); |
817 | rpc_make_runnable(task); | 841 | rpc_make_runnable(rpciod_workqueue, task); |
818 | if (!is_async) | 842 | if (!is_async) |
819 | __rpc_execute(task); | 843 | __rpc_execute(task); |
820 | } | 844 | } |
@@ -1071,10 +1095,22 @@ static int rpciod_start(void) | |||
1071 | * Create the rpciod thread and wait for it to start. | 1095 | * Create the rpciod thread and wait for it to start. |
1072 | */ | 1096 | */ |
1073 | dprintk("RPC: creating workqueue rpciod\n"); | 1097 | dprintk("RPC: creating workqueue rpciod\n"); |
1074 | /* Note: highpri because network receive is latency sensitive */ | 1098 | wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 0); |
1075 | wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); | 1099 | if (!wq) |
1100 | goto out_failed; | ||
1076 | rpciod_workqueue = wq; | 1101 | rpciod_workqueue = wq; |
1077 | return rpciod_workqueue != NULL; | 1102 | /* Note: highpri because network receive is latency sensitive */ |
1103 | wq = alloc_workqueue("xprtiod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); | ||
1104 | if (!wq) | ||
1105 | goto free_rpciod; | ||
1106 | xprtiod_workqueue = wq; | ||
1107 | return 1; | ||
1108 | free_rpciod: | ||
1109 | wq = rpciod_workqueue; | ||
1110 | rpciod_workqueue = NULL; | ||
1111 | destroy_workqueue(wq); | ||
1112 | out_failed: | ||
1113 | return 0; | ||
1078 | } | 1114 | } |
1079 | 1115 | ||
1080 | static void rpciod_stop(void) | 1116 | static void rpciod_stop(void) |
@@ -1088,6 +1124,9 @@ static void rpciod_stop(void) | |||
1088 | wq = rpciod_workqueue; | 1124 | wq = rpciod_workqueue; |
1089 | rpciod_workqueue = NULL; | 1125 | rpciod_workqueue = NULL; |
1090 | destroy_workqueue(wq); | 1126 | destroy_workqueue(wq); |
1127 | wq = xprtiod_workqueue; | ||
1128 | xprtiod_workqueue = NULL; | ||
1129 | destroy_workqueue(wq); | ||
1091 | } | 1130 | } |
1092 | 1131 | ||
1093 | void | 1132 | void |
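The sched.c changes above split the old single rpciod workqueue in two: rpciod keeps running RPC tasks, while a new high-priority xprtiod workqueue takes over latency-sensitive transport work (the xprt.c hunks below move their queue_work() calls accordingly). A simplified, userspace-only restatement of the start/unwind pattern used by the new rpciod_start(); the helpers below are stand-ins for alloc_workqueue()/destroy_workqueue(), not the kernel API:

#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for the kernel workqueue API, for illustration only. */
struct workqueue_struct { const char *name; };

static struct workqueue_struct *alloc_wq(const char *name)
{
	struct workqueue_struct *wq = malloc(sizeof(*wq));

	if (wq)
		wq->name = name;
	return wq;
}

static void destroy_wq(struct workqueue_struct *wq)
{
	free(wq);
}

static struct workqueue_struct *rpciod_wq;
static struct workqueue_struct *xprtiod_wq;

/* Mirrors the shape of the new rpciod_start(): create both queues, and
 * unwind the first if the second cannot be created.
 */
static int start_workqueues(void)
{
	rpciod_wq = alloc_wq("rpciod");
	if (!rpciod_wq)
		return 0;
	/* Transport work is latency sensitive, hence a separate queue. */
	xprtiod_wq = alloc_wq("xprtiod");
	if (!xprtiod_wq) {
		destroy_wq(rpciod_wq);
		rpciod_wq = NULL;
		return 0;
	}
	return 1;
}

int main(void)
{
	printf("workqueues started: %d\n", start_workqueues());
	return 0;
}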
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index cc9852897395..c5b0cb4f4056 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1188,11 +1188,17 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) | |||
1188 | *statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); | 1188 | *statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); |
1189 | 1189 | ||
1190 | /* Encode reply */ | 1190 | /* Encode reply */ |
1191 | if (test_bit(RQ_DROPME, &rqstp->rq_flags)) { | 1191 | if (*statp == rpc_drop_reply || |
1192 | test_bit(RQ_DROPME, &rqstp->rq_flags)) { | ||
1192 | if (procp->pc_release) | 1193 | if (procp->pc_release) |
1193 | procp->pc_release(rqstp, NULL, rqstp->rq_resp); | 1194 | procp->pc_release(rqstp, NULL, rqstp->rq_resp); |
1194 | goto dropit; | 1195 | goto dropit; |
1195 | } | 1196 | } |
1197 | if (*statp == rpc_autherr_badcred) { | ||
1198 | if (procp->pc_release) | ||
1199 | procp->pc_release(rqstp, NULL, rqstp->rq_resp); | ||
1200 | goto err_bad_auth; | ||
1201 | } | ||
1196 | if (*statp == rpc_success && | 1202 | if (*statp == rpc_success && |
1197 | (xdr = procp->pc_encode) && | 1203 | (xdr = procp->pc_encode) && |
1198 | !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) { | 1204 | !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) { |
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 216a1385718a..8313960cac52 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -220,7 +220,7 @@ static void xprt_clear_locked(struct rpc_xprt *xprt) | |||
220 | clear_bit(XPRT_LOCKED, &xprt->state); | 220 | clear_bit(XPRT_LOCKED, &xprt->state); |
221 | smp_mb__after_atomic(); | 221 | smp_mb__after_atomic(); |
222 | } else | 222 | } else |
223 | queue_work(rpciod_workqueue, &xprt->task_cleanup); | 223 | queue_work(xprtiod_workqueue, &xprt->task_cleanup); |
224 | } | 224 | } |
225 | 225 | ||
226 | /* | 226 | /* |
@@ -295,7 +295,8 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt) | |||
295 | if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) | 295 | if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) |
296 | return; | 296 | return; |
297 | 297 | ||
298 | if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_func, xprt)) | 298 | if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, |
299 | __xprt_lock_write_func, xprt)) | ||
299 | return; | 300 | return; |
300 | xprt_clear_locked(xprt); | 301 | xprt_clear_locked(xprt); |
301 | } | 302 | } |
@@ -324,7 +325,8 @@ static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt) | |||
324 | return; | 325 | return; |
325 | if (RPCXPRT_CONGESTED(xprt)) | 326 | if (RPCXPRT_CONGESTED(xprt)) |
326 | goto out_unlock; | 327 | goto out_unlock; |
327 | if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_cong_func, xprt)) | 328 | if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, |
329 | __xprt_lock_write_cong_func, xprt)) | ||
328 | return; | 330 | return; |
329 | out_unlock: | 331 | out_unlock: |
330 | xprt_clear_locked(xprt); | 332 | xprt_clear_locked(xprt); |
@@ -645,7 +647,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) | |||
645 | set_bit(XPRT_CLOSE_WAIT, &xprt->state); | 647 | set_bit(XPRT_CLOSE_WAIT, &xprt->state); |
646 | /* Try to schedule an autoclose RPC call */ | 648 | /* Try to schedule an autoclose RPC call */ |
647 | if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) | 649 | if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) |
648 | queue_work(rpciod_workqueue, &xprt->task_cleanup); | 650 | queue_work(xprtiod_workqueue, &xprt->task_cleanup); |
649 | xprt_wake_pending_tasks(xprt, -EAGAIN); | 651 | xprt_wake_pending_tasks(xprt, -EAGAIN); |
650 | spin_unlock_bh(&xprt->transport_lock); | 652 | spin_unlock_bh(&xprt->transport_lock); |
651 | } | 653 | } |
@@ -672,7 +674,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie) | |||
672 | set_bit(XPRT_CLOSE_WAIT, &xprt->state); | 674 | set_bit(XPRT_CLOSE_WAIT, &xprt->state); |
673 | /* Try to schedule an autoclose RPC call */ | 675 | /* Try to schedule an autoclose RPC call */ |
674 | if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) | 676 | if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) |
675 | queue_work(rpciod_workqueue, &xprt->task_cleanup); | 677 | queue_work(xprtiod_workqueue, &xprt->task_cleanup); |
676 | xprt_wake_pending_tasks(xprt, -EAGAIN); | 678 | xprt_wake_pending_tasks(xprt, -EAGAIN); |
677 | out: | 679 | out: |
678 | spin_unlock_bh(&xprt->transport_lock); | 680 | spin_unlock_bh(&xprt->transport_lock); |
@@ -689,7 +691,7 @@ xprt_init_autodisconnect(unsigned long data) | |||
689 | if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) | 691 | if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) |
690 | goto out_abort; | 692 | goto out_abort; |
691 | spin_unlock(&xprt->transport_lock); | 693 | spin_unlock(&xprt->transport_lock); |
692 | queue_work(rpciod_workqueue, &xprt->task_cleanup); | 694 | queue_work(xprtiod_workqueue, &xprt->task_cleanup); |
693 | return; | 695 | return; |
694 | out_abort: | 696 | out_abort: |
695 | spin_unlock(&xprt->transport_lock); | 697 | spin_unlock(&xprt->transport_lock); |
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index e7fd76975d86..66c9d63f4797 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -271,14 +271,12 @@ struct rpc_xprt *xprt_iter_next_entry_multiple(struct rpc_xprt_iter *xpi, | |||
271 | xprt_switch_find_xprt_t find_next) | 271 | xprt_switch_find_xprt_t find_next) |
272 | { | 272 | { |
273 | struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); | 273 | struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); |
274 | struct list_head *head; | ||
275 | 274 | ||
276 | if (xps == NULL) | 275 | if (xps == NULL) |
277 | return NULL; | 276 | return NULL; |
278 | head = &xps->xps_xprt_list; | 277 | return xprt_switch_set_next_cursor(&xps->xps_xprt_list, |
279 | if (xps->xps_nxprts < 2) | 278 | &xpi->xpi_cursor, |
280 | return xprt_switch_find_first_entry(head); | 279 | find_next); |
281 | return xprt_switch_set_next_cursor(head, &xpi->xpi_cursor, find_next); | ||
282 | } | 280 | } |
283 | 281 | ||
284 | static | 282 | static |
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index dc9f3b513a05..ef19fa42c50f 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,7 +1,7 @@ | |||
1 | obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o | 1 | obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o |
2 | 2 | ||
3 | rpcrdma-y := transport.o rpc_rdma.o verbs.o \ | 3 | rpcrdma-y := transport.o rpc_rdma.o verbs.o \ |
4 | fmr_ops.o frwr_ops.o physical_ops.o \ | 4 | fmr_ops.o frwr_ops.o \ |
5 | svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ | 5 | svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ |
6 | svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ | 6 | svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ |
7 | module.o | 7 | module.o |
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 6326ebe8b595..21cb3b150b37 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -19,13 +19,6 @@ | |||
19 | * verb (fmr_op_unmap). | 19 | * verb (fmr_op_unmap). |
20 | */ | 20 | */ |
21 | 21 | ||
22 | /* Transport recovery | ||
23 | * | ||
24 | * After a transport reconnect, fmr_op_map re-uses the MR already | ||
25 | * allocated for the RPC, but generates a fresh rkey then maps the | ||
26 | * MR again. This process is synchronous. | ||
27 | */ | ||
28 | |||
29 | #include "xprt_rdma.h" | 22 | #include "xprt_rdma.h" |
30 | 23 | ||
31 | #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) | 24 | #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) |
@@ -35,62 +28,132 @@ | |||
35 | /* Maximum scatter/gather per FMR */ | 28 | /* Maximum scatter/gather per FMR */ |
36 | #define RPCRDMA_MAX_FMR_SGES (64) | 29 | #define RPCRDMA_MAX_FMR_SGES (64) |
37 | 30 | ||
38 | static struct workqueue_struct *fmr_recovery_wq; | 31 | /* Access mode of externally registered pages */ |
39 | 32 | enum { | |
40 | #define FMR_RECOVERY_WQ_FLAGS (WQ_UNBOUND) | 33 | RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE | |
34 | IB_ACCESS_REMOTE_READ, | ||
35 | }; | ||
41 | 36 | ||
42 | int | 37 | bool |
43 | fmr_alloc_recovery_wq(void) | 38 | fmr_is_supported(struct rpcrdma_ia *ia) |
44 | { | 39 | { |
45 | fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0); | 40 | if (!ia->ri_device->alloc_fmr) { |
46 | return !fmr_recovery_wq ? -ENOMEM : 0; | 41 | pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n", |
42 | ia->ri_device->name); | ||
43 | return false; | ||
44 | } | ||
45 | return true; | ||
47 | } | 46 | } |
48 | 47 | ||
49 | void | 48 | static int |
50 | fmr_destroy_recovery_wq(void) | 49 | fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw) |
51 | { | 50 | { |
52 | struct workqueue_struct *wq; | 51 | static struct ib_fmr_attr fmr_attr = { |
52 | .max_pages = RPCRDMA_MAX_FMR_SGES, | ||
53 | .max_maps = 1, | ||
54 | .page_shift = PAGE_SHIFT | ||
55 | }; | ||
53 | 56 | ||
54 | if (!fmr_recovery_wq) | 57 | mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, |
55 | return; | 58 | sizeof(u64), GFP_KERNEL); |
59 | if (!mw->fmr.fm_physaddrs) | ||
60 | goto out_free; | ||
56 | 61 | ||
57 | wq = fmr_recovery_wq; | 62 | mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, |
58 | fmr_recovery_wq = NULL; | 63 | sizeof(*mw->mw_sg), GFP_KERNEL); |
59 | destroy_workqueue(wq); | 64 | if (!mw->mw_sg) |
65 | goto out_free; | ||
66 | |||
67 | sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES); | ||
68 | |||
69 | mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, | ||
70 | &fmr_attr); | ||
71 | if (IS_ERR(mw->fmr.fm_mr)) | ||
72 | goto out_fmr_err; | ||
73 | |||
74 | return 0; | ||
75 | |||
76 | out_fmr_err: | ||
77 | dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, | ||
78 | PTR_ERR(mw->fmr.fm_mr)); | ||
79 | |||
80 | out_free: | ||
81 | kfree(mw->mw_sg); | ||
82 | kfree(mw->fmr.fm_physaddrs); | ||
83 | return -ENOMEM; | ||
60 | } | 84 | } |
61 | 85 | ||
62 | static int | 86 | static int |
63 | __fmr_unmap(struct rpcrdma_mw *mw) | 87 | __fmr_unmap(struct rpcrdma_mw *mw) |
64 | { | 88 | { |
65 | LIST_HEAD(l); | 89 | LIST_HEAD(l); |
90 | int rc; | ||
66 | 91 | ||
67 | list_add(&mw->fmr.fmr->list, &l); | 92 | list_add(&mw->fmr.fm_mr->list, &l); |
68 | return ib_unmap_fmr(&l); | 93 | rc = ib_unmap_fmr(&l); |
94 | list_del_init(&mw->fmr.fm_mr->list); | ||
95 | return rc; | ||
69 | } | 96 | } |
70 | 97 | ||
71 | /* Deferred reset of a single FMR. Generate a fresh rkey by | ||
72 | * replacing the MR. There's no recovery if this fails. | ||
73 | */ | ||
74 | static void | 98 | static void |
75 | __fmr_recovery_worker(struct work_struct *work) | 99 | fmr_op_release_mr(struct rpcrdma_mw *r) |
76 | { | 100 | { |
77 | struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw, | 101 | LIST_HEAD(unmap_list); |
78 | mw_work); | 102 | int rc; |
79 | struct rpcrdma_xprt *r_xprt = mw->mw_xprt; | ||
80 | 103 | ||
81 | __fmr_unmap(mw); | 104 | /* Ensure MW is not on any rl_registered list */ |
82 | rpcrdma_put_mw(r_xprt, mw); | 105 | if (!list_empty(&r->mw_list)) |
83 | return; | 106 | list_del(&r->mw_list); |
107 | |||
108 | kfree(r->fmr.fm_physaddrs); | ||
109 | kfree(r->mw_sg); | ||
110 | |||
111 | /* In case this one was left mapped, try to unmap it | ||
112 | * to prevent dealloc_fmr from failing with EBUSY | ||
113 | */ | ||
114 | rc = __fmr_unmap(r); | ||
115 | if (rc) | ||
116 | pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", | ||
117 | r, rc); | ||
118 | |||
119 | rc = ib_dealloc_fmr(r->fmr.fm_mr); | ||
120 | if (rc) | ||
121 | pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n", | ||
122 | r, rc); | ||
123 | |||
124 | kfree(r); | ||
84 | } | 125 | } |
85 | 126 | ||
86 | /* A broken MR was discovered in a context that can't sleep. | 127 | /* Reset of a single FMR. |
87 | * Defer recovery to the recovery worker. | ||
88 | */ | 128 | */ |
89 | static void | 129 | static void |
90 | __fmr_queue_recovery(struct rpcrdma_mw *mw) | 130 | fmr_op_recover_mr(struct rpcrdma_mw *mw) |
91 | { | 131 | { |
92 | INIT_WORK(&mw->mw_work, __fmr_recovery_worker); | 132 | struct rpcrdma_xprt *r_xprt = mw->mw_xprt; |
93 | queue_work(fmr_recovery_wq, &mw->mw_work); | 133 | int rc; |
134 | |||
135 | /* ORDER: invalidate first */ | ||
136 | rc = __fmr_unmap(mw); | ||
137 | |||
138 | /* ORDER: then DMA unmap */ | ||
139 | ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, | ||
140 | mw->mw_sg, mw->mw_nents, mw->mw_dir); | ||
141 | if (rc) | ||
142 | goto out_release; | ||
143 | |||
144 | rpcrdma_put_mw(r_xprt, mw); | ||
145 | r_xprt->rx_stats.mrs_recovered++; | ||
146 | return; | ||
147 | |||
148 | out_release: | ||
149 | pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw); | ||
150 | r_xprt->rx_stats.mrs_orphaned++; | ||
151 | |||
152 | spin_lock(&r_xprt->rx_buf.rb_mwlock); | ||
153 | list_del(&mw->mw_all); | ||
154 | spin_unlock(&r_xprt->rx_buf.rb_mwlock); | ||
155 | |||
156 | fmr_op_release_mr(mw); | ||
94 | } | 157 | } |
95 | 158 | ||
96 | static int | 159 | static int |
@@ -112,86 +175,21 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) | |||
112 | RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES); | 175 | RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES); |
113 | } | 176 | } |
114 | 177 | ||
115 | static int | ||
116 | fmr_op_init(struct rpcrdma_xprt *r_xprt) | ||
117 | { | ||
118 | struct rpcrdma_buffer *buf = &r_xprt->rx_buf; | ||
119 | int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; | ||
120 | struct ib_fmr_attr fmr_attr = { | ||
121 | .max_pages = RPCRDMA_MAX_FMR_SGES, | ||
122 | .max_maps = 1, | ||
123 | .page_shift = PAGE_SHIFT | ||
124 | }; | ||
125 | struct ib_pd *pd = r_xprt->rx_ia.ri_pd; | ||
126 | struct rpcrdma_mw *r; | ||
127 | int i, rc; | ||
128 | |||
129 | spin_lock_init(&buf->rb_mwlock); | ||
130 | INIT_LIST_HEAD(&buf->rb_mws); | ||
131 | INIT_LIST_HEAD(&buf->rb_all); | ||
132 | |||
133 | i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1); | ||
134 | i += 2; /* head + tail */ | ||
135 | i *= buf->rb_max_requests; /* one set for each RPC slot */ | ||
136 | dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i); | ||
137 | |||
138 | rc = -ENOMEM; | ||
139 | while (i--) { | ||
140 | r = kzalloc(sizeof(*r), GFP_KERNEL); | ||
141 | if (!r) | ||
142 | goto out; | ||
143 | |||
144 | r->fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES * | ||
145 | sizeof(u64), GFP_KERNEL); | ||
146 | if (!r->fmr.physaddrs) | ||
147 | goto out_free; | ||
148 | |||
149 | r->fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr); | ||
150 | if (IS_ERR(r->fmr.fmr)) | ||
151 | goto out_fmr_err; | ||
152 | |||
153 | r->mw_xprt = r_xprt; | ||
154 | list_add(&r->mw_list, &buf->rb_mws); | ||
155 | list_add(&r->mw_all, &buf->rb_all); | ||
156 | } | ||
157 | return 0; | ||
158 | |||
159 | out_fmr_err: | ||
160 | rc = PTR_ERR(r->fmr.fmr); | ||
161 | dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc); | ||
162 | kfree(r->fmr.physaddrs); | ||
163 | out_free: | ||
164 | kfree(r); | ||
165 | out: | ||
166 | return rc; | ||
167 | } | ||
168 | |||
169 | /* Use the ib_map_phys_fmr() verb to register a memory region | 178 | /* Use the ib_map_phys_fmr() verb to register a memory region |
170 | * for remote access via RDMA READ or RDMA WRITE. | 179 | * for remote access via RDMA READ or RDMA WRITE. |
171 | */ | 180 | */ |
172 | static int | 181 | static int |
173 | fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, | 182 | fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, |
174 | int nsegs, bool writing) | 183 | int nsegs, bool writing, struct rpcrdma_mw **out) |
175 | { | 184 | { |
176 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; | ||
177 | struct ib_device *device = ia->ri_device; | ||
178 | enum dma_data_direction direction = rpcrdma_data_dir(writing); | ||
179 | struct rpcrdma_mr_seg *seg1 = seg; | 185 | struct rpcrdma_mr_seg *seg1 = seg; |
180 | int len, pageoff, i, rc; | 186 | int len, pageoff, i, rc; |
181 | struct rpcrdma_mw *mw; | 187 | struct rpcrdma_mw *mw; |
188 | u64 *dma_pages; | ||
182 | 189 | ||
183 | mw = seg1->rl_mw; | 190 | mw = rpcrdma_get_mw(r_xprt); |
184 | seg1->rl_mw = NULL; | 191 | if (!mw) |
185 | if (!mw) { | 192 | return -ENOBUFS; |
186 | mw = rpcrdma_get_mw(r_xprt); | ||
187 | if (!mw) | ||
188 | return -ENOMEM; | ||
189 | } else { | ||
190 | /* this is a retransmit; generate a fresh rkey */ | ||
191 | rc = __fmr_unmap(mw); | ||
192 | if (rc) | ||
193 | return rc; | ||
194 | } | ||
195 | 193 | ||
196 | pageoff = offset_in_page(seg1->mr_offset); | 194 | pageoff = offset_in_page(seg1->mr_offset); |
197 | seg1->mr_offset -= pageoff; /* start of page */ | 195 | seg1->mr_offset -= pageoff; /* start of page */ |
@@ -200,8 +198,14 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, | |||
200 | if (nsegs > RPCRDMA_MAX_FMR_SGES) | 198 | if (nsegs > RPCRDMA_MAX_FMR_SGES) |
201 | nsegs = RPCRDMA_MAX_FMR_SGES; | 199 | nsegs = RPCRDMA_MAX_FMR_SGES; |
202 | for (i = 0; i < nsegs;) { | 200 | for (i = 0; i < nsegs;) { |
203 | rpcrdma_map_one(device, seg, direction); | 201 | if (seg->mr_page) |
204 | mw->fmr.physaddrs[i] = seg->mr_dma; | 202 | sg_set_page(&mw->mw_sg[i], |
203 | seg->mr_page, | ||
204 | seg->mr_len, | ||
205 | offset_in_page(seg->mr_offset)); | ||
206 | else | ||
207 | sg_set_buf(&mw->mw_sg[i], seg->mr_offset, | ||
208 | seg->mr_len); | ||
205 | len += seg->mr_len; | 209 | len += seg->mr_len; |
206 | ++seg; | 210 | ++seg; |
207 | ++i; | 211 | ++i; |
@@ -210,49 +214,54 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, | |||
210 | offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) | 214 | offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) |
211 | break; | 215 | break; |
212 | } | 216 | } |
213 | 217 | mw->mw_nents = i; | |
214 | rc = ib_map_phys_fmr(mw->fmr.fmr, mw->fmr.physaddrs, | 218 | mw->mw_dir = rpcrdma_data_dir(writing); |
215 | i, seg1->mr_dma); | 219 | if (i == 0) |
220 | goto out_dmamap_err; | ||
221 | |||
222 | if (!ib_dma_map_sg(r_xprt->rx_ia.ri_device, | ||
223 | mw->mw_sg, mw->mw_nents, mw->mw_dir)) | ||
224 | goto out_dmamap_err; | ||
225 | |||
226 | for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++) | ||
227 | dma_pages[i] = sg_dma_address(&mw->mw_sg[i]); | ||
228 | rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents, | ||
229 | dma_pages[0]); | ||
216 | if (rc) | 230 | if (rc) |
217 | goto out_maperr; | 231 | goto out_maperr; |
218 | 232 | ||
219 | seg1->rl_mw = mw; | 233 | mw->mw_handle = mw->fmr.fm_mr->rkey; |
220 | seg1->mr_rkey = mw->fmr.fmr->rkey; | 234 | mw->mw_length = len; |
221 | seg1->mr_base = seg1->mr_dma + pageoff; | 235 | mw->mw_offset = dma_pages[0] + pageoff; |
222 | seg1->mr_nsegs = i; | ||
223 | seg1->mr_len = len; | ||
224 | return i; | ||
225 | 236 | ||
226 | out_maperr: | 237 | *out = mw; |
227 | dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", | 238 | return mw->mw_nents; |
228 | __func__, len, (unsigned long long)seg1->mr_dma, | ||
229 | pageoff, i, rc); | ||
230 | while (i--) | ||
231 | rpcrdma_unmap_one(device, --seg); | ||
232 | return rc; | ||
233 | } | ||
234 | 239 | ||
235 | static void | 240 | out_dmamap_err: |
236 | __fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) | 241 | pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n", |
237 | { | 242 | mw->mw_sg, mw->mw_nents); |
238 | struct ib_device *device = r_xprt->rx_ia.ri_device; | 243 | rpcrdma_defer_mr_recovery(mw); |
239 | int nsegs = seg->mr_nsegs; | 244 | return -EIO; |
240 | 245 | ||
241 | while (nsegs--) | 246 | out_maperr: |
242 | rpcrdma_unmap_one(device, seg++); | 247 | pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", |
248 | len, (unsigned long long)dma_pages[0], | ||
249 | pageoff, mw->mw_nents, rc); | ||
250 | rpcrdma_defer_mr_recovery(mw); | ||
251 | return -EIO; | ||
243 | } | 252 | } |
244 | 253 | ||
245 | /* Invalidate all memory regions that were registered for "req". | 254 | /* Invalidate all memory regions that were registered for "req". |
246 | * | 255 | * |
247 | * Sleeps until it is safe for the host CPU to access the | 256 | * Sleeps until it is safe for the host CPU to access the |
248 | * previously mapped memory regions. | 257 | * previously mapped memory regions. |
258 | * | ||
259 | * Caller ensures that req->rl_registered is not empty. | ||
249 | */ | 260 | */ |
250 | static void | 261 | static void |
251 | fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) | 262 | fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) |
252 | { | 263 | { |
253 | struct rpcrdma_mr_seg *seg; | 264 | struct rpcrdma_mw *mw, *tmp; |
254 | unsigned int i, nchunks; | ||
255 | struct rpcrdma_mw *mw; | ||
256 | LIST_HEAD(unmap_list); | 265 | LIST_HEAD(unmap_list); |
257 | int rc; | 266 | int rc; |
258 | 267 | ||
@@ -261,90 +270,54 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) | |||
261 | /* ORDER: Invalidate all of the req's MRs first | 270 | /* ORDER: Invalidate all of the req's MRs first |
262 | * | 271 | * |
263 | * ib_unmap_fmr() is slow, so use a single call instead | 272 | * ib_unmap_fmr() is slow, so use a single call instead |
264 | * of one call per mapped MR. | 273 | * of one call per mapped FMR. |
265 | */ | 274 | */ |
266 | for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { | 275 | list_for_each_entry(mw, &req->rl_registered, mw_list) |
267 | seg = &req->rl_segments[i]; | 276 | list_add_tail(&mw->fmr.fm_mr->list, &unmap_list); |
268 | mw = seg->rl_mw; | ||
269 | |||
270 | list_add(&mw->fmr.fmr->list, &unmap_list); | ||
271 | |||
272 | i += seg->mr_nsegs; | ||
273 | } | ||
274 | rc = ib_unmap_fmr(&unmap_list); | 277 | rc = ib_unmap_fmr(&unmap_list); |
275 | if (rc) | 278 | if (rc) |
276 | pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc); | 279 | goto out_reset; |
277 | 280 | ||
278 | /* ORDER: Now DMA unmap all of the req's MRs, and return | 281 | /* ORDER: Now DMA unmap all of the req's MRs, and return |
279 | * them to the free MW list. | 282 | * them to the free MW list. |
280 | */ | 283 | */ |
281 | for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { | 284 | list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { |
282 | seg = &req->rl_segments[i]; | 285 | list_del_init(&mw->mw_list); |
286 | list_del_init(&mw->fmr.fm_mr->list); | ||
287 | ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, | ||
288 | mw->mw_sg, mw->mw_nents, mw->mw_dir); | ||
289 | rpcrdma_put_mw(r_xprt, mw); | ||
290 | } | ||
283 | 291 | ||
284 | __fmr_dma_unmap(r_xprt, seg); | 292 | return; |
285 | rpcrdma_put_mw(r_xprt, seg->rl_mw); | ||
286 | 293 | ||
287 | i += seg->mr_nsegs; | 294 | out_reset: |
288 | seg->mr_nsegs = 0; | 295 | pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); |
289 | seg->rl_mw = NULL; | ||
290 | } | ||
291 | 296 | ||
292 | req->rl_nchunks = 0; | 297 | list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { |
298 | list_del_init(&mw->fmr.fm_mr->list); | ||
299 | fmr_op_recover_mr(mw); | ||
300 | } | ||
293 | } | 301 | } |
294 | 302 | ||
295 | /* Use a slow, safe mechanism to invalidate all memory regions | 303 | /* Use a slow, safe mechanism to invalidate all memory regions |
296 | * that were registered for "req". | 304 | * that were registered for "req". |
297 | * | ||
298 | * In the asynchronous case, DMA unmapping occurs first here | ||
299 | * because the rpcrdma_mr_seg is released immediately after this | ||
300 | * call. It's contents won't be available in __fmr_dma_unmap later. | ||
301 | * FIXME. | ||
302 | */ | 305 | */ |
303 | static void | 306 | static void |
304 | fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, | 307 | fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, |
305 | bool sync) | 308 | bool sync) |
306 | { | 309 | { |
307 | struct rpcrdma_mr_seg *seg; | ||
308 | struct rpcrdma_mw *mw; | 310 | struct rpcrdma_mw *mw; |
309 | unsigned int i; | ||
310 | |||
311 | for (i = 0; req->rl_nchunks; req->rl_nchunks--) { | ||
312 | seg = &req->rl_segments[i]; | ||
313 | mw = seg->rl_mw; | ||
314 | |||
315 | if (sync) { | ||
316 | /* ORDER */ | ||
317 | __fmr_unmap(mw); | ||
318 | __fmr_dma_unmap(r_xprt, seg); | ||
319 | rpcrdma_put_mw(r_xprt, mw); | ||
320 | } else { | ||
321 | __fmr_dma_unmap(r_xprt, seg); | ||
322 | __fmr_queue_recovery(mw); | ||
323 | } | ||
324 | |||
325 | i += seg->mr_nsegs; | ||
326 | seg->mr_nsegs = 0; | ||
327 | seg->rl_mw = NULL; | ||
328 | } | ||
329 | } | ||
330 | |||
331 | static void | ||
332 | fmr_op_destroy(struct rpcrdma_buffer *buf) | ||
333 | { | ||
334 | struct rpcrdma_mw *r; | ||
335 | int rc; | ||
336 | |||
337 | while (!list_empty(&buf->rb_all)) { | ||
338 | r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); | ||
339 | list_del(&r->mw_all); | ||
340 | kfree(r->fmr.physaddrs); | ||
341 | 311 | ||
342 | rc = ib_dealloc_fmr(r->fmr.fmr); | 312 | while (!list_empty(&req->rl_registered)) { |
343 | if (rc) | 313 | mw = list_first_entry(&req->rl_registered, |
344 | dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", | 314 | struct rpcrdma_mw, mw_list); |
345 | __func__, rc); | 315 | list_del_init(&mw->mw_list); |
346 | 316 | ||
347 | kfree(r); | 317 | if (sync) |
318 | fmr_op_recover_mr(mw); | ||
319 | else | ||
320 | rpcrdma_defer_mr_recovery(mw); | ||
348 | } | 321 | } |
349 | } | 322 | } |
350 | 323 | ||
@@ -352,9 +325,10 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { | |||
352 | .ro_map = fmr_op_map, | 325 | .ro_map = fmr_op_map, |
353 | .ro_unmap_sync = fmr_op_unmap_sync, | 326 | .ro_unmap_sync = fmr_op_unmap_sync, |
354 | .ro_unmap_safe = fmr_op_unmap_safe, | 327 | .ro_unmap_safe = fmr_op_unmap_safe, |
328 | .ro_recover_mr = fmr_op_recover_mr, | ||
355 | .ro_open = fmr_op_open, | 329 | .ro_open = fmr_op_open, |
356 | .ro_maxpages = fmr_op_maxpages, | 330 | .ro_maxpages = fmr_op_maxpages, |
357 | .ro_init = fmr_op_init, | 331 | .ro_init_mr = fmr_op_init_mr, |
358 | .ro_destroy = fmr_op_destroy, | 332 | .ro_release_mr = fmr_op_release_mr, |
359 | .ro_displayname = "fmr", | 333 | .ro_displayname = "fmr", |
360 | }; | 334 | }; |
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index c0947544babe..892b5e1d9b09 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -73,29 +73,71 @@ | |||
73 | # define RPCDBG_FACILITY RPCDBG_TRANS | 73 | # define RPCDBG_FACILITY RPCDBG_TRANS |
74 | #endif | 74 | #endif |
75 | 75 | ||
76 | static struct workqueue_struct *frwr_recovery_wq; | 76 | bool |
77 | 77 | frwr_is_supported(struct rpcrdma_ia *ia) | |
78 | #define FRWR_RECOVERY_WQ_FLAGS (WQ_UNBOUND | WQ_MEM_RECLAIM) | 78 | { |
79 | struct ib_device_attr *attrs = &ia->ri_device->attrs; | ||
80 | |||
81 | if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) | ||
82 | goto out_not_supported; | ||
83 | if (attrs->max_fast_reg_page_list_len == 0) | ||
84 | goto out_not_supported; | ||
85 | return true; | ||
86 | |||
87 | out_not_supported: | ||
88 | pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n", | ||
89 | ia->ri_device->name); | ||
90 | return false; | ||
91 | } | ||
79 | 92 | ||
80 | int | 93 | static int |
81 | frwr_alloc_recovery_wq(void) | 94 | frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) |
82 | { | 95 | { |
83 | frwr_recovery_wq = alloc_workqueue("frwr_recovery", | 96 | unsigned int depth = ia->ri_max_frmr_depth; |
84 | FRWR_RECOVERY_WQ_FLAGS, 0); | 97 | struct rpcrdma_frmr *f = &r->frmr; |
85 | return !frwr_recovery_wq ? -ENOMEM : 0; | 98 | int rc; |
99 | |||
100 | f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth); | ||
101 | if (IS_ERR(f->fr_mr)) | ||
102 | goto out_mr_err; | ||
103 | |||
104 | r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL); | ||
105 | if (!r->mw_sg) | ||
106 | goto out_list_err; | ||
107 | |||
108 | sg_init_table(r->mw_sg, depth); | ||
109 | init_completion(&f->fr_linv_done); | ||
110 | return 0; | ||
111 | |||
112 | out_mr_err: | ||
113 | rc = PTR_ERR(f->fr_mr); | ||
114 | dprintk("RPC: %s: ib_alloc_mr status %i\n", | ||
115 | __func__, rc); | ||
116 | return rc; | ||
117 | |||
118 | out_list_err: | ||
119 | rc = -ENOMEM; | ||
120 | dprintk("RPC: %s: sg allocation failure\n", | ||
121 | __func__); | ||
122 | ib_dereg_mr(f->fr_mr); | ||
123 | return rc; | ||
86 | } | 124 | } |
87 | 125 | ||
88 | void | 126 | static void |
89 | frwr_destroy_recovery_wq(void) | 127 | frwr_op_release_mr(struct rpcrdma_mw *r) |
90 | { | 128 | { |
91 | struct workqueue_struct *wq; | 129 | int rc; |
92 | 130 | ||
93 | if (!frwr_recovery_wq) | 131 | /* Ensure MW is not on any rl_registered list */ |
94 | return; | 132 | if (!list_empty(&r->mw_list)) |
133 | list_del(&r->mw_list); | ||
95 | 134 | ||
96 | wq = frwr_recovery_wq; | 135 | rc = ib_dereg_mr(r->frmr.fr_mr); |
97 | frwr_recovery_wq = NULL; | 136 | if (rc) |
98 | destroy_workqueue(wq); | 137 | pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", |
138 | r, rc); | ||
139 | kfree(r->mw_sg); | ||
140 | kfree(r); | ||
99 | } | 141 | } |
100 | 142 | ||
101 | static int | 143 | static int |
@@ -124,93 +166,37 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) | |||
124 | return 0; | 166 | return 0; |
125 | } | 167 | } |
126 | 168 | ||
127 | static void | 169 | /* Reset of a single FRMR. Generate a fresh rkey by replacing the MR. |
128 | __frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) | ||
129 | { | ||
130 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; | ||
131 | struct rpcrdma_frmr *f = &mw->frmr; | ||
132 | int rc; | ||
133 | |||
134 | rc = __frwr_reset_mr(ia, mw); | ||
135 | ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir); | ||
136 | if (rc) | ||
137 | return; | ||
138 | |||
139 | rpcrdma_put_mw(r_xprt, mw); | ||
140 | } | ||
141 | |||
142 | /* Deferred reset of a single FRMR. Generate a fresh rkey by | ||
143 | * replacing the MR. | ||
144 | * | 170 | * |
145 | * There's no recovery if this fails. The FRMR is abandoned, but | 171 | * There's no recovery if this fails. The FRMR is abandoned, but |
146 | * remains in rb_all. It will be cleaned up when the transport is | 172 | * remains in rb_all. It will be cleaned up when the transport is |
147 | * destroyed. | 173 | * destroyed. |
148 | */ | 174 | */ |
149 | static void | 175 | static void |
150 | __frwr_recovery_worker(struct work_struct *work) | 176 | frwr_op_recover_mr(struct rpcrdma_mw *mw) |
151 | { | ||
152 | struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw, | ||
153 | mw_work); | ||
154 | |||
155 | __frwr_reset_and_unmap(r->mw_xprt, r); | ||
156 | return; | ||
157 | } | ||
158 | |||
159 | /* A broken MR was discovered in a context that can't sleep. | ||
160 | * Defer recovery to the recovery worker. | ||
161 | */ | ||
162 | static void | ||
163 | __frwr_queue_recovery(struct rpcrdma_mw *r) | ||
164 | { | ||
165 | INIT_WORK(&r->mw_work, __frwr_recovery_worker); | ||
166 | queue_work(frwr_recovery_wq, &r->mw_work); | ||
167 | } | ||
168 | |||
169 | static int | ||
170 | __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, | ||
171 | unsigned int depth) | ||
172 | { | 177 | { |
173 | struct rpcrdma_frmr *f = &r->frmr; | 178 | struct rpcrdma_xprt *r_xprt = mw->mw_xprt; |
179 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; | ||
174 | int rc; | 180 | int rc; |
175 | 181 | ||
176 | f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth); | 182 | rc = __frwr_reset_mr(ia, mw); |
177 | if (IS_ERR(f->fr_mr)) | 183 | ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); |
178 | goto out_mr_err; | 184 | if (rc) |
179 | 185 | goto out_release; | |
180 | f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL); | ||
181 | if (!f->fr_sg) | ||
182 | goto out_list_err; | ||
183 | |||
184 | sg_init_table(f->fr_sg, depth); | ||
185 | |||
186 | init_completion(&f->fr_linv_done); | ||
187 | |||
188 | return 0; | ||
189 | 186 | ||
190 | out_mr_err: | 187 | rpcrdma_put_mw(r_xprt, mw); |
191 | rc = PTR_ERR(f->fr_mr); | 188 | r_xprt->rx_stats.mrs_recovered++; |
192 | dprintk("RPC: %s: ib_alloc_mr status %i\n", | 189 | return; |
193 | __func__, rc); | ||
194 | return rc; | ||
195 | 190 | ||
196 | out_list_err: | 191 | out_release: |
197 | rc = -ENOMEM; | 192 | pr_err("rpcrdma: FRMR reset failed %d, %p release\n", rc, mw); |
198 | dprintk("RPC: %s: sg allocation failure\n", | 193 | r_xprt->rx_stats.mrs_orphaned++; |
199 | __func__); | ||
200 | ib_dereg_mr(f->fr_mr); | ||
201 | return rc; | ||
202 | } | ||
203 | 194 | ||
204 | static void | 195 | spin_lock(&r_xprt->rx_buf.rb_mwlock); |
205 | __frwr_release(struct rpcrdma_mw *r) | 196 | list_del(&mw->mw_all); |
206 | { | 197 | spin_unlock(&r_xprt->rx_buf.rb_mwlock); |
207 | int rc; | ||
208 | 198 | ||
209 | rc = ib_dereg_mr(r->frmr.fr_mr); | 199 | frwr_op_release_mr(mw); |
210 | if (rc) | ||
211 | dprintk("RPC: %s: ib_dereg_mr status %i\n", | ||
212 | __func__, rc); | ||
213 | kfree(r->frmr.fr_sg); | ||
214 | } | 200 | } |
215 | 201 | ||
216 | static int | 202 | static int |
@@ -346,57 +332,14 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) | |||
346 | complete_all(&frmr->fr_linv_done); | 332 | complete_all(&frmr->fr_linv_done); |
347 | } | 333 | } |
348 | 334 | ||
349 | static int | 335 | /* Post a REG_MR Work Request to register a memory region |
350 | frwr_op_init(struct rpcrdma_xprt *r_xprt) | ||
351 | { | ||
352 | struct rpcrdma_buffer *buf = &r_xprt->rx_buf; | ||
353 | struct ib_device *device = r_xprt->rx_ia.ri_device; | ||
354 | unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; | ||
355 | struct ib_pd *pd = r_xprt->rx_ia.ri_pd; | ||
356 | int i; | ||
357 | |||
358 | spin_lock_init(&buf->rb_mwlock); | ||
359 | INIT_LIST_HEAD(&buf->rb_mws); | ||
360 | INIT_LIST_HEAD(&buf->rb_all); | ||
361 | |||
362 | i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1); | ||
363 | i += 2; /* head + tail */ | ||
364 | i *= buf->rb_max_requests; /* one set for each RPC slot */ | ||
365 | dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i); | ||
366 | |||
367 | while (i--) { | ||
368 | struct rpcrdma_mw *r; | ||
369 | int rc; | ||
370 | |||
371 | r = kzalloc(sizeof(*r), GFP_KERNEL); | ||
372 | if (!r) | ||
373 | return -ENOMEM; | ||
374 | |||
375 | rc = __frwr_init(r, pd, device, depth); | ||
376 | if (rc) { | ||
377 | kfree(r); | ||
378 | return rc; | ||
379 | } | ||
380 | |||
381 | r->mw_xprt = r_xprt; | ||
382 | list_add(&r->mw_list, &buf->rb_mws); | ||
383 | list_add(&r->mw_all, &buf->rb_all); | ||
384 | } | ||
385 | |||
386 | return 0; | ||
387 | } | ||
388 | |||
389 | /* Post a FAST_REG Work Request to register a memory region | ||
390 | * for remote access via RDMA READ or RDMA WRITE. | 336 | * for remote access via RDMA READ or RDMA WRITE. |
391 | */ | 337 | */ |
392 | static int | 338 | static int |
393 | frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, | 339 | frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, |
394 | int nsegs, bool writing) | 340 | int nsegs, bool writing, struct rpcrdma_mw **out) |
395 | { | 341 | { |
396 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; | 342 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; |
397 | struct ib_device *device = ia->ri_device; | ||
398 | enum dma_data_direction direction = rpcrdma_data_dir(writing); | ||
399 | struct rpcrdma_mr_seg *seg1 = seg; | ||
400 | struct rpcrdma_mw *mw; | 343 | struct rpcrdma_mw *mw; |
401 | struct rpcrdma_frmr *frmr; | 344 | struct rpcrdma_frmr *frmr; |
402 | struct ib_mr *mr; | 345 | struct ib_mr *mr; |
@@ -405,14 +348,13 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, | |||
405 | int rc, i, n, dma_nents; | 348 | int rc, i, n, dma_nents; |
406 | u8 key; | 349 | u8 key; |
407 | 350 | ||
408 | mw = seg1->rl_mw; | 351 | mw = NULL; |
409 | seg1->rl_mw = NULL; | ||
410 | do { | 352 | do { |
411 | if (mw) | 353 | if (mw) |
412 | __frwr_queue_recovery(mw); | 354 | rpcrdma_defer_mr_recovery(mw); |
413 | mw = rpcrdma_get_mw(r_xprt); | 355 | mw = rpcrdma_get_mw(r_xprt); |
414 | if (!mw) | 356 | if (!mw) |
415 | return -ENOMEM; | 357 | return -ENOBUFS; |
416 | } while (mw->frmr.fr_state != FRMR_IS_INVALID); | 358 | } while (mw->frmr.fr_state != FRMR_IS_INVALID); |
417 | frmr = &mw->frmr; | 359 | frmr = &mw->frmr; |
418 | frmr->fr_state = FRMR_IS_VALID; | 360 | frmr->fr_state = FRMR_IS_VALID; |
@@ -421,15 +363,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, | |||
421 | 363 | ||
422 | if (nsegs > ia->ri_max_frmr_depth) | 364 | if (nsegs > ia->ri_max_frmr_depth) |
423 | nsegs = ia->ri_max_frmr_depth; | 365 | nsegs = ia->ri_max_frmr_depth; |
424 | |||
425 | for (i = 0; i < nsegs;) { | 366 | for (i = 0; i < nsegs;) { |
426 | if (seg->mr_page) | 367 | if (seg->mr_page) |
427 | sg_set_page(&frmr->fr_sg[i], | 368 | sg_set_page(&mw->mw_sg[i], |
428 | seg->mr_page, | 369 | seg->mr_page, |
429 | seg->mr_len, | 370 | seg->mr_len, |
430 | offset_in_page(seg->mr_offset)); | 371 | offset_in_page(seg->mr_offset)); |
431 | else | 372 | else |
432 | sg_set_buf(&frmr->fr_sg[i], seg->mr_offset, | 373 | sg_set_buf(&mw->mw_sg[i], seg->mr_offset, |
433 | seg->mr_len); | 374 | seg->mr_len); |
434 | 375 | ||
435 | ++seg; | 376 | ++seg; |
@@ -440,26 +381,22 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, | |||
440 | offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) | 381 | offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) |
441 | break; | 382 | break; |
442 | } | 383 | } |
443 | frmr->fr_nents = i; | 384 | mw->mw_nents = i; |
444 | frmr->fr_dir = direction; | 385 | mw->mw_dir = rpcrdma_data_dir(writing); |
445 | 386 | if (i == 0) | |
446 | dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction); | 387 | goto out_dmamap_err; |
447 | if (!dma_nents) { | ||
448 | pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n", | ||
449 | __func__, frmr->fr_sg, frmr->fr_nents); | ||
450 | return -ENOMEM; | ||
451 | } | ||
452 | 388 | ||
453 | n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, NULL, PAGE_SIZE); | 389 | dma_nents = ib_dma_map_sg(ia->ri_device, |
454 | if (unlikely(n != frmr->fr_nents)) { | 390 | mw->mw_sg, mw->mw_nents, mw->mw_dir); |
455 | pr_err("RPC: %s: failed to map mr %p (%u/%u)\n", | 391 | if (!dma_nents) |
456 | __func__, frmr->fr_mr, n, frmr->fr_nents); | 392 | goto out_dmamap_err; |
457 | rc = n < 0 ? n : -EINVAL; | 393 | |
458 | goto out_senderr; | 394 | n = ib_map_mr_sg(mr, mw->mw_sg, mw->mw_nents, NULL, PAGE_SIZE); |
459 | } | 395 | if (unlikely(n != mw->mw_nents)) |
396 | goto out_mapmr_err; | ||
460 | 397 | ||
461 | dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", | 398 | dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", |
462 | __func__, mw, frmr->fr_nents, mr->length); | 399 | __func__, mw, mw->mw_nents, mr->length); |
463 | 400 | ||
464 | key = (u8)(mr->rkey & 0x000000FF); | 401 | key = (u8)(mr->rkey & 0x000000FF); |
465 | ib_update_fast_reg_key(mr, ++key); | 402 | ib_update_fast_reg_key(mr, ++key); |
@@ -481,24 +418,34 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, | |||
481 | if (rc) | 418 | if (rc) |
482 | goto out_senderr; | 419 | goto out_senderr; |
483 | 420 | ||
484 | seg1->rl_mw = mw; | 421 | mw->mw_handle = mr->rkey; |
485 | seg1->mr_rkey = mr->rkey; | 422 | mw->mw_length = mr->length; |
486 | seg1->mr_base = mr->iova; | 423 | mw->mw_offset = mr->iova; |
487 | seg1->mr_nsegs = frmr->fr_nents; | 424 | |
488 | seg1->mr_len = mr->length; | 425 | *out = mw; |
426 | return mw->mw_nents; | ||
489 | 427 | ||
490 | return frmr->fr_nents; | 428 | out_dmamap_err: |
429 | pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n", | ||
430 | mw->mw_sg, mw->mw_nents); | ||
431 | rpcrdma_defer_mr_recovery(mw); | ||
432 | return -EIO; | ||
433 | |||
434 | out_mapmr_err: | ||
435 | pr_err("rpcrdma: failed to map mr %p (%u/%u)\n", | ||
436 | frmr->fr_mr, n, mw->mw_nents); | ||
437 | rpcrdma_defer_mr_recovery(mw); | ||
438 | return -EIO; | ||
491 | 439 | ||
492 | out_senderr: | 440 | out_senderr: |
493 | dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); | 441 | pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc); |
494 | __frwr_queue_recovery(mw); | 442 | rpcrdma_defer_mr_recovery(mw); |
495 | return rc; | 443 | return -ENOTCONN; |
496 | } | 444 | } |
497 | 445 | ||
498 | static struct ib_send_wr * | 446 | static struct ib_send_wr * |
499 | __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) | 447 | __frwr_prepare_linv_wr(struct rpcrdma_mw *mw) |
500 | { | 448 | { |
501 | struct rpcrdma_mw *mw = seg->rl_mw; | ||
502 | struct rpcrdma_frmr *f = &mw->frmr; | 449 | struct rpcrdma_frmr *f = &mw->frmr; |
503 | struct ib_send_wr *invalidate_wr; | 450 | struct ib_send_wr *invalidate_wr; |
504 | 451 | ||
@@ -518,16 +465,16 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) | |||
518 | * | 465 | * |
519 | * Sleeps until it is safe for the host CPU to access the | 466 | * Sleeps until it is safe for the host CPU to access the |
520 | * previously mapped memory regions. | 467 | * previously mapped memory regions. |
468 | * | ||
469 | * Caller ensures that req->rl_registered is not empty. | ||
521 | */ | 470 | */ |
522 | static void | 471 | static void |
523 | frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) | 472 | frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) |
524 | { | 473 | { |
525 | struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr; | 474 | struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr; |
526 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; | 475 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; |
527 | struct rpcrdma_mr_seg *seg; | 476 | struct rpcrdma_mw *mw, *tmp; |
528 | unsigned int i, nchunks; | ||
529 | struct rpcrdma_frmr *f; | 477 | struct rpcrdma_frmr *f; |
530 | struct rpcrdma_mw *mw; | ||
531 | int rc; | 478 | int rc; |
532 | 479 | ||
533 | dprintk("RPC: %s: req %p\n", __func__, req); | 480 | dprintk("RPC: %s: req %p\n", __func__, req); |
@@ -537,22 +484,18 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) | |||
537 | * Chain the LOCAL_INV Work Requests and post them with | 484 | * Chain the LOCAL_INV Work Requests and post them with |
538 | * a single ib_post_send() call. | 485 | * a single ib_post_send() call. |
539 | */ | 486 | */ |
487 | f = NULL; | ||
540 | invalidate_wrs = pos = prev = NULL; | 488 | invalidate_wrs = pos = prev = NULL; |
541 | seg = NULL; | 489 | list_for_each_entry(mw, &req->rl_registered, mw_list) { |
542 | for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { | 490 | pos = __frwr_prepare_linv_wr(mw); |
543 | seg = &req->rl_segments[i]; | ||
544 | |||
545 | pos = __frwr_prepare_linv_wr(seg); | ||
546 | 491 | ||
547 | if (!invalidate_wrs) | 492 | if (!invalidate_wrs) |
548 | invalidate_wrs = pos; | 493 | invalidate_wrs = pos; |
549 | else | 494 | else |
550 | prev->next = pos; | 495 | prev->next = pos; |
551 | prev = pos; | 496 | prev = pos; |
552 | 497 | f = &mw->frmr; | |
553 | i += seg->mr_nsegs; | ||
554 | } | 498 | } |
555 | f = &seg->rl_mw->frmr; | ||
556 | 499 | ||
557 | /* Strong send queue ordering guarantees that when the | 500 | /* Strong send queue ordering guarantees that when the |
558 | * last WR in the chain completes, all WRs in the chain | 501 | * last WR in the chain completes, all WRs in the chain |
@@ -577,39 +520,27 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) | |||
577 | * them to the free MW list. | 520 | * them to the free MW list. |
578 | */ | 521 | */ |
579 | unmap: | 522 | unmap: |
580 | for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { | 523 | list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { |
581 | seg = &req->rl_segments[i]; | 524 | list_del_init(&mw->mw_list); |
582 | mw = seg->rl_mw; | 525 | ib_dma_unmap_sg(ia->ri_device, |
583 | seg->rl_mw = NULL; | 526 | mw->mw_sg, mw->mw_nents, mw->mw_dir); |
584 | |||
585 | ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, | ||
586 | f->fr_dir); | ||
587 | rpcrdma_put_mw(r_xprt, mw); | 527 | rpcrdma_put_mw(r_xprt, mw); |
588 | |||
589 | i += seg->mr_nsegs; | ||
590 | seg->mr_nsegs = 0; | ||
591 | } | 528 | } |
592 | |||
593 | req->rl_nchunks = 0; | ||
594 | return; | 529 | return; |
595 | 530 | ||
596 | reset_mrs: | 531 | reset_mrs: |
597 | pr_warn("%s: ib_post_send failed %i\n", __func__, rc); | 532 | pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc); |
533 | rdma_disconnect(ia->ri_id); | ||
598 | 534 | ||
599 | /* Find and reset the MRs in the LOCAL_INV WRs that did not | 535 | /* Find and reset the MRs in the LOCAL_INV WRs that did not |
600 | * get posted. This is synchronous, and slow. | 536 | * get posted. This is synchronous, and slow. |
601 | */ | 537 | */ |
602 | for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { | 538 | list_for_each_entry(mw, &req->rl_registered, mw_list) { |
603 | seg = &req->rl_segments[i]; | ||
604 | mw = seg->rl_mw; | ||
605 | f = &mw->frmr; | 539 | f = &mw->frmr; |
606 | |||
607 | if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) { | 540 | if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) { |
608 | __frwr_reset_mr(ia, mw); | 541 | __frwr_reset_mr(ia, mw); |
609 | bad_wr = bad_wr->next; | 542 | bad_wr = bad_wr->next; |
610 | } | 543 | } |
611 | |||
612 | i += seg->mr_nsegs; | ||
613 | } | 544 | } |
614 | goto unmap; | 545 | goto unmap; |
615 | } | 546 | } |
@@ -621,38 +552,17 @@ static void | |||
621 | frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, | 552 | frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, |
622 | bool sync) | 553 | bool sync) |
623 | { | 554 | { |
624 | struct rpcrdma_mr_seg *seg; | ||
625 | struct rpcrdma_mw *mw; | 555 | struct rpcrdma_mw *mw; |
626 | unsigned int i; | ||
627 | 556 | ||
628 | for (i = 0; req->rl_nchunks; req->rl_nchunks--) { | 557 | while (!list_empty(&req->rl_registered)) { |
629 | seg = &req->rl_segments[i]; | 558 | mw = list_first_entry(&req->rl_registered, |
630 | mw = seg->rl_mw; | 559 | struct rpcrdma_mw, mw_list); |
560 | list_del_init(&mw->mw_list); | ||
631 | 561 | ||
632 | if (sync) | 562 | if (sync) |
633 | __frwr_reset_and_unmap(r_xprt, mw); | 563 | frwr_op_recover_mr(mw); |
634 | else | 564 | else |
635 | __frwr_queue_recovery(mw); | 565 | rpcrdma_defer_mr_recovery(mw); |
636 | |||
637 | i += seg->mr_nsegs; | ||
638 | seg->mr_nsegs = 0; | ||
639 | seg->rl_mw = NULL; | ||
640 | } | ||
641 | } | ||
642 | |||
643 | static void | ||
644 | frwr_op_destroy(struct rpcrdma_buffer *buf) | ||
645 | { | ||
646 | struct rpcrdma_mw *r; | ||
647 | |||
648 | /* Ensure stale MWs for "buf" are no longer in flight */ | ||
649 | flush_workqueue(frwr_recovery_wq); | ||
650 | |||
651 | while (!list_empty(&buf->rb_all)) { | ||
652 | r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); | ||
653 | list_del(&r->mw_all); | ||
654 | __frwr_release(r); | ||
655 | kfree(r); | ||
656 | } | 566 | } |
657 | } | 567 | } |
658 | 568 | ||
@@ -660,9 +570,10 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { | |||
660 | .ro_map = frwr_op_map, | 570 | .ro_map = frwr_op_map, |
661 | .ro_unmap_sync = frwr_op_unmap_sync, | 571 | .ro_unmap_sync = frwr_op_unmap_sync, |
662 | .ro_unmap_safe = frwr_op_unmap_safe, | 572 | .ro_unmap_safe = frwr_op_unmap_safe, |
573 | .ro_recover_mr = frwr_op_recover_mr, | ||
663 | .ro_open = frwr_op_open, | 574 | .ro_open = frwr_op_open, |
664 | .ro_maxpages = frwr_op_maxpages, | 575 | .ro_maxpages = frwr_op_maxpages, |
665 | .ro_init = frwr_op_init, | 576 | .ro_init_mr = frwr_op_init_mr, |
666 | .ro_destroy = frwr_op_destroy, | 577 | .ro_release_mr = frwr_op_release_mr, |
667 | .ro_displayname = "frwr", | 578 | .ro_displayname = "frwr", |
668 | }; | 579 | }; |
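
The reworked ro_map contract above can be summarized as: each call registers one MW covering up to ri_max_frmr_depth segments, returns the number of segments it consumed, and hands the MW back through the new **out parameter; the caller, not the registration code, links the MW onto req->rl_registered and later encodes the chunk from mw_handle, mw_length and mw_offset. Below is a minimal userspace sketch of that calling convention, with simplified stand-in types and made-up values rather than the real kernel structures:

    #include <stdio.h>
    #include <stdlib.h>

    struct mw {
    	unsigned int handle;		/* models mw_handle (rkey) */
    	unsigned int length;		/* models mw_length */
    	unsigned long long offset;	/* models mw_offset (iova) */
    	struct mw *next;		/* models the mw_list linkage */
    };

    /* Models ro_map(): register up to "max_depth" of the remaining
     * segments in one MW, hand the MW back through *out, and return
     * how many segments were consumed (negative on failure).
     */
    static int model_map(int nsegs, int max_depth, struct mw **out)
    {
    	int n = nsegs < max_depth ? nsegs : max_depth;
    	struct mw *m = calloc(1, sizeof(*m));

    	if (!m)
    		return -1;			/* stands in for -ENOBUFS */
    	m->handle = 0xabc0u + (unsigned int)n;	/* placeholder rkey */
    	m->length = (unsigned int)n * 4096;	/* pretend 4KB per segment */
    	m->offset = 0x100000ULL;
    	*out = m;
    	return n;
    }

    int main(void)
    {
    	struct mw *registered = NULL;		/* models req->rl_registered */
    	struct mw *m;
    	int n, nsegs = 5;

    	do {
    		n = model_map(nsegs, 2, &m);
    		if (n < 0)
    			return 1;
    		m->next = registered;		/* the caller links the MW */
    		registered = m;
    		nsegs -= n;
    	} while (nsegs);

    	while (registered) {
    		m = registered;
    		registered = m->next;
    		printf("handle 0x%x len %u off 0x%llx\n",
    		       m->handle, m->length, m->offset);
    		free(m);
    	}
    	return 0;
    }

Keeping the registered MWs on a per-request list is what lets frwr_op_unmap_sync and frwr_op_unmap_safe walk rl_registered directly instead of counting chunks back out of rl_segments.
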
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
deleted file mode 100644
index 3750596cc432..000000000000
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ /dev/null
@@ -1,122 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2015 Oracle. All rights reserved. | ||
3 | * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. | ||
4 | */ | ||
5 | |||
6 | /* No-op chunk preparation. All client memory is pre-registered. | ||
7 | * Sometimes referred to as ALLPHYSICAL mode. | ||
8 | * | ||
9 | * Physical registration is simple because all client memory is | ||
10 | * pre-registered and never deregistered. This mode is good for | ||
11 | * adapter bring up, but is considered not safe: the server is | ||
12 | * trusted not to abuse its access to client memory not involved | ||
13 | * in RDMA I/O. | ||
14 | */ | ||
15 | |||
16 | #include "xprt_rdma.h" | ||
17 | |||
18 | #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) | ||
19 | # define RPCDBG_FACILITY RPCDBG_TRANS | ||
20 | #endif | ||
21 | |||
22 | static int | ||
23 | physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, | ||
24 | struct rpcrdma_create_data_internal *cdata) | ||
25 | { | ||
26 | struct ib_mr *mr; | ||
27 | |||
28 | /* Obtain an rkey to use for RPC data payloads. | ||
29 | */ | ||
30 | mr = ib_get_dma_mr(ia->ri_pd, | ||
31 | IB_ACCESS_LOCAL_WRITE | | ||
32 | IB_ACCESS_REMOTE_WRITE | | ||
33 | IB_ACCESS_REMOTE_READ); | ||
34 | if (IS_ERR(mr)) { | ||
35 | pr_err("%s: ib_get_dma_mr failed with %lX\n", | ||
36 | __func__, PTR_ERR(mr)); | ||
37 | return -ENOMEM; | ||
38 | } | ||
39 | ia->ri_dma_mr = mr; | ||
40 | |||
41 | rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int, | ||
42 | RPCRDMA_MAX_DATA_SEGS, | ||
43 | RPCRDMA_MAX_HDR_SEGS)); | ||
44 | return 0; | ||
45 | } | ||
46 | |||
47 | /* PHYSICAL memory registration conveys one page per chunk segment. | ||
48 | */ | ||
49 | static size_t | ||
50 | physical_op_maxpages(struct rpcrdma_xprt *r_xprt) | ||
51 | { | ||
52 | return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, | ||
53 | RPCRDMA_MAX_HDR_SEGS); | ||
54 | } | ||
55 | |||
56 | static int | ||
57 | physical_op_init(struct rpcrdma_xprt *r_xprt) | ||
58 | { | ||
59 | return 0; | ||
60 | } | ||
61 | |||
62 | /* The client's physical memory is already exposed for | ||
63 | * remote access via RDMA READ or RDMA WRITE. | ||
64 | */ | ||
65 | static int | ||
66 | physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, | ||
67 | int nsegs, bool writing) | ||
68 | { | ||
69 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; | ||
70 | |||
71 | rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing)); | ||
72 | seg->mr_rkey = ia->ri_dma_mr->rkey; | ||
73 | seg->mr_base = seg->mr_dma; | ||
74 | return 1; | ||
75 | } | ||
76 | |||
77 | /* DMA unmap all memory regions that were mapped for "req". | ||
78 | */ | ||
79 | static void | ||
80 | physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) | ||
81 | { | ||
82 | struct ib_device *device = r_xprt->rx_ia.ri_device; | ||
83 | unsigned int i; | ||
84 | |||
85 | for (i = 0; req->rl_nchunks; --req->rl_nchunks) | ||
86 | rpcrdma_unmap_one(device, &req->rl_segments[i++]); | ||
87 | } | ||
88 | |||
89 | /* Use a slow, safe mechanism to invalidate all memory regions | ||
90 | * that were registered for "req". | ||
91 | * | ||
92 | * For physical memory registration, there is no good way to | ||
93 | * fence a single MR that has been advertised to the server. The | ||
94 | * client has already handed the server an R_key that cannot be | ||
95 | * invalidated and is shared by all MRs on this connection. | ||
96 | * Tearing down the PD might be the only safe choice, but it's | ||
97 | * not clear that a freshly acquired DMA R_key would be different | ||
98 | * than the one used by the PD that was just destroyed. | ||
99 | * FIXME. | ||
100 | */ | ||
101 | static void | ||
102 | physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, | ||
103 | bool sync) | ||
104 | { | ||
105 | physical_op_unmap_sync(r_xprt, req); | ||
106 | } | ||
107 | |||
108 | static void | ||
109 | physical_op_destroy(struct rpcrdma_buffer *buf) | ||
110 | { | ||
111 | } | ||
112 | |||
113 | const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { | ||
114 | .ro_map = physical_op_map, | ||
115 | .ro_unmap_sync = physical_op_unmap_sync, | ||
116 | .ro_unmap_safe = physical_op_unmap_safe, | ||
117 | .ro_open = physical_op_open, | ||
118 | .ro_maxpages = physical_op_maxpages, | ||
119 | .ro_init = physical_op_init, | ||
120 | .ro_destroy = physical_op_destroy, | ||
121 | .ro_displayname = "physical", | ||
122 | }; | ||
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 35a81096e83d..a47f170b20ef 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -196,8 +196,7 @@ rpcrdma_tail_pullup(struct xdr_buf *buf) | |||
196 | * MR when they can. | 196 | * MR when they can. |
197 | */ | 197 | */ |
198 | static int | 198 | static int |
199 | rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, | 199 | rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n) |
200 | int n, int nsegs) | ||
201 | { | 200 | { |
202 | size_t page_offset; | 201 | size_t page_offset; |
203 | u32 remaining; | 202 | u32 remaining; |
@@ -206,7 +205,7 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, | |||
206 | base = vec->iov_base; | 205 | base = vec->iov_base; |
207 | page_offset = offset_in_page(base); | 206 | page_offset = offset_in_page(base); |
208 | remaining = vec->iov_len; | 207 | remaining = vec->iov_len; |
209 | while (remaining && n < nsegs) { | 208 | while (remaining && n < RPCRDMA_MAX_SEGS) { |
210 | seg[n].mr_page = NULL; | 209 | seg[n].mr_page = NULL; |
211 | seg[n].mr_offset = base; | 210 | seg[n].mr_offset = base; |
212 | seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); | 211 | seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); |
@@ -230,34 +229,34 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, | |||
230 | 229 | ||
231 | static int | 230 | static int |
232 | rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, | 231 | rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, |
233 | enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs) | 232 | enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg) |
234 | { | 233 | { |
235 | int len, n = 0, p; | 234 | int len, n, p, page_base; |
236 | int page_base; | ||
237 | struct page **ppages; | 235 | struct page **ppages; |
238 | 236 | ||
237 | n = 0; | ||
239 | if (pos == 0) { | 238 | if (pos == 0) { |
240 | n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n, nsegs); | 239 | n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n); |
241 | if (n == nsegs) | 240 | if (n == RPCRDMA_MAX_SEGS) |
242 | return -EIO; | 241 | goto out_overflow; |
243 | } | 242 | } |
244 | 243 | ||
245 | len = xdrbuf->page_len; | 244 | len = xdrbuf->page_len; |
246 | ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); | 245 | ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); |
247 | page_base = xdrbuf->page_base & ~PAGE_MASK; | 246 | page_base = xdrbuf->page_base & ~PAGE_MASK; |
248 | p = 0; | 247 | p = 0; |
249 | while (len && n < nsegs) { | 248 | while (len && n < RPCRDMA_MAX_SEGS) { |
250 | if (!ppages[p]) { | 249 | if (!ppages[p]) { |
251 | /* alloc the pagelist for receiving buffer */ | 250 | /* alloc the pagelist for receiving buffer */ |
252 | ppages[p] = alloc_page(GFP_ATOMIC); | 251 | ppages[p] = alloc_page(GFP_ATOMIC); |
253 | if (!ppages[p]) | 252 | if (!ppages[p]) |
254 | return -ENOMEM; | 253 | return -EAGAIN; |
255 | } | 254 | } |
256 | seg[n].mr_page = ppages[p]; | 255 | seg[n].mr_page = ppages[p]; |
257 | seg[n].mr_offset = (void *)(unsigned long) page_base; | 256 | seg[n].mr_offset = (void *)(unsigned long) page_base; |
258 | seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); | 257 | seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); |
259 | if (seg[n].mr_len > PAGE_SIZE) | 258 | if (seg[n].mr_len > PAGE_SIZE) |
260 | return -EIO; | 259 | goto out_overflow; |
261 | len -= seg[n].mr_len; | 260 | len -= seg[n].mr_len; |
262 | ++n; | 261 | ++n; |
263 | ++p; | 262 | ++p; |
@@ -265,8 +264,8 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, | |||
265 | } | 264 | } |
266 | 265 | ||
267 | /* Message overflows the seg array */ | 266 | /* Message overflows the seg array */ |
268 | if (len && n == nsegs) | 267 | if (len && n == RPCRDMA_MAX_SEGS) |
269 | return -EIO; | 268 | goto out_overflow; |
270 | 269 | ||
271 | /* When encoding the read list, the tail is always sent inline */ | 270 | /* When encoding the read list, the tail is always sent inline */ |
272 | if (type == rpcrdma_readch) | 271 | if (type == rpcrdma_readch) |
@@ -277,20 +276,24 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, | |||
277 | * xdr pad bytes, saving the server an RDMA operation. */ | 276 | * xdr pad bytes, saving the server an RDMA operation. */ |
278 | if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) | 277 | if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) |
279 | return n; | 278 | return n; |
280 | n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n, nsegs); | 279 | n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n); |
281 | if (n == nsegs) | 280 | if (n == RPCRDMA_MAX_SEGS) |
282 | return -EIO; | 281 | goto out_overflow; |
283 | } | 282 | } |
284 | 283 | ||
285 | return n; | 284 | return n; |
285 | |||
286 | out_overflow: | ||
287 | pr_err("rpcrdma: segment array overflow\n"); | ||
288 | return -EIO; | ||
286 | } | 289 | } |
287 | 290 | ||
288 | static inline __be32 * | 291 | static inline __be32 * |
289 | xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg) | 292 | xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw) |
290 | { | 293 | { |
291 | *iptr++ = cpu_to_be32(seg->mr_rkey); | 294 | *iptr++ = cpu_to_be32(mw->mw_handle); |
292 | *iptr++ = cpu_to_be32(seg->mr_len); | 295 | *iptr++ = cpu_to_be32(mw->mw_length); |
293 | return xdr_encode_hyper(iptr, seg->mr_base); | 296 | return xdr_encode_hyper(iptr, mw->mw_offset); |
294 | } | 297 | } |
295 | 298 | ||
296 | /* XDR-encode the Read list. Supports encoding a list of read | 299 | /* XDR-encode the Read list. Supports encoding a list of read |
@@ -310,7 +313,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, | |||
310 | struct rpcrdma_req *req, struct rpc_rqst *rqst, | 313 | struct rpcrdma_req *req, struct rpc_rqst *rqst, |
311 | __be32 *iptr, enum rpcrdma_chunktype rtype) | 314 | __be32 *iptr, enum rpcrdma_chunktype rtype) |
312 | { | 315 | { |
313 | struct rpcrdma_mr_seg *seg = req->rl_nextseg; | 316 | struct rpcrdma_mr_seg *seg; |
317 | struct rpcrdma_mw *mw; | ||
314 | unsigned int pos; | 318 | unsigned int pos; |
315 | int n, nsegs; | 319 | int n, nsegs; |
316 | 320 | ||
@@ -322,15 +326,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, | |||
322 | pos = rqst->rq_snd_buf.head[0].iov_len; | 326 | pos = rqst->rq_snd_buf.head[0].iov_len; |
323 | if (rtype == rpcrdma_areadch) | 327 | if (rtype == rpcrdma_areadch) |
324 | pos = 0; | 328 | pos = 0; |
325 | nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, | 329 | seg = req->rl_segments; |
326 | RPCRDMA_MAX_SEGS - req->rl_nchunks); | 330 | nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg); |
327 | if (nsegs < 0) | 331 | if (nsegs < 0) |
328 | return ERR_PTR(nsegs); | 332 | return ERR_PTR(nsegs); |
329 | 333 | ||
330 | do { | 334 | do { |
331 | n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false); | 335 | n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, |
332 | if (n <= 0) | 336 | false, &mw); |
337 | if (n < 0) | ||
333 | return ERR_PTR(n); | 338 | return ERR_PTR(n); |
339 | list_add(&mw->mw_list, &req->rl_registered); | ||
334 | 340 | ||
335 | *iptr++ = xdr_one; /* item present */ | 341 | *iptr++ = xdr_one; /* item present */ |
336 | 342 | ||
@@ -338,20 +344,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, | |||
338 | * have the same "position". | 344 | * have the same "position". |
339 | */ | 345 | */ |
340 | *iptr++ = cpu_to_be32(pos); | 346 | *iptr++ = cpu_to_be32(pos); |
341 | iptr = xdr_encode_rdma_segment(iptr, seg); | 347 | iptr = xdr_encode_rdma_segment(iptr, mw); |
342 | 348 | ||
343 | dprintk("RPC: %5u %s: read segment pos %u " | 349 | dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n", |
344 | "%d@0x%016llx:0x%08x (%s)\n", | ||
345 | rqst->rq_task->tk_pid, __func__, pos, | 350 | rqst->rq_task->tk_pid, __func__, pos, |
346 | seg->mr_len, (unsigned long long)seg->mr_base, | 351 | mw->mw_length, (unsigned long long)mw->mw_offset, |
347 | seg->mr_rkey, n < nsegs ? "more" : "last"); | 352 | mw->mw_handle, n < nsegs ? "more" : "last"); |
348 | 353 | ||
349 | r_xprt->rx_stats.read_chunk_count++; | 354 | r_xprt->rx_stats.read_chunk_count++; |
350 | req->rl_nchunks++; | ||
351 | seg += n; | 355 | seg += n; |
352 | nsegs -= n; | 356 | nsegs -= n; |
353 | } while (nsegs); | 357 | } while (nsegs); |
354 | req->rl_nextseg = seg; | ||
355 | 358 | ||
356 | /* Finish Read list */ | 359 | /* Finish Read list */ |
357 | *iptr++ = xdr_zero; /* Next item not present */ | 360 | *iptr++ = xdr_zero; /* Next item not present */ |
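
On the wire, each Read list entry produced by the loop above is an "item present" discriminator, the chunk position, and the segment triple written by xdr_encode_rdma_segment (handle, length, and a 64-bit offset encoded high word first), with a final xdr_zero closing the list. The following is a standalone sketch of that layout only, not the kernel encoder; the buffer and the handle, length, position and offset values are made up:

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Write one RDMA segment as XDR: handle, length, then a 64-bit
     * offset encoded high word first (as xdr_encode_hyper does).
     */
    static uint32_t *encode_segment(uint32_t *p, uint32_t handle,
    				uint32_t length, uint64_t offset)
    {
    	*p++ = htonl(handle);			/* mw_handle */
    	*p++ = htonl(length);			/* mw_length */
    	*p++ = htonl((uint32_t)(offset >> 32));	/* mw_offset, high word */
    	*p++ = htonl((uint32_t)offset);		/* mw_offset, low word */
    	return p;
    }

    int main(void)
    {
    	uint32_t buf[8], *p = buf;

    	*p++ = htonl(1);	/* item present (xdr_one) */
    	*p++ = htonl(144);	/* example position within the RPC message */
    	p = encode_segment(p, 0xabc0, 8192, 0x7f0000001000ULL);
    	*p++ = 0;		/* next item not present (xdr_zero) */

    	printf("encoded %d XDR words\n", (int)(p - buf));
    	return 0;
    }
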
@@ -375,7 +378,8 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, | |||
375 | struct rpc_rqst *rqst, __be32 *iptr, | 378 | struct rpc_rqst *rqst, __be32 *iptr, |
376 | enum rpcrdma_chunktype wtype) | 379 | enum rpcrdma_chunktype wtype) |
377 | { | 380 | { |
378 | struct rpcrdma_mr_seg *seg = req->rl_nextseg; | 381 | struct rpcrdma_mr_seg *seg; |
382 | struct rpcrdma_mw *mw; | ||
379 | int n, nsegs, nchunks; | 383 | int n, nsegs, nchunks; |
380 | __be32 *segcount; | 384 | __be32 *segcount; |
381 | 385 | ||
@@ -384,10 +388,10 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, | |||
384 | return iptr; | 388 | return iptr; |
385 | } | 389 | } |
386 | 390 | ||
391 | seg = req->rl_segments; | ||
387 | nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, | 392 | nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, |
388 | rqst->rq_rcv_buf.head[0].iov_len, | 393 | rqst->rq_rcv_buf.head[0].iov_len, |
389 | wtype, seg, | 394 | wtype, seg); |
390 | RPCRDMA_MAX_SEGS - req->rl_nchunks); | ||
391 | if (nsegs < 0) | 395 | if (nsegs < 0) |
392 | return ERR_PTR(nsegs); | 396 | return ERR_PTR(nsegs); |
393 | 397 | ||
@@ -396,26 +400,25 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, | |||
396 | 400 | ||
397 | nchunks = 0; | 401 | nchunks = 0; |
398 | do { | 402 | do { |
399 | n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); | 403 | n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, |
400 | if (n <= 0) | 404 | true, &mw); |
405 | if (n < 0) | ||
401 | return ERR_PTR(n); | 406 | return ERR_PTR(n); |
407 | list_add(&mw->mw_list, &req->rl_registered); | ||
402 | 408 | ||
403 | iptr = xdr_encode_rdma_segment(iptr, seg); | 409 | iptr = xdr_encode_rdma_segment(iptr, mw); |
404 | 410 | ||
405 | dprintk("RPC: %5u %s: write segment " | 411 | dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n", |
406 | "%d@0x016%llx:0x%08x (%s)\n", | ||
407 | rqst->rq_task->tk_pid, __func__, | 412 | rqst->rq_task->tk_pid, __func__, |
408 | seg->mr_len, (unsigned long long)seg->mr_base, | 413 | mw->mw_length, (unsigned long long)mw->mw_offset, |
409 | seg->mr_rkey, n < nsegs ? "more" : "last"); | 414 | mw->mw_handle, n < nsegs ? "more" : "last"); |
410 | 415 | ||
411 | r_xprt->rx_stats.write_chunk_count++; | 416 | r_xprt->rx_stats.write_chunk_count++; |
412 | r_xprt->rx_stats.total_rdma_request += seg->mr_len; | 417 | r_xprt->rx_stats.total_rdma_request += seg->mr_len; |
413 | req->rl_nchunks++; | ||
414 | nchunks++; | 418 | nchunks++; |
415 | seg += n; | 419 | seg += n; |
416 | nsegs -= n; | 420 | nsegs -= n; |
417 | } while (nsegs); | 421 | } while (nsegs); |
418 | req->rl_nextseg = seg; | ||
419 | 422 | ||
420 | /* Update count of segments in this Write chunk */ | 423 | /* Update count of segments in this Write chunk */ |
421 | *segcount = cpu_to_be32(nchunks); | 424 | *segcount = cpu_to_be32(nchunks); |
@@ -442,7 +445,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, | |||
442 | struct rpcrdma_req *req, struct rpc_rqst *rqst, | 445 | struct rpcrdma_req *req, struct rpc_rqst *rqst, |
443 | __be32 *iptr, enum rpcrdma_chunktype wtype) | 446 | __be32 *iptr, enum rpcrdma_chunktype wtype) |
444 | { | 447 | { |
445 | struct rpcrdma_mr_seg *seg = req->rl_nextseg; | 448 | struct rpcrdma_mr_seg *seg; |
449 | struct rpcrdma_mw *mw; | ||
446 | int n, nsegs, nchunks; | 450 | int n, nsegs, nchunks; |
447 | __be32 *segcount; | 451 | __be32 *segcount; |
448 | 452 | ||
@@ -451,8 +455,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, | |||
451 | return iptr; | 455 | return iptr; |
452 | } | 456 | } |
453 | 457 | ||
454 | nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg, | 458 | seg = req->rl_segments; |
455 | RPCRDMA_MAX_SEGS - req->rl_nchunks); | 459 | nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg); |
456 | if (nsegs < 0) | 460 | if (nsegs < 0) |
457 | return ERR_PTR(nsegs); | 461 | return ERR_PTR(nsegs); |
458 | 462 | ||
@@ -461,26 +465,25 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, | |||
461 | 465 | ||
462 | nchunks = 0; | 466 | nchunks = 0; |
463 | do { | 467 | do { |
464 | n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); | 468 | n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, |
465 | if (n <= 0) | 469 | true, &mw); |
470 | if (n < 0) | ||
466 | return ERR_PTR(n); | 471 | return ERR_PTR(n); |
472 | list_add(&mw->mw_list, &req->rl_registered); | ||
467 | 473 | ||
468 | iptr = xdr_encode_rdma_segment(iptr, seg); | 474 | iptr = xdr_encode_rdma_segment(iptr, mw); |
469 | 475 | ||
470 | dprintk("RPC: %5u %s: reply segment " | 476 | dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n", |
471 | "%d@0x%016llx:0x%08x (%s)\n", | ||
472 | rqst->rq_task->tk_pid, __func__, | 477 | rqst->rq_task->tk_pid, __func__, |
473 | seg->mr_len, (unsigned long long)seg->mr_base, | 478 | mw->mw_length, (unsigned long long)mw->mw_offset, |
474 | seg->mr_rkey, n < nsegs ? "more" : "last"); | 479 | mw->mw_handle, n < nsegs ? "more" : "last"); |
475 | 480 | ||
476 | r_xprt->rx_stats.reply_chunk_count++; | 481 | r_xprt->rx_stats.reply_chunk_count++; |
477 | r_xprt->rx_stats.total_rdma_request += seg->mr_len; | 482 | r_xprt->rx_stats.total_rdma_request += seg->mr_len; |
478 | req->rl_nchunks++; | ||
479 | nchunks++; | 483 | nchunks++; |
480 | seg += n; | 484 | seg += n; |
481 | nsegs -= n; | 485 | nsegs -= n; |
482 | } while (nsegs); | 486 | } while (nsegs); |
483 | req->rl_nextseg = seg; | ||
484 | 487 | ||
485 | /* Update count of segments in the Reply chunk */ | 488 | /* Update count of segments in the Reply chunk */ |
486 | *segcount = cpu_to_be32(nchunks); | 489 | *segcount = cpu_to_be32(nchunks); |
@@ -567,6 +570,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) | |||
567 | struct rpcrdma_req *req = rpcr_to_rdmar(rqst); | 570 | struct rpcrdma_req *req = rpcr_to_rdmar(rqst); |
568 | enum rpcrdma_chunktype rtype, wtype; | 571 | enum rpcrdma_chunktype rtype, wtype; |
569 | struct rpcrdma_msg *headerp; | 572 | struct rpcrdma_msg *headerp; |
573 | bool ddp_allowed; | ||
570 | ssize_t hdrlen; | 574 | ssize_t hdrlen; |
571 | size_t rpclen; | 575 | size_t rpclen; |
572 | __be32 *iptr; | 576 | __be32 *iptr; |
@@ -583,6 +587,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) | |||
583 | headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); | 587 | headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); |
584 | headerp->rm_type = rdma_msg; | 588 | headerp->rm_type = rdma_msg; |
585 | 589 | ||
590 | /* When the ULP employs a GSS flavor that guarantees integrity | ||
591 | * or privacy, direct data placement of individual data items | ||
592 | * is not allowed. | ||
593 | */ | ||
594 | ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags & | ||
595 | RPCAUTH_AUTH_DATATOUCH); | ||
596 | |||
586 | /* | 597 | /* |
587 | * Chunks needed for results? | 598 | * Chunks needed for results? |
588 | * | 599 | * |
@@ -594,7 +605,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) | |||
594 | */ | 605 | */ |
595 | if (rpcrdma_results_inline(r_xprt, rqst)) | 606 | if (rpcrdma_results_inline(r_xprt, rqst)) |
596 | wtype = rpcrdma_noch; | 607 | wtype = rpcrdma_noch; |
597 | else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) | 608 | else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) |
598 | wtype = rpcrdma_writech; | 609 | wtype = rpcrdma_writech; |
599 | else | 610 | else |
600 | wtype = rpcrdma_replych; | 611 | wtype = rpcrdma_replych; |
@@ -617,7 +628,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) | |||
617 | rtype = rpcrdma_noch; | 628 | rtype = rpcrdma_noch; |
618 | rpcrdma_inline_pullup(rqst); | 629 | rpcrdma_inline_pullup(rqst); |
619 | rpclen = rqst->rq_svec[0].iov_len; | 630 | rpclen = rqst->rq_svec[0].iov_len; |
620 | } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) { | 631 | } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { |
621 | rtype = rpcrdma_readch; | 632 | rtype = rpcrdma_readch; |
622 | rpclen = rqst->rq_svec[0].iov_len; | 633 | rpclen = rqst->rq_svec[0].iov_len; |
623 | rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); | 634 | rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); |
@@ -650,8 +661,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) | |||
650 | * send a Call message with a Position Zero Read chunk and a | 661 | * send a Call message with a Position Zero Read chunk and a |
651 | * regular Read chunk at the same time. | 662 | * regular Read chunk at the same time. |
652 | */ | 663 | */ |
653 | req->rl_nchunks = 0; | ||
654 | req->rl_nextseg = req->rl_segments; | ||
655 | iptr = headerp->rm_body.rm_chunks; | 664 | iptr = headerp->rm_body.rm_chunks; |
656 | iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); | 665 | iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); |
657 | if (IS_ERR(iptr)) | 666 | if (IS_ERR(iptr)) |
@@ -690,10 +699,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) | |||
690 | out_overflow: | 699 | out_overflow: |
691 | pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n", | 700 | pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n", |
692 | hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]); | 701 | hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]); |
693 | /* Terminate this RPC. Chunks registered above will be | 702 | iptr = ERR_PTR(-EIO); |
694 | * released by xprt_release -> xprt_rmda_free . | ||
695 | */ | ||
696 | return -EIO; | ||
697 | 703 | ||
698 | out_unmap: | 704 | out_unmap: |
699 | r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); | 705 | r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); |
@@ -705,15 +711,13 @@ out_unmap: | |||
705 | * RDMA'd by server. See map at rpcrdma_create_chunks()! :-) | 711 | * RDMA'd by server. See map at rpcrdma_create_chunks()! :-) |
706 | */ | 712 | */ |
707 | static int | 713 | static int |
708 | rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp) | 714 | rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp) |
709 | { | 715 | { |
710 | unsigned int i, total_len; | 716 | unsigned int i, total_len; |
711 | struct rpcrdma_write_chunk *cur_wchunk; | 717 | struct rpcrdma_write_chunk *cur_wchunk; |
712 | char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf); | 718 | char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf); |
713 | 719 | ||
714 | i = be32_to_cpu(**iptrp); | 720 | i = be32_to_cpu(**iptrp); |
715 | if (i > max) | ||
716 | return -1; | ||
717 | cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); | 721 | cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); |
718 | total_len = 0; | 722 | total_len = 0; |
719 | while (i--) { | 723 | while (i--) { |
@@ -744,45 +748,66 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b | |||
744 | return total_len; | 748 | return total_len; |
745 | } | 749 | } |
746 | 750 | ||
747 | /* | 751 | /** |
748 | * Scatter inline received data back into provided iov's. | 752 | * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs |
753 | * @rqst: controlling RPC request | ||
754 | * @srcp: points to RPC message payload in receive buffer | ||
755 | * @copy_len: remaining length of receive buffer content | ||
756 | * @pad: Write chunk pad bytes needed (zero for pure inline) | ||
757 | * | ||
758 | * The upper layer has set the maximum number of bytes it can | ||
759 | * receive in each component of rq_rcv_buf. These values are set in | ||
760 | * the head.iov_len, page_len, tail.iov_len, and buflen fields. | ||
761 | * | ||
762 | * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in | ||
763 | * many cases this function simply updates iov_base pointers in | ||
764 | * rq_rcv_buf to point directly to the received reply data, to | ||
765 | * avoid copying reply data. | ||
766 | * | ||
767 | * Returns the count of bytes which had to be memcopied. | ||
749 | */ | 768 | */ |
750 | static void | 769 | static unsigned long |
751 | rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) | 770 | rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) |
752 | { | 771 | { |
753 | int i, npages, curlen, olen; | 772 | unsigned long fixup_copy_count; |
773 | int i, npages, curlen; | ||
754 | char *destp; | 774 | char *destp; |
755 | struct page **ppages; | 775 | struct page **ppages; |
756 | int page_base; | 776 | int page_base; |
757 | 777 | ||
778 | /* The head iovec is redirected to the RPC reply message | ||
779 | * in the receive buffer, to avoid a memcopy. | ||
780 | */ | ||
781 | rqst->rq_rcv_buf.head[0].iov_base = srcp; | ||
782 | rqst->rq_private_buf.head[0].iov_base = srcp; | ||
783 | |||
784 | /* The contents of the receive buffer that follow | ||
785 | * head.iov_len bytes are copied into the page list. | ||
786 | */ | ||
758 | curlen = rqst->rq_rcv_buf.head[0].iov_len; | 787 | curlen = rqst->rq_rcv_buf.head[0].iov_len; |
759 | if (curlen > copy_len) { /* write chunk header fixup */ | 788 | if (curlen > copy_len) |
760 | curlen = copy_len; | 789 | curlen = copy_len; |
761 | rqst->rq_rcv_buf.head[0].iov_len = curlen; | ||
762 | } | ||
763 | |||
764 | dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n", | 790 | dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n", |
765 | __func__, srcp, copy_len, curlen); | 791 | __func__, srcp, copy_len, curlen); |
766 | |||
767 | /* Shift pointer for first receive segment only */ | ||
768 | rqst->rq_rcv_buf.head[0].iov_base = srcp; | ||
769 | srcp += curlen; | 792 | srcp += curlen; |
770 | copy_len -= curlen; | 793 | copy_len -= curlen; |
771 | 794 | ||
772 | olen = copy_len; | ||
773 | i = 0; | ||
774 | rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen; | ||
775 | page_base = rqst->rq_rcv_buf.page_base; | 795 | page_base = rqst->rq_rcv_buf.page_base; |
776 | ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT); | 796 | ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT); |
777 | page_base &= ~PAGE_MASK; | 797 | page_base &= ~PAGE_MASK; |
778 | 798 | fixup_copy_count = 0; | |
779 | if (copy_len && rqst->rq_rcv_buf.page_len) { | 799 | if (copy_len && rqst->rq_rcv_buf.page_len) { |
780 | npages = PAGE_ALIGN(page_base + | 800 | int pagelist_len; |
781 | rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT; | 801 | |
782 | for (; i < npages; i++) { | 802 | pagelist_len = rqst->rq_rcv_buf.page_len; |
803 | if (pagelist_len > copy_len) | ||
804 | pagelist_len = copy_len; | ||
805 | npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT; | ||
806 | for (i = 0; i < npages; i++) { | ||
783 | curlen = PAGE_SIZE - page_base; | 807 | curlen = PAGE_SIZE - page_base; |
784 | if (curlen > copy_len) | 808 | if (curlen > pagelist_len) |
785 | curlen = copy_len; | 809 | curlen = pagelist_len; |
810 | |||
786 | dprintk("RPC: %s: page %d" | 811 | dprintk("RPC: %s: page %d" |
787 | " srcp 0x%p len %d curlen %d\n", | 812 | " srcp 0x%p len %d curlen %d\n", |
788 | __func__, i, srcp, copy_len, curlen); | 813 | __func__, i, srcp, copy_len, curlen); |
@@ -792,39 +817,32 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) | |||
792 | kunmap_atomic(destp); | 817 | kunmap_atomic(destp); |
793 | srcp += curlen; | 818 | srcp += curlen; |
794 | copy_len -= curlen; | 819 | copy_len -= curlen; |
795 | if (copy_len == 0) | 820 | fixup_copy_count += curlen; |
821 | pagelist_len -= curlen; | ||
822 | if (!pagelist_len) | ||
796 | break; | 823 | break; |
797 | page_base = 0; | 824 | page_base = 0; |
798 | } | 825 | } |
799 | } | ||
800 | 826 | ||
801 | if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) { | 827 | /* Implicit padding for the last segment in a Write |
802 | curlen = copy_len; | 828 | * chunk is inserted inline at the front of the tail |
803 | if (curlen > rqst->rq_rcv_buf.tail[0].iov_len) | 829 | * iovec. The upper layer ignores the content of |
804 | curlen = rqst->rq_rcv_buf.tail[0].iov_len; | 830 | * the pad. Simply ensure inline content in the tail |
805 | if (rqst->rq_rcv_buf.tail[0].iov_base != srcp) | 831 | * that follows the Write chunk is properly aligned. |
806 | memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); | 832 | */ |
807 | dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n", | 833 | if (pad) |
808 | __func__, srcp, copy_len, curlen); | 834 | srcp -= pad; |
809 | rqst->rq_rcv_buf.tail[0].iov_len = curlen; | ||
810 | copy_len -= curlen; ++i; | ||
811 | } else | ||
812 | rqst->rq_rcv_buf.tail[0].iov_len = 0; | ||
813 | |||
814 | if (pad) { | ||
815 | /* implicit padding on terminal chunk */ | ||
816 | unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base; | ||
817 | while (pad--) | ||
818 | p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0; | ||
819 | } | 835 | } |
820 | 836 | ||
821 | if (copy_len) | 837 | /* The tail iovec is redirected to the remaining data |
822 | dprintk("RPC: %s: %d bytes in" | 838 | * in the receive buffer, to avoid a memcopy. |
823 | " %d extra segments (%d lost)\n", | 839 | */ |
824 | __func__, olen, i, copy_len); | 840 | if (copy_len || pad) { |
841 | rqst->rq_rcv_buf.tail[0].iov_base = srcp; | ||
842 | rqst->rq_private_buf.tail[0].iov_base = srcp; | ||
843 | } | ||
825 | 844 | ||
826 | /* TBD avoid a warning from call_decode() */ | 845 | return fixup_copy_count; |
827 | rqst->rq_private_buf = rqst->rq_rcv_buf; | ||
828 | } | 846 | } |
829 | 847 | ||
830 | void | 848 | void |
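
The arithmetic in the rewritten rpcrdma_inline_fixup() boils down to: the head and tail iovecs are redirected into the receive buffer, and only the page-list portion is ever memcopied, which is exactly the value fed into the fixup_copy_count statistic. A small standalone model of that split follows, with made-up buffer sizes; it is not kernel code:

    #include <stdio.h>

    /* Model of the split: "head_len" bytes are claimed by the redirected
     * head iovec, up to "page_len" bytes are copied into the page list,
     * and whatever is left is covered by the redirected tail iovec.
     * Only the copied portion is returned, mirroring fixup_copy_count.
     */
    static unsigned long model_fixup(int copy_len, int head_len, int page_len)
    {
    	unsigned long copied = 0;
    	int curlen;

    	curlen = head_len < copy_len ? head_len : copy_len;
    	copy_len -= curlen;		/* head: redirected, not copied */

    	if (copy_len && page_len) {
    		curlen = page_len < copy_len ? page_len : copy_len;
    		copied += curlen;	/* page list: memcopied */
    		copy_len -= curlen;
    	}
    					/* remainder: redirected tail */
    	return copied;
    }

    int main(void)
    {
    	/* e.g. a 6000-byte inline reply, 128-byte head, 4096-byte page list */
    	printf("memcopied: %lu bytes\n", model_fixup(6000, 128, 4096));
    	return 0;
    }

With those example values only the 4096-byte page-list portion counts as copied; the remaining bytes are reached through the redirected tail iovec without a memcopy.
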
@@ -960,14 +978,13 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) | |||
960 | (headerp->rm_body.rm_chunks[1] == xdr_zero && | 978 | (headerp->rm_body.rm_chunks[1] == xdr_zero && |
961 | headerp->rm_body.rm_chunks[2] != xdr_zero) || | 979 | headerp->rm_body.rm_chunks[2] != xdr_zero) || |
962 | (headerp->rm_body.rm_chunks[1] != xdr_zero && | 980 | (headerp->rm_body.rm_chunks[1] != xdr_zero && |
963 | req->rl_nchunks == 0)) | 981 | list_empty(&req->rl_registered))) |
964 | goto badheader; | 982 | goto badheader; |
965 | if (headerp->rm_body.rm_chunks[1] != xdr_zero) { | 983 | if (headerp->rm_body.rm_chunks[1] != xdr_zero) { |
966 | /* count any expected write chunks in read reply */ | 984 | /* count any expected write chunks in read reply */ |
967 | /* start at write chunk array count */ | 985 | /* start at write chunk array count */ |
968 | iptr = &headerp->rm_body.rm_chunks[2]; | 986 | iptr = &headerp->rm_body.rm_chunks[2]; |
969 | rdmalen = rpcrdma_count_chunks(rep, | 987 | rdmalen = rpcrdma_count_chunks(rep, 1, &iptr); |
970 | req->rl_nchunks, 1, &iptr); | ||
971 | /* check for validity, and no reply chunk after */ | 988 | /* check for validity, and no reply chunk after */ |
972 | if (rdmalen < 0 || *iptr++ != xdr_zero) | 989 | if (rdmalen < 0 || *iptr++ != xdr_zero) |
973 | goto badheader; | 990 | goto badheader; |
@@ -988,8 +1005,10 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) | |||
988 | rep->rr_len -= RPCRDMA_HDRLEN_MIN; | 1005 | rep->rr_len -= RPCRDMA_HDRLEN_MIN; |
989 | status = rep->rr_len; | 1006 | status = rep->rr_len; |
990 | } | 1007 | } |
991 | /* Fix up the rpc results for upper layer */ | 1008 | |
992 | rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); | 1009 | r_xprt->rx_stats.fixup_copy_count += |
1010 | rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, | ||
1011 | rdmalen); | ||
993 | break; | 1012 | break; |
994 | 1013 | ||
995 | case rdma_nomsg: | 1014 | case rdma_nomsg: |
@@ -997,11 +1016,11 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) | |||
997 | if (headerp->rm_body.rm_chunks[0] != xdr_zero || | 1016 | if (headerp->rm_body.rm_chunks[0] != xdr_zero || |
998 | headerp->rm_body.rm_chunks[1] != xdr_zero || | 1017 | headerp->rm_body.rm_chunks[1] != xdr_zero || |
999 | headerp->rm_body.rm_chunks[2] != xdr_one || | 1018 | headerp->rm_body.rm_chunks[2] != xdr_one || |
1000 | req->rl_nchunks == 0) | 1019 | list_empty(&req->rl_registered)) |
1001 | goto badheader; | 1020 | goto badheader; |
1002 | iptr = (__be32 *)((unsigned char *)headerp + | 1021 | iptr = (__be32 *)((unsigned char *)headerp + |
1003 | RPCRDMA_HDRLEN_MIN); | 1022 | RPCRDMA_HDRLEN_MIN); |
1004 | rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr); | 1023 | rdmalen = rpcrdma_count_chunks(rep, 0, &iptr); |
1005 | if (rdmalen < 0) | 1024 | if (rdmalen < 0) |
1006 | goto badheader; | 1025 | goto badheader; |
1007 | r_xprt->rx_stats.total_rdma_reply += rdmalen; | 1026 | r_xprt->rx_stats.total_rdma_reply += rdmalen; |
@@ -1014,14 +1033,9 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) | |||
1014 | 1033 | ||
1015 | badheader: | 1034 | badheader: |
1016 | default: | 1035 | default: |
1017 | dprintk("%s: invalid rpcrdma reply header (type %d):" | 1036 | dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", |
1018 | " chunks[012] == %d %d %d" | 1037 | rqst->rq_task->tk_pid, __func__, |
1019 | " expected chunks <= %d\n", | 1038 | be32_to_cpu(headerp->rm_type)); |
1020 | __func__, be32_to_cpu(headerp->rm_type), | ||
1021 | headerp->rm_body.rm_chunks[0], | ||
1022 | headerp->rm_body.rm_chunks[1], | ||
1023 | headerp->rm_body.rm_chunks[2], | ||
1024 | req->rl_nchunks); | ||
1025 | status = -EIO; | 1039 | status = -EIO; |
1026 | r_xprt->rx_stats.bad_reply_count++; | 1040 | r_xprt->rx_stats.bad_reply_count++; |
1027 | break; | 1041 | break; |
@@ -1035,7 +1049,7 @@ out: | |||
1035 | * control: waking the next RPC waits until this RPC has | 1049 | * control: waking the next RPC waits until this RPC has |
1036 | * relinquished all its Send Queue entries. | 1050 | * relinquished all its Send Queue entries. |
1037 | */ | 1051 | */ |
1038 | if (req->rl_nchunks) | 1052 | if (!list_empty(&req->rl_registered)) |
1039 | r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req); | 1053 | r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req); |
1040 | 1054 | ||
1041 | spin_lock_bh(&xprt->transport_lock); | 1055 | spin_lock_bh(&xprt->transport_lock); |
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 99d2e5b72726..81f0e879f019 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -558,7 +558,6 @@ out_sendbuf: | |||
558 | 558 | ||
559 | out_fail: | 559 | out_fail: |
560 | rpcrdma_buffer_put(req); | 560 | rpcrdma_buffer_put(req); |
561 | r_xprt->rx_stats.failed_marshal_count++; | ||
562 | return NULL; | 561 | return NULL; |
563 | } | 562 | } |
564 | 563 | ||
@@ -590,8 +589,19 @@ xprt_rdma_free(void *buffer) | |||
590 | rpcrdma_buffer_put(req); | 589 | rpcrdma_buffer_put(req); |
591 | } | 590 | } |
592 | 591 | ||
593 | /* | 592 | /** |
593 | * xprt_rdma_send_request - marshal and send an RPC request | ||
594 | * @task: RPC task with an RPC message in rq_snd_buf | ||
595 | * | ||
596 | * Return values: | ||
597 | * 0: The request has been sent | ||
598 | * ENOTCONN: Caller needs to invoke connect logic then call again | ||
599 | * ENOBUFS: Call again later to send the request | ||
600 | * EIO: A permanent error occurred. The request was not sent, | ||
601 | * and should not be retried | ||
602 | * | ||
594 | * send_request invokes the meat of RPC RDMA. It must do the following: | 603 | * send_request invokes the meat of RPC RDMA. It must do the following: |
604 | * | ||
595 | * 1. Marshal the RPC request into an RPC RDMA request, which means | 605 | * 1. Marshal the RPC request into an RPC RDMA request, which means |
596 | * putting a header in front of data, and creating IOVs for RDMA | 606 | * putting a header in front of data, and creating IOVs for RDMA |
597 | * from those in the request. | 607 | * from those in the request. |
@@ -600,7 +610,6 @@ xprt_rdma_free(void *buffer) | |||
600 | * the request (rpcrdma_ep_post). | 610 | * the request (rpcrdma_ep_post). |
601 | * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP). | 611 | * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP). |
602 | */ | 612 | */ |
603 | |||
604 | static int | 613 | static int |
605 | xprt_rdma_send_request(struct rpc_task *task) | 614 | xprt_rdma_send_request(struct rpc_task *task) |
606 | { | 615 | { |
@@ -610,6 +619,9 @@ xprt_rdma_send_request(struct rpc_task *task) | |||
610 | struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); | 619 | struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); |
611 | int rc = 0; | 620 | int rc = 0; |
612 | 621 | ||
622 | /* On retransmit, remove any previously registered chunks */ | ||
623 | r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); | ||
624 | |||
613 | rc = rpcrdma_marshal_req(rqst); | 625 | rc = rpcrdma_marshal_req(rqst); |
614 | if (rc < 0) | 626 | if (rc < 0) |
615 | goto failed_marshal; | 627 | goto failed_marshal; |
@@ -630,11 +642,12 @@ xprt_rdma_send_request(struct rpc_task *task) | |||
630 | return 0; | 642 | return 0; |
631 | 643 | ||
632 | failed_marshal: | 644 | failed_marshal: |
633 | r_xprt->rx_stats.failed_marshal_count++; | ||
634 | dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n", | 645 | dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n", |
635 | __func__, rc); | 646 | __func__, rc); |
636 | if (rc == -EIO) | 647 | if (rc == -EIO) |
637 | return -EIO; | 648 | r_xprt->rx_stats.failed_marshal_count++; |
649 | if (rc != -ENOTCONN) | ||
650 | return rc; | ||
638 | drop_connection: | 651 | drop_connection: |
639 | xprt_disconnect_done(xprt); | 652 | xprt_disconnect_done(xprt); |
640 | return -ENOTCONN; /* implies disconnect */ | 653 | return -ENOTCONN; /* implies disconnect */ |
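
The return values documented in the new kerneldoc translate into distinct caller behaviours: 0 means the request is on the wire, -ENOTCONN asks for a reconnect before sending again, -ENOBUFS asks for a later retry, and -EIO is terminal. The sketch below is a hypothetical caller-side view of that contract only; the real handling lives in the generic RPC client, not in this transport code:

    #include <errno.h>
    #include <stdio.h>

    /* Map each documented send_request return value to its intended
     * disposition.
     */
    static const char *dispose(int rc)
    {
    	switch (rc) {
    	case 0:
    		return "request sent";
    	case -ENOTCONN:
    		return "reconnect, then send again";
    	case -ENOBUFS:
    		return "back off and retry later";
    	case -EIO:
    		return "permanent error, do not retry";
    	default:
    		return "unexpected return value";
    	}
    }

    int main(void)
    {
    	const int codes[] = { 0, -ENOTCONN, -ENOBUFS, -EIO };
    	unsigned int i;

    	for (i = 0; i < sizeof(codes) / sizeof(codes[0]); i++)
    		printf("%d: %s\n", codes[i], dispose(codes[i]));
    	return 0;
    }
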
@@ -660,7 +673,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) | |||
660 | xprt->stat.bad_xids, | 673 | xprt->stat.bad_xids, |
661 | xprt->stat.req_u, | 674 | xprt->stat.req_u, |
662 | xprt->stat.bklog_u); | 675 | xprt->stat.bklog_u); |
663 | seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n", | 676 | seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu ", |
664 | r_xprt->rx_stats.read_chunk_count, | 677 | r_xprt->rx_stats.read_chunk_count, |
665 | r_xprt->rx_stats.write_chunk_count, | 678 | r_xprt->rx_stats.write_chunk_count, |
666 | r_xprt->rx_stats.reply_chunk_count, | 679 | r_xprt->rx_stats.reply_chunk_count, |
@@ -672,6 +685,10 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) | |||
672 | r_xprt->rx_stats.failed_marshal_count, | 685 | r_xprt->rx_stats.failed_marshal_count, |
673 | r_xprt->rx_stats.bad_reply_count, | 686 | r_xprt->rx_stats.bad_reply_count, |
674 | r_xprt->rx_stats.nomsg_call_count); | 687 | r_xprt->rx_stats.nomsg_call_count); |
688 | seq_printf(seq, "%lu %lu %lu\n", | ||
689 | r_xprt->rx_stats.mrs_recovered, | ||
690 | r_xprt->rx_stats.mrs_orphaned, | ||
691 | r_xprt->rx_stats.mrs_allocated); | ||
675 | } | 692 | } |
676 | 693 | ||
677 | static int | 694 | static int |
@@ -741,7 +758,6 @@ void xprt_rdma_cleanup(void) | |||
741 | __func__, rc); | 758 | __func__, rc); |
742 | 759 | ||
743 | rpcrdma_destroy_wq(); | 760 | rpcrdma_destroy_wq(); |
744 | frwr_destroy_recovery_wq(); | ||
745 | 761 | ||
746 | rc = xprt_unregister_transport(&xprt_rdma_bc); | 762 | rc = xprt_unregister_transport(&xprt_rdma_bc); |
747 | if (rc) | 763 | if (rc) |
@@ -753,20 +769,13 @@ int xprt_rdma_init(void) | |||
753 | { | 769 | { |
754 | int rc; | 770 | int rc; |
755 | 771 | ||
756 | rc = frwr_alloc_recovery_wq(); | ||
757 | if (rc) | ||
758 | return rc; | ||
759 | |||
760 | rc = rpcrdma_alloc_wq(); | 772 | rc = rpcrdma_alloc_wq(); |
761 | if (rc) { | 773 | if (rc) |
762 | frwr_destroy_recovery_wq(); | ||
763 | return rc; | 774 | return rc; |
764 | } | ||
765 | 775 | ||
766 | rc = xprt_register_transport(&xprt_rdma); | 776 | rc = xprt_register_transport(&xprt_rdma); |
767 | if (rc) { | 777 | if (rc) { |
768 | rpcrdma_destroy_wq(); | 778 | rpcrdma_destroy_wq(); |
769 | frwr_destroy_recovery_wq(); | ||
770 | return rc; | 779 | return rc; |
771 | } | 780 | } |
772 | 781 | ||
@@ -774,7 +783,6 @@ int xprt_rdma_init(void) | |||
774 | if (rc) { | 783 | if (rc) { |
775 | xprt_unregister_transport(&xprt_rdma); | 784 | xprt_unregister_transport(&xprt_rdma); |
776 | rpcrdma_destroy_wq(); | 785 | rpcrdma_destroy_wq(); |
777 | frwr_destroy_recovery_wq(); | ||
778 | return rc; | 786 | return rc; |
779 | } | 787 | } |
780 | 788 | ||
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index b044d98a1370..536d0be3f61b 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -379,8 +379,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) | |||
379 | struct rpcrdma_ia *ia = &xprt->rx_ia; | 379 | struct rpcrdma_ia *ia = &xprt->rx_ia; |
380 | int rc; | 380 | int rc; |
381 | 381 | ||
382 | ia->ri_dma_mr = NULL; | ||
383 | |||
384 | ia->ri_id = rpcrdma_create_id(xprt, ia, addr); | 382 | ia->ri_id = rpcrdma_create_id(xprt, ia, addr); |
385 | if (IS_ERR(ia->ri_id)) { | 383 | if (IS_ERR(ia->ri_id)) { |
386 | rc = PTR_ERR(ia->ri_id); | 384 | rc = PTR_ERR(ia->ri_id); |
@@ -391,47 +389,29 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) | |||
391 | ia->ri_pd = ib_alloc_pd(ia->ri_device); | 389 | ia->ri_pd = ib_alloc_pd(ia->ri_device); |
392 | if (IS_ERR(ia->ri_pd)) { | 390 | if (IS_ERR(ia->ri_pd)) { |
393 | rc = PTR_ERR(ia->ri_pd); | 391 | rc = PTR_ERR(ia->ri_pd); |
394 | dprintk("RPC: %s: ib_alloc_pd() failed %i\n", | 392 | pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc); |
395 | __func__, rc); | ||
396 | goto out2; | 393 | goto out2; |
397 | } | 394 | } |
398 | 395 | ||
399 | if (memreg == RPCRDMA_FRMR) { | ||
400 | if (!(ia->ri_device->attrs.device_cap_flags & | ||
401 | IB_DEVICE_MEM_MGT_EXTENSIONS) || | ||
402 | (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) { | ||
403 | dprintk("RPC: %s: FRMR registration " | ||
404 | "not supported by HCA\n", __func__); | ||
405 | memreg = RPCRDMA_MTHCAFMR; | ||
406 | } | ||
407 | } | ||
408 | if (memreg == RPCRDMA_MTHCAFMR) { | ||
409 | if (!ia->ri_device->alloc_fmr) { | ||
410 | dprintk("RPC: %s: MTHCAFMR registration " | ||
411 | "not supported by HCA\n", __func__); | ||
412 | rc = -EINVAL; | ||
413 | goto out3; | ||
414 | } | ||
415 | } | ||
416 | |||
417 | switch (memreg) { | 396 | switch (memreg) { |
418 | case RPCRDMA_FRMR: | 397 | case RPCRDMA_FRMR: |
419 | ia->ri_ops = &rpcrdma_frwr_memreg_ops; | 398 | if (frwr_is_supported(ia)) { |
420 | break; | 399 | ia->ri_ops = &rpcrdma_frwr_memreg_ops; |
421 | case RPCRDMA_ALLPHYSICAL: | 400 | break; |
422 | ia->ri_ops = &rpcrdma_physical_memreg_ops; | 401 | } |
423 | break; | 402 | /*FALLTHROUGH*/ |
424 | case RPCRDMA_MTHCAFMR: | 403 | case RPCRDMA_MTHCAFMR: |
425 | ia->ri_ops = &rpcrdma_fmr_memreg_ops; | 404 | if (fmr_is_supported(ia)) { |
426 | break; | 405 | ia->ri_ops = &rpcrdma_fmr_memreg_ops; |
406 | break; | ||
407 | } | ||
408 | /*FALLTHROUGH*/ | ||
427 | default: | 409 | default: |
428 | printk(KERN_ERR "RPC: Unsupported memory " | 410 | pr_err("rpcrdma: Unsupported memory registration mode: %d\n", |
429 | "registration mode: %d\n", memreg); | 411 | memreg); |
430 | rc = -ENOMEM; | 412 | rc = -EINVAL; |
431 | goto out3; | 413 | goto out3; |
432 | } | 414 | } |
433 | dprintk("RPC: %s: memory registration strategy is '%s'\n", | ||
434 | __func__, ia->ri_ops->ro_displayname); | ||
435 | 415 | ||
436 | return 0; | 416 | return 0; |
437 | 417 | ||
@@ -585,8 +565,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, | |||
585 | out2: | 565 | out2: |
586 | ib_free_cq(sendcq); | 566 | ib_free_cq(sendcq); |
587 | out1: | 567 | out1: |
588 | if (ia->ri_dma_mr) | ||
589 | ib_dereg_mr(ia->ri_dma_mr); | ||
590 | return rc; | 568 | return rc; |
591 | } | 569 | } |
592 | 570 | ||
@@ -600,8 +578,6 @@ out1: | |||
600 | void | 578 | void |
601 | rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) | 579 | rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) |
602 | { | 580 | { |
603 | int rc; | ||
604 | |||
605 | dprintk("RPC: %s: entering, connected is %d\n", | 581 | dprintk("RPC: %s: entering, connected is %d\n", |
606 | __func__, ep->rep_connected); | 582 | __func__, ep->rep_connected); |
607 | 583 | ||
@@ -615,12 +591,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) | |||
615 | 591 | ||
616 | ib_free_cq(ep->rep_attr.recv_cq); | 592 | ib_free_cq(ep->rep_attr.recv_cq); |
617 | ib_free_cq(ep->rep_attr.send_cq); | 593 | ib_free_cq(ep->rep_attr.send_cq); |
618 | |||
619 | if (ia->ri_dma_mr) { | ||
620 | rc = ib_dereg_mr(ia->ri_dma_mr); | ||
621 | dprintk("RPC: %s: ib_dereg_mr returned %i\n", | ||
622 | __func__, rc); | ||
623 | } | ||
624 | } | 594 | } |
625 | 595 | ||
626 | /* | 596 | /* |
@@ -777,6 +747,90 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) | |||
777 | ib_drain_qp(ia->ri_id->qp); | 747 | ib_drain_qp(ia->ri_id->qp); |
778 | } | 748 | } |
779 | 749 | ||
750 | static void | ||
751 | rpcrdma_mr_recovery_worker(struct work_struct *work) | ||
752 | { | ||
753 | struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, | ||
754 | rb_recovery_worker.work); | ||
755 | struct rpcrdma_mw *mw; | ||
756 | |||
757 | spin_lock(&buf->rb_recovery_lock); | ||
758 | while (!list_empty(&buf->rb_stale_mrs)) { | ||
759 | mw = list_first_entry(&buf->rb_stale_mrs, | ||
760 | struct rpcrdma_mw, mw_list); | ||
761 | list_del_init(&mw->mw_list); | ||
762 | spin_unlock(&buf->rb_recovery_lock); | ||
763 | |||
764 | dprintk("RPC: %s: recovering MR %p\n", __func__, mw); | ||
765 | mw->mw_xprt->rx_ia.ri_ops->ro_recover_mr(mw); | ||
766 | |||
767 | spin_lock(&buf->rb_recovery_lock); | ||
768 | } | ||
769 | spin_unlock(&buf->rb_recovery_lock); | ||
770 | } | ||
771 | |||
772 | void | ||
773 | rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw) | ||
774 | { | ||
775 | struct rpcrdma_xprt *r_xprt = mw->mw_xprt; | ||
776 | struct rpcrdma_buffer *buf = &r_xprt->rx_buf; | ||
777 | |||
778 | spin_lock(&buf->rb_recovery_lock); | ||
779 | list_add(&mw->mw_list, &buf->rb_stale_mrs); | ||
780 | spin_unlock(&buf->rb_recovery_lock); | ||
781 | |||
782 | schedule_delayed_work(&buf->rb_recovery_worker, 0); | ||
783 | } | ||
784 | |||
785 | static void | ||
786 | rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt) | ||
787 | { | ||
788 | struct rpcrdma_buffer *buf = &r_xprt->rx_buf; | ||
789 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; | ||
790 | unsigned int count; | ||
791 | LIST_HEAD(free); | ||
792 | LIST_HEAD(all); | ||
793 | |||
794 | for (count = 0; count < 32; count++) { | ||
795 | struct rpcrdma_mw *mw; | ||
796 | int rc; | ||
797 | |||
798 | mw = kzalloc(sizeof(*mw), GFP_KERNEL); | ||
799 | if (!mw) | ||
800 | break; | ||
801 | |||
802 | rc = ia->ri_ops->ro_init_mr(ia, mw); | ||
803 | if (rc) { | ||
804 | kfree(mw); | ||
805 | break; | ||
806 | } | ||
807 | |||
808 | mw->mw_xprt = r_xprt; | ||
809 | |||
810 | list_add(&mw->mw_list, &free); | ||
811 | list_add(&mw->mw_all, &all); | ||
812 | } | ||
813 | |||
814 | spin_lock(&buf->rb_mwlock); | ||
815 | list_splice(&free, &buf->rb_mws); | ||
816 | list_splice(&all, &buf->rb_all); | ||
817 | r_xprt->rx_stats.mrs_allocated += count; | ||
818 | spin_unlock(&buf->rb_mwlock); | ||
819 | |||
820 | dprintk("RPC: %s: created %u MRs\n", __func__, count); | ||
821 | } | ||
822 | |||
823 | static void | ||
824 | rpcrdma_mr_refresh_worker(struct work_struct *work) | ||
825 | { | ||
826 | struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, | ||
827 | rb_refresh_worker.work); | ||
828 | struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, | ||
829 | rx_buf); | ||
830 | |||
831 | rpcrdma_create_mrs(r_xprt); | ||
832 | } | ||
833 | |||
780 | struct rpcrdma_req * | 834 | struct rpcrdma_req * |
781 | rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) | 835 | rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) |
782 | { | 836 | { |
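The new recovery path above parks flushed MRs on rb_stale_mrs under rb_recovery_lock and kicks a delayed work item that drains the list, dropping the lock around each ro_recover_mr() call. A small pthread-based sketch of the same defer-and-drain shape, under the assumption that a mutex/condvar pair stands in for the spinlock plus workqueue; struct item, defer_recovery() and recovery_worker() are invented names.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct item { struct item *next; int id; };

static pthread_mutex_t stale_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  stale_cv   = PTHREAD_COND_INITIALIZER;
static struct item *stale_list;
static int done;

/* Producer side: park the item and wake the worker (like list_add() +
 * schedule_delayed_work() in the hunk above). */
static void defer_recovery(struct item *it)
{
    pthread_mutex_lock(&stale_lock);
    it->next = stale_list;
    stale_list = it;
    pthread_cond_signal(&stale_cv);
    pthread_mutex_unlock(&stale_lock);
}

/* Worker side: drain the list, dropping the lock for the slow per-item work. */
static void *recovery_worker(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&stale_lock);
    while (!done || stale_list) {
        while (stale_list) {
            struct item *it = stale_list;
            stale_list = it->next;
            pthread_mutex_unlock(&stale_lock);
            printf("recovering item %d\n", it->id);
            free(it);
            pthread_mutex_lock(&stale_lock);
        }
        if (!done)
            pthread_cond_wait(&stale_cv, &stale_lock);
    }
    pthread_mutex_unlock(&stale_lock);
    return NULL;
}

int main(void)
{
    pthread_t t;
    pthread_create(&t, NULL, recovery_worker, NULL);
    for (int i = 0; i < 3; i++) {
        struct item *it = malloc(sizeof(*it));
        it->id = i;
        defer_recovery(it);
    }
    pthread_mutex_lock(&stale_lock);
    done = 1;
    pthread_cond_signal(&stale_cv);
    pthread_mutex_unlock(&stale_lock);
    pthread_join(t, NULL);
    return 0;
}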
@@ -793,6 +847,7 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) | |||
793 | spin_unlock(&buffer->rb_reqslock); | 847 | spin_unlock(&buffer->rb_reqslock); |
794 | req->rl_cqe.done = rpcrdma_wc_send; | 848 | req->rl_cqe.done = rpcrdma_wc_send; |
795 | req->rl_buffer = &r_xprt->rx_buf; | 849 | req->rl_buffer = &r_xprt->rx_buf; |
850 | INIT_LIST_HEAD(&req->rl_registered); | ||
796 | return req; | 851 | return req; |
797 | } | 852 | } |
798 | 853 | ||
@@ -832,17 +887,23 @@ int | |||
832 | rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) | 887 | rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) |
833 | { | 888 | { |
834 | struct rpcrdma_buffer *buf = &r_xprt->rx_buf; | 889 | struct rpcrdma_buffer *buf = &r_xprt->rx_buf; |
835 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; | ||
836 | int i, rc; | 890 | int i, rc; |
837 | 891 | ||
838 | buf->rb_max_requests = r_xprt->rx_data.max_requests; | 892 | buf->rb_max_requests = r_xprt->rx_data.max_requests; |
839 | buf->rb_bc_srv_max_requests = 0; | 893 | buf->rb_bc_srv_max_requests = 0; |
840 | spin_lock_init(&buf->rb_lock); | ||
841 | atomic_set(&buf->rb_credits, 1); | 894 | atomic_set(&buf->rb_credits, 1); |
895 | spin_lock_init(&buf->rb_mwlock); | ||
896 | spin_lock_init(&buf->rb_lock); | ||
897 | spin_lock_init(&buf->rb_recovery_lock); | ||
898 | INIT_LIST_HEAD(&buf->rb_mws); | ||
899 | INIT_LIST_HEAD(&buf->rb_all); | ||
900 | INIT_LIST_HEAD(&buf->rb_stale_mrs); | ||
901 | INIT_DELAYED_WORK(&buf->rb_refresh_worker, | ||
902 | rpcrdma_mr_refresh_worker); | ||
903 | INIT_DELAYED_WORK(&buf->rb_recovery_worker, | ||
904 | rpcrdma_mr_recovery_worker); | ||
842 | 905 | ||
843 | rc = ia->ri_ops->ro_init(r_xprt); | 906 | rpcrdma_create_mrs(r_xprt); |
844 | if (rc) | ||
845 | goto out; | ||
846 | 907 | ||
847 | INIT_LIST_HEAD(&buf->rb_send_bufs); | 908 | INIT_LIST_HEAD(&buf->rb_send_bufs); |
848 | INIT_LIST_HEAD(&buf->rb_allreqs); | 909 | INIT_LIST_HEAD(&buf->rb_allreqs); |
@@ -862,7 +923,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) | |||
862 | } | 923 | } |
863 | 924 | ||
864 | INIT_LIST_HEAD(&buf->rb_recv_bufs); | 925 | INIT_LIST_HEAD(&buf->rb_recv_bufs); |
865 | for (i = 0; i < buf->rb_max_requests + 2; i++) { | 926 | for (i = 0; i < buf->rb_max_requests; i++) { |
866 | struct rpcrdma_rep *rep; | 927 | struct rpcrdma_rep *rep; |
867 | 928 | ||
868 | rep = rpcrdma_create_rep(r_xprt); | 929 | rep = rpcrdma_create_rep(r_xprt); |
@@ -918,11 +979,39 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req) | |||
918 | kfree(req); | 979 | kfree(req); |
919 | } | 980 | } |
920 | 981 | ||
982 | static void | ||
983 | rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf) | ||
984 | { | ||
985 | struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, | ||
986 | rx_buf); | ||
987 | struct rpcrdma_ia *ia = rdmab_to_ia(buf); | ||
988 | struct rpcrdma_mw *mw; | ||
989 | unsigned int count; | ||
990 | |||
991 | count = 0; | ||
992 | spin_lock(&buf->rb_mwlock); | ||
993 | while (!list_empty(&buf->rb_all)) { | ||
994 | mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); | ||
995 | list_del(&mw->mw_all); | ||
996 | |||
997 | spin_unlock(&buf->rb_mwlock); | ||
998 | ia->ri_ops->ro_release_mr(mw); | ||
999 | count++; | ||
1000 | spin_lock(&buf->rb_mwlock); | ||
1001 | } | ||
1002 | spin_unlock(&buf->rb_mwlock); | ||
1003 | r_xprt->rx_stats.mrs_allocated = 0; | ||
1004 | |||
1005 | dprintk("RPC: %s: released %u MRs\n", __func__, count); | ||
1006 | } | ||
1007 | |||
921 | void | 1008 | void |
922 | rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) | 1009 | rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) |
923 | { | 1010 | { |
924 | struct rpcrdma_ia *ia = rdmab_to_ia(buf); | 1011 | struct rpcrdma_ia *ia = rdmab_to_ia(buf); |
925 | 1012 | ||
1013 | cancel_delayed_work_sync(&buf->rb_recovery_worker); | ||
1014 | |||
926 | while (!list_empty(&buf->rb_recv_bufs)) { | 1015 | while (!list_empty(&buf->rb_recv_bufs)) { |
927 | struct rpcrdma_rep *rep; | 1016 | struct rpcrdma_rep *rep; |
928 | 1017 | ||
@@ -944,7 +1033,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) | |||
944 | } | 1033 | } |
945 | spin_unlock(&buf->rb_reqslock); | 1034 | spin_unlock(&buf->rb_reqslock); |
946 | 1035 | ||
947 | ia->ri_ops->ro_destroy(buf); | 1036 | rpcrdma_destroy_mrs(buf); |
948 | } | 1037 | } |
949 | 1038 | ||
950 | struct rpcrdma_mw * | 1039 | struct rpcrdma_mw * |
@@ -962,8 +1051,17 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt) | |||
962 | spin_unlock(&buf->rb_mwlock); | 1051 | spin_unlock(&buf->rb_mwlock); |
963 | 1052 | ||
964 | if (!mw) | 1053 | if (!mw) |
965 | pr_err("RPC: %s: no MWs available\n", __func__); | 1054 | goto out_nomws; |
966 | return mw; | 1055 | return mw; |
1056 | |||
1057 | out_nomws: | ||
1058 | dprintk("RPC: %s: no MWs available\n", __func__); | ||
1059 | schedule_delayed_work(&buf->rb_refresh_worker, 0); | ||
1060 | |||
1061 | /* Allow the reply handler and refresh worker to run */ | ||
1062 | cond_resched(); | ||
1063 | |||
1064 | return NULL; | ||
967 | } | 1065 | } |
968 | 1066 | ||
969 | void | 1067 | void |
@@ -978,8 +1076,6 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) | |||
978 | 1076 | ||
979 | /* | 1077 | /* |
980 | * Get a set of request/reply buffers. | 1078 | * Get a set of request/reply buffers. |
981 | * | ||
982 | * Reply buffer (if available) is attached to send buffer upon return. | ||
983 | */ | 1079 | */ |
984 | struct rpcrdma_req * | 1080 | struct rpcrdma_req * |
985 | rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) | 1081 | rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) |
@@ -998,13 +1094,13 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) | |||
998 | 1094 | ||
999 | out_reqbuf: | 1095 | out_reqbuf: |
1000 | spin_unlock(&buffers->rb_lock); | 1096 | spin_unlock(&buffers->rb_lock); |
1001 | pr_warn("RPC: %s: out of request buffers\n", __func__); | 1097 | pr_warn("rpcrdma: out of request buffers (%p)\n", buffers); |
1002 | return NULL; | 1098 | return NULL; |
1003 | out_repbuf: | 1099 | out_repbuf: |
1100 | list_add(&req->rl_free, &buffers->rb_send_bufs); | ||
1004 | spin_unlock(&buffers->rb_lock); | 1101 | spin_unlock(&buffers->rb_lock); |
1005 | pr_warn("RPC: %s: out of reply buffers\n", __func__); | 1102 | pr_warn("rpcrdma: out of reply buffers (%p)\n", buffers); |
1006 | req->rl_reply = NULL; | 1103 | return NULL; |
1007 | return req; | ||
1008 | } | 1104 | } |
1009 | 1105 | ||
1010 | /* | 1106 | /* |
@@ -1060,14 +1156,6 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) | |||
1060 | * Wrappers for internal-use kmalloc memory registration, used by buffer code. | 1156 | * Wrappers for internal-use kmalloc memory registration, used by buffer code. |
1061 | */ | 1157 | */ |
1062 | 1158 | ||
1063 | void | ||
1064 | rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg) | ||
1065 | { | ||
1066 | dprintk("RPC: map_one: offset %p iova %llx len %zu\n", | ||
1067 | seg->mr_offset, | ||
1068 | (unsigned long long)seg->mr_dma, seg->mr_dmalen); | ||
1069 | } | ||
1070 | |||
1071 | /** | 1159 | /** |
1072 | * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers | 1160 | * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers |
1073 | * @ia: controlling rpcrdma_ia | 1161 | * @ia: controlling rpcrdma_ia |
@@ -1150,7 +1238,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, | |||
1150 | if (rep) { | 1238 | if (rep) { |
1151 | rc = rpcrdma_ep_post_recv(ia, ep, rep); | 1239 | rc = rpcrdma_ep_post_recv(ia, ep, rep); |
1152 | if (rc) | 1240 | if (rc) |
1153 | goto out; | 1241 | return rc; |
1154 | req->rl_reply = NULL; | 1242 | req->rl_reply = NULL; |
1155 | } | 1243 | } |
1156 | 1244 | ||
@@ -1175,10 +1263,12 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, | |||
1175 | 1263 | ||
1176 | rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); | 1264 | rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); |
1177 | if (rc) | 1265 | if (rc) |
1178 | dprintk("RPC: %s: ib_post_send returned %i\n", __func__, | 1266 | goto out_postsend_err; |
1179 | rc); | 1267 | return 0; |
1180 | out: | 1268 | |
1181 | return rc; | 1269 | out_postsend_err: |
1270 | pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc); | ||
1271 | return -ENOTCONN; | ||
1182 | } | 1272 | } |
1183 | 1273 | ||
1184 | /* | 1274 | /* |
@@ -1203,11 +1293,13 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, | |||
1203 | DMA_BIDIRECTIONAL); | 1293 | DMA_BIDIRECTIONAL); |
1204 | 1294 | ||
1205 | rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); | 1295 | rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); |
1206 | |||
1207 | if (rc) | 1296 | if (rc) |
1208 | dprintk("RPC: %s: ib_post_recv returned %i\n", __func__, | 1297 | goto out_postrecv; |
1209 | rc); | 1298 | return 0; |
1210 | return rc; | 1299 | |
1300 | out_postrecv: | ||
1301 | pr_err("rpcrdma: ib_post_recv returned %i\n", rc); | ||
1302 | return -ENOTCONN; | ||
1211 | } | 1303 | } |
1212 | 1304 | ||
1213 | /** | 1305 | /** |
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 95cdc66225ee..670fad57153a 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -68,7 +68,6 @@ struct rpcrdma_ia { | |||
68 | struct ib_device *ri_device; | 68 | struct ib_device *ri_device; |
69 | struct rdma_cm_id *ri_id; | 69 | struct rdma_cm_id *ri_id; |
70 | struct ib_pd *ri_pd; | 70 | struct ib_pd *ri_pd; |
71 | struct ib_mr *ri_dma_mr; | ||
72 | struct completion ri_done; | 71 | struct completion ri_done; |
73 | int ri_async_rc; | 72 | int ri_async_rc; |
74 | unsigned int ri_max_frmr_depth; | 73 | unsigned int ri_max_frmr_depth; |
@@ -172,23 +171,14 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) | |||
172 | * o recv buffer (posted to provider) | 171 | * o recv buffer (posted to provider) |
173 | * o ib_sge (also donated to provider) | 172 | * o ib_sge (also donated to provider) |
174 | * o status of reply (length, success or not) | 173 | * o status of reply (length, success or not) |
175 | * o bookkeeping state to get run by tasklet (list, etc) | 174 | * o bookkeeping state to get run by reply handler (list, etc) |
176 | * | 175 | * |
177 | * These are allocated during initialization, per-transport instance; | 176 | * These are allocated during initialization, per-transport instance. |
178 | * however, the tasklet execution list itself is global, as it should | ||
179 | * always be pretty short. | ||
180 | * | 177 | * |
181 | * N of these are associated with a transport instance, and stored in | 178 | * N of these are associated with a transport instance, and stored in |
182 | * struct rpcrdma_buffer. N is the max number of outstanding requests. | 179 | * struct rpcrdma_buffer. N is the max number of outstanding requests. |
183 | */ | 180 | */ |
184 | 181 | ||
185 | #define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE) | ||
186 | |||
187 | /* data segments + head/tail for Call + head/tail for Reply */ | ||
188 | #define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 4) | ||
189 | |||
190 | struct rpcrdma_buffer; | ||
191 | |||
192 | struct rpcrdma_rep { | 182 | struct rpcrdma_rep { |
193 | struct ib_cqe rr_cqe; | 183 | struct ib_cqe rr_cqe; |
194 | unsigned int rr_len; | 184 | unsigned int rr_len; |
@@ -221,9 +211,6 @@ enum rpcrdma_frmr_state { | |||
221 | }; | 211 | }; |
222 | 212 | ||
223 | struct rpcrdma_frmr { | 213 | struct rpcrdma_frmr { |
224 | struct scatterlist *fr_sg; | ||
225 | int fr_nents; | ||
226 | enum dma_data_direction fr_dir; | ||
227 | struct ib_mr *fr_mr; | 214 | struct ib_mr *fr_mr; |
228 | struct ib_cqe fr_cqe; | 215 | struct ib_cqe fr_cqe; |
229 | enum rpcrdma_frmr_state fr_state; | 216 | enum rpcrdma_frmr_state fr_state; |
@@ -235,18 +222,23 @@ struct rpcrdma_frmr { | |||
235 | }; | 222 | }; |
236 | 223 | ||
237 | struct rpcrdma_fmr { | 224 | struct rpcrdma_fmr { |
238 | struct ib_fmr *fmr; | 225 | struct ib_fmr *fm_mr; |
239 | u64 *physaddrs; | 226 | u64 *fm_physaddrs; |
240 | }; | 227 | }; |
241 | 228 | ||
242 | struct rpcrdma_mw { | 229 | struct rpcrdma_mw { |
230 | struct list_head mw_list; | ||
231 | struct scatterlist *mw_sg; | ||
232 | int mw_nents; | ||
233 | enum dma_data_direction mw_dir; | ||
243 | union { | 234 | union { |
244 | struct rpcrdma_fmr fmr; | 235 | struct rpcrdma_fmr fmr; |
245 | struct rpcrdma_frmr frmr; | 236 | struct rpcrdma_frmr frmr; |
246 | }; | 237 | }; |
247 | struct work_struct mw_work; | ||
248 | struct rpcrdma_xprt *mw_xprt; | 238 | struct rpcrdma_xprt *mw_xprt; |
249 | struct list_head mw_list; | 239 | u32 mw_handle; |
240 | u32 mw_length; | ||
241 | u64 mw_offset; | ||
250 | struct list_head mw_all; | 242 | struct list_head mw_all; |
251 | }; | 243 | }; |
252 | 244 | ||
@@ -266,33 +258,30 @@ struct rpcrdma_mw { | |||
266 | * of iovs for send operations. The reason is that the iovs passed to | 258 | * of iovs for send operations. The reason is that the iovs passed to |
267 | * ib_post_{send,recv} must not be modified until the work request | 259 | * ib_post_{send,recv} must not be modified until the work request |
268 | * completes. | 260 | * completes. |
269 | * | ||
270 | * NOTES: | ||
271 | * o RPCRDMA_MAX_SEGS is the max number of addressible chunk elements we | ||
272 | * marshal. The number needed varies depending on the iov lists that | ||
273 | * are passed to us, the memory registration mode we are in, and if | ||
274 | * physical addressing is used, the layout. | ||
275 | */ | 261 | */ |
276 | 262 | ||
263 | /* Maximum number of page-sized "segments" per chunk list to be | ||
264 | * registered or invalidated. Must handle a Reply chunk: | ||
265 | */ | ||
266 | enum { | ||
267 | RPCRDMA_MAX_IOV_SEGS = 3, | ||
268 | RPCRDMA_MAX_DATA_SEGS = ((1 * 1024 * 1024) / PAGE_SIZE) + 1, | ||
269 | RPCRDMA_MAX_SEGS = RPCRDMA_MAX_DATA_SEGS + | ||
270 | RPCRDMA_MAX_IOV_SEGS, | ||
271 | }; | ||
272 | |||
277 | struct rpcrdma_mr_seg { /* chunk descriptors */ | 273 | struct rpcrdma_mr_seg { /* chunk descriptors */ |
278 | struct rpcrdma_mw *rl_mw; /* registered MR */ | ||
279 | u64 mr_base; /* registration result */ | ||
280 | u32 mr_rkey; /* registration result */ | ||
281 | u32 mr_len; /* length of chunk or segment */ | 274 | u32 mr_len; /* length of chunk or segment */ |
282 | int mr_nsegs; /* number of segments in chunk or 0 */ | ||
283 | enum dma_data_direction mr_dir; /* segment mapping direction */ | ||
284 | dma_addr_t mr_dma; /* segment mapping address */ | ||
285 | size_t mr_dmalen; /* segment mapping length */ | ||
286 | struct page *mr_page; /* owning page, if any */ | 275 | struct page *mr_page; /* owning page, if any */ |
287 | char *mr_offset; /* kva if no page, else offset */ | 276 | char *mr_offset; /* kva if no page, else offset */ |
288 | }; | 277 | }; |
289 | 278 | ||
290 | #define RPCRDMA_MAX_IOVS (2) | 279 | #define RPCRDMA_MAX_IOVS (2) |
291 | 280 | ||
281 | struct rpcrdma_buffer; | ||
292 | struct rpcrdma_req { | 282 | struct rpcrdma_req { |
293 | struct list_head rl_free; | 283 | struct list_head rl_free; |
294 | unsigned int rl_niovs; | 284 | unsigned int rl_niovs; |
295 | unsigned int rl_nchunks; | ||
296 | unsigned int rl_connect_cookie; | 285 | unsigned int rl_connect_cookie; |
297 | struct rpc_task *rl_task; | 286 | struct rpc_task *rl_task; |
298 | struct rpcrdma_buffer *rl_buffer; | 287 | struct rpcrdma_buffer *rl_buffer; |
@@ -300,12 +289,13 @@ struct rpcrdma_req { | |||
300 | struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; | 289 | struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; |
301 | struct rpcrdma_regbuf *rl_rdmabuf; | 290 | struct rpcrdma_regbuf *rl_rdmabuf; |
302 | struct rpcrdma_regbuf *rl_sendbuf; | 291 | struct rpcrdma_regbuf *rl_sendbuf; |
303 | struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; | ||
304 | struct rpcrdma_mr_seg *rl_nextseg; | ||
305 | 292 | ||
306 | struct ib_cqe rl_cqe; | 293 | struct ib_cqe rl_cqe; |
307 | struct list_head rl_all; | 294 | struct list_head rl_all; |
308 | bool rl_backchannel; | 295 | bool rl_backchannel; |
296 | |||
297 | struct list_head rl_registered; /* registered segments */ | ||
298 | struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; | ||
309 | }; | 299 | }; |
310 | 300 | ||
311 | static inline struct rpcrdma_req * | 301 | static inline struct rpcrdma_req * |
@@ -341,6 +331,11 @@ struct rpcrdma_buffer { | |||
341 | struct list_head rb_allreqs; | 331 | struct list_head rb_allreqs; |
342 | 332 | ||
343 | u32 rb_bc_max_requests; | 333 | u32 rb_bc_max_requests; |
334 | |||
335 | spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */ | ||
336 | struct list_head rb_stale_mrs; | ||
337 | struct delayed_work rb_recovery_worker; | ||
338 | struct delayed_work rb_refresh_worker; | ||
344 | }; | 339 | }; |
345 | #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) | 340 | #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) |
346 | 341 | ||
@@ -387,6 +382,9 @@ struct rpcrdma_stats { | |||
387 | unsigned long bad_reply_count; | 382 | unsigned long bad_reply_count; |
388 | unsigned long nomsg_call_count; | 383 | unsigned long nomsg_call_count; |
389 | unsigned long bcall_count; | 384 | unsigned long bcall_count; |
385 | unsigned long mrs_recovered; | ||
386 | unsigned long mrs_orphaned; | ||
387 | unsigned long mrs_allocated; | ||
390 | }; | 388 | }; |
391 | 389 | ||
392 | /* | 390 | /* |
@@ -395,23 +393,25 @@ struct rpcrdma_stats { | |||
395 | struct rpcrdma_xprt; | 393 | struct rpcrdma_xprt; |
396 | struct rpcrdma_memreg_ops { | 394 | struct rpcrdma_memreg_ops { |
397 | int (*ro_map)(struct rpcrdma_xprt *, | 395 | int (*ro_map)(struct rpcrdma_xprt *, |
398 | struct rpcrdma_mr_seg *, int, bool); | 396 | struct rpcrdma_mr_seg *, int, bool, |
397 | struct rpcrdma_mw **); | ||
399 | void (*ro_unmap_sync)(struct rpcrdma_xprt *, | 398 | void (*ro_unmap_sync)(struct rpcrdma_xprt *, |
400 | struct rpcrdma_req *); | 399 | struct rpcrdma_req *); |
401 | void (*ro_unmap_safe)(struct rpcrdma_xprt *, | 400 | void (*ro_unmap_safe)(struct rpcrdma_xprt *, |
402 | struct rpcrdma_req *, bool); | 401 | struct rpcrdma_req *, bool); |
402 | void (*ro_recover_mr)(struct rpcrdma_mw *); | ||
403 | int (*ro_open)(struct rpcrdma_ia *, | 403 | int (*ro_open)(struct rpcrdma_ia *, |
404 | struct rpcrdma_ep *, | 404 | struct rpcrdma_ep *, |
405 | struct rpcrdma_create_data_internal *); | 405 | struct rpcrdma_create_data_internal *); |
406 | size_t (*ro_maxpages)(struct rpcrdma_xprt *); | 406 | size_t (*ro_maxpages)(struct rpcrdma_xprt *); |
407 | int (*ro_init)(struct rpcrdma_xprt *); | 407 | int (*ro_init_mr)(struct rpcrdma_ia *, |
408 | void (*ro_destroy)(struct rpcrdma_buffer *); | 408 | struct rpcrdma_mw *); |
409 | void (*ro_release_mr)(struct rpcrdma_mw *); | ||
409 | const char *ro_displayname; | 410 | const char *ro_displayname; |
410 | }; | 411 | }; |
411 | 412 | ||
412 | extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; | 413 | extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; |
413 | extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; | 414 | extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; |
414 | extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops; | ||
415 | 415 | ||
416 | /* | 416 | /* |
417 | * RPCRDMA transport -- encapsulates the structures above for | 417 | * RPCRDMA transport -- encapsulates the structures above for |
@@ -446,6 +446,8 @@ extern int xprt_rdma_pad_optimize; | |||
446 | */ | 446 | */ |
447 | int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int); | 447 | int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int); |
448 | void rpcrdma_ia_close(struct rpcrdma_ia *); | 448 | void rpcrdma_ia_close(struct rpcrdma_ia *); |
449 | bool frwr_is_supported(struct rpcrdma_ia *); | ||
450 | bool fmr_is_supported(struct rpcrdma_ia *); | ||
449 | 451 | ||
450 | /* | 452 | /* |
451 | * Endpoint calls - xprtrdma/verbs.c | 453 | * Endpoint calls - xprtrdma/verbs.c |
@@ -477,6 +479,8 @@ void rpcrdma_buffer_put(struct rpcrdma_req *); | |||
477 | void rpcrdma_recv_buffer_get(struct rpcrdma_req *); | 479 | void rpcrdma_recv_buffer_get(struct rpcrdma_req *); |
478 | void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); | 480 | void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); |
479 | 481 | ||
482 | void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *); | ||
483 | |||
480 | struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, | 484 | struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, |
481 | size_t, gfp_t); | 485 | size_t, gfp_t); |
482 | void rpcrdma_free_regbuf(struct rpcrdma_ia *, | 486 | void rpcrdma_free_regbuf(struct rpcrdma_ia *, |
@@ -484,9 +488,6 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *, | |||
484 | 488 | ||
485 | int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int); | 489 | int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int); |
486 | 490 | ||
487 | int frwr_alloc_recovery_wq(void); | ||
488 | void frwr_destroy_recovery_wq(void); | ||
489 | |||
490 | int rpcrdma_alloc_wq(void); | 491 | int rpcrdma_alloc_wq(void); |
491 | void rpcrdma_destroy_wq(void); | 492 | void rpcrdma_destroy_wq(void); |
492 | 493 | ||
@@ -494,45 +495,12 @@ void rpcrdma_destroy_wq(void); | |||
494 | * Wrappers for chunk registration, shared by read/write chunk code. | 495 | * Wrappers for chunk registration, shared by read/write chunk code. |
495 | */ | 496 | */ |
496 | 497 | ||
497 | void rpcrdma_mapping_error(struct rpcrdma_mr_seg *); | ||
498 | |||
499 | static inline enum dma_data_direction | 498 | static inline enum dma_data_direction |
500 | rpcrdma_data_dir(bool writing) | 499 | rpcrdma_data_dir(bool writing) |
501 | { | 500 | { |
502 | return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; | 501 | return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; |
503 | } | 502 | } |
504 | 503 | ||
505 | static inline void | ||
506 | rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg, | ||
507 | enum dma_data_direction direction) | ||
508 | { | ||
509 | seg->mr_dir = direction; | ||
510 | seg->mr_dmalen = seg->mr_len; | ||
511 | |||
512 | if (seg->mr_page) | ||
513 | seg->mr_dma = ib_dma_map_page(device, | ||
514 | seg->mr_page, offset_in_page(seg->mr_offset), | ||
515 | seg->mr_dmalen, seg->mr_dir); | ||
516 | else | ||
517 | seg->mr_dma = ib_dma_map_single(device, | ||
518 | seg->mr_offset, | ||
519 | seg->mr_dmalen, seg->mr_dir); | ||
520 | |||
521 | if (ib_dma_mapping_error(device, seg->mr_dma)) | ||
522 | rpcrdma_mapping_error(seg); | ||
523 | } | ||
524 | |||
525 | static inline void | ||
526 | rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg) | ||
527 | { | ||
528 | if (seg->mr_page) | ||
529 | ib_dma_unmap_page(device, | ||
530 | seg->mr_dma, seg->mr_dmalen, seg->mr_dir); | ||
531 | else | ||
532 | ib_dma_unmap_single(device, | ||
533 | seg->mr_dma, seg->mr_dmalen, seg->mr_dir); | ||
534 | } | ||
535 | |||
536 | /* | 504 | /* |
537 | * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c | 505 | * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c |
538 | */ | 506 | */ |
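The ops table in this header now carries per-MR hooks (ro_init_mr/ro_release_mr) instead of the old per-transport ro_init/ro_destroy, with each registration mode supplying its own implementation. A compact sketch of that function-pointer-table shape; the types and the dummy backend allocation below are placeholders standing in for the real verbs calls.

#include <stdio.h>
#include <stdlib.h>

struct mw { void *backend; };

struct memreg_ops {
    int  (*init_mr)(struct mw *mw);
    void (*release_mr)(struct mw *mw);
    const char *displayname;
};

static int dummy_init_mr(struct mw *mw)
{
    mw->backend = malloc(64);        /* stands in for allocating a real MR */
    return mw->backend ? 0 : -1;
}

static void dummy_release_mr(struct mw *mw)
{
    free(mw->backend);
}

static const struct memreg_ops dummy_ops = {
    .init_mr     = dummy_init_mr,
    .release_mr  = dummy_release_mr,
    .displayname = "dummy",
};

int main(void)
{
    struct mw mw = { 0 };
    const struct memreg_ops *ops = &dummy_ops;

    if (ops->init_mr(&mw))
        return 1;
    printf("registration strategy: %s\n", ops->displayname);
    ops->release_mr(&mw);
    return 0;
}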
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 7e2b2fa189c3..111767ab124a 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -124,7 +124,7 @@ static struct ctl_table xs_tunables_table[] = { | |||
124 | .mode = 0644, | 124 | .mode = 0644, |
125 | .proc_handler = proc_dointvec_minmax, | 125 | .proc_handler = proc_dointvec_minmax, |
126 | .extra1 = &xprt_min_resvport_limit, | 126 | .extra1 = &xprt_min_resvport_limit, |
127 | .extra2 = &xprt_max_resvport_limit | 127 | .extra2 = &xprt_max_resvport |
128 | }, | 128 | }, |
129 | { | 129 | { |
130 | .procname = "max_resvport", | 130 | .procname = "max_resvport", |
@@ -132,7 +132,7 @@ static struct ctl_table xs_tunables_table[] = { | |||
132 | .maxlen = sizeof(unsigned int), | 132 | .maxlen = sizeof(unsigned int), |
133 | .mode = 0644, | 133 | .mode = 0644, |
134 | .proc_handler = proc_dointvec_minmax, | 134 | .proc_handler = proc_dointvec_minmax, |
135 | .extra1 = &xprt_min_resvport_limit, | 135 | .extra1 = &xprt_min_resvport, |
136 | .extra2 = &xprt_max_resvport_limit | 136 | .extra2 = &xprt_max_resvport_limit |
137 | }, | 137 | }, |
138 | { | 138 | { |
@@ -642,6 +642,7 @@ static int xs_tcp_send_request(struct rpc_task *task) | |||
642 | struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); | 642 | struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); |
643 | struct xdr_buf *xdr = &req->rq_snd_buf; | 643 | struct xdr_buf *xdr = &req->rq_snd_buf; |
644 | bool zerocopy = true; | 644 | bool zerocopy = true; |
645 | bool vm_wait = false; | ||
645 | int status; | 646 | int status; |
646 | int sent; | 647 | int sent; |
647 | 648 | ||
@@ -677,15 +678,33 @@ static int xs_tcp_send_request(struct rpc_task *task) | |||
677 | return 0; | 678 | return 0; |
678 | } | 679 | } |
679 | 680 | ||
681 | WARN_ON_ONCE(sent == 0 && status == 0); | ||
682 | |||
683 | if (status == -EAGAIN ) { | ||
684 | /* | ||
685 | * Return EAGAIN if we're sure we're hitting the | ||
686 | * socket send buffer limits. | ||
687 | */ | ||
688 | if (test_bit(SOCK_NOSPACE, &transport->sock->flags)) | ||
689 | break; | ||
690 | /* | ||
691 | * Did we hit a memory allocation failure? | ||
692 | */ | ||
693 | if (sent == 0) { | ||
694 | status = -ENOBUFS; | ||
695 | if (vm_wait) | ||
696 | break; | ||
697 | /* Retry, knowing now that we're below the | ||
698 | * socket send buffer limit | ||
699 | */ | ||
700 | vm_wait = true; | ||
701 | } | ||
702 | continue; | ||
703 | } | ||
680 | if (status < 0) | 704 | if (status < 0) |
681 | break; | 705 | break; |
682 | if (sent == 0) { | 706 | vm_wait = false; |
683 | status = -EAGAIN; | ||
684 | break; | ||
685 | } | ||
686 | } | 707 | } |
687 | if (status == -EAGAIN && sk_stream_is_writeable(transport->inet)) | ||
688 | status = -ENOBUFS; | ||
689 | 708 | ||
690 | switch (status) { | 709 | switch (status) { |
691 | case -ENOTSOCK: | 710 | case -ENOTSOCK: |
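The rewritten send loop above treats EAGAIN in two ways: if SOCK_NOSPACE is set the socket buffer really is full and the caller should wait for write space, while a zero-byte send without that flag is taken as memory pressure and retried once before reporting -ENOBUFS. A rough userspace sketch of that decision; xs_send_chunk(), fake_xprt and its nospace field are stand-ins rather than real interfaces.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_xprt { bool nospace; };

/* Pretend sender: always reports EAGAIN with nothing sent. */
static int xs_send_chunk(struct fake_xprt *x, int *sent)
{
    (void)x;
    *sent = 0;
    errno = EAGAIN;
    return -1;
}

static int send_request(struct fake_xprt *x)
{
    bool vm_wait = false;

    for (;;) {
        int sent = 0;
        int status = xs_send_chunk(x, &sent);

        if (status < 0 && errno == EAGAIN) {
            if (x->nospace)
                return -EAGAIN;      /* buffer full: caller waits for space */
            if (sent == 0) {
                if (vm_wait)
                    return -ENOBUFS; /* second zero-byte send: give up */
                vm_wait = true;      /* first one: retry once */
            }
            continue;
        }
        if (status < 0)
            return status;
        vm_wait = false;             /* progress was made, clear the flag */
        return 0;
    }
}

int main(void)
{
    struct fake_xprt x = { .nospace = false };
    printf("send_request() -> %d\n", send_request(&x));
    return 0;
}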
@@ -755,11 +774,19 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s | |||
755 | sk->sk_error_report = transport->old_error_report; | 774 | sk->sk_error_report = transport->old_error_report; |
756 | } | 775 | } |
757 | 776 | ||
777 | static void xs_sock_reset_state_flags(struct rpc_xprt *xprt) | ||
778 | { | ||
779 | struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); | ||
780 | |||
781 | clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); | ||
782 | } | ||
783 | |||
758 | static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) | 784 | static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) |
759 | { | 785 | { |
760 | smp_mb__before_atomic(); | 786 | smp_mb__before_atomic(); |
761 | clear_bit(XPRT_CLOSE_WAIT, &xprt->state); | 787 | clear_bit(XPRT_CLOSE_WAIT, &xprt->state); |
762 | clear_bit(XPRT_CLOSING, &xprt->state); | 788 | clear_bit(XPRT_CLOSING, &xprt->state); |
789 | xs_sock_reset_state_flags(xprt); | ||
763 | smp_mb__after_atomic(); | 790 | smp_mb__after_atomic(); |
764 | } | 791 | } |
765 | 792 | ||
@@ -962,10 +989,13 @@ static void xs_local_data_receive(struct sock_xprt *transport) | |||
962 | goto out; | 989 | goto out; |
963 | for (;;) { | 990 | for (;;) { |
964 | skb = skb_recv_datagram(sk, 0, 1, &err); | 991 | skb = skb_recv_datagram(sk, 0, 1, &err); |
965 | if (skb == NULL) | 992 | if (skb != NULL) { |
993 | xs_local_data_read_skb(&transport->xprt, sk, skb); | ||
994 | skb_free_datagram(sk, skb); | ||
995 | continue; | ||
996 | } | ||
997 | if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) | ||
966 | break; | 998 | break; |
967 | xs_local_data_read_skb(&transport->xprt, sk, skb); | ||
968 | skb_free_datagram(sk, skb); | ||
969 | } | 999 | } |
970 | out: | 1000 | out: |
971 | mutex_unlock(&transport->recv_mutex); | 1001 | mutex_unlock(&transport->recv_mutex); |
@@ -1043,10 +1073,13 @@ static void xs_udp_data_receive(struct sock_xprt *transport) | |||
1043 | goto out; | 1073 | goto out; |
1044 | for (;;) { | 1074 | for (;;) { |
1045 | skb = skb_recv_datagram(sk, 0, 1, &err); | 1075 | skb = skb_recv_datagram(sk, 0, 1, &err); |
1046 | if (skb == NULL) | 1076 | if (skb != NULL) { |
1077 | xs_udp_data_read_skb(&transport->xprt, sk, skb); | ||
1078 | skb_free_datagram(sk, skb); | ||
1079 | continue; | ||
1080 | } | ||
1081 | if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) | ||
1047 | break; | 1082 | break; |
1048 | xs_udp_data_read_skb(&transport->xprt, sk, skb); | ||
1049 | skb_free_datagram(sk, skb); | ||
1050 | } | 1083 | } |
1051 | out: | 1084 | out: |
1052 | mutex_unlock(&transport->recv_mutex); | 1085 | mutex_unlock(&transport->recv_mutex); |
@@ -1074,7 +1107,14 @@ static void xs_data_ready(struct sock *sk) | |||
1074 | if (xprt != NULL) { | 1107 | if (xprt != NULL) { |
1075 | struct sock_xprt *transport = container_of(xprt, | 1108 | struct sock_xprt *transport = container_of(xprt, |
1076 | struct sock_xprt, xprt); | 1109 | struct sock_xprt, xprt); |
1077 | queue_work(rpciod_workqueue, &transport->recv_worker); | 1110 | transport->old_data_ready(sk); |
1111 | /* Any data means we had a useful conversation, so | ||
1112 | * then we don't need to delay the next reconnect | ||
1113 | */ | ||
1114 | if (xprt->reestablish_timeout) | ||
1115 | xprt->reestablish_timeout = 0; | ||
1116 | if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) | ||
1117 | queue_work(xprtiod_workqueue, &transport->recv_worker); | ||
1078 | } | 1118 | } |
1079 | read_unlock_bh(&sk->sk_callback_lock); | 1119 | read_unlock_bh(&sk->sk_callback_lock); |
1080 | } | 1120 | } |
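xs_data_ready() now latches XPRT_SOCK_DATA_READY with test_and_set_bit() so the receive worker is queued at most once per burst, and the receive loops clear the bit before deciding whether to stop. A userspace analogue of that latch using C11 atomics; the function names here are invented, and atomic_exchange() plays the role of the test-and-set/test-and-clear bit operations.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool data_ready;
static int worker_queued;

static void queue_recv_worker(void) { worker_queued++; }

/* sk_data_ready-style callback: queue the worker only on a 0 -> 1 transition. */
static void data_ready_cb(void)
{
    if (!atomic_exchange(&data_ready, true))
        queue_recv_worker();
}

/* Receive worker: clear the latch, drain, and only stop once no new
 * data-ready signal arrived while we were draining. */
static void recv_worker(void)
{
    while (atomic_exchange(&data_ready, false)) {
        /* ...read everything currently queued on the socket... */
        printf("draining\n");
    }
}

int main(void)
{
    data_ready_cb();   /* first signal queues the worker */
    data_ready_cb();   /* second one is coalesced        */
    recv_worker();
    printf("worker queued %d time(s)\n", worker_queued);
    return 0;
}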
@@ -1474,10 +1514,15 @@ static void xs_tcp_data_receive(struct sock_xprt *transport) | |||
1474 | for (;;) { | 1514 | for (;;) { |
1475 | lock_sock(sk); | 1515 | lock_sock(sk); |
1476 | read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); | 1516 | read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); |
1477 | release_sock(sk); | 1517 | if (read <= 0) { |
1478 | if (read <= 0) | 1518 | clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); |
1479 | break; | 1519 | release_sock(sk); |
1480 | total += read; | 1520 | if (!test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) |
1521 | break; | ||
1522 | } else { | ||
1523 | release_sock(sk); | ||
1524 | total += read; | ||
1525 | } | ||
1481 | rd_desc.count = 65536; | 1526 | rd_desc.count = 65536; |
1482 | } | 1527 | } |
1483 | out: | 1528 | out: |
@@ -1493,34 +1538,6 @@ static void xs_tcp_data_receive_workfn(struct work_struct *work) | |||
1493 | } | 1538 | } |
1494 | 1539 | ||
1495 | /** | 1540 | /** |
1496 | * xs_tcp_data_ready - "data ready" callback for TCP sockets | ||
1497 | * @sk: socket with data to read | ||
1498 | * | ||
1499 | */ | ||
1500 | static void xs_tcp_data_ready(struct sock *sk) | ||
1501 | { | ||
1502 | struct sock_xprt *transport; | ||
1503 | struct rpc_xprt *xprt; | ||
1504 | |||
1505 | dprintk("RPC: xs_tcp_data_ready...\n"); | ||
1506 | |||
1507 | read_lock_bh(&sk->sk_callback_lock); | ||
1508 | if (!(xprt = xprt_from_sock(sk))) | ||
1509 | goto out; | ||
1510 | transport = container_of(xprt, struct sock_xprt, xprt); | ||
1511 | |||
1512 | /* Any data means we had a useful conversation, so | ||
1513 | * the we don't need to delay the next reconnect | ||
1514 | */ | ||
1515 | if (xprt->reestablish_timeout) | ||
1516 | xprt->reestablish_timeout = 0; | ||
1517 | queue_work(rpciod_workqueue, &transport->recv_worker); | ||
1518 | |||
1519 | out: | ||
1520 | read_unlock_bh(&sk->sk_callback_lock); | ||
1521 | } | ||
1522 | |||
1523 | /** | ||
1524 | * xs_tcp_state_change - callback to handle TCP socket state changes | 1541 | * xs_tcp_state_change - callback to handle TCP socket state changes |
1525 | * @sk: socket whose state has changed | 1542 | * @sk: socket whose state has changed |
1526 | * | 1543 | * |
@@ -1714,7 +1731,7 @@ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task) | |||
1714 | 1731 | ||
1715 | static unsigned short xs_get_random_port(void) | 1732 | static unsigned short xs_get_random_port(void) |
1716 | { | 1733 | { |
1717 | unsigned short range = xprt_max_resvport - xprt_min_resvport; | 1734 | unsigned short range = xprt_max_resvport - xprt_min_resvport + 1; |
1718 | unsigned short rand = (unsigned short) prandom_u32() % range; | 1735 | unsigned short rand = (unsigned short) prandom_u32() % range; |
1719 | return rand + xprt_min_resvport; | 1736 | return rand + xprt_min_resvport; |
1720 | } | 1737 | } |
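The +1 above is the point of the change: with range = max - min, the modulo can never yield the top port, so xprt_max_resvport was unreachable. A quick standalone check of both formulas, assuming the usual 665..1023 reserved-port window; rand() stands in for prandom_u32().

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    const unsigned short min = 665, max = 1023;   /* assumed resvport window */
    unsigned short seen_max_old = 0, seen_max_new = 0;

    for (int i = 0; i < 1000000; i++) {
        unsigned int r = (unsigned int)rand();
        unsigned short old_port = min + (unsigned short)(r % (max - min));
        unsigned short new_port = min + (unsigned short)(r % (max - min + 1));
        if (old_port > seen_max_old) seen_max_old = old_port;
        if (new_port > seen_max_new) seen_max_new = new_port;
    }
    /* The old formula tops out one short of max; the new one reaches it. */
    printf("old formula peaks at %u, new formula at %u\n",
           seen_max_old, seen_max_new);
    return 0;
}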
@@ -2241,7 +2258,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) | |||
2241 | xs_save_old_callbacks(transport, sk); | 2258 | xs_save_old_callbacks(transport, sk); |
2242 | 2259 | ||
2243 | sk->sk_user_data = xprt; | 2260 | sk->sk_user_data = xprt; |
2244 | sk->sk_data_ready = xs_tcp_data_ready; | 2261 | sk->sk_data_ready = xs_data_ready; |
2245 | sk->sk_state_change = xs_tcp_state_change; | 2262 | sk->sk_state_change = xs_tcp_state_change; |
2246 | sk->sk_write_space = xs_tcp_write_space; | 2263 | sk->sk_write_space = xs_tcp_write_space; |
2247 | sock_set_flag(sk, SOCK_FASYNC); | 2264 | sock_set_flag(sk, SOCK_FASYNC); |
@@ -2380,7 +2397,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) | |||
2380 | /* Start by resetting any existing state */ | 2397 | /* Start by resetting any existing state */ |
2381 | xs_reset_transport(transport); | 2398 | xs_reset_transport(transport); |
2382 | 2399 | ||
2383 | queue_delayed_work(rpciod_workqueue, | 2400 | queue_delayed_work(xprtiod_workqueue, |
2384 | &transport->connect_worker, | 2401 | &transport->connect_worker, |
2385 | xprt->reestablish_timeout); | 2402 | xprt->reestablish_timeout); |
2386 | xprt->reestablish_timeout <<= 1; | 2403 | xprt->reestablish_timeout <<= 1; |
@@ -2390,7 +2407,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) | |||
2390 | xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO; | 2407 | xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO; |
2391 | } else { | 2408 | } else { |
2392 | dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); | 2409 | dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); |
2393 | queue_delayed_work(rpciod_workqueue, | 2410 | queue_delayed_work(xprtiod_workqueue, |
2394 | &transport->connect_worker, 0); | 2411 | &transport->connect_worker, 0); |
2395 | } | 2412 | } |
2396 | } | 2413 | } |
@@ -3153,8 +3170,12 @@ static int param_set_uint_minmax(const char *val, | |||
3153 | 3170 | ||
3154 | static int param_set_portnr(const char *val, const struct kernel_param *kp) | 3171 | static int param_set_portnr(const char *val, const struct kernel_param *kp) |
3155 | { | 3172 | { |
3156 | return param_set_uint_minmax(val, kp, | 3173 | if (kp->arg == &xprt_min_resvport) |
3174 | return param_set_uint_minmax(val, kp, | ||
3157 | RPC_MIN_RESVPORT, | 3175 | RPC_MIN_RESVPORT, |
3176 | xprt_max_resvport); | ||
3177 | return param_set_uint_minmax(val, kp, | ||
3178 | xprt_min_resvport, | ||
3158 | RPC_MAX_RESVPORT); | 3179 | RPC_MAX_RESVPORT); |
3159 | } | 3180 | } |
3160 | 3181 | ||
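The final hunk couples the two tunable bounds: the ceiling for min_resvport becomes the current xprt_max_resvport, and (in the earlier xs_tunables_table hunk) the floor for max_resvport becomes the current xprt_min_resvport, so the reserved-port window can never invert. A plain C stand-in for that validation; set_in_range() and set_portnr() are illustrative, not the kernel helpers.

#include <stdio.h>

static unsigned int xprt_min_resvport = 665;
static unsigned int xprt_max_resvport = 1023;
enum { RPC_MIN_RESVPORT = 1, RPC_MAX_RESVPORT = 65535 };

static int set_in_range(unsigned int *dst, unsigned int val,
                        unsigned int lo, unsigned int hi)
{
    if (val < lo || val > hi)
        return -1;
    *dst = val;
    return 0;
}

/* Which variable is being set decides which end of the window is pinned
 * to the other variable's current value. */
static int set_portnr(unsigned int *dst, unsigned int val)
{
    if (dst == &xprt_min_resvport)
        return set_in_range(dst, val, RPC_MIN_RESVPORT, xprt_max_resvport);
    return set_in_range(dst, val, xprt_min_resvport, RPC_MAX_RESVPORT);
}

int main(void)
{
    printf("min=2000 -> %d (rejected: above current max)\n",
           set_portnr(&xprt_min_resvport, 2000));
    printf("max=800  -> %d (accepted: above current min)\n",
           set_portnr(&xprt_max_resvport, 800));
    return 0;
}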