author    Linus Torvalds <torvalds@linux-foundation.org>  2018-01-30 22:03:48 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2018-01-30 22:03:48 -0500
commit    efd52b5d363e3e3b6224ad39949219c0df117c91 (patch)
tree      2d885d2f431a324af58d8f267755240bff3e32da
parent    1ed2d76e0213751c82e3a242b61b0883daf330df (diff)
parent    e231c6879cfd44e4fffd384bb6dd7d313249a523 (diff)
Merge tag 'nfs-for-4.16-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
Pull NFS client updates from Trond Myklebust:
 "Highlights include:

  Stable bugfixes:

   - Fix breakages in the nfsstat utility due to the inclusion of the
     NFSv4 LOOKUPP operation

   - Fix a NULL pointer dereference in nfs_idmap_prepare_pipe_upcall()
     due to nfs_idmap_legacy_upcall() being called without an 'aux'
     parameter

   - Fix a refcount leak in the standard O_DIRECT error path

   - Fix a refcount leak in the pNFS O_DIRECT fallback to MDS path

   - Fix CPU latency issues with nfs_commit_release_pages()

   - Fix the LAYOUTUNAVAILABLE error case in the file layout type

   - NFS: Fix a race between mmap() and O_DIRECT

  Features:

   - Support the statx() mask and query flags to enable optimisations
     when the user is requesting only attributes that are already up to
     date in the inode cache, or is specifying the AT_STATX_DONT_SYNC
     flag

   - Add a module alias for the SCSI pNFS layout type

  Bugfixes:

   - Automounting when resolving a NFSv4 referral should preserve the
     RDMA transport protocol settings

   - Various other RDMA bugfixes from Chuck

   - pNFS block layout fixes

   - Always set NFS_LOCK_LOST when a lock is lost"

* tag 'nfs-for-4.16-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (69 commits)
  NFS: Fix a race between mmap() and O_DIRECT
  NFS: Remove a redundant call to unmap_mapping_range()
  pnfs/blocklayout: Ensure disk address in block device map
  pnfs/blocklayout: pnfs_block_dev_map uses bytes, not sectors
  lockd: Fix server refcounting
  SUNRPC: Fix null rpc_clnt dereference in rpc_task_queued tracepoint
  SUNRPC: Micro-optimize __rpc_execute
  SUNRPC: task_run_action should display tk_callback
  sunrpc: Format RPC events consistently for display
  SUNRPC: Trace xprt_timer events
  xprtrdma: Correct some documenting comments
  xprtrdma: Fix "bytes registered" accounting
  xprtrdma: Instrument allocation/release of rpcrdma_req/rep objects
  xprtrdma: Add trace points to instrument QP and CQ access upcalls
  xprtrdma: Add trace points in the client-side backchannel code paths
  xprtrdma: Add trace points for connect events
  xprtrdma: Add trace points to instrument MR allocation and recovery
  xprtrdma: Add trace points to instrument memory invalidation
  xprtrdma: Add trace points in reply decoder path
  xprtrdma: Add trace points to instrument memory registration
  ...
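The statx() feature noted above is exercised from userspace roughly as follows. This is a minimal sketch, not code from this merge; it assumes a glibc with the statx() wrapper (2.28 or later), and /mnt/nfs/file is a placeholder path:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
        struct statx stx;

        /* Ask only for the file size, and allow a cached answer:
         * with this merge the NFS client can then skip the
         * over-the-wire GETATTR when the inode cache is fresh. */
        if (statx(AT_FDCWD, "/mnt/nfs/file", AT_STATX_DONT_SYNC,
                  STATX_SIZE, &stx) != 0) {
                perror("statx");
                return 1;
        }
        if (stx.stx_mask & STATX_SIZE)
                printf("size=%llu\n", (unsigned long long)stx.stx_size);
        return 0;
}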
-rw-r--r--  fs/lockd/clntproc.c                |  14
-rw-r--r--  fs/lockd/host.c                    |  22
-rw-r--r--  fs/lockd/mon.c                     |  14
-rw-r--r--  fs/lockd/svcproc.c                 |   2
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c   |  94
-rw-r--r--  fs/nfs/blocklayout/blocklayout.h   |   7
-rw-r--r--  fs/nfs/blocklayout/dev.c           |   7
-rw-r--r--  fs/nfs/direct.c                    |   4
-rw-r--r--  fs/nfs/export.c                    |   5
-rw-r--r--  fs/nfs/filelayout/filelayout.c     |   4
-rw-r--r--  fs/nfs/inode.c                     |  53
-rw-r--r--  fs/nfs/io.c                        |   2
-rw-r--r--  fs/nfs/nfs4client.c                |  24
-rw-r--r--  fs/nfs/nfs4idmap.c                 |   6
-rw-r--r--  fs/nfs/nfs4namespace.c             |   2
-rw-r--r--  fs/nfs/nfs4proc.c                  |  42
-rw-r--r--  fs/nfs/nfs4state.c                 |   5
-rw-r--r--  fs/nfs/nfs4sysctl.c                |   2
-rw-r--r--  fs/nfs/nfs4xdr.c                   |  64
-rw-r--r--  fs/nfs/nfstrace.h                  |  22
-rw-r--r--  fs/nfs/pagelist.c                  |   8
-rw-r--r--  fs/nfs/pnfs.c                      |   6
-rw-r--r--  fs/nfs/pnfs.h                      |   6
-rw-r--r--  fs/nfs/pnfs_dev.c                  |   1
-rw-r--r--  fs/nfs/write.c                     |   2
-rw-r--r--  include/linux/lockd/lockd.h        |   9
-rw-r--r--  include/linux/nfs4.h               |  12
-rw-r--r--  include/linux/sunrpc/clnt.h        |   1
-rw-r--r--  include/linux/sunrpc/xprtrdma.h    |   2
-rw-r--r--  include/trace/events/rdma.h        | 129
-rw-r--r--  include/trace/events/rpcrdma.h     | 890
-rw-r--r--  include/trace/events/sunrpc.h      |  12
-rw-r--r--  include/uapi/linux/nfs.h           |   1
-rw-r--r--  net/sunrpc/clnt.c                  |  16
-rw-r--r--  net/sunrpc/sched.c                 |  26
-rw-r--r--  net/sunrpc/xprt.c                  |   2
-rw-r--r--  net/sunrpc/xprtrdma/backchannel.c  |  78
-rw-r--r--  net/sunrpc/xprtrdma/fmr_ops.c      | 157
-rw-r--r--  net/sunrpc/xprtrdma/frwr_ops.c     | 329
-rw-r--r--  net/sunrpc/xprtrdma/module.c       |  12
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c     | 162
-rw-r--r--  net/sunrpc/xprtrdma/transport.c    | 128
-rw-r--r--  net/sunrpc/xprtrdma/verbs.c        | 280
-rw-r--r--  net/sunrpc/xprtrdma/xprt_rdma.h    | 116
-rw-r--r--  net/sunrpc/xprtsock.c              |  36
45 files changed, 1995 insertions(+), 821 deletions(-)
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 066ac313ae5c..a2c0dfc6fdc0 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -48,13 +48,13 @@ void nlmclnt_next_cookie(struct nlm_cookie *c)
 
 static struct nlm_lockowner *nlm_get_lockowner(struct nlm_lockowner *lockowner)
 {
-        atomic_inc(&lockowner->count);
+        refcount_inc(&lockowner->count);
         return lockowner;
 }
 
 static void nlm_put_lockowner(struct nlm_lockowner *lockowner)
 {
-        if (!atomic_dec_and_lock(&lockowner->count, &lockowner->host->h_lock))
+        if (!refcount_dec_and_lock(&lockowner->count, &lockowner->host->h_lock))
                 return;
         list_del(&lockowner->list);
         spin_unlock(&lockowner->host->h_lock);
@@ -105,7 +105,7 @@ static struct nlm_lockowner *nlm_find_lockowner(struct nlm_host *host, fl_owner_
         res = __nlm_find_lockowner(host, owner);
         if (res == NULL && new != NULL) {
                 res = new;
-                atomic_set(&new->count, 1);
+                refcount_set(&new->count, 1);
                 new->owner = owner;
                 new->pid = __nlm_alloc_pid(host);
                 new->host = nlm_get_host(host);
@@ -204,7 +204,7 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
         for(;;) {
                 call = kzalloc(sizeof(*call), GFP_KERNEL);
                 if (call != NULL) {
-                        atomic_set(&call->a_count, 1);
+                        refcount_set(&call->a_count, 1);
                         locks_init_lock(&call->a_args.lock.fl);
                         locks_init_lock(&call->a_res.lock.fl);
                         call->a_host = nlm_get_host(host);
@@ -222,7 +222,7 @@ void nlmclnt_release_call(struct nlm_rqst *call)
 {
         const struct nlmclnt_operations *nlmclnt_ops = call->a_host->h_nlmclnt_ops;
 
-        if (!atomic_dec_and_test(&call->a_count))
+        if (!refcount_dec_and_test(&call->a_count))
                 return;
         if (nlmclnt_ops && nlmclnt_ops->nlmclnt_release_call)
                 nlmclnt_ops->nlmclnt_release_call(call->a_callback_data);
@@ -678,7 +678,7 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
                 goto out;
         }
 
-        atomic_inc(&req->a_count);
+        refcount_inc(&req->a_count);
         status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req,
                         NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
         if (status < 0)
@@ -769,7 +769,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl
         nlmclnt_setlockargs(req, fl);
         req->a_args.block = block;
 
-        atomic_inc(&req->a_count);
+        refcount_inc(&req->a_count);
         status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req,
                         NLMPROC_CANCEL, &nlmclnt_cancel_ops);
         if (status == 0 && req->a_res.status == nlm_lck_denied)
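The lockd hunks in this and the following lockd files are the same mechanical conversion from atomic_t to refcount_t. Below is a self-contained sketch of the pattern, with hypothetical obj/obj_alloc/obj_get/obj_put names (only the refcount_* calls are the real API); the point of the conversion is that refcount_t saturates and WARNs on overflow and on increment-from-zero, where atomic_t silently wraps:

#include <linux/refcount.h>
#include <linux/slab.h>

struct obj {
        refcount_t count;       /* replaces an atomic_t reference count */
};

static struct obj *obj_alloc(void)
{
        struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

        if (o)
                refcount_set(&o->count, 1);     /* initial reference */
        return o;
}

static struct obj *obj_get(struct obj *o)
{
        refcount_inc(&o->count);                /* WARNs if the count was zero */
        return o;
}

static void obj_put(struct obj *o)
{
        if (refcount_dec_and_test(&o->count))   /* true only for the last put */
                kfree(o);
}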
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 826a89184f90..d35cd6be0675 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -114,7 +114,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
         unsigned long now = jiffies;
 
         if (nsm != NULL)
-                atomic_inc(&nsm->sm_count);
+                refcount_inc(&nsm->sm_count);
         else {
                 host = NULL;
                 nsm = nsm_get_handle(ni->net, ni->sap, ni->salen,
@@ -151,7 +151,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
         host->h_state = 0;
         host->h_nsmstate = 0;
         host->h_pidcount = 0;
-        atomic_set(&host->h_count, 1);
+        refcount_set(&host->h_count, 1);
         mutex_init(&host->h_mutex);
         host->h_nextrebind = now + NLM_HOST_REBIND;
         host->h_expires = now + NLM_HOST_EXPIRE;
@@ -290,7 +290,7 @@ void nlmclnt_release_host(struct nlm_host *host)
 
         WARN_ON_ONCE(host->h_server);
 
-        if (atomic_dec_and_test(&host->h_count)) {
+        if (refcount_dec_and_test(&host->h_count)) {
                 WARN_ON_ONCE(!list_empty(&host->h_lockowners));
                 WARN_ON_ONCE(!list_empty(&host->h_granted));
                 WARN_ON_ONCE(!list_empty(&host->h_reclaim));
@@ -388,6 +388,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
         ln->nrhosts++;
         nrhosts++;
 
+        refcount_inc(&host->h_count);
+
         dprintk("lockd: %s created host %s (%s)\n",
                 __func__, host->h_name, host->h_addrbuf);
 
@@ -410,7 +412,7 @@ void nlmsvc_release_host(struct nlm_host *host)
         dprintk("lockd: release server host %s\n", host->h_name);
 
         WARN_ON_ONCE(!host->h_server);
-        atomic_dec(&host->h_count);
+        refcount_dec(&host->h_count);
 }
 
 /*
@@ -504,7 +506,7 @@ struct nlm_host * nlm_get_host(struct nlm_host *host)
 {
         if (host) {
                 dprintk("lockd: get host %s\n", host->h_name);
-                atomic_inc(&host->h_count);
+                refcount_inc(&host->h_count);
                 host->h_expires = jiffies + NLM_HOST_EXPIRE;
         }
         return host;
@@ -593,7 +595,7 @@ static void nlm_complain_hosts(struct net *net)
                 if (net && host->net != net)
                         continue;
                 dprintk(" %s (cnt %d use %d exp %ld net %x)\n",
-                        host->h_name, atomic_read(&host->h_count),
+                        host->h_name, refcount_read(&host->h_count),
                         host->h_inuse, host->h_expires, host->net->ns.inum);
         }
 }
@@ -662,16 +664,16 @@ nlm_gc_hosts(struct net *net)
         for_each_host_safe(host, next, chain, nlm_server_hosts) {
                 if (net && host->net != net)
                         continue;
-                if (atomic_read(&host->h_count) || host->h_inuse
-                    || time_before(jiffies, host->h_expires)) {
+                if (host->h_inuse || time_before(jiffies, host->h_expires)) {
                         dprintk("nlm_gc_hosts skipping %s "
                                 "(cnt %d use %d exp %ld net %x)\n",
-                                host->h_name, atomic_read(&host->h_count),
+                                host->h_name, refcount_read(&host->h_count),
                                 host->h_inuse, host->h_expires,
                                 host->net->ns.inum);
                         continue;
                 }
-                nlm_destroy_host_locked(host);
+                if (refcount_dec_if_one(&host->h_count))
+                        nlm_destroy_host_locked(host);
         }
 
         if (net) {
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 96cfb2967ac7..654594ef4f94 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -191,7 +191,7 @@ void nsm_unmonitor(const struct nlm_host *host)
         struct nsm_res res;
         int status;
 
-        if (atomic_read(&nsm->sm_count) == 1
+        if (refcount_read(&nsm->sm_count) == 1
             && nsm->sm_monitored && !nsm->sm_sticky) {
                 dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
 
@@ -279,7 +279,7 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
         if (unlikely(new == NULL))
                 return NULL;
 
-        atomic_set(&new->sm_count, 1);
+        refcount_set(&new->sm_count, 1);
         new->sm_name = (char *)(new + 1);
         memcpy(nsm_addr(new), sap, salen);
         new->sm_addrlen = salen;
@@ -337,13 +337,13 @@ retry:
         cached = nsm_lookup_addr(&ln->nsm_handles, sap);
 
         if (cached != NULL) {
-                atomic_inc(&cached->sm_count);
+                refcount_inc(&cached->sm_count);
                 spin_unlock(&nsm_lock);
                 kfree(new);
                 dprintk("lockd: found nsm_handle for %s (%s), "
                         "cnt %d\n", cached->sm_name,
                         cached->sm_addrbuf,
-                        atomic_read(&cached->sm_count));
+                        refcount_read(&cached->sm_count));
                 return cached;
         }
 
@@ -388,12 +388,12 @@ struct nsm_handle *nsm_reboot_lookup(const struct net *net,
                 return cached;
         }
 
-        atomic_inc(&cached->sm_count);
+        refcount_inc(&cached->sm_count);
         spin_unlock(&nsm_lock);
 
         dprintk("lockd: host %s (%s) rebooted, cnt %d\n",
                 cached->sm_name, cached->sm_addrbuf,
-                atomic_read(&cached->sm_count));
+                refcount_read(&cached->sm_count));
         return cached;
 }
 
@@ -404,7 +404,7 @@ struct nsm_handle *nsm_reboot_lookup(const struct net *net,
  */
 void nsm_release(struct nsm_handle *nsm)
 {
-        if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
+        if (refcount_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
                 list_del(&nsm->sm_link);
                 spin_unlock(&nsm_lock);
                 dprintk("lockd: destroyed nsm_handle for %s (%s)\n",
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 0d670c5c378f..ea77c66d3cc3 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -295,7 +295,7 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
 
 void nlmsvc_release_call(struct nlm_rqst *call)
 {
-        if (!atomic_dec_and_test(&call->a_count))
+        if (!refcount_dec_and_test(&call->a_count))
                 return;
         nlmsvc_release_host(call->a_host);
         kfree(call);
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 995d707537da..7cb5c38c19e4 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -137,6 +137,11 @@ bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
         return bio;
 }
 
+static bool offset_in_map(u64 offset, struct pnfs_block_dev_map *map)
+{
+        return offset >= map->start && offset < map->start + map->len;
+}
+
 static struct bio *
 do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
                 struct page *page, struct pnfs_block_dev_map *map,
@@ -156,8 +161,8 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
 
         /* translate to physical disk offset */
         disk_addr = (u64)isect << SECTOR_SHIFT;
-        if (disk_addr < map->start || disk_addr >= map->start + map->len) {
-                if (!dev->map(dev, disk_addr, map))
+        if (!offset_in_map(disk_addr, map)) {
+                if (!dev->map(dev, disk_addr, map) || !offset_in_map(disk_addr, map))
                         return ERR_PTR(-EIO);
                 bio = bl_submit_bio(bio);
         }
@@ -184,6 +189,29 @@ retry:
         return bio;
 }
 
+static void bl_mark_devices_unavailable(struct nfs_pgio_header *header, bool rw)
+{
+        struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
+        size_t bytes_left = header->args.count;
+        sector_t isect, extent_length = 0;
+        struct pnfs_block_extent be;
+
+        isect = header->args.offset >> SECTOR_SHIFT;
+        bytes_left += header->args.offset - (isect << SECTOR_SHIFT);
+
+        while (bytes_left > 0) {
+                if (!ext_tree_lookup(bl, isect, &be, rw))
+                        return;
+                extent_length = be.be_length - (isect - be.be_f_offset);
+                nfs4_mark_deviceid_unavailable(be.be_device);
+                isect += extent_length;
+                if (bytes_left > extent_length << SECTOR_SHIFT)
+                        bytes_left -= extent_length << SECTOR_SHIFT;
+                else
+                        bytes_left = 0;
+        }
+}
+
 static void bl_end_io_read(struct bio *bio)
 {
         struct parallel_io *par = bio->bi_private;
@@ -194,6 +222,7 @@ static void bl_end_io_read(struct bio *bio)
                 if (!header->pnfs_error)
                         header->pnfs_error = -EIO;
                 pnfs_set_lo_fail(header->lseg);
+                bl_mark_devices_unavailable(header, false);
         }
 
         bio_put(bio);
@@ -323,6 +352,7 @@ static void bl_end_io_write(struct bio *bio)
                 if (!header->pnfs_error)
                         header->pnfs_error = -EIO;
                 pnfs_set_lo_fail(header->lseg);
+                bl_mark_devices_unavailable(header, true);
         }
         bio_put(bio);
         put_parallel(par);
@@ -552,6 +582,31 @@ static int decode_sector_number(__be32 **rp, sector_t *sp)
         return 0;
 }
 
+static struct nfs4_deviceid_node *
+bl_find_get_deviceid(struct nfs_server *server,
+                const struct nfs4_deviceid *id, struct rpc_cred *cred,
+                gfp_t gfp_mask)
+{
+        struct nfs4_deviceid_node *node;
+        unsigned long start, end;
+
+retry:
+        node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
+        if (!node)
+                return ERR_PTR(-ENODEV);
+
+        if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0)
+                return node;
+
+        end = jiffies;
+        start = end - PNFS_DEVICE_RETRY_TIMEOUT;
+        if (!time_in_range(node->timestamp_unavailable, start, end)) {
+                nfs4_delete_deviceid(node->ld, node->nfs_client, id);
+                goto retry;
+        }
+        return ERR_PTR(-ENODEV);
+}
+
 static int
 bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo,
                 struct layout_verification *lv, struct list_head *extents,
@@ -573,16 +628,18 @@ bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo,
         memcpy(&id, p, NFS4_DEVICEID4_SIZE);
         p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
 
-        error = -EIO;
-        be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
+        be->be_device = bl_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
                         lo->plh_lc_cred, gfp_mask);
-        if (!be->be_device)
+        if (IS_ERR(be->be_device)) {
+                error = PTR_ERR(be->be_device);
                 goto out_free_be;
+        }
 
         /*
          * The next three values are read in as bytes, but stored in the
          * extent structure in 512-byte granularity.
          */
+        error = -EIO;
         if (decode_sector_number(&p, &be->be_f_offset) < 0)
                 goto out_put_deviceid;
         if (decode_sector_number(&p, &be->be_length) < 0)
@@ -692,11 +749,16 @@ out_free_scratch:
         __free_page(scratch);
 out:
         dprintk("%s returns %d\n", __func__, status);
-        if (status) {
+        switch (status) {
+        case -ENODEV:
+                /* Our extent block devices are unavailable */
+                set_bit(NFS_LSEG_UNAVAILABLE, &lseg->pls_flags);
+        case 0:
+                return lseg;
+        default:
                 kfree(lseg);
                 return ERR_PTR(status);
         }
-        return lseg;
 }
 
 static void
@@ -798,6 +860,13 @@ bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
         }
 
         pnfs_generic_pg_init_read(pgio, req);
+
+        if (pgio->pg_lseg &&
+                test_bit(NFS_LSEG_UNAVAILABLE, &pgio->pg_lseg->pls_flags)) {
+                pnfs_error_mark_layout_for_return(pgio->pg_inode, pgio->pg_lseg);
+                pnfs_set_lo_fail(pgio->pg_lseg);
+                nfs_pageio_reset_read_mds(pgio);
+        }
 }
 
 /*
@@ -853,6 +922,14 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
                 wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
 
         pnfs_generic_pg_init_write(pgio, req, wb_size);
+
+        if (pgio->pg_lseg &&
+                test_bit(NFS_LSEG_UNAVAILABLE, &pgio->pg_lseg->pls_flags)) {
+
+                pnfs_error_mark_layout_for_return(pgio->pg_inode, pgio->pg_lseg);
+                pnfs_set_lo_fail(pgio->pg_lseg);
+                nfs_pageio_reset_write_mds(pgio);
+        }
 }
 
 /*
@@ -887,6 +964,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
         .name           = "LAYOUT_BLOCK_VOLUME",
         .owner          = THIS_MODULE,
         .flags          = PNFS_LAYOUTRET_ON_SETATTR |
+                          PNFS_LAYOUTRET_ON_ERROR |
                           PNFS_READ_WHOLE_PAGE,
         .read_pagelist  = bl_read_pagelist,
         .write_pagelist = bl_write_pagelist,
@@ -910,6 +988,7 @@ static struct pnfs_layoutdriver_type scsilayout_type = {
         .name           = "LAYOUT_SCSI",
         .owner          = THIS_MODULE,
         .flags          = PNFS_LAYOUTRET_ON_SETATTR |
+                          PNFS_LAYOUTRET_ON_ERROR |
                           PNFS_READ_WHOLE_PAGE,
         .read_pagelist  = bl_read_pagelist,
         .write_pagelist = bl_write_pagelist,
@@ -967,6 +1046,7 @@ static void __exit nfs4blocklayout_exit(void)
 }
 
 MODULE_ALIAS("nfs-layouttype4-3");
+MODULE_ALIAS("nfs-layouttype4-5");
 
 module_init(nfs4blocklayout_init);
 module_exit(nfs4blocklayout_exit);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index efc007f00742..716bc75e9ed2 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -92,10 +92,9 @@ struct pnfs_block_volume {
 };
 
 struct pnfs_block_dev_map {
-        sector_t                start;
-        sector_t                len;
-
-        sector_t                disk_offset;
+        u64                     start;
+        u64                     len;
+        u64                     disk_offset;
         struct block_device     *bdev;
 };
 
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index 95f74bd2c067..a7efd83779d2 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -533,14 +533,11 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
                 goto out_free_volumes;
 
         ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask);
-        if (ret) {
-                bl_free_device(top);
-                kfree(top);
-                goto out_free_volumes;
-        }
 
         node = &top->node;
         nfs4_init_deviceid_node(node, server, &pdev->dev_id);
+        if (ret)
+                nfs4_mark_deviceid_unavailable(node);
 
 out_free_volumes:
         kfree(volumes);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index d2972d537469..8c10b0562e75 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -775,10 +775,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 
         spin_lock(&dreq->lock);
 
-        if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
-                dreq->flags = 0;
+        if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
                 dreq->error = hdr->error;
-        }
         if (dreq->error == 0) {
                 nfs_direct_good_bytes(dreq, hdr);
                 if (nfs_write_need_commit(hdr)) {
diff --git a/fs/nfs/export.c b/fs/nfs/export.c
index 83fd09fc8f77..ab5de3246c5c 100644
--- a/fs/nfs/export.c
+++ b/fs/nfs/export.c
@@ -48,10 +48,6 @@ nfs_encode_fh(struct inode *inode, __u32 *p, int *max_len, struct inode *parent)
                 *max_len = len;
                 return FILEID_INVALID;
         }
-        if (IS_AUTOMOUNT(inode)) {
-                *max_len = FILEID_INVALID;
-                goto out;
-        }
 
         p[FILEID_HIGH_OFF] = NFS_FILEID(inode) >> 32;
         p[FILEID_LOW_OFF] = NFS_FILEID(inode);
@@ -59,7 +55,6 @@ nfs_encode_fh(struct inode *inode, __u32 *p, int *max_len, struct inode *parent)
         p[len - 1] = 0;  /* Padding */
         nfs_copy_fh(clnt_fh, server_fh);
         *max_len = len;
-out:
         dprintk("%s: result fh fileid %llu mode %u size %d\n",
                 __func__, NFS_FILEID(inode), inode->i_mode, *max_len);
         return *max_len;
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 4e54d8b5413a..d175724ff566 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -895,9 +895,7 @@ fl_pnfs_update_layout(struct inode *ino,
 
         lseg = pnfs_update_layout(ino, ctx, pos, count, iomode, strict_iomode,
                         gfp_flags);
-        if (!lseg)
-                lseg = ERR_PTR(-ENOMEM);
-        if (IS_ERR(lseg))
+        if (IS_ERR_OR_NULL(lseg))
                 goto out;
 
         lo = NFS_I(ino)->layout;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 93552c482992..ceeaf0fb6657 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -735,12 +735,20 @@ int nfs_getattr(const struct path *path, struct kstat *stat,
                 u32 request_mask, unsigned int query_flags)
 {
         struct inode *inode = d_inode(path->dentry);
-        int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
+        struct nfs_server *server = NFS_SERVER(inode);
+        unsigned long cache_validity;
         int err = 0;
+        bool force_sync = query_flags & AT_STATX_FORCE_SYNC;
+        bool do_update = false;
 
         trace_nfs_getattr_enter(inode);
+
+        if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync)
+                goto out_no_update;
+
         /* Flush out writes to the server in order to update c/mtime. */
-        if (S_ISREG(inode->i_mode)) {
+        if ((request_mask & (STATX_CTIME|STATX_MTIME)) &&
+                        S_ISREG(inode->i_mode)) {
                 err = filemap_write_and_wait(inode->i_mapping);
                 if (err)
                         goto out;
@@ -757,24 +765,42 @@ int nfs_getattr(const struct path *path, struct kstat *stat,
          */
         if ((path->mnt->mnt_flags & MNT_NOATIME) ||
             ((path->mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
-                need_atime = 0;
+                request_mask &= ~STATX_ATIME;
 
-        if (need_atime || nfs_need_revalidate_inode(inode)) {
-                struct nfs_server *server = NFS_SERVER(inode);
-
+        /* Is the user requesting attributes that might need revalidation? */
+        if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME|
+                        STATX_MTIME|STATX_UID|STATX_GID|
+                        STATX_SIZE|STATX_BLOCKS)))
+                goto out_no_revalidate;
+
+        /* Check whether the cached attributes are stale */
+        do_update |= force_sync || nfs_attribute_cache_expired(inode);
+        cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
+        do_update |= cache_validity &
+                (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL);
+        if (request_mask & STATX_ATIME)
+                do_update |= cache_validity & NFS_INO_INVALID_ATIME;
+        if (request_mask & (STATX_CTIME|STATX_MTIME))
+                do_update |= cache_validity & NFS_INO_REVAL_PAGECACHE;
+        if (do_update) {
+                /* Update the attribute cache */
                 if (!(server->flags & NFS_MOUNT_NOAC))
                         nfs_readdirplus_parent_cache_miss(path->dentry);
                 else
                         nfs_readdirplus_parent_cache_hit(path->dentry);
                 err = __nfs_revalidate_inode(server, inode);
+                if (err)
+                        goto out;
         } else
                 nfs_readdirplus_parent_cache_hit(path->dentry);
-        if (!err) {
-                generic_fillattr(inode, stat);
-                stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
-                if (S_ISDIR(inode->i_mode))
-                        stat->blksize = NFS_SERVER(inode)->dtsize;
-        }
+out_no_revalidate:
+        /* Only return attributes that were revalidated. */
+        stat->result_mask &= request_mask;
+out_no_update:
+        generic_fillattr(inode, stat);
+        stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
+        if (S_ISDIR(inode->i_mode))
+                stat->blksize = NFS_SERVER(inode)->dtsize;
 out:
         trace_nfs_getattr_exit(inode, err);
         return err;
@@ -1144,7 +1170,6 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
 
         if (mapping->nrpages != 0) {
                 if (S_ISREG(inode->i_mode)) {
-                        unmap_mapping_range(mapping, 0, 0, 0);
                         ret = nfs_sync_mapping(mapping);
                         if (ret < 0)
                                 return ret;
diff --git a/fs/nfs/io.c b/fs/nfs/io.c
index 20fef85d2bb1..9034b4926909 100644
--- a/fs/nfs/io.c
+++ b/fs/nfs/io.c
@@ -99,7 +99,7 @@ static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode)
 {
         if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
                 set_bit(NFS_INO_ODIRECT, &nfsi->flags);
-                nfs_wb_all(inode);
+                nfs_sync_mapping(inode->i_mapping);
         }
 }
 
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 65a7e5da508c..04612c24d394 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -861,6 +861,7 @@ static int nfs4_set_client(struct nfs_server *server,
                 set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
         if (test_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status))
                 set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags);
+        server->port = rpc_get_port(addr);
 
         /* Allocate or find a client reference we can use */
         clp = nfs_get_client(&cl_init);
@@ -1123,19 +1124,36 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
         /* Initialise the client representation from the parent server */
         nfs_server_copy_userdata(server, parent_server);
 
-        /* Get a client representation.
-         * Note: NFSv4 always uses TCP, */
+        /* Get a client representation */
+#ifdef CONFIG_SUNRPC_XPRT_RDMA
+        rpc_set_port(data->addr, NFS_RDMA_PORT);
         error = nfs4_set_client(server, data->hostname,
                         data->addr,
                         data->addrlen,
                         parent_client->cl_ipaddr,
-                        rpc_protocol(parent_server->client),
+                        XPRT_TRANSPORT_RDMA,
+                        parent_server->client->cl_timeout,
+                        parent_client->cl_mvops->minor_version,
+                        parent_client->cl_net);
+        if (!error)
+                goto init_server;
+#endif /* CONFIG_SUNRPC_XPRT_RDMA */
+
+        rpc_set_port(data->addr, NFS_PORT);
+        error = nfs4_set_client(server, data->hostname,
+                        data->addr,
+                        data->addrlen,
+                        parent_client->cl_ipaddr,
+                        XPRT_TRANSPORT_TCP,
                         parent_server->client->cl_timeout,
                         parent_client->cl_mvops->minor_version,
                         parent_client->cl_net);
         if (error < 0)
                 goto error;
 
+#ifdef CONFIG_SUNRPC_XPRT_RDMA
+init_server:
+#endif
         error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor);
         if (error < 0)
                 goto error;
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 30426c1a1bbd..22dc30a679a0 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -568,9 +568,13 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
         struct idmap_msg *im;
         struct idmap *idmap = (struct idmap *)aux;
         struct key *key = cons->key;
-        int ret = -ENOMEM;
+        int ret = -ENOKEY;
+
+        if (!aux)
+                goto out1;
 
         /* msg and im are freed in idmap_pipe_destroy_msg */
+        ret = -ENOMEM;
         data = kzalloc(sizeof(*data), GFP_KERNEL);
         if (!data)
                 goto out1;
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 8c3f327d858d..24f06dcc2b08 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -270,8 +270,6 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
                 if (mountdata->addrlen == 0)
                         continue;
 
-                rpc_set_port(mountdata->addr, NFS_PORT);
-
                 memcpy(page2, buf->data, buf->len);
                 page2[buf->len] = '\0';
                 mountdata->hostname = page2;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 17a03f2c4330..47f3c273245e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2020,7 +2020,7 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta
         return ret;
 }
 
-static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, int err)
+static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, struct file_lock *fl, int err)
 {
         switch (err) {
                 default:
@@ -2067,7 +2067,11 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
                         return -EAGAIN;
                 case -ENOMEM:
                 case -NFS4ERR_DENIED:
-                        /* kill_proc(fl->fl_pid, SIGLOST, 1); */
+                        if (fl) {
+                                struct nfs4_lock_state *lsp = fl->fl_u.nfs4_fl.owner;
+                                if (lsp)
+                                        set_bit(NFS_LOCK_LOST, &lsp->ls_flags);
+                        }
                         return 0;
         }
         return err;
@@ -2103,7 +2107,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx,
                 err = nfs4_open_recover_helper(opendata, FMODE_READ);
         }
         nfs4_opendata_put(opendata);
-        return nfs4_handle_delegation_recall_error(server, state, stateid, err);
+        return nfs4_handle_delegation_recall_error(server, state, stateid, NULL, err);
 }
 
 static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)
@@ -3150,6 +3154,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
         struct nfs4_state *state = calldata->state;
         struct nfs_server *server = NFS_SERVER(calldata->inode);
         nfs4_stateid *res_stateid = NULL;
+        struct nfs4_exception exception = {
+                .state = state,
+                .inode = calldata->inode,
+                .stateid = &calldata->arg.stateid,
+        };
 
         dprintk("%s: begin!\n", __func__);
         if (!nfs4_sequence_done(task, &calldata->res.seq_res))
@@ -3215,7 +3224,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
                 case -NFS4ERR_BAD_STATEID:
                         break;
                 default:
-                        if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
+                        task->tk_status = nfs4_async_handle_exception(task,
+                                        server, task->tk_status, &exception);
+                        if (exception.retry)
                                 goto out_restart;
         }
         nfs_clear_open_stateid(state, &calldata->arg.stateid,
@@ -5759,6 +5770,10 @@ struct nfs4_delegreturndata {
 static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 {
         struct nfs4_delegreturndata *data = calldata;
+        struct nfs4_exception exception = {
+                .inode = data->inode,
+                .stateid = &data->stateid,
+        };
 
         if (!nfs4_sequence_done(task, &data->res.seq_res))
                 return;
@@ -5820,10 +5835,11 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
                 }
                 /* Fallthrough */
         default:
-                if (nfs4_async_handle_error(task, data->res.server,
-                                NULL, NULL) == -EAGAIN) {
+                task->tk_status = nfs4_async_handle_exception(task,
+                                data->res.server, task->tk_status,
+                                &exception);
+                if (exception.retry)
                         goto out_restart;
-                }
         }
         data->rpc_status = task->tk_status;
         return;
@@ -6061,6 +6077,10 @@ static void nfs4_locku_release_calldata(void *data)
 static void nfs4_locku_done(struct rpc_task *task, void *data)
 {
         struct nfs4_unlockdata *calldata = data;
+        struct nfs4_exception exception = {
+                .inode = calldata->lsp->ls_state->inode,
+                .stateid = &calldata->arg.stateid,
+        };
 
         if (!nfs4_sequence_done(task, &calldata->res.seq_res))
                 return;
@@ -6084,8 +6104,10 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
                         rpc_restart_call_prepare(task);
                 break;
         default:
-                if (nfs4_async_handle_error(task, calldata->server,
-                                NULL, NULL) == -EAGAIN)
+                task->tk_status = nfs4_async_handle_exception(task,
+                                calldata->server, task->tk_status,
+                                &exception);
+                if (exception.retry)
                         rpc_restart_call_prepare(task);
         }
         nfs_release_seqid(calldata->arg.seqid);
@@ -6741,7 +6763,7 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state,
         if (err != 0)
                 return err;
         err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);
-        return nfs4_handle_delegation_recall_error(server, state, stateid, err);
+        return nfs4_handle_delegation_recall_error(server, state, stateid, fl, err);
 }
 
 struct nfs_release_lockowner_data {
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e4f4a09ed9f4..91a4d4eeb235 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1482,6 +1482,7 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
         struct inode *inode = state->inode;
         struct nfs_inode *nfsi = NFS_I(inode);
         struct file_lock *fl;
+        struct nfs4_lock_state *lsp;
         int status = 0;
         struct file_lock_context *flctx = inode->i_flctx;
         struct list_head *list;
@@ -1522,7 +1523,9 @@ restart:
                 case -NFS4ERR_DENIED:
                 case -NFS4ERR_RECLAIM_BAD:
                 case -NFS4ERR_RECLAIM_CONFLICT:
-                        /* kill_proc(fl->fl_pid, SIGLOST, 1); */
+                        lsp = fl->fl_u.nfs4_fl.owner;
+                        if (lsp)
+                                set_bit(NFS_LOCK_LOST, &lsp->ls_flags);
                         status = 0;
                 }
                 spin_lock(&flctx->flc_lock);
diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c
index 0d91d84e5822..c394e4447100 100644
--- a/fs/nfs/nfs4sysctl.c
+++ b/fs/nfs/nfs4sysctl.c
@@ -32,7 +32,7 @@ static struct ctl_table nfs4_cb_sysctls[] = {
                 .data = &nfs_idmap_cache_timeout,
                 .maxlen = sizeof(int),
                 .mode = 0644,
-                .proc_handler = proc_dointvec_jiffies,
+                .proc_handler = proc_dointvec,
         },
         { }
 };
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 77c6729e57f0..65c9c4175145 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -7678,6 +7678,22 @@ nfs4_stat_to_errno(int stat)
         .p_name = #proc, \
 }
 
+#if defined(CONFIG_NFS_V4_1)
+#define PROC41(proc, argtype, restype) \
+        PROC(proc, argtype, restype)
+#else
+#define PROC41(proc, argtype, restype) \
+        STUB(proc)
+#endif
+
+#if defined(CONFIG_NFS_V4_2)
+#define PROC42(proc, argtype, restype) \
+        PROC(proc, argtype, restype)
+#else
+#define PROC42(proc, argtype, restype) \
+        STUB(proc)
+#endif
+
 const struct rpc_procinfo nfs4_procedures[] = {
         PROC(READ, enc_read, dec_read),
         PROC(WRITE, enc_write, dec_write),
@@ -7698,7 +7714,6 @@ const struct rpc_procinfo nfs4_procedures[] = {
         PROC(ACCESS, enc_access, dec_access),
         PROC(GETATTR, enc_getattr, dec_getattr),
         PROC(LOOKUP, enc_lookup, dec_lookup),
-        PROC(LOOKUPP, enc_lookupp, dec_lookupp),
         PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root),
         PROC(REMOVE, enc_remove, dec_remove),
         PROC(RENAME, enc_rename, dec_rename),
@@ -7717,33 +7732,30 @@ const struct rpc_procinfo nfs4_procedures[] = {
         PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
         PROC(SECINFO, enc_secinfo, dec_secinfo),
         PROC(FSID_PRESENT, enc_fsid_present, dec_fsid_present),
-#if defined(CONFIG_NFS_V4_1)
-        PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id),
-        PROC(CREATE_SESSION, enc_create_session, dec_create_session),
-        PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session),
-        PROC(SEQUENCE, enc_sequence, dec_sequence),
-        PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time),
-        PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete),
-        PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
-        PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
-        PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
-        PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn),
-        PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name),
-        PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid),
-        PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid),
+        PROC41(EXCHANGE_ID, enc_exchange_id, dec_exchange_id),
+        PROC41(CREATE_SESSION, enc_create_session, dec_create_session),
+        PROC41(DESTROY_SESSION, enc_destroy_session, dec_destroy_session),
+        PROC41(SEQUENCE, enc_sequence, dec_sequence),
+        PROC41(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time),
+        PROC41(RECLAIM_COMPLETE,enc_reclaim_complete, dec_reclaim_complete),
+        PROC41(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
+        PROC41(LAYOUTGET, enc_layoutget, dec_layoutget),
+        PROC41(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
+        PROC41(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn),
+        PROC41(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name),
+        PROC41(TEST_STATEID, enc_test_stateid, dec_test_stateid),
+        PROC41(FREE_STATEID, enc_free_stateid, dec_free_stateid),
         STUB(GETDEVICELIST),
-        PROC(BIND_CONN_TO_SESSION,
+        PROC41(BIND_CONN_TO_SESSION,
                         enc_bind_conn_to_session, dec_bind_conn_to_session),
-        PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid),
-#endif /* CONFIG_NFS_V4_1 */
-#ifdef CONFIG_NFS_V4_2
-        PROC(SEEK, enc_seek, dec_seek),
-        PROC(ALLOCATE, enc_allocate, dec_allocate),
-        PROC(DEALLOCATE, enc_deallocate, dec_deallocate),
-        PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats),
-        PROC(CLONE, enc_clone, dec_clone),
-        PROC(COPY, enc_copy, dec_copy),
-#endif /* CONFIG_NFS_V4_2 */
+        PROC41(DESTROY_CLIENTID,enc_destroy_clientid, dec_destroy_clientid),
+        PROC42(SEEK, enc_seek, dec_seek),
+        PROC42(ALLOCATE, enc_allocate, dec_allocate),
+        PROC42(DEALLOCATE, enc_deallocate, dec_deallocate),
+        PROC42(LAYOUTSTATS, enc_layoutstats, dec_layoutstats),
+        PROC42(CLONE, enc_clone, dec_clone),
+        PROC42(COPY, enc_copy, dec_copy),
+        PROC(LOOKUPP, enc_lookupp, dec_lookupp),
 };
 
 static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)];
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 610d89d8942e..bd60f8d1e181 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -797,15 +797,15 @@ TRACE_EVENT(nfs_readpage_done,
         )
 );
 
-/*
- * XXX: I tried using NFS_UNSTABLE and friends in this table, but they
- * all evaluate to 0 for some reason, even if I include linux/nfs.h.
- */
+TRACE_DEFINE_ENUM(NFS_UNSTABLE);
+TRACE_DEFINE_ENUM(NFS_DATA_SYNC);
+TRACE_DEFINE_ENUM(NFS_FILE_SYNC);
+
 #define nfs_show_stable(stable) \
         __print_symbolic(stable, \
-                        { 0, " (UNSTABLE)" }, \
-                        { 1, " (DATA_SYNC)" }, \
-                        { 2, " (FILE_SYNC)" })
+                        { NFS_UNSTABLE, "UNSTABLE" }, \
+                        { NFS_DATA_SYNC, "DATA_SYNC" }, \
+                        { NFS_FILE_SYNC, "FILE_SYNC" })
 
 TRACE_EVENT(nfs_initiate_write,
         TP_PROTO(
@@ -838,12 +838,12 @@ TRACE_EVENT(nfs_initiate_write,
 
         TP_printk(
                 "fileid=%02x:%02x:%llu fhandle=0x%08x "
-                "offset=%lld count=%lu stable=%d%s",
+                "offset=%lld count=%lu stable=%s",
                 MAJOR(__entry->dev), MINOR(__entry->dev),
                 (unsigned long long)__entry->fileid,
                 __entry->fhandle,
                 __entry->offset, __entry->count,
-                __entry->stable, nfs_show_stable(__entry->stable)
+                nfs_show_stable(__entry->stable)
         )
 );
 
@@ -882,13 +882,13 @@ TRACE_EVENT(nfs_writeback_done,
 
         TP_printk(
                 "fileid=%02x:%02x:%llu fhandle=0x%08x "
-                "offset=%lld status=%d stable=%d%s "
+                "offset=%lld status=%d stable=%s "
                 "verifier 0x%016llx",
                 MAJOR(__entry->dev), MINOR(__entry->dev),
                 (unsigned long long)__entry->fileid,
                 __entry->fhandle,
                 __entry->offset, __entry->status,
-                __entry->stable, nfs_show_stable(__entry->stable),
+                nfs_show_stable(__entry->stable),
                 __entry->verifier
         )
 );
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index d0543e19098a..18a7626ac638 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -537,7 +537,7 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
  * @cinfo: Commit information for the call (writes only)
  */
 static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr,
-                              unsigned int count, unsigned int offset,
+                              unsigned int count,
                               int how, struct nfs_commit_info *cinfo)
 {
         struct nfs_page *req = hdr->req;
@@ -546,10 +546,10 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr,
          * NB: take care not to mess about with hdr->commit et al. */
 
         hdr->args.fh = NFS_FH(hdr->inode);
-        hdr->args.offset = req_offset(req) + offset;
+        hdr->args.offset = req_offset(req);
         /* pnfs_set_layoutcommit needs this */
         hdr->mds_offset = hdr->args.offset;
-        hdr->args.pgbase = req->wb_pgbase + offset;
+        hdr->args.pgbase = req->wb_pgbase;
         hdr->args.pages = hdr->page_array.pagevec;
         hdr->args.count = count;
         hdr->args.context = get_nfs_open_context(req->wb_context);
@@ -789,7 +789,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
                 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
 
         /* Set up the argument struct */
-        nfs_pgio_rpcsetup(hdr, mirror->pg_count, 0, desc->pg_ioflags, &cinfo);
+        nfs_pgio_rpcsetup(hdr, mirror->pg_count, desc->pg_ioflags, &cinfo);
         desc->pg_rpc_callops = &nfs_pgio_common_ops;
         return 0;
 }
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index d602fe9e1ac8..c13e826614b5 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -655,7 +655,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                 return 0;
         list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
                 if (pnfs_match_lseg_recall(lseg, recall_range, seq)) {
-                        dprintk("%s: freeing lseg %p iomode %d seq %u"
+                        dprintk("%s: freeing lseg %p iomode %d seq %u "
                                 "offset %llu length %llu\n", __func__,
                                 lseg, lseg->pls_range.iomode, lseg->pls_seq,
                                 lseg->pls_range.offset, lseg->pls_range.length);
@@ -2255,7 +2255,7 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
                 nfs_pageio_reset_write_mds(desc);
                 mirror->pg_recoalesce = 1;
         }
-        hdr->release(hdr);
+        hdr->completion_ops->completion(hdr);
 }
 
 static enum pnfs_try_status
@@ -2378,7 +2378,7 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
                 nfs_pageio_reset_read_mds(desc);
                 mirror->pg_recoalesce = 1;
         }
-        hdr->release(hdr);
+        hdr->completion_ops->completion(hdr);
 }
 
 /*
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 8d507c361d98..daf6cbf5c15f 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -40,6 +40,7 @@ enum {
40 NFS_LSEG_ROC, /* roc bit received from server */ 40 NFS_LSEG_ROC, /* roc bit received from server */
41 NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */ 41 NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */
42 NFS_LSEG_LAYOUTRETURN, /* layoutreturn bit set for layoutreturn */ 42 NFS_LSEG_LAYOUTRETURN, /* layoutreturn bit set for layoutreturn */
43 NFS_LSEG_UNAVAILABLE, /* unavailable bit set for temporary problem */
43}; 44};
44 45
45/* Individual ip address */ 46/* Individual ip address */
@@ -86,6 +87,7 @@ enum pnfs_try_status {
  */
 #define NFS4_DEF_DS_TIMEO   600 /* in tenths of a second */
 #define NFS4_DEF_DS_RETRANS 5
+#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ)
 
 /* error codes for internal use */
 #define NFS4ERR_RESET_TO_MDS   12001
@@ -524,8 +526,10 @@ static inline int pnfs_return_layout(struct inode *ino)
 	struct nfs_inode *nfsi = NFS_I(ino);
 	struct nfs_server *nfss = NFS_SERVER(ino);
 
-	if (pnfs_enabled_sb(nfss) && nfsi->layout)
+	if (pnfs_enabled_sb(nfss) && nfsi->layout) {
+		set_bit(NFS_LAYOUT_RETURN_REQUESTED, &nfsi->layout->plh_flags);
 		return _pnfs_return_layout(ino);
+	}
 
 	return 0;
 }
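
With this hunk, the inline pnfs_return_layout() wrapper records that a layoutreturn was explicitly requested before entering the return path. As a minimal illustrative sketch of the atomic-flag pattern involved (generic Linux bitops, not lines from this series):

	/* Illustrative only: typical atomic flag usage on plh_flags */
	set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
	/* ... later, at a decision point ... */
	if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
		send_layoutreturn(lo);	/* hypothetical helper */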
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 2961fcd7a2df..e8a07b3f9aaa 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -43,7 +43,6 @@
 #define NFS4_DEVICE_ID_HASH_SIZE	(1 << NFS4_DEVICE_ID_HASH_BITS)
 #define NFS4_DEVICE_ID_HASH_MASK	(NFS4_DEVICE_ID_HASH_SIZE - 1)
 
-#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ)
 
 static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE];
 static DEFINE_SPINLOCK(nfs4_deviceid_lock);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 12b2d477836b..7428a669d7a7 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1835,6 +1835,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
 		set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags);
 	next:
 		nfs_unlock_and_release_request(req);
+		/* Latency breaker */
+		cond_resched();
 	}
 	nfss = NFS_SERVER(data->inode);
 	if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
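
The two added lines above are the fix for the CPU latency issue called out in the pull request: nfs_commit_release_pages() can walk a very long request list in process context, so it now yields periodically. A minimal sketch of the general pattern, with hypothetical names:

	/* Hypothetical long-running loop in process context */
	list_for_each_entry_safe(req, tmp, &head, list) {
		process_one_request(req);
		/* Latency breaker: reschedule if a higher-priority task waits */
		cond_resched();
	}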
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index d7d313fb9cd4..4fd95dbeb52f 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -17,6 +17,7 @@
 #include <net/ipv6.h>
 #include <linux/fs.h>
 #include <linux/kref.h>
+#include <linux/refcount.h>
 #include <linux/utsname.h>
 #include <linux/lockd/bind.h>
 #include <linux/lockd/xdr.h>
@@ -58,7 +59,7 @@ struct nlm_host {
 	u32			h_state;	/* pseudo-state counter */
 	u32			h_nsmstate;	/* true remote NSM state */
 	u32			h_pidcount;	/* Pseudopids */
-	atomic_t		h_count;	/* reference count */
+	refcount_t		h_count;	/* reference count */
 	struct mutex		h_mutex;	/* mutex for pmap binding */
 	unsigned long		h_nextrebind;	/* next portmap call */
 	unsigned long		h_expires;	/* eligible for GC */
@@ -83,7 +84,7 @@ struct nlm_host {
 
 struct nsm_handle {
 	struct list_head	sm_link;
-	atomic_t		sm_count;
+	refcount_t		sm_count;
 	char			*sm_mon_name;
 	char			*sm_name;
 	struct sockaddr_storage	sm_addr;
@@ -122,7 +123,7 @@ static inline struct sockaddr *nlm_srcaddr(const struct nlm_host *host)
  */
 struct nlm_lockowner {
 	struct list_head list;
-	atomic_t count;
+	refcount_t count;
 
 	struct nlm_host *host;
 	fl_owner_t owner;
@@ -136,7 +137,7 @@ struct nlm_wait;
  */
 #define NLMCLNT_OHSIZE		((__NEW_UTS_LEN) + 10u)
 struct nlm_rqst {
-	atomic_t		a_count;
+	refcount_t		a_count;
 	unsigned int		a_flags;	/* initial RPC task flags */
 	struct nlm_host *	a_host;		/* host handle */
 	struct nlm_args		a_args;		/* arguments */
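
Converting these four counters from atomic_t to refcount_t buys saturation semantics and use-after-free detection; the matching call-site changes in fs/lockd (clntproc.c, host.c, mon.c in the diffstat) move to the refcount API. A minimal sketch of the corresponding call-site conversion, with a hypothetical release function:

	#include <linux/refcount.h>

	refcount_set(&host->h_count, 1);	/* was atomic_set() */
	refcount_inc(&host->h_count);		/* was atomic_inc() */
	if (refcount_dec_and_test(&host->h_count))	/* was atomic_dec_and_test() */
		nlm_release_host_resources(host);	/* hypothetical helper */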
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 47adac640191..57ffaa20d564 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -457,7 +457,12 @@ enum lock_type4 {
 
 #define NFS4_DEBUG 1
 
-/* Index of predefined Linux client operations */
+/*
+ * Index of predefined Linux client operations
+ *
+ * To ensure that /proc/net/rpc/nfs remains correctly ordered, please
+ * append only to this enum when adding new client operations.
+ */
 
 enum {
 	NFSPROC4_CLNT_NULL = 0,		/* Unused */
@@ -480,7 +485,6 @@ enum {
 	NFSPROC4_CLNT_ACCESS,
 	NFSPROC4_CLNT_GETATTR,
 	NFSPROC4_CLNT_LOOKUP,
-	NFSPROC4_CLNT_LOOKUPP,
 	NFSPROC4_CLNT_LOOKUP_ROOT,
 	NFSPROC4_CLNT_REMOVE,
 	NFSPROC4_CLNT_RENAME,
@@ -500,7 +504,6 @@ enum {
 	NFSPROC4_CLNT_SECINFO,
 	NFSPROC4_CLNT_FSID_PRESENT,
 
-	/* nfs41 */
 	NFSPROC4_CLNT_EXCHANGE_ID,
 	NFSPROC4_CLNT_CREATE_SESSION,
 	NFSPROC4_CLNT_DESTROY_SESSION,
@@ -518,13 +521,14 @@ enum {
 	NFSPROC4_CLNT_BIND_CONN_TO_SESSION,
 	NFSPROC4_CLNT_DESTROY_CLIENTID,
 
-	/* nfs42 */
 	NFSPROC4_CLNT_SEEK,
 	NFSPROC4_CLNT_ALLOCATE,
 	NFSPROC4_CLNT_DEALLOCATE,
 	NFSPROC4_CLNT_LAYOUTSTATS,
 	NFSPROC4_CLNT_CLONE,
 	NFSPROC4_CLNT_COPY,
+
+	NFSPROC4_CLNT_LOOKUPP,
 };
 
 /* nfs41 types */
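
Moving NFSPROC4_CLNT_LOOKUPP to the tail, and dropping the /* nfs41 */ and /* nfs42 */ dividers that invited mid-enum insertions, restores the operation indices that the nfsstat utility depends on: the per-op counters in /proc/net/rpc/nfs are emitted positionally, so inserting LOOKUPP in the middle shifted every later counter by one. A minimal sketch of why position matters, with hypothetical names:

	/* Counters print in enum order; userspace matches them by position */
	for (i = 0; i < ARRAY_SIZE(clnt_op_counts); i++)
		seq_printf(seq, " %u", clnt_op_counts[i]); /* index == NFSPROC4_CLNT_* */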
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 71c237e8240e..ed761f751ecb 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -179,7 +179,6 @@ struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred,
 int		rpc_restart_call_prepare(struct rpc_task *);
 int		rpc_restart_call(struct rpc_task *);
 void		rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int);
-int		rpc_protocol(struct rpc_clnt *);
 struct net *	rpc_net_ns(struct rpc_clnt *);
 size_t		rpc_max_payload(struct rpc_clnt *);
 size_t		rpc_max_bc_payload(struct rpc_clnt *);
diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index 221b7a2e5406..5859563e3c1f 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -64,7 +64,7 @@ enum rpcrdma_memreg {
 	RPCRDMA_MEMWINDOWS,
 	RPCRDMA_MEMWINDOWS_ASYNC,
 	RPCRDMA_MTHCAFMR,
-	RPCRDMA_FRMR,
+	RPCRDMA_FRWR,
 	RPCRDMA_ALLPHYSICAL,
 	RPCRDMA_LAST
 };
diff --git a/include/trace/events/rdma.h b/include/trace/events/rdma.h
new file mode 100644
index 000000000000..aa19afc73a4e
--- /dev/null
+++ b/include/trace/events/rdma.h
@@ -0,0 +1,129 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Copyright (c) 2017 Oracle. All rights reserved.
4 */
5
6/*
7 * enum ib_event_type, from include/rdma/ib_verbs.h
8 */
9
10#define IB_EVENT_LIST \
11 ib_event(CQ_ERR) \
12 ib_event(QP_FATAL) \
13 ib_event(QP_REQ_ERR) \
14 ib_event(QP_ACCESS_ERR) \
15 ib_event(COMM_EST) \
16 ib_event(SQ_DRAINED) \
17 ib_event(PATH_MIG) \
18 ib_event(PATH_MIG_ERR) \
19 ib_event(DEVICE_FATAL) \
20 ib_event(PORT_ACTIVE) \
21 ib_event(PORT_ERR) \
22 ib_event(LID_CHANGE) \
23 ib_event(PKEY_CHANGE) \
24 ib_event(SM_CHANGE) \
25 ib_event(SRQ_ERR) \
26 ib_event(SRQ_LIMIT_REACHED) \
27 ib_event(QP_LAST_WQE_REACHED) \
28 ib_event(CLIENT_REREGISTER) \
29 ib_event(GID_CHANGE) \
30 ib_event_end(WQ_FATAL)
31
32#undef ib_event
33#undef ib_event_end
34
35#define ib_event(x) TRACE_DEFINE_ENUM(IB_EVENT_##x);
36#define ib_event_end(x) TRACE_DEFINE_ENUM(IB_EVENT_##x);
37
38IB_EVENT_LIST
39
40#undef ib_event
41#undef ib_event_end
42
43#define ib_event(x) { IB_EVENT_##x, #x },
44#define ib_event_end(x) { IB_EVENT_##x, #x }
45
46#define rdma_show_ib_event(x) \
47 __print_symbolic(x, IB_EVENT_LIST)
48
49/*
50 * enum ib_wc_status type, from include/rdma/ib_verbs.h
51 */
52#define IB_WC_STATUS_LIST \
53 ib_wc_status(SUCCESS) \
54 ib_wc_status(LOC_LEN_ERR) \
55 ib_wc_status(LOC_QP_OP_ERR) \
56 ib_wc_status(LOC_EEC_OP_ERR) \
57 ib_wc_status(LOC_PROT_ERR) \
58 ib_wc_status(WR_FLUSH_ERR) \
59 ib_wc_status(MW_BIND_ERR) \
60 ib_wc_status(BAD_RESP_ERR) \
61 ib_wc_status(LOC_ACCESS_ERR) \
62 ib_wc_status(REM_INV_REQ_ERR) \
63 ib_wc_status(REM_ACCESS_ERR) \
64 ib_wc_status(REM_OP_ERR) \
65 ib_wc_status(RETRY_EXC_ERR) \
66 ib_wc_status(RNR_RETRY_EXC_ERR) \
67 ib_wc_status(LOC_RDD_VIOL_ERR) \
68 ib_wc_status(REM_INV_RD_REQ_ERR) \
69 ib_wc_status(REM_ABORT_ERR) \
70 ib_wc_status(INV_EECN_ERR) \
71 ib_wc_status(INV_EEC_STATE_ERR) \
72 ib_wc_status(FATAL_ERR) \
73 ib_wc_status(RESP_TIMEOUT_ERR) \
74 ib_wc_status_end(GENERAL_ERR)
75
76#undef ib_wc_status
77#undef ib_wc_status_end
78
79#define ib_wc_status(x) TRACE_DEFINE_ENUM(IB_WC_##x);
80#define ib_wc_status_end(x) TRACE_DEFINE_ENUM(IB_WC_##x);
81
82IB_WC_STATUS_LIST
83
84#undef ib_wc_status
85#undef ib_wc_status_end
86
87#define ib_wc_status(x) { IB_WC_##x, #x },
88#define ib_wc_status_end(x) { IB_WC_##x, #x }
89
90#define rdma_show_wc_status(x) \
91 __print_symbolic(x, IB_WC_STATUS_LIST)
92
93/*
94 * enum rdma_cm_event_type, from include/rdma/rdma_cm.h
95 */
96#define RDMA_CM_EVENT_LIST \
97 rdma_cm_event(ADDR_RESOLVED) \
98 rdma_cm_event(ADDR_ERROR) \
99 rdma_cm_event(ROUTE_RESOLVED) \
100 rdma_cm_event(ROUTE_ERROR) \
101 rdma_cm_event(CONNECT_REQUEST) \
102 rdma_cm_event(CONNECT_RESPONSE) \
103 rdma_cm_event(CONNECT_ERROR) \
104 rdma_cm_event(UNREACHABLE) \
105 rdma_cm_event(REJECTED) \
106 rdma_cm_event(ESTABLISHED) \
107 rdma_cm_event(DISCONNECTED) \
108 rdma_cm_event(DEVICE_REMOVAL) \
109 rdma_cm_event(MULTICAST_JOIN) \
110 rdma_cm_event(MULTICAST_ERROR) \
111 rdma_cm_event(ADDR_CHANGE) \
112 rdma_cm_event_end(TIMEWAIT_EXIT)
113
114#undef rdma_cm_event
115#undef rdma_cm_event_end
116
117#define rdma_cm_event(x) TRACE_DEFINE_ENUM(RDMA_CM_EVENT_##x);
118#define rdma_cm_event_end(x) TRACE_DEFINE_ENUM(RDMA_CM_EVENT_##x);
119
120RDMA_CM_EVENT_LIST
121
122#undef rdma_cm_event
123#undef rdma_cm_event_end
124
125#define rdma_cm_event(x) { RDMA_CM_EVENT_##x, #x },
126#define rdma_cm_event_end(x) { RDMA_CM_EVENT_##x, #x }
127
128#define rdma_show_cm_event(x) \
129 __print_symbolic(x, RDMA_CM_EVENT_LIST)
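
The new header uses a small X-macro pattern throughout: each event list is expanded twice, first with TRACE_DEFINE_ENUM() so the tracing infrastructure can resolve enum values in trace format files, then as a { value, "name" } table consumed by __print_symbolic(). A minimal standalone sketch of the same technique (hypothetical event list, not part of this patch):

	#define MY_EVENT_LIST \
		my_event(FIRST) \
		my_event_end(LAST)

	#undef my_event
	#undef my_event_end
	#define my_event(x)	TRACE_DEFINE_ENUM(MY_EVENT_##x);
	#define my_event_end(x)	TRACE_DEFINE_ENUM(MY_EVENT_##x);
	MY_EVENT_LIST

	#undef my_event
	#undef my_event_end
	#define my_event(x)	{ MY_EVENT_##x, #x },
	#define my_event_end(x)	{ MY_EVENT_##x, #x }

	#define my_show_event(x) __print_symbolic(x, MY_EVENT_LIST)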
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
new file mode 100644
index 000000000000..50ed3f8bf534
--- /dev/null
+++ b/include/trace/events/rpcrdma.h
@@ -0,0 +1,890 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Copyright (c) 2017 Oracle. All rights reserved.
4 */
5#undef TRACE_SYSTEM
6#define TRACE_SYSTEM rpcrdma
7
8#if !defined(_TRACE_RPCRDMA_H) || defined(TRACE_HEADER_MULTI_READ)
9#define _TRACE_RPCRDMA_H
10
11#include <linux/tracepoint.h>
12#include <trace/events/rdma.h>
13
14/**
15 ** Event classes
16 **/
17
18DECLARE_EVENT_CLASS(xprtrdma_reply_event,
19 TP_PROTO(
20 const struct rpcrdma_rep *rep
21 ),
22
23 TP_ARGS(rep),
24
25 TP_STRUCT__entry(
26 __field(const void *, rep)
27 __field(const void *, r_xprt)
28 __field(u32, xid)
29 __field(u32, version)
30 __field(u32, proc)
31 ),
32
33 TP_fast_assign(
34 __entry->rep = rep;
35 __entry->r_xprt = rep->rr_rxprt;
36 __entry->xid = be32_to_cpu(rep->rr_xid);
37 __entry->version = be32_to_cpu(rep->rr_vers);
38 __entry->proc = be32_to_cpu(rep->rr_proc);
39 ),
40
41 TP_printk("rxprt %p xid=0x%08x rep=%p: version %u proc %u",
42 __entry->r_xprt, __entry->xid, __entry->rep,
43 __entry->version, __entry->proc
44 )
45);
46
47#define DEFINE_REPLY_EVENT(name) \
48 DEFINE_EVENT(xprtrdma_reply_event, name, \
49 TP_PROTO( \
50 const struct rpcrdma_rep *rep \
51 ), \
52 TP_ARGS(rep))
53
54DECLARE_EVENT_CLASS(xprtrdma_rxprt,
55 TP_PROTO(
56 const struct rpcrdma_xprt *r_xprt
57 ),
58
59 TP_ARGS(r_xprt),
60
61 TP_STRUCT__entry(
62 __field(const void *, r_xprt)
63 __string(addr, rpcrdma_addrstr(r_xprt))
64 __string(port, rpcrdma_portstr(r_xprt))
65 ),
66
67 TP_fast_assign(
68 __entry->r_xprt = r_xprt;
69 __assign_str(addr, rpcrdma_addrstr(r_xprt));
70 __assign_str(port, rpcrdma_portstr(r_xprt));
71 ),
72
73 TP_printk("peer=[%s]:%s r_xprt=%p",
74 __get_str(addr), __get_str(port), __entry->r_xprt
75 )
76);
77
78#define DEFINE_RXPRT_EVENT(name) \
79 DEFINE_EVENT(xprtrdma_rxprt, name, \
80 TP_PROTO( \
81 const struct rpcrdma_xprt *r_xprt \
82 ), \
83 TP_ARGS(r_xprt))
84
85DECLARE_EVENT_CLASS(xprtrdma_rdch_event,
86 TP_PROTO(
87 const struct rpc_task *task,
88 unsigned int pos,
89 struct rpcrdma_mr *mr,
90 int nsegs
91 ),
92
93 TP_ARGS(task, pos, mr, nsegs),
94
95 TP_STRUCT__entry(
96 __field(unsigned int, task_id)
97 __field(unsigned int, client_id)
98 __field(const void *, mr)
99 __field(unsigned int, pos)
100 __field(int, nents)
101 __field(u32, handle)
102 __field(u32, length)
103 __field(u64, offset)
104 __field(int, nsegs)
105 ),
106
107 TP_fast_assign(
108 __entry->task_id = task->tk_pid;
109 __entry->client_id = task->tk_client->cl_clid;
110 __entry->mr = mr;
111 __entry->pos = pos;
112 __entry->nents = mr->mr_nents;
113 __entry->handle = mr->mr_handle;
114 __entry->length = mr->mr_length;
115 __entry->offset = mr->mr_offset;
116 __entry->nsegs = nsegs;
117 ),
118
119 TP_printk("task:%u@%u mr=%p pos=%u %u@0x%016llx:0x%08x (%s)",
120 __entry->task_id, __entry->client_id, __entry->mr,
121 __entry->pos, __entry->length,
122 (unsigned long long)__entry->offset, __entry->handle,
123 __entry->nents < __entry->nsegs ? "more" : "last"
124 )
125);
126
127#define DEFINE_RDCH_EVENT(name) \
128 DEFINE_EVENT(xprtrdma_rdch_event, name, \
129 TP_PROTO( \
130 const struct rpc_task *task, \
131 unsigned int pos, \
132 struct rpcrdma_mr *mr, \
133 int nsegs \
134 ), \
135 TP_ARGS(task, pos, mr, nsegs))
136
137DECLARE_EVENT_CLASS(xprtrdma_wrch_event,
138 TP_PROTO(
139 const struct rpc_task *task,
140 struct rpcrdma_mr *mr,
141 int nsegs
142 ),
143
144 TP_ARGS(task, mr, nsegs),
145
146 TP_STRUCT__entry(
147 __field(unsigned int, task_id)
148 __field(unsigned int, client_id)
149 __field(const void *, mr)
150 __field(int, nents)
151 __field(u32, handle)
152 __field(u32, length)
153 __field(u64, offset)
154 __field(int, nsegs)
155 ),
156
157 TP_fast_assign(
158 __entry->task_id = task->tk_pid;
159 __entry->client_id = task->tk_client->cl_clid;
160 __entry->mr = mr;
161 __entry->nents = mr->mr_nents;
162 __entry->handle = mr->mr_handle;
163 __entry->length = mr->mr_length;
164 __entry->offset = mr->mr_offset;
165 __entry->nsegs = nsegs;
166 ),
167
168 TP_printk("task:%u@%u mr=%p %u@0x%016llx:0x%08x (%s)",
169 __entry->task_id, __entry->client_id, __entry->mr,
170 __entry->length, (unsigned long long)__entry->offset,
171 __entry->handle,
172 __entry->nents < __entry->nsegs ? "more" : "last"
173 )
174);
175
176#define DEFINE_WRCH_EVENT(name) \
177 DEFINE_EVENT(xprtrdma_wrch_event, name, \
178 TP_PROTO( \
179 const struct rpc_task *task, \
180 struct rpcrdma_mr *mr, \
181 int nsegs \
182 ), \
183 TP_ARGS(task, mr, nsegs))
184
185TRACE_DEFINE_ENUM(FRWR_IS_INVALID);
186TRACE_DEFINE_ENUM(FRWR_IS_VALID);
187TRACE_DEFINE_ENUM(FRWR_FLUSHED_FR);
188TRACE_DEFINE_ENUM(FRWR_FLUSHED_LI);
189
190#define xprtrdma_show_frwr_state(x) \
191 __print_symbolic(x, \
192 { FRWR_IS_INVALID, "INVALID" }, \
193 { FRWR_IS_VALID, "VALID" }, \
194 { FRWR_FLUSHED_FR, "FLUSHED_FR" }, \
195 { FRWR_FLUSHED_LI, "FLUSHED_LI" })
196
197DECLARE_EVENT_CLASS(xprtrdma_frwr_done,
198 TP_PROTO(
199 const struct ib_wc *wc,
200 const struct rpcrdma_frwr *frwr
201 ),
202
203 TP_ARGS(wc, frwr),
204
205 TP_STRUCT__entry(
206 __field(const void *, mr)
207 __field(unsigned int, state)
208 __field(unsigned int, status)
209 __field(unsigned int, vendor_err)
210 ),
211
212 TP_fast_assign(
213 __entry->mr = container_of(frwr, struct rpcrdma_mr, frwr);
214 __entry->state = frwr->fr_state;
215 __entry->status = wc->status;
216 __entry->vendor_err = __entry->status ? wc->vendor_err : 0;
217 ),
218
219 TP_printk(
220 "mr=%p state=%s: %s (%u/0x%x)",
221 __entry->mr, xprtrdma_show_frwr_state(__entry->state),
222 rdma_show_wc_status(__entry->status),
223 __entry->status, __entry->vendor_err
224 )
225);
226
227#define DEFINE_FRWR_DONE_EVENT(name) \
228 DEFINE_EVENT(xprtrdma_frwr_done, name, \
229 TP_PROTO( \
230 const struct ib_wc *wc, \
231 const struct rpcrdma_frwr *frwr \
232 ), \
233 TP_ARGS(wc, frwr))
234
235DECLARE_EVENT_CLASS(xprtrdma_mr,
236 TP_PROTO(
237 const struct rpcrdma_mr *mr
238 ),
239
240 TP_ARGS(mr),
241
242 TP_STRUCT__entry(
243 __field(const void *, mr)
244 __field(u32, handle)
245 __field(u32, length)
246 __field(u64, offset)
247 ),
248
249 TP_fast_assign(
250 __entry->mr = mr;
251 __entry->handle = mr->mr_handle;
252 __entry->length = mr->mr_length;
253 __entry->offset = mr->mr_offset;
254 ),
255
256 TP_printk("mr=%p %u@0x%016llx:0x%08x",
257 __entry->mr, __entry->length,
258 (unsigned long long)__entry->offset,
259 __entry->handle
260 )
261);
262
263#define DEFINE_MR_EVENT(name) \
264 DEFINE_EVENT(xprtrdma_mr, name, \
265 TP_PROTO( \
266 const struct rpcrdma_mr *mr \
267 ), \
268 TP_ARGS(mr))
269
270DECLARE_EVENT_CLASS(xprtrdma_cb_event,
271 TP_PROTO(
272 const struct rpc_rqst *rqst
273 ),
274
275 TP_ARGS(rqst),
276
277 TP_STRUCT__entry(
278 __field(const void *, rqst)
279 __field(const void *, rep)
280 __field(const void *, req)
281 __field(u32, xid)
282 ),
283
284 TP_fast_assign(
285 __entry->rqst = rqst;
286 __entry->req = rpcr_to_rdmar(rqst);
287 __entry->rep = rpcr_to_rdmar(rqst)->rl_reply;
288 __entry->xid = be32_to_cpu(rqst->rq_xid);
289 ),
290
291 TP_printk("xid=0x%08x, rqst=%p req=%p rep=%p",
292 __entry->xid, __entry->rqst, __entry->req, __entry->rep
293 )
294);
295
296#define DEFINE_CB_EVENT(name) \
297 DEFINE_EVENT(xprtrdma_cb_event, name, \
298 TP_PROTO( \
299 const struct rpc_rqst *rqst \
300 ), \
301 TP_ARGS(rqst))
302
303/**
304 ** Connection events
305 **/
306
307TRACE_EVENT(xprtrdma_conn_upcall,
308 TP_PROTO(
309 const struct rpcrdma_xprt *r_xprt,
310 struct rdma_cm_event *event
311 ),
312
313 TP_ARGS(r_xprt, event),
314
315 TP_STRUCT__entry(
316 __field(const void *, r_xprt)
317 __field(unsigned int, event)
318 __field(int, status)
319 __string(addr, rpcrdma_addrstr(r_xprt))
320 __string(port, rpcrdma_portstr(r_xprt))
321 ),
322
323 TP_fast_assign(
324 __entry->r_xprt = r_xprt;
325 __entry->event = event->event;
326 __entry->status = event->status;
327 __assign_str(addr, rpcrdma_addrstr(r_xprt));
328 __assign_str(port, rpcrdma_portstr(r_xprt));
329 ),
330
331 TP_printk("peer=[%s]:%s r_xprt=%p: %s (%u/%d)",
332 __get_str(addr), __get_str(port),
333 __entry->r_xprt, rdma_show_cm_event(__entry->event),
334 __entry->event, __entry->status
335 )
336);
337
338TRACE_EVENT(xprtrdma_disconnect,
339 TP_PROTO(
340 const struct rpcrdma_xprt *r_xprt,
341 int status
342 ),
343
344 TP_ARGS(r_xprt, status),
345
346 TP_STRUCT__entry(
347 __field(const void *, r_xprt)
348 __field(int, status)
349 __field(int, connected)
350 __string(addr, rpcrdma_addrstr(r_xprt))
351 __string(port, rpcrdma_portstr(r_xprt))
352 ),
353
354 TP_fast_assign(
355 __entry->r_xprt = r_xprt;
356 __entry->status = status;
357 __entry->connected = r_xprt->rx_ep.rep_connected;
358 __assign_str(addr, rpcrdma_addrstr(r_xprt));
359 __assign_str(port, rpcrdma_portstr(r_xprt));
360 ),
361
362 TP_printk("peer=[%s]:%s r_xprt=%p: status=%d %sconnected",
363 __get_str(addr), __get_str(port),
364 __entry->r_xprt, __entry->status,
365 __entry->connected == 1 ? "still " : "dis"
366 )
367);
368
369DEFINE_RXPRT_EVENT(xprtrdma_conn_start);
370DEFINE_RXPRT_EVENT(xprtrdma_conn_tout);
371DEFINE_RXPRT_EVENT(xprtrdma_create);
372DEFINE_RXPRT_EVENT(xprtrdma_destroy);
373DEFINE_RXPRT_EVENT(xprtrdma_remove);
374DEFINE_RXPRT_EVENT(xprtrdma_reinsert);
375DEFINE_RXPRT_EVENT(xprtrdma_reconnect);
376DEFINE_RXPRT_EVENT(xprtrdma_inject_dsc);
377
378TRACE_EVENT(xprtrdma_qp_error,
379 TP_PROTO(
380 const struct rpcrdma_xprt *r_xprt,
381 const struct ib_event *event
382 ),
383
384 TP_ARGS(r_xprt, event),
385
386 TP_STRUCT__entry(
387 __field(const void *, r_xprt)
388 __field(unsigned int, event)
389 __string(name, event->device->name)
390 __string(addr, rpcrdma_addrstr(r_xprt))
391 __string(port, rpcrdma_portstr(r_xprt))
392 ),
393
394 TP_fast_assign(
395 __entry->r_xprt = r_xprt;
396 __entry->event = event->event;
397 __assign_str(name, event->device->name);
398 __assign_str(addr, rpcrdma_addrstr(r_xprt));
399 __assign_str(port, rpcrdma_portstr(r_xprt));
400 ),
401
402 TP_printk("peer=[%s]:%s r_xprt=%p: dev %s: %s (%u)",
403 __get_str(addr), __get_str(port), __entry->r_xprt,
404 __get_str(name), rdma_show_ib_event(__entry->event),
405 __entry->event
406 )
407);
408
409/**
410 ** Call events
411 **/
412
413TRACE_EVENT(xprtrdma_createmrs,
414 TP_PROTO(
415 const struct rpcrdma_xprt *r_xprt,
416 unsigned int count
417 ),
418
419 TP_ARGS(r_xprt, count),
420
421 TP_STRUCT__entry(
422 __field(const void *, r_xprt)
423 __field(unsigned int, count)
424 ),
425
426 TP_fast_assign(
427 __entry->r_xprt = r_xprt;
428 __entry->count = count;
429 ),
430
431 TP_printk("r_xprt=%p: created %u MRs",
432 __entry->r_xprt, __entry->count
433 )
434);
435
436DEFINE_RXPRT_EVENT(xprtrdma_nomrs);
437
438DEFINE_RDCH_EVENT(xprtrdma_read_chunk);
439DEFINE_WRCH_EVENT(xprtrdma_write_chunk);
440DEFINE_WRCH_EVENT(xprtrdma_reply_chunk);
441
442TRACE_DEFINE_ENUM(rpcrdma_noch);
443TRACE_DEFINE_ENUM(rpcrdma_readch);
444TRACE_DEFINE_ENUM(rpcrdma_areadch);
445TRACE_DEFINE_ENUM(rpcrdma_writech);
446TRACE_DEFINE_ENUM(rpcrdma_replych);
447
448#define xprtrdma_show_chunktype(x) \
449 __print_symbolic(x, \
450 { rpcrdma_noch, "inline" }, \
451 { rpcrdma_readch, "read list" }, \
452 { rpcrdma_areadch, "*read list" }, \
453 { rpcrdma_writech, "write list" }, \
454 { rpcrdma_replych, "reply chunk" })
455
456TRACE_EVENT(xprtrdma_marshal,
457 TP_PROTO(
458 const struct rpc_rqst *rqst,
459 unsigned int hdrlen,
460 unsigned int rtype,
461 unsigned int wtype
462 ),
463
464 TP_ARGS(rqst, hdrlen, rtype, wtype),
465
466 TP_STRUCT__entry(
467 __field(unsigned int, task_id)
468 __field(unsigned int, client_id)
469 __field(u32, xid)
470 __field(unsigned int, hdrlen)
471 __field(unsigned int, headlen)
472 __field(unsigned int, pagelen)
473 __field(unsigned int, taillen)
474 __field(unsigned int, rtype)
475 __field(unsigned int, wtype)
476 ),
477
478 TP_fast_assign(
479 __entry->task_id = rqst->rq_task->tk_pid;
480 __entry->client_id = rqst->rq_task->tk_client->cl_clid;
481 __entry->xid = be32_to_cpu(rqst->rq_xid);
482 __entry->hdrlen = hdrlen;
483 __entry->headlen = rqst->rq_snd_buf.head[0].iov_len;
484 __entry->pagelen = rqst->rq_snd_buf.page_len;
485 __entry->taillen = rqst->rq_snd_buf.tail[0].iov_len;
486 __entry->rtype = rtype;
487 __entry->wtype = wtype;
488 ),
489
490 TP_printk("task:%u@%u xid=0x%08x: hdr=%u xdr=%u/%u/%u %s/%s",
491 __entry->task_id, __entry->client_id, __entry->xid,
492 __entry->hdrlen,
493 __entry->headlen, __entry->pagelen, __entry->taillen,
494 xprtrdma_show_chunktype(__entry->rtype),
495 xprtrdma_show_chunktype(__entry->wtype)
496 )
497);
498
499TRACE_EVENT(xprtrdma_post_send,
500 TP_PROTO(
501 const struct rpcrdma_req *req,
502 int status
503 ),
504
505 TP_ARGS(req, status),
506
507 TP_STRUCT__entry(
508 __field(const void *, req)
509 __field(int, num_sge)
510 __field(bool, signaled)
511 __field(int, status)
512 ),
513
514 TP_fast_assign(
515 __entry->req = req;
516 __entry->num_sge = req->rl_sendctx->sc_wr.num_sge;
517 __entry->signaled = req->rl_sendctx->sc_wr.send_flags &
518 IB_SEND_SIGNALED;
519 __entry->status = status;
520 ),
521
522 TP_printk("req=%p, %d SGEs%s, status=%d",
523 __entry->req, __entry->num_sge,
524 (__entry->signaled ? ", signaled" : ""),
525 __entry->status
526 )
527);
528
529TRACE_EVENT(xprtrdma_post_recv,
530 TP_PROTO(
531 const struct rpcrdma_rep *rep,
532 int status
533 ),
534
535 TP_ARGS(rep, status),
536
537 TP_STRUCT__entry(
538 __field(const void *, rep)
539 __field(int, status)
540 ),
541
542 TP_fast_assign(
543 __entry->rep = rep;
544 __entry->status = status;
545 ),
546
547 TP_printk("rep=%p status=%d",
548 __entry->rep, __entry->status
549 )
550);
551
552/**
553 ** Completion events
554 **/
555
556TRACE_EVENT(xprtrdma_wc_send,
557 TP_PROTO(
558 const struct rpcrdma_sendctx *sc,
559 const struct ib_wc *wc
560 ),
561
562 TP_ARGS(sc, wc),
563
564 TP_STRUCT__entry(
565 __field(const void *, req)
566 __field(unsigned int, unmap_count)
567 __field(unsigned int, status)
568 __field(unsigned int, vendor_err)
569 ),
570
571 TP_fast_assign(
572 __entry->req = sc->sc_req;
573 __entry->unmap_count = sc->sc_unmap_count;
574 __entry->status = wc->status;
575 __entry->vendor_err = __entry->status ? wc->vendor_err : 0;
576 ),
577
578 TP_printk("req=%p, unmapped %u pages: %s (%u/0x%x)",
579 __entry->req, __entry->unmap_count,
580 rdma_show_wc_status(__entry->status),
581 __entry->status, __entry->vendor_err
582 )
583);
584
585TRACE_EVENT(xprtrdma_wc_receive,
586 TP_PROTO(
587 const struct rpcrdma_rep *rep,
588 const struct ib_wc *wc
589 ),
590
591 TP_ARGS(rep, wc),
592
593 TP_STRUCT__entry(
594 __field(const void *, rep)
595 __field(unsigned int, byte_len)
596 __field(unsigned int, status)
597 __field(unsigned int, vendor_err)
598 ),
599
600 TP_fast_assign(
601 __entry->rep = rep;
602 __entry->byte_len = wc->byte_len;
603 __entry->status = wc->status;
604 __entry->vendor_err = __entry->status ? wc->vendor_err : 0;
605 ),
606
607 TP_printk("rep=%p, %u bytes: %s (%u/0x%x)",
608 __entry->rep, __entry->byte_len,
609 rdma_show_wc_status(__entry->status),
610 __entry->status, __entry->vendor_err
611 )
612);
613
614DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_fastreg);
615DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li);
616DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li_wake);
617
618DEFINE_MR_EVENT(xprtrdma_localinv);
619DEFINE_MR_EVENT(xprtrdma_dma_unmap);
620DEFINE_MR_EVENT(xprtrdma_remoteinv);
621DEFINE_MR_EVENT(xprtrdma_recover_mr);
622
623/**
624 ** Reply events
625 **/
626
627TRACE_EVENT(xprtrdma_reply,
628 TP_PROTO(
629 const struct rpc_task *task,
630 const struct rpcrdma_rep *rep,
631 const struct rpcrdma_req *req,
632 unsigned int credits
633 ),
634
635 TP_ARGS(task, rep, req, credits),
636
637 TP_STRUCT__entry(
638 __field(unsigned int, task_id)
639 __field(unsigned int, client_id)
640 __field(const void *, rep)
641 __field(const void *, req)
642 __field(u32, xid)
643 __field(unsigned int, credits)
644 ),
645
646 TP_fast_assign(
647 __entry->task_id = task->tk_pid;
648 __entry->client_id = task->tk_client->cl_clid;
649 __entry->rep = rep;
650 __entry->req = req;
651 __entry->xid = be32_to_cpu(rep->rr_xid);
652 __entry->credits = credits;
653 ),
654
655 TP_printk("task:%u@%u xid=0x%08x, %u credits, rep=%p -> req=%p",
656 __entry->task_id, __entry->client_id, __entry->xid,
657 __entry->credits, __entry->rep, __entry->req
658 )
659);
660
661TRACE_EVENT(xprtrdma_defer_cmp,
662 TP_PROTO(
663 const struct rpcrdma_rep *rep
664 ),
665
666 TP_ARGS(rep),
667
668 TP_STRUCT__entry(
669 __field(unsigned int, task_id)
670 __field(unsigned int, client_id)
671 __field(const void *, rep)
672 __field(u32, xid)
673 ),
674
675 TP_fast_assign(
676 __entry->task_id = rep->rr_rqst->rq_task->tk_pid;
677 __entry->client_id = rep->rr_rqst->rq_task->tk_client->cl_clid;
678 __entry->rep = rep;
679 __entry->xid = be32_to_cpu(rep->rr_xid);
680 ),
681
682 TP_printk("task:%u@%u xid=0x%08x rep=%p",
683 __entry->task_id, __entry->client_id, __entry->xid,
684 __entry->rep
685 )
686);
687
688DEFINE_REPLY_EVENT(xprtrdma_reply_vers);
689DEFINE_REPLY_EVENT(xprtrdma_reply_rqst);
690DEFINE_REPLY_EVENT(xprtrdma_reply_short);
691DEFINE_REPLY_EVENT(xprtrdma_reply_hdr);
692
693TRACE_EVENT(xprtrdma_fixup,
694 TP_PROTO(
695 const struct rpc_rqst *rqst,
696 int len,
697 int hdrlen
698 ),
699
700 TP_ARGS(rqst, len, hdrlen),
701
702 TP_STRUCT__entry(
703 __field(unsigned int, task_id)
704 __field(unsigned int, client_id)
705 __field(const void *, base)
706 __field(int, len)
707 __field(int, hdrlen)
708 ),
709
710 TP_fast_assign(
711 __entry->task_id = rqst->rq_task->tk_pid;
712 __entry->client_id = rqst->rq_task->tk_client->cl_clid;
713 __entry->base = rqst->rq_rcv_buf.head[0].iov_base;
714 __entry->len = len;
715 __entry->hdrlen = hdrlen;
716 ),
717
718 TP_printk("task:%u@%u base=%p len=%d hdrlen=%d",
719 __entry->task_id, __entry->client_id,
720 __entry->base, __entry->len, __entry->hdrlen
721 )
722);
723
724TRACE_EVENT(xprtrdma_fixup_pg,
725 TP_PROTO(
726 const struct rpc_rqst *rqst,
727 int pageno,
728 const void *pos,
729 int len,
730 int curlen
731 ),
732
733 TP_ARGS(rqst, pageno, pos, len, curlen),
734
735 TP_STRUCT__entry(
736 __field(unsigned int, task_id)
737 __field(unsigned int, client_id)
738 __field(const void *, pos)
739 __field(int, pageno)
740 __field(int, len)
741 __field(int, curlen)
742 ),
743
744 TP_fast_assign(
745 __entry->task_id = rqst->rq_task->tk_pid;
746 __entry->client_id = rqst->rq_task->tk_client->cl_clid;
747 __entry->pos = pos;
748 __entry->pageno = pageno;
749 __entry->len = len;
750 __entry->curlen = curlen;
751 ),
752
753 TP_printk("task:%u@%u pageno=%d pos=%p len=%d curlen=%d",
754 __entry->task_id, __entry->client_id,
755 __entry->pageno, __entry->pos, __entry->len, __entry->curlen
756 )
757);
758
759TRACE_EVENT(xprtrdma_decode_seg,
760 TP_PROTO(
761 u32 handle,
762 u32 length,
763 u64 offset
764 ),
765
766 TP_ARGS(handle, length, offset),
767
768 TP_STRUCT__entry(
769 __field(u32, handle)
770 __field(u32, length)
771 __field(u64, offset)
772 ),
773
774 TP_fast_assign(
775 __entry->handle = handle;
776 __entry->length = length;
777 __entry->offset = offset;
778 ),
779
780 TP_printk("%u@0x%016llx:0x%08x",
781 __entry->length, (unsigned long long)__entry->offset,
782 __entry->handle
783 )
784);
785
786/**
787 ** Allocation/release of rpcrdma_reqs and rpcrdma_reps
788 **/
789
790TRACE_EVENT(xprtrdma_allocate,
791 TP_PROTO(
792 const struct rpc_task *task,
793 const struct rpcrdma_req *req
794 ),
795
796 TP_ARGS(task, req),
797
798 TP_STRUCT__entry(
799 __field(unsigned int, task_id)
800 __field(unsigned int, client_id)
801 __field(const void *, req)
802 __field(const void *, rep)
803 __field(size_t, callsize)
804 __field(size_t, rcvsize)
805 ),
806
807 TP_fast_assign(
808 __entry->task_id = task->tk_pid;
809 __entry->client_id = task->tk_client->cl_clid;
810 __entry->req = req;
811 __entry->rep = req ? req->rl_reply : NULL;
812 __entry->callsize = task->tk_rqstp->rq_callsize;
813 __entry->rcvsize = task->tk_rqstp->rq_rcvsize;
814 ),
815
816 TP_printk("task:%u@%u req=%p rep=%p (%zu, %zu)",
817 __entry->task_id, __entry->client_id,
818 __entry->req, __entry->rep,
819 __entry->callsize, __entry->rcvsize
820 )
821);
822
823TRACE_EVENT(xprtrdma_rpc_done,
824 TP_PROTO(
825 const struct rpc_task *task,
826 const struct rpcrdma_req *req
827 ),
828
829 TP_ARGS(task, req),
830
831 TP_STRUCT__entry(
832 __field(unsigned int, task_id)
833 __field(unsigned int, client_id)
834 __field(const void *, req)
835 __field(const void *, rep)
836 ),
837
838 TP_fast_assign(
839 __entry->task_id = task->tk_pid;
840 __entry->client_id = task->tk_client->cl_clid;
841 __entry->req = req;
842 __entry->rep = req->rl_reply;
843 ),
844
845 TP_printk("task:%u@%u req=%p rep=%p",
846 __entry->task_id, __entry->client_id,
847 __entry->req, __entry->rep
848 )
849);
850
851DEFINE_RXPRT_EVENT(xprtrdma_noreps);
852
853/**
854 ** Callback events
855 **/
856
857TRACE_EVENT(xprtrdma_cb_setup,
858 TP_PROTO(
859 const struct rpcrdma_xprt *r_xprt,
860 unsigned int reqs
861 ),
862
863 TP_ARGS(r_xprt, reqs),
864
865 TP_STRUCT__entry(
866 __field(const void *, r_xprt)
867 __field(unsigned int, reqs)
868 __string(addr, rpcrdma_addrstr(r_xprt))
869 __string(port, rpcrdma_portstr(r_xprt))
870 ),
871
872 TP_fast_assign(
873 __entry->r_xprt = r_xprt;
874 __entry->reqs = reqs;
875 __assign_str(addr, rpcrdma_addrstr(r_xprt));
876 __assign_str(port, rpcrdma_portstr(r_xprt));
877 ),
878
879 TP_printk("peer=[%s]:%s r_xprt=%p: %u reqs",
880 __get_str(addr), __get_str(port),
881 __entry->r_xprt, __entry->reqs
882 )
883);
884
885DEFINE_CB_EVENT(xprtrdma_cb_call);
886DEFINE_CB_EVENT(xprtrdma_cb_reply);
887
888#endif /* _TRACE_RPCRDMA_H */
889
890#include <trace/define_trace.h>
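
A DECLARE_EVENT_CLASS() defines the record layout, assignment, and format string once; each DEFINE_EVENT() (via the DEFINE_*_EVENT wrappers above) then stamps out a named probe sharing that class, which keeps this 890-line header far smaller than one TRACE_EVENT() per probe would be. A minimal sketch of how such a probe is fired from C code, assuming the usual tracepoint conventions rather than quoting a line from this series:

	/* In one .c file, define the probes, then fire them by name */
	#define CREATE_TRACE_POINTS
	#include <trace/events/rpcrdma.h>

	trace_xprtrdma_conn_start(r_xprt);	/* fires the DEFINE_RXPRT_EVENT() probe */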
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 8c153f68509e..970c91a83173 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -32,7 +32,7 @@ DECLARE_EVENT_CLASS(rpc_task_status,
 		__entry->status = task->tk_status;
 	),
 
-	TP_printk("task:%u@%u, status %d",
+	TP_printk("task:%u@%u status=%d",
 		__entry->task_id, __entry->client_id,
 		__entry->status)
 );
@@ -66,7 +66,7 @@ TRACE_EVENT(rpc_connect_status,
 		__entry->status = status;
 	),
 
-	TP_printk("task:%u@%u, status %d",
+	TP_printk("task:%u@%u status=%d",
 		__entry->task_id, __entry->client_id,
 		__entry->status)
 );
@@ -175,7 +175,7 @@ DECLARE_EVENT_CLASS(rpc_task_queued,
 	),
 
 	TP_fast_assign(
-		__entry->client_id = clnt->cl_clid;
+		__entry->client_id = clnt ? clnt->cl_clid : -1;
 		__entry->task_id = task->tk_pid;
 		__entry->timeout = task->tk_timeout;
 		__entry->runstate = task->tk_runstate;
@@ -184,7 +184,7 @@ DECLARE_EVENT_CLASS(rpc_task_queued,
 		__assign_str(q_name, rpc_qname(q));
 	),
 
-	TP_printk("task:%u@%u flags=%4.4x state=%4.4lx status=%d timeout=%lu queue=%s",
+	TP_printk("task:%u@%d flags=%4.4x state=%4.4lx status=%d timeout=%lu queue=%s",
 		__entry->task_id, __entry->client_id,
 		__entry->flags,
 		__entry->runstate,
@@ -390,6 +390,10 @@ DECLARE_EVENT_CLASS(rpc_xprt_event,
 		__entry->status)
 );
 
+DEFINE_EVENT(rpc_xprt_event, xprt_timer,
+	TP_PROTO(struct rpc_xprt *xprt, __be32 xid, int status),
+	TP_ARGS(xprt, xid, status));
+
 DEFINE_EVENT(rpc_xprt_event, xprt_lookup_rqst,
 	TP_PROTO(struct rpc_xprt *xprt, __be32 xid, int status),
 	TP_ARGS(xprt, xid, status));
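
The clnt ? clnt->cl_clid : -1 guard above fixes the NULL rpc_clnt dereference noted in the pull request: a task can be queued before it is bound to a client, so the tracepoint must tolerate clnt == NULL (the format string now prints the id as a signed value). A minimal sketch of the failure mode, a hypothetical call sequence rather than a line from this series:

	struct rpc_task *task = rpc_new_task(&setup);	/* tk_client may still be NULL */
	rpc_sleep_on(&xprt->binding, task, NULL);	/* fires rpc_task_queued with clnt == NULL */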
diff --git a/include/uapi/linux/nfs.h b/include/uapi/linux/nfs.h
index 057d22a48416..946cb62d64b0 100644
--- a/include/uapi/linux/nfs.h
+++ b/include/uapi/linux/nfs.h
@@ -12,6 +12,7 @@
 
 #define NFS_PROGRAM	100003
 #define NFS_PORT	2049
+#define NFS_RDMA_PORT	20049
 #define NFS_MAXDATA	8192
 #define NFS_MAXPATHLEN	1024
 #define NFS_MAXNAMLEN	255
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index e2a4184f3c5d..6e432ecd7f99 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1376,22 +1376,6 @@ rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize
 EXPORT_SYMBOL_GPL(rpc_setbufsize);
 
 /**
- * rpc_protocol - Get transport protocol number for an RPC client
- * @clnt: RPC client to query
- *
- */
-int rpc_protocol(struct rpc_clnt *clnt)
-{
-	int protocol;
-
-	rcu_read_lock();
-	protocol = rcu_dereference(clnt->cl_xprt)->prot;
-	rcu_read_unlock();
-	return protocol;
-}
-EXPORT_SYMBOL_GPL(rpc_protocol);
-
-/**
  * rpc_net_ns - Get the network namespace for this RPC client
  * @clnt: RPC client to query
  *
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index b1b49edd7c4d..896691afbb1a 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -755,22 +755,20 @@ static void __rpc_execute(struct rpc_task *task)
 		void (*do_action)(struct rpc_task *);
 
 		/*
-		 * Execute any pending callback first.
+		 * Perform the next FSM step or a pending callback.
+		 *
+		 * tk_action may be NULL if the task has been killed.
+		 * In particular, note that rpc_killall_tasks may
+		 * do this at any time, so beware when dereferencing.
 		 */
-		do_action = task->tk_callback;
-		task->tk_callback = NULL;
-		if (do_action == NULL) {
-			/*
-			 * Perform the next FSM step.
-			 * tk_action may be NULL if the task has been killed.
-			 * In particular, note that rpc_killall_tasks may
-			 * do this at any time, so beware when dereferencing.
-			 */
-			do_action = task->tk_action;
-			if (do_action == NULL)
-				break;
+		do_action = task->tk_action;
+		if (task->tk_callback) {
+			do_action = task->tk_callback;
+			task->tk_callback = NULL;
 		}
-		trace_rpc_task_run_action(task->tk_client, task, task->tk_action);
+		if (!do_action)
+			break;
+		trace_rpc_task_run_action(task->tk_client, task, do_action);
 		do_action(task);
 
 		/*
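
The restructured loop above replaces a nested branch with a single dispatch: prefer tk_callback when one is pending, otherwise fall through to tk_action, and trace whichever function actually runs (the old code traced tk_action even when a callback ran). The dispatch shape, restated in isolation:

	void (*do_action)(struct rpc_task *);

	do_action = task->tk_action;
	if (task->tk_callback) {		/* a pending callback wins */
		do_action = task->tk_callback;
		task->tk_callback = NULL;	/* consume it */
	}
	if (!do_action)
		break;				/* task was killed */
	do_action(task);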
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 33b74fd84051..2436fd1125fc 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -940,8 +940,8 @@ static void xprt_timer(struct rpc_task *task)
 
 	if (task->tk_status != -ETIMEDOUT)
 		return;
-	dprintk("RPC: %5u xprt_timer\n", task->tk_pid);
 
+	trace_xprt_timer(xprt, req->rq_xid, task->tk_status);
 	if (!req->rq_reply_bytes_recvd) {
 		if (xprt->ops->timer)
 			xprt->ops->timer(xprt, task);
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 8b818bb3518a..ed1a4a3065ee 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -43,7 +43,6 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
 	req = rpcrdma_create_req(r_xprt);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
-	__set_bit(RPCRDMA_REQ_F_BACKCHANNEL, &req->rl_flags);
 
 	rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE,
 				  DMA_TO_DEVICE, GFP_KERNEL);
@@ -74,21 +73,13 @@ out_fail:
 static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt,
 				 unsigned int count)
 {
-	struct rpcrdma_rep *rep;
 	int rc = 0;
 
 	while (count--) {
-		rep = rpcrdma_create_rep(r_xprt);
-		if (IS_ERR(rep)) {
-			pr_err("RPC: %s: reply buffer alloc failed\n",
-			       __func__);
-			rc = PTR_ERR(rep);
+		rc = rpcrdma_create_rep(r_xprt);
+		if (rc)
 			break;
-		}
-
-		rpcrdma_recv_buffer_put(rep);
 	}
-
 	return rc;
 }
 
@@ -129,6 +120,7 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
 		rqst->rq_xprt = &r_xprt->rx_xprt;
 		INIT_LIST_HEAD(&rqst->rq_list);
 		INIT_LIST_HEAD(&rqst->rq_bc_list);
+		__set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
 
 		if (rpcrdma_bc_setup_rqst(r_xprt, rqst))
 			goto out_free;
@@ -148,7 +140,7 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
 
 	buffer->rb_bc_srv_max_requests = reqs;
 	request_module("svcrdma");
-
+	trace_xprtrdma_cb_setup(r_xprt, reqs);
 	return 0;
 
 out_free:
@@ -196,13 +188,7 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
 	return maxmsg - RPCRDMA_HDRLEN_MIN;
 }
 
-/**
- * rpcrdma_bc_marshal_reply - Send backwards direction reply
- * @rqst: buffer containing RPC reply data
- *
- * Returns zero on success.
- */
-int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
+static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
 {
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
 	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
@@ -226,7 +212,46 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
 	if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN,
 				      &rqst->rq_snd_buf, rpcrdma_noch))
 		return -EIO;
+
+	trace_xprtrdma_cb_reply(rqst);
+	return 0;
+}
+
+/**
+ * xprt_rdma_bc_send_reply - marshal and send a backchannel reply
+ * @rqst: RPC rqst with a backchannel RPC reply in rq_snd_buf
+ *
+ * Caller holds the transport's write lock.
+ *
+ * Returns:
+ *	%0 if the RPC message has been sent
+ *	%-ENOTCONN if the caller should reconnect and call again
+ *	%-EIO if a permanent error occurred and the request was not
+ *		sent. Do not try to send this message again.
+ */
+int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	int rc;
+
+	if (!xprt_connected(rqst->rq_xprt))
+		goto drop_connection;
+
+	rc = rpcrdma_bc_marshal_reply(rqst);
+	if (rc < 0)
+		goto failed_marshal;
+
+	if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
+		goto drop_connection;
 	return 0;
+
+failed_marshal:
+	if (rc != -ENOTCONN)
+		return rc;
+drop_connection:
+	xprt_disconnect_done(rqst->rq_xprt);
+	return -ENOTCONN;
 }
 
232/** 257/**
@@ -262,11 +287,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst)
 	dprintk("RPC:       %s: freeing rqst %p (req %p)\n",
 		__func__, rqst, rpcr_to_rdmar(rqst));
 
-	smp_mb__before_atomic();
-	WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state));
-	clear_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
-	smp_mb__after_atomic();
-
 	spin_lock_bh(&xprt->bc_pa_lock);
 	list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list);
 	spin_unlock_bh(&xprt->bc_pa_lock);
@@ -274,7 +294,7 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst)
 
 /**
  * rpcrdma_bc_receive_call - Handle a backward direction call
- * @xprt: transport receiving the call
+ * @r_xprt: transport receiving the call
  * @rep: receive buffer containing the call
  *
  * Operational assumptions:
@@ -313,7 +333,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
 				struct rpc_rqst, rq_bc_pa_list);
 	list_del(&rqst->rq_bc_pa_list);
 	spin_unlock(&xprt->bc_pa_lock);
-	dprintk("RPC:       %s: using rqst %p\n", __func__, rqst);
 
 	/* Prepare rqst */
 	rqst->rq_reply_bytes_recvd = 0;
@@ -321,7 +340,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
 	rqst->rq_xid = *p;
 
 	rqst->rq_private_buf.len = size;
-	set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
 
 	buf = &rqst->rq_rcv_buf;
 	memset(buf, 0, sizeof(*buf));
@@ -335,12 +353,8 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
 	 * the Upper Layer is done decoding it.
 	 */
 	req = rpcr_to_rdmar(rqst);
-	dprintk("RPC:       %s: attaching rep %p to req %p\n",
-		__func__, rep, req);
 	req->rl_reply = rep;
-
-	/* Defeat the retransmit detection logic in send_request */
-	req->rl_connect_cookie = 0;
+	trace_xprtrdma_cb_call(rqst);
 
 	/* Queue rqst for ULP's callback service */
 	bc_serv = xprt->bc_serv;
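
xprt_rdma_bc_send_reply() gives the backchannel a single entry point with an explicit error contract: 0 on success, -ENOTCONN when the caller should reconnect and retry, -EIO for a permanent failure. A minimal sketch of a caller honoring that contract (hypothetical caller and helpers; the real consumer lives in the server-side RPC code):

	rc = xprt_rdma_bc_send_reply(rqst);
	switch (rc) {
	case 0:
		break;				/* reply is on the wire */
	case -ENOTCONN:
		reconnect_and_requeue(rqst);	/* hypothetical helper */
		break;
	default:
		drop_reply(rqst);		/* -EIO: do not retry */
	}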
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 29fc84c7ff98..d5f95bb39300 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Copyright (c) 2015 Oracle.  All rights reserved.
+ * Copyright (c) 2015, 2017 Oracle.  All rights reserved.
  * Copyright (c) 2003-2007 Network Appliance, Inc.  All rights reserved.
  */
 
@@ -47,7 +47,7 @@ fmr_is_supported(struct rpcrdma_ia *ia)
 }
 
 static int
-fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw)
+fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
 {
 	static struct ib_fmr_attr fmr_attr = {
 		.max_pages	= RPCRDMA_MAX_FMR_SGES,
@@ -55,106 +55,108 @@ fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw)
 		.page_shift	= PAGE_SHIFT
 	};
 
-	mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
+	mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
 				       sizeof(u64), GFP_KERNEL);
-	if (!mw->fmr.fm_physaddrs)
+	if (!mr->fmr.fm_physaddrs)
 		goto out_free;
 
-	mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
-			    sizeof(*mw->mw_sg), GFP_KERNEL);
-	if (!mw->mw_sg)
+	mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
+			    sizeof(*mr->mr_sg), GFP_KERNEL);
+	if (!mr->mr_sg)
 		goto out_free;
 
-	sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES);
+	sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES);
 
-	mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
+	mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
 				     &fmr_attr);
-	if (IS_ERR(mw->fmr.fm_mr))
+	if (IS_ERR(mr->fmr.fm_mr))
 		goto out_fmr_err;
 
 	return 0;
 
 out_fmr_err:
 	dprintk("RPC:       %s: ib_alloc_fmr returned %ld\n", __func__,
-		PTR_ERR(mw->fmr.fm_mr));
+		PTR_ERR(mr->fmr.fm_mr));
 
 out_free:
-	kfree(mw->mw_sg);
-	kfree(mw->fmr.fm_physaddrs);
+	kfree(mr->mr_sg);
+	kfree(mr->fmr.fm_physaddrs);
 	return -ENOMEM;
 }
 
 static int
-__fmr_unmap(struct rpcrdma_mw *mw)
+__fmr_unmap(struct rpcrdma_mr *mr)
 {
 	LIST_HEAD(l);
 	int rc;
 
-	list_add(&mw->fmr.fm_mr->list, &l);
+	list_add(&mr->fmr.fm_mr->list, &l);
 	rc = ib_unmap_fmr(&l);
-	list_del(&mw->fmr.fm_mr->list);
+	list_del(&mr->fmr.fm_mr->list);
 	return rc;
 }
 
 static void
-fmr_op_release_mr(struct rpcrdma_mw *r)
+fmr_op_release_mr(struct rpcrdma_mr *mr)
 {
 	LIST_HEAD(unmap_list);
 	int rc;
 
 	/* Ensure MW is not on any rl_registered list */
-	if (!list_empty(&r->mw_list))
-		list_del(&r->mw_list);
+	if (!list_empty(&mr->mr_list))
+		list_del(&mr->mr_list);
 
-	kfree(r->fmr.fm_physaddrs);
-	kfree(r->mw_sg);
+	kfree(mr->fmr.fm_physaddrs);
+	kfree(mr->mr_sg);
 
 	/* In case this one was left mapped, try to unmap it
 	 * to prevent dealloc_fmr from failing with EBUSY
 	 */
-	rc = __fmr_unmap(r);
+	rc = __fmr_unmap(mr);
 	if (rc)
 		pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n",
-		       r, rc);
+		       mr, rc);
 
-	rc = ib_dealloc_fmr(r->fmr.fm_mr);
+	rc = ib_dealloc_fmr(mr->fmr.fm_mr);
 	if (rc)
 		pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n",
-		       r, rc);
+		       mr, rc);
 
-	kfree(r);
+	kfree(mr);
 }
 
 /* Reset of a single FMR.
  */
 static void
-fmr_op_recover_mr(struct rpcrdma_mw *mw)
+fmr_op_recover_mr(struct rpcrdma_mr *mr)
 {
-	struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+	struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
 	int rc;
 
 	/* ORDER: invalidate first */
-	rc = __fmr_unmap(mw);
-
-	/* ORDER: then DMA unmap */
-	ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
-			mw->mw_sg, mw->mw_nents, mw->mw_dir);
+	rc = __fmr_unmap(mr);
 	if (rc)
 		goto out_release;
 
-	rpcrdma_put_mw(r_xprt, mw);
+	/* ORDER: then DMA unmap */
+	rpcrdma_mr_unmap_and_put(mr);
+
 	r_xprt->rx_stats.mrs_recovered++;
 	return;
 
 out_release:
-	pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw);
+	pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mr);
 	r_xprt->rx_stats.mrs_orphaned++;
 
-	spin_lock(&r_xprt->rx_buf.rb_mwlock);
-	list_del(&mw->mw_all);
-	spin_unlock(&r_xprt->rx_buf.rb_mwlock);
+	trace_xprtrdma_dma_unmap(mr);
+	ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
+			mr->mr_sg, mr->mr_nents, mr->mr_dir);
+
+	spin_lock(&r_xprt->rx_buf.rb_mrlock);
+	list_del(&mr->mr_all);
+	spin_unlock(&r_xprt->rx_buf.rb_mrlock);
 
-	fmr_op_release_mr(mw);
+	fmr_op_release_mr(mr);
 }
 
 static int
@@ -180,15 +182,15 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
  */
 static struct rpcrdma_mr_seg *
 fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
-	   int nsegs, bool writing, struct rpcrdma_mw **out)
+	   int nsegs, bool writing, struct rpcrdma_mr **out)
 {
 	struct rpcrdma_mr_seg *seg1 = seg;
 	int len, pageoff, i, rc;
-	struct rpcrdma_mw *mw;
+	struct rpcrdma_mr *mr;
 	u64 *dma_pages;
 
-	mw = rpcrdma_get_mw(r_xprt);
-	if (!mw)
+	mr = rpcrdma_mr_get(r_xprt);
+	if (!mr)
 		return ERR_PTR(-ENOBUFS);
 
 	pageoff = offset_in_page(seg1->mr_offset);
@@ -199,12 +201,12 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 		nsegs = RPCRDMA_MAX_FMR_SGES;
 	for (i = 0; i < nsegs;) {
 		if (seg->mr_page)
-			sg_set_page(&mw->mw_sg[i],
+			sg_set_page(&mr->mr_sg[i],
 				    seg->mr_page,
 				    seg->mr_len,
 				    offset_in_page(seg->mr_offset));
 		else
-			sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
+			sg_set_buf(&mr->mr_sg[i], seg->mr_offset,
 				   seg->mr_len);
 		len += seg->mr_len;
 		++seg;
@@ -214,40 +216,38 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
 			break;
 	}
-	mw->mw_dir = rpcrdma_data_dir(writing);
+	mr->mr_dir = rpcrdma_data_dir(writing);
 
-	mw->mw_nents = ib_dma_map_sg(r_xprt->rx_ia.ri_device,
-				     mw->mw_sg, i, mw->mw_dir);
-	if (!mw->mw_nents)
+	mr->mr_nents = ib_dma_map_sg(r_xprt->rx_ia.ri_device,
+				     mr->mr_sg, i, mr->mr_dir);
+	if (!mr->mr_nents)
 		goto out_dmamap_err;
 
-	for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++)
-		dma_pages[i] = sg_dma_address(&mw->mw_sg[i]);
-	rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents,
+	for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++)
+		dma_pages[i] = sg_dma_address(&mr->mr_sg[i]);
+	rc = ib_map_phys_fmr(mr->fmr.fm_mr, dma_pages, mr->mr_nents,
 			     dma_pages[0]);
 	if (rc)
 		goto out_maperr;
 
-	mw->mw_handle = mw->fmr.fm_mr->rkey;
-	mw->mw_length = len;
-	mw->mw_offset = dma_pages[0] + pageoff;
+	mr->mr_handle = mr->fmr.fm_mr->rkey;
+	mr->mr_length = len;
+	mr->mr_offset = dma_pages[0] + pageoff;
 
-	*out = mw;
+	*out = mr;
 	return seg;
 
 out_dmamap_err:
 	pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n",
-	       mw->mw_sg, i);
-	rpcrdma_put_mw(r_xprt, mw);
+	       mr->mr_sg, i);
+	rpcrdma_mr_put(mr);
 	return ERR_PTR(-EIO);
 
 out_maperr:
 	pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
 	       len, (unsigned long long)dma_pages[0],
-	       pageoff, mw->mw_nents, rc);
-	ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
-			mw->mw_sg, mw->mw_nents, mw->mw_dir);
-	rpcrdma_put_mw(r_xprt, mw);
+	       pageoff, mr->mr_nents, rc);
+	rpcrdma_mr_unmap_and_put(mr);
 	return ERR_PTR(-EIO);
 }
 
@@ -256,13 +256,13 @@ out_maperr:
  * Sleeps until it is safe for the host CPU to access the
  * previously mapped memory regions.
  *
- * Caller ensures that @mws is not empty before the call. This
+ * Caller ensures that @mrs is not empty before the call. This
  * function empties the list.
  */
 static void
-fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws)
+fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
 {
-	struct rpcrdma_mw *mw;
+	struct rpcrdma_mr *mr;
 	LIST_HEAD(unmap_list);
 	int rc;
 
@@ -271,10 +271,11 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws)
271 * ib_unmap_fmr() is slow, so use a single call instead 271 * ib_unmap_fmr() is slow, so use a single call instead
272 * of one call per mapped FMR. 272 * of one call per mapped FMR.
273 */ 273 */
274 list_for_each_entry(mw, mws, mw_list) { 274 list_for_each_entry(mr, mrs, mr_list) {
275 dprintk("RPC: %s: unmapping fmr %p\n", 275 dprintk("RPC: %s: unmapping fmr %p\n",
276 __func__, &mw->fmr); 276 __func__, &mr->fmr);
277 list_add_tail(&mw->fmr.fm_mr->list, &unmap_list); 277 trace_xprtrdma_localinv(mr);
278 list_add_tail(&mr->fmr.fm_mr->list, &unmap_list);
278 } 279 }
279 r_xprt->rx_stats.local_inv_needed++; 280 r_xprt->rx_stats.local_inv_needed++;
280 rc = ib_unmap_fmr(&unmap_list); 281 rc = ib_unmap_fmr(&unmap_list);
@@ -284,14 +285,10 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws)
284 /* ORDER: Now DMA unmap all of the req's MRs, and return 285 /* ORDER: Now DMA unmap all of the req's MRs, and return
285 * them to the free MW list. 286 * them to the free MR list.
286 */ 287 */
287 while (!list_empty(mws)) { 288 while (!list_empty(mrs)) {
288 mw = rpcrdma_pop_mw(mws); 289 mr = rpcrdma_mr_pop(mrs);
289 dprintk("RPC: %s: DMA unmapping fmr %p\n", 290 list_del(&mr->fmr.fm_mr->list);
290 __func__, &mw->fmr); 291 rpcrdma_mr_unmap_and_put(mr);
291 list_del(&mw->fmr.fm_mr->list);
292 ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
293 mw->mw_sg, mw->mw_nents, mw->mw_dir);
294 rpcrdma_put_mw(r_xprt, mw);
295 } 292 }
296 293
297 return; 294 return;
@@ -299,10 +296,10 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws)
299out_reset: 296out_reset:
300 pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); 297 pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc);
301 298
302 while (!list_empty(mws)) { 299 while (!list_empty(mrs)) {
303 mw = rpcrdma_pop_mw(mws); 300 mr = rpcrdma_mr_pop(mrs);
304 list_del(&mw->fmr.fm_mr->list); 301 list_del(&mr->fmr.fm_mr->list);
305 fmr_op_recover_mr(mw); 302 fmr_op_recover_mr(mr);
306 } 303 }
307} 304}
308 305
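
Both unmap paths above now retire MRs through one helper, rpcrdma_mr_unmap_and_put(), instead of open-coding ib_dma_unmap_sg() plus a list put at every call site. A minimal userspace sketch of the pop-and-retire loop, with a plain singly linked list standing in for the kernel's list_head and every name hypothetical:

    #include <stdio.h>

    struct mr {
            struct mr *next;        /* intrusive list link */
            int handle;
    };

    /* Detach the first MR from a list; mirrors rpcrdma_mr_pop(). */
    static struct mr *mr_pop(struct mr **list)
    {
            struct mr *m = *list;

            if (m)
                    *list = m->next;
            return m;
    }

    /* One helper owns both teardown steps; stands in for
     * rpcrdma_mr_unmap_and_put(). */
    static void mr_unmap_and_put(struct mr *m, struct mr **pool)
    {
            printf("unmap MR 0x%x\n", m->handle);   /* ib_dma_unmap_sg() */
            m->next = *pool;                        /* rpcrdma_mr_put() */
            *pool = m;
    }

    int main(void)
    {
            struct mr a = { NULL, 0x10 }, b = { &a, 0x20 };
            struct mr *registered = &b, *pool = NULL;

            while (registered)              /* while (!list_empty(mrs)) */
                    mr_unmap_and_put(mr_pop(&registered), &pool);
            return 0;
    }

The shape is the point: list manipulation lives in the loop and teardown in the helper, so a later change such as an extra trace point touches one function rather than every unmap path.
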
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 773e66e10a15..90f688f19783 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -1,11 +1,11 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2/* 2/*
3 * Copyright (c) 2015 Oracle. All rights reserved. 3 * Copyright (c) 2015, 2017 Oracle. All rights reserved.
4 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 4 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
5 */ 5 */
6 6
7/* Lightweight memory registration using Fast Registration Work 7/* Lightweight memory registration using Fast Registration Work
8 * Requests (FRWR). Also referred to sometimes as FRMR mode. 8 * Requests (FRWR).
9 * 9 *
10 * FRWR features ordered asynchronous registration and deregistration 10 * FRWR features ordered asynchronous registration and deregistration
11 * of arbitrarily sized memory regions. This is the fastest and safest 11 * of arbitrarily sized memory regions. This is the fastest and safest
@@ -15,9 +15,9 @@
15/* Normal operation 15/* Normal operation
16 * 16 *
17 * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG 17 * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG
18 * Work Request (frmr_op_map). When the RDMA operation is finished, this 18 * Work Request (frwr_op_map). When the RDMA operation is finished, this
19 * Memory Region is invalidated using a LOCAL_INV Work Request 19 * Memory Region is invalidated using a LOCAL_INV Work Request
20 * (frmr_op_unmap). 20 * (frwr_op_unmap_sync).
21 * 21 *
22 * Typically these Work Requests are not signaled, and neither are RDMA 22 * Typically these Work Requests are not signaled, and neither are RDMA
23 * SEND Work Requests (with the exception of signaling occasionally to 23 * SEND Work Requests (with the exception of signaling occasionally to
@@ -26,7 +26,7 @@
26 * 26 *
27 * As an optimization, frwr_op_unmap marks MRs INVALID before the 27 * As an optimization, frwr_op_unmap marks MRs INVALID before the
28 * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on 28 * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on
29 * rb_mws immediately so that no work (like managing a linked list 29 * rb_mrs immediately so that no work (like managing a linked list
30 * under a spinlock) is needed in the completion upcall. 30 * under a spinlock) is needed in the completion upcall.
31 * 31 *
32 * But this means that frwr_op_map() can occasionally encounter an MR 32 * But this means that frwr_op_map() can occasionally encounter an MR
@@ -60,7 +60,7 @@
60 * When frwr_op_map encounters FLUSHED and VALID MRs, they are recovered 60 * When frwr_op_map encounters FLUSHED and VALID MRs, they are recovered
61 * with ib_dereg_mr and then are re-initialized. Because MR recovery 61 * with ib_dereg_mr and then are re-initialized. Because MR recovery
62 * allocates fresh resources, it is deferred to a workqueue, and the 62 * allocates fresh resources, it is deferred to a workqueue, and the
63 * recovered MRs are placed back on the rb_mws list when recovery is 63 * recovered MRs are placed back on the rb_mrs list when recovery is
64 * complete. frwr_op_map allocates another MR for the current RPC while 64 * complete. frwr_op_map allocates another MR for the current RPC while
65 * the broken MR is reset. 65 * the broken MR is reset.
66 * 66 *
@@ -96,26 +96,26 @@ out_not_supported:
96} 96}
97 97
98static int 98static int
99frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) 99frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
100{ 100{
101 unsigned int depth = ia->ri_max_frmr_depth; 101 unsigned int depth = ia->ri_max_frwr_depth;
102 struct rpcrdma_frmr *f = &r->frmr; 102 struct rpcrdma_frwr *frwr = &mr->frwr;
103 int rc; 103 int rc;
104 104
105 f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); 105 frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
106 if (IS_ERR(f->fr_mr)) 106 if (IS_ERR(frwr->fr_mr))
107 goto out_mr_err; 107 goto out_mr_err;
108 108
109 r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL); 109 mr->mr_sg = kcalloc(depth, sizeof(*mr->mr_sg), GFP_KERNEL);
110 if (!r->mw_sg) 110 if (!mr->mr_sg)
111 goto out_list_err; 111 goto out_list_err;
112 112
113 sg_init_table(r->mw_sg, depth); 113 sg_init_table(mr->mr_sg, depth);
114 init_completion(&f->fr_linv_done); 114 init_completion(&frwr->fr_linv_done);
115 return 0; 115 return 0;
116 116
117out_mr_err: 117out_mr_err:
118 rc = PTR_ERR(f->fr_mr); 118 rc = PTR_ERR(frwr->fr_mr);
119 dprintk("RPC: %s: ib_alloc_mr status %i\n", 119 dprintk("RPC: %s: ib_alloc_mr status %i\n",
120 __func__, rc); 120 __func__, rc);
121 return rc; 121 return rc;
@@ -124,83 +124,85 @@ out_list_err:
124 rc = -ENOMEM; 124 rc = -ENOMEM;
125 dprintk("RPC: %s: sg allocation failure\n", 125 dprintk("RPC: %s: sg allocation failure\n",
126 __func__); 126 __func__);
127 ib_dereg_mr(f->fr_mr); 127 ib_dereg_mr(frwr->fr_mr);
128 return rc; 128 return rc;
129} 129}
130 130
131static void 131static void
132frwr_op_release_mr(struct rpcrdma_mw *r) 132frwr_op_release_mr(struct rpcrdma_mr *mr)
133{ 133{
134 int rc; 134 int rc;
135 135
136 /* Ensure MW is not on any rl_registered list */ 136 /* Ensure MR is not on any rl_registered list */
137 if (!list_empty(&r->mw_list)) 137 if (!list_empty(&mr->mr_list))
138 list_del(&r->mw_list); 138 list_del(&mr->mr_list);
139 139
140 rc = ib_dereg_mr(r->frmr.fr_mr); 140 rc = ib_dereg_mr(mr->frwr.fr_mr);
141 if (rc) 141 if (rc)
142 pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", 142 pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n",
143 r, rc); 143 mr, rc);
144 kfree(r->mw_sg); 144 kfree(mr->mr_sg);
145 kfree(r); 145 kfree(mr);
146} 146}
147 147
148static int 148static int
149__frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) 149__frwr_mr_reset(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
150{ 150{
151 struct rpcrdma_frmr *f = &r->frmr; 151 struct rpcrdma_frwr *frwr = &mr->frwr;
152 int rc; 152 int rc;
153 153
154 rc = ib_dereg_mr(f->fr_mr); 154 rc = ib_dereg_mr(frwr->fr_mr);
155 if (rc) { 155 if (rc) {
156 pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n", 156 pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n",
157 rc, r); 157 rc, mr);
158 return rc; 158 return rc;
159 } 159 }
160 160
161 f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, 161 frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype,
162 ia->ri_max_frmr_depth); 162 ia->ri_max_frwr_depth);
163 if (IS_ERR(f->fr_mr)) { 163 if (IS_ERR(frwr->fr_mr)) {
164 pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n", 164 pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n",
165 PTR_ERR(f->fr_mr), r); 165 PTR_ERR(frwr->fr_mr), mr);
166 return PTR_ERR(f->fr_mr); 166 return PTR_ERR(frwr->fr_mr);
167 } 167 }
168 168
169 dprintk("RPC: %s: recovered FRMR %p\n", __func__, f); 169 dprintk("RPC: %s: recovered FRWR %p\n", __func__, frwr);
170 f->fr_state = FRMR_IS_INVALID; 170 frwr->fr_state = FRWR_IS_INVALID;
171 return 0; 171 return 0;
172} 172}
173 173
174/* Reset of a single FRMR. Generate a fresh rkey by replacing the MR. 174/* Reset of a single FRWR. Generate a fresh rkey by replacing the MR.
175 */ 175 */
176static void 176static void
177frwr_op_recover_mr(struct rpcrdma_mw *mw) 177frwr_op_recover_mr(struct rpcrdma_mr *mr)
178{ 178{
179 enum rpcrdma_frmr_state state = mw->frmr.fr_state; 179 enum rpcrdma_frwr_state state = mr->frwr.fr_state;
180 struct rpcrdma_xprt *r_xprt = mw->mw_xprt; 180 struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
181 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 181 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
182 int rc; 182 int rc;
183 183
184 rc = __frwr_reset_mr(ia, mw); 184 rc = __frwr_mr_reset(ia, mr);
185 if (state != FRMR_FLUSHED_LI) 185 if (state != FRWR_FLUSHED_LI) {
186 trace_xprtrdma_dma_unmap(mr);
186 ib_dma_unmap_sg(ia->ri_device, 187 ib_dma_unmap_sg(ia->ri_device,
187 mw->mw_sg, mw->mw_nents, mw->mw_dir); 188 mr->mr_sg, mr->mr_nents, mr->mr_dir);
189 }
188 if (rc) 190 if (rc)
189 goto out_release; 191 goto out_release;
190 192
191 rpcrdma_put_mw(r_xprt, mw); 193 rpcrdma_mr_put(mr);
192 r_xprt->rx_stats.mrs_recovered++; 194 r_xprt->rx_stats.mrs_recovered++;
193 return; 195 return;
194 196
195out_release: 197out_release:
196 pr_err("rpcrdma: FRMR reset failed %d, %p release\n", rc, mw); 198 pr_err("rpcrdma: FRWR reset failed %d, %p release\n", rc, mr);
197 r_xprt->rx_stats.mrs_orphaned++; 199 r_xprt->rx_stats.mrs_orphaned++;
198 200
199 spin_lock(&r_xprt->rx_buf.rb_mwlock); 201 spin_lock(&r_xprt->rx_buf.rb_mrlock);
200 list_del(&mw->mw_all); 202 list_del(&mr->mr_all);
201 spin_unlock(&r_xprt->rx_buf.rb_mwlock); 203 spin_unlock(&r_xprt->rx_buf.rb_mrlock);
202 204
203 frwr_op_release_mr(mw); 205 frwr_op_release_mr(mr);
204} 206}
205 207
206static int 208static int
@@ -214,31 +216,31 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
214 if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG) 216 if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG)
215 ia->ri_mrtype = IB_MR_TYPE_SG_GAPS; 217 ia->ri_mrtype = IB_MR_TYPE_SG_GAPS;
216 218
217 ia->ri_max_frmr_depth = 219 ia->ri_max_frwr_depth =
218 min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, 220 min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
219 attrs->max_fast_reg_page_list_len); 221 attrs->max_fast_reg_page_list_len);
220 dprintk("RPC: %s: device's max FR page list len = %u\n", 222 dprintk("RPC: %s: device's max FR page list len = %u\n",
221 __func__, ia->ri_max_frmr_depth); 223 __func__, ia->ri_max_frwr_depth);
222 224
223 /* Add room for frmr register and invalidate WRs. 225 /* Add room for frwr register and invalidate WRs.
224 * 1. FRMR reg WR for head 226 * 1. FRWR reg WR for head
225 * 2. FRMR invalidate WR for head 227 * 2. FRWR invalidate WR for head
226 * 3. N FRMR reg WRs for pagelist 228 * 3. N FRWR reg WRs for pagelist
227 * 4. N FRMR invalidate WRs for pagelist 229 * 4. N FRWR invalidate WRs for pagelist
228 * 5. FRMR reg WR for tail 230 * 5. FRWR reg WR for tail
229 * 6. FRMR invalidate WR for tail 231 * 6. FRWR invalidate WR for tail
230 * 7. The RDMA_SEND WR 232 * 7. The RDMA_SEND WR
231 */ 233 */
232 depth = 7; 234 depth = 7;
233 235
234 /* Calculate N if the device max FRMR depth is smaller than 236 /* Calculate N if the device max FRWR depth is smaller than
235 * RPCRDMA_MAX_DATA_SEGS. 237 * RPCRDMA_MAX_DATA_SEGS.
236 */ 238 */
237 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { 239 if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) {
238 delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frmr_depth; 240 delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth;
239 do { 241 do {
240 depth += 2; /* FRMR reg + invalidate */ 242 depth += 2; /* FRWR reg + invalidate */
241 delta -= ia->ri_max_frmr_depth; 243 delta -= ia->ri_max_frwr_depth;
242 } while (delta > 0); 244 } while (delta > 0);
243 } 245 }
244 246
@@ -252,7 +254,7 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
252 } 254 }
253 255
254 ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / 256 ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
255 ia->ri_max_frmr_depth); 257 ia->ri_max_frwr_depth);
256 return 0; 258 return 0;
257} 259}
258 260
@@ -265,7 +267,7 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
265 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 267 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
266 268
267 return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, 269 return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
268 RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frmr_depth); 270 RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frwr_depth);
269} 271}
270 272
271static void 273static void
@@ -286,16 +288,16 @@ __frwr_sendcompletion_flush(struct ib_wc *wc, const char *wr)
286static void 288static void
287frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) 289frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
288{ 290{
289 struct rpcrdma_frmr *frmr; 291 struct ib_cqe *cqe = wc->wr_cqe;
290 struct ib_cqe *cqe; 292 struct rpcrdma_frwr *frwr =
293 container_of(cqe, struct rpcrdma_frwr, fr_cqe);
291 294
292 /* WARNING: Only wr_cqe and status are reliable at this point */ 295 /* WARNING: Only wr_cqe and status are reliable at this point */
293 if (wc->status != IB_WC_SUCCESS) { 296 if (wc->status != IB_WC_SUCCESS) {
294 cqe = wc->wr_cqe; 297 frwr->fr_state = FRWR_FLUSHED_FR;
295 frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
296 frmr->fr_state = FRMR_FLUSHED_FR;
297 __frwr_sendcompletion_flush(wc, "fastreg"); 298 __frwr_sendcompletion_flush(wc, "fastreg");
298 } 299 }
300 trace_xprtrdma_wc_fastreg(wc, frwr);
299} 301}
300 302
301/** 303/**
@@ -307,16 +309,16 @@ frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
307static void 309static void
308frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) 310frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
309{ 311{
310 struct rpcrdma_frmr *frmr; 312 struct ib_cqe *cqe = wc->wr_cqe;
311 struct ib_cqe *cqe; 313 struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr,
314 fr_cqe);
312 315
313 /* WARNING: Only wr_cqe and status are reliable at this point */ 316 /* WARNING: Only wr_cqe and status are reliable at this point */
314 if (wc->status != IB_WC_SUCCESS) { 317 if (wc->status != IB_WC_SUCCESS) {
315 cqe = wc->wr_cqe; 318 frwr->fr_state = FRWR_FLUSHED_LI;
316 frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
317 frmr->fr_state = FRMR_FLUSHED_LI;
318 __frwr_sendcompletion_flush(wc, "localinv"); 319 __frwr_sendcompletion_flush(wc, "localinv");
319 } 320 }
321 trace_xprtrdma_wc_li(wc, frwr);
320} 322}
321 323
322/** 324/**
@@ -329,17 +331,17 @@ frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
329static void 331static void
330frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) 332frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
331{ 333{
332 struct rpcrdma_frmr *frmr; 334 struct ib_cqe *cqe = wc->wr_cqe;
333 struct ib_cqe *cqe; 335 struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr,
336 fr_cqe);
334 337
335 /* WARNING: Only wr_cqe and status are reliable at this point */ 338 /* WARNING: Only wr_cqe and status are reliable at this point */
336 cqe = wc->wr_cqe;
337 frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
338 if (wc->status != IB_WC_SUCCESS) { 339 if (wc->status != IB_WC_SUCCESS) {
339 frmr->fr_state = FRMR_FLUSHED_LI; 340 frwr->fr_state = FRWR_FLUSHED_LI;
340 __frwr_sendcompletion_flush(wc, "localinv"); 341 __frwr_sendcompletion_flush(wc, "localinv");
341 } 342 }
342 complete(&frmr->fr_linv_done); 343 complete(&frwr->fr_linv_done);
344 trace_xprtrdma_wc_li_wake(wc, frwr);
343} 345}
344 346
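
All three completion handlers above now recover the rpcrdma_frwr from the work completion's embedded ib_cqe before checking the status, so the new trace points can fire on success as well as on flush. The recovery step is the standard container_of() idiom; a freestanding C sketch with stand-in types:

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct cqe { int status; };             /* stands in for struct ib_cqe */

    struct frwr {                           /* stands in for rpcrdma_frwr */
            int fr_state;
            struct cqe fr_cqe;              /* embedded completion cookie */
    };

    static void wc_handler(struct cqe *cqe)
    {
            /* Only the cqe pointer is reliable here; subtracting the
             * member offset climbs back to the enclosing frwr. */
            struct frwr *frwr = container_of(cqe, struct frwr, fr_cqe);

            printf("frwr state %d\n", frwr->fr_state);
    }

    int main(void)
    {
            struct frwr f = { .fr_state = 1 };

            wc_handler(&f.fr_cqe);
            return 0;
    }
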
345/* Post a REG_MR Work Request to register a memory region 347/* Post a REG_MR Work Request to register a memory region
@@ -347,41 +349,39 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
347 */ 349 */
348static struct rpcrdma_mr_seg * 350static struct rpcrdma_mr_seg *
349frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, 351frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
350 int nsegs, bool writing, struct rpcrdma_mw **out) 352 int nsegs, bool writing, struct rpcrdma_mr **out)
351{ 353{
352 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 354 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
353 bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS; 355 bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS;
354 struct rpcrdma_mw *mw; 356 struct rpcrdma_frwr *frwr;
355 struct rpcrdma_frmr *frmr; 357 struct rpcrdma_mr *mr;
356 struct ib_mr *mr; 358 struct ib_mr *ibmr;
357 struct ib_reg_wr *reg_wr; 359 struct ib_reg_wr *reg_wr;
358 struct ib_send_wr *bad_wr; 360 struct ib_send_wr *bad_wr;
359 int rc, i, n; 361 int rc, i, n;
360 u8 key; 362 u8 key;
361 363
362 mw = NULL; 364 mr = NULL;
363 do { 365 do {
364 if (mw) 366 if (mr)
365 rpcrdma_defer_mr_recovery(mw); 367 rpcrdma_mr_defer_recovery(mr);
366 mw = rpcrdma_get_mw(r_xprt); 368 mr = rpcrdma_mr_get(r_xprt);
367 if (!mw) 369 if (!mr)
368 return ERR_PTR(-ENOBUFS); 370 return ERR_PTR(-ENOBUFS);
369 } while (mw->frmr.fr_state != FRMR_IS_INVALID); 371 } while (mr->frwr.fr_state != FRWR_IS_INVALID);
370 frmr = &mw->frmr; 372 frwr = &mr->frwr;
371 frmr->fr_state = FRMR_IS_VALID; 373 frwr->fr_state = FRWR_IS_VALID;
372 mr = frmr->fr_mr; 374
373 reg_wr = &frmr->fr_regwr; 375 if (nsegs > ia->ri_max_frwr_depth)
374 376 nsegs = ia->ri_max_frwr_depth;
375 if (nsegs > ia->ri_max_frmr_depth)
376 nsegs = ia->ri_max_frmr_depth;
377 for (i = 0; i < nsegs;) { 377 for (i = 0; i < nsegs;) {
378 if (seg->mr_page) 378 if (seg->mr_page)
379 sg_set_page(&mw->mw_sg[i], 379 sg_set_page(&mr->mr_sg[i],
380 seg->mr_page, 380 seg->mr_page,
381 seg->mr_len, 381 seg->mr_len,
382 offset_in_page(seg->mr_offset)); 382 offset_in_page(seg->mr_offset));
383 else 383 else
384 sg_set_buf(&mw->mw_sg[i], seg->mr_offset, 384 sg_set_buf(&mr->mr_sg[i], seg->mr_offset,
385 seg->mr_len); 385 seg->mr_len);
386 386
387 ++seg; 387 ++seg;
@@ -392,30 +392,29 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
392 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) 392 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
393 break; 393 break;
394 } 394 }
395 mw->mw_dir = rpcrdma_data_dir(writing); 395 mr->mr_dir = rpcrdma_data_dir(writing);
396 396
397 mw->mw_nents = ib_dma_map_sg(ia->ri_device, mw->mw_sg, i, mw->mw_dir); 397 mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir);
398 if (!mw->mw_nents) 398 if (!mr->mr_nents)
399 goto out_dmamap_err; 399 goto out_dmamap_err;
400 400
401 n = ib_map_mr_sg(mr, mw->mw_sg, mw->mw_nents, NULL, PAGE_SIZE); 401 ibmr = frwr->fr_mr;
402 if (unlikely(n != mw->mw_nents)) 402 n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE);
403 if (unlikely(n != mr->mr_nents))
403 goto out_mapmr_err; 404 goto out_mapmr_err;
404 405
405 dprintk("RPC: %s: Using frmr %p to map %u segments (%llu bytes)\n", 406 key = (u8)(ibmr->rkey & 0x000000FF);
406 __func__, frmr, mw->mw_nents, mr->length); 407 ib_update_fast_reg_key(ibmr, ++key);
407
408 key = (u8)(mr->rkey & 0x000000FF);
409 ib_update_fast_reg_key(mr, ++key);
410 408
409 reg_wr = &frwr->fr_regwr;
411 reg_wr->wr.next = NULL; 410 reg_wr->wr.next = NULL;
412 reg_wr->wr.opcode = IB_WR_REG_MR; 411 reg_wr->wr.opcode = IB_WR_REG_MR;
413 frmr->fr_cqe.done = frwr_wc_fastreg; 412 frwr->fr_cqe.done = frwr_wc_fastreg;
414 reg_wr->wr.wr_cqe = &frmr->fr_cqe; 413 reg_wr->wr.wr_cqe = &frwr->fr_cqe;
415 reg_wr->wr.num_sge = 0; 414 reg_wr->wr.num_sge = 0;
416 reg_wr->wr.send_flags = 0; 415 reg_wr->wr.send_flags = 0;
417 reg_wr->mr = mr; 416 reg_wr->mr = ibmr;
418 reg_wr->key = mr->rkey; 417 reg_wr->key = ibmr->rkey;
419 reg_wr->access = writing ? 418 reg_wr->access = writing ?
420 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : 419 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
421 IB_ACCESS_REMOTE_READ; 420 IB_ACCESS_REMOTE_READ;
@@ -424,47 +423,64 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
424 if (rc) 423 if (rc)
425 goto out_senderr; 424 goto out_senderr;
426 425
427 mw->mw_handle = mr->rkey; 426 mr->mr_handle = ibmr->rkey;
428 mw->mw_length = mr->length; 427 mr->mr_length = ibmr->length;
429 mw->mw_offset = mr->iova; 428 mr->mr_offset = ibmr->iova;
430 429
431 *out = mw; 430 *out = mr;
432 return seg; 431 return seg;
433 432
434out_dmamap_err: 433out_dmamap_err:
435 pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", 434 pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n",
436 mw->mw_sg, i); 435 mr->mr_sg, i);
437 frmr->fr_state = FRMR_IS_INVALID; 436 frwr->fr_state = FRWR_IS_INVALID;
438 rpcrdma_put_mw(r_xprt, mw); 437 rpcrdma_mr_put(mr);
439 return ERR_PTR(-EIO); 438 return ERR_PTR(-EIO);
440 439
441out_mapmr_err: 440out_mapmr_err:
442 pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", 441 pr_err("rpcrdma: failed to map mr %p (%d/%d)\n",
443 frmr->fr_mr, n, mw->mw_nents); 442 frwr->fr_mr, n, mr->mr_nents);
444 rpcrdma_defer_mr_recovery(mw); 443 rpcrdma_mr_defer_recovery(mr);
445 return ERR_PTR(-EIO); 444 return ERR_PTR(-EIO);
446 445
447out_senderr: 446out_senderr:
448 pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc); 447 pr_err("rpcrdma: FRWR registration ib_post_send returned %i\n", rc);
449 rpcrdma_defer_mr_recovery(mw); 448 rpcrdma_mr_defer_recovery(mr);
450 return ERR_PTR(-ENOTCONN); 449 return ERR_PTR(-ENOTCONN);
451} 450}
452 451
452/* Handle a remotely invalidated mr on the @mrs list
453 */
454static void
455frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
456{
457 struct rpcrdma_mr *mr;
458
459 list_for_each_entry(mr, mrs, mr_list)
460 if (mr->mr_handle == rep->rr_inv_rkey) {
461 list_del(&mr->mr_list);
462 trace_xprtrdma_remoteinv(mr);
463 mr->frwr.fr_state = FRWR_IS_INVALID;
464 rpcrdma_mr_unmap_and_put(mr);
465 break; /* only one invalidated MR per RPC */
466 }
467}
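
frwr_op_reminv() relies on the guarantee stated in its final comment: the server invalidates at most one rkey per RPC, so the scan stops at the first handle matching rep->rr_inv_rkey, and that MR skips its LOCAL_INV entirely. A plain-C model of the match, with an array standing in for the mr_list:

    #include <stdint.h>
    #include <stdio.h>

    struct mr { uint32_t handle; int invalid; };

    /* Retire the one MR the peer already invalidated remotely. */
    static void reminv(struct mr *mrs, int n, uint32_t inv_rkey)
    {
            for (int i = 0; i < n; i++)
                    if (mrs[i].handle == inv_rkey) {
                            mrs[i].invalid = 1;     /* FRWR_IS_INVALID */
                            printf("MR 0x%x remotely invalidated\n",
                                   (unsigned)inv_rkey);
                            break;  /* only one invalidated MR per RPC */
                    }
    }

    int main(void)
    {
            struct mr mrs[] = { { 0x10, 0 }, { 0x20, 0 } };

            reminv(mrs, 2, 0x20);
            return 0;
    }
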
468
453/* Invalidate all memory regions that were registered for "req". 469/* Invalidate all memory regions that were registered for "req".
454 * 470 *
455 * Sleeps until it is safe for the host CPU to access the 471 * Sleeps until it is safe for the host CPU to access the
456 * previously mapped memory regions. 472 * previously mapped memory regions.
457 * 473 *
458 * Caller ensures that @mws is not empty before the call. This 474 * Caller ensures that @mrs is not empty before the call. This
459 * function empties the list. 475 * function empties the list.
460 */ 476 */
461static void 477static void
462frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) 478frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
463{ 479{
464 struct ib_send_wr *first, **prev, *last, *bad_wr; 480 struct ib_send_wr *first, **prev, *last, *bad_wr;
465 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 481 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
466 struct rpcrdma_frmr *f; 482 struct rpcrdma_frwr *frwr;
467 struct rpcrdma_mw *mw; 483 struct rpcrdma_mr *mr;
468 int count, rc; 484 int count, rc;
469 485
470 /* ORDER: Invalidate all of the MRs first 486 /* ORDER: Invalidate all of the MRs first
@@ -472,31 +488,27 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws)
472 * Chain the LOCAL_INV Work Requests and post them with 488 * Chain the LOCAL_INV Work Requests and post them with
473 * a single ib_post_send() call. 489 * a single ib_post_send() call.
474 */ 490 */
475 f = NULL; 491 frwr = NULL;
476 count = 0; 492 count = 0;
477 prev = &first; 493 prev = &first;
478 list_for_each_entry(mw, mws, mw_list) { 494 list_for_each_entry(mr, mrs, mr_list) {
479 mw->frmr.fr_state = FRMR_IS_INVALID; 495 mr->frwr.fr_state = FRWR_IS_INVALID;
480 496
481 if (mw->mw_flags & RPCRDMA_MW_F_RI) 497 frwr = &mr->frwr;
482 continue; 498 trace_xprtrdma_localinv(mr);
483 499
484 f = &mw->frmr; 500 frwr->fr_cqe.done = frwr_wc_localinv;
485 dprintk("RPC: %s: invalidating frmr %p\n", 501 last = &frwr->fr_invwr;
486 __func__, f);
487
488 f->fr_cqe.done = frwr_wc_localinv;
489 last = &f->fr_invwr;
490 memset(last, 0, sizeof(*last)); 502 memset(last, 0, sizeof(*last));
491 last->wr_cqe = &f->fr_cqe; 503 last->wr_cqe = &frwr->fr_cqe;
492 last->opcode = IB_WR_LOCAL_INV; 504 last->opcode = IB_WR_LOCAL_INV;
493 last->ex.invalidate_rkey = mw->mw_handle; 505 last->ex.invalidate_rkey = mr->mr_handle;
494 count++; 506 count++;
495 507
496 *prev = last; 508 *prev = last;
497 prev = &last->next; 509 prev = &last->next;
498 } 510 }
499 if (!f) 511 if (!frwr)
500 goto unmap; 512 goto unmap;
501 513
502 /* Strong send queue ordering guarantees that when the 514 /* Strong send queue ordering guarantees that when the
@@ -504,8 +516,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws)
504 * are complete. 516 * are complete.
505 */ 517 */
506 last->send_flags = IB_SEND_SIGNALED; 518 last->send_flags = IB_SEND_SIGNALED;
507 f->fr_cqe.done = frwr_wc_localinv_wake; 519 frwr->fr_cqe.done = frwr_wc_localinv_wake;
508 reinit_completion(&f->fr_linv_done); 520 reinit_completion(&frwr->fr_linv_done);
509 521
510 /* Transport disconnect drains the receive CQ before it 522 /* Transport disconnect drains the receive CQ before it
511 * replaces the QP. The RPC reply handler won't call us 523 * replaces the QP. The RPC reply handler won't call us
@@ -515,36 +527,32 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws)
515 bad_wr = NULL; 527 bad_wr = NULL;
516 rc = ib_post_send(ia->ri_id->qp, first, &bad_wr); 528 rc = ib_post_send(ia->ri_id->qp, first, &bad_wr);
517 if (bad_wr != first) 529 if (bad_wr != first)
518 wait_for_completion(&f->fr_linv_done); 530 wait_for_completion(&frwr->fr_linv_done);
519 if (rc) 531 if (rc)
520 goto reset_mrs; 532 goto reset_mrs;
521 533
522 /* ORDER: Now DMA unmap all of the MRs, and return 534 /* ORDER: Now DMA unmap all of the MRs, and return
523 * them to the free MW list. 535 * them to the free MR list.
524 */ 536 */
525unmap: 537unmap:
526 while (!list_empty(mws)) { 538 while (!list_empty(mrs)) {
527 mw = rpcrdma_pop_mw(mws); 539 mr = rpcrdma_mr_pop(mrs);
528 dprintk("RPC: %s: DMA unmapping frmr %p\n", 540 rpcrdma_mr_unmap_and_put(mr);
529 __func__, &mw->frmr);
530 ib_dma_unmap_sg(ia->ri_device,
531 mw->mw_sg, mw->mw_nents, mw->mw_dir);
532 rpcrdma_put_mw(r_xprt, mw);
533 } 541 }
534 return; 542 return;
535 543
536reset_mrs: 544reset_mrs:
537 pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc); 545 pr_err("rpcrdma: FRWR invalidate ib_post_send returned %i\n", rc);
538 546
539 /* Find and reset the MRs in the LOCAL_INV WRs that did not 547 /* Find and reset the MRs in the LOCAL_INV WRs that did not
540 * get posted. 548 * get posted.
541 */ 549 */
542 while (bad_wr) { 550 while (bad_wr) {
543 f = container_of(bad_wr, struct rpcrdma_frmr, 551 frwr = container_of(bad_wr, struct rpcrdma_frwr,
544 fr_invwr); 552 fr_invwr);
545 mw = container_of(f, struct rpcrdma_mw, frmr); 553 mr = container_of(frwr, struct rpcrdma_mr, frwr);
546 554
547 __frwr_reset_mr(ia, mw); 555 __frwr_mr_reset(ia, mr);
548 556
549 bad_wr = bad_wr->next; 557 bad_wr = bad_wr->next;
550 } 558 }
@@ -553,6 +561,7 @@ reset_mrs:
553 561
554const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { 562const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
555 .ro_map = frwr_op_map, 563 .ro_map = frwr_op_map,
564 .ro_reminv = frwr_op_reminv,
556 .ro_unmap_sync = frwr_op_unmap_sync, 565 .ro_unmap_sync = frwr_op_unmap_sync,
557 .ro_recover_mr = frwr_op_recover_mr, 566 .ro_recover_mr = frwr_op_recover_mr,
558 .ro_open = frwr_op_open, 567 .ro_open = frwr_op_open,
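
Each registration mode plugs into the transport through this table of function pointers, and the commit widens the contract with ro_reminv, which the reply path invokes only when the Receive completion carried IB_WC_WITH_INVALIDATE. A compact sketch of an optional hook in such an ops table (all names hypothetical):

    #include <stdio.h>

    struct rep;                             /* opaque reply, forward-declared */

    struct memreg_ops {
            void (*ro_unmap_sync)(struct rep *);
            void (*ro_reminv)(struct rep *);    /* optional new hook */
    };

    static void frwr_reminv(struct rep *rep)
    {
            (void)rep;
            puts("handling remote invalidation");
    }

    static const struct memreg_ops frwr_ops = {
            .ro_reminv = frwr_reminv,       /* unset members stay NULL */
    };

    int main(void)
    {
            /* A mode that never sees remote invalidation can leave the
             * hook NULL, so callers guard the indirect call. */
            if (frwr_ops.ro_reminv)
                    frwr_ops.ro_reminv(NULL);
            return 0;
    }
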
diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c
index 560712bd9fa2..a762d192372b 100644
--- a/net/sunrpc/xprtrdma/module.c
+++ b/net/sunrpc/xprtrdma/module.c
@@ -1,18 +1,20 @@
1/* 1/*
2 * Copyright (c) 2015 Oracle. All rights reserved. 2 * Copyright (c) 2015, 2017 Oracle. All rights reserved.
3 */ 3 */
4 4
5/* rpcrdma.ko module initialization 5/* rpcrdma.ko module initialization
6 */ 6 */
7 7
8#include <linux/types.h>
9#include <linux/compiler.h>
8#include <linux/module.h> 10#include <linux/module.h>
9#include <linux/init.h> 11#include <linux/init.h>
10#include <linux/sunrpc/svc_rdma.h> 12#include <linux/sunrpc/svc_rdma.h>
11#include "xprt_rdma.h"
12 13
13#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 14#include <asm/swab.h>
14# define RPCDBG_FACILITY RPCDBG_TRANS 15
15#endif 16#define CREATE_TRACE_POINTS
17#include "xprt_rdma.h"
16 18
17MODULE_AUTHOR("Open Grid Computing and Network Appliance, Inc."); 19MODULE_AUTHOR("Open Grid Computing and Network Appliance, Inc.");
18MODULE_DESCRIPTION("RPC/RDMA Transport"); 20MODULE_DESCRIPTION("RPC/RDMA Transport");
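
Defining CREATE_TRACE_POINTS immediately before including xprt_rdma.h makes module.c the single compilation unit that emits the bodies of the new trace_xprtrdma_*() events; every other source file includes the same header and sees only declarations. A loose userspace analogue of that define-once idiom, with invented names:

    #include <stdio.h>

    /* --- shared header: declarations visible everywhere --- */
    void trace_connect(int rc);
    /* --- end of header --- */

    /* Exactly one .c file supplies the definition, the way module.c
     * alone defines CREATE_TRACE_POINTS before the include. */
    void trace_connect(int rc)
    {
            printf("xprtrdma: connect rc=%d\n", rc);
    }

    int main(void)
    {
            trace_connect(0);
            return 0;
    }
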
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index a3f2ab283aeb..162e5dd82466 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -292,15 +292,15 @@ encode_item_not_present(struct xdr_stream *xdr)
292} 292}
293 293
294static void 294static void
295xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw) 295xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr *mr)
296{ 296{
297 *iptr++ = cpu_to_be32(mw->mw_handle); 297 *iptr++ = cpu_to_be32(mr->mr_handle);
298 *iptr++ = cpu_to_be32(mw->mw_length); 298 *iptr++ = cpu_to_be32(mr->mr_length);
299 xdr_encode_hyper(iptr, mw->mw_offset); 299 xdr_encode_hyper(iptr, mr->mr_offset);
300} 300}
301 301
302static int 302static int
303encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw) 303encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr)
304{ 304{
305 __be32 *p; 305 __be32 *p;
306 306
@@ -308,12 +308,12 @@ encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw)
308 if (unlikely(!p)) 308 if (unlikely(!p))
309 return -EMSGSIZE; 309 return -EMSGSIZE;
310 310
311 xdr_encode_rdma_segment(p, mw); 311 xdr_encode_rdma_segment(p, mr);
312 return 0; 312 return 0;
313} 313}
314 314
315static int 315static int
316encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw, 316encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr,
317 u32 position) 317 u32 position)
318{ 318{
319 __be32 *p; 319 __be32 *p;
@@ -324,7 +324,7 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw,
324 324
325 *p++ = xdr_one; /* Item present */ 325 *p++ = xdr_one; /* Item present */
326 *p++ = cpu_to_be32(position); 326 *p++ = cpu_to_be32(position);
327 xdr_encode_rdma_segment(p, mw); 327 xdr_encode_rdma_segment(p, mr);
328 return 0; 328 return 0;
329} 329}
330 330
@@ -348,7 +348,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
348{ 348{
349 struct xdr_stream *xdr = &req->rl_stream; 349 struct xdr_stream *xdr = &req->rl_stream;
350 struct rpcrdma_mr_seg *seg; 350 struct rpcrdma_mr_seg *seg;
351 struct rpcrdma_mw *mw; 351 struct rpcrdma_mr *mr;
352 unsigned int pos; 352 unsigned int pos;
353 int nsegs; 353 int nsegs;
354 354
@@ -363,21 +363,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
363 363
364 do { 364 do {
365 seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, 365 seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
366 false, &mw); 366 false, &mr);
367 if (IS_ERR(seg)) 367 if (IS_ERR(seg))
368 return PTR_ERR(seg); 368 return PTR_ERR(seg);
369 rpcrdma_push_mw(mw, &req->rl_registered); 369 rpcrdma_mr_push(mr, &req->rl_registered);
370 370
371 if (encode_read_segment(xdr, mw, pos) < 0) 371 if (encode_read_segment(xdr, mr, pos) < 0)
372 return -EMSGSIZE; 372 return -EMSGSIZE;
373 373
374 dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n", 374 trace_xprtrdma_read_chunk(rqst->rq_task, pos, mr, nsegs);
375 rqst->rq_task->tk_pid, __func__, pos,
376 mw->mw_length, (unsigned long long)mw->mw_offset,
377 mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last");
378
379 r_xprt->rx_stats.read_chunk_count++; 375 r_xprt->rx_stats.read_chunk_count++;
380 nsegs -= mw->mw_nents; 376 nsegs -= mr->mr_nents;
381 } while (nsegs); 377 } while (nsegs);
382 378
383 return 0; 379 return 0;
@@ -404,7 +400,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
404{ 400{
405 struct xdr_stream *xdr = &req->rl_stream; 401 struct xdr_stream *xdr = &req->rl_stream;
406 struct rpcrdma_mr_seg *seg; 402 struct rpcrdma_mr_seg *seg;
407 struct rpcrdma_mw *mw; 403 struct rpcrdma_mr *mr;
408 int nsegs, nchunks; 404 int nsegs, nchunks;
409 __be32 *segcount; 405 __be32 *segcount;
410 406
@@ -425,23 +421,19 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
425 nchunks = 0; 421 nchunks = 0;
426 do { 422 do {
427 seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, 423 seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
428 true, &mw); 424 true, &mr);
429 if (IS_ERR(seg)) 425 if (IS_ERR(seg))
430 return PTR_ERR(seg); 426 return PTR_ERR(seg);
431 rpcrdma_push_mw(mw, &req->rl_registered); 427 rpcrdma_mr_push(mr, &req->rl_registered);
432 428
433 if (encode_rdma_segment(xdr, mw) < 0) 429 if (encode_rdma_segment(xdr, mr) < 0)
434 return -EMSGSIZE; 430 return -EMSGSIZE;
435 431
436 dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n", 432 trace_xprtrdma_write_chunk(rqst->rq_task, mr, nsegs);
437 rqst->rq_task->tk_pid, __func__,
438 mw->mw_length, (unsigned long long)mw->mw_offset,
439 mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last");
440
441 r_xprt->rx_stats.write_chunk_count++; 433 r_xprt->rx_stats.write_chunk_count++;
442 r_xprt->rx_stats.total_rdma_request += seg->mr_len; 434 r_xprt->rx_stats.total_rdma_request += mr->mr_length;
443 nchunks++; 435 nchunks++;
444 nsegs -= mw->mw_nents; 436 nsegs -= mr->mr_nents;
445 } while (nsegs); 437 } while (nsegs);
446 438
447 /* Update count of segments in this Write chunk */ 439 /* Update count of segments in this Write chunk */
@@ -468,7 +460,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
468{ 460{
469 struct xdr_stream *xdr = &req->rl_stream; 461 struct xdr_stream *xdr = &req->rl_stream;
470 struct rpcrdma_mr_seg *seg; 462 struct rpcrdma_mr_seg *seg;
471 struct rpcrdma_mw *mw; 463 struct rpcrdma_mr *mr;
472 int nsegs, nchunks; 464 int nsegs, nchunks;
473 __be32 *segcount; 465 __be32 *segcount;
474 466
@@ -487,23 +479,19 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
487 nchunks = 0; 479 nchunks = 0;
488 do { 480 do {
489 seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, 481 seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
490 true, &mw); 482 true, &mr);
491 if (IS_ERR(seg)) 483 if (IS_ERR(seg))
492 return PTR_ERR(seg); 484 return PTR_ERR(seg);
493 rpcrdma_push_mw(mw, &req->rl_registered); 485 rpcrdma_mr_push(mr, &req->rl_registered);
494 486
495 if (encode_rdma_segment(xdr, mw) < 0) 487 if (encode_rdma_segment(xdr, mr) < 0)
496 return -EMSGSIZE; 488 return -EMSGSIZE;
497 489
498 dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n", 490 trace_xprtrdma_reply_chunk(rqst->rq_task, mr, nsegs);
499 rqst->rq_task->tk_pid, __func__,
500 mw->mw_length, (unsigned long long)mw->mw_offset,
501 mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last");
502
503 r_xprt->rx_stats.reply_chunk_count++; 491 r_xprt->rx_stats.reply_chunk_count++;
504 r_xprt->rx_stats.total_rdma_request += seg->mr_len; 492 r_xprt->rx_stats.total_rdma_request += mr->mr_length;
505 nchunks++; 493 nchunks++;
506 nsegs -= mw->mw_nents; 494 nsegs -= mr->mr_nents;
507 } while (nsegs); 495 } while (nsegs);
508 496
509 /* Update count of segments in the Reply chunk */ 497 /* Update count of segments in the Reply chunk */
@@ -524,9 +512,6 @@ rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc)
524 struct ib_sge *sge; 512 struct ib_sge *sge;
525 unsigned int count; 513 unsigned int count;
526 514
527 dprintk("RPC: %s: unmapping %u sges for sc=%p\n",
528 __func__, sc->sc_unmap_count, sc);
529
530 /* The first two SGEs contain the transport header and 515 /* The first two SGEs contain the transport header and
531 * the inline buffer. These are always left mapped so 516 * the inline buffer. These are always left mapped so
532 * they can be cheaply re-used. 517 * they can be cheaply re-used.
@@ -754,11 +739,6 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
754 __be32 *p; 739 __be32 *p;
755 int ret; 740 int ret;
756 741
757#if defined(CONFIG_SUNRPC_BACKCHANNEL)
758 if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state))
759 return rpcrdma_bc_marshal_reply(rqst);
760#endif
761
762 rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); 742 rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
763 xdr_init_encode(xdr, &req->rl_hdrbuf, 743 xdr_init_encode(xdr, &req->rl_hdrbuf,
764 req->rl_rdmabuf->rg_base); 744 req->rl_rdmabuf->rg_base);
@@ -821,6 +801,17 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
821 rtype = rpcrdma_areadch; 801 rtype = rpcrdma_areadch;
822 } 802 }
823 803
804 /* If this is a retransmit, discard previously registered
805 * chunks. Very likely the connection has been replaced,
806 * so these registrations are invalid and unusable.
807 */
808 while (unlikely(!list_empty(&req->rl_registered))) {
809 struct rpcrdma_mr *mr;
810
811 mr = rpcrdma_mr_pop(&req->rl_registered);
812 rpcrdma_mr_defer_recovery(mr);
813 }
814
824 /* This implementation supports the following combinations 815 /* This implementation supports the following combinations
825 * of chunk lists in one RPC-over-RDMA Call message: 816 * of chunk lists in one RPC-over-RDMA Call message:
826 * 817 *
@@ -868,10 +859,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
868 if (ret) 859 if (ret)
869 goto out_err; 860 goto out_err;
870 861
871 dprintk("RPC: %5u %s: %s/%s: hdrlen %u rpclen\n", 862 trace_xprtrdma_marshal(rqst, xdr_stream_pos(xdr), rtype, wtype);
872 rqst->rq_task->tk_pid, __func__,
873 transfertypes[rtype], transfertypes[wtype],
874 xdr_stream_pos(xdr));
875 863
876 ret = rpcrdma_prepare_send_sges(r_xprt, req, xdr_stream_pos(xdr), 864 ret = rpcrdma_prepare_send_sges(r_xprt, req, xdr_stream_pos(xdr),
877 &rqst->rq_snd_buf, rtype); 865 &rqst->rq_snd_buf, rtype);
@@ -926,8 +914,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
926 curlen = rqst->rq_rcv_buf.head[0].iov_len; 914 curlen = rqst->rq_rcv_buf.head[0].iov_len;
927 if (curlen > copy_len) 915 if (curlen > copy_len)
928 curlen = copy_len; 916 curlen = copy_len;
929 dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n", 917 trace_xprtrdma_fixup(rqst, copy_len, curlen);
930 __func__, srcp, copy_len, curlen);
931 srcp += curlen; 918 srcp += curlen;
932 copy_len -= curlen; 919 copy_len -= curlen;
933 920
@@ -947,9 +934,8 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
947 if (curlen > pagelist_len) 934 if (curlen > pagelist_len)
948 curlen = pagelist_len; 935 curlen = pagelist_len;
949 936
950 dprintk("RPC: %s: page %d" 937 trace_xprtrdma_fixup_pg(rqst, i, srcp,
951 " srcp 0x%p len %d curlen %d\n", 938 copy_len, curlen);
952 __func__, i, srcp, copy_len, curlen);
953 destp = kmap_atomic(ppages[i]); 939 destp = kmap_atomic(ppages[i]);
954 memcpy(destp + page_base, srcp, curlen); 940 memcpy(destp + page_base, srcp, curlen);
955 flush_dcache_page(ppages[i]); 941 flush_dcache_page(ppages[i]);
@@ -984,24 +970,6 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
984 return fixup_copy_count; 970 return fixup_copy_count;
985} 971}
986 972
987/* Caller must guarantee @rep remains stable during this call.
988 */
989static void
990rpcrdma_mark_remote_invalidation(struct list_head *mws,
991 struct rpcrdma_rep *rep)
992{
993 struct rpcrdma_mw *mw;
994
995 if (!(rep->rr_wc_flags & IB_WC_WITH_INVALIDATE))
996 return;
997
998 list_for_each_entry(mw, mws, mw_list)
999 if (mw->mw_handle == rep->rr_inv_rkey) {
1000 mw->mw_flags = RPCRDMA_MW_F_RI;
1001 break; /* only one invalidated MR per RPC */
1002 }
1003}
1004
1005/* By convention, backchannel calls arrive via rdma_msg type 973/* By convention, backchannel calls arrive via rdma_msg type
1006 * messages, and never populate the chunk lists. This makes 974 * messages, and never populate the chunk lists. This makes
1007 * the RPC/RDMA header small and fixed in size, so it is 975 * the RPC/RDMA header small and fixed in size, so it is
@@ -1058,26 +1026,19 @@ out_short:
1058 1026
1059static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length) 1027static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length)
1060{ 1028{
1029 u32 handle;
1030 u64 offset;
1061 __be32 *p; 1031 __be32 *p;
1062 1032
1063 p = xdr_inline_decode(xdr, 4 * sizeof(*p)); 1033 p = xdr_inline_decode(xdr, 4 * sizeof(*p));
1064 if (unlikely(!p)) 1034 if (unlikely(!p))
1065 return -EIO; 1035 return -EIO;
1066 1036
1067 ifdebug(FACILITY) { 1037 handle = be32_to_cpup(p++);
1068 u64 offset; 1038 *length = be32_to_cpup(p++);
1069 u32 handle; 1039 xdr_decode_hyper(p, &offset);
1070
1071 handle = be32_to_cpup(p++);
1072 *length = be32_to_cpup(p++);
1073 xdr_decode_hyper(p, &offset);
1074 dprintk("RPC: %s: segment %u@0x%016llx:0x%08x\n",
1075 __func__, *length, (unsigned long long)offset,
1076 handle);
1077 } else {
1078 *length = be32_to_cpup(p + 1);
1079 }
1080 1040
1041 trace_xprtrdma_decode_seg(handle, *length, offset);
1081 return 0; 1042 return 0;
1082} 1043}
1083 1044
@@ -1098,8 +1059,6 @@ static int decode_write_chunk(struct xdr_stream *xdr, u32 *length)
1098 *length += seglength; 1059 *length += seglength;
1099 } 1060 }
1100 1061
1101 dprintk("RPC: %s: segcount=%u, %u bytes\n",
1102 __func__, be32_to_cpup(p), *length);
1103 return 0; 1062 return 0;
1104} 1063}
1105 1064
@@ -1296,8 +1255,7 @@ out:
1296 * being marshaled. 1255 * being marshaled.
1297 */ 1256 */
1298out_badheader: 1257out_badheader:
1299 dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", 1258 trace_xprtrdma_reply_hdr(rep);
1300 rqst->rq_task->tk_pid, __func__, be32_to_cpu(rep->rr_proc));
1301 r_xprt->rx_stats.bad_reply_count++; 1259 r_xprt->rx_stats.bad_reply_count++;
1302 status = -EIO; 1260 status = -EIO;
1303 goto out; 1261 goto out;
@@ -1339,9 +1297,12 @@ void rpcrdma_deferred_completion(struct work_struct *work)
1339 struct rpcrdma_rep *rep = 1297 struct rpcrdma_rep *rep =
1340 container_of(work, struct rpcrdma_rep, rr_work); 1298 container_of(work, struct rpcrdma_rep, rr_work);
1341 struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst); 1299 struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst);
1300 struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
1342 1301
1343 rpcrdma_mark_remote_invalidation(&req->rl_registered, rep); 1302 trace_xprtrdma_defer_cmp(rep);
1344 rpcrdma_release_rqst(rep->rr_rxprt, req); 1303 if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
1304 r_xprt->rx_ia.ri_ops->ro_reminv(rep, &req->rl_registered);
1305 rpcrdma_release_rqst(r_xprt, req);
1345 rpcrdma_complete_rqst(rep); 1306 rpcrdma_complete_rqst(rep);
1346} 1307}
1347 1308
@@ -1360,8 +1321,6 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
1360 u32 credits; 1321 u32 credits;
1361 __be32 *p; 1322 __be32 *p;
1362 1323
1363 dprintk("RPC: %s: incoming rep %p\n", __func__, rep);
1364
1365 if (rep->rr_hdrbuf.head[0].iov_len == 0) 1324 if (rep->rr_hdrbuf.head[0].iov_len == 0)
1366 goto out_badstatus; 1325 goto out_badstatus;
1367 1326
@@ -1405,8 +1364,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
1405 rep->rr_rqst = rqst; 1364 rep->rr_rqst = rqst;
1406 clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); 1365 clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags);
1407 1366
1408 dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", 1367 trace_xprtrdma_reply(rqst->rq_task, rep, req, credits);
1409 __func__, rep, req, be32_to_cpu(rep->rr_xid));
1410 1368
1411 queue_work_on(req->rl_cpu, rpcrdma_receive_wq, &rep->rr_work); 1369 queue_work_on(req->rl_cpu, rpcrdma_receive_wq, &rep->rr_work);
1412 return; 1370 return;
@@ -1420,8 +1378,7 @@ out_badstatus:
1420 return; 1378 return;
1421 1379
1422out_badversion: 1380out_badversion:
1423 dprintk("RPC: %s: invalid version %d\n", 1381 trace_xprtrdma_reply_vers(rep);
1424 __func__, be32_to_cpu(rep->rr_vers));
1425 goto repost; 1382 goto repost;
1426 1383
1427/* The RPC transaction has already been terminated, or the header 1384/* The RPC transaction has already been terminated, or the header
@@ -1429,12 +1386,11 @@ out_badversion:
1429 */ 1386 */
1430out_norqst: 1387out_norqst:
1431 spin_unlock(&xprt->recv_lock); 1388 spin_unlock(&xprt->recv_lock);
1432 dprintk("RPC: %s: no match for incoming xid 0x%08x\n", 1389 trace_xprtrdma_reply_rqst(rep);
1433 __func__, be32_to_cpu(rep->rr_xid));
1434 goto repost; 1390 goto repost;
1435 1391
1436out_shortreply: 1392out_shortreply:
1437 dprintk("RPC: %s: short/invalid reply\n", __func__); 1393 trace_xprtrdma_reply_short(rep);
1438 1394
1439/* If no pending RPC transaction was matched, post a replacement 1395/* If no pending RPC transaction was matched, post a replacement
1440 * receive buffer before returning. 1396 * receive buffer before returning.
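
decode_rdma_segment() above now decodes all three fields unconditionally, since the handle and offset feed trace_xprtrdma_decode_seg() rather than a debug-only branch. One segment occupies four big-endian XDR words; a self-contained, endian-safe decoder sketch:

    #include <stdint.h>
    #include <stdio.h>

    /* Read one big-endian 32-bit XDR word, regardless of host order. */
    static uint32_t be32(const uint8_t *p)
    {
            return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
                   ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
    }

    /* handle, length, offset (hyper) -- as in decode_rdma_segment(). */
    static void decode_segment(const uint8_t p[16], uint32_t *handle,
                               uint32_t *length, uint64_t *offset)
    {
            *handle = be32(p);                      /* be32_to_cpup(p++) */
            *length = be32(p + 4);                  /* be32_to_cpup(p++) */
            *offset = ((uint64_t)be32(p + 8) << 32) | be32(p + 12);
    }

    int main(void)
    {
            const uint8_t seg[16] = { 0, 0, 0, 0x2a,   0, 0, 0x10, 0,
                                      0, 0, 0, 0,      0, 0, 0, 8 };
            uint32_t handle, length;
            uint64_t offset;

            decode_segment(seg, &handle, &length, &offset);
            printf("%u@0x%016llx:0x%08x\n", (unsigned)length,
                   (unsigned long long)offset, (unsigned)handle);
            return 0;
    }
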
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 6ee1ad8978f3..4b1ecfe979cf 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -67,8 +67,7 @@
67static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; 67static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
68unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; 68unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
69static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; 69static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
70static unsigned int xprt_rdma_inline_write_padding; 70unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR;
71unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
72int xprt_rdma_pad_optimize; 71int xprt_rdma_pad_optimize;
73 72
74#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 73#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -81,6 +80,7 @@ static unsigned int zero;
81static unsigned int max_padding = PAGE_SIZE; 80static unsigned int max_padding = PAGE_SIZE;
82static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS; 81static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
83static unsigned int max_memreg = RPCRDMA_LAST - 1; 82static unsigned int max_memreg = RPCRDMA_LAST - 1;
83static unsigned int dummy;
84 84
85static struct ctl_table_header *sunrpc_table_header; 85static struct ctl_table_header *sunrpc_table_header;
86 86
@@ -114,7 +114,7 @@ static struct ctl_table xr_tunables_table[] = {
114 }, 114 },
115 { 115 {
116 .procname = "rdma_inline_write_padding", 116 .procname = "rdma_inline_write_padding",
117 .data = &xprt_rdma_inline_write_padding, 117 .data = &dummy,
118 .maxlen = sizeof(unsigned int), 118 .maxlen = sizeof(unsigned int),
119 .mode = 0644, 119 .mode = 0644,
120 .proc_handler = proc_dointvec_minmax, 120 .proc_handler = proc_dointvec_minmax,
@@ -259,13 +259,10 @@ xprt_rdma_connect_worker(struct work_struct *work)
259 259
260 xprt_clear_connected(xprt); 260 xprt_clear_connected(xprt);
261 261
262 dprintk("RPC: %s: %sconnect\n", __func__,
263 r_xprt->rx_ep.rep_connected != 0 ? "re" : "");
264 rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); 262 rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
265 if (rc) 263 if (rc)
266 xprt_wake_pending_tasks(xprt, rc); 264 xprt_wake_pending_tasks(xprt, rc);
267 265
268 dprintk("RPC: %s: exit\n", __func__);
269 xprt_clear_connecting(xprt); 266 xprt_clear_connecting(xprt);
270} 267}
271 268
@@ -275,7 +272,7 @@ xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
275 struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt, 272 struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt,
276 rx_xprt); 273 rx_xprt);
277 274
278 pr_info("rpcrdma: injecting transport disconnect on xprt=%p\n", xprt); 275 trace_xprtrdma_inject_dsc(r_xprt);
279 rdma_disconnect(r_xprt->rx_ia.ri_id); 276 rdma_disconnect(r_xprt->rx_ia.ri_id);
280} 277}
281 278
@@ -295,7 +292,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
295{ 292{
296 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 293 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
297 294
298 dprintk("RPC: %s: called\n", __func__); 295 trace_xprtrdma_destroy(r_xprt);
299 296
300 cancel_delayed_work_sync(&r_xprt->rx_connect_worker); 297 cancel_delayed_work_sync(&r_xprt->rx_connect_worker);
301 298
@@ -306,11 +303,8 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
306 rpcrdma_ia_close(&r_xprt->rx_ia); 303 rpcrdma_ia_close(&r_xprt->rx_ia);
307 304
308 xprt_rdma_free_addresses(xprt); 305 xprt_rdma_free_addresses(xprt);
309
310 xprt_free(xprt); 306 xprt_free(xprt);
311 307
312 dprintk("RPC: %s: returning\n", __func__);
313
314 module_put(THIS_MODULE); 308 module_put(THIS_MODULE);
315} 309}
316 310
@@ -361,9 +355,7 @@ xprt_setup_rdma(struct xprt_create *args)
361 /* 355 /*
362 * Set up RDMA-specific connect data. 356 * Set up RDMA-specific connect data.
363 */ 357 */
364 358 sap = args->dstaddr;
365 sap = (struct sockaddr *)&cdata.addr;
366 memcpy(sap, args->dstaddr, args->addrlen);
367 359
368 /* Ensure xprt->addr holds valid server TCP (not RDMA) 360 /* Ensure xprt->addr holds valid server TCP (not RDMA)
369 * address, for any side protocols which peek at it */ 361 * address, for any side protocols which peek at it */
@@ -373,6 +365,7 @@ xprt_setup_rdma(struct xprt_create *args)
373 365
374 if (rpc_get_port(sap)) 366 if (rpc_get_port(sap))
375 xprt_set_bound(xprt); 367 xprt_set_bound(xprt);
368 xprt_rdma_format_addresses(xprt, sap);
376 369
377 cdata.max_requests = xprt->max_reqs; 370 cdata.max_requests = xprt->max_reqs;
378 371
@@ -387,8 +380,6 @@ xprt_setup_rdma(struct xprt_create *args)
387 if (cdata.inline_rsize > cdata.rsize) 380 if (cdata.inline_rsize > cdata.rsize)
388 cdata.inline_rsize = cdata.rsize; 381 cdata.inline_rsize = cdata.rsize;
389 382
390 cdata.padding = xprt_rdma_inline_write_padding;
391
392 /* 383 /*
393 * Create new transport instance, which includes initialized 384 * Create new transport instance, which includes initialized
394 * o ia 385 * o ia
@@ -398,7 +389,7 @@ xprt_setup_rdma(struct xprt_create *args)
398 389
399 new_xprt = rpcx_to_rdmax(xprt); 390 new_xprt = rpcx_to_rdmax(xprt);
400 391
401 rc = rpcrdma_ia_open(new_xprt, sap); 392 rc = rpcrdma_ia_open(new_xprt);
402 if (rc) 393 if (rc)
403 goto out1; 394 goto out1;
404 395
@@ -407,31 +398,19 @@ xprt_setup_rdma(struct xprt_create *args)
407 */ 398 */
408 new_xprt->rx_data = cdata; 399 new_xprt->rx_data = cdata;
409 new_ep = &new_xprt->rx_ep; 400 new_ep = &new_xprt->rx_ep;
410 new_ep->rep_remote_addr = cdata.addr;
411 401
412 rc = rpcrdma_ep_create(&new_xprt->rx_ep, 402 rc = rpcrdma_ep_create(&new_xprt->rx_ep,
413 &new_xprt->rx_ia, &new_xprt->rx_data); 403 &new_xprt->rx_ia, &new_xprt->rx_data);
414 if (rc) 404 if (rc)
415 goto out2; 405 goto out2;
416 406
417 /*
418 * Allocate pre-registered send and receive buffers for headers and
419 * any inline data. Also specify any padding which will be provided
420 * from a preregistered zero buffer.
421 */
422 rc = rpcrdma_buffer_create(new_xprt); 407 rc = rpcrdma_buffer_create(new_xprt);
423 if (rc) 408 if (rc)
424 goto out3; 409 goto out3;
425 410
426 /*
427 * Register a callback for connection events. This is necessary because
428 * connection loss notification is async. We also catch connection loss
429 * when reaping receives.
430 */
431 INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, 411 INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
432 xprt_rdma_connect_worker); 412 xprt_rdma_connect_worker);
433 413
434 xprt_rdma_format_addresses(xprt, sap);
435 xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt); 414 xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt);
436 if (xprt->max_payload == 0) 415 if (xprt->max_payload == 0)
437 goto out4; 416 goto out4;
@@ -445,16 +424,19 @@ xprt_setup_rdma(struct xprt_create *args)
445 dprintk("RPC: %s: %s:%s\n", __func__, 424 dprintk("RPC: %s: %s:%s\n", __func__,
446 xprt->address_strings[RPC_DISPLAY_ADDR], 425 xprt->address_strings[RPC_DISPLAY_ADDR],
447 xprt->address_strings[RPC_DISPLAY_PORT]); 426 xprt->address_strings[RPC_DISPLAY_PORT]);
427 trace_xprtrdma_create(new_xprt);
448 return xprt; 428 return xprt;
449 429
450out4: 430out4:
451 xprt_rdma_free_addresses(xprt); 431 rpcrdma_buffer_destroy(&new_xprt->rx_buf);
452 rc = -EINVAL; 432 rc = -ENODEV;
453out3: 433out3:
454 rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); 434 rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
455out2: 435out2:
456 rpcrdma_ia_close(&new_xprt->rx_ia); 436 rpcrdma_ia_close(&new_xprt->rx_ia);
457out1: 437out1:
438 trace_xprtrdma_destroy(new_xprt);
439 xprt_rdma_free_addresses(xprt);
458 xprt_free(xprt); 440 xprt_free(xprt);
459 return ERR_PTR(rc); 441 return ERR_PTR(rc);
460} 442}
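
The reworked xprt_setup_rdma() unwind is the usual goto ladder: each label releases exactly what the steps before the failure acquired, in reverse order, and the commit extends it so out4 now destroys the buffer pool while out1 also frees the address strings. A small runnable model of the pattern, with resources reduced to heap allocations and names hypothetical:

    #include <stdio.h>
    #include <stdlib.h>

    static int setup(void)
    {
            void *ia, *ep, *buf;

            ia = malloc(16);                /* rpcrdma_ia_open() */
            if (!ia)
                    goto out1;
            ep = malloc(16);                /* rpcrdma_ep_create() */
            if (!ep)
                    goto out2;
            buf = malloc(16);               /* rpcrdma_buffer_create() */
            if (!buf)
                    goto out3;

            puts("transport up");
            free(buf);                      /* the real code keeps these on
                                             * success; freed here only to
                                             * keep the sketch leak-free */
            free(ep);
            free(ia);
            return 0;

    out3:                                   /* rpcrdma_ep_destroy() */
            free(ep);
    out2:                                   /* rpcrdma_ia_close() */
            free(ia);
    out1:                                   /* xprt_free() */
            return -1;
    }

    int main(void)
    {
            return setup() ? 1 : 0;
    }
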
@@ -488,16 +470,34 @@ xprt_rdma_close(struct rpc_xprt *xprt)
488 rpcrdma_ep_disconnect(ep, ia); 470 rpcrdma_ep_disconnect(ep, ia);
489} 471}
490 472
473/**
474 * xprt_rdma_set_port - update server port with rpcbind result
475 * @xprt: controlling RPC transport
476 * @port: new port value
477 *
478 * Transport connect status is unchanged.
479 */
491static void 480static void
492xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) 481xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
493{ 482{
494 struct sockaddr_in *sap; 483 struct sockaddr *sap = (struct sockaddr *)&xprt->addr;
484 char buf[8];
495 485
496 sap = (struct sockaddr_in *)&xprt->addr; 486 dprintk("RPC: %s: setting port for xprt %p (%s:%s) to %u\n",
497 sap->sin_port = htons(port); 487 __func__, xprt,
498 sap = (struct sockaddr_in *)&rpcx_to_rdmad(xprt).addr; 488 xprt->address_strings[RPC_DISPLAY_ADDR],
499 sap->sin_port = htons(port); 489 xprt->address_strings[RPC_DISPLAY_PORT],
500 dprintk("RPC: %s: %u\n", __func__, port); 490 port);
491
492 rpc_set_port(sap, port);
493
494 kfree(xprt->address_strings[RPC_DISPLAY_PORT]);
495 snprintf(buf, sizeof(buf), "%u", port);
496 xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL);
497
498 kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]);
499 snprintf(buf, sizeof(buf), "%4hx", port);
500 xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
501} 501}
502 502
503/** 503/**
@@ -516,8 +516,6 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
516static void 516static void
517xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task) 517xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task)
518{ 518{
519 dprintk("RPC: %5u %s: xprt = %p\n", task->tk_pid, __func__, xprt);
520
521 xprt_force_disconnect(xprt); 519 xprt_force_disconnect(xprt);
522} 520}
523 521
@@ -640,7 +638,7 @@ xprt_rdma_allocate(struct rpc_task *task)
640 638
641 req = rpcrdma_buffer_get(&r_xprt->rx_buf); 639 req = rpcrdma_buffer_get(&r_xprt->rx_buf);
642 if (req == NULL) 640 if (req == NULL)
643 return -ENOMEM; 641 goto out_get;
644 642
645 flags = RPCRDMA_DEF_GFP; 643 flags = RPCRDMA_DEF_GFP;
646 if (RPC_IS_SWAPPER(task)) 644 if (RPC_IS_SWAPPER(task))
@@ -653,19 +651,18 @@ xprt_rdma_allocate(struct rpc_task *task)
653 if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags)) 651 if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags))
654 goto out_fail; 652 goto out_fail;
655 653
656 dprintk("RPC: %5u %s: send size = %zd, recv size = %zd, req = %p\n",
657 task->tk_pid, __func__, rqst->rq_callsize,
658 rqst->rq_rcvsize, req);
659
660 req->rl_cpu = smp_processor_id(); 654 req->rl_cpu = smp_processor_id();
661 req->rl_connect_cookie = 0; /* our reserved value */ 655 req->rl_connect_cookie = 0; /* our reserved value */
662 rpcrdma_set_xprtdata(rqst, req); 656 rpcrdma_set_xprtdata(rqst, req);
663 rqst->rq_buffer = req->rl_sendbuf->rg_base; 657 rqst->rq_buffer = req->rl_sendbuf->rg_base;
664 rqst->rq_rbuffer = req->rl_recvbuf->rg_base; 658 rqst->rq_rbuffer = req->rl_recvbuf->rg_base;
659 trace_xprtrdma_allocate(task, req);
665 return 0; 660 return 0;
666 661
667out_fail: 662out_fail:
668 rpcrdma_buffer_put(req); 663 rpcrdma_buffer_put(req);
664out_get:
665 trace_xprtrdma_allocate(task, NULL);
669 return -ENOMEM; 666 return -ENOMEM;
670} 667}
671 668
@@ -682,13 +679,9 @@ xprt_rdma_free(struct rpc_task *task)
682 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); 679 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
683 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 680 struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
684 681
685 if (test_bit(RPCRDMA_REQ_F_BACKCHANNEL, &req->rl_flags))
686 return;
687
688 dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply);
689
690 if (test_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags)) 682 if (test_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags))
691 rpcrdma_release_rqst(r_xprt, req); 683 rpcrdma_release_rqst(r_xprt, req);
684 trace_xprtrdma_rpc_done(task, req);
692 rpcrdma_buffer_put(req); 685 rpcrdma_buffer_put(req);
693} 686}
694 687
@@ -698,22 +691,12 @@ xprt_rdma_free(struct rpc_task *task)
698 * 691 *
699 * Caller holds the transport's write lock. 692 * Caller holds the transport's write lock.
700 * 693 *
701 * Return values: 694 * Returns:
702 * 0: The request has been sent 695 * %0 if the RPC message has been sent
703 * ENOTCONN: Caller needs to invoke connect logic then call again 696 * %-ENOTCONN if the caller should reconnect and call again
704 * ENOBUFS: Call again later to send the request 697 * %-ENOBUFS if the caller should call again later
705 * EIO: A permanent error occurred. The request was not sent, 698 * %-EIO if a permanent error occurred and the request was not
706 * and don't try it again 699 * sent. Do not try to send this message again.
707 *
708 * send_request invokes the meat of RPC RDMA. It must do the following:
709 *
710 * 1. Marshal the RPC request into an RPC RDMA request, which means
711 * putting a header in front of data, and creating IOVs for RDMA
712 * from those in the request.
713 * 2. In marshaling, detect opportunities for RDMA, and use them.
714 * 3. Post a recv message to set up asynch completion, then send
715 * the request (rpcrdma_ep_post).
716 * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP).
717 */ 700 */
718static int 701static int
719xprt_rdma_send_request(struct rpc_task *task) 702xprt_rdma_send_request(struct rpc_task *task)
@@ -724,14 +707,14 @@ xprt_rdma_send_request(struct rpc_task *task)
724 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 707 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
725 int rc = 0; 708 int rc = 0;
726 709
710#if defined(CONFIG_SUNRPC_BACKCHANNEL)
711 if (unlikely(!rqst->rq_buffer))
712 return xprt_rdma_bc_send_reply(rqst);
713#endif /* CONFIG_SUNRPC_BACKCHANNEL */
714
727 if (!xprt_connected(xprt)) 715 if (!xprt_connected(xprt))
728 goto drop_connection; 716 goto drop_connection;
729 717
730 /* On retransmit, remove any previously registered chunks */
731 if (unlikely(!list_empty(&req->rl_registered)))
732 r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt,
733 &req->rl_registered);
734
735 rc = rpcrdma_marshal_req(r_xprt, rqst); 718 rc = rpcrdma_marshal_req(r_xprt, rqst);
736 if (rc < 0) 719 if (rc < 0)
737 goto failed_marshal; 720 goto failed_marshal;
@@ -744,7 +727,7 @@ xprt_rdma_send_request(struct rpc_task *task)
744 goto drop_connection; 727 goto drop_connection;
745 req->rl_connect_cookie = xprt->connect_cookie; 728 req->rl_connect_cookie = xprt->connect_cookie;
746 729
747 set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); 730 __set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags);
748 if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) 731 if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
749 goto drop_connection; 732 goto drop_connection;
750 733
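[Editorial aside] The rewritten kernel-doc above tightens the send_request return contract to %0, %-ENOTCONN, %-ENOBUFS, and %-EIO. A hedged sketch of how a caller might dispatch on those codes follows; send_once() and the handler messages are stand-ins, not SUNRPC code.

#include <errno.h>
#include <stdio.h>

static int send_once(void) { return -ENOTCONN; }	/* placeholder */

static void dispatch(int rc)
{
	switch (rc) {
	case 0:
		printf("request sent\n");
		break;
	case -ENOTCONN:
		printf("connect, then call again\n");
		break;
	case -ENOBUFS:
		printf("back off, call again later\n");
		break;
	case -EIO:
		printf("permanent error: do not retransmit\n");
		break;
	}
}

int main(void)
{
	dispatch(send_once());
	return 0;
}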
@@ -904,8 +887,7 @@ int xprt_rdma_init(void)
904 "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n", 887 "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n",
905 xprt_rdma_slot_table_entries, 888 xprt_rdma_slot_table_entries,
906 xprt_rdma_max_inline_read, xprt_rdma_max_inline_write); 889 xprt_rdma_max_inline_read, xprt_rdma_max_inline_write);
907 dprintk("\tPadding %d\n\tMemreg %d\n", 890 dprintk("\tPadding 0\n\tMemreg %d\n", xprt_rdma_memreg_strategy);
908 xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy);
909 891
910#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 892#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
911 if (!sunrpc_table_header) 893 if (!sunrpc_table_header)
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 8607c029c0dd..f4eb63e8e689 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -71,8 +71,8 @@
71/* 71/*
72 * internal functions 72 * internal functions
73 */ 73 */
74static void rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt); 74static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
75static void rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf); 75static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf);
76static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb); 76static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb);
77 77
78struct workqueue_struct *rpcrdma_receive_wq __read_mostly; 78struct workqueue_struct *rpcrdma_receive_wq __read_mostly;
@@ -108,7 +108,10 @@ static void
108rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) 108rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
109{ 109{
110 struct rpcrdma_ep *ep = context; 110 struct rpcrdma_ep *ep = context;
111 struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
112 rx_ep);
111 113
114 trace_xprtrdma_qp_error(r_xprt, event);
112 pr_err("rpcrdma: %s on device %s ep %p\n", 115 pr_err("rpcrdma: %s on device %s ep %p\n",
113 ib_event_msg(event->event), event->device->name, context); 116 ib_event_msg(event->event), event->device->name, context);
114 117
@@ -133,6 +136,7 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
133 container_of(cqe, struct rpcrdma_sendctx, sc_cqe); 136 container_of(cqe, struct rpcrdma_sendctx, sc_cqe);
134 137
135 /* WARNING: Only wr_cqe and status are reliable at this point */ 138 /* WARNING: Only wr_cqe and status are reliable at this point */
139 trace_xprtrdma_wc_send(sc, wc);
136 if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) 140 if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR)
137 pr_err("rpcrdma: Send: %s (%u/0x%x)\n", 141 pr_err("rpcrdma: Send: %s (%u/0x%x)\n",
138 ib_wc_status_msg(wc->status), 142 ib_wc_status_msg(wc->status),
@@ -155,13 +159,11 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
155 rr_cqe); 159 rr_cqe);
156 160
157 /* WARNING: Only wr_id and status are reliable at this point */ 161 /* WARNING: Only wr_id and status are reliable at this point */
162 trace_xprtrdma_wc_receive(rep, wc);
158 if (wc->status != IB_WC_SUCCESS) 163 if (wc->status != IB_WC_SUCCESS)
159 goto out_fail; 164 goto out_fail;
160 165
161 /* status == SUCCESS means all fields in wc are trustworthy */ 166 /* status == SUCCESS means all fields in wc are trustworthy */
162 dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
163 __func__, rep, wc->byte_len);
164
165 rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len); 167 rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len);
166 rep->rr_wc_flags = wc->wc_flags; 168 rep->rr_wc_flags = wc->wc_flags;
167 rep->rr_inv_rkey = wc->ex.invalidate_rkey; 169 rep->rr_inv_rkey = wc->ex.invalidate_rkey;
@@ -192,7 +194,6 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
192 unsigned int rsize, wsize; 194 unsigned int rsize, wsize;
193 195
194 /* Default settings for RPC-over-RDMA Version One */ 196 /* Default settings for RPC-over-RDMA Version One */
195 r_xprt->rx_ia.ri_reminv_expected = false;
196 r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize; 197 r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize;
197 rsize = RPCRDMA_V1_DEF_INLINE_SIZE; 198 rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
198 wsize = RPCRDMA_V1_DEF_INLINE_SIZE; 199 wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
@@ -200,7 +201,6 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
200 if (pmsg && 201 if (pmsg &&
201 pmsg->cp_magic == rpcrdma_cmp_magic && 202 pmsg->cp_magic == rpcrdma_cmp_magic &&
202 pmsg->cp_version == RPCRDMA_CMP_VERSION) { 203 pmsg->cp_version == RPCRDMA_CMP_VERSION) {
203 r_xprt->rx_ia.ri_reminv_expected = true;
204 r_xprt->rx_ia.ri_implicit_roundup = true; 204 r_xprt->rx_ia.ri_implicit_roundup = true;
205 rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size); 205 rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
206 wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); 206 wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
@@ -221,11 +221,9 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
221 struct rpcrdma_xprt *xprt = id->context; 221 struct rpcrdma_xprt *xprt = id->context;
222 struct rpcrdma_ia *ia = &xprt->rx_ia; 222 struct rpcrdma_ia *ia = &xprt->rx_ia;
223 struct rpcrdma_ep *ep = &xprt->rx_ep; 223 struct rpcrdma_ep *ep = &xprt->rx_ep;
224#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
225 struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
226#endif
227 int connstate = 0; 224 int connstate = 0;
228 225
226 trace_xprtrdma_conn_upcall(xprt, event);
229 switch (event->event) { 227 switch (event->event) {
230 case RDMA_CM_EVENT_ADDR_RESOLVED: 228 case RDMA_CM_EVENT_ADDR_RESOLVED:
231 case RDMA_CM_EVENT_ROUTE_RESOLVED: 229 case RDMA_CM_EVENT_ROUTE_RESOLVED:
@@ -234,21 +232,17 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
234 break; 232 break;
235 case RDMA_CM_EVENT_ADDR_ERROR: 233 case RDMA_CM_EVENT_ADDR_ERROR:
236 ia->ri_async_rc = -EHOSTUNREACH; 234 ia->ri_async_rc = -EHOSTUNREACH;
237 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
238 __func__, ep);
239 complete(&ia->ri_done); 235 complete(&ia->ri_done);
240 break; 236 break;
241 case RDMA_CM_EVENT_ROUTE_ERROR: 237 case RDMA_CM_EVENT_ROUTE_ERROR:
242 ia->ri_async_rc = -ENETUNREACH; 238 ia->ri_async_rc = -ENETUNREACH;
243 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
244 __func__, ep);
245 complete(&ia->ri_done); 239 complete(&ia->ri_done);
246 break; 240 break;
247 case RDMA_CM_EVENT_DEVICE_REMOVAL: 241 case RDMA_CM_EVENT_DEVICE_REMOVAL:
248#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 242#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
249 pr_info("rpcrdma: removing device %s for %pIS:%u\n", 243 pr_info("rpcrdma: removing device %s for %s:%s\n",
250 ia->ri_device->name, 244 ia->ri_device->name,
251 sap, rpc_get_port(sap)); 245 rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt));
252#endif 246#endif
253 set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); 247 set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags);
254 ep->rep_connected = -ENODEV; 248 ep->rep_connected = -ENODEV;
@@ -271,8 +265,8 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
271 connstate = -ENETDOWN; 265 connstate = -ENETDOWN;
272 goto connected; 266 goto connected;
273 case RDMA_CM_EVENT_REJECTED: 267 case RDMA_CM_EVENT_REJECTED:
274 dprintk("rpcrdma: connection to %pIS:%u rejected: %s\n", 268 dprintk("rpcrdma: connection to %s:%s rejected: %s\n",
275 sap, rpc_get_port(sap), 269 rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt),
276 rdma_reject_msg(id, event->status)); 270 rdma_reject_msg(id, event->status));
277 connstate = -ECONNREFUSED; 271 connstate = -ECONNREFUSED;
278 if (event->status == IB_CM_REJ_STALE_CONN) 272 if (event->status == IB_CM_REJ_STALE_CONN)
@@ -287,8 +281,9 @@ connected:
287 wake_up_all(&ep->rep_connect_wait); 281 wake_up_all(&ep->rep_connect_wait);
288 /*FALLTHROUGH*/ 282 /*FALLTHROUGH*/
289 default: 283 default:
290 dprintk("RPC: %s: %pIS:%u on %s/%s (ep 0x%p): %s\n", 284 dprintk("RPC: %s: %s:%s on %s/%s (ep 0x%p): %s\n",
291 __func__, sap, rpc_get_port(sap), 285 __func__,
286 rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt),
292 ia->ri_device->name, ia->ri_ops->ro_displayname, 287 ia->ri_device->name, ia->ri_ops->ro_displayname,
293 ep, rdma_event_msg(event->event)); 288 ep, rdma_event_msg(event->event));
294 break; 289 break;
@@ -298,13 +293,14 @@ connected:
298} 293}
299 294
300static struct rdma_cm_id * 295static struct rdma_cm_id *
301rpcrdma_create_id(struct rpcrdma_xprt *xprt, 296rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia)
302 struct rpcrdma_ia *ia, struct sockaddr *addr)
303{ 297{
304 unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1; 298 unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1;
305 struct rdma_cm_id *id; 299 struct rdma_cm_id *id;
306 int rc; 300 int rc;
307 301
302 trace_xprtrdma_conn_start(xprt);
303
308 init_completion(&ia->ri_done); 304 init_completion(&ia->ri_done);
309 init_completion(&ia->ri_remove_done); 305 init_completion(&ia->ri_remove_done);
310 306
@@ -318,7 +314,9 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
318 } 314 }
319 315
320 ia->ri_async_rc = -ETIMEDOUT; 316 ia->ri_async_rc = -ETIMEDOUT;
321 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT); 317 rc = rdma_resolve_addr(id, NULL,
318 (struct sockaddr *)&xprt->rx_xprt.addr,
319 RDMA_RESOLVE_TIMEOUT);
322 if (rc) { 320 if (rc) {
323 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n", 321 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
324 __func__, rc); 322 __func__, rc);
@@ -326,8 +324,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
326 } 324 }
327 rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); 325 rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
328 if (rc < 0) { 326 if (rc < 0) {
329 dprintk("RPC: %s: wait() exited: %i\n", 327 trace_xprtrdma_conn_tout(xprt);
330 __func__, rc);
331 goto out; 328 goto out;
332 } 329 }
333 330
@@ -344,8 +341,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
344 } 341 }
345 rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); 342 rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
346 if (rc < 0) { 343 if (rc < 0) {
347 dprintk("RPC: %s: wait() exited: %i\n", 344 trace_xprtrdma_conn_tout(xprt);
348 __func__, rc);
349 goto out; 345 goto out;
350 } 346 }
351 rc = ia->ri_async_rc; 347 rc = ia->ri_async_rc;
@@ -365,19 +361,18 @@ out:
365 361
366/** 362/**
367 * rpcrdma_ia_open - Open and initialize an Interface Adapter. 363 * rpcrdma_ia_open - Open and initialize an Interface Adapter.
368 * @xprt: controlling transport 364 * @xprt: transport with IA to (re)initialize
369 * @addr: IP address of remote peer
370 * 365 *
371 * Returns 0 on success, negative errno if an appropriate 366 * Returns 0 on success, negative errno if an appropriate
372 * Interface Adapter could not be found and opened. 367 * Interface Adapter could not be found and opened.
373 */ 368 */
374int 369int
375rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr) 370rpcrdma_ia_open(struct rpcrdma_xprt *xprt)
376{ 371{
377 struct rpcrdma_ia *ia = &xprt->rx_ia; 372 struct rpcrdma_ia *ia = &xprt->rx_ia;
378 int rc; 373 int rc;
379 374
380 ia->ri_id = rpcrdma_create_id(xprt, ia, addr); 375 ia->ri_id = rpcrdma_create_id(xprt, ia);
381 if (IS_ERR(ia->ri_id)) { 376 if (IS_ERR(ia->ri_id)) {
382 rc = PTR_ERR(ia->ri_id); 377 rc = PTR_ERR(ia->ri_id);
383 goto out_err; 378 goto out_err;
@@ -392,7 +387,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr)
392 } 387 }
393 388
394 switch (xprt_rdma_memreg_strategy) { 389 switch (xprt_rdma_memreg_strategy) {
395 case RPCRDMA_FRMR: 390 case RPCRDMA_FRWR:
396 if (frwr_is_supported(ia)) { 391 if (frwr_is_supported(ia)) {
397 ia->ri_ops = &rpcrdma_frwr_memreg_ops; 392 ia->ri_ops = &rpcrdma_frwr_memreg_ops;
398 break; 393 break;
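[Editorial aside] The memreg switch above now keys on RPCRDMA_FRWR and probes frwr_is_supported() before committing to the FRWR ops vtable; in the full function, other strategies follow as fallbacks. A small sketch of that probe-then-pick-a-vtable pattern, with hypothetical probes and vtables:

#include <stdbool.h>
#include <stdio.h>

struct memreg_ops {
	const char *name;
};

static const struct memreg_ops frwr_ops = { "frwr" };
static const struct memreg_ops fmr_ops  = { "fmr" };

static bool frwr_supported(void) { return false; }	/* pretend no FRWR */
static bool fmr_supported(void)  { return true; }

static const struct memreg_ops *pick_ops(void)
{
	if (frwr_supported())
		return &frwr_ops;
	if (fmr_supported())
		return &fmr_ops;	/* fallback path */
	return NULL;
}

int main(void)
{
	const struct memreg_ops *ops = pick_ops();

	printf("using %s\n", ops ? ops->name : "none");
	return 0;
}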
@@ -462,10 +457,12 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia)
462 rpcrdma_dma_unmap_regbuf(req->rl_sendbuf); 457 rpcrdma_dma_unmap_regbuf(req->rl_sendbuf);
463 rpcrdma_dma_unmap_regbuf(req->rl_recvbuf); 458 rpcrdma_dma_unmap_regbuf(req->rl_recvbuf);
464 } 459 }
465 rpcrdma_destroy_mrs(buf); 460 rpcrdma_mrs_destroy(buf);
466 461
467 /* Allow waiters to continue */ 462 /* Allow waiters to continue */
468 complete(&ia->ri_remove_done); 463 complete(&ia->ri_remove_done);
464
465 trace_xprtrdma_remove(r_xprt);
469} 466}
470 467
471/** 468/**
@@ -476,7 +473,6 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia)
476void 473void
477rpcrdma_ia_close(struct rpcrdma_ia *ia) 474rpcrdma_ia_close(struct rpcrdma_ia *ia)
478{ 475{
479 dprintk("RPC: %s: entering\n", __func__);
480 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { 476 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
481 if (ia->ri_id->qp) 477 if (ia->ri_id->qp)
482 rdma_destroy_qp(ia->ri_id); 478 rdma_destroy_qp(ia->ri_id);
@@ -630,9 +626,6 @@ out1:
630void 626void
631rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) 627rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
632{ 628{
633 dprintk("RPC: %s: entering, connected is %d\n",
634 __func__, ep->rep_connected);
635
636 cancel_delayed_work_sync(&ep->rep_connect_worker); 629 cancel_delayed_work_sync(&ep->rep_connect_worker);
637 630
638 if (ia->ri_id->qp) { 631 if (ia->ri_id->qp) {
@@ -653,13 +646,12 @@ static int
653rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, 646rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
654 struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) 647 struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
655{ 648{
656 struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr;
657 int rc, err; 649 int rc, err;
658 650
659 pr_info("%s: r_xprt = %p\n", __func__, r_xprt); 651 trace_xprtrdma_reinsert(r_xprt);
660 652
661 rc = -EHOSTUNREACH; 653 rc = -EHOSTUNREACH;
662 if (rpcrdma_ia_open(r_xprt, sap)) 654 if (rpcrdma_ia_open(r_xprt))
663 goto out1; 655 goto out1;
664 656
665 rc = -ENOMEM; 657 rc = -ENOMEM;
@@ -676,7 +668,7 @@ rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
676 goto out3; 668 goto out3;
677 } 669 }
678 670
679 rpcrdma_create_mrs(r_xprt); 671 rpcrdma_mrs_create(r_xprt);
680 return 0; 672 return 0;
681 673
682out3: 674out3:
@@ -691,16 +683,15 @@ static int
691rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, 683rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep,
692 struct rpcrdma_ia *ia) 684 struct rpcrdma_ia *ia)
693{ 685{
694 struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr;
695 struct rdma_cm_id *id, *old; 686 struct rdma_cm_id *id, *old;
696 int err, rc; 687 int err, rc;
697 688
698 dprintk("RPC: %s: reconnecting...\n", __func__); 689 trace_xprtrdma_reconnect(r_xprt);
699 690
700 rpcrdma_ep_disconnect(ep, ia); 691 rpcrdma_ep_disconnect(ep, ia);
701 692
702 rc = -EHOSTUNREACH; 693 rc = -EHOSTUNREACH;
703 id = rpcrdma_create_id(r_xprt, ia, sap); 694 id = rpcrdma_create_id(r_xprt, ia);
704 if (IS_ERR(id)) 695 if (IS_ERR(id))
705 goto out; 696 goto out;
706 697
@@ -817,16 +808,14 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
817 int rc; 808 int rc;
818 809
819 rc = rdma_disconnect(ia->ri_id); 810 rc = rdma_disconnect(ia->ri_id);
820 if (!rc) { 811 if (!rc)
821 /* returns without wait if not connected */ 812 /* returns without wait if not connected */
822 wait_event_interruptible(ep->rep_connect_wait, 813 wait_event_interruptible(ep->rep_connect_wait,
823 ep->rep_connected != 1); 814 ep->rep_connected != 1);
824 dprintk("RPC: %s: after wait, %sconnected\n", __func__, 815 else
825 (ep->rep_connected == 1) ? "still " : "dis");
826 } else {
827 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
828 ep->rep_connected = rc; 816 ep->rep_connected = rc;
829 } 817 trace_xprtrdma_disconnect(container_of(ep, struct rpcrdma_xprt,
818 rx_ep), rc);
830 819
831 ib_drain_qp(ia->ri_id->qp); 820 ib_drain_qp(ia->ri_id->qp);
832} 821}
@@ -998,15 +987,15 @@ rpcrdma_mr_recovery_worker(struct work_struct *work)
998{ 987{
999 struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, 988 struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
1000 rb_recovery_worker.work); 989 rb_recovery_worker.work);
1001 struct rpcrdma_mw *mw; 990 struct rpcrdma_mr *mr;
1002 991
1003 spin_lock(&buf->rb_recovery_lock); 992 spin_lock(&buf->rb_recovery_lock);
1004 while (!list_empty(&buf->rb_stale_mrs)) { 993 while (!list_empty(&buf->rb_stale_mrs)) {
1005 mw = rpcrdma_pop_mw(&buf->rb_stale_mrs); 994 mr = rpcrdma_mr_pop(&buf->rb_stale_mrs);
1006 spin_unlock(&buf->rb_recovery_lock); 995 spin_unlock(&buf->rb_recovery_lock);
1007 996
1008 dprintk("RPC: %s: recovering MR %p\n", __func__, mw); 997 trace_xprtrdma_recover_mr(mr);
1009 mw->mw_xprt->rx_ia.ri_ops->ro_recover_mr(mw); 998 mr->mr_xprt->rx_ia.ri_ops->ro_recover_mr(mr);
1010 999
1011 spin_lock(&buf->rb_recovery_lock); 1000 spin_lock(&buf->rb_recovery_lock);
1012 } 1001 }
@@ -1014,20 +1003,20 @@ rpcrdma_mr_recovery_worker(struct work_struct *work)
1014} 1003}
1015 1004
1016void 1005void
1017rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw) 1006rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr)
1018{ 1007{
1019 struct rpcrdma_xprt *r_xprt = mw->mw_xprt; 1008 struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
1020 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1009 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1021 1010
1022 spin_lock(&buf->rb_recovery_lock); 1011 spin_lock(&buf->rb_recovery_lock);
1023 rpcrdma_push_mw(mw, &buf->rb_stale_mrs); 1012 rpcrdma_mr_push(mr, &buf->rb_stale_mrs);
1024 spin_unlock(&buf->rb_recovery_lock); 1013 spin_unlock(&buf->rb_recovery_lock);
1025 1014
1026 schedule_delayed_work(&buf->rb_recovery_worker, 0); 1015 schedule_delayed_work(&buf->rb_recovery_worker, 0);
1027} 1016}
1028 1017
1029static void 1018static void
1030rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt) 1019rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
1031{ 1020{
1032 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1021 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1033 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 1022 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
@@ -1036,32 +1025,32 @@ rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt)
1036 LIST_HEAD(all); 1025 LIST_HEAD(all);
1037 1026
1038 for (count = 0; count < 32; count++) { 1027 for (count = 0; count < 32; count++) {
1039 struct rpcrdma_mw *mw; 1028 struct rpcrdma_mr *mr;
1040 int rc; 1029 int rc;
1041 1030
1042 mw = kzalloc(sizeof(*mw), GFP_KERNEL); 1031 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1043 if (!mw) 1032 if (!mr)
1044 break; 1033 break;
1045 1034
1046 rc = ia->ri_ops->ro_init_mr(ia, mw); 1035 rc = ia->ri_ops->ro_init_mr(ia, mr);
1047 if (rc) { 1036 if (rc) {
1048 kfree(mw); 1037 kfree(mr);
1049 break; 1038 break;
1050 } 1039 }
1051 1040
1052 mw->mw_xprt = r_xprt; 1041 mr->mr_xprt = r_xprt;
1053 1042
1054 list_add(&mw->mw_list, &free); 1043 list_add(&mr->mr_list, &free);
1055 list_add(&mw->mw_all, &all); 1044 list_add(&mr->mr_all, &all);
1056 } 1045 }
1057 1046
1058 spin_lock(&buf->rb_mwlock); 1047 spin_lock(&buf->rb_mrlock);
1059 list_splice(&free, &buf->rb_mws); 1048 list_splice(&free, &buf->rb_mrs);
1060 list_splice(&all, &buf->rb_all); 1049 list_splice(&all, &buf->rb_all);
1061 r_xprt->rx_stats.mrs_allocated += count; 1050 r_xprt->rx_stats.mrs_allocated += count;
1062 spin_unlock(&buf->rb_mwlock); 1051 spin_unlock(&buf->rb_mrlock);
1063 1052
1064 dprintk("RPC: %s: created %u MRs\n", __func__, count); 1053 trace_xprtrdma_createmrs(r_xprt, count);
1065} 1054}
1066 1055
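[Editorial aside] rpcrdma_mrs_create(), renamed above, keeps a useful allocation shape: build up to 32 MRs on private local lists, then splice them into the shared lists under one short lock hold instead of taking the lock per object. A userspace sketch of that batching idea, with a pthread mutex standing in for rb_mrlock:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct mr { struct mr *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct mr *shared_free;
static unsigned int allocated;

static void mrs_create(unsigned int want)
{
	struct mr *local = NULL;	/* private list: no lock needed */
	unsigned int count = 0;

	while (count < want) {
		struct mr *mr = calloc(1, sizeof(*mr));

		if (!mr)
			break;
		mr->next = local;
		local = mr;
		count++;
	}

	pthread_mutex_lock(&lock);	/* one short critical section */
	while (local) {
		struct mr *mr = local;

		local = local->next;
		mr->next = shared_free;
		shared_free = mr;
	}
	allocated += count;
	pthread_mutex_unlock(&lock);

	printf("created %u MRs\n", count);
}

int main(void)
{
	mrs_create(32);
	return 0;
}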
1067static void 1056static void
@@ -1072,7 +1061,7 @@ rpcrdma_mr_refresh_worker(struct work_struct *work)
1072 struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, 1061 struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
1073 rx_buf); 1062 rx_buf);
1074 1063
1075 rpcrdma_create_mrs(r_xprt); 1064 rpcrdma_mrs_create(r_xprt);
1076} 1065}
1077 1066
1078struct rpcrdma_req * 1067struct rpcrdma_req *
@@ -1093,10 +1082,17 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
1093 return req; 1082 return req;
1094} 1083}
1095 1084
1096struct rpcrdma_rep * 1085/**
1086 * rpcrdma_create_rep - Allocate an rpcrdma_rep object
1087 * @r_xprt: controlling transport
1088 *
1089 * Returns 0 on success or a negative errno on failure.
1090 */
1091int
1097rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) 1092rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
1098{ 1093{
1099 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; 1094 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1095 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1100 struct rpcrdma_rep *rep; 1096 struct rpcrdma_rep *rep;
1101 int rc; 1097 int rc;
1102 1098
@@ -1121,12 +1117,18 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
1121 rep->rr_recv_wr.wr_cqe = &rep->rr_cqe; 1117 rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
1122 rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; 1118 rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
1123 rep->rr_recv_wr.num_sge = 1; 1119 rep->rr_recv_wr.num_sge = 1;
1124 return rep; 1120
1121 spin_lock(&buf->rb_lock);
1122 list_add(&rep->rr_list, &buf->rb_recv_bufs);
1123 spin_unlock(&buf->rb_lock);
1124 return 0;
1125 1125
1126out_free: 1126out_free:
1127 kfree(rep); 1127 kfree(rep);
1128out: 1128out:
1129 return ERR_PTR(rc); 1129 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1130 __func__, rc);
1131 return rc;
1130} 1132}
1131 1133
1132int 1134int
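[Editorial aside] Per the new kernel-doc above, rpcrdma_create_rep() now returns 0/-errno and links the new rep onto rb_recv_bufs itself, which lets callers drop the ERR_PTR handling (see the rpcrdma_buffer_create() hunk below). A sketch of that constructor-registers-itself shape, with illustrative names only:

#include <errno.h>
#include <stdlib.h>

struct rep {
	struct rep *next;
};

static struct rep *recv_bufs;	/* stand-in for rb_recv_bufs */

static int create_rep(void)
{
	struct rep *rep = calloc(1, sizeof(*rep));

	if (!rep)
		return -ENOMEM;
	rep->next = recv_bufs;	/* self-register, as the new code does */
	recv_bufs = rep;
	return 0;
}

int main(void)
{
	for (int i = 0; i < 4; i++)
		if (create_rep())
			return 1;
	return 0;
}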
@@ -1137,10 +1139,10 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
1137 1139
1138 buf->rb_max_requests = r_xprt->rx_data.max_requests; 1140 buf->rb_max_requests = r_xprt->rx_data.max_requests;
1139 buf->rb_bc_srv_max_requests = 0; 1141 buf->rb_bc_srv_max_requests = 0;
1140 spin_lock_init(&buf->rb_mwlock); 1142 spin_lock_init(&buf->rb_mrlock);
1141 spin_lock_init(&buf->rb_lock); 1143 spin_lock_init(&buf->rb_lock);
1142 spin_lock_init(&buf->rb_recovery_lock); 1144 spin_lock_init(&buf->rb_recovery_lock);
1143 INIT_LIST_HEAD(&buf->rb_mws); 1145 INIT_LIST_HEAD(&buf->rb_mrs);
1144 INIT_LIST_HEAD(&buf->rb_all); 1146 INIT_LIST_HEAD(&buf->rb_all);
1145 INIT_LIST_HEAD(&buf->rb_stale_mrs); 1147 INIT_LIST_HEAD(&buf->rb_stale_mrs);
1146 INIT_DELAYED_WORK(&buf->rb_refresh_worker, 1148 INIT_DELAYED_WORK(&buf->rb_refresh_worker,
@@ -1148,7 +1150,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
1148 INIT_DELAYED_WORK(&buf->rb_recovery_worker, 1150 INIT_DELAYED_WORK(&buf->rb_recovery_worker,
1149 rpcrdma_mr_recovery_worker); 1151 rpcrdma_mr_recovery_worker);
1150 1152
1151 rpcrdma_create_mrs(r_xprt); 1153 rpcrdma_mrs_create(r_xprt);
1152 1154
1153 INIT_LIST_HEAD(&buf->rb_send_bufs); 1155 INIT_LIST_HEAD(&buf->rb_send_bufs);
1154 INIT_LIST_HEAD(&buf->rb_allreqs); 1156 INIT_LIST_HEAD(&buf->rb_allreqs);
@@ -1167,17 +1169,10 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
1167 } 1169 }
1168 1170
1169 INIT_LIST_HEAD(&buf->rb_recv_bufs); 1171 INIT_LIST_HEAD(&buf->rb_recv_bufs);
1170 for (i = 0; i < buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS; i++) { 1172 for (i = 0; i <= buf->rb_max_requests; i++) {
1171 struct rpcrdma_rep *rep; 1173 rc = rpcrdma_create_rep(r_xprt);
1172 1174 if (rc)
1173 rep = rpcrdma_create_rep(r_xprt);
1174 if (IS_ERR(rep)) {
1175 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1176 __func__, i);
1177 rc = PTR_ERR(rep);
1178 goto out; 1175 goto out;
1179 }
1180 list_add(&rep->rr_list, &buf->rb_recv_bufs);
1181 } 1176 }
1182 1177
1183 rc = rpcrdma_sendctxs_create(r_xprt); 1178 rc = rpcrdma_sendctxs_create(r_xprt);
@@ -1229,26 +1224,26 @@ rpcrdma_destroy_req(struct rpcrdma_req *req)
1229} 1224}
1230 1225
1231static void 1226static void
1232rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf) 1227rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf)
1233{ 1228{
1234 struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, 1229 struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
1235 rx_buf); 1230 rx_buf);
1236 struct rpcrdma_ia *ia = rdmab_to_ia(buf); 1231 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1237 struct rpcrdma_mw *mw; 1232 struct rpcrdma_mr *mr;
1238 unsigned int count; 1233 unsigned int count;
1239 1234
1240 count = 0; 1235 count = 0;
1241 spin_lock(&buf->rb_mwlock); 1236 spin_lock(&buf->rb_mrlock);
1242 while (!list_empty(&buf->rb_all)) { 1237 while (!list_empty(&buf->rb_all)) {
1243 mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); 1238 mr = list_entry(buf->rb_all.next, struct rpcrdma_mr, mr_all);
1244 list_del(&mw->mw_all); 1239 list_del(&mr->mr_all);
1245 1240
1246 spin_unlock(&buf->rb_mwlock); 1241 spin_unlock(&buf->rb_mrlock);
1247 ia->ri_ops->ro_release_mr(mw); 1242 ia->ri_ops->ro_release_mr(mr);
1248 count++; 1243 count++;
1249 spin_lock(&buf->rb_mwlock); 1244 spin_lock(&buf->rb_mrlock);
1250 } 1245 }
1251 spin_unlock(&buf->rb_mwlock); 1246 spin_unlock(&buf->rb_mrlock);
1252 r_xprt->rx_stats.mrs_allocated = 0; 1247 r_xprt->rx_stats.mrs_allocated = 0;
1253 1248
1254 dprintk("RPC: %s: released %u MRs\n", __func__, count); 1249 dprintk("RPC: %s: released %u MRs\n", __func__, count);
@@ -1285,27 +1280,33 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1285 spin_unlock(&buf->rb_reqslock); 1280 spin_unlock(&buf->rb_reqslock);
1286 buf->rb_recv_count = 0; 1281 buf->rb_recv_count = 0;
1287 1282
1288 rpcrdma_destroy_mrs(buf); 1283 rpcrdma_mrs_destroy(buf);
1289} 1284}
1290 1285
1291struct rpcrdma_mw * 1286/**
1292rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt) 1287 * rpcrdma_mr_get - Allocate an rpcrdma_mr object
1288 * @r_xprt: controlling transport
1289 *
1290 * Returns an initialized rpcrdma_mr or NULL if no free
1291 * rpcrdma_mr objects are available.
1292 */
1293struct rpcrdma_mr *
1294rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt)
1293{ 1295{
1294 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1296 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1295 struct rpcrdma_mw *mw = NULL; 1297 struct rpcrdma_mr *mr = NULL;
1296 1298
1297 spin_lock(&buf->rb_mwlock); 1299 spin_lock(&buf->rb_mrlock);
1298 if (!list_empty(&buf->rb_mws)) 1300 if (!list_empty(&buf->rb_mrs))
1299 mw = rpcrdma_pop_mw(&buf->rb_mws); 1301 mr = rpcrdma_mr_pop(&buf->rb_mrs);
1300 spin_unlock(&buf->rb_mwlock); 1302 spin_unlock(&buf->rb_mrlock);
1301 1303
1302 if (!mw) 1304 if (!mr)
1303 goto out_nomws; 1305 goto out_nomrs;
1304 mw->mw_flags = 0; 1306 return mr;
1305 return mw;
1306 1307
1307out_nomws: 1308out_nomrs:
1308 dprintk("RPC: %s: no MWs available\n", __func__); 1309 trace_xprtrdma_nomrs(r_xprt);
1309 if (r_xprt->rx_ep.rep_connected != -ENODEV) 1310 if (r_xprt->rx_ep.rep_connected != -ENODEV)
1310 schedule_delayed_work(&buf->rb_refresh_worker, 0); 1311 schedule_delayed_work(&buf->rb_refresh_worker, 0);
1311 1312
@@ -1315,14 +1316,39 @@ out_nomws:
1315 return NULL; 1316 return NULL;
1316} 1317}
1317 1318
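[Editorial aside] A sketch of the rpcrdma_mr_get() pattern documented above: pop from a lock-protected free list and kick a refill when it runs dry. The pthread mutex stands in for rb_mrlock, and schedule_refill() for the rb_refresh_worker; all names are hypothetical.

#include <pthread.h>
#include <stdio.h>

struct mr { struct mr *next; };

static pthread_mutex_t mr_lock = PTHREAD_MUTEX_INITIALIZER;
static struct mr *free_mrs;

static void schedule_refill(void) { puts("refill scheduled"); }

static struct mr *mr_get(void)
{
	struct mr *mr;

	pthread_mutex_lock(&mr_lock);
	mr = free_mrs;
	if (mr)
		free_mrs = mr->next;
	pthread_mutex_unlock(&mr_lock);

	if (!mr)
		schedule_refill();	/* analogue of rb_refresh_worker */
	return mr;
}

int main(void)
{
	return mr_get() ? 1 : 0;	/* empty list: refill fires, NULL */
}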
1319static void
1320__rpcrdma_mr_put(struct rpcrdma_buffer *buf, struct rpcrdma_mr *mr)
1321{
1322 spin_lock(&buf->rb_mrlock);
1323 rpcrdma_mr_push(mr, &buf->rb_mrs);
1324 spin_unlock(&buf->rb_mrlock);
1325}
1326
1327/**
1328 * rpcrdma_mr_put - Release an rpcrdma_mr object
1329 * @mr: object to release
1330 *
1331 */
1318void 1332void
1319rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) 1333rpcrdma_mr_put(struct rpcrdma_mr *mr)
1320{ 1334{
1321 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1335 __rpcrdma_mr_put(&mr->mr_xprt->rx_buf, mr);
1336}
1337
1338/**
1339 * rpcrdma_mr_unmap_and_put - DMA unmap an MR and release it
1340 * @mr: object to release
1341 *
1342 */
1343void
1344rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr)
1345{
1346 struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
1322 1347
1323 spin_lock(&buf->rb_mwlock); 1348 trace_xprtrdma_dma_unmap(mr);
1324 rpcrdma_push_mw(mw, &buf->rb_mws); 1349 ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
1325 spin_unlock(&buf->rb_mwlock); 1350 mr->mr_sg, mr->mr_nents, mr->mr_dir);
1351 __rpcrdma_mr_put(&r_xprt->rx_buf, mr);
1326} 1352}
1327 1353
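[Editorial aside] The two kernel-doc blocks above split MR release into a plain recycle and an unmap-then-recycle path. A hedged sketch of that split; dma_unmap() is a placeholder for ib_dma_unmap_sg(), and the names are illustrative.

#include <pthread.h>
#include <stdio.h>

struct mr { struct mr *next; int mapped; };

static pthread_mutex_t mr_lock = PTHREAD_MUTEX_INITIALIZER;
static struct mr *free_mrs;

static void mr_put(struct mr *mr)
{
	pthread_mutex_lock(&mr_lock);
	mr->next = free_mrs;
	free_mrs = mr;
	pthread_mutex_unlock(&mr_lock);
}

static void dma_unmap(struct mr *mr) { mr->mapped = 0; }

static void mr_unmap_and_put(struct mr *mr)
{
	dma_unmap(mr);		/* analogous to ib_dma_unmap_sg() */
	mr_put(mr);
}

int main(void)
{
	struct mr mr = { .mapped = 1 };

	mr_unmap_and_put(&mr);
	printf("mapped=%d\n", mr.mapped);
	return 0;
}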
1328static struct rpcrdma_rep * 1354static struct rpcrdma_rep *
@@ -1359,11 +1385,11 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1359 req = rpcrdma_buffer_get_req_locked(buffers); 1385 req = rpcrdma_buffer_get_req_locked(buffers);
1360 req->rl_reply = rpcrdma_buffer_get_rep(buffers); 1386 req->rl_reply = rpcrdma_buffer_get_rep(buffers);
1361 spin_unlock(&buffers->rb_lock); 1387 spin_unlock(&buffers->rb_lock);
1388
1362 return req; 1389 return req;
1363 1390
1364out_reqbuf: 1391out_reqbuf:
1365 spin_unlock(&buffers->rb_lock); 1392 spin_unlock(&buffers->rb_lock);
1366 pr_warn("RPC: %s: out of request buffers\n", __func__);
1367 return NULL; 1393 return NULL;
1368} 1394}
1369 1395
@@ -1519,9 +1545,6 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
1519 req->rl_reply = NULL; 1545 req->rl_reply = NULL;
1520 } 1546 }
1521 1547
1522 dprintk("RPC: %s: posting %d s/g entries\n",
1523 __func__, send_wr->num_sge);
1524
1525 if (!ep->rep_send_count || 1548 if (!ep->rep_send_count ||
1526 test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) { 1549 test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) {
1527 send_wr->send_flags |= IB_SEND_SIGNALED; 1550 send_wr->send_flags |= IB_SEND_SIGNALED;
@@ -1530,14 +1553,12 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
1530 send_wr->send_flags &= ~IB_SEND_SIGNALED; 1553 send_wr->send_flags &= ~IB_SEND_SIGNALED;
1531 --ep->rep_send_count; 1554 --ep->rep_send_count;
1532 } 1555 }
1556
1533 rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail); 1557 rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail);
1558 trace_xprtrdma_post_send(req, rc);
1534 if (rc) 1559 if (rc)
1535 goto out_postsend_err; 1560 return -ENOTCONN;
1536 return 0; 1561 return 0;
1537
1538out_postsend_err:
1539 pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc);
1540 return -ENOTCONN;
1541} 1562}
1542 1563
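[Editorial aside] The send path above now fires trace_xprtrdma_post_send() on every attempt and folds the old out_postsend_err label into a direct -ENOTCONN return. A sketch of that trace-both-paths, single-errno shape; trace() and provider_post_send() are placeholders, not kernel symbols.

#include <errno.h>
#include <stdio.h>

static void trace(const char *op, int rc)
{
	fprintf(stderr, "%s: rc=%d\n", op, rc);
}

static int provider_post_send(void) { return -EINVAL; }	/* pretend failure */

static int post_send(void)
{
	int rc = provider_post_send();

	trace("post_send", rc);		/* fires on success and failure */
	if (rc)
		return -ENOTCONN;	/* callers see a single errno */
	return 0;
}

int main(void)
{
	printf("rc=%d\n", post_send());
	return 0;
}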
1543int 1564int
@@ -1550,23 +1571,20 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1550 if (!rpcrdma_dma_map_regbuf(ia, rep->rr_rdmabuf)) 1571 if (!rpcrdma_dma_map_regbuf(ia, rep->rr_rdmabuf))
1551 goto out_map; 1572 goto out_map;
1552 rc = ib_post_recv(ia->ri_id->qp, &rep->rr_recv_wr, &recv_wr_fail); 1573 rc = ib_post_recv(ia->ri_id->qp, &rep->rr_recv_wr, &recv_wr_fail);
1574 trace_xprtrdma_post_recv(rep, rc);
1553 if (rc) 1575 if (rc)
1554 goto out_postrecv; 1576 return -ENOTCONN;
1555 return 0; 1577 return 0;
1556 1578
1557out_map: 1579out_map:
1558 pr_err("rpcrdma: failed to DMA map the Receive buffer\n"); 1580 pr_err("rpcrdma: failed to DMA map the Receive buffer\n");
1559 return -EIO; 1581 return -EIO;
1560
1561out_postrecv:
1562 pr_err("rpcrdma: ib_post_recv returned %i\n", rc);
1563 return -ENOTCONN;
1564} 1582}
1565 1583
1566/** 1584/**
1567 * rpcrdma_ep_post_extra_recv - Post buffers for incoming backchannel requests 1585 * rpcrdma_ep_post_extra_recv - Post buffers for incoming backchannel requests
1568 * @r_xprt: transport associated with these backchannel resources 1586 * @r_xprt: transport associated with these backchannel resources
1569 * @min_reqs: minimum number of incoming requests expected 1587 * @count: minimum number of incoming requests expected
1570 * 1588 *
1571 * Returns zero if all requested buffers were posted, or a negative errno. 1589 * Returns zero if all requested buffers were posted, or a negative errno.
1572 */ 1590 */
@@ -1594,7 +1612,7 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
1594 1612
1595out_reqbuf: 1613out_reqbuf:
1596 spin_unlock(&buffers->rb_lock); 1614 spin_unlock(&buffers->rb_lock);
1597 pr_warn("%s: no extra receive buffers\n", __func__); 1615 trace_xprtrdma_noreps(r_xprt);
1598 return -ENOMEM; 1616 return -ENOMEM;
1599 1617
1600out_rc: 1618out_rc:
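[Editorial aside] Per the kernel-doc above, rpcrdma_ep_post_extra_recv() tries to post @count receive buffers and returns -ENOMEM when the pool runs short. A sketch of that contract; pool_pop() and post_recv() are hypothetical stand-ins.

#include <errno.h>
#include <stdio.h>

static int pool = 2;			/* pretend two buffers remain */

static int pool_pop(void) { return pool > 0 ? pool-- : -1; }
static int post_recv(void) { return 0; }	/* always succeeds here */

static int post_extra_recv(unsigned int count)
{
	int rc;

	while (count--) {
		if (pool_pop() < 0)
			return -ENOMEM;	/* mirrors the out_reqbuf: exit */
		rc = post_recv();
		if (rc)
			return rc;	/* mirrors the out_rc: exit */
	}
	return 0;
}

int main(void)
{
	printf("rc=%d\n", post_extra_recv(3));	/* two left: -ENOMEM */
	return 0;
}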
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 1342f743f1c4..69883a960a3f 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -73,11 +73,10 @@ struct rpcrdma_ia {
73 struct completion ri_remove_done; 73 struct completion ri_remove_done;
74 int ri_async_rc; 74 int ri_async_rc;
75 unsigned int ri_max_segs; 75 unsigned int ri_max_segs;
76 unsigned int ri_max_frmr_depth; 76 unsigned int ri_max_frwr_depth;
77 unsigned int ri_max_inline_write; 77 unsigned int ri_max_inline_write;
78 unsigned int ri_max_inline_read; 78 unsigned int ri_max_inline_read;
79 unsigned int ri_max_send_sges; 79 unsigned int ri_max_send_sges;
80 bool ri_reminv_expected;
81 bool ri_implicit_roundup; 80 bool ri_implicit_roundup;
82 enum ib_mr_type ri_mrtype; 81 enum ib_mr_type ri_mrtype;
83 unsigned long ri_flags; 82 unsigned long ri_flags;
@@ -101,7 +100,6 @@ struct rpcrdma_ep {
101 wait_queue_head_t rep_connect_wait; 100 wait_queue_head_t rep_connect_wait;
102 struct rpcrdma_connect_private rep_cm_private; 101 struct rpcrdma_connect_private rep_cm_private;
103 struct rdma_conn_param rep_remote_cma; 102 struct rdma_conn_param rep_remote_cma;
104 struct sockaddr_storage rep_remote_addr;
105 struct delayed_work rep_connect_worker; 103 struct delayed_work rep_connect_worker;
106}; 104};
107 105
@@ -232,29 +230,29 @@ enum {
232}; 230};
233 231
234/* 232/*
235 * struct rpcrdma_mw - external memory region metadata 233 * struct rpcrdma_mr - external memory region metadata
236 * 234 *
237 * An external memory region is any buffer or page that is registered 235 * An external memory region is any buffer or page that is registered
238 * on the fly (ie, not pre-registered). 236 * on the fly (ie, not pre-registered).
239 * 237 *
240 * Each rpcrdma_buffer has a list of free MWs anchored in rb_mws. During 238 * Each rpcrdma_buffer has a list of free MWs anchored in rb_mrs. During
241 * call_allocate, rpcrdma_buffer_get() assigns one to each segment in 239 * call_allocate, rpcrdma_buffer_get() assigns one to each segment in
242 * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep 240 * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep
243 * track of registration metadata while each RPC is pending. 241 * track of registration metadata while each RPC is pending.
244 * rpcrdma_deregister_external() uses this metadata to unmap and 242 * rpcrdma_deregister_external() uses this metadata to unmap and
245 * release these resources when an RPC is complete. 243 * release these resources when an RPC is complete.
246 */ 244 */
247enum rpcrdma_frmr_state { 245enum rpcrdma_frwr_state {
248 FRMR_IS_INVALID, /* ready to be used */ 246 FRWR_IS_INVALID, /* ready to be used */
249 FRMR_IS_VALID, /* in use */ 247 FRWR_IS_VALID, /* in use */
250 FRMR_FLUSHED_FR, /* flushed FASTREG WR */ 248 FRWR_FLUSHED_FR, /* flushed FASTREG WR */
251 FRMR_FLUSHED_LI, /* flushed LOCALINV WR */ 249 FRWR_FLUSHED_LI, /* flushed LOCALINV WR */
252}; 250};
253 251
254struct rpcrdma_frmr { 252struct rpcrdma_frwr {
255 struct ib_mr *fr_mr; 253 struct ib_mr *fr_mr;
256 struct ib_cqe fr_cqe; 254 struct ib_cqe fr_cqe;
257 enum rpcrdma_frmr_state fr_state; 255 enum rpcrdma_frwr_state fr_state;
258 struct completion fr_linv_done; 256 struct completion fr_linv_done;
259 union { 257 union {
260 struct ib_reg_wr fr_regwr; 258 struct ib_reg_wr fr_regwr;
@@ -267,26 +265,20 @@ struct rpcrdma_fmr {
267 u64 *fm_physaddrs; 265 u64 *fm_physaddrs;
268}; 266};
269 267
270struct rpcrdma_mw { 268struct rpcrdma_mr {
271 struct list_head mw_list; 269 struct list_head mr_list;
272 struct scatterlist *mw_sg; 270 struct scatterlist *mr_sg;
273 int mw_nents; 271 int mr_nents;
274 enum dma_data_direction mw_dir; 272 enum dma_data_direction mr_dir;
275 unsigned long mw_flags;
276 union { 273 union {
277 struct rpcrdma_fmr fmr; 274 struct rpcrdma_fmr fmr;
278 struct rpcrdma_frmr frmr; 275 struct rpcrdma_frwr frwr;
279 }; 276 };
280 struct rpcrdma_xprt *mw_xprt; 277 struct rpcrdma_xprt *mr_xprt;
281 u32 mw_handle; 278 u32 mr_handle;
282 u32 mw_length; 279 u32 mr_length;
283 u64 mw_offset; 280 u64 mr_offset;
284 struct list_head mw_all; 281 struct list_head mr_all;
285};
286
287/* mw_flags */
288enum {
289 RPCRDMA_MW_F_RI = 1,
290}; 282};
291 283
292/* 284/*
@@ -362,8 +354,7 @@ struct rpcrdma_req {
362 354
363/* rl_flags */ 355/* rl_flags */
364enum { 356enum {
365 RPCRDMA_REQ_F_BACKCHANNEL = 0, 357 RPCRDMA_REQ_F_PENDING = 0,
366 RPCRDMA_REQ_F_PENDING,
367 RPCRDMA_REQ_F_TX_RESOURCES, 358 RPCRDMA_REQ_F_TX_RESOURCES,
368}; 359};
369 360
@@ -374,25 +365,25 @@ rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req)
374} 365}
375 366
376static inline struct rpcrdma_req * 367static inline struct rpcrdma_req *
377rpcr_to_rdmar(struct rpc_rqst *rqst) 368rpcr_to_rdmar(const struct rpc_rqst *rqst)
378{ 369{
379 return rqst->rq_xprtdata; 370 return rqst->rq_xprtdata;
380} 371}
381 372
382static inline void 373static inline void
383rpcrdma_push_mw(struct rpcrdma_mw *mw, struct list_head *list) 374rpcrdma_mr_push(struct rpcrdma_mr *mr, struct list_head *list)
384{ 375{
385 list_add_tail(&mw->mw_list, list); 376 list_add_tail(&mr->mr_list, list);
386} 377}
387 378
388static inline struct rpcrdma_mw * 379static inline struct rpcrdma_mr *
389rpcrdma_pop_mw(struct list_head *list) 380rpcrdma_mr_pop(struct list_head *list)
390{ 381{
391 struct rpcrdma_mw *mw; 382 struct rpcrdma_mr *mr;
392 383
393 mw = list_first_entry(list, struct rpcrdma_mw, mw_list); 384 mr = list_first_entry(list, struct rpcrdma_mr, mr_list);
394 list_del(&mw->mw_list); 385 list_del(&mr->mr_list);
395 return mw; 386 return mr;
396} 387}
397 388
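[Editorial aside] The renamed rpcrdma_mr_push()/rpcrdma_mr_pop() helpers above implement the free-list traffic described in the struct rpcrdma_mr comment: FIFO push to the tail, pop from the head. A self-contained userspace sketch using a tiny stand-in for the kernel's struct list_head:

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_INIT(name) { &(name), &(name) }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

static void list_del(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

struct mr {
	struct list_head mr_list;
	int id;
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static void mr_push(struct mr *mr, struct list_head *list)
{
	list_add_tail(&mr->mr_list, list);
}

static struct mr *mr_pop(struct list_head *list)
{
	struct mr *mr = container_of(list->next, struct mr, mr_list);

	list_del(&mr->mr_list);
	return mr;
}

int main(void)
{
	struct list_head free_mrs = LIST_INIT(free_mrs);
	struct mr a = { .id = 1 }, b = { .id = 2 };

	mr_push(&a, &free_mrs);
	mr_push(&b, &free_mrs);
	printf("popped id=%d\n", mr_pop(&free_mrs)->id);	/* FIFO: 1 */
	return 0;
}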
398/* 389/*
@@ -402,8 +393,8 @@ rpcrdma_pop_mw(struct list_head *list)
402 * One of these is associated with a transport instance 393 * One of these is associated with a transport instance
403 */ 394 */
404struct rpcrdma_buffer { 395struct rpcrdma_buffer {
405 spinlock_t rb_mwlock; /* protect rb_mws list */ 396 spinlock_t rb_mrlock; /* protect rb_mrs list */
406 struct list_head rb_mws; 397 struct list_head rb_mrs;
407 struct list_head rb_all; 398 struct list_head rb_all;
408 399
409 unsigned long rb_sc_head; 400 unsigned long rb_sc_head;
@@ -438,13 +429,11 @@ struct rpcrdma_buffer {
438 * This data should be set with mount options 429 * This data should be set with mount options
439 */ 430 */
440struct rpcrdma_create_data_internal { 431struct rpcrdma_create_data_internal {
441 struct sockaddr_storage addr; /* RDMA server address */
442 unsigned int max_requests; /* max requests (slots) in flight */ 432 unsigned int max_requests; /* max requests (slots) in flight */
443 unsigned int rsize; /* mount rsize - max read hdr+data */ 433 unsigned int rsize; /* mount rsize - max read hdr+data */
444 unsigned int wsize; /* mount wsize - max write hdr+data */ 434 unsigned int wsize; /* mount wsize - max write hdr+data */
445 unsigned int inline_rsize; /* max non-rdma read data payload */ 435 unsigned int inline_rsize; /* max non-rdma read data payload */
446 unsigned int inline_wsize; /* max non-rdma write data payload */ 436 unsigned int inline_wsize; /* max non-rdma write data payload */
447 unsigned int padding; /* non-rdma write header padding */
448}; 437};
449 438
450/* 439/*
@@ -484,17 +473,19 @@ struct rpcrdma_memreg_ops {
484 struct rpcrdma_mr_seg * 473 struct rpcrdma_mr_seg *
485 (*ro_map)(struct rpcrdma_xprt *, 474 (*ro_map)(struct rpcrdma_xprt *,
486 struct rpcrdma_mr_seg *, int, bool, 475 struct rpcrdma_mr_seg *, int, bool,
487 struct rpcrdma_mw **); 476 struct rpcrdma_mr **);
477 void (*ro_reminv)(struct rpcrdma_rep *rep,
478 struct list_head *mrs);
488 void (*ro_unmap_sync)(struct rpcrdma_xprt *, 479 void (*ro_unmap_sync)(struct rpcrdma_xprt *,
489 struct list_head *); 480 struct list_head *);
490 void (*ro_recover_mr)(struct rpcrdma_mw *); 481 void (*ro_recover_mr)(struct rpcrdma_mr *mr);
491 int (*ro_open)(struct rpcrdma_ia *, 482 int (*ro_open)(struct rpcrdma_ia *,
492 struct rpcrdma_ep *, 483 struct rpcrdma_ep *,
493 struct rpcrdma_create_data_internal *); 484 struct rpcrdma_create_data_internal *);
494 size_t (*ro_maxpages)(struct rpcrdma_xprt *); 485 size_t (*ro_maxpages)(struct rpcrdma_xprt *);
495 int (*ro_init_mr)(struct rpcrdma_ia *, 486 int (*ro_init_mr)(struct rpcrdma_ia *,
496 struct rpcrdma_mw *); 487 struct rpcrdma_mr *);
497 void (*ro_release_mr)(struct rpcrdma_mw *); 488 void (*ro_release_mr)(struct rpcrdma_mr *mr);
498 const char *ro_displayname; 489 const char *ro_displayname;
499 const int ro_send_w_inv_ok; 490 const int ro_send_w_inv_ok;
500}; 491};
@@ -525,6 +516,18 @@ struct rpcrdma_xprt {
525#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt) 516#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt)
526#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data) 517#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)
527 518
519static inline const char *
520rpcrdma_addrstr(const struct rpcrdma_xprt *r_xprt)
521{
522 return r_xprt->rx_xprt.address_strings[RPC_DISPLAY_ADDR];
523}
524
525static inline const char *
526rpcrdma_portstr(const struct rpcrdma_xprt *r_xprt)
527{
528 return r_xprt->rx_xprt.address_strings[RPC_DISPLAY_PORT];
529}
530
528/* Setting this to 0 ensures interoperability with early servers. 531/* Setting this to 0 ensures interoperability with early servers.
529 * Setting this to 1 enhances certain unaligned read/write performance. 532 * Setting this to 1 enhances certain unaligned read/write performance.
530 * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */ 533 * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
@@ -538,7 +541,7 @@ extern unsigned int xprt_rdma_memreg_strategy;
538/* 541/*
539 * Interface Adapter calls - xprtrdma/verbs.c 542 * Interface Adapter calls - xprtrdma/verbs.c
540 */ 543 */
541int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr); 544int rpcrdma_ia_open(struct rpcrdma_xprt *xprt);
542void rpcrdma_ia_remove(struct rpcrdma_ia *ia); 545void rpcrdma_ia_remove(struct rpcrdma_ia *ia);
543void rpcrdma_ia_close(struct rpcrdma_ia *); 546void rpcrdma_ia_close(struct rpcrdma_ia *);
544bool frwr_is_supported(struct rpcrdma_ia *); 547bool frwr_is_supported(struct rpcrdma_ia *);
@@ -564,22 +567,23 @@ int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_rep *);
564 * Buffer calls - xprtrdma/verbs.c 567 * Buffer calls - xprtrdma/verbs.c
565 */ 568 */
566struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *); 569struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *);
567struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *);
568void rpcrdma_destroy_req(struct rpcrdma_req *); 570void rpcrdma_destroy_req(struct rpcrdma_req *);
571int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt);
569int rpcrdma_buffer_create(struct rpcrdma_xprt *); 572int rpcrdma_buffer_create(struct rpcrdma_xprt *);
570void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); 573void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
571struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf); 574struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf);
572void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); 575void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc);
573 576
574struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *); 577struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt);
575void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *); 578void rpcrdma_mr_put(struct rpcrdma_mr *mr);
579void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr);
580void rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr);
581
576struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); 582struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
577void rpcrdma_buffer_put(struct rpcrdma_req *); 583void rpcrdma_buffer_put(struct rpcrdma_req *);
578void rpcrdma_recv_buffer_get(struct rpcrdma_req *); 584void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
579void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); 585void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
580 586
581void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *);
582
583struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction, 587struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction,
584 gfp_t); 588 gfp_t);
585bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); 589bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *);
@@ -663,7 +667,7 @@ int xprt_rdma_bc_up(struct svc_serv *, struct net *);
663size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *); 667size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *);
664int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int); 668int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
665void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *); 669void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
666int rpcrdma_bc_marshal_reply(struct rpc_rqst *); 670int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst);
667void xprt_rdma_bc_free_rqst(struct rpc_rqst *); 671void xprt_rdma_bc_free_rqst(struct rpc_rqst *);
668void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int); 672void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int);
669#endif /* CONFIG_SUNRPC_BACKCHANNEL */ 673#endif /* CONFIG_SUNRPC_BACKCHANNEL */
@@ -671,3 +675,5 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int);
671extern struct xprt_class xprt_rdma_bc; 675extern struct xprt_class xprt_rdma_bc;
672 676
673#endif /* _LINUX_SUNRPC_XPRT_RDMA_H */ 677#endif /* _LINUX_SUNRPC_XPRT_RDMA_H */
678
679#include <trace/events/rpcrdma.h>
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 6d0cc3b8f932..18803021f242 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -52,6 +52,8 @@
52 52
53#include "sunrpc.h" 53#include "sunrpc.h"
54 54
55#define RPC_TCP_READ_CHUNK_SZ (3*512*1024)
56
55static void xs_close(struct rpc_xprt *xprt); 57static void xs_close(struct rpc_xprt *xprt);
56static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, 58static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
57 struct socket *sock); 59 struct socket *sock);
@@ -1003,6 +1005,7 @@ static void xs_local_data_receive(struct sock_xprt *transport)
1003 struct sock *sk; 1005 struct sock *sk;
1004 int err; 1006 int err;
1005 1007
1008restart:
1006 mutex_lock(&transport->recv_mutex); 1009 mutex_lock(&transport->recv_mutex);
1007 sk = transport->inet; 1010 sk = transport->inet;
1008 if (sk == NULL) 1011 if (sk == NULL)
@@ -1016,6 +1019,11 @@ static void xs_local_data_receive(struct sock_xprt *transport)
1016 } 1019 }
1017 if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) 1020 if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
1018 break; 1021 break;
1022 if (need_resched()) {
1023 mutex_unlock(&transport->recv_mutex);
1024 cond_resched();
1025 goto restart;
1026 }
1019 } 1027 }
1020out: 1028out:
1021 mutex_unlock(&transport->recv_mutex); 1029 mutex_unlock(&transport->recv_mutex);
@@ -1094,6 +1102,7 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
1094 struct sock *sk; 1102 struct sock *sk;
1095 int err; 1103 int err;
1096 1104
1105restart:
1097 mutex_lock(&transport->recv_mutex); 1106 mutex_lock(&transport->recv_mutex);
1098 sk = transport->inet; 1107 sk = transport->inet;
1099 if (sk == NULL) 1108 if (sk == NULL)
@@ -1107,6 +1116,11 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
1107 } 1116 }
1108 if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) 1117 if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
1109 break; 1118 break;
1119 if (need_resched()) {
1120 mutex_unlock(&transport->recv_mutex);
1121 cond_resched();
1122 goto restart;
1123 }
1110 } 1124 }
1111out: 1125out:
1112 mutex_unlock(&transport->recv_mutex); 1126 mutex_unlock(&transport->recv_mutex);
@@ -1479,6 +1493,7 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns
1479 .offset = offset, 1493 .offset = offset,
1480 .count = len, 1494 .count = len,
1481 }; 1495 };
1496 size_t ret;
1482 1497
1483 dprintk("RPC: xs_tcp_data_recv started\n"); 1498 dprintk("RPC: xs_tcp_data_recv started\n");
1484 do { 1499 do {
@@ -1507,9 +1522,14 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns
1507 /* Skip over any trailing bytes on short reads */ 1522 /* Skip over any trailing bytes on short reads */
1508 xs_tcp_read_discard(transport, &desc); 1523 xs_tcp_read_discard(transport, &desc);
1509 } while (desc.count); 1524 } while (desc.count);
1525 ret = len - desc.count;
1526 if (ret < rd_desc->count)
1527 rd_desc->count -= ret;
1528 else
1529 rd_desc->count = 0;
1510 trace_xs_tcp_data_recv(transport); 1530 trace_xs_tcp_data_recv(transport);
1511 dprintk("RPC: xs_tcp_data_recv done\n"); 1531 dprintk("RPC: xs_tcp_data_recv done\n");
1512 return len - desc.count; 1532 return ret;
1513} 1533}
1514 1534
1515static void xs_tcp_data_receive(struct sock_xprt *transport) 1535static void xs_tcp_data_receive(struct sock_xprt *transport)
@@ -1517,30 +1537,34 @@ static void xs_tcp_data_receive(struct sock_xprt *transport)
1517 struct rpc_xprt *xprt = &transport->xprt; 1537 struct rpc_xprt *xprt = &transport->xprt;
1518 struct sock *sk; 1538 struct sock *sk;
1519 read_descriptor_t rd_desc = { 1539 read_descriptor_t rd_desc = {
1520 .count = 2*1024*1024,
1521 .arg.data = xprt, 1540 .arg.data = xprt,
1522 }; 1541 };
1523 unsigned long total = 0; 1542 unsigned long total = 0;
1524 int loop;
1525 int read = 0; 1543 int read = 0;
1526 1544
1545restart:
1527 mutex_lock(&transport->recv_mutex); 1546 mutex_lock(&transport->recv_mutex);
1528 sk = transport->inet; 1547 sk = transport->inet;
1529 if (sk == NULL) 1548 if (sk == NULL)
1530 goto out; 1549 goto out;
1531 1550
1532 /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ 1551 /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */
1533 for (loop = 0; loop < 64; loop++) { 1552 for (;;) {
1553 rd_desc.count = RPC_TCP_READ_CHUNK_SZ;
1534 lock_sock(sk); 1554 lock_sock(sk);
1535 read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); 1555 read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
1536 if (read <= 0) { 1556 if (rd_desc.count != 0 || read < 0) {
1537 clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); 1557 clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
1538 release_sock(sk); 1558 release_sock(sk);
1539 break; 1559 break;
1540 } 1560 }
1541 release_sock(sk); 1561 release_sock(sk);
1542 total += read; 1562 total += read;
1543 rd_desc.count = 65536; 1563 if (need_resched()) {
1564 mutex_unlock(&transport->recv_mutex);
1565 cond_resched();
1566 goto restart;
1567 }
1544 } 1568 }
1545 if (test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) 1569 if (test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
1546 queue_work(xprtiod_workqueue, &transport->recv_worker); 1570 queue_work(xprtiod_workqueue, &transport->recv_worker);
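[Editorial aside] To close: the xs_tcp_data_receive() rewrite above bounds each tcp_read_sock() pass at RPC_TCP_READ_CHUNK_SZ and drops the mutex to cond_resched() between passes, rather than looping 64 times over a fixed 64KB descriptor. A userspace sketch of consuming a stream in bounded chunks with a yield between them; sched_yield() stands in for cond_resched().

#include <sched.h>
#include <stdio.h>
#include <string.h>

#define READ_CHUNK_SZ 8		/* the kernel uses 3*512*1024 */

static const char stream[] = "a very long incoming byte stream";

int main(void)
{
	size_t off = 0, len = strlen(stream);

	while (off < len) {
		size_t n = len - off;

		if (n > READ_CHUNK_SZ)
			n = READ_CHUNK_SZ;
		printf("consumed %zu bytes\n", n);
		off += n;
		sched_yield();	/* let other work run between chunks */
	}
	return 0;
}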