author		Linus Torvalds <torvalds@linux-foundation.org>	2015-04-26 20:33:59 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-04-26 20:33:59 -0400
commit		59953fba87e5e535657403cc6439d24187929559 (patch)
tree		4f92cc3bcacf052cb3fb895512af5a7d3dad86cb
parent		9ec3a646fe09970f801ab15e0f1694060b9f19af (diff)
parent		f139b6c676c7e49b66016b28bf3f8ec5c54be891 (diff)
Merge tag 'nfs-for-4.1-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
Pull NFS client updates from Trond Myklebust:
 "Another set of mainly bugfixes and a couple of cleanups. No new
  functionality in this round.

  Highlights include:

  Stable patches:
   - Fix a regression in /proc/self/mountstats
   - Fix the pNFS flexfiles O_DIRECT support
   - Fix high load average due to callback thread sleeping

  Bugfixes:
   - Various patches to fix the pNFS layoutcommit support
   - Do not cache pNFS deviceids unless server notifications are enabled
   - Fix a SUNRPC transport reconnection regression
   - make debugfs file creation failure non-fatal in SUNRPC
   - Another fix for circular directory warnings on NFSv4 "junctioned" mountpoints
   - Fix locking around NFSv4.2 fallocate() support
   - Truncating NFSv4 file opens should also sync O_DIRECT writes
   - Prevent infinite loop in rpcrdma_ep_create()

  Features:
   - Various improvements to the RDMA transport code's handling of memory registration
   - Various code cleanups"

* tag 'nfs-for-4.1-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (55 commits)
  fs/nfs: fix new compiler warning about boolean in switch
  nfs: Remove unneeded casts in nfs
  NFS: Don't attempt to decode missing directory entries
  Revert "nfs: replace nfs_add_stats with nfs_inc_stats when add one"
  NFS: Rename idmap.c to nfs4idmap.c
  NFS: Move nfs_idmap.h into fs/nfs/
  NFS: Remove CONFIG_NFS_V4 checks from nfs_idmap.h
  NFS: Add a stub for GETDEVICELIST
  nfs: remove WARN_ON_ONCE from nfs_direct_good_bytes
  nfs: fix DIO good bytes calculation
  nfs: Fetch MOUNTED_ON_FILEID when updating an inode
  sunrpc: make debugfs file creation failure non-fatal
  nfs: fix high load average due to callback thread sleeping
  NFS: Reduce time spent holding the i_mutex during fallocate()
  NFS: Don't zap caches on fallocate()
  xprtrdma: Make rpcrdma_{un}map_one() into inline functions
  xprtrdma: Handle non-SEND completions via a callout
  xprtrdma: Add "open" memreg op
  xprtrdma: Add "destroy MRs" memreg op
  xprtrdma: Add "reset MRs" memreg op
  ...
-rw-r--r--  fs/nfs/Makefile | 2
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 1
-rw-r--r--  fs/nfs/blocklayout/dev.c | 2
-rw-r--r--  fs/nfs/callback.c | 6
-rw-r--r--  fs/nfs/client.c | 1
-rw-r--r--  fs/nfs/delegation.c | 4
-rw-r--r--  fs/nfs/dir.c | 4
-rw-r--r--  fs/nfs/direct.c | 39
-rw-r--r--  fs/nfs/file.c | 3
-rw-r--r--  fs/nfs/filelayout/filelayout.c | 10
-rw-r--r--  fs/nfs/filelayout/filelayoutdev.c | 2
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c | 12
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayoutdev.c | 2
-rw-r--r--  fs/nfs/inode.c | 36
-rw-r--r--  fs/nfs/nfs42proc.c | 31
-rw-r--r--  fs/nfs/nfs42xdr.c | 20
-rw-r--r--  fs/nfs/nfs4client.c | 2
-rw-r--r--  fs/nfs/nfs4file.c | 20
-rw-r--r--  fs/nfs/nfs4idmap.c (renamed from fs/nfs/idmap.c) | 2
-rw-r--r--  fs/nfs/nfs4idmap.h (renamed from include/linux/nfs_idmap.h) | 13
-rw-r--r--  fs/nfs/nfs4proc.c | 23
-rw-r--r--  fs/nfs/nfs4state.c | 2
-rw-r--r--  fs/nfs/nfs4super.c | 7
-rw-r--r--  fs/nfs/nfs4sysctl.c | 2
-rw-r--r--  fs/nfs/nfs4xdr.c | 22
-rw-r--r--  fs/nfs/nfstrace.c | 3
-rw-r--r--  fs/nfs/objlayout/objio_osd.c | 4
-rw-r--r--  fs/nfs/pnfs.c | 68
-rw-r--r--  fs/nfs/pnfs.h | 24
-rw-r--r--  fs/nfs/pnfs_dev.c | 21
-rw-r--r--  fs/nfs/pnfs_nfs.c | 12
-rw-r--r--  fs/nfs/read.c | 2
-rw-r--r--  fs/nfs/super.c | 4
-rw-r--r--  fs/nfs/write.c | 15
-rw-r--r--  include/linux/nfs_fs.h | 1
-rw-r--r--  include/linux/nfs_xdr.h | 6
-rw-r--r--  include/linux/sunrpc/msg_prot.h | 8
-rw-r--r--  include/linux/sunrpc/xprtrdma.h | 5
-rw-r--r--  include/uapi/linux/nfs_idmap.h | 2
-rw-r--r--  net/sunrpc/sched.c | 4
-rw-r--r--  net/sunrpc/xprt.c | 22
-rw-r--r--  net/sunrpc/xprtrdma/Makefile | 3
-rw-r--r--  net/sunrpc/xprtrdma/fmr_ops.c | 208
-rw-r--r--  net/sunrpc/xprtrdma/frwr_ops.c | 353
-rw-r--r--  net/sunrpc/xprtrdma/physical_ops.c | 94
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c | 87
-rw-r--r--  net/sunrpc/xprtrdma/transport.c | 61
-rw-r--r--  net/sunrpc/xprtrdma/verbs.c | 699
-rw-r--r--  net/sunrpc/xprtrdma/xprt_rdma.h | 90
49 files changed, 1150 insertions, 914 deletions
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 1e987acf20c9..8664417955a2 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -22,7 +22,7 @@ nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
 obj-$(CONFIG_NFS_V4) += nfsv4.o
 CFLAGS_nfs4trace.o += -I$(src)
 nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \
-	   delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \
+	   delegation.o nfs4idmap.o callback.o callback_xdr.o callback_proc.o \
 	   nfs4namespace.o nfs4getroot.o nfs4client.o nfs4session.o \
 	   dns_resolve.o nfs4trace.o
 nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 1cac3c175d18..d2554fe140a3 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -890,6 +890,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
 	.free_deviceid_node		= bl_free_deviceid_node,
 	.pg_read_ops			= &bl_pg_read_ops,
 	.pg_write_ops			= &bl_pg_write_ops,
+	.sync				= pnfs_generic_sync,
 };
 
 static int __init nfs4blocklayout_init(void)
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index 5aed4f98df41..e535599a0719 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -33,7 +33,7 @@ bl_free_deviceid_node(struct nfs4_deviceid_node *d)
 		container_of(d, struct pnfs_block_dev, node);
 
 	bl_free_device(dev);
-	kfree(dev);
+	kfree_rcu(dev, node.rcu);
 }
 
 static int
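The kfree_rcu() conversion above (repeated for the file, flexfile and object layout drivers further down, and backed by the rcu_head member added to struct nfs4_deviceid_node in pnfs.h) defers the actual free until an RCU grace period has elapsed, so lockless readers that obtained the node under rcu_read_lock() never touch freed memory. A minimal, self-contained sketch of the general pattern, using hypothetical names (demo_node, demo_read, demo_release) rather than the NFS structures:

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct demo_node {
		int value;
		struct rcu_head rcu;	/* storage used by the deferred free */
	};

	/* Reader: dereference the published pointer under rcu_read_lock(). */
	static int demo_read(struct demo_node __rcu **slot)
	{
		struct demo_node *n;
		int v = -1;

		rcu_read_lock();
		n = rcu_dereference(*slot);
		if (n)
			v = n->value;
		rcu_read_unlock();
		return v;
	}

	/* Writer: unpublish the node, then free it after a grace period. */
	static void demo_release(struct demo_node __rcu **slot, struct demo_node *n)
	{
		rcu_assign_pointer(*slot, NULL);
		kfree_rcu(n, rcu);	/* equivalent to call_rcu() followed by kfree() */
	}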
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 351be9205bf8..8d129bb7355a 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -128,7 +128,7 @@ nfs41_callback_svc(void *vrqstp)
 		if (try_to_freeze())
 			continue;
 
-		prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
 		spin_lock_bh(&serv->sv_cb_lock);
 		if (!list_empty(&serv->sv_cb_list)) {
 			req = list_first_entry(&serv->sv_cb_list,
@@ -142,10 +142,10 @@ nfs41_callback_svc(void *vrqstp)
 					error);
 		} else {
 			spin_unlock_bh(&serv->sv_cb_lock);
-			/* schedule_timeout to game the hung task watchdog */
-			schedule_timeout(60 * HZ);
+			schedule();
 			finish_wait(&serv->sv_cb_waitq, &wq);
 		}
+		flush_signals(current);
 	}
 	return 0;
 }
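This is the "fix high load average due to callback thread sleeping" change from the changelog: a task sleeping in TASK_UNINTERRUPTIBLE is counted in the load average, and the old code parked the NFSv4.1 callback thread that way, using a 60-second schedule_timeout() only to placate the hung-task watchdog. Waiting in TASK_INTERRUPTIBLE and calling plain schedule() avoids both problems. A minimal sketch of the resulting wait loop, with hypothetical names (demo_svc, demo_svc_has_work, demo_svc_process_one) standing in for the callback service structures:

	#include <linux/kthread.h>
	#include <linux/sched.h>
	#include <linux/wait.h>

	static int demo_svc_thread(void *data)
	{
		struct demo_svc *svc = data;	/* hypothetical: waitqueue + request list */
		DEFINE_WAIT(wait);

		while (!kthread_should_stop()) {
			prepare_to_wait(&svc->waitq, &wait, TASK_INTERRUPTIBLE);
			if (demo_svc_has_work(svc)) {
				finish_wait(&svc->waitq, &wait);
				demo_svc_process_one(svc);
			} else {
				schedule();	/* interruptible sleep does not inflate loadavg */
				finish_wait(&svc->waitq, &wait);
			}
			flush_signals(current);	/* discard stray signals, as in the hunk above */
		}
		return 0;
	}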
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 19874151e95c..892aefff3630 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -31,7 +31,6 @@
 #include <linux/lockd/bind.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
-#include <linux/nfs_idmap.h>
 #include <linux/vfs.h>
 #include <linux/inet.h>
 #include <linux/in6.h>
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index a6ad68865880..029d688a969f 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -378,7 +378,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 		if (freeme == NULL)
 			goto out;
 	}
-	list_add_rcu(&delegation->super_list, &server->delegations);
+	list_add_tail_rcu(&delegation->super_list, &server->delegations);
 	rcu_assign_pointer(nfsi->delegation, delegation);
 	delegation = NULL;
 
@@ -514,7 +514,7 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode)
 
 	delegation = nfs_inode_detach_delegation(inode);
 	if (delegation != NULL)
-		nfs_do_return_delegation(inode, delegation, 0);
+		nfs_do_return_delegation(inode, delegation, 1);
 }
 
 /**
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 1e51ecd61854..b2c8b31b2be7 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -543,6 +543,9 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
 	if (scratch == NULL)
 		return -ENOMEM;
 
+	if (buflen == 0)
+		goto out_nopages;
+
 	xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
 	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
 
@@ -564,6 +567,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
 			break;
 	} while (!entry->eof);
 
+out_nopages:
 	if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
 		array = nfs_readdir_get_array(page);
 		if (!IS_ERR(array)) {
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index b2cbc3a6cdd9..38678d9a5cc4 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -129,22 +129,25 @@ nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
 	int i;
 	ssize_t count;
 
-	WARN_ON_ONCE(hdr->pgio_mirror_idx >= dreq->mirror_count);
-
-	count = dreq->mirrors[hdr->pgio_mirror_idx].count;
-	if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) {
-		count = hdr->io_start + hdr->good_bytes - dreq->io_start;
-		dreq->mirrors[hdr->pgio_mirror_idx].count = count;
-	}
-
-	/* update the dreq->count by finding the minimum agreed count from all
-	 * mirrors */
-	count = dreq->mirrors[0].count;
+	if (dreq->mirror_count == 1) {
+		dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes;
+		dreq->count += hdr->good_bytes;
+	} else {
+		/* mirrored writes */
+		count = dreq->mirrors[hdr->pgio_mirror_idx].count;
+		if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) {
+			count = hdr->io_start + hdr->good_bytes - dreq->io_start;
+			dreq->mirrors[hdr->pgio_mirror_idx].count = count;
+		}
+		/* update the dreq->count by finding the minimum agreed count from all
+		 * mirrors */
+		count = dreq->mirrors[0].count;
 
-	for (i = 1; i < dreq->mirror_count; i++)
-		count = min(count, dreq->mirrors[i].count);
+		for (i = 1; i < dreq->mirror_count; i++)
+			count = min(count, dreq->mirrors[i].count);
 
-	dreq->count = count;
+		dreq->count = count;
+	}
 }
 
 /*
@@ -258,18 +261,11 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
 	if (!IS_SWAPFILE(inode))
 		return 0;
 
-#ifndef CONFIG_NFS_SWAP
-	dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n",
-			iocb->ki_filp, (long long) pos, iter->nr_segs);
-
-	return -EINVAL;
-#else
 	VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
 
 	if (iov_iter_rw(iter) == READ)
 		return nfs_file_direct_read(iocb, iter, pos);
 	return nfs_file_direct_write(iocb, iter);
-#endif /* CONFIG_NFS_SWAP */
 }
 
 static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
@@ -1030,6 +1026,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 		if (i_size_read(inode) < iocb->ki_pos)
 			i_size_write(inode, iocb->ki_pos);
 		spin_unlock(&inode->i_lock);
+		generic_write_sync(file, pos, result);
 	}
 	}
 	nfs_direct_req_release(dreq);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index c40e4363e746..8b8d83a526ce 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -280,6 +280,7 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 
 	trace_nfs_fsync_enter(inode);
 
+	nfs_inode_dio_wait(inode);
 	do {
 		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
 		if (ret != 0)
@@ -782,7 +783,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 	 * Flush all pending writes before doing anything
 	 * with locks..
 	 */
-	nfs_sync_mapping(filp->f_mapping);
+	vfs_fsync(filp, 0);
 
 	l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
 	if (!IS_ERR(l_ctx)) {
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 91e88a7ecef0..a46bf6de9ce4 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -258,7 +258,8 @@ filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
 	    hdr->res.verf->committed != NFS_DATA_SYNC)
 		return;
 
-	pnfs_set_layoutcommit(hdr);
+	pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
+			hdr->mds_offset + hdr->res.count);
 	dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
 		(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
 }
@@ -373,7 +374,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
 	}
 
 	if (data->verf.committed == NFS_UNSTABLE)
-		pnfs_commit_set_layoutcommit(data);
+		pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
 
 	return 0;
 }
@@ -1086,7 +1087,7 @@ filelayout_alloc_deviceid_node(struct nfs_server *server,
 }
 
 static void
-filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
+filelayout_free_deviceid_node(struct nfs4_deviceid_node *d)
 {
 	nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node));
 }
@@ -1137,7 +1138,8 @@ static struct pnfs_layoutdriver_type filelayout_type = {
 	.read_pagelist		= filelayout_read_pagelist,
 	.write_pagelist		= filelayout_write_pagelist,
 	.alloc_deviceid_node	= filelayout_alloc_deviceid_node,
-	.free_deviceid_node	= filelayout_free_deveiceid_node,
+	.free_deviceid_node	= filelayout_free_deviceid_node,
+	.sync			= pnfs_nfs_generic_sync,
 };
 
 static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 4f372e224603..4946ef40ba87 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -55,7 +55,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
 			nfs4_pnfs_ds_put(ds);
 	}
 	kfree(dsaddr->stripe_indices);
-	kfree(dsaddr);
+	kfree_rcu(dsaddr, id_node.rcu);
 }
 
 /* Decode opaque device data and return the result */
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 315cc68945b9..7d05089e52d6 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -11,10 +11,10 @@
 #include <linux/module.h>
 
 #include <linux/sunrpc/metrics.h>
-#include <linux/nfs_idmap.h>
 
 #include "flexfilelayout.h"
 #include "../nfs4session.h"
+#include "../nfs4idmap.h"
 #include "../internal.h"
 #include "../delegation.h"
 #include "../nfs4trace.h"
@@ -891,7 +891,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
 static void
 ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
 {
-	pnfs_set_layoutcommit(hdr);
+	pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
+			hdr->mds_offset + hdr->res.count);
 	dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
 		(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
 }
@@ -1074,7 +1075,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
 	}
 
 	if (data->verf.committed == NFS_UNSTABLE)
-		pnfs_commit_set_layoutcommit(data);
+		pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
 
 	return 0;
 }
@@ -1414,7 +1415,7 @@ ff_layout_get_ds_info(struct inode *inode)
 }
 
 static void
-ff_layout_free_deveiceid_node(struct nfs4_deviceid_node *d)
+ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d)
 {
 	nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds,
 						  id_node));
@@ -1498,7 +1499,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
 	.pg_read_ops		= &ff_layout_pg_read_ops,
 	.pg_write_ops		= &ff_layout_pg_write_ops,
 	.get_ds_info		= ff_layout_get_ds_info,
-	.free_deviceid_node	= ff_layout_free_deveiceid_node,
+	.free_deviceid_node	= ff_layout_free_deviceid_node,
 	.mark_request_commit	= pnfs_layout_mark_request_commit,
 	.clear_request_commit	= pnfs_generic_clear_request_commit,
 	.scan_commit_lists	= pnfs_generic_scan_commit_lists,
@@ -1508,6 +1509,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
 	.write_pagelist		= ff_layout_write_pagelist,
 	.alloc_deviceid_node	= ff_layout_alloc_deviceid_node,
 	.encode_layoutreturn	= ff_layout_encode_layoutreturn,
+	.sync			= pnfs_nfs_generic_sync,
 };
 
 static int __init nfs4flexfilelayout_init(void)
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index e2c01f204a95..77a2d026aa12 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -30,7 +30,7 @@ void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
 {
 	nfs4_print_deviceid(&mirror_ds->id_node.deviceid);
 	nfs4_pnfs_ds_put(mirror_ds->ds);
-	kfree(mirror_ds);
+	kfree_rcu(mirror_ds, id_node.rcu);
 }
 
 /* Decode opaque device data and construct new_ds using it */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 3689e95da79a..f734562c6d24 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -133,6 +133,13 @@ void nfs_evict_inode(struct inode *inode)
 	nfs_clear_inode(inode);
 }
 
+int nfs_sync_inode(struct inode *inode)
+{
+	nfs_inode_dio_wait(inode);
+	return nfs_wb_all(inode);
+}
+EXPORT_SYMBOL_GPL(nfs_sync_inode);
+
 /**
  * nfs_sync_mapping - helper to flush all mmapped dirty data to disk
  */
@@ -192,7 +199,6 @@ void nfs_zap_caches(struct inode *inode)
 	nfs_zap_caches_locked(inode);
 	spin_unlock(&inode->i_lock);
 }
-EXPORT_SYMBOL_GPL(nfs_zap_caches);
 
 void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
 {
@@ -525,10 +531,8 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 	trace_nfs_setattr_enter(inode);
 
 	/* Write all dirty data */
-	if (S_ISREG(inode->i_mode)) {
-		nfs_inode_dio_wait(inode);
-		nfs_wb_all(inode);
-	}
+	if (S_ISREG(inode->i_mode))
+		nfs_sync_inode(inode);
 
 	fattr = nfs_alloc_fattr();
 	if (fattr == NULL)
@@ -644,8 +648,9 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	trace_nfs_getattr_enter(inode);
 	/* Flush out writes to the server in order to update c/mtime. */
 	if (S_ISREG(inode->i_mode)) {
-		nfs_inode_dio_wait(inode);
-		err = filemap_write_and_wait(inode->i_mapping);
+		mutex_lock(&inode->i_mutex);
+		err = nfs_sync_inode(inode);
+		mutex_unlock(&inode->i_mutex);
 		if (err)
 			goto out;
 	}
@@ -1588,6 +1593,19 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
 }
 EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc);
 
+
+static inline bool nfs_fileid_valid(struct nfs_inode *nfsi,
+				    struct nfs_fattr *fattr)
+{
+	bool ret1 = true, ret2 = true;
+
+	if (fattr->valid & NFS_ATTR_FATTR_FILEID)
+		ret1 = (nfsi->fileid == fattr->fileid);
+	if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
+		ret2 = (nfsi->fileid == fattr->mounted_on_fileid);
+	return ret1 || ret2;
+}
+
 /*
  * Many nfs protocol calls return the new file attributes after
  * an operation. Here we update the inode to reflect the state
@@ -1614,7 +1632,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		nfs_display_fhandle_hash(NFS_FH(inode)),
 		atomic_read(&inode->i_count), fattr->valid);
 
-	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) {
+	if (!nfs_fileid_valid(nfsi, fattr)) {
 		printk(KERN_ERR "NFS: server %s error: fileid changed\n"
 			"fsid %s: expected fileid 0x%Lx, got 0x%Lx\n",
 			NFS_SERVER(inode)->nfs_client->cl_hostname,
@@ -1819,7 +1837,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 struct inode *nfs_alloc_inode(struct super_block *sb)
 {
 	struct nfs_inode *nfsi;
-	nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL);
+	nfsi = kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL);
 	if (!nfsi)
 		return NULL;
 	nfsi->flags = 0UL;
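The nfs_sync_inode() helper exported in the first inode.c hunk is what lets the setattr and getattr paths above, and the O_TRUNC open path in nfs4file.c below, replace the open-coded nfs_inode_dio_wait() + nfs_wb_all() pair with one call. A minimal sketch of a caller, with a hypothetical function name:

	/* Flush O_DIRECT and buffered writes before an operation that needs stable data. */
	static int demo_flush_before_op(struct inode *inode)
	{
		if (!S_ISREG(inode->i_mode))
			return 0;
		return nfs_sync_inode(inode);	/* waits for DIO, then writes back dirty pages */
	}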
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index cb170722769c..3a9e75235f30 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -36,13 +36,16 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
 			 loff_t offset, loff_t len)
 {
 	struct inode *inode = file_inode(filep);
+	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs42_falloc_args args = {
 		.falloc_fh	= NFS_FH(inode),
 		.falloc_offset	= offset,
 		.falloc_length	= len,
+		.falloc_bitmask	= server->cache_consistency_bitmask,
+	};
+	struct nfs42_falloc_res res = {
+		.falloc_server	= server,
 	};
-	struct nfs42_falloc_res res;
-	struct nfs_server *server = NFS_SERVER(inode);
 	int status;
 
 	msg->rpc_argp = &args;
@@ -52,8 +55,17 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
 	if (status)
 		return status;
 
-	return nfs4_call_sync(server->client, server, msg,
-			      &args.seq_args, &res.seq_res, 0);
+	res.falloc_fattr = nfs_alloc_fattr();
+	if (!res.falloc_fattr)
+		return -ENOMEM;
+
+	status = nfs4_call_sync(server->client, server, msg,
+				&args.seq_args, &res.seq_res, 0);
+	if (status == 0)
+		status = nfs_post_op_update_inode(inode, res.falloc_fattr);
+
+	kfree(res.falloc_fattr);
+	return status;
 }
 
 static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
@@ -84,9 +96,13 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
 	if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE))
 		return -EOPNOTSUPP;
 
+	mutex_lock(&inode->i_mutex);
+
 	err = nfs42_proc_fallocate(&msg, filep, offset, len);
 	if (err == -EOPNOTSUPP)
 		NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE;
+
+	mutex_unlock(&inode->i_mutex);
 	return err;
 }
 
@@ -101,9 +117,16 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
 	if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE))
 		return -EOPNOTSUPP;
 
+	nfs_wb_all(inode);
+	mutex_lock(&inode->i_mutex);
+
 	err = nfs42_proc_fallocate(&msg, filep, offset, len);
+	if (err == 0)
+		truncate_pagecache_range(inode, offset, (offset + len) -1);
 	if (err == -EOPNOTSUPP)
 		NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
+
+	mutex_unlock(&inode->i_mutex);
 	return err;
 }
 
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 038a7e1521fa..1a25b27248f2 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -25,16 +25,20 @@
 
 #define NFS4_enc_allocate_sz		(compound_encode_hdr_maxsz + \
 					 encode_putfh_maxsz + \
-					 encode_allocate_maxsz)
+					 encode_allocate_maxsz + \
+					 encode_getattr_maxsz)
 #define NFS4_dec_allocate_sz		(compound_decode_hdr_maxsz + \
 					 decode_putfh_maxsz + \
-					 decode_allocate_maxsz)
+					 decode_allocate_maxsz + \
+					 decode_getattr_maxsz)
 #define NFS4_enc_deallocate_sz		(compound_encode_hdr_maxsz + \
 					 encode_putfh_maxsz + \
-					 encode_deallocate_maxsz)
+					 encode_deallocate_maxsz + \
+					 encode_getattr_maxsz)
 #define NFS4_dec_deallocate_sz		(compound_decode_hdr_maxsz + \
 					 decode_putfh_maxsz + \
-					 decode_deallocate_maxsz)
+					 decode_deallocate_maxsz + \
+					 decode_getattr_maxsz)
 #define NFS4_enc_seek_sz		(compound_encode_hdr_maxsz + \
 					 encode_putfh_maxsz + \
 					 encode_seek_maxsz)
@@ -92,6 +96,7 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req,
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, args->falloc_fh, &hdr);
 	encode_allocate(xdr, args, &hdr);
+	encode_getfattr(xdr, args->falloc_bitmask, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -110,6 +115,7 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req,
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, args->falloc_fh, &hdr);
 	encode_deallocate(xdr, args, &hdr);
+	encode_getfattr(xdr, args->falloc_bitmask, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -183,6 +189,9 @@ static int nfs4_xdr_dec_allocate(struct rpc_rqst *rqstp,
 	if (status)
 		goto out;
 	status = decode_allocate(xdr, res);
+	if (status)
+		goto out;
+	decode_getfattr(xdr, res->falloc_fattr, res->falloc_server);
 out:
 	return status;
 }
@@ -207,6 +216,9 @@ static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp,
 	if (status)
 		goto out;
 	status = decode_deallocate(xdr, res);
+	if (status)
+		goto out;
+	decode_getfattr(xdr, res->falloc_fattr, res->falloc_server);
 out:
 	return status;
 }
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 51c2dbd1e942..e42be52a8c18 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -4,7 +4,6 @@
  */
 #include <linux/module.h>
 #include <linux/nfs_fs.h>
-#include <linux/nfs_idmap.h>
 #include <linux/nfs_mount.h>
 #include <linux/sunrpc/addr.h>
 #include <linux/sunrpc/auth.h>
@@ -15,6 +14,7 @@
15#include "callback.h" 14#include "callback.h"
16#include "delegation.h" 15#include "delegation.h"
17#include "nfs4session.h" 16#include "nfs4session.h"
17#include "nfs4idmap.h"
18#include "pnfs.h" 18#include "pnfs.h"
19#include "netns.h" 19#include "netns.h"
20 20
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 619eca34e70f..f58c17b3b480 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -10,6 +10,8 @@
10#include "fscache.h" 10#include "fscache.h"
11#include "pnfs.h" 11#include "pnfs.h"
12 12
13#include "nfstrace.h"
14
13#ifdef CONFIG_NFS_V4_2 15#ifdef CONFIG_NFS_V4_2
14#include "nfs42.h" 16#include "nfs42.h"
15#endif 17#endif
@@ -57,7 +59,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
 	if (openflags & O_TRUNC) {
 		attr.ia_valid |= ATTR_SIZE;
 		attr.ia_size = 0;
-		nfs_wb_all(inode);
+		nfs_sync_inode(inode);
 	}
 
 	inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened);
@@ -100,6 +102,9 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	int ret;
 	struct inode *inode = file_inode(file);
 
+	trace_nfs_fsync_enter(inode);
+
+	nfs_inode_dio_wait(inode);
 	do {
 		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
 		if (ret != 0)
@@ -107,7 +112,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 		mutex_lock(&inode->i_mutex);
 		ret = nfs_file_fsync_commit(file, start, end, datasync);
 		if (!ret)
-			ret = pnfs_layoutcommit_inode(inode, true);
+			ret = pnfs_sync_inode(inode, !!datasync);
 		mutex_unlock(&inode->i_mutex);
 		/*
 		 * If nfs_file_fsync_commit detected a server reboot, then
@@ -118,6 +123,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 			end = LLONG_MAX;
 	} while (ret == -EAGAIN);
 
+	trace_nfs_fsync_exit(inode, ret);
 	return ret;
 }
 
@@ -152,15 +158,9 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t
 	if (ret < 0)
 		return ret;
 
-	mutex_lock(&inode->i_mutex);
 	if (mode & FALLOC_FL_PUNCH_HOLE)
-		ret = nfs42_proc_deallocate(filep, offset, len);
-	else
-		ret = nfs42_proc_allocate(filep, offset, len);
-	mutex_unlock(&inode->i_mutex);
-
-	nfs_zap_caches(inode);
-	return ret;
+		return nfs42_proc_deallocate(filep, offset, len);
+	return nfs42_proc_allocate(filep, offset, len);
 }
 #endif /* CONFIG_NFS_V4_2 */
 
diff --git a/fs/nfs/idmap.c b/fs/nfs/nfs4idmap.c
index 857e2a99acc8..2e1737c40a29 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -36,7 +36,6 @@
 #include <linux/types.h>
 #include <linux/parser.h>
 #include <linux/fs.h>
-#include <linux/nfs_idmap.h>
 #include <net/net_namespace.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
 #include <linux/nfs_fs.h>
@@ -49,6 +48,7 @@
 
 #include "internal.h"
 #include "netns.h"
+#include "nfs4idmap.h"
 #include "nfs4trace.h"
 
 #define NFS_UINT_MAXLEN 11
diff --git a/include/linux/nfs_idmap.h b/fs/nfs/nfs4idmap.h
index 333844e38f66..de44d7330ab3 100644
--- a/include/linux/nfs_idmap.h
+++ b/fs/nfs/nfs4idmap.h
@@ -1,5 +1,5 @@
 /*
- * include/linux/nfs_idmap.h
+ * fs/nfs/nfs4idmap.h
  *
  * UID and GID to name mapping for clients.
  *
@@ -46,19 +46,8 @@ struct nfs_server;
 struct nfs_fattr;
 struct nfs4_string;
 
-#if IS_ENABLED(CONFIG_NFS_V4)
 int nfs_idmap_init(void);
 void nfs_idmap_quit(void);
-#else
-static inline int nfs_idmap_init(void)
-{
-	return 0;
-}
-
-static inline void nfs_idmap_quit(void)
-{}
-#endif
-
 int nfs_idmap_new(struct nfs_client *);
 void nfs_idmap_delete(struct nfs_client *);
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 98e533f2c94a..45b35b9b1e36 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -51,7 +51,6 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/module.h>
-#include <linux/nfs_idmap.h>
 #include <linux/xattr.h>
 #include <linux/utsname.h>
 #include <linux/freezer.h>
@@ -63,6 +62,7 @@
63#include "callback.h" 62#include "callback.h"
64#include "pnfs.h" 63#include "pnfs.h"
65#include "netns.h" 64#include "netns.h"
65#include "nfs4idmap.h"
66#include "nfs4session.h" 66#include "nfs4session.h"
67#include "fscache.h" 67#include "fscache.h"
68 68
@@ -185,7 +185,8 @@ const u32 nfs4_fattr_bitmap[3] = {
 	| FATTR4_WORD1_SPACE_USED
 	| FATTR4_WORD1_TIME_ACCESS
 	| FATTR4_WORD1_TIME_METADATA
-	| FATTR4_WORD1_TIME_MODIFY,
+	| FATTR4_WORD1_TIME_MODIFY
+	| FATTR4_WORD1_MOUNTED_ON_FILEID,
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
 	FATTR4_WORD2_SECURITY_LABEL
 #endif
@@ -3095,16 +3096,13 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
 			struct nfs_fsinfo *info,
 			bool auth_probe)
 {
-	int status;
+	int status = 0;
 
-	switch (auth_probe) {
-	case false:
+	if (!auth_probe)
 		status = nfs4_lookup_root(server, fhandle, info);
-		if (status != -NFS4ERR_WRONGSEC)
-			break;
-	default:
+
+	if (auth_probe || status == NFS4ERR_WRONGSEC)
 		status = nfs4_do_find_root_sec(server, fhandle, info);
-	}
 
 	if (status == 0)
 		status = nfs4_server_capabilities(server, fhandle);
@@ -7944,6 +7942,8 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server,
 {
 	struct nfs4_getdeviceinfo_args args = {
 		.pdev = pdev,
+		.notify_types = NOTIFY_DEVICEID4_CHANGE |
+			NOTIFY_DEVICEID4_DELETE,
 	};
 	struct nfs4_getdeviceinfo_res res = {
 		.pdev = pdev,
@@ -7958,6 +7958,11 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server,
 
 	dprintk("--> %s\n", __func__);
 	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+	if (res.notification & ~args.notify_types)
+		dprintk("%s: unsupported notification\n", __func__);
+	if (res.notification != args.notify_types)
+		pdev->nocache = 1;
+
 	dprintk("<-- %s status=%d\n", __func__, status);
 
 	return status;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 3b2b20534a3a..2782cfca2265 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -42,7 +42,6 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/nfs_fs.h>
-#include <linux/nfs_idmap.h>
 #include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/random.h>
@@ -57,6 +56,7 @@
57#include "callback.h" 56#include "callback.h"
58#include "delegation.h" 57#include "delegation.h"
59#include "internal.h" 58#include "internal.h"
59#include "nfs4idmap.h"
60#include "nfs4session.h" 60#include "nfs4session.h"
61#include "pnfs.h" 61#include "pnfs.h"
62#include "netns.h" 62#include "netns.h"
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 75090feeafad..6fb7cb6b3f4b 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -3,12 +3,12 @@
  */
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/nfs_idmap.h>
 #include <linux/nfs4_mount.h>
 #include <linux/nfs_fs.h>
 #include "delegation.h"
 #include "internal.h"
 #include "nfs4_fs.h"
+#include "nfs4idmap.h"
 #include "dns_resolve.h"
 #include "pnfs.h"
 #include "nfs.h"
@@ -91,10 +91,11 @@ static void nfs4_evict_inode(struct inode *inode)
 {
 	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
-	pnfs_return_layout(inode);
-	pnfs_destroy_layout(NFS_I(inode));
 	/* If we are holding a delegation, return it! */
 	nfs_inode_return_delegation_noreclaim(inode);
+	/* Note that above delegreturn would trigger pnfs return-on-close */
+	pnfs_return_layout(inode);
+	pnfs_destroy_layout(NFS_I(inode));
 	/* First call standard NFS clear_inode() code */
 	nfs_clear_inode(inode);
 }
diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c
index b6ebe7e445f6..0fbd3ab1be22 100644
--- a/fs/nfs/nfs4sysctl.c
+++ b/fs/nfs/nfs4sysctl.c
@@ -6,10 +6,10 @@
  * Copyright (c) 2006 Trond Myklebust <Trond.Myklebust@netapp.com>
  */
 #include <linux/sysctl.h>
-#include <linux/nfs_idmap.h>
 #include <linux/nfs_fs.h>
 
 #include "nfs4_fs.h"
+#include "nfs4idmap.h"
 #include "callback.h"
 
 static const int nfs_set_port_min = 0;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 5c399ec41079..0aea97841d30 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -52,10 +52,10 @@
 #include <linux/nfs.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
-#include <linux/nfs_idmap.h>
 
 #include "nfs4_fs.h"
 #include "internal.h"
+#include "nfs4idmap.h"
 #include "nfs4session.h"
 #include "pnfs.h"
 #include "netns.h"
@@ -1920,7 +1920,7 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
 
 	p = reserve_space(xdr, 4 + 4);
 	*p++ = cpu_to_be32(1);			/* bitmap length */
-	*p++ = cpu_to_be32(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE);
+	*p++ = cpu_to_be32(args->notify_types);
 }
 
 static void
@@ -5753,8 +5753,9 @@ out_overflow:
 
 #if defined(CONFIG_NFS_V4_1)
 static int decode_getdeviceinfo(struct xdr_stream *xdr,
-				struct pnfs_device *pdev)
+				struct nfs4_getdeviceinfo_res *res)
 {
+	struct pnfs_device *pdev = res->pdev;
 	__be32 *p;
 	uint32_t len, type;
 	int status;
@@ -5802,12 +5803,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
 	if (unlikely(!p))
 		goto out_overflow;
 
-	if (be32_to_cpup(p++) &
-	    ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)) {
-		dprintk("%s: unsupported notification\n",
-			__func__);
-	}
-
+	res->notification = be32_to_cpup(p++);
 	for (i = 1; i < len; i++) {
 		if (be32_to_cpup(p++)) {
 			dprintk("%s: unsupported notification\n",
@@ -7061,7 +7057,7 @@ static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
 	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status != 0)
 		goto out;
-	status = decode_getdeviceinfo(xdr, res->pdev);
+	status = decode_getdeviceinfo(xdr, res);
 out:
 	return status;
 }
@@ -7365,6 +7361,11 @@ nfs4_stat_to_errno(int stat)
 	.p_name = #proc,	\
 }
 
+#define STUB(proc)		\
+[NFSPROC4_CLNT_##proc] = {	\
+	.p_name = #proc,	\
+}
+
 struct rpc_procinfo nfs4_procedures[] = {
 	PROC(READ,		enc_read,		dec_read),
 	PROC(WRITE,		enc_write,		dec_write),
@@ -7417,6 +7418,7 @@ struct rpc_procinfo nfs4_procedures[] = {
 	PROC(SECINFO_NO_NAME,	enc_secinfo_no_name,	dec_secinfo_no_name),
 	PROC(TEST_STATEID,	enc_test_stateid,	dec_test_stateid),
 	PROC(FREE_STATEID,	enc_free_stateid,	dec_free_stateid),
+	STUB(GETDEVICELIST),
 	PROC(BIND_CONN_TO_SESSION,
 			enc_bind_conn_to_session, dec_bind_conn_to_session),
 	PROC(DESTROY_CLIENTID,	enc_destroy_clientid,	dec_destroy_clientid),
diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c
index 4eb0aead69b6..c74f7af23d77 100644
--- a/fs/nfs/nfstrace.c
+++ b/fs/nfs/nfstrace.c
@@ -7,3 +7,6 @@
 
 #define CREATE_TRACE_POINTS
 #include "nfstrace.h"
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit);
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 24e1d7403c0b..5aaed363556a 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -57,7 +57,7 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
 
 	dprintk("%s: free od=%p\n", __func__, de->od.od);
 	osduld_put_device(de->od.od);
-	kfree(de);
+	kfree_rcu(d, rcu);
 }
 
 struct objio_segment {
@@ -637,6 +637,8 @@ static struct pnfs_layoutdriver_type objlayout_type = {
 	.pg_read_ops             = &objio_pg_read_ops,
 	.pg_write_ops            = &objio_pg_write_ops,
 
+	.sync                    = pnfs_generic_sync,
+
 	.free_deviceid_node	 = objio_free_deviceid_node,
 
 	.encode_layoutcommit	 = objlayout_encode_layoutcommit,
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 4f802b02fbb9..230606243be6 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1090,6 +1090,7 @@ bool pnfs_roc(struct inode *ino)
 	pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&tmp_list);
+	pnfs_layoutcommit_inode(ino, true);
 	return true;
 
 out_noroc:
@@ -1104,8 +1105,10 @@ out_noroc:
 		}
 	}
 	spin_unlock(&ino->i_lock);
-	if (layoutreturn)
+	if (layoutreturn) {
+		pnfs_layoutcommit_inode(ino, true);
 		pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
+	}
 	return false;
 }
 
@@ -1841,7 +1844,8 @@ void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
 {
 	trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
 	if (!hdr->pnfs_error) {
-		pnfs_set_layoutcommit(hdr);
+		pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
+				hdr->mds_offset + hdr->res.count);
 		hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
 	} else
 		pnfs_ld_handle_write_error(hdr);
@@ -1902,7 +1906,6 @@ static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
 	pnfs_put_lseg(hdr->lseg);
 	nfs_pgio_header_free(hdr);
 }
-EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
 
 int
 pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
@@ -2032,7 +2035,6 @@ static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
 	pnfs_put_lseg(hdr->lseg);
 	nfs_pgio_header_free(hdr);
 }
-EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
 
 int
 pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
@@ -2099,64 +2101,34 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
 EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
 
 void
-pnfs_set_layoutcommit(struct nfs_pgio_header *hdr)
+pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg,
+		loff_t end_pos)
 {
-	struct inode *inode = hdr->inode;
 	struct nfs_inode *nfsi = NFS_I(inode);
-	loff_t end_pos = hdr->mds_offset + hdr->res.count;
 	bool mark_as_dirty = false;
 
 	spin_lock(&inode->i_lock);
 	if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
-		mark_as_dirty = true;
-		dprintk("%s: Set layoutcommit for inode %lu ",
-			__func__, inode->i_ino);
-	}
-	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) {
-		/* references matched in nfs4_layoutcommit_release */
-		pnfs_get_lseg(hdr->lseg);
-	}
-	if (end_pos > nfsi->layout->plh_lwb)
 		nfsi->layout->plh_lwb = end_pos;
-	spin_unlock(&inode->i_lock);
-	dprintk("%s: lseg %p end_pos %llu\n",
-		__func__, hdr->lseg, nfsi->layout->plh_lwb);
-
-	/* if pnfs_layoutcommit_inode() runs between inode locks, the next one
-	 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
-	if (mark_as_dirty)
-		mark_inode_dirty_sync(inode);
-}
-EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
-
-void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data)
-{
-	struct inode *inode = data->inode;
-	struct nfs_inode *nfsi = NFS_I(inode);
-	bool mark_as_dirty = false;
-
-	spin_lock(&inode->i_lock);
-	if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
 		mark_as_dirty = true;
 		dprintk("%s: Set layoutcommit for inode %lu ",
 			__func__, inode->i_ino);
-	}
-	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) {
+	} else if (end_pos > nfsi->layout->plh_lwb)
+		nfsi->layout->plh_lwb = end_pos;
+	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) {
 		/* references matched in nfs4_layoutcommit_release */
-		pnfs_get_lseg(data->lseg);
+		pnfs_get_lseg(lseg);
 	}
-	if (data->lwb > nfsi->layout->plh_lwb)
-		nfsi->layout->plh_lwb = data->lwb;
 	spin_unlock(&inode->i_lock);
 	dprintk("%s: lseg %p end_pos %llu\n",
-		__func__, data->lseg, nfsi->layout->plh_lwb);
+		__func__, lseg, nfsi->layout->plh_lwb);
 
 	/* if pnfs_layoutcommit_inode() runs between inode locks, the next one
 	 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
 	if (mark_as_dirty)
 		mark_inode_dirty_sync(inode);
 }
-EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit);
+EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
 
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
 {
@@ -2216,7 +2188,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 	pnfs_list_write_lseg(inode, &data->lseg_list);
 
 	end_pos = nfsi->layout->plh_lwb;
-	nfsi->layout->plh_lwb = 0;
 
 	nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
 	spin_unlock(&inode->i_lock);
@@ -2233,11 +2204,11 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 		status = ld->prepare_layoutcommit(&data->args);
 		if (status) {
 			spin_lock(&inode->i_lock);
-			if (end_pos < nfsi->layout->plh_lwb)
+			set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
+			if (end_pos > nfsi->layout->plh_lwb)
 				nfsi->layout->plh_lwb = end_pos;
 			spin_unlock(&inode->i_lock);
 			put_rpccred(data->cred);
-			set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
 			goto clear_layoutcommitting;
 		}
 	}
@@ -2258,6 +2229,13 @@ clear_layoutcommitting:
 }
 EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
 
+int
+pnfs_generic_sync(struct inode *inode, bool datasync)
+{
+	return pnfs_layoutcommit_inode(inode, true);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_sync);
+
 struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
 {
 	struct nfs4_threshold *thp;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 084c9144f86d..1e6308f82fc3 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -155,6 +155,8 @@ struct pnfs_layoutdriver_type {
 				 int how,
 				 struct nfs_commit_info *cinfo);
 
+	int (*sync)(struct inode *inode, bool datasync);
+
 	/*
 	 * Return PNFS_ATTEMPTED to indicate the layout code has attempted
 	 * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
@@ -203,6 +205,7 @@ struct pnfs_device {
203 struct page **pages; 205 struct page **pages;
204 unsigned int pgbase; 206 unsigned int pgbase;
205 unsigned int pglen; /* reply buffer length */ 207 unsigned int pglen; /* reply buffer length */
208 unsigned char nocache : 1;/* May not be cached */
206}; 209};
207 210
208#define NFS4_PNFS_GETDEVLIST_MAXNUM 16 211#define NFS4_PNFS_GETDEVLIST_MAXNUM 16
@@ -263,10 +266,11 @@ bool pnfs_roc(struct inode *ino);
263void pnfs_roc_release(struct inode *ino); 266void pnfs_roc_release(struct inode *ino);
264void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); 267void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
265bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); 268bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
266void pnfs_set_layoutcommit(struct nfs_pgio_header *); 269void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t);
267void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data);
268void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); 270void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
269int pnfs_layoutcommit_inode(struct inode *inode, bool sync); 271int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
272int pnfs_generic_sync(struct inode *inode, bool datasync);
273int pnfs_nfs_generic_sync(struct inode *inode, bool datasync);
270int _pnfs_return_layout(struct inode *); 274int _pnfs_return_layout(struct inode *);
271int pnfs_commit_and_return_layout(struct inode *); 275int pnfs_commit_and_return_layout(struct inode *);
272void pnfs_ld_write_done(struct nfs_pgio_header *); 276void pnfs_ld_write_done(struct nfs_pgio_header *);
@@ -291,6 +295,7 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
291enum { 295enum {
292 NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */ 296 NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */
293 NFS_DEVICEID_UNAVAILABLE, /* device temporarily unavailable */ 297 NFS_DEVICEID_UNAVAILABLE, /* device temporarily unavailable */
298 NFS_DEVICEID_NOCACHE, /* device may not be cached */
294}; 299};
295 300
296/* pnfs_dev.c */ 301/* pnfs_dev.c */
@@ -302,6 +307,7 @@ struct nfs4_deviceid_node {
302 unsigned long flags; 307 unsigned long flags;
303 unsigned long timestamp_unavailable; 308 unsigned long timestamp_unavailable;
304 struct nfs4_deviceid deviceid; 309 struct nfs4_deviceid deviceid;
310 struct rcu_head rcu;
305 atomic_t ref; 311 atomic_t ref;
306}; 312};
307 313
@@ -486,6 +492,14 @@ pnfs_ld_read_whole_page(struct inode *inode)
486 return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE; 492 return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE;
487} 493}
488 494
495static inline int
496pnfs_sync_inode(struct inode *inode, bool datasync)
497{
498 if (!pnfs_enabled_sb(NFS_SERVER(inode)))
499 return 0;
500 return NFS_SERVER(inode)->pnfs_curr_ld->sync(inode, datasync);
501}
502
489static inline bool 503static inline bool
490pnfs_layoutcommit_outstanding(struct inode *inode) 504pnfs_layoutcommit_outstanding(struct inode *inode)
491{ 505{
@@ -568,6 +582,12 @@ pnfs_ld_read_whole_page(struct inode *inode)
568 return false; 582 return false;
569} 583}
570 584
585static inline int
586pnfs_sync_inode(struct inode *inode, bool datasync)
587{
588 return 0;
589}
590
571static inline bool 591static inline bool
572pnfs_roc(struct inode *ino) 592pnfs_roc(struct inode *ino)
573{ 593{
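
The pnfs.h hunks above add an optional per-layout-driver sync() operation, a pnfs_sync_inode() inline that dispatches to it when pNFS is enabled, and a no-op stub for the non-pNFS build; pnfs_generic_sync() and pnfs_nfs_generic_sync() earlier in this diff are generic implementations a driver can plug into the new op. Below is a minimal user-space sketch of that dispatch shape, using invented stand-in types rather than the kernel structures:

#include <stdbool.h>
#include <stdio.h>

/* stand-ins for struct inode and the layout driver ops table */
struct inode { const char *name; };

struct layoutdriver {
        /* optional hook, like the new int (*sync)(struct inode *, bool) */
        int (*sync)(struct inode *inode, bool datasync);
};

/* a generic implementation a driver may point its ->sync at; the datasync
 * short-circuit mirrors pnfs_nfs_generic_sync() in this series */
static int generic_sync(struct inode *inode, bool datasync)
{
        if (datasync)
                return 0;
        printf("layoutcommit %s\n", inode->name);  /* stands in for pnfs_layoutcommit_inode() */
        return 0;
}

/* the pnfs_sync_inode() shape: dispatch if a driver is in play, else no-op */
static int sync_inode_ld(const struct layoutdriver *ld, struct inode *inode, bool datasync)
{
        if (!ld || !ld->sync)
                return 0;
        return ld->sync(inode, datasync);
}

int main(void)
{
        const struct layoutdriver ld = { .sync = generic_sync };
        struct inode ino = { .name = "foo" };

        return sync_inode_ld(&ld, &ino, false);
}

The real inline gates on pnfs_enabled_sb() rather than a NULL check, but the effect is the same: callers such as the rewritten nfs_wb_all() later in this diff can invoke it unconditionally.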
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index aa2ec0015183..2961fcd7a2df 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -149,6 +149,8 @@ nfs4_get_device_info(struct nfs_server *server,
149 */ 149 */
150 d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev, 150 d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev,
151 gfp_flags); 151 gfp_flags);
152 if (d && pdev->nocache)
153 set_bit(NFS_DEVICEID_NOCACHE, &d->flags);
152 154
153out_free_pages: 155out_free_pages:
154 for (i = 0; i < max_pages; i++) 156 for (i = 0; i < max_pages; i++)
@@ -175,8 +177,8 @@ __nfs4_find_get_deviceid(struct nfs_server *server,
175 rcu_read_lock(); 177 rcu_read_lock();
176 d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id, 178 d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id,
177 hash); 179 hash);
178 if (d != NULL) 180 if (d != NULL && !atomic_inc_not_zero(&d->ref))
179 atomic_inc(&d->ref); 181 d = NULL;
180 rcu_read_unlock(); 182 rcu_read_unlock();
181 return d; 183 return d;
182} 184}
@@ -235,12 +237,11 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
235 return; 237 return;
236 } 238 }
237 hlist_del_init_rcu(&d->node); 239 hlist_del_init_rcu(&d->node);
240 clear_bit(NFS_DEVICEID_NOCACHE, &d->flags);
238 spin_unlock(&nfs4_deviceid_lock); 241 spin_unlock(&nfs4_deviceid_lock);
239 synchronize_rcu();
240 242
241 /* balance the initial ref set in pnfs_insert_deviceid */ 243 /* balance the initial ref set in pnfs_insert_deviceid */
242 if (atomic_dec_and_test(&d->ref)) 244 nfs4_put_deviceid_node(d);
243 d->ld->free_deviceid_node(d);
244} 245}
245EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); 246EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
246 247
@@ -271,6 +272,11 @@ EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
271bool 272bool
272nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) 273nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
273{ 274{
275 if (test_bit(NFS_DEVICEID_NOCACHE, &d->flags)) {
276 if (atomic_add_unless(&d->ref, -1, 2))
277 return false;
278 nfs4_delete_deviceid(d->ld, d->nfs_client, &d->deviceid);
279 }
274 if (!atomic_dec_and_test(&d->ref)) 280 if (!atomic_dec_and_test(&d->ref))
275 return false; 281 return false;
276 d->ld->free_deviceid_node(d); 282 d->ld->free_deviceid_node(d);
@@ -314,6 +320,7 @@ _deviceid_purge_client(const struct nfs_client *clp, long hash)
314 if (d->nfs_client == clp && atomic_read(&d->ref)) { 320 if (d->nfs_client == clp && atomic_read(&d->ref)) {
315 hlist_del_init_rcu(&d->node); 321 hlist_del_init_rcu(&d->node);
316 hlist_add_head(&d->tmpnode, &tmp); 322 hlist_add_head(&d->tmpnode, &tmp);
323 clear_bit(NFS_DEVICEID_NOCACHE, &d->flags);
317 } 324 }
318 rcu_read_unlock(); 325 rcu_read_unlock();
319 spin_unlock(&nfs4_deviceid_lock); 326 spin_unlock(&nfs4_deviceid_lock);
@@ -321,12 +328,10 @@ _deviceid_purge_client(const struct nfs_client *clp, long hash)
321 if (hlist_empty(&tmp)) 328 if (hlist_empty(&tmp))
322 return; 329 return;
323 330
324 synchronize_rcu();
325 while (!hlist_empty(&tmp)) { 331 while (!hlist_empty(&tmp)) {
326 d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode); 332 d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode);
327 hlist_del(&d->tmpnode); 333 hlist_del(&d->tmpnode);
328 if (atomic_dec_and_test(&d->ref)) 334 nfs4_put_deviceid_node(d);
329 d->ld->free_deviceid_node(d);
330 } 335 }
331} 336}
332 337
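
The pnfs_dev.c hunks change deviceid lifetime handling: nodes gain an rcu_head, the explicit synchronize_rcu() calls go away, lookup now takes its reference with atomic_inc_not_zero() so a node whose count has already dropped to zero is treated as not found, and nfs4_put_deviceid_node() auto-deletes nodes flagged NFS_DEVICEID_NOCACHE. A compressed user-space sketch of the lookup-side rule, using C11 atomics instead of the kernel's atomic_t/RCU (hypothetical types, illustration only):

#include <stdatomic.h>
#include <stddef.h>

struct devid_node {
        atomic_int ref;          /* stands in for the kernel's atomic_t d->ref */
        /* ... deviceid, flags, rcu head ... */
};

/* Take a reference only while the node is still live (ref > 0).
 * Mirrors: if (d != NULL && !atomic_inc_not_zero(&d->ref)) d = NULL;
 */
static struct devid_node *get_if_live(struct devid_node *d)
{
        int old;

        if (!d)
                return NULL;
        old = atomic_load(&d->ref);
        while (old != 0) {
                if (atomic_compare_exchange_weak(&d->ref, &old, old + 1))
                        return d;       /* reference taken */
        }
        return NULL;                    /* already dying; must not be resurrected */
}

int main(void)
{
        struct devid_node live = { 0 }, dead = { 0 };

        atomic_store(&live.ref, 1);     /* e.g. the cache's own reference */
        atomic_store(&dead.ref, 0);     /* final put already happened     */

        return (get_if_live(&live) && !get_if_live(&dead)) ? 0 : 1;
}

A caller that gets NULL back presumably just behaves as if the device were uncached and fetches the device info again, which is the point of the new nocache and notification handling.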
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 54e36b38fb5f..f37e25b6311c 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -561,7 +561,7 @@ static bool load_v3_ds_connect(void)
561 return(get_v3_ds_connect != NULL); 561 return(get_v3_ds_connect != NULL);
562} 562}
563 563
564void __exit nfs4_pnfs_v3_ds_connect_unload(void) 564void nfs4_pnfs_v3_ds_connect_unload(void)
565{ 565{
566 if (get_v3_ds_connect) { 566 if (get_v3_ds_connect) {
567 symbol_put(nfs3_set_ds_client); 567 symbol_put(nfs3_set_ds_client);
@@ -868,3 +868,13 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
868 nfs_request_add_commit_list(req, list, cinfo); 868 nfs_request_add_commit_list(req, list, cinfo);
869} 869}
870EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); 870EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
871
872int
873pnfs_nfs_generic_sync(struct inode *inode, bool datasync)
874{
875 if (datasync)
876 return 0;
877 return pnfs_layoutcommit_inode(inode, true);
878}
879EXPORT_SYMBOL_GPL(pnfs_nfs_generic_sync);
880
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index a5b7427c3754..ae0ff7a11b40 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -284,7 +284,7 @@ int nfs_readpage(struct file *file, struct page *page)
284 dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", 284 dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
285 page, PAGE_CACHE_SIZE, page_file_index(page)); 285 page, PAGE_CACHE_SIZE, page_file_index(page));
286 nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); 286 nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
287 nfs_inc_stats(inode, NFSIOS_READPAGES); 287 nfs_add_stats(inode, NFSIOS_READPAGES, 1);
288 288
289 /* 289 /*
290 * Try to flush any pending writes to the file.. 290 * Try to flush any pending writes to the file..
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 21f8f52bf37d..f175b833b6ba 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -43,7 +43,6 @@
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/mount.h> 44#include <linux/mount.h>
45#include <linux/namei.h> 45#include <linux/namei.h>
46#include <linux/nfs_idmap.h>
47#include <linux/vfs.h> 46#include <linux/vfs.h>
48#include <linux/inet.h> 47#include <linux/inet.h>
49#include <linux/in6.h> 48#include <linux/in6.h>
@@ -2193,7 +2192,7 @@ nfs_compare_remount_data(struct nfs_server *nfss,
2193 data->version != nfss->nfs_client->rpc_ops->version || 2192 data->version != nfss->nfs_client->rpc_ops->version ||
2194 data->minorversion != nfss->nfs_client->cl_minorversion || 2193 data->minorversion != nfss->nfs_client->cl_minorversion ||
2195 data->retrans != nfss->client->cl_timeout->to_retries || 2194 data->retrans != nfss->client->cl_timeout->to_retries ||
2196 data->selected_flavor != nfss->client->cl_auth->au_flavor || 2195 !nfs_auth_info_match(&data->auth_info, nfss->client->cl_auth->au_flavor) ||
2197 data->acregmin != nfss->acregmin / HZ || 2196 data->acregmin != nfss->acregmin / HZ ||
2198 data->acregmax != nfss->acregmax / HZ || 2197 data->acregmax != nfss->acregmax / HZ ||
2199 data->acdirmin != nfss->acdirmin / HZ || 2198 data->acdirmin != nfss->acdirmin / HZ ||
@@ -2241,7 +2240,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
2241 data->wsize = nfss->wsize; 2240 data->wsize = nfss->wsize;
2242 data->retrans = nfss->client->cl_timeout->to_retries; 2241 data->retrans = nfss->client->cl_timeout->to_retries;
2243 data->selected_flavor = nfss->client->cl_auth->au_flavor; 2242 data->selected_flavor = nfss->client->cl_auth->au_flavor;
2244 data->auth_info = nfss->auth_info;
2245 data->acregmin = nfss->acregmin / HZ; 2243 data->acregmin = nfss->acregmin / HZ;
2246 data->acregmax = nfss->acregmax / HZ; 2244 data->acregmax = nfss->acregmax / HZ;
2247 data->acdirmin = nfss->acdirmin / HZ; 2245 data->acdirmin = nfss->acdirmin / HZ;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3612b4622337..d12a4be613a5 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -580,7 +580,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
580 int ret; 580 int ret;
581 581
582 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); 582 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
583 nfs_inc_stats(inode, NFSIOS_WRITEPAGES); 583 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
584 584
585 nfs_pageio_cond_complete(pgio, page_file_index(page)); 585 nfs_pageio_cond_complete(pgio, page_file_index(page));
586 ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); 586 ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
@@ -1840,17 +1840,16 @@ EXPORT_SYMBOL_GPL(nfs_write_inode);
1840 */ 1840 */
1841int nfs_wb_all(struct inode *inode) 1841int nfs_wb_all(struct inode *inode)
1842{ 1842{
1843 struct writeback_control wbc = {
1844 .sync_mode = WB_SYNC_ALL,
1845 .nr_to_write = LONG_MAX,
1846 .range_start = 0,
1847 .range_end = LLONG_MAX,
1848 };
1849 int ret; 1843 int ret;
1850 1844
1851 trace_nfs_writeback_inode_enter(inode); 1845 trace_nfs_writeback_inode_enter(inode);
1852 1846
1853 ret = sync_inode(inode, &wbc); 1847 ret = filemap_write_and_wait(inode->i_mapping);
1848 if (!ret) {
1849 ret = nfs_commit_inode(inode, FLUSH_SYNC);
1850 if (!ret)
1851 pnfs_sync_inode(inode, true);
1852 }
1854 1853
1855 trace_nfs_writeback_inode_exit(inode, ret); 1854 trace_nfs_writeback_inode_exit(inode, ret);
1856 return ret; 1855 return ret;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 410abd172feb..b95f914ce083 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -511,6 +511,7 @@ extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned
511 * Try to write back everything synchronously (but check the 511 * Try to write back everything synchronously (but check the
512 * return value!) 512 * return value!)
513 */ 513 */
514extern int nfs_sync_inode(struct inode *inode);
514extern int nfs_wb_all(struct inode *inode); 515extern int nfs_wb_all(struct inode *inode);
515extern int nfs_wb_page(struct inode *inode, struct page* page); 516extern int nfs_wb_page(struct inode *inode, struct page* page);
516extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); 517extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
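
With these changes nfs_wb_all() reduces to three ordered steps: flush the page cache (filemap_write_and_wait()), COMMIT any unstable writes (nfs_commit_inode(..., FLUSH_SYNC)), then issue a layoutcommit via pnfs_sync_inode(); the nfs_fs.h hunk also declares a new nfs_sync_inode() helper whose body is outside this excerpt. A plain-C sketch of that short-circuit-on-error sequencing, with stub functions standing in for the three calls:

#include <stdio.h>

static int writeback_pages(void)  { puts("writeback");    return 0; }
static int commit_unstable(void)  { puts("commit");       return 0; }
static int layoutcommit(void)     { puts("layoutcommit"); return 0; }

/* mirrors the rewritten nfs_wb_all(): each later step runs only if the
 * earlier ones succeeded, and the first failure is what gets returned
 * (the layoutcommit result is not folded into the return value) */
static int wb_all(void)
{
        int ret = writeback_pages();

        if (!ret) {
                ret = commit_unstable();
                if (!ret)
                        layoutcommit();
        }
        return ret;
}

int main(void)
{
        return wb_all();
}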
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 4cb3eaa89cf7..93ab6071bbe9 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -255,11 +255,13 @@ struct nfs4_layoutget {
255struct nfs4_getdeviceinfo_args { 255struct nfs4_getdeviceinfo_args {
256 struct nfs4_sequence_args seq_args; 256 struct nfs4_sequence_args seq_args;
257 struct pnfs_device *pdev; 257 struct pnfs_device *pdev;
258 __u32 notify_types;
258}; 259};
259 260
260struct nfs4_getdeviceinfo_res { 261struct nfs4_getdeviceinfo_res {
261 struct nfs4_sequence_res seq_res; 262 struct nfs4_sequence_res seq_res;
262 struct pnfs_device *pdev; 263 struct pnfs_device *pdev;
264 __u32 notification;
263}; 265};
264 266
265struct nfs4_layoutcommit_args { 267struct nfs4_layoutcommit_args {
@@ -1271,11 +1273,15 @@ struct nfs42_falloc_args {
1271 nfs4_stateid falloc_stateid; 1273 nfs4_stateid falloc_stateid;
1272 u64 falloc_offset; 1274 u64 falloc_offset;
1273 u64 falloc_length; 1275 u64 falloc_length;
1276 const u32 *falloc_bitmask;
1274}; 1277};
1275 1278
1276struct nfs42_falloc_res { 1279struct nfs42_falloc_res {
1277 struct nfs4_sequence_res seq_res; 1280 struct nfs4_sequence_res seq_res;
1278 unsigned int status; 1281 unsigned int status;
1282
1283 struct nfs_fattr *falloc_fattr;
1284 const struct nfs_server *falloc_server;
1279}; 1285};
1280 1286
1281struct nfs42_seek_args { 1287struct nfs42_seek_args {
diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h
index aadc6a04e1ac..807371357160 100644
--- a/include/linux/sunrpc/msg_prot.h
+++ b/include/linux/sunrpc/msg_prot.h
@@ -142,12 +142,18 @@ typedef __be32 rpc_fraghdr;
142 (RPC_REPHDRSIZE + (2 + RPC_MAX_AUTH_SIZE/4)) 142 (RPC_REPHDRSIZE + (2 + RPC_MAX_AUTH_SIZE/4))
143 143
144/* 144/*
145 * RFC1833/RFC3530 rpcbind (v3+) well-known netid's. 145 * Well-known netids. See:
146 *
147 * http://www.iana.org/assignments/rpc-netids/rpc-netids.xhtml
146 */ 148 */
147#define RPCBIND_NETID_UDP "udp" 149#define RPCBIND_NETID_UDP "udp"
148#define RPCBIND_NETID_TCP "tcp" 150#define RPCBIND_NETID_TCP "tcp"
151#define RPCBIND_NETID_RDMA "rdma"
152#define RPCBIND_NETID_SCTP "sctp"
149#define RPCBIND_NETID_UDP6 "udp6" 153#define RPCBIND_NETID_UDP6 "udp6"
150#define RPCBIND_NETID_TCP6 "tcp6" 154#define RPCBIND_NETID_TCP6 "tcp6"
155#define RPCBIND_NETID_RDMA6 "rdma6"
156#define RPCBIND_NETID_SCTP6 "sctp6"
151#define RPCBIND_NETID_LOCAL "local" 157#define RPCBIND_NETID_LOCAL "local"
152 158
153/* 159/*
diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index 64a0a0a97b23..c984c85981ea 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -41,11 +41,6 @@
41#define _LINUX_SUNRPC_XPRTRDMA_H 41#define _LINUX_SUNRPC_XPRTRDMA_H
42 42
43/* 43/*
44 * rpcbind (v3+) RDMA netid.
45 */
46#define RPCBIND_NETID_RDMA "rdma"
47
48/*
49 * Constants. Max RPC/NFS header is big enough to account for 44 * Constants. Max RPC/NFS header is big enough to account for
50 * additional marshaling buffers passed down by Linux client. 45 * additional marshaling buffers passed down by Linux client.
51 * 46 *
diff --git a/include/uapi/linux/nfs_idmap.h b/include/uapi/linux/nfs_idmap.h
index 8d4b1c7b24d4..038e36c96669 100644
--- a/include/uapi/linux/nfs_idmap.h
+++ b/include/uapi/linux/nfs_idmap.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * include/linux/nfs_idmap.h 2 * include/uapi/linux/nfs_idmap.h
3 * 3 *
4 * UID and GID to name mapping for clients. 4 * UID and GID to name mapping for clients.
5 * 5 *
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index b91fd9c597b4..337ca851a350 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -89,8 +89,8 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
89 if (!task->tk_timeout) 89 if (!task->tk_timeout)
90 return; 90 return;
91 91
92 dprintk("RPC: %5u setting alarm for %lu ms\n", 92 dprintk("RPC: %5u setting alarm for %u ms\n",
93 task->tk_pid, task->tk_timeout * 1000 / HZ); 93 task->tk_pid, jiffies_to_msecs(task->tk_timeout));
94 94
95 task->u.tk_wait.expires = jiffies + task->tk_timeout; 95 task->u.tk_wait.expires = jiffies + task->tk_timeout;
96 if (list_empty(&queue->timer_list.list) || time_before(task->u.tk_wait.expires, queue->timer_list.expires)) 96 if (list_empty(&queue->timer_list.list) || time_before(task->u.tk_wait.expires, queue->timer_list.expires))
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 9949722d99ce..1d4fe24af06a 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -326,6 +326,15 @@ out_unlock:
326 xprt_clear_locked(xprt); 326 xprt_clear_locked(xprt);
327} 327}
328 328
329static void xprt_task_clear_bytes_sent(struct rpc_task *task)
330{
331 if (task != NULL) {
332 struct rpc_rqst *req = task->tk_rqstp;
333 if (req != NULL)
334 req->rq_bytes_sent = 0;
335 }
336}
337
329/** 338/**
330 * xprt_release_xprt - allow other requests to use a transport 339 * xprt_release_xprt - allow other requests to use a transport
331 * @xprt: transport with other tasks potentially waiting 340 * @xprt: transport with other tasks potentially waiting
@@ -336,11 +345,7 @@ out_unlock:
336void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) 345void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
337{ 346{
338 if (xprt->snd_task == task) { 347 if (xprt->snd_task == task) {
339 if (task != NULL) { 348 xprt_task_clear_bytes_sent(task);
340 struct rpc_rqst *req = task->tk_rqstp;
341 if (req != NULL)
342 req->rq_bytes_sent = 0;
343 }
344 xprt_clear_locked(xprt); 349 xprt_clear_locked(xprt);
345 __xprt_lock_write_next(xprt); 350 __xprt_lock_write_next(xprt);
346 } 351 }
@@ -358,11 +363,7 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt);
358void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) 363void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
359{ 364{
360 if (xprt->snd_task == task) { 365 if (xprt->snd_task == task) {
361 if (task != NULL) { 366 xprt_task_clear_bytes_sent(task);
362 struct rpc_rqst *req = task->tk_rqstp;
363 if (req != NULL)
364 req->rq_bytes_sent = 0;
365 }
366 xprt_clear_locked(xprt); 367 xprt_clear_locked(xprt);
367 __xprt_lock_write_next_cong(xprt); 368 __xprt_lock_write_next_cong(xprt);
368 } 369 }
@@ -700,6 +701,7 @@ bool xprt_lock_connect(struct rpc_xprt *xprt,
700 goto out; 701 goto out;
701 if (xprt->snd_task != task) 702 if (xprt->snd_task != task)
702 goto out; 703 goto out;
704 xprt_task_clear_bytes_sent(task);
703 xprt->snd_task = cookie; 705 xprt->snd_task = cookie;
704 ret = true; 706 ret = true;
705out: 707out:
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index da5136fd5694..579f72bbcf4b 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,6 +1,7 @@
1obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o 1obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o
2 2
3xprtrdma-y := transport.o rpc_rdma.o verbs.o 3xprtrdma-y := transport.o rpc_rdma.o verbs.o \
4 fmr_ops.o frwr_ops.o physical_ops.o
4 5
5obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o 6obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o
6 7
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
new file mode 100644
index 000000000000..302d4ebf6fbf
--- /dev/null
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -0,0 +1,208 @@
1/*
2 * Copyright (c) 2015 Oracle. All rights reserved.
3 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
4 */
5
6/* Lightweight memory registration using Fast Memory Regions (FMR).
7 * Referred to sometimes as MTHCAFMR mode.
8 *
9 * FMR uses synchronous memory registration and deregistration.
10 * FMR registration is known to be fast, but FMR deregistration
11 * can take tens of usecs to complete.
12 */
13
14#include "xprt_rdma.h"
15
16#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
17# define RPCDBG_FACILITY RPCDBG_TRANS
18#endif
19
20/* Maximum scatter/gather per FMR */
21#define RPCRDMA_MAX_FMR_SGES (64)
22
23static int
24fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
25 struct rpcrdma_create_data_internal *cdata)
26{
27 return 0;
28}
29
30/* FMR mode conveys up to 64 pages of payload per chunk segment.
31 */
32static size_t
33fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
34{
35 return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
36 rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES);
37}
38
39static int
40fmr_op_init(struct rpcrdma_xprt *r_xprt)
41{
42 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
43 int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
44 struct ib_fmr_attr fmr_attr = {
45 .max_pages = RPCRDMA_MAX_FMR_SGES,
46 .max_maps = 1,
47 .page_shift = PAGE_SHIFT
48 };
49 struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
50 struct rpcrdma_mw *r;
51 int i, rc;
52
53 INIT_LIST_HEAD(&buf->rb_mws);
54 INIT_LIST_HEAD(&buf->rb_all);
55
56 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
57 dprintk("RPC: %s: initializing %d FMRs\n", __func__, i);
58
59 while (i--) {
60 r = kzalloc(sizeof(*r), GFP_KERNEL);
61 if (!r)
62 return -ENOMEM;
63
64 r->r.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
65 if (IS_ERR(r->r.fmr))
66 goto out_fmr_err;
67
68 list_add(&r->mw_list, &buf->rb_mws);
69 list_add(&r->mw_all, &buf->rb_all);
70 }
71 return 0;
72
73out_fmr_err:
74 rc = PTR_ERR(r->r.fmr);
75 dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc);
76 kfree(r);
77 return rc;
78}
79
80/* Use the ib_map_phys_fmr() verb to register a memory region
81 * for remote access via RDMA READ or RDMA WRITE.
82 */
83static int
84fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
85 int nsegs, bool writing)
86{
87 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
88 struct ib_device *device = ia->ri_id->device;
89 enum dma_data_direction direction = rpcrdma_data_dir(writing);
90 struct rpcrdma_mr_seg *seg1 = seg;
91 struct rpcrdma_mw *mw = seg1->rl_mw;
92 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
93 int len, pageoff, i, rc;
94
95 pageoff = offset_in_page(seg1->mr_offset);
96 seg1->mr_offset -= pageoff; /* start of page */
97 seg1->mr_len += pageoff;
98 len = -pageoff;
99 if (nsegs > RPCRDMA_MAX_FMR_SGES)
100 nsegs = RPCRDMA_MAX_FMR_SGES;
101 for (i = 0; i < nsegs;) {
102 rpcrdma_map_one(device, seg, direction);
103 physaddrs[i] = seg->mr_dma;
104 len += seg->mr_len;
105 ++seg;
106 ++i;
107 /* Check for holes */
108 if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
109 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
110 break;
111 }
112
113 rc = ib_map_phys_fmr(mw->r.fmr, physaddrs, i, seg1->mr_dma);
114 if (rc)
115 goto out_maperr;
116
117 seg1->mr_rkey = mw->r.fmr->rkey;
118 seg1->mr_base = seg1->mr_dma + pageoff;
119 seg1->mr_nsegs = i;
120 seg1->mr_len = len;
121 return i;
122
123out_maperr:
124 dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
125 __func__, len, (unsigned long long)seg1->mr_dma,
126 pageoff, i, rc);
127 while (i--)
128 rpcrdma_unmap_one(device, --seg);
129 return rc;
130}
131
132/* Use the ib_unmap_fmr() verb to prevent further remote
133 * access via RDMA READ or RDMA WRITE.
134 */
135static int
136fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
137{
138 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
139 struct rpcrdma_mr_seg *seg1 = seg;
140 struct ib_device *device;
141 int rc, nsegs = seg->mr_nsegs;
142 LIST_HEAD(l);
143
144 list_add(&seg1->rl_mw->r.fmr->list, &l);
145 rc = ib_unmap_fmr(&l);
146 read_lock(&ia->ri_qplock);
147 device = ia->ri_id->device;
148 while (seg1->mr_nsegs--)
149 rpcrdma_unmap_one(device, seg++);
150 read_unlock(&ia->ri_qplock);
151 if (rc)
152 goto out_err;
153 return nsegs;
154
155out_err:
156 dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc);
157 return nsegs;
158}
159
160/* After a disconnect, unmap all FMRs.
161 *
162 * This is invoked only in the transport connect worker in order
163 * to serialize with rpcrdma_register_fmr_external().
164 */
165static void
166fmr_op_reset(struct rpcrdma_xprt *r_xprt)
167{
168 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
169 struct rpcrdma_mw *r;
170 LIST_HEAD(list);
171 int rc;
172
173 list_for_each_entry(r, &buf->rb_all, mw_all)
174 list_add(&r->r.fmr->list, &list);
175
176 rc = ib_unmap_fmr(&list);
177 if (rc)
178 dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
179 __func__, rc);
180}
181
182static void
183fmr_op_destroy(struct rpcrdma_buffer *buf)
184{
185 struct rpcrdma_mw *r;
186 int rc;
187
188 while (!list_empty(&buf->rb_all)) {
189 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
190 list_del(&r->mw_all);
191 rc = ib_dealloc_fmr(r->r.fmr);
192 if (rc)
193 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
194 __func__, rc);
195 kfree(r);
196 }
197}
198
199const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
200 .ro_map = fmr_op_map,
201 .ro_unmap = fmr_op_unmap,
202 .ro_open = fmr_op_open,
203 .ro_maxpages = fmr_op_maxpages,
204 .ro_init = fmr_op_init,
205 .ro_reset = fmr_op_reset,
206 .ro_destroy = fmr_op_destroy,
207 .ro_displayname = "fmr",
208};
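
fmr_op_map() above coalesces up to RPCRDMA_MAX_FMR_SGES segments into a single FMR, but only while they stay contiguous in whole-page terms: the loop breaks at the first "hole", i.e. when the next segment does not start on a page boundary or the current one does not end on one. A standalone sketch of just that test (plain offsets, no RDMA types):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define offset_in_page(off) ((unsigned long)(off) & (PAGE_SIZE - 1))

struct seg { unsigned long offset; unsigned long len; };

/* Mirrors the break condition in the mapping loop: stop coalescing if the
 * next segment is not page-aligned, or the current one does not run out to
 * a page boundary. */
static bool hole_after(const struct seg *cur, const struct seg *next)
{
        return offset_in_page(next->offset) != 0 ||
               offset_in_page(cur->offset + cur->len) != 0;
}

int main(void)
{
        struct seg a = { 0,               PAGE_SIZE };  /* full page      */
        struct seg b = { PAGE_SIZE,       100 };        /* aligned start  */
        struct seg c = { PAGE_SIZE + 100, 50 };         /* mid-page start */

        printf("a->b hole: %d\n", hole_after(&a, &b));  /* 0: keep going  */
        printf("b->c hole: %d\n", hole_after(&b, &c));  /* 1: stop here   */
        return 0;
}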
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
new file mode 100644
index 000000000000..dff0481dbcf8
--- /dev/null
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -0,0 +1,353 @@
1/*
2 * Copyright (c) 2015 Oracle. All rights reserved.
3 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
4 */
5
6/* Lightweight memory registration using Fast Registration Work
7 * Requests (FRWR). Also referred to sometimes as FRMR mode.
8 *
9 * FRWR features ordered asynchronous registration and deregistration
10 * of arbitrarily sized memory regions. This is the fastest and safest
11 * but most complex memory registration mode.
12 */
13
14#include "xprt_rdma.h"
15
16#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
17# define RPCDBG_FACILITY RPCDBG_TRANS
18#endif
19
20static int
21__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
22 unsigned int depth)
23{
24 struct rpcrdma_frmr *f = &r->r.frmr;
25 int rc;
26
27 f->fr_mr = ib_alloc_fast_reg_mr(pd, depth);
28 if (IS_ERR(f->fr_mr))
29 goto out_mr_err;
30 f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth);
31 if (IS_ERR(f->fr_pgl))
32 goto out_list_err;
33 return 0;
34
35out_mr_err:
36 rc = PTR_ERR(f->fr_mr);
37 dprintk("RPC: %s: ib_alloc_fast_reg_mr status %i\n",
38 __func__, rc);
39 return rc;
40
41out_list_err:
42 rc = PTR_ERR(f->fr_pgl);
43 dprintk("RPC: %s: ib_alloc_fast_reg_page_list status %i\n",
44 __func__, rc);
45 ib_dereg_mr(f->fr_mr);
46 return rc;
47}
48
49static void
50__frwr_release(struct rpcrdma_mw *r)
51{
52 int rc;
53
54 rc = ib_dereg_mr(r->r.frmr.fr_mr);
55 if (rc)
56 dprintk("RPC: %s: ib_dereg_mr status %i\n",
57 __func__, rc);
58 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
59}
60
61static int
62frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
63 struct rpcrdma_create_data_internal *cdata)
64{
65 struct ib_device_attr *devattr = &ia->ri_devattr;
66 int depth, delta;
67
68 ia->ri_max_frmr_depth =
69 min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
70 devattr->max_fast_reg_page_list_len);
71 dprintk("RPC: %s: device's max FR page list len = %u\n",
72 __func__, ia->ri_max_frmr_depth);
73
74 /* Add room for frmr register and invalidate WRs.
75 * 1. FRMR reg WR for head
76 * 2. FRMR invalidate WR for head
77 * 3. N FRMR reg WRs for pagelist
78 * 4. N FRMR invalidate WRs for pagelist
79 * 5. FRMR reg WR for tail
80 * 6. FRMR invalidate WR for tail
81 * 7. The RDMA_SEND WR
82 */
83 depth = 7;
84
85 /* Calculate N if the device max FRMR depth is smaller than
86 * RPCRDMA_MAX_DATA_SEGS.
87 */
88 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
89 delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frmr_depth;
90 do {
91 depth += 2; /* FRMR reg + invalidate */
92 delta -= ia->ri_max_frmr_depth;
93 } while (delta > 0);
94 }
95
96 ep->rep_attr.cap.max_send_wr *= depth;
97 if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
98 cdata->max_requests = devattr->max_qp_wr / depth;
99 if (!cdata->max_requests)
100 return -EINVAL;
101 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
102 depth;
103 }
104
105 return 0;
106}
107
108/* FRWR mode conveys a list of pages per chunk segment. The
109 * maximum length of that list is the FRWR page list depth.
110 */
111static size_t
112frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
113{
114 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
115
116 return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
117 rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth);
118}
119
120/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. */
121static void
122frwr_sendcompletion(struct ib_wc *wc)
123{
124 struct rpcrdma_mw *r;
125
126 if (likely(wc->status == IB_WC_SUCCESS))
127 return;
128
129 /* WARNING: Only wr_id and status are reliable at this point */
130 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
131 dprintk("RPC: %s: frmr %p (stale), status %d\n",
132 __func__, r, wc->status);
133 r->r.frmr.fr_state = FRMR_IS_STALE;
134}
135
136static int
137frwr_op_init(struct rpcrdma_xprt *r_xprt)
138{
139 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
140 struct ib_device *device = r_xprt->rx_ia.ri_id->device;
141 unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
142 struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
143 int i;
144
145 INIT_LIST_HEAD(&buf->rb_mws);
146 INIT_LIST_HEAD(&buf->rb_all);
147
148 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
149 dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i);
150
151 while (i--) {
152 struct rpcrdma_mw *r;
153 int rc;
154
155 r = kzalloc(sizeof(*r), GFP_KERNEL);
156 if (!r)
157 return -ENOMEM;
158
159 rc = __frwr_init(r, pd, device, depth);
160 if (rc) {
161 kfree(r);
162 return rc;
163 }
164
165 list_add(&r->mw_list, &buf->rb_mws);
166 list_add(&r->mw_all, &buf->rb_all);
167 r->mw_sendcompletion = frwr_sendcompletion;
168 }
169
170 return 0;
171}
172
173/* Post a FAST_REG Work Request to register a memory region
174 * for remote access via RDMA READ or RDMA WRITE.
175 */
176static int
177frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
178 int nsegs, bool writing)
179{
180 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
181 struct ib_device *device = ia->ri_id->device;
182 enum dma_data_direction direction = rpcrdma_data_dir(writing);
183 struct rpcrdma_mr_seg *seg1 = seg;
184 struct rpcrdma_mw *mw = seg1->rl_mw;
185 struct rpcrdma_frmr *frmr = &mw->r.frmr;
186 struct ib_mr *mr = frmr->fr_mr;
187 struct ib_send_wr fastreg_wr, *bad_wr;
188 u8 key;
189 int len, pageoff;
190 int i, rc;
191 int seg_len;
192 u64 pa;
193 int page_no;
194
195 pageoff = offset_in_page(seg1->mr_offset);
196 seg1->mr_offset -= pageoff; /* start of page */
197 seg1->mr_len += pageoff;
198 len = -pageoff;
199 if (nsegs > ia->ri_max_frmr_depth)
200 nsegs = ia->ri_max_frmr_depth;
201 for (page_no = i = 0; i < nsegs;) {
202 rpcrdma_map_one(device, seg, direction);
203 pa = seg->mr_dma;
204 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
205 frmr->fr_pgl->page_list[page_no++] = pa;
206 pa += PAGE_SIZE;
207 }
208 len += seg->mr_len;
209 ++seg;
210 ++i;
211 /* Check for holes */
212 if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
213 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
214 break;
215 }
216 dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n",
217 __func__, mw, i, len);
218
219 frmr->fr_state = FRMR_IS_VALID;
220
221 memset(&fastreg_wr, 0, sizeof(fastreg_wr));
222 fastreg_wr.wr_id = (unsigned long)(void *)mw;
223 fastreg_wr.opcode = IB_WR_FAST_REG_MR;
224 fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff;
225 fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
226 fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
227 fastreg_wr.wr.fast_reg.page_list_len = page_no;
228 fastreg_wr.wr.fast_reg.length = len;
229 fastreg_wr.wr.fast_reg.access_flags = writing ?
230 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
231 IB_ACCESS_REMOTE_READ;
232 key = (u8)(mr->rkey & 0x000000FF);
233 ib_update_fast_reg_key(mr, ++key);
234 fastreg_wr.wr.fast_reg.rkey = mr->rkey;
235
236 DECR_CQCOUNT(&r_xprt->rx_ep);
237 rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
238 if (rc)
239 goto out_senderr;
240
241 seg1->mr_rkey = mr->rkey;
242 seg1->mr_base = seg1->mr_dma + pageoff;
243 seg1->mr_nsegs = i;
244 seg1->mr_len = len;
245 return i;
246
247out_senderr:
248 dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
249 ib_update_fast_reg_key(mr, --key);
250 frmr->fr_state = FRMR_IS_INVALID;
251 while (i--)
252 rpcrdma_unmap_one(device, --seg);
253 return rc;
254}
255
256/* Post a LOCAL_INV Work Request to prevent further remote access
257 * via RDMA READ or RDMA WRITE.
258 */
259static int
260frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
261{
262 struct rpcrdma_mr_seg *seg1 = seg;
263 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
264 struct ib_send_wr invalidate_wr, *bad_wr;
265 int rc, nsegs = seg->mr_nsegs;
266 struct ib_device *device;
267
268 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
269
270 memset(&invalidate_wr, 0, sizeof(invalidate_wr));
271 invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
272 invalidate_wr.opcode = IB_WR_LOCAL_INV;
273 invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
274 DECR_CQCOUNT(&r_xprt->rx_ep);
275
276 read_lock(&ia->ri_qplock);
277 device = ia->ri_id->device;
278 while (seg1->mr_nsegs--)
279 rpcrdma_unmap_one(device, seg++);
280 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
281 read_unlock(&ia->ri_qplock);
282 if (rc)
283 goto out_err;
284 return nsegs;
285
286out_err:
287 /* Force rpcrdma_buffer_get() to retry */
288 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
289 dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
290 return nsegs;
291}
292
293/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
294 * an unusable state. Find FRMRs in this state and dereg / reg
295 * each. FRMRs that are VALID and attached to an rpcrdma_req are
296 * also torn down.
297 *
298 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
299 *
300 * This is invoked only in the transport connect worker in order
301 * to serialize with rpcrdma_register_frmr_external().
302 */
303static void
304frwr_op_reset(struct rpcrdma_xprt *r_xprt)
305{
306 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
307 struct ib_device *device = r_xprt->rx_ia.ri_id->device;
308 unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
309 struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
310 struct rpcrdma_mw *r;
311 int rc;
312
313 list_for_each_entry(r, &buf->rb_all, mw_all) {
314 if (r->r.frmr.fr_state == FRMR_IS_INVALID)
315 continue;
316
317 __frwr_release(r);
318 rc = __frwr_init(r, pd, device, depth);
319 if (rc) {
320 dprintk("RPC: %s: mw %p left %s\n",
321 __func__, r,
322 (r->r.frmr.fr_state == FRMR_IS_STALE ?
323 "stale" : "valid"));
324 continue;
325 }
326
327 r->r.frmr.fr_state = FRMR_IS_INVALID;
328 }
329}
330
331static void
332frwr_op_destroy(struct rpcrdma_buffer *buf)
333{
334 struct rpcrdma_mw *r;
335
336 while (!list_empty(&buf->rb_all)) {
337 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
338 list_del(&r->mw_all);
339 __frwr_release(r);
340 kfree(r);
341 }
342}
343
344const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
345 .ro_map = frwr_op_map,
346 .ro_unmap = frwr_op_unmap,
347 .ro_open = frwr_op_open,
348 .ro_maxpages = frwr_op_maxpages,
349 .ro_init = frwr_op_init,
350 .ro_reset = frwr_op_reset,
351 .ro_destroy = frwr_op_destroy,
352 .ro_displayname = "frwr",
353};
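
frwr_op_open() above sizes the send queue by multiplying the requested credit count by a per-RPC work-request depth: 7 WRs for the base case, plus a register/invalidate pair for every additional FRMR needed when the device's fast-register page-list limit (devattr->max_fast_reg_page_list_len) is smaller than RPCRDMA_MAX_DATA_SEGS. A small arithmetic sketch of that calculation with made-up numbers:

#include <stdio.h>

/* Same shape as the depth calculation in frwr_op_open(). */
static int frwr_depth(unsigned int max_data_segs, unsigned int max_frmr_depth)
{
        int depth = 7;
        int delta;

        if (max_frmr_depth < max_data_segs) {
                delta = max_data_segs - max_frmr_depth;
                do {
                        depth += 2;             /* one more reg + invalidate pair */
                        delta -= max_frmr_depth;
                } while (delta > 0);
        }
        return depth;
}

int main(void)
{
        /* e.g. 64 data segments on a device limited to 16 pages per FRMR:
         * three extra FRMRs, so depth = 7 + 3 * 2 = 13 */
        int depth = frwr_depth(64, 16);

        printf("depth = %d, max_send_wr for 32 credits = %d\n", depth, 32 * depth);
        return 0;
}

If the resulting max_send_wr exceeds the device's max_qp_wr, the real code scales cdata->max_requests down so the product fits, and fails only when not even one request per depth would fit.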
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
new file mode 100644
index 000000000000..ba518af16787
--- /dev/null
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -0,0 +1,94 @@
1/*
2 * Copyright (c) 2015 Oracle. All rights reserved.
3 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
4 */
5
6/* No-op chunk preparation. All client memory is pre-registered.
7 * Sometimes referred to as ALLPHYSICAL mode.
8 *
9 * Physical registration is simple because all client memory is
10 * pre-registered and never deregistered. This mode is good for
11 * adapter bring up, but is considered not safe: the server is
12 * trusted not to abuse its access to client memory not involved
13 * in RDMA I/O.
14 */
15
16#include "xprt_rdma.h"
17
18#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
19# define RPCDBG_FACILITY RPCDBG_TRANS
20#endif
21
22static int
23physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
24 struct rpcrdma_create_data_internal *cdata)
25{
26 return 0;
27}
28
29/* PHYSICAL memory registration conveys one page per chunk segment.
30 */
31static size_t
32physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
33{
34 return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
35 rpcrdma_max_segments(r_xprt));
36}
37
38static int
39physical_op_init(struct rpcrdma_xprt *r_xprt)
40{
41 return 0;
42}
43
44/* The client's physical memory is already exposed for
45 * remote access via RDMA READ or RDMA WRITE.
46 */
47static int
48physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
49 int nsegs, bool writing)
50{
51 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
52
53 rpcrdma_map_one(ia->ri_id->device, seg,
54 rpcrdma_data_dir(writing));
55 seg->mr_rkey = ia->ri_bind_mem->rkey;
56 seg->mr_base = seg->mr_dma;
57 seg->mr_nsegs = 1;
58 return 1;
59}
60
61/* Unmap a memory region, but leave it registered.
62 */
63static int
64physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
65{
66 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
67
68 read_lock(&ia->ri_qplock);
69 rpcrdma_unmap_one(ia->ri_id->device, seg);
70 read_unlock(&ia->ri_qplock);
71
72 return 1;
73}
74
75static void
76physical_op_reset(struct rpcrdma_xprt *r_xprt)
77{
78}
79
80static void
81physical_op_destroy(struct rpcrdma_buffer *buf)
82{
83}
84
85const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
86 .ro_map = physical_op_map,
87 .ro_unmap = physical_op_unmap,
88 .ro_open = physical_op_open,
89 .ro_maxpages = physical_op_maxpages,
90 .ro_init = physical_op_init,
91 .ro_reset = physical_op_reset,
92 .ro_destroy = physical_op_destroy,
93 .ro_displayname = "physical",
94};
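
fmr_ops.c, frwr_ops.c and physical_ops.c each export an rpcrdma_memreg_ops table; the verbs.c hunk later in this diff points ia->ri_ops at one of them when the transport is set up, and the marshaling and transport code then calls through ri_ops->ro_map / ro_unmap instead of switching on ri_memreg_strategy at every call site. A toy sketch of that vtable-selection pattern with invented names (not the kernel structures):

#include <stdio.h>

struct memreg_ops {
        int        (*map)(int nsegs);          /* stands in for ->ro_map   */
        int        (*unmap)(void);             /* stands in for ->ro_unmap */
        const char *displayname;
};

static int fmr_map(int n)   { printf("fmr map %d segs\n", n);  return n; }
static int fmr_unmap(void)  { return 0; }
static int frwr_map(int n)  { printf("frwr map %d segs\n", n); return n; }
static int frwr_unmap(void) { return 0; }

static const struct memreg_ops fmr_ops  = { fmr_map,  fmr_unmap,  "fmr"  };
static const struct memreg_ops frwr_ops = { frwr_map, frwr_unmap, "frwr" };

enum strategy { FRMR, MTHCAFMR };

int main(void)
{
        enum strategy memreg = FRMR;            /* picked from device capabilities */
        const struct memreg_ops *ops = (memreg == FRMR) ? &frwr_ops : &fmr_ops;

        printf("memory registration strategy is '%s'\n", ops->displayname);
        ops->map(4);                            /* callers no longer switch() */
        return ops->unmap();
}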
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 91ffde82fa0c..2c53ea9e1b83 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -53,6 +53,14 @@
53# define RPCDBG_FACILITY RPCDBG_TRANS 53# define RPCDBG_FACILITY RPCDBG_TRANS
54#endif 54#endif
55 55
56enum rpcrdma_chunktype {
57 rpcrdma_noch = 0,
58 rpcrdma_readch,
59 rpcrdma_areadch,
60 rpcrdma_writech,
61 rpcrdma_replych
62};
63
56#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 64#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
57static const char transfertypes[][12] = { 65static const char transfertypes[][12] = {
58 "pure inline", /* no chunks */ 66 "pure inline", /* no chunks */
@@ -179,6 +187,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
179 struct rpcrdma_write_array *warray = NULL; 187 struct rpcrdma_write_array *warray = NULL;
180 struct rpcrdma_write_chunk *cur_wchunk = NULL; 188 struct rpcrdma_write_chunk *cur_wchunk = NULL;
181 __be32 *iptr = headerp->rm_body.rm_chunks; 189 __be32 *iptr = headerp->rm_body.rm_chunks;
190 int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool);
182 191
183 if (type == rpcrdma_readch || type == rpcrdma_areadch) { 192 if (type == rpcrdma_readch || type == rpcrdma_areadch) {
184 /* a read chunk - server will RDMA Read our memory */ 193 /* a read chunk - server will RDMA Read our memory */
@@ -201,9 +210,9 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
201 if (nsegs < 0) 210 if (nsegs < 0)
202 return nsegs; 211 return nsegs;
203 212
213 map = r_xprt->rx_ia.ri_ops->ro_map;
204 do { 214 do {
205 n = rpcrdma_register_external(seg, nsegs, 215 n = map(r_xprt, seg, nsegs, cur_wchunk != NULL);
206 cur_wchunk != NULL, r_xprt);
207 if (n <= 0) 216 if (n <= 0)
208 goto out; 217 goto out;
209 if (cur_rchunk) { /* read */ 218 if (cur_rchunk) { /* read */
@@ -275,34 +284,13 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
275 return (unsigned char *)iptr - (unsigned char *)headerp; 284 return (unsigned char *)iptr - (unsigned char *)headerp;
276 285
277out: 286out:
278 if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) { 287 if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
279 for (pos = 0; nchunks--;) 288 return n;
280 pos += rpcrdma_deregister_external(
281 &req->rl_segments[pos], r_xprt);
282 }
283 return n;
284}
285 289
286/* 290 for (pos = 0; nchunks--;)
287 * Marshal chunks. This routine returns the header length 291 pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
288 * consumed by marshaling. 292 &req->rl_segments[pos]);
289 * 293 return n;
290 * Returns positive RPC/RDMA header size, or negative errno.
291 */
292
293ssize_t
294rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result)
295{
296 struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
297 struct rpcrdma_msg *headerp = rdmab_to_msg(req->rl_rdmabuf);
298
299 if (req->rl_rtype != rpcrdma_noch)
300 result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
301 headerp, req->rl_rtype);
302 else if (req->rl_wtype != rpcrdma_noch)
303 result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
304 headerp, req->rl_wtype);
305 return result;
306} 294}
307 295
308/* 296/*
@@ -397,6 +385,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
397 char *base; 385 char *base;
398 size_t rpclen, padlen; 386 size_t rpclen, padlen;
399 ssize_t hdrlen; 387 ssize_t hdrlen;
388 enum rpcrdma_chunktype rtype, wtype;
400 struct rpcrdma_msg *headerp; 389 struct rpcrdma_msg *headerp;
401 390
402 /* 391 /*
@@ -433,13 +422,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
433 * into pages; otherwise use reply chunks. 422 * into pages; otherwise use reply chunks.
434 */ 423 */
435 if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) 424 if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
436 req->rl_wtype = rpcrdma_noch; 425 wtype = rpcrdma_noch;
437 else if (rqst->rq_rcv_buf.page_len == 0) 426 else if (rqst->rq_rcv_buf.page_len == 0)
438 req->rl_wtype = rpcrdma_replych; 427 wtype = rpcrdma_replych;
439 else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) 428 else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
440 req->rl_wtype = rpcrdma_writech; 429 wtype = rpcrdma_writech;
441 else 430 else
442 req->rl_wtype = rpcrdma_replych; 431 wtype = rpcrdma_replych;
443 432
444 /* 433 /*
445 * Chunks needed for arguments? 434 * Chunks needed for arguments?
@@ -456,16 +445,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
456 * TBD check NFSv4 setacl 445 * TBD check NFSv4 setacl
457 */ 446 */
458 if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) 447 if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
459 req->rl_rtype = rpcrdma_noch; 448 rtype = rpcrdma_noch;
460 else if (rqst->rq_snd_buf.page_len == 0) 449 else if (rqst->rq_snd_buf.page_len == 0)
461 req->rl_rtype = rpcrdma_areadch; 450 rtype = rpcrdma_areadch;
462 else 451 else
463 req->rl_rtype = rpcrdma_readch; 452 rtype = rpcrdma_readch;
464 453
465 /* The following simplification is not true forever */ 454 /* The following simplification is not true forever */
466 if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych) 455 if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
467 req->rl_wtype = rpcrdma_noch; 456 wtype = rpcrdma_noch;
468 if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) { 457 if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
469 dprintk("RPC: %s: cannot marshal multiple chunk lists\n", 458 dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
470 __func__); 459 __func__);
471 return -EIO; 460 return -EIO;
@@ -479,7 +468,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
479 * When padding is in use and applies to the transfer, insert 468 * When padding is in use and applies to the transfer, insert
480 * it and change the message type. 469 * it and change the message type.
481 */ 470 */
482 if (req->rl_rtype == rpcrdma_noch) { 471 if (rtype == rpcrdma_noch) {
483 472
484 padlen = rpcrdma_inline_pullup(rqst, 473 padlen = rpcrdma_inline_pullup(rqst,
485 RPCRDMA_INLINE_PAD_VALUE(rqst)); 474 RPCRDMA_INLINE_PAD_VALUE(rqst));
@@ -494,7 +483,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
494 headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; 483 headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
495 headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; 484 headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
496 hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ 485 hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
497 if (req->rl_wtype != rpcrdma_noch) { 486 if (wtype != rpcrdma_noch) {
498 dprintk("RPC: %s: invalid chunk list\n", 487 dprintk("RPC: %s: invalid chunk list\n",
499 __func__); 488 __func__);
500 return -EIO; 489 return -EIO;
@@ -515,18 +504,26 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
515 * on receive. Therefore, we request a reply chunk 504 * on receive. Therefore, we request a reply chunk
516 * for non-writes wherever feasible and efficient. 505 * for non-writes wherever feasible and efficient.
517 */ 506 */
518 if (req->rl_wtype == rpcrdma_noch) 507 if (wtype == rpcrdma_noch)
519 req->rl_wtype = rpcrdma_replych; 508 wtype = rpcrdma_replych;
520 } 509 }
521 } 510 }
522 511
523 hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen); 512 if (rtype != rpcrdma_noch) {
513 hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
514 headerp, rtype);
515 wtype = rtype; /* simplify dprintk */
516
517 } else if (wtype != rpcrdma_noch) {
518 hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
519 headerp, wtype);
520 }
524 if (hdrlen < 0) 521 if (hdrlen < 0)
525 return hdrlen; 522 return hdrlen;
526 523
527 dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" 524 dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
528 " headerp 0x%p base 0x%p lkey 0x%x\n", 525 " headerp 0x%p base 0x%p lkey 0x%x\n",
529 __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen, 526 __func__, transfertypes[wtype], hdrlen, rpclen, padlen,
530 headerp, base, rdmab_lkey(req->rl_rdmabuf)); 527 headerp, base, rdmab_lkey(req->rl_rdmabuf));
531 528
532 /* 529 /*
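
The rpc_rdma.c hunk replaces the per-request rl_rtype/rl_wtype fields (and the separate rpcrdma_marshal_chunks() pass) with chunk types computed locally in rpcrdma_marshal_req(): the reply side becomes noch/replych/writech from the receive buffer, the argument side becomes noch/areadch/readch from the send buffer, and "read chunks plus a reply chunk" is simplified to read chunks only. A condensed restatement of that decision logic, with the inline thresholds passed as plain numbers instead of the RPCRDMA_INLINE_* macros:

#include <stdio.h>

enum chunktype { noch, readch, areadch, writech, replych };

struct buf { unsigned int len, buflen, page_len; int is_read; /* XDRBUF_READ */ };

static enum chunktype reply_type(const struct buf *rcv, unsigned int inline_rd)
{
        if (rcv->buflen <= inline_rd)
                return noch;                    /* whole reply fits inline */
        if (rcv->page_len == 0)
                return replych;                 /* large, but no page data */
        return rcv->is_read ? writech : replych;
}

static enum chunktype arg_type(const struct buf *snd, unsigned int inline_wr)
{
        if (snd->len <= inline_wr)
                return noch;                    /* whole call fits inline */
        return snd->page_len == 0 ? areadch : readch;
}

int main(void)
{
        struct buf rcv = { .buflen = 128 * 1024, .page_len = 64 * 1024, .is_read = 1 };
        struct buf snd = { .len = 512 };
        enum chunktype wtype = reply_type(&rcv, 1024);
        enum chunktype rtype = arg_type(&snd, 1024);

        /* the "simplification is not true forever": read chunks win over a
         * reply chunk, and having both kinds at once is still rejected */
        if (rtype != noch && wtype == replych)
                wtype = noch;
        printf("rtype=%d wtype=%d\n", rtype, wtype);
        return 0;
}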
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 2e192baa59f3..54f23b1be986 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -157,12 +157,47 @@ static struct ctl_table sunrpc_table[] = {
157static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ 157static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */
158 158
159static void 159static void
160xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap)
161{
162 struct sockaddr_in *sin = (struct sockaddr_in *)sap;
163 char buf[20];
164
165 snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));
166 xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
167
168 xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA;
169}
170
171static void
172xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap)
173{
174 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
175 char buf[40];
176
177 snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr);
178 xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
179
180 xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6;
181}
182
183static void
160xprt_rdma_format_addresses(struct rpc_xprt *xprt) 184xprt_rdma_format_addresses(struct rpc_xprt *xprt)
161{ 185{
162 struct sockaddr *sap = (struct sockaddr *) 186 struct sockaddr *sap = (struct sockaddr *)
163 &rpcx_to_rdmad(xprt).addr; 187 &rpcx_to_rdmad(xprt).addr;
164 struct sockaddr_in *sin = (struct sockaddr_in *)sap; 188 char buf[128];
165 char buf[64]; 189
190 switch (sap->sa_family) {
191 case AF_INET:
192 xprt_rdma_format_addresses4(xprt, sap);
193 break;
194 case AF_INET6:
195 xprt_rdma_format_addresses6(xprt, sap);
196 break;
197 default:
198 pr_err("rpcrdma: Unrecognized address family\n");
199 return;
200 }
166 201
167 (void)rpc_ntop(sap, buf, sizeof(buf)); 202 (void)rpc_ntop(sap, buf, sizeof(buf));
168 xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); 203 xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL);
@@ -170,16 +205,10 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt)
170 snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap)); 205 snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap));
171 xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL); 206 xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL);
172 207
173 xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
174
175 snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));
176 xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
177
178 snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap)); 208 snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap));
179 xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); 209 xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
180 210
181 /* netid */ 211 xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
182 xprt->address_strings[RPC_DISPLAY_NETID] = "rdma";
183} 212}
184 213
185static void 214static void
@@ -377,7 +406,10 @@ xprt_setup_rdma(struct xprt_create *args)
377 xprt_rdma_connect_worker); 406 xprt_rdma_connect_worker);
378 407
379 xprt_rdma_format_addresses(xprt); 408 xprt_rdma_format_addresses(xprt);
380 xprt->max_payload = rpcrdma_max_payload(new_xprt); 409 xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt);
410 if (xprt->max_payload == 0)
411 goto out4;
412 xprt->max_payload <<= PAGE_SHIFT;
381 dprintk("RPC: %s: transport data payload maximum: %zu bytes\n", 413 dprintk("RPC: %s: transport data payload maximum: %zu bytes\n",
382 __func__, xprt->max_payload); 414 __func__, xprt->max_payload);
383 415
@@ -552,8 +584,8 @@ xprt_rdma_free(void *buffer)
552 584
553 for (i = 0; req->rl_nchunks;) { 585 for (i = 0; req->rl_nchunks;) {
554 --req->rl_nchunks; 586 --req->rl_nchunks;
555 i += rpcrdma_deregister_external( 587 i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
556 &req->rl_segments[i], r_xprt); 588 &req->rl_segments[i]);
557 } 589 }
558 590
559 rpcrdma_buffer_put(req); 591 rpcrdma_buffer_put(req);
@@ -579,10 +611,7 @@ xprt_rdma_send_request(struct rpc_task *task)
579 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 611 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
580 int rc = 0; 612 int rc = 0;
581 613
582 if (req->rl_niovs == 0) 614 rc = rpcrdma_marshal_req(rqst);
583 rc = rpcrdma_marshal_req(rqst);
584 else if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_ALLPHYSICAL)
585 rc = rpcrdma_marshal_chunks(rqst, 0);
586 if (rc < 0) 615 if (rc < 0)
587 goto failed_marshal; 616 goto failed_marshal;
588 617
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index e28909fddd30..4870d272e006 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -50,6 +50,7 @@
50#include <linux/interrupt.h> 50#include <linux/interrupt.h>
51#include <linux/slab.h> 51#include <linux/slab.h>
52#include <linux/prefetch.h> 52#include <linux/prefetch.h>
53#include <linux/sunrpc/addr.h>
53#include <asm/bitops.h> 54#include <asm/bitops.h>
54 55
55#include "xprt_rdma.h" 56#include "xprt_rdma.h"
@@ -62,9 +63,6 @@
62# define RPCDBG_FACILITY RPCDBG_TRANS 63# define RPCDBG_FACILITY RPCDBG_TRANS
63#endif 64#endif
64 65
65static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
66static void rpcrdma_reset_fmrs(struct rpcrdma_ia *);
67
68/* 66/*
69 * internal functions 67 * internal functions
70 */ 68 */
@@ -188,7 +186,7 @@ static const char * const wc_status[] = {
188 "remote access error", 186 "remote access error",
189 "remote operation error", 187 "remote operation error",
190 "transport retry counter exceeded", 188 "transport retry counter exceeded",
191 "RNR retrycounter exceeded", 189 "RNR retry counter exceeded",
192 "local RDD violation error", 190 "local RDD violation error",
193 "remove invalid RD request", 191 "remove invalid RD request",
194 "operation aborted", 192 "operation aborted",
@@ -206,21 +204,17 @@ static const char * const wc_status[] = {
206static void 204static void
207rpcrdma_sendcq_process_wc(struct ib_wc *wc) 205rpcrdma_sendcq_process_wc(struct ib_wc *wc)
208{ 206{
209 if (likely(wc->status == IB_WC_SUCCESS))
210 return;
211
212 /* WARNING: Only wr_id and status are reliable at this point */ 207 /* WARNING: Only wr_id and status are reliable at this point */
213 if (wc->wr_id == 0ULL) { 208 if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) {
214 if (wc->status != IB_WC_WR_FLUSH_ERR) 209 if (wc->status != IB_WC_SUCCESS &&
210 wc->status != IB_WC_WR_FLUSH_ERR)
215 pr_err("RPC: %s: SEND: %s\n", 211 pr_err("RPC: %s: SEND: %s\n",
216 __func__, COMPLETION_MSG(wc->status)); 212 __func__, COMPLETION_MSG(wc->status));
217 } else { 213 } else {
218 struct rpcrdma_mw *r; 214 struct rpcrdma_mw *r;
219 215
220 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; 216 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
221 r->r.frmr.fr_state = FRMR_IS_STALE; 217 r->mw_sendcompletion(wc);
222 pr_err("RPC: %s: frmr %p (stale): %s\n",
223 __func__, r, COMPLETION_MSG(wc->status));
224 } 218 }
225} 219}
226 220
@@ -424,7 +418,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
424 struct rpcrdma_ia *ia = &xprt->rx_ia; 418 struct rpcrdma_ia *ia = &xprt->rx_ia;
425 struct rpcrdma_ep *ep = &xprt->rx_ep; 419 struct rpcrdma_ep *ep = &xprt->rx_ep;
426#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 420#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
427 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr; 421 struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
428#endif 422#endif
429 struct ib_qp_attr *attr = &ia->ri_qp_attr; 423 struct ib_qp_attr *attr = &ia->ri_qp_attr;
430 struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr; 424 struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
@@ -480,9 +474,8 @@ connected:
480 wake_up_all(&ep->rep_connect_wait); 474 wake_up_all(&ep->rep_connect_wait);
481 /*FALLTHROUGH*/ 475 /*FALLTHROUGH*/
482 default: 476 default:
483 dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n", 477 dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n",
484 __func__, &addr->sin_addr.s_addr, 478 __func__, sap, rpc_get_port(sap), ep,
485 ntohs(addr->sin_port), ep,
486 CONNECTION_MSG(event->event)); 479 CONNECTION_MSG(event->event));
487 break; 480 break;
488 } 481 }
@@ -491,19 +484,16 @@ connected:
491 if (connstate == 1) { 484 if (connstate == 1) {
492 int ird = attr->max_dest_rd_atomic; 485 int ird = attr->max_dest_rd_atomic;
493 int tird = ep->rep_remote_cma.responder_resources; 486 int tird = ep->rep_remote_cma.responder_resources;
494 printk(KERN_INFO "rpcrdma: connection to %pI4:%u " 487
495 "on %s, memreg %d slots %d ird %d%s\n", 488 pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
496 &addr->sin_addr.s_addr, 489 sap, rpc_get_port(sap),
497 ntohs(addr->sin_port),
498 ia->ri_id->device->name, 490 ia->ri_id->device->name,
499 ia->ri_memreg_strategy, 491 ia->ri_ops->ro_displayname,
500 xprt->rx_buf.rb_max_requests, 492 xprt->rx_buf.rb_max_requests,
501 ird, ird < 4 && ird < tird / 2 ? " (low!)" : ""); 493 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
502 } else if (connstate < 0) { 494 } else if (connstate < 0) {
503 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n", 495 pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n",
504 &addr->sin_addr.s_addr, 496 sap, rpc_get_port(sap), connstate);
505 ntohs(addr->sin_port),
506 connstate);
507 } 497 }
508#endif 498#endif
509 499
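The logging hunks above replace the IPv4-only %pI4 plus ntohs(sin_port) pair with %pIS plus rpc_get_port(), which format any struct sockaddr without the caller knowing the address family. A rough user-space analogue, assuming getnameinfo() as a stand-in for the kernel's %pIS printk extension:

    #include <arpa/inet.h>
    #include <netdb.h>
    #include <netinet/in.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>

    /* Family-agnostic peer formatting, roughly what %pIS + rpc_get_port do. */
    static void print_peer(const struct sockaddr *sap, socklen_t len)
    {
            char host[NI_MAXHOST], port[NI_MAXSERV];

            if (!getnameinfo(sap, len, host, sizeof(host), port, sizeof(port),
                             NI_NUMERICHOST | NI_NUMERICSERV))
                    printf("rpcrdma: connection to %s:%s\n", host, port);
    }

    int main(void)
    {
            struct sockaddr_in sin;

            memset(&sin, 0, sizeof(sin));
            sin.sin_family = AF_INET;
            sin.sin_port = htons(20049);            /* example NFS/RDMA port */
            inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);
            print_peer((struct sockaddr *)&sin, sizeof(sin));
            return 0;
    }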
@@ -621,17 +611,13 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
621 611
622 if (memreg == RPCRDMA_FRMR) { 612 if (memreg == RPCRDMA_FRMR) {
623 /* Requires both frmr reg and local dma lkey */ 613 /* Requires both frmr reg and local dma lkey */
624 if ((devattr->device_cap_flags & 614 if (((devattr->device_cap_flags &
625 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != 615 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
626 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { 616 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) ||
617 (devattr->max_fast_reg_page_list_len == 0)) {
627 dprintk("RPC: %s: FRMR registration " 618 dprintk("RPC: %s: FRMR registration "
628 "not supported by HCA\n", __func__); 619 "not supported by HCA\n", __func__);
629 memreg = RPCRDMA_MTHCAFMR; 620 memreg = RPCRDMA_MTHCAFMR;
630 } else {
631 /* Mind the ia limit on FRMR page list depth */
632 ia->ri_max_frmr_depth = min_t(unsigned int,
633 RPCRDMA_MAX_DATA_SEGS,
634 devattr->max_fast_reg_page_list_len);
635 } 621 }
636 } 622 }
637 if (memreg == RPCRDMA_MTHCAFMR) { 623 if (memreg == RPCRDMA_MTHCAFMR) {
@@ -652,13 +638,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
652 */ 638 */
653 switch (memreg) { 639 switch (memreg) {
654 case RPCRDMA_FRMR: 640 case RPCRDMA_FRMR:
641 ia->ri_ops = &rpcrdma_frwr_memreg_ops;
655 break; 642 break;
656 case RPCRDMA_ALLPHYSICAL: 643 case RPCRDMA_ALLPHYSICAL:
644 ia->ri_ops = &rpcrdma_physical_memreg_ops;
657 mem_priv = IB_ACCESS_LOCAL_WRITE | 645 mem_priv = IB_ACCESS_LOCAL_WRITE |
658 IB_ACCESS_REMOTE_WRITE | 646 IB_ACCESS_REMOTE_WRITE |
659 IB_ACCESS_REMOTE_READ; 647 IB_ACCESS_REMOTE_READ;
660 goto register_setup; 648 goto register_setup;
661 case RPCRDMA_MTHCAFMR: 649 case RPCRDMA_MTHCAFMR:
650 ia->ri_ops = &rpcrdma_fmr_memreg_ops;
662 if (ia->ri_have_dma_lkey) 651 if (ia->ri_have_dma_lkey)
663 break; 652 break;
664 mem_priv = IB_ACCESS_LOCAL_WRITE; 653 mem_priv = IB_ACCESS_LOCAL_WRITE;
@@ -678,8 +667,8 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
678 rc = -ENOMEM; 667 rc = -ENOMEM;
679 goto out3; 668 goto out3;
680 } 669 }
681 dprintk("RPC: %s: memory registration strategy is %d\n", 670 dprintk("RPC: %s: memory registration strategy is '%s'\n",
682 __func__, memreg); 671 __func__, ia->ri_ops->ro_displayname);
683 672
684 /* Else will do memory reg/dereg for each chunk */ 673 /* Else will do memory reg/dereg for each chunk */
685 ia->ri_memreg_strategy = memreg; 674 ia->ri_memreg_strategy = memreg;
@@ -743,49 +732,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
743 732
744 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; 733 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
745 ep->rep_attr.qp_context = ep; 734 ep->rep_attr.qp_context = ep;
746 /* send_cq and recv_cq initialized below */
747 ep->rep_attr.srq = NULL; 735 ep->rep_attr.srq = NULL;
748 ep->rep_attr.cap.max_send_wr = cdata->max_requests; 736 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
749 switch (ia->ri_memreg_strategy) { 737 rc = ia->ri_ops->ro_open(ia, ep, cdata);
750 case RPCRDMA_FRMR: { 738 if (rc)
751 int depth = 7; 739 return rc;
752
753 /* Add room for frmr register and invalidate WRs.
754 * 1. FRMR reg WR for head
755 * 2. FRMR invalidate WR for head
756 * 3. N FRMR reg WRs for pagelist
757 * 4. N FRMR invalidate WRs for pagelist
758 * 5. FRMR reg WR for tail
759 * 6. FRMR invalidate WR for tail
760 * 7. The RDMA_SEND WR
761 */
762
763 /* Calculate N if the device max FRMR depth is smaller than
764 * RPCRDMA_MAX_DATA_SEGS.
765 */
766 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
767 int delta = RPCRDMA_MAX_DATA_SEGS -
768 ia->ri_max_frmr_depth;
769
770 do {
771 depth += 2; /* FRMR reg + invalidate */
772 delta -= ia->ri_max_frmr_depth;
773 } while (delta > 0);
774
775 }
776 ep->rep_attr.cap.max_send_wr *= depth;
777 if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
778 cdata->max_requests = devattr->max_qp_wr / depth;
779 if (!cdata->max_requests)
780 return -EINVAL;
781 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
782 depth;
783 }
784 break;
785 }
786 default:
787 break;
788 }
789 ep->rep_attr.cap.max_recv_wr = cdata->max_requests; 740 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
790 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2); 741 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
791 ep->rep_attr.cap.max_recv_sge = 1; 742 ep->rep_attr.cap.max_recv_sge = 1;
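The removed block computed how deep the send queue must be when FRWR is in use: seven work requests per RPC (register/invalidate for head, pagelist and tail, plus the SEND itself), plus two more for every additional FRMR needed when the device's fast-register depth is smaller than RPCRDMA_MAX_DATA_SEGS. That calculation now lives behind ia->ri_ops->ro_open(). A standalone sketch of the arithmetic, using illustrative values (64 data segments, device depth 16) that are assumptions, not the kernel's configuration:

    #include <stdio.h>

    int main(void)
    {
            int max_data_segs = 64;     /* assumed RPCRDMA_MAX_DATA_SEGS */
            int max_frmr_depth = 16;    /* example device limit */
            int depth = 7;              /* reg+inv for head/pagelist/tail + SEND */

            if (max_frmr_depth < max_data_segs) {
                    int delta = max_data_segs - max_frmr_depth;

                    do {
                            depth += 2;     /* one extra FRMR reg + invalidate */
                            delta -= max_frmr_depth;
                    } while (delta > 0);
            }
            printf("max_send_wr multiplier = %d\n", depth);  /* prints 13 */
            return 0;
    }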
@@ -944,21 +895,9 @@ retry:
944 rpcrdma_ep_disconnect(ep, ia); 895 rpcrdma_ep_disconnect(ep, ia);
945 rpcrdma_flush_cqs(ep); 896 rpcrdma_flush_cqs(ep);
946 897
947 switch (ia->ri_memreg_strategy) {
948 case RPCRDMA_FRMR:
949 rpcrdma_reset_frmrs(ia);
950 break;
951 case RPCRDMA_MTHCAFMR:
952 rpcrdma_reset_fmrs(ia);
953 break;
954 case RPCRDMA_ALLPHYSICAL:
955 break;
956 default:
957 rc = -EIO;
958 goto out;
959 }
960
961 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); 898 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
899 ia->ri_ops->ro_reset(xprt);
900
962 id = rpcrdma_create_id(xprt, ia, 901 id = rpcrdma_create_id(xprt, ia,
963 (struct sockaddr *)&xprt->rx_data.addr); 902 (struct sockaddr *)&xprt->rx_data.addr);
964 if (IS_ERR(id)) { 903 if (IS_ERR(id)) {
@@ -1123,91 +1062,6 @@ out:
1123 return ERR_PTR(rc); 1062 return ERR_PTR(rc);
1124} 1063}
1125 1064
1126static int
1127rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1128{
1129 int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
1130 struct ib_fmr_attr fmr_attr = {
1131 .max_pages = RPCRDMA_MAX_DATA_SEGS,
1132 .max_maps = 1,
1133 .page_shift = PAGE_SHIFT
1134 };
1135 struct rpcrdma_mw *r;
1136 int i, rc;
1137
1138 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1139 dprintk("RPC: %s: initializing %d FMRs\n", __func__, i);
1140
1141 while (i--) {
1142 r = kzalloc(sizeof(*r), GFP_KERNEL);
1143 if (r == NULL)
1144 return -ENOMEM;
1145
1146 r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
1147 if (IS_ERR(r->r.fmr)) {
1148 rc = PTR_ERR(r->r.fmr);
1149 dprintk("RPC: %s: ib_alloc_fmr failed %i\n",
1150 __func__, rc);
1151 goto out_free;
1152 }
1153
1154 list_add(&r->mw_list, &buf->rb_mws);
1155 list_add(&r->mw_all, &buf->rb_all);
1156 }
1157 return 0;
1158
1159out_free:
1160 kfree(r);
1161 return rc;
1162}
1163
1164static int
1165rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1166{
1167 struct rpcrdma_frmr *f;
1168 struct rpcrdma_mw *r;
1169 int i, rc;
1170
1171 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1172 dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i);
1173
1174 while (i--) {
1175 r = kzalloc(sizeof(*r), GFP_KERNEL);
1176 if (r == NULL)
1177 return -ENOMEM;
1178 f = &r->r.frmr;
1179
1180 f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1181 ia->ri_max_frmr_depth);
1182 if (IS_ERR(f->fr_mr)) {
1183 rc = PTR_ERR(f->fr_mr);
1184 dprintk("RPC: %s: ib_alloc_fast_reg_mr "
1185 "failed %i\n", __func__, rc);
1186 goto out_free;
1187 }
1188
1189 f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
1190 ia->ri_max_frmr_depth);
1191 if (IS_ERR(f->fr_pgl)) {
1192 rc = PTR_ERR(f->fr_pgl);
1193 dprintk("RPC: %s: ib_alloc_fast_reg_page_list "
1194 "failed %i\n", __func__, rc);
1195
1196 ib_dereg_mr(f->fr_mr);
1197 goto out_free;
1198 }
1199
1200 list_add(&r->mw_list, &buf->rb_mws);
1201 list_add(&r->mw_all, &buf->rb_all);
1202 }
1203
1204 return 0;
1205
1206out_free:
1207 kfree(r);
1208 return rc;
1209}
1210
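The two removed initializers built the memory-window pool that ro_init() now provides: (rb_max_requests + 1) * RPCRDMA_MAX_SEGS MWs, each linked onto rb_all for teardown/reset and onto rb_mws for allocation. A minimal user-space model of that two-list pool (sizes are examples, not the kernel constants):

    #include <stdio.h>
    #include <stdlib.h>

    struct mw {
            struct mw *all_next;    /* rb_all: every MW, for destroy/reset */
            struct mw *free_next;   /* rb_mws: currently unused MWs */
    };

    struct pool { struct mw *all; struct mw *free; };

    static int pool_init(struct pool *p, int max_requests, int max_segs)
    {
            int i, count = (max_requests + 1) * max_segs;

            for (i = 0; i < count; i++) {
                    struct mw *r = calloc(1, sizeof(*r));

                    if (!r)
                            return -1;
                    r->all_next = p->all;   p->all = r;
                    r->free_next = p->free; p->free = r;
            }
            printf("initialized %d MWs\n", count);
            return 0;
    }

    int main(void)
    {
            struct pool p = { NULL, NULL };

            return pool_init(&p, 32, 8);    /* example sizes only */
    }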
1211int 1065int
1212rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) 1066rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
1213{ 1067{
@@ -1244,22 +1098,9 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
1244 buf->rb_recv_bufs = (struct rpcrdma_rep **) p; 1098 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1245 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests]; 1099 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1246 1100
1247 INIT_LIST_HEAD(&buf->rb_mws); 1101 rc = ia->ri_ops->ro_init(r_xprt);
1248 INIT_LIST_HEAD(&buf->rb_all); 1102 if (rc)
1249 switch (ia->ri_memreg_strategy) { 1103 goto out;
1250 case RPCRDMA_FRMR:
1251 rc = rpcrdma_init_frmrs(ia, buf);
1252 if (rc)
1253 goto out;
1254 break;
1255 case RPCRDMA_MTHCAFMR:
1256 rc = rpcrdma_init_fmrs(ia, buf);
1257 if (rc)
1258 goto out;
1259 break;
1260 default:
1261 break;
1262 }
1263 1104
1264 for (i = 0; i < buf->rb_max_requests; i++) { 1105 for (i = 0; i < buf->rb_max_requests; i++) {
1265 struct rpcrdma_req *req; 1106 struct rpcrdma_req *req;
@@ -1311,47 +1152,6 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
1311 kfree(req); 1152 kfree(req);
1312} 1153}
1313 1154
1314static void
1315rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
1316{
1317 struct rpcrdma_mw *r;
1318 int rc;
1319
1320 while (!list_empty(&buf->rb_all)) {
1321 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1322 list_del(&r->mw_all);
1323 list_del(&r->mw_list);
1324
1325 rc = ib_dealloc_fmr(r->r.fmr);
1326 if (rc)
1327 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
1328 __func__, rc);
1329
1330 kfree(r);
1331 }
1332}
1333
1334static void
1335rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
1336{
1337 struct rpcrdma_mw *r;
1338 int rc;
1339
1340 while (!list_empty(&buf->rb_all)) {
1341 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1342 list_del(&r->mw_all);
1343 list_del(&r->mw_list);
1344
1345 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1346 if (rc)
1347 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1348 __func__, rc);
1349 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1350
1351 kfree(r);
1352 }
1353}
1354
1355void 1155void
1356rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) 1156rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1357{ 1157{
@@ -1372,104 +1172,11 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1372 rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]); 1172 rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
1373 } 1173 }
1374 1174
1375 switch (ia->ri_memreg_strategy) { 1175 ia->ri_ops->ro_destroy(buf);
1376 case RPCRDMA_FRMR:
1377 rpcrdma_destroy_frmrs(buf);
1378 break;
1379 case RPCRDMA_MTHCAFMR:
1380 rpcrdma_destroy_fmrs(buf);
1381 break;
1382 default:
1383 break;
1384 }
1385 1176
1386 kfree(buf->rb_pool); 1177 kfree(buf->rb_pool);
1387} 1178}
1388 1179
1389/* After a disconnect, unmap all FMRs.
1390 *
1391 * This is invoked only in the transport connect worker in order
1392 * to serialize with rpcrdma_register_fmr_external().
1393 */
1394static void
1395rpcrdma_reset_fmrs(struct rpcrdma_ia *ia)
1396{
1397 struct rpcrdma_xprt *r_xprt =
1398 container_of(ia, struct rpcrdma_xprt, rx_ia);
1399 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1400 struct list_head *pos;
1401 struct rpcrdma_mw *r;
1402 LIST_HEAD(l);
1403 int rc;
1404
1405 list_for_each(pos, &buf->rb_all) {
1406 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1407
1408 INIT_LIST_HEAD(&l);
1409 list_add(&r->r.fmr->list, &l);
1410 rc = ib_unmap_fmr(&l);
1411 if (rc)
1412 dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
1413 __func__, rc);
1414 }
1415}
1416
1417/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
1418 * an unusable state. Find FRMRs in this state and dereg / reg
1419 * each. FRMRs that are VALID and attached to an rpcrdma_req are
1420 * also torn down.
1421 *
1422 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
1423 *
1424 * This is invoked only in the transport connect worker in order
1425 * to serialize with rpcrdma_register_frmr_external().
1426 */
1427static void
1428rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
1429{
1430 struct rpcrdma_xprt *r_xprt =
1431 container_of(ia, struct rpcrdma_xprt, rx_ia);
1432 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1433 struct list_head *pos;
1434 struct rpcrdma_mw *r;
1435 int rc;
1436
1437 list_for_each(pos, &buf->rb_all) {
1438 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1439
1440 if (r->r.frmr.fr_state == FRMR_IS_INVALID)
1441 continue;
1442
1443 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1444 if (rc)
1445 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1446 __func__, rc);
1447 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1448
1449 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1450 ia->ri_max_frmr_depth);
1451 if (IS_ERR(r->r.frmr.fr_mr)) {
1452 rc = PTR_ERR(r->r.frmr.fr_mr);
1453 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1454 " failed %i\n", __func__, rc);
1455 continue;
1456 }
1457 r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1458 ia->ri_id->device,
1459 ia->ri_max_frmr_depth);
1460 if (IS_ERR(r->r.frmr.fr_pgl)) {
1461 rc = PTR_ERR(r->r.frmr.fr_pgl);
1462 dprintk("RPC: %s: "
1463 "ib_alloc_fast_reg_page_list "
1464 "failed %i\n", __func__, rc);
1465
1466 ib_dereg_mr(r->r.frmr.fr_mr);
1467 continue;
1468 }
1469 r->r.frmr.fr_state = FRMR_IS_INVALID;
1470 }
1471}
1472
1473/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving 1180/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
1474 * some req segments uninitialized. 1181 * some req segments uninitialized.
1475 */ 1182 */
@@ -1509,7 +1216,7 @@ rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1509 } 1216 }
1510} 1217}
1511 1218
1512/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external(). 1219/* rpcrdma_unmap_one() was already done during deregistration.
1513 * Redo only the ib_post_send(). 1220 * Redo only the ib_post_send().
1514 */ 1221 */
1515static void 1222static void
@@ -1729,6 +1436,14 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1729 * Wrappers for internal-use kmalloc memory registration, used by buffer code. 1436 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1730 */ 1437 */
1731 1438
1439void
1440rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
1441{
1442 dprintk("RPC: map_one: offset %p iova %llx len %zu\n",
1443 seg->mr_offset,
1444 (unsigned long long)seg->mr_dma, seg->mr_dmalen);
1445}
1446
1732static int 1447static int
1733rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, 1448rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1734 struct ib_mr **mrp, struct ib_sge *iov) 1449 struct ib_mr **mrp, struct ib_sge *iov)
@@ -1854,287 +1569,6 @@ rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
1854} 1569}
1855 1570
1856/* 1571/*
1857 * Wrappers for chunk registration, shared by read/write chunk code.
1858 */
1859
1860static void
1861rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1862{
1863 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1864 seg->mr_dmalen = seg->mr_len;
1865 if (seg->mr_page)
1866 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1867 seg->mr_page, offset_in_page(seg->mr_offset),
1868 seg->mr_dmalen, seg->mr_dir);
1869 else
1870 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1871 seg->mr_offset,
1872 seg->mr_dmalen, seg->mr_dir);
1873 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1874 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1875 __func__,
1876 (unsigned long long)seg->mr_dma,
1877 seg->mr_offset, seg->mr_dmalen);
1878 }
1879}
1880
1881static void
1882rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1883{
1884 if (seg->mr_page)
1885 ib_dma_unmap_page(ia->ri_id->device,
1886 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1887 else
1888 ib_dma_unmap_single(ia->ri_id->device,
1889 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1890}
1891
1892static int
1893rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1894 int *nsegs, int writing, struct rpcrdma_ia *ia,
1895 struct rpcrdma_xprt *r_xprt)
1896{
1897 struct rpcrdma_mr_seg *seg1 = seg;
1898 struct rpcrdma_mw *mw = seg1->rl_mw;
1899 struct rpcrdma_frmr *frmr = &mw->r.frmr;
1900 struct ib_mr *mr = frmr->fr_mr;
1901 struct ib_send_wr fastreg_wr, *bad_wr;
1902 u8 key;
1903 int len, pageoff;
1904 int i, rc;
1905 int seg_len;
1906 u64 pa;
1907 int page_no;
1908
1909 pageoff = offset_in_page(seg1->mr_offset);
1910 seg1->mr_offset -= pageoff; /* start of page */
1911 seg1->mr_len += pageoff;
1912 len = -pageoff;
1913 if (*nsegs > ia->ri_max_frmr_depth)
1914 *nsegs = ia->ri_max_frmr_depth;
1915 for (page_no = i = 0; i < *nsegs;) {
1916 rpcrdma_map_one(ia, seg, writing);
1917 pa = seg->mr_dma;
1918 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
1919 frmr->fr_pgl->page_list[page_no++] = pa;
1920 pa += PAGE_SIZE;
1921 }
1922 len += seg->mr_len;
1923 ++seg;
1924 ++i;
1925 /* Check for holes */
1926 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1927 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1928 break;
1929 }
1930 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1931 __func__, mw, i);
1932
1933 frmr->fr_state = FRMR_IS_VALID;
1934
1935 memset(&fastreg_wr, 0, sizeof(fastreg_wr));
1936 fastreg_wr.wr_id = (unsigned long)(void *)mw;
1937 fastreg_wr.opcode = IB_WR_FAST_REG_MR;
1938 fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1939 fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
1940 fastreg_wr.wr.fast_reg.page_list_len = page_no;
1941 fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1942 fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
1943 if (fastreg_wr.wr.fast_reg.length < len) {
1944 rc = -EIO;
1945 goto out_err;
1946 }
1947
1948 /* Bump the key */
1949 key = (u8)(mr->rkey & 0x000000FF);
1950 ib_update_fast_reg_key(mr, ++key);
1951
1952 fastreg_wr.wr.fast_reg.access_flags = (writing ?
1953 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1954 IB_ACCESS_REMOTE_READ);
1955 fastreg_wr.wr.fast_reg.rkey = mr->rkey;
1956 DECR_CQCOUNT(&r_xprt->rx_ep);
1957
1958 rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
1959 if (rc) {
1960 dprintk("RPC: %s: failed ib_post_send for register,"
1961 " status %i\n", __func__, rc);
1962 ib_update_fast_reg_key(mr, --key);
1963 goto out_err;
1964 } else {
1965 seg1->mr_rkey = mr->rkey;
1966 seg1->mr_base = seg1->mr_dma + pageoff;
1967 seg1->mr_nsegs = i;
1968 seg1->mr_len = len;
1969 }
1970 *nsegs = i;
1971 return 0;
1972out_err:
1973 frmr->fr_state = FRMR_IS_INVALID;
1974 while (i--)
1975 rpcrdma_unmap_one(ia, --seg);
1976 return rc;
1977}
1978
1979static int
1980rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1981 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1982{
1983 struct rpcrdma_mr_seg *seg1 = seg;
1984 struct ib_send_wr invalidate_wr, *bad_wr;
1985 int rc;
1986
1987 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
1988
1989 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1990 invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
1991 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1992 invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
1993 DECR_CQCOUNT(&r_xprt->rx_ep);
1994
1995 read_lock(&ia->ri_qplock);
1996 while (seg1->mr_nsegs--)
1997 rpcrdma_unmap_one(ia, seg++);
1998 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1999 read_unlock(&ia->ri_qplock);
2000 if (rc) {
2001 /* Force rpcrdma_buffer_get() to retry */
2002 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
2003 dprintk("RPC: %s: failed ib_post_send for invalidate,"
2004 " status %i\n", __func__, rc);
2005 }
2006 return rc;
2007}
2008
2009static int
2010rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
2011 int *nsegs, int writing, struct rpcrdma_ia *ia)
2012{
2013 struct rpcrdma_mr_seg *seg1 = seg;
2014 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
2015 int len, pageoff, i, rc;
2016
2017 pageoff = offset_in_page(seg1->mr_offset);
2018 seg1->mr_offset -= pageoff; /* start of page */
2019 seg1->mr_len += pageoff;
2020 len = -pageoff;
2021 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
2022 *nsegs = RPCRDMA_MAX_DATA_SEGS;
2023 for (i = 0; i < *nsegs;) {
2024 rpcrdma_map_one(ia, seg, writing);
2025 physaddrs[i] = seg->mr_dma;
2026 len += seg->mr_len;
2027 ++seg;
2028 ++i;
2029 /* Check for holes */
2030 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
2031 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
2032 break;
2033 }
2034 rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma);
2035 if (rc) {
2036 dprintk("RPC: %s: failed ib_map_phys_fmr "
2037 "%u@0x%llx+%i (%d)... status %i\n", __func__,
2038 len, (unsigned long long)seg1->mr_dma,
2039 pageoff, i, rc);
2040 while (i--)
2041 rpcrdma_unmap_one(ia, --seg);
2042 } else {
2043 seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey;
2044 seg1->mr_base = seg1->mr_dma + pageoff;
2045 seg1->mr_nsegs = i;
2046 seg1->mr_len = len;
2047 }
2048 *nsegs = i;
2049 return rc;
2050}
2051
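Both removed registration helpers share the same coalescing rule, which the new ro_map methods keep: segments are gathered into one registration until a "hole" appears, i.e. the next segment does not start on a page boundary or the current one does not end on one. A small user-space model of just that check (the DMA mapping itself is omitted; the data is invented):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    struct seg { unsigned long offset; unsigned long len; };

    static int coalesce(const struct seg *seg, int nsegs)
    {
            int i;

            for (i = 0; i < nsegs;) {
                    ++i;
                    /* hole: next segment not page-aligned, or this one
                     * does not end on a page boundary */
                    if ((i < nsegs && (seg[i].offset & (PAGE_SIZE - 1))) ||
                        ((seg[i - 1].offset + seg[i - 1].len) & (PAGE_SIZE - 1)))
                            break;
            }
            return i;
    }

    int main(void)
    {
            struct seg segs[] = {
                    { 0, PAGE_SIZE }, { 0, PAGE_SIZE }, { 0, 100 }, { 0, PAGE_SIZE },
            };

            printf("mapped %d of 4 segments\n", coalesce(segs, 4));  /* 3 */
            return 0;
    }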
2052static int
2053rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
2054 struct rpcrdma_ia *ia)
2055{
2056 struct rpcrdma_mr_seg *seg1 = seg;
2057 LIST_HEAD(l);
2058 int rc;
2059
2060 list_add(&seg1->rl_mw->r.fmr->list, &l);
2061 rc = ib_unmap_fmr(&l);
2062 read_lock(&ia->ri_qplock);
2063 while (seg1->mr_nsegs--)
2064 rpcrdma_unmap_one(ia, seg++);
2065 read_unlock(&ia->ri_qplock);
2066 if (rc)
2067 dprintk("RPC: %s: failed ib_unmap_fmr,"
2068 " status %i\n", __func__, rc);
2069 return rc;
2070}
2071
2072int
2073rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
2074 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
2075{
2076 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
2077 int rc = 0;
2078
2079 switch (ia->ri_memreg_strategy) {
2080
2081 case RPCRDMA_ALLPHYSICAL:
2082 rpcrdma_map_one(ia, seg, writing);
2083 seg->mr_rkey = ia->ri_bind_mem->rkey;
2084 seg->mr_base = seg->mr_dma;
2085 seg->mr_nsegs = 1;
2086 nsegs = 1;
2087 break;
2088
2089 /* Registration using frmr registration */
2090 case RPCRDMA_FRMR:
2091 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
2092 break;
2093
2094 /* Registration using fmr memory registration */
2095 case RPCRDMA_MTHCAFMR:
2096 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
2097 break;
2098
2099 default:
2100 return -EIO;
2101 }
2102 if (rc)
2103 return rc;
2104
2105 return nsegs;
2106}
2107
2108int
2109rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
2110 struct rpcrdma_xprt *r_xprt)
2111{
2112 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
2113 int nsegs = seg->mr_nsegs, rc;
2114
2115 switch (ia->ri_memreg_strategy) {
2116
2117 case RPCRDMA_ALLPHYSICAL:
2118 read_lock(&ia->ri_qplock);
2119 rpcrdma_unmap_one(ia, seg);
2120 read_unlock(&ia->ri_qplock);
2121 break;
2122
2123 case RPCRDMA_FRMR:
2124 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
2125 break;
2126
2127 case RPCRDMA_MTHCAFMR:
2128 rc = rpcrdma_deregister_fmr_external(seg, ia);
2129 break;
2130
2131 default:
2132 break;
2133 }
2134 return nsegs;
2135}
2136
2137/*
2138 * Prepost any receive buffer, then post send. 1572 * Prepost any receive buffer, then post send.
2139 * 1573 *
2140 * Receive buffer is donated to hardware, reclaimed upon recv completion. 1574 * Receive buffer is donated to hardware, reclaimed upon recv completion.
@@ -2156,7 +1590,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
2156 } 1590 }
2157 1591
2158 send_wr.next = NULL; 1592 send_wr.next = NULL;
2159 send_wr.wr_id = 0ULL; /* no send cookie */ 1593 send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
2160 send_wr.sg_list = req->rl_send_iov; 1594 send_wr.sg_list = req->rl_send_iov;
2161 send_wr.num_sge = req->rl_niovs; 1595 send_wr.num_sge = req->rl_niovs;
2162 send_wr.opcode = IB_WR_SEND; 1596 send_wr.opcode = IB_WR_SEND;
@@ -2215,43 +1649,24 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
2215 return rc; 1649 return rc;
2216} 1650}
2217 1651
2218/* Physical mapping means one Read/Write list entry per-page. 1652/* How many chunk list items fit within our inline buffers?
2219 * All list entries must fit within an inline buffer
2220 *
2221 * NB: The server must return a Write list for NFS READ,
2222 * which has the same constraint. Factor in the inline
2223 * rsize as well.
2224 */ 1653 */
2225static size_t 1654unsigned int
2226rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt) 1655rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
2227{ 1656{
2228 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; 1657 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
2229 unsigned int inline_size, pages; 1658 int bytes, segments;
2230 1659
2231 inline_size = min_t(unsigned int, 1660 bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
2232 cdata->inline_wsize, cdata->inline_rsize); 1661 bytes -= RPCRDMA_HDRLEN_MIN;
2233 inline_size -= RPCRDMA_HDRLEN_MIN; 1662 if (bytes < sizeof(struct rpcrdma_segment) * 2) {
2234 pages = inline_size / sizeof(struct rpcrdma_segment); 1663 pr_warn("RPC: %s: inline threshold too small\n",
2235 return pages << PAGE_SHIFT; 1664 __func__);
2236} 1665 return 0;
2237
2238static size_t
2239rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
2240{
2241 return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
2242}
2243
2244size_t
2245rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
2246{
2247 size_t result;
2248
2249 switch (r_xprt->rx_ia.ri_memreg_strategy) {
2250 case RPCRDMA_ALLPHYSICAL:
2251 result = rpcrdma_physical_max_payload(r_xprt);
2252 break;
2253 default:
2254 result = rpcrdma_mr_max_payload(r_xprt);
2255 } 1666 }
2256 return result; 1667
1668 segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
1669 dprintk("RPC: %s: max chunk list size = %d segments\n",
1670 __func__, segments);
1671 return segments;
2257} 1672}
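rpcrdma_max_segments() replaces the per-strategy max_payload calculations with a single question: how many chunk-list segments fit in the smaller inline buffer once the minimum RPC-over-RDMA header is subtracted, rounded down to a power of two. A worked example; the 1024-byte threshold, 28-byte header and 16-byte segment size below are assumptions for illustration only:

    #include <stdio.h>

    static int fls_(unsigned int v)     /* find last set bit, 1-based */
    {
            int r = 0;

            while (v) { r++; v >>= 1; }
            return r;
    }

    int main(void)
    {
            unsigned int inline_size = 1024; /* min(inline_wsize, inline_rsize) */
            unsigned int hdrlen_min  = 28;   /* assumed RPCRDMA_HDRLEN_MIN */
            unsigned int seg_size    = 16;   /* assumed sizeof(rpcrdma_segment) */
            unsigned int bytes = inline_size - hdrlen_min;
            unsigned int segments;

            if (bytes < seg_size * 2)
                    return 1;       /* inline threshold too small */
            segments = 1u << (fls_(bytes / seg_size) - 1);
            printf("max chunk list size = %u segments\n", segments); /* 32 */
            return 0;
    }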
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 0a16fb6f0885..78e0b8beaa36 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -60,6 +60,7 @@
60 * Interface Adapter -- one per transport instance 60 * Interface Adapter -- one per transport instance
61 */ 61 */
62struct rpcrdma_ia { 62struct rpcrdma_ia {
63 const struct rpcrdma_memreg_ops *ri_ops;
63 rwlock_t ri_qplock; 64 rwlock_t ri_qplock;
64 struct rdma_cm_id *ri_id; 65 struct rdma_cm_id *ri_id;
65 struct ib_pd *ri_pd; 66 struct ib_pd *ri_pd;
@@ -105,6 +106,10 @@ struct rpcrdma_ep {
105#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) 106#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
106#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) 107#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
107 108
109/* Force completion handler to ignore the signal
110 */
111#define RPCRDMA_IGNORE_COMPLETION (0ULL)
112
108/* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV 113/* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV
109 * 114 *
110 * The below structure appears at the front of a large region of kmalloc'd 115 * The below structure appears at the front of a large region of kmalloc'd
@@ -143,14 +148,6 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
143 return (struct rpcrdma_msg *)rb->rg_base; 148 return (struct rpcrdma_msg *)rb->rg_base;
144} 149}
145 150
146enum rpcrdma_chunktype {
147 rpcrdma_noch = 0,
148 rpcrdma_readch,
149 rpcrdma_areadch,
150 rpcrdma_writech,
151 rpcrdma_replych
152};
153
154/* 151/*
155 * struct rpcrdma_rep -- this structure encapsulates state required to recv 152 * struct rpcrdma_rep -- this structure encapsulates state required to recv
156 * and complete a reply, asynchronously. It needs several pieces of 153

@@ -213,6 +210,7 @@ struct rpcrdma_mw {
213 struct ib_fmr *fmr; 210 struct ib_fmr *fmr;
214 struct rpcrdma_frmr frmr; 211 struct rpcrdma_frmr frmr;
215 } r; 212 } r;
213 void (*mw_sendcompletion)(struct ib_wc *);
216 struct list_head mw_list; 214 struct list_head mw_list;
217 struct list_head mw_all; 215 struct list_head mw_all;
218}; 216};
@@ -258,7 +256,6 @@ struct rpcrdma_req {
258 unsigned int rl_niovs; /* 0, 2 or 4 */ 256 unsigned int rl_niovs; /* 0, 2 or 4 */
259 unsigned int rl_nchunks; /* non-zero if chunks */ 257 unsigned int rl_nchunks; /* non-zero if chunks */
260 unsigned int rl_connect_cookie; /* retry detection */ 258 unsigned int rl_connect_cookie; /* retry detection */
261 enum rpcrdma_chunktype rl_rtype, rl_wtype;
262 struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ 259 struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
263 struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ 260 struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
264 struct ib_sge rl_send_iov[4]; /* for active requests */ 261 struct ib_sge rl_send_iov[4]; /* for active requests */
@@ -340,6 +337,29 @@ struct rpcrdma_stats {
340}; 337};
341 338
342/* 339/*
340 * Per-registration mode operations
341 */
342struct rpcrdma_xprt;
343struct rpcrdma_memreg_ops {
344 int (*ro_map)(struct rpcrdma_xprt *,
345 struct rpcrdma_mr_seg *, int, bool);
346 int (*ro_unmap)(struct rpcrdma_xprt *,
347 struct rpcrdma_mr_seg *);
348 int (*ro_open)(struct rpcrdma_ia *,
349 struct rpcrdma_ep *,
350 struct rpcrdma_create_data_internal *);
351 size_t (*ro_maxpages)(struct rpcrdma_xprt *);
352 int (*ro_init)(struct rpcrdma_xprt *);
353 void (*ro_reset)(struct rpcrdma_xprt *);
354 void (*ro_destroy)(struct rpcrdma_buffer *);
355 const char *ro_displayname;
356};
357
358extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
359extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops;
360extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops;
361
362/*
343 * RPCRDMA transport -- encapsulates the structures above for 363 * RPCRDMA transport -- encapsulates the structures above for
344 * integration with RPC. 364 * integration with RPC.
345 * 365 *
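struct rpcrdma_memreg_ops is the new per-registration-mode vtable: verbs.c selects one of the three exported tables at setup time and calls through ia->ri_ops instead of switching on ri_memreg_strategy at every call site. A compact user-space sketch of that pattern (the op bodies are placeholders, not the kernel implementations):

    #include <stdio.h>

    struct memreg_ops {
            int (*ro_init)(void);
            const char *ro_displayname;
    };

    static int frwr_init(void) { return 0; }
    static int fmr_init(void)  { return 0; }
    static int phys_init(void) { return 0; }

    static const struct memreg_ops frwr_ops = { frwr_init, "frwr" };
    static const struct memreg_ops fmr_ops  = { fmr_init,  "fmr"  };
    static const struct memreg_ops phys_ops = { phys_init, "phys" };

    enum strategy { FRMR, MTHCAFMR, ALLPHYSICAL };

    static const struct memreg_ops *select_ops(enum strategy s)
    {
            switch (s) {
            case FRMR:        return &frwr_ops;
            case ALLPHYSICAL: return &phys_ops;
            case MTHCAFMR:    return &fmr_ops;
            }
            return NULL;
    }

    int main(void)
    {
            const struct memreg_ops *ops = select_ops(FRMR);

            printf("memory registration strategy is '%s'\n",
                   ops->ro_displayname);
            return ops->ro_init();
    }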
@@ -398,16 +418,56 @@ void rpcrdma_buffer_put(struct rpcrdma_req *);
398void rpcrdma_recv_buffer_get(struct rpcrdma_req *); 418void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
399void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); 419void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
400 420
401int rpcrdma_register_external(struct rpcrdma_mr_seg *,
402 int, int, struct rpcrdma_xprt *);
403int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
404 struct rpcrdma_xprt *);
405
406struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, 421struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
407 size_t, gfp_t); 422 size_t, gfp_t);
408void rpcrdma_free_regbuf(struct rpcrdma_ia *, 423void rpcrdma_free_regbuf(struct rpcrdma_ia *,
409 struct rpcrdma_regbuf *); 424 struct rpcrdma_regbuf *);
410 425
426unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
427
428/*
429 * Wrappers for chunk registration, shared by read/write chunk code.
430 */
431
432void rpcrdma_mapping_error(struct rpcrdma_mr_seg *);
433
434static inline enum dma_data_direction
435rpcrdma_data_dir(bool writing)
436{
437 return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
438}
439
440static inline void
441rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg,
442 enum dma_data_direction direction)
443{
444 seg->mr_dir = direction;
445 seg->mr_dmalen = seg->mr_len;
446
447 if (seg->mr_page)
448 seg->mr_dma = ib_dma_map_page(device,
449 seg->mr_page, offset_in_page(seg->mr_offset),
450 seg->mr_dmalen, seg->mr_dir);
451 else
452 seg->mr_dma = ib_dma_map_single(device,
453 seg->mr_offset,
454 seg->mr_dmalen, seg->mr_dir);
455
456 if (ib_dma_mapping_error(device, seg->mr_dma))
457 rpcrdma_mapping_error(seg);
458}
459
460static inline void
461rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg)
462{
463 if (seg->mr_page)
464 ib_dma_unmap_page(device,
465 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
466 else
467 ib_dma_unmap_single(device,
468 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
469}
470
411/* 471/*
412 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c 472 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
413 */ 473 */
@@ -418,9 +478,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
418/* 478/*
419 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c 479 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
420 */ 480 */
421ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t);
422int rpcrdma_marshal_req(struct rpc_rqst *); 481int rpcrdma_marshal_req(struct rpc_rqst *);
423size_t rpcrdma_max_payload(struct rpcrdma_xprt *);
424 482
425/* Temporary NFS request map cache. Created in svc_rdma.c */ 483/* Temporary NFS request map cache. Created in svc_rdma.c */
426extern struct kmem_cache *svc_rdma_map_cachep; 484extern struct kmem_cache *svc_rdma_map_cachep;