aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2008-10-16 18:39:20 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2008-10-16 18:39:20 -0400
commit 52ad096465d60ee7fdc99583f969a99a1166c7c3 (patch)
tree 0cf33926ce8b5eb4f6675d3b1332b35e914f4846
parent 8cde1ad6683f8738ad71f788dca8ab4810cf5afe (diff)
parent 6925bac120097b823fc990c143b9789c21cc60b5 (diff)
Merge git://git.linux-nfs.org/projects/trondmy/nfs-2.6
* git://git.linux-nfs.org/projects/trondmy/nfs-2.6: (53 commits)
  NFS: Fix a resolution problem with nfs_inode->cache_change_attribute
  NFS: Fix the resolution problem with nfs_inode_attrs_need_update()
  NFS: Changes to inode->i_nlinks must set the NFS_INO_INVALID_ATTR flag
  RPC/RDMA: ensure connection attempt is complete before signalling.
  RPC/RDMA: correct the reconnect timer backoff
  RPC/RDMA: optionally emit useful transport info upon connect/disconnect.
  RPC/RDMA: reformat a debug printk to keep lines together.
  RPC/RDMA: harden connection logic against missing/late rdma_cm upcalls.
  RPC/RDMA: fix connect/reconnect resource leak.
  RPC/RDMA: return a consistent error, when connect fails.
  RPC/RDMA: adhere to protocol for unpadded client trailing write chunks.
  RPC/RDMA: avoid an oops due to disconnect racing with async upcalls.
  RPC/RDMA: maintain the RPC task bytes-sent statistic.
  RPC/RDMA: suppress retransmit on RPC/RDMA clients.
  RPC/RDMA: fix connection IRD/ORD setting
  RPC/RDMA: support FRMR client memory registration.
  RPC/RDMA: check selected memory registration mode at runtime.
  RPC/RDMA: add data types and new FRMR memory registration enum.
  RPC/RDMA: refactor the inline memory registration code.
  NFS: fix nfs_parse_ip_address() corner case
  ...
-rw-r--r-- fs/nfs/client.c 5
-rw-r--r-- fs/nfs/dir.c 20
-rw-r--r-- fs/nfs/file.c 18
-rw-r--r-- fs/nfs/inode.c 183
-rw-r--r-- fs/nfs/internal.h 25
-rw-r--r-- fs/nfs/mount_clnt.c 3
-rw-r--r-- fs/nfs/namespace.c 7
-rw-r--r-- fs/nfs/nfs3acl.c 2
-rw-r--r-- fs/nfs/nfs3proc.c 20
-rw-r--r-- fs/nfs/nfs4namespace.c 105
-rw-r--r-- fs/nfs/proc.c 10
-rw-r--r-- fs/nfs/super.c 126
-rw-r--r-- fs/nfs/unlink.c 5
-rw-r--r-- fs/nfs/write.c 3
-rw-r--r-- include/linux/nfs_fs.h 19
-rw-r--r-- include/linux/nfs_fs_sb.h 1
-rw-r--r-- include/linux/nfs_mount.h 4
-rw-r--r-- include/linux/nfs_xdr.h 11
-rw-r--r-- include/linux/sunrpc/xprtrdma.h 4
-rw-r--r-- net/sunrpc/clnt.c 4
-rw-r--r-- net/sunrpc/rpcb_clnt.c 40
-rw-r--r-- net/sunrpc/xprt.c 12
-rw-r--r-- net/sunrpc/xprtrdma/rpc_rdma.c 29
-rw-r--r-- net/sunrpc/xprtrdma/transport.c 41
-rw-r--r-- net/sunrpc/xprtrdma/verbs.c 741
-rw-r--r-- net/sunrpc/xprtrdma/xprt_rdma.h 17
26 files changed, 955 insertions, 500 deletions
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 5ee23e7058b3..7547600b6174 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -675,7 +675,7 @@ static int nfs_init_server(struct nfs_server *server,
675 server->nfs_client = clp; 675 server->nfs_client = clp;
676 676
677 /* Initialise the client representation from the mount data */ 677 /* Initialise the client representation from the mount data */
678 server->flags = data->flags & NFS_MOUNT_FLAGMASK; 678 server->flags = data->flags;
679 679
680 if (data->rsize) 680 if (data->rsize)
681 server->rsize = nfs_block_size(data->rsize, NULL); 681 server->rsize = nfs_block_size(data->rsize, NULL);
@@ -850,7 +850,6 @@ static struct nfs_server *nfs_alloc_server(void)
850 INIT_LIST_HEAD(&server->client_link); 850 INIT_LIST_HEAD(&server->client_link);
851 INIT_LIST_HEAD(&server->master_link); 851 INIT_LIST_HEAD(&server->master_link);
852 852
853 init_waitqueue_head(&server->active_wq);
854 atomic_set(&server->active, 0); 853 atomic_set(&server->active, 0);
855 854
856 server->io_stats = nfs_alloc_iostats(); 855 server->io_stats = nfs_alloc_iostats();
@@ -1073,7 +1072,7 @@ static int nfs4_init_server(struct nfs_server *server,
1073 goto error; 1072 goto error;
1074 1073
1075 /* Initialise the client representation from the mount data */ 1074 /* Initialise the client representation from the mount data */
1076 server->flags = data->flags & NFS_MOUNT_FLAGMASK; 1075 server->flags = data->flags;
1077 server->caps |= NFS_CAP_ATOMIC_OPEN; 1076 server->caps |= NFS_CAP_ATOMIC_OPEN;
1078 1077
1079 if (data->rsize) 1078 if (data->rsize)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 74f92b717f78..2ab70d46ecbc 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -156,6 +156,7 @@ typedef struct {
156 decode_dirent_t decode; 156 decode_dirent_t decode;
157 int plus; 157 int plus;
158 unsigned long timestamp; 158 unsigned long timestamp;
159 unsigned long gencount;
159 int timestamp_valid; 160 int timestamp_valid;
160} nfs_readdir_descriptor_t; 161} nfs_readdir_descriptor_t;
161 162
@@ -177,7 +178,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
177 struct file *file = desc->file; 178 struct file *file = desc->file;
178 struct inode *inode = file->f_path.dentry->d_inode; 179 struct inode *inode = file->f_path.dentry->d_inode;
179 struct rpc_cred *cred = nfs_file_cred(file); 180 struct rpc_cred *cred = nfs_file_cred(file);
180 unsigned long timestamp; 181 unsigned long timestamp, gencount;
181 int error; 182 int error;
182 183
183 dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n", 184 dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n",
@@ -186,6 +187,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
186 187
187 again: 188 again:
188 timestamp = jiffies; 189 timestamp = jiffies;
190 gencount = nfs_inc_attr_generation_counter();
189 error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page, 191 error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page,
190 NFS_SERVER(inode)->dtsize, desc->plus); 192 NFS_SERVER(inode)->dtsize, desc->plus);
191 if (error < 0) { 193 if (error < 0) {
@@ -199,6 +201,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
199 goto error; 201 goto error;
200 } 202 }
201 desc->timestamp = timestamp; 203 desc->timestamp = timestamp;
204 desc->gencount = gencount;
202 desc->timestamp_valid = 1; 205 desc->timestamp_valid = 1;
203 SetPageUptodate(page); 206 SetPageUptodate(page);
204 /* Ensure consistent page alignment of the data. 207 /* Ensure consistent page alignment of the data.
@@ -224,9 +227,10 @@ int dir_decode(nfs_readdir_descriptor_t *desc)
224 if (IS_ERR(p)) 227 if (IS_ERR(p))
225 return PTR_ERR(p); 228 return PTR_ERR(p);
226 desc->ptr = p; 229 desc->ptr = p;
227 if (desc->timestamp_valid) 230 if (desc->timestamp_valid) {
228 desc->entry->fattr->time_start = desc->timestamp; 231 desc->entry->fattr->time_start = desc->timestamp;
229 else 232 desc->entry->fattr->gencount = desc->gencount;
233 } else
230 desc->entry->fattr->valid &= ~NFS_ATTR_FATTR; 234 desc->entry->fattr->valid &= ~NFS_ATTR_FATTR;
231 return 0; 235 return 0;
232} 236}
@@ -471,7 +475,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
471 struct rpc_cred *cred = nfs_file_cred(file); 475 struct rpc_cred *cred = nfs_file_cred(file);
472 struct page *page = NULL; 476 struct page *page = NULL;
473 int status; 477 int status;
474 unsigned long timestamp; 478 unsigned long timestamp, gencount;
475 479
476 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", 480 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
477 (unsigned long long)*desc->dir_cookie); 481 (unsigned long long)*desc->dir_cookie);
@@ -482,6 +486,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
482 goto out; 486 goto out;
483 } 487 }
484 timestamp = jiffies; 488 timestamp = jiffies;
489 gencount = nfs_inc_attr_generation_counter();
485 status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, 490 status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred,
486 *desc->dir_cookie, page, 491 *desc->dir_cookie, page,
487 NFS_SERVER(inode)->dtsize, 492 NFS_SERVER(inode)->dtsize,
@@ -490,6 +495,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
490 desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ 495 desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */
491 if (status >= 0) { 496 if (status >= 0) {
492 desc->timestamp = timestamp; 497 desc->timestamp = timestamp;
498 desc->gencount = gencount;
493 desc->timestamp_valid = 1; 499 desc->timestamp_valid = 1;
494 if ((status = dir_decode(desc)) == 0) 500 if ((status = dir_decode(desc)) == 0)
495 desc->entry->prev_cookie = *desc->dir_cookie; 501 desc->entry->prev_cookie = *desc->dir_cookie;
@@ -655,7 +661,7 @@ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
655 */ 661 */
656void nfs_force_lookup_revalidate(struct inode *dir) 662void nfs_force_lookup_revalidate(struct inode *dir)
657{ 663{
658 NFS_I(dir)->cache_change_attribute = jiffies; 664 NFS_I(dir)->cache_change_attribute++;
659} 665}
660 666
661/* 667/*
@@ -667,6 +673,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
667{ 673{
668 if (IS_ROOT(dentry)) 674 if (IS_ROOT(dentry))
669 return 1; 675 return 1;
676 if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
677 return 0;
670 if (!nfs_verify_change_attribute(dir, dentry->d_time)) 678 if (!nfs_verify_change_attribute(dir, dentry->d_time))
671 return 0; 679 return 0;
672 /* Revalidate nfsi->cache_change_attribute before we declare a match */ 680 /* Revalidate nfsi->cache_change_attribute before we declare a match */
@@ -750,6 +758,8 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
750 /* Don't revalidate a negative dentry if we're creating a new file */ 758 /* Don't revalidate a negative dentry if we're creating a new file */
751 if (nd != NULL && nfs_lookup_check_intent(nd, LOOKUP_CREATE) != 0) 759 if (nd != NULL && nfs_lookup_check_intent(nd, LOOKUP_CREATE) != 0)
752 return 0; 760 return 0;
761 if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)
762 return 1;
753 return !nfs_check_verifier(dir, dentry); 763 return !nfs_check_verifier(dir, dentry);
754} 764}
755 765
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 78460657f5cb..d319b49f8f06 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -188,13 +188,16 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
188 /* origin == SEEK_END => we must revalidate the cached file length */ 188 /* origin == SEEK_END => we must revalidate the cached file length */
189 if (origin == SEEK_END) { 189 if (origin == SEEK_END) {
190 struct inode *inode = filp->f_mapping->host; 190 struct inode *inode = filp->f_mapping->host;
191
191 int retval = nfs_revalidate_file_size(inode, filp); 192 int retval = nfs_revalidate_file_size(inode, filp);
192 if (retval < 0) 193 if (retval < 0)
193 return (loff_t)retval; 194 return (loff_t)retval;
194 } 195
195 lock_kernel(); /* BKL needed? */ 196 spin_lock(&inode->i_lock);
196 loff = generic_file_llseek_unlocked(filp, offset, origin); 197 loff = generic_file_llseek_unlocked(filp, offset, origin);
197 unlock_kernel(); 198 spin_unlock(&inode->i_lock);
199 } else
200 loff = generic_file_llseek_unlocked(filp, offset, origin);
198 return loff; 201 return loff;
199} 202}
200 203
@@ -699,13 +702,6 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
699 filp->f_path.dentry->d_name.name, 702 filp->f_path.dentry->d_name.name,
700 fl->fl_type, fl->fl_flags); 703 fl->fl_type, fl->fl_flags);
701 704
702 /*
703 * No BSD flocks over NFS allowed.
704 * Note: we could try to fake a POSIX lock request here by
705 * using ((u32) filp | 0x80000000) or some such as the pid.
706 * Not sure whether that would be unique, though, or whether
707 * that would break in other places.
708 */
709 if (!(fl->fl_flags & FL_FLOCK)) 705 if (!(fl->fl_flags & FL_FLOCK))
710 return -ENOLCK; 706 return -ENOLCK;
711 707
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 52daefa2f521..b9195c02a863 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -305,8 +305,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
305 init_special_inode(inode, inode->i_mode, fattr->rdev); 305 init_special_inode(inode, inode->i_mode, fattr->rdev);
306 306
307 nfsi->read_cache_jiffies = fattr->time_start; 307 nfsi->read_cache_jiffies = fattr->time_start;
308 nfsi->last_updated = now; 308 nfsi->attr_gencount = fattr->gencount;
309 nfsi->cache_change_attribute = now;
310 inode->i_atime = fattr->atime; 309 inode->i_atime = fattr->atime;
311 inode->i_mtime = fattr->mtime; 310 inode->i_mtime = fattr->mtime;
312 inode->i_ctime = fattr->ctime; 311 inode->i_ctime = fattr->ctime;
@@ -453,6 +452,7 @@ out_big:
453void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) 452void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
454{ 453{
455 if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { 454 if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) {
455 spin_lock(&inode->i_lock);
456 if ((attr->ia_valid & ATTR_MODE) != 0) { 456 if ((attr->ia_valid & ATTR_MODE) != 0) {
457 int mode = attr->ia_mode & S_IALLUGO; 457 int mode = attr->ia_mode & S_IALLUGO;
458 mode |= inode->i_mode & ~S_IALLUGO; 458 mode |= inode->i_mode & ~S_IALLUGO;
@@ -462,7 +462,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
462 inode->i_uid = attr->ia_uid; 462 inode->i_uid = attr->ia_uid;
463 if ((attr->ia_valid & ATTR_GID) != 0) 463 if ((attr->ia_valid & ATTR_GID) != 0)
464 inode->i_gid = attr->ia_gid; 464 inode->i_gid = attr->ia_gid;
465 spin_lock(&inode->i_lock);
466 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 465 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
467 spin_unlock(&inode->i_lock); 466 spin_unlock(&inode->i_lock);
468 } 467 }
@@ -472,37 +471,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
472 } 471 }
473} 472}
474 473
475static int nfs_wait_schedule(void *word)
476{
477 if (signal_pending(current))
478 return -ERESTARTSYS;
479 schedule();
480 return 0;
481}
482
483/*
484 * Wait for the inode to get unlocked.
485 */
486static int nfs_wait_on_inode(struct inode *inode)
487{
488 struct nfs_inode *nfsi = NFS_I(inode);
489 int error;
490
491 error = wait_on_bit_lock(&nfsi->flags, NFS_INO_REVALIDATING,
492 nfs_wait_schedule, TASK_KILLABLE);
493
494 return error;
495}
496
497static void nfs_wake_up_inode(struct inode *inode)
498{
499 struct nfs_inode *nfsi = NFS_I(inode);
500
501 clear_bit(NFS_INO_REVALIDATING, &nfsi->flags);
502 smp_mb__after_clear_bit();
503 wake_up_bit(&nfsi->flags, NFS_INO_REVALIDATING);
504}
505
506int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) 474int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
507{ 475{
508 struct inode *inode = dentry->d_inode; 476 struct inode *inode = dentry->d_inode;
@@ -697,20 +665,15 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
697 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", 665 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
698 inode->i_sb->s_id, (long long)NFS_FILEID(inode)); 666 inode->i_sb->s_id, (long long)NFS_FILEID(inode));
699 667
700 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
701 if (is_bad_inode(inode)) 668 if (is_bad_inode(inode))
702 goto out_nowait; 669 goto out;
703 if (NFS_STALE(inode)) 670 if (NFS_STALE(inode))
704 goto out_nowait;
705
706 status = nfs_wait_on_inode(inode);
707 if (status < 0)
708 goto out; 671 goto out;
709 672
710 status = -ESTALE;
711 if (NFS_STALE(inode)) 673 if (NFS_STALE(inode))
712 goto out; 674 goto out;
713 675
676 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
714 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); 677 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
715 if (status != 0) { 678 if (status != 0) {
716 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", 679 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
@@ -724,16 +687,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
724 goto out; 687 goto out;
725 } 688 }
726 689
727 spin_lock(&inode->i_lock); 690 status = nfs_refresh_inode(inode, &fattr);
728 status = nfs_update_inode(inode, &fattr);
729 if (status) { 691 if (status) {
730 spin_unlock(&inode->i_lock);
731 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", 692 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
732 inode->i_sb->s_id, 693 inode->i_sb->s_id,
733 (long long)NFS_FILEID(inode), status); 694 (long long)NFS_FILEID(inode), status);
734 goto out; 695 goto out;
735 } 696 }
736 spin_unlock(&inode->i_lock);
737 697
738 if (nfsi->cache_validity & NFS_INO_INVALID_ACL) 698 if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
739 nfs_zap_acl_cache(inode); 699 nfs_zap_acl_cache(inode);
@@ -743,9 +703,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
743 (long long)NFS_FILEID(inode)); 703 (long long)NFS_FILEID(inode));
744 704
745 out: 705 out:
746 nfs_wake_up_inode(inode);
747
748 out_nowait:
749 return status; 706 return status;
750} 707}
751 708
@@ -908,9 +865,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
908 return -EIO; 865 return -EIO;
909 } 866 }
910 867
911 /* Do atomic weak cache consistency updates */
912 nfs_wcc_update_inode(inode, fattr);
913
914 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && 868 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
915 nfsi->change_attr != fattr->change_attr) 869 nfsi->change_attr != fattr->change_attr)
916 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 870 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
@@ -939,15 +893,81 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
939 893
940 if (invalid != 0) 894 if (invalid != 0)
941 nfsi->cache_validity |= invalid; 895 nfsi->cache_validity |= invalid;
942 else
943 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
944 | NFS_INO_INVALID_ATIME
945 | NFS_INO_REVAL_PAGECACHE);
946 896
947 nfsi->read_cache_jiffies = fattr->time_start; 897 nfsi->read_cache_jiffies = fattr->time_start;
948 return 0; 898 return 0;
949} 899}
950 900
901static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
902{
903 return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
904}
905
906static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
907{
908 return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
909}
910
911static unsigned long nfs_attr_generation_counter;
912
913static unsigned long nfs_read_attr_generation_counter(void)
914{
915 smp_rmb();
916 return nfs_attr_generation_counter;
917}
918
919unsigned long nfs_inc_attr_generation_counter(void)
920{
921 unsigned long ret;
922 smp_rmb();
923 ret = ++nfs_attr_generation_counter;
924 smp_wmb();
925 return ret;
926}
927
928void nfs_fattr_init(struct nfs_fattr *fattr)
929{
930 fattr->valid = 0;
931 fattr->time_start = jiffies;
932 fattr->gencount = nfs_inc_attr_generation_counter();
933}
934
935/**
936 * nfs_inode_attrs_need_update - check if the inode attributes need updating
937 * @inode - pointer to inode
938 * @fattr - attributes
939 *
940 * Attempt to divine whether or not an RPC call reply carrying stale
941 * attributes got scheduled after another call carrying updated ones.
942 *
943 * To do so, the function first assumes that a more recent ctime means
944 * that the attributes in fattr are newer, however it also attempts to
945 * catch the case where ctime either didn't change, or went backwards
946 * (if someone reset the clock on the server) by looking at whether
947 * or not this RPC call was started after the inode was last updated.
948 * Note also the check for wraparound of 'attr_gencount'
949 *
950 * The function returns 'true' if it thinks the attributes in 'fattr' are
951 * more recent than the ones cached in the inode.
952 *
953 */
954static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
955{
956 const struct nfs_inode *nfsi = NFS_I(inode);
957
958 return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 ||
959 nfs_ctime_need_update(inode, fattr) ||
960 nfs_size_need_update(inode, fattr) ||
961 ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
962}
963
964static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
965{
966 if (nfs_inode_attrs_need_update(inode, fattr))
967 return nfs_update_inode(inode, fattr);
968 return nfs_check_inode_attributes(inode, fattr);
969}
970
951/** 971/**
952 * nfs_refresh_inode - try to update the inode attribute cache 972 * nfs_refresh_inode - try to update the inode attribute cache
953 * @inode - pointer to inode 973 * @inode - pointer to inode
@@ -960,21 +980,28 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
960 */ 980 */
961int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) 981int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
962{ 982{
963 struct nfs_inode *nfsi = NFS_I(inode);
964 int status; 983 int status;
965 984
966 if ((fattr->valid & NFS_ATTR_FATTR) == 0) 985 if ((fattr->valid & NFS_ATTR_FATTR) == 0)
967 return 0; 986 return 0;
968 spin_lock(&inode->i_lock); 987 spin_lock(&inode->i_lock);
969 if (time_after(fattr->time_start, nfsi->last_updated)) 988 status = nfs_refresh_inode_locked(inode, fattr);
970 status = nfs_update_inode(inode, fattr);
971 else
972 status = nfs_check_inode_attributes(inode, fattr);
973
974 spin_unlock(&inode->i_lock); 989 spin_unlock(&inode->i_lock);
975 return status; 990 return status;
976} 991}
977 992
993static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
994{
995 struct nfs_inode *nfsi = NFS_I(inode);
996
997 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
998 if (S_ISDIR(inode->i_mode))
999 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
1000 if ((fattr->valid & NFS_ATTR_FATTR) == 0)
1001 return 0;
1002 return nfs_refresh_inode_locked(inode, fattr);
1003}
1004
978/** 1005/**
979 * nfs_post_op_update_inode - try to update the inode attribute cache 1006 * nfs_post_op_update_inode - try to update the inode attribute cache
980 * @inode - pointer to inode 1007 * @inode - pointer to inode
@@ -991,14 +1018,12 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
991 */ 1018 */
992int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) 1019int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
993{ 1020{
994 struct nfs_inode *nfsi = NFS_I(inode); 1021 int status;
995 1022
996 spin_lock(&inode->i_lock); 1023 spin_lock(&inode->i_lock);
997 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 1024 status = nfs_post_op_update_inode_locked(inode, fattr);
998 if (S_ISDIR(inode->i_mode))
999 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
1000 spin_unlock(&inode->i_lock); 1025 spin_unlock(&inode->i_lock);
1001 return nfs_refresh_inode(inode, fattr); 1026 return status;
1002} 1027}
1003 1028
1004/** 1029/**
@@ -1014,6 +1039,15 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1014 */ 1039 */
1015int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) 1040int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr)
1016{ 1041{
1042 int status;
1043
1044 spin_lock(&inode->i_lock);
1045 /* Don't do a WCC update if these attributes are already stale */
1046 if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
1047 !nfs_inode_attrs_need_update(inode, fattr)) {
1048 fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC);
1049 goto out_noforce;
1050 }
1017 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && 1051 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
1018 (fattr->valid & NFS_ATTR_WCC_V4) == 0) { 1052 (fattr->valid & NFS_ATTR_WCC_V4) == 0) {
1019 fattr->pre_change_attr = NFS_I(inode)->change_attr; 1053 fattr->pre_change_attr = NFS_I(inode)->change_attr;
@@ -1026,7 +1060,10 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
1026 fattr->pre_size = i_size_read(inode); 1060 fattr->pre_size = i_size_read(inode);
1027 fattr->valid |= NFS_ATTR_WCC; 1061 fattr->valid |= NFS_ATTR_WCC;
1028 } 1062 }
1029 return nfs_post_op_update_inode(inode, fattr); 1063out_noforce:
1064 status = nfs_post_op_update_inode_locked(inode, fattr);
1065 spin_unlock(&inode->i_lock);
1066 return status;
1030} 1067}
1031 1068
1032/* 1069/*
@@ -1092,7 +1129,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1092 } 1129 }
1093 /* If ctime has changed we should definitely clear access+acl caches */ 1130 /* If ctime has changed we should definitely clear access+acl caches */
1094 if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) 1131 if (!timespec_equal(&inode->i_ctime, &fattr->ctime))
1095 invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1132 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1096 } else if (nfsi->change_attr != fattr->change_attr) { 1133 } else if (nfsi->change_attr != fattr->change_attr) {
1097 dprintk("NFS: change_attr change on server for file %s/%ld\n", 1134 dprintk("NFS: change_attr change on server for file %s/%ld\n",
1098 inode->i_sb->s_id, inode->i_ino); 1135 inode->i_sb->s_id, inode->i_ino);
@@ -1126,6 +1163,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1126 inode->i_gid != fattr->gid) 1163 inode->i_gid != fattr->gid)
1127 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1164 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1128 1165
1166 if (inode->i_nlink != fattr->nlink)
1167 invalid |= NFS_INO_INVALID_ATTR;
1168
1129 inode->i_mode = fattr->mode; 1169 inode->i_mode = fattr->mode;
1130 inode->i_nlink = fattr->nlink; 1170 inode->i_nlink = fattr->nlink;
1131 inode->i_uid = fattr->uid; 1171 inode->i_uid = fattr->uid;
@@ -1145,18 +1185,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1145 nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); 1185 nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
1146 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); 1186 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
1147 nfsi->attrtimeo_timestamp = now; 1187 nfsi->attrtimeo_timestamp = now;
1148 nfsi->last_updated = now; 1188 nfsi->attr_gencount = nfs_inc_attr_generation_counter();
1149 } else { 1189 } else {
1150 if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { 1190 if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
1151 if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) 1191 if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
1152 nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); 1192 nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
1153 nfsi->attrtimeo_timestamp = now; 1193 nfsi->attrtimeo_timestamp = now;
1154 } 1194 }
1155 /*
1156 * Avoid jiffy wraparound issues with nfsi->last_updated
1157 */
1158 if (!time_in_range(nfsi->last_updated, nfsi->read_cache_jiffies, now))
1159 nfsi->last_updated = nfsi->read_cache_jiffies;
1160 } 1195 }
1161 invalid &= ~NFS_INO_INVALID_ATTR; 1196 invalid &= ~NFS_INO_INVALID_ATTR;
1162 /* Don't invalidate the data if we were to blame */ 1197 /* Don't invalidate the data if we were to blame */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 24241fcbb98d..d212ee41caf2 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -153,6 +153,7 @@ extern void nfs4_clear_inode(struct inode *);
153void nfs_zap_acl_cache(struct inode *inode); 153void nfs_zap_acl_cache(struct inode *inode);
154 154
155/* super.c */ 155/* super.c */
156void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
156extern struct file_system_type nfs_xdev_fs_type; 157extern struct file_system_type nfs_xdev_fs_type;
157#ifdef CONFIG_NFS_V4 158#ifdef CONFIG_NFS_V4
158extern struct file_system_type nfs4_xdev_fs_type; 159extern struct file_system_type nfs4_xdev_fs_type;
@@ -163,8 +164,8 @@ extern struct rpc_stat nfs_rpcstat;
163 164
164extern int __init register_nfs_fs(void); 165extern int __init register_nfs_fs(void);
165extern void __exit unregister_nfs_fs(void); 166extern void __exit unregister_nfs_fs(void);
166extern void nfs_sb_active(struct nfs_server *server); 167extern void nfs_sb_active(struct super_block *sb);
167extern void nfs_sb_deactive(struct nfs_server *server); 168extern void nfs_sb_deactive(struct super_block *sb);
168 169
169/* namespace.c */ 170/* namespace.c */
170extern char *nfs_path(const char *base, 171extern char *nfs_path(const char *base,
@@ -276,3 +277,23 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
276 PAGE_SIZE - 1) >> PAGE_SHIFT; 277 PAGE_SIZE - 1) >> PAGE_SHIFT;
277} 278}
278 279
280#define IPV6_SCOPE_DELIMITER '%'
281
282/*
283 * Set the port number in an address. Be agnostic about the address
284 * family.
285 */
286static inline void nfs_set_port(struct sockaddr *sap, unsigned short port)
287{
288 struct sockaddr_in *ap = (struct sockaddr_in *)sap;
289 struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap;
290
291 switch (sap->sa_family) {
292 case AF_INET:
293 ap->sin_port = htons(port);
294 break;
295 case AF_INET6:
296 ap6->sin6_port = htons(port);
297 break;
298 }
299}
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 779d2eb649c5..086a6830d785 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -14,6 +14,7 @@
14#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/sched.h> 15#include <linux/sunrpc/sched.h>
16#include <linux/nfs_fs.h> 16#include <linux/nfs_fs.h>
17#include "internal.h"
17 18
18#ifdef RPC_DEBUG 19#ifdef RPC_DEBUG
19# define NFSDBG_FACILITY NFSDBG_MOUNT 20# define NFSDBG_FACILITY NFSDBG_MOUNT
@@ -98,7 +99,7 @@ out_call_err:
98 99
99out_mnt_err: 100out_mnt_err:
100 dprintk("NFS: MNT server returned result %d\n", result.status); 101 dprintk("NFS: MNT server returned result %d\n", result.status);
101 status = -EACCES; 102 status = nfs_stat_to_errno(result.status);
102 goto out; 103 goto out;
103} 104}
104 105
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 66df08dd1caf..64a288ee046d 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -105,7 +105,10 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
105 105
106 dprintk("--> nfs_follow_mountpoint()\n"); 106 dprintk("--> nfs_follow_mountpoint()\n");
107 107
108 BUG_ON(IS_ROOT(dentry)); 108 err = -ESTALE;
109 if (IS_ROOT(dentry))
110 goto out_err;
111
109 dprintk("%s: enter\n", __func__); 112 dprintk("%s: enter\n", __func__);
110 dput(nd->path.dentry); 113 dput(nd->path.dentry);
111 nd->path.dentry = dget(dentry); 114 nd->path.dentry = dget(dentry);
@@ -189,7 +192,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
189 struct nfs_clone_mount *mountdata) 192 struct nfs_clone_mount *mountdata)
190{ 193{
191#ifdef CONFIG_NFS_V4 194#ifdef CONFIG_NFS_V4
192 struct vfsmount *mnt = NULL; 195 struct vfsmount *mnt = ERR_PTR(-EINVAL);
193 switch (server->nfs_client->rpc_ops->version) { 196 switch (server->nfs_client->rpc_ops->version) {
194 case 2: 197 case 2:
195 case 3: 198 case 3:
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 423842f51ac9..cef62557c87d 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -229,6 +229,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
229 229
230 dprintk("NFS call getacl\n"); 230 dprintk("NFS call getacl\n");
231 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; 231 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL];
232 nfs_fattr_init(&fattr);
232 status = rpc_call_sync(server->client_acl, &msg, 0); 233 status = rpc_call_sync(server->client_acl, &msg, 0);
233 dprintk("NFS reply getacl: %d\n", status); 234 dprintk("NFS reply getacl: %d\n", status);
234 235
@@ -322,6 +323,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
322 323
323 dprintk("NFS call setacl\n"); 324 dprintk("NFS call setacl\n");
324 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; 325 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
326 nfs_fattr_init(&fattr);
325 status = rpc_call_sync(server->client_acl, &msg, 0); 327 status = rpc_call_sync(server->client_acl, &msg, 0);
326 nfs_access_zap_cache(inode); 328 nfs_access_zap_cache(inode);
327 nfs_zap_acl_cache(inode); 329 nfs_zap_acl_cache(inode);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 1e750e4574a9..c55be7a7679e 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -699,7 +699,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
699} 699}
700 700
701static int 701static int
702nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, 702do_proc_fsinfo(struct rpc_clnt *client, struct nfs_fh *fhandle,
703 struct nfs_fsinfo *info) 703 struct nfs_fsinfo *info)
704{ 704{
705 struct rpc_message msg = { 705 struct rpc_message msg = {
@@ -711,11 +711,27 @@ nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
711 711
712 dprintk("NFS call fsinfo\n"); 712 dprintk("NFS call fsinfo\n");
713 nfs_fattr_init(info->fattr); 713 nfs_fattr_init(info->fattr);
714 status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); 714 status = rpc_call_sync(client, &msg, 0);
715 dprintk("NFS reply fsinfo: %d\n", status); 715 dprintk("NFS reply fsinfo: %d\n", status);
716 return status; 716 return status;
717} 717}
718 718
719/*
720 * Bare-bones access to fsinfo: this is for nfs_get_root/nfs_get_sb via
721 * nfs_create_server
722 */
723static int
724nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
725 struct nfs_fsinfo *info)
726{
727 int status;
728
729 status = do_proc_fsinfo(server->client, fhandle, info);
730 if (status && server->nfs_client->cl_rpcclient != server->client)
731 status = do_proc_fsinfo(server->nfs_client->cl_rpcclient, fhandle, info);
732 return status;
733}
734
719static int 735static int
720nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, 736nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
721 struct nfs_pathconf *info) 737 struct nfs_pathconf *info)
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index b112857301f7..30befc39b3c6 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -93,21 +93,52 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent,
93 return 0; 93 return 0;
94} 94}
95 95
96/* 96static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
97 * Check if the string represents a "valid" IPv4 address 97 char *page, char *page2,
98 */ 98 const struct nfs4_fs_location *location)
99static inline int valid_ipaddr4(const char *buf)
100{ 99{
101 int rc, count, in[4]; 100 struct vfsmount *mnt = ERR_PTR(-ENOENT);
102 101 char *mnt_path;
103 rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]); 102 int page2len;
104 if (rc != 4) 103 unsigned int s;
105 return -EINVAL; 104
106 for (count = 0; count < 4; count++) { 105 mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
107 if (in[count] > 255) 106 if (IS_ERR(mnt_path))
108 return -EINVAL; 107 return mnt;
108 mountdata->mnt_path = mnt_path;
109 page2 += strlen(mnt_path) + 1;
110 page2len = PAGE_SIZE - strlen(mnt_path) - 1;
111
112 for (s = 0; s < location->nservers; s++) {
113 const struct nfs4_string *buf = &location->servers[s];
114 struct sockaddr_storage addr;
115
116 if (buf->len <= 0 || buf->len >= PAGE_SIZE)
117 continue;
118
119 mountdata->addr = (struct sockaddr *)&addr;
120
121 if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
122 continue;
123 nfs_parse_ip_address(buf->data, buf->len,
124 mountdata->addr, &mountdata->addrlen);
125 if (mountdata->addr->sa_family == AF_UNSPEC)
126 continue;
127 nfs_set_port(mountdata->addr, NFS_PORT);
128
129 strncpy(page2, buf->data, page2len);
130 page2[page2len] = '\0';
131 mountdata->hostname = page2;
132
133 snprintf(page, PAGE_SIZE, "%s:%s",
134 mountdata->hostname,
135 mountdata->mnt_path);
136
137 mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata);
138 if (!IS_ERR(mnt))
139 break;
109 } 140 }
110 return 0; 141 return mnt;
111} 142}
112 143
113/** 144/**
@@ -128,7 +159,6 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
128 .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor, 159 .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
129 }; 160 };
130 char *page = NULL, *page2 = NULL; 161 char *page = NULL, *page2 = NULL;
131 unsigned int s;
132 int loc, error; 162 int loc, error;
133 163
134 if (locations == NULL || locations->nlocations <= 0) 164 if (locations == NULL || locations->nlocations <= 0)
@@ -152,53 +182,16 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
152 goto out; 182 goto out;
153 } 183 }
154 184
155 loc = 0; 185 for (loc = 0; loc < locations->nlocations; loc++) {
156 while (loc < locations->nlocations && IS_ERR(mnt)) {
157 const struct nfs4_fs_location *location = &locations->locations[loc]; 186 const struct nfs4_fs_location *location = &locations->locations[loc];
158 char *mnt_path;
159 187
160 if (location == NULL || location->nservers <= 0 || 188 if (location == NULL || location->nservers <= 0 ||
161 location->rootpath.ncomponents == 0) { 189 location->rootpath.ncomponents == 0)
162 loc++;
163 continue; 190 continue;
164 }
165 191
166 mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); 192 mnt = try_location(&mountdata, page, page2, location);
167 if (IS_ERR(mnt_path)) { 193 if (!IS_ERR(mnt))
168 loc++; 194 break;
169 continue;
170 }
171 mountdata.mnt_path = mnt_path;
172
173 s = 0;
174 while (s < location->nservers) {
175 struct sockaddr_in addr = {
176 .sin_family = AF_INET,
177 .sin_port = htons(NFS_PORT),
178 };
179
180 if (location->servers[s].len <= 0 ||
181 valid_ipaddr4(location->servers[s].data) < 0) {
182 s++;
183 continue;
184 }
185
186 mountdata.hostname = location->servers[s].data;
187 addr.sin_addr.s_addr = in_aton(mountdata.hostname),
188 mountdata.addr = (struct sockaddr *)&addr;
189 mountdata.addrlen = sizeof(addr);
190
191 snprintf(page, PAGE_SIZE, "%s:%s",
192 mountdata.hostname,
193 mountdata.mnt_path);
194
195 mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, &mountdata);
196 if (!IS_ERR(mnt)) {
197 break;
198 }
199 s++;
200 }
201 loc++;
202 } 195 }
203 196
204out: 197out:
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 4dbb84df1b68..193465210d7c 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -65,14 +65,20 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
65 65
66 dprintk("%s: call getattr\n", __func__); 66 dprintk("%s: call getattr\n", __func__);
67 nfs_fattr_init(fattr); 67 nfs_fattr_init(fattr);
68 status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); 68 status = rpc_call_sync(server->client, &msg, 0);
69 /* Retry with default authentication if different */
70 if (status && server->nfs_client->cl_rpcclient != server->client)
71 status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
69 dprintk("%s: reply getattr: %d\n", __func__, status); 72 dprintk("%s: reply getattr: %d\n", __func__, status);
70 if (status) 73 if (status)
71 return status; 74 return status;
72 dprintk("%s: call statfs\n", __func__); 75 dprintk("%s: call statfs\n", __func__);
73 msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS]; 76 msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS];
74 msg.rpc_resp = &fsinfo; 77 msg.rpc_resp = &fsinfo;
75 status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); 78 status = rpc_call_sync(server->client, &msg, 0);
79 /* Retry with default authentication if different */
80 if (status && server->nfs_client->cl_rpcclient != server->client)
81 status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
76 dprintk("%s: reply statfs: %d\n", __func__, status); 82 dprintk("%s: reply statfs: %d\n", __func__, status);
77 if (status) 83 if (status)
78 return status; 84 return status;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ffb697416cb1..8b28b95c9e44 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -91,6 +91,7 @@ enum {
91 /* Mount options that take string arguments */ 91 /* Mount options that take string arguments */
92 Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, 92 Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
93 Opt_addr, Opt_mountaddr, Opt_clientaddr, 93 Opt_addr, Opt_mountaddr, Opt_clientaddr,
94 Opt_lookupcache,
94 95
95 /* Special mount options */ 96 /* Special mount options */
96 Opt_userspace, Opt_deprecated, Opt_sloppy, 97 Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -154,6 +155,8 @@ static const match_table_t nfs_mount_option_tokens = {
154 { Opt_mounthost, "mounthost=%s" }, 155 { Opt_mounthost, "mounthost=%s" },
155 { Opt_mountaddr, "mountaddr=%s" }, 156 { Opt_mountaddr, "mountaddr=%s" },
156 157
158 { Opt_lookupcache, "lookupcache=%s" },
159
157 { Opt_err, NULL } 160 { Opt_err, NULL }
158}; 161};
159 162
@@ -200,6 +203,22 @@ static const match_table_t nfs_secflavor_tokens = {
200 { Opt_sec_err, NULL } 203 { Opt_sec_err, NULL }
201}; 204};
202 205
206enum {
207 Opt_lookupcache_all, Opt_lookupcache_positive,
208 Opt_lookupcache_none,
209
210 Opt_lookupcache_err
211};
212
213static match_table_t nfs_lookupcache_tokens = {
214 { Opt_lookupcache_all, "all" },
215 { Opt_lookupcache_positive, "pos" },
216 { Opt_lookupcache_positive, "positive" },
217 { Opt_lookupcache_none, "none" },
218
219 { Opt_lookupcache_err, NULL }
220};
221
203 222
204static void nfs_umount_begin(struct super_block *); 223static void nfs_umount_begin(struct super_block *);
205static int nfs_statfs(struct dentry *, struct kstatfs *); 224static int nfs_statfs(struct dentry *, struct kstatfs *);
@@ -209,7 +228,6 @@ static int nfs_get_sb(struct file_system_type *, int, const char *, void *, stru
209static int nfs_xdev_get_sb(struct file_system_type *fs_type, 228static int nfs_xdev_get_sb(struct file_system_type *fs_type,
210 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 229 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
211static void nfs_kill_super(struct super_block *); 230static void nfs_kill_super(struct super_block *);
212static void nfs_put_super(struct super_block *);
213static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); 231static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
214 232
215static struct file_system_type nfs_fs_type = { 233static struct file_system_type nfs_fs_type = {
@@ -232,7 +250,6 @@ static const struct super_operations nfs_sops = {
232 .alloc_inode = nfs_alloc_inode, 250 .alloc_inode = nfs_alloc_inode,
233 .destroy_inode = nfs_destroy_inode, 251 .destroy_inode = nfs_destroy_inode,
234 .write_inode = nfs_write_inode, 252 .write_inode = nfs_write_inode,
235 .put_super = nfs_put_super,
236 .statfs = nfs_statfs, 253 .statfs = nfs_statfs,
237 .clear_inode = nfs_clear_inode, 254 .clear_inode = nfs_clear_inode,
238 .umount_begin = nfs_umount_begin, 255 .umount_begin = nfs_umount_begin,
@@ -337,26 +354,20 @@ void __exit unregister_nfs_fs(void)
337 unregister_filesystem(&nfs_fs_type); 354 unregister_filesystem(&nfs_fs_type);
338} 355}
339 356
340void nfs_sb_active(struct nfs_server *server) 357void nfs_sb_active(struct super_block *sb)
341{ 358{
342 atomic_inc(&server->active); 359 struct nfs_server *server = NFS_SB(sb);
343}
344 360
345void nfs_sb_deactive(struct nfs_server *server) 361 if (atomic_inc_return(&server->active) == 1)
346{ 362 atomic_inc(&sb->s_active);
347 if (atomic_dec_and_test(&server->active))
348 wake_up(&server->active_wq);
349} 363}
350 364
351static void nfs_put_super(struct super_block *sb) 365void nfs_sb_deactive(struct super_block *sb)
352{ 366{
353 struct nfs_server *server = NFS_SB(sb); 367 struct nfs_server *server = NFS_SB(sb);
354 /* 368
355 * Make sure there are no outstanding ops to this server. 369 if (atomic_dec_and_test(&server->active))
356 * If so, wait for them to finish before allowing the 370 deactivate_super(sb);
357 * unmount to continue.
358 */
359 wait_event(server->active_wq, atomic_read(&server->active) == 0);
360} 371}
361 372
362/* 373/*
@@ -664,25 +675,6 @@ static void nfs_umount_begin(struct super_block *sb)
664} 675}
665 676
666/* 677/*
667 * Set the port number in an address. Be agnostic about the address family.
668 */
669static void nfs_set_port(struct sockaddr *sap, unsigned short port)
670{
671 switch (sap->sa_family) {
672 case AF_INET: {
673 struct sockaddr_in *ap = (struct sockaddr_in *)sap;
674 ap->sin_port = htons(port);
675 break;
676 }
677 case AF_INET6: {
678 struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
679 ap->sin6_port = htons(port);
680 break;
681 }
682 }
683}
684
685/*
686 * Sanity-check a server address provided by the mount command. 678 * Sanity-check a server address provided by the mount command.
687 * 679 *
688 * Address family must be initialized, and address must not be 680 * Address family must be initialized, and address must not be
@@ -724,20 +716,22 @@ static void nfs_parse_ipv4_address(char *string, size_t str_len,
724 *addr_len = 0; 716 *addr_len = 0;
725} 717}
726 718
727#define IPV6_SCOPE_DELIMITER '%'
728
729#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 719#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
730static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, 720static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
731 const char *delim, 721 const char *delim,
732 struct sockaddr_in6 *sin6) 722 struct sockaddr_in6 *sin6)
733{ 723{
734 char *p; 724 char *p;
735 size_t len; 725 size_t len;
736 726
737 if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) 727 if ((string + str_len) == delim)
738 return ; 728 return 1;
729
739 if (*delim != IPV6_SCOPE_DELIMITER) 730 if (*delim != IPV6_SCOPE_DELIMITER)
740 return; 731 return 0;
732
733 if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
734 return 0;
741 735
742 len = (string + str_len) - delim - 1; 736 len = (string + str_len) - delim - 1;
743 p = kstrndup(delim + 1, len, GFP_KERNEL); 737 p = kstrndup(delim + 1, len, GFP_KERNEL);
@@ -750,14 +744,20 @@ static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
750 scope_id = dev->ifindex; 744 scope_id = dev->ifindex;
751 dev_put(dev); 745 dev_put(dev);
752 } else { 746 } else {
753 /* scope_id is set to zero on error */ 747 if (strict_strtoul(p, 10, &scope_id) == 0) {
754 strict_strtoul(p, 10, &scope_id); 748 kfree(p);
749 return 0;
750 }
755 } 751 }
756 752
757 kfree(p); 753 kfree(p);
754
758 sin6->sin6_scope_id = scope_id; 755 sin6->sin6_scope_id = scope_id;
759 dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id); 756 dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id);
757 return 1;
760 } 758 }
759
760 return 0;
761} 761}
762 762
763static void nfs_parse_ipv6_address(char *string, size_t str_len, 763static void nfs_parse_ipv6_address(char *string, size_t str_len,
@@ -773,9 +773,11 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len,
773 773
774 sin6->sin6_family = AF_INET6; 774 sin6->sin6_family = AF_INET6;
775 *addr_len = sizeof(*sin6); 775 *addr_len = sizeof(*sin6);
776 if (in6_pton(string, str_len, addr, IPV6_SCOPE_DELIMITER, &delim)) { 776 if (in6_pton(string, str_len, addr,
777 nfs_parse_ipv6_scope_id(string, str_len, delim, sin6); 777 IPV6_SCOPE_DELIMITER, &delim) != 0) {
778 return; 778 if (nfs_parse_ipv6_scope_id(string, str_len,
779 delim, sin6) != 0)
780 return;
779 } 781 }
780 } 782 }
781 783
@@ -798,7 +800,7 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len,
798 * If there is a problem constructing the new sockaddr, set the address 800 * If there is a problem constructing the new sockaddr, set the address
799 * family to AF_UNSPEC. 801 * family to AF_UNSPEC.
800 */ 802 */
801static void nfs_parse_ip_address(char *string, size_t str_len, 803void nfs_parse_ip_address(char *string, size_t str_len,
802 struct sockaddr *sap, size_t *addr_len) 804 struct sockaddr *sap, size_t *addr_len)
803{ 805{
804 unsigned int i, colons; 806 unsigned int i, colons;
@@ -1258,6 +1260,30 @@ static int nfs_parse_mount_options(char *raw,
1258 &mnt->mount_server.addrlen); 1260 &mnt->mount_server.addrlen);
1259 kfree(string); 1261 kfree(string);
1260 break; 1262 break;
1263 case Opt_lookupcache:
1264 string = match_strdup(args);
1265 if (string == NULL)
1266 goto out_nomem;
1267 token = match_token(string,
1268 nfs_lookupcache_tokens, args);
1269 kfree(string);
1270 switch (token) {
1271 case Opt_lookupcache_all:
1272 mnt->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE);
1273 break;
1274 case Opt_lookupcache_positive:
1275 mnt->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE;
1276 mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG;
1277 break;
1278 case Opt_lookupcache_none:
1279 mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE;
1280 break;
1281 default:
1282 errors++;
1283 dfprintk(MOUNT, "NFS: invalid "
1284 "lookupcache argument\n");
1285 };
1286 break;
1261 1287
1262 /* 1288 /*
1263 * Special options 1289 * Special options
@@ -1558,7 +1584,7 @@ static int nfs_validate_mount_data(void *options,
1558 * Translate to nfs_parsed_mount_data, which nfs_fill_super 1584 * Translate to nfs_parsed_mount_data, which nfs_fill_super
1559 * can deal with. 1585 * can deal with.
1560 */ 1586 */
1561 args->flags = data->flags; 1587 args->flags = data->flags & NFS_MOUNT_FLAGMASK;
1562 args->rsize = data->rsize; 1588 args->rsize = data->rsize;
1563 args->wsize = data->wsize; 1589 args->wsize = data->wsize;
1564 args->timeo = data->timeo; 1590 args->timeo = data->timeo;
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index f089e5839d7d..ecc295347775 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -99,7 +99,7 @@ static void nfs_async_unlink_release(void *calldata)
99 99
100 nfs_dec_sillycount(data->dir); 100 nfs_dec_sillycount(data->dir);
101 nfs_free_unlinkdata(data); 101 nfs_free_unlinkdata(data);
102 nfs_sb_deactive(NFS_SB(sb)); 102 nfs_sb_deactive(sb);
103} 103}
104 104
105static const struct rpc_call_ops nfs_unlink_ops = { 105static const struct rpc_call_ops nfs_unlink_ops = {
@@ -118,6 +118,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
118 .rpc_message = &msg, 118 .rpc_message = &msg,
119 .callback_ops = &nfs_unlink_ops, 119 .callback_ops = &nfs_unlink_ops,
120 .callback_data = data, 120 .callback_data = data,
121 .workqueue = nfsiod_workqueue,
121 .flags = RPC_TASK_ASYNC, 122 .flags = RPC_TASK_ASYNC,
122 }; 123 };
123 struct rpc_task *task; 124 struct rpc_task *task;
@@ -149,7 +150,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
149 nfs_dec_sillycount(dir); 150 nfs_dec_sillycount(dir);
150 return 0; 151 return 0;
151 } 152 }
152 nfs_sb_active(NFS_SERVER(dir)); 153 nfs_sb_active(dir->i_sb);
153 data->args.fh = NFS_FH(dir); 154 data->args.fh = NFS_FH(dir);
154 nfs_fattr_init(&data->res.dir_attr); 155 nfs_fattr_init(&data->res.dir_attr);
155 156
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3229e217c773..9f9845859fc1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1427,8 +1427,9 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
1427 .bdi = mapping->backing_dev_info, 1427 .bdi = mapping->backing_dev_info,
1428 .sync_mode = WB_SYNC_NONE, 1428 .sync_mode = WB_SYNC_NONE,
1429 .nr_to_write = LONG_MAX, 1429 .nr_to_write = LONG_MAX,
1430 .range_start = 0,
1431 .range_end = LLONG_MAX,
1430 .for_writepages = 1, 1432 .for_writepages = 1,
1431 .range_cyclic = 1,
1432 }; 1433 };
1433 int ret; 1434 int ret;
1434 1435
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 78a5922a2f11..ac8d0233b05c 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -137,7 +137,7 @@ struct nfs_inode {
137 unsigned long attrtimeo_timestamp; 137 unsigned long attrtimeo_timestamp;
138 __u64 change_attr; /* v4 only */ 138 __u64 change_attr; /* v4 only */
139 139
140 unsigned long last_updated; 140 unsigned long attr_gencount;
141 /* "Generation counter" for the attribute cache. This is 141 /* "Generation counter" for the attribute cache. This is
142 * bumped whenever we update the metadata on the 142 * bumped whenever we update the metadata on the
143 * server. 143 * server.
@@ -200,11 +200,10 @@ struct nfs_inode {
200/* 200/*
201 * Bit offsets in flags field 201 * Bit offsets in flags field
202 */ 202 */
203#define NFS_INO_REVALIDATING (0) /* revalidating attrs */ 203#define NFS_INO_ADVISE_RDPLUS (0) /* advise readdirplus */
204#define NFS_INO_ADVISE_RDPLUS (1) /* advise readdirplus */ 204#define NFS_INO_STALE (1) /* possible stale inode */
205#define NFS_INO_STALE (2) /* possible stale inode */ 205#define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */
206#define NFS_INO_ACL_LRU_SET (3) /* Inode is on the LRU list */ 206#define NFS_INO_MOUNTPOINT (3) /* inode is remote mountpoint */
207#define NFS_INO_MOUNTPOINT (4) /* inode is remote mountpoint */
208 207
209static inline struct nfs_inode *NFS_I(const struct inode *inode) 208static inline struct nfs_inode *NFS_I(const struct inode *inode)
210{ 209{
@@ -345,15 +344,11 @@ extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ct
345extern void put_nfs_open_context(struct nfs_open_context *ctx); 344extern void put_nfs_open_context(struct nfs_open_context *ctx);
346extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode); 345extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode);
347extern u64 nfs_compat_user_ino64(u64 fileid); 346extern u64 nfs_compat_user_ino64(u64 fileid);
347extern void nfs_fattr_init(struct nfs_fattr *fattr);
348 348
349/* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */ 349/* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */
350extern __be32 root_nfs_parse_addr(char *name); /*__init*/ 350extern __be32 root_nfs_parse_addr(char *name); /*__init*/
351 351extern unsigned long nfs_inc_attr_generation_counter(void);
352static inline void nfs_fattr_init(struct nfs_fattr *fattr)
353{
354 fattr->valid = 0;
355 fattr->time_start = jiffies;
356}
357 352
358/* 353/*
359 * linux/fs/nfs/file.c 354 * linux/fs/nfs/file.c
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index c9beacd16c00..4e477ae58699 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -119,7 +119,6 @@ struct nfs_server {
119 void (*destroy)(struct nfs_server *); 119 void (*destroy)(struct nfs_server *);
120 120
121 atomic_t active; /* Keep trace of any activity to this server */ 121 atomic_t active; /* Keep trace of any activity to this server */
122 wait_queue_head_t active_wq; /* Wait for any activity to stop */
123 122
124 /* mountd-related mount options */ 123 /* mountd-related mount options */
125 struct sockaddr_storage mountd_address; 124 struct sockaddr_storage mountd_address;
diff --git a/include/linux/nfs_mount.h b/include/linux/nfs_mount.h
index df7c6b7a7ebb..6549a06ac16e 100644
--- a/include/linux/nfs_mount.h
+++ b/include/linux/nfs_mount.h
@@ -65,4 +65,8 @@ struct nfs_mount_data {
65#define NFS_MOUNT_UNSHARED 0x8000 /* 5 */ 65#define NFS_MOUNT_UNSHARED 0x8000 /* 5 */
66#define NFS_MOUNT_FLAGMASK 0xFFFF 66#define NFS_MOUNT_FLAGMASK 0xFFFF
67 67
68/* The following are for internal use only */
69#define NFS_MOUNT_LOOKUP_CACHE_NONEG 0x10000
70#define NFS_MOUNT_LOOKUP_CACHE_NONE 0x20000
71
68#endif 72#endif
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 8c77c11224d1..c1c31acb8a2b 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -36,6 +36,7 @@ struct nfs_fattr {
36 __u32 nlink; 36 __u32 nlink;
37 __u32 uid; 37 __u32 uid;
38 __u32 gid; 38 __u32 gid;
39 dev_t rdev;
39 __u64 size; 40 __u64 size;
40 union { 41 union {
41 struct { 42 struct {
@@ -46,7 +47,6 @@ struct nfs_fattr {
46 __u64 used; 47 __u64 used;
47 } nfs3; 48 } nfs3;
48 } du; 49 } du;
49 dev_t rdev;
50 struct nfs_fsid fsid; 50 struct nfs_fsid fsid;
51 __u64 fileid; 51 __u64 fileid;
52 struct timespec atime; 52 struct timespec atime;
@@ -56,6 +56,7 @@ struct nfs_fattr {
56 __u64 change_attr; /* NFSv4 change attribute */ 56 __u64 change_attr; /* NFSv4 change attribute */
57 __u64 pre_change_attr;/* pre-op NFSv4 change attribute */ 57 __u64 pre_change_attr;/* pre-op NFSv4 change attribute */
58 unsigned long time_start; 58 unsigned long time_start;
59 unsigned long gencount;
59}; 60};
60 61
61#define NFS_ATTR_WCC 0x0001 /* pre-op WCC data */ 62#define NFS_ATTR_WCC 0x0001 /* pre-op WCC data */
@@ -672,16 +673,16 @@ struct nfs4_rename_res {
672 struct nfs_fattr * new_fattr; 673 struct nfs_fattr * new_fattr;
673}; 674};
674 675
675#define NFS4_SETCLIENTID_NAMELEN (56) 676#define NFS4_SETCLIENTID_NAMELEN (127)
676struct nfs4_setclientid { 677struct nfs4_setclientid {
677 const nfs4_verifier * sc_verifier; 678 const nfs4_verifier * sc_verifier;
678 unsigned int sc_name_len; 679 unsigned int sc_name_len;
679 char sc_name[NFS4_SETCLIENTID_NAMELEN]; 680 char sc_name[NFS4_SETCLIENTID_NAMELEN + 1];
680 u32 sc_prog; 681 u32 sc_prog;
681 unsigned int sc_netid_len; 682 unsigned int sc_netid_len;
682 char sc_netid[RPCBIND_MAXNETIDLEN]; 683 char sc_netid[RPCBIND_MAXNETIDLEN + 1];
683 unsigned int sc_uaddr_len; 684 unsigned int sc_uaddr_len;
684 char sc_uaddr[RPCBIND_MAXUADDRLEN]; 685 char sc_uaddr[RPCBIND_MAXUADDRLEN + 1];
685 u32 sc_cb_ident; 686 u32 sc_cb_ident;
686}; 687};
687 688
diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index 4de56b1d372b..54a379c9e8eb 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -66,9 +66,6 @@
66 66
67#define RPCRDMA_INLINE_PAD_THRESH (512)/* payload threshold to pad (bytes) */ 67#define RPCRDMA_INLINE_PAD_THRESH (512)/* payload threshold to pad (bytes) */
68 68
69#define RDMA_RESOLVE_TIMEOUT (5*HZ) /* TBD 5 seconds */
70#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */
71
72/* memory registration strategies */ 69/* memory registration strategies */
73#define RPCRDMA_PERSISTENT_REGISTRATION (1) 70#define RPCRDMA_PERSISTENT_REGISTRATION (1)
74 71
@@ -78,6 +75,7 @@ enum rpcrdma_memreg {
78 RPCRDMA_MEMWINDOWS, 75 RPCRDMA_MEMWINDOWS,
79 RPCRDMA_MEMWINDOWS_ASYNC, 76 RPCRDMA_MEMWINDOWS_ASYNC,
80 RPCRDMA_MTHCAFMR, 77 RPCRDMA_MTHCAFMR,
78 RPCRDMA_FRMR,
81 RPCRDMA_ALLPHYSICAL, 79 RPCRDMA_ALLPHYSICAL,
82 RPCRDMA_LAST 80 RPCRDMA_LAST
83}; 81};
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index da0789fa1b88..4895c341e46d 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -213,10 +213,10 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
213 } 213 }
214 214
215 /* save the nodename */ 215 /* save the nodename */
216 clnt->cl_nodelen = strlen(utsname()->nodename); 216 clnt->cl_nodelen = strlen(init_utsname()->nodename);
217 if (clnt->cl_nodelen > UNX_MAXNODENAME) 217 if (clnt->cl_nodelen > UNX_MAXNODENAME)
218 clnt->cl_nodelen = UNX_MAXNODENAME; 218 clnt->cl_nodelen = UNX_MAXNODENAME;
219 memcpy(clnt->cl_nodename, utsname()->nodename, clnt->cl_nodelen); 219 memcpy(clnt->cl_nodename, init_utsname()->nodename, clnt->cl_nodelen);
220 rpc_register_client(clnt); 220 rpc_register_client(clnt);
221 return clnt; 221 return clnt;
222 222
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 34abc91058d8..41013dd66ac3 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -460,6 +460,28 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi
460 return rpc_run_task(&task_setup_data); 460 return rpc_run_task(&task_setup_data);
461} 461}
462 462
463/*
464 * In the case where rpc clients have been cloned, we want to make
465 * sure that we use the program number/version etc of the actual
466 * owner of the xprt. To do so, we walk back up the tree of parents
467 * to find whoever created the transport and/or whoever has the
468 * autobind flag set.
469 */
470static struct rpc_clnt *rpcb_find_transport_owner(struct rpc_clnt *clnt)
471{
472 struct rpc_clnt *parent = clnt->cl_parent;
473
474 while (parent != clnt) {
475 if (parent->cl_xprt != clnt->cl_xprt)
476 break;
477 if (clnt->cl_autobind)
478 break;
479 clnt = parent;
480 parent = parent->cl_parent;
481 }
482 return clnt;
483}
484
463/** 485/**
464 * rpcb_getport_async - obtain the port for a given RPC service on a given host 486 * rpcb_getport_async - obtain the port for a given RPC service on a given host
465 * @task: task that is waiting for portmapper request 487 * @task: task that is waiting for portmapper request
@@ -469,10 +491,10 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi
469 */ 491 */
470void rpcb_getport_async(struct rpc_task *task) 492void rpcb_getport_async(struct rpc_task *task)
471{ 493{
472 struct rpc_clnt *clnt = task->tk_client; 494 struct rpc_clnt *clnt;
473 struct rpc_procinfo *proc; 495 struct rpc_procinfo *proc;
474 u32 bind_version; 496 u32 bind_version;
475 struct rpc_xprt *xprt = task->tk_xprt; 497 struct rpc_xprt *xprt;
476 struct rpc_clnt *rpcb_clnt; 498 struct rpc_clnt *rpcb_clnt;
477 static struct rpcbind_args *map; 499 static struct rpcbind_args *map;
478 struct rpc_task *child; 500 struct rpc_task *child;
@@ -481,13 +503,13 @@ void rpcb_getport_async(struct rpc_task *task)
481 size_t salen; 503 size_t salen;
482 int status; 504 int status;
483 505
506 clnt = rpcb_find_transport_owner(task->tk_client);
507 xprt = clnt->cl_xprt;
508
484 dprintk("RPC: %5u %s(%s, %u, %u, %d)\n", 509 dprintk("RPC: %5u %s(%s, %u, %u, %d)\n",
485 task->tk_pid, __func__, 510 task->tk_pid, __func__,
486 clnt->cl_server, clnt->cl_prog, clnt->cl_vers, xprt->prot); 511 clnt->cl_server, clnt->cl_prog, clnt->cl_vers, xprt->prot);
487 512
488 /* Autobind on cloned rpc clients is discouraged */
489 BUG_ON(clnt->cl_parent != clnt);
490
491 /* Put self on the wait queue to ensure we get notified if 513 /* Put self on the wait queue to ensure we get notified if
492 * some other task is already attempting to bind the port */ 514 * some other task is already attempting to bind the port */
493 rpc_sleep_on(&xprt->binding, task, NULL); 515 rpc_sleep_on(&xprt->binding, task, NULL);
@@ -549,7 +571,7 @@ void rpcb_getport_async(struct rpc_task *task)
549 status = -ENOMEM; 571 status = -ENOMEM;
550 dprintk("RPC: %5u %s: no memory available\n", 572 dprintk("RPC: %5u %s: no memory available\n",
551 task->tk_pid, __func__); 573 task->tk_pid, __func__);
552 goto bailout_nofree; 574 goto bailout_release_client;
553 } 575 }
554 map->r_prog = clnt->cl_prog; 576 map->r_prog = clnt->cl_prog;
555 map->r_vers = clnt->cl_vers; 577 map->r_vers = clnt->cl_vers;
@@ -569,11 +591,13 @@ void rpcb_getport_async(struct rpc_task *task)
569 task->tk_pid, __func__); 591 task->tk_pid, __func__);
570 return; 592 return;
571 } 593 }
572 rpc_put_task(child);
573 594
574 task->tk_xprt->stat.bind_count++; 595 xprt->stat.bind_count++;
596 rpc_put_task(child);
575 return; 597 return;
576 598
599bailout_release_client:
600 rpc_release_client(rpcb_clnt);
577bailout_nofree: 601bailout_nofree:
578 rpcb_wake_rpcbind_waiters(xprt, status); 602 rpcb_wake_rpcbind_waiters(xprt, status);
579 task->tk_status = status; 603 task->tk_status = status;
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 99a52aabe332..29e401bb612e 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -108,13 +108,10 @@ int xprt_register_transport(struct xprt_class *transport)
108 goto out; 108 goto out;
109 } 109 }
110 110
111 result = -EINVAL; 111 list_add_tail(&transport->list, &xprt_list);
112 if (try_module_get(THIS_MODULE)) { 112 printk(KERN_INFO "RPC: Registered %s transport module.\n",
113 list_add_tail(&transport->list, &xprt_list); 113 transport->name);
114 printk(KERN_INFO "RPC: Registered %s transport module.\n", 114 result = 0;
115 transport->name);
116 result = 0;
117 }
118 115
119out: 116out:
120 spin_unlock(&xprt_list_lock); 117 spin_unlock(&xprt_list_lock);
@@ -143,7 +140,6 @@ int xprt_unregister_transport(struct xprt_class *transport)
143 "RPC: Unregistered %s transport module.\n", 140 "RPC: Unregistered %s transport module.\n",
144 transport->name); 141 transport->name);
145 list_del_init(&transport->list); 142 list_del_init(&transport->list);
146 module_put(THIS_MODULE);
147 goto out; 143 goto out;
148 } 144 }
149 } 145 }
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 5c1954d28d09..14106d26bb95 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -118,6 +118,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
118 } 118 }
119 119
120 if (xdrbuf->tail[0].iov_len) { 120 if (xdrbuf->tail[0].iov_len) {
121 /* the rpcrdma protocol allows us to omit any trailing
122 * xdr pad bytes, saving the server an RDMA operation. */
123 if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
124 return n;
121 if (n == nsegs) 125 if (n == nsegs)
122 return 0; 126 return 0;
123 seg[n].mr_page = NULL; 127 seg[n].mr_page = NULL;
@@ -508,8 +512,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
508 if (hdrlen == 0) 512 if (hdrlen == 0)
509 return -1; 513 return -1;
510 514
511 dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd\n" 515 dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
512 " headerp 0x%p base 0x%p lkey 0x%x\n", 516 " headerp 0x%p base 0x%p lkey 0x%x\n",
513 __func__, transfertypes[wtype], hdrlen, rpclen, padlen, 517 __func__, transfertypes[wtype], hdrlen, rpclen, padlen,
514 headerp, base, req->rl_iov.lkey); 518 headerp, base, req->rl_iov.lkey);
515 519
@@ -594,7 +598,7 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b
594 * Scatter inline received data back into provided iov's. 598 * Scatter inline received data back into provided iov's.
595 */ 599 */
596static void 600static void
597rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len) 601rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
598{ 602{
599 int i, npages, curlen, olen; 603 int i, npages, curlen, olen;
600 char *destp; 604 char *destp;
@@ -660,6 +664,13 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len)
660 } else 664 } else
661 rqst->rq_rcv_buf.tail[0].iov_len = 0; 665 rqst->rq_rcv_buf.tail[0].iov_len = 0;
662 666
667 if (pad) {
668 /* implicit padding on terminal chunk */
669 unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base;
670 while (pad--)
671 p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0;
672 }
673
663 if (copy_len) 674 if (copy_len)
664 dprintk("RPC: %s: %d bytes in" 675 dprintk("RPC: %s: %d bytes in"
665 " %d extra segments (%d lost)\n", 676 " %d extra segments (%d lost)\n",
@@ -681,12 +692,14 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
681 struct rpc_xprt *xprt = ep->rep_xprt; 692 struct rpc_xprt *xprt = ep->rep_xprt;
682 693
683 spin_lock_bh(&xprt->transport_lock); 694 spin_lock_bh(&xprt->transport_lock);
695 if (++xprt->connect_cookie == 0) /* maintain a reserved value */
696 ++xprt->connect_cookie;
684 if (ep->rep_connected > 0) { 697 if (ep->rep_connected > 0) {
685 if (!xprt_test_and_set_connected(xprt)) 698 if (!xprt_test_and_set_connected(xprt))
686 xprt_wake_pending_tasks(xprt, 0); 699 xprt_wake_pending_tasks(xprt, 0);
687 } else { 700 } else {
688 if (xprt_test_and_clear_connected(xprt)) 701 if (xprt_test_and_clear_connected(xprt))
689 xprt_wake_pending_tasks(xprt, ep->rep_connected); 702 xprt_wake_pending_tasks(xprt, -ENOTCONN);
690 } 703 }
691 spin_unlock_bh(&xprt->transport_lock); 704 spin_unlock_bh(&xprt->transport_lock);
692} 705}
@@ -792,14 +805,20 @@ repost:
792 ((unsigned char *)iptr - (unsigned char *)headerp); 805 ((unsigned char *)iptr - (unsigned char *)headerp);
793 status = rep->rr_len + rdmalen; 806 status = rep->rr_len + rdmalen;
794 r_xprt->rx_stats.total_rdma_reply += rdmalen; 807 r_xprt->rx_stats.total_rdma_reply += rdmalen;
808 /* special case - last chunk may omit padding */
809 if (rdmalen &= 3) {
810 rdmalen = 4 - rdmalen;
811 status += rdmalen;
812 }
795 } else { 813 } else {
796 /* else ordinary inline */ 814 /* else ordinary inline */
815 rdmalen = 0;
797 iptr = (__be32 *)((unsigned char *)headerp + 28); 816 iptr = (__be32 *)((unsigned char *)headerp + 28);
798 rep->rr_len -= 28; /*sizeof *headerp;*/ 817 rep->rr_len -= 28; /*sizeof *headerp;*/
799 status = rep->rr_len; 818 status = rep->rr_len;
800 } 819 }
801 /* Fix up the rpc results for upper layer */ 820 /* Fix up the rpc results for upper layer */
802 rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len); 821 rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen);
803 break; 822 break;
804 823
805 case htonl(RDMA_NOMSG): 824 case htonl(RDMA_NOMSG):
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index a564c1a39ec5..9839c3d94145 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -70,11 +70,8 @@ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
70static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; 70static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
71static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; 71static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
72static unsigned int xprt_rdma_inline_write_padding; 72static unsigned int xprt_rdma_inline_write_padding;
73#if !RPCRDMA_PERSISTENT_REGISTRATION 73static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
74static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_REGISTER; /* FMR? */ 74 int xprt_rdma_pad_optimize = 0;
75#else
76static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_ALLPHYSICAL;
77#endif
78 75
79#ifdef RPC_DEBUG 76#ifdef RPC_DEBUG
80 77
@@ -140,6 +137,14 @@ static ctl_table xr_tunables_table[] = {
140 .extra2 = &max_memreg, 137 .extra2 = &max_memreg,
141 }, 138 },
142 { 139 {
140 .ctl_name = CTL_UNNUMBERED,
141 .procname = "rdma_pad_optimize",
142 .data = &xprt_rdma_pad_optimize,
143 .maxlen = sizeof(unsigned int),
144 .mode = 0644,
145 .proc_handler = &proc_dointvec,
146 },
147 {
143 .ctl_name = 0, 148 .ctl_name = 0,
144 }, 149 },
145}; 150};
@@ -458,6 +463,8 @@ xprt_rdma_close(struct rpc_xprt *xprt)
458 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 463 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
459 464
460 dprintk("RPC: %s: closing\n", __func__); 465 dprintk("RPC: %s: closing\n", __func__);
466 if (r_xprt->rx_ep.rep_connected > 0)
467 xprt->reestablish_timeout = 0;
461 xprt_disconnect_done(xprt); 468 xprt_disconnect_done(xprt);
462 (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); 469 (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
463} 470}
@@ -485,6 +492,11 @@ xprt_rdma_connect(struct rpc_task *task)
485 /* Reconnect */ 492 /* Reconnect */
486 schedule_delayed_work(&r_xprt->rdma_connect, 493 schedule_delayed_work(&r_xprt->rdma_connect,
487 xprt->reestablish_timeout); 494 xprt->reestablish_timeout);
495 xprt->reestablish_timeout <<= 1;
496 if (xprt->reestablish_timeout > (30 * HZ))
497 xprt->reestablish_timeout = (30 * HZ);
498 else if (xprt->reestablish_timeout < (5 * HZ))
499 xprt->reestablish_timeout = (5 * HZ);
488 } else { 500 } else {
489 schedule_delayed_work(&r_xprt->rdma_connect, 0); 501 schedule_delayed_work(&r_xprt->rdma_connect, 0);
490 if (!RPC_IS_ASYNC(task)) 502 if (!RPC_IS_ASYNC(task))
@@ -591,6 +603,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
591 } 603 }
592 dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); 604 dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
593out: 605out:
606 req->rl_connect_cookie = 0; /* our reserved value */
594 return req->rl_xdr_buf; 607 return req->rl_xdr_buf;
595 608
596outfail: 609outfail:
@@ -694,13 +707,21 @@ xprt_rdma_send_request(struct rpc_task *task)
694 req->rl_reply->rr_xprt = xprt; 707 req->rl_reply->rr_xprt = xprt;
695 } 708 }
696 709
697 if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) { 710 /* Must suppress retransmit to maintain credits */
698 xprt_disconnect_done(xprt); 711 if (req->rl_connect_cookie == xprt->connect_cookie)
699 return -ENOTCONN; /* implies disconnect */ 712 goto drop_connection;
700 } 713 req->rl_connect_cookie = xprt->connect_cookie;
714
715 if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
716 goto drop_connection;
701 717
718 task->tk_bytes_sent += rqst->rq_snd_buf.len;
702 rqst->rq_bytes_sent = 0; 719 rqst->rq_bytes_sent = 0;
703 return 0; 720 return 0;
721
722drop_connection:
723 xprt_disconnect_done(xprt);
724 return -ENOTCONN; /* implies disconnect */
704} 725}
705 726
706static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) 727static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
@@ -770,7 +791,7 @@ static void __exit xprt_rdma_cleanup(void)
770{ 791{
771 int rc; 792 int rc;
772 793
773 dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n"); 794 dprintk(KERN_INFO "RPCRDMA Module Removed, deregister RPC RDMA transport\n");
774#ifdef RPC_DEBUG 795#ifdef RPC_DEBUG
775 if (sunrpc_table_header) { 796 if (sunrpc_table_header) {
776 unregister_sysctl_table(sunrpc_table_header); 797 unregister_sysctl_table(sunrpc_table_header);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 8ea283ecc522..a5fef5e6c323 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -284,6 +284,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
284 switch (event->event) { 284 switch (event->event) {
285 case RDMA_CM_EVENT_ADDR_RESOLVED: 285 case RDMA_CM_EVENT_ADDR_RESOLVED:
286 case RDMA_CM_EVENT_ROUTE_RESOLVED: 286 case RDMA_CM_EVENT_ROUTE_RESOLVED:
287 ia->ri_async_rc = 0;
287 complete(&ia->ri_done); 288 complete(&ia->ri_done);
288 break; 289 break;
289 case RDMA_CM_EVENT_ADDR_ERROR: 290 case RDMA_CM_EVENT_ADDR_ERROR:
@@ -338,13 +339,32 @@ connected:
338 wake_up_all(&ep->rep_connect_wait); 339 wake_up_all(&ep->rep_connect_wait);
339 break; 340 break;
340 default: 341 default:
341 ia->ri_async_rc = -EINVAL; 342 dprintk("RPC: %s: unexpected CM event %d\n",
342 dprintk("RPC: %s: unexpected CM event %X\n",
343 __func__, event->event); 343 __func__, event->event);
344 complete(&ia->ri_done);
345 break; 344 break;
346 } 345 }
347 346
347#ifdef RPC_DEBUG
348 if (connstate == 1) {
349 int ird = attr.max_dest_rd_atomic;
350 int tird = ep->rep_remote_cma.responder_resources;
351 printk(KERN_INFO "rpcrdma: connection to %u.%u.%u.%u:%u "
352 "on %s, memreg %d slots %d ird %d%s\n",
353 NIPQUAD(addr->sin_addr.s_addr),
354 ntohs(addr->sin_port),
355 ia->ri_id->device->name,
356 ia->ri_memreg_strategy,
357 xprt->rx_buf.rb_max_requests,
358 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
359 } else if (connstate < 0) {
360 printk(KERN_INFO "rpcrdma: connection to %u.%u.%u.%u:%u "
361 "closed (%d)\n",
362 NIPQUAD(addr->sin_addr.s_addr),
363 ntohs(addr->sin_port),
364 connstate);
365 }
366#endif
367
348 return 0; 368 return 0;
349} 369}
350 370
@@ -355,6 +375,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
355 struct rdma_cm_id *id; 375 struct rdma_cm_id *id;
356 int rc; 376 int rc;
357 377
378 init_completion(&ia->ri_done);
379
358 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP); 380 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
359 if (IS_ERR(id)) { 381 if (IS_ERR(id)) {
360 rc = PTR_ERR(id); 382 rc = PTR_ERR(id);
@@ -363,26 +385,28 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
363 return id; 385 return id;
364 } 386 }
365 387
366 ia->ri_async_rc = 0; 388 ia->ri_async_rc = -ETIMEDOUT;
367 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT); 389 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
368 if (rc) { 390 if (rc) {
369 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n", 391 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
370 __func__, rc); 392 __func__, rc);
371 goto out; 393 goto out;
372 } 394 }
373 wait_for_completion(&ia->ri_done); 395 wait_for_completion_interruptible_timeout(&ia->ri_done,
396 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
374 rc = ia->ri_async_rc; 397 rc = ia->ri_async_rc;
375 if (rc) 398 if (rc)
376 goto out; 399 goto out;
377 400
378 ia->ri_async_rc = 0; 401 ia->ri_async_rc = -ETIMEDOUT;
379 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); 402 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
380 if (rc) { 403 if (rc) {
381 dprintk("RPC: %s: rdma_resolve_route() failed %i\n", 404 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
382 __func__, rc); 405 __func__, rc);
383 goto out; 406 goto out;
384 } 407 }
385 wait_for_completion(&ia->ri_done); 408 wait_for_completion_interruptible_timeout(&ia->ri_done,
409 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
386 rc = ia->ri_async_rc; 410 rc = ia->ri_async_rc;
387 if (rc) 411 if (rc)
388 goto out; 412 goto out;
@@ -423,11 +447,10 @@ rpcrdma_clean_cq(struct ib_cq *cq)
423int 447int
424rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) 448rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
425{ 449{
426 int rc; 450 int rc, mem_priv;
451 struct ib_device_attr devattr;
427 struct rpcrdma_ia *ia = &xprt->rx_ia; 452 struct rpcrdma_ia *ia = &xprt->rx_ia;
428 453
429 init_completion(&ia->ri_done);
430
431 ia->ri_id = rpcrdma_create_id(xprt, ia, addr); 454 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
432 if (IS_ERR(ia->ri_id)) { 455 if (IS_ERR(ia->ri_id)) {
433 rc = PTR_ERR(ia->ri_id); 456 rc = PTR_ERR(ia->ri_id);
@@ -443,6 +466,73 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
443 } 466 }
444 467
445 /* 468 /*
469 * Query the device to determine if the requested memory
470 * registration strategy is supported. If it isn't, set the
471 * strategy to a globally supported model.
472 */
473 rc = ib_query_device(ia->ri_id->device, &devattr);
474 if (rc) {
475 dprintk("RPC: %s: ib_query_device failed %d\n",
476 __func__, rc);
477 goto out2;
478 }
479
480 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
481 ia->ri_have_dma_lkey = 1;
482 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
483 }
484
485 switch (memreg) {
486 case RPCRDMA_MEMWINDOWS:
487 case RPCRDMA_MEMWINDOWS_ASYNC:
488 if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
489 dprintk("RPC: %s: MEMWINDOWS registration "
490 "specified but not supported by adapter, "
491 "using slower RPCRDMA_REGISTER\n",
492 __func__);
493 memreg = RPCRDMA_REGISTER;
494 }
495 break;
496 case RPCRDMA_MTHCAFMR:
497 if (!ia->ri_id->device->alloc_fmr) {
498#if RPCRDMA_PERSISTENT_REGISTRATION
499 dprintk("RPC: %s: MTHCAFMR registration "
500 "specified but not supported by adapter, "
501 "using riskier RPCRDMA_ALLPHYSICAL\n",
502 __func__);
503 memreg = RPCRDMA_ALLPHYSICAL;
504#else
505 dprintk("RPC: %s: MTHCAFMR registration "
506 "specified but not supported by adapter, "
507 "using slower RPCRDMA_REGISTER\n",
508 __func__);
509 memreg = RPCRDMA_REGISTER;
510#endif
511 }
512 break;
513 case RPCRDMA_FRMR:
514 /* Requires both frmr reg and local dma lkey */
515 if ((devattr.device_cap_flags &
516 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
517 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
518#if RPCRDMA_PERSISTENT_REGISTRATION
519 dprintk("RPC: %s: FRMR registration "
520 "specified but not supported by adapter, "
521 "using riskier RPCRDMA_ALLPHYSICAL\n",
522 __func__);
523 memreg = RPCRDMA_ALLPHYSICAL;
524#else
525 dprintk("RPC: %s: FRMR registration "
526 "specified but not supported by adapter, "
527 "using slower RPCRDMA_REGISTER\n",
528 __func__);
529 memreg = RPCRDMA_REGISTER;
530#endif
531 }
532 break;
533 }
534
535 /*
446 * Optionally obtain an underlying physical identity mapping in 536 * Optionally obtain an underlying physical identity mapping in
447 * order to do a memory window-based bind. This base registration 537 * order to do a memory window-based bind. This base registration
448 * is protected from remote access - that is enabled only by binding 538 * is protected from remote access - that is enabled only by binding
@@ -450,22 +540,28 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
450 * revoked after the corresponding completion similar to a storage 540 * revoked after the corresponding completion similar to a storage
451 * adapter. 541 * adapter.
452 */ 542 */
453 if (memreg > RPCRDMA_REGISTER) { 543 switch (memreg) {
454 int mem_priv = IB_ACCESS_LOCAL_WRITE; 544 case RPCRDMA_BOUNCEBUFFERS:
455 switch (memreg) { 545 case RPCRDMA_REGISTER:
546 case RPCRDMA_FRMR:
547 break;
456#if RPCRDMA_PERSISTENT_REGISTRATION 548#if RPCRDMA_PERSISTENT_REGISTRATION
457 case RPCRDMA_ALLPHYSICAL: 549 case RPCRDMA_ALLPHYSICAL:
458 mem_priv |= IB_ACCESS_REMOTE_WRITE; 550 mem_priv = IB_ACCESS_LOCAL_WRITE |
459 mem_priv |= IB_ACCESS_REMOTE_READ; 551 IB_ACCESS_REMOTE_WRITE |
460 break; 552 IB_ACCESS_REMOTE_READ;
553 goto register_setup;
461#endif 554#endif
462 case RPCRDMA_MEMWINDOWS_ASYNC: 555 case RPCRDMA_MEMWINDOWS_ASYNC:
463 case RPCRDMA_MEMWINDOWS: 556 case RPCRDMA_MEMWINDOWS:
464 mem_priv |= IB_ACCESS_MW_BIND; 557 mem_priv = IB_ACCESS_LOCAL_WRITE |
465 break; 558 IB_ACCESS_MW_BIND;
466 default: 559 goto register_setup;
560 case RPCRDMA_MTHCAFMR:
561 if (ia->ri_have_dma_lkey)
467 break; 562 break;
468 } 563 mem_priv = IB_ACCESS_LOCAL_WRITE;
564 register_setup:
469 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); 565 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
470 if (IS_ERR(ia->ri_bind_mem)) { 566 if (IS_ERR(ia->ri_bind_mem)) {
471 printk(KERN_ALERT "%s: ib_get_dma_mr for " 567 printk(KERN_ALERT "%s: ib_get_dma_mr for "
@@ -475,7 +571,15 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
475 memreg = RPCRDMA_REGISTER; 571 memreg = RPCRDMA_REGISTER;
476 ia->ri_bind_mem = NULL; 572 ia->ri_bind_mem = NULL;
477 } 573 }
574 break;
575 default:
576 printk(KERN_ERR "%s: invalid memory registration mode %d\n",
577 __func__, memreg);
578 rc = -EINVAL;
579 goto out2;
478 } 580 }
581 dprintk("RPC: %s: memory registration strategy is %d\n",
582 __func__, memreg);
479 583
480 /* Else will do memory reg/dereg for each chunk */ 584 /* Else will do memory reg/dereg for each chunk */
481 ia->ri_memreg_strategy = memreg; 585 ia->ri_memreg_strategy = memreg;
@@ -483,6 +587,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
483 return 0; 587 return 0;
484out2: 588out2:
485 rdma_destroy_id(ia->ri_id); 589 rdma_destroy_id(ia->ri_id);
590 ia->ri_id = NULL;
486out1: 591out1:
487 return rc; 592 return rc;
488} 593}
@@ -503,15 +608,17 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
503 dprintk("RPC: %s: ib_dereg_mr returned %i\n", 608 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
504 __func__, rc); 609 __func__, rc);
505 } 610 }
506 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp) 611 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
507 rdma_destroy_qp(ia->ri_id); 612 if (ia->ri_id->qp)
613 rdma_destroy_qp(ia->ri_id);
614 rdma_destroy_id(ia->ri_id);
615 ia->ri_id = NULL;
616 }
508 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) { 617 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
509 rc = ib_dealloc_pd(ia->ri_pd); 618 rc = ib_dealloc_pd(ia->ri_pd);
510 dprintk("RPC: %s: ib_dealloc_pd returned %i\n", 619 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
511 __func__, rc); 620 __func__, rc);
512 } 621 }
513 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id))
514 rdma_destroy_id(ia->ri_id);
515} 622}
516 623
517/* 624/*
@@ -541,6 +648,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
541 ep->rep_attr.srq = NULL; 648 ep->rep_attr.srq = NULL;
542 ep->rep_attr.cap.max_send_wr = cdata->max_requests; 649 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
543 switch (ia->ri_memreg_strategy) { 650 switch (ia->ri_memreg_strategy) {
651 case RPCRDMA_FRMR:
652 /* Add room for frmr register and invalidate WRs */
653 ep->rep_attr.cap.max_send_wr *= 3;
654 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
655 return -EINVAL;
656 break;
544 case RPCRDMA_MEMWINDOWS_ASYNC: 657 case RPCRDMA_MEMWINDOWS_ASYNC:
545 case RPCRDMA_MEMWINDOWS: 658 case RPCRDMA_MEMWINDOWS:
546 /* Add room for mw_binds+unbinds - overkill! */ 659 /* Add room for mw_binds+unbinds - overkill! */
@@ -617,29 +730,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
617 ep->rep_remote_cma.private_data_len = 0; 730 ep->rep_remote_cma.private_data_len = 0;
618 731
619 /* Client offers RDMA Read but does not initiate */ 732 /* Client offers RDMA Read but does not initiate */
620 switch (ia->ri_memreg_strategy) { 733 ep->rep_remote_cma.initiator_depth = 0;
621 case RPCRDMA_BOUNCEBUFFERS: 734 if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
622 ep->rep_remote_cma.responder_resources = 0; 735 ep->rep_remote_cma.responder_resources = 0;
623 break; 736 else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
624 case RPCRDMA_MTHCAFMR: 737 ep->rep_remote_cma.responder_resources = 32;
625 case RPCRDMA_REGISTER: 738 else
626 ep->rep_remote_cma.responder_resources = cdata->max_requests *
627 (RPCRDMA_MAX_DATA_SEGS / 8);
628 break;
629 case RPCRDMA_MEMWINDOWS:
630 case RPCRDMA_MEMWINDOWS_ASYNC:
631#if RPCRDMA_PERSISTENT_REGISTRATION
632 case RPCRDMA_ALLPHYSICAL:
633#endif
634 ep->rep_remote_cma.responder_resources = cdata->max_requests *
635 (RPCRDMA_MAX_DATA_SEGS / 2);
636 break;
637 default:
638 break;
639 }
640 if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom)
641 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; 739 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
642 ep->rep_remote_cma.initiator_depth = 0;
643 740
644 ep->rep_remote_cma.retry_count = 7; 741 ep->rep_remote_cma.retry_count = 7;
645 ep->rep_remote_cma.flow_control = 0; 742 ep->rep_remote_cma.flow_control = 0;
@@ -679,21 +776,16 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
679 if (rc) 776 if (rc)
680 dprintk("RPC: %s: rpcrdma_ep_disconnect" 777 dprintk("RPC: %s: rpcrdma_ep_disconnect"
681 " returned %i\n", __func__, rc); 778 " returned %i\n", __func__, rc);
779 rdma_destroy_qp(ia->ri_id);
780 ia->ri_id->qp = NULL;
682 } 781 }
683 782
684 ep->rep_func = NULL;
685
686 /* padding - could be done in rpcrdma_buffer_destroy... */ 783 /* padding - could be done in rpcrdma_buffer_destroy... */
687 if (ep->rep_pad_mr) { 784 if (ep->rep_pad_mr) {
688 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad); 785 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
689 ep->rep_pad_mr = NULL; 786 ep->rep_pad_mr = NULL;
690 } 787 }
691 788
692 if (ia->ri_id->qp) {
693 rdma_destroy_qp(ia->ri_id);
694 ia->ri_id->qp = NULL;
695 }
696
697 rpcrdma_clean_cq(ep->rep_cq); 789 rpcrdma_clean_cq(ep->rep_cq);
698 rc = ib_destroy_cq(ep->rep_cq); 790 rc = ib_destroy_cq(ep->rep_cq);
699 if (rc) 791 if (rc)
@@ -712,9 +804,8 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
712 struct rdma_cm_id *id; 804 struct rdma_cm_id *id;
713 int rc = 0; 805 int rc = 0;
714 int retry_count = 0; 806 int retry_count = 0;
715 int reconnect = (ep->rep_connected != 0);
716 807
717 if (reconnect) { 808 if (ep->rep_connected != 0) {
718 struct rpcrdma_xprt *xprt; 809 struct rpcrdma_xprt *xprt;
719retry: 810retry:
720 rc = rpcrdma_ep_disconnect(ep, ia); 811 rc = rpcrdma_ep_disconnect(ep, ia);
@@ -745,6 +836,7 @@ retry:
745 goto out; 836 goto out;
746 } 837 }
747 /* END TEMP */ 838 /* END TEMP */
839 rdma_destroy_qp(ia->ri_id);
748 rdma_destroy_id(ia->ri_id); 840 rdma_destroy_id(ia->ri_id);
749 ia->ri_id = id; 841 ia->ri_id = id;
750 } 842 }
@@ -769,14 +861,6 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
769 } 861 }
770} 862}
771 863
772 /* Theoretically a client initiator_depth > 0 is not needed,
773 * but many peers fail to complete the connection unless they
774 * == responder_resources! */
775 if (ep->rep_remote_cma.initiator_depth !=
776 ep->rep_remote_cma.responder_resources)
777 ep->rep_remote_cma.initiator_depth =
778 ep->rep_remote_cma.responder_resources;
779
780 ep->rep_connected = 0; 864 ep->rep_connected = 0;
781 865
782 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); 866 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
@@ -786,9 +870,6 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
786 goto out; 870 goto out;
787 } 871 }
788 872
789 if (reconnect)
790 return 0;
791
792 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); 873 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
793 874
794 /* 875 /*
@@ -805,14 +886,16 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
805 if (ep->rep_connected <= 0) { 886 if (ep->rep_connected <= 0) {
806 /* Sometimes, the only way to reliably connect to remote 887 /* Sometimes, the only way to reliably connect to remote
807 * CMs is to use same nonzero values for ORD and IRD. */ 888 * CMs is to use same nonzero values for ORD and IRD. */
808 ep->rep_remote_cma.initiator_depth = 889 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
809 ep->rep_remote_cma.responder_resources; 890 (ep->rep_remote_cma.responder_resources == 0 ||
810 if (ep->rep_remote_cma.initiator_depth == 0) 891 ep->rep_remote_cma.initiator_depth !=
811 ++ep->rep_remote_cma.initiator_depth; 892 ep->rep_remote_cma.responder_resources)) {
812 if (ep->rep_remote_cma.responder_resources == 0) 893 if (ep->rep_remote_cma.responder_resources == 0)
813 ++ep->rep_remote_cma.responder_resources; 894 ep->rep_remote_cma.responder_resources = 1;
814 if (retry_count++ == 0) 895 ep->rep_remote_cma.initiator_depth =
896 ep->rep_remote_cma.responder_resources;
815 goto retry; 897 goto retry;
898 }
816 rc = ep->rep_connected; 899 rc = ep->rep_connected;
817 } else { 900 } else {
818 dprintk("RPC: %s: connected\n", __func__); 901 dprintk("RPC: %s: connected\n", __func__);
@@ -863,6 +946,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
863 char *p; 946 char *p;
864 size_t len; 947 size_t len;
865 int i, rc; 948 int i, rc;
949 struct rpcrdma_mw *r;
866 950
867 buf->rb_max_requests = cdata->max_requests; 951 buf->rb_max_requests = cdata->max_requests;
868 spin_lock_init(&buf->rb_lock); 952 spin_lock_init(&buf->rb_lock);
@@ -873,7 +957,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
873 * 2. arrays of struct rpcrdma_req to fill in pointers 957 * 2. arrays of struct rpcrdma_req to fill in pointers
874 * 3. array of struct rpcrdma_rep for replies 958 * 3. array of struct rpcrdma_rep for replies
875 * 4. padding, if any 959 * 4. padding, if any
876 * 5. mw's, if any 960 * 5. mw's, fmr's or frmr's, if any
877 * Send/recv buffers in req/rep need to be registered 961 * Send/recv buffers in req/rep need to be registered
878 */ 962 */
879 963
@@ -881,6 +965,10 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
881 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); 965 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
882 len += cdata->padding; 966 len += cdata->padding;
883 switch (ia->ri_memreg_strategy) { 967 switch (ia->ri_memreg_strategy) {
968 case RPCRDMA_FRMR:
969 len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
970 sizeof(struct rpcrdma_mw);
971 break;
884 case RPCRDMA_MTHCAFMR: 972 case RPCRDMA_MTHCAFMR:
885 /* TBD we are perhaps overallocating here */ 973 /* TBD we are perhaps overallocating here */
886 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * 974 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
@@ -927,15 +1015,37 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
927 * and also reduce unbind-to-bind collision. 1015 * and also reduce unbind-to-bind collision.
928 */ 1016 */
929 INIT_LIST_HEAD(&buf->rb_mws); 1017 INIT_LIST_HEAD(&buf->rb_mws);
1018 r = (struct rpcrdma_mw *)p;
930 switch (ia->ri_memreg_strategy) { 1019 switch (ia->ri_memreg_strategy) {
1020 case RPCRDMA_FRMR:
1021 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1022 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1023 RPCRDMA_MAX_SEGS);
1024 if (IS_ERR(r->r.frmr.fr_mr)) {
1025 rc = PTR_ERR(r->r.frmr.fr_mr);
1026 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1027 " failed %i\n", __func__, rc);
1028 goto out;
1029 }
1030 r->r.frmr.fr_pgl =
1031 ib_alloc_fast_reg_page_list(ia->ri_id->device,
1032 RPCRDMA_MAX_SEGS);
1033 if (IS_ERR(r->r.frmr.fr_pgl)) {
1034 rc = PTR_ERR(r->r.frmr.fr_pgl);
1035 dprintk("RPC: %s: "
1036 "ib_alloc_fast_reg_page_list "
1037 "failed %i\n", __func__, rc);
1038 goto out;
1039 }
1040 list_add(&r->mw_list, &buf->rb_mws);
1041 ++r;
1042 }
1043 break;
931 case RPCRDMA_MTHCAFMR: 1044 case RPCRDMA_MTHCAFMR:
932 {
933 struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
934 struct ib_fmr_attr fa = {
935 RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT
936 };
937 /* TBD we are perhaps overallocating here */ 1045 /* TBD we are perhaps overallocating here */
938 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { 1046 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1047 static struct ib_fmr_attr fa =
1048 { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
939 r->r.fmr = ib_alloc_fmr(ia->ri_pd, 1049 r->r.fmr = ib_alloc_fmr(ia->ri_pd,
940 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ, 1050 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
941 &fa); 1051 &fa);
@@ -948,12 +1058,9 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
948 list_add(&r->mw_list, &buf->rb_mws); 1058 list_add(&r->mw_list, &buf->rb_mws);
949 ++r; 1059 ++r;
950 } 1060 }
951 }
952 break; 1061 break;
953 case RPCRDMA_MEMWINDOWS_ASYNC: 1062 case RPCRDMA_MEMWINDOWS_ASYNC:
954 case RPCRDMA_MEMWINDOWS: 1063 case RPCRDMA_MEMWINDOWS:
955 {
956 struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
957 /* Allocate one extra request's worth, for full cycling */ 1064 /* Allocate one extra request's worth, for full cycling */
958 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { 1065 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
959 r->r.mw = ib_alloc_mw(ia->ri_pd); 1066 r->r.mw = ib_alloc_mw(ia->ri_pd);
@@ -966,7 +1073,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
966 list_add(&r->mw_list, &buf->rb_mws); 1073 list_add(&r->mw_list, &buf->rb_mws);
967 ++r; 1074 ++r;
968 } 1075 }
969 }
970 break; 1076 break;
971 default: 1077 default:
972 break; 1078 break;
@@ -1046,6 +1152,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1046{ 1152{
1047 int rc, i; 1153 int rc, i;
1048 struct rpcrdma_ia *ia = rdmab_to_ia(buf); 1154 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1155 struct rpcrdma_mw *r;
1049 1156
1050 /* clean up in reverse order from create 1157 /* clean up in reverse order from create
1051 * 1. recv mr memory (mr free, then kfree) 1158 * 1. recv mr memory (mr free, then kfree)
@@ -1065,11 +1172,19 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1065 } 1172 }
1066 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) { 1173 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1067 while (!list_empty(&buf->rb_mws)) { 1174 while (!list_empty(&buf->rb_mws)) {
1068 struct rpcrdma_mw *r;
1069 r = list_entry(buf->rb_mws.next, 1175 r = list_entry(buf->rb_mws.next,
1070 struct rpcrdma_mw, mw_list); 1176 struct rpcrdma_mw, mw_list);
1071 list_del(&r->mw_list); 1177 list_del(&r->mw_list);
1072 switch (ia->ri_memreg_strategy) { 1178 switch (ia->ri_memreg_strategy) {
1179 case RPCRDMA_FRMR:
1180 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1181 if (rc)
1182 dprintk("RPC: %s:"
1183 " ib_dereg_mr"
1184 " failed %i\n",
1185 __func__, rc);
1186 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1187 break;
1073 case RPCRDMA_MTHCAFMR: 1188 case RPCRDMA_MTHCAFMR:
1074 rc = ib_dealloc_fmr(r->r.fmr); 1189 rc = ib_dealloc_fmr(r->r.fmr);
1075 if (rc) 1190 if (rc)
@@ -1115,6 +1230,8 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1115{ 1230{
1116 struct rpcrdma_req *req; 1231 struct rpcrdma_req *req;
1117 unsigned long flags; 1232 unsigned long flags;
1233 int i;
1234 struct rpcrdma_mw *r;
1118 1235
1119 spin_lock_irqsave(&buffers->rb_lock, flags); 1236 spin_lock_irqsave(&buffers->rb_lock, flags);
1120 if (buffers->rb_send_index == buffers->rb_max_requests) { 1237 if (buffers->rb_send_index == buffers->rb_max_requests) {
@@ -1135,9 +1252,8 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1135 } 1252 }
1136 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL; 1253 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1137 if (!list_empty(&buffers->rb_mws)) { 1254 if (!list_empty(&buffers->rb_mws)) {
1138 int i = RPCRDMA_MAX_SEGS - 1; 1255 i = RPCRDMA_MAX_SEGS - 1;
1139 do { 1256 do {
1140 struct rpcrdma_mw *r;
1141 r = list_entry(buffers->rb_mws.next, 1257 r = list_entry(buffers->rb_mws.next,
1142 struct rpcrdma_mw, mw_list); 1258 struct rpcrdma_mw, mw_list);
1143 list_del(&r->mw_list); 1259 list_del(&r->mw_list);
@@ -1171,6 +1287,7 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
1171 req->rl_reply = NULL; 1287 req->rl_reply = NULL;
1172 } 1288 }
1173 switch (ia->ri_memreg_strategy) { 1289 switch (ia->ri_memreg_strategy) {
1290 case RPCRDMA_FRMR:
1174 case RPCRDMA_MTHCAFMR: 1291 case RPCRDMA_MTHCAFMR:
1175 case RPCRDMA_MEMWINDOWS_ASYNC: 1292 case RPCRDMA_MEMWINDOWS_ASYNC:
1176 case RPCRDMA_MEMWINDOWS: 1293 case RPCRDMA_MEMWINDOWS:
@@ -1252,7 +1369,11 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1252 va, len, DMA_BIDIRECTIONAL); 1369 va, len, DMA_BIDIRECTIONAL);
1253 iov->length = len; 1370 iov->length = len;
1254 1371
1255 if (ia->ri_bind_mem != NULL) { 1372 if (ia->ri_have_dma_lkey) {
1373 *mrp = NULL;
1374 iov->lkey = ia->ri_dma_lkey;
1375 return 0;
1376 } else if (ia->ri_bind_mem != NULL) {
1256 *mrp = NULL; 1377 *mrp = NULL;
1257 iov->lkey = ia->ri_bind_mem->lkey; 1378 iov->lkey = ia->ri_bind_mem->lkey;
1258 return 0; 1379 return 0;
@@ -1329,15 +1450,292 @@ rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1329 seg->mr_dma, seg->mr_dmalen, seg->mr_dir); 1450 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1330} 1451}
1331 1452
1453static int
1454rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1455 int *nsegs, int writing, struct rpcrdma_ia *ia,
1456 struct rpcrdma_xprt *r_xprt)
1457{
1458 struct rpcrdma_mr_seg *seg1 = seg;
1459 struct ib_send_wr frmr_wr, *bad_wr;
1460 u8 key;
1461 int len, pageoff;
1462 int i, rc;
1463
1464 pageoff = offset_in_page(seg1->mr_offset);
1465 seg1->mr_offset -= pageoff; /* start of page */
1466 seg1->mr_len += pageoff;
1467 len = -pageoff;
1468 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1469 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1470 for (i = 0; i < *nsegs;) {
1471 rpcrdma_map_one(ia, seg, writing);
1472 seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
1473 len += seg->mr_len;
1474 ++seg;
1475 ++i;
1476 /* Check for holes */
1477 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1478 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1479 break;
1480 }
1481 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1482 __func__, seg1->mr_chunk.rl_mw, i);
1483
1484 /* Bump the key */
1485 key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1486 ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1487
1488 /* Prepare FRMR WR */
1489 memset(&frmr_wr, 0, sizeof frmr_wr);
1490 frmr_wr.opcode = IB_WR_FAST_REG_MR;
1491 frmr_wr.send_flags = 0; /* unsignaled */
1492 frmr_wr.wr.fast_reg.iova_start = (unsigned long)seg1->mr_dma;
1493 frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
1494 frmr_wr.wr.fast_reg.page_list_len = i;
1495 frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1496 frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
1497 frmr_wr.wr.fast_reg.access_flags = (writing ?
1498 IB_ACCESS_REMOTE_WRITE : IB_ACCESS_REMOTE_READ);
1499 frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1500 DECR_CQCOUNT(&r_xprt->rx_ep);
1501
1502 rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr);
1503
1504 if (rc) {
1505 dprintk("RPC: %s: failed ib_post_send for register,"
1506 " status %i\n", __func__, rc);
1507 while (i--)
1508 rpcrdma_unmap_one(ia, --seg);
1509 } else {
1510 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1511 seg1->mr_base = seg1->mr_dma + pageoff;
1512 seg1->mr_nsegs = i;
1513 seg1->mr_len = len;
1514 }
1515 *nsegs = i;
1516 return rc;
1517}
1518
1519static int
1520rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1521 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1522{
1523 struct rpcrdma_mr_seg *seg1 = seg;
1524 struct ib_send_wr invalidate_wr, *bad_wr;
1525 int rc;
1526
1527 while (seg1->mr_nsegs--)
1528 rpcrdma_unmap_one(ia, seg++);
1529
1530 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1531 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1532 invalidate_wr.send_flags = 0; /* unsignaled */
1533 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1534 DECR_CQCOUNT(&r_xprt->rx_ep);
1535
1536 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1537 if (rc)
1538 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1539 " status %i\n", __func__, rc);
1540 return rc;
1541}
1542
1543static int
1544rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1545 int *nsegs, int writing, struct rpcrdma_ia *ia)
1546{
1547 struct rpcrdma_mr_seg *seg1 = seg;
1548 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1549 int len, pageoff, i, rc;
1550
1551 pageoff = offset_in_page(seg1->mr_offset);
1552 seg1->mr_offset -= pageoff; /* start of page */
1553 seg1->mr_len += pageoff;
1554 len = -pageoff;
1555 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1556 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1557 for (i = 0; i < *nsegs;) {
1558 rpcrdma_map_one(ia, seg, writing);
1559 physaddrs[i] = seg->mr_dma;
1560 len += seg->mr_len;
1561 ++seg;
1562 ++i;
1563 /* Check for holes */
1564 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1565 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1566 break;
1567 }
1568 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1569 physaddrs, i, seg1->mr_dma);
1570 if (rc) {
1571 dprintk("RPC: %s: failed ib_map_phys_fmr "
1572 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1573 len, (unsigned long long)seg1->mr_dma,
1574 pageoff, i, rc);
1575 while (i--)
1576 rpcrdma_unmap_one(ia, --seg);
1577 } else {
1578 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1579 seg1->mr_base = seg1->mr_dma + pageoff;
1580 seg1->mr_nsegs = i;
1581 seg1->mr_len = len;
1582 }
1583 *nsegs = i;
1584 return rc;
1585}
1586
1587static int
1588rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1589 struct rpcrdma_ia *ia)
1590{
1591 struct rpcrdma_mr_seg *seg1 = seg;
1592 LIST_HEAD(l);
1593 int rc;
1594
1595 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1596 rc = ib_unmap_fmr(&l);
1597 while (seg1->mr_nsegs--)
1598 rpcrdma_unmap_one(ia, seg++);
1599 if (rc)
1600 dprintk("RPC: %s: failed ib_unmap_fmr,"
1601 " status %i\n", __func__, rc);
1602 return rc;
1603}
1604
1605static int
1606rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
1607 int *nsegs, int writing, struct rpcrdma_ia *ia,
1608 struct rpcrdma_xprt *r_xprt)
1609{
1610 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1611 IB_ACCESS_REMOTE_READ);
1612 struct ib_mw_bind param;
1613 int rc;
1614
1615 *nsegs = 1;
1616 rpcrdma_map_one(ia, seg, writing);
1617 param.mr = ia->ri_bind_mem;
1618 param.wr_id = 0ULL; /* no send cookie */
1619 param.addr = seg->mr_dma;
1620 param.length = seg->mr_len;
1621 param.send_flags = 0;
1622 param.mw_access_flags = mem_priv;
1623
1624 DECR_CQCOUNT(&r_xprt->rx_ep);
1625 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1626 if (rc) {
1627 dprintk("RPC: %s: failed ib_bind_mw "
1628 "%u@0x%llx status %i\n",
1629 __func__, seg->mr_len,
1630 (unsigned long long)seg->mr_dma, rc);
1631 rpcrdma_unmap_one(ia, seg);
1632 } else {
1633 seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1634 seg->mr_base = param.addr;
1635 seg->mr_nsegs = 1;
1636 }
1637 return rc;
1638}
1639
1640static int
1641rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
1642 struct rpcrdma_ia *ia,
1643 struct rpcrdma_xprt *r_xprt, void **r)
1644{
1645 struct ib_mw_bind param;
1646 LIST_HEAD(l);
1647 int rc;
1648
1649 BUG_ON(seg->mr_nsegs != 1);
1650 param.mr = ia->ri_bind_mem;
1651 param.addr = 0ULL; /* unbind */
1652 param.length = 0;
1653 param.mw_access_flags = 0;
1654 if (*r) {
1655 param.wr_id = (u64) (unsigned long) *r;
1656 param.send_flags = IB_SEND_SIGNALED;
1657 INIT_CQCOUNT(&r_xprt->rx_ep);
1658 } else {
1659 param.wr_id = 0ULL;
1660 param.send_flags = 0;
1661 DECR_CQCOUNT(&r_xprt->rx_ep);
1662 }
1663 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1664 rpcrdma_unmap_one(ia, seg);
1665 if (rc)
1666 dprintk("RPC: %s: failed ib_(un)bind_mw,"
1667 " status %i\n", __func__, rc);
1668 else
1669 *r = NULL; /* will upcall on completion */
1670 return rc;
1671}
1672
1673static int
1674rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
1675 int *nsegs, int writing, struct rpcrdma_ia *ia)
1676{
1677 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1678 IB_ACCESS_REMOTE_READ);
1679 struct rpcrdma_mr_seg *seg1 = seg;
1680 struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1681 int len, i, rc = 0;
1682
1683 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1684 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1685 for (len = 0, i = 0; i < *nsegs;) {
1686 rpcrdma_map_one(ia, seg, writing);
1687 ipb[i].addr = seg->mr_dma;
1688 ipb[i].size = seg->mr_len;
1689 len += seg->mr_len;
1690 ++seg;
1691 ++i;
1692 /* Check for holes */
1693 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1694 offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1695 break;
1696 }
1697 seg1->mr_base = seg1->mr_dma;
1698 seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1699 ipb, i, mem_priv, &seg1->mr_base);
1700 if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1701 rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1702 dprintk("RPC: %s: failed ib_reg_phys_mr "
1703 "%u@0x%llx (%d)... status %i\n",
1704 __func__, len,
1705 (unsigned long long)seg1->mr_dma, i, rc);
1706 while (i--)
1707 rpcrdma_unmap_one(ia, --seg);
1708 } else {
1709 seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1710 seg1->mr_nsegs = i;
1711 seg1->mr_len = len;
1712 }
1713 *nsegs = i;
1714 return rc;
1715}
1716
1717static int
1718rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
1719 struct rpcrdma_ia *ia)
1720{
1721 struct rpcrdma_mr_seg *seg1 = seg;
1722 int rc;
1723
1724 rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1725 seg1->mr_chunk.rl_mr = NULL;
1726 while (seg1->mr_nsegs--)
1727 rpcrdma_unmap_one(ia, seg++);
1728 if (rc)
1729 dprintk("RPC: %s: failed ib_dereg_mr,"
1730 " status %i\n", __func__, rc);
1731 return rc;
1732}
1733
1332int 1734int
1333rpcrdma_register_external(struct rpcrdma_mr_seg *seg, 1735rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1334 int nsegs, int writing, struct rpcrdma_xprt *r_xprt) 1736 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1335{ 1737{
1336 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 1738 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1337 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1338 IB_ACCESS_REMOTE_READ);
1339 struct rpcrdma_mr_seg *seg1 = seg;
1340 int i;
1341 int rc = 0; 1739 int rc = 0;
1342 1740
1343 switch (ia->ri_memreg_strategy) { 1741 switch (ia->ri_memreg_strategy) {
@@ -1352,114 +1750,25 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1352 break; 1750 break;
1353#endif 1751#endif
1354 1752
1355 /* Registration using fast memory registration */ 1753 /* Registration using frmr registration */
1754 case RPCRDMA_FRMR:
1755 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1756 break;
1757
1758 /* Registration using fmr memory registration */
1356 case RPCRDMA_MTHCAFMR: 1759 case RPCRDMA_MTHCAFMR:
1357 { 1760 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
1358 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1359 int len, pageoff = offset_in_page(seg->mr_offset);
1360 seg1->mr_offset -= pageoff; /* start of page */
1361 seg1->mr_len += pageoff;
1362 len = -pageoff;
1363 if (nsegs > RPCRDMA_MAX_DATA_SEGS)
1364 nsegs = RPCRDMA_MAX_DATA_SEGS;
1365 for (i = 0; i < nsegs;) {
1366 rpcrdma_map_one(ia, seg, writing);
1367 physaddrs[i] = seg->mr_dma;
1368 len += seg->mr_len;
1369 ++seg;
1370 ++i;
1371 /* Check for holes */
1372 if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
1373 offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1374 break;
1375 }
1376 nsegs = i;
1377 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1378 physaddrs, nsegs, seg1->mr_dma);
1379 if (rc) {
1380 dprintk("RPC: %s: failed ib_map_phys_fmr "
1381 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1382 len, (unsigned long long)seg1->mr_dma,
1383 pageoff, nsegs, rc);
1384 while (nsegs--)
1385 rpcrdma_unmap_one(ia, --seg);
1386 } else {
1387 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1388 seg1->mr_base = seg1->mr_dma + pageoff;
1389 seg1->mr_nsegs = nsegs;
1390 seg1->mr_len = len;
1391 }
1392 }
1393 break; 1761 break;
1394 1762
1395 /* Registration using memory windows */ 1763 /* Registration using memory windows */
1396 case RPCRDMA_MEMWINDOWS_ASYNC: 1764 case RPCRDMA_MEMWINDOWS_ASYNC:
1397 case RPCRDMA_MEMWINDOWS: 1765 case RPCRDMA_MEMWINDOWS:
1398 { 1766 rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
1399 struct ib_mw_bind param;
1400 rpcrdma_map_one(ia, seg, writing);
1401 param.mr = ia->ri_bind_mem;
1402 param.wr_id = 0ULL; /* no send cookie */
1403 param.addr = seg->mr_dma;
1404 param.length = seg->mr_len;
1405 param.send_flags = 0;
1406 param.mw_access_flags = mem_priv;
1407
1408 DECR_CQCOUNT(&r_xprt->rx_ep);
1409 rc = ib_bind_mw(ia->ri_id->qp,
1410 seg->mr_chunk.rl_mw->r.mw, &param);
1411 if (rc) {
1412 dprintk("RPC: %s: failed ib_bind_mw "
1413 "%u@0x%llx status %i\n",
1414 __func__, seg->mr_len,
1415 (unsigned long long)seg->mr_dma, rc);
1416 rpcrdma_unmap_one(ia, seg);
1417 } else {
1418 seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1419 seg->mr_base = param.addr;
1420 seg->mr_nsegs = 1;
1421 nsegs = 1;
1422 }
1423 }
1424 break; 1767 break;
1425 1768
1426 /* Default registration each time */ 1769 /* Default registration each time */
1427 default: 1770 default:
1428 { 1771 rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
1429 struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1430 int len = 0;
1431 if (nsegs > RPCRDMA_MAX_DATA_SEGS)
1432 nsegs = RPCRDMA_MAX_DATA_SEGS;
1433 for (i = 0; i < nsegs;) {
1434 rpcrdma_map_one(ia, seg, writing);
1435 ipb[i].addr = seg->mr_dma;
1436 ipb[i].size = seg->mr_len;
1437 len += seg->mr_len;
1438 ++seg;
1439 ++i;
1440 /* Check for holes */
1441 if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
1442 offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1443 break;
1444 }
1445 nsegs = i;
1446 seg1->mr_base = seg1->mr_dma;
1447 seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1448 ipb, nsegs, mem_priv, &seg1->mr_base);
1449 if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1450 rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1451 dprintk("RPC: %s: failed ib_reg_phys_mr "
1452 "%u@0x%llx (%d)... status %i\n",
1453 __func__, len,
1454 (unsigned long long)seg1->mr_dma, nsegs, rc);
1455 while (nsegs--)
1456 rpcrdma_unmap_one(ia, --seg);
1457 } else {
1458 seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1459 seg1->mr_nsegs = nsegs;
1460 seg1->mr_len = len;
1461 }
1462 }
1463 break; 1772 break;
1464 } 1773 }
1465 if (rc) 1774 if (rc)
@@ -1473,7 +1782,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1473 struct rpcrdma_xprt *r_xprt, void *r) 1782 struct rpcrdma_xprt *r_xprt, void *r)
1474{ 1783{
1475 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 1784 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1476 struct rpcrdma_mr_seg *seg1 = seg;
1477 int nsegs = seg->mr_nsegs, rc; 1785 int nsegs = seg->mr_nsegs, rc;
1478 1786
1479 switch (ia->ri_memreg_strategy) { 1787 switch (ia->ri_memreg_strategy) {
@@ -1486,56 +1794,21 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1486 break; 1794 break;
1487#endif 1795#endif
1488 1796
1797 case RPCRDMA_FRMR:
1798 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1799 break;
1800
1489 case RPCRDMA_MTHCAFMR: 1801 case RPCRDMA_MTHCAFMR:
1490 { 1802 rc = rpcrdma_deregister_fmr_external(seg, ia);
1491 LIST_HEAD(l);
1492 list_add(&seg->mr_chunk.rl_mw->r.fmr->list, &l);
1493 rc = ib_unmap_fmr(&l);
1494 while (seg1->mr_nsegs--)
1495 rpcrdma_unmap_one(ia, seg++);
1496 }
1497 if (rc)
1498 dprintk("RPC: %s: failed ib_unmap_fmr,"
1499 " status %i\n", __func__, rc);
1500 break; 1803 break;
1501 1804
1502 case RPCRDMA_MEMWINDOWS_ASYNC: 1805 case RPCRDMA_MEMWINDOWS_ASYNC:
1503 case RPCRDMA_MEMWINDOWS: 1806 case RPCRDMA_MEMWINDOWS:
1504 { 1807 rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
1505 struct ib_mw_bind param;
1506 BUG_ON(nsegs != 1);
1507 param.mr = ia->ri_bind_mem;
1508 param.addr = 0ULL; /* unbind */
1509 param.length = 0;
1510 param.mw_access_flags = 0;
1511 if (r) {
1512 param.wr_id = (u64) (unsigned long) r;
1513 param.send_flags = IB_SEND_SIGNALED;
1514 INIT_CQCOUNT(&r_xprt->rx_ep);
1515 } else {
1516 param.wr_id = 0ULL;
1517 param.send_flags = 0;
1518 DECR_CQCOUNT(&r_xprt->rx_ep);
1519 }
1520 rc = ib_bind_mw(ia->ri_id->qp,
1521 seg->mr_chunk.rl_mw->r.mw, &param);
1522 rpcrdma_unmap_one(ia, seg);
1523 }
1524 if (rc)
1525 dprintk("RPC: %s: failed ib_(un)bind_mw,"
1526 " status %i\n", __func__, rc);
1527 else
1528 r = NULL; /* will upcall on completion */
1529 break; 1808 break;
1530 1809
1531 default: 1810 default:
1532 rc = ib_dereg_mr(seg1->mr_chunk.rl_mr); 1811 rc = rpcrdma_deregister_default_external(seg, ia);
1533 seg1->mr_chunk.rl_mr = NULL;
1534 while (seg1->mr_nsegs--)
1535 rpcrdma_unmap_one(ia, seg++);
1536 if (rc)
1537 dprintk("RPC: %s: failed ib_dereg_mr,"
1538 " status %i\n", __func__, rc);
1539 break; 1812 break;
1540 } 1813 }
1541 if (r) { 1814 if (r) {
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 2427822f8bd4..c7a7eba991bc 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -51,6 +51,9 @@
51#include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */ 51#include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */
52#include <linux/sunrpc/xprtrdma.h> /* xprt parameters */ 52#include <linux/sunrpc/xprtrdma.h> /* xprt parameters */
53 53
54#define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */
55#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */
56
54/* 57/*
55 * Interface Adapter -- one per transport instance 58 * Interface Adapter -- one per transport instance
56 */ 59 */
@@ -58,6 +61,8 @@ struct rpcrdma_ia {
58 struct rdma_cm_id *ri_id; 61 struct rdma_cm_id *ri_id;
59 struct ib_pd *ri_pd; 62 struct ib_pd *ri_pd;
60 struct ib_mr *ri_bind_mem; 63 struct ib_mr *ri_bind_mem;
64 u32 ri_dma_lkey;
65 int ri_have_dma_lkey;
61 struct completion ri_done; 66 struct completion ri_done;
62 int ri_async_rc; 67 int ri_async_rc;
63 enum rpcrdma_memreg ri_memreg_strategy; 68 enum rpcrdma_memreg ri_memreg_strategy;
@@ -156,6 +161,10 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
156 union { 161 union {
157 struct ib_mw *mw; 162 struct ib_mw *mw;
158 struct ib_fmr *fmr; 163 struct ib_fmr *fmr;
164 struct {
165 struct ib_fast_reg_page_list *fr_pgl;
166 struct ib_mr *fr_mr;
167 } frmr;
159 } r; 168 } r;
160 struct list_head mw_list; 169 struct list_head mw_list;
161 } *rl_mw; 170 } *rl_mw;
@@ -175,6 +184,7 @@ struct rpcrdma_req {
175 size_t rl_size; /* actual length of buffer */ 184 size_t rl_size; /* actual length of buffer */
176 unsigned int rl_niovs; /* 0, 2 or 4 */ 185 unsigned int rl_niovs; /* 0, 2 or 4 */
177 unsigned int rl_nchunks; /* non-zero if chunks */ 186 unsigned int rl_nchunks; /* non-zero if chunks */
187 unsigned int rl_connect_cookie; /* retry detection */
178 struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ 188 struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
179 struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ 189 struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
180 struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */ 190 struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */
@@ -198,7 +208,7 @@ struct rpcrdma_buffer {
198 atomic_t rb_credits; /* most recent server credits */ 208 atomic_t rb_credits; /* most recent server credits */
199 unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */ 209 unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */
200 int rb_max_requests;/* client max requests */ 210 int rb_max_requests;/* client max requests */
201 struct list_head rb_mws; /* optional memory windows/fmrs */ 211 struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */
202 int rb_send_index; 212 int rb_send_index;
203 struct rpcrdma_req **rb_send_bufs; 213 struct rpcrdma_req **rb_send_bufs;
204 int rb_recv_index; 214 int rb_recv_index;
@@ -273,6 +283,11 @@ struct rpcrdma_xprt {
273#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt) 283#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt)
274#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data) 284#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)
275 285
286/* Setting this to 0 ensures interoperability with early servers.
287 * Setting this to 1 enhances certain unaligned read/write performance.
288 * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
289extern int xprt_rdma_pad_optimize;
290
276/* 291/*
277 * Interface Adapter calls - xprtrdma/verbs.c 292 * Interface Adapter calls - xprtrdma/verbs.c
278 */ 293 */