aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-02-12 13:39:41 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2015-02-12 13:39:41 -0500
commit61845143febe6b88349acad4732adc54894009a3 (patch)
treebed6a23efe05b9867b8b4d1f4d251fc1c562e7e4
parenta26be149facb22d30cd92cadb26f651d6fe802c9 (diff)
parentc23ae6017835b5bc9b9ec9d5d9c2b1523053f503 (diff)
Merge branch 'for-3.20' of git://linux-nfs.org/~bfields/linux
Pull nfsd updates from Bruce Fields: "The main change is the pNFS block server support from Christoph, which allows an NFS client connected to shared disk to do block IO to the shared disk in place of NFS reads and writes. This also requires xfs patches, which should arrive soon through the xfs tree, barring unexpected problems. Support for other filesystems is also possible if there's interest. Thanks also to Chuck Lever for continuing work to get NFS/RDMA into shape" * 'for-3.20' of git://linux-nfs.org/~bfields/linux: (32 commits) nfsd: default NFSv4.2 to on nfsd: pNFS block layout driver exportfs: add methods for block layout exports nfsd: add trace events nfsd: update documentation for pNFS support nfsd: implement pNFS layout recalls nfsd: implement pNFS operations nfsd: make find_any_file available outside nfs4state.c nfsd: make find/get/put file available outside nfs4state.c nfsd: make lookup/alloc/unhash_stid available outside nfs4state.c nfsd: add fh_fsid_match helper nfsd: move nfsd_fh_match to nfsfh.h fs: add FL_LAYOUT lease type fs: track fl_owner for leases nfs: add LAYOUT_TYPE_MAX enum value nfsd: factor out a helper to decode nfstime4 values sunrpc/lockd: fix references to the BKL nfsd: fix year-2038 nfs4 state problem svcrdma: Handle additional inline content svcrdma: Move read list XDR round-up logic ...
-rw-r--r--Documentation/filesystems/nfs/nfs41-server.txt23
-rw-r--r--Documentation/filesystems/nfs/pnfs-block-server.txt37
-rw-r--r--fs/lockd/svclock.c4
-rw-r--r--fs/lockd/xdr.c8
-rw-r--r--fs/locks.c26
-rw-r--r--fs/nfsd/Kconfig10
-rw-r--r--fs/nfsd/Makefile8
-rw-r--r--fs/nfsd/blocklayout.c189
-rw-r--r--fs/nfsd/blocklayoutxdr.c157
-rw-r--r--fs/nfsd/blocklayoutxdr.h62
-rw-r--r--fs/nfsd/export.c8
-rw-r--r--fs/nfsd/export.h2
-rw-r--r--fs/nfsd/nfs4callback.c99
-rw-r--r--fs/nfsd/nfs4layouts.c721
-rw-r--r--fs/nfsd/nfs4proc.c310
-rw-r--r--fs/nfsd/nfs4state.c76
-rw-r--r--fs/nfsd/nfs4xdr.c362
-rw-r--r--fs/nfsd/nfsctl.c9
-rw-r--r--fs/nfsd/nfsd.h16
-rw-r--r--fs/nfsd/nfsfh.h18
-rw-r--r--fs/nfsd/nfssvc.c1
-rw-r--r--fs/nfsd/pnfs.h81
-rw-r--r--fs/nfsd/state.h43
-rw-r--r--fs/nfsd/trace.c5
-rw-r--r--fs/nfsd/trace.h54
-rw-r--r--fs/nfsd/xdr4.h59
-rw-r--r--fs/nfsd/xdr4cb.h7
-rw-r--r--include/linux/exportfs.h23
-rw-r--r--include/linux/fs.h16
-rw-r--r--include/linux/nfs4.h2
-rw-r--r--include/linux/sunrpc/svc.h2
-rw-r--r--include/linux/sunrpc/svc_rdma.h13
-rw-r--r--include/uapi/linux/nfsd/debug.h1
-rw-r--r--include/uapi/linux/nfsd/export.h4
-rw-r--r--net/sunrpc/svc.c4
-rw-r--r--net/sunrpc/svc_xprt.c3
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_marshal.c16
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c244
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c46
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c47
40 files changed, 2562 insertions, 254 deletions
diff --git a/Documentation/filesystems/nfs/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt
index c49cd7e796e7..682a59fabe3f 100644
--- a/Documentation/filesystems/nfs/nfs41-server.txt
+++ b/Documentation/filesystems/nfs/nfs41-server.txt
@@ -24,11 +24,6 @@ focuses on the mandatory-to-implement NFSv4.1 Sessions, providing
24"exactly once" semantics and better control and throttling of the 24"exactly once" semantics and better control and throttling of the
25resources allocated for each client. 25resources allocated for each client.
26 26
27Other NFSv4.1 features, Parallel NFS operations in particular,
28are still under development out of tree.
29See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design
30for more information.
31
32The table below, taken from the NFSv4.1 document, lists 27The table below, taken from the NFSv4.1 document, lists
33the operations that are mandatory to implement (REQ), optional 28the operations that are mandatory to implement (REQ), optional
34(OPT), and NFSv4.0 operations that are required not to implement (MNI) 29(OPT), and NFSv4.0 operations that are required not to implement (MNI)
@@ -43,9 +38,7 @@ The OPTIONAL features identified and their abbreviations are as follows:
43The following abbreviations indicate the linux server implementation status. 38The following abbreviations indicate the linux server implementation status.
44 I Implemented NFSv4.1 operations. 39 I Implemented NFSv4.1 operations.
45 NS Not Supported. 40 NS Not Supported.
46 NS* unimplemented optional feature. 41 NS* Unimplemented optional feature.
47 P pNFS features implemented out of tree.
48 PNS pNFS features that are not supported yet (out of tree).
49 42
50Operations 43Operations
51 44
@@ -70,13 +63,13 @@ I | DESTROY_SESSION | REQ | | Section 18.37 |
70I | EXCHANGE_ID | REQ | | Section 18.35 | 63I | EXCHANGE_ID | REQ | | Section 18.35 |
71I | FREE_STATEID | REQ | | Section 18.38 | 64I | FREE_STATEID | REQ | | Section 18.38 |
72 | GETATTR | REQ | | Section 18.7 | 65 | GETATTR | REQ | | Section 18.7 |
73P | GETDEVICEINFO | OPT | pNFS (REQ) | Section 18.40 | 66I | GETDEVICEINFO | OPT | pNFS (REQ) | Section 18.40 |
74P | GETDEVICELIST | OPT | pNFS (OPT) | Section 18.41 | 67NS*| GETDEVICELIST | OPT | pNFS (OPT) | Section 18.41 |
75 | GETFH | REQ | | Section 18.8 | 68 | GETFH | REQ | | Section 18.8 |
76NS*| GET_DIR_DELEGATION | OPT | DDELG (REQ) | Section 18.39 | 69NS*| GET_DIR_DELEGATION | OPT | DDELG (REQ) | Section 18.39 |
77P | LAYOUTCOMMIT | OPT | pNFS (REQ) | Section 18.42 | 70I | LAYOUTCOMMIT | OPT | pNFS (REQ) | Section 18.42 |
78P | LAYOUTGET | OPT | pNFS (REQ) | Section 18.43 | 71I | LAYOUTGET | OPT | pNFS (REQ) | Section 18.43 |
79P | LAYOUTRETURN | OPT | pNFS (REQ) | Section 18.44 | 72I | LAYOUTRETURN | OPT | pNFS (REQ) | Section 18.44 |
80 | LINK | OPT | | Section 18.9 | 73 | LINK | OPT | | Section 18.9 |
81 | LOCK | REQ | | Section 18.10 | 74 | LOCK | REQ | | Section 18.10 |
82 | LOCKT | REQ | | Section 18.11 | 75 | LOCKT | REQ | | Section 18.11 |
@@ -122,9 +115,9 @@ Callback Operations
122 | | MNI | or OPT) | | 115 | | MNI | or OPT) | |
123 +-------------------------+-----------+-------------+---------------+ 116 +-------------------------+-----------+-------------+---------------+
124 | CB_GETATTR | OPT | FDELG (REQ) | Section 20.1 | 117 | CB_GETATTR | OPT | FDELG (REQ) | Section 20.1 |
125P | CB_LAYOUTRECALL | OPT | pNFS (REQ) | Section 20.3 | 118I | CB_LAYOUTRECALL | OPT | pNFS (REQ) | Section 20.3 |
126NS*| CB_NOTIFY | OPT | DDELG (REQ) | Section 20.4 | 119NS*| CB_NOTIFY | OPT | DDELG (REQ) | Section 20.4 |
127P | CB_NOTIFY_DEVICEID | OPT | pNFS (OPT) | Section 20.12 | 120NS*| CB_NOTIFY_DEVICEID | OPT | pNFS (OPT) | Section 20.12 |
128NS*| CB_NOTIFY_LOCK | OPT | | Section 20.11 | 121NS*| CB_NOTIFY_LOCK | OPT | | Section 20.11 |
129NS*| CB_PUSH_DELEG | OPT | FDELG (OPT) | Section 20.5 | 122NS*| CB_PUSH_DELEG | OPT | FDELG (OPT) | Section 20.5 |
130 | CB_RECALL | OPT | FDELG, | Section 20.2 | 123 | CB_RECALL | OPT | FDELG, | Section 20.2 |
diff --git a/Documentation/filesystems/nfs/pnfs-block-server.txt b/Documentation/filesystems/nfs/pnfs-block-server.txt
new file mode 100644
index 000000000000..2143673cf154
--- /dev/null
+++ b/Documentation/filesystems/nfs/pnfs-block-server.txt
@@ -0,0 +1,37 @@
1pNFS block layout server user guide
2
3The Linux NFS server now supports the pNFS block layout extension. In this
4case the NFS server acts as Metadata Server (MDS) for pNFS, which in addition
5to handling all the metadata access to the NFS export also hands out layouts
6to the clients to directly access the underlying block devices that are
7shared with the client.
8
9To use pNFS block layouts with the Linux NFS server the exported file
10system needs to support the pNFS block layouts (currently just XFS), and the
11file system must sit on shared storage (typically iSCSI) that is accessible
12to the clients in addition to the MDS. As of now the file system needs to
13sit directly on the exported volume, striping or concatenation of
14volumes on the MDS and clients is not supported yet.
15
16On the server, pNFS block volume support is automatically enabled if the file
17system supports it. On the client make sure the kernel has the CONFIG_PNFS_BLOCK
18option enabled, the blkmapd daemon from nfs-utils is running, and the
19file system is mounted using the NFSv4.1 protocol version (mount -o vers=4.1).
20
21If the nfsd server needs to fence a non-responding client it calls
22/sbin/nfsd-recall-failed with the first argument set to the IP address of
23the client, and the second argument set to the device node without the /dev
24prefix for the file system to be fenced. Below is an example file that shows
25how to translate the device into a serial number from SCSI EVPD 0x80:
26
27cat > /sbin/nfsd-recall-failed << EOF
28#!/bin/sh
29
30CLIENT="$1"
31DEV="/dev/$2"
32EVPD=`sg_inq --page=0x80 ${DEV} | \
33 grep "Unit serial number:" | \
34 awk -F ': ' '{print $2}'`
35
36echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log
37EOF
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 56598742dde4..5581e020644b 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -57,8 +57,8 @@ static DEFINE_SPINLOCK(nlm_blocked_lock);
57static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) 57static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
58{ 58{
59 /* 59 /*
60 * We can get away with a static buffer because we're only 60 * We can get away with a static buffer because this is only called
61 * called with BKL held. 61 * from lockd, which is single-threaded.
62 */ 62 */
63 static char buf[2*NLM_MAXCOOKIELEN+1]; 63 static char buf[2*NLM_MAXCOOKIELEN+1];
64 unsigned int i, len = sizeof(buf); 64 unsigned int i, len = sizeof(buf);
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 9340e7e10ef6..5b651daad518 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -95,14 +95,6 @@ nlm_decode_fh(__be32 *p, struct nfs_fh *f)
95 return p + XDR_QUADLEN(NFS2_FHSIZE); 95 return p + XDR_QUADLEN(NFS2_FHSIZE);
96} 96}
97 97
98static inline __be32 *
99nlm_encode_fh(__be32 *p, struct nfs_fh *f)
100{
101 *p++ = htonl(NFS2_FHSIZE);
102 memcpy(p, f->data, NFS2_FHSIZE);
103 return p + XDR_QUADLEN(NFS2_FHSIZE);
104}
105
106/* 98/*
107 * Encode and decode owner handle 99 * Encode and decode owner handle
108 */ 100 */
diff --git a/fs/locks.c b/fs/locks.c
index 4d0d41163a50..4753218f308e 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -137,7 +137,7 @@
137 137
138#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) 138#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
139#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) 139#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
140#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG)) 140#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
141#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) 141#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK)
142 142
143static bool lease_breaking(struct file_lock *fl) 143static bool lease_breaking(struct file_lock *fl)
@@ -1371,6 +1371,8 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
1371 1371
1372static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker) 1372static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
1373{ 1373{
1374 if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT))
1375 return false;
1374 if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE)) 1376 if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE))
1375 return false; 1377 return false;
1376 return locks_conflict(breaker, lease); 1378 return locks_conflict(breaker, lease);
@@ -1594,11 +1596,14 @@ int fcntl_getlease(struct file *filp)
1594 * conflict with the lease we're trying to set. 1596 * conflict with the lease we're trying to set.
1595 */ 1597 */
1596static int 1598static int
1597check_conflicting_open(const struct dentry *dentry, const long arg) 1599check_conflicting_open(const struct dentry *dentry, const long arg, int flags)
1598{ 1600{
1599 int ret = 0; 1601 int ret = 0;
1600 struct inode *inode = dentry->d_inode; 1602 struct inode *inode = dentry->d_inode;
1601 1603
1604 if (flags & FL_LAYOUT)
1605 return 0;
1606
1602 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) 1607 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
1603 return -EAGAIN; 1608 return -EAGAIN;
1604 1609
@@ -1647,7 +1652,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
1647 1652
1648 spin_lock(&ctx->flc_lock); 1653 spin_lock(&ctx->flc_lock);
1649 time_out_leases(inode, &dispose); 1654 time_out_leases(inode, &dispose);
1650 error = check_conflicting_open(dentry, arg); 1655 error = check_conflicting_open(dentry, arg, lease->fl_flags);
1651 if (error) 1656 if (error)
1652 goto out; 1657 goto out;
1653 1658
@@ -1661,7 +1666,8 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
1661 */ 1666 */
1662 error = -EAGAIN; 1667 error = -EAGAIN;
1663 list_for_each_entry(fl, &ctx->flc_lease, fl_list) { 1668 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1664 if (fl->fl_file == filp) { 1669 if (fl->fl_file == filp &&
1670 fl->fl_owner == lease->fl_owner) {
1665 my_fl = fl; 1671 my_fl = fl;
1666 continue; 1672 continue;
1667 } 1673 }
@@ -1702,7 +1708,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
1702 * precedes these checks. 1708 * precedes these checks.
1703 */ 1709 */
1704 smp_mb(); 1710 smp_mb();
1705 error = check_conflicting_open(dentry, arg); 1711 error = check_conflicting_open(dentry, arg, lease->fl_flags);
1706 if (error) { 1712 if (error) {
1707 locks_unlink_lock_ctx(lease, &ctx->flc_lease_cnt); 1713 locks_unlink_lock_ctx(lease, &ctx->flc_lease_cnt);
1708 goto out; 1714 goto out;
@@ -1721,7 +1727,7 @@ out:
1721 return error; 1727 return error;
1722} 1728}
1723 1729
1724static int generic_delete_lease(struct file *filp) 1730static int generic_delete_lease(struct file *filp, void *owner)
1725{ 1731{
1726 int error = -EAGAIN; 1732 int error = -EAGAIN;
1727 struct file_lock *fl, *victim = NULL; 1733 struct file_lock *fl, *victim = NULL;
@@ -1737,7 +1743,8 @@ static int generic_delete_lease(struct file *filp)
1737 1743
1738 spin_lock(&ctx->flc_lock); 1744 spin_lock(&ctx->flc_lock);
1739 list_for_each_entry(fl, &ctx->flc_lease, fl_list) { 1745 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1740 if (fl->fl_file == filp) { 1746 if (fl->fl_file == filp &&
1747 fl->fl_owner == owner) {
1741 victim = fl; 1748 victim = fl;
1742 break; 1749 break;
1743 } 1750 }
@@ -1778,13 +1785,14 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
1778 1785
1779 switch (arg) { 1786 switch (arg) {
1780 case F_UNLCK: 1787 case F_UNLCK:
1781 return generic_delete_lease(filp); 1788 return generic_delete_lease(filp, *priv);
1782 case F_RDLCK: 1789 case F_RDLCK:
1783 case F_WRLCK: 1790 case F_WRLCK:
1784 if (!(*flp)->fl_lmops->lm_break) { 1791 if (!(*flp)->fl_lmops->lm_break) {
1785 WARN_ON_ONCE(1); 1792 WARN_ON_ONCE(1);
1786 return -ENOLCK; 1793 return -ENOLCK;
1787 } 1794 }
1795
1788 return generic_add_lease(filp, arg, flp, priv); 1796 return generic_add_lease(filp, arg, flp, priv);
1789 default: 1797 default:
1790 return -EINVAL; 1798 return -EINVAL;
@@ -1857,7 +1865,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
1857int fcntl_setlease(unsigned int fd, struct file *filp, long arg) 1865int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
1858{ 1866{
1859 if (arg == F_UNLCK) 1867 if (arg == F_UNLCK)
1860 return vfs_setlease(filp, F_UNLCK, NULL, NULL); 1868 return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
1861 return do_fcntl_add_lease(fd, filp, arg); 1869 return do_fcntl_add_lease(fd, filp, arg);
1862} 1870}
1863 1871
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 73395156bdb4..683bf718aead 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -82,6 +82,16 @@ config NFSD_V4
82 82
83 If unsure, say N. 83 If unsure, say N.
84 84
85config NFSD_PNFS
86 bool "NFSv4.1 server support for Parallel NFS (pNFS)"
87 depends on NFSD_V4
88 help
89 This option enables support for the parallel NFS features of the
90 minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS
91 server.
92
93 If unsure, say N.
94
85config NFSD_V4_SECURITY_LABEL 95config NFSD_V4_SECURITY_LABEL
86 bool "Provide Security Label support for NFSv4 server" 96 bool "Provide Security Label support for NFSv4 server"
87 depends on NFSD_V4 && SECURITY 97 depends on NFSD_V4 && SECURITY
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index af32ef06b4fe..9a6028e120c6 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -2,9 +2,14 @@
2# Makefile for the Linux nfs server 2# Makefile for the Linux nfs server
3# 3#
4 4
5ccflags-y += -I$(src) # needed for trace events
6
5obj-$(CONFIG_NFSD) += nfsd.o 7obj-$(CONFIG_NFSD) += nfsd.o
6 8
7nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ 9# this one should be compiled first, as the tracing macros can easily blow up
10nfsd-y += trace.o
11
12nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
8 export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o 13 export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
9nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o 14nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o
10nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o 15nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
@@ -12,3 +17,4 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
12nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o 17nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
13nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ 18nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
14 nfs4acl.o nfs4callback.o nfs4recover.o 19 nfs4acl.o nfs4callback.o nfs4recover.o
20nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
new file mode 100644
index 000000000000..cdbc78c72542
--- /dev/null
+++ b/fs/nfsd/blocklayout.c
@@ -0,0 +1,189 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/exportfs.h>
5#include <linux/genhd.h>
6#include <linux/slab.h>
7
8#include <linux/nfsd/debug.h>
9
10#include "blocklayoutxdr.h"
11#include "pnfs.h"
12
13#define NFSDDBG_FACILITY NFSDDBG_PNFS
14
15
16static int
17nfsd4_block_get_device_info_simple(struct super_block *sb,
18 struct nfsd4_getdeviceinfo *gdp)
19{
20 struct pnfs_block_deviceaddr *dev;
21 struct pnfs_block_volume *b;
22
23 dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
24 sizeof(struct pnfs_block_volume), GFP_KERNEL);
25 if (!dev)
26 return -ENOMEM;
27 gdp->gd_device = dev;
28
29 dev->nr_volumes = 1;
30 b = &dev->volumes[0];
31
32 b->type = PNFS_BLOCK_VOLUME_SIMPLE;
33 b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
34 return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
35 &b->simple.offset);
36}
37
38static __be32
39nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
40 struct nfsd4_getdeviceinfo *gdp)
41{
42 if (sb->s_bdev != sb->s_bdev->bd_contains)
43 return nfserr_inval;
44 return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
45}
46
47static __be32
48nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
49 struct nfsd4_layoutget *args)
50{
51 struct nfsd4_layout_seg *seg = &args->lg_seg;
52 struct super_block *sb = inode->i_sb;
53 u32 block_size = (1 << inode->i_blkbits);
54 struct pnfs_block_extent *bex;
55 struct iomap iomap;
56 u32 device_generation = 0;
57 int error;
58
59 /*
60 * We do not attempt to support I/O smaller than the fs block size,
61 * or not aligned to it.
62 */
63 if (args->lg_minlength < block_size) {
64 dprintk("pnfsd: I/O too small\n");
65 goto out_layoutunavailable;
66 }
67 if (seg->offset & (block_size - 1)) {
68 dprintk("pnfsd: I/O misaligned\n");
69 goto out_layoutunavailable;
70 }
71
72 /*
73 * Some clients barf on non-zero block numbers for NONE or INVALID
74 * layouts, so make sure to zero the whole structure.
75 */
76 error = -ENOMEM;
77 bex = kzalloc(sizeof(*bex), GFP_KERNEL);
78 if (!bex)
79 goto out_error;
80 args->lg_content = bex;
81
82 error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length,
83 &iomap, seg->iomode != IOMODE_READ,
84 &device_generation);
85 if (error) {
86 if (error == -ENXIO)
87 goto out_layoutunavailable;
88 goto out_error;
89 }
90
91 if (iomap.length < args->lg_minlength) {
92 dprintk("pnfsd: extent smaller than minlength\n");
93 goto out_layoutunavailable;
94 }
95
96 switch (iomap.type) {
97 case IOMAP_MAPPED:
98 if (seg->iomode == IOMODE_READ)
99 bex->es = PNFS_BLOCK_READ_DATA;
100 else
101 bex->es = PNFS_BLOCK_READWRITE_DATA;
102 bex->soff = (iomap.blkno << 9);
103 break;
104 case IOMAP_UNWRITTEN:
105 if (seg->iomode & IOMODE_RW) {
106 /*
107 * Crack monkey special case from section 2.3.1.
108 */
109 if (args->lg_minlength == 0) {
110 dprintk("pnfsd: no soup for you!\n");
111 goto out_layoutunavailable;
112 }
113
114 bex->es = PNFS_BLOCK_INVALID_DATA;
115 bex->soff = (iomap.blkno << 9);
116 break;
117 }
118 /*FALLTHRU*/
119 case IOMAP_HOLE:
120 if (seg->iomode == IOMODE_READ) {
121 bex->es = PNFS_BLOCK_NONE_DATA;
122 break;
123 }
124 /*FALLTHRU*/
125 case IOMAP_DELALLOC:
126 default:
127 WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type);
128 goto out_layoutunavailable;
129 }
130
131 error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation);
132 if (error)
133 goto out_error;
134 bex->foff = iomap.offset;
135 bex->len = iomap.length;
136
137 seg->offset = iomap.offset;
138 seg->length = iomap.length;
139
140 dprintk("GET: %lld:%lld %d\n", bex->foff, bex->len, bex->es);
141 return 0;
142
143out_error:
144 seg->length = 0;
145 return nfserrno(error);
146out_layoutunavailable:
147 seg->length = 0;
148 return nfserr_layoutunavailable;
149}
150
151static __be32
152nfsd4_block_proc_layoutcommit(struct inode *inode,
153 struct nfsd4_layoutcommit *lcp)
154{
155 loff_t new_size = lcp->lc_last_wr + 1;
156 struct iattr iattr = { .ia_valid = 0 };
157 struct iomap *iomaps;
158 int nr_iomaps;
159 int error;
160
161 nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
162 lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
163 if (nr_iomaps < 0)
164 return nfserrno(nr_iomaps);
165
166 if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
167 timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
168 lcp->lc_mtime = current_fs_time(inode->i_sb);
169 iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME;
170 iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime;
171
172 if (new_size > i_size_read(inode)) {
173 iattr.ia_valid |= ATTR_SIZE;
174 iattr.ia_size = new_size;
175 }
176
177 error = inode->i_sb->s_export_op->commit_blocks(inode, iomaps,
178 nr_iomaps, &iattr);
179 kfree(iomaps);
180 return nfserrno(error);
181}
182
183const struct nfsd4_layout_ops bl_layout_ops = {
184 .proc_getdeviceinfo = nfsd4_block_proc_getdeviceinfo,
185 .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo,
186 .proc_layoutget = nfsd4_block_proc_layoutget,
187 .encode_layoutget = nfsd4_block_encode_layoutget,
188 .proc_layoutcommit = nfsd4_block_proc_layoutcommit,
189};
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
new file mode 100644
index 000000000000..9da89fddab33
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -0,0 +1,157 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/sunrpc/svc.h>
5#include <linux/exportfs.h>
6#include <linux/nfs4.h>
7
8#include "nfsd.h"
9#include "blocklayoutxdr.h"
10
11#define NFSDDBG_FACILITY NFSDDBG_PNFS
12
13
14__be32
15nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
16 struct nfsd4_layoutget *lgp)
17{
18 struct pnfs_block_extent *b = lgp->lg_content;
19 int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
20 __be32 *p;
21
22 p = xdr_reserve_space(xdr, sizeof(__be32) + len);
23 if (!p)
24 return nfserr_toosmall;
25
26 *p++ = cpu_to_be32(len);
27 *p++ = cpu_to_be32(1); /* we always return a single extent */
28
29 p = xdr_encode_opaque_fixed(p, &b->vol_id,
30 sizeof(struct nfsd4_deviceid));
31 p = xdr_encode_hyper(p, b->foff);
32 p = xdr_encode_hyper(p, b->len);
33 p = xdr_encode_hyper(p, b->soff);
34 *p++ = cpu_to_be32(b->es);
35 return 0;
36}
37
38static int
39nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
40{
41 __be32 *p;
42 int len;
43
44 switch (b->type) {
45 case PNFS_BLOCK_VOLUME_SIMPLE:
46 len = 4 + 4 + 8 + 4 + b->simple.sig_len;
47 p = xdr_reserve_space(xdr, len);
48 if (!p)
49 return -ETOOSMALL;
50
51 *p++ = cpu_to_be32(b->type);
52 *p++ = cpu_to_be32(1); /* single signature */
53 p = xdr_encode_hyper(p, b->simple.offset);
54 p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
55 break;
56 default:
57 return -ENOTSUPP;
58 }
59
60 return len;
61}
62
63__be32
64nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
65 struct nfsd4_getdeviceinfo *gdp)
66{
67 struct pnfs_block_deviceaddr *dev = gdp->gd_device;
68 int len = sizeof(__be32), ret, i;
69 __be32 *p;
70
71 p = xdr_reserve_space(xdr, len + sizeof(__be32));
72 if (!p)
73 return nfserr_resource;
74
75 for (i = 0; i < dev->nr_volumes; i++) {
76 ret = nfsd4_block_encode_volume(xdr, &dev->volumes[i]);
77 if (ret < 0)
78 return nfserrno(ret);
79 len += ret;
80 }
81
82 /*
83 * Fill in the overall length and number of volumes at the beginning
84 * of the layout.
85 */
86 *p++ = cpu_to_be32(len);
87 *p++ = cpu_to_be32(dev->nr_volumes);
88 return 0;
89}
90
91int
92nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
93 u32 block_size)
94{
95 struct iomap *iomaps;
96 u32 nr_iomaps, expected, i;
97
98 if (len < sizeof(u32)) {
99 dprintk("%s: extent array too small: %u\n", __func__, len);
100 return -EINVAL;
101 }
102
103 nr_iomaps = be32_to_cpup(p++);
104 expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE;
105 if (len != expected) {
106 dprintk("%s: extent array size mismatch: %u/%u\n",
107 __func__, len, expected);
108 return -EINVAL;
109 }
110
111 iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
112 if (!iomaps) {
113 dprintk("%s: failed to allocate extent array\n", __func__);
114 return -ENOMEM;
115 }
116
117 for (i = 0; i < nr_iomaps; i++) {
118 struct pnfs_block_extent bex;
119
120 memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid));
121 p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid));
122
123 p = xdr_decode_hyper(p, &bex.foff);
124 if (bex.foff & (block_size - 1)) {
125 dprintk("%s: unaligned offset %lld\n",
126 __func__, bex.foff);
127 goto fail;
128 }
129 p = xdr_decode_hyper(p, &bex.len);
130 if (bex.len & (block_size - 1)) {
131 dprintk("%s: unaligned length %lld\n",
132 __func__, bex.foff);
133 goto fail;
134 }
135 p = xdr_decode_hyper(p, &bex.soff);
136 if (bex.soff & (block_size - 1)) {
137 dprintk("%s: unaligned disk offset %lld\n",
138 __func__, bex.soff);
139 goto fail;
140 }
141 bex.es = be32_to_cpup(p++);
142 if (bex.es != PNFS_BLOCK_READWRITE_DATA) {
143 dprintk("%s: incorrect extent state %d\n",
144 __func__, bex.es);
145 goto fail;
146 }
147
148 iomaps[i].offset = bex.foff;
149 iomaps[i].length = bex.len;
150 }
151
152 *iomapp = iomaps;
153 return nr_iomaps;
154fail:
155 kfree(iomaps);
156 return -EINVAL;
157}
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
new file mode 100644
index 000000000000..fdc79037c0e7
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -0,0 +1,62 @@
1#ifndef _NFSD_BLOCKLAYOUTXDR_H
2#define _NFSD_BLOCKLAYOUTXDR_H 1
3
4#include <linux/blkdev.h>
5#include "xdr4.h"
6
7struct iomap;
8struct xdr_stream;
9
10enum pnfs_block_extent_state {
11 PNFS_BLOCK_READWRITE_DATA = 0,
12 PNFS_BLOCK_READ_DATA = 1,
13 PNFS_BLOCK_INVALID_DATA = 2,
14 PNFS_BLOCK_NONE_DATA = 3,
15};
16
17struct pnfs_block_extent {
18 struct nfsd4_deviceid vol_id;
19 u64 foff;
20 u64 len;
21 u64 soff;
22 enum pnfs_block_extent_state es;
23};
24#define NFS4_BLOCK_EXTENT_SIZE 44
25
26enum pnfs_block_volume_type {
27 PNFS_BLOCK_VOLUME_SIMPLE = 0,
28 PNFS_BLOCK_VOLUME_SLICE = 1,
29 PNFS_BLOCK_VOLUME_CONCAT = 2,
30 PNFS_BLOCK_VOLUME_STRIPE = 3,
31};
32
33/*
34 * Random upper cap for the uuid length to avoid unbounded allocation.
35 * Not actually limited by the protocol.
36 */
37#define PNFS_BLOCK_UUID_LEN 128
38
39struct pnfs_block_volume {
40 enum pnfs_block_volume_type type;
41 union {
42 struct {
43 u64 offset;
44 u32 sig_len;
45 u8 sig[PNFS_BLOCK_UUID_LEN];
46 } simple;
47 };
48};
49
50struct pnfs_block_deviceaddr {
51 u32 nr_volumes;
52 struct pnfs_block_volume volumes[];
53};
54
55__be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
56 struct nfsd4_getdeviceinfo *gdp);
57__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
58 struct nfsd4_layoutget *lgp);
59int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
60 u32 block_size);
61
62#endif /* _NFSD_BLOCKLAYOUTXDR_H */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 30a739d896ff..c3e3b6e55ae2 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -20,6 +20,7 @@
20#include "nfsd.h" 20#include "nfsd.h"
21#include "nfsfh.h" 21#include "nfsfh.h"
22#include "netns.h" 22#include "netns.h"
23#include "pnfs.h"
23 24
24#define NFSDDBG_FACILITY NFSDDBG_EXPORT 25#define NFSDDBG_FACILITY NFSDDBG_EXPORT
25 26
@@ -545,6 +546,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
545 546
546 exp.ex_client = dom; 547 exp.ex_client = dom;
547 exp.cd = cd; 548 exp.cd = cd;
549 exp.ex_devid_map = NULL;
548 550
549 /* expiry */ 551 /* expiry */
550 err = -EINVAL; 552 err = -EINVAL;
@@ -621,6 +623,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
621 if (!gid_valid(exp.ex_anon_gid)) 623 if (!gid_valid(exp.ex_anon_gid))
622 goto out4; 624 goto out4;
623 err = 0; 625 err = 0;
626
627 nfsd4_setup_layout_type(&exp);
624 } 628 }
625 629
626 expp = svc_export_lookup(&exp); 630 expp = svc_export_lookup(&exp);
@@ -703,6 +707,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
703 new->ex_fslocs.locations = NULL; 707 new->ex_fslocs.locations = NULL;
704 new->ex_fslocs.locations_count = 0; 708 new->ex_fslocs.locations_count = 0;
705 new->ex_fslocs.migrated = 0; 709 new->ex_fslocs.migrated = 0;
710 new->ex_layout_type = 0;
706 new->ex_uuid = NULL; 711 new->ex_uuid = NULL;
707 new->cd = item->cd; 712 new->cd = item->cd;
708} 713}
@@ -717,6 +722,8 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
717 new->ex_anon_uid = item->ex_anon_uid; 722 new->ex_anon_uid = item->ex_anon_uid;
718 new->ex_anon_gid = item->ex_anon_gid; 723 new->ex_anon_gid = item->ex_anon_gid;
719 new->ex_fsid = item->ex_fsid; 724 new->ex_fsid = item->ex_fsid;
725 new->ex_devid_map = item->ex_devid_map;
726 item->ex_devid_map = NULL;
720 new->ex_uuid = item->ex_uuid; 727 new->ex_uuid = item->ex_uuid;
721 item->ex_uuid = NULL; 728 item->ex_uuid = NULL;
722 new->ex_fslocs.locations = item->ex_fslocs.locations; 729 new->ex_fslocs.locations = item->ex_fslocs.locations;
@@ -725,6 +732,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
725 item->ex_fslocs.locations_count = 0; 732 item->ex_fslocs.locations_count = 0;
726 new->ex_fslocs.migrated = item->ex_fslocs.migrated; 733 new->ex_fslocs.migrated = item->ex_fslocs.migrated;
727 item->ex_fslocs.migrated = 0; 734 item->ex_fslocs.migrated = 0;
735 new->ex_layout_type = item->ex_layout_type;
728 new->ex_nflavors = item->ex_nflavors; 736 new->ex_nflavors = item->ex_nflavors;
729 for (i = 0; i < MAX_SECINFO_LIST; i++) { 737 for (i = 0; i < MAX_SECINFO_LIST; i++) {
730 new->ex_flavors[i] = item->ex_flavors[i]; 738 new->ex_flavors[i] = item->ex_flavors[i];
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 04dc8c167b0c..1f52bfcc436f 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -56,6 +56,8 @@ struct svc_export {
56 struct nfsd4_fs_locations ex_fslocs; 56 struct nfsd4_fs_locations ex_fslocs;
57 uint32_t ex_nflavors; 57 uint32_t ex_nflavors;
58 struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST]; 58 struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST];
59 enum pnfs_layouttype ex_layout_type;
60 struct nfsd4_deviceid_map *ex_devid_map;
59 struct cache_detail *cd; 61 struct cache_detail *cd;
60}; 62};
61 63
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7cbdf1b2e4ab..58277859a467 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -546,6 +546,102 @@ out:
546 return status; 546 return status;
547} 547}
548 548
549#ifdef CONFIG_NFSD_PNFS
550/*
551 * CB_LAYOUTRECALL4args
552 *
553 * struct layoutrecall_file4 {
554 * nfs_fh4 lor_fh;
555 * offset4 lor_offset;
556 * length4 lor_length;
557 * stateid4 lor_stateid;
558 * };
559 *
560 * union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) {
561 * case LAYOUTRECALL4_FILE:
562 * layoutrecall_file4 lor_layout;
563 * case LAYOUTRECALL4_FSID:
564 * fsid4 lor_fsid;
565 * case LAYOUTRECALL4_ALL:
566 * void;
567 * };
568 *
569 * struct CB_LAYOUTRECALL4args {
570 * layouttype4 clora_type;
571 * layoutiomode4 clora_iomode;
572 * bool clora_changed;
573 * layoutrecall4 clora_recall;
574 * };
575 */
576static void encode_cb_layout4args(struct xdr_stream *xdr,
577 const struct nfs4_layout_stateid *ls,
578 struct nfs4_cb_compound_hdr *hdr)
579{
580 __be32 *p;
581
582 BUG_ON(hdr->minorversion == 0);
583
584 p = xdr_reserve_space(xdr, 5 * 4);
585 *p++ = cpu_to_be32(OP_CB_LAYOUTRECALL);
586 *p++ = cpu_to_be32(ls->ls_layout_type);
587 *p++ = cpu_to_be32(IOMODE_ANY);
588 *p++ = cpu_to_be32(1);
589 *p = cpu_to_be32(RETURN_FILE);
590
591 encode_nfs_fh4(xdr, &ls->ls_stid.sc_file->fi_fhandle);
592
593 p = xdr_reserve_space(xdr, 2 * 8);
594 p = xdr_encode_hyper(p, 0);
595 xdr_encode_hyper(p, NFS4_MAX_UINT64);
596
597 encode_stateid4(xdr, &ls->ls_recall_sid);
598
599 hdr->nops++;
600}
601
602static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req,
603 struct xdr_stream *xdr,
604 const struct nfsd4_callback *cb)
605{
606 const struct nfs4_layout_stateid *ls =
607 container_of(cb, struct nfs4_layout_stateid, ls_recall);
608 struct nfs4_cb_compound_hdr hdr = {
609 .ident = 0,
610 .minorversion = cb->cb_minorversion,
611 };
612
613 encode_cb_compound4args(xdr, &hdr);
614 encode_cb_sequence4args(xdr, cb, &hdr);
615 encode_cb_layout4args(xdr, ls, &hdr);
616 encode_cb_nops(&hdr);
617}
618
619static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp,
620 struct xdr_stream *xdr,
621 struct nfsd4_callback *cb)
622{
623 struct nfs4_cb_compound_hdr hdr;
624 enum nfsstat4 nfserr;
625 int status;
626
627 status = decode_cb_compound4res(xdr, &hdr);
628 if (unlikely(status))
629 goto out;
630 if (cb) {
631 status = decode_cb_sequence4res(xdr, cb);
632 if (unlikely(status))
633 goto out;
634 }
635 status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr);
636 if (unlikely(status))
637 goto out;
638 if (unlikely(nfserr != NFS4_OK))
639 status = nfs_cb_stat_to_errno(nfserr);
640out:
641 return status;
642}
643#endif /* CONFIG_NFSD_PNFS */
644
549/* 645/*
550 * RPC procedure tables 646 * RPC procedure tables
551 */ 647 */
@@ -563,6 +659,9 @@ out:
563static struct rpc_procinfo nfs4_cb_procedures[] = { 659static struct rpc_procinfo nfs4_cb_procedures[] = {
564 PROC(CB_NULL, NULL, cb_null, cb_null), 660 PROC(CB_NULL, NULL, cb_null, cb_null),
565 PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall), 661 PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall),
662#ifdef CONFIG_NFSD_PNFS
663 PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout),
664#endif
566}; 665};
567 666
568static struct rpc_version nfs_cb_version4 = { 667static struct rpc_version nfs_cb_version4 = {
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
new file mode 100644
index 000000000000..3c1bfa155571
--- /dev/null
+++ b/fs/nfsd/nfs4layouts.c
@@ -0,0 +1,721 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/kmod.h>
5#include <linux/file.h>
6#include <linux/jhash.h>
7#include <linux/sched.h>
8#include <linux/sunrpc/addr.h>
9
10#include "pnfs.h"
11#include "netns.h"
12#include "trace.h"
13
14#define NFSDDBG_FACILITY NFSDDBG_PNFS
15
16struct nfs4_layout {
17 struct list_head lo_perstate;
18 struct nfs4_layout_stateid *lo_state;
19 struct nfsd4_layout_seg lo_seg;
20};
21
22static struct kmem_cache *nfs4_layout_cache;
23static struct kmem_cache *nfs4_layout_stateid_cache;
24
25static struct nfsd4_callback_ops nfsd4_cb_layout_ops;
26static const struct lock_manager_operations nfsd4_layouts_lm_ops;
27
28const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
29 [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops,
30};
31
32/* pNFS device ID to export fsid mapping */
33#define DEVID_HASH_BITS 8
34#define DEVID_HASH_SIZE (1 << DEVID_HASH_BITS)
35#define DEVID_HASH_MASK (DEVID_HASH_SIZE - 1)
36static u64 nfsd_devid_seq = 1;
37static struct list_head nfsd_devid_hash[DEVID_HASH_SIZE];
38static DEFINE_SPINLOCK(nfsd_devid_lock);
39
40static inline u32 devid_hashfn(u64 idx)
41{
42 return jhash_2words(idx, idx >> 32, 0) & DEVID_HASH_MASK;
43}
44
45static void
46nfsd4_alloc_devid_map(const struct svc_fh *fhp)
47{
48 const struct knfsd_fh *fh = &fhp->fh_handle;
49 size_t fsid_len = key_len(fh->fh_fsid_type);
50 struct nfsd4_deviceid_map *map, *old;
51 int i;
52
53 map = kzalloc(sizeof(*map) + fsid_len, GFP_KERNEL);
54 if (!map)
55 return;
56
57 map->fsid_type = fh->fh_fsid_type;
58 memcpy(&map->fsid, fh->fh_fsid, fsid_len);
59
60 spin_lock(&nfsd_devid_lock);
61 if (fhp->fh_export->ex_devid_map)
62 goto out_unlock;
63
64 for (i = 0; i < DEVID_HASH_SIZE; i++) {
65 list_for_each_entry(old, &nfsd_devid_hash[i], hash) {
66 if (old->fsid_type != fh->fh_fsid_type)
67 continue;
68 if (memcmp(old->fsid, fh->fh_fsid,
69 key_len(old->fsid_type)))
70 continue;
71
72 fhp->fh_export->ex_devid_map = old;
73 goto out_unlock;
74 }
75 }
76
77 map->idx = nfsd_devid_seq++;
78 list_add_tail_rcu(&map->hash, &nfsd_devid_hash[devid_hashfn(map->idx)]);
79 fhp->fh_export->ex_devid_map = map;
80 map = NULL;
81
82out_unlock:
83 spin_unlock(&nfsd_devid_lock);
84 kfree(map);
85}
86
87struct nfsd4_deviceid_map *
88nfsd4_find_devid_map(int idx)
89{
90 struct nfsd4_deviceid_map *map, *ret = NULL;
91
92 rcu_read_lock();
93 list_for_each_entry_rcu(map, &nfsd_devid_hash[devid_hashfn(idx)], hash)
94 if (map->idx == idx)
95 ret = map;
96 rcu_read_unlock();
97
98 return ret;
99}
100
101int
102nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
103 u32 device_generation)
104{
105 if (!fhp->fh_export->ex_devid_map) {
106 nfsd4_alloc_devid_map(fhp);
107 if (!fhp->fh_export->ex_devid_map)
108 return -ENOMEM;
109 }
110
111 id->fsid_idx = fhp->fh_export->ex_devid_map->idx;
112 id->generation = device_generation;
113 id->pad = 0;
114 return 0;
115}
116
117void nfsd4_setup_layout_type(struct svc_export *exp)
118{
119 struct super_block *sb = exp->ex_path.mnt->mnt_sb;
120
121 if (exp->ex_flags & NFSEXP_NOPNFS)
122 return;
123
124 if (sb->s_export_op->get_uuid &&
125 sb->s_export_op->map_blocks &&
126 sb->s_export_op->commit_blocks)
127 exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
128}
129
130static void
131nfsd4_free_layout_stateid(struct nfs4_stid *stid)
132{
133 struct nfs4_layout_stateid *ls = layoutstateid(stid);
134 struct nfs4_client *clp = ls->ls_stid.sc_client;
135 struct nfs4_file *fp = ls->ls_stid.sc_file;
136
137 trace_layoutstate_free(&ls->ls_stid.sc_stateid);
138
139 spin_lock(&clp->cl_lock);
140 list_del_init(&ls->ls_perclnt);
141 spin_unlock(&clp->cl_lock);
142
143 spin_lock(&fp->fi_lock);
144 list_del_init(&ls->ls_perfile);
145 spin_unlock(&fp->fi_lock);
146
147 vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls);
148 fput(ls->ls_file);
149
150 if (ls->ls_recalled)
151 atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls);
152
153 kmem_cache_free(nfs4_layout_stateid_cache, ls);
154}
155
156static int
157nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
158{
159 struct file_lock *fl;
160 int status;
161
162 fl = locks_alloc_lock();
163 if (!fl)
164 return -ENOMEM;
165 locks_init_lock(fl);
166 fl->fl_lmops = &nfsd4_layouts_lm_ops;
167 fl->fl_flags = FL_LAYOUT;
168 fl->fl_type = F_RDLCK;
169 fl->fl_end = OFFSET_MAX;
170 fl->fl_owner = ls;
171 fl->fl_pid = current->tgid;
172 fl->fl_file = ls->ls_file;
173
174 status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL);
175 if (status) {
176 locks_free_lock(fl);
177 return status;
178 }
179 BUG_ON(fl != NULL);
180 return 0;
181}
182
183static struct nfs4_layout_stateid *
184nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
185 struct nfs4_stid *parent, u32 layout_type)
186{
187 struct nfs4_client *clp = cstate->clp;
188 struct nfs4_file *fp = parent->sc_file;
189 struct nfs4_layout_stateid *ls;
190 struct nfs4_stid *stp;
191
192 stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache);
193 if (!stp)
194 return NULL;
195 stp->sc_free = nfsd4_free_layout_stateid;
196 get_nfs4_file(fp);
197 stp->sc_file = fp;
198
199 ls = layoutstateid(stp);
200 INIT_LIST_HEAD(&ls->ls_perclnt);
201 INIT_LIST_HEAD(&ls->ls_perfile);
202 spin_lock_init(&ls->ls_lock);
203 INIT_LIST_HEAD(&ls->ls_layouts);
204 ls->ls_layout_type = layout_type;
205 nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
206 NFSPROC4_CLNT_CB_LAYOUT);
207
208 if (parent->sc_type == NFS4_DELEG_STID)
209 ls->ls_file = get_file(fp->fi_deleg_file);
210 else
211 ls->ls_file = find_any_file(fp);
212 BUG_ON(!ls->ls_file);
213
214 if (nfsd4_layout_setlease(ls)) {
215 put_nfs4_file(fp);
216 kmem_cache_free(nfs4_layout_stateid_cache, ls);
217 return NULL;
218 }
219
220 spin_lock(&clp->cl_lock);
221 stp->sc_type = NFS4_LAYOUT_STID;
222 list_add(&ls->ls_perclnt, &clp->cl_lo_states);
223 spin_unlock(&clp->cl_lock);
224
225 spin_lock(&fp->fi_lock);
226 list_add(&ls->ls_perfile, &fp->fi_lo_states);
227 spin_unlock(&fp->fi_lock);
228
229 trace_layoutstate_alloc(&ls->ls_stid.sc_stateid);
230 return ls;
231}
232
233__be32
234nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
235 struct nfsd4_compound_state *cstate, stateid_t *stateid,
236 bool create, u32 layout_type, struct nfs4_layout_stateid **lsp)
237{
238 struct nfs4_layout_stateid *ls;
239 struct nfs4_stid *stid;
240 unsigned char typemask = NFS4_LAYOUT_STID;
241 __be32 status;
242
243 if (create)
244 typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID);
245
246 status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid,
247 net_generic(SVC_NET(rqstp), nfsd_net_id));
248 if (status)
249 goto out;
250
251 if (!fh_match(&cstate->current_fh.fh_handle,
252 &stid->sc_file->fi_fhandle)) {
253 status = nfserr_bad_stateid;
254 goto out_put_stid;
255 }
256
257 if (stid->sc_type != NFS4_LAYOUT_STID) {
258 ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type);
259 nfs4_put_stid(stid);
260
261 status = nfserr_jukebox;
262 if (!ls)
263 goto out;
264 } else {
265 ls = container_of(stid, struct nfs4_layout_stateid, ls_stid);
266
267 status = nfserr_bad_stateid;
268 if (stateid->si_generation > stid->sc_stateid.si_generation)
269 goto out_put_stid;
270 if (layout_type != ls->ls_layout_type)
271 goto out_put_stid;
272 }
273
274 *lsp = ls;
275 return 0;
276
277out_put_stid:
278 nfs4_put_stid(stid);
279out:
280 return status;
281}
282
283static void
284nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
285{
286 spin_lock(&ls->ls_lock);
287 if (ls->ls_recalled)
288 goto out_unlock;
289
290 ls->ls_recalled = true;
291 atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls);
292 if (list_empty(&ls->ls_layouts))
293 goto out_unlock;
294
295 trace_layout_recall(&ls->ls_stid.sc_stateid);
296
297 atomic_inc(&ls->ls_stid.sc_count);
298 update_stateid(&ls->ls_stid.sc_stateid);
299 memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
300 nfsd4_run_cb(&ls->ls_recall);
301
302out_unlock:
303 spin_unlock(&ls->ls_lock);
304}
305
306static inline u64
307layout_end(struct nfsd4_layout_seg *seg)
308{
309 u64 end = seg->offset + seg->length;
310 return end >= seg->offset ? end : NFS4_MAX_UINT64;
311}
312
313static void
314layout_update_len(struct nfsd4_layout_seg *lo, u64 end)
315{
316 if (end == NFS4_MAX_UINT64)
317 lo->length = NFS4_MAX_UINT64;
318 else
319 lo->length = end - lo->offset;
320}
321
322static bool
323layouts_overlapping(struct nfs4_layout *lo, struct nfsd4_layout_seg *s)
324{
325 if (s->iomode != IOMODE_ANY && s->iomode != lo->lo_seg.iomode)
326 return false;
327 if (layout_end(&lo->lo_seg) <= s->offset)
328 return false;
329 if (layout_end(s) <= lo->lo_seg.offset)
330 return false;
331 return true;
332}
333
334static bool
335layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new)
336{
337 if (lo->iomode != new->iomode)
338 return false;
339 if (layout_end(new) < lo->offset)
340 return false;
341 if (layout_end(lo) < new->offset)
342 return false;
343
344 lo->offset = min(lo->offset, new->offset);
345 layout_update_len(lo, max(layout_end(lo), layout_end(new)));
346 return true;
347}
348
349static __be32
350nfsd4_recall_conflict(struct nfs4_layout_stateid *ls)
351{
352 struct nfs4_file *fp = ls->ls_stid.sc_file;
353 struct nfs4_layout_stateid *l, *n;
354 __be32 nfserr = nfs_ok;
355
356 assert_spin_locked(&fp->fi_lock);
357
358 list_for_each_entry_safe(l, n, &fp->fi_lo_states, ls_perfile) {
359 if (l != ls) {
360 nfsd4_recall_file_layout(l);
361 nfserr = nfserr_recallconflict;
362 }
363 }
364
365 return nfserr;
366}
367
368__be32
369nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
370{
371 struct nfsd4_layout_seg *seg = &lgp->lg_seg;
372 struct nfs4_file *fp = ls->ls_stid.sc_file;
373 struct nfs4_layout *lp, *new = NULL;
374 __be32 nfserr;
375
376 spin_lock(&fp->fi_lock);
377 nfserr = nfsd4_recall_conflict(ls);
378 if (nfserr)
379 goto out;
380 spin_lock(&ls->ls_lock);
381 list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
382 if (layouts_try_merge(&lp->lo_seg, seg))
383 goto done;
384 }
385 spin_unlock(&ls->ls_lock);
386 spin_unlock(&fp->fi_lock);
387
388 new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL);
389 if (!new)
390 return nfserr_jukebox;
391 memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg));
392 new->lo_state = ls;
393
394 spin_lock(&fp->fi_lock);
395 nfserr = nfsd4_recall_conflict(ls);
396 if (nfserr)
397 goto out;
398 spin_lock(&ls->ls_lock);
399 list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
400 if (layouts_try_merge(&lp->lo_seg, seg))
401 goto done;
402 }
403
404 atomic_inc(&ls->ls_stid.sc_count);
405 list_add_tail(&new->lo_perstate, &ls->ls_layouts);
406 new = NULL;
407done:
408 update_stateid(&ls->ls_stid.sc_stateid);
409 memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
410 spin_unlock(&ls->ls_lock);
411out:
412 spin_unlock(&fp->fi_lock);
413 if (new)
414 kmem_cache_free(nfs4_layout_cache, new);
415 return nfserr;
416}
417
418static void
419nfsd4_free_layouts(struct list_head *reaplist)
420{
421 while (!list_empty(reaplist)) {
422 struct nfs4_layout *lp = list_first_entry(reaplist,
423 struct nfs4_layout, lo_perstate);
424
425 list_del(&lp->lo_perstate);
426 nfs4_put_stid(&lp->lo_state->ls_stid);
427 kmem_cache_free(nfs4_layout_cache, lp);
428 }
429}
430
431static void
432nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg,
433 struct list_head *reaplist)
434{
435 struct nfsd4_layout_seg *lo = &lp->lo_seg;
436 u64 end = layout_end(lo);
437
438 if (seg->offset <= lo->offset) {
439 if (layout_end(seg) >= end) {
440 list_move_tail(&lp->lo_perstate, reaplist);
441 return;
442 }
443 end = seg->offset;
444 } else {
445 /* retain the whole layout segment on a split. */
446 if (layout_end(seg) < end) {
447 dprintk("%s: split not supported\n", __func__);
448 return;
449 }
450
451 lo->offset = layout_end(seg);
452 }
453
454 layout_update_len(lo, end);
455}
456
457__be32
458nfsd4_return_file_layouts(struct svc_rqst *rqstp,
459 struct nfsd4_compound_state *cstate,
460 struct nfsd4_layoutreturn *lrp)
461{
462 struct nfs4_layout_stateid *ls;
463 struct nfs4_layout *lp, *n;
464 LIST_HEAD(reaplist);
465 __be32 nfserr;
466 int found = 0;
467
468 nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lrp->lr_sid,
469 false, lrp->lr_layout_type,
470 &ls);
471 if (nfserr) {
472 trace_layout_return_lookup_fail(&lrp->lr_sid);
473 return nfserr;
474 }
475
476 spin_lock(&ls->ls_lock);
477 list_for_each_entry_safe(lp, n, &ls->ls_layouts, lo_perstate) {
478 if (layouts_overlapping(lp, &lrp->lr_seg)) {
479 nfsd4_return_file_layout(lp, &lrp->lr_seg, &reaplist);
480 found++;
481 }
482 }
483 if (!list_empty(&ls->ls_layouts)) {
484 if (found) {
485 update_stateid(&ls->ls_stid.sc_stateid);
486 memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid,
487 sizeof(stateid_t));
488 }
489 lrp->lrs_present = 1;
490 } else {
491 trace_layoutstate_unhash(&ls->ls_stid.sc_stateid);
492 nfs4_unhash_stid(&ls->ls_stid);
493 lrp->lrs_present = 0;
494 }
495 spin_unlock(&ls->ls_lock);
496
497 nfs4_put_stid(&ls->ls_stid);
498 nfsd4_free_layouts(&reaplist);
499 return nfs_ok;
500}
501
502__be32
503nfsd4_return_client_layouts(struct svc_rqst *rqstp,
504 struct nfsd4_compound_state *cstate,
505 struct nfsd4_layoutreturn *lrp)
506{
507 struct nfs4_layout_stateid *ls, *n;
508 struct nfs4_client *clp = cstate->clp;
509 struct nfs4_layout *lp, *t;
510 LIST_HEAD(reaplist);
511
512 lrp->lrs_present = 0;
513
514 spin_lock(&clp->cl_lock);
515 list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) {
516 if (lrp->lr_return_type == RETURN_FSID &&
517 !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle,
518 &cstate->current_fh.fh_handle))
519 continue;
520
521 spin_lock(&ls->ls_lock);
522 list_for_each_entry_safe(lp, t, &ls->ls_layouts, lo_perstate) {
523 if (lrp->lr_seg.iomode == IOMODE_ANY ||
524 lrp->lr_seg.iomode == lp->lo_seg.iomode)
525 list_move_tail(&lp->lo_perstate, &reaplist);
526 }
527 spin_unlock(&ls->ls_lock);
528 }
529 spin_unlock(&clp->cl_lock);
530
531 nfsd4_free_layouts(&reaplist);
532 return 0;
533}
534
535static void
536nfsd4_return_all_layouts(struct nfs4_layout_stateid *ls,
537 struct list_head *reaplist)
538{
539 spin_lock(&ls->ls_lock);
540 list_splice_init(&ls->ls_layouts, reaplist);
541 spin_unlock(&ls->ls_lock);
542}
543
544void
545nfsd4_return_all_client_layouts(struct nfs4_client *clp)
546{
547 struct nfs4_layout_stateid *ls, *n;
548 LIST_HEAD(reaplist);
549
550 spin_lock(&clp->cl_lock);
551 list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt)
552 nfsd4_return_all_layouts(ls, &reaplist);
553 spin_unlock(&clp->cl_lock);
554
555 nfsd4_free_layouts(&reaplist);
556}
557
558void
559nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp)
560{
561 struct nfs4_layout_stateid *ls, *n;
562 LIST_HEAD(reaplist);
563
564 spin_lock(&fp->fi_lock);
565 list_for_each_entry_safe(ls, n, &fp->fi_lo_states, ls_perfile) {
566 if (ls->ls_stid.sc_client == clp)
567 nfsd4_return_all_layouts(ls, &reaplist);
568 }
569 spin_unlock(&fp->fi_lock);
570
571 nfsd4_free_layouts(&reaplist);
572}
573
574static void
575nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
576{
577 struct nfs4_client *clp = ls->ls_stid.sc_client;
578 char addr_str[INET6_ADDRSTRLEN];
579 static char *envp[] = {
580 "HOME=/",
581 "TERM=linux",
582 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
583 NULL
584 };
585 char *argv[8];
586 int error;
587
588 rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
589
590 nfsd4_cb_layout_fail(ls);
591
592 printk(KERN_WARNING
593 "nfsd: client %s failed to respond to layout recall. "
594 " Fencing..\n", addr_str);
595
596 argv[0] = "/sbin/nfsd-recall-failed";
597 argv[1] = addr_str;
598 argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id;
599 argv[3] = NULL;
600
601 error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
602 if (error) {
603 printk(KERN_ERR "nfsd: fence failed for client %s: %d!\n",
604 addr_str, error);
605 }
606}
607
608static int
609nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
610{
611 struct nfs4_layout_stateid *ls =
612 container_of(cb, struct nfs4_layout_stateid, ls_recall);
613 LIST_HEAD(reaplist);
614
615 switch (task->tk_status) {
616 case 0:
617 return 1;
618 case -NFS4ERR_NOMATCHING_LAYOUT:
619 trace_layout_recall_done(&ls->ls_stid.sc_stateid);
620 task->tk_status = 0;
621 return 1;
622 case -NFS4ERR_DELAY:
623 /* Poll the client until it's done with the layout */
624 /* FIXME: cap number of retries.
625 * The pnfs standard states that we need to only expire
626 * the client after at-least "lease time" .eg lease-time * 2
627 * when failing to communicate a recall
628 */
629 rpc_delay(task, HZ/100); /* 10 mili-seconds */
630 return 0;
631 default:
632 /*
633 * Unknown error or non-responding client, we'll need to fence.
634 */
635 nfsd4_cb_layout_fail(ls);
636 return -1;
637 }
638}
639
640static void
641nfsd4_cb_layout_release(struct nfsd4_callback *cb)
642{
643 struct nfs4_layout_stateid *ls =
644 container_of(cb, struct nfs4_layout_stateid, ls_recall);
645 LIST_HEAD(reaplist);
646
647 trace_layout_recall_release(&ls->ls_stid.sc_stateid);
648
649 nfsd4_return_all_layouts(ls, &reaplist);
650 nfsd4_free_layouts(&reaplist);
651 nfs4_put_stid(&ls->ls_stid);
652}
653
654static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
655 .done = nfsd4_cb_layout_done,
656 .release = nfsd4_cb_layout_release,
657};
658
659static bool
660nfsd4_layout_lm_break(struct file_lock *fl)
661{
662 /*
663 * We don't want the locks code to timeout the lease for us;
664 * we'll remove it ourself if a layout isn't returned
665 * in time:
666 */
667 fl->fl_break_time = 0;
668 nfsd4_recall_file_layout(fl->fl_owner);
669 return false;
670}
671
672static int
673nfsd4_layout_lm_change(struct file_lock *onlist, int arg,
674 struct list_head *dispose)
675{
676 BUG_ON(!(arg & F_UNLCK));
677 return lease_modify(onlist, arg, dispose);
678}
679
680static const struct lock_manager_operations nfsd4_layouts_lm_ops = {
681 .lm_break = nfsd4_layout_lm_break,
682 .lm_change = nfsd4_layout_lm_change,
683};
684
685int
686nfsd4_init_pnfs(void)
687{
688 int i;
689
690 for (i = 0; i < DEVID_HASH_SIZE; i++)
691 INIT_LIST_HEAD(&nfsd_devid_hash[i]);
692
693 nfs4_layout_cache = kmem_cache_create("nfs4_layout",
694 sizeof(struct nfs4_layout), 0, 0, NULL);
695 if (!nfs4_layout_cache)
696 return -ENOMEM;
697
698 nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid",
699 sizeof(struct nfs4_layout_stateid), 0, 0, NULL);
700 if (!nfs4_layout_stateid_cache) {
701 kmem_cache_destroy(nfs4_layout_cache);
702 return -ENOMEM;
703 }
704 return 0;
705}
706
707void
708nfsd4_exit_pnfs(void)
709{
710 int i;
711
712 kmem_cache_destroy(nfs4_layout_cache);
713 kmem_cache_destroy(nfs4_layout_stateid_cache);
714
715 for (i = 0; i < DEVID_HASH_SIZE; i++) {
716 struct nfsd4_deviceid_map *map, *n;
717
718 list_for_each_entry_safe(map, n, &nfsd_devid_hash[i], hash)
719 kfree(map);
720 }
721}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index ac71d13c69ef..d30bea8d0277 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -43,6 +43,8 @@
43#include "current_stateid.h" 43#include "current_stateid.h"
44#include "netns.h" 44#include "netns.h"
45#include "acl.h" 45#include "acl.h"
46#include "pnfs.h"
47#include "trace.h"
46 48
47#ifdef CONFIG_NFSD_V4_SECURITY_LABEL 49#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
48#include <linux/security.h> 50#include <linux/security.h>
@@ -1178,6 +1180,259 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1178 return status == nfserr_same ? nfs_ok : status; 1180 return status == nfserr_same ? nfs_ok : status;
1179} 1181}
1180 1182
1183#ifdef CONFIG_NFSD_PNFS
1184static const struct nfsd4_layout_ops *
1185nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
1186{
1187 if (!exp->ex_layout_type) {
1188 dprintk("%s: export does not support pNFS\n", __func__);
1189 return NULL;
1190 }
1191
1192 if (exp->ex_layout_type != layout_type) {
1193 dprintk("%s: layout type %d not supported\n",
1194 __func__, layout_type);
1195 return NULL;
1196 }
1197
1198 return nfsd4_layout_ops[layout_type];
1199}
1200
1201static __be32
1202nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
1203 struct nfsd4_compound_state *cstate,
1204 struct nfsd4_getdeviceinfo *gdp)
1205{
1206 const struct nfsd4_layout_ops *ops;
1207 struct nfsd4_deviceid_map *map;
1208 struct svc_export *exp;
1209 __be32 nfserr;
1210
1211 dprintk("%s: layout_type %u dev_id [0x%llx:0x%x] maxcnt %u\n",
1212 __func__,
1213 gdp->gd_layout_type,
1214 gdp->gd_devid.fsid_idx, gdp->gd_devid.generation,
1215 gdp->gd_maxcount);
1216
1217 map = nfsd4_find_devid_map(gdp->gd_devid.fsid_idx);
1218 if (!map) {
1219 dprintk("%s: couldn't find device ID to export mapping!\n",
1220 __func__);
1221 return nfserr_noent;
1222 }
1223
1224 exp = rqst_exp_find(rqstp, map->fsid_type, map->fsid);
1225 if (IS_ERR(exp)) {
1226 dprintk("%s: could not find device id\n", __func__);
1227 return nfserr_noent;
1228 }
1229
1230 nfserr = nfserr_layoutunavailable;
1231 ops = nfsd4_layout_verify(exp, gdp->gd_layout_type);
1232 if (!ops)
1233 goto out;
1234
1235 nfserr = nfs_ok;
1236 if (gdp->gd_maxcount != 0)
1237 nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp);
1238
1239 gdp->gd_notify_types &= ops->notify_types;
1240 exp_put(exp);
1241out:
1242 return nfserr;
1243}
1244
1245static __be32
1246nfsd4_layoutget(struct svc_rqst *rqstp,
1247 struct nfsd4_compound_state *cstate,
1248 struct nfsd4_layoutget *lgp)
1249{
1250 struct svc_fh *current_fh = &cstate->current_fh;
1251 const struct nfsd4_layout_ops *ops;
1252 struct nfs4_layout_stateid *ls;
1253 __be32 nfserr;
1254 int accmode;
1255
1256 switch (lgp->lg_seg.iomode) {
1257 case IOMODE_READ:
1258 accmode = NFSD_MAY_READ;
1259 break;
1260 case IOMODE_RW:
1261 accmode = NFSD_MAY_READ | NFSD_MAY_WRITE;
1262 break;
1263 default:
1264 dprintk("%s: invalid iomode %d\n",
1265 __func__, lgp->lg_seg.iomode);
1266 nfserr = nfserr_badiomode;
1267 goto out;
1268 }
1269
1270 nfserr = fh_verify(rqstp, current_fh, 0, accmode);
1271 if (nfserr)
1272 goto out;
1273
1274 nfserr = nfserr_layoutunavailable;
1275 ops = nfsd4_layout_verify(current_fh->fh_export, lgp->lg_layout_type);
1276 if (!ops)
1277 goto out;
1278
1279 /*
1280 * Verify minlength and range as per RFC5661:
1281 * o If loga_length is less than loga_minlength,
1282 * the metadata server MUST return NFS4ERR_INVAL.
1283 * o If the sum of loga_offset and loga_minlength exceeds
1284 * NFS4_UINT64_MAX, and loga_minlength is not
1285 * NFS4_UINT64_MAX, the error NFS4ERR_INVAL MUST result.
1286 * o If the sum of loga_offset and loga_length exceeds
1287 * NFS4_UINT64_MAX, and loga_length is not NFS4_UINT64_MAX,
1288 * the error NFS4ERR_INVAL MUST result.
1289 */
1290 nfserr = nfserr_inval;
1291 if (lgp->lg_seg.length < lgp->lg_minlength ||
1292 (lgp->lg_minlength != NFS4_MAX_UINT64 &&
1293 lgp->lg_minlength > NFS4_MAX_UINT64 - lgp->lg_seg.offset) ||
1294 (lgp->lg_seg.length != NFS4_MAX_UINT64 &&
1295 lgp->lg_seg.length > NFS4_MAX_UINT64 - lgp->lg_seg.offset))
1296 goto out;
1297 if (lgp->lg_seg.length == 0)
1298 goto out;
1299
1300 nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid,
1301 true, lgp->lg_layout_type, &ls);
1302 if (nfserr) {
1303 trace_layout_get_lookup_fail(&lgp->lg_sid);
1304 goto out;
1305 }
1306
1307 nfserr = nfserr_recallconflict;
1308 if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls))
1309 goto out_put_stid;
1310
1311 nfserr = ops->proc_layoutget(current_fh->fh_dentry->d_inode,
1312 current_fh, lgp);
1313 if (nfserr)
1314 goto out_put_stid;
1315
1316 nfserr = nfsd4_insert_layout(lgp, ls);
1317
1318out_put_stid:
1319 nfs4_put_stid(&ls->ls_stid);
1320out:
1321 return nfserr;
1322}
1323
1324static __be32
1325nfsd4_layoutcommit(struct svc_rqst *rqstp,
1326 struct nfsd4_compound_state *cstate,
1327 struct nfsd4_layoutcommit *lcp)
1328{
1329 const struct nfsd4_layout_seg *seg = &lcp->lc_seg;
1330 struct svc_fh *current_fh = &cstate->current_fh;
1331 const struct nfsd4_layout_ops *ops;
1332 loff_t new_size = lcp->lc_last_wr + 1;
1333 struct inode *inode;
1334 struct nfs4_layout_stateid *ls;
1335 __be32 nfserr;
1336
1337 nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_WRITE);
1338 if (nfserr)
1339 goto out;
1340
1341 nfserr = nfserr_layoutunavailable;
1342 ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type);
1343 if (!ops)
1344 goto out;
1345 inode = current_fh->fh_dentry->d_inode;
1346
1347 nfserr = nfserr_inval;
1348 if (new_size <= seg->offset) {
1349 dprintk("pnfsd: last write before layout segment\n");
1350 goto out;
1351 }
1352 if (new_size > seg->offset + seg->length) {
1353 dprintk("pnfsd: last write beyond layout segment\n");
1354 goto out;
1355 }
1356 if (!lcp->lc_newoffset && new_size > i_size_read(inode)) {
1357 dprintk("pnfsd: layoutcommit beyond EOF\n");
1358 goto out;
1359 }
1360
1361 nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid,
1362 false, lcp->lc_layout_type,
1363 &ls);
1364 if (nfserr) {
1365 trace_layout_commit_lookup_fail(&lcp->lc_sid);
1366 /* fixup error code as per RFC5661 */
1367 if (nfserr == nfserr_bad_stateid)
1368 nfserr = nfserr_badlayout;
1369 goto out;
1370 }
1371
1372 nfserr = ops->proc_layoutcommit(inode, lcp);
1373 if (nfserr)
1374 goto out_put_stid;
1375
1376 if (new_size > i_size_read(inode)) {
1377 lcp->lc_size_chg = 1;
1378 lcp->lc_newsize = new_size;
1379 } else {
1380 lcp->lc_size_chg = 0;
1381 }
1382
1383out_put_stid:
1384 nfs4_put_stid(&ls->ls_stid);
1385out:
1386 return nfserr;
1387}
1388
1389static __be32
1390nfsd4_layoutreturn(struct svc_rqst *rqstp,
1391 struct nfsd4_compound_state *cstate,
1392 struct nfsd4_layoutreturn *lrp)
1393{
1394 struct svc_fh *current_fh = &cstate->current_fh;
1395 __be32 nfserr;
1396
1397 nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
1398 if (nfserr)
1399 goto out;
1400
1401 nfserr = nfserr_layoutunavailable;
1402 if (!nfsd4_layout_verify(current_fh->fh_export, lrp->lr_layout_type))
1403 goto out;
1404
1405 switch (lrp->lr_seg.iomode) {
1406 case IOMODE_READ:
1407 case IOMODE_RW:
1408 case IOMODE_ANY:
1409 break;
1410 default:
1411 dprintk("%s: invalid iomode %d\n", __func__,
1412 lrp->lr_seg.iomode);
1413 nfserr = nfserr_inval;
1414 goto out;
1415 }
1416
1417 switch (lrp->lr_return_type) {
1418 case RETURN_FILE:
1419 nfserr = nfsd4_return_file_layouts(rqstp, cstate, lrp);
1420 break;
1421 case RETURN_FSID:
1422 case RETURN_ALL:
1423 nfserr = nfsd4_return_client_layouts(rqstp, cstate, lrp);
1424 break;
1425 default:
1426 dprintk("%s: invalid return_type %d\n", __func__,
1427 lrp->lr_return_type);
1428 nfserr = nfserr_inval;
1429 break;
1430 }
1431out:
1432 return nfserr;
1433}
1434#endif /* CONFIG_NFSD_PNFS */
1435
1181/* 1436/*
1182 * NULL call. 1437 * NULL call.
1183 */ 1438 */
@@ -1679,6 +1934,36 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd
1679 op_encode_channel_attrs_maxsz) * sizeof(__be32); 1934 op_encode_channel_attrs_maxsz) * sizeof(__be32);
1680} 1935}
1681 1936
1937#ifdef CONFIG_NFSD_PNFS
1938/*
1939 * At this stage we don't really know what layout driver will handle the request,
1940 * so we need to define an arbitrary upper bound here.
1941 */
1942#define MAX_LAYOUT_SIZE 128
1943static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1944{
1945 return (op_encode_hdr_size +
1946 1 /* logr_return_on_close */ +
1947 op_encode_stateid_maxsz +
1948 1 /* nr of layouts */ +
1949 MAX_LAYOUT_SIZE) * sizeof(__be32);
1950}
1951
1952static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1953{
1954 return (op_encode_hdr_size +
1955 1 /* locr_newsize */ +
1956 2 /* ns_size */) * sizeof(__be32);
1957}
1958
1959static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1960{
1961 return (op_encode_hdr_size +
1962 1 /* lrs_stateid */ +
1963 op_encode_stateid_maxsz) * sizeof(__be32);
1964}
1965#endif /* CONFIG_NFSD_PNFS */
1966
1682static struct nfsd4_operation nfsd4_ops[] = { 1967static struct nfsd4_operation nfsd4_ops[] = {
1683 [OP_ACCESS] = { 1968 [OP_ACCESS] = {
1684 .op_func = (nfsd4op_func)nfsd4_access, 1969 .op_func = (nfsd4op_func)nfsd4_access,
@@ -1966,6 +2251,31 @@ static struct nfsd4_operation nfsd4_ops[] = {
1966 .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid, 2251 .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
1967 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, 2252 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1968 }, 2253 },
2254#ifdef CONFIG_NFSD_PNFS
2255 [OP_GETDEVICEINFO] = {
2256 .op_func = (nfsd4op_func)nfsd4_getdeviceinfo,
2257 .op_flags = ALLOWED_WITHOUT_FH,
2258 .op_name = "OP_GETDEVICEINFO",
2259 },
2260 [OP_LAYOUTGET] = {
2261 .op_func = (nfsd4op_func)nfsd4_layoutget,
2262 .op_flags = OP_MODIFIES_SOMETHING,
2263 .op_name = "OP_LAYOUTGET",
2264 .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutget_rsize,
2265 },
2266 [OP_LAYOUTCOMMIT] = {
2267 .op_func = (nfsd4op_func)nfsd4_layoutcommit,
2268 .op_flags = OP_MODIFIES_SOMETHING,
2269 .op_name = "OP_LAYOUTCOMMIT",
2270 .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutcommit_rsize,
2271 },
2272 [OP_LAYOUTRETURN] = {
2273 .op_func = (nfsd4op_func)nfsd4_layoutreturn,
2274 .op_flags = OP_MODIFIES_SOMETHING,
2275 .op_name = "OP_LAYOUTRETURN",
2276 .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutreturn_rsize,
2277 },
2278#endif /* CONFIG_NFSD_PNFS */
1969 2279
1970 /* NFSv4.2 operations */ 2280 /* NFSv4.2 operations */
1971 [OP_ALLOCATE] = { 2281 [OP_ALLOCATE] = {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 532a60cca2fb..f6b2a09f793f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -48,6 +48,7 @@
48#include "current_stateid.h" 48#include "current_stateid.h"
49 49
50#include "netns.h" 50#include "netns.h"
51#include "pnfs.h"
51 52
52#define NFSDDBG_FACILITY NFSDDBG_PROC 53#define NFSDDBG_FACILITY NFSDDBG_PROC
53 54
@@ -150,16 +151,6 @@ renew_client_locked(struct nfs4_client *clp)
150 clp->cl_time = get_seconds(); 151 clp->cl_time = get_seconds();
151} 152}
152 153
153static inline void
154renew_client(struct nfs4_client *clp)
155{
156 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
157
158 spin_lock(&nn->client_lock);
159 renew_client_locked(clp);
160 spin_unlock(&nn->client_lock);
161}
162
163static void put_client_renew_locked(struct nfs4_client *clp) 154static void put_client_renew_locked(struct nfs4_client *clp)
164{ 155{
165 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); 156 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
@@ -282,7 +273,7 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu)
282 kmem_cache_free(file_slab, fp); 273 kmem_cache_free(file_slab, fp);
283} 274}
284 275
285static inline void 276void
286put_nfs4_file(struct nfs4_file *fi) 277put_nfs4_file(struct nfs4_file *fi)
287{ 278{
288 might_lock(&state_lock); 279 might_lock(&state_lock);
@@ -295,12 +286,6 @@ put_nfs4_file(struct nfs4_file *fi)
295 } 286 }
296} 287}
297 288
298static inline void
299get_nfs4_file(struct nfs4_file *fi)
300{
301 atomic_inc(&fi->fi_ref);
302}
303
304static struct file * 289static struct file *
305__nfs4_get_fd(struct nfs4_file *f, int oflag) 290__nfs4_get_fd(struct nfs4_file *f, int oflag)
306{ 291{
@@ -358,7 +343,7 @@ find_readable_file(struct nfs4_file *f)
358 return ret; 343 return ret;
359} 344}
360 345
361static struct file * 346struct file *
362find_any_file(struct nfs4_file *f) 347find_any_file(struct nfs4_file *f)
363{ 348{
364 struct file *ret; 349 struct file *ret;
@@ -408,14 +393,6 @@ static unsigned int file_hashval(struct knfsd_fh *fh)
408 return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1); 393 return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1);
409} 394}
410 395
411static bool nfsd_fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
412{
413 return fh1->fh_size == fh2->fh_size &&
414 !memcmp(fh1->fh_base.fh_pad,
415 fh2->fh_base.fh_pad,
416 fh1->fh_size);
417}
418
419static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; 396static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
420 397
421static void 398static void
@@ -494,7 +471,7 @@ static void nfs4_file_put_access(struct nfs4_file *fp, u32 access)
494 __nfs4_file_put_access(fp, O_RDONLY); 471 __nfs4_file_put_access(fp, O_RDONLY);
495} 472}
496 473
497static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, 474struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
498 struct kmem_cache *slab) 475 struct kmem_cache *slab)
499{ 476{
500 struct nfs4_stid *stid; 477 struct nfs4_stid *stid;
@@ -688,17 +665,17 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)
688 struct file *filp = NULL; 665 struct file *filp = NULL;
689 666
690 spin_lock(&fp->fi_lock); 667 spin_lock(&fp->fi_lock);
691 if (fp->fi_deleg_file && atomic_dec_and_test(&fp->fi_delegees)) 668 if (fp->fi_deleg_file && --fp->fi_delegees == 0)
692 swap(filp, fp->fi_deleg_file); 669 swap(filp, fp->fi_deleg_file);
693 spin_unlock(&fp->fi_lock); 670 spin_unlock(&fp->fi_lock);
694 671
695 if (filp) { 672 if (filp) {
696 vfs_setlease(filp, F_UNLCK, NULL, NULL); 673 vfs_setlease(filp, F_UNLCK, NULL, (void **)&fp);
697 fput(filp); 674 fput(filp);
698 } 675 }
699} 676}
700 677
701static void unhash_stid(struct nfs4_stid *s) 678void nfs4_unhash_stid(struct nfs4_stid *s)
702{ 679{
703 s->sc_type = 0; 680 s->sc_type = 0;
704} 681}
@@ -1006,7 +983,7 @@ static void unhash_lock_stateid(struct nfs4_ol_stateid *stp)
1006 983
1007 list_del_init(&stp->st_locks); 984 list_del_init(&stp->st_locks);
1008 unhash_ol_stateid(stp); 985 unhash_ol_stateid(stp);
1009 unhash_stid(&stp->st_stid); 986 nfs4_unhash_stid(&stp->st_stid);
1010} 987}
1011 988
1012static void release_lock_stateid(struct nfs4_ol_stateid *stp) 989static void release_lock_stateid(struct nfs4_ol_stateid *stp)
@@ -1518,7 +1495,12 @@ unhash_session(struct nfsd4_session *ses)
1518static int 1495static int
1519STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn) 1496STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
1520{ 1497{
1521 if (clid->cl_boot == nn->boot_time) 1498 /*
1499 * We're assuming the clid was not given out from a boot
1500 * precisely 2^32 (about 136 years) before this one. That seems
1501 * a safe assumption:
1502 */
1503 if (clid->cl_boot == (u32)nn->boot_time)
1522 return 0; 1504 return 0;
1523 dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n", 1505 dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
1524 clid->cl_boot, clid->cl_id, nn->boot_time); 1506 clid->cl_boot, clid->cl_id, nn->boot_time);
@@ -1558,6 +1540,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
1558 INIT_LIST_HEAD(&clp->cl_lru); 1540 INIT_LIST_HEAD(&clp->cl_lru);
1559 INIT_LIST_HEAD(&clp->cl_callbacks); 1541 INIT_LIST_HEAD(&clp->cl_callbacks);
1560 INIT_LIST_HEAD(&clp->cl_revoked); 1542 INIT_LIST_HEAD(&clp->cl_revoked);
1543#ifdef CONFIG_NFSD_PNFS
1544 INIT_LIST_HEAD(&clp->cl_lo_states);
1545#endif
1561 spin_lock_init(&clp->cl_lock); 1546 spin_lock_init(&clp->cl_lock);
1562 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); 1547 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
1563 return clp; 1548 return clp;
@@ -1662,6 +1647,7 @@ __destroy_client(struct nfs4_client *clp)
1662 nfs4_get_stateowner(&oo->oo_owner); 1647 nfs4_get_stateowner(&oo->oo_owner);
1663 release_openowner(oo); 1648 release_openowner(oo);
1664 } 1649 }
1650 nfsd4_return_all_client_layouts(clp);
1665 nfsd4_shutdown_callback(clp); 1651 nfsd4_shutdown_callback(clp);
1666 if (clp->cl_cb_conn.cb_xprt) 1652 if (clp->cl_cb_conn.cb_xprt)
1667 svc_xprt_put(clp->cl_cb_conn.cb_xprt); 1653 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
@@ -2145,8 +2131,11 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
2145static void 2131static void
2146nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) 2132nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
2147{ 2133{
2148 /* pNFS is not supported */ 2134#ifdef CONFIG_NFSD_PNFS
2135 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS;
2136#else
2149 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; 2137 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
2138#endif
2150 2139
2151 /* Referrals are supported, Migration is not. */ 2140 /* Referrals are supported, Migration is not. */
2152 new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; 2141 new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
@@ -3074,6 +3063,10 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
3074 fp->fi_share_deny = 0; 3063 fp->fi_share_deny = 0;
3075 memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); 3064 memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
3076 memset(fp->fi_access, 0, sizeof(fp->fi_access)); 3065 memset(fp->fi_access, 0, sizeof(fp->fi_access));
3066#ifdef CONFIG_NFSD_PNFS
3067 INIT_LIST_HEAD(&fp->fi_lo_states);
3068 atomic_set(&fp->fi_lo_recalls, 0);
3069#endif
3077 hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]); 3070 hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]);
3078} 3071}
3079 3072
@@ -3300,7 +3293,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
3300 struct nfs4_file *fp; 3293 struct nfs4_file *fp;
3301 3294
3302 hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) { 3295 hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) {
3303 if (nfsd_fh_match(&fp->fi_fhandle, fh)) { 3296 if (fh_match(&fp->fi_fhandle, fh)) {
3304 if (atomic_inc_not_zero(&fp->fi_ref)) 3297 if (atomic_inc_not_zero(&fp->fi_ref))
3305 return fp; 3298 return fp;
3306 } 3299 }
@@ -3308,7 +3301,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
3308 return NULL; 3301 return NULL;
3309} 3302}
3310 3303
3311static struct nfs4_file * 3304struct nfs4_file *
3312find_file(struct knfsd_fh *fh) 3305find_file(struct knfsd_fh *fh)
3313{ 3306{
3314 struct nfs4_file *fp; 3307 struct nfs4_file *fp;
@@ -3856,12 +3849,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
3856 /* Race breaker */ 3849 /* Race breaker */
3857 if (fp->fi_deleg_file) { 3850 if (fp->fi_deleg_file) {
3858 status = 0; 3851 status = 0;
3859 atomic_inc(&fp->fi_delegees); 3852 ++fp->fi_delegees;
3860 hash_delegation_locked(dp, fp); 3853 hash_delegation_locked(dp, fp);
3861 goto out_unlock; 3854 goto out_unlock;
3862 } 3855 }
3863 fp->fi_deleg_file = filp; 3856 fp->fi_deleg_file = filp;
3864 atomic_set(&fp->fi_delegees, 1); 3857 fp->fi_delegees = 1;
3865 hash_delegation_locked(dp, fp); 3858 hash_delegation_locked(dp, fp);
3866 spin_unlock(&fp->fi_lock); 3859 spin_unlock(&fp->fi_lock);
3867 spin_unlock(&state_lock); 3860 spin_unlock(&state_lock);
@@ -3902,7 +3895,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
3902 status = -EAGAIN; 3895 status = -EAGAIN;
3903 goto out_unlock; 3896 goto out_unlock;
3904 } 3897 }
3905 atomic_inc(&fp->fi_delegees); 3898 ++fp->fi_delegees;
3906 hash_delegation_locked(dp, fp); 3899 hash_delegation_locked(dp, fp);
3907 status = 0; 3900 status = 0;
3908out_unlock: 3901out_unlock:
@@ -4295,7 +4288,7 @@ laundromat_main(struct work_struct *laundry)
4295 4288
4296static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) 4289static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
4297{ 4290{
4298 if (!nfsd_fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle)) 4291 if (!fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle))
4299 return nfserr_bad_stateid; 4292 return nfserr_bad_stateid;
4300 return nfs_ok; 4293 return nfs_ok;
4301} 4294}
@@ -4446,7 +4439,7 @@ out_unlock:
4446 return status; 4439 return status;
4447} 4440}
4448 4441
4449static __be32 4442__be32
4450nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, 4443nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
4451 stateid_t *stateid, unsigned char typemask, 4444 stateid_t *stateid, unsigned char typemask,
4452 struct nfs4_stid **s, struct nfsd_net *nn) 4445 struct nfs4_stid **s, struct nfsd_net *nn)
@@ -4860,6 +4853,9 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4860 update_stateid(&stp->st_stid.sc_stateid); 4853 update_stateid(&stp->st_stid.sc_stateid);
4861 memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); 4854 memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
4862 4855
4856 nfsd4_return_all_file_layouts(stp->st_stateowner->so_client,
4857 stp->st_stid.sc_file);
4858
4863 nfsd4_close_open_stateid(stp); 4859 nfsd4_close_open_stateid(stp);
4864 4860
4865 /* put reference from nfs4_preprocess_seqid_op */ 4861 /* put reference from nfs4_preprocess_seqid_op */
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 15f7b73e0c0f..df5e66caf100 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -47,6 +47,7 @@
47#include "state.h" 47#include "state.h"
48#include "cache.h" 48#include "cache.h"
49#include "netns.h" 49#include "netns.h"
50#include "pnfs.h"
50 51
51#ifdef CONFIG_NFSD_V4_SECURITY_LABEL 52#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
52#include <linux/security.h> 53#include <linux/security.h>
@@ -234,6 +235,26 @@ static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
234 return ret; 235 return ret;
235} 236}
236 237
238/*
239 * We require the high 32 bits of 'seconds' to be 0, and
240 * we ignore all 32 bits of 'nseconds'.
241 */
242static __be32
243nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec *tv)
244{
245 DECODE_HEAD;
246 u64 sec;
247
248 READ_BUF(12);
249 p = xdr_decode_hyper(p, &sec);
250 tv->tv_sec = sec;
251 tv->tv_nsec = be32_to_cpup(p++);
252 if (tv->tv_nsec >= (u32)1000000000)
253 return nfserr_inval;
254
255 DECODE_TAIL;
256}
257
237static __be32 258static __be32
238nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) 259nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
239{ 260{
@@ -267,7 +288,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
267{ 288{
268 int expected_len, len = 0; 289 int expected_len, len = 0;
269 u32 dummy32; 290 u32 dummy32;
270 u64 sec;
271 char *buf; 291 char *buf;
272 292
273 DECODE_HEAD; 293 DECODE_HEAD;
@@ -358,15 +378,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
358 dummy32 = be32_to_cpup(p++); 378 dummy32 = be32_to_cpup(p++);
359 switch (dummy32) { 379 switch (dummy32) {
360 case NFS4_SET_TO_CLIENT_TIME: 380 case NFS4_SET_TO_CLIENT_TIME:
361 /* We require the high 32 bits of 'seconds' to be 0, and we ignore
362 all 32 bits of 'nseconds'. */
363 READ_BUF(12);
364 len += 12; 381 len += 12;
365 p = xdr_decode_hyper(p, &sec); 382 status = nfsd4_decode_time(argp, &iattr->ia_atime);
366 iattr->ia_atime.tv_sec = (time_t)sec; 383 if (status)
367 iattr->ia_atime.tv_nsec = be32_to_cpup(p++); 384 return status;
368 if (iattr->ia_atime.tv_nsec >= (u32)1000000000)
369 return nfserr_inval;
370 iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); 385 iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET);
371 break; 386 break;
372 case NFS4_SET_TO_SERVER_TIME: 387 case NFS4_SET_TO_SERVER_TIME:
@@ -382,15 +397,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
382 dummy32 = be32_to_cpup(p++); 397 dummy32 = be32_to_cpup(p++);
383 switch (dummy32) { 398 switch (dummy32) {
384 case NFS4_SET_TO_CLIENT_TIME: 399 case NFS4_SET_TO_CLIENT_TIME:
385 /* We require the high 32 bits of 'seconds' to be 0, and we ignore
386 all 32 bits of 'nseconds'. */
387 READ_BUF(12);
388 len += 12; 400 len += 12;
389 p = xdr_decode_hyper(p, &sec); 401 status = nfsd4_decode_time(argp, &iattr->ia_mtime);
390 iattr->ia_mtime.tv_sec = sec; 402 if (status)
391 iattr->ia_mtime.tv_nsec = be32_to_cpup(p++); 403 return status;
392 if (iattr->ia_mtime.tv_nsec >= (u32)1000000000)
393 return nfserr_inval;
394 iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); 404 iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET);
395 break; 405 break;
396 case NFS4_SET_TO_SERVER_TIME: 406 case NFS4_SET_TO_SERVER_TIME:
@@ -1513,6 +1523,127 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
1513 DECODE_TAIL; 1523 DECODE_TAIL;
1514} 1524}
1515 1525
1526#ifdef CONFIG_NFSD_PNFS
1527static __be32
1528nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
1529 struct nfsd4_getdeviceinfo *gdev)
1530{
1531 DECODE_HEAD;
1532 u32 num, i;
1533
1534 READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4);
1535 COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid));
1536 gdev->gd_layout_type = be32_to_cpup(p++);
1537 gdev->gd_maxcount = be32_to_cpup(p++);
1538 num = be32_to_cpup(p++);
1539 if (num) {
1540 READ_BUF(4 * num);
1541 gdev->gd_notify_types = be32_to_cpup(p++);
1542 for (i = 1; i < num; i++) {
1543 if (be32_to_cpup(p++)) {
1544 status = nfserr_inval;
1545 goto out;
1546 }
1547 }
1548 }
1549 DECODE_TAIL;
1550}
1551
1552static __be32
1553nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
1554 struct nfsd4_layoutget *lgp)
1555{
1556 DECODE_HEAD;
1557
1558 READ_BUF(36);
1559 lgp->lg_signal = be32_to_cpup(p++);
1560 lgp->lg_layout_type = be32_to_cpup(p++);
1561 lgp->lg_seg.iomode = be32_to_cpup(p++);
1562 p = xdr_decode_hyper(p, &lgp->lg_seg.offset);
1563 p = xdr_decode_hyper(p, &lgp->lg_seg.length);
1564 p = xdr_decode_hyper(p, &lgp->lg_minlength);
1565 nfsd4_decode_stateid(argp, &lgp->lg_sid);
1566 READ_BUF(4);
1567 lgp->lg_maxcount = be32_to_cpup(p++);
1568
1569 DECODE_TAIL;
1570}
1571
1572static __be32
1573nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
1574 struct nfsd4_layoutcommit *lcp)
1575{
1576 DECODE_HEAD;
1577 u32 timechange;
1578
1579 READ_BUF(20);
1580 p = xdr_decode_hyper(p, &lcp->lc_seg.offset);
1581 p = xdr_decode_hyper(p, &lcp->lc_seg.length);
1582 lcp->lc_reclaim = be32_to_cpup(p++);
1583 nfsd4_decode_stateid(argp, &lcp->lc_sid);
1584 READ_BUF(4);
1585 lcp->lc_newoffset = be32_to_cpup(p++);
1586 if (lcp->lc_newoffset) {
1587 READ_BUF(8);
1588 p = xdr_decode_hyper(p, &lcp->lc_last_wr);
1589 } else
1590 lcp->lc_last_wr = 0;
1591 READ_BUF(4);
1592 timechange = be32_to_cpup(p++);
1593 if (timechange) {
1594 status = nfsd4_decode_time(argp, &lcp->lc_mtime);
1595 if (status)
1596 return status;
1597 } else {
1598 lcp->lc_mtime.tv_nsec = UTIME_NOW;
1599 }
1600 READ_BUF(8);
1601 lcp->lc_layout_type = be32_to_cpup(p++);
1602
1603 /*
1604 * Save the layout update in XDR format and let the layout driver deal
1605 * with it later.
1606 */
1607 lcp->lc_up_len = be32_to_cpup(p++);
1608 if (lcp->lc_up_len > 0) {
1609 READ_BUF(lcp->lc_up_len);
1610 READMEM(lcp->lc_up_layout, lcp->lc_up_len);
1611 }
1612
1613 DECODE_TAIL;
1614}
1615
1616static __be32
1617nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
1618 struct nfsd4_layoutreturn *lrp)
1619{
1620 DECODE_HEAD;
1621
1622 READ_BUF(16);
1623 lrp->lr_reclaim = be32_to_cpup(p++);
1624 lrp->lr_layout_type = be32_to_cpup(p++);
1625 lrp->lr_seg.iomode = be32_to_cpup(p++);
1626 lrp->lr_return_type = be32_to_cpup(p++);
1627 if (lrp->lr_return_type == RETURN_FILE) {
1628 READ_BUF(16);
1629 p = xdr_decode_hyper(p, &lrp->lr_seg.offset);
1630 p = xdr_decode_hyper(p, &lrp->lr_seg.length);
1631 nfsd4_decode_stateid(argp, &lrp->lr_sid);
1632 READ_BUF(4);
1633 lrp->lrf_body_len = be32_to_cpup(p++);
1634 if (lrp->lrf_body_len > 0) {
1635 READ_BUF(lrp->lrf_body_len);
1636 READMEM(lrp->lrf_body, lrp->lrf_body_len);
1637 }
1638 } else {
1639 lrp->lr_seg.offset = 0;
1640 lrp->lr_seg.length = NFS4_MAX_UINT64;
1641 }
1642
1643 DECODE_TAIL;
1644}
1645#endif /* CONFIG_NFSD_PNFS */
1646
1516static __be32 1647static __be32
1517nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, 1648nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
1518 struct nfsd4_fallocate *fallocate) 1649 struct nfsd4_fallocate *fallocate)
@@ -1607,11 +1738,19 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1607 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, 1738 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
1608 [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid, 1739 [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid,
1609 [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, 1740 [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
1741#ifdef CONFIG_NFSD_PNFS
1742 [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo,
1743 [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
1744 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit,
1745 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget,
1746 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn,
1747#else
1610 [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, 1748 [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp,
1611 [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, 1749 [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
1612 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, 1750 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
1613 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, 1751 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
1614 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, 1752 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
1753#endif
1615 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, 1754 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
1616 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, 1755 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
1617 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, 1756 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -2539,6 +2678,30 @@ out_acl:
2539 get_parent_attributes(exp, &stat); 2678 get_parent_attributes(exp, &stat);
2540 p = xdr_encode_hyper(p, stat.ino); 2679 p = xdr_encode_hyper(p, stat.ino);
2541 } 2680 }
2681#ifdef CONFIG_NFSD_PNFS
2682 if ((bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) ||
2683 (bmval2 & FATTR4_WORD2_LAYOUT_TYPES)) {
2684 if (exp->ex_layout_type) {
2685 p = xdr_reserve_space(xdr, 8);
2686 if (!p)
2687 goto out_resource;
2688 *p++ = cpu_to_be32(1);
2689 *p++ = cpu_to_be32(exp->ex_layout_type);
2690 } else {
2691 p = xdr_reserve_space(xdr, 4);
2692 if (!p)
2693 goto out_resource;
2694 *p++ = cpu_to_be32(0);
2695 }
2696 }
2697
2698 if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
2699 p = xdr_reserve_space(xdr, 4);
2700 if (!p)
2701 goto out_resource;
2702 *p++ = cpu_to_be32(stat.blksize);
2703 }
2704#endif /* CONFIG_NFSD_PNFS */
2542 if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { 2705 if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
2543 status = nfsd4_encode_security_label(xdr, rqstp, context, 2706 status = nfsd4_encode_security_label(xdr, rqstp, context,
2544 contextlen); 2707 contextlen);
@@ -2768,16 +2931,17 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
2768 if (entry_bytes > cd->rd_maxcount) 2931 if (entry_bytes > cd->rd_maxcount)
2769 goto fail; 2932 goto fail;
2770 cd->rd_maxcount -= entry_bytes; 2933 cd->rd_maxcount -= entry_bytes;
2771 if (!cd->rd_dircount)
2772 goto fail;
2773 /* 2934 /*
2774 * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so 2935 * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so
2775 * let's always let through the first entry, at least: 2936 * let's always let through the first entry, at least:
2776 */ 2937 */
2777 name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8; 2938 if (!cd->rd_dircount)
2939 goto fail;
2940 name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8;
2778 if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) 2941 if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
2779 goto fail; 2942 goto fail;
2780 cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); 2943 cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
2944
2781 cd->cookie_offset = cookie_offset; 2945 cd->cookie_offset = cookie_offset;
2782skip_entry: 2946skip_entry:
2783 cd->common.err = nfs_ok; 2947 cd->common.err = nfs_ok;
@@ -3814,6 +3978,156 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
3814 return nfserr; 3978 return nfserr;
3815} 3979}
3816 3980
3981#ifdef CONFIG_NFSD_PNFS
3982static __be32
3983nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
3984 struct nfsd4_getdeviceinfo *gdev)
3985{
3986 struct xdr_stream *xdr = &resp->xdr;
3987 const struct nfsd4_layout_ops *ops =
3988 nfsd4_layout_ops[gdev->gd_layout_type];
3989 u32 starting_len = xdr->buf->len, needed_len;
3990 __be32 *p;
3991
3992 dprintk("%s: err %d\n", __func__, nfserr);
3993 if (nfserr)
3994 goto out;
3995
3996 nfserr = nfserr_resource;
3997 p = xdr_reserve_space(xdr, 4);
3998 if (!p)
3999 goto out;
4000
4001 *p++ = cpu_to_be32(gdev->gd_layout_type);
4002
4003 /* If maxcount is 0 then just update notifications */
4004 if (gdev->gd_maxcount != 0) {
4005 nfserr = ops->encode_getdeviceinfo(xdr, gdev);
4006 if (nfserr) {
4007 /*
4008 * We don't bother to burden the layout drivers with
4009 * enforcing gd_maxcount, just tell the client to
4010 * come back with a bigger buffer if it's not enough.
4011 */
4012 if (xdr->buf->len + 4 > gdev->gd_maxcount)
4013 goto toosmall;
4014 goto out;
4015 }
4016 }
4017
4018 nfserr = nfserr_resource;
4019 if (gdev->gd_notify_types) {
4020 p = xdr_reserve_space(xdr, 4 + 4);
4021 if (!p)
4022 goto out;
4023 *p++ = cpu_to_be32(1); /* bitmap length */
4024 *p++ = cpu_to_be32(gdev->gd_notify_types);
4025 } else {
4026 p = xdr_reserve_space(xdr, 4);
4027 if (!p)
4028 goto out;
4029 *p++ = 0;
4030 }
4031
4032 nfserr = 0;
4033out:
4034 kfree(gdev->gd_device);
4035 dprintk("%s: done: %d\n", __func__, be32_to_cpu(nfserr));
4036 return nfserr;
4037
4038toosmall:
4039 dprintk("%s: maxcount too small\n", __func__);
4040 needed_len = xdr->buf->len + 4 /* notifications */;
4041 xdr_truncate_encode(xdr, starting_len);
4042 p = xdr_reserve_space(xdr, 4);
4043 if (!p) {
4044 nfserr = nfserr_resource;
4045 } else {
4046 *p++ = cpu_to_be32(needed_len);
4047 nfserr = nfserr_toosmall;
4048 }
4049 goto out;
4050}
4051
4052static __be32
4053nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
4054 struct nfsd4_layoutget *lgp)
4055{
4056 struct xdr_stream *xdr = &resp->xdr;
4057 const struct nfsd4_layout_ops *ops =
4058 nfsd4_layout_ops[lgp->lg_layout_type];
4059 __be32 *p;
4060
4061 dprintk("%s: err %d\n", __func__, nfserr);
4062 if (nfserr)
4063 goto out;
4064
4065 nfserr = nfserr_resource;
4066 p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t));
4067 if (!p)
4068 goto out;
4069
4070 *p++ = cpu_to_be32(1); /* we always set return-on-close */
4071 *p++ = cpu_to_be32(lgp->lg_sid.si_generation);
4072 p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque,
4073 sizeof(stateid_opaque_t));
4074
4075 *p++ = cpu_to_be32(1); /* we always return a single layout */
4076 p = xdr_encode_hyper(p, lgp->lg_seg.offset);
4077 p = xdr_encode_hyper(p, lgp->lg_seg.length);
4078 *p++ = cpu_to_be32(lgp->lg_seg.iomode);
4079 *p++ = cpu_to_be32(lgp->lg_layout_type);
4080
4081 nfserr = ops->encode_layoutget(xdr, lgp);
4082out:
4083 kfree(lgp->lg_content);
4084 return nfserr;
4085}
4086
4087static __be32
4088nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
4089 struct nfsd4_layoutcommit *lcp)
4090{
4091 struct xdr_stream *xdr = &resp->xdr;
4092 __be32 *p;
4093
4094 if (nfserr)
4095 return nfserr;
4096
4097 p = xdr_reserve_space(xdr, 4);
4098 if (!p)
4099 return nfserr_resource;
4100 *p++ = cpu_to_be32(lcp->lc_size_chg);
4101 if (lcp->lc_size_chg) {
4102 p = xdr_reserve_space(xdr, 8);
4103 if (!p)
4104 return nfserr_resource;
4105 p = xdr_encode_hyper(p, lcp->lc_newsize);
4106 }
4107
4108 return nfs_ok;
4109}
4110
4111static __be32
4112nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
4113 struct nfsd4_layoutreturn *lrp)
4114{
4115 struct xdr_stream *xdr = &resp->xdr;
4116 __be32 *p;
4117
4118 if (nfserr)
4119 return nfserr;
4120
4121 p = xdr_reserve_space(xdr, 4);
4122 if (!p)
4123 return nfserr_resource;
4124 *p++ = cpu_to_be32(lrp->lrs_present);
4125 if (lrp->lrs_present)
4126 nfsd4_encode_stateid(xdr, &lrp->lr_sid);
4127 return nfs_ok;
4128}
4129#endif /* CONFIG_NFSD_PNFS */
4130
3817static __be32 4131static __be32
3818nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, 4132nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
3819 struct nfsd4_seek *seek) 4133 struct nfsd4_seek *seek)
@@ -3890,11 +4204,19 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3890 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop, 4204 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
3891 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, 4205 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3892 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, 4206 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
4207#ifdef CONFIG_NFSD_PNFS
4208 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo,
4209 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
4210 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit,
4211 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget,
4212 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn,
4213#else
3893 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, 4214 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
3894 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, 4215 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
3895 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, 4216 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
3896 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, 4217 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
3897 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, 4218 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
4219#endif
3898 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, 4220 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
3899 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, 4221 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
3900 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, 4222 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 19ace74d35f6..aa47d75ddb26 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -21,6 +21,7 @@
21#include "cache.h" 21#include "cache.h"
22#include "state.h" 22#include "state.h"
23#include "netns.h" 23#include "netns.h"
24#include "pnfs.h"
24 25
25/* 26/*
26 * We have a single directory with several nodes in it. 27 * We have a single directory with several nodes in it.
@@ -1258,9 +1259,12 @@ static int __init init_nfsd(void)
1258 retval = nfsd4_init_slabs(); 1259 retval = nfsd4_init_slabs();
1259 if (retval) 1260 if (retval)
1260 goto out_unregister_pernet; 1261 goto out_unregister_pernet;
1261 retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ 1262 retval = nfsd4_init_pnfs();
1262 if (retval) 1263 if (retval)
1263 goto out_free_slabs; 1264 goto out_free_slabs;
1265 retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
1266 if (retval)
1267 goto out_exit_pnfs;
1264 nfsd_stat_init(); /* Statistics */ 1268 nfsd_stat_init(); /* Statistics */
1265 retval = nfsd_reply_cache_init(); 1269 retval = nfsd_reply_cache_init();
1266 if (retval) 1270 if (retval)
@@ -1282,6 +1286,8 @@ out_free_lockd:
1282out_free_stat: 1286out_free_stat:
1283 nfsd_stat_shutdown(); 1287 nfsd_stat_shutdown();
1284 nfsd_fault_inject_cleanup(); 1288 nfsd_fault_inject_cleanup();
1289out_exit_pnfs:
1290 nfsd4_exit_pnfs();
1285out_free_slabs: 1291out_free_slabs:
1286 nfsd4_free_slabs(); 1292 nfsd4_free_slabs();
1287out_unregister_pernet: 1293out_unregister_pernet:
@@ -1299,6 +1305,7 @@ static void __exit exit_nfsd(void)
1299 nfsd_stat_shutdown(); 1305 nfsd_stat_shutdown();
1300 nfsd_lockd_shutdown(); 1306 nfsd_lockd_shutdown();
1301 nfsd4_free_slabs(); 1307 nfsd4_free_slabs();
1308 nfsd4_exit_pnfs();
1302 nfsd_fault_inject_cleanup(); 1309 nfsd_fault_inject_cleanup();
1303 unregister_filesystem(&nfsd_fs_type); 1310 unregister_filesystem(&nfsd_fs_type);
1304 unregister_pernet_subsys(&nfsd_net_ops); 1311 unregister_pernet_subsys(&nfsd_net_ops);
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 33a46a8dfaf7..565c4da1a9eb 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -325,15 +325,27 @@ void nfsd_lockd_shutdown(void);
325 325
326#define NFSD4_SUPPORTED_ATTRS_WORD2 0 326#define NFSD4_SUPPORTED_ATTRS_WORD2 0
327 327
328/* 4.1 */
329#ifdef CONFIG_NFSD_PNFS
330#define PNFSD_SUPPORTED_ATTRS_WORD1 FATTR4_WORD1_FS_LAYOUT_TYPES
331#define PNFSD_SUPPORTED_ATTRS_WORD2 \
332(FATTR4_WORD2_LAYOUT_BLKSIZE | FATTR4_WORD2_LAYOUT_TYPES)
333#else
334#define PNFSD_SUPPORTED_ATTRS_WORD1 0
335#define PNFSD_SUPPORTED_ATTRS_WORD2 0
336#endif /* CONFIG_NFSD_PNFS */
337
328#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ 338#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
329 NFSD4_SUPPORTED_ATTRS_WORD0 339 NFSD4_SUPPORTED_ATTRS_WORD0
330 340
331#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ 341#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
332 NFSD4_SUPPORTED_ATTRS_WORD1 342 (NFSD4_SUPPORTED_ATTRS_WORD1 | PNFSD_SUPPORTED_ATTRS_WORD1)
333 343
334#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ 344#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
335 (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) 345 (NFSD4_SUPPORTED_ATTRS_WORD2 | PNFSD_SUPPORTED_ATTRS_WORD2 | \
346 FATTR4_WORD2_SUPPATTR_EXCLCREAT)
336 347
348/* 4.2 */
337#ifdef CONFIG_NFSD_V4_SECURITY_LABEL 349#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
338#define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL 350#define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL
339#else 351#else
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 08236d70c667..84cae2079d21 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -187,6 +187,24 @@ fh_init(struct svc_fh *fhp, int maxsize)
187 return fhp; 187 return fhp;
188} 188}
189 189
190static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
191{
192 if (fh1->fh_size != fh2->fh_size)
193 return false;
194 if (memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad, fh1->fh_size) != 0)
195 return false;
196 return true;
197}
198
199static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
200{
201 if (fh1->fh_fsid_type != fh2->fh_fsid_type)
202 return false;
 203 if (memcmp(fh1->fh_fsid, fh2->fh_fsid, key_len(fh1->fh_fsid_type)) != 0)
204 return false;
205 return true;
206}
207
190#ifdef CONFIG_NFSD_V3 208#ifdef CONFIG_NFSD_V3
191/* 209/*
192 * The wcc data stored in current_fh should be cleared 210 * The wcc data stored in current_fh should be cleared
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 314f5c8f8f1a..9277cc91c21b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -119,6 +119,7 @@ struct svc_program nfsd_program = {
119static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = { 119static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = {
120 [0] = 1, 120 [0] = 1,
121 [1] = 1, 121 [1] = 1,
122 [2] = 1,
122}; 123};
123 124
124int nfsd_vers(int vers, enum vers_op change) 125int nfsd_vers(int vers, enum vers_op change)
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
new file mode 100644
index 000000000000..fedb4d620a81
--- /dev/null
+++ b/fs/nfsd/pnfs.h
@@ -0,0 +1,81 @@
1#ifndef _FS_NFSD_PNFS_H
2#define _FS_NFSD_PNFS_H 1
3
4#include <linux/exportfs.h>
5#include <linux/nfsd/export.h>
6
7#include "state.h"
8#include "xdr4.h"
9
10struct xdr_stream;
11
12struct nfsd4_deviceid_map {
13 struct list_head hash;
14 u64 idx;
15 int fsid_type;
16 u32 fsid[];
17};
18
19struct nfsd4_layout_ops {
20 u32 notify_types;
21
22 __be32 (*proc_getdeviceinfo)(struct super_block *sb,
23 struct nfsd4_getdeviceinfo *gdevp);
24 __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
25 struct nfsd4_getdeviceinfo *gdevp);
26
27 __be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp,
28 struct nfsd4_layoutget *lgp);
29 __be32 (*encode_layoutget)(struct xdr_stream *,
30 struct nfsd4_layoutget *lgp);
31
32 __be32 (*proc_layoutcommit)(struct inode *inode,
33 struct nfsd4_layoutcommit *lcp);
34};
35
36extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
37extern const struct nfsd4_layout_ops bl_layout_ops;
38
39__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
40 struct nfsd4_compound_state *cstate, stateid_t *stateid,
41 bool create, u32 layout_type, struct nfs4_layout_stateid **lsp);
42__be32 nfsd4_insert_layout(struct nfsd4_layoutget *lgp,
43 struct nfs4_layout_stateid *ls);
44__be32 nfsd4_return_file_layouts(struct svc_rqst *rqstp,
45 struct nfsd4_compound_state *cstate,
46 struct nfsd4_layoutreturn *lrp);
47__be32 nfsd4_return_client_layouts(struct svc_rqst *rqstp,
48 struct nfsd4_compound_state *cstate,
49 struct nfsd4_layoutreturn *lrp);
50int nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
51 u32 device_generation);
52struct nfsd4_deviceid_map *nfsd4_find_devid_map(int idx);
53
54#ifdef CONFIG_NFSD_PNFS
55void nfsd4_setup_layout_type(struct svc_export *exp);
56void nfsd4_return_all_client_layouts(struct nfs4_client *);
57void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
58 struct nfs4_file *fp);
59int nfsd4_init_pnfs(void);
60void nfsd4_exit_pnfs(void);
61#else
62static inline void nfsd4_setup_layout_type(struct svc_export *exp)
63{
64}
65
66static inline void nfsd4_return_all_client_layouts(struct nfs4_client *clp)
67{
68}
69static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
70 struct nfs4_file *fp)
71{
72}
73static inline void nfsd4_exit_pnfs(void)
74{
75}
76static inline int nfsd4_init_pnfs(void)
77{
78 return 0;
79}
80#endif /* CONFIG_NFSD_PNFS */
81#endif /* _FS_NFSD_PNFS_H */
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 9d3be371240a..4f3bfeb11766 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -92,6 +92,7 @@ struct nfs4_stid {
92/* For a deleg stateid kept around only to process free_stateid's: */ 92/* For a deleg stateid kept around only to process free_stateid's: */
93#define NFS4_REVOKED_DELEG_STID 16 93#define NFS4_REVOKED_DELEG_STID 16
94#define NFS4_CLOSED_DELEG_STID 32 94#define NFS4_CLOSED_DELEG_STID 32
95#define NFS4_LAYOUT_STID 64
95 unsigned char sc_type; 96 unsigned char sc_type;
96 stateid_t sc_stateid; 97 stateid_t sc_stateid;
97 struct nfs4_client *sc_client; 98 struct nfs4_client *sc_client;
@@ -297,6 +298,9 @@ struct nfs4_client {
297 struct list_head cl_delegations; 298 struct list_head cl_delegations;
298 struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */ 299 struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */
299 struct list_head cl_lru; /* tail queue */ 300 struct list_head cl_lru; /* tail queue */
301#ifdef CONFIG_NFSD_PNFS
302 struct list_head cl_lo_states; /* outstanding layout states */
303#endif
300 struct xdr_netobj cl_name; /* id generated by client */ 304 struct xdr_netobj cl_name; /* id generated by client */
301 nfs4_verifier cl_verifier; /* generated by client */ 305 nfs4_verifier cl_verifier; /* generated by client */
302 time_t cl_time; /* time of last lease renewal */ 306 time_t cl_time; /* time of last lease renewal */
@@ -493,9 +497,13 @@ struct nfs4_file {
493 atomic_t fi_access[2]; 497 atomic_t fi_access[2];
494 u32 fi_share_deny; 498 u32 fi_share_deny;
495 struct file *fi_deleg_file; 499 struct file *fi_deleg_file;
496 atomic_t fi_delegees; 500 int fi_delegees;
497 struct knfsd_fh fi_fhandle; 501 struct knfsd_fh fi_fhandle;
498 bool fi_had_conflict; 502 bool fi_had_conflict;
503#ifdef CONFIG_NFSD_PNFS
504 struct list_head fi_lo_states;
505 atomic_t fi_lo_recalls;
506#endif
499}; 507};
500 508
501/* 509/*
@@ -528,6 +536,24 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
528 return container_of(s, struct nfs4_ol_stateid, st_stid); 536 return container_of(s, struct nfs4_ol_stateid, st_stid);
529} 537}
530 538
539struct nfs4_layout_stateid {
540 struct nfs4_stid ls_stid;
541 struct list_head ls_perclnt;
542 struct list_head ls_perfile;
543 spinlock_t ls_lock;
544 struct list_head ls_layouts;
545 u32 ls_layout_type;
546 struct file *ls_file;
547 struct nfsd4_callback ls_recall;
548 stateid_t ls_recall_sid;
549 bool ls_recalled;
550};
551
552static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
553{
554 return container_of(s, struct nfs4_layout_stateid, ls_stid);
555}
556
531/* flags for preprocess_seqid_op() */ 557/* flags for preprocess_seqid_op() */
532#define RD_STATE 0x00000010 558#define RD_STATE 0x00000010
533#define WR_STATE 0x00000020 559#define WR_STATE 0x00000020
@@ -535,6 +561,7 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
535enum nfsd4_cb_op { 561enum nfsd4_cb_op {
536 NFSPROC4_CLNT_CB_NULL = 0, 562 NFSPROC4_CLNT_CB_NULL = 0,
537 NFSPROC4_CLNT_CB_RECALL, 563 NFSPROC4_CLNT_CB_RECALL,
564 NFSPROC4_CLNT_CB_LAYOUT,
538 NFSPROC4_CLNT_CB_SEQUENCE, 565 NFSPROC4_CLNT_CB_SEQUENCE,
539}; 566};
540 567
@@ -545,6 +572,12 @@ struct nfsd_net;
545extern __be32 nfs4_preprocess_stateid_op(struct net *net, 572extern __be32 nfs4_preprocess_stateid_op(struct net *net,
546 struct nfsd4_compound_state *cstate, 573 struct nfsd4_compound_state *cstate,
547 stateid_t *stateid, int flags, struct file **filp); 574 stateid_t *stateid, int flags, struct file **filp);
575__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
576 stateid_t *stateid, unsigned char typemask,
577 struct nfs4_stid **s, struct nfsd_net *nn);
578struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
579 struct kmem_cache *slab);
580void nfs4_unhash_stid(struct nfs4_stid *s);
548void nfs4_put_stid(struct nfs4_stid *s); 581void nfs4_put_stid(struct nfs4_stid *s);
549void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); 582void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
550extern void nfs4_release_reclaim(struct nfsd_net *); 583extern void nfs4_release_reclaim(struct nfsd_net *);
@@ -567,6 +600,14 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
567 struct nfsd_net *nn); 600 struct nfsd_net *nn);
568extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn); 601extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
569 602
603struct nfs4_file *find_file(struct knfsd_fh *fh);
604void put_nfs4_file(struct nfs4_file *fi);
605static inline void get_nfs4_file(struct nfs4_file *fi)
606{
607 atomic_inc(&fi->fi_ref);
608}
609struct file *find_any_file(struct nfs4_file *f);
610
570/* grace period management */ 611/* grace period management */
571void nfsd4_end_grace(struct nfsd_net *nn); 612void nfsd4_end_grace(struct nfsd_net *nn);
572 613
diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c
new file mode 100644
index 000000000000..82f89070594c
--- /dev/null
+++ b/fs/nfsd/trace.c
@@ -0,0 +1,5 @@
1
2#include "state.h"
3
4#define CREATE_TRACE_POINTS
5#include "trace.h"
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
new file mode 100644
index 000000000000..c668520c344b
--- /dev/null
+++ b/fs/nfsd/trace.h
@@ -0,0 +1,54 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#undef TRACE_SYSTEM
5#define TRACE_SYSTEM nfsd
6
7#if !defined(_NFSD_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
8#define _NFSD_TRACE_H
9
10#include <linux/tracepoint.h>
11
12DECLARE_EVENT_CLASS(nfsd_stateid_class,
13 TP_PROTO(stateid_t *stp),
14 TP_ARGS(stp),
15 TP_STRUCT__entry(
16 __field(u32, cl_boot)
17 __field(u32, cl_id)
18 __field(u32, si_id)
19 __field(u32, si_generation)
20 ),
21 TP_fast_assign(
22 __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
23 __entry->cl_id = stp->si_opaque.so_clid.cl_id;
24 __entry->si_id = stp->si_opaque.so_id;
25 __entry->si_generation = stp->si_generation;
26 ),
27 TP_printk("client %08x:%08x stateid %08x:%08x",
28 __entry->cl_boot,
29 __entry->cl_id,
30 __entry->si_id,
31 __entry->si_generation)
32)
33
34#define DEFINE_STATEID_EVENT(name) \
35DEFINE_EVENT(nfsd_stateid_class, name, \
36 TP_PROTO(stateid_t *stp), \
37 TP_ARGS(stp))
38DEFINE_STATEID_EVENT(layoutstate_alloc);
39DEFINE_STATEID_EVENT(layoutstate_unhash);
40DEFINE_STATEID_EVENT(layoutstate_free);
41DEFINE_STATEID_EVENT(layout_get_lookup_fail);
42DEFINE_STATEID_EVENT(layout_commit_lookup_fail);
43DEFINE_STATEID_EVENT(layout_return_lookup_fail);
44DEFINE_STATEID_EVENT(layout_recall);
45DEFINE_STATEID_EVENT(layout_recall_done);
46DEFINE_STATEID_EVENT(layout_recall_fail);
47DEFINE_STATEID_EVENT(layout_recall_release);
48
49#endif /* _NFSD_TRACE_H */
50
51#undef TRACE_INCLUDE_PATH
52#define TRACE_INCLUDE_PATH .
53#define TRACE_INCLUDE_FILE trace
54#include <trace/define_trace.h>
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 90a5925bd6ab..0bda93e58e1b 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -428,6 +428,61 @@ struct nfsd4_reclaim_complete {
428 u32 rca_one_fs; 428 u32 rca_one_fs;
429}; 429};
430 430
431struct nfsd4_deviceid {
432 u64 fsid_idx;
433 u32 generation;
434 u32 pad;
435};
436
437struct nfsd4_layout_seg {
438 u32 iomode;
439 u64 offset;
440 u64 length;
441};
442
443struct nfsd4_getdeviceinfo {
444 struct nfsd4_deviceid gd_devid; /* request */
445 u32 gd_layout_type; /* request */
446 u32 gd_maxcount; /* request */
447 u32 gd_notify_types;/* request - response */
448 void *gd_device; /* response */
449};
450
451struct nfsd4_layoutget {
452 u64 lg_minlength; /* request */
453 u32 lg_signal; /* request */
454 u32 lg_layout_type; /* request */
455 u32 lg_maxcount; /* request */
456 stateid_t lg_sid; /* request/response */
457 struct nfsd4_layout_seg lg_seg; /* request/response */
458 void *lg_content; /* response */
459};
460
461struct nfsd4_layoutcommit {
462 stateid_t lc_sid; /* request */
463 struct nfsd4_layout_seg lc_seg; /* request */
464 u32 lc_reclaim; /* request */
465 u32 lc_newoffset; /* request */
466 u64 lc_last_wr; /* request */
467 struct timespec lc_mtime; /* request */
468 u32 lc_layout_type; /* request */
469 u32 lc_up_len; /* layout length */
470 void *lc_up_layout; /* decoded by callback */
471 u32 lc_size_chg; /* boolean for response */
472 u64 lc_newsize; /* response */
473};
474
475struct nfsd4_layoutreturn {
476 u32 lr_return_type; /* request */
477 u32 lr_layout_type; /* request */
478 struct nfsd4_layout_seg lr_seg; /* request */
479 u32 lr_reclaim; /* request */
480 u32 lrf_body_len; /* request */
481 void *lrf_body; /* request */
482 stateid_t lr_sid; /* request/response */
483 u32 lrs_present; /* response */
484};
485
431struct nfsd4_fallocate { 486struct nfsd4_fallocate {
432 /* request */ 487 /* request */
433 stateid_t falloc_stateid; 488 stateid_t falloc_stateid;
@@ -491,6 +546,10 @@ struct nfsd4_op {
491 struct nfsd4_reclaim_complete reclaim_complete; 546 struct nfsd4_reclaim_complete reclaim_complete;
492 struct nfsd4_test_stateid test_stateid; 547 struct nfsd4_test_stateid test_stateid;
493 struct nfsd4_free_stateid free_stateid; 548 struct nfsd4_free_stateid free_stateid;
549 struct nfsd4_getdeviceinfo getdeviceinfo;
550 struct nfsd4_layoutget layoutget;
551 struct nfsd4_layoutcommit layoutcommit;
552 struct nfsd4_layoutreturn layoutreturn;
494 553
495 /* NFSv4.2 */ 554 /* NFSv4.2 */
496 struct nfsd4_fallocate allocate; 555 struct nfsd4_fallocate allocate;
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index c5c55dfb91a9..c47f6fdb111a 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -21,3 +21,10 @@
21#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ 21#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
22 cb_sequence_dec_sz + \ 22 cb_sequence_dec_sz + \
23 op_dec_sz) 23 op_dec_sz)
24#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \
25 cb_sequence_enc_sz + \
26 1 + 3 + \
27 enc_nfs4_fh_sz + 4)
28#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \
29 cb_sequence_dec_sz + \
30 op_dec_sz)
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 41b223a59a63..fa05e04c5531 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -4,6 +4,7 @@
4#include <linux/types.h> 4#include <linux/types.h>
5 5
6struct dentry; 6struct dentry;
7struct iattr;
7struct inode; 8struct inode;
8struct super_block; 9struct super_block;
9struct vfsmount; 10struct vfsmount;
@@ -180,6 +181,21 @@ struct fid {
180 * get_name is not (which is possibly inconsistent) 181 * get_name is not (which is possibly inconsistent)
181 */ 182 */
182 183
184/* types of block ranges for multipage write mappings. */
185#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */
186#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */
187#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */
188#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */
189
190#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */
191
192struct iomap {
193 sector_t blkno; /* first sector of mapping */
194 loff_t offset; /* file offset of mapping, bytes */
195 u64 length; /* length of mapping, bytes */
196 int type; /* type of mapping */
197};
198
183struct export_operations { 199struct export_operations {
184 int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len, 200 int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len,
185 struct inode *parent); 201 struct inode *parent);
@@ -191,6 +207,13 @@ struct export_operations {
191 struct dentry *child); 207 struct dentry *child);
192 struct dentry * (*get_parent)(struct dentry *child); 208 struct dentry * (*get_parent)(struct dentry *child);
193 int (*commit_metadata)(struct inode *inode); 209 int (*commit_metadata)(struct inode *inode);
210
211 int (*get_uuid)(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
212 int (*map_blocks)(struct inode *inode, loff_t offset,
213 u64 len, struct iomap *iomap,
214 bool write, u32 *device_generation);
215 int (*commit_blocks)(struct inode *inode, struct iomap *iomaps,
216 int nr_iomaps, struct iattr *iattr);
194}; 217};
195 218
196extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, 219extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f125b88443bd..cdcb1e9d9613 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -873,6 +873,7 @@ static inline struct file *get_file(struct file *f)
873#define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ 873#define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */
874#define FL_UNLOCK_PENDING 512 /* Lease is being broken */ 874#define FL_UNLOCK_PENDING 512 /* Lease is being broken */
875#define FL_OFDLCK 1024 /* lock is "owned" by struct file */ 875#define FL_OFDLCK 1024 /* lock is "owned" by struct file */
876#define FL_LAYOUT 2048 /* outstanding pNFS layout */
876 877
877/* 878/*
878 * Special return value from posix_lock_file() and vfs_lock_file() for 879 * Special return value from posix_lock_file() and vfs_lock_file() for
@@ -2035,6 +2036,16 @@ static inline int break_deleg_wait(struct inode **delegated_inode)
2035 return ret; 2036 return ret;
2036} 2037}
2037 2038
2039static inline int break_layout(struct inode *inode, bool wait)
2040{
2041 smp_mb();
2042 if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease))
2043 return __break_lease(inode,
2044 wait ? O_WRONLY : O_WRONLY | O_NONBLOCK,
2045 FL_LAYOUT);
2046 return 0;
2047}
2048
2038#else /* !CONFIG_FILE_LOCKING */ 2049#else /* !CONFIG_FILE_LOCKING */
2039static inline int locks_mandatory_locked(struct file *file) 2050static inline int locks_mandatory_locked(struct file *file)
2040{ 2051{
@@ -2090,6 +2101,11 @@ static inline int break_deleg_wait(struct inode **delegated_inode)
2090 return 0; 2101 return 0;
2091} 2102}
2092 2103
2104static inline int break_layout(struct inode *inode, bool wait)
2105{
2106 return 0;
2107}
2108
2093#endif /* CONFIG_FILE_LOCKING */ 2109#endif /* CONFIG_FILE_LOCKING */
2094 2110
2095/* fs/open.c */ 2111/* fs/open.c */
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index de7c91ca427e..ed43cb74b11d 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -411,6 +411,7 @@ enum lock_type4 {
411#define FATTR4_WORD1_TIME_MODIFY_SET (1UL << 22) 411#define FATTR4_WORD1_TIME_MODIFY_SET (1UL << 22)
412#define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23) 412#define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23)
413#define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30) 413#define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30)
414#define FATTR4_WORD2_LAYOUT_TYPES (1UL << 0)
414#define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) 415#define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1)
415#define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4) 416#define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4)
416#define FATTR4_WORD2_SECURITY_LABEL (1UL << 16) 417#define FATTR4_WORD2_SECURITY_LABEL (1UL << 16)
@@ -517,6 +518,7 @@ enum pnfs_layouttype {
517 LAYOUT_OSD2_OBJECTS = 2, 518 LAYOUT_OSD2_OBJECTS = 2,
518 LAYOUT_BLOCK_VOLUME = 3, 519 LAYOUT_BLOCK_VOLUME = 3,
519 LAYOUT_FLEX_FILES = 4, 520 LAYOUT_FLEX_FILES = 4,
521 LAYOUT_TYPE_MAX
520}; 522};
521 523
522/* used for both layout return and recall */ 524/* used for both layout return and recall */
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 6f22cfeef5e3..fae6fb947fc8 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -110,7 +110,7 @@ struct svc_serv {
110 * We use sv_nrthreads as a reference count. svc_destroy() drops 110 * We use sv_nrthreads as a reference count. svc_destroy() drops
111 * this refcount, so we need to bump it up around operations that 111 * this refcount, so we need to bump it up around operations that
112 * change the number of threads. Horrible, but there it is. 112 * change the number of threads. Horrible, but there it is.
113 * Should be called with the BKL held. 113 * Should be called with the "service mutex" held.
114 */ 114 */
115static inline void svc_get(struct svc_serv *serv) 115static inline void svc_get(struct svc_serv *serv)
116{ 116{
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index ddfe88f52219..df8edf8ec914 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -77,6 +77,7 @@ struct svc_rdma_op_ctxt {
77 enum ib_wr_opcode wr_op; 77 enum ib_wr_opcode wr_op;
78 enum ib_wc_status wc_status; 78 enum ib_wc_status wc_status;
79 u32 byte_len; 79 u32 byte_len;
80 u32 position;
80 struct svcxprt_rdma *xprt; 81 struct svcxprt_rdma *xprt;
81 unsigned long flags; 82 unsigned long flags;
82 enum dma_data_direction direction; 83 enum dma_data_direction direction;
@@ -148,6 +149,10 @@ struct svcxprt_rdma {
148 struct ib_cq *sc_rq_cq; 149 struct ib_cq *sc_rq_cq;
149 struct ib_cq *sc_sq_cq; 150 struct ib_cq *sc_sq_cq;
150 struct ib_mr *sc_phys_mr; /* MR for server memory */ 151 struct ib_mr *sc_phys_mr; /* MR for server memory */
152 int (*sc_reader)(struct svcxprt_rdma *,
153 struct svc_rqst *,
154 struct svc_rdma_op_ctxt *,
155 int *, u32 *, u32, u32, u64, bool);
151 u32 sc_dev_caps; /* distilled device caps */ 156 u32 sc_dev_caps; /* distilled device caps */
152 u32 sc_dma_lkey; /* local dma key */ 157 u32 sc_dma_lkey; /* local dma key */
153 unsigned int sc_frmr_pg_list_len; 158 unsigned int sc_frmr_pg_list_len;
@@ -176,8 +181,6 @@ struct svcxprt_rdma {
176#define RPCRDMA_MAX_REQ_SIZE 4096 181#define RPCRDMA_MAX_REQ_SIZE 4096
177 182
178/* svc_rdma_marshal.c */ 183/* svc_rdma_marshal.c */
179extern void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *,
180 int *, int *);
181extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *); 184extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *);
182extern int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *); 185extern int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *);
183extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, 186extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *,
@@ -195,6 +198,12 @@ extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *);
195 198
196/* svc_rdma_recvfrom.c */ 199/* svc_rdma_recvfrom.c */
197extern int svc_rdma_recvfrom(struct svc_rqst *); 200extern int svc_rdma_recvfrom(struct svc_rqst *);
201extern int rdma_read_chunk_lcl(struct svcxprt_rdma *, struct svc_rqst *,
202 struct svc_rdma_op_ctxt *, int *, u32 *,
203 u32, u32, u64, bool);
204extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *,
205 struct svc_rdma_op_ctxt *, int *, u32 *,
206 u32, u32, u64, bool);
198 207
199/* svc_rdma_sendto.c */ 208/* svc_rdma_sendto.c */
200extern int svc_rdma_sendto(struct svc_rqst *); 209extern int svc_rdma_sendto(struct svc_rqst *);
diff --git a/include/uapi/linux/nfsd/debug.h b/include/uapi/linux/nfsd/debug.h
index 1fdc95bb2375..0bf130a1c58d 100644
--- a/include/uapi/linux/nfsd/debug.h
+++ b/include/uapi/linux/nfsd/debug.h
@@ -32,6 +32,7 @@
32#define NFSDDBG_REPCACHE 0x0080 32#define NFSDDBG_REPCACHE 0x0080
33#define NFSDDBG_XDR 0x0100 33#define NFSDDBG_XDR 0x0100
34#define NFSDDBG_LOCKD 0x0200 34#define NFSDDBG_LOCKD 0x0200
35#define NFSDDBG_PNFS 0x0400
35#define NFSDDBG_ALL 0x7FFF 36#define NFSDDBG_ALL 0x7FFF
36#define NFSDDBG_NOCHANGE 0xFFFF 37#define NFSDDBG_NOCHANGE 0xFFFF
37 38
diff --git a/include/uapi/linux/nfsd/export.h b/include/uapi/linux/nfsd/export.h
index 584b6ef3a5e8..4742f2cb42f2 100644
--- a/include/uapi/linux/nfsd/export.h
+++ b/include/uapi/linux/nfsd/export.h
@@ -47,8 +47,10 @@
47 * exported filesystem. 47 * exported filesystem.
48 */ 48 */
49#define NFSEXP_V4ROOT 0x10000 49#define NFSEXP_V4ROOT 0x10000
50#define NFSEXP_NOPNFS 0x20000
51
50/* All flags that we claim to support. (Note we don't support NOACL.) */ 52/* All flags that we claim to support. (Note we don't support NOACL.) */
51#define NFSEXP_ALLFLAGS 0x1FE7F 53#define NFSEXP_ALLFLAGS 0x3FE7F
52 54
53/* The flags that may vary depending on security flavor: */ 55/* The flags that may vary depending on security flavor: */
54#define NFSEXP_SECINFO_FLAGS (NFSEXP_READONLY | NFSEXP_ROOTSQUASH \ 56#define NFSEXP_SECINFO_FLAGS (NFSEXP_READONLY | NFSEXP_ROOTSQUASH \
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 91eaef1844c8..78974e4d9ad2 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -768,8 +768,8 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
768EXPORT_SYMBOL_GPL(svc_set_num_threads); 768EXPORT_SYMBOL_GPL(svc_set_num_threads);
769 769
770/* 770/*
771 * Called from a server thread as it's exiting. Caller must hold the BKL or 771 * Called from a server thread as it's exiting. Caller must hold the "service
772 * the "service mutex", whichever is appropriate for the service. 772 * mutex" for the service.
773 */ 773 */
774void 774void
775svc_exit_thread(struct svc_rqst *rqstp) 775svc_exit_thread(struct svc_rqst *rqstp)
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index c69358b3cf7f..163ac45c3639 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -42,7 +42,7 @@ static LIST_HEAD(svc_xprt_class_list);
42 * svc_pool->sp_lock protects most of the fields of that pool. 42 * svc_pool->sp_lock protects most of the fields of that pool.
43 * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. 43 * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
44 * when both need to be taken (rare), svc_serv->sv_lock is first. 44 * when both need to be taken (rare), svc_serv->sv_lock is first.
45 * BKL protects svc_serv->sv_nrthread. 45 * The "service mutex" protects svc_serv->sv_nrthread.
46 * svc_sock->sk_lock protects the svc_sock->sk_deferred list 46 * svc_sock->sk_lock protects the svc_sock->sk_deferred list
47 * and the ->sk_info_authunix cache. 47 * and the ->sk_info_authunix cache.
48 * 48 *
@@ -67,7 +67,6 @@ static LIST_HEAD(svc_xprt_class_list);
67 * that no other thread will be using the transport or will 67 * that no other thread will be using the transport or will
68 * try to set XPT_DEAD. 68 * try to set XPT_DEAD.
69 */ 69 */
70
71int svc_reg_xprt_class(struct svc_xprt_class *xcl) 70int svc_reg_xprt_class(struct svc_xprt_class *xcl)
72{ 71{
73 struct svc_xprt_class *cl; 72 struct svc_xprt_class *cl;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 65b146297f5a..b681855cf970 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -71,22 +71,6 @@ static u32 *decode_read_list(u32 *va, u32 *vaend)
71} 71}
72 72
73/* 73/*
74 * Determine number of chunks and total bytes in chunk list. The chunk
75 * list has already been verified to fit within the RPCRDMA header.
76 */
77void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch,
78 int *ch_count, int *byte_count)
79{
80 /* compute the number of bytes represented by read chunks */
81 *byte_count = 0;
82 *ch_count = 0;
83 for (; ch->rc_discrim != 0; ch++) {
84 *byte_count = *byte_count + ntohl(ch->rc_target.rs_length);
85 *ch_count = *ch_count + 1;
86 }
87}
88
89/*
90 * Decodes a write chunk list. The expected format is as follows: 74 * Decodes a write chunk list. The expected format is as follows:
91 * descrim : xdr_one 75 * descrim : xdr_one
92 * nchunks : <count> 76 * nchunks : <count>
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index e0110270d650..f9f13a32ddb8 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -43,7 +43,6 @@
43#include <linux/sunrpc/debug.h> 43#include <linux/sunrpc/debug.h>
44#include <linux/sunrpc/rpc_rdma.h> 44#include <linux/sunrpc/rpc_rdma.h>
45#include <linux/spinlock.h> 45#include <linux/spinlock.h>
46#include <linux/highmem.h>
47#include <asm/unaligned.h> 46#include <asm/unaligned.h>
48#include <rdma/ib_verbs.h> 47#include <rdma/ib_verbs.h>
49#include <rdma/rdma_cm.h> 48#include <rdma/rdma_cm.h>
@@ -60,6 +59,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
60 struct svc_rdma_op_ctxt *ctxt, 59 struct svc_rdma_op_ctxt *ctxt,
61 u32 byte_count) 60 u32 byte_count)
62{ 61{
62 struct rpcrdma_msg *rmsgp;
63 struct page *page; 63 struct page *page;
64 u32 bc; 64 u32 bc;
65 int sge_no; 65 int sge_no;
@@ -82,7 +82,14 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
82 /* If data remains, store it in the pagelist */ 82 /* If data remains, store it in the pagelist */
83 rqstp->rq_arg.page_len = bc; 83 rqstp->rq_arg.page_len = bc;
84 rqstp->rq_arg.page_base = 0; 84 rqstp->rq_arg.page_base = 0;
85 rqstp->rq_arg.pages = &rqstp->rq_pages[1]; 85
86 /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */
87 rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
88 if (be32_to_cpu(rmsgp->rm_type) == RDMA_NOMSG)
89 rqstp->rq_arg.pages = &rqstp->rq_pages[0];
90 else
91 rqstp->rq_arg.pages = &rqstp->rq_pages[1];
92
86 sge_no = 1; 93 sge_no = 1;
87 while (bc && sge_no < ctxt->count) { 94 while (bc && sge_no < ctxt->count) {
88 page = ctxt->pages[sge_no]; 95 page = ctxt->pages[sge_no];
@@ -95,14 +102,6 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
95 rqstp->rq_respages = &rqstp->rq_pages[sge_no]; 102 rqstp->rq_respages = &rqstp->rq_pages[sge_no];
96 rqstp->rq_next_page = rqstp->rq_respages + 1; 103 rqstp->rq_next_page = rqstp->rq_respages + 1;
97 104
98 /* We should never run out of SGE because the limit is defined to
99 * support the max allowed RPC data length
100 */
101 BUG_ON(bc && (sge_no == ctxt->count));
102 BUG_ON((rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len)
103 != byte_count);
104 BUG_ON(rqstp->rq_arg.len != byte_count);
105
106 /* If not all pages were used from the SGL, free the remaining ones */ 105 /* If not all pages were used from the SGL, free the remaining ones */
107 bc = sge_no; 106 bc = sge_no;
108 while (sge_no < ctxt->count) { 107 while (sge_no < ctxt->count) {
@@ -125,26 +124,16 @@ static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
125 return min_t(int, sge_count, xprt->sc_max_sge); 124 return min_t(int, sge_count, xprt->sc_max_sge);
126} 125}
127 126
128typedef int (*rdma_reader_fn)(struct svcxprt_rdma *xprt,
129 struct svc_rqst *rqstp,
130 struct svc_rdma_op_ctxt *head,
131 int *page_no,
132 u32 *page_offset,
133 u32 rs_handle,
134 u32 rs_length,
135 u64 rs_offset,
136 int last);
137
138/* Issue an RDMA_READ using the local lkey to map the data sink */ 127/* Issue an RDMA_READ using the local lkey to map the data sink */
139static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, 128int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
140 struct svc_rqst *rqstp, 129 struct svc_rqst *rqstp,
141 struct svc_rdma_op_ctxt *head, 130 struct svc_rdma_op_ctxt *head,
142 int *page_no, 131 int *page_no,
143 u32 *page_offset, 132 u32 *page_offset,
144 u32 rs_handle, 133 u32 rs_handle,
145 u32 rs_length, 134 u32 rs_length,
146 u64 rs_offset, 135 u64 rs_offset,
147 int last) 136 bool last)
148{ 137{
149 struct ib_send_wr read_wr; 138 struct ib_send_wr read_wr;
150 int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; 139 int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
@@ -229,15 +218,15 @@ static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
229} 218}
230 219
231/* Issue an RDMA_READ using an FRMR to map the data sink */ 220/* Issue an RDMA_READ using an FRMR to map the data sink */
232static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, 221int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
233 struct svc_rqst *rqstp, 222 struct svc_rqst *rqstp,
234 struct svc_rdma_op_ctxt *head, 223 struct svc_rdma_op_ctxt *head,
235 int *page_no, 224 int *page_no,
236 u32 *page_offset, 225 u32 *page_offset,
237 u32 rs_handle, 226 u32 rs_handle,
238 u32 rs_length, 227 u32 rs_length,
239 u64 rs_offset, 228 u64 rs_offset,
240 int last) 229 bool last)
241{ 230{
242 struct ib_send_wr read_wr; 231 struct ib_send_wr read_wr;
243 struct ib_send_wr inv_wr; 232 struct ib_send_wr inv_wr;
@@ -365,24 +354,84 @@ static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
365 return ret; 354 return ret;
366} 355}
367 356
357static unsigned int
358rdma_rcl_chunk_count(struct rpcrdma_read_chunk *ch)
359{
360 unsigned int count;
361
362 for (count = 0; ch->rc_discrim != xdr_zero; ch++)
363 count++;
364 return count;
365}
366
367/* If there was additional inline content, append it to the end of arg.pages.
368 * Tail copy has to be done after the reader function has determined how many
369 * pages are needed for RDMA READ.
370 */
371static int
372rdma_copy_tail(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head,
373 u32 position, u32 byte_count, u32 page_offset, int page_no)
374{
375 char *srcp, *destp;
376 int ret;
377
378 ret = 0;
379 srcp = head->arg.head[0].iov_base + position;
380 byte_count = head->arg.head[0].iov_len - position;
381 if (byte_count > PAGE_SIZE) {
382 dprintk("svcrdma: large tail unsupported\n");
383 return 0;
384 }
385
386 /* Fit as much of the tail on the current page as possible */
387 if (page_offset != PAGE_SIZE) {
388 destp = page_address(rqstp->rq_arg.pages[page_no]);
389 destp += page_offset;
390 while (byte_count--) {
391 *destp++ = *srcp++;
392 page_offset++;
393 if (page_offset == PAGE_SIZE && byte_count)
394 goto more;
395 }
396 goto done;
397 }
398
399more:
400 /* Fit the rest on the next page */
401 page_no++;
402 destp = page_address(rqstp->rq_arg.pages[page_no]);
403 while (byte_count--)
404 *destp++ = *srcp++;
405
406 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1];
407 rqstp->rq_next_page = rqstp->rq_respages + 1;
408
409done:
410 byte_count = head->arg.head[0].iov_len - position;
411 head->arg.page_len += byte_count;
412 head->arg.len += byte_count;
413 head->arg.buflen += byte_count;
414 return 1;
415}
416
368static int rdma_read_chunks(struct svcxprt_rdma *xprt, 417static int rdma_read_chunks(struct svcxprt_rdma *xprt,
369 struct rpcrdma_msg *rmsgp, 418 struct rpcrdma_msg *rmsgp,
370 struct svc_rqst *rqstp, 419 struct svc_rqst *rqstp,
371 struct svc_rdma_op_ctxt *head) 420 struct svc_rdma_op_ctxt *head)
372{ 421{
373 int page_no, ch_count, ret; 422 int page_no, ret;
374 struct rpcrdma_read_chunk *ch; 423 struct rpcrdma_read_chunk *ch;
375 u32 page_offset, byte_count; 424 u32 handle, page_offset, byte_count;
425 u32 position;
376 u64 rs_offset; 426 u64 rs_offset;
377 rdma_reader_fn reader; 427 bool last;
378 428
379 /* If no read list is present, return 0 */ 429 /* If no read list is present, return 0 */
380 ch = svc_rdma_get_read_chunk(rmsgp); 430 ch = svc_rdma_get_read_chunk(rmsgp);
381 if (!ch) 431 if (!ch)
382 return 0; 432 return 0;
383 433
384 svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count); 434 if (rdma_rcl_chunk_count(ch) > RPCSVC_MAXPAGES)
385 if (ch_count > RPCSVC_MAXPAGES)
386 return -EINVAL; 435 return -EINVAL;
387 436
388 /* The request is completed when the RDMA_READs complete. The 437 /* The request is completed when the RDMA_READs complete. The
@@ -391,34 +440,41 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
391 */ 440 */
392 head->arg.head[0] = rqstp->rq_arg.head[0]; 441 head->arg.head[0] = rqstp->rq_arg.head[0];
393 head->arg.tail[0] = rqstp->rq_arg.tail[0]; 442 head->arg.tail[0] = rqstp->rq_arg.tail[0];
394 head->arg.pages = &head->pages[head->count];
395 head->hdr_count = head->count; 443 head->hdr_count = head->count;
396 head->arg.page_base = 0; 444 head->arg.page_base = 0;
397 head->arg.page_len = 0; 445 head->arg.page_len = 0;
398 head->arg.len = rqstp->rq_arg.len; 446 head->arg.len = rqstp->rq_arg.len;
399 head->arg.buflen = rqstp->rq_arg.buflen; 447 head->arg.buflen = rqstp->rq_arg.buflen;
400 448
401 /* Use FRMR if supported */ 449 ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
402 if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) 450 position = be32_to_cpu(ch->rc_position);
403 reader = rdma_read_chunk_frmr; 451
404 else 452 /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */
405 reader = rdma_read_chunk_lcl; 453 if (position == 0) {
454 head->arg.pages = &head->pages[0];
455 page_offset = head->byte_len;
456 } else {
457 head->arg.pages = &head->pages[head->count];
458 page_offset = 0;
459 }
406 460
407 page_no = 0; page_offset = 0; 461 ret = 0;
408 for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; 462 page_no = 0;
409 ch->rc_discrim != 0; ch++) { 463 for (; ch->rc_discrim != xdr_zero; ch++) {
464 if (be32_to_cpu(ch->rc_position) != position)
465 goto err;
410 466
467 handle = be32_to_cpu(ch->rc_target.rs_handle),
468 byte_count = be32_to_cpu(ch->rc_target.rs_length);
411 xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset, 469 xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset,
412 &rs_offset); 470 &rs_offset);
413 byte_count = ntohl(ch->rc_target.rs_length);
414 471
415 while (byte_count > 0) { 472 while (byte_count > 0) {
416 ret = reader(xprt, rqstp, head, 473 last = (ch + 1)->rc_discrim == xdr_zero;
417 &page_no, &page_offset, 474 ret = xprt->sc_reader(xprt, rqstp, head,
418 ntohl(ch->rc_target.rs_handle), 475 &page_no, &page_offset,
419 byte_count, rs_offset, 476 handle, byte_count,
420 ((ch+1)->rc_discrim == 0) /* last */ 477 rs_offset, last);
421 );
422 if (ret < 0) 478 if (ret < 0)
423 goto err; 479 goto err;
424 byte_count -= ret; 480 byte_count -= ret;
@@ -426,7 +482,24 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
426 head->arg.buflen += ret; 482 head->arg.buflen += ret;
427 } 483 }
428 } 484 }
485
486 /* Read list may need XDR round-up (see RFC 5666, s. 3.7) */
487 if (page_offset & 3) {
488 u32 pad = 4 - (page_offset & 3);
489
490 head->arg.page_len += pad;
491 head->arg.len += pad;
492 head->arg.buflen += pad;
493 page_offset += pad;
494 }
495
429 ret = 1; 496 ret = 1;
497 if (position && position < head->arg.head[0].iov_len)
498 ret = rdma_copy_tail(rqstp, head, position,
499 byte_count, page_offset, page_no);
500 head->arg.head[0].iov_len = position;
501 head->position = position;
502
430 err: 503 err:
431 /* Detach arg pages. svc_recv will replenish them */ 504 /* Detach arg pages. svc_recv will replenish them */
432 for (page_no = 0; 505 for (page_no = 0;
@@ -436,47 +509,33 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
436 return ret; 509 return ret;
437} 510}
438 511
439/*
440 * To avoid a separate RDMA READ just for a handful of zero bytes,
441 * RFC 5666 section 3.7 allows the client to omit the XDR zero pad
442 * in chunk lists.
443 */
444static void
445rdma_fix_xdr_pad(struct xdr_buf *buf)
446{
447 unsigned int page_len = buf->page_len;
448 unsigned int size = (XDR_QUADLEN(page_len) << 2) - page_len;
449 unsigned int offset, pg_no;
450 char *p;
451
452 if (size == 0)
453 return;
454
455 pg_no = page_len >> PAGE_SHIFT;
456 offset = page_len & ~PAGE_MASK;
457 p = page_address(buf->pages[pg_no]);
458 memset(p + offset, 0, size);
459
460 buf->page_len += size;
461 buf->buflen += size;
462 buf->len += size;
463}
464
465static int rdma_read_complete(struct svc_rqst *rqstp, 512static int rdma_read_complete(struct svc_rqst *rqstp,
466 struct svc_rdma_op_ctxt *head) 513 struct svc_rdma_op_ctxt *head)
467{ 514{
468 int page_no; 515 int page_no;
469 int ret; 516 int ret;
470 517
471 BUG_ON(!head);
472
473 /* Copy RPC pages */ 518 /* Copy RPC pages */
474 for (page_no = 0; page_no < head->count; page_no++) { 519 for (page_no = 0; page_no < head->count; page_no++) {
475 put_page(rqstp->rq_pages[page_no]); 520 put_page(rqstp->rq_pages[page_no]);
476 rqstp->rq_pages[page_no] = head->pages[page_no]; 521 rqstp->rq_pages[page_no] = head->pages[page_no];
477 } 522 }
523
524 /* Adjustments made for RDMA_NOMSG type requests */
525 if (head->position == 0) {
526 if (head->arg.len <= head->sge[0].length) {
527 head->arg.head[0].iov_len = head->arg.len -
528 head->byte_len;
529 head->arg.page_len = 0;
530 } else {
531 head->arg.head[0].iov_len = head->sge[0].length -
532 head->byte_len;
533 head->arg.page_len = head->arg.len -
534 head->sge[0].length;
535 }
536 }
537
478 /* Point rq_arg.pages past header */ 538 /* Point rq_arg.pages past header */
479 rdma_fix_xdr_pad(&head->arg);
480 rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count]; 539 rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count];
481 rqstp->rq_arg.page_len = head->arg.page_len; 540 rqstp->rq_arg.page_len = head->arg.page_len;
482 rqstp->rq_arg.page_base = head->arg.page_base; 541 rqstp->rq_arg.page_base = head->arg.page_base;
@@ -501,8 +560,8 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
501 ret = rqstp->rq_arg.head[0].iov_len 560 ret = rqstp->rq_arg.head[0].iov_len
502 + rqstp->rq_arg.page_len 561 + rqstp->rq_arg.page_len
503 + rqstp->rq_arg.tail[0].iov_len; 562 + rqstp->rq_arg.tail[0].iov_len;
504 dprintk("svcrdma: deferred read ret=%d, rq_arg.len =%d, " 563 dprintk("svcrdma: deferred read ret=%d, rq_arg.len=%u, "
505 "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n", 564 "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zu\n",
506 ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, 565 ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base,
507 rqstp->rq_arg.head[0].iov_len); 566 rqstp->rq_arg.head[0].iov_len);
508 567
@@ -558,7 +617,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
558 } 617 }
559 dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n", 618 dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
560 ctxt, rdma_xprt, rqstp, ctxt->wc_status); 619 ctxt, rdma_xprt, rqstp, ctxt->wc_status);
561 BUG_ON(ctxt->wc_status != IB_WC_SUCCESS);
562 atomic_inc(&rdma_stat_recv); 620 atomic_inc(&rdma_stat_recv);
563 621
564 /* Build up the XDR from the receive buffers. */ 622 /* Build up the XDR from the receive buffers. */
@@ -591,8 +649,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
591 + rqstp->rq_arg.tail[0].iov_len; 649 + rqstp->rq_arg.tail[0].iov_len;
592 svc_rdma_put_context(ctxt, 0); 650 svc_rdma_put_context(ctxt, 0);
593 out: 651 out:
594 dprintk("svcrdma: ret = %d, rq_arg.len =%d, " 652 dprintk("svcrdma: ret=%d, rq_arg.len=%u, "
595 "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n", 653 "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zd\n",
596 ret, rqstp->rq_arg.len, 654 ret, rqstp->rq_arg.len,
597 rqstp->rq_arg.head[0].iov_base, 655 rqstp->rq_arg.head[0].iov_base,
598 rqstp->rq_arg.head[0].iov_len); 656 rqstp->rq_arg.head[0].iov_len);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 9f1b50689c0f..7de33d1af9b6 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -60,8 +60,11 @@ static int map_xdr(struct svcxprt_rdma *xprt,
60 u32 page_off; 60 u32 page_off;
61 int page_no; 61 int page_no;
62 62
63 BUG_ON(xdr->len != 63 if (xdr->len !=
64 (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)); 64 (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) {
65 pr_err("svcrdma: map_xdr: XDR buffer length error\n");
66 return -EIO;
67 }
65 68
66 /* Skip the first sge, this is for the RPCRDMA header */ 69 /* Skip the first sge, this is for the RPCRDMA header */
67 sge_no = 1; 70 sge_no = 1;
@@ -150,7 +153,11 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
150 int bc; 153 int bc;
151 struct svc_rdma_op_ctxt *ctxt; 154 struct svc_rdma_op_ctxt *ctxt;
152 155
153 BUG_ON(vec->count > RPCSVC_MAXPAGES); 156 if (vec->count > RPCSVC_MAXPAGES) {
157 pr_err("svcrdma: Too many pages (%lu)\n", vec->count);
158 return -EIO;
159 }
160
154 dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, " 161 dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, "
155 "write_len=%d, vec->sge=%p, vec->count=%lu\n", 162 "write_len=%d, vec->sge=%p, vec->count=%lu\n",
156 rmr, (unsigned long long)to, xdr_off, 163 rmr, (unsigned long long)to, xdr_off,
@@ -190,7 +197,10 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
190 sge_off = 0; 197 sge_off = 0;
191 sge_no++; 198 sge_no++;
192 xdr_sge_no++; 199 xdr_sge_no++;
193 BUG_ON(xdr_sge_no > vec->count); 200 if (xdr_sge_no > vec->count) {
201 pr_err("svcrdma: Too many sges (%d)\n", xdr_sge_no);
202 goto err;
203 }
194 bc -= sge_bytes; 204 bc -= sge_bytes;
195 if (sge_no == xprt->sc_max_sge) 205 if (sge_no == xprt->sc_max_sge)
196 break; 206 break;
@@ -421,7 +431,10 @@ static int send_reply(struct svcxprt_rdma *rdma,
421 ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; 431 ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
422 ctxt->sge[sge_no].length = sge_bytes; 432 ctxt->sge[sge_no].length = sge_bytes;
423 } 433 }
424 BUG_ON(byte_count != 0); 434 if (byte_count != 0) {
435 pr_err("svcrdma: Could not map %d bytes\n", byte_count);
436 goto err;
437 }
425 438
426 /* Save all respages in the ctxt and remove them from the 439 /* Save all respages in the ctxt and remove them from the
427 * respages array. They are our pages until the I/O 440 * respages array. They are our pages until the I/O
@@ -442,7 +455,10 @@ static int send_reply(struct svcxprt_rdma *rdma,
442 } 455 }
443 rqstp->rq_next_page = rqstp->rq_respages + 1; 456 rqstp->rq_next_page = rqstp->rq_respages + 1;
444 457
445 BUG_ON(sge_no > rdma->sc_max_sge); 458 if (sge_no > rdma->sc_max_sge) {
459 pr_err("svcrdma: Too many sges (%d)\n", sge_no);
460 goto err;
461 }
446 memset(&send_wr, 0, sizeof send_wr); 462 memset(&send_wr, 0, sizeof send_wr);
447 ctxt->wr_op = IB_WR_SEND; 463 ctxt->wr_op = IB_WR_SEND;
448 send_wr.wr_id = (unsigned long)ctxt; 464 send_wr.wr_id = (unsigned long)ctxt;
@@ -467,18 +483,6 @@ void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
467{ 483{
468} 484}
469 485
470/*
471 * Return the start of an xdr buffer.
472 */
473static void *xdr_start(struct xdr_buf *xdr)
474{
475 return xdr->head[0].iov_base -
476 (xdr->len -
477 xdr->page_len -
478 xdr->tail[0].iov_len -
479 xdr->head[0].iov_len);
480}
481
482int svc_rdma_sendto(struct svc_rqst *rqstp) 486int svc_rdma_sendto(struct svc_rqst *rqstp)
483{ 487{
484 struct svc_xprt *xprt = rqstp->rq_xprt; 488 struct svc_xprt *xprt = rqstp->rq_xprt;
@@ -496,8 +500,10 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
496 500
497 dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); 501 dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
498 502
499 /* Get the RDMA request header. */ 503 /* Get the RDMA request header. The receive logic always
500 rdma_argp = xdr_start(&rqstp->rq_arg); 504 * places this at the start of page 0.
505 */
506 rdma_argp = page_address(rqstp->rq_pages[0]);
501 507
502 /* Build an req vec for the XDR */ 508 /* Build an req vec for the XDR */
503 ctxt = svc_rdma_get_context(rdma); 509 ctxt = svc_rdma_get_context(rdma);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 4e618808bc98..f609c1c2d38d 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -139,7 +139,6 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
139 struct svcxprt_rdma *xprt; 139 struct svcxprt_rdma *xprt;
140 int i; 140 int i;
141 141
142 BUG_ON(!ctxt);
143 xprt = ctxt->xprt; 142 xprt = ctxt->xprt;
144 if (free_pages) 143 if (free_pages)
145 for (i = 0; i < ctxt->count; i++) 144 for (i = 0; i < ctxt->count; i++)
@@ -339,12 +338,14 @@ static void process_context(struct svcxprt_rdma *xprt,
339 338
340 switch (ctxt->wr_op) { 339 switch (ctxt->wr_op) {
341 case IB_WR_SEND: 340 case IB_WR_SEND:
342 BUG_ON(ctxt->frmr); 341 if (ctxt->frmr)
342 pr_err("svcrdma: SEND: ctxt->frmr != NULL\n");
343 svc_rdma_put_context(ctxt, 1); 343 svc_rdma_put_context(ctxt, 1);
344 break; 344 break;
345 345
346 case IB_WR_RDMA_WRITE: 346 case IB_WR_RDMA_WRITE:
347 BUG_ON(ctxt->frmr); 347 if (ctxt->frmr)
348 pr_err("svcrdma: WRITE: ctxt->frmr != NULL\n");
348 svc_rdma_put_context(ctxt, 0); 349 svc_rdma_put_context(ctxt, 0);
349 break; 350 break;
350 351
@@ -353,19 +354,21 @@ static void process_context(struct svcxprt_rdma *xprt,
353 svc_rdma_put_frmr(xprt, ctxt->frmr); 354 svc_rdma_put_frmr(xprt, ctxt->frmr);
354 if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { 355 if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
355 struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; 356 struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
356 BUG_ON(!read_hdr); 357 if (read_hdr) {
357 spin_lock_bh(&xprt->sc_rq_dto_lock); 358 spin_lock_bh(&xprt->sc_rq_dto_lock);
358 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); 359 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
359 list_add_tail(&read_hdr->dto_q, 360 list_add_tail(&read_hdr->dto_q,
360 &xprt->sc_read_complete_q); 361 &xprt->sc_read_complete_q);
361 spin_unlock_bh(&xprt->sc_rq_dto_lock); 362 spin_unlock_bh(&xprt->sc_rq_dto_lock);
363 } else {
364 pr_err("svcrdma: ctxt->read_hdr == NULL\n");
365 }
362 svc_xprt_enqueue(&xprt->sc_xprt); 366 svc_xprt_enqueue(&xprt->sc_xprt);
363 } 367 }
364 svc_rdma_put_context(ctxt, 0); 368 svc_rdma_put_context(ctxt, 0);
365 break; 369 break;
366 370
367 default: 371 default:
368 BUG_ON(1);
369 printk(KERN_ERR "svcrdma: unexpected completion type, " 372 printk(KERN_ERR "svcrdma: unexpected completion type, "
370 "opcode=%d\n", 373 "opcode=%d\n",
371 ctxt->wr_op); 374 ctxt->wr_op);
@@ -513,7 +516,10 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
513 buflen = 0; 516 buflen = 0;
514 ctxt->direction = DMA_FROM_DEVICE; 517 ctxt->direction = DMA_FROM_DEVICE;
515 for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) { 518 for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) {
516 BUG_ON(sge_no >= xprt->sc_max_sge); 519 if (sge_no >= xprt->sc_max_sge) {
520 pr_err("svcrdma: Too many sges (%d)\n", sge_no);
521 goto err_put_ctxt;
522 }
517 page = svc_rdma_get_page(); 523 page = svc_rdma_get_page();
518 ctxt->pages[sge_no] = page; 524 ctxt->pages[sge_no] = page;
519 pa = ib_dma_map_page(xprt->sc_cm_id->device, 525 pa = ib_dma_map_page(xprt->sc_cm_id->device,
@@ -687,7 +693,6 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
687{ 693{
688 struct rdma_cm_id *listen_id; 694 struct rdma_cm_id *listen_id;
689 struct svcxprt_rdma *cma_xprt; 695 struct svcxprt_rdma *cma_xprt;
690 struct svc_xprt *xprt;
691 int ret; 696 int ret;
692 697
693 dprintk("svcrdma: Creating RDMA socket\n"); 698 dprintk("svcrdma: Creating RDMA socket\n");
@@ -698,7 +703,6 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
698 cma_xprt = rdma_create_xprt(serv, 1); 703 cma_xprt = rdma_create_xprt(serv, 1);
699 if (!cma_xprt) 704 if (!cma_xprt)
700 return ERR_PTR(-ENOMEM); 705 return ERR_PTR(-ENOMEM);
701 xprt = &cma_xprt->sc_xprt;
702 706
703 listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP, 707 listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP,
704 IB_QPT_RC); 708 IB_QPT_RC);
@@ -822,7 +826,7 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
822 if (frmr) { 826 if (frmr) {
823 frmr_unmap_dma(rdma, frmr); 827 frmr_unmap_dma(rdma, frmr);
824 spin_lock_bh(&rdma->sc_frmr_q_lock); 828 spin_lock_bh(&rdma->sc_frmr_q_lock);
825 BUG_ON(!list_empty(&frmr->frmr_list)); 829 WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
826 list_add(&frmr->frmr_list, &rdma->sc_frmr_q); 830 list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
827 spin_unlock_bh(&rdma->sc_frmr_q_lock); 831 spin_unlock_bh(&rdma->sc_frmr_q_lock);
828 } 832 }
@@ -970,10 +974,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
970 * NB: iWARP requires remote write access for the data sink 974 * NB: iWARP requires remote write access for the data sink
971 * of an RDMA_READ. IB does not. 975 * of an RDMA_READ. IB does not.
972 */ 976 */
977 newxprt->sc_reader = rdma_read_chunk_lcl;
973 if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { 978 if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
974 newxprt->sc_frmr_pg_list_len = 979 newxprt->sc_frmr_pg_list_len =
975 devattr.max_fast_reg_page_list_len; 980 devattr.max_fast_reg_page_list_len;
976 newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; 981 newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
982 newxprt->sc_reader = rdma_read_chunk_frmr;
977 } 983 }
978 984
979 /* 985 /*
@@ -1125,7 +1131,9 @@ static void __svc_rdma_free(struct work_struct *work)
1125 dprintk("svcrdma: svc_rdma_free(%p)\n", rdma); 1131 dprintk("svcrdma: svc_rdma_free(%p)\n", rdma);
1126 1132
1127 /* We should only be called from kref_put */ 1133 /* We should only be called from kref_put */
1128 BUG_ON(atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0); 1134 if (atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0)
1135 pr_err("svcrdma: sc_xprt still in use? (%d)\n",
1136 atomic_read(&rdma->sc_xprt.xpt_ref.refcount));
1129 1137
1130 /* 1138 /*
1131 * Destroy queued, but not processed read completions. Note 1139 * Destroy queued, but not processed read completions. Note
@@ -1153,8 +1161,12 @@ static void __svc_rdma_free(struct work_struct *work)
1153 } 1161 }
1154 1162
1155 /* Warn if we leaked a resource or under-referenced */ 1163 /* Warn if we leaked a resource or under-referenced */
1156 WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0); 1164 if (atomic_read(&rdma->sc_ctxt_used) != 0)
1157 WARN_ON(atomic_read(&rdma->sc_dma_used) != 0); 1165 pr_err("svcrdma: ctxt still in use? (%d)\n",
1166 atomic_read(&rdma->sc_ctxt_used));
1167 if (atomic_read(&rdma->sc_dma_used) != 0)
1168 pr_err("svcrdma: dma still in use? (%d)\n",
1169 atomic_read(&rdma->sc_dma_used));
1158 1170
1159 /* De-allocate fastreg mr */ 1171 /* De-allocate fastreg mr */
1160 rdma_dealloc_frmr_q(rdma); 1172 rdma_dealloc_frmr_q(rdma);
@@ -1254,7 +1266,6 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
1254 if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) 1266 if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
1255 return -ENOTCONN; 1267 return -ENOTCONN;
1256 1268
1257 BUG_ON(wr->send_flags != IB_SEND_SIGNALED);
1258 wr_count = 1; 1269 wr_count = 1;
1259 for (n_wr = wr->next; n_wr; n_wr = n_wr->next) 1270 for (n_wr = wr->next; n_wr; n_wr = n_wr->next)
1260 wr_count++; 1271 wr_count++;