aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/nfs/pnfs-block-server.txt37
-rw-r--r--fs/nfsd/Makefile2
-rw-r--r--fs/nfsd/blocklayout.c189
-rw-r--r--fs/nfsd/blocklayoutxdr.c157
-rw-r--r--fs/nfsd/blocklayoutxdr.h62
-rw-r--r--fs/nfsd/nfs4layouts.c8
-rw-r--r--fs/nfsd/pnfs.h1
7 files changed, 455 insertions, 1 deletions
diff --git a/Documentation/filesystems/nfs/pnfs-block-server.txt b/Documentation/filesystems/nfs/pnfs-block-server.txt
new file mode 100644
index 000000000000..2143673cf154
--- /dev/null
+++ b/Documentation/filesystems/nfs/pnfs-block-server.txt
@@ -0,0 +1,37 @@
1pNFS block layout server user guide
2
3The Linux NFS server now supports the pNFS block layout extension. In this
4case the NFS server acts as Metadata Server (MDS) for pNFS, which in addition
5to handling all the metadata access to the NFS export also hands out layouts
6to the clients to directly access the underlying block devices that are
7shared with the client.
8
9To use pNFS block layouts with the Linux NFS server the exported file
10system needs to support the pNFS block layouts (currently just XFS), and the
11file system must sit on shared storage (typically iSCSI) that is accessible
12to the clients in addition to the MDS. As of now the file system needs to
13sit directly on the exported volume, striping or concatenation of
14volumes on the MDS and clients is not supported yet.
15
16On the server, pNFS block volume support is automatically enabled if the file
17system supports it. On the client make sure the kernel has the CONFIG_PNFS_BLOCK
18option enabled, the blkmapd daemon from nfs-utils is running, and the
19file system is mounted using the NFSv4.1 protocol version (mount -o vers=4.1).
20
21If the nfsd server needs to fence a non-responding client it calls
22/sbin/nfsd-recall-failed with the first argument set to the IP address of
23the client, and the second argument set to the device node without the /dev
24prefix for the file system to be fenced. Below is an example file that shows
25how to translate the device into a serial number from SCSI EVPD 0x80:
26
27cat > /sbin/nfsd-recall-failed << 'EOF'
28#!/bin/sh
29
30CLIENT="$1"
31DEV="/dev/$2"
32EVPD=`sg_inq --page=0x80 ${DEV} | \
33 grep "Unit serial number:" | \
34 awk -F ': ' '{print $2}'`
35
36echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log
37EOF
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 6cba933880c5..9a6028e120c6 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -17,4 +17,4 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
17nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o 17nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
18nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ 18nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
19 nfs4acl.o nfs4callback.o nfs4recover.o 19 nfs4acl.o nfs4callback.o nfs4recover.o
20nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o 20nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
new file mode 100644
index 000000000000..cdbc78c72542
--- /dev/null
+++ b/fs/nfsd/blocklayout.c
@@ -0,0 +1,189 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/exportfs.h>
5#include <linux/genhd.h>
6#include <linux/slab.h>
7
8#include <linux/nfsd/debug.h>
9
10#include "blocklayoutxdr.h"
11#include "pnfs.h"
12
13#define NFSDDBG_FACILITY NFSDDBG_PNFS
14
15
16static int
17nfsd4_block_get_device_info_simple(struct super_block *sb,
18 struct nfsd4_getdeviceinfo *gdp)
19{
20 struct pnfs_block_deviceaddr *dev;
21 struct pnfs_block_volume *b;
22
23 dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
24 sizeof(struct pnfs_block_volume), GFP_KERNEL);
25 if (!dev)
26 return -ENOMEM;
27 gdp->gd_device = dev;
28
29 dev->nr_volumes = 1;
30 b = &dev->volumes[0];
31
32 b->type = PNFS_BLOCK_VOLUME_SIMPLE;
33 b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
34 return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
35 &b->simple.offset);
36}
37
38static __be32
39nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
40 struct nfsd4_getdeviceinfo *gdp)
41{
42 if (sb->s_bdev != sb->s_bdev->bd_contains)
43 return nfserr_inval;
44 return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
45}
46
47static __be32
48nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
49 struct nfsd4_layoutget *args)
50{
51 struct nfsd4_layout_seg *seg = &args->lg_seg;
52 struct super_block *sb = inode->i_sb;
53 u32 block_size = (1 << inode->i_blkbits);
54 struct pnfs_block_extent *bex;
55 struct iomap iomap;
56 u32 device_generation = 0;
57 int error;
58
59 /*
60 * We do not attempt to support I/O smaller than the fs block size,
61 * or not aligned to it.
62 */
63 if (args->lg_minlength < block_size) {
64 dprintk("pnfsd: I/O too small\n");
65 goto out_layoutunavailable;
66 }
67 if (seg->offset & (block_size - 1)) {
68 dprintk("pnfsd: I/O misaligned\n");
69 goto out_layoutunavailable;
70 }
71
72 /*
73 * Some clients barf on non-zero block numbers for NONE or INVALID
74 * layouts, so make sure to zero the whole structure.
75 */
76 error = -ENOMEM;
77 bex = kzalloc(sizeof(*bex), GFP_KERNEL);
78 if (!bex)
79 goto out_error;
80 args->lg_content = bex;
81
82 error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length,
83 &iomap, seg->iomode != IOMODE_READ,
84 &device_generation);
85 if (error) {
86 if (error == -ENXIO)
87 goto out_layoutunavailable;
88 goto out_error;
89 }
90
91 if (iomap.length < args->lg_minlength) {
92 dprintk("pnfsd: extent smaller than minlength\n");
93 goto out_layoutunavailable;
94 }
95
96 switch (iomap.type) {
97 case IOMAP_MAPPED:
98 if (seg->iomode == IOMODE_READ)
99 bex->es = PNFS_BLOCK_READ_DATA;
100 else
101 bex->es = PNFS_BLOCK_READWRITE_DATA;
102 bex->soff = (iomap.blkno << 9);
103 break;
104 case IOMAP_UNWRITTEN:
105 if (seg->iomode & IOMODE_RW) {
106 /*
107 * Crack monkey special case from section 2.3.1.
108 */
109 if (args->lg_minlength == 0) {
110 dprintk("pnfsd: no soup for you!\n");
111 goto out_layoutunavailable;
112 }
113
114 bex->es = PNFS_BLOCK_INVALID_DATA;
115 bex->soff = (iomap.blkno << 9);
116 break;
117 }
118 /*FALLTHRU*/
119 case IOMAP_HOLE:
120 if (seg->iomode == IOMODE_READ) {
121 bex->es = PNFS_BLOCK_NONE_DATA;
122 break;
123 }
124 /*FALLTHRU*/
125 case IOMAP_DELALLOC:
126 default:
127 WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type);
128 goto out_layoutunavailable;
129 }
130
131 error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation);
132 if (error)
133 goto out_error;
134 bex->foff = iomap.offset;
135 bex->len = iomap.length;
136
137 seg->offset = iomap.offset;
138 seg->length = iomap.length;
139
140 dprintk("GET: %lld:%lld %d\n", bex->foff, bex->len, bex->es);
141 return 0;
142
143out_error:
144 seg->length = 0;
145 return nfserrno(error);
146out_layoutunavailable:
147 seg->length = 0;
148 return nfserr_layoutunavailable;
149}
150
151static __be32
152nfsd4_block_proc_layoutcommit(struct inode *inode,
153 struct nfsd4_layoutcommit *lcp)
154{
155 loff_t new_size = lcp->lc_last_wr + 1;
156 struct iattr iattr = { .ia_valid = 0 };
157 struct iomap *iomaps;
158 int nr_iomaps;
159 int error;
160
161 nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
162 lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
163 if (nr_iomaps < 0)
164 return nfserrno(nr_iomaps);
165
166 if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
167 timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
168 lcp->lc_mtime = current_fs_time(inode->i_sb);
169 iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME;
170 iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime;
171
172 if (new_size > i_size_read(inode)) {
173 iattr.ia_valid |= ATTR_SIZE;
174 iattr.ia_size = new_size;
175 }
176
177 error = inode->i_sb->s_export_op->commit_blocks(inode, iomaps,
178 nr_iomaps, &iattr);
179 kfree(iomaps);
180 return nfserrno(error);
181}
182
/*
 * Operations table for the block layout type: wires the proc handlers
 * defined in this file and the XDR encoders from blocklayoutxdr.c into the
 * generic nfsd layout code.
 */
183const struct nfsd4_layout_ops bl_layout_ops = {
184 .proc_getdeviceinfo = nfsd4_block_proc_getdeviceinfo,
185 .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo,
186 .proc_layoutget = nfsd4_block_proc_layoutget,
187 .encode_layoutget = nfsd4_block_encode_layoutget,
188 .proc_layoutcommit = nfsd4_block_proc_layoutcommit,
189};
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
new file mode 100644
index 000000000000..9da89fddab33
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -0,0 +1,157 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/sunrpc/svc.h>
5#include <linux/exportfs.h>
6#include <linux/nfs4.h>
7
8#include "nfsd.h"
9#include "blocklayoutxdr.h"
10
11#define NFSDDBG_FACILITY NFSDDBG_PNFS
12
13
14__be32
15nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
16 struct nfsd4_layoutget *lgp)
17{
18 struct pnfs_block_extent *b = lgp->lg_content;
19 int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
20 __be32 *p;
21
22 p = xdr_reserve_space(xdr, sizeof(__be32) + len);
23 if (!p)
24 return nfserr_toosmall;
25
26 *p++ = cpu_to_be32(len);
27 *p++ = cpu_to_be32(1); /* we always return a single extent */
28
29 p = xdr_encode_opaque_fixed(p, &b->vol_id,
30 sizeof(struct nfsd4_deviceid));
31 p = xdr_encode_hyper(p, b->foff);
32 p = xdr_encode_hyper(p, b->len);
33 p = xdr_encode_hyper(p, b->soff);
34 *p++ = cpu_to_be32(b->es);
35 return 0;
36}
37
38static int
39nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
40{
41 __be32 *p;
42 int len;
43
44 switch (b->type) {
45 case PNFS_BLOCK_VOLUME_SIMPLE:
46 len = 4 + 4 + 8 + 4 + b->simple.sig_len;
47 p = xdr_reserve_space(xdr, len);
48 if (!p)
49 return -ETOOSMALL;
50
51 *p++ = cpu_to_be32(b->type);
52 *p++ = cpu_to_be32(1); /* single signature */
53 p = xdr_encode_hyper(p, b->simple.offset);
54 p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
55 break;
56 default:
57 return -ENOTSUPP;
58 }
59
60 return len;
61}
62
63__be32
64nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
65 struct nfsd4_getdeviceinfo *gdp)
66{
67 struct pnfs_block_deviceaddr *dev = gdp->gd_device;
68 int len = sizeof(__be32), ret, i;
69 __be32 *p;
70
71 p = xdr_reserve_space(xdr, len + sizeof(__be32));
72 if (!p)
73 return nfserr_resource;
74
75 for (i = 0; i < dev->nr_volumes; i++) {
76 ret = nfsd4_block_encode_volume(xdr, &dev->volumes[i]);
77 if (ret < 0)
78 return nfserrno(ret);
79 len += ret;
80 }
81
82 /*
83 * Fill in the overall length and number of volumes at the beginning
84 * of the layout.
85 */
86 *p++ = cpu_to_be32(len);
87 *p++ = cpu_to_be32(dev->nr_volumes);
88 return 0;
89}
90
91int
92nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
93 u32 block_size)
94{
95 struct iomap *iomaps;
96 u32 nr_iomaps, expected, i;
97
98 if (len < sizeof(u32)) {
99 dprintk("%s: extent array too small: %u\n", __func__, len);
100 return -EINVAL;
101 }
102
103 nr_iomaps = be32_to_cpup(p++);
104 expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE;
105 if (len != expected) {
106 dprintk("%s: extent array size mismatch: %u/%u\n",
107 __func__, len, expected);
108 return -EINVAL;
109 }
110
111 iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
112 if (!iomaps) {
113 dprintk("%s: failed to allocate extent array\n", __func__);
114 return -ENOMEM;
115 }
116
117 for (i = 0; i < nr_iomaps; i++) {
118 struct pnfs_block_extent bex;
119
120 memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid));
121 p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid));
122
123 p = xdr_decode_hyper(p, &bex.foff);
124 if (bex.foff & (block_size - 1)) {
125 dprintk("%s: unaligned offset %lld\n",
126 __func__, bex.foff);
127 goto fail;
128 }
129 p = xdr_decode_hyper(p, &bex.len);
130 if (bex.len & (block_size - 1)) {
131 dprintk("%s: unaligned length %lld\n",
132 __func__, bex.foff);
133 goto fail;
134 }
135 p = xdr_decode_hyper(p, &bex.soff);
136 if (bex.soff & (block_size - 1)) {
137 dprintk("%s: unaligned disk offset %lld\n",
138 __func__, bex.soff);
139 goto fail;
140 }
141 bex.es = be32_to_cpup(p++);
142 if (bex.es != PNFS_BLOCK_READWRITE_DATA) {
143 dprintk("%s: incorrect extent state %d\n",
144 __func__, bex.es);
145 goto fail;
146 }
147
148 iomaps[i].offset = bex.foff;
149 iomaps[i].length = bex.len;
150 }
151
152 *iomapp = iomaps;
153 return nr_iomaps;
154fail:
155 kfree(iomaps);
156 return -EINVAL;
157}
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
new file mode 100644
index 000000000000..fdc79037c0e7
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -0,0 +1,62 @@
1#ifndef _NFSD_BLOCKLAYOUTXDR_H
2#define _NFSD_BLOCKLAYOUTXDR_H 1
3
4#include <linux/blkdev.h>
5#include "xdr4.h"
6
7struct iomap;
8struct xdr_stream;
9
/*
 * State of a block extent as sent on the wire.  The layoutget code in
 * blocklayout.c picks the state from the file system's mapping type and the
 * requested I/O mode; layoutcommit decoding accepts READWRITE_DATA only.
 */
10enum pnfs_block_extent_state {
11 PNFS_BLOCK_READWRITE_DATA = 0, /* mapped extent, layout not read-only */
12 PNFS_BLOCK_READ_DATA = 1, /* mapped extent, read-only layout */
13 PNFS_BLOCK_INVALID_DATA = 2, /* unwritten extent, read/write layout */
14 PNFS_BLOCK_NONE_DATA = 3, /* hole or unwritten extent, read-only layout */
15};
16
/*
 * In-memory form of one block extent, as produced by layoutget and decoded
 * from a layoutcommit.
 */
17struct pnfs_block_extent {
18 struct nfsd4_deviceid vol_id; /* device ID of the volume */
19 u64 foff; /* offset within the file */
20 u64 len; /* length of the extent */
21 u64 soff; /* offset on the volume; layoutget sets it to blkno << 9 */
22 enum pnfs_block_extent_state es; /* extent state */
23};
/*
 * Size in bytes of one extent on the wire: device ID, three 64-bit fields
 * and a 32-bit state.  NOTE(review): 44 implies a 16-byte device ID --
 * confirm against struct nfsd4_deviceid.
 */
24#define NFS4_BLOCK_EXTENT_SIZE 44
25
/*
 * Volume topology types.  Only PNFS_BLOCK_VOLUME_SIMPLE is handled by the
 * volume encoder in blocklayoutxdr.c; every other type is rejected there
 * with -ENOTSUPP.
 */
26enum pnfs_block_volume_type {
27 PNFS_BLOCK_VOLUME_SIMPLE = 0,
28 PNFS_BLOCK_VOLUME_SLICE = 1,
29 PNFS_BLOCK_VOLUME_CONCAT = 2,
30 PNFS_BLOCK_VOLUME_STRIPE = 3,
31};
32
33/*
34 * Random upper cap for the uuid length to avoid unbounded allocation.
35 * Not actually limited by the protocol.
36 */
37#define PNFS_BLOCK_UUID_LEN 128
38
/*
 * Description of one volume.  The union currently has a single member, for
 * PNFS_BLOCK_VOLUME_SIMPLE volumes.
 */
39struct pnfs_block_volume {
40 enum pnfs_block_volume_type type;
41 union {
42 struct {
43 u64 offset; /* signature offset, filled in by ->get_uuid */
44 u32 sig_len; /* number of valid bytes in sig[] */
45 u8 sig[PNFS_BLOCK_UUID_LEN]; /* signature, filled in by ->get_uuid */
46 } simple;
47 };
48};
49
/*
 * Device address returned by GETDEVICEINFO: a volume count followed by that
 * many volume descriptions in a flexible array, so the allocation must
 * include room for the trailing entries.
 */
50struct pnfs_block_deviceaddr {
51 u32 nr_volumes; /* number of entries in volumes[] */
52 struct pnfs_block_volume volumes[];
53};
54
/* Encode gdp->gd_device (a struct pnfs_block_deviceaddr) into @xdr. */
55__be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
56 struct nfsd4_getdeviceinfo *gdp);
/* Encode lgp->lg_content (a single struct pnfs_block_extent) into @xdr. */
57__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
58 struct nfsd4_layoutget *lgp);
/*
 * Decode a layoutupdate of @len bytes at @p into a newly allocated iomap
 * array stored in *iomapp.  Returns the extent count or a negative errno.
 */
59int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
60 u32 block_size);
61
62#endif /* _NFSD_BLOCKLAYOUTXDR_H */
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 60137c54b2f7..3c1bfa155571 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -26,6 +26,7 @@ static struct nfsd4_callback_ops nfsd4_cb_layout_ops;
26static const struct lock_manager_operations nfsd4_layouts_lm_ops; 26static const struct lock_manager_operations nfsd4_layouts_lm_ops;
27 27
28const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { 28const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
29 [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops,
29}; 30};
30 31
31/* pNFS device ID to export fsid mapping */ 32/* pNFS device ID to export fsid mapping */
@@ -115,8 +116,15 @@ nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
115 116
116void nfsd4_setup_layout_type(struct svc_export *exp) 117void nfsd4_setup_layout_type(struct svc_export *exp)
117{ 118{
119 struct super_block *sb = exp->ex_path.mnt->mnt_sb;
120
118 if (exp->ex_flags & NFSEXP_NOPNFS) 121 if (exp->ex_flags & NFSEXP_NOPNFS)
119 return; 122 return;
123
124 if (sb->s_export_op->get_uuid &&
125 sb->s_export_op->map_blocks &&
126 sb->s_export_op->commit_blocks)
127 exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
120} 128}
121 129
122static void 130static void
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index a9616a4e13cd..fedb4d620a81 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -34,6 +34,7 @@ struct nfsd4_layout_ops {
34}; 34};
35 35
36extern const struct nfsd4_layout_ops *nfsd4_layout_ops[]; 36extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
37extern const struct nfsd4_layout_ops bl_layout_ops;
37 38
38__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, 39__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
39 struct nfsd4_compound_state *cstate, stateid_t *stateid, 40 struct nfsd4_compound_state *cstate, stateid_t *stateid,